rails_spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/LICENSE +165 -0
  3. data/README.md +33 -0
  4. data/app/assets/config/the_spider_manifest.js +4 -0
  5. data/app/assets/javascripts/the_spider/application.js +1 -0
  6. data/app/assets/stylesheets/the_spider/application.css +4 -0
  7. data/app/controllers/the_spider/application_controller.rb +9 -0
  8. data/app/controllers/the_spider/locals_controller.rb +62 -0
  9. data/app/controllers/the_spider/works_controller.rb +60 -0
  10. data/app/helpers/the_spider/application_helper.rb +4 -0
  11. data/app/helpers/the_spider/locals_helper.rb +4 -0
  12. data/app/helpers/the_spider/works_helper.rb +4 -0
  13. data/app/jobs/the_spider/application_job.rb +4 -0
  14. data/app/jobs/the_spider/parser_job.rb +11 -0
  15. data/app/jobs/the_spider/work_job.rb +11 -0
  16. data/app/mailers/the_spider/application_mailer.rb +6 -0
  17. data/app/models/rails_spider/application_record.rb +5 -0
  18. data/app/models/rails_spider/cookie.rb +9 -0
  19. data/app/models/rails_spider/failed_url.rb +7 -0
  20. data/app/models/rails_spider/local.rb +14 -0
  21. data/app/models/rails_spider/work.rb +24 -0
  22. data/app/views/layouts/the_spider/application.html.erb +14 -0
  23. data/app/views/the_spider/locals/_form.html.erb +17 -0
  24. data/app/views/the_spider/locals/edit.html.erb +6 -0
  25. data/app/views/the_spider/locals/index.html.erb +25 -0
  26. data/app/views/the_spider/locals/new.html.erb +5 -0
  27. data/app/views/the_spider/locals/show.html.erb +4 -0
  28. data/app/views/the_spider/works/_form.html.erb +9 -0
  29. data/app/views/the_spider/works/edit.html.erb +6 -0
  30. data/app/views/the_spider/works/index.html.erb +44 -0
  31. data/app/views/the_spider/works/new.html.erb +5 -0
  32. data/app/views/the_spider/works/show.html.erb +4 -0
  33. data/config/routes.rb +8 -0
  34. data/config/schedule.rb +35 -0
  35. data/db/migrate/20170502153051_rails_spider_init.rb +38 -0
  36. data/lib/config/config.rb +27 -0
  37. data/lib/config/keywords.json +22 -0
  38. data/lib/config/proxy.json +10 -0
  39. data/lib/helper/helper.rb +6 -0
  40. data/lib/helper/location_helper.rb +46 -0
  41. data/lib/helper/price_helper.rb +23 -0
  42. data/lib/helper/tag_helper.rb +17 -0
  43. data/lib/helper/text_helper.rb +41 -0
  44. data/lib/helper/time_helper.rb +140 -0
  45. data/lib/logger.rb +146 -0
  46. data/lib/proxy/allproxylists.txt +2366 -0
  47. data/lib/proxy/proxy.rb +216 -0
  48. data/lib/proxy/proxylists.txt +625 -0
  49. data/lib/rails_spider.rb +10 -0
  50. data/lib/rails_spider/engine.rb +9 -0
  51. data/lib/rails_spider/fetchers.rb +2 -0
  52. data/lib/rails_spider/fetchers/base.rb +146 -0
  53. data/lib/rails_spider/fetchers/mechanize.rb +83 -0
  54. data/lib/rails_spider/fetchers/witar.rb +73 -0
  55. data/lib/rails_spider/parser.rb +14 -0
  56. data/lib/rails_spider/parser/szlawyers.rb +26 -0
  57. data/lib/rails_spider/resource.rb +58 -0
  58. data/lib/rails_spider/strategies.rb +6 -0
  59. data/lib/rails_spider/version.rb +3 -0
  60. data/lib/sync_qiniu.rb +35 -0
  61. data/lib/sync_qiniu/getimages.rb +98 -0
  62. data/lib/sync_qiniu/getimages_info.rb +37 -0
  63. data/lib/sync_qiniu/getlocation.rb +48 -0
  64. data/lib/sync_qiniu/getproxy.rb +95 -0
  65. data/lib/tasks/the_spider_tasks.rake +4 -0
  66. data/rakefile +284 -0
  67. metadata +165 -0
data/lib/sync_qiniu.rb ADDED
@@ -0,0 +1,35 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'fileutils'
+ require 'open-uri'
+ require 'bundler'
+ require 'logger'
+ Bundler.require
+
+ Mongoid.load!("./config/mongoid.yml","development")
+
+ Dir.glob("#{File.dirname(__FILE__)}/models/*.rb").each do |m|
+   require m
+ end
+
+ FileUtils.mkdir_p("log")
+ logger = Logger.new("log/getimage_log.txt")
+
+ @events = Event
+
+ logger.log(1, 'Prepare folder')
+ dir = "images/faces/#{Date.today.to_s.gsub("-", "_")}"
+ log = "Try to make dir #{dir}"
+ logger.log(1, log)
+ FileUtils.mkdir_p(dir)
+
+ pwd = File.dirname(File.expand_path('.', __FILE__))
+ log = "pwd #{pwd}"
+ logger.log(1, log)
+ config = "#{pwd}/config/qiniu_conf.json"
+ qrsync = "#{pwd}/vendor/qiniu/qrsync"
+ log = 'sync to qiniu after download all images.'
+ logger.log(1, log)
+ cmd = "#{qrsync} #{config}"
+ logger.log(1, cmd)
+ `#{cmd}`
data/lib/sync_qiniu/getimages.rb ADDED
@@ -0,0 +1,98 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'fileutils'
+ require 'open-uri'
+ require 'bundler'
+ require 'logger'
+ Bundler.require
+
+ Mongoid.load!("./config/mongoid.yml","development")
+
+ Dir.glob("#{File.dirname(__FILE__)}/models/*.rb").each do |m|
+   require m
+ end
+
+ Dir.glob("#{File.dirname(__FILE__)}/helper/*.rb").each do |h|
+   require h
+ end
+
+ FileUtils.mkdir_p("log")
+ logger = Logger.new("log/getimage_log.txt")
+
+ @events = Event
+
+ logger.log(1, 'Prepare folder')
+ dir = "images/faces/#{Date.today.to_s.gsub("-", "_")}"
+ log = "Try to make dir #{dir}"
+ logger.log(1, log)
+ FileUtils.mkdir_p(dir)
+
+ @events = @events.where(filename: nil)
+
+ count_to_download = @events.count
+
+ log = "Will download #{count_to_download} events face"
+ logger.log(1, log)
+
+ @events = @events.to_a
+
+ group_len = 800
+
+ threads = []
+
+ @events.each_slice(group_len).each do |event_group|
+   threads << Thread.new do
+     event_group.each do |e|
+       begin
+         data = open(URI.encode(e.face)){ |f| f.read }
+
+         filetype = File.extname(e.face).gsub(/\?.*$/, "")
+         filename = "face-" + e.id.to_s + filetype
+         filename = "#{dir}/#{filename}"
+         open(filename, "wb") { |f| f.write(data) }
+         log = "Download #{filename} to images/..."
+         logger.log(1, log)
+         e.update_attribute(:filename, filename)
+         sleep 0.5
+       rescue => se
+         log = "threw an exception:\n#{se.message}\n#{se.backtrace}\n#{e.url}"
+         logger.log(1, log)
+         sleep 1
+       end
+     end
+   end
+ end
+
+ threads.map(&:join)
+
+ # sync to qiniu after download all images.
+ if count_to_download > 0
+   pwd = File.dirname(File.expand_path('.', __FILE__))
+   log = "pwd #{pwd}"
+   logger.log(1, log)
+   config = "#{pwd}/config/qiniu_conf.json"
+   qrsync = "#{pwd}/vendor/qiniu/qrsync"
+   log = 'sync to qiniu after download all images.'
+   logger.log(1, log)
+   cmd = "#{qrsync} #{config}"
+   logger.log(1, cmd)
+   `#{cmd}`
+ end
+
+ threads = []
+ @events = Event.where(poster_width: nil).to_a
+ @events.each_slice(group_len).each do |event_group|
+   threads << Thread.new do
+     event_group.each do |e|
+       next if e.filename.blank?
+       begin
+         e.update_image_info
+       rescue => se
+         log = "#{Time.now}\n#{se.message}\n#{se.backtrace}#{e.url}"
+         logger.log(1, log)
+       end
+     end
+   end
+ end
+
+ threads.map(&:join)
data/lib/sync_qiniu/getimages_info.rb ADDED
@@ -0,0 +1,37 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'fileutils'
+ require 'open-uri'
+ require 'bundler'
+ Bundler.require
+
+ Mongoid.load!("./config/mongoid.yml","development")
+
+ Dir.glob("#{File.dirname(__FILE__)}/models/*.rb").each do |m|
+   require m
+ end
+
+ group_len = 200
+ id = "534274765275627a63000000"
+ count = Event.where(poster_width: nil, :id.gt => id).count
+ while count > 100
+   threads = []
+   @events = Event.where(poster_width: nil, :id.gt => id).limit(1000).to_a
+   @events.each_slice(group_len).each do |event_group|
+     threads << Thread.new do
+       event_group.each do |e|
+         next if e.filename.blank?
+         begin
+           e.update_image_info
+         rescue => se
+           log = "#{Time.now}\n#{se.message}\n#{se.backtrace}#{e.url}"
+           puts log
+         end
+       end
+     end
+   end
+   threads.map(&:join)
+   id = @events.last.id.to_s
+   count = Event.where(poster_width: nil, :id.gt => id).count
+ end
+
data/lib/sync_qiniu/getlocation.rb ADDED
@@ -0,0 +1,48 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'bundler'
+ require 'net/http'
+ Bundler.require
+
+ Mongoid.load!("./config/mongoid.yml","development")
+
+ Dir.glob("#{File.dirname(__FILE__)}/models/*.rb").each do |m|
+   require m
+ end
+
+ @events = Event.where(:source.ne => "douban") # cannot use .only here because attributes are updated below
+
+ @events.each do |e|
+   puts "ID: #{e.id}"
+   puts e.place
+   url = URI.escape("http://api.map.baidu.com/geocoder/v2/?address=#{e.place}&output=json&ak=A38d59da730152d77b407446a3c0dd2b")
+   # Geocoding API: http://developer.baidu.com/map/webservice-geocoding.htm
+   # http://api.map.baidu.com/geocoder/v2/?address=%E5%BE%90%E5%AE%B6%E6%B1%87&output=json&ak=A38d59da730152d77b407446a3c0dd2b&callback=showLocation
+   begin
+     response = Net::HTTP.get_response(URI(url))
+     puts response.body
+     data = response.body
+   rescue SocketError
+     sleep 10
+     retry
+   end
+   result = JSON.parse(data)
+   puts result
+   # status codes are documented at: http://developer.baidu.com/map/webservice-geocoding.htm#.E6.8E.A5.E5.8F.A3.E7.A4.BA.E4.BE.8A
+   if result["status"] != 0
+     e.location = [0.0, 0.0]
+     e.save
+     puts "This place cannot be translated to a location..."
+   else
+     # {"status"=>0, "result"=>{"location"=>{"lng"=>121.48026424818, "lat"=>31.229092805768}, "precise"=>1, "confidence"=>80, "level"=>"道路"}}
+     puts result["result"]["location"]
+     store_result = []
+     store_result << result["result"]["location"]["lng"]
+     store_result << result["result"]["location"]["lat"]
+     puts store_result
+     e.location = store_result
+     e.save
+     puts "Saved location #{store_result} for ID: #{e.id} successfully!"
+     puts '--------------------'
+   end
+ end
data/lib/sync_qiniu/getproxy.rb ADDED
@@ -0,0 +1,95 @@
+ # encoding: utf-8
+ require 'rubygems'
+ require 'bundler'
+ Bundler.require
+
+ # Fetch proxy from: http://proxy.com.ru/
+ class ProxyList
+
+   def initialize
+     @proxylists = []
+     @urls = []
+     (1..2).each do |i|
+       @urls << "http://proxy.com.ru/list_#{i}.html"
+     end
+   end
+
+   def fetch_list
+     @urls.each do |url|
+       puts proxys_in_url(url).class
+       @proxylists += proxys_in_url(url)
+     end
+     @proxylists.uniq!
+     puts @proxylists
+     puts "Fetched proxys #{@proxylists.size}"
+   end
+
+   alias :start :fetch_list
+
+   def proxys_in_url(url)
+     proxys = []
+     agent = Mechanize.new
+     page = agent.get(url)
+     list = page.search("body font table tr td:last font table tr")
+     list[1..-1].each do |tr|
+       ip = tr.search("td")[1].text
+       port = tr.search("td")[2].text
+       proxy = { ip: ip, port: port }
+       proxy = verify_proxy(proxy) # verification is time-consuming
+       proxys << proxy unless proxy == false
+       puts "add #{proxy} to proxys array..."
+     end
+     return proxys
+   end
+
+   def get_urls
+     return @urls
+   end
+
+   def get_proxylists
+     return @proxylists
+   end
+
+   def save_proxylists
+     pls = @proxylists.to_s
+     puts pls
+     if File.exist?("proxylists.txt")
+       File.rename("proxylists.txt","proxylists.txt.bak")
+     end
+     f = File.new("proxylists.txt","w+")
+     f.write(pls)
+     f.close
+     puts "Save proxylists successfully."
+   end
+
+   def save_all_proxylists
+     pls = @proxylists.to_s
+     puts pls
+     if File.exist?("allproxylists.txt")
+       File.rename("allproxylists.txt","allproxylists.txt.bak")
+     end
+     f = File.new("allproxylists.txt","w+")
+     f.write(pls)
+     f.close
+     puts "Save all proxylists successfully."
+   end
+
+   def verify_proxy(proxy)
+     ip, port = proxy[:ip], proxy[:port]
+     testagent = Mechanize.new
+     testagent.set_proxy ip, port
+     testagent.read_timeout = 10
+     begin
+       page = testagent.get("http://www.baidu.com")
+       if page.title == "百度一下,你就知道" # Baidu homepage title, i.e. the proxy works
+         puts "That's a good proxy"
+         return proxy
+       end
+     rescue => e
+       puts e.message
+       puts "That's a bad proxy"
+       proxy = false
+       return proxy
+     end
+   end
+ end
data/lib/tasks/the_spider_tasks.rake ADDED
@@ -0,0 +1,4 @@
+ # desc "Explaining what the task does"
+ # task :rails_spider do
+ #   # Task goes here
+ # end
data/rakefile ADDED
@@ -0,0 +1,284 @@
+ begin
+   require 'bundler/setup'
+ rescue LoadError
+   puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
+ end
+
+ require 'rdoc/task'
+
+ RDoc::Task.new(:rdoc) do |rdoc|
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = 'RailsSpider'
+   rdoc.options << '--line-numbers'
+   rdoc.rdoc_files.include('README.md')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
+ APP_RAKEFILE = File.expand_path("../test/dummy/Rakefile", __FILE__)
+ load 'rails/tasks/engine.rake'
+
+
+ load 'rails/tasks/statistics.rake'
+
+
+
+ require 'bundler/gem_tasks'
+
+ require 'rake/testtask'
+
+ Rake::TestTask.new(:test) do |t|
+   t.libs << 'test'
+   t.pattern = 'test/**/*_test.rb'
+   t.verbose = false
+ end
+
+ # coding: utf-8
+ #!/usr/bin/env ruby
+
+ #require 'rubygems'
+ #require 'fileutils'
+ #require 'open-uri'
+ #require 'bundler'
+
+ ENV['SPIDER_ENV'] = 'development'
+ require File.expand_path './config/environment'
+
+ include EventSpider
+
+ namespace :first do
+
+   desc 'grab all events of a city for the first time'
+   task :get_all_events do
+     spiders = [Douban]#, Weibo, Waitan, CityMoment]
+     spiders.each do |spider|
+       spider.new.run
+     end
+   end
+
+   # TODO: remove proxy IPs that can no longer be used
+   desc 'get proxy from web'
+   task :test_proxy do
+     proxy_list = EventSpider::Proxy.proxy_list
+     puts proxy_list
+     puts "Proxy IP count: #{proxy_list.size}"
+     pro = EventSpider::Proxy.new
+     proxy_list.each do |proxy|
+       #agent.set_proxy proxy[:ip], proxy[:port]
+       puts "Test proxy IP:#{proxy}"
+       page = pro.get_page_from('http://www.douban.com', proxy, nil, 0)
+       if page
+         puts page.title
+       else
+         puts "Could not get page by IP: #{proxy}"
+       end
+     end
+   end
+
+   desc 'Event sequid'
+   task :set_sequid do
+     es = Event.where(sequid: nil)#.order_by(:created_at.asc)
+     #total_count = es.count
+     #limit_count = 100
+     #page = total_count / limit_count
+     #page += 1 if (total_count % limit_count) > 0
+     i = Event.max(:sequid).to_i + 1
+     #(0..page).each do |pg|
+     es.to_a.each do |e|
+       e.update_attribute(:sequid, i)
+       puts "#{e.created_at} | #{e.int_id} | #{e.sequid}"
+       i += 1
+     end
+     #end
+   end
+
+   desc 'Reset Event sequid number'
+   task :reset_event_sequid do
+     max = Event.max(:sequid)
+     puts "Event sequid max number is #{max}"
+     inc = MongoidAutoInc::Incrementor.new({})
+     res = inc['event_sequid'].set max
+     puts "Set the sequid to the max #{res}"
+   end
+
+ end
+
+ namespace :everyday do
+   desc 'Update grab and Sync image to Qiniu'
+   task :update_and_sync do
+     Rake::Task["everyday:grab_update"].invoke
+     Rake::Task["everyday:event_image"].invoke
+   end
+
+   desc 'update grab every day'
+   task :grab_update do
+     spiders = [Douban, Weibo, Waitan, CityMoment]
+     spiders.each do |spider|
+       spider.new.grab_update
+     end
+   end
+
+   desc 'download event image and sync to qiniu'
+   task :event_image do
+     logger = Logger.new("log/image.log")
+     images_dir = "images/faces/#{Date.today.to_s.gsub("-", "_")}"
+     FileUtils.mkdir_p(images_dir)
+     logger.info "Make image file folder #{images_dir}"
+
+     events = Event.where(filename: nil).to_a
+     logger.info "There are #{events.count} images to download"
+     group_len = 800
+     threads = []
+
+     events.each_slice(group_len).each do |event_group|
+       threads << Thread.new do
+         event_group.each do |ent|
+           begin
+             image_data = open(URI.encode(ent.face)){|f| f.read}
+             filetype = File.extname(ent.face).gsub(/\?.*$/, '')
+             filename = "#{images_dir}/face-#{ent.id.to_s}#{filetype}"
+             logger.info "File type:#{filetype} | name:#{filename}"
+             open(filename, "wb") { |f| f.write(image_data) }
+             logger.info "Download #{filename}"
+             ent.update_attribute(:filename, filename)
+             sleep 1
+           rescue => e
+             logger.error e.message
+             # e.backtrace.each do |msg|
+             #   logger.error msg
+             # end
+             sleep 1
+           end
+         end
+       end
+     end
+     threads.map(&:join)
+
+     if events.count > 0
+       pwd = File.dirname(File.expand_path('.', __FILE__))
+       qiniu_config = "#{pwd}/config/qiniu_conf.json"
+       qiniu_sync = "#{pwd}/vendor/qiniu/qrsync"
+       cmd = "#{qiniu_sync} #{qiniu_config}"
+       logger.info "Sync images to QiNiu"
+       `#{cmd}`
+       logger.info "#{cmd} done"
+     end
+
+     threads = []
+     events = Event.where(poster_width: nil).to_a
+     events.each_slice(group_len).each do |event_group|
+       threads << Thread.new do
+         event_group.each do |ent|
+           next if ent.filename.blank?
+           begin
+             ent.update_image_info
+             logger.info "Update image info success with event:#{ent.id}"
+           rescue => e
+             logger.error e.message
+             #e.backtrace.each{|msg| logger.error msg}
+             logger.error "Event update image info wrong:#{ent.url}"
+           end
+         end
+       end
+     end
+     threads.map(&:join)
+   end
+
+   desc 'update image info'
+   task :update_image_info do
+     logger = Logger.new("#{ENV['SPIDER_ENV']}.log")
+     threads = []
+     group_len = 800
+     events = Event.where(poster_width: nil).to_a
+     logger.info 'Start update image info'
+     events.each_slice(group_len).each do |event_group|
+       threads << Thread.new do
+         event_group.each do |ent|
+           next if ent.filename.blank?
+           begin
+             ent.update_image_info
+             logger.info "Update image info success with event:#{ent.id}"
+           rescue => e
+             logger.error e.message
+             #e.backtrace.each{|msg| logger.error msg}
+             logger.error "Event update image info wrong:#{ent.url}"
+           end
+         end
+       end
+     end
+     threads.map(&:join)
+     logger.info 'End of update image info'
+   end
+
+ end
+
+ namespace :proxy do
+   # Crawl proxy IPs from www.youdaili.com,
+   # verify that they still work,
+   # and save them to the file proxylists.txt
+   desc 'get youdaili proxy IP'
+   task :proxy_youdaili do
+     used_proxy = EventSpider::Proxy.proxy_list.uniq
+     proxy_spider = EventSpider::Proxy.new(used_proxy)
+     proxy_list = proxy_spider.get_youdaili_proxy
+     proxy_list = proxy_spider.validate_proxy(proxy_list)
+     proxy_spider.write_proxy_to_file(proxy_list)
+   end
+
+   # http://www.xici.net.co/nn/
+   # Fetch proxy IPs, verify them, and save them to a local file
+   desc 'get xici.net proxy IP'
+   task :proxy_xici do
+     used_proxy = EventSpider::Proxy.proxy_list.uniq
+     proxy_spider = EventSpider::Proxy.new(used_proxy)
+     proxys = proxy_spider.get_xici_proxy
+     proxys = proxy_spider.validate_proxy(proxys)
+     proxy_spider.write_proxy_to_file(proxys)
+   end
+
+   # Validate the manually maintained local proxy file allproxylists.txt
+   desc 'validate allproxylists.txt proxy IP and save to proxylists.txt'
+   task :validate_allproxylists do
+     proxy_list = EventSpider::Proxy.get_allproxylists.uniq
+     proxy_spider = EventSpider::Proxy.new
+     proxys = proxy_spider.validate_proxy(proxy_list)
+     proxy_spider.write_proxy_to_file(proxys)
+   end
+
+
+   # http://api.map.baidu.com/geocoder/v2/?address=%20%E9%87%8D%E5%BA%86%20%E6%B2%99%E5%9D%AA%E5%9D%9D%E5%8C%BA%20NUTS&output=json&ak=A38d59da730152d77b407446a3c0dd2b&callback=showLocation
+   desc 're-get event location from baidu where location is [0.1,0.1]'
+   task :reget_location do
+     Event.where(location: [0.1, 0.1]).each do |event|
+       next if event.place.nil? || event.place == ''
+       baidu_api = URI.escape("http://api.map.baidu.com/geocoder/v2/?address=#{event.place}&output=json&ak=A38d59da730152d77b407446a3c0dd2b")
+       puts baidu_api
+       begin
+         response = Net::HTTP.get_response(URI(baidu_api))
+         data = response.body
+         result = JSON.parse(data)
+         if result["status"] != 0
+           location = [0.0, 0.0]
+         else
+           location = []
+           location << result["result"]["location"]["lng"]
+           location << result["result"]["location"]["lat"]
+         end
+         puts location
+       rescue Timeout::Error
+         i = i.to_i + 1
+         if i <= 5
+           retry
+         end
+       rescue => e
+         puts event.place
+         puts e
+         location = [0.1, 0.1]
+       end
+     end
+   end
+
+ end
+
+
+
+ task default: :test