rails_spider 0.1.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +7 -0
- data/LICENSE +165 -0
- data/README.md +33 -0
- data/app/assets/config/the_spider_manifest.js +4 -0
- data/app/assets/javascripts/the_spider/application.js +1 -0
- data/app/assets/stylesheets/the_spider/application.css +4 -0
- data/app/controllers/the_spider/application_controller.rb +9 -0
- data/app/controllers/the_spider/locals_controller.rb +62 -0
- data/app/controllers/the_spider/works_controller.rb +60 -0
- data/app/helpers/the_spider/application_helper.rb +4 -0
- data/app/helpers/the_spider/locals_helper.rb +4 -0
- data/app/helpers/the_spider/works_helper.rb +4 -0
- data/app/jobs/the_spider/application_job.rb +4 -0
- data/app/jobs/the_spider/parser_job.rb +11 -0
- data/app/jobs/the_spider/work_job.rb +11 -0
- data/app/mailers/the_spider/application_mailer.rb +6 -0
- data/app/models/rails_spider/application_record.rb +5 -0
- data/app/models/rails_spider/cookie.rb +9 -0
- data/app/models/rails_spider/failed_url.rb +7 -0
- data/app/models/rails_spider/local.rb +14 -0
- data/app/models/rails_spider/work.rb +24 -0
- data/app/views/layouts/the_spider/application.html.erb +14 -0
- data/app/views/the_spider/locals/_form.html.erb +17 -0
- data/app/views/the_spider/locals/edit.html.erb +6 -0
- data/app/views/the_spider/locals/index.html.erb +25 -0
- data/app/views/the_spider/locals/new.html.erb +5 -0
- data/app/views/the_spider/locals/show.html.erb +4 -0
- data/app/views/the_spider/works/_form.html.erb +9 -0
- data/app/views/the_spider/works/edit.html.erb +6 -0
- data/app/views/the_spider/works/index.html.erb +44 -0
- data/app/views/the_spider/works/new.html.erb +5 -0
- data/app/views/the_spider/works/show.html.erb +4 -0
- data/config/routes.rb +8 -0
- data/config/schedule.rb +35 -0
- data/db/migrate/20170502153051_rails_spider_init.rb +38 -0
- data/lib/config/config.rb +27 -0
- data/lib/config/keywords.json +22 -0
- data/lib/config/proxy.json +10 -0
- data/lib/helper/helper.rb +6 -0
- data/lib/helper/location_helper.rb +46 -0
- data/lib/helper/price_helper.rb +23 -0
- data/lib/helper/tag_helper.rb +17 -0
- data/lib/helper/text_helper.rb +41 -0
- data/lib/helper/time_helper.rb +140 -0
- data/lib/logger.rb +146 -0
- data/lib/proxy/allproxylists.txt +2366 -0
- data/lib/proxy/proxy.rb +216 -0
- data/lib/proxy/proxylists.txt +625 -0
- data/lib/rails_spider.rb +10 -0
- data/lib/rails_spider/engine.rb +9 -0
- data/lib/rails_spider/fetchers.rb +2 -0
- data/lib/rails_spider/fetchers/base.rb +146 -0
- data/lib/rails_spider/fetchers/mechanize.rb +83 -0
- data/lib/rails_spider/fetchers/witar.rb +73 -0
- data/lib/rails_spider/parser.rb +14 -0
- data/lib/rails_spider/parser/szlawyers.rb +26 -0
- data/lib/rails_spider/resource.rb +58 -0
- data/lib/rails_spider/strategies.rb +6 -0
- data/lib/rails_spider/version.rb +3 -0
- data/lib/sync_qiniu.rb +35 -0
- data/lib/sync_qiniu/getimages.rb +98 -0
- data/lib/sync_qiniu/getimages_info.rb +37 -0
- data/lib/sync_qiniu/getlocation.rb +48 -0
- data/lib/sync_qiniu/getproxy.rb +95 -0
- data/lib/tasks/the_spider_tasks.rake +4 -0
- data/rakefile +284 -0
- metadata +165 -0
data/lib/sync_qiniu.rb
ADDED
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'fileutils'
+require 'open-uri'
+require 'bundler'
+require 'logger'
+Bundler.require
+
+Mongoid.load!("./config/mongoid.yml","development")
+
+Dir.glob("#{File.dirname(__FILE__)}/models/*.rb").each do |m|
+  require m
+end
+
+FileUtils.mkdir_p("log")
+logger = Logger.new("log/getimage_log.txt")
+
+@events = Event
+
+logger.log(1, 'Prepare folder')
+dir = "images/faces/#{Date.today.to_s.gsub("-", "_")}"
+log = "Try to make dir #{dir}"
+logger.log(1, log)
+FileUtils.mkdir_p(dir)
+
+pwd = File.dirname(File.expand_path('.', __FILE__))
+log = "pwd #{pwd}"
+logger.log(1, log)
+config = "#{pwd}/config/qiniu_conf.json"
+qrsync = "#{pwd}/vendor/qiniu/qrsync"
+log = 'sync to qiniu after download all images.'
+logger.log(1, log)
+cmd = "#{qrsync} #{config}"
+logger.log(1, cmd)
+`#{cmd}`
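Note: the script above builds the qrsync command and runs it with backticks, so its output and exit status are discarded; `logger.log(1, msg)` writes at severity 1, i.e. `Logger::INFO`. Below is a minimal sketch (not part of the gem) of the same shell-out with the exit status checked; the paths mirror the script and are assumptions about the deployed layout.

```ruby
require 'open3'
require 'logger'

logger = Logger.new($stdout)
pwd    = File.dirname(File.expand_path('.', __FILE__))
config = "#{pwd}/config/qiniu_conf.json"      # assumed location of the qiniu config
qrsync = "#{pwd}/vendor/qiniu/qrsync"         # assumed location of the qrsync binary

output, status = Open3.capture2e(qrsync, config) # capture stdout+stderr and the exit status
logger.info(output)
if status.success?
  logger.info('qrsync finished successfully')
else
  logger.error("qrsync failed with exit status #{status.exitstatus}")
end
```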
data/lib/sync_qiniu/getimages.rb
ADDED
@@ -0,0 +1,98 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'fileutils'
+require 'open-uri'
+require 'bundler'
+require 'logger'
+Bundler.require
+
+Mongoid.load!("./config/mongoid.yml","development")
+
+Dir.glob("#{File.dirname(__FILE__)}/models/*.rb").each do |m|
+  require m
+end
+
+Dir.glob("#{File.dirname(__FILE__)}/helper/*.rb").each do |h|
+  require h
+end
+
+FileUtils.mkdir_p("log")
+logger = Logger.new("log/getimage_log.txt")
+
+@events = Event
+
+logger.log(1, 'Prepare folder')
+dir = "images/faces/#{Date.today.to_s.gsub("-", "_")}"
+log = "Try to make dir #{dir}"
+logger.log(1, log)
+FileUtils.mkdir_p(dir)
+
+@events = @events.where(filename: nil)
+
+count_to_download = @events.count
+
+log = "Will download #{count_to_download} events face"
+logger.log(1, log)
+
+@events = @events.to_a
+
+group_len = 800
+
+threads = []
+
+@events.each_slice(group_len).each do |event_group|
+  threads << Thread.new do
+    event_group.each do |e|
+      begin
+        data = open(URI.encode(e.face)){ |f| f.read }
+
+        filetype = File.extname(e.face).gsub(/\?.*$/, "")
+        filename = "face-" + e.id.to_s + filetype
+        filename = "#{dir}/#{filename}"
+        open(filename, "wb") { |f| f.write(data) }
+        log = "Download #{filename} to images/..."
+        logger.log(1, log)
+        e.update_attribute(:filename, filename)
+        sleep 0.5
+      rescue => se
+        log = "throw a exception:\n#{se.message}\n#{se.backtrace}\n#{e.url}"
+        logger.log(1, log)
+        sleep 1
+      end
+    end
+  end
+end
+
+threads.map(&:join)
+
+# sync to qiniu after download all images.
+if count_to_download > 0
+  pwd = File.dirname(File.expand_path('.', __FILE__))
+  log = "pwd #{pwd}"
+  logger.log(1, log)
+  config = "#{pwd}/config/qiniu_conf.json"
+  qrsync = "#{pwd}/vendor/qiniu/qrsync"
+  log = 'sync to qiniu after download all images.'
+  logger.log(1, log)
+  cmd = "#{qrsync} #{config}"
+  logger.log(1, cmd)
+  `#{cmd}`
+end
+
+threads = []
+@events = Event.where(poster_width: nil).to_a
+@events.each_slice(group_len).each do |event_group|
+  threads << Thread.new do
+    event_group.each do |e|
+      next if e.filename.blank?
+      begin
+        e.update_image_info
+      rescue => se
+        log = "#{Time.now}\n#{se.message}\n#{se.backtrace}#{e.url}"
+        logger.log(1, log)
+      end
+    end
+  end
+end
+
+threads.map(&:join)
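Note: getimages.rb splits the pending events into slices of 800 and downloads each slice in its own thread before shelling out to qrsync. A self-contained sketch of that slice-per-thread download pattern follows; the `urls` array, target directory, and group size are placeholders rather than anything from the gem, and `URI.open`/`File.binwrite` are used because `URI.encode` was removed in Ruby 3.0 and calling `Kernel#open` on a URL is deprecated.

```ruby
require 'open-uri'
require 'fileutils'
require 'date'

urls      = ['https://example.com/a.jpg', 'https://example.com/b.jpg'] # placeholder list
dir       = "images/faces/#{Date.today.to_s.gsub('-', '_')}"
group_len = 2                                                          # the gem uses 800
FileUtils.mkdir_p(dir)

threads = urls.each_slice(group_len).map do |group|
  Thread.new do
    group.each do |url|
      begin
        filename = File.join(dir, File.basename(URI(url).path))
        URI.open(url) { |remote| File.binwrite(filename, remote.read) } # URI.open replaces Kernel#open for URLs
      rescue StandardError => e
        warn "failed #{url}: #{e.message}"
        sleep 1
      end
    end
  end
end
threads.each(&:join)
```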
data/lib/sync_qiniu/getimages_info.rb
ADDED
@@ -0,0 +1,37 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'fileutils'
+require 'open-uri'
+require 'bundler'
+Bundler.require
+
+Mongoid.load!("./config/mongoid.yml","development")
+
+Dir.glob("#{File.dirname(__FILE__)}/models/*.rb").each do |m|
+  require m
+end
+
+group_len = 200
+id = "534274765275627a63000000"
+count = Event.where(poster_width: nil, :id.gt => id).count
+while count > 100
+  threads = []
+  @events = Event.where(poster_width: nil, :id.gt => id).limit(1000).to_a
+  @events.each_slice(group_len).each do |event_group|
+    threads << Thread.new do
+      event_group.each do |e|
+        next if e.filename.blank?
+        begin
+          e.update_image_info
+        rescue => se
+          log = "#{Time.now}\n#{se.message}\n#{se.backtrace}#{e.url}"
+          puts log
+        end
+      end
+    end
+  end
+  threads.map(&:join)
+  id = @events.last.id.to_s
+  count = Event.where(poster_width: nil, :id.gt => id).count
+end
+
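Note: getimages_info.rb pages through events with `:id.gt => id`, advancing the cursor to the last processed id after each pass and stopping when fewer than 100 remain. A generic sketch of that cursor-based batching, with a hypothetical `fetch_batch` callable standing in for the Mongoid query:

```ruby
# `fetch_batch` is a hypothetical stand-in for
# Event.where(poster_width: nil, :id.gt => cursor).limit(...).to_a
def process_in_batches(fetch_batch, stop_below: 100)
  cursor = nil
  loop do
    batch = fetch_batch.call(cursor)
    break if batch.size < stop_below          # mirrors `while count > 100`
    batch.each { |record| yield record }
    cursor = batch.last[:id]                  # advance the cursor past this page
  end
end

# Usage with an in-memory array standing in for the collection:
records = (1..250).map { |i| { id: i } }
fetch   = ->(cursor) { records.select { |r| cursor.nil? || r[:id] > cursor }.first(100) }
process_in_batches(fetch) { |r| r[:id] }
```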
data/lib/sync_qiniu/getlocation.rb
ADDED
@@ -0,0 +1,48 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'bundler'
+require 'net/http'
+Bundler.require
+
+Mongoid.load!("./config/mongoid.yml","development")
+
+Dir.glob("#{File.dirname(__FILE__)}/models/*.rb").each do |m|
+  require m
+end
+
+@events = Event.where(:source.ne => "douban") # can not use only because of update attribute
+
+@events.each do |e|
+  puts "ID: #{e.id}"
+  puts e.place
+  url = URI.escape("http://api.map.baidu.com/geocoder/v2/?address=#{e.place}&output=json&ak=A38d59da730152d77b407446a3c0dd2b")
+  # Geocoding API: http://developer.baidu.com/map/webservice-geocoding.htm
+  # http://api.map.baidu.com/geocoder/v2/?address=%E5%BE%90%E5%AE%B6%E6%B1%87&output=json&ak=A38d59da730152d77b407446a3c0dd2b&callback=showLocation
+  begin
+    response = Net::HTTP.get_response(URI(url))
+    puts response.body
+    data = response.body
+  rescue SocketError
+    sleep 10
+    retry
+  end
+  result = JSON.parse(data)
+  puts result
+  # status codes are documented at: http://developer.baidu.com/map/webservice-geocoding.htm#.E6.8E.A5.E5.8F.A3.E7.A4.BA.E4.BE.8A
+  if result["status"] != 0
+    e.location = [0.0, 0.0]
+    e.save
+    puts "This place can not translate to location......"
+  else
+    # {"status"=>0, "result"=>{"location"=>{"lng"=>121.48026424818, "lat"=>31.229092805768}, "precise"=>1, "confidence"=>80, "level"=>"道路"}}
+    puts result["result"]["location"]
+    store_result = []
+    store_result << result["result"]["location"]["lng"]
+    store_result << result["result"]["location"]["lat"]
+    puts store_result
+    e.location = store_result
+    e.save
+    puts "save location #{store_result} of ID: #{e.id} success!"
+    puts '--------------------'
+  end
+end
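Note: getlocation.rb resolves each event's `place` through Baidu's geocoder and stores `[lng, lat]`, falling back to `[0.0, 0.0]` when the response status is non-zero. A hedged sketch of a single request against that endpoint follows; the address and the `YOUR_BAIDU_AK` key are placeholders, and the query is built with `URI.encode_www_form` because `URI.escape` no longer exists in current Ruby.

```ruby
require 'net/http'
require 'json'
require 'uri'

# 'YOUR_BAIDU_AK' is a placeholder API key, not the one used by the script.
def geocode(place, ak)
  query    = URI.encode_www_form(address: place, output: 'json', ak: ak)
  response = Net::HTTP.get_response(URI("http://api.map.baidu.com/geocoder/v2/?#{query}"))
  result   = JSON.parse(response.body)
  return [0.0, 0.0] unless result['status'] == 0 # non-zero status: address could not be resolved

  location = result['result']['location']
  [location['lng'], location['lat']]
end

# geocode('徐家汇', 'YOUR_BAIDU_AK')  #=> [lng, lat]
```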
data/lib/sync_qiniu/getproxy.rb
ADDED
@@ -0,0 +1,95 @@
+# encoding: utf-8
+require 'rubygems'
+require 'bundler'
+Bundler.require
+
+# Fetch proxy from: http://proxy.com.ru/
+class ProxyList
+
+  def initialize
+    @proxylists = []
+    @urls = []
+    (1..2).each do |i|
+      @urls << "http://proxy.com.ru/list_#{i}.html"
+    end
+  end
+
+  def fetch_list
+    @urls.each do |url|
+      puts proxys_in_url(url).class
+      @proxylists += proxys_in_url(url)
+    end
+    @proxylists.uniq!
+    puts @proxylists
+    puts "Fetched proxys #{@proxylists.size}"
+  end
+
+  alias :start :fetch_list
+
+  def proxys_in_url(url)
+    proxys = []
+    agent = Mechanize.new
+    page = agent.get(url)
+    list = page.search("body font table tr td:last font table tr")
+    list[1..-1].each do |tr|
+      ip = tr.search("td")[1].text
+      port = tr.search("td")[2].text
+      proxy = { ip: ip, port: port }
+      proxy = verify_proxy(proxy) # rather time-consuming
+      proxys << proxy unless proxy == false
+      puts "add #{proxy} to proxys array..."
+    end
+    return proxys
+  end
+
+  def get_urls
+    return @urls
+  end
+
+  def get_proxylists
+    return @proxylists
+  end
+
+  def save_proxylists
+    pls = @proxylists.to_s
+    puts pls
+    if File.exist?("proxylists.txt")
+      File.rename("proxylists.txt","proxylists.txt.bak")
+    end
+    f = File.new("proxylists.txt","w+")
+    f.write(pls)
+    f.close
+    puts "Save proxylists successfully."
+  end
+
+  def save_all_proxylists
+    pls = @proxylists.to_s
+    puts pls
+    if File.exist?("allproxylists.txt")
+      File.rename("allproxylists.txt","allproxylists.txt.bak")
+    end
+    f = File.new("allproxylists.txt","w+")
+    f.write(pls)
+    f.close
+    puts "Save all proxylists successfully."
+  end
+
+  def verify_proxy(proxy)
+    ip, port = proxy[:ip], proxy[:port]
+    testagent = Mechanize.new
+    testagent.set_proxy ip, port
+    testagent.read_timeout = 10
+    begin
+      page = testagent.get("http://www.baidu.com")
+      if page.title == "百度一下,你就知道"
+        return proxy
+        puts 'That a good proxy'
+      end
+    rescue => e
+      puts e.message
+      puts 'That a bad proxy'
+      proxy = nil
+      return
+    end
+  end
+end
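Note: in `verify_proxy` above, the `puts` after `return proxy` can never run, and a failed check falls through to an implicit `nil`. A small stand-alone sketch of the same check with an explicit true/false result; Mechanize is assumed to be installed, and the test URL and expected title are the ones used by the method above.

```ruby
require 'mechanize'

def working_proxy?(proxy, test_url: 'http://www.baidu.com', expected_title: '百度一下,你就知道')
  agent = Mechanize.new
  agent.set_proxy(proxy[:ip], proxy[:port].to_i)
  agent.read_timeout = 10
  agent.open_timeout = 10
  agent.get(test_url).title == expected_title   # true only when the page loads through the proxy
rescue StandardError => e
  warn "bad proxy #{proxy[:ip]}:#{proxy[:port]} (#{e.message})"
  false
end

# working_proxy?(ip: '127.0.0.1', port: '8080')
```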
data/rakefile
ADDED
@@ -0,0 +1,284 @@
+begin
+  require 'bundler/setup'
+rescue LoadError
+  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
+end
+
+require 'rdoc/task'
+
+RDoc::Task.new(:rdoc) do |rdoc|
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = 'RailsSpider'
+  rdoc.options << '--line-numbers'
+  rdoc.rdoc_files.include('README.md')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
+
+APP_RAKEFILE = File.expand_path("../test/dummy/Rakefile", __FILE__)
+load 'rails/tasks/engine.rake'
+
+
+load 'rails/tasks/statistics.rake'
+
+
+
+require 'bundler/gem_tasks'
+
+require 'rake/testtask'
+
+Rake::TestTask.new(:test) do |t|
+  t.libs << 'test'
+  t.pattern = 'test/**/*_test.rb'
+  t.verbose = false
+end
+
+# coding: utf-8
+#!/usr/bin/env ruby
+
+#require 'rubygems'
+#require 'fileutils'
+#require 'open-uri'
+#require 'bundler'
+
+ENV['SPIDER_ENV'] = 'development'
+require File.expand_path './config/environment'
+
+include EventSpider
+
+namespace :first do
+
+  desc 'grab all event of a city for the first time'
+  task :get_all_events do
+    spiders = [Douban]#, Weibo, Waitan, CityMoment]
+    spiders.each do |spider|
+      spider.new.run
+    end
+  end
+
+  # TODO: delete proxy IPs that can no longer be used
+  desc 'get proxy from web'
+  task :test_proxy do
+    proxy_list = EventSpider::Proxy.proxy_list
+    puts proxy_list
+    puts "Proxy IP count: #{proxy_list.size}"
+    pro = EventSpider::Proxy.new
+    proxy_list.each do |proxy|
+      #agent.set_proxy proxy[:ip], proxy[:port]
+      puts "Test proxy IP:#{proxy}"
+      page = pro.get_page_from('http://www.douban.com', proxy, nil, 0)
+      if page
+        puts page.title
+      else
+        puts "Not get page by IP: #{proxy}"
+      end
+    end
+  end
+
+  desc 'Event sequid'
+  task :set_sequid do
+    es = Event.where(sequid: nil)#.order_by(:created_at.asc)
+    #total_count = es.count
+    #limit_count = 100
+    #page = total_count / limit_count
+    #page += 1 if (total_count % limit_count) > 0
+    i = Event.max(:sequid).to_i + 1
+    #(0..page).each do |pg|
+    es.to_a.each do |e|
+      e.update_attribute(:sequid, i)
+      puts "#{e.created_at} | #{e.int_id} | #{e.sequid}"
+      i += 1
+    end
+    #end
+  end
+
+  desc 'Reset Event sequid number'
+  task :reset_event_sequid do
+    max = Event.max(:sequid)
+    puts "Event sequid max number is #{max}"
+    inc=MongoidAutoInc::Incrementor.new({})
+    res = inc['event_sequid'].set max
+    puts "Set the sequid to the max #{res}"
+  end
+
+end
+
+namespace :everyday do
+  desc 'Update grab and Sync image to Qiniu'
+  task :update_and_sync do
+    Rake::Task["everyday:grab_update"].invoke
+    Rake::Task["everyday:event_image"].invoke
+  end
+
+  desc 'update grab every day'
+  task :grab_update do
+    spiders = [Douban, Weibo, Waitan, CityMoment]
+    spiders.each do |spider|
+      spider.new.grab_update
+    end
+  end
+
+  desc 'download event image and sync to qiniu'
+  task :event_image do
+    logger = Logger.new("log/image.log")
+    images_dir = "images/faces/#{Date.today.to_s.gsub("-", "_")}"
+    FileUtils.mkdir_p(images_dir)
+    logger.info "Make image file folder #{images_dir}"
+
+    events = Event.where(filename: nil).to_a
+    logger.info "There are #{events.count} images need to download"
+    group_len = 800
+    threads = []
+
+    events.each_slice(group_len).each do |event_group|
+      threads << Thread.new do
+        event_group.each do |ent|
+          begin
+            image_data = open(URI.encode(ent.face)){|f| f.read}
+            filetype = File.extname(ent.face).gsub(/\?.*$/, '')
+            filename = "#{images_dir}/face-#{ent.id.to_s}#{filetype}"
+            logger.info "File type:#{filetype} | name:#{filename}"
+            open(filename, "wb") { |f| f.write(image_data) }
+            logger.info "Download #{filename}"
+            ent.update_attribute(:filename, filename)
+            sleep 1
+          rescue => e
+            logger.error e.message
+            # e.backtrace.each do |msg|
+            # logger.error msg
+            # end
+            sleep 1
+          end
+        end
+      end
+    end
+    threads.map(&:join)
+
+    if events.count > 0
+      pwd = File.dirname(File.expand_path('.', __FILE__))
+      qiniu_config = "#{pwd}/config/qiniu_conf.json"
+      qiniu_sync = "#{pwd}/vendor/qiniu/qrsync"
+      cmd = "#{qiniu_sync} #{qiniu_config}"
+      logger.info "Sync images to QiNiu"
+      `#{cmd}`
+      logger.info "#{cmd} done"
+    end
+
+    threads = []
+    events = Event.where(poster_width: nil).to_a
+    events.each_slice(group_len).each do |event_group|
+      threads << Thread.new do
+        event_group.each do |ent|
+          next if ent.filename.blank?
+          begin
+            ent.update_image_info
+            logger.info "Update image info success with event:#{ent.id}"
+          rescue => e
+            logger.error e.message
+            #e.backtrace.each{|msg| logger.error msg}
+            logger.error "Event update image info wrong:#{ent.url}"
+          end
+        end
+      end
+    end
+    threads.map(&:join)
+  end
+
+  desc 'upate iamge info'
+  task :update_image_info do
+    logger = Logger.new("#{ENV['SPIDER_ENV']}.log")
+    threads = []
+    group_len = 800
+    events = Event.where(poster_width: nil).to_a
+    logger.info 'Start update image info'
+    events.each_slice(group_len).each do |event_group|
+      threads << Thread.new do
+        event_group.each do |ent|
+          next if ent.filename.blank?
+          begin
+            ent.update_image_info
+            logger.info "Update image info success with event:#{ent.id}"
+          rescue => e
+            logger.error e.message
+            #e.backtrace.each{|msg| logger.error msg}
+            logger.error "Event update image info wrong:#{ent.url}"
+          end
+        end
+      end
+    end
+    threads.map(&:join)
+    logger.info 'End of update image info'
+  end
+
+end
+
+namespace :proxy do
+  # Crawl proxy IPs from www.youdaili.com,
+  # verify whether they work,
+  # and save them to the file proxylists.txt
+  desc 'get youdaili proxy IP'
+  task :proxy_youdaili do
+    used_proxy = EventSpider::Proxy.proxy_list.uniq
+    proxy_spider = EventSpider::Proxy.new(used_proxy)
+    proxy_list = proxy_spider.get_youdaili_proxy
+    proxy_list = proxy_spider.validate_proxy(proxy_list)
+    proxy_spider.write_proxy_to_file(proxy_list)
+  end
+
+  # http://www.xici.net.co/nn/
+  # Fetch proxy IPs, validate them, then save them to a local file
+  desc 'get xici.net proxy IP'
+  task :proxy_xici do
+    used_proxy = EventSpider::Proxy.proxy_list.uniq
+    proxy_spider = EventSpider::Proxy.new(used_proxy)
+    proxys = proxy_spider.get_xici_proxy
+    proxys = proxy_spider.validate_proxy(proxys)
+    proxy_spider.write_proxy_to_file(proxys)
+  end
+
+  # Validate the manually entered local proxy file allproxylists.txt
+  desc 'validate allproxylists.txt proxy IP and save to proxylists.txt'
+  task :validate_allproxylists do
+    proxy_list = EventSpider::Proxy.get_allproxylists.uniq
+    proxy_spider = EventSpider::Proxy.new
+    proxys = proxy_spider.validate_proxy(proxy_list)
+    proxy_spider.write_proxy_to_file(proxys)
+  end
+
+
+  # http://api.map.baidu.com/geocoder/v2/?address=%20%E9%87%8D%E5%BA%86%20%E6%B2%99%E5%9D%AA%E5%9D%9D%E5%8C%BA%20NUTS&output=json&ak=A38d59da730152d77b407446a3c0dd2b&callback=showLocation
+  desc 're-get event location from baidu with location is [0.1,0.1]'
+  task :reget_location do
+    Event.where(location: [0.1, 0.1]).each do |event|
+      next if event.place.nil? || event.place == ''
+      baidu_api = URI.escape("http://api.map.baidu.com/geocoder/v2/?address=#{event.place}&output=json&ak=A38d59da730152d77b407446a3c0dd2b")
+      puts baidu_api
+      begin
+        response = Net::HTTP.get_response(URI(baidu_api))
+        data = response.body
+        result = JSON.parse(data)
+        if result["status"] != 0
+          location = [0.0, 0.0]
+        else
+          location = []
+          location << result["result"]["location"]["lng"]
+          location << result["result"]["location"]["lat"]
+        end
+        puts location
+      rescue Timeout::Error
+        i ||= 0
+        if i <= 5
+          retry
+        end
+      rescue => e
+        puts event.place
+        puts e
+        location = [0.1, 0.1]
+      end
+    end
+  end
+
+end
+
+
+
+task default: :test
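Note: in the `:reget_location` task, the `Timeout::Error` branch sets `i ||= 0` but never increments it, so `i <= 5` always holds and the request retries indefinitely on a persistent timeout. A hedged sketch of a bounded retry around the same kind of call; the helper name and attempt count are illustrative, not part of the gem.

```ruby
require 'net/http'

# Retry the request a fixed number of times, then give up.
def get_with_retries(uri, attempts: 5)
  tries = 0
  begin
    Net::HTTP.get_response(uri)
  rescue Net::OpenTimeout, Net::ReadTimeout, Timeout::Error => e
    tries += 1
    retry if tries < attempts
    raise e # re-raise instead of retrying forever
  end
end

# get_with_retries(URI('http://api.map.baidu.com/geocoder/v2/?output=json'))
```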