rails_spider 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +165 -0
- data/README.md +33 -0
- data/app/assets/config/the_spider_manifest.js +4 -0
- data/app/assets/javascripts/the_spider/application.js +1 -0
- data/app/assets/stylesheets/the_spider/application.css +4 -0
- data/app/controllers/the_spider/application_controller.rb +9 -0
- data/app/controllers/the_spider/locals_controller.rb +62 -0
- data/app/controllers/the_spider/works_controller.rb +60 -0
- data/app/helpers/the_spider/application_helper.rb +4 -0
- data/app/helpers/the_spider/locals_helper.rb +4 -0
- data/app/helpers/the_spider/works_helper.rb +4 -0
- data/app/jobs/the_spider/application_job.rb +4 -0
- data/app/jobs/the_spider/parser_job.rb +11 -0
- data/app/jobs/the_spider/work_job.rb +11 -0
- data/app/mailers/the_spider/application_mailer.rb +6 -0
- data/app/models/rails_spider/application_record.rb +5 -0
- data/app/models/rails_spider/cookie.rb +9 -0
- data/app/models/rails_spider/failed_url.rb +7 -0
- data/app/models/rails_spider/local.rb +14 -0
- data/app/models/rails_spider/work.rb +24 -0
- data/app/views/layouts/the_spider/application.html.erb +14 -0
- data/app/views/the_spider/locals/_form.html.erb +17 -0
- data/app/views/the_spider/locals/edit.html.erb +6 -0
- data/app/views/the_spider/locals/index.html.erb +25 -0
- data/app/views/the_spider/locals/new.html.erb +5 -0
- data/app/views/the_spider/locals/show.html.erb +4 -0
- data/app/views/the_spider/works/_form.html.erb +9 -0
- data/app/views/the_spider/works/edit.html.erb +6 -0
- data/app/views/the_spider/works/index.html.erb +44 -0
- data/app/views/the_spider/works/new.html.erb +5 -0
- data/app/views/the_spider/works/show.html.erb +4 -0
- data/config/routes.rb +8 -0
- data/config/schedule.rb +35 -0
- data/db/migrate/20170502153051_rails_spider_init.rb +38 -0
- data/lib/config/config.rb +27 -0
- data/lib/config/keywords.json +22 -0
- data/lib/config/proxy.json +10 -0
- data/lib/helper/helper.rb +6 -0
- data/lib/helper/location_helper.rb +46 -0
- data/lib/helper/price_helper.rb +23 -0
- data/lib/helper/tag_helper.rb +17 -0
- data/lib/helper/text_helper.rb +41 -0
- data/lib/helper/time_helper.rb +140 -0
- data/lib/logger.rb +146 -0
- data/lib/proxy/allproxylists.txt +2366 -0
- data/lib/proxy/proxy.rb +216 -0
- data/lib/proxy/proxylists.txt +625 -0
- data/lib/rails_spider.rb +10 -0
- data/lib/rails_spider/engine.rb +9 -0
- data/lib/rails_spider/fetchers.rb +2 -0
- data/lib/rails_spider/fetchers/base.rb +146 -0
- data/lib/rails_spider/fetchers/mechanize.rb +83 -0
- data/lib/rails_spider/fetchers/witar.rb +73 -0
- data/lib/rails_spider/parser.rb +14 -0
- data/lib/rails_spider/parser/szlawyers.rb +26 -0
- data/lib/rails_spider/resource.rb +58 -0
- data/lib/rails_spider/strategies.rb +6 -0
- data/lib/rails_spider/version.rb +3 -0
- data/lib/sync_qiniu.rb +35 -0
- data/lib/sync_qiniu/getimages.rb +98 -0
- data/lib/sync_qiniu/getimages_info.rb +37 -0
- data/lib/sync_qiniu/getlocation.rb +48 -0
- data/lib/sync_qiniu/getproxy.rb +95 -0
- data/lib/tasks/the_spider_tasks.rake +4 -0
- data/rakefile +284 -0
- metadata +165 -0
data/lib/rails_spider.rb
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
module RailsSpider
  # Base class for spiders. Subclasses supply a Mechanize agent (@mechanize),
  # a proxy pool (@proxy), loggers (logger / error_log) and link lists
  # (@links / @newlinks); this class provides page fetching with proxy
  # rotation and bounded retries, plus event persistence helpers.
  class Fetcher

    def initialize
      @page = ''
    end

    # Resolves the event model class from configuration.
    # Memoized with ||= so the constant lookup runs only once per fetcher
    # (the original re-ran `constantize` on every call).
    def event_class
      @event_class ||= EventSpider.config.event_class.constantize
    end

    # Fetches +url+ through a (possibly fresh) proxy, retrying up to +repeat+
    # times. On final failure the URL is recorded and nil is returned.
    #
    # proxy_hash  - optional {ip:, port:} forcing a specific proxy
    # header_hash - optional request headers to install on the agent
    # repeat      - maximum number of retries after the first failure
    def page_by_url(url, proxy_hash=nil, header_hash=nil, repeat=5)
      logger.info "Grab the page #{url}"
      begin
        change_another_proxy(proxy_hash, header_hash)
        logger.info "Changed to a new proxy: #{@mechanize.proxy_addr}:#{@mechanize.proxy_port} for #{url}"
        page = @mechanize.get(url)
        logger.info "Has been get the page #{url}"
        page
      rescue => e
        logger.error e.message
        e.backtrace.each do |msg|
          error_log.error msg
        end
        error_log.error "\n"
        # Bounded retry: ||= only initializes the counter on the first
        # failure, so it survives each `retry`.
        i ||= 0
        if i < repeat
          logger.info "Retry to get page for #{i} times"
          i += 1
          retry
        else
          # Classify the failed URL by its host for later inspection.
          if url.include?('douban')
            source = 'douban'
          elsif url.include?('weibo')
            source = 'weibo'
          elsif url.include?('rockbundartmuseum')
            source = 'waitan'
          elsif url.include?('citymoments')
            source = 'citymoment'
          else
            source = 'else'
          end
          # NOTE(review): the migration list names a `failed_url` model —
          # confirm FailUrl is the right constant.
          FailUrl.create(url: url, source: source, flag: "spider")
          logger.warn "Can't grab url #{url}"
          return
        end
      end
    end

    # Saves the page body under html/<today>/<path>; failures are logged
    # and swallowed (best effort).
    def save_page(page)
      begin
        page.save_as("html/#{Date.today.to_s}/#{page.uri.to_s.split('http://').last.chomp('/')}")
      rescue => e
        logger.error e.message
        logger.warn "cann't save page #{page.uri}"
      end
    end

    # Points the agent at the given proxy, or at a random one from the
    # pool when no explicit ip/port pair is supplied.
    def change_another_proxy(proxy_hash=nil, header_hash=nil)
      if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
        ip = proxy_hash[:ip]
        port = proxy_hash[:port]
      else
        index = rand(@proxy.size)
        ip = @proxy[index][:ip]
        port = @proxy[index][:port]
      end
      @mechanize.set_proxy ip, port

      @mechanize.request_headers = header_hash unless header_hash.nil?
    end

    # True when an event with this URL already exists, i.e. the URL has
    # already been grabbed.
    def is_grab?(url)
      event_class.where(url: url).exists?
    end

    # Crawls every configured list link.
    def run
      logger.info "Start #{self.class} Spider..."

      @links.each do |link|
        #@city = link.values.first
        grab_list_link(link.keys.first)
      end

      logger.info "End of #{self.class} Spider..."
    end

    # Crawls the incremental-update links; each entry may carry its city.
    def grab_update
      logger.info "Start #{self.class} Spider grab_update."

      @newlinks.each do |link|
        @city = link['city'] unless link['city'].blank?
        grab_list_link(link['url'])
      end

      logger.info "End of #{self.class} Spider grab_update."
    end

    # Builds and saves an Event from +event_hash+, wiring up kind, subkind
    # and tags. Skips blank input and URLs that already exist.
    def create_event(event_hash)
      if event_hash.blank?
        logger.warn "Can't create event by blank data"
        return
      end
      if is_existed?(event_hash)
        logger.warn "Parameter:#{event_hash} has been existed can't to create"
        return
      end
      event = Event.new(event_hash)
      if event_hash[:place].blank?
        # No venue: flag the event as incomplete.
        event.status = -1
      end
      event.kind_id = Kind.find_or_create_by(name: event_hash[:kind]).id unless event_hash[:kind].blank?
      event.subkind_id = set_subkind_id(event_hash[:subkind]) unless event_hash[:subkind].blank?
      if event_hash[:tags]
        event_hash[:tags].each do |t|
          # NOTE(review): event is not yet saved here — this relies on the
          # ODM assigning event.id at instantiation (Mongoid-style `max`
          # below suggests that); verify.
          EventTag.create(event_id: event.id, tag_id: Tag.find_or_create_by(name: t).id)
        end
      end
      # Compute the current maximum once (the original queried it twice).
      max_int_id = Event.max(:int_id)
      event.int_id = max_int_id.blank? ? 1 : max_int_id + 1
      event.save
      unless event.errors.blank?
        logger.info event.errors.full_messages.join(' / ')
      else
        logger.info 'Save event success'
      end
    end

    # True when an event with the same URL is already stored.
    def is_existed?(event_hash)
      #if event_hash[:event_id] && event_class.where(event_id: event_hash[:event_id]).first
      #  return true
      #end
      # TODO title and city are the same
      #if event_hash[:title] && event_class.where(title: event_hash[:title]).first
      #  return true
      #end
      if event_hash[:url] && event = event_class.where(url: event_hash[:url]).first
        logger.warn "#{event_hash[:url]} has been exist in #{event.id}"
        return true
      end
      return false
    end

    def keep_on?; return true end # keep on grab?

  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'mechanize'
require 'rails_spider/fetchers/base'

module RailsSpider
  # Fetcher backed by the Mechanize HTTP client. Exposes raw page access,
  # the parsed <body>, and all resolved link URLs of a page.
  class Mechanize < Fetcher
    attr_accessor :mechanize, :logger

    def initialize
      super
      # ::Mechanize — the top-level gem class, not this subclass.
      @mechanize = ::Mechanize.new
      @mechanize.open_timeout = 20
      # Parse every response as HTML regardless of Content-Type.
      @mechanize.pluggable_parser.default = @mechanize.pluggable_parser['text/html']
      @logger = Logger.new STDOUT
    end

    # Fetches and returns the Mechanize page for +url+.
    def page(url)
      mechanize.get(url)
    end

    # Returns the parsed <body> node set of the page at +url+.
    def body(url)
      page(url).search('body')
    end

    # Returns the absolute URL of every link on the page at +url+;
    # links with unsupported schemes (mailto:, javascript:) map to ''.
    def links(url)
      page(url).links.map do |link|
        begin
          link.resolved_uri.to_s
        rescue ::Mechanize::UnsupportedSchemeError
          ''
        end
      end
    end

    # change_another_proxy, is_grab?, run, grab_update and is_existed?
    # were byte-identical copies of the Fetcher implementations; the
    # inherited versions are used instead.
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'mechanize'

module RailsSpider
  # Alternate Mechanize-backed fetcher (witar variant).
  # NOTE(review): this file does not require the base Fetcher — confirm
  # load order guarantees the superclass is defined first.
  class Mechanize < Fetcher
    attr_accessor :mechanize, :logger

    def initialize
      super
      # BUG FIX: inside `class Mechanize` the bare constant `Mechanize`
      # resolved to this very subclass, so `Mechanize.new` recursed
      # infinitely. `::Mechanize` names the gem's top-level class.
      @mechanize = ::Mechanize.new
      @mechanize.open_timeout = 20
      # Parse every response as HTML regardless of Content-Type.
      @mechanize.pluggable_parser.default = @mechanize.pluggable_parser['text/html']
      @logger = Logger.new STDOUT
    end

    # Saves the page body under html/<today>/<path>; failures are logged
    # and swallowed (best effort).
    def save_page(page)
      begin
        page.save_as("html/#{Date.today.to_s}/#{page.uri.to_s.split('http://').last.chomp('/')}")
      rescue => e
        logger.error e.message
        logger.warn "cann't save page #{page.uri}"
      end
    end

    # Points the agent at the given proxy, or at a random one from the
    # pool when no explicit ip/port pair is supplied.
    def change_another_proxy(proxy_hash=nil, header_hash=nil)
      if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
        ip = proxy_hash[:ip]
        port = proxy_hash[:port]
      else
        index = rand(@proxy.size)
        ip = @proxy[index][:ip]
        port = @proxy[index][:port]
      end
      @mechanize.set_proxy ip, port

      @mechanize.request_headers = header_hash unless header_hash.nil?
    end

    # True when an event with this URL already exists (already grabbed).
    def is_grab?(url)
      event_class.where(url: url).exists?
    end

    # Crawls every configured list link.
    def run
      logger.info "Start #{self.class} Spider..."

      @links.each do |link|
        #@city = link.values.first
        grab_list_link(link.keys.first)
      end

      logger.info "End of #{self.class} Spider..."
    end

    # Crawls the incremental-update links; each entry may carry its city.
    def grab_update
      logger.info "Start #{self.class} Spider grab_update."

      @newlinks.each do |link|
        @city = link['city'] unless link['city'].blank?
        grab_list_link(link['url'])
      end

      logger.info "End of #{self.class} Spider grab_update."
    end

    # True when an event with the same URL is already stored.
    def is_existed?(event_hash)
      if event_hash[:url] && event = event_class.where(url: event_hash[:url]).first
        logger.warn "#{event_hash[:url]} has been exist in #{event.id}"
        return true
      end
      return false
    end

  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module RailsSpider
  # Parser for lawyer detail pages: each reader extracts the text of a
  # specific <span> from the parsed document (`doc`, provided by Parser).
  class Szlawyers < Parser

    # reader method => CSS selector of the <span> holding its value
    FIELD_SELECTORS = {
      name:     'span#lawlist_LawerName',
      sex:      'span#lawlist_LawerSex',
      office:   'span#lawlist_Enterprise',
      identify: 'span#lawlist_LawerqualNo',   # qualification certificate number
      time:     'span#lawlist_dtLawerqualNo'
    }.freeze

    FIELD_SELECTORS.each do |reader, selector|
      define_method(reader) { doc.at_css(selector).text }
    end

  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rails_spider/fetchers/mechanize'

module RailsSpider
  # Crawls a paginated listing described by a +work+ record (host,
  # list_path, item_path, page_params), saving each matching item page
  # as a Local record.
  class Resource
    attr_reader :fetcher, :work, :host, :item_path, :list_path, :page_params
    attr_accessor :page

    # Replacement for each :symbol segment of item_path when building the
    # matching regexp: one path segment without '/', '.' or '?'.
    DEFAULT_EXP = "([^\/.?]+)".freeze
    SYMBOL_EXP = /:\w+/

    # work    - record supplying host, list_path, item_path, page_params
    # options - accepted for forward compatibility; currently unused
    def initialize(work, **options)
      @work = work
      @host = work.host
      @list_path = work.list_path
      @item_path = work.item_path
      @page_params = work.page_params
      @page = 1
      # Plain assignment: @fetcher is always nil at this point, so the
      # original `||=` was a no-op.
      @fetcher = RailsSpider::Mechanize.new
    end

    # Walks the listing page by page, saving every item, until a page
    # yields no items.
    # NOTE(review): if the site serves the last page repeatedly for
    # out-of-range page numbers this never terminates — confirm.
    def run
      items = get_items

      while items.size > 0 do
        items.each do |item|
          save(item)
        end
        self.page += 1
        items = get_items
      end
    end

    # Returns the links on the current list page that look like item URLs.
    def get_items
      pattern = item_exp  # hoisted: compile/lookup once, not per link
      fetcher.links(list_url).select { |link| pattern.match? link }
    end

    # Fetches the item page body and upserts it as a Local record.
    def save(url)
      body = fetcher.body(url)
      local = Local.find_or_initialize_by url: url, work_id: work.id
      local.body = body
      local.save
    end

    # Builds the current list-page URL, appending the page parameter when
    # the page number is positive.
    def list_url
      list_url = URI.join host, list_path
      if page.to_i > 0
        page_query = URI.encode_www_form page_params => page
        list_url.query = page_query
      end

      list_url
    end

    # Regexp matching concrete item URLs: each :symbol in item_path becomes
    # a single-segment wildcard. Memoized — item_path is fixed after
    # initialize, and the original recompiled this for every link checked.
    def item_exp
      @item_exp ||= Regexp.new(item_path.gsub SYMBOL_EXP, DEFAULT_EXP)
    end

  end
end
|