rails_spider 0.1.0

Files changed (67)
  1. checksums.yaml +7 -0
  2. data/LICENSE +165 -0
  3. data/README.md +33 -0
  4. data/app/assets/config/the_spider_manifest.js +4 -0
  5. data/app/assets/javascripts/the_spider/application.js +1 -0
  6. data/app/assets/stylesheets/the_spider/application.css +4 -0
  7. data/app/controllers/the_spider/application_controller.rb +9 -0
  8. data/app/controllers/the_spider/locals_controller.rb +62 -0
  9. data/app/controllers/the_spider/works_controller.rb +60 -0
  10. data/app/helpers/the_spider/application_helper.rb +4 -0
  11. data/app/helpers/the_spider/locals_helper.rb +4 -0
  12. data/app/helpers/the_spider/works_helper.rb +4 -0
  13. data/app/jobs/the_spider/application_job.rb +4 -0
  14. data/app/jobs/the_spider/parser_job.rb +11 -0
  15. data/app/jobs/the_spider/work_job.rb +11 -0
  16. data/app/mailers/the_spider/application_mailer.rb +6 -0
  17. data/app/models/rails_spider/application_record.rb +5 -0
  18. data/app/models/rails_spider/cookie.rb +9 -0
  19. data/app/models/rails_spider/failed_url.rb +7 -0
  20. data/app/models/rails_spider/local.rb +14 -0
  21. data/app/models/rails_spider/work.rb +24 -0
  22. data/app/views/layouts/the_spider/application.html.erb +14 -0
  23. data/app/views/the_spider/locals/_form.html.erb +17 -0
  24. data/app/views/the_spider/locals/edit.html.erb +6 -0
  25. data/app/views/the_spider/locals/index.html.erb +25 -0
  26. data/app/views/the_spider/locals/new.html.erb +5 -0
  27. data/app/views/the_spider/locals/show.html.erb +4 -0
  28. data/app/views/the_spider/works/_form.html.erb +9 -0
  29. data/app/views/the_spider/works/edit.html.erb +6 -0
  30. data/app/views/the_spider/works/index.html.erb +44 -0
  31. data/app/views/the_spider/works/new.html.erb +5 -0
  32. data/app/views/the_spider/works/show.html.erb +4 -0
  33. data/config/routes.rb +8 -0
  34. data/config/schedule.rb +35 -0
  35. data/db/migrate/20170502153051_rails_spider_init.rb +38 -0
  36. data/lib/config/config.rb +27 -0
  37. data/lib/config/keywords.json +22 -0
  38. data/lib/config/proxy.json +10 -0
  39. data/lib/helper/helper.rb +6 -0
  40. data/lib/helper/location_helper.rb +46 -0
  41. data/lib/helper/price_helper.rb +23 -0
  42. data/lib/helper/tag_helper.rb +17 -0
  43. data/lib/helper/text_helper.rb +41 -0
  44. data/lib/helper/time_helper.rb +140 -0
  45. data/lib/logger.rb +146 -0
  46. data/lib/proxy/allproxylists.txt +2366 -0
  47. data/lib/proxy/proxy.rb +216 -0
  48. data/lib/proxy/proxylists.txt +625 -0
  49. data/lib/rails_spider.rb +10 -0
  50. data/lib/rails_spider/engine.rb +9 -0
  51. data/lib/rails_spider/fetchers.rb +2 -0
  52. data/lib/rails_spider/fetchers/base.rb +146 -0
  53. data/lib/rails_spider/fetchers/mechanize.rb +83 -0
  54. data/lib/rails_spider/fetchers/witar.rb +73 -0
  55. data/lib/rails_spider/parser.rb +14 -0
  56. data/lib/rails_spider/parser/szlawyers.rb +26 -0
  57. data/lib/rails_spider/resource.rb +58 -0
  58. data/lib/rails_spider/strategies.rb +6 -0
  59. data/lib/rails_spider/version.rb +3 -0
  60. data/lib/sync_qiniu.rb +35 -0
  61. data/lib/sync_qiniu/getimages.rb +98 -0
  62. data/lib/sync_qiniu/getimages_info.rb +37 -0
  63. data/lib/sync_qiniu/getlocation.rb +48 -0
  64. data/lib/sync_qiniu/getproxy.rb +95 -0
  65. data/lib/tasks/the_spider_tasks.rake +4 -0
  66. data/rakefile +284 -0
  67. metadata +165 -0

data/lib/rails_spider.rb
@@ -0,0 +1,10 @@
+ require 'rails_spider/engine'
+
+ require 'rails_spider/resource'
+
+ require 'rails_spider/parser'
+ require 'rails_spider/parser/szlawyers'
+
+ module RailsSpider
+   # Your code goes here...
+ end
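
To make these requires available, a host application would typically pull the gem in through its Gemfile first; a minimal sketch (the bundler line is the only assumption here):

    # Gemfile of a host Rails application
    gem 'rails_spider', '0.1.0'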

data/lib/rails_spider/engine.rb
@@ -0,0 +1,9 @@
+ module RailsSpider
+   class Engine < ::Rails::Engine
+     isolate_namespace RailsSpider
+
+     initializer 'rails_spider.assets.precompile' do |app|
+       app.config.assets.precompile += ['rails_spider_manifest.js']
+     end
+   end
+ end
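
Because the engine isolates the RailsSpider namespace, a host application would normally mount it to reach the bundled locals/works controllers; a minimal sketch, with the '/rails_spider' mount point chosen purely for illustration:

    # config/routes.rb of the host application
    Rails.application.routes.draw do
      mount RailsSpider::Engine => '/rails_spider'
    end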

data/lib/rails_spider/fetchers.rb
@@ -0,0 +1,2 @@
+ require 'rails_spider/fetchers/base'
+ require 'rails_spider/fetchers/mechanize'

data/lib/rails_spider/fetchers/base.rb
@@ -0,0 +1,146 @@
+ module RailsSpider
+   class Fetcher
+
+     def initialize
+       @page = ''
+     end
+
+     def event_class
+       @event_class = EventSpider.config.event_class.constantize
+     end
+
+     def page_by_url(url, proxy_hash=nil, header_hash=nil, repeat=5)
+       logger.info "Grab the page #{url}"
+       begin
+         change_another_proxy(proxy_hash, header_hash)
+         logger.info "Changed to a new proxy: #{@mechanize.proxy_addr}:#{@mechanize.proxy_port} for #{url}"
+         page = @mechanize.get(url)
+         logger.info "Fetched the page #{url}"
+         page
+       rescue => e
+         logger.error e.message
+         e.backtrace.each do |msg|
+           error_log.error msg
+         end
+         error_log.error "\n"
+         i ||= 0
+         if i < repeat
+           logger.info "Retrying to get page, attempt #{i}"
+           i += 1
+           retry
+         else
+           if url.include?('douban')
+             source = 'douban'
+           elsif url.include?('weibo')
+             source = 'weibo'
+           elsif url.include?('rockbundartmuseum')
+             source = 'waitan'
+           elsif url.include?('citymoments')
+             source = 'citymoment'
+           else
+             source = 'else'
+           end
+           FailUrl.create(url: url, source: source, flag: "spider")
+           logger.warn "Can't grab url #{url}"
+           return
+         end
+       end
+     end
+
+     def save_page(page)
+       begin
+         page.save_as("html/#{Date.today.to_s}/#{page.uri.to_s.split('http://').last.chomp('/')}")
+       rescue => e
+         logger.error e.message
+         logger.warn "Can't save page #{page.uri}"
+       end
+     end
+
+     def change_another_proxy(proxy_hash=nil, header_hash=nil)
+       if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
+         ip = proxy_hash[:ip]
+         port = proxy_hash[:port]
+       else
+         index = rand(@proxy.size)
+         ip = @proxy[index][:ip]
+         port = @proxy[index][:port]
+       end
+       @mechanize.set_proxy ip, port
+
+       @mechanize.request_headers = header_hash unless header_hash.nil?
+     end
+
+     def is_grab?(url)
+       event_class.where(url: url).exists? # whether the url has already been grabbed
+     end
+
+     def run
+       logger.info "Start #{self.class} Spider..."
+
+       @links.each do |link|
+         #@city = link.values.first
+         grab_list_link(link.keys.first)
+       end
+
+       logger.info "End of #{self.class} Spider..."
+     end
+
+     def grab_update
+       logger.info "Start #{self.class} Spider grab_update."
+
+       @newlinks.each do |link|
+         @city = link['city'] unless link['city'].blank?
+         grab_list_link(link['url'])
+       end
+
+       logger.info "End of #{self.class} Spider grab_update."
+     end
+
+     def create_event(event_hash)
+       if event_hash.blank?
+         logger.warn "Can't create event from blank data"
+         return
+       end
+       if is_existed?(event_hash)
+         logger.warn "Parameter #{event_hash} already exists, can't create"
+         return
+       end
+       event = Event.new(event_hash)
+       if event_hash[:place].blank?
+         event.status = -1
+       end
+       event.kind_id = Kind.find_or_create_by(name: event_hash[:kind]).id unless event_hash[:kind].blank?
+       event.subkind_id = set_subkind_id(event_hash[:subkind]) unless event_hash[:subkind].blank?
+       if event_hash[:tags]
+         event_hash[:tags].each do |t|
+           EventTag.create(event_id: event.id, tag_id: Tag.find_or_create_by(name: t).id)
+         end
+       end
+       event.int_id = Event.max(:int_id).blank? ? 1 : Event.max(:int_id) + 1
+       event.save
+       if event.errors.any?
+         logger.info event.errors.full_messages.join(' / ')
+       else
+         logger.info 'Saved event successfully'
+       end
+     end
+
+     def is_existed?(event_hash)
+       #if event_hash[:event_id] && event_class.where(event_id: event_hash[:event_id]).first
+       #  return true
+       #end
+       # TODO title and city are the same
+       #if event_hash[:title] && event_class.where(title: event_hash[:title]).first
+       #  return true
+       #end
+       if event_hash[:url] && (event = event_class.where(url: event_hash[:url]).first)
+         logger.warn "#{event_hash[:url]} already exists (event #{event.id})"
+         return true
+       end
+       return false
+     end
+
+     def keep_on?; true end # keep grabbing?
+
+   end
+ end
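
Fetcher leans on state it never sets itself: @mechanize, @proxy, @links, a logger/error_log pair, and a grab_list_link method. A minimal sketch of a conforming subclass, where every name and value below (ExampleFetcher, the proxy entry, the link hash, the grab_list_link body) is an assumption for illustration, not part of the gem:

    require 'logger'
    require 'mechanize'
    require 'rails_spider/fetchers/base'

    module RailsSpider
      # Hypothetical subclass showing the state the base Fetcher expects.
      class ExampleFetcher < Fetcher
        attr_reader :logger, :error_log

        def initialize
          super
          @mechanize = ::Mechanize.new                                  # used by page_by_url / change_another_proxy
          @proxy     = [{ ip: '127.0.0.1', port: 8080 }]                # placeholder proxy pool
          @links     = [{ 'http://example.com/events' => 'shanghai' }]  # url => city, as #run expects
          @logger    = Logger.new(STDOUT)
          @error_log = Logger.new('error.log')
        end

        # #run calls this once per entry in @links.
        def grab_list_link(url)
          page = page_by_url(url)
          save_page(page) if page
        end
      end
    end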

data/lib/rails_spider/fetchers/mechanize.rb
@@ -0,0 +1,83 @@
+ require 'mechanize'
+ require 'rails_spider/fetchers/base'
+
+ module RailsSpider
+   class Mechanize < Fetcher
+     attr_accessor :mechanize, :logger
+
+     def initialize
+       super
+       @mechanize = ::Mechanize.new
+       @mechanize.open_timeout = 20
+       @mechanize.pluggable_parser.default = @mechanize.pluggable_parser['text/html']
+       @logger = Logger.new STDOUT
+     end
+
+     def page(url)
+       mechanize.get(url)
+     end
+
+     def body(url)
+       page(url).search('body')
+     end
+
+     def links(url)
+       page(url).links.map do |link|
+         begin
+           link.resolved_uri.to_s
+         rescue ::Mechanize::UnsupportedSchemeError
+           ''
+         end
+       end
+     end
+
+     def change_another_proxy(proxy_hash=nil, header_hash=nil)
+       if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
+         ip = proxy_hash[:ip]
+         port = proxy_hash[:port]
+       else
+         index = rand(@proxy.size)
+         ip = @proxy[index][:ip]
+         port = @proxy[index][:port]
+       end
+       @mechanize.set_proxy ip, port
+
+       @mechanize.request_headers = header_hash unless header_hash.nil?
+     end
+
+     def is_grab?(url)
+       event_class.where(url: url).exists?
+     end
+
+     def run
+       logger.info "Start #{self.class} Spider..."
+
+       @links.each do |link|
+         #@city = link.values.first
+         grab_list_link(link.keys.first)
+       end
+
+       logger.info "End of #{self.class} Spider..."
+     end
+
+     def grab_update
+       logger.info "Start #{self.class} Spider grab_update."
+
+       @newlinks.each do |link|
+         @city = link['city'] unless link['city'].blank?
+         grab_list_link(link['url'])
+       end
+
+       logger.info "End of #{self.class} Spider grab_update."
+     end
+
+     def is_existed?(event_hash)
+       if event_hash[:url] && (event = event_class.where(url: event_hash[:url]).first)
+         logger.warn "#{event_hash[:url]} already exists (event #{event.id})"
+         return true
+       end
+       return false
+     end
+
+   end
+ end
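
As a usage sketch, the Mechanize fetcher can also be driven directly for ad-hoc fetching; the URL below is a placeholder:

    fetcher = RailsSpider::Mechanize.new

    page  = fetcher.page('http://example.com')   # ::Mechanize::Page object
    body  = fetcher.body('http://example.com')   # Nokogiri nodes for <body>
    links = fetcher.links('http://example.com')  # absolute URLs ('' for unsupported schemes)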

data/lib/rails_spider/fetchers/witar.rb
@@ -0,0 +1,73 @@
+ require 'mechanize'
+
+ module RailsSpider
+   class Mechanize < Fetcher
+     attr_accessor :mechanize, :logger
+
+     def initialize
+       super
+       @mechanize = ::Mechanize.new
+       @mechanize.open_timeout = 20
+       @mechanize.pluggable_parser.default = @mechanize.pluggable_parser['text/html']
+       @logger = Logger.new STDOUT
+     end
+
+     def save_page(page)
+       begin
+         page.save_as("html/#{Date.today.to_s}/#{page.uri.to_s.split('http://').last.chomp('/')}")
+       rescue => e
+         logger.error e.message
+         logger.warn "Can't save page #{page.uri}"
+       end
+     end
+
+     def change_another_proxy(proxy_hash=nil, header_hash=nil)
+       if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
+         ip = proxy_hash[:ip]
+         port = proxy_hash[:port]
+       else
+         index = rand(@proxy.size)
+         ip = @proxy[index][:ip]
+         port = @proxy[index][:port]
+       end
+       @mechanize.set_proxy ip, port
+
+       @mechanize.request_headers = header_hash unless header_hash.nil?
+     end
+
+     def is_grab?(url)
+       event_class.where(url: url).exists?
+     end
+
+     def run
+       logger.info "Start #{self.class} Spider..."
+
+       @links.each do |link|
+         #@city = link.values.first
+         grab_list_link(link.keys.first)
+       end
+
+       logger.info "End of #{self.class} Spider..."
+     end
+
+     def grab_update
+       logger.info "Start #{self.class} Spider grab_update."
+
+       @newlinks.each do |link|
+         @city = link['city'] unless link['city'].blank?
+         grab_list_link(link['url'])
+       end
+
+       logger.info "End of #{self.class} Spider grab_update."
+     end
+
+     def is_existed?(event_hash)
+       if event_hash[:url] && (event = event_class.where(url: event_hash[:url]).first)
+         logger.warn "#{event_hash[:url]} already exists (event #{event.id})"
+         return true
+       end
+       return false
+     end
+
+   end
+ end

data/lib/rails_spider/parser.rb
@@ -0,0 +1,14 @@
+ module RailsSpider
+   class Parser
+     attr_accessor :doc
+
+     def initialize(body)
+       @doc = Nokogiri::HTML(body)
+     end
+
+     def save
+       raise 'Should be implemented in a subclass'
+     end
+
+   end
+ end

data/lib/rails_spider/parser/szlawyers.rb
@@ -0,0 +1,26 @@
+ module RailsSpider
+   class Szlawyers < Parser
+
+     def name
+       doc.at_css('span#lawlist_LawerName').text
+     end
+
+     def sex
+       doc.at_css('span#lawlist_LawerSex').text
+     end
+
+     def office
+       doc.at_css('span#lawlist_Enterprise').text
+     end
+
+     # qualification certificate number
+     def identify
+       doc.at_css('span#lawlist_LawerqualNo').text
+     end
+
+     def time
+       doc.at_css('span#lawlist_dtLawerqualNo').text
+     end
+
+   end
+ end
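
Putting fetcher and parser together, one would feed a fetched body into the Szlawyers parser and read the extracted fields; the URL is hypothetical and Nokogiri is assumed to be available:

    fetcher = RailsSpider::Mechanize.new
    body    = fetcher.body('http://www.szlawyers.com/lawyer/123')  # placeholder URL

    lawyer = RailsSpider::Szlawyers.new(body.to_html)
    lawyer.name      # span#lawlist_LawerName
    lawyer.sex       # span#lawlist_LawerSex
    lawyer.office    # span#lawlist_Enterprise
    lawyer.identify  # qualification certificate number
    lawyer.time      # span#lawlist_dtLawerqualNo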

data/lib/rails_spider/resource.rb
@@ -0,0 +1,58 @@
+ require 'rails_spider/fetchers/mechanize'
+
+ module RailsSpider
+   class Resource
+     attr_reader :fetcher, :work, :host, :item_path, :list_path, :page_params
+     attr_accessor :page
+     DEFAULT_EXP = "([^\/.?]+)"
+     SYMBOL_EXP = /:\w+/
+
+     def initialize(work, **options)
+       @work = work
+       @host = work.host
+       @list_path = work.list_path
+       @item_path = work.item_path
+       @page_params = work.page_params
+       @page = 1
+       @fetcher ||= RailsSpider::Mechanize.new
+     end
+
+     def run
+       items = get_items
+
+       while items.size > 0 do
+         items.each do |item|
+           save(item)
+         end
+         self.page += 1
+         items = get_items
+       end
+     end
+
+     def get_items
+       fetcher.links(list_url).select { |link| item_exp.match? link }
+     end
+
+     def save(url)
+       body = fetcher.body(url)
+       local = Local.find_or_initialize_by url: url, work_id: work.id
+       local.body = body
+       local.save
+     end
+
+     def list_url
+       list_url = URI.join host, list_path
+       if page.to_i > 0
+         page_query = URI.encode_www_form page_params => page
+         list_url.query = page_query
+       end
+
+       list_url
+     end
+
+     def item_exp
+       Regexp.new(item_path.gsub SYMBOL_EXP, DEFAULT_EXP)
+     end
+
+   end
+ end
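
Resource is driven by a work record that supplies the host, the listing and item paths, and the pagination parameter (see data/app/models/rails_spider/work.rb in the file list). A minimal sketch of kicking off a crawl; every attribute value below is invented for illustration:

    # Assumes RailsSpider::Work exposes the attributes Resource reads.
    work = RailsSpider::Work.create!(
      host:        'http://example.com',  # placeholder site
      list_path:   '/articles',           # listing page path
      item_path:   '/articles/:id',       # item pattern; :id becomes a capture group
      page_params: 'page'                 # pagination query parameter
    )

    RailsSpider::Resource.new(work).run   # fetches listing pages, storing each matched item as a Local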

data/lib/rails_spider/strategies.rb
@@ -0,0 +1,6 @@
+ require 'event_spider/strategies/base'
+ require 'event_spider/strategies/douban'
+ require 'event_spider/strategies/weibo'
+ require 'event_spider/strategies/qunar'
+ require 'event_spider/strategies/waitan'
+ require 'event_spider/strategies/citymoment'

data/lib/rails_spider/version.rb
@@ -0,0 +1,3 @@
+ module RailsSpider
+   VERSION = '0.1.0'
+ end