rails_spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/LICENSE +165 -0
  3. data/README.md +33 -0
  4. data/app/assets/config/the_spider_manifest.js +4 -0
  5. data/app/assets/javascripts/the_spider/application.js +1 -0
  6. data/app/assets/stylesheets/the_spider/application.css +4 -0
  7. data/app/controllers/the_spider/application_controller.rb +9 -0
  8. data/app/controllers/the_spider/locals_controller.rb +62 -0
  9. data/app/controllers/the_spider/works_controller.rb +60 -0
  10. data/app/helpers/the_spider/application_helper.rb +4 -0
  11. data/app/helpers/the_spider/locals_helper.rb +4 -0
  12. data/app/helpers/the_spider/works_helper.rb +4 -0
  13. data/app/jobs/the_spider/application_job.rb +4 -0
  14. data/app/jobs/the_spider/parser_job.rb +11 -0
  15. data/app/jobs/the_spider/work_job.rb +11 -0
  16. data/app/mailers/the_spider/application_mailer.rb +6 -0
  17. data/app/models/rails_spider/application_record.rb +5 -0
  18. data/app/models/rails_spider/cookie.rb +9 -0
  19. data/app/models/rails_spider/failed_url.rb +7 -0
  20. data/app/models/rails_spider/local.rb +14 -0
  21. data/app/models/rails_spider/work.rb +24 -0
  22. data/app/views/layouts/the_spider/application.html.erb +14 -0
  23. data/app/views/the_spider/locals/_form.html.erb +17 -0
  24. data/app/views/the_spider/locals/edit.html.erb +6 -0
  25. data/app/views/the_spider/locals/index.html.erb +25 -0
  26. data/app/views/the_spider/locals/new.html.erb +5 -0
  27. data/app/views/the_spider/locals/show.html.erb +4 -0
  28. data/app/views/the_spider/works/_form.html.erb +9 -0
  29. data/app/views/the_spider/works/edit.html.erb +6 -0
  30. data/app/views/the_spider/works/index.html.erb +44 -0
  31. data/app/views/the_spider/works/new.html.erb +5 -0
  32. data/app/views/the_spider/works/show.html.erb +4 -0
  33. data/config/routes.rb +8 -0
  34. data/config/schedule.rb +35 -0
  35. data/db/migrate/20170502153051_rails_spider_init.rb +38 -0
  36. data/lib/config/config.rb +27 -0
  37. data/lib/config/keywords.json +22 -0
  38. data/lib/config/proxy.json +10 -0
  39. data/lib/helper/helper.rb +6 -0
  40. data/lib/helper/location_helper.rb +46 -0
  41. data/lib/helper/price_helper.rb +23 -0
  42. data/lib/helper/tag_helper.rb +17 -0
  43. data/lib/helper/text_helper.rb +41 -0
  44. data/lib/helper/time_helper.rb +140 -0
  45. data/lib/logger.rb +146 -0
  46. data/lib/proxy/allproxylists.txt +2366 -0
  47. data/lib/proxy/proxy.rb +216 -0
  48. data/lib/proxy/proxylists.txt +625 -0
  49. data/lib/rails_spider.rb +10 -0
  50. data/lib/rails_spider/engine.rb +9 -0
  51. data/lib/rails_spider/fetchers.rb +2 -0
  52. data/lib/rails_spider/fetchers/base.rb +146 -0
  53. data/lib/rails_spider/fetchers/mechanize.rb +83 -0
  54. data/lib/rails_spider/fetchers/witar.rb +73 -0
  55. data/lib/rails_spider/parser.rb +14 -0
  56. data/lib/rails_spider/parser/szlawyers.rb +26 -0
  57. data/lib/rails_spider/resource.rb +58 -0
  58. data/lib/rails_spider/strategies.rb +6 -0
  59. data/lib/rails_spider/version.rb +3 -0
  60. data/lib/sync_qiniu.rb +35 -0
  61. data/lib/sync_qiniu/getimages.rb +98 -0
  62. data/lib/sync_qiniu/getimages_info.rb +37 -0
  63. data/lib/sync_qiniu/getlocation.rb +48 -0
  64. data/lib/sync_qiniu/getproxy.rb +95 -0
  65. data/lib/tasks/the_spider_tasks.rake +4 -0
  66. data/rakefile +284 -0
  67. metadata +165 -0
data/lib/rails_spider.rb
@@ -0,0 +1,10 @@
+ require 'rails_spider/engine'
+
+ require 'rails_spider/resource'
+
+ require 'rails_spider/parser'
+ require 'rails_spider/parser/szlawyers'
+
+ module RailsSpider
+   # Your code goes here...
+ end
data/lib/rails_spider/engine.rb
@@ -0,0 +1,9 @@
+ module RailsSpider
+   class Engine < ::Rails::Engine
+     isolate_namespace RailsSpider
+
+     initializer 'rails_spider.assets.precompile' do |app|
+       app.config.assets.precompile += ['rails_spider_manifest.js']
+     end
+   end
+ end
data/lib/rails_spider/fetchers.rb
@@ -0,0 +1,2 @@
+ require 'rails_spider/fetchers/base'
+ require 'rails_spider/fetchers/mechanize'
data/lib/rails_spider/fetchers/base.rb
@@ -0,0 +1,146 @@
+ module RailsSpider
+   class Fetcher
+
+     def initialize
+       @page = ''
+     end
+
+     def event_class
+       @event_class = EventSpider.config.event_class.constantize
+     end
+
+     def page_by_url(url, proxy_hash=nil, header_hash=nil, repeat=5)
+       logger.info "Grab the page #{url}"
+       begin
+         change_another_proxy(proxy_hash, header_hash)
+         logger.info "Changed to a new proxy: #{@mechanize.proxy_addr}:#{@mechanize.proxy_port} for #{url}"
+         page = @mechanize.get(url)
+         logger.info "Fetched the page #{url}"
+         page
+       rescue => e
+         logger.error e.message
+         e.backtrace.each do |msg|
+           error_log.error msg
+         end
+         error_log.error "\n"
+         i ||= 0
+         if i < repeat
+           logger.info "Retrying to get page, attempt #{i}"
+           i += 1
+           retry
+         else
+           if url.include?('douban')
+             source = 'douban'
+           elsif url.include?('weibo')
+             source = 'weibo'
+           elsif url.include?('rockbundartmuseum')
+             source = 'waitan'
+           elsif url.include?('citymoments')
+             source = 'citymoment'
+           else
+             source = 'else'
+           end
+           FailUrl.create(url: url, source: source, flag: "spider")
+           logger.warn "Can't grab url #{url}"
+           return
+         end
+       end
+     end
+
+     def save_page(page)
+       begin
+         page.save_as("html/#{Date.today.to_s}/#{page.uri.to_s.split('http://').last.chomp('/')}")
+       rescue => e
+         logger.error e.message
+         logger.warn "Can't save page #{page.uri}"
+       end
+     end
+
+     def change_another_proxy(proxy_hash=nil, header_hash=nil)
+       if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
+         ip = proxy_hash[:ip]
+         port = proxy_hash[:port]
+       else
+         index = rand(@proxy.size)
+         ip = @proxy[index][:ip]
+         port = @proxy[index][:port]
+       end
+       @mechanize.set_proxy ip, port
+
+       @mechanize.request_headers = header_hash unless header_hash.nil?
+     end
+
+     def is_grab?(url)
+       event_class.where(url: url).exists? # whether the url has already been grabbed
+     end
+
+     def run
+       logger.info "Start #{self.class} Spider..."
+
+       @links.each do |link|
+         #@city = link.values.first
+         grab_list_link(link.keys.first)
+       end
+
+       logger.info "End of #{self.class} Spider..."
+     end
+
+     def grab_update
+       logger.info "Start #{self.class} Spider grab_update."
+
+       @newlinks.each do |link|
+         @city = link['city'] unless link['city'].blank?
+         grab_list_link(link['url'])
+       end
+
+       logger.info "End of #{self.class} Spider grab_update."
+     end
+
+     def create_event(event_hash)
+       if event_hash.blank?
+         logger.warn "Can't create event from blank data"
+         return
+       end
+       if is_existed?(event_hash)
+         logger.warn "Parameter #{event_hash} already exists, can't create"
+         return
+       end
+       event = Event.new(event_hash)
+       if event_hash[:place].blank?
+         event.status = -1
+       end
+       event.kind_id = Kind.find_or_create_by(name: event_hash[:kind]).id unless event_hash[:kind].blank?
+       event.subkind_id = set_subkind_id(event_hash[:subkind]) unless event_hash[:subkind].blank?
+       if event_hash[:tags]
+         event_hash[:tags].each do |t|
+           EventTag.create(event_id: event.id, tag_id: Tag.find_or_create_by(name: t).id)
+         end
+       end
+       event.int_id = Event.max(:int_id).blank? ? 1 : Event.max(:int_id) + 1
+       event.save
+       unless event.errors.blank?
+         logger.info event.errors.full_messages.join(' / ')
+       else
+         logger.info 'Saved event successfully'
+       end
+     end
+
+     def is_existed?(event_hash)
+       #if event_hash[:event_id] && event_class.where(event_id: event_hash[:event_id]).first
+       # return true
+       #end
+       # TODO: title and city are the same
+       #if event_hash[:title] && event_class.where(title: event_hash[:title]).first
+       # return true
+       #end
+       if event_hash[:url] && event = event_class.where(url: event_hash[:url]).first
+         logger.warn "#{event_hash[:url]} already exists in #{event.id}"
+         return true
+       end
+       return false
+     end
+
+     def keep_on?; return true end # keep on grabbing?
+
+   end
+ end
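Note: page_by_url above retries a failed fetch by pairing a counter initialized inside the rescue block with Ruby's retry keyword, and only records a FailUrl after `repeat` attempts have been exhausted. A minimal standalone sketch of that idiom, with a hypothetical fetch(url) standing in for the @mechanize.get call:

def fetch_with_retries(url, repeat = 5)
  attempts = 0
  begin
    attempts += 1
    fetch(url)                  # hypothetical stand-in for @mechanize.get(url)
  rescue StandardError => e
    if attempts < repeat
      puts "Retry ##{attempts} for #{url}: #{e.message}"
      retry                     # re-runs the begin block
    end
    nil                         # give up after `repeat` attempts
  end
end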
data/lib/rails_spider/fetchers/mechanize.rb
@@ -0,0 +1,83 @@
+ require 'mechanize'
+ require 'rails_spider/fetchers/base'
+
+ module RailsSpider
+   class Mechanize < Fetcher
+     attr_accessor :mechanize, :logger
+
+     def initialize
+       super
+       @mechanize = ::Mechanize.new
+       @mechanize.open_timeout = 20
+       @mechanize.pluggable_parser.default = @mechanize.pluggable_parser['text/html']
+       @logger = Logger.new STDOUT
+     end
+
+     def page(url)
+       mechanize.get(url)
+     end
+
+     def body(url)
+       page(url).search('body')
+     end
+
+     def links(url)
+       page(url).links.map do |link|
+         begin
+           link.resolved_uri.to_s
+         rescue ::Mechanize::UnsupportedSchemeError
+           ''
+         end
+       end
+     end
+
+     def change_another_proxy(proxy_hash=nil, header_hash=nil)
+       if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
+         ip = proxy_hash[:ip]
+         port = proxy_hash[:port]
+       else
+         index = rand(@proxy.size)
+         ip = @proxy[index][:ip]
+         port = @proxy[index][:port]
+       end
+       @mechanize.set_proxy ip, port
+
+       @mechanize.request_headers = header_hash unless header_hash.nil?
+     end
+
+     def is_grab?(url)
+       event_class.where(url: url).exists?
+     end
+
+     def run
+       logger.info "Start #{self.class} Spider..."
+
+       @links.each do |link|
+         #@city = link.values.first
+         grab_list_link(link.keys.first)
+       end
+
+       logger.info "End of #{self.class} Spider..."
+     end
+
+     def grab_update
+       logger.info "Start #{self.class} Spider grab_update."
+
+       @newlinks.each do |link|
+         @city = link['city'] unless link['city'].blank?
+         grab_list_link(link['url'])
+       end
+
+       logger.info "End of #{self.class} Spider grab_update."
+     end
+
+     def is_existed?(event_hash)
+       if event_hash[:url] && event = event_class.where(url: event_hash[:url]).first
+         logger.warn "#{event_hash[:url]} already exists in #{event.id}"
+         return true
+       end
+       return false
+     end
+
+   end
+ end
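The Mechanize fetcher wraps the mechanize gem and exposes three small helpers: page returns the Mechanize::Page, body returns the Nokogiri nodes for the page's <body>, and links returns the resolved URLs found on the page. A usage sketch (the URL is a placeholder, and the mechanize gem must be installed):

require 'logger'      # the fetcher's initialize builds a Logger
require 'rails_spider/fetchers/mechanize'

fetcher = RailsSpider::Mechanize.new
page    = fetcher.page('http://example.com/')   # Mechanize::Page
body    = fetcher.body('http://example.com/')   # Nokogiri nodes for <body>
links   = fetcher.links('http://example.com/')  # array of absolute URL strings
puts links.first(5)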
data/lib/rails_spider/fetchers/witar.rb
@@ -0,0 +1,73 @@
+ require 'mechanize'
+
+ module RailsSpider
+   class Mechanize < Fetcher
+     attr_accessor :mechanize, :logger
+
+     def initialize
+       super
+       @mechanize = Mechanize.new
+       @mechanize.open_timeout = 20
+       @mechanize.pluggable_parser.default = @mechanize.pluggable_parser['text/html']
+       @logger = Logger.new STDOUT
+     end
+
+     def save_page(page)
+       begin
+         page.save_as("html/#{Date.today.to_s}/#{page.uri.to_s.split('http://').last.chomp('/')}")
+       rescue => e
+         logger.error e.message
+         logger.warn "Can't save page #{page.uri}"
+       end
+     end
+
+     def change_another_proxy(proxy_hash=nil, header_hash=nil)
+       if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
+         ip = proxy_hash[:ip]
+         port = proxy_hash[:port]
+       else
+         index = rand(@proxy.size)
+         ip = @proxy[index][:ip]
+         port = @proxy[index][:port]
+       end
+       @mechanize.set_proxy ip, port
+
+       @mechanize.request_headers = header_hash unless header_hash.nil?
+     end
+
+     def is_grab?(url)
+       event_class.where(url: url).exists?
+     end
+
+     def run
+       logger.info "Start #{self.class} Spider..."
+
+       @links.each do |link|
+         #@city = link.values.first
+         grab_list_link(link.keys.first)
+       end
+
+       logger.info "End of #{self.class} Spider..."
+     end
+
+     def grab_update
+       logger.info "Start #{self.class} Spider grab_update."
+
+       @newlinks.each do |link|
+         @city = link['city'] unless link['city'].blank?
+         grab_list_link(link['url'])
+       end
+
+       logger.info "End of #{self.class} Spider grab_update."
+     end
+
+     def is_existed?(event_hash)
+       if event_hash[:url] && event = event_class.where(url: event_hash[:url]).first
+         logger.warn "#{event_hash[:url]} already exists in #{event.id}"
+         return true
+       end
+       return false
+     end
+
+   end
+ end
data/lib/rails_spider/parser.rb
@@ -0,0 +1,14 @@
+ module RailsSpider
+   class Parser
+     attr_accessor :doc
+
+     def initialize(body)
+       @doc = Nokogiri::HTML(body)
+     end
+
+     def save
+       raise 'Should implement in subclass'
+     end
+
+   end
+ end
data/lib/rails_spider/parser/szlawyers.rb
@@ -0,0 +1,26 @@
+ module RailsSpider
+   class Szlawyers < Parser
+
+     def name
+       doc.at_css('span#lawlist_LawerName').text
+     end
+
+     def sex
+       doc.at_css('span#lawlist_LawerSex').text
+     end
+
+     def office
+       doc.at_css('span#lawlist_Enterprise').text
+     end
+
+     # qualification certificate number
+     def identify
+       doc.at_css('span#lawlist_LawerqualNo').text
+     end
+
+     def time
+       doc.at_css('span#lawlist_dtLawerqualNo').text
+     end
+
+   end
+ end
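The Szlawyers parser reads each field from a lawyer detail page by the span id it appears under (identify is the qualification certificate number). Combined with the Mechanize fetcher it could be driven roughly like the sketch below; the URL is a placeholder, not a real endpoint shipped with this gem:

require 'logger'
require 'rails_spider/fetchers/mechanize'
require 'rails_spider/parser'
require 'rails_spider/parser/szlawyers'

fetcher = RailsSpider::Mechanize.new
html    = fetcher.page('http://example.com/lawyer/1').body  # placeholder detail page
lawyer  = RailsSpider::Szlawyers.new(html)

puts lawyer.name      # span#lawlist_LawerName
puts lawyer.sex       # span#lawlist_LawerSex
puts lawyer.office    # span#lawlist_Enterprise
puts lawyer.identify  # span#lawlist_LawerqualNo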
data/lib/rails_spider/resource.rb
@@ -0,0 +1,58 @@
+ require 'rails_spider/fetchers/mechanize'
+
+ module RailsSpider
+   class Resource
+     attr_reader :fetcher, :work, :host, :item_path, :list_path, :page_params
+     attr_accessor :page
+     DEFAULT_EXP = "([^\/.?]+)"
+     SYMBOL_EXP = /:\w+/
+
+     def initialize(work, **options)
+       @work = work
+       @host = work.host
+       @list_path = work.list_path
+       @item_path = work.item_path
+       @page_params = work.page_params
+       @page = 1
+       @fetcher ||= RailsSpider::Mechanize.new
+     end
+
+     def run
+       items = get_items
+
+       while items.size > 0 do
+         items.each do |item|
+           save(item)
+         end
+         self.page += 1
+         items = get_items
+       end
+     end
+
+     def get_items
+       fetcher.links(list_url).select { |link| item_exp.match? link }
+     end
+
+     def save(url)
+       body = fetcher.body(url)
+       local = Local.find_or_initialize_by url: url, work_id: work.id
+       local.body = body
+       local.save
+     end
+
+     def list_url
+       list_url = URI.join host, list_path
+       if page.to_i > 0
+         page_query = URI.encode_www_form page_params => page
+         list_url.query = page_query
+       end
+
+       list_url
+     end
+
+     def item_exp
+       Regexp.new(item_path.gsub SYMBOL_EXP, DEFAULT_EXP)
+     end
+
+   end
+ end
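Resource pages through a work's list URL (host + list_path with page_params appended as the query), keeps the links matching item_path with its :symbols widened to a capture group, saves each item's body into a Local record, and stops at the first page that yields no matches. A hedged sketch of driving it, with a Struct standing in as a hypothetical replacement for a RailsSpider::Work record (the real model's columns come from the db migration, not shown in this section); it assumes a Rails environment where the RailsSpider::Local model is loaded:

require 'logger'
require 'rails_spider/resource'

# Hypothetical stand-in for a RailsSpider::Work record.
FakeWork = Struct.new(:id, :host, :list_path, :item_path, :page_params)
work = FakeWork.new(1, 'http://example.com', '/lawyers', '/lawyer/:id', 'page')

resource = RailsSpider::Resource.new(work)
resource.run  # walks /lawyers?page=1, ?page=2, ... and saves each matching item as a Local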
data/lib/rails_spider/strategies.rb
@@ -0,0 +1,6 @@
+ require 'event_spider/strategies/base'
+ require 'event_spider/strategies/douban'
+ require 'event_spider/strategies/weibo'
+ require 'event_spider/strategies/qunar'
+ require 'event_spider/strategies/waitan'
+ require 'event_spider/strategies/citymoment'
data/lib/rails_spider/version.rb
@@ -0,0 +1,3 @@
+ module RailsSpider
+   VERSION = '0.1.0'
+ end