parallel588_polipus 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
@@ -0,0 +1,282 @@
1
+ # encoding: UTF-8
2
+ require 'net/https'
3
+ require 'polipus/page'
4
+ require 'zlib'
5
+ require 'http/cookie'
6
+
7
module Polipus
  #
  # HTTP client used by the crawler. It wraps Net::HTTP, keeps one persistent
  # connection per host/port pair, follows same-host redirects and turns every
  # response (or network failure) into Polipus::Page objects.
  #
  # Recognised options (all optional, read from the hash given to #initialize):
  # :redirect_limit, :user_agent, :proxy_host, :proxy_port, :proxy_user,
  # :proxy_pass, :proxy_host_port, :read_timeout, :open_timeout,
  # :accept_cookies, :cookie_jar, :http_user, :http_password,
  # :connection_max_hits, :logger, :verbose.
  #
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5

    # Number of attempts made for a single request before the last error is
    # re-raised to the caller
    MAX_ATTEMPTS = 3

    # Errors that mark a fetch as failed instead of aborting the crawl
    RESCUABLE_ERRORS = [
      EOFError,
      Errno::ECONNREFUSED,
      Errno::ECONNRESET,
      Errno::EHOSTUNREACH,
      Errno::EINVAL,
      Errno::EPIPE,
      Errno::ETIMEDOUT,
      Net::HTTPBadResponse,
      Net::HTTPHeaderSyntaxError,
      Net::ProtocolError,
      SocketError,
      Timeout::Error,
      Zlib::DataError,
      Zlib::GzipFile::Error
    ]

    #
    # Create a client. *opts* is kept by reference and consulted on every
    # request, so option values may be changed between requests.
    #
    def initialize(opts = {})
      @connections = {}
      @connections_hits = {}
      @opts = opts
    end

    #
    # Fetch a single Page from the response of an HTTP request to *url*.
    # Just gets the final destination page.
    #
    def fetch_page(url, referer = nil, depth = nil)
      fetch_pages(url, referer, depth).last
    end

    #
    # Create new Pages from the response of an HTTP request to *url*,
    # including redirects.
    #
    # Returns one Page per response received. When one of RESCUABLE_ERRORS is
    # raised, a single Page carrying the error is returned instead.
    #
    def fetch_pages(url, referer = nil, depth = nil)
      url = URI(url)
      pages = []
      get(url, referer) do |response, code, location, redirect_to, response_time|
        handle_compression response
        pages << Page.new(location, body: response.body,
                                    code: code,
                                    headers: response.to_hash,
                                    referer: referer,
                                    depth: depth,
                                    redirect_to: redirect_to,
                                    response_time: response_time,
                                    fetched_at: Time.now.to_i)
      end

      pages
    rescue *RESCUABLE_ERRORS => e
      if verbose?
        puts e.inspect
        puts e.backtrace
      end

      [Page.new(url, error: e, referer: referer, depth: depth)]
    end

    #
    # The maximum number of redirects to follow
    #
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    #
    # The user-agent string which will be sent with each request,
    # or nil if no such option is set. When the option responds to
    # +sample+ (e.g. an Array), a random element is returned.
    #
    def user_agent
      agent = @opts[:user_agent]
      agent.respond_to?(:sample) ? agent.sample : agent
    end

    #
    # The proxy address string
    #
    def proxy_host
      resolve_option(:proxy_host)
    end

    #
    # The proxy port
    #
    def proxy_port
      resolve_option(:proxy_port)
    end

    #
    # The proxy username
    #
    def proxy_user
      resolve_option(:proxy_user)
    end

    #
    # The proxy password
    #
    def proxy_pass
      resolve_option(:proxy_pass)
    end

    #
    # Shorthand to get proxy info with a single call
    # It returns an array of ['addr', port, 'user', 'pass']
    #
    def proxy_host_port
      resolve_option(:proxy_host_port)
    end

    #
    # HTTP read timeout in seconds
    #
    def read_timeout
      @opts[:read_timeout]
    end

    #
    # HTTP open timeout in seconds
    #
    def open_timeout
      @opts[:open_timeout]
    end

    #
    # Does this HTTP client accept cookies from the server?
    #
    def accept_cookies?
      @opts[:accept_cookies]
    end

    #
    # The cookie jar shared by all requests; created on first use and stored
    # back into the options hash.
    #
    def cookie_jar
      @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
      @opts[:cookie_jar]
    end

    private

    #
    # Value of option *key*; when the stored value responds to +call+ it is
    # invoked with this client and its result is returned.
    #
    def resolve_option(key)
      value = @opts[key]
      value.respond_to?(:call) ? value.call(self) : value
    end

    #
    # Retrieve HTTP responses for *url*, including redirects.
    # Yields the response object, response code, URI location, redirect
    # target (or nil) and response time for each response.
    #
    def get(url, referer = nil)
      limit = redirect_limit
      loc = url
      loop do
        response, response_time = get_response(loc, referer)
        code = Integer(response.code)
        redirect_to = redirect_target(response)
        yield response, code, loc, redirect_to, response_time
        limit -= 1
        break unless redirect_to && allowed?(redirect_to, url) && limit > 0

        # A relative Location is resolved against the URL that produced it,
        # so scheme, port and path of the current hop are preserved.
        loc = redirect_to.relative? ? loc.merge(redirect_to) : redirect_to
      end
    end

    #
    # Normalized URI from the Location header of a redirect response, or nil
    # when the response is not a redirect or carries no Location (e.g. 304).
    #
    def redirect_target(response)
      return nil unless response.is_a?(Net::HTTPRedirection)

      location = response['location'].to_s.strip
      return nil if location.empty?

      URI(location).normalize
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string.
    # Returns [response, response_time_in_ms]. A failed request is retried on a
    # fresh connection; after MAX_ATTEMPTS failures the last error is raised.
    #
    def get_response(url, referer = nil)
      headers = request_headers(url, referer)

      attempts = 0
      begin
        start = Time.now
        req = Net::HTTP::Get.new(request_path(url), headers)
        apply_basic_auth(req, url)
        response = connection(url).request(req)
        finish = Time.now
        response_time = ((finish - start) * 1000).round
        store_cookies(response, url) if accept_cookies?
        return response, response_time
      rescue *RESCUABLE_ERRORS => e
        puts e.inspect if verbose?
        refresh_connection(url)
        attempts += 1
        retry if attempts < MAX_ATTEMPTS
        raise e
      end
    end

    #
    # Request headers for *url*
    #
    def request_headers(url, referer)
      headers = {}
      agent = user_agent # sampled once so the check and the value agree
      headers['User-Agent'] = agent if agent
      headers['Referer'] = referer.to_s if referer
      headers['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
      headers['Accept-Encoding'] = 'gzip,deflate'
      headers
    end

    #
    # Path plus query string to request. An empty path becomes "/" because
    # Net::HTTP rejects an empty request path.
    #
    def request_path(url)
      path = url.path.to_s
      path = '/' if path.empty?
      url.query.nil? ? path : "#{path}?#{url.query}"
    end

    #
    # HTTP Basic authentication: credentials embedded in the URL have higher
    # priority than the :http_user / :http_password options.
    #
    def apply_basic_auth(req, url)
      if url.user
        req.basic_auth url.user, url.password
      elsif @opts[:http_user]
        req.basic_auth @opts[:http_user], @opts[:http_password]
      end
    end

    #
    # Feed every Set-Cookie header of *response* to the cookie jar. The headers
    # are handed over one by one: joining them with commas would corrupt
    # cookies whose Expires attribute itself contains a comma.
    #
    def store_cookies(response, url)
      Array(response.get_fields('Set-Cookie')).each do |set_cookie|
        cookie_jar.parse(set_cookie, url)
      end
    end

    #
    # The started connection for the host/port of *url*. A cached connection is
    # reused until it has served :connection_max_hits requests (when that
    # option is set); otherwise a new one is opened.
    #
    def connection(url)
      @connections[url.host] ||= {}
      @connections_hits[url.host] ||= {}

      cached = @connections[url.host][url.port]
      return refresh_connection(url) unless cached

      max_hits = @opts[:connection_max_hits]
      if max_hits && @connections_hits[url.host][url.port] >= max_hits
        @opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
        return refresh_connection(url)
      end

      @connections_hits[url.host][url.port] += 1
      cached
    end

    #
    # Open (and cache) a new connection for the host/port of *url*, closing the
    # one it replaces. Returns the started Net::HTTP object.
    #
    def refresh_connection(url)
      if @opts[:logger] && proxy_host && proxy_port
        @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
      end

      p_host, p_port, p_user, p_pass = proxy_settings

      close_connection(url)

      http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)

      http.read_timeout = read_timeout if read_timeout
      http.open_timeout = open_timeout if open_timeout

      if url.scheme == 'https'
        http.use_ssl = true
        # NOTE(review): certificate verification is disabled, so HTTPS peers
        # are not authenticated. Kept as-is for behavior compatibility.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      (@connections_hits[url.host] ||= {})[url.port] = 1
      (@connections[url.host] ||= {})[url.port] = http.start
    end

    #
    # Proxy settings as [host, port, user, pass].
    # The :proxy_host_port option has higher priority than the single options.
    #
    def proxy_settings
      return proxy_host_port unless @opts[:proxy_host_port].nil?

      [proxy_host, proxy_port, proxy_user, proxy_pass]
    end

    #
    # Forget the cached connection for *url* and close its socket.
    # Closing is best effort: the connection is being thrown away anyway.
    #
    def close_connection(url)
      stale = (@connections[url.host] || {}).delete(url.port)
      stale.finish if stale && stale.started?
    rescue StandardError
      nil
    end

    def verbose?
      @opts[:verbose]
    end

    #
    # Allowed to connect to the requested url?
    # Only relative targets and targets on the host of *from_url* are allowed;
    # host names are compared case-insensitively.
    #
    def allowed?(to_url, from_url)
      to_url.host.nil? || (to_url.host.downcase == from_url.host.to_s.downcase)
    end

    #
    # Decompress the body of *response* in place according to its
    # Content-Encoding header. Responses without a body are left untouched.
    #
    def handle_compression(response)
      body = response.body
      return if body.nil? || body.empty?

      case response['content-encoding'].to_s.strip.downcase
      when 'gzip', 'x-gzip'
        body.replace Zlib::GzipReader.new(StringIO.new(body)).read
      when 'deflate'
        body.replace inflate(body)
      end
    end

    #
    # Inflate *data*. Some servers send a raw deflate stream without the zlib
    # header, so that format is tried when the standard one is rejected.
    #
    def inflate(data)
      Zlib::Inflate.inflate(data)
    rescue Zlib::DataError
      raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
      begin
        raw.inflate(data)
      ensure
        raw.close
      end
    end
  end
end
@@ -0,0 +1,256 @@
1
+ # encoding: UTF-8
2
+ require 'nokogiri'
3
+ require 'json'
4
+ require 'ostruct'
5
+ require 'set'
6
+ require 'kconv'
7
+
8
module Polipus
  #
  # A fetched (or failed) web page: URL, response metadata, body and the
  # links extracted from it. Pages can be converted to and from plain hashes
  # and JSON.
  #
  class Page
    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error
    # Integer response code of the page
    attr_accessor :code
    # Depth of this page from the root of the crawl.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # OpenStruct holding user defined data
    attr_accessor :user_data

    # Values passed as params[:aka] at construction time
    attr_accessor :aliases

    # Additional host names that #in_domain? treats as part of this page's
    # domain
    attr_accessor :domain_aliases

    # Whether the current page should be stored
    # Default: true
    attr_accessor :storable

    # Value passed as params[:fetched_at]; #expired? compares it with
    # Time.now.to_i
    attr_accessor :fetched_at

    #
    # Create a new page for *url*.
    #
    # *params* may contain :code, :headers, :aka, :referer, :depth,
    # :redirect_to, :response_time, :body, :error, :domain_aliases and
    # :fetched_at. Neither *params* nor the headers hash is modified.
    #
    def initialize(url, params = {})
      @url = URI(url)
      @code = params[:code]
      # Work on a copy so the default content-type is not written into the
      # caller's hash
      @headers = (params[:headers] || {}).dup
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      # Resolved before @body is set, i.e. always against @url
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]
      @fetched = !params[:code].nil?
      @user_data = OpenStruct.new
      @domain_aliases = params[:domain_aliases] || []
      @storable = true
      @fetched_at = params[:fetched_at]
    end

    #
    # Array of distinct A tag HREFs from the page, as absolute URIs that are
    # in the domain of the page. Links that cannot be parsed are skipped.
    # The result is computed once and cached.
    #
    def links
      return @links.to_a unless @links.nil?

      @links = Set.new
      return [] unless doc

      doc.search('//a[@href]').each do |a|
        href = a['href']
        next if href.nil? || href.empty?

        absolute = absolute_or_nil(href)
        next unless absolute

        @links << absolute if in_domain?(absolute)
      end
      @links.to_a
    end

    #
    # Nokogiri document for the HTML body, or nil when there is no body, the
    # page is not HTML or the body cannot be parsed.
    #
    def doc
      return @doc if @doc

      @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
    rescue StandardError
      nil
    end

    #
    # Discard links, a next call of page.links will return an empty array
    #
    def discard_links!
      @links = []
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      Array(headers['content-type']).first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      !(content_type.to_s =~ %r{\A(text/html|application/xhtml\+xml)\b}i).nil?
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300...400).include?(@code)
    end

    #
    # Returns +true+ if the page is a HTTP success, returns +false+
    # otherwise.
    #
    def success?
      (200..206).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    # Returns nil when the document has no usable base element.
    #
    def base
      @base ||= base_from_document

      return nil if @base && @base.to_s.empty?

      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page (or on its base element, when present).
    # Returns nil for a nil *link*.
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor: fragments are never sent to the server
      cleaned = link.to_s.strip.sub(/#.*\z/m, '')
      # Normalise escaping. URI.encode / URI.decode were removed in Ruby 3.0;
      # the default parser offers the same escaping on old and new rubies.
      parser = URI::DEFAULT_PARSER
      cleaned = parser.escape(parser.unescape(cleaned))

      relative = URI(cleaned)
      base_uri = base
      # A relative base href is itself resolved against the page URL first
      root = base_uri ? @url.merge(base_uri) : @url
      absolute = root.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      @domain_aliases ||= []
      uri.host == @url.host || @domain_aliases.include?(uri.host)
    end

    #
    # Hash representation of the page with String keys. Headers are
    # serialized with Marshal.
    #
    def to_hash
      {
        'url'           => @url.to_s,
        'headers'       => Marshal.dump(@headers),
        'body'          => @body,
        'links'         => links.map(&:to_s),
        'code'          => @code,
        'depth'         => @depth,
        'referer'       => @referer.to_s,
        'redirect_to'   => @redirect_to.to_s,
        'response_time' => @response_time,
        'fetched'       => @fetched,
        'user_data'     => @user_data.nil? ? {} : @user_data.marshal_dump,
        'fetched_at'    => @fetched_at,
        'error'         => @error.to_s
      }
    end

    #
    # JSON representation of #to_hash without nil/empty values; headers are
    # dropped when the page has no content type. Generator arguments are
    # accepted and forwarded so the page can be nested in other JSON data.
    #
    def to_json(*args)
      th = to_hash.reject { |_key, value| value.nil? || (value.respond_to?(:empty?) && value.empty?) }
      th.delete('headers') if content_type.to_s.empty?
      th.to_json(*args)
    end

    #
    # Returns +true+ if page is marked as storeable
    # +false+ otherwise
    # Default is +true+
    #
    def storable?
      @storable
    end

    #
    # Returns +true+ if the page was fetched more than *ttl* seconds ago.
    # Pages without a fetch time never expire.
    #
    def expired?(ttl)
      return false if fetched_at.nil?

      (Time.now.to_i - ttl) > fetched_at
    end

    #
    # Rebuild a page from a hash produced by #to_hash.
    #
    # NOTE(review): headers are restored with Marshal.load, which is unsafe on
    # data from an untrusted source; only feed this method hashes written by
    # #to_hash.
    #
    def self.from_hash(hash)
      page = new(URI(hash['url']))

      raw_headers = hash['headers']
      headers = raw_headers && !raw_headers.empty? ? Marshal.load(raw_headers) : { 'content-type' => [''] }
      redirect = hash['redirect_to']

      {
        '@headers'       => headers,
        '@body'          => hash['body'],
        '@links'         => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
        '@code'          => hash['code'].to_i,
        '@depth'         => hash['depth'].to_i,
        '@referer'       => hash['referer'],
        '@redirect_to'   => redirect && !redirect.empty? ? URI(redirect) : nil,
        '@response_time' => hash['response_time'].to_i,
        '@fetched'       => hash['fetched'],
        '@user_data'     => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
        '@fetched_at'    => hash['fetched_at'],
        '@error'         => hash['error']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end

    #
    # Rebuild a page from the JSON produced by #to_json
    #
    def self.from_json(json)
      from_hash JSON.parse(json)
    end

    private

    #
    # Absolute URI for *href*, or nil when it cannot be converted
    #
    def absolute_or_nil(href)
      to_absolute(href)
    rescue StandardError
      nil
    end

    #
    # URI of the document's base element, or nil when there is no document or
    # the href is not a valid URI
    #
    def base_from_document
      document = doc
      return nil unless document

      href = document.search('//head/base/@href')
      href.nil? ? nil : URI(href.to_s)
    rescue URI::InvalidURIError
      nil
    end
  end
end