parallel588_polipus 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
@@ -0,0 +1,282 @@
1
+ # encoding: UTF-8
2
+ require 'net/https'
3
+ require 'polipus/page'
4
+ require 'zlib'
5
+ require 'http/cookie'
6
+
7
+ module Polipus
8
+ class HTTP
9
+ # Maximum number of redirects to follow on each get_response
10
+ REDIRECT_LIMIT = 5
11
+ RESCUABLE_ERRORS = [
12
+ EOFError,
13
+ Errno::ECONNREFUSED,
14
+ Errno::ECONNRESET,
15
+ Errno::EHOSTUNREACH,
16
+ Errno::EINVAL,
17
+ Errno::EPIPE,
18
+ Errno::ETIMEDOUT,
19
+ Net::HTTPBadResponse,
20
+ Net::HTTPHeaderSyntaxError,
21
+ Net::ProtocolError,
22
+ SocketError,
23
+ Timeout::Error,
24
+ Zlib::DataError,
25
+ Zlib::GzipFile::Error
26
+ ]
27
+
28
+ def initialize(opts = {})
29
+ @connections = {}
30
+ @connections_hits = {}
31
+ @opts = opts
32
+ end
33
+
34
+ #
35
+ # Fetch a single Page from the response of an HTTP request to *url*.
36
+ # Just gets the final destination page.
37
+ #
38
+ def fetch_page(url, referer = nil, depth = nil)
39
+ fetch_pages(url, referer, depth).last
40
+ end
41
+
42
+ #
43
+ # Create new Pages from the response of an HTTP request to *url*,
44
+ # including redirects
45
+ #
46
+ def fetch_pages(url, referer = nil, depth = nil)
47
+ url = URI(url)
48
+ pages = []
49
+ get(url, referer) do |response, code, location, redirect_to, response_time|
50
+ handle_compression response
51
+ pages << Page.new(location, body: response.body,
52
+ code: code,
53
+ headers: response.to_hash,
54
+ referer: referer,
55
+ depth: depth,
56
+ redirect_to: redirect_to,
57
+ response_time: response_time,
58
+ fetched_at: Time.now.to_i)
59
+ end
60
+
61
+ pages
62
+ rescue *RESCUABLE_ERRORS => e
63
+ if verbose?
64
+ puts e.inspect
65
+ puts e.backtrace
66
+ end
67
+
68
+ [Page.new(url, error: e, referer: referer, depth: depth)]
69
+ end
70
+
71
+ #
72
+ # The maximum number of redirects to follow
73
+ #
74
+ def redirect_limit
75
+ @opts[:redirect_limit] || REDIRECT_LIMIT
76
+ end
77
+
78
+ #
79
+ # The user-agent string which will be sent with each request,
80
+ # or nil if no such option is set
81
+ #
82
+ def user_agent
83
+ if @opts[:user_agent].respond_to?(:sample)
84
+ @opts[:user_agent].sample
85
+ else
86
+ @opts[:user_agent]
87
+ end
88
+ end
89
+
90
+ #
91
+ # The proxy address string
92
+ #
93
+ def proxy_host
94
+ @opts[:proxy_host].respond_to?(:call) ? @opts[:proxy_host].call(self) : @opts[:proxy_host]
95
+ end
96
+
97
+ #
98
+ # The proxy port
99
+ #
100
+ def proxy_port
101
+ @opts[:proxy_port].respond_to?(:call) ? @opts[:proxy_port].call(self) : @opts[:proxy_port]
102
+ end
103
+
104
+ #
105
+ # The proxy username
106
+ #
107
+ def proxy_user
108
+ @opts[:proxy_user].respond_to?(:call) ? @opts[:proxy_user].call(self) : @opts[:proxy_user]
109
+ end
110
+
111
+ #
112
+ # The proxy password
113
+ #
114
+ def proxy_pass
115
+ #return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
116
+ @opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
117
+ end
118
+
119
+ #
120
+ # Shorthand to get proxy info with a single call
121
+ # It returns an array of ['addr', port, 'user', 'pass']
122
+ #
123
+ def proxy_host_port
124
+ @opts[:proxy_host_port].respond_to?(:call) ? @opts[:proxy_host_port].call(self) : @opts[:proxy_host_port]
125
+ end
126
+
127
+ #
128
+ # HTTP read timeout in seconds
129
+ #
130
+ def read_timeout
131
+ @opts[:read_timeout]
132
+ end
133
+
134
+ #
135
+ # HTTP open timeout in seconds
136
+ #
137
+ def open_timeout
138
+ @opts[:open_timeout]
139
+ end
140
+
141
+ # Does this HTTP client accept cookies from the server?
142
+ #
143
+ def accept_cookies?
144
+ @opts[:accept_cookies]
145
+ end
146
+
147
+ def cookie_jar
148
+ @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
149
+ @opts[:cookie_jar]
150
+ end
151
+
152
+ private
153
+
154
+ #
155
+ # Retrieve HTTP responses for *url*, including redirects.
156
+ # Yields the response object, response code, and URI location
157
+ # for each response.
158
+ #
159
+ def get(url, referer = nil)
160
+ limit = redirect_limit
161
+ loc = url
162
+ loop do
163
+ # if redirected to a relative url, merge it with the host of the original
164
+ # request url
165
+ loc = url.merge(loc) if loc.relative?
166
+
167
+ response, response_time = get_response(loc, referer)
168
+ code = Integer(response.code)
169
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
170
+ yield response, code, loc, redirect_to, response_time
171
+ limit -= 1
172
+ break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
173
+ end
174
+ end
175
+
176
+ #
177
+ # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
178
+ #
179
+ def get_response(url, referer = nil)
180
+ full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
181
+
182
+ opts = {}
183
+ opts['User-Agent'] = user_agent if user_agent
184
+ opts['Referer'] = referer.to_s if referer
185
+ opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
186
+ opts['Accept-Encoding'] = 'gzip,deflate'
187
+
188
+ retries = 0
189
+ begin
190
+ start = Time.now
191
+ # format request
192
+ req = Net::HTTP::Get.new(full_path, opts)
193
+ # HTTP Basic authentication
194
+ req.basic_auth url.user, url.password if url.user
195
+ if @opts[:http_user]
196
+ req.basic_auth @opts[:http_user], @opts[:http_password]
197
+ end
198
+ # urls auth schema has higher priority
199
+ req.basic_auth url.user, url.password if url.user
200
+ response = connection(url).request(req)
201
+ finish = Time.now
202
+ response_time = ((finish - start) * 1000).round
203
+ cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies? && response['Set-Cookie']
204
+ return response, response_time
205
+ rescue *RESCUABLE_ERRORS => e
206
+ puts e.inspect if verbose?
207
+ refresh_connection(url)
208
+ retries += 1
209
+ if retries < 3
210
+ retry
211
+ else
212
+ raise e
213
+ end
214
+ end
215
+ end
216
+
217
+ def connection(url)
218
+ @connections[url.host] ||= {}
219
+ @connections_hits[url.host] ||= {}
220
+
221
+ if @connections[url.host][url.port]
222
+ if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
223
+ @opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
224
+ return refresh_connection url
225
+ end
226
+ @connections_hits[url.host][url.port] += 1
227
+ return @connections[url.host][url.port]
228
+ end
229
+
230
+ refresh_connection url
231
+ end
232
+
233
+ def refresh_connection(url)
234
+ if @opts[:logger] && proxy_host && proxy_port
235
+ @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
236
+ end
237
+
238
+ # Block has higher priority
239
+ unless @opts[:proxy_host_port].nil?
240
+ p_host, p_port, p_user, p_pass = proxy_host_port
241
+ else
242
+ p_host = proxy_host
243
+ p_port = proxy_port
244
+ p_user = proxy_user
245
+ p_pass = proxy_pass
246
+ end
247
+
248
+ http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)
249
+
250
+ http.read_timeout = read_timeout if read_timeout
251
+ http.open_timeout = open_timeout if open_timeout
252
+
253
+ if url.scheme == 'https'
254
+ http.use_ssl = true
255
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
256
+ end
257
+ @connections_hits[url.host][url.port] = 1
258
+ @connections[url.host][url.port] = http.start
259
+ end
260
+
261
+ def verbose?
262
+ @opts[:verbose]
263
+ end
264
+
265
+ #
266
+ # Allowed to connect to the requested url?
267
+ #
268
+ def allowed?(to_url, from_url)
269
+ to_url.host.nil? || (to_url.host == from_url.host)
270
+ end
271
+
272
+ def handle_compression(response)
273
+ case response['content-encoding']
274
+ when 'gzip', 'x-gzip'
275
+ body_io = StringIO.new(response.body)
276
+ response.body.replace Zlib::GzipReader.new(body_io).read
277
+ when 'deflate'
278
+ response.body.replace Zlib::Inflate.inflate(response.body)
279
+ end
280
+ end
281
+ end
282
+ end
@@ -0,0 +1,256 @@
1
+ # encoding: UTF-8
2
+ require 'nokogiri'
3
+ require 'json'
4
+ require 'ostruct'
5
+ require 'set'
6
+ require 'kconv'
7
+
8
module Polipus
  class Page
    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error
    # Integer response code of the page
    attr_accessor :code
    # Depth of this page from the root of the crawl.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # OpenStruct it holds users defined data
    attr_accessor :user_data

    attr_accessor :aliases

    attr_accessor :domain_aliases

    # Whether the current page should be stored
    # Default: true
    attr_accessor :storable

    attr_accessor :fetched_at

    #
    # Create a new page
    #
    # @param url [String, URI] page location
    # @param params [Hash] optional attributes: :code, :headers, :aka,
    #   :referer, :depth, :redirect_to, :response_time, :body, :error,
    #   :domain_aliases, :fetched_at
    #
    def initialize(url, params = {})
      @url = URI(url)
      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]
      # A page counts as fetched once it carries a response code.
      @fetched = !params[:code].nil?
      @user_data = OpenStruct.new
      # Use || (not ||=) so the caller's params hash is never mutated.
      @domain_aliases = params[:domain_aliases] || []
      @storable = true
      @fetched_at = params[:fetched_at]
    end

    #
    # Array of distinct A tag HREFs from the page
    # (only links satisfying #in_domain?; memoized on first call)
    #
    def links
      return @links.to_a unless @links.nil?
      @links = Set.new
      return [] unless doc

      doc.search('//a[@href]').each do |a|
        u = a['href']
        next if u.nil? || u.empty?
        abs = to_absolute(u) rescue next
        @links << abs if in_domain?(abs)
      end
      @links.to_a
    end

    #
    # Nokogiri document for the HTML body
    # (nil when there is no body, the page is not HTML, or parsing fails)
    #
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html? rescue nil
    end

    #
    # Discard links, a next call of page.links will return an empty array
    #
    def discard_links!
      @links = []
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # '+' must be escaped: an unescaped /xhtml+xml/ matches "xhtml…lxml",
      # never the literal media type "application/xhtml+xml".
      content_type =~ %r{^(text/html|application/xhtml\+xml)\b}
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300...400).include?(@code)
    end

    #
    # Returns +true+ if the page is a HTTP success, returns +false+
    # otherwise.
    #
    def success?
      (200..206).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    def base
      @base = if doc
                href = doc.search('//head/base/@href')
                URI(href.to_s) unless href.nil? rescue nil
      end unless @base

      return nil if @base && @base.to_s.empty?
      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor; URI.encode/URI.decode were removed in Ruby 3.0,
      # so use the equivalent RFC 2396 parser methods instead.
      stripped = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')
      link = URI::DEFAULT_PARSER.escape(URI::DEFAULT_PARSER.unescape(stripped))

      relative = URI(link)
      absolute = base ? base.merge(relative) : @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      @domain_aliases ||= []
      uri.host == @url.host || @domain_aliases.include?(uri.host)
    end

    # Serializable representation; headers are Marshal-dumped so
    # .from_hash can restore them exactly.
    def to_hash
      {
        'url' => @url.to_s,
        'headers' => Marshal.dump(@headers),
        'body' => @body,
        'links' => links.map(&:to_s),
        'code' => @code,
        'depth' => @depth,
        'referer' => @referer.to_s,
        'redirect_to' => @redirect_to.to_s,
        'response_time' => @response_time,
        'fetched' => @fetched,
        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
        'fetched_at' => @fetched_at,
        'error' => @error.to_s
      }
    end

    # JSON form of #to_hash with empty/nil values pruned.
    # Accepts (and ignores) generator arguments so it is safe when
    # invoked by JSON.generate on a containing structure.
    def to_json(*_args)
      th = to_hash.dup
      th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
      th.delete('headers') if content_type.empty?
      th.to_json
    end

    #
    # Returns +true+ if page is marked as storeable
    # +false+ otherwise
    # Default is +true+
    #
    def storable?
      @storable
    end

    # True when the page was fetched before (now - ttl) seconds ago;
    # pages without a fetch timestamp never expire.
    def expired?(ttl)
      return false if fetched_at.nil?
      (Time.now.to_i - ttl) > fetched_at
    end

    # Rebuild a Page from a #to_hash-style hash by assigning its
    # instance variables directly (bypasses initialize defaults).
    def self.from_hash(hash)
      page = new(URI(hash['url']))
      {
        '@headers' => hash['headers'] && !hash['headers'].empty? ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
        '@body' => hash['body'],
        '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
        '@code' => hash['code'].to_i,
        '@depth' => hash['depth'].to_i,
        '@referer' => hash['referer'],
        '@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
        '@response_time' => hash['response_time'].to_i,
        '@fetched' => hash['fetched'],
        '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
        '@fetched_at' => hash['fetched_at'],
        '@error' => hash['error']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end

    # Rebuild a Page from a #to_json string.
    def self.from_json(json)
      hash = JSON.parse json
      from_hash hash
    end
  end
end