polipus 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
  metadata.gz: !binary |-
- Mzc5NDczMjAxNmUzY2JlYzA4YzUwZDUzMGY0Y2ZjZGQxOTBmMTk5YQ==
+ ZTc3MjQ1OWQwNzVhMWFhMGQ2NTdlYjM3ZTkyZDQ3ZDAwZDExZWQ1Mw==
  data.tar.gz: !binary |-
- NDgxNzhlYTAyNTMxYWE1MTBkMzIxMWMyMDRlMmQ0NjA2ZTc1MzY2Mw==
+ OTUzYTE5M2U4YTQ3ZGVmZTAzMzdiYjJmZWYzM2Q3MTU0NDMyYzAwMQ==
  SHA512:
  metadata.gz: !binary |-
- YTNmMGY4YWRkNmU5YThhMTY0ODBiMzY1MmI3MzkzMzEzZjhiZGI4NDI1Mjdh
- MGI0YWNjZDEyM2NhZDlkZTY2MjY3ZTA5YzVjNWE5YjFjZWEwZjRhMGI3ZWE2
- MTc0NDczOWIyYTE1YjY1MzA2ZjcwOTRjNWZiOGVlMjIyOTJkNGU=
+ MDk5MGQ2MzBkYzU2MjJlNDg1YTkwYTU1YjJjYWQ0YjAyNDY5OTZkNWJlZDIw
+ NDAwNjY2ZjMwMGUxZWE0NTNiNzc5YmIzZTg2NjcwNjFjZTMyNzIxZjZlYzZm
+ N2ZjMTk2ZjRkYjU0M2VjZDk0NWMxYzk0MjE4MWRkOWFiY2M3YTA=
  data.tar.gz: !binary |-
- ZTVmZGQ0NTgyNTA2NGYwZDVhMTg3NmUzMWM0NDExMGIwOTU5NjM2ZGZkNWM3
- MWYyYWQwM2NkMmNiMDgxYWY3NTdmYjE5NDZhOGE1YTQxNjRjNjUxNTIzYTc2
- NTU2ZDZiYzllNWYxZjdiNDIwMzYyNGYyN2YyM2ZmY2VmNWU1NzA=
+ OTgwMTI5MWFhNWQ5Mjk4OWNmZTk3ZGE0MTMyYzM5NDlkMWJhMjFiMWQ4NDQ4
+ OGI1NDU3ZDQ0ZTkzNWFkMzAyZjg3YmRiNDlmN2I0ZDNlNWRlZmVkMjIzMWQ2
+ MGY0NGQ4YTQ1ZmEyMGQ0M2VkNzE2YzIyOGMxOGE4MDQzMWFkZjU=
data/AUTHORS.md ADDED
@@ -0,0 +1,4 @@
+ # Authors
+
+ * [Francesco Laurita](francesco.laurita@gmail.com)
+ * [Tobias L. Maier](http://tobiasmaier.info/)
data/CHANGELOG.md ADDED
@@ -0,0 +1,20 @@
+ # Changelog
+
+ ## 0.3.0 (2015-06-02)
+
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.2.2...0.3.0)
+
+ * Add `PolipusCrawler#add_to_queue` to add a page back to the queue
+   [#24](https://github.com/taganaka/polipus/pull/24)
+ * Introduce new block `PolipusCrawler#on_page_error` which runs when there was an error (`Page#error`).
+   For example a connectivity error.
+   See `/examples/error_handling.rb`
+   [#15](https://github.com/taganaka/polipus/issues/15)
+ * Add `Page#success?` which returns true if HTTP code is something in between 200 and 206.
+ * Polipus supports now `robots.txt` directives.
+   Set the option `:obey_robots_txt` to `true`.
+   See `/examples/robots_txt_handling.rb`
+   [#30](https://github.com/taganaka/polipus/pull/30)
+ * Add support for GZIP and deflate compressed HTTP requests
+   [#26](https://github.com/taganaka/polipus/pull/26)
+ * Minor improvements to code style
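Taken together, the new 0.3.0 hooks compose naturally. A minimal sketch (the job name and URL are illustrative only; the bundled examples further down show the canonical usage):

    require 'polipus'

    Polipus.crawler('rubygems', 'http://rubygems.org/', obey_robots_txt: true) do |crawler|
      # Re-queue pages that failed with a connectivity error (Page#error is set)
      crawler.on_page_error do |page|
        page.storable = false
        crawler.add_to_queue(page)
      end

      # Only process responses with a 2xx code, using the new Page#success?
      crawler.on_page_downloaded do |page|
        next unless page.success?
        puts page.url
      end
    end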
data/Gemfile CHANGED
@@ -1,3 +1,3 @@
- source "http://rubygems.org"
+ source 'https://rubygems.org'
 
- gemspec
+ gemspec
data/examples/error_handling.rb ADDED
@@ -0,0 +1,22 @@
+ require 'polipus'
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
+   # Handle connectivity errors
+   # Only runs when there is an error
+   crawler.on_page_error do |page|
+     # Don't store the page
+     page.storable = false
+     # Add the URL again to the queue
+     crawler.add_to_queue(page)
+   end
+
+   # In-place page processing
+   # Runs also when there was an error in the page
+   crawler.on_page_downloaded do |page|
+     # Skip block if there is an error
+     return if page.error
+
+     # A nokogiri object
+     puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+   end
+ end
data/examples/robots_txt_handling.rb ADDED
@@ -0,0 +1,13 @@
+ require 'polipus'
+
+ options = {
+   user_agent: 'Googlebot', # Act as Google bot
+   obey_robots_txt: true # Follow /robots.txt rules if any
+ }
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
+
+   crawler.on_page_downloaded do |page|
+     puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+   end
+ end
data/lib/polipus.rb CHANGED
@@ -8,6 +8,7 @@ require "polipus/storage"
  require "polipus/url_tracker"
  require "polipus/plugin"
  require "polipus/queue_overflow"
+ require "polipus/robotex"
  require "thread"
  require "logger"
  require "json"
@@ -62,6 +63,7 @@ module Polipus
  :stats_enabled => false,
  # Cookies strategy
  :cookie_jar => nil,
+ # whether or not accept cookies
  :accept_cookies => false,
  # A set of hosts that should be considered parts of the same domain
  # Eg It can be used to follow links with and without 'www' domain
@@ -69,7 +71,9 @@ module Polipus
  # Mark a connection as staled after connection_max_hits request
  :connection_max_hits => nil,
  # Page TTL: mark a page as expired after ttl_page seconds
- :ttl_page => nil
+ :ttl_page => nil,
+ # don't obey the robots exclusion protocol
+ :obey_robots_txt => false
  }
 
  attr_reader :storage
@@ -110,6 +114,7 @@ module Polipus
  @skip_links_like = []
  @on_page_downloaded = []
  @on_before_save = []
+ @on_page_error = []
  @focus_crawl_block = nil
  @on_crawl_end = []
  @redis_factory = nil
@@ -122,8 +127,8 @@ module Polipus
 
  @urls = [urls].flatten.map{ |url| URI(url) }
  @urls.each{ |url| url.path = '/' if url.path.empty? }
-
  @internal_queue = queue_factory
+ @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
 
  execute_plugin 'on_initialize'
 
@@ -139,14 +144,10 @@ module Polipus
  PolipusSignalHandler.enable
  overflow_items_controller if queue_overflow_adapter
 
- q = queue_factory
  @urls.each do |u|
- page = Page.new(u.to_s, :referer => '')
- page.user_data.p_seeded = true
- q << page.to_json
+ add_url(u) { |page| page.user_data.p_seeded = true }
  end
-
- return if q.empty?
+ return if @internal_queue.empty?
 
  execute_plugin 'on_crawl_start'
  @options[:workers].times do |worker_number|
@@ -194,27 +195,28 @@ module Polipus
  page = pages.last
  end
 
- # Execute on_before_save blocks
- @on_before_save.each {|e| e.call(page)} unless page.nil?
  execute_plugin 'on_after_download'
 
- @logger.warn {"Page #{page.url} has error: #{page.error}"} if page.error
+ if page.error
+ @logger.warn {"Page #{page.url} has error: #{page.error}"}
+ incr_error
+ @on_page_error.each {|e| e.call(page)}
+ end
 
- incr_error if page.error
+ # Execute on_before_save blocks
+ @on_before_save.each {|e| e.call(page)}
 
- if page && page.storable?
+ if page.storable?
  @storage.add page
  end
 
- if page
- @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
- @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
- end
-
+ @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
+
  incr_pages
 
  # Execute on_page_downloaded blocks
- @on_page_downloaded.each {|e| e.call(page)} unless page.nil?
+ @on_page_downloaded.each {|e| e.call(page)}
 
  if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
  links_for(page).each do |url_to_visit|
@@ -264,6 +266,7 @@ module Polipus
  self
  end
 
+ # A block of code will be executed when crawl session is over
  def on_crawl_end(&block)
  @on_crawl_end << block
  self
@@ -276,6 +279,12 @@ module Polipus
  self
  end
 
+ # A block of code will be executed whether a page contains an error
+ def on_page_error(&block)
+ @on_page_error << block
+ self
+ end
+
  # A block of code will be executed
  # on every page downloaded. The code is used to extract urls to visit
  # see links_for method
@@ -313,9 +322,18 @@ module Polipus
  @redis ||= redis_factory_adapter
  end
 
+ def add_to_queue(page)
+ if [:url, :referer, :depth].all? { |method| page.respond_to?(method) }
+ add_url(page.url, referer: page.referer, depth: page.depth)
+ else
+ add_url(page)
+ end
+ end
+
  # Enqueue an url, no matter what
- def add_url url
- page = Page.new(url)
+ def add_url(url, params = {})
+ page = Page.new(url, params)
+ yield(page) if block_given?
  @internal_queue << page.to_json
  end
 
@@ -329,7 +347,11 @@ module Polipus
  private
  # URLs enqueue policy
  def should_be_visited?(url, with_tracker = true)
+
  case
+ # robots.txt
+ when !allowed_by_robot?(url)
+ false
  # Check against whitelist pattern matching
  when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
  false
@@ -368,6 +390,17 @@ module Polipus
  @storage.exists?(page) && !page_expired?(page)
  end
 
+ #
+ # Returns +true+ if we are obeying robots.txt and the link
+ # is granted access in it. Always returns +true+ when we are
+ # not obeying robots.txt.
+ #
+ def allowed_by_robot?(link)
+ return true if @robots.nil?
+ @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+ end
+
+
  # The url is enqueued for a later visit
  def enqueue url_to_visit, current_page, queue
  page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
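The reworked queueing API above accepts params and an optional block; a rough usage sketch (the crawler variable, URL, failed_page, and the user_data field are illustrative only):

    # add_url now builds the Page from params and yields it before enqueueing
    crawler.add_url('http://rubygems.org/gems', referer: '') do |page|
      page.user_data.seeded_manually = true  # user_data is an OpenStruct
    end

    # add_to_queue re-enqueues an existing Page, keeping its referer and depth
    crawler.add_to_queue(failed_page)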
data/lib/polipus/http.rb CHANGED
@@ -7,6 +7,21 @@ module Polipus
  class HTTP
  # Maximum number of redirects to follow on each get_response
  REDIRECT_LIMIT = 5
+ RESCUABLE_ERRORS = [
+ EOFError,
+ Errno::ECONNREFUSED,
+ Errno::ECONNRESET,
+ Errno::EHOSTUNREACH,
+ Errno::EINVAL,
+ Errno::EPIPE,
+ Errno::ETIMEDOUT,
+ Net::HTTPBadResponse,
+ Net::HTTPHeaderSyntaxError,
+ Net::ProtocolError,
+ SocketError,
+ Timeout::Error,
+ Zlib::DataError
+ ]
 
  def initialize(opts = {})
  @connections = {}
@@ -30,13 +45,8 @@ module Polipus
  url = URI(url)
  pages = []
  get(url, referer) do |response, code, location, redirect_to, response_time|
- body = response.body.dup
- if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
- gzip = Zlib::GzipReader.new(StringIO.new(body))
- body = gzip.read
- end
-
- pages << Page.new(location, :body => body,
+ handle_compression response
+ pages << Page.new(location, :body => response.body,
  :code => code,
  :headers => response.to_hash,
  :referer => referer,
@@ -47,13 +57,13 @@
  end
 
  pages
- rescue StandardError => e
+ rescue *RESCUABLE_ERRORS => e
  if verbose?
  puts e.inspect
  puts e.backtrace
  end
 
- [Page.new(url, :error => e)]
+ [Page.new(url, error: e, referer: referer, depth: depth)]
  end
 
  #
@@ -154,7 +164,7 @@ module Polipus
  opts['User-Agent'] = user_agent if user_agent
  opts['Referer'] = referer.to_s if referer
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
- opts['Accept-Encoding'] = 'gzip'
+ opts['Accept-Encoding'] = 'gzip,deflate'
 
 
  retries = 0
@@ -169,8 +179,7 @@
  response_time = ((finish - start) * 1000).round
  cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
  return response, response_time
- rescue StandardError => e
-
+ rescue *RESCUABLE_ERRORS => e
  puts e.inspect if verbose?
  refresh_connection(url)
  retries += 1
@@ -229,8 +238,14 @@
  to_url.host.nil? || (to_url.host == from_url.host)
  end
 
- def gzip_enabled?
- @opts[:gzip_enabled]
+ def handle_compression response
+ case response["content-encoding"]
+ when "gzip", "x-gzip"
+ body_io = StringIO.new(response.body)
+ response.body.replace Zlib::GzipReader.new(body_io).read
+ when "deflate"
+ response.body.replace Zlib::Inflate.inflate(response.body)
+ end
  end
 
  end
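For reference, the decompression strategy used by handle_compression can be sketched in isolation; the helper name below is hypothetical and not part of the gem:

    require 'zlib'
    require 'stringio'

    # Decompress an HTTP body according to its Content-Encoding header value
    def decompress_body(body, encoding)
      case encoding
      when 'gzip', 'x-gzip'
        Zlib::GzipReader.new(StringIO.new(body)).read
      when 'deflate'
        Zlib::Inflate.inflate(body)
      else
        body
      end
    end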
data/lib/polipus/page.rb CHANGED
@@ -17,8 +17,7 @@ module Polipus
  attr_reader :error
  # Integer response code of the page
  attr_accessor :code
- # Depth of this page from the root of the crawl. This is not necessarily the
- # shortest path; use PageStore#shortest_paths! to find that value.
+ # Depth of this page from the root of the crawl.
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
@@ -41,7 +40,7 @@
  # Create a new page
  #
  def initialize(url, params = {})
- @url = url.kind_of?(URI) ? url : URI(url)
+ @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  @headers['content-type'] ||= ['']
@@ -130,6 +129,14 @@
  (300..307).include?(@code)
  end
 
+ #
+ # Returns +true+ if the page is a HTTP success, returns +false+
+ # otherwise.
+ #
+ def success?
+ (200..206).include?(@code)
+ end
+
  #
  # Returns +true+ if the page was not found (returned 404 code),
  # returns +false+ otherwise.
@@ -192,7 +199,8 @@
  'response_time' => @response_time,
  'fetched' => @fetched,
  'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
- 'fetched_at' => @fetched_at
+ 'fetched_at' => @fetched_at,
+ 'error' => @error
  }
  end
 
@@ -230,7 +238,8 @@
  '@response_time' => hash['response_time'].to_i,
  '@fetched' => hash['fetched'],
  '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
- '@fetched_at' => hash['fetched_at']
+ '@fetched_at' => hash['fetched_at'],
+ '@error' => hash['error']
  }.each do |var, value|
  page.instance_variable_set(var, value)
  end
@@ -242,4 +251,4 @@
  self.from_hash hash
  end
  end
- end
+ end
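With 'error' now included in the serialized hash, an error should survive a round trip through Page#to_hash and Page.from_hash. A small sketch, assuming from_hash accepts the hash produced by to_hash as the surrounding code suggests (URL and error string are illustrative):

    page = Polipus::Page.new('http://rubygems.org/', error: 'connection refused')
    restored = Polipus::Page.from_hash(page.to_hash)
    restored.error  # => "connection refused"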
data/lib/polipus/robotex.rb ADDED
@@ -0,0 +1,154 @@
+ require 'open-uri'
+ require 'uri'
+ require 'timeout'
+ module Polipus
+
+   # Original code taken from
+   # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
+
+   class Robotex
+
+     DEFAULT_TIMEOUT = 3
+     VERSION = '1.0.0'
+
+     attr_reader :user_agent
+
+     class ParsedRobots
+
+       def initialize(uri, user_agent)
+         io = Robotex.get_robots_txt(uri, user_agent)
+         if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+           io = StringIO.new("User-agent: *\nAllow: /\n")
+         end
+
+         @disallows = {}
+         @allows = {}
+         @delays = {}
+         agent = /.*/
+         io.each do |line|
+           next if line =~ /^\s*(#.*|$)/
+           arr = line.split(":")
+           key = arr.shift
+           value = arr.join(":").strip
+           value.strip!
+           case key.downcase
+           when "user-agent"
+             agent = to_regex(value)
+           when "allow"
+             unless value.empty?
+               @allows[agent] ||= []
+               @allows[agent] << to_regex(value)
+             end
+           when "disallow"
+             unless value.empty?
+               @disallows[agent] ||= []
+               @disallows[agent] << to_regex(value)
+             end
+           when "crawl-delay"
+             @delays[agent] = value.to_i
+           end
+         end
+         @parsed = true
+       end
+
+       def allowed?(uri, user_agent)
+         return true unless @parsed
+         allowed = true
+         uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+         path = uri.request_uri
+
+         @allows.each do |key, value|
+           unless allowed
+             if user_agent =~ key
+               value.each do |rule|
+                 if path =~ rule
+                   allowed = true
+                 end
+               end
+             end
+           end
+         end
+
+         @disallows.each do |key, value|
+           if user_agent =~ key
+             value.each do |rule|
+               if path =~ rule
+                 allowed = false
+               end
+             end
+           end
+         end
+
+         return allowed
+       end
+
+       def delay(user_agent)
+         @delays.each do |agent, delay|
+           return delay if agent =~ user_agent
+         end
+         nil
+       end
+
+       protected
+
+       def to_regex(pattern)
+         pattern = Regexp.escape(pattern)
+         pattern.gsub!(Regexp.escape("*"), ".*")
+         Regexp.compile("^#{pattern}")
+       end
+     end
+
+     def self.get_robots_txt(uri, user_agent)
+       begin
+         Timeout::timeout(Robotex.timeout) do
+           URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+         end
+       rescue Timeout::Error
+         STDERR.puts "robots.txt request timed out"
+       end
+     end
+
+     def self.timeout=(t)
+       @timeout = t
+     end
+
+     def self.timeout
+       @timeout || DEFAULT_TIMEOUT
+     end
+
+     def initialize(user_agent = nil)
+       user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
+       @user_agent = user_agent
+       @last_accessed = Time.at(1)
+       @parsed = {}
+     end
+
+     def parse_host(uri)
+       uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+       @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
+     end
+
+     #
+     # Download the server's robots.txt, and return try if we are allowed to acces the url, false otherwise
+     #
+     def allowed?(uri)
+       parse_host(uri).allowed?(uri, @user_agent)
+     end
+
+     #
+     # Return the value of the Crawl-Delay directive, or nil if none
+     def delay(uri)
+       parse_host(uri).delay(@user_agent)
+     end
+
+     #
+     # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
+     #
+     def delay!(uri)
+       delay = delay(uri)
+       sleep delay - (Time.now - @last_accessed) if !!delay
+       @last_accessed = Time.now
+     end
+
+   end
+ end
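The vendored Robotex class can also be used on its own; a brief sketch (URL and agent string are illustrative):

    robots = Polipus::Robotex.new('MyCrawler/1.0')
    url = 'http://rubygems.org/gems'
    if robots.allowed?(url)
      robots.delay!(url)  # sleep to honor Crawl-Delay, if the site sets one
      # ... fetch the page
    end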