polipus 0.2.2 → 0.3.0

checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
  metadata.gz: !binary |-
- Mzc5NDczMjAxNmUzY2JlYzA4YzUwZDUzMGY0Y2ZjZGQxOTBmMTk5YQ==
+ ZTc3MjQ1OWQwNzVhMWFhMGQ2NTdlYjM3ZTkyZDQ3ZDAwZDExZWQ1Mw==
  data.tar.gz: !binary |-
- NDgxNzhlYTAyNTMxYWE1MTBkMzIxMWMyMDRlMmQ0NjA2ZTc1MzY2Mw==
+ OTUzYTE5M2U4YTQ3ZGVmZTAzMzdiYjJmZWYzM2Q3MTU0NDMyYzAwMQ==
  SHA512:
  metadata.gz: !binary |-
- YTNmMGY4YWRkNmU5YThhMTY0ODBiMzY1MmI3MzkzMzEzZjhiZGI4NDI1Mjdh
- MGI0YWNjZDEyM2NhZDlkZTY2MjY3ZTA5YzVjNWE5YjFjZWEwZjRhMGI3ZWE2
- MTc0NDczOWIyYTE1YjY1MzA2ZjcwOTRjNWZiOGVlMjIyOTJkNGU=
+ MDk5MGQ2MzBkYzU2MjJlNDg1YTkwYTU1YjJjYWQ0YjAyNDY5OTZkNWJlZDIw
+ NDAwNjY2ZjMwMGUxZWE0NTNiNzc5YmIzZTg2NjcwNjFjZTMyNzIxZjZlYzZm
+ N2ZjMTk2ZjRkYjU0M2VjZDk0NWMxYzk0MjE4MWRkOWFiY2M3YTA=
  data.tar.gz: !binary |-
- ZTVmZGQ0NTgyNTA2NGYwZDVhMTg3NmUzMWM0NDExMGIwOTU5NjM2ZGZkNWM3
- MWYyYWQwM2NkMmNiMDgxYWY3NTdmYjE5NDZhOGE1YTQxNjRjNjUxNTIzYTc2
- NTU2ZDZiYzllNWYxZjdiNDIwMzYyNGYyN2YyM2ZmY2VmNWU1NzA=
+ OTgwMTI5MWFhNWQ5Mjk4OWNmZTk3ZGE0MTMyYzM5NDlkMWJhMjFiMWQ4NDQ4
+ OGI1NDU3ZDQ0ZTkzNWFkMzAyZjg3YmRiNDlmN2I0ZDNlNWRlZmVkMjIzMWQ2
+ MGY0NGQ4YTQ1ZmEyMGQ0M2VkNzE2YzIyOGMxOGE4MDQzMWFkZjU=
data/AUTHORS.md ADDED
@@ -0,0 +1,4 @@
+ # Authors
+
+ * [Francesco Laurita](francesco.laurita@gmail.com)
+ * [Tobias L. Maier](http://tobiasmaier.info/)
data/CHANGELOG.md ADDED
@@ -0,0 +1,20 @@
+ # Changelog
+
+ ## 0.3.0 (2015-06-02)
+
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.2.2...0.3.0)
+
+ * Add `PolipusCrawler#add_to_queue` to add a page back to the queue
+   [#24](https://github.com/taganaka/polipus/pull/24)
+ * Introduce new block `PolipusCrawler#on_page_error` which runs when there was an error (`Page#error`).
+   For example a connectivity error.
+   See `/examples/error_handling.rb`
+   [#15](https://github.com/taganaka/polipus/issues/15)
+ * Add `Page#success?` which returns true if HTTP code is something in between 200 and 206.
+ * Polipus supports now `robots.txt` directives.
+   Set the option `:obey_robots_txt` to `true`.
+   See `/examples/robots_txt_handling.rb`
+   [#30](https://github.com/taganaka/polipus/pull/30)
+ * Add support for GZIP and deflate compressed HTTP requests
+   [#26](https://github.com/taganaka/polipus/pull/26)
+ * Minor improvements to code style
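
The `Page#success?` entry above ships without a dedicated example file, so here is a minimal usage sketch (editorial, not part of the diff; the crawler name and start URL mirror the examples further down):

    require 'polipus'

    Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
      crawler.on_page_downloaded do |page|
        # Page#success? is true only for HTTP status codes 200..206
        next unless page.success?
        puts "#{page.url} fetched with code #{page.code}"
      end
    end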
data/Gemfile CHANGED
@@ -1,3 +1,3 @@
- source "http://rubygems.org"
+ source 'https://rubygems.org'

- gemspec
+ gemspec
data/examples/error_handling.rb ADDED
@@ -0,0 +1,22 @@
+ require 'polipus'
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
+   # Handle connectivity errors
+   # Only runs when there is an error
+   crawler.on_page_error do |page|
+     # Don't store the page
+     page.storable = false
+     # Add the URL again to the queue
+     crawler.add_to_queue(page)
+   end
+
+   # In-place page processing
+   # Runs also when there was an error in the page
+   crawler.on_page_downloaded do |page|
+     # Skip block if there is an error
+     return if page.error
+
+     # A nokogiri object
+     puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+   end
+ end
data/examples/robots_txt_handling.rb ADDED
@@ -0,0 +1,13 @@
+ require 'polipus'
+
+ options = {
+   user_agent: 'Googlebot', # Act as Google bot
+   obey_robots_txt: true # Follow /robots.txt rules if any
+ }
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
+
+   crawler.on_page_downloaded do |page|
+     puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+   end
+ end
data/lib/polipus.rb CHANGED
@@ -8,6 +8,7 @@ require "polipus/storage"
  require "polipus/url_tracker"
  require "polipus/plugin"
  require "polipus/queue_overflow"
+ require "polipus/robotex"
  require "thread"
  require "logger"
  require "json"
@@ -62,6 +63,7 @@ module Polipus
  :stats_enabled => false,
  # Cookies strategy
  :cookie_jar => nil,
+ # whether or not accept cookies
  :accept_cookies => false,
  # A set of hosts that should be considered parts of the same domain
  # Eg It can be used to follow links with and without 'www' domain
@@ -69,7 +71,9 @@ module Polipus
  # Mark a connection as staled after connection_max_hits request
  :connection_max_hits => nil,
  # Page TTL: mark a page as expired after ttl_page seconds
- :ttl_page => nil
+ :ttl_page => nil,
+ # don't obey the robots exclusion protocol
+ :obey_robots_txt => false
  }

  attr_reader :storage
@@ -110,6 +114,7 @@ module Polipus
  @skip_links_like = []
  @on_page_downloaded = []
  @on_before_save = []
+ @on_page_error = []
  @focus_crawl_block = nil
  @on_crawl_end = []
  @redis_factory = nil
@@ -122,8 +127,8 @@

  @urls = [urls].flatten.map{ |url| URI(url) }
  @urls.each{ |url| url.path = '/' if url.path.empty? }
-
  @internal_queue = queue_factory
+ @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]

  execute_plugin 'on_initialize'

@@ -139,14 +144,10 @@
  PolipusSignalHandler.enable
  overflow_items_controller if queue_overflow_adapter

- q = queue_factory
  @urls.each do |u|
- page = Page.new(u.to_s, :referer => '')
- page.user_data.p_seeded = true
- q << page.to_json
+ add_url(u) { |page| page.user_data.p_seeded = true }
  end
-
- return if q.empty?
+ return if @internal_queue.empty?

  execute_plugin 'on_crawl_start'
  @options[:workers].times do |worker_number|
@@ -194,27 +195,28 @@
  page = pages.last
  end

- # Execute on_before_save blocks
- @on_before_save.each {|e| e.call(page)} unless page.nil?
  execute_plugin 'on_after_download'

- @logger.warn {"Page #{page.url} has error: #{page.error}"} if page.error
+ if page.error
+ @logger.warn {"Page #{page.url} has error: #{page.error}"}
+ incr_error
+ @on_page_error.each {|e| e.call(page)}
+ end

- incr_error if page.error
+ # Execute on_before_save blocks
+ @on_before_save.each {|e| e.call(page)}

- if page && page.storable?
+ if page.storable?
  @storage.add page
  end

- if page
- @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
- @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
- end
-
+ @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
+
  incr_pages

  # Execute on_page_downloaded blocks
- @on_page_downloaded.each {|e| e.call(page)} unless page.nil?
+ @on_page_downloaded.each {|e| e.call(page)}

  if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
  links_for(page).each do |url_to_visit|
@@ -264,6 +266,7 @@
  self
  end

+ # A block of code will be executed when crawl session is over
  def on_crawl_end(&block)
  @on_crawl_end << block
  self
@@ -276,6 +279,12 @@
  self
  end

+ # A block of code will be executed whether a page contains an error
+ def on_page_error(&block)
+ @on_page_error << block
+ self
+ end
+
  # A block of code will be executed
  # on every page downloaded. The code is used to extract urls to visit
  # see links_for method
@@ -313,9 +322,18 @@
  @redis ||= redis_factory_adapter
  end

+ def add_to_queue(page)
+ if [:url, :referer, :depth].all? { |method| page.respond_to?(method) }
+ add_url(page.url, referer: page.referer, depth: page.depth)
+ else
+ add_url(page)
+ end
+ end
+
  # Enqueue an url, no matter what
- def add_url url
- page = Page.new(url)
+ def add_url(url, params = {})
+ page = Page.new(url, params)
+ yield(page) if block_given?
  @internal_queue << page.to_json
  end

@@ -329,7 +347,11 @@
  private
  # URLs enqueue policy
  def should_be_visited?(url, with_tracker = true)
+
  case
+ # robots.txt
+ when !allowed_by_robot?(url)
+ false
  # Check against whitelist pattern matching
  when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
  false
@@ -368,6 +390,17 @@
  @storage.exists?(page) && !page_expired?(page)
  end

+ #
+ # Returns +true+ if we are obeying robots.txt and the link
+ # is granted access in it. Always returns +true+ when we are
+ # not obeying robots.txt.
+ #
+ def allowed_by_robot?(link)
+ return true if @robots.nil?
+ @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+ end
+
+
  # The url is enqueued for a later visit
  def enqueue url_to_visit, current_page, queue
  page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
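
The reworked `add_url` above now accepts a params hash and an optional block, which both `add_to_queue` and the seeding loop rely on. A short sketch of calling it directly from a crawl block (the extra URL and the `user_data` flag name are illustrative only):

    Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
      # Enqueue one more URL with an explicit referer and depth,
      # tagging it through the block just like the seeding code does
      crawler.add_url('http://rubygems.org/gems', referer: '', depth: 0) do |page|
        page.user_data.manually_added = true
      end
    end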
data/lib/polipus/http.rb CHANGED
@@ -7,6 +7,21 @@ module Polipus
  class HTTP
  # Maximum number of redirects to follow on each get_response
  REDIRECT_LIMIT = 5
+ RESCUABLE_ERRORS = [
+ EOFError,
+ Errno::ECONNREFUSED,
+ Errno::ECONNRESET,
+ Errno::EHOSTUNREACH,
+ Errno::EINVAL,
+ Errno::EPIPE,
+ Errno::ETIMEDOUT,
+ Net::HTTPBadResponse,
+ Net::HTTPHeaderSyntaxError,
+ Net::ProtocolError,
+ SocketError,
+ Timeout::Error,
+ Zlib::DataError
+ ]

  def initialize(opts = {})
  @connections = {}
@@ -30,13 +45,8 @@
  url = URI(url)
  pages = []
  get(url, referer) do |response, code, location, redirect_to, response_time|
- body = response.body.dup
- if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
- gzip = Zlib::GzipReader.new(StringIO.new(body))
- body = gzip.read
- end
-
- pages << Page.new(location, :body => body,
+ handle_compression response
+ pages << Page.new(location, :body => response.body,
  :code => code,
  :headers => response.to_hash,
  :referer => referer,
@@ -47,13 +57,13 @@
  end

  pages
- rescue StandardError => e
+ rescue *RESCUABLE_ERRORS => e
  if verbose?
  puts e.inspect
  puts e.backtrace
  end

- [Page.new(url, :error => e)]
+ [Page.new(url, error: e, referer: referer, depth: depth)]
  end

  #
@@ -154,7 +164,7 @@
  opts['User-Agent'] = user_agent if user_agent
  opts['Referer'] = referer.to_s if referer
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
- opts['Accept-Encoding'] = 'gzip'
+ opts['Accept-Encoding'] = 'gzip,deflate'


  retries = 0
@@ -169,8 +179,7 @@
  response_time = ((finish - start) * 1000).round
  cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
  return response, response_time
- rescue StandardError => e
-
+ rescue *RESCUABLE_ERRORS => e
  puts e.inspect if verbose?
  refresh_connection(url)
  retries += 1
@@ -229,8 +238,14 @@
  to_url.host.nil? || (to_url.host == from_url.host)
  end

- def gzip_enabled?
- @opts[:gzip_enabled]
+ def handle_compression response
+ case response["content-encoding"]
+ when "gzip", "x-gzip"
+ body_io = StringIO.new(response.body)
+ response.body.replace Zlib::GzipReader.new(body_io).read
+ when "deflate"
+ response.body.replace Zlib::Inflate.inflate(response.body)
+ end
  end

  end
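
For reference, a standalone sketch of the gzip/deflate handling that `handle_compression` applies to response bodies, using only the Zlib calls shown above (sample strings, not wired into the gem):

    require 'zlib'
    require 'stringio'

    original = 'hello polipus'

    # gzip branch: bodies marked "gzip"/"x-gzip" go through Zlib::GzipReader
    io = StringIO.new
    gz = Zlib::GzipWriter.new(io)
    gz.write(original)
    gz.close
    puts Zlib::GzipReader.new(StringIO.new(io.string)).read  # => "hello polipus"

    # deflate branch: plain zlib streams go through Zlib::Inflate
    deflated = Zlib::Deflate.deflate(original)
    puts Zlib::Inflate.inflate(deflated)                      # => "hello polipus"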
data/lib/polipus/page.rb CHANGED
@@ -17,8 +17,7 @@ module Polipus
  attr_reader :error
  # Integer response code of the page
  attr_accessor :code
- # Depth of this page from the root of the crawl. This is not necessarily the
- # shortest path; use PageStore#shortest_paths! to find that value.
+ # Depth of this page from the root of the crawl.
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
@@ -41,7 +40,7 @@
  # Create a new page
  #
  def initialize(url, params = {})
- @url = url.kind_of?(URI) ? url : URI(url)
+ @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  @headers['content-type'] ||= ['']
@@ -130,6 +129,14 @@
  (300..307).include?(@code)
  end

+ #
+ # Returns +true+ if the page is a HTTP success, returns +false+
+ # otherwise.
+ #
+ def success?
+ (200..206).include?(@code)
+ end
+
  #
  # Returns +true+ if the page was not found (returned 404 code),
  # returns +false+ otherwise.
@@ -192,7 +199,8 @@
  'response_time' => @response_time,
  'fetched' => @fetched,
  'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
- 'fetched_at' => @fetched_at
+ 'fetched_at' => @fetched_at,
+ 'error' => @error
  }
  end

@@ -230,7 +238,8 @@
  '@response_time' => hash['response_time'].to_i,
  '@fetched' => hash['fetched'],
  '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
- '@fetched_at' => hash['fetched_at']
+ '@fetched_at' => hash['fetched_at'],
+ '@error' => hash['error']
  }.each do |var, value|
  page.instance_variable_set(var, value)
  end
@@ -242,4 +251,4 @@
  self.from_hash hash
  end
  end
- end
+ end
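
With `error` now carried by `to_hash`/`from_hash`, a page's error should survive serialization. A minimal sketch, assuming `Polipus::Page.from_hash` is the class-level counterpart used by `from_json`, as the hunk above suggests:

    require 'polipus'

    page = Polipus::Page.new('http://rubygems.org/', error: 'connection refused')
    restored = Polipus::Page.from_hash(page.to_hash)
    puts restored.error  # => "connection refused"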
data/lib/polipus/robotex.rb ADDED
@@ -0,0 +1,154 @@
+ require 'open-uri'
+ require 'uri'
+ require 'timeout'
+ module Polipus
+
+   # Original code taken from
+   # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
+
+   class Robotex
+
+     DEFAULT_TIMEOUT = 3
+     VERSION = '1.0.0'
+
+     attr_reader :user_agent
+
+     class ParsedRobots
+
+       def initialize(uri, user_agent)
+         io = Robotex.get_robots_txt(uri, user_agent)
+         if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+           io = StringIO.new("User-agent: *\nAllow: /\n")
+         end
+
+         @disallows = {}
+         @allows = {}
+         @delays = {}
+         agent = /.*/
+         io.each do |line|
+           next if line =~ /^\s*(#.*|$)/
+           arr = line.split(":")
+           key = arr.shift
+           value = arr.join(":").strip
+           value.strip!
+           case key.downcase
+           when "user-agent"
+             agent = to_regex(value)
+           when "allow"
+             unless value.empty?
+               @allows[agent] ||= []
+               @allows[agent] << to_regex(value)
+             end
+           when "disallow"
+             unless value.empty?
+               @disallows[agent] ||= []
+               @disallows[agent] << to_regex(value)
+             end
+           when "crawl-delay"
+             @delays[agent] = value.to_i
+           end
+         end
+         @parsed = true
+       end
+
+       def allowed?(uri, user_agent)
+         return true unless @parsed
+         allowed = true
+         uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+         path = uri.request_uri
+
+         @allows.each do |key, value|
+           unless allowed
+             if user_agent =~ key
+               value.each do |rule|
+                 if path =~ rule
+                   allowed = true
+                 end
+               end
+             end
+           end
+         end
+
+         @disallows.each do |key, value|
+           if user_agent =~ key
+             value.each do |rule|
+               if path =~ rule
+                 allowed = false
+               end
+             end
+           end
+         end
+
+         return allowed
+       end
+
+       def delay(user_agent)
+         @delays.each do |agent, delay|
+           return delay if agent =~ user_agent
+         end
+         nil
+       end
+
+       protected
+
+       def to_regex(pattern)
+         pattern = Regexp.escape(pattern)
+         pattern.gsub!(Regexp.escape("*"), ".*")
+         Regexp.compile("^#{pattern}")
+       end
+     end
+
+     def self.get_robots_txt(uri, user_agent)
+       begin
+         Timeout::timeout(Robotex.timeout) do
+           URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+         end
+       rescue Timeout::Error
+         STDERR.puts "robots.txt request timed out"
+       end
+     end
+
+     def self.timeout=(t)
+       @timeout = t
+     end
+
+     def self.timeout
+       @timeout || DEFAULT_TIMEOUT
+     end
+
+     def initialize(user_agent = nil)
+       user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
+       @user_agent = user_agent
+       @last_accessed = Time.at(1)
+       @parsed = {}
+     end
+
+     def parse_host(uri)
+       uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+       @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
+     end
+
+     #
+     # Download the server's robots.txt, and return try if we are allowed to acces the url, false otherwise
+     #
+     def allowed?(uri)
+       parse_host(uri).allowed?(uri, @user_agent)
+     end
+
+     #
+     # Return the value of the Crawl-Delay directive, or nil if none
+     def delay(uri)
+       parse_host(uri).delay(@user_agent)
+     end
+
+     #
+     # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
+     #
+     def delay!(uri)
+       delay = delay(uri)
+       sleep delay - (Time.now - @last_accessed) if !!delay
+       @last_accessed = Time.now
+     end
+
+   end
+ end
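
The vendored `Robotex` class above can also be exercised on its own, which is a quick way to see what `:obey_robots_txt` will allow for a given URL (user agent and URLs are placeholders):

    require 'polipus'

    robots = Polipus::Robotex.new('Googlebot')

    # true/false according to the Allow/Disallow rules served at /robots.txt
    puts robots.allowed?('http://rubygems.org/gems')

    # Crawl-Delay for this agent, or nil when the directive is absent
    puts robots.delay('http://rubygems.org/').inspect

    # Sleep long enough to honor Crawl-Delay before the next request
    robots.delay!('http://rubygems.org/')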