polipus 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
  metadata.gz: !binary |-
- MTk2M2YyNzJhYjhhY2IxNDUwNThkMjUwZTkzNTgyNjUyMTAxODY0Nw==
+ OGE5NThiNjIyZmFmMDhkMzQ1OGFjMDY3MTgxOThhMDUwMzM4NDE2Zg==
  data.tar.gz: !binary |-
- MTNkMzZmZDcyODBhNmRjZmMyZjRmOTA3NmM1NGY1OTY3MDhhZGM5ZQ==
+ YjhkYjUzOGI5MjJiYjdjMjQ0ZjdlOTAxOTBlODA0ZjRiYzBhZTI4Ng==
  SHA512:
  metadata.gz: !binary |-
- MTk3NjZmNmE4MWZmYmI4YzRlMDQxMGI2YTBhN2U2ZjdhMzNjZjQ3ZDk2ODQz
- OWU0ODQ1ZjMxOGYwZjYyNWE1M2Q2MTE2ZjIxY2E3NjhmZTQwMGNjZTdlZjVm
- ZTYxYzY4ZjBjNTkyNDY3MDVhODNkMWYwNDE3NjYyZWM0YzhiNjU=
+ NzEzOTk5NDQwMjdmYTZiOTlkNjllYmUxY2M0ZTY1ZWRjOTMwMzEwNjZmZGMy
+ ZjA0NjJiZDgzNzFhYzRkNGQ2ZGY3MmQxMTdlNmJjZDg2MTZiMGE2ZDA0N2Iz
+ NThlMjU2ODllNTIxMjA5OWRhYmY0ZDBlN2I2MjAzNjI3YzVlZDQ=
  data.tar.gz: !binary |-
- ZTJkMzc2MDI4Yjg5ODMzZmJhMmM5ZmU5YzliMGRlNTliMWI3YWFjNzQ2MWQ1
- YzBjMzRhZGFlODUyODdjNjExMzFmMWRhZTJiMDhjNjI2OTM0NTQ1NDk4NWE4
- MzEyN2UxZTIzNjc1OTc2NzBmMTM0MTMwZTgyN2M5NjkxN2IyZDA=
+ ODM0ZDVkNDZlZDQ0NzM2NWEwOGY4MDRkY2IzM2U0MzBiYjQ3YzA0Njk0ZDU5
+ ZTVhYjkwZmQ1ZDRhYjhkMThlOWVkMTNiYzAxNTc1NDUwMDNiMjNmYjE5Nzhm
+ Yzc0MDI5N2ZkMjQxMzEwOTExODUyMmJjNWU1YTdmODAyYmIwNzU=
@@ -0,0 +1,62 @@
+ require "polipus"
+ require "mongo"
+
+ # Define a Mongo connection
+ mongo = Mongo::Connection.new(:pool_size => 15, :pool_timeout => 5).db('crawler')
+ # Override some default options
+ options = {
+ # Redis connection
+ :redis_options => {
+ :host => 'localhost',
+ :db => 5,
+ :driver => 'hiredis'
+ },
+ # Page storage: pages is the name of the collection where
+ # pages will be stored
+ :storage => Polipus::Storage.mongo_store(mongo, 'pages'),
+ # Use your custom user agent
+ :user_agent => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71",
+ # Use 20 worker threads
+ :workers => 20,
+ # Logs go to STDOUT
+ :logger => Logger.new(STDOUT),
+ # Do not go deeper than 5 levels
+ :depth_limit => 5,
+
+ # Incremental download:
+ # Set a ttl for each stored page
+ # If a previously stored page has expired, it will be re-downloaded
+ # Mark a page expired after 60s
+ :ttl_page => 60
+ }
+
+ starting_urls = ["http://rubygems.org/gems"]
+
+ # Crawl the entire rubygems site
+ # Polipus.crawler('polipus-rubygems', starting_urls, options)
+
+ Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
+ # Ignore urls pointing to a gem file
+ crawler.skip_links_like(/\.gem$/)
+ # Ignore urls pointing to an atom feed
+ crawler.skip_links_like(/\.atom$/)
+ # Ignore urls containing /versions/ path
+ crawler.skip_links_like(/\/versions\//)
+
+ # Adding some metadata to a page
+ # The metadata will be stored on mongo
+ crawler.on_before_save do |page|
+ page.user_data.processed = false
+ end
+
+ # In-place page processing
+ crawler.on_page_downloaded do |page|
+ # A nokogiri object
+ puts "Page title: #{page.doc.css('title').text}" rescue "ERROR"
+ end
+
+ # Do nifty stuff at the end of the crawling session
+ crawler.on_crawl_end do
+ # Gong.bang(:loudly)
+ end
+ end
data/lib/polipus/http.rb CHANGED
@@ -36,13 +36,14 @@ module Polipus
  gzip = Zlib::GzipReader.new(StringIO.new(body))
  body = gzip.read
  end
- pages << Page.new(location, :body => response.body.dup,
- :code => code,
- :headers => response.to_hash,
- :referer => referer,
- :depth => depth,
- :redirect_to => redirect_to,
- :response_time => response_time)
+ pages << Page.new(location, :body => response.body.dup,
+ :code => code,
+ :headers => response.to_hash,
+ :referer => referer,
+ :depth => depth,
+ :redirect_to => redirect_to,
+ :response_time => response_time,
+ :fetched_at => Time.now.to_i)
  end

  return pages
data/lib/polipus/page.rb CHANGED
@@ -35,6 +35,8 @@ module Polipus
  # Default: true
  attr_accessor :storable

+ attr_accessor :fetched_at
+
  #
  # Create a new page
  #
@@ -54,6 +56,7 @@ module Polipus
  @user_data = OpenStruct.new
  @domain_aliases = params[:domain_aliases] ||= []
  @storable = true
+ @fetched_at = params[:fetched_at]
  end

  #
@@ -177,17 +180,19 @@ module Polipus
  end

  def to_hash
- {'url' => @url.to_s,
- 'headers' => Marshal.dump(@headers),
- 'body' => @body,
- 'links' => links.map(&:to_s),
- 'code' => @code,
- 'depth' => @depth,
- 'referer' => @referer.to_s,
- 'redirect_to' => @redirect_to.to_s,
+ {
+ 'url' => @url.to_s,
+ 'headers' => Marshal.dump(@headers),
+ 'body' => @body,
+ 'links' => links.map(&:to_s),
+ 'code' => @code,
+ 'depth' => @depth,
+ 'referer' => @referer.to_s,
+ 'redirect_to' => @redirect_to.to_s,
  'response_time' => @response_time,
- 'fetched' => @fetched,
- 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump
+ 'fetched' => @fetched,
+ 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
+ 'fetched_at' => @fetched_at
  }
  end

@@ -198,22 +203,34 @@ module Polipus
  th.to_json
  end

+ #
+ # Returns +true+ if page is marked as storeable
+ # +false+ otherwise
+ # Default is +true+
+ #
  def storable?
  @storable
  end

+ def expired? ttl
+ return false if fetched_at.nil?
+ (Time.now.to_i - ttl) > fetched_at
+ end
+
  def self.from_hash(hash)
  page = self.new(URI(hash['url']))
- {'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
- '@body' => hash['body'],
- '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
- '@code' => hash['code'].to_i,
- '@depth' => hash['depth'].to_i,
- '@referer' => hash['referer'],
- '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
- '@response_time' => hash['response_time'].to_i,
- '@fetched' => hash['fetched'],
- '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil
+ {
+ '@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
+ '@body' => hash['body'],
+ '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
+ '@code' => hash['code'].to_i,
+ '@depth' => hash['depth'].to_i,
+ '@referer' => hash['referer'],
+ '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
+ '@response_time' => hash['response_time'].to_i,
+ '@fetched' => hash['fetched'],
+ '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
+ '@fetched_at' => hash['fetched_at']
  }.each do |var, value|
  page.instance_variable_set(var, value)
  end
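The fetched_at timestamp that the HTTP layer now records is what Page#expired? checks against. A minimal sketch of the semantics, using only the methods added above (the URL and timestamps are illustrative):

  require "polipus"

  # A page fetched 120 seconds ago
  page = Polipus::Page.new("http://rubygems.org/gems", :fetched_at => Time.now.to_i - 120)
  page.expired?(60)   # => true  (older than a 60 second TTL)
  page.expired?(300)  # => false (still fresh for a 300 second TTL)
  # A page that never recorded fetched_at never expires
  Polipus::Page.new("http://rubygems.org/gems").expired?(60)  # => false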
@@ -0,0 +1,56 @@
+ require "thread"
+ module Polipus
+ module Storage
+ class MemoryStore < Base
+
+ def initialize(options = {})
+ @store = Hash.new
+ @semaphore = Mutex.new
+ end
+
+ def add page
+ @semaphore.synchronize {
+ u = uuid(page)
+ @store[u] = page
+ u
+ }
+ end
+
+ def exists?(page)
+ @semaphore.synchronize {
+ @store.key?(uuid(page))
+ }
+ end
+
+ def get page
+ @semaphore.synchronize {
+ @store[uuid(page)]
+ }
+ end
+
+ def remove page
+ @semaphore.synchronize {
+ @store.delete(uuid(page))
+ }
+ end
+
+ def count
+ @semaphore.synchronize {
+ @store.count
+ }
+ end
+
+ def each
+ @store.each do |k,v|
+ yield k,v
+ end
+ end
+
+ def clear
+ @semaphore.synchronize {
+ @store = Hash.new
+ }
+ end
+ end
+ end
+ end
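The new MemoryStore keeps pages in a plain Hash guarded by a Mutex, which is handy for tests and short-lived crawls. A brief usage sketch, assuming Polipus::Storage::Base supplies the uuid(page) helper the class relies on:

  require "polipus"
  require "polipus/storage/memory_store"

  store = Polipus::Storage::MemoryStore.new
  page  = Polipus::Page.new("http://rubygems.org/gems")
  store.add(page)      # returns the page's uuid
  store.exists?(page)  # => true
  store.count          # => 1
  store.clear          # back to an empty store

Note that #each iterates the underlying Hash without taking the mutex, so callers should avoid mutating the store while enumerating it.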
@@ -39,9 +39,7 @@ module Polipus
  def get page
  @semaphore.synchronize {
  data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
- if data
- return load_page(data)
- end
+ return load_page(data) if data
  }
  end

@@ -75,7 +73,11 @@ module Polipus
  end
  begin
  hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
- return Page.from_hash(hash)
+ page = Page.from_hash(hash)
+ if page.fetched_at.nil?
+ page.fetched_at = hash['_id'].generation_time.to_i
+ end
+ return page
  rescue
  end
  nil
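For documents written by 0.1.x (no 'fetched_at' field), the Mongo backend now falls back to the creation time embedded in the document's ObjectId. A small sketch of that equivalence using the mongo driver's BSON::ObjectId (the object here is freshly generated, purely for illustration):

  require "mongo"

  oid = BSON::ObjectId.new
  oid.generation_time       # => Time encoded in the ObjectId
  oid.generation_time.to_i  # the integer used as the fetched_at fallback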
@@ -27,5 +27,10 @@ module Polipus
  require 'polipus/storage/dev_null'
  self::DevNull.new
  end
+
+ def self.memory_store
+ require 'polipus/storage/memory_store'
+ self::MemoryStore.new
+ end
  end
  end
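With the new factory, the in-memory backend can be selected the same way as the existing mongo_store and dev_null helpers. A short sketch (the job name and seed URL are illustrative):

  require "polipus"

  options = {
    # Keep crawled pages in process memory instead of MongoDB
    :storage => Polipus::Storage.memory_store
  }
  Polipus.crawler('polipus-memory-test', ["http://rubygems.org/gems"], options)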
@@ -1,4 +1,4 @@
  module Polipus
- VERSION = "0.1.1"
+ VERSION = "0.2.0"
  HOMEPAGE = "https://github.com/taganaka/polipus"
  end
data/lib/polipus.rb CHANGED
@@ -23,7 +23,7 @@ module Polipus
  OPTS = {
  # run 4 threads
  :workers => 4,
- # identify self as Anemone/VERSION
+ # identify self as Polipus/VERSION
  :user_agent => "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
@@ -39,12 +39,17 @@ module Polipus
  :read_timeout => 30,
  # HTTP open connection timeout in seconds
  :open_timeout => 10,
+ # Time to wait for new messages on Redis
+ # After this timeout, current crawling session is marked as terminated
+ :queue_timeout => 30,
  # An URL tracker instance. default is Bloomfilter based on redis
  :url_tracker => nil,
  # A Redis options {} that will be passed directly to Redis.new
  :redis_options => {},
  # An instance of logger
  :logger => nil,
+ # A logger level
+ :logger_level => nil,
  # whether the query string should be included in the saved page
  :include_query_string_in_saved_page => true,
  # Max number of items to keep on redis
@@ -62,7 +67,9 @@ module Polipus
  # Eg It can be used to follow links with and without 'www' domain
  :domain_aliases => [],
  # Mark a connection as staled after connection_max_hits request
- :connection_max_hits => nil
+ :connection_max_hits => nil,
+ # Page TTL: mark a page as expired after ttl_page seconds
+ :ttl_page => nil
  }

  attr_reader :storage
@@ -86,6 +93,7 @@ module Polipus

  @job_name = job_name
  @options = OPTS.merge(options)
+ @options[:queue_timeout] = 1 if @options[:queue_timeout] <= 0
  @logger = @options[:logger] ||= Logger.new(nil)

  unless @logger.class.to_s == "Log4r::Logger"
@@ -137,8 +145,9 @@ module Polipus

  q = queue_factory
  @urls.each do |u|
- next if url_tracker.visited?(u.to_s)
- q << Page.new(u.to_s, :referer => '').to_json
+ page = Page.new(u.to_s, :referer => '')
+ page.user_data.p_seeded = true
+ q << page.to_json
  end

  return if q.empty?
@@ -149,7 +158,7 @@ module Polipus
  @logger.debug {"Start worker #{worker_number}"}
  http = @http_pool[worker_number] ||= HTTP.new(@options)
  queue = @queues_pool[worker_number] ||= queue_factory
- queue.process(false, @options[:read_timeout]) do |message|
+ queue.process(false, @options[:queue_timeout]) do |message|

  next if message.nil?

@@ -163,7 +172,7 @@ module Polipus
  next
  end

- if @storage.exists? page
+ if page_exists? page
  @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
  queue.commit
  next
@@ -180,7 +189,7 @@ module Polipus
  @logger.info {"Got redirects! #{rurls}"}
  page = pages.pop
  page.aliases = pages.collect { |e| e.url }
- if @storage.exists?(page)
+ if page_exists? page
  @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
  queue.commit
  next
@@ -202,7 +211,7 @@ module Polipus
  end

  if page
- @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
+ @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
  @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] downloaded"}
  end

@@ -264,7 +273,7 @@ module Polipus
  self
  end

- # A block of code will be executed on every page donloaded
+ # A block of code will be executed on every page downloaded
  # before being saved in the registered storage
  def on_before_save(&block)
  @on_before_save << block
@@ -272,7 +281,7 @@ module Polipus
  end

  # A block of code will be executed
- # on every page donloaded. The code is used to extract urls to visit
+ # on every page downloaded. The code is used to extract urls to visit
  # see links_for method
  def focus_crawl(&block)
  @focus_crawl_block = block
@@ -332,6 +341,11 @@ module Polipus
  # URLs enqueue policy
  def should_be_visited?(url, with_tracker = true)

+ # return +true+ If an url is part of the initial seeder
+ # no matter what
+
+ return true if @urls.map(&:to_s).include?(url.to_s)
+
  # Check against whitelist pattern matching
  unless @follow_links_like.empty?
  return false unless @follow_links_like.any?{|p| url.path =~ p}
@@ -342,9 +356,12 @@ module Polipus
  return false if @skip_links_like.any?{|p| url.path =~ p}
  end

+ #Page is marked as expired
+ return true if page_expired?(Page.new(url))
+
  # Check against url tracker
  if with_tracker
- return false if url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
+ return false if url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
  end
  true
  end
@@ -356,6 +373,19 @@ module Polipus
  links
  end

+ def page_expired? page
+ return false if @options[:ttl_page].nil?
+ stored_page = @storage.get(page)
+ r = stored_page && stored_page.expired?(@options[:ttl_page])
+ @logger.debug {"Page #{page.url.to_s} marked as expired"} if r
+ r
+ end
+
+ def page_exists? page
+ return false if page.user_data && page.user_data.p_seeded
+ @storage.exists?(page) && !page_expired?(page)
+ end
+
  # The url is enqueued for a later visit
  def enqueue url_to_visit, current_page, queue
  page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
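Pulling the new options together, a hedged configuration sketch; the diff only shows that :logger_level is accepted, so treating it as a standard Logger severity is an assumption, and the values below are illustrative:

  require "polipus"
  require "logger"

  Polipus.crawler('polipus-rubygems', ["http://rubygems.org/gems"],
    # End the session after 20 seconds with no new messages on the Redis queue
    :queue_timeout => 20,
    :logger        => Logger.new(STDOUT),
    # Assumed to map to a Logger severity (usage not shown in this diff)
    :logger_level  => Logger::WARN,
    # Re-download pages older than one hour; seed URLs are always revisited
    :ttl_page      => 3600
  )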