polipus 0.1.1 → 0.2.0

checksums.yaml CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    MTk2M2YyNzJhYjhhY2IxNDUwNThkMjUwZTkzNTgyNjUyMTAxODY0Nw==
+    OGE5NThiNjIyZmFmMDhkMzQ1OGFjMDY3MTgxOThhMDUwMzM4NDE2Zg==
   data.tar.gz: !binary |-
-    MTNkMzZmZDcyODBhNmRjZmMyZjRmOTA3NmM1NGY1OTY3MDhhZGM5ZQ==
+    YjhkYjUzOGI5MjJiYjdjMjQ0ZjdlOTAxOTBlODA0ZjRiYzBhZTI4Ng==
 SHA512:
   metadata.gz: !binary |-
-    MTk3NjZmNmE4MWZmYmI4YzRlMDQxMGI2YTBhN2U2ZjdhMzNjZjQ3ZDk2ODQz
-    OWU0ODQ1ZjMxOGYwZjYyNWE1M2Q2MTE2ZjIxY2E3NjhmZTQwMGNjZTdlZjVm
-    ZTYxYzY4ZjBjNTkyNDY3MDVhODNkMWYwNDE3NjYyZWM0YzhiNjU=
+    NzEzOTk5NDQwMjdmYTZiOTlkNjllYmUxY2M0ZTY1ZWRjOTMwMzEwNjZmZGMy
+    ZjA0NjJiZDgzNzFhYzRkNGQ2ZGY3MmQxMTdlNmJjZDg2MTZiMGE2ZDA0N2Iz
+    NThlMjU2ODllNTIxMjA5OWRhYmY0ZDBlN2I2MjAzNjI3YzVlZDQ=
   data.tar.gz: !binary |-
-    ZTJkMzc2MDI4Yjg5ODMzZmJhMmM5ZmU5YzliMGRlNTliMWI3YWFjNzQ2MWQ1
-    YzBjMzRhZGFlODUyODdjNjExMzFmMWRhZTJiMDhjNjI2OTM0NTQ1NDk4NWE4
-    MzEyN2UxZTIzNjc1OTc2NzBmMTM0MTMwZTgyN2M5NjkxN2IyZDA=
+    ODM0ZDVkNDZlZDQ0NzM2NWEwOGY4MDRkY2IzM2U0MzBiYjQ3YzA0Njk0ZDU5
+    ZTVhYjkwZmQ1ZDRhYjhkMThlOWVkMTNiYzAxNTc1NDUwMDNiMjNmYjE5Nzhm
+    Yzc0MDI5N2ZkMjQxMzEwOTExODUyMmJjNWU1YTdmODAyYmIwNzU=
@@ -0,0 +1,62 @@
+require "polipus"
+require "mongo"
+
+# Define a Mongo connection
+mongo = Mongo::Connection.new(:pool_size => 15, :pool_timeout => 5).db('crawler')
+# Override some default options
+options = {
+  # Redis connection
+  :redis_options => {
+    :host => 'localhost',
+    :db => 5,
+    :driver => 'hiredis'
+  },
+  # Page storage: 'pages' is the name of the collection where
+  # pages will be stored
+  :storage => Polipus::Storage.mongo_store(mongo, 'pages'),
+  # Use a custom user agent
+  :user_agent => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71",
+  # Use 20 worker threads
+  :workers => 20,
+  # Log to STDOUT
+  :logger => Logger.new(STDOUT),
+  # Do not go deeper than 5 levels
+  :depth_limit => 5,
+
+  # Incremental download:
+  # set a TTL for each stored page.
+  # If a previously stored page has expired, it will be re-downloaded.
+  # Here a page is marked as expired after 60s
+  :ttl_page => 60
+}
+
+starting_urls = ["http://rubygems.org/gems"]
+
+# Crawl the entire rubygems site
+# Polipus.crawler('polipus-rubygems', starting_urls, options)
+
+Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
+  # Ignore urls pointing to a gem file
+  crawler.skip_links_like(/\.gem$/)
+  # Ignore urls pointing to an atom feed
+  crawler.skip_links_like(/\.atom$/)
+  # Ignore urls containing the /versions/ path
+  crawler.skip_links_like(/\/versions\//)
+
+  # Add some metadata to a page.
+  # The metadata will be stored on MongoDB
+  crawler.on_before_save do |page|
+    page.user_data.processed = false
+  end
+
+  # In-place page processing
+  crawler.on_page_downloaded do |page|
+    # page.doc is a Nokogiri object
+    puts "Page title: #{page.doc.css('title').text}" rescue puts "ERROR"
+  end
+
+  # Do nifty stuff at the end of the crawling session
+  crawler.on_crawl_end do
+    # Gong.bang(:loudly)
+  end
+end
data/lib/polipus/http.rb CHANGED
@@ -36,13 +36,14 @@ module Polipus
         gzip = Zlib::GzipReader.new(StringIO.new(body))
         body = gzip.read
       end
-      pages << Page.new(location, :body => response.body.dup,
-                        :code => code,
-                        :headers => response.to_hash,
-                        :referer => referer,
-                        :depth => depth,
-                        :redirect_to => redirect_to,
-                        :response_time => response_time)
+      pages << Page.new(location, :body => response.body.dup,
+                        :code => code,
+                        :headers => response.to_hash,
+                        :referer => referer,
+                        :depth => depth,
+                        :redirect_to => redirect_to,
+                        :response_time => response_time,
+                        :fetched_at => Time.now.to_i)
     end

     return pages
data/lib/polipus/page.rb CHANGED
@@ -35,6 +35,8 @@ module Polipus
     # Default: true
     attr_accessor :storable

+    attr_accessor :fetched_at
+
     #
     # Create a new page
     #
@@ -54,6 +56,7 @@ module Polipus
       @user_data = OpenStruct.new
       @domain_aliases = params[:domain_aliases] ||= []
       @storable = true
+      @fetched_at = params[:fetched_at]
     end

     #
@@ -177,17 +180,19 @@ module Polipus
     end

     def to_hash
-      {'url' => @url.to_s,
-       'headers' => Marshal.dump(@headers),
-       'body' => @body,
-       'links' => links.map(&:to_s),
-       'code' => @code,
-       'depth' => @depth,
-       'referer' => @referer.to_s,
-       'redirect_to' => @redirect_to.to_s,
+      {
+        'url' => @url.to_s,
+        'headers' => Marshal.dump(@headers),
+        'body' => @body,
+        'links' => links.map(&:to_s),
+        'code' => @code,
+        'depth' => @depth,
+        'referer' => @referer.to_s,
+        'redirect_to' => @redirect_to.to_s,
         'response_time' => @response_time,
-       'fetched' => @fetched,
-       'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump
+        'fetched' => @fetched,
+        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
+        'fetched_at' => @fetched_at
       }
     end

@@ -198,22 +203,34 @@ module Polipus
       th.to_json
     end

+    #
+    # Returns +true+ if the page is marked as storable,
+    # +false+ otherwise.
+    # Default is +true+.
+    #
     def storable?
       @storable
     end

+    def expired? ttl
+      return false if fetched_at.nil?
+      (Time.now.to_i - ttl) > fetched_at
+    end
+
     def self.from_hash(hash)
       page = self.new(URI(hash['url']))
-      {'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
-       '@body' => hash['body'],
-       '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
-       '@code' => hash['code'].to_i,
-       '@depth' => hash['depth'].to_i,
-       '@referer' => hash['referer'],
-       '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
-       '@response_time' => hash['response_time'].to_i,
-       '@fetched' => hash['fetched'],
-       '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil
+      {
+        '@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
+        '@body' => hash['body'],
+        '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
+        '@code' => hash['code'].to_i,
+        '@depth' => hash['depth'].to_i,
+        '@referer' => hash['referer'],
+        '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
+        '@response_time' => hash['response_time'].to_i,
+        '@fetched' => hash['fetched'],
+        '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
+        '@fetched_at' => hash['fetched_at']
       }.each do |var, value|
         page.instance_variable_set(var, value)
       end
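
Taken together, http.rb now stamps every downloaded page with :fetched_at, and Page#expired? compares that stamp against a TTL in seconds. A minimal sketch of the semantics (the URL and values are illustrative, not from the diff):

    require "polipus"

    page = Polipus::Page.new("http://example.com/", :fetched_at => Time.now.to_i - 120)
    page.expired?(60)   # => true:  fetched 120s ago, TTL is 60s
    page.expired?(300)  # => false: still within the TTL
    # A page without a fetched_at stamp never expires:
    Polipus::Page.new("http://example.com/").expired?(60) # => false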
data/lib/polipus/storage/memory_store.rb ADDED
@@ -0,0 +1,56 @@
+require "thread"
+module Polipus
+  module Storage
+    class MemoryStore < Base
+
+      def initialize(options = {})
+        @store = Hash.new
+        @semaphore = Mutex.new
+      end
+
+      def add page
+        @semaphore.synchronize {
+          u = uuid(page)
+          @store[u] = page
+          u
+        }
+      end
+
+      def exists?(page)
+        @semaphore.synchronize {
+          @store.key?(uuid(page))
+        }
+      end
+
+      def get page
+        @semaphore.synchronize {
+          @store[uuid(page)]
+        }
+      end
+
+      def remove page
+        @semaphore.synchronize {
+          @store.delete(uuid(page))
+        }
+      end
+
+      def count
+        @semaphore.synchronize {
+          @store.count
+        }
+      end
+
+      def each
+        @store.each do |k,v|
+          yield k,v
+        end
+      end
+
+      def clear
+        @semaphore.synchronize {
+          @store = Hash.new
+        }
+      end
+    end
+  end
+end
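
MemoryStore keeps pages in a plain Hash guarded by a Mutex, so it is thread-safe but process-local and non-persistent. A quick round-trip sketch (the URL is illustrative; uuid(page) is inherited from Storage::Base):

    require "polipus"
    require "polipus/storage/memory_store"

    store = Polipus::Storage::MemoryStore.new
    page  = Polipus::Page.new("http://example.com/")
    store.add(page)       # => the page's uuid
    store.exists?(page)   # => true
    store.get(page)       # => the stored page
    store.remove(page)
    store.count           # => 0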
data/lib/polipus/storage/mongo_store.rb CHANGED
@@ -39,9 +39,7 @@ module Polipus
     def get page
       @semaphore.synchronize {
         data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
-        if data
-          return load_page(data)
-        end
+        return load_page(data) if data
       }
     end

@@ -75,7 +73,11 @@ module Polipus
       end
       begin
         hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
-        return Page.from_hash(hash)
+        page = Page.from_hash(hash)
+        if page.fetched_at.nil?
+          page.fetched_at = hash['_id'].generation_time.to_i
+        end
+        return page
      rescue
      end
      nil
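
Pages stored before 0.2.0 carry no fetched_at field, so load_page backfills it from the document's BSON ObjectId, which embeds its own creation time:

    # assuming `hash` is a document loaded from the pages collection
    hash['_id'].generation_time       # => Time the document was created
    hash['_id'].generation_time.to_i  # => the same as a Unix timestamp, usable by Page#expired?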
data/lib/polipus/storage.rb CHANGED
@@ -27,5 +27,10 @@ module Polipus
       require 'polipus/storage/dev_null'
       self::DevNull.new
     end
+
+    def self.memory_store
+      require 'polipus/storage/memory_store'
+      self::MemoryStore.new
+    end
   end
 end
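
Like the existing dev_null helper, the new factory requires its backend lazily. A minimal wiring sketch (job name and URL are illustrative):

    require "polipus"

    options = { :storage => Polipus::Storage.memory_store }
    Polipus.crawler('in-memory-job', ['http://example.com/'], options)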
data/lib/polipus/version.rb CHANGED
@@ -1,4 +1,4 @@
 module Polipus
-  VERSION = "0.1.1"
+  VERSION = "0.2.0"
   HOMEPAGE = "https://github.com/taganaka/polipus"
 end
data/lib/polipus.rb CHANGED
@@ -23,7 +23,7 @@ module Polipus
     OPTS = {
       # run 4 threads
       :workers => 4,
-      # identify self as Anemone/VERSION
+      # identify self as Polipus/VERSION
       :user_agent => "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
       # by default, don't limit the depth of the crawl
       :depth_limit => false,
@@ -39,12 +39,17 @@ module Polipus
       :read_timeout => 30,
       # HTTP open connection timeout in seconds
       :open_timeout => 10,
+      # Time to wait for new messages on Redis.
+      # After this timeout, the current crawling session is marked as terminated
+      :queue_timeout => 30,
       # A URL tracker instance. Default is a Bloom filter based on Redis
       :url_tracker => nil,
       # A Redis options hash that will be passed directly to Redis.new
       :redis_options => {},
       # An instance of logger
       :logger => nil,
+      # A logger level
+      :logger_level => nil,
       # whether the query string should be included in the saved page
       :include_query_string_in_saved_page => true,
       # Max number of items to keep on redis
@@ -62,7 +67,9 @@ module Polipus
       # E.g. it can be used to follow links with and without the 'www' domain
       :domain_aliases => [],
       # Mark a connection as stale after connection_max_hits requests
-      :connection_max_hits => nil
+      :connection_max_hits => nil,
+      # Page TTL: mark a page as expired after ttl_page seconds
+      :ttl_page => nil
     }

     attr_reader :storage
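
The new options are overridden per crawl like any other. A sketch with illustrative values (this diff does not show how :logger_level is consumed, so treat that line as an assumption):

    require "polipus"

    Polipus.crawler('my-job', ['http://example.com/'],
                    :queue_timeout => 10,           # give up after 10s with no new Redis messages
                    :ttl_page      => 3600,         # re-download pages older than one hour
                    :logger_level  => Logger::INFO)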
@@ -86,6 +93,7 @@ module Polipus

       @job_name = job_name
       @options = OPTS.merge(options)
+      @options[:queue_timeout] = 1 if @options[:queue_timeout] <= 0
       @logger = @options[:logger] ||= Logger.new(nil)

       unless @logger.class.to_s == "Log4r::Logger"
@@ -137,8 +145,9 @@ module Polipus

       q = queue_factory
       @urls.each do |u|
-        next if url_tracker.visited?(u.to_s)
-        q << Page.new(u.to_s, :referer => '').to_json
+        page = Page.new(u.to_s, :referer => '')
+        page.user_data.p_seeded = true
+        q << page.to_json
       end

       return if q.empty?
@@ -149,7 +158,7 @@ module Polipus
       @logger.debug {"Start worker #{worker_number}"}
       http = @http_pool[worker_number] ||= HTTP.new(@options)
       queue = @queues_pool[worker_number] ||= queue_factory
-      queue.process(false, @options[:read_timeout]) do |message|
+      queue.process(false, @options[:queue_timeout]) do |message|

         next if message.nil?

@@ -163,7 +172,7 @@ module Polipus
           next
         end

-        if @storage.exists? page
+        if page_exists? page
           @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
           queue.commit
           next
@@ -180,7 +189,7 @@ module Polipus
           @logger.info {"Got redirects! #{rurls}"}
           page = pages.pop
           page.aliases = pages.collect { |e| e.url }
-          if @storage.exists?(page)
+          if page_exists? page
            @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
            queue.commit
            next
@@ -202,7 +211,7 @@ module Polipus
         end

         if page
-          @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
+          @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
           @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] downloaded"}
         end

@@ -264,7 +273,7 @@ module Polipus
       self
     end

-    # A block of code will be executed on every page donloaded
+    # A block of code will be executed on every page downloaded
     # before being saved in the registered storage
     def on_before_save(&block)
       @on_before_save << block
@@ -272,7 +281,7 @@ module Polipus
     end

     # A block of code will be executed
-    # on every page donloaded. The code is used to extract urls to visit
+    # on every page downloaded. The code is used to extract urls to visit
     # see links_for method
     def focus_crawl(&block)
       @focus_crawl_block = block
@@ -332,6 +341,11 @@ module Polipus
     # URLs enqueue policy
     def should_be_visited?(url, with_tracker = true)

+      # Return +true+ if the url is part of the initial seed,
+      # no matter what
+
+      return true if @urls.map(&:to_s).include?(url.to_s)
+
       # Check against whitelist pattern matching
       unless @follow_links_like.empty?
         return false unless @follow_links_like.any?{|p| url.path =~ p}
@@ -342,9 +356,12 @@ module Polipus
         return false if @skip_links_like.any?{|p| url.path =~ p}
       end

+      # Page is marked as expired
+      return true if page_expired?(Page.new(url))
+
       # Check against url tracker
       if with_tracker
-        return false if url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
+        return false if url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
       end
       true
     end
@@ -356,6 +373,19 @@ module Polipus
       links
     end

+    def page_expired? page
+      return false if @options[:ttl_page].nil?
+      stored_page = @storage.get(page)
+      r = stored_page && stored_page.expired?(@options[:ttl_page])
+      @logger.debug {"Page #{page.url.to_s} marked as expired"} if r
+      r
+    end
+
+    def page_exists? page
+      return false if page.user_data && page.user_data.p_seeded
+      @storage.exists?(page) && !page_expired?(page)
+    end
+
     # The url is enqueued for a later visit
     def enqueue url_to_visit, current_page, queue
       page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
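
How the pieces fit: the seeding loop above marks start URLs with user_data.p_seeded, so page_exists? never skips them; for every other URL, a stored page counts as existing only while it is unexpired, and should_be_visited? lets an expired URL through again. A condensed, illustrative sketch:

    require "polipus"

    seed = Polipus::Page.new("http://example.com/", :referer => '')
    seed.user_data.p_seeded = true  # set when the start URLs are enqueued
    # page_exists?(seed) is false regardless of storage: seeds are always refetched

    old = Polipus::Page.new("http://example.com/a", :fetched_at => Time.now.to_i - 7200)
    old.expired?(3600)  # => true with :ttl_page => 3600, so page_exists? returns
                        # false and the URL is enqueued and downloaded again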