polipus 0.1.1 → 0.2.0
- checksums.yaml +8 -8
- data/examples/incremental.rb +62 -0
- data/lib/polipus/http.rb +8 -7
- data/lib/polipus/page.rb +37 -20
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +6 -4
- data/lib/polipus/storage.rb +5 -0
- data/lib/polipus/version.rb +1 -1
- data/lib/polipus.rb +41 -11
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/http_spec.rb +1 -0
- data/spec/page_spec.rb +21 -0
- data/spec/polipus_spec.rb +77 -0
- data/spec/storage_memory_spec.rb +89 -0
- data/spec/storage_mongo_spec.rb +18 -0
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    OGE5NThiNjIyZmFmMDhkMzQ1OGFjMDY3MTgxOThhMDUwMzM4NDE2Zg==
   data.tar.gz: !binary |-
-
+    YjhkYjUzOGI5MjJiYjdjMjQ0ZjdlOTAxOTBlODA0ZjRiYzBhZTI4Ng==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    NzEzOTk5NDQwMjdmYTZiOTlkNjllYmUxY2M0ZTY1ZWRjOTMwMzEwNjZmZGMy
+    ZjA0NjJiZDgzNzFhYzRkNGQ2ZGY3MmQxMTdlNmJjZDg2MTZiMGE2ZDA0N2Iz
+    NThlMjU2ODllNTIxMjA5OWRhYmY0ZDBlN2I2MjAzNjI3YzVlZDQ=
   data.tar.gz: !binary |-
-
-
-
+    ODM0ZDVkNDZlZDQ0NzM2NWEwOGY4MDRkY2IzM2U0MzBiYjQ3YzA0Njk0ZDU5
+    ZTVhYjkwZmQ1ZDRhYjhkMThlOWVkMTNiYzAxNTc1NDUwMDNiMjNmYjE5Nzhm
+    Yzc0MDI5N2ZkMjQxMzEwOTExODUyMmJjNWU1YTdmODAyYmIwNzU=
data/examples/incremental.rb
ADDED
@@ -0,0 +1,62 @@
+require "polipus"
+require "mongo"
+
+# Define a Mongo connection
+mongo = Mongo::Connection.new(:pool_size => 15, :pool_timeout => 5).db('crawler')
+# Override some default options
+options = {
+  # Redis connection
+  :redis_options => {
+    :host => 'localhost',
+    :db => 5,
+    :driver => 'hiredis'
+  },
+  # Page storage: pages is the name of the collection where
+  # pages will be stored
+  :storage => Polipus::Storage.mongo_store(mongo, 'pages'),
+  # Use your custom user agent
+  :user_agent => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71",
+  # Use 20 threads
+  :workers => 20,
+  # Logs go to STDOUT
+  :logger => Logger.new(STDOUT),
+  # Do not go deeper than 5 levels
+  :depth_limit => 5,
+
+  # Incremental download:
+  # Set a TTL for each stored page.
+  # If a previously stored page has expired, it will be re-downloaded.
+  # Mark a page as expired after 60s
+  :ttl_page => 60
+}
+
+starting_urls = ["http://rubygems.org/gems"]
+
+# Crawl the entire rubygems.org site
+# Polipus.crawler('polipus-rubygems', starting_urls, options)
+
+Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
+  # Ignore urls pointing to a gem file
+  crawler.skip_links_like(/\.gem$/)
+  # Ignore urls pointing to an atom feed
+  crawler.skip_links_like(/\.atom$/)
+  # Ignore urls containing the /versions/ path
+  crawler.skip_links_like(/\/versions\//)
+
+  # Add some metadata to a page
+  # The metadata will be stored on MongoDB
+  crawler.on_before_save do |page|
+    page.user_data.processed = false
+  end
+
+  # In-place page processing
+  crawler.on_page_downloaded do |page|
+    # page.doc is a Nokogiri object
+    puts "Page title: #{page.doc.css('title').text}" rescue puts "ERROR"
+  end
+
+  # Do nifty stuff at the end of the crawling session
+  crawler.on_crawl_end do
+    # Gong.bang(:loudly)
+  end
+end
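The example above needs MongoDB and Redis. As a quicker way to try the new incremental mode, here is a minimal sketch that swaps in the in-memory storage also added in this release; it still assumes a local Redis on the default port for the queue, and the job name and TTL values are illustrative:

require "polipus"

options = {
  # In-memory page storage (new in 0.2.0); pages vanish when the process exits
  :storage  => Polipus::Storage::MemoryStore.new,
  :workers  => 2,
  # Re-download any page stored more than 30 seconds ago
  :ttl_page => 30
}

Polipus.crawler('ttl-demo', ["http://rubygems.org/gems"], options) do |crawler|
  crawler.on_page_downloaded do |page|
    puts "#{page.url} fetched_at=#{page.fetched_at}"
  end
end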
data/lib/polipus/http.rb
CHANGED
@@ -36,13 +36,14 @@ module Polipus
           gzip = Zlib::GzipReader.new(StringIO.new(body))
           body = gzip.read
         end
         pages << Page.new(location, :body => response.body.dup,
                           :code => code,
                           :headers => response.to_hash,
                           :referer => referer,
                           :depth => depth,
                           :redirect_to => redirect_to,
-                          :response_time => response_time)
+                          :response_time => response_time,
+                          :fetched_at => Time.now.to_i)
       end

       return pages
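Every page built by the HTTP layer now carries :fetched_at, the epoch second at which it was fetched; this is the timestamp the new TTL logic compares against. A minimal construction mirroring the call above (url and values are illustrative):

require "polipus"

page = Polipus::Page.new("http://rubygems.org/gems",
                         :code       => 200,
                         :body       => "<html/>",
                         :fetched_at => Time.now.to_i)
page.fetched_at  # => epoch seconds, later consumed by Page#expired?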
data/lib/polipus/page.rb
CHANGED
@@ -35,6 +35,8 @@ module Polipus
     # Default: true
     attr_accessor :storable

+    attr_accessor :fetched_at
+
     #
     # Create a new page
     #
@@ -54,6 +56,7 @@ module Polipus
       @user_data = OpenStruct.new
       @domain_aliases = params[:domain_aliases] ||= []
       @storable = true
+      @fetched_at = params[:fetched_at]
     end

     #
@@ -177,17 +180,19 @@ module Polipus
     end

     def to_hash
       {
         'url' => @url.to_s,
         'headers' => Marshal.dump(@headers),
         'body' => @body,
+        'links' => links.map(&:to_s),
         'code' => @code,
         'depth' => @depth,
         'referer' => @referer.to_s,
         'redirect_to' => @redirect_to.to_s,
         'response_time' => @response_time,
         'fetched' => @fetched,
-        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump
+        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
+        'fetched_at' => @fetched_at
       }
     end

@@ -198,22 +203,34 @@ module Polipus
       th.to_json
     end

+    #
+    # Returns +true+ if the page is marked as storable,
+    # +false+ otherwise
+    # Default is +true+
+    #
     def storable?
       @storable
     end

+    def expired? ttl
+      return false if fetched_at.nil?
+      (Time.now.to_i - ttl) > fetched_at
+    end
+
     def self.from_hash(hash)
       page = self.new(URI(hash['url']))
       {
           '@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
           '@body' => hash['body'],
+          '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
           '@code' => hash['code'].to_i,
           '@depth' => hash['depth'].to_i,
           '@referer' => hash['referer'],
           '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
           '@response_time' => hash['response_time'].to_i,
           '@fetched' => hash['fetched'],
-          '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil
+          '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
+          '@fetched_at' => hash['fetched_at']
       }.each do |var, value|
         page.instance_variable_set(var, value)
       end
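The TTL check reads as "the fetch time is older than ttl seconds ago"; a page that was never fetched (fetched_at is nil) never expires. A quick sketch of the semantics, with an illustrative url:

require "polipus"

page = Polipus::Page.new("http://rubygems.org/gems",
                         :fetched_at => Time.now.to_i - 120)
page.expired?(60)   # => true:  (now - 60) > (now - 120)
page.expired?(300)  # => false: (now - 300) < (now - 120)

fresh = Polipus::Page.new("http://rubygems.org/gems")
fresh.expired?(60)  # => false: fetched_at is nil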
data/lib/polipus/storage/memory_store.rb
ADDED
@@ -0,0 +1,56 @@
+require "thread"
+module Polipus
+  module Storage
+    class MemoryStore < Base
+
+      def initialize(options = {})
+        @store = Hash.new
+        @semaphore = Mutex.new
+      end
+
+      def add page
+        @semaphore.synchronize {
+          u = uuid(page)
+          @store[u] = page
+          u
+        }
+      end
+
+      def exists?(page)
+        @semaphore.synchronize {
+          @store.key?(uuid(page))
+        }
+      end
+
+      def get page
+        @semaphore.synchronize {
+          @store[uuid(page)]
+        }
+      end
+
+      def remove page
+        @semaphore.synchronize {
+          @store.delete(uuid(page))
+        }
+      end
+
+      def count
+        @semaphore.synchronize {
+          @store.count
+        }
+      end
+
+      def each
+        @store.each do |k,v|
+          yield k,v
+        end
+      end
+
+      def clear
+        @semaphore.synchronize {
+          @store = Hash.new
+        }
+      end
+    end
+  end
+end
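Every operation except each takes the mutex, so the store is safe to share across worker threads. A sketch of the new store in isolation (it assumes uuid(page), inherited from Storage::Base, keys entries by a digest of the page url):

require "polipus"

store = Polipus::Storage::MemoryStore.new
page  = Polipus::Page.new("http://rubygems.org/gems", :code => 200)

store.add(page)       # => the page's uuid
store.exists?(page)   # => true
store.get(page).code  # => 200
store.count           # => 1
store.clear
store.count           # => 0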
data/lib/polipus/storage/mongo_store.rb
CHANGED
@@ -39,9 +39,7 @@ module Polipus
       def get page
         @semaphore.synchronize {
           data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
-          if data
-            return load_page(data)
-          end
+          return load_page(data) if data
         }
       end

@@ -75,7 +73,11 @@ module Polipus
         end
         begin
           hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
-          return Page.from_hash(hash)
+          page = Page.from_hash(hash)
+          if page.fetched_at.nil?
+            page.fetched_at = hash['_id'].generation_time.to_i
+          end
+          return page
         rescue
         end
         nil
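The fallback above covers pages stored before this release, which lack a fetched_at field: a MongoDB ObjectId embeds its creation time, so that timestamp stands in for the fetch time. For instance:

require "bson"

oid = BSON::ObjectId.new
oid.generation_time       # => the Time the id was generated (UTC)
oid.generation_time.to_i  # => epoch seconds, assigned to page.fetched_at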
data/lib/polipus/storage.rb
CHANGED
data/lib/polipus/version.rb
CHANGED
data/lib/polipus.rb
CHANGED
@@ -23,7 +23,7 @@ module Polipus
   OPTS = {
     # run 4 threads
     :workers => 4,
-    # identify self as
+    # identify self as Polipus/VERSION
     :user_agent => "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
     # by default, don't limit the depth of the crawl
     :depth_limit => false,
@@ -39,12 +39,17 @@ module Polipus
     :read_timeout => 30,
     # HTTP open connection timeout in seconds
     :open_timeout => 10,
+    # Time to wait for new messages on Redis
+    # After this timeout, the current crawling session is marked as terminated
+    :queue_timeout => 30,
     # A URL tracker instance. Default is a Bloom filter based on Redis
     :url_tracker => nil,
     # A Redis options {} that will be passed directly to Redis.new
     :redis_options => {},
     # An instance of logger
     :logger => nil,
+    # A logger level
+    :logger_level => nil,
     # whether the query string should be included in the saved page
     :include_query_string_in_saved_page => true,
     # Max number of items to keep on redis
@@ -62,7 +67,9 @@ module Polipus
     # E.g. it can be used to follow links with and without the 'www' domain
     :domain_aliases => [],
     # Mark a connection as stale after connection_max_hits requests
-    :connection_max_hits => nil
+    :connection_max_hits => nil,
+    # Page TTL: mark a page as expired after ttl_page seconds
+    :ttl_page => nil
   }

   attr_reader :storage
@@ -86,6 +93,7 @@ module Polipus

       @job_name = job_name
       @options = OPTS.merge(options)
+      @options[:queue_timeout] = 1 if @options[:queue_timeout] <= 0
       @logger = @options[:logger] ||= Logger.new(nil)

       unless @logger.class.to_s == "Log4r::Logger"
@@ -137,8 +145,9 @@ module Polipus

       q = queue_factory
       @urls.each do |u|
         page = Page.new(u.to_s, :referer => '')
+        page.user_data.p_seeded = true
         q << page.to_json
       end

       return if q.empty?
@@ -149,7 +158,7 @@ module Polipus
         @logger.debug {"Start worker #{worker_number}"}
         http = @http_pool[worker_number] ||= HTTP.new(@options)
         queue = @queues_pool[worker_number] ||= queue_factory
-        queue.process(false, @options[:
+        queue.process(false, @options[:queue_timeout]) do |message|

         next if message.nil?
@@ -163,7 +172,7 @@ module Polipus
           next
         end

-        if
+        if page_exists? page
           @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
           queue.commit
           next
@@ -180,7 +189,7 @@ module Polipus
           @logger.info {"Got redirects! #{rurls}"}
           page = pages.pop
           page.aliases = pages.collect { |e| e.url }
-          if
+          if page_exists? page
             @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
             queue.commit
             next
@@ -202,7 +211,7 @@ module Polipus
         end

         if page
-          @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}]
+          @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
           @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] downloaded"}
         end

@@ -264,7 +273,7 @@ module Polipus
       self
     end

-    # A block of code will be executed on every page
+    # A block of code will be executed on every page downloaded,
     # before being saved in the registered storage
     def on_before_save(&block)
       @on_before_save << block
@@ -272,7 +281,7 @@ module Polipus
     end

     # A block of code will be executed
-    # on every page
+    # on every page downloaded. The code is used to extract urls to visit
     # see links_for method
     def focus_crawl(&block)
       @focus_crawl_block = block
@@ -332,6 +341,11 @@ module Polipus
     # URLs enqueue policy
     def should_be_visited?(url, with_tracker = true)

+      # Return +true+ if the url is part of the initial seeds,
+      # no matter what
+      return true if @urls.map(&:to_s).include?(url.to_s)
+
       # Check against whitelist pattern matching
       unless @follow_links_like.empty?
         return false unless @follow_links_like.any?{|p| url.path =~ p}
@@ -342,9 +356,12 @@ module Polipus
         return false if @skip_links_like.any?{|p| url.path =~ p}
       end

+      # Page is marked as expired
+      return true if page_expired?(Page.new(url))
+
       # Check against url tracker
       if with_tracker
-        return false if
+        return false if url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
       end
       true
     end
@@ -356,6 +373,19 @@ module Polipus
       links
     end

+    def page_expired? page
+      return false if @options[:ttl_page].nil?
+      stored_page = @storage.get(page)
+      r = stored_page && stored_page.expired?(@options[:ttl_page])
+      @logger.debug {"Page #{page.url.to_s} marked as expired"} if r
+      r
+    end
+
+    def page_exists? page
+      return false if page.user_data && page.user_data.p_seeded
+      @storage.exists?(page) && !page_expired?(page)
+    end
+
     # The url is enqueued for a later visit
     def enqueue url_to_visit, current_page, queue
       page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
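Putting the new helpers together: seed urls are always visited (p_seeded short-circuits page_exists?), and a stored page stops blocking re-visits once its TTL lapses, so it gets re-queued and re-downloaded. A sketch of the policy using the in-memory store (the TTL and timestamps are illustrative):

require "polipus"

store    = Polipus::Storage::MemoryStore.new
ttl_page = 60

# A page stored 120 seconds ago...
page = Polipus::Page.new("http://rubygems.org/gems",
                         :fetched_at => Time.now.to_i - 120)
store.add(page)

# ...is expired, so the equivalent of page_exists? returns false
stored  = store.get(page)
expired = stored && stored.expired?(ttl_page)  # => true
exists  = store.exists?(page) && !expired      # => false: will be re-downloaded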