polipus 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/examples/incremental.rb +62 -0
- data/lib/polipus/http.rb +8 -7
- data/lib/polipus/page.rb +37 -20
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +6 -4
- data/lib/polipus/storage.rb +5 -0
- data/lib/polipus/version.rb +1 -1
- data/lib/polipus.rb +41 -11
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/http_spec.rb +1 -0
- data/spec/page_spec.rb +21 -0
- data/spec/polipus_spec.rb +77 -0
- data/spec/storage_memory_spec.rb +89 -0
- data/spec/storage_mongo_spec.rb +18 -0
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    OGE5NThiNjIyZmFmMDhkMzQ1OGFjMDY3MTgxOThhMDUwMzM4NDE2Zg==
   data.tar.gz: !binary |-
-
+    YjhkYjUzOGI5MjJiYjdjMjQ0ZjdlOTAxOTBlODA0ZjRiYzBhZTI4Ng==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    NzEzOTk5NDQwMjdmYTZiOTlkNjllYmUxY2M0ZTY1ZWRjOTMwMzEwNjZmZGMy
+    ZjA0NjJiZDgzNzFhYzRkNGQ2ZGY3MmQxMTdlNmJjZDg2MTZiMGE2ZDA0N2Iz
+    NThlMjU2ODllNTIxMjA5OWRhYmY0ZDBlN2I2MjAzNjI3YzVlZDQ=
   data.tar.gz: !binary |-
-
-
-
+    ODM0ZDVkNDZlZDQ0NzM2NWEwOGY4MDRkY2IzM2U0MzBiYjQ3YzA0Njk0ZDU5
+    ZTVhYjkwZmQ1ZDRhYjhkMThlOWVkMTNiYzAxNTc1NDUwMDNiMjNmYjE5Nzhm
+    Yzc0MDI5N2ZkMjQxMzEwOTExODUyMmJjNWU1YTdmODAyYmIwNzU=
data/examples/incremental.rb
ADDED
@@ -0,0 +1,62 @@
+require "polipus"
+require "mongo"
+
+# Define a Mongo connection
+mongo = Mongo::Connection.new(:pool_size => 15, :pool_timeout => 5).db('crawler')
+# Override some default options
+options = {
+  # Redis connection
+  :redis_options => {
+    :host => 'localhost',
+    :db => 5,
+    :driver => 'hiredis'
+  },
+  # Page storage: 'pages' is the name of the collection where
+  # pages will be stored
+  :storage => Polipus::Storage.mongo_store(mongo, 'pages'),
+  # Use your custom user agent
+  :user_agent => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71",
+  # Use 20 threads
+  :workers => 20,
+  # Log to STDOUT
+  :logger => Logger.new(STDOUT),
+  # Do not go deeper than 5 levels
+  :depth_limit => 5,
+
+  # Incremental download:
+  # set a TTL for each stored page.
+  # If a previously stored page has expired, it will be re-downloaded.
+  # Mark a page as expired after 60s
+  :ttl_page => 60
+}
+
+starting_urls = ["http://rubygems.org/gems"]
+
+# Crawl the entire rubygems.org site
+# Polipus.crawler('polipus-rubygems', starting_urls, options)
+
+Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
+  # Ignore urls pointing to a gem file
+  crawler.skip_links_like(/\.gem$/)
+  # Ignore urls pointing to an atom feed
+  crawler.skip_links_like(/\.atom$/)
+  # Ignore urls containing a /versions/ path
+  crawler.skip_links_like(/\/versions\//)
+
+  # Add some metadata to a page
+  # The metadata will be stored on MongoDB
+  crawler.on_before_save do |page|
+    page.user_data.processed = false
+  end
+
+  # In-place page processing
+  crawler.on_page_downloaded do |page|
+    # page.doc is a Nokogiri object
+    puts "Page title: #{page.doc.css('title').text}" rescue "ERROR"
+  end
+
+  # Do nifty stuff at the end of the crawling session
+  crawler.on_crawl_end do
+    # Gong.bang(:loudly)
+  end
+end
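For context, a minimal crawl using only the API exercised in this example might look like the sketch below (job name and starting URL are placeholders; every omitted option falls back to the defaults in Polipus::OPTS):

require "polipus"

# A minimal sketch: same entry point as the example above,
# with all optional configuration left at its defaults.
Polipus.crawler('polipus-demo', ['http://rubygems.org/']) do |crawler|
  crawler.on_page_downloaded do |page|
    puts "Downloaded: #{page.url}"
  end
end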
data/lib/polipus/http.rb
CHANGED
@@ -36,13 +36,14 @@ module Polipus
           gzip = Zlib::GzipReader.new(StringIO.new(body))
           body = gzip.read
         end
-        pages << Page.new(location, :body => response.body.dup,
-                                    :code => code,
-                                    :headers => response.to_hash,
-                                    :referer => referer,
-                                    :depth => depth,
-                                    :redirect_to => redirect_to,
-                                    :response_time => response_time)
+        pages << Page.new(location, :body => response.body.dup,
+                                    :code => code,
+                                    :headers => response.to_hash,
+                                    :referer => referer,
+                                    :depth => depth,
+                                    :redirect_to => redirect_to,
+                                    :response_time => response_time,
+                                    :fetched_at => Time.now.to_i)
       end
 
       return pages
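The practical effect of this change is that every page built by the HTTP layer now carries a Unix timestamp of its download. A small sketch (hand-built page, URL is a placeholder):

require "polipus"

# Sketch: :fetched_at, newly set by the HTTP layer above, records when
# the page was downloaded; the TTL logic added in this release compares
# against it.
page = Polipus::Page.new("http://example.com/",
                         :code => 200,
                         :body => "<html></html>",
                         :fetched_at => Time.now.to_i)
puts Time.at(page.fetched_at) # when this page was (notionally) fetched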
data/lib/polipus/page.rb
CHANGED
@@ -35,6 +35,8 @@ module Polipus
     # Default: true
     attr_accessor :storable
 
+    attr_accessor :fetched_at
+
     #
     # Create a new page
     #
@@ -54,6 +56,7 @@ module Polipus
       @user_data = OpenStruct.new
       @domain_aliases = params[:domain_aliases] ||= []
       @storable = true
+      @fetched_at = params[:fetched_at]
     end
 
     #
@@ -177,17 +180,19 @@ module Polipus
     end
 
     def to_hash
-      {
-        '
-        '
-        '
-        '
-        '
-        '
-        '
+      {
+        'url' => @url.to_s,
+        'headers' => Marshal.dump(@headers),
+        'body' => @body,
+        'links' => links.map(&:to_s),
+        'code' => @code,
+        'depth' => @depth,
+        'referer' => @referer.to_s,
+        'redirect_to' => @redirect_to.to_s,
         'response_time' => @response_time,
-        'fetched'
-        'user_data'
+        'fetched' => @fetched,
+        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
+        'fetched_at' => @fetched_at
       }
     end
 
@@ -198,22 +203,34 @@ module Polipus
       th.to_json
     end
 
+    #
+    # Returns +true+ if page is marked as storable,
+    # +false+ otherwise.
+    # Default is +true+
+    #
     def storable?
       @storable
     end
 
+    def expired? ttl
+      return false if fetched_at.nil?
+      (Time.now.to_i - ttl) > fetched_at
+    end
+
     def self.from_hash(hash)
      page = self.new(URI(hash['url']))
-      {
-
-
-
-
-
-
-
-
-
+      {
+        '@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
+        '@body' => hash['body'],
+        '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
+        '@code' => hash['code'].to_i,
+        '@depth' => hash['depth'].to_i,
+        '@referer' => hash['referer'],
+        '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
+        '@response_time' => hash['response_time'].to_i,
+        '@fetched' => hash['fetched'],
+        '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
+        '@fetched_at' => hash['fetched_at']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
data/lib/polipus/storage/memory_store.rb
ADDED
@@ -0,0 +1,56 @@
+require "thread"
+module Polipus
+  module Storage
+    class MemoryStore < Base
+
+      def initialize(options = {})
+        @store = Hash.new
+        @semaphore = Mutex.new
+      end
+
+      def add page
+        @semaphore.synchronize {
+          u = uuid(page)
+          @store[u] = page
+          u
+        }
+      end
+
+      def exists?(page)
+        @semaphore.synchronize {
+          @store.key?(uuid(page))
+        }
+      end
+
+      def get page
+        @semaphore.synchronize {
+          @store[uuid(page)]
+        }
+      end
+
+      def remove page
+        @semaphore.synchronize {
+          @store.delete(uuid(page))
+        }
+      end
+
+      def count
+        @semaphore.synchronize {
+          @store.count
+        }
+      end
+
+      def each
+        @store.each do |k,v|
+          yield k,v
+        end
+      end
+
+      def clear
+        @semaphore.synchronize {
+          @store = Hash.new
+        }
+      end
+    end
+  end
+end
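MemoryStore exposes the same interface as the Mongo-backed store (add, get, exists?, remove, count, each, clear), so it can be swapped in through the :storage option; a sketch, handy for experiments and specs since it needs no MongoDB (job name and URL are placeholders):

require "polipus"

# Sketch: keep crawled pages in a mutex-guarded in-process Hash.
# Contents are lost when the process exits.
storage = Polipus::Storage::MemoryStore.new

Polipus.crawler('memory-demo', ['http://rubygems.org/'], :storage => storage) do |crawler|
  crawler.on_crawl_end do
    puts "Pages held in memory: #{storage.count}"
  end
end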
data/lib/polipus/storage/mongo_store.rb
CHANGED
@@ -39,9 +39,7 @@ module Polipus
     def get page
       @semaphore.synchronize {
         data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
-        if data
-          return load_page(data)
-        end
+        return load_page(data) if data
       }
     end
 
@@ -75,7 +73,11 @@ module Polipus
       end
       begin
         hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
-
+        page = Page.from_hash(hash)
+        if page.fetched_at.nil?
+          page.fetched_at = hash['_id'].generation_time.to_i
+        end
+        return page
       rescue
       end
       nil
data/lib/polipus/storage.rb
CHANGED
data/lib/polipus/version.rb
CHANGED
data/lib/polipus.rb
CHANGED
@@ -23,7 +23,7 @@ module Polipus
    OPTS = {
      # run 4 threads
      :workers => 4,
-     # identify self as
+     # identify self as Polipus/VERSION
      :user_agent => "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
@@ -39,12 +39,17 @@ module Polipus
      :read_timeout => 30,
      # HTTP open connection timeout in seconds
      :open_timeout => 10,
+     # Time to wait for new messages on Redis
+     # After this timeout, the current crawling session is marked as terminated
+     :queue_timeout => 30,
      # An URL tracker instance. default is Bloomfilter based on redis
      :url_tracker => nil,
      # A Redis options {} that will be passed directly to Redis.new
      :redis_options => {},
      # An instance of logger
      :logger => nil,
+     # A logger level
+     :logger_level => nil,
      # whether the query string should be included in the saved page
      :include_query_string_in_saved_page => true,
      # Max number of items to keep on redis
@@ -62,7 +67,9 @@ module Polipus
      # Eg It can be used to follow links with and without 'www' domain
      :domain_aliases => [],
      # Mark a connection as stale after connection_max_hits requests
-     :connection_max_hits => nil
+     :connection_max_hits => nil,
+     # Page TTL: mark a page as expired after ttl_page seconds
+     :ttl_page => nil
    }
 
    attr_reader :storage
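The new options can be exercised together; a sketch (job name, URL, and values are placeholders):

require "polipus"
require "logger"

options = {
  # stop waiting for new queue messages after 10 seconds
  :queue_timeout => 10,
  # treat stored pages older than one hour as expired
  :ttl_page => 3600,
  # level for the logger
  :logger_level => Logger::DEBUG
}
Polipus.crawler('options-demo', ['http://rubygems.org/'], options)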
@@ -86,6 +93,7 @@ module Polipus
 
      @job_name = job_name
      @options = OPTS.merge(options)
+     @options[:queue_timeout] = 1 if @options[:queue_timeout] <= 0
      @logger = @options[:logger] ||= Logger.new(nil)
 
      unless @logger.class.to_s == "Log4r::Logger"
@@ -137,8 +145,9 @@ module Polipus
 
      q = queue_factory
      @urls.each do |u|
-
-
+       page = Page.new(u.to_s, :referer => '')
+       page.user_data.p_seeded = true
+       q << page.to_json
      end
 
      return if q.empty?
@@ -149,7 +158,7 @@ module Polipus
        @logger.debug {"Start worker #{worker_number}"}
        http = @http_pool[worker_number] ||= HTTP.new(@options)
        queue = @queues_pool[worker_number] ||= queue_factory
-       queue.process(false, @options[:
+       queue.process(false, @options[:queue_timeout]) do |message|
 
          next if message.nil?
 
@@ -163,7 +172,7 @@ module Polipus
            next
          end
 
-         if
+         if page_exists? page
            @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
            queue.commit
            next
@@ -180,7 +189,7 @@ module Polipus
            @logger.info {"Got redirects! #{rurls}"}
            page = pages.pop
            page.aliases = pages.collect { |e| e.url }
-           if
+           if page_exists? page
             @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
             queue.commit
             next
@@ -202,7 +211,7 @@ module Polipus
          end
 
          if page
-           @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}]
+           @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
            @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] downloaded"}
          end
 
@@ -264,7 +273,7 @@ module Polipus
      self
    end
 
-   # A block of code will be executed on every page
+   # A block of code will be executed on every page downloaded
    # before being saved in the registered storage
    def on_before_save(&block)
      @on_before_save << block
@@ -272,7 +281,7 @@ module Polipus
    end
 
    # A block of code will be executed
-   # on every page
+   # on every page downloaded. The code is used to extract urls to visit
    # see links_for method
    def focus_crawl(&block)
      @focus_crawl_block = block
@@ -332,6 +341,11 @@ module Polipus
    # URLs enqueue policy
    def should_be_visited?(url, with_tracker = true)
 
+     # return +true+ if an url is part of the initial seeder,
+     # no matter what
+
+     return true if @urls.map(&:to_s).include?(url.to_s)
+
      # Check against whitelist pattern matching
      unless @follow_links_like.empty?
        return false unless @follow_links_like.any?{|p| url.path =~ p}
@@ -342,9 +356,12 @@ module Polipus
        return false if @skip_links_like.any?{|p| url.path =~ p}
      end
 
+     # Page is marked as expired
+     return true if page_expired?(Page.new(url))
+
      # Check against url tracker
      if with_tracker
-       return false if
+       return false if url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
      end
      true
    end
@@ -356,6 +373,19 @@ module Polipus
      links
    end
 
+   def page_expired? page
+     return false if @options[:ttl_page].nil?
+     stored_page = @storage.get(page)
+     r = stored_page && stored_page.expired?(@options[:ttl_page])
+     @logger.debug {"Page #{page.url.to_s} marked as expired"} if r
+     r
+   end
+
+   def page_exists? page
+     return false if page.user_data && page.user_data.p_seeded
+     @storage.exists?(page) && !page_expired?(page)
+   end
+
    # The url is enqueued for a later visit
    def enqueue url_to_visit, current_page, queue
      page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
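Taken together, these two predicates define the release's incremental behavior; a sketch of the decision flow (in comments, since both methods are crawler-internal):

# page_exists?(page)
#   -> false for seed pages (user_data.p_seeded is set when the starting
#      URLs are enqueued), so the initial URLs are always fetched
#   -> false when the stored copy is older than :ttl_page (page_expired?),
#      so expired pages are downloaded again
#   -> true only for a stored, still-fresh copy, which the worker skips
#
# In other words, with :ttl_page => 60 a URL crawled more than a minute
# ago no longer counts as already stored and will be re-visited.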
|