polipus 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/examples/incremental.rb +62 -0
- data/lib/polipus/http.rb +8 -7
- data/lib/polipus/page.rb +37 -20
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +6 -4
- data/lib/polipus/storage.rb +5 -0
- data/lib/polipus/version.rb +1 -1
- data/lib/polipus.rb +41 -11
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/http_spec.rb +1 -0
- data/spec/page_spec.rb +21 -0
- data/spec/polipus_spec.rb +77 -0
- data/spec/storage_memory_spec.rb +89 -0
- data/spec/storage_mongo_spec.rb +18 -0
- metadata +20 -2
data/spec/http_spec.rb
CHANGED
@@ -12,6 +12,7 @@ describe Polipus::HTTP do
|
|
12
12
|
page = http.fetch_page("http://sfbay.craigslist.org/apa/")
|
13
13
|
page.should be_an_instance_of(Polipus::Page)
|
14
14
|
page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
|
15
|
+
page.fetched_at.should_not be_nil
|
15
16
|
end
|
16
17
|
end
|
17
18
|
|
data/spec/page_spec.rb
CHANGED
@@ -28,4 +28,25 @@ EOF
|
|
28
28
|
it 'should honor domain_aliases attribute' do
|
29
29
|
page.links.count.should be 4
|
30
30
|
end
|
31
|
+
|
32
|
+
context 'page expiring' do
|
33
|
+
let(:page) do
|
34
|
+
Polipus::Page.new 'http://www.google.com/',
|
35
|
+
code: 200,
|
36
|
+
body: '',
|
37
|
+
headers: {'content-type' => ['text/html']},
|
38
|
+
domain_aliases: %w(www.google.com google.com),
|
39
|
+
fetched_at: (Time.now.to_i - 30)
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should be marked at expired' do
|
43
|
+
page.expired?(20).should be_true
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should NOT be marked at expired' do
|
47
|
+
page.expired?(60).should be_false
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
|
31
52
|
end
|
data/spec/polipus_spec.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe Polipus::PolipusCrawler do
|
4
|
+
after(:each) {Redis.new(db:10).flushdb}
|
5
|
+
let(:p_options) {
|
6
|
+
{
|
7
|
+
workers: 1,
|
8
|
+
redis_options: {host: 'localhost', db:10},
|
9
|
+
depth_limit: 1,
|
10
|
+
queue_timeout: 1,
|
11
|
+
user_agent: 'polipus-rspec',
|
12
|
+
logger: logger,
|
13
|
+
logger_level: Logger::DEBUG,
|
14
|
+
storage: Polipus::Storage.memory_store
|
15
|
+
}
|
16
|
+
}
|
17
|
+
let(:polipus) {
|
18
|
+
Polipus::PolipusCrawler.new("polipus-rspec", ["http://rubygems.org/gems"], p_options)
|
19
|
+
}
|
20
|
+
|
21
|
+
let(:init_page){
|
22
|
+
init_page = Polipus::Page.new "http://rubygems.org/gems"
|
23
|
+
}
|
24
|
+
|
25
|
+
let(:logger){Logger.new(nil)}
|
26
|
+
|
27
|
+
context "polipus" do
|
28
|
+
|
29
|
+
it "should create a polipus instance" do
|
30
|
+
polipus.should be_an_instance_of Polipus::PolipusCrawler
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should execute a crawling session" do
|
34
|
+
polipus.takeover
|
35
|
+
polipus.storage.exists?(init_page).should be_true
|
36
|
+
polipus.storage.get(init_page).links.count.should be polipus.storage.count
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should filter unwanted urls" do
|
40
|
+
polipus.skip_links_like(/\/pages\//)
|
41
|
+
polipus.takeover
|
42
|
+
polipus.storage.get(init_page).links
|
43
|
+
.reject { |e| e.path.to_s =~ /\/pages\// }.count.should be polipus.storage.count
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should follow only wanted urls" do
|
47
|
+
polipus.follow_links_like(/\/pages\//)
|
48
|
+
polipus.follow_links_like(/\/gems$/)
|
49
|
+
polipus.takeover
|
50
|
+
polipus.storage.get(init_page).links
|
51
|
+
.reject { |e| ![/\/pages\//, /\/gems$/].any?{|p| e.path =~ p} }
|
52
|
+
.count.should be polipus.storage.count
|
53
|
+
end
|
54
|
+
|
55
|
+
it "should refresh expired pages" do
|
56
|
+
polipus.ttl_page = 3600
|
57
|
+
polipus.takeover
|
58
|
+
polipus.storage.each {|id, page| page.fetched_at = page.fetched_at - 3600; polipus.storage.add(page)}
|
59
|
+
polipus.storage.each {|id, page| page.expired?(3600).should be_true}
|
60
|
+
polipus.takeover
|
61
|
+
polipus.storage.each {|id, page| page.expired?(3600).should be_false}
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should re-download seeder urls no matter what" do
|
65
|
+
cache_hit = {}
|
66
|
+
polipus.follow_links_like(/\/gems$/)
|
67
|
+
polipus.on_page_downloaded do |page|
|
68
|
+
cache_hit[page.url.to_s] ||= 0
|
69
|
+
cache_hit[page.url.to_s] += 1
|
70
|
+
end
|
71
|
+
polipus.takeover
|
72
|
+
polipus.takeover
|
73
|
+
cache_hit["http://rubygems.org/gems"].should be 2
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
end
|
data/spec/storage_memory_spec.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "mongo"
|
3
|
+
require "polipus/storage/memory_store"
|
4
|
+
describe Polipus::Storage::MemoryStore do
|
5
|
+
|
6
|
+
let(:storage){Polipus::Storage.memory_store}
|
7
|
+
|
8
|
+
it 'should store a page' do
|
9
|
+
p = page_factory 'http://www.google.com', :code => 200, :body => '<html></html>'
|
10
|
+
uuid = storage.add p
|
11
|
+
uuid.should be == 'ed646a3334ca891fd3467db131372140'
|
12
|
+
storage.count.should be 1
|
13
|
+
p = storage.get p
|
14
|
+
p.url.to_s.should be == 'http://www.google.com'
|
15
|
+
p.body.should be == '<html></html>'
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should update a page' do
|
19
|
+
p = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
|
20
|
+
storage.add p
|
21
|
+
p = storage.get p
|
22
|
+
p.code.should be == 301
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'should iterate over stored pages' do
|
26
|
+
storage.each do |k, page|
|
27
|
+
k.should be == "ed646a3334ca891fd3467db131372140"
|
28
|
+
page.url.to_s.should be == 'http://www.google.com'
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'should delete a page' do
|
33
|
+
p = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
|
34
|
+
storage.remove p
|
35
|
+
storage.get(p).should be_nil
|
36
|
+
storage.count.should be 0
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'should store a page removing a query string from the uuid generation' do
|
40
|
+
p = page_factory 'http://www.asd.com/?asd=lol', :code => 200, :body => '<html></html>'
|
41
|
+
p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', :code => 200, :body => '<html></html>'
|
42
|
+
storage.include_query_string_in_uuid = false
|
43
|
+
storage.add p
|
44
|
+
storage.exists?(p_no_query).should be_true
|
45
|
+
storage.remove p
|
46
|
+
end
|
47
|
+
|
48
|
+
it 'should store a page removing a query string from the uuid generation no ending slash' do
|
49
|
+
p = page_factory 'http://www.asd.com?asd=lol', :code => 200, :body => '<html></html>'
|
50
|
+
p_no_query = page_factory 'http://www.asd.com', :code => 200, :body => '<html></html>'
|
51
|
+
storage.include_query_string_in_uuid = false
|
52
|
+
storage.add p
|
53
|
+
storage.exists?(p_no_query).should be_true
|
54
|
+
storage.remove p
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should store a page with user data associated' do
|
58
|
+
p = page_factory 'http://www.user.com', :code => 200, :body => '<html></html>'
|
59
|
+
p.user_data.name = 'Test User Data'
|
60
|
+
storage.add p
|
61
|
+
storage.exists?(p).should be_true
|
62
|
+
p = storage.get(p)
|
63
|
+
p.user_data.name.should be == 'Test User Data'
|
64
|
+
storage.remove p
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'should honor the except parameters' do
|
68
|
+
storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
|
69
|
+
p = page_factory 'http://www.user-doo.com', :code => 200, :body => '<html></html>'
|
70
|
+
storage.add p
|
71
|
+
p = storage.get p
|
72
|
+
p.body.should be_empty
|
73
|
+
storage.clear
|
74
|
+
end
|
75
|
+
|
76
|
+
it 'should return false if a doc not exists' do
|
77
|
+
storage.include_query_string_in_uuid = false
|
78
|
+
p_other = page_factory 'http://www.asdrrrr.com', :code => 200, :body => '<html></html>'
|
79
|
+
storage.exists?(p_other).should be_false
|
80
|
+
storage.add p_other
|
81
|
+
storage.exists?(p_other).should be_true
|
82
|
+
p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', :code => 200, :body => '<html></html>'
|
83
|
+
storage.exists?(p_other).should be_true
|
84
|
+
storage.include_query_string_in_uuid = true
|
85
|
+
storage.exists?(p_other).should be_false
|
86
|
+
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
data/spec/storage_mongo_spec.rb
CHANGED
@@ -99,4 +99,22 @@ describe Polipus::Storage::MongoStore do
|
|
99
99
|
|
100
100
|
end
|
101
101
|
|
102
|
+
it 'should set page.fetched_at based on the id creation' do
|
103
|
+
storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
|
104
|
+
p = page_factory 'http://www.user-doojo.com', :code => 200, :body => '<html></html>'
|
105
|
+
storage.add p
|
106
|
+
p.fetched_at.should be_nil
|
107
|
+
p = storage.get p
|
108
|
+
p.fetched_at.should_not be_nil
|
109
|
+
end
|
110
|
+
|
111
|
+
it 'should NOT set page.fetched_at if already present' do
|
112
|
+
storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
|
113
|
+
p = page_factory 'http://www.user-doojooo.com', :code => 200, :body => '<html></html>'
|
114
|
+
p.fetched_at = 10
|
115
|
+
storage.add p
|
116
|
+
p = storage.get p
|
117
|
+
p.fetched_at.should be 10
|
118
|
+
end
|
119
|
+
|
102
120
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis-bloomfilter
|
@@ -287,6 +287,7 @@ files:
|
|
287
287
|
- README.rdoc
|
288
288
|
- Rakefile
|
289
289
|
- examples/basic.rb
|
290
|
+
- examples/incremental.rb
|
290
291
|
- examples/survival.rb
|
291
292
|
- lib/polipus.rb
|
292
293
|
- lib/polipus/http.rb
|
@@ -304,6 +305,7 @@ files:
|
|
304
305
|
- lib/polipus/storage.rb
|
305
306
|
- lib/polipus/storage/base.rb
|
306
307
|
- lib/polipus/storage/dev_null.rb
|
308
|
+
- lib/polipus/storage/memory_store.rb
|
307
309
|
- lib/polipus/storage/mongo_store.rb
|
308
310
|
- lib/polipus/storage/s3_store.rb
|
309
311
|
- lib/polipus/url_tracker.rb
|
@@ -312,13 +314,19 @@ files:
|
|
312
314
|
- lib/polipus/version.rb
|
313
315
|
- polipus.gemspec
|
314
316
|
- spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
|
317
|
+
- spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
|
315
318
|
- spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
|
316
319
|
- spec/cassettes/4640919145753505af2d0f8423de37f3.yml
|
317
320
|
- spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
|
321
|
+
- spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml
|
318
322
|
- spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
|
323
|
+
- spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml
|
319
324
|
- spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
|
320
325
|
- spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
|
321
326
|
- spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
|
327
|
+
- spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
|
328
|
+
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
329
|
+
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
322
330
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
323
331
|
- spec/cassettes/http_tconnection_max_hits.yml
|
324
332
|
- spec/cassettes/http_test.yml
|
@@ -326,9 +334,11 @@ files:
|
|
326
334
|
- spec/clear.rb
|
327
335
|
- spec/http_spec.rb
|
328
336
|
- spec/page_spec.rb
|
337
|
+
- spec/polipus_spec.rb
|
329
338
|
- spec/queue_overflow_manager_spec.rb
|
330
339
|
- spec/queue_overflow_spec.rb
|
331
340
|
- spec/spec_helper.rb
|
341
|
+
- spec/storage_memory_spec.rb
|
332
342
|
- spec/storage_mongo_spec.rb
|
333
343
|
- spec/storage_s3_spec.rb
|
334
344
|
- spec/url_tracker_spec.rb
|
@@ -358,13 +368,19 @@ specification_version: 4
|
|
358
368
|
summary: Polipus distributed web-crawler framework
|
359
369
|
test_files:
|
360
370
|
- spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
|
371
|
+
- spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
|
361
372
|
- spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
|
362
373
|
- spec/cassettes/4640919145753505af2d0f8423de37f3.yml
|
363
374
|
- spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
|
375
|
+
- spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml
|
364
376
|
- spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
|
377
|
+
- spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml
|
365
378
|
- spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
|
366
379
|
- spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
|
367
380
|
- spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
|
381
|
+
- spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
|
382
|
+
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
383
|
+
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
368
384
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
369
385
|
- spec/cassettes/http_tconnection_max_hits.yml
|
370
386
|
- spec/cassettes/http_test.yml
|
@@ -372,9 +388,11 @@ test_files:
|
|
372
388
|
- spec/clear.rb
|
373
389
|
- spec/http_spec.rb
|
374
390
|
- spec/page_spec.rb
|
391
|
+
- spec/polipus_spec.rb
|
375
392
|
- spec/queue_overflow_manager_spec.rb
|
376
393
|
- spec/queue_overflow_spec.rb
|
377
394
|
- spec/spec_helper.rb
|
395
|
+
- spec/storage_memory_spec.rb
|
378
396
|
- spec/storage_mongo_spec.rb
|
379
397
|
- spec/storage_s3_spec.rb
|
380
398
|
- spec/url_tracker_spec.rb
|