polipus 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/examples/incremental.rb +62 -0
- data/lib/polipus/http.rb +8 -7
- data/lib/polipus/page.rb +37 -20
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +6 -4
- data/lib/polipus/storage.rb +5 -0
- data/lib/polipus/version.rb +1 -1
- data/lib/polipus.rb +41 -11
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/http_spec.rb +1 -0
- data/spec/page_spec.rb +21 -0
- data/spec/polipus_spec.rb +77 -0
- data/spec/storage_memory_spec.rb +89 -0
- data/spec/storage_mongo_spec.rb +18 -0
- metadata +20 -2
data/spec/http_spec.rb
CHANGED
@@ -12,6 +12,7 @@ describe Polipus::HTTP do
|
|
12
12
|
page = http.fetch_page("http://sfbay.craigslist.org/apa/")
|
13
13
|
page.should be_an_instance_of(Polipus::Page)
|
14
14
|
page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
|
15
|
+
page.fetched_at.should_not be_nil
|
15
16
|
end
|
16
17
|
end
|
17
18
|
|
data/spec/page_spec.rb
CHANGED
@@ -28,4 +28,25 @@ EOF
|
|
28
28
|
it 'should honor domain_aliases attribute' do
|
29
29
|
page.links.count.should be 4
|
30
30
|
end
|
31
|
+
|
32
|
+
context 'page expiring' do
|
33
|
+
let(:page) do
|
34
|
+
Polipus::Page.new 'http://www.google.com/',
|
35
|
+
code: 200,
|
36
|
+
body: '',
|
37
|
+
headers: {'content-type' => ['text/html']},
|
38
|
+
domain_aliases: %w(www.google.com google.com),
|
39
|
+
fetched_at: (Time.now.to_i - 30)
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should be marked at expired' do
|
43
|
+
page.expired?(20).should be_true
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should NOT be marked at expired' do
|
47
|
+
page.expired?(60).should be_false
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
|
31
52
|
end
|
data/spec/polipus_spec.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe Polipus::PolipusCrawler do
|
4
|
+
after(:each) {Redis.new(db:10).flushdb}
|
5
|
+
let(:p_options) {
|
6
|
+
{
|
7
|
+
workers: 1,
|
8
|
+
redis_options: {host: 'localhost', db:10},
|
9
|
+
depth_limit: 1,
|
10
|
+
queue_timeout: 1,
|
11
|
+
user_agent: 'polipus-rspec',
|
12
|
+
logger: logger,
|
13
|
+
logger_level: Logger::DEBUG,
|
14
|
+
storage: Polipus::Storage.memory_store
|
15
|
+
}
|
16
|
+
}
|
17
|
+
let(:polipus) {
|
18
|
+
Polipus::PolipusCrawler.new("polipus-rspec", ["http://rubygems.org/gems"], p_options)
|
19
|
+
}
|
20
|
+
|
21
|
+
let(:init_page){
|
22
|
+
init_page = Polipus::Page.new "http://rubygems.org/gems"
|
23
|
+
}
|
24
|
+
|
25
|
+
let(:logger){Logger.new(nil)}
|
26
|
+
|
27
|
+
context "polipus" do
|
28
|
+
|
29
|
+
it "should create a polipus instance" do
|
30
|
+
polipus.should be_an_instance_of Polipus::PolipusCrawler
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should execute a crawling session" do
|
34
|
+
polipus.takeover
|
35
|
+
polipus.storage.exists?(init_page).should be_true
|
36
|
+
polipus.storage.get(init_page).links.count.should be polipus.storage.count
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should filter unwanted urls" do
|
40
|
+
polipus.skip_links_like(/\/pages\//)
|
41
|
+
polipus.takeover
|
42
|
+
polipus.storage.get(init_page).links
|
43
|
+
.reject { |e| e.path.to_s =~ /\/pages\// }.count.should be polipus.storage.count
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should follow only wanted urls" do
|
47
|
+
polipus.follow_links_like(/\/pages\//)
|
48
|
+
polipus.follow_links_like(/\/gems$/)
|
49
|
+
polipus.takeover
|
50
|
+
polipus.storage.get(init_page).links
|
51
|
+
.reject { |e| ![/\/pages\//, /\/gems$/].any?{|p| e.path =~ p} }
|
52
|
+
.count.should be polipus.storage.count
|
53
|
+
end
|
54
|
+
|
55
|
+
it "should refresh expired pages" do
|
56
|
+
polipus.ttl_page = 3600
|
57
|
+
polipus.takeover
|
58
|
+
polipus.storage.each {|id, page| page.fetched_at = page.fetched_at - 3600; polipus.storage.add(page)}
|
59
|
+
polipus.storage.each {|id, page| page.expired?(3600).should be_true}
|
60
|
+
polipus.takeover
|
61
|
+
polipus.storage.each {|id, page| page.expired?(3600).should be_false}
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should re-download seeder urls no matter what" do
|
65
|
+
cache_hit = {}
|
66
|
+
polipus.follow_links_like(/\/gems$/)
|
67
|
+
polipus.on_page_downloaded do |page|
|
68
|
+
cache_hit[page.url.to_s] ||= 0
|
69
|
+
cache_hit[page.url.to_s] += 1
|
70
|
+
end
|
71
|
+
polipus.takeover
|
72
|
+
polipus.takeover
|
73
|
+
cache_hit["http://rubygems.org/gems"].should be 2
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
end
|
data/spec/storage_memory_spec.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "mongo"
|
3
|
+
require "polipus/storage/memory_store"
|
4
|
+
describe Polipus::Storage::MemoryStore do
|
5
|
+
|
6
|
+
let(:storage){Polipus::Storage.memory_store}
|
7
|
+
|
8
|
+
it 'should store a page' do
|
9
|
+
p = page_factory 'http://www.google.com', :code => 200, :body => '<html></html>'
|
10
|
+
uuid = storage.add p
|
11
|
+
uuid.should be == 'ed646a3334ca891fd3467db131372140'
|
12
|
+
storage.count.should be 1
|
13
|
+
p = storage.get p
|
14
|
+
p.url.to_s.should be == 'http://www.google.com'
|
15
|
+
p.body.should be == '<html></html>'
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should update a page' do
|
19
|
+
p = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
|
20
|
+
storage.add p
|
21
|
+
p = storage.get p
|
22
|
+
p.code.should be == 301
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'should iterate over stored pages' do
|
26
|
+
storage.each do |k, page|
|
27
|
+
k.should be == "ed646a3334ca891fd3467db131372140"
|
28
|
+
page.url.to_s.should be == 'http://www.google.com'
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'should delete a page' do
|
33
|
+
p = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
|
34
|
+
storage.remove p
|
35
|
+
storage.get(p).should be_nil
|
36
|
+
storage.count.should be 0
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'should store a page removing a query string from the uuid generation' do
|
40
|
+
p = page_factory 'http://www.asd.com/?asd=lol', :code => 200, :body => '<html></html>'
|
41
|
+
p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', :code => 200, :body => '<html></html>'
|
42
|
+
storage.include_query_string_in_uuid = false
|
43
|
+
storage.add p
|
44
|
+
storage.exists?(p_no_query).should be_true
|
45
|
+
storage.remove p
|
46
|
+
end
|
47
|
+
|
48
|
+
it 'should store a page removing a query string from the uuid generation no ending slash' do
|
49
|
+
p = page_factory 'http://www.asd.com?asd=lol', :code => 200, :body => '<html></html>'
|
50
|
+
p_no_query = page_factory 'http://www.asd.com', :code => 200, :body => '<html></html>'
|
51
|
+
storage.include_query_string_in_uuid = false
|
52
|
+
storage.add p
|
53
|
+
storage.exists?(p_no_query).should be_true
|
54
|
+
storage.remove p
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should store a page with user data associated' do
|
58
|
+
p = page_factory 'http://www.user.com', :code => 200, :body => '<html></html>'
|
59
|
+
p.user_data.name = 'Test User Data'
|
60
|
+
storage.add p
|
61
|
+
storage.exists?(p).should be_true
|
62
|
+
p = storage.get(p)
|
63
|
+
p.user_data.name.should be == 'Test User Data'
|
64
|
+
storage.remove p
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'should honor the except parameters' do
|
68
|
+
storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
|
69
|
+
p = page_factory 'http://www.user-doo.com', :code => 200, :body => '<html></html>'
|
70
|
+
storage.add p
|
71
|
+
p = storage.get p
|
72
|
+
p.body.should be_empty
|
73
|
+
storage.clear
|
74
|
+
end
|
75
|
+
|
76
|
+
it 'should return false if a doc not exists' do
|
77
|
+
storage.include_query_string_in_uuid = false
|
78
|
+
p_other = page_factory 'http://www.asdrrrr.com', :code => 200, :body => '<html></html>'
|
79
|
+
storage.exists?(p_other).should be_false
|
80
|
+
storage.add p_other
|
81
|
+
storage.exists?(p_other).should be_true
|
82
|
+
p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', :code => 200, :body => '<html></html>'
|
83
|
+
storage.exists?(p_other).should be_true
|
84
|
+
storage.include_query_string_in_uuid = true
|
85
|
+
storage.exists?(p_other).should be_false
|
86
|
+
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
data/spec/storage_mongo_spec.rb
CHANGED
@@ -99,4 +99,22 @@ describe Polipus::Storage::MongoStore do
|
|
99
99
|
|
100
100
|
end
|
101
101
|
|
102
|
+
it 'should set page.fetched_at based on the id creation' do
|
103
|
+
storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
|
104
|
+
p = page_factory 'http://www.user-doojo.com', :code => 200, :body => '<html></html>'
|
105
|
+
storage.add p
|
106
|
+
p.fetched_at.should be_nil
|
107
|
+
p = storage.get p
|
108
|
+
p.fetched_at.should_not be_nil
|
109
|
+
end
|
110
|
+
|
111
|
+
it 'should NOT set page.fetched_at if already present' do
|
112
|
+
storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
|
113
|
+
p = page_factory 'http://www.user-doojooo.com', :code => 200, :body => '<html></html>'
|
114
|
+
p.fetched_at = 10
|
115
|
+
storage.add p
|
116
|
+
p = storage.get p
|
117
|
+
p.fetched_at.should be 10
|
118
|
+
end
|
119
|
+
|
102
120
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis-bloomfilter
|
@@ -287,6 +287,7 @@ files:
|
|
287
287
|
- README.rdoc
|
288
288
|
- Rakefile
|
289
289
|
- examples/basic.rb
|
290
|
+
- examples/incremental.rb
|
290
291
|
- examples/survival.rb
|
291
292
|
- lib/polipus.rb
|
292
293
|
- lib/polipus/http.rb
|
@@ -304,6 +305,7 @@ files:
|
|
304
305
|
- lib/polipus/storage.rb
|
305
306
|
- lib/polipus/storage/base.rb
|
306
307
|
- lib/polipus/storage/dev_null.rb
|
308
|
+
- lib/polipus/storage/memory_store.rb
|
307
309
|
- lib/polipus/storage/mongo_store.rb
|
308
310
|
- lib/polipus/storage/s3_store.rb
|
309
311
|
- lib/polipus/url_tracker.rb
|
@@ -312,13 +314,19 @@ files:
|
|
312
314
|
- lib/polipus/version.rb
|
313
315
|
- polipus.gemspec
|
314
316
|
- spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
|
317
|
+
- spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
|
315
318
|
- spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
|
316
319
|
- spec/cassettes/4640919145753505af2d0f8423de37f3.yml
|
317
320
|
- spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
|
321
|
+
- spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml
|
318
322
|
- spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
|
323
|
+
- spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml
|
319
324
|
- spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
|
320
325
|
- spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
|
321
326
|
- spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
|
327
|
+
- spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
|
328
|
+
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
329
|
+
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
322
330
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
323
331
|
- spec/cassettes/http_tconnection_max_hits.yml
|
324
332
|
- spec/cassettes/http_test.yml
|
@@ -326,9 +334,11 @@ files:
|
|
326
334
|
- spec/clear.rb
|
327
335
|
- spec/http_spec.rb
|
328
336
|
- spec/page_spec.rb
|
337
|
+
- spec/polipus_spec.rb
|
329
338
|
- spec/queue_overflow_manager_spec.rb
|
330
339
|
- spec/queue_overflow_spec.rb
|
331
340
|
- spec/spec_helper.rb
|
341
|
+
- spec/storage_memory_spec.rb
|
332
342
|
- spec/storage_mongo_spec.rb
|
333
343
|
- spec/storage_s3_spec.rb
|
334
344
|
- spec/url_tracker_spec.rb
|
@@ -358,13 +368,19 @@ specification_version: 4
|
|
358
368
|
summary: Polipus distributed web-crawler framework
|
359
369
|
test_files:
|
360
370
|
- spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml
|
371
|
+
- spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml
|
361
372
|
- spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml
|
362
373
|
- spec/cassettes/4640919145753505af2d0f8423de37f3.yml
|
363
374
|
- spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml
|
375
|
+
- spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml
|
364
376
|
- spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml
|
377
|
+
- spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml
|
365
378
|
- spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml
|
366
379
|
- spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml
|
367
380
|
- spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml
|
381
|
+
- spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml
|
382
|
+
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
383
|
+
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
368
384
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
369
385
|
- spec/cassettes/http_tconnection_max_hits.yml
|
370
386
|
- spec/cassettes/http_test.yml
|
@@ -372,9 +388,11 @@ test_files:
|
|
372
388
|
- spec/clear.rb
|
373
389
|
- spec/http_spec.rb
|
374
390
|
- spec/page_spec.rb
|
391
|
+
- spec/polipus_spec.rb
|
375
392
|
- spec/queue_overflow_manager_spec.rb
|
376
393
|
- spec/queue_overflow_spec.rb
|
377
394
|
- spec/spec_helper.rb
|
395
|
+
- spec/storage_memory_spec.rb
|
378
396
|
- spec/storage_mongo_spec.rb
|
379
397
|
- spec/storage_s3_spec.rb
|
380
398
|
- spec/url_tracker_spec.rb
|