parallel588_polipus 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +33 -0
- data/.travis.yml +22 -0
- data/AUTHORS.md +5 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +70 -0
- data/Rakefile +8 -0
- data/examples/basic.rb +63 -0
- data/examples/error_handling.rb +23 -0
- data/examples/incremental.rb +63 -0
- data/examples/robots_txt_handling.rb +14 -0
- data/examples/survival.rb +10 -0
- data/lib/polipus.rb +488 -0
- data/lib/polipus/http.rb +282 -0
- data/lib/polipus/page.rb +256 -0
- data/lib/polipus/plugin.rb +14 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +15 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +26 -0
- data/lib/polipus/queue_overflow/base.rb +7 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
- data/lib/polipus/queue_overflow/manager.rb +57 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/robotex.rb +145 -0
- data/lib/polipus/signal_handler.rb +42 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +20 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +90 -0
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/url_tracker.rb +21 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +5 -0
- data/polipus.gemspec +44 -0
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/cassettes/gzipped_on.yml +147 -0
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +12 -0
- data/spec/polipus/http_spec.rb +139 -0
- data/spec/polipus/page_spec.rb +68 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
- data/spec/polipus/queue_overflow_spec.rb +66 -0
- data/spec/polipus/robotex_spec.rb +85 -0
- data/spec/polipus/signal_handler_spec.rb +15 -0
- data/spec/polipus/storage/memory_store_spec.rb +87 -0
- data/spec/polipus/storage/mongo_store_spec.rb +119 -0
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/polipus/url_tracker_spec.rb +29 -0
- data/spec/polipus_spec.rb +107 -0
- data/spec/spec_helper.rb +42 -0
- metadata +348 -0
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'polipus/robotex'
|
3
|
+
|
4
|
+
# Specs for Polipus::Robotex: user-agent selection, allow/disallow decisions
# and Crawl-Delay lookup, all evaluated against a stubbed robots.txt.
describe Polipus::Robotex do
  let(:spec_domain) { 'http://www.example.com/' }

  # Stub the GET for http://www.example.com/robots.txt with a fixed body that
  # has one section per user-agent exercised below (msnbot, bender, *).
  before(:each) do
    robots = <<-END
User-Agent: msnbot
Crawl-Delay: 20

User-Agent: bender
Disallow: /my_shiny_metal_ass

User-Agent: *
Disallow: /login
Allow: /

Disallow: /locked
Allow: /locked
    END
    stub_request(:get, 'http://www.example.com/robots.txt')
      .to_return(body: robots, status: [200, 'OK'], headers: { 'Content-Type' => 'text/plain' })
  end

  describe '#initialize' do
    context 'when no arguments are supplied' do
      it 'returns a Robotex with the default user-agent' do
        expect(Polipus::Robotex.new.user_agent).to eq("Robotex/#{Polipus::Robotex::VERSION} (http://www.github.com/chriskite/robotex)")
      end
    end

    context 'when a user-agent is specified' do
      it 'returns a Robotex with the specified user-agent' do
        ua = 'My User Agent'
        expect(Polipus::Robotex.new(ua).user_agent).to eq(ua)
      end
    end
  end

  describe '#allowed?' do
    context 'when the robots.txt disallows the user-agent to the url' do
      it 'returns false' do
        robotex = Polipus::Robotex.new('bender')
        expect(robotex.allowed?(spec_domain + 'my_shiny_metal_ass')).to be_falsey
      end
    end

    context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
      it 'returns true' do
        robotex = Polipus::Robotex.new('bender')
        expect(robotex.allowed?(spec_domain + 'cigars')).to be_truthy
      end
    end

    context 'when the robots.txt disallows any user-agent to the url' do
      it 'returns false' do
        robotex = Polipus::Robotex.new
        expect(robotex.allowed?(spec_domain + 'login')).to be_falsey
      end
    end

    context 'when the robots.txt disallows and then allows the url' do
      it 'returns false' do
        robotex = Polipus::Robotex.new
        expect(robotex.allowed?(spec_domain + 'locked')).to be_falsey
      end
    end
  end

  describe '#delay' do
    # The two contexts are siblings: previously the first context was left
    # open, so the "Crawl-Delay is specified" case was nested under the
    # contradictory "no Crawl-Delay is specified" description.
    context 'when no Crawl-Delay is specified for the user-agent' do
      it 'returns nil' do
        robotex = Polipus::Robotex.new
        expect(robotex.delay(spec_domain)).to be_nil
      end
    end

    context 'when Crawl-Delay is specified for the user-agent' do
      it 'returns the delay as a Fixnum' do
        robotex = Polipus::Robotex.new('msnbot')
        expect(robotex.delay(spec_domain)).to eq(20)
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Expects Polipus::SignalHandler.enabled? to reflect the
# :enable_signal_handler option passed when a PolipusCrawler is built.
describe Polipus::SignalHandler do
  context 'signal handler' do
    it 'should be enabled by default' do
      # No options given: the handler is expected to end up enabled.
      Polipus::PolipusCrawler.new('polipus-rspec', [])

      handler_state = described_class.enabled?
      expect(handler_state).to be(true)
    end

    it 'should be disabled if specified' do
      crawler_options = { enable_signal_handler: false }
      Polipus::PolipusCrawler.new('polipus-rspec', [], crawler_options)

      handler_state = described_class.enabled?
      expect(handler_state).to be(false)
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'mongo'
|
4
|
+
require 'polipus/storage/memory_store'
|
5
|
+
# Exercises the store returned by Polipus::Storage.memory_store:
# add / get / remove / each / exists? / count, plus the
# include_query_string_in_uuid switch.
describe Polipus::Storage::MemoryStore do
  # `let` is memoized per example, so each example calls the factory anew.
  let(:storage) { Polipus::Storage.memory_store }

  it 'should store a page' do
    page = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
    uuid = storage.add page
    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
    expect(storage.count).to be 1
    page = storage.get page
    expect(page.url.to_s).to eq('http://www.google.com')
    expect(page.body).to eq('<html></html>')
  end

  it 'should update a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    storage.add page
    page = storage.get page
    expect(page.code).to eq(301)
  end

  it 'should iterate over stored pages' do
    # Seed the store first: without a stored page the block below never runs
    # and the example passes without checking anything.
    storage.add page_factory('http://www.google.com', code: 200, body: '<html></html>')
    visited = 0
    storage.each do |k, page|
      visited += 1
      expect(k).to eq('ed646a3334ca891fd3467db131372140')
      expect(page.url.to_s).to eq('http://www.google.com')
    end
    expect(visited).to eq(1)
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    # Store the page before removing it so the removal is actually exercised.
    storage.add page
    storage.remove page
    expect(storage.get(page)).to be_nil
    expect(storage.count).to be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
    storage.include_query_string_in_uuid = false
    storage.add page
    expect(storage.exists?(p_no_query)).to be_truthy
    storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
    storage.include_query_string_in_uuid = false
    storage.add page
    expect(storage.exists?(p_no_query)).to be_truthy
    storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
    page.user_data.name = 'Test User Data'
    storage.add page
    expect(storage.exists?(page)).to be_truthy
    page = storage.get(page)
    expect(page.user_data.name).to eq('Test User Data')
    storage.remove page
  end

  it 'should honor the except parameters' do
    # NOTE(review): this example builds a mongo_store from @mongo, which is not
    # assigned anywhere in this file, so it does not exercise MemoryStore.
    # It looks copied from the MongoStore spec — confirm whether MemoryStore
    # supports an `except` list, then retarget or drop this example.
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
    page = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
    storage.add page
    page = storage.get page
    expect(page.body).to be_empty
    storage.clear
  end

  it 'should return false if a doc not exists' do
    storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
    expect(storage.exists?(p_other)).to be_falsey
    storage.add p_other
    expect(storage.exists?(p_other)).to be_truthy
    # Same host with a tracking query: matches only while the query string is
    # excluded from the uuid.
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
    expect(storage.exists?(p_other)).to be_truthy
    storage.include_query_string_in_uuid = true
    expect(storage.exists?(p_other)).to be_falsey
  end
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'mongo'
|
4
|
+
require 'polipus/storage/mongo_store'
|
5
|
+
# Exercises Polipus::Storage.mongo_store against the '_test_pages' collection
# of the '_test_polipus' database on localhost:27017.
describe Polipus::Storage::MongoStore do
  before(:all) do
    @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
    @mongo['_test_pages'].drop
    @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
  end

  after(:all) do
    @mongo['_test_pages'].drop
  end

  # The collection is dropped after every example, so each example must
  # insert whatever documents it needs.
  after(:each) do
    @mongo['_test_pages'].drop
  end

  it 'should store a page' do
    page = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
    uuid = @storage.add page
    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
    expect(@storage.count).to be 1
    expect(@mongo['_test_pages'].count).to be 1
    page = @storage.get page
    expect(page.url.to_s).to eq('http://www.google.com')
    expect(page.body).to eq('<html></html>')
  end

  it 'should update a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    @storage.add page
    page = @storage.get page
    expect(page.code).to eq(301)
    expect(@mongo['_test_pages'].count).to be 1
  end

  it 'should iterate over stored pages' do
    # Seed the collection first: it is empty at the start of every example, so
    # without this the block below never runs and nothing is checked.
    @storage.add page_factory('http://www.google.com', code: 200, body: '<html></html>')
    visited = 0
    @storage.each do |k, page|
      visited += 1
      expect(k).to eq('ed646a3334ca891fd3467db131372140')
      expect(page.url.to_s).to eq('http://www.google.com')
    end
    expect(visited).to eq(1)
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    # Store the page before removing it so the removal is actually exercised.
    @storage.add page
    @storage.remove page
    expect(@storage.get(page)).to be_nil
    expect(@storage.count).to be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    expect(@storage.exists?(p_no_query)).to be_truthy
    @storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    expect(@storage.exists?(p_no_query)).to be_truthy
    @storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
    page.user_data.name = 'Test User Data'
    @storage.add page
    expect(@storage.exists?(page)).to be_truthy
    page = @storage.get(page)
    expect(page.user_data.name).to eq('Test User Data')
    @storage.remove page
  end

  it 'should honor the except parameters' do
    # A separate store built with ['body'] as the third argument; the page
    # read back from it is expected to have an empty body.
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
    page = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
    storage.add page
    page = storage.get page
    expect(page.body).to be_empty
    storage.clear
  end

  it 'should return false if a doc not exists' do
    @storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
    expect(@storage.exists?(p_other)).to be_falsey
    @storage.add p_other
    expect(@storage.exists?(p_other)).to be_truthy
    # Same host with a tracking query: matches only while the query string is
    # excluded from the uuid.
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
    expect(@storage.exists?(p_other)).to be_truthy
    @storage.include_query_string_in_uuid = true
    expect(@storage.exists?(p_other)).to be_falsey
  end

  it 'should set page.fetched_at based on the id creation' do
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
    page = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
    storage.add page
    # The in-memory page is untouched by add; only the reloaded copy is
    # expected to carry fetched_at.
    expect(page.fetched_at).to be_nil
    page = storage.get page
    expect(page.fetched_at).not_to be_nil
  end

  it 'should NOT set page.fetched_at if already present' do
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
    page = page_factory 'http://www.user-doojooo.com', code: 200, body: '<html></html>'
    page.fetched_at = 10
    storage.add page
    page = storage.get page
    expect(page.fetched_at).to be 10
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'polipus/storage/rethink_store'
|
4
|
+
|
5
|
+
# Exercises Polipus::Storage.rethink_store against the 'test_pages' table of
# the 'polipus_spec' database on localhost:28015.
describe Polipus::Storage::RethinkStore do
  before(:all) do
    @r = RethinkDB::RQL.new
    @rethink = @r.connect(host: 'localhost', port: 28_015, db: 'polipus_spec')
    # Create the spec database on first run only.
    @r.db_create('polipus_spec').run(@rethink) unless @r.db_list.run(@rethink).include?('polipus_spec')
    @table = 'test_pages'
    @storage = Polipus::Storage.rethink_store(@rethink, @table)
  end

  # The table is emptied after every example, so each example must insert
  # whatever rows it needs.
  after(:each) do
    @r.table(@table).delete.run(@rethink)
  end

  it 'should store a page' do
    page = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
    uuid = @storage.add page
    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
    expect(@storage.count).to eq(1)
    expect(@r.table(@table).count.run(@rethink)).to eq(1)
    page = @storage.get page
    expect(page.url.to_s).to eq('http://www.google.com')
    expect(page.body).to eq('<html></html>')
  end

  it 'should update a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    @storage.add page
    page = @storage.get page
    expect(page.code).to eq(301)
    expect(@r.table(@table).count.run(@rethink)).to eq(1)
  end

  it 'should iterate over stored pages' do
    # Seed the table first: it is empty at the start of every example, so
    # without this the block below never runs and nothing is checked.
    @storage.add page_factory('http://www.google.com', code: 200, body: '<html></html>')
    visited = 0
    @storage.each do |k, page|
      visited += 1
      expect(k).to eq('ed646a3334ca891fd3467db131372140')
      expect(page.url.to_s).to eq('http://www.google.com')
    end
    expect(visited).to eq(1)
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
    # Store the page before removing it so the removal is actually exercised.
    @storage.add page
    @storage.remove page
    expect(@storage.get(page)).to be_nil
    expect(@storage.count).to be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    expect(@storage.exists?(p_no_query)).to be_truthy
    @storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    expect(@storage.exists?(p_no_query)).to be_truthy
    @storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
    page.user_data.name = 'Test User Data'
    @storage.add page
    expect(@storage.exists?(page)).to be_truthy
    page = @storage.get(page)
    expect(page.user_data.name).to eq('Test User Data')
    @storage.remove page
  end

  it 'should honor the except parameters' do
    # A separate store built with ['body'] as the third argument; the page
    # read back from it is expected to have an empty body.
    storage = Polipus::Storage.rethink_store(@rethink, @table, ['body'])
    page = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
    storage.add page
    page = storage.get page
    expect(page.body).to be_empty
    storage.clear
  end

  it 'should return false if a doc not exists' do
    @storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
    expect(@storage.exists?(p_other)).to be_falsey
    @storage.add p_other
    expect(@storage.exists?(p_other)).to be_truthy
    # Same host with a tracking query: matches only while the query string is
    # excluded from the uuid.
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
    expect(@storage.exists?(p_other)).to be_truthy
    @storage.include_query_string_in_uuid = true
    expect(@storage.exists?(p_other)).to be_falsey
  end

  it 'should set page.fetched_at based on the id creation' do
    storage = Polipus::Storage.rethink_store(@rethink, @table)
    page = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
    storage.add page
    # The in-memory page is untouched by add; only the reloaded copy is
    # expected to carry fetched_at.
    expect(page.fetched_at).to be_nil
    page = storage.get page
    expect(page.fetched_at).not_to be_nil
  end

  it 'should NOT set page.fetched_at if already present' do
    storage = Polipus::Storage.rethink_store(@rethink, @table)
    page = page_factory 'http://www.user-doojooo.com', code: 200, body: '<html></html>'
    page.fetched_at = 10
    storage.add page
    page = storage.get page
    expect(page.fetched_at).to be 10
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'polipus/url_tracker'
|
4
|
+
|
5
|
+
# Smoke-tests the two trackers built by Polipus::UrlTracker (bloomfilter and
# redis_set): a URL passed to #visit is reported by #visited?, another is not.
describe Polipus::UrlTracker do
  before(:all) do
    @bf = Polipus::UrlTracker.bloomfilter
    @set = Polipus::UrlTracker.redis_set
  end

  after(:all) do
    [@bf, @set].each(&:clear)
  end

  # Both trackers get the same example; the label keeps the original
  # example descriptions.
  { 'bf' => :@bf, 'redis_set' => :@set }.each do |label, ivar|
    it "should work (#{label})" do
      tracker = instance_variable_get(ivar)
      seen_url = 'http://www.asd.com/asd/lol'

      tracker.visit seen_url

      expect(tracker.visited?(seen_url)).to be_truthy
      expect(tracker.visited?('http://www.google.com')).to be_falsey
    end
  end
end
|