polipus 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +55 -0
- data/README.rdoc +3 -0
- data/Rakefile +9 -0
- data/examples/basic.rb +58 -0
- data/examples/survival.rb +9 -0
- data/lib/polipus.rb +451 -0
- data/lib/polipus/http.rb +195 -0
- data/lib/polipus/page.rb +219 -0
- data/lib/polipus/plugin.rb +13 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +17 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +24 -0
- data/lib/polipus/queue_overflow/base.rb +6 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
- data/lib/polipus/queue_overflow/manager.rb +50 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +17 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/mongo_store.rb +86 -0
- data/lib/polipus/storage/s3_store.rb +100 -0
- data/lib/polipus/url_tracker.rb +20 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +4 -0
- data/polipus.gemspec +39 -0
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +11 -0
- data/spec/http_spec.rb +31 -0
- data/spec/page_spec.rb +22 -0
- data/spec/queue_overflow_manager_spec.rb +89 -0
- data/spec/queue_overflow_spec.rb +71 -0
- data/spec/spec_helper.rb +34 -0
- data/spec/storage_mongo_spec.rb +102 -0
- data/spec/storage_s3_spec.rb +115 -0
- data/spec/url_tracker_spec.rb +28 -0
- metadata +313 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept:
|
11
|
+
- ! '*/*'
|
12
|
+
User-Agent:
|
13
|
+
- Ruby
|
14
|
+
response:
|
15
|
+
status:
|
16
|
+
code: 300
|
17
|
+
message: Multiple Choices
|
18
|
+
headers:
|
19
|
+
Date:
|
20
|
+
- Mon, 10 Jun 2013 08:58:25 GMT
|
21
|
+
Server:
|
22
|
+
- Apache/2.2.22 (Ubuntu)
|
23
|
+
Cache-Control:
|
24
|
+
- no-cache
|
25
|
+
Location:
|
26
|
+
- http://greenbytes.de/tech/tc/httpredirects/300.txt
|
27
|
+
Content-Length:
|
28
|
+
- '27'
|
29
|
+
body:
|
30
|
+
encoding: US-ASCII
|
31
|
+
string: ! '300 Redirect Response Body
|
32
|
+
|
33
|
+
'
|
34
|
+
http_version:
|
35
|
+
recorded_at: Mon, 10 Jun 2013 08:58:25 GMT
|
36
|
+
- request:
|
37
|
+
method: get
|
38
|
+
uri: http://greenbytes.de/tech/tc/httpredirects/300.txt
|
39
|
+
body:
|
40
|
+
encoding: US-ASCII
|
41
|
+
string: ''
|
42
|
+
headers:
|
43
|
+
Accept:
|
44
|
+
- ! '*/*'
|
45
|
+
User-Agent:
|
46
|
+
- Ruby
|
47
|
+
response:
|
48
|
+
status:
|
49
|
+
code: 200
|
50
|
+
message: OK
|
51
|
+
headers:
|
52
|
+
Date:
|
53
|
+
- Mon, 10 Jun 2013 08:58:25 GMT
|
54
|
+
Server:
|
55
|
+
- Apache/2.2.22 (Ubuntu)
|
56
|
+
Last-Modified:
|
57
|
+
- Tue, 08 Jan 2013 17:31:05 GMT
|
58
|
+
Etag:
|
59
|
+
- ! '"b8306c-31-4d2ca4f7df2ca"'
|
60
|
+
Accept-Ranges:
|
61
|
+
- bytes
|
62
|
+
Content-Length:
|
63
|
+
- '49'
|
64
|
+
Content-Type:
|
65
|
+
- text/plain
|
66
|
+
body:
|
67
|
+
encoding: US-ASCII
|
68
|
+
string: ! "You have reached the target\r\nof a 300 redirect.\r\n"
|
69
|
+
http_version:
|
70
|
+
recorded_at: Mon, 10 Jun 2013 08:58:25 GMT
|
71
|
+
recorded_with: VCR 2.5.0
|
data/spec/clear.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require "yaml"

# Cassette files whose name contains 32 lowercase hexadecimal characters (the
# shape of an MD5 hex digest) are the only ones this script rewrites.
MD5_CASSETTE_NAME = /[a-f0-9]{32}/

# Strips the recorded request and response headers from every MD5-named VCR
# cassette found in +dir+, rewriting each matching file in place.
#
# @param dir [String] directory that holds the `*.yml` cassette files
# @return [Array<String>] paths of the cassettes that were rewritten
def strip_cassette_headers(dir)
  Dir.glob(File.join(dir, '*.yml')).select do |path|
    # Test the file name only: matching the whole path would let a hash-like
    # directory name drag hand-named cassettes (e.g. http_test.yml) in as well.
    next false unless File.basename(path) =~ MD5_CASSETTE_NAME

    cassette = YAML.load_file(path)
    # An empty cassette has no interactions; there is nothing to strip.
    interactions = cassette && cassette['http_interactions']
    next false unless interactions

    interactions.each do |interaction|
      interaction['request'].delete('headers')
      interaction['response'].delete('headers')
    end
    File.open(path, 'w') { |out| out.write(cassette.to_yaml) }
    true
  end
end

# Resolve the cassette directory relative to this script instead of the current
# working directory, so the script behaves the same wherever it is started from.
strip_cassette_headers(File.join(File.dirname(__FILE__), 'cassettes'))
|
data/spec/http_spec.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "mongo"
|
3
|
+
require "polipus/http"
|
4
|
+
require "polipus/page"
|
5
|
+
|
6
|
+
describe Polipus::HTTP do

  # Replays the "http_test" cassette and checks that the fetched result is a
  # Polipus::Page whose parsed document exposes the expected <title>.
  it 'should download a page' do
    VCR.use_cassette('http_test') do
      fetched = Polipus::HTTP.new.fetch_page("http://sfbay.craigslist.org/apa/")
      fetched.should be_an_instance_of(Polipus::Page)

      title = fetched.doc.search("title").text.strip
      title.should eq("SF bay area apts/housing for rent classifieds - craigslist")
    end
  end

  # The "http_test_redirect" cassette answers the first URL with a 300 carrying
  # a Location header, and the Location target with a 200; the returned page is
  # expected to describe that final 200 response.
  it 'should follow a redirect' do
    VCR.use_cassette('http_test_redirect') do
      start_url = "http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis"
      landed    = Polipus::HTTP.new.fetch_page(start_url)

      landed.should be_an_instance_of(Polipus::Page)
      landed.code.should eq(200)
      landed.url.to_s.should eq("http://greenbytes.de/tech/tc/httpredirects/300.txt")
      landed.body.strip.should eq("You have reached the target\r\nof a 300 redirect.")
    end
  end

end
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "polipus/page"
|
3
|
+
|
4
|
+
describe Polipus::Page do
  # The fixture holds five anchors: two relative ones, one per host listed in
  # domain_aliases, and one on not.google.com. The spec expects #links to report
  # four of them (presumably every link except the not.google.com one).
  it 'should honor domain_aliases attribute' do
    html = <<EOF
<html>
<body>
<a href="/page/1">1</a>
<a href="/page/2">2</a>
<a href="http://www.google.com/page/3">3</a>
<a href="http://google.com/page/3">4</a>
<a href="http://not.google.com/page/3">4</a>
</body>
</html>
EOF
    options = {
      :code           => 200,
      :body           => html,
      :headers        => {'content-type' => ['text/html']},
      :domain_aliases => %w(www.google.com google.com)
    }
    page = Polipus::Page.new('http://www.google.com/', options)

    page.links.count.should eq(4)
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "polipus/queue_overflow"
|
3
|
+
require "redis-queue"
|
4
|
+
|
5
|
+
# Exercises Polipus::QueueOverflow::Manager against a local MongoDB and Redis.
# Judging by the expectations below, #perform returns a two-element array:
# [items moved from the Redis queue into the overflow queue,
#  items restored from the overflow queue into the Redis queue].
describe Polipus::QueueOverflow::Manager do
  before(:all) do
    @mongo = Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('_test_polipus')
    @mongo['_test_pages'].drop
    @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
    @redis_q = Redis::Queue.new("queue_test","bp_queue_test", :redis => Redis.new())
    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, "queue_test")
    # Stand-in for the crawler: the manager is handed an object that only
    # answers #queue_overflow_adapter and #storage.
    @polipus = flexmock("polipus")
    @polipus.should_receive(:queue_overflow_adapter).and_return(@queue_overflow)
    @polipus.should_receive(:storage).and_return(@storage)
    # Third argument is 10; the examples below treat it as the number of items
    # the Redis queue may hold.
    @manager = Polipus::QueueOverflow::Manager.new(@polipus, @redis_q, 10)
  end

  # Every example starts from empty queues and an empty page storage.
  before(:each) do
    @queue_overflow.clear
    @redis_q.clear
    @storage.clear
  end

  after(:all) do
    @queue_overflow.clear
    @redis_q.clear
  end

  it 'should remove 10 items' do
    @manager.perform.should be == [0,0]
    20.times {|i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", :code => 200, :body => '<html></html>').to_json }
    @manager.perform.should be == [10, 0]
    @queue_overflow.size.should be == 10
    @redis_q.size.should be == 10
  end

  it 'should restore 10 items' do
    @manager.perform.should be == [0,0]
    10.times {|i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", :code => 200, :body => '<html></html>').to_json }
    @manager.perform.should be == [0, 10]
    @queue_overflow.size.should be == 0
    @redis_q.size.should be == 10
    @manager.perform.should be == [0, 0]
  end

  it 'should restore 3 items' do
    @manager.perform.should be == [0,0]
    3.times {|i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", :code => 200, :body => '<html></html>').to_json }
    @manager.perform.should be == [0, 3]
    @queue_overflow.size.should be == 0
    @redis_q.size.should be == 3
    @manager.perform.should be == [0, 0]
  end

  # Pages that are already present in the storage are expected to be dropped
  # from the overflow queue without being put back on the Redis queue.
  it 'should restore 0 items' do
    @manager.perform.should be == [0,0]
    10.times do |i|
      page = page_factory("http://www.user-doo-bu.com/page_#{i}", :code => 200, :body => '<html></html>')
      @storage.add page
      @queue_overflow << page.to_json
    end
    @manager.perform.should be == [0, 0]
    @queue_overflow.size.should be == 0
    @redis_q.size.should be == 0
    @manager.perform.should be == [0, 0]
  end

  it 'should filter an url based on the spec' do
    # No explicit clearing here: before(:each) has already emptied both queues.
    10.times {|i| @queue_overflow << page_factory("http://www.user-doo.com/page_#{i}", :code => 200, :body => '<html></html>').to_json }
    @manager.url_filter do |page|
      !page.url.to_s.end_with?("page_0")
    end
    begin
      @manager.perform.should be == [0,9]
      @queue_overflow.size.should be == 0
      @redis_q.size.should be == 9
    ensure
      # @manager is created once in before(:all) and shared by all examples.
      # Restore the accept-everything filter even when an expectation above
      # fails, so the restrictive filter cannot leak into other examples.
      @manager.url_filter { |_page| true }
    end
  end

end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "polipus/queue_overflow"
|
3
|
+
|
4
|
+
# Covers the three overflow queue flavours built in before(:all): a plain Mongo
# queue, a capped one (:max => 20) and one created with :ensure_uniq => true.
describe Polipus::QueueOverflow do

  before(:all) do
    @queue_overflow        = Polipus::QueueOverflow.mongo_queue(nil, "queue_test")
    @queue_overflow_capped = Polipus::QueueOverflow.mongo_queue_capped(nil, "queue_test_c", {:max => 20})
    @queue_overflow_uniq   = Polipus::QueueOverflow.mongo_queue(nil, "queue_test_u", {:ensure_uniq => true })
  end

  # The queues shared by the generic examples, in the order they are exercised.
  def every_queue
    [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq]
  end

  before(:each) do
    every_queue.each { |queue| queue.clear }
  end

  after(:all) do
    [@queue_overflow, @queue_overflow_uniq, @queue_overflow_capped].each { |queue| queue.clear }
  end

  it 'should work' do
    every_queue.each do |queue|
      # Empty queue: nothing to pop.
      queue.empty?.should be_true
      queue.pop.should be_nil

      # A single round trip.
      queue << "test"
      queue.size.should eq(1)
      queue.pop.should eq("test")

      # Back to the empty state.
      queue.empty?.should be_true
      queue.pop.should be_nil
      queue.size.should eq(0)
      queue.empty?.should be_true
    end
  end

  it 'should act as a queue' do
    every_queue.each do |queue|
      (0...10).each { |n| queue << "message_#{n}" }
      queue.size.should eq(10)
      # First in, first out.
      queue.pop.should eq("message_0")
    end
  end

  it 'should work with complex paylod' do
    every_queue.each do |queue|
      payload = {'a' => [1,2,3], 'b' => 'a_string'}
      queue << payload.to_json
      JSON.parse(queue.pop).should eq(payload)
    end
  end

  # With :max => 20, pushing 30 messages is expected to keep only the 20 most
  # recent ones, so the oldest survivor is message_10.
  it 'should honor max items if it is capped' do
    (0...30).each { |n| @queue_overflow_capped << "message_#{n}" }
    @queue_overflow_capped.size.should eq(20)
    @queue_overflow_capped.pop.should eq("message_10")
  end

  it 'should contains only unique items' do
    %w(A B).each do |item|
      20.times { @queue_overflow_uniq << item }
    end
    @queue_overflow_uniq.size.should eq(2)
  end

end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
require "digest/md5"
|
8
|
+
RSpec.configure do |rspec|
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus

  # Examples are executed in random order so that hidden order dependencies
  # show up. To replay a particular ordering while debugging, pass the seed
  # printed at the end of the run:
  #     --seed 1234
  rspec.order = 'random'
  rspec.mock_with :flexmock

  # Each example runs inside its own VCR cassette; the cassette is named after
  # the MD5 hex digest of the example's full description.
  rspec.around(:each) do |spec|
    cassette_name = Digest::MD5.hexdigest(spec.metadata[:full_description])
    VCR.use_cassette(cassette_name) { spec.run }
  end
end
|
25
|
+
require "vcr"
|
26
|
+
require "polipus"
|
27
|
+
# Cassettes live in the "cassettes" directory next to this helper, and HTTP
# traffic is intercepted through WebMock.
VCR.configure do |vcr|
  helper_dir = File.dirname(__FILE__)
  vcr.cassette_library_dir = "#{helper_dir}/cassettes"
  vcr.hook_into :webmock
end
|
31
|
+
|
32
|
+
# Spec helper that builds a Polipus::Page.
#
# @param url [String] address handed to Polipus::Page.new as first argument
# @param params [Hash] options forwarded untouched as second argument
#   (the specs pass keys such as :code and :body)
# @return [Polipus::Page] the newly built page
def page_factory(url, params = {})
  Polipus::Page.new(url, params)
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "mongo"
|
3
|
+
require "polipus/storage/mongo_store"
|
4
|
+
# Exercises the Mongo-backed page storage against a local MongoDB, using the
# '_test_pages' collection of the '_test_polipus' database.
describe Polipus::Storage::MongoStore do
  before(:all) do
    @mongo = Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('_test_polipus')
    @mongo['_test_pages'].drop
    @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
  end

  after(:all) do
    @mongo['_test_pages'].drop
  end

  # The collection is dropped after every example, so each example has to
  # store whatever page it wants to assert on.
  after(:each) do
    @mongo['_test_pages'].drop
  end

  it 'should store a page' do
    page = page_factory 'http://www.google.com', :code => 200, :body => '<html></html>'
    uuid = @storage.add page
    uuid.should be == 'ed646a3334ca891fd3467db131372140'
    @storage.count.should be 1
    @mongo['_test_pages'].count.should be 1
    page = @storage.get page
    page.url.to_s.should be == 'http://www.google.com'
    page.body.should be == '<html></html>'
  end

  it 'should update a page' do
    page = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
    @storage.add page
    page = @storage.get page
    page.code.should be == 301
    @mongo['_test_pages'].count.should be 1
  end

  it 'should iterate over stored pages' do
    # Store the page first: on the freshly dropped collection the block passed
    # to #each would never run and the expectations inside it would be vacuous.
    @storage.add page_factory('http://www.google.com', :code => 200, :body => '<html></html>')
    visited = 0
    @storage.each do |k, page|
      k.should be == "ed646a3334ca891fd3467db131372140"
      page.url.to_s.should be == 'http://www.google.com'
      visited += 1
    end
    # Proves the block above was actually invoked.
    visited.should be == 1
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
    # Add before removing (as the S3 store spec does); removing a page that was
    # never stored would make the expectations below pass trivially.
    @storage.add page
    @storage.remove page
    @storage.get(page).should be_nil
    @storage.count.should be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', :code => 200, :body => '<html></html>'
    page_other_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', :code => 200, :body => '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    @storage.exists?(page_other_query).should be_true
    @storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', :code => 200, :body => '<html></html>'
    page_no_query = page_factory 'http://www.asd.com', :code => 200, :body => '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    @storage.exists?(page_no_query).should be_true
    @storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', :code => 200, :body => '<html></html>'
    page.user_data.name = 'Test User Data'
    @storage.add page
    @storage.exists?(page).should be_true
    page = @storage.get(page)
    page.user_data.name.should be == 'Test User Data'
    @storage.remove page
  end

  # A store built with ['body'] as third argument is expected to hand back
  # pages with an empty body.
  it 'should honor the except parameters' do
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
    page = page_factory 'http://www.user-doo.com', :code => 200, :body => '<html></html>'
    storage.add page
    page = storage.get page
    page.body.should be_empty
    storage.clear
  end

  it 'should return false if a doc not exists' do
    @storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', :code => 200, :body => '<html></html>'
    @storage.exists?(p_other).should be_false
    @storage.add p_other
    @storage.exists?(p_other).should be_true
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', :code => 200, :body => '<html></html>'
    # With the query string ignored both URLs map to the same stored document...
    @storage.exists?(p_other).should be_true
    # ...and once it is taken into account again the second URL is unknown.
    @storage.include_query_string_in_uuid = true
    @storage.exists?(p_other).should be_false
  end

end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "aws/s3"
|
3
|
+
require "polipus/storage/s3_store"
|
4
|
+
# Exercises the S3-backed page storage. The order of storage calls inside each
# example is kept exactly as it was, since the spec helper wraps every example
# in a VCR cassette and recorded interactions depend on the requests issued.
describe Polipus::Storage::S3Store do

  # Placeholder credentials shared by every store built in this spec.
  def s3_credentials
    {
      :access_key_id     => 'XXXXXXX',
      :secret_access_key => 'XXXX'
    }
  end

  before(:each) do
    @storage = Polipus::Storage.s3_store('_test_pages', s3_credentials)
  end

  after(:each) {@storage.clear}

  it 'should store a page' do
    page = page_factory 'http://www.google.com', :code => 200, :body => '<html></html>'
    uuid = @storage.add page
    uuid.should be == 'ed646a3334ca891fd3467db131372140'
    @storage.count.should be 1
    page = @storage.get page
    page.url.to_s.should be == 'http://www.google.com'
    page.body.should be == '<html></html>'
    @storage.remove page
  end

  it 'should update a page' do
    page = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
    @storage.add page
    page = @storage.get page
    page.code.should be == 301
    @storage.count.should be == 1
    @storage.remove page
  end

  it 'should iterate over stored pages' do
    10.times {|i| @storage.add page_factory("http://www.google.com/p_#{i}", :code => 200, :body => "<html>#{i}</html>")}
    @storage.count.should be 10
    @storage.each do |k, _page|
      # Keys are expected to look like MD5 hex digests.
      k.should be =~ /[a-f0-9]{32}/
    end
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
    @storage.add page
    @storage.remove page
    @storage.get(page).should be_nil
    @storage.count.should be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', :code => 200, :body => '<html></html>'
    page_other_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', :code => 200, :body => '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    @storage.exists?(page_other_query).should be_true
    @storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', :code => 200, :body => '<html></html>'
    page_no_query = page_factory 'http://www.asd.com', :code => 200, :body => '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    @storage.exists?(page_no_query).should be_true
    @storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', :code => 200, :body => '<html></html>'
    page.user_data.name = 'Test User Data'
    @storage.add page
    @storage.exists?(page).should be_true
    page = @storage.get(page)
    page.user_data.name.should be == 'Test User Data'
    @storage.remove page
  end

  # A store built with ['body'] as third argument is expected to hand back
  # pages whose body is nil.
  it 'should honor the except parameters' do
    storage = Polipus::Storage.s3_store('_test_pages', s3_credentials, ['body'])
    page = page_factory 'http://www.user-doo.com', :code => 200, :body => '<html></html>'
    storage.add page
    page = storage.get page

    page.body.should be_nil
    storage.clear
  end

  it 'should return false if a doc not exists' do
    @storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', :code => 200, :body => '<html></html>'
    @storage.exists?(p_other).should be_false
    @storage.add p_other
    @storage.exists?(p_other).should be_true
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', :code => 200, :body => '<html></html>'
    # With the query string ignored both URLs map to the same stored object...
    @storage.exists?(p_other).should be_true
    # ...and once it is taken into account again the second URL is unknown.
    @storage.include_query_string_in_uuid = true
    @storage.exists?(p_other).should be_false
    @storage.remove p_other
  end

end
|