polipus 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +55 -0
- data/README.rdoc +3 -0
- data/Rakefile +9 -0
- data/examples/basic.rb +58 -0
- data/examples/survival.rb +9 -0
- data/lib/polipus.rb +451 -0
- data/lib/polipus/http.rb +195 -0
- data/lib/polipus/page.rb +219 -0
- data/lib/polipus/plugin.rb +13 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +17 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +24 -0
- data/lib/polipus/queue_overflow/base.rb +6 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
- data/lib/polipus/queue_overflow/manager.rb +50 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +17 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/mongo_store.rb +86 -0
- data/lib/polipus/storage/s3_store.rb +100 -0
- data/lib/polipus/url_tracker.rb +20 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +4 -0
- data/polipus.gemspec +39 -0
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +11 -0
- data/spec/http_spec.rb +31 -0
- data/spec/page_spec.rb +22 -0
- data/spec/queue_overflow_manager_spec.rb +89 -0
- data/spec/queue_overflow_spec.rb +71 -0
- data/spec/spec_helper.rb +34 -0
- data/spec/storage_mongo_spec.rb +102 -0
- data/spec/storage_s3_spec.rb +115 -0
- data/spec/url_tracker_spec.rb +28 -0
- metadata +313 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept:
|
11
|
+
- ! '*/*'
|
12
|
+
User-Agent:
|
13
|
+
- Ruby
|
14
|
+
response:
|
15
|
+
status:
|
16
|
+
code: 300
|
17
|
+
message: Multiple Choices
|
18
|
+
headers:
|
19
|
+
Date:
|
20
|
+
- Mon, 10 Jun 2013 08:58:25 GMT
|
21
|
+
Server:
|
22
|
+
- Apache/2.2.22 (Ubuntu)
|
23
|
+
Cache-Control:
|
24
|
+
- no-cache
|
25
|
+
Location:
|
26
|
+
- http://greenbytes.de/tech/tc/httpredirects/300.txt
|
27
|
+
Content-Length:
|
28
|
+
- '27'
|
29
|
+
body:
|
30
|
+
encoding: US-ASCII
|
31
|
+
string: ! '300 Redirect Response Body
|
32
|
+
|
33
|
+
'
|
34
|
+
http_version:
|
35
|
+
recorded_at: Mon, 10 Jun 2013 08:58:25 GMT
|
36
|
+
- request:
|
37
|
+
method: get
|
38
|
+
uri: http://greenbytes.de/tech/tc/httpredirects/300.txt
|
39
|
+
body:
|
40
|
+
encoding: US-ASCII
|
41
|
+
string: ''
|
42
|
+
headers:
|
43
|
+
Accept:
|
44
|
+
- ! '*/*'
|
45
|
+
User-Agent:
|
46
|
+
- Ruby
|
47
|
+
response:
|
48
|
+
status:
|
49
|
+
code: 200
|
50
|
+
message: OK
|
51
|
+
headers:
|
52
|
+
Date:
|
53
|
+
- Mon, 10 Jun 2013 08:58:25 GMT
|
54
|
+
Server:
|
55
|
+
- Apache/2.2.22 (Ubuntu)
|
56
|
+
Last-Modified:
|
57
|
+
- Tue, 08 Jan 2013 17:31:05 GMT
|
58
|
+
Etag:
|
59
|
+
- ! '"b8306c-31-4d2ca4f7df2ca"'
|
60
|
+
Accept-Ranges:
|
61
|
+
- bytes
|
62
|
+
Content-Length:
|
63
|
+
- '49'
|
64
|
+
Content-Type:
|
65
|
+
- text/plain
|
66
|
+
body:
|
67
|
+
encoding: US-ASCII
|
68
|
+
string: ! "You have reached the target\r\nof a 300 redirect.\r\n"
|
69
|
+
http_version:
|
70
|
+
recorded_at: Mon, 10 Jun 2013 08:58:25 GMT
|
71
|
+
recorded_with: VCR 2.5.0
|
data/spec/clear.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require "yaml"
|
2
|
+
# Maintenance script: removes the recorded request/response headers from the
# auto-named VCR cassettes and rewrites each cassette file in place.

# Deletes the 'headers' entry from the 'request' and 'response' sections of
# every interaction in a parsed cassette.
#
# @param cassette [Hash] parsed cassette; a missing 'http_interactions' list
#   or a missing request/response section is tolerated
# @return [Hash] the same (mutated) cassette object
def strip_cassette_headers(cassette)
  Array(cassette['http_interactions']).each do |interaction|
    %w(request response).each do |side|
      section = interaction[side]
      section.delete('headers') if section
    end
  end
  cassette
end

# Resolve the cassette directory from this file's location so the script does
# not depend on the current working directory.
Dir.glob(File.join(File.dirname(__FILE__), 'cassettes', '*.yml')).each do |path|
  # Only cassettes whose file name contains a 32-character hex run are
  # processed; explicitly named ones are skipped.
  next unless File.basename(path) =~ /[a-f0-9]{32}/

  cassette = YAML.load_file(path)
  # An empty or non-mapping YAML file has nothing to strip; leave it alone.
  next unless cassette.is_a?(Hash)

  File.open(path, 'w') { |out| out.write(strip_cassette_headers(cassette).to_yaml) }
end
|
data/spec/http_spec.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "mongo"
|
3
|
+
require "polipus/http"
|
4
|
+
require "polipus/page"
|
5
|
+
|
6
|
+
# Checks Polipus::HTTP#fetch_page against two named VCR cassettes.
describe Polipus::HTTP do
  it 'should download a page' do
    VCR.use_cassette('http_test') do
      listing_url = "http://sfbay.craigslist.org/apa/"
      fetched = Polipus::HTTP.new.fetch_page(listing_url)

      fetched.should be_an_instance_of(Polipus::Page)

      title = fetched.doc.search("title").text.strip
      title.should be == "SF bay area apts/housing for rent classifieds - craigslist"
    end
  end

  it 'should follow a redirect' do
    VCR.use_cassette('http_test_redirect') do
      redirecting_url = "http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis"
      target_url = "http://greenbytes.de/tech/tc/httpredirects/300.txt"

      fetched = Polipus::HTTP.new.fetch_page(redirecting_url)

      # The returned page is expected to describe the redirect target,
      # not the initial 300 response.
      fetched.should be_an_instance_of(Polipus::Page)
      fetched.code.should be == 200
      fetched.url.to_s.should be == target_url
      fetched.body.strip.should be == "You have reached the target\r\nof a 300 redirect."
    end
  end
end
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "polipus/page"
|
3
|
+
|
4
|
+
# Checks that Polipus::Page#links honours the :domain_aliases option.
describe Polipus::Page do
  it 'should honor domain_aliases attribute' do
    markup = <<EOF
<html>
<body>
<a href="/page/1">1</a>
<a href="/page/2">2</a>
<a href="http://www.google.com/page/3">3</a>
<a href="http://google.com/page/3">4</a>
<a href="http://not.google.com/page/3">4</a>
</body>
</html>
EOF
    page_options = {
      :code           => 200,
      :body           => markup,
      :headers        => {'content-type' => ['text/html']},
      :domain_aliases => ['www.google.com', 'google.com']
    }
    page = Polipus::Page.new('http://www.google.com/', page_options)

    # Five anchors are present; the expectation is that only the one on
    # not.google.com (outside the alias list) is left out.
    page.links.count.should be == 4
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "polipus/queue_overflow"
|
3
|
+
require "redis-queue"
|
4
|
+
|
5
|
+
# Exercises Polipus::QueueOverflow::Manager with a Redis-backed main queue, a
# Mongo-backed overflow queue and a Mongo page storage.
#
# Judging by the expectations below, #perform returns a two-element array:
# [items moved from the main queue into the overflow queue,
#  items restored from the overflow queue into the main queue].
# The third constructor argument (10) appears to be the main-queue item
# limit -- NOTE(review): confirm against the Manager implementation.
describe Polipus::QueueOverflow::Manager do
  before(:all) do
    @mongo = Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('_test_polipus')
    @mongo['_test_pages'].drop
    @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
    @redis_q = Redis::Queue.new("queue_test","bp_queue_test", :redis => Redis.new())
    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, "queue_test")
    @polipus = flexmock("polipus")
    @polipus.should_receive(:queue_overflow_adapter).and_return(@queue_overflow)
    @polipus.should_receive(:storage).and_return(@storage)
    @manager = Polipus::QueueOverflow::Manager.new(@polipus, @redis_q, 10)
  end

  before(:each) do
    @queue_overflow.clear
    @redis_q.clear
    @storage.clear
  end

  after(:all) do
    @queue_overflow.clear
    @redis_q.clear
    # Also drop the pages stored by the examples so nothing is left behind
    # in the shared test database.
    @storage.clear
  end

  it 'should remove 10 items' do
    @manager.perform.should be == [0,0]
    20.times {|i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", :code => 200, :body => '<html></html>').to_json }
    @manager.perform.should be == [10, 0]
    @queue_overflow.size.should be == 10
    @redis_q.size.should be == 10
  end

  it 'should restore 10 items' do
    @manager.perform.should be == [0,0]
    10.times {|i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", :code => 200, :body => '<html></html>').to_json }
    @manager.perform.should be == [0, 10]
    @queue_overflow.size.should be == 0
    @redis_q.size.should be == 10
    @manager.perform.should be == [0, 0]
  end

  it 'should restore 3 items' do
    @manager.perform.should be == [0,0]
    3.times {|i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", :code => 200, :body => '<html></html>').to_json }
    @manager.perform.should be == [0, 3]
    @queue_overflow.size.should be == 0
    @redis_q.size.should be == 3
    @manager.perform.should be == [0, 0]
  end

  it 'should restore 0 items' do
    @manager.perform.should be == [0,0]
    # Every overflowed page is also put in the storage; the expectations
    # below require that none of them is pushed back onto the main queue.
    10.times do |i|
      page = page_factory("http://www.user-doo-bu.com/page_#{i}", :code => 200, :body => '<html></html>')
      @storage.add page
      @queue_overflow << page.to_json
    end
    @manager.perform.should be == [0, 0]
    @queue_overflow.size.should be == 0
    @redis_q.size.should be == 0
    @manager.perform.should be == [0, 0]
  end

  it 'should filter an url based on the spec' do
    @queue_overflow.clear
    @redis_q.clear
    10.times {|i| @queue_overflow << page_factory("http://www.user-doo.com/page_#{i}", :code => 200, :body => '<html></html>').to_json }
    begin
      # Reject only the page whose URL ends with "page_0".
      @manager.url_filter do |page|
        !page.url.to_s.end_with?("page_0")
      end
      @manager.perform.should be == [0,9]
      @queue_overflow.size.should be == 0
      @redis_q.size.should be == 9
    ensure
      # The manager is shared by all examples (built in before(:all)), so the
      # accept-everything filter must be reinstalled even when an expectation
      # above fails; otherwise the restrictive filter leaks into other
      # examples under random ordering.
      @manager.url_filter do |_page|
        true
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "polipus/queue_overflow"
|
3
|
+
|
4
|
+
# Runs the same queue expectations against three Mongo-backed overflow queues:
# a plain one, a capped one (:max => 20) and one created with
# :ensure_uniq => true.
describe Polipus::QueueOverflow do
  before(:all) do
    @queue_overflow        = Polipus::QueueOverflow.mongo_queue(nil, "queue_test")
    @queue_overflow_capped = Polipus::QueueOverflow.mongo_queue_capped(nil, "queue_test_c", {:max => 20})
    @queue_overflow_uniq   = Polipus::QueueOverflow.mongo_queue(nil, "queue_test_u", {:ensure_uniq => true })
  end

  before(:each) do
    [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq].each { |queue| queue.clear }
  end

  after(:all) do
    [@queue_overflow, @queue_overflow_uniq, @queue_overflow_capped].each { |queue| queue.clear }
  end

  it 'should work' do
    [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq].each do |queue|
      # A fresh queue is empty and popping it yields nil.
      queue.empty?.should be_true
      queue.pop.should be_nil

      queue << "test"
      queue.size.should be == 1
      queue.pop.should be == "test"

      # After the single item is consumed the queue is empty again.
      queue.empty?.should be_true
      queue.pop.should be_nil
      queue.size.should be == 0
      queue.empty?.should be_true
    end
  end

  it 'should act as a queue' do
    [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq].each do |queue|
      (0...10).each { |n| queue << "message_#{n}" }
      queue.size.should be == 10
      # First in, first out.
      queue.pop.should be == "message_0"
    end
  end

  it 'should work with complex paylod' do
    [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq].each do |queue|
      payload = {'a' => [1,2,3], 'b' => 'a_string'}
      queue << payload.to_json
      raw = queue.pop
      JSON.parse(raw).should be == payload
    end
  end

  it 'should honor max items if it is capped' do
    (0...30).each { |n| @queue_overflow_capped << "message_#{n}" }
    @queue_overflow_capped.size.should be == 20
    # The ten oldest messages are expected to have been discarded.
    @queue_overflow_capped.pop.should be == "message_10"
  end

  it 'should contains only unique items' do
    ["A", "B"].each do |item|
      20.times { @queue_overflow_uniq << item }
    end
    @queue_overflow_uniq.size.should be == 2
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
require "digest/md5"
|
8
|
+
RSpec.configure do |rspec|
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus

  # Examples run in random order to expose order dependencies. To reproduce
  # a given ordering, pass the seed printed at the end of the run:
  #     --seed 1234
  rspec.order = 'random'
  rspec.mock_with :flexmock

  # Every example is wrapped in its own VCR cassette, named after the MD5
  # hex digest of the example's full description.
  rspec.around(:each) do |spec|
    cassette_name = Digest::MD5.hexdigest(spec.metadata[:full_description])
    VCR.use_cassette(cassette_name) { spec.run }
  end
end
require "vcr"
require "polipus"
VCR.configure do |vcr|
  # Cassettes are stored in the "cassettes" directory next to this file.
  vcr.cassette_library_dir = "#{File.dirname(__FILE__)}/cassettes"
  vcr.hook_into :webmock
end
|
31
|
+
|
32
|
+
# Builds a Polipus::Page for use in the specs.
#
# @param url [String] address passed to Polipus::Page.new as its first argument
# @param params [Hash] forwarded unchanged as the second argument (the specs
#   supply keys such as :code and :body); defaults to an empty hash
# @return [Polipus::Page] the newly constructed page
def page_factory(url, params = {})
  Polipus::Page.new(url, params)
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "mongo"
|
3
|
+
require "polipus/storage/mongo_store"
|
4
|
+
# Exercises the Mongo-backed page storage against the '_test_pages'
# collection of the '_test_polipus' database on localhost. The collection is
# dropped after every example, so each example must store the pages it needs.
describe Polipus::Storage::MongoStore do
  before(:all) do
    @mongo = Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('_test_polipus')
    @mongo['_test_pages'].drop
    @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
  end

  after(:all) do
    @mongo['_test_pages'].drop
  end

  after(:each) do
    @mongo['_test_pages'].drop
  end

  it 'should store a page' do
    page = page_factory 'http://www.google.com', :code => 200, :body => '<html></html>'
    uuid = @storage.add page
    uuid.should be == 'ed646a3334ca891fd3467db131372140'
    @storage.count.should be 1
    @mongo['_test_pages'].count.should be 1
    stored = @storage.get page
    stored.url.to_s.should be == 'http://www.google.com'
    stored.body.should be == '<html></html>'
  end

  it 'should update a page' do
    page = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
    @storage.add page
    stored = @storage.get page
    stored.code.should be == 301
    @mongo['_test_pages'].count.should be 1
  end

  it 'should iterate over stored pages' do
    # Store a page first: the collection is empty at the start of every
    # example, and iterating an empty store would never run the expectations.
    @storage.add page_factory('http://www.google.com', :code => 200, :body => '<html></html>')
    visited = 0
    @storage.each do |k, page|
      visited += 1
      k.should be == "ed646a3334ca891fd3467db131372140"
      page.url.to_s.should be == 'http://www.google.com'
    end
    visited.should be == 1
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
    # Add before removing so the example checks an actual deletion.
    @storage.add page
    @storage.remove page
    @storage.get(page).should be_nil
    @storage.count.should be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', :code => 200, :body => '<html></html>'
    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', :code => 200, :body => '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    @storage.exists?(p_no_query).should be_true
    @storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', :code => 200, :body => '<html></html>'
    p_no_query = page_factory 'http://www.asd.com', :code => 200, :body => '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    @storage.exists?(p_no_query).should be_true
    @storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', :code => 200, :body => '<html></html>'
    page.user_data.name = 'Test User Data'
    @storage.add page
    @storage.exists?(page).should be_true
    stored = @storage.get(page)
    stored.user_data.name.should be == 'Test User Data'
    @storage.remove stored
  end

  it 'should honor the except parameters' do
    # A separate store on the same collection, built with ['body'] as the
    # list of excluded fields.
    storage = Polipus::Storage.mongo_store(@mongo, '_test_pages', ['body'])
    page = page_factory 'http://www.user-doo.com', :code => 200, :body => '<html></html>'
    storage.add page
    stored = storage.get page
    stored.body.should be_empty
    storage.clear
  end

  it 'should return false if a doc not exists' do
    @storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', :code => 200, :body => '<html></html>'
    @storage.exists?(p_other).should be_false
    @storage.add p_other
    @storage.exists?(p_other).should be_true
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', :code => 200, :body => '<html></html>'
    @storage.exists?(p_other).should be_true
    # With the query string part of the uuid again, the URL carrying a query
    # no longer matches the stored page.
    @storage.include_query_string_in_uuid = true
    @storage.exists?(p_other).should be_false
  end
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "aws/s3"
|
3
|
+
require "polipus/storage/s3_store"
|
4
|
+
# Exercises the S3-backed page storage using the '_test_pages' bucket name and
# placeholder credentials. A fresh store is built before every example and
# cleared afterwards.
describe Polipus::Storage::S3Store do
  before(:each) do
    @storage = Polipus::Storage.s3_store(
      '_test_pages',
      {
        :access_key_id => 'XXXXXXX',
        :secret_access_key => 'XXXX'
      }
    )
  end

  after(:each) { @storage.clear }

  it 'should store a page' do
    page = page_factory 'http://www.google.com', :code => 200, :body => '<html></html>'
    uuid = @storage.add page
    uuid.should be == 'ed646a3334ca891fd3467db131372140'
    @storage.count.should be 1
    stored = @storage.get page
    stored.url.to_s.should be == 'http://www.google.com'
    stored.body.should be == '<html></html>'
    @storage.remove stored
  end

  it 'should update a page' do
    page = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
    @storage.add page
    stored = @storage.get page
    stored.code.should be == 301
    @storage.count.should be == 1
    @storage.remove stored
  end

  it 'should iterate over stored pages' do
    10.times {|i| @storage.add page_factory("http://www.google.com/p_#{i}", :code => 200, :body => "<html>#{i}</html>")}
    @storage.count.should be 10
    @storage.each do |k, page|
      k.should be =~ /[a-f0-9]{32}/
    end
  end

  it 'should delete a page' do
    page = page_factory 'http://www.google.com', :code => 301, :body => '<html></html>'
    @storage.add page
    @storage.remove page
    @storage.get(page).should be_nil
    @storage.count.should be 0
  end

  it 'should store a page removing a query string from the uuid generation' do
    page = page_factory 'http://www.asd.com/?asd=lol', :code => 200, :body => '<html></html>'
    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', :code => 200, :body => '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    @storage.exists?(p_no_query).should be_true
    @storage.remove page
  end

  it 'should store a page removing a query string from the uuid generation no ending slash' do
    page = page_factory 'http://www.asd.com?asd=lol', :code => 200, :body => '<html></html>'
    p_no_query = page_factory 'http://www.asd.com', :code => 200, :body => '<html></html>'
    @storage.include_query_string_in_uuid = false
    @storage.add page
    @storage.exists?(p_no_query).should be_true
    @storage.remove page
  end

  it 'should store a page with user data associated' do
    page = page_factory 'http://www.user.com', :code => 200, :body => '<html></html>'
    page.user_data.name = 'Test User Data'
    @storage.add page
    @storage.exists?(page).should be_true
    stored = @storage.get(page)
    stored.user_data.name.should be == 'Test User Data'
    @storage.remove stored
  end

  it 'should honor the except parameters' do
    # A separate store on the same bucket name, built with ['body'] as the
    # list of excluded fields.
    storage = Polipus::Storage.s3_store(
      '_test_pages',
      {
        :access_key_id => 'XXXXXXX',
        :secret_access_key => 'XXXX'
      },
      ['body']
    )
    page = page_factory 'http://www.user-doo.com', :code => 200, :body => '<html></html>'
    storage.add page
    stored = storage.get page

    stored.body.should be_nil
    storage.clear
  end

  it 'should return false if a doc not exists' do
    @storage.include_query_string_in_uuid = false
    p_other = page_factory 'http://www.asdrrrr.com', :code => 200, :body => '<html></html>'
    @storage.exists?(p_other).should be_false
    @storage.add p_other
    @storage.exists?(p_other).should be_true
    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', :code => 200, :body => '<html></html>'
    @storage.exists?(p_other).should be_true
    # With the query string part of the uuid again, the URL carrying a query
    # no longer matches the stored page.
    @storage.include_query_string_in_uuid = true
    @storage.exists?(p_other).should be_false
    @storage.remove p_other
  end
end
|