parallel588_polipus 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +33 -0
- data/.travis.yml +22 -0
- data/AUTHORS.md +5 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +70 -0
- data/Rakefile +8 -0
- data/examples/basic.rb +63 -0
- data/examples/error_handling.rb +23 -0
- data/examples/incremental.rb +63 -0
- data/examples/robots_txt_handling.rb +14 -0
- data/examples/survival.rb +10 -0
- data/lib/polipus.rb +488 -0
- data/lib/polipus/http.rb +282 -0
- data/lib/polipus/page.rb +256 -0
- data/lib/polipus/plugin.rb +14 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +15 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +26 -0
- data/lib/polipus/queue_overflow/base.rb +7 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
- data/lib/polipus/queue_overflow/manager.rb +57 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/robotex.rb +145 -0
- data/lib/polipus/signal_handler.rb +42 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +20 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +90 -0
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/url_tracker.rb +21 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +5 -0
- data/polipus.gemspec +44 -0
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/cassettes/gzipped_on.yml +147 -0
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +12 -0
- data/spec/polipus/http_spec.rb +139 -0
- data/spec/polipus/page_spec.rb +68 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
- data/spec/polipus/queue_overflow_spec.rb +66 -0
- data/spec/polipus/robotex_spec.rb +85 -0
- data/spec/polipus/signal_handler_spec.rb +15 -0
- data/spec/polipus/storage/memory_store_spec.rb +87 -0
- data/spec/polipus/storage/mongo_store_spec.rb +119 -0
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/polipus/url_tracker_spec.rb +29 -0
- data/spec/polipus_spec.rb +107 -0
- data/spec/spec_helper.rb +42 -0
- metadata +348 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept:
|
11
|
+
- ! '*/*'
|
12
|
+
User-Agent:
|
13
|
+
- Ruby
|
14
|
+
response:
|
15
|
+
status:
|
16
|
+
code: 300
|
17
|
+
message: Multiple Choices
|
18
|
+
headers:
|
19
|
+
Date:
|
20
|
+
- Mon, 10 Jun 2013 08:58:25 GMT
|
21
|
+
Server:
|
22
|
+
- Apache/2.2.22 (Ubuntu)
|
23
|
+
Cache-Control:
|
24
|
+
- no-cache
|
25
|
+
Location:
|
26
|
+
- http://greenbytes.de/tech/tc/httpredirects/300.txt
|
27
|
+
Content-Length:
|
28
|
+
- '27'
|
29
|
+
body:
|
30
|
+
encoding: US-ASCII
|
31
|
+
string: ! '300 Redirect Response Body
|
32
|
+
|
33
|
+
'
|
34
|
+
http_version:
|
35
|
+
recorded_at: Mon, 10 Jun 2013 08:58:25 GMT
|
36
|
+
- request:
|
37
|
+
method: get
|
38
|
+
uri: http://greenbytes.de/tech/tc/httpredirects/300.txt
|
39
|
+
body:
|
40
|
+
encoding: US-ASCII
|
41
|
+
string: ''
|
42
|
+
headers:
|
43
|
+
Accept:
|
44
|
+
- ! '*/*'
|
45
|
+
User-Agent:
|
46
|
+
- Ruby
|
47
|
+
response:
|
48
|
+
status:
|
49
|
+
code: 200
|
50
|
+
message: OK
|
51
|
+
headers:
|
52
|
+
Date:
|
53
|
+
- Mon, 10 Jun 2013 08:58:25 GMT
|
54
|
+
Server:
|
55
|
+
- Apache/2.2.22 (Ubuntu)
|
56
|
+
Last-Modified:
|
57
|
+
- Tue, 08 Jan 2013 17:31:05 GMT
|
58
|
+
Etag:
|
59
|
+
- ! '"b8306c-31-4d2ca4f7df2ca"'
|
60
|
+
Accept-Ranges:
|
61
|
+
- bytes
|
62
|
+
Content-Length:
|
63
|
+
- '49'
|
64
|
+
Content-Type:
|
65
|
+
- text/plain
|
66
|
+
body:
|
67
|
+
encoding: US-ASCII
|
68
|
+
string: ! "You have reached the target\r\nof a 300 redirect.\r\n"
|
69
|
+
http_version:
|
70
|
+
recorded_at: Mon, 10 Jun 2013 08:58:25 GMT
|
71
|
+
recorded_with: VCR 2.5.0
|
data/spec/clear.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'yaml'
|
3
|
+
# Strips the recorded request and response headers from every cassette whose
# path contains a 32-character lowercase hex digest, rewriting each file in
# place.
#
# The glob below is relative, so the working directory is pinned to the
# directory holding this script for the duration of the run. Without that the
# script silently matches nothing when launched from anywhere other than the
# directory that contains `cassettes/`.
Dir.chdir(File.dirname(File.expand_path(__FILE__))) do
  Dir.glob('./cassettes/*.yml').each do |path|
    # Only digest-named cassettes are rewritten; any other file is left alone.
    next unless path =~ /[a-f0-9]{32}/

    cassette = YAML.load_file(path)
    cassette['http_interactions'].each do |interaction|
      interaction['request'].delete('headers')
      interaction['response'].delete('headers')
    end

    # Overwrite the cassette with the trimmed document.
    File.open(path, 'w') { |file| file.write(cassette.to_yaml) }
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'mongo'
|
4
|
+
require 'polipus/http'
|
5
|
+
require 'polipus/page'
|
6
|
+
|
7
|
+
# Specs for Polipus::HTTP: page download, redirect following, proxy options,
# compressed bodies, connection recycling, cookies, network errors and
# user-agent selection. Most examples wrap their requests in a VCR cassette.
describe Polipus::HTTP do
  it 'should download a page' do
    VCR.use_cassette('http_test') do
      http = Polipus::HTTP.new
      page = http.fetch_page('http://sfbay.craigslist.org/apa/')
      expect(page).to be_an_instance_of(Polipus::Page)
      expect(page.doc.search('title').text.strip).to eq 'SF bay area apts/housing for rent classifieds - craigslist'
      expect(page.fetched_at).not_to be_nil
      expect(page.fetched?).to be_truthy
    end
  end

  it 'should follow a redirect' do
    VCR.use_cassette('http_test_redirect') do
      http = Polipus::HTTP.new
      page = http.fetch_page('http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis')

      # The returned page must describe the redirect target, not the 300 response.
      expect(page).to be_an_instance_of(Polipus::Page)
      expect(page.code).to be 200
      expect(page.url.to_s).to eq 'http://greenbytes.de/tech/tc/httpredirects/300.txt'
      expect(page.body.strip).to eq "You have reached the target\r\nof a 300 redirect."
    end
  end

  describe 'proxy settings' do
    # Proxy options may be given as callables; the readers return the value
    # each callable produces.
    it 'should set proxy correctly using a procedure' do
      http = Polipus::HTTP.new(proxy_host: -> _con { '127.0.0.0' }, proxy_port: -> _con { 8080 })
      expect(http.proxy_host).to eq '127.0.0.0'
      expect(http.proxy_port).to be 8080
    end

    it 'should set proxy correctly using shorthand method' do
      http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080] })
      expect(http.proxy_host_port).to eq ['127.0.0.0', 8080]
    end

    it 'should set proxy w/ auth correctly using shorthand method' do
      http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080, 'a', 'b'] })
      expect(http.proxy_host_port).to eq ['127.0.0.0', 8080, 'a', 'b']
    end

    it 'should set proxy settings' do
      http = Polipus::HTTP.new(proxy_host: '127.0.0.0', proxy_port: 8080, proxy_user: 'a', proxy_pass: 'b')
      expect(http.proxy_port).to be 8080
      expect(http.proxy_host).to eq '127.0.0.0'
      expect(http.proxy_user).to eq 'a'
      expect(http.proxy_pass).to eq 'b'
    end
  end

  describe 'compressed content handling' do
    it 'should decode gzip content' do
      VCR.use_cassette('gzipped_on') do
        http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
        page = http.fetch_page('http://www.whatsmyip.org/http-compression-test/')
        expect(page.doc.css('.gzip_yes')).not_to be_empty
      end
    end

    # NOTE(review): unlike its siblings this example is not wrapped in a VCR
    # cassette, so it presumably depends on the live site - confirm against
    # the VCR configuration in spec_helper.
    it 'should decode deflate content' do
      http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
      page = http.fetch_page('http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http')
      expect(page.headers.fetch('content-encoding').first).to eq 'deflate'
      expect(page.body.include?('deflate-http')).to be_truthy
    end
  end

  describe 'staled connections' do
    it 'should refresh a staled connection' do
      VCR.use_cassette('http_tconnection_max_hits') do
        http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))

        # Expose the connection bookkeeping on this one instance only.
        # Defining the readers on Polipus::HTTP itself would permanently
        # change the class for every spec that runs afterwards.
        http.singleton_class.__send__(:attr_reader, :connections, :connections_hits)

        http.fetch_page('https://www.yahoo.com/')
        expect(http.connections['www.yahoo.com'][443]).not_to be_nil
        old_conn = http.connections['www.yahoo.com'][443]
        expect(http.connections_hits['www.yahoo.com'][443]).to be 1

        # With connection_max_hits set to 1, the second request is expected
        # to run on a new connection object with its hit counter back at 1.
        http.fetch_page('https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html')
        expect(http.connections_hits['www.yahoo.com'][443]).to be 1
        expect(http.connections['www.yahoo.com'][443]).not_to be old_conn
      end
    end
  end

  describe 'cookies' do
    it 'should handle cookies correctly' do
      VCR.use_cassette('http_cookies') do
        http = Polipus::HTTP.new(accept_cookies: true)
        http.fetch_page 'http://www.whatarecookies.com/cookietest.asp'
        expect(http.accept_cookies?).to be_truthy
        expect(http.cookie_jar.cookies(URI('http://www.whatarecookies.com/cookietest.asp'))).not_to be_empty
      end
    end
  end

  describe 'net errors' do
    # A failed fetch is expected to come back as a page carrying an error
    # rather than as a raised exception.
    it 'should handle net errors correctly' do
      VCR.use_cassette('http_errors') do
        http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1)
        expect(http.fetch_page('http://www.wrong-domain.lol/').error).not_to be_nil
      end
    end
  end

  describe 'random user_agent' do
    context 'when user_agent is string' do
      it '#user_agent' do
        http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: 'Googlebot')
        expect(http.user_agent).to eq('Googlebot')
      end
    end

    context 'when user_agent is list' do
      let(:user_agents) do
        [
          "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
          "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
          "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
          "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
          "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
          "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
          "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
          "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)"
        ]
      end

      # Any entry of the list is acceptable; only membership is checked.
      it '#user_agent' do
        http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: user_agents)
        expect(user_agents).to include(http.user_agent)
      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'polipus/page'
|
4
|
+
|
5
|
+
# Checks Polipus::Page objects built from in-memory data: fetched state, link
# counting with domain aliases, expiry, error serialization and success
# detection by status code.
describe Polipus::Page do
  let(:page) do
    markup = <<EOF
<html>
<body>
<a href="/page/1">1</a>
<a href="/page/2">2</a>
<a href="http://www.google.com/page/3">3</a>
<a href="http://google.com/page/3">4</a>
<a href="http://not.google.com/page/3">4</a>
</body>
</html>
EOF
    Polipus::Page.new(
      'http://www.google.com/',
      code: 200,
      body: markup,
      headers: { 'content-type' => ['text/html'] },
      domain_aliases: %w(www.google.com google.com)
    )
  end

  it('should be fetched') { expect(page.fetched?).to be_truthy }

  # The markup holds five anchors while four links are expected; presumably
  # the not.google.com one is the link that gets dropped.
  it('should honor domain_aliases attribute') { expect(page.links.count).to be 4 }

  context 'page expiring' do
    # A page stamped as fetched 30 seconds before the example runs.
    let(:page) do
      thirty_seconds_ago = Time.now.to_i - 30
      Polipus::Page.new(
        'http://www.google.com/',
        code: 200,
        body: '',
        headers: { 'content-type' => ['text/html'] },
        domain_aliases: %w(www.google.com google.com),
        fetched_at: thirty_seconds_ago
      )
    end

    it('should be marked at expired') { expect(page.expired?(20)).to be_truthy }

    it('should NOT be marked at expired') { expect(page.expired?(60)).to be_falsey }
  end

  context 'page error' do
    let(:page) { Polipus::Page.new('http://www.google.com/', error: 'an error') }

    it 'should serialize an error' do
      serialized = page.to_hash
      expect(serialized['error']).to eq 'an error'
    end
  end

  context 'page code' do
    it 'should identify HTTPSuccess code' do
      # Each status code is paired with the outcome success? must report.
      [[201, true], [404, false]].each do |status, successful|
        outcome = Polipus::Page.new('http://www.google.com/', code: status).success?
        expect(outcome).to(successful ? be_truthy : be_falsey)
      end
    end
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'mongo'
|
4
|
+
require 'polipus/queue_overflow'
|
5
|
+
require 'redis-queue'
|
6
|
+
|
7
|
+
# Specs for Polipus::QueueOverflow::Manager, driven against a Redis queue, a
# Mongo-backed overflow queue and a Mongo page store.
#
# #perform is compared against two-element arrays throughout. Judging by the
# example names the first element counts items moved out of the Redis queue
# and the second counts items restored into it - confirm against the Manager
# implementation.
describe Polipus::QueueOverflow::Manager do
  before(:all) do
    @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
    @mongo['_test_pages'].drop
    @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
    @redis_q = Redis::Queue.new('queue_test', 'bp_queue_test', redis: Redis.new)
    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test')
    @redis = Redis.new

    # Stand-in for the crawler object, exposing only the collaborators that
    # are stubbed below.
    @polipus = flexmock('polipus')
    @polipus.should_receive(:queue_overflow_adapter).and_return(@queue_overflow)
    @polipus.should_receive(:storage).and_return(@storage)
    @polipus.should_receive(:redis).and_return(@redis)
    @polipus.should_receive(:job_name).and_return('___test')
    @polipus.should_receive(:logger).and_return(Logger.new(nil))
    @manager = Polipus::QueueOverflow::Manager.new(@polipus, @redis_q, 10)
  end

  # Every example starts from empty queues and an empty page store.
  before(:each) do
    @queue_overflow.clear
    @redis_q.clear
    @storage.clear
  end

  # Leave nothing behind: the page store is cleared along with both queues,
  # mirroring what before(:each) resets.
  after(:all) do
    @queue_overflow.clear
    @redis_q.clear
    @storage.clear
  end

  it 'should remove 10 items' do
    expect(@manager.perform).to eq([0, 0])
    20.times { |i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }
    expect(@manager.perform).to eq([10, 0])
    expect(@queue_overflow.size).to eq(10)
    expect(@redis_q.size).to eq(10)
  end

  it 'should restore 10 items' do
    expect(@manager.perform).to eq([0, 0])
    10.times { |i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", code: 200, body: '<html></html>').to_json }
    expect(@manager.perform).to eq([0, 10])
    expect(@queue_overflow.size).to eq(0)
    expect(@redis_q.size).to eq(10)
    expect(@manager.perform).to eq([0, 0])
  end

  it 'should restore 3 items' do
    expect(@manager.perform).to eq([0, 0])
    3.times { |i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>').to_json }
    expect(@manager.perform).to eq([0, 3])
    expect(@queue_overflow.size).to eq(0)
    expect(@redis_q.size).to eq(3)
    expect(@manager.perform).to eq([0, 0])
  end

  # Every overflowed page is also added to the page store first; the
  # expectations are that the overflow queue is drained and nothing reaches
  # the Redis queue.
  it 'should restore 0 items' do
    expect(@manager.perform).to eq([0, 0])
    10.times do |i|
      page = page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>')
      @storage.add page
      @queue_overflow << page.to_json
    end
    expect(@manager.perform).to eq([0, 0])
    expect(@queue_overflow.size).to eq(0)
    expect(@redis_q.size).to eq(0)
    expect(@manager.perform).to eq([0, 0])
  end

  it 'should filter an url based on the spec' do
    10.times { |i| @queue_overflow << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }

    # Reject exactly one of the ten pages: the one whose URL ends in page_0.
    @manager.url_filter { |page| !page.url.to_s.end_with?('page_0') }
    begin
      expect(@manager.perform).to eq([0, 9])
      expect(@queue_overflow.size).to eq(0)
      expect(@redis_q.size).to eq(9)
    ensure
      # @manager is shared by every example, so put the accept-everything
      # filter back even when one of the expectations above fails.
      @manager.url_filter { |_page| true }
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'polipus/queue_overflow'
|
4
|
+
|
5
|
+
# Exercises the three overflow queue flavours built in before(:all): a plain
# Mongo queue, a capped one (max: 20) and one created with ensure_uniq: true.
describe Polipus::QueueOverflow do
  # The adapters under test, in the order the shared examples visit them.
  def adapters
    [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq]
  end

  before(:all) do
    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test')
    @queue_overflow_capped = Polipus::QueueOverflow.mongo_queue_capped(nil, 'queue_test_c', max: 20)
    @queue_overflow_uniq = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test_u', ensure_uniq: true)
  end

  # Start every example with all three queues emptied.
  before(:each) { adapters.each(&:clear) }

  after(:all) do
    [@queue_overflow, @queue_overflow_uniq, @queue_overflow_capped].each(&:clear)
  end

  it 'should work' do
    adapters.each do |queue|
      # Empty queue: nothing to pop.
      expect(queue.empty?).to be_truthy
      expect(queue.pop).to be_nil

      # One item in, the same item out.
      queue << 'test'
      expect(queue.size).to eq(1)
      expect(queue.pop).to eq('test')

      # Back to the empty state.
      expect(queue.empty?).to be_truthy
      expect(queue.pop).to be_nil
      expect(queue.size).to eq(0)
      expect(queue.empty?).to be_truthy
    end
  end

  it 'should act as a queue' do
    adapters.each do |queue|
      (0...10).each { |index| queue << "message_#{index}" }
      expect(queue.size).to eq(10)
      # The first message pushed is the first one popped.
      expect(queue.pop).to eq('message_0')
    end
  end

  it 'should work with complex paylod' do
    adapters.each do |queue|
      payload = { 'a' => [1, 2, 3], 'b' => 'a_string' }
      queue << payload.to_json
      popped = queue.pop
      expect(JSON.parse(popped)).to eq(payload)
    end
  end

  it 'should honor max items if it is capped' do
    # 30 messages go into a queue capped at 20; message_10 is then expected
    # to be the first one popped.
    (0...30).each { |index| @queue_overflow_capped << "message_#{index}" }
    expect(@queue_overflow_capped.size).to eq(20)
    expect(@queue_overflow_capped.pop).to eq('message_10')
  end

  it 'should contains only unique items' do
    %w(A B).each do |item|
      20.times { @queue_overflow_uniq << item }
    end
    expect(@queue_overflow_uniq.size).to eq(2)
  end
end
|