parallel588_polipus 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +33 -0
- data/.travis.yml +22 -0
- data/AUTHORS.md +5 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +70 -0
- data/Rakefile +8 -0
- data/examples/basic.rb +63 -0
- data/examples/error_handling.rb +23 -0
- data/examples/incremental.rb +63 -0
- data/examples/robots_txt_handling.rb +14 -0
- data/examples/survival.rb +10 -0
- data/lib/polipus.rb +488 -0
- data/lib/polipus/http.rb +282 -0
- data/lib/polipus/page.rb +256 -0
- data/lib/polipus/plugin.rb +14 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +15 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +26 -0
- data/lib/polipus/queue_overflow/base.rb +7 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
- data/lib/polipus/queue_overflow/manager.rb +57 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/robotex.rb +145 -0
- data/lib/polipus/signal_handler.rb +42 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +20 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +90 -0
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/url_tracker.rb +21 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +5 -0
- data/polipus.gemspec +44 -0
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/cassettes/gzipped_on.yml +147 -0
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +12 -0
- data/spec/polipus/http_spec.rb +139 -0
- data/spec/polipus/page_spec.rb +68 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
- data/spec/polipus/queue_overflow_spec.rb +66 -0
- data/spec/polipus/robotex_spec.rb +85 -0
- data/spec/polipus/signal_handler_spec.rb +15 -0
- data/spec/polipus/storage/memory_store_spec.rb +87 -0
- data/spec/polipus/storage/mongo_store_spec.rb +119 -0
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/polipus/url_tracker_spec.rb +29 -0
- data/spec/polipus_spec.rb +107 -0
- data/spec/spec_helper.rb +42 -0
- metadata +348 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept:
|
11
|
+
- ! '*/*'
|
12
|
+
User-Agent:
|
13
|
+
- Ruby
|
14
|
+
response:
|
15
|
+
status:
|
16
|
+
code: 300
|
17
|
+
message: Multiple Choices
|
18
|
+
headers:
|
19
|
+
Date:
|
20
|
+
- Mon, 10 Jun 2013 08:58:25 GMT
|
21
|
+
Server:
|
22
|
+
- Apache/2.2.22 (Ubuntu)
|
23
|
+
Cache-Control:
|
24
|
+
- no-cache
|
25
|
+
Location:
|
26
|
+
- http://greenbytes.de/tech/tc/httpredirects/300.txt
|
27
|
+
Content-Length:
|
28
|
+
- '27'
|
29
|
+
body:
|
30
|
+
encoding: US-ASCII
|
31
|
+
string: ! '300 Redirect Response Body
|
32
|
+
|
33
|
+
'
|
34
|
+
http_version:
|
35
|
+
recorded_at: Mon, 10 Jun 2013 08:58:25 GMT
|
36
|
+
- request:
|
37
|
+
method: get
|
38
|
+
uri: http://greenbytes.de/tech/tc/httpredirects/300.txt
|
39
|
+
body:
|
40
|
+
encoding: US-ASCII
|
41
|
+
string: ''
|
42
|
+
headers:
|
43
|
+
Accept:
|
44
|
+
- ! '*/*'
|
45
|
+
User-Agent:
|
46
|
+
- Ruby
|
47
|
+
response:
|
48
|
+
status:
|
49
|
+
code: 200
|
50
|
+
message: OK
|
51
|
+
headers:
|
52
|
+
Date:
|
53
|
+
- Mon, 10 Jun 2013 08:58:25 GMT
|
54
|
+
Server:
|
55
|
+
- Apache/2.2.22 (Ubuntu)
|
56
|
+
Last-Modified:
|
57
|
+
- Tue, 08 Jan 2013 17:31:05 GMT
|
58
|
+
Etag:
|
59
|
+
- ! '"b8306c-31-4d2ca4f7df2ca"'
|
60
|
+
Accept-Ranges:
|
61
|
+
- bytes
|
62
|
+
Content-Length:
|
63
|
+
- '49'
|
64
|
+
Content-Type:
|
65
|
+
- text/plain
|
66
|
+
body:
|
67
|
+
encoding: US-ASCII
|
68
|
+
string: ! "You have reached the target\r\nof a 300 redirect.\r\n"
|
69
|
+
http_version:
|
70
|
+
recorded_at: Mon, 10 Jun 2013 08:58:25 GMT
|
71
|
+
recorded_with: VCR 2.5.0
|
# encoding: UTF-8
require 'yaml'

# One-off maintenance script: strips the recorded request/response headers
# from every auto-named (32-hex-char) VCR cassette under ./cassettes and
# rewrites each file in place as YAML.
Dir.glob('./cassettes/*.yml').each do |path|
  # Only touch cassettes whose filename is an MD5-style digest.
  next unless path =~ /[a-f0-9]{32}/

  cassette = YAML.load_file(path)
  cassette['http_interactions'].each do |interaction|
    interaction['request'].delete('headers')
    interaction['response'].delete('headers')
  end
  File.write(path, cassette.to_yaml)
  # puts cassette.to_yaml
end
# encoding: UTF-8
require 'spec_helper'
require 'mongo'
require 'polipus/http'
require 'polipus/page'

# Specs for Polipus::HTTP, the fetcher layer. Network traffic is replayed
# through VCR cassettes wherever a cassette exists.
describe Polipus::HTTP do
  it 'should download a page' do
    VCR.use_cassette('http_test') do
      http = Polipus::HTTP.new
      page = http.fetch_page('http://sfbay.craigslist.org/apa/')
      expect(page).to be_an_instance_of(Polipus::Page)
      expect(page.doc.search('title').text.strip).to eq 'SF bay area apts/housing for rent classifieds - craigslist'
      expect(page.fetched_at).not_to be_nil
      expect(page.fetched?).to be_truthy
    end
  end

  it 'should follow a redirect' do
    VCR.use_cassette('http_test_redirect') do
      http = Polipus::HTTP.new
      page = http.fetch_page('http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis')

      expect(page).to be_an_instance_of(Polipus::Page)
      expect(page.code).to be 200
      # The page reported back is the redirect *target*, not the 300 response.
      expect(page.url.to_s).to eq 'http://greenbytes.de/tech/tc/httpredirects/300.txt'
      expect(page.body.strip).to eq "You have reached the target\r\nof a 300 redirect."
    end
  end

  describe 'proxy settings' do
    it 'should set proxy correctly using a procedure' do
      http = Polipus::HTTP.new(proxy_host: -> _con { '127.0.0.0' }, proxy_port: -> _con { 8080 })
      expect(http.proxy_host).to eq '127.0.0.0'
      expect(http.proxy_port).to be 8080
    end

    it 'should set proxy correctly using shorthand method' do
      http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080] })
      expect(http.proxy_host_port).to eq ['127.0.0.0', 8080]
    end

    it 'should set proxy w/ auth correctly using shorthand method' do
      http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080, 'a', 'b'] })
      expect(http.proxy_host_port).to eq ['127.0.0.0', 8080, 'a', 'b']
    end

    it 'should set proxy settings' do
      http = Polipus::HTTP.new(proxy_host: '127.0.0.0', proxy_port: 8080, proxy_user: 'a', proxy_pass: 'b')
      expect(http.proxy_port).to be 8080
      expect(http.proxy_host).to eq '127.0.0.0'
      expect(http.proxy_user).to eq 'a'
      expect(http.proxy_pass).to eq 'b'
    end
  end

  describe 'compressed content handling' do
    it 'should decode gzip content' do
      VCR.use_cassette('gzipped_on') do
        http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
        page = http.fetch_page('http://www.whatsmyip.org/http-compression-test/')
        expect(page.doc.css('.gzip_yes')).not_to be_empty
      end
    end

    # NOTE(review): no VCR cassette wraps this example, so it hits the live
    # network — flaky/offline-hostile; confirm whether a cassette was intended.
    it 'should decode deflate content' do
      http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
      page = http.fetch_page('http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http')
      expect(page.headers.fetch('content-encoding').first).to eq 'deflate'
      expect(page.body.include?('deflate-http')).to be_truthy
    end
  end

  describe 'staled connections' do
    it 'should refresh a staled connection' do
      VCR.use_cassette('http_tconnection_max_hits') do
        http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))
        # Expose the private connection pool for inspection.
        http.class.__send__(:attr_reader, :connections)
        http.class.__send__(:attr_reader, :connections_hits)
        http.fetch_page('https://www.yahoo.com/')
        expect(http.connections['www.yahoo.com'][443]).not_to be_nil
        old_conn = http.connections['www.yahoo.com'][443]
        expect(http.connections_hits['www.yahoo.com'][443]).to be 1

        http.fetch_page('https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html')
        # connection_max_hits: 1 means the second fetch gets a fresh connection.
        expect(http.connections_hits['www.yahoo.com'][443]).to be 1
        expect(http.connections['www.yahoo.com'][443]).not_to be old_conn
      end
    end
  end

  describe 'cookies' do
    it 'should handle cookies correctly' do
      VCR.use_cassette('http_cookies') do
        http = Polipus::HTTP.new(accept_cookies: true)
        http.fetch_page 'http://www.whatarecookies.com/cookietest.asp'
        expect(http.accept_cookies?).to be_truthy
        expect(http.cookie_jar.cookies(URI('http://www.whatarecookies.com/cookietest.asp'))).not_to be_empty
      end
    end
  end

  describe 'net errors' do
    it 'should handle net errors correctly' do
      VCR.use_cassette('http_errors') do
        http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1)
        expect(http.fetch_page('http://www.wrong-domain.lol/').error).not_to be_nil
      end
    end
  end

  describe 'random user_agent' do
    context 'when user_agent is string' do
      it '#user_agent' do
        http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: 'Googlebot')
        expect(http.user_agent).to eq('Googlebot')
      end
    end

    context 'when user_agent is list' do
      let(:user_agents) do
        ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
         "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
         "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
         "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
         "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
         "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
         "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)"
        ]
      end

      it '#user_agent' do
        http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1, user_agent: user_agents)
        # A list user_agent means one entry is picked from the list.
        expect(user_agents).to include(http.user_agent)
      end
    end
  end
end
# encoding: UTF-8
require 'spec_helper'
require 'polipus/page'

# Specs for Polipus::Page: link extraction, domain aliasing, expiry,
# error serialization and HTTP status classification.
describe Polipus::Page do
  let(:page) do
    body = <<EOF
<html>
<body>
<a href="/page/1">1</a>
<a href="/page/2">2</a>
<a href="http://www.google.com/page/3">3</a>
<a href="http://google.com/page/3">4</a>
<a href="http://not.google.com/page/3">4</a>
</body>
</html>
EOF
    Polipus::Page.new 'http://www.google.com/',
                      code: 200,
                      body: body,
                      headers: { 'content-type' => ['text/html'] },
                      domain_aliases: %w(www.google.com google.com)
  end

  it 'should be fetched' do
    expect(page.fetched?).to be_truthy
  end

  it 'should honor domain_aliases attribute' do
    # 5 anchors, but the aliased duplicate collapses them to 4 links.
    expect(page.links.count).to be 4
  end

  context 'page expiring' do
    let(:page) do
      Polipus::Page.new 'http://www.google.com/',
                        code: 200,
                        body: '',
                        headers: { 'content-type' => ['text/html'] },
                        domain_aliases: %w(www.google.com google.com),
                        fetched_at: (Time.now.to_i - 30)
    end

    it 'should be marked at expired' do
      # Fetched 30s ago, TTL 20s => expired.
      expect(page.expired?(20)).to be_truthy
    end

    it 'should NOT be marked at expired' do
      # Fetched 30s ago, TTL 60s => still fresh.
      expect(page.expired?(60)).to be_falsey
    end
  end

  context 'page error' do
    let(:page) do
      Polipus::Page.new 'http://www.google.com/', error: 'an error'
    end

    it 'should serialize an error' do
      expect(page.to_hash['error']).to eq 'an error'
    end
  end

  context 'page code' do
    it 'should identify HTTPSuccess code' do
      expect(Polipus::Page.new('http://www.google.com/', code: 201).success?).to be_truthy
      expect(Polipus::Page.new('http://www.google.com/', code: 404).success?).to be_falsey
    end
  end
end
# encoding: UTF-8
require 'spec_helper'
require 'mongo'
require 'polipus/queue_overflow'
require 'redis-queue'

# Specs for the overflow manager, which shuttles queued pages between the
# primary Redis queue and the MongoDB-backed overflow queue around a cap
# of 10 items. Requires live local MongoDB and Redis instances.
describe Polipus::QueueOverflow::Manager do
  before(:all) do
    @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
    @mongo['_test_pages'].drop
    @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
    @redis_q = Redis::Queue.new('queue_test', 'bp_queue_test', redis: Redis.new)
    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test')
    @redis = Redis.new
    # Fake just enough of the crawler object for the manager's needs.
    @polipus = flexmock('polipus')
    @polipus.should_receive(:queue_overflow_adapter).and_return(@queue_overflow)
    @polipus.should_receive(:storage).and_return(@storage)
    @polipus.should_receive(:redis).and_return(@redis)
    @polipus.should_receive(:job_name).and_return('___test')
    @polipus.should_receive(:logger).and_return(Logger.new(nil))
    @manager = Polipus::QueueOverflow::Manager.new(@polipus, @redis_q, 10)
  end

  before(:each) do
    @queue_overflow.clear
    @redis_q.clear
    @storage.clear
  end

  after(:all) do
    @queue_overflow.clear
    @redis_q.clear
  end

  it 'should remove 10 items' do
    expect(@manager.perform).to eq([0, 0])
    20.times { |i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }
    # 20 queued, cap is 10 => 10 get moved to overflow.
    expect(@manager.perform).to eq([10, 0])
    expect(@queue_overflow.size).to eq(10)
    expect(@redis_q.size).to eq(10)
  end

  it 'should restore 10 items' do
    expect(@manager.perform).to eq([0, 0])
    10.times { |i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", code: 200, body: '<html></html>').to_json }
    expect(@manager.perform).to eq([0, 10])
    expect(@queue_overflow.size).to eq(0)
    expect(@redis_q.size).to eq(10)
    expect(@manager.perform).to eq([0, 0])
  end

  it 'should restore 3 items' do
    expect(@manager.perform).to eq([0, 0])
    3.times { |i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>').to_json }
    expect(@manager.perform).to eq([0, 3])
    expect(@queue_overflow.size).to eq(0)
    expect(@redis_q.size).to eq(3)
    expect(@manager.perform).to eq([0, 0])
  end

  it 'should restore 0 items' do
    expect(@manager.perform).to eq([0, 0])
    # Pages already present in storage must not be re-queued.
    10.times do |i|
      page = page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>')
      @storage.add page
      @queue_overflow << page.to_json
    end
    expect(@manager.perform).to eq([0, 0])
    expect(@queue_overflow.size).to eq(0)
    expect(@redis_q.size).to eq(0)
    expect(@manager.perform).to eq([0, 0])
  end

  it 'should filter an url based on the spec' do
    @queue_overflow.clear
    @redis_q.clear
    10.times { |i| @queue_overflow << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }
    # Drop the single page whose URL ends with page_0.
    @manager.url_filter do |page|
      !page.url.to_s.end_with?('page_0')
    end
    expect(@manager.perform).to eq([0, 9])
    expect(@queue_overflow.size).to eq(0)
    expect(@redis_q.size).to eq(9)
    # Reset the filter so later examples are unaffected.
    @manager.url_filter do |_page|
      true
    end
  end
end
# encoding: UTF-8
require 'spec_helper'
require 'polipus/queue_overflow'

# Specs for the MongoDB-backed overflow queue adapters: plain, capped
# (bounded size, FIFO eviction) and uniq (deduplicating) variants.
# Requires a live local MongoDB instance.
describe Polipus::QueueOverflow do
  before(:all) do
    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test')
    @queue_overflow_capped = Polipus::QueueOverflow.mongo_queue_capped(nil, 'queue_test_c', max: 20)
    @queue_overflow_uniq = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test_u', ensure_uniq: true)
  end

  before(:each) do
    @queue_overflow.clear
    @queue_overflow_capped.clear
    @queue_overflow_uniq.clear
  end

  after(:all) do
    @queue_overflow.clear
    @queue_overflow_uniq.clear
    @queue_overflow_capped.clear
  end

  it 'should work' do
    all_queues = [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq]
    all_queues.each do |q|
      expect(q.empty?).to be_truthy
      expect(q.pop).to be_nil
      q << 'test'
      expect(q.size).to eq(1)
      expect(q.pop).to eq('test')
      expect(q.empty?).to be_truthy
      expect(q.pop).to be_nil
      expect(q.size).to eq(0)
      expect(q.empty?).to be_truthy
    end
  end

  it 'should act as a queue' do
    all_queues = [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq]
    all_queues.each do |q|
      10.times { |i| q << "message_#{i}" }
      expect(q.size).to eq(10)
      # FIFO: the first message in is the first one out.
      expect(q.pop).to eq('message_0')
    end
  end

  it 'should work with complex paylod' do
    all_queues = [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq]
    all_queues.each do |q|
      payload = { 'a' => [1, 2, 3], 'b' => 'a_string' }
      q << payload.to_json
      popped = q.pop
      expect(JSON.parse(popped)).to eq(payload)
    end
  end

  it 'should honor max items if it is capped' do
    30.times { |i| @queue_overflow_capped << "message_#{i}" }
    # Capped at 20: the 10 oldest entries were evicted.
    expect(@queue_overflow_capped.size).to eq(20)
    expect(@queue_overflow_capped.pop).to eq('message_10')
  end

  it 'should contains only unique items' do
    20.times { @queue_overflow_uniq << 'A' }
    20.times { @queue_overflow_uniq << 'B' }
    expect(@queue_overflow_uniq.size).to eq(2)
  end
end