polipus 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +37 -0
- data/.travis.yml +2 -1
- data/CHANGELOG.md +20 -0
- data/README.md +10 -0
- data/Rakefile +4 -4
- data/examples/basic.rb +16 -19
- data/examples/incremental.rb +17 -17
- data/examples/robots_txt_handling.rb +1 -1
- data/examples/survival.rb +3 -3
- data/lib/polipus.rb +186 -229
- data/lib/polipus/http.rb +41 -42
- data/lib/polipus/page.rb +33 -34
- data/lib/polipus/plugin.rb +2 -2
- data/lib/polipus/plugins/cleaner.rb +7 -8
- data/lib/polipus/plugins/sample.rb +6 -9
- data/lib/polipus/plugins/sleeper.rb +7 -8
- data/lib/polipus/queue_overflow.rb +11 -11
- data/lib/polipus/queue_overflow/base.rb +1 -1
- data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
- data/lib/polipus/queue_overflow/manager.rb +28 -25
- data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
- data/lib/polipus/robotex.rb +41 -51
- data/lib/polipus/signal_handler.rb +41 -0
- data/lib/polipus/storage.rb +11 -11
- data/lib/polipus/storage/base.rb +10 -8
- data/lib/polipus/storage/dev_null.rb +6 -7
- data/lib/polipus/storage/memory_store.rb +21 -22
- data/lib/polipus/storage/mongo_store.rb +34 -38
- data/lib/polipus/storage/s3_store.rb +33 -38
- data/lib/polipus/url_tracker.rb +3 -3
- data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
- data/lib/polipus/url_tracker/redis_set.rb +3 -4
- data/lib/polipus/version.rb +3 -3
- data/polipus.gemspec +12 -13
- data/spec/clear.rb +3 -3
- data/spec/http_spec.rb +27 -28
- data/spec/page_spec.rb +16 -16
- data/spec/polipus_spec.rb +34 -31
- data/spec/queue_overflow_manager_spec.rb +30 -28
- data/spec/queue_overflow_spec.rb +15 -15
- data/spec/robotex_spec.rb +9 -10
- data/spec/signal_handler_spec.rb +18 -0
- data/spec/spec_helper.rb +7 -6
- data/spec/storage_memory_spec.rb +18 -18
- data/spec/storage_mongo_spec.rb +19 -19
- data/spec/storage_s3_spec.rb +30 -31
- data/spec/url_tracker_spec.rb +7 -7
- metadata +7 -2
data/lib/polipus/version.rb
CHANGED
@@ -1,4 +1,4 @@
 module Polipus
-  VERSION = "0.3.0"
-  HOMEPAGE = "https://github.com/taganaka/polipus"
-end
+  VERSION = '0.3.1'
+  HOMEPAGE = 'https://github.com/taganaka/polipus'
+end
data/polipus.gemspec
CHANGED
@@ -1,25 +1,25 @@
 # -*- encoding: utf-8 -*-
-$:.push File.expand_path("../lib", __FILE__)
-require "polipus/version"
+$LOAD_PATH.push File.expand_path('../lib', __FILE__)
+require 'polipus/version'
 
 Gem::Specification.new do |s|
-  s.name = "polipus"
+  s.name = 'polipus'
   s.version = Polipus::VERSION
-  s.authors = ["Francesco Laurita"]
-  s.email = ["francesco.laurita@gmail.com"]
+  s.authors = ['Francesco Laurita']
+  s.email = ['francesco.laurita@gmail.com']
   s.homepage = Polipus::HOMEPAGE
-  s.summary = %q{Polipus distributed web-crawler framework}
-  s.description = %q{
+  s.summary = %q(Polipus distributed web-crawler framework)
+  s.description = %q(
   An easy to use distributed web-crawler framework based on Redis
-  }
-  s.licenses = ["MIT"]
+  )
+  s.licenses = ['MIT']
 
-  s.rubyforge_project = "polipus"
+  s.rubyforge_project = 'polipus'
 
   s.files = `git ls-files`.split("\n")
   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
-  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
-  s.require_paths = ["lib"]
+  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
+  s.require_paths = ['lib']
 
   s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.1'
   s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.3'
@@ -43,5 +43,4 @@ Gem::Specification.new do |s|
   s.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
   s.add_development_dependency 'coveralls'
 
-
 end
data/spec/clear.rb
CHANGED
@@ -1,4 +1,4 @@
-require "yaml"
+require 'yaml'
 Dir.glob('./cassettes/*.yml').each do|f|
   next unless f =~ /[a-f0-9]{32}/
   d = YAML.load_file(f)
@@ -6,6 +6,6 @@ Dir.glob('./cassettes/*.yml').each do|f|
     r['request'].delete('headers')
     r['response'].delete('headers')
   end
-  File.open(f, 'w') {|fw| fw.write(d.to_yaml) }
-  #puts d.to_yaml
+  File.open(f, 'w') { |fw| fw.write(d.to_yaml) }
+  # puts d.to_yaml
 end
data/spec/http_spec.rb
CHANGED
@@ -1,16 +1,16 @@
-require "spec_helper"
-require "mongo"
-require "polipus/http"
-require "polipus/page"
+require 'spec_helper'
+require 'mongo'
+require 'polipus/http'
+require 'polipus/page'
 
 describe Polipus::HTTP do
-
+
   it 'should download a page' do
     VCR.use_cassette('http_test') do
       http = Polipus::HTTP.new
-      page = http.fetch_page("http://sfbay.craigslist.org/apa/")
+      page = http.fetch_page('http://sfbay.craigslist.org/apa/')
       page.should be_an_instance_of(Polipus::Page)
-      page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
+      page.doc.search('title').text.strip.should eq 'SF bay area apts/housing for rent classifieds - craigslist'
       page.fetched_at.should_not be_nil
       page.fetched?.should be_true
     end
@@ -20,11 +20,11 @@ describe Polipus::HTTP do
     VCR.use_cassette('http_test_redirect') do
 
       http = Polipus::HTTP.new
-      page = http.fetch_page("http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis")
+      page = http.fetch_page('http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis')
 
       page.should be_an_instance_of(Polipus::Page)
       page.code.should be 200
-      page.url.to_s.should eq "http://greenbytes.de/tech/tc/httpredirects/300.txt"
+      page.url.to_s.should eq 'http://greenbytes.de/tech/tc/httpredirects/300.txt'
       page.body.strip.should eq "You have reached the target\r\nof a 300 redirect."
     end
   end
@@ -32,59 +32,58 @@ describe Polipus::HTTP do
   describe 'proxy settings' do
 
     it 'should set proxy correctly using a procedure' do
-      http = Polipus::HTTP.new(
-      http.proxy_host.should eq "127.0.0.0"
+      http = Polipus::HTTP.new(proxy_host: -> _con { '127.0.0.0' }, proxy_port: -> _con { 8080 })
+      http.proxy_host.should eq '127.0.0.0'
       http.proxy_port.should be 8080
     end
 
     it 'should set proxy correctly using shorthand method' do
-      http = Polipus::HTTP.new(
-      http.proxy_host_port.should eq ["127.0.0.0", 8080]
+      http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080] })
+      http.proxy_host_port.should eq ['127.0.0.0', 8080]
       http.proxy_port.should be 8080
-      http.proxy_host.should eq "127.0.0.0"
+      http.proxy_host.should eq '127.0.0.0'
     end
 
     it 'should set proxy settings' do
-      http = Polipus::HTTP.new(
+      http = Polipus::HTTP.new(proxy_host: '127.0.0.0', proxy_port: 8080)
       http.proxy_port.should be 8080
-      http.proxy_host.should eq "127.0.0.0"
+      http.proxy_host.should eq '127.0.0.0'
     end
 
   end
 
-
   describe 'compressed content handling' do
 
     it 'should decode gzip content' do
       VCR.use_cassette('gzipped_on') do
         http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
-        page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
+        page = http.fetch_page('http://www.whatsmyip.org/http-compression-test/')
         page.doc.css('.gzip_yes').should_not be_empty
       end
     end
 
     it 'should decode deflate content' do
       http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
-      page = http.fetch_page("http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http")
+      page = http.fetch_page('http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http')
       page.headers.fetch('content-encoding').first.should eq 'deflate'
-      page.body.include?("deflate-http").should be_true
+      page.body.include?('deflate-http').should be_true
     end
 
   end
 
   describe 'staled connections' do
-
+
    it 'should refresh a staled connection' do
      VCR.use_cassette('http_tconnection_max_hits') do
        http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))
        http.class.__send__(:attr_reader, :connections)
        http.class.__send__(:attr_reader, :connections_hits)
-        http.fetch_page("https://www.yahoo.com/")
+        http.fetch_page('https://www.yahoo.com/')
        http.connections['www.yahoo.com'][443].should_not be_nil
        old_conn = http.connections['www.yahoo.com'][443]
        http.connections_hits['www.yahoo.com'][443].should be 1
 
-        http.fetch_page("https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html")
+        http.fetch_page('https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html')
        http.connections_hits['www.yahoo.com'][443].should be 1
        http.connections['www.yahoo.com'][443].should_not be old_conn
      end
@@ -97,9 +96,9 @@ describe Polipus::HTTP do
    it 'should handle cookies correctly' do
      VCR.use_cassette('http_cookies') do
        http = Polipus::HTTP.new(accept_cookies: true)
-        http.fetch_page "http://www.whatarecookies.com/cookietest.asp"
+        http.fetch_page 'http://www.whatarecookies.com/cookietest.asp'
        http.accept_cookies?.should be_true
-        http.cookie_jar.cookies(URI("http://www.whatarecookies.com/cookietest.asp")).should_not be_empty
+        http.cookie_jar.cookies(URI('http://www.whatarecookies.com/cookietest.asp')).should_not be_empty
      end
    end
 
@@ -108,10 +107,10 @@ describe Polipus::HTTP do
   describe 'net errors' do
     it 'should handle net errors correctly' do
       VCR.use_cassette('http_errors') do
-        http = Polipus::HTTP.new(open_timeout:1, read_timeout: 1)
-        http.fetch_page(
+        http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1)
+        http.fetch_page('http://www.wrong-domain.lol/').error.should_not be_nil
       end
     end
   end
 
-end
+end
data/spec/page_spec.rb
CHANGED
@@ -1,5 +1,5 @@
-require "spec_helper"
-require "polipus/page"
+require 'spec_helper'
+require 'polipus/page'
 
 describe Polipus::Page do
   let(:page) do
@@ -14,29 +14,29 @@ describe Polipus::Page do
   </body>
 </html>
 EOF
-    Polipus::Page.new 'http://www.google.com/',
-
-
-
-
+    Polipus::Page.new 'http://www.google.com/',
+                      code: 200,
+                      body: body,
+                      headers: { 'content-type' => ['text/html'] },
+                      domain_aliases: %w(www.google.com google.com)
   end
 
   it 'should be fetched' do
     page.fetched?.should be_true
   end
-
+
   it 'should honor domain_aliases attribute' do
     page.links.count.should be 4
   end
 
   context 'page expiring' do
     let(:page) do
-      Polipus::Page.new 'http://www.google.com/',
-
-
-
-
-
+      Polipus::Page.new 'http://www.google.com/',
+                        code: 200,
+                        body: '',
+                        headers: { 'content-type' => ['text/html'] },
+                        domain_aliases: %w(www.google.com google.com),
+                        fetched_at: (Time.now.to_i - 30)
     end
 
     it 'should be marked at expired' do
@@ -49,12 +49,12 @@ EOF
     end
 
   context 'page error' do
-
+
     let(:page) do
       Polipus::Page.new 'http://www.google.com/', error: 'an error'
     end
 
-    it 'should serialize an error' do
+    it 'should serialize an error' do
      page.to_hash['error'].should eq 'an error'
    end
 
data/spec/polipus_spec.rb
CHANGED
@@ -1,11 +1,11 @@
-require "spec_helper"
+require 'spec_helper'
 
 describe Polipus::PolipusCrawler do
-  after(:each) {Redis.new(db:10).flushdb}
-  let(:p_options) {
+  after(:each) { Redis.new(db: 10).flushdb }
+  let(:p_options) do
     {
       workers: 1,
-      redis_options: {host: 'localhost', db:10},
+      redis_options: { host: 'localhost', db: 10 },
       depth_limit: 1,
       queue_timeout: 1,
       user_agent: 'polipus-rspec',
@@ -13,55 +13,58 @@ describe Polipus::PolipusCrawler do
       logger_level: Logger::DEBUG,
       storage: Polipus::Storage.memory_store
     }
-  }
-  let(:polipus) {
-    Polipus::PolipusCrawler.new("polipus-rspec", ["http://rubygems.org/gems"], p_options)
-  }
+  end
+  let(:polipus) do
+    Polipus::PolipusCrawler.new('polipus-rspec', ['http://rubygems.org/gems'], p_options)
+  end
 
-  let(:init_page) {
-    Polipus::Page.new "http://rubygems.org/gems"
-  }
+  let(:init_page)do
+    Polipus::Page.new 'http://rubygems.org/gems'
+  end
 
-  let(:logger){Logger.new(nil)}
+  let(:logger) { Logger.new(nil) }
 
-  context "polipus" do
+  context 'polipus' do
 
-    it "should create a polipus instance" do
+    it 'should create a polipus instance' do
      polipus.should be_an_instance_of Polipus::PolipusCrawler
    end
 
-    it "should execute a crawling session" do
+    it 'should execute a crawling session' do
      polipus.takeover
      polipus.storage.exists?(init_page).should be_true
      polipus.storage.get(init_page).links.count.should be polipus.storage.count
    end
 
-    it "should filter unwanted urls" do
+    it 'should filter unwanted urls' do
      polipus.skip_links_like(/\/pages\//)
      polipus.takeover
      polipus.storage.get(init_page).links
        .reject { |e| e.path.to_s =~ /\/pages\// }.count.should be polipus.storage.count
    end
 
-    it "should follow only wanted urls" do
      polipus.follow_links_like(/\/pages\//)
      polipus.follow_links_like(/\/gems$/)
      polipus.takeover
      polipus.storage.get(init_page).links
-        .reject { |e| ![/\/pages\//, /\/gems$/].any?{|p| e.path =~ p} }
+        .reject { |e| ![/\/pages\//, /\/gems$/].any? { |p| e.path =~ p } }
        .count.should be polipus.storage.count
    end
 
-    it "should refresh expired pages" do
+    it 'should refresh expired pages' do
      polipus.ttl_page = 3600
      polipus.takeover
-      polipus.storage.each
-
+      polipus.storage.each do |_id, page|
+        page.fetched_at = page.fetched_at - 3600
+        polipus.storage.add(page)
+      end
+      polipus.storage.each { |_id, page| page.expired?(3600).should be_true }
      polipus.takeover
-      polipus.storage.each {|id, page| page.expired?(3600).should be_false}
+      polipus.storage.each { |_id, page| page.expired?(3600).should be_false }
    end
 
-    it "should re-download seeder urls no matter what" do
+    it 'should re-download seeder urls no matter what' do
      cache_hit = {}
      polipus.follow_links_like(/\/gems$/)
      polipus.on_page_downloaded do |page|
@@ -70,26 +73,26 @@ describe Polipus::PolipusCrawler do
      end
      polipus.takeover
      polipus.takeover
-      cache_hit["http://rubygems.org/gems"].should be 2
+      cache_hit['http://rubygems.org/gems'].should be 2
    end
 
-    it "should call on_page_error code blocks when a page has error" do
-      p = Polipus::PolipusCrawler.new("polipus-rspec", ["http://dasd.adad.dom/"], p_options.merge(open_timeout: 1, read_timeout: 1))
+    it 'should call on_page_error code blocks when a page has error' do
+      p = Polipus::PolipusCrawler.new('polipus-rspec', ['http://dasd.adad.dom/'], p_options.merge(open_timeout: 1, read_timeout: 1))
      a_page = nil
-      p.on_page_error {|page| a_page = page}
+      p.on_page_error { |page| a_page = page }
      p.takeover
      a_page.should_not be_nil
      a_page.error.should_not be_nil
    end
 
-    it "should obey to the robots.txt file" do
+    it 'should obey to the robots.txt file' do
      lopt = p_options
      lopt[:obey_robots_txt] = true
-      polipus = Polipus::PolipusCrawler.new("polipus-rspec", ["https://rubygems.org/gems/polipus"], lopt)
+      polipus = Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
      polipus.depth_limit = 1
      polipus.takeover
-      polipus.storage.each {|
+      polipus.storage.each { |_id, page| (page.url.path =~ /$\/downloads\//).should be_false }
    end
 
  end
-end
+end
data/spec/queue_overflow_manager_spec.rb
CHANGED
@@ -1,20 +1,22 @@
-require "spec_helper"
-require "mongo"
-require "polipus/queue_overflow"
+require 'spec_helper'
+require 'mongo'
+require 'polipus/queue_overflow'
+require 'redis-queue'
 
 describe Polipus::QueueOverflow::Manager do
   before(:all) do
-    @mongo = Mongo::Connection.new(
+    @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
     @mongo['_test_pages'].drop
     @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
-    @redis_q = Redis::Queue.new(
-    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, "queue_test")
+    @redis_q = Redis::Queue.new('queue_test', 'bp_queue_test', redis: Redis.new)
+    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test')
     @redis = Redis.new
-    @polipus = flexmock("polipus")
+    @polipus = flexmock('polipus')
     @polipus.should_receive(:queue_overflow_adapter).and_return(@queue_overflow)
     @polipus.should_receive(:storage).and_return(@storage)
     @polipus.should_receive(:redis).and_return(@redis)
-    @polipus.should_receive(:job_name).and_return("___test")
+    @polipus.should_receive(:job_name).and_return('___test')
+    @polipus.should_receive(:logger).and_return(Logger.new(nil))
     @manager = Polipus::QueueOverflow::Manager.new(@polipus, @redis_q, 10)
   end
 
@@ -30,16 +32,16 @@ describe Polipus::QueueOverflow::Manager do
   end
 
   it 'should remove 10 items' do
-    @manager.perform.should be == [0,0]
-    20.times {|i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", :
+    @manager.perform.should be == [0, 0]
+    20.times { |i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }
     @manager.perform.should be == [10, 0]
     @queue_overflow.size.should be == 10
     @redis_q.size.should be == 10
   end
 
   it 'should restore 10 items' do
-    @manager.perform.should be == [0,0]
-    10.times {|i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", :
+    @manager.perform.should be == [0, 0]
+    10.times { |i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", code: 200, body: '<html></html>').to_json }
     @manager.perform.should be == [0, 10]
     @queue_overflow.size.should be == 0
     @redis_q.size.should be == 10
@@ -48,45 +50,45 @@ describe Polipus::QueueOverflow::Manager do
   end
 
   it 'should restore 3 items' do
-
-    @manager.perform.should be == [0,0]
-    3.times {|i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", :
+
+    @manager.perform.should be == [0, 0]
+    3.times { |i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>').to_json }
     @manager.perform.should be == [0, 3]
     @queue_overflow.size.should be == 0
     @redis_q.size.should be == 3
     @manager.perform.should be == [0, 0]
-
+
   end
 
   it 'should restore 0 items' do
-
-    @manager.perform.should be == [0,0]
-    10.times {|i|
-    p = page_factory("http://www.user-doo-bu.com/page_#{i}", :
+
+    @manager.perform.should be == [0, 0]
+    10.times do|i|
+      p = page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>')
      @storage.add p
-      @queue_overflow << p.to_json
-    }
+      @queue_overflow << p.to_json
+    end
    @manager.perform.should be == [0, 0]
    @queue_overflow.size.should be == 0
    @redis_q.size.should be == 0
    @manager.perform.should be == [0, 0]
-
+
  end
 
  it 'should filter an url based on the spec' do
    @queue_overflow.clear
    @redis_q.clear
-    10.times {|i| @queue_overflow << page_factory("http://www.user-doo.com/page_#{i}", :
+    10.times { |i| @queue_overflow << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }
    @manager.url_filter do |page|
-      page.url.to_s.end_with?("page_0") ? false : true
+      page.url.to_s.end_with?('page_0') ? false : true
    end
-    @manager.perform.should be == [0,9]
+    @manager.perform.should be == [0, 9]
    @queue_overflow.size.should be == 0
    @redis_q.size.should be == 9
-    @manager.url_filter do |page|
+    @manager.url_filter do |_page|
      true
    end
 
  end
 
-end
+end