polipus 0.3.0 → 0.3.1
- checksums.yaml +8 -8
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +37 -0
- data/.travis.yml +2 -1
- data/CHANGELOG.md +20 -0
- data/README.md +10 -0
- data/Rakefile +4 -4
- data/examples/basic.rb +16 -19
- data/examples/incremental.rb +17 -17
- data/examples/robots_txt_handling.rb +1 -1
- data/examples/survival.rb +3 -3
- data/lib/polipus.rb +186 -229
- data/lib/polipus/http.rb +41 -42
- data/lib/polipus/page.rb +33 -34
- data/lib/polipus/plugin.rb +2 -2
- data/lib/polipus/plugins/cleaner.rb +7 -8
- data/lib/polipus/plugins/sample.rb +6 -9
- data/lib/polipus/plugins/sleeper.rb +7 -8
- data/lib/polipus/queue_overflow.rb +11 -11
- data/lib/polipus/queue_overflow/base.rb +1 -1
- data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
- data/lib/polipus/queue_overflow/manager.rb +28 -25
- data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
- data/lib/polipus/robotex.rb +41 -51
- data/lib/polipus/signal_handler.rb +41 -0
- data/lib/polipus/storage.rb +11 -11
- data/lib/polipus/storage/base.rb +10 -8
- data/lib/polipus/storage/dev_null.rb +6 -7
- data/lib/polipus/storage/memory_store.rb +21 -22
- data/lib/polipus/storage/mongo_store.rb +34 -38
- data/lib/polipus/storage/s3_store.rb +33 -38
- data/lib/polipus/url_tracker.rb +3 -3
- data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
- data/lib/polipus/url_tracker/redis_set.rb +3 -4
- data/lib/polipus/version.rb +3 -3
- data/polipus.gemspec +12 -13
- data/spec/clear.rb +3 -3
- data/spec/http_spec.rb +27 -28
- data/spec/page_spec.rb +16 -16
- data/spec/polipus_spec.rb +34 -31
- data/spec/queue_overflow_manager_spec.rb +30 -28
- data/spec/queue_overflow_spec.rb +15 -15
- data/spec/robotex_spec.rb +9 -10
- data/spec/signal_handler_spec.rb +18 -0
- data/spec/spec_helper.rb +7 -6
- data/spec/storage_memory_spec.rb +18 -18
- data/spec/storage_mongo_spec.rb +19 -19
- data/spec/storage_s3_spec.rb +30 -31
- data/spec/url_tracker_spec.rb +7 -7
- metadata +7 -2
data/lib/polipus/version.rb
CHANGED
@@ -1,4 +1,4 @@
 module Polipus
-  VERSION = "0.3.0"
-  HOMEPAGE = "https://github.com/taganaka/polipus"
-end
+  VERSION = '0.3.1'
+  HOMEPAGE = 'https://github.com/taganaka/polipus'
+end
data/polipus.gemspec
CHANGED
@@ -1,25 +1,25 @@
 # -*- encoding: utf-8 -*-
-
-require
+$LOAD_PATH.push File.expand_path('../lib', __FILE__)
+require 'polipus/version'

 Gem::Specification.new do |s|
-  s.name = "polipus"
+  s.name = 'polipus'
   s.version = Polipus::VERSION
-  s.authors = ["Francesco Laurita"]
-  s.email = ["francesco.laurita@gmail.com"]
+  s.authors = ['Francesco Laurita']
+  s.email = ['francesco.laurita@gmail.com']
   s.homepage = Polipus::HOMEPAGE
-  s.summary = %q
-  s.description = %q
+  s.summary = %q(Polipus distributed web-crawler framework)
+  s.description = %q(
   An easy to use distributed web-crawler framework based on Redis
-
-  s.licenses = ["MIT"]
+  )
+  s.licenses = ['MIT']

-  s.rubyforge_project = "polipus"
+  s.rubyforge_project = 'polipus'

   s.files = `git ls-files`.split("\n")
   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
-  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
-  s.require_paths = ["lib"]
+  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
+  s.require_paths = ['lib']

   s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.1'
   s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.3'

@@ -43,5 +43,4 @@ Gem::Specification.new do |s|
   s.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
   s.add_development_dependency 'coveralls'

-
 end
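The two lines added at the top of the gemspec are what let it read the version straight from the working tree: pushing lib/ onto `$LOAD_PATH` makes `require 'polipus/version'` resolve before the gem is installed, so `s.version` and `s.homepage` can reference the constants from version.rb above. A minimal sketch of the mechanism, runnable from a checkout of the repository:

# run from the repository root, mirroring the gemspec's first two lines
$LOAD_PATH.push File.expand_path('../lib', __FILE__)
require 'polipus/version'

puts Polipus::VERSION   # => "0.3.1"
puts Polipus::HOMEPAGE  # => "https://github.com/taganaka/polipus"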
data/spec/clear.rb
CHANGED
@@ -1,4 +1,4 @@
-require "yaml"
+require 'yaml'
 Dir.glob('./cassettes/*.yml').each do|f|
   next unless f =~ /[a-f0-9]{32}/
   d = YAML.load_file(f)

@@ -6,6 +6,6 @@ Dir.glob('./cassettes/*.yml').each do|f|
     r['request'].delete('headers')
     r['response'].delete('headers')
   end
-  File.open(f, 'w') {|fw| fw.write(d.to_yaml) }
-  #puts d.to_yaml
+  File.open(f, 'w') { |fw| fw.write(d.to_yaml) }
+  # puts d.to_yaml
 end
data/spec/http_spec.rb
CHANGED
@@ -1,16 +1,16 @@
-require "spec_helper"
-require "mongo"
-require "polipus/http"
-require "polipus/page"
+require 'spec_helper'
+require 'mongo'
+require 'polipus/http'
+require 'polipus/page'

 describe Polipus::HTTP do
-
+
   it 'should download a page' do
     VCR.use_cassette('http_test') do
       http = Polipus::HTTP.new
-      page = http.fetch_page("http://sfbay.craigslist.org/apa/")
+      page = http.fetch_page('http://sfbay.craigslist.org/apa/')
       page.should be_an_instance_of(Polipus::Page)
-      page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
+      page.doc.search('title').text.strip.should eq 'SF bay area apts/housing for rent classifieds - craigslist'
       page.fetched_at.should_not be_nil
       page.fetched?.should be_true
     end

@@ -20,11 +20,11 @@ describe Polipus::HTTP do
     VCR.use_cassette('http_test_redirect') do

       http = Polipus::HTTP.new
-      page = http.fetch_page("http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis")
+      page = http.fetch_page('http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis')

       page.should be_an_instance_of(Polipus::Page)
       page.code.should be 200
-      page.url.to_s.should eq "http://greenbytes.de/tech/tc/httpredirects/300.txt"
+      page.url.to_s.should eq 'http://greenbytes.de/tech/tc/httpredirects/300.txt'
       page.body.strip.should eq "You have reached the target\r\nof a 300 redirect."
     end
   end

@@ -32,59 +32,58 @@ describe Polipus::HTTP do
   describe 'proxy settings' do

     it 'should set proxy correctly using a procedure' do
-      http = Polipus::HTTP.new(
-      http.proxy_host.should eq "127.0.0.0"
+      http = Polipus::HTTP.new(proxy_host: -> _con { '127.0.0.0' }, proxy_port: -> _con { 8080 })
+      http.proxy_host.should eq '127.0.0.0'
       http.proxy_port.should be 8080
     end

     it 'should set proxy correctly using shorthand method' do
-      http = Polipus::HTTP.new(
-      http.proxy_host_port.should eq ["127.0.0.0", 8080]
+      http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080] })
+      http.proxy_host_port.should eq ['127.0.0.0', 8080]
       http.proxy_port.should be 8080
-      http.proxy_host.should eq "127.0.0.0"
+      http.proxy_host.should eq '127.0.0.0'
     end

     it 'should set proxy settings' do
-      http = Polipus::HTTP.new(
+      http = Polipus::HTTP.new(proxy_host: '127.0.0.0', proxy_port: 8080)
       http.proxy_port.should be 8080
-      http.proxy_host.should eq "127.0.0.0"
+      http.proxy_host.should eq '127.0.0.0'
     end

   end

-
   describe 'compressed content handling' do

     it 'should decode gzip content' do
       VCR.use_cassette('gzipped_on') do
         http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
-        page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
+        page = http.fetch_page('http://www.whatsmyip.org/http-compression-test/')
         page.doc.css('.gzip_yes').should_not be_empty
       end
     end

     it 'should decode deflate content' do
       http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
-      page = http.fetch_page("http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http")
+      page = http.fetch_page('http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http')
       page.headers.fetch('content-encoding').first.should eq 'deflate'
-      page.body.include?("deflate-http").should be_true
+      page.body.include?('deflate-http').should be_true
     end

   end

   describe 'staled connections' do
-
+
     it 'should refresh a staled connection' do
       VCR.use_cassette('http_tconnection_max_hits') do
         http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))
         http.class.__send__(:attr_reader, :connections)
         http.class.__send__(:attr_reader, :connections_hits)
-        http.fetch_page("https://www.yahoo.com/")
+        http.fetch_page('https://www.yahoo.com/')
         http.connections['www.yahoo.com'][443].should_not be_nil
         old_conn = http.connections['www.yahoo.com'][443]
         http.connections_hits['www.yahoo.com'][443].should be 1

-        http.fetch_page("https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html")
+        http.fetch_page('https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html')
         http.connections_hits['www.yahoo.com'][443].should be 1
         http.connections['www.yahoo.com'][443].should_not be old_conn
       end

@@ -97,9 +96,9 @@ describe Polipus::HTTP do
   it 'should handle cookies correctly' do
     VCR.use_cassette('http_cookies') do
       http = Polipus::HTTP.new(accept_cookies: true)
-      http.fetch_page "http://www.whatarecookies.com/cookietest.asp"
+      http.fetch_page 'http://www.whatarecookies.com/cookietest.asp'
       http.accept_cookies?.should be_true
-      http.cookie_jar.cookies(URI("http://www.whatarecookies.com/cookietest.asp")).should_not be_empty
+      http.cookie_jar.cookies(URI('http://www.whatarecookies.com/cookietest.asp')).should_not be_empty
     end
   end

@@ -108,10 +107,10 @@ describe Polipus::HTTP do
   describe 'net errors' do
     it 'should handle net errors correctly' do
       VCR.use_cassette('http_errors') do
-        http = Polipus::HTTP.new(open_timeout:1, read_timeout: 1)
-        http.fetch_page("http://www.wrong-domain.lol/").error.should_not be_nil
+        http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1)
+        http.fetch_page('http://www.wrong-domain.lol/').error.should_not be_nil
       end
     end
   end

-end
+end
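Read as documentation, this spec covers most of the `Polipus::HTTP` surface: proxy settings as static values, lambdas, or the `proxy_host_port` shorthand; cookie acceptance; open/read timeouts; and `connection_max_hits` for recycling keep-alive connections. `fetch_page` returns a `Polipus::Page` and reports failures through `page.error` instead of raising. A sketch assembled only from the options exercised above; the target URL is a placeholder:

require 'polipus/http'
require 'logger'

http = Polipus::HTTP.new(
  proxy_host: '127.0.0.0',  # a lambda works too, as in the proxy specs
  proxy_port: 8080,
  accept_cookies: true,
  open_timeout: 1,
  read_timeout: 1,
  connection_max_hits: 1,   # refresh the keep-alive connection after one request
  logger: Logger.new(STDOUT)
)

page = http.fetch_page('http://example.com/')  # placeholder URL
if page.error
  warn "fetch failed: #{page.error}"
else
  puts page.code
  puts page.doc.search('title').text.strip  # parsed document, as in the title assertion
end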
data/spec/page_spec.rb
CHANGED
@@ -1,5 +1,5 @@
-require "spec_helper"
-require "polipus/page"
+require 'spec_helper'
+require 'polipus/page'

 describe Polipus::Page do
   let(:page) do

@@ -14,29 +14,29 @@ describe Polipus::Page do
     </body>
     </html>
 EOF
-    Polipus::Page.new 'http://www.google.com/',
-                      :code => 200,
-                      :body => body,
-                      :headers => { 'content-type' => ['text/html'] },
-                      :domain_aliases => %w(www.google.com google.com)
+    Polipus::Page.new 'http://www.google.com/',
+                      code: 200,
+                      body: body,
+                      headers: { 'content-type' => ['text/html'] },
+                      domain_aliases: %w(www.google.com google.com)
   end

   it 'should be fetched' do
     page.fetched?.should be_true
   end
-
+
   it 'should honor domain_aliases attribute' do
     page.links.count.should be 4
   end

   context 'page expiring' do
     let(:page) do
-      Polipus::Page.new 'http://www.google.com/',
-                        :code => 200,
-                        :body => '',
-                        :headers => { 'content-type' => ['text/html'] },
-                        :domain_aliases => %w(www.google.com google.com),
-                        :fetched_at => (Time.now.to_i - 30)
+      Polipus::Page.new 'http://www.google.com/',
+                        code: 200,
+                        body: '',
+                        headers: { 'content-type' => ['text/html'] },
+                        domain_aliases: %w(www.google.com google.com),
+                        fetched_at: (Time.now.to_i - 30)
     end

     it 'should be marked at expired' do

@@ -49,12 +49,12 @@ EOF
     end

   context 'page error' do
-
+
     let(:page) do
       Polipus::Page.new 'http://www.google.com/', error: 'an error'
     end

-    it 'should serialize an error' do
+    it 'should serialize an error' do
       page.to_hash['error'].should eq 'an error'
     end

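The `let` blocks above double as a reference for `Polipus::Page.new`'s keyword arguments: `domain_aliases` makes links to `google.com` count as in-domain for a page fetched from `www.google.com`, and `fetched_at` feeds the `expired?(ttl)` check that the crawler's `ttl_page` option relies on. A compact sketch of the same calls; the single-link body is made up for illustration:

require 'polipus/page'

body = '<html><body><a href="http://google.com/a">a</a></body></html>'

page = Polipus::Page.new 'http://www.google.com/',
                         code: 200,
                         body: body,
                         headers: { 'content-type' => ['text/html'] },
                         domain_aliases: %w(www.google.com google.com),
                         fetched_at: Time.now.to_i - 30

page.fetched?         # => true, as asserted above
page.links.count      # => 1; the google.com link counts as in-domain
page.expired?(20)     # => true: fetched 30 seconds ago
page.expired?(3600)   # => false
page.to_hash['error'] # serialized form, used when pages travel through queues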
data/spec/polipus_spec.rb
CHANGED
@@ -1,11 +1,11 @@
-require "spec_helper"
+require 'spec_helper'

 describe Polipus::PolipusCrawler do
-  after(:each) {Redis.new(db:10).flushdb}
-  let(:p_options) {
+  after(:each) { Redis.new(db: 10).flushdb }
+  let(:p_options) do
     {
       workers: 1,
-      redis_options: {host: 'localhost', db:10},
+      redis_options: { host: 'localhost', db: 10 },
       depth_limit: 1,
       queue_timeout: 1,
       user_agent: 'polipus-rspec',

@@ -13,55 +13,58 @@ describe Polipus::PolipusCrawler do
       logger_level: Logger::DEBUG,
       storage: Polipus::Storage.memory_store
     }
-  }
-  let(:polipus) {
-    Polipus::PolipusCrawler.new("polipus-rspec", ["http://rubygems.org/gems"], p_options)
-  }
+  end
+  let(:polipus) do
+    Polipus::PolipusCrawler.new('polipus-rspec', ['http://rubygems.org/gems'], p_options)
+  end

-  let(:init_page) {
-    Polipus::Page.new "http://rubygems.org/gems"
-  }
+  let(:init_page)do
+    Polipus::Page.new 'http://rubygems.org/gems'
+  end

-  let(:logger){Logger.new(nil)}
+  let(:logger) { Logger.new(nil) }

-  context "polipus" do
+  context 'polipus' do

-    it "should create a polipus instance" do
+    it 'should create a polipus instance' do
       polipus.should be_an_instance_of Polipus::PolipusCrawler
     end

-    it "should execute a crawling session" do
+    it 'should execute a crawling session' do
       polipus.takeover
       polipus.storage.exists?(init_page).should be_true
       polipus.storage.get(init_page).links.count.should be polipus.storage.count
     end

-    it "should filter unwanted urls" do
+    it 'should filter unwanted urls' do
       polipus.skip_links_like(/\/pages\//)
       polipus.takeover
       polipus.storage.get(init_page).links
         .reject { |e| e.path.to_s =~ /\/pages\// }.count.should be polipus.storage.count
     end

-    it "should follow only wanted urls" do
+    it 'should follow only wanted urls' do
       polipus.follow_links_like(/\/pages\//)
       polipus.follow_links_like(/\/gems$/)
       polipus.takeover
       polipus.storage.get(init_page).links
-        .reject { |e| ![/\/pages\//, /\/gems$/].any?{|p| e.path =~ p} }
+        .reject { |e| ![/\/pages\//, /\/gems$/].any? { |p| e.path =~ p } }
         .count.should be polipus.storage.count
     end

-    it "should refresh expired pages" do
+    it 'should refresh expired pages' do
       polipus.ttl_page = 3600
       polipus.takeover
-      polipus.storage.each
-
+      polipus.storage.each do |_id, page|
+        page.fetched_at = page.fetched_at - 3600
+        polipus.storage.add(page)
+      end
+      polipus.storage.each { |_id, page| page.expired?(3600).should be_true }
       polipus.takeover
-      polipus.storage.each {|
+      polipus.storage.each { |_id, page| page.expired?(3600).should be_false }
     end

-    it "should re-download seeder urls no matter what" do
+    it 'should re-download seeder urls no matter what' do
       cache_hit = {}
       polipus.follow_links_like(/\/gems$/)
       polipus.on_page_downloaded do |page|

@@ -70,26 +73,26 @@ describe Polipus::PolipusCrawler do
       end
       polipus.takeover
       polipus.takeover
-      cache_hit["http://rubygems.org/gems"].should be 2
+      cache_hit['http://rubygems.org/gems'].should be 2
     end

-    it "should call on_page_error code blocks when a page has error" do
-      p = Polipus::PolipusCrawler.new("polipus-rspec", ["http://dasd.adad.dom/"], p_options.merge(open_timeout: 1, read_timeout: 1))
+    it 'should call on_page_error code blocks when a page has error' do
+      p = Polipus::PolipusCrawler.new('polipus-rspec', ['http://dasd.adad.dom/'], p_options.merge(open_timeout: 1, read_timeout: 1))
       a_page = nil
-      p.on_page_error {|page| a_page = page}
+      p.on_page_error { |page| a_page = page }
       p.takeover
       a_page.should_not be_nil
       a_page.error.should_not be_nil
     end

-    it "should obey to the robots.txt file" do
+    it 'should obey to the robots.txt file' do
       lopt = p_options
       lopt[:obey_robots_txt] = true
-      polipus = Polipus::PolipusCrawler.new("polipus-rspec", ["https://rubygems.org/gems/polipus"], lopt)
+      polipus = Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
       polipus.depth_limit = 1
       polipus.takeover
-      polipus.storage.each {|
+      polipus.storage.each { |_id, page| (page.url.path =~ /$\/downloads\//).should be_false }
     end

   end
-end
+end
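Beyond the style cleanup, this spec is effectively a tour of the crawler API; a minimal session using only the calls exercised above, under the same assumptions as `p_options` (a local Redis reachable on db 10):

require 'polipus'

options = {
  workers: 1,
  redis_options: { host: 'localhost', db: 10 },
  depth_limit: 1,
  queue_timeout: 1,
  user_agent: 'polipus-rspec',
  logger_level: Logger::DEBUG,
  storage: Polipus::Storage.memory_store
}

crawler = Polipus::PolipusCrawler.new('polipus-rspec', ['http://rubygems.org/gems'], options)
crawler.follow_links_like(/\/gems$/)                # whitelist
crawler.skip_links_like(/\/pages\//)                # blacklist
crawler.on_page_downloaded { |page| puts page.url }
crawler.on_page_error { |page| warn page.error }
crawler.takeover                                    # runs the crawl until the queue drains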
data/spec/queue_overflow_manager_spec.rb
CHANGED
@@ -1,20 +1,22 @@
-require "spec_helper"
-require "mongo"
-require "polipus/queue_overflow"
+require 'spec_helper'
+require 'mongo'
+require 'polipus/queue_overflow'
+require 'redis-queue'

 describe Polipus::QueueOverflow::Manager do
   before(:all) do
-    @mongo = Mongo::Connection.new(
+    @mongo = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('_test_polipus')
     @mongo['_test_pages'].drop
     @storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
-    @redis_q = Redis::Queue.new(
-    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, "queue_test")
+    @redis_q = Redis::Queue.new('queue_test', 'bp_queue_test', redis: Redis.new)
+    @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test')
     @redis = Redis.new
-    @polipus = flexmock("polipus")
+    @polipus = flexmock('polipus')
     @polipus.should_receive(:queue_overflow_adapter).and_return(@queue_overflow)
     @polipus.should_receive(:storage).and_return(@storage)
     @polipus.should_receive(:redis).and_return(@redis)
-    @polipus.should_receive(:job_name).and_return("___test")
+    @polipus.should_receive(:job_name).and_return('___test')
+    @polipus.should_receive(:logger).and_return(Logger.new(nil))
     @manager = Polipus::QueueOverflow::Manager.new(@polipus, @redis_q, 10)
   end

@@ -30,16 +32,16 @@ describe Polipus::QueueOverflow::Manager do
   end

   it 'should remove 10 items' do
-    @manager.perform.should be == [0,0]
-    20.times {|i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", :code => 200, :body => '<html></html>').to_json}
+    @manager.perform.should be == [0, 0]
+    20.times { |i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }
     @manager.perform.should be == [10, 0]
     @queue_overflow.size.should be == 10
     @redis_q.size.should be == 10
   end

   it 'should restore 10 items' do
-    @manager.perform.should be == [0,0]
-    10.times {|i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", :code => 200, :body => '<html></html>').to_json}
+    @manager.perform.should be == [0, 0]
+    10.times { |i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", code: 200, body: '<html></html>').to_json }
     @manager.perform.should be == [0, 10]
     @queue_overflow.size.should be == 0
     @redis_q.size.should be == 10

@@ -48,45 +50,45 @@ describe Polipus::QueueOverflow::Manager do
   end

   it 'should restore 3 items' do
-
-    @manager.perform.should be == [0,0]
-    3.times {|i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", :code => 200, :body => '<html></html>').to_json}
+
+    @manager.perform.should be == [0, 0]
+    3.times { |i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>').to_json }
     @manager.perform.should be == [0, 3]
     @queue_overflow.size.should be == 0
     @redis_q.size.should be == 3
     @manager.perform.should be == [0, 0]
-
+
   end

   it 'should restore 0 items' do
-
-    @manager.perform.should be == [0,0]
-    10.times {|i|
-      p = page_factory("http://www.user-doo-bu.com/page_#{i}", :code => 200, :body => '<html></html>')
+
+    @manager.perform.should be == [0, 0]
+    10.times do|i|
+      p = page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>')
       @storage.add p
-      @queue_overflow << p.to_json
-    }
+      @queue_overflow << p.to_json
+    end
     @manager.perform.should be == [0, 0]
     @queue_overflow.size.should be == 0
     @redis_q.size.should be == 0
     @manager.perform.should be == [0, 0]
-
+
   end

   it 'should filter an url based on the spec' do
     @queue_overflow.clear
     @redis_q.clear
-    10.times {|i| @queue_overflow << page_factory("http://www.user-doo.com/page_#{i}", :code => 200, :body => '<html></html>').to_json}
+    10.times { |i| @queue_overflow << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }
     @manager.url_filter do |page|
-      page.url.to_s.end_with?("page_0") ? false : true
+      page.url.to_s.end_with?('page_0') ? false : true
     end
-    @manager.perform.should be == [0,9]
+    @manager.perform.should be == [0, 9]
     @queue_overflow.size.should be == 0
     @redis_q.size.should be == 9
-    @manager.url_filter do |
+    @manager.url_filter do |_page|
       true
     end

   end

-end
+end
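The `before(:all)` block above is also the clearest picture of how overflow management is wired: a Redis-backed main queue, a MongoDB-backed overflow store, and a `Manager` that keeps the main queue at or below its limit, with `perform` returning a `[removed, restored]` pair and an optional `url_filter` vetting pages on their way back. A sketch under the same assumptions (local Redis and MongoDB); the `FakeCrawler` struct is a hypothetical stand-in for the flexmock stub and only supplies the five readers the spec shows the manager using:

require 'polipus'
require 'polipus/queue_overflow'
require 'redis-queue'

redis          = Redis.new
redis_q        = Redis::Queue.new('queue_test', 'bp_queue_test', redis: redis)
queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test')

# Hypothetical stand-in for a PolipusCrawler: the spec stubs exactly
# these five readers with flexmock.
FakeCrawler = Struct.new(:queue_overflow_adapter, :storage, :redis, :job_name, :logger)
crawler = FakeCrawler.new(queue_overflow, Polipus::Storage.memory_store, redis, '___test', Logger.new(nil))

manager = Polipus::QueueOverflow::Manager.new(crawler, redis_q, 10)
manager.url_filter { |page| !page.url.to_s.end_with?('page_0') }  # drop these while restoring

removed, restored = manager.perform
# e.g. [10, 0] when the main queue held 20 items; [0, 10] when it was empty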