polipus 0.3.7 → 0.4.0
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +8 -8
- data/.rspec +1 -1
- data/.rubocop.yml +3 -3
- data/.rubocop_todo.yml +1 -1
- data/.travis.yml +14 -4
- data/AUTHORS.md +1 -0
- data/CHANGELOG.md +9 -1
- data/Gemfile +9 -0
- data/README.md +2 -3
- data/Rakefile +1 -3
- data/examples/basic.rb +8 -1
- data/lib/polipus.rb +25 -13
- data/lib/polipus/queue_overflow.rb +1 -0
- data/lib/polipus/queue_overflow/manager.rb +1 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +1 -1
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/storage.rb +10 -16
- data/lib/polipus/storage/mongo_store.rb +6 -1
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +16 -18
- data/spec/{http_spec.rb → polipus/http_spec.rb} +26 -37
- data/spec/{page_spec.rb → polipus/page_spec.rb} +7 -11
- data/spec/{queue_overflow_manager_spec.rb → polipus/queue_overflow/manager_spec.rb} +22 -29
- data/spec/{queue_overflow_spec.rb → polipus/queue_overflow_spec.rb} +14 -20
- data/spec/{robotex_spec.rb → polipus/robotex_spec.rb} +10 -11
- data/spec/{signal_handler_spec.rb → polipus/signal_handler_spec.rb} +2 -6
- data/spec/{storage_memory_spec.rb → polipus/storage/memory_store_spec.rb} +18 -21
- data/spec/{storage_mongo_spec.rb → polipus/storage/mongo_store_spec.rb} +23 -25
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/{url_tracker_spec.rb → polipus/url_tracker_spec.rb} +4 -4
- data/spec/polipus_spec.rb +13 -15
- data/spec/spec_helper.rb +13 -12
- metadata +76 -154
- data/lib/polipus/storage/s3_store.rb +0 -96
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +0 -166
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +0 -166
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +0 -270
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +0 -194
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +0 -183
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +0 -221
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +0 -221
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +0 -221
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +0 -695
- data/spec/storage_s3_spec.rb +0 -115
data/lib/polipus/version.rb CHANGED
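The body of this one-line change is not shown above; given the 0.3.7 → 0.4.0 header and the file's +1/-1 count, it is the version constant bump, presumably:

-  VERSION = '0.3.7'
+  VERSION = '0.4.0'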
data/polipus.gemspec CHANGED
@@ -13,6 +13,7 @@ Gem::Specification.new do |s|
   An easy to use distributed web-crawler framework based on Redis
   )
   s.licenses = ['MIT']
+  s.platform = Gem::Platform::RUBY
 
   s.rubyforge_project = 'polipus'
 
@@ -21,26 +22,23 @@ Gem::Specification.new do |s|
   s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
   s.require_paths = ['lib']
 
-  s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.1'
-  s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.3'
   s.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6.0'
-  s.add_runtime_dependency 'hiredis', '~> 0.4', '>= 0.4.5'
-  s.add_runtime_dependency 'redis', '~> 3.0', '>= 3.0.4'
-  s.add_runtime_dependency 'mongo', '~> 1.9.0', '>= 1.9.2'
-
-  if defined?(JRUBY_VERSION)
-    s.add_runtime_dependency 'bson', '~> 1.9', '>= 1.9.2'
-  else
-    s.add_runtime_dependency 'bson_ext', '~> 1.9', '>= 1.9.2'
-  end
-  s.add_runtime_dependency 'aws-s3', '~> 0.6', '>= 0.6.3'
   s.add_runtime_dependency 'http-cookie', '~> 1.0', '>= 1.0.1'
 
-  s.
-  s.
-  s.
-  s.
-
-  s.add_development_dependency '
+  s.add_runtime_dependency 'redis', '~> 3.0', '>= 3.0.4'
+  s.add_runtime_dependency 'hiredis', '~> 0.5', '>= 0.4.5'
+  s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.4'
+  s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.3'
+
+  s.add_development_dependency 'mongo', '~>1.11.0'
+  s.add_development_dependency 'rethinkdb', '~>1.15.0'
 
+  s.add_development_dependency 'rake', '~> 10.3'
+  s.add_development_dependency 'rspec', '~> 3.1.0'
+  s.add_development_dependency 'flexmock', '~> 1.3'
+
+  s.add_development_dependency 'vcr', '~> 2.9.0'
+  s.add_development_dependency 'webmock', '~> 1.20.0'
+
+  s.add_development_dependency 'coveralls'
 end
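A practical consequence of this hunk: mongo is now a development-only dependency (and aws-s3 is gone entirely, together with the removed S3 store), so an application that uses the MongoDB or the new RethinkDB storage backend must declare the driver gem itself. A minimal sketch of such an application Gemfile, with illustrative version pins mirroring the development constraints above:

source 'https://rubygems.org'

gem 'polipus', '~> 0.4.0'
gem 'mongo', '~> 1.11.0'        # required to use the MongoDB page store
# gem 'rethinkdb', '~> 1.15.0'  # required to use the new RethinkDB page store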
data/spec/http_spec.rb → data/spec/polipus/http_spec.rb RENAMED
@@ -5,118 +5,107 @@ require 'polipus/http'
 require 'polipus/page'
 
 describe Polipus::HTTP do
-
   it 'should download a page' do
     VCR.use_cassette('http_test') do
       http = Polipus::HTTP.new
       page = http.fetch_page('http://sfbay.craigslist.org/apa/')
-      page.
-      page.doc.search('title').text.strip.
-      page.fetched_at.
-      page.fetched
+      expect(page).to be_an_instance_of(Polipus::Page)
+      expect(page.doc.search('title').text.strip).to eq 'SF bay area apts/housing for rent classifieds - craigslist'
+      expect(page.fetched_at).not_to be_nil
+      expect(page.fetched?).to be_truthy
     end
   end
 
   it 'should follow a redirect' do
     VCR.use_cassette('http_test_redirect') do
-
       http = Polipus::HTTP.new
       page = http.fetch_page('http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis')
 
-      page.
-      page.code.
-      page.url.to_s.
-      page.body.strip.
+      expect(page).to be_an_instance_of(Polipus::Page)
+      expect(page.code).to be 200
+      expect(page.url.to_s).to eq 'http://greenbytes.de/tech/tc/httpredirects/300.txt'
+      expect(page.body.strip).to eq "You have reached the target\r\nof a 300 redirect."
     end
   end
 
   describe 'proxy settings' do
-
     it 'should set proxy correctly using a procedure' do
       http = Polipus::HTTP.new(proxy_host: -> _con { '127.0.0.0' }, proxy_port: -> _con { 8080 })
-      http.proxy_host.
-      http.proxy_port.
+      expect(http.proxy_host).to eq '127.0.0.0'
+      expect(http.proxy_port).to be 8080
    end
 
     it 'should set proxy correctly using shorthand method' do
       http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080] })
-      http.proxy_host_port.
+      expect(http.proxy_host_port).to eq ['127.0.0.0', 8080]
     end
 
     it 'should set proxy w/ auth correctly using shorthand method' do
       http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080, 'a', 'b'] })
-      http.proxy_host_port.
+      expect(http.proxy_host_port).to eq ['127.0.0.0', 8080, 'a', 'b']
     end
 
     it 'should set proxy settings' do
       http = Polipus::HTTP.new(proxy_host: '127.0.0.0', proxy_port: 8080, proxy_user: 'a', proxy_pass: 'b')
-      http.proxy_port.
-      http.proxy_host.
-      http.proxy_user.
-      http.proxy_pass.
+      expect(http.proxy_port).to be 8080
+      expect(http.proxy_host).to eq '127.0.0.0'
+      expect(http.proxy_user).to eq 'a'
+      expect(http.proxy_pass).to eq 'b'
     end
-
   end
 
   describe 'compressed content handling' do
-
     it 'should decode gzip content' do
       VCR.use_cassette('gzipped_on') do
         http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
         page = http.fetch_page('http://www.whatsmyip.org/http-compression-test/')
-        page.doc.css('.gzip_yes').
+        expect(page.doc.css('.gzip_yes')).not_to be_empty
      end
    end
 
    it 'should decode deflate content' do
      http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
      page = http.fetch_page('http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http')
-      page.headers.fetch('content-encoding').first.
-      page.body.include?('deflate-http').
+      expect(page.headers.fetch('content-encoding').first).to eq 'deflate'
+      expect(page.body.include?('deflate-http')).to be_truthy
    end
-
  end
 
  describe 'staled connections' do
-
    it 'should refresh a staled connection' do
      VCR.use_cassette('http_tconnection_max_hits') do
        http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))
        http.class.__send__(:attr_reader, :connections)
        http.class.__send__(:attr_reader, :connections_hits)
        http.fetch_page('https://www.yahoo.com/')
-        http.connections['www.yahoo.com'][443].
+        expect(http.connections['www.yahoo.com'][443]).not_to be_nil
        old_conn = http.connections['www.yahoo.com'][443]
-        http.connections_hits['www.yahoo.com'][443].
+        expect(http.connections_hits['www.yahoo.com'][443]).to be 1
 
        http.fetch_page('https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html')
-        http.connections_hits['www.yahoo.com'][443].
-        http.connections['www.yahoo.com'][443].
+        expect(http.connections_hits['www.yahoo.com'][443]).to be 1
+        expect(http.connections['www.yahoo.com'][443]).not_to be old_conn
      end
    end
-
  end
 
  describe 'cookies' do
-
    it 'should handle cookies correctly' do
      VCR.use_cassette('http_cookies') do
        http = Polipus::HTTP.new(accept_cookies: true)
        http.fetch_page 'http://www.whatarecookies.com/cookietest.asp'
-        http.accept_cookies
-        http.cookie_jar.cookies(URI('http://www.whatarecookies.com/cookietest.asp')).
+        expect(http.accept_cookies?).to be_truthy
+        expect(http.cookie_jar.cookies(URI('http://www.whatarecookies.com/cookietest.asp'))).not_to be_empty
      end
    end
-
  end
 
  describe 'net errors' do
    it 'should handle net errors correctly' do
      VCR.use_cassette('http_errors') do
        http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1)
-        http.fetch_page('http://www.wrong-domain.lol/').error.
+        expect(http.fetch_page('http://www.wrong-domain.lol/').error).not_to be_nil
      end
    end
  end
-
 end
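Every spec hunk in this release applies the same RSpec 3 migration: old should-style expectations (the removed lines, truncated in this diff) become expect(...).to, and the old boolean matchers become be_truthy/be_falsey. A generic before/after sketch, not taken verbatim from this diff:

# RSpec 2 style (removed):
page.fetched?.should be_true
page.code.should == 200

# RSpec 3 style (added):
expect(page.fetched?).to be_truthy
expect(page.code).to be 200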
data/spec/page_spec.rb → data/spec/polipus/page_spec.rb RENAMED
@@ -23,11 +23,11 @@ EOF
   end
 
   it 'should be fetched' do
-    page.fetched
+    expect(page.fetched?).to be_truthy
   end
 
   it 'should honor domain_aliases attribute' do
-    page.links.count.
+    expect(page.links.count).to be 4
   end
 
   context 'page expiring' do
@@ -41,32 +41,28 @@ EOF
     end
 
     it 'should be marked at expired' do
-      page.expired?(20).
+      expect(page.expired?(20)).to be_truthy
     end
 
     it 'should NOT be marked at expired' do
-      page.expired?(60).
+      expect(page.expired?(60)).to be_falsey
     end
   end
 
   context 'page error' do
-
     let(:page) do
       Polipus::Page.new 'http://www.google.com/', error: 'an error'
     end
 
     it 'should serialize an error' do
-      page.to_hash['error'].
+      expect(page.to_hash['error']).to eq 'an error'
     end
-
   end
 
   context 'page code' do
     it 'should identify HTTPSuccess code' do
-      Polipus::Page.new('http://www.google.com/', code: 201).success
-      Polipus::Page.new('http://www.google.com/', code: 404).success
+      expect(Polipus::Page.new('http://www.google.com/', code: 201).success?).to be_truthy
+      expect(Polipus::Page.new('http://www.google.com/', code: 404).success?).to be_falsey
     end
-
   end
-
 end
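For reference, the Polipus::Page surface exercised here is small; a sketch built only from calls that appear in this spec:

require 'polipus/page'

ok  = Polipus::Page.new('http://www.google.com/', code: 201)
err = Polipus::Page.new('http://www.google.com/', error: 'an error')

ok.success?          # => true, 201 counts as an HTTPSuccess code
err.to_hash['error'] # => "an error"
# page.expired?(n) is true once fetched_at is more than n seconds old,
# per the 'page expiring' examples above.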
data/spec/queue_overflow_manager_spec.rb → data/spec/polipus/queue_overflow/manager_spec.rb RENAMED
@@ -33,47 +33,42 @@ describe Polipus::QueueOverflow::Manager do
   end
 
   it 'should remove 10 items' do
-    @manager.perform.
+    expect(@manager.perform).to eq([0, 0])
     20.times { |i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }
-    @manager.perform.
-    @queue_overflow.size.
-    @redis_q.size.
+    expect(@manager.perform).to eq([10, 0])
+    expect(@queue_overflow.size).to eq(10)
+    expect(@redis_q.size).to eq(10)
   end
 
   it 'should restore 10 items' do
-    @manager.perform.
+    expect(@manager.perform).to eq([0, 0])
     10.times { |i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", code: 200, body: '<html></html>').to_json }
-    @manager.perform.
-    @queue_overflow.size.
-    @redis_q.size.
-    @manager.perform.
-
+    expect(@manager.perform).to eq([0, 10])
+    expect(@queue_overflow.size).to eq(0)
+    expect(@redis_q.size).to eq(10)
+    expect(@manager.perform).to eq([0, 0])
   end
 
   it 'should restore 3 items' do
-
-    @manager.perform.should be == [0, 0]
+    expect(@manager.perform).to eq([0, 0])
     3.times { |i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>').to_json }
-    @manager.perform.
-    @queue_overflow.size.
-    @redis_q.size.
-    @manager.perform.
-
+    expect(@manager.perform).to eq([0, 3])
+    expect(@queue_overflow.size).to eq(0)
+    expect(@redis_q.size).to eq(3)
+    expect(@manager.perform).to eq([0, 0])
   end
 
   it 'should restore 0 items' do
-
-    @manager.perform.should be == [0, 0]
+    expect(@manager.perform).to eq([0, 0])
     10.times do|i|
       p = page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>')
       @storage.add p
       @queue_overflow << p.to_json
     end
-    @manager.perform.
-    @queue_overflow.size.
-    @redis_q.size.
-    @manager.perform.
-
+    expect(@manager.perform).to eq([0, 0])
+    expect(@queue_overflow.size).to eq(0)
+    expect(@redis_q.size).to eq(0)
+    expect(@manager.perform).to eq([0, 0])
   end
 
   it 'should filter an url based on the spec' do
@@ -83,13 +78,11 @@ describe Polipus::QueueOverflow::Manager do
     @manager.url_filter do |page|
       page.url.to_s.end_with?('page_0') ? false : true
     end
-    @manager.perform.
-    @queue_overflow.size.
-    @redis_q.size.
+    expect(@manager.perform).to eq([0, 9])
+    expect(@queue_overflow.size).to eq(0)
+    expect(@redis_q.size).to eq(9)
     @manager.url_filter do |_page|
       true
     end
-
   end
-
 end
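Reading these expectations, @manager.perform returns a two-element array; from the example names ('remove 10 items' yields [10, 0], 'restore 10 items' yields [0, 10]), the natural reading is [moved_to_overflow, restored_to_main_queue]. That is an inference from the spec, not a documented contract:

moved, restored = @manager.perform
# With 20 items queued and an overflow threshold of 10, as in the first
# example above: moved == 10, restored == 0.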
data/spec/queue_overflow_spec.rb → data/spec/polipus/queue_overflow_spec.rb RENAMED
@@ -3,12 +3,10 @@ require 'spec_helper'
 require 'polipus/queue_overflow'
 
 describe Polipus::QueueOverflow do
-
   before(:all) do
     @queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test')
     @queue_overflow_capped = Polipus::QueueOverflow.mongo_queue_capped(nil, 'queue_test_c', max: 20)
     @queue_overflow_uniq = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test_u', ensure_uniq: true)
-
   end
 
   before(:each) do
@@ -25,26 +23,24 @@ describe Polipus::QueueOverflow do
 
   it 'should work' do
     [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq].each do |q|
-      q.empty
-      q.pop.
+      expect(q.empty?).to be_truthy
+      expect(q.pop).to be_nil
       q << 'test'
-      q.size.
-      q.pop.
-      q.empty
-      q.pop.
-      q.size.
-      q.empty
+      expect(q.size).to eq(1)
+      expect(q.pop).to eq('test')
+      expect(q.empty?).to be_truthy
+      expect(q.pop).to be_nil
+      expect(q.size).to eq(0)
+      expect(q.empty?).to be_truthy
     end
-
   end
 
   it 'should act as a queue' do
     [@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq].each do |q|
       10.times { |i| q << "message_#{i}" }
-      q.size.
-      q.pop.
+      expect(q.size).to eq(10)
+      expect(q.pop).to eq('message_0')
     end
-
   end
 
   it 'should work with complex paylod' do
@@ -52,21 +48,19 @@ describe Polipus::QueueOverflow do
       a = { 'a' => [1, 2, 3], 'b' => 'a_string' }
       q << a.to_json
       b = q.pop
-      JSON.parse(b).
+      expect(JSON.parse(b)).to eq(a)
     end
-
   end
 
   it 'should honor max items if it is capped' do
     30.times { |i| @queue_overflow_capped << "message_#{i}" }
-    @queue_overflow_capped.size.
-    @queue_overflow_capped.pop.
+    expect(@queue_overflow_capped.size).to eq(20)
+    expect(@queue_overflow_capped.pop).to eq('message_10')
   end
 
   it 'should contains only unique items' do
     20.times { @queue_overflow_uniq << 'A' }
     20.times { @queue_overflow_uniq << 'B' }
-    @queue_overflow_uniq.size.
+    expect(@queue_overflow_uniq.size).to eq(2)
   end
-
 end
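For context, the queue API exercised above is minimal; a usage sketch assembled only from calls that appear in this spec (the nil first argument is taken verbatim from the spec's before(:all) block, presumably selecting a default MongoDB connection):

require 'polipus/queue_overflow'

queue  = Polipus::QueueOverflow.mongo_queue(nil, 'demo_queue')
capped = Polipus::QueueOverflow.mongo_queue_capped(nil, 'demo_capped', max: 20)

queue << 'message_0'  # enqueue
queue.size            # => 1
queue.pop             # => "message_0" (FIFO, as 'should act as a queue' shows)
queue.empty?          # => true
# A capped queue keeps only the newest `max` entries: after 30 pushes it
# holds 20 and pops 'message_10' first, as asserted above.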
data/spec/robotex_spec.rb → data/spec/polipus/robotex_spec.rb RENAMED
@@ -1,6 +1,6 @@
-# encoding: UTF-8
 require 'spec_helper'
 require 'polipus/robotex'
+
 describe Polipus::Robotex do
   let(:spec_domain) { 'http://www.example.com/' }
   before(:each) do
@@ -19,20 +19,20 @@ Disallow: /locked
 Allow: /locked
 END
     stub_request(:get, 'http://www.example.com/robots.txt')
-
+      .to_return(body: robots, status: [200, 'OK'], headers: { 'Content-Type' => 'text/plain' })
   end
 
   describe '#initialize' do
     context 'when no arguments are supplied' do
       it 'returns a Robotex with the default user-agent' do
-        Polipus::Robotex.new.user_agent.
+        expect(Polipus::Robotex.new.user_agent).to eq("Robotex/#{Polipus::Robotex::VERSION} (http://www.github.com/chriskite/robotex)")
       end
     end
 
     context 'when a user-agent is specified' do
       it 'returns a Robotex with the specified user-agent' do
         ua = 'My User Agent'
-        Polipus::Robotex.new(ua).user_agent.
+        expect(Polipus::Robotex.new(ua).user_agent).to eq(ua)
       end
     end
   end
@@ -41,28 +41,28 @@ END
     context 'when the robots.txt disallows the user-agent to the url' do
       it 'returns false' do
         robotex = Polipus::Robotex.new('bender')
-        robotex.allowed?(spec_domain + 'my_shiny_metal_ass').
+        expect(robotex.allowed?(spec_domain + 'my_shiny_metal_ass')).to be_falsey
       end
     end
 
     context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
       it 'returns true' do
         robotex = Polipus::Robotex.new('bender')
-        robotex.allowed?(spec_domain + 'cigars').
+        expect(robotex.allowed?(spec_domain + 'cigars')).to be_truthy
       end
     end
 
     context 'when the robots.txt disallows any user-agent to the url' do
      it 'returns false' do
        robotex = Polipus::Robotex.new
-        robotex.allowed?(spec_domain + 'login').
+        expect(robotex.allowed?(spec_domain + 'login')).to be_falsey
      end
    end
 
    context 'when the robots.txt disallows and then allows the url' do
      it 'returns false' do
        robotex = Polipus::Robotex.new
-        robotex.allowed?(spec_domain + 'locked').
+        expect(robotex.allowed?(spec_domain + 'locked')).to be_falsey
      end
    end
  end
@@ -71,16 +71,15 @@ END
   context 'when no Crawl-Delay is specified for the user-agent' do
     it 'returns nil' do
       robotex = Polipus::Robotex.new
-      robotex.delay(spec_domain).
+      expect(robotex.delay(spec_domain)).to be_nil
     end
 
     context 'when Crawl-Delay is specified for the user-agent' do
       it 'returns the delay as a Fixnum' do
         robotex = Polipus::Robotex.new('msnbot')
-        robotex.delay(spec_domain).
+        expect(robotex.delay(spec_domain)).to eq(20)
       end
     end
   end
 end
-
 end