polipus 0.3.7 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.rspec +1 -1
- data/.rubocop.yml +3 -3
- data/.rubocop_todo.yml +1 -1
- data/.travis.yml +14 -4
- data/AUTHORS.md +1 -0
- data/CHANGELOG.md +9 -1
- data/Gemfile +9 -0
- data/README.md +2 -3
- data/Rakefile +1 -3
- data/examples/basic.rb +8 -1
- data/lib/polipus.rb +25 -13
- data/lib/polipus/queue_overflow.rb +1 -0
- data/lib/polipus/queue_overflow/manager.rb +1 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +1 -1
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/storage.rb +10 -16
- data/lib/polipus/storage/mongo_store.rb +6 -1
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +16 -18
- data/spec/{http_spec.rb → polipus/http_spec.rb} +26 -37
- data/spec/{page_spec.rb → polipus/page_spec.rb} +7 -11
- data/spec/{queue_overflow_manager_spec.rb → polipus/queue_overflow/manager_spec.rb} +22 -29
- data/spec/{queue_overflow_spec.rb → polipus/queue_overflow_spec.rb} +14 -20
- data/spec/{robotex_spec.rb → polipus/robotex_spec.rb} +10 -11
- data/spec/{signal_handler_spec.rb → polipus/signal_handler_spec.rb} +2 -6
- data/spec/{storage_memory_spec.rb → polipus/storage/memory_store_spec.rb} +18 -21
- data/spec/{storage_mongo_spec.rb → polipus/storage/mongo_store_spec.rb} +23 -25
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/{url_tracker_spec.rb → polipus/url_tracker_spec.rb} +4 -4
- data/spec/polipus_spec.rb +13 -15
- data/spec/spec_helper.rb +13 -12
- metadata +76 -154
- data/lib/polipus/storage/s3_store.rb +0 -96
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +0 -166
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +0 -166
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +0 -270
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +0 -194
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +0 -183
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +0 -221
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +0 -221
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +0 -221
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +0 -695
- data/spec/storage_s3_spec.rb +0 -115
data/lib/polipus/version.rb
CHANGED
data/polipus.gemspec
CHANGED
@@ -13,6 +13,7 @@ Gem::Specification.new do |s|
|
|
13
13
|
An easy to use distributed web-crawler framework based on Redis
|
14
14
|
)
|
15
15
|
s.licenses = ['MIT']
|
16
|
+
s.platform = Gem::Platform::RUBY
|
16
17
|
|
17
18
|
s.rubyforge_project = 'polipus'
|
18
19
|
|
@@ -21,26 +22,23 @@ Gem::Specification.new do |s|
|
|
21
22
|
s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
22
23
|
s.require_paths = ['lib']
|
23
24
|
|
24
|
-
s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.1'
|
25
|
-
s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.3'
|
26
25
|
s.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6.0'
|
27
|
-
s.add_runtime_dependency 'hiredis', '~> 0.4', '>= 0.4.5'
|
28
|
-
s.add_runtime_dependency 'redis', '~> 3.0', '>= 3.0.4'
|
29
|
-
s.add_runtime_dependency 'mongo', '~> 1.9.0', '>= 1.9.2'
|
30
|
-
|
31
|
-
if defined?(JRUBY_VERSION)
|
32
|
-
s.add_runtime_dependency 'bson', '~> 1.9', '>= 1.9.2'
|
33
|
-
else
|
34
|
-
s.add_runtime_dependency 'bson_ext', '~> 1.9', '>= 1.9.2'
|
35
|
-
end
|
36
|
-
s.add_runtime_dependency 'aws-s3', '~> 0.6', '>= 0.6.3'
|
37
26
|
s.add_runtime_dependency 'http-cookie', '~> 1.0', '>= 1.0.1'
|
38
27
|
|
39
|
-
s.
|
40
|
-
s.
|
41
|
-
s.
|
42
|
-
s.
|
43
|
-
|
44
|
-
s.add_development_dependency '
|
28
|
+
s.add_runtime_dependency 'redis', '~> 3.0', '>= 3.0.4'
|
29
|
+
s.add_runtime_dependency 'hiredis', '~> 0.5', '>= 0.4.5'
|
30
|
+
s.add_runtime_dependency 'redis-queue', '~> 0.0', '>= 0.0.4'
|
31
|
+
s.add_runtime_dependency 'redis-bloomfilter', '~> 0.0', '>= 0.0.3'
|
32
|
+
|
33
|
+
s.add_development_dependency 'mongo', '~>1.11.0'
|
34
|
+
s.add_development_dependency 'rethinkdb', '~>1.15.0'
|
45
35
|
|
36
|
+
s.add_development_dependency 'rake', '~> 10.3'
|
37
|
+
s.add_development_dependency 'rspec', '~> 3.1.0'
|
38
|
+
s.add_development_dependency 'flexmock', '~> 1.3'
|
39
|
+
|
40
|
+
s.add_development_dependency 'vcr', '~> 2.9.0'
|
41
|
+
s.add_development_dependency 'webmock', '~> 1.20.0'
|
42
|
+
|
43
|
+
s.add_development_dependency 'coveralls'
|
46
44
|
end
|
@@ -5,118 +5,107 @@ require 'polipus/http'
|
|
5
5
|
require 'polipus/page'
|
6
6
|
|
7
7
|
describe Polipus::HTTP do
|
8
|
-
|
9
8
|
it 'should download a page' do
|
10
9
|
VCR.use_cassette('http_test') do
|
11
10
|
http = Polipus::HTTP.new
|
12
11
|
page = http.fetch_page('http://sfbay.craigslist.org/apa/')
|
13
|
-
page.
|
14
|
-
page.doc.search('title').text.strip.
|
15
|
-
page.fetched_at.
|
16
|
-
page.fetched
|
12
|
+
expect(page).to be_an_instance_of(Polipus::Page)
|
13
|
+
expect(page.doc.search('title').text.strip).to eq 'SF bay area apts/housing for rent classifieds - craigslist'
|
14
|
+
expect(page.fetched_at).not_to be_nil
|
15
|
+
expect(page.fetched?).to be_truthy
|
17
16
|
end
|
18
17
|
end
|
19
18
|
|
20
19
|
it 'should follow a redirect' do
|
21
20
|
VCR.use_cassette('http_test_redirect') do
|
22
|
-
|
23
21
|
http = Polipus::HTTP.new
|
24
22
|
page = http.fetch_page('http://greenbytes.de/tech/tc/httpredirects/t300bodyandloc.asis')
|
25
23
|
|
26
|
-
page.
|
27
|
-
page.code.
|
28
|
-
page.url.to_s.
|
29
|
-
page.body.strip.
|
24
|
+
expect(page).to be_an_instance_of(Polipus::Page)
|
25
|
+
expect(page.code).to be 200
|
26
|
+
expect(page.url.to_s).to eq 'http://greenbytes.de/tech/tc/httpredirects/300.txt'
|
27
|
+
expect(page.body.strip).to eq "You have reached the target\r\nof a 300 redirect."
|
30
28
|
end
|
31
29
|
end
|
32
30
|
|
33
31
|
describe 'proxy settings' do
|
34
|
-
|
35
32
|
it 'should set proxy correctly using a procedure' do
|
36
33
|
http = Polipus::HTTP.new(proxy_host: -> _con { '127.0.0.0' }, proxy_port: -> _con { 8080 })
|
37
|
-
http.proxy_host.
|
38
|
-
http.proxy_port.
|
34
|
+
expect(http.proxy_host).to eq '127.0.0.0'
|
35
|
+
expect(http.proxy_port).to be 8080
|
39
36
|
end
|
40
37
|
|
41
38
|
it 'should set proxy correctly using shorthand method' do
|
42
39
|
http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080] })
|
43
|
-
http.proxy_host_port.
|
40
|
+
expect(http.proxy_host_port).to eq ['127.0.0.0', 8080]
|
44
41
|
end
|
45
42
|
|
46
43
|
it 'should set proxy w/ auth correctly using shorthand method' do
|
47
44
|
http = Polipus::HTTP.new(proxy_host_port: -> _con { ['127.0.0.0', 8080, 'a', 'b'] })
|
48
|
-
http.proxy_host_port.
|
45
|
+
expect(http.proxy_host_port).to eq ['127.0.0.0', 8080, 'a', 'b']
|
49
46
|
end
|
50
47
|
|
51
48
|
it 'should set proxy settings' do
|
52
49
|
http = Polipus::HTTP.new(proxy_host: '127.0.0.0', proxy_port: 8080, proxy_user: 'a', proxy_pass: 'b')
|
53
|
-
http.proxy_port.
|
54
|
-
http.proxy_host.
|
55
|
-
http.proxy_user.
|
56
|
-
http.proxy_pass.
|
50
|
+
expect(http.proxy_port).to be 8080
|
51
|
+
expect(http.proxy_host).to eq '127.0.0.0'
|
52
|
+
expect(http.proxy_user).to eq 'a'
|
53
|
+
expect(http.proxy_pass).to eq 'b'
|
57
54
|
end
|
58
|
-
|
59
55
|
end
|
60
56
|
|
61
57
|
describe 'compressed content handling' do
|
62
|
-
|
63
58
|
it 'should decode gzip content' do
|
64
59
|
VCR.use_cassette('gzipped_on') do
|
65
60
|
http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
|
66
61
|
page = http.fetch_page('http://www.whatsmyip.org/http-compression-test/')
|
67
|
-
page.doc.css('.gzip_yes').
|
62
|
+
expect(page.doc.css('.gzip_yes')).not_to be_empty
|
68
63
|
end
|
69
64
|
end
|
70
65
|
|
71
66
|
it 'should decode deflate content' do
|
72
67
|
http = Polipus::HTTP.new(logger: Logger.new(STDOUT))
|
73
68
|
page = http.fetch_page('http://david.fullrecall.com/browser-http-compression-test?compression=deflate-http')
|
74
|
-
page.headers.fetch('content-encoding').first.
|
75
|
-
page.body.include?('deflate-http').
|
69
|
+
expect(page.headers.fetch('content-encoding').first).to eq 'deflate'
|
70
|
+
expect(page.body.include?('deflate-http')).to be_truthy
|
76
71
|
end
|
77
|
-
|
78
72
|
end
|
79
73
|
|
80
74
|
describe 'staled connections' do
|
81
|
-
|
82
75
|
it 'should refresh a staled connection' do
|
83
76
|
VCR.use_cassette('http_tconnection_max_hits') do
|
84
77
|
http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))
|
85
78
|
http.class.__send__(:attr_reader, :connections)
|
86
79
|
http.class.__send__(:attr_reader, :connections_hits)
|
87
80
|
http.fetch_page('https://www.yahoo.com/')
|
88
|
-
http.connections['www.yahoo.com'][443].
|
81
|
+
expect(http.connections['www.yahoo.com'][443]).not_to be_nil
|
89
82
|
old_conn = http.connections['www.yahoo.com'][443]
|
90
|
-
http.connections_hits['www.yahoo.com'][443].
|
83
|
+
expect(http.connections_hits['www.yahoo.com'][443]).to be 1
|
91
84
|
|
92
85
|
http.fetch_page('https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html')
|
93
|
-
http.connections_hits['www.yahoo.com'][443].
|
94
|
-
http.connections['www.yahoo.com'][443].
|
86
|
+
expect(http.connections_hits['www.yahoo.com'][443]).to be 1
|
87
|
+
expect(http.connections['www.yahoo.com'][443]).not_to be old_conn
|
95
88
|
end
|
96
89
|
end
|
97
|
-
|
98
90
|
end
|
99
91
|
|
100
92
|
describe 'cookies' do
|
101
|
-
|
102
93
|
it 'should handle cookies correctly' do
|
103
94
|
VCR.use_cassette('http_cookies') do
|
104
95
|
http = Polipus::HTTP.new(accept_cookies: true)
|
105
96
|
http.fetch_page 'http://www.whatarecookies.com/cookietest.asp'
|
106
|
-
http.accept_cookies
|
107
|
-
http.cookie_jar.cookies(URI('http://www.whatarecookies.com/cookietest.asp')).
|
97
|
+
expect(http.accept_cookies?).to be_truthy
|
98
|
+
expect(http.cookie_jar.cookies(URI('http://www.whatarecookies.com/cookietest.asp'))).not_to be_empty
|
108
99
|
end
|
109
100
|
end
|
110
|
-
|
111
101
|
end
|
112
102
|
|
113
103
|
describe 'net errors' do
|
114
104
|
it 'should handle net errors correctly' do
|
115
105
|
VCR.use_cassette('http_errors') do
|
116
106
|
http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1)
|
117
|
-
http.fetch_page('http://www.wrong-domain.lol/').error.
|
107
|
+
expect(http.fetch_page('http://www.wrong-domain.lol/').error).not_to be_nil
|
118
108
|
end
|
119
109
|
end
|
120
110
|
end
|
121
|
-
|
122
111
|
end
|
@@ -23,11 +23,11 @@ EOF
|
|
23
23
|
end
|
24
24
|
|
25
25
|
it 'should be fetched' do
|
26
|
-
page.fetched
|
26
|
+
expect(page.fetched?).to be_truthy
|
27
27
|
end
|
28
28
|
|
29
29
|
it 'should honor domain_aliases attribute' do
|
30
|
-
page.links.count.
|
30
|
+
expect(page.links.count).to be 4
|
31
31
|
end
|
32
32
|
|
33
33
|
context 'page expiring' do
|
@@ -41,32 +41,28 @@ EOF
|
|
41
41
|
end
|
42
42
|
|
43
43
|
it 'should be marked at expired' do
|
44
|
-
page.expired?(20).
|
44
|
+
expect(page.expired?(20)).to be_truthy
|
45
45
|
end
|
46
46
|
|
47
47
|
it 'should NOT be marked at expired' do
|
48
|
-
page.expired?(60).
|
48
|
+
expect(page.expired?(60)).to be_falsey
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
52
52
|
context 'page error' do
|
53
|
-
|
54
53
|
let(:page) do
|
55
54
|
Polipus::Page.new 'http://www.google.com/', error: 'an error'
|
56
55
|
end
|
57
56
|
|
58
57
|
it 'should serialize an error' do
|
59
|
-
page.to_hash['error'].
|
58
|
+
expect(page.to_hash['error']).to eq 'an error'
|
60
59
|
end
|
61
|
-
|
62
60
|
end
|
63
61
|
|
64
62
|
context 'page code' do
|
65
63
|
it 'should identify HTTPSuccess code' do
|
66
|
-
Polipus::Page.new('http://www.google.com/', code: 201).success
|
67
|
-
Polipus::Page.new('http://www.google.com/', code: 404).success
|
64
|
+
expect(Polipus::Page.new('http://www.google.com/', code: 201).success?).to be_truthy
|
65
|
+
expect(Polipus::Page.new('http://www.google.com/', code: 404).success?).to be_falsey
|
68
66
|
end
|
69
|
-
|
70
67
|
end
|
71
|
-
|
72
68
|
end
|
@@ -33,47 +33,42 @@ describe Polipus::QueueOverflow::Manager do
|
|
33
33
|
end
|
34
34
|
|
35
35
|
it 'should remove 10 items' do
|
36
|
-
@manager.perform.
|
36
|
+
expect(@manager.perform).to eq([0, 0])
|
37
37
|
20.times { |i| @redis_q << page_factory("http://www.user-doo.com/page_#{i}", code: 200, body: '<html></html>').to_json }
|
38
|
-
@manager.perform.
|
39
|
-
@queue_overflow.size.
|
40
|
-
@redis_q.size.
|
38
|
+
expect(@manager.perform).to eq([10, 0])
|
39
|
+
expect(@queue_overflow.size).to eq(10)
|
40
|
+
expect(@redis_q.size).to eq(10)
|
41
41
|
end
|
42
42
|
|
43
43
|
it 'should restore 10 items' do
|
44
|
-
@manager.perform.
|
44
|
+
expect(@manager.perform).to eq([0, 0])
|
45
45
|
10.times { |i| @queue_overflow << page_factory("http://www.user-doo-bla.com/page_#{i}", code: 200, body: '<html></html>').to_json }
|
46
|
-
@manager.perform.
|
47
|
-
@queue_overflow.size.
|
48
|
-
@redis_q.size.
|
49
|
-
@manager.perform.
|
50
|
-
|
46
|
+
expect(@manager.perform).to eq([0, 10])
|
47
|
+
expect(@queue_overflow.size).to eq(0)
|
48
|
+
expect(@redis_q.size).to eq(10)
|
49
|
+
expect(@manager.perform).to eq([0, 0])
|
51
50
|
end
|
52
51
|
|
53
52
|
it 'should restore 3 items' do
|
54
|
-
|
55
|
-
@manager.perform.should be == [0, 0]
|
53
|
+
expect(@manager.perform).to eq([0, 0])
|
56
54
|
3.times { |i| @queue_overflow << page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>').to_json }
|
57
|
-
@manager.perform.
|
58
|
-
@queue_overflow.size.
|
59
|
-
@redis_q.size.
|
60
|
-
@manager.perform.
|
61
|
-
|
55
|
+
expect(@manager.perform).to eq([0, 3])
|
56
|
+
expect(@queue_overflow.size).to eq(0)
|
57
|
+
expect(@redis_q.size).to eq(3)
|
58
|
+
expect(@manager.perform).to eq([0, 0])
|
62
59
|
end
|
63
60
|
|
64
61
|
it 'should restore 0 items' do
|
65
|
-
|
66
|
-
@manager.perform.should be == [0, 0]
|
62
|
+
expect(@manager.perform).to eq([0, 0])
|
67
63
|
10.times do|i|
|
68
64
|
p = page_factory("http://www.user-doo-bu.com/page_#{i}", code: 200, body: '<html></html>')
|
69
65
|
@storage.add p
|
70
66
|
@queue_overflow << p.to_json
|
71
67
|
end
|
72
|
-
@manager.perform.
|
73
|
-
@queue_overflow.size.
|
74
|
-
@redis_q.size.
|
75
|
-
@manager.perform.
|
76
|
-
|
68
|
+
expect(@manager.perform).to eq([0, 0])
|
69
|
+
expect(@queue_overflow.size).to eq(0)
|
70
|
+
expect(@redis_q.size).to eq(0)
|
71
|
+
expect(@manager.perform).to eq([0, 0])
|
77
72
|
end
|
78
73
|
|
79
74
|
it 'should filter an url based on the spec' do
|
@@ -83,13 +78,11 @@ describe Polipus::QueueOverflow::Manager do
|
|
83
78
|
@manager.url_filter do |page|
|
84
79
|
page.url.to_s.end_with?('page_0') ? false : true
|
85
80
|
end
|
86
|
-
@manager.perform.
|
87
|
-
@queue_overflow.size.
|
88
|
-
@redis_q.size.
|
81
|
+
expect(@manager.perform).to eq([0, 9])
|
82
|
+
expect(@queue_overflow.size).to eq(0)
|
83
|
+
expect(@redis_q.size).to eq(9)
|
89
84
|
@manager.url_filter do |_page|
|
90
85
|
true
|
91
86
|
end
|
92
|
-
|
93
87
|
end
|
94
|
-
|
95
88
|
end
|
@@ -3,12 +3,10 @@ require 'spec_helper'
|
|
3
3
|
require 'polipus/queue_overflow'
|
4
4
|
|
5
5
|
describe Polipus::QueueOverflow do
|
6
|
-
|
7
6
|
before(:all) do
|
8
7
|
@queue_overflow = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test')
|
9
8
|
@queue_overflow_capped = Polipus::QueueOverflow.mongo_queue_capped(nil, 'queue_test_c', max: 20)
|
10
9
|
@queue_overflow_uniq = Polipus::QueueOverflow.mongo_queue(nil, 'queue_test_u', ensure_uniq: true)
|
11
|
-
|
12
10
|
end
|
13
11
|
|
14
12
|
before(:each) do
|
@@ -25,26 +23,24 @@ describe Polipus::QueueOverflow do
|
|
25
23
|
|
26
24
|
it 'should work' do
|
27
25
|
[@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq].each do |q|
|
28
|
-
q.empty
|
29
|
-
q.pop.
|
26
|
+
expect(q.empty?).to be_truthy
|
27
|
+
expect(q.pop).to be_nil
|
30
28
|
q << 'test'
|
31
|
-
q.size.
|
32
|
-
q.pop.
|
33
|
-
q.empty
|
34
|
-
q.pop.
|
35
|
-
q.size.
|
36
|
-
q.empty
|
29
|
+
expect(q.size).to eq(1)
|
30
|
+
expect(q.pop).to eq('test')
|
31
|
+
expect(q.empty?).to be_truthy
|
32
|
+
expect(q.pop).to be_nil
|
33
|
+
expect(q.size).to eq(0)
|
34
|
+
expect(q.empty?).to be_truthy
|
37
35
|
end
|
38
|
-
|
39
36
|
end
|
40
37
|
|
41
38
|
it 'should act as a queue' do
|
42
39
|
[@queue_overflow, @queue_overflow_capped, @queue_overflow_uniq].each do |q|
|
43
40
|
10.times { |i| q << "message_#{i}" }
|
44
|
-
q.size.
|
45
|
-
q.pop.
|
41
|
+
expect(q.size).to eq(10)
|
42
|
+
expect(q.pop).to eq('message_0')
|
46
43
|
end
|
47
|
-
|
48
44
|
end
|
49
45
|
|
50
46
|
it 'should work with complex paylod' do
|
@@ -52,21 +48,19 @@ describe Polipus::QueueOverflow do
|
|
52
48
|
a = { 'a' => [1, 2, 3], 'b' => 'a_string' }
|
53
49
|
q << a.to_json
|
54
50
|
b = q.pop
|
55
|
-
JSON.parse(b).
|
51
|
+
expect(JSON.parse(b)).to eq(a)
|
56
52
|
end
|
57
|
-
|
58
53
|
end
|
59
54
|
|
60
55
|
it 'should honor max items if it is capped' do
|
61
56
|
30.times { |i| @queue_overflow_capped << "message_#{i}" }
|
62
|
-
@queue_overflow_capped.size.
|
63
|
-
@queue_overflow_capped.pop.
|
57
|
+
expect(@queue_overflow_capped.size).to eq(20)
|
58
|
+
expect(@queue_overflow_capped.pop).to eq('message_10')
|
64
59
|
end
|
65
60
|
|
66
61
|
it 'should contains only unique items' do
|
67
62
|
20.times { @queue_overflow_uniq << 'A' }
|
68
63
|
20.times { @queue_overflow_uniq << 'B' }
|
69
|
-
@queue_overflow_uniq.size.
|
64
|
+
expect(@queue_overflow_uniq.size).to eq(2)
|
70
65
|
end
|
71
|
-
|
72
66
|
end
|
@@ -1,6 +1,6 @@
|
|
1
|
-
# encoding: UTF-8
|
2
1
|
require 'spec_helper'
|
3
2
|
require 'polipus/robotex'
|
3
|
+
|
4
4
|
describe Polipus::Robotex do
|
5
5
|
let(:spec_domain) { 'http://www.example.com/' }
|
6
6
|
before(:each) do
|
@@ -19,20 +19,20 @@ Disallow: /locked
|
|
19
19
|
Allow: /locked
|
20
20
|
END
|
21
21
|
stub_request(:get, 'http://www.example.com/robots.txt')
|
22
|
-
|
22
|
+
.to_return(body: robots, status: [200, 'OK'], headers: { 'Content-Type' => 'text/plain' })
|
23
23
|
end
|
24
24
|
|
25
25
|
describe '#initialize' do
|
26
26
|
context 'when no arguments are supplied' do
|
27
27
|
it 'returns a Robotex with the default user-agent' do
|
28
|
-
Polipus::Robotex.new.user_agent.
|
28
|
+
expect(Polipus::Robotex.new.user_agent).to eq("Robotex/#{Polipus::Robotex::VERSION} (http://www.github.com/chriskite/robotex)")
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
32
|
context 'when a user-agent is specified' do
|
33
33
|
it 'returns a Robotex with the specified user-agent' do
|
34
34
|
ua = 'My User Agent'
|
35
|
-
Polipus::Robotex.new(ua).user_agent.
|
35
|
+
expect(Polipus::Robotex.new(ua).user_agent).to eq(ua)
|
36
36
|
end
|
37
37
|
end
|
38
38
|
end
|
@@ -41,28 +41,28 @@ END
|
|
41
41
|
context 'when the robots.txt disallows the user-agent to the url' do
|
42
42
|
it 'returns false' do
|
43
43
|
robotex = Polipus::Robotex.new('bender')
|
44
|
-
robotex.allowed?(spec_domain + 'my_shiny_metal_ass').
|
44
|
+
expect(robotex.allowed?(spec_domain + 'my_shiny_metal_ass')).to be_falsey
|
45
45
|
end
|
46
46
|
end
|
47
47
|
|
48
48
|
context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
|
49
49
|
it 'returns true' do
|
50
50
|
robotex = Polipus::Robotex.new('bender')
|
51
|
-
robotex.allowed?(spec_domain + 'cigars').
|
51
|
+
expect(robotex.allowed?(spec_domain + 'cigars')).to be_truthy
|
52
52
|
end
|
53
53
|
end
|
54
54
|
|
55
55
|
context 'when the robots.txt disallows any user-agent to the url' do
|
56
56
|
it 'returns false' do
|
57
57
|
robotex = Polipus::Robotex.new
|
58
|
-
robotex.allowed?(spec_domain + 'login').
|
58
|
+
expect(robotex.allowed?(spec_domain + 'login')).to be_falsey
|
59
59
|
end
|
60
60
|
end
|
61
61
|
|
62
62
|
context 'when the robots.txt disallows and then allows the url' do
|
63
63
|
it 'returns false' do
|
64
64
|
robotex = Polipus::Robotex.new
|
65
|
-
robotex.allowed?(spec_domain + 'locked').
|
65
|
+
expect(robotex.allowed?(spec_domain + 'locked')).to be_falsey
|
66
66
|
end
|
67
67
|
end
|
68
68
|
end
|
@@ -71,16 +71,15 @@ END
|
|
71
71
|
context 'when no Crawl-Delay is specified for the user-agent' do
|
72
72
|
it 'returns nil' do
|
73
73
|
robotex = Polipus::Robotex.new
|
74
|
-
robotex.delay(spec_domain).
|
74
|
+
expect(robotex.delay(spec_domain)).to be_nil
|
75
75
|
end
|
76
76
|
|
77
77
|
context 'when Crawl-Delay is specified for the user-agent' do
|
78
78
|
it 'returns the delay as a Fixnum' do
|
79
79
|
robotex = Polipus::Robotex.new('msnbot')
|
80
|
-
robotex.delay(spec_domain).
|
80
|
+
expect(robotex.delay(spec_domain)).to eq(20)
|
81
81
|
end
|
82
82
|
end
|
83
83
|
end
|
84
84
|
end
|
85
|
-
|
86
85
|
end
|