polipus 0.3.7 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.rspec +1 -1
- data/.rubocop.yml +3 -3
- data/.rubocop_todo.yml +1 -1
- data/.travis.yml +14 -4
- data/AUTHORS.md +1 -0
- data/CHANGELOG.md +9 -1
- data/Gemfile +9 -0
- data/README.md +2 -3
- data/Rakefile +1 -3
- data/examples/basic.rb +8 -1
- data/lib/polipus.rb +25 -13
- data/lib/polipus/queue_overflow.rb +1 -0
- data/lib/polipus/queue_overflow/manager.rb +1 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +1 -1
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/storage.rb +10 -16
- data/lib/polipus/storage/mongo_store.rb +6 -1
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +16 -18
- data/spec/{http_spec.rb → polipus/http_spec.rb} +26 -37
- data/spec/{page_spec.rb → polipus/page_spec.rb} +7 -11
- data/spec/{queue_overflow_manager_spec.rb → polipus/queue_overflow/manager_spec.rb} +22 -29
- data/spec/{queue_overflow_spec.rb → polipus/queue_overflow_spec.rb} +14 -20
- data/spec/{robotex_spec.rb → polipus/robotex_spec.rb} +10 -11
- data/spec/{signal_handler_spec.rb → polipus/signal_handler_spec.rb} +2 -6
- data/spec/{storage_memory_spec.rb → polipus/storage/memory_store_spec.rb} +18 -21
- data/spec/{storage_mongo_spec.rb → polipus/storage/mongo_store_spec.rb} +23 -25
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/{url_tracker_spec.rb → polipus/url_tracker_spec.rb} +4 -4
- data/spec/polipus_spec.rb +13 -15
- data/spec/spec_helper.rb +13 -12
- metadata +76 -154
- data/lib/polipus/storage/s3_store.rb +0 -96
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +0 -166
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +0 -166
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +0 -270
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +0 -194
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +0 -183
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +0 -221
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +0 -221
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +0 -221
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +0 -695
- data/spec/storage_s3_spec.rb +0 -115
@@ -1,19 +1,15 @@
|
|
1
|
-
# encoding: UTF-8
|
2
1
|
require 'spec_helper'
|
3
2
|
|
4
3
|
describe Polipus::SignalHandler do
|
5
|
-
|
6
4
|
context 'signal handler' do
|
7
|
-
|
8
5
|
it 'should be enabled by default' do
|
9
6
|
Polipus::PolipusCrawler.new('polipus-rspec', [])
|
10
|
-
Polipus::SignalHandler.enabled
|
7
|
+
expect(Polipus::SignalHandler.enabled?).to be true
|
11
8
|
end
|
12
9
|
|
13
10
|
it 'should be disabled if specified' do
|
14
11
|
Polipus::PolipusCrawler.new('polipus-rspec', [], enable_signal_handler: false)
|
15
|
-
Polipus::SignalHandler.enabled
|
12
|
+
expect(Polipus::SignalHandler.enabled?).to be false
|
16
13
|
end
|
17
|
-
|
18
14
|
end
|
19
15
|
end
|
@@ -3,38 +3,37 @@ require 'spec_helper'
|
|
3
3
|
require 'mongo'
|
4
4
|
require 'polipus/storage/memory_store'
|
5
5
|
describe Polipus::Storage::MemoryStore do
|
6
|
-
|
7
6
|
let(:storage) { Polipus::Storage.memory_store }
|
8
7
|
|
9
8
|
it 'should store a page' do
|
10
9
|
p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
|
11
10
|
uuid = storage.add p
|
12
|
-
uuid.
|
13
|
-
storage.count.
|
11
|
+
expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
|
12
|
+
expect(storage.count).to be 1
|
14
13
|
p = storage.get p
|
15
|
-
p.url.to_s.
|
16
|
-
p.body.
|
14
|
+
expect(p.url.to_s).to eq('http://www.google.com')
|
15
|
+
expect(p.body).to eq('<html></html>')
|
17
16
|
end
|
18
17
|
|
19
18
|
it 'should update a page' do
|
20
19
|
p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
|
21
20
|
storage.add p
|
22
21
|
p = storage.get p
|
23
|
-
p.code.
|
22
|
+
expect(p.code).to eq(301)
|
24
23
|
end
|
25
24
|
|
26
25
|
it 'should iterate over stored pages' do
|
27
26
|
storage.each do |k, page|
|
28
|
-
k.
|
29
|
-
page.url.to_s.
|
27
|
+
expect(k).to eq('ed646a3334ca891fd3467db131372140')
|
28
|
+
expect(page.url.to_s).to eq('http://www.google.com')
|
30
29
|
end
|
31
30
|
end
|
32
31
|
|
33
32
|
it 'should delete a page' do
|
34
33
|
p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
|
35
34
|
storage.remove p
|
36
|
-
storage.get(p).
|
37
|
-
storage.count.
|
35
|
+
expect(storage.get(p)).to be_nil
|
36
|
+
expect(storage.count).to be 0
|
38
37
|
end
|
39
38
|
|
40
39
|
it 'should store a page removing a query string from the uuid generation' do
|
@@ -42,7 +41,7 @@ describe Polipus::Storage::MemoryStore do
|
|
42
41
|
p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
|
43
42
|
storage.include_query_string_in_uuid = false
|
44
43
|
storage.add p
|
45
|
-
storage.exists?(p_no_query).
|
44
|
+
expect(storage.exists?(p_no_query)).to be_truthy
|
46
45
|
storage.remove p
|
47
46
|
end
|
48
47
|
|
@@ -51,7 +50,7 @@ describe Polipus::Storage::MemoryStore do
|
|
51
50
|
p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
|
52
51
|
storage.include_query_string_in_uuid = false
|
53
52
|
storage.add p
|
54
|
-
storage.exists?(p_no_query).
|
53
|
+
expect(storage.exists?(p_no_query)).to be_truthy
|
55
54
|
storage.remove p
|
56
55
|
end
|
57
56
|
|
@@ -59,9 +58,9 @@ describe Polipus::Storage::MemoryStore do
|
|
59
58
|
p = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
|
60
59
|
p.user_data.name = 'Test User Data'
|
61
60
|
storage.add p
|
62
|
-
storage.exists?(p).
|
61
|
+
expect(storage.exists?(p)).to be_truthy
|
63
62
|
p = storage.get(p)
|
64
|
-
p.user_data.name.
|
63
|
+
expect(p.user_data.name).to eq('Test User Data')
|
65
64
|
storage.remove p
|
66
65
|
end
|
67
66
|
|
@@ -70,21 +69,19 @@ describe Polipus::Storage::MemoryStore do
|
|
70
69
|
p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
|
71
70
|
storage.add p
|
72
71
|
p = storage.get p
|
73
|
-
p.body.
|
72
|
+
expect(p.body).to be_empty
|
74
73
|
storage.clear
|
75
74
|
end
|
76
75
|
|
77
76
|
it 'should return false if a doc not exists' do
|
78
77
|
storage.include_query_string_in_uuid = false
|
79
78
|
p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
|
80
|
-
storage.exists?(p_other).
|
79
|
+
expect(storage.exists?(p_other)).to be_falsey
|
81
80
|
storage.add p_other
|
82
|
-
storage.exists?(p_other).
|
81
|
+
expect(storage.exists?(p_other)).to be_truthy
|
83
82
|
p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
|
84
|
-
storage.exists?(p_other).
|
83
|
+
expect(storage.exists?(p_other)).to be_truthy
|
85
84
|
storage.include_query_string_in_uuid = true
|
86
|
-
storage.exists?(p_other).
|
87
|
-
|
85
|
+
expect(storage.exists?(p_other)).to be_falsey
|
88
86
|
end
|
89
|
-
|
90
87
|
end
|
@@ -20,34 +20,34 @@ describe Polipus::Storage::MongoStore do
|
|
20
20
|
it 'should store a page' do
|
21
21
|
p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
|
22
22
|
uuid = @storage.add p
|
23
|
-
uuid.
|
24
|
-
@storage.count.
|
25
|
-
@mongo['_test_pages'].count.
|
23
|
+
expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
|
24
|
+
expect(@storage.count).to be 1
|
25
|
+
expect(@mongo['_test_pages'].count).to be 1
|
26
26
|
p = @storage.get p
|
27
|
-
p.url.to_s.
|
28
|
-
p.body.
|
27
|
+
expect(p.url.to_s).to eq('http://www.google.com')
|
28
|
+
expect(p.body).to eq('<html></html>')
|
29
29
|
end
|
30
30
|
|
31
31
|
it 'should update a page' do
|
32
32
|
p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
|
33
33
|
@storage.add p
|
34
34
|
p = @storage.get p
|
35
|
-
p.code.
|
36
|
-
@mongo['_test_pages'].count.
|
35
|
+
expect(p.code).to eq(301)
|
36
|
+
expect(@mongo['_test_pages'].count).to be 1
|
37
37
|
end
|
38
38
|
|
39
39
|
it 'should iterate over stored pages' do
|
40
40
|
@storage.each do |k, page|
|
41
|
-
k.
|
42
|
-
page.url.to_s.
|
41
|
+
expect(k).to eq('ed646a3334ca891fd3467db131372140')
|
42
|
+
expect(page.url.to_s).to eq('http://www.google.com')
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
46
46
|
it 'should delete a page' do
|
47
47
|
p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
|
48
48
|
@storage.remove p
|
49
|
-
@storage.get(p).
|
50
|
-
@storage.count.
|
49
|
+
expect(@storage.get(p)).to be_nil
|
50
|
+
expect(@storage.count).to be 0
|
51
51
|
end
|
52
52
|
|
53
53
|
it 'should store a page removing a query string from the uuid generation' do
|
@@ -55,7 +55,7 @@ describe Polipus::Storage::MongoStore do
|
|
55
55
|
p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
|
56
56
|
@storage.include_query_string_in_uuid = false
|
57
57
|
@storage.add p
|
58
|
-
@storage.exists?(p_no_query).
|
58
|
+
expect(@storage.exists?(p_no_query)).to be_truthy
|
59
59
|
@storage.remove p
|
60
60
|
end
|
61
61
|
|
@@ -64,7 +64,7 @@ describe Polipus::Storage::MongoStore do
|
|
64
64
|
p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
|
65
65
|
@storage.include_query_string_in_uuid = false
|
66
66
|
@storage.add p
|
67
|
-
@storage.exists?(p_no_query).
|
67
|
+
expect(@storage.exists?(p_no_query)).to be_truthy
|
68
68
|
@storage.remove p
|
69
69
|
end
|
70
70
|
|
@@ -72,9 +72,9 @@ describe Polipus::Storage::MongoStore do
|
|
72
72
|
p = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
|
73
73
|
p.user_data.name = 'Test User Data'
|
74
74
|
@storage.add p
|
75
|
-
@storage.exists?(p).
|
75
|
+
expect(@storage.exists?(p)).to be_truthy
|
76
76
|
p = @storage.get(p)
|
77
|
-
p.user_data.name.
|
77
|
+
expect(p.user_data.name).to eq('Test User Data')
|
78
78
|
@storage.remove p
|
79
79
|
end
|
80
80
|
|
@@ -83,30 +83,29 @@ describe Polipus::Storage::MongoStore do
|
|
83
83
|
p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
|
84
84
|
storage.add p
|
85
85
|
p = storage.get p
|
86
|
-
p.body.
|
86
|
+
expect(p.body).to be_empty
|
87
87
|
storage.clear
|
88
88
|
end
|
89
89
|
|
90
90
|
it 'should return false if a doc not exists' do
|
91
91
|
@storage.include_query_string_in_uuid = false
|
92
92
|
p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
|
93
|
-
@storage.exists?(p_other).
|
93
|
+
expect(@storage.exists?(p_other)).to be_falsey
|
94
94
|
@storage.add p_other
|
95
|
-
@storage.exists?(p_other).
|
95
|
+
expect(@storage.exists?(p_other)).to be_truthy
|
96
96
|
p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
|
97
|
-
@storage.exists?(p_other).
|
97
|
+
expect(@storage.exists?(p_other)).to be_truthy
|
98
98
|
@storage.include_query_string_in_uuid = true
|
99
|
-
@storage.exists?(p_other).
|
100
|
-
|
99
|
+
expect(@storage.exists?(p_other)).to be_falsey
|
101
100
|
end
|
102
101
|
|
103
102
|
it 'should set page.fetched_at based on the id creation' do
|
104
103
|
storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
|
105
104
|
p = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
|
106
105
|
storage.add p
|
107
|
-
p.fetched_at.
|
106
|
+
expect(p.fetched_at).to be_nil
|
108
107
|
p = storage.get p
|
109
|
-
p.fetched_at.
|
108
|
+
expect(p.fetched_at).not_to be_nil
|
110
109
|
end
|
111
110
|
|
112
111
|
it 'should NOT set page.fetched_at if already present' do
|
@@ -115,7 +114,6 @@ describe Polipus::Storage::MongoStore do
|
|
115
114
|
p.fetched_at = 10
|
116
115
|
storage.add p
|
117
116
|
p = storage.get p
|
118
|
-
p.fetched_at.
|
117
|
+
expect(p.fetched_at).to be 10
|
119
118
|
end
|
120
|
-
|
121
119
|
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'polipus/storage/rethink_store'
|
4
|
+
|
5
|
+
describe Polipus::Storage::RethinkStore do
|
6
|
+
before(:all)do
|
7
|
+
@r = RethinkDB::RQL.new
|
8
|
+
@rethink = @r.connect(host: 'localhost', port: 28_015, db: 'polipus_spec')
|
9
|
+
@r.db_create('polipus_spec').run(@rethink) unless @r.db_list.run(@rethink).include?('polipus_spec')
|
10
|
+
@table = 'test_pages'
|
11
|
+
@storage = Polipus::Storage.rethink_store(@rethink, @table)
|
12
|
+
end
|
13
|
+
|
14
|
+
after(:each) do
|
15
|
+
@r.table(@table).delete.run(@rethink)
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should store a page' do
|
19
|
+
page = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
|
20
|
+
uuid = @storage.add page
|
21
|
+
expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
|
22
|
+
expect(@storage.count).to eq(1)
|
23
|
+
expect(@r.table(@table).count.run(@rethink)).to eq(1)
|
24
|
+
page = @storage.get page
|
25
|
+
expect(page.url.to_s).to eq('http://www.google.com')
|
26
|
+
expect(page.body).to eq('<html></html>')
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'should update a page' do
|
30
|
+
page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
|
31
|
+
@storage.add page
|
32
|
+
page = @storage.get page
|
33
|
+
expect(page.code).to eq(301)
|
34
|
+
expect(@r.table(@table).count.run(@rethink)).to eq(1)
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'should iterate over stored pages' do
|
38
|
+
@storage.each do |k, page|
|
39
|
+
expect(k).to eq('ed646a3334ca891fd3467db131372140')
|
40
|
+
expect(page.url.to_s).to eq('http://www.google.com')
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'should delete a page' do
|
45
|
+
page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
|
46
|
+
@storage.remove page
|
47
|
+
expect(@storage.get(page)).to be_nil
|
48
|
+
expect(@storage.count).to be 0
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'should store a page removing a query string from the uuid generation' do
|
52
|
+
page = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
|
53
|
+
p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
|
54
|
+
@storage.include_query_string_in_uuid = false
|
55
|
+
@storage.add page
|
56
|
+
expect(@storage.exists?(p_no_query)).to be_truthy
|
57
|
+
@storage.remove page
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'should store a page removing a query string from the uuid generation no ending slash' do
|
61
|
+
page = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
|
62
|
+
p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
|
63
|
+
@storage.include_query_string_in_uuid = false
|
64
|
+
@storage.add page
|
65
|
+
expect(@storage.exists?(p_no_query)).to be_truthy
|
66
|
+
@storage.remove page
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'should store a page with user data associated' do
|
70
|
+
page = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
|
71
|
+
page.user_data.name = 'Test User Data'
|
72
|
+
@storage.add page
|
73
|
+
expect(@storage.exists?(page)).to be_truthy
|
74
|
+
page = @storage.get(page)
|
75
|
+
expect(page.user_data.name).to eq('Test User Data')
|
76
|
+
@storage.remove page
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'should honor the except parameters' do
|
80
|
+
storage = Polipus::Storage.rethink_store(@rethink, @table, ['body'])
|
81
|
+
page = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
|
82
|
+
storage.add page
|
83
|
+
page = storage.get page
|
84
|
+
expect(page.body).to be_empty
|
85
|
+
storage.clear
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'should return false if a doc not exists' do
|
89
|
+
@storage.include_query_string_in_uuid = false
|
90
|
+
p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
|
91
|
+
expect(@storage.exists?(p_other)).to be_falsey
|
92
|
+
@storage.add p_other
|
93
|
+
expect(@storage.exists?(p_other)).to be_truthy
|
94
|
+
p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
|
95
|
+
expect(@storage.exists?(p_other)).to be_truthy
|
96
|
+
@storage.include_query_string_in_uuid = true
|
97
|
+
expect(@storage.exists?(p_other)).to be_falsey
|
98
|
+
end
|
99
|
+
|
100
|
+
it 'should set page.fetched_at based on the id creation' do
|
101
|
+
storage = Polipus::Storage.rethink_store(@rethink, @table)
|
102
|
+
page = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
|
103
|
+
storage.add page
|
104
|
+
expect(page.fetched_at).to be_nil
|
105
|
+
page = storage.get page
|
106
|
+
expect(page.fetched_at).not_to be_nil
|
107
|
+
end
|
108
|
+
|
109
|
+
it 'should NOT set page.fetched_at if already present' do
|
110
|
+
storage = Polipus::Storage.rethink_store(@rethink, @table)
|
111
|
+
page = page_factory 'http://www.user-doojooo.com', code: 200, body: '<html></html>'
|
112
|
+
page.fetched_at = 10
|
113
|
+
storage.add page
|
114
|
+
page = storage.get page
|
115
|
+
expect(page.fetched_at).to be 10
|
116
|
+
end
|
117
|
+
end
|
@@ -16,14 +16,14 @@ describe Polipus::UrlTracker do
|
|
16
16
|
it 'should work (bf)' do
|
17
17
|
url = 'http://www.asd.com/asd/lol'
|
18
18
|
@bf.visit url
|
19
|
-
@bf.visited?(url).
|
20
|
-
@bf.visited?('http://www.google.com').
|
19
|
+
expect(@bf.visited?(url)).to be_truthy
|
20
|
+
expect(@bf.visited?('http://www.google.com')).to be_falsey
|
21
21
|
end
|
22
22
|
|
23
23
|
it 'should work (redis_set)' do
|
24
24
|
url = 'http://www.asd.com/asd/lol'
|
25
25
|
@set.visit url
|
26
|
-
@set.visited?(url).
|
27
|
-
@set.visited?('http://www.google.com').
|
26
|
+
expect(@set.visited?(url)).to be_truthy
|
27
|
+
expect(@set.visited?('http://www.google.com')).to be_falsey
|
28
28
|
end
|
29
29
|
end
|
data/spec/polipus_spec.rb
CHANGED
@@ -26,31 +26,30 @@ describe Polipus::PolipusCrawler do
|
|
26
26
|
let(:logger) { Logger.new(nil) }
|
27
27
|
|
28
28
|
context 'polipus' do
|
29
|
-
|
30
29
|
it 'should create a polipus instance' do
|
31
|
-
polipus.
|
30
|
+
expect(polipus).to be_an_instance_of Polipus::PolipusCrawler
|
32
31
|
end
|
33
32
|
|
34
33
|
it 'should execute a crawling session' do
|
35
34
|
polipus.takeover
|
36
|
-
polipus.storage.exists?(init_page).
|
37
|
-
polipus.storage.get(init_page).links.count.
|
35
|
+
expect(polipus.storage.exists?(init_page)).to be_truthy
|
36
|
+
expect(polipus.storage.get(init_page).links.count).to be polipus.storage.count
|
38
37
|
end
|
39
38
|
|
40
39
|
it 'should filter unwanted urls' do
|
41
40
|
polipus.skip_links_like(/\/pages\//)
|
42
41
|
polipus.takeover
|
43
|
-
polipus.storage.get(init_page).links
|
44
|
-
.reject { |e| e.path.to_s =~ /\/pages\// }.count.
|
42
|
+
expect(polipus.storage.get(init_page).links
|
43
|
+
.reject { |e| e.path.to_s =~ /\/pages\// }.count).to be polipus.storage.count
|
45
44
|
end
|
46
45
|
|
47
46
|
it 'should follow only wanted urls' do
|
48
47
|
polipus.follow_links_like(/\/pages\//)
|
49
48
|
polipus.follow_links_like(/\/gems$/)
|
50
49
|
polipus.takeover
|
51
|
-
polipus.storage.get(init_page).links
|
50
|
+
expect(polipus.storage.get(init_page).links
|
52
51
|
.reject { |e| ![/\/pages\//, /\/gems$/].any? { |p| e.path =~ p } }
|
53
|
-
.count.
|
52
|
+
.count).to be polipus.storage.count
|
54
53
|
end
|
55
54
|
|
56
55
|
it 'should refresh expired pages' do
|
@@ -60,9 +59,9 @@ describe Polipus::PolipusCrawler do
|
|
60
59
|
page.fetched_at = page.fetched_at - 3600
|
61
60
|
polipus.storage.add(page)
|
62
61
|
end
|
63
|
-
polipus.storage.each { |_id, page| page.expired?(3600).
|
62
|
+
polipus.storage.each { |_id, page| expect(page.expired?(3600)).to be_truthy }
|
64
63
|
polipus.takeover
|
65
|
-
polipus.storage.each { |_id, page| page.expired?(3600).
|
64
|
+
polipus.storage.each { |_id, page| expect(page.expired?(3600)).to be_falsey }
|
66
65
|
end
|
67
66
|
|
68
67
|
it 'should re-download seeder urls no matter what' do
|
@@ -74,7 +73,7 @@ describe Polipus::PolipusCrawler do
|
|
74
73
|
end
|
75
74
|
polipus.takeover
|
76
75
|
polipus.takeover
|
77
|
-
cache_hit['http://rubygems.org/gems'].
|
76
|
+
expect(cache_hit['http://rubygems.org/gems']).to be 2
|
78
77
|
end
|
79
78
|
|
80
79
|
it 'should call on_page_error code blocks when a page has error' do
|
@@ -82,8 +81,8 @@ describe Polipus::PolipusCrawler do
|
|
82
81
|
a_page = nil
|
83
82
|
p.on_page_error { |page| a_page = page }
|
84
83
|
p.takeover
|
85
|
-
a_page.
|
86
|
-
a_page.error.
|
84
|
+
expect(a_page).not_to be_nil
|
85
|
+
expect(a_page.error).not_to be_nil
|
87
86
|
end
|
88
87
|
|
89
88
|
it 'should obey to the robots.txt file' do
|
@@ -92,8 +91,7 @@ describe Polipus::PolipusCrawler do
|
|
92
91
|
polipus = Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
|
93
92
|
polipus.depth_limit = 1
|
94
93
|
polipus.takeover
|
95
|
-
polipus.storage.each { |_id, page| (page.url.path =~ /$\/downloads\//).
|
94
|
+
polipus.storage.each { |_id, page| expect(page.url.path =~ /$\/downloads\//).to be_falsey }
|
96
95
|
end
|
97
|
-
|
98
96
|
end
|
99
97
|
end
|