polipus 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.rspec +1 -1
- data/.rubocop.yml +3 -3
- data/.rubocop_todo.yml +1 -1
- data/.travis.yml +14 -4
- data/AUTHORS.md +1 -0
- data/CHANGELOG.md +9 -1
- data/Gemfile +9 -0
- data/README.md +2 -3
- data/Rakefile +1 -3
- data/examples/basic.rb +8 -1
- data/lib/polipus.rb +25 -13
- data/lib/polipus/queue_overflow.rb +1 -0
- data/lib/polipus/queue_overflow/manager.rb +1 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +1 -1
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/storage.rb +10 -16
- data/lib/polipus/storage/mongo_store.rb +6 -1
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +16 -18
- data/spec/{http_spec.rb → polipus/http_spec.rb} +26 -37
- data/spec/{page_spec.rb → polipus/page_spec.rb} +7 -11
- data/spec/{queue_overflow_manager_spec.rb → polipus/queue_overflow/manager_spec.rb} +22 -29
- data/spec/{queue_overflow_spec.rb → polipus/queue_overflow_spec.rb} +14 -20
- data/spec/{robotex_spec.rb → polipus/robotex_spec.rb} +10 -11
- data/spec/{signal_handler_spec.rb → polipus/signal_handler_spec.rb} +2 -6
- data/spec/{storage_memory_spec.rb → polipus/storage/memory_store_spec.rb} +18 -21
- data/spec/{storage_mongo_spec.rb → polipus/storage/mongo_store_spec.rb} +23 -25
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/{url_tracker_spec.rb → polipus/url_tracker_spec.rb} +4 -4
- data/spec/polipus_spec.rb +13 -15
- data/spec/spec_helper.rb +13 -12
- metadata +76 -154
- data/lib/polipus/storage/s3_store.rb +0 -96
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +0 -166
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +0 -166
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +0 -270
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +0 -194
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +0 -183
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +0 -221
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +0 -221
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +0 -221
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +0 -695
- data/spec/storage_s3_spec.rb +0 -115
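The headline change in 0.4.0 is the new RethinkDB storage backend (`data/lib/polipus/storage/rethink_store.rb`), added alongside the removal of the S3 store. Going by the calls exercised in the new `rethink_store_spec.rb` further down, wiring a crawler to it might look roughly like the sketch below; the connection options, database and table names, and the seed URL are illustrative assumptions, not taken from this diff.

```ruby
# Minimal sketch (not from the gem's docs): connect to a local RethinkDB server
# and hand the resulting storage object to a Polipus crawler.
# Polipus also expects a reachable Redis instance for its queue (not shown here).
require 'polipus'
require 'polipus/storage/rethink_store'
require 'rethinkdb'

# Connection setup mirrors the spec's before(:all) block; host/port/db are assumptions.
conn = RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus')

# Polipus::Storage.rethink_store(connection, table) is the factory the new spec calls.
storage = Polipus::Storage.rethink_store(conn, 'pages')

Polipus.crawler('rubygems', 'https://rubygems.org/', storage: storage) do |crawler|
  crawler.on_page_downloaded do |page|
    puts "#{page.url} fetched (#{page.code})"
  end
end
```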
data/spec/{signal_handler_spec.rb → polipus/signal_handler_spec.rb}
CHANGED
```diff
@@ -1,19 +1,15 @@
-# encoding: UTF-8
 require 'spec_helper'
 
 describe Polipus::SignalHandler do
-
   context 'signal handler' do
-
     it 'should be enabled by default' do
       Polipus::PolipusCrawler.new('polipus-rspec', [])
-      Polipus::SignalHandler.enabled
+      expect(Polipus::SignalHandler.enabled?).to be true
     end
 
     it 'should be disabled if specified' do
       Polipus::PolipusCrawler.new('polipus-rspec', [], enable_signal_handler: false)
-      Polipus::SignalHandler.enabled
+      expect(Polipus::SignalHandler.enabled?).to be false
     end
-
   end
 end
```
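Most of the spec churn in this release is the same mechanical migration from RSpec 2 to RSpec 3 expectation syntax shown above. The removed lines are truncated by the diff viewer, so their exact matchers are an assumption, but the before/after shape is roughly:

```ruby
# RSpec 2 style (presumed form of the truncated removed lines):
Polipus::SignalHandler.enabled?.should be_true

# RSpec 3 style, as added in 0.4.0:
expect(Polipus::SignalHandler.enabled?).to be true
```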
data/spec/{storage_memory_spec.rb → polipus/storage/memory_store_spec.rb}
CHANGED
```diff
@@ -3,38 +3,37 @@ require 'spec_helper'
 require 'mongo'
 require 'polipus/storage/memory_store'
 describe Polipus::Storage::MemoryStore do
-
   let(:storage) { Polipus::Storage.memory_store }
 
   it 'should store a page' do
     p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
     uuid = storage.add p
-    uuid.
-    storage.count.
+    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
+    expect(storage.count).to be 1
     p = storage.get p
-    p.url.to_s.
-    p.body.
+    expect(p.url.to_s).to eq('http://www.google.com')
+    expect(p.body).to eq('<html></html>')
   end
 
   it 'should update a page' do
     p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
     storage.add p
     p = storage.get p
-    p.code.
+    expect(p.code).to eq(301)
   end
 
   it 'should iterate over stored pages' do
     storage.each do |k, page|
-      k.
-      page.url.to_s.
+      expect(k).to eq('ed646a3334ca891fd3467db131372140')
+      expect(page.url.to_s).to eq('http://www.google.com')
     end
   end
 
   it 'should delete a page' do
     p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
     storage.remove p
-    storage.get(p).
-    storage.count.
+    expect(storage.get(p)).to be_nil
+    expect(storage.count).to be 0
   end
 
   it 'should store a page removing a query string from the uuid generation' do
@@ -42,7 +41,7 @@ describe Polipus::Storage::MemoryStore do
     p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
     storage.include_query_string_in_uuid = false
     storage.add p
-    storage.exists?(p_no_query).
+    expect(storage.exists?(p_no_query)).to be_truthy
     storage.remove p
   end
 
@@ -51,7 +50,7 @@ describe Polipus::Storage::MemoryStore do
     p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
     storage.include_query_string_in_uuid = false
     storage.add p
-    storage.exists?(p_no_query).
+    expect(storage.exists?(p_no_query)).to be_truthy
     storage.remove p
   end
 
@@ -59,9 +58,9 @@ describe Polipus::Storage::MemoryStore do
     p = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
     p.user_data.name = 'Test User Data'
     storage.add p
-    storage.exists?(p).
+    expect(storage.exists?(p)).to be_truthy
     p = storage.get(p)
-    p.user_data.name.
+    expect(p.user_data.name).to eq('Test User Data')
     storage.remove p
   end
 
@@ -70,21 +69,19 @@ describe Polipus::Storage::MemoryStore do
     p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
     storage.add p
     p = storage.get p
-    p.body.
+    expect(p.body).to be_empty
     storage.clear
   end
 
   it 'should return false if a doc not exists' do
     storage.include_query_string_in_uuid = false
     p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
-    storage.exists?(p_other).
+    expect(storage.exists?(p_other)).to be_falsey
     storage.add p_other
-    storage.exists?(p_other).
+    expect(storage.exists?(p_other)).to be_truthy
     p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
-    storage.exists?(p_other).
+    expect(storage.exists?(p_other)).to be_truthy
     storage.include_query_string_in_uuid = true
-    storage.exists?(p_other).
-
+    expect(storage.exists?(p_other)).to be_falsey
   end
-
 end
```
data/spec/{storage_mongo_spec.rb → polipus/storage/mongo_store_spec.rb}
CHANGED
```diff
@@ -20,34 +20,34 @@ describe Polipus::Storage::MongoStore do
   it 'should store a page' do
     p = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
     uuid = @storage.add p
-    uuid.
-    @storage.count.
-    @mongo['_test_pages'].count.
+    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
+    expect(@storage.count).to be 1
+    expect(@mongo['_test_pages'].count).to be 1
     p = @storage.get p
-    p.url.to_s.
-    p.body.
+    expect(p.url.to_s).to eq('http://www.google.com')
+    expect(p.body).to eq('<html></html>')
   end
 
   it 'should update a page' do
     p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
     @storage.add p
     p = @storage.get p
-    p.code.
-    @mongo['_test_pages'].count.
+    expect(p.code).to eq(301)
+    expect(@mongo['_test_pages'].count).to be 1
   end
 
   it 'should iterate over stored pages' do
     @storage.each do |k, page|
-      k.
-      page.url.to_s.
+      expect(k).to eq('ed646a3334ca891fd3467db131372140')
+      expect(page.url.to_s).to eq('http://www.google.com')
     end
   end
 
   it 'should delete a page' do
     p = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
     @storage.remove p
-    @storage.get(p).
-    @storage.count.
+    expect(@storage.get(p)).to be_nil
+    expect(@storage.count).to be 0
   end
 
   it 'should store a page removing a query string from the uuid generation' do
@@ -55,7 +55,7 @@ describe Polipus::Storage::MongoStore do
     p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
     @storage.include_query_string_in_uuid = false
     @storage.add p
-    @storage.exists?(p_no_query).
+    expect(@storage.exists?(p_no_query)).to be_truthy
     @storage.remove p
   end
 
@@ -64,7 +64,7 @@ describe Polipus::Storage::MongoStore do
     p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
     @storage.include_query_string_in_uuid = false
     @storage.add p
-    @storage.exists?(p_no_query).
+    expect(@storage.exists?(p_no_query)).to be_truthy
     @storage.remove p
   end
 
@@ -72,9 +72,9 @@ describe Polipus::Storage::MongoStore do
     p = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
     p.user_data.name = 'Test User Data'
     @storage.add p
-    @storage.exists?(p).
+    expect(@storage.exists?(p)).to be_truthy
     p = @storage.get(p)
-    p.user_data.name.
+    expect(p.user_data.name).to eq('Test User Data')
     @storage.remove p
   end
 
@@ -83,30 +83,29 @@ describe Polipus::Storage::MongoStore do
     p = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
     storage.add p
     p = storage.get p
-    p.body.
+    expect(p.body).to be_empty
     storage.clear
   end
 
   it 'should return false if a doc not exists' do
     @storage.include_query_string_in_uuid = false
     p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
-    @storage.exists?(p_other).
+    expect(@storage.exists?(p_other)).to be_falsey
     @storage.add p_other
-    @storage.exists?(p_other).
+    expect(@storage.exists?(p_other)).to be_truthy
     p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
-    @storage.exists?(p_other).
+    expect(@storage.exists?(p_other)).to be_truthy
     @storage.include_query_string_in_uuid = true
-    @storage.exists?(p_other).
-
+    expect(@storage.exists?(p_other)).to be_falsey
   end
 
   it 'should set page.fetched_at based on the id creation' do
     storage = Polipus::Storage.mongo_store(@mongo, '_test_pages')
     p = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
     storage.add p
-    p.fetched_at.
+    expect(p.fetched_at).to be_nil
     p = storage.get p
-    p.fetched_at.
+    expect(p.fetched_at).not_to be_nil
   end
 
   it 'should NOT set page.fetched_at if already present' do
@@ -115,7 +114,6 @@ describe Polipus::Storage::MongoStore do
     p.fetched_at = 10
     storage.add p
     p = storage.get p
-    p.fetched_at.
+    expect(p.fetched_at).to be 10
   end
-
 end
```
data/spec/polipus/storage/rethink_store_spec.rb
ADDED
```diff
@@ -0,0 +1,117 @@
+# encoding: UTF-8
+require 'spec_helper'
+require 'polipus/storage/rethink_store'
+
+describe Polipus::Storage::RethinkStore do
+  before(:all)do
+    @r = RethinkDB::RQL.new
+    @rethink = @r.connect(host: 'localhost', port: 28_015, db: 'polipus_spec')
+    @r.db_create('polipus_spec').run(@rethink) unless @r.db_list.run(@rethink).include?('polipus_spec')
+    @table = 'test_pages'
+    @storage = Polipus::Storage.rethink_store(@rethink, @table)
+  end
+
+  after(:each) do
+    @r.table(@table).delete.run(@rethink)
+  end
+
+  it 'should store a page' do
+    page = page_factory 'http://www.google.com', code: 200, body: '<html></html>'
+    uuid = @storage.add page
+    expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
+    expect(@storage.count).to eq(1)
+    expect(@r.table(@table).count.run(@rethink)).to eq(1)
+    page = @storage.get page
+    expect(page.url.to_s).to eq('http://www.google.com')
+    expect(page.body).to eq('<html></html>')
+  end
+
+  it 'should update a page' do
+    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
+    @storage.add page
+    page = @storage.get page
+    expect(page.code).to eq(301)
+    expect(@r.table(@table).count.run(@rethink)).to eq(1)
+  end
+
+  it 'should iterate over stored pages' do
+    @storage.each do |k, page|
+      expect(k).to eq('ed646a3334ca891fd3467db131372140')
+      expect(page.url.to_s).to eq('http://www.google.com')
+    end
+  end
+
+  it 'should delete a page' do
+    page = page_factory 'http://www.google.com', code: 301, body: '<html></html>'
+    @storage.remove page
+    expect(@storage.get(page)).to be_nil
+    expect(@storage.count).to be 0
+  end
+
+  it 'should store a page removing a query string from the uuid generation' do
+    page = page_factory 'http://www.asd.com/?asd=lol', code: 200, body: '<html></html>'
+    p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1', code: 200, body: '<html></html>'
+    @storage.include_query_string_in_uuid = false
+    @storage.add page
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove page
+  end
+
+  it 'should store a page removing a query string from the uuid generation no ending slash' do
+    page = page_factory 'http://www.asd.com?asd=lol', code: 200, body: '<html></html>'
+    p_no_query = page_factory 'http://www.asd.com', code: 200, body: '<html></html>'
+    @storage.include_query_string_in_uuid = false
+    @storage.add page
+    expect(@storage.exists?(p_no_query)).to be_truthy
+    @storage.remove page
+  end
+
+  it 'should store a page with user data associated' do
+    page = page_factory 'http://www.user.com', code: 200, body: '<html></html>'
+    page.user_data.name = 'Test User Data'
+    @storage.add page
+    expect(@storage.exists?(page)).to be_truthy
+    page = @storage.get(page)
+    expect(page.user_data.name).to eq('Test User Data')
+    @storage.remove page
+  end
+
+  it 'should honor the except parameters' do
+    storage = Polipus::Storage.rethink_store(@rethink, @table, ['body'])
+    page = page_factory 'http://www.user-doo.com', code: 200, body: '<html></html>'
+    storage.add page
+    page = storage.get page
+    expect(page.body).to be_empty
+    storage.clear
+  end
+
+  it 'should return false if a doc not exists' do
+    @storage.include_query_string_in_uuid = false
+    p_other = page_factory 'http://www.asdrrrr.com', code: 200, body: '<html></html>'
+    expect(@storage.exists?(p_other)).to be_falsey
+    @storage.add p_other
+    expect(@storage.exists?(p_other)).to be_truthy
+    p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol', code: 200, body: '<html></html>'
+    expect(@storage.exists?(p_other)).to be_truthy
+    @storage.include_query_string_in_uuid = true
+    expect(@storage.exists?(p_other)).to be_falsey
+  end
+
+  it 'should set page.fetched_at based on the id creation' do
+    storage = Polipus::Storage.rethink_store(@rethink, @table)
+    page = page_factory 'http://www.user-doojo.com', code: 200, body: '<html></html>'
+    storage.add page
+    expect(page.fetched_at).to be_nil
+    page = storage.get page
+    expect(page.fetched_at).not_to be_nil
+  end
+
+  it 'should NOT set page.fetched_at if already present' do
+    storage = Polipus::Storage.rethink_store(@rethink, @table)
+    page = page_factory 'http://www.user-doojooo.com', code: 200, body: '<html></html>'
+    page.fetched_at = 10
+    storage.add page
+    page = storage.get page
+    expect(page.fetched_at).to be 10
+  end
+end
```
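The 'should honor the except parameters' example above suggests the RethinkDB store takes the same optional third argument as the other backends: a list of page attributes to drop before a document is persisted. Reusing the connection from the earlier sketch, skipping the HTML body might look like this (table name illustrative):

```ruby
# Persist pages without their HTML body to keep documents small.
storage = Polipus::Storage.rethink_store(conn, 'pages', ['body'])
```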
data/spec/{url_tracker_spec.rb → polipus/url_tracker_spec.rb}
CHANGED
```diff
@@ -16,14 +16,14 @@ describe Polipus::UrlTracker do
   it 'should work (bf)' do
     url = 'http://www.asd.com/asd/lol'
     @bf.visit url
-    @bf.visited?(url).
-    @bf.visited?('http://www.google.com').
+    expect(@bf.visited?(url)).to be_truthy
+    expect(@bf.visited?('http://www.google.com')).to be_falsey
   end
 
   it 'should work (redis_set)' do
     url = 'http://www.asd.com/asd/lol'
     @set.visit url
-    @set.visited?(url).
-    @set.visited?('http://www.google.com').
+    expect(@set.visited?(url)).to be_truthy
+    expect(@set.visited?('http://www.google.com')).to be_falsey
   end
 end
```
data/spec/polipus_spec.rb
CHANGED
```diff
@@ -26,31 +26,30 @@ describe Polipus::PolipusCrawler do
   let(:logger) { Logger.new(nil) }
 
   context 'polipus' do
-
     it 'should create a polipus instance' do
-      polipus.
+      expect(polipus).to be_an_instance_of Polipus::PolipusCrawler
     end
 
     it 'should execute a crawling session' do
       polipus.takeover
-      polipus.storage.exists?(init_page).
-      polipus.storage.get(init_page).links.count.
+      expect(polipus.storage.exists?(init_page)).to be_truthy
+      expect(polipus.storage.get(init_page).links.count).to be polipus.storage.count
     end
 
     it 'should filter unwanted urls' do
       polipus.skip_links_like(/\/pages\//)
       polipus.takeover
-      polipus.storage.get(init_page).links
-        .reject { |e| e.path.to_s =~ /\/pages\// }.count.
+      expect(polipus.storage.get(init_page).links
+        .reject { |e| e.path.to_s =~ /\/pages\// }.count).to be polipus.storage.count
     end
 
     it 'should follow only wanted urls' do
       polipus.follow_links_like(/\/pages\//)
       polipus.follow_links_like(/\/gems$/)
       polipus.takeover
-      polipus.storage.get(init_page).links
+      expect(polipus.storage.get(init_page).links
        .reject { |e| ![/\/pages\//, /\/gems$/].any? { |p| e.path =~ p } }
-        .count.
+        .count).to be polipus.storage.count
     end
 
     it 'should refresh expired pages' do
@@ -60,9 +59,9 @@ describe Polipus::PolipusCrawler do
         page.fetched_at = page.fetched_at - 3600
         polipus.storage.add(page)
       end
-      polipus.storage.each { |_id, page| page.expired?(3600).
+      polipus.storage.each { |_id, page| expect(page.expired?(3600)).to be_truthy }
       polipus.takeover
-      polipus.storage.each { |_id, page| page.expired?(3600).
+      polipus.storage.each { |_id, page| expect(page.expired?(3600)).to be_falsey }
     end
 
     it 'should re-download seeder urls no matter what' do
@@ -74,7 +73,7 @@ describe Polipus::PolipusCrawler do
       end
       polipus.takeover
      polipus.takeover
-      cache_hit['http://rubygems.org/gems'].
+      expect(cache_hit['http://rubygems.org/gems']).to be 2
     end
 
     it 'should call on_page_error code blocks when a page has error' do
@@ -82,8 +81,8 @@ describe Polipus::PolipusCrawler do
       a_page = nil
       p.on_page_error { |page| a_page = page }
       p.takeover
-      a_page.
-      a_page.error.
+      expect(a_page).not_to be_nil
+      expect(a_page.error).not_to be_nil
     end
 
     it 'should obey to the robots.txt file' do
@@ -92,8 +91,7 @@ describe Polipus::PolipusCrawler do
       polipus = Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'], lopt)
       polipus.depth_limit = 1
       polipus.takeover
-      polipus.storage.each { |_id, page| (page.url.path =~ /$\/downloads\//).
+      polipus.storage.each { |_id, page| expect(page.url.path =~ /$\/downloads\//).to be_falsey }
     end
-
   end
 end
```
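The updated `polipus_spec.rb` also exercises the crawler-side API most users touch: link filters, error callbacks, depth limiting, and `takeover`. A condensed sketch of those calls, with an illustrative seed URL and patterns (Polipus also expects a reachable Redis instance, which is not shown):

```ruby
# Sketch of the crawler calls exercised by the updated polipus_spec.rb.
crawler = Polipus::PolipusCrawler.new('polipus-rspec', ['https://rubygems.org/gems/polipus'])
crawler.depth_limit = 1                       # stop following links after one hop
crawler.follow_links_like(/\/gems$/)          # only enqueue links matching these patterns
crawler.skip_links_like(/\/pages\//)          # never enqueue links matching these patterns
crawler.on_page_error { |page| warn "error on #{page.url}: #{page.error}" }
crawler.takeover                              # run the crawl until the queue is drained
```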