powerdlz23 1.2.4 → 1.2.5
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/grell/.rspec +2 -0
- package/grell/.travis.yml +28 -0
- package/grell/CHANGELOG.md +111 -0
- package/grell/Gemfile +7 -0
- package/grell/LICENSE.txt +22 -0
- package/grell/README.md +213 -0
- package/grell/Rakefile +2 -0
- package/grell/grell.gemspec +36 -0
- package/grell/lib/grell/capybara_driver.rb +44 -0
- package/grell/lib/grell/crawler.rb +83 -0
- package/grell/lib/grell/crawler_manager.rb +84 -0
- package/grell/lib/grell/grell_logger.rb +10 -0
- package/grell/lib/grell/page.rb +275 -0
- package/grell/lib/grell/page_collection.rb +62 -0
- package/grell/lib/grell/rawpage.rb +62 -0
- package/grell/lib/grell/reader.rb +18 -0
- package/grell/lib/grell/version.rb +3 -0
- package/grell/lib/grell.rb +11 -0
- package/grell/spec/lib/capybara_driver_spec.rb +38 -0
- package/grell/spec/lib/crawler_manager_spec.rb +174 -0
- package/grell/spec/lib/crawler_spec.rb +361 -0
- package/grell/spec/lib/page_collection_spec.rb +159 -0
- package/grell/spec/lib/page_spec.rb +418 -0
- package/grell/spec/lib/reader_spec.rb +43 -0
- package/grell/spec/spec_helper.rb +66 -0
- package/heartmagic/config.py +1 -0
- package/heartmagic/heart.py +3 -0
- package/heartmagic/pytransform/__init__.py +483 -0
- package/heartmagic/pytransform/_pytransform.dll +0 -0
- package/heartmagic/pytransform/_pytransform.so +0 -0
- package/httpStatusCode/README.md +2 -0
- package/httpStatusCode/httpStatusCode.js +4 -0
- package/httpStatusCode/reasonPhrases.js +344 -0
- package/httpStatusCode/statusCodes.js +344 -0
- package/package.json +1 -1
- package/snapcrawl/.changelog.old.md +157 -0
- package/snapcrawl/.gitattributes +1 -0
- package/snapcrawl/.github/workflows/test.yml +41 -0
- package/snapcrawl/.rspec +3 -0
- package/snapcrawl/.rubocop.yml +23 -0
- package/snapcrawl/CHANGELOG.md +182 -0
- package/snapcrawl/Gemfile +15 -0
- package/snapcrawl/LICENSE +21 -0
- package/snapcrawl/README.md +135 -0
- package/snapcrawl/Runfile +35 -0
- package/snapcrawl/bin/snapcrawl +25 -0
- package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
- package/snapcrawl/lib/snapcrawl/config.rb +60 -0
- package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
- package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
- package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
- package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
- package/snapcrawl/lib/snapcrawl/page.rb +118 -0
- package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
- package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
- package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
- package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
- package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
- package/snapcrawl/lib/snapcrawl/version.rb +3 -0
- package/snapcrawl/lib/snapcrawl.rb +20 -0
- package/snapcrawl/snapcrawl.gemspec +27 -0
- package/snapcrawl/snapcrawl.yml +41 -0
- package/snapcrawl/spec/README.md +16 -0
- package/snapcrawl/spec/approvals/bin/help +26 -0
- package/snapcrawl/spec/approvals/bin/usage +4 -0
- package/snapcrawl/spec/approvals/cli/usage +4 -0
- package/snapcrawl/spec/approvals/config/defaults +15 -0
- package/snapcrawl/spec/approvals/config/minimal +15 -0
- package/snapcrawl/spec/approvals/integration/blacklist +14 -0
- package/snapcrawl/spec/approvals/integration/default-config +14 -0
- package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
- package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
- package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
- package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
- package/snapcrawl/spec/approvals/integration/whitelist +14 -0
- package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
- package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
- package/snapcrawl/spec/server/config.ru +97 -0
- package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
- package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
- package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
- package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
- package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
- package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
- package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
- package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
- package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
- package/snapcrawl/spec/spec_helper.rb +22 -0
- package/snapcrawl/spec/spec_mixin.rb +10 -0

package/grell/spec/lib/crawler_spec.rb
@@ -0,0 +1,361 @@

RSpec.describe Grell::Crawler do
  let(:page_id) { rand(10).floor + 10 }
  let(:parent_page_id) { rand(10).floor }
  let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
  let(:host) { 'http://www.example.com' }
  let(:url) { 'http://www.example.com/test' }
  let(:add_match_block) { nil }
  let(:denylist) { /a^/ }
  let(:allowlist) { /.*/ }
  let(:crawler) do
    Grell::Crawler.new(
      logger: Logger.new(nil),
      driver: double(nil),
      evaluate_in_each_page: script,
      add_match_block: add_match_block,
      denylist: denylist,
      allowlist: allowlist)
  end
  let(:script) { nil }
  let(:body) { 'body' }
  let(:custom_add_match) do
    Proc.new do |collection_page, page|
      collection_page.path == page.path
    end
  end

  before do
    proxy.stub(url).and_return(body: body, code: 200)
  end

  describe '#crawl' do
    before do
      crawler.instance_variable_set('@collection', Grell::PageCollection.new(custom_add_match))
    end

    it 'yields the result if a block is given' do
      result = []
      block = Proc.new { |n| result.push(n) }
      crawler.crawl(page, block)
      expect(result.size).to eq(1)
      expect(result.first.url).to eq(url)
      expect(result.first.visited?).to eq(true)
    end

    it 'rescues any specified exceptions raised during the block execution' do
      block = Proc.new { |n| raise Capybara::Poltergeist::BrowserError, 'Exception' }
      expect{ crawler.crawl(page, block) }.to_not raise_error
      expect(page.status).to eq(404)
    end

    it 'logs interesting information' do
      crawler
      expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
      crawler.crawl(page, nil)
    end

    it 'retries when the block returns :retry' do
      counter = 0
      times_retrying = 2
      block = Proc.new do |n|
        if counter < times_retrying
          counter += 1
          :retry
        end
      end
      crawler.crawl(page, block)
      expect(counter).to eq(times_retrying)
    end

    it 'handles redirects by adding the current_url to the page collection' do
      redirect_url = 'http://www.example.com/test/landing_page'
      allow(page).to receive(:current_url).and_return(redirect_url)
      expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
      crawler.crawl(page, nil)
    end

    context 'without script' do
      it 'does not evaluate a script' do
        expect_any_instance_of(Capybara::Session).not_to receive(:evaluate_script)
        crawler.crawl(page, nil)
      end
    end

    context 'with script' do
      let(:script) { "(typeof(jQuery)!='undefined') && $('.dropdown').addClass('open');" }
      it 'evaluates a script' do
        expect_any_instance_of(Capybara::Session).to receive(:evaluate_script).with(script)
        crawler.crawl(page, nil)
      end
    end
  end

  context '#start_crawling' do
    let(:body) do
      <<-EOS
      <html><head></head><body>
      <a href="/musmis.html">trusmis</a>
      Hello world!
      </body></html>
      EOS
    end
    let(:url_visited) { "http://www.example.com/musmis.html" }

    before do
      proxy.stub(url_visited).and_return(body: 'body', code: 200)
    end

    it 'calls the block we used to start_crawling' do
      result = []
      block = Proc.new { |n| result.push(n) }
      crawler.start_crawling(url, &block)
      expect(result.size).to eq(2)
      expect(result[0].url).to eq(url)
      expect(result[1].url).to eq(url_visited)
    end

  end

  shared_examples_for 'visits all available pages' do
    it 'visits all the pages' do
      crawler.start_crawling(url)
      expect(crawler.collection.visited_pages.size).to eq(visited_pages_count)
    end

    it 'has no more pages to discover' do
      crawler.start_crawling(url)
      expect(crawler.collection.discovered_pages.size).to eq(0)
    end

    it 'contains the allowlisted page and the base page only' do
      crawler.start_crawling(url)
      expect(crawler.collection.visited_pages.map(&:url)).
        to eq(visited_pages)
    end
  end

  context 'the url has no links' do
    let(:body) do
      "<html><head></head><body>
      Hello world!
      </body></html>"
    end
    let(:visited_pages_count) { 1 }
    let(:visited_pages) { ['http://www.example.com/test'] }

    it_behaves_like 'visits all available pages'
  end

  context 'the url has several links' do
    let(:visited_pages_count) { 3 }
    let(:visited_pages) do
      ['http://www.example.com/test', 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
    end
    let(:body) do
      "<html><head></head><body>
      <a href=\"/trusmis.html\">trusmis</a>
      <a href=\"/help.html\">help</a>
      Hello world!
      </body></html>"
    end

    before do
      proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
      proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
    end

    it_behaves_like 'visits all available pages'
  end

  describe '#allowlist' do
    let(:body) do
      "<html><head></head><body>
      <a href=\"/trusmis.html\">trusmis</a>
      <a href=\"/help.html\">help</a>
      Hello world!
      </body></html>"
    end

    before do
      proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
      proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
    end

    context 'using a single string' do
      let(:allowlist) { '/trusmis.html' }
      let(:visited_pages_count) { 2 } # my own page + trusmis
      let(:visited_pages) do
        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'using an array of strings' do
      let(:allowlist) { ['/trusmis.html', '/nothere', 'another.html'] }
      let(:visited_pages_count) { 2 }
      let(:visited_pages) do
        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'using a regexp' do
      let(:allowlist) { /\/trusmis\.html/ }
      let(:visited_pages_count) { 2 }
      let(:visited_pages) do
        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'using an array of regexps' do
      let(:allowlist) { [/\/trusmis\.html/] }
      let(:visited_pages_count) { 2 }
      let(:visited_pages) do
        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'using an empty array' do
      let(:allowlist) { [] }
      let(:visited_pages_count) { 1 } # my own page only
      let(:visited_pages) do
        ['http://www.example.com/test']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'adding all links to the allowlist' do
      let(:allowlist) { ['/trusmis', '/help'] }
      let(:visited_pages_count) { 3 } # all links
      let(:visited_pages) do
        ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
      end

      it_behaves_like 'visits all available pages'
    end
  end


  describe '#denylist' do
    let(:body) do
      "<html><head></head><body>
      <a href=\"/trusmis.html\">trusmis</a>
      <a href=\"/help.html\">help</a>
      Hello world!
      </body></html>"
    end

    before do
      proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
      proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
    end

    context 'using a single string' do
      let(:denylist) { '/trusmis.html' }
      let(:visited_pages_count) {2}
      let(:visited_pages) do
        ['http://www.example.com/test','http://www.example.com/help.html']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'using an array of strings' do
      let(:denylist) { ['/trusmis.html', '/nothere', 'another.html'] }
      let(:visited_pages_count) {2}
      let(:visited_pages) do
        ['http://www.example.com/test','http://www.example.com/help.html']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'using a regexp' do
      let(:denylist) { /\/trusmis\.html/ }
      let(:visited_pages_count) {2}
      let(:visited_pages) do
        ['http://www.example.com/test','http://www.example.com/help.html']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'using an array of regexps' do
      let(:denylist) { [/\/trusmis\.html/] }
      let(:visited_pages_count) {2}
      let(:visited_pages) do
        ['http://www.example.com/test','http://www.example.com/help.html']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'using an empty array' do
      let(:denylist) { [] }
      let(:visited_pages_count) { 3 } # all links
      let(:visited_pages) do
        ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'adding all links to the denylist' do
      let(:denylist) { ['/trusmis', '/help'] }
      let(:visited_pages_count) { 1 }
      let(:visited_pages) do
        ['http://www.example.com/test']
      end

      it_behaves_like 'visits all available pages'
    end
  end


  describe 'allowlisting and denylisting' do
    let(:body) do
      "<html><head></head><body>
      <a href=\"/trusmis.html\">trusmis</a>
      <a href=\"/help.html\">help</a>
      Hello world!
      </body></html>"
    end

    before do
      proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
      proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
    end

    context 'we denylist the only allowlisted page' do
      let(:allowlist) { '/trusmis.html' }
      let(:denylist) { '/trusmis.html' }
      let(:visited_pages_count) { 1 }
      let(:visited_pages) do
        ['http://www.example.com/test']
      end

      it_behaves_like 'visits all available pages'
    end

    context 'we denylist none of the allowlisted pages' do
      let(:allowlist) { '/trusmis.html' }
      let(:denylist) { '/raistlin.html' }
      let(:visited_pages_count) { 2 }
      let(:visited_pages) do
        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
      end

      it_behaves_like 'visits all available pages'
    end
  end


end

package/grell/spec/lib/page_collection_spec.rb
@@ -0,0 +1,159 @@

RSpec.describe Grell::PageCollection do
  let(:add_match_block) do
    Proc.new do |collection_page, page|
      collection_page.url.downcase == page.url.downcase
    end
  end

  let(:collection) { Grell::PageCollection.new(add_match_block) }
  let(:url) { 'http://www.github.com/SomeUser/dragonlance?search=false' }
  let(:url2) { 'http://www.github.com/OtherUser/forgotten?search=false' }

  context 'empty collection' do

    it 'has no visited pages' do
      expect(collection.visited_pages).to be_empty
    end

    it 'has no discovered pages' do
      expect(collection.discovered_pages).to be_empty
    end

    it 'next page is nil' do
      expect(collection.next_page).to be_nil
    end
  end

  context 'one unvisited page' do
    let(:page) { collection.create_page(url, 0) }

    before do
      allow(page).to receive(:visited?).and_return(false)
    end

    it 'has no visited pages' do
      expect(collection.visited_pages).to be_empty
    end

    it 'has one discovered page' do
      expect(collection.discovered_pages).to eq([page])

    end

    it 'next page is the unvisited page' do
      expect(collection.next_page).to eq(page)
    end
  end

  context 'one visited page' do
    let(:page) { collection.create_page(url, 0) }

    before do
      allow(page).to receive(:visited?).and_return(true)
    end

    it 'has one visited page' do
      expect(collection.visited_pages).to eq([page])
    end

    it 'has no discovered pages' do
      expect(collection.discovered_pages).to be_empty
    end

    it 'next page is nil' do
      expect(collection.next_page).to be_nil
    end
  end

  context 'one visited and one unvisited page with the same url' do
    let(:page) { collection.create_page(url, 0) }
    let(:unvisited) { collection.create_page(url.upcase, 0) }

    before do
      allow(page).to receive(:visited?).and_return(true)
      allow(unvisited).to receive(:visited?).and_return(false)
    end

    it 'first page has id 0' do
      expect(page.id).to eq(0)
    end

    it 'second page has id 1' do
      expect(unvisited.id).to eq(1)
    end

    it 'has one visited page' do
      expect(collection.visited_pages).to eq([page])
    end

    it 'has no discovered pages' do
      expect(collection.discovered_pages).to be_empty
    end

    it 'next page is nil' do
      expect(collection.next_page).to be_nil
    end
  end

  context 'one visited and one unvisited page with different URLs' do
    let(:page) { collection.create_page(url, 0) }
    let(:unvisited) { collection.create_page(url2, 0) }

    before do
      allow(page).to receive(:visited?).and_return(true)
      allow(unvisited).to receive(:visited?).and_return(false)
    end

    it 'has one visited page' do
      expect(collection.visited_pages).to eq([page])
    end

    it 'has one discovered page' do
      expect(collection.discovered_pages).to eq([unvisited])
    end

    it 'next page is the unvisited page' do
      expect(collection.next_page).to eq(unvisited)
    end
  end

  context 'one visited and one unvisited page with different URLs only different by the query' do
    let(:page) { collection.create_page(url, 0) }
    let(:url3) { 'http://www.github.com/SomeUser/dragonlance?search=true' }
    let(:unvisited) { collection.create_page(url3, 0) }

    before do
      allow(page).to receive(:visited?).and_return(true)
      allow(unvisited).to receive(:visited?).and_return(false)
    end

    it 'has one visited page' do
      expect(collection.visited_pages).to eq([page])
    end

    it 'has one discovered page' do
      expect(collection.discovered_pages).to eq([unvisited])
    end

    it 'next page is the unvisited page' do
      expect(collection.next_page).to eq(unvisited)
    end
  end

  context 'several unvisited pages' do
    let(:page) { collection.create_page(url, 2) }
    let(:page2) { collection.create_page(url2, 0) }

    before do
      allow(page).to receive(:visited?).and_return(true)
      allow(page2).to receive(:visited?).and_return(false)
    end

    it 'returns the page which has an earlier parent' do
      expect(collection.next_page).to eq(page2)
    end

  end

end