grell 1.6.10 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/grell/crawler_manager.rb ADDED
@@ -0,0 +1,84 @@
+ module Grell
+   # Manages the state of the crawling process. It does not care about individual pages, but about logging,
+   # restarting and quitting the crawler correctly.
+   class CrawlerManager
+     # logger: logger to use for Grell's messages
+     # on_periodic_restart: if set, the driver will restart every :each visits (default 100) and execute the :do block
+     # driver: the Capybara driver to use; a new one is set up when none is provided
+     def initialize(logger: nil, on_periodic_restart: {}, driver: nil)
+       Grell.logger = logger ? logger : Logger.new(STDOUT)
+       @periodic_restart_block = on_periodic_restart[:do]
+       @periodic_restart_period = on_periodic_restart[:each] || PAGES_TO_RESTART
+       @driver = driver || CapybaraDriver.new.setup_capybara
+       if @periodic_restart_period <= 0
+         Grell.logger.warn "GRELL. Restart option misconfigured with a negative period. Ignoring option."
+       end
+     end
+
+     # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
+     def restart
+       Grell.logger.info "GRELL. Driver restarting"
+       @driver.restart
+       Grell.logger.info "GRELL. Driver restarted"
+     end
+
+     # Quits the poltergeist driver.
+     def quit
+       Grell.logger.info "GRELL. Driver quitting"
+       @driver.quit
+     end
+
+     # PhantomJS seems to consume more and more memory as it crawls; a periodic restart lets us restart
+     # the driver, optionally calling a block first.
+     def check_periodic_restart(collection)
+       return unless @periodic_restart_block
+       return unless @periodic_restart_period > 0
+       return unless (collection.visited_pages.size % @periodic_restart_period).zero?
+       restart
+       @periodic_restart_block.call
+     end
+
+     def self.cleanup_all_processes
+       PhantomJSManager.new.cleanup_all_processes
+     end
+
+     private
+
+     PAGES_TO_RESTART = 100 # Default number of pages before we restart the driver.
+     KILL_TIMEOUT = 2 # Number of seconds we wait before we kill the process.
+
+     # Manages the PhantomJS process
+     class PhantomJSManager
+       def cleanup_all_processes
+         pids = running_phantomjs_pids
+         return if pids.empty?
+         Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}"
+         pids.each do |pid|
+           Grell.logger.warn "GRELL. Sending KILL to PhantomJS process #{pid}"
+           kill_process(pid.to_i)
+         end
+       end
+
+       def running_phantomjs_pids
+         list_phantomjs_processes_cmd = "ps -ef | grep -E 'bin/phantomjs' | grep -v grep"
+         `#{list_phantomjs_processes_cmd} | awk '{print $2;}'`.split("\n")
+       end
+
+       def kill_process(pid)
+         Process.kill('TERM', pid)
+         force_kill(pid)
+       rescue Errno::ESRCH, Errno::ECHILD
+         # successfully terminated
+       rescue => e
+         Grell.logger.error ["GRELL. PhantomJS process could not be killed", e.message, *e.backtrace].join($/)
+       end
+
+       def force_kill(pid)
+         Timeout.timeout(KILL_TIMEOUT) { Process.wait(pid) }
+       rescue Timeout::Error
+         Process.kill('KILL', pid)
+         Process.wait(pid)
+       end
+     end
+   end
+ end
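
Note: the new CrawlerManager is configured entirely through its constructor. A minimal usage sketch, assuming grell 2.x is installed; the option values are illustrative, not defaults prescribed by the gem:

    require 'grell'
    require 'logger'

    manager = Grell::CrawlerManager.new(
      logger: Logger.new(STDOUT),
      on_periodic_restart: {
        do: -> { Grell.logger.info 'driver was restarted' },  # executed after every periodic restart
        each: 50                                              # restart every 50 visited pages (default: 100)
      }
    )
    manager.restart   # a restart can also be forced manually
    manager.quit
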
data/lib/grell/page.rb CHANGED
@@ -26,11 +26,12 @@ module Grell
      end

      def navigate
-       # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try or best to workaround inconsistencies on poltergeist
+       # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to work around inconsistencies in poltergeist
        Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do
          @rawpage.status && !@rawpage.headers.empty? &&
            @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true)
        end
+       @rawpage.wait_for_all_ajax_requests(WAIT_TIME, WAIT_INTERVAL)
        @result_page = VisitedPage.new(@rawpage)
        @timestamp = Time.now
      rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
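
Note: navigate now blocks twice: Reader.wait_for polls until an HTML response with headers is available, and the new wait_for_all_ajax_requests (added to RawPage below) then waits for in-flight jQuery requests to settle. For orientation, a reconstruction of the polling contract Reader.wait_for appears to follow; this helper is a sketch, not the gem's actual reader.rb:

    # Calls `action`, then re-checks the given block every `interval` seconds
    # until it returns true or `timeout` seconds have elapsed.
    def wait_for(action, timeout, interval)
      deadline = Time.now + timeout
      loop do
        action.call
        break if yield
        break if Time.now >= deadline
        sleep(interval)
      end
    end
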
data/lib/grell/page_collection.rb CHANGED
@@ -10,7 +10,7 @@ module Grell
      # to the collection or if it is already present will be passed to the initializer.
      def initialize(add_match_block)
        @collection = []
-       @add_match_block = add_match_block
+       @add_match_block = add_match_block || default_add_match
      end

      def create_page(url, parent_id)
@@ -50,5 +50,13 @@ module Grell
        end
      end

+     # If no add_match_block is provided, URL matching to decide whether a new page
+     # should be added to the page collection defaults to this proc.
+     def default_add_match
+       Proc.new do |collection_page, page|
+         collection_page.url.downcase == page.url.downcase
+       end
+     end
+
    end
  end
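
Note: the default matcher treats two URLs as the same page when they are equal ignoring case, so each URL is crawled only once. A looser matcher can still be injected; for instance, one that also ignores query strings and fragments (illustrative only, not a matcher the gem ships):

    require 'uri'

    ignore_query = Proc.new do |collection_page, page|
      key = ->(url) { u = URI(url); "#{u.host}#{u.path}".downcase }
      key.call(collection_page.url) == key.call(page.url)
    end

    collection = Grell::PageCollection.new(ignore_query)
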
data/lib/grell/rawpage.rb CHANGED
@@ -27,7 +27,6 @@ module Grell
        all('[href]', visible: true).to_a + all('[data-href]', visible: true).to_a
      end

-
      def host
        page.current_host
      end
@@ -36,6 +35,17 @@ module Grell
        page.has_selector?(selector)
      end

+     def wait_for_all_ajax_requests(timeout, interval)
+       Timeout::timeout(timeout) do
+         (timeout / interval).ceil.times do
+           jquery_active = page.evaluate_script("typeof jQuery !== 'undefined' && jQuery.active;")
+           break if (!jquery_active || jquery_active.zero?)
+           sleep(interval)
+         end
+       end
+       true
+     end
+
      private

      def follow_redirects!
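
Note: wait_for_all_ajax_requests polls jQuery.active, jQuery's counter of in-flight requests; false (no jQuery on the page) and 0 both mean there is nothing to wait for. The iteration count bounds the number of probes, while the outer Timeout bounds wall-clock time in case evaluate_script itself stalls. The same bounded-polling pattern, extracted as a generic sketch (the probe block and helper name are hypothetical, not part of the gem):

    require 'timeout'

    # Runs `probe` every `interval` seconds until it returns false or zero,
    # giving up after `timeout` seconds in total.
    def poll_until_idle(timeout, interval, &probe)
      Timeout.timeout(timeout) do
        (timeout / interval).ceil.times do
          active = probe.call
          break if !active || active.zero?
          sleep(interval)
        end
      end
      true
    end

    # e.g. poll_until_idle(10, 0.5) { page.evaluate_script("typeof jQuery !== 'undefined' && jQuery.active;") }
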
data/lib/grell/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Grell
-   VERSION = "1.6.10".freeze
+   VERSION = "2.1.2".freeze
  end
data/spec/grell/capybara_driver_spec.rb CHANGED
@@ -1,6 +1,9 @@

  RSpec.describe Grell::CapybaraDriver do
    let(:ts) { Time.now }
+   before do
+     Grell.logger = Logger.new(nil)
+   end

    describe 'setup_capybara' do
      it 'properly registers the poltergeist driver' do
@@ -25,14 +28,6 @@ RSpec.describe Grell::CapybaraDriver do
      end
    end

-   describe 'quit' do
-     let(:driver) { Grell::CapybaraDriver.new.setup_capybara }
-     it 'quits the poltergeist driver' do
-       expect_any_instance_of(Capybara::Poltergeist::Driver).to receive(:quit)
-       driver.quit
-     end
-   end
-
    after do
      Timecop.return

data/spec/grell/crawler_manager_spec.rb ADDED
@@ -0,0 +1,174 @@
+ RSpec.describe Grell::CrawlerManager do
+   let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+   let(:host) { 'http://www.example.com' }
+   let(:url) { 'http://www.example.com/test' }
+   let(:driver) { double(Grell::CapybaraDriver) }
+   let(:logger) { Logger.new(nil) }
+   let(:crawler_manager) do
+     described_class.new(logger: logger, driver: driver)
+   end
+
+   describe 'initialize' do
+     context 'provides a logger' do
+       let(:logger) { 33 }
+
+       it 'sets custom logger' do
+         crawler_manager
+         expect(Grell.logger).to eq(33)
+         Grell.logger = Logger.new(nil)
+       end
+     end
+
+     context 'does not provide a logger' do
+       let(:logger) { nil }
+
+       it 'sets default logger' do
+         crawler_manager
+         expect(Grell.logger).to be_instance_of(Logger)
+         Grell.logger = Logger.new(nil)
+       end
+     end
+
+     context 'does not provide a driver' do
+       let(:driver) { nil }
+
+       it 'sets up a new Capybara driver' do
+         expect_any_instance_of(Grell::CapybaraDriver).to receive(:setup_capybara)
+         crawler_manager
+       end
+     end
+   end
+
+   describe '#quit' do
+     let(:driver) { double }
+
+     it 'quits the poltergeist driver' do
+       expect(logger).to receive(:info).with("GRELL. Driver quitting")
+       expect(driver).to receive(:quit)
+       crawler_manager.quit
+     end
+   end
+
+   describe '#restart' do
+     let(:driver) { double }
+
+     it 'restarts the poltergeist driver' do
+       expect(driver).to receive(:restart)
+       expect(logger).to receive(:info).with("GRELL. Driver restarted")
+       expect(logger).to receive(:info).with("GRELL. Driver restarting")
+       crawler_manager.restart
+     end
+   end
+
+   describe '#check_periodic_restart' do
+     let(:collection) { double }
+
+     context 'Periodic restart not set up' do
+       it 'does not restart' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
+         expect(crawler_manager).not_to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+     end
+
+     context 'Periodic restart set up with default period' do
+       let(:do_something) { proc {} }
+       let(:crawler_manager) do
+         Grell::CrawlerManager.new(
+           logger: logger,
+           driver: driver,
+           on_periodic_restart: { do: do_something }
+         )
+       end
+
+       it 'does not restart after visiting 99 pages' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { 99 }
+         expect(crawler_manager).not_to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+
+       it 'restarts after visiting 100 pages' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
+         expect(crawler_manager).to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+     end
+
+     context 'Periodic restart set up with custom period' do
+       let(:do_something) { proc {} }
+       let(:period) { 50 }
+       let(:crawler_manager) do
+         Grell::CrawlerManager.new(
+           logger: logger,
+           driver: driver,
+           on_periodic_restart: { do: do_something, each: period }
+         )
+       end
+
+       context 'restart option is not positive' do
+         let(:period) { 0 }
+
+         it 'logs a warning' do
+           message = 'GRELL. Restart option misconfigured with a negative period. Ignoring option.'
+           expect(logger).to receive(:warn).with(message)
+           crawler_manager
+         end
+       end
+
+       it 'does not restart after visiting a number of pages different from the custom period' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { period * 1.2 }
+         expect(crawler_manager).not_to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+
+       it 'restarts after visiting the custom period number of pages' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { period }
+         expect(crawler_manager).to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+     end
+   end
+
+   describe '.cleanup_all_processes' do
+     let(:driver) { double }
+
+     context 'There are some phantomjs processes running' do
+       let(:pids) { [10, 11] }
+       before do
+         allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
+           .to receive(:running_phantomjs_pids).and_return(pids)
+       end
+
+       it 'logs process pids' do
+         expect(Grell.logger).to receive(:warn).with('GRELL. Killing PhantomJS processes: [10, 11]')
+         expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 10')
+         expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 11')
+         described_class.cleanup_all_processes
+       end
+
+       it 'kills all phantomjs processes' do
+         expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(10)
+         expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(11)
+         described_class.cleanup_all_processes
+       end
+     end
+
+     context 'There are no phantomjs processes running' do
+       let(:pids) { [] }
+       before do
+         allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
+           .to receive(:running_phantomjs_pids).and_return(pids)
+       end
+
+       it 'logs no warning' do
+         expect(Grell.logger).not_to receive(:warn)
+         described_class.cleanup_all_processes
+       end
+
+       it 'kills no process' do
+         expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).not_to receive(:kill_process)
+         described_class.cleanup_all_processes
+       end
+     end
+   end
+ end
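
Note: cleanup_all_processes is a class method, so it can run without building a manager, for example as a safety net when a crawl job or test suite exits. A sketch; the at_exit placement is this note's suggestion, not something the gem prescribes:

    at_exit do
      # Sends TERM to every phantomjs pid found via ps, escalating to KILL
      # if a process has not exited within KILL_TIMEOUT (2 seconds).
      Grell::CrawlerManager.cleanup_all_processes
    end
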
data/spec/grell/crawler_spec.rb CHANGED
@@ -5,7 +5,19 @@ RSpec.describe Grell::Crawler do
    let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
    let(:host) { 'http://www.example.com' }
    let(:url) { 'http://www.example.com/test' }
-   let(:crawler) { Grell::Crawler.new(logger: Logger.new(nil), external_driver: true) }
+   let(:add_match_block) { nil }
+   let(:denylist) { /a^/ }
+   let(:allowlist) { /.*/ }
+   let(:crawler) do
+     Grell::Crawler.new(
+       logger: Logger.new(nil),
+       driver: double(nil),
+       evaluate_in_each_page: script,
+       add_match_block: add_match_block,
+       denylist: denylist,
+       allowlist: allowlist)
+   end
+   let(:script) { nil }
    let(:body) { 'body' }
    let(:custom_add_match) do
      Proc.new do |collection_page, page|
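
Note: this hunk shows the 2.x API change in one place: allowlisting, denylisting, URL matching, and per-page scripts move from mutating calls on the crawler (the 1.x whitelist/blacklist methods, removed below) to constructor keywords. A construction sketch with illustrative values:

    crawler = Grell::Crawler.new(
      logger: Logger.new(STDOUT),
      allowlist: ['/docs', /article/],                # only follow links matching these
      denylist: '/logout',                            # never follow these
      add_match_block: nil,                           # nil falls back to case-insensitive URL equality
      evaluate_in_each_page: "jQuery.noConflict();"   # script evaluated on every visited page
    )
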
@@ -17,29 +29,6 @@ RSpec.describe Grell::Crawler do
      proxy.stub(url).and_return(body: body, code: 200)
    end

-   describe 'initialize' do
-     it 'can provide your own logger' do
-       Grell::Crawler.new(external_driver: true, logger: 33)
-       expect(Grell.logger).to eq(33)
-       Grell.logger = Logger.new(nil)
-     end
-
-     it 'provides a stdout logger if nothing provided' do
-       crawler
-       expect(Grell.logger).to be_instance_of(Logger)
-     end
-   end
-
-   describe '#quit' do
-     let(:driver) { double }
-     before { allow(Grell::CapybaraDriver).to receive(:setup).and_return(driver) }
-
-     it 'quits the poltergeist driver' do
-       expect(driver).to receive(:quit)
-       crawler.quit
-     end
-   end
-
    describe '#crawl' do
      before do
        crawler.instance_variable_set('@collection', Grell::PageCollection.new(custom_add_match))
@@ -85,6 +74,21 @@ RSpec.describe Grell::Crawler do
        expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
        crawler.crawl(page, nil)
      end
+
+     context 'without script' do
+       it 'does not evaluate a script' do
+         expect_any_instance_of(Capybara::Session).not_to receive(:evaluate_script)
+         crawler.crawl(page, nil)
+       end
+     end
+
+     context 'with script' do
+       let(:script) { "(typeof(jQuery)!='undefined') && $('.dropdown').addClass('open');" }
+       it 'evaluates a script' do
+         expect_any_instance_of(Capybara::Session).to receive(:evaluate_script).with(script)
+         crawler.crawl(page, nil)
+       end
+     end
    end

    context '#start_crawling' do
@@ -111,15 +115,6 @@ RSpec.describe Grell::Crawler do
        expect(result[1].url).to eq(url_visited)
      end

-     it 'can use a custom url add matcher block' do
-       expect(crawler).to_not receive(:default_add_match)
-       crawler.start_crawling(url, add_match_block: custom_add_match)
-     end
-
-     it 'uses a default url add matched if not provided' do
-       expect(crawler).to receive(:default_add_match).and_return(custom_add_match)
-       crawler.start_crawling(url)
-     end
    end

    shared_examples_for 'visits all available pages' do
@@ -133,7 +128,7 @@ RSpec.describe Grell::Crawler do
        expect(crawler.collection.discovered_pages.size).to eq(0)
      end

-     it 'contains the whitelisted page and the base page only' do
+     it 'contains the allowlisted page and the base page only' do
        crawler.start_crawling(url)
        expect(crawler.collection.visited_pages.map(&:url)).
          to eq(visited_pages)
@@ -173,7 +168,7 @@ RSpec.describe Grell::Crawler do
      it_behaves_like 'visits all available pages'
    end

-   describe '#whitelist' do
+   describe '#allowlist' do
      let(:body) do
        "<html><head></head><body>
        <a href=\"/trusmis.html\">trusmis</a>
@@ -188,10 +183,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using a single string' do
-       before do
-         crawler.whitelist('/trusmis.html')
-       end
-
+       let(:allowlist) { '/trusmis.html' }
        let(:visited_pages_count) { 2 } # my own page + trusmis
        let(:visited_pages) do
          ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
@@ -201,10 +193,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using an array of strings' do
-       before do
-         crawler.whitelist(['/trusmis.html', '/nothere', 'another.html'])
-       end
-
+       let(:allowlist) { ['/trusmis.html', '/nothere', 'another.html'] }
        let(:visited_pages_count) { 2 }
        let(:visited_pages) do
          ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
@@ -214,10 +203,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using a regexp' do
-       before do
-         crawler.whitelist(/\/trusmis\.html/)
-       end
-
+       let(:allowlist) { /\/trusmis\.html/ }
        let(:visited_pages_count) { 2 }
        let(:visited_pages) do
          ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
@@ -227,10 +213,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using an array of regexps' do
-       before do
-         crawler.whitelist([/\/trusmis\.html/])
-       end
-
+       let(:allowlist) { [/\/trusmis\.html/] }
        let(:visited_pages_count) { 2 }
        let(:visited_pages) do
          ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
@@ -240,10 +223,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using an empty array' do
-       before do
-         crawler.whitelist([])
-       end
-
+       let(:allowlist) { [] }
        let(:visited_pages_count) { 1 } # my own page only
        let(:visited_pages) do
          ['http://www.example.com/test']
@@ -252,11 +232,8 @@ RSpec.describe Grell::Crawler do
        it_behaves_like 'visits all available pages'
      end

-     context 'adding all links to the whitelist' do
-       before do
-         crawler.whitelist(['/trusmis', '/help'])
-       end
-
+     context 'adding all links to the allowlist' do
+       let(:allowlist) { ['/trusmis', '/help'] }
        let(:visited_pages_count) { 3 } # all links
        let(:visited_pages) do
          ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
@@ -267,7 +244,7 @@ RSpec.describe Grell::Crawler do
    end


-   describe '#blacklist' do
+   describe '#denylist' do
      let(:body) do
        "<html><head></head><body>
        <a href=\"/trusmis.html\">trusmis</a>
@@ -282,9 +259,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using a single string' do
-       before do
-         crawler.blacklist('/trusmis.html')
-       end
+       let(:denylist) { '/trusmis.html' }
        let(:visited_pages_count) {2}
        let(:visited_pages) do
          ['http://www.example.com/test','http://www.example.com/help.html']
@@ -294,9 +269,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using an array of strings' do
-       before do
-         crawler.blacklist(['/trusmis.html', '/nothere', 'another.html'])
-       end
+       let(:denylist) { ['/trusmis.html', '/nothere', 'another.html'] }
        let(:visited_pages_count) {2}
        let(:visited_pages) do
          ['http://www.example.com/test','http://www.example.com/help.html']
@@ -306,9 +279,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using a regexp' do
-       before do
-         crawler.blacklist(/\/trusmis\.html/)
-       end
+       let(:denylist) { /\/trusmis\.html/ }
        let(:visited_pages_count) {2}
        let(:visited_pages) do
          ['http://www.example.com/test','http://www.example.com/help.html']
@@ -318,9 +289,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using an array of regexps' do
-       before do
-         crawler.blacklist([/\/trusmis\.html/])
-       end
+       let(:denylist) { [/\/trusmis\.html/] }
        let(:visited_pages_count) {2}
        let(:visited_pages) do
          ['http://www.example.com/test','http://www.example.com/help.html']
@@ -330,9 +299,7 @@ RSpec.describe Grell::Crawler do
      end

      context 'using an empty array' do
-       before do
-         crawler.blacklist([])
-       end
+       let(:denylist) { [] }
        let(:visited_pages_count) { 3 } # all links
        let(:visited_pages) do
          ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
@@ -341,10 +308,8 @@ RSpec.describe Grell::Crawler do
        it_behaves_like 'visits all available pages'
      end

-     context 'adding all links to the whitelist' do
-       before do
-         crawler.blacklist(['/trusmis', '/help'])
-       end
+     context 'adding all links to the denylist' do
+       let(:denylist) { ['/trusmis', '/help'] }
        let(:visited_pages_count) { 1 }
        let(:visited_pages) do
          ['http://www.example.com/test']
@@ -355,7 +320,7 @@ RSpec.describe Grell::Crawler do
    end


-   describe 'Whitelisting and blacklisting' do
+   describe 'allowlisting and denylisting' do
      let(:body) do
        "<html><head></head><body>
        <a href=\"/trusmis.html\">trusmis</a>
@@ -369,12 +334,9 @@ RSpec.describe Grell::Crawler do
        proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
      end

-     context 'we blacklist the only whitelisted page' do
-       before do
-         crawler.whitelist('/trusmis.html')
-         crawler.blacklist('/trusmis.html')
-       end
-
+     context 'we denylist the only allowlisted page' do
+       let(:allowlist) { '/trusmis.html' }
+       let(:denylist) { '/trusmis.html' }
        let(:visited_pages_count) { 1 }
        let(:visited_pages) do
          ['http://www.example.com/test']
@@ -383,12 +345,9 @@ RSpec.describe Grell::Crawler do
        it_behaves_like 'visits all available pages'
      end

-     context 'we blacklist none of the whitelisted pages' do
-       before do
-         crawler.whitelist('/trusmis.html')
-         crawler.blacklist('/raistlin.html')
-       end
-
+     context 'we denylist none of the allowlisted pages' do
+       let(:allowlist) { '/trusmis.html' }
+       let(:denylist) { '/raistlin.html' }
        let(:visited_pages_count) { 2 }
        let(:visited_pages) do
          ['http://www.example.com/test', 'http://www.example.com/trusmis.html']