grell 1.6.10 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +17 -3
- data/CHANGELOG.md +24 -0
- data/Gemfile +4 -0
- data/README.md +100 -63
- data/grell.gemspec +6 -7
- data/lib/grell.rb +1 -0
- data/lib/grell/capybara_driver.rb +9 -19
- data/lib/grell/crawler.rb +27 -53
- data/lib/grell/crawler_manager.rb +84 -0
- data/lib/grell/page.rb +2 -1
- data/lib/grell/page_collection.rb +9 -1
- data/lib/grell/rawpage.rb +11 -1
- data/lib/grell/version.rb +1 -1
- data/spec/lib/capybara_driver_spec.rb +3 -8
- data/spec/lib/crawler_manager_spec.rb +174 -0
- data/spec/lib/crawler_spec.rb +52 -93
- data/spec/lib/page_spec.rb +11 -0
- data/spec/spec_helper.rb +1 -0
- metadata +15 -42
data/lib/grell/crawler_manager.rb
ADDED
@@ -0,0 +1,84 @@
+module Grell
+  # Manages the state of the process crawling, does not care about individual pages but about logging,
+  # restarting and quiting the crawler correctly.
+  class CrawlerManager
+    # logger: logger to use for Grell's messages
+    # on_periodic_restart: if set, the driver will restart every :each visits (100 default) and execute the :do block
+    # driver_options: Any extra options for the Capybara driver
+    def initialize(logger: nil, on_periodic_restart: {}, driver: nil)
+      Grell.logger = logger ? logger : Logger.new(STDOUT)
+      @periodic_restart_block = on_periodic_restart[:do]
+      @periodic_restart_period = on_periodic_restart[:each] || PAGES_TO_RESTART
+      @driver = driver || CapybaraDriver.new.setup_capybara
+      if @periodic_restart_period <= 0
+        Grell.logger.warn "GRELL. Restart option misconfigured with a negative period. Ignoring option."
+      end
+    end
+
+    # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
+    def restart
+      Grell.logger.info "GRELL. Driver restarting"
+      @driver.restart
+      Grell.logger.info "GRELL. Driver restarted"
+    end
+
+    # Quits the poltergeist driver.
+    def quit
+      Grell.logger.info "GRELL. Driver quitting"
+      @driver.quit
+    end
+
+    # PhantomJS seems to consume memory increasingly as it crawls, periodic restart allows to restart
+    # the driver, potentially calling a block.
+    def check_periodic_restart(collection)
+      return unless @periodic_restart_block
+      return unless @periodic_restart_period > 0
+      return unless (collection.visited_pages.size % @periodic_restart_period).zero?
+      restart
+      @periodic_restart_block.call
+    end
+
+    def self.cleanup_all_processes
+      PhantomJSManager.new.cleanup_all_processes
+    end
+
+    private
+
+    PAGES_TO_RESTART = 100 # Default number of pages before we restart the driver.
+    KILL_TIMEOUT = 2 # Number of seconds we wait till we kill the process.
+
+    # Manages the PhantomJS process
+    class PhantomJSManager
+      def cleanup_all_processes
+        pids = running_phantomjs_pids
+        return if pids.empty?
+        Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}"
+        pids.each do |pid|
+          Grell.logger.warn "GRELL. Sending KILL to PhantomJS process #{pid}"
+          kill_process(pid.to_i)
+        end
+      end
+
+      def running_phantomjs_pids
+        list_phantomjs_processes_cmd = "ps -ef | grep -E 'bin/phantomjs' | grep -v grep"
+        `#{list_phantomjs_processes_cmd} | awk '{print $2;}'`.split("\n")
+      end
+
+      def kill_process(pid)
+        Process.kill('TERM', pid)
+        force_kill(pid)
+      rescue Errno::ESRCH, Errno::ECHILD
+        # successfully terminated
+      rescue => e
+        Grell.logger.error ["GRELL. PhantomJS process could not be killed", e.message, *e.backtrace].join($/)
+      end
+
+      def force_kill(pid)
+        Timeout.timeout(KILL_TIMEOUT) { Process.wait(pid) }
+      rescue Timeout::Error
+        Process.kill('KILL', pid)
+        Process.wait(pid)
+      end
+    end
+  end
+end
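Note: the new CrawlerManager pulls driver lifecycle handling (logger setup, periodic PhantomJS restarts, shutdown, orphan-process cleanup) out of the crawling logic. A minimal usage sketch against the initializer shown above; the restart block body, the fifty-page period, and the `collection` variable are illustrative, not taken from this diff:

    require 'grell'

    manager = Grell::CrawlerManager.new(
      logger: Logger.new(STDOUT),
      # Restart PhantomJS every 50 visited pages, then run the block.
      on_periodic_restart: { do: proc { puts 'PhantomJS restarted' }, each: 50 }
    )
    manager.check_periodic_restart(collection) # `collection` is a PageCollection kept by the caller
    manager.quit
    Grell::CrawlerManager.cleanup_all_processes # TERM, then KILL, any stray phantomjs pids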
data/lib/grell/page.rb
CHANGED
@@ -26,11 +26,12 @@ module Grell
     end
 
     def navigate
-      # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try
+      # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to workaround inconsistencies on poltergeist
       Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do
         @rawpage.status && !@rawpage.headers.empty? &&
           @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true)
       end
+      @rawpage.wait_for_all_ajax_requests(WAIT_TIME, WAIT_INTERVAL)
       @result_page = VisitedPage.new(@rawpage)
       @timestamp = Time.now
     rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
data/lib/grell/page_collection.rb
CHANGED
@@ -10,7 +10,7 @@ module Grell
     # to the collection or if it is already present will be passed to the initializer.
     def initialize(add_match_block)
       @collection = []
-      @add_match_block = add_match_block
+      @add_match_block = add_match_block || default_add_match
     end
 
     def create_page(url, parent_id)

@@ -50,5 +50,13 @@ module Grell
       end
     end
 
+    # If add_match_block is not provided, url matching to determine if a new page should be added
+    # to the page collection will default to this proc
+    def default_add_match
+      Proc.new do |collection_page, page|
+        collection_page.url.downcase == page.url.downcase
+      end
+    end
+
   end
 end
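Note: since add_match_block now defaults to the case-insensitive URL comparison above, callers only supply a proc when they need different dedup semantics. A hedged sketch of a custom matcher; the query-string-stripping rule is an invented example:

    # Treat URLs that differ only in their query string as the same page,
    # so '/list?page=1' and '/list?page=2' are crawled once.
    custom_add_match = Proc.new do |collection_page, page|
      collection_page.url.split('?').first == page.url.split('?').first
    end

    collection = Grell::PageCollection.new(custom_add_match)
    # Passing nil instead falls back to default_add_match defined above.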
data/lib/grell/rawpage.rb
CHANGED
@@ -27,7 +27,6 @@ module Grell
       all('[href]', visible: true).to_a + all('[data-href]', visible: true).to_a
     end
 
-
     def host
       page.current_host
     end

@@ -36,6 +35,17 @@ module Grell
       page.has_selector?(selector)
     end
 
+    def wait_for_all_ajax_requests(timeout, interval)
+      Timeout::timeout(timeout) do
+        (timeout / interval).ceil.times do
+          jquery_active = page.evaluate_script("typeof jQuery !== 'undefined' && jQuery.active;")
+          break if (!jquery_active || jquery_active.zero?)
+          sleep(interval)
+        end
+      end
+      true
+    end
+
     private
 
     def follow_redirects!
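Note: the new wait_for_all_ajax_requests is doubly bounded: the outer Timeout caps wall-clock time, and (timeout / interval).ceil caps the number of polls of jQuery.active (for example, a 10.0-second timeout with a 0.5-second interval allows at most 20 polls). A standalone sketch of the same pattern, with illustrative names, assuming the caller supplies the quiescence check:

    require 'timeout'

    # Poll a block every `interval` seconds until it returns true,
    # for at most `timeout` seconds overall; mirrors the bounds above.
    def poll_until_idle(timeout, interval)
      Timeout.timeout(timeout) do
        (timeout / interval).ceil.times do # e.g. (10.0 / 0.5).ceil == 20
          break if yield
          sleep(interval)
        end
      end
      true
    end

    # poll_until_idle(10.0, 0.5) { page.evaluate_script('jQuery.active').zero? }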
data/lib/grell/version.rb
CHANGED
@@ -1,3 +1,3 @@
 module Grell
-  VERSION = "1.6.10"
+  VERSION = "2.1.2"
 end

data/spec/lib/capybara_driver_spec.rb
CHANGED
@@ -1,6 +1,9 @@
 
 RSpec.describe Grell::CapybaraDriver do
   let(:ts) { Time.now }
+  before do
+    Grell.logger = Logger.new(nil)
+  end
 
   describe 'setup_capybara' do
     it 'properly registers the poltergeist driver' do

@@ -25,14 +28,6 @@ RSpec.describe Grell::CapybaraDriver do
     end
   end
 
-  describe 'quit' do
-    let(:driver) { Grell::CapybaraDriver.new.setup_capybara }
-    it 'quits the poltergeist driver' do
-      expect_any_instance_of(Capybara::Poltergeist::Driver).to receive(:quit)
-      driver.quit
-    end
-  end
-
   after do
     Timecop.return
 
data/spec/lib/crawler_manager_spec.rb
ADDED
@@ -0,0 +1,174 @@
+RSpec.describe Grell::CrawlerManager do
+  let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+  let(:host) { 'http://www.example.com' }
+  let(:url) { 'http://www.example.com/test' }
+  let(:driver) { double(Grell::CapybaraDriver) }
+  let(:logger) { Logger.new(nil) }
+  let(:crawler_manager) do
+    described_class.new(logger: logger, driver: driver)
+  end
+
+  describe 'initialize' do
+    context 'provides a logger' do
+      let(:logger) { 33 }
+
+      it 'sets custom logger' do
+        crawler_manager
+        expect(Grell.logger).to eq(33)
+        Grell.logger = Logger.new(nil)
+      end
+    end
+
+    context 'does not provides a logger' do
+      let(:logger) { nil }
+
+      it 'sets default logger' do
+        crawler_manager
+        expect(Grell.logger).to be_instance_of(Logger)
+        Grell.logger = Logger.new(nil)
+      end
+    end
+
+    context 'does not provide a driver' do
+      let(:driver) { nil }
+
+      it 'setups a new Capybara driver' do
+        expect_any_instance_of(Grell::CapybaraDriver).to receive(:setup_capybara)
+        crawler_manager
+      end
+    end
+  end
+
+  describe '#quit' do
+    let(:driver) { double }
+
+    it 'quits the poltergeist driver' do
+      expect(logger).to receive(:info).with("GRELL. Driver quitting")
+      expect(driver).to receive(:quit)
+      crawler_manager.quit
+    end
+  end
+
+  describe '#restart' do
+    let(:driver) { double }
+
+    it 'restarts the poltergeist driver' do
+      expect(driver).to receive(:restart)
+      expect(logger).to receive(:info).with("GRELL. Driver restarted")
+      expect(logger).to receive(:info).with("GRELL. Driver restarting")
+      crawler_manager.restart
+    end
+  end
+
+  describe '#check_periodic_restart' do
+    let(:collection) { double }
+
+    context 'Periodic restart not setup' do
+      it 'does not restart' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
+        expect(crawler_manager).not_to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+    end
+
+    context 'Periodic restart setup with default period' do
+      let(:do_something) { proc {} }
+      let(:crawler_manager) do
+        Grell::CrawlerManager.new(
+          logger: logger,
+          driver: driver,
+          on_periodic_restart: { do: do_something }
+        )
+      end
+
+      it 'does not restart after visiting 99 pages' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { 99 }
+        expect(crawler_manager).not_to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+
+      it 'restarts after visiting 100 pages' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
+        expect(crawler_manager).to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+    end
+
+    context 'Periodic restart setup with custom period' do
+      let(:do_something) { proc {} }
+      let(:period) { 50 }
+      let(:crawler_manager) do
+        Grell::CrawlerManager.new(
+          logger: logger,
+          driver: driver,
+          on_periodic_restart: { do: do_something, each: period }
+        )
+      end
+
+      context 'restart option is not positive' do
+        let(:period) { 0 }
+
+        it 'logs a warning' do
+          message = 'GRELL. Restart option misconfigured with a negative period. Ignoring option.'
+          expect(logger).to receive(:warn).with(message)
+          crawler_manager
+        end
+      end
+
+      it 'does not restart after visiting a number different from custom period pages' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { period * 1.2 }
+        expect(crawler_manager).not_to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+
+      it 'restarts after visiting custom period pages' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { period }
+        expect(crawler_manager).to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+    end
+  end
+
+  describe '.cleanup_all_processes' do
+    let(:driver) { double }
+
+    context 'There are some phantomjs processes running' do
+      let(:pids) { [10, 11] }
+      before do
+        allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
+          .to receive(:running_phantomjs_pids).and_return(pids)
+      end
+
+      it 'logs processes pids' do
+        expect(Grell.logger).to receive(:warn).with('GRELL. Killing PhantomJS processes: [10, 11]')
+        expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 10')
+        expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 11')
+        described_class.cleanup_all_processes
+      end
+
+      it 'kills all phantomjs processes' do
+        expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(10)
+        expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(11)
+        described_class.cleanup_all_processes
+      end
+    end
+
+    context 'There are no phantomjs processes running' do
+      let(:pids) { [] }
+      before do
+        allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
+          .to receive(:running_phantomjs_pids).and_return(pids)
+      end
+
+      it 'no warning is logged' do
+        expect(Grell.logger).not_to receive(:warn)
+        described_class.cleanup_all_processes
+      end
+
+      it 'No process is killed' do
+        expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).not_to receive(:kill_process)
+        described_class.cleanup_all_processes
+      end
+    end
+  end
+end
data/spec/lib/crawler_spec.rb
CHANGED
@@ -5,7 +5,19 @@ RSpec.describe Grell::Crawler do
   let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
   let(:host) { 'http://www.example.com' }
   let(:url) { 'http://www.example.com/test' }
-  let(:
+  let(:add_match_block) { nil }
+  let(:denylist) { /a^/ }
+  let(:allowlist) { /.*/ }
+  let(:crawler) do
+    Grell::Crawler.new(
+      logger: Logger.new(nil),
+      driver: double(nil),
+      evaluate_in_each_page: script,
+      add_match_block: add_match_block,
+      denylist: denylist,
+      allowlist: allowlist)
+  end
+  let(:script) { nil }
   let(:body) { 'body' }
   let(:custom_add_match) do
     Proc.new do |collection_page, page|
@@ -17,29 +29,6 @@ RSpec.describe Grell::Crawler do
     proxy.stub(url).and_return(body: body, code: 200)
   end
 
-  describe 'initialize' do
-    it 'can provide your own logger' do
-      Grell::Crawler.new(external_driver: true, logger: 33)
-      expect(Grell.logger).to eq(33)
-      Grell.logger = Logger.new(nil)
-    end
-
-    it 'provides a stdout logger if nothing provided' do
-      crawler
-      expect(Grell.logger).to be_instance_of(Logger)
-    end
-  end
-
-  describe '#quit' do
-    let(:driver) { double }
-    before { allow(Grell::CapybaraDriver).to receive(:setup).and_return(driver) }
-
-    it 'quits the poltergeist driver' do
-      expect(driver).to receive(:quit)
-      crawler.quit
-    end
-  end
-
   describe '#crawl' do
     before do
       crawler.instance_variable_set('@collection', Grell::PageCollection.new(custom_add_match))
@@ -85,6 +74,21 @@ RSpec.describe Grell::Crawler do
       expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
       crawler.crawl(page, nil)
     end
+
+    context 'without script' do
+      it 'does not evaluate a script' do
+        expect_any_instance_of(Capybara::Session).not_to receive(:evaluate_script)
+        crawler.crawl(page, nil)
+      end
+    end
+
+    context 'with script' do
+      let(:script) { "(typeof(jQuery)!='undefined') && $('.dropdown').addClass('open');" }
+      it 'evaluates a script' do
+        expect_any_instance_of(Capybara::Session).to receive(:evaluate_script).with(script)
+        crawler.crawl(page, nil)
+      end
+    end
   end
 
   context '#start_crawling' do
@@ -111,15 +115,6 @@ RSpec.describe Grell::Crawler do
       expect(result[1].url).to eq(url_visited)
     end
 
-    it 'can use a custom url add matcher block' do
-      expect(crawler).to_not receive(:default_add_match)
-      crawler.start_crawling(url, add_match_block: custom_add_match)
-    end
-
-    it 'uses a default url add matched if not provided' do
-      expect(crawler).to receive(:default_add_match).and_return(custom_add_match)
-      crawler.start_crawling(url)
-    end
   end
 
   shared_examples_for 'visits all available pages' do
@@ -133,7 +128,7 @@ RSpec.describe Grell::Crawler do
       expect(crawler.collection.discovered_pages.size).to eq(0)
     end
 
-    it 'contains the whitelisted page and the base page only' do
+    it 'contains the allowlisted page and the base page only' do
       crawler.start_crawling(url)
       expect(crawler.collection.visited_pages.map(&:url)).
         to eq(visited_pages)
@@ -173,7 +168,7 @@ RSpec.describe Grell::Crawler do
     it_behaves_like 'visits all available pages'
   end
 
-  describe '#whitelist' do
+  describe '#allowlist' do
     let(:body) do
       "<html><head></head><body>
 <a href=\"/trusmis.html\">trusmis</a>
@@ -188,10 +183,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using a single string' do
-      before do
-        crawler.whitelist('/trusmis.html')
-      end
-
+      let(:allowlist) { '/trusmis.html' }
       let(:visited_pages_count) { 2 } # my own page + trusmis
       let(:visited_pages) do
         ['http://www.example.com/test', 'http://www.example.com/trusmis.html']

@@ -201,10 +193,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an array of strings' do
-      before do
-        crawler.whitelist(['/trusmis.html', '/nothere', 'another.html'])
-      end
-
+      let(:allowlist) { ['/trusmis.html', '/nothere', 'another.html'] }
       let(:visited_pages_count) { 2 }
       let(:visited_pages) do
         ['http://www.example.com/test', 'http://www.example.com/trusmis.html']

@@ -214,10 +203,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using a regexp' do
-      before do
-        crawler.whitelist(/\/trusmis\.html/)
-      end
-
+      let(:allowlist) { /\/trusmis\.html/ }
       let(:visited_pages_count) { 2 }
       let(:visited_pages) do
         ['http://www.example.com/test', 'http://www.example.com/trusmis.html']

@@ -227,10 +213,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an array of regexps' do
-      before do
-        crawler.whitelist([/\/trusmis\.html/])
-      end
-
+      let(:allowlist) { [/\/trusmis\.html/] }
       let(:visited_pages_count) { 2 }
       let(:visited_pages) do
         ['http://www.example.com/test', 'http://www.example.com/trusmis.html']

@@ -240,10 +223,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an empty array' do
-      before do
-        crawler.whitelist([])
-      end
-
+      let(:allowlist) { [] }
       let(:visited_pages_count) { 1 } # my own page only
      let(:visited_pages) do
         ['http://www.example.com/test']
@@ -252,11 +232,8 @@ RSpec.describe Grell::Crawler do
     it_behaves_like 'visits all available pages'
   end
 
-    context 'adding all links to the whitelist' do
-      before do
-        crawler.whitelist(['/trusmis', '/help'])
-      end
-
+    context 'adding all links to the allowlist' do
+      let(:allowlist) { ['/trusmis', '/help'] }
       let(:visited_pages_count) { 3 } # all links
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
@@ -267,7 +244,7 @@ RSpec.describe Grell::Crawler do
   end
 
 
-  describe '#blacklist' do
+  describe '#denylist' do
     let(:body) do
       "<html><head></head><body>
 <a href=\"/trusmis.html\">trusmis</a>

@@ -282,9 +259,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using a single string' do
-      before do
-        crawler.blacklist('/trusmis.html')
-      end
+      let(:denylist) { '/trusmis.html' }
       let(:visited_pages_count) {2}
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/help.html']
@@ -294,9 +269,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an array of strings' do
-      before do
-        crawler.blacklist(['/trusmis.html', '/nothere', 'another.html'])
-      end
+      let(:denylist) { ['/trusmis.html', '/nothere', 'another.html'] }
       let(:visited_pages_count) {2}
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/help.html']

@@ -306,9 +279,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using a regexp' do
-      before do
-        crawler.blacklist(/\/trusmis\.html/)
-      end
+      let(:denylist) { /\/trusmis\.html/ }
       let(:visited_pages_count) {2}
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/help.html']

@@ -318,9 +289,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an array of regexps' do
-      before do
-        crawler.blacklist([/\/trusmis\.html/])
-      end
+      let(:denylist) { [/\/trusmis\.html/] }
       let(:visited_pages_count) {2}
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/help.html']

@@ -330,9 +299,7 @@ RSpec.describe Grell::Crawler do
     end
 
     context 'using an empty array' do
-      before do
-        crawler.blacklist([])
-      end
+      let(:denylist) { [] }
       let(:visited_pages_count) { 3 } # all links
       let(:visited_pages) do
         ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
@@ -341,10 +308,8 @@ RSpec.describe Grell::Crawler do
     it_behaves_like 'visits all available pages'
   end
 
-    context 'adding all links to the blacklist' do
-      before do
-        crawler.blacklist(['/trusmis', '/help'])
-      end
+    context 'adding all links to the denylist' do
+      let(:denylist) { ['/trusmis', '/help'] }
       let(:visited_pages_count) { 1 }
       let(:visited_pages) do
         ['http://www.example.com/test']

@@ -355,7 +320,7 @@ RSpec.describe Grell::Crawler do
     end
 
 
-  describe 'whitelisting and blacklisting' do
+  describe 'allowlisting and denylisting' do
     let(:body) do
       "<html><head></head><body>
 <a href=\"/trusmis.html\">trusmis</a>
@@ -369,12 +334,9 @@ RSpec.describe Grell::Crawler do
       proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
     end
 
-    context 'we blacklist the only whitelisted page' do
-      before do
-        crawler.whitelist('/trusmis.html')
-        crawler.blacklist('/trusmis.html')
-      end
-
+    context 'we denylist the only allowlisted page' do
+      let(:allowlist) { '/trusmis.html' }
+      let(:denylist) { '/trusmis.html' }
       let(:visited_pages_count) { 1 }
       let(:visited_pages) do
         ['http://www.example.com/test']

@@ -383,12 +345,9 @@ RSpec.describe Grell::Crawler do
       it_behaves_like 'visits all available pages'
     end
 
-    context 'we blacklist none of the whitelisted pages' do
-      before do
-        crawler.whitelist('/trusmis.html')
-        crawler.blacklist('/raistlin.html')
-      end
-
+    context 'we denylist none of the allowlisted pages' do
+      let(:allowlist) { '/trusmis.html' }
+      let(:denylist) { '/raistlin.html' }
       let(:visited_pages_count) { 2 }
       let(:visited_pages) do
         ['http://www.example.com/test', 'http://www.example.com/trusmis.html']