grell 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +0 -1
- data/lib/grell/capybara_driver.rb +8 -18
- data/lib/grell/crawler_manager.rb +38 -31
- data/lib/grell/version.rb +1 -1
- data/spec/lib/capybara_driver_spec.rb +3 -8
- data/spec/lib/crawler_manager_spec.rb +68 -7
- data/spec/lib/crawler_spec.rb +1 -1
- metadata +2 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d9acf34b301934669bf458555b29ef7c429f936
|
4
|
+
data.tar.gz: 3949e320d75862e04325581986c24209440fd46e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3cd06fcba285f5f894d0127c7de96575f032fff8d10cdcfa2ee5042b287b01babdbd3119c5c1d979c8610ae9ee9d571f8c6144711415acbcf09f3d43ceac567c
|
7
|
+
data.tar.gz: 4dfb9e9d2e83095ca66df36bc8f41082fc37a29995d75e0553f8466eeb0dd36b62c3578861f171c9c29e4815ca257807dbd53dab4eee875301d1613fde6a437a
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# 2.1.0
|
2
|
+
* Delete `driver_options` configuration key as it was never used.
|
3
|
+
* `cleanup_all_processes` is now a class (`self.`) method, as originally intended.
|
4
|
+
|
1
5
|
# 2.0.0
|
2
6
|
* New configuration key `on_periodic_restart`.
|
3
7
|
* The `CrawlerManager.cleanup_all_processes` method destroys all instances of PhantomJS on this machine.
|
data/README.md
CHANGED
@@ -96,7 +96,6 @@ The `Grell:Crawler` class can be passed options to customize its behavior:
|
|
96
96
|
- `blacklist`: Sets up a blacklist filter for URLs to be avoided. Default: no URL is blacklisted.
|
97
97
|
- `add_match_block`: Block evaluated to consider if a given page should be part of the pages to be visited. Default: add unique URLs.
|
98
98
|
- `evaluate_in_each_page`: Javascript block to be evaluated on each page visited. Default: Nothing evaluated.
|
99
|
-
- `driver_options`: Driver options will be passed to the Capybara driver which connects to PhantomJS.
|
100
99
|
|
101
100
|
Grell by default will follow all the links it finds in the site being crawled.
|
102
101
|
It will never follow links linking outside your site.
|
@@ -1,16 +1,10 @@
|
|
1
|
-
|
2
1
|
module Grell
|
3
|
-
|
4
|
-
#
|
2
|
+
# This class sets up the driver for Capybara. It is used internally by the CrawlerManager.
|
3
|
+
# It uses Poltergeist to control PhantomJS
|
5
4
|
class CapybaraDriver
|
6
|
-
include Capybara::DSL
|
7
|
-
|
8
5
|
USER_AGENT = "Mozilla/5.0 (Grell Crawler)".freeze
|
9
6
|
|
10
|
-
|
11
|
-
new.setup_capybara unless options[:external_driver]
|
12
|
-
end
|
13
|
-
|
7
|
+
# Returns a poltergeist driver
|
14
8
|
def setup_capybara
|
15
9
|
@poltergeist_driver = nil
|
16
10
|
|
@@ -20,18 +14,17 @@ module Grell
|
|
20
14
|
Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'"
|
21
15
|
|
22
16
|
Capybara.register_driver driver_name do |app|
|
23
|
-
@poltergeist_driver = Capybara::Poltergeist::Driver.new(app,
|
17
|
+
@poltergeist_driver = Capybara::Poltergeist::Driver.new(app,
|
24
18
|
js_errors: false,
|
25
19
|
inspector: false,
|
26
20
|
phantomjs_logger: FakePoltergeistLogger,
|
27
|
-
phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1']
|
28
|
-
})
|
21
|
+
phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1'])
|
29
22
|
end
|
30
23
|
|
31
24
|
Capybara.default_max_wait_time = 3
|
32
25
|
Capybara.run_server = false
|
33
26
|
Capybara.default_driver = driver_name
|
34
|
-
|
27
|
+
Capybara.current_session.driver.headers = { # The driver gets initialized when modified here
|
35
28
|
"DNT" => 1,
|
36
29
|
"User-Agent" => USER_AGENT
|
37
30
|
}
|
@@ -41,14 +34,11 @@ module Grell
|
|
41
34
|
@poltergeist_driver
|
42
35
|
end
|
43
36
|
|
44
|
-
|
45
|
-
|
46
|
-
end
|
47
|
-
|
37
|
+
# Poltergeist driver needs a class with this signature. The javascript console.log is sent here.
|
38
|
+
# We just discard that information.
|
48
39
|
module FakePoltergeistLogger
|
49
40
|
def self.puts(*)
|
50
41
|
end
|
51
42
|
end
|
52
43
|
end
|
53
|
-
|
54
44
|
end
|
@@ -5,26 +5,26 @@ module Grell
|
|
5
5
|
# logger: logger to use for Grell's messages
|
6
6
|
# on_periodic_restart: if set, the driver will restart every :each visits (100 default) and execute the :do block
|
7
7
|
# driver: Capybara driver to use. If nil, a new one is set up through CapybaraDriver.
|
8
|
-
def initialize(logger: nil, on_periodic_restart: {}, driver: nil
|
8
|
+
def initialize(logger: nil, on_periodic_restart: {}, driver: nil)
|
9
9
|
Grell.logger = logger ? logger : Logger.new(STDOUT)
|
10
10
|
@periodic_restart_block = on_periodic_restart[:do]
|
11
11
|
@periodic_restart_period = on_periodic_restart[:each] || PAGES_TO_RESTART
|
12
|
-
@driver = driver || CapybaraDriver.
|
12
|
+
@driver = driver || CapybaraDriver.new.setup_capybara
|
13
13
|
if @periodic_restart_period <= 0
|
14
|
-
Grell.logger.warn "GRELL
|
14
|
+
Grell.logger.warn "GRELL. Restart option misconfigured with a negative period. Ignoring option."
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
18
|
# Restarts the PhantomJS process without modifying the state of visited and discovered pages.
|
19
19
|
def restart
|
20
|
-
Grell.logger.info "GRELL
|
20
|
+
Grell.logger.info "GRELL. Driver restarting"
|
21
21
|
@driver.restart
|
22
|
-
Grell.logger.info "GRELL
|
22
|
+
Grell.logger.info "GRELL. Driver restarted"
|
23
23
|
end
|
24
24
|
|
25
25
|
# Quits the poltergeist driver.
|
26
26
|
def quit
|
27
|
-
Grell.logger.info "GRELL
|
27
|
+
Grell.logger.info "GRELL. Driver quitting"
|
28
28
|
@driver.quit
|
29
29
|
end
|
30
30
|
|
@@ -38,14 +38,8 @@ module Grell
|
|
38
38
|
@periodic_restart_block.call
|
39
39
|
end
|
40
40
|
|
41
|
-
def cleanup_all_processes
|
42
|
-
|
43
|
-
return if pids.empty?
|
44
|
-
Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}"
|
45
|
-
pids.each do |pid|
|
46
|
-
Grell.logger.warn "Sending KILL to PhantomJS process #{pid}"
|
47
|
-
kill_process(pid.to_i)
|
48
|
-
end
|
41
|
+
def self.cleanup_all_processes
|
42
|
+
PhantomJSManager.new.cleanup_all_processes
|
49
43
|
end
|
50
44
|
|
51
45
|
private
|
@@ -53,25 +47,38 @@ module Grell
|
|
53
47
|
PAGES_TO_RESTART = 100 # Default number of pages before we restart the driver.
|
54
48
|
KILL_TIMEOUT = 2 # Number of seconds we wait till we kill the process.
|
55
49
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
50
|
+
# Manages the PhantomJS process
|
51
|
+
class PhantomJSManager
|
52
|
+
def cleanup_all_processes
|
53
|
+
pids = running_phantomjs_pids
|
54
|
+
return if pids.empty?
|
55
|
+
Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}"
|
56
|
+
pids.each do |pid|
|
57
|
+
Grell.logger.warn "GRELL. Sending KILL to PhantomJS process #{pid}"
|
58
|
+
kill_process(pid.to_i)
|
59
|
+
end
|
60
|
+
end
|
60
61
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
62
|
+
def running_phantomjs_pids
|
63
|
+
list_phantomjs_processes_cmd = "ps -ef | grep -E 'bin/phantomjs' | grep -v grep"
|
64
|
+
`#{list_phantomjs_processes_cmd} | awk '{print $2;}'`.split("\n")
|
65
|
+
end
|
66
|
+
|
67
|
+
def kill_process(pid)
|
68
|
+
Process.kill('TERM', pid)
|
69
|
+
force_kill(pid)
|
70
|
+
rescue Errno::ESRCH, Errno::ECHILD
|
71
|
+
# successfully terminated
|
72
|
+
rescue => e
|
73
|
+
Grell.logger.exception e, "GRELL. PhantomJS process could not be killed"
|
74
|
+
end
|
69
75
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
76
|
+
def force_kill(pid)
|
77
|
+
Timeout.timeout(KILL_TIMEOUT) { Process.wait(pid) }
|
78
|
+
rescue Timeout::Error
|
79
|
+
Process.kill('KILL', pid)
|
80
|
+
Process.wait(pid)
|
81
|
+
end
|
75
82
|
end
|
76
83
|
end
|
77
84
|
end
|
data/lib/grell/version.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
|
2
2
|
RSpec.describe Grell::CapybaraDriver do
|
3
3
|
let(:ts) { Time.now }
|
4
|
+
before do
|
5
|
+
Grell.logger = Logger.new(nil)
|
6
|
+
end
|
4
7
|
|
5
8
|
describe 'setup_capybara' do
|
6
9
|
it 'properly registers the poltergeist driver' do
|
@@ -25,14 +28,6 @@ RSpec.describe Grell::CapybaraDriver do
|
|
25
28
|
end
|
26
29
|
end
|
27
30
|
|
28
|
-
describe 'quit' do
|
29
|
-
let(:driver) { Grell::CapybaraDriver.new.setup_capybara }
|
30
|
-
it 'quits the poltergeist driver' do
|
31
|
-
expect_any_instance_of(Capybara::Poltergeist::Driver).to receive(:quit)
|
32
|
-
driver.quit
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
31
|
after do
|
37
32
|
Timecop.return
|
38
33
|
|
@@ -11,26 +11,39 @@ RSpec.describe Grell::CrawlerManager do
|
|
11
11
|
describe 'initialize' do
|
12
12
|
context 'provides a logger' do
|
13
13
|
let(:logger) { 33 }
|
14
|
+
|
14
15
|
it 'sets custom logger' do
|
15
16
|
crawler_manager
|
16
17
|
expect(Grell.logger).to eq(33)
|
17
18
|
Grell.logger = Logger.new(nil)
|
18
19
|
end
|
19
20
|
end
|
21
|
+
|
20
22
|
context 'does not provides a logger' do
|
21
23
|
let(:logger) { nil }
|
24
|
+
|
22
25
|
it 'sets default logger' do
|
23
26
|
crawler_manager
|
24
27
|
expect(Grell.logger).to be_instance_of(Logger)
|
25
28
|
Grell.logger = Logger.new(nil)
|
26
29
|
end
|
27
30
|
end
|
31
|
+
|
32
|
+
context 'does not provide a driver' do
|
33
|
+
let(:driver) { nil }
|
34
|
+
|
35
|
+
it 'setups a new Capybara driver' do
|
36
|
+
expect_any_instance_of(Grell::CapybaraDriver).to receive(:setup_capybara)
|
37
|
+
crawler_manager
|
38
|
+
end
|
39
|
+
end
|
28
40
|
end
|
29
41
|
|
30
42
|
describe '#quit' do
|
31
43
|
let(:driver) { double }
|
32
44
|
|
33
45
|
it 'quits the poltergeist driver' do
|
46
|
+
expect(logger).to receive(:info).with("GRELL. Driver quitting")
|
34
47
|
expect(driver).to receive(:quit)
|
35
48
|
crawler_manager.quit
|
36
49
|
end
|
@@ -41,14 +54,15 @@ RSpec.describe Grell::CrawlerManager do
|
|
41
54
|
|
42
55
|
it 'restarts the poltergeist driver' do
|
43
56
|
expect(driver).to receive(:restart)
|
44
|
-
expect(logger).to receive(:info).with("GRELL
|
45
|
-
expect(logger).to receive(:info).with("GRELL
|
57
|
+
expect(logger).to receive(:info).with("GRELL. Driver restarted")
|
58
|
+
expect(logger).to receive(:info).with("GRELL. Driver restarting")
|
46
59
|
crawler_manager.restart
|
47
60
|
end
|
48
61
|
end
|
49
62
|
|
50
63
|
describe '#check_periodic_restart' do
|
51
64
|
let(:collection) { double }
|
65
|
+
|
52
66
|
context 'Periodic restart not setup' do
|
53
67
|
it 'does not restart' do
|
54
68
|
allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
|
@@ -56,6 +70,7 @@ RSpec.describe Grell::CrawlerManager do
|
|
56
70
|
crawler_manager.check_periodic_restart(collection)
|
57
71
|
end
|
58
72
|
end
|
73
|
+
|
59
74
|
context 'Periodic restart setup with default period' do
|
60
75
|
let(:do_something) { proc {} }
|
61
76
|
let(:crawler_manager) do
|
@@ -71,12 +86,14 @@ RSpec.describe Grell::CrawlerManager do
|
|
71
86
|
expect(crawler_manager).not_to receive(:restart)
|
72
87
|
crawler_manager.check_periodic_restart(collection)
|
73
88
|
end
|
89
|
+
|
74
90
|
it 'restarts after visiting 100 pages' do
|
75
91
|
allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
|
76
92
|
expect(crawler_manager).to receive(:restart)
|
77
93
|
crawler_manager.check_periodic_restart(collection)
|
78
94
|
end
|
79
95
|
end
|
96
|
+
|
80
97
|
context 'Periodic restart setup with custom period' do
|
81
98
|
let(:do_something) { proc {} }
|
82
99
|
let(:period) { 50 }
|
@@ -88,11 +105,22 @@ RSpec.describe Grell::CrawlerManager do
|
|
88
105
|
)
|
89
106
|
end
|
90
107
|
|
108
|
+
context 'restart option is not positive' do
|
109
|
+
let(:period) { 0 }
|
110
|
+
|
111
|
+
it 'logs a warning' do
|
112
|
+
message = 'GRELL. Restart option misconfigured with a negative period. Ignoring option.'
|
113
|
+
expect(logger).to receive(:warn).with(message)
|
114
|
+
crawler_manager
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
91
118
|
it 'does not restart after visiting a number different from custom period pages' do
|
92
119
|
allow(collection).to receive_message_chain(:visited_pages, :size) { period * 1.2 }
|
93
120
|
expect(crawler_manager).not_to receive(:restart)
|
94
121
|
crawler_manager.check_periodic_restart(collection)
|
95
122
|
end
|
123
|
+
|
96
124
|
it 'restarts after visiting custom period pages' do
|
97
125
|
allow(collection).to receive_message_chain(:visited_pages, :size) { period }
|
98
126
|
expect(crawler_manager).to receive(:restart)
|
@@ -101,13 +129,46 @@ RSpec.describe Grell::CrawlerManager do
|
|
101
129
|
end
|
102
130
|
end
|
103
131
|
|
104
|
-
describe '
|
132
|
+
describe '.cleanup_all_processes' do
|
105
133
|
let(:driver) { double }
|
106
134
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
135
|
+
context 'There are some phantomjs processes running' do
|
136
|
+
let(:pids) { [10, 11] }
|
137
|
+
before do
|
138
|
+
allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
|
139
|
+
.to receive(:running_phantomjs_pids).and_return(pids)
|
140
|
+
end
|
141
|
+
|
142
|
+
it 'logs processes pids' do
|
143
|
+
expect(Grell.logger).to receive(:warn).with('GRELL. Killing PhantomJS processes: [10, 11]')
|
144
|
+
expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 10')
|
145
|
+
expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 11')
|
146
|
+
described_class.cleanup_all_processes
|
147
|
+
end
|
148
|
+
|
149
|
+
it 'kills all phantomjs processes' do
|
150
|
+
expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(10)
|
151
|
+
expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(11)
|
152
|
+
described_class.cleanup_all_processes
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
context 'There are no phantomjs processes running' do
|
157
|
+
let(:pids) { [] }
|
158
|
+
before do
|
159
|
+
allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
|
160
|
+
.to receive(:running_phantomjs_pids).and_return(pids)
|
161
|
+
end
|
162
|
+
|
163
|
+
it 'no warning is logged' do
|
164
|
+
expect(Grell.logger).not_to receive(:warn)
|
165
|
+
described_class.cleanup_all_processes
|
166
|
+
end
|
167
|
+
|
168
|
+
it 'No process is killed' do
|
169
|
+
expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).not_to receive(:kill_process)
|
170
|
+
described_class.cleanup_all_processes
|
171
|
+
end
|
111
172
|
end
|
112
173
|
end
|
113
174
|
end
|
data/spec/lib/crawler_spec.rb
CHANGED
@@ -11,7 +11,7 @@ RSpec.describe Grell::Crawler do
|
|
11
11
|
let(:crawler) do
|
12
12
|
Grell::Crawler.new(
|
13
13
|
logger: Logger.new(nil),
|
14
|
-
|
14
|
+
driver: double(nil),
|
15
15
|
evaluate_in_each_page: script,
|
16
16
|
add_match_block: add_match_block,
|
17
17
|
blacklist: blacklist,
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jordi Polo Carres
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-11-
|
11
|
+
date: 2016-11-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -228,4 +228,3 @@ test_files:
|
|
228
228
|
- spec/lib/page_spec.rb
|
229
229
|
- spec/lib/reader_spec.rb
|
230
230
|
- spec/spec_helper.rb
|
231
|
-
has_rdoc:
|