grell 2.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bbd2a19c7858d2e755e7d51a038b84901835f32b
4
- data.tar.gz: 7c65a731bfdacb7b65cf523c8ba128f1f3390f96
3
+ metadata.gz: 8d9acf34b301934669bf458555b29ef7c429f936
4
+ data.tar.gz: 3949e320d75862e04325581986c24209440fd46e
5
5
  SHA512:
6
- metadata.gz: fcf31c442f8d51cd4a9534270cc3afd7f5bac432a74a9ec2e429914145df710131e0a99aca61e4cd98d2e831a5cedb511c21dce9d1562881541fd1bfb4016014
7
- data.tar.gz: c413503a4a8765fadbec10ed64467ba140b8739b6209ae4a19e585549ad689231b64b3d670e1bdceec560597051d93f8b90d696f66571cb1dad175f15af1b4a0
6
+ metadata.gz: 3cd06fcba285f5f894d0127c7de96575f032fff8d10cdcfa2ee5042b287b01babdbd3119c5c1d979c8610ae9ee9d571f8c6144711415acbcf09f3d43ceac567c
7
+ data.tar.gz: 4dfb9e9d2e83095ca66df36bc8f41082fc37a29995d75e0553f8466eeb0dd36b62c3578861f171c9c29e4815ca257807dbd53dab4eee875301d1613fde6a437a
@@ -1,3 +1,7 @@
1
+ # 2.1.0
2
+ * Delete `driver_options` configuration key as it was never used.
3
+ * `cleanup_all_processes` is now a class (`self`) method, as originally intended.
4
+
1
5
  # 2.0.0
2
6
  * New configuration key `on_periodic_restart`.
3
7
  * CrawlerManager.cleanup_all_processes method destroys all instances of phantomjs on this machine.
data/README.md CHANGED
@@ -96,7 +96,6 @@ The `Grell:Crawler` class can be passed options to customize its behavior:
96
96
  - `blacklist`: Setups a blacklist filter for URLs to be avoided. Default: no URL is blacklisted.
97
97
  - `add_match_block`: Block evaluated to consider if a given page should be part of the pages to be visited. Default: add unique URLs.
98
98
  - `evaluate_in_each_page`: Javascript block to be evaluated on each page visited. Default: Nothing evaluated.
99
- - `driver_options`: Driver options will be passed to the Capybara driver which connects to PhantomJS.
100
99
 
101
100
  Grell by default will follow all the links it finds in the site being crawled.
102
101
  It will never follow links linking outside your site.
@@ -1,16 +1,10 @@
1
-
2
1
  module Grell
3
-
4
- #The driver for Capybara. It uses Portelgeist to control PhantomJS
2
+ # This class sets up the driver for Capybara. Used internally by the CrawlerManager
3
+ # It uses Poltergeist to control PhantomJS
5
4
  class CapybaraDriver
6
- include Capybara::DSL
7
-
8
5
  USER_AGENT = "Mozilla/5.0 (Grell Crawler)".freeze
9
6
 
10
- def self.setup(options)
11
- new.setup_capybara unless options[:external_driver]
12
- end
13
-
7
+ # Returns a poltergeist driver
14
8
  def setup_capybara
15
9
  @poltergeist_driver = nil
16
10
 
@@ -20,18 +14,17 @@ module Grell
20
14
  Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'"
21
15
 
22
16
  Capybara.register_driver driver_name do |app|
23
- @poltergeist_driver = Capybara::Poltergeist::Driver.new(app, {
17
+ @poltergeist_driver = Capybara::Poltergeist::Driver.new(app,
24
18
  js_errors: false,
25
19
  inspector: false,
26
20
  phantomjs_logger: FakePoltergeistLogger,
27
- phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1']
28
- })
21
+ phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1'])
29
22
  end
30
23
 
31
24
  Capybara.default_max_wait_time = 3
32
25
  Capybara.run_server = false
33
26
  Capybara.default_driver = driver_name
34
- page.driver.headers = {
27
+ Capybara.current_session.driver.headers = { # The driver gets initialized when modified here
35
28
  "DNT" => 1,
36
29
  "User-Agent" => USER_AGENT
37
30
  }
@@ -41,14 +34,11 @@ module Grell
41
34
  @poltergeist_driver
42
35
  end
43
36
 
44
- def quit
45
- @poltergeist_driver.quit
46
- end
47
-
37
+ # Poltergeist driver needs a class with this signature. The javascript console.log is sent here.
38
+ # We just discard that information.
48
39
  module FakePoltergeistLogger
49
40
  def self.puts(*)
50
41
  end
51
42
  end
52
43
  end
53
-
54
44
  end
@@ -5,26 +5,26 @@ module Grell
5
5
  # logger: logger to use for Grell's messages
6
6
  # on_periodic_restart: if set, the driver will restart every :each visits (100 default) and execute the :do block
7
7
  # driver_options: Any extra options for the Capybara driver
8
- def initialize(logger: nil, on_periodic_restart: {}, driver: nil, driver_options: {})
8
+ def initialize(logger: nil, on_periodic_restart: {}, driver: nil)
9
9
  Grell.logger = logger ? logger : Logger.new(STDOUT)
10
10
  @periodic_restart_block = on_periodic_restart[:do]
11
11
  @periodic_restart_period = on_periodic_restart[:each] || PAGES_TO_RESTART
12
- @driver = driver || CapybaraDriver.setup(driver_options)
12
+ @driver = driver || CapybaraDriver.new.setup_capybara
13
13
  if @periodic_restart_period <= 0
14
- Grell.logger.warn "GRELL being misconfigured with a negative period to restart. Ignoring option."
14
+ Grell.logger.warn "GRELL. Restart option misconfigured with a negative period. Ignoring option."
15
15
  end
16
16
  end
17
17
 
18
18
  # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
19
19
  def restart
20
- Grell.logger.info "GRELL is restarting"
20
+ Grell.logger.info "GRELL. Driver restarting"
21
21
  @driver.restart
22
- Grell.logger.info "GRELL has restarted"
22
+ Grell.logger.info "GRELL. Driver restarted"
23
23
  end
24
24
 
25
25
  # Quits the poltergeist driver.
26
26
  def quit
27
- Grell.logger.info "GRELL is quitting the poltergeist driver"
27
+ Grell.logger.info "GRELL. Driver quitting"
28
28
  @driver.quit
29
29
  end
30
30
 
@@ -38,14 +38,8 @@ module Grell
38
38
  @periodic_restart_block.call
39
39
  end
40
40
 
41
- def cleanup_all_processes
42
- pids = running_phantomjs_pids
43
- return if pids.empty?
44
- Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}"
45
- pids.each do |pid|
46
- Grell.logger.warn "Sending KILL to PhantomJS process #{pid}"
47
- kill_process(pid.to_i)
48
- end
41
+ def self.cleanup_all_processes
42
+ PhantomJSManager.new.cleanup_all_processes
49
43
  end
50
44
 
51
45
  private
@@ -53,25 +47,38 @@ module Grell
53
47
  PAGES_TO_RESTART = 100 # Default number of pages before we restart the driver.
54
48
  KILL_TIMEOUT = 2 # Number of seconds we wait till we kill the process.
55
49
 
56
- def running_phantomjs_pids
57
- list_phantomjs_processes_cmd = "ps -ef | grep -E 'bin/phantomjs' | grep -v grep"
58
- `#{list_phantomjs_processes_cmd} | awk '{print $2;}'`.split("\n")
59
- end
50
+ # Manages the PhantomJS process
51
+ class PhantomJSManager
52
+ def cleanup_all_processes
53
+ pids = running_phantomjs_pids
54
+ return if pids.empty?
55
+ Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}"
56
+ pids.each do |pid|
57
+ Grell.logger.warn "GRELL. Sending KILL to PhantomJS process #{pid}"
58
+ kill_process(pid.to_i)
59
+ end
60
+ end
60
61
 
61
- def kill_process(pid)
62
- Process.kill('TERM', pid)
63
- force_kill(pid)
64
- rescue Errno::ESRCH, Errno::ECHILD
65
- # successfully terminated
66
- rescue => e
67
- Grell.logger.exception e, "PhantomJS process could not be killed"
68
- end
62
+ def running_phantomjs_pids
63
+ list_phantomjs_processes_cmd = "ps -ef | grep -E 'bin/phantomjs' | grep -v grep"
64
+ `#{list_phantomjs_processes_cmd} | awk '{print $2;}'`.split("\n")
65
+ end
66
+
67
+ def kill_process(pid)
68
+ Process.kill('TERM', pid)
69
+ force_kill(pid)
70
+ rescue Errno::ESRCH, Errno::ECHILD
71
+ # successfully terminated
72
+ rescue => e
73
+ Grell.logger.exception e, "GRELL. PhantomJS process could not be killed"
74
+ end
69
75
 
70
- def force_kill(pid)
71
- Timeout.timeout(KILL_TIMEOUT) { Process.wait(pid) }
72
- rescue Timeout::Error
73
- Process.kill('KILL', pid)
74
- Process.wait(pid)
76
+ def force_kill(pid)
77
+ Timeout.timeout(KILL_TIMEOUT) { Process.wait(pid) }
78
+ rescue Timeout::Error
79
+ Process.kill('KILL', pid)
80
+ Process.wait(pid)
81
+ end
75
82
  end
76
83
  end
77
84
  end
@@ -1,3 +1,3 @@
1
1
  module Grell
2
- VERSION = "2.0.0".freeze
2
+ VERSION = "2.1.0".freeze
3
3
  end
@@ -1,6 +1,9 @@
1
1
 
2
2
  RSpec.describe Grell::CapybaraDriver do
3
3
  let(:ts) { Time.now }
4
+ before do
5
+ Grell.logger = Logger.new(nil)
6
+ end
4
7
 
5
8
  describe 'setup_capybara' do
6
9
  it 'properly registers the poltergeist driver' do
@@ -25,14 +28,6 @@ RSpec.describe Grell::CapybaraDriver do
25
28
  end
26
29
  end
27
30
 
28
- describe 'quit' do
29
- let(:driver) { Grell::CapybaraDriver.new.setup_capybara }
30
- it 'quits the poltergeist driver' do
31
- expect_any_instance_of(Capybara::Poltergeist::Driver).to receive(:quit)
32
- driver.quit
33
- end
34
- end
35
-
36
31
  after do
37
32
  Timecop.return
38
33
 
@@ -11,26 +11,39 @@ RSpec.describe Grell::CrawlerManager do
11
11
  describe 'initialize' do
12
12
  context 'provides a logger' do
13
13
  let(:logger) { 33 }
14
+
14
15
  it 'sets custom logger' do
15
16
  crawler_manager
16
17
  expect(Grell.logger).to eq(33)
17
18
  Grell.logger = Logger.new(nil)
18
19
  end
19
20
  end
21
+
20
22
  context 'does not provides a logger' do
21
23
  let(:logger) { nil }
24
+
22
25
  it 'sets default logger' do
23
26
  crawler_manager
24
27
  expect(Grell.logger).to be_instance_of(Logger)
25
28
  Grell.logger = Logger.new(nil)
26
29
  end
27
30
  end
31
+
32
+ context 'does not provide a driver' do
33
+ let(:driver) { nil }
34
+
35
+ it 'setups a new Capybara driver' do
36
+ expect_any_instance_of(Grell::CapybaraDriver).to receive(:setup_capybara)
37
+ crawler_manager
38
+ end
39
+ end
28
40
  end
29
41
 
30
42
  describe '#quit' do
31
43
  let(:driver) { double }
32
44
 
33
45
  it 'quits the poltergeist driver' do
46
+ expect(logger).to receive(:info).with("GRELL. Driver quitting")
34
47
  expect(driver).to receive(:quit)
35
48
  crawler_manager.quit
36
49
  end
@@ -41,14 +54,15 @@ RSpec.describe Grell::CrawlerManager do
41
54
 
42
55
  it 'restarts the poltergeist driver' do
43
56
  expect(driver).to receive(:restart)
44
- expect(logger).to receive(:info).with("GRELL is restarting")
45
- expect(logger).to receive(:info).with("GRELL has restarted")
57
+ expect(logger).to receive(:info).with("GRELL. Driver restarted")
58
+ expect(logger).to receive(:info).with("GRELL. Driver restarting")
46
59
  crawler_manager.restart
47
60
  end
48
61
  end
49
62
 
50
63
  describe '#check_periodic_restart' do
51
64
  let(:collection) { double }
65
+
52
66
  context 'Periodic restart not setup' do
53
67
  it 'does not restart' do
54
68
  allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
@@ -56,6 +70,7 @@ RSpec.describe Grell::CrawlerManager do
56
70
  crawler_manager.check_periodic_restart(collection)
57
71
  end
58
72
  end
73
+
59
74
  context 'Periodic restart setup with default period' do
60
75
  let(:do_something) { proc {} }
61
76
  let(:crawler_manager) do
@@ -71,12 +86,14 @@ RSpec.describe Grell::CrawlerManager do
71
86
  expect(crawler_manager).not_to receive(:restart)
72
87
  crawler_manager.check_periodic_restart(collection)
73
88
  end
89
+
74
90
  it 'restarts after visiting 100 pages' do
75
91
  allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
76
92
  expect(crawler_manager).to receive(:restart)
77
93
  crawler_manager.check_periodic_restart(collection)
78
94
  end
79
95
  end
96
+
80
97
  context 'Periodic restart setup with custom period' do
81
98
  let(:do_something) { proc {} }
82
99
  let(:period) { 50 }
@@ -88,11 +105,22 @@ RSpec.describe Grell::CrawlerManager do
88
105
  )
89
106
  end
90
107
 
108
+ context 'restart option is not positive' do
109
+ let(:period) { 0 }
110
+
111
+ it 'logs a warning' do
112
+ message = 'GRELL. Restart option misconfigured with a negative period. Ignoring option.'
113
+ expect(logger).to receive(:warn).with(message)
114
+ crawler_manager
115
+ end
116
+ end
117
+
91
118
  it 'does not restart after visiting a number different from custom period pages' do
92
119
  allow(collection).to receive_message_chain(:visited_pages, :size) { period * 1.2 }
93
120
  expect(crawler_manager).not_to receive(:restart)
94
121
  crawler_manager.check_periodic_restart(collection)
95
122
  end
123
+
96
124
  it 'restarts after visiting custom period pages' do
97
125
  allow(collection).to receive_message_chain(:visited_pages, :size) { period }
98
126
  expect(crawler_manager).to receive(:restart)
@@ -101,13 +129,46 @@ RSpec.describe Grell::CrawlerManager do
101
129
  end
102
130
  end
103
131
 
104
- describe '#cleanup_all_processes' do
132
+ describe '.cleanup_all_processes' do
105
133
  let(:driver) { double }
106
134
 
107
- it 'kills all phantomjs processes' do
108
- allow(crawler_manager).to receive(:running_phantomjs_pids).and_return([10])
109
- expect(crawler_manager).to receive(:kill_process).with(10)
110
- crawler_manager.cleanup_all_processes
135
+ context 'There are some phantomjs processes running' do
136
+ let(:pids) { [10, 11] }
137
+ before do
138
+ allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
139
+ .to receive(:running_phantomjs_pids).and_return(pids)
140
+ end
141
+
142
+ it 'logs processes pids' do
143
+ expect(Grell.logger).to receive(:warn).with('GRELL. Killing PhantomJS processes: [10, 11]')
144
+ expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 10')
145
+ expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 11')
146
+ described_class.cleanup_all_processes
147
+ end
148
+
149
+ it 'kills all phantomjs processes' do
150
+ expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(10)
151
+ expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(11)
152
+ described_class.cleanup_all_processes
153
+ end
154
+ end
155
+
156
+ context 'There are no phantomjs processes running' do
157
+ let(:pids) { [] }
158
+ before do
159
+ allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
160
+ .to receive(:running_phantomjs_pids).and_return(pids)
161
+ end
162
+
163
+ it 'no warning is logged' do
164
+ expect(Grell.logger).not_to receive(:warn)
165
+ described_class.cleanup_all_processes
166
+ end
167
+
168
+ it 'No process is killed' do
169
+ expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).not_to receive(:kill_process)
170
+ described_class.cleanup_all_processes
171
+ end
111
172
  end
112
173
  end
113
174
  end
@@ -11,7 +11,7 @@ RSpec.describe Grell::Crawler do
11
11
  let(:crawler) do
12
12
  Grell::Crawler.new(
13
13
  logger: Logger.new(nil),
14
- driver_options: { external_driver: true },
14
+ driver: double(nil),
15
15
  evaluate_in_each_page: script,
16
16
  add_match_block: add_match_block,
17
17
  blacklist: blacklist,
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grell
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jordi Polo Carres
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-16 00:00:00.000000000 Z
11
+ date: 2016-11-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -228,4 +228,3 @@ test_files:
228
228
  - spec/lib/page_spec.rb
229
229
  - spec/lib/reader_spec.rb
230
230
  - spec/spec_helper.rb
231
- has_rdoc: