powerdlz23 1.2.4 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/grell/.rspec +2 -0
  2. package/grell/.travis.yml +28 -0
  3. package/grell/CHANGELOG.md +111 -0
  4. package/grell/Gemfile +7 -0
  5. package/grell/LICENSE.txt +22 -0
  6. package/grell/README.md +213 -0
  7. package/grell/Rakefile +2 -0
  8. package/grell/grell.gemspec +36 -0
  9. package/grell/lib/grell/capybara_driver.rb +44 -0
  10. package/grell/lib/grell/crawler.rb +83 -0
  11. package/grell/lib/grell/crawler_manager.rb +84 -0
  12. package/grell/lib/grell/grell_logger.rb +10 -0
  13. package/grell/lib/grell/page.rb +275 -0
  14. package/grell/lib/grell/page_collection.rb +62 -0
  15. package/grell/lib/grell/rawpage.rb +62 -0
  16. package/grell/lib/grell/reader.rb +18 -0
  17. package/grell/lib/grell/version.rb +3 -0
  18. package/grell/lib/grell.rb +11 -0
  19. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  20. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  21. package/grell/spec/lib/crawler_spec.rb +361 -0
  22. package/grell/spec/lib/page_collection_spec.rb +159 -0
  23. package/grell/spec/lib/page_spec.rb +418 -0
  24. package/grell/spec/lib/reader_spec.rb +43 -0
  25. package/grell/spec/spec_helper.rb +66 -0
  26. package/heartmagic/config.py +1 -0
  27. package/heartmagic/heart.py +3 -0
  28. package/heartmagic/pytransform/__init__.py +483 -0
  29. package/heartmagic/pytransform/_pytransform.dll +0 -0
  30. package/heartmagic/pytransform/_pytransform.so +0 -0
  31. package/httpStatusCode/README.md +2 -0
  32. package/httpStatusCode/httpStatusCode.js +4 -0
  33. package/httpStatusCode/reasonPhrases.js +344 -0
  34. package/httpStatusCode/statusCodes.js +344 -0
  35. package/package.json +1 -1
  36. package/snapcrawl/.changelog.old.md +157 -0
  37. package/snapcrawl/.gitattributes +1 -0
  38. package/snapcrawl/.github/workflows/test.yml +41 -0
  39. package/snapcrawl/.rspec +3 -0
  40. package/snapcrawl/.rubocop.yml +23 -0
  41. package/snapcrawl/CHANGELOG.md +182 -0
  42. package/snapcrawl/Gemfile +15 -0
  43. package/snapcrawl/LICENSE +21 -0
  44. package/snapcrawl/README.md +135 -0
  45. package/snapcrawl/Runfile +35 -0
  46. package/snapcrawl/bin/snapcrawl +25 -0
  47. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  48. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  49. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  50. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  51. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  52. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  53. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  54. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  55. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  56. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  57. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  58. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  59. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  60. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  61. package/snapcrawl/lib/snapcrawl.rb +20 -0
  62. package/snapcrawl/snapcrawl.gemspec +27 -0
  63. package/snapcrawl/snapcrawl.yml +41 -0
  64. package/snapcrawl/spec/README.md +16 -0
  65. package/snapcrawl/spec/approvals/bin/help +26 -0
  66. package/snapcrawl/spec/approvals/bin/usage +4 -0
  67. package/snapcrawl/spec/approvals/cli/usage +4 -0
  68. package/snapcrawl/spec/approvals/config/defaults +15 -0
  69. package/snapcrawl/spec/approvals/config/minimal +15 -0
  70. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  71. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  72. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  73. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  74. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  75. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  76. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  77. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  78. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  79. package/snapcrawl/spec/server/config.ru +97 -0
  80. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  81. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  82. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  83. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  84. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  85. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  86. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  87. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  88. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  89. package/snapcrawl/spec/spec_helper.rb +22 -0
  90. package/snapcrawl/spec/spec_mixin.rb +10 -0
@@ -0,0 +1,275 @@
+ require 'forwardable'
+
+ module Grell
+   # This class contains the logic related to working with each page we crawl. It is also the interface
+   # we use to access the information of each page.
+   # This information comes from the private result classes below.
+   class Page
+     extend Forwardable
+
+     WAIT_TIME = 10
+     WAIT_INTERVAL = 0.5
+
+     attr_reader :url, :timestamp, :id, :parent_id, :rawpage
+
+     # Most of the interesting information exposed by this class is accessed through the methods below
+     def_delegators :@result_page, :headers, :body, :status, :links, :has_selector?, :host, :visited?
+
+     def initialize(url, id, parent_id)
+       @rawpage = RawPage.new
+       @url = url
+       @id = id
+       @parent_id = parent_id
+       @timestamp = nil
+       @times_visited = 0
+       @result_page = UnvisitedPage.new
+     end
+
+     def navigate
+       # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to work around
+       # inconsistencies in Poltergeist
+       Reader.wait_for(-> { @rawpage.navigate(url) }, WAIT_TIME, WAIT_INTERVAL) do
+         @rawpage.status && !@rawpage.headers.empty? &&
+           @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html')
+       end
+       @rawpage.wait_for_all_ajax_requests(WAIT_TIME, WAIT_INTERVAL)
+       @result_page = VisitedPage.new(@rawpage)
+       @timestamp = Time.now
+     rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
+            Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
+            Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
+       unavailable_page(404, e)
+     ensure
+       @times_visited += 1
+     end
+
+     # Number of times we have retried the current page
+     def retries
+       [@times_visited - 1, 0].max
+     end
+
+     # The current URL; this may differ from the URL we asked for if there was a redirect
+     def current_url
+       @rawpage.current_url
+     end
+
+     # True if we followed a redirect to get the current contents
+     def followed_redirects?
+       current_url != @url
+     end
+
+     # True if the page responded with an error
+     def error?
+       !!(status.to_s =~ /[45]\d\d/)
+     end
+
+     # Extracts the path (e.g. /actions/test_action) from the URL
+     def path
+       URI.parse(@url).path
+     rescue URI::InvalidURIError # Invalid URLs will be added and caught when we try to navigate to them
+       @url
+     end
+
+     def unavailable_page(status, exception)
+       Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
+       @result_page = ErroredPage.new(status, exception)
+       @timestamp = Time.now
+     end
+
+     private
+
+     # Private class.
+     # This is the result page when it has not been visited yet. Essentially empty of information
+     #
+     class UnvisitedPage
+       def status
+         nil
+       end
+
+       def body
+         ''
+       end
+
+       def headers
+         { grellStatus: 'NotVisited' }
+       end
+
+       def links
+         []
+       end
+
+       def host
+         ''
+       end
+
+       def visited?
+         false
+       end
+
+       def has_selector?(_selector)
+         false
+       end
+     end
+
+     # Private class.
+     # This is the result page when some error happened. It provides some information about the error.
+     #
+     class ErroredPage
+       def initialize(error_code, exception)
+         @error_code = error_code
+         @exception = exception
+       end
+
+       def status
+         @error_code
+       end
+
+       def body
+         ''
+       end
+
+       def headers
+         message = begin
+           @exception.message
+         rescue StandardError
+           # Poltergeist may try to access a nil object when building the message
+           "Error message cannot be accessed"
+         end
+
+         {
+           grellStatus: 'Error',
+           errorClass: @exception.class.to_s,
+           errorMessage: message
+         }
+       end
+
+       def links
+         []
+       end
+
+       def host
+         ''
+       end
+
+       def visited?
+         true
+       end
+
+       def has_selector?(_selector)
+         false
+       end
+     end
+
+     # Private class.
+     # This is the result page when we successfully got some information back after visiting the page.
+     # It delegates most of the information to the @rawpage Capybara page, but any transformation or logic is here
+     #
+     class VisitedPage
+       def initialize(rawpage)
+         @rawpage = rawpage
+       end
+
+       def status
+         @rawpage.status
+       end
+
+       def body
+         @rawpage.body
+       end
+
+       def headers
+         @rawpage.headers
+       rescue Capybara::Poltergeist::BrowserError => e # This may happen internally in Poltergeist; they claim it is a bug.
+         {
+           grellStatus: 'Error',
+           errorClass: e.class.to_s,
+           errorMessage: e.message
+         }
+       end
+
+       def links
+         @links ||= all_links
+       end
+
+       def host
+         @rawpage.host
+       end
+
+       def visited?
+         true
+       end
+
+       def has_selector?(selector)
+         @rawpage.has_selector?(selector)
+       end
+
+       private
+
+       def all_links
+         links = @rawpage.all_anchors.map { |anchor| Link.new(anchor) }
+         body_enabled_links = links.reject { |link| link.inside_header? || link.disabled? || link.js_href? }
+         body_enabled_links.map { |link| link.to_url(host) }.uniq.compact
+       rescue Capybara::Poltergeist::ObsoleteNode
+         Grell.logger.warn "We found an obsolete node in #{@rawpage.current_url}. Ignoring all links"
+         # Sometimes JavaScript and timing issues break this; we lose these links.
+         # TODO: Can we do something more intelligent here?
+         []
+       end
+
+       # Private class to group all the methods related to links.
+       class Link
+         def initialize(anchor)
+           @anchor = anchor
+         end
+
+         # <link> can only be used in the <head>, as per: https://developer.mozilla.org/en/docs/Web/HTML/Element/link
+         def inside_header?
+           @anchor.tag_name == 'link'
+         end
+
+         # Is the link disabled by either JavaScript or CSS?
+         def disabled?
+           @anchor.disabled? || !!@anchor.native.attributes['disabled']
+         end
+
+         # Does the href use javascript?
+         def js_href?
+           href.start_with?('javascript:')
+         end
+
+         # Some links may use data-href + JavaScript to do interesting things
+         def href
+           @anchor['href'] || @anchor['data-href']
+         end
+
+         # We only accept links on this same host that start with a path
+         def to_url(host)
+           uri = URI.parse(href)
+           if uri.absolute?
+             if uri.host != URI.parse(host).host
+               Grell.logger.debug "GRELL does not follow links to external hosts: #{href}"
+               nil
+             else
+               href # Absolute link to our own host
+             end
+           elsif uri.path.nil?
+             Grell.logger.debug "GRELL does not follow links without a path: #{uri}"
+             nil
+           elsif uri.path.start_with?('/')
+             host + href # convert to a full URL
+           else # links like href="google.com"; the browser would treat it as "http://google.com"
+             Grell.logger.debug "GRELL badly formatted link: #{href}, assuming external"
+             nil
+           end
+         rescue URI::InvalidURIError # Invalid links propagate until we navigate to them
+           href
+         end
+       end
+     end
+   end
+ end
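
For orientation, a minimal usage sketch of Grell::Page (assuming a Poltergeist driver has already been registered through Grell::CapybaraDriver; the URL and ids here are made up, since ids are normally assigned by PageCollection):

    page = Grell::Page.new('http://example.com/', 0, nil)
    page.navigate            # visits the URL, waits for HTML and pending AJAX
    page.visited?            # => true; the result is a VisitedPage or an ErroredPage
    page.status              # HTTP status, or 404 if navigation raised
    page.links               # unique same-host links extracted from the body
    page.retries             # => 0 after the first visit

Note that navigate never raises on browser errors: failures are folded into an ErroredPage result with a 404 status, and @times_visited still increases via the ensure block.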
@@ -0,0 +1,62 @@
+ module Grell
+   # Keeps a record of all the pages crawled.
+   # When a new URL is found it is added to this collection, which makes sure it is unique.
+   # That page starts out among the discovered pages. Eventually the page will be navigated to, and
+   # then it becomes part of the visited pages.
+   class PageCollection
+     attr_reader :collection
+
+     # The initializer receives a block with the logic that determines whether a new URL should be
+     # added to the collection or whether it is already present.
+     def initialize(add_match_block)
+       @collection = []
+       @add_match_block = add_match_block || default_add_match
+     end
+
+     def create_page(url, parent_id)
+       page_id = next_id
+       page = Page.new(url, page_id, parent_id)
+       add(page)
+       page
+     end
+
+     def visited_pages
+       @collection.select { |page| page.visited? }
+     end
+
+     def discovered_pages
+       @collection - visited_pages
+     end
+
+     def next_page
+       discovered_pages.sort_by { |page| page.parent_id }.first
+     end
+
+     private
+
+     def next_id
+       @collection.size
+     end
+
+     def add(page)
+       # Although matching pages on the URL alone would treat URLs with different query parameters as
+       # distinct, that is what we want: in some cases different query parameters do link to different
+       # pages, for example when using proxies
+       new_url = @collection.none? do |collection_page|
+         @add_match_block.call(collection_page, page)
+       end
+
+       @collection.push(page) if new_url
+     end
+
+     # If add_match_block is not provided, the URL matching used to decide whether a new page should
+     # be added to the page collection defaults to this proc
+     def default_add_match
+       proc do |collection_page, page|
+         collection_page.url.downcase == page.url.downcase
+       end
+     end
+   end
+ end
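
As a sketch of the add_match_block hook (the proc name and URLs below are illustrative): a matcher that ignores query strings, so two URLs that differ only in their parameters are crawled once:

    ignore_query = proc do |collection_page, page|
      URI.parse(collection_page.url).path == URI.parse(page.url).path
    end

    collection = Grell::PageCollection.new(ignore_query)
    collection.create_page('http://example.com/search?q=a', 0)
    collection.create_page('http://example.com/search?q=b', 0) # treated as a duplicate
    collection.discovered_pages.size # => 1

The default matcher above is stricter: it compares the full downcased URLs, so differing query parameters produce distinct pages.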
@@ -0,0 +1,62 @@
+ module Grell
+   # This class depends heavily on Capybara but contains no logic.
+   class RawPage
+     include Capybara::DSL
+
+     def navigate(url)
+       visit(url)
+       follow_redirects!
+     end
+
+     def headers
+       page.response_headers
+     end
+
+     def status
+       page.status_code
+     end
+
+     def body
+       page.body
+     end
+
+     def all_anchors
+       # Some elements may not be "a" elements but still provide a link. This is usually done with
+       # JavaScript, to make elements which are not links clickable as if they were.
+       # Only return links which are visible.
+       all('[href]', visible: true).to_a + all('[data-href]', visible: true).to_a
+     end
+
+     def host
+       page.current_host
+     end
+
+     def has_selector?(selector)
+       page.has_selector?(selector)
+     end
+
+     def wait_for_all_ajax_requests(timeout, interval)
+       Timeout.timeout(timeout) do
+         (timeout / interval).ceil.times do
+           jquery_active = page.evaluate_script("typeof jQuery !== 'undefined' && jQuery.active;")
+           break if !jquery_active || jquery_active.zero?
+           sleep(interval)
+         end
+       end
+       true
+     end
+
+     private
+
+     def follow_redirects!
+       # PhantomJS is very quirky: it will follow a redirect and provide the correct body, but it will
+       # not fill in the status or the headers. If we are in that situation, revisit the page, this
+       # time with the correct URL. Note that we will still fail if we hit more than 5 redirects in a row.
+       redirects = 0
+       while page.status_code.nil? && redirects < 5
+         visit(CGI.unescape(page.current_url))
+         redirects += 1
+       end
+     end
+   end
+ end
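
The AJAX wait polls jQuery.active, jQuery's counter of in-flight requests. A short sketch of driving RawPage directly, with the same 10 s / 0.5 s values Page uses (example.com is a stand-in URL):

    raw = Grell::RawPage.new
    raw.navigate('http://example.com/')       # visit plus follow_redirects!
    raw.wait_for_all_ajax_requests(10, 0.5)   # polls at most (10 / 0.5).ceil = 20 times
    raw.status                                # e.g. 200
    raw.all_anchors.size                      # visible [href] and [data-href] elements

On a page without jQuery, evaluate_script returns false, so the polling loop breaks on its first iteration.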
@@ -0,0 +1,18 @@
+ module Grell
+   # A tooling class: it waits a maximum of max_waiting seconds for an action to finish. If the action
+   # has not finished by then, it will continue anyway.
+   # The wait may be long, but we want to finish as soon as the action has finished
+   class Reader
+     def self.wait_for(action, max_waiting, sleeping_time)
+       time_start = Time.now
+       action.call
+       return if yield
+       while Time.now < time_start + max_waiting
+         action.call
+         break if yield
+         sleep(sleeping_time)
+       end
+     end
+   end
+ end
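
A toy illustration of the contract (the values are invented; Page passes WAIT_TIME and WAIT_INTERVAL instead):

    attempts = 0
    action = -> { attempts += 1 }       # re-run until the block below is truthy
    Grell::Reader.wait_for(action, 2, 0.1) do
      attempts >= 3                     # pretend the third try succeeds
    end
    attempts # => 3

The action always runs at least once; the block's truthiness then decides whether wait_for keeps retrying until max_waiting seconds have elapsed.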
@@ -0,0 +1,3 @@
+ module Grell
+   VERSION = "2.1.2".freeze
+ end
@@ -0,0 +1,11 @@
+ require 'capybara/poltergeist'
+ require 'capybara/dsl'
+
+ require 'grell/grell_logger'
+ require 'grell/capybara_driver'
+ require 'grell/crawler_manager'
+ require 'grell/crawler'
+ require 'grell/rawpage'
+ require 'grell/page'
+ require 'grell/page_collection'
+ require 'grell/reader'
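
Putting the requires together, a plausible end-to-end sketch. The Crawler API itself lives in lib/grell/crawler.rb (listed above but not part of this hunk), so the constructor options and method names here are assumptions based on the rest of this diff:

    require 'grell'

    crawler = Grell::Crawler.new(logger: Logger.new($stdout))
    crawler.start_crawling('http://example.com') do |page|
      puts "#{page.status} #{page.url} -> #{page.links.size} links"
    end

Each yielded object would be a Grell::Page as defined above, so status, links, headers and body are all available inside the block.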
@@ -0,0 +1,38 @@
+
+ RSpec.describe Grell::CapybaraDriver do
+   let(:ts) { Time.now }
+   before do
+     Grell.logger = Logger.new(nil)
+   end
+
+   describe 'setup_capybara' do
+     it 'properly registers the poltergeist driver' do
+       Timecop.freeze(ts)
+       driver = Grell::CapybaraDriver.new.setup_capybara
+       expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
+     end
+
+     it 'raises an exception if the driver cannot be initialized' do
+       Timecop.freeze(ts + 60)
+
+       # Attempt to register twice with the same driver name
+       Grell::CapybaraDriver.new.setup_capybara
+       expect { Grell::CapybaraDriver.new.setup_capybara }.
+         to raise_error "Poltergeist Driver could not be properly initialized"
+     end
+
+     it 'can register the poltergeist driver multiple times in a row' do
+       Timecop.freeze(ts + 120)
+       driver = Grell::CapybaraDriver.new.setup_capybara
+       expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
+     end
+   end
+
+   after do
+     Timecop.return
+
+     # Reset Capybara so future tests can easily stub HTTP requests
+     Capybara.javascript_driver = :poltergeist_billy
+     Capybara.default_driver = :poltergeist_billy
+   end
+ end
@@ -0,0 +1,174 @@
+ RSpec.describe Grell::CrawlerManager do
+   let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+   let(:host) { 'http://www.example.com' }
+   let(:url) { 'http://www.example.com/test' }
+   let(:driver) { double(Grell::CapybaraDriver) }
+   let(:logger) { Logger.new(nil) }
+   let(:crawler_manager) do
+     described_class.new(logger: logger, driver: driver)
+   end
+
+   describe 'initialize' do
+     context 'provides a logger' do
+       let(:logger) { 33 }
+
+       it 'sets the custom logger' do
+         crawler_manager
+         expect(Grell.logger).to eq(33)
+         Grell.logger = Logger.new(nil)
+       end
+     end
+
+     context 'does not provide a logger' do
+       let(:logger) { nil }
+
+       it 'sets the default logger' do
+         crawler_manager
+         expect(Grell.logger).to be_instance_of(Logger)
+         Grell.logger = Logger.new(nil)
+       end
+     end
+
+     context 'does not provide a driver' do
+       let(:driver) { nil }
+
+       it 'sets up a new Capybara driver' do
+         expect_any_instance_of(Grell::CapybaraDriver).to receive(:setup_capybara)
+         crawler_manager
+       end
+     end
+   end
+
+   describe '#quit' do
+     let(:driver) { double }
+
+     it 'quits the poltergeist driver' do
+       expect(logger).to receive(:info).with("GRELL. Driver quitting")
+       expect(driver).to receive(:quit)
+       crawler_manager.quit
+     end
+   end
+
+   describe '#restart' do
+     let(:driver) { double }
+
+     it 'restarts the poltergeist driver' do
+       expect(driver).to receive(:restart)
+       expect(logger).to receive(:info).with("GRELL. Driver restarted")
+       expect(logger).to receive(:info).with("GRELL. Driver restarting")
+       crawler_manager.restart
+     end
+   end
+
+   describe '#check_periodic_restart' do
+     let(:collection) { double }
+
+     context 'Periodic restart not set up' do
+       it 'does not restart' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
+         expect(crawler_manager).not_to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+     end
+
+     context 'Periodic restart set up with the default period' do
+       let(:do_something) { proc {} }
+       let(:crawler_manager) do
+         Grell::CrawlerManager.new(
+           logger: logger,
+           driver: driver,
+           on_periodic_restart: { do: do_something }
+         )
+       end
+
+       it 'does not restart after visiting 99 pages' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { 99 }
+         expect(crawler_manager).not_to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+
+       it 'restarts after visiting 100 pages' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
+         expect(crawler_manager).to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+     end
+
+     context 'Periodic restart set up with a custom period' do
+       let(:do_something) { proc {} }
+       let(:period) { 50 }
+       let(:crawler_manager) do
+         Grell::CrawlerManager.new(
+           logger: logger,
+           driver: driver,
+           on_periodic_restart: { do: do_something, each: period }
+         )
+       end
+
+       context 'restart option is not positive' do
+         let(:period) { 0 }
+
+         it 'logs a warning' do
+           message = 'GRELL. Restart option misconfigured with a negative period. Ignoring option.'
+           expect(logger).to receive(:warn).with(message)
+           crawler_manager
+         end
+       end
+
+       it 'does not restart after visiting a number of pages different from the custom period' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { period * 1.2 }
+         expect(crawler_manager).not_to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+
+       it 'restarts after visiting the custom period of pages' do
+         allow(collection).to receive_message_chain(:visited_pages, :size) { period }
+         expect(crawler_manager).to receive(:restart)
+         crawler_manager.check_periodic_restart(collection)
+       end
+     end
+   end
+
+   describe '.cleanup_all_processes' do
+     let(:driver) { double }
+
+     context 'There are some phantomjs processes running' do
+       let(:pids) { [10, 11] }
+
+       before do
+         allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
+           .to receive(:running_phantomjs_pids).and_return(pids)
+       end
+
+       it 'logs the process pids' do
+         expect(Grell.logger).to receive(:warn).with('GRELL. Killing PhantomJS processes: [10, 11]')
+         expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 10')
+         expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 11')
+         described_class.cleanup_all_processes
+       end
+
+       it 'kills all phantomjs processes' do
+         expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(10)
+         expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(11)
+         described_class.cleanup_all_processes
+       end
+     end
+
+     context 'There are no phantomjs processes running' do
+       let(:pids) { [] }
+
+       before do
+         allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
+           .to receive(:running_phantomjs_pids).and_return(pids)
+       end
+
+       it 'logs no warning' do
+         expect(Grell.logger).not_to receive(:warn)
+         described_class.cleanup_all_processes
+       end
+
+       it 'kills no process' do
+         expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).not_to receive(:kill_process)
+         described_class.cleanup_all_processes
+       end
+     end
+   end
+ end
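
For reference, the periodic-restart behaviour exercised by these specs, as a caller might configure it (warm_up_session is a placeholder for whatever setup the host application needs after the browser restarts):

    manager = Grell::CrawlerManager.new(
      logger: Logger.new($stdout),
      driver: nil, # the manager sets up its own Capybara driver
      on_periodic_restart: { do: proc { warm_up_session }, each: 50 }
    )
    manager.check_periodic_restart(collection)

Per the specs, the restart fires once collection.visited_pages.size reaches the configured period (100 when each: is omitted), and a non-positive period is ignored with a warning.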