powerdlz23 1.2.4 → 1.2.5
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/grell/.rspec +2 -0
- package/grell/.travis.yml +28 -0
- package/grell/CHANGELOG.md +111 -0
- package/grell/Gemfile +7 -0
- package/grell/LICENSE.txt +22 -0
- package/grell/README.md +213 -0
- package/grell/Rakefile +2 -0
- package/grell/grell.gemspec +36 -0
- package/grell/lib/grell/capybara_driver.rb +44 -0
- package/grell/lib/grell/crawler.rb +83 -0
- package/grell/lib/grell/crawler_manager.rb +84 -0
- package/grell/lib/grell/grell_logger.rb +10 -0
- package/grell/lib/grell/page.rb +275 -0
- package/grell/lib/grell/page_collection.rb +62 -0
- package/grell/lib/grell/rawpage.rb +62 -0
- package/grell/lib/grell/reader.rb +18 -0
- package/grell/lib/grell/version.rb +3 -0
- package/grell/lib/grell.rb +11 -0
- package/grell/spec/lib/capybara_driver_spec.rb +38 -0
- package/grell/spec/lib/crawler_manager_spec.rb +174 -0
- package/grell/spec/lib/crawler_spec.rb +361 -0
- package/grell/spec/lib/page_collection_spec.rb +159 -0
- package/grell/spec/lib/page_spec.rb +418 -0
- package/grell/spec/lib/reader_spec.rb +43 -0
- package/grell/spec/spec_helper.rb +66 -0
- package/heartmagic/config.py +1 -0
- package/heartmagic/heart.py +3 -0
- package/heartmagic/pytransform/__init__.py +483 -0
- package/heartmagic/pytransform/_pytransform.dll +0 -0
- package/heartmagic/pytransform/_pytransform.so +0 -0
- package/httpStatusCode/README.md +2 -0
- package/httpStatusCode/httpStatusCode.js +4 -0
- package/httpStatusCode/reasonPhrases.js +344 -0
- package/httpStatusCode/statusCodes.js +344 -0
- package/package.json +1 -1
- package/snapcrawl/.changelog.old.md +157 -0
- package/snapcrawl/.gitattributes +1 -0
- package/snapcrawl/.github/workflows/test.yml +41 -0
- package/snapcrawl/.rspec +3 -0
- package/snapcrawl/.rubocop.yml +23 -0
- package/snapcrawl/CHANGELOG.md +182 -0
- package/snapcrawl/Gemfile +15 -0
- package/snapcrawl/LICENSE +21 -0
- package/snapcrawl/README.md +135 -0
- package/snapcrawl/Runfile +35 -0
- package/snapcrawl/bin/snapcrawl +25 -0
- package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
- package/snapcrawl/lib/snapcrawl/config.rb +60 -0
- package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
- package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
- package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
- package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
- package/snapcrawl/lib/snapcrawl/page.rb +118 -0
- package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
- package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
- package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
- package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
- package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
- package/snapcrawl/lib/snapcrawl/version.rb +3 -0
- package/snapcrawl/lib/snapcrawl.rb +20 -0
- package/snapcrawl/snapcrawl.gemspec +27 -0
- package/snapcrawl/snapcrawl.yml +41 -0
- package/snapcrawl/spec/README.md +16 -0
- package/snapcrawl/spec/approvals/bin/help +26 -0
- package/snapcrawl/spec/approvals/bin/usage +4 -0
- package/snapcrawl/spec/approvals/cli/usage +4 -0
- package/snapcrawl/spec/approvals/config/defaults +15 -0
- package/snapcrawl/spec/approvals/config/minimal +15 -0
- package/snapcrawl/spec/approvals/integration/blacklist +14 -0
- package/snapcrawl/spec/approvals/integration/default-config +14 -0
- package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
- package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
- package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
- package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
- package/snapcrawl/spec/approvals/integration/whitelist +14 -0
- package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
- package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
- package/snapcrawl/spec/server/config.ru +97 -0
- package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
- package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
- package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
- package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
- package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
- package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
- package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
- package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
- package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
- package/snapcrawl/spec/spec_helper.rb +22 -0
- package/snapcrawl/spec/spec_mixin.rb +10 -0
package/grell/lib/grell/page.rb
@@ -0,0 +1,275 @@
+require 'forwardable'
+
+module Grell
+  # This class contains the logic related to work with each page we crawl. It is also the interface we use
+  # To access the information of each page.
+  # This information comes from result private classes below.
+  class Page
+    extend Forwardable
+
+    WAIT_TIME = 10
+    WAIT_INTERVAL = 0.5
+
+    attr_reader :url, :timestamp, :id, :parent_id, :rawpage
+
+    #Most of the interesting information accessed through this class is accessed by the methods below
+    def_delegators :@result_page, :headers, :body, :status, :links, :has_selector?, :host, :visited?
+
+    def initialize( url, id, parent_id)
+      @rawpage = RawPage.new
+      @url = url
+      @id = id
+      @parent_id = parent_id
+      @timestamp = nil
+      @times_visited = 0
+      @result_page = UnvisitedPage.new
+    end
+
+    def navigate
+      # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to workaround inconsistencies on poltergeist
+      Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do
+        @rawpage.status && !@rawpage.headers.empty? &&
+          @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true)
+      end
+      @rawpage.wait_for_all_ajax_requests(WAIT_TIME, WAIT_INTERVAL)
+      @result_page = VisitedPage.new(@rawpage)
+      @timestamp = Time.now
+    rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
+           Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
+           Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
+      unavailable_page(404, e)
+    ensure
+      @times_visited += 1
+    end
+
+    # Number of times we have retried the current page
+    def retries
+      [@times_visited - 1, 0].max
+    end
+
+    # The current URL, this may be different from the URL we asked for if there was some redirect
+    def current_url
+      @rawpage.current_url
+    end
+
+    # True if we followed a redirect to get the current contents
+    def followed_redirects?
+      current_url != @url
+    end
+
+    # True if there page responded with an error
+    def error?
+      !!(status.to_s =~ /[4|5]\d\d/)
+    end
+
+    # Extracts the path (e.g. /actions/test_action) from the URL
+    def path
+      URI.parse(@url).path
+    rescue URI::InvalidURIError # Invalid URLs will be added and caught when we try to navigate to them
+      @url
+    end
+
+    def unavailable_page(status, exception)
+      Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
+      @result_page = ErroredPage.new(status, exception)
+      @timestamp = Time.now
+    end
+
+    private
+
+    # Private class.
+    # This is a result page when it has not been visited yet. Essentially empty of information
+    #
+    class UnvisitedPage
+      def status
+        nil
+      end
+
+      def body
+        ''
+      end
+
+      def headers
+        {grellStatus: 'NotVisited' }
+      end
+
+      def links
+        []
+      end
+
+      def host
+        ''
+      end
+
+      def visited?
+        false
+      end
+
+      def has_selector?(selector)
+        false
+      end
+
+    end
+
+    # Private class.
+    # This is a result page when some error happened. It provides some information about the error.
+    #
+    class ErroredPage
+      def initialize(error_code, exception)
+        @error_code = error_code
+        @exception = exception
+      end
+
+      def status
+        @error_code
+      end
+
+      def body
+        ''
+      end
+
+      def headers
+        message = begin
+          @exception.message
+        rescue StandardError
+          "Error message can not be accessed" #Poltergeist may try to access a nil object when accessing message
+        end
+
+        {
+          grellStatus: 'Error',
+          errorClass: @exception.class.to_s,
+          errorMessage: message
+        }
+      end
+
+      def links
+        []
+      end
+
+      def host
+        ''
+      end
+
+      def visited?
+        true
+      end
+
+      def has_selector?(selector)
+        false
+      end
+
+    end
+
+
+    # Private class.
+    # This is a result page when we successfully got some information back after visiting the page.
+    # It delegates most of the information to the @rawpage capybara page. But any transformation or logic is here
+    #
+    class VisitedPage
+      def initialize(rawpage)
+        @rawpage = rawpage
+      end
+
+      def status
+        @rawpage.status
+      end
+
+      def body
+        @rawpage.body
+      end
+
+      def headers
+        @rawpage.headers
+      rescue Capybara::Poltergeist::BrowserError => e #This may happen internally on Poltergeist, they claim is a bug.
+        {
+          grellStatus: 'Error',
+          errorClass: e.class.to_s,
+          errorMessage: e.message
+        }
+      end
+
+      def links
+        @links ||= all_links
+      end
+
+      def host
+        @rawpage.host
+      end
+
+      def visited?
+        true
+      end
+
+      def has_selector?(selector)
+        @rawpage.has_selector?(selector)
+      end
+
+      private
+      def all_links
+        links = @rawpage.all_anchors.map { |anchor| Link.new(anchor) }
+        body_enabled_links = links.reject { |link| link.inside_header? || link.disabled? || link.js_href? }
+        body_enabled_links.map { |link| link.to_url(host) }.uniq.compact
+
+      rescue Capybara::Poltergeist::ObsoleteNode
+        Grell.logger.warn "We found an obsolete node in #{@url}. Ignoring all links"
+        # Sometimes Javascript and timing may screw this, we lose these links.
+        # TODO: Can we do something more intelligent here?
+        []
+      end
+
+      # Private class to group all the methods related to links.
+      class Link
+        def initialize(anchor)
+          @anchor = anchor
+        end
+
+        # <link> can only be used in the <head> as of: https://developer.mozilla.org/en/docs/Web/HTML/Element/link
+        def inside_header?
+          @anchor.tag_name == 'link'
+        end
+
+        # Is the link disabled by either Javascript or CSS?
+        def disabled?
+          @anchor.disabled? || !!@anchor.native.attributes['disabled']
+        end
+
+        # Does the href use javascript?
+        def js_href?
+          href.start_with?('javascript:')
+        end
+
+        # Some links may use data-href + javascript to do interesting things
+        def href
+          @anchor['href'] || @anchor['data-href']
+        end
+
+        # We only accept links in this same host that start with a path
+        def to_url(host)
+          uri = URI.parse(href)
+          if uri.absolute?
+            if uri.host != URI.parse(host).host
+              Grell.logger.debug "GRELL does not follow links to external hosts: #{href}"
+              nil
+            else
+              href # Absolute link to our own host
+            end
+          else
+            if uri.path.nil?
+              Grell.logger.debug "GRELL does not follow links without a path: #{uri}"
+              nil
+            end
+            if uri.path.start_with?('/')
+              host + href # convert to full URL
+            else # links like href="google.com" the browser would go to http://google.com like "http://#{link}"
+              Grell.logger.debug "GRELL Bad formatted link: #{href}, assuming external"
+              nil
+            end
+          end
+        rescue URI::InvalidURIError # Invalid links propagating till we navigate to them
+          href
+        end
+      end
+
+    end
+  end
+end
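The `Page` lifecycle above (`UnvisitedPage` until navigation, then `VisitedPage` or `ErroredPage`) can be exercised roughly as follows; a minimal sketch assuming a configured Poltergeist driver, with the URL and ids purely illustrative:

```ruby
# Minimal sketch, assuming Capybara/Poltergeist is already configured
# (e.g. via Grell::CapybaraDriver). URL and ids are illustrative.
page = Grell::Page.new('http://example.com/', 0, nil)
page.navigate # swaps the result object to VisitedPage or ErroredPage

if page.error?
  puts page.headers[:errorMessage] # ErroredPage carries the exception details
else
  puts "#{page.status}: found #{page.links.size} same-host links"
end
```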
package/grell/lib/grell/page_collection.rb
@@ -0,0 +1,62 @@
+module Grell
+  # Keeps a record of all the pages crawled.
+  # When a new url is found it is added to this collection, which makes sure it is unique.
+  # This page is part of the discovered pages. Eventually that page will be navigated to, then
+  # the page will be part of the visited pages.
+  class PageCollection
+    attr_reader :collection
+
+    # A block containing the logic that determines if a new URL should be added
+    # to the collection or if it is already present will be passed to the initializer.
+    def initialize(add_match_block)
+      @collection = []
+      @add_match_block = add_match_block || default_add_match
+    end
+
+    def create_page(url, parent_id)
+      page_id = next_id
+      page = Page.new(url, page_id, parent_id)
+      add(page)
+      page
+    end
+
+    def visited_pages
+      @collection.select {|page| page.visited?}
+    end
+
+    def discovered_pages
+      @collection - visited_pages
+    end
+
+    def next_page
+      discovered_pages.sort_by{|page| page.parent_id}.first
+    end
+
+    private
+
+    def next_id
+      @collection.size
+    end
+
+    def add(page)
+      # Although finding unique pages based on URL will add pages with different query parameters,
+      # in some cases we do link to different pages depending on the query parameters like when using proxies
+      new_url = @collection.none? do |collection_page|
+        @add_match_block.call(collection_page, page)
+      end
+
+      if new_url
+        @collection.push page
+      end
+    end
+
+    # If add_match_block is not provided, url matching to determine if a new page should be added
+    # to the page collection will default to this proc
+    def default_add_match
+      Proc.new do |collection_page, page|
+        collection_page.url.downcase == page.url.downcase
+      end
+    end
+
+  end
+end
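Because `PageCollection` takes its uniqueness check as a block, callers can widen or narrow what counts as "the same page"; the default shown above compares full URLs case-insensitively. A sketch with a hypothetical matcher that ignores query strings:

```ruby
require 'uri'

# Hypothetical matcher: treat URLs that differ only in their query string
# as the same page, so /search?q=a and /search?q=b are stored once.
same_path = proc do |collection_page, page|
  URI.parse(collection_page.url).path == URI.parse(page.url).path
end

collection = Grell::PageCollection.new(same_path)
collection.create_page('http://example.com/search?q=a', 0)
collection.create_page('http://example.com/search?q=b', 0) # rejected: same path
collection.discovered_pages.size # => 1
```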
package/grell/lib/grell/rawpage.rb
@@ -0,0 +1,62 @@
+module Grell
+  # This class depends heavily on Capybara but contains no logic.
+  class RawPage
+    include Capybara::DSL
+
+    def navigate(url)
+      visit(url)
+      follow_redirects!
+    end
+
+    def headers
+      page.response_headers
+    end
+
+    def status
+      page.status_code
+    end
+
+    def body
+      page.body
+    end
+
+    def all_anchors
+      # Some elements may not be "a" elements but still provide a link. This usually is done for Javascript
+      # to convert other elements which are not links to be able to be clicked naturally.
+      # Only return links which are visible.
+      all('[href]', visible: true).to_a + all('[data-href]', visible: true).to_a
+    end
+
+    def host
+      page.current_host
+    end
+
+    def has_selector?(selector)
+      page.has_selector?(selector)
+    end
+
+    def wait_for_all_ajax_requests(timeout, interval)
+      Timeout::timeout(timeout) do
+        (timeout / interval).ceil.times do
+          jquery_active = page.evaluate_script("typeof jQuery !== 'undefined' && jQuery.active;")
+          break if (!jquery_active || jquery_active.zero?)
+          sleep(interval)
+        end
+      end
+      true
+    end
+
+    private
+
+    def follow_redirects!
+      # Phantom is very weird, it will follow a redirect to provide the correct body but will not fill the
+      # status and the headers, if we are in that situation, revisit the page with the correct url this time.
+      # Note that we will still fail if we have more than 5 redirects on a row
+      redirects = 0
+      while(page.status_code == nil && redirects < 5)
+        visit( CGI.unescape(page.current_url))
+        redirects = redirects + 1
+      end
+    end
+  end
+end
package/grell/lib/grell/reader.rb
@@ -0,0 +1,18 @@
+module Grell
+  # A tooling class, it waits a maximum of max_waiting for an action to finish. If the action is not
+  # finished by then, it will continue anyway.
+  # The wait may be long but we want to finish it as soon as the action has finished
+  class Reader
+    def self.wait_for(action, max_waiting, sleeping_time)
+      time_start = Time.now
+      action.call()
+      return if yield
+      while (Time.now < time_start + max_waiting)
+        action.call()
+        break if yield
+        sleep(sleeping_time)
+      end
+    end
+
+  end
+end
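`Reader.wait_for` pairs a lambda (the action to retry) with a block (the completion check): the action runs once up front, then repeatedly until the block returns true or `max_waiting` elapses. A minimal self-contained sketch with illustrative values:

```ruby
# Minimal sketch: retry an action every 0.5s for up to 2s, stopping as
# soon as the completion block reports true.
attempts = 0
Grell::Reader.wait_for(-> { attempts += 1 }, 2, 0.5) do
  attempts >= 3 # consider the action "finished" after three runs
end
puts attempts # => 3
```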
package/grell/lib/grell.rb
@@ -0,0 +1,11 @@
+require 'capybara/poltergeist'
+require 'capybara/dsl'
+
+require 'grell/grell_logger'
+require 'grell/capybara_driver'
+require 'grell/crawler_manager'
+require 'grell/crawler'
+require 'grell/rawpage'
+require 'grell/page'
+require 'grell/page_collection'
+require 'grell/reader'
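With this entry point loaded, a typical crawl drives `Grell::Crawler` (defined in `lib/grell/crawler.rb`, which is listed above but not reproduced in this excerpt); a sketch assuming that class's block-per-page interface, with an illustrative URL:

```ruby
require 'grell'

# Sketch assuming Grell::Crawler yields each unique page it visits.
crawler = Grell::Crawler.new
crawler.start_crawling('http://example.com') do |page|
  puts "Visited #{page.url} (status #{page.status}), #{page.links.size} links"
end
```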
package/grell/spec/lib/capybara_driver_spec.rb
@@ -0,0 +1,38 @@
+
+RSpec.describe Grell::CapybaraDriver do
+  let(:ts) { Time.now }
+  before do
+    Grell.logger = Logger.new(nil)
+  end
+
+  describe 'setup_capybara' do
+    it 'properly registers the poltergeist driver' do
+      Timecop.freeze(ts)
+      driver = Grell::CapybaraDriver.new.setup_capybara
+      expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
+    end
+
+    it 'raises an exception if the driver cannot be initialized' do
+      Timecop.freeze(ts + 60)
+
+      # Attempt to register twice with the same driver name
+      Grell::CapybaraDriver.new.setup_capybara
+      expect { Grell::CapybaraDriver.new.setup_capybara }.
+        to raise_error "Poltergeist Driver could not be properly initialized"
+    end
+
+    it 'can register the poltergeist driver multiple times in a row' do
+      Timecop.freeze(ts + 120)
+      driver = Grell::CapybaraDriver.new.setup_capybara
+      expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
+    end
+  end
+
+  after do
+    Timecop.return
+
+    # Reset Capybara so future tests can easily stub HTTP requests
+    Capybara.javascript_driver = :poltergeist_billy
+    Capybara.default_driver = :poltergeist_billy
+  end
+end
package/grell/spec/lib/crawler_manager_spec.rb
@@ -0,0 +1,174 @@
+RSpec.describe Grell::CrawlerManager do
+  let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+  let(:host) { 'http://www.example.com' }
+  let(:url) { 'http://www.example.com/test' }
+  let(:driver) { double(Grell::CapybaraDriver) }
+  let(:logger) { Logger.new(nil) }
+  let(:crawler_manager) do
+    described_class.new(logger: logger, driver: driver)
+  end
+
+  describe 'initialize' do
+    context 'provides a logger' do
+      let(:logger) { 33 }
+
+      it 'sets custom logger' do
+        crawler_manager
+        expect(Grell.logger).to eq(33)
+        Grell.logger = Logger.new(nil)
+      end
+    end
+
+    context 'does not provides a logger' do
+      let(:logger) { nil }
+
+      it 'sets default logger' do
+        crawler_manager
+        expect(Grell.logger).to be_instance_of(Logger)
+        Grell.logger = Logger.new(nil)
+      end
+    end
+
+    context 'does not provide a driver' do
+      let(:driver) { nil }
+
+      it 'setups a new Capybara driver' do
+        expect_any_instance_of(Grell::CapybaraDriver).to receive(:setup_capybara)
+        crawler_manager
+      end
+    end
+  end
+
+  describe '#quit' do
+    let(:driver) { double }
+
+    it 'quits the poltergeist driver' do
+      expect(logger).to receive(:info).with("GRELL. Driver quitting")
+      expect(driver).to receive(:quit)
+      crawler_manager.quit
+    end
+  end
+
+  describe '#restart' do
+    let(:driver) { double }
+
+    it 'restarts the poltergeist driver' do
+      expect(driver).to receive(:restart)
+      expect(logger).to receive(:info).with("GRELL. Driver restarted")
+      expect(logger).to receive(:info).with("GRELL. Driver restarting")
+      crawler_manager.restart
+    end
+  end
+
+  describe '#check_periodic_restart' do
+    let(:collection) { double }
+
+    context 'Periodic restart not setup' do
+      it 'does not restart' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
+        expect(crawler_manager).not_to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+    end
+
+    context 'Periodic restart setup with default period' do
+      let(:do_something) { proc {} }
+      let(:crawler_manager) do
+        Grell::CrawlerManager.new(
+          logger: logger,
+          driver: driver,
+          on_periodic_restart: { do: do_something }
+        )
+      end
+
+      it 'does not restart after visiting 99 pages' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { 99 }
+        expect(crawler_manager).not_to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+
+      it 'restarts after visiting 100 pages' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { 100 }
+        expect(crawler_manager).to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+    end
+
+    context 'Periodic restart setup with custom period' do
+      let(:do_something) { proc {} }
+      let(:period) { 50 }
+      let(:crawler_manager) do
+        Grell::CrawlerManager.new(
+          logger: logger,
+          driver: driver,
+          on_periodic_restart: { do: do_something, each: period }
+        )
+      end
+
+      context 'restart option is not positive' do
+        let(:period) { 0 }
+
+        it 'logs a warning' do
+          message = 'GRELL. Restart option misconfigured with a negative period. Ignoring option.'
+          expect(logger).to receive(:warn).with(message)
+          crawler_manager
+        end
+      end
+
+      it 'does not restart after visiting a number different from custom period pages' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { period * 1.2 }
+        expect(crawler_manager).not_to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+
+      it 'restarts after visiting custom period pages' do
+        allow(collection).to receive_message_chain(:visited_pages, :size) { period }
+        expect(crawler_manager).to receive(:restart)
+        crawler_manager.check_periodic_restart(collection)
+      end
+    end
+  end
+
+  describe '.cleanup_all_processes' do
+    let(:driver) { double }
+
+    context 'There are some phantomjs processes running' do
+      let(:pids) { [10, 11] }
+      before do
+        allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
+          .to receive(:running_phantomjs_pids).and_return(pids)
+      end
+
+      it 'logs processes pids' do
+        expect(Grell.logger).to receive(:warn).with('GRELL. Killing PhantomJS processes: [10, 11]')
+        expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 10')
+        expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 11')
+        described_class.cleanup_all_processes
+      end
+
+      it 'kills all phantomjs processes' do
+        expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(10)
+        expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(11)
+        described_class.cleanup_all_processes
+      end
+    end
+
+    context 'There are no phantomjs processes running' do
+      let(:pids) { [] }
+      before do
+        allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager)
+          .to receive(:running_phantomjs_pids).and_return(pids)
+      end
+
+      it 'no warning is logged' do
+        expect(Grell.logger).not_to receive(:warn)
+        described_class.cleanup_all_processes
+      end
+
+      it 'No process is killed' do
+        expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).not_to receive(:kill_process)
+        described_class.cleanup_all_processes
+      end
+    end
+  end
+end