slmndr 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/slmndr.rb +36 -11
- data.tar.gz.sig +0 -0
- metadata +1 -1
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 80694b7cb325f3f082327ef4408b045ee23e9355
|
4
|
+
data.tar.gz: feabcf945e9b69da248d5190130063dbbe3e47b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d88f18d2641a21b233fe8af543b66e3512aa9671445a1d43cdc6790c6de17e4450011df6d0171cf77f9280b77d6752c182fb8b06ed72ef2dcbd81daed5b364ab
|
7
|
+
data.tar.gz: 77f9ea7a1eca3d161647e7a50960baec47fddcc7051df74b5ee51628dca108db76902d00a0718c40c77b98169b47d0c0acd9d9dade6e91b6d45aa5fb11f65cc1
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/slmndr.rb
CHANGED
@@ -56,11 +56,26 @@ module Salamander
|
|
56
56
|
|
57
57
|
# Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
|
58
58
|
# Function blocks until all threads terminate.
|
59
|
-
#
|
60
|
-
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
59
|
+
# This function can receive a code block like so...
|
60
|
+
# Salamander::crawl(urls, args) do |request, response, depth|
|
61
|
+
# # request: the URL string used to request the current page
|
62
|
+
# # response: a hash containing data pertaining to the response to the requested URL
|
63
|
+
# # depth: a positive integer indicating the breadth/depth of the current page, relative to one of the seed URLs
|
64
|
+
# end
|
65
|
+
# Response Hash Contents
|
66
|
+
# base_uri: The base_uri field of OpenURI's response
|
67
|
+
# meta: The meta field of OpenURI's response
|
68
|
+
# status: The status field of OpenURI's response
|
69
|
+
# content_type: The content_type field of OpenURI's response
|
70
|
+
# charset: The charset field of OpenURI's response
|
71
|
+
# content_encoding: The content_encoding field of OpenURI's response
|
72
|
+
# last_modified: The last_modified field of OpenURI's response
|
73
|
+
# body: Contains the body of OpenURI's response
|
74
|
+
# Optional Arguments
|
75
|
+
# visit: A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
|
76
|
+
# delay: A positive float indicating the number of seconds between requests in one thread. Defaults to 1.
|
77
|
+
# threads: A positive integer indicating the number of allowed simultaneous requests to the target web asset. Defaults to 1.
|
78
|
+
# agent: The user-agent string to be used. Defaults to "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)".
|
64
79
|
# @param urls A list of strings containing the seed URLs.
|
65
80
|
# @param args A hash containing optional arguments for the function.
|
66
81
|
def crawl(urls, args = {})
|
@@ -134,21 +149,31 @@ module Salamander
|
|
134
149
|
# Get all links in page pointed to by job URL
|
135
150
|
begin
|
136
151
|
open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
|
152
|
+
_response = {
|
153
|
+
base_uri: response.base_uri,
|
154
|
+
meta: response.meta,
|
155
|
+
status: response.status,
|
156
|
+
content_type: response.content_type,
|
157
|
+
charset: response.charset,
|
158
|
+
content_encoding: response.content_encoding,
|
159
|
+
last_modified: response.last_modified,
|
160
|
+
body: response.read
|
161
|
+
}
|
137
162
|
# Callback
|
138
163
|
jlock.synchronize do
|
139
|
-
yield "#{job_url}",
|
164
|
+
yield "#{job_url}", _response, job_depth
|
140
165
|
end
|
141
166
|
# If resolved URL is in scope
|
142
167
|
if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
|
143
168
|
# Add resolved URL to job queue and mark it as complete if it does not exist yet
|
144
169
|
jlock.synchronize do
|
145
170
|
if jobs[:"#{response.base_uri}"] == nil then
|
146
|
-
yield "#{response.base_uri}",
|
171
|
+
yield "#{response.base_uri}", _response, job_depth
|
147
172
|
jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
|
148
173
|
end
|
149
174
|
end
|
150
175
|
# Get links for resolved URL
|
151
|
-
Salamander::get_links(response.base_uri,
|
176
|
+
Salamander::get_links(response.base_uri, _response[:body]) do |link|
|
152
177
|
# Determine if the link should be visited
|
153
178
|
if visit.nil? || visit.call(link) then
|
154
179
|
jlock.synchronize do
|
@@ -291,7 +316,7 @@ if __FILE__ == $0 then
|
|
291
316
|
end
|
292
317
|
end
|
293
318
|
rescue => e
|
294
|
-
puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling",
|
319
|
+
puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", backtrace: e.backtrace })
|
295
320
|
exit
|
296
321
|
end
|
297
322
|
end
|
@@ -304,7 +329,7 @@ if __FILE__ == $0 then
|
|
304
329
|
STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
|
305
330
|
rescue e
|
306
331
|
# Print any uncaught exceptions
|
307
|
-
puts JSON.pretty_generate({ result: "exception", message: "unknown error",
|
332
|
+
puts JSON.pretty_generate({ result: "exception", message: "unknown error", backtrace: e.backtrace })
|
308
333
|
exit
|
309
334
|
end
|
310
|
-
end
|
335
|
+
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
metadata.gz.sig
CHANGED
Binary file
|