slmndr 0.0.2 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/slmndr.rb +36 -11
- data.tar.gz.sig +0 -0
- metadata +1 -1
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 80694b7cb325f3f082327ef4408b045ee23e9355
|
4
|
+
data.tar.gz: feabcf945e9b69da248d5190130063dbbe3e47b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d88f18d2641a21b233fe8af543b66e3512aa9671445a1d43cdc6790c6de17e4450011df6d0171cf77f9280b77d6752c182fb8b06ed72ef2dcbd81daed5b364ab
|
7
|
+
data.tar.gz: 77f9ea7a1eca3d161647e7a50960baec47fddcc7051df74b5ee51628dca108db76902d00a0718c40c77b98169b47d0c0acd9d9dade6e91b6d45aa5fb11f65cc1
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/slmndr.rb
CHANGED
@@ -56,11 +56,26 @@ module Salamander
|
|
56
56
|
|
57
57
|
# Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
|
58
58
|
# Function blocks until all threads terminate.
|
59
|
-
#
|
60
|
-
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
59
|
+
# This function can receive a code block like so...
|
60
|
+
# Salamander::crawl(urls, args) do |request, response, depth|
|
61
|
+
# # request: the URL string used to request the current page
|
62
|
+
# # response: a hash containing data pertaining to the response to the requested URL
|
63
|
+
# # depth: a positive integer indicating the breadth/depth of the current page, relative to one of the seed URLs
|
64
|
+
# end
|
65
|
+
# Response Hash Contents
|
66
|
+
# base_uri: The base_uri field of OpenURI's response
|
67
|
+
# meta: The meta field of OpenURI's response
|
68
|
+
# status: The status field of OpenURI's response
|
69
|
+
# content_type: The content_type field of OpenURI's response
|
70
|
+
# charset: The charset field of OpenURI's response
|
71
|
+
# content_encoding: The content_encoding field of OpenURI's response
|
72
|
+
# last_modified: The last_modified field of OpenURI's response
|
73
|
+
# body: Contains the body of OpenURI's response
|
74
|
+
# Optional Arguments
|
75
|
+
# visit: A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
|
76
|
+
# delay: A positive float indicating the number of seconds between requests in one thread. Defaults to 1.
|
77
|
+
# threads: A positive integer indicating the number of allowed simultaneous requests to the target web asset. Defaults to 1.
|
78
|
+
# agent: The user-agent string to be used. Defaults to "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)".
|
64
79
|
# @param urls A list of strings containing the seed URLs.
|
65
80
|
# @param args A hash containing optional arguments for the function.
|
66
81
|
def crawl(urls, args = {})
|
@@ -134,21 +149,31 @@ module Salamander
|
|
134
149
|
# Get all links in page pointed to by job URL
|
135
150
|
begin
|
136
151
|
open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
|
152
|
+
_response = {
|
153
|
+
base_uri: response.base_uri,
|
154
|
+
meta: response.meta,
|
155
|
+
status: response.status,
|
156
|
+
content_type: response.content_type,
|
157
|
+
charset: response.charset,
|
158
|
+
content_encoding: response.content_encoding,
|
159
|
+
last_modified: response.last_modified,
|
160
|
+
body: response.read
|
161
|
+
}
|
137
162
|
# Callback
|
138
163
|
jlock.synchronize do
|
139
|
-
yield "#{job_url}",
|
164
|
+
yield "#{job_url}", _response, job_depth
|
140
165
|
end
|
141
166
|
# If resolved URL is in scope
|
142
167
|
if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
|
143
168
|
# Add resolved URL to job queue and mark it as complete if it does not exist yet
|
144
169
|
jlock.synchronize do
|
145
170
|
if jobs[:"#{response.base_uri}"] == nil then
|
146
|
-
yield "#{response.base_uri}",
|
171
|
+
yield "#{response.base_uri}", _response, job_depth
|
147
172
|
jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
|
148
173
|
end
|
149
174
|
end
|
150
175
|
# Get links for resolved URL
|
151
|
-
Salamander::get_links(response.base_uri,
|
176
|
+
Salamander::get_links(response.base_uri, _response[:body]) do |link|
|
152
177
|
# Determine if the link should be visited
|
153
178
|
if visit.nil? || visit.call(link) then
|
154
179
|
jlock.synchronize do
|
@@ -291,7 +316,7 @@ if __FILE__ == $0 then
|
|
291
316
|
end
|
292
317
|
end
|
293
318
|
rescue => e
|
294
|
-
puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling",
|
319
|
+
puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", backtrace: e.backtrace })
|
295
320
|
exit
|
296
321
|
end
|
297
322
|
end
|
@@ -304,7 +329,7 @@ if __FILE__ == $0 then
|
|
304
329
|
STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
|
305
330
|
rescue e
|
306
331
|
# Print any uncaught exceptions
|
307
|
-
puts JSON.pretty_generate({ result: "exception", message: "unknown error",
|
332
|
+
puts JSON.pretty_generate({ result: "exception", message: "unknown error", backtrace: e.backtrace })
|
308
333
|
exit
|
309
334
|
end
|
310
|
-
end
|
335
|
+
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
metadata.gz.sig
CHANGED
Binary file
|