slmndr 0.0.2 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2ebd74e5e1f4bf29b4b8a01362c13eb8196e4ff5
4
- data.tar.gz: 6053b2e0ec26ba5dd0df380060dd60bcf3a45185
3
+ metadata.gz: 80694b7cb325f3f082327ef4408b045ee23e9355
4
+ data.tar.gz: feabcf945e9b69da248d5190130063dbbe3e47b2
5
5
  SHA512:
6
- metadata.gz: 77db1e9e5d4d8ba8e2ef70d1ccd87e4220f95a06e093405e43cd29d0f8f6ca8c9643d2872e10eb5ce38496796c650bcd0b0118c7a963dc363c05fa5b91ea5c18
7
- data.tar.gz: 88ba03da2130c604382f9db2690da8542cf87d8e03c036823afc4dd89f0885a68f192827e5d6a6cc024736cc6a89289b605f54569b798aa11fbd21352ef27da9
6
+ metadata.gz: d88f18d2641a21b233fe8af543b66e3512aa9671445a1d43cdc6790c6de17e4450011df6d0171cf77f9280b77d6752c182fb8b06ed72ef2dcbd81daed5b364ab
7
+ data.tar.gz: 77f9ea7a1eca3d161647e7a50960baec47fddcc7051df74b5ee51628dca108db76902d00a0718c40c77b98169b47d0c0acd9d9dade6e91b6d45aa5fb11f65cc1
checksums.yaml.gz.sig CHANGED
Binary file
data/lib/slmndr.rb CHANGED
@@ -56,11 +56,26 @@ module Salamander
56
56
 
57
57
  # Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
58
58
  # Function blocks until all threads terminate.
59
- # Optional Arguments (Place these inside the 'args' hash)
60
- # visit: A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
61
- # delay: A positive float indicating the number of seconds between requests in one thread. Defaults to 1.
62
- # threads: A positive integer indicating the number of allowed simultaneous requests to the target web asset. Defaults to 1.
63
- # agent: The user-agent string to be used. Defaults to "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)".
59
+ # This function can receive a code block like so...
60
+ # Salamander::crawl(urls, args) do |request, response, depth|
61
+ # # request: the URL string used to request the current page
62
+ # # response: a hash containing data pertaining to the response to the requested URL
63
+ # # depth: a positive integer indicating the breadth/depth of the current page, relative to one of the seed URLs
64
+ # end
65
+ # Response Hash Contents
66
+ # base_uri: The base_uri field of OpenURI's response
67
+ # meta: The meta field of OpenURI's response
68
+ # status: The status field of OpenURI's response
69
+ # content_type: The content_type field of OpenURI's response
70
+ # charset: The charset field of OpenURI's response
71
+ # content_encoding: The content_encoding field of OpenURI's response
72
+ # last_modified: The last_modified field of OpenURI's response
73
+ # body: Contains the body of OpenURI's response
74
+ # Optional Arguments
75
+ # visit: A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
76
+ # delay: A positive float indicating the number of seconds between requests in one thread. Defaults to 1.
77
+ # threads: A positive integer indicating the number of allowed simultaneous requests to the target web asset. Defaults to 1.
78
+ # agent: The user-agent string to be used. Defaults to "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)".
64
79
  # @param urls A list of strings containing the seed URLs.
65
80
  # @param args A hash containing optional arguments for the function.
66
81
  def crawl(urls, args = {})
@@ -134,21 +149,31 @@ module Salamander
134
149
  # Get all links in page pointed to by job URL
135
150
  begin
136
151
  open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
152
+ _response = {
153
+ base_uri: response.base_uri,
154
+ meta: response.meta,
155
+ status: response.status,
156
+ content_type: response.content_type,
157
+ charset: response.charset,
158
+ content_encoding: response.content_encoding,
159
+ last_modified: response.last_modified,
160
+ body: response.read
161
+ }
137
162
  # Callback
138
163
  jlock.synchronize do
139
- yield "#{job_url}", response, job_depth
164
+ yield "#{job_url}", _response, job_depth
140
165
  end
141
166
  # If resolved URL is in scope
142
167
  if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
143
168
  # Add resolved URL to job queue and mark it as complete if it does not exist yet
144
169
  jlock.synchronize do
145
170
  if jobs[:"#{response.base_uri}"] == nil then
146
- yield "#{response.base_uri}", response, job_depth
171
+ yield "#{response.base_uri}", _response, job_depth
147
172
  jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
148
173
  end
149
174
  end
150
175
  # Get links for resolved URL
151
- Salamander::get_links(response.base_uri, response) do |link|
176
+ Salamander::get_links(response.base_uri, _response[:body]) do |link|
152
177
  # Determine if the link should be visited
153
178
  if visit.nil? || visit.call(link) then
154
179
  jlock.synchronize do
@@ -291,7 +316,7 @@ if __FILE__ == $0 then
291
316
  end
292
317
  end
293
318
  rescue => e
294
- puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", inspect: e.inspect })
319
+ puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", backtrace: e.backtrace })
295
320
  exit
296
321
  end
297
322
  end
@@ -304,7 +329,7 @@ if __FILE__ == $0 then
304
329
  STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
305
330
  rescue e
306
331
  # Print any uncaught exceptions
307
- puts JSON.pretty_generate({ result: "exception", message: "unknown error", inspect: e.inspect })
332
+ puts JSON.pretty_generate({ result: "exception", message: "unknown error", backtrace: e.backtrace })
308
333
  exit
309
334
  end
310
- end
335
+ end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slmndr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Lawrence M. Penafiel
metadata.gz.sig CHANGED
Binary file