slmndr 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2ebd74e5e1f4bf29b4b8a01362c13eb8196e4ff5
4
- data.tar.gz: 6053b2e0ec26ba5dd0df380060dd60bcf3a45185
3
+ metadata.gz: 80694b7cb325f3f082327ef4408b045ee23e9355
4
+ data.tar.gz: feabcf945e9b69da248d5190130063dbbe3e47b2
5
5
  SHA512:
6
- metadata.gz: 77db1e9e5d4d8ba8e2ef70d1ccd87e4220f95a06e093405e43cd29d0f8f6ca8c9643d2872e10eb5ce38496796c650bcd0b0118c7a963dc363c05fa5b91ea5c18
7
- data.tar.gz: 88ba03da2130c604382f9db2690da8542cf87d8e03c036823afc4dd89f0885a68f192827e5d6a6cc024736cc6a89289b605f54569b798aa11fbd21352ef27da9
6
+ metadata.gz: d88f18d2641a21b233fe8af543b66e3512aa9671445a1d43cdc6790c6de17e4450011df6d0171cf77f9280b77d6752c182fb8b06ed72ef2dcbd81daed5b364ab
7
+ data.tar.gz: 77f9ea7a1eca3d161647e7a50960baec47fddcc7051df74b5ee51628dca108db76902d00a0718c40c77b98169b47d0c0acd9d9dade6e91b6d45aa5fb11f65cc1
checksums.yaml.gz.sig CHANGED
Binary file
data/lib/slmndr.rb CHANGED
@@ -56,11 +56,26 @@ module Salamander
56
56
 
57
57
  # Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
58
58
  # Function blocks until all threads terminate.
59
- # Optional Arguments (Place these inside the 'args' hash)
60
- # visit: A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
61
- # delay: A positive float indicating the number of seconds between requests in one thread. Defaults to 1.
62
- # threads: A positive integer indicating the number of allowed simultaneous requests to the target web asset. Defaults to 1.
63
- # agent: The user-agent string to be used. Defaults to "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)".
59
+ # This function can receive a code block like so...
60
+ # Salamander::crawl(urls, args) do |request, response, depth|
61
+ # # request: the URL string used to request the current page
62
+ # # response: a hash containing data pertaining to the response to the requested URL
63
+ # # depth: a positive integer indicating the breadth/depth of the current page, relative to one of the seed URLs
64
+ # end
65
+ # Response Hash Contents
66
+ # base_uri: The base_uri field of OpenURI's response
67
+ # meta: The meta field of OpenURI's response
68
+ # status: The status field of OpenURI's response
69
+ # content_type: The content_type field of OpenURI's response
70
+ # charset: The charset field of OpenURI's response
71
+ # content_encoding: The content_encoding field of OpenURI's response
72
+ # last_modified: The last_modified field of OpenURI's response
73
+ # body: Contains the body of OpenURI's response
74
+ # Optional Arguments
75
+ # visit: A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
76
+ # delay: A positive float indicating the number of seconds between requests in one thread. Defaults to 1.
77
+ # threads: A positive integer indicating the number of allowed simultaneous requests to the target web asset. Defaults to 1.
78
+ # agent: The user-agent string to be used. Defaults to "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)".
64
79
  # @param urls A list of strings containing the seed URLs.
65
80
  # @param args A hash containing optional arguments for the function.
66
81
  def crawl(urls, args = {})
@@ -134,21 +149,31 @@ module Salamander
134
149
  # Get all links in page pointed to by job URL
135
150
  begin
136
151
  open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
152
+ _response = {
153
+ base_uri: response.base_uri,
154
+ meta: response.meta,
155
+ status: response.status,
156
+ content_type: response.content_type,
157
+ charset: response.charset,
158
+ content_encoding: response.content_encoding,
159
+ last_modified: response.last_modified,
160
+ body: response.read
161
+ }
137
162
  # Callback
138
163
  jlock.synchronize do
139
- yield "#{job_url}", response, job_depth
164
+ yield "#{job_url}", _response, job_depth
140
165
  end
141
166
  # If resolved URL is in scope
142
167
  if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
143
168
  # Add resolved URL to job queue and mark it as complete if it does not exist yet
144
169
  jlock.synchronize do
145
170
  if jobs[:"#{response.base_uri}"] == nil then
146
- yield "#{response.base_uri}", response, job_depth
171
+ yield "#{response.base_uri}", _response, job_depth
147
172
  jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
148
173
  end
149
174
  end
150
175
  # Get links for the resolved URL
151
- Salamander::get_links(response.base_uri, response) do |link|
176
+ Salamander::get_links(response.base_uri, _response[:body]) do |link|
152
177
  # Determine if the link should be visited
153
178
  if visit.nil? || visit.call(link) then
154
179
  jlock.synchronize do
@@ -291,7 +316,7 @@ if __FILE__ == $0 then
291
316
  end
292
317
  end
293
318
  rescue => e
294
- puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", inspect: e.inspect })
319
+ puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", backtrace: e.backtrace })
295
320
  exit
296
321
  end
297
322
  end
@@ -304,7 +329,7 @@ if __FILE__ == $0 then
304
329
  STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
305
330
  rescue e
306
331
  # Print any uncaught exceptions
307
- puts JSON.pretty_generate({ result: "exception", message: "unknown error", inspect: e.inspect })
332
+ puts JSON.pretty_generate({ result: "exception", message: "unknown error", backtrace: e.backtrace })
308
333
  exit
309
334
  end
310
- end
335
+ end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slmndr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Lawrence M. Penafiel
metadata.gz.sig CHANGED
Binary file