slmndr 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/slmndr.rb +137 -80
- metadata +2 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-metadata.gz:
-data.tar.gz:
+metadata.gz: 9e90e3f806c2fa3b3dc195b6a11ddd412d9e6302
+data.tar.gz: f637bf43473abf16538f6aa6e3927e08901b9c27
 SHA512:
-metadata.gz:
-data.tar.gz:
+metadata.gz: 5dbd8d6b09f8d9ba76dcfdcdb9ed3848ee843aa6cb5b25c7caa20a48f3a74df35a5677f69ef78326e704a0ebbc1ef495e3bdaf12ab553230b2485f52eee0e9f0
+data.tar.gz: 1c499a5358a878b38190ed5842bc58f3077b319e2a630853b9fa9c5b1813a88697e1f882d2f7b0e3c92cc787169a0b72d309e46afa66c359b1e714a7b3c2f366
checksums.yaml.gz.sig
CHANGED
Binary file
data.tar.gz.sig
CHANGED
Binary file
data/lib/slmndr.rb
CHANGED
@@ -54,7 +54,7 @@ module Salamander
 end
 end
 
-# Performs
+# Performs an unauthenticated, breadth-first crawl of the target web asset.
 # Function blocks until all threads terminate.
 # This function can receive a code block like so...
 # Salamander::crawl(urls, args) do |request, response, depth|
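
The doc comment above gives the crawl entry point and its block signature. A minimal usage sketch assembled from what this diff shows: the :visit key is confirmed by the demo code further down, while passing the thread count and delay as :threads/:delay is an assumption (they appear only as the local variables `threads` and `delay` in the crawler body), and the seed URL is a placeholder.

require 'slmndr'

# Seed URLs; the crawl stays near these hosts only if a :visit callback enforces it.
urls = ["http://example.com/"]

# The block receives the fetched URL, a hash of open-uri response fields
# (:base_uri, :status, :content_type, :body, ...), and the BFS depth.
Salamander::crawl(urls, { threads: 4, delay: 1 }) do |request, response, depth|
  puts "#{depth} #{request} #{response[:content_type]}"
end
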
@@ -102,106 +102,154 @@ module Salamander
 if args[:agent] != nil then
 agent = args[:agent]
 end
-# Create threads list
-_threads = []
+# Create threads list and lock
+_threads = {}
+tlock = Mutex.new
 # Create jobs map and lock
 jobs = {}
 jlock = Mutex.new
+# Create yield job list and lock
+yields = []
+ylock = Mutex.new
 # Create job; Job States: 0: waiting, 1: working, 2: done
 urls.each do |url|
 jobs[:"#{url}"] = { state: 0, depth: 0 }
 end
 # Create and launch crawl threads
-for
-
-
-
-
-
-kill = true
-job_url = nil
-jlock.synchronize do
-# For each job
-jobs.each do |u, j|
-# If job is waiting
-if j[:state] == 0 then
-# Take job
-job_url = u
-j[:state] = 1
-kill = false
-break
-elsif j[:state] == 1 then
-# Some jobs are still working; anticipate more jobs in the future
-kill = false
-end
-end
-end
-# If all jobs are done, and no job is found
-if kill then
-break
+for i in 1..threads
+tlock.synchronize do
+# Create crawl thread
+thread = Thread.new do
+# Wait until all threads are created
+tlock.synchronize do
 end
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Get thread id
+_id = Thread.current.object_id
+# Loop
+while true
+# Check if thread has been forcefully killed
+kill = false
+tlock.synchronize do
+kill = _threads[_id][:kill]
+end
+if kill then
+break
+end
+# Find job to do
+kill = true
+job_url = nil
+jlock.synchronize do
+# For each job
+jobs.each do |u, j|
+# If job is waiting
+if j[:state] == 0 then
+# Take job
+job_url = u
+j[:state] = 1
+kill = false
+break
+elsif j[:state] == 1 then
+# Some jobs are still working; anticipate more jobs in the future
+kill = false
+end
 end
-
-
-
-
-
-
-
-
+end
+# If all jobs are done, and no job is found
+if kill then
+break
+end
+# If no job found but some jobs are still being worked on, skip
+if job_url == nil then
+next
+end
+# Get job depth
+job_depth = jobs[:"#{job_url}"][:depth]
+# Get all links in page pointed to by job URL
+begin
+open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
+_response = {
+base_uri: response.base_uri,
+meta: response.meta,
+status: response.status,
+content_type: response.content_type,
+charset: response.charset,
+content_encoding: response.content_encoding,
+last_modified: response.last_modified,
+body: response.read
+}
+# Callback
+ylock.synchronize do
+yields << { request: "#{job_url}", response: _response, depth: job_depth }
 end
-#
-
-#
-
-
-#
-
-
-
+# If resolved URL is in scope
+if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
+# Add resolved URL to job queue and mark it as complete if it does not exist yet
+jlock.synchronize do
+if jobs[:"#{response.base_uri}"] == nil then
+jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
+end
+end
+# Get links for resolve URL
+Salamander::get_links(response.base_uri, _response[:body]) do |link|
+# Determine if the link should be visited
+if visit.nil? || visit.call(link) then
+jlock.synchronize do
+# If link is not in job queue
+if jobs[:"#{link}"] == nil then
+# Create job for the given link
+jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
+end
 end
 end
 end
 end
 end
+rescue
 end
-
-
-
-
-
+# Flag job as complete
+jlock.synchronize do
+jobs[:"#{job_url}"][:state] = 2
+end
+# Perform delay
+sleep(delay)
 end
-# Perform delay
-sleep(delay)
 end
+_threads[thread.object_id] = { thread: thread, kill: false }
 end
-_threads << thread
 end
 # Wait for all threads to die
-
-
+while true
+# Execute yields
+y = nil
+ylock.synchronize do
+y = yields.shift
+end
+if y != nil then
+tlock.synchronize do
+# Pre-emptive kill if yield breaks
+_threads.each do |id, _thread|
+_thread[:kill] = true
+end
+# Yield
+yield y[:request], y[:response], y[:depth]
+# Cancel kill if yield does not break
+_threads.each do |id, _thread|
+_thread[:kill] = false
+end
+end
+next
+end
+# Check if dead
+alive = false
+_threads.each do |id, _thread|
+alive = alive || _thread[:thread].alive?
+if alive then
+break
+end
+end
+if !alive then
+break
+end
 end
 end
 
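
The bulk of this rewrite moves the caller's block out of the worker threads: workers now queue each fetched page on `yields`, and the main thread drains the queue and yields from there, flipping every worker's `:kill` flag on before the yield so that a `break` inside the caller's block (which stops the main loop without unwinding the workers) leaves them instructed to exit; the flags are cleared again if the block returns normally. A stripped-down sketch of the same queue-and-kill-flag shape, not the gem's API, with a hypothetical `10.times` loop standing in for the crawl work:

require 'thread'

yields  = []
ylock   = Mutex.new
workers = {}
tlock   = Mutex.new

2.times do
  tlock.synchronize do
    t = Thread.new do
      tlock.synchronize {}  # block until this thread is registered below
      me = Thread.current.object_id
      10.times do |i|
        break if tlock.synchronize { workers[me][:kill] }
        ylock.synchronize { yields << "result #{i} from #{me}" }
        sleep 0.01
      end
    end
    workers[t.object_id] = { thread: t, kill: false }
  end
end

loop do
  y = ylock.synchronize { yields.shift }
  if y
    tlock.synchronize do
      workers.each_value { |w| w[:kill] = true }   # pre-emptive kill in case the block breaks
      puts y                                       # stands in for: yield request, response, depth
      workers.each_value { |w| w[:kill] = false }  # block returned normally; cancel the kill
    end
    next
  end
  break unless workers.values.any? { |w| w[:thread].alive? }
end

Registering each thread while holding `tlock`, and making the thread's first statement an empty `tlock.synchronize`, is what guarantees a worker never reads its own entry before it exists.
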
@@ -239,6 +287,10 @@ if __FILE__ == $0 then
 puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
 exit
 end
+urls_hosts = Set.new
+urls.each do |url|
+urls_hosts << Addressable::URI.parse(url).host
+end
 urls.each do |url|
 begin
 Addressable::URI.parse(url)
@@ -247,7 +299,11 @@ if __FILE__ == $0 then
 exit
 end
 end
-_args = {}
+_args = { visit:
+lambda do |url|
+return urls_hosts.include?(Addressable::URI.parse(url).host)
+end
+}
 # Attempt to retrieve the delay parameter
 if args['delay'] != nil then
 begin
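
These two hunks give the demo a host whitelist: it collects the host of every seed URL into a Set and passes a `visit` lambda that accepts only links on those hosts. The crawler itself only checks that a resolved `base_uri` stays on the same host as the job URL, so this callback is what keeps the demo from wandering off the seed hosts. The same logic in isolation, with placeholder seed URLs:

require 'set'
require 'addressable/uri'

urls = ["http://example.com/", "http://blog.example.org/"]

urls_hosts = Set.new
urls.each { |url| urls_hosts << Addressable::URI.parse(url).host }

visit = lambda do |url|
  urls_hosts.include?(Addressable::URI.parse(url).host)
end

visit.call("http://example.com/about")  # => true
visit.call("http://evil.example.net/")  # => false
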
@@ -277,6 +333,7 @@ if __FILE__ == $0 then
 STDERR.puts " Welcome to the Salamander Web Crawler Demo!"
 STDERR.puts
 STDERR.puts " This is a command-line demonstration of the Salamander Web Crawler in action."
+STDERR.puts " The crawl is restricted to the hosts inside the URL list that was provided."
 STDERR.puts " Press Ctrl+C at any time to interrupt the crawl."
 STDERR.puts
 STDERR.puts " Starting crawl at the following URLs:"
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: slmndr
 version: !ruby/object:Gem::Version
-version: 0.0.4
+version: 0.0.5
 platform: ruby
 authors:
 - John Lawrence M. Penafiel
@@ -30,7 +30,7 @@ cert_chain:
 kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
 1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
 -----END CERTIFICATE-----
-date: 2015-
+date: 2015-07-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: json
metadata.gz.sig
CHANGED
Binary file