slmndr 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/slmndr.rb +137 -80
- metadata +2 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9e90e3f806c2fa3b3dc195b6a11ddd412d9e6302
|
4
|
+
data.tar.gz: f637bf43473abf16538f6aa6e3927e08901b9c27
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5dbd8d6b09f8d9ba76dcfdcdb9ed3848ee843aa6cb5b25c7caa20a48f3a74df35a5677f69ef78326e704a0ebbc1ef495e3bdaf12ab553230b2485f52eee0e9f0
|
7
|
+
data.tar.gz: 1c499a5358a878b38190ed5842bc58f3077b319e2a630853b9fa9c5b1813a88697e1f882d2f7b0e3c92cc787169a0b72d309e46afa66c359b1e714a7b3c2f366
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data.tar.gz.sig
CHANGED
Binary file
|
data/lib/slmndr.rb
CHANGED
@@ -54,7 +54,7 @@ module Salamander
|
|
54
54
|
end
|
55
55
|
end
|
56
56
|
|
57
|
-
# Performs
|
57
|
+
# Performs an unauthenticated, breadth-first crawl of the target web asset.
|
58
58
|
# Function blocks until all threads terminate.
|
59
59
|
# This function can receive a code block like so...
|
60
60
|
# Salamander::crawl(urls, args) do |request, response, depth|
|
@@ -102,106 +102,154 @@ module Salamander
|
|
102
102
|
if args[:agent] != nil then
|
103
103
|
agent = args[:agent]
|
104
104
|
end
|
105
|
-
# Create threads list
|
106
|
-
_threads = []
|
105
|
+
# Create threads list and lock
|
106
|
+
_threads = {}
|
107
|
+
tlock = Mutex.new
|
107
108
|
# Create jobs map and lock
|
108
109
|
jobs = {}
|
109
110
|
jlock = Mutex.new
|
111
|
+
# Create yield job list and lock
|
112
|
+
yields = []
|
113
|
+
ylock = Mutex.new
|
110
114
|
# Create job; Job States: 0: waiting, 1: working, 2: done
|
111
115
|
urls.each do |url|
|
112
116
|
jobs[:"#{url}"] = { state: 0, depth: 0 }
|
113
117
|
end
|
114
118
|
# Create and launch crawl threads
|
115
|
-
for
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
kill = true
|
122
|
-
job_url = nil
|
123
|
-
jlock.synchronize do
|
124
|
-
# For each job
|
125
|
-
jobs.each do |u, j|
|
126
|
-
# If job is waiting
|
127
|
-
if j[:state] == 0 then
|
128
|
-
# Take job
|
129
|
-
job_url = u
|
130
|
-
j[:state] = 1
|
131
|
-
kill = false
|
132
|
-
break
|
133
|
-
elsif j[:state] == 1 then
|
134
|
-
# Some jobs are still working; anticipate more jobs in the future
|
135
|
-
kill = false
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
# If all jobs are done, and no job is found
|
140
|
-
if kill then
|
141
|
-
break
|
119
|
+
for i in 1..threads
|
120
|
+
tlock.synchronize do
|
121
|
+
# Create crawl thread
|
122
|
+
thread = Thread.new do
|
123
|
+
# Wait until all threads are created
|
124
|
+
tlock.synchronize do
|
142
125
|
end
|
143
|
-
#
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
126
|
+
# Get thread id
|
127
|
+
_id = Thread.current.object_id
|
128
|
+
# Loop
|
129
|
+
while true
|
130
|
+
# Check if thread has been forcefully killed
|
131
|
+
kill = false
|
132
|
+
tlock.synchronize do
|
133
|
+
kill = _threads[_id][:kill]
|
134
|
+
end
|
135
|
+
if kill then
|
136
|
+
break
|
137
|
+
end
|
138
|
+
# Find job to do
|
139
|
+
kill = true
|
140
|
+
job_url = nil
|
141
|
+
jlock.synchronize do
|
142
|
+
# For each job
|
143
|
+
jobs.each do |u, j|
|
144
|
+
# If job is waiting
|
145
|
+
if j[:state] == 0 then
|
146
|
+
# Take job
|
147
|
+
job_url = u
|
148
|
+
j[:state] = 1
|
149
|
+
kill = false
|
150
|
+
break
|
151
|
+
elsif j[:state] == 1 then
|
152
|
+
# Some jobs are still working; anticipate more jobs in the future
|
153
|
+
kill = false
|
154
|
+
end
|
165
155
|
end
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
156
|
+
end
|
157
|
+
# If all jobs are done, and no job is found
|
158
|
+
if kill then
|
159
|
+
break
|
160
|
+
end
|
161
|
+
# If no job found but some jobs are still being worked on, skip
|
162
|
+
if job_url == nil then
|
163
|
+
next
|
164
|
+
end
|
165
|
+
# Get job depth
|
166
|
+
job_depth = jobs[:"#{job_url}"][:depth]
|
167
|
+
# Get all links in page pointed to by job URL
|
168
|
+
begin
|
169
|
+
open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
|
170
|
+
_response = {
|
171
|
+
base_uri: response.base_uri,
|
172
|
+
meta: response.meta,
|
173
|
+
status: response.status,
|
174
|
+
content_type: response.content_type,
|
175
|
+
charset: response.charset,
|
176
|
+
content_encoding: response.content_encoding,
|
177
|
+
last_modified: response.last_modified,
|
178
|
+
body: response.read
|
179
|
+
}
|
180
|
+
# Callback
|
181
|
+
ylock.synchronize do
|
182
|
+
yields << { request: "#{job_url}", response: _response, depth: job_depth }
|
174
183
|
end
|
175
|
-
#
|
176
|
-
|
177
|
-
#
|
178
|
-
|
179
|
-
|
180
|
-
#
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
+
# If resolved URL is in scope
|
185
|
+
if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
|
186
|
+
# Add resolved URL to job queue and mark it as complete if it does not exist yet
|
187
|
+
jlock.synchronize do
|
188
|
+
if jobs[:"#{response.base_uri}"] == nil then
|
189
|
+
jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
|
190
|
+
end
|
191
|
+
end
|
192
|
+
# Get links for resolved URL
|
193
|
+
Salamander::get_links(response.base_uri, _response[:body]) do |link|
|
194
|
+
# Determine if the link should be visited
|
195
|
+
if visit.nil? || visit.call(link) then
|
196
|
+
jlock.synchronize do
|
197
|
+
# If link is not in job queue
|
198
|
+
if jobs[:"#{link}"] == nil then
|
199
|
+
# Create job for the given link
|
200
|
+
jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
|
201
|
+
end
|
184
202
|
end
|
185
203
|
end
|
186
204
|
end
|
187
205
|
end
|
188
206
|
end
|
207
|
+
rescue
|
189
208
|
end
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
209
|
+
# Flag job as complete
|
210
|
+
jlock.synchronize do
|
211
|
+
jobs[:"#{job_url}"][:state] = 2
|
212
|
+
end
|
213
|
+
# Perform delay
|
214
|
+
sleep(delay)
|
195
215
|
end
|
196
|
-
# Perform delay
|
197
|
-
sleep(delay)
|
198
216
|
end
|
217
|
+
_threads[thread.object_id] = { thread: thread, kill: false }
|
199
218
|
end
|
200
|
-
_threads << thread
|
201
219
|
end
|
202
220
|
# Wait for all threads to die
|
203
|
-
|
204
|
-
|
221
|
+
while true
|
222
|
+
# Execute yields
|
223
|
+
y = nil
|
224
|
+
ylock.synchronize do
|
225
|
+
y = yields.shift
|
226
|
+
end
|
227
|
+
if y != nil then
|
228
|
+
tlock.synchronize do
|
229
|
+
# Pre-emptive kill if yield breaks
|
230
|
+
_threads.each do |id, _thread|
|
231
|
+
_thread[:kill] = true
|
232
|
+
end
|
233
|
+
# Yield
|
234
|
+
yield y[:request], y[:response], y[:depth]
|
235
|
+
# Cancel kill if yield does not break
|
236
|
+
_threads.each do |id, _thread|
|
237
|
+
_thread[:kill] = false
|
238
|
+
end
|
239
|
+
end
|
240
|
+
next
|
241
|
+
end
|
242
|
+
# Check if dead
|
243
|
+
alive = false
|
244
|
+
_threads.each do |id, _thread|
|
245
|
+
alive = alive || _thread[:thread].alive?
|
246
|
+
if alive then
|
247
|
+
break
|
248
|
+
end
|
249
|
+
end
|
250
|
+
if !alive then
|
251
|
+
break
|
252
|
+
end
|
205
253
|
end
|
206
254
|
end
|
207
255
|
|
@@ -239,6 +287,10 @@ if __FILE__ == $0 then
|
|
239
287
|
puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
|
240
288
|
exit
|
241
289
|
end
|
290
|
+
urls_hosts = Set.new
|
291
|
+
urls.each do |url|
|
292
|
+
urls_hosts << Addressable::URI.parse(url).host
|
293
|
+
end
|
242
294
|
urls.each do |url|
|
243
295
|
begin
|
244
296
|
Addressable::URI.parse(url)
|
@@ -247,7 +299,11 @@ if __FILE__ == $0 then
|
|
247
299
|
exit
|
248
300
|
end
|
249
301
|
end
|
250
|
-
_args = {
|
302
|
+
_args = { visit:
|
303
|
+
lambda do |url|
|
304
|
+
return urls_hosts.include?(Addressable::URI.parse(url).host)
|
305
|
+
end
|
306
|
+
}
|
251
307
|
# Attempt to retrieve the delay parameter
|
252
308
|
if args['delay'] != nil then
|
253
309
|
begin
|
@@ -277,6 +333,7 @@ if __FILE__ == $0 then
|
|
277
333
|
STDERR.puts " Welcome to the Salamander Web Crawler Demo!"
|
278
334
|
STDERR.puts
|
279
335
|
STDERR.puts " This is a command-line demonstration of the Salamander Web Crawler in action."
|
336
|
+
STDERR.puts " The crawl is restricted to the hosts inside the URL list that was provided."
|
280
337
|
STDERR.puts " Press Ctrl+C at any time to interrupt the crawl."
|
281
338
|
STDERR.puts
|
282
339
|
STDERR.puts " Starting crawl at the following URLs:"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slmndr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Lawrence M. Penafiel
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
|
31
31
|
1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
|
32
32
|
-----END CERTIFICATE-----
|
33
|
-
date: 2015-
|
33
|
+
date: 2015-07-01 00:00:00.000000000 Z
|
34
34
|
dependencies:
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
36
|
name: json
|
metadata.gz.sig
CHANGED
Binary file
|