slmndr 0.0.4 → 0.0.5

Files changed (6)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/lib/slmndr.rb +137 -80
  5. metadata +2 -2
  6. metadata.gz.sig +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 80694b7cb325f3f082327ef4408b045ee23e9355
- data.tar.gz: feabcf945e9b69da248d5190130063dbbe3e47b2
+ metadata.gz: 9e90e3f806c2fa3b3dc195b6a11ddd412d9e6302
+ data.tar.gz: f637bf43473abf16538f6aa6e3927e08901b9c27
  SHA512:
- metadata.gz: d88f18d2641a21b233fe8af543b66e3512aa9671445a1d43cdc6790c6de17e4450011df6d0171cf77f9280b77d6752c182fb8b06ed72ef2dcbd81daed5b364ab
- data.tar.gz: 77f9ea7a1eca3d161647e7a50960baec47fddcc7051df74b5ee51628dca108db76902d00a0718c40c77b98169b47d0c0acd9d9dade6e91b6d45aa5fb11f65cc1
+ metadata.gz: 5dbd8d6b09f8d9ba76dcfdcdb9ed3848ee843aa6cb5b25c7caa20a48f3a74df35a5677f69ef78326e704a0ebbc1ef495e3bdaf12ab553230b2485f52eee0e9f0
+ data.tar.gz: 1c499a5358a878b38190ed5842bc58f3077b319e2a630853b9fa9c5b1813a88697e1f882d2f7b0e3c92cc787169a0b72d309e46afa66c359b1e714a7b3c2f366
checksums.yaml.gz.sig CHANGED
Binary file
data.tar.gz.sig CHANGED
Binary file
data/lib/slmndr.rb CHANGED
@@ -54,7 +54,7 @@ module Salamander
  end
  end
 
- # Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
+ # Performs an unauthenticated, breadth-first crawl of the target web asset.
  # Function blocks until all threads terminate.
  # This function can receive a code block like so...
  # Salamander::crawl(urls, args) do |request, response, depth|
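A minimal caller sketch for the documented block signature; the seed URL and the :threads and :delay argument names are assumptions inferred from the surrounding code (the diff itself confirms only :agent and :visit):

    require 'slmndr'

    # Each fetched page is handed to the block as (request URL, response hash,
    # crawl depth); the response hash carries the fields built into _response,
    # such as :base_uri, :status, and :body.
    Salamander::crawl(["http://example.com/"], { threads: 4, delay: 1 }) do |request, response, depth|
      puts "[#{depth}] #{request} -> #{response[:base_uri]}"
    end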
@@ -102,106 +102,154 @@ module Salamander
  if args[:agent] != nil then
  agent = args[:agent]
  end
- # Create threads list
- _threads = []
+ # Create threads list and lock
+ _threads = {}
+ tlock = Mutex.new
  # Create jobs map and lock
  jobs = {}
  jlock = Mutex.new
+ # Create yield job list and lock
+ yields = []
+ ylock = Mutex.new
  # Create job; Job States: 0: waiting, 1: working, 2: done
  urls.each do |url|
  jobs[:"#{url}"] = { state: 0, depth: 0 }
  end
  # Create and launch crawl threads
- for id in 1..threads
- # Create crawl thread
- thread = Thread.new do
- # Loop
- while true
- # Find job to do
- kill = true
- job_url = nil
- jlock.synchronize do
- # For each job
- jobs.each do |u, j|
- # If job is waiting
- if j[:state] == 0 then
- # Take job
- job_url = u
- j[:state] = 1
- kill = false
- break
- elsif j[:state] == 1 then
- # Some jobs are still working; anticipate more jobs in the future
- kill = false
- end
- end
- end
- # If all jobs are done, and no job is found
- if kill then
- break
+ for i in 1..threads
+ tlock.synchronize do
+ # Create crawl thread
+ thread = Thread.new do
+ # Wait until all threads are created
+ tlock.synchronize do
  end
- # If no job found but some jobs are still being worked on, skip
- if job_url == nil then
- next
- end
- # Get job depth
- job_depth = jobs[:"#{job_url}"][:depth]
- # Get all links in page pointed to by job URL
- begin
- open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
- _response = {
- base_uri: response.base_uri,
- meta: response.meta,
- status: response.status,
- content_type: response.content_type,
- charset: response.charset,
- content_encoding: response.content_encoding,
- last_modified: response.last_modified,
- body: response.read
- }
- # Callback
- jlock.synchronize do
- yield "#{job_url}", _response, job_depth
+ # Get thread id
+ _id = Thread.current.object_id
+ # Loop
+ while true
+ # Check if thread has been forcefully killed
+ kill = false
+ tlock.synchronize do
+ kill = _threads[_id][:kill]
+ end
+ if kill then
+ break
+ end
+ # Find job to do
+ kill = true
+ job_url = nil
+ jlock.synchronize do
+ # For each job
+ jobs.each do |u, j|
+ # If job is waiting
+ if j[:state] == 0 then
+ # Take job
+ job_url = u
+ j[:state] = 1
+ kill = false
+ break
+ elsif j[:state] == 1 then
+ # Some jobs are still working; anticipate more jobs in the future
+ kill = false
+ end
  end
- # If resolved URL is in scope
- if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
- # Add resolved URL to job queue and mark it as complete if it does not exist yet
- jlock.synchronize do
- if jobs[:"#{response.base_uri}"] == nil then
- yield "#{response.base_uri}", _response, job_depth
- jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
- end
+ end
+ # If all jobs are done, and no job is found
+ if kill then
+ break
+ end
+ # If no job found but some jobs are still being worked on, skip
+ if job_url == nil then
+ next
+ end
+ # Get job depth
+ job_depth = jobs[:"#{job_url}"][:depth]
+ # Get all links in page pointed to by job URL
+ begin
+ open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
+ _response = {
+ base_uri: response.base_uri,
+ meta: response.meta,
+ status: response.status,
+ content_type: response.content_type,
+ charset: response.charset,
+ content_encoding: response.content_encoding,
+ last_modified: response.last_modified,
+ body: response.read
+ }
+ # Callback
+ ylock.synchronize do
+ yields << { request: "#{job_url}", response: _response, depth: job_depth }
  end
- # Get links for resolve URL
- Salamander::get_links(response.base_uri, _response[:body]) do |link|
- # Determine if the link should be visited
- if visit.nil? || visit.call(link) then
- jlock.synchronize do
- # If link is not in job queue
- if jobs[:"#{link}"] == nil then
- # Create job for the given link
- jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
+ # If resolved URL is in scope
+ if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
+ # Add resolved URL to job queue and mark it as complete if it does not exist yet
+ jlock.synchronize do
+ if jobs[:"#{response.base_uri}"] == nil then
+ jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
+ end
+ end
+ # Get links for resolve URL
+ Salamander::get_links(response.base_uri, _response[:body]) do |link|
+ # Determine if the link should be visited
+ if visit.nil? || visit.call(link) then
+ jlock.synchronize do
+ # If link is not in job queue
+ if jobs[:"#{link}"] == nil then
+ # Create job for the given link
+ jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
+ end
  end
  end
  end
  end
  end
+ rescue
  end
- rescue
- end
- # Flag job as complete
- jlock.synchronize do
- jobs[:"#{job_url}"][:state] = 2
+ # Flag job as complete
+ jlock.synchronize do
+ jobs[:"#{job_url}"][:state] = 2
+ end
+ # Perform delay
+ sleep(delay)
  end
- # Perform delay
- sleep(delay)
  end
+ _threads[thread.object_id] = { thread: thread, kill: false }
  end
- _threads << thread
  end
  # Wait for all threads to die
- _threads.each do |_thread|
- _thread.join
+ while true
+ # Execute yields
+ y = nil
+ ylock.synchronize do
+ y = yields.shift
+ end
+ if y != nil then
+ tlock.synchronize do
+ # Pre-emptive kill if yield breaks
+ _threads.each do |id, _thread|
+ _thread[:kill] = true
+ end
+ # Yield
+ yield y[:request], y[:response], y[:depth]
+ # Cancel kill if yield does not break
+ _threads.each do |id, _thread|
+ _thread[:kill] = false
+ end
+ end
+ next
+ end
+ # Check if dead
+ alive = false
+ _threads.each do |id, _thread|
+ alive = alive || _thread[:thread].alive?
+ if alive then
+ break
+ end
+ end
+ if !alive then
+ break
+ end
  end
  end
 
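This hunk replaces the direct yield-from-worker model with a coordinator: workers append results to a shared yields list, and the main thread replays them to the caller's block, raising every worker's :kill flag before each replay so that a break inside the block leaves the workers able to notice the flag and exit; if the block returns normally, the flags are lowered again. A distilled, self-contained sketch of that pattern (toy job counts and output, not the gem's code):

    threads = {}
    tlock = Mutex.new
    yields = []
    ylock = Mutex.new

    # Workers: poll the kill flag under tlock, push results under ylock.
    workers = 4.times.map do
      Thread.new do
        id = Thread.current.object_id
        10.times do |n|
          stop = false
          tlock.synchronize { stop = threads[id] && threads[id][:kill] }
          break if stop
          ylock.synchronize { yields << "#{id}-#{n}" }
          sleep 0.01
        end
      end
    end
    # Register workers with a lowered kill flag, as the diff does after Thread.new.
    tlock.synchronize do
      workers.each { |t| threads[t.object_id] = { thread: t, kill: false } }
    end

    # Coordinator: drain the queue on the main thread; exit once the queue is
    # empty and no worker is alive (a dead pool can no longer grow the queue).
    loop do
      y = ylock.synchronize { yields.shift }
      if y
        tlock.synchronize do
          threads.each_value { |t| t[:kill] = true }   # pre-emptive kill...
          puts y                                       # ...so a break here would stop the workers
          threads.each_value { |t| t[:kill] = false }  # block returned; cancel the kill
        end
        next
      end
      break unless threads.each_value.any? { |t| t[:thread].alive? }
    end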
@@ -239,6 +287,10 @@ if __FILE__ == $0 then
  puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
  exit
  end
+ urls_hosts = Set.new
+ urls.each do |url|
+ urls_hosts << Addressable::URI.parse(url).host
+ end
  urls.each do |url|
  begin
  Addressable::URI.parse(url)
@@ -247,7 +299,11 @@ if __FILE__ == $0 then
  exit
  end
  end
- _args = {}
+ _args = { visit:
+ lambda do |url|
+ return urls_hosts.include?(Addressable::URI.parse(url).host)
+ end
+ }
  # Attempt to retrieve the delay parameter
  if args['delay'] != nil then
  begin
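The :visit hook added in this hunk restricts the demo crawl to hosts taken from the seed URL list. The same check in isolation, as a standalone sketch (the seed URLs are illustrative):

    require 'set'
    require 'addressable/uri'

    urls = ["http://example.com/", "http://blog.example.com/"]

    # Collect the hosts named by the seed URLs.
    urls_hosts = Set.new
    urls.each { |url| urls_hosts << Addressable::URI.parse(url).host }

    # A candidate link is crawled only if its host is one of the seed hosts.
    visit = lambda do |url|
      urls_hosts.include?(Addressable::URI.parse(url).host)
    end

    visit.call("http://example.com/about")    # => true
    visit.call("http://other.example.net/")   # => false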
@@ -277,6 +333,7 @@ if __FILE__ == $0 then
  STDERR.puts " Welcome to the Salamander Web Crawler Demo!"
  STDERR.puts
  STDERR.puts " This is a command-line demonstration of the Salamander Web Crawler in action."
+ STDERR.puts " The crawl is restricted to the hosts inside the URL list that was provided."
  STDERR.puts " Press Ctrl+C at any time to interrupt the crawl."
  STDERR.puts
  STDERR.puts " Starting crawl at the following URLs:"
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: slmndr
  version: !ruby/object:Gem::Version
- version: 0.0.4
+ version: 0.0.5
  platform: ruby
  authors:
  - John Lawrence M. Penafiel
@@ -30,7 +30,7 @@ cert_chain:
  kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
  1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
  -----END CERTIFICATE-----
- date: 2015-06-30 00:00:00.000000000 Z
+ date: 2015-07-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: json
metadata.gz.sig CHANGED
Binary file