slmndr 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/lib/slmndr.rb +137 -80
  5. metadata +2 -2
  6. metadata.gz.sig +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 80694b7cb325f3f082327ef4408b045ee23e9355
4
- data.tar.gz: feabcf945e9b69da248d5190130063dbbe3e47b2
3
+ metadata.gz: 9e90e3f806c2fa3b3dc195b6a11ddd412d9e6302
4
+ data.tar.gz: f637bf43473abf16538f6aa6e3927e08901b9c27
5
5
  SHA512:
6
- metadata.gz: d88f18d2641a21b233fe8af543b66e3512aa9671445a1d43cdc6790c6de17e4450011df6d0171cf77f9280b77d6752c182fb8b06ed72ef2dcbd81daed5b364ab
7
- data.tar.gz: 77f9ea7a1eca3d161647e7a50960baec47fddcc7051df74b5ee51628dca108db76902d00a0718c40c77b98169b47d0c0acd9d9dade6e91b6d45aa5fb11f65cc1
6
+ metadata.gz: 5dbd8d6b09f8d9ba76dcfdcdb9ed3848ee843aa6cb5b25c7caa20a48f3a74df35a5677f69ef78326e704a0ebbc1ef495e3bdaf12ab553230b2485f52eee0e9f0
7
+ data.tar.gz: 1c499a5358a878b38190ed5842bc58f3077b319e2a630853b9fa9c5b1813a88697e1f882d2f7b0e3c92cc787169a0b72d309e46afa66c359b1e714a7b3c2f366
checksums.yaml.gz.sig CHANGED
Binary file
data.tar.gz.sig CHANGED
Binary file
data/lib/slmndr.rb CHANGED
@@ -54,7 +54,7 @@ module Salamander
54
54
  end
55
55
  end
56
56
 
57
- # Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
57
+ # Performs an unauthenticated, breadth-first crawl of the target web asset.
58
58
  # Function blocks until all threads terminate.
59
59
  # This function can receive a code block like so...
60
60
  # Salamander::crawl(urls, args) do |request, response, depth|
@@ -102,106 +102,154 @@ module Salamander
102
102
  if args[:agent] != nil then
103
103
  agent = args[:agent]
104
104
  end
105
- # Create threads list
106
- _threads = []
105
+ # Create threads list and lock
106
+ _threads = {}
107
+ tlock = Mutex.new
107
108
  # Create jobs map and lock
108
109
  jobs = {}
109
110
  jlock = Mutex.new
111
+ # Create yield job list and lock
112
+ yields = []
113
+ ylock = Mutex.new
110
114
  # Create job; Job States: 0: waiting, 1: working, 2: done
111
115
  urls.each do |url|
112
116
  jobs[:"#{url}"] = { state: 0, depth: 0 }
113
117
  end
114
118
  # Create and launch crawl threads
115
- for id in 1..threads
116
- # Create crawl thread
117
- thread = Thread.new do
118
- # Loop
119
- while true
120
- # Find job to do
121
- kill = true
122
- job_url = nil
123
- jlock.synchronize do
124
- # For each job
125
- jobs.each do |u, j|
126
- # If job is waiting
127
- if j[:state] == 0 then
128
- # Take job
129
- job_url = u
130
- j[:state] = 1
131
- kill = false
132
- break
133
- elsif j[:state] == 1 then
134
- # Some jobs are still working; anticipate more jobs in the future
135
- kill = false
136
- end
137
- end
138
- end
139
- # If all jobs are done, and no job is found
140
- if kill then
141
- break
119
+ for i in 1..threads
120
+ tlock.synchronize do
121
+ # Create crawl thread
122
+ thread = Thread.new do
123
+ # Wait until all threads are created
124
+ tlock.synchronize do
142
125
  end
143
- # If no job found but some jobs are still being worked on, skip
144
- if job_url == nil then
145
- next
146
- end
147
- # Get job depth
148
- job_depth = jobs[:"#{job_url}"][:depth]
149
- # Get all links in page pointed to by job URL
150
- begin
151
- open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
152
- _response = {
153
- base_uri: response.base_uri,
154
- meta: response.meta,
155
- status: response.status,
156
- content_type: response.content_type,
157
- charset: response.charset,
158
- content_encoding: response.content_encoding,
159
- last_modified: response.last_modified,
160
- body: response.read
161
- }
162
- # Callback
163
- jlock.synchronize do
164
- yield "#{job_url}", _response, job_depth
126
+ # Get thread id
127
+ _id = Thread.current.object_id
128
+ # Loop
129
+ while true
130
+ # Check if thread has been forcefully killed
131
+ kill = false
132
+ tlock.synchronize do
133
+ kill = _threads[_id][:kill]
134
+ end
135
+ if kill then
136
+ break
137
+ end
138
+ # Find job to do
139
+ kill = true
140
+ job_url = nil
141
+ jlock.synchronize do
142
+ # For each job
143
+ jobs.each do |u, j|
144
+ # If job is waiting
145
+ if j[:state] == 0 then
146
+ # Take job
147
+ job_url = u
148
+ j[:state] = 1
149
+ kill = false
150
+ break
151
+ elsif j[:state] == 1 then
152
+ # Some jobs are still working; anticipate more jobs in the future
153
+ kill = false
154
+ end
165
155
  end
166
- # If resolved URL is in scope
167
- if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
168
- # Add resolved URL to job queue and mark it as complete if it does not exist yet
169
- jlock.synchronize do
170
- if jobs[:"#{response.base_uri}"] == nil then
171
- yield "#{response.base_uri}", _response, job_depth
172
- jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
173
- end
156
+ end
157
+ # If all jobs are done, and no job is found
158
+ if kill then
159
+ break
160
+ end
161
+ # If no job found but some jobs are still being worked on, skip
162
+ if job_url == nil then
163
+ next
164
+ end
165
+ # Get job depth
166
+ job_depth = jobs[:"#{job_url}"][:depth]
167
+ # Get all links in page pointed to by job URL
168
+ begin
169
+ open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
170
+ _response = {
171
+ base_uri: response.base_uri,
172
+ meta: response.meta,
173
+ status: response.status,
174
+ content_type: response.content_type,
175
+ charset: response.charset,
176
+ content_encoding: response.content_encoding,
177
+ last_modified: response.last_modified,
178
+ body: response.read
179
+ }
180
+ # Callback
181
+ ylock.synchronize do
182
+ yields << { request: "#{job_url}", response: _response, depth: job_depth }
174
183
  end
175
- # Get links for resolve URL
176
- Salamander::get_links(response.base_uri, _response[:body]) do |link|
177
- # Determine if the link should be visited
178
- if visit.nil? || visit.call(link) then
179
- jlock.synchronize do
180
- # If link is not in job queue
181
- if jobs[:"#{link}"] == nil then
182
- # Create job for the given link
183
- jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
184
+ # If resolved URL is in scope
185
+ if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
186
+ # Add resolved URL to job queue and mark it as complete if it does not exist yet
187
+ jlock.synchronize do
188
+ if jobs[:"#{response.base_uri}"] == nil then
189
+ jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
190
+ end
191
+ end
192
+ # Get links for resolve URL
193
+ Salamander::get_links(response.base_uri, _response[:body]) do |link|
194
+ # Determine if the link should be visited
195
+ if visit.nil? || visit.call(link) then
196
+ jlock.synchronize do
197
+ # If link is not in job queue
198
+ if jobs[:"#{link}"] == nil then
199
+ # Create job for the given link
200
+ jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
201
+ end
184
202
  end
185
203
  end
186
204
  end
187
205
  end
188
206
  end
207
+ rescue
189
208
  end
190
- rescue
191
- end
192
- # Flag job as complete
193
- jlock.synchronize do
194
- jobs[:"#{job_url}"][:state] = 2
209
+ # Flag job as complete
210
+ jlock.synchronize do
211
+ jobs[:"#{job_url}"][:state] = 2
212
+ end
213
+ # Perform delay
214
+ sleep(delay)
195
215
  end
196
- # Perform delay
197
- sleep(delay)
198
216
  end
217
+ _threads[thread.object_id] = { thread: thread, kill: false }
199
218
  end
200
- _threads << thread
201
219
  end
202
220
  # Wait for all threads to die
203
- _threads.each do |_thread|
204
- _thread.join
221
+ while true
222
+ # Execute yields
223
+ y = nil
224
+ ylock.synchronize do
225
+ y = yields.shift
226
+ end
227
+ if y != nil then
228
+ tlock.synchronize do
229
+ # Pre-emptive kill if yield breaks
230
+ _threads.each do |id, _thread|
231
+ _thread[:kill] = true
232
+ end
233
+ # Yield
234
+ yield y[:request], y[:response], y[:depth]
235
+ # Cancel kill if yield does not break
236
+ _threads.each do |id, _thread|
237
+ _thread[:kill] = false
238
+ end
239
+ end
240
+ next
241
+ end
242
+ # Check if dead
243
+ alive = false
244
+ _threads.each do |id, _thread|
245
+ alive = alive || _thread[:thread].alive?
246
+ if alive then
247
+ break
248
+ end
249
+ end
250
+ if !alive then
251
+ break
252
+ end
205
253
  end
206
254
  end
207
255
 
@@ -239,6 +287,10 @@ if __FILE__ == $0 then
239
287
  puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
240
288
  exit
241
289
  end
290
+ urls_hosts = Set.new
291
+ urls.each do |url|
292
+ urls_hosts << Addressable::URI.parse(url).host
293
+ end
242
294
  urls.each do |url|
243
295
  begin
244
296
  Addressable::URI.parse(url)
@@ -247,7 +299,11 @@ if __FILE__ == $0 then
247
299
  exit
248
300
  end
249
301
  end
250
- _args = {}
302
+ _args = { visit:
303
+ lambda do |url|
304
+ return urls_hosts.include?(Addressable::URI.parse(url).host)
305
+ end
306
+ }
251
307
  # Attempt to retrieve the delay parameter
252
308
  if args['delay'] != nil then
253
309
  begin
@@ -277,6 +333,7 @@ if __FILE__ == $0 then
277
333
  STDERR.puts " Welcome to the Salamander Web Crawler Demo!"
278
334
  STDERR.puts
279
335
  STDERR.puts " This is a command-line demonstration of the Salamander Web Crawler in action."
336
+ STDERR.puts " The crawl is restricted to the hosts inside the URL list that was provided."
280
337
  STDERR.puts " Press Ctrl+C at any time to interrupt the crawl."
281
338
  STDERR.puts
282
339
  STDERR.puts " Starting crawl at the following URLs:"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slmndr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Lawrence M. Penafiel
@@ -30,7 +30,7 @@ cert_chain:
30
30
  kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
31
31
  1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
32
32
  -----END CERTIFICATE-----
33
- date: 2015-06-30 00:00:00.000000000 Z
33
+ date: 2015-07-01 00:00:00.000000000 Z
34
34
  dependencies:
35
35
  - !ruby/object:Gem::Dependency
36
36
  name: json
metadata.gz.sig CHANGED
Binary file