slmndr 0.0.0

Files changed (6)
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +2 -0
  3. data/lib/slmndr.rb +322 -0
  4. data.tar.gz.sig +0 -0
  5. metadata +66 -0
  6. metadata.gz.sig +1 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 20770bf44914d9179137aee64b3f86626b260d57
+   data.tar.gz: dd6067dd15692a27fa5216ab67cbe0b7321eaacc
+ SHA512:
+   metadata.gz: 2c7d1723c78474eec28e2f7f22e5b19b601fc5903fb09bec22aa80c6bd266e8260608bd0b7a4ca11e3c012a5bb84ead4c1475b8426cfed46335657c474bea190
+   data.tar.gz: f12be9d14cb2f5d4f1813cb09c77fd3e207ce6268f9e8f12e4caac8ebba01ca9e8e57ba77e3614dd46a616a779d76764cc3e6f182ae5652d880e4240657f1730
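
The published archives can be checked against these digests. Below is a minimal verification sketch in Ruby using the standard Digest library; it assumes the gem archive has been unpacked (for example with tar -xf slmndr-0.0.0.gem) so that metadata.gz and data.tar.gz sit in the current directory.

    require 'digest'

    # Expected SHA512 digests recorded in checksums.yaml above
    expected = {
      'metadata.gz' => '2c7d1723c78474eec28e2f7f22e5b19b601fc5903fb09bec22aa80c6bd266e8260608bd0b7a4ca11e3c012a5bb84ead4c1475b8426cfed46335657c474bea190',
      'data.tar.gz' => 'f12be9d14cb2f5d4f1813cb09c77fd3e207ce6268f9e8f12e4caac8ebba01ca9e8e57ba77e3614dd46a616a779d76764cc3e6f182ae5652d880e4240657f1730'
    }

    # Recompute each file's digest and compare it against the recorded value
    expected.each do |file, digest|
      actual = Digest::SHA512.file(file).hexdigest
      puts "#{file}: #{actual == digest ? 'OK' : 'MISMATCH'}"
    end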
checksums.yaml.gz.sig ADDED
Binary file
data/lib/slmndr.rb ADDED
@@ -0,0 +1,322 @@
+ ## Salamander: A minimalistic ruby web crawling framework.
+ ## Authored by: John Lawrence M. Penafiel
+
+ require 'time'
+ require 'thread'
+ require 'set'
+ require 'open-uri'
+ require 'openssl'
+
+ require 'json'
+ require 'open_uri_redirections'
+ require 'nokogiri'
+ require 'addressable/uri'
+
+ ## Module
+ ##   Salamander
+ ## Description
+ ##   The Crawler module provides an easy way for the other components of the Salamander system to perform crawling.
+ ## Functions
+ ##   Salamander::crawl
+ module Salamander
+
+   ## Function
+   ##   get_links
+   ## Description
+   ##   Extracts outgoing links from the HTML pointed to by the given URL string.
+   ## Parameters
+   ##   url  - The URL of the HTML page the function is extracting links from.
+   ##   html - The HTML data to extract links from.
+   def self.get_links(url, html)
+     # Initialize
+     uri = Addressable::URI.parse(url)
+     # Parse as HTML
+     _html = Nokogiri::HTML(html)
+     # Get all anchors
+     _html.xpath('//a').each do |l|
+       # Extract hyperlink
+       href = l['href']
+       # Skip if hyperlink does not exist
+       if href == nil then
+         next
+       end
+       # Convert hyperlink to URI object
+       link = Addressable::URI.parse(href)
+       # Skip if hyperlink is not HTTP
+       if link.scheme != nil && link.scheme != 'http' && link.scheme != 'https' then
+         next
+       end
+       # Convert hyperlink to absolute form
+       if link.host == nil then
+         link.host = uri.host
+       end
+       if link.scheme == nil then
+         link.scheme = uri.scheme
+       end
+       if link.port == nil then
+         link.port = uri.port
+       end
+       # Remove link fragment
+       link.fragment = nil
+       # Yield
+       yield link
+     end
+   end
+
+   ## Function
+   ##   crawl
+   ## Description
+   ##   Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
+   ##   Function blocks until all threads terminate.
+   ## Parameters
+   ##   urls - Required. A list of strings containing the seed URLs.
+   ##   args - Optional. Default: {}. A hash containing optional arguments for the function.
+   ##     visit   - Optional. Default: nil. A lambda which accepts a URL and returns a boolean telling the crawler whether the URL should be visited.
+   ##     delay   - Optional. Default: 1. A positive float indicating the number of seconds between requests in one thread.
+   ##     threads - Optional. Default: 1. A positive integer indicating the number of allowed simultaneous requests to the target web asset.
+   ##     agent   - Optional. Default: "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)". The user-agent string to be used.
+   def crawl(urls, args = {})
+     # Get arguments
+     visit = nil
+     if args[:visit] != nil then
+       visit = args[:visit]
+     end
+     delay = 1
+     if args[:delay] != nil then
+       delay = args[:delay]
+     end
+     if delay < 0 then
+       raise "delay must be a positive float"
+     end
+     threads = 1
+     if args[:threads] != nil then
+       threads = args[:threads]
+     end
+     if threads < 1 then
+       raise "threads must be a positive integer"
+     end
+     agent = "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)"
+     if args[:agent] != nil then
+       agent = args[:agent]
+     end
+     # Create threads list
+     _threads = []
+     # Create jobs map and lock
+     jobs = {}
+     jlock = Mutex.new
+     # Create seed jobs; job states: 0: waiting, 1: working, 2: done
+     urls.each do |url|
+       jobs[:"#{url}"] = { state: 0, depth: 0 }
+     end
+     # Create and launch crawl threads
+     for id in 1..threads
+       # Create crawl thread
+       thread = Thread.new do
+         # Loop
+         while true
+           # Find a job to do
+           kill = true
+           job_url = nil
+           jlock.synchronize do
+             # For each job
+             jobs.each do |u, j|
+               # If job is waiting
+               if j[:state] == 0 then
+                 # Take job
+                 job_url = u
+                 j[:state] = 1
+                 kill = false
+                 break
+               elsif j[:state] == 1 then
+                 # Some jobs are still being worked on; anticipate more jobs in the future
+                 kill = false
+               end
+             end
+           end
+           # If all jobs are done, and no job is found
+           if kill then
+             break
+           end
+           # If no job was found but some jobs are still being worked on, skip
+           if job_url == nil then
+             next
+           end
+           # Get job depth
+           job_depth = jobs[:"#{job_url}"][:depth]
+           # Get all links in the page pointed to by the job URL
+           begin
+             open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
+               # Callback
+               jlock.synchronize do
+                 yield "#{job_url}", response, job_depth
+               end
+               # If the resolved URL is in scope
+               if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
+                 # Add the resolved URL to the job queue and mark it as complete if it does not exist yet
+                 jlock.synchronize do
+                   if jobs[:"#{response.base_uri}"] == nil then
+                     yield "#{response.base_uri}", response, job_depth
+                     jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
+                   end
+                 end
+                 # Get links for the resolved URL
+                 Salamander::get_links(response.base_uri, response) do |link|
+                   # Determine if the link should be visited
+                   if visit.nil? || visit.call(link) then
+                     jlock.synchronize do
+                       # If the link is not in the job queue
+                       if jobs[:"#{link}"] == nil then
+                         # Create a job for the given link
+                         jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
+                       end
+                     end
+                   end
+                 end
+               end
+             end
+           rescue
+           end
+           # Flag job as complete
+           jlock.synchronize do
+             jobs[:"#{job_url}"][:state] = 2
+           end
+           # Perform delay
+           sleep(delay)
+         end
+       end
+       _threads << thread
+     end
+     # Wait for all threads to die
+     _threads.each do |_thread|
+       _thread.join
+     end
+   end
+
+   module_function :crawl
+
+ end
+
+ # For direct invocation of slmndr.rb
+ if __FILE__ == $0 then
+   # Record start time
+   time = Time.new
+   # Arbitrary terminal width
+   twid = 70
+   # Declare crawl count variable
+   count = 0
+   # Attempt to catch interrupt signals and unknown errors
+   begin
+     # Read arguments JSON from standard input
+     stdin = STDIN.read
+     args = nil
+     begin
+       args = JSON.parse(stdin)
+     rescue
+       puts JSON.pretty_generate({ result: "exception", message: "unable to parse json from stdin" })
+       exit
+     end
+     # Make sure the urls parameter has been supplied
+     if args['urls'] == nil then
+       puts JSON.pretty_generate({ result: "misuse", message: "'urls' parameter not specified" })
+       exit
+     else
+       # Retrieve the urls parameter
+       urls = args['urls']
+       if !urls.kind_of?(Array) then
+         puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
+         exit
+       end
+       urls.each do |url|
+         begin
+           Addressable::URI.parse(url)
+         rescue
+           puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
+           exit
+         end
+       end
+       _args = {}
+       # Attempt to retrieve the delay parameter
+       if args['delay'] != nil then
+         begin
+           _args[:delay] = args['delay'].to_f
+         rescue
+           puts JSON.pretty_generate({ result: "exception", message: "delay must be a float" })
+           exit
+         end
+       end
+       # Attempt to retrieve the threads parameter
+       if args['threads'] != nil then
+         begin
+           _args[:threads] = args['threads'].to_i
+         rescue
+           puts JSON.pretty_generate({ result: "exception", message: "threads must be an integer" })
+           exit
+         end
+       end
+       # Attempt to retrieve the agent parameter
+       if args['agent'] != nil then
+         _args[:agent] = args['agent']
+       end
+       # Begin crawl; try to catch exceptions
+       begin
+         # Print banner
+         STDERR.puts
+         STDERR.puts " Welcome to the Salamander Web Crawler Demo!"
+         STDERR.puts
+         STDERR.puts " This is a command-line demonstration of the Salamander Web Crawler in action."
+         STDERR.puts " Press Ctrl+C at any time to interrupt the crawl."
+         STDERR.puts
+         STDERR.puts " Starting crawl at the following URLs:"
+         urls.each do |url|
+           STDERR.puts " - #{url}"
+         end
+         STDERR.puts
+         STDERR.print " Depth URL"
+         first = true
+         # Do the actual crawl
+         Salamander::crawl(urls, _args) do |request, response, depth|
+           begin
+             # Increment crawl count
+             count = count + 1
+             # Truncate URL string
+             if request.length > twid - 2 then
+               _url = "#{request[0, twid - 5]}..."
+             else
+               _url = request
+             end
+             # Print crawl hit
+             if first then
+               STDERR.puts
+               first = false
+             end
+             STDERR.puts
+             STDERR.print " #{format('%02d', depth)} #{_url}"
+             STDERR.flush
+           rescue Interrupt => e
+             # Catch interrupt cleanly
+             STDERR.puts
+             STDERR.puts
+             STDERR.puts " Program terminated successfully"
+             STDERR.puts " Number of Pages Crawled: #{count}"
+             STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
+             break
+           end
+         end
+       rescue => e
+         puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", inspect: e.inspect })
+         exit
+       end
+     end
+   rescue Interrupt => e
+     # Catch interrupt cleanly
+     STDERR.puts
+     STDERR.puts
+     STDERR.puts " Program terminated successfully"
+     STDERR.puts " Number of Pages Crawled: #{count}"
+     STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
+   rescue => e
+     # Print any uncaught exceptions
+     puts JSON.pretty_generate({ result: "exception", message: "unknown error", inspect: e.inspect })
+     exit
+   end
+ end
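
For reference, here is a minimal usage sketch of the crawl API defined above. It assumes the gem is installed and loaded with require 'slmndr'; the seed URL and the visit lambda are illustrative placeholders only.

    require 'slmndr'

    seeds = ['http://example.com/']

    # Illustrative scope policy: only follow links that stay on the seed host
    visit = lambda { |link| link.host == 'example.com' }

    # Blocks until the crawl finishes; the block runs once per fetched page
    Salamander.crawl(seeds, visit: visit, delay: 1, threads: 2) do |url, response, depth|
      puts "#{depth} #{url}"
    end

When the file is run directly (ruby lib/slmndr.rb), it instead reads a JSON object such as {"urls": ["http://example.com/"], "delay": 1, "threads": 2} from standard input and prints crawl progress to standard error.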
data.tar.gz.sig ADDED
Binary file
metadata ADDED
@@ -0,0 +1,66 @@
+ --- !ruby/object:Gem::Specification
+ name: slmndr
+ version: !ruby/object:Gem::Version
+   version: 0.0.0
+ platform: ruby
+ authors:
+ - John Lawrence M. Penafiel
+ autorequire:
+ bindir: bin
+ cert_chain:
+ - |
+   -----BEGIN CERTIFICATE-----
+   MIIDfDCCAmSgAwIBAgIBATANBgkqhkiG9w0BAQUFADBCMRQwEgYDVQQDDAtwZW5h
+   ZmllbGpsbTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYD
+   Y29tMB4XDTE1MDYyOTE1MDQ1MVoXDTE2MDYyODE1MDQ1MVowQjEUMBIGA1UEAwwL
+   cGVuYWZpZWxqbG0xFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
+   ARkWA2NvbTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAPO7rVa+t3P8
+   pcv9C6k9LjbpVBQhLfHIkIrE98qGrXzzQANhTqRqrLgpCcYw44rMETCg2py7lti2
+   +g5997njU0IpPVlwLX71S3ddsvezi2KeYm+q2z0DMN5y1MsgcmoRHL8tGlbLw9FC
+   zq0KtJuIdcyrQQf9c3N/qxNbFyAJ5ZiuXq3bueOndt0zH8waOCdZKz+XZQYDJsTn
+   Kob8pLZill4ftwP1AOj2wqNG6ElR8NWJW77Il6Lxlh8fVwOxSaYAtRYyQU3C59M+
+   fEas9aXk12wEumDql2M3W0DNYiM1dXbamPSxFGu3c/BBrdJcEuQnuDAPYteaKj4c
+   2DXoicDGn+UCAwEAAaN9MHswCQYDVR0TBAIwADALBgNVHQ8EBAMCBLAwHQYDVR0O
+   BBYEFPMx9v4kXBciiOP03co6iSatkViCMCAGA1UdEQQZMBeBFXBlbmFmaWVsamxt
+   QGdtYWlsLmNvbTAgBgNVHRIEGTAXgRVwZW5hZmllbGpsbUBnbWFpbC5jb20wDQYJ
+   KoZIhvcNAQEFBQADggEBADUAbj69LIDpA6rLmCQoAKjZRaBYVweZp4G07Gf838+f
+   swa/B68PpSjoWmWeErvAahkJhTCNY7SuFinUh6/lDqeqUXIQPQu8PW3Oyfp63u1U
+   vnM0Opft5VBhaP/dBEYvN2e/F2q62ObsRtzq9hXW9k/EfgVtlmKeeAeH7k2mPeGi
+   7rxC4nb8yYf55rPGLG52BYSBmDwFIh64JiEmLJi3jEiTEMGB+dVQk++0HSuHDFi1
+   kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
+   1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
+   -----END CERTIFICATE-----
+ date: 2015-06-29 00:00:00.000000000 Z
+ dependencies: []
+ description: A minimalistic ruby web crawling framework
+ email: penafieljlm@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/slmndr.rb
+ homepage: http://rubygems.org/gems/slmndr
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.5
+ signing_key:
+ specification_version: 4
+ summary: Salamander
+ test_files: []
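
The gem metadata above corresponds, roughly, to a gemspec along the following lines. This is a reconstruction for illustration only, not the author's actual slmndr.gemspec; the certificate and signing-key paths are placeholders.

    Gem::Specification.new do |s|
      s.name        = 'slmndr'
      s.version     = '0.0.0'
      s.summary     = 'Salamander'
      s.description = 'A minimalistic ruby web crawling framework'
      s.authors     = ['John Lawrence M. Penafiel']
      s.email       = 'penafieljlm@gmail.com'
      s.files       = ['lib/slmndr.rb']
      s.homepage    = 'http://rubygems.org/gems/slmndr'
      s.license     = 'MIT'
      # The published gem is signed (see cert_chain above); these paths are placeholders
      s.cert_chain  = ['certs/penafieljlm.pem']
      s.signing_key = 'gem-private_key.pem'
    end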
metadata.gz.sig ADDED
Binary file