slmndr 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6)
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +2 -0
  3. data/lib/slmndr.rb +322 -0
  4. data.tar.gz.sig +0 -0
  5. metadata +66 -0
  6. metadata.gz.sig +1 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 20770bf44914d9179137aee64b3f86626b260d57
+   data.tar.gz: dd6067dd15692a27fa5216ab67cbe0b7321eaacc
+ SHA512:
+   metadata.gz: 2c7d1723c78474eec28e2f7f22e5b19b601fc5903fb09bec22aa80c6bd266e8260608bd0b7a4ca11e3c012a5bb84ead4c1475b8426cfed46335657c474bea190
+   data.tar.gz: f12be9d14cb2f5d4f1813cb09c77fd3e207ce6268f9e8f12e4caac8ebba01ca9e8e57ba77e3614dd46a616a779d76764cc3e6f182ae5652d880e4240657f1730
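
These checksums are the SHA1 and SHA512 digests of the two archives bundled inside the .gem file (metadata.gz and data.tar.gz). A minimal verification sketch, assuming a locally downloaded copy named slmndr-0.0.0.gem (the filename is a stand-in for wherever the gem was fetched):

    require 'rubygems/package'
    require 'digest'

    # A .gem file is a plain tar archive; walk its members and recompute the
    # SHA512 digest of each file listed in checksums.yaml.
    tar = Gem::Package::TarReader.new(File.open('slmndr-0.0.0.gem', 'rb'))
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
    end
    tar.close

The printed values should match the SHA512 entries above if the download is intact.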
checksums.yaml.gz.sig ADDED
Binary file
data/lib/slmndr.rb ADDED
@@ -0,0 +1,322 @@
+ ## Salamander: A minimalistic ruby web crawling framework.
+ ## Authored by: John Lawrence M. Penafiel
+
+ require 'time'
+ require 'thread'
+ require 'set'
+ require 'open-uri'
+ require 'openssl'
+
+ require 'json'
+ require 'open_uri_redirections'
+ require 'nokogiri'
+ require 'addressable/uri'
+
+ ## Module
+ ##   Salamander
+ ## Description
+ ##   The Crawler module provides an easy way for the other components of the Salamander system to perform crawling.
+ ## Functions
+ ##   Salamander::crawl
+ module Salamander
+
+   ## Function
+   ##   get_links
+   ## Description
+   ##   Extracts outgoing links from the HTML pointed to by the given URL string.
+   ## Parameters
+   ##   url - The URL of the HTML page the function is extracting links from.
+   ##   html - The HTML data to extract links from.
+   def self.get_links(url, html)
+     # Initialize
+     uri = Addressable::URI.parse(url)
+     # Parse as HTML
+     _html = Nokogiri::HTML(html)
+     # Get all anchors
+     _html.xpath('//a').each do |l|
+       # Extract hyper link
+       href = l['href']
+       # Skip if hyper link does not exist
+       if href == nil then
+         next
+       end
+       # Convert hyper link to URI object
+       link = Addressable::URI.parse(href)
+       # Skip if hyper link is not HTTP
+       if link.scheme != nil && link.scheme != 'http' && link.scheme != 'https' then
+         next
+       end
+       # Convert hyper link to absolute form
+       if link.host == nil then
+         link.host = uri.host
+       end
+       if link.scheme == nil then
+         link.scheme = uri.scheme
+       end
+       if link.port == nil then
+         link.port = uri.port
+       end
+       # Remove link fragment
+       link.fragment = nil
+       # Yield
+       yield link
+     end
+   end
+
+   ## Function
+   ##   crawl
+   ## Description
+   ##   Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
+   ##   Function blocks until all threads terminate.
+   ## Parameters
+   ##   urls - Required. A list of strings containing the seed URLs.
+   ##   args - Optional. Default: {}. A hash containing optional arguments for the function.
+   ##     visit - Optional. Default: nil. A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
+   ##     delay - Optional. Default: 1. A positive float indicating the number of seconds between requests in one thread.
+   ##     threads - Optional. Default: 1. A positive integer indicating the number of allowed simultaneous requests to the target web asset.
+   ##     agent - Optional. Default: "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)". The user-agent string to be used.
+   def crawl(urls, args = {})
+     # Get arguments
+     visit = nil
+     if args[:visit] != nil then
+       visit = args[:visit]
+     end
+     delay = 1
+     if args[:delay] != nil then
+       delay = args[:delay]
+     end
+     if delay < 0 then
+       raise "delay must be a positive float"
+     end
+     threads = 1
+     if args[:threads] != nil then
+       threads = args[:threads]
+     end
+     if threads < 0 then
+       raise "threads must be a positive integer"
+     end
+     agent = "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)"
+     if args[:agent] != nil then
+       agent = args[:agent]
+     end
+     # Create threads list
+     _threads = []
+     # Create jobs map and lock
+     jobs = {}
+     jlock = Mutex.new
+     # Create job; Job States: 0: waiting, 1: working, 2: done
+     urls.each do |url|
+       jobs[:"#{url}"] = { state: 0, depth: 0 }
+     end
+     # Create and launch crawl threads
+     for id in 1..threads
+       # Create crawl thread
+       thread = Thread.new do
+         # Loop
+         while true
+           # Find job to do
+           kill = true
+           job_url = nil
+           jlock.synchronize do
+             # For each job
+             jobs.each do |u, j|
+               # If job is waiting
+               if j[:state] == 0 then
+                 # Take job
+                 job_url = u
+                 j[:state] = 1
+                 kill = false
+                 break
+               elsif j[:state] == 1 then
+                 # Some jobs are still working; anticipate more jobs in the future
+                 kill = false
+               end
+             end
+           end
+           # If all jobs are done, and no job is found
+           if kill then
+             break
+           end
+           # If no job found but some jobs are still being worked on, skip
+           if job_url == nil then
+             next
+           end
+           # Get job depth
+           job_depth = jobs[:"#{job_url}"][:depth]
+           # Get all links in page pointed to by job URL
+           begin
+             open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
+               # Callback
+               jlock.synchronize do
+                 yield "#{job_url}", response, job_depth
+               end
+               # If resolved URL is in scope
+               if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
+                 # Add resolved URL to job queue and mark it as complete if it does not exist yet
+                 jlock.synchronize do
+                   if jobs[:"#{response.base_uri}"] == nil then
+                     yield "#{response.base_uri}", response, job_depth
+                     jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
+                   end
+                 end
+                 # Get links for resolved URL
+                 Salamander::get_links(response.base_uri, response) do |link|
+                   # Determine if the link should be visited
+                   if visit.nil? || visit.call(link) then
+                     jlock.synchronize do
+                       # If link is not in job queue
+                       if jobs[:"#{link}"] == nil then
+                         # Create job for the given link
+                         jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
+                       end
+                     end
+                   end
+                 end
+               end
+             end
+           rescue
+           end
+           # Flag job as complete
+           jlock.synchronize do
+             jobs[:"#{job_url}"][:state] = 2
+           end
+           # Perform delay
+           sleep(delay)
+         end
+       end
+       _threads << thread
+     end
+     # Wait for all threads to die
+     _threads.each do |_thread|
+       _thread.join
+     end
+   end
+
+   module_function :crawl
+
+ end
+
+ # For direct invocation of crawler.rb
+ if __FILE__ == $0 then
+   # Record start time
+   time = Time.new
+   # Arbitrary terminal width
+   twid = 70
+   # Declare crawl count variable
+   count = 0
+   # Attempt to catch interrupt signals and unknown errors
+   begin
+     # Read arguments JSON from standard input
+     stdin = STDIN.read
+     args = nil
+     begin
+       args = JSON.parse(stdin)
+     rescue
+       puts JSON.pretty_generate({ result: "exception", message: "unable to parse json from stdin" })
+       exit
+     end
+     # Make sure the urls parameter has been supplied
+     if args['urls'] == nil then
+       puts JSON.pretty_generate({ result: "misuse", message: "'urls' parameter not specified" })
+       exit
+     else
+       # Retrieve the url parameter
+       urls = args['urls']
+       if !urls.kind_of?(Array) then
+         puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
+         exit
+       end
+       urls.each do |url|
+         begin
+           Addressable::URI.parse(url)
+         rescue
+           puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
+           exit
+         end
+       end
+       _args = {}
+       # Attempt to retrieve the delay parameter
+       if args['delay'] != nil then
+         begin
+           _args[:delay] = args['delay'].to_f
+         rescue
+           puts JSON.pretty_generate({ result: "exception", message: "delay must be a float" })
+           exit
+         end
+       end
+       # Attempt to retrieve the threads parameter
+       if args['threads'] != nil then
+         begin
+           _args[:threads] = args['threads'].to_i
+         rescue
+           puts JSON.pretty_generate({ result: "exception", message: "threads must be an integer" })
+           exit
+         end
+       end
+       # Attempt to retrieve the agent parameter
+       if args['agent'] != nil then
+         _args[:agent] = args['agent']
+       end
+       # Begin crawl; try to catch exceptions
+       begin
+         # Print banner
+         STDERR.puts
+         STDERR.puts " Welcome to the Salamander Web Crawler Demo!"
+         STDERR.puts
+         STDERR.puts " This is a command-line demonstration of the Salamander Web Crawler in action."
+         STDERR.puts " Press Ctrl+C at any time to interrupt the crawl."
+         STDERR.puts
+         STDERR.puts " Starting crawl at the following URLs:"
+         urls.each do |url|
+           STDERR.puts " - #{url}"
+         end
+         STDERR.puts
+         STDERR.print " Depth URL"
+         first = true
+         # Do actual crawl
+         Salamander::crawl(urls, _args) do |request, response, depth|
+           begin
+             # Increment crawl count
+             count = count + 1
+             # Truncate URL string
+             if request.length > twid - 2 then
+               _url = "#{request[0, twid - 5]}..."
+             else
+               _url = request
+             end
+             # Print crawl hit
+             if first then
+               STDERR.puts
+               first = false
+             end
+             STDERR.puts
+             STDERR.print " #{format('%02d', depth)} #{_url}"
+             STDERR.flush
+           rescue Interrupt => e
+             # Catch interrupt cleanly
+             STDERR.puts
+             STDERR.puts
+             STDERR.puts " Program terminated successfully"
+             STDERR.puts " Number of Pages Crawled: #{count}"
+             STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
+             break
+           end
+         end
+       rescue => e
+         puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", inspect: e.inspect })
+         exit
+       end
+     end
+   rescue Interrupt => e
+     # Catch interrupt cleanly
+     STDERR.puts
+     STDERR.puts
+     STDERR.puts " Program terminated successfully"
+     STDERR.puts " Number of Pages Crawled: #{count}"
+     STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
+   rescue => e
+     # Print any uncaught exceptions
+     puts JSON.pretty_generate({ result: "exception", message: "unknown error", inspect: e.inspect })
+     exit
+   end
+ end
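
The crawl entry point above accepts a list of seed URLs plus an options hash and yields each fetched page (URL string, open-uri response, and depth) to the caller's block. A minimal usage sketch, assuming the gem and its dependencies are installed and using https://example.com purely as a stand-in target (the in_scope lambda is likewise illustrative):

    require 'slmndr'

    # Only follow links that stay on the example.com host; the lambda
    # receives each discovered link as an Addressable::URI.
    in_scope = lambda { |link| link.host == 'example.com' }

    Salamander::crawl(['https://example.com/'], { visit: in_scope, delay: 1, threads: 2 }) do |url, response, depth|
      # url      - the URL string that was requested
      # response - the open-uri response object for the fetched page
      # depth    - number of link hops from the seed URLs
      puts "#{depth} #{url}"
    end

When the file is run directly instead, it reads a JSON object from standard input (a required "urls" array plus optional "delay", "threads", and "agent" keys) and reports progress on standard error, as the demo block above implements.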
data.tar.gz.sig ADDED
Binary file
metadata ADDED
@@ -0,0 +1,66 @@
+ --- !ruby/object:Gem::Specification
+ name: slmndr
+ version: !ruby/object:Gem::Version
+   version: 0.0.0
+ platform: ruby
+ authors:
+ - John Lawrence M. Penafiel
+ autorequire:
+ bindir: bin
+ cert_chain:
+ - |
+   -----BEGIN CERTIFICATE-----
+   MIIDfDCCAmSgAwIBAgIBATANBgkqhkiG9w0BAQUFADBCMRQwEgYDVQQDDAtwZW5h
+   ZmllbGpsbTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYD
+   Y29tMB4XDTE1MDYyOTE1MDQ1MVoXDTE2MDYyODE1MDQ1MVowQjEUMBIGA1UEAwwL
+   cGVuYWZpZWxqbG0xFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
+   ARkWA2NvbTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAPO7rVa+t3P8
+   pcv9C6k9LjbpVBQhLfHIkIrE98qGrXzzQANhTqRqrLgpCcYw44rMETCg2py7lti2
+   +g5997njU0IpPVlwLX71S3ddsvezi2KeYm+q2z0DMN5y1MsgcmoRHL8tGlbLw9FC
+   zq0KtJuIdcyrQQf9c3N/qxNbFyAJ5ZiuXq3bueOndt0zH8waOCdZKz+XZQYDJsTn
+   Kob8pLZill4ftwP1AOj2wqNG6ElR8NWJW77Il6Lxlh8fVwOxSaYAtRYyQU3C59M+
+   fEas9aXk12wEumDql2M3W0DNYiM1dXbamPSxFGu3c/BBrdJcEuQnuDAPYteaKj4c
+   2DXoicDGn+UCAwEAAaN9MHswCQYDVR0TBAIwADALBgNVHQ8EBAMCBLAwHQYDVR0O
+   BBYEFPMx9v4kXBciiOP03co6iSatkViCMCAGA1UdEQQZMBeBFXBlbmFmaWVsamxt
+   QGdtYWlsLmNvbTAgBgNVHRIEGTAXgRVwZW5hZmllbGpsbUBnbWFpbC5jb20wDQYJ
+   KoZIhvcNAQEFBQADggEBADUAbj69LIDpA6rLmCQoAKjZRaBYVweZp4G07Gf838+f
+   swa/B68PpSjoWmWeErvAahkJhTCNY7SuFinUh6/lDqeqUXIQPQu8PW3Oyfp63u1U
+   vnM0Opft5VBhaP/dBEYvN2e/F2q62ObsRtzq9hXW9k/EfgVtlmKeeAeH7k2mPeGi
+   7rxC4nb8yYf55rPGLG52BYSBmDwFIh64JiEmLJi3jEiTEMGB+dVQk++0HSuHDFi1
+   kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
+   1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
+   -----END CERTIFICATE-----
+ date: 2015-06-29 00:00:00.000000000 Z
+ dependencies: []
+ description: A minimalistic ruby web crawling framework
+ email: penafieljlm@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/slmndr.rb
+ homepage: http://rubygems.org/gems/slmndr
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.5
+ signing_key:
+ specification_version: 4
+ summary: Salamander
+ test_files: []
metadata.gz.sig ADDED
Binary file