slmndr 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +2 -0
- data/lib/slmndr.rb +322 -0
- data.tar.gz.sig +0 -0
- metadata +66 -0
- metadata.gz.sig +1 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 20770bf44914d9179137aee64b3f86626b260d57
  data.tar.gz: dd6067dd15692a27fa5216ab67cbe0b7321eaacc
SHA512:
  metadata.gz: 2c7d1723c78474eec28e2f7f22e5b19b601fc5903fb09bec22aa80c6bd266e8260608bd0b7a4ca11e3c012a5bb84ead4c1475b8426cfed46335657c474bea190
  data.tar.gz: f12be9d14cb2f5d4f1813cb09c77fd3e207ce6268f9e8f12e4caac8ebba01ca9e8e57ba77e3614dd46a616a779d76764cc3e6f182ae5652d880e4240657f1730
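
A .gem archive is a tar file containing metadata.gz, data.tar.gz, and their detached signatures, so the digests above can be recomputed locally. A minimal verification sketch (illustrative, not part of the package; it assumes slmndr-0.0.0.gem has already been unpacked into the current directory):

    require 'digest'

    # Recompute the digests pinned by checksums.yaml and compare them by eye.
    %w[metadata.gz data.tar.gz].each do |file|
      puts "#{file} SHA1:   #{Digest::SHA1.file(file).hexdigest}"
      puts "#{file} SHA512: #{Digest::SHA512.file(file).hexdigest}"
    end
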
checksums.yaml.gz.sig
ADDED
data/lib/slmndr.rb
ADDED
@@ -0,0 +1,322 @@
## Salamander: A minimalistic Ruby web crawling framework.
## Authored by: John Lawrence M. Penafiel

require 'time'
require 'thread'
require 'set'
require 'open-uri'
require 'openssl'

require 'json'
require 'open_uri_redirections'
require 'nokogiri'
require 'addressable/uri'

## Module
##   Salamander
## Description
##   The Crawler module provides an easy way for the other components of the Salamander system to perform crawling.
## Functions
##   Salamander::crawl
module Salamander

  ## Function
  ##   get_links
  ## Description
  ##   Extracts outgoing links from the HTML pointed to by the given URL string.
  ## Parameters
  ##   url  - The URL of the HTML page the function is extracting links from.
  ##   html - The HTML data to extract links from.
  def self.get_links(url, html)
    # Parse the base URL
    uri = Addressable::URI.parse(url)
    # Parse the document as HTML
    _html = Nokogiri::HTML(html)
    # Walk all anchors
    _html.xpath('//a').each do |l|
      # Extract the hyperlink; skip anchors that have none
      href = l['href']
      next if href.nil?
      # Convert the hyperlink to a URI object
      link = Addressable::URI.parse(href)
      # Skip hyperlinks that are neither HTTP nor HTTPS
      next if link.scheme != nil && link.scheme != 'http' && link.scheme != 'https'
      # Convert the hyperlink to absolute form using the base URL
      link.host = uri.host if link.host.nil?
      link.scheme = uri.scheme if link.scheme.nil?
      link.port = uri.port if link.port.nil?
      # Remove the link fragment; it does not name a distinct page
      link.fragment = nil
      # Yield the normalized link
      yield link
    end
  end

  ## Function
  ##   crawl
  ## Description
  ##   Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
  ##   The function blocks until all threads terminate.
  ## Parameters
  ##   urls - Required. A list of strings containing the seed URLs.
  ##   args - Optional. Default: {}. A hash containing optional arguments for the function.
  ##     visit   - Optional. Default: nil. A lambda which accepts a URL and returns a boolean telling the crawler whether the URL should be visited.
  ##     delay   - Optional. Default: 1. A non-negative float; the number of seconds between requests in one thread.
  ##     threads - Optional. Default: 1. A positive integer; the number of allowed simultaneous requests to the target web asset.
  ##     agent   - Optional. Default: "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)". The user-agent string to be used.
  def crawl(urls, args = {})
    # Get arguments, falling back to defaults
    visit = args[:visit]
    delay = args[:delay].nil? ? 1 : args[:delay]
    raise "delay must be a non-negative float" if delay < 0
    threads = args[:threads].nil? ? 1 : args[:threads]
    raise "threads must be a positive integer" if threads < 1
    agent = args[:agent].nil? ? "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)" : args[:agent]
    # Create the threads list
    _threads = []
    # Create the jobs map and its lock
    jobs = {}
    jlock = Mutex.new
    # Create one job per seed URL; job states: 0 = waiting, 1 = working, 2 = done
    urls.each do |url|
      jobs[:"#{url}"] = { state: 0, depth: 0 }
    end
    # Create and launch the crawl threads
    threads.times do
      thread = Thread.new do
        while true
          # Find a job to do
          kill = true
          job_url = nil
          jlock.synchronize do
            jobs.each do |u, j|
              if j[:state] == 0
                # Take the waiting job
                job_url = u
                j[:state] = 1
                kill = false
                break
              elsif j[:state] == 1
                # Some jobs are still being worked; anticipate more jobs in the future
                kill = false
              end
            end
          end
          # If all jobs are done and none is waiting, let the thread die
          break if kill
          # If no job was found but some are still being worked, try again
          next if job_url.nil?
          # Get the job's depth
          job_depth = jobs[:"#{job_url}"][:depth]
          # Fetch the page pointed to by the job URL and extract its links
          begin
            open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
              # Report the hit to the caller
              jlock.synchronize do
                yield "#{job_url}", response, job_depth
              end
              # Only follow pages whose resolved URL stays on the same host
              if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host
                # If redirects resolved to a new URL, report it and record it as already done
                jlock.synchronize do
                  if jobs[:"#{response.base_uri}"].nil?
                    yield "#{response.base_uri}", response, job_depth
                    jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
                  end
                end
                # Get the links on the resolved URL's page
                Salamander::get_links(response.base_uri, response) do |link|
                  # Ask the caller whether the link should be visited
                  if visit.nil? || visit.call(link)
                    jlock.synchronize do
                      # Create a job for the link if it is not queued yet
                      if jobs[:"#{link}"].nil?
                        jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
                      end
                    end
                  end
                end
              end
            end
          rescue
            # Ignore fetch and parse errors; the job is still flagged complete below
          end
          # Flag the job as complete
          jlock.synchronize do
            jobs[:"#{job_url}"][:state] = 2
          end
          # Honor the politeness delay
          sleep(delay)
        end
      end
      _threads << thread
    end
    # Wait for all threads to die
    _threads.each do |_thread|
      _thread.join
    end
  end

  module_function :crawl

end

# For direct invocation of slmndr.rb
if __FILE__ == $0
  # Record the start time
  time = Time.new
  # Arbitrary terminal width
  twid = 70
  # Crawl counter
  count = 0
  # Attempt to catch interrupt signals and unknown errors
  begin
    # Read the arguments JSON from standard input
    stdin = STDIN.read
    args = nil
    begin
      args = JSON.parse(stdin)
    rescue
      puts JSON.pretty_generate({ result: "exception", message: "unable to parse json from stdin" })
      exit
    end
    # Make sure the urls parameter has been supplied
    if args['urls'].nil?
      puts JSON.pretty_generate({ result: "misuse", message: "'urls' parameter not specified" })
      exit
    else
      # Retrieve and validate the urls parameter
      urls = args['urls']
      unless urls.kind_of?(Array)
        puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
        exit
      end
      urls.each do |url|
        begin
          Addressable::URI.parse(url)
        rescue
          puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
          exit
        end
      end
      _args = {}
      # Attempt to retrieve the delay parameter
      if args['delay'] != nil
        begin
          _args[:delay] = args['delay'].to_f
        rescue
          puts JSON.pretty_generate({ result: "exception", message: "delay must be a float" })
          exit
        end
      end
      # Attempt to retrieve the threads parameter
      if args['threads'] != nil
        begin
          _args[:threads] = args['threads'].to_i
        rescue
          puts JSON.pretty_generate({ result: "exception", message: "threads must be an integer" })
          exit
        end
      end
      # Attempt to retrieve the agent parameter
      if args['agent'] != nil
        _args[:agent] = args['agent']
      end
      # Begin the crawl; try to catch exceptions
      begin
        # Print the banner
        STDERR.puts
        STDERR.puts " Welcome to the Salamander Web Crawler Demo!"
        STDERR.puts
        STDERR.puts " This is a command-line demonstration of the Salamander Web Crawler in action."
        STDERR.puts " Press Ctrl+C at any time to interrupt the crawl."
        STDERR.puts
        STDERR.puts " Starting crawl at the following URLs:"
        urls.each do |url|
          STDERR.puts " - #{url}"
        end
        STDERR.puts
        STDERR.print " Depth URL"
        first = true
        # Do the actual crawl
        Salamander::crawl(urls, _args) do |request, response, depth|
          begin
            # Increment the crawl count
            count += 1
            # Truncate the URL string to the terminal width
            if request.length > twid - 2
              _url = "#{request[0, twid - 5]}..."
            else
              _url = request
            end
            # Print the crawl hit
            if first
              STDERR.puts
              first = false
            end
            STDERR.puts
            STDERR.print " #{format('%02d', depth)} #{_url}"
            STDERR.flush
          rescue Interrupt => e
            # Catch the interrupt cleanly
            STDERR.puts
            STDERR.puts
            STDERR.puts " Program terminated successfully"
            STDERR.puts " Number of Pages Crawled: #{count}"
            STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
            break
          end
        end
      rescue => e
        puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", inspect: e.inspect })
        exit
      end
    end
  rescue Interrupt => e
    # Catch the interrupt cleanly
    STDERR.puts
    STDERR.puts
    STDERR.puts " Program terminated successfully"
    STDERR.puts " Number of Pages Crawled: #{count}"
    STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
  rescue => e
    # Print any uncaught exceptions
    puts JSON.pretty_generate({ result: "exception", message: "unknown error", inspect: e.inspect })
    exit
  end
end
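
The crawl above maintains a shared job map guarded by a mutex: each worker repeatedly takes a waiting job, fetches it through open-uri, reports the hit to the caller's block, and queues every same-host link that get_links yields. A minimal usage sketch (illustrative, not part of the package; example.com and the option values are placeholders):

    require 'slmndr'

    # Stay on one host; the lambda vetoes every other link.
    visit = lambda { |link| link.host == 'example.com' }

    # Blocks until the whole site has been visited or the queue drains.
    Salamander.crawl(['http://example.com/'], visit: visit, delay: 1, threads: 2) do |url, response, depth|
      # Called once per fetched page, under the crawler's lock.
      puts "#{depth} #{url}"
    end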
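
Run directly rather than required, the `if __FILE__ == $0` block at the end of the file turns it into a demo that reads a single JSON object from standard input, with keys matching the optional arguments of crawl, instead of taking command-line flags. A hypothetical invocation (the URL is a placeholder):

    echo '{"urls": ["http://example.com/"], "delay": 1.0, "threads": 2}' | ruby lib/slmndr.rb
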
data.tar.gz.sig
ADDED
Binary file
metadata
ADDED
@@ -0,0 +1,66 @@
--- !ruby/object:Gem::Specification
name: slmndr
version: !ruby/object:Gem::Version
  version: 0.0.0
platform: ruby
authors:
- John Lawrence M. Penafiel
autorequire:
bindir: bin
cert_chain:
- |
  -----BEGIN CERTIFICATE-----
  MIIDfDCCAmSgAwIBAgIBATANBgkqhkiG9w0BAQUFADBCMRQwEgYDVQQDDAtwZW5h
  ZmllbGpsbTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYD
  Y29tMB4XDTE1MDYyOTE1MDQ1MVoXDTE2MDYyODE1MDQ1MVowQjEUMBIGA1UEAwwL
  cGVuYWZpZWxqbG0xFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
  ARkWA2NvbTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAPO7rVa+t3P8
  pcv9C6k9LjbpVBQhLfHIkIrE98qGrXzzQANhTqRqrLgpCcYw44rMETCg2py7lti2
  +g5997njU0IpPVlwLX71S3ddsvezi2KeYm+q2z0DMN5y1MsgcmoRHL8tGlbLw9FC
  zq0KtJuIdcyrQQf9c3N/qxNbFyAJ5ZiuXq3bueOndt0zH8waOCdZKz+XZQYDJsTn
  Kob8pLZill4ftwP1AOj2wqNG6ElR8NWJW77Il6Lxlh8fVwOxSaYAtRYyQU3C59M+
  fEas9aXk12wEumDql2M3W0DNYiM1dXbamPSxFGu3c/BBrdJcEuQnuDAPYteaKj4c
  2DXoicDGn+UCAwEAAaN9MHswCQYDVR0TBAIwADALBgNVHQ8EBAMCBLAwHQYDVR0O
  BBYEFPMx9v4kXBciiOP03co6iSatkViCMCAGA1UdEQQZMBeBFXBlbmFmaWVsamxt
  QGdtYWlsLmNvbTAgBgNVHRIEGTAXgRVwZW5hZmllbGpsbUBnbWFpbC5jb20wDQYJ
  KoZIhvcNAQEFBQADggEBADUAbj69LIDpA6rLmCQoAKjZRaBYVweZp4G07Gf838+f
  swa/B68PpSjoWmWeErvAahkJhTCNY7SuFinUh6/lDqeqUXIQPQu8PW3Oyfp63u1U
  vnM0Opft5VBhaP/dBEYvN2e/F2q62ObsRtzq9hXW9k/EfgVtlmKeeAeH7k2mPeGi
  7rxC4nb8yYf55rPGLG52BYSBmDwFIh64JiEmLJi3jEiTEMGB+dVQk++0HSuHDFi1
  kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
  1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
  -----END CERTIFICATE-----
date: 2015-06-29 00:00:00.000000000 Z
dependencies: []
description: A minimalistic ruby web crawling framework
email: penafieljlm@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/slmndr.rb
homepage: http://rubygems.org/gems/slmndr
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.5
signing_key:
specification_version: 4
summary: Salamander
test_files: []
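
Note that the gemspec declares dependencies: [] even though lib/slmndr.rb requires nokogiri, addressable, and open_uri_redirections, so installing slmndr will not pull them in. A consumer has to supply them explicitly, for example with a Gemfile like this sketch (illustrative, not part of the package):

    source 'https://rubygems.org'

    gem 'slmndr'
    # Undeclared runtime dependencies of lib/slmndr.rb, listed by hand.
    gem 'nokogiri'
    gem 'addressable'
    gem 'open_uri_redirections'
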
metadata.gz.sig
ADDED
Binary file