slmndr 0.0.0
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +2 -0
- data/lib/slmndr.rb +322 -0
- data.tar.gz.sig +0 -0
- metadata +66 -0
- metadata.gz.sig +1 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 20770bf44914d9179137aee64b3f86626b260d57
  data.tar.gz: dd6067dd15692a27fa5216ab67cbe0b7321eaacc
SHA512:
  metadata.gz: 2c7d1723c78474eec28e2f7f22e5b19b601fc5903fb09bec22aa80c6bd266e8260608bd0b7a4ca11e3c012a5bb84ead4c1475b8426cfed46335657c474bea190
  data.tar.gz: f12be9d14cb2f5d4f1813cb09c77fd3e207ce6268f9e8f12e4caac8ebba01ca9e8e57ba77e3614dd46a616a779d76764cc3e6f182ae5652d880e4240657f1730
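A .gem file is a plain tar archive whose members include metadata.gz and data.tar.gz, so the digests above can be re-checked by hand. A minimal verification sketch in Ruby, assuming the package was unpacked into the current directory with tar -xf slmndr-0.0.0.gem (the script itself is illustrative, not part of the gem):

# Illustrative check of the SHA512 digests listed in checksums.yaml
require 'digest'
require 'yaml'

sums = YAML.load_file('checksums.yaml')
%w[metadata.gz data.tar.gz].each do |name|
  actual = Digest::SHA512.file(name).hexdigest
  puts "#{name}: #{actual == sums['SHA512'][name] ? 'OK' : 'MISMATCH'}"
end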
checksums.yaml.gz.sig
ADDED
data/lib/slmndr.rb
ADDED
@@ -0,0 +1,322 @@
## Salamander: A minimalistic ruby web crawling framework.
## Authored by: John Lawrence M. Penafiel

require 'time'
require 'thread'
require 'set'
require 'open-uri'
require 'openssl'

require 'json'
require 'open_uri_redirections'
require 'nokogiri'
require 'addressable/uri'

## Module
##   Salamander
## Description
##   The Crawler module provides an easy way for the other components of the Salamander system to perform crawling.
## Functions
##   Salamander::crawl
module Salamander

  ## Function
  ##   get_links
  ## Description
  ##   Extracts outgoing links from the HTML pointed to by the given URL string.
  ## Parameters
  ##   url - The URL of the HTML page the function is extracting links from.
  ##   html - The HTML data to extract links from.
  def self.get_links(url, html)
    # Initialize
    uri = Addressable::URI.parse(url)
    # Parse as HTML
    _html = Nokogiri::HTML(html)
    # Get all anchors
    _html.xpath('//a').each do |l|
      # Extract hyperlink
      href = l['href']
      # Skip if hyperlink does not exist
      if href == nil then
        next
      end
      # Convert hyperlink to URI object
      link = Addressable::URI.parse(href)
      # Skip if hyperlink is not HTTP(S)
      if link.scheme != nil && link.scheme != 'http' && link.scheme != 'https' then
        next
      end
      # Convert hyperlink to absolute form
      if link.host == nil then
        link.host = uri.host
      end
      if link.scheme == nil then
        link.scheme = uri.scheme
      end
      if link.port == nil then
        link.port = uri.port
      end
      # Remove link fragment
      link.fragment = nil
      # Yield
      yield link
    end
  end

  ## Function
  ##   crawl
  ## Description
  ##   Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
  ##   Function blocks until all threads terminate.
  ## Parameters
  ##   urls - Required. A list of strings containing the seed URLs.
  ##   args - Optional. Default: {}. A hash containing optional arguments for the function.
  ##     visit - Optional. Default: nil. A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
  ##     delay - Optional. Default: 1. A non-negative float indicating the number of seconds between requests in one thread.
  ##     threads - Optional. Default: 1. A positive integer indicating the number of allowed simultaneous requests to the target web asset.
  ##     agent - Optional. Default: "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)". The user-agent string to be used.
  def crawl(urls, args = {})
    # Get arguments
    visit = nil
    if args[:visit] != nil then
      visit = args[:visit]
    end
    delay = 1
    if args[:delay] != nil then
      delay = args[:delay]
    end
    if delay < 0 then
      raise "delay must be a non-negative float"
    end
    threads = 1
    if args[:threads] != nil then
      threads = args[:threads]
    end
    if threads < 1 then
      raise "threads must be a positive integer"
    end
    agent = "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    if args[:agent] != nil then
      agent = args[:agent]
    end
    # Create threads list
    _threads = []
    # Create jobs map and lock
    jobs = {}
    jlock = Mutex.new
    # Create one job per seed URL; Job States: 0: waiting, 1: working, 2: done
    urls.each do |url|
      jobs[:"#{url}"] = { state: 0, depth: 0 }
    end
    # Create and launch crawl threads
    for id in 1..threads
      # Create crawl thread
      thread = Thread.new do
        # Loop
        while true
          # Find job to do
          kill = true
          job_url = nil
          jlock.synchronize do
            # For each job
            jobs.each do |u, j|
              # If job is waiting
              if j[:state] == 0 then
                # Take job
                job_url = u
                j[:state] = 1
                kill = false
                break
              elsif j[:state] == 1 then
                # Some jobs are still working; anticipate more jobs in the future
                kill = false
              end
            end
          end
          # If all jobs are done, and no job is found
          if kill then
            break
          end
          # If no job found but some jobs are still being worked on, skip
          if job_url == nil then
            next
          end
          # Get job depth
          job_depth = jobs[:"#{job_url}"][:depth]
          # Get all links in page pointed to by job URL
          begin
            open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
              # Callback
              jlock.synchronize do
                yield "#{job_url}", response, job_depth
              end
              # If resolved URL is in scope
              if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
                # Add resolved URL to job queue and mark it as complete if it does not exist yet
                jlock.synchronize do
                  if jobs[:"#{response.base_uri}"] == nil then
                    yield "#{response.base_uri}", response, job_depth
                    jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
                  end
                end
                # Get links for resolved URL
                Salamander::get_links(response.base_uri, response) do |link|
                  # Determine if the link should be visited
                  if visit.nil? || visit.call(link) then
                    jlock.synchronize do
                      # If link is not in job queue
                      if jobs[:"#{link}"] == nil then
                        # Create job for the given link
                        jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
                      end
                    end
                  end
                end
              end
            end
          rescue
            # Ignore fetch and parse errors; the job is still flagged as done below
          end
          # Flag job as complete
          jlock.synchronize do
            jobs[:"#{job_url}"][:state] = 2
          end
          # Perform delay
          sleep(delay)
        end
      end
      _threads << thread
    end
    # Wait for all threads to die
    _threads.each do |_thread|
      _thread.join
    end
  end

  module_function :crawl

end
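Before the command-line demo below, a minimal usage sketch of the module defined above; the seed URL, option values, and visit lambda are illustrative. Note that visit receives an Addressable::URI, while the block receives the URL as a string:

# Crawl example.com with two worker threads, staying on the seed host.
# The block receives each fetched URL, its open-uri response, and its crawl depth.
require 'slmndr'

Salamander.crawl(
  ['http://example.com/'],
  threads: 2,
  delay: 1,
  visit: lambda { |link| link.host == 'example.com' }
) do |url, response, depth|
  puts "#{depth} #{url}"
end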

# For direct invocation of slmndr.rb
if __FILE__ == $0 then
  # Record start time
  time = Time.new
  # Arbitrary terminal width
  twid = 70
  # Declare crawl count variable
  count = 0
  # Attempt to catch interrupt signals and unknown errors
  begin
    # Read arguments JSON from standard input
    stdin = STDIN.read
    args = nil
    begin
      args = JSON.parse(stdin)
    rescue
      puts JSON.pretty_generate({ result: "exception", message: "unable to parse json from stdin" })
      exit
    end
    # Make sure the urls parameter has been supplied
    if args['urls'] == nil then
      puts JSON.pretty_generate({ result: "misuse", message: "'urls' parameter not specified" })
      exit
    else
      # Retrieve the urls parameter
      urls = args['urls']
      if !urls.kind_of?(Array) then
        puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
        exit
      end
      urls.each do |url|
        begin
          Addressable::URI.parse(url)
        rescue
          puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
          exit
        end
      end
      _args = {}
      # Attempt to retrieve the delay parameter
      if args['delay'] != nil then
        begin
          _args[:delay] = args['delay'].to_f
        rescue
          puts JSON.pretty_generate({ result: "exception", message: "delay must be a float" })
          exit
        end
      end
      # Attempt to retrieve the threads parameter
      if args['threads'] != nil then
        begin
          _args[:threads] = args['threads'].to_i
        rescue
          puts JSON.pretty_generate({ result: "exception", message: "threads must be an integer" })
          exit
        end
      end
      # Attempt to retrieve the agent parameter
      if args['agent'] != nil then
        _args[:agent] = args['agent']
      end
      # Begin crawl; try to catch exceptions
      begin
        # Print banner
        STDERR.puts
        STDERR.puts " Welcome to the Salamander Web Crawler Demo!"
        STDERR.puts
        STDERR.puts " This is a command-line demonstration of the Salamander Web Crawler in action."
        STDERR.puts " Press Ctrl+C at any time to interrupt the crawl."
        STDERR.puts
        STDERR.puts " Starting crawl at the following URLs:"
        urls.each do |url|
          STDERR.puts " - #{url}"
        end
        STDERR.puts
        STDERR.print " Depth URL"
        first = true
        # Do actual crawl
        Salamander::crawl(urls, _args) do |request, response, depth|
          begin
            # Increment crawl count
            count = count + 1
            # Truncate URL string
            if request.length > twid - 2 then
              _url = "#{request[0, twid - 5]}..."
            else
              _url = request
            end
            # Print crawl hit
            if first then
              STDERR.puts
              first = false
            end
            STDERR.puts
            STDERR.print " #{format('%02d', depth)} #{_url}"
            STDERR.flush
          rescue Interrupt => e
            # Catch interrupt cleanly
            STDERR.puts
            STDERR.puts
            STDERR.puts " Program terminated successfully"
            STDERR.puts " Number of Pages Crawled: #{count}"
            STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
            break
          end
        end
      rescue => e
        puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", inspect: e.inspect })
        exit
      end
    end
  rescue Interrupt => e
    # Catch interrupt cleanly
    STDERR.puts
    STDERR.puts
    STDERR.puts " Program terminated successfully"
    STDERR.puts " Number of Pages Crawled: #{count}"
    STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
  rescue => e
    # Print any uncaught exceptions
    puts JSON.pretty_generate({ result: "exception", message: "unknown error", inspect: e.inspect })
    exit
  end
end
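When invoked directly, the demo script reads the same parameters as a JSON object on standard input; an illustrative run (the URL and option values are examples) might look like:

echo '{ "urls": ["http://example.com/"], "threads": 2, "delay": 0.5 }' | ruby lib/slmndr.rb

Crawl hits stream to standard error as they arrive, while input and crawl errors are reported as JSON on standard output.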
data.tar.gz.sig
ADDED
Binary file
metadata
ADDED
@@ -0,0 +1,66 @@
--- !ruby/object:Gem::Specification
name: slmndr
version: !ruby/object:Gem::Version
  version: 0.0.0
platform: ruby
authors:
- John Lawrence M. Penafiel
autorequire:
bindir: bin
cert_chain:
- |
  -----BEGIN CERTIFICATE-----
  MIIDfDCCAmSgAwIBAgIBATANBgkqhkiG9w0BAQUFADBCMRQwEgYDVQQDDAtwZW5h
  ZmllbGpsbTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYD
  Y29tMB4XDTE1MDYyOTE1MDQ1MVoXDTE2MDYyODE1MDQ1MVowQjEUMBIGA1UEAwwL
  cGVuYWZpZWxqbG0xFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
  ARkWA2NvbTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAPO7rVa+t3P8
  pcv9C6k9LjbpVBQhLfHIkIrE98qGrXzzQANhTqRqrLgpCcYw44rMETCg2py7lti2
  +g5997njU0IpPVlwLX71S3ddsvezi2KeYm+q2z0DMN5y1MsgcmoRHL8tGlbLw9FC
  zq0KtJuIdcyrQQf9c3N/qxNbFyAJ5ZiuXq3bueOndt0zH8waOCdZKz+XZQYDJsTn
  Kob8pLZill4ftwP1AOj2wqNG6ElR8NWJW77Il6Lxlh8fVwOxSaYAtRYyQU3C59M+
  fEas9aXk12wEumDql2M3W0DNYiM1dXbamPSxFGu3c/BBrdJcEuQnuDAPYteaKj4c
  2DXoicDGn+UCAwEAAaN9MHswCQYDVR0TBAIwADALBgNVHQ8EBAMCBLAwHQYDVR0O
  BBYEFPMx9v4kXBciiOP03co6iSatkViCMCAGA1UdEQQZMBeBFXBlbmFmaWVsamxt
  QGdtYWlsLmNvbTAgBgNVHRIEGTAXgRVwZW5hZmllbGpsbUBnbWFpbC5jb20wDQYJ
  KoZIhvcNAQEFBQADggEBADUAbj69LIDpA6rLmCQoAKjZRaBYVweZp4G07Gf838+f
  swa/B68PpSjoWmWeErvAahkJhTCNY7SuFinUh6/lDqeqUXIQPQu8PW3Oyfp63u1U
  vnM0Opft5VBhaP/dBEYvN2e/F2q62ObsRtzq9hXW9k/EfgVtlmKeeAeH7k2mPeGi
  7rxC4nb8yYf55rPGLG52BYSBmDwFIh64JiEmLJi3jEiTEMGB+dVQk++0HSuHDFi1
  kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
  1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
  -----END CERTIFICATE-----
date: 2015-06-29 00:00:00.000000000 Z
dependencies: []
description: A minimalistic ruby web crawling framework
email: penafieljlm@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/slmndr.rb
homepage: http://rubygems.org/gems/slmndr
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.5
signing_key:
specification_version: 4
summary: Salamander
test_files: []
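The release is signed (note the cert_chain above and the .sig files in this changeset), so signature checking can optionally be enforced at install time, for example by importing the author's public certificate with gem cert --add and then running gem install slmndr -P MediumSecurity; the choice of trust policy is up to the installer.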
metadata.gz.sig
ADDED
@@ -0,0 +1 @@
Binary file