shelob 0.1.0.beta3 → 0.1.0.beta4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/shelob +10 -1
- data/lib/extractor.rb +1 -1
- data/lib/shelob/version.rb +1 -1
- data/lib/shelob.rb +52 -43
- data/test/test_extractor.rb +9 -0
- data/test/test_shelob.rb +6 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 797b409821f95b66b1a3c7a7852a4f6e1cc16159
|
4
|
+
data.tar.gz: 42946abb1bec2cb2545598b41e6dd97e0a27eb2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9cea61a95b7dcebdd8b49552260864e7e01b1ceda91cf537663787cbcec8357022a5c8c0a4c793f423c6fa2b6bc82c6f606e66b58806171e0254abde7bc9ff16
|
7
|
+
data.tar.gz: ef49faa8de4267ba382274e2a4f912a41a57967565d6dce769ec878e1ed070e15fd15c05e001adbb6b1daa8df0a39048f4a3b8ae161fc588056c527a99ccb460
|
data/bin/shelob
CHANGED
@@ -2,9 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'optparse'
|
4
4
|
require 'shelob'
|
5
|
+
require 'shelob/version'
|
5
6
|
|
6
7
|
def main args, options
|
7
|
-
puts Shelob::Spider.new(args[0],
|
8
|
+
puts Shelob::Spider.new(args[0], options).check
|
8
9
|
|
9
10
|
0
|
10
11
|
end
|
@@ -20,11 +21,19 @@ optparse = OptionParser.new do |opts|
|
|
20
21
|
opts.on('-r', '--[no-]really-verbose', "Print lots of information(overrides -v)") do
|
21
22
|
options[:verbose] = 2
|
22
23
|
end
|
24
|
+
|
25
|
+
opts.on('-s', '--seed SEED_URL', "Initial seed url if different from root url") do |seed|
|
26
|
+
options[:seed] = seed
|
27
|
+
end
|
23
28
|
|
24
29
|
opts.on_tail('-h', '--help', 'Show this message') do
|
25
30
|
puts opts
|
26
31
|
exit
|
27
32
|
end
|
33
|
+
|
34
|
+
opts.on_tail('--version', 'Show version') do
|
35
|
+
puts Shelob::VERSION
|
36
|
+
end
|
28
37
|
end.parse!
|
29
38
|
|
30
39
|
if ARGV.empty?
|
data/lib/extractor.rb
CHANGED
@@ -10,7 +10,7 @@ module Shelob
|
|
10
10
|
def extract
|
11
11
|
content = Nokogiri::HTML(@fetched.body)
|
12
12
|
raw = content.css('a').map { |anchor| anchor['href'] }
|
13
|
-
raw.map do |link|
|
13
|
+
raw.reject(&:nil?).map do |link|
|
14
14
|
if link.start_with? '/'
|
15
15
|
u = URI(@fetched.url)
|
16
16
|
"#{u.scheme}://#{u.host}#{link}"
|
data/lib/shelob/version.rb
CHANGED
data/lib/shelob.rb
CHANGED
@@ -12,12 +12,17 @@ module Shelob
|
|
12
12
|
# underneath
|
13
13
|
attr_accessor :hostname
|
14
14
|
|
15
|
+
# The current queue of urls to check
|
16
|
+
attr_accessor :queue
|
17
|
+
|
15
18
|
# Create a new spider with the given hostname and
|
16
19
|
# options
|
17
20
|
#
|
18
21
|
# Valid options:
|
19
|
-
# *
|
20
|
-
#
|
22
|
+
# * verbose: 0 for no output, 1 for progress output, 2
|
23
|
+
# for verbose output
|
24
|
+
# * seed: Provide an initial seed value, other than the
|
25
|
+
# root url you're providing
|
21
26
|
def initialize hostname, options = {}
|
22
27
|
# Data
|
23
28
|
@hostname = hostname
|
@@ -27,7 +32,51 @@ module Shelob
|
|
27
32
|
@chatty = options[:verbose] == 2 ? true : false
|
28
33
|
|
29
34
|
# Internal
|
30
|
-
|
35
|
+
if options[:seed].nil?
|
36
|
+
@queue = [ hostname ]
|
37
|
+
else
|
38
|
+
@queue = [ options[:seed] ]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# Entry point to the main spider process. This is the
|
43
|
+
# main API point, and will return once the site has
|
44
|
+
# been completely spidered.
|
45
|
+
#
|
46
|
+
# Returns a list of all failed urls, and their
|
47
|
+
# particular error code (404, 500, etc.)
|
48
|
+
def check
|
49
|
+
# set up variables
|
50
|
+
@urls ||= Set.new
|
51
|
+
@failures ||= []
|
52
|
+
|
53
|
+
# kick the spider off
|
54
|
+
run_spider
|
55
|
+
|
56
|
+
@failures
|
57
|
+
end
|
58
|
+
|
59
|
+
# Returns a count of the remaining urls to parse - this
|
60
|
+
# number is only a view of the current state, as more
|
61
|
+
# urls are constantly being added as other urls
|
62
|
+
# resolve.
|
63
|
+
#
|
64
|
+
# This would only be useful to call from another thread
|
65
|
+
# at this time, as check is a blocking call
|
66
|
+
def remaining
|
67
|
+
return @queue.count
|
68
|
+
end
|
69
|
+
|
70
|
+
# Return the total number of urls that were fetched in
|
71
|
+
# the spidering process.
|
72
|
+
def requests
|
73
|
+
return @urls.count
|
74
|
+
end
|
75
|
+
|
76
|
+
# Return an array of all urls that were fetched in the
|
77
|
+
# process of spidering the site.
|
78
|
+
def fetched
|
79
|
+
return @urls
|
31
80
|
end
|
32
81
|
|
33
82
|
# Notify that a url is about to be processed. Currently
|
@@ -112,45 +161,5 @@ module Shelob
|
|
112
161
|
post_process_notify url
|
113
162
|
end
|
114
163
|
end
|
115
|
-
|
116
|
-
# Entry point to the main spider process. This is the
|
117
|
-
# main API point, and will return once the site has
|
118
|
-
# been completely spidered.
|
119
|
-
#
|
120
|
-
# Returns a list of all failed urls, and their
|
121
|
-
# particular error code (404, 500, etc.)
|
122
|
-
def check
|
123
|
-
# set up variables
|
124
|
-
@urls ||= Set.new
|
125
|
-
@failures ||= []
|
126
|
-
|
127
|
-
# kick the spider off
|
128
|
-
run_spider
|
129
|
-
|
130
|
-
@failures
|
131
|
-
end
|
132
|
-
|
133
|
-
# Returns a count of the remaining urls to parse - this
|
134
|
-
# number is only a view of the current state, as more
|
135
|
-
# urls are constantly being added as other urls
|
136
|
-
# resolve.
|
137
|
-
#
|
138
|
-
# This would only be useful to call from another thread
|
139
|
-
# at this time, as check is a blocking call
|
140
|
-
def remaining
|
141
|
-
return @queue.count
|
142
|
-
end
|
143
|
-
|
144
|
-
# Return the total number of urls that were fetched in
|
145
|
-
# the spidering process.
|
146
|
-
def requests
|
147
|
-
return @urls.count
|
148
|
-
end
|
149
|
-
|
150
|
-
# Return an array of all urls that were fetched in the
|
151
|
-
# process of spidering the site.
|
152
|
-
def fetched
|
153
|
-
return @urls
|
154
|
-
end
|
155
164
|
end
|
156
165
|
end
|
data/test/test_extractor.rb
CHANGED
@@ -15,8 +15,10 @@ describe Shelob::Extractor, "Link extracting module" do
|
|
15
15
|
before do
|
16
16
|
@result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
|
17
17
|
@result2 = LinkResult.new("http://google.com/something", 200, '<html><head><title>hi</title></head><body><a href="/about">about</a></body></html>')
|
18
|
+
@result3 = LinkResult.new("http://google.com/another", 200, '<html><head><title>hi</title></head><body><a>about</a><a href="http://boop.com">boop</a></body></html>')
|
18
19
|
@le = Shelob::Extractor.new(@result)
|
19
20
|
@le2 = Shelob::Extractor.new(@result2)
|
21
|
+
@le3 = Shelob::Extractor.new(@result3)
|
20
22
|
end
|
21
23
|
|
22
24
|
it "should return a list of the links in the page" do
|
@@ -31,6 +33,13 @@ describe Shelob::Extractor, "Link extracting module" do
|
|
31
33
|
extracts.must_equal ["http://google.com/about"]
|
32
34
|
end
|
33
35
|
|
36
|
+
it "should gracefully handle empty links" do
|
37
|
+
# we shouldn't get an exception here
|
38
|
+
extracts = @le3.extract
|
39
|
+
extracts.must_be_kind_of Array
|
40
|
+
extracts.must_equal ["http://boop.com"]
|
41
|
+
end
|
42
|
+
|
34
43
|
end # describe
|
35
44
|
|
36
45
|
end # describe
|
data/test/test_shelob.rb
CHANGED
@@ -30,6 +30,12 @@ describe Shelob::Spider, "Link checking spider" do
|
|
30
30
|
spider.wont_be_nil
|
31
31
|
spider.hostname.must_equal "http://bmnick.com"
|
32
32
|
end
|
33
|
+
it "should be able to take a seperate seed url" do
|
34
|
+
spider = Shelob::Spider.new("http://bmnick.com", seed: "http://bmnick.com/resume")
|
35
|
+
spider.wont_be_nil
|
36
|
+
spider.hostname.must_equal "http://bmnick.com"
|
37
|
+
spider.queue.must_include "http://bmnick.com/resume"
|
38
|
+
end
|
33
39
|
end
|
34
40
|
describe "when checking links" do
|
35
41
|
before do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: shelob
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.beta3
|
4
|
+
version: 0.1.0.beta4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Nicholas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-12-
|
11
|
+
date: 2013-12-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|