shelob 0.1.0.beta3 → 0.1.0.beta4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/shelob +10 -1
- data/lib/extractor.rb +1 -1
- data/lib/shelob/version.rb +1 -1
- data/lib/shelob.rb +52 -43
- data/test/test_extractor.rb +9 -0
- data/test/test_shelob.rb +6 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 797b409821f95b66b1a3c7a7852a4f6e1cc16159
|
4
|
+
data.tar.gz: 42946abb1bec2cb2545598b41e6dd97e0a27eb2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9cea61a95b7dcebdd8b49552260864e7e01b1ceda91cf537663787cbcec8357022a5c8c0a4c793f423c6fa2b6bc82c6f606e66b58806171e0254abde7bc9ff16
|
7
|
+
data.tar.gz: ef49faa8de4267ba382274e2a4f912a41a57967565d6dce769ec878e1ed070e15fd15c05e001adbb6b1daa8df0a39048f4a3b8ae161fc588056c527a99ccb460
|
data/bin/shelob
CHANGED
@@ -2,9 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'optparse'
|
4
4
|
require 'shelob'
|
5
|
+
require 'shelob/version'
|
5
6
|
|
6
7
|
def main args, options
|
7
|
-
puts Shelob::Spider.new(args[0],
|
8
|
+
puts Shelob::Spider.new(args[0], options).check
|
8
9
|
|
9
10
|
0
|
10
11
|
end
|
@@ -20,11 +21,19 @@ optparse = OptionParser.new do |opts|
|
|
20
21
|
opts.on('-r', '--[no-]really-verbose', "Print lots of information(overrides -v)") do
|
21
22
|
options[:verbose] = 2
|
22
23
|
end
|
24
|
+
|
25
|
+
opts.on('-s', '--seed SEED_URL', "Initial seed url if different from root url") do |seed|
|
26
|
+
options[:seed] = seed
|
27
|
+
end
|
23
28
|
|
24
29
|
opts.on_tail('-h', '--help', 'Show this message') do
|
25
30
|
puts opts
|
26
31
|
exit
|
27
32
|
end
|
33
|
+
|
34
|
+
opts.on_tail('--version', 'Show version') do
|
35
|
+
puts Shelob::VERSION
|
36
|
+
end
|
28
37
|
end.parse!
|
29
38
|
|
30
39
|
if ARGV.empty?
|
data/lib/extractor.rb
CHANGED
@@ -10,7 +10,7 @@ module Shelob
|
|
10
10
|
def extract
|
11
11
|
content = Nokogiri::HTML(@fetched.body)
|
12
12
|
raw = content.css('a').map { |anchor| anchor['href'] }
|
13
|
-
raw.map do |link|
|
13
|
+
raw.reject(&:nil?).map do |link|
|
14
14
|
if link.start_with? '/'
|
15
15
|
u = URI(@fetched.url)
|
16
16
|
"#{u.scheme}://#{u.host}#{link}"
|
data/lib/shelob/version.rb
CHANGED
data/lib/shelob.rb
CHANGED
@@ -12,12 +12,17 @@ module Shelob
|
|
12
12
|
# underneath
|
13
13
|
attr_accessor :hostname
|
14
14
|
|
15
|
+
# The current queue of urls to check
|
16
|
+
attr_accessor :queue
|
17
|
+
|
15
18
|
# Create a new spider with the given hostname and
|
16
19
|
# options
|
17
20
|
#
|
18
21
|
# Valid options:
|
19
|
-
# *
|
20
|
-
#
|
22
|
+
# * verbose: 0 for no output, 1 for progress output, 2
|
23
|
+
# for verbose output
|
24
|
+
# * seed: Provide an initial seed value, other than the
|
25
|
+
# root url you're providing
|
21
26
|
def initialize hostname, options = {}
|
22
27
|
# Data
|
23
28
|
@hostname = hostname
|
@@ -27,7 +32,51 @@ module Shelob
|
|
27
32
|
@chatty = options[:verbose] == 2 ? true : false
|
28
33
|
|
29
34
|
# Internal
|
30
|
-
|
35
|
+
if options[:seed].nil?
|
36
|
+
@queue = [ hostname ]
|
37
|
+
else
|
38
|
+
@queue = [ options[:seed] ]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# Entry point to the main spider process. This is the
|
43
|
+
# main API point, and will return once the site has
|
44
|
+
# been completely spidered.
|
45
|
+
#
|
46
|
+
# Returns a list of all failed urls, and their
|
47
|
+
# particular error code (404, 500, etc.)
|
48
|
+
def check
|
49
|
+
# set up variables
|
50
|
+
@urls ||= Set.new
|
51
|
+
@failures ||= []
|
52
|
+
|
53
|
+
# kick the spider off
|
54
|
+
run_spider
|
55
|
+
|
56
|
+
@failures
|
57
|
+
end
|
58
|
+
|
59
|
+
# Returns a count of the remaining urls to parse - this
|
60
|
+
# number is only a view of the current state, as more
|
61
|
+
# urls are constantly being added as other urls
|
62
|
+
# resolve.
|
63
|
+
#
|
64
|
+
# This would only be useful to call from another thread
|
65
|
+
# at this time, as check is a blocking call
|
66
|
+
def remaining
|
67
|
+
return @queue.count
|
68
|
+
end
|
69
|
+
|
70
|
+
# Return the total number of urls that were fetched in
|
71
|
+
# the spidering process.
|
72
|
+
def requests
|
73
|
+
return @urls.count
|
74
|
+
end
|
75
|
+
|
76
|
+
# Return an array of all urls that were fetched in the
|
77
|
+
# process of spidering the site.
|
78
|
+
def fetched
|
79
|
+
return @urls
|
31
80
|
end
|
32
81
|
|
33
82
|
# Notify that a url is about to be processed. Currently
|
@@ -112,45 +161,5 @@ module Shelob
|
|
112
161
|
post_process_notify url
|
113
162
|
end
|
114
163
|
end
|
115
|
-
|
116
|
-
# Entry point to the main spider process. This is the
|
117
|
-
# main API point, and will return once the site has
|
118
|
-
# been completely spidered.
|
119
|
-
#
|
120
|
-
# Returns a list of all failed urls, and their
|
121
|
-
# particular error code (404, 500, etc.)
|
122
|
-
def check
|
123
|
-
# set up variables
|
124
|
-
@urls ||= Set.new
|
125
|
-
@failures ||= []
|
126
|
-
|
127
|
-
# kick the spider off
|
128
|
-
run_spider
|
129
|
-
|
130
|
-
@failures
|
131
|
-
end
|
132
|
-
|
133
|
-
# Returns a count of the remaining urls to parse - this
|
134
|
-
# number is only a view of the current state, as more
|
135
|
-
# urls are constantly being added as other urls
|
136
|
-
# resolve.
|
137
|
-
#
|
138
|
-
# This would only be useful to call from another thread
|
139
|
-
# at this time, as check is a blocking call
|
140
|
-
def remaining
|
141
|
-
return @queue.count
|
142
|
-
end
|
143
|
-
|
144
|
-
# Return the total number of urls that were fetched in
|
145
|
-
# the spidering process.
|
146
|
-
def requests
|
147
|
-
return @urls.count
|
148
|
-
end
|
149
|
-
|
150
|
-
# Return an array of all urls that were fetched in the
|
151
|
-
# process of spidering the site.
|
152
|
-
def fetched
|
153
|
-
return @urls
|
154
|
-
end
|
155
164
|
end
|
156
165
|
end
|
data/test/test_extractor.rb
CHANGED
@@ -15,8 +15,10 @@ describe Shelob::Extractor, "Link extracting module" do
|
|
15
15
|
before do
|
16
16
|
@result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
|
17
17
|
@result2 = LinkResult.new("http://google.com/something", 200, '<html><head><title>hi</title></head><body><a href="/about">about</a></body></html>')
|
18
|
+
@result3 = LinkResult.new("http://google.com/another", 200, '<html><head><title>hi</title></head><body><a>about</a><a href="http://boop.com">boop</a></body></html>')
|
18
19
|
@le = Shelob::Extractor.new(@result)
|
19
20
|
@le2 = Shelob::Extractor.new(@result2)
|
21
|
+
@le3 = Shelob::Extractor.new(@result3)
|
20
22
|
end
|
21
23
|
|
22
24
|
it "should return a list of the links in the page" do
|
@@ -31,6 +33,13 @@ describe Shelob::Extractor, "Link extracting module" do
|
|
31
33
|
extracts.must_equal ["http://google.com/about"]
|
32
34
|
end
|
33
35
|
|
36
|
+
it "should gracefully handle empty links" do
|
37
|
+
# we shouldn't get an exception here
|
38
|
+
extracts = @le3.extract
|
39
|
+
extracts.must_be_kind_of Array
|
40
|
+
extracts.must_equal ["http://boop.com"]
|
41
|
+
end
|
42
|
+
|
34
43
|
end # describe
|
35
44
|
|
36
45
|
end # describe
|
data/test/test_shelob.rb
CHANGED
@@ -30,6 +30,12 @@ describe Shelob::Spider, "Link checking spider" do
|
|
30
30
|
spider.wont_be_nil
|
31
31
|
spider.hostname.must_equal "http://bmnick.com"
|
32
32
|
end
|
33
|
+
it "should be able to take a seperate seed url" do
|
34
|
+
spider = Shelob::Spider.new("http://bmnick.com", seed: "http://bmnick.com/resume")
|
35
|
+
spider.wont_be_nil
|
36
|
+
spider.hostname.must_equal "http://bmnick.com"
|
37
|
+
spider.queue.must_include "http://bmnick.com/resume"
|
38
|
+
end
|
33
39
|
end
|
34
40
|
describe "when checking links" do
|
35
41
|
before do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: shelob
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.beta3
|
4
|
+
version: 0.1.0.beta4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Nicholas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-12-
|
11
|
+
date: 2013-12-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|