shelob 0.1.0.beta2 → 0.1.0.beta3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +14 -8
- data/lib/link_result.rb +4 -0
- data/lib/shelob/version.rb +1 -1
- data/lib/shelob.rb +116 -24
- data/shelob.gemspec +1 -0
- data/test/test_link_result.rb +6 -0
- data/test/test_shelob.rb +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7554cee96efb6430592a84c0954da5df9cb7efc2
|
4
|
+
data.tar.gz: 1cf0cfad3ed2f1505f88cd45c22394b09d55c27a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 94d3264022e2e80736a54eb5fe1d0e68e2252be4e4008fc2855df6e841dbd141a073009c97fe10e8121ab06ccae9bf21aa558a26f0067bc0e7dc9985bc836bfd
|
7
|
+
data.tar.gz: 8f2d4e39cae612176646421eb29d1441a24cfacf2ae64c00c6d08fa15e341d72d89d21c1106ed462d2a2d4cbd345bf7296d9a8da54ba30bb0d3b3afe03b245aa
|
data/README.md
CHANGED
@@ -1,12 +1,21 @@
|
|
1
|
-
#
|
1
|
+
# Shelob
|
2
2
|
|
3
|
-
|
3
|
+
Shelob is a giant spider that starts on a given page, finds all links on the page, ensures they resolve, and recurses if the link is underneath the starting url. It is intended primarily for double-checking that your site has no horrible error pages that a user could reach by clicking on a link.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
shelob [-r|v] root_url
|
8
|
+
|
9
|
+
-r: really verbose, will print each url it checks
|
10
|
+
-v: verbose, will just print a progress indicator for each url so you don't think it just stopped
|
11
|
+
|
12
|
+
You can also use the link resolver, extractor, or the spider itself programmatically. Check the tests for usage until I can write up some good documentation...
|
4
13
|
|
5
14
|
## Installation
|
6
15
|
|
7
16
|
Add this line to your application's Gemfile:
|
8
17
|
|
9
|
-
gem '
|
18
|
+
gem 'shelob'
|
10
19
|
|
11
20
|
And then execute:
|
12
21
|
|
@@ -14,16 +23,13 @@ And then execute:
|
|
14
23
|
|
15
24
|
Or install it yourself as:
|
16
25
|
|
17
|
-
$ gem install
|
18
|
-
|
19
|
-
## Usage
|
20
|
-
|
21
|
-
TODO: Write usage instructions here
|
26
|
+
$ gem install shelob
|
22
27
|
|
23
28
|
## Contributing
|
24
29
|
|
25
30
|
1. Fork it
|
26
31
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
32
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
33
|
+
4. Make sure you have tests, and they pass! (`rake`)
|
28
34
|
5. Push to the branch (`git push origin my-new-feature`)
|
29
35
|
6. Create new Pull Request
|
data/lib/link_result.rb
CHANGED
data/lib/shelob/version.rb
CHANGED
data/lib/shelob.rb
CHANGED
@@ -4,59 +4,151 @@ require "extractor"
|
|
4
4
|
require "set"
|
5
5
|
|
6
6
|
module Shelob
|
7
|
+
# This is the central workhorse class of Shelob. It takes
|
8
|
+
# a url, fetches it, and then spiders through any
|
9
|
+
# children of that url and fetches them as well.
|
7
10
|
class Spider
|
11
|
+
# The root url which this Spider instance is working
|
12
|
+
# underneath
|
8
13
|
attr_accessor :hostname
|
9
14
|
|
15
|
+
# Create a new spider with the given hostname and
|
16
|
+
# options
|
17
|
+
#
|
18
|
+
# Valid options:
|
19
|
+
# * Verbose: 0 for no output, 1 for progress output, 2
|
20
|
+
# for verbose output
|
10
21
|
def initialize hostname, options = {}
|
22
|
+
# Data
|
11
23
|
@hostname = hostname
|
12
|
-
|
13
|
-
|
14
|
-
@failures = []
|
24
|
+
|
25
|
+
# Options
|
15
26
|
@verbose = options[:verbose] == 1 ? true : false
|
16
27
|
@chatty = options[:verbose] == 2 ? true : false
|
28
|
+
|
29
|
+
# Internal
|
30
|
+
@queue = [ hostname ]
|
17
31
|
end
|
18
32
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
33
|
+
# Notify that a url is about to be processed. Currently
|
34
|
+
# only used to print status
|
35
|
+
def pre_process_notify url
|
36
|
+
print "#{url}... " if @chatty
|
37
|
+
end
|
38
|
+
|
39
|
+
# Notify that a url has just been processed. Currently
|
40
|
+
# only used to print status
|
41
|
+
def post_process_notify url
|
42
|
+
print '.' if @verbose
|
43
|
+
puts "checked!" if @chatty
|
44
|
+
end
|
45
|
+
|
46
|
+
# Load a page from the internet, appending it to the
|
47
|
+
# failures array if the fetch encountered an error.
|
48
|
+
#
|
49
|
+
# Returns a LinkResult with the results of fetching the
|
50
|
+
# page.
|
51
|
+
def fetch url
|
52
|
+
page = Resolver.new(url).resolve
|
53
|
+
|
54
|
+
@failures << page if page.failed
|
55
|
+
|
56
|
+
page
|
57
|
+
end
|
58
|
+
|
59
|
+
# Extract links from the given url.
|
60
|
+
#
|
61
|
+
# Returns an array of all link targets on the page.
|
62
|
+
def extract url
|
63
|
+
page = fetch url
|
64
|
+
|
65
|
+
Extractor.new(page).extract
|
66
|
+
end
|
67
|
+
|
68
|
+
# Filter links to ensure they are children of the root
|
69
|
+
# url, and remove duplicates
|
70
|
+
def filter links
|
71
|
+
links.select do |link|
|
72
|
+
link.start_with? @hostname
|
73
|
+
end.uniq
|
74
|
+
end
|
75
|
+
|
76
|
+
# Add the given links to our internal queue to ensure
|
77
|
+
# they are checked.
|
78
|
+
def enqueue links
|
79
|
+
children = filter links
|
80
|
+
|
81
|
+
@queue.push(*children)
|
82
|
+
end
|
83
|
+
|
84
|
+
# Signal that processing is done on a given url, so
|
85
|
+
# that it won't be checked again
|
86
|
+
def finish url
|
87
|
+
@urls << url
|
88
|
+
end
|
23
89
|
|
24
|
-
|
25
|
-
|
26
|
-
|
90
|
+
# Given a url, fetch it, extract all links, and enqueue
|
91
|
+
# those links for later processing.
|
92
|
+
def process url
|
93
|
+
links = extract url
|
27
94
|
|
28
|
-
|
29
|
-
print "#{url}... "
|
30
|
-
end
|
95
|
+
enqueue links
|
31
96
|
|
32
|
-
|
97
|
+
finish url
|
98
|
+
end
|
33
99
|
|
34
|
-
|
100
|
+
# Internal helper method to kick off the spider once
|
101
|
+
# everything has been properly configured.
|
102
|
+
def run_spider
|
103
|
+
while not @queue.empty?
|
104
|
+
url = @queue.shift
|
35
105
|
|
36
|
-
|
106
|
+
next if @urls.include? url
|
37
107
|
|
38
|
-
|
39
|
-
link.start_with? @hostname and !@urls.include? link
|
40
|
-
end
|
108
|
+
pre_process_notify url
|
41
109
|
|
42
|
-
|
43
|
-
puts "checked!"
|
44
|
-
end
|
110
|
+
process url
|
45
111
|
|
46
|
-
|
112
|
+
post_process_notify url
|
47
113
|
end
|
114
|
+
end
|
115
|
+
|
116
|
+
# Entry point to the main spider process. This is the
|
117
|
+
# main API point, and will return once the site has
|
118
|
+
# been completely spidered.
|
119
|
+
#
|
120
|
+
# Returns a list of all failed urls, and their
|
121
|
+
# particular error code (404, 500, etc.)
|
122
|
+
def check
|
123
|
+
# set up variables
|
124
|
+
@urls ||= Set.new
|
125
|
+
@failures ||= []
|
126
|
+
|
127
|
+
# kick the spider off
|
128
|
+
run_spider
|
48
129
|
|
49
130
|
@failures
|
50
131
|
end
|
51
132
|
|
133
|
+
# Returns a count of the remaining urls to parse - this
|
134
|
+
# number is only a view of the current state, as more
|
135
|
+
# urls are constantly being added as other urls
|
136
|
+
# resolve.
|
137
|
+
#
|
138
|
+
# This would only be useful to call from another thread
|
139
|
+
# at this time, as check is a blocking call
|
52
140
|
def remaining
|
53
141
|
return @queue.count
|
54
142
|
end
|
55
143
|
|
144
|
+
# Return the total number of urls that were fetched in
|
145
|
+
# the spidering process.
|
56
146
|
def requests
|
57
147
|
return @urls.count
|
58
148
|
end
|
59
|
-
|
149
|
+
|
150
|
+
# Return an array of all urls that were fetched in the
|
151
|
+
# process of spidering the site.
|
60
152
|
def fetched
|
61
153
|
return @urls
|
62
154
|
end
|
data/shelob.gemspec
CHANGED
@@ -6,6 +6,7 @@ require 'shelob/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "shelob"
|
8
8
|
spec.version = Shelob::VERSION
|
9
|
+
spec.homepage = 'https://github.com/bmnick/shelob'
|
9
10
|
spec.authors = ["Benjamin Nicholas"]
|
10
11
|
spec.email = ["bnicholas@brandnetworksinc.com"]
|
11
12
|
spec.description = %q{A giant spider that starts on a given page, finds all links on the page, ensure they resolve, and recurses if the link is underneath the starting url}
|
data/test/test_link_result.rb
CHANGED
@@ -4,6 +4,7 @@ require 'link_result'
|
|
4
4
|
describe LinkResult, "Link fetch result" do
|
5
5
|
before do
|
6
6
|
@result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
|
7
|
+
@failed = LinkResult.new("http://google.com", 404, 'Not found')
|
7
8
|
end
|
8
9
|
|
9
10
|
describe "when created" do
|
@@ -24,6 +25,11 @@ describe LinkResult, "Link fetch result" do
|
|
24
25
|
it "should have a clean string rep" do
|
25
26
|
@result.to_s.must_equal "200: http://google.com"
|
26
27
|
end
|
28
|
+
|
29
|
+
it "should determine if a request is failed" do
|
30
|
+
@result.failed.must_equal false
|
31
|
+
@failed.failed.must_equal true
|
32
|
+
end
|
27
33
|
end
|
28
34
|
end
|
29
35
|
|
data/test/test_shelob.rb
CHANGED
@@ -14,7 +14,7 @@ end
|
|
14
14
|
|
15
15
|
describe Shelob::Spider, "Link checking spider" do
|
16
16
|
before do
|
17
|
-
stub_request(:any, 'http://bmnick.com/resume').to_return(body: '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a><a href="http://bmnick.com/resume/secret"</body></html>')
|
17
|
+
stub_request(:any, 'http://bmnick.com/resume').to_return(body: '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a><a href="http://bmnick.com/resume/secret"</body></html>').times(1).then.to_return(status: 514)
|
18
18
|
stub_request(:any, 'http://bmnick.com/').to_return(status: 200, body: '<html><head><title>pdf</title></head><body><a href="http://bmnick.com/resume/">resume</a><a href="http://bmnick.com/">home</a><a href="http://bmnick.com/resume/secret">no touchy!</a></body></html>')
|
19
19
|
stub_request(:any, 'http://bmnick.com/resume/secret').to_return(body: '<html><head><title>secrets</title></head><body><a href="http://bmnick.com/resume/boring">boredom</a><a href="http://bmnick.com/resume">resume</a><a href="/resume/relative">relative</a></body></html>"')
|
20
20
|
stub_request(:any, 'http://bmnick.com/resume/resume.pdf').to_return(status: 404)
|
@@ -26,9 +26,9 @@ describe Shelob::Spider, "Link checking spider" do
|
|
26
26
|
Shelob::Spider.wont_be_nil
|
27
27
|
end
|
28
28
|
it "should store the initial url" do
|
29
|
-
spider = Shelob::Spider.new("
|
29
|
+
spider = Shelob::Spider.new("http://bmnick.com")
|
30
30
|
spider.wont_be_nil
|
31
|
-
spider.hostname.must_equal "
|
31
|
+
spider.hostname.must_equal "http://bmnick.com"
|
32
32
|
end
|
33
33
|
end
|
34
34
|
describe "when checking links" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: shelob
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.
|
4
|
+
version: 0.1.0.beta3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Nicholas
|
@@ -134,7 +134,7 @@ files:
|
|
134
134
|
- test/test_link_result.rb
|
135
135
|
- test/test_resolver.rb
|
136
136
|
- test/test_shelob.rb
|
137
|
-
homepage:
|
137
|
+
homepage: https://github.com/bmnick/shelob
|
138
138
|
licenses:
|
139
139
|
- MIT
|
140
140
|
metadata: {}
|