shelob 0.1.0.beta2 → 0.1.0.beta3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6169a929fd11d18cbd5a14c6901ff233c27a56a0
-  data.tar.gz: b14b82eebbaf401303286b5e1c5744157c471a34
+  metadata.gz: 7554cee96efb6430592a84c0954da5df9cb7efc2
+  data.tar.gz: 1cf0cfad3ed2f1505f88cd45c22394b09d55c27a
 SHA512:
-  metadata.gz: c8f0f6363eb626baceab44365fea723a41dc4859a574aa881c87e3aed3bee96cbf3d24e00b6d795bdf83d535a521da23cc8d07b14a3cb91dd9333e624b09bc77
-  data.tar.gz: 80aa2da0b5596a9f4a294ddb0a6669728db1ca23e8e6980f9ac530dc761836b79fde792c62de623d41f0477e72014568e48728a70cd788167b4f03bd6913b102
+  metadata.gz: 94d3264022e2e80736a54eb5fe1d0e68e2252be4e4008fc2855df6e841dbd141a073009c97fe10e8121ab06ccae9bf21aa558a26f0067bc0e7dc9985bc836bfd
+  data.tar.gz: 8f2d4e39cae612176646421eb29d1441a24cfacf2ae64c00c6d08fa15e341d72d89d21c1106ed462d2a2d4cbd345bf7296d9a8da54ba30bb0d3b3afe03b245aa
data/README.md CHANGED
@@ -1,12 +1,21 @@
-# LinkChecker
+# Shelob
 
-TODO: Write a gem description
+Shelob is a giant spider that starts on a given page, finds all links on the page, ensures they resolve, and recurses into any link that sits underneath the starting url. It is intended primarily for double-checking that your site has no horrible error pages waiting to be exposed to a user who clicks a link.
+
+## Usage
+
+    shelob [-r|v] root_url
+
+    -r: really verbose, prints each url as it is checked
+    -v: verbose, prints a progress indicator for each url so you can tell it hasn't stalled
+
+You can also use the link resolver, the extractor, or the spider itself programmatically. Check the tests for usage until I can write up some proper documentation.
 
 ## Installation
 
 Add this line to your application's Gemfile:
 
-    gem 'link_checker'
+    gem 'shelob'
 
 And then execute:
 
@@ -14,16 +23,13 @@ And then execute:
 
 Or install it yourself as:
 
-    $ gem install link_checker
-
-## Usage
-
-TODO: Write usage instructions here
+    $ gem install shelob
 
 ## Contributing
 
 1. Fork it
 2. Create your feature branch (`git checkout -b my-new-feature`)
 3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Make sure you have tests, and they pass! (`rake`)
 4. Push to the branch (`git push origin my-new-feature`)
 5. Create new Pull Request
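
The README above points to the tests for programmatic usage. As a rough orientation, here is a minimal sketch based only on the Spider API visible in the data/lib/shelob.rb diff further down; the example url and the `require 'shelob'` path are assumptions, not something documented in this release.

```ruby
# Minimal programmatic sketch. The Spider API (Spider.new with a :verbose
# option, #check, #requests) is taken from the data/lib/shelob.rb diff
# below; the url and the require path are assumptions.
require 'shelob'

# verbose: 1 prints a progress dot per url, verbose: 2 prints each url checked
spider = Shelob::Spider.new('http://example.com/', verbose: 1)

failures = spider.check        # blocks until every child url has been fetched

failures.each { |result| puts result }   # LinkResult#to_s => "404: http://example.com/missing"
puts "#{spider.requests} urls fetched, #{failures.count} failed"
```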
data/lib/link_result.rb CHANGED
@@ -10,4 +10,8 @@ class LinkResult
   def to_s
     "#{@status}: #{@url}"
   end
+
+  def failed
+    @status.to_i >= 400
+  end
 end
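
The new `failed` predicate simply treats any status of 400 or above as a failure. A small illustrative sketch, using the constructor arguments shown in the test/test_link_result.rb diff below (the example urls are made up):

```ruby
# Sketch of the new LinkResult#failed predicate. The (url, status, body)
# constructor arguments follow the usage in test/test_link_result.rb.
require 'link_result'

ok     = LinkResult.new('http://example.com/', 200, '<html>ok</html>')
broken = LinkResult.new('http://example.com/gone', 404, 'Not found')

ok.failed       # => false (status is below 400)
broken.failed   # => true  (any 4xx/5xx status counts as failed)
```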
data/lib/shelob/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Shelob
-  VERSION = "0.1.0.beta2"
+  VERSION = "0.1.0.beta3"
 end
data/lib/shelob.rb CHANGED
@@ -4,59 +4,151 @@ require "extractor"
 require "set"
 
 module Shelob
+  # This is the central workhorse class of Shelob. It takes
+  # a url, fetches it, and then spiders through any
+  # children of that url and fetches them as well.
   class Spider
+    # The root url which this Spider instance is working
+    # underneath
     attr_accessor :hostname
 
+    # Create a new spider with the given hostname and
+    # options
+    #
+    # Valid options:
+    # * Verbose: 0 for no output, 1 for progress output, 2
+    #   for verbose output
     def initialize hostname, options = {}
+      # Data
       @hostname = hostname
-      @queue = [ hostname ]
-      @urls = Set.new @queue
-      @failures = []
+
+      # Options
       @verbose = options[:verbose] == 1 ? true : false
       @chatty = options[:verbose] == 2 ? true : false
+
+      # Internal
+      @queue = [ hostname ]
     end
 
-    def check
-      while not @queue.empty?
-        url = @queue.shift
-        @urls << url
+    # Notify that a url is about to be processed. Currently
+    # only used to print status
+    def pre_process_notify url
+      print "#{url}... " if @chatty
+    end
+
+    # Notify that a url has just been processed. Currently
+    # only used to print status
+    def post_process_notify url
+      print '.' if @verbose
+      puts "checked!" if @chatty
+    end
+
+    # Load a page from the internet, appending it to the
+    # failures array if the fetch encountered an error.
+    #
+    # Returns a LinkResult with the results of fetching the
+    # page.
+    def fetch url
+      page = Resolver.new(url).resolve
+
+      @failures << page if page.failed
+
+      page
+    end
+
+    # Extract links from the given url.
+    #
+    # Returns an array of all link targets on the page.
+    def extract url
+      page = fetch url
+
+      Extractor.new(page).extract
+    end
+
+    # Filter links to ensure they are children of the root
+    # url, and removes duplicates
+    def filter links
+      links.select do |link|
+        link.start_with? @hostname
+      end.uniq
+    end
+
+    # Add the given links to our internal queue to ensure
+    # they are checked.
+    def enqueue links
+      children = filter links
+
+      @queue.push(*children)
+    end
+
+    # Signal that processing is done on a given url, so
+    # that it won't be checked again
+    def finish url
+      @urls << url
+    end
 
-        if @verbose
-          print '.'
-        end
+    # Given a url, fetch it, extract all links, and enqueue
+    # those links for later processing.
+    def process url
+      links = extract url
 
-        if @chatty
-          print "#{url}... "
-        end
+      enqueue links
 
-        fetch = Resolver.new(url).resolve
+      finish url
+    end
 
-        @failures << fetch if fetch.status >= 400
+    # Internal helper method to kick off the spider once
+    # everything has been properly configured.
+    def run_spider
+      while not @queue.empty?
+        url = @queue.shift
 
-        links = Extractor.new(fetch).extract
+        next if @urls.include? url
 
-        filtered = links.select do |link|
-          link.start_with? @hostname and !@urls.include? link
-        end
+        pre_process_notify url
 
-        if @chatty
-          puts "checked!"
-        end
+        process url
 
-        @queue.push(*filtered)
+        post_process_notify url
       end
+    end
+
+    # Entry point to the main spider process. This is the
+    # main API point, and will return once the site has
+    # been completely spidered.
+    #
+    # Returns a list of all failed urls, and their
+    # particular error code (404, 500, etc.)
+    def check
+      # set up variables
+      @urls ||= Set.new
+      @failures ||= []
+
+      # kick the spider off
+      run_spider
 
       @failures
     end
 
+    # Returns a count of the remaining urls to parse - this
+    # number is only a view of the current state, as more
+    # urls are constantly being added as other urls
+    # resolve.
+    #
+    # This would only be useful to call from another thread
+    # at this time, as check is a blocking call
     def remaining
       return @queue.count
     end
 
+    # Return the total number of urls that were fetched in
+    # the spidering process.
    def requests
       return @urls.count
     end
-
+
+    # Return an array of all urls that were fetched in the
+    # process of spidering the site.
     def fetched
       return @urls
     end
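
This refactor splits the old monolithic `check` loop into `fetch`, `extract`, `filter`, `enqueue`, `finish` and `process`, with `check` now lazily initialising `@urls` and `@failures` before handing off to `run_spider`. The README also mentions using the resolver and extractor directly; the sketch below uses only the calls visible in this diff (`Resolver.new(url).resolve`, `Extractor.new(page).extract`). The require paths are assumptions modelled on the `require "extractor"` line in the hunk header.

```ruby
# Rough sketch of driving the lower-level pieces on their own. The Resolver
# and Extractor calls are copied from the diff above; the require paths and
# the url are assumptions for illustration.
require 'resolver'
require 'extractor'

page  = Resolver.new('http://example.com/').resolve   # => LinkResult
links = Extractor.new(page).extract                   # => array of link targets

puts page                          # prints e.g. "200: http://example.com/"
puts "#{links.count} links found"
```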
data/shelob.gemspec CHANGED
@@ -6,6 +6,7 @@ require 'shelob/version'
 Gem::Specification.new do |spec|
   spec.name = "shelob"
   spec.version = Shelob::VERSION
+  spec.homepage = 'https://github.com/bmnick/shelob'
   spec.authors = ["Benjamin Nicholas"]
   spec.email = ["bnicholas@brandnetworksinc.com"]
   spec.description = %q{A giant spider that starts on a given page, finds all links on the page, ensure they resolve, and recurses if the link is underneath the starting url}
data/test/test_link_result.rb CHANGED
@@ -4,6 +4,7 @@ require 'link_result'
 describe LinkResult, "Link fetch result" do
   before do
     @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
+    @failed = LinkResult.new("http://google.com", 404, 'Not found')
   end
 
   describe "when created" do
@@ -24,6 +25,11 @@ describe LinkResult, "Link fetch result" do
     it "should have a clean string rep" do
       @result.to_s.must_equal "200: http://google.com"
     end
+
+    it "should determine if a request is failed" do
+      @result.failed.must_equal false
+      @failed.failed.must_equal true
+    end
   end
 end
 
data/test/test_shelob.rb CHANGED
@@ -14,7 +14,7 @@ end
 
 describe Shelob::Spider, "Link checking spider" do
   before do
-    stub_request(:any, 'http://bmnick.com/resume').to_return(body: '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a><a href="http://bmnick.com/resume/secret"</body></html>')
+    stub_request(:any, 'http://bmnick.com/resume').to_return(body: '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a><a href="http://bmnick.com/resume/secret"</body></html>').times(1).then.to_return(status: 514)
     stub_request(:any, 'http://bmnick.com/').to_return(status: 200, body: '<html><head><title>pdf</title></head><body><a href="http://bmnick.com/resume/">resume</a><a href="http://bmnick.com/">home</a><a href="http://bmnick.com/resume/secret">no touchy!</a></body></html>')
     stub_request(:any, 'http://bmnick.com/resume/secret').to_return(body: '<html><head><title>secrets</title></head><body><a href="http://bmnick.com/resume/boring">boredom</a><a href="http://bmnick.com/resume">resume</a><a href="/resume/relative">relative</a></body></html>"')
     stub_request(:any, 'http://bmnick.com/resume/resume.pdf').to_return(status: 404)
@@ -26,9 +26,9 @@ describe Shelob::Spider, "Link checking spider" do
       Shelob::Spider.wont_be_nil
     end
     it "should store the initial url" do
-      spider = Shelob::Spider.new("https://openforum.com")
+      spider = Shelob::Spider.new("http://bmnick.com")
       spider.wont_be_nil
-      spider.hostname.must_equal "https://openforum.com"
+      spider.hostname.must_equal "http://bmnick.com"
     end
   end
   describe "when checking links" do
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: shelob
 version: !ruby/object:Gem::Version
-  version: 0.1.0.beta2
+  version: 0.1.0.beta3
 platform: ruby
 authors:
 - Benjamin Nicholas
@@ -134,7 +134,7 @@ files:
 - test/test_link_result.rb
 - test/test_resolver.rb
 - test/test_shelob.rb
-homepage:
+homepage: https://github.com/bmnick/shelob
 licenses:
 - MIT
 metadata: {}