shelob 0.1.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4fd39cfeaa059a074821a0d60ee93b464ff48819
4
+ data.tar.gz: fc04fd19c13c1a970abfed9c126854da9ad6eaeb
5
+ SHA512:
6
+ metadata.gz: 72b0645723887dfb1942108c93e5cfda5f50de550d5a240f5da0119763fbd02745fdce0f88038b42849b863e54a81cf214db6285badab74eed6dbb082debd8ab
7
+ data.tar.gz: e53ddc74da61a78b19acde7d9e87859c9682dc48bd044f736f51942d1ee51d79b573b868ee5c72e3c87e9ab1e4e922f6c3760357947a2497492f1b1ce9ecd693
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in shelob.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,7 @@
1
+ guard 'minitest' do
2
+
3
+ # with Minitest::Unit
4
+ watch(%r|^test/(.*)\/?test_(.*)\.rb$|)
5
+ watch(%r|^lib/(.*?)([^/]+)\.rb$|) { |m| "test/#{m[1]}test_#{m[2]}.rb" }
6
+
7
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Benjamin Nicholas
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Shelob
2
+
3
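+ A giant spider that starts on a given page, finds all links on the page, ensures they resolve, and recurses if the link is underneath the starting URL.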
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'shelob'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install shelob --pre
18
+
19
+ ## Usage
20
+
21
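+ Run `shelob [options] root_url` to crawl every link underneath `root_url`; each failing link (HTTP status 400 or above) is printed as `status: url`. Pass `-v` to print a dot per page checked, `-r` to print each URL as it is checked, and `-h` to show the help text.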
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.pattern = "test/test_*.rb"
6
+ end
7
+
8
+ task :default => 'test'
data/bin/shelob ADDED
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'shelob'
5
+
6
# Crawl the site rooted at the first positional argument and print every
# failing link, one per line (each failure is rendered via its #to_s).
#
# The parsed options are passed in explicitly: a top-level local variable is
# not visible inside a method body, so reading `options` here without a
# parameter raises NameError.
#
# @param args [Array<String>] positional arguments left after option parsing;
#   args[0] is the root URL
# @param options [Hash] parsed CLI options; :verbose is 0, 1 or 2
# @return [Integer] exit status: 0 when no link failed, 1 otherwise
def main(args, options)
  failures = Shelob::Spider.new(args[0], verbose: options[:verbose]).check
  puts failures
  failures.empty? ? 0 : 1
end

options = { verbose: 0 }

# Keep the parser object itself (not the return value of #parse!) so the
# usage text can be printed when no root URL is supplied.
parser = OptionParser.new do |opts|
  opts.banner = "Usage: shelob [options] root_url"

  # `--[no-]flag` switches yield true/false; honour the negated form
  # instead of unconditionally enabling the verbosity level.
  opts.on('-v', "--[no-]verbose", "Print simple information(overrides -r)") do |enabled|
    options[:verbose] = enabled ? 1 : 0
  end

  opts.on('-r', '--[no-]really-verbose', "Print lots of information(overrides -v)") do |enabled|
    options[:verbose] = enabled ? 2 : 0
  end

  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end
end
parser.parse!

if ARGV.empty?
  puts parser
  exit 1
end

begin
  # `exit` raises SystemExit, which is not a StandardError, so it is not
  # swallowed by the rescue below.
  exit main(ARGV, options)
rescue => ex
  STDERR.puts ex.message
  exit 1
end
38
+
data/lib/extractor.rb ADDED
@@ -0,0 +1,23 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+
4
module Shelob
  # Extracts the targets of all anchor tags from a fetched page.
  class Extractor
    # @param fetched [#url, #body] result of a page fetch; `body` is parsed
    #   as HTML and `url` is the base against which root-relative links are
    #   resolved
    def initialize(fetched)
      @fetched = fetched
    end

    # Collect the href of every <a> element in the page body.
    #
    # Anchors without an href attribute are skipped. Links beginning with
    # '/' are resolved against the fetched page's URL; every other link is
    # returned exactly as written in the page.
    #
    # @return [Array<String>] link targets in document order
    def extract
      page = Nokogiri::HTML(@fetched.body)
      # anchor['href'] is nil for <a> tags with no href (e.g. named anchors).
      hrefs = page.css('a').map { |anchor| anchor['href'] }.compact
      hrefs.map { |link| link.start_with?('/') ? absolutize(link) : link }
    end

    private

    # Resolve a link that starts with '/' against the fetched page's URL.
    # URI.join keeps the base URL's port and treats `//host/path` as a
    # scheme-relative reference. If the link is not a parseable URI, fall
    # back to plain scheme/host concatenation.
    def absolutize(link)
      URI.join(@fetched.url, link).to_s
    rescue URI::Error
      base = URI(@fetched.url)
      "#{base.scheme}://#{base.host}#{link}"
    end
  end
end
@@ -0,0 +1,13 @@
1
# Read-only record of a single fetched URL: where it was fetched from, the
# HTTP status that came back, and the response body.
#
# Only readers are exposed, so callers cannot reassign the fields after
# construction.
class LinkResult
  attr_reader :url, :status, :body

  # @param url [String] the URL that was requested
  # @param status [Integer] HTTP status code of the response
  # @param body [String, nil] response body
  def initialize(url, status, body)
    @url = url
    @status = status
    @body = body
  end

  # @return [String] "<status>: <url>", e.g. "404: http://example.com/x"
  def to_s
    "#{status}: #{url}"
  end
end
data/lib/resolver.rb ADDED
@@ -0,0 +1,16 @@
1
+ require 'link_result'
2
+ require 'net/http'
3
+
4
module Shelob
  # Fetches a single URL over HTTP and wraps the response in a LinkResult.
  class Resolver
    # @param url [String] the URL to fetch; parsed with Kernel#URI
    def initialize(url)
      @uri = URI(url)
    end

    # Issue a GET request for the URL given at construction.
    #
    # @return [LinkResult] the URL as a string, the response status as an
    #   Integer, and the response body
    def resolve
      response = Net::HTTP.get_response(@uri)

      # Net::HTTPResponse#code is a String such as "200"; convert it so
      # callers can compare numerically.
      LinkResult.new(@uri.to_s, response.code.to_i, response.body)
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module Shelob
2
+ VERSION = "0.1.0.beta1"
3
+ end
data/lib/shelob.rb ADDED
@@ -0,0 +1,64 @@
1
+ require "shelob/version"
2
+ require "resolver"
3
+ require "extractor"
4
+ require "set"
5
+
6
module Shelob
  # Crawls outward from a starting URL, fetching each discovered link whose
  # URL begins with the starting URL, and records every fetch whose HTTP
  # status is 400 or above.
  class Spider
    attr_accessor :hostname

    # @param hostname [String] starting URL; also the prefix a link must
    #   begin with in order to be followed
    # @param options [Hash] :verbose => 1 prints a dot per URL checked,
    #   :verbose => 2 prints each URL followed by "checked!"
    def initialize(hostname, options = {})
      @hostname = hostname
      @queue = [hostname]
      # Every URL that has been queued so far; used to avoid re-queuing.
      @urls = Set.new(@queue)
      @failures = []
      @verbose = options[:verbose] == 1
      @chatty = options[:verbose] == 2
    end

    # Process the queue until it is empty: fetch each URL, record it as a
    # failure when its status is >= 400, and queue any not-yet-seen links
    # found in its body that start with the starting URL.
    #
    # @return [Array] the fetch results whose status was 400 or above
    def check
      until @queue.empty?
        url = @queue.shift

        print '.' if @verbose
        print "#{url}... " if @chatty

        fetch = Resolver.new(url).resolve
        @failures << fetch if fetch.status >= 400

        Extractor.new(fetch).extract.each do |link|
          next unless link.start_with?(@hostname)

          # Mark a URL as seen at the moment it is queued. Set#add? returns
          # nil when the URL is already present, so a link that appears
          # twice on a page, or again on a later page while still waiting
          # in the queue, is only ever fetched once.
          @queue << link if @urls.add?(link)
        end

        puts "checked!" if @chatty
      end

      @failures
    end

    # @return [Integer] number of URLs still waiting to be fetched
    def remaining
      @queue.count
    end

    # @return [Integer] number of distinct URLs seen so far
    def requests
      @urls.count
    end

    # @return [Set<String>] the distinct URLs seen so far
    def fetched
      @urls
    end
  end
end
data/shelob.gemspec ADDED
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'shelob/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "shelob"
8
+ spec.version = Shelob::VERSION
9
+ spec.authors = ["Benjamin Nicholas"]
10
+ spec.email = ["bnicholas@brandnetworksinc.com"]
11
+ spec.description = %q{A giant spider that starts on a given page, finds all links on the page, ensure they resolve, and recurses if the link is underneath the starting url}
12
+ spec.summary = %q{Spider a site and check links}
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files`.split($/)
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.add_development_dependency "bundler", "~> 1.3"
21
+ spec.add_development_dependency "rake"
22
+ spec.add_development_dependency "minitest"
23
+ spec.add_development_dependency "webmock"
24
+ spec.add_development_dependency "guard"
25
+ spec.add_development_dependency "guard-minitest"
26
+
27
+ spec.add_runtime_dependency "nokogiri"
28
+ end
@@ -0,0 +1,37 @@
1
+ require 'minitest/autorun'
2
+ require 'extractor'
3
+ require 'link_result'
4
+
5
+ describe Shelob::Extractor, "Link extracting module" do
6
+
7
+ describe "when created" do
8
+ it "should be created with a LinkResult" do
9
+ le = LinkResult.new("http://google.com", 200, '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a></body></html>')
10
+ le.wont_be_nil
11
+ end
12
+ end
13
+
14
+ describe "when used" do
15
+ before do
16
+ @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
17
+ @result2 = LinkResult.new("http://google.com/something", 200, '<html><head><title>hi</title></head><body><a href="/about">about</a></body></html>')
18
+ @le = Shelob::Extractor.new(@result)
19
+ @le2 = Shelob::Extractor.new(@result2)
20
+ end
21
+
22
+ it "should return a list of the links in the page" do
23
+ extracts = @le.extract
24
+ extracts.must_be_kind_of Array
25
+ extracts.must_equal ["http://bing.com", "http://yahoo.com"]
26
+ end
27
+
28
+ it "should transform relative links to absolute" do
29
+ extracts = @le2.extract
30
+ extracts.must_be_kind_of Array
31
+ extracts.must_equal ["http://google.com/about"]
32
+ end
33
+
34
+ end # describe
35
+
36
+ end # describe
37
+
@@ -0,0 +1,29 @@
1
+ require 'minitest/autorun'
2
+ require 'link_result'
3
+
4
+ describe LinkResult, "Link fetch result" do
5
+ before do
6
+ @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
7
+ end
8
+
9
+ describe "when created" do
10
+ it "should take three arguments" do
11
+ @result.wont_be_nil
12
+ end
13
+
14
+ it "should save arguments" do
15
+ @result.url.must_equal "http://google.com"
16
+ @result.status.must_equal 200
17
+ @result.body.must_equal '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>'
18
+ end
19
+
20
+ it "should be immutable" do
21
+ proc { @result.status = 404 }.must_raise NoMethodError
22
+ end
23
+
24
+ it "should have a clean string rep" do
25
+ @result.to_s.must_equal "200: http://google.com"
26
+ end
27
+ end
28
+ end
29
+
@@ -0,0 +1,31 @@
1
+ require 'minitest/autorun'
2
+ require 'webmock/minitest'
3
+ require 'resolver'
4
+ require 'link_result'
5
+
6
+ WebMock.allow_net_connect!
7
+
8
+ describe Shelob::Resolver, "Link fetching module" do
9
+ describe "when created" do
10
+ it "should be created with a url" do
11
+ Shelob::Resolver.new("http://bmnick.com/ruby-c-extensions")
12
+ end
13
+ end
14
+
15
+ describe "when used" do
16
+ before do
17
+ @resolver = Shelob::Resolver.new("http://bmnick.com/ruby-c-extensions")
18
+ @result = @resolver.resolve
19
+ end
20
+
21
+ it "should return a LinkResult" do
22
+ @result.must_be_kind_of LinkResult
23
+ end
24
+
25
+ it "should return live result" do
26
+ @result.body.must_match(/CExt/)
27
+ end
28
+
29
+ end
30
+
31
+ end
@@ -0,0 +1,97 @@
1
+ require 'minitest/autorun'
2
+ require 'webmock/minitest'
3
+ require 'shelob'
4
+
5
+ # Stub out requests
6
+
7
+ describe Shelob, "Link checking module" do
8
+ describe "when created" do
9
+ it "should exist" do
10
+ Shelob.wont_be_nil
11
+ end
12
+ end
13
+ end
14
+
15
+ describe Shelob::Spider, "Link checking spider" do
16
+ before do
17
+ stub_request(:any, 'http://bmnick.com/resume').to_return(body: '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a><a href="http://bmnick.com/resume/secret"</body></html>')
18
+ stub_request(:any, 'http://bmnick.com/').to_return(status: 200, body: '<html><head><title>pdf</title></head><body><a href="http://bmnick.com/resume/">resume</a><a href="http://bmnick.com/">home</a><a href="http://bmnick.com/resume/secret">no touchy!</a></body></html>')
19
+ stub_request(:any, 'http://bmnick.com/resume/secret').to_return(body: '<html><head><title>secrets</title></head><body><a href="http://bmnick.com/resume/boring">boredom</a><a href="http://bmnick.com/resume">resume</a><a href="/resume/relative">relative</a></body></html>"')
20
+ stub_request(:any, 'http://bmnick.com/resume/resume.pdf').to_return(status: 404)
21
+ stub_request(:any, 'http://bmnick.com/resume/boring').to_return(status: 500)
22
+ stub_request(:any, 'http://bmnick.com/resume/relative').to_return(status: 204)
23
+ end
24
+ describe "when created" do
25
+ it "should exist" do
26
+ Shelob::Spider.wont_be_nil
27
+ end
28
+ it "should store the initial url" do
29
+ spider = Shelob::Spider.new("https://openforum.com")
30
+ spider.wont_be_nil
31
+ spider.hostname.must_equal "https://openforum.com"
32
+ end
33
+ end
34
+ describe "when checking links" do
35
+ before do
36
+
37
+ @spider = Shelob::Spider.new("http://bmnick.com/resume")
38
+ @results = @spider.check
39
+ end
40
+
41
+ it "should return an array from check" do
42
+ @results.must_be_kind_of Array
43
+ end
44
+ it "should return only error links" do
45
+ @results.select{|r| r.status == 200}.must_be_empty
46
+ end
47
+ it "should provide remaining counts" do
48
+ @spider.remaining.must_equal 0
49
+ end
50
+ it "should fetch the original url" do
51
+ @spider.fetched.must_include "http://bmnick.com/resume"
52
+ end
53
+ it "should provide a number of urls fetched" do
54
+ # http://bmnick.com/resume
55
+ # http://bmnick.com/resume/resume.pdf
56
+ # http://bmnick.com/resume/secret
57
+ # http://bmnick.com/resume/boring
58
+ # http://bmnick.com/resume/relative
59
+ @spider.requests.must_equal 5
60
+ end
61
+ it "should make a web request for the original url" do
62
+ assert_requested :get, "http://bmnick.com/resume"
63
+ end
64
+ it "should make a web request for child urls" do
65
+ # 404
66
+ assert_requested :get, "http://bmnick.com/resume/resume.pdf"
67
+ @spider.fetched.must_include "http://bmnick.com/resume/resume.pdf"
68
+
69
+ # successful
70
+ assert_requested :get, "http://bmnick.com/resume/secret"
71
+ @spider.fetched.must_include "http://bmnick.com/resume/secret"
72
+ end
73
+ it "should return the failed request" do
74
+ # http://bmnick.com/resume/resume.pdf => 404
75
+ # http://bmnick.com/resume/boring => 500
76
+ @results.count.must_equal 2
77
+ end
78
+ it "shouldn't request pages without the prefix" do
79
+ assert_not_requested :get, "http://bmnick.com"
80
+ end
81
+ it "shouldn't request pages multiple times" do
82
+ assert_requested :get, "http://bmnick.com/resume", times: 1
83
+ end
84
+ it "should continue to spider down the page" do
85
+ assert_requested :get, "http://bmnick.com/resume/boring"
86
+ @spider.fetched.must_include "http://bmnick.com/resume/boring"
87
+ end
88
+ it "should support relative links" do
89
+ assert_requested :get, "http://bmnick.com/resume/relative"
90
+ @spider.fetched.must_include "http://bmnick.com/resume/relative"
91
+ end
92
+ it "should format a string cleanly" do
93
+ @results.map{|r|r.to_s}.join("\n").must_equal "404: http://bmnick.com/resume/resume.pdf
94
+ 500: http://bmnick.com/resume/boring"
95
+ end
96
+ end
97
+ end
metadata ADDED
@@ -0,0 +1,165 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: shelob
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0.beta1
5
+ platform: ruby
6
+ authors:
7
+ - Benjamin Nicholas
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: webmock
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: guard
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: guard-minitest
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: nokogiri
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: A giant spider that starts on a given page, finds all links on the page,
112
+ ensure they resolve, and recurses if the link is underneath the starting url
113
+ email:
114
+ - bnicholas@brandnetworksinc.com
115
+ executables:
116
+ - shelob
117
+ extensions: []
118
+ extra_rdoc_files: []
119
+ files:
120
+ - .gitignore
121
+ - Gemfile
122
+ - Guardfile
123
+ - LICENSE.txt
124
+ - README.md
125
+ - Rakefile
126
+ - bin/shelob
127
+ - lib/extractor.rb
128
+ - lib/link_result.rb
129
+ - lib/resolver.rb
130
+ - lib/shelob.rb
131
+ - lib/shelob/version.rb
132
+ - shelob.gemspec
133
+ - test/test_extractor.rb
134
+ - test/test_link_result.rb
135
+ - test/test_resolver.rb
136
+ - test/test_shelob.rb
137
+ homepage:
138
+ licenses:
139
+ - MIT
140
+ metadata: {}
141
+ post_install_message:
142
+ rdoc_options: []
143
+ require_paths:
144
+ - lib
145
+ required_ruby_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ required_rubygems_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - '>'
153
+ - !ruby/object:Gem::Version
154
+ version: 1.3.1
155
+ requirements: []
156
+ rubyforge_project:
157
+ rubygems_version: 2.0.3
158
+ signing_key:
159
+ specification_version: 4
160
+ summary: Spider a site and check links
161
+ test_files:
162
+ - test/test_extractor.rb
163
+ - test/test_link_result.rb
164
+ - test/test_resolver.rb
165
+ - test/test_shelob.rb