robinson 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +5 -0
- data/bin/robinson +3 -0
- data/lib/robinson.rb +157 -0
- metadata +70 -0
data/Gemfile
ADDED
data/bin/robinson
ADDED
data/lib/robinson.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
#!/bin/ruby
|
2
|
+
|
3
|
+
# Parses the robinson command line, validates it, and starts the crawl.
#
# Expected arguments: <host>[:<port>] [--ignoring <ignorepath> [...]]
class Invocation
  # Flag that introduces the list of paths to leave out of the crawl.
  IGNORING_FLAG = '--ignoring'

  # args - Array of command-line argument Strings (e.g. ARGV).
  #
  # The ignored paths are everything that follows the --ignoring flag,
  # wherever the flag appears. Arguments given without the flag are never
  # treated as ignored paths.
  def initialize(args)
    first = args.first
    # A leading --ignoring means the host was left out; treat it as missing
    # so check_args reports it instead of crawling "http://--ignoring".
    @address = (first.nil? || first == IGNORING_FLAG) ? '' : first
    flag_index = args.index(IGNORING_FLAG)
    @ignoring_pages = !flag_index.nil?
    @ignored_pages = flag_index ? args.drop(flag_index + 1) : []
  end

  # Validates the arguments (printing usage and exiting with status 1 when
  # they are invalid) and then hands over to Robinson.crawl.
  def execute
    check_args
    Robinson.crawl @address, @ignored_pages
  end

  private

  # Calls usage (which exits) on the first problem found with the arguments.
  def check_args
    usage('only accepts website server host[:port], not paths') if @address.include?('/')
    usage('you need to pass in the website server host[:port]') if @address.empty?
    return unless @ignoring_pages

    usage('you need to specify the paths of the pages to ignore') if @ignored_pages.empty?
    unless @ignored_pages.all? { |path| path.start_with?('/') }
      usage("the ignored pages' paths must all start with / character")
    end
  end

  # Prints an optional complaint followed by the usage text, then exits
  # the process with status 1.
  #
  # weirdness - String describing what was wrong with the arguments.
  def usage(weirdness = '')
    puts "\nSorry, #{weirdness}\n\n" unless weirdness.empty?
    puts "Usage: ./robinson <host>[:<port>] [--ignoring <ignorepath> [...]]"
    puts " e.g. ./robinson www.example.com"
    puts " e.g. ./robinson localhost:8080 --ignoring /blogfeed /external_content"
    exit 1
  end
end
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
require 'anemone'
|
41
|
+
require 'smart_colored'
|
42
|
+
|
43
|
+
# A URI seen on a crawled page, with a check for whether it points at the
# website being crawled.
class Link
  # uri - object responding to #host and #port (e.g. a URI).
  def initialize(uri)
    @uri = uri
  end

  # Returns true when this link's host and port match the given
  # "host[:port]" address (port 80 is assumed when the address has none).
  # Hosts are compared case-insensitively. Links without a host (for
  # example relative or non-hierarchical URIs) are never on the website.
  def on_website?(address)
    own = host_and_port
    return false if own.nil?

    own == host_and_port_of(address)
  end

  private

  # Normalises a "host[:port]" address to lower-case "host:port".
  def host_and_port_of(address)
    normalized = address.downcase
    normalized.include?(':') ? normalized : "#{normalized}:80"
  end

  # Returns lower-case "host:port" for the wrapped URI, or nil when the
  # URI has no host component.
  def host_and_port
    host = @uri.host
    return nil if host.nil?

    "#{host.downcase}:#{@uri.port}"
  end
end
|
59
|
+
|
60
|
+
# Baseline crawl reporter: ignores seen links, prints every visited page,
# and always reports a zero exit code.
class Reporter
  # Hook called for each link seen on a page; intentionally does nothing.
  def on_see_link(uri); end

  # Hook called for each visited page; asks the page to print itself.
  def on_visit(page)
    page.puts
  end

  # Exit status to finish the process with.
  def exit_code
    0
  end
end
|
70
|
+
|
71
|
+
# Reporter that additionally prints every link it is told about.
class NoisyReporter < Reporter
  # Writes a "seen: <uri>" line to standard output.
  def on_see_link(uri)
    $stdout.puts("seen: #{uri}")
  end
end
|
76
|
+
|
77
|
+
# Reporter that keeps track of which visited pages were broken and turns
# that into a summary and a process exit code.
class InvestigativeReporter < Reporter
  # Largest exit status that survives being handed to the operating system;
  # larger values are truncated to 8 bits, so 256 would read as success.
  MAX_EXIT_STATUS = 255

  def initialize
    @broken = []
    @ok = []
  end

  # Files the page under broken or ok (per page.broken?) and prints it.
  def on_visit(page)
    (page.broken? ? @broken : @ok) << page
    page.puts
  end

  # Prints the summary and returns 0 when nothing was broken, otherwise
  # the number of broken pages (capped at MAX_EXIT_STATUS).
  def exit_code
    @broken.empty? ? success : failure
  end

  # Prints the all-clear summary and returns 0.
  def success
    puts "\nAll links (#{@ok.size}) check out OK."
    0
  end

  # Prints the broken pages, counted against the total number of pages
  # visited, and returns the broken count capped at MAX_EXIT_STATUS.
  def failure
    total = @broken.size + @ok.size
    puts "\nBroken links (#{@broken.size} out of #{total}):"
    @broken.each { |page| page.puts }
    [@broken.size, MAX_EXIT_STATUS].min
  end
end
|
103
|
+
|
104
|
+
# Wraps a crawled page and knows how to classify and print it.
class Page
  # anemone_page - object responding to #code, #url and #referer.
  def initialize(anemone_page)
    @page = anemone_page
  end

  # Prints a one-line, coloured status report for the page to $stdout:
  # red for pages without a response or with a broken status, green otherwise.
  def puts
    line =
      if unfetchable?
        pagelog("BROKEN!!", 'no http response').colored.red
      elsif broken?
        pagelog("BROKEN!!", @page.code).colored.red
      else
        pagelog("checked", @page.code).colored.green
      end
    $stdout.puts(line)
  end

  # Builds the log line for this page.
  #
  # what - String label for the outcome.
  # code - status code (or substitute text) to show.
  def pagelog(what, code)
    format('%s: %s - %s (referer: %s)', what, @page.url, code, @page.referer)
  end

  # True when the page has no response code at all.
  def unfetchable?
    @page.code.nil?
  end

  # True when the page has no response code or a code of 400 and above.
  def broken?
    unfetchable? || @page.code >= 400
  end
end
|
130
|
+
|
131
|
+
# Entry point for crawling a website and checking its internal links.
class Robinson
  # Crawls "http://<address>" with Anemone, reporting through +reporter+,
  # then prints the final status and exits the process with it.
  #
  # address       - "host[:port]" String of the website to check.
  # ignored_paths - Array of path Strings that must not be followed.
  # reporter      - object receiving on_see_link / on_visit and supplying
  #                 exit_code.
  def self.crawl(address, ignored_paths = [], reporter = InvestigativeReporter.new)
    puts "Website server to check: '#{address}', ignoring paths '#{ignored_paths.join(', ')}' - NB. only internal links will be checked"

    Anemone.crawl("http://#{address}") do |anemone|
      anemone.focus_crawl do |page|
        # Every link is reported as seen, even if it is filtered out below.
        page.links.each { |seen| reporter.on_see_link(seen) }
        # Drop links that leave the website or whose path is ignored.
        page.links.reject { |uri|
          !Link.new(uri).on_website?(address) || ignored_paths.include?(uri.path)
        }
      end

      anemone.on_every_page do |anemone_page|
        reporter.on_visit(Page.new(anemone_page))
      end
    end

    status = reporter.exit_code
    puts "finished (#{status})"
    exit(status)
  end
end
|
152
|
+
|
153
|
+
# Command-line entry point: builds an Invocation from the given argument
# list and executes it.
def robinson_main(arguments)
  invocation = Invocation.new(arguments)
  invocation.execute
end
|
156
|
+
|
157
|
+
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: robinson
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- damned
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-03-11 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: anemone
|
16
|
+
requirement: &18383740 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *18383740
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: smart_colored
|
27
|
+
requirement: &18382440 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *18382440
|
36
|
+
description:
|
37
|
+
email:
|
38
|
+
executables:
|
39
|
+
- robinson
|
40
|
+
extensions: []
|
41
|
+
extra_rdoc_files: []
|
42
|
+
files:
|
43
|
+
- Gemfile
|
44
|
+
- lib/robinson.rb
|
45
|
+
- bin/robinson
|
46
|
+
homepage:
|
47
|
+
licenses: []
|
48
|
+
post_install_message:
|
49
|
+
rdoc_options: []
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ! '>='
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ! '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirements: []
|
65
|
+
rubyforge_project:
|
66
|
+
rubygems_version: 1.8.11
|
67
|
+
signing_key:
|
68
|
+
specification_version: 3
|
69
|
+
summary: A working website link checker
|
70
|
+
test_files: []
|