robinson 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. data/Gemfile +5 -0
  2. data/bin/robinson +3 -0
  3. data/lib/robinson.rb +157 -0
  4. metadata +70 -0
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ # A sample Gemfile
2
+ source "https://rubygems.org"
3
+
4
+ gem "smart_colored"
5
+ gem "anemone"
data/bin/robinson ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+ require 'robinson'
3
+ robinson_main(ARGV)
data/lib/robinson.rb ADDED
@@ -0,0 +1,157 @@
1
+ #!/bin/ruby
2
+
3
# Parses the robinson command line, validates it and starts the crawl.
#
# Expected arguments: <host>[:<port>] [--ignoring <ignorepath> [...]]
class Invocation
  # Flag that introduces the list of paths to leave out of the crawl.
  IGNORING_FLAG = '--ignoring'.freeze

  # args - Array of command-line argument Strings (e.g. ARGV).
  #
  # The ignored paths are whatever follows the --ignoring flag, wherever the
  # flag appears, instead of "everything from index 2 onwards". Arguments that
  # sit between the host and the flag (or after the host when there is no
  # flag) are remembered so check_args can reject them rather than silently
  # dropping them or treating them as ignored paths.
  def initialize(args)
    flag_index = args.index(IGNORING_FLAG)
    host = args.first

    # A leading --ignoring means no host was given at all.
    @address = (host.nil? || host == IGNORING_FLAG) ? '' : host
    @ignoring_pages = !flag_index.nil?
    @ignored_pages = flag_index ? args.drop(flag_index + 1) : []
    @unexpected = flag_index ? (args[1...flag_index] || []) : args.drop(1)
  end

  # Validates the arguments (exiting with status 1 and a usage message when
  # they are invalid) and then hands over to Robinson.crawl.
  def execute
    check_args
    Robinson.crawl @address, @ignored_pages
  end

  private

  # Calls usage (which terminates the process) on the first problem found.
  def check_args
    usage('only accepts website server host[:port], not paths') if @address.include?('/')
    usage('you need to pass in the website server host[:port]') if @address.empty?
    usage("unexpected arguments: #{@unexpected.join(' ')}") unless @unexpected.empty?
    return unless @ignoring_pages

    usage('you need to specify the paths of the pages to ignore') if @ignored_pages.empty?
    return if @ignored_pages.all? { |path| path.start_with?('/') }

    usage("the ignored pages' paths must all start with / character")
  end

  # Prints an optional complaint followed by the usage text, then exits with
  # status 1.
  #
  # weirdness - String describing what was wrong; empty for plain usage.
  def usage(weirdness = '')
    puts "\nSorry, #{weirdness}\n\n" unless weirdness.empty?
    puts "Usage: ./robinson <host>[:<port>] [--ignoring <ignorepath> [...]]"
    puts "  e.g. ./robinson www.example.com"
    puts "  e.g. ./robinson localhost:8080 --ignoring /blogfeed /external_content"
    exit 1
  end
end
37
+
38
+
39
+
40
+ require 'anemone'
41
+ require 'smart_colored'
42
+
43
# Wraps a link URI and answers whether it points at the website being checked.
class Link
  # uri - an object responding to #host and #port (e.g. a URI).
  def initialize(uri)
    @uri = uri
  end

  # Returns true when the link's host:port equals the given address.
  #
  # address - String "host" or "host:port"; port 80 is assumed when omitted.
  #
  # Host names are compared case-insensitively. A URI without a host
  # (e.g. mailto:) is never on the website, so false is returned instead of
  # raising NoMethodError on the nil host.
  def on_website?(address)
    own = host_and_port
    return false if own.nil?

    own == host_and_port_of(address).downcase
  end

  private

  # Normalizes the address to "host:port" form, defaulting the port to 80.
  def host_and_port_of(address)
    address.include?(':') ? address : "#{address}:80"
  end

  # Returns the link's "host:port" in lower case, or nil when it has no host.
  def host_and_port
    host = @uri.host
    return nil if host.nil?

    "#{host.downcase}:#{@uri.port}"
  end
end
59
+
60
# Base reporter: prints every visited page and always reports success.
class Reporter
  # Called for every link seen on a page; the base reporter ignores it.
  def on_see_link(uri)
  end

  # Called for every visited page; asks the page to print itself.
  def on_visit(page)
    page.puts
  end

  # Exit status to finish the process with.
  def exit_code
    0
  end
end

# Reporter that additionally prints every link it sees.
class NoisyReporter < Reporter
  def on_see_link(uri)
    puts "seen: #{uri}"
  end
end

# Reporter that sorts visited pages into broken and OK ones and derives the
# exit status from the number of broken pages.
class InvestigativeReporter < Reporter
  # Largest exit status used; larger values would wrap modulo 256 on POSIX
  # systems, so e.g. 256 broken links would otherwise be reported as success.
  MAX_EXIT_STATUS = 255

  def initialize
    @broken = []
    @ok = []
  end

  # Records the page as broken or OK (per page.broken?) and prints it.
  def on_visit(page)
    (page.broken? ? @broken : @ok) << page
    page.puts
  end

  # Prints a summary and returns 0 when nothing is broken, otherwise the
  # number of broken pages capped at MAX_EXIT_STATUS.
  def exit_code
    @broken.empty? ? success : failure
  end

  # Prints the all-clear summary and returns 0.
  def success
    puts "\nAll links (#{@ok.size}) check out OK."
    0
  end

  # Prints the broken pages and returns the (capped) broken count.
  def failure
    # "out of" refers to every page visited, not just the OK ones.
    total = @broken.size + @ok.size
    puts "\nBroken links (#{@broken.size} out of #{total}):"
    @broken.each { |page| page.puts }
    [@broken.size, MAX_EXIT_STATUS].min
  end
end
103
+
104
# Presentation wrapper around a crawled page: decides whether the page is
# broken and prints a coloured one-line log entry for it.
class Page
  # anemone_page - object responding to #url, #code and #referer.
  def initialize(anemone_page)
    @page = anemone_page
  end

  # Writes the page's log line to $stdout: red for broken pages, green
  # otherwise. ($stdout is used explicitly because this method shadows
  # Kernel#puts inside the class.)
  def puts
    line =
      if unfetchable?
        pagelog("BROKEN!!", 'no http response').colored.red
      elsif broken?
        pagelog("BROKEN!!", @page.code).colored.red
      else
        pagelog("checked", @page.code).colored.green
      end
    $stdout.puts(line)
  end

  # Builds the log line text from a label and a status code/description.
  def pagelog(what, code)
    "#{what}: #{@page.url} - #{code} (referer: #{@page.referer})"
  end

  # True when the page has no response code at all.
  def unfetchable?
    @page.code.nil?
  end

  # True when the page is unfetchable or its code is 400 or above.
  def broken?
    unfetchable? || @page.code >= 400
  end
end
130
+
131
# Entry point for the link check itself.
class Robinson
  # Crawls http://<address>, reports every visited page to the reporter, then
  # terminates the process with the reporter's exit code.
  #
  # address       - String "host" or "host:port" of the website to check.
  # ignored_paths - Array of path Strings; links whose path is listed here
  #                 are filtered out.
  # reporter      - object responding to on_see_link, on_visit and exit_code.
  #
  # Does not return: it always ends by calling exit.
  def self.crawl(address, ignored_paths = [], reporter = InvestigativeReporter.new)
    puts "Website server to check: '#{address}', ignoring paths '#{ignored_paths.join(', ')}' - NB. only internal links will be checked"

    Anemone.crawl("http://#{address}") do |anemone|
      # The block's value is the filtered link list; presumably these are
      # the links Anemone goes on to follow - confirm against Anemone docs.
      anemone.focus_crawl do |page|
        page.links.each { |seen| reporter.on_see_link(seen) }
        page.links.reject do |uri|
          !Link.new(uri).on_website?(address) || ignored_paths.include?(uri.path)
        end
      end

      anemone.on_every_page do |anemone_page|
        reporter.on_visit(Page.new(anemone_page))
      end
    end

    status = reporter.exit_code
    puts "finished (#{status})"
    exit(status)
  end
end
152
+
153
# Command-line entry point: builds an Invocation from the given arguments
# and executes it.
#
# arguments - Array of command-line argument Strings (e.g. ARGV).
def robinson_main(arguments)
  invocation = Invocation.new(arguments)
  invocation.execute
end
156
+
157
+
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: robinson
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - damned
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-11 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: anemone
16
+ requirement: &18383740 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *18383740
25
+ - !ruby/object:Gem::Dependency
26
+ name: smart_colored
27
+ requirement: &18382440 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *18382440
36
+ description:
37
+ email:
38
+ executables:
39
+ - robinson
40
+ extensions: []
41
+ extra_rdoc_files: []
42
+ files:
43
+ - Gemfile
44
+ - lib/robinson.rb
45
+ - bin/robinson
46
+ homepage:
47
+ licenses: []
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ! '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 1.8.11
67
+ signing_key:
68
+ specification_version: 3
69
+ summary: A working website link checker
70
+ test_files: []