utterson 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ef4a4e14d3f5b03451d25de3bbd09e7a977ec2b4
4
+ data.tar.gz: 5fb7fe1675f420b24267cc78e937bf98db2e4d8a
5
+ SHA512:
6
+ metadata.gz: 3244214395c560032c66ca75d82fe73c319bf1ac527783bb263a9e1676f3910057fc83ae529ba4db99f4ccbc7691ed367d2c5fea9ec514b237f079ad965a04e5
7
+ data.tar.gz: cc7d65f5cf0689d10ed99a65d62bcc5a1ee82589f2eebd2aefba9a533f5cfc1861ba49b33bd8e6fb0107191fc035dbad07e274b01f9fa6acb82c4861a786ce14
@@ -0,0 +1,8 @@
1
+ A simple utility that traverses a directory of HTML files and checks the links in them.
2
+
3
+ ## Why the name?
4
+
5
+ I developed this to help me check links in my Jekyll-powered blog,
6
+ and Mr. Utterson is the main character in the [Strange Case of Dr
7
+ Jekyll and Mr
8
+ Hyde](https://en.wikipedia.org/wiki/Strange_Case_of_Dr_Jekyll_and_Mr_Hyde).
@@ -0,0 +1,12 @@
1
#!/usr/bin/env ruby
# Command-line entry point for Utterson: checks every link found in the HTML
# files below the directory given as the first argument.
#
# Usage: utterson [--root DIR] [TARGET_DIR]

# Make the bundled library loadable when running straight from a checkout.
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)

require 'trollop'
require 'utterson'

opts = Trollop.options do
  opt :root, "Root directory for the site if it differs from target dir", type: :string
end

# ARGV[0] is nil when no directory is given; it is passed through as :dir.
Utterson.new(opts.merge(dir: ARGV[0])).check
@@ -0,0 +1,105 @@
1
+ require 'nokogiri'
2
+
3
+ require 'net/http'
4
+ require 'timeout'
5
+
6
# Link checker that walks a directory of HTML files, collects every +src+ and
# +href+ attribute value and verifies each target: local targets are looked up
# on disk, remote ones are probed with an HTTP HEAD request.
class Utterson
  # @return [Hash{String => Hash{String => Object}}] problems found so far,
  #   keyed by file name and then by URL; each value is either a message
  #   String or the HTTP response object that was treated as a failure
  attr_reader :errors

  # @param opts [Hash] configuration
  # @option opts [String] :dir  directory to scan (defaults to "./")
  # @option opts [String] :root directory that URLs starting with "/" are
  #   resolved against (defaults to the value used for :dir)
  def initialize(opts={})
    @dir = opts[:dir] || './'
    @root = opts[:root] || @dir
    @errors = {}
    @checked_urls = {}
    @stats = {errors: 0, files: 0, urls: 0}
  end

  # Scans every *.html / *.htm file below the target directory, checks each
  # URL found in it and finally prints a report to standard output.
  def check
    Dir.glob(File.join(@dir, '**/*.{html,htm}')) do |f|
      @stats[:files] += 1
      puts "Checking #{f}"
      collect_uris_from(f).each do |u|
        @stats[:urls] += 1
        check_uri(u, f)
      end
    end
    print_results
  end

  # @param f [String] path of the HTML file to parse
  # @return [Array<String>] all +src+ and +href+ attribute values in the file
  def collect_uris_from(f)
    ret = []
    doc = Nokogiri::HTML(File.read(f))
    doc.traverse do |el|
      ret << el['src'] unless el['src'].nil?
      ret << el['href'] unless el['href'].nil?
    end
    ret
  end

  # Dispatches +url+ to the remote or local checker. Every distinct URL is
  # checked only once, so a problem is recorded for the first file using it.
  #
  # @param url  [String] URL as written in the document
  # @param file [String] file in which the URL was found
  def check_uri(url, file)
    return if @checked_urls[url]

    # "http://", "https://" and protocol-relative "//host" are remote.
    if url =~ %r{\A(https?:)?//}
      check_remote_uri url, file
    else
      check_local_uri url, file
    end
    @checked_urls[url] = true
  end

  # Sends a HEAD request to +url+ and records an error unless the response
  # status is 2xx or 3xx. Unparsable URLs and network failures are recorded
  # as errors as well instead of aborting the whole run.
  #
  # @param url  [String] absolute or protocol-relative ("//host/path") URL
  # @param file [String] file in which the URL was found
  def check_remote_uri(url, file)
    begin
      # Protocol-relative URLs are probed over plain HTTP.
      uri = URI(url.sub(%r{\A//}, 'http://'))
    rescue URI::InvalidURIError => e
      # +uri+ was never assigned here, so report the text we were given.
      return add_error(file, url, e.message)
    end

    # Keep the query string: it is part of the resource being requested.
    target = uri.path.empty? ? '/' : uri.path
    target = "#{target}?#{uri.query}" if uri.query

    begin
      response = Net::HTTP.start(uri.host, uri.port,
                                 use_ssl: uri.scheme == 'https') do |http|
        http.head(target)
      end
      add_error(file, uri.to_s, response) if response.code =~ /\A[^23]/
    rescue Timeout::Error
      add_error(file, uri.to_s, "Reading buffer timed out")
    rescue Errno::ETIMEDOUT
      add_error(file, uri.to_s, "Connection timed out")
    rescue SocketError, SystemCallError => e
      # Name resolution failures and other OS-level errors such as a
      # refused or reset connection.
      add_error(file, uri.to_s, e.message)
    end
  end

  # Verifies that a local URL points at an existing file or directory.
  # The query string and fragment are ignored. URLs starting with "/" are
  # resolved against the root directory, all others against the directory
  # containing +file+. The caller's string is left untouched.
  #
  # @param url  [String] relative or root-relative URL
  # @param file [String] file in which the URL was found
  def check_local_uri(url, file)
    target = url.sub(/[?#].*\z/m, '')
    path = if target.start_with?('/')
             File.expand_path(".#{target}", @root)
           else
             File.expand_path(target, File.dirname(file))
           end
    add_error(file, target, "File not found") unless File.exist?(path)
  end

  # Records a problem and bumps the error counter.
  #
  # @param file     [String] file in which the URL was found
  # @param url      [String] offending URL
  # @param response [Object] message String or failed HTTP response
  def add_error(file, url, response)
    @stats[:errors] += 1
    (@errors[file] ||= {})[url] = response
  end

  # Prints every recorded problem grouped by file, followed by a one-line
  # summary of how many files and URLs were processed.
  def print_results
    @errors.each do |file, info|
      puts file
      info.each do |url, response|
        # HTTP responses are summarised by status code, messages shown as is.
        s = response.respond_to?(:code) ? "HTTP #{response.code}" : response
        puts "\t#{url}\n\t\t#{s}"
      end
    end
    if @stats[:errors] == 0
      puts "#{@stats[:files]} files with #{@stats[:urls]} urls checked."
    else
      puts "#{@stats[:files]} files with #{@stats[:urls]} urls checked and #{@stats[:errors]} errors found."
    end
  end

end
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ <link type="text/css" rel="stylesheet" href="style.css" />
6
+ <script type="text/javascript" src="script.js"></script>
7
+ </head>
8
+ <body>
9
+ <h1>I'm a heading!</h1>
10
+
11
+ <p>I'm an <a href="http://example.com">example link</a>.</p>
12
+
13
+ <img src="image.jpg" />
14
+ </body>
15
+ </html>
@@ -0,0 +1,20 @@
1
+ #require 'rubygems'
2
+
3
+ require 'webmock/rspec'
4
+
5
+ require 'simplecov'
6
+ SimpleCov.start
7
+
8
+ require 'utterson'
9
+
10
+ require 'stringio'
11
+
12
# Runs the given block with $stdout temporarily redirected to an in-memory
# buffer and returns everything the block printed.
#
# @yield code whose standard output should be captured
# @return [String] text written to $stdout while the block ran
#   (an empty string when no block is given)
def capture_stdout
  original_stdout = $stdout
  buffer = StringIO.new
  $stdout = buffer
  yield if block_given?
  buffer.string
ensure
  # Always restore the real stream, even when the block raises.
  $stdout = original_stdout
end
@@ -0,0 +1,164 @@
1
+ require 'spec_helper'
2
+
3
# Behavioural specs for Utterson. HTTP requests are stubbed with WebMock's
# stub_request, and the files below spec/fixtures serve as sample input.
describe Utterson do
  it "should go through all htm and html files in target dir" do
    u = Utterson.new(dir: "spec/fixtures/dir-structure")
    u.stub(:collect_uris_from) {[]}

    ["spec/fixtures/dir-structure/1.htm",
     "spec/fixtures/dir-structure/2.html",
     "spec/fixtures/dir-structure/a/3.htm",
     "spec/fixtures/dir-structure/a/b/4.html"].each do |file|
      u.should_receive(:collect_uris_from).with(file)
    end

    u.check
  end

  it "should check all urls which are found" do
    u = Utterson.new(dir: "spec/fixtures")
    u.stub(:check_uri) {}
    u.should_receive(:check_uri).exactly(4).times

    u.check
  end

  it "should find all uris from sample document" do
    u = Utterson.new
    uris = u.collect_uris_from("spec/fixtures/sample.html")
    uris.should include("script.js")
    uris.should include("style.css")
    uris.should include("http://example.com")
    uris.should include("image.jpg")
  end

  describe "#check_uri" do
    let(:u) {Utterson.new}

    it "should use remote checking for http protocol" do
      u.stub(:check_remote_uri) {}
      u.should_receive(:check_remote_uri).with("http://example.com", "file.html")
      u.check_uri("http://example.com", "file.html")
    end

    it "should use remote checking for https protocol" do
      u.stub(:check_remote_uri) {}
      u.should_receive(:check_remote_uri).with("https://example.com", "file.html")
      u.check_uri("https://example.com", "file.html")
    end

    it "should use remote checking when only // is specified" do
      u.stub(:check_remote_uri) {}
      u.should_receive(:check_remote_uri).with("//example.com", "file.html")
      u.check_uri("//example.com", "file.html")
    end

    it "should use local checking for relative uris" do
      u.stub(:check_local_uri) {}
      u.should_receive(:check_local_uri).with("../file.html", "file.html")
      u.check_uri("../file.html", "file.html")
    end
  end

  describe "#check_local_uri" do
    let(:u) {Utterson.new(dir: "spec/fixtures/dir-structure")}

    it "should not assign error info if file exists" do
      u.check_local_uri("../sample.html", "spec/fixtures/dir-structure/1.htm")
      u.errors.should be_empty
    end

    it "should assign error info if file doesn't exist" do
      u.check_local_uri("../sample_not_found.html", "spec/fixtures/dir-structure/1.htm")
      u.errors["spec/fixtures/dir-structure/1.htm"].should == {"../sample_not_found.html" => "File not found"}
    end

    it "should use root directory when urls start with /" do
      u2 = Utterson.new(dir: "spec/fixtures/dir-structure", root: "spec/fixtures")
      u2.check_local_uri("/sample.html", "spec/fixtures/dir-structure/1.htm")
      u2.errors.should be_empty
    end

    it "should handle target directory as root for urls starting with / if root is not available" do
      u.check_local_uri("/2.html", "spec/fixtures/dir-structure/1.htm")
      u.errors.should be_empty
    end

    it "should ignore query string when checking local files" do
      u.check_local_uri("2.html?queryparam=value", "spec/fixtures/dir-structure/1.htm")
      u.errors.should be_empty
    end
  end

  describe "#check_remote_uri" do
    let(:u) {Utterson.new}

    it "should not assign error info if request is successful" do
      stub_request(:head, "http://example.com/index.html").
        with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
        to_return(:status => 200, :body => "", :headers => {})
      u.check_remote_uri("http://example.com/index.html", "test.html")
      u.errors.should be_empty
    end

    it "should assign error info if there is error response" do
      stub_request(:head, "http://example.com/404.html").
        with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
        to_return(:status => 404, :body => "", :headers => {})
      u.check_remote_uri("http://example.com/404.html", "test.html")
      u.errors["test.html"].should_not be_empty
      u.errors["test.html"]["http://example.com/404.html"].should be_an_instance_of(Net::HTTPNotFound)
    end

    it "should add error status from buffer timeouts" do
      stub_request(:head, "http://example.com/index.html").to_timeout
      u.check_remote_uri("http://example.com/index.html", "test.html")
      u.errors.should_not be_empty
    end

    it "should add error status from connection timeouts" do
      stub_request(:head, "http://example.com/index.html").to_raise(Errno::ETIMEDOUT)
      u.check_remote_uri("http://example.com/index.html", "test.html")
      u.errors.should_not be_empty
    end

    it "should add error status from name resolution errors" do
      stub_request(:head, "http://example.com/index.html").
        to_raise(SocketError.new('getaddrinfo: Name or service not known'))
      u.check_remote_uri("http://example.com/index.html", "test.html")
      u.errors.should_not be_empty
    end

    it "should add error status when invalid URI" do
      # A space is never valid inside a URI, so parsing fails on its own and
      # no stubbing of the URI module is needed.
      u.check_remote_uri("http://invalid uri", "test.html")
      u.errors.should_not be_empty
    end
  end

  describe "#print_results" do
    it "should output only basic stats if no errors" do
      u = Utterson.new(dir: "spec/fixtures/dir-structure")
      output = capture_stdout do
        u.check
      end
      output.should match(/4 files with 0 urls checked/)
    end

    it "should output error information" do
      stub_request(:head, "http://example.com/").
        with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
        to_return(:status => 404, :body => "", :headers => {})
      u = Utterson.new(dir: "spec/fixtures")
      output = capture_stdout do
        u.check
      end
      output.should match("spec/fixtures/sample.html\n\tstyle.css\n\t\tFile not found")
      output.should match("script.js\n\t\tFile not found")
      output.should match("image.jpg\n\t\tFile not found")
      output.should match("http://example.com\n\t\tHTTP 404")
      output.should match("5 files with 4 urls checked and 4 errors found")
    end
  end
end
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: utterson
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Juhamatti Niemelä
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-11-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: trollop
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: webmock
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: 1.15.2
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 1.15.2
97
+ description: Traverses all HTML files from given directory and checks links found
98
+ in them.
99
+ email: iiska@iki.fi
100
+ executables:
101
+ - utterson
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - bin/utterson
106
+ - lib/utterson.rb
107
+ - README.md
108
+ - spec/spec_helper.rb
109
+ - spec/fixtures/dir-structure/a/3.htm
110
+ - spec/fixtures/dir-structure/a/b/4.html
111
+ - spec/fixtures/dir-structure/2.html
112
+ - spec/fixtures/dir-structure/1.htm
113
+ - spec/fixtures/sample.html
114
+ - spec/utterson_spec.rb
115
+ homepage: https://github.com/iiska/utterson
116
+ licenses:
117
+ - MIT
118
+ metadata: {}
119
+ post_install_message:
120
+ rdoc_options: []
121
+ require_paths:
122
+ - lib
123
+ required_ruby_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ~>
126
+ - !ruby/object:Gem::Version
127
+ version: '2.0'
128
+ required_rubygems_version: !ruby/object:Gem::Requirement
129
+ requirements:
130
+ - - '>='
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
133
+ requirements: []
134
+ rubyforge_project:
135
+ rubygems_version: 2.0.3
136
+ signing_key:
137
+ specification_version: 4
138
+ summary: Friendly HTML crawler and url checker
139
+ test_files:
140
+ - spec/spec_helper.rb
141
+ - spec/fixtures/dir-structure/a/3.htm
142
+ - spec/fixtures/dir-structure/a/b/4.html
143
+ - spec/fixtures/dir-structure/2.html
144
+ - spec/fixtures/dir-structure/1.htm
145
+ - spec/fixtures/sample.html
146
+ - spec/utterson_spec.rb
147
+ has_rdoc: