utterson 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ef4a4e14d3f5b03451d25de3bbd09e7a977ec2b4
4
+ data.tar.gz: 5fb7fe1675f420b24267cc78e937bf98db2e4d8a
5
+ SHA512:
6
+ metadata.gz: 3244214395c560032c66ca75d82fe73c319bf1ac527783bb263a9e1676f3910057fc83ae529ba4db99f4ccbc7691ed367d2c5fea9ec514b237f079ad965a04e5
7
+ data.tar.gz: cc7d65f5cf0689d10ed99a65d62bcc5a1ee82589f2eebd2aefba9a533f5cfc1861ba49b33bd8e6fb0107191fc035dbad07e274b01f9fa6acb82c4861a786ce14
@@ -0,0 +1,8 @@
1
+ A simple utility that traverses a directory of HTML files and checks the links in them.
2
+
3
+ ## Why the name?
4
+
5
+ I developed this to help me check links in my Jekyll-powered blog,
6
+ and Mr. Utterson is the main character in the [Strange Case of Dr
7
+ Jekyll and Mr
8
+ Hyde](https://en.wikipedia.org/wiki/Strange_Case_of_Dr_Jekyll_and_Mr_Hyde).
@@ -0,0 +1,12 @@
1
#!/usr/bin/env ruby

# Command-line entry point for Utterson.
#
# Usage: utterson [--root DIR] [TARGET_DIR]
#
# The shebang goes through /usr/bin/env because /bin/env is not present on
# every system.

# Put the sibling lib directory on the load path so the script also works
# when it is run straight from a source checkout.
$:.unshift File.join(File.dirname(__FILE__), *%w{ .. lib })

require 'trollop'
require 'utterson'

opts = Trollop::options do
  opt :root, "Root directory for the site if it differs from target dir", type: :string
end

# The first positional argument is the directory to scan. When it is missing
# :dir is nil and Utterson falls back to the current directory.
Utterson.new(opts.merge(dir: ARGV[0])).check
@@ -0,0 +1,105 @@
1
+ require 'nokogiri'
2
+
3
+ require 'net/http'
4
+ require 'timeout'
5
+
6
# Walks a directory tree of HTML files and verifies that every `src` and
# `href` target found in them can be reached.
#
# Remote URLs (http, https and protocol-relative) are probed with an HTTP
# HEAD request. URLs without a scheme are resolved against the file system.
# URLs with any other scheme (mailto:, javascript:, data:, ...) cannot be
# verified either way and are skipped.
#
# Problems are collected in #errors, keyed first by the referring file and
# then by URL. The value is either a Net::HTTPResponse or a message String.
class Utterson
  # http://, https:// and protocol-relative (//host/path) URLs.
  REMOTE_URI = %r{\A(https?:)?//}i
  # Any URI that carries an explicit scheme, e.g. mailto: or javascript:.
  SCHEME_URI = /\A[a-z][a-z0-9+.-]*:/i

  attr_reader :errors

  # opts - Hash of options:
  #        :dir  - directory to scan (defaults to './')
  #        :root - directory that URLs starting with "/" are resolved
  #                against (defaults to :dir)
  def initialize(opts={})
    @dir = opts[:dir] || './'
    @root = opts[:root] || @dir
    @errors = {}
    @checked_urls = {}
    @stats = {errors: 0, files: 0, urls: 0}
  end

  # Checks every .html/.htm file below the target directory and prints a
  # report to standard output.
  def check
    Dir.glob(File.join(@dir, '**/*.{html,htm}')) do |f|
      @stats[:files] += 1
      puts "Checking #{f}"
      collect_uris_from(f).each do |u|
        @stats[:urls] += 1
        check_uri(u, f)
      end
    end
    print_results
  end

  # Returns an Array with the value of every `src` and `href` attribute in
  # the HTML file f, in document traversal order.
  def collect_uris_from(f)
    uris = []
    Nokogiri::HTML(File.read(f)).traverse do |node|
      %w[src href].each do |attribute|
        uris << node[attribute] unless node[attribute].nil?
      end
    end
    uris
  end

  # Checks a single URL found in file, unless it has been checked already.
  #
  # Each URL is only checked once, so an error is reported for the first
  # file that references it. Relative URLs are remembered per directory of
  # the referring file, because the same relative URL points to a different
  # target when it is used from a different directory.
  def check_uri(url, file)
    key = cache_key(url, file)
    return if @checked_urls[key]

    if url =~ REMOTE_URI
      check_remote_uri url, file
    elsif url !~ SCHEME_URI
      check_local_uri url, file
    end
    @checked_urls[key] = true
  end

  # Sends a HEAD request to url and records an error for file when the URL
  # is malformed, the response status is not 2xx/3xx, or the connection
  # fails.
  def check_remote_uri(url, file)
    begin
      # Protocol-relative URLs are probed over plain http.
      uri = URI(url.sub(%r{\A//}, 'http://'))
    rescue URI::InvalidURIError => e
      # uri was never assigned here, so report the URL as it was written.
      return add_error(file, url, e.message)
    end

    begin
      response = Net::HTTP.start(uri.host, uri.port,
                                 :use_ssl => uri.scheme == 'https') do |http|
        # request_uri is the path (or "/" when empty) plus the query string.
        http.head(uri.request_uri)
      end
      add_error(file, uri.to_s, response) if response.code =~ /\A[^23]/
    rescue Timeout::Error
      add_error(file, uri.to_s, "Reading buffer timed out")
    rescue Errno::ETIMEDOUT
      add_error(file, uri.to_s, "Connection timed out")
    rescue SocketError, SystemCallError => e
      # Name resolution failures, refused connections, unreachable hosts...
      add_error(file, uri.to_s, e.message)
    end
  end

  # Records a "File not found" error when the local url referenced from file
  # does not exist on disk.
  #
  # The query string and fragment are ignored. URLs starting with "/" are
  # resolved against the root directory, all others against the directory
  # of the referring file. The url argument is not modified.
  def check_local_uri(url, file)
    target = url.sub(/[?#].*\z/m, '')
    path = if target.start_with?('/')
             File.expand_path(".#{target}", @root)
           else
             File.expand_path(target, File.dirname(file))
           end
    add_error(file, target, "File not found") unless File.exist?(path)
  end

  # Stores response (a Net::HTTPResponse or a message String) as the error
  # for url in file and bumps the error counter.
  def add_error(file, url, response)
    @stats[:errors] += 1
    @errors[file] ||= {}
    @errors[file][url] = response
  end

  # Prints every recorded error grouped by file, followed by a summary line.
  def print_results
    @errors.each do |file, info|
      puts file
      info.each do |url, response|
        s = response.respond_to?(:code) ? "HTTP #{response.code}" : response
        puts "\t#{url}\n\t\t#{s}"
      end
    end
    if @stats[:errors] == 0
      puts "#{@stats[:files]} files with #{@stats[:urls]} urls checked."
    else
      puts "#{@stats[:files]} files with #{@stats[:urls]} urls checked and #{@stats[:errors]} errors found."
    end
  end

  private

  # Key under which url is remembered as checked. Absolute URLs (with a
  # scheme, protocol-relative or starting with "/") mean the same thing
  # everywhere; relative ones depend on the referring file's directory.
  def cache_key(url, file)
    return url if url =~ REMOTE_URI || url =~ SCHEME_URI || url.start_with?('/')

    [File.dirname(file), url]
  end
end
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ <link type="text/css" rel="stylesheet" href="style.css" />
6
+ <script type="text/javascript" src="script.js"></script>
7
+ </head>
8
+ <body>
9
+ <h1>I'm a heading!</h1>
10
+
11
+ <p>I'm an <a href="http://example.com">example link</a>.</p>
12
+
13
+ <img src="image.jpg" />
14
+ </body>
15
+ </html>
@@ -0,0 +1,20 @@
1
+ #require 'rubygems'
2
+
3
+ require 'webmock/rspec'
4
+
5
+ require 'simplecov'
6
+ SimpleCov.start
7
+
8
+ require 'utterson'
9
+
10
+ require 'stringio'
11
+
12
# Runs the given block with $stdout temporarily replaced by an in-memory
# buffer and returns everything the block wrote to it as a String.
# The previous $stdout is restored even when the block raises.
def capture_stdout(&block)
  previous = $stdout
  buffer = StringIO.new
  begin
    $stdout = buffer
    block.call
    buffer.string
  ensure
    $stdout = previous
  end
end
@@ -0,0 +1,164 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Utterson. Remote requests are stubbed with WebMock, so no
# example touches the network; local checks run against spec/fixtures.
describe Utterson do
  it "should go through all htm and html files in target dir" do
    u = Utterson.new(dir: "spec/fixtures/dir-structure")
    u.stub(:collect_uris_from) {[]}

    ["spec/fixtures/dir-structure/1.htm",
     "spec/fixtures/dir-structure/2.html",
     "spec/fixtures/dir-structure/a/3.htm",
     "spec/fixtures/dir-structure/a/b/4.html"].each do |file|
      u.should_receive(:collect_uris_from).with(file)
    end

    u.check
  end

  it "should check all urls which are found" do
    u = Utterson.new(dir: "spec/fixtures")
    u.stub(:check_uri) {}
    u.should_receive(:check_uri).exactly(4).times

    u.check
  end

  it "should find all uris from sample document" do
    u = Utterson.new
    uris = u.collect_uris_from("spec/fixtures/sample.html")
    uris.should include("script.js")
    uris.should include("style.css")
    uris.should include("http://example.com")
    uris.should include("image.jpg")
  end

  describe "#check_uri" do
    let(:u) {Utterson.new}

    it "should use remote checking for http protocol" do
      u.stub(:check_remote_uri) {}
      u.should_receive(:check_remote_uri).with("http://example.com", "file.html")
      u.check_uri("http://example.com", "file.html")
    end

    it "should use remote checking for https protocol" do
      u.stub(:check_remote_uri) {}
      u.should_receive(:check_remote_uri).with("https://example.com", "file.html")
      u.check_uri("https://example.com", "file.html")
    end

    it "should use remote checking when only // is specified" do
      u.stub(:check_remote_uri) {}
      u.should_receive(:check_remote_uri).with("//example.com", "file.html")
      u.check_uri("//example.com", "file.html")
    end

    it "should use local checking for relative uris" do
      u.stub(:check_local_uri) {}
      u.should_receive(:check_local_uri).with("../file.html", "file.html")
      u.check_uri("../file.html", "file.html")
    end
  end

  describe "#check_local_uri" do
    let(:u) {Utterson.new(dir: "spec/fixtures/dir-structure")}

    it "should not assign error info if file exists" do
      u.check_local_uri("../sample.html", "spec/fixtures/dir-structure/1.htm")
      u.errors.should be_empty
    end

    it "should assign error info if file doesn't exist" do
      u.check_local_uri("../sample_not_found.html", "spec/fixtures/dir-structure/1.htm")
      u.errors["spec/fixtures/dir-structure/1.htm"].should == {"../sample_not_found.html" => "File not found"}
    end

    it "should use root directory when urls start with /" do
      u2 = Utterson.new(dir: "spec/fixtures/dir-structure", root: "spec/fixtures")
      u2.check_local_uri("/sample.html", "spec/fixtures/dir-structure/1.htm")
      u2.errors.should be_empty
    end

    it "should handle target directory as root for urls starting with / if root is not available" do
      u.check_local_uri("/2.html", "spec/fixtures/dir-structure/1.htm")
      u.errors.should be_empty
    end

    it "should ignore query string when checking local files" do
      u.check_local_uri("2.html?queryparam=value", "spec/fixtures/dir-structure/1.htm")
      u.errors.should be_empty
    end
  end

  describe "#check_remote_uri" do
    let(:u) {Utterson.new}

    it "should not assign error info if request is successful" do
      stub_request(:head, "http://example.com/index.html").
        with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
        to_return(:status => 200, :body => "", :headers => {})
      u.check_remote_uri("http://example.com/index.html", "test.html")
      u.errors.should be_empty
    end

    it "should assign error info if there is error response" do
      stub_request(:head, "http://example.com/404.html").
        with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
        to_return(:status => 404, :body => "", :headers => {})
      u.check_remote_uri("http://example.com/404.html", "test.html")
      u.errors["test.html"].should_not be_empty
      u.errors["test.html"]["http://example.com/404.html"].should be_instance_of(Net::HTTPNotFound)
    end

    it "should add error status from buffer timeouts" do
      stub_request(:head, "http://example.com/index.html").to_timeout
      u.check_remote_uri("http://example.com/index.html", "test.html")
      u.errors.should_not be_empty
    end

    it "should add error status from connection timeouts" do
      stub_request(:head, "http://example.com/index.html").to_raise(Errno::ETIMEDOUT)
      u.check_remote_uri("http://example.com/index.html", "test.html")
      u.errors.should_not be_empty
    end

    it "should add error status from name resolution errors" do
      stub_request(:head, "http://example.com/index.html").
        to_raise(SocketError.new('getaddrinfo: Name or service not known'))
      u.check_remote_uri("http://example.com/index.html", "test.html")
      u.errors.should_not be_empty
    end

    it "should add error status when invalid URI" do
      # A space is rejected by every URI parser Ruby ships, so this takes the
      # real InvalidURIError path instead of relying on a stub.
      u.check_remote_uri("http://invalid uri", "test.html")
      u.errors.should_not be_empty
    end
  end

  describe "#print_results" do
    it "should output only basic stats if no errors" do
      u = Utterson.new(dir: "spec/fixtures/dir-structure")
      output = capture_stdout do
        u.check
      end
      output.should match(/4 files with 0 urls checked/)
    end

    it "should output error information" do
      stub_request(:head, "http://example.com/").
        with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
        to_return(:status => 404, :body => "", :headers => {})
      u = Utterson.new(dir: "spec/fixtures")
      output = capture_stdout do
        u.check
      end
      output.should match("spec/fixtures/sample.html\n\tstyle.css\n\t\tFile not found")
      output.should match("script.js\n\t\tFile not found")
      output.should match("image.jpg\n\t\tFile not found")
      output.should match("http://example.com\n\t\tHTTP 404")
      output.should match("5 files with 4 urls checked and 4 errors found")
    end
  end
end
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: utterson
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Juhamatti Niemelä
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-11-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: trollop
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: webmock
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: 1.15.2
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 1.15.2
97
+ description: Traverses all HTML files from given directory and checks links found
98
+ in them.
99
+ email: iiska@iki.fi
100
+ executables:
101
+ - utterson
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - bin/utterson
106
+ - lib/utterson.rb
107
+ - README.md
108
+ - spec/spec_helper.rb
109
+ - spec/fixtures/dir-structure/a/3.htm
110
+ - spec/fixtures/dir-structure/a/b/4.html
111
+ - spec/fixtures/dir-structure/2.html
112
+ - spec/fixtures/dir-structure/1.htm
113
+ - spec/fixtures/sample.html
114
+ - spec/utterson_spec.rb
115
+ homepage: https://github.com/iiska/utterson
116
+ licenses:
117
+ - MIT
118
+ metadata: {}
119
+ post_install_message:
120
+ rdoc_options: []
121
+ require_paths:
122
+ - lib
123
+ required_ruby_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ~>
126
+ - !ruby/object:Gem::Version
127
+ version: '2.0'
128
+ required_rubygems_version: !ruby/object:Gem::Requirement
129
+ requirements:
130
+ - - '>='
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
133
+ requirements: []
134
+ rubyforge_project:
135
+ rubygems_version: 2.0.3
136
+ signing_key:
137
+ specification_version: 4
138
+ summary: Friendly HTML crawler and url checker
139
+ test_files:
140
+ - spec/spec_helper.rb
141
+ - spec/fixtures/dir-structure/a/3.htm
142
+ - spec/fixtures/dir-structure/a/b/4.html
143
+ - spec/fixtures/dir-structure/2.html
144
+ - spec/fixtures/dir-structure/1.htm
145
+ - spec/fixtures/sample.html
146
+ - spec/utterson_spec.rb
147
+ has_rdoc: