checklinks 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 77b94b65c8046e3e8c4aadb5a21d1ea0e511011462497aaa304cb44519af05c8
4
+ data.tar.gz: 6b46cf2b5108b9ddc2028b58bf6253b95b3696bef60a75748aa997e30ad27b3b
5
+ SHA512:
6
+ metadata.gz: be687604bef24ea7ece0d9322d68589de3e8b3ec84055138b2369c4eea9c3b6128b619bfd47fb6d75b04b584ca3a9109eb1b242733207d0bd2f5fd0fefae7aae
7
+ data.tar.gz: 9a383deb39970cd0eea213e12fe1de467ec9d6c9c9ce8e47db27bc85d5a0a07c0ec41f68d927b5123c09edeabc94689f4b671618aaff111bfa019015a5b802ee
@@ -0,0 +1 @@
1
+ *.gem
@@ -0,0 +1 @@
1
+ 2.5.8
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
@@ -0,0 +1,17 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ checklinks (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+
10
+ PLATFORMS
11
+ ruby
12
+
13
+ DEPENDENCIES
14
+ checklinks!
15
+
16
+ BUNDLED WITH
17
+ 1.17.2
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Chris Seaton
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,48 @@
1
+ # Checklinks
2
+
3
+ Checklinks finds broken or redirected links in a static webpage.
4
+
5
+ ```
6
+ % gem install checklinks
7
+ % checklinks **/*.md
8
+ ```
9
+
10
+ It uses code from [awesome_bot](https://github.com/dkhamsing/awesome_bot), but the output is much more terse so it's easier to find which links are broken, and it also prints link locations which makes updating them easier.
11
+
12
+ ## Configuration
13
+
14
+ ```
15
+ % checklinks --config .checklinks.yaml **/*.md
16
+ ```
17
+
18
+ ```yaml
19
+ ignore:
20
+ exact:
21
+ - ...
22
+ prefixes:
23
+ - ...
24
+ suffixes:
25
+ - ...
26
+ forbidden:
27
+ exact:
28
+ - ...
29
+ prefixes:
30
+ - ...
31
+ suffixes:
32
+ - ...
33
+ redirect:
34
+ exact:
35
+ - ...
36
+ prefixes:
37
+ - ...
38
+ suffixes:
39
+ - ...
40
+ ```
41
+
42
+ ## Author
43
+
44
+ [Chris Seaton](https://chrisseaton.com/)
45
+
46
+ ## License
47
+
48
+ Copyright © 2020 Chris Seaton. Available under the MIT license.
@@ -0,0 +1,163 @@
1
#!/usr/bin/env ruby

# Command line entry point for checklinks.
#
# Usage: checklinks [--config FILE] FILE...
#
# Every file named on the command line is scanned for http(s) URLs, each
# distinct URL is requested once, and a report of the URLs that did not return
# 200 (and were not ignored by the configuration) is printed together with
# every file:line location where the URL was found. The process exits
# successfully only when there were no errors.

require 'net/http'
require 'openssl'
require 'set'
require 'uri'
require 'yaml'

require 'checklinks'

# Returns true when url matches one of the rule lists in rules, a Hash that may
# contain 'exact', 'prefixes' and 'suffixes' lists. Anything that is not a
# Hash (for example a missing configuration section) matches nothing.
rule_match = lambda do |rules, url|
  next false unless rules.is_a?(Hash)
  Array(rules['exact']).any? { |exact| url == exact } ||
    Array(rules['prefixes']).any? { |prefix| url.start_with?(prefix) } ||
    Array(rules['suffixes']).any? { |suffix| url.end_with?(suffix) }
end

# Process command line arguments

files = []
config = {}

args = ARGV.dup
until args.empty?
  arg = args.shift
  if arg.start_with?('-')
    case arg
    when '--config'
      config_file = args.shift
      abort 'checklinks: --config requires a file name' unless config_file
      # An empty file parses to nil/false, which is treated as "no configuration"
      config = YAML.safe_load(File.read(config_file)) || {}
      abort "checklinks: #{config_file} must contain a YAML mapping" unless config.is_a?(Hash)
    else
      abort "checklinks: unknown option #{arg}"
    end
  else
    files.push arg
  end
end

# The 'redirect' and 'forbidden' rule sets are read from inside 'ignore'
ignore_rules = config['ignore'].is_a?(Hash) ? config['ignore'] : {}
redirect_rules = ignore_rules['redirect']
forbidden_rules = ignore_rules['forbidden']

# Create worklists - ahead of processing anything as they refer to each other cyclically

file_worklist = Checklinks::Worklist.new(*files)
url_worklist = Checklinks::Worklist.new
link_collector = Checklinks::Worklist.collector
status_worklist = Checklinks::Worklist.new
error_collector = Checklinks::Worklist.collector

# Process files to produce URLs and links

file_worklist.process(Set.new) do |filename, found_urls|
  File.open(filename, encoding: Encoding::UTF_8) do |file|
    file.each_line.with_index(1) do |line, line_number|
      # scrub so that a stray invalid byte does not abort the whole scan
      extracted = URI.extract(line.scrub, /http()s?/)
      Checklinks::Awesome.links_filter(extracted).each do |url|
        # Every occurrence is recorded as a location, but each distinct URL is
        # only queued for checking once
        link_collector.push [url, filename, line_number]
        url_worklist.push url if found_urls.add?(url)
      end
    end
  end
end

# Process URLs to produce status updates and errors

url_worklist.process_concurrently(16) do |url|
  status_worklist.push [:try, url]
  if rule_match.call(ignore_rules, url)
    status_worklist.push [:ignored]
    next
  end
  begin
    code, headers = Checklinks::Awesome.net_status(url, true)
  rescue Errno::ECONNREFUSED
    code = :refused
  rescue Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout
    code = :timeout
  rescue OpenSSL::SSL::SSLError
    code = :https
  rescue => e
    code = e.class.name
  end
  if code == 200
    status_worklist.push [:ok]
  elsif ([301, 302].include?(code) && rule_match.call(redirect_rules, url)) ||
        (code == 403 && rule_match.call(forbidden_rules, url))
    status_worklist.push [:ignored]
  else
    status_worklist.push [:error]
    error_collector.push [url, code, headers]
  end
end

# Process status updates to print progress on the screen

status_worklist.process({ok: 0, error: 0, ignored: 0, clear: 0}) do |message, state|
  # Blank out the previous progress line before printing anything new
  print "\r#{' ' * state[:clear]}\r"
  case message.first
  when :try
    puts message.last
  when :ok
    state[:ok] += 1
  when :ignored
    state[:ignored] += 1
  when :error
    state[:error] += 1
  end
  line = "[#{state[:ok].to_s.rjust(5)} ok, #{state[:ignored].to_s.rjust(5)} ignored, #{state[:error].to_s.rjust(5)} errors, #{file_worklist.size.to_s.rjust(5)} files left, #{url_worklist.size.to_s.rjust(5)} URLs left]"
  print line
  state[:clear] = line.length
end

# Run everything to conclusion

Checklinks::Worklist.close file_worklist, url_worklist
status_state = status_worklist.close
links = link_collector.close
errors = error_collector.close

# Print a report

puts "\r#{' ' * status_state[:clear]}\r"

if errors.empty?
  puts 'Ok!'
else
  puts 'Errors:'

  # Create a map of errored URLs back to their locations

  errored_urls = errors.map(&:first).to_set
  url_links = Hash.new { |hash, key| hash[key] = [] }
  links.each do |url, filename, line_number|
    url_links[url].push [filename, line_number] if errored_urls.include?(url)
  end

  # Print each error

  errors.each do |url, code, headers|
    # headers is nil when the request raised, but then code is not 301/302
    message = [301, 302].include?(code) ? "#{code} -> #{headers['location']}" : code
    puts "  #{url} (#{message})"
    url_links[url].uniq.each do |filename, line_number|
      puts "    #{filename}:#{line_number}"
    end
  end
end

exit errors.empty?
@@ -0,0 +1,21 @@
1
+ require_relative 'lib/checklinks/version'
2
+
3
# Gem specification for checklinks. The packaged file list is whatever git
# tracks in the directory containing this gemspec, and every tracked file under
# bin/ is installed as an executable.
Gem::Specification.new do |spec|
  spec.name     = 'checklinks'
  spec.version  = Checklinks::VERSION
  spec.authors  = ['Chris Seaton']
  spec.email    = ['chris@chrisseaton.com']

  spec.summary  = 'Find broken links in your static website'
  spec.homepage = 'https://github.com/chrisseaton/checklinks/'
  spec.license  = 'MIT'

  spec.required_ruby_version = '>= 2.5.8'

  # Ask git for the tracked files, from the gemspec's own directory so that
  # the result does not depend on the caller's working directory
  gem_root = File.expand_path('..', __FILE__)
  spec.files = Dir.chdir(gem_root) { `git ls-files -z`.split("\x0") }

  spec.bindir        = 'bin'
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']
end
@@ -0,0 +1,3 @@
1
+ require 'checklinks/version'
2
+ require 'checklinks/worklist'
3
+ require 'checklinks/awesome'
@@ -0,0 +1,105 @@
1
+ module Checklinks
2
+ module Awesome
3
+ # This module is code copied from awesome_bot by dkhamsing
4
+
5
+ # The MIT License (MIT)
6
+ #
7
+ # Copyright (c) 2015
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ # of this software and associated documentation files (the "Software"), to deal
11
+ # in the Software without restriction, including without limitation the rights
12
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ # copies of the Software, and to permit persons to whom the Software is
14
+ # furnished to do so, subject to the following conditions:
15
+ #
16
+ # The above copyright notice and this permission notice shall be included in all
17
+ # copies or substantial portions of the Software.
18
+ #
19
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ # SOFTWARE.
26
+
27
# Cleans up URLs extracted from running text by trimming the markup and
# punctuation that tends to get glued onto the end of a link.
#
# Candidates shorter than 9 characters are discarded. In the remaining ones
# every comma is encoded as %2c and everything from the first apostrophe
# onwards is dropped. Then the first rule that applies is used:
#
# * contains ")]"                     - cut from ")]" onwards
# * exactly one "(" and one ")"       - kept as is
# * one "(" and two ")"               - cut after the first of a "))" pair
# * any other ")"                     - cut from the first ")" onwards, except
#                                       for wikipedia URLs that contain a "("
# * contains "[" (adoc)               - cut from "[" onwards
# * ends with "." or ":"              - last character removed
# * ends with "%2c" (a trailing ",")  - the "%2c" is removed
#
# @param list [Array<String>] candidate URLs
# @return [Array<String>] the cleaned URLs, in their original order
def self.links_filter(list)
  list.reject { |x| x.length < 9 }
      .map { |x| x.gsub(',', '%2c').gsub(/'.*/, '') }
      .map do |x|
    closing = x.count(')')
    opening = x.count('(')

    if x.include?(')]')
      x.gsub(/\)\].*/, '')
    elsif closing == 1 && opening == 1
      x
    elsif closing == 2 && opening == 1
      x.gsub(/\)\).*/, ')')
    elsif closing > 0
      # Wikipedia titles legitimately contain parentheses, so only trim those
      # URLs when the ")" cannot belong to the title
      if x.include?('wikipedia') && opening > 0
        x
      else
        x.gsub(/\).*/, '')
      end
    elsif x.include?('[') # adoc
      x.gsub(/\[.*/, '')
    elsif x.end_with?('.', ':')
      x[0..-2]
    elsif x.end_with?('%2c')
      x[0..-4]
    else
      x
    end
  end
end
62
+
63
# Requests url and reports the HTTP status and response headers.
#
# @param url [String] the URL to request; credentials in the URL are sent as
#   HTTP basic authentication
# @param timeout [Integer] seconds to wait for the connection to open
# @param head [Boolean] send a HEAD request when truthy, otherwise a GET
# @return [Array(Integer, Hash)] the status code and a Hash of the response
#   headers as yielded by the response object. A 'location' header without a
#   scheme is resolved against the requested URL so that it is always absolute.
# @raise whatever URI.parse and Net::HTTP raise (invalid URL, refused
#   connection, timeout, TLS failure, ...); nothing is rescued on behalf of
#   the caller
def self.net_status(url, timeout=30, head)
  # Loaded lazily so that only callers which actually go to the network pay for it
  require 'net/http'
  require 'openssl'
  require 'uri'

  uri = URI.parse url
  Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https', open_timeout: timeout) do |http|
    ua = { 'User-Agent' => 'checklinks' }
    request = (head ? Net::HTTP::Head : Net::HTTP::Get).new(uri, ua)

    if uri.userinfo
      # Split on the first colon only - the password may itself contain colons
      auth_user, auth_pass = uri.userinfo.split(':', 2)
      request.basic_auth auth_user, auth_pass
    end

    response = http.request request

    code = response.code.nil? ? 200 : response.code.to_i

    headers = {}
    response.each do |k, v|
      headers[k] = v.force_encoding('utf-8')
    end

    # handle incomplete redirect
    loc = headers['location']
    if loc
      begin
        # Resolving against the request URI keeps a non-default port and copes
        # with path-relative and protocol-relative locations
        headers['location'] = uri.merge(loc).to_s if URI.parse(loc).scheme.nil?
      rescue URI::Error
        # An unparseable Location is reported verbatim instead of hiding the
        # real status code behind an exception
      end
    end

    return [code, headers]
  end
end
104
+ end
105
+ end
@@ -0,0 +1,6 @@
1
module Checklinks
  # Components of the gem version
  MAJOR_VERSION = 0
  MINOR_VERSION = 1
  PATCH_VERSION = 0

  # The gem version as a "major.minor.patch" string. Frozen explicitly so the
  # constant cannot be mutated in place.
  VERSION = "#{MAJOR_VERSION}.#{MINOR_VERSION}.#{PATCH_VERSION}".freeze
end
@@ -0,0 +1,79 @@
1
module Checklinks
  # A worklist is a list of jobs to do and possibly a pool of threads
  # working on it
  class Worklist
    # Create a worklist, possibly with a list of jobs to start with
    def initialize(*jobs)
      @state = nil
      @queue = Queue.new    # jobs waiting to be picked up
      @active = Queue.new   # one token per job currently being processed
      @threads = nil        # set once processing has been started
      push(*jobs)
    end

    # Push more jobs onto the worklist. nil and false cannot be jobs, because
    # nil is what tells a worker that the worklist has been closed.
    def push(*jobs)
      jobs.each do |job|
        raise 'a job cannot be nil or false' unless job
        @queue.push job
      end
    end

    # Process the jobs in the worklist in a background thread. You can pass a
    # state object which is then passed into the block to process each job,
    # and is returned by #close.
    def process(state=nil)
      @state = state
      process_concurrently(1) do |job|
        yield job, @state
      end
    end

    # Process the jobs in the worklist in n concurrent background threads.
    # Processing can only be started once per worklist.
    def process_concurrently(n)
      raise 'this worklist is already being processed' if @threads
      @threads = Array.new(n) do
        Thread.new do
          until @queue.closed? && @queue.empty?
            job = @queue.pop
            # nil means the worklist was closed - go round and re-check
            next unless job
            @active.push :active
            begin
              yield job
            ensure
              # Release the token even if the job raised, so #size stays accurate
              @active.pop
            end
          end
        end
      end
    end

    # How many jobs are left to process
    def size
      # This is a bit racy - we don't update these two queues atomically
      @queue.size + @active.size
    end

    # Declare that you're not going to push any more jobs, and wait for the
    # current jobs to be processed if you've started that. Returns the state
    # object given to #process, if any. Closing again is a no-op.
    def close
      return @state if @queue.closed?
      threads = @threads || []
      # Wake every worker that is blocked waiting for a job
      threads.size.times do
        @queue.push nil
      end
      @queue.close
      threads.each(&:join)
      @state
    end

    # Close a set of worklists, in the order given
    def self.close(*worklists)
      worklists.map(&:close)
    end

    # Create a worklist that collects values into an array and returns
    # it when closed
    def self.collector
      worklist = Worklist.new
      worklist.process([]) do |value, collected|
        collected.push value
      end
      worklist
    end
  end
end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: checklinks
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Chris Seaton
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-13 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email:
15
+ - chris@chrisseaton.com
16
+ executables:
17
+ - checklinks
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".gitignore"
22
+ - ".ruby-version"
23
+ - Gemfile
24
+ - Gemfile.lock
25
+ - LICENSE
26
+ - README.md
27
+ - bin/checklinks
28
+ - checklinks.gemspec
29
+ - lib/checklinks.rb
30
+ - lib/checklinks/awesome.rb
31
+ - lib/checklinks/version.rb
32
+ - lib/checklinks/worklist.rb
33
+ homepage: https://github.com/chrisseaton/checklinks/
34
+ licenses:
35
+ - MIT
36
+ metadata: {}
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: 2.5.8
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ requirements: []
52
+ rubyforge_project:
53
+ rubygems_version: 2.7.6.2
54
+ signing_key:
55
+ specification_version: 4
56
+ summary: Find broken links in your static website
57
+ test_files: []