checklinks 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 77b94b65c8046e3e8c4aadb5a21d1ea0e511011462497aaa304cb44519af05c8
4
+ data.tar.gz: 6b46cf2b5108b9ddc2028b58bf6253b95b3696bef60a75748aa997e30ad27b3b
5
+ SHA512:
6
+ metadata.gz: be687604bef24ea7ece0d9322d68589de3e8b3ec84055138b2369c4eea9c3b6128b619bfd47fb6d75b04b584ca3a9109eb1b242733207d0bd2f5fd0fefae7aae
7
+ data.tar.gz: 9a383deb39970cd0eea213e12fe1de467ec9d6c9c9ce8e47db27bc85d5a0a07c0ec41f68d927b5123c09edeabc94689f4b671618aaff111bfa019015a5b802ee
@@ -0,0 +1 @@
1
+ *.gem
@@ -0,0 +1 @@
1
+ 2.5.8
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
@@ -0,0 +1,17 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ checklinks (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+
10
+ PLATFORMS
11
+ ruby
12
+
13
+ DEPENDENCIES
14
+ checklinks!
15
+
16
+ BUNDLED WITH
17
+ 1.17.2
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Chris Seaton
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,48 @@
1
+ # Checklinks
2
+
3
+ Checklinks finds broken or redirected links in a static webpage.
4
+
5
+ ```
6
+ % gem install checklinks
7
+ % checklinks **/*.md
8
+ ```
9
+
10
+ It uses code from [awesome_bot](https://github.com/dkhamsing/awesome_bot), but the output is much more terse so it's easier to find which links are broken, and it also prints link locations which makes updating them easier.
11
+
12
+ ## Configuration
13
+
14
+ ```
15
+ % checklinks --config .checklinks.yaml **/*.md
16
+ ```
17
+
18
+ ```yaml
19
+ ignore:
20
+   exact:
21
+     - ...
22
+   prefixes:
23
+     - ...
24
+   suffixes:
25
+     - ...
26
+   forbidden:
27
+     exact:
28
+       - ...
29
+     prefixes:
30
+       - ...
31
+     suffixes:
32
+       - ...
33
+   redirect:
34
+     exact:
35
+       - ...
36
+     prefixes:
37
+       - ...
38
+     suffixes:
39
+       - ...
40
+ ```
41
+
42
+ ## Author
43
+
44
+ [Chris Seaton](https://chrisseaton.com/)
45
+
46
+ ## License
47
+
48
+ Copyright © 2020 Chris Seaton. Available under the MIT license.
@@ -0,0 +1,163 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'set'
4
+ require 'yaml'
5
+
6
+ require 'checklinks'
7
+
8
+ # Process command line arguments
9
+
10
+ files = []
11
+ config = {}
12
+
13
+ args = ARGV.dup
14
+ until args.empty?
15
+ arg = args.shift
16
+ if arg.start_with?('-')
17
+ case arg
18
+ when '--config'
19
+ config_file = args.shift
20
+ raise unless config_file
21
+ config = YAML.load(File.read(config_file))
22
+ else
23
+ raise
24
+ end
25
+ else
26
+ files.push arg
27
+ end
28
+ end
29
+
30
+ # Create worklists - ahead of processing anything as they refer to each other cyclically
31
+
32
+ file_worklist = Checklinks::Worklist.new(*files)
33
+ url_worklist = Checklinks::Worklist.new
34
+ link_collector = Checklinks::Worklist.collector
35
+ status_worklist = Checklinks::Worklist.new
36
+ error_collector = Checklinks::Worklist.collector
37
+
38
+ # Process files to produce URLs and links
39
+
40
+ file_worklist.process(Set.new) do |filename, found_urls|
41
+ File.open(filename, encoding: Encoding::UTF_8) do |file|
42
+ line_number = 1
43
+ file.each_line do |line|
44
+ urls = URI.extract(line, /http()s?/)
45
+ urls = Checklinks::Awesome.links_filter(urls)
46
+ urls.each do |url|
47
+ next unless found_urls.add?(url)
48
+ url_worklist.push url
49
+ link_collector.push [url, filename, line_number]
50
+ end
51
+ line_number += 1
52
+ end
53
+ end
54
+ end
55
+
56
+ # Process URLs to produce status updates and errors
57
+
58
+ url_worklist.process_concurrently(16) do |url|
59
+ status_worklist.push [:try, url]
60
+ if config.dig('ignore', 'exact')&.any? { |exact| url.start_with?(exact) } ||
61
+ config.dig('ignore', 'prefixes')&.any? { |prefix| url.start_with?(prefix) } ||
62
+ config.dig('ignore', 'suffixes')&.any? { |suffix| url.end_with?(suffix) }
63
+ status_worklist.push [:ignored]
64
+ next
65
+ end
66
+ begin
67
+ code, headers = Checklinks::Awesome.net_status(url, true)
68
+ rescue Errno::ECONNREFUSED
69
+ code = :refused
70
+ rescue Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout
71
+ code = :timeout
72
+ rescue OpenSSL::SSL::SSLError
73
+ code = :https
74
+ rescue => e
75
+ code = e.class.name
76
+ end
77
+ if code == 200
78
+ status_worklist.push [:ok]
79
+ else
80
+ if (301..302).include?(code) && (config.dig('ignore', 'redirect', 'exact')&.any? { |exact| url.start_with?(exact) } ||
81
+ config.dig('ignore', 'redirect', 'prefixes')&.any? { |prefix| url.start_with?(prefix) } ||
82
+ config.dig('ignore', 'redirect', 'suffixes')&.any? { |suffix| url.end_with?(suffix) })
83
+ status_worklist.push [:ignored]
84
+ next
85
+ end
86
+ if code == 403 && (config.dig('ignore', 'forbidden', 'exact')&.any? { |exact| url.start_with?(exact) } ||
87
+ config.dig('ignore', 'forbidden', 'prefixes')&.any? { |prefix| url.start_with?(prefix) } ||
88
+ config.dig('ignore', 'forbidden', 'suffixes')&.any? { |suffix| url.end_with?(suffix) })
89
+ status_worklist.push [:ignored]
90
+ next
91
+ end
92
+ status_worklist.push [:error]
93
+ error_collector.push [url, code, headers]
94
+ end
95
+ end
96
+
97
+ # Process status updates to print progress on the screen
98
+
99
+ status_worklist.process({ok: 0, error: 0, ignored: 0, clear: 0}) do |message, state|
100
+ print "\r#{' '*state[:clear]}\r"
101
+ case message.first
102
+ when :try
103
+ puts message.last
104
+ when :ok
105
+ state[:ok] += 1
106
+ when :ignored
107
+ state[:ignored] += 1
108
+ when :error
109
+ state[:error] += 1
110
+ error_collector.push
111
+ end
112
+ line = "[#{state[:ok].to_s.rjust(5)} ok, #{state[:ignored].to_s.rjust(5)} ignored, #{state[:error].to_s.rjust(5)} errors, #{file_worklist.size.to_s.rjust(5)} files left, #{url_worklist.size.to_s.rjust(5)} URLs left]"
113
+ print line
114
+ state[:clear] = line.length
115
+ end
116
+
117
+ # Run everything to conclusion
118
+
119
+ Checklinks::Worklist.close file_worklist, url_worklist
120
+ status_state = status_worklist.close
121
+ links = link_collector.close
122
+ errors = error_collector.close
123
+
124
+ # Print a report
125
+
126
+ puts "\r#{' '*status_state[:clear]}\r"
127
+
128
+ if errors.empty?
129
+ puts 'Ok!'
130
+ else
131
+ puts 'Errors:'
132
+
133
+ # Create a map of errored URLs back to their locations
134
+
135
+ urls = errors.map(&:first).to_set
136
+ url_links = {}
137
+ links.each do |url, filename, line_number|
138
+ next unless urls.include?(url)
139
+ references = url_links[url]
140
+ unless references
141
+ references = []
142
+ url_links[url] = references
143
+ end
144
+ references.push [filename, line_number]
145
+ end
146
+
147
+ # Print each error
148
+
149
+ errors.each do |url, code, headers|
150
+ message = code
151
+ if (301..302).include?(code)
152
+ message = "#{code} -> #{headers['location']}"
153
+ else
154
+ message = code
155
+ end
156
+ puts " #{url} (#{message})"
157
+ url_links[url].each do |filename, line_number|
158
+ puts " #{filename}:#{line_number}"
159
+ end
160
+ end
161
+ end
162
+
163
+ exit errors.empty?
@@ -0,0 +1,21 @@
1
+ require_relative 'lib/checklinks/version'
2
+
3
+ Gem::Specification.new do |spec|
4
+ spec.name = 'checklinks'
5
+ spec.version = Checklinks::VERSION
6
+ spec.authors = ['Chris Seaton']
7
+ spec.email = ['chris@chrisseaton.com']
8
+
9
+ spec.summary = 'Find broken links in your static website'
10
+ spec.homepage = 'https://github.com/chrisseaton/checklinks/'
11
+ spec.license = 'MIT'
12
+
13
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
14
+ `git ls-files -z`.split("\x0")
15
+ end
16
+ spec.bindir = 'bin'
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.require_paths = ['lib']
19
+
20
+ spec.required_ruby_version = '>= 2.5.8'
21
+ end
@@ -0,0 +1,3 @@
1
+ require 'checklinks/version'
2
+ require 'checklinks/worklist'
3
+ require 'checklinks/awesome'
@@ -0,0 +1,105 @@
1
+ module Checklinks
2
+ module Awesome
3
+ # This module is code copied from awesome_bot by dkhamsing
4
+
5
+ # The MIT License (MIT)
6
+ #
7
+ # Copyright (c) 2015
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ # of this software and associated documentation files (the "Software"), to deal
11
+ # in the Software without restriction, including without limitation the rights
12
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ # copies of the Software, and to permit persons to whom the Software is
14
+ # furnished to do so, subject to the following conditions:
15
+ #
16
+ # The above copyright notice and this permission notice shall be included in all
17
+ # copies or substantial portions of the Software.
18
+ #
19
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ # SOFTWARE.
26
+
27
+ def self.links_filter(list)
28
+ list.reject { |x| x.length < 9 }
29
+ .map do |x|
30
+ x.gsub(',','%2c').gsub(/'.*/, '').gsub(/,.*/, '')
31
+ end
32
+ .map do |x|
33
+ if x.include? ')]'
34
+ x.gsub /\)\].*/, ''
35
+ elsif (x.scan(')').count == 1) && (x.scan('(').count == 1)
36
+ x
37
+ elsif (x.scan(')').count == 2) && (x.scan('(').count == 1)
38
+ x.gsub(/\)\).*/, ')')
39
+ elsif (x.scan(')').count > 0)
40
+ if (x.include? 'wikipedia')
41
+ if (x.scan(')').count >= 1) && (x.scan('(').count == 0)
42
+ x.gsub(/\).*/, '')
43
+ else
44
+ x
45
+ end
46
+ else
47
+ x.gsub(/\).*/, '')
48
+ end
49
+ elsif x.include? '[' # adoc
50
+ x.gsub(/\[.*/, '')
51
+ elsif x[-1]=='.' || x[-1]==':'
52
+ x[0..-2]
53
+ elsif x[-1]=='.'
54
+ x[0..-2]
55
+ elsif x[-3..-1]=='%2c'
56
+ x[0..-4]
57
+ else
58
+ x
59
+ end
60
+ end
61
+ end
62
+
63
+ def self.net_status(url, timeout=30, head)
64
+ require 'net/http'
65
+ require 'openssl'
66
+ require 'uri'
67
+
68
+ uri = URI.parse url
69
+ Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https', :open_timeout => timeout) do |http|
70
+ ua = {'User-Agent' => 'checklinks'}
71
+ if head
72
+ request = Net::HTTP::Head.new(uri,ua)
73
+ else
74
+ request = Net::HTTP::Get.new(uri,ua)
75
+ end
76
+
77
+ if uri.userinfo
78
+ auth_user, auth_pass = uri.userinfo.split(/:/)
79
+ request.basic_auth auth_user, auth_pass
80
+ end
81
+
82
+ response = http.request request
83
+
84
+ code = response.code==nil ? 200 : response.code.to_i
85
+
86
+ headers = {}
87
+ response.each do |k, v|
88
+ headers[k] = v.force_encoding("utf-8")
89
+ end
90
+
91
+ # handle incomplete redirect
92
+ loc = headers['location']
93
+ unless loc.nil?
94
+ loc_uri = URI.parse loc
95
+ if loc_uri.scheme.nil?
96
+ new_loc = uri.scheme + '://' + uri.host + loc
97
+ headers['location'] = new_loc
98
+ end
99
+ end
100
+
101
+ return [code, headers]
102
+ end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,6 @@
1
+ module Checklinks
2
+ MAJOR_VERSION = 0
3
+ MINOR_VERSION = 1
4
+ PATCH_VERSION = 0
5
+ VERSION = "#{MAJOR_VERSION}.#{MINOR_VERSION}.#{PATCH_VERSION}"
6
+ end
@@ -0,0 +1,79 @@
1
+ module Checklinks
2
+ # A worklist is a list of jobs to do and possibly a pool of threads
3
+ # working on it
4
+ class Worklist
5
+ # Create a worklist, possibly with a list of jobs to start with
6
+ def initialize(*jobs)
7
+ @state = nil
8
+ @queue = Queue.new
9
+ @active = Queue.new
10
+ @processing = false
11
+ push *jobs
12
+ end
13
+
14
+ # Push more jobs onto the worklist
15
+ def push(*jobs)
16
+ jobs.each do |job|
17
+ raise unless job
18
+ @queue.push job
19
+ end
20
+ end
21
+
22
+ # Process the jobs in the worklist in a background thread. You can pass a
23
+ # state object which is then passed into the block to process each job.
24
+ def process(state=nil)
25
+ @state = state
26
+ process_concurrently(1) do |job|
27
+ yield job, @state
28
+ end
29
+ end
30
+
31
+ # Process the jobs in the worklist in multiple concurrent background threads
32
+ def process_concurrently(n)
33
+ raise if @threads
34
+ @threads = n.times.map {
35
+ Thread.new {
36
+ until @queue.closed? && @queue.empty?
37
+ job = @queue.pop
38
+ next unless job
39
+ @active.push :active
40
+ yield job
41
+ @active.pop
42
+ end
43
+ }
44
+ }
45
+ end
46
+
47
+ # How many jobs are left to process
48
+ def size
49
+       # This is a bit racy - we don't update these two queues atomically
50
+ @queue.size + @active.size
51
+ end
52
+
53
+     # Declare that you're not going to push any more jobs, and wait for the
54
+     # worker threads to finish the remaining jobs. Processing must already have been started.
55
+ def close
56
+ @threads.size.times do
57
+ @queue.push nil
58
+ end
59
+ @queue.close
60
+ @threads.each &:join
61
+ @state
62
+ end
63
+
64
+ # Close a set of worklists
65
+ def self.close(*worklists)
66
+ worklists.map(&:close)
67
+ end
68
+
69
+ # Create a worklist that collects values into an array and returns
70
+ # it when closed
71
+ def self.collector
72
+ worklist = Worklist.new
73
+ worklist.process([]) do |value, collected|
74
+ collected.push value
75
+ end
76
+ worklist
77
+ end
78
+ end
79
+ end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: checklinks
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Chris Seaton
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-13 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email:
15
+ - chris@chrisseaton.com
16
+ executables:
17
+ - checklinks
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".gitignore"
22
+ - ".ruby-version"
23
+ - Gemfile
24
+ - Gemfile.lock
25
+ - LICENSE
26
+ - README.md
27
+ - bin/checklinks
28
+ - checklinks.gemspec
29
+ - lib/checklinks.rb
30
+ - lib/checklinks/awesome.rb
31
+ - lib/checklinks/version.rb
32
+ - lib/checklinks/worklist.rb
33
+ homepage: https://github.com/chrisseaton/checklinks/
34
+ licenses:
35
+ - MIT
36
+ metadata: {}
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: 2.5.8
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ requirements: []
52
+ rubyforge_project:
53
+ rubygems_version: 2.7.6.2
54
+ signing_key:
55
+ specification_version: 4
56
+ summary: Find broken links in your static website
57
+ test_files: []