broken_link_finder 0.9.4 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Class responsible for reporting in an HTML format.
5
+ class HTMLReporter < Reporter
6
+ # Returns a new HTMLReporter instance.
7
+ # stream is any Object that responds to :puts and :print.
8
+ def initialize(stream, sort,
9
+ broken_links, ignored_links,
10
+ broken_link_map, crawl_stats)
11
+ super
12
+ end
13
+
14
+ # Pretty print a report detailing the full link summary.
15
+ def call(broken_verbose: true, ignored_verbose: false)
16
+ puts '<div class="broken_link_finder_report">'
17
+
18
+ report_crawl_summary
19
+ report_broken_links(verbose: broken_verbose)
20
+ report_ignored_links(verbose: ignored_verbose)
21
+
22
+ puts '</div>'
23
+
24
+ nil
25
+ end
26
+
27
+ private
28
+
29
+ # Report a summary of the overall crawl.
30
+ def report_crawl_summary
31
+ puts format(
32
+ '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
33
+ @crawl_stats[:url],
34
+ @crawl_stats[:url],
35
+ @crawl_stats[:num_pages],
36
+ @crawl_stats[:num_links],
37
+ @crawl_stats[:duration]&.truncate(2)
38
+ )
39
+ end
40
+
41
+ # Report a summary of the broken links.
42
+ def report_broken_links(verbose: true)
43
+ puts '<div class="broken_links">'
44
+
45
+ if @broken_links.empty?
46
+ puts_summary 'Good news, there are no broken links!', type: :broken
47
+ else
48
+ num_pages, num_links = get_hash_stats(@broken_links)
49
+ puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken
50
+
51
+ @broken_links.each do |key, values|
52
+ puts_group(key, type: :broken) # Puts the opening <p> element.
53
+
54
+ if verbose || (values.length <= NUM_VALUES)
55
+ values.each { |value| puts_group_item value, type: :broken }
56
+ else # Only print N values and summarise the rest.
57
+ NUM_VALUES.times { |i| puts_group_item values[i], type: :broken }
58
+
59
+ objects = sort_by_page? ? 'link(s)' : 'page(s)'
60
+ puts "+ #{values.length - NUM_VALUES} other #{objects}, remove --concise to see them all<br />"
61
+ end
62
+
63
+ puts '</p>'
64
+ end
65
+ end
66
+
67
+ puts '</div>'
68
+ end
69
+
70
+ # Report a summary of the ignored links.
71
+ def report_ignored_links(verbose: false)
72
+ puts '<div class="ignored_links">'
73
+
74
+ if @ignored_links.any?
75
+ num_pages, num_links = get_hash_stats(@ignored_links)
76
+ puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
77
+
78
+ @ignored_links.each do |key, values|
79
+ puts_group(key, type: :ignored) # Puts the opening <p> element.
80
+
81
+ if verbose || (values.length <= NUM_VALUES)
82
+ values.each { |value| puts_group_item value, type: :ignored }
83
+ else # Only print N values and summarise the rest.
84
+ NUM_VALUES.times { |i| puts_group_item values[i], type: :ignored }
85
+
86
+ objects = sort_by_page? ? 'link(s)' : 'page(s)'
87
+ puts "+ #{values.length - NUM_VALUES} other #{objects}, use --verbose to see them all<br />"
88
+ end
89
+
90
+ puts '</p>'
91
+ end
92
+ end
93
+
94
+ puts '</div>'
95
+ end
96
+
97
+ def puts_summary(text, type:)
98
+ klass = (type == :broken) ? 'broken_links_summary' : 'ignored_links_summary'
99
+ puts "<p class=\"#{klass}\">#{text}</p>"
100
+ end
101
+
102
+ def puts_group(link, type:)
103
+ href = build_url(link)
104
+ a_element = "<a href=\"#{href}\">#{link}</a>"
105
+
106
+ case type
107
+ when :broken
108
+ msg = sort_by_page? ?
109
+ "The following broken links were found on '#{a_element}':" :
110
+ "The broken link '#{a_element}' was found on the following pages:"
111
+ klass = 'broken_links_group'
112
+ when :ignored
113
+ msg = sort_by_page? ?
114
+ "The following links were ignored on '#{a_element}':" :
115
+ "The link '#{a_element}' was ignored on the following pages:"
116
+ klass = 'ignored_links_group'
117
+ else
118
+ raise "type: must be :broken or :ignored, not: #{type}"
119
+ end
120
+
121
+ puts "<p class=\"#{klass}\">"
122
+ puts msg + '<br />'
123
+ end
124
+
125
+ def puts_group_item(value, type:)
126
+ klass = (type == :broken) ? 'broken_links_group_item' : 'ignored_links_group_item'
127
+ puts "<a class=\"#{klass}\" href=\"#{build_url(value)}\">#{value}</a><br />"
128
+ end
129
+
130
+ def build_url(link)
131
+ href = @broken_link_map[link]
132
+ href || link
133
+ end
134
+
135
+ alias_method :report, :call
136
+ end
137
+ end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Generic reporter class to be inherited from by format specific reporters.
5
+ class Reporter
6
+ # The number of pages/links to display when verbose is false.
7
+ NUM_VALUES = 3
8
+
9
+ # Returns a new Reporter instance.
10
+ # stream is any Object that responds to :puts and :print.
11
+ def initialize(stream, sort,
12
+ broken_links, ignored_links,
13
+ broken_link_map, crawl_stats)
14
+ unless stream.respond_to?(:puts) && stream.respond_to?(:print)
15
+ raise 'stream must respond_to? :puts and :print'
16
+ end
17
+ raise "sort by either :page or :link, not #{sort}" \
18
+ unless %i[page link].include?(sort)
19
+
20
+ @stream = stream
21
+ @sort = sort
22
+ @broken_links = broken_links
23
+ @ignored_links = ignored_links
24
+ @broken_link_map = broken_link_map
25
+ @crawl_stats = crawl_stats
26
+ end
27
+
28
+ # Pretty print a report detailing the full link summary.
29
+ def call(broken_verbose: true, ignored_verbose: false)
30
+ raise 'Not implemented by parent class'
31
+ end
32
+
33
+ protected
34
+
35
+ # Return true if the sort is by page.
36
+ def sort_by_page?
37
+ @sort == :page
38
+ end
39
+
40
+ # Returns the key/value statistics of hash e.g. the number of keys and
41
+ # combined values. The hash should be of the format: { 'str' => [...] }.
42
+ # Use like: `num_pages, num_links = get_hash_stats(links)`.
43
+ def get_hash_stats(hash)
44
+ num_keys = hash.keys.length
45
+ num_values = hash.values.flatten.uniq.length
46
+
47
+ sort_by_page? ?
48
+ [num_keys, num_values] :
49
+ [num_values, num_keys]
50
+ end
51
+
52
+ # Prints the text. Defaults to an empty string.
53
+ def print(text = '')
54
+ @stream.print(text)
55
+ end
56
+
57
+ # Prints the text + \n. Defaults to a blank line.
58
+ def puts(text = '')
59
+ @stream.puts(text)
60
+ end
61
+
62
+ # Prints text + \n\n.
63
+ def putsn(text)
64
+ puts(text)
65
+ puts
66
+ end
67
+
68
+ # Prints \n + text + \n.
69
+ def nputs(text)
70
+ puts
71
+ puts(text)
72
+ end
73
+
74
+ alias_method :report, :call
75
+ end
76
+ end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Class responsible for reporting in a text format.
5
+ class TextReporter < Reporter
6
+ # Returns a new TextReporter instance.
7
+ # stream is any Object that responds to :puts and :print.
8
+ def initialize(stream, sort,
9
+ broken_links, ignored_links,
10
+ broken_link_map, crawl_stats)
11
+ super
12
+ end
13
+
14
+ # Pretty print a report detailing the full link summary.
15
+ def call(broken_verbose: true, ignored_verbose: false)
16
+ report_crawl_summary
17
+ report_broken_links(verbose: broken_verbose)
18
+ report_ignored_links(verbose: ignored_verbose)
19
+
20
+ nil
21
+ end
22
+
23
+ private
24
+
25
+ # Report a summary of the overall crawl.
26
+ def report_crawl_summary
27
+ puts "Crawled #{@crawl_stats[:url]}"
28
+ putsn format(
29
+ '%s page(s) containing %s unique link(s) in %s seconds',
30
+ @crawl_stats[:num_pages],
31
+ @crawl_stats[:num_links],
32
+ @crawl_stats[:duration]&.truncate(2)
33
+ )
34
+ end
35
+
36
+ # Report a summary of the broken links.
37
+ def report_broken_links(verbose: true)
38
+ if @broken_links.empty?
39
+ puts 'Good news, there are no broken links!'
40
+ else
41
+ num_pages, num_links = get_hash_stats(@broken_links)
42
+ puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"
43
+
44
+ @broken_links.each do |key, values|
45
+ msg = sort_by_page? ?
46
+ "The following broken links were found on '#{key}':" :
47
+ "The broken link '#{key}' was found on the following pages:"
48
+ nputs msg
49
+
50
+ if verbose || (values.length <= NUM_VALUES)
51
+ values.each { |value| puts value }
52
+ else # Only print N values and summarise the rest.
53
+ NUM_VALUES.times { |i| puts values[i] }
54
+
55
+ objects = sort_by_page? ? 'link(s)' : 'page(s)'
56
+ puts "+ #{values.length - NUM_VALUES} other #{objects}, remove --concise to see them all"
57
+ end
58
+ end
59
+ end
60
+ end
61
+
62
+ # Report a summary of the ignored links.
63
+ def report_ignored_links(verbose: false)
64
+ if @ignored_links.any?
65
+ num_pages, num_links = get_hash_stats(@ignored_links)
66
+ nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"
67
+
68
+ @ignored_links.each do |key, values|
69
+ msg = sort_by_page? ?
70
+ "The following links were ignored on '#{key}':" :
71
+ "The link '#{key}' was ignored on the following pages:"
72
+ nputs msg
73
+
74
+ if verbose || (values.length <= NUM_VALUES)
75
+ values.each { |value| puts value }
76
+ else # Only print N values and summarise the rest.
77
+ NUM_VALUES.times { |i| puts values[i] }
78
+
79
+ objects = sort_by_page? ? 'link(s)' : 'page(s)'
80
+ puts "+ #{values.length - NUM_VALUES} other #{objects}, use --verbose to see them all"
81
+ end
82
+ end
83
+ end
84
+ end
85
+
86
+ alias_method :report, :call
87
+ end
88
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- VERSION = '0.9.4'
4
+ VERSION = '0.12.0'
5
5
  end
@@ -1,11 +1,31 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # We extract all the Document's links, not just the links to other webpages.
4
- Wgit::Document.define_extension(
3
+ # Define a method on each doc for recording unparsable links.
4
+ # Unparsable links are recorded as broken links by Finder.
5
+ class Wgit::Document
6
+ def unparsable_links
7
+ @unparsable_links ||= []
8
+ end
9
+ end
10
+
11
+ # Returns a Wgit::Url or nil (if link is unparsable).
12
+ # A proc is preferable to a method to avoid polluting the global namespace.
13
+ parse_link = lambda do |doc, link|
14
+ Wgit::Url.new(link)
15
+ rescue StandardError
16
+ doc.unparsable_links << link
17
+ nil
18
+ end
19
+
20
+ # Define a custom extractor for all page links we're interested in checking.
21
+ Wgit::Document.define_extractor(
5
22
  :all_links,
6
- '//*/@href | //*/@src', # Any element with a href or src attribute.
23
+ lambda { BrokenLinkFinder::link_xpath },
7
24
  singleton: false,
8
25
  text_content_only: true
9
- ) do |links|
10
- links.uniq.to_urls
26
+ ) do |links, doc|
27
+ links
28
+ .uniq
29
+ .map { |link| parse_link.call(doc, link) }
30
+ .compact
11
31
  end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Extract all the Document's <body> links e.g. <a>, <img>, <script> etc.
5
+ DEFAULT_LINK_XPATH = '/html/body//*/@href | /html/body//*/@src'
6
+
7
+ @link_xpath = DEFAULT_LINK_XPATH
8
+
9
+ class << self
10
+ # The xpath used to extract links from a crawled page.
11
+ # Can be overridden as required.
12
+ attr_accessor :link_xpath
13
+ end
14
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: broken_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.4
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-11-02 00:00:00.000000000 Z
11
+ date: 2021-04-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -72,14 +72,14 @@ dependencies:
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '10.0'
75
+ version: '13.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '10.0'
82
+ version: '13.0'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: webmock
85
85
  requirement: !ruby/object:Gem::Requirement
@@ -100,42 +100,42 @@ dependencies:
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: 0.20.3
103
+ version: '0.20'
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: 0.20.3
110
+ version: '0.20'
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: thread
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: 0.2.0
117
+ version: '0.2'
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: 0.2.0
124
+ version: '0.2'
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: wgit
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
129
  - - "~>"
130
130
  - !ruby/object:Gem::Version
131
- version: 0.5.0
131
+ version: '0.10'
132
132
  type: :runtime
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
- version: 0.5.0
138
+ version: '0.10'
139
139
  description: Finds a website's broken links using the 'wgit' gem and reports back
140
140
  to you with a summary.
141
141
  email: michael.telford@live.com
@@ -159,15 +159,22 @@ files:
159
159
  - exe/broken_link_finder
160
160
  - lib/broken_link_finder.rb
161
161
  - lib/broken_link_finder/finder.rb
162
- - lib/broken_link_finder/reporter.rb
162
+ - lib/broken_link_finder/link_manager.rb
163
+ - lib/broken_link_finder/reporter/html_reporter.rb
164
+ - lib/broken_link_finder/reporter/reporter.rb
165
+ - lib/broken_link_finder/reporter/text_reporter.rb
163
166
  - lib/broken_link_finder/version.rb
164
167
  - lib/broken_link_finder/wgit_extensions.rb
168
+ - lib/broken_link_finder/xpath.rb
165
169
  - load.rb
166
170
  homepage: https://github.com/michaeltelford/broken-link-finder
167
171
  licenses:
168
172
  - MIT
169
173
  metadata:
170
174
  source_code_uri: https://github.com/michaeltelford/broken-link-finder
175
+ changelog_uri: https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md
176
+ bug_tracker_uri: https://github.com/michaeltelford/broken-link-finder/issues
177
+ documentation_uri: https://www.rubydoc.info/gems/broken_link_finder
171
178
  allowed_push_host: https://rubygems.org
172
179
  post_install_message: Added the executable 'broken_link_finder' to $PATH
173
180
  rdoc_options: []
@@ -184,9 +191,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
184
191
  - !ruby/object:Gem::Version
185
192
  version: '0'
186
193
  requirements: []
187
- rubyforge_project:
188
- rubygems_version: 2.7.6
189
- signing_key:
194
+ rubygems_version: 3.1.2
195
+ signing_key:
190
196
  specification_version: 4
191
197
  summary: Finds a website's broken links and reports back to you with a summary.
192
198
  test_files: []