broken_link_finder 0.9.4 → 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,137 @@
# frozen_string_literal: true

require 'erb'

module BrokenLinkFinder
  # Class responsible for reporting in an HTML format.
  #
  # The link and page values written by this reporter are taken from the
  # broken/ignored link hashes and the crawl stats. They are HTML-escaped
  # before being interpolated into attributes or element bodies so that a
  # value containing characters such as <, >, & or " cannot alter the markup
  # of the report.
  class HTMLReporter < Reporter
    # Returns a new HTMLReporter instance.
    # stream is any Object that responds to :puts and :print.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      super
    end

    # Pretty print a report detailing the full link summary.
    # The whole report is wrapped in a single
    # <div class="broken_link_finder_report"> element. Always returns nil.
    #
    # broken_verbose  - when false, only NUM_VALUES items are listed per
    #                   broken link group and the rest are summarised.
    # ignored_verbose - the same, but for the ignored link groups.
    def call(broken_verbose: true, ignored_verbose: false)
      puts '<div class="broken_link_finder_report">'

      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      puts '</div>'

      nil
    end

    # Re-alias so that #report points at this class's #call rather than the
    # parent's.
    alias_method :report, :call

    private

    # Report a summary of the overall crawl.
    def report_crawl_summary
      url = escape_html(@crawl_stats[:url])

      puts format(
        '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
        url,
        url,
        @crawl_stats[:num_pages],
        @crawl_stats[:num_links],
        @crawl_stats[:duration]&.truncate(2)
      )
    end

    # Report a summary of the broken links.
    def report_broken_links(verbose: true)
      puts '<div class="broken_links">'

      if @broken_links.empty?
        puts_summary 'Good news, there are no broken links!', type: :broken
      else
        num_pages, num_links = get_hash_stats(@broken_links)
        puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken

        puts_groups(
          @broken_links,
          type: :broken, verbose: verbose,
          hint: 'remove --concise to see them all'
        )
      end

      puts '</div>'
    end

    # Report a summary of the ignored links.
    # Nothing but the wrapping <div> is written when there are none.
    def report_ignored_links(verbose: false)
      puts '<div class="ignored_links">'

      if @ignored_links.any?
        num_pages, num_links = get_hash_stats(@ignored_links)
        puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored

        puts_groups(
          @ignored_links,
          type: :ignored, verbose: verbose,
          hint: 'use --verbose to see them all'
        )
      end

      puts '</div>'
    end

    # Puts one <p> group per key of links, listing its values as items.
    # Unless verbose, at most NUM_VALUES items are listed per group and the
    # remainder is summarised in a single line ending with hint.
    def puts_groups(links, type:, verbose:, hint:)
      links.each do |key, values|
        puts_group(key, type: type) # Puts the opening <p> element.

        if verbose || values.length <= NUM_VALUES
          values.each { |value| puts_group_item value, type: type }
        else # Only print N values and summarise the rest.
          values.first(NUM_VALUES).each { |value| puts_group_item value, type: type }

          objects = sort_by_page? ? 'link(s)' : 'page(s)'
          puts "+ #{values.length - NUM_VALUES} other #{objects}, #{hint}<br />"
        end

        puts '</p>'
      end
    end

    # Puts the summary paragraph for the given type of links.
    # text is only ever built by this class from counts, so it is written
    # as-is.
    def puts_summary(text, type:)
      klass = type == :broken ? 'broken_links_summary' : 'ignored_links_summary'
      puts "<p class=\"#{klass}\">#{text}</p>"
    end

    # Puts the opening <p> element of a group along with its heading line.
    # The closing </p> is written by the caller once the items are listed.
    # Raises a RuntimeError if type is neither :broken nor :ignored.
    def puts_group(link, type:)
      href = escape_html(build_url(link))
      a_element = "<a href=\"#{href}\">#{escape_html(link)}</a>"

      case type
      when :broken
        msg = if sort_by_page?
                "The following broken links were found on '#{a_element}':"
              else
                "The broken link '#{a_element}' was found on the following pages:"
              end
        klass = 'broken_links_group'
      when :ignored
        msg = if sort_by_page?
                "The following links were ignored on '#{a_element}':"
              else
                "The link '#{a_element}' was ignored on the following pages:"
              end
        klass = 'ignored_links_group'
      else
        raise "type: must be :broken or :ignored, not: #{type}"
      end

      puts "<p class=\"#{klass}\">"
      puts "#{msg}<br />"
    end

    # Puts a single link/page of a group as an <a> element.
    def puts_group_item(value, type:)
      klass = type == :broken ? 'broken_links_group_item' : 'ignored_links_group_item'
      href = escape_html(build_url(value))

      puts "<a class=\"#{klass}\" href=\"#{href}\">#{escape_html(value)}</a><br />"
    end

    # Returns the URL mapped to link in the broken link map, falling back to
    # link itself when there is no mapping.
    def build_url(link)
      @broken_link_map[link] || link
    end

    # Returns value (converted via #to_s) with the HTML special characters
    # escaped, making it safe for both attribute values and element bodies.
    def escape_html(value)
      ERB::Util.html_escape(value)
    end
  end
end
@@ -0,0 +1,76 @@
# frozen_string_literal: true

module BrokenLinkFinder
  # Generic reporter class to be inherited from by format specific reporters.
  # Sub-classes are expected to override #call (and re-alias #report).
  class Reporter
    # The amount of pages/links to display when verbose is false.
    NUM_VALUES = 3

    # Returns a new Reporter instance.
    # stream is any Object that responds to :puts and :print.
    # sort must be either :page or :link.
    # Raises a RuntimeError if stream or sort is invalid.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      unless stream.respond_to?(:puts) && stream.respond_to?(:print)
        raise 'stream must respond_to? :puts and :print'
      end
      unless %i[page link].include?(sort)
        raise "sort by either :page or :link, not #{sort}"
      end

      @stream          = stream
      @sort            = sort
      @broken_links    = broken_links
      @ignored_links   = ignored_links
      @broken_link_map = broken_link_map
      @crawl_stats     = crawl_stats
    end

    # Pretty print a report detailing the full link summary.
    # Always raises here; the keywords only document the signature that
    # sub-classes implement.
    def call(broken_verbose: true, ignored_verbose: false)
      raise 'Not implemented by parent class'
    end

    alias_method :report, :call

    protected

    # Return true if the sort is by page.
    def sort_by_page?
      @sort == :page
    end

    # Returns the key/value statistics of hash e.g. the number of keys and
    # combined unique values. The hash should be of the format:
    # { 'str' => [...] }.
    # The counts are ordered so that the result can always be used like:
    # `num_pages, num_links = get_hash_stats(links)`.
    def get_hash_stats(hash)
      num_keys   = hash.size
      num_values = hash.values.flatten.uniq.size

      # When sorting by page the keys are the pages, otherwise the links.
      if sort_by_page?
        [num_keys, num_values]
      else
        [num_values, num_keys]
      end
    end

    # Prints the text without a trailing newline.
    # Defaults to an empty String i.e. nothing visible is written.
    def print(text = '')
      @stream.print(text)
    end

    # Prints the text + \n. Defaults to a blank line.
    def puts(text = '')
      @stream.puts(text)
    end

    # Prints text + \n\n.
    def putsn(text)
      puts(text)
      puts
    end

    # Prints \n + text + \n.
    def nputs(text)
      puts
      puts(text)
    end
  end
end
@@ -0,0 +1,88 @@
# frozen_string_literal: true

module BrokenLinkFinder
  # Reporter which writes the link summary as plain text.
  class TextReporter < Reporter
    # Builds a new TextReporter.
    # stream must respond to both :puts and :print.
    def initialize(stream, sort,
                   broken_links, ignored_links,
                   broken_link_map, crawl_stats)
      super
    end

    # Writes the crawl summary followed by the broken and then the ignored
    # links. Always returns nil.
    def call(broken_verbose: true, ignored_verbose: false)
      report_crawl_summary
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      nil
    end

    alias_method :report, :call

    private

    # Writes which URL was crawled along with the crawl totals.
    def report_crawl_summary
      pages    = @crawl_stats[:num_pages]
      links    = @crawl_stats[:num_links]
      duration = @crawl_stats[:duration]&.truncate(2)

      puts "Crawled #{@crawl_stats[:url]}"
      putsn "#{pages} page(s) containing #{links} unique link(s) in #{duration} seconds"
    end

    # Writes every broken link group, or the good news when there are none.
    def report_broken_links(verbose: true)
      return puts('Good news, there are no broken links!') if @broken_links.empty?

      num_pages, num_links = get_hash_stats(@broken_links)
      puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"

      report_groups(@broken_links, verbose, 'remove --concise to see them all') do |key|
        if sort_by_page?
          "The following broken links were found on '#{key}':"
        else
          "The broken link '#{key}' was found on the following pages:"
        end
      end
    end

    # Writes every ignored link group; writes nothing when there are none.
    def report_ignored_links(verbose: false)
      return unless @ignored_links.any?

      num_pages, num_links = get_hash_stats(@ignored_links)
      nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"

      report_groups(@ignored_links, verbose, 'use --verbose to see them all') do |key|
        if sort_by_page?
          "The following links were ignored on '#{key}':"
        else
          "The link '#{key}' was ignored on the following pages:"
        end
      end
    end

    # For each key of groups: writes the heading yielded by the block and then
    # the key's values. Unless verbose, only the first NUM_VALUES values are
    # written and the remainder is summarised in one line ending with hint.
    def report_groups(groups, verbose, hint)
      groups.each do |key, values|
        nputs yield(key)

        if verbose || values.length <= NUM_VALUES
          values.each { |value| puts value }
          next
        end

        # Only print N values and summarise the rest.
        NUM_VALUES.times { |index| puts values[index] }

        remaining = values.length - NUM_VALUES
        objects   = sort_by_page? ? 'link(s)' : 'page(s)'
        puts "+ #{remaining} other #{objects}, #{hint}"
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- VERSION = '0.9.4'
4
+ VERSION = '0.12.0'
5
5
  end
@@ -1,11 +1,31 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # We extract all the Document's links, not just the links to other webpages.
4
- Wgit::Document.define_extension(
3
+ # Define a method on each doc for recording unparsable links.
4
+ # Unparsable links are recorded as broken links by Finder.
5
+ class Wgit::Document
6
+ def unparsable_links
7
+ @unparsable_links ||= []
8
+ end
9
+ end
10
+
11
+ # Returns a Wgit::Url or nil (if link is unparsable).
12
+ # A proc is preferable to a function to avoid polluting the global namespace.
13
+ parse_link = lambda do |doc, link|
14
+ Wgit::Url.new(link)
15
+ rescue StandardError
16
+ doc.unparsable_links << link
17
+ nil
18
+ end
19
+
20
+ # Define a custom extractor for all page links we're interested in checking.
21
+ Wgit::Document.define_extractor(
5
22
  :all_links,
6
- '//*/@href | //*/@src', # Any element with a href or src attribute.
23
+ lambda { BrokenLinkFinder::link_xpath },
7
24
  singleton: false,
8
25
  text_content_only: true
9
- ) do |links|
10
- links.uniq.to_urls
26
+ ) do |links, doc|
27
+ links
28
+ .uniq
29
+ .map { |link| parse_link.call(doc, link) }
30
+ .compact
11
31
  end
@@ -0,0 +1,14 @@
# frozen_string_literal: true

module BrokenLinkFinder
  # Matches the href and src attributes of every element inside <body>
  # e.g. those of <a>, <img>, <script> etc.
  DEFAULT_LINK_XPATH = '/html/body//*/@href | /html/body//*/@src'

  class << self
    # Returns the xpath currently used to extract links from a crawled page.
    def link_xpath
      @link_xpath
    end

    # Overrides the xpath used to extract links from a crawled page.
    def link_xpath=(xpath)
      @link_xpath = xpath
    end
  end

  # Start out with the default until overridden by the user.
  self.link_xpath = DEFAULT_LINK_XPATH
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: broken_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.4
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-11-02 00:00:00.000000000 Z
11
+ date: 2021-04-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -72,14 +72,14 @@ dependencies:
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '10.0'
75
+ version: '13.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '10.0'
82
+ version: '13.0'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: webmock
85
85
  requirement: !ruby/object:Gem::Requirement
@@ -100,42 +100,42 @@ dependencies:
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: 0.20.3
103
+ version: '0.20'
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: 0.20.3
110
+ version: '0.20'
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: thread
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: 0.2.0
117
+ version: '0.2'
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: 0.2.0
124
+ version: '0.2'
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: wgit
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
129
  - - "~>"
130
130
  - !ruby/object:Gem::Version
131
- version: 0.5.0
131
+ version: '0.10'
132
132
  type: :runtime
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
- version: 0.5.0
138
+ version: '0.10'
139
139
  description: Finds a website's broken links using the 'wgit' gem and reports back
140
140
  to you with a summary.
141
141
  email: michael.telford@live.com
@@ -159,15 +159,22 @@ files:
159
159
  - exe/broken_link_finder
160
160
  - lib/broken_link_finder.rb
161
161
  - lib/broken_link_finder/finder.rb
162
- - lib/broken_link_finder/reporter.rb
162
+ - lib/broken_link_finder/link_manager.rb
163
+ - lib/broken_link_finder/reporter/html_reporter.rb
164
+ - lib/broken_link_finder/reporter/reporter.rb
165
+ - lib/broken_link_finder/reporter/text_reporter.rb
163
166
  - lib/broken_link_finder/version.rb
164
167
  - lib/broken_link_finder/wgit_extensions.rb
168
+ - lib/broken_link_finder/xpath.rb
165
169
  - load.rb
166
170
  homepage: https://github.com/michaeltelford/broken-link-finder
167
171
  licenses:
168
172
  - MIT
169
173
  metadata:
170
174
  source_code_uri: https://github.com/michaeltelford/broken-link-finder
175
+ changelog_uri: https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md
176
+ bug_tracker_uri: https://github.com/michaeltelford/broken-link-finder/issues
177
+ documentation_uri: https://www.rubydoc.info/gems/broken_link_finder
171
178
  allowed_push_host: https://rubygems.org
172
179
  post_install_message: Added the executable 'broken_link_finder' to $PATH
173
180
  rdoc_options: []
@@ -184,9 +191,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
184
191
  - !ruby/object:Gem::Version
185
192
  version: '0'
186
193
  requirements: []
187
- rubyforge_project:
188
- rubygems_version: 2.7.6
189
- signing_key:
194
+ rubygems_version: 3.1.2
195
+ signing_key:
190
196
  specification_version: 4
191
197
  summary: Finds a website's broken links and reports back to you with a summary.
192
198
  test_files: []