broken_link_finder 0.9.4 → 0.12.0

This diff compares the content of two publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
@@ -9,30 +9,41 @@ class BrokenLinkFinderCLI < Thor
   desc 'crawl [URL]', 'Find broken links at the URL'
   option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
   option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+  option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
+  option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
   option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
   option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
   option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
   def crawl(url)
     url = "http://#{url}" unless url.start_with?('http')
 
+    report_type = options[:html] ? :html : :text
     sort_by = options[:sort_by_link] ? :link : :page
     max_threads = options[:threads]
     broken_verbose = !options[:concise]
     ignored_verbose = options[:verbose]
 
+    BrokenLinkFinder.link_xpath = options[:xpath]
     finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
-    finder.pretty_print_link_report(
-      broken_verbose: broken_verbose,
+    finder.report(
+      type: report_type,
+      broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
-  rescue Exception => e
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end
 
   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end
 
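For context, a sketch of how the updated CLI might be invoked once these options land. It is based only on the option definitions above; the domain and output redirection are illustrative:

    # Crawl a single page and print a text report (the default):
    broken_link_finder crawl example.com

    # Crawl the whole site and produce an HTML report instead:
    broken_link_finder crawl example.com -r --html > report.html

    # Override the XPath used to extract links from each crawled page:
    broken_link_finder crawl example.com -x '//a/@href'

Two behavioural notes: crawl now exits 0 on success and 1 on error, and it rescues StandardError rather than Exception, so interrupts such as Ctrl+C are no longer swallowed by the error handler.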
38
49
 
@@ -2,8 +2,14 @@
 
 require 'wgit'
 require 'wgit/core_ext'
+require 'thread/pool'
+require 'set'
 
-require_relative './broken_link_finder/wgit_extensions'
 require_relative './broken_link_finder/version'
-require_relative './broken_link_finder/reporter'
+require_relative './broken_link_finder/xpath'
+require_relative './broken_link_finder/wgit_extensions'
+require_relative './broken_link_finder/link_manager'
+require_relative './broken_link_finder/reporter/reporter'
+require_relative './broken_link_finder/reporter/text_reporter'
+require_relative './broken_link_finder/reporter/html_reporter'
 require_relative './broken_link_finder/finder'
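
The reshuffled requires hint at the new structure: xpath and link_manager must load before finder, and the single reporter file becomes a reporter/ directory whose base class loads before its two subclasses. A hypothetical outline of that hierarchy (the real classes live in the files required above; the constructor arity is taken from Finder#report in the next hunk):

    module BrokenLinkFinder
      # Base class, presumably defined in reporter/reporter.
      class Reporter
        def initialize(stream, sort, broken_links, ignored_links,
                       broken_link_map, crawl_stats)
          @stream = stream
          @sort   = sort
          # ...store the link collections and stats for rendering...
        end

        # Subclasses render the report as text or HTML.
        def call(broken_verbose: true, ignored_verbose: false)
          raise NotImplementedError
        end
      end

      class TextReporter < Reporter; end # reporter/text_reporter
      class HTMLReporter < Reporter; end # reporter/html_reporter
    end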
@@ -1,228 +1,227 @@
 # frozen_string_literal: true
 
-require_relative 'reporter'
-require 'thread/pool'
-require 'set'
-
 module BrokenLinkFinder
-  DEFAULT_MAX_THREADS = 100
+  DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+  SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.
 
   # Alias for BrokenLinkFinder::Finder.new.
   def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
     Finder.new(sort: sort, max_threads: max_threads)
   end
 
+  # Class responsible for finding broken links on a page or site.
   class Finder
-    attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled, :max_threads
+    # The collection key - either :page or :link.
+    attr_reader :sort
+
+    # The max number of threads created during #crawl_site - one thread per page.
+    attr_reader :max_threads
 
-    # Creates a new Finder instance.
-    def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
+    # Returns a new Finder instance.
+    def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
       raise "Sort by either :page or :link, not #{sort}" \
       unless %i[page link].include?(sort)
 
       @sort = sort
       @max_threads = max_threads
-      @lock = Mutex.new
       @crawler = Wgit::Crawler.new
+      @manager = BrokenLinkFinder::LinkManager.new(@sort)
+    end
 
-      clear_links
+    # Returns the current broken links.
+    def broken_links
+      @manager.broken_links
     end
 
-    # Clear/empty the link collection Hashes.
-    def clear_links
-      @broken_links = {}
-      @ignored_links = {}
-      @total_links_crawled = 0
-      @all_broken_links = Set.new
-      @all_intact_links = Set.new
+    # Returns the current ignored links.
+    def ignored_links
+      @manager.ignored_links
     end
 
-    # Finds broken links within a single page and appends them to the
-    # @broken_links array. Returns true if at least one broken link was found.
+    # Returns the current crawl stats.
+    def crawl_stats
+      @manager.crawl_stats
+    end
+
+    # Finds broken links within a single page and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
     def crawl_url(url)
-      clear_links
+      @manager.empty
+
+      start = Time.now
+      url = url.to_url
 
-      url = url.to_url
-      doc = @crawler.crawl(url)
+      # We dup the url to avoid recording any redirects.
+      doc = @crawler.crawl(url.dup)
 
       # Ensure the given page url is valid.
       raise "Invalid or broken URL: #{url}" unless doc
 
       # Get all page links and determine which are broken.
       find_broken_links(doc)
+      retry_broken_links
 
-      sort_links
-      set_total_links_crawled
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: [url], start: start)
 
-      @broken_links.any?
+      broken_links.any?
     end
 
-    # Finds broken links within an entire site and appends them to the
-    # @broken_links array. Returns a tuple containing a Boolean of true if
-    # at least one broken link was found and an Array of all pages crawled.
+    # Finds broken links within an entire site and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
-    def crawl_site(url)
-      clear_links
+    def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+      @manager.empty
 
-      url = url.to_url
-      pool = Thread.pool(@max_threads)
-      crawled_pages = []
+      start = Time.now
+      url = url.to_url
+      pool = Thread.pool(@max_threads)
+      crawled = Set.new
 
       # Crawl the site's HTML web pages looking for links.
-      externals = @crawler.crawl_site(url) do |doc|
-        crawled_pages << doc.url
+      # We dup the url to avoid recording any redirects.
+      paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      externals = @crawler.crawl_site(url.dup, **paths) do |doc|
+        crawled << doc.url
         next unless doc
 
         # Start a thread for each page, checking for broken links.
         pool.process { find_broken_links(doc) }
       end
 
+      # Wait for all threads to finish, even if url was invalid.
+      pool.shutdown
+
       # Ensure the given website url is valid.
       raise "Invalid or broken URL: #{url}" unless externals
 
-      # Wait for all threads to finish.
-      pool.shutdown
+      retry_broken_links
 
-      sort_links
-      set_total_links_crawled
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)
 
-      [@broken_links.any?, crawled_pages.uniq]
+      broken_links.any?
+    ensure
+      pool.shutdown if defined?(pool)
     end
 
-    # Pretty prints the link report into a stream e.g. STDOUT or a file,
+    # Outputs the link report into a stream e.g. STDOUT or a file,
     # anything that respond_to? :puts. Defaults to STDOUT.
-    # Returns true if there were broken links and vice versa.
-    def pretty_print_link_report(
-      stream = STDOUT,
-      broken_verbose: true,
-      ignored_verbose: false
-    )
-      reporter = BrokenLinkFinder::Reporter.new(
-        stream, @sort, @broken_links, @ignored_links
-      )
-      reporter.pretty_print_link_report(
-        broken_verbose: broken_verbose,
-        ignored_verbose: ignored_verbose
-      )
-
-      @broken_links.any?
+    def report(stream = STDOUT, type: :text,
+               broken_verbose: true, ignored_verbose: false)
+      klass = case type
+              when :text
+                BrokenLinkFinder::TextReporter
+              when :html
+                BrokenLinkFinder::HTMLReporter
+              else
+                raise "The type: must be :text or :html, not: :#{type}"
+              end
+
+      reporter = klass.new(stream, @sort,
+                           broken_links, ignored_links,
+                           @manager.broken_link_map, crawl_stats)
+      reporter.call(broken_verbose: broken_verbose,
+                    ignored_verbose: ignored_verbose)
     end
 
     private
 
     # Finds which links are unsupported or broken and records the details.
     def find_broken_links(page)
+      record_unparsable_links(page) # Record them as broken.
+
       links = get_supported_links(page)
 
       # Iterate over the supported links checking if they're broken or not.
       links.each do |link|
-        # Check if the link has already been processed previously.
-        next if @all_intact_links.include?(link)
+        # Skip if the link has been encountered previously.
+        next if @manager.all_intact_links.include?(link)
 
-        if @all_broken_links.include?(link)
-          append_broken_link(page.url, link)
+        if @manager.all_broken_links.include?(link)
+          # The link has already been proven broken so simply record it.
+          @manager.append_broken_link(page, link, map: false)
           next
         end
 
-        # The link hasn't been processed before so we crawl it.
+        # The link hasn't been encountered before so we crawl it.
         link_doc = crawl_link(page, link)
 
-        # Determine if the crawled link is broken or not.
-        if link_doc.nil? ||
-           @crawler.last_response.not_found? ||
-           has_broken_anchor(link_doc)
-          append_broken_link(page.url, link)
+        # Determine if the crawled link is broken or not and record it.
+        if link_broken?(link_doc)
+          @manager.append_broken_link(page, link)
         else
-          @lock.synchronize { @all_intact_links << link }
+          @manager.append_intact_link(link)
         end
       end
 
       nil
     end
 
-    # Report and reject any non supported links. Any link that is absolute and
-    # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
-    def get_supported_links(doc)
-      doc.all_links
-         .reject do |link|
-        if link.is_absolute? && !link.start_with?('http')
-          append_ignored_link(doc.url, link)
-          true
-        end
-      end
-    end
-
-    # Makes the link absolute and crawls it, returning its Wgit::Document.
-    def crawl_link(doc, link)
-      link = link.prefix_base(doc)
-      @crawler.crawl(link)
-    end
+    # Implements a retry mechanism for each of the broken links found.
+    # Removes any broken links found to be working OK.
+    def retry_broken_links
+      sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
 
-    # Returns true if the link is/contains a broken anchor/fragment.
-    def has_broken_anchor(doc)
-      raise 'link document is nil' unless doc
+      @manager.broken_link_map.select! do |link, href|
+        # Don't retry unparsable links (which are Strings).
+        next(true) unless href.is_a?(Wgit::Url)
 
-      fragment = doc.url.fragment
-      return false if fragment.nil? || fragment.empty?
+        doc = @crawler.crawl(href.dup)
 
-      doc.xpath("//*[@id='#{fragment}']").empty?
+        if link_broken?(doc)
+          true
+        else
+          @manager.remove_broken_link(link)
+          false
+        end
+      end
     end
 
-    # Append key => [value] to @broken_links.
-    def append_broken_link(url, link)
-      key, value = get_key_value(url, link)
-
-      @lock.synchronize do
-        @broken_links[key] = [] unless @broken_links[key]
-        @broken_links[key] << value
-
-        @all_broken_links << link
+    # Record each unparsable link as a broken link.
+    def record_unparsable_links(doc)
+      doc.unparsable_links.each do |link|
+        # We map the link ourselves because link is a String, not a Wgit::Url.
+        @manager.append_broken_link(doc, link, map: false)
+        @manager.broken_link_map[link] = link
       end
     end
 
-    # Append key => [value] to @ignored_links.
-    def append_ignored_link(url, link)
-      key, value = get_key_value(url, link)
-
-      @lock.synchronize do
-        @ignored_links[key] = [] unless @ignored_links[key]
-        @ignored_links[key] << value
+    # Report and reject any non supported links. Any link that is absolute and
+    # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
+    def get_supported_links(doc)
+      doc.all_links.reject do |link|
+        if link.is_absolute? && !link.start_with?('http')
+          @manager.append_ignored_link(doc.url, link)
+          true
+        end
       end
     end
 
-    # Returns the correct key value depending on the @sort type.
-    # @sort == :page ? [url, link] : [link, url]
-    def get_key_value(url, link)
-      case @sort
-      when :page
-        [url, link]
-      when :link
-        [link, url]
-      else
-        raise "Unsupported sort type: #{sort}"
-      end
+    # Make the link absolute and crawl it, returning its Wgit::Document.
+    def crawl_link(doc, link)
+      link = link.make_absolute(doc)
+      @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
     end
 
-    # Sort keys and values alphabetically.
-    def sort_links
-      @broken_links.values.map(&:uniq!)
-      @ignored_links.values.map(&:uniq!)
+    # Return if the crawled link is broken or not.
+    def link_broken?(doc)
+      doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
+    end
 
-      @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
-      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+    # Returns true if the link is/contains a broken anchor/fragment.
+    # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
+    def has_broken_anchor(doc)
+      raise 'The link document is nil' unless doc
 
-      @broken_links.each { |_k, v| v.sort! }
-      @ignored_links.each { |_k, v| v.sort! }
-    end
+      fragment = doc.url.fragment
+      return false if fragment.nil? || fragment.empty?
 
-    # Sets and returns the total number of links crawled.
-    def set_total_links_crawled
-      @total_links_crawled = @all_broken_links.size + @all_intact_links.size
+      doc.xpath("//*[@id='#{fragment}']").empty?
     end
 
-    alias crawl_page crawl_url
-    alias crawl_r crawl_site
-    alias pretty_print_link_summary pretty_print_link_report
+    alias crawl_page crawl_url
+    alias crawl_r crawl_site
   end
 end
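
Two API changes stand out in this hunk: crawl_site now returns a plain boolean instead of a [boolean, pages] tuple (the crawled pages move into crawl_stats), and it gains allow_paths:/disallow_paths: keywords that are forwarded to Wgit::Crawler#crawl_site. There is also a new retry pass: after the initial crawl, each broken link is re-crawled once (after a 0.5 second pause) and dropped from the results if it responds OK. A sketch of the resulting library API, with an illustrative URL and path:

    require 'broken_link_finder'

    finder = BrokenLinkFinder.new(sort: :link, max_threads: 50)
    finder.crawl_site('http://example.com', allow_paths: 'articles/')

    finder.report(STDOUT, type: :html) # type: :text is the default
    puts finder.crawl_stats[:num_broken_links]
    puts finder.crawl_stats[:duration]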
@@ -0,0 +1,137 @@
+# frozen_string_literal: true
+
+module BrokenLinkFinder
+  # Class responsible for handling the link collection logic.
+  class LinkManager
+    # Used for mapping pages to broken links.
+    attr_reader :broken_links
+
+    # Used for mapping pages to ignored links.
+    attr_reader :ignored_links
+
+    # Used to record crawl statistics e.g. duration etc.
+    attr_reader :crawl_stats
+
+    # Used to map a link (as is) to its absolute (crawlable) form.
+    attr_reader :broken_link_map
+
+    # Used to prevent crawling a broken link twice.
+    attr_reader :all_broken_links
+
+    # Used to prevent crawling an intact link twice.
+    attr_reader :all_intact_links
+
+    # Used for building crawl statistics.
+    attr_reader :all_ignored_links
+
+    # Returns a new LinkManager instance with empty link collections.
+    def initialize(sort)
+      raise "Sort by either :page or :link, not #{sort}" \
+      unless %i[page link].include?(sort)
+
+      @sort = sort
+      @lock = Mutex.new
+
+      empty # Initialises the link collections.
+    end
+
+    # Initialise/empty the link collection objects.
+    def empty
+      @broken_links = {}
+      @ignored_links = {}
+      @crawl_stats = {}
+      @broken_link_map = {}
+      @all_broken_links = Set.new
+      @all_intact_links = Set.new
+      @all_ignored_links = Set.new
+    end
+
+    # Append key => [value] to the broken link collections.
+    # If map: true, then the link will also be recorded in @broken_link_map.
+    def append_broken_link(doc, link, map: true)
+      key, value = get_key_value(doc.url, link)
+
+      @lock.synchronize do
+        @broken_links[key] = [] unless @broken_links[key]
+        @broken_links[key] << value
+
+        @all_broken_links << link
+
+        @broken_link_map[link] = link.make_absolute(doc) if map
+      end
+    end
+
+    # Remove the broken link from the necessary collections.
+    def remove_broken_link(link)
+      @lock.synchronize do
+        if @sort == :page
+          @broken_links.each { |_k, links| links.delete(link) }
+          @broken_links.delete_if { |_k, links| links.empty? }
+        else
+          @broken_links.delete(link)
+        end
+
+        @all_broken_links.delete(link)
+        @all_intact_links << link
+      end
+    end
+
+    # Append key => [value] to the ignored link collections.
+    def append_ignored_link(url, link)
+      key, value = get_key_value(url, link)
+
+      @lock.synchronize do
+        @ignored_links[key] = [] unless @ignored_links[key]
+        @ignored_links[key] << value
+
+        @all_ignored_links << link
+      end
+    end
+
+    # Append link to @all_intact_links.
+    def append_intact_link(link)
+      @lock.synchronize { @all_intact_links << link }
+    end
+
+    # Sorts the link collection's keys and values alphabetically.
+    def sort
+      @broken_links.values.map(&:uniq!)
+      @ignored_links.values.map(&:uniq!)
+
+      @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
+      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+
+      @broken_links.each { |_k, v| v.sort! }
+      @ignored_links.each { |_k, v| v.sort! }
+    end
+
+    # Tally's up various statistics about the crawl and its links.
+    def tally(url:, pages_crawled:, start:)
+      @crawl_stats[:url] = url
+      @crawl_stats[:pages_crawled] = pages_crawled
+      @crawl_stats[:num_pages] = pages_crawled.size
+      @crawl_stats[:num_links] = (
+        @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+      )
+      @crawl_stats[:num_broken_links] = @all_broken_links.size
+      @crawl_stats[:num_intact_links] = @all_intact_links.size
+      @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+      @crawl_stats[:duration] = Time.now - start
+    end
+
+    private
+
+    # Returns the correct key value depending on the @sort type.
+    # @sort == :page ? [url, link] : [link, url]
+    def get_key_value(url, link)
+      case @sort
+      when :page
+        [url, link]
+      when :link
+        [link, url]
+      else
+        raise "Unsupported sort type: #{sort}"
+      end
+    end
+  end
+end
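
LinkManager centralises the bookkeeping Finder previously did inline: every mutation takes the @lock Mutex, so the page threads spawned in crawl_site can record links concurrently, and the all_broken_links/all_intact_links Sets are what let find_broken_links skip links it has already seen. An illustrative drive of the class in isolation (the URLs are made up; in practice Finder is the only caller):

    manager = BrokenLinkFinder::LinkManager.new(:page)

    manager.append_ignored_link('http://example.com'.to_url, 'mailto:me@example.com')
    manager.append_intact_link('http://example.com/about'.to_url)

    manager.sort
    manager.tally(url: 'http://example.com', pages_crawled: ['http://example.com'],
                  start: Time.now - 1)

    manager.crawl_stats[:num_links]         # => 2 (1 intact + 1 ignored)
    manager.crawl_stats[:num_ignored_links] # => 1
    manager.crawl_stats[:duration]          # => ~1.0 (seconds since start)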