broken_link_finder 0.9.4 → 0.12.0

@@ -9,30 +9,41 @@ class BrokenLinkFinderCLI < Thor
  desc 'crawl [URL]', 'Find broken links at the URL'
  option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
  option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+ option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
+ option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
  option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
  option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
  option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
  def crawl(url)
  url = "http://#{url}" unless url.start_with?('http')

+ report_type = options[:html] ? :html : :text
  sort_by = options[:sort_by_link] ? :link : :page
  max_threads = options[:threads]
  broken_verbose = !options[:concise]
  ignored_verbose = options[:verbose]

+ BrokenLinkFinder.link_xpath = options[:xpath]
  finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
  options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
- finder.pretty_print_link_report(
- broken_verbose: broken_verbose,
+ finder.report(
+ type: report_type,
+ broken_verbose: broken_verbose,
  ignored_verbose: ignored_verbose
  )
- rescue Exception => e
+
+ exit 0
+ rescue StandardError => e
  puts "An error has occurred: #{e.message}"
+
+ exit 1
  end

  desc 'version', 'Display the currently installed version'
  def version
  puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+ exit 0
  end
  end
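
To put the new options in context, here is a rough Ruby sketch of the library calls the rewritten crawl task now makes. It only uses API visible in this diff (BrokenLinkFinder.link_xpath=, Finder#crawl_site / #crawl_page and Finder#report); the URL and XPath values are placeholders.

    require 'broken_link_finder'

    # Placeholder values standing in for the parsed Thor options.
    BrokenLinkFinder.link_xpath = '//a/@href'

    finder = BrokenLinkFinder::Finder.new(sort: :link, max_threads: 30)
    finder.crawl_site('http://example.com') # or crawl_page, without --recursive
    finder.report(type: :html, broken_verbose: true, ignored_verbose: false)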
 
@@ -2,8 +2,14 @@

  require 'wgit'
  require 'wgit/core_ext'
+ require 'thread/pool'
+ require 'set'

- require_relative './broken_link_finder/wgit_extensions'
  require_relative './broken_link_finder/version'
- require_relative './broken_link_finder/reporter'
+ require_relative './broken_link_finder/xpath'
+ require_relative './broken_link_finder/wgit_extensions'
+ require_relative './broken_link_finder/link_manager'
+ require_relative './broken_link_finder/reporter/reporter'
+ require_relative './broken_link_finder/reporter/text_reporter'
+ require_relative './broken_link_finder/reporter/html_reporter'
  require_relative './broken_link_finder/finder'
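
The single Reporter has been split into a shared base class plus text and HTML subclasses, all loaded up front so Finder#report can choose between them. As a hedged sketch only (the constructor and #call signatures are taken from the finder.rb hunk below; the empty Hash stands in for the broken link map purely for illustration), a reporter can also be driven directly:

    require 'broken_link_finder'

    finder = BrokenLinkFinder.new(sort: :page)
    finder.crawl_page('http://example.com')

    # Same constructor/call shape that Finder#report uses internally.
    reporter = BrokenLinkFinder::TextReporter.new(
      STDOUT, :page, finder.broken_links, finder.ignored_links,
      {}, finder.crawl_stats
    )
    reporter.call(broken_verbose: true, ignored_verbose: false)
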
@@ -1,228 +1,227 @@
  # frozen_string_literal: true

- require_relative 'reporter'
- require 'thread/pool'
- require 'set'
-
  module BrokenLinkFinder
- DEFAULT_MAX_THREADS = 100
+ DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+ SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.

  # Alias for BrokenLinkFinder::Finder.new.
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  Finder.new(sort: sort, max_threads: max_threads)
  end

+ # Class responsible for finding broken links on a page or site.
  class Finder
- attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled, :max_threads
+ # The collection key - either :page or :link.
+ attr_reader :sort
+
+ # The max number of threads created during #crawl_site - one thread per page.
+ attr_reader :max_threads

- # Creates a new Finder instance.
- def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
+ # Returns a new Finder instance.
+ def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  raise "Sort by either :page or :link, not #{sort}" \
  unless %i[page link].include?(sort)

  @sort = sort
  @max_threads = max_threads
- @lock = Mutex.new
  @crawler = Wgit::Crawler.new
+ @manager = BrokenLinkFinder::LinkManager.new(@sort)
+ end

- clear_links
+ # Returns the current broken links.
+ def broken_links
+ @manager.broken_links
  end

- # Clear/empty the link collection Hashes.
- def clear_links
- @broken_links = {}
- @ignored_links = {}
- @total_links_crawled = 0
- @all_broken_links = Set.new
- @all_intact_links = Set.new
+ # Returns the current ignored links.
+ def ignored_links
+ @manager.ignored_links
  end

- # Finds broken links within a single page and appends them to the
- # @broken_links array. Returns true if at least one broken link was found.
+ # Returns the current crawl stats.
+ def crawl_stats
+ @manager.crawl_stats
+ end
+
+ # Finds broken links within a single page and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
  def crawl_url(url)
- clear_links
+ @manager.empty
+
+ start = Time.now
+ url = url.to_url

- url = url.to_url
- doc = @crawler.crawl(url)
+ # We dup the url to avoid recording any redirects.
+ doc = @crawler.crawl(url.dup)

  # Ensure the given page url is valid.
  raise "Invalid or broken URL: #{url}" unless doc

  # Get all page links and determine which are broken.
  find_broken_links(doc)
+ retry_broken_links

- sort_links
- set_total_links_crawled
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: [url], start: start)

- @broken_links.any?
+ broken_links.any?
  end

- # Finds broken links within an entire site and appends them to the
- # @broken_links array. Returns a tuple containing a Boolean of true if
- # at least one broken link was found and an Array of all pages crawled.
+ # Finds broken links within an entire site and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
- def crawl_site(url)
- clear_links
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+ @manager.empty

- url = url.to_url
- pool = Thread.pool(@max_threads)
- crawled_pages = []
+ start = Time.now
+ url = url.to_url
+ pool = Thread.pool(@max_threads)
+ crawled = Set.new

  # Crawl the site's HTML web pages looking for links.
- externals = @crawler.crawl_site(url) do |doc|
- crawled_pages << doc.url
+ # We dup the url to avoid recording any redirects.
+ paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+ externals = @crawler.crawl_site(url.dup, **paths) do |doc|
+ crawled << doc.url
  next unless doc

  # Start a thread for each page, checking for broken links.
  pool.process { find_broken_links(doc) }
  end

+ # Wait for all threads to finish, even if url was invalid.
+ pool.shutdown
+
  # Ensure the given website url is valid.
  raise "Invalid or broken URL: #{url}" unless externals

- # Wait for all threads to finish.
- pool.shutdown
+ retry_broken_links

- sort_links
- set_total_links_crawled
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)

- [@broken_links.any?, crawled_pages.uniq]
+ broken_links.any?
+ ensure
+ pool.shutdown if defined?(pool)
  end

- # Pretty prints the link report into a stream e.g. STDOUT or a file,
+ # Outputs the link report into a stream e.g. STDOUT or a file,
  # anything that respond_to? :puts. Defaults to STDOUT.
- # Returns true if there were broken links and vice versa.
- def pretty_print_link_report(
- stream = STDOUT,
- broken_verbose: true,
- ignored_verbose: false
- )
- reporter = BrokenLinkFinder::Reporter.new(
- stream, @sort, @broken_links, @ignored_links
- )
- reporter.pretty_print_link_report(
- broken_verbose: broken_verbose,
- ignored_verbose: ignored_verbose
- )
-
- @broken_links.any?
+ def report(stream = STDOUT, type: :text,
+ broken_verbose: true, ignored_verbose: false)
+ klass = case type
+ when :text
+ BrokenLinkFinder::TextReporter
+ when :html
+ BrokenLinkFinder::HTMLReporter
+ else
+ raise "The type: must be :text or :html, not: :#{type}"
+ end
+
+ reporter = klass.new(stream, @sort,
+ broken_links, ignored_links,
+ @manager.broken_link_map, crawl_stats)
+ reporter.call(broken_verbose: broken_verbose,
+ ignored_verbose: ignored_verbose)
  end

  private

  # Finds which links are unsupported or broken and records the details.
  def find_broken_links(page)
+ record_unparsable_links(page) # Record them as broken.
+
  links = get_supported_links(page)

  # Iterate over the supported links checking if they're broken or not.
  links.each do |link|
- # Check if the link has already been processed previously.
- next if @all_intact_links.include?(link)
+ # Skip if the link has been encountered previously.
+ next if @manager.all_intact_links.include?(link)

- if @all_broken_links.include?(link)
- append_broken_link(page.url, link)
+ if @manager.all_broken_links.include?(link)
+ # The link has already been proven broken so simply record it.
+ @manager.append_broken_link(page, link, map: false)
  next
  end

- # The link hasn't been processed before so we crawl it.
+ # The link hasn't been encountered before so we crawl it.
  link_doc = crawl_link(page, link)

- # Determine if the crawled link is broken or not.
- if link_doc.nil? ||
- @crawler.last_response.not_found? ||
- has_broken_anchor(link_doc)
- append_broken_link(page.url, link)
+ # Determine if the crawled link is broken or not and record it.
+ if link_broken?(link_doc)
+ @manager.append_broken_link(page, link)
  else
- @lock.synchronize { @all_intact_links << link }
+ @manager.append_intact_link(link)
  end
  end

  nil
  end

- # Report and reject any non supported links. Any link that is absolute and
- # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
- def get_supported_links(doc)
- doc.all_links
- .reject do |link|
- if link.is_absolute? && !link.start_with?('http')
- append_ignored_link(doc.url, link)
- true
- end
- end
- end
-
- # Makes the link absolute and crawls it, returning its Wgit::Document.
- def crawl_link(doc, link)
- link = link.prefix_base(doc)
- @crawler.crawl(link)
- end
+ # Implements a retry mechanism for each of the broken links found.
+ # Removes any broken links found to be working OK.
+ def retry_broken_links
+ sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.

- # Returns true if the link is/contains a broken anchor/fragment.
- def has_broken_anchor(doc)
- raise 'link document is nil' unless doc
+ @manager.broken_link_map.select! do |link, href|
+ # Don't retry unparsable links (which are Strings).
+ next(true) unless href.is_a?(Wgit::Url)

- fragment = doc.url.fragment
- return false if fragment.nil? || fragment.empty?
+ doc = @crawler.crawl(href.dup)

- doc.xpath("//*[@id='#{fragment}']").empty?
+ if link_broken?(doc)
+ true
+ else
+ @manager.remove_broken_link(link)
+ false
+ end
+ end
  end

- # Append key => [value] to @broken_links.
- def append_broken_link(url, link)
- key, value = get_key_value(url, link)
-
- @lock.synchronize do
- @broken_links[key] = [] unless @broken_links[key]
- @broken_links[key] << value
-
- @all_broken_links << link
+ # Record each unparsable link as a broken link.
+ def record_unparsable_links(doc)
+ doc.unparsable_links.each do |link|
+ # We map the link ourselves because link is a String, not a Wgit::Url.
+ @manager.append_broken_link(doc, link, map: false)
+ @manager.broken_link_map[link] = link
  end
  end

- # Append key => [value] to @ignored_links.
- def append_ignored_link(url, link)
- key, value = get_key_value(url, link)
-
- @lock.synchronize do
- @ignored_links[key] = [] unless @ignored_links[key]
- @ignored_links[key] << value
+ # Report and reject any non supported links. Any link that is absolute and
+ # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
+ def get_supported_links(doc)
+ doc.all_links.reject do |link|
+ if link.is_absolute? && !link.start_with?('http')
+ @manager.append_ignored_link(doc.url, link)
+ true
+ end
  end
  end

- # Returns the correct key value depending on the @sort type.
- # @sort == :page ? [url, link] : [link, url]
- def get_key_value(url, link)
- case @sort
- when :page
- [url, link]
- when :link
- [link, url]
- else
- raise "Unsupported sort type: #{sort}"
- end
+ # Make the link absolute and crawl it, returning its Wgit::Document.
+ def crawl_link(doc, link)
+ link = link.make_absolute(doc)
+ @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
  end

- # Sort keys and values alphabetically.
- def sort_links
- @broken_links.values.map(&:uniq!)
- @ignored_links.values.map(&:uniq!)
+ # Return if the crawled link is broken or not.
+ def link_broken?(doc)
+ doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
+ end

- @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
- @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+ # Returns true if the link is/contains a broken anchor/fragment.
+ # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
+ def has_broken_anchor(doc)
+ raise 'The link document is nil' unless doc

- @broken_links.each { |_k, v| v.sort! }
- @ignored_links.each { |_k, v| v.sort! }
- end
+ fragment = doc.url.fragment
+ return false if fragment.nil? || fragment.empty?

- # Sets and returns the total number of links crawled.
- def set_total_links_crawled
- @total_links_crawled = @all_broken_links.size + @all_intact_links.size
+ doc.xpath("//*[@id='#{fragment}']").empty?
  end

- alias crawl_page crawl_url
- alias crawl_r crawl_site
- alias pretty_print_link_summary pretty_print_link_report
+ alias crawl_page crawl_url
+ alias crawl_r crawl_site
  end
  end
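
For context on the reworked Finder above, a short hedged sketch of a recursive crawl using the new keyword arguments and crawl stats. The URL and path filters are placeholder values, and the stat keys come from LinkManager#tally in the next hunk.

    require 'broken_link_finder'

    finder = BrokenLinkFinder::Finder.new(sort: :page, max_threads: 50)

    # crawl_site now returns just a Boolean and accepts path filters.
    found = finder.crawl_site('http://example.com',
                              allow_paths: 'blog/', disallow_paths: 'admin/')

    stats = finder.crawl_stats
    puts "Crawled #{stats[:num_pages]} pages in #{stats[:duration]} seconds"
    finder.report(type: :text) if found
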
@@ -0,0 +1,137 @@
+ # frozen_string_literal: true
+
+ module BrokenLinkFinder
+ # Class responsible for handling the link collection logic.
+ class LinkManager
+ # Used for mapping pages to broken links.
+ attr_reader :broken_links
+
+ # Used for mapping pages to ignored links.
+ attr_reader :ignored_links
+
+ # Used to record crawl statistics e.g. duration etc.
+ attr_reader :crawl_stats
+
+ # Used to map a link (as is) to its absolute (crawlable) form.
+ attr_reader :broken_link_map
+
+ # Used to prevent crawling a broken link twice.
+ attr_reader :all_broken_links
+
+ # Used to prevent crawling an intact link twice.
+ attr_reader :all_intact_links
+
+ # Used for building crawl statistics.
+ attr_reader :all_ignored_links
+
+ # Returns a new LinkManager instance with empty link collections.
+ def initialize(sort)
+ raise "Sort by either :page or :link, not #{sort}" \
+ unless %i[page link].include?(sort)
+
+ @sort = sort
+ @lock = Mutex.new
+
+ empty # Initialises the link collections.
+ end
+
+ # Initialise/empty the link collection objects.
+ def empty
+ @broken_links = {}
+ @ignored_links = {}
+ @crawl_stats = {}
+ @broken_link_map = {}
+ @all_broken_links = Set.new
+ @all_intact_links = Set.new
+ @all_ignored_links = Set.new
+ end
+
+ # Append key => [value] to the broken link collections.
+ # If map: true, then the link will also be recorded in @broken_link_map.
+ def append_broken_link(doc, link, map: true)
+ key, value = get_key_value(doc.url, link)
+
+ @lock.synchronize do
+ @broken_links[key] = [] unless @broken_links[key]
+ @broken_links[key] << value
+
+ @all_broken_links << link
+
+ @broken_link_map[link] = link.make_absolute(doc) if map
+ end
+ end
+
+ # Remove the broken link from the necessary collections.
+ def remove_broken_link(link)
+ @lock.synchronize do
+ if @sort == :page
+ @broken_links.each { |_k, links| links.delete(link) }
+ @broken_links.delete_if { |_k, links| links.empty? }
+ else
+ @broken_links.delete(link)
+ end
+
+ @all_broken_links.delete(link)
+ @all_intact_links << link
+ end
+ end
+
+ # Append key => [value] to the ignored link collections.
+ def append_ignored_link(url, link)
+ key, value = get_key_value(url, link)
+
+ @lock.synchronize do
+ @ignored_links[key] = [] unless @ignored_links[key]
+ @ignored_links[key] << value
+
+ @all_ignored_links << link
+ end
+ end
+
+ # Append link to @all_intact_links.
+ def append_intact_link(link)
+ @lock.synchronize { @all_intact_links << link }
+ end
+
+ # Sorts the link collection's keys and values alphabetically.
+ def sort
+ @broken_links.values.map(&:uniq!)
+ @ignored_links.values.map(&:uniq!)
+
+ @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
+ @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+
+ @broken_links.each { |_k, v| v.sort! }
+ @ignored_links.each { |_k, v| v.sort! }
+ end
+
+ # Tally's up various statistics about the crawl and its links.
+ def tally(url:, pages_crawled:, start:)
+ @crawl_stats[:url] = url
+ @crawl_stats[:pages_crawled] = pages_crawled
+ @crawl_stats[:num_pages] = pages_crawled.size
+ @crawl_stats[:num_links] = (
+ @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+ )
+ @crawl_stats[:num_broken_links] = @all_broken_links.size
+ @crawl_stats[:num_intact_links] = @all_intact_links.size
+ @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+ @crawl_stats[:duration] = Time.now - start
+ end
+
+ private
+
+ # Returns the correct key value depending on the @sort type.
+ # @sort == :page ? [url, link] : [link, url]
+ def get_key_value(url, link)
+ case @sort
+ when :page
+ [url, link]
+ when :link
+ [link, url]
+ else
+ raise "Unsupported sort type: #{sort}"
+ end
+ end
+ end
+ end
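
Finally, a minimal hedged sketch of LinkManager used in isolation (in practice only Finder drives it); the URL and link values are placeholders.

    require 'broken_link_finder'

    manager = BrokenLinkFinder::LinkManager.new(:page)

    # Record an ignored (unsupported) link, then build the crawl statistics.
    manager.append_ignored_link('http://example.com', 'mailto:me@example.com')
    manager.sort
    manager.tally(url: 'http://example.com',
                  pages_crawled: ['http://example.com'], start: Time.now)

    manager.ignored_links                   # => {"http://example.com"=>["mailto:me@example.com"]}
    manager.crawl_stats[:num_ignored_links] # => 1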