broken_link_finder 0.10.0 → 0.12.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a53784c1bd2f75c18b3492ea782b4cc2e229a94f89afcf33b60ef633512554e
4
- data.tar.gz: 393dca220b7f00d72314c93e7b877e0412afdf784fa2e563bbecb2dc6c6b29f7
3
+ metadata.gz: 88b1e96f1de644a1a3c06ba7cc0ee1b53f75a3de6686b343e55028e8fa69da9f
4
+ data.tar.gz: e399ca05a4b0b9b2c0644b2846fa9dc6be6acd664e1bdc58758eb9ca7a5543cd
5
5
  SHA512:
6
- metadata.gz: c0d304e5b0a9258265c5c084c0a6e5819c169ba8eb02b3c6317a37784a9ca12982b0fc520c3cca1060fde60126ee936708d7891c69133c5d72c9c0287a79b3f5
7
- data.tar.gz: c21a4aec2c077e2617fb625debad28f746148ad98229a27a590a4412601e30759c709aa3a6e6d80e81c16160e16968fc0392181fc9c75e4da06578452f7c5ab6
6
+ metadata.gz: 57a1604358b0297b66604d1fc5a60a9d1bda05aa9bd5f6b91135ddc2aec4a6eb703c00ef4d905ac156170b190bf500481ce56cf6319f07e8b57447cca4c6a210
7
+ data.tar.gz: f4b88e66c9c4fcd2bcbca2fe882abdede7c531e1d5e752a2ac986e39cf51d87714852dcb6e7e8e4870b623d54b468cc8f3ec88c253e7182c1fe89c0af91366a4
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.5.3
1
+ 3.0.2
data/CHANGELOG.md CHANGED
@@ -9,6 +9,46 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.12.1
13
+ ### Added
14
+ - Support for Ruby 3.
15
+ ### Changed/Removed
16
+ - Removed support for Ruby 2.5 (as it's too old).
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
21
+ ## v0.12.0
22
+ ### Added
23
+ - `BrokenLinkFinder::link_xpath` and `link_xpath=` methods so you can customise how links are extracted from each crawled page using the API.
24
+ - An `--xpath` (or just `-x`) command line flag so you can customise how links are extracted when using the command line.
25
+ ### Changed/Removed
26
+ - Changed the default way in which links are extracted from a page. Previously, any element with an `href` or `src` attribute was extracted and checked; now only those links inside the `<body>` are extracted and checked, ignoring the `<head>` section entirely. You can change this behaviour back with `BrokenLinkFinder::link_xpath = '//*/@href | //*/@src'` before you perform a crawl. Alternatively, if using the command line, use the `--xpath '//*/@href | //*/@src'` option (quoted so the shell doesn't interpret the `|`).
27
+ ### Fixed
28
+ - [Scheme relative bug](https://github.com/michaeltelford/broken_link_finder/issues/16) by upgrading to `wgit v0.10.0`.
29
+ ---
30
+
31
+ ## v0.11.1
32
+ ### Added
33
+ - ...
34
+ ### Changed/Removed
35
+ - Updated the wgit gem to version 0.9.0, which contains improvements and bug fixes.
36
+ ### Fixed
37
+ - ...
38
+ ---
39
+
40
+ ## v0.11.0
41
+ ### Added
42
+ - Additional crawl statistics.
43
+ - Exit code handling to the executable: `0` for success, `1` for an error scenario.
44
+ ### Changed/Removed
45
+ - Updated the report formats slightly, bringing various improvements such as the total number of links crawled.
46
+ ### Fixed
47
+ - Bug in the HTML report; the summary URL is now an `<a>` link.
48
+ - Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
49
+ - Bug causing an error when crawling unparsable/invalid URLs.
50
+ ---
51
+
12
52
  ## v0.10.0
13
53
  ### Added
14
54
  - A `--html` flag to the `crawl` executable command which produces a HTML report (instead of text).
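
The v0.12.0 entry above changes which links are extracted by default (only those inside `<body>`). As a rough, illustrative sketch of the options it mentions, assuming the `BrokenLinkFinder.link_xpath=` accessor and `crawl_page` alias shown later in this diff:

```ruby
require 'broken_link_finder'

# Restore the pre-v0.12.0 behaviour of checking every href/src attribute,
# including those inside <head> (value taken from the changelog entry above).
BrokenLinkFinder.link_xpath = '//*/@href | //*/@src'

finder = BrokenLinkFinder.new
finder.crawl_page 'http://txti.es' # => true if at least one broken link was found
```

From the command line, the equivalent is the `--xpath` flag described in the same entry.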
data/Gemfile CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  source 'https://rubygems.org'
4
4
 
5
- ruby '~> 2.5'
5
+ ruby '>= 2.6', '< 4'
6
6
 
7
7
  # Specify your gem's dependencies in broken_link_finder.gemspec
8
8
  gemspec
data/Gemfile.lock CHANGED
@@ -1,50 +1,63 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- broken_link_finder (0.10.0)
4
+ broken_link_finder (0.12.1)
5
5
  thor (~> 0.20)
6
6
  thread (~> 0.2)
7
- wgit (~> 0.5)
7
+ wgit (~> 0.10)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
- addressable (2.6.0)
13
- public_suffix (>= 2.0.2, < 4.0)
14
- bson (4.6.0)
15
- byebug (11.0.1)
16
- coderay (1.1.2)
17
- crack (0.4.3)
18
- safe_yaml (~> 1.0.0)
19
- ethon (0.12.0)
20
- ffi (>= 1.3.0)
21
- ffi (1.11.3)
22
- hashdiff (1.0.0)
23
- maxitest (3.4.0)
24
- minitest (>= 5.0.0, < 5.13.0)
25
- method_source (0.9.2)
26
- mini_portile2 (2.4.0)
27
- minitest (5.12.2)
28
- mongo (2.11.1)
29
- bson (>= 4.6.0, < 5.0.0)
30
- nokogiri (1.10.5)
31
- mini_portile2 (~> 2.4.0)
32
- pry (0.12.2)
33
- coderay (~> 1.1.0)
34
- method_source (~> 0.9.0)
35
- public_suffix (3.1.0)
36
- rake (10.5.0)
37
- safe_yaml (1.0.5)
12
+ addressable (2.8.0)
13
+ public_suffix (>= 2.0.2, < 5.0)
14
+ bson (4.12.1)
15
+ byebug (11.1.3)
16
+ cliver (0.3.2)
17
+ coderay (1.1.3)
18
+ concurrent-ruby (1.1.9)
19
+ crack (0.4.5)
20
+ rexml
21
+ ethon (0.15.0)
22
+ ffi (>= 1.15.0)
23
+ ferrum (0.11)
24
+ addressable (~> 2.5)
25
+ cliver (~> 0.3)
26
+ concurrent-ruby (~> 1.1)
27
+ websocket-driver (>= 0.6, < 0.8)
28
+ ffi (1.15.4)
29
+ hashdiff (1.0.1)
30
+ maxitest (3.7.0)
31
+ minitest (>= 5.0.0, < 5.15.0)
32
+ method_source (1.0.0)
33
+ mini_portile2 (2.6.1)
34
+ minitest (5.14.4)
35
+ mongo (2.17.0)
36
+ bson (>= 4.8.2, < 5.0.0)
37
+ nokogiri (1.12.5)
38
+ mini_portile2 (~> 2.6.1)
39
+ racc (~> 1.4)
40
+ pry (0.14.1)
41
+ coderay (~> 1.1)
42
+ method_source (~> 1.0)
43
+ public_suffix (4.0.6)
44
+ racc (1.6.0)
45
+ rake (13.0.6)
46
+ rexml (3.2.5)
38
47
  thor (0.20.3)
39
48
  thread (0.2.2)
40
- typhoeus (1.3.1)
49
+ typhoeus (1.4.0)
41
50
  ethon (>= 0.9.0)
42
- webmock (3.7.6)
43
- addressable (>= 2.3.6)
51
+ webmock (3.14.0)
52
+ addressable (>= 2.8.0)
44
53
  crack (>= 0.3.2)
45
54
  hashdiff (>= 0.4.0, < 2.0.0)
46
- wgit (0.5.1)
55
+ websocket-driver (0.7.5)
56
+ websocket-extensions (>= 0.1.0)
57
+ websocket-extensions (0.1.5)
58
+ wgit (0.10.2)
47
59
  addressable (~> 2.6)
60
+ ferrum (~> 0.8)
48
61
  mongo (~> 2.9)
49
62
  nokogiri (~> 1.10)
50
63
  typhoeus (~> 1.3)
@@ -58,11 +71,11 @@ DEPENDENCIES
58
71
  byebug (~> 11.0)
59
72
  maxitest (~> 3.3)
60
73
  pry (~> 0.12)
61
- rake (~> 10.0)
74
+ rake (~> 13.0)
62
75
  webmock (~> 3.6)
63
76
 
64
77
  RUBY VERSION
65
- ruby 2.5.3p105
78
+ ruby 3.0.2p107
66
79
 
67
80
  BUNDLED WITH
68
- 2.0.2
81
+ 2.2.22
data/README.md CHANGED
@@ -1,14 +1,16 @@
1
1
  # Broken Link Finder
2
2
 
3
- Does what it says on the tin; Finds a website's broken links.
3
+ Does what it says on the tin - finds a website's broken links.
4
4
 
5
- Simply point it at a website and it will crawl all of its webpages searching for and identifing any broken links. You will then be presented with a concise summary of the broken links found.
5
+ Simply point it at a website and it will crawl all of its webpages searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
6
6
 
7
- Because `libcurl` is used under the hood, Broken Link Finder is fast!
7
+ Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
8
8
 
9
9
  ## How It Works
10
10
 
11
- Any HTML page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions constitutes that the link is broken:
11
+ Any HTML element within `<body>` with an `href` or `src` attribute is considered a link (although this is [configurable](#Link-Extraction)).
12
+
13
+ For each link on a given page, any of the following conditions means the link is considered broken:
12
14
 
13
15
  - An empty HTML response body is returned.
14
16
  - A response status code of `404 Not Found` is returned.
@@ -29,27 +31,27 @@ With that said, the usual array of HTTP URL features are supported including anc
29
31
 
30
32
  ## Installation
31
33
 
32
- Add this line to your application's Gemfile:
34
+ Only MRI Ruby is tested and supported, but `broken_link_finder` may work with other Ruby implementations.
33
35
 
34
- ```ruby
35
- gem 'broken_link_finder'
36
- ```
36
+ Currently, the required MRI Ruby version is:
37
37
 
38
- And then execute:
38
+ `ruby '>= 2.6', '< 4'`
39
39
 
40
- $ bundle
40
+ ### Using Bundler
41
41
 
42
- Or install it yourself as:
42
+ $ bundle add broken_link_finder
43
+
44
+ ### Using RubyGems
43
45
 
44
46
  $ gem install broken_link_finder
45
47
 
46
- Finally, verify the installation with:
48
+ ### Verify
47
49
 
48
50
  $ broken_link_finder version
49
51
 
50
52
  ## Usage
51
53
 
52
- You can check for broken links via the library or executable.
54
+ You can check for broken links via the executable or library.
53
55
 
54
56
  ### Executable
55
57
 
@@ -91,9 +93,10 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
91
93
  If broken links are found then the output will look something like:
92
94
 
93
95
  ```text
94
- Crawled http://txti.es (7 page(s) in 7.88 seconds)
96
+ Crawled http://txti.es
97
+ 7 page(s) containing 32 unique link(s) in 6.82 seconds
95
98
 
96
- Found 6 broken link(s) across 2 page(s):
99
+ Found 6 unique broken link(s) across 2 page(s):
97
100
 
98
101
  The following broken links were found on 'http://txti.es/about':
99
102
  http://twitter.com/thebarrytone
@@ -105,7 +108,7 @@ The following broken links were found on 'http://txti.es/how':
105
108
  http://en.wikipedia.org/wiki/Markdown
106
109
  http://imgur.com
107
110
 
108
- Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
111
+ Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
109
112
 
110
113
  The following links were ignored on 'http://txti.es':
111
114
  tel:+13174562564
@@ -117,6 +120,35 @@ ftp://server.com
117
120
 
118
121
  You can provide the `--html` flag if you'd prefer a HTML based report.
119
122
 
123
+ ## Link Extraction
124
+
125
+ You can customise the XPath used to extract links from each crawled page. This can be done via the executable or library.
126
+
127
+ ### Executable
128
+
129
+ Add the `--xpath` (or `-x`) flag to the `crawl` command, e.g.
130
+
131
+ $ broken_link_finder crawl http://txti.es -x //img/@src
132
+
133
+ ### Library
134
+
135
+ Set the desired XPath using the accessor methods provided:
136
+
137
+ > main.rb
138
+
139
+ ```ruby
140
+ require 'broken_link_finder'
141
+
142
+ # Set your desired xpath before crawling...
143
+ BrokenLinkFinder::link_xpath = '//img/@src'
144
+
145
+ # Now crawl as normal and only your custom targeted links will be checked.
146
+ BrokenLinkFinder.new.crawl_page 'http://txti.es'
147
+
148
+ # Go back to using the default provided xpath as needed.
149
+ BrokenLinkFinder::link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
150
+ ```
151
+
120
152
  ## Contributing
121
153
 
122
154
  Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
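
For context alongside the executable usage above, a minimal library usage sketch based on the `Finder` API shown further down this diff (the URL and options are illustrative only):

```ruby
require 'broken_link_finder'

# sort: :link groups the report by broken link rather than by page;
# max_threads caps the threads used by crawl_site (one per crawled page).
finder = BrokenLinkFinder::Finder.new(sort: :link, max_threads: 50)

finder.crawl_site 'http://txti.es'  # or crawl_page for a single page
finder.report(STDOUT, type: :html)  # type defaults to :text
```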
data/bin/console CHANGED
@@ -23,12 +23,14 @@ end
23
23
  # You can add fixtures and/or initialization code here...
24
24
  reload
25
25
 
26
- url = 'http://txti.es/'
27
- by_page = Finder.new
28
- by_link = Finder.new sort: :link
29
- finder = by_page
26
+ def url; @url ||= 'http://txti.es/'; end
27
+ def by_page; @by_page ||= Finder.new; end
28
+ def by_link; @by_link ||= Finder.new(sort: :link); end
29
+ def finder; @finder ||= by_page; end
30
30
 
31
31
  # Start the console.
32
- puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
32
+ puts
33
+ puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
34
+ puts
33
35
 
34
- binding.pry
36
+ Pry.start
data/bin/setup CHANGED
@@ -5,4 +5,4 @@ set -vx
5
5
 
6
6
  bundle install
7
7
 
8
- # Do any other automated setup that you need to do here
8
+ # Do any other automated setup that you need to do here...
data/broken_link_finder.gemspec CHANGED
@@ -38,16 +38,16 @@ Gem::Specification.new do |spec|
38
38
  spec.require_paths = ['lib']
39
39
  spec.post_install_message = "Added the executable 'broken_link_finder' to $PATH"
40
40
 
41
- spec.required_ruby_version = '~> 2.5'
41
+ spec.required_ruby_version = '>= 2.6', '< 4'
42
42
 
43
43
  spec.add_development_dependency 'bundler', '~> 2.0'
44
44
  spec.add_development_dependency 'byebug', '~> 11.0'
45
45
  spec.add_development_dependency 'maxitest', '~> 3.3'
46
46
  spec.add_development_dependency 'pry', '~> 0.12'
47
- spec.add_development_dependency 'rake', '~> 10.0'
47
+ spec.add_development_dependency 'rake', '~> 13.0'
48
48
  spec.add_development_dependency 'webmock', '~> 3.6'
49
49
 
50
50
  spec.add_runtime_dependency 'thor', '~> 0.20'
51
51
  spec.add_runtime_dependency 'thread', '~> 0.2'
52
- spec.add_runtime_dependency 'wgit', '~> 0.5'
52
+ spec.add_runtime_dependency 'wgit', '~> 0.10'
53
53
  end
data/exe/broken_link_finder CHANGED
@@ -9,6 +9,7 @@ class BrokenLinkFinderCLI < Thor
9
9
  desc 'crawl [URL]', 'Find broken links at the URL'
10
10
  option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
11
11
  option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
12
+ option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
12
13
  option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
13
14
  option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
14
15
  option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
@@ -22,6 +23,7 @@ class BrokenLinkFinderCLI < Thor
22
23
  broken_verbose = !options[:concise]
23
24
  ignored_verbose = options[:verbose]
24
25
 
26
+ BrokenLinkFinder.link_xpath = options[:xpath]
25
27
  finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
26
28
  options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
27
29
  finder.report(
@@ -29,13 +31,19 @@ class BrokenLinkFinderCLI < Thor
29
31
  broken_verbose: broken_verbose,
30
32
  ignored_verbose: ignored_verbose
31
33
  )
32
- rescue Exception => e
34
+
35
+ exit 0
36
+ rescue StandardError => e
33
37
  puts "An error has occurred: #{e.message}"
38
+
39
+ exit 1
34
40
  end
35
41
 
36
42
  desc 'version', 'Display the currently installed version'
37
43
  def version
38
44
  puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
45
+
46
+ exit 0
39
47
  end
40
48
  end
41
49
 
data/lib/broken_link_finder/finder.rb CHANGED
@@ -1,48 +1,59 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- DEFAULT_MAX_THREADS = 100
4
+ DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
5
+ SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.
5
6
 
6
7
  # Alias for BrokenLinkFinder::Finder.new.
7
8
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
8
9
  Finder.new(sort: sort, max_threads: max_threads)
9
10
  end
10
11
 
12
+ # Class responsible for finding broken links on a page or site.
11
13
  class Finder
12
- attr_reader :sort, :max_threads, :broken_links, :ignored_links, :crawl_stats
14
+ # The collection key - either :page or :link.
15
+ attr_reader :sort
13
16
 
14
- # Creates a new Finder instance.
15
- def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
17
+ # The max number of threads created during #crawl_site - one thread per page.
18
+ attr_reader :max_threads
19
+
20
+ # Returns a new Finder instance.
21
+ def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
16
22
  raise "Sort by either :page or :link, not #{sort}" \
17
23
  unless %i[page link].include?(sort)
18
24
 
19
25
  @sort = sort
20
26
  @max_threads = max_threads
21
- @lock = Mutex.new
22
27
  @crawler = Wgit::Crawler.new
28
+ @manager = BrokenLinkFinder::LinkManager.new(@sort)
29
+ end
30
+
31
+ # Returns the current broken links.
32
+ def broken_links
33
+ @manager.broken_links
34
+ end
23
35
 
24
- reset_crawl
36
+ # Returns the current ignored links.
37
+ def ignored_links
38
+ @manager.ignored_links
25
39
  end
26
40
 
27
- # Clear/empty the link collection Hashes.
28
- def reset_crawl
29
- @broken_links = {}
30
- @ignored_links = {}
31
- @all_broken_links = Set.new # Used to prevent crawling a link twice.
32
- @all_intact_links = Set.new # "
33
- @broken_link_map = {} # Maps a link to its absolute form.
34
- @crawl_stats = {} # Records crawl stats e.g. duration etc.
41
+ # Returns the current crawl stats.
42
+ def crawl_stats
43
+ @manager.crawl_stats
35
44
  end
36
45
 
37
- # Finds broken links within a single page and appends them to the
38
- # @broken_links array. Returns true if at least one broken link was found.
46
+ # Finds broken links within a single page and records them.
47
+ # Returns true if at least one broken link was found.
39
48
  # Access the broken links afterwards with Finder#broken_links.
40
49
  def crawl_url(url)
41
- reset_crawl
50
+ @manager.empty
42
51
 
43
52
  start = Time.now
44
53
  url = url.to_url
45
- doc = @crawler.crawl(url)
54
+
55
+ # We dup the url to avoid recording any redirects.
56
+ doc = @crawler.crawl(url.dup)
46
57
 
47
58
  # Ensure the given page url is valid.
48
59
  raise "Invalid or broken URL: #{url}" unless doc
@@ -51,18 +62,17 @@ module BrokenLinkFinder
51
62
  find_broken_links(doc)
52
63
  retry_broken_links
53
64
 
54
- sort_links
55
- set_crawl_stats(url: url, pages_crawled: [url], start: start)
65
+ @manager.sort
66
+ @manager.tally(url: url, pages_crawled: [url], start: start)
56
67
 
57
- @broken_links.any?
68
+ broken_links.any?
58
69
  end
59
70
 
60
- # Finds broken links within an entire site and appends them to the
61
- # @broken_links array. Returns a tuple containing a Boolean of true if
62
- # at least one broken link was found and an Array of all pages crawled.
71
+ # Finds broken links within an entire site and records them.
72
+ # Returns true if at least one broken link was found.
63
73
  # Access the broken links afterwards with Finder#broken_links.
64
- def crawl_site(url)
65
- reset_crawl
74
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil)
75
+ @manager.empty
66
76
 
67
77
  start = Time.now
68
78
  url = url.to_url
@@ -70,7 +80,9 @@ module BrokenLinkFinder
70
80
  crawled = Set.new
71
81
 
72
82
  # Crawl the site's HTML web pages looking for links.
73
- externals = @crawler.crawl_site(url) do |doc|
83
+ # We dup the url to avoid recording any redirects.
84
+ paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
85
+ externals = @crawler.crawl_site(url.dup, **paths) do |doc|
74
86
  crawled << doc.url
75
87
  next unless doc
76
88
 
@@ -78,35 +90,39 @@ module BrokenLinkFinder
78
90
  pool.process { find_broken_links(doc) }
79
91
  end
80
92
 
93
+ # Wait for all threads to finish, even if url was invalid.
94
+ pool.shutdown
95
+
81
96
  # Ensure the given website url is valid.
82
97
  raise "Invalid or broken URL: #{url}" unless externals
83
98
 
84
- # Wait for all threads to finish.
85
- pool.shutdown
86
99
  retry_broken_links
87
100
 
88
- sort_links
89
- set_crawl_stats(url: url, pages_crawled: crawled.to_a, start: start)
101
+ @manager.sort
102
+ @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)
90
103
 
91
- @broken_links.any?
104
+ broken_links.any?
105
+ ensure
106
+ pool.shutdown if defined?(pool)
92
107
  end
93
108
 
94
- # Pretty prints the link report into a stream e.g. STDOUT or a file,
109
+ # Outputs the link report into a stream e.g. STDOUT or a file,
95
110
  # anything that respond_to? :puts. Defaults to STDOUT.
96
- def report(stream = STDOUT,
97
- type: :text, broken_verbose: true, ignored_verbose: false)
111
+ def report(stream = STDOUT, type: :text,
112
+ broken_verbose: true, ignored_verbose: false)
98
113
  klass = case type
99
114
  when :text
100
115
  BrokenLinkFinder::TextReporter
101
116
  when :html
102
117
  BrokenLinkFinder::HTMLReporter
103
118
  else
104
- raise "type: must be :text or :html, not: :#{type}"
119
+ raise "The type: must be :text or :html, not: :#{type}"
105
120
  end
106
121
 
107
- reporter = klass.new(stream, @sort, @broken_links,
108
- @ignored_links, @broken_link_map, @crawl_stats)
109
- reporter.call(broken_verbose: broken_verbose,
122
+ reporter = klass.new(stream, @sort,
123
+ broken_links, ignored_links,
124
+ @manager.broken_link_map, crawl_stats)
125
+ reporter.call(broken_verbose: broken_verbose,
110
126
  ignored_verbose: ignored_verbose)
111
127
  end
112
128
 
@@ -114,26 +130,29 @@ module BrokenLinkFinder
114
130
 
115
131
  # Finds which links are unsupported or broken and records the details.
116
132
  def find_broken_links(page)
133
+ record_unparsable_links(page) # Record them as broken.
134
+
117
135
  links = get_supported_links(page)
118
136
 
119
137
  # Iterate over the supported links checking if they're broken or not.
120
138
  links.each do |link|
121
- # Skip if the link has been processed previously.
122
- next if @all_intact_links.include?(link)
139
+ # Skip if the link has been encountered previously.
140
+ next if @manager.all_intact_links.include?(link)
123
141
 
124
- if @all_broken_links.include?(link)
125
- append_broken_link(page.url, link) # Record on which page.
142
+ if @manager.all_broken_links.include?(link)
143
+ # The link has already been proven broken so simply record it.
144
+ @manager.append_broken_link(page, link, map: false)
126
145
  next
127
146
  end
128
147
 
129
- # The link hasn't been processed before so we crawl it.
148
+ # The link hasn't been encountered before so we crawl it.
130
149
  link_doc = crawl_link(page, link)
131
150
 
132
- # Determine if the crawled link is broken or not.
151
+ # Determine if the crawled link is broken or not and record it.
133
152
  if link_broken?(link_doc)
134
- append_broken_link(page.url, link, doc: page)
153
+ @manager.append_broken_link(page, link)
135
154
  else
136
- @lock.synchronize { @all_intact_links << link }
155
+ @manager.append_intact_link(link)
137
156
  end
138
157
  end
139
158
 
@@ -143,30 +162,47 @@ module BrokenLinkFinder
143
162
  # Implements a retry mechanism for each of the broken links found.
144
163
  # Removes any broken links found to be working OK.
145
164
  def retry_broken_links
146
- sleep(0.5) # Give the servers a break, then retry the links.
165
+ sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
166
+
167
+ @manager.broken_link_map.select! do |link, href|
168
+ # Don't retry unparsable links (which are Strings).
169
+ next(true) unless href.is_a?(Wgit::Url)
170
+
171
+ doc = @crawler.crawl(href.dup)
147
172
 
148
- @broken_link_map.each do |link, href|
149
- doc = @crawler.crawl(href)
150
- remove_broken_link(link) unless link_broken?(doc)
173
+ if link_broken?(doc)
174
+ true
175
+ else
176
+ @manager.remove_broken_link(link)
177
+ false
178
+ end
179
+ end
180
+ end
181
+
182
+ # Record each unparsable link as a broken link.
183
+ def record_unparsable_links(doc)
184
+ doc.unparsable_links.each do |link|
185
+ # We map the link ourselves because link is a String, not a Wgit::Url.
186
+ @manager.append_broken_link(doc, link, map: false)
187
+ @manager.broken_link_map[link] = link
151
188
  end
152
189
  end
153
190
 
154
191
  # Report and reject any non supported links. Any link that is absolute and
155
192
  # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
156
193
  def get_supported_links(doc)
157
- doc.all_links
158
- .reject do |link|
159
- if link.is_absolute? && !link.start_with?('http')
160
- append_ignored_link(doc.url, link)
161
- true
162
- end
163
- end
194
+ doc.all_links.reject do |link|
195
+ if link.is_absolute? && !link.start_with?('http')
196
+ @manager.append_ignored_link(doc.url, link)
197
+ true
198
+ end
199
+ end
164
200
  end
165
201
 
166
202
  # Make the link absolute and crawl it, returning its Wgit::Document.
167
203
  def crawl_link(doc, link)
168
- link = link.prefix_base(doc)
169
- @crawler.crawl(link)
204
+ link = link.make_absolute(doc)
205
+ @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
170
206
  end
171
207
 
172
208
  # Return if the crawled link is broken or not.
@@ -175,8 +211,9 @@ module BrokenLinkFinder
175
211
  end
176
212
 
177
213
  # Returns true if the link is/contains a broken anchor/fragment.
214
+ # E.g. for /about#top, the page should contain an HTML element with an @id of 'top'.
178
215
  def has_broken_anchor(doc)
179
- raise 'link document is nil' unless doc
216
+ raise 'The link document is nil' unless doc
180
217
 
181
218
  fragment = doc.url.fragment
182
219
  return false if fragment.nil? || fragment.empty?
@@ -184,80 +221,6 @@ module BrokenLinkFinder
184
221
  doc.xpath("//*[@id='#{fragment}']").empty?
185
222
  end
186
223
 
187
- # Append key => [value] to @broken_links.
188
- # If doc: is provided then the link will be recorded in absolute form.
189
- def append_broken_link(url, link, doc: nil)
190
- key, value = get_key_value(url, link)
191
-
192
- @lock.synchronize do
193
- @broken_links[key] = [] unless @broken_links[key]
194
- @broken_links[key] << value
195
-
196
- @all_broken_links << link
197
-
198
- @broken_link_map[link] = link.prefix_base(doc) if doc
199
- end
200
- end
201
-
202
- # Remove the broken_link from the necessary collections.
203
- def remove_broken_link(link)
204
- @lock.synchronize do
205
- if @sort == :page
206
- @broken_links.each { |_k, links| links.delete(link) }
207
- @broken_links.delete_if { |_k, links| links.empty? }
208
- else
209
- @broken_links.delete(link)
210
- end
211
-
212
- @all_broken_links.delete(link)
213
- @all_intact_links << link
214
- end
215
- end
216
-
217
- # Append key => [value] to @ignored_links.
218
- def append_ignored_link(url, link)
219
- key, value = get_key_value(url, link)
220
-
221
- @lock.synchronize do
222
- @ignored_links[key] = [] unless @ignored_links[key]
223
- @ignored_links[key] << value
224
- end
225
- end
226
-
227
- # Returns the correct key value depending on the @sort type.
228
- # @sort == :page ? [url, link] : [link, url]
229
- def get_key_value(url, link)
230
- case @sort
231
- when :page
232
- [url, link]
233
- when :link
234
- [link, url]
235
- else
236
- raise "Unsupported sort type: #{sort}"
237
- end
238
- end
239
-
240
- # Sort keys and values alphabetically.
241
- def sort_links
242
- @broken_links.values.map(&:uniq!)
243
- @ignored_links.values.map(&:uniq!)
244
-
245
- @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
246
- @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
247
-
248
- @broken_links.each { |_k, v| v.sort! }
249
- @ignored_links.each { |_k, v| v.sort! }
250
- end
251
-
252
- # Sets and returns the total number of links crawled.
253
- def set_crawl_stats(url:, pages_crawled:, start:)
254
- @crawl_stats[:url] = url
255
- @crawl_stats[:pages_crawled] = pages_crawled
256
- @crawl_stats[:num_pages] = pages_crawled.size
257
- @crawl_stats[:num_links] = @all_broken_links.size + @all_intact_links.size
258
- @crawl_stats[:duration] = Time.now - start
259
- end
260
-
261
224
  alias crawl_page crawl_url
262
225
  alias crawl_r crawl_site
263
226
  end
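
The reworked `crawl_site` above now accepts `allow_paths:` and `disallow_paths:`, which are forwarded to `Wgit::Crawler#crawl_site`. A hedged sketch of how that might be used; the site and path values are hypothetical:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder::Finder.new

# Skip any pages whose path matches the given (hypothetical) values.
finder.crawl_site 'http://example.com', disallow_paths: ['archive', 'tags']

puts finder.crawl_stats[:num_links] # total unique links crawled
puts finder.broken_links            # Hash of page URL => [broken links]
```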
data/lib/broken_link_finder/link_manager.rb ADDED
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Class responsible for handling the link collection logic.
5
+ class LinkManager
6
+ # Used for mapping pages to broken links.
7
+ attr_reader :broken_links
8
+
9
+ # Used for mapping pages to ignored links.
10
+ attr_reader :ignored_links
11
+
12
+ # Used to record crawl statistics e.g. duration etc.
13
+ attr_reader :crawl_stats
14
+
15
+ # Used to map a link (as is) to its absolute (crawlable) form.
16
+ attr_reader :broken_link_map
17
+
18
+ # Used to prevent crawling a broken link twice.
19
+ attr_reader :all_broken_links
20
+
21
+ # Used to prevent crawling an intact link twice.
22
+ attr_reader :all_intact_links
23
+
24
+ # Used for building crawl statistics.
25
+ attr_reader :all_ignored_links
26
+
27
+ # Returns a new LinkManager instance with empty link collections.
28
+ def initialize(sort)
29
+ raise "Sort by either :page or :link, not #{sort}" \
30
+ unless %i[page link].include?(sort)
31
+
32
+ @sort = sort
33
+ @lock = Mutex.new
34
+
35
+ empty # Initialises the link collections.
36
+ end
37
+
38
+ # Initialise/empty the link collection objects.
39
+ def empty
40
+ @broken_links = {}
41
+ @ignored_links = {}
42
+ @crawl_stats = {}
43
+ @broken_link_map = {}
44
+ @all_broken_links = Set.new
45
+ @all_intact_links = Set.new
46
+ @all_ignored_links = Set.new
47
+ end
48
+
49
+ # Append key => [value] to the broken link collections.
50
+ # If map: true, then the link will also be recorded in @broken_link_map.
51
+ def append_broken_link(doc, link, map: true)
52
+ key, value = get_key_value(doc.url, link)
53
+
54
+ @lock.synchronize do
55
+ @broken_links[key] = [] unless @broken_links[key]
56
+ @broken_links[key] << value
57
+
58
+ @all_broken_links << link
59
+
60
+ @broken_link_map[link] = link.make_absolute(doc) if map
61
+ end
62
+ end
63
+
64
+ # Remove the broken link from the necessary collections.
65
+ def remove_broken_link(link)
66
+ @lock.synchronize do
67
+ if @sort == :page
68
+ @broken_links.each { |_k, links| links.delete(link) }
69
+ @broken_links.delete_if { |_k, links| links.empty? }
70
+ else
71
+ @broken_links.delete(link)
72
+ end
73
+
74
+ @all_broken_links.delete(link)
75
+ @all_intact_links << link
76
+ end
77
+ end
78
+
79
+ # Append key => [value] to the ignored link collections.
80
+ def append_ignored_link(url, link)
81
+ key, value = get_key_value(url, link)
82
+
83
+ @lock.synchronize do
84
+ @ignored_links[key] = [] unless @ignored_links[key]
85
+ @ignored_links[key] << value
86
+
87
+ @all_ignored_links << link
88
+ end
89
+ end
90
+
91
+ # Append link to @all_intact_links.
92
+ def append_intact_link(link)
93
+ @lock.synchronize { @all_intact_links << link }
94
+ end
95
+
96
+ # Sorts the link collection's keys and values alphabetically.
97
+ def sort
98
+ @broken_links.values.map(&:uniq!)
99
+ @ignored_links.values.map(&:uniq!)
100
+
101
+ @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
102
+ @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
103
+
104
+ @broken_links.each { |_k, v| v.sort! }
105
+ @ignored_links.each { |_k, v| v.sort! }
106
+ end
107
+
108
+ # Tallies up various statistics about the crawl and its links.
109
+ def tally(url:, pages_crawled:, start:)
110
+ @crawl_stats[:url] = url
111
+ @crawl_stats[:pages_crawled] = pages_crawled
112
+ @crawl_stats[:num_pages] = pages_crawled.size
113
+ @crawl_stats[:num_links] = (
114
+ @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
115
+ )
116
+ @crawl_stats[:num_broken_links] = @all_broken_links.size
117
+ @crawl_stats[:num_intact_links] = @all_intact_links.size
118
+ @crawl_stats[:num_ignored_links] = @all_ignored_links.size
119
+ @crawl_stats[:duration] = Time.now - start
120
+ end
121
+
122
+ private
123
+
124
+ # Returns the correct key value depending on the @sort type.
125
+ # @sort == :page ? [url, link] : [link, url]
126
+ def get_key_value(url, link)
127
+ case @sort
128
+ when :page
129
+ [url, link]
130
+ when :link
131
+ [link, url]
132
+ else
133
+ raise "Unsupported sort type: #{sort}"
134
+ end
135
+ end
136
+ end
137
+ end
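
`LinkManager#get_key_value` above is what gives the two report orientations. Purely as an illustration of the resulting `Finder#broken_links` shape (example values taken from the README's sample output):

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder::Finder.new(sort: :link)
finder.crawl_page 'http://txti.es/about'

# With sort: :page (the default) the Hash is keyed by page URL:
#   { "http://txti.es/about" => ["http://twitter.com/thebarrytone", ...] }
# With sort: :link it is keyed by broken link instead:
#   { "http://twitter.com/thebarrytone" => ["http://txti.es/about", ...] }
finder.broken_links
```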
data/lib/broken_link_finder/reporter/html_reporter.rb CHANGED
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
+ # Class responsible for reporting in an HTML format.
4
5
  class HTMLReporter < Reporter
5
- # Creates a new HTMLReporter instance.
6
+ # Returns a new HTMLReporter instance.
6
7
  # stream is any Object that responds to :puts and :print.
7
8
  def initialize(stream, sort,
8
9
  broken_links, ignored_links,
@@ -28,9 +29,11 @@ module BrokenLinkFinder
28
29
  # Report a summary of the overall crawl.
29
30
  def report_crawl_summary
30
31
  puts format(
31
- '<p class="crawl_summary">Crawled %s (%s page(s) in %s seconds)</p>',
32
+ '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
33
+ @crawl_stats[:url],
32
34
  @crawl_stats[:url],
33
35
  @crawl_stats[:num_pages],
36
+ @crawl_stats[:num_links],
34
37
  @crawl_stats[:duration]&.truncate(2)
35
38
  )
36
39
  end
@@ -43,7 +46,7 @@ module BrokenLinkFinder
43
46
  puts_summary 'Good news, there are no broken links!', type: :broken
44
47
  else
45
48
  num_pages, num_links = get_hash_stats(@broken_links)
46
- puts_summary "Found #{num_links} broken link(s) across #{num_pages} page(s):", type: :broken
49
+ puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken
47
50
 
48
51
  @broken_links.each do |key, values|
49
52
  puts_group(key, type: :broken) # Puts the opening <p> element.
@@ -70,7 +73,7 @@ module BrokenLinkFinder
70
73
 
71
74
  if @ignored_links.any?
72
75
  num_pages, num_links = get_hash_stats(@ignored_links)
73
- puts_summary "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
76
+ puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
74
77
 
75
78
  @ignored_links.each do |key, values|
76
79
  puts_group(key, type: :ignored) # Puts the opening <p> element.
@@ -125,8 +128,8 @@ module BrokenLinkFinder
125
128
  end
126
129
 
127
130
  def build_url(link)
128
- return link if link.to_url.absolute?
129
- @broken_link_map.fetch(link)
131
+ href = @broken_link_map[link]
132
+ href || link
130
133
  end
131
134
 
132
135
  alias_method :report, :call
data/lib/broken_link_finder/reporter/reporter.rb CHANGED
@@ -6,7 +6,7 @@ module BrokenLinkFinder
6
6
  # The amount of pages/links to display when verbose is false.
7
7
  NUM_VALUES = 3
8
8
 
9
- # Creates a new Reporter instance.
9
+ # Returns a new Reporter instance.
10
10
  # stream is any Object that responds to :puts and :print.
11
11
  def initialize(stream, sort,
12
12
  broken_links, ignored_links,
@@ -42,8 +42,7 @@ module BrokenLinkFinder
42
42
  # Use like: `num_pages, num_links = get_hash_stats(links)`.
43
43
  def get_hash_stats(hash)
44
44
  num_keys = hash.keys.length
45
- values = hash.values.flatten
46
- num_values = sort_by_page? ? values.length : values.uniq.length
45
+ num_values = hash.values.flatten.uniq.length
47
46
 
48
47
  sort_by_page? ?
49
48
  [num_keys, num_values] :
data/lib/broken_link_finder/reporter/text_reporter.rb CHANGED
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
+ # Class responsible for reporting in a text format.
4
5
  class TextReporter < Reporter
5
- # Creates a new TextReporter instance.
6
+ # Returns a new TextReporter instance.
6
7
  # stream is any Object that responds to :puts and :print.
7
8
  def initialize(stream, sort,
8
9
  broken_links, ignored_links,
@@ -23,10 +24,11 @@ module BrokenLinkFinder
23
24
 
24
25
  # Report a summary of the overall crawl.
25
26
  def report_crawl_summary
27
+ puts "Crawled #{@crawl_stats[:url]}"
26
28
  putsn format(
27
- 'Crawled %s (%s page(s) in %s seconds)',
28
- @crawl_stats[:url],
29
+ '%s page(s) containing %s unique link(s) in %s seconds',
29
30
  @crawl_stats[:num_pages],
31
+ @crawl_stats[:num_links],
30
32
  @crawl_stats[:duration]&.truncate(2)
31
33
  )
32
34
  end
@@ -37,7 +39,7 @@ module BrokenLinkFinder
37
39
  puts 'Good news, there are no broken links!'
38
40
  else
39
41
  num_pages, num_links = get_hash_stats(@broken_links)
40
- puts "Found #{num_links} broken link(s) across #{num_pages} page(s):"
42
+ puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"
41
43
 
42
44
  @broken_links.each do |key, values|
43
45
  msg = sort_by_page? ?
@@ -61,7 +63,7 @@ module BrokenLinkFinder
61
63
  def report_ignored_links(verbose: false)
62
64
  if @ignored_links.any?
63
65
  num_pages, num_links = get_hash_stats(@ignored_links)
64
- nputs "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:"
66
+ nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"
65
67
 
66
68
  @ignored_links.each do |key, values|
67
69
  msg = sort_by_page? ?
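
As the `Reporter` classes above only need a stream that responds to `puts`, a report can be written to a file as easily as to STDOUT. A small sketch (the file name is illustrative):

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site 'http://txti.es'

# Any object responding to :puts (and :print) can act as the report stream.
File.open('broken_links.html', 'w') do |file|
  finder.report(file, type: :html, ignored_verbose: true)
end
```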
data/lib/broken_link_finder/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- VERSION = '0.10.0'
4
+ VERSION = '0.12.1'
5
5
  end
data/lib/broken_link_finder/wgit_extensions.rb CHANGED
@@ -1,11 +1,31 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # We extract all the Document's links, not just the links to other webpages.
4
- Wgit::Document.define_extension(
3
+ # Define a method on each doc for recording unparsable links.
4
+ # Unparsable links are recorded as broken links by Finder.
5
+ class Wgit::Document
6
+ def unparsable_links
7
+ @unparsable_links ||= []
8
+ end
9
+ end
10
+
11
+ # Returns a Wgit::Url or nil (if link is unparsable).
12
+ # A proc is preferrable to a function to avoid polluting the global namespace.
13
+ parse_link = lambda do |doc, link|
14
+ Wgit::Url.new(link)
15
+ rescue StandardError
16
+ doc.unparsable_links << link
17
+ nil
18
+ end
19
+
20
+ # Define a custom extractor for all page links we're interested in checking.
21
+ Wgit::Document.define_extractor(
5
22
  :all_links,
6
- '//*/@href | //*/@src', # Any element with a href or src attribute.
23
+ lambda { BrokenLinkFinder::link_xpath },
7
24
  singleton: false,
8
25
  text_content_only: true
9
- ) do |links|
10
- links.uniq.to_urls
26
+ ) do |links, doc|
27
+ links
28
+ .uniq
29
+ .map { |link| parse_link.call(doc, link) }
30
+ .compact
11
31
  end
data/lib/broken_link_finder/xpath.rb ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Extract all the Document's <body> links e.g. <a>, <img>, <script> etc.
5
+ DEFAULT_LINK_XPATH = '/html/body//*/@href | /html/body//*/@src'
6
+
7
+ @link_xpath = DEFAULT_LINK_XPATH
8
+
9
+ class << self
10
+ # The xpath used to extract links from a crawled page.
11
+ # Can be overridden as required.
12
+ attr_accessor :link_xpath
13
+ end
14
+ end
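
`link_xpath` above can also narrow the crawl rather than widen it. For example (the XPath value is hypothetical, not one shipped with the gem):

```ruby
require 'broken_link_finder'

# Only check anchor tags within the page body (hypothetical XPath).
BrokenLinkFinder.link_xpath = '/html/body//a/@href'
BrokenLinkFinder.new.crawl_page 'http://txti.es'

# Restore the shipped default afterwards.
BrokenLinkFinder.link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
```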
data/lib/broken_link_finder.rb CHANGED
@@ -5,8 +5,10 @@ require 'wgit/core_ext'
5
5
  require 'thread/pool'
6
6
  require 'set'
7
7
 
8
- require_relative './broken_link_finder/wgit_extensions'
9
8
  require_relative './broken_link_finder/version'
9
+ require_relative './broken_link_finder/xpath'
10
+ require_relative './broken_link_finder/wgit_extensions'
11
+ require_relative './broken_link_finder/link_manager'
10
12
  require_relative './broken_link_finder/reporter/reporter'
11
13
  require_relative './broken_link_finder/reporter/text_reporter'
12
14
  require_relative './broken_link_finder/reporter/html_reporter'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: broken_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.12.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-11-28 00:00:00.000000000 Z
11
+ date: 2021-11-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -72,14 +72,14 @@ dependencies:
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '10.0'
75
+ version: '13.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '10.0'
82
+ version: '13.0'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: webmock
85
85
  requirement: !ruby/object:Gem::Requirement
@@ -128,14 +128,14 @@ dependencies:
128
128
  requirements:
129
129
  - - "~>"
130
130
  - !ruby/object:Gem::Version
131
- version: '0.5'
131
+ version: '0.10'
132
132
  type: :runtime
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
- version: '0.5'
138
+ version: '0.10'
139
139
  description: Finds a website's broken links using the 'wgit' gem and reports back
140
140
  to you with a summary.
141
141
  email: michael.telford@live.com
@@ -159,11 +159,13 @@ files:
159
159
  - exe/broken_link_finder
160
160
  - lib/broken_link_finder.rb
161
161
  - lib/broken_link_finder/finder.rb
162
+ - lib/broken_link_finder/link_manager.rb
162
163
  - lib/broken_link_finder/reporter/html_reporter.rb
163
164
  - lib/broken_link_finder/reporter/reporter.rb
164
165
  - lib/broken_link_finder/reporter/text_reporter.rb
165
166
  - lib/broken_link_finder/version.rb
166
167
  - lib/broken_link_finder/wgit_extensions.rb
168
+ - lib/broken_link_finder/xpath.rb
167
169
  - load.rb
168
170
  homepage: https://github.com/michaeltelford/broken-link-finder
169
171
  licenses:
@@ -180,17 +182,20 @@ require_paths:
180
182
  - lib
181
183
  required_ruby_version: !ruby/object:Gem::Requirement
182
184
  requirements:
183
- - - "~>"
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '2.6'
188
+ - - "<"
184
189
  - !ruby/object:Gem::Version
185
- version: '2.5'
190
+ version: '4'
186
191
  required_rubygems_version: !ruby/object:Gem::Requirement
187
192
  requirements:
188
193
  - - ">="
189
194
  - !ruby/object:Gem::Version
190
195
  version: '0'
191
196
  requirements: []
192
- rubygems_version: 3.0.6
193
- signing_key:
197
+ rubygems_version: 3.2.22
198
+ signing_key:
194
199
  specification_version: 4
195
200
  summary: Finds a website's broken links and reports back to you with a summary.
196
201
  test_files: []