broken_link_finder 0.12.1 → 0.12.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +20 -0
- data/Gemfile.lock +42 -38
- data/broken_link_finder.gemspec +2 -2
- data/exe/broken_link_finder +11 -7
- data/lib/broken_link_finder/finder.rb +13 -5
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +7 -0
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4e60e87e2f9f1ae05d4ebe58066169e389994fba88ea3393a9dc5c3d04df20b6
|
4
|
+
data.tar.gz: 8019a1671fb811d6bf67feefb044f77be18dc04b0904b1b4e43cb258ee0d5fa3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6b16be9cbbd35ea468ff879a833b8a20cc6557b6bf1f2c0c79ae7243fc8a75d0f1b2d19ff9030dc112816871b232f85cbe49e344ee0ada405062b731961912a8
|
7
|
+
data.tar.gz: cf4f43c4d5369f218e1700b941fd5a4b3681aaafc8131e1394b734b72d343b6ee743690f15c115f711ccf5e133fe91777224fd943b04ae89e46e3ca06f09e424
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
3.0
|
1
|
+
3.3.0
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,26 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.12.3
|
13
|
+
### Added
|
14
|
+
- Added `BrokenLinkFinder::Finder#manager` getter method.
|
15
|
+
### Changed/Removed
|
16
|
+
- Updated production dependencies including `wgit` (to `v0.12.0`).
|
17
|
+
- Updated `Wgit::Url`s to look like Strings when inspected.
|
18
|
+
### Fixed
|
19
|
+
- ...
|
20
|
+
---
|
21
|
+
|
22
|
+
## v0.12.2
|
23
|
+
### Added
|
24
|
+
- Updated to Ruby 3.3 and updated production dependencies including Wgit (v0.11.0).
|
25
|
+
- Added `--js` and `--js-delay` flag options to the executable. This allows JS parsing to update a page's DOM before it gets crawled.
|
26
|
+
### Changed/Removed
|
27
|
+
- ...
|
28
|
+
### Fixed
|
29
|
+
- ...
|
30
|
+
---
|
31
|
+
|
12
32
|
## v0.12.1
|
13
33
|
### Added
|
14
34
|
- Support for Ruby 3.
|
data/Gemfile.lock
CHANGED
@@ -1,66 +1,70 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.12.
|
5
|
-
thor (~>
|
4
|
+
broken_link_finder (0.12.3)
|
5
|
+
thor (~> 1.3)
|
6
6
|
thread (~> 0.2)
|
7
|
-
wgit (~> 0.
|
7
|
+
wgit (~> 0.12)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
11
11
|
specs:
|
12
|
-
addressable (2.8.
|
13
|
-
public_suffix (>= 2.0.2, <
|
14
|
-
|
12
|
+
addressable (2.8.7)
|
13
|
+
public_suffix (>= 2.0.2, < 7.0)
|
14
|
+
base64 (0.2.0)
|
15
|
+
bigdecimal (3.1.8)
|
16
|
+
bson (5.0.1)
|
15
17
|
byebug (11.1.3)
|
16
|
-
cliver (0.3.2)
|
17
18
|
coderay (1.1.3)
|
18
|
-
concurrent-ruby (1.
|
19
|
-
crack (0.
|
19
|
+
concurrent-ruby (1.3.4)
|
20
|
+
crack (1.0.0)
|
21
|
+
bigdecimal
|
20
22
|
rexml
|
21
|
-
ethon (0.
|
23
|
+
ethon (0.16.0)
|
22
24
|
ffi (>= 1.15.0)
|
23
|
-
ferrum (0.
|
25
|
+
ferrum (0.15)
|
24
26
|
addressable (~> 2.5)
|
25
|
-
cliver (~> 0.3)
|
26
27
|
concurrent-ruby (~> 1.1)
|
27
|
-
|
28
|
-
|
29
|
-
|
28
|
+
webrick (~> 1.7)
|
29
|
+
websocket-driver (~> 0.7)
|
30
|
+
ffi (1.17.0)
|
31
|
+
hashdiff (1.1.1)
|
30
32
|
maxitest (3.7.0)
|
31
33
|
minitest (>= 5.0.0, < 5.15.0)
|
32
|
-
method_source (1.
|
33
|
-
mini_portile2 (2.
|
34
|
+
method_source (1.1.0)
|
35
|
+
mini_portile2 (2.8.7)
|
34
36
|
minitest (5.14.4)
|
35
|
-
mongo (2.
|
36
|
-
bson (>= 4.
|
37
|
-
nokogiri (1.
|
38
|
-
mini_portile2 (~> 2.
|
37
|
+
mongo (2.21.0)
|
38
|
+
bson (>= 4.14.1, < 6.0.0)
|
39
|
+
nokogiri (1.16.7)
|
40
|
+
mini_portile2 (~> 2.8.2)
|
39
41
|
racc (~> 1.4)
|
40
|
-
pry (0.14.
|
42
|
+
pry (0.14.2)
|
41
43
|
coderay (~> 1.1)
|
42
44
|
method_source (~> 1.0)
|
43
|
-
public_suffix (
|
44
|
-
racc (1.
|
45
|
-
rake (13.
|
46
|
-
rexml (3.
|
47
|
-
thor (
|
45
|
+
public_suffix (6.0.1)
|
46
|
+
racc (1.8.1)
|
47
|
+
rake (13.2.1)
|
48
|
+
rexml (3.3.9)
|
49
|
+
thor (1.3.2)
|
48
50
|
thread (0.2.2)
|
49
|
-
typhoeus (1.4.
|
51
|
+
typhoeus (1.4.1)
|
50
52
|
ethon (>= 0.9.0)
|
51
|
-
webmock (3.
|
53
|
+
webmock (3.24.0)
|
52
54
|
addressable (>= 2.8.0)
|
53
55
|
crack (>= 0.3.2)
|
54
56
|
hashdiff (>= 0.4.0, < 2.0.0)
|
55
|
-
|
57
|
+
webrick (1.8.2)
|
58
|
+
websocket-driver (0.7.6)
|
56
59
|
websocket-extensions (>= 0.1.0)
|
57
60
|
websocket-extensions (0.1.5)
|
58
|
-
wgit (0.
|
59
|
-
addressable (~> 2.
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
61
|
+
wgit (0.12.0)
|
62
|
+
addressable (~> 2.8)
|
63
|
+
base64 (~> 0.2)
|
64
|
+
ferrum (~> 0.14)
|
65
|
+
mongo (~> 2.19)
|
66
|
+
nokogiri (~> 1.15)
|
67
|
+
typhoeus (~> 1.4)
|
64
68
|
|
65
69
|
PLATFORMS
|
66
70
|
ruby
|
@@ -75,7 +79,7 @@ DEPENDENCIES
|
|
75
79
|
webmock (~> 3.6)
|
76
80
|
|
77
81
|
RUBY VERSION
|
78
|
-
ruby 3.
|
82
|
+
ruby 3.3.0p0
|
79
83
|
|
80
84
|
BUNDLED WITH
|
81
|
-
2.
|
85
|
+
2.5.3
|
data/broken_link_finder.gemspec
CHANGED
@@ -47,7 +47,7 @@ Gem::Specification.new do |spec|
|
|
47
47
|
spec.add_development_dependency 'rake', '~> 13.0'
|
48
48
|
spec.add_development_dependency 'webmock', '~> 3.6'
|
49
49
|
|
50
|
-
spec.add_runtime_dependency 'thor', '~>
|
50
|
+
spec.add_runtime_dependency 'thor', '~> 1.3'
|
51
51
|
spec.add_runtime_dependency 'thread', '~> 0.2'
|
52
|
-
spec.add_runtime_dependency 'wgit', '~> 0.
|
52
|
+
spec.add_runtime_dependency 'wgit', '~> 0.12'
|
53
53
|
end
|
data/exe/broken_link_finder
CHANGED
@@ -9,7 +9,9 @@ class BrokenLinkFinderCLI < Thor
|
|
9
9
|
desc 'crawl [URL]', 'Find broken links at the URL'
|
10
10
|
option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
|
11
11
|
option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
|
12
|
-
option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
|
12
|
+
option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH, desc: 'The xpath to extract links with, before checking if broken'
|
13
|
+
option :js, type: :boolean, default: false, desc: 'Run the Javascript on a page before crawling the HTML, requires Chrome/Chromium to be installed to $PATH'
|
14
|
+
option :js_delay, type: :numeric, default: 1, desc: "The seconds of delay time given to a page's Javascript for it to update the DOM, requires the --js flag"
|
13
15
|
option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
|
14
16
|
option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
|
15
17
|
option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
|
@@ -22,15 +24,17 @@ class BrokenLinkFinderCLI < Thor
|
|
22
24
|
max_threads = options[:threads]
|
23
25
|
broken_verbose = !options[:concise]
|
24
26
|
ignored_verbose = options[:verbose]
|
27
|
+
parse_js = options[:js]
|
28
|
+
parse_js_delay = options[:js_delay]
|
25
29
|
|
26
30
|
BrokenLinkFinder.link_xpath = options[:xpath]
|
27
|
-
finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads:
|
31
|
+
finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads:) do |crawler|
|
32
|
+
crawler.parse_javascript = parse_js
|
33
|
+
crawler.parse_javascript_delay = parse_js_delay
|
34
|
+
end
|
35
|
+
|
28
36
|
options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
|
29
|
-
finder.report(
|
30
|
-
type: report_type,
|
31
|
-
broken_verbose: broken_verbose,
|
32
|
-
ignored_verbose: ignored_verbose
|
33
|
-
)
|
37
|
+
finder.report(type: report_type, broken_verbose:, ignored_verbose:)
|
34
38
|
|
35
39
|
exit 0
|
36
40
|
rescue StandardError => e
|
@@ -5,8 +5,8 @@ module BrokenLinkFinder
|
|
5
5
|
SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.
|
6
6
|
|
7
7
|
# Alias for BrokenLinkFinder::Finder.new.
|
8
|
-
def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
9
|
-
Finder.new(sort: sort, max_threads: max_threads)
|
8
|
+
def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS, &block)
|
9
|
+
Finder.new(sort: sort, max_threads: max_threads, &block)
|
10
10
|
end
|
11
11
|
|
12
12
|
# Class responsible for finding broken links on a page or site.
|
@@ -17,8 +17,14 @@ module BrokenLinkFinder
|
|
17
17
|
# The max number of threads created during #crawl_site - one thread per page.
|
18
18
|
attr_reader :max_threads
|
19
19
|
|
20
|
+
# The underlying Wgit::Crawler used by this instance of Finder.
|
21
|
+
attr_reader :crawler
|
22
|
+
|
23
|
+
# The underlying link manager used by this instance of Finder.
|
24
|
+
attr_reader :manager
|
25
|
+
|
20
26
|
# Returns a new Finder instance.
|
21
|
-
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
27
|
+
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS, &block)
|
22
28
|
raise "Sort by either :page or :link, not #{sort}" \
|
23
29
|
unless %i[page link].include?(sort)
|
24
30
|
|
@@ -26,6 +32,8 @@ module BrokenLinkFinder
|
|
26
32
|
@max_threads = max_threads
|
27
33
|
@crawler = Wgit::Crawler.new
|
28
34
|
@manager = BrokenLinkFinder::LinkManager.new(@sort)
|
35
|
+
|
36
|
+
yield @crawler if block_given?
|
29
37
|
end
|
30
38
|
|
31
39
|
# Returns the current broken links.
|
@@ -56,7 +64,7 @@ module BrokenLinkFinder
|
|
56
64
|
doc = @crawler.crawl(url.dup)
|
57
65
|
|
58
66
|
# Ensure the given page url is valid.
|
59
|
-
raise "Invalid or broken URL: #{url}"
|
67
|
+
raise "Invalid or broken URL: #{url}" if doc.empty?
|
60
68
|
|
61
69
|
# Get all page links and determine which are broken.
|
62
70
|
find_broken_links(doc)
|
@@ -207,7 +215,7 @@ module BrokenLinkFinder
|
|
207
215
|
|
208
216
|
# Return if the crawled link is broken or not.
|
209
217
|
def link_broken?(doc)
|
210
|
-
doc.
|
218
|
+
doc.empty? || @crawler.last_response.not_found? || has_broken_anchor(doc)
|
211
219
|
end
|
212
220
|
|
213
221
|
# Returns true if the link is/contains a broken anchor/fragment.
|
@@ -1,5 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
# Make Wgit::Urls look like Strings when inspected.
|
4
|
+
class Wgit::Url
|
5
|
+
def inspect
|
6
|
+
to_s.inspect
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
3
10
|
# Define a method on each doc for recording unparsable links.
|
4
11
|
# Unparsable links are recorded as broken links by Finder.
|
5
12
|
class Wgit::Document
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.
|
4
|
+
version: 0.12.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -100,14 +100,14 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
103
|
+
version: '1.3'
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
110
|
+
version: '1.3'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: thread
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,14 +128,14 @@ dependencies:
|
|
128
128
|
requirements:
|
129
129
|
- - "~>"
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: '0.
|
131
|
+
version: '0.12'
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: '0.
|
138
|
+
version: '0.12'
|
139
139
|
description: Finds a website's broken links using the 'wgit' gem and reports back
|
140
140
|
to you with a summary.
|
141
141
|
email: michael.telford@live.com
|
@@ -194,7 +194,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
194
194
|
- !ruby/object:Gem::Version
|
195
195
|
version: '0'
|
196
196
|
requirements: []
|
197
|
-
rubygems_version: 3.
|
197
|
+
rubygems_version: 3.5.22
|
198
198
|
signing_key:
|
199
199
|
specification_version: 4
|
200
200
|
summary: Finds a website's broken links and reports back to you with a summary.
|