broken_link_finder 0.11.1 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Gemfile.lock +20 -18
- data/README.md +41 -10
- data/broken_link_finder.gemspec +1 -1
- data/exe/broken_link_finder +2 -0
- data/lib/broken_link_finder.rb +2 -1
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +2 -2
- data/lib/broken_link_finder/xpath.rb +14 -0
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 24ca9c7a6071b07f5ab3132c9c79c4628570c9c3e157b77a27a05cdc0578ac6e
|
4
|
+
data.tar.gz: 6668eb430c8296e1439f56c242e7e08a27733605d724ec1c5cfa638dcfaa8b52
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1d1cdc47ade4651b8bc2df01212364ba938ee73269bf53e7278519ecd374247291c932abfa73a031973403ed55d360bc9d14b5c60ba312aca4b32837b5064294
|
7
|
+
data.tar.gz: f56308da4b9d7a4a39afd43808f77d2b6f2fbbf00f17502d2d889de504bcc82ee1858fb673333b11693a65ad73a4f5fb65a97b15955443e8268b1e0ab08b4e51
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,16 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.12.0
|
13
|
+
### Added
|
14
|
+
- `BrokenLinkFinder::link_xpath` and `link_xpath=` methods so you can customise how links are extracted from each crawled page using the API.
|
15
|
+
- An `--xpath` (or just `-x`) command line flag so you can customise how links are extracted when using the command line.
|
16
|
+
### Changed/Removed
|
17
|
+
- Changed the default way in which links are extracted from a page. Previously any element with a `href` or `src` attribute was extracted and checked; now only those links inside the `<body>` are extracted and checked, ignoring the `<head>` section entirely. You can change this behaviour back with: `BrokenLinkFinder::link_xpath = '//*/@href | //*/@src'` before you perform a crawl. Alternatively, if using the command line, use the `--xpath '//*/@href | //*/@src'` option (quote the XPath so that your shell doesn't interpret the `|` as a pipe or expand the `*`).
|
18
|
+
### Fixed
|
19
|
+
- [Scheme relative bug](https://github.com/michaeltelford/broken_link_finder/issues/16) by upgrading to `wgit v0.10.0`.
|
20
|
+
---
|
21
|
+
|
12
22
|
## v0.11.1
|
13
23
|
### Added
|
14
24
|
- ...
|
data/Gemfile.lock
CHANGED
@@ -1,59 +1,61 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.
|
4
|
+
broken_link_finder (0.12.0)
|
5
5
|
thor (~> 0.20)
|
6
6
|
thread (~> 0.2)
|
7
|
-
wgit (~> 0.
|
7
|
+
wgit (~> 0.10)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
11
11
|
specs:
|
12
12
|
addressable (2.7.0)
|
13
13
|
public_suffix (>= 2.0.2, < 5.0)
|
14
|
-
bson (4.
|
14
|
+
bson (4.12.0)
|
15
15
|
byebug (11.1.3)
|
16
16
|
cliver (0.3.2)
|
17
17
|
coderay (1.1.3)
|
18
|
-
concurrent-ruby (1.1.
|
19
|
-
crack (0.4.
|
20
|
-
|
18
|
+
concurrent-ruby (1.1.8)
|
19
|
+
crack (0.4.5)
|
20
|
+
rexml
|
21
21
|
ethon (0.12.0)
|
22
22
|
ffi (>= 1.3.0)
|
23
|
-
ferrum (0.
|
23
|
+
ferrum (0.11)
|
24
24
|
addressable (~> 2.5)
|
25
25
|
cliver (~> 0.3)
|
26
26
|
concurrent-ruby (~> 1.1)
|
27
27
|
websocket-driver (>= 0.6, < 0.8)
|
28
|
-
ffi (1.
|
28
|
+
ffi (1.15.0)
|
29
29
|
hashdiff (1.0.1)
|
30
30
|
maxitest (3.6.0)
|
31
31
|
minitest (>= 5.0.0, < 5.14.0)
|
32
32
|
method_source (1.0.0)
|
33
|
-
mini_portile2 (2.
|
33
|
+
mini_portile2 (2.5.0)
|
34
34
|
minitest (5.13.0)
|
35
|
-
mongo (2.
|
35
|
+
mongo (2.14.0)
|
36
36
|
bson (>= 4.8.2, < 5.0.0)
|
37
|
-
nokogiri (1.
|
38
|
-
mini_portile2 (~> 2.
|
39
|
-
|
37
|
+
nokogiri (1.11.2)
|
38
|
+
mini_portile2 (~> 2.5.0)
|
39
|
+
racc (~> 1.4)
|
40
|
+
pry (0.14.0)
|
40
41
|
coderay (~> 1.1)
|
41
42
|
method_source (~> 1.0)
|
42
|
-
public_suffix (4.0.
|
43
|
-
|
44
|
-
|
43
|
+
public_suffix (4.0.6)
|
44
|
+
racc (1.5.2)
|
45
|
+
rake (13.0.3)
|
46
|
+
rexml (3.2.4)
|
45
47
|
thor (0.20.3)
|
46
48
|
thread (0.2.2)
|
47
49
|
typhoeus (1.4.0)
|
48
50
|
ethon (>= 0.9.0)
|
49
|
-
webmock (3.
|
51
|
+
webmock (3.12.2)
|
50
52
|
addressable (>= 2.3.6)
|
51
53
|
crack (>= 0.3.2)
|
52
54
|
hashdiff (>= 0.4.0, < 2.0.0)
|
53
55
|
websocket-driver (0.7.3)
|
54
56
|
websocket-extensions (>= 0.1.0)
|
55
57
|
websocket-extensions (0.1.5)
|
56
|
-
wgit (0.
|
58
|
+
wgit (0.10.0)
|
57
59
|
addressable (~> 2.6)
|
58
60
|
ferrum (~> 0.8)
|
59
61
|
mongo (~> 2.9)
|
data/README.md
CHANGED
@@ -8,7 +8,9 @@ Broken Link Finder is multi-threaded and uses `libcurl` under the hood, it's fas
|
|
8
8
|
|
9
9
|
## How It Works
|
10
10
|
|
11
|
-
Any HTML
|
11
|
+
Any HTML element within `<body>` with a `href` or `src` attribute is considered a link (this is [configurable](#Link-Extraction) however).
|
12
|
+
|
13
|
+
For each link on a given page, the link is considered broken if any of the following conditions is met:
|
12
14
|
|
13
15
|
- An empty HTML response body is returned.
|
14
16
|
- A response status code of `404 Not Found` is returned.
|
@@ -29,27 +31,27 @@ With that said, the usual array of HTTP URL features are supported including anc
|
|
29
31
|
|
30
32
|
## Installation
|
31
33
|
|
32
|
-
|
34
|
+
Only MRI Ruby is tested and supported, but `broken_link_finder` may work with other Ruby implementations.
|
33
35
|
|
34
|
-
|
35
|
-
gem 'broken_link_finder'
|
36
|
-
```
|
36
|
+
Currently, the required MRI Ruby version is:
|
37
37
|
|
38
|
-
|
38
|
+
`~> 2.5` (i.e. `>= 2.5` and `< 3`)
|
39
39
|
|
40
|
-
|
40
|
+
### Using Bundler
|
41
41
|
|
42
|
-
|
42
|
+
$ bundle add broken_link_finder
|
43
|
+
|
44
|
+
### Using RubyGems
|
43
45
|
|
44
46
|
$ gem install broken_link_finder
|
45
47
|
|
46
|
-
|
48
|
+
### Verify
|
47
49
|
|
48
50
|
$ broken_link_finder version
|
49
51
|
|
50
52
|
## Usage
|
51
53
|
|
52
|
-
You can check for broken links via the
|
54
|
+
You can check for broken links via the executable or library.
|
53
55
|
|
54
56
|
### Executable
|
55
57
|
|
@@ -118,6 +120,35 @@ ftp://server.com
|
|
118
120
|
|
119
121
|
You can provide the `--html` flag if you'd prefer a HTML based report.
|
120
122
|
|
123
|
+
## Link Extraction
|
124
|
+
|
125
|
+
You can customise the XPath used to extract links from each crawled page. This can be done via the executable or library.
|
126
|
+
|
127
|
+
### Executable
|
128
|
+
|
129
|
+
Add the `--xpath` (or `-x`) flag to the crawl command e.g.
|
130
|
+
|
131
|
+
$ broken_link_finder crawl http://txti.es -x //img/@src
|
132
|
+
|
133
|
+
### Library
|
134
|
+
|
135
|
+
Set the desired XPath using the accessor methods provided:
|
136
|
+
|
137
|
+
> main.rb
|
138
|
+
|
139
|
+
```ruby
|
140
|
+
require 'broken_link_finder'
|
141
|
+
|
142
|
+
# Set your desired xpath before crawling...
|
143
|
+
BrokenLinkFinder::link_xpath = '//img/@src'
|
144
|
+
|
145
|
+
# Now crawl as normal and only your custom targeted links will be checked.
|
146
|
+
BrokenLinkFinder.new.crawl_page 'http://txti.es'
|
147
|
+
|
148
|
+
# Go back to using the default provided xpath as needed.
|
149
|
+
BrokenLinkFinder::link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
|
150
|
+
```
|
151
|
+
|
121
152
|
## Contributing
|
122
153
|
|
123
154
|
Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
|
data/broken_link_finder.gemspec
CHANGED
data/exe/broken_link_finder
CHANGED
@@ -9,6 +9,7 @@ class BrokenLinkFinderCLI < Thor
|
|
9
9
|
desc 'crawl [URL]', 'Find broken links at the URL'
|
10
10
|
option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
|
11
11
|
option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
|
12
|
+
option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
|
12
13
|
option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
|
13
14
|
option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
|
14
15
|
option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
|
@@ -22,6 +23,7 @@ class BrokenLinkFinderCLI < Thor
|
|
22
23
|
broken_verbose = !options[:concise]
|
23
24
|
ignored_verbose = options[:verbose]
|
24
25
|
|
26
|
+
BrokenLinkFinder.link_xpath = options[:xpath]
|
25
27
|
finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
|
26
28
|
options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
|
27
29
|
finder.report(
|
data/lib/broken_link_finder.rb
CHANGED
@@ -5,8 +5,9 @@ require 'wgit/core_ext'
|
|
5
5
|
require 'thread/pool'
|
6
6
|
require 'set'
|
7
7
|
|
8
|
-
require_relative './broken_link_finder/wgit_extensions'
|
9
8
|
require_relative './broken_link_finder/version'
|
9
|
+
require_relative './broken_link_finder/xpath'
|
10
|
+
require_relative './broken_link_finder/wgit_extensions'
|
10
11
|
require_relative './broken_link_finder/link_manager'
|
11
12
|
require_relative './broken_link_finder/reporter/reporter'
|
12
13
|
require_relative './broken_link_finder/reporter/text_reporter'
|
@@ -17,10 +17,10 @@ rescue StandardError
|
|
17
17
|
nil
|
18
18
|
end
|
19
19
|
|
20
|
-
#
|
20
|
+
# Define a custom extractor for all page links we're interested in checking.
|
21
21
|
Wgit::Document.define_extractor(
|
22
22
|
:all_links,
|
23
|
-
|
23
|
+
lambda { BrokenLinkFinder::link_xpath },
|
24
24
|
singleton: false,
|
25
25
|
text_content_only: true
|
26
26
|
) do |links, doc|
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module BrokenLinkFinder
|
4
|
+
# Extract all the Document's <body> links e.g. <a>, <img>, <script> etc.
|
5
|
+
DEFAULT_LINK_XPATH = '/html/body//*/@href | /html/body//*/@src'
|
6
|
+
|
7
|
+
@link_xpath = DEFAULT_LINK_XPATH
|
8
|
+
|
9
|
+
class << self
|
10
|
+
# The xpath used to extract links from a crawled page.
|
11
|
+
# Can be overridden as required.
|
12
|
+
attr_accessor :link_xpath
|
13
|
+
end
|
14
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-04-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -128,14 +128,14 @@ dependencies:
|
|
128
128
|
requirements:
|
129
129
|
- - "~>"
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: '0.
|
131
|
+
version: '0.10'
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: '0.
|
138
|
+
version: '0.10'
|
139
139
|
description: Finds a website's broken links using the 'wgit' gem and reports back
|
140
140
|
to you with a summary.
|
141
141
|
email: michael.telford@live.com
|
@@ -165,6 +165,7 @@ files:
|
|
165
165
|
- lib/broken_link_finder/reporter/text_reporter.rb
|
166
166
|
- lib/broken_link_finder/version.rb
|
167
167
|
- lib/broken_link_finder/wgit_extensions.rb
|
168
|
+
- lib/broken_link_finder/xpath.rb
|
168
169
|
- load.rb
|
169
170
|
homepage: https://github.com/michaeltelford/broken-link-finder
|
170
171
|
licenses:
|
@@ -191,7 +192,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
191
192
|
version: '0'
|
192
193
|
requirements: []
|
193
194
|
rubygems_version: 3.1.2
|
194
|
-
signing_key:
|
195
|
+
signing_key:
|
195
196
|
specification_version: 4
|
196
197
|
summary: Finds a website's broken links and reports back to you with a summary.
|
197
198
|
test_files: []
|