broken_link_finder 0.9.4 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +52 -0
- data/Gemfile.lock +51 -38
- data/README.md +65 -29
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +14 -3
- data/lib/broken_link_finder.rb +8 -2
- data/lib/broken_link_finder/finder.rb +131 -132
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- data/lib/broken_link_finder/xpath.rb +14 -0
- metadata +21 -15
- data/lib/broken_link_finder/reporter.rb +0 -116
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 24ca9c7a6071b07f5ab3132c9c79c4628570c9c3e157b77a27a05cdc0578ac6e
|
4
|
+
data.tar.gz: 6668eb430c8296e1439f56c242e7e08a27733605d724ec1c5cfa638dcfaa8b52
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1d1cdc47ade4651b8bc2df01212364ba938ee73269bf53e7278519ecd374247291c932abfa73a031973403ed55d360bc9d14b5c60ba312aca4b32837b5064294
|
7
|
+
data.tar.gz: f56308da4b9d7a4a39afd43808f77d2b6f2fbbf00f17502d2d889de504bcc82ee1858fb673333b11693a65ad73a4f5fb65a97b15955443e8268b1e0ab08b4e51
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.7.0
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,58 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.12.0
|
13
|
+
### Added
|
14
|
+
- `BrokenLinkFinder::link_xpath` and `link_xpath=` methods so you can customise how links are extracted from each crawled page using the API.
|
15
|
+
- An `--xpath` (or just `-x`) command line flag so you can customise how links are extracted when using the command line.
|
16
|
+
### Changed/Removed
|
17
|
+
- Changed the default way in which links are extracted from a page. Previously any element with a `href` or `src` attribute was extracted and checked; now only those links inside the `<body>` are extracted and checked, ignoring the `<head>` section entirely. You can change this behaviour back with: `BrokenLinkFinder::link_xpath = '//*/@href | //*/@src'` before you perform a crawl. Alternatively, if using the command line, use the `--xpath '//*/@href | //*/@src'` option (quoted so that the shell does not interpret `|` as a pipe).
|
18
|
+
### Fixed
|
19
|
+
- [Scheme relative bug](https://github.com/michaeltelford/broken_link_finder/issues/16) by upgrading to `wgit v0.10.0`.
|
20
|
+
---
|
21
|
+
|
22
|
+
## v0.11.1
|
23
|
+
### Added
|
24
|
+
- ...
|
25
|
+
### Changed/Removed
|
26
|
+
- Updated wgit gem to version 0.9.0 which contains improvements and bug fixes.
|
27
|
+
### Fixed
|
28
|
+
- ...
|
29
|
+
---
|
30
|
+
|
31
|
+
## v0.11.0
|
32
|
+
### Added
|
33
|
+
- Additional crawl statistics.
|
34
|
+
- Exit code handling to executable. `0` for success, `1` for an error scenario.
|
35
|
+
### Changed/Removed
|
36
|
+
- Updated the report formats slightly bringing various improvements such as the total number of links crawled etc.
|
37
|
+
### Fixed
|
38
|
+
- Bug in html report, summary url is now an `<a>` link.
|
39
|
+
- Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
|
40
|
+
- Bug causing an error on crawling unparsable/invalid URLs.
|
41
|
+
---
|
42
|
+
|
43
|
+
## v0.10.0
|
44
|
+
### Added
|
45
|
+
- A `--html` flag to the `crawl` executable command which produces a HTML report (instead of text).
|
46
|
+
- Added a 'retry' mechanism for any broken links found. This is essentially a verification step before generating a report.
|
47
|
+
- `Finder#crawl_stats` for info such as crawl duration, total links crawled etc.
|
48
|
+
### Changed/Removed
|
49
|
+
- The API has changed somewhat. See the [docs](https://www.rubydoc.info/gems/broken_link_finder) for the up to date code signatures if you're using `broken_link_finder` outside of its executable.
|
50
|
+
### Fixed
|
51
|
+
- ...
|
52
|
+
---
|
53
|
+
|
54
|
+
## v0.9.5
|
55
|
+
### Added
|
56
|
+
- ...
|
57
|
+
### Changed/Removed
|
58
|
+
- Now using optimistic dep versioning.
|
59
|
+
- Updated `wgit` to version 0.5.1 containing improvements and bug fixes.
|
60
|
+
### Fixed
|
61
|
+
- ...
|
62
|
+
---
|
63
|
+
|
12
64
|
## v0.9.4
|
13
65
|
### Added
|
14
66
|
- ...
|
data/Gemfile.lock
CHANGED
@@ -1,53 +1,66 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.
|
5
|
-
thor (~> 0.20
|
6
|
-
thread (~> 0.2
|
7
|
-
wgit (~> 0.
|
4
|
+
broken_link_finder (0.12.0)
|
5
|
+
thor (~> 0.20)
|
6
|
+
thread (~> 0.2)
|
7
|
+
wgit (~> 0.10)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
11
11
|
specs:
|
12
|
-
addressable (2.
|
13
|
-
public_suffix (>= 2.0.2, <
|
14
|
-
bson (4.
|
15
|
-
byebug (11.
|
16
|
-
|
17
|
-
|
18
|
-
|
12
|
+
addressable (2.7.0)
|
13
|
+
public_suffix (>= 2.0.2, < 5.0)
|
14
|
+
bson (4.12.0)
|
15
|
+
byebug (11.1.3)
|
16
|
+
cliver (0.3.2)
|
17
|
+
coderay (1.1.3)
|
18
|
+
concurrent-ruby (1.1.8)
|
19
|
+
crack (0.4.5)
|
20
|
+
rexml
|
19
21
|
ethon (0.12.0)
|
20
22
|
ffi (>= 1.3.0)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
23
|
+
ferrum (0.11)
|
24
|
+
addressable (~> 2.5)
|
25
|
+
cliver (~> 0.3)
|
26
|
+
concurrent-ruby (~> 1.1)
|
27
|
+
websocket-driver (>= 0.6, < 0.8)
|
28
|
+
ffi (1.15.0)
|
29
|
+
hashdiff (1.0.1)
|
30
|
+
maxitest (3.6.0)
|
31
|
+
minitest (>= 5.0.0, < 5.14.0)
|
32
|
+
method_source (1.0.0)
|
33
|
+
mini_portile2 (2.5.0)
|
34
|
+
minitest (5.13.0)
|
35
|
+
mongo (2.14.0)
|
36
|
+
bson (>= 4.8.2, < 5.0.0)
|
37
|
+
nokogiri (1.11.2)
|
38
|
+
mini_portile2 (~> 2.5.0)
|
39
|
+
racc (~> 1.4)
|
40
|
+
pry (0.14.0)
|
41
|
+
coderay (~> 1.1)
|
42
|
+
method_source (~> 1.0)
|
43
|
+
public_suffix (4.0.6)
|
44
|
+
racc (1.5.2)
|
45
|
+
rake (13.0.3)
|
46
|
+
rexml (3.2.4)
|
38
47
|
thor (0.20.3)
|
39
48
|
thread (0.2.2)
|
40
|
-
typhoeus (1.
|
49
|
+
typhoeus (1.4.0)
|
41
50
|
ethon (>= 0.9.0)
|
42
|
-
webmock (3.
|
51
|
+
webmock (3.12.2)
|
43
52
|
addressable (>= 2.3.6)
|
44
53
|
crack (>= 0.3.2)
|
45
54
|
hashdiff (>= 0.4.0, < 2.0.0)
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
55
|
+
websocket-driver (0.7.3)
|
56
|
+
websocket-extensions (>= 0.1.0)
|
57
|
+
websocket-extensions (0.1.5)
|
58
|
+
wgit (0.10.0)
|
59
|
+
addressable (~> 2.6)
|
60
|
+
ferrum (~> 0.8)
|
61
|
+
mongo (~> 2.9)
|
62
|
+
nokogiri (~> 1.10)
|
63
|
+
typhoeus (~> 1.3)
|
51
64
|
|
52
65
|
PLATFORMS
|
53
66
|
ruby
|
@@ -58,11 +71,11 @@ DEPENDENCIES
|
|
58
71
|
byebug (~> 11.0)
|
59
72
|
maxitest (~> 3.3)
|
60
73
|
pry (~> 0.12)
|
61
|
-
rake (~>
|
74
|
+
rake (~> 13.0)
|
62
75
|
webmock (~> 3.6)
|
63
76
|
|
64
77
|
RUBY VERSION
|
65
|
-
ruby 2.
|
78
|
+
ruby 2.7.0p0
|
66
79
|
|
67
80
|
BUNDLED WITH
|
68
|
-
2.
|
81
|
+
2.1.4
|
data/README.md
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
# Broken Link Finder
|
2
2
|
|
3
|
-
Does what it says on the tin
|
3
|
+
Does what it says on the tin - finds a website's broken links.
|
4
4
|
|
5
|
-
Simply point it at a website and it will crawl all of its webpages searching for and identifing
|
5
|
+
Simply point it at a website and it will crawl all of its webpages searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
|
6
6
|
|
7
|
-
|
7
|
+
Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
|
8
8
|
|
9
9
|
## How It Works
|
10
10
|
|
11
|
-
Any HTML
|
11
|
+
Any HTML element within `<body>` with a `href` or `src` attribute is considered a link (this is [configurable](#Link-Extraction) however).
|
12
|
+
|
13
|
+
For each link on a given page, the link is considered broken if any of the following conditions is met:
|
12
14
|
|
13
15
|
- An empty HTML response body is returned.
|
14
16
|
- A response status code of `404 Not Found` is returned.
|
@@ -29,27 +31,27 @@ With that said, the usual array of HTTP URL features are supported including anc
|
|
29
31
|
|
30
32
|
## Installation
|
31
33
|
|
32
|
-
|
34
|
+
Only MRI Ruby is tested and supported, but `broken_link_finder` may work with other Ruby implementations.
|
33
35
|
|
34
|
-
|
35
|
-
gem 'broken_link_finder'
|
36
|
-
```
|
36
|
+
Currently, the required MRI Ruby version is:
|
37
37
|
|
38
|
-
|
38
|
+
`~> 2.5` (a.k.a. `>= 2.5 && < 3`)
|
39
39
|
|
40
|
-
|
40
|
+
### Using Bundler
|
41
41
|
|
42
|
-
|
42
|
+
$ bundle add broken_link_finder
|
43
|
+
|
44
|
+
### Using RubyGems
|
43
45
|
|
44
46
|
$ gem install broken_link_finder
|
45
47
|
|
46
|
-
|
48
|
+
### Verify
|
47
49
|
|
48
50
|
$ broken_link_finder version
|
49
51
|
|
50
52
|
## Usage
|
51
53
|
|
52
|
-
You can check for broken links via the
|
54
|
+
You can check for broken links via the executable or library.
|
53
55
|
|
54
56
|
### Executable
|
55
57
|
|
@@ -57,7 +59,7 @@ Installing this gem installs the `broken_link_finder` executable into your `$PAT
|
|
57
59
|
|
58
60
|
$ broken_link_finder crawl http://txti.es
|
59
61
|
|
60
|
-
Adding the
|
62
|
+
Adding the `--recursive` flag would crawl the entire `txti.es` site, not just its index page.
|
61
63
|
|
62
64
|
See the [output](#Output) section below for an example of a site with broken links.
|
63
65
|
|
@@ -76,7 +78,7 @@ require 'broken_link_finder'
|
|
76
78
|
|
77
79
|
finder = BrokenLinkFinder.new
|
78
80
|
finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
|
79
|
-
finder.
|
81
|
+
finder.report # Or use Finder#broken_links and Finder#ignored_links
|
80
82
|
# for direct access to the link Hashes.
|
81
83
|
```
|
82
84
|
|
@@ -91,28 +93,62 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
|
|
91
93
|
If broken links are found then the output will look something like:
|
92
94
|
|
93
95
|
```text
|
94
|
-
|
96
|
+
Crawled http://txti.es
|
97
|
+
7 page(s) containing 32 unique link(s) in 6.82 seconds
|
98
|
+
|
99
|
+
Found 6 unique broken link(s) across 2 page(s):
|
95
100
|
|
96
101
|
The following broken links were found on 'http://txti.es/about':
|
97
102
|
http://twitter.com/thebarrytone
|
103
|
+
/doesntexist
|
98
104
|
http://twitter.com/nwbld
|
99
|
-
|
100
|
-
https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
|
105
|
+
twitter.com/txties
|
101
106
|
|
102
107
|
The following broken links were found on 'http://txti.es/how':
|
103
108
|
http://en.wikipedia.org/wiki/Markdown
|
104
109
|
http://imgur.com
|
105
110
|
|
106
|
-
Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
|
111
|
+
Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
|
107
112
|
|
108
|
-
The following links were ignored on http://txti.es:
|
113
|
+
The following links were ignored on 'http://txti.es':
|
109
114
|
tel:+13174562564
|
110
115
|
mailto:big.jim@jmail.com
|
111
116
|
|
112
|
-
The following links were ignored on http://txti.es/contact:
|
117
|
+
The following links were ignored on 'http://txti.es/contact':
|
113
118
|
ftp://server.com
|
114
119
|
```
|
115
120
|
|
121
|
+
You can provide the `--html` flag if you'd prefer an HTML-based report.
|
122
|
+
|
123
|
+
## Link Extraction
|
124
|
+
|
125
|
+
You can customise the XPath used to extract links from each crawled page. This can be done via the executable or library.
|
126
|
+
|
127
|
+
### Executable
|
128
|
+
|
129
|
+
Add the `--xpath` (or `-x`) flag to the crawl command e.g.
|
130
|
+
|
131
|
+
$ broken_link_finder crawl http://txti.es -x //img/@src
|
132
|
+
|
133
|
+
### Library
|
134
|
+
|
135
|
+
Set the desired XPath using the accessor methods provided:
|
136
|
+
|
137
|
+
> main.rb
|
138
|
+
|
139
|
+
```ruby
|
140
|
+
require 'broken_link_finder'
|
141
|
+
|
142
|
+
# Set your desired xpath before crawling...
|
143
|
+
BrokenLinkFinder::link_xpath = '//img/@src'
|
144
|
+
|
145
|
+
# Now crawl as normal and only your custom targeted links will be checked.
|
146
|
+
BrokenLinkFinder.new.crawl_page 'http://txti.es'
|
147
|
+
|
148
|
+
# Go back to using the default provided xpath as needed.
|
149
|
+
BrokenLinkFinder::link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
|
150
|
+
```
|
151
|
+
|
116
152
|
## Contributing
|
117
153
|
|
118
154
|
Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
|
@@ -128,11 +164,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
128
164
|
To install this gem onto your local machine, run `bundle exec rake install`.
|
129
165
|
|
130
166
|
To release a new gem version:
|
131
|
-
- Update the deps in the `*.gemspec
|
132
|
-
- Update the version number in `version.rb` and add the new version to the `CHANGELOG
|
133
|
-
- Run `bundle install
|
134
|
-
- Run `bundle exec rake test` ensuring all tests pass
|
135
|
-
- Run `bundle exec rake compile` ensuring no warnings
|
136
|
-
- Run `bundle exec rake install && rbenv rehash
|
137
|
-
- Manually test the executable
|
138
|
-
- Run `bundle exec rake release[origin]
|
167
|
+
- Update the deps in the `*.gemspec`, if necessary.
|
168
|
+
- Update the version number in `version.rb` and add the new version to the `CHANGELOG`.
|
169
|
+
- Run `bundle install`.
|
170
|
+
- Run `bundle exec rake test` ensuring all tests pass.
|
171
|
+
- Run `bundle exec rake compile` ensuring no warnings.
|
172
|
+
- Run `bundle exec rake install && rbenv rehash`.
|
173
|
+
- Manually test the executable.
|
174
|
+
- Run `bundle exec rake release[origin]`.
|
data/benchmark.rb
CHANGED
@@ -10,15 +10,19 @@ finder = BrokenLinkFinder::Finder.new
|
|
10
10
|
puts Benchmark.measure { finder.crawl_site url }
|
11
11
|
puts "Links crawled: #{finder.total_links_crawled}"
|
12
12
|
|
13
|
-
# http://txti.es page crawl
|
14
|
-
# Pre
|
15
|
-
# Post
|
13
|
+
# http://txti.es page crawl with threading
|
14
|
+
# Pre: 17.5 seconds
|
15
|
+
# Post: 7.5 seconds
|
16
16
|
|
17
|
-
# http://txti.es
|
17
|
+
# http://txti.es with threading - page vs site crawl
|
18
18
|
# Page: 9.526981
|
19
19
|
# Site: 9.732416
|
20
20
|
# Multi-threading crawl_site now yields the same time as a single page
|
21
21
|
|
22
|
-
# Large site crawl -
|
22
|
+
# Large site crawl - all link recording functionality
|
23
23
|
# Pre: 608 seconds with 7665 links crawled
|
24
24
|
# Post: 355 seconds with 1099 links crawled
|
25
|
+
|
26
|
+
# Large site crawl - retry mechanism
|
27
|
+
# Pre: 140 seconds
|
28
|
+
# Post: 170 seconds
|
data/bin/console
CHANGED
@@ -5,20 +5,10 @@ require 'bundler/setup'
|
|
5
5
|
require 'pry'
|
6
6
|
require 'byebug'
|
7
7
|
require 'broken_link_finder'
|
8
|
+
require 'logger'
|
8
9
|
|
9
|
-
#
|
10
|
-
|
11
|
-
singleton_class.class_eval do
|
12
|
-
alias_method :orig_get, :get
|
13
|
-
end
|
14
|
-
|
15
|
-
def self.get(base_url, options = {})
|
16
|
-
puts "[typhoeus] Sending GET: #{base_url}"
|
17
|
-
resp = orig_get(base_url, options)
|
18
|
-
puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
|
19
|
-
resp
|
20
|
-
end
|
21
|
-
end
|
10
|
+
# Logs all HTTP requests.
|
11
|
+
Wgit.logger.level = Logger::DEBUG
|
22
12
|
|
23
13
|
# Call reload to load all recent code changes.
|
24
14
|
def reload
|
@@ -33,12 +23,14 @@ end
|
|
33
23
|
# You can add fixtures and/or initialization code here...
|
34
24
|
reload
|
35
25
|
|
36
|
-
url
|
37
|
-
by_page
|
38
|
-
by_link
|
39
|
-
finder
|
26
|
+
def url; @url ||= 'http://txti.es/'; end
|
27
|
+
def by_page; @by_page ||= Finder.new; end
|
28
|
+
def by_link; @by_link ||= Finder.new(sort: :link); end
|
29
|
+
def finder; @finder ||= by_page; end
|
40
30
|
|
41
31
|
# Start the console.
|
42
|
-
puts
|
32
|
+
puts
|
33
|
+
puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
|
34
|
+
puts
|
43
35
|
|
44
|
-
|
36
|
+
Pry.start
|
data/bin/setup
CHANGED
data/broken_link_finder.gemspec
CHANGED
@@ -15,7 +15,10 @@ Gem::Specification.new do |spec|
|
|
15
15
|
spec.homepage = 'https://github.com/michaeltelford/broken-link-finder'
|
16
16
|
spec.license = 'MIT'
|
17
17
|
spec.metadata = {
|
18
|
-
'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder'
|
18
|
+
'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder',
|
19
|
+
'changelog_uri' => 'https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md',
|
20
|
+
'bug_tracker_uri' => 'https://github.com/michaeltelford/broken-link-finder/issues',
|
21
|
+
'documentation_uri' => 'https://www.rubydoc.info/gems/broken_link_finder'
|
19
22
|
}
|
20
23
|
|
21
24
|
# Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
|
@@ -41,10 +44,10 @@ Gem::Specification.new do |spec|
|
|
41
44
|
spec.add_development_dependency 'byebug', '~> 11.0'
|
42
45
|
spec.add_development_dependency 'maxitest', '~> 3.3'
|
43
46
|
spec.add_development_dependency 'pry', '~> 0.12'
|
44
|
-
spec.add_development_dependency 'rake', '~>
|
47
|
+
spec.add_development_dependency 'rake', '~> 13.0'
|
45
48
|
spec.add_development_dependency 'webmock', '~> 3.6'
|
46
49
|
|
47
|
-
spec.add_runtime_dependency 'thor', '~> 0.20
|
48
|
-
spec.add_runtime_dependency 'thread', '~> 0.2
|
49
|
-
spec.add_runtime_dependency 'wgit', '~> 0.
|
50
|
+
spec.add_runtime_dependency 'thor', '~> 0.20'
|
51
|
+
spec.add_runtime_dependency 'thread', '~> 0.2'
|
52
|
+
spec.add_runtime_dependency 'wgit', '~> 0.10'
|
50
53
|
end
|