broken_link_finder 0.9.4 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +52 -0
- data/Gemfile.lock +51 -38
- data/README.md +65 -29
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +14 -3
- data/lib/broken_link_finder.rb +8 -2
- data/lib/broken_link_finder/finder.rb +131 -132
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- data/lib/broken_link_finder/xpath.rb +14 -0
- metadata +21 -15
- data/lib/broken_link_finder/reporter.rb +0 -116
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 24ca9c7a6071b07f5ab3132c9c79c4628570c9c3e157b77a27a05cdc0578ac6e
|
4
|
+
data.tar.gz: 6668eb430c8296e1439f56c242e7e08a27733605d724ec1c5cfa638dcfaa8b52
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1d1cdc47ade4651b8bc2df01212364ba938ee73269bf53e7278519ecd374247291c932abfa73a031973403ed55d360bc9d14b5c60ba312aca4b32837b5064294
|
7
|
+
data.tar.gz: f56308da4b9d7a4a39afd43808f77d2b6f2fbbf00f17502d2d889de504bcc82ee1858fb673333b11693a65ad73a4f5fb65a97b15955443e8268b1e0ab08b4e51
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.7.0
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,58 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.12.0
|
13
|
+
### Added
|
14
|
+
- `BrokenLinkFinder::link_xpath` and `link_xpath=` methods so you can customise how links are extracted from each crawled page using the API.
|
15
|
+
- An `--xpath` (or just `-x`) command line flag so you can customise how links are extracted when using the command line.
|
16
|
+
### Changed/Removed
|
17
|
+
- Changed the default way in which links are extracted from a page. Previously any element with a `href` or `src` attribute was extracted and checked; now only those links inside the `<body>` are extracted and checked, ignoring the `<head>` section entirely. You can change this behaviour back with: `BrokenLinkFinder::link_xpath = '//*/@href | //*/@src'` before you perform a crawl. Alternatively, if using the command line, use the `--xpath '//*/@href | //*/@src'` option (quoted so the shell doesn't interpret the `|` as a pipe).
|
18
|
+
### Fixed
|
19
|
+
- [Scheme relative bug](https://github.com/michaeltelford/broken_link_finder/issues/16) by upgrading to `wgit v0.10.0`.
|
20
|
+
---
|
21
|
+
|
22
|
+
## v0.11.1
|
23
|
+
### Added
|
24
|
+
- ...
|
25
|
+
### Changed/Removed
|
26
|
+
- Updated wgit gem to version 0.9.0 which contains improvements and bug fixes.
|
27
|
+
### Fixed
|
28
|
+
- ...
|
29
|
+
---
|
30
|
+
|
31
|
+
## v0.11.0
|
32
|
+
### Added
|
33
|
+
- Additional crawl statistics.
|
34
|
+
- Exit code handling to executable. `0` for success, `1` for an error scenario.
|
35
|
+
### Changed/Removed
|
36
|
+
- Updated the report formats slightly bringing various improvements such as the total number of links crawled etc.
|
37
|
+
### Fixed
|
38
|
+
- Bug in html report, summary url is now an `<a>` link.
|
39
|
+
- Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
|
40
|
+
- Bug causing an error on crawling unparsable/invalid URLs.
|
41
|
+
---
|
42
|
+
|
43
|
+
## v0.10.0
|
44
|
+
### Added
|
45
|
+
- A `--html` flag to the `crawl` executable command which produces a HTML report (instead of text).
|
46
|
+
- Added a 'retry' mechanism for any broken links found. This is essentially a verification step before generating a report.
|
47
|
+
- `Finder#crawl_stats` for info such as crawl duration, total links crawled etc.
|
48
|
+
### Changed/Removed
|
49
|
+
- The API has changed somewhat. See the [docs](https://www.rubydoc.info/gems/broken_link_finder) for the up to date code signatures if you're using `broken_link_finder` outside of its executable.
|
50
|
+
### Fixed
|
51
|
+
- ...
|
52
|
+
---
|
53
|
+
|
54
|
+
## v0.9.5
|
55
|
+
### Added
|
56
|
+
- ...
|
57
|
+
### Changed/Removed
|
58
|
+
- Now using optimistic dep versioning.
|
59
|
+
- Updated `wgit` to version 0.5.1 containing improvements and bug fixes.
|
60
|
+
### Fixed
|
61
|
+
- ...
|
62
|
+
---
|
63
|
+
|
12
64
|
## v0.9.4
|
13
65
|
### Added
|
14
66
|
- ...
|
data/Gemfile.lock
CHANGED
@@ -1,53 +1,66 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.
|
5
|
-
thor (~> 0.20
|
6
|
-
thread (~> 0.2
|
7
|
-
wgit (~> 0.
|
4
|
+
broken_link_finder (0.12.0)
|
5
|
+
thor (~> 0.20)
|
6
|
+
thread (~> 0.2)
|
7
|
+
wgit (~> 0.10)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
11
11
|
specs:
|
12
|
-
addressable (2.
|
13
|
-
public_suffix (>= 2.0.2, <
|
14
|
-
bson (4.
|
15
|
-
byebug (11.
|
16
|
-
|
17
|
-
|
18
|
-
|
12
|
+
addressable (2.7.0)
|
13
|
+
public_suffix (>= 2.0.2, < 5.0)
|
14
|
+
bson (4.12.0)
|
15
|
+
byebug (11.1.3)
|
16
|
+
cliver (0.3.2)
|
17
|
+
coderay (1.1.3)
|
18
|
+
concurrent-ruby (1.1.8)
|
19
|
+
crack (0.4.5)
|
20
|
+
rexml
|
19
21
|
ethon (0.12.0)
|
20
22
|
ffi (>= 1.3.0)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
23
|
+
ferrum (0.11)
|
24
|
+
addressable (~> 2.5)
|
25
|
+
cliver (~> 0.3)
|
26
|
+
concurrent-ruby (~> 1.1)
|
27
|
+
websocket-driver (>= 0.6, < 0.8)
|
28
|
+
ffi (1.15.0)
|
29
|
+
hashdiff (1.0.1)
|
30
|
+
maxitest (3.6.0)
|
31
|
+
minitest (>= 5.0.0, < 5.14.0)
|
32
|
+
method_source (1.0.0)
|
33
|
+
mini_portile2 (2.5.0)
|
34
|
+
minitest (5.13.0)
|
35
|
+
mongo (2.14.0)
|
36
|
+
bson (>= 4.8.2, < 5.0.0)
|
37
|
+
nokogiri (1.11.2)
|
38
|
+
mini_portile2 (~> 2.5.0)
|
39
|
+
racc (~> 1.4)
|
40
|
+
pry (0.14.0)
|
41
|
+
coderay (~> 1.1)
|
42
|
+
method_source (~> 1.0)
|
43
|
+
public_suffix (4.0.6)
|
44
|
+
racc (1.5.2)
|
45
|
+
rake (13.0.3)
|
46
|
+
rexml (3.2.4)
|
38
47
|
thor (0.20.3)
|
39
48
|
thread (0.2.2)
|
40
|
-
typhoeus (1.
|
49
|
+
typhoeus (1.4.0)
|
41
50
|
ethon (>= 0.9.0)
|
42
|
-
webmock (3.
|
51
|
+
webmock (3.12.2)
|
43
52
|
addressable (>= 2.3.6)
|
44
53
|
crack (>= 0.3.2)
|
45
54
|
hashdiff (>= 0.4.0, < 2.0.0)
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
55
|
+
websocket-driver (0.7.3)
|
56
|
+
websocket-extensions (>= 0.1.0)
|
57
|
+
websocket-extensions (0.1.5)
|
58
|
+
wgit (0.10.0)
|
59
|
+
addressable (~> 2.6)
|
60
|
+
ferrum (~> 0.8)
|
61
|
+
mongo (~> 2.9)
|
62
|
+
nokogiri (~> 1.10)
|
63
|
+
typhoeus (~> 1.3)
|
51
64
|
|
52
65
|
PLATFORMS
|
53
66
|
ruby
|
@@ -58,11 +71,11 @@ DEPENDENCIES
|
|
58
71
|
byebug (~> 11.0)
|
59
72
|
maxitest (~> 3.3)
|
60
73
|
pry (~> 0.12)
|
61
|
-
rake (~>
|
74
|
+
rake (~> 13.0)
|
62
75
|
webmock (~> 3.6)
|
63
76
|
|
64
77
|
RUBY VERSION
|
65
|
-
ruby 2.
|
78
|
+
ruby 2.7.0p0
|
66
79
|
|
67
80
|
BUNDLED WITH
|
68
|
-
2.
|
81
|
+
2.1.4
|
data/README.md
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
# Broken Link Finder
|
2
2
|
|
3
|
-
Does what it says on the tin
|
3
|
+
Does what it says on the tin - finds a website's broken links.
|
4
4
|
|
5
|
-
Simply point it at a website and it will crawl all of its webpages searching for and identifing
|
5
|
+
Simply point it at a website and it will crawl all of its webpages searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
|
6
6
|
|
7
|
-
|
7
|
+
Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
|
8
8
|
|
9
9
|
## How It Works
|
10
10
|
|
11
|
-
Any HTML
|
11
|
+
Any HTML element within `<body>` with a `href` or `src` attribute is considered a link (this is [configurable](#Link-Extraction) however).
|
12
|
+
|
13
|
+
For each link on a given page, any of the following conditions constitutes that the link is broken:
|
12
14
|
|
13
15
|
- An empty HTML response body is returned.
|
14
16
|
- A response status code of `404 Not Found` is returned.
|
@@ -29,27 +31,27 @@ With that said, the usual array of HTTP URL features are supported including anc
|
|
29
31
|
|
30
32
|
## Installation
|
31
33
|
|
32
|
-
|
34
|
+
Only MRI Ruby is tested and supported, but `broken_link_finder` may work with other Ruby implementations.
|
33
35
|
|
34
|
-
|
35
|
-
gem 'broken_link_finder'
|
36
|
-
```
|
36
|
+
Currently, the required MRI Ruby version is:
|
37
37
|
|
38
|
-
|
38
|
+
`~> 2.5` (a.k.a.) `>= 2.5 && < 3`
|
39
39
|
|
40
|
-
|
40
|
+
### Using Bundler
|
41
41
|
|
42
|
-
|
42
|
+
$ bundle add broken_link_finder
|
43
|
+
|
44
|
+
### Using RubyGems
|
43
45
|
|
44
46
|
$ gem install broken_link_finder
|
45
47
|
|
46
|
-
|
48
|
+
### Verify
|
47
49
|
|
48
50
|
$ broken_link_finder version
|
49
51
|
|
50
52
|
## Usage
|
51
53
|
|
52
|
-
You can check for broken links via the
|
54
|
+
You can check for broken links via the executable or library.
|
53
55
|
|
54
56
|
### Executable
|
55
57
|
|
@@ -57,7 +59,7 @@ Installing this gem installs the `broken_link_finder` executable into your `$PAT
|
|
57
59
|
|
58
60
|
$ broken_link_finder crawl http://txti.es
|
59
61
|
|
60
|
-
Adding the
|
62
|
+
Adding the `--recursive` flag would crawl the entire `txti.es` site, not just its index page.
|
61
63
|
|
62
64
|
See the [output](#Output) section below for an example of a site with broken links.
|
63
65
|
|
@@ -76,7 +78,7 @@ require 'broken_link_finder'
|
|
76
78
|
|
77
79
|
finder = BrokenLinkFinder.new
|
78
80
|
finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
|
79
|
-
finder.
|
81
|
+
finder.report # Or use Finder#broken_links and Finder#ignored_links
|
80
82
|
# for direct access to the link Hashes.
|
81
83
|
```
|
82
84
|
|
@@ -91,28 +93,62 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
|
|
91
93
|
If broken links are found then the output will look something like:
|
92
94
|
|
93
95
|
```text
|
94
|
-
|
96
|
+
Crawled http://txti.es
|
97
|
+
7 page(s) containing 32 unique link(s) in 6.82 seconds
|
98
|
+
|
99
|
+
Found 6 unique broken link(s) across 2 page(s):
|
95
100
|
|
96
101
|
The following broken links were found on 'http://txti.es/about':
|
97
102
|
http://twitter.com/thebarrytone
|
103
|
+
/doesntexist
|
98
104
|
http://twitter.com/nwbld
|
99
|
-
|
100
|
-
https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
|
105
|
+
twitter.com/txties
|
101
106
|
|
102
107
|
The following broken links were found on 'http://txti.es/how':
|
103
108
|
http://en.wikipedia.org/wiki/Markdown
|
104
109
|
http://imgur.com
|
105
110
|
|
106
|
-
Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
|
111
|
+
Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
|
107
112
|
|
108
|
-
The following links were ignored on http://txti.es:
|
113
|
+
The following links were ignored on 'http://txti.es':
|
109
114
|
tel:+13174562564
|
110
115
|
mailto:big.jim@jmail.com
|
111
116
|
|
112
|
-
The following links were ignored on http://txti.es/contact:
|
117
|
+
The following links were ignored on 'http://txti.es/contact':
|
113
118
|
ftp://server.com
|
114
119
|
```
|
115
120
|
|
121
|
+
You can provide the `--html` flag if you'd prefer a HTML based report.
|
122
|
+
|
123
|
+
## Link Extraction
|
124
|
+
|
125
|
+
You can customise the XPath used to extract links from each crawled page. This can be done via the executable or library.
|
126
|
+
|
127
|
+
### Executable
|
128
|
+
|
129
|
+
Add the `--xpath` (or `-x`) flag to the crawl command e.g.
|
130
|
+
|
131
|
+
$ broken_link_finder crawl http://txti.es -x //img/@src
|
132
|
+
|
133
|
+
### Library
|
134
|
+
|
135
|
+
Set the desired XPath using the accessor methods provided:
|
136
|
+
|
137
|
+
> main.rb
|
138
|
+
|
139
|
+
```ruby
|
140
|
+
require 'broken_link_finder'
|
141
|
+
|
142
|
+
# Set your desired xpath before crawling...
|
143
|
+
BrokenLinkFinder::link_xpath = '//img/@src'
|
144
|
+
|
145
|
+
# Now crawl as normal and only your custom targeted links will be checked.
|
146
|
+
BrokenLinkFinder.new.crawl_page 'http://txti.es'
|
147
|
+
|
148
|
+
# Go back to using the default provided xpath as needed.
|
149
|
+
BrokenLinkFinder::link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
|
150
|
+
```
|
151
|
+
|
116
152
|
## Contributing
|
117
153
|
|
118
154
|
Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
|
@@ -128,11 +164,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
128
164
|
To install this gem onto your local machine, run `bundle exec rake install`.
|
129
165
|
|
130
166
|
To release a new gem version:
|
131
|
-
- Update the deps in the `*.gemspec
|
132
|
-
- Update the version number in `version.rb` and add the new version to the `CHANGELOG
|
133
|
-
- Run `bundle install
|
134
|
-
- Run `bundle exec rake test` ensuring all tests pass
|
135
|
-
- Run `bundle exec rake compile` ensuring no warnings
|
136
|
-
- Run `bundle exec rake install && rbenv rehash
|
137
|
-
- Manually test the executable
|
138
|
-
- Run `bundle exec rake release[origin]
|
167
|
+
- Update the deps in the `*.gemspec`, if necessary.
|
168
|
+
- Update the version number in `version.rb` and add the new version to the `CHANGELOG`.
|
169
|
+
- Run `bundle install`.
|
170
|
+
- Run `bundle exec rake test` ensuring all tests pass.
|
171
|
+
- Run `bundle exec rake compile` ensuring no warnings.
|
172
|
+
- Run `bundle exec rake install && rbenv rehash`.
|
173
|
+
- Manually test the executable.
|
174
|
+
- Run `bundle exec rake release[origin]`.
|
data/benchmark.rb
CHANGED
@@ -10,15 +10,19 @@ finder = BrokenLinkFinder::Finder.new
|
|
10
10
|
puts Benchmark.measure { finder.crawl_site url }
|
11
11
|
puts "Links crawled: #{finder.total_links_crawled}"
|
12
12
|
|
13
|
-
# http://txti.es page crawl
|
14
|
-
# Pre
|
15
|
-
# Post
|
13
|
+
# http://txti.es page crawl with threading
|
14
|
+
# Pre: 17.5 seconds
|
15
|
+
# Post: 7.5 seconds
|
16
16
|
|
17
|
-
# http://txti.es
|
17
|
+
# http://txti.es with threading - page vs site crawl
|
18
18
|
# Page: 9.526981
|
19
19
|
# Site: 9.732416
|
20
20
|
# Multi-threading crawl_site now yields the same time as a single page
|
21
21
|
|
22
|
-
# Large site crawl -
|
22
|
+
# Large site crawl - all link recording functionality
|
23
23
|
# Pre: 608 seconds with 7665 links crawled
|
24
24
|
# Post: 355 seconds with 1099 links crawled
|
25
|
+
|
26
|
+
# Large site crawl - retry mechanism
|
27
|
+
# Pre: 140 seconds
|
28
|
+
# Post: 170 seconds
|
data/bin/console
CHANGED
@@ -5,20 +5,10 @@ require 'bundler/setup'
|
|
5
5
|
require 'pry'
|
6
6
|
require 'byebug'
|
7
7
|
require 'broken_link_finder'
|
8
|
+
require 'logger'
|
8
9
|
|
9
|
-
#
|
10
|
-
|
11
|
-
singleton_class.class_eval do
|
12
|
-
alias_method :orig_get, :get
|
13
|
-
end
|
14
|
-
|
15
|
-
def self.get(base_url, options = {})
|
16
|
-
puts "[typhoeus] Sending GET: #{base_url}"
|
17
|
-
resp = orig_get(base_url, options)
|
18
|
-
puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
|
19
|
-
resp
|
20
|
-
end
|
21
|
-
end
|
10
|
+
# Logs all HTTP requests.
|
11
|
+
Wgit.logger.level = Logger::DEBUG
|
22
12
|
|
23
13
|
# Call reload to load all recent code changes.
|
24
14
|
def reload
|
@@ -33,12 +23,14 @@ end
|
|
33
23
|
# You can add fixtures and/or initialization code here...
|
34
24
|
reload
|
35
25
|
|
36
|
-
url
|
37
|
-
by_page
|
38
|
-
by_link
|
39
|
-
finder
|
26
|
+
def url; @url ||= 'http://txti.es/'; end
|
27
|
+
def by_page; @by_page ||= Finder.new; end
|
28
|
+
def by_link; @by_link ||= Finder.new(sort: :link); end
|
29
|
+
def finder; @finder ||= by_page; end
|
40
30
|
|
41
31
|
# Start the console.
|
42
|
-
puts
|
32
|
+
puts
|
33
|
+
puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
|
34
|
+
puts
|
43
35
|
|
44
|
-
|
36
|
+
Pry.start
|
data/bin/setup
CHANGED
data/broken_link_finder.gemspec
CHANGED
@@ -15,7 +15,10 @@ Gem::Specification.new do |spec|
|
|
15
15
|
spec.homepage = 'https://github.com/michaeltelford/broken-link-finder'
|
16
16
|
spec.license = 'MIT'
|
17
17
|
spec.metadata = {
|
18
|
-
'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder'
|
18
|
+
'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder',
|
19
|
+
'changelog_uri' => 'https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md',
|
20
|
+
'bug_tracker_uri' => 'https://github.com/michaeltelford/broken-link-finder/issues',
|
21
|
+
'documentation_uri' => 'https://www.rubydoc.info/gems/broken_link_finder'
|
19
22
|
}
|
20
23
|
|
21
24
|
# Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
|
@@ -41,10 +44,10 @@ Gem::Specification.new do |spec|
|
|
41
44
|
spec.add_development_dependency 'byebug', '~> 11.0'
|
42
45
|
spec.add_development_dependency 'maxitest', '~> 3.3'
|
43
46
|
spec.add_development_dependency 'pry', '~> 0.12'
|
44
|
-
spec.add_development_dependency 'rake', '~>
|
47
|
+
spec.add_development_dependency 'rake', '~> 13.0'
|
45
48
|
spec.add_development_dependency 'webmock', '~> 3.6'
|
46
49
|
|
47
|
-
spec.add_runtime_dependency 'thor', '~> 0.20
|
48
|
-
spec.add_runtime_dependency 'thread', '~> 0.2
|
49
|
-
spec.add_runtime_dependency 'wgit', '~> 0.
|
50
|
+
spec.add_runtime_dependency 'thor', '~> 0.20'
|
51
|
+
spec.add_runtime_dependency 'thread', '~> 0.2'
|
52
|
+
spec.add_runtime_dependency 'wgit', '~> 0.10'
|
50
53
|
end
|