broken_link_finder 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +24 -21
- data/README.md +3 -1
- data/benchmark.rb +0 -1
- data/bin/console +13 -22
- data/broken_link_finder.gemspec +5 -7
- data/lib/broken_link_finder/finder.rb +25 -16
- data/lib/broken_link_finder/reporter.rb +4 -4
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +1 -1
- metadata +17 -45
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 110068a5db9454d69709454f50ade3f667c0e63dad6d101c23bf991a04f770eb
|
4
|
+
data.tar.gz: 0fd46fed486bf382935d68020fcae9391c288b5d5742ce19ba61d2896e2eebe7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c9c744045ee462b8981ab418a5c3acdb9162c36e14c8603391c1f79e9247872de877a83ce2a7dd5f8779d8774a044562054445410666a5af518d11f5fded3f22
|
7
|
+
data.tar.gz: e4e0e2b2d19596493564a361fef0a98afdddcefb395309dba139df6bd508a12385ca16e47004bd503c585954454d9608449995b4ed6009926e0b856d11962181
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,15 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.9.2
|
13
|
+
### Added
|
14
|
+
- ...
|
15
|
+
### Changed/Removed
|
16
|
+
- Updated `wgit` gem to version 0.4.0 which brings a speed boost to crawls.
|
17
|
+
### Fixed
|
18
|
+
- ...
|
19
|
+
---
|
20
|
+
|
12
21
|
## v0.9.1
|
13
22
|
### Added
|
14
23
|
- `BrokenLinkFinder::Finder.crawl_site` alias: `crawl_r`.
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.9.
|
5
|
-
thor (
|
6
|
-
thread (
|
7
|
-
wgit (
|
4
|
+
broken_link_finder (0.9.2)
|
5
|
+
thor (~> 0.20.3)
|
6
|
+
thread (~> 0.2.0)
|
7
|
+
wgit (~> 0.4.0)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
@@ -16,15 +16,16 @@ GEM
|
|
16
16
|
coderay (1.1.2)
|
17
17
|
crack (0.4.3)
|
18
18
|
safe_yaml (~> 1.0.0)
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
19
|
+
ethon (0.12.0)
|
20
|
+
ffi (>= 1.3.0)
|
21
|
+
ffi (1.11.1)
|
22
|
+
hashdiff (1.0.0)
|
23
|
+
maxitest (3.4.0)
|
24
|
+
minitest (>= 5.0.0, < 5.13.0)
|
24
25
|
method_source (0.9.2)
|
25
26
|
mini_portile2 (2.4.0)
|
26
|
-
minitest (5.
|
27
|
-
mongo (2.9.
|
27
|
+
minitest (5.12.2)
|
28
|
+
mongo (2.9.2)
|
28
29
|
bson (>= 4.4.2, < 5.0.0)
|
29
30
|
nokogiri (1.10.4)
|
30
31
|
mini_portile2 (~> 2.4.0)
|
@@ -32,20 +33,21 @@ GEM
|
|
32
33
|
coderay (~> 1.1.0)
|
33
34
|
method_source (~> 0.9.0)
|
34
35
|
public_suffix (3.1.0)
|
35
|
-
rack (2.0.7)
|
36
|
-
rainbow (3.0.0)
|
37
36
|
rake (10.5.0)
|
38
37
|
safe_yaml (1.0.5)
|
39
38
|
thor (0.20.3)
|
40
|
-
thread (0.2.
|
41
|
-
|
39
|
+
thread (0.2.2)
|
40
|
+
typhoeus (1.3.1)
|
41
|
+
ethon (>= 0.9.0)
|
42
|
+
webmock (3.7.6)
|
42
43
|
addressable (>= 2.3.6)
|
43
44
|
crack (>= 0.3.2)
|
44
|
-
hashdiff
|
45
|
-
wgit (0.
|
45
|
+
hashdiff (>= 0.4.0, < 2.0.0)
|
46
|
+
wgit (0.4.0)
|
46
47
|
addressable (~> 2.6.0)
|
47
48
|
mongo (~> 2.9.0)
|
48
49
|
nokogiri (~> 1.10.3)
|
50
|
+
typhoeus (~> 1.3.1)
|
49
51
|
|
50
52
|
PLATFORMS
|
51
53
|
ruby
|
@@ -54,12 +56,13 @@ DEPENDENCIES
|
|
54
56
|
broken_link_finder!
|
55
57
|
bundler (~> 2.0)
|
56
58
|
byebug (~> 11.0)
|
57
|
-
|
58
|
-
memory_profiler (~> 0.9)
|
59
|
-
minitest (~> 5.0)
|
59
|
+
maxitest (~> 3.3)
|
60
60
|
pry (~> 0.12)
|
61
61
|
rake (~> 10.0)
|
62
|
-
webmock (~> 3.
|
62
|
+
webmock (~> 3.6)
|
63
|
+
|
64
|
+
RUBY VERSION
|
65
|
+
ruby 2.5.3p105
|
63
66
|
|
64
67
|
BUNDLED WITH
|
65
68
|
2.0.1
|
data/README.md
CHANGED
@@ -19,9 +19,11 @@ In a nutshell, only HTTP(S) based links can be successfully verified by `broken_
|
|
19
19
|
|
20
20
|
See the [usage](#Usage) section below on how to check which links have been ignored during a crawl.
|
21
21
|
|
22
|
+
With that said, the usual array of HTTP URL features are supported including anchors/fragments, query strings and IRI's (non ASCII based URL's).
|
23
|
+
|
22
24
|
## Made Possible By
|
23
25
|
|
24
|
-
`broken_link_finder` relies heavily on the `wgit` Ruby gem. See its [repository](https://github.com/michaeltelford/wgit) for more details.
|
26
|
+
`broken_link_finder` relies heavily on the `wgit` Ruby gem by the same author. See its [repository](https://github.com/michaeltelford/wgit) for more details.
|
25
27
|
|
26
28
|
## Installation
|
27
29
|
|
data/benchmark.rb
CHANGED
data/bin/console
CHANGED
@@ -4,30 +4,20 @@
|
|
4
4
|
require 'bundler/setup'
|
5
5
|
require 'pry'
|
6
6
|
require 'byebug'
|
7
|
-
require 'logger'
|
8
|
-
require 'httplog'
|
9
7
|
require 'broken_link_finder'
|
10
8
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
config.log_headers = false
|
24
|
-
config.log_data = false
|
25
|
-
config.log_status = true
|
26
|
-
config.log_response = false
|
27
|
-
config.log_benchmark = false
|
28
|
-
|
29
|
-
config.compact_log = false
|
30
|
-
config.json_log = false
|
9
|
+
# Monkey patch and log all HTTP requests made during the console.
|
10
|
+
module Typhoeus
|
11
|
+
singleton_class.class_eval do
|
12
|
+
alias_method :orig_get, :get
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.get(base_url, options = {})
|
16
|
+
puts "[typhoeus] Sending GET: #{base_url}"
|
17
|
+
resp = orig_get(base_url, options)
|
18
|
+
puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
|
19
|
+
resp
|
20
|
+
end
|
31
21
|
end
|
32
22
|
|
33
23
|
# Call reload to load all recent code changes.
|
@@ -48,6 +38,7 @@ by_page = Finder.new
|
|
48
38
|
by_link = Finder.new sort: :link
|
49
39
|
finder = by_page
|
50
40
|
|
41
|
+
# Start the console.
|
51
42
|
puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
|
52
43
|
|
53
44
|
binding.pry
|
data/broken_link_finder.gemspec
CHANGED
@@ -39,14 +39,12 @@ Gem::Specification.new do |spec|
|
|
39
39
|
|
40
40
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
41
41
|
spec.add_development_dependency 'byebug', '~> 11.0'
|
42
|
-
spec.add_development_dependency '
|
43
|
-
spec.add_development_dependency 'memory_profiler', '~> 0.9'
|
44
|
-
spec.add_development_dependency 'minitest', '~> 5.0'
|
42
|
+
spec.add_development_dependency 'maxitest', '~> 3.3'
|
45
43
|
spec.add_development_dependency 'pry', '~> 0.12'
|
46
44
|
spec.add_development_dependency 'rake', '~> 10.0'
|
47
|
-
spec.add_development_dependency 'webmock', '~> 3.
|
45
|
+
spec.add_development_dependency 'webmock', '~> 3.6'
|
48
46
|
|
49
|
-
spec.add_runtime_dependency 'thor', '0.20.3'
|
50
|
-
spec.add_runtime_dependency 'thread', '0.2'
|
51
|
-
spec.add_runtime_dependency 'wgit', '0.
|
47
|
+
spec.add_runtime_dependency 'thor', '~> 0.20.3'
|
48
|
+
spec.add_runtime_dependency 'thread', '~> 0.2.0'
|
49
|
+
spec.add_runtime_dependency 'wgit', '~> 0.4.0'
|
52
50
|
end
|
@@ -39,7 +39,7 @@ module BrokenLinkFinder
|
|
39
39
|
|
40
40
|
# Finds broken links within a single page and appends them to the
|
41
41
|
# @broken_links array. Returns true if at least one broken link was found.
|
42
|
-
# Access the broken links with Finder#broken_links.
|
42
|
+
# Access the broken links afterwards with Finder#broken_links.
|
43
43
|
def crawl_url(url)
|
44
44
|
clear_links
|
45
45
|
|
@@ -61,7 +61,7 @@ module BrokenLinkFinder
|
|
61
61
|
# Finds broken links within an entire site and appends them to the
|
62
62
|
# @broken_links array. Returns a tuple containing a Boolean of true if
|
63
63
|
# at least one broken link was found and an Array of all pages crawled.
|
64
|
-
# Access the broken links with Finder#broken_links.
|
64
|
+
# Access the broken links afterwards with Finder#broken_links.
|
65
65
|
def crawl_site(url)
|
66
66
|
clear_links
|
67
67
|
|
@@ -70,7 +70,7 @@ module BrokenLinkFinder
|
|
70
70
|
crawled_pages = []
|
71
71
|
|
72
72
|
# Crawl the site's HTML web pages looking for links.
|
73
|
-
|
73
|
+
externals = @crawler.crawl_site(url) do |doc|
|
74
74
|
crawled_pages << doc.url
|
75
75
|
next unless doc
|
76
76
|
|
@@ -79,7 +79,7 @@ module BrokenLinkFinder
|
|
79
79
|
end
|
80
80
|
|
81
81
|
# Ensure the given website url is valid.
|
82
|
-
raise "Invalid or broken URL: #{url}"
|
82
|
+
raise "Invalid or broken URL: #{url}" unless externals
|
83
83
|
|
84
84
|
# Wait for all threads to finish.
|
85
85
|
pool.shutdown
|
@@ -113,15 +113,7 @@ module BrokenLinkFinder
|
|
113
113
|
|
114
114
|
# Finds which links are unsupported or broken and records the details.
|
115
115
|
def find_broken_links(doc)
|
116
|
-
|
117
|
-
links = doc.all_links
|
118
|
-
.reject do |link|
|
119
|
-
if link.is_absolute? && !link.start_with?('http')
|
120
|
-
append_ignored_link(doc.url, link)
|
121
|
-
true
|
122
|
-
end
|
123
|
-
end
|
124
|
-
.uniq
|
116
|
+
links = get_supported_links(doc)
|
125
117
|
|
126
118
|
# Iterate over the supported links checking if they're broken or not.
|
127
119
|
links.each do |link|
|
@@ -134,11 +126,10 @@ module BrokenLinkFinder
|
|
134
126
|
end
|
135
127
|
|
136
128
|
# The link hasn't been processed before so we crawl it.
|
137
|
-
|
138
|
-
link_doc = @crawler.crawl_url(link_url)
|
129
|
+
link_doc = crawl_link(doc, link)
|
139
130
|
|
140
131
|
# Determine if the crawled link is broken or not.
|
141
|
-
if @crawler.last_response.
|
132
|
+
if @crawler.last_response.code == 404 ||
|
142
133
|
link_doc.nil? ||
|
143
134
|
has_broken_anchor(link_doc)
|
144
135
|
append_broken_link(doc.url, link)
|
@@ -150,6 +141,24 @@ module BrokenLinkFinder
|
|
150
141
|
nil
|
151
142
|
end
|
152
143
|
|
144
|
+
# Report and reject any non supported links. Any link that is absolute and
|
145
|
+
# doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
|
146
|
+
def get_supported_links(doc)
|
147
|
+
doc.all_links
|
148
|
+
.reject do |link|
|
149
|
+
if link.is_absolute? && !link.start_with?('http')
|
150
|
+
append_ignored_link(doc.url, link)
|
151
|
+
true
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
# Makes the link absolute and crawls it, returning its Wgit::Document.
|
157
|
+
def crawl_link(doc, link)
|
158
|
+
link = get_absolute_link(doc, link)
|
159
|
+
@crawler.crawl_url(link)
|
160
|
+
end
|
161
|
+
|
153
162
|
# Returns the link in absolute form so it can be crawled.
|
154
163
|
def get_absolute_link(doc, link)
|
155
164
|
link.is_relative? ? doc.base_url(link: link).concat(link) : link
|
@@ -12,10 +12,10 @@ module BrokenLinkFinder
|
|
12
12
|
raise "sort by either :page or :link, not #{sort}" \
|
13
13
|
unless %i[page link].include?(sort)
|
14
14
|
|
15
|
-
@stream
|
16
|
-
@sort
|
17
|
-
@broken_links
|
18
|
-
@ignored_links
|
15
|
+
@stream = stream
|
16
|
+
@sort = sort
|
17
|
+
@broken_links = broken_links
|
18
|
+
@ignored_links = ignored_links
|
19
19
|
end
|
20
20
|
|
21
21
|
# Pretty print a report detailing the link summary.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -39,47 +39,19 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '11.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: maxitest
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '3.3'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: memory_profiler
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - "~>"
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0.9'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - "~>"
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0.9'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: minitest
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - "~>"
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '5.0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - "~>"
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '5.0'
|
54
|
+
version: '3.3'
|
83
55
|
- !ruby/object:Gem::Dependency
|
84
56
|
name: pry
|
85
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -114,56 +86,56 @@ dependencies:
|
|
114
86
|
requirements:
|
115
87
|
- - "~>"
|
116
88
|
- !ruby/object:Gem::Version
|
117
|
-
version: '3.
|
89
|
+
version: '3.6'
|
118
90
|
type: :development
|
119
91
|
prerelease: false
|
120
92
|
version_requirements: !ruby/object:Gem::Requirement
|
121
93
|
requirements:
|
122
94
|
- - "~>"
|
123
95
|
- !ruby/object:Gem::Version
|
124
|
-
version: '3.
|
96
|
+
version: '3.6'
|
125
97
|
- !ruby/object:Gem::Dependency
|
126
98
|
name: thor
|
127
99
|
requirement: !ruby/object:Gem::Requirement
|
128
100
|
requirements:
|
129
|
-
- -
|
101
|
+
- - "~>"
|
130
102
|
- !ruby/object:Gem::Version
|
131
103
|
version: 0.20.3
|
132
104
|
type: :runtime
|
133
105
|
prerelease: false
|
134
106
|
version_requirements: !ruby/object:Gem::Requirement
|
135
107
|
requirements:
|
136
|
-
- -
|
108
|
+
- - "~>"
|
137
109
|
- !ruby/object:Gem::Version
|
138
110
|
version: 0.20.3
|
139
111
|
- !ruby/object:Gem::Dependency
|
140
112
|
name: thread
|
141
113
|
requirement: !ruby/object:Gem::Requirement
|
142
114
|
requirements:
|
143
|
-
- -
|
115
|
+
- - "~>"
|
144
116
|
- !ruby/object:Gem::Version
|
145
|
-
version:
|
117
|
+
version: 0.2.0
|
146
118
|
type: :runtime
|
147
119
|
prerelease: false
|
148
120
|
version_requirements: !ruby/object:Gem::Requirement
|
149
121
|
requirements:
|
150
|
-
- -
|
122
|
+
- - "~>"
|
151
123
|
- !ruby/object:Gem::Version
|
152
|
-
version:
|
124
|
+
version: 0.2.0
|
153
125
|
- !ruby/object:Gem::Dependency
|
154
126
|
name: wgit
|
155
127
|
requirement: !ruby/object:Gem::Requirement
|
156
128
|
requirements:
|
157
|
-
- -
|
129
|
+
- - "~>"
|
158
130
|
- !ruby/object:Gem::Version
|
159
|
-
version: 0.
|
131
|
+
version: 0.4.0
|
160
132
|
type: :runtime
|
161
133
|
prerelease: false
|
162
134
|
version_requirements: !ruby/object:Gem::Requirement
|
163
135
|
requirements:
|
164
|
-
- -
|
136
|
+
- - "~>"
|
165
137
|
- !ruby/object:Gem::Version
|
166
|
-
version: 0.
|
138
|
+
version: 0.4.0
|
167
139
|
description: Finds a website's broken links using the 'wgit' gem and reports back
|
168
140
|
to you with a summary.
|
169
141
|
email: michael.telford@live.com
|