broken_link_finder 0.7.0 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +6 -5
- data/benchmark.rb +13 -8
- data/broken_link_finder.gemspec +2 -2
- data/exe/broken_link_finder +2 -0
- data/lib/broken_link_finder/finder.rb +41 -9
- data/lib/broken_link_finder/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3e926c0997f5dcb805a9cdb4bf367237e3dff3284259395f6f38fa6b23a69172
|
4
|
+
data.tar.gz: 5c4020b8d1ae713a34593107580e82e3b3c06c75ec9a675e6cb536c07add1aca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c988b4c602441da10fcf602ca14e528ad8ce2e076e961659e10130c3393c4d2a0729a1c51e4611445f3d20c27402d65ae2cab8bb937e1009b4af1123462aaa01
|
7
|
+
data.tar.gz: 448fbc03310b0378a91e2a2bf3db95bd33cc4cd2fe51af36c55e0409e05482c5beed9783ee6465b3747d8e25f4afd3fdfa72c0604834ab2b0df070921605a183
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,15 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.8.0
|
13
|
+
### Added
|
14
|
+
- Logic to prevent re-crawling links for more efficiency.
|
15
|
+
### Changed/Removed
|
16
|
+
- Updated the `wgit` gem which fixes a bug in `crawl_site` and adds support for IRI's.
|
17
|
+
### Fixed
|
18
|
+
- Bug where an error from the executable wasn't being rescued.
|
19
|
+
---
|
20
|
+
|
12
21
|
## v0.7.0
|
13
22
|
### Added
|
14
23
|
- Added the `--verbose` flag to the executable for displaying all ignored links.
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.
|
4
|
+
broken_link_finder (0.8.0)
|
5
5
|
thor (= 0.20.3)
|
6
6
|
thread (= 0.2)
|
7
|
-
wgit (= 0.0.
|
7
|
+
wgit (= 0.0.15)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
@@ -24,7 +24,7 @@ GEM
|
|
24
24
|
method_source (0.9.2)
|
25
25
|
mini_portile2 (2.4.0)
|
26
26
|
minitest (5.11.3)
|
27
|
-
mongo (2.
|
27
|
+
mongo (2.9.1)
|
28
28
|
bson (>= 4.4.2, < 5.0.0)
|
29
29
|
nokogiri (1.10.4)
|
30
30
|
mini_portile2 (~> 2.4.0)
|
@@ -42,8 +42,9 @@ GEM
|
|
42
42
|
addressable (>= 2.3.6)
|
43
43
|
crack (>= 0.3.2)
|
44
44
|
hashdiff
|
45
|
-
wgit (0.0.
|
46
|
-
|
45
|
+
wgit (0.0.15)
|
46
|
+
addressable (~> 2.6.0)
|
47
|
+
mongo (~> 2.9.0)
|
47
48
|
nokogiri (~> 1.10.3)
|
48
49
|
|
49
50
|
PLATFORMS
|
data/benchmark.rb
CHANGED
@@ -1,18 +1,23 @@
|
|
1
|
-
|
1
|
+
require_relative './lib/broken_link_finder'
|
2
2
|
require 'benchmark'
|
3
3
|
require 'memory_profiler'
|
4
4
|
|
5
|
-
url = ARGV[0] ||
|
5
|
+
url = ARGV[0] || 'http://txti.es'
|
6
6
|
finder = BrokenLinkFinder::Finder.new
|
7
7
|
|
8
|
-
puts Benchmark.measure { finder.crawl_page url }
|
8
|
+
# puts Benchmark.measure { finder.crawl_page url }
|
9
9
|
puts Benchmark.measure { finder.crawl_site url }
|
10
|
+
puts "Links crawled: #{finder.total_links_crawled}"
|
10
11
|
|
11
|
-
# http://txti.es
|
12
|
-
# Pre threading: 17.
|
13
|
-
# Post threading: 7.
|
12
|
+
# http://txti.es page crawl
|
13
|
+
# Pre threading: 17.5 seconds
|
14
|
+
# Post threading: 7.5 seconds
|
14
15
|
|
15
|
-
# http://txti.es
|
16
|
+
# http://txti.es post threading - page vs site crawl
|
16
17
|
# Page: 9.526981
|
17
18
|
# Site: 9.732416
|
18
|
-
# Multi-threading crawl_site now yields the same time as a single page
|
19
|
+
# Multi-threading crawl_site now yields the same time as a single page
|
20
|
+
|
21
|
+
# https://meos.ch/ site crawl - post all link recording functionality
|
22
|
+
# Pre: 608 seconds with 7665 links crawled
|
23
|
+
# Post: 355 seconds with 1099 links crawled
|
data/broken_link_finder.gemspec
CHANGED
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.require_paths = ["lib"]
|
35
35
|
spec.post_install_message = "Added the executable 'broken_link_finder' to $PATH"
|
36
36
|
|
37
|
-
spec.required_ruby_version = '~> 2.5'
|
37
|
+
spec.required_ruby_version = '~> 2.5'
|
38
38
|
|
39
39
|
spec.add_development_dependency "bundler", "~> 2.0"
|
40
40
|
spec.add_development_dependency "rake", "~> 10.0"
|
@@ -45,7 +45,7 @@ Gem::Specification.new do |spec|
|
|
45
45
|
spec.add_development_dependency "httplog", "~> 1.3"
|
46
46
|
spec.add_development_dependency "memory_profiler", "~> 0.9"
|
47
47
|
|
48
|
-
spec.add_runtime_dependency "wgit", "0.0.
|
48
|
+
spec.add_runtime_dependency "wgit", "0.0.15"
|
49
49
|
spec.add_runtime_dependency "thread", "0.2"
|
50
50
|
spec.add_runtime_dependency "thor", "0.20.3"
|
51
51
|
end
|
data/exe/broken_link_finder
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require_relative 'reporter'
|
2
2
|
require 'wgit'
|
3
3
|
require 'thread/pool'
|
4
|
+
require 'set'
|
4
5
|
|
5
6
|
module BrokenLinkFinder
|
6
7
|
# Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
|
@@ -12,7 +13,7 @@ module BrokenLinkFinder
|
|
12
13
|
class Finder
|
13
14
|
DEFAULT_MAX_THREADS = 30.freeze
|
14
15
|
|
15
|
-
attr_reader :broken_links, :ignored_links
|
16
|
+
attr_reader :broken_links, :ignored_links, :total_links_crawled
|
16
17
|
|
17
18
|
# Creates a new Finder instance.
|
18
19
|
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
@@ -20,18 +21,21 @@ module BrokenLinkFinder
|
|
20
21
|
raise "sort by either :page or :link, not #{sort}"
|
21
22
|
end
|
22
23
|
|
23
|
-
@sort
|
24
|
+
@sort = sort
|
24
25
|
@max_threads = max_threads
|
25
|
-
@lock
|
26
|
-
@crawler
|
26
|
+
@lock = Mutex.new
|
27
|
+
@crawler = Wgit::Crawler.new
|
27
28
|
|
28
29
|
clear_links
|
29
30
|
end
|
30
31
|
|
31
32
|
# Clear/empty the link collection Hashes.
|
32
33
|
def clear_links
|
33
|
-
@broken_links
|
34
|
-
@ignored_links
|
34
|
+
@broken_links = {}
|
35
|
+
@ignored_links = {}
|
36
|
+
@total_links_crawled = 0
|
37
|
+
@all_broken_links = Set.new
|
38
|
+
@all_intact_links = Set.new
|
35
39
|
end
|
36
40
|
|
37
41
|
# Finds broken links within a single page and appends them to the
|
@@ -39,9 +43,9 @@ module BrokenLinkFinder
|
|
39
43
|
# Access the broken links with Finder#broken_links.
|
40
44
|
def crawl_url(url)
|
41
45
|
clear_links
|
42
|
-
url = Wgit::Url.new(url)
|
43
46
|
|
44
47
|
# Ensure the given page url is valid.
|
48
|
+
url = Wgit::Url.new(url)
|
45
49
|
doc = @crawler.crawl_url(url)
|
46
50
|
raise "Invalid URL: #{url}" unless doc
|
47
51
|
|
@@ -49,6 +53,8 @@ module BrokenLinkFinder
|
|
49
53
|
find_broken_links(doc)
|
50
54
|
|
51
55
|
sort_links
|
56
|
+
set_total_links_crawled
|
57
|
+
|
52
58
|
@broken_links.any?
|
53
59
|
end
|
54
60
|
|
@@ -58,6 +64,7 @@ module BrokenLinkFinder
|
|
58
64
|
# Access the broken links with Finder#broken_links.
|
59
65
|
def crawl_site(url)
|
60
66
|
clear_links
|
67
|
+
|
61
68
|
url = Wgit::Url.new(url)
|
62
69
|
pool = Thread.pool(@max_threads)
|
63
70
|
crawled_pages = []
|
@@ -73,8 +80,12 @@ module BrokenLinkFinder
|
|
73
80
|
pool.process { find_broken_links(doc) }
|
74
81
|
end
|
75
82
|
|
76
|
-
|
83
|
+
# Wait for all threads to finish.
|
84
|
+
pool.shutdown
|
85
|
+
|
77
86
|
sort_links
|
87
|
+
set_total_links_crawled
|
88
|
+
|
78
89
|
[@broken_links.any?, crawled_pages]
|
79
90
|
end
|
80
91
|
|
@@ -101,7 +112,7 @@ module BrokenLinkFinder
|
|
101
112
|
|
102
113
|
# Finds which links are unsupported or broken and records the details.
|
103
114
|
def find_broken_links(doc)
|
104
|
-
#
|
115
|
+
# Report and reject any non supported links.
|
105
116
|
links = doc.all_links.
|
106
117
|
reject do |link|
|
107
118
|
if !link.is_relative? and !link.start_with?('http')
|
@@ -113,13 +124,25 @@ module BrokenLinkFinder
|
|
113
124
|
|
114
125
|
# Iterate over the supported links checking if they're broken or not.
|
115
126
|
links.each do |link|
|
127
|
+
# Check if the link has already been processed previously.
|
128
|
+
next if @all_intact_links.include?(link)
|
129
|
+
|
130
|
+
if @all_broken_links.include?(link)
|
131
|
+
append_broken_link(doc.url, link)
|
132
|
+
next
|
133
|
+
end
|
134
|
+
|
135
|
+
# The link hasn't been processed before so we crawl it.
|
116
136
|
link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
|
117
137
|
link_doc = @crawler.crawl_url(link_url)
|
118
138
|
|
139
|
+
# Determine if the crawled link is broken or not.
|
119
140
|
if @crawler.last_response.is_a?(Net::HTTPNotFound) or
|
120
141
|
link_doc.nil? or
|
121
142
|
has_broken_anchor(link_doc)
|
122
143
|
append_broken_link(doc.url, link)
|
144
|
+
else
|
145
|
+
@lock.synchronize { @all_intact_links << link }
|
123
146
|
end
|
124
147
|
end
|
125
148
|
|
@@ -140,17 +163,21 @@ module BrokenLinkFinder
|
|
140
163
|
# Append key => [value] to @broken_links.
|
141
164
|
def append_broken_link(url, link)
|
142
165
|
key, value = get_key_value(url, link)
|
166
|
+
|
143
167
|
@lock.synchronize do
|
144
168
|
unless @broken_links[key]
|
145
169
|
@broken_links[key] = []
|
146
170
|
end
|
147
171
|
@broken_links[key] << value
|
172
|
+
|
173
|
+
@all_broken_links << link
|
148
174
|
end
|
149
175
|
end
|
150
176
|
|
151
177
|
# Append key => [value] to @ignored_links.
|
152
178
|
def append_ignored_link(url, link)
|
153
179
|
key, value = get_key_value(url, link)
|
180
|
+
|
154
181
|
@lock.synchronize do
|
155
182
|
unless @ignored_links[key]
|
156
183
|
@ignored_links[key] = []
|
@@ -180,6 +207,11 @@ module BrokenLinkFinder
|
|
180
207
|
@ignored_links.each { |k, v| v.sort! }
|
181
208
|
end
|
182
209
|
|
210
|
+
# Sets and returns the total number of links crawled.
|
211
|
+
def set_total_links_crawled
|
212
|
+
@total_links_crawled = @all_broken_links.size + @all_intact_links.size
|
213
|
+
end
|
214
|
+
|
183
215
|
alias_method :crawl_page, :crawl_url
|
184
216
|
alias_method :pretty_print_link_summary, :pretty_print_link_report
|
185
217
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-08-
|
11
|
+
date: 2019-08-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -128,14 +128,14 @@ dependencies:
|
|
128
128
|
requirements:
|
129
129
|
- - '='
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: 0.0.
|
131
|
+
version: 0.0.15
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
136
|
- - '='
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: 0.0.
|
138
|
+
version: 0.0.15
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
140
|
name: thread
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|