broken_link_finder 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +6 -5
- data/benchmark.rb +13 -8
- data/broken_link_finder.gemspec +2 -2
- data/exe/broken_link_finder +2 -0
- data/lib/broken_link_finder/finder.rb +41 -9
- data/lib/broken_link_finder/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3e926c0997f5dcb805a9cdb4bf367237e3dff3284259395f6f38fa6b23a69172
|
4
|
+
data.tar.gz: 5c4020b8d1ae713a34593107580e82e3b3c06c75ec9a675e6cb536c07add1aca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c988b4c602441da10fcf602ca14e528ad8ce2e076e961659e10130c3393c4d2a0729a1c51e4611445f3d20c27402d65ae2cab8bb937e1009b4af1123462aaa01
|
7
|
+
data.tar.gz: 448fbc03310b0378a91e2a2bf3db95bd33cc4cd2fe51af36c55e0409e05482c5beed9783ee6465b3747d8e25f4afd3fdfa72c0604834ab2b0df070921605a183
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,15 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.8.0
|
13
|
+
### Added
|
14
|
+
- Logic to prevent re-crawling links for more efficiency.
|
15
|
+
### Changed/Removed
|
16
|
+
- Updated the `wgit` gem which fixes a bug in `crawl_site` and adds support for IRI's.
|
17
|
+
### Fixed
|
18
|
+
- Bug where an error from the executable wasn't being rescued.
|
19
|
+
---
|
20
|
+
|
12
21
|
## v0.7.0
|
13
22
|
### Added
|
14
23
|
- Added the `--verbose` flag to the executable for displaying all ignored links.
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.7.0)
|
4
|
+
broken_link_finder (0.8.0)
|
5
5
|
thor (= 0.20.3)
|
6
6
|
thread (= 0.2)
|
7
|
-
wgit (= 0.0.
|
7
|
+
wgit (= 0.0.15)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
@@ -24,7 +24,7 @@ GEM
|
|
24
24
|
method_source (0.9.2)
|
25
25
|
mini_portile2 (2.4.0)
|
26
26
|
minitest (5.11.3)
|
27
|
-
mongo (2.
|
27
|
+
mongo (2.9.1)
|
28
28
|
bson (>= 4.4.2, < 5.0.0)
|
29
29
|
nokogiri (1.10.4)
|
30
30
|
mini_portile2 (~> 2.4.0)
|
@@ -42,8 +42,9 @@ GEM
|
|
42
42
|
addressable (>= 2.3.6)
|
43
43
|
crack (>= 0.3.2)
|
44
44
|
hashdiff
|
45
|
-
wgit (0.0.
|
46
|
-
|
45
|
+
wgit (0.0.15)
|
46
|
+
addressable (~> 2.6.0)
|
47
|
+
mongo (~> 2.9.0)
|
47
48
|
nokogiri (~> 1.10.3)
|
48
49
|
|
49
50
|
PLATFORMS
|
data/benchmark.rb
CHANGED
@@ -1,18 +1,23 @@
|
|
1
|
-
|
1
|
+
require_relative './lib/broken_link_finder'
|
2
2
|
require 'benchmark'
|
3
3
|
require 'memory_profiler'
|
4
4
|
|
5
|
-
url = ARGV[0] ||
|
5
|
+
url = ARGV[0] || 'http://txti.es'
|
6
6
|
finder = BrokenLinkFinder::Finder.new
|
7
7
|
|
8
|
-
puts Benchmark.measure { finder.crawl_page url }
|
8
|
+
# puts Benchmark.measure { finder.crawl_page url }
|
9
9
|
puts Benchmark.measure { finder.crawl_site url }
|
10
|
+
puts "Links crawled: #{finder.total_links_crawled}"
|
10
11
|
|
11
|
-
# http://txti.es
|
12
|
-
# Pre threading: 17.
|
13
|
-
# Post threading: 7.
|
12
|
+
# http://txti.es page crawl
|
13
|
+
# Pre threading: 17.5 seconds
|
14
|
+
# Post threading: 7.5 seconds
|
14
15
|
|
15
|
-
# http://txti.es
|
16
|
+
# http://txti.es post threading - page vs site crawl
|
16
17
|
# Page: 9.526981
|
17
18
|
# Site: 9.732416
|
18
|
-
# Multi-threading crawl_site now yields the same time as a single page
|
19
|
+
# Multi-threading crawl_site now yields the same time as a single page
|
20
|
+
|
21
|
+
# https://meos.ch/ site crawl - post all link recording functionality
|
22
|
+
# Pre: 608 seconds with 7665 links crawled
|
23
|
+
# Post: 355 seconds with 1099 links crawled
|
data/broken_link_finder.gemspec
CHANGED
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.require_paths = ["lib"]
|
35
35
|
spec.post_install_message = "Added the executable 'broken_link_finder' to $PATH"
|
36
36
|
|
37
|
-
spec.required_ruby_version = '~> 2.5'
|
37
|
+
spec.required_ruby_version = '~> 2.5'
|
38
38
|
|
39
39
|
spec.add_development_dependency "bundler", "~> 2.0"
|
40
40
|
spec.add_development_dependency "rake", "~> 10.0"
|
@@ -45,7 +45,7 @@ Gem::Specification.new do |spec|
|
|
45
45
|
spec.add_development_dependency "httplog", "~> 1.3"
|
46
46
|
spec.add_development_dependency "memory_profiler", "~> 0.9"
|
47
47
|
|
48
|
-
spec.add_runtime_dependency "wgit", "0.0.
|
48
|
+
spec.add_runtime_dependency "wgit", "0.0.15"
|
49
49
|
spec.add_runtime_dependency "thread", "0.2"
|
50
50
|
spec.add_runtime_dependency "thor", "0.20.3"
|
51
51
|
end
|
data/lib/broken_link_finder/finder.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require_relative 'reporter'
|
2
2
|
require 'wgit'
|
3
3
|
require 'thread/pool'
|
4
|
+
require 'set'
|
4
5
|
|
5
6
|
module BrokenLinkFinder
|
6
7
|
# Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
|
@@ -12,7 +13,7 @@ module BrokenLinkFinder
|
|
12
13
|
class Finder
|
13
14
|
DEFAULT_MAX_THREADS = 30.freeze
|
14
15
|
|
15
|
-
attr_reader :broken_links, :ignored_links
|
16
|
+
attr_reader :broken_links, :ignored_links, :total_links_crawled
|
16
17
|
|
17
18
|
# Creates a new Finder instance.
|
18
19
|
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
@@ -20,18 +21,21 @@ module BrokenLinkFinder
|
|
20
21
|
raise "sort by either :page or :link, not #{sort}"
|
21
22
|
end
|
22
23
|
|
23
|
-
@sort
|
24
|
+
@sort = sort
|
24
25
|
@max_threads = max_threads
|
25
|
-
@lock
|
26
|
-
@crawler
|
26
|
+
@lock = Mutex.new
|
27
|
+
@crawler = Wgit::Crawler.new
|
27
28
|
|
28
29
|
clear_links
|
29
30
|
end
|
30
31
|
|
31
32
|
# Clear/empty the link collection Hashes.
|
32
33
|
def clear_links
|
33
|
-
@broken_links
|
34
|
-
@ignored_links
|
34
|
+
@broken_links = {}
|
35
|
+
@ignored_links = {}
|
36
|
+
@total_links_crawled = 0
|
37
|
+
@all_broken_links = Set.new
|
38
|
+
@all_intact_links = Set.new
|
35
39
|
end
|
36
40
|
|
37
41
|
# Finds broken links within a single page and appends them to the
|
@@ -39,9 +43,9 @@ module BrokenLinkFinder
|
|
39
43
|
# Access the broken links with Finder#broken_links.
|
40
44
|
def crawl_url(url)
|
41
45
|
clear_links
|
42
|
-
url = Wgit::Url.new(url)
|
43
46
|
|
44
47
|
# Ensure the given page url is valid.
|
48
|
+
url = Wgit::Url.new(url)
|
45
49
|
doc = @crawler.crawl_url(url)
|
46
50
|
raise "Invalid URL: #{url}" unless doc
|
47
51
|
|
@@ -49,6 +53,8 @@ module BrokenLinkFinder
|
|
49
53
|
find_broken_links(doc)
|
50
54
|
|
51
55
|
sort_links
|
56
|
+
set_total_links_crawled
|
57
|
+
|
52
58
|
@broken_links.any?
|
53
59
|
end
|
54
60
|
|
@@ -58,6 +64,7 @@ module BrokenLinkFinder
|
|
58
64
|
# Access the broken links with Finder#broken_links.
|
59
65
|
def crawl_site(url)
|
60
66
|
clear_links
|
67
|
+
|
61
68
|
url = Wgit::Url.new(url)
|
62
69
|
pool = Thread.pool(@max_threads)
|
63
70
|
crawled_pages = []
|
@@ -73,8 +80,12 @@ module BrokenLinkFinder
|
|
73
80
|
pool.process { find_broken_links(doc) }
|
74
81
|
end
|
75
82
|
|
76
|
-
|
83
|
+
# Wait for all threads to finish.
|
84
|
+
pool.shutdown
|
85
|
+
|
77
86
|
sort_links
|
87
|
+
set_total_links_crawled
|
88
|
+
|
78
89
|
[@broken_links.any?, crawled_pages]
|
79
90
|
end
|
80
91
|
|
@@ -101,7 +112,7 @@ module BrokenLinkFinder
|
|
101
112
|
|
102
113
|
# Finds which links are unsupported or broken and records the details.
|
103
114
|
def find_broken_links(doc)
|
104
|
-
#
|
115
|
+
# Report and reject any non supported links.
|
105
116
|
links = doc.all_links.
|
106
117
|
reject do |link|
|
107
118
|
if !link.is_relative? and !link.start_with?('http')
|
@@ -113,13 +124,25 @@ module BrokenLinkFinder
|
|
113
124
|
|
114
125
|
# Iterate over the supported links checking if they're broken or not.
|
115
126
|
links.each do |link|
|
127
|
+
# Check if the link has already been processed previously.
|
128
|
+
next if @all_intact_links.include?(link)
|
129
|
+
|
130
|
+
if @all_broken_links.include?(link)
|
131
|
+
append_broken_link(doc.url, link)
|
132
|
+
next
|
133
|
+
end
|
134
|
+
|
135
|
+
# The link hasn't been processed before so we crawl it.
|
116
136
|
link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
|
117
137
|
link_doc = @crawler.crawl_url(link_url)
|
118
138
|
|
139
|
+
# Determine if the crawled link is broken or not.
|
119
140
|
if @crawler.last_response.is_a?(Net::HTTPNotFound) or
|
120
141
|
link_doc.nil? or
|
121
142
|
has_broken_anchor(link_doc)
|
122
143
|
append_broken_link(doc.url, link)
|
144
|
+
else
|
145
|
+
@lock.synchronize { @all_intact_links << link }
|
123
146
|
end
|
124
147
|
end
|
125
148
|
|
@@ -140,17 +163,21 @@ module BrokenLinkFinder
|
|
140
163
|
# Append key => [value] to @broken_links.
|
141
164
|
def append_broken_link(url, link)
|
142
165
|
key, value = get_key_value(url, link)
|
166
|
+
|
143
167
|
@lock.synchronize do
|
144
168
|
unless @broken_links[key]
|
145
169
|
@broken_links[key] = []
|
146
170
|
end
|
147
171
|
@broken_links[key] << value
|
172
|
+
|
173
|
+
@all_broken_links << link
|
148
174
|
end
|
149
175
|
end
|
150
176
|
|
151
177
|
# Append key => [value] to @ignored_links.
|
152
178
|
def append_ignored_link(url, link)
|
153
179
|
key, value = get_key_value(url, link)
|
180
|
+
|
154
181
|
@lock.synchronize do
|
155
182
|
unless @ignored_links[key]
|
156
183
|
@ignored_links[key] = []
|
@@ -180,6 +207,11 @@ module BrokenLinkFinder
|
|
180
207
|
@ignored_links.each { |k, v| v.sort! }
|
181
208
|
end
|
182
209
|
|
210
|
+
# Sets and returns the total number of links crawled.
|
211
|
+
def set_total_links_crawled
|
212
|
+
@total_links_crawled = @all_broken_links.size + @all_intact_links.size
|
213
|
+
end
|
214
|
+
|
183
215
|
alias_method :crawl_page, :crawl_url
|
184
216
|
alias_method :pretty_print_link_summary, :pretty_print_link_report
|
185
217
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.0
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-08-
|
11
|
+
date: 2019-08-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -128,14 +128,14 @@ dependencies:
|
|
128
128
|
requirements:
|
129
129
|
- - '='
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: 0.0.
|
131
|
+
version: 0.0.15
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
136
|
- - '='
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: 0.0.
|
138
|
+
version: 0.0.15
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
140
|
name: thread
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|