broken_link_finder 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Gemfile.lock +3 -3
- data/README.md +4 -0
- data/Rakefile +10 -0
- data/benchmark.rb +1 -1
- data/bin/console +15 -3
- data/broken_link_finder.gemspec +1 -1
- data/exe/broken_link_finder +15 -8
- data/lib/broken_link_finder/finder.rb +28 -19
- data/lib/broken_link_finder/reporter.rb +5 -4
- data/lib/broken_link_finder/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d3f285779c735d254089d0b757d78a5fe1e1a082e0cbe21f591f499f2da54ba6
+  data.tar.gz: 3f62093f5589eb15df77f2289c86f826c1d7750401713b4854d26686e44dd745
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 05d14ed145636e0a711b19519370b66189b39854a032ac1131c222d34f2a304c82cbcaefb7e60ed976ed0bd94a4a0dc191d7c80f885d7fd9811fe9ed3a8aafcb
+  data.tar.gz: a0ff34531ec08bd8abe134e7214ac74e3cf90b0d84d57f0268395b8ce2f55a31f8e0b8efeace50fa594e1a5af8b1afb5302fbac4cf7820579c2150d4ad63133c
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,17 @@
 - ...
 ---
 
+## v0.9.0
+### Added
+- The `version` command to the executable.
+- The `--threads` aka `-t` option to the executable's `crawl` command to control crawl speed vs. resource usage.
+### Changed/Removed
+- Changed the default number of maximum threads for a recursive crawl from 30 to 100. Users will see a speed boost, with increased resource usage as a result. This is configurable using the new `crawl` command option, e.g. `--threads 30`.
+### Fixed
+- Several bugs by updating the `wgit` dependency.
+- A bug in the report logic causing an incorrect link count.
+---
+
 ## v0.8.1
 ### Added
 - ...
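For orientation, the `--threads` CLI option described above maps onto the `max_threads:` keyword added to `Finder.new` later in this diff. Below is a minimal library-level sketch of the same behaviour; the URL and the thread count of 30 are illustrative placeholders, not values taken from this release.

```ruby
require 'broken_link_finder'

# Roughly equivalent to `broken_link_finder crawl -r -t 30 example.com`:
# cap the recursive crawl at 30 threads (one thread per crawled page).
finder = BrokenLinkFinder::Finder.new(sort: :page, max_threads: 30)
finder.crawl_site('http://example.com')

# Print the report to STDOUT, mirroring the executable's defaults.
finder.pretty_print_link_report(broken_verbose: true, ignored_verbose: false)
puts finder.total_links_crawled
```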
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.8.1)
+    broken_link_finder (0.9.0)
       thor (= 0.20.3)
       thread (= 0.2)
-      wgit (= 0.0.
+      wgit (= 0.0.17)
 
 GEM
   remote: https://rubygems.org/
@@ -42,7 +42,7 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff
-    wgit (0.0.
+    wgit (0.0.17)
       addressable (~> 2.6.0)
       mongo (~> 2.9.0)
       nokogiri (~> 1.10.3)
data/README.md
CHANGED
data/Rakefile
CHANGED
@@ -15,6 +15,16 @@ task :help do
   system "bundle exec rake -D"
 end
 
+desc "Run the setup script"
+task :setup do
+  system "./bin/setup"
+end
+
+desc "Run the development console"
+task :console do
+  system "./bin/console"
+end
+
 desc "Compile all project Ruby files with warnings."
 task :compile do
   paths = Dir["**/*.rb", "**/*.gemspec", 'exe/broken_link_finder']
data/benchmark.rb
CHANGED
@@ -18,6 +18,6 @@ puts "Links crawled: #{finder.total_links_crawled}"
 # Site: 9.732416
 # Multi-threading crawl_site now yields the same time as a single page
 
-#
+# Large site crawl - post all link recording functionality
 # Pre: 608 seconds with 7665 links crawled
 # Post: 355 seconds with 1099 links crawled
data/bin/console
CHANGED
@@ -5,20 +5,29 @@ require "pry"
 require "byebug"
 require "broken_link_finder"
 require 'wgit/core_ext'
+require 'logger'
 require 'httplog'
 
+logger = Logger.new(STDOUT)
+logger.formatter = proc do |severity, datetime, progname, msg|
+  "#{msg}\n"
+end
+
 # Monkey patch all Net:HTTP network calls and log them.
 HttpLog.configure do |config|
+  config.enabled = true
+  config.logger = logger
+
   config.log_connect = false
   config.log_request = true
   config.log_headers = false
   config.log_data = false
   config.log_status = true
   config.log_response = false
-  config.log_benchmark =
+  config.log_benchmark = false
 
-  config.compact_log =
-  config.json_log =
+  config.compact_log = false
+  config.json_log = false
 end
 
 # Call reload to load all recent code changes.
@@ -33,9 +42,12 @@ end
 
 # You can add fixtures and/or initialization code here...
 reload
+
 url = "http://txti.es/"
 by_page = Finder.new
 by_link = Finder.new sort: :link
 finder = by_page
 
+puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
+
 binding.pry
data/broken_link_finder.gemspec
CHANGED
@@ -45,7 +45,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "httplog", "~> 1.3"
   spec.add_development_dependency "memory_profiler", "~> 0.9"
 
-  spec.add_runtime_dependency "wgit", "0.0.
+  spec.add_runtime_dependency "wgit", "0.0.17"
   spec.add_runtime_dependency "thread", "0.2"
   spec.add_runtime_dependency "thor", "0.20.3"
 end
data/exe/broken_link_finder
CHANGED
@@ -6,18 +6,20 @@ require 'thor'
 
 class BrokenLinkFinderCLI < Thor
   desc 'crawl [URL]', 'Find broken links at the URL'
-  option :recursive, type: :boolean, aliases: [:r], desc: 'Crawl the entire site.'
-  option :
-  option :
-  option :
+  option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
+  option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+  option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
+  option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
+  option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
   def crawl(url)
     url = "http://#{url}" unless url.start_with?('http')
 
-    sort_by = options[:sort_by_link]
-
-
+    sort_by = options[:sort_by_link] ? :link : :page
+    max_threads = options[:threads]
+    broken_verbose = !options[:concise]
+    ignored_verbose = options[:verbose]
 
-    finder = BrokenLinkFinder::Finder.new(sort: sort_by)
+    finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
     finder.pretty_print_link_report(
       broken_verbose: broken_verbose,
@@ -26,6 +28,11 @@ class BrokenLinkFinderCLI < Thor
   rescue Exception => ex
     puts "An error has occurred: #{ex.message}"
   end
+
+  desc 'version', 'Display the currently installed version'
+  def version
+    puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+  end
 end
 
 BrokenLinkFinderCLI.start(ARGV)
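For reference, the option parsing above follows the standard Thor pattern. The snippet below is a standalone sketch, not part of the gem, showing how a numeric `:threads` option with an alias and a default is declared and read back via `options`; the URL and thread count are placeholders.

```ruby
require 'thor'

# Standalone sketch of the Thor option pattern used by the executable above.
class DemoCLI < Thor
  desc 'crawl URL', 'Pretend to crawl URL'
  option :threads, type: :numeric, aliases: [:t], default: 100,
                   desc: 'Max number of crawl threads.'
  def crawl(url)
    puts "Would crawl #{url} with up to #{options[:threads]} threads"
  end
end

# Thor's .start accepts an explicit argv array as well as ARGV.
DemoCLI.start(%w[crawl http://example.com --threads 30])
```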
data/lib/broken_link_finder/finder.rb
CHANGED
@@ -4,19 +4,18 @@ require 'thread/pool'
 require 'set'
 
 module BrokenLinkFinder
-
-
-
-
+  DEFAULT_MAX_THREADS = 100.freeze
+
+  # Alias for BrokenLinkFinder::Finder.new.
+  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
+    Finder.new(sort: sort, max_threads: max_threads)
   end
 
   class Finder
-
-
-    attr_reader :broken_links, :ignored_links, :total_links_crawled
+    attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled, :max_threads
 
     # Creates a new Finder instance.
-    def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
+    def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
       unless [:page, :link].include?(sort)
         raise "sort by either :page or :link, not #{sort}"
       end
@@ -44,10 +43,11 @@
     def crawl_url(url)
       clear_links
 
-      # Ensure the given page url is valid.
       url = Wgit::Url.new(url)
       doc = @crawler.crawl_url(url)
-
+
+      # Ensure the given page url is valid.
+      raise "Invalid or broken URL: #{url}" unless doc
 
       # Get all page links and determine which are broken.
       find_broken_links(doc)
@@ -70,23 +70,24 @@
       crawled_pages = []
 
       # Crawl the site's HTML web pages looking for links.
-      @crawler.crawl_site(url) do |doc|
-        # Ensure the given website url is valid.
-        raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
+      orig_doc = @crawler.crawl_site(url) do |doc|
         crawled_pages << doc.url
-
-        # Get all page links and determine which are broken.
         next unless doc
+
+        # Start a thread for each page, checking for broken links.
         pool.process { find_broken_links(doc) }
       end
 
+      # Ensure the given website url is valid.
+      raise "Invalid or broken URL: #{url}" if orig_doc.nil?
+
       # Wait for all threads to finish.
       pool.shutdown
 
       sort_links
       set_total_links_crawled
 
-      [@broken_links.any?, crawled_pages]
+      [@broken_links.any?, crawled_pages.uniq]
     end
 
     # Pretty prints the link report into a stream e.g. STDOUT or a file,
@@ -133,7 +134,7 @@
       end
 
       # The link hasn't been processed before so we crawl it.
-      link_url =
+      link_url = get_absolute_link(doc, link)
       link_doc = @crawler.crawl_url(link_url)
 
       # Determine if the crawled link is broken or not.
@@ -149,6 +150,11 @@
       nil
     end
 
+    # Returns the link in absolute form so it can be crawled.
+    def get_absolute_link(doc, link)
+      link.is_relative? ? doc.base_url(link: link).concat(link) : link
+    end
+
     # Returns true if the link is/contains a broken anchor.
     def has_broken_anchor(doc)
       raise "link document is nil" unless doc
@@ -200,10 +206,13 @@
 
     # Sort keys and values alphabetically.
     def sort_links
-      @broken_links
+      @broken_links.values.map { |v| v.uniq! }
+      @ignored_links.values.map { |v| v.uniq! }
+
+      @broken_links = @broken_links.sort_by { |k, v| k }.to_h
       @ignored_links = @ignored_links.sort_by { |k, v| k }.to_h
 
-      @broken_links.each
+      @broken_links.each { |k, v| v.sort! }
       @ignored_links.each { |k, v| v.sort! }
     end
 
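As a usage note, the module-level `BrokenLinkFinder.new` alias and the revised `crawl_site` return value shown above can be exercised as follows (a sketch; the URL is a placeholder):

```ruby
require 'broken_link_finder'

# BrokenLinkFinder.new delegates to Finder.new with the same keyword arguments.
finder = BrokenLinkFinder.new(sort: :link, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)

# crawl_site now returns [broken_links_found?, crawled_pages.uniq], so the
# reported page count no longer includes duplicate URLs.
has_broken, crawled_pages = finder.crawl_site('http://example.com')
puts "Crawled #{crawled_pages.size} unique pages; broken links found: #{has_broken}"
```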
data/lib/broken_link_finder/reporter.rb
CHANGED
@@ -11,16 +11,17 @@ module BrokenLinkFinder
         raise "sort by either :page or :link, not #{sort}"
       end
 
-      @stream
-      @sort
-      @broken_links
-      @ignored_links
+      @stream = stream
+      @sort = sort
+      @broken_links = broken_links
+      @ignored_links = ignored_links
     end
 
     # Pretty print a report detailing the link summary.
     def pretty_print_link_report(broken_verbose: true, ignored_verbose: false)
       report_broken_links(verbose: broken_verbose)
       report_ignored_links(verbose: ignored_verbose)
+      nil
     end
 
     private
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: broken_link_finder
 version: !ruby/object:Gem::Version
-  version: 0.8.1
+  version: 0.9.0
 platform: ruby
 authors:
 - Michael Telford
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-08-
+date: 2019-08-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -128,14 +128,14 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.
+        version: 0.0.17
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.
+        version: 0.0.17
 - !ruby/object:Gem::Dependency
   name: thread
   requirement: !ruby/object:Gem::Requirement