broken_link_finder 0.8.1 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Gemfile.lock +3 -3
- data/README.md +4 -0
- data/Rakefile +10 -0
- data/benchmark.rb +1 -1
- data/bin/console +15 -3
- data/broken_link_finder.gemspec +1 -1
- data/exe/broken_link_finder +15 -8
- data/lib/broken_link_finder/finder.rb +28 -19
- data/lib/broken_link_finder/reporter.rb +5 -4
- data/lib/broken_link_finder/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d3f285779c735d254089d0b757d78a5fe1e1a082e0cbe21f591f499f2da54ba6
|
4
|
+
data.tar.gz: 3f62093f5589eb15df77f2289c86f826c1d7750401713b4854d26686e44dd745
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05d14ed145636e0a711b19519370b66189b39854a032ac1131c222d34f2a304c82cbcaefb7e60ed976ed0bd94a4a0dc191d7c80f885d7fd9811fe9ed3a8aafcb
|
7
|
+
data.tar.gz: a0ff34531ec08bd8abe134e7214ac74e3cf90b0d84d57f0268395b8ce2f55a31f8e0b8efeace50fa594e1a5af8b1afb5302fbac4cf7820579c2150d4ad63133c
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,17 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.9.0
|
13
|
+
### Added
|
14
|
+
- The `version` command to the executable.
|
15
|
+
- The `--threads` aka `-t` option to the executable's `crawl` command to control crawl speed vs. resource usage.
|
16
|
+
### Changed/Removed
|
17
|
+
- Changed the default number of maximum threads for a recursive crawl from 30 to 100. Users will see a speed boost with increased resource usage as a result. This is configurable using the new `crawl` command option e.g. `--threads 30`.
|
18
|
+
### Fixed
|
19
|
+
- Several bugs by updating the `wgit` dependency.
|
20
|
+
- A bug in the report logic causing an incorrect link count.
|
21
|
+
---
|
22
|
+
|
12
23
|
## v0.8.1
|
13
24
|
### Added
|
14
25
|
- ...
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.
|
4
|
+
broken_link_finder (0.9.0)
|
5
5
|
thor (= 0.20.3)
|
6
6
|
thread (= 0.2)
|
7
|
-
wgit (= 0.0.
|
7
|
+
wgit (= 0.0.17)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
@@ -42,7 +42,7 @@ GEM
|
|
42
42
|
addressable (>= 2.3.6)
|
43
43
|
crack (>= 0.3.2)
|
44
44
|
hashdiff
|
45
|
-
wgit (0.0.
|
45
|
+
wgit (0.0.17)
|
46
46
|
addressable (~> 2.6.0)
|
47
47
|
mongo (~> 2.9.0)
|
48
48
|
nokogiri (~> 1.10.3)
|
data/README.md
CHANGED
data/Rakefile
CHANGED
@@ -15,6 +15,16 @@ task :help do
|
|
15
15
|
system "bundle exec rake -D"
|
16
16
|
end
|
17
17
|
|
18
|
+
desc "Run the setup script"
|
19
|
+
task :setup do
|
20
|
+
system "./bin/setup"
|
21
|
+
end
|
22
|
+
|
23
|
+
desc "Run the development console"
|
24
|
+
task :console do
|
25
|
+
system "./bin/console"
|
26
|
+
end
|
27
|
+
|
18
28
|
desc "Compile all project Ruby files with warnings."
|
19
29
|
task :compile do
|
20
30
|
paths = Dir["**/*.rb", "**/*.gemspec", 'exe/broken_link_finder']
|
data/benchmark.rb
CHANGED
@@ -18,6 +18,6 @@ puts "Links crawled: #{finder.total_links_crawled}"
|
|
18
18
|
# Site: 9.732416
|
19
19
|
# Multi-threading crawl_site now yields the same time as a single page
|
20
20
|
|
21
|
-
#
|
21
|
+
# Large site crawl - post all link recording functionality
|
22
22
|
# Pre: 608 seconds with 7665 links crawled
|
23
23
|
# Post: 355 seconds with 1099 links crawled
|
data/bin/console
CHANGED
@@ -5,20 +5,29 @@ require "pry"
|
|
5
5
|
require "byebug"
|
6
6
|
require "broken_link_finder"
|
7
7
|
require 'wgit/core_ext'
|
8
|
+
require 'logger'
|
8
9
|
require 'httplog'
|
9
10
|
|
11
|
+
logger = Logger.new(STDOUT)
|
12
|
+
logger.formatter = proc do |severity, datetime, progname, msg|
|
13
|
+
"#{msg}\n"
|
14
|
+
end
|
15
|
+
|
10
16
|
# Monkey patch all Net:HTTP network calls and log them.
|
11
17
|
HttpLog.configure do |config|
|
18
|
+
config.enabled = true
|
19
|
+
config.logger = logger
|
20
|
+
|
12
21
|
config.log_connect = false
|
13
22
|
config.log_request = true
|
14
23
|
config.log_headers = false
|
15
24
|
config.log_data = false
|
16
25
|
config.log_status = true
|
17
26
|
config.log_response = false
|
18
|
-
config.log_benchmark =
|
27
|
+
config.log_benchmark = false
|
19
28
|
|
20
|
-
config.compact_log =
|
21
|
-
config.json_log =
|
29
|
+
config.compact_log = false
|
30
|
+
config.json_log = false
|
22
31
|
end
|
23
32
|
|
24
33
|
# Call reload to load all recent code changes.
|
@@ -33,9 +42,12 @@ end
|
|
33
42
|
|
34
43
|
# You can add fixtures and/or initialization code here...
|
35
44
|
reload
|
45
|
+
|
36
46
|
url = "http://txti.es/"
|
37
47
|
by_page = Finder.new
|
38
48
|
by_link = Finder.new sort: :link
|
39
49
|
finder = by_page
|
40
50
|
|
51
|
+
puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
|
52
|
+
|
41
53
|
binding.pry
|
data/broken_link_finder.gemspec
CHANGED
@@ -45,7 +45,7 @@ Gem::Specification.new do |spec|
|
|
45
45
|
spec.add_development_dependency "httplog", "~> 1.3"
|
46
46
|
spec.add_development_dependency "memory_profiler", "~> 0.9"
|
47
47
|
|
48
|
-
spec.add_runtime_dependency "wgit", "0.0.
|
48
|
+
spec.add_runtime_dependency "wgit", "0.0.17"
|
49
49
|
spec.add_runtime_dependency "thread", "0.2"
|
50
50
|
spec.add_runtime_dependency "thor", "0.20.3"
|
51
51
|
end
|
data/exe/broken_link_finder
CHANGED
@@ -6,18 +6,20 @@ require 'thor'
|
|
6
6
|
|
7
7
|
class BrokenLinkFinderCLI < Thor
|
8
8
|
desc 'crawl [URL]', 'Find broken links at the URL'
|
9
|
-
option :recursive, type: :boolean, aliases: [:r], desc: 'Crawl the entire site.'
|
10
|
-
option :
|
11
|
-
option :
|
12
|
-
option :
|
9
|
+
option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
|
10
|
+
option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
|
11
|
+
option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
|
12
|
+
option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
|
13
|
+
option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
|
13
14
|
def crawl(url)
|
14
15
|
url = "http://#{url}" unless url.start_with?('http')
|
15
16
|
|
16
|
-
sort_by = options[:sort_by_link]
|
17
|
-
|
18
|
-
|
17
|
+
sort_by = options[:sort_by_link] ? :link : :page
|
18
|
+
max_threads = options[:threads]
|
19
|
+
broken_verbose = !options[:concise]
|
20
|
+
ignored_verbose = options[:verbose]
|
19
21
|
|
20
|
-
finder = BrokenLinkFinder::Finder.new(sort: sort_by)
|
22
|
+
finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
|
21
23
|
options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
|
22
24
|
finder.pretty_print_link_report(
|
23
25
|
broken_verbose: broken_verbose,
|
@@ -26,6 +28,11 @@ class BrokenLinkFinderCLI < Thor
|
|
26
28
|
rescue Exception => ex
|
27
29
|
puts "An error has occurred: #{ex.message}"
|
28
30
|
end
|
31
|
+
|
32
|
+
desc 'version', 'Display the currently installed version'
|
33
|
+
def version
|
34
|
+
puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
|
35
|
+
end
|
29
36
|
end
|
30
37
|
|
31
38
|
BrokenLinkFinderCLI.start(ARGV)
|
@@ -4,19 +4,18 @@ require 'thread/pool'
|
|
4
4
|
require 'set'
|
5
5
|
|
6
6
|
module BrokenLinkFinder
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
DEFAULT_MAX_THREADS = 100.freeze
|
8
|
+
|
9
|
+
# Alias for BrokenLinkFinder::Finder.new.
|
10
|
+
def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
11
|
+
Finder.new(sort: sort, max_threads: max_threads)
|
11
12
|
end
|
12
13
|
|
13
14
|
class Finder
|
14
|
-
|
15
|
-
|
16
|
-
attr_reader :broken_links, :ignored_links, :total_links_crawled
|
15
|
+
attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled, :max_threads
|
17
16
|
|
18
17
|
# Creates a new Finder instance.
|
19
|
-
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
18
|
+
def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
|
20
19
|
unless [:page, :link].include?(sort)
|
21
20
|
raise "sort by either :page or :link, not #{sort}"
|
22
21
|
end
|
@@ -44,10 +43,11 @@ module BrokenLinkFinder
|
|
44
43
|
def crawl_url(url)
|
45
44
|
clear_links
|
46
45
|
|
47
|
-
# Ensure the given page url is valid.
|
48
46
|
url = Wgit::Url.new(url)
|
49
47
|
doc = @crawler.crawl_url(url)
|
50
|
-
|
48
|
+
|
49
|
+
# Ensure the given page url is valid.
|
50
|
+
raise "Invalid or broken URL: #{url}" unless doc
|
51
51
|
|
52
52
|
# Get all page links and determine which are broken.
|
53
53
|
find_broken_links(doc)
|
@@ -70,23 +70,24 @@ module BrokenLinkFinder
|
|
70
70
|
crawled_pages = []
|
71
71
|
|
72
72
|
# Crawl the site's HTML web pages looking for links.
|
73
|
-
@crawler.crawl_site(url) do |doc|
|
74
|
-
# Ensure the given website url is valid.
|
75
|
-
raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
|
73
|
+
orig_doc = @crawler.crawl_site(url) do |doc|
|
76
74
|
crawled_pages << doc.url
|
77
|
-
|
78
|
-
# Get all page links and determine which are broken.
|
79
75
|
next unless doc
|
76
|
+
|
77
|
+
# Start a thread for each page, checking for broken links.
|
80
78
|
pool.process { find_broken_links(doc) }
|
81
79
|
end
|
82
80
|
|
81
|
+
# Ensure the given website url is valid.
|
82
|
+
raise "Invalid or broken URL: #{url}" if orig_doc.nil?
|
83
|
+
|
83
84
|
# Wait for all threads to finish.
|
84
85
|
pool.shutdown
|
85
86
|
|
86
87
|
sort_links
|
87
88
|
set_total_links_crawled
|
88
89
|
|
89
|
-
[@broken_links.any?, crawled_pages]
|
90
|
+
[@broken_links.any?, crawled_pages.uniq]
|
90
91
|
end
|
91
92
|
|
92
93
|
# Pretty prints the link report into a stream e.g. STDOUT or a file,
|
@@ -133,7 +134,7 @@ module BrokenLinkFinder
|
|
133
134
|
end
|
134
135
|
|
135
136
|
# The link hasn't been processed before so we crawl it.
|
136
|
-
link_url =
|
137
|
+
link_url = get_absolute_link(doc, link)
|
137
138
|
link_doc = @crawler.crawl_url(link_url)
|
138
139
|
|
139
140
|
# Determine if the crawled link is broken or not.
|
@@ -149,6 +150,11 @@ module BrokenLinkFinder
|
|
149
150
|
nil
|
150
151
|
end
|
151
152
|
|
153
|
+
# Returns the link in absolute form so it can be crawled.
|
154
|
+
def get_absolute_link(doc, link)
|
155
|
+
link.is_relative? ? doc.base_url(link: link).concat(link) : link
|
156
|
+
end
|
157
|
+
|
152
158
|
# Returns true if the link is/contains a broken anchor.
|
153
159
|
def has_broken_anchor(doc)
|
154
160
|
raise "link document is nil" unless doc
|
@@ -200,10 +206,13 @@ module BrokenLinkFinder
|
|
200
206
|
|
201
207
|
# Sort keys and values alphabetically.
|
202
208
|
def sort_links
|
203
|
-
@broken_links
|
209
|
+
@broken_links.values.map { |v| v.uniq! }
|
210
|
+
@ignored_links.values.map { |v| v.uniq! }
|
211
|
+
|
212
|
+
@broken_links = @broken_links.sort_by { |k, v| k }.to_h
|
204
213
|
@ignored_links = @ignored_links.sort_by { |k, v| k }.to_h
|
205
214
|
|
206
|
-
@broken_links.each
|
215
|
+
@broken_links.each { |k, v| v.sort! }
|
207
216
|
@ignored_links.each { |k, v| v.sort! }
|
208
217
|
end
|
209
218
|
|
@@ -11,16 +11,17 @@ module BrokenLinkFinder
|
|
11
11
|
raise "sort by either :page or :link, not #{sort}"
|
12
12
|
end
|
13
13
|
|
14
|
-
@stream
|
15
|
-
@sort
|
16
|
-
@broken_links
|
17
|
-
@ignored_links
|
14
|
+
@stream = stream
|
15
|
+
@sort = sort
|
16
|
+
@broken_links = broken_links
|
17
|
+
@ignored_links = ignored_links
|
18
18
|
end
|
19
19
|
|
20
20
|
# Pretty print a report detailing the link summary.
|
21
21
|
def pretty_print_link_report(broken_verbose: true, ignored_verbose: false)
|
22
22
|
report_broken_links(verbose: broken_verbose)
|
23
23
|
report_ignored_links(verbose: ignored_verbose)
|
24
|
+
nil
|
24
25
|
end
|
25
26
|
|
26
27
|
private
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-08-
|
11
|
+
date: 2019-08-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -128,14 +128,14 @@ dependencies:
|
|
128
128
|
requirements:
|
129
129
|
- - '='
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: 0.0.
|
131
|
+
version: 0.0.17
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
136
|
- - '='
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: 0.0.
|
138
|
+
version: 0.0.17
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
140
|
name: thread
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|