broken_link_finder 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6d7f59b84300909c1325badc759f20ae3e87026dc615e4f8c8a3a9e4c5e2a5b3
-  data.tar.gz: add7905653a1036b2b1e6939671bf59dbfef3718336b2943c023f95c9d18e5c1
+  metadata.gz: d3f285779c735d254089d0b757d78a5fe1e1a082e0cbe21f591f499f2da54ba6
+  data.tar.gz: 3f62093f5589eb15df77f2289c86f826c1d7750401713b4854d26686e44dd745
 SHA512:
-  metadata.gz: 85d406b55aa0431bb6c2a0b7a52f2cc6b3cfa526d11a2969f2339d72af8cbeb69fc7d0c3c17d7bc723a998c0aa650f120843b107d7a4030db515be74883f41d2
-  data.tar.gz: 194273bfdcd99eff722b83dd3a86748d5e8a5875c48db8f7281cdce7fd856e93d6e764fcd2ded030c9a4ed22898277e386b0b61a02282a79599269545d903bdb
+  metadata.gz: 05d14ed145636e0a711b19519370b66189b39854a032ac1131c222d34f2a304c82cbcaefb7e60ed976ed0bd94a4a0dc191d7c80f885d7fd9811fe9ed3a8aafcb
+  data.tar.gz: a0ff34531ec08bd8abe134e7214ac74e3cf90b0d84d57f0268395b8ce2f55a31f8e0b8efeace50fa594e1a5af8b1afb5302fbac4cf7820579c2150d4ad63133c
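The checksums above can be re-derived locally. A minimal sketch, assuming the package has first been fetched (e.g. `gem fetch broken_link_finder --version 0.9.0`) and that `gem_path` points at the resulting file:

```ruby
require 'digest'
require 'rubygems/package'

# Hypothetical local path to the fetched .gem file.
gem_path = 'broken_link_finder-0.9.0.gem'

# A .gem is a tar archive whose metadata.gz and data.tar.gz entries are
# exactly what the SHA256/SHA512 checksums above are computed over.
File.open(gem_path, 'rb') do |file|
  Gem::Package::TarReader.new(file) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end
end
```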
data/CHANGELOG.md CHANGED
@@ -9,6 +9,17 @@
 - ...
 ---

+## v0.9.0
+### Added
+- The `version` command to the executable.
+- The `--threads` aka `-t` option to the executable's `crawl` command, to control crawl speed vs. resource usage.
+### Changed/Removed
+- Changed the default maximum number of threads for a recursive crawl from 30 to 100. Users will see a speed boost, at the cost of increased resource usage. This is configurable via the new `crawl` command option, e.g. `--threads 30`.
+### Fixed
+- Several bugs, by updating the `wgit` dependency.
+- A bug in the report logic causing an incorrect link count.
+---
+
 ## v0.8.1
 ### Added
 - ...
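Taken together with the CLI changes further down, the new surface can be exercised like so (a usage sketch; `example.com` is a placeholder, and `--threads 30` restores the previous 0.8.1 default):

    $ broken_link_finder version
    $ broken_link_finder crawl -r --threads 30 example.com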
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.8.1)
+    broken_link_finder (0.9.0)
       thor (= 0.20.3)
       thread (= 0.2)
-      wgit (= 0.0.16)
+      wgit (= 0.0.17)

 GEM
   remote: https://rubygems.org/
@@ -42,7 +42,7 @@ GEM
     addressable (>= 2.3.6)
     crack (>= 0.3.2)
     hashdiff
-    wgit (0.0.16)
+    wgit (0.0.17)
       addressable (~> 2.6.0)
       mongo (~> 2.9.0)
       nokogiri (~> 1.10.3)
data/README.md CHANGED
@@ -39,6 +39,10 @@ Or install it yourself as:

     $ gem install broken_link_finder

+Finally, verify the installation with:
+
+    $ broken_link_finder version
+
 ## Usage

 You can check for broken links via the library or executable.
data/Rakefile CHANGED
@@ -15,6 +15,16 @@ task :help do
   system "bundle exec rake -D"
 end

+desc "Run the setup script"
+task :setup do
+  system "./bin/setup"
+end
+
+desc "Run the development console"
+task :console do
+  system "./bin/console"
+end
+
 desc "Compile all project Ruby files with warnings."
 task :compile do
   paths = Dir["**/*.rb", "**/*.gemspec", 'exe/broken_link_finder']
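The two new tasks simply shell out to the repository's existing bin scripts, so (assuming a standard Bundler setup) they run as:

    $ bundle exec rake setup
    $ bundle exec rake console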
@@ -18,6 +18,6 @@ puts "Links crawled: #{finder.total_links_crawled}"
 # Site: 9.732416
 # Multi-threading crawl_site now yields the same time as a single page

-# https://meos.ch/ site crawl - post all link recording functionality
+# Large site crawl - post all link recording functionality
 # Pre: 608 seconds with 7665 links crawled
 # Post: 355 seconds with 1099 links crawled
data/bin/console CHANGED
@@ -5,20 +5,29 @@ require "pry"
 require "byebug"
 require "broken_link_finder"
 require 'wgit/core_ext'
+require 'logger'
 require 'httplog'

+logger = Logger.new(STDOUT)
+logger.formatter = proc do |severity, datetime, progname, msg|
+  "#{msg}\n"
+end
+
 # Monkey patch all Net::HTTP network calls and log them.
 HttpLog.configure do |config|
+  config.enabled = true
+  config.logger = logger
+
   config.log_connect = false
   config.log_request = true
   config.log_headers = false
   config.log_data = false
   config.log_status = true
   config.log_response = false
-  config.log_benchmark = true
+  config.log_benchmark = false

-  config.compact_log = true
-  config.json_log = true
+  config.compact_log = false
+  config.json_log = false
 end

 # Call reload to load all recent code changes.
@@ -33,9 +42,12 @@ end

 # You can add fixtures and/or initialization code here...
 reload
+
 url = "http://txti.es/"
 by_page = Finder.new
 by_link = Finder.new sort: :link
 finder = by_page

+puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
+
 binding.pry
data/broken_link_finder.gemspec CHANGED
@@ -45,7 +45,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "httplog", "~> 1.3"
   spec.add_development_dependency "memory_profiler", "~> 0.9"

-  spec.add_runtime_dependency "wgit", "0.0.16"
+  spec.add_runtime_dependency "wgit", "0.0.17"
   spec.add_runtime_dependency "thread", "0.2"
   spec.add_runtime_dependency "thor", "0.20.3"
 end
data/exe/broken_link_finder CHANGED
@@ -6,18 +6,20 @@ require 'thor'

 class BrokenLinkFinderCLI < Thor
   desc 'crawl [URL]', 'Find broken links at the URL'
-  option :recursive, type: :boolean, aliases: [:r], desc: 'Crawl the entire site.'
-  option :sort_by_link, type: :boolean, aliases: [:l], desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
-  option :verbose, type: :boolean, aliases: [:v], desc: 'Display all ignored links.'
-  option :concise, type: :boolean, aliases: [:c], desc: 'Display only a summary of broken links.'
+  option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
+  option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+  option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
+  option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
+  option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
   def crawl(url)
     url = "http://#{url}" unless url.start_with?('http')

-    sort_by = options[:sort_by_link] ? :link : :page
-    broken_verbose = options[:concise] ? false : true
-    ignored_verbose = options[:verbose] ? true : false
+    sort_by         = options[:sort_by_link] ? :link : :page
+    max_threads     = options[:threads]
+    broken_verbose  = !options[:concise]
+    ignored_verbose = options[:verbose]

-    finder = BrokenLinkFinder::Finder.new(sort: sort_by)
+    finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
     finder.pretty_print_link_report(
       broken_verbose: broken_verbose,
@@ -26,6 +28,11 @@ class BrokenLinkFinderCLI < Thor
   rescue Exception => ex
     puts "An error has occurred: #{ex.message}"
   end
+
+  desc 'version', 'Display the currently installed version'
+  def version
+    puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+  end
 end

 BrokenLinkFinderCLI.start(ARGV)
data/lib/broken_link_finder/finder.rb CHANGED
@@ -4,19 +4,18 @@ require 'thread/pool'
 require 'set'

 module BrokenLinkFinder
-  # Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
-  # override the max_threads variable.
-  def self.new(sort: :page)
-    Finder.new(sort: sort)
+  DEFAULT_MAX_THREADS = 100.freeze
+
+  # Alias for BrokenLinkFinder::Finder.new.
+  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
+    Finder.new(sort: sort, max_threads: max_threads)
   end

   class Finder
-    DEFAULT_MAX_THREADS = 30.freeze
-
-    attr_reader :broken_links, :ignored_links, :total_links_crawled
+    attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled, :max_threads

     # Creates a new Finder instance.
-    def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
+    def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
       unless [:page, :link].include?(sort)
         raise "sort by either :page or :link, not #{sort}"
       end
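With this hunk, `max_threads` becomes part of the public constructor on both entry points. A minimal library-usage sketch against the 0.9.0 API shown above (the URL is a placeholder):

```ruby
require 'broken_link_finder'

# The module-level alias now forwards max_threads through to Finder.new.
finder = BrokenLinkFinder.new(sort: :page, max_threads: 50)

finder.crawl_page('http://example.com') # Check a single page's links.
finder.pretty_print_link_report         # Reports to STDOUT by default.
```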
@@ -44,10 +43,11 @@
     def crawl_url(url)
       clear_links

-      # Ensure the given page url is valid.
       url = Wgit::Url.new(url)
       doc = @crawler.crawl_url(url)
-      raise "Invalid URL: #{url}" unless doc
+
+      # Ensure the given page url is valid.
+      raise "Invalid or broken URL: #{url}" unless doc

       # Get all page links and determine which are broken.
       find_broken_links(doc)
@@ -70,23 +70,24 @@
       crawled_pages = []

       # Crawl the site's HTML web pages looking for links.
-      @crawler.crawl_site(url) do |doc|
-        # Ensure the given website url is valid.
-        raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
+      orig_doc = @crawler.crawl_site(url) do |doc|
         crawled_pages << doc.url
-
-        # Get all page links and determine which are broken.
         next unless doc
+
+        # Start a thread for each page, checking for broken links.
         pool.process { find_broken_links(doc) }
       end

+      # Ensure the given website url is valid.
+      raise "Invalid or broken URL: #{url}" if orig_doc.nil?
+
       # Wait for all threads to finish.
       pool.shutdown

       sort_links
       set_total_links_crawled

-      [@broken_links.any?, crawled_pages]
+      [@broken_links.any?, crawled_pages.uniq]
     end

     # Pretty prints the link report into a stream e.g. STDOUT or a file,
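Besides the relocated validation, `crawl_site` now deduplicates the crawled page URLs it returns. A sketch of consuming its return value, reusing the `finder` from the sketch above (placeholder URL):

```ruby
# Returns [whether any broken links were found, the unique pages crawled].
has_broken, crawled_pages = finder.crawl_site('http://example.com')

puts "Crawled #{crawled_pages.size} unique page(s)"
puts has_broken ? 'Broken links were found' : 'No broken links'
```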
@@ -133,7 +134,7 @@
         end

         # The link hasn't been processed before so we crawl it.
-        link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
+        link_url = get_absolute_link(doc, link)
         link_doc = @crawler.crawl_url(link_url)

         # Determine if the crawled link is broken or not.
@@ -149,6 +150,11 @@
       nil
     end

+    # Returns the link in absolute form so it can be crawled.
+    def get_absolute_link(doc, link)
+      link.is_relative? ? doc.base_url(link: link).concat(link) : link
+    end
+
     # Returns true if the link is/contains a broken anchor.
     def has_broken_anchor(doc)
       raise "link document is nil" unless doc
@@ -200,10 +206,13 @@

     # Sort keys and values alphabetically.
     def sort_links
-      @broken_links = @broken_links.sort_by { |k, v| k }.to_h
+      @broken_links.values.map { |v| v.uniq! }
+      @ignored_links.values.map { |v| v.uniq! }
+
+      @broken_links  = @broken_links.sort_by  { |k, v| k }.to_h
       @ignored_links = @ignored_links.sort_by { |k, v| k }.to_h

-      @broken_links.each { |k, v| v.sort! }
+      @broken_links.each  { |k, v| v.sort! }
       @ignored_links.each { |k, v| v.sort! }
     end

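This appears to be (part of) the fix behind the changelog's "incorrect link count" entry: each page's link list is now deduplicated before sorting. The same logic on plain data:

```ruby
broken_links = {
  'http://b.com/' => ['z', 'a', 'a'],
  'http://a.com/' => ['y', 'y']
}

# Mirrors sort_links above: dedupe values, sort keys, then sort each list.
broken_links.values.map { |v| v.uniq! }
broken_links = broken_links.sort_by { |k, v| k }.to_h
broken_links.each { |k, v| v.sort! }

broken_links # => { "http://a.com/" => ["y"], "http://b.com/" => ["a", "z"] }
```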
data/lib/broken_link_finder/reporter.rb CHANGED
@@ -11,16 +11,17 @@
         raise "sort by either :page or :link, not #{sort}"
       end

-      @stream = stream
-      @sort = sort
-      @broken_links = broken_links
-      @ignored_links = ignored_links
+      @stream        = stream
+      @sort          = sort
+      @broken_links  = broken_links
+      @ignored_links = ignored_links
     end

     # Pretty print a report detailing the link summary.
     def pretty_print_link_report(broken_verbose: true, ignored_verbose: false)
       report_broken_links(verbose: broken_verbose)
       report_ignored_links(verbose: ignored_verbose)
+      nil
     end

     private
data/lib/broken_link_finder/version.rb CHANGED
@@ -1,3 +1,3 @@
 module BrokenLinkFinder
-  VERSION = "0.8.1"
+  VERSION = "0.9.0"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: broken_link_finder
 version: !ruby/object:Gem::Version
-  version: 0.8.1
+  version: 0.9.0
 platform: ruby
 authors:
 - Michael Telford
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-08-21 00:00:00.000000000 Z
+date: 2019-08-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -128,14 +128,14 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.16
+        version: 0.0.17
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.16
+        version: 0.0.17
 - !ruby/object:Gem::Dependency
   name: thread
   requirement: !ruby/object:Gem::Requirement