broken_link_finder 0.8.1 → 0.9.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 6d7f59b84300909c1325badc759f20ae3e87026dc615e4f8c8a3a9e4c5e2a5b3
-   data.tar.gz: add7905653a1036b2b1e6939671bf59dbfef3718336b2943c023f95c9d18e5c1
+   metadata.gz: d3f285779c735d254089d0b757d78a5fe1e1a082e0cbe21f591f499f2da54ba6
+   data.tar.gz: 3f62093f5589eb15df77f2289c86f826c1d7750401713b4854d26686e44dd745
  SHA512:
-   metadata.gz: 85d406b55aa0431bb6c2a0b7a52f2cc6b3cfa526d11a2969f2339d72af8cbeb69fc7d0c3c17d7bc723a998c0aa650f120843b107d7a4030db515be74883f41d2
-   data.tar.gz: 194273bfdcd99eff722b83dd3a86748d5e8a5875c48db8f7281cdce7fd856e93d6e764fcd2ded030c9a4ed22898277e386b0b61a02282a79599269545d903bdb
+   metadata.gz: 05d14ed145636e0a711b19519370b66189b39854a032ac1131c222d34f2a304c82cbcaefb7e60ed976ed0bd94a4a0dc191d7c80f885d7fd9811fe9ed3a8aafcb
+   data.tar.gz: a0ff34531ec08bd8abe134e7214ac74e3cf90b0d84d57f0268395b8ce2f55a31f8e0b8efeace50fa594e1a5af8b1afb5302fbac4cf7820579c2150d4ad63133c
data/CHANGELOG.md CHANGED
@@ -9,6 +9,17 @@
  - ...
  ---
 
+ ## v0.9.0
+ ### Added
+ - The `version` command to the executable.
+ - The `--threads` (aka `-t`) option to the executable's `crawl` command, to control the trade-off between crawl speed and resource usage.
+ ### Changed/Removed
+ - Changed the default maximum number of threads for a recursive crawl from 30 to 100. As a result, crawls are faster but use more resources. This is configurable via the new `crawl` command option e.g. `--threads 30`.
+ ### Fixed
+ - Several bugs by updating the `wgit` dependency.
+ - A bug in the report logic causing an incorrect link count.
+ ---
+
  ## v0.8.1
  ### Added
  - ...
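
For reference, the new executable features above read as follows on the command line (`example.com` being a placeholder URL):

    $ broken_link_finder version
    $ broken_link_finder crawl --recursive --threads 50 example.com
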
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
  PATH
    remote: .
    specs:
-     broken_link_finder (0.8.1)
+     broken_link_finder (0.9.0)
        thor (= 0.20.3)
        thread (= 0.2)
-       wgit (= 0.0.16)
+       wgit (= 0.0.17)
 
  GEM
    remote: https://rubygems.org/
@@ -42,7 +42,7 @@ GEM
        addressable (>= 2.3.6)
        crack (>= 0.3.2)
        hashdiff
-     wgit (0.0.16)
+     wgit (0.0.17)
        addressable (~> 2.6.0)
        mongo (~> 2.9.0)
        nokogiri (~> 1.10.3)
data/README.md CHANGED
@@ -39,6 +39,10 @@ Or install it yourself as:
 
      $ gem install broken_link_finder
 
+ Finally, verify the installation with:
+
+     $ broken_link_finder version
+
  ## Usage
 
  You can check for broken links via the library or executable.
data/Rakefile CHANGED
@@ -15,6 +15,16 @@ task :help do
    system "bundle exec rake -D"
  end
 
+ desc "Run the setup script"
+ task :setup do
+   system "./bin/setup"
+ end
+
+ desc "Run the development console"
+ task :console do
+   system "./bin/console"
+ end
+
  desc "Compile all project Ruby files with warnings."
  task :compile do
    paths = Dir["**/*.rb", "**/*.gemspec", 'exe/broken_link_finder']
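
The new tasks are invoked like any other rake task:

    $ bundle exec rake setup
    $ bundle exec rake console
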
@@ -18,6 +18,6 @@ puts "Links crawled: #{finder.total_links_crawled}"
  # Site: 9.732416
  # Multi-threading crawl_site now yields the same time as a single page
 
- # https://meos.ch/ site crawl - post all link recording functionality
+ # Large site crawl - post all link recording functionality
  # Pre: 608 seconds with 7665 links crawled
  # Post: 355 seconds with 1099 links crawled
data/bin/console CHANGED
@@ -5,20 +5,29 @@ require "pry"
  require "byebug"
  require "broken_link_finder"
  require 'wgit/core_ext'
+ require 'logger'
  require 'httplog'
 
+ logger = Logger.new(STDOUT)
+ logger.formatter = proc do |severity, datetime, progname, msg|
+   "#{msg}\n"
+ end
+
  # Monkey patch all Net::HTTP network calls and log them.
  HttpLog.configure do |config|
+   config.enabled = true
+   config.logger = logger
+
    config.log_connect = false
    config.log_request = true
    config.log_headers = false
    config.log_data = false
    config.log_status = true
    config.log_response = false
-   config.log_benchmark = true
+   config.log_benchmark = false
 
-   config.compact_log = true
-   config.json_log = true
+   config.compact_log = false
+   config.json_log = false
  end
 
  # Call reload to load all recent code changes.
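
A note on the formatter above: the proc passed to `Logger#formatter=` receives `(severity, datetime, progname, msg)` and whatever it returns is what gets written, so this setup prints each httplog entry as the bare message. A minimal standalone check:

    require 'logger'

    logger = Logger.new(STDOUT)
    logger.formatter = proc { |severity, datetime, progname, msg| "#{msg}\n" }
    logger.info('GET http://example.com 200') # prints just "GET http://example.com 200"
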
@@ -33,9 +42,12 @@ end
  # You can add fixtures and/or initialization code here...
  reload
+
  url = "http://txti.es/"
  by_page = Finder.new
  by_link = Finder.new sort: :link
  finder = by_page
 
+ puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
+
  binding.pry
data/broken_link_finder.gemspec CHANGED
@@ -45,7 +45,7 @@ Gem::Specification.new do |spec|
    spec.add_development_dependency "httplog", "~> 1.3"
    spec.add_development_dependency "memory_profiler", "~> 0.9"
 
-   spec.add_runtime_dependency "wgit", "0.0.16"
+   spec.add_runtime_dependency "wgit", "0.0.17"
    spec.add_runtime_dependency "thread", "0.2"
    spec.add_runtime_dependency "thor", "0.20.3"
  end
data/exe/broken_link_finder CHANGED
@@ -6,18 +6,20 @@ require 'thor'
  class BrokenLinkFinderCLI < Thor
    desc 'crawl [URL]', 'Find broken links at the URL'
-   option :recursive, type: :boolean, aliases: [:r], desc: 'Crawl the entire site.'
-   option :sort_by_link, type: :boolean, aliases: [:l], desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
-   option :verbose, type: :boolean, aliases: [:v], desc: 'Display all ignored links.'
-   option :concise, type: :boolean, aliases: [:c], desc: 'Display only a summary of broken links.'
+   option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
+   option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+   option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
+   option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
+   option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
    def crawl(url)
      url = "http://#{url}" unless url.start_with?('http')
 
-     sort_by = options[:sort_by_link] ? :link : :page
-     broken_verbose = options[:concise] ? false : true
-     ignored_verbose = options[:verbose] ? true : false
+     sort_by         = options[:sort_by_link] ? :link : :page
+     max_threads     = options[:threads]
+     broken_verbose  = !options[:concise]
+     ignored_verbose = options[:verbose]
 
-     finder = BrokenLinkFinder::Finder.new(sort: sort_by)
+     finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
      options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
      finder.pretty_print_link_report(
        broken_verbose: broken_verbose,
@@ -26,6 +28,11 @@ class BrokenLinkFinderCLI < Thor
    rescue Exception => ex
      puts "An error has occurred: #{ex.message}"
    end
+
+   desc 'version', 'Display the currently installed version'
+   def version
+     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+   end
  end
 
  BrokenLinkFinderCLI.start(ARGV)
data/lib/broken_link_finder/finder.rb CHANGED
@@ -4,19 +4,18 @@ require 'thread/pool'
  require 'set'
 
  module BrokenLinkFinder
-   # Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
-   # override the max_threads variable.
-   def self.new(sort: :page)
-     Finder.new(sort: sort)
+   DEFAULT_MAX_THREADS = 100.freeze
+
+   # Alias for BrokenLinkFinder::Finder.new.
+   def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
+     Finder.new(sort: sort, max_threads: max_threads)
    end
 
    class Finder
-     DEFAULT_MAX_THREADS = 30.freeze
-
-     attr_reader :broken_links, :ignored_links, :total_links_crawled
+     attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled, :max_threads
 
      # Creates a new Finder instance.
-     def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
+     def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
        unless [:page, :link].include?(sort)
          raise "sort by either :page or :link, not #{sort}"
        end
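
For library callers, the thread cap can now be set through the module-level constructor as well; a minimal sketch (`example.com` is a placeholder URL):

    require 'broken_link_finder'

    # Sort the report by link and cap the recursive crawl at 50 threads.
    finder = BrokenLinkFinder.new(sort: :link, max_threads: 50)
    finder.crawl_site('http://example.com')
    finder.pretty_print_link_report
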
@@ -44,10 +43,11 @@ module BrokenLinkFinder
    def crawl_url(url)
      clear_links
 
-     # Ensure the given page url is valid.
      url = Wgit::Url.new(url)
      doc = @crawler.crawl_url(url)
-     raise "Invalid URL: #{url}" unless doc
+
+     # Ensure the given page url is valid.
+     raise "Invalid or broken URL: #{url}" unless doc
 
      # Get all page links and determine which are broken.
      find_broken_links(doc)
@@ -70,23 +70,24 @@ module BrokenLinkFinder
      crawled_pages = []
 
      # Crawl the site's HTML web pages looking for links.
-     @crawler.crawl_site(url) do |doc|
-       # Ensure the given website url is valid.
-       raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
+     orig_doc = @crawler.crawl_site(url) do |doc|
        crawled_pages << doc.url
-
-       # Get all page links and determine which are broken.
        next unless doc
+
+       # Start a thread for each page, checking for broken links.
        pool.process { find_broken_links(doc) }
      end
 
+     # Ensure the given website url is valid.
+     raise "Invalid or broken URL: #{url}" if orig_doc.nil?
+
      # Wait for all threads to finish.
      pool.shutdown
 
      sort_links
      set_total_links_crawled
 
-     [@broken_links.any?, crawled_pages]
+     [@broken_links.any?, crawled_pages.uniq]
    end
 
    # Pretty prints the link report into a stream e.g. STDOUT or a file,
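
Note the return value: `crawl_site` still returns a two element array of whether any broken links were found plus the crawled pages, with the page list now de-duplicated. For example (placeholder URL):

    has_broken_links, crawled_pages = finder.crawl_site('http://example.com')
    puts "Crawled #{crawled_pages.size} unique pages; broken links? #{has_broken_links}"
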
@@ -133,7 +134,7 @@ module BrokenLinkFinder
      end
 
      # The link hasn't been processed before so we crawl it.
-     link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
+     link_url = get_absolute_link(doc, link)
      link_doc = @crawler.crawl_url(link_url)
 
      # Determine if the crawled link is broken or not.
@@ -149,6 +150,11 @@ module BrokenLinkFinder
      nil
    end
 
+   # Returns the link in absolute form so it can be crawled.
+   def get_absolute_link(doc, link)
+     link.is_relative? ? doc.base_url(link: link).concat(link) : link
+   end
+
    # Returns true if the link is/contains a broken anchor.
    def has_broken_anchor(doc)
      raise "link document is nil" unless doc
@@ -200,10 +206,13 @@ module BrokenLinkFinder
 
    # Sort keys and values alphabetically.
    def sort_links
-     @broken_links = @broken_links.sort_by { |k, v| k }.to_h
+     @broken_links.values.map { |v| v.uniq! }
+     @ignored_links.values.map { |v| v.uniq! }
+
+     @broken_links  = @broken_links.sort_by  { |k, v| k }.to_h
      @ignored_links = @ignored_links.sort_by { |k, v| k }.to_h
 
-     @broken_links.each { |k, v| v.sort! }
+     @broken_links.each  { |k, v| v.sort! }
      @ignored_links.each { |k, v| v.sort! }
    end
 
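
The added `uniq!` calls look to be the fix for the incorrect link count mentioned in the changelog: duplicate link entries are collapsed before the report is built. Roughly:

    broken = { 'http://example.com/' => ['/a', '/a', '/b'] }
    broken.values.map { |v| v.uniq! }
    broken # => { "http://example.com/" => ["/a", "/b"] }
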
data/lib/broken_link_finder/reporter.rb CHANGED
@@ -11,16 +11,17 @@
        raise "sort by either :page or :link, not #{sort}"
      end
 
-     @stream = stream
-     @sort = sort
-     @broken_links = broken_links
-     @ignored_links = ignored_links
+     @stream        = stream
+     @sort          = sort
+     @broken_links  = broken_links
+     @ignored_links = ignored_links
    end
 
    # Pretty print a report detailing the link summary.
    def pretty_print_link_report(broken_verbose: true, ignored_verbose: false)
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)
+     nil
    end
 
    private
data/lib/broken_link_finder/version.rb CHANGED
@@ -1,3 +1,3 @@
  module BrokenLinkFinder
-   VERSION = "0.8.1"
+   VERSION = "0.9.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: broken_link_finder
  version: !ruby/object:Gem::Version
-   version: 0.8.1
+   version: 0.9.0
  platform: ruby
  authors:
  - Michael Telford
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-08-21 00:00:00.000000000 Z
+ date: 2019-08-30 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -128,14 +128,14 @@ dependencies:
    requirements:
    - - '='
      - !ruby/object:Gem::Version
-       version: 0.0.16
+       version: 0.0.17
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - '='
        - !ruby/object:Gem::Version
-         version: 0.0.16
+         version: 0.0.17
  - !ruby/object:Gem::Dependency
    name: thread
    requirement: !ruby/object:Gem::Requirement