broken_link_finder 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: be656fb33a6363c5da6bb1cd05e52f8bc9b1f7223825bd7dc4ada8af2bdea1d2
4
- data.tar.gz: 2b4db95eaf086c10ac6f7528ec63c426ead3507f9adc4ecf0fd177c828d487f2
3
+ metadata.gz: 3e926c0997f5dcb805a9cdb4bf367237e3dff3284259395f6f38fa6b23a69172
4
+ data.tar.gz: 5c4020b8d1ae713a34593107580e82e3b3c06c75ec9a675e6cb536c07add1aca
5
5
  SHA512:
6
- metadata.gz: 59bdd686ff0cce9359e51415011ba0521404834790ba7eec304d29d085bbdfafb3c1c0f8fd63bd235d41c0aed61278a2211f05fdb0f3d9c15d31a53f1b18b877
7
- data.tar.gz: 27986886e3fa6ab4123027ff3067633787470c00401001942f7b28f4ba87fcfe38a11513caa2ade8dc5bfed0efb8a9db4af9749130c8960ac7ccb84a169fc154
6
+ metadata.gz: c988b4c602441da10fcf602ca14e528ad8ce2e076e961659e10130c3393c4d2a0729a1c51e4611445f3d20c27402d65ae2cab8bb937e1009b4af1123462aaa01
7
+ data.tar.gz: 448fbc03310b0378a91e2a2bf3db95bd33cc4cd2fe51af36c55e0409e05482c5beed9783ee6465b3747d8e25f4afd3fdfa72c0604834ab2b0df070921605a183
@@ -9,6 +9,15 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.8.0
13
+ ### Added
14
+ - Logic to prevent re-crawling links, making crawls more efficient.
15
+ ### Changed/Removed
16
+ - Updated the `wgit` gem, which fixes a bug in `crawl_site` and adds support for IRIs.
17
+ ### Fixed
18
+ - Bug where an error from the executable wasn't being rescued.
19
+ ---
20
+
12
21
  ## v0.7.0
13
22
  ### Added
14
23
  - Added the `--verbose` flag to the executable for displaying all ignored links.
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- broken_link_finder (0.7.0)
4
+ broken_link_finder (0.8.0)
5
5
  thor (= 0.20.3)
6
6
  thread (= 0.2)
7
- wgit (= 0.0.13)
7
+ wgit (= 0.0.15)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
@@ -24,7 +24,7 @@ GEM
24
24
  method_source (0.9.2)
25
25
  mini_portile2 (2.4.0)
26
26
  minitest (5.11.3)
27
- mongo (2.8.0)
27
+ mongo (2.9.1)
28
28
  bson (>= 4.4.2, < 5.0.0)
29
29
  nokogiri (1.10.4)
30
30
  mini_portile2 (~> 2.4.0)
@@ -42,8 +42,9 @@ GEM
42
42
  addressable (>= 2.3.6)
43
43
  crack (>= 0.3.2)
44
44
  hashdiff
45
- wgit (0.0.13)
46
- mongo (~> 2.8.0)
45
+ wgit (0.0.15)
46
+ addressable (~> 2.6.0)
47
+ mongo (~> 2.9.0)
47
48
  nokogiri (~> 1.10.3)
48
49
 
49
50
  PLATFORMS
@@ -1,18 +1,23 @@
1
- require 'broken_link_finder'
1
+ require_relative './lib/broken_link_finder'
2
2
  require 'benchmark'
3
3
  require 'memory_profiler'
4
4
 
5
- url = ARGV[0] || "http://txti.es"
5
+ url = ARGV[0] || 'http://txti.es'
6
6
  finder = BrokenLinkFinder::Finder.new
7
7
 
8
- puts Benchmark.measure { finder.crawl_page url }
8
+ # puts Benchmark.measure { finder.crawl_page url }
9
9
  puts Benchmark.measure { finder.crawl_site url }
10
+ puts "Links crawled: #{finder.total_links_crawled}"
10
11
 
11
- # http://txti.es
12
- # Pre threading: 17.591528
13
- # Post threading: 7.508828 :-)
12
+ # http://txti.es page crawl
13
+ # Pre threading: 17.5 seconds
14
+ # Post threading: 7.5 seconds
14
15
 
15
- # http://txti.es
16
+ # http://txti.es post threading - page vs site crawl
16
17
  # Page: 9.526981
17
18
  # Site: 9.732416
18
- # Multi-threading crawl_site now yields the same time as a single page.
19
+ # Multi-threading crawl_site now yields the same time as a single page
20
+
21
+ # https://meos.ch/ site crawl - post all link recording functionality
22
+ # Pre: 608 seconds with 7665 links crawled
23
+ # Post: 355 seconds with 1099 links crawled
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
34
34
  spec.require_paths = ["lib"]
35
35
  spec.post_install_message = "Added the executable 'broken_link_finder' to $PATH"
36
36
 
37
- spec.required_ruby_version = '~> 2.5' # Only works with ruby 2.5.x
37
+ spec.required_ruby_version = '~> 2.5'
38
38
 
39
39
  spec.add_development_dependency "bundler", "~> 2.0"
40
40
  spec.add_development_dependency "rake", "~> 10.0"
@@ -45,7 +45,7 @@ Gem::Specification.new do |spec|
45
45
  spec.add_development_dependency "httplog", "~> 1.3"
46
46
  spec.add_development_dependency "memory_profiler", "~> 0.9"
47
47
 
48
- spec.add_runtime_dependency "wgit", "0.0.13"
48
+ spec.add_runtime_dependency "wgit", "0.0.15"
49
49
  spec.add_runtime_dependency "thread", "0.2"
50
50
  spec.add_runtime_dependency "thor", "0.20.3"
51
51
  end
@@ -23,6 +23,8 @@ class BrokenLinkFinderCLI < Thor
23
23
  broken_verbose: broken_verbose,
24
24
  ignored_verbose: ignored_verbose
25
25
  )
26
+ rescue Exception => ex
27
+ puts "An error has occurred: #{ex.message}"
26
28
  end
27
29
  end
28
30
 
@@ -1,6 +1,7 @@
1
1
  require_relative 'reporter'
2
2
  require 'wgit'
3
3
  require 'thread/pool'
4
+ require 'set'
4
5
 
5
6
  module BrokenLinkFinder
6
7
  # Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
@@ -12,7 +13,7 @@ module BrokenLinkFinder
12
13
  class Finder
13
14
  DEFAULT_MAX_THREADS = 30.freeze
14
15
 
15
- attr_reader :broken_links, :ignored_links
16
+ attr_reader :broken_links, :ignored_links, :total_links_crawled
16
17
 
17
18
  # Creates a new Finder instance.
18
19
  def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
@@ -20,18 +21,21 @@ module BrokenLinkFinder
20
21
  raise "sort by either :page or :link, not #{sort}"
21
22
  end
22
23
 
23
- @sort = sort
24
+ @sort = sort
24
25
  @max_threads = max_threads
25
- @lock = Mutex.new
26
- @crawler = Wgit::Crawler.new
26
+ @lock = Mutex.new
27
+ @crawler = Wgit::Crawler.new
27
28
 
28
29
  clear_links
29
30
  end
30
31
 
31
32
  # Clear/empty the link collection Hashes.
32
33
  def clear_links
33
- @broken_links = {}
34
- @ignored_links = {}
34
+ @broken_links = {}
35
+ @ignored_links = {}
36
+ @total_links_crawled = 0
37
+ @all_broken_links = Set.new
38
+ @all_intact_links = Set.new
35
39
  end
36
40
 
37
41
  # Finds broken links within a single page and appends them to the
@@ -39,9 +43,9 @@ module BrokenLinkFinder
39
43
  # Access the broken links with Finder#broken_links.
40
44
  def crawl_url(url)
41
45
  clear_links
42
- url = Wgit::Url.new(url)
43
46
 
44
47
  # Ensure the given page url is valid.
48
+ url = Wgit::Url.new(url)
45
49
  doc = @crawler.crawl_url(url)
46
50
  raise "Invalid URL: #{url}" unless doc
47
51
 
@@ -49,6 +53,8 @@ module BrokenLinkFinder
49
53
  find_broken_links(doc)
50
54
 
51
55
  sort_links
56
+ set_total_links_crawled
57
+
52
58
  @broken_links.any?
53
59
  end
54
60
 
@@ -58,6 +64,7 @@ module BrokenLinkFinder
58
64
  # Access the broken links with Finder#broken_links.
59
65
  def crawl_site(url)
60
66
  clear_links
67
+
61
68
  url = Wgit::Url.new(url)
62
69
  pool = Thread.pool(@max_threads)
63
70
  crawled_pages = []
@@ -73,8 +80,12 @@ module BrokenLinkFinder
73
80
  pool.process { find_broken_links(doc) }
74
81
  end
75
82
 
76
- pool.shutdown # Wait for all threads to finish.
83
+ # Wait for all threads to finish.
84
+ pool.shutdown
85
+
77
86
  sort_links
87
+ set_total_links_crawled
88
+
78
89
  [@broken_links.any?, crawled_pages]
79
90
  end
80
91
 
@@ -101,7 +112,7 @@ module BrokenLinkFinder
101
112
 
102
113
  # Finds which links are unsupported or broken and records the details.
103
114
  def find_broken_links(doc)
104
- # Process the Document's links before checking if they're broke.
115
+ # Report and reject any non supported links.
105
116
  links = doc.all_links.
106
117
  reject do |link|
107
118
  if !link.is_relative? and !link.start_with?('http')
@@ -113,13 +124,25 @@ module BrokenLinkFinder
113
124
 
114
125
  # Iterate over the supported links checking if they're broken or not.
115
126
  links.each do |link|
127
+ # Check if the link has already been processed previously.
128
+ next if @all_intact_links.include?(link)
129
+
130
+ if @all_broken_links.include?(link)
131
+ append_broken_link(doc.url, link)
132
+ next
133
+ end
134
+
135
+ # The link hasn't been processed before so we crawl it.
116
136
  link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
117
137
  link_doc = @crawler.crawl_url(link_url)
118
138
 
139
+ # Determine if the crawled link is broken or not.
119
140
  if @crawler.last_response.is_a?(Net::HTTPNotFound) or
120
141
  link_doc.nil? or
121
142
  has_broken_anchor(link_doc)
122
143
  append_broken_link(doc.url, link)
144
+ else
145
+ @lock.synchronize { @all_intact_links << link }
123
146
  end
124
147
  end
125
148
 
@@ -140,17 +163,21 @@ module BrokenLinkFinder
140
163
  # Append key => [value] to @broken_links.
141
164
  def append_broken_link(url, link)
142
165
  key, value = get_key_value(url, link)
166
+
143
167
  @lock.synchronize do
144
168
  unless @broken_links[key]
145
169
  @broken_links[key] = []
146
170
  end
147
171
  @broken_links[key] << value
172
+
173
+ @all_broken_links << link
148
174
  end
149
175
  end
150
176
 
151
177
  # Append key => [value] to @ignored_links.
152
178
  def append_ignored_link(url, link)
153
179
  key, value = get_key_value(url, link)
180
+
154
181
  @lock.synchronize do
155
182
  unless @ignored_links[key]
156
183
  @ignored_links[key] = []
@@ -180,6 +207,11 @@ module BrokenLinkFinder
180
207
  @ignored_links.each { |k, v| v.sort! }
181
208
  end
182
209
 
210
+ # Sets and returns the total number of links crawled.
211
+ def set_total_links_crawled
212
+ @total_links_crawled = @all_broken_links.size + @all_intact_links.size
213
+ end
214
+
183
215
  alias_method :crawl_page, :crawl_url
184
216
  alias_method :pretty_print_link_summary, :pretty_print_link_report
185
217
  end
@@ -1,3 +1,3 @@
1
1
  module BrokenLinkFinder
2
- VERSION = "0.7.0"
2
+ VERSION = "0.8.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: broken_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-08-13 00:00:00.000000000 Z
11
+ date: 2019-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -128,14 +128,14 @@ dependencies:
128
128
  requirements:
129
129
  - - '='
130
130
  - !ruby/object:Gem::Version
131
- version: 0.0.13
131
+ version: 0.0.15
132
132
  type: :runtime
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
136
  - - '='
137
137
  - !ruby/object:Gem::Version
138
- version: 0.0.13
138
+ version: 0.0.15
139
139
  - !ruby/object:Gem::Dependency
140
140
  name: thread
141
141
  requirement: !ruby/object:Gem::Requirement