broken_link_finder 0.7.0 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: be656fb33a6363c5da6bb1cd05e52f8bc9b1f7223825bd7dc4ada8af2bdea1d2
4
- data.tar.gz: 2b4db95eaf086c10ac6f7528ec63c426ead3507f9adc4ecf0fd177c828d487f2
3
+ metadata.gz: 3e926c0997f5dcb805a9cdb4bf367237e3dff3284259395f6f38fa6b23a69172
4
+ data.tar.gz: 5c4020b8d1ae713a34593107580e82e3b3c06c75ec9a675e6cb536c07add1aca
5
5
  SHA512:
6
- metadata.gz: 59bdd686ff0cce9359e51415011ba0521404834790ba7eec304d29d085bbdfafb3c1c0f8fd63bd235d41c0aed61278a2211f05fdb0f3d9c15d31a53f1b18b877
7
- data.tar.gz: 27986886e3fa6ab4123027ff3067633787470c00401001942f7b28f4ba87fcfe38a11513caa2ade8dc5bfed0efb8a9db4af9749130c8960ac7ccb84a169fc154
6
+ metadata.gz: c988b4c602441da10fcf602ca14e528ad8ce2e076e961659e10130c3393c4d2a0729a1c51e4611445f3d20c27402d65ae2cab8bb937e1009b4af1123462aaa01
7
+ data.tar.gz: 448fbc03310b0378a91e2a2bf3db95bd33cc4cd2fe51af36c55e0409e05482c5beed9783ee6465b3747d8e25f4afd3fdfa72c0604834ab2b0df070921605a183
@@ -9,6 +9,15 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.8.0
13
+ ### Added
14
+ - Logic to prevent re-crawling links, improving efficiency.
15
+ ### Changed/Removed
16
+ - Updated the `wgit` gem, which fixes a bug in `crawl_site` and adds support for IRIs.
17
+ ### Fixed
18
+ - Bug where an error from the executable wasn't being rescued.
19
+ ---
20
+
12
21
  ## v0.7.0
13
22
  ### Added
14
23
  - Added the `--verbose` flag to the executable for displaying all ignored links.
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- broken_link_finder (0.7.0)
4
+ broken_link_finder (0.8.0)
5
5
  thor (= 0.20.3)
6
6
  thread (= 0.2)
7
- wgit (= 0.0.13)
7
+ wgit (= 0.0.15)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
@@ -24,7 +24,7 @@ GEM
24
24
  method_source (0.9.2)
25
25
  mini_portile2 (2.4.0)
26
26
  minitest (5.11.3)
27
- mongo (2.8.0)
27
+ mongo (2.9.1)
28
28
  bson (>= 4.4.2, < 5.0.0)
29
29
  nokogiri (1.10.4)
30
30
  mini_portile2 (~> 2.4.0)
@@ -42,8 +42,9 @@ GEM
42
42
  addressable (>= 2.3.6)
43
43
  crack (>= 0.3.2)
44
44
  hashdiff
45
- wgit (0.0.13)
46
- mongo (~> 2.8.0)
45
+ wgit (0.0.15)
46
+ addressable (~> 2.6.0)
47
+ mongo (~> 2.9.0)
47
48
  nokogiri (~> 1.10.3)
48
49
 
49
50
  PLATFORMS
@@ -1,18 +1,23 @@
1
- require 'broken_link_finder'
1
+ require_relative './lib/broken_link_finder'
2
2
  require 'benchmark'
3
3
  require 'memory_profiler'
4
4
 
5
- url = ARGV[0] || "http://txti.es"
5
+ url = ARGV[0] || 'http://txti.es'
6
6
  finder = BrokenLinkFinder::Finder.new
7
7
 
8
- puts Benchmark.measure { finder.crawl_page url }
8
+ # puts Benchmark.measure { finder.crawl_page url }
9
9
  puts Benchmark.measure { finder.crawl_site url }
10
+ puts "Links crawled: #{finder.total_links_crawled}"
10
11
 
11
- # http://txti.es
12
- # Pre threading: 17.591528
13
- # Post threading: 7.508828 :-)
12
+ # http://txti.es page crawl
13
+ # Pre threading: 17.5 seconds
14
+ # Post threading: 7.5 seconds
14
15
 
15
- # http://txti.es
16
+ # http://txti.es post threading - page vs site crawl
16
17
  # Page: 9.526981
17
18
  # Site: 9.732416
18
- # Multi-threading crawl_site now yields the same time as a single page.
19
+ # Multi-threading crawl_site now yields the same time as a single page
20
+
21
+ # https://meos.ch/ site crawl - post all link recording functionality
22
+ # Pre: 608 seconds with 7665 links crawled
23
+ # Post: 355 seconds with 1099 links crawled
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
34
34
  spec.require_paths = ["lib"]
35
35
  spec.post_install_message = "Added the executable 'broken_link_finder' to $PATH"
36
36
 
37
- spec.required_ruby_version = '~> 2.5' # Only works with ruby 2.5.x
37
+ spec.required_ruby_version = '~> 2.5'
38
38
 
39
39
  spec.add_development_dependency "bundler", "~> 2.0"
40
40
  spec.add_development_dependency "rake", "~> 10.0"
@@ -45,7 +45,7 @@ Gem::Specification.new do |spec|
45
45
  spec.add_development_dependency "httplog", "~> 1.3"
46
46
  spec.add_development_dependency "memory_profiler", "~> 0.9"
47
47
 
48
- spec.add_runtime_dependency "wgit", "0.0.13"
48
+ spec.add_runtime_dependency "wgit", "0.0.15"
49
49
  spec.add_runtime_dependency "thread", "0.2"
50
50
  spec.add_runtime_dependency "thor", "0.20.3"
51
51
  end
@@ -23,6 +23,8 @@ class BrokenLinkFinderCLI < Thor
23
23
  broken_verbose: broken_verbose,
24
24
  ignored_verbose: ignored_verbose
25
25
  )
26
+ rescue Exception => ex
27
+ puts "An error has occurred: #{ex.message}"
26
28
  end
27
29
  end
28
30
 
@@ -1,6 +1,7 @@
1
1
  require_relative 'reporter'
2
2
  require 'wgit'
3
3
  require 'thread/pool'
4
+ require 'set'
4
5
 
5
6
  module BrokenLinkFinder
6
7
  # Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
@@ -12,7 +13,7 @@ module BrokenLinkFinder
12
13
  class Finder
13
14
  DEFAULT_MAX_THREADS = 30.freeze
14
15
 
15
- attr_reader :broken_links, :ignored_links
16
+ attr_reader :broken_links, :ignored_links, :total_links_crawled
16
17
 
17
18
  # Creates a new Finder instance.
18
19
  def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
@@ -20,18 +21,21 @@ module BrokenLinkFinder
20
21
  raise "sort by either :page or :link, not #{sort}"
21
22
  end
22
23
 
23
- @sort = sort
24
+ @sort = sort
24
25
  @max_threads = max_threads
25
- @lock = Mutex.new
26
- @crawler = Wgit::Crawler.new
26
+ @lock = Mutex.new
27
+ @crawler = Wgit::Crawler.new
27
28
 
28
29
  clear_links
29
30
  end
30
31
 
31
32
  # Clear/empty the link collection Hashes.
32
33
  def clear_links
33
- @broken_links = {}
34
- @ignored_links = {}
34
+ @broken_links = {}
35
+ @ignored_links = {}
36
+ @total_links_crawled = 0
37
+ @all_broken_links = Set.new
38
+ @all_intact_links = Set.new
35
39
  end
36
40
 
37
41
  # Finds broken links within a single page and appends them to the
@@ -39,9 +43,9 @@ module BrokenLinkFinder
39
43
  # Access the broken links with Finder#broken_links.
40
44
  def crawl_url(url)
41
45
  clear_links
42
- url = Wgit::Url.new(url)
43
46
 
44
47
  # Ensure the given page url is valid.
48
+ url = Wgit::Url.new(url)
45
49
  doc = @crawler.crawl_url(url)
46
50
  raise "Invalid URL: #{url}" unless doc
47
51
 
@@ -49,6 +53,8 @@ module BrokenLinkFinder
49
53
  find_broken_links(doc)
50
54
 
51
55
  sort_links
56
+ set_total_links_crawled
57
+
52
58
  @broken_links.any?
53
59
  end
54
60
 
@@ -58,6 +64,7 @@ module BrokenLinkFinder
58
64
  # Access the broken links with Finder#broken_links.
59
65
  def crawl_site(url)
60
66
  clear_links
67
+
61
68
  url = Wgit::Url.new(url)
62
69
  pool = Thread.pool(@max_threads)
63
70
  crawled_pages = []
@@ -73,8 +80,12 @@ module BrokenLinkFinder
73
80
  pool.process { find_broken_links(doc) }
74
81
  end
75
82
 
76
- pool.shutdown # Wait for all threads to finish.
83
+ # Wait for all threads to finish.
84
+ pool.shutdown
85
+
77
86
  sort_links
87
+ set_total_links_crawled
88
+
78
89
  [@broken_links.any?, crawled_pages]
79
90
  end
80
91
 
@@ -101,7 +112,7 @@ module BrokenLinkFinder
101
112
 
102
113
  # Finds which links are unsupported or broken and records the details.
103
114
  def find_broken_links(doc)
104
- # Process the Document's links before checking if they're broke.
115
+ # Report and reject any non supported links.
105
116
  links = doc.all_links.
106
117
  reject do |link|
107
118
  if !link.is_relative? and !link.start_with?('http')
@@ -113,13 +124,25 @@ module BrokenLinkFinder
113
124
 
114
125
  # Iterate over the supported links checking if they're broken or not.
115
126
  links.each do |link|
127
+ # Check if the link has already been processed previously.
128
+ next if @all_intact_links.include?(link)
129
+
130
+ if @all_broken_links.include?(link)
131
+ append_broken_link(doc.url, link)
132
+ next
133
+ end
134
+
135
+ # The link hasn't been processed before so we crawl it.
116
136
  link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
117
137
  link_doc = @crawler.crawl_url(link_url)
118
138
 
139
+ # Determine if the crawled link is broken or not.
119
140
  if @crawler.last_response.is_a?(Net::HTTPNotFound) or
120
141
  link_doc.nil? or
121
142
  has_broken_anchor(link_doc)
122
143
  append_broken_link(doc.url, link)
144
+ else
145
+ @lock.synchronize { @all_intact_links << link }
123
146
  end
124
147
  end
125
148
 
@@ -140,17 +163,21 @@ module BrokenLinkFinder
140
163
  # Append key => [value] to @broken_links.
141
164
  def append_broken_link(url, link)
142
165
  key, value = get_key_value(url, link)
166
+
143
167
  @lock.synchronize do
144
168
  unless @broken_links[key]
145
169
  @broken_links[key] = []
146
170
  end
147
171
  @broken_links[key] << value
172
+
173
+ @all_broken_links << link
148
174
  end
149
175
  end
150
176
 
151
177
  # Append key => [value] to @ignored_links.
152
178
  def append_ignored_link(url, link)
153
179
  key, value = get_key_value(url, link)
180
+
154
181
  @lock.synchronize do
155
182
  unless @ignored_links[key]
156
183
  @ignored_links[key] = []
@@ -180,6 +207,11 @@ module BrokenLinkFinder
180
207
  @ignored_links.each { |k, v| v.sort! }
181
208
  end
182
209
 
210
+ # Sets and returns the total number of links crawled.
211
+ def set_total_links_crawled
212
+ @total_links_crawled = @all_broken_links.size + @all_intact_links.size
213
+ end
214
+
183
215
  alias_method :crawl_page, :crawl_url
184
216
  alias_method :pretty_print_link_summary, :pretty_print_link_report
185
217
  end
@@ -1,3 +1,3 @@
1
1
  module BrokenLinkFinder
2
- VERSION = "0.7.0"
2
+ VERSION = "0.8.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: broken_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-08-13 00:00:00.000000000 Z
11
+ date: 2019-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -128,14 +128,14 @@ dependencies:
128
128
  requirements:
129
129
  - - '='
130
130
  - !ruby/object:Gem::Version
131
- version: 0.0.13
131
+ version: 0.0.15
132
132
  type: :runtime
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
136
  - - '='
137
137
  - !ruby/object:Gem::Version
138
- version: 0.0.13
138
+ version: 0.0.15
139
139
  - !ruby/object:Gem::Dependency
140
140
  name: thread
141
141
  requirement: !ruby/object:Gem::Requirement