broken_link_finder 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 715744f5d7676d5d4ba2cddb80f0f8109f79a7f92689c3ff3088a52f307f5f1f
4
- data.tar.gz: 7026f6037f0d710d8dab3bc710ddf7b202594c25ac8a8522398e62af3f4e78dd
3
+ metadata.gz: 110068a5db9454d69709454f50ade3f667c0e63dad6d101c23bf991a04f770eb
4
+ data.tar.gz: 0fd46fed486bf382935d68020fcae9391c288b5d5742ce19ba61d2896e2eebe7
5
5
  SHA512:
6
- metadata.gz: 7844d0c6d2c39752a98dcb9a7eb455f6492012321be71ff40f49641b7adc3658f4f541a35afc9ca1a9d6ce330472c4f166db0895acc1e1d6ecad53f86af7f0ac
7
- data.tar.gz: f5224db527d4636e9006ea332813d9cd133ea221506aa7a45edecc6fd230f212e21f4db8d1757a070c753876d6b003329a381b0f9a48663c966318b9fb2d1c86
6
+ metadata.gz: c9c744045ee462b8981ab418a5c3acdb9162c36e14c8603391c1f79e9247872de877a83ce2a7dd5f8779d8774a044562054445410666a5af518d11f5fded3f22
7
+ data.tar.gz: e4e0e2b2d19596493564a361fef0a98afdddcefb395309dba139df6bd508a12385ca16e47004bd503c585954454d9608449995b4ed6009926e0b856d11962181
data/CHANGELOG.md CHANGED
@@ -9,6 +9,15 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.9.2
13
+ ### Added
14
+ - ...
15
+ ### Changed/Removed
16
+ - Updated `wgit` gem to version 0.4.0 which brings a speed boost to crawls.
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
12
21
  ## v0.9.1
13
22
  ### Added
14
23
  - `BrokenLinkFinder::Finder.crawl_site` alias: `crawl_r`.
data/Gemfile CHANGED
@@ -2,5 +2,7 @@
2
2
 
3
3
  source 'https://rubygems.org'
4
4
 
5
+ ruby '~> 2.5'
6
+
5
7
  # Specify your gem's dependencies in broken_link_finder.gemspec
6
8
  gemspec
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- broken_link_finder (0.9.1)
5
- thor (= 0.20.3)
6
- thread (= 0.2)
7
- wgit (= 0.2.0)
4
+ broken_link_finder (0.9.2)
5
+ thor (~> 0.20.3)
6
+ thread (~> 0.2.0)
7
+ wgit (~> 0.4.0)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
@@ -16,15 +16,16 @@ GEM
16
16
  coderay (1.1.2)
17
17
  crack (0.4.3)
18
18
  safe_yaml (~> 1.0.0)
19
- hashdiff (0.4.0)
20
- httplog (1.3.2)
21
- rack (>= 1.0)
22
- rainbow (>= 2.0.0)
23
- memory_profiler (0.9.14)
19
+ ethon (0.12.0)
20
+ ffi (>= 1.3.0)
21
+ ffi (1.11.1)
22
+ hashdiff (1.0.0)
23
+ maxitest (3.4.0)
24
+ minitest (>= 5.0.0, < 5.13.0)
24
25
  method_source (0.9.2)
25
26
  mini_portile2 (2.4.0)
26
- minitest (5.11.3)
27
- mongo (2.9.1)
27
+ minitest (5.12.2)
28
+ mongo (2.9.2)
28
29
  bson (>= 4.4.2, < 5.0.0)
29
30
  nokogiri (1.10.4)
30
31
  mini_portile2 (~> 2.4.0)
@@ -32,20 +33,21 @@ GEM
32
33
  coderay (~> 1.1.0)
33
34
  method_source (~> 0.9.0)
34
35
  public_suffix (3.1.0)
35
- rack (2.0.7)
36
- rainbow (3.0.0)
37
36
  rake (10.5.0)
38
37
  safe_yaml (1.0.5)
39
38
  thor (0.20.3)
40
- thread (0.2.0)
41
- webmock (3.5.1)
39
+ thread (0.2.2)
40
+ typhoeus (1.3.1)
41
+ ethon (>= 0.9.0)
42
+ webmock (3.7.6)
42
43
  addressable (>= 2.3.6)
43
44
  crack (>= 0.3.2)
44
- hashdiff
45
- wgit (0.2.0)
45
+ hashdiff (>= 0.4.0, < 2.0.0)
46
+ wgit (0.4.0)
46
47
  addressable (~> 2.6.0)
47
48
  mongo (~> 2.9.0)
48
49
  nokogiri (~> 1.10.3)
50
+ typhoeus (~> 1.3.1)
49
51
 
50
52
  PLATFORMS
51
53
  ruby
@@ -54,12 +56,13 @@ DEPENDENCIES
54
56
  broken_link_finder!
55
57
  bundler (~> 2.0)
56
58
  byebug (~> 11.0)
57
- httplog (~> 1.3)
58
- memory_profiler (~> 0.9)
59
- minitest (~> 5.0)
59
+ maxitest (~> 3.3)
60
60
  pry (~> 0.12)
61
61
  rake (~> 10.0)
62
- webmock (~> 3.5)
62
+ webmock (~> 3.6)
63
+
64
+ RUBY VERSION
65
+ ruby 2.5.3p105
63
66
 
64
67
  BUNDLED WITH
65
68
  2.0.1
data/README.md CHANGED
@@ -19,9 +19,11 @@ In a nutshell, only HTTP(S) based links can be successfully verified by `broken_
19
19
 
20
20
  See the [usage](#Usage) section below on how to check which links have been ignored during a crawl.
21
21
 
22
+ With that said, the usual array of HTTP URL features is supported, including anchors/fragments, query strings and IRIs (non-ASCII based URLs).
23
+
22
24
  ## Made Possible By
23
25
 
24
- `broken_link_finder` relies heavily on the `wgit` Ruby gem. See its [repository](https://github.com/michaeltelford/wgit) for more details.
26
+ `broken_link_finder` relies heavily on the `wgit` Ruby gem by the same author. See its [repository](https://github.com/michaeltelford/wgit) for more details.
25
27
 
26
28
  ## Installation
27
29
 
data/benchmark.rb CHANGED
@@ -2,7 +2,6 @@
2
2
 
3
3
  require_relative './lib/broken_link_finder'
4
4
  require 'benchmark'
5
- require 'memory_profiler'
6
5
 
7
6
  url = ARGV[0] || 'http://txti.es'
8
7
  finder = BrokenLinkFinder::Finder.new
data/bin/console CHANGED
@@ -4,30 +4,20 @@
4
4
  require 'bundler/setup'
5
5
  require 'pry'
6
6
  require 'byebug'
7
- require 'logger'
8
- require 'httplog'
9
7
  require 'broken_link_finder'
10
8
 
11
- logger = Logger.new(STDOUT)
12
- logger.formatter = proc do |_severity, _datetime, _progname, msg|
13
- "#{msg}\n"
14
- end
15
-
16
- # Monkey patch all Net:HTTP network calls and log them.
17
- HttpLog.configure do |config|
18
- config.enabled = true
19
- config.logger = logger
20
-
21
- config.log_connect = false
22
- config.log_request = true
23
- config.log_headers = false
24
- config.log_data = false
25
- config.log_status = true
26
- config.log_response = false
27
- config.log_benchmark = false
28
-
29
- config.compact_log = false
30
- config.json_log = false
9
+ # Monkey patch and log all HTTP requests made during the console session.
10
+ module Typhoeus
11
+ singleton_class.class_eval do
12
+ alias_method :orig_get, :get
13
+ end
14
+
15
+ def self.get(base_url, options = {})
16
+ puts "[typhoeus] Sending GET: #{base_url}"
17
+ resp = orig_get(base_url, options)
18
+ puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
19
+ resp
20
+ end
31
21
  end
32
22
 
33
23
  # Call reload to load all recent code changes.
@@ -48,6 +38,7 @@ by_page = Finder.new
48
38
  by_link = Finder.new sort: :link
49
39
  finder = by_page
50
40
 
41
+ # Start the console.
51
42
  puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
52
43
 
53
44
  binding.pry
@@ -39,14 +39,12 @@ Gem::Specification.new do |spec|
39
39
 
40
40
  spec.add_development_dependency 'bundler', '~> 2.0'
41
41
  spec.add_development_dependency 'byebug', '~> 11.0'
42
- spec.add_development_dependency 'httplog', '~> 1.3'
43
- spec.add_development_dependency 'memory_profiler', '~> 0.9'
44
- spec.add_development_dependency 'minitest', '~> 5.0'
42
+ spec.add_development_dependency 'maxitest', '~> 3.3'
45
43
  spec.add_development_dependency 'pry', '~> 0.12'
46
44
  spec.add_development_dependency 'rake', '~> 10.0'
47
- spec.add_development_dependency 'webmock', '~> 3.5'
45
+ spec.add_development_dependency 'webmock', '~> 3.6'
48
46
 
49
- spec.add_runtime_dependency 'thor', '0.20.3'
50
- spec.add_runtime_dependency 'thread', '0.2'
51
- spec.add_runtime_dependency 'wgit', '0.2.0'
47
+ spec.add_runtime_dependency 'thor', '~> 0.20.3'
48
+ spec.add_runtime_dependency 'thread', '~> 0.2.0'
49
+ spec.add_runtime_dependency 'wgit', '~> 0.4.0'
52
50
  end
@@ -39,7 +39,7 @@ module BrokenLinkFinder
39
39
 
40
40
  # Finds broken links within a single page and appends them to the
41
41
  # @broken_links array. Returns true if at least one broken link was found.
42
- # Access the broken links with Finder#broken_links.
42
+ # Access the broken links afterwards with Finder#broken_links.
43
43
  def crawl_url(url)
44
44
  clear_links
45
45
 
@@ -61,7 +61,7 @@ module BrokenLinkFinder
61
61
  # Finds broken links within an entire site and appends them to the
62
62
  # @broken_links array. Returns a tuple containing a Boolean of true if
63
63
  # at least one broken link was found and an Array of all pages crawled.
64
- # Access the broken links with Finder#broken_links.
64
+ # Access the broken links afterwards with Finder#broken_links.
65
65
  def crawl_site(url)
66
66
  clear_links
67
67
 
@@ -70,7 +70,7 @@ module BrokenLinkFinder
70
70
  crawled_pages = []
71
71
 
72
72
  # Crawl the site's HTML web pages looking for links.
73
- orig_doc = @crawler.crawl_site(url) do |doc|
73
+ externals = @crawler.crawl_site(url) do |doc|
74
74
  crawled_pages << doc.url
75
75
  next unless doc
76
76
 
@@ -79,7 +79,7 @@ module BrokenLinkFinder
79
79
  end
80
80
 
81
81
  # Ensure the given website url is valid.
82
- raise "Invalid or broken URL: #{url}" if orig_doc.nil?
82
+ raise "Invalid or broken URL: #{url}" unless externals
83
83
 
84
84
  # Wait for all threads to finish.
85
85
  pool.shutdown
@@ -113,15 +113,7 @@ module BrokenLinkFinder
113
113
 
114
114
  # Finds which links are unsupported or broken and records the details.
115
115
  def find_broken_links(doc)
116
- # Report and reject any non supported links.
117
- links = doc.all_links
118
- .reject do |link|
119
- if link.is_absolute? && !link.start_with?('http')
120
- append_ignored_link(doc.url, link)
121
- true
122
- end
123
- end
124
- .uniq
116
+ links = get_supported_links(doc)
125
117
 
126
118
  # Iterate over the supported links checking if they're broken or not.
127
119
  links.each do |link|
@@ -134,11 +126,10 @@ module BrokenLinkFinder
134
126
  end
135
127
 
136
128
  # The link hasn't been processed before so we crawl it.
137
- link_url = get_absolute_link(doc, link)
138
- link_doc = @crawler.crawl_url(link_url)
129
+ link_doc = crawl_link(doc, link)
139
130
 
140
131
  # Determine if the crawled link is broken or not.
141
- if @crawler.last_response.is_a?(Net::HTTPNotFound) ||
132
+ if @crawler.last_response.code == 404 ||
142
133
  link_doc.nil? ||
143
134
  has_broken_anchor(link_doc)
144
135
  append_broken_link(doc.url, link)
@@ -150,6 +141,24 @@ module BrokenLinkFinder
150
141
  nil
151
142
  end
152
143
 
144
+ # Report and reject any unsupported links. Any link that is absolute and
145
+ # doesn't start with 'http' is unsupported, e.g. 'mailto:blah'.
146
+ def get_supported_links(doc)
147
+ doc.all_links
148
+ .reject do |link|
149
+ if link.is_absolute? && !link.start_with?('http')
150
+ append_ignored_link(doc.url, link)
151
+ true
152
+ end
153
+ end
154
+ end
155
+
156
+ # Makes the link absolute and crawls it, returning its Wgit::Document.
157
+ def crawl_link(doc, link)
158
+ link = get_absolute_link(doc, link)
159
+ @crawler.crawl_url(link)
160
+ end
161
+
153
162
  # Returns the link in absolute form so it can be crawled.
154
163
  def get_absolute_link(doc, link)
155
164
  link.is_relative? ? doc.base_url(link: link).concat(link) : link
@@ -12,10 +12,10 @@ module BrokenLinkFinder
12
12
  raise "sort by either :page or :link, not #{sort}" \
13
13
  unless %i[page link].include?(sort)
14
14
 
15
- @stream = stream
16
- @sort = sort
17
- @broken_links = broken_links
18
- @ignored_links = ignored_links
15
+ @stream = stream
16
+ @sort = sort
17
+ @broken_links = broken_links
18
+ @ignored_links = ignored_links
19
19
  end
20
20
 
21
21
  # Pretty print a report detailing the link summary.
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- VERSION = '0.9.1'
4
+ VERSION = '0.9.2'
5
5
  end
@@ -7,5 +7,5 @@ Wgit::Document.define_extension(
7
7
  singleton: false,
8
8
  text_content_only: true
9
9
  ) do |links|
10
- links&.map(&:to_url)&.uniq
10
+ links.uniq.to_urls
11
11
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: broken_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.1
4
+ version: 0.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-22 00:00:00.000000000 Z
11
+ date: 2019-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -39,47 +39,19 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '11.0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: httplog
42
+ name: maxitest
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.3'
47
+ version: '3.3'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.3'
55
- - !ruby/object:Gem::Dependency
56
- name: memory_profiler
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: '0.9'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: '0.9'
69
- - !ruby/object:Gem::Dependency
70
- name: minitest
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: '5.0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: '5.0'
54
+ version: '3.3'
83
55
  - !ruby/object:Gem::Dependency
84
56
  name: pry
85
57
  requirement: !ruby/object:Gem::Requirement
@@ -114,56 +86,56 @@ dependencies:
114
86
  requirements:
115
87
  - - "~>"
116
88
  - !ruby/object:Gem::Version
117
- version: '3.5'
89
+ version: '3.6'
118
90
  type: :development
119
91
  prerelease: false
120
92
  version_requirements: !ruby/object:Gem::Requirement
121
93
  requirements:
122
94
  - - "~>"
123
95
  - !ruby/object:Gem::Version
124
- version: '3.5'
96
+ version: '3.6'
125
97
  - !ruby/object:Gem::Dependency
126
98
  name: thor
127
99
  requirement: !ruby/object:Gem::Requirement
128
100
  requirements:
129
- - - '='
101
+ - - "~>"
130
102
  - !ruby/object:Gem::Version
131
103
  version: 0.20.3
132
104
  type: :runtime
133
105
  prerelease: false
134
106
  version_requirements: !ruby/object:Gem::Requirement
135
107
  requirements:
136
- - - '='
108
+ - - "~>"
137
109
  - !ruby/object:Gem::Version
138
110
  version: 0.20.3
139
111
  - !ruby/object:Gem::Dependency
140
112
  name: thread
141
113
  requirement: !ruby/object:Gem::Requirement
142
114
  requirements:
143
- - - '='
115
+ - - "~>"
144
116
  - !ruby/object:Gem::Version
145
- version: '0.2'
117
+ version: 0.2.0
146
118
  type: :runtime
147
119
  prerelease: false
148
120
  version_requirements: !ruby/object:Gem::Requirement
149
121
  requirements:
150
- - - '='
122
+ - - "~>"
151
123
  - !ruby/object:Gem::Version
152
- version: '0.2'
124
+ version: 0.2.0
153
125
  - !ruby/object:Gem::Dependency
154
126
  name: wgit
155
127
  requirement: !ruby/object:Gem::Requirement
156
128
  requirements:
157
- - - '='
129
+ - - "~>"
158
130
  - !ruby/object:Gem::Version
159
- version: 0.2.0
131
+ version: 0.4.0
160
132
  type: :runtime
161
133
  prerelease: false
162
134
  version_requirements: !ruby/object:Gem::Requirement
163
135
  requirements:
164
- - - '='
136
+ - - "~>"
165
137
  - !ruby/object:Gem::Version
166
- version: 0.2.0
138
+ version: 0.4.0
167
139
  description: Finds a website's broken links using the 'wgit' gem and reports back
168
140
  to you with a summary.
169
141
  email: michael.telford@live.com