broken_link_finder 0.9.1 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 715744f5d7676d5d4ba2cddb80f0f8109f79a7f92689c3ff3088a52f307f5f1f
4
- data.tar.gz: 7026f6037f0d710d8dab3bc710ddf7b202594c25ac8a8522398e62af3f4e78dd
3
+ metadata.gz: 110068a5db9454d69709454f50ade3f667c0e63dad6d101c23bf991a04f770eb
4
+ data.tar.gz: 0fd46fed486bf382935d68020fcae9391c288b5d5742ce19ba61d2896e2eebe7
5
5
  SHA512:
6
- metadata.gz: 7844d0c6d2c39752a98dcb9a7eb455f6492012321be71ff40f49641b7adc3658f4f541a35afc9ca1a9d6ce330472c4f166db0895acc1e1d6ecad53f86af7f0ac
7
- data.tar.gz: f5224db527d4636e9006ea332813d9cd133ea221506aa7a45edecc6fd230f212e21f4db8d1757a070c753876d6b003329a381b0f9a48663c966318b9fb2d1c86
6
+ metadata.gz: c9c744045ee462b8981ab418a5c3acdb9162c36e14c8603391c1f79e9247872de877a83ce2a7dd5f8779d8774a044562054445410666a5af518d11f5fded3f22
7
+ data.tar.gz: e4e0e2b2d19596493564a361fef0a98afdddcefb395309dba139df6bd508a12385ca16e47004bd503c585954454d9608449995b4ed6009926e0b856d11962181
data/CHANGELOG.md CHANGED
@@ -9,6 +9,15 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.9.2
13
+ ### Added
14
+ - ...
15
+ ### Changed/Removed
16
+ - Updated `wgit` gem to version 0.4.0 which brings a speed boost to crawls.
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
12
21
  ## v0.9.1
13
22
  ### Added
14
23
  - `BrokenLinkFinder::Finder.crawl_site` alias: `crawl_r`.
data/Gemfile CHANGED
@@ -2,5 +2,7 @@
2
2
 
3
3
  source 'https://rubygems.org'
4
4
 
5
+ ruby '~> 2.5'
6
+
5
7
  # Specify your gem's dependencies in broken_link_finder.gemspec
6
8
  gemspec
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- broken_link_finder (0.9.1)
5
- thor (= 0.20.3)
6
- thread (= 0.2)
7
- wgit (= 0.2.0)
4
+ broken_link_finder (0.9.2)
5
+ thor (~> 0.20.3)
6
+ thread (~> 0.2.0)
7
+ wgit (~> 0.4.0)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
@@ -16,15 +16,16 @@ GEM
16
16
  coderay (1.1.2)
17
17
  crack (0.4.3)
18
18
  safe_yaml (~> 1.0.0)
19
- hashdiff (0.4.0)
20
- httplog (1.3.2)
21
- rack (>= 1.0)
22
- rainbow (>= 2.0.0)
23
- memory_profiler (0.9.14)
19
+ ethon (0.12.0)
20
+ ffi (>= 1.3.0)
21
+ ffi (1.11.1)
22
+ hashdiff (1.0.0)
23
+ maxitest (3.4.0)
24
+ minitest (>= 5.0.0, < 5.13.0)
24
25
  method_source (0.9.2)
25
26
  mini_portile2 (2.4.0)
26
- minitest (5.11.3)
27
- mongo (2.9.1)
27
+ minitest (5.12.2)
28
+ mongo (2.9.2)
28
29
  bson (>= 4.4.2, < 5.0.0)
29
30
  nokogiri (1.10.4)
30
31
  mini_portile2 (~> 2.4.0)
@@ -32,20 +33,21 @@ GEM
32
33
  coderay (~> 1.1.0)
33
34
  method_source (~> 0.9.0)
34
35
  public_suffix (3.1.0)
35
- rack (2.0.7)
36
- rainbow (3.0.0)
37
36
  rake (10.5.0)
38
37
  safe_yaml (1.0.5)
39
38
  thor (0.20.3)
40
- thread (0.2.0)
41
- webmock (3.5.1)
39
+ thread (0.2.2)
40
+ typhoeus (1.3.1)
41
+ ethon (>= 0.9.0)
42
+ webmock (3.7.6)
42
43
  addressable (>= 2.3.6)
43
44
  crack (>= 0.3.2)
44
- hashdiff
45
- wgit (0.2.0)
45
+ hashdiff (>= 0.4.0, < 2.0.0)
46
+ wgit (0.4.0)
46
47
  addressable (~> 2.6.0)
47
48
  mongo (~> 2.9.0)
48
49
  nokogiri (~> 1.10.3)
50
+ typhoeus (~> 1.3.1)
49
51
 
50
52
  PLATFORMS
51
53
  ruby
@@ -54,12 +56,13 @@ DEPENDENCIES
54
56
  broken_link_finder!
55
57
  bundler (~> 2.0)
56
58
  byebug (~> 11.0)
57
- httplog (~> 1.3)
58
- memory_profiler (~> 0.9)
59
- minitest (~> 5.0)
59
+ maxitest (~> 3.3)
60
60
  pry (~> 0.12)
61
61
  rake (~> 10.0)
62
- webmock (~> 3.5)
62
+ webmock (~> 3.6)
63
+
64
+ RUBY VERSION
65
+ ruby 2.5.3p105
63
66
 
64
67
  BUNDLED WITH
65
68
  2.0.1
data/README.md CHANGED
@@ -19,9 +19,11 @@ In a nutshell, only HTTP(S) based links can be successfully verified by `broken_
19
19
 
20
20
  See the [usage](#Usage) section below on how to check which links have been ignored during a crawl.
21
21
 
22
+ With that said, the usual array of HTTP URL features are supported including anchors/fragments, query strings and IRIs (non-ASCII based URLs).
23
+
22
24
  ## Made Possible By
23
25
 
24
- `broken_link_finder` relies heavily on the `wgit` Ruby gem. See its [repository](https://github.com/michaeltelford/wgit) for more details.
26
+ `broken_link_finder` relies heavily on the `wgit` Ruby gem by the same author. See its [repository](https://github.com/michaeltelford/wgit) for more details.
25
27
 
26
28
  ## Installation
27
29
 
data/benchmark.rb CHANGED
@@ -2,7 +2,6 @@
2
2
 
3
3
  require_relative './lib/broken_link_finder'
4
4
  require 'benchmark'
5
- require 'memory_profiler'
6
5
 
7
6
  url = ARGV[0] || 'http://txti.es'
8
7
  finder = BrokenLinkFinder::Finder.new
data/bin/console CHANGED
@@ -4,30 +4,20 @@
4
4
  require 'bundler/setup'
5
5
  require 'pry'
6
6
  require 'byebug'
7
- require 'logger'
8
- require 'httplog'
9
7
  require 'broken_link_finder'
10
8
 
11
- logger = Logger.new(STDOUT)
12
- logger.formatter = proc do |_severity, _datetime, _progname, msg|
13
- "#{msg}\n"
14
- end
15
-
16
- # Monkey patch all Net:HTTP network calls and log them.
17
- HttpLog.configure do |config|
18
- config.enabled = true
19
- config.logger = logger
20
-
21
- config.log_connect = false
22
- config.log_request = true
23
- config.log_headers = false
24
- config.log_data = false
25
- config.log_status = true
26
- config.log_response = false
27
- config.log_benchmark = false
28
-
29
- config.compact_log = false
30
- config.json_log = false
9
+ # Monkey patch Typhoeus.get to log all HTTP GET requests made during the console session.
10
+ module Typhoeus
11
+ singleton_class.class_eval do
12
+ alias_method :orig_get, :get
13
+ end
14
+
15
+ def self.get(base_url, options = {})
16
+ puts "[typhoeus] Sending GET: #{base_url}"
17
+ resp = orig_get(base_url, options)
18
+ puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
19
+ resp
20
+ end
31
21
  end
32
22
 
33
23
  # Call reload to load all recent code changes.
@@ -48,6 +38,7 @@ by_page = Finder.new
48
38
  by_link = Finder.new sort: :link
49
39
  finder = by_page
50
40
 
41
+ # Start the console.
51
42
  puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
52
43
 
53
44
  binding.pry
@@ -39,14 +39,12 @@ Gem::Specification.new do |spec|
39
39
 
40
40
  spec.add_development_dependency 'bundler', '~> 2.0'
41
41
  spec.add_development_dependency 'byebug', '~> 11.0'
42
- spec.add_development_dependency 'httplog', '~> 1.3'
43
- spec.add_development_dependency 'memory_profiler', '~> 0.9'
44
- spec.add_development_dependency 'minitest', '~> 5.0'
42
+ spec.add_development_dependency 'maxitest', '~> 3.3'
45
43
  spec.add_development_dependency 'pry', '~> 0.12'
46
44
  spec.add_development_dependency 'rake', '~> 10.0'
47
- spec.add_development_dependency 'webmock', '~> 3.5'
45
+ spec.add_development_dependency 'webmock', '~> 3.6'
48
46
 
49
- spec.add_runtime_dependency 'thor', '0.20.3'
50
- spec.add_runtime_dependency 'thread', '0.2'
51
- spec.add_runtime_dependency 'wgit', '0.2.0'
47
+ spec.add_runtime_dependency 'thor', '~> 0.20.3'
48
+ spec.add_runtime_dependency 'thread', '~> 0.2.0'
49
+ spec.add_runtime_dependency 'wgit', '~> 0.4.0'
52
50
  end
@@ -39,7 +39,7 @@ module BrokenLinkFinder
39
39
 
40
40
  # Finds broken links within a single page and appends them to the
41
41
  # @broken_links array. Returns true if at least one broken link was found.
42
- # Access the broken links with Finder#broken_links.
42
+ # Access the broken links afterwards with Finder#broken_links.
43
43
  def crawl_url(url)
44
44
  clear_links
45
45
 
@@ -61,7 +61,7 @@ module BrokenLinkFinder
61
61
  # Finds broken links within an entire site and appends them to the
62
62
  # @broken_links array. Returns a tuple containing a Boolean of true if
63
63
  # at least one broken link was found and an Array of all pages crawled.
64
- # Access the broken links with Finder#broken_links.
64
+ # Access the broken links afterwards with Finder#broken_links.
65
65
  def crawl_site(url)
66
66
  clear_links
67
67
 
@@ -70,7 +70,7 @@ module BrokenLinkFinder
70
70
  crawled_pages = []
71
71
 
72
72
  # Crawl the site's HTML web pages looking for links.
73
- orig_doc = @crawler.crawl_site(url) do |doc|
73
+ externals = @crawler.crawl_site(url) do |doc|
74
74
  crawled_pages << doc.url
75
75
  next unless doc
76
76
 
@@ -79,7 +79,7 @@ module BrokenLinkFinder
79
79
  end
80
80
 
81
81
  # Ensure the given website url is valid.
82
- raise "Invalid or broken URL: #{url}" if orig_doc.nil?
82
+ raise "Invalid or broken URL: #{url}" unless externals
83
83
 
84
84
  # Wait for all threads to finish.
85
85
  pool.shutdown
@@ -113,15 +113,7 @@ module BrokenLinkFinder
113
113
 
114
114
  # Finds which links are unsupported or broken and records the details.
115
115
  def find_broken_links(doc)
116
- # Report and reject any non supported links.
117
- links = doc.all_links
118
- .reject do |link|
119
- if link.is_absolute? && !link.start_with?('http')
120
- append_ignored_link(doc.url, link)
121
- true
122
- end
123
- end
124
- .uniq
116
+ links = get_supported_links(doc)
125
117
 
126
118
  # Iterate over the supported links checking if they're broken or not.
127
119
  links.each do |link|
@@ -134,11 +126,10 @@ module BrokenLinkFinder
134
126
  end
135
127
 
136
128
  # The link hasn't been processed before so we crawl it.
137
- link_url = get_absolute_link(doc, link)
138
- link_doc = @crawler.crawl_url(link_url)
129
+ link_doc = crawl_link(doc, link)
139
130
 
140
131
  # Determine if the crawled link is broken or not.
141
- if @crawler.last_response.is_a?(Net::HTTPNotFound) ||
132
+ if @crawler.last_response.code == 404 ||
142
133
  link_doc.nil? ||
143
134
  has_broken_anchor(link_doc)
144
135
  append_broken_link(doc.url, link)
@@ -150,6 +141,24 @@ module BrokenLinkFinder
150
141
  nil
151
142
  end
152
143
 
144
+ # Report and reject any non supported links. Any link that is absolute and
145
+ # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
146
+ def get_supported_links(doc)
147
+ doc.all_links
148
+ .reject do |link|
149
+ if link.is_absolute? && !link.start_with?('http')
150
+ append_ignored_link(doc.url, link)
151
+ true
152
+ end
153
+ end
154
+ end
155
+
156
+ # Makes the link absolute and crawls it, returning its Wgit::Document.
157
+ def crawl_link(doc, link)
158
+ link = get_absolute_link(doc, link)
159
+ @crawler.crawl_url(link)
160
+ end
161
+
153
162
  # Returns the link in absolute form so it can be crawled.
154
163
  def get_absolute_link(doc, link)
155
164
  link.is_relative? ? doc.base_url(link: link).concat(link) : link
@@ -12,10 +12,10 @@ module BrokenLinkFinder
12
12
  raise "sort by either :page or :link, not #{sort}" \
13
13
  unless %i[page link].include?(sort)
14
14
 
15
- @stream = stream
16
- @sort = sort
17
- @broken_links = broken_links
18
- @ignored_links = ignored_links
15
+ @stream = stream
16
+ @sort = sort
17
+ @broken_links = broken_links
18
+ @ignored_links = ignored_links
19
19
  end
20
20
 
21
21
  # Pretty print a report detailing the link summary.
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- VERSION = '0.9.1'
4
+ VERSION = '0.9.2'
5
5
  end
@@ -7,5 +7,5 @@ Wgit::Document.define_extension(
7
7
  singleton: false,
8
8
  text_content_only: true
9
9
  ) do |links|
10
- links&.map(&:to_url)&.uniq
10
+ links.uniq.to_urls
11
11
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: broken_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.1
4
+ version: 0.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-22 00:00:00.000000000 Z
11
+ date: 2019-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -39,47 +39,19 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '11.0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: httplog
42
+ name: maxitest
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.3'
47
+ version: '3.3'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.3'
55
- - !ruby/object:Gem::Dependency
56
- name: memory_profiler
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: '0.9'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: '0.9'
69
- - !ruby/object:Gem::Dependency
70
- name: minitest
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: '5.0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: '5.0'
54
+ version: '3.3'
83
55
  - !ruby/object:Gem::Dependency
84
56
  name: pry
85
57
  requirement: !ruby/object:Gem::Requirement
@@ -114,56 +86,56 @@ dependencies:
114
86
  requirements:
115
87
  - - "~>"
116
88
  - !ruby/object:Gem::Version
117
- version: '3.5'
89
+ version: '3.6'
118
90
  type: :development
119
91
  prerelease: false
120
92
  version_requirements: !ruby/object:Gem::Requirement
121
93
  requirements:
122
94
  - - "~>"
123
95
  - !ruby/object:Gem::Version
124
- version: '3.5'
96
+ version: '3.6'
125
97
  - !ruby/object:Gem::Dependency
126
98
  name: thor
127
99
  requirement: !ruby/object:Gem::Requirement
128
100
  requirements:
129
- - - '='
101
+ - - "~>"
130
102
  - !ruby/object:Gem::Version
131
103
  version: 0.20.3
132
104
  type: :runtime
133
105
  prerelease: false
134
106
  version_requirements: !ruby/object:Gem::Requirement
135
107
  requirements:
136
- - - '='
108
+ - - "~>"
137
109
  - !ruby/object:Gem::Version
138
110
  version: 0.20.3
139
111
  - !ruby/object:Gem::Dependency
140
112
  name: thread
141
113
  requirement: !ruby/object:Gem::Requirement
142
114
  requirements:
143
- - - '='
115
+ - - "~>"
144
116
  - !ruby/object:Gem::Version
145
- version: '0.2'
117
+ version: 0.2.0
146
118
  type: :runtime
147
119
  prerelease: false
148
120
  version_requirements: !ruby/object:Gem::Requirement
149
121
  requirements:
150
- - - '='
122
+ - - "~>"
151
123
  - !ruby/object:Gem::Version
152
- version: '0.2'
124
+ version: 0.2.0
153
125
  - !ruby/object:Gem::Dependency
154
126
  name: wgit
155
127
  requirement: !ruby/object:Gem::Requirement
156
128
  requirements:
157
- - - '='
129
+ - - "~>"
158
130
  - !ruby/object:Gem::Version
159
- version: 0.2.0
131
+ version: 0.4.0
160
132
  type: :runtime
161
133
  prerelease: false
162
134
  version_requirements: !ruby/object:Gem::Requirement
163
135
  requirements:
164
- - - '='
136
+ - - "~>"
165
137
  - !ruby/object:Gem::Version
166
- version: 0.2.0
138
+ version: 0.4.0
167
139
  description: Finds a website's broken links using the 'wgit' gem and reports back
168
140
  to you with a summary.
169
141
  email: michael.telford@live.com