deadfinder 1.5.1 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8ddb8f1182cbb94fdb1c270ff6d0af2ae176fdb2376a52496fc535c69a1c576c
-  data.tar.gz: 5f49781224441d738fade165e0fae6c851c6739de2959e8c32601ebd9495e16c
+  metadata.gz: d8c1697ba3269001737ec44650a3f27ffd73cbf5467bd110df7e09cc7fc85ba5
+  data.tar.gz: f7ac73c33b9862cdbaaa661712d4969a999d73e12cdf4e6e725f9280fe67203b
 SHA512:
-  metadata.gz: 5875750e01d1b909390a566b94a4daeeee5e2df3da81528d2c2cf4f831ef92c5e1d46732bbe4425cf5b0ba751136faa9ac7cb0dffeb6beefa0a4fabfa805e2b7
-  data.tar.gz: bc62403a89307870b0166f530928f8ce1c951e131818b3ac55bc6d879e5e8273189dc120ee51740fccb3eaf2f473bc5d9bcfbb3441ae17c1793fe83f9c0de716
+  metadata.gz: c59678fbca26d0caa131cc31c931ddb43712bae58f389fb8b5207eb4564904e8ed29b746645ec2802d7379be2f98e502a87207dbcf5d56bb5dfbe5d8b211448d
+  data.tar.gz: ddb8e07bda9415278724e09b559bc3e76d49c56a9bc8143d7a5e4426914c6034884a9b90bccf34982c2097d79e28d80833bf1025beee16b3d8dfe9e1b3a9f2d6
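These digests let you verify a downloaded copy of the gem against what the registry published. A minimal check with Ruby's standard Digest library (a sketch; the local file names are assumptions about where the unpacked gem contents live):

require 'digest'

# Hypothetical paths to the two files packed inside deadfinder-1.6.1.gem.
%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name} SHA256: #{Digest::SHA256.file(name).hexdigest}"
  puts "#{name} SHA512: #{Digest::SHA512.file(name).hexdigest}"
end

The printed values should match the + lines above for 1.6.1.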
data/bin/deadfinder CHANGED
@@ -1,4 +1,4 @@
 #!/usr/bin/env ruby
 
 require 'deadfinder'
-DeadFinder.start(ARGV)
+DeadFinder::CLI.start(ARGV)
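The executable now dispatches to DeadFinder::CLI rather than a top-level DeadFinder Thor class, reflecting the namespacing rework in lib/deadfinder.rb below. For anyone driving the tool from Ruby instead of a shell, the 1.6.x equivalent of the old entry point looks like this (a minimal sketch, assuming the gem is installed):

require 'deadfinder'

# Same effect as running `deadfinder url https://example.com` from a shell;
# in 1.5.x this call was DeadFinder.start(ARGV).
DeadFinder::CLI.start(%w[url https://example.com])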
data/lib/deadfinder/utils.rb CHANGED
@@ -4,41 +4,24 @@ require 'uri'
 
 def generate_url(text, base_url)
   node = text.to_s
+  return node if node.start_with?('http://', 'https://')
+
   begin
-    unless node.start_with?('http://', 'https://')
-      uri = URI(base_url)
-      if node.start_with? '//'
-        return "#{uri.scheme}:#{node}"
-      elsif node.start_with? '/'
-        return "#{uri.scheme}://#{uri.host}#{node}"
-      elsif ignore_scheme? node
-        return nil
-      else
-        return "#{extract_directory(uri)}#{node}"
-      end
+    uri = URI(base_url)
+    if node.start_with?('//')
+      "#{uri.scheme}:#{node}"
+    elsif node.start_with?('/')
+      "#{uri.scheme}://#{uri.host}#{node}"
+    elsif ignore_scheme?(node)
+      nil
+    else
+      URI.join(base_url, node).to_s
     end
   rescue StandardError
-    # puts e
+    nil
   end
-  node
 end
 
 def ignore_scheme?(url)
   url.start_with?('mailto:', 'tel:', 'sms:', 'data:', 'file:')
 end
-
-def extract_directory(uri)
-  return "#{uri.scheme}://#{uri.host}#{uri.path}" if uri.path.end_with?('/')
-
-  path_components = uri.path.split('/')
-  path_components.last
-  path_components.pop
-
-  directory_path = path_components.join('/')
-
-  if directory_path.start_with?('/')
-    "#{uri.scheme}://#{uri.host}#{directory_path}/"
-  else
-    "#{uri.scheme}://#{uri.host}/#{directory_path}/"
-  end
-end
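generate_url now returns early for absolute URLs, resolves plain relative paths with URI.join from Ruby's standard library instead of the removed extract_directory helper, and returns nil on parse errors rather than falling through to the raw node. URI.join implements RFC 3986 reference resolution, which covers what the old string-splitting code approximated by hand; for example:

require 'uri'

# A relative reference resolves against the base document's directory.
URI.join('https://example.com/blog/post.html', 'image.png').to_s
# => "https://example.com/blog/image.png"

# Dot segments are normalized too, which the old helper never handled.
URI.join('https://example.com/blog/', '../about').to_s
# => "https://example.com/about"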
data/lib/deadfinder/version.rb CHANGED
@@ -1,3 +1,3 @@
 # frozen_string_literal: true
 
-VERSION = '1.5.1'
+VERSION = '1.6.1'
data/lib/deadfinder.rb CHANGED
@@ -10,224 +10,243 @@ require 'deadfinder/version'
 require 'concurrent-edge'
 require 'sitemap-parser'
 require 'json'
+require 'yaml'
+require 'csv'
 
-Channel = Concurrent::Channel
-CACHE_SET = Concurrent::Map.new
-CACHE_QUE = Concurrent::Map.new
-OUTPUT = {}
-
-class DeadFinderRunner
-  def default_options
-    {
-      'concurrency' => 50,
-      'timeout' => 10,
-      'output' => '',
-      'headers' => [],
-      'worker_headers' => [],
-      'silent' => true,
-      'verbose' => false,
-      'include30x' => false
-    }
+module DeadFinder
+  Channel = Concurrent::Channel
+  CACHE_SET = Concurrent::Map.new
+  CACHE_QUE = Concurrent::Map.new
+
+  @output = {}
+  def self.output
+    @output
   end
 
-  def run(target, options)
-    Logger.set_silent if options['silent']
-    headers = options['headers'].each_with_object({}) do |header, hash|
-      kv = header.split(': ')
-      hash[kv[0]] = kv[1]
-    rescue StandardError
+  def self.output=(val)
+    @output = val
+  end
+
+  class Runner
+    def default_options
+      {
+        'concurrency' => 50,
+        'timeout' => 10,
+        'output' => '',
+        'output_format' => 'json',
+        'headers' => [],
+        'worker_headers' => [],
+        'silent' => true,
+        'verbose' => false,
+        'include30x' => false
+      }
     end
-    page = Nokogiri::HTML(URI.open(target, headers))
-    links = extract_links(page)
 
-    total_links_count = links.values.flatten.length
-    # Generate link info string for non-empty link types
-    link_info = links.map { |type, urls| "#{type}:#{urls.length}" if urls.length.positive? }.compact.join(' / ')
+    def run(target, options)
+      Logger.set_silent if options['silent']
+      headers = options['headers'].each_with_object({}) do |header, hash|
+        kv = header.split(': ')
+        hash[kv[0]] = kv[1]
+      rescue StandardError
+      end
+      page = Nokogiri::HTML(URI.open(target, headers))
+      links = extract_links(page)
 
-    # Log the information if there are any links
-    Logger.sub_info "Found #{total_links_count} URLs. [#{link_info}]" unless link_info.empty?
-    Logger.sub_info 'Checking'
+      total_links_count = links.values.flatten.length
+      link_info = links.map { |type, urls| "#{type}:#{urls.length}" if urls.length.positive? }
+                       .compact.join(' / ')
+      Logger.sub_info "Found #{total_links_count} URLs. [#{link_info}]" unless link_info.empty?
+      Logger.sub_info 'Checking'
 
-    jobs = Channel.new(buffer: :buffered, capacity: 1000)
-    results = Channel.new(buffer: :buffered, capacity: 1000)
+      jobs = Channel.new(buffer: :buffered, capacity: 1000)
+      results = Channel.new(buffer: :buffered, capacity: 1000)
 
-    (1..options['concurrency']).each do |w|
-      Channel.go { worker(w, jobs, results, target, options) }
-    end
+      (1..options['concurrency']).each do |w|
+        Channel.go { worker(w, jobs, results, target, options) }
+      end
 
-    links.values.flatten.uniq.each do |node|
-      result = generate_url(node, target)
-      jobs << result unless result.nil?
-    end
+      links.values.flatten.uniq.each do |node|
+        result = generate_url(node, target)
+        jobs << result unless result.nil?
+      end
 
-    jobs_size = jobs.size
-    jobs.close
+      jobs_size = jobs.size
+      jobs.close
 
-    (1..jobs_size).each do
-      ~results
+      (1..jobs_size).each { ~results }
+      Logger.sub_done 'Done'
+    rescue StandardError => e
+      Logger.error "[#{e}] #{target}"
     end
-    Logger.sub_done 'Done'
-  rescue StandardError => e
-    Logger.error "[#{e}] #{target}"
-  end
 
-  def worker(_id, jobs, results, target, options)
-    jobs.each do |j|
-      if CACHE_SET[j]
-        Logger.found "[404 Not Found] #{j}" unless CACHE_QUE[j]
-      else
-        CACHE_SET[j] = true
-        begin
-          CACHE_QUE[j] = true
-          uri = URI.parse(j)
-
-          # Create HTTP request with timeout and headers
-          proxy_uri = URI.parse(options['proxy']) if options['proxy'] && !options['proxy'].empty?
-          http = if proxy_uri
-                   Net::HTTP.new(uri.host, uri.port, proxy_uri.host, proxy_uri.port, proxy_uri.user, proxy_uri.password)
-                 else
-                   Net::HTTP.new(uri.host, uri.port)
-                 end
-          http.use_ssl = (uri.scheme == 'https')
-          http.read_timeout = options['timeout'].to_i if options['timeout']
-
-          # Set SSL verification mode
-          http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
-
-          request = Net::HTTP::Get.new(uri.request_uri)
-
-          # Add User-Agent header
-          request['User-Agent'] = options['user_agent']
-
-          # Add worker headers if provided
-          options['worker_headers']&.each do |header|
-            key, value = header.split(':', 2)
-            request[key.strip] = value.strip
+    def worker(_id, jobs, results, target, options)
+      jobs.each do |j|
+        if CACHE_SET[j]
+          Logger.found "[404 Not Found] #{j}" unless CACHE_QUE[j]
+        else
+          CACHE_SET[j] = true
+          begin
+            CACHE_QUE[j] = true
+            uri = URI.parse(j)
+            proxy_uri = URI.parse(options['proxy']) if options['proxy'] && !options['proxy'].empty?
+            http = if proxy_uri
+                     Net::HTTP.new(uri.host, uri.port,
+                                   proxy_uri.host, proxy_uri.port,
+                                   proxy_uri.user, proxy_uri.password)
+                   else
+                     Net::HTTP.new(uri.host, uri.port)
+                   end
+            http.use_ssl = (uri.scheme == 'https')
+            http.read_timeout = options['timeout'].to_i if options['timeout']
+            http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
+
+            request = Net::HTTP::Get.new(uri.request_uri)
+            request['User-Agent'] = options['user_agent']
+            options['worker_headers']&.each do |header|
+              key, value = header.split(':', 2)
+              request[key.strip] = value.strip
+            end
+
+            response = http.request(request)
+            status_code = response.code.to_i
+            Logger.verbose "Status Code: #{status_code} for #{j}" if options['verbose']
+
+            if status_code >= 400 || (status_code >= 300 && options['include30x'])
+              Logger.found "[#{status_code} #{response.message}] #{j}"
+              CACHE_QUE[j] = false
+              DeadFinder.output[target] ||= []
+              DeadFinder.output[target] << j
+            end
+          rescue StandardError => e
+            Logger.verbose "[#{e}] #{j}" if options['verbose']
           end
-
-          response = http.request(request)
-          status_code = response.code.to_i
-          Logger.verbose "Status Code: #{status_code} for #{j}" if options['verbose']
-
-          if status_code >= 400 || (status_code >= 300 && options['include30x'])
-            Logger.found "[#{status_code} #{response.message}] #{j}"
-            CACHE_QUE[j] = false
-            OUTPUT[target] ||= []
-            OUTPUT[target] << j
-          end
-        rescue StandardError => e
-          Logger.verbose "[#{e}] #{j}" if options['verbose']
         end
+        results << j
       end
-      results << j
     end
-  end
 
-  private
-
-  def extract_links(page)
-    {
-      anchor: page.css('a').map { |element| element['href'] }.compact,
-      script: page.css('script').map { |element| element['src'] }.compact,
-      link: page.css('link').map { |element| element['href'] }.compact,
-      iframe: page.css('iframe').map { |element| element['src'] }.compact,
-      form: page.css('form').map { |element| element['action'] }.compact,
-      object: page.css('object').map { |element| element['data'] }.compact,
-      embed: page.css('embed').map { |element| element['src'] }.compact
-    }
+    private
+
+    def extract_links(page)
+      {
+        anchor: page.css('a').map { |element| element['href'] }.compact,
+        script: page.css('script').map { |element| element['src'] }.compact,
+        link: page.css('link').map { |element| element['href'] }.compact,
+        iframe: page.css('iframe').map { |element| element['src'] }.compact,
+        form: page.css('form').map { |element| element['action'] }.compact,
+        object: page.css('object').map { |element| element['data'] }.compact,
+        embed: page.css('embed').map { |element| element['src'] }.compact
+      }
+    end
   end
-end
 
-def run_pipe(options)
-  Logger.set_silent if options['silent']
-
-  Logger.info 'Reading from STDIN'
-  app = DeadFinderRunner.new
-  while $stdin.gets
-    target = $LAST_READ_LINE.chomp
-    Logger.target "Checking: #{target}"
-    app.run target, options
+  def self.run_pipe(options)
+    Logger.set_silent if options['silent']
+    Logger.info 'Reading from STDIN'
+    app = Runner.new
+    while $stdin.gets
+      target = $LAST_READ_LINE.chomp
+      Logger.target "Checking: #{target}"
+      app.run target, options
+    end
+    gen_output(options)
   end
-  gen_output(options)
-end
-
-def run_file(filename, options)
-  Logger.set_silent if options['silent']
 
-  Logger.info "Reading: #{filename}"
-  app = DeadFinderRunner.new
-  File.foreach(filename) do |line|
-    target = line.chomp
-    Logger.target "Checking: #{target}"
-    app.run target, options
+  def self.run_file(filename, options)
+    Logger.set_silent if options['silent']
+    Logger.info "Reading: #{filename}"
+    app = Runner.new
+    File.foreach(filename) do |line|
+      target = line.chomp
+      Logger.target "Checking: #{target}"
+      app.run target, options
+    end
+    gen_output(options)
   end
-  gen_output(options)
-end
-
-def run_url(url, options)
-  Logger.set_silent if options['silent']
-
-  Logger.target "Checking: #{url}"
-  app = DeadFinderRunner.new
-  app.run url, options
-  gen_output(options)
-end
 
-def run_sitemap(sitemap_url, options)
-  Logger.set_silent if options['silent']
-  Logger.info "Parsing sitemap: #{sitemap_url}"
-  app = DeadFinderRunner.new
-  base_uri = URI(sitemap_url)
-  sitemap = SitemapParser.new sitemap_url, { recurse: true }
-  sitemap.to_a.each do |url|
-    turl = generate_url url, base_uri
-    Logger.target "Checking: #{turl}"
-    app.run turl, options
+  def self.run_url(url, options)
+    Logger.set_silent if options['silent']
+    Logger.target "Checking: #{url}"
+    app = Runner.new
+    app.run url, options
+    gen_output(options)
   end
-  gen_output(options)
-end
 
-def gen_output(options)
-  output_data = OUTPUT.to_h
-  File.write(options['output'], JSON.pretty_generate(output_data)) unless options['output'].empty?
-end
-
-class DeadFinder < Thor
-  class_option :include30x, aliases: :r, default: false, type: :boolean, desc: 'Include 30x redirections'
-  class_option :concurrency, aliases: :c, default: 50, type: :numeric, desc: 'Number of concurrency'
-  class_option :timeout, aliases: :t, default: 10, type: :numeric, desc: 'Timeout in seconds'
-  class_option :output, aliases: :o, default: '', type: :string, desc: 'File to write JSON result'
-  class_option :headers, aliases: :H, default: [], type: :array,
-               desc: 'Custom HTTP headers to send with initial request'
-  class_option :worker_headers, default: [], type: :array, desc: 'Custom HTTP headers to send with worker requests'
-  class_option :user_agent, default: 'Mozilla/5.0 (compatible; DeadFinder/1.5.1;)', type: :string,
-               desc: 'User-Agent string to use for requests'
-  class_option :proxy, aliases: :p, default: '', type: :string, desc: 'Proxy server to use for requests'
-  class_option :silent, aliases: :s, default: false, type: :boolean, desc: 'Silent mode'
-  class_option :verbose, aliases: :v, default: false, type: :boolean, desc: 'Verbose mode'
-
-  desc 'pipe', 'Scan the URLs from STDIN. (e.g cat urls.txt | deadfinder pipe)'
-  def pipe
-    run_pipe options
+  def self.run_sitemap(sitemap_url, options)
+    Logger.set_silent if options['silent']
+    Logger.info "Parsing sitemap: #{sitemap_url}"
+    app = Runner.new
+    base_uri = URI(sitemap_url)
+    sitemap = SitemapParser.new sitemap_url, { recurse: true }
+    sitemap.to_a.each do |url|
+      turl = generate_url(url, base_uri)
+      Logger.target "Checking: #{turl}"
+      app.run turl, options
+    end
+    gen_output(options)
   end
 
-  desc 'file <FILE>', 'Scan the URLs from File. (e.g deadfinder file urls.txt)'
-  def file(filename)
-    run_file filename, options
+  def self.gen_output(options)
+    return if options['output'].empty?
+
+    output_data = DeadFinder.output.to_h
+    format = options['output_format'].to_s.downcase
+
+    content = case format
+              when 'yaml', 'yml'
+                output_data.to_yaml
+              when 'csv'
+                CSV.generate do |csv|
+                  csv << %w[target url]
+                  output_data.each do |target, urls|
+                    Array(urls).each { |url| csv << [target, url] }
+                  end
+                end
+              else
+                JSON.pretty_generate(output_data)
+              end
+
+    File.write(options['output'], content)
   end
 
-  desc 'url <URL>', 'Scan the Single URL.'
-  def url(url)
-    run_url url, options
-  end
+  class CLI < Thor
+    class_option :include30x, aliases: :r, default: false, type: :boolean, desc: 'Include 30x redirections'
+    class_option :concurrency, aliases: :c, default: 50, type: :numeric, desc: 'Number of concurrency'
+    class_option :timeout, aliases: :t, default: 10, type: :numeric, desc: 'Timeout in seconds'
+    class_option :output, aliases: :o, default: '', type: :string, desc: 'File to write result (e.g., json, yaml, csv)'
+    class_option :output_format, aliases: :f, default: 'json', type: :string, desc: 'Output format'
+    class_option :headers, aliases: :H, default: [], type: :array,
+                 desc: 'Custom HTTP headers to send with initial request'
+    class_option :worker_headers, default: [], type: :array, desc: 'Custom HTTP headers to send with worker requests'
+    class_option :user_agent, default: 'Mozilla/5.0 (compatible; DeadFinder/1.6.1;)', type: :string,
+                 desc: 'User-Agent string to use for requests'
+    class_option :proxy, aliases: :p, default: '', type: :string, desc: 'Proxy server to use for requests'
+    class_option :silent, aliases: :s, default: false, type: :boolean, desc: 'Silent mode'
+    class_option :verbose, aliases: :v, default: false, type: :boolean, desc: 'Verbose mode'
+
+    desc 'pipe', 'Scan the URLs from STDIN. (e.g., cat urls.txt | deadfinder pipe)'
+    def pipe
+      DeadFinder.run_pipe options
+    end
 
-  desc 'sitemap <SITEMAP-URL>', 'Scan the URLs from sitemap.'
-  def sitemap(sitemap)
-    run_sitemap sitemap, options
-  end
+    desc 'file <FILE>', 'Scan the URLs from File. (e.g., deadfinder file urls.txt)'
+    def file(filename)
+      DeadFinder.run_file filename, options
+    end
 
-  desc 'version', 'Show version.'
-  def version
-    Logger.info "deadfinder #{VERSION}"
+    desc 'url <URL>', 'Scan the Single URL.'
+    def url(url)
+      DeadFinder.run_url url, options
+    end
+
+    desc 'sitemap <SITEMAP-URL>', 'Scan the URLs from sitemap.'
+    def sitemap(sitemap)
+      DeadFinder.run_sitemap sitemap, options
+    end
+
+    desc 'version', 'Show version.'
+    def version
+      Logger.info "deadfinder #{VERSION}"
+    end
   end
 end
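Two changes stand out in this file: everything moved inside module DeadFinder (the Thor commands into DeadFinder::CLI, the scan logic into DeadFinder::Runner, and shared results from a global OUTPUT constant to the DeadFinder.output accessor), and gen_output can now serialize results as YAML or CSV in addition to JSON, chosen via the new output_format option (alias -f). What each format produces for a sample result set (a standalone sketch with made-up data, independent of the gem):

require 'json'
require 'yaml'
require 'csv'

# Shape of DeadFinder.output: target page => dead links found on it.
output_data = { 'https://example.com' => ['https://example.com/broken'] }

puts JSON.pretty_generate(output_data) # default format
puts output_data.to_yaml               # -f yaml
puts(CSV.generate do |csv|             # -f csv
  csv << %w[target url]
  output_data.each { |target, urls| urls.each { |url| csv << [target, url] } }
end)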
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: deadfinder
 version: !ruby/object:Gem::Version
-  version: 1.5.1
+  version: 1.6.1
 platform: ruby
 authors:
 - hahwul
 bindir: bin
 cert_chain: []
-date: 2024-12-26 00:00:00.000000000 Z
+date: 2025-02-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: colorize
@@ -169,6 +169,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 1.2.0
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Find dead-links (broken links). Dead link (broken link) means a link
   within a web page that cannot be connected. These links can have a negative impact
   to SEO and Security. This tool makes it easy to identify and modify.
@@ -183,7 +197,7 @@ files:
 - lib/deadfinder/logger.rb
 - lib/deadfinder/utils.rb
 - lib/deadfinder/version.rb
-homepage: https://www.hahwul.com
+homepage: https://www.hahwul.com/projects/deadfinder/
 licenses:
 - MIT
 metadata:
@@ -203,7 +217,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.6.2
+rubygems_version: 3.6.3
 specification_version: 4
 summary: Find dead-links (broken links)
 test_files: []