wayback_machine_downloader_straw 2.4.0 → 2.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 35a8c4a865a9da5cb45e7f63e2f832f491895f5c69c3d440b9c8b4230b8444f1
4
- data.tar.gz: a96d746b41f3e3b7a1cf6df38df3b23a79361f57f667eea562be72961bf391c2
3
+ metadata.gz: ce7592163165a7f8235bf4a6e1915cf531511fafc7f6874c0d1673fb29db704f
4
+ data.tar.gz: 7d48ffebf130d3b32d1ec233cf5141cc3cf192bcf16751db4380bf62863971c1
5
5
  SHA512:
6
- metadata.gz: 783bb658ee95bd523fb3dc8c2c11a027947becc4e72902e2fff85eb725bbc8e3ef8e7bb22b08598f015f77e801526354f36b6d920144df9fd6bca440cccf8127
7
- data.tar.gz: a2e0ce3e4df543574b1c04e349d120b31d900bbbfe3f9bf512706f57094d89c49574290520df25fdd8c920577baf561272af65ca4c36d058a3a4097efa167a83
6
+ metadata.gz: 16d56de1814e36174c47ab5bda6c9d5e02aba15bafa72a1d57056d0ac146e5fff5c6ca43f9198262d90820e4dcbe4e63772f01bd1ee5207c7ab07e9bb959e069
7
+ data.tar.gz: 07602af4f0cfb9927d43239da0c38cb2411aa408d11fe3f91cb4a403fa415ca8de095eee7467e4613d32aadb8c0a13ffea19ac2f93fd5bf005a991d91e8a064a
@@ -11,6 +11,7 @@ require 'concurrent-ruby'
11
11
  require 'logger'
12
12
  require 'zlib'
13
13
  require 'stringio'
14
+ require 'digest'
14
15
  require_relative 'wayback_machine_downloader/tidy_bytes'
15
16
  require_relative 'wayback_machine_downloader/to_regex'
16
17
  require_relative 'wayback_machine_downloader/archive_api'
@@ -116,7 +117,7 @@ class WaybackMachineDownloader
116
117
  include ArchiveAPI
117
118
  include SubdomainProcessor
118
119
 
119
- VERSION = "2.4.0"
120
+ VERSION = "2.4.3"
120
121
  DEFAULT_TIMEOUT = 30
121
122
  MAX_RETRIES = 3
122
123
  RETRY_DELAY = 2
@@ -171,12 +172,19 @@ class WaybackMachineDownloader
171
172
 
172
173
  def backup_name
173
174
  url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
174
-
175
- if url_to_process.include? '//'
175
+ raw = if url_to_process.include?('//')
176
176
  url_to_process.split('/')[2]
177
177
  else
178
178
  url_to_process
179
179
  end
180
+
181
+ # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
182
+ if Gem.win_platform?
183
+ raw = raw.gsub(/[:*?"<>|]/, '_')
184
+ raw = raw.gsub(/[ .]+\z/, '')
185
+ end
186
+ raw = 'site' if raw.nil? || raw.empty?
187
+ raw
180
188
  end
181
189
 
182
190
  def backup_path
@@ -340,15 +348,15 @@ class WaybackMachineDownloader
340
348
  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
341
349
  next unless file_url.include?('/')
342
350
  next if file_timestamp.to_i > target_timestamp
343
- file_id = file_url.split('/')[3..-1].join('/')
344
- file_id = CGI::unescape file_id
345
- file_id = file_id.tidy_bytes unless file_id == ""
351
+
352
+ raw_tail = file_url.split('/')[3..-1]&.join('/')
353
+ file_id = sanitize_and_prepare_id(raw_tail, file_url)
346
354
  next if file_id.nil?
347
355
  next if match_exclude_filter(file_url)
348
356
  next unless match_only_filter(file_url)
349
- # Select the most recent version <= target_timestamp
357
+
350
358
  if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
351
- file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
359
+ file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
352
360
  end
353
361
  end
354
362
  file_versions.values
@@ -368,22 +376,27 @@ class WaybackMachineDownloader
368
376
  file_list_curated = Hash.new
369
377
  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
370
378
  next unless file_url.include?('/')
371
- file_id = file_url.split('/')[3..-1].join('/')
372
- file_id = CGI::unescape file_id
373
- file_id = file_id.tidy_bytes unless file_id == ""
379
+
380
+ raw_tail = file_url.split('/')[3..-1]&.join('/')
381
+ file_id = sanitize_and_prepare_id(raw_tail, file_url)
374
382
  if file_id.nil?
375
383
  puts "Malformed file url, ignoring: #{file_url}"
384
+ next
385
+ end
386
+
387
+ if file_id.include?('<') || file_id.include?('>')
388
+ puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
376
389
  else
377
390
  if match_exclude_filter(file_url)
378
391
  puts "File url matches exclude filter, ignoring: #{file_url}"
379
- elsif not match_only_filter(file_url)
392
+ elsif !match_only_filter(file_url)
380
393
  puts "File url doesn't match only filter, ignoring: #{file_url}"
381
394
  elsif file_list_curated[file_id]
382
395
  unless file_list_curated[file_id][:timestamp] > file_timestamp
383
- file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
396
+ file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
384
397
  end
385
398
  else
386
- file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
399
+ file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
387
400
  end
388
401
  end
389
402
  end
@@ -394,21 +407,32 @@ class WaybackMachineDownloader
394
407
  file_list_curated = Hash.new
395
408
  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
396
409
  next unless file_url.include?('/')
397
- file_id = file_url.split('/')[3..-1].join('/')
398
- file_id_and_timestamp = [file_timestamp, file_id].join('/')
399
- file_id_and_timestamp = CGI::unescape file_id_and_timestamp
400
- file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
410
+
411
+ raw_tail = file_url.split('/')[3..-1]&.join('/')
412
+ file_id = sanitize_and_prepare_id(raw_tail, file_url)
401
413
  if file_id.nil?
402
414
  puts "Malformed file url, ignoring: #{file_url}"
415
+ next
416
+ end
417
+
418
+ file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
419
+ file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
420
+ if file_id_and_timestamp.nil?
421
+ puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
422
+ next
423
+ end
424
+
425
+ if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
426
+ puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
403
427
  else
404
428
  if match_exclude_filter(file_url)
405
429
  puts "File url matches exclude filter, ignoring: #{file_url}"
406
- elsif not match_only_filter(file_url)
430
+ elsif !match_only_filter(file_url)
407
431
  puts "File url doesn't match only filter, ignoring: #{file_url}"
408
432
  elsif file_list_curated[file_id_and_timestamp]
409
- puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
433
+ # duplicate combo, ignore silently (verbose flag not shown here)
410
434
  else
411
- file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
435
+ file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
412
436
  end
413
437
  end
414
438
  end
@@ -749,6 +773,86 @@ class WaybackMachineDownloader
749
773
  end
750
774
  logger
751
775
  end
776
+
777
+ # safely sanitize a file id (or id+timestamp)
778
+ def sanitize_and_prepare_id(raw, file_url)
779
+ return nil if raw.nil?
780
+ return "" if raw.empty?
781
+ original = raw.dup
782
+ begin
783
+ # work on a binary copy to avoid premature encoding errors
784
+ raw = raw.dup.force_encoding(Encoding::BINARY)
785
+
786
+ # percent-decode (repeat until stable in case of double-encoding)
787
+ loop do
788
+ decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
789
+ break if decoded == raw
790
+ raw = decoded
791
+ end
792
+
793
+ # try tidy_bytes
794
+ begin
795
+ raw = raw.tidy_bytes
796
+ rescue StandardError
797
+ # fallback: scrub to UTF-8
798
+ raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
799
+ end
800
+
801
+ # ensure UTF-8 and scrub again
802
+ unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
803
+ raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
804
+ end
805
+
806
+ # strip HTML/comment artifacts & control chars
807
+ raw.gsub!(/<!--+/, '')
808
+ raw.gsub!(/[\x00-\x1F]/, '')
809
+
810
+ # split query; hash it for stable short name
811
+ path_part, query_part = raw.split('?', 2)
812
+ if query_part && !query_part.empty?
813
+ q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
814
+ if path_part.include?('.')
815
+ pre, _sep, post = path_part.rpartition('.')
816
+ path_part = "#{pre}__q#{q_digest}.#{post}"
817
+ else
818
+ path_part = "#{path_part}__q#{q_digest}"
819
+ end
820
+ end
821
+ raw = path_part
822
+
823
+ # collapse slashes & trim leading slash
824
+ raw.gsub!(%r{/+}, '/')
825
+ raw.sub!(%r{\A/}, '')
826
+
827
+ # segment-wise sanitation
828
+ raw = raw.split('/').map do |segment|
829
+ seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
830
+ seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
831
+ seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
832
+ seg.empty? ? '_' : seg
833
+ end.join('/')
834
+
835
+ # remove any remaining angle brackets
836
+ raw.tr!('<>', '')
837
+
838
+ # final fallback if empty
839
+ raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?
840
+
841
+ raw
842
+ rescue => e
843
+ @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
844
+ # deterministic fallback – never return nil so caller won’t mark malformed
845
+ "file__#{Digest::SHA1.hexdigest(original)[0,10]}"
846
+ end
847
+ end
848
+
849
+ # wrap URL in parentheses if it contains characters that commonly break unquoted
850
+ # Windows CMD usage (e.g., &). This is only for display; user still must quote
851
+ # when invoking manually.
852
+ def safe_display_url(url)
853
+ return url unless url && url.match?(/[&]/)
854
+ "(#{url})"
855
+ end
752
856
 
753
857
  def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
754
858
  retries = 0
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.4.0
4
+ version: 2.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2025-08-04 00:00:00.000000000 Z
10
+ date: 2025-08-19 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: concurrent-ruby