wayback_machine_downloader_straw 2.4.1 → 2.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f6650c4217f2630db6307bc50ae2d6cefcbc38afc18b5701cc90a956af5cf1cf
4
- data.tar.gz: 0ad44d7daa4c69b75d319c3518c4b801810be071545d5eded4497073caab4667
3
+ metadata.gz: ce7592163165a7f8235bf4a6e1915cf531511fafc7f6874c0d1673fb29db704f
4
+ data.tar.gz: 7d48ffebf130d3b32d1ec233cf5141cc3cf192bcf16751db4380bf62863971c1
5
5
  SHA512:
6
- metadata.gz: 7a8cfd1cda19bc3ff2db8859e03877395eaf44092ffbe9f5334218fbd6293ff1aecc60e2bf272f875a67ecd086a209c56640db221f4d13739669a27eada1c826
7
- data.tar.gz: 877436af63fa205add55ebeb55bafcd39fec0afa56707ee742871014dac48998e8028ef4616a0b611bee5f9a93ed0d8d136375d457503a3e34b9a37f87321787
6
+ metadata.gz: 16d56de1814e36174c47ab5bda6c9d5e02aba15bafa72a1d57056d0ac146e5fff5c6ca43f9198262d90820e4dcbe4e63772f01bd1ee5207c7ab07e9bb959e069
7
+ data.tar.gz: 07602af4f0cfb9927d43239da0c38cb2411aa408d11fe3f91cb4a403fa415ca8de095eee7467e4613d32aadb8c0a13ffea19ac2f93fd5bf005a991d91e8a064a
@@ -11,6 +11,7 @@ require 'concurrent-ruby'
11
11
  require 'logger'
12
12
  require 'zlib'
13
13
  require 'stringio'
14
+ require 'digest'
14
15
  require_relative 'wayback_machine_downloader/tidy_bytes'
15
16
  require_relative 'wayback_machine_downloader/to_regex'
16
17
  require_relative 'wayback_machine_downloader/archive_api'
@@ -116,7 +117,7 @@ class WaybackMachineDownloader
116
117
  include ArchiveAPI
117
118
  include SubdomainProcessor
118
119
 
119
- VERSION = "2.4.1"
120
+ VERSION = "2.4.3"
120
121
  DEFAULT_TIMEOUT = 30
121
122
  MAX_RETRIES = 3
122
123
  RETRY_DELAY = 2
@@ -171,12 +172,19 @@ class WaybackMachineDownloader
171
172
 
172
173
  def backup_name
173
174
  url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
174
-
175
- if url_to_process.include? '//'
175
+ raw = if url_to_process.include?('//')
176
176
  url_to_process.split('/')[2]
177
177
  else
178
178
  url_to_process
179
179
  end
180
+
181
+ # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
182
+ if Gem.win_platform?
183
+ raw = raw.gsub(/[:*?"<>|]/, '_')
184
+ raw = raw.gsub(/[ .]+\z/, '')
185
+ end
186
+ raw = 'site' if raw.nil? || raw.empty?
187
+ raw
180
188
  end
181
189
 
182
190
  def backup_path
@@ -769,17 +777,83 @@ class WaybackMachineDownloader
769
777
  # safely sanitize a file id (or id+timestamp)
770
778
  def sanitize_and_prepare_id(raw, file_url)
771
779
  return nil if raw.nil?
780
+ return "" if raw.empty?
781
+ original = raw.dup
772
782
  begin
773
- raw = CGI.unescape(raw) rescue raw
774
- raw.gsub!(/<[^>]*>/, '')
775
- raw = raw.tidy_bytes unless raw.empty?
783
+ # work on a binary copy to avoid premature encoding errors
784
+ raw = raw.dup.force_encoding(Encoding::BINARY)
785
+
786
+ # percent-decode (repeat until stable in case of double-encoding)
787
+ loop do
788
+ decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
789
+ break if decoded == raw
790
+ raw = decoded
791
+ end
792
+
793
+ # try tidy_bytes
794
+ begin
795
+ raw = raw.tidy_bytes
796
+ rescue StandardError
797
+ # fallback: scrub to UTF-8
798
+ raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
799
+ end
800
+
801
+ # ensure UTF-8 and scrub again
802
+ unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
803
+ raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
804
+ end
805
+
806
+ # strip HTML/comment artifacts & control chars
807
+ raw.gsub!(/<!--+/, '')
808
+ raw.gsub!(/[\x00-\x1F]/, '')
809
+
810
+ # split query; hash it for stable short name
811
+ path_part, query_part = raw.split('?', 2)
812
+ if query_part && !query_part.empty?
813
+ q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
814
+ if path_part.include?('.')
815
+ pre, _sep, post = path_part.rpartition('.')
816
+ path_part = "#{pre}__q#{q_digest}.#{post}"
817
+ else
818
+ path_part = "#{path_part}__q#{q_digest}"
819
+ end
820
+ end
821
+ raw = path_part
822
+
823
+ # collapse slashes & trim leading slash
824
+ raw.gsub!(%r{/+}, '/')
825
+ raw.sub!(%r{\A/}, '')
826
+
827
+ # segment-wise sanitation
828
+ raw = raw.split('/').map do |segment|
829
+ seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
830
+ seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
831
+ seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
832
+ seg.empty? ? '_' : seg
833
+ end.join('/')
834
+
835
+ # remove any remaining angle brackets
836
+ raw.tr!('<>', '')
837
+
838
+ # final fallback if empty
839
+ raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?
840
+
776
841
  raw
777
842
  rescue => e
778
843
  @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
779
- nil
844
+ # deterministic fallback – never return nil so caller won’t mark malformed
845
+ "file__#{Digest::SHA1.hexdigest(original)[0,10]}"
780
846
  end
781
847
  end
782
848
 
849
+ # wrap URL in parentheses if it contains characters that commonly break unquoted
850
+ # Windows CMD usage (e.g., &). This is only for display; user still must quote
851
+ # when invoking manually.
852
+ def safe_display_url(url)
853
+ return url unless url && url.match?(/[&]/)
854
+ "(#{url})"
855
+ end
856
+
783
857
  def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
784
858
  retries = 0
785
859
  begin
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.4.1
4
+ version: 2.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2025-08-12 00:00:00.000000000 Z
10
+ date: 2025-08-19 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: concurrent-ruby