wayback_machine_downloader_straw 2.4.1 → 2.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader.rb +81 -7
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ce7592163165a7f8235bf4a6e1915cf531511fafc7f6874c0d1673fb29db704f
|
|
4
|
+
data.tar.gz: 7d48ffebf130d3b32d1ec233cf5141cc3cf192bcf16751db4380bf62863971c1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 16d56de1814e36174c47ab5bda6c9d5e02aba15bafa72a1d57056d0ac146e5fff5c6ca43f9198262d90820e4dcbe4e63772f01bd1ee5207c7ab07e9bb959e069
|
|
7
|
+
data.tar.gz: 07602af4f0cfb9927d43239da0c38cb2411aa408d11fe3f91cb4a403fa415ca8de095eee7467e4613d32aadb8c0a13ffea19ac2f93fd5bf005a991d91e8a064a
|
|
@@ -11,6 +11,7 @@ require 'concurrent-ruby'
|
|
|
11
11
|
require 'logger'
|
|
12
12
|
require 'zlib'
|
|
13
13
|
require 'stringio'
|
|
14
|
+
require 'digest'
|
|
14
15
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
|
15
16
|
require_relative 'wayback_machine_downloader/to_regex'
|
|
16
17
|
require_relative 'wayback_machine_downloader/archive_api'
|
|
@@ -116,7 +117,7 @@ class WaybackMachineDownloader
|
|
|
116
117
|
include ArchiveAPI
|
|
117
118
|
include SubdomainProcessor
|
|
118
119
|
|
|
119
|
-
VERSION = "2.4.1"
|
|
120
|
+
VERSION = "2.4.3"
|
|
120
121
|
DEFAULT_TIMEOUT = 30
|
|
121
122
|
MAX_RETRIES = 3
|
|
122
123
|
RETRY_DELAY = 2
|
|
@@ -171,12 +172,19 @@ class WaybackMachineDownloader
|
|
|
171
172
|
|
|
172
173
|
# Derive the name of the backup directory from @base_url.
#
# A trailing "/*" wildcard is removed first. If the remaining URL contains
# "//", the third "/"-separated component is used (the host, including any
# ":port", of a scheme://host/... URL); otherwise the URL itself is used.
#
# On Windows only, characters that are not allowed in file names are replaced
# with "_" and trailing spaces/dots are removed, to avoid ENOTDIR on mkdir
# (e.g. the colon in host:port). Other platforms keep the name untouched.
#
# @return [String] a non-empty directory name; 'site' when nothing usable remains
def backup_name
  # chomp only removes the suffix when it is actually present
  url_to_process = @base_url.chomp('/*')
  raw = url_to_process.include?('//') ? url_to_process.split('/')[2] : url_to_process

  # split('/')[2] is nil when nothing follows the scheme (e.g. "http://");
  # normalise to a String before calling String methods on it
  raw = raw.to_s

  if Gem.win_platform?
    raw = raw.gsub(/[:*?"<>|]/, '_').gsub(/[ .]+\z/, '')
  end

  raw.empty? ? 'site' : raw
end
|
|
181
189
|
|
|
182
190
|
def backup_path
|
|
@@ -769,17 +777,83 @@ class WaybackMachineDownloader
|
|
|
769
777
|
# Safely sanitize a file id (or id+timestamp) into a relative path that can be
# created on disk.
#
# Processing steps:
# 1. percent-decode repeatedly until the value is stable;
# 2. coerce to valid UTF-8 (tidy_bytes, with a scrubbing fallback);
# 3. drop HTML comment openers and ASCII control characters;
# 4. replace a query string by a 12-hex-digit SHA-256 digest appended to the
#    file name (before its extension, when it has one);
# 5. collapse repeated slashes and drop a leading slash;
# 6. per segment: percent-encode : * ? " < > | \, strip trailing spaces/dots on
#    Windows, and replace empty, "." and ".." segments with "_".
#
# @param raw [String, nil] the id to sanitize
# @param file_url [String] source URL, used only in the warning log message
# @return [String, nil] nil when raw is nil, "" when raw is empty; otherwise a
#   non-empty path. Never nil for non-nil input, so the caller won't mark the
#   file as malformed.
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil?
  return "" if raw.empty?

  original = raw.dup
  begin
    # work on a binary copy to avoid premature encoding errors
    id = raw.dup.force_encoding(Encoding::BINARY)

    # percent-decode (repeat until stable in case of double-encoding)
    loop do
      decoded = id.gsub(/%([0-9A-Fa-f]{2})/) { [Regexp.last_match(1)].pack('H2') }
      break if decoded == id
      id = decoded
    end

    # try tidy_bytes; fall back to scrubbing into UTF-8
    id = begin
      id.tidy_bytes
    rescue StandardError
      id.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    end

    # ensure UTF-8 and scrub again
    unless id.encoding == Encoding::UTF_8 && id.valid_encoding?
      id = id.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    end

    # strip HTML/comment artifacts & control chars
    id = id.gsub(/<!--+/, '').gsub(/[\x00-\x1F]/, '')

    # split query; hash it for a stable short name
    path_part, query_part = id.split('?', 2)
    # String#split returns [] for an empty string, which would leave path_part nil
    path_part ||= ''
    if query_part && !query_part.empty?
      q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
      # only the last segment is the file name; a dot in a directory name must
      # not be mistaken for the extension separator
      dir, slash, base = path_part.rpartition('/')
      base = if base.include?('.')
        stem, _dot, ext = base.rpartition('.')
        "#{stem}__q#{q_digest}.#{ext}"
      else
        "#{base}__q#{q_digest}"
      end
      path_part = "#{dir}#{slash}#{base}"
    end

    # collapse slashes & trim leading slash
    path_part = path_part.gsub(%r{/+}, '/').sub(%r{\A/}, '')

    # segment-wise sanitation
    segments = path_part.split('/').map do |segment|
      seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
      # this also covers < and >, so no separate angle-bracket pass is needed
      seg = seg.gsub(/[:*?"<>|\\]/) { |c| format('%%%02X', c.ord) }
      seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
      # "." / ".." can appear after percent-decoding and would let the path
      # climb out of the target directory; on Windows they are already emptied
      # by the trailing-dot strip above, so treat them the same everywhere
      seg = '_' if seg.empty? || seg == '.' || seg == '..'
      seg
    end
    id = segments.join('/')

    # final fallback if empty
    id = "file__#{Digest::SHA1.hexdigest(original)[0, 10]}" if id.empty?

    id
  rescue => e
    @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
    # deterministic fallback - never return nil so the caller won't mark it malformed
    "file__#{Digest::SHA1.hexdigest(original)[0, 10]}"
  end
end
|
|
782
848
|
|
|
849
|
+
# Display helper: a URL containing "&" is returned wrapped in parentheses,
# because that character commonly breaks unquoted Windows CMD usage. Any other
# value (including nil) is returned unchanged. This affects display only; the
# user still has to quote the URL when invoking the command manually.
def safe_display_url(url)
  if url && url.match?(/[&]/)
    "(#{url})"
  else
    url
  end
end
|
|
856
|
+
|
|
783
857
|
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
|
784
858
|
retries = 0
|
|
785
859
|
begin
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wayback_machine_downloader_straw
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.4.1
|
|
4
|
+
version: 2.4.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- strawberrymaster
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2025-08-
|
|
10
|
+
date: 2025-08-19 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: concurrent-ruby
|