wayback_machine_downloader_straw 2.4.0 → 2.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader.rb +125 -21
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ce7592163165a7f8235bf4a6e1915cf531511fafc7f6874c0d1673fb29db704f
|
|
4
|
+
data.tar.gz: 7d48ffebf130d3b32d1ec233cf5141cc3cf192bcf16751db4380bf62863971c1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 16d56de1814e36174c47ab5bda6c9d5e02aba15bafa72a1d57056d0ac146e5fff5c6ca43f9198262d90820e4dcbe4e63772f01bd1ee5207c7ab07e9bb959e069
|
|
7
|
+
data.tar.gz: 07602af4f0cfb9927d43239da0c38cb2411aa408d11fe3f91cb4a403fa415ca8de095eee7467e4613d32aadb8c0a13ffea19ac2f93fd5bf005a991d91e8a064a
|
|
@@ -11,6 +11,7 @@ require 'concurrent-ruby'
|
|
|
11
11
|
require 'logger'
|
|
12
12
|
require 'zlib'
|
|
13
13
|
require 'stringio'
|
|
14
|
+
require 'digest'
|
|
14
15
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
|
15
16
|
require_relative 'wayback_machine_downloader/to_regex'
|
|
16
17
|
require_relative 'wayback_machine_downloader/archive_api'
|
|
@@ -116,7 +117,7 @@ class WaybackMachineDownloader
|
|
|
116
117
|
include ArchiveAPI
|
|
117
118
|
include SubdomainProcessor
|
|
118
119
|
|
|
119
|
-
VERSION = "2.4.0"
|
|
120
|
+
VERSION = "2.4.3"
|
|
120
121
|
DEFAULT_TIMEOUT = 30
|
|
121
122
|
MAX_RETRIES = 3
|
|
122
123
|
RETRY_DELAY = 2
|
|
@@ -171,12 +172,19 @@ class WaybackMachineDownloader
|
|
|
171
172
|
|
|
172
173
|
# Derives the name of the backup directory from @base_url.
#
# A trailing "/*" wildcard is removed first. When the URL contains "//", the
# third "/"-separated component is used (the host, possibly with ":port");
# otherwise the whole string is used unchanged.
#
# @return [String] never nil or empty; 'site' is returned when no usable
#   name can be extracted (e.g. for "http://")
def backup_name
  url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url

  # split('/')[2] is nil for inputs such as "http://". Normalise to "" up
  # front so the Windows branch below never calls gsub on nil.
  name = (url_to_process.include?('//') ? url_to_process.split('/')[2] : url_to_process).to_s

  if Gem.win_platform?
    # Windows only: replace characters that are not allowed in file names
    # (e.g. the colon in host:port, which makes mkdir fail with ENOTDIR) and
    # drop trailing spaces/dots.
    name = name.gsub(/[:*?"<>|]/, '_').gsub(/[ .]+\z/, '')
  end

  name.empty? ? 'site' : name
end
|
|
181
189
|
|
|
182
190
|
def backup_path
|
|
@@ -340,15 +348,15 @@ class WaybackMachineDownloader
|
|
|
340
348
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
|
341
349
|
next unless file_url.include?('/')
|
|
342
350
|
next if file_timestamp.to_i > target_timestamp
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
file_id =
|
|
351
|
+
|
|
352
|
+
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
|
353
|
+
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
|
346
354
|
next if file_id.nil?
|
|
347
355
|
next if match_exclude_filter(file_url)
|
|
348
356
|
next unless match_only_filter(file_url)
|
|
349
|
-
|
|
357
|
+
|
|
350
358
|
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
|
|
351
|
-
file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
|
|
359
|
+
file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
|
|
352
360
|
end
|
|
353
361
|
end
|
|
354
362
|
file_versions.values
|
|
@@ -368,22 +376,27 @@ class WaybackMachineDownloader
|
|
|
368
376
|
file_list_curated = Hash.new
|
|
369
377
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
|
370
378
|
next unless file_url.include?('/')
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
file_id =
|
|
379
|
+
|
|
380
|
+
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
|
381
|
+
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
|
374
382
|
if file_id.nil?
|
|
375
383
|
puts "Malformed file url, ignoring: #{file_url}"
|
|
384
|
+
next
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
if file_id.include?('<') || file_id.include?('>')
|
|
388
|
+
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
|
376
389
|
else
|
|
377
390
|
if match_exclude_filter(file_url)
|
|
378
391
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
|
379
|
-
elsif
|
|
392
|
+
elsif !match_only_filter(file_url)
|
|
380
393
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
|
381
394
|
elsif file_list_curated[file_id]
|
|
382
395
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
|
383
|
-
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
|
396
|
+
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
|
384
397
|
end
|
|
385
398
|
else
|
|
386
|
-
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
|
399
|
+
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
|
387
400
|
end
|
|
388
401
|
end
|
|
389
402
|
end
|
|
@@ -394,21 +407,32 @@ class WaybackMachineDownloader
|
|
|
394
407
|
file_list_curated = Hash.new
|
|
395
408
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
|
396
409
|
next unless file_url.include?('/')
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
|
|
410
|
+
|
|
411
|
+
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
|
412
|
+
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
|
401
413
|
if file_id.nil?
|
|
402
414
|
puts "Malformed file url, ignoring: #{file_url}"
|
|
415
|
+
next
|
|
416
|
+
end
|
|
417
|
+
|
|
418
|
+
file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
|
|
419
|
+
file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
|
|
420
|
+
if file_id_and_timestamp.nil?
|
|
421
|
+
puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
|
|
422
|
+
next
|
|
423
|
+
end
|
|
424
|
+
|
|
425
|
+
if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
|
|
426
|
+
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
|
403
427
|
else
|
|
404
428
|
if match_exclude_filter(file_url)
|
|
405
429
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
|
406
|
-
elsif
|
|
430
|
+
elsif !match_only_filter(file_url)
|
|
407
431
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
|
408
432
|
elsif file_list_curated[file_id_and_timestamp]
|
|
409
|
-
|
|
433
|
+
# duplicate combo, ignore silently (verbose flag not shown here)
|
|
410
434
|
else
|
|
411
|
-
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
|
|
435
|
+
file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
|
|
412
436
|
end
|
|
413
437
|
end
|
|
414
438
|
end
|
|
@@ -749,6 +773,86 @@ class WaybackMachineDownloader
|
|
|
749
773
|
end
|
|
750
774
|
logger
|
|
751
775
|
end
|
|
776
|
+
|
|
777
|
+
# safely sanitize a file id (or id+timestamp)
|
|
778
|
+
# Converts the path part of an archived URL (or a "timestamp/path" combo)
# into a sanitized, "/"-separated id.
#
# Steps, in order:
#   1. percent-decode repeatedly until the string stops changing;
#   2. normalise to valid UTF-8 (invalid byte sequences are dropped);
#   3. strip "<!--" artifacts and ASCII control characters;
#   4. replace a query string by "__q" + the first 12 hex chars of its
#      SHA-256, inserted before the extension of the last path segment;
#   5. collapse repeated slashes, drop a leading slash, percent-encode
#      : * ? " < > | \ in every segment, and (on Windows) trim trailing
#      spaces/dots from every segment;
#   6. replace empty, "." and ".." segments by "_".
#
# NOTE(review): callers appear to use the returned id as a relative path
# below the backup directory; step 6 assumes that and keeps ".." from
# climbing out of it - confirm against the download code.
#
# @param raw [String, nil] the unsanitized id
# @param file_url [String] the originating URL; only used in the warning log
# @return [String, nil] nil when raw is nil, "" when raw is empty, otherwise
#   a non-empty sanitized id. Any internal failure yields the deterministic
#   fallback "file__<first 10 hex chars of SHA-1(raw)>", never nil.
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil?
  return "" if raw.empty?

  # Work on a binary copy so percent-decoding cannot raise encoding errors.
  id = raw.dup.force_encoding(Encoding::BINARY)

  # Percent-decode until stable, to unwrap double-encoded input.
  loop do
    decoded = id.gsub(/%([0-9A-Fa-f]{2})/) { [Regexp.last_match(1)].pack('H2') }
    break if decoded == id

    id = decoded
  end

  # Prefer the project's tidy_bytes; if it is unavailable or fails, keep the
  # bytes as they are and let the normalisation below clean them up.
  id = begin
    id.tidy_bytes
  rescue StandardError
    id
  end

  unless id.encoding == Encoding::UTF_8 && id.valid_encoding?
    id = if [Encoding::BINARY, Encoding::UTF_8].include?(id.encoding)
           # Re-tag and scrub. Transcoding from BINARY would treat every byte
           # >= 0x80 as undefined and throw away valid UTF-8 characters.
           id.dup.force_encoding(Encoding::UTF_8).scrub('')
         else
           id.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
         end
  end

  # Strip HTML comment artifacts and control characters.
  id = id.gsub(/<!--+/, '').gsub(/[\x00-\x1F]/, '')

  # Split off the query and replace it by a short, stable digest.
  path_part, query_part = id.split('?', 2)
  path_part = path_part.to_s # "".split returns [], which would leave nil here
  if query_part && !query_part.empty?
    q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
    # Only the last segment may carry the extension; a dot in a directory
    # name must not attract the digest.
    dir, slash, base = path_part.rpartition('/')
    stem, dot, ext = base.rpartition('.')
    base = dot.empty? ? "#{base}__q#{q_digest}" : "#{stem}__q#{q_digest}.#{ext}"
    path_part = "#{dir}#{slash}#{base}"
  end

  # Collapse slashes and trim the leading one.
  id = path_part.gsub(%r{/+}, '/').sub(%r{\A/}, '')

  # Segment-wise sanitation.
  id = id.split('/').map do |segment|
    seg = segment.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
    seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
    unusable = seg.empty? || seg == '.' || seg == '..'
    unusable ? '_' : seg
  end.join('/')

  # Final fallback if nothing is left.
  id = "file__#{Digest::SHA1.hexdigest(raw)[0, 10]}" if id.empty?
  id
rescue StandardError => e
  @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
  # Deterministic fallback - never return nil so callers do not treat the
  # URL as malformed.
  "file__#{Digest::SHA1.hexdigest(raw)[0, 10]}"
end
|
|
848
|
+
|
|
849
|
+
# wrap URL in parentheses if it contains characters that commonly break unquoted
|
|
850
|
+
# Windows CMD usage (e.g., &). This is only for display; user still must quote
|
|
851
|
+
# when invoking manually.
|
|
852
|
+
# Returns a display form of +url+: wrapped in parentheses when it contains
# an ampersand, unchanged otherwise. This affects display only; it does not
# quote or escape the URL.
#
# @param url [String, nil] the URL to display
# @return [String, nil] "(url)" when url contains "&"; otherwise url itself
#   (including nil)
def safe_display_url(url)
  return url unless url && url.include?('&')

  "(#{url})"
end
|
|
752
856
|
|
|
753
857
|
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
|
754
858
|
retries = 0
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wayback_machine_downloader_straw
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.4.0
|
|
4
|
+
version: 2.4.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- strawberrymaster
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2025-08-
|
|
10
|
+
date: 2025-08-19 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: concurrent-ruby
|