wayback_machine_downloader_straw 2.4.0 → 2.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader.rb +49 -19
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f6650c4217f2630db6307bc50ae2d6cefcbc38afc18b5701cc90a956af5cf1cf
|
|
4
|
+
data.tar.gz: 0ad44d7daa4c69b75d319c3518c4b801810be071545d5eded4497073caab4667
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7a8cfd1cda19bc3ff2db8859e03877395eaf44092ffbe9f5334218fbd6293ff1aecc60e2bf272f875a67ecd086a209c56640db221f4d13739669a27eada1c826
|
|
7
|
+
data.tar.gz: 877436af63fa205add55ebeb55bafcd39fec0afa56707ee742871014dac48998e8028ef4616a0b611bee5f9a93ed0d8d136375d457503a3e34b9a37f87321787
|
|
@@ -116,7 +116,7 @@ class WaybackMachineDownloader
|
|
|
116
116
|
include ArchiveAPI
|
|
117
117
|
include SubdomainProcessor
|
|
118
118
|
|
|
119
|
-
VERSION = "2.4.0"
|
|
119
|
+
VERSION = "2.4.1"
|
|
120
120
|
DEFAULT_TIMEOUT = 30
|
|
121
121
|
MAX_RETRIES = 3
|
|
122
122
|
RETRY_DELAY = 2
|
|
@@ -340,15 +340,15 @@ class WaybackMachineDownloader
|
|
|
340
340
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
|
341
341
|
next unless file_url.include?('/')
|
|
342
342
|
next if file_timestamp.to_i > target_timestamp
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
file_id =
|
|
343
|
+
|
|
344
|
+
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
|
345
|
+
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
|
346
346
|
next if file_id.nil?
|
|
347
347
|
next if match_exclude_filter(file_url)
|
|
348
348
|
next unless match_only_filter(file_url)
|
|
349
|
-
|
|
349
|
+
|
|
350
350
|
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
|
|
351
|
-
file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
|
|
351
|
+
file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
|
|
352
352
|
end
|
|
353
353
|
end
|
|
354
354
|
file_versions.values
|
|
@@ -368,22 +368,27 @@ class WaybackMachineDownloader
|
|
|
368
368
|
file_list_curated = Hash.new
|
|
369
369
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
|
370
370
|
next unless file_url.include?('/')
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
file_id =
|
|
371
|
+
|
|
372
|
+
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
|
373
|
+
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
|
374
374
|
if file_id.nil?
|
|
375
375
|
puts "Malformed file url, ignoring: #{file_url}"
|
|
376
|
+
next
|
|
377
|
+
end
|
|
378
|
+
|
|
379
|
+
if file_id.include?('<') || file_id.include?('>')
|
|
380
|
+
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
|
376
381
|
else
|
|
377
382
|
if match_exclude_filter(file_url)
|
|
378
383
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
|
379
|
-
elsif
|
|
384
|
+
elsif !match_only_filter(file_url)
|
|
380
385
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
|
381
386
|
elsif file_list_curated[file_id]
|
|
382
387
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
|
383
|
-
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
|
388
|
+
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
|
384
389
|
end
|
|
385
390
|
else
|
|
386
|
-
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
|
391
|
+
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
|
387
392
|
end
|
|
388
393
|
end
|
|
389
394
|
end
|
|
@@ -394,21 +399,32 @@ class WaybackMachineDownloader
|
|
|
394
399
|
file_list_curated = Hash.new
|
|
395
400
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
|
396
401
|
next unless file_url.include?('/')
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
|
|
402
|
+
|
|
403
|
+
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
|
404
|
+
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
|
401
405
|
if file_id.nil?
|
|
402
406
|
puts "Malformed file url, ignoring: #{file_url}"
|
|
407
|
+
next
|
|
408
|
+
end
|
|
409
|
+
|
|
410
|
+
file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
|
|
411
|
+
file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
|
|
412
|
+
if file_id_and_timestamp.nil?
|
|
413
|
+
puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
|
|
414
|
+
next
|
|
415
|
+
end
|
|
416
|
+
|
|
417
|
+
if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
|
|
418
|
+
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
|
403
419
|
else
|
|
404
420
|
if match_exclude_filter(file_url)
|
|
405
421
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
|
406
|
-
elsif
|
|
422
|
+
elsif !match_only_filter(file_url)
|
|
407
423
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
|
408
424
|
elsif file_list_curated[file_id_and_timestamp]
|
|
409
|
-
|
|
425
|
+
# duplicate combo, ignore silently (verbose flag not shown here)
|
|
410
426
|
else
|
|
411
|
-
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
|
|
427
|
+
file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
|
|
412
428
|
end
|
|
413
429
|
end
|
|
414
430
|
end
|
|
@@ -749,6 +765,20 @@ class WaybackMachineDownloader
|
|
|
749
765
|
end
|
|
750
766
|
logger
|
|
751
767
|
end
|
|
768
|
+
|
|
769
|
+
# safely sanitize a file id (or id+timestamp)
|
|
770
|
+
# Safely sanitize a file id (or an id+timestamp combination).
#
# The value is URL-decoded on a best-effort basis, anything shaped like an
# HTML tag (`<...>`) is removed, and non-empty results are passed through
# `String#tidy_bytes` (defined elsewhere in the project; presumably repairs
# invalid byte sequences -- confirm against its definition).
#
# The caller's string is never modified.
#
# @param raw [String, nil] the raw id to clean up
# @param file_url [String] source URL, used only in the warning log message
# @return [String, nil] the sanitized id (may be empty), or nil when +raw+ is
#   nil or when sanitizing raised an error
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil?

  decoded =
    begin
      CGI.unescape(raw)
    rescue StandardError
      # Deliberate best effort: an undecodable id is kept as-is, not dropped.
      raw
    end

  # Non-mutating gsub: `decoded` may be the caller's own (possibly frozen)
  # string when decoding fell back above.
  stripped = decoded.gsub(/<[^>]*>/, '')
  stripped.empty? ? stripped : stripped.tidy_bytes
rescue StandardError => e
  @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
  nil
end
|
|
752
782
|
|
|
753
783
|
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
|
754
784
|
retries = 0
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wayback_machine_downloader_straw
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.4.0
|
|
4
|
+
version: 2.4.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- strawberrymaster
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2025-08-
|
|
10
|
+
date: 2025-08-12 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: concurrent-ruby
|