wayback_machine_downloader_straw 2.3.8 → 2.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader/archive_api.rb +2 -1
- data/lib/wayback_machine_downloader.rb +37 -21
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef661bf573b09f79453cf6343d737c24715f343b6593cf313f2502ecd9a650cb
|
4
|
+
data.tar.gz: b80be4aaae7ab4ff695af6cc85273ac437fab1e6a68d3d8bdad67a9661be17e4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3dfb6477b142eebb45741e1b5a4552dd33feac34baa1eae5453baaa08a9a5be242ba46d4f1162e2dd2b68e8903e6de8402d6b6fa86128f312defac74f2e8da29
|
7
|
+
data.tar.gz: 39758aef4bda77babb81d479ef9f266e3fa328af163c7c3c053290796fda95ccb8ec8d3725a9dae5164b79debc6530919cd79df3f7421842f951b0ee6ef79e60
|
data/lib/wayback_machine_downloader/archive_api.rb
CHANGED
@@ -7,7 +7,8 @@ module ArchiveAPI
|
|
7
7
|
# Automatically append /* if the URL doesn't contain a path after the domain
|
8
8
|
# This is a workaround for an issue with the API and *some* domains.
|
9
9
|
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
10
|
-
|
10
|
+
# But don't do this when exact_url flag is set
|
11
|
+
if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
|
11
12
|
url = "#{url}/*"
|
12
13
|
end
|
13
14
|
|
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|
113
113
|
|
114
114
|
include ArchiveAPI
|
115
115
|
|
116
|
-
VERSION = "2.3.8"
|
116
|
+
VERSION = "2.3.10"
|
117
117
|
DEFAULT_TIMEOUT = 30
|
118
118
|
MAX_RETRIES = 3
|
119
119
|
RETRY_DELAY = 2
|
@@ -131,7 +131,11 @@ class WaybackMachineDownloader
|
|
131
131
|
validate_params(params)
|
132
132
|
@base_url = params[:base_url]
|
133
133
|
@exact_url = params[:exact_url]
|
134
|
-
|
134
|
+
if params[:directory]
|
135
|
+
@directory = File.expand_path(params[:directory])
|
136
|
+
else
|
137
|
+
@directory = nil
|
138
|
+
end
|
135
139
|
@all_timestamps = params[:all_timestamps]
|
136
140
|
@from_timestamp = params[:from_timestamp].to_i
|
137
141
|
@to_timestamp = params[:to_timestamp].to_i
|
@@ -165,13 +169,11 @@ class WaybackMachineDownloader
|
|
165
169
|
|
166
170
|
def backup_path
|
167
171
|
if @directory
|
168
|
-
|
169
|
-
|
170
|
-
else
|
171
|
-
@directory + '/'
|
172
|
-
end
|
172
|
+
# because @directory is already an absolute path, we just ensure it exists
|
173
|
+
@directory
|
173
174
|
else
|
174
|
-
|
175
|
+
# ensure the default path is absolute and normalized
|
176
|
+
File.expand_path(File.join('websites', backup_name))
|
175
177
|
end
|
176
178
|
end
|
177
179
|
|
@@ -382,7 +384,7 @@ class WaybackMachineDownloader
|
|
382
384
|
end
|
383
385
|
else
|
384
386
|
file_list_curated = get_file_list_curated
|
385
|
-
file_list_curated = file_list_curated.sort_by { |
|
387
|
+
file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
|
386
388
|
file_list_curated.map do |file_remote_info|
|
387
389
|
file_remote_info[1][:file_id] = file_remote_info[0]
|
388
390
|
file_remote_info[1]
|
@@ -638,21 +640,35 @@ class WaybackMachineDownloader
|
|
638
640
|
file_url = file_remote_info[:file_url].encode(current_encoding)
|
639
641
|
file_id = file_remote_info[:file_id]
|
640
642
|
file_timestamp = file_remote_info[:timestamp]
|
641
|
-
|
643
|
+
|
644
|
+
# sanitize file_id to ensure it is a valid path component
|
645
|
+
raw_path_elements = file_id.split('/')
|
646
|
+
|
647
|
+
sanitized_path_elements = raw_path_elements.map do |element|
|
648
|
+
if Gem.win_platform?
|
649
|
+
# for Windows, we need to sanitize path components to avoid invalid characters
|
650
|
+
# this prevents issues with file names that contain characters not allowed in
|
651
|
+
# Windows file systems. See # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
|
652
|
+
element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase }
|
653
|
+
else
|
654
|
+
element
|
655
|
+
end
|
656
|
+
end
|
657
|
+
|
658
|
+
current_backup_path = backup_path
|
642
659
|
|
643
660
|
if file_id == ""
|
644
|
-
dir_path =
|
645
|
-
file_path =
|
646
|
-
elsif file_url[-1] == '/'
|
647
|
-
|
648
|
-
|
661
|
+
dir_path = current_backup_path
|
662
|
+
file_path = File.join(dir_path, 'index.html')
|
663
|
+
elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
|
664
|
+
# if file_id is a directory, we treat it as such
|
665
|
+
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
666
|
+
file_path = File.join(dir_path, 'index.html')
|
649
667
|
else
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
655
|
-
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
668
|
+
# if file_id is a file, we treat it as such
|
669
|
+
filename = sanitized_path_elements.pop
|
670
|
+
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
671
|
+
file_path = File.join(dir_path, filename)
|
656
672
|
end
|
657
673
|
|
658
674
|
# check existence *before* download attempt
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader_straw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.8
|
4
|
+
version: 2.3.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- strawberrymaster
|
8
8
|
bindir: bin
|
9
9
|
cert_chain: []
|
10
|
-
date: 2025-06-
|
10
|
+
date: 2025-06-27 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: concurrent-ruby
|