wayback_machine_downloader_straw 2.3.7 → 2.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader/archive_api.rb +8 -0
- data/lib/wayback_machine_downloader.rb +47 -24
- metadata +3 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1df8b0394ed1e39ab2d1ef050c51c809afbc0005533e286b83389bab99a7507c
|
4
|
+
data.tar.gz: 500212ad8353f9252ade813c5e405e9bc0fa1f6ae4967dcfee36686caecff0fa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c5b855e47b617217591f385b6aeb5afd55631e10acaa39d1acf0929603717e68f350457fea2e184f17e338baeb4fd1cb6ba9ce6632967c56a80c8a66c2fbae87
|
7
|
+
data.tar.gz: b2c13318213a68827d47e73b8fba54ad2d52f49f73ea76726207f80d2db3bb9b77bb349980b823ac12a42f1ec1ba7705eb06affdf579970945eb655c07c955f2
|
@@ -4,6 +4,14 @@ require 'uri'
|
|
4
4
|
module ArchiveAPI
|
5
5
|
|
6
6
|
def get_raw_list_from_api(url, page_index, http)
|
7
|
+
# Automatically append /* if the URL doesn't contain a path after the domain
|
8
|
+
# This is a workaround for an issue with the API and *some* domains.
|
9
|
+
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
10
|
+
# But don't do this when exact_url flag is set
|
11
|
+
if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
|
12
|
+
url = "#{url}/*"
|
13
|
+
end
|
14
|
+
|
7
15
|
request_url = URI("https://web.archive.org/cdx/search/cdx")
|
8
16
|
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
9
17
|
request_url.query = URI.encode_www_form(params)
|
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|
113
113
|
|
114
114
|
include ArchiveAPI
|
115
115
|
|
116
|
-
VERSION = "2.3.7"
|
116
|
+
VERSION = "2.3.9"
|
117
117
|
DEFAULT_TIMEOUT = 30
|
118
118
|
MAX_RETRIES = 3
|
119
119
|
RETRY_DELAY = 2
|
@@ -131,7 +131,11 @@ class WaybackMachineDownloader
|
|
131
131
|
validate_params(params)
|
132
132
|
@base_url = params[:base_url]
|
133
133
|
@exact_url = params[:exact_url]
|
134
|
-
|
134
|
+
if params[:directory]
|
135
|
+
@directory = File.expand_path(params[:directory])
|
136
|
+
else
|
137
|
+
@directory = nil
|
138
|
+
end
|
135
139
|
@all_timestamps = params[:all_timestamps]
|
136
140
|
@from_timestamp = params[:from_timestamp].to_i
|
137
141
|
@to_timestamp = params[:to_timestamp].to_i
|
@@ -154,22 +158,22 @@ class WaybackMachineDownloader
|
|
154
158
|
end
|
155
159
|
|
156
160
|
def backup_name
|
157
|
-
|
158
|
-
|
161
|
+
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
162
|
+
|
163
|
+
if url_to_process.include? '//'
|
164
|
+
url_to_process.split('/')[2]
|
159
165
|
else
|
160
|
-
|
166
|
+
url_to_process
|
161
167
|
end
|
162
168
|
end
|
163
169
|
|
164
170
|
def backup_path
|
165
171
|
if @directory
|
166
|
-
|
167
|
-
|
168
|
-
else
|
169
|
-
@directory + '/'
|
170
|
-
end
|
172
|
+
# because @directory is already an absolute path, we just ensure it exists
|
173
|
+
@directory
|
171
174
|
else
|
172
|
-
|
175
|
+
# ensure the default path is absolute and normalized
|
176
|
+
File.expand_path(File.join('websites', backup_name))
|
173
177
|
end
|
174
178
|
end
|
175
179
|
|
@@ -241,6 +245,7 @@ class WaybackMachineDownloader
|
|
241
245
|
# Fetch the initial set of snapshots, sequentially
|
242
246
|
@connection_pool.with_connection do |connection|
|
243
247
|
initial_list = get_raw_list_from_api(@base_url, nil, connection)
|
248
|
+
initial_list ||= []
|
244
249
|
mutex.synchronize do
|
245
250
|
snapshot_list_to_consider.concat(initial_list)
|
246
251
|
print "."
|
@@ -265,6 +270,7 @@ class WaybackMachineDownloader
|
|
265
270
|
@connection_pool.with_connection do |connection|
|
266
271
|
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
|
267
272
|
end
|
273
|
+
result ||= []
|
268
274
|
[page, result]
|
269
275
|
end
|
270
276
|
end
|
@@ -284,7 +290,7 @@ class WaybackMachineDownloader
|
|
284
290
|
|
285
291
|
# Process results and check for empty pages
|
286
292
|
results.each do |page, result|
|
287
|
-
if result.empty?
|
293
|
+
if result.nil? || result.empty?
|
288
294
|
continue_fetching = false
|
289
295
|
break
|
290
296
|
else
|
@@ -634,21 +640,35 @@ class WaybackMachineDownloader
|
|
634
640
|
file_url = file_remote_info[:file_url].encode(current_encoding)
|
635
641
|
file_id = file_remote_info[:file_id]
|
636
642
|
file_timestamp = file_remote_info[:timestamp]
|
637
|
-
|
643
|
+
|
644
|
+
# sanitize file_id to ensure it is a valid path component
|
645
|
+
raw_path_elements = file_id.split('/')
|
646
|
+
|
647
|
+
sanitized_path_elements = raw_path_elements.map do |element|
|
648
|
+
if Gem.win_platform?
|
649
|
+
# for Windows, we need to sanitize path components to avoid invalid characters
|
650
|
+
# this prevents issues with file names that contain characters not allowed in
|
651
|
+
# Windows file systems. See # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
|
652
|
+
element.gsub(/[:\*?"<>\|\&\=\/\\]/, ->(match) { '%' + match.ord.to_s(16).upcase })
|
653
|
+
else
|
654
|
+
element
|
655
|
+
end
|
656
|
+
end
|
657
|
+
|
658
|
+
current_backup_path = backup_path
|
638
659
|
|
639
660
|
if file_id == ""
|
640
|
-
dir_path =
|
641
|
-
file_path =
|
642
|
-
elsif file_url[-1] == '/'
|
643
|
-
|
644
|
-
|
661
|
+
dir_path = current_backup_path
|
662
|
+
file_path = File.join(dir_path, 'index.html')
|
663
|
+
elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
|
664
|
+
# if file_id is a directory, we treat it as such
|
665
|
+
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
666
|
+
file_path = File.join(dir_path, 'index.html')
|
645
667
|
else
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
651
|
-
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
668
|
+
# if file_id is a file, we treat it as such
|
669
|
+
filename = sanitized_path_elements.pop
|
670
|
+
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
671
|
+
file_path = File.join(dir_path, filename)
|
652
672
|
end
|
653
673
|
|
654
674
|
# check existence *before* download attempt
|
@@ -717,6 +737,9 @@ class WaybackMachineDownloader
|
|
717
737
|
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
|
718
738
|
end
|
719
739
|
|
740
|
+
# Escape square brackets because they are not valid in URI()
|
741
|
+
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
|
742
|
+
|
720
743
|
request = Net::HTTP::Get.new(URI(wayback_url))
|
721
744
|
request["Connection"] = "keep-alive"
|
722
745
|
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader_straw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.7
|
4
|
+
version: 2.3.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- strawberrymaster
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date: 2025-
|
10
|
+
date: 2025-06-18 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: concurrent-ruby
|
@@ -78,7 +77,6 @@ homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
|
|
78
77
|
licenses:
|
79
78
|
- MIT
|
80
79
|
metadata: {}
|
81
|
-
post_install_message:
|
82
80
|
rdoc_options: []
|
83
81
|
require_paths:
|
84
82
|
- lib
|
@@ -93,8 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
91
|
- !ruby/object:Gem::Version
|
94
92
|
version: '0'
|
95
93
|
requirements: []
|
96
|
-
rubygems_version: 3.
|
97
|
-
signing_key:
|
94
|
+
rubygems_version: 3.6.2
|
98
95
|
specification_version: 4
|
99
96
|
summary: Download an entire website from the Wayback Machine.
|
100
97
|
test_files: []
|