wayback_machine_downloader_straw 2.3.7 → 2.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader/archive_api.rb +7 -0
- data/lib/wayback_machine_downloader.rb +12 -5
- metadata +3 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: df42d96c68c19fd39b6da3c9e9d51934197484ccb1ceb7a9387116622b0214a7
|
4
|
+
data.tar.gz: d6f04e3dc44c9f216b9d3dc631275fac5e48447ebd963a33818e82baf1ff79b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9654877bb591082e1ef1c5dfdacff0bf887ed68f8ae1b2d995a99b87232523aa3350aede2d8cbb4045dbb15b380a1e93451004a45f881ad323615c0f66632c5
|
7
|
+
data.tar.gz: eb8753d3ceb689e9b8c3f3dbaeeac7c9dd818497f916882d5d3271f1901c099f8b7103e7b49bcef51d71aab86b2607174ac2eece768a092242b0d5e0dcec9b28
|
@@ -4,6 +4,13 @@ require 'uri'
|
|
4
4
|
module ArchiveAPI
|
5
5
|
|
6
6
|
def get_raw_list_from_api(url, page_index, http)
|
7
|
+
# Automatically append /* if the URL doesn't contain a path after the domain
|
8
|
+
# This is a workaround for an issue with the API and *some* domains.
|
9
|
+
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
10
|
+
if url && !url.match(/^https?:\/\/.*\//i)
|
11
|
+
url = "#{url}/*"
|
12
|
+
end
|
13
|
+
|
7
14
|
request_url = URI("https://web.archive.org/cdx/search/cdx")
|
8
15
|
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
9
16
|
request_url.query = URI.encode_www_form(params)
|
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|
113
113
|
|
114
114
|
include ArchiveAPI
|
115
115
|
|
116
|
-
VERSION = "2.3.7"
|
116
|
+
VERSION = "2.3.8"
|
117
117
|
DEFAULT_TIMEOUT = 30
|
118
118
|
MAX_RETRIES = 3
|
119
119
|
RETRY_DELAY = 2
|
@@ -154,10 +154,12 @@ class WaybackMachineDownloader
|
|
154
154
|
end
|
155
155
|
|
156
156
|
def backup_name
|
157
|
-
|
158
|
-
|
157
|
+
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
158
|
+
|
159
|
+
if url_to_process.include? '//'
|
160
|
+
url_to_process.split('/')[2]
|
159
161
|
else
|
160
|
-
|
162
|
+
url_to_process
|
161
163
|
end
|
162
164
|
end
|
163
165
|
|
@@ -241,6 +243,7 @@ class WaybackMachineDownloader
|
|
241
243
|
# Fetch the initial set of snapshots, sequentially
|
242
244
|
@connection_pool.with_connection do |connection|
|
243
245
|
initial_list = get_raw_list_from_api(@base_url, nil, connection)
|
246
|
+
initial_list ||= []
|
244
247
|
mutex.synchronize do
|
245
248
|
snapshot_list_to_consider.concat(initial_list)
|
246
249
|
print "."
|
@@ -265,6 +268,7 @@ class WaybackMachineDownloader
|
|
265
268
|
@connection_pool.with_connection do |connection|
|
266
269
|
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
|
267
270
|
end
|
271
|
+
result ||= []
|
268
272
|
[page, result]
|
269
273
|
end
|
270
274
|
end
|
@@ -284,7 +288,7 @@ class WaybackMachineDownloader
|
|
284
288
|
|
285
289
|
# Process results and check for empty pages
|
286
290
|
results.each do |page, result|
|
287
|
-
if result.empty?
|
291
|
+
if result.nil? || result.empty?
|
288
292
|
continue_fetching = false
|
289
293
|
break
|
290
294
|
else
|
@@ -717,6 +721,9 @@ class WaybackMachineDownloader
|
|
717
721
|
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
|
718
722
|
end
|
719
723
|
|
724
|
+
# Escape square brackets because they are not valid in URI()
|
725
|
+
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
|
726
|
+
|
720
727
|
request = Net::HTTP::Get.new(URI(wayback_url))
|
721
728
|
request["Connection"] = "keep-alive"
|
722
729
|
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader_straw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.7
|
4
|
+
version: 2.3.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- strawberrymaster
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date: 2025-05
|
10
|
+
date: 2025-06-05 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: concurrent-ruby
|
@@ -78,7 +77,6 @@ homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
|
|
78
77
|
licenses:
|
79
78
|
- MIT
|
80
79
|
metadata: {}
|
81
|
-
post_install_message:
|
82
80
|
rdoc_options: []
|
83
81
|
require_paths:
|
84
82
|
- lib
|
@@ -93,8 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
91
|
- !ruby/object:Gem::Version
|
94
92
|
version: '0'
|
95
93
|
requirements: []
|
96
|
-
rubygems_version: 3.
|
97
|
-
signing_key:
|
94
|
+
rubygems_version: 3.6.2
|
98
95
|
specification_version: 4
|
99
96
|
summary: Download an entire website from the Wayback Machine.
|
100
97
|
test_files: []
|