wayback_machine_downloader_straw 2.3.6 → 2.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader/archive_api.rb +7 -0
- data/lib/wayback_machine_downloader.rb +65 -27
- metadata +3 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: df42d96c68c19fd39b6da3c9e9d51934197484ccb1ceb7a9387116622b0214a7
|
4
|
+
data.tar.gz: d6f04e3dc44c9f216b9d3dc631275fac5e48447ebd963a33818e82baf1ff79b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9654877bb591082e1ef1c5dfdacff0bf887ed68f8ae1b2d995a99b87232523aa3350aede2d8cbb4045dbb15b380a1e93451004a45f881ad323615c0f66632c5
|
7
|
+
data.tar.gz: eb8753d3ceb689e9b8c3f3dbaeeac7c9dd818497f916882d5d3271f1901c099f8b7103e7b49bcef51d71aab86b2607174ac2eece768a092242b0d5e0dcec9b28
|
@@ -4,6 +4,13 @@ require 'uri'
|
|
4
4
|
module ArchiveAPI
|
5
5
|
|
6
6
|
def get_raw_list_from_api(url, page_index, http)
|
7
|
+
# Automatically append /* if the URL doesn't contain a path after the domain
|
8
|
+
# This is a workaround for an issue with the API and *some* domains.
|
9
|
+
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
10
|
+
if url && !url.match(/^https?:\/\/.*\//i)
|
11
|
+
url = "#{url}/*"
|
12
|
+
end
|
13
|
+
|
7
14
|
request_url = URI("https://web.archive.org/cdx/search/cdx")
|
8
15
|
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
9
16
|
request_url.query = URI.encode_www_form(params)
|
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|
113
113
|
|
114
114
|
include ArchiveAPI
|
115
115
|
|
116
|
-
VERSION = "2.3.6"
|
116
|
+
VERSION = "2.3.8"
|
117
117
|
DEFAULT_TIMEOUT = 30
|
118
118
|
MAX_RETRIES = 3
|
119
119
|
RETRY_DELAY = 2
|
@@ -154,10 +154,12 @@ class WaybackMachineDownloader
|
|
154
154
|
end
|
155
155
|
|
156
156
|
def backup_name
|
157
|
-
|
158
|
-
|
157
|
+
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
158
|
+
|
159
|
+
if url_to_process.include? '//'
|
160
|
+
url_to_process.split('/')[2]
|
159
161
|
else
|
160
|
-
|
162
|
+
url_to_process
|
161
163
|
end
|
162
164
|
end
|
163
165
|
|
@@ -241,6 +243,7 @@ class WaybackMachineDownloader
|
|
241
243
|
# Fetch the initial set of snapshots, sequentially
|
242
244
|
@connection_pool.with_connection do |connection|
|
243
245
|
initial_list = get_raw_list_from_api(@base_url, nil, connection)
|
246
|
+
initial_list ||= []
|
244
247
|
mutex.synchronize do
|
245
248
|
snapshot_list_to_consider.concat(initial_list)
|
246
249
|
print "."
|
@@ -265,6 +268,7 @@ class WaybackMachineDownloader
|
|
265
268
|
@connection_pool.with_connection do |connection|
|
266
269
|
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
|
267
270
|
end
|
271
|
+
result ||= []
|
268
272
|
[page, result]
|
269
273
|
end
|
270
274
|
end
|
@@ -284,7 +288,7 @@ class WaybackMachineDownloader
|
|
284
288
|
|
285
289
|
# Process results and check for empty pages
|
286
290
|
results.each do |page, result|
|
287
|
-
if result.empty?
|
291
|
+
if result.nil? || result.empty?
|
288
292
|
continue_fetching = false
|
289
293
|
break
|
290
294
|
else
|
@@ -477,8 +481,8 @@ class WaybackMachineDownloader
|
|
477
481
|
begin
|
478
482
|
@connection_pool.with_connection do |connection|
|
479
483
|
result_message = download_file(file_remote_info, connection)
|
480
|
-
#
|
481
|
-
if result_message &&
|
484
|
+
# assume download success if the result message contains ' -> '
|
485
|
+
if result_message && result_message.include?(' -> ')
|
482
486
|
download_success = true
|
483
487
|
end
|
484
488
|
@download_mutex.synchronize do
|
@@ -659,11 +663,21 @@ class WaybackMachineDownloader
|
|
659
663
|
|
660
664
|
begin
|
661
665
|
structure_dir_path dir_path
|
662
|
-
download_with_retry(file_path, file_url, file_timestamp, http)
|
663
|
-
|
664
|
-
|
666
|
+
status = download_with_retry(file_path, file_url, file_timestamp, http)
|
667
|
+
|
668
|
+
case status
|
669
|
+
when :saved
|
670
|
+
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
671
|
+
rewrite_urls_to_relative(file_path)
|
672
|
+
end
|
673
|
+
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
674
|
+
when :skipped_not_found
|
675
|
+
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
|
676
|
+
else
|
677
|
+
# ideally, this case should not be reached if download_with_retry behaves as expected.
|
678
|
+
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
|
679
|
+
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
|
665
680
|
end
|
666
|
-
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
667
681
|
rescue StandardError => e
|
668
682
|
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
669
683
|
if File.exist?(file_path) and File.size(file_path) == 0
|
@@ -707,6 +721,9 @@ class WaybackMachineDownloader
|
|
707
721
|
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
|
708
722
|
end
|
709
723
|
|
724
|
+
# Escape square brackets because they are not valid in URI()
|
725
|
+
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
|
726
|
+
|
710
727
|
request = Net::HTTP::Get.new(URI(wayback_url))
|
711
728
|
request["Connection"] = "keep-alive"
|
712
729
|
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
@@ -714,8 +731,7 @@ class WaybackMachineDownloader
|
|
714
731
|
|
715
732
|
response = connection.request(request)
|
716
733
|
|
717
|
-
|
718
|
-
when Net::HTTPSuccess
|
734
|
+
save_response_body = lambda do
|
719
735
|
File.open(file_path, "wb") do |file|
|
720
736
|
body = response.body
|
721
737
|
if response['content-encoding'] == 'gzip' && body && !body.empty?
|
@@ -725,26 +741,48 @@ class WaybackMachineDownloader
|
|
725
741
|
gz.close
|
726
742
|
file.write(decompressed_body)
|
727
743
|
rescue Zlib::GzipFile::Error => e
|
728
|
-
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
|
744
|
+
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
|
729
745
|
file.write(body)
|
730
746
|
end
|
731
747
|
else
|
732
748
|
file.write(body) if body
|
733
749
|
end
|
734
750
|
end
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
751
|
+
end
|
752
|
+
|
753
|
+
if @all
|
754
|
+
case response
|
755
|
+
when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
|
756
|
+
save_response_body.call
|
757
|
+
if response.is_a?(Net::HTTPRedirection)
|
758
|
+
@logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
|
759
|
+
elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
|
760
|
+
@logger.info("Saved error page for #{file_url} (status #{response.code}).")
|
761
|
+
end
|
762
|
+
return :saved
|
763
|
+
else
|
764
|
+
# for any other response type when --all is true, treat as an error to be retried or failed
|
765
|
+
raise "Unhandled HTTP response: #{response.code} #{response.message}"
|
766
|
+
end
|
767
|
+
else # not @all (our default behavior)
|
768
|
+
case response
|
769
|
+
when Net::HTTPSuccess
|
770
|
+
save_response_body.call
|
771
|
+
return :saved
|
772
|
+
when Net::HTTPRedirection
|
773
|
+
raise "Too many redirects for #{file_url}" if redirect_count >= 2
|
774
|
+
location = response['location']
|
775
|
+
@logger.warn("Redirect found for #{file_url} -> #{location}")
|
776
|
+
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
|
777
|
+
when Net::HTTPTooManyRequests
|
778
|
+
sleep(RATE_LIMIT * 2)
|
779
|
+
raise "Rate limited, retrying..."
|
780
|
+
when Net::HTTPNotFound
|
781
|
+
@logger.warn("File not found, skipping: #{file_url}")
|
782
|
+
return :skipped_not_found
|
783
|
+
else
|
784
|
+
raise "HTTP Error: #{response.code} #{response.message}"
|
785
|
+
end
|
748
786
|
end
|
749
787
|
|
750
788
|
rescue StandardError => e
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader_straw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.6
|
4
|
+
version: 2.3.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- strawberrymaster
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date: 2025-05
|
10
|
+
date: 2025-06-05 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: concurrent-ruby
|
@@ -78,7 +77,6 @@ homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
|
|
78
77
|
licenses:
|
79
78
|
- MIT
|
80
79
|
metadata: {}
|
81
|
-
post_install_message:
|
82
80
|
rdoc_options: []
|
83
81
|
require_paths:
|
84
82
|
- lib
|
@@ -93,8 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
91
|
- !ruby/object:Gem::Version
|
94
92
|
version: '0'
|
95
93
|
requirements: []
|
96
|
-
rubygems_version: 3.
|
97
|
-
signing_key:
|
94
|
+
rubygems_version: 3.6.2
|
98
95
|
specification_version: 4
|
99
96
|
summary: Download an entire website from the Wayback Machine.
|
100
97
|
test_files: []
|