wayback_machine_downloader_straw 2.3.6 → 2.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader.rb +54 -23
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b739c4ecda1e325f9d5a33872fa71a8a5103f1770cc18c7e1b46516c96c8fef6
|
4
|
+
data.tar.gz: 991cf1f67783f35a8da233e6d9e82edc4d933ef0229d5ecffbe8963c5d049c98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f9b71d59d4c5c5bdb82f58fceacd848242a34b12d15abf93c101e4d61ab8fcab46e60011b80f966b0851474160af153c92ab46db5ed2c2e80b0fec3afdc53f8c
|
7
|
+
data.tar.gz: 88f39d47bb8405f682ddca4236bd2e3ce93ffbfd426c2430532b904c98e7cb1593406271fa4453847ab95615adbffc36049072bd7c8b45b171e2cecb77bb41ab
|
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|
113
113
|
|
114
114
|
include ArchiveAPI
|
115
115
|
|
116
|
-
VERSION = "2.3.6"
|
116
|
+
VERSION = "2.3.7"
|
117
117
|
DEFAULT_TIMEOUT = 30
|
118
118
|
MAX_RETRIES = 3
|
119
119
|
RETRY_DELAY = 2
|
@@ -477,8 +477,8 @@ class WaybackMachineDownloader
|
|
477
477
|
begin
|
478
478
|
@connection_pool.with_connection do |connection|
|
479
479
|
result_message = download_file(file_remote_info, connection)
|
480
|
-
#
|
481
|
-
if result_message &&
|
480
|
+
# assume download success if the result message contains ' -> '
|
481
|
+
if result_message && result_message.include?(' -> ')
|
482
482
|
download_success = true
|
483
483
|
end
|
484
484
|
@download_mutex.synchronize do
|
@@ -659,11 +659,21 @@ class WaybackMachineDownloader
|
|
659
659
|
|
660
660
|
begin
|
661
661
|
structure_dir_path dir_path
|
662
|
-
download_with_retry(file_path, file_url, file_timestamp, http)
|
663
|
-
|
664
|
-
|
662
|
+
status = download_with_retry(file_path, file_url, file_timestamp, http)
|
663
|
+
|
664
|
+
case status
|
665
|
+
when :saved
|
666
|
+
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
667
|
+
rewrite_urls_to_relative(file_path)
|
668
|
+
end
|
669
|
+
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
670
|
+
when :skipped_not_found
|
671
|
+
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
|
672
|
+
else
|
673
|
+
# ideally, this case should not be reached if download_with_retry behaves as expected.
|
674
|
+
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
|
675
|
+
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
|
665
676
|
end
|
666
|
-
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
667
677
|
rescue StandardError => e
|
668
678
|
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
669
679
|
if File.exist?(file_path) and File.size(file_path) == 0
|
@@ -714,8 +724,7 @@ class WaybackMachineDownloader
|
|
714
724
|
|
715
725
|
response = connection.request(request)
|
716
726
|
|
717
|
-
|
718
|
-
when Net::HTTPSuccess
|
727
|
+
save_response_body = lambda do
|
719
728
|
File.open(file_path, "wb") do |file|
|
720
729
|
body = response.body
|
721
730
|
if response['content-encoding'] == 'gzip' && body && !body.empty?
|
@@ -725,26 +734,48 @@ class WaybackMachineDownloader
|
|
725
734
|
gz.close
|
726
735
|
file.write(decompressed_body)
|
727
736
|
rescue Zlib::GzipFile::Error => e
|
728
|
-
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
|
737
|
+
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
|
729
738
|
file.write(body)
|
730
739
|
end
|
731
740
|
else
|
732
741
|
file.write(body) if body
|
733
742
|
end
|
734
743
|
end
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
744
|
+
end
|
745
|
+
|
746
|
+
if @all
|
747
|
+
case response
|
748
|
+
when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
|
749
|
+
save_response_body.call
|
750
|
+
if response.is_a?(Net::HTTPRedirection)
|
751
|
+
@logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
|
752
|
+
elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
|
753
|
+
@logger.info("Saved error page for #{file_url} (status #{response.code}).")
|
754
|
+
end
|
755
|
+
return :saved
|
756
|
+
else
|
757
|
+
# for any other response type when --all is true, treat as an error to be retried or failed
|
758
|
+
raise "Unhandled HTTP response: #{response.code} #{response.message}"
|
759
|
+
end
|
760
|
+
else # not @all (our default behavior)
|
761
|
+
case response
|
762
|
+
when Net::HTTPSuccess
|
763
|
+
save_response_body.call
|
764
|
+
return :saved
|
765
|
+
when Net::HTTPRedirection
|
766
|
+
raise "Too many redirects for #{file_url}" if redirect_count >= 2
|
767
|
+
location = response['location']
|
768
|
+
@logger.warn("Redirect found for #{file_url} -> #{location}")
|
769
|
+
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
|
770
|
+
when Net::HTTPTooManyRequests
|
771
|
+
sleep(RATE_LIMIT * 2)
|
772
|
+
raise "Rate limited, retrying..."
|
773
|
+
when Net::HTTPNotFound
|
774
|
+
@logger.warn("File not found, skipping: #{file_url}")
|
775
|
+
return :skipped_not_found
|
776
|
+
else
|
777
|
+
raise "HTTP Error: #{response.code} #{response.message}"
|
778
|
+
end
|
748
779
|
end
|
749
780
|
|
750
781
|
rescue StandardError => e
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader_straw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.6
|
4
|
+
version: 2.3.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- strawberrymaster
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-05-
|
11
|
+
date: 2025-05-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: concurrent-ruby
|