wayback_machine_downloader_straw 2.3.6 → 2.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 04ac6f9f045b4f92a7481ad8544f2f9138454b9eabdcf6f47b28195c1dd1cdaf
4
- data.tar.gz: '09a16685d1299afb338d86495d1c58825482a6785e7e1a596bb02eb2da1fc7f1'
3
+ metadata.gz: b739c4ecda1e325f9d5a33872fa71a8a5103f1770cc18c7e1b46516c96c8fef6
4
+ data.tar.gz: 991cf1f67783f35a8da233e6d9e82edc4d933ef0229d5ecffbe8963c5d049c98
5
5
  SHA512:
6
- metadata.gz: fd157e047c8631ff5cdfd4ca540840a7d49196131dc4de9f9725c3989164151e4c05dda0dae0dc884bfb9bbb51483f061378ef7a1e737b36d1d11882719bcf60
7
- data.tar.gz: e9b814bbbed6caef69972b9e94891f7af9be61674cf50bdd3bb1bf4a60c3622156e93b07de8a3761dba87a852bd67aa10439481c4ca72bffe564019f04451ed5
6
+ metadata.gz: f9b71d59d4c5c5bdb82f58fceacd848242a34b12d15abf93c101e4d61ab8fcab46e60011b80f966b0851474160af153c92ab46db5ed2c2e80b0fec3afdc53f8c
7
+ data.tar.gz: 88f39d47bb8405f682ddca4236bd2e3ce93ffbfd426c2430532b904c98e7cb1593406271fa4453847ab95615adbffc36049072bd7c8b45b171e2cecb77bb41ab
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
113
113
 
114
114
  include ArchiveAPI
115
115
 
116
- VERSION = "2.3.6"
116
+ VERSION = "2.3.7"
117
117
  DEFAULT_TIMEOUT = 30
118
118
  MAX_RETRIES = 3
119
119
  RETRY_DELAY = 2
@@ -477,8 +477,8 @@ class WaybackMachineDownloader
477
477
  begin
478
478
  @connection_pool.with_connection do |connection|
479
479
  result_message = download_file(file_remote_info, connection)
480
- # for now, assume success if no exception and message doesn't indicate error/skip
481
- if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
480
+ # assume download success if the result message contains ' -> '
481
+ if result_message && result_message.include?(' -> ')
482
482
  download_success = true
483
483
  end
484
484
  @download_mutex.synchronize do
@@ -659,11 +659,21 @@ class WaybackMachineDownloader
659
659
 
660
660
  begin
661
661
  structure_dir_path dir_path
662
- download_with_retry(file_path, file_url, file_timestamp, http)
663
- if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
664
- rewrite_urls_to_relative(file_path)
662
+ status = download_with_retry(file_path, file_url, file_timestamp, http)
663
+
664
+ case status
665
+ when :saved
666
+ if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
667
+ rewrite_urls_to_relative(file_path)
668
+ end
669
+ "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
670
+ when :skipped_not_found
671
+ "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
672
+ else
673
+ # ideally, this case should not be reached if download_with_retry behaves as expected.
674
+ @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
675
+ "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
665
676
  end
666
- "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
667
677
  rescue StandardError => e
668
678
  msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
669
679
  if File.exist?(file_path) and File.size(file_path) == 0
@@ -714,8 +724,7 @@ class WaybackMachineDownloader
714
724
 
715
725
  response = connection.request(request)
716
726
 
717
- case response
718
- when Net::HTTPSuccess
727
+ save_response_body = lambda do
719
728
  File.open(file_path, "wb") do |file|
720
729
  body = response.body
721
730
  if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -725,26 +734,48 @@ class WaybackMachineDownloader
725
734
  gz.close
726
735
  file.write(decompressed_body)
727
736
  rescue Zlib::GzipFile::Error => e
728
- @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
737
+ @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
729
738
  file.write(body)
730
739
  end
731
740
  else
732
741
  file.write(body) if body
733
742
  end
734
743
  end
735
- when Net::HTTPRedirection
736
- raise "Too many redirects for #{file_url}" if redirect_count >= 2
737
- location = response['location']
738
- @logger.warn("Redirect found for #{file_url} -> #{location}")
739
- return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
740
- when Net::HTTPTooManyRequests
741
- sleep(RATE_LIMIT * 2)
742
- raise "Rate limited, retrying..."
743
- when Net::HTTPNotFound
744
- @logger.warn("File not found, skipping: #{file_url}")
745
- return
746
- else
747
- raise "HTTP Error: #{response.code} #{response.message}"
744
+ end
745
+
746
+ if @all
747
+ case response
748
+ when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
749
+ save_response_body.call
750
+ if response.is_a?(Net::HTTPRedirection)
751
+ @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
752
+ elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
753
+ @logger.info("Saved error page for #{file_url} (status #{response.code}).")
754
+ end
755
+ return :saved
756
+ else
757
+ # for any other response type when --all is true, treat as an error to be retried or failed
758
+ raise "Unhandled HTTP response: #{response.code} #{response.message}"
759
+ end
760
+ else # not @all (our default behavior)
761
+ case response
762
+ when Net::HTTPSuccess
763
+ save_response_body.call
764
+ return :saved
765
+ when Net::HTTPRedirection
766
+ raise "Too many redirects for #{file_url}" if redirect_count >= 2
767
+ location = response['location']
768
+ @logger.warn("Redirect found for #{file_url} -> #{location}")
769
+ return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
770
+ when Net::HTTPTooManyRequests
771
+ sleep(RATE_LIMIT * 2)
772
+ raise "Rate limited, retrying..."
773
+ when Net::HTTPNotFound
774
+ @logger.warn("File not found, skipping: #{file_url}")
775
+ return :skipped_not_found
776
+ else
777
+ raise "HTTP Error: #{response.code} #{response.message}"
778
+ end
748
779
  end
749
780
 
750
781
  rescue StandardError => e
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.6
4
+ version: 2.3.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-05-18 00:00:00.000000000 Z
11
+ date: 2025-05-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: concurrent-ruby