wayback_machine_downloader_straw 2.3.6 → 2.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 04ac6f9f045b4f92a7481ad8544f2f9138454b9eabdcf6f47b28195c1dd1cdaf
4
- data.tar.gz: '09a16685d1299afb338d86495d1c58825482a6785e7e1a596bb02eb2da1fc7f1'
3
+ metadata.gz: df42d96c68c19fd39b6da3c9e9d51934197484ccb1ceb7a9387116622b0214a7
4
+ data.tar.gz: d6f04e3dc44c9f216b9d3dc631275fac5e48447ebd963a33818e82baf1ff79b3
5
5
  SHA512:
6
- metadata.gz: fd157e047c8631ff5cdfd4ca540840a7d49196131dc4de9f9725c3989164151e4c05dda0dae0dc884bfb9bbb51483f061378ef7a1e737b36d1d11882719bcf60
7
- data.tar.gz: e9b814bbbed6caef69972b9e94891f7af9be61674cf50bdd3bb1bf4a60c3622156e93b07de8a3761dba87a852bd67aa10439481c4ca72bffe564019f04451ed5
6
+ metadata.gz: b9654877bb591082e1ef1c5dfdacff0bf887ed68f8ae1b2d995a99b87232523aa3350aede2d8cbb4045dbb15b380a1e93451004a45f881ad323615c0f66632c5
7
+ data.tar.gz: eb8753d3ceb689e9b8c3f3dbaeeac7c9dd818497f916882d5d3271f1901c099f8b7103e7b49bcef51d71aab86b2607174ac2eece768a092242b0d5e0dcec9b28
@@ -4,6 +4,13 @@ require 'uri'
4
4
  module ArchiveAPI
5
5
 
6
6
  def get_raw_list_from_api(url, page_index, http)
7
+ # Automatically append /* if the URL doesn't contain a path after the domain
8
+ # This is a workaround for an issue with the API and *some* domains.
9
+ # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
10
+ if url && !url.match(/^https?:\/\/.*\//i)
11
+ url = "#{url}/*"
12
+ end
13
+
7
14
  request_url = URI("https://web.archive.org/cdx/search/cdx")
8
15
  params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
9
16
  request_url.query = URI.encode_www_form(params)
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
113
113
 
114
114
  include ArchiveAPI
115
115
 
116
- VERSION = "2.3.6"
116
+ VERSION = "2.3.8"
117
117
  DEFAULT_TIMEOUT = 30
118
118
  MAX_RETRIES = 3
119
119
  RETRY_DELAY = 2
@@ -154,10 +154,12 @@ class WaybackMachineDownloader
154
154
  end
155
155
 
156
156
  def backup_name
157
- if @base_url.include? '//'
158
- @base_url.split('/')[2]
157
+ url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
158
+
159
+ if url_to_process.include? '//'
160
+ url_to_process.split('/')[2]
159
161
  else
160
- @base_url
162
+ url_to_process
161
163
  end
162
164
  end
163
165
 
@@ -241,6 +243,7 @@ class WaybackMachineDownloader
241
243
  # Fetch the initial set of snapshots, sequentially
242
244
  @connection_pool.with_connection do |connection|
243
245
  initial_list = get_raw_list_from_api(@base_url, nil, connection)
246
+ initial_list ||= []
244
247
  mutex.synchronize do
245
248
  snapshot_list_to_consider.concat(initial_list)
246
249
  print "."
@@ -265,6 +268,7 @@ class WaybackMachineDownloader
265
268
  @connection_pool.with_connection do |connection|
266
269
  result = get_raw_list_from_api("#{@base_url}/*", page, connection)
267
270
  end
271
+ result ||= []
268
272
  [page, result]
269
273
  end
270
274
  end
@@ -284,7 +288,7 @@ class WaybackMachineDownloader
284
288
 
285
289
  # Process results and check for empty pages
286
290
  results.each do |page, result|
287
- if result.empty?
291
+ if result.nil? || result.empty?
288
292
  continue_fetching = false
289
293
  break
290
294
  else
@@ -477,8 +481,8 @@ class WaybackMachineDownloader
477
481
  begin
478
482
  @connection_pool.with_connection do |connection|
479
483
  result_message = download_file(file_remote_info, connection)
480
- # for now, assume success if no exception and message doesn't indicate error/skip
481
- if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
484
+ # assume download success if the result message contains ' -> '
485
+ if result_message && result_message.include?(' -> ')
482
486
  download_success = true
483
487
  end
484
488
  @download_mutex.synchronize do
@@ -659,11 +663,21 @@ class WaybackMachineDownloader
659
663
 
660
664
  begin
661
665
  structure_dir_path dir_path
662
- download_with_retry(file_path, file_url, file_timestamp, http)
663
- if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
664
- rewrite_urls_to_relative(file_path)
666
+ status = download_with_retry(file_path, file_url, file_timestamp, http)
667
+
668
+ case status
669
+ when :saved
670
+ if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
671
+ rewrite_urls_to_relative(file_path)
672
+ end
673
+ "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
674
+ when :skipped_not_found
675
+ "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
676
+ else
677
+ # ideally, this case should not be reached if download_with_retry behaves as expected.
678
+ @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
679
+ "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
665
680
  end
666
- "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
667
681
  rescue StandardError => e
668
682
  msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
669
683
  if File.exist?(file_path) and File.size(file_path) == 0
@@ -707,6 +721,9 @@ class WaybackMachineDownloader
707
721
  "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
708
722
  end
709
723
 
724
+ # Escape square brackets because they are not valid in URI()
725
+ wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
726
+
710
727
  request = Net::HTTP::Get.new(URI(wayback_url))
711
728
  request["Connection"] = "keep-alive"
712
729
  request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
@@ -714,8 +731,7 @@ class WaybackMachineDownloader
714
731
 
715
732
  response = connection.request(request)
716
733
 
717
- case response
718
- when Net::HTTPSuccess
734
+ save_response_body = lambda do
719
735
  File.open(file_path, "wb") do |file|
720
736
  body = response.body
721
737
  if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -725,26 +741,48 @@ class WaybackMachineDownloader
725
741
  gz.close
726
742
  file.write(decompressed_body)
727
743
  rescue Zlib::GzipFile::Error => e
728
- @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
744
+ @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
729
745
  file.write(body)
730
746
  end
731
747
  else
732
748
  file.write(body) if body
733
749
  end
734
750
  end
735
- when Net::HTTPRedirection
736
- raise "Too many redirects for #{file_url}" if redirect_count >= 2
737
- location = response['location']
738
- @logger.warn("Redirect found for #{file_url} -> #{location}")
739
- return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
740
- when Net::HTTPTooManyRequests
741
- sleep(RATE_LIMIT * 2)
742
- raise "Rate limited, retrying..."
743
- when Net::HTTPNotFound
744
- @logger.warn("File not found, skipping: #{file_url}")
745
- return
746
- else
747
- raise "HTTP Error: #{response.code} #{response.message}"
751
+ end
752
+
753
+ if @all
754
+ case response
755
+ when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
756
+ save_response_body.call
757
+ if response.is_a?(Net::HTTPRedirection)
758
+ @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
759
+ elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
760
+ @logger.info("Saved error page for #{file_url} (status #{response.code}).")
761
+ end
762
+ return :saved
763
+ else
764
+ # for any other response type when --all is true, treat as an error to be retried or failed
765
+ raise "Unhandled HTTP response: #{response.code} #{response.message}"
766
+ end
767
+ else # not @all (our default behavior)
768
+ case response
769
+ when Net::HTTPSuccess
770
+ save_response_body.call
771
+ return :saved
772
+ when Net::HTTPRedirection
773
+ raise "Too many redirects for #{file_url}" if redirect_count >= 2
774
+ location = response['location']
775
+ @logger.warn("Redirect found for #{file_url} -> #{location}")
776
+ return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
777
+ when Net::HTTPTooManyRequests
778
+ sleep(RATE_LIMIT * 2)
779
+ raise "Rate limited, retrying..."
780
+ when Net::HTTPNotFound
781
+ @logger.warn("File not found, skipping: #{file_url}")
782
+ return :skipped_not_found
783
+ else
784
+ raise "HTTP Error: #{response.code} #{response.message}"
785
+ end
748
786
  end
749
787
 
750
788
  rescue StandardError => e
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.6
4
+ version: 2.3.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2025-05-18 00:00:00.000000000 Z
10
+ date: 2025-06-05 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: concurrent-ruby
@@ -78,7 +77,6 @@ homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
78
77
  licenses:
79
78
  - MIT
80
79
  metadata: {}
81
- post_install_message:
82
80
  rdoc_options: []
83
81
  require_paths:
84
82
  - lib
@@ -93,8 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
91
  - !ruby/object:Gem::Version
94
92
  version: '0'
95
93
  requirements: []
96
- rubygems_version: 3.5.11
97
- signing_key:
94
+ rubygems_version: 3.6.2
98
95
  specification_version: 4
99
96
  summary: Download an entire website from the Wayback Machine.
100
97
  test_files: []