RubyGems - wayback_machine_downloader_straw - Versions diffs - 2.3.6 → 2.3.8 - Mend

wayback_machine_downloader_straw 2.3.6 → 2.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml +4 -4
data/lib/wayback_machine_downloader/archive_api.rb +7 -0
data/lib/wayback_machine_downloader.rb +65 -27
metadata +3 -6

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 04ac6f9f045b4f92a7481ad8544f2f9138454b9eabdcf6f47b28195c1dd1cdaf
-  data.tar.gz: '09a16685d1299afb338d86495d1c58825482a6785e7e1a596bb02eb2da1fc7f1'
+  metadata.gz: df42d96c68c19fd39b6da3c9e9d51934197484ccb1ceb7a9387116622b0214a7
+  data.tar.gz: d6f04e3dc44c9f216b9d3dc631275fac5e48447ebd963a33818e82baf1ff79b3
 SHA512:
-  metadata.gz: fd157e047c8631ff5cdfd4ca540840a7d49196131dc4de9f9725c3989164151e4c05dda0dae0dc884bfb9bbb51483f061378ef7a1e737b36d1d11882719bcf60
-  data.tar.gz: e9b814bbbed6caef69972b9e94891f7af9be61674cf50bdd3bb1bf4a60c3622156e93b07de8a3761dba87a852bd67aa10439481c4ca72bffe564019f04451ed5
+  metadata.gz: b9654877bb591082e1ef1c5dfdacff0bf887ed68f8ae1b2d995a99b87232523aa3350aede2d8cbb4045dbb15b380a1e93451004a45f881ad323615c0f66632c5
+  data.tar.gz: eb8753d3ceb689e9b8c3f3dbaeeac7c9dd818497f916882d5d3271f1901c099f8b7103e7b49bcef51d71aab86b2607174ac2eece768a092242b0d5e0dcec9b28

data/lib/wayback_machine_downloader/archive_api.rb CHANGED

@@ -4,6 +4,13 @@ require 'uri'
 module ArchiveAPI
   def get_raw_list_from_api(url, page_index, http)
+    # Automatically append /* if the URL doesn't contain a path after the domain
+    # This is a workaround for an issue with the API and *some* domains.
+    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
+    if url && !url.match(/^https?:\/\/.*\//i)
+      url = "#{url}/*"
+    end
     request_url = URI("https://web.archive.org/cdx/search/cdx")
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)

data/lib/wayback_machine_downloader.rb CHANGED

@@ -113,7 +113,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
-  VERSION = "2.3.6"
+  VERSION = "2.3.8"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -154,10 +154,12 @@ class WaybackMachineDownloader
   end
   def backup_name
-    if @base_url.include? '//'
-      @base_url.split('/')[2]
+    url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
+    if url_to_process.include? '//'
+      url_to_process.split('/')[2]
     else
-      @base_url
+      url_to_process
     end
   end
@@ -241,6 +243,7 @@ class WaybackMachineDownloader
     # Fetch the initial set of snapshots, sequentially
     @connection_pool.with_connection do |connection|
       initial_list = get_raw_list_from_api(@base_url, nil, connection)
+      initial_list ||= []
       mutex.synchronize do
         snapshot_list_to_consider.concat(initial_list)
         print "."
@@ -265,6 +268,7 @@ class WaybackMachineDownloader
             @connection_pool.with_connection do |connection|
               result = get_raw_list_from_api("#{@base_url}/*", page, connection)
             end
+            result ||= []
             [page, result]
           end
         end
@@ -284,7 +288,7 @@ class WaybackMachineDownloader
         # Process results and check for empty pages
         results.each do |page, result|
-          if result.empty?
+          if result.nil? || result.empty?
             continue_fetching = false
             break
           else
@@ -477,8 +481,8 @@ class WaybackMachineDownloader
         begin
           @connection_pool.with_connection do |connection|
             result_message = download_file(file_remote_info, connection)
-            # for now, assume success if no exception and message doesn't indicate error/skip
-            if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+            # assume download success if the result message contains ' -> '
+            if result_message && result_message.include?(' -> ')
                download_success = true
             end
             @download_mutex.synchronize do
@@ -659,11 +663,21 @@ class WaybackMachineDownloader
     begin
       structure_dir_path dir_path
-      download_with_retry(file_path, file_url, file_timestamp, http)
-      if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
-        rewrite_urls_to_relative(file_path)
+      status = download_with_retry(file_path, file_url, file_timestamp, http)
+      case status
+      when :saved
+        if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
+          rewrite_urls_to_relative(file_path)
+        end
+        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+      when :skipped_not_found
+        "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+      else
+        # ideally, this case should not be reached if download_with_retry behaves as expected.
+        @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
+        "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
       end
-      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
     rescue StandardError => e
       msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
       if File.exist?(file_path) and File.size(file_path) == 0
@@ -707,6 +721,9 @@ class WaybackMachineDownloader
         "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
       end
+      # Escape square brackets because they are not valid in URI()
+      wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
       request = Net::HTTP::Get.new(URI(wayback_url))
       request["Connection"] = "keep-alive"
       request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
@@ -714,8 +731,7 @@ class WaybackMachineDownloader
       response = connection.request(request)
-      case response
-      when Net::HTTPSuccess
+      save_response_body = lambda do
         File.open(file_path, "wb") do |file|
           body = response.body
           if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -725,26 +741,48 @@ class WaybackMachineDownloader
               gz.close
               file.write(decompressed_body)
             rescue Zlib::GzipFile::Error => e
-              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
               file.write(body)
             end
           else
             file.write(body) if body
           end
         end
-      when Net::HTTPRedirection
-        raise "Too many redirects for #{file_url}" if redirect_count >= 2
-        location = response['location']
-        @logger.warn("Redirect found for #{file_url} -> #{location}")
-        return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
-      when Net::HTTPTooManyRequests
-        sleep(RATE_LIMIT * 2)
-        raise "Rate limited, retrying..."
-      when Net::HTTPNotFound
-        @logger.warn("File not found, skipping: #{file_url}")
-        return
-      else
-        raise "HTTP Error: #{response.code} #{response.message}"
+      end
+      if @all
+        case response
+        when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
+          save_response_body.call
+          if response.is_a?(Net::HTTPRedirection)
+            @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
+          elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
+            @logger.info("Saved error page for #{file_url} (status #{response.code}).")
+          end
+          return :saved
+        else
+          # for any other response type when --all is true, treat as an error to be retried or failed
+          raise "Unhandled HTTP response: #{response.code} #{response.message}"
+        end
+      else # not @all (our default behavior)
+        case response
+        when Net::HTTPSuccess
+          save_response_body.call
+          return :saved
+        when Net::HTTPRedirection
+          raise "Too many redirects for #{file_url}" if redirect_count >= 2
+          location = response['location']
+          @logger.warn("Redirect found for #{file_url} -> #{location}")
+          return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
+        when Net::HTTPTooManyRequests
+          sleep(RATE_LIMIT * 2)
+          raise "Rate limited, retrying..."
+        when Net::HTTPNotFound
+          @logger.warn("File not found, skipping: #{file_url}")
+          return :skipped_not_found
+        else
+          raise "HTTP Error: #{response.code} #{response.message}"
+        end
       end
     rescue StandardError => e

metadata CHANGED

@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.6
+  version: 2.3.8
 platform: ruby
 authors:
 - strawberrymaster
-autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-05-18 00:00:00.000000000 Z
+date: 2025-06-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -78,7 +77,6 @@ homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
 licenses:
 - MIT
 metadata: {}
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -93,8 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.11
-signing_key:
+rubygems_version: 3.6.2
 specification_version: 4
 summary: Download an entire website from the Wayback Machine.
 test_files: []