RubyGems - wayback_machine_downloader_straw - Versions diffs - 2.4.6 → 2.4.7 - Mend

wayback_machine_downloader_straw 2.4.6 → 2.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/bin/wayback_machine_downloader +4 -0
data/lib/wayback_machine_downloader/archive_api.rb +17 -6
data/lib/wayback_machine_downloader.rb +97 -19
metadata +4 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e52df092b59b0eec27b390b5b00fcfc17fab271acd6cd9df774912f57cfc4dc1
-  data.tar.gz: ce170b42caad7e8136b07c2aa5cb6e751f57dd64bd40c0addcd42a31798d0047
+  metadata.gz: 558d3187ee31faeadb08cf83e32a87307ae9d55a3327206598f27a78fb715e08
+  data.tar.gz: 9845999e0e618afde419869bb01b04277aca318aa80b238feb8252540fc16315
 SHA512:
-  metadata.gz: 558e9cdfc3d7d4d2081ccb49b12a96bdb64b7768697eb0a1b9a431ed1ad3017ce894975e046a6e50766928c3863797715c7b45d013b6ab7ad78bca59ea86c6d0
-  data.tar.gz: af3064f1489d32cf078fd5d87d2773700e9dfa498075f089029e0e7ec47c500c7815e84d51f426bb6fc3067bf02c9a9404da3b6f74d263c99b4ae96fc32dab35
+  metadata.gz: b323c1065ea1ab1d3c5909458cae726462ba1b88fd89effe8cd1efbdd1301d2022363c56b191f8ddbe28c0143fe87dfc94ca847865bb08e3564e29ba36f231a4
+  data.tar.gz: 00c4e775ee05e176e1048d6e5ddd9ddc436edaf9bf7fea61dbab33480e4adc499ea8a3a584e591858aad3a4c2f10096a3b6eb1ff6c8146f26ba6a3d42f104e32

data/bin/wayback_machine_downloader CHANGED Viewed

@@ -48,6 +48,10 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
+  opts.on("--keep-duplicates", "Do not collapse duplicate CDX captures by digest") do |t|
+    options[:keep_duplicates] = true
+  end
   opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end

data/lib/wayback_machine_downloader/archive_api.rb CHANGED Viewed

@@ -4,12 +4,22 @@ require 'uri'
 module ArchiveAPI
   def get_raw_list_from_api(url, page_index, http)
-    # Automatically append /* if the URL doesn't contain a path after the domain
+    # Automatically append /* for host-only URLs
     # This is a workaround for an issue with the API and *some* domains.
     # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
-    # But don't do this when exact_url flag is set
-    if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
-      url = "#{url}/*"
+    # But don't do this when exact_url flag is set, and never append twice
+    if url && !@exact_url
+      normalized_url = url.to_s
+      has_wildcard = normalized_url.include?('*')
+      host_and_rest = normalized_url
+        .sub(/\Ahttps?:\/\//i, '')
+        .split(/[?#]/, 2)
+        .first
+      has_path = host_and_rest.include?('/')
+      unless has_wildcard || has_path
+        url = "#{normalized_url}/*"
+      end
     end
     request_url = URI("https://web.archive.org/cdx/search/cdx")
@@ -63,8 +73,9 @@ module ArchiveAPI
   end
   def parameters_for_api(page_index)
-    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "true"]]
-    parameters.push(["filter", "statuscode:200"]) unless @all
+    parameters = [["fl", "timestamp,original"], ["gzip", "true"]]
+    parameters.push(["collapse", "digest"]) unless @keep_duplicates || @all_timestamps
+    parameters.push(["filter", "statuscode:2..|30[12378]"]) unless @all
     parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
     parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
     parameters.push(["page", page_index]) if page_index

data/lib/wayback_machine_downloader.rb CHANGED Viewed

@@ -2,11 +2,8 @@
 require 'thread'
 require 'net/http'
-require 'open-uri'
 require 'fileutils'
-require 'cgi'
 require 'json'
-require 'time'
 require 'concurrent-ruby'
 require 'logger'
 require 'zlib'
@@ -133,7 +130,7 @@ class WaybackMachineDownloader
   include SubdomainProcessor
   include URLRewrite
-  VERSION = "2.4.6"
+  VERSION = "2.4.7"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -146,7 +143,7 @@ class WaybackMachineDownloader
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
+    :all, :keep_duplicates, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
     :snapshot_at, :page_requisites
   def initialize params
@@ -165,6 +162,7 @@ class WaybackMachineDownloader
     @only_filter = params[:only_filter]
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
+    @keep_duplicates = params[:keep_duplicates] || false
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = [params[:threads_count].to_i, 1].max
     @rewritten = params[:rewritten]
@@ -293,10 +291,8 @@ class WaybackMachineDownloader
     # if snapshot_at is set, limit CDX queries to snapshots at or before that timestamp
     original_to = @to_timestamp
-    skip_cache = false
     if @snapshot_at
       @to_timestamp = @snapshot_at
-      skip_cache = true
     end
     puts "Getting snapshot pages from Wayback Machine API..."
@@ -329,7 +325,7 @@ class WaybackMachineDownloader
             Concurrent::Future.execute(executor: fetch_pool) do
               result = nil
               @connection_pool.with_connection do |connection|
-                result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+                result = get_raw_list_from_api(@base_url, page, connection)
               end
               result ||= []
               [page, result]
@@ -382,10 +378,8 @@ class WaybackMachineDownloader
     # save the fetched list to the cache file
     begin
       FileUtils.mkdir_p(File.dirname(cdx_path))
-      unless skip_cache
-        File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
-        puts "Saved snapshot list to #{cdx_path}"
-      end
+      File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
+      puts "Saved snapshot list to #{cdx_path}"
     rescue => e
       puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
     ensure
@@ -532,7 +526,16 @@ class WaybackMachineDownloader
     if File.exist?(db_path) && !@reset
       puts "Loading list of already downloaded files from #{db_path}"
       begin
-        File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
+        File.foreach(db_path) do |line|
+          id = line.strip
+          # only trust DB entries that actually exist on disk; this helps when resuming
+          path = local_path_for_file_id(id)
+          if path && File.exist?(path)
+            downloaded_ids.add(id)
+          else
+            puts "Found DB entry but file missing, will requeue: #{id}" if @logger && @logger.level == Logger::DEBUG
+          end
+        end
       rescue => e
         puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
         downloaded_ids.clear
@@ -687,6 +690,23 @@ class WaybackMachineDownloader
   def process_single_file(file_remote_info)
     download_success = false
     downloaded_path = nil
+    # fast-path for resumed runs: if file already exists locally, avoid HTTP work entirely
+    existing_path = local_path_for_file_id(file_remote_info[:file_id])
+    if existing_path && File.exist?(existing_path)
+      result_message = "#{color("[EXISTS]", :cyan)} #{file_remote_info[:file_url]} (#{@processed_file_count + 1}/#{@total_to_download})"
+      @download_mutex.synchronize do
+        @processed_file_count += 1 if @processed_file_count < @total_to_download
+        puts result_message
+      end
+      append_to_db(file_remote_info[:file_id])
+      if @page_requisites && File.extname(existing_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
+        process_page_requisites(existing_path, file_remote_info)
+      end
+      return
+    end
     @connection_pool.with_connection do |connection|
       result_message, downloaded_path = download_file(file_remote_info, connection)
@@ -994,6 +1014,27 @@ class WaybackMachineDownloader
     end
   end
+  # derive the local filesystem path for a sanitized `file_id` stored in the DB
+  def local_path_for_file_id(file_id)
+    return nil if file_id.nil?
+    current_backup_path = backup_path
+    # file_id coming from DB is expected to already be sanitized
+    raw_path_elements = file_id.split('/')
+    if file_id == ""
+      dir_path = current_backup_path
+      return File.join(dir_path, 'index.html')
+    elsif file_id[-1] == '/' || (raw_path_elements.last && !raw_path_elements.last.include?('.'))
+      dir_path = File.join(current_backup_path, *raw_path_elements)
+      return File.join(dir_path, 'index.html')
+    else
+      filename = raw_path_elements.pop
+      dir_path = File.join(current_backup_path, *raw_path_elements)
+      return File.join(dir_path, filename)
+    end
+  end
   def color(text, color_code)
     return text if Gem.win_platform? && !ENV['ENABLE_ANSI']
     codes = { red: 31, green: 32, yellow: 33, blue: 34, magenta: 35, cyan: 36, white: 37 }
@@ -1111,6 +1152,46 @@ class WaybackMachineDownloader
     end
   end
+  def build_wayback_url(source_url, file_timestamp)
+    source = source_url.to_s
+    return source if wayback_archive_url?(source)
+    if source.start_with?('/web/')
+      return "https://web.archive.org#{source}"
+    end
+    if @rewritten
+      "https://web.archive.org/web/#{file_timestamp}/#{source}"
+    else
+      "https://web.archive.org/web/#{file_timestamp}id_/#{source}"
+    end
+  end
+  def wayback_archive_url?(url)
+    url.to_s.match?(%r{\Ahttps?://web\.archive\.org/web/})
+  end
+  def extract_original_url(url)
+    match = url.to_s.match(%r{\Ahttps?://web\.archive\.org/web/\d{1,14}(?:[a-z_]*)/(https?://.+)\z})
+    match && match[1]
+  end
+  def resolve_redirect_source(current_source_url, location)
+    return nil if location.nil? || location.empty?
+    location = location.to_s
+    return location if wayback_archive_url?(location)
+    if location.start_with?('/web/')
+      return "https://web.archive.org#{location}"
+    end
+    base_url = extract_original_url(current_source_url) || current_source_url.to_s
+    URI.join(base_url, location).to_s
+  rescue URI::InvalidURIError
+    location
+  end
   # wrap URL in parentheses if it contains characters that commonly break unquoted
   # Windows CMD usage (e.g., &). This is only for display; user still must quote
   # when invoking manually.
@@ -1122,11 +1203,7 @@ class WaybackMachineDownloader
   def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
     retries = 0
     begin
-      wayback_url = if @rewritten
-        "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
-      else
-        "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
-      end
+      wayback_url = build_wayback_url(file_url, file_timestamp)
       # Escape characters that are not valid in URI()
       wayback_url = wayback_url.gsub(' ', '%20').gsub('[', '%5B').gsub(']', '%5D')
@@ -1193,7 +1270,8 @@ class WaybackMachineDownloader
           raise "Too many redirects for #{file_url}" if redirect_count >= 5
           location = response['location']
           @logger.warn("Redirect found for #{file_url} -> #{location}")
-          return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
+          redirected_source = resolve_redirect_source(file_url, location)
+          return download_with_retry(file_path, redirected_source, file_timestamp, connection, redirect_count + 1)
         when Net::HTTPTooManyRequests
           sleep(RATE_LIMIT * 2)
           raise "Rate limited, retrying..."

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.6
+  version: 2.4.7
 platform: ruby
 authors:
 - strawberrymaster
@@ -18,7 +18,7 @@ dependencies:
         version: '1.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.3.4
+        version: 1.3.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -28,7 +28,7 @@ dependencies:
         version: '1.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.3.4
+        version: 1.3.6
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -94,7 +94,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 4.0.3
+rubygems_version: 4.0.6
 specification_version: 4
 summary: Download an entire website from the Wayback Machine.
 test_files: []