wayback_machine_downloader_straw 2.3.3 → 2.3.4

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: a8d577ca08cca3858efd95bfd879b198a57aa6262fa8e0a7f83ab4f3a362f1fc
- data.tar.gz: ef73d81d745e7b3e9226458a66b5d54c2410db646ea85cc7145813bc26789dc7
+ metadata.gz: 220999514eb0c1dd5bce948a2ac028e4527eb07f089b9f6b437f02a6a00860be
+ data.tar.gz: '03780351285ee37d38ba04652725ffa33f6112837e01e68469c8be3cda13eb45'
  SHA512:
- metadata.gz: 938e8544bb16b4afc6c81d0e4da602b5d3cd3e05482b3cc945ad3405681278fc03c7ccc1b84992b1ecafe66cf202aa71f306226b92f4b93b30b1c5c7edcbc86e
- data.tar.gz: 8c236877be6274b9bb3c474fde9ad5a72a30abd5db3eadfc11eeae997488a2ddf27a72388b6d7d47455235ed0e084b1d14a7782911d0dd69e41f2fdcada713e2
+ metadata.gz: 3b05448a6271b8e45d5655b5ee415851f6e8e2daaec5f9bb12b0681e58292c06fe4ab91ab4f2ca1530edb0632755808dc8a465165c5e73fea2673481dddad610
+ data.tar.gz: 95440ee51316da6f2e48c3ec1d54f9fc391b2d59447625f07052222ecfeacf6fc26d430ac64ede6da589c0115fddc7a71fc7eb2fa45ff403491f1b3dc51b66ec
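These digests cover the metadata.gz and data.tar.gz members inside the released .gem archive (a .gem file is a plain tar). A minimal Ruby sketch for recomputing the SHA256 values locally; the filename is an assumption about what `gem fetch wayback_machine_downloader_straw -v 2.3.4` produces:

    require 'digest'
    require 'rubygems/package'

    # Assumed local filename of the fetched release
    gem_file = 'wayback_machine_downloader_straw-2.3.4.gem'

    # A .gem is a tar archive containing metadata.gz, data.tar.gz and checksums.yaml.gz
    Gem::Package::TarReader.new(File.open(gem_file, 'rb')) do |tar|
      tar.each do |entry|
        next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
        puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
      end
    end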
@@ -59,7 +59,15 @@ option_parser = OptionParser.new do |opts|
  end
 
  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
- options[:rewritten] = t
+ options[:rewritten] = true
+ end
+
+ opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+ options[:reset] = true
+ end
+
+ opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+ options[:keep] = true
  end
 
  opts.on("-v", "--version", "Display version") do |t|
@@ -4,7 +4,7 @@ require 'uri'
  module ArchiveAPI
 
  def get_raw_list_from_api(url, page_index, http)
- request_url = URI("https://web.archive.org/cdx/search/xd")
+ request_url = URI("https://web.archive.org/cdx/search/cdx")
  params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
  request_url.query = URI.encode_www_form(params)
 
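Version 2.3.3 queried /cdx/search/xd, a typo for the real CDX endpoint; 2.3.4 restores the correct path. A standalone sketch of the corrected query, using example.com as a placeholder URL and a small limit to keep the response short:

    require 'net/http'
    require 'uri'
    require 'json'

    # Same query shape the gem builds, against the corrected endpoint
    request_url = URI("https://web.archive.org/cdx/search/cdx")
    request_url.query = URI.encode_www_form([["output", "json"], ["url", "example.com"], ["limit", "5"]])

    rows = JSON.parse(Net::HTTP.get(request_url))
    # With output=json the first row is the column header,
    # e.g. ["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length"]
    puts rows.first.inspect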
@@ -9,6 +9,8 @@ require 'json'
  require 'time'
  require 'concurrent-ruby'
  require 'logger'
+ require 'zlib'
+ require 'stringio'
  require_relative 'wayback_machine_downloader/tidy_bytes'
  require_relative 'wayback_machine_downloader/to_regex'
  require_relative 'wayback_machine_downloader/archive_api'
@@ -111,17 +113,19 @@ class WaybackMachineDownloader
 
  include ArchiveAPI
 
- VERSION = "2.3.3"
+ VERSION = "2.3.4"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
  RATE_LIMIT = 0.25 # Delay between requests in seconds
  CONNECTION_POOL_SIZE = 10
  MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
+ STATE_CDX_FILENAME = ".cdx.json"
+ STATE_DB_FILENAME = ".downloaded.txt"
 
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
- :all, :maximum_pages, :threads_count, :logger
+ :all, :maximum_pages, :threads_count, :logger, :reset, :keep
 
  def initialize params
  validate_params(params)
@@ -137,10 +141,15 @@ class WaybackMachineDownloader
  @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
  @threads_count = [params[:threads_count].to_i, 1].max
  @rewritten = params[:rewritten]
+ @reset = params[:reset]
+ @keep = params[:keep]
  @timeout = params[:timeout] || DEFAULT_TIMEOUT
  @logger = setup_logger
  @failed_downloads = Concurrent::Array.new
  @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
+ @db_mutex = Mutex.new
+
+ handle_reset
  end
 
  def backup_name
@@ -163,6 +172,23 @@ class WaybackMachineDownloader
  end
  end
 
+ def cdx_path
+ File.join(backup_path, STATE_CDX_FILENAME)
+ end
+
+ def db_path
+ File.join(backup_path, STATE_DB_FILENAME)
+ end
+
+ def handle_reset
+ if @reset
+ puts "Resetting download state..."
+ FileUtils.rm_f(cdx_path)
+ FileUtils.rm_f(db_path)
+ puts "Removed state files: #{cdx_path}, #{db_path}"
+ end
+ end
+
  def match_only_filter file_url
  if @only_filter
  only_filter_regex = @only_filter.to_regex
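Both state files live directly inside the backup directory: .cdx.json caches the snapshot list as a JSON array, and .downloaded.txt is an append-only log with one downloaded file ID per line. A small inspection sketch; the websites/example.com path is only an assumption about where the backup was written:

    require 'json'

    # Assumed backup directory; adjust to wherever the downloader wrote its output
    backup_path = File.join("websites", "example.com")

    snapshots = JSON.parse(File.read(File.join(backup_path, ".cdx.json")))
    puts "#{snapshots.length} cached snapshots"

    downloaded = File.readlines(File.join(backup_path, ".downloaded.txt"), chomp: true)
    puts "#{downloaded.length} file IDs already downloaded"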
@@ -190,28 +216,100 @@ class WaybackMachineDownloader
  end
 
  def get_all_snapshots_to_consider
- snapshot_list_to_consider = []
+ if File.exist?(cdx_path) && !@reset
+ puts "Loading snapshot list from #{cdx_path}"
+ begin
+ snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
+ puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
+ puts
+ return Concurrent::Array.new(snapshot_list_to_consider)
+ rescue JSON::ParserError => e
+ puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
+ FileUtils.rm_f(cdx_path)
+ rescue => e
+ puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
+ FileUtils.rm_f(cdx_path)
+ end
+ end
+
+ snapshot_list_to_consider = Concurrent::Array.new
+ mutex = Mutex.new
 
+ puts "Getting snapshot pages from Wayback Machine API..."
+
+ # Fetch the initial set of snapshots, sequentially
  @connection_pool.with_connection do |connection|
- puts "Getting snapshot pages"
+ initial_list = get_raw_list_from_api(@base_url, nil, connection)
+ mutex.synchronize do
+ snapshot_list_to_consider.concat(initial_list)
+ print "."
+ end
+ end
+
+ # Fetch additional pages if the exact URL flag is not set
+ unless @exact_url
+ page_index = 0
+ batch_size = [@threads_count, 5].min
+ continue_fetching = true
+
+ while continue_fetching && page_index < @maximum_pages
+ # Determine the range of pages to fetch in this batch
+ end_index = [page_index + batch_size, @maximum_pages].min
+ current_batch = (page_index...end_index).to_a
+
+ # Create futures for concurrent API calls
+ futures = current_batch.map do |page|
+ Concurrent::Future.execute do
+ result = nil
+ @connection_pool.with_connection do |connection|
+ result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+ end
+ [page, result]
+ end
+ end
 
- # Fetch the initial set of snapshots
- snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
- print "."
+ results = []
 
- # Fetch additional pages if the exact URL flag is not set
- unless @exact_url
- @maximum_pages.times do |page_index|
- snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection)
- break if snapshot_list.empty?
+ futures.each do |future|
+ begin
+ results << future.value
+ rescue => e
+ puts "\nError fetching page #{future}: #{e.message}"
+ end
+ end
+
+ # Sort results by page number to maintain order
+ results.sort_by! { |page, _| page }
 
- snapshot_list_to_consider += snapshot_list
- print "."
+ # Process results and check for empty pages
+ results.each do |page, result|
+ if result.empty?
+ continue_fetching = false
+ break
+ else
+ mutex.synchronize do
+ snapshot_list_to_consider.concat(result)
+ print "."
+ end
+ end
  end
+
+ page_index = end_index
+
+ sleep(RATE_LIMIT) if continue_fetching
  end
  end
 
- puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+ puts " found #{snapshot_list_to_consider.length} snapshots."
+
+ # Save the fetched list to the cache file
+ begin
+ FileUtils.mkdir_p(File.dirname(cdx_path))
+ File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
+ puts "Saved snapshot list to #{cdx_path}"
+ rescue => e
+ puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
+ end
  puts
 
  snapshot_list_to_consider
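The rewritten pagination loop fans each batch of CDX pages out to Concurrent::Future tasks and stops at the first empty page. A reduced, standalone sketch of that pattern, with a stubbed page fetch standing in for the real API call:

    require 'concurrent'

    # Stub standing in for get_raw_list_from_api; returns an empty page after page 11
    fetch_page = ->(page) { page < 12 ? ["row-#{page}"] : [] }

    rows_collected = []
    batch_size = 5
    page_index = 0
    keep_going = true

    while keep_going && page_index < 100
      batch = (page_index...[page_index + batch_size, 100].min).to_a

      futures = batch.map do |page|
        Concurrent::Future.execute { [page, fetch_page.call(page)] }
      end

      # value blocks until each task finishes; re-sort to preserve page order
      futures.map(&:value).sort_by { |page, _| page }.each do |_page, rows|
        if rows.empty?
          keep_going = false
          break
        end
        rows_collected.concat(rows)
      end

      page_index += batch_size
    end

    puts "collected #{rows_collected.length} rows"   # => collected 12 rows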
@@ -301,32 +399,103 @@ class WaybackMachineDownloader
  puts "]"
  end
 
+ def load_downloaded_ids
+ downloaded_ids = Set.new
+ if File.exist?(db_path) && !@reset
+ puts "Loading list of already downloaded files from #{db_path}"
+ begin
+ File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
+ rescue => e
+ puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
+ downloaded_ids.clear
+ end
+ end
+ downloaded_ids
+ end
+
+ def append_to_db(file_id)
+ @db_mutex.synchronize do
+ begin
+ FileUtils.mkdir_p(File.dirname(db_path))
+ File.open(db_path, 'a') { |f| f.puts(file_id) }
+ rescue => e
+ @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
+ end
+ end
+ end
+
  def download_files
  start_time = Time.now
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
-
- if file_list_by_timestamp.empty?
- puts "No files to download."
+
+ FileUtils.mkdir_p(backup_path)
+
+ # Load the list of files to potentially download
+ files_to_download = file_list_by_timestamp
+
+ if files_to_download.empty?
+ puts "No files found matching criteria."
+ cleanup
+ return
+ end
+
+ total_files = files_to_download.count
+ puts "#{total_files} files found matching criteria."
+
+ # Load IDs of already downloaded files
+ downloaded_ids = load_downloaded_ids
+ files_to_process = files_to_download.reject do |file_info|
+ downloaded_ids.include?(file_info[:file_id])
+ end
+
+ remaining_count = files_to_process.count
+ skipped_count = total_files - remaining_count
+
+ if skipped_count > 0
+ puts "Found #{skipped_count} previously downloaded files, skipping them."
+ end
+
+ if remaining_count == 0
+ puts "All matching files have already been downloaded."
+ cleanup
  return
  end
 
- total_files = file_list_by_timestamp.count
- puts "#{total_files} files to download:"
-
+ puts "#{remaining_count} files to download:"
+
  @processed_file_count = 0
+ @total_to_download = remaining_count
  @download_mutex = Mutex.new
-
+
  thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
  pool = Concurrent::FixedThreadPool.new(thread_count)
-
- file_list_by_timestamp.each do |file_remote_info|
+
+ files_to_process.each do |file_remote_info|
  pool.post do
- @connection_pool.with_connection do |connection|
- result = download_file(file_remote_info, connection)
- @download_mutex.synchronize do
- @processed_file_count += 1
- puts result if result
+ download_success = false
+ begin
+ @connection_pool.with_connection do |connection|
+ result_message = download_file(file_remote_info, connection)
+ # for now, assume success if no exception and message doesn't indicate error/skip
+ if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+ download_success = true
+ end
+ @download_mutex.synchronize do
+ @processed_file_count += 1
+ # adjust progress message to reflect remaining files
+ progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+ puts progress_message if progress_message
+ end
+ end
+ # sppend to DB only after successful download outside the connection block
+ if download_success
+ append_to_db(file_remote_info[:file_id])
  end
+ rescue => e
+ @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+ @download_mutex.synchronize do
+ @processed_file_count += 1
+ end
  end
  sleep(RATE_LIMIT)
  end
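Resumption is a plain Set difference: IDs already logged in .downloaded.txt are loaded into a Set and rejected from the work list before the thread pool starts, and each new success is appended to the log. A reduced sketch of that filter with made-up file records:

    require 'set'

    # Hypothetical work list, shaped like the entries from file_list_by_timestamp
    files_to_download = [
      { file_id: "index.html",       file_url: "http://example.com/" },
      { file_id: "about/index.html", file_url: "http://example.com/about/" }
    ]

    # IDs read from .downloaded.txt, one per line
    downloaded_ids = Set.new(["index.html"])

    files_to_process = files_to_download.reject { |f| downloaded_ids.include?(f[:file_id]) }
    puts "#{files_to_process.length} of #{files_to_download.length} files still to download"
    # => 1 of 2 files still to download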
@@ -336,7 +505,8 @@ class WaybackMachineDownloader
  pool.wait_for_termination
 
  end_time = Time.now
- puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+ puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+ puts "Results saved in #{backup_path}"
  cleanup
  end
 
@@ -368,9 +538,10 @@ class WaybackMachineDownloader
  file_url = file_remote_info[:file_url].encode(current_encoding)
  file_id = file_remote_info[:file_id]
  file_timestamp = file_remote_info[:timestamp]
- file_path_elements = file_id.split('/')
+ original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
+ file_path_elements = original_file_id.split('/')
 
- if file_id == ""
+ if original_file_id == ""
  dir_path = backup_path
  file_path = backup_path + 'index.html'
  elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
@@ -384,21 +555,24 @@ class WaybackMachineDownloader
  dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
  file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
  end
- unless File.exist? file_path
- begin
- structure_dir_path dir_path
- download_with_retry(file_path, file_url, file_timestamp, http)
- "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
- rescue StandardError => e
- msg = "#{file_url} # #{e}"
- if not @all and File.exist?(file_path) and File.size(file_path) == 0
- File.delete(file_path)
- msg += "\n#{file_path} was empty and was removed."
- end
- msg
+
+ # check existence *before* download attempt
+ # this handles cases where a file was created manually or by a previous partial run without a .db entry
+ if File.exist? file_path
+ return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+ end
+
+ begin
+ structure_dir_path dir_path
+ download_with_retry(file_path, file_url, file_timestamp, http)
+ "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+ rescue StandardError => e
+ msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
+ if not @all and File.exist?(file_path) and File.size(file_path) == 0
+ File.delete(file_path)
+ msg += "\n#{file_path} was empty and was removed."
  end
- else
- "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+ msg
  end
  end
 
@@ -431,23 +605,33 @@ class WaybackMachineDownloader
  begin
  wayback_url = if @rewritten
  "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
- else
+ else
  "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
  end
-
+
  request = Net::HTTP::Get.new(URI(wayback_url))
  request["Connection"] = "keep-alive"
  request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
-
+ request["Accept-Encoding"] = "gzip, deflate"
+
  response = connection.request(request)
-
+
  case response
  when Net::HTTPSuccess
  File.open(file_path, "wb") do |file|
- if block_given?
- yield(response, file)
+ body = response.body
+ if response['content-encoding'] == 'gzip' && body && !body.empty?
+ begin
+ gz = Zlib::GzipReader.new(StringIO.new(body))
+ decompressed_body = gz.read
+ gz.close
+ file.write(decompressed_body)
+ rescue Zlib::GzipFile::Error => e
+ @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+ file.write(body)
+ end
  else
- file.write(response.body)
+ file.write(body) if body
  end
  end
  when Net::HTTPRedirection
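Since requests now advertise Accept-Encoding: gzip, deflate, the archived body can arrive gzip-compressed and must be inflated before it is written to disk. A standalone sketch of the same Zlib round trip, compressing a local string instead of using a real HTTP response:

    require 'zlib'
    require 'stringio'

    original = "<html><body>hello</body></html>"

    # Produce a gzip-encoded body, standing in for a response with Content-Encoding: gzip
    buffer = StringIO.new
    gz_writer = Zlib::GzipWriter.new(buffer)
    gz_writer.write(original)
    gz_writer.close
    compressed = buffer.string

    # The inflate path the downloader takes before writing the file
    body = Zlib::GzipReader.new(StringIO.new(compressed)).read
    puts body == original   # => true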
@@ -464,7 +648,7 @@ class WaybackMachineDownloader
  else
  raise "HTTP Error: #{response.code} #{response.message}"
  end
-
+
  rescue StandardError => e
  if retries < MAX_RETRIES
  retries += 1
@@ -480,12 +664,25 @@ class WaybackMachineDownloader
 
  def cleanup
  @connection_pool.shutdown
-
+
  if @failed_downloads.any?
+ @logger.error("Download completed with errors.")
  @logger.error("Failed downloads summary:")
  @failed_downloads.each do |failure|
  @logger.error(" #{failure[:url]} - #{failure[:error]}")
  end
+ unless @reset
+ puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
+ return
+ end
+ end
+
+ if !@keep || @reset
+ puts "Cleaning up state files..." unless @keep && !@reset
+ FileUtils.rm_f(cdx_path)
+ FileUtils.rm_f(db_path)
+ elsif @keep
+ puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
  end
  end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wayback_machine_downloader_straw
  version: !ruby/object:Gem::Version
- version: 2.3.3
+ version: 2.3.4
  platform: ruby
  authors:
  - strawberrymaster
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2025-03-08 00:00:00.000000000 Z
+ date: 2025-04-19 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: concurrent-ruby