wayback_machine_downloader_straw 2.4.3 → 2.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: ce7592163165a7f8235bf4a6e1915cf531511fafc7f6874c0d1673fb29db704f
- data.tar.gz: 7d48ffebf130d3b32d1ec233cf5141cc3cf192bcf16751db4380bf62863971c1
+ metadata.gz: c5ba50bde9b0306e043eed8151b12f37f603f5cfd73013e53260543f7fc134a5
+ data.tar.gz: d3dbc1a0f6f894547fb39e56193c967dbf99685a8fa4d3cecaeaff62070aab4c
  SHA512:
- metadata.gz: 16d56de1814e36174c47ab5bda6c9d5e02aba15bafa72a1d57056d0ac146e5fff5c6ca43f9198262d90820e4dcbe4e63772f01bd1ee5207c7ab07e9bb959e069
- data.tar.gz: 07602af4f0cfb9927d43239da0c38cb2411aa408d11fe3f91cb4a403fa415ca8de095eee7467e4613d32aadb8c0a13ffea19ac2f93fd5bf005a991d91e8a064a
+ metadata.gz: 854ec2ccbe2daf620178397bf2c620ed8cf01ca57f175cbb9a2c8e7a057b1495a382ccae64836d75cf08c9a7d5a8ab638859d4c7d34584dcfc51f5eda8b7e5b2
+ data.tar.gz: 1ed3f13c7aadcb097a174c7870730df0920c7b08e9476b786d287dfd2a3e29a50b690ea2fb16b0d762dac726648356788b4122e674db44e7c5a49ffc541f4098
bin/wayback_machine_downloader CHANGED
@@ -74,6 +74,10 @@ option_parser = OptionParser.new do |opts|
  options[:keep] = true
  end

+ opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+ options[:max_retries] = t
+ end
+
  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
  options[:recursive_subdomains] = true
  end
@@ -82,6 +86,10 @@ option_parser = OptionParser.new do |opts|
  options[:subdomain_depth] = t
  end

+ opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+ options[:page_requisites] = true
+ end
+
  opts.on("-v", "--version", "Display version") do |t|
  options[:version] = t
  end
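Note: the two new flags map onto the :max_retries and :page_requisites params that WaybackMachineDownloader#initialize reads later in this diff. A rough, illustrative sketch of the equivalent programmatic call (the base_url value is made up, and the exact option-to-param wiring in the bin script is not part of this hunk):

    require 'wayback_machine_downloader'

    # roughly what `wayback_machine_downloader https://example.com --retry 5 --page-requisites` sets up
    downloader = WaybackMachineDownloader.new(
      base_url: 'https://example.com',
      max_retries: 5,          # overrides the MAX_RETRIES default of 3
      page_requisites: true    # also queue images/css/js referenced by downloaded HTML pages
    )
    downloader.download_files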
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -16,6 +16,10 @@ module ArchiveAPI
  params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
  request_url.query = URI.encode_www_form(params)

+ retries = 0
+ max_retries = (@max_retries || 3)
+ delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+
  begin
  response = http.get(request_url)
  body = response.body.to_s.strip
@@ -26,7 +30,21 @@ module ArchiveAPI
  json.shift if json.first == ["timestamp", "original"]
  json
  rescue JSON::ParserError => e
- warn "Failed to fetch data from API: #{e.message}"
+ warn "Failed to parse JSON from API for #{url}: #{e.message}"
+ []
+ rescue Net::ReadTimeout, Net::OpenTimeout => e
+ if retries < max_retries
+ retries += 1
+ warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+ sleep(delay * retries)
+ retry
+ else
+ warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
+ []
+ end
+ rescue StandardError => e
+ # treat any other transient-ish error similarly, though without retries for now
+ warn "Error fetching CDX data for #{url}: #{e.message}"
  []
  end
  end
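The new timeout handling retries with a linear backoff: sleep(delay * retries) means the waits grow as 2s, 4s, 6s with the default RETRY_DELAY of 2 before the method gives up and returns an empty list. A self-contained sketch of the same pattern, with generic names rather than the gem's API:

    require 'net/http'

    # linear-backoff retry loop mirroring the rescue clause above
    def with_linear_backoff(max_retries: 3, delay: 2)
      retries = 0
      begin
        yield
      rescue Net::ReadTimeout, Net::OpenTimeout => e
        raise if retries >= max_retries
        retries += 1
        warn "timeout (#{e.class}), retry #{retries}/#{max_retries}"
        sleep(delay * retries)  # waits 2s, then 4s, then 6s
        retry
      end
    end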
lib/wayback_machine_downloader/page_requisites.rb ADDED
@@ -0,0 +1,33 @@
+ module PageRequisites
+ # regex to find links in href, src, url(), and srcset
+ # this ignores data: URIs, mailto:, and anchors
+ ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+
+ def self.extract(html_content)
+ assets = []
+
+ html_content.scan(ASSET_REGEX) do |match|
+ # match is an array of capture groups; find the one that matched
+ url = match.compact.first
+ next unless url
+
+ # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+ if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+ url.split(',').each do |src_def|
+ src_url = src_def.strip.split(' ').first
+ assets << src_url if valid_asset?(src_url)
+ end
+ else
+ assets << url if valid_asset?(url)
+ end
+ end
+
+ assets.uniq
+ end
+
+ def self.valid_asset?(url)
+ return false if url.strip.empty?
+ return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+ true
+ end
+ end
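Because the new module is pure string scanning (no network or file I/O), it is easy to exercise on its own. An illustrative call, with made-up sample HTML:

    require 'wayback_machine_downloader'   # loads lib/wayback_machine_downloader/page_requisites.rb

    html = <<~HTML
      <img src="/img/logo.png" srcset="small.jpg 1x, big.jpg 2x">
      <a href="mailto:someone@example.com">write us</a>
      <div style="background: url('assets/bg.jpg')"></div>
    HTML

    PageRequisites.extract(html)
    #=> ["/img/logo.png", "small.jpg", "big.jpg", "assets/bg.jpg"]
    # the mailto: link is rejected by valid_asset?, and the srcset is split into its candidates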
lib/wayback_machine_downloader/url_rewrite.rb CHANGED
@@ -1,74 +1,85 @@
  # frozen_string_literal: true

- # URLs in HTML attributes
- def rewrite_html_attr_urls(content)
-
- content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
- prefix, url, suffix = $1, $2, $3
-
- if url.start_with?('http')
- begin
- uri = URI.parse(url)
- path = uri.path
- path = path[1..-1] if path.start_with?('/')
- "#{prefix}#{path}#{suffix}"
- rescue
- "#{prefix}#{url}#{suffix}"
- end
- elsif url.start_with?('/')
- "#{prefix}./#{url[1..-1]}#{suffix}"
- else
- "#{prefix}#{url}#{suffix}"
+ module URLRewrite
+ # server-side extensions that should work locally
+ SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze
+
+ def rewrite_html_attr_urls(content)
+ # rewrite URLs to relative paths
+ content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+ prefix, path, suffix = $1, $2, $3
+ path = normalize_path_for_local(path)
+ "#{prefix}#{path}#{suffix}"
+ end
+
+ # rewrite absolute URLs to same domain as relative
+ content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+ prefix, path, suffix = $1, $2, $3
+ path = normalize_path_for_local(path)
+ "#{prefix}#{path}#{suffix}"
  end
+
+ content
  end
- content
- end

- # URLs in CSS
- def rewrite_css_urls(content)
+ def rewrite_css_urls(content)
+ # rewrite URLs in CSS
+ content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+ path = normalize_path_for_local($1)
+ "url(\"#{path}\")"
+ end

- content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
- url = $1
-
- if url.start_with?('http')
- begin
- uri = URI.parse(url)
- path = uri.path
- path = path[1..-1] if path.start_with?('/')
- "url(\"#{path}\")"
- rescue
- "url(\"#{url}\")"
- end
- elsif url.start_with?('/')
- "url(\"./#{url[1..-1]}\")"
- else
- "url(\"#{url}\")"
+ # rewrite absolute URLs in CSS
+ content.gsub!(/url\(\s*["']?https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+ path = normalize_path_for_local($1)
+ "url(\"#{path}\")"
  end
+
+ content
  end
- content
- end

- # URLs in JavaScript
- def rewrite_js_urls(content)
-
- content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
- quote_start, url, quote_end = $1, $2, $3
+ def rewrite_js_urls(content)
+ # rewrite archive.org URLs in JavaScript strings
+ content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+ quote_start, path, quote_end = $1, $2, $3
+ path = normalize_path_for_local(path)
+ "#{quote_start}#{path}#{quote_end}"
+ end
+
+ # rewrite absolute URLs in JavaScript
+ content.gsub!(/(["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+ quote_start, path, quote_end = $1, $2, $3
+ next "#{quote_start}http#{$2}#{quote_end}" if $2.start_with?('s://', '://')
+ path = normalize_path_for_local(path)
+ "#{quote_start}#{path}#{quote_end}"
+ end
+
+ content
+ end
+
+ private
+
+ def normalize_path_for_local(path)
+ return "./index.html" if path.empty? || path == "/"

- if url.start_with?('http')
- begin
- uri = URI.parse(url)
- path = uri.path
- path = path[1..-1] if path.start_with?('/')
- "#{quote_start}#{path}#{quote_end}"
- rescue
- "#{quote_start}#{url}#{quote_end}"
- end
- elsif url.start_with?('/')
- "#{quote_start}./#{url[1..-1]}#{quote_end}"
+ # handle query strings - they're already part of the filename
+ path = path.split('?').first if path.include?('?')
+
+ # check if this is a server-side script
+ ext = File.extname(path).downcase
+ if SERVER_SIDE_EXTS.include?(ext)
+ # keep the path as-is but ensure it starts with ./
+ path = "./#{path}" unless path.start_with?('./', '/')
  else
- "#{quote_start}#{url}#{quote_end}"
+ # regular file handling
+ path = "./#{path}" unless path.start_with?('./', '/')
+
+ # if it looks like a directory, add index.html
+ if path.end_with?('/') || !path.include?('.')
+ path = "#{path.chomp('/')}/index.html"
+ end
  end
+
+ path
  end
-
- content
  end
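Since the rewrite helpers now live in a URLRewrite mixin and funnel everything through normalize_path_for_local, they can be exercised on their own. An illustrative sketch (sample markup made up):

    require 'wayback_machine_downloader'   # loads lib/wayback_machine_downloader/url_rewrite.rb

    class RewriterDemo
      include URLRewrite
    end

    html = %(<a href="https://web.archive.org/web/20240101000000/https://example.com/about">About</a> ) +
           %(<img src="https://cdn.example.com/img/logo.png">)

    RewriterDemo.new.rewrite_html_attr_urls(html)
    # the archive-wrapped link becomes "/about/index.html" (extension-less paths get /index.html),
    # the absolute image URL becomes "/img/logo.png"; the single-slash passes later in
    # rewrite_urls_to_relative then turn such paths into "./..." forms

Note that each rewriter now ends with an explicit `content` return, so callers can rely on the return value rather than on gsub!'s in-place mutation alone (gsub! itself returns nil when nothing matches).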
lib/wayback_machine_downloader.rb CHANGED
@@ -15,6 +15,7 @@ require 'digest'
  require_relative 'wayback_machine_downloader/tidy_bytes'
  require_relative 'wayback_machine_downloader/to_regex'
  require_relative 'wayback_machine_downloader/archive_api'
+ require_relative 'wayback_machine_downloader/page_requisites'
  require_relative 'wayback_machine_downloader/subdom_processor'
  require_relative 'wayback_machine_downloader/url_rewrite'

@@ -25,69 +26,81 @@ class ConnectionPool
  MAX_RETRIES = 3

  def initialize(size)
- @size = size
- @pool = Concurrent::Map.new
- @creation_times = Concurrent::Map.new
+ @pool = SizedQueue.new(size)
+ size.times { @pool << build_connection_entry }
  @cleanup_thread = schedule_cleanup
  end

- def with_connection(&block)
- conn = acquire_connection
+ def with_connection
+ entry = acquire_connection
  begin
- yield conn
+ yield entry[:http]
  ensure
- release_connection(conn)
+ release_connection(entry)
  end
  end

  def shutdown
  @cleanup_thread&.exit
- @pool.each_value { |conn| conn.finish if conn&.started? }
- @pool.clear
- @creation_times.clear
+ drain_pool { |entry| safe_finish(entry[:http]) }
  end

  private

  def acquire_connection
- thread_id = Thread.current.object_id
- conn = @pool[thread_id]
+ entry = @pool.pop
+ if stale?(entry)
+ safe_finish(entry[:http])
+ entry = build_connection_entry
+ end
+ entry
+ end

- if should_create_new?(conn)
- conn&.finish if conn&.started?
- conn = create_connection
- @pool[thread_id] = conn
- @creation_times[thread_id] = Time.now
+ def release_connection(entry)
+ if stale?(entry)
+ safe_finish(entry[:http])
+ entry = build_connection_entry
  end
+ @pool << entry
+ end

- conn
+ def stale?(entry)
+ http = entry[:http]
+ !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
  end

- def release_connection(conn)
- return unless conn
- if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
- conn.finish
- @pool.delete(Thread.current.object_id)
- @creation_times.delete(Thread.current.object_id)
- end
+ def build_connection_entry
+ { http: create_connection, created_at: Time.now }
  end

- def should_create_new?(conn)
- return true if conn.nil?
- return true unless conn.started?
- return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
- false
+ def safe_finish(http)
+ http.finish if http&.started?
+ rescue StandardError
+ nil
  end

- def create_connection
- http = Net::HTTP.new("web.archive.org", 443)
- http.use_ssl = true
- http.read_timeout = DEFAULT_TIMEOUT
- http.open_timeout = DEFAULT_TIMEOUT
- http.keep_alive_timeout = 30
- http.max_retries = MAX_RETRIES
- http.start
- http
+ def drain_pool
+ loop do
+ entry = begin
+ @pool.pop(true)
+ rescue ThreadError
+ break
+ end
+ yield(entry)
+ end
+ end
+
+ def cleanup_old_connections
+ entry = begin
+ @pool.pop(true)
+ rescue ThreadError
+ return
+ end
+ if stale?(entry)
+ safe_finish(entry[:http])
+ entry = build_connection_entry
+ end
+ @pool << entry
  end

  def schedule_cleanup
@@ -99,16 +112,15 @@ class ConnectionPool
  end
  end

- def cleanup_old_connections
- current_time = Time.now
- @creation_times.each do |thread_id, creation_time|
- if current_time - creation_time > MAX_AGE
- conn = @pool[thread_id]
- conn&.finish if conn&.started?
- @pool.delete(thread_id)
- @creation_times.delete(thread_id)
- end
- end
+ def create_connection
+ http = Net::HTTP.new("web.archive.org", 443)
+ http.use_ssl = true
+ http.read_timeout = DEFAULT_TIMEOUT
+ http.open_timeout = DEFAULT_TIMEOUT
+ http.keep_alive_timeout = 30
+ http.max_retries = MAX_RETRIES
+ http.start
+ http
  end
  end

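The pool now hands out connections through a blocking SizedQueue instead of a per-thread Concurrent::Map, so at most `size` Net::HTTP sessions exist and a busy pool makes callers wait rather than open extra sockets. An illustrative sketch of how callers use the gem's ConnectionPool class defined above (the pool size here is arbitrary):

    require 'wayback_machine_downloader'

    pool = ConnectionPool.new(4)
    begin
      pool.with_connection do |http|
        # http is a started Net::HTTP session against web.archive.org:443
        response = http.get("/")
        puts response.code
      end
    ensure
      pool.shutdown
    end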
@@ -116,8 +128,9 @@ class WaybackMachineDownloader

  include ArchiveAPI
  include SubdomainProcessor
+ include URLRewrite

- VERSION = "2.4.3"
+ VERSION = "2.4.5"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
@@ -131,7 +144,7 @@ class WaybackMachineDownloader
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
  :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
- :snapshot_at
+ :snapshot_at, :page_requisites

  def initialize params
  validate_params(params)
@@ -163,6 +176,9 @@ class WaybackMachineDownloader
  @recursive_subdomains = params[:recursive_subdomains] || false
  @subdomain_depth = params[:subdomain_depth] || 1
  @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+ @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
+ @page_requisites = params[:page_requisites] || false
+ @pending_jobs = Concurrent::AtomicFixnum.new(0)

  # URL for rejecting invalid/unencoded wayback urls
  @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -171,18 +187,29 @@ class WaybackMachineDownloader
  end

  def backup_name
- url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
+ url_to_process = @base_url
+ url_to_process = url_to_process.chomp('/*') if url_to_process&.end_with?('/*')
+
  raw = if url_to_process.include?('//')
  url_to_process.split('/')[2]
  else
  url_to_process
  end

+ # if it looks like a wildcard pattern, normalize to a safe host-ish name
+ if raw&.start_with?('*.')
+ raw = raw.sub(/\A\*\./, 'all-')
+ end
+
  # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
  if Gem.win_platform?
  raw = raw.gsub(/[:*?"<>|]/, '_')
  raw = raw.gsub(/[ .]+\z/, '')
+ else
+ # still good practice to strip path separators (and maybe '*' for POSIX too)
+ raw = raw.gsub(/[\/:*?"<>|]/, '_')
  end
+
  raw = 'site' if raw.nil? || raw.empty?
  raw
  end
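A worked example of the new wildcard handling, traced through backup_name (hypothetical target URL):

    # base_url = "http://*.example.com/*"
    #   1. the trailing "/*" is chomped        -> "http://*.example.com"
    #   2. the host segment is extracted       -> "*.example.com"
    #   3. the leading "*." becomes "all-"     -> "all-example.com"
    #   4. the sanitization gsub changes nothing further (no /, :, *, ?, ", <, >, | left)
    # => backup_name returns "all-example.com", so files default to ./websites/all-example.com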
@@ -193,7 +220,8 @@ class WaybackMachineDownloader
  @directory
  else
  # ensure the default path is absolute and normalized
- File.expand_path(File.join('websites', backup_name))
+ cwd = Dir.pwd
+ File.expand_path(File.join(cwd, 'websites', backup_name))
  end
  end

@@ -277,53 +305,62 @@ class WaybackMachineDownloader
  page_index = 0
  batch_size = [@threads_count, 5].min
  continue_fetching = true
-
- while continue_fetching && page_index < @maximum_pages
- # Determine the range of pages to fetch in this batch
- end_index = [page_index + batch_size, @maximum_pages].min
- current_batch = (page_index...end_index).to_a
-
- # Create futures for concurrent API calls
- futures = current_batch.map do |page|
- Concurrent::Future.execute do
- result = nil
- @connection_pool.with_connection do |connection|
- result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+ fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
+ begin
+ while continue_fetching && page_index < @maximum_pages
+ # Determine the range of pages to fetch in this batch
+ end_index = [page_index + batch_size, @maximum_pages].min
+ current_batch = (page_index...end_index).to_a
+
+ # Create futures for concurrent API calls
+ futures = current_batch.map do |page|
+ Concurrent::Future.execute(executor: fetch_pool) do
+ result = nil
+ @connection_pool.with_connection do |connection|
+ result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+ end
+ result ||= []
+ [page, result]
  end
- result ||= []
- [page, result]
  end
- end

- results = []
+ results = []

- futures.each do |future|
- begin
- results << future.value
- rescue => e
- puts "\nError fetching page #{future}: #{e.message}"
+ futures.each do |future|
+ begin
+ val = future.value
+ # only append if valid
+ if val && val.is_a?(Array) && val.first.is_a?(Integer)
+ results << val
+ end
+ rescue => e
+ puts "\nError fetching page #{future}: #{e.message}"
+ end
  end
- end

- # Sort results by page number to maintain order
- results.sort_by! { |page, _| page }
-
- # Process results and check for empty pages
- results.each do |page, result|
- if result.nil? || result.empty?
- continue_fetching = false
- break
- else
- mutex.synchronize do
- snapshot_list_to_consider.concat(result)
- print "."
+ # Sort results by page number to maintain order
+ results.sort_by! { |page, _| page }
+
+ # Process results and check for empty pages
+ results.each do |page, result|
+ if result.nil? || result.empty?
+ continue_fetching = false
+ break
+ else
+ mutex.synchronize do
+ snapshot_list_to_consider.concat(result)
+ print "."
+ end
  end
  end
- end

- page_index = end_index
+ page_index = end_index

- sleep(RATE_LIMIT) if continue_fetching
+ sleep(RATE_LIMIT) if continue_fetching
+ end
+ ensure
+ fetch_pool.shutdown
+ fetch_pool.wait_for_termination
  end
  end

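The CDX page fetches now run on an explicit Concurrent::FixedThreadPool passed as the :executor of each Concurrent::Future, instead of concurrent-ruby's global executor, and the ensure block guarantees the pool is shut down even if a batch raises. A minimal standalone sketch of that pattern (the work block is a placeholder):

    require 'concurrent'

    pool = Concurrent::FixedThreadPool.new(4)
    begin
      futures = (0...8).map do |page|
        Concurrent::Future.execute(executor: pool) do
          # placeholder for get_raw_list_from_api(...)
          [page, "result for page #{page}"]
        end
      end
      futures.each { |f| p f.value }   # value blocks until each future resolves
    ensure
      pool.shutdown
      pool.wait_for_termination
    end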
@@ -531,7 +568,7 @@ class WaybackMachineDownloader
  end
  end
  end
-
+
  def download_files
  start_time = Time.now
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -552,6 +589,12 @@ class WaybackMachineDownloader

  # Load IDs of already downloaded files
  downloaded_ids = load_downloaded_ids
+
+ # We use a thread-safe Set to track what we have queued/downloaded in this session
+ # to avoid infinite loops with page requisites
+ @session_downloaded_ids = Concurrent::Set.new
+ downloaded_ids.each { |id| @session_downloaded_ids.add(id) }
+
  files_to_process = files_to_download.reject do |file_info|
  downloaded_ids.include?(file_info[:file_id])
  end
@@ -562,8 +605,8 @@ class WaybackMachineDownloader
  if skipped_count > 0
  puts "Found #{skipped_count} previously downloaded files, skipping them."
  end
-
- if remaining_count == 0
+
+ if remaining_count == 0 && !@page_requisites
  puts "All matching files have already been downloaded."
  cleanup
  return
@@ -576,12 +619,22 @@ class WaybackMachineDownloader
  @download_mutex = Mutex.new

  thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
- pool = Concurrent::FixedThreadPool.new(thread_count)
+ @worker_pool = Concurrent::FixedThreadPool.new(thread_count)

- processing_files(pool, files_to_process)
+ # initial batch
+ files_to_process.each do |file_remote_info|
+ @session_downloaded_ids.add(file_remote_info[:file_id])
+ submit_download_job(file_remote_info)
+ end
+
+ # wait for all jobs to finish
+ loop do
+ sleep 0.5
+ break if @pending_jobs.value == 0
+ end

- pool.shutdown
- pool.wait_for_termination
+ @worker_pool.shutdown
+ @worker_pool.wait_for_termination

  end_time = Time.now
  puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
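Because page requisites can enqueue new downloads while earlier ones are still running, download_files can no longer just shut the pool down after posting the initial batch; it now polls the @pending_jobs counter (a Concurrent::AtomicFixnum incremented in submit_download_job and decremented in an ensure block) until every job, including late-queued requisites, has finished. A stripped-down sketch of that counting pattern:

    require 'concurrent'

    pool    = Concurrent::FixedThreadPool.new(4)
    pending = Concurrent::AtomicFixnum.new(0)

    submit = lambda do |n|
      pending.increment
      pool.post do
        begin
          sleep(0.1)                      # stand-in for a download
          submit.call(n + 1) if n < 3     # a job may enqueue follow-up jobs
        ensure
          pending.decrement
        end
      end
    end

    3.times { |i| submit.call(i) }
    # a job registers its children before decrementing itself, so 0 really means "all done"
    sleep 0.1 until pending.value.zero?
    pool.shutdown
    pool.wait_for_termination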
@@ -599,6 +652,138 @@ class WaybackMachineDownloader
  cleanup
  end

+ # helper to submit jobs and increment the counter
+ def submit_download_job(file_remote_info)
+ @pending_jobs.increment
+ @worker_pool.post do
+ begin
+ process_single_file(file_remote_info)
+ ensure
+ @pending_jobs.decrement
+ end
+ end
+ end
+
+ def process_single_file(file_remote_info)
+ download_success = false
+ downloaded_path = nil
+
+ @connection_pool.with_connection do |connection|
+ result_message, path = download_file(file_remote_info, connection)
+ downloaded_path = path
+
+ if result_message && result_message.include?(' -> ')
+ download_success = true
+ end
+
+ @download_mutex.synchronize do
+ @processed_file_count += 1 if @processed_file_count < @total_to_download
+ # only print if it's a "User" file or a requisite we found
+ puts result_message if result_message
+ end
+ end
+
+ if download_success
+ append_to_db(file_remote_info[:file_id])
+
+ if @page_requisites && downloaded_path && File.extname(downloaded_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
+ process_page_requisites(downloaded_path, file_remote_info)
+ end
+ end
+ rescue => e
+ @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+ end
+
+ def process_page_requisites(file_path, parent_remote_info)
+ return unless File.exist?(file_path)
+
+ content = File.read(file_path)
+ content = content.force_encoding('UTF-8').scrub
+
+ assets = PageRequisites.extract(content)
+
+ # prepare base URI for resolving relative paths
+ parent_raw = parent_remote_info[:file_url]
+ parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
+
+ begin
+ base_uri = URI(parent_raw)
+ # calculate the "root" host of the site we are downloading to compare later
+ current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host
+ rescue URI::InvalidURIError
+ return
+ end
+
+ parent_timestamp = parent_remote_info[:timestamp]
+
+ assets.each do |asset_rel_url|
+ begin
+ # resolve full URL (handles relative paths like "../img/logo.png")
+ resolved_uri = base_uri + asset_rel_url
+
+ # detect if the asset URL is already a Wayback "web/<timestamp>/.../https://..." embed
+ asset_timestamp = parent_timestamp
+ if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
+ embedded_ts = $1
+ begin
+ orig_uri = URI($2)
+ resolved_uri = orig_uri
+ asset_timestamp = embedded_ts.to_i
+ rescue URI::InvalidURIError
+ # fall back to original resolved_uri and parent timestamp
+ end
+ end
+
+ # filter out navigation links (pages) vs assets
+ # skip if extension is empty or looks like an HTML page
+ path = resolved_uri.path
+ ext = File.extname(path).downcase
+ if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
+ next
+ end
+
+ # construct the URL for the Wayback API
+ asset_wbm_url = resolved_uri.host + resolved_uri.path
+ asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query
+
+ # construct the local file ID
+ # if the asset is on the SAME domain, strip the domain from the folder path
+ # if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder
+ if resolved_uri.host == current_project_host
+ # e.g. /static/css/style.css
+ asset_file_id = resolved_uri.path
+ asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/')
+ else
+ # e.g. cdn.google.com/jquery.js
+ asset_file_id = asset_wbm_url
+ end
+
+ rescue URI::InvalidURIError, StandardError
+ next
+ end
+
+ # sanitize and queue
+ asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url)
+
+ unless @session_downloaded_ids.include?(asset_id)
+ @session_downloaded_ids.add(asset_id)
+
+ new_file_info = {
+ file_url: asset_wbm_url,
+ timestamp: asset_timestamp,
+ file_id: asset_id
+ }
+
+ @download_mutex.synchronize do
+ @total_to_download += 1
+ puts "Queued requisite: #{asset_file_id}"
+ end
+
+ submit_download_job(new_file_info)
+ end
+ end
+ end
+
  def structure_dir_path dir_path
  begin
  FileUtils::mkdir_p dir_path unless File.exist? dir_path
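Two details in process_page_requisites are worth seeing in isolation: relative asset URLs are resolved against the parent page with URI#+ (so "../img/logo.png" climbs out of the page's directory), and assets whose resolved path is already a Wayback "/web/<timestamp>/<original-url>" embed are unwrapped so the original host and the embedded timestamp are used instead of the parent's. A small sketch of both, using only Ruby's stdlib URI (sample values made up):

    require 'uri'

    base = URI("http://example.com/blog/post.html")

    resolved = base + "../img/logo.png"
    puts resolved                       #=> http://example.com/img/logo.png

    embedded = base + "/web/20200101123456/https://cdn.example.net/lib.js"
    if embedded.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
      puts $1            #=> 20200101123456  (timestamp reused for the asset)
      puts URI($2).host  #=> cdn.example.net (asset is fetched from its original host)
    end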
@@ -630,7 +815,8 @@ class WaybackMachineDownloader
  begin
  content = File.binread(file_path)

- if file_ext == '.html' || file_ext == '.htm'
+ # detect encoding for HTML files
+ if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
  encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
  content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
  else
@@ -638,21 +824,21 @@ class WaybackMachineDownloader
  end

  # URLs in HTML attributes
- rewrite_html_attr_urls(content)
+ content = rewrite_html_attr_urls(content)

  # URLs in CSS
- rewrite_css_urls(content)
+ content = rewrite_css_urls(content)

  # URLs in JavaScript
- rewrite_js_urls(content)
+ content = rewrite_js_urls(content)

- # for URLs in HTML attributes that start with a single slash
+ # for URLs that start with a single slash, make them relative
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
  prefix, path, suffix = $1, $2, $3
  "#{prefix}./#{path}#{suffix}"
  end

- # for URLs in CSS that start with a single slash
+ # for URLs in CSS that start with a single slash, make them relative
  content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
  path = $1
  "url(\"./#{path}\")"
@@ -705,7 +891,7 @@ class WaybackMachineDownloader
  # check existence *before* download attempt
  # this handles cases where a file was created manually or by a previous partial run without a .db entry
  if File.exist? file_path
- return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+ return ["#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
  end

  begin
@@ -717,13 +903,13 @@ class WaybackMachineDownloader
  if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
  rewrite_urls_to_relative(file_path)
  end
- "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+ return ["#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
  when :skipped_not_found
- "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+ return ["Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
  else
  # ideally, this case should not be reached if download_with_retry behaves as expected.
  @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
- "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
+ return ["Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
  end
  rescue StandardError => e
  msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
@@ -731,7 +917,7 @@ class WaybackMachineDownloader
  File.delete(file_path)
  msg += "\n#{file_path} was empty and was removed."
  end
- msg
+ return [msg, nil]
  end
  end

@@ -934,9 +1120,9 @@ class WaybackMachineDownloader
  end

  rescue StandardError => e
- if retries < MAX_RETRIES
+ if retries < @max_retries
  retries += 1
- @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+ @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
  sleep(RETRY_DELAY * retries)
  retry
  else
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: wayback_machine_downloader_straw
  version: !ruby/object:Gem::Version
- version: 2.4.3
+ version: 2.4.5
  platform: ruby
  authors:
  - strawberrymaster
  bindir: bin
  cert_chain: []
- date: 2025-08-19 00:00:00.000000000 Z
+ date: 2026-01-05 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: concurrent-ruby
@@ -71,6 +71,7 @@ files:
  - bin/wayback_machine_downloader
  - lib/wayback_machine_downloader.rb
  - lib/wayback_machine_downloader/archive_api.rb
+ - lib/wayback_machine_downloader/page_requisites.rb
  - lib/wayback_machine_downloader/subdom_processor.rb
  - lib/wayback_machine_downloader/tidy_bytes.rb
  - lib/wayback_machine_downloader/to_regex.rb