wayback_machine_downloader_straw 2.3.10 → 2.3.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: ef661bf573b09f79453cf6343d737c24715f343b6593cf313f2502ecd9a650cb
-   data.tar.gz: b80be4aaae7ab4ff695af6cc85273ac437fab1e6a68d3d8bdad67a9661be17e4
+   metadata.gz: 67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043
+   data.tar.gz: a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9
  SHA512:
-   metadata.gz: 3dfb6477b142eebb45741e1b5a4552dd33feac34baa1eae5453baaa08a9a5be242ba46d4f1162e2dd2b68e8903e6de8402d6b6fa86128f312defac74f2e8da29
-   data.tar.gz: 39758aef4bda77babb81d479ef9f266e3fa328af163c7c3c053290796fda95ccb8ec8d3725a9dae5164b79debc6530919cd79df3f7421842f951b0ee6ef79e60
+   metadata.gz: 01bdc9142820719c1ab17a50067fc478975627f414a29bdca32ea5fedf23227f33fb331f9470bb002af80cc50a6a74c7c8361f214d162c537d100860bdb664bc
+   data.tar.gz: f47436ecd1d4b8a4062d8689dac0d9fc4d73c743d5f84bd96764aa2a186eaae607fcee6c7b9e72f9fd3befd1fadfe9006354a43bfd134c892fbf5dfdd736ee28
bin/wayback_machine_downloader CHANGED
@@ -74,6 +74,14 @@ option_parser = OptionParser.new do |opts|
    options[:keep] = true
  end
 
+ opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+   options[:recursive_subdomains] = true
+ end
+
+ opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+   options[:subdomain_depth] = t
+ end
+
  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
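As a quick illustration of what the two new flags produce, here is a minimal, runnable sketch of the parsing step in isolation. It assumes (as the rest of this diff suggests, though the tail of the bin script is not shown here) that the resulting options hash is handed to WaybackMachineDownloader.new unchanged.

    require 'optparse'

    options = {}
    OptionParser.new do |opts|
      opts.on("--recursive-subdomains", "Recursively download content from subdomains") do
        options[:recursive_subdomains] = true
      end
      opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
        options[:subdomain_depth] = t
      end
    end.parse!(["--recursive-subdomains", "--subdomain-depth", "2"])

    p options  #=> {:recursive_subdomains=>true, :subdomain_depth=>2}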
lib/wayback_machine_downloader/subdom_processor.rb ADDED
@@ -0,0 +1,238 @@
+ # frozen_string_literal: true
+
+ module SubdomainProcessor
+   def process_subdomains
+     return unless @recursive_subdomains
+
+     puts "Starting subdomain processing..."
+
+     # extract base domain from the URL for comparison
+     base_domain = extract_base_domain(@base_url)
+     @processed_domains = Set.new([base_domain])
+     @subdomain_queue = Queue.new
+
+     # scan downloaded files for subdomain links
+     initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+     puts "Scanning #{initial_files.size} downloaded files for subdomain links..."
+
+     subdomains_found = scan_files_for_subdomains(initial_files, base_domain)
+
+     if subdomains_found.empty?
+       puts "No subdomains found in downloaded content."
+       return
+     end
+
+     puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"
+
+     # add found subdomains to the queue
+     subdomains_found.each do |subdomain|
+       full_domain = "#{subdomain}.#{base_domain}"
+       @subdomain_queue << "https://#{full_domain}/"
+     end
+
+     # process the subdomain queue
+     download_subdomains(base_domain)
+
+     # after all downloads, rewrite all URLs to make local references
+     rewrite_subdomain_links(base_domain) if @rewrite
+   end
+
+   private
+
+   def extract_base_domain(url)
+     uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
+     return nil unless uri
+
+     host = uri.host || uri.path.split('/').first
+     host = host.downcase
+
+     # extract the base domain (e.g., "example.com" from "sub.example.com")
+     parts = host.split('.')
+     return host if parts.size <= 2
+
+     # for domains like co.uk, we want to keep the last 3 parts
+     if parts[-2].length <= 3 && parts[-1].length <= 3 && parts.size > 2
+       parts.last(3).join('.')
+     else
+       parts.last(2).join('.')
+     end
+   end
+
+   def scan_files_for_subdomains(files, base_domain)
+     return [] unless base_domain
+
+     subdomains = Set.new
+
+     files.each do |file_path|
+       next unless File.exist?(file_path)
+
+       begin
+         content = File.read(file_path)
+
+         # extract URLs from HTML href/src attributes
+         content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+           subdomain = match[0].downcase
+           next if subdomain == 'www' # skip www subdomain
+           subdomains.add(subdomain)
+         end
+
+         # extract URLs from CSS
+         content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+           subdomain = match[0].downcase
+           next if subdomain == 'www' # skip www subdomain
+           subdomains.add(subdomain)
+         end
+
+         # extract URLs from JavaScript strings
+         content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+           subdomain = match[0].downcase
+           next if subdomain == 'www' # skip www subdomain
+           subdomains.add(subdomain)
+         end
+       rescue => e
+         puts "Error scanning file #{file_path}: #{e.message}"
+       end
+     end
+
+     subdomains.to_a
+   end
+
+   def download_subdomains(base_domain)
+     puts "Starting subdomain downloads..."
+     depth = 0
+     max_depth = @subdomain_depth || 1
+
+     while depth < max_depth && !@subdomain_queue.empty?
+       current_batch = []
+
+       # get all subdomains at current depth
+       while !@subdomain_queue.empty?
+         current_batch << @subdomain_queue.pop
+       end
+
+       puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."
+
+       # download each subdomain
+       current_batch.each do |subdomain_url|
+         download_subdomain(subdomain_url, base_domain)
+       end
+
+       # if we need to go deeper, scan the newly downloaded files
+       if depth + 1 < max_depth
+         # get all files in the subdomains directory
+         new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
+         new_subdomains = scan_files_for_subdomains(new_files, base_domain)
+
+         # filter out already processed subdomains
+         new_subdomains.each do |subdomain|
+           full_domain = "#{subdomain}.#{base_domain}"
+           unless @processed_domains.include?(full_domain)
+             @processed_domains.add(full_domain)
+             @subdomain_queue << "https://#{full_domain}/"
+           end
+         end
+
+         puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" if !@subdomain_queue.empty?
+       end
+
+       depth += 1
+     end
+   end
+
+   def download_subdomain(subdomain_url, base_domain)
+     begin
+       uri = URI.parse(subdomain_url)
+       subdomain_host = uri.host
+
+       # skip if already processed
+       if @processed_domains.include?(subdomain_host)
+         puts "Skipping already processed subdomain: #{subdomain_host}"
+         return
+       end
+
+       @processed_domains.add(subdomain_host)
+       puts "Downloading subdomain: #{subdomain_url}"
+
+       # create the directory for this subdomain
+       subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
+       FileUtils.mkdir_p(subdomain_dir)
+
+       # create subdomain downloader with appropriate options
+       subdomain_options = {
+         base_url: subdomain_url,
+         directory: subdomain_dir,
+         from_timestamp: @from_timestamp,
+         to_timestamp: @to_timestamp,
+         all: @all,
+         threads_count: @threads_count,
+         maximum_pages: [@maximum_pages / 2, 10].max,
+         rewrite: @rewrite,
+         # don't recursively process subdomains from here
+         recursive_subdomains: false
+       }
+
+       # download the subdomain content
+       subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
+       subdomain_downloader.download_files
+
+       puts "Completed download of subdomain: #{subdomain_host}"
+     rescue => e
+       puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
+     end
+   end
+
+   def rewrite_subdomain_links(base_domain)
+     puts "Rewriting all files to use local subdomain references..."
+
+     all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+     subdomains = @processed_domains.reject { |domain| domain == base_domain }
+
+     puts "Found #{all_files.size} files to check for rewriting"
+     puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"
+
+     rewritten_count = 0
+
+     all_files.each do |file_path|
+       next unless File.exist?(file_path)
+
+       begin
+         content = File.read(file_path)
+         original_content = content.dup
+
+         # replace subdomain URLs with local paths
+         subdomains.each do |subdomain_host|
+           # for HTML attributes (href, src, etc.)
+           content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+             prefix, path, suffix = $1, $2, $3
+             path = "/index.html" if path.empty? || path == "/"
+             "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
+           end
+
+           # for CSS url()
+           content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
+             path = $1
+             path = "/index.html" if path.empty? || path == "/"
+             "url(\"../subdomains/#{subdomain_host}#{path}\")"
+           end
+
+           # for JavaScript strings
+           content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+             quote_start, path, quote_end = $1, $2, $3
+             path = "/index.html" if path.empty? || path == "/"
+             "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
+           end
+         end
+
+         # save if modified
+         if content != original_content
+           File.write(file_path, content)
+           rewritten_count += 1
+         end
+       rescue => e
+         puts "Error rewriting file #{file_path}: #{e.message}"
+       end
+     end
+
+     puts "Rewrote links in #{rewritten_count} files"
+   end
+ end
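To make the base-domain heuristic above concrete, here is a small, hedged probe script. The require_relative path and the DomainProbe class are inventions for this example, and the expected values come from tracing extract_base_domain by hand:

    require 'uri'
    require_relative 'lib/wayback_machine_downloader/subdom_processor'  # adjust path to where the gem is checked out

    class DomainProbe  # hypothetical wrapper, not part of the gem
      include SubdomainProcessor
    end

    probe = DomainProbe.new
    # extract_base_domain is private, so use send for this demonstration
    p probe.send(:extract_base_domain, "https://blog.example.com/post")  #=> "example.com"
    p probe.send(:extract_base_domain, "https://shop.example.co.uk/")    #=> "example.co.uk"

The second case shows the short-suffix special case: when the last two labels are both three characters or fewer (co.uk, com.au, and the like), the last three labels are kept.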
lib/wayback_machine_downloader/tidy_bytes.rb CHANGED
@@ -31,6 +31,7 @@ module TidyBytes
    when 156 then [197, 147] # LATIN SMALL LIGATURE OE
    when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
    when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
+   else nil # ANYTHING ELSE...
    end
  end.freeze
 
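A side note on the table this touches: each entry maps a CP-1252 byte to its UTF-8 byte sequence, and a Ruby case with no matching when branch already evaluates to nil, so the explicit else mainly documents that unmapped bytes intentionally yield nil. For instance, the pair for byte 156 decodes as:

    p [197, 147].pack("C*").force_encoding("UTF-8")  #=> "œ"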
lib/wayback_machine_downloader.rb CHANGED
@@ -14,6 +14,7 @@ require 'stringio'
  require_relative 'wayback_machine_downloader/tidy_bytes'
  require_relative 'wayback_machine_downloader/to_regex'
  require_relative 'wayback_machine_downloader/archive_api'
+ require_relative 'wayback_machine_downloader/subdom_processor'
 
  class ConnectionPool
    MAX_AGE = 300
@@ -112,8 +113,9 @@ end
  class WaybackMachineDownloader
 
    include ArchiveAPI
+   include SubdomainProcessor
 
-   VERSION = "2.3.10"
+   VERSION = "2.3.12"
    DEFAULT_TIMEOUT = 30
    MAX_RETRIES = 3
    RETRY_DELAY = 2
@@ -123,9 +125,11 @@ class WaybackMachineDownloader
    STATE_CDX_FILENAME = ".cdx.json"
    STATE_DB_FILENAME = ".downloaded.txt"
 
+
    attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
      :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-     :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
+     :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
+     :snapshot_at
 
    def initialize params
      validate_params(params)
@@ -153,6 +157,12 @@ class WaybackMachineDownloader
      @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
      @db_mutex = Mutex.new
      @rewrite = params[:rewrite] || false
+     @recursive_subdomains = params[:recursive_subdomains] || false
+     @subdomain_depth = params[:subdomain_depth] || 1
+     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+
+     # URL for rejecting invalid/unencoded wayback urls
+     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
 
      handle_reset
    end
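For orientation, a hedged sketch of how these new params might be supplied when driving the class from Ruby rather than from the CLI. The URL, directory, and timestamp below are invented; the keys mirror the assignments above:

    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(
      base_url: "https://example.com",
      directory: "websites/example.com",
      recursive_subdomains: true,    # run the SubdomainProcessor pass after the main download
      subdomain_depth: 2,            # follow subdomain links up to two levels
      snapshot_at: 20240101000000    # per file, prefer the newest capture at or before this timestamp
    )
    downloader.download_files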
@@ -196,7 +206,7 @@ class WaybackMachineDownloader
 
    def match_only_filter file_url
      if @only_filter
-       only_filter_regex = @only_filter.to_regex
+       only_filter_regex = @only_filter.to_regex(detect: true)
        if only_filter_regex
          only_filter_regex =~ file_url
        else
@@ -209,7 +219,7 @@ class WaybackMachineDownloader
 
    def match_exclude_filter file_url
      if @exclude_filter
-       exclude_filter_regex = @exclude_filter.to_regex
+       exclude_filter_regex = @exclude_filter.to_regex(detect: true)
        if exclude_filter_regex
          exclude_filter_regex =~ file_url
        else
@@ -322,6 +332,36 @@ class WaybackMachineDownloader
      snapshot_list_to_consider
    end
 
+   # Get a composite snapshot file list for a specific timestamp
+   def get_composite_snapshot_file_list(target_timestamp)
+     file_versions = {}
+     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+       next unless file_url.include?('/')
+       next if file_timestamp.to_i > target_timestamp
+       file_id = file_url.split('/')[3..-1].join('/')
+       file_id = CGI::unescape file_id
+       file_id = file_id.tidy_bytes unless file_id == ""
+       next if file_id.nil?
+       next if match_exclude_filter(file_url)
+       next unless match_only_filter(file_url)
+       # Select the most recent version <= target_timestamp
+       if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
+         file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
+       end
+     end
+     file_versions.values
+   end
+
+   # Returns a list of files for the composite snapshot
+   def get_file_list_composite_snapshot(target_timestamp)
+     file_list = get_composite_snapshot_file_list(target_timestamp)
+     file_list = file_list.sort_by { |_,v| v[:timestamp].to_s }.reverse
+     file_list.map do |file_remote_info|
+       file_remote_info[1][:file_id] = file_remote_info[0]
+       file_remote_info[1]
+     end
+   end
+
    def get_file_list_curated
      file_list_curated = Hash.new
      get_all_snapshots_to_consider.each do |file_timestamp, file_url|
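The selection rule in get_composite_snapshot_file_list reduces to "for each file, keep the newest capture at or before the target timestamp". A self-contained toy illustration of just that rule (the snapshot tuples are invented; the real list comes from the CDX API):

    snapshots = [
      ["20230101000000", "https://example.com/index.html"],
      ["20230601000000", "https://example.com/index.html"],
      ["20240101000000", "https://example.com/index.html"],
    ]
    target = 20230701000000

    chosen = snapshots
      .select { |timestamp, _url| timestamp.to_i <= target }
      .max_by { |timestamp, _url| timestamp.to_i }
    p chosen  #=> ["20230601000000", "https://example.com/index.html"]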
@@ -376,7 +416,9 @@ class WaybackMachineDownloader
 
 
    def get_file_list_by_timestamp
-     if @all_timestamps
+     if @snapshot_at
+       @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+     elsif @all_timestamps
        file_list_curated = get_file_list_all_timestamps
        file_list_curated.map do |file_remote_info|
          file_remote_info[1][:file_id] = file_remote_info[0]
@@ -513,6 +555,16 @@ class WaybackMachineDownloader
 
      end_time = Time.now
      puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+
+     # process subdomains if enabled
+     if @recursive_subdomains
+       subdomain_start_time = Time.now
+       process_subdomains
+       subdomain_end_time = Time.now
+       subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
+       puts "Subdomain processing finished in #{subdomain_time}s."
+     end
+
      puts "Results saved in #{backup_path}"
      cleanup
    end
@@ -709,7 +761,22 @@ class WaybackMachineDownloader
    end
 
    def file_list_by_timestamp
-     @file_list_by_timestamp ||= get_file_list_by_timestamp
+     if @snapshot_at
+       @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+     elsif @all_timestamps
+       file_list_curated = get_file_list_all_timestamps
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     else
+       file_list_curated = get_file_list_curated
+       file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     end
    end
 
    private
@@ -740,6 +807,12 @@ class WaybackMachineDownloader
      # Escape square brackets because they are not valid in URI()
      wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
 
+     # reject invalid/unencoded wayback_url, behaving as if the resource weren't found
+     if not @url_regexp.match?(wayback_url)
+       @logger.warn("Skipped #{file_url}: invalid URL")
+       return :skipped_not_found
+     end
+
      request = Net::HTTP::Get.new(URI(wayback_url))
      request["Connection"] = "keep-alive"
      request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: wayback_machine_downloader_straw
  version: !ruby/object:Gem::Version
-   version: 2.3.10
+   version: 2.3.12
  platform: ruby
  authors:
  - strawberrymaster
  bindir: bin
  cert_chain: []
- date: 2025-06-27 00:00:00.000000000 Z
+ date: 2025-07-22 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: concurrent-ruby
@@ -71,6 +71,7 @@ files:
  - bin/wayback_machine_downloader
  - lib/wayback_machine_downloader.rb
  - lib/wayback_machine_downloader/archive_api.rb
+ - lib/wayback_machine_downloader/subdom_processor.rb
  - lib/wayback_machine_downloader/tidy_bytes.rb
  - lib/wayback_machine_downloader/to_regex.rb
  homepage: https://github.com/StrawberryMaster/wayback-machine-downloader