wayback_machine_downloader_straw 2.4.1 → 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader.rb +191 -99
- metadata +2 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1e81619475172540d94968e5d31fbb6b4df7f08a533efce3b200e1ad4ce5035f
+  data.tar.gz: aca008795277ebf489cda7f0d8d1691b1983e2eec6f338b95c9bf11552fdfd94
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c71eb691ba50308f1f6b29a326c00a0678db1ff94f6ab8620b4d74425c993a64b60dd4d0127f9dd9eb3a72801008672855d901379d870b89aff82862b4f582a3
+  data.tar.gz: 6277c45d37dd02fea219f93906bf790ddede921ef88e928074b9e3da88b9882a64ecebbc99767b4c0eaef34b9a3407e9fee6352c935ea2d9942eed5735c51632
data/bin/wayback_machine_downloader CHANGED

@@ -74,6 +74,10 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end

+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
   opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
     options[:recursive_subdomains] = true
   end
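
For context on the new flag: passing two long option names to `opts.on` makes `--rt` an alias for `--retry`, and `Integer` coerces the argument before it reaches the block. A minimal standalone sketch of the same declaration (not the gem's actual bin script; the `parse!` input is a hypothetical command line):

  require 'optparse'

  options = {}
  parser = OptionParser.new do |opts|
    # same declaration as the diff: --rt is an alias, N is a required Integer
    opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
      options[:max_retries] = t
    end
  end

  parser.parse!(["--retry", "5"])
  p options # => {:max_retries=>5}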
data/lib/wayback_machine_downloader.rb CHANGED

@@ -11,6 +11,7 @@ require 'concurrent-ruby'
 require 'logger'
 require 'zlib'
 require 'stringio'
+require 'digest'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -24,69 +25,81 @@ class ConnectionPool
   MAX_RETRIES = 3

   def initialize(size)
-    @
-    @pool
-    @creation_times = Concurrent::Map.new
+    @pool = SizedQueue.new(size)
+    size.times { @pool << build_connection_entry }
     @cleanup_thread = schedule_cleanup
   end

-  def with_connection
-
+  def with_connection
+    entry = acquire_connection
     begin
-      yield
+      yield entry[:http]
     ensure
-      release_connection(
+      release_connection(entry)
     end
   end

   def shutdown
     @cleanup_thread&.exit
-
-    @pool.clear
-    @creation_times.clear
+    drain_pool { |entry| safe_finish(entry[:http]) }
   end

   private

   def acquire_connection
-
-
+    entry = @pool.pop
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    entry
+  end

-
-
-
-
-    @creation_times[thread_id] = Time.now
+  def release_connection(entry)
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
     end
+    @pool << entry
+  end

-
+  def stale?(entry)
+    http = entry[:http]
+    !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
   end

-  def
-
-    if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-      conn.finish
-      @pool.delete(Thread.current.object_id)
-      @creation_times.delete(Thread.current.object_id)
-    end
+  def build_connection_entry
+    { http: create_connection, created_at: Time.now }
   end

-  def
-
-
-
-    false
+  def safe_finish(http)
+    http.finish if http&.started?
+  rescue StandardError
+    nil
   end

-  def
-
-
-
-
-
-
-
-
+  def drain_pool
+    loop do
+      entry = begin
+        @pool.pop(true)
+      rescue ThreadError
+        break
+      end
+      yield(entry)
+    end
+  end
+
+  def cleanup_old_connections
+    entry = begin
+      @pool.pop(true)
+    rescue ThreadError
+      return
+    end
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    @pool << entry
   end

   def schedule_cleanup
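
The rework replaces per-thread bookkeeping (`@creation_times` keyed by thread id) with a `SizedQueue` of `{http:, created_at:}` entries, so staleness information travels with the connection itself instead of with the thread. A minimal usage sketch, assuming the gem is loaded via its lib file name and that `ConnectionPool.new` takes the pool size as in the diff (the request path is illustrative):

  require 'wayback_machine_downloader'

  pool = ConnectionPool.new(4)   # pre-fills the queue with 4 started connections
  pool.with_connection do |http|
    # http is a started Net::HTTP session against web.archive.org:443
    response = http.get("/web/20230101000000/https://example.com/")
    puts response.code
  end
  pool.shutdown                  # drains the queue and finishes each connection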
@@ -98,16 +111,15 @@ class ConnectionPool
     end
   end

-  def
-
-
-
-
-
-
-
-
-  end
+  def create_connection
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = DEFAULT_TIMEOUT
+    http.open_timeout = DEFAULT_TIMEOUT
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    http.start
+    http
   end
 end

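
`create_connection` now returns an already-started keep-alive session, which is what lets `with_connection` hand the same TCP/TLS connection to many requests. A standalone sketch of the same Net::HTTP setup, with the timeout values copied from the gem's constants:

  require 'net/http'

  http = Net::HTTP.new("web.archive.org", 443)
  http.use_ssl = true
  http.read_timeout = 30     # DEFAULT_TIMEOUT in the gem
  http.open_timeout = 30
  http.keep_alive_timeout = 30
  http.max_retries = 3       # Net::HTTP's own retry of idempotent requests
  http.start                 # opens the TCP/TLS session once
  res = http.get("/")        # later requests reuse the same session
  puts res.code
  http.finish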
@@ -116,7 +128,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor

-  VERSION = "2.4.1"
+  VERSION = "2.4.4"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -162,6 +174,7 @@ class WaybackMachineDownloader
     @recursive_subdomains = params[:recursive_subdomains] || false
     @subdomain_depth = params[:subdomain_depth] || 1
     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+    @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES

     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
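
With this line, a `max_retries` passed by the CLI (or by library callers) overrides the `MAX_RETRIES` constant used in `download_with_retry` below. A hypothetical library-side call, assuming the constructor takes a params hash with a `:base_url` key, as the surrounding `params[...]` reads suggest:

  require 'wayback_machine_downloader'

  wmd = WaybackMachineDownloader.new(
    base_url:    "https://example.com",
    max_retries: 5    # omitted -> falls back to MAX_RETRIES (3)
  )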
@@ -171,12 +184,19 @@ class WaybackMachineDownloader

   def backup_name
     url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
-
-    if url_to_process.include? '//'
+    raw = if url_to_process.include?('//')
       url_to_process.split('/')[2]
     else
       url_to_process
     end
+
+    # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
+    if Gem.win_platform?
+      raw = raw.gsub(/[:*?"<>|]/, '_')
+      raw = raw.gsub(/[ .]+\z/, '')
+    end
+    raw = 'site' if raw.nil? || raw.empty?
+    raw
   end

   def backup_path
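
The practical effect of the new Windows branch: a host-plus-port source such as `localhost:8080` previously yielded a directory name containing `:`, which Windows rejects (the ENOTDIR the comment mentions). An illustrative run of the two substitutions:

  raw = "localhost:8080"
  raw = raw.gsub(/[:*?"<>|]/, '_')  # => "localhost_8080"
  raw = raw.gsub(/[ .]+\z/, '')     # trailing dots/spaces are also invalid on Windows
  puts raw                          # => "localhost_8080"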
@@ -185,7 +205,8 @@ class WaybackMachineDownloader
       @directory
     else
       # ensure the default path is absolute and normalized
-
+      cwd = Dir.pwd
+      File.expand_path(File.join(cwd, 'websites', backup_name))
     end
   end

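
`File.expand_path` makes the default destination absolute regardless of how the tool was invoked, for example:

  # with Dir.pwd == "/home/user" and backup_name == "example.com"
  File.expand_path(File.join("/home/user", "websites", "example.com"))
  # => "/home/user/websites/example.com"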
@@ -269,53 +290,58 @@ class WaybackMachineDownloader
     page_index = 0
     batch_size = [@threads_count, 5].min
     continue_fetching = true
-
-
-
-
-
-
-
-
-
-
-
-
+    fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
+    begin
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute(executor: fetch_pool) do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            result ||= []
+            [page, result]
           end
-            result ||= []
-            [page, result]
         end
-        end

-
+        results = []

-
-
-
-
-
+        futures.each do |future|
+          begin
+            results << future.value
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
         end
-        end

-
-
-
-
-
-
-
-
-
-
-
-
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
+
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.nil? || result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
           end
         end
-        end

-
+        page_index = end_index

-
+        sleep(RATE_LIMIT) if continue_fetching
+      end
+    ensure
+      fetch_pool.shutdown
+      fetch_pool.wait_for_termination
     end
   end

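
The batching pattern above is plain concurrent-ruby: submit one `Concurrent::Future` per CDX page to a bounded executor, then block on `value` and process pages in order. A self-contained sketch of the same pattern with a stubbed fetch in place of the Wayback API call:

  require 'concurrent-ruby'

  fetch_pool = Concurrent::FixedThreadPool.new(4)
  futures = (0...4).map do |page|
    Concurrent::Future.execute(executor: fetch_pool) do
      [page, ["snapshot-#{page}"]]   # stand-in for get_raw_list_from_api
    end
  end

  results = futures.map(&:value)     # value blocks until each future resolves
  results.sort_by! { |page, _| page }
  p results

  fetch_pool.shutdown
  fetch_pool.wait_for_termination

Note that `Concurrent::Future#value` returns nil when a future fails rather than raising, which is why the nil/empty check on each result afterwards matters.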
@@ -630,13 +656,13 @@ class WaybackMachineDownloader
     end

     # URLs in HTML attributes
-    rewrite_html_attr_urls(content)
+    content = rewrite_html_attr_urls(content)

     # URLs in CSS
-    rewrite_css_urls(content)
+    content = rewrite_css_urls(content)

     # URLs in JavaScript
-    rewrite_js_urls(content)
+    content = rewrite_js_urls(content)

     # for URLs in HTML attributes that start with a single slash
     content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
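
The three `content = ...` changes fix dropped return values: if a rewrite helper returns a new String rather than mutating its argument in place, calling it without keeping the result is a no-op. Illustrative only (hypothetical helper, not one of the gem's):

  def add_suffix(str)
    str + "!"                  # returns a new String; str is untouched
  end

  content = "index.html"
  add_suffix(content)          # result discarded; content is still "index.html"
  content = add_suffix(content)
  puts content                 # => "index.html!"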
@@ -769,17 +795,83 @@ class WaybackMachineDownloader
   # safely sanitize a file id (or id+timestamp)
   def sanitize_and_prepare_id(raw, file_url)
     return nil if raw.nil?
+    return "" if raw.empty?
+    original = raw.dup
     begin
-
-      raw.
-
+      # work on a binary copy to avoid premature encoding errors
+      raw = raw.dup.force_encoding(Encoding::BINARY)
+
+      # percent-decode (repeat until stable in case of double-encoding)
+      loop do
+        decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
+        break if decoded == raw
+        raw = decoded
+      end
+
+      # try tidy_bytes
+      begin
+        raw = raw.tidy_bytes
+      rescue StandardError
+        # fallback: scrub to UTF-8
+        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+      end
+
+      # ensure UTF-8 and scrub again
+      unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
+        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+      end
+
+      # strip HTML/comment artifacts & control chars
+      raw.gsub!(/<!--+/, '')
+      raw.gsub!(/[\x00-\x1F]/, '')
+
+      # split query; hash it for stable short name
+      path_part, query_part = raw.split('?', 2)
+      if query_part && !query_part.empty?
+        q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
+        if path_part.include?('.')
+          pre, _sep, post = path_part.rpartition('.')
+          path_part = "#{pre}__q#{q_digest}.#{post}"
+        else
+          path_part = "#{path_part}__q#{q_digest}"
+        end
+      end
+      raw = path_part
+
+      # collapse slashes & trim leading slash
+      raw.gsub!(%r{/+}, '/')
+      raw.sub!(%r{\A/}, '')
+
+      # segment-wise sanitation
+      raw = raw.split('/').map do |segment|
+        seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+        seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
+        seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
+        seg.empty? ? '_' : seg
+      end.join('/')
+
+      # remove any remaining angle brackets
+      raw.tr!('<>', '')
+
+      # final fallback if empty
+      raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?
+
       raw
     rescue => e
       @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
-      nil
+      # deterministic fallback – never return nil so caller won’t mark malformed
+      "file__#{Digest::SHA1.hexdigest(original)[0,10]}"
     end
   end

+  # wrap URL in parentheses if it contains characters that commonly break unquoted
+  # Windows CMD usage (e.g., &). This is only for display; user still must quote
+  # when invoking manually.
+  def safe_display_url(url)
+    return url unless url && url.match?(/[&]/)
+    "(#{url})"
+  end
+
   def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
     retries = 0
     begin
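
The query-string handling gives distinct, filesystem-safe names to URLs that differ only by query. A worked example of the digest naming, mirroring the diff's logic on a hypothetical input:

  require 'digest'

  raw = "search.php?q=ruby&page=2"
  path_part, query_part = raw.split('?', 2)
  q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
  pre, _sep, post = path_part.rpartition('.')
  puts "#{pre}__q#{q_digest}.#{post}"
  # => "search__q<first 12 hex chars of the SHA-256>.php"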
@@ -860,9 +952,9 @@ class WaybackMachineDownloader
       end

     rescue StandardError => e
-      if retries <
+      if retries < @max_retries
         retries += 1
-        @logger.warn("Retry #{retries}/#{
+        @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
         sleep(RETRY_DELAY * retries)
         retry
       else
metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.1
+  version: 2.4.4
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-10-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby