wayback_machine_downloader_straw 2.4.3 → 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ce7592163165a7f8235bf4a6e1915cf531511fafc7f6874c0d1673fb29db704f
-  data.tar.gz: 7d48ffebf130d3b32d1ec233cf5141cc3cf192bcf16751db4380bf62863971c1
+  metadata.gz: 1e81619475172540d94968e5d31fbb6b4df7f08a533efce3b200e1ad4ce5035f
+  data.tar.gz: aca008795277ebf489cda7f0d8d1691b1983e2eec6f338b95c9bf11552fdfd94
 SHA512:
-  metadata.gz: 16d56de1814e36174c47ab5bda6c9d5e02aba15bafa72a1d57056d0ac146e5fff5c6ca43f9198262d90820e4dcbe4e63772f01bd1ee5207c7ab07e9bb959e069
-  data.tar.gz: 07602af4f0cfb9927d43239da0c38cb2411aa408d11fe3f91cb4a403fa415ca8de095eee7467e4613d32aadb8c0a13ffea19ac2f93fd5bf005a991d91e8a064a
+  metadata.gz: c71eb691ba50308f1f6b29a326c00a0678db1ff94f6ab8620b4d74425c993a64b60dd4d0127f9dd9eb3a72801008672855d901379d870b89aff82862b4f582a3
+  data.tar.gz: 6277c45d37dd02fea219f93906bf790ddede921ef88e928074b9e3da88b9882a64ecebbc99767b4c0eaef34b9a3407e9fee6352c935ea2d9942eed5735c51632
@@ -74,6 +74,10 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end
 
+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
   opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
     options[:recursive_subdomains] = true
   end
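
The new --rt/--retry flag stores its value in options[:max_retries]; further down the diff, WaybackMachineDownloader#initialize falls back to MAX_RETRIES (3) when that key is absent. A minimal sketch of how the parsed value could reach the downloader, assuming the executable forwards the options hash to the constructor as it does for the existing flags:

  # Hypothetical wiring; only the :max_retries handling is shown in this diff.
  options = { base_url: "https://example.com", max_retries: 5 }
  downloader = WaybackMachineDownloader.new(options)  # @max_retries => 5
  # Without --retry, params[:max_retries] is nil and @max_retries falls back to MAX_RETRIES (3).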
@@ -25,69 +25,81 @@ class ConnectionPool
   MAX_RETRIES = 3
 
   def initialize(size)
-    @size = size
-    @pool = Concurrent::Map.new
-    @creation_times = Concurrent::Map.new
+    @pool = SizedQueue.new(size)
+    size.times { @pool << build_connection_entry }
     @cleanup_thread = schedule_cleanup
   end
 
-  def with_connection(&block)
-    conn = acquire_connection
+  def with_connection
+    entry = acquire_connection
     begin
-      yield conn
+      yield entry[:http]
     ensure
-      release_connection(conn)
+      release_connection(entry)
     end
   end
 
   def shutdown
     @cleanup_thread&.exit
-    @pool.each_value { |conn| conn.finish if conn&.started? }
-    @pool.clear
-    @creation_times.clear
+    drain_pool { |entry| safe_finish(entry[:http]) }
   end
 
   private
 
   def acquire_connection
-    thread_id = Thread.current.object_id
-    conn = @pool[thread_id]
+    entry = @pool.pop
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    entry
+  end
 
-    if should_create_new?(conn)
-      conn&.finish if conn&.started?
-      conn = create_connection
-      @pool[thread_id] = conn
-      @creation_times[thread_id] = Time.now
+  def release_connection(entry)
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
     end
+    @pool << entry
+  end
 
-    conn
+  def stale?(entry)
+    http = entry[:http]
+    !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
   end
 
-  def release_connection(conn)
-    return unless conn
-    if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-      conn.finish
-      @pool.delete(Thread.current.object_id)
-      @creation_times.delete(Thread.current.object_id)
-    end
+  def build_connection_entry
+    { http: create_connection, created_at: Time.now }
   end
 
-  def should_create_new?(conn)
-    return true if conn.nil?
-    return true unless conn.started?
-    return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-    false
+  def safe_finish(http)
+    http.finish if http&.started?
+  rescue StandardError
+    nil
   end
 
-  def create_connection
-    http = Net::HTTP.new("web.archive.org", 443)
-    http.use_ssl = true
-    http.read_timeout = DEFAULT_TIMEOUT
-    http.open_timeout = DEFAULT_TIMEOUT
-    http.keep_alive_timeout = 30
-    http.max_retries = MAX_RETRIES
-    http.start
-    http
+  def drain_pool
+    loop do
+      entry = begin
+        @pool.pop(true)
+      rescue ThreadError
+        break
+      end
+      yield(entry)
+    end
+  end
+
+  def cleanup_old_connections
+    entry = begin
+      @pool.pop(true)
+    rescue ThreadError
+      return
+    end
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    @pool << entry
   end
 
   def schedule_cleanup
@@ -99,16 +111,15 @@ class ConnectionPool
     end
   end
 
-  def cleanup_old_connections
-    current_time = Time.now
-    @creation_times.each do |thread_id, creation_time|
-      if current_time - creation_time > MAX_AGE
-        conn = @pool[thread_id]
-        conn&.finish if conn&.started?
-        @pool.delete(thread_id)
-        @creation_times.delete(thread_id)
-      end
-    end
+  def create_connection
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = DEFAULT_TIMEOUT
+    http.open_timeout = DEFAULT_TIMEOUT
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    http.start
+    http
   end
 end
 
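The ConnectionPool rewrite swaps the per-thread Concurrent::Map (keyed by Thread.current.object_id) for a SizedQueue of {http:, created_at:} entries: connections are built up front, checked out with a blocking pop, replaced when stale, and always pushed back in an ensure. A stripped-down sketch of that checkout/checkin pattern, using a plain object instead of Net::HTTP so it runs standalone:

  class TinyPool
    MAX_AGE = 300  # seconds, mirroring the pool's MAX_AGE idea

    def initialize(size)
      @pool = SizedQueue.new(size)               # blocks callers once capacity is reached
      size.times { @pool << build_entry }
    end

    def with_connection
      entry = @pool.pop                          # blocks until an entry is free
      entry = build_entry if Time.now - entry[:created_at] > MAX_AGE
      yield entry[:conn]
    ensure
      @pool << entry if entry                    # always return the slot to the queue
    end

    private

    def build_entry
      { conn: Object.new, created_at: Time.now } # stand-in for a started Net::HTTP
    end
  end

  pool = TinyPool.new(2)
  4.times.map { Thread.new { pool.with_connection { |c| c.object_id } } }.each(&:join)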
@@ -117,7 +128,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor
 
-  VERSION = "2.4.3"
+  VERSION = "2.4.4"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -163,6 +174,7 @@ class WaybackMachineDownloader
     @recursive_subdomains = params[:recursive_subdomains] || false
     @subdomain_depth = params[:subdomain_depth] || 1
     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+    @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
 
     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -193,7 +205,8 @@ class WaybackMachineDownloader
       @directory
     else
       # ensure the default path is absolute and normalized
-      File.expand_path(File.join('websites', backup_name))
+      cwd = Dir.pwd
+      File.expand_path(File.join(cwd, 'websites', backup_name))
     end
   end
 
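For context, File.expand_path already resolves a relative path against Dir.pwd, so joining the working directory mostly makes that base explicit rather than changing the result; a quick check with a placeholder backup_name:

  backup_name = "example.com"  # placeholder
  a = File.expand_path(File.join('websites', backup_name))
  b = File.expand_path(File.join(Dir.pwd, 'websites', backup_name))
  a == b  # => true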
@@ -277,53 +290,58 @@ class WaybackMachineDownloader
     page_index = 0
     batch_size = [@threads_count, 5].min
     continue_fetching = true
-
-    while continue_fetching && page_index < @maximum_pages
-      # Determine the range of pages to fetch in this batch
-      end_index = [page_index + batch_size, @maximum_pages].min
-      current_batch = (page_index...end_index).to_a
-
-      # Create futures for concurrent API calls
-      futures = current_batch.map do |page|
-        Concurrent::Future.execute do
-          result = nil
-          @connection_pool.with_connection do |connection|
-            result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+    fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
+    begin
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute(executor: fetch_pool) do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            result ||= []
+            [page, result]
           end
-          result ||= []
-          [page, result]
         end
-      end
 
-      results = []
+        results = []
 
-      futures.each do |future|
-        begin
-          results << future.value
-        rescue => e
-          puts "\nError fetching page #{future}: #{e.message}"
+        futures.each do |future|
+          begin
+            results << future.value
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
         end
-      end
-
-      # Sort results by page number to maintain order
-      results.sort_by! { |page, _| page }
 
-      # Process results and check for empty pages
-      results.each do |page, result|
-        if result.nil? || result.empty?
-          continue_fetching = false
-          break
-        else
-          mutex.synchronize do
-            snapshot_list_to_consider.concat(result)
-            print "."
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
+
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.nil? || result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
           end
         end
-      end
 
-      page_index = end_index
+        page_index = end_index
 
-      sleep(RATE_LIMIT) if continue_fetching
+        sleep(RATE_LIMIT) if continue_fetching
+      end
+    ensure
+      fetch_pool.shutdown
+      fetch_pool.wait_for_termination
     end
   end
 
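Snapshot-page fetches now run on an explicitly sized Concurrent::FixedThreadPool, handed to each Concurrent::Future via executor: and shut down in an ensure block, instead of relying on concurrent-ruby's global executor. The same pattern in isolation, with a stand-in for get_raw_list_from_api:

  require 'concurrent'

  pool = Concurrent::FixedThreadPool.new(4)
  begin
    futures = (0...8).map do |page|
      Concurrent::Future.execute(executor: pool) do
        [page, ["stand-in record for page #{page}"]]  # fake API result
      end
    end
    results = futures.map(&:value)       # value blocks until each future resolves
    results.sort_by! { |page, _| page }  # restore page order, as the downloader does
  ensure
    pool.shutdown
    pool.wait_for_termination
  end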
@@ -638,13 +656,13 @@ class WaybackMachineDownloader
     end
 
     # URLs in HTML attributes
-    rewrite_html_attr_urls(content)
+    content = rewrite_html_attr_urls(content)
 
     # URLs in CSS
-    rewrite_css_urls(content)
+    content = rewrite_css_urls(content)
 
     # URLs in JavaScript
-    rewrite_js_urls(content)
+    content = rewrite_js_urls(content)
 
     # for URLs in HTML attributes that start with a single slash
     content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
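
The rewrite_* fix matters because the old code dropped each helper's return value, which is only safe when the helper mutates content in place; reassigning keeps the chain correct even when a helper builds and returns a new string. A toy illustration with a hypothetical helper (not the gem's actual method):

  # Hypothetical helper that returns a new string rather than mutating its argument.
  def strip_archive_prefix(content)
    content.gsub(%r{https://web\.archive\.org/web/\d+/}, '')
  end

  content = "<a href='https://web.archive.org/web/20240101000000/https://example.com/'>x</a>"
  strip_archive_prefix(content)            # return value discarded: content is unchanged
  content = strip_archive_prefix(content)  # reassignment keeps the rewritten markup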
@@ -934,9 +952,9 @@ class WaybackMachineDownloader
       end
 
     rescue StandardError => e
-      if retries < MAX_RETRIES
+      if retries < @max_retries
         retries += 1
-        @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+        @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
         sleep(RETRY_DELAY * retries)
         retry
       else
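
The per-file rescue now honours the configurable @max_retries instead of the MAX_RETRIES constant, sleeping RETRY_DELAY * retries between attempts (2 s, 4 s, 6 s with the defaults). The retry shape in isolation, with a hypothetical flaky operation standing in for the download:

  RETRY_DELAY = 2
  max_retries = 3   # what @max_retries defaults to when --retry is not given
  retries = 0

  begin
    raise 'transient network error' if retries < 2  # hypothetical flaky download
    puts "succeeded after #{retries} retries"
  rescue StandardError => e
    if retries < max_retries
      retries += 1
      warn "Retry #{retries}/#{max_retries}: #{e.message}"
      sleep(RETRY_DELAY * retries)
      retry
    else
      raise
    end
  end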
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.3
+  version: 2.4.4
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-08-19 00:00:00.000000000 Z
+date: 2025-10-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby