RubyGems - wayback_machine_downloader_straw - Versions diffs - 2.3.12 → 2.4.0 - Mend

wayback_machine_downloader_straw 2.3.12 → 2.4.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/lib/wayback_machine_downloader/archive_api.rb +1 -1
data/lib/wayback_machine_downloader/tidy_bytes.rb +61 -61
data/lib/wayback_machine_downloader/url_rewrite.rb +74 -0
data/lib/wayback_machine_downloader.rb +42 -87
metadata +3 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043
-  data.tar.gz: a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9
+  metadata.gz: 35a8c4a865a9da5cb45e7f63e2f832f491895f5c69c3d440b9c8b4230b8444f1
+  data.tar.gz: a96d746b41f3e3b7a1cf6df38df3b23a79361f57f667eea562be72961bf391c2
 SHA512:
-  metadata.gz: 01bdc9142820719c1ab17a50067fc478975627f414a29bdca32ea5fedf23227f33fb331f9470bb002af80cc50a6a74c7c8361f214d162c537d100860bdb664bc
-  data.tar.gz: f47436ecd1d4b8a4062d8689dac0d9fc4d73c743d5f84bd96764aa2a186eaae607fcee6c7b9e72f9fd3befd1fadfe9006354a43bfd134c892fbf5dfdd736ee28
+  metadata.gz: 783bb658ee95bd523fb3dc8c2c11a027947becc4e72902e2fff85eb725bbc8e3ef8e7bb22b08598f015f77e801526354f36b6d920144df9fd6bca440cccf8127
+  data.tar.gz: a2e0ce3e4df543574b1c04e349d120b31d900bbbfe3f9bf512706f57094d89c49574290520df25fdd8c920577baf561272af65ca4c36d058a3a4097efa167a83

data/lib/wayback_machine_downloader/archive_api.rb CHANGED

@@ -25,7 +25,7 @@ module ArchiveAPI
       # Check if the response contains the header ["timestamp", "original"]
       json.shift if json.first == ["timestamp", "original"]
       json
-    rescue JSON::ParserError, StandardError => e
+    rescue JSON::ParserError => e
       warn "Failed to fetch data from API: #{e.message}"
       []
     end

data/lib/wayback_machine_downloader/tidy_bytes.rb CHANGED

@@ -1,74 +1,74 @@
 # frozen_string_literal: true
+# essentially, this is for converting a string with a potentially
+# broken or unknown encoding into a valid UTF-8 string
+# @todo: consider using charlock_holmes for this in the future
 module TidyBytes
-  # precomputing CP1252 to UTF-8 mappings for bytes 128-159
-  CP1252_MAP = (128..159).map do |byte|
-    case byte
-    when 128 then [226, 130, 172]  # EURO SIGN
-    when 130 then [226, 128, 154]  # SINGLE LOW-9 QUOTATION MARK
-    when 131 then [198, 146]       # LATIN SMALL LETTER F WITH HOOK
-    when 132 then [226, 128, 158]  # DOUBLE LOW-9 QUOTATION MARK
-    when 133 then [226, 128, 166]  # HORIZONTAL ELLIPSIS
-    when 134 then [226, 128, 160]  # DAGGER
-    when 135 then [226, 128, 161]  # DOUBLE DAGGER
-    when 136 then [203, 134]       # MODIFIER LETTER CIRCUMFLEX ACCENT
-    when 137 then [226, 128, 176]  # PER MILLE SIGN
-    when 138 then [197, 160]       # LATIN CAPITAL LETTER S WITH CARON
-    when 139 then [226, 128, 185]  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
-    when 140 then [197, 146]       # LATIN CAPITAL LIGATURE OE
-    when 142 then [197, 189]       # LATIN CAPITAL LETTER Z WITH CARON
-    when 145 then [226, 128, 152]  # LEFT SINGLE QUOTATION MARK
-    when 146 then [226, 128, 153]  # RIGHT SINGLE QUOTATION MARK
-    when 147 then [226, 128, 156]  # LEFT DOUBLE QUOTATION MARK
-    when 148 then [226, 128, 157]  # RIGHT DOUBLE QUOTATION MARK
-    when 149 then [226, 128, 162]  # BULLET
-    when 150 then [226, 128, 147]  # EN DASH
-    when 151 then [226, 128, 148]  # EM DASH
-    when 152 then [203, 156]       # SMALL TILDE
-    when 153 then [226, 132, 162]  # TRADE MARK SIGN
-    when 154 then [197, 161]       # LATIN SMALL LETTER S WITH CARON
-    when 155 then [226, 128, 186]  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
-    when 156 then [197, 147]       # LATIN SMALL LIGATURE OE
-    when 158 then [197, 190]       # LATIN SMALL LETTER Z WITH CARON
-    when 159 then [197, 184]       # LATIN SMALL LETTER Y WITH DIAERESIS
-    else nil                       # ANYTHING ELSE...
+  UNICODE_REPLACEMENT_CHARACTER = "�"
+  # common encodings to try for best multilingual compatibility
+  COMMON_ENCODINGS = [
+    Encoding::UTF_8,
+    Encoding::Windows_1251, # Cyrillic/Russian legacy
+    Encoding::GB18030,      # Simplified Chinese
+    Encoding::Shift_JIS,    # Japanese
+    Encoding::EUC_KR,       # Korean
+    Encoding::ISO_8859_1,   # Western European
+    Encoding::Windows_1252  # Western European/Latin1 superset
+  ].select { |enc| Encoding.name_list.include?(enc.name) }
+  # returns true if the string appears to be binary (has null bytes)
+  def binary_data?
+    self.include?("\x00".b)
+  end
+  # attempts to return a valid UTF-8 version of the string
+  def tidy_bytes
+    return self if self.encoding == Encoding::UTF_8 && self.valid_encoding?
+    return self.dup.force_encoding("BINARY") if binary_data?
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding? && !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
     end
-  end.freeze
-  # precomputing all possible byte conversions
-  CP1252_TO_UTF8 = Array.new(256) do |b|
-    if (128..159).cover?(b)
-      CP1252_MAP[b - 128]&.pack('C*')
-    elsif b < 128
-      b.chr
-    else
-      b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
+    # if no clean conversion found, try again but accept replacement characters
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding?
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
     end
-  end.freeze
+    # fallback: replace all invalid/undefined bytes
+    str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+  end
+  def tidy_bytes!
+    replace(self.tidy_bytes)
+  end
   def self.included(base)
-    base.class_eval do
-      def tidy_bytes(force = false)
-        return nil if empty?
-        if force
-          buffer = String.new(capacity: bytesize)
-          each_byte { |b| buffer << CP1252_TO_UTF8[b] }
-          return buffer.force_encoding(Encoding::UTF_8)
-        end
+    base.send(:include, InstanceMethods)
+  end
-        begin
-          encode('UTF-8')
-        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
-          buffer = String.new(capacity: bytesize)
-          scrub { |b| CP1252_TO_UTF8[b.ord] }
-        end
-      end
+  module InstanceMethods
+    def tidy_bytes
+      TidyBytes.instance_method(:tidy_bytes).bind(self).call
+    end
-      def tidy_bytes!(force = false)
-        result = tidy_bytes(force)
-        result ? replace(result) : self
-      end
+    def tidy_bytes!
+      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
     end
   end
 end

data/lib/wayback_machine_downloader/url_rewrite.rb ADDED

@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+# URLs in HTML attributes
+def rewrite_html_attr_urls(content)
+  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    prefix, url, suffix = $1, $2, $3
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{prefix}#{path}#{suffix}"
+      rescue
+        "#{prefix}#{url}#{suffix}"
+      end
+    elsif url.start_with?('/')
+      "#{prefix}./#{url[1..-1]}#{suffix}"
+    else
+      "#{prefix}#{url}#{suffix}"
+    end
+  end
+  content
+end
+# URLs in CSS
+def rewrite_css_urls(content)
+  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
+    url = $1
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "url(\"#{path}\")"
+      rescue
+        "url(\"#{url}\")"
+      end
+    elsif url.start_with?('/')
+      "url(\"./#{url[1..-1]}\")"
+    else
+      "url(\"#{url}\")"
+    end
+  end
+  content
+end
+# URLs in JavaScript
+def rewrite_js_urls(content)
+  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    quote_start, url, quote_end = $1, $2, $3
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{quote_start}#{path}#{quote_end}"
+      rescue
+        "#{quote_start}#{url}#{quote_end}"
+      end
+    elsif url.start_with?('/')
+      "#{quote_start}./#{url[1..-1]}#{quote_end}"
+    else
+      "#{quote_start}#{url}#{quote_end}"
+    end
+  end
+  content
+end

data/lib/wayback_machine_downloader.rb CHANGED

@@ -15,6 +15,7 @@ require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
 require_relative 'wayback_machine_downloader/subdom_processor'
+require_relative 'wayback_machine_downloader/url_rewrite'
 class ConnectionPool
   MAX_AGE = 300
@@ -115,7 +116,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor
-  VERSION = "2.3.12"
+  VERSION = "2.4.0"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -133,10 +134,11 @@ class WaybackMachineDownloader
   def initialize params
     validate_params(params)
-    @base_url = params[:base_url]
+    @base_url = params[:base_url]&.tidy_bytes
     @exact_url = params[:exact_url]
     if params[:directory]
-      @directory = File.expand_path(params[:directory])
+      sanitized_dir = params[:directory].tidy_bytes
+      @directory = File.expand_path(sanitized_dir)
     else
       @directory = nil
     end
@@ -473,6 +475,39 @@ class WaybackMachineDownloader
     end
   end
+  def processing_files(pool, files_to_process)
+    files_to_process.each do |file_remote_info|
+      pool.post do
+        download_success = false
+        begin
+          @connection_pool.with_connection do |connection|
+            result_message = download_file(file_remote_info, connection)
+            # assume download success if the result message contains ' -> '
+            if result_message && result_message.include?(' -> ')
+               download_success = true
+            end
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              # adjust progress message to reflect remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
+            end
+          end
+          # sppend to DB only after successful download outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
+          end
+        rescue => e
+          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+           @download_mutex.synchronize do
+              @processed_file_count += 1
+           end
+        end
+        sleep(RATE_LIMIT)
+      end
+    end
+  end
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -519,36 +554,7 @@ class WaybackMachineDownloader
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
     pool = Concurrent::FixedThreadPool.new(thread_count)
-    files_to_process.each do |file_remote_info|
-      pool.post do
-        download_success = false
-        begin
-          @connection_pool.with_connection do |connection|
-            result_message = download_file(file_remote_info, connection)
-            # assume download success if the result message contains ' -> '
-            if result_message && result_message.include?(' -> ')
-               download_success = true
-            end
-            @download_mutex.synchronize do
-              @processed_file_count += 1
-              # adjust progress message to reflect remaining files
-              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
-              puts progress_message if progress_message
-            end
-          end
-          # sppend to DB only after successful download outside the connection block
-          if download_success
-            append_to_db(file_remote_info[:file_id])
-          end
-        rescue => e
-          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
-           @download_mutex.synchronize do
-              @processed_file_count += 1
-           end
-        end
-        sleep(RATE_LIMIT)
-      end
-    end
+    processing_files(pool, files_to_process)
     pool.shutdown
     pool.wait_for_termination
@@ -608,64 +614,13 @@ class WaybackMachineDownloader
       end
       # URLs in HTML attributes
-      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-        prefix, url, suffix = $1, $2, $3
-        if url.start_with?('http')
-          begin
-            uri = URI.parse(url)
-            path = uri.path
-            path = path[1..-1] if path.start_with?('/')
-            "#{prefix}#{path}#{suffix}"
-          rescue
-            "#{prefix}#{url}#{suffix}"
-          end
-        elsif url.start_with?('/')
-          "#{prefix}./#{url[1..-1]}#{suffix}"
-        else
-          "#{prefix}#{url}#{suffix}"
-        end
-      end
+      rewrite_html_attr_urls(content)
       # URLs in CSS
-      content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
-        url = $1
-        if url.start_with?('http')
-          begin
-            uri = URI.parse(url)
-            path = uri.path
-            path = path[1..-1] if path.start_with?('/')
-            "url(\"#{path}\")"
-          rescue
-            "url(\"#{url}\")"
-          end
-        elsif url.start_with?('/')
-          "url(\"./#{url[1..-1]}\")"
-        else
-          "url(\"#{url}\")"
-        end
-      end
+      rewrite_css_urls(content)
       # URLs in JavaScript
-      content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-        quote_start, url, quote_end = $1, $2, $3
-        if url.start_with?('http')
-          begin
-            uri = URI.parse(url)
-            path = uri.path
-            path = path[1..-1] if path.start_with?('/')
-            "#{quote_start}#{path}#{quote_end}"
-          rescue
-            "#{quote_start}#{url}#{quote_end}"
-          end
-        elsif url.start_with?('/')
-          "#{quote_start}./#{url[1..-1]}#{quote_end}"
-        else
-          "#{quote_start}#{url}#{quote_end}"
-        end
-      end
+      rewrite_js_urls(content)
       # for URLs in HTML attributes that start with a single slash
       content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.12
+  version: 2.4.0
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-07-22 00:00:00.000000000 Z
+date: 2025-08-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -74,6 +74,7 @@ files:
 - lib/wayback_machine_downloader/subdom_processor.rb
 - lib/wayback_machine_downloader/tidy_bytes.rb
 - lib/wayback_machine_downloader/to_regex.rb
+- lib/wayback_machine_downloader/url_rewrite.rb
 homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
 licenses:
 - MIT