RubyGems - wayback_machine_downloader_straw - Versions diffs - 2.3.12 → 2.4.1 - Mend

wayback_machine_downloader_straw 2.3.12 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/lib/wayback_machine_downloader/archive_api.rb +1 -1
data/lib/wayback_machine_downloader/tidy_bytes.rb +61 -61
data/lib/wayback_machine_downloader/url_rewrite.rb +74 -0
data/lib/wayback_machine_downloader.rb +90 -105
metadata +3 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043
-  data.tar.gz: a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9
+  metadata.gz: f6650c4217f2630db6307bc50ae2d6cefcbc38afc18b5701cc90a956af5cf1cf
+  data.tar.gz: 0ad44d7daa4c69b75d319c3518c4b801810be071545d5eded4497073caab4667
 SHA512:
-  metadata.gz: 01bdc9142820719c1ab17a50067fc478975627f414a29bdca32ea5fedf23227f33fb331f9470bb002af80cc50a6a74c7c8361f214d162c537d100860bdb664bc
-  data.tar.gz: f47436ecd1d4b8a4062d8689dac0d9fc4d73c743d5f84bd96764aa2a186eaae607fcee6c7b9e72f9fd3befd1fadfe9006354a43bfd134c892fbf5dfdd736ee28
+  metadata.gz: 7a8cfd1cda19bc3ff2db8859e03877395eaf44092ffbe9f5334218fbd6293ff1aecc60e2bf272f875a67ecd086a209c56640db221f4d13739669a27eada1c826
+  data.tar.gz: 877436af63fa205add55ebeb55bafcd39fec0afa56707ee742871014dac48998e8028ef4616a0b611bee5f9a93ed0d8d136375d457503a3e34b9a37f87321787

data/lib/wayback_machine_downloader/archive_api.rb CHANGED

@@ -25,7 +25,7 @@ module ArchiveAPI
       # Check if the response contains the header ["timestamp", "original"]
       json.shift if json.first == ["timestamp", "original"]
       json
-    rescue JSON::ParserError, StandardError => e
+    rescue JSON::ParserError => e
       warn "Failed to fetch data from API: #{e.message}"
       []
     end

data/lib/wayback_machine_downloader/tidy_bytes.rb CHANGED

@@ -1,74 +1,74 @@
 # frozen_string_literal: true
+# essentially, this is for converting a string with a potentially
+# broken or unknown encoding into a valid UTF-8 string
+# @todo: consider using charlock_holmes for this in the future
 module TidyBytes
-  # precomputing CP1252 to UTF-8 mappings for bytes 128-159
-  CP1252_MAP = (128..159).map do |byte|
-    case byte
-    when 128 then [226, 130, 172]  # EURO SIGN
-    when 130 then [226, 128, 154]  # SINGLE LOW-9 QUOTATION MARK
-    when 131 then [198, 146]       # LATIN SMALL LETTER F WITH HOOK
-    when 132 then [226, 128, 158]  # DOUBLE LOW-9 QUOTATION MARK
-    when 133 then [226, 128, 166]  # HORIZONTAL ELLIPSIS
-    when 134 then [226, 128, 160]  # DAGGER
-    when 135 then [226, 128, 161]  # DOUBLE DAGGER
-    when 136 then [203, 134]       # MODIFIER LETTER CIRCUMFLEX ACCENT
-    when 137 then [226, 128, 176]  # PER MILLE SIGN
-    when 138 then [197, 160]       # LATIN CAPITAL LETTER S WITH CARON
-    when 139 then [226, 128, 185]  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
-    when 140 then [197, 146]       # LATIN CAPITAL LIGATURE OE
-    when 142 then [197, 189]       # LATIN CAPITAL LETTER Z WITH CARON
-    when 145 then [226, 128, 152]  # LEFT SINGLE QUOTATION MARK
-    when 146 then [226, 128, 153]  # RIGHT SINGLE QUOTATION MARK
-    when 147 then [226, 128, 156]  # LEFT DOUBLE QUOTATION MARK
-    when 148 then [226, 128, 157]  # RIGHT DOUBLE QUOTATION MARK
-    when 149 then [226, 128, 162]  # BULLET
-    when 150 then [226, 128, 147]  # EN DASH
-    when 151 then [226, 128, 148]  # EM DASH
-    when 152 then [203, 156]       # SMALL TILDE
-    when 153 then [226, 132, 162]  # TRADE MARK SIGN
-    when 154 then [197, 161]       # LATIN SMALL LETTER S WITH CARON
-    when 155 then [226, 128, 186]  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
-    when 156 then [197, 147]       # LATIN SMALL LIGATURE OE
-    when 158 then [197, 190]       # LATIN SMALL LETTER Z WITH CARON
-    when 159 then [197, 184]       # LATIN SMALL LETTER Y WITH DIAERESIS
-    else nil                       # ANYTHING ELSE...
+  UNICODE_REPLACEMENT_CHARACTER = "�"
+  # common encodings to try for best multilingual compatibility
+  COMMON_ENCODINGS = [
+    Encoding::UTF_8,
+    Encoding::Windows_1251, # Cyrillic/Russian legacy
+    Encoding::GB18030,      # Simplified Chinese
+    Encoding::Shift_JIS,    # Japanese
+    Encoding::EUC_KR,       # Korean
+    Encoding::ISO_8859_1,   # Western European
+    Encoding::Windows_1252  # Western European/Latin1 superset
+  ].select { |enc| Encoding.name_list.include?(enc.name) }
+  # returns true if the string appears to be binary (has null bytes)
+  def binary_data?
+    self.include?("\x00".b)
+  end
+  # attempts to return a valid UTF-8 version of the string
+  def tidy_bytes
+    return self if self.encoding == Encoding::UTF_8 && self.valid_encoding?
+    return self.dup.force_encoding("BINARY") if binary_data?
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding? && !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
     end
-  end.freeze
-  # precomputing all possible byte conversions
-  CP1252_TO_UTF8 = Array.new(256) do |b|
-    if (128..159).cover?(b)
-      CP1252_MAP[b - 128]&.pack('C*')
-    elsif b < 128
-      b.chr
-    else
-      b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
+    # if no clean conversion found, try again but accept replacement characters
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding?
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
     end
-  end.freeze
+    # fallback: replace all invalid/undefined bytes
+    str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+  end
+  def tidy_bytes!
+    replace(self.tidy_bytes)
+  end
   def self.included(base)
-    base.class_eval do
-      def tidy_bytes(force = false)
-        return nil if empty?
-        if force
-          buffer = String.new(capacity: bytesize)
-          each_byte { |b| buffer << CP1252_TO_UTF8[b] }
-          return buffer.force_encoding(Encoding::UTF_8)
-        end
+    base.send(:include, InstanceMethods)
+  end
-        begin
-          encode('UTF-8')
-        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
-          buffer = String.new(capacity: bytesize)
-          scrub { |b| CP1252_TO_UTF8[b.ord] }
-        end
-      end
+  module InstanceMethods
+    def tidy_bytes
+      TidyBytes.instance_method(:tidy_bytes).bind(self).call
+    end
-      def tidy_bytes!(force = false)
-        result = tidy_bytes(force)
-        result ? replace(result) : self
-      end
+    def tidy_bytes!
+      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
     end
   end
 end

data/lib/wayback_machine_downloader/url_rewrite.rb ADDED

@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+# URLs in HTML attributes
+def rewrite_html_attr_urls(content)
+  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    prefix, url, suffix = $1, $2, $3
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{prefix}#{path}#{suffix}"
+      rescue
+        "#{prefix}#{url}#{suffix}"
+      end
+    elsif url.start_with?('/')
+      "#{prefix}./#{url[1..-1]}#{suffix}"
+    else
+      "#{prefix}#{url}#{suffix}"
+    end
+  end
+  content
+end
+# URLs in CSS
+def rewrite_css_urls(content)
+  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
+    url = $1
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "url(\"#{path}\")"
+      rescue
+        "url(\"#{url}\")"
+      end
+    elsif url.start_with?('/')
+      "url(\"./#{url[1..-1]}\")"
+    else
+      "url(\"#{url}\")"
+    end
+  end
+  content
+end
+# URLs in JavaScript
+def rewrite_js_urls(content)
+  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    quote_start, url, quote_end = $1, $2, $3
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{quote_start}#{path}#{quote_end}"
+      rescue
+        "#{quote_start}#{url}#{quote_end}"
+      end
+    elsif url.start_with?('/')
+      "#{quote_start}./#{url[1..-1]}#{quote_end}"
+    else
+      "#{quote_start}#{url}#{quote_end}"
+    end
+  end
+  content
+end

data/lib/wayback_machine_downloader.rb CHANGED

@@ -15,6 +15,7 @@ require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
 require_relative 'wayback_machine_downloader/subdom_processor'
+require_relative 'wayback_machine_downloader/url_rewrite'
 class ConnectionPool
   MAX_AGE = 300
@@ -115,7 +116,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor
-  VERSION = "2.3.12"
+  VERSION = "2.4.1"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -133,10 +134,11 @@ class WaybackMachineDownloader
   def initialize params
     validate_params(params)
-    @base_url = params[:base_url]
+    @base_url = params[:base_url]&.tidy_bytes
     @exact_url = params[:exact_url]
     if params[:directory]
-      @directory = File.expand_path(params[:directory])
+      sanitized_dir = params[:directory].tidy_bytes
+      @directory = File.expand_path(sanitized_dir)
     else
       @directory = nil
     end
@@ -338,15 +340,15 @@ class WaybackMachineDownloader
     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
       next unless file_url.include?('/')
       next if file_timestamp.to_i > target_timestamp
-      file_id = file_url.split('/')[3..-1].join('/')
-      file_id = CGI::unescape file_id
-      file_id = file_id.tidy_bytes unless file_id == ""
+      raw_tail = file_url.split('/')[3..-1]&.join('/')
+      file_id = sanitize_and_prepare_id(raw_tail, file_url)
       next if file_id.nil?
       next if match_exclude_filter(file_url)
       next unless match_only_filter(file_url)
-      # Select the most recent version <= target_timestamp
       if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
-        file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
+        file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
       end
     end
     file_versions.values
@@ -366,22 +368,27 @@ class WaybackMachineDownloader
     file_list_curated = Hash.new
     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
       next unless file_url.include?('/')
-      file_id = file_url.split('/')[3..-1].join('/')
-      file_id = CGI::unescape file_id
-      file_id = file_id.tidy_bytes unless file_id == ""
+      raw_tail = file_url.split('/')[3..-1]&.join('/')
+      file_id = sanitize_and_prepare_id(raw_tail, file_url)
       if file_id.nil?
         puts "Malformed file url, ignoring: #{file_url}"
+        next
+      end
+      if file_id.include?('<') || file_id.include?('>')
+        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
       else
         if match_exclude_filter(file_url)
           puts "File url matches exclude filter, ignoring: #{file_url}"
-        elsif not match_only_filter(file_url)
+        elsif !match_only_filter(file_url)
           puts "File url doesn't match only filter, ignoring: #{file_url}"
         elsif file_list_curated[file_id]
           unless file_list_curated[file_id][:timestamp] > file_timestamp
-            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+            file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
           end
         else
-          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+          file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
         end
       end
     end
@@ -392,21 +399,32 @@ class WaybackMachineDownloader
     file_list_curated = Hash.new
     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
       next unless file_url.include?('/')
-      file_id = file_url.split('/')[3..-1].join('/')
-      file_id_and_timestamp = [file_timestamp, file_id].join('/')
-      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
-      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+      raw_tail = file_url.split('/')[3..-1]&.join('/')
+      file_id = sanitize_and_prepare_id(raw_tail, file_url)
       if file_id.nil?
         puts "Malformed file url, ignoring: #{file_url}"
+        next
+      end
+      file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
+      if file_id_and_timestamp.nil?
+        puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
+        next
+      end
+      if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
+        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
       else
         if match_exclude_filter(file_url)
           puts "File url matches exclude filter, ignoring: #{file_url}"
-        elsif not match_only_filter(file_url)
+        elsif !match_only_filter(file_url)
           puts "File url doesn't match only filter, ignoring: #{file_url}"
         elsif file_list_curated[file_id_and_timestamp]
-          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+          # duplicate combo, ignore silently (verbose flag not shown here)
         else
-          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+          file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
         end
       end
     end
@@ -473,6 +491,39 @@ class WaybackMachineDownloader
     end
   end
+  def processing_files(pool, files_to_process)
+    files_to_process.each do |file_remote_info|
+      pool.post do
+        download_success = false
+        begin
+          @connection_pool.with_connection do |connection|
+            result_message = download_file(file_remote_info, connection)
+            # assume download success if the result message contains ' -> '
+            if result_message && result_message.include?(' -> ')
+               download_success = true
+            end
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              # adjust progress message to reflect remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
+            end
+          end
+          # sppend to DB only after successful download outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
+          end
+        rescue => e
+          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+           @download_mutex.synchronize do
+              @processed_file_count += 1
+           end
+        end
+        sleep(RATE_LIMIT)
+      end
+    end
+  end
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -519,36 +570,7 @@ class WaybackMachineDownloader
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
     pool = Concurrent::FixedThreadPool.new(thread_count)
-    files_to_process.each do |file_remote_info|
-      pool.post do
-        download_success = false
-        begin
-          @connection_pool.with_connection do |connection|
-            result_message = download_file(file_remote_info, connection)
-            # assume download success if the result message contains ' -> '
-            if result_message && result_message.include?(' -> ')
-               download_success = true
-            end
-            @download_mutex.synchronize do
-              @processed_file_count += 1
-              # adjust progress message to reflect remaining files
-              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
-              puts progress_message if progress_message
-            end
-          end
-          # sppend to DB only after successful download outside the connection block
-          if download_success
-            append_to_db(file_remote_info[:file_id])
-          end
-        rescue => e
-          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
-           @download_mutex.synchronize do
-              @processed_file_count += 1
-           end
-        end
-        sleep(RATE_LIMIT)
-      end
-    end
+    processing_files(pool, files_to_process)
     pool.shutdown
     pool.wait_for_termination
@@ -608,64 +630,13 @@ class WaybackMachineDownloader
       end
       # URLs in HTML attributes
-      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-        prefix, url, suffix = $1, $2, $3
-        if url.start_with?('http')
-          begin
-            uri = URI.parse(url)
-            path = uri.path
-            path = path[1..-1] if path.start_with?('/')
-            "#{prefix}#{path}#{suffix}"
-          rescue
-            "#{prefix}#{url}#{suffix}"
-          end
-        elsif url.start_with?('/')
-          "#{prefix}./#{url[1..-1]}#{suffix}"
-        else
-          "#{prefix}#{url}#{suffix}"
-        end
-      end
+      rewrite_html_attr_urls(content)
       # URLs in CSS
-      content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
-        url = $1
-        if url.start_with?('http')
-          begin
-            uri = URI.parse(url)
-            path = uri.path
-            path = path[1..-1] if path.start_with?('/')
-            "url(\"#{path}\")"
-          rescue
-            "url(\"#{url}\")"
-          end
-        elsif url.start_with?('/')
-          "url(\"./#{url[1..-1]}\")"
-        else
-          "url(\"#{url}\")"
-        end
-      end
+      rewrite_css_urls(content)
       # URLs in JavaScript
-      content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-        quote_start, url, quote_end = $1, $2, $3
-        if url.start_with?('http')
-          begin
-            uri = URI.parse(url)
-            path = uri.path
-            path = path[1..-1] if path.start_with?('/')
-            "#{quote_start}#{path}#{quote_end}"
-          rescue
-            "#{quote_start}#{url}#{quote_end}"
-          end
-        elsif url.start_with?('/')
-          "#{quote_start}./#{url[1..-1]}#{quote_end}"
-        else
-          "#{quote_start}#{url}#{quote_end}"
-        end
-      end
+      rewrite_js_urls(content)
       # for URLs in HTML attributes that start with a single slash
       content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
@@ -794,6 +765,20 @@ class WaybackMachineDownloader
     end
     logger
   end
+  # safely sanitize a file id (or id+timestamp)
+  def sanitize_and_prepare_id(raw, file_url)
+    return nil if raw.nil?
+    begin
+      raw = CGI.unescape(raw) rescue raw
+      raw.gsub!(/<[^>]*>/, '')
+      raw = raw.tidy_bytes unless raw.empty?
+      raw
+    rescue => e
+      @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
+      nil
+    end
+  end
   def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
     retries = 0

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.12
+  version: 2.4.1
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-07-22 00:00:00.000000000 Z
+date: 2025-08-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -74,6 +74,7 @@ files:
 - lib/wayback_machine_downloader/subdom_processor.rb
 - lib/wayback_machine_downloader/tidy_bytes.rb
 - lib/wayback_machine_downloader/to_regex.rb
+- lib/wayback_machine_downloader/url_rewrite.rb
 homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
 licenses:
 - MIT