RubyGems - hyll - Versions diffs - 0.2.0 → 1.0.0 - Mend

hyll 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +80 -0
data/README.md +53 -18
data/examples/v1_benchmark.rb +93 -0
data/lib/hyll/algorithms/enhanced_hyperloglog.rb +234 -120
data/lib/hyll/algorithms/hyperloglog.rb +262 -338
data/lib/hyll/constants.rb +75 -0
data/lib/hyll/utils/hash.rb +132 -21
data/lib/hyll/utils/math.rb +129 -75
data/lib/hyll/version.rb +1 -1
metadata +3 -2

data/lib/hyll/algorithms/enhanced_hyperloglog.rb CHANGED Viewed

@@ -1,12 +1,14 @@
 # frozen_string_literal: true
 module Hyll
-  # A strictly enhanced version of HyperLogLog with additional features - inspired by Presto's P4HYPERLOGLOG
+  # Ultra-optimized EnhancedHyperLogLog v1.0.0
+  # A strictly enhanced version of HyperLogLog with streaming martingale estimator
+  # Features: vectorized operations, in-place updates, minimal allocations
   class EnhancedHyperLogLog < HyperLogLog
     def initialize(precision = 10)
       super(precision)
-      # Always use dense format
+      # Always use dense format - pre-allocate for zero GC pressure
       @using_exact_counting = false
       @small_set = nil
       @registers = Array.new(@m, 0)
@@ -14,150 +16,208 @@ module Hyll
       # Flag to track if this was converted from standard format
       @converted_from_standard = false
       @was_merged = false
-      # Streaming martingale estimator
+      # Streaming martingale estimator - optimized state
       @streaming_estimate = 0.0
       @last_modification_probability = nil
       @quadratic_variation = 0.0
+      # Cache for modification probability
+      @cached_mod_prob = nil
+      @registers_dirty = true
     end
-    # Add an element to the HyperLogLog counter
+    # Add an element - ultra-optimized path
     # @param element [Object] the element to add
     # @return [EnhancedHyperLogLog] self for method chaining
     def add(element)
-      # Store the registers before adding the element
-      old_registers = @registers.dup
+      # Hash and extract in one pass
+      hash = murmurhash3(element.to_s)
+      register_index = hash & @register_mask
+      value = hash >> @precision
+      leading_zeros = count_leading_zeros(value) + 1
+      old_value = @registers[register_index]
-      # Calculate modification probability before adding
-      mod_probability = modification_probability
+      # Fast path: no update needed
+      return self if leading_zeros <= old_value
-      # Add element to registers (parent implementation)
-      add_to_registers(element)
+      # Calculate modification probability before update
+      mod_probability = modification_probability_fast
+      # Update register
+      @registers[register_index] = leading_zeros
       @converted_from_standard = false
+      @registers_dirty = true
-      # Sequential detection for integers
-      handle_sequential_detection(element)
+      # Update streaming estimate
+      increment = 1.0 / mod_probability
+      @streaming_estimate += increment
+      @quadratic_variation += (increment - 1.0) ** 2
+      @last_modification_probability = mod_probability
-      # Update streaming estimate if the sketch was modified
-      if old_registers != @registers
-        increment = 1.0 / mod_probability
-        @streaming_estimate += increment
+      # Sequential detection for integers (deferred)
+      handle_sequential_detection(element) if element.is_a?(Integer)
+      self
+    end
+    # Batch add - optimized for throughput
+    # @param elements [Array] elements to add
+    # @return [EnhancedHyperLogLog] self
+    def add_all(elements)
+      return self if elements.empty?
+      mod_probability = modification_probability_fast
+      modified = false
+      elements.each do |element|
+        hash = murmurhash3(element.to_s)
+        register_index = hash & @register_mask
+        value = hash >> @precision
+        leading_zeros = count_leading_zeros(value) + 1
+        if leading_zeros > @registers[register_index]
+          @registers[register_index] = leading_zeros
+          modified = true
-        # Update quadratic variation for error estimation
-        @quadratic_variation += (increment - 1)**2
+          increment = 1.0 / mod_probability
+          @streaming_estimate += increment
+          @quadratic_variation += (increment - 1.0) ** 2
+        end
+      end
+      if modified
+        @converted_from_standard = false
+        @registers_dirty = true
         @last_modification_probability = mod_probability
       end
+      # Sequential detection for integer batches
+      if elements.first.is_a?(Integer)
+        @last_values = elements.last(10)
+        detect_sequential if @last_values.size >= 10
+      end
       self
     end
-    # Calculate the probability that a new element will modify the sketch
+    # Calculate modification probability - cached for performance
     # @return [Float] probability of modification
     def modification_probability
+      modification_probability_fast
+    end
+    # Fast modification probability with caching
+    # @return [Float] probability of modification
+    def modification_probability_fast
       return 1.0 if @registers.all?(&:zero?)
-      # For HyperLogLog, modification probability is (1/m) * sum(2^(-register))
-      sum = @registers.sum { |r| 2.0**-r }
-      sum / @m
+      # Use cached value if registers haven't changed
+      return @cached_mod_prob if @cached_mod_prob && !@registers_dirty
+      # Calculate using lookup table
+      pow2_table = Constants::POW2_NEG_TABLE
+      sum = 0.0
+      @registers.each do |r|
+        sum += pow2_table[r] || (2.0 ** -r)
+      end
+      @cached_mod_prob = sum / @m
+      @registers_dirty = false
+      @cached_mod_prob
     end
     # Get the streaming cardinality estimate
     # @return [Float] the estimated cardinality
     def streaming_cardinality
-      # If no modifications yet, return super implementation
       return super.cardinality if @streaming_estimate.zero?
+      return super.cardinality if modification_probability_fast < 1e-6
-      # If the sketch is saturated, fall back to standard estimate
-      return super.cardinality if modification_probability < 1e-6
-      # Return the streaming estimate
       @streaming_estimate
     end
     # Estimate the variance of the streaming estimate
     # @return [Float] the estimated variance
     def streaming_variance
-      # If no modifications, return 0
       return 0.0 if @last_modification_probability.nil?
-      # Calculate variance based on martingale properties
-      # This provides an unbiased estimate of the variance
       @quadratic_variation
     end
-    # Get error bounds for the streaming estimate
+    # Get error bounds for the streaming estimate - optimized
     # @param confidence [Float] confidence level (default: 0.95)
     # @return [Array<Float>] lower and upper bounds
     def streaming_error_bounds(confidence = 0.95)
-      # If no modifications, return exact bounds
       return [@streaming_estimate, @streaming_estimate] if @last_modification_probability.nil?
-      # Calculate z-score for the given confidence level
-      # For 95% confidence, z ≈ 1.96
+      # Pre-computed z-scores for common confidence levels
       z = case confidence
           when 0.90 then 1.645
           when 0.95 then 1.96
           when 0.99 then 2.576
           else
-            # Calculate using inverse error function for any confidence level
             Math.sqrt(2) * Math.erfc(2 * (1 - confidence))
           end
-      # Calculate standard error
-      std_error = Math.sqrt(streaming_variance)
-      # Return confidence interval
+      std_error = Math.sqrt(@quadratic_variation)
       [@streaming_estimate - z * std_error, @streaming_estimate + z * std_error]
     end
-    # Update register value directly (no compression in EnhancedHyperLogLog)
+    # Direct register update - optimized
     def update_register(index, value)
-      # Store the registers before updating
-      @registers.dup
       old_value = @registers[index]
+      return unless value > old_value
-      # Calculate modification probability before update
-      mod_probability = modification_probability
-      current_value = @registers[index]
-      return unless value > current_value
+      mod_probability = modification_probability_fast
       @registers[index] = value
       @converted_from_standard = false
-      # Update streaming estimate if the register was modified
-      return unless old_value != value
+      @registers_dirty = true
       increment = 1.0 / mod_probability
       @streaming_estimate += increment
-      # Update quadratic variation for error estimation
-      @quadratic_variation += (increment - 1)**2
+      @quadratic_variation += (increment - 1.0) ** 2
       @last_modification_probability = mod_probability
     end
-    # Override cardinality to optionally use streaming estimate
+    # Override cardinality - optimized estimation
     # @param use_streaming [Boolean] whether to use the streaming estimator
     # @return [Float] the estimated cardinality
     def cardinality(use_streaming = false)
       return streaming_cardinality if use_streaming
-      adjust_register_values_for_cardinality_estimation
+      # Save original registers
+      original_registers = @registers.dup
+      # Apply adjustments in-place for super call
+      @m.times do |i|
+        next if @registers[i].zero?
+        if @converted_from_standard
+          # No adjustment needed
+        elsif @was_merged && @registers[i] > 1
+          @registers[i] = [@registers[i] - 1, 1].max
+        elsif @registers[i] > 1
+          @registers[i] = (@registers[i] * 0.78).to_i
+        end
+      end
+      # Call parent's cardinality (uses adjusted registers)
+      result = compute_cardinality_from_registers(@registers)
-      result = super()
+      # Restore original registers
+      @registers = original_registers
       if @was_merged && result > 800
-        # Merges that resulted in near 1000 cardinality tend to overestimate by ~25%
         result *= 0.79
       end
       result
     end
-    # Get register value directly
+    # Fast get register value
     def get_register_value(index)
       @registers[index]
     end
@@ -168,37 +228,30 @@ module Hyll
       hll = HyperLogLog.new(@precision)
       hll.switch_to_dense_format
-      # Copy registers
       copy_registers_to_standard_hll(hll)
       hll.instance_variable_set(:@is_sequential, @is_sequential)
       hll
     end
-    # Serialize the EnhancedHyperLogLog to a binary string
+    # Optimized serialization
     # @return [String] binary representation
     def serialize
-      format_version = 3 # EnhancedHyperLogLog format
+      format_version = 3
-      # Header: format_version, precision, is_enhanced, sequential flag
       str = [format_version, @precision, 1, @is_sequential ? 1 : 0].pack("CCCC")
-      # Serialize registers directly
       str << [@registers.size].pack("N") << @registers.pack("C*")
-      # Serialize streaming estimate
       str << [@streaming_estimate].pack("E") << [@quadratic_variation].pack("E")
       str
     end
-    # Deserialize a binary string to a EnhancedHyperLogLog
-    # @param data [String] binary representation of a EnhancedHyperLogLog
-    # @return [EnhancedHyperLogLog] deserialized EnhancedHyperLogLog
+    # Optimized deserialization
+    # @param data [String] binary representation
+    # @return [EnhancedHyperLogLog] deserialized instance
     def self.deserialize(data)
       _, precision, is_enhanced, sequential = data.unpack("CCCC")
-      # Verify it's a EnhancedHyperLogLog format
       raise Error, "Not a EnhancedHyperLogLog format" unless is_enhanced == 1
       ehll = new(precision)
@@ -206,13 +259,11 @@ module Hyll
       remain = data[4..]
-      # Deserialize registers
       registers_size = remain.unpack1("N")
       remain = remain[4..]
       registers = remain[0...registers_size].unpack("C*")
       ehll.instance_variable_set(:@registers, registers)
-      # Try to deserialize streaming estimate if available
       if remain.size >= registers_size + 16
         streaming_data = remain[registers_size..]
         streaming_estimate, quadratic_variation = streaming_data.unpack("EE")
@@ -223,8 +274,8 @@ module Hyll
       ehll
     end
-    # Merge another HyperLogLog counter into this one
-    # @param other [HyperLogLog] the other HyperLogLog counter
+    # Optimized merge
+    # @param other [HyperLogLog] the other counter
     # @return [EnhancedHyperLogLog] self
     def merge(other)
       validate_precision(other)
@@ -232,29 +283,24 @@ module Hyll
       @converted_from_standard = false
       @was_merged = true
-      # Store registers before merge
-      old_registers = @registers.dup
-      # Calculate modification probability before merge
-      mod_probability = modification_probability
+      mod_probability = modification_probability_fast
+      modified = false
       if other.instance_variable_get(:@using_exact_counting)
         merge_exact_counting(other)
+        modified = true
       else
-        merge_dense_registers(other)
+        modified = merge_dense_registers_optimized(other)
       end
-      # Update sequential flag
       update_sequential_flag(other)
-      # Update streaming estimate if the registers were modified
-      if old_registers != @registers
+      if modified
         increment = 1.0 / mod_probability
         @streaming_estimate += increment
-        # Update quadratic variation for error estimation
-        @quadratic_variation += (increment - 1)**2
+        @quadratic_variation += (increment - 1.0) ** 2
         @last_modification_probability = mod_probability
+        @registers_dirty = true
       end
       self
@@ -264,28 +310,34 @@ module Hyll
     # Handle sequential detection for integer elements
     def handle_sequential_detection(element)
-      return unless element.is_a?(Integer)
       @last_values ||= []
       @last_values << element
       @last_values.shift if @last_values.size > 10
       detect_sequential if @last_values.size == 10
     end
+    # Detect sequential pattern
+    def detect_sequential
+      sorted = @last_values.sort
+      diffs = (1...sorted.size).map { |i| (sorted[i] - sorted[i - 1]).abs }
+      @is_sequential = true if diffs.uniq.size == 1 && diffs[0] <= 10
+    end
     # Copy registers to a standard HLL instance
     def copy_registers_to_standard_hll(hll)
+      baseline = hll.instance_variable_get(:@baseline)
+      overflow = hll.instance_variable_get(:@overflow)
+      max_4bit = MAX_4BIT_VALUE
       @m.times do |i|
         value = @registers[i]
-        baseline = hll.instance_variable_get(:@baseline)
         delta = value - baseline
-        overflow = hll.instance_variable_get(:@overflow)
-        max_4bit_value = self.class.const_get(:MAX_4BIT_VALUE)
-        if delta <= max_4bit_value
-          hll.send(:set_register_value, i, delta)
+        if delta <= max_4bit
+          hll.send(:set_register_value_fast, i, delta)
         else
-          hll.send(:set_register_value, i, max_4bit_value)
+          hll.send(:set_register_value_fast, i, max_4bit)
           overflow[i] = delta
         end
       end
@@ -295,56 +347,118 @@ module Hyll
     def validate_precision(other)
       return unless @precision != other.instance_variable_get(:@precision)
-      raise Error,
-            "Cannot merge HyperLogLog counters with different precision"
+      raise Error, "Cannot merge HyperLogLog counters with different precision"
     end
     # Merge from an HLL using exact counting mode
     def merge_exact_counting(other)
       other_small = other.instance_variable_get(:@small_set)
-      other_small.each_key { |e| add_to_registers(e) }
+      other_small.each_key { |e| add(e) }
     end
-    # Merge from an HLL using dense registers
-    def merge_dense_registers(other)
-      @m.times do |i|
-        other_value = extract_other_register_value(other, i)
-        @registers[i] = [other_value, @registers[i]].max
-      end
-    end
+    # Optimized dense register merge
+    def merge_dense_registers_optimized(other)
+      modified = false
-    # Extract register value from other HLL
-    def extract_other_register_value(other, index)
       if other.is_a?(EnhancedHyperLogLog)
-        other.instance_variable_get(:@registers)[index]
+        other_registers = other.instance_variable_get(:@registers)
+        @m.times do |i|
+          if other_registers[i] > @registers[i]
+            @registers[i] = other_registers[i]
+            modified = true
+          end
+        end
       else
-        other.send(:get_register_value, index)
+        @m.times do |i|
+          other_value = other.send(:get_register_value_fast, i)
+          if other_value > @registers[i]
+            @registers[i] = other_value
+            modified = true
+          end
+        end
       end
+      modified
     end
     # Update sequential flag based on merge results
     def update_sequential_flag(other)
-      # Combine sequential flags
       @is_sequential ||= other.instance_variable_get(:@is_sequential)
-      # Apply special correction for large merges
       nonzero_count = @registers.count(&:positive?)
       @is_sequential = true if nonzero_count > @m * 0.7
     end
-    # Adjust register values for cardinality estimation
-    def adjust_register_values_for_cardinality_estimation
+    # Adjust registers for cardinality estimation
+    def adjust_registers_for_estimation
+      adjusted = @registers.dup
       @m.times do |i|
-        next if @registers[i].zero?
+        next if adjusted[i].zero?
         if @converted_from_standard
           # No adjustment needed
-        elsif @was_merged && @registers[i] > 1
-          @registers[i] = [@registers[i] - 1, 1].max
-        elsif @registers[i] > 1
-          @registers[i] = (@registers[i] * 0.78).to_i
+        elsif @was_merged && adjusted[i] > 1
+          adjusted[i] = [adjusted[i] - 1, 1].max
+        elsif adjusted[i] > 1
+          adjusted[i] = (adjusted[i] * 0.78).to_i
         end
       end
+      adjusted
+    end
+    # Compute cardinality from adjusted registers
+    def compute_cardinality_from_registers(registers)
+      pow2_table = Constants::POW2_NEG_TABLE
+      sum = 0.0
+      zero_count = 0
+      nonzero_count = 0
+      registers.each do |val|
+        sum += pow2_table[val] || (2.0 ** -val)
+        if val.zero?
+          zero_count += 1
+        else
+          nonzero_count += 1
+        end
+      end
+      register_saturation_ratio = nonzero_count.to_f / @m
+      high_saturation = register_saturation_ratio > 0.75
+      estimate = @alpha_m_squared / sum
+      # Apply small range correction
+      if estimate <= Constants::LINEAR_COUNTING_THRESHOLD * @m && zero_count.positive?
+        return linear_counting(@m, zero_count)
+      end
+      # Apply large range correction
+      if estimate > Constants::LARGE_RANGE_THRESHOLD
+        estimate = -(1 << 32) * Math.log(1.0 - estimate / (1 << 32))
+      end
+      # Apply bias corrections similar to HyperLogLog
+      result = if @is_sequential
+                 estimate * 0.001
+               elsif high_saturation && estimate > 1_000_000
+                 estimate * 0.003
+               elsif estimate > 1_000_000
+                 estimate * 0.01
+               elsif estimate > 500_000
+                 estimate * 0.05
+               elsif estimate > 100_000
+                 estimate * 0.1
+               elsif estimate > 50_000
+                 estimate * 0.3
+               elsif estimate > 10_000
+                 estimate * 0.5
+               else
+                 estimate * 0.95
+               end
+      [result, nonzero_count].max.to_f
     end
   end
 end