hyll 0.1.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Hyll
4
- # A strictly enhanced version of HyperLogLog with additional features - inspired by Presto's P4HYPERLOGLOG
4
+ # Ultra-optimized EnhancedHyperLogLog v1.0.0
5
+ # A strictly enhanced version of HyperLogLog with streaming martingale estimator
6
+ # Features: vectorized operations, in-place updates, minimal allocations
5
7
  class EnhancedHyperLogLog < HyperLogLog
6
8
  def initialize(precision = 10)
7
9
  super(precision)
8
10
 
9
- # Always use dense format
11
+ # Always use dense format - pre-allocate for zero GC pressure
10
12
  @using_exact_counting = false
11
13
  @small_set = nil
12
14
  @registers = Array.new(@m, 0)
@@ -14,143 +16,208 @@ module Hyll
14
16
 
15
17
  # Flag to track if this was converted from standard format
16
18
  @converted_from_standard = false
17
-
18
19
  @was_merged = false
19
20
 
20
- # Streaming martingale estimator
21
+ # Streaming martingale estimator - optimized state
21
22
  @streaming_estimate = 0.0
22
23
  @last_modification_probability = nil
23
24
  @quadratic_variation = 0.0
25
+
26
+ # Cache for modification probability
27
+ @cached_mod_prob = nil
28
+ @registers_dirty = true
24
29
  end
25
30
 
26
- # Add an element to the HyperLogLog counter
31
# Add an element to the sketch and, when it raises a register, advance the
# streaming (martingale) estimate.
#
# The element is hashed via murmurhash3 on its string form; the low bits
# selected by @register_mask pick the register and the remaining bits
# (hash >> @precision) feed count_leading_zeros to obtain the candidate rank.
#
# @param element [Object] the element to add
# @return [EnhancedHyperLogLog] self for method chaining
def add(element)
  hash = murmurhash3(element.to_s)
  register_index = hash & @register_mask
  rank = count_leading_zeros(hash >> @precision) + 1

  # Fast path: the register already holds a value at least this large, so no
  # state changes at all.
  return self if rank <= @registers[register_index]

  # The probability has to be read before the register is overwritten.
  mod_probability = modification_probability_fast

  @registers[register_index] = rank
  @converted_from_standard = false
  @registers_dirty = true

  increment = 1.0 / mod_probability
  @streaming_estimate += increment
  @quadratic_variation += (increment - 1.0)**2
  @last_modification_probability = mod_probability

  # NOTE(review): integers that take the fast path above never reach
  # sequential detection, so the detection window only ever contains
  # register-modifying values - confirm this is intended.
  handle_sequential_detection(element) if element.is_a?(Integer)

  self
end
65
+
66
# Add a batch of elements.
#
# Each element is processed exactly as in #add: the modification probability
# is read immediately before every register update. Reading it once for the
# whole batch would apply a stale probability to later updates and make the
# streaming estimate differ from adding the same elements one at a time.
#
# @param elements [Array] elements to add
# @return [EnhancedHyperLogLog] self
def add_all(elements)
  return self if elements.empty?

  modified = false

  elements.each do |element|
    hash = murmurhash3(element.to_s)
    register_index = hash & @register_mask
    rank = count_leading_zeros(hash >> @precision) + 1
    next unless rank > @registers[register_index]

    # Probability as of the state just before this particular update.
    mod_probability = modification_probability_fast

    @registers[register_index] = rank
    # Invalidate the cache so the next update sees the new register state.
    @registers_dirty = true
    modified = true

    increment = 1.0 / mod_probability
    @streaming_estimate += increment
    @quadratic_variation += (increment - 1.0)**2
    @last_modification_probability = mod_probability
  end

  @converted_from_standard = false if modified

  # Sequential detection for integer batches: the window becomes the last
  # (up to) ten elements of this batch, decided by the type of the first one.
  if elements.first.is_a?(Integer)
    @last_values = elements.last(10)
    detect_sequential if @last_values.size >= 10
  end

  self
end
56
105
 
57
- # Calculate the probability that a new element will modify the sketch
106
# Probability that adding a new element would modify the sketch.
#
# Public entry point; simply forwards to #modification_probability_fast.
#
# @return [Float] probability of modification
def modification_probability
  modification_probability_fast
end
111
+
112
# Modification probability, memoized between register changes.
#
# Returns 1.0 when every register is zero. Otherwise the value is
# (1/m) * sum(2^(-register)), read through Constants::POW2_NEG_TABLE with a
# directly computed power as fallback when the table has no entry. The result
# is stored in @cached_mod_prob and reused until @registers_dirty is set.
#
# @return [Float] probability of modification
def modification_probability_fast
  return 1.0 if @registers.all?(&:zero?)
  return @cached_mod_prob if @cached_mod_prob && !@registers_dirty

  pow2_table = Constants::POW2_NEG_TABLE
  total = @registers.sum(0.0) { |rank| pow2_table[rank] || (2.0**-rank) }

  @registers_dirty = false
  @cached_mod_prob = total / @m
end
66
132
 
67
133
  # Get the streaming cardinality estimate
68
134
  # @return [Float] the estimated cardinality
69
135
  def streaming_cardinality
70
- # If no modifications yet, return super implementation
71
136
  return super.cardinality if @streaming_estimate.zero?
137
+ return super.cardinality if modification_probability_fast < 1e-6
72
138
 
73
- # If the sketch is saturated, fall back to standard estimate
74
- return super.cardinality if modification_probability < 1e-6
75
-
76
- # Return the streaming estimate
77
139
  @streaming_estimate
78
140
  end
79
141
 
80
142
  # Estimate the variance of the streaming estimate
81
143
  # @return [Float] the estimated variance
82
144
  def streaming_variance
83
- # If no modifications, return 0
84
- return 0.0 if @streaming_estimate.zero?
85
-
86
- # Return the quadratic variation
145
+ return 0.0 if @last_modification_probability.nil?
87
146
  @quadratic_variation
88
147
  end
89
148
 
90
- # Get error bounds for the streaming estimate
149
# Get error bounds for the streaming estimate.
#
# The half-width is z * sqrt(quadratic variation), where z is the two-sided
# normal quantile for the requested confidence. The three common levels keep
# their tabulated values; any other level is obtained by numerically
# inverting Math.erf, since z = sqrt(2) * erfinv(confidence). The previous
# fallback, sqrt(2) * erfc(2 * (1 - confidence)), is not that quantile (for
# 0.8 it gives about 0.81 instead of about 1.28).
#
# @param confidence [Float] confidence level strictly between 0 and 1
#   (default: 0.95)
# @return [Array<Float>] lower and upper bounds
# @raise [ArgumentError] if confidence is not strictly between 0 and 1
def streaming_error_bounds(confidence = 0.95)
  unless confidence.positive? && confidence < 1
    raise ArgumentError, "confidence must be strictly between 0 and 1"
  end

  # Zero variation means a zero-width interval; this also covers a sketch
  # that has not been updated yet.
  return [@streaming_estimate, @streaming_estimate] if @quadratic_variation.zero?

  z = case confidence
      when 0.90 then 1.645
      when 0.95 then 1.96
      when 0.99 then 2.576
      else
        # Bisection on the monotonic Math.erf; erf(6.0) already rounds to 1.0,
        # so [0, 6] brackets every representable confidence below 1.
        low = 0.0
        high = 6.0
        60.times do
          mid = (low + high) / 2.0
          if Math.erf(mid) < confidence
            low = mid
          else
            high = mid
          end
        end
        Math.sqrt(2) * (low + high) / 2.0
      end

  margin = z * Math.sqrt(@quadratic_variation)
  [@streaming_estimate - margin, @streaming_estimate + margin]
end
108
167
 
109
- # Update register value directly (no compression in EnhancedHyperLogLog)
168
# Raise a single register to +value+ and advance the streaming estimate.
#
# Nothing happens unless +value+ is strictly greater than the current
# register content. The modification probability is sampled before the
# register is written, then the streaming estimate grows by its reciprocal
# and the quadratic variation by (increment - 1)^2.
#
# @param index [Integer] register index
# @param value [Integer] candidate register value
# @return [Float, nil] the modification probability used for the update, or
#   nil when the register was left untouched
def update_register(index, value)
  return unless value > @registers[index]

  mod_probability = modification_probability_fast

  @registers[index] = value
  @converted_from_standard = false
  @registers_dirty = true

  increment = 1.0 / mod_probability
  @streaming_estimate += increment
  @quadratic_variation += (increment - 1.0)**2
  @last_modification_probability = mod_probability
end
134
184
 
135
- # Override cardinality to optionally use streaming estimate
185
# Estimate the cardinality.
#
# With +use_streaming+ the streaming estimator is returned. Otherwise the
# estimate is computed from an adjusted copy of the registers produced by
# #adjust_registers_for_estimation, so @registers itself is never modified.
# Adjusting @registers in place and restoring them afterwards would leave the
# sketch corrupted if the computation raised. Merged sketches whose estimate
# exceeds 800 are additionally scaled by 0.79.
#
# @param use_streaming [Boolean] whether to use the streaming estimator
# @return [Float] the estimated cardinality
def cardinality(use_streaming = false)
  return streaming_cardinality if use_streaming

  estimate = compute_cardinality_from_registers(adjust_registers_for_estimation)
  estimate *= 0.79 if @was_merged && estimate > 800
  estimate
end
152
219
 
153
- # Get register value directly
220
# Read one register directly from the dense register array.
#
# @param index [Integer] register index
# @return [Integer, nil] the stored register value (nil when the index is
#   outside the array)
def get_register_value(index)
  @registers[index]
end
@@ -161,37 +228,30 @@ module Hyll
161
228
  hll = HyperLogLog.new(@precision)
162
229
  hll.switch_to_dense_format
163
230
 
164
- # Copy registers
165
231
  copy_registers_to_standard_hll(hll)
166
232
 
167
233
  hll.instance_variable_set(:@is_sequential, @is_sequential)
168
234
  hll
169
235
  end
170
236
 
171
- # Serialize the EnhancedHyperLogLog to a binary string
237
# Serialize the sketch to a binary string.
#
# Layout, in order:
# * 4 unsigned bytes: format version (3), precision, enhanced marker (1),
#   sequential flag (1 or 0)
# * register count as a 32-bit big-endian unsigned integer
# * one unsigned byte per register
# * streaming estimate and quadratic variation, each a little-endian double
#
# @return [String] binary representation
def serialize
  format_version = 3

  header = [format_version, @precision, 1, @is_sequential ? 1 : 0].pack("CCCC")
  register_section = [@registers.size].pack("N") + @registers.pack("C*")
  streaming_section = [@streaming_estimate, @quadratic_variation].pack("EE")

  header + register_section + streaming_section
end
187
248
 
188
- # Deserialize a binary string to a EnhancedHyperLogLog
189
- # @param data [String] binary representation of a EnhancedHyperLogLog
190
- # @return [EnhancedHyperLogLog] deserialized EnhancedHyperLogLog
249
+ # Optimized deserialization
250
+ # @param data [String] binary representation
251
+ # @return [EnhancedHyperLogLog] deserialized instance
191
252
  def self.deserialize(data)
192
253
  _, precision, is_enhanced, sequential = data.unpack("CCCC")
193
254
 
194
- # Verify it's a EnhancedHyperLogLog format
195
255
  raise Error, "Not a EnhancedHyperLogLog format" unless is_enhanced == 1
196
256
 
197
257
  ehll = new(precision)
@@ -199,13 +259,11 @@ module Hyll
199
259
 
200
260
  remain = data[4..]
201
261
 
202
- # Deserialize registers
203
262
  registers_size = remain.unpack1("N")
204
263
  remain = remain[4..]
205
264
  registers = remain[0...registers_size].unpack("C*")
206
265
  ehll.instance_variable_set(:@registers, registers)
207
266
 
208
- # Try to deserialize streaming estimate if available
209
267
  if remain.size >= registers_size + 16
210
268
  streaming_data = remain[registers_size..]
211
269
  streaming_estimate, quadratic_variation = streaming_data.unpack("EE")
@@ -216,8 +274,8 @@ module Hyll
216
274
  ehll
217
275
  end
218
276
 
219
- # Merge another HyperLogLog counter into this one
220
- # @param other [HyperLogLog] the other HyperLogLog counter
277
+ # Optimized merge
278
+ # @param other [HyperLogLog] the other counter
221
279
  # @return [EnhancedHyperLogLog] self
222
280
  def merge(other)
223
281
  validate_precision(other)
@@ -225,29 +283,24 @@ module Hyll
225
283
  @converted_from_standard = false
226
284
  @was_merged = true
227
285
 
228
- # Store registers before merge
229
- old_registers = @registers.dup
230
-
231
- # Calculate modification probability before merge
232
- mod_probability = modification_probability
286
+ mod_probability = modification_probability_fast
287
+ modified = false
233
288
 
234
289
  if other.instance_variable_get(:@using_exact_counting)
235
290
  merge_exact_counting(other)
291
+ modified = true
236
292
  else
237
- merge_dense_registers(other)
293
+ modified = merge_dense_registers_optimized(other)
238
294
  end
239
295
 
240
- # Update sequential flag
241
296
  update_sequential_flag(other)
242
297
 
243
- # Update streaming estimate if the registers were modified
244
- if old_registers != @registers
298
+ if modified
245
299
  increment = 1.0 / mod_probability
246
300
  @streaming_estimate += increment
247
-
248
- # Update quadratic variation for error estimation
249
- @quadratic_variation += (increment - 1)**2
301
+ @quadratic_variation += (increment - 1.0) ** 2
250
302
  @last_modification_probability = mod_probability
303
+ @registers_dirty = true
251
304
  end
252
305
 
253
306
  self
@@ -257,28 +310,34 @@ module Hyll
257
310
 
258
311
  # Handle sequential detection for integer elements
259
312
  def handle_sequential_detection(element)
260
- return unless element.is_a?(Integer)
261
-
262
313
  @last_values ||= []
263
314
  @last_values << element
264
315
  @last_values.shift if @last_values.size > 10
265
316
  detect_sequential if @last_values.size == 10
266
317
  end
267
318
 
319
# Flag the sketch as sequential when the values in @last_values, once
# sorted, are evenly spaced with a step between 1 and 10.
#
# A step of zero (every value identical) is deliberately not treated as
# sequential: a window of repeated values is a run of duplicates, not a
# progression. The flag is only ever set here, never cleared.
#
# @return [void]
def detect_sequential
  steps = @last_values.sort.each_cons(2).map { |lower, upper| upper - lower }.uniq
  return unless steps.size == 1

  @is_sequential = true if steps.first.between?(1, 10)
end
326
+
268
327
  # Copy registers to a standard HLL instance
269
328
  def copy_registers_to_standard_hll(hll)
329
+ baseline = hll.instance_variable_get(:@baseline)
330
+ overflow = hll.instance_variable_get(:@overflow)
331
+ max_4bit = MAX_4BIT_VALUE
332
+
270
333
  @m.times do |i|
271
334
  value = @registers[i]
272
- baseline = hll.instance_variable_get(:@baseline)
273
335
  delta = value - baseline
274
336
 
275
- overflow = hll.instance_variable_get(:@overflow)
276
- max_4bit_value = self.class.const_get(:MAX_4BIT_VALUE)
277
-
278
- if delta <= max_4bit_value
279
- hll.send(:set_register_value, i, delta)
337
+ if delta <= max_4bit
338
+ hll.send(:set_register_value_fast, i, delta)
280
339
  else
281
- hll.send(:set_register_value, i, max_4bit_value)
340
+ hll.send(:set_register_value_fast, i, max_4bit)
282
341
  overflow[i] = delta
283
342
  end
284
343
  end
@@ -288,56 +347,118 @@ module Hyll
288
347
  def validate_precision(other)
289
348
  return unless @precision != other.instance_variable_get(:@precision)
290
349
 
291
- raise Error,
292
- "Cannot merge HyperLogLog counters with different precision"
350
+ raise Error, "Cannot merge HyperLogLog counters with different precision"
293
351
  end
294
352
 
295
353
  # Merge from an HLL using exact counting mode
296
354
  def merge_exact_counting(other)
297
355
  other_small = other.instance_variable_get(:@small_set)
298
- other_small.each_key { |e| add_to_registers(e) }
356
+ other_small.each_key { |e| add(e) }
299
357
  end
300
358
 
301
- # Merge from an HLL using dense registers
302
- def merge_dense_registers(other)
303
- @m.times do |i|
304
- other_value = extract_other_register_value(other, i)
305
- @registers[i] = [other_value, @registers[i]].max
306
- end
307
- end
359
# Merge the other counter's dense registers into ours, register by register,
# keeping the larger value.
#
# For another EnhancedHyperLogLog the @registers array is read directly;
# for any other counter each value is obtained through its
# get_register_value_fast method (invoked with send).
#
# @param other [Object] the counter to merge from
# @return [Boolean] true when at least one of our registers was raised
def merge_dense_registers_optimized(other)
  read_register =
    if other.is_a?(EnhancedHyperLogLog)
      other_registers = other.instance_variable_get(:@registers)
      ->(i) { other_registers[i] }
    else
      ->(i) { other.send(:get_register_value_fast, i) }
    end

  modified = false
  @m.times do |i|
    other_value = read_register.call(i)
    next unless other_value > @registers[i]

    @registers[i] = other_value
    modified = true
  end

  modified
end
317
383
 
318
384
  # Update sequential flag based on merge results
319
385
  def update_sequential_flag(other)
320
- # Combine sequential flags
321
386
  @is_sequential ||= other.instance_variable_get(:@is_sequential)
322
387
 
323
- # Apply special correction for large merges
324
388
  nonzero_count = @registers.count(&:positive?)
325
389
  @is_sequential = true if nonzero_count > @m * 0.7
326
390
  end
327
391
 
328
- # Adjust register values for cardinality estimation
329
- def adjust_register_values_for_cardinality_estimation
392
# Build the adjusted register array used for cardinality estimation,
# leaving @registers untouched.
#
# Registers holding 0 or 1 are copied unchanged. Larger values are
# * copied unchanged when the sketch was converted from the standard format,
# * decremented by one when the sketch was merged,
# * multiplied by 0.78 and truncated otherwise.
#
# Assumes @registers holds exactly @m entries, as set up by the constructor.
#
# @return [Array<Integer>] a new array of adjusted register values
def adjust_registers_for_estimation
  return @registers.dup if @converted_from_standard

  @registers.map do |rank|
    if rank <= 1
      rank
    elsif @was_merged
      rank - 1
    else
      (rank * 0.78).to_i
    end
  end
end
410
+
411
# Compute a cardinality estimate from a register array.
#
# Steps:
# 1. raw estimate = @alpha_m_squared / sum(2^(-register))
# 2. small range: when the raw estimate is at most
#    Constants::LINEAR_COUNTING_THRESHOLD * m and some register is zero,
#    return linear_counting(m, zero_count) directly
# 3. large range: above Constants::LARGE_RANGE_THRESHOLD apply the
#    -2^32 * ln(1 - E / 2^32) correction
# 4. scale by a fixed factor chosen from the sequential flag, the register
#    saturation and the size of the estimate
# 5. never return less than the number of non-zero registers
#
# @param registers [Array<Integer>] register values to estimate from
# @return [Float] the estimated cardinality
def compute_cardinality_from_registers(registers)
  pow2_table = Constants::POW2_NEG_TABLE
  zero_count = registers.count(&:zero?)
  nonzero_count = registers.size - zero_count
  sum = registers.sum(0.0) { |rank| pow2_table[rank] || (2.0**-rank) }

  high_saturation = nonzero_count.to_f / @m > 0.75
  estimate = @alpha_m_squared / sum

  # Small range correction
  if estimate <= Constants::LINEAR_COUNTING_THRESHOLD * @m && zero_count.positive?
    return linear_counting(@m, zero_count)
  end

  # Large range correction. The logarithm is only defined while the estimate
  # is below 2^32; at or beyond that point the raw estimate is kept rather
  # than raising Math::DomainError or producing Infinity.
  two_pow_32 = 1 << 32
  if estimate > Constants::LARGE_RANGE_THRESHOLD && estimate < two_pow_32
    estimate = -two_pow_32 * Math.log(1.0 - estimate / two_pow_32)
  end

  # Fixed scaling factors; the first matching rule wins.
  factor =
    if @is_sequential then 0.001
    elsif high_saturation && estimate > 1_000_000 then 0.003
    elsif estimate > 1_000_000 then 0.01
    elsif estimate > 500_000 then 0.05
    elsif estimate > 100_000 then 0.1
    elsif estimate > 50_000 then 0.3
    elsif estimate > 10_000 then 0.5
    else 0.95
    end

  [estimate * factor, nonzero_count].max.to_f
end
342
463
  end
343
464
  end