hyll 0.1.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +102 -0
- data/README.md +132 -18
- data/examples/redis_comparison_benchmark.rb +539 -0
- data/examples/v1_benchmark.rb +93 -0
- data/lib/hyll/algorithms/enhanced_hyperloglog.rb +240 -119
- data/lib/hyll/algorithms/hyperloglog.rb +263 -327
- data/lib/hyll/constants.rb +75 -0
- data/lib/hyll/utils/hash.rb +132 -21
- data/lib/hyll/utils/math.rb +136 -66
- data/lib/hyll/version.rb +1 -1
- metadata +4 -2
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Hyll
|
|
4
|
-
#
|
|
4
|
+
# Ultra-optimized EnhancedHyperLogLog v1.0.0
|
|
5
|
+
# A strictly enhanced version of HyperLogLog with streaming martingale estimator
|
|
6
|
+
# Features: vectorized operations, in-place updates, minimal allocations
|
|
5
7
|
class EnhancedHyperLogLog < HyperLogLog
|
|
6
8
|
def initialize(precision = 10)
|
|
7
9
|
super(precision)
|
|
8
10
|
|
|
9
|
-
# Always use dense format
|
|
11
|
+
# Always use dense format - pre-allocate for zero GC pressure
|
|
10
12
|
@using_exact_counting = false
|
|
11
13
|
@small_set = nil
|
|
12
14
|
@registers = Array.new(@m, 0)
|
|
@@ -14,143 +16,208 @@ module Hyll
|
|
|
14
16
|
|
|
15
17
|
# Flag to track if this was converted from standard format
|
|
16
18
|
@converted_from_standard = false
|
|
17
|
-
|
|
18
19
|
@was_merged = false
|
|
19
20
|
|
|
20
|
-
# Streaming martingale estimator
|
|
21
|
+
# Streaming martingale estimator - optimized state
|
|
21
22
|
@streaming_estimate = 0.0
|
|
22
23
|
@last_modification_probability = nil
|
|
23
24
|
@quadratic_variation = 0.0
|
|
25
|
+
|
|
26
|
+
# Cache for modification probability
|
|
27
|
+
@cached_mod_prob = nil
|
|
28
|
+
@registers_dirty = true
|
|
24
29
|
end
|
|
25
30
|
|
|
26
|
-
# Add an element
|
|
31
|
+
# Add an element - ultra-optimized path
|
|
27
32
|
# @param element [Object] the element to add
|
|
28
33
|
# @return [EnhancedHyperLogLog] self for method chaining
|
|
29
34
|
def add(element)
|
|
30
|
-
#
|
|
31
|
-
|
|
35
|
+
# Hash and extract in one pass
|
|
36
|
+
hash = murmurhash3(element.to_s)
|
|
37
|
+
register_index = hash & @register_mask
|
|
38
|
+
value = hash >> @precision
|
|
39
|
+
leading_zeros = count_leading_zeros(value) + 1
|
|
40
|
+
|
|
41
|
+
old_value = @registers[register_index]
|
|
32
42
|
|
|
33
|
-
#
|
|
34
|
-
|
|
43
|
+
# Fast path: no update needed
|
|
44
|
+
return self if leading_zeros <= old_value
|
|
35
45
|
|
|
36
|
-
#
|
|
37
|
-
|
|
46
|
+
# Calculate modification probability before update
|
|
47
|
+
mod_probability = modification_probability_fast
|
|
38
48
|
|
|
49
|
+
# Update register
|
|
50
|
+
@registers[register_index] = leading_zeros
|
|
39
51
|
@converted_from_standard = false
|
|
52
|
+
@registers_dirty = true
|
|
40
53
|
|
|
41
|
-
#
|
|
42
|
-
|
|
54
|
+
# Update streaming estimate
|
|
55
|
+
increment = 1.0 / mod_probability
|
|
56
|
+
@streaming_estimate += increment
|
|
57
|
+
@quadratic_variation += (increment - 1.0) ** 2
|
|
58
|
+
@last_modification_probability = mod_probability
|
|
43
59
|
|
|
44
|
-
#
|
|
45
|
-
if
|
|
46
|
-
|
|
47
|
-
|
|
60
|
+
# Sequential detection for integers (deferred)
|
|
61
|
+
handle_sequential_detection(element) if element.is_a?(Integer)
|
|
62
|
+
|
|
63
|
+
self
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# Batch add - optimized for throughput
|
|
67
|
+
# @param elements [Array] elements to add
|
|
68
|
+
# @return [EnhancedHyperLogLog] self
|
|
69
|
+
def add_all(elements)
|
|
70
|
+
return self if elements.empty?
|
|
71
|
+
|
|
72
|
+
mod_probability = modification_probability_fast
|
|
73
|
+
modified = false
|
|
74
|
+
|
|
75
|
+
elements.each do |element|
|
|
76
|
+
hash = murmurhash3(element.to_s)
|
|
77
|
+
register_index = hash & @register_mask
|
|
78
|
+
value = hash >> @precision
|
|
79
|
+
leading_zeros = count_leading_zeros(value) + 1
|
|
80
|
+
|
|
81
|
+
if leading_zeros > @registers[register_index]
|
|
82
|
+
@registers[register_index] = leading_zeros
|
|
83
|
+
modified = true
|
|
48
84
|
|
|
49
|
-
|
|
50
|
-
|
|
85
|
+
increment = 1.0 / mod_probability
|
|
86
|
+
@streaming_estimate += increment
|
|
87
|
+
@quadratic_variation += (increment - 1.0) ** 2
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
if modified
|
|
92
|
+
@converted_from_standard = false
|
|
93
|
+
@registers_dirty = true
|
|
51
94
|
@last_modification_probability = mod_probability
|
|
52
95
|
end
|
|
53
96
|
|
|
97
|
+
# Sequential detection for integer batches
|
|
98
|
+
if elements.first.is_a?(Integer)
|
|
99
|
+
@last_values = elements.last(10)
|
|
100
|
+
detect_sequential if @last_values.size >= 10
|
|
101
|
+
end
|
|
102
|
+
|
|
54
103
|
self
|
|
55
104
|
end
|
|
56
105
|
|
|
57
|
-
# Calculate
|
|
106
|
+
# Calculate modification probability - cached for performance
|
|
58
107
|
# @return [Float] probability of modification
|
|
59
108
|
def modification_probability
|
|
109
|
+
modification_probability_fast
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
# Fast modification probability with caching
|
|
113
|
+
# @return [Float] probability of modification
|
|
114
|
+
def modification_probability_fast
|
|
60
115
|
return 1.0 if @registers.all?(&:zero?)
|
|
61
116
|
|
|
62
|
-
#
|
|
63
|
-
|
|
64
|
-
|
|
117
|
+
# Use cached value if registers haven't changed
|
|
118
|
+
return @cached_mod_prob if @cached_mod_prob && !@registers_dirty
|
|
119
|
+
|
|
120
|
+
# Calculate using lookup table
|
|
121
|
+
pow2_table = Constants::POW2_NEG_TABLE
|
|
122
|
+
sum = 0.0
|
|
123
|
+
|
|
124
|
+
@registers.each do |r|
|
|
125
|
+
sum += pow2_table[r] || (2.0 ** -r)
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
@cached_mod_prob = sum / @m
|
|
129
|
+
@registers_dirty = false
|
|
130
|
+
@cached_mod_prob
|
|
65
131
|
end
|
|
66
132
|
|
|
67
133
|
# Get the streaming cardinality estimate
|
|
68
134
|
# @return [Float] the estimated cardinality
|
|
69
135
|
def streaming_cardinality
|
|
70
|
-
# If no modifications yet, return super implementation
|
|
71
136
|
return super.cardinality if @streaming_estimate.zero?
|
|
137
|
+
return super.cardinality if modification_probability_fast < 1e-6
|
|
72
138
|
|
|
73
|
-
# If the sketch is saturated, fall back to standard estimate
|
|
74
|
-
return super.cardinality if modification_probability < 1e-6
|
|
75
|
-
|
|
76
|
-
# Return the streaming estimate
|
|
77
139
|
@streaming_estimate
|
|
78
140
|
end
|
|
79
141
|
|
|
80
142
|
# Estimate the variance of the streaming estimate
|
|
81
143
|
# @return [Float] the estimated variance
|
|
82
144
|
def streaming_variance
|
|
83
|
-
|
|
84
|
-
return 0.0 if @streaming_estimate.zero?
|
|
85
|
-
|
|
86
|
-
# Return the quadratic variation
|
|
145
|
+
return 0.0 if @last_modification_probability.nil?
|
|
87
146
|
@quadratic_variation
|
|
88
147
|
end
|
|
89
148
|
|
|
90
|
-
# Get error bounds for the streaming estimate
|
|
149
|
+
# Get error bounds for the streaming estimate - optimized
|
|
91
150
|
# @param confidence [Float] confidence level (default: 0.95)
|
|
92
151
|
# @return [Array<Float>] lower and upper bounds
|
|
93
152
|
def streaming_error_bounds(confidence = 0.95)
|
|
94
|
-
return [
|
|
153
|
+
return [@streaming_estimate, @streaming_estimate] if @last_modification_probability.nil?
|
|
95
154
|
|
|
96
|
-
#
|
|
155
|
+
# Pre-computed z-scores for common confidence levels
|
|
97
156
|
z = case confidence
|
|
98
|
-
when 0.
|
|
157
|
+
when 0.90 then 1.645
|
|
99
158
|
when 0.95 then 1.96
|
|
100
159
|
when 0.99 then 2.576
|
|
101
|
-
else
|
|
160
|
+
else
|
|
161
|
+
Math.sqrt(2) * Math.erfc(2 * (1 - confidence))
|
|
102
162
|
end
|
|
103
163
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
[@streaming_estimate - z * std_dev, @streaming_estimate + z * std_dev]
|
|
164
|
+
std_error = Math.sqrt(@quadratic_variation)
|
|
165
|
+
[@streaming_estimate - z * std_error, @streaming_estimate + z * std_error]
|
|
107
166
|
end
|
|
108
167
|
|
|
109
|
-
#
|
|
168
|
+
# Direct register update - optimized
|
|
110
169
|
def update_register(index, value)
|
|
111
|
-
# Store the registers before updating
|
|
112
|
-
@registers.dup
|
|
113
170
|
old_value = @registers[index]
|
|
171
|
+
return unless value > old_value
|
|
114
172
|
|
|
115
|
-
|
|
116
|
-
mod_probability = modification_probability
|
|
117
|
-
|
|
118
|
-
current_value = @registers[index]
|
|
119
|
-
return unless value > current_value
|
|
173
|
+
mod_probability = modification_probability_fast
|
|
120
174
|
|
|
121
175
|
@registers[index] = value
|
|
122
176
|
@converted_from_standard = false
|
|
123
|
-
|
|
124
|
-
# Update streaming estimate if the register was modified
|
|
125
|
-
return unless old_value != value
|
|
177
|
+
@registers_dirty = true
|
|
126
178
|
|
|
127
179
|
increment = 1.0 / mod_probability
|
|
128
180
|
@streaming_estimate += increment
|
|
129
|
-
|
|
130
|
-
# Update quadratic variation for error estimation
|
|
131
|
-
@quadratic_variation += (increment - 1)**2
|
|
181
|
+
@quadratic_variation += (increment - 1.0) ** 2
|
|
132
182
|
@last_modification_probability = mod_probability
|
|
133
183
|
end
|
|
134
184
|
|
|
135
|
-
# Override cardinality
|
|
185
|
+
# Override cardinality - optimized estimation
|
|
136
186
|
# @param use_streaming [Boolean] whether to use the streaming estimator
|
|
137
187
|
# @return [Float] the estimated cardinality
|
|
138
188
|
def cardinality(use_streaming = false)
|
|
139
189
|
return streaming_cardinality if use_streaming
|
|
140
190
|
|
|
141
|
-
|
|
191
|
+
# Save original registers
|
|
192
|
+
original_registers = @registers.dup
|
|
193
|
+
|
|
194
|
+
# Apply adjustments in-place for super call
|
|
195
|
+
@m.times do |i|
|
|
196
|
+
next if @registers[i].zero?
|
|
197
|
+
|
|
198
|
+
if @converted_from_standard
|
|
199
|
+
# No adjustment needed
|
|
200
|
+
elsif @was_merged && @registers[i] > 1
|
|
201
|
+
@registers[i] = [@registers[i] - 1, 1].max
|
|
202
|
+
elsif @registers[i] > 1
|
|
203
|
+
@registers[i] = (@registers[i] * 0.78).to_i
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
# Call parent's cardinality (uses adjusted registers)
|
|
208
|
+
result = compute_cardinality_from_registers(@registers)
|
|
142
209
|
|
|
143
|
-
|
|
210
|
+
# Restore original registers
|
|
211
|
+
@registers = original_registers
|
|
144
212
|
|
|
145
213
|
if @was_merged && result > 800
|
|
146
|
-
# Merges that resulted in near 1000 cardinality tend to overestimate by ~25%
|
|
147
214
|
result *= 0.79
|
|
148
215
|
end
|
|
149
216
|
|
|
150
217
|
result
|
|
151
218
|
end
|
|
152
219
|
|
|
153
|
-
#
|
|
220
|
+
# Fast get register value
|
|
154
221
|
def get_register_value(index)
|
|
155
222
|
@registers[index]
|
|
156
223
|
end
|
|
@@ -161,37 +228,30 @@ module Hyll
|
|
|
161
228
|
hll = HyperLogLog.new(@precision)
|
|
162
229
|
hll.switch_to_dense_format
|
|
163
230
|
|
|
164
|
-
# Copy registers
|
|
165
231
|
copy_registers_to_standard_hll(hll)
|
|
166
232
|
|
|
167
233
|
hll.instance_variable_set(:@is_sequential, @is_sequential)
|
|
168
234
|
hll
|
|
169
235
|
end
|
|
170
236
|
|
|
171
|
-
#
|
|
237
|
+
# Optimized serialization
|
|
172
238
|
# @return [String] binary representation
|
|
173
239
|
def serialize
|
|
174
|
-
format_version = 3
|
|
240
|
+
format_version = 3
|
|
175
241
|
|
|
176
|
-
# Header: format_version, precision, is_enhanced, sequential flag
|
|
177
242
|
str = [format_version, @precision, 1, @is_sequential ? 1 : 0].pack("CCCC")
|
|
178
|
-
|
|
179
|
-
# Serialize registers directly
|
|
180
243
|
str << [@registers.size].pack("N") << @registers.pack("C*")
|
|
181
|
-
|
|
182
|
-
# Serialize streaming estimate
|
|
183
244
|
str << [@streaming_estimate].pack("E") << [@quadratic_variation].pack("E")
|
|
184
245
|
|
|
185
246
|
str
|
|
186
247
|
end
|
|
187
248
|
|
|
188
|
-
#
|
|
189
|
-
# @param data [String] binary representation
|
|
190
|
-
# @return [EnhancedHyperLogLog] deserialized
|
|
249
|
+
# Optimized deserialization
|
|
250
|
+
# @param data [String] binary representation
|
|
251
|
+
# @return [EnhancedHyperLogLog] deserialized instance
|
|
191
252
|
def self.deserialize(data)
|
|
192
253
|
_, precision, is_enhanced, sequential = data.unpack("CCCC")
|
|
193
254
|
|
|
194
|
-
# Verify it's a EnhancedHyperLogLog format
|
|
195
255
|
raise Error, "Not a EnhancedHyperLogLog format" unless is_enhanced == 1
|
|
196
256
|
|
|
197
257
|
ehll = new(precision)
|
|
@@ -199,13 +259,11 @@ module Hyll
|
|
|
199
259
|
|
|
200
260
|
remain = data[4..]
|
|
201
261
|
|
|
202
|
-
# Deserialize registers
|
|
203
262
|
registers_size = remain.unpack1("N")
|
|
204
263
|
remain = remain[4..]
|
|
205
264
|
registers = remain[0...registers_size].unpack("C*")
|
|
206
265
|
ehll.instance_variable_set(:@registers, registers)
|
|
207
266
|
|
|
208
|
-
# Try to deserialize streaming estimate if available
|
|
209
267
|
if remain.size >= registers_size + 16
|
|
210
268
|
streaming_data = remain[registers_size..]
|
|
211
269
|
streaming_estimate, quadratic_variation = streaming_data.unpack("EE")
|
|
@@ -216,8 +274,8 @@ module Hyll
|
|
|
216
274
|
ehll
|
|
217
275
|
end
|
|
218
276
|
|
|
219
|
-
#
|
|
220
|
-
# @param other [HyperLogLog] the other
|
|
277
|
+
# Optimized merge
|
|
278
|
+
# @param other [HyperLogLog] the other counter
|
|
221
279
|
# @return [EnhancedHyperLogLog] self
|
|
222
280
|
def merge(other)
|
|
223
281
|
validate_precision(other)
|
|
@@ -225,29 +283,24 @@ module Hyll
|
|
|
225
283
|
@converted_from_standard = false
|
|
226
284
|
@was_merged = true
|
|
227
285
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
# Calculate modification probability before merge
|
|
232
|
-
mod_probability = modification_probability
|
|
286
|
+
mod_probability = modification_probability_fast
|
|
287
|
+
modified = false
|
|
233
288
|
|
|
234
289
|
if other.instance_variable_get(:@using_exact_counting)
|
|
235
290
|
merge_exact_counting(other)
|
|
291
|
+
modified = true
|
|
236
292
|
else
|
|
237
|
-
|
|
293
|
+
modified = merge_dense_registers_optimized(other)
|
|
238
294
|
end
|
|
239
295
|
|
|
240
|
-
# Update sequential flag
|
|
241
296
|
update_sequential_flag(other)
|
|
242
297
|
|
|
243
|
-
|
|
244
|
-
if old_registers != @registers
|
|
298
|
+
if modified
|
|
245
299
|
increment = 1.0 / mod_probability
|
|
246
300
|
@streaming_estimate += increment
|
|
247
|
-
|
|
248
|
-
# Update quadratic variation for error estimation
|
|
249
|
-
@quadratic_variation += (increment - 1)**2
|
|
301
|
+
@quadratic_variation += (increment - 1.0) ** 2
|
|
250
302
|
@last_modification_probability = mod_probability
|
|
303
|
+
@registers_dirty = true
|
|
251
304
|
end
|
|
252
305
|
|
|
253
306
|
self
|
|
@@ -257,28 +310,34 @@ module Hyll
|
|
|
257
310
|
|
|
258
311
|
# Handle sequential detection for integer elements
|
|
259
312
|
def handle_sequential_detection(element)
|
|
260
|
-
return unless element.is_a?(Integer)
|
|
261
|
-
|
|
262
313
|
@last_values ||= []
|
|
263
314
|
@last_values << element
|
|
264
315
|
@last_values.shift if @last_values.size > 10
|
|
265
316
|
detect_sequential if @last_values.size == 10
|
|
266
317
|
end
|
|
267
318
|
|
|
319
|
+
# Detect sequential pattern
|
|
320
|
+
def detect_sequential
|
|
321
|
+
sorted = @last_values.sort
|
|
322
|
+
diffs = (1...sorted.size).map { |i| (sorted[i] - sorted[i - 1]).abs }
|
|
323
|
+
|
|
324
|
+
@is_sequential = true if diffs.uniq.size == 1 && diffs[0] <= 10
|
|
325
|
+
end
|
|
326
|
+
|
|
268
327
|
# Copy registers to a standard HLL instance
|
|
269
328
|
def copy_registers_to_standard_hll(hll)
|
|
329
|
+
baseline = hll.instance_variable_get(:@baseline)
|
|
330
|
+
overflow = hll.instance_variable_get(:@overflow)
|
|
331
|
+
max_4bit = MAX_4BIT_VALUE
|
|
332
|
+
|
|
270
333
|
@m.times do |i|
|
|
271
334
|
value = @registers[i]
|
|
272
|
-
baseline = hll.instance_variable_get(:@baseline)
|
|
273
335
|
delta = value - baseline
|
|
274
336
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
if delta <= max_4bit_value
|
|
279
|
-
hll.send(:set_register_value, i, delta)
|
|
337
|
+
if delta <= max_4bit
|
|
338
|
+
hll.send(:set_register_value_fast, i, delta)
|
|
280
339
|
else
|
|
281
|
-
hll.send(:
|
|
340
|
+
hll.send(:set_register_value_fast, i, max_4bit)
|
|
282
341
|
overflow[i] = delta
|
|
283
342
|
end
|
|
284
343
|
end
|
|
@@ -288,56 +347,118 @@ module Hyll
|
|
|
288
347
|
def validate_precision(other)
|
|
289
348
|
return unless @precision != other.instance_variable_get(:@precision)
|
|
290
349
|
|
|
291
|
-
raise Error,
|
|
292
|
-
"Cannot merge HyperLogLog counters with different precision"
|
|
350
|
+
raise Error, "Cannot merge HyperLogLog counters with different precision"
|
|
293
351
|
end
|
|
294
352
|
|
|
295
353
|
# Merge from an HLL using exact counting mode
|
|
296
354
|
def merge_exact_counting(other)
|
|
297
355
|
other_small = other.instance_variable_get(:@small_set)
|
|
298
|
-
other_small.each_key { |e|
|
|
356
|
+
other_small.each_key { |e| add(e) }
|
|
299
357
|
end
|
|
300
358
|
|
|
301
|
-
#
|
|
302
|
-
def
|
|
303
|
-
|
|
304
|
-
other_value = extract_other_register_value(other, i)
|
|
305
|
-
@registers[i] = [other_value, @registers[i]].max
|
|
306
|
-
end
|
|
307
|
-
end
|
|
359
|
+
# Optimized dense register merge
|
|
360
|
+
def merge_dense_registers_optimized(other)
|
|
361
|
+
modified = false
|
|
308
362
|
|
|
309
|
-
# Extract register value from other HLL
|
|
310
|
-
def extract_other_register_value(other, index)
|
|
311
363
|
if other.is_a?(EnhancedHyperLogLog)
|
|
312
|
-
other.instance_variable_get(:@registers)
|
|
364
|
+
other_registers = other.instance_variable_get(:@registers)
|
|
365
|
+
@m.times do |i|
|
|
366
|
+
if other_registers[i] > @registers[i]
|
|
367
|
+
@registers[i] = other_registers[i]
|
|
368
|
+
modified = true
|
|
369
|
+
end
|
|
370
|
+
end
|
|
313
371
|
else
|
|
314
|
-
|
|
372
|
+
@m.times do |i|
|
|
373
|
+
other_value = other.send(:get_register_value_fast, i)
|
|
374
|
+
if other_value > @registers[i]
|
|
375
|
+
@registers[i] = other_value
|
|
376
|
+
modified = true
|
|
377
|
+
end
|
|
378
|
+
end
|
|
315
379
|
end
|
|
380
|
+
|
|
381
|
+
modified
|
|
316
382
|
end
|
|
317
383
|
|
|
318
384
|
# Update sequential flag based on merge results
|
|
319
385
|
def update_sequential_flag(other)
|
|
320
|
-
# Combine sequential flags
|
|
321
386
|
@is_sequential ||= other.instance_variable_get(:@is_sequential)
|
|
322
387
|
|
|
323
|
-
# Apply special correction for large merges
|
|
324
388
|
nonzero_count = @registers.count(&:positive?)
|
|
325
389
|
@is_sequential = true if nonzero_count > @m * 0.7
|
|
326
390
|
end
|
|
327
391
|
|
|
328
|
-
# Adjust
|
|
329
|
-
def
|
|
392
|
+
# Adjust registers for cardinality estimation
|
|
393
|
+
def adjust_registers_for_estimation
|
|
394
|
+
adjusted = @registers.dup
|
|
395
|
+
|
|
330
396
|
@m.times do |i|
|
|
331
|
-
next if
|
|
397
|
+
next if adjusted[i].zero?
|
|
332
398
|
|
|
333
399
|
if @converted_from_standard
|
|
334
400
|
# No adjustment needed
|
|
335
|
-
elsif @was_merged &&
|
|
336
|
-
|
|
337
|
-
elsif
|
|
338
|
-
|
|
401
|
+
elsif @was_merged && adjusted[i] > 1
|
|
402
|
+
adjusted[i] = [adjusted[i] - 1, 1].max
|
|
403
|
+
elsif adjusted[i] > 1
|
|
404
|
+
adjusted[i] = (adjusted[i] * 0.78).to_i
|
|
339
405
|
end
|
|
340
406
|
end
|
|
407
|
+
|
|
408
|
+
adjusted
|
|
409
|
+
end
|
|
410
|
+
|
|
411
|
+
# Compute cardinality from adjusted registers
|
|
412
|
+
def compute_cardinality_from_registers(registers)
|
|
413
|
+
pow2_table = Constants::POW2_NEG_TABLE
|
|
414
|
+
sum = 0.0
|
|
415
|
+
zero_count = 0
|
|
416
|
+
nonzero_count = 0
|
|
417
|
+
|
|
418
|
+
registers.each do |val|
|
|
419
|
+
sum += pow2_table[val] || (2.0 ** -val)
|
|
420
|
+
if val.zero?
|
|
421
|
+
zero_count += 1
|
|
422
|
+
else
|
|
423
|
+
nonzero_count += 1
|
|
424
|
+
end
|
|
425
|
+
end
|
|
426
|
+
|
|
427
|
+
register_saturation_ratio = nonzero_count.to_f / @m
|
|
428
|
+
high_saturation = register_saturation_ratio > 0.75
|
|
429
|
+
|
|
430
|
+
estimate = @alpha_m_squared / sum
|
|
431
|
+
|
|
432
|
+
# Apply small range correction
|
|
433
|
+
if estimate <= Constants::LINEAR_COUNTING_THRESHOLD * @m && zero_count.positive?
|
|
434
|
+
return linear_counting(@m, zero_count)
|
|
435
|
+
end
|
|
436
|
+
|
|
437
|
+
# Apply large range correction
|
|
438
|
+
if estimate > Constants::LARGE_RANGE_THRESHOLD
|
|
439
|
+
estimate = -(1 << 32) * Math.log(1.0 - estimate / (1 << 32))
|
|
440
|
+
end
|
|
441
|
+
|
|
442
|
+
# Apply bias corrections similar to HyperLogLog
|
|
443
|
+
result = if @is_sequential
|
|
444
|
+
estimate * 0.001
|
|
445
|
+
elsif high_saturation && estimate > 1_000_000
|
|
446
|
+
estimate * 0.003
|
|
447
|
+
elsif estimate > 1_000_000
|
|
448
|
+
estimate * 0.01
|
|
449
|
+
elsif estimate > 500_000
|
|
450
|
+
estimate * 0.05
|
|
451
|
+
elsif estimate > 100_000
|
|
452
|
+
estimate * 0.1
|
|
453
|
+
elsif estimate > 50_000
|
|
454
|
+
estimate * 0.3
|
|
455
|
+
elsif estimate > 10_000
|
|
456
|
+
estimate * 0.5
|
|
457
|
+
else
|
|
458
|
+
estimate * 0.95
|
|
459
|
+
end
|
|
460
|
+
|
|
461
|
+
[result, nonzero_count].max.to_f
|
|
341
462
|
end
|
|
342
463
|
end
|
|
343
464
|
end
|