hyll 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +80 -0
- data/README.md +53 -18
- data/examples/v1_benchmark.rb +93 -0
- data/lib/hyll/algorithms/enhanced_hyperloglog.rb +234 -120
- data/lib/hyll/algorithms/hyperloglog.rb +262 -338
- data/lib/hyll/constants.rb +75 -0
- data/lib/hyll/utils/hash.rb +132 -21
- data/lib/hyll/utils/math.rb +129 -75
- data/lib/hyll/version.rb +1 -1
- metadata +3 -2
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Hyll
|
|
4
|
-
#
|
|
4
|
+
# Ultra-optimized EnhancedHyperLogLog v1.0.0
|
|
5
|
+
# A strictly enhanced version of HyperLogLog with streaming martingale estimator
|
|
6
|
+
# Features: vectorized operations, in-place updates, minimal allocations
|
|
5
7
|
class EnhancedHyperLogLog < HyperLogLog
|
|
6
8
|
def initialize(precision = 10)
|
|
7
9
|
super(precision)
|
|
8
10
|
|
|
9
|
-
# Always use dense format
|
|
11
|
+
# Always use dense format - pre-allocate for zero GC pressure
|
|
10
12
|
@using_exact_counting = false
|
|
11
13
|
@small_set = nil
|
|
12
14
|
@registers = Array.new(@m, 0)
|
|
@@ -14,150 +16,208 @@ module Hyll
|
|
|
14
16
|
|
|
15
17
|
# Flag to track if this was converted from standard format
|
|
16
18
|
@converted_from_standard = false
|
|
17
|
-
|
|
18
19
|
@was_merged = false
|
|
19
20
|
|
|
20
|
-
# Streaming martingale estimator
|
|
21
|
+
# Streaming martingale estimator - optimized state
|
|
21
22
|
@streaming_estimate = 0.0
|
|
22
23
|
@last_modification_probability = nil
|
|
23
24
|
@quadratic_variation = 0.0
|
|
25
|
+
|
|
26
|
+
# Cache for modification probability
|
|
27
|
+
@cached_mod_prob = nil
|
|
28
|
+
@registers_dirty = true
|
|
24
29
|
end
|
|
25
30
|
|
|
26
|
-
# Add an element
|
|
31
|
+
# Add an element - ultra-optimized path
|
|
27
32
|
# @param element [Object] the element to add
|
|
28
33
|
# @return [EnhancedHyperLogLog] self for method chaining
|
|
29
34
|
def add(element)
|
|
30
|
-
#
|
|
31
|
-
|
|
35
|
+
# Hash and extract in one pass
|
|
36
|
+
hash = murmurhash3(element.to_s)
|
|
37
|
+
register_index = hash & @register_mask
|
|
38
|
+
value = hash >> @precision
|
|
39
|
+
leading_zeros = count_leading_zeros(value) + 1
|
|
40
|
+
|
|
41
|
+
old_value = @registers[register_index]
|
|
32
42
|
|
|
33
|
-
#
|
|
34
|
-
|
|
43
|
+
# Fast path: no update needed
|
|
44
|
+
return self if leading_zeros <= old_value
|
|
35
45
|
|
|
36
|
-
#
|
|
37
|
-
|
|
46
|
+
# Calculate modification probability before update
|
|
47
|
+
mod_probability = modification_probability_fast
|
|
38
48
|
|
|
49
|
+
# Update register
|
|
50
|
+
@registers[register_index] = leading_zeros
|
|
39
51
|
@converted_from_standard = false
|
|
52
|
+
@registers_dirty = true
|
|
40
53
|
|
|
41
|
-
#
|
|
42
|
-
|
|
54
|
+
# Update streaming estimate
|
|
55
|
+
increment = 1.0 / mod_probability
|
|
56
|
+
@streaming_estimate += increment
|
|
57
|
+
@quadratic_variation += (increment - 1.0) ** 2
|
|
58
|
+
@last_modification_probability = mod_probability
|
|
43
59
|
|
|
44
|
-
#
|
|
45
|
-
if
|
|
46
|
-
|
|
47
|
-
|
|
60
|
+
# Sequential detection for integers (deferred)
|
|
61
|
+
handle_sequential_detection(element) if element.is_a?(Integer)
|
|
62
|
+
|
|
63
|
+
self
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# Batch add - optimized for throughput
|
|
67
|
+
# @param elements [Array] elements to add
|
|
68
|
+
# @return [EnhancedHyperLogLog] self
|
|
69
|
+
def add_all(elements)
|
|
70
|
+
return self if elements.empty?
|
|
71
|
+
|
|
72
|
+
mod_probability = modification_probability_fast
|
|
73
|
+
modified = false
|
|
74
|
+
|
|
75
|
+
elements.each do |element|
|
|
76
|
+
hash = murmurhash3(element.to_s)
|
|
77
|
+
register_index = hash & @register_mask
|
|
78
|
+
value = hash >> @precision
|
|
79
|
+
leading_zeros = count_leading_zeros(value) + 1
|
|
80
|
+
|
|
81
|
+
if leading_zeros > @registers[register_index]
|
|
82
|
+
@registers[register_index] = leading_zeros
|
|
83
|
+
modified = true
|
|
48
84
|
|
|
49
|
-
|
|
50
|
-
|
|
85
|
+
increment = 1.0 / mod_probability
|
|
86
|
+
@streaming_estimate += increment
|
|
87
|
+
@quadratic_variation += (increment - 1.0) ** 2
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
if modified
|
|
92
|
+
@converted_from_standard = false
|
|
93
|
+
@registers_dirty = true
|
|
51
94
|
@last_modification_probability = mod_probability
|
|
52
95
|
end
|
|
53
96
|
|
|
97
|
+
# Sequential detection for integer batches
|
|
98
|
+
if elements.first.is_a?(Integer)
|
|
99
|
+
@last_values = elements.last(10)
|
|
100
|
+
detect_sequential if @last_values.size >= 10
|
|
101
|
+
end
|
|
102
|
+
|
|
54
103
|
self
|
|
55
104
|
end
|
|
56
105
|
|
|
57
|
-
# Calculate
|
|
106
|
+
# Calculate modification probability - cached for performance
|
|
58
107
|
# @return [Float] probability of modification
|
|
59
108
|
def modification_probability
|
|
109
|
+
modification_probability_fast
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
# Fast modification probability with caching
|
|
113
|
+
# @return [Float] probability of modification
|
|
114
|
+
def modification_probability_fast
|
|
60
115
|
return 1.0 if @registers.all?(&:zero?)
|
|
61
116
|
|
|
62
|
-
#
|
|
63
|
-
|
|
64
|
-
|
|
117
|
+
# Use cached value if registers haven't changed
|
|
118
|
+
return @cached_mod_prob if @cached_mod_prob && !@registers_dirty
|
|
119
|
+
|
|
120
|
+
# Calculate using lookup table
|
|
121
|
+
pow2_table = Constants::POW2_NEG_TABLE
|
|
122
|
+
sum = 0.0
|
|
123
|
+
|
|
124
|
+
@registers.each do |r|
|
|
125
|
+
sum += pow2_table[r] || (2.0 ** -r)
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
@cached_mod_prob = sum / @m
|
|
129
|
+
@registers_dirty = false
|
|
130
|
+
@cached_mod_prob
|
|
65
131
|
end
|
|
66
132
|
|
|
67
133
|
# Get the streaming cardinality estimate
|
|
68
134
|
# @return [Float] the estimated cardinality
|
|
69
135
|
def streaming_cardinality
|
|
70
|
-
# If no modifications yet, return super implementation
|
|
71
136
|
return super.cardinality if @streaming_estimate.zero?
|
|
137
|
+
return super.cardinality if modification_probability_fast < 1e-6
|
|
72
138
|
|
|
73
|
-
# If the sketch is saturated, fall back to standard estimate
|
|
74
|
-
return super.cardinality if modification_probability < 1e-6
|
|
75
|
-
|
|
76
|
-
# Return the streaming estimate
|
|
77
139
|
@streaming_estimate
|
|
78
140
|
end
|
|
79
141
|
|
|
80
142
|
# Estimate the variance of the streaming estimate
|
|
81
143
|
# @return [Float] the estimated variance
|
|
82
144
|
def streaming_variance
|
|
83
|
-
# If no modifications, return 0
|
|
84
145
|
return 0.0 if @last_modification_probability.nil?
|
|
85
|
-
|
|
86
|
-
# Calculate variance based on martingale properties
|
|
87
|
-
# This provides an unbiased estimate of the variance
|
|
88
146
|
@quadratic_variation
|
|
89
147
|
end
|
|
90
148
|
|
|
91
|
-
# Get error bounds for the streaming estimate
|
|
149
|
+
# Get error bounds for the streaming estimate - optimized
|
|
92
150
|
# @param confidence [Float] confidence level (default: 0.95)
|
|
93
151
|
# @return [Array<Float>] lower and upper bounds
|
|
94
152
|
def streaming_error_bounds(confidence = 0.95)
|
|
95
|
-
# If no modifications, return exact bounds
|
|
96
153
|
return [@streaming_estimate, @streaming_estimate] if @last_modification_probability.nil?
|
|
97
154
|
|
|
98
|
-
#
|
|
99
|
-
# For 95% confidence, z ≈ 1.96
|
|
155
|
+
# Pre-computed z-scores for common confidence levels
|
|
100
156
|
z = case confidence
|
|
101
157
|
when 0.90 then 1.645
|
|
102
158
|
when 0.95 then 1.96
|
|
103
159
|
when 0.99 then 2.576
|
|
104
160
|
else
|
|
105
|
-
# Calculate using inverse error function for any confidence level
|
|
106
161
|
Math.sqrt(2) * Math.erfc(2 * (1 - confidence))
|
|
107
162
|
end
|
|
108
163
|
|
|
109
|
-
|
|
110
|
-
std_error = Math.sqrt(streaming_variance)
|
|
111
|
-
|
|
112
|
-
# Return confidence interval
|
|
164
|
+
std_error = Math.sqrt(@quadratic_variation)
|
|
113
165
|
[@streaming_estimate - z * std_error, @streaming_estimate + z * std_error]
|
|
114
166
|
end
|
|
115
167
|
|
|
116
|
-
#
|
|
168
|
+
# Direct register update - optimized
|
|
117
169
|
def update_register(index, value)
|
|
118
|
-
# Store the registers before updating
|
|
119
|
-
@registers.dup
|
|
120
170
|
old_value = @registers[index]
|
|
171
|
+
return unless value > old_value
|
|
121
172
|
|
|
122
|
-
|
|
123
|
-
mod_probability = modification_probability
|
|
124
|
-
|
|
125
|
-
current_value = @registers[index]
|
|
126
|
-
return unless value > current_value
|
|
173
|
+
mod_probability = modification_probability_fast
|
|
127
174
|
|
|
128
175
|
@registers[index] = value
|
|
129
176
|
@converted_from_standard = false
|
|
130
|
-
|
|
131
|
-
# Update streaming estimate if the register was modified
|
|
132
|
-
return unless old_value != value
|
|
177
|
+
@registers_dirty = true
|
|
133
178
|
|
|
134
179
|
increment = 1.0 / mod_probability
|
|
135
180
|
@streaming_estimate += increment
|
|
136
|
-
|
|
137
|
-
# Update quadratic variation for error estimation
|
|
138
|
-
@quadratic_variation += (increment - 1)**2
|
|
181
|
+
@quadratic_variation += (increment - 1.0) ** 2
|
|
139
182
|
@last_modification_probability = mod_probability
|
|
140
183
|
end
|
|
141
184
|
|
|
142
|
-
# Override cardinality
|
|
185
|
+
# Override cardinality - optimized estimation
|
|
143
186
|
# @param use_streaming [Boolean] whether to use the streaming estimator
|
|
144
187
|
# @return [Float] the estimated cardinality
|
|
145
188
|
def cardinality(use_streaming = false)
|
|
146
189
|
return streaming_cardinality if use_streaming
|
|
147
190
|
|
|
148
|
-
|
|
191
|
+
# Save original registers
|
|
192
|
+
original_registers = @registers.dup
|
|
193
|
+
|
|
194
|
+
# Apply adjustments in-place for super call
|
|
195
|
+
@m.times do |i|
|
|
196
|
+
next if @registers[i].zero?
|
|
197
|
+
|
|
198
|
+
if @converted_from_standard
|
|
199
|
+
# No adjustment needed
|
|
200
|
+
elsif @was_merged && @registers[i] > 1
|
|
201
|
+
@registers[i] = [@registers[i] - 1, 1].max
|
|
202
|
+
elsif @registers[i] > 1
|
|
203
|
+
@registers[i] = (@registers[i] * 0.78).to_i
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
# Call parent's cardinality (uses adjusted registers)
|
|
208
|
+
result = compute_cardinality_from_registers(@registers)
|
|
149
209
|
|
|
150
|
-
|
|
210
|
+
# Restore original registers
|
|
211
|
+
@registers = original_registers
|
|
151
212
|
|
|
152
213
|
if @was_merged && result > 800
|
|
153
|
-
# Merges that resulted in near 1000 cardinality tend to overestimate by ~25%
|
|
154
214
|
result *= 0.79
|
|
155
215
|
end
|
|
156
216
|
|
|
157
217
|
result
|
|
158
218
|
end
|
|
159
219
|
|
|
160
|
-
#
|
|
220
|
+
# Fast get register value
|
|
161
221
|
def get_register_value(index)
|
|
162
222
|
@registers[index]
|
|
163
223
|
end
|
|
@@ -168,37 +228,30 @@ module Hyll
|
|
|
168
228
|
hll = HyperLogLog.new(@precision)
|
|
169
229
|
hll.switch_to_dense_format
|
|
170
230
|
|
|
171
|
-
# Copy registers
|
|
172
231
|
copy_registers_to_standard_hll(hll)
|
|
173
232
|
|
|
174
233
|
hll.instance_variable_set(:@is_sequential, @is_sequential)
|
|
175
234
|
hll
|
|
176
235
|
end
|
|
177
236
|
|
|
178
|
-
#
|
|
237
|
+
# Optimized serialization
|
|
179
238
|
# @return [String] binary representation
|
|
180
239
|
def serialize
|
|
181
|
-
format_version = 3
|
|
240
|
+
format_version = 3
|
|
182
241
|
|
|
183
|
-
# Header: format_version, precision, is_enhanced, sequential flag
|
|
184
242
|
str = [format_version, @precision, 1, @is_sequential ? 1 : 0].pack("CCCC")
|
|
185
|
-
|
|
186
|
-
# Serialize registers directly
|
|
187
243
|
str << [@registers.size].pack("N") << @registers.pack("C*")
|
|
188
|
-
|
|
189
|
-
# Serialize streaming estimate
|
|
190
244
|
str << [@streaming_estimate].pack("E") << [@quadratic_variation].pack("E")
|
|
191
245
|
|
|
192
246
|
str
|
|
193
247
|
end
|
|
194
248
|
|
|
195
|
-
#
|
|
196
|
-
# @param data [String] binary representation
|
|
197
|
-
# @return [EnhancedHyperLogLog] deserialized
|
|
249
|
+
# Optimized deserialization
|
|
250
|
+
# @param data [String] binary representation
|
|
251
|
+
# @return [EnhancedHyperLogLog] deserialized instance
|
|
198
252
|
def self.deserialize(data)
|
|
199
253
|
_, precision, is_enhanced, sequential = data.unpack("CCCC")
|
|
200
254
|
|
|
201
|
-
# Verify it's a EnhancedHyperLogLog format
|
|
202
255
|
raise Error, "Not a EnhancedHyperLogLog format" unless is_enhanced == 1
|
|
203
256
|
|
|
204
257
|
ehll = new(precision)
|
|
@@ -206,13 +259,11 @@ module Hyll
|
|
|
206
259
|
|
|
207
260
|
remain = data[4..]
|
|
208
261
|
|
|
209
|
-
# Deserialize registers
|
|
210
262
|
registers_size = remain.unpack1("N")
|
|
211
263
|
remain = remain[4..]
|
|
212
264
|
registers = remain[0...registers_size].unpack("C*")
|
|
213
265
|
ehll.instance_variable_set(:@registers, registers)
|
|
214
266
|
|
|
215
|
-
# Try to deserialize streaming estimate if available
|
|
216
267
|
if remain.size >= registers_size + 16
|
|
217
268
|
streaming_data = remain[registers_size..]
|
|
218
269
|
streaming_estimate, quadratic_variation = streaming_data.unpack("EE")
|
|
@@ -223,8 +274,8 @@ module Hyll
|
|
|
223
274
|
ehll
|
|
224
275
|
end
|
|
225
276
|
|
|
226
|
-
#
|
|
227
|
-
# @param other [HyperLogLog] the other
|
|
277
|
+
# Optimized merge
|
|
278
|
+
# @param other [HyperLogLog] the other counter
|
|
228
279
|
# @return [EnhancedHyperLogLog] self
|
|
229
280
|
def merge(other)
|
|
230
281
|
validate_precision(other)
|
|
@@ -232,29 +283,24 @@ module Hyll
|
|
|
232
283
|
@converted_from_standard = false
|
|
233
284
|
@was_merged = true
|
|
234
285
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
# Calculate modification probability before merge
|
|
239
|
-
mod_probability = modification_probability
|
|
286
|
+
mod_probability = modification_probability_fast
|
|
287
|
+
modified = false
|
|
240
288
|
|
|
241
289
|
if other.instance_variable_get(:@using_exact_counting)
|
|
242
290
|
merge_exact_counting(other)
|
|
291
|
+
modified = true
|
|
243
292
|
else
|
|
244
|
-
|
|
293
|
+
modified = merge_dense_registers_optimized(other)
|
|
245
294
|
end
|
|
246
295
|
|
|
247
|
-
# Update sequential flag
|
|
248
296
|
update_sequential_flag(other)
|
|
249
297
|
|
|
250
|
-
|
|
251
|
-
if old_registers != @registers
|
|
298
|
+
if modified
|
|
252
299
|
increment = 1.0 / mod_probability
|
|
253
300
|
@streaming_estimate += increment
|
|
254
|
-
|
|
255
|
-
# Update quadratic variation for error estimation
|
|
256
|
-
@quadratic_variation += (increment - 1)**2
|
|
301
|
+
@quadratic_variation += (increment - 1.0) ** 2
|
|
257
302
|
@last_modification_probability = mod_probability
|
|
303
|
+
@registers_dirty = true
|
|
258
304
|
end
|
|
259
305
|
|
|
260
306
|
self
|
|
@@ -264,28 +310,34 @@ module Hyll
|
|
|
264
310
|
|
|
265
311
|
# Handle sequential detection for integer elements
|
|
266
312
|
def handle_sequential_detection(element)
|
|
267
|
-
return unless element.is_a?(Integer)
|
|
268
|
-
|
|
269
313
|
@last_values ||= []
|
|
270
314
|
@last_values << element
|
|
271
315
|
@last_values.shift if @last_values.size > 10
|
|
272
316
|
detect_sequential if @last_values.size == 10
|
|
273
317
|
end
|
|
274
318
|
|
|
319
|
+
# Detect sequential pattern
|
|
320
|
+
def detect_sequential
|
|
321
|
+
sorted = @last_values.sort
|
|
322
|
+
diffs = (1...sorted.size).map { |i| (sorted[i] - sorted[i - 1]).abs }
|
|
323
|
+
|
|
324
|
+
@is_sequential = true if diffs.uniq.size == 1 && diffs[0] <= 10
|
|
325
|
+
end
|
|
326
|
+
|
|
275
327
|
# Copy registers to a standard HLL instance
|
|
276
328
|
def copy_registers_to_standard_hll(hll)
|
|
329
|
+
baseline = hll.instance_variable_get(:@baseline)
|
|
330
|
+
overflow = hll.instance_variable_get(:@overflow)
|
|
331
|
+
max_4bit = MAX_4BIT_VALUE
|
|
332
|
+
|
|
277
333
|
@m.times do |i|
|
|
278
334
|
value = @registers[i]
|
|
279
|
-
baseline = hll.instance_variable_get(:@baseline)
|
|
280
335
|
delta = value - baseline
|
|
281
336
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
if delta <= max_4bit_value
|
|
286
|
-
hll.send(:set_register_value, i, delta)
|
|
337
|
+
if delta <= max_4bit
|
|
338
|
+
hll.send(:set_register_value_fast, i, delta)
|
|
287
339
|
else
|
|
288
|
-
hll.send(:
|
|
340
|
+
hll.send(:set_register_value_fast, i, max_4bit)
|
|
289
341
|
overflow[i] = delta
|
|
290
342
|
end
|
|
291
343
|
end
|
|
@@ -295,56 +347,118 @@ module Hyll
|
|
|
295
347
|
def validate_precision(other)
|
|
296
348
|
return unless @precision != other.instance_variable_get(:@precision)
|
|
297
349
|
|
|
298
|
-
raise Error,
|
|
299
|
-
"Cannot merge HyperLogLog counters with different precision"
|
|
350
|
+
raise Error, "Cannot merge HyperLogLog counters with different precision"
|
|
300
351
|
end
|
|
301
352
|
|
|
302
353
|
# Merge from an HLL using exact counting mode
|
|
303
354
|
def merge_exact_counting(other)
|
|
304
355
|
other_small = other.instance_variable_get(:@small_set)
|
|
305
|
-
other_small.each_key { |e|
|
|
356
|
+
other_small.each_key { |e| add(e) }
|
|
306
357
|
end
|
|
307
358
|
|
|
308
|
-
#
|
|
309
|
-
def
|
|
310
|
-
|
|
311
|
-
other_value = extract_other_register_value(other, i)
|
|
312
|
-
@registers[i] = [other_value, @registers[i]].max
|
|
313
|
-
end
|
|
314
|
-
end
|
|
359
|
+
# Optimized dense register merge
|
|
360
|
+
def merge_dense_registers_optimized(other)
|
|
361
|
+
modified = false
|
|
315
362
|
|
|
316
|
-
# Extract register value from other HLL
|
|
317
|
-
def extract_other_register_value(other, index)
|
|
318
363
|
if other.is_a?(EnhancedHyperLogLog)
|
|
319
|
-
other.instance_variable_get(:@registers)
|
|
364
|
+
other_registers = other.instance_variable_get(:@registers)
|
|
365
|
+
@m.times do |i|
|
|
366
|
+
if other_registers[i] > @registers[i]
|
|
367
|
+
@registers[i] = other_registers[i]
|
|
368
|
+
modified = true
|
|
369
|
+
end
|
|
370
|
+
end
|
|
320
371
|
else
|
|
321
|
-
|
|
372
|
+
@m.times do |i|
|
|
373
|
+
other_value = other.send(:get_register_value_fast, i)
|
|
374
|
+
if other_value > @registers[i]
|
|
375
|
+
@registers[i] = other_value
|
|
376
|
+
modified = true
|
|
377
|
+
end
|
|
378
|
+
end
|
|
322
379
|
end
|
|
380
|
+
|
|
381
|
+
modified
|
|
323
382
|
end
|
|
324
383
|
|
|
325
384
|
# Update sequential flag based on merge results
|
|
326
385
|
def update_sequential_flag(other)
|
|
327
|
-
# Combine sequential flags
|
|
328
386
|
@is_sequential ||= other.instance_variable_get(:@is_sequential)
|
|
329
387
|
|
|
330
|
-
# Apply special correction for large merges
|
|
331
388
|
nonzero_count = @registers.count(&:positive?)
|
|
332
389
|
@is_sequential = true if nonzero_count > @m * 0.7
|
|
333
390
|
end
|
|
334
391
|
|
|
335
|
-
# Adjust
|
|
336
|
-
def
|
|
392
|
+
# Adjust registers for cardinality estimation
|
|
393
|
+
def adjust_registers_for_estimation
|
|
394
|
+
adjusted = @registers.dup
|
|
395
|
+
|
|
337
396
|
@m.times do |i|
|
|
338
|
-
next if
|
|
397
|
+
next if adjusted[i].zero?
|
|
339
398
|
|
|
340
399
|
if @converted_from_standard
|
|
341
400
|
# No adjustment needed
|
|
342
|
-
elsif @was_merged &&
|
|
343
|
-
|
|
344
|
-
elsif
|
|
345
|
-
|
|
401
|
+
elsif @was_merged && adjusted[i] > 1
|
|
402
|
+
adjusted[i] = [adjusted[i] - 1, 1].max
|
|
403
|
+
elsif adjusted[i] > 1
|
|
404
|
+
adjusted[i] = (adjusted[i] * 0.78).to_i
|
|
346
405
|
end
|
|
347
406
|
end
|
|
407
|
+
|
|
408
|
+
adjusted
|
|
409
|
+
end
|
|
410
|
+
|
|
411
|
+
# Compute cardinality from adjusted registers
|
|
412
|
+
def compute_cardinality_from_registers(registers)
|
|
413
|
+
pow2_table = Constants::POW2_NEG_TABLE
|
|
414
|
+
sum = 0.0
|
|
415
|
+
zero_count = 0
|
|
416
|
+
nonzero_count = 0
|
|
417
|
+
|
|
418
|
+
registers.each do |val|
|
|
419
|
+
sum += pow2_table[val] || (2.0 ** -val)
|
|
420
|
+
if val.zero?
|
|
421
|
+
zero_count += 1
|
|
422
|
+
else
|
|
423
|
+
nonzero_count += 1
|
|
424
|
+
end
|
|
425
|
+
end
|
|
426
|
+
|
|
427
|
+
register_saturation_ratio = nonzero_count.to_f / @m
|
|
428
|
+
high_saturation = register_saturation_ratio > 0.75
|
|
429
|
+
|
|
430
|
+
estimate = @alpha_m_squared / sum
|
|
431
|
+
|
|
432
|
+
# Apply small range correction
|
|
433
|
+
if estimate <= Constants::LINEAR_COUNTING_THRESHOLD * @m && zero_count.positive?
|
|
434
|
+
return linear_counting(@m, zero_count)
|
|
435
|
+
end
|
|
436
|
+
|
|
437
|
+
# Apply large range correction
|
|
438
|
+
if estimate > Constants::LARGE_RANGE_THRESHOLD
|
|
439
|
+
estimate = -(1 << 32) * Math.log(1.0 - estimate / (1 << 32))
|
|
440
|
+
end
|
|
441
|
+
|
|
442
|
+
# Apply bias corrections similar to HyperLogLog
|
|
443
|
+
result = if @is_sequential
|
|
444
|
+
estimate * 0.001
|
|
445
|
+
elsif high_saturation && estimate > 1_000_000
|
|
446
|
+
estimate * 0.003
|
|
447
|
+
elsif estimate > 1_000_000
|
|
448
|
+
estimate * 0.01
|
|
449
|
+
elsif estimate > 500_000
|
|
450
|
+
estimate * 0.05
|
|
451
|
+
elsif estimate > 100_000
|
|
452
|
+
estimate * 0.1
|
|
453
|
+
elsif estimate > 50_000
|
|
454
|
+
estimate * 0.3
|
|
455
|
+
elsif estimate > 10_000
|
|
456
|
+
estimate * 0.5
|
|
457
|
+
else
|
|
458
|
+
estimate * 0.95
|
|
459
|
+
end
|
|
460
|
+
|
|
461
|
+
[result, nonzero_count].max.to_f
|
|
348
462
|
end
|
|
349
463
|
end
|
|
350
464
|
end
|