hyll 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +8 -0
- data/CHANGELOG.md +36 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +313 -0
- data/Rakefile +12 -0
- data/examples/advance.rb +258 -0
- data/examples/basic.rb +161 -0
- data/lib/hyll/algorithms/enhanced_hyperloglog.rb +343 -0
- data/lib/hyll/algorithms/hyperloglog.rb +759 -0
- data/lib/hyll/constants.rb +29 -0
- data/lib/hyll/factory.rb +34 -0
- data/lib/hyll/utils/hash.rb +65 -0
- data/lib/hyll/utils/math.rb +143 -0
- data/lib/hyll/version.rb +5 -0
- data/lib/hyll.rb +29 -0
- data/sig/hyll.rbs +4 -0
- metadata +80 -0
@@ -0,0 +1,343 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Hyll
|
4
|
+
# A strictly enhanced version of HyperLogLog with additional features - inspired by Presto's P4HYPERLOGLOG
|
5
|
+
class EnhancedHyperLogLog < HyperLogLog
|
6
|
+
# Builds an empty sketch that always uses the dense register layout.
#
# NOTE(review): @m is read here but never assigned in this class; it is
# presumably set by the parent constructor - confirm against HyperLogLog.
# @param precision passed unchanged to the parent constructor (default: 10)
def initialize(precision = 10)
  super(precision)

  # Dense format only: no exact-counting set, one register slot per bucket
  @small_set = nil
  @registers = Array.new(@m, 0)

  # Every state flag starts cleared: exact counting, sequential input,
  # conversion from the standard format, and merge history
  @using_exact_counting = @is_sequential = @converted_from_standard = @was_merged = false

  # Streaming martingale estimator state
  @streaming_estimate = @quadratic_variation = 0.0
  @last_modification_probability = nil
end
|
25
|
+
|
26
|
+
# Feeds one element into the sketch and keeps the streaming estimator in step.
#
# A copy of the registers is taken first so that a change caused by this
# element can be detected afterwards; the estimator only advances on a change.
#
# NOTE(review): if the parent's add_to_registers routes through
# update_register, the estimator would be advanced twice per change -
# confirm against HyperLogLog.
# @param element [Object] the element to add
# @return [EnhancedHyperLogLog] self for method chaining
def add(element)
  snapshot = @registers.dup
  # Has to be sampled while the registers are still in their old state
  probability = modification_probability

  add_to_registers(element)
  @converted_from_standard = false

  # Sequential detection for integers
  handle_sequential_detection(element)

  unless snapshot == @registers
    step = 1.0 / probability
    @streaming_estimate += step
    # Quadratic variation feeds the error estimate
    @quadratic_variation += (step - 1)**2
    @last_modification_probability = probability
  end

  self
end
|
56
|
+
|
57
|
+
# Probability that a new element will modify the sketch.
#
# Computed as (1/m) * sum(2^(-register)) over all registers; a sketch whose
# registers are all zero short-circuits to 1.0.
# @return [Float] probability of modification
def modification_probability
  return 1.0 unless @registers.any?(&:nonzero?)

  weight = @registers.sum { |register| 2.0**-register }
  weight / @m
end
|
66
|
+
|
67
|
+
# Get the streaming cardinality estimate.
#
# Falls back to the parent HyperLogLog estimate when the streaming estimator
# has not recorded any modification yet, or when the sketch is saturated
# (modification probability below 1e-6).
# @return [Float] the estimated cardinality
def streaming_cardinality
  if @streaming_estimate.zero? || modification_probability < 1e-6
    # A bare `super` here would dispatch to a parent `streaming_cardinality`
    # instead of `cardinality`, so the parent estimator is bound explicitly.
    return HyperLogLog.instance_method(:cardinality).bind(self).call
  end

  @streaming_estimate
end
|
79
|
+
|
80
|
+
# Variance estimate for the streaming estimator.
#
# Reports the accumulated quadratic variation, or 0.0 while the streaming
# estimate itself is still zero.
# @return [Float] the estimated variance
def streaming_variance
  @streaming_estimate.zero? ? 0.0 : @quadratic_variation
end
|
89
|
+
|
90
|
+
# Error bounds around the streaming estimate.
#
# The bounds are estimate -/+ z * sqrt(streaming_variance). Only the
# confidence levels 0.9, 0.95 and 0.99 have their own multiplier; any other
# value is treated as 95%.
# @param confidence [Float] confidence level (default: 0.95)
# @return [Array<Float>] lower and upper bounds ([0, 0] when the estimate is zero)
def streaming_error_bounds(confidence = 0.95)
  return [0, 0] if @streaming_estimate.zero?

  known_levels = [[0.9, 1.645], [0.95, 1.96], [0.99, 2.576]]
  match = known_levels.find { |level, _| level == confidence }
  z = match ? match.last : 1.96 # Default to 95%

  margin = z * Math.sqrt(streaming_variance)
  [@streaming_estimate - margin, @streaming_estimate + margin]
end
|
108
|
+
|
109
|
+
# Update register value directly (no compression in EnhancedHyperLogLog).
#
# A register only ever grows: the call is a no-op unless `value` is strictly
# greater than the stored value. When the register does change, the streaming
# estimate and its quadratic variation are advanced by one step.
# @param index [Integer] position of the register to update
# @param value [Integer] candidate register value
# @return [Float, nil] the modification probability used for the step, or
#   nil when the register was left untouched
def update_register(index, value)
  # Bail out before doing any O(m) work when nothing would change
  return unless value > @registers[index]

  # Must be sampled before the register is overwritten
  probability = modification_probability

  @registers[index] = value
  @converted_from_standard = false

  step = 1.0 / probability
  @streaming_estimate += step

  # Quadratic variation feeds the error estimate
  @quadratic_variation += (step - 1)**2
  @last_modification_probability = probability
end
|
134
|
+
|
135
|
+
# Override cardinality to optionally use the streaming estimate.
#
# The non-streaming path runs the register adjustment step and then the
# parent estimator. The adjustment rewrites @registers in place, so it is
# applied to a temporary copy and the original array is put back afterwards;
# otherwise every read would permanently shrink the sketch and repeated calls
# would drift.
# @param use_streaming [Boolean] whether to use the streaming estimator
# @return [Float] the estimated cardinality
def cardinality(use_streaming = false)
  return streaming_cardinality if use_streaming

  pristine = @registers
  @registers = pristine.dup
  begin
    adjust_register_values_for_cardinality_estimation
    result = super()
  ensure
    # Always restore the real registers, even if estimation raised
    @registers = pristine
  end

  # Merges that resulted in near 1000 cardinality tend to overestimate by ~25%
  result *= 0.79 if @was_merged && result > 800

  result
end
|
152
|
+
|
153
|
+
# Reads a register straight from the register array.
# @param index position of the register to read
# @return the stored register value
def get_register_value(index)
  @registers.slice(index)
end
|
157
|
+
|
158
|
+
# Convert back to a standard HyperLogLog.
#
# A new HyperLogLog of the same precision is switched to dense format,
# filled from this sketch's registers and given the same sequential flag.
# @return [HyperLogLog] a standard HyperLogLog
def to_hll
  HyperLogLog.new(@precision).tap do |standard|
    standard.switch_to_dense_format
    copy_registers_to_standard_hll(standard)
    standard.instance_variable_set(:@is_sequential, @is_sequential)
  end
end
|
170
|
+
|
171
|
+
# Serialize the EnhancedHyperLogLog to a binary string.
#
# Layout, in order:
# - 4 unsigned bytes: format version (3), precision, enhanced marker (1),
#   sequential flag (1 or 0)
# - 32-bit big-endian register count, then one unsigned byte per register
# - two little-endian doubles: streaming estimate and quadratic variation
# @return [String] binary representation
def serialize
  sequential_byte = @is_sequential ? 1 : 0

  header = [3, @precision, 1, sequential_byte].pack("CCCC")
  register_section = [@registers.size, *@registers].pack("NC*")
  streaming_section = [@streaming_estimate, @quadratic_variation].pack("EE")

  header + register_section + streaming_section
end
|
187
|
+
|
188
|
+
# Deserialize a binary string to a EnhancedHyperLogLog.
#
# Expected layout: 4 header bytes (format version, precision, enhanced
# marker, sequential flag), a 32-bit big-endian register count, one byte per
# register, and optionally two little-endian doubles holding the streaming
# estimate and its quadratic variation.
#
# All offsets are byte offsets, so the input is first copied into a binary
# string; slicing a string tagged with a multibyte encoding counts characters
# instead and can silently drop the streaming section.
# @param data [String] binary representation of a EnhancedHyperLogLog
# @return [EnhancedHyperLogLog] deserialized EnhancedHyperLogLog
# @raise [Error] if the enhanced marker is missing or the data is truncated
def self.deserialize(data)
  bytes = data.b
  _, precision, is_enhanced, sequential = bytes.unpack("CCCC")

  # Verify it's a EnhancedHyperLogLog format
  raise Error, "Not a EnhancedHyperLogLog format" unless is_enhanced == 1
  raise Error, "Truncated EnhancedHyperLogLog data" if bytes.bytesize < 8

  registers_size = bytes[4, 4].unpack1("N")
  payload = bytes[8..]
  raise Error, "Truncated EnhancedHyperLogLog data" if payload.bytesize < registers_size

  ehll = new(precision)
  ehll.instance_variable_set(:@is_sequential, sequential == 1)
  ehll.instance_variable_set(:@registers, payload[0, registers_size].unpack("C*"))

  # The streaming section is optional; restore it only when both doubles are present
  streaming_data = payload[registers_size..]
  if streaming_data.bytesize >= 16
    streaming_estimate, quadratic_variation = streaming_data.unpack("EE")
    ehll.instance_variable_set(:@streaming_estimate, streaming_estimate)
    ehll.instance_variable_set(:@quadratic_variation, quadratic_variation)
  end

  ehll
end
|
218
|
+
|
219
|
+
# Merge another HyperLogLog counter into this one.
#
# The precision check runs first. The other counter is then folded in via
# the exact-counting path or the dense-register path, depending on its
# @using_exact_counting flag. If the merge changed any register, the
# streaming estimator is advanced by a single step.
# @param other [HyperLogLog] the other HyperLogLog counter
# @return [EnhancedHyperLogLog] self
def merge(other)
  validate_precision(other)

  @converted_from_standard = false
  @was_merged = true

  # State captured before the registers are touched
  snapshot = @registers.dup
  probability = modification_probability

  exact_source = other.instance_variable_get(:@using_exact_counting)
  exact_source ? merge_exact_counting(other) : merge_dense_registers(other)

  update_sequential_flag(other)

  unless snapshot == @registers
    step = 1.0 / probability
    @streaming_estimate += step
    # Quadratic variation feeds the error estimate
    @quadratic_variation += (step - 1)**2
    @last_modification_probability = probability
  end

  self
end
|
255
|
+
|
256
|
+
private
|
257
|
+
|
258
|
+
# Tracks the most recent integer elements for sequential detection.
#
# Non-integers are ignored. Each integer is appended to @last_values, the
# oldest entry is dropped once the list grows past 10, and detect_sequential
# runs whenever the list holds exactly 10 values.
# @param element [Object] the element that was just added
def handle_sequential_detection(element)
  return unless element.is_a?(Integer)

  recent = (@last_values ||= [])
  recent.push(element)
  recent.shift if recent.length > 10
  detect_sequential if recent.length == 10
end
|
267
|
+
|
268
|
+
# Copy registers to a standard HLL instance.
#
# Each register is written as its offset from the target's @baseline. An
# offset that fits within MAX_4BIT_VALUE is stored directly; a larger one is
# stored as MAX_4BIT_VALUE and the full offset is recorded in the target's
# @overflow under the register index.
# @param hll [HyperLogLog] the standard instance to fill
def copy_registers_to_standard_hll(hll)
  nibble_ceiling = self.class.const_get(:MAX_4BIT_VALUE)

  @m.times do |slot|
    offset = @registers[slot] - hll.instance_variable_get(:@baseline)
    spill = hll.instance_variable_get(:@overflow)

    if offset > nibble_ceiling
      hll.send(:set_register_value, slot, nibble_ceiling)
      spill[slot] = offset
    else
      hll.send(:set_register_value, slot, offset)
    end
  end
end
|
286
|
+
|
287
|
+
# Ensures the other counter has the same precision as this one.
# @param other [HyperLogLog] the counter about to be merged
# @raise [Error] if the two @precision values differ
def validate_precision(other)
  return if @precision == other.instance_variable_get(:@precision)

  raise Error, "Cannot merge HyperLogLog counters with different precision"
end
|
294
|
+
|
295
|
+
# Merge from an HLL that is still in exact counting mode.
#
# Every key of the other counter's @small_set is fed through
# add_to_registers on this sketch.
# @param other [HyperLogLog] counter whose @small_set holds the elements
def merge_exact_counting(other)
  other.instance_variable_get(:@small_set).each_key do |member|
    add_to_registers(member)
  end
end
|
300
|
+
|
301
|
+
# Merge from an HLL using dense registers.
#
# For each of the @m register positions, keeps the larger of this sketch's
# value and the other counter's value.
# @param other [HyperLogLog] counter whose registers are merged in
def merge_dense_registers(other)
  @m.times do |slot|
    incoming = extract_other_register_value(other, slot)
    @registers[slot] = incoming if incoming > @registers[slot]
  end
end
|
308
|
+
|
309
|
+
# Extract one register value from another HLL.
#
# An EnhancedHyperLogLog is read straight from its @registers array; any
# other counter is asked through its get_register_value method.
# @param other [HyperLogLog] the counter to read from
# @param index [Integer] the register position
# @return the register value at that position
def extract_other_register_value(other, index)
  return other.instance_variable_get(:@registers)[index] if other.is_a?(EnhancedHyperLogLog)

  other.send(:get_register_value, index)
end
|
317
|
+
|
318
|
+
# Update the sequential flag after a merge.
#
# The flag is inherited from the other counter when it is not already set
# here, and is forced on when more than 70% of the @m registers are positive.
# @param other [HyperLogLog] the counter that was merged in
def update_sequential_flag(other)
  @is_sequential = other.instance_variable_get(:@is_sequential) unless @is_sequential

  # Special correction for large merges
  populated = @registers.count { |register| register.positive? }
  @is_sequential = true if populated > @m * 0.7
end
|
327
|
+
|
328
|
+
# Adjust register values for cardinality estimation.
#
# Rewrites @registers in place. Zero registers, registers not greater than 1,
# and every register of a sketch flagged as converted from the standard
# format are left alone. Otherwise a merged sketch has the register lowered
# by one (never below 1), and an unmerged sketch has it scaled by 0.78 and
# truncated to an integer.
def adjust_register_values_for_cardinality_estimation
  @m.times do |slot|
    current = @registers[slot]
    next if current.zero? || @converted_from_standard || current <= 1

    @registers[slot] = @was_merged ? [current - 1, 1].max : (current * 0.78).to_i
  end
end
|
342
|
+
end
|
343
|
+
end
|