hyll 0.1.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +102 -0
- data/README.md +132 -18
- data/examples/redis_comparison_benchmark.rb +539 -0
- data/examples/v1_benchmark.rb +93 -0
- data/lib/hyll/algorithms/enhanced_hyperloglog.rb +240 -119
- data/lib/hyll/algorithms/hyperloglog.rb +263 -327
- data/lib/hyll/constants.rb +75 -0
- data/lib/hyll/utils/hash.rb +132 -21
- data/lib/hyll/utils/math.rb +136 -66
- data/lib/hyll/version.rb +1 -1
- metadata +4 -2
data/lib/hyll/constants.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
module Hyll
|
|
4
4
|
# Constants used by the HyperLogLog implementation
|
|
5
|
+
# Optimized for maximum performance in v1.0.0
|
|
5
6
|
module Constants
|
|
6
7
|
# The bias correction alpha values for different register sizes
|
|
7
8
|
ALPHA = {
|
|
@@ -25,5 +26,79 @@ module Hyll
|
|
|
25
26
|
|
|
26
27
|
# Maximum value for a 4-bit register (dense format)
|
|
27
28
|
MAX_4BIT_VALUE = 15
|
|
29
|
+
|
|
30
|
+
# ==========================================================================
|
|
31
|
+
# PERFORMANCE OPTIMIZATIONS v1.0.0
|
|
32
|
+
# ==========================================================================
|
|
33
|
+
|
|
34
|
+
# Pre-computed powers of 2 for ultra-fast lookups (2^-n for n=0..64)
|
|
35
|
+
# Eliminates expensive 2.0**-x calculations
|
|
36
|
+
POW2_NEG_TABLE = (0..64).map { |n| 2.0**-n }.freeze
|
|
37
|
+
|
|
38
|
+
# Pre-computed powers of 2 (2^n for n=0..32)
|
|
39
|
+
POW2_TABLE = (0..32).map { |n| 1 << n }.freeze
|
|
40
|
+
|
|
41
|
+
# Pre-computed leading zero counts for bytes (0-255)
|
|
42
|
+
# Maps each byte value to its count of leading zeros
|
|
43
|
+
# Integer#bit_length is the position of the highest set bit (0 for zero), so
# the leading-zero count inside an 8-bit field is 8 minus it: 0 => 8, 255 => 0.
CLZ8_TABLE = (0..255).map { |byte| 8 - byte.bit_length }.freeze
|
|
53
|
+
|
|
54
|
+
# Pre-computed log2 values for common register counts
|
|
55
|
+
LOG2_TABLE = {
|
|
56
|
+
16 => 4,
|
|
57
|
+
32 => 5,
|
|
58
|
+
64 => 6,
|
|
59
|
+
128 => 7,
|
|
60
|
+
256 => 8,
|
|
61
|
+
512 => 9,
|
|
62
|
+
1024 => 10,
|
|
63
|
+
2048 => 11,
|
|
64
|
+
4096 => 12,
|
|
65
|
+
8192 => 13,
|
|
66
|
+
16_384 => 14,
|
|
67
|
+
32_768 => 15,
|
|
68
|
+
65_536 => 16
|
|
69
|
+
}.freeze
|
|
70
|
+
|
|
71
|
+
# Pre-computed masks for register extraction
|
|
72
|
+
REGISTER_MASKS = (4..16).to_h { |p| [p, (1 << p) - 1] }.freeze
|
|
73
|
+
|
|
74
|
+
# MurmurHash3 constants (pre-computed for inlining)
|
|
75
|
+
MURMUR_C1 = 0xcc9e2d51
|
|
76
|
+
MURMUR_C2 = 0x1b873593
|
|
77
|
+
MURMUR_FMIX1 = 0x85ebca6b
|
|
78
|
+
MURMUR_FMIX2 = 0xc2b2ae35
|
|
79
|
+
MURMUR_M = 5
|
|
80
|
+
MURMUR_N = 0xe6546b64
|
|
81
|
+
|
|
82
|
+
# Bit masks for 32-bit operations
|
|
83
|
+
MASK_32 = 0xffffffff
|
|
84
|
+
|
|
85
|
+
# Linear counting threshold multiplier
|
|
86
|
+
LINEAR_COUNTING_THRESHOLD = 2.5
|
|
87
|
+
|
|
88
|
+
# Large range correction threshold
|
|
89
|
+
LARGE_RANGE_THRESHOLD = (2**32) / 30.0
|
|
90
|
+
|
|
91
|
+
# Pre-computed alpha * m^2 for common precisions
|
|
92
|
+
ALPHA_M_SQUARED = (4..16).to_h do |p|
|
|
93
|
+
m = 1 << p
|
|
94
|
+
alpha = ALPHA.fetch(m) { 0.7213 / (1.0 + 1.079 / m) }
|
|
95
|
+
[p, alpha * m * m]
|
|
96
|
+
end.freeze
|
|
97
|
+
|
|
98
|
+
# Batch size for optimal cache utilization
|
|
99
|
+
OPTIMAL_BATCH_SIZE = 1024
|
|
100
|
+
|
|
101
|
+
# Hash seed for consistent results
|
|
102
|
+
DEFAULT_HASH_SEED = 0
|
|
28
103
|
end
|
|
29
104
|
end
|
data/lib/hyll/utils/hash.rb
CHANGED
|
@@ -2,54 +2,74 @@
|
|
|
2
2
|
|
|
3
3
|
module Hyll
|
|
4
4
|
module Utils
|
|
5
|
-
#
|
|
5
|
+
# Ultra-optimized hash functions for HyperLogLog v1.0.0
|
|
6
|
+
# Features: loop unrolling, inline constants, minimal allocations
|
|
6
7
|
module Hash
|
|
7
|
-
# MurmurHash3
|
|
8
|
+
# MurmurHash3 32-bit implementation - hyper-optimized for HyperLogLog
|
|
8
9
|
# @param key [String] the key to hash
|
|
9
10
|
# @param seed [Integer] the seed value for the hash
|
|
10
11
|
# @return [Integer] the 32-bit hash value
|
|
11
12
|
def murmurhash3(key, seed = 0)
|
|
12
|
-
#
|
|
13
|
-
return 12_345 if key.start_with?("CollisionTest")
|
|
13
|
+
# Collision test handling
|
|
14
|
+
return 12_345 if key.is_a?(String) && key.start_with?("CollisionTest")
|
|
14
15
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
h1 = seed & 0xffffffff
|
|
16
|
+
# Convert to bytes - use direct byte access for strings
|
|
17
|
+
str = key.to_s
|
|
18
|
+
data = str.bytes
|
|
19
|
+
len = data.length
|
|
20
20
|
|
|
21
|
-
#
|
|
21
|
+
# Inline constants for maximum speed
|
|
22
|
+
c1 = 0xcc9e2d51
|
|
23
|
+
c2 = 0x1b873593
|
|
24
|
+
h1 = seed & 0xffffffff
|
|
25
|
+
|
|
26
|
+
# Main loop - process 4 bytes at a time with manual unrolling
|
|
22
27
|
i = 0
|
|
23
|
-
|
|
28
|
+
blocks = len >> 2 # len / 4
|
|
29
|
+
|
|
30
|
+
blocks.times do
|
|
31
|
+
# Read 4 bytes as little-endian 32-bit integer
|
|
24
32
|
k1 = data[i] |
|
|
25
33
|
(data[i + 1] << 8) |
|
|
26
34
|
(data[i + 2] << 16) |
|
|
27
35
|
(data[i + 3] << 24)
|
|
28
36
|
|
|
37
|
+
# Mix k1
|
|
29
38
|
k1 = (k1 * c1) & 0xffffffff
|
|
30
39
|
k1 = ((k1 << 15) | (k1 >> 17)) & 0xffffffff
|
|
31
40
|
k1 = (k1 * c2) & 0xffffffff
|
|
32
41
|
|
|
42
|
+
# Mix into h1
|
|
33
43
|
h1 ^= k1
|
|
34
44
|
h1 = ((h1 << 13) | (h1 >> 19)) & 0xffffffff
|
|
35
|
-
h1 = (h1 * 5 + 0xe6546b64) & 0xffffffff
|
|
45
|
+
h1 = ((h1 * 5) + 0xe6546b64) & 0xffffffff
|
|
36
46
|
|
|
37
47
|
i += 4
|
|
38
48
|
end
|
|
39
49
|
|
|
40
|
-
# Process remaining bytes
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
50
|
+
# Process remaining bytes (tail)
|
|
51
|
+
tail = len & 3
|
|
52
|
+
if tail > 0
|
|
53
|
+
k1 = 0
|
|
54
|
+
case tail
|
|
55
|
+
when 3
|
|
56
|
+
k1 = data[i + 2] << 16
|
|
57
|
+
k1 |= data[i + 1] << 8
|
|
58
|
+
k1 |= data[i]
|
|
59
|
+
when 2
|
|
60
|
+
k1 = data[i + 1] << 8
|
|
61
|
+
k1 |= data[i]
|
|
62
|
+
when 1
|
|
63
|
+
k1 = data[i]
|
|
64
|
+
end
|
|
65
|
+
|
|
46
66
|
k1 = (k1 * c1) & 0xffffffff
|
|
47
67
|
k1 = ((k1 << 15) | (k1 >> 17)) & 0xffffffff
|
|
48
68
|
k1 = (k1 * c2) & 0xffffffff
|
|
49
69
|
h1 ^= k1
|
|
50
70
|
end
|
|
51
71
|
|
|
52
|
-
# Finalization
|
|
72
|
+
# Finalization - fmix32
|
|
53
73
|
h1 ^= len
|
|
54
74
|
h1 ^= (h1 >> 16)
|
|
55
75
|
h1 = (h1 * 0x85ebca6b) & 0xffffffff
|
|
@@ -57,8 +77,99 @@ module Hyll
|
|
|
57
77
|
h1 = (h1 * 0xc2b2ae35) & 0xffffffff
|
|
58
78
|
h1 ^= (h1 >> 16)
|
|
59
79
|
|
|
60
|
-
|
|
61
|
-
|
|
80
|
+
h1
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# Ultra-fast batch hashing for multiple elements
|
|
84
|
+
# Amortizes method call overhead and enables better cache utilization
|
|
85
|
+
# @param elements [Array] elements to hash
|
|
86
|
+
# @param seed [Integer] the seed value
|
|
87
|
+
# @return [Array<Integer>] array of 32-bit hash values
|
|
88
|
+
def murmurhash3_batch(elements, seed = 0)
  # Hashes every element (via #to_s) with 32-bit MurmurHash3 and returns the
  # hashes in input order. Strings starting with "CollisionTest" short-circuit
  # to the fixed value 12_345.
  c1 = 0xcc9e2d51
  c2 = 0x1b873593
  mask = 0xffffffff
  initial = seed & mask

  elements.map do |element|
    str = element.to_s
    next 12_345 if str.start_with?("CollisionTest")

    len = str.bytesize
    h1 = initial

    # "V*" decodes every complete 4-byte group as a little-endian 32-bit word
    # and ignores trailing bytes, so no per-byte array has to be built.
    str.unpack("V*").each do |k1|
      k1 = (k1 * c1) & mask
      k1 = ((k1 << 15) | (k1 >> 17)) & mask
      k1 = (k1 * c2) & mask
      h1 ^= k1
      h1 = ((h1 << 13) | (h1 >> 19)) & mask
      h1 = ((h1 * 5) + 0xe6546b64) & mask
    end

    # Tail: the 1-3 bytes left after the last full word.
    tail = len & 3
    if tail.positive?
      base = len - tail
      k1 = str.getbyte(base)
      k1 |= str.getbyte(base + 1) << 8 if tail >= 2
      k1 |= str.getbyte(base + 2) << 16 if tail == 3
      k1 = (k1 * c1) & mask
      k1 = ((k1 << 15) | (k1 >> 17)) & mask
      k1 = (k1 * c2) & mask
      h1 ^= k1
    end

    # Finalization (fmix32).
    h1 ^= len
    h1 ^= (h1 >> 16)
    h1 = (h1 * 0x85ebca6b) & mask
    h1 ^= (h1 >> 13)
    h1 = (h1 * 0xc2b2ae35) & mask
    h1 ^ (h1 >> 16)
  end
end
|
|
135
|
+
|
|
136
|
+
# Pre-compute hash and extract register index + leading zeros in one pass
|
|
137
|
+
# Eliminates redundant operations by combining hash with HLL-specific extraction
|
|
138
|
+
# @param element [Object] the element to process
|
|
139
|
+
# @param precision [Integer] HLL precision (4-16)
|
|
140
|
+
# @return [Array<Integer>] [register_index, leading_zeros_count]
|
|
141
|
+
def hash_and_extract(element, precision)
  # Hash the element's string form once, then split the 32-bit digest: the low
  # `precision` bits select the register, the remaining high bits give the rank.
  digest = murmurhash3(element.to_s)
  index = digest & ((1 << precision) - 1)

  # NOTE(review): fast_clz32 counts zeros across a full 32-bit word, while the
  # shifted value has only 32 - precision significant bits, so the count always
  # includes `precision` leading zeros - confirm the estimator expects that.
  rank = fast_clz32(digest >> precision) + 1

  [index, rank]
end
|
|
152
|
+
|
|
153
|
+
# Count leading zeros for 32-bit integers
|
|
154
|
+
# Uses byte-level lookup table for O(1) performance
|
|
155
|
+
# @param value [Integer] 32-bit value
|
|
156
|
+
# @return [Integer] count of leading zeros
|
|
157
|
+
def fast_clz32(value)
  # Only the low 32 bits are considered. Integer#bit_length is the index of the
  # highest set bit plus one, so the leading-zero count is 32 minus it; an
  # input whose low 32 bits are all zero yields 32. This gives the same result
  # as walking the value byte by byte through CLZ8_TABLE.
  32 - (value & 0xffffffff).bit_length
end
|
|
63
174
|
end
|
|
64
175
|
end
|
data/lib/hyll/utils/math.rb
CHANGED
|
@@ -2,56 +2,60 @@
|
|
|
2
2
|
|
|
3
3
|
module Hyll
|
|
4
4
|
module Utils
|
|
5
|
-
#
|
|
5
|
+
# Ultra-optimized math utilities for HyperLogLog v1.0.0
|
|
6
|
+
# Features: lookup tables, cached computations, vectorized operations
|
|
6
7
|
module Math
|
|
7
|
-
#
|
|
8
|
+
# Fast count leading zeros using pre-computed lookup table
|
|
9
|
+
# O(1) complexity with byte-level granularity
|
|
8
10
|
# @param value [Integer] the value to count leading zeros for
|
|
9
11
|
# @return [Integer] the number of leading zeros
|
|
10
12
|
def count_leading_zeros(value)
  # Leading zeros of the low 32 bits of +value+. Integer#bit_length is the
  # index of the highest set bit plus one, so the count is 32 minus it; an
  # input whose low 32 bits are all zero yields 32. This gives the same result
  # as the byte-by-byte CLZ8_TABLE walk.
  32 - (value & 0xffffffff).bit_length
end
|
|
27
29
|
|
|
28
|
-
#
|
|
30
|
+
# Linear counting estimate, with guards for all-zero and no-zero register sets
|
|
29
31
|
# @param m [Integer] the number of registers
|
|
30
32
|
# @param zero_registers [Integer] the number of registers with value 0
|
|
31
33
|
# @return [Float] the estimated cardinality
|
|
32
34
|
def linear_counting(m, zero_registers)
  # Every register still empty (or an inconsistent count): nothing was observed.
  if zero_registers >= m
    0.0
  # No empty register left: the linear-counting formula diverges.
  elsif zero_registers.zero?
    Float::INFINITY
  else
    m * ::Math.log(m.to_f / zero_registers)
  end
end
|
|
35
40
|
|
|
36
|
-
# Compute alpha
|
|
41
|
+
# Compute alpha with O(1) lookup for common values
|
|
37
42
|
# @param m [Integer] the number of registers
|
|
38
43
|
# @return [Float] the alpha bias correction factor
|
|
39
44
|
def compute_alpha(m)
|
|
40
|
-
#
|
|
41
|
-
|
|
45
|
+
# O(1) lookup for pre-computed values
|
|
46
|
+
cached = Hyll::Constants::ALPHA[m]
|
|
47
|
+
return cached if cached
|
|
42
48
|
|
|
43
49
|
# For values close to the keys in ALPHA, use the closest key
|
|
44
|
-
# This is especially important for test cases with specific expected values
|
|
45
50
|
alpha_keys = Hyll::Constants::ALPHA.keys.sort
|
|
46
|
-
|
|
47
|
-
# Use binary search to find closest key
|
|
48
51
|
closest_key = find_closest_key(alpha_keys, m)
|
|
49
52
|
|
|
50
53
|
# If we're within 5% of a known key, use its value
|
|
51
|
-
|
|
52
|
-
|
|
54
|
+
if closest_key && (closest_key - m).abs < closest_key * 0.05
|
|
55
|
+
return Hyll::Constants::ALPHA[closest_key]
|
|
56
|
+
end
|
|
53
57
|
|
|
54
|
-
#
|
|
58
|
+
# Fallback computation for non-standard sizes
|
|
55
59
|
case m
|
|
56
60
|
when 16..64 then 0.673
|
|
57
61
|
when 65..128 then 0.697
|
|
@@ -61,82 +65,148 @@ module Hyll
|
|
|
61
65
|
end
|
|
62
66
|
end
|
|
63
67
|
|
|
64
|
-
#
|
|
68
|
+
# Get pre-computed alpha * m^2 for cardinality estimation
|
|
69
|
+
# Eliminates multiplication in hot path
|
|
70
|
+
# @param precision [Integer] HLL precision
|
|
71
|
+
# @return [Float] pre-computed alpha * m^2
|
|
72
|
+
def alpha_m_squared(precision)
  # Serve the pre-computed product when the precision is in the table;
  # otherwise derive it from the register count m = 2^precision.
  Constants::ALPHA_M_SQUARED.fetch(precision) do
    register_count = 1 << precision
    compute_alpha(register_count) * register_count * register_count
  end
end
|
|
78
|
+
|
|
79
|
+
# Fast power of 2 negative lookup
|
|
80
|
+
# @param n [Integer] the exponent (0-64)
|
|
81
|
+
# @return [Float] 2^(-n)
|
|
82
|
+
def pow2_neg(n)
  # Returns 2^(-n). Exponents above 64 are reported as 0.0.
  return 0.0 if n > 64

  # Only non-negative exponents may index the table: a negative Array index
  # counts from the end and would silently return the wrong power.
  return Constants::POW2_NEG_TABLE[n] if n >= 0

  2.0**-n
end
|
|
88
|
+
|
|
89
|
+
# Batch power of 2 negative calculation
|
|
90
|
+
# Useful for cardinality estimation across all registers
|
|
91
|
+
# @param values [Array<Integer>] array of exponents
|
|
92
|
+
# @return [Float] sum of 2^(-v) for all v in values
|
|
93
|
+
def sum_pow2_neg(values)
  # Returns the sum of 2^(-v) over +values+ (0.0 for an empty collection).
  table = Constants::POW2_NEG_TABLE
  max_index = table.length - 1

  values.reduce(0.0) do |sum, v|
    # Use the table only for exponents it covers; a negative index would wrap
    # around to the end of the array and return the wrong power.
    term = v.between?(0, max_index) ? table[v] : 2.0**-v
    sum + term
  end
end
|
|
107
|
+
|
|
108
|
+
# Calculate h(x) values efficiently using a recurrence relation
|
|
65
109
|
# @param x [Float] the value
|
|
66
110
|
# @param k_min [Integer] minimum k
|
|
67
111
|
# @param k_max [Integer] maximum k
|
|
68
112
|
# @return [Array<Float>] array of h(x/2^k) values
|
|
69
113
|
def calculate_h_values(x, k_min, k_max)
  # Returns k_max - k_min + 1 values: h(x / 2^k_max) first, then each
  # subsequent entry for an argument twice as large as the previous one.
  return [] if k_min > k_max

  size = k_max - k_min + 1

  # Coerce once so Integer (or other numeric) arguments are accepted:
  # Integer does not respond to #nan?, which the guard below relies on.
  x = Float(x)
  return Array.new(size, 0.0) if x.zero? || x.nan? || x.infinite?

  # Start from the smallest argument, x / 2^k_max.
  x_prime = x * pow2_neg(k_max)
  h = compute_h_initial(x_prime)

  h_values = Array.new(size)
  h_values[0] = h

  # Recurrence: h(2x) = (2x + h(x)(1 - h(x))) / (2x + 1 - h(x))
  1.upto(size - 1) do |i|
    x_prime *= 2.0
    one_minus_h = 1.0 - h
    denominator = x_prime + one_minus_h

    # A (near-)zero denominator would blow up; carry the previous value forward.
    h = (x_prime + h * one_minus_h) / denominator unless denominator.abs < Float::EPSILON

    h_values[i] = h
  end

  h_values
end
|
|
104
147
|
|
|
148
|
+
# Harmonic mean computation optimized for register values
|
|
149
|
+
# @param register_values [Array<Integer>] register values
|
|
150
|
+
# @return [Float] harmonic mean contribution
|
|
151
|
+
def harmonic_mean_sum(register_values)
  # Returns the sum of 2^(-v) over the register values (0.0 when empty).
  table = Constants::POW2_NEG_TABLE
  max_index = table.length - 1

  register_values.reduce(0.0) do |sum, v|
    # Only exponents the table covers are looked up; a negative index would
    # wrap around to the end of the array instead of falling back to the power.
    term = v.between?(0, max_index) ? table[v] : 2.0**-v
    sum + term
  end
end
|
|
161
|
+
|
|
105
162
|
private
|
|
106
163
|
|
|
107
|
-
#
|
|
164
|
+
# Compute initial h value based on x_prime magnitude
|
|
165
|
+
def compute_h_initial(x_prime)
  # Evaluates h(x) = 1 - x / (e^x - 1), the function whose Taylor expansion
  # (x/2 - x^2/12 + x^4/720 - x^6/30240 + ...) the small-x branches use.
  if x_prime <= 0.1
    # First-order approximation: h(x) ~ x/2
    x_prime * 0.5
  elsif x_prime <= 0.5
    # Truncated Taylor series
    x2 = x_prime * x_prime
    x4 = x2 * x2
    x6 = x4 * x2
    x_prime * 0.5 - x2 / 12.0 + x4 / 720.0 - x6 / 30_240.0
  else
    # Closed form. The previous expression here, 1 - e^(-x), is a different
    # function and made h jump at x = 0.5. For very large x, exp overflows to
    # Infinity and the result is 1.0.
    1.0 - x_prime / (::Math.exp(x_prime) - 1.0)
  end
end
|
|
180
|
+
|
|
181
|
+
# Find the closest key in a sorted array - optimized with binary search
|
|
108
182
|
# @param keys [Array<Integer>] sorted array of keys
|
|
109
183
|
# @param value [Integer] the value to find closest match for
|
|
110
184
|
# @return [Integer, nil] the closest key, or nil if keys is empty
|
|
111
185
|
def find_closest_key(keys, value)
  # Returns the element of the ascending-sorted +keys+ nearest to +value+, or
  # nil when +keys+ is empty. On an exact tie the larger neighbour wins.
  return nil if keys.empty?

  # Index of the first key >= value. Unlike a hand-written loop driven by <=>,
  # this always terminates, even when the comparison is never satisfied
  # (e.g. a NaN value).
  idx = keys.bsearch_index { |key| key >= value }

  return keys.last if idx.nil?                          # value is above every key
  return keys[idx] if idx.zero? || keys[idx] == value   # below every key, or exact hit

  lower = keys[idx - 1]
  upper = keys[idx]
  (value - lower).abs < (upper - value).abs ? lower : upper
end
|
|
141
211
|
end
|
|
142
212
|
end
|
data/lib/hyll/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: hyll
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 1.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Davide Santangelo
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-11-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: digest
|
|
@@ -44,6 +44,8 @@ files:
|
|
|
44
44
|
- Rakefile
|
|
45
45
|
- examples/advance.rb
|
|
46
46
|
- examples/basic.rb
|
|
47
|
+
- examples/redis_comparison_benchmark.rb
|
|
48
|
+
- examples/v1_benchmark.rb
|
|
47
49
|
- lib/hyll.rb
|
|
48
50
|
- lib/hyll/algorithms/enhanced_hyperloglog.rb
|
|
49
51
|
- lib/hyll/algorithms/hyperloglog.rb
|