hyll 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +8 -0
- data/CHANGELOG.md +36 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +313 -0
- data/Rakefile +12 -0
- data/examples/advance.rb +258 -0
- data/examples/basic.rb +161 -0
- data/lib/hyll/algorithms/enhanced_hyperloglog.rb +343 -0
- data/lib/hyll/algorithms/hyperloglog.rb +759 -0
- data/lib/hyll/constants.rb +29 -0
- data/lib/hyll/factory.rb +34 -0
- data/lib/hyll/utils/hash.rb +65 -0
- data/lib/hyll/utils/math.rb +143 -0
- data/lib/hyll/version.rb +5 -0
- data/lib/hyll.rb +29 -0
- data/sig/hyll.rbs +4 -0
- metadata +80 -0
data/examples/advance.rb
ADDED
@@ -0,0 +1,258 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "hyll"
|
4
|
+
|
5
|
+
# ADVANCED USAGE EXAMPLES
|
6
|
+
|
7
|
+
# Define User class outside of method scope
|
8
|
+
# Simple value holder used by the custom data types example.
class User
  attr_reader :id, :email

  # @param id [Object] identifier shown in the string form
  # @param email [String] e-mail address shown in the string form
  def initialize(id, email)
    @id = id
    @email = email
  end

  # String form built from both attributes; presumably this is what the
  # counter hashes when a User is added (verify against Hyll's add).
  #
  # @return [String] "User:<id>:<email>"
  def to_s
    "User:#{id}:#{email}"
  end
end
|
21
|
+
|
22
|
+
# Example 1: Estimating Intersection Size Between Sets
|
23
|
+
# Fills two precision-14 counters with 100,000 items each (30,000 of them
# shared), merges them into a union and derives the intersection size via
# inclusion-exclusion, |A| + |B| - |A u B|, from both the standard and the
# maximum-likelihood estimates. Prints every estimate and its relative error
# against the known overlap.
#
# @return [void]
def intersection_example
  puts "=== Intersection Estimation Example ==="

  hll1 = Hyll.new(precision: 14)
  hll2 = Hyll.new(precision: 14)

  total_items = 100_000
  overlap = (total_items * 0.3).to_i
  offset = total_items - overlap

  # First set: item-0 .. item-99999
  total_items.times { |i| hll1.add("item-#{i}") }

  # Second set: item-70000 .. item-169999, so exactly `overlap` items are shared
  total_items.times { |i| hll2.add("item-#{i + offset}") }

  # Merge into the object returned by to_enhanced rather than into hll1
  # itself (presumably a separate copy, so hll1 stays usable below).
  union = hll1.to_enhanced
  union.merge(hll2)

  # Inclusion-exclusion with the standard estimates
  estimate1 = hll1.cardinality
  estimate2 = hll2.cardinality
  union_estimate = union.cardinality
  intersection_estimate = estimate1 + estimate2 - union_estimate

  # Same formula with the maximum likelihood estimates
  mle_estimate1 = hll1.maximum_likelihood_cardinality
  mle_estimate2 = hll2.maximum_likelihood_cardinality
  mle_union_estimate = union.maximum_likelihood_cardinality
  mle_intersection_estimate = mle_estimate1 + mle_estimate2 - mle_union_estimate

  # Heuristic 5% reduction of the basic estimate, then the mean of the
  # corrected and MLE values as a blended figure.
  correction_factor = 0.95
  corrected_intersection = intersection_estimate * correction_factor
  combined_intersection = (corrected_intersection + mle_intersection_estimate) / 2.0

  # Insertion order is the print order for both listings below
  intersections = {
    "Basic" => intersection_estimate,
    "MLE" => mle_intersection_estimate,
    "Corrected" => corrected_intersection,
    "Combined" => combined_intersection
  }

  puts "Set A cardinality: #{estimate1.round} (MLE: #{mle_estimate1.round})"
  puts "Set B cardinality: #{estimate2.round} (MLE: #{mle_estimate2.round})"
  puts "Union cardinality: #{union_estimate.round} (MLE: #{mle_union_estimate.round})"
  puts "Estimated intersection:"
  intersections.each { |label, value| puts " - #{label}: #{value.round}" }
  puts "Actual intersection: #{overlap}"
  puts "Error rates:"
  intersections.each do |label, value|
    # Float divisor: an Integer estimate would otherwise be integer-divided
    # and the error would collapse to 0 (matches precision_benchmark).
    relative_error = (value - overlap).abs / overlap.to_f * 100
    puts " - #{label}: #{relative_error.round(2)}%"
  end
  puts "\n"
end
|
80
|
+
|
81
|
+
# Example 2: Working with Custom Data Types
|
82
|
+
# Adds 1000 User objects (sharing 800 distinct e-mail addresses) to a single
# counter, prints its estimate, then counts distinct e-mail addresses per
# domain with one counter per domain.
#
# @return [void]
def custom_data_types_example
  puts "=== Custom Data Types Example ==="

  hll = Hyll.new

  # i % 800 wraps around, so the last 200 users reuse earlier addresses
  users = Array.new(1000) { |i| User.new(i, "user#{i % 800}@example.com") }

  users.each { |user| hll.add(user) }

  # NOTE(review): User#to_s embeds the unique id, so if the counter hashes
  # objects through to_s every user is distinct and this estimate should
  # land near 1000 (users), not 800 (unique e-mails) - confirm against
  # Hyll's hashing. The per-domain counters below are the ones fed e-mails.
  puts "Added #{users.size} users with #{users.map(&:email).uniq.size} unique emails"
  puts "HyperLogLog estimate: #{hll.cardinality.round}"
  puts "\n"

  # One counter per domain, created the first time the domain is seen
  domains = Hash.new { |counters, domain| counters[domain] = Hyll.new }

  users.each do |user|
    domains[user.email.split("@").last].add(user.email)
  end

  domains.each do |domain, counter|
    puts "Domain #{domain}: ~#{counter.cardinality.round} unique emails"
  end
  # Blank line so the next example's header is separated like the others
  puts "\n"
end
|
117
|
+
|
118
|
+
# Example 3: Monitoring Stream Cardinality with Time Windows
|
119
|
+
# Simulates 24 hours x 60 minutes of events and tracks distinct user ids in
# three windows (current minute, current hour, whole day), printing the
# estimates at minutes 15, 30, 45 and 60 of every hour.
#
# @return [void]
def time_window_example
  puts "=== Time Window Monitoring Example ==="

  minute_counter = Hyll.new
  hour_counter = Hyll.new
  day_counter = Hyll.new

  puts "Simulating a stream of events with varying rates..."

  base_rate = 100

  24.times do |hour|
    puts "Hour #{hour}:"

    # Traffic multiplier depends only on the hour, so compute it once here
    time_factor = case hour
                  when 9..17 then 10 # Work hours - 10x more traffic
                  when 18..22 then 5 # Evening - 5x more traffic
                  else 1 # Late night/early morning - base traffic
                  end

    60.times do |minute|
      # Start each minute with an empty window. Resetting only once per hour
      # made this counter receive exactly the same events as hour_counter,
      # so the "Minute" figure always duplicated the "Hour" figure.
      minute_counter.reset

      # Randomize the rate between 0.5x and 1.5x of the nominal value
      rate = (base_rate * time_factor * (0.5 + rand)).to_i

      rate.times do |i|
        # Ids wrap at 10,000 and overlap between neighbouring minutes, which
        # simulates returning users
        event_id = "user-#{(hour * 60 + minute + i) % 10_000}"

        minute_counter.add(event_id)
        hour_counter.add(event_id)
        day_counter.add(event_id)
      end

      # Report only on the last minute of each quarter hour
      next unless minute % 15 == 14

      puts " Minute #{minute + 1} - Unique users in last:"
      puts " - Minute: #{minute_counter.cardinality.round}"
      puts " - Hour: #{hour_counter.cardinality.round}"
      puts " - Day so far: #{day_counter.cardinality.round}"
    end

    # Clear the hourly window after every hour except the final one
    hour_counter.reset unless hour == 23
  end

  puts "Simulation complete. Total unique users for the day: #{day_counter.cardinality.round}"
  puts "\n"
end
|
183
|
+
|
184
|
+
# Example 4: Advanced Serialization and Storage
|
185
|
+
# Populates a counter with one million ids, serializes it, round-trips the
# bytes through Base64 and deserializes them again, printing sizes and
# cardinalities along the way. Finally serializes the enhanced variant.
#
# @return [void]
def serialization_example
  puts "=== Advanced Serialization Example ==="

  item_count = 1_000_000

  hll = Hyll.new
  puts "Adding 1 million items..."
  item_count.times { |i| hll.add("user-#{i}") }

  binary = hll.serialize

  # Base64 is only needed by this example, so it is loaded on demand
  require "base64"
  base64_string = Base64.strict_encode64(binary)

  puts "Original cardinality: #{hll.cardinality.round}"
  puts "Binary serialized size: #{binary.bytesize} bytes"
  puts "Base64 serialized size: #{base64_string.bytesize} bytes"

  # Baseline assumes 8 bytes per stored item (presumably a 64-bit id).
  # Float arithmetic so that round actually rounds; with Integer operands
  # the division truncated and round was a no-op.
  raw_bytes = item_count * 8.0
  puts "Storage efficiency: #{(raw_bytes / binary.bytesize).round}x compression ratio"

  # Decode and rebuild the counter as if it had been read back from storage
  retrieved_binary = Base64.strict_decode64(base64_string)
  retrieved_hll = Hyll.deserialize(retrieved_binary)

  puts "Retrieved cardinality: #{retrieved_hll.cardinality.round}"
  puts "\n"

  # Serialize the enhanced representation of the same counter for comparison
  enhanced_hll = hll.to_enhanced
  enhanced_binary = enhanced_hll.serialize

  puts "Enhanced format serialized size: #{enhanced_binary.bytesize} bytes"
  puts "\n"
end
|
221
|
+
|
222
|
+
# Example 5: Benchmark Different Precision Levels
|
223
|
+
# Feeds the same one million distinct elements to counters of precision
# 6, 8, 10, 12 and 14 and prints one table row per precision with the
# serialized size, the estimate and the relative error.
#
# @return [void]
def precision_benchmark
  puts "=== Precision Benchmark Example ==="

  precisions = [6, 8, 10, 12, 14]
  counters = precisions.map { |bits| Hyll.new(precision: bits) }
  num_elements = 1_000_000

  puts "Benchmarking with #{num_elements} unique elements"
  puts "Precision | Memory (bytes) | Estimate | Error (%)"
  puts "----------|----------------|----------|----------"

  precisions.zip(counters).each do |precision, counter|
    num_elements.times { |n| counter.add("element-#{n}") }

    # Serialized size stands in for the memory footprint
    memory_used = counter.serialize.bytesize
    estimate = counter.cardinality
    deviation = (estimate - num_elements).abs
    error_percent = (deviation / num_elements.to_f * 100).round(2)

    puts format("%9d | %14d | %8d | %9.2f", precision, memory_used, estimate.round, error_percent)
  end
end
|
250
|
+
|
251
|
+
# Run all examples
|
252
|
+
# Invoke every example in the order it is defined, then report completion
%i[
  intersection_example
  custom_data_types_example
  time_window_example
  serialization_example
  precision_benchmark
].each { |example| send(example) }

puts "Advanced examples completed!"
|
data/examples/basic.rb
ADDED
@@ -0,0 +1,161 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "hyll"
|
4
|
+
|
5
|
+
# BASIC USAGE EXAMPLES
|
6
|
+
|
7
|
+
# Example 1: Basic Counting
# Add the integers 0..99 and print both the count and the raw estimate.
puts "Example 1: Basic counting of unique elements"
counter = Hyll::HyperLogLog.new
(0...100).each { |number| counter.add(number) }
puts "Added 100 unique numbers. Estimated count: #{counter.count}"
puts "Raw cardinality estimate: #{counter.cardinality}"
puts "\n"
|
14
|
+
|
15
|
+
# Example 2: MLE Estimation
# Compare the standard estimate with the maximum likelihood estimate
# for the integers 0..999.
puts "Example 2: Using Maximum Likelihood Estimation"
counter = Hyll::HyperLogLog.new
(0...1000).each { |number| counter.add(number) }
puts "Standard estimation: #{counter.cardinality}"
puts "MLE estimation: #{counter.mle_cardinality}"
puts "\n"
|
22
|
+
|
23
|
+
# Example 3: Custom Precision
# Feed the same 10,000 integers to a precision-6 and a precision-14 counter.
puts "Example 3: Setting custom precision"
counter_low = Hyll::HyperLogLog.new(6) # fewer registers: less memory, less accuracy
counter_high = Hyll::HyperLogLog.new(14) # more registers: more memory, more accuracy

(0...10_000).each do |number|
  [counter_low, counter_high].each { |sketch| sketch.add(number) }
end

puts "Low precision (2^6 registers): #{counter_low.count}"
puts "High precision (2^14 registers): #{counter_high.count}"
puts "\n"
|
38
|
+
|
39
|
+
# Example 4: Adding non-integer elements
puts "Example 4: Counting different data types"
counter = Hyll::HyperLogLog.new
fruits = %w[apple banana cherry apple durian]
fruits.each { |name| counter.add(name) }
puts "Unique fruit count: #{counter.count}" # Should be approximately 4

# Reuse the same counter for e-mail addresses, normalized to lower case
counter.reset
addresses = ["user1@example.com", "user2@example.com", "USER1@example.com"]
addresses.each { |address| counter.add(address.downcase) }
puts "Unique email count (case insensitive): #{counter.count}" # Should be approximately 2
puts "\n"
|
49
|
+
|
50
|
+
# ADVANCED USAGE EXAMPLES
|
51
|
+
|
52
|
+
# Example 5: Merging counters
puts "Example 5: Merging counters"
counter1 = Hyll::HyperLogLog.new
counter2 = Hyll::HyperLogLog.new

# item-0..item-99 and item-50..item-149: the upper half of the first range
# is shared with the second counter (50 overlapping items)
(0...100).each { |n| counter1.add("item-#{n}") }
(50...150).each { |n| counter2.add("item-#{n}") }

puts "Counter 1 estimate: #{counter1.count}"
puts "Counter 2 estimate: #{counter2.count}"

# Fold counter2 into counter1; counter1 then represents the union
counter1.merge(counter2)
puts "Merged counter estimate: #{counter1.count}" # Should be approximately 150
puts "\n"
|
68
|
+
|
69
|
+
# Example 6: Serialization
# Round-trip a counter through its serialized form and compare the counts.
puts "Example 6: Serializing and deserializing"
original = Hyll::HyperLogLog.new
(0...1000).each { |number| original.add(number) }

payload = original.serialize
puts "Serialized size: #{payload.bytesize} bytes"

restored = Hyll::HyperLogLog.deserialize(payload)
puts "Original count: #{original.count}"
puts "Deserialized count: #{restored.count}"
puts "\n"
|
83
|
+
|
84
|
+
# Example 7: Using EnhancedHyperLogLog
puts "Example 7: Using EnhancedHyperLogLog"
enhanced_counter = Hyll::EnhancedHyperLogLog.new(10)
(0...10_000).each { |number| enhanced_counter.add(number) }
puts "EnhancedHyperLogLog count: #{enhanced_counter.count}"

# Standard -> enhanced conversion
standard = Hyll::HyperLogLog.new(10)
(0...10_000).each { |number| standard.add(number) }
from_standard = standard.to_enhanced
puts "Standard HLL converted to Enhanced: #{from_standard.count}"

# Enhanced -> standard conversion
from_enhanced = enhanced_counter.to_hll
puts "Enhanced converted back to standard: #{from_enhanced.count}"
puts "\n"
|
100
|
+
|
101
|
+
# Example 8: Batch Adding
# Add 10,000 integers with a single add_all call and time it.
puts "Example 8: Batch adding elements"
counter = Hyll::HyperLogLog.new
elements = (1..10_000).to_a
# Monotonic clock: the measured duration cannot be skewed (or go negative)
# when the wall clock is adjusted while the batch is running.
started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
counter.add_all(elements)
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
puts "Added 10,000 elements in batch: #{counter.count}"
puts "Time taken: #{elapsed} seconds"
puts "\n"
|
111
|
+
|
112
|
+
# Example 9: Dealing with large datasets
puts "Example 9: Memory efficiency with large datasets"
counter = Hyll::HyperLogLog.new(12) # 2^12 = 4096 registers
puts "Memory usage for 100 million elements is roughly the same as for 1000 elements"
puts "Sparse representation used until #{Hyll::Constants::DEFAULT_SPARSE_THRESHOLD} elements are added"

# Add 1000 elements and use the serialized size as the footprint figure
1000.times { |i| counter.add(i) }
puts "Estimated memory for 1000 elements: #{counter.serialize.bytesize} bytes"
# Blank separator line, as printed after every other example
puts "\n"
|
121
|
+
|
122
|
+
# Example 10: Estimating intersection size
puts "Example 10: Estimating intersection size"
set_a = Hyll::HyperLogLog.new
set_b = Hyll::HyperLogLog.new

# item-0..item-999 and item-500..item-1499 share 500 items
1000.times { |i| set_a.add("item-#{i}") }
1000.times { |i| set_b.add("item-#{i + 500}") }

# Merge into the object returned by to_enhanced rather than into set_a
# (presumably a separate copy, so set_a is left as it was)
union = set_a.to_enhanced
union.merge(set_b)

# Compute each estimate once and reuse it for both the formula and the output
size_a = set_a.count
size_b = set_b.count
size_union = union.count

# Inclusion-exclusion principle: |A ∩ B| = |A| + |B| - |A ∪ B|
intersection_size = size_a + size_b - size_union
puts "Set A size: #{size_a}"
puts "Set B size: #{size_b}"
puts "Union size: #{size_union}"
puts "Estimated intersection size: #{intersection_size} (actual: 500)"
puts "\n"
|
143
|
+
|
144
|
+
# Example 11: Streaming data application
puts "Example 11: Streaming data application"
puts "HyperLogLog is perfect for streaming applications where you can't store all data:"

counter = Hyll::HyperLogLog.new
puts "Imagine processing a stream of user IDs from web logs..."

total_records = 10_000
report_every = 2_500

# Simulate stream processing; `processed` is the running record count
1.upto(total_records) do |processed|
  # In a real stream, you'd process each item as it arrives
  user_id = rand(5000) # ids are drawn from 0...5000, so at most 5000 unique users
  counter.add(user_id)

  # Report on the number of records handled so far. The random user id is
  # neither a record count nor a usable reporting trigger.
  next unless (processed % report_every).zero?

  puts "Processed #{processed} records, estimated unique users: #{counter.count}"
end

puts "Final unique user estimate: #{counter.count}"
puts "All this with minimal memory usage and O(1) update time!"
|