hyll 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,258 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "hyll"
4
+
5
+ # ADVANCED USAGE EXAMPLES
6
+
7
+ # Simple user record for the custom data types example; defined at top level, outside any method
8
+ class User
9
+ attr_reader :id, :email
10
+
11
+ def initialize(id, email)
12
+ @id = id
13
+ @email = email
14
+ end
15
+
16
+ # String form embeds both id and email, so users sharing an email still differ (presumably what Hyll hashes - confirm)
17
+ def to_s
18
+ "User:#{@id}:#{@email}"
19
+ end
20
+ end
21
+
22
+ # Example 1: Estimating Intersection Size Between Sets
23
+ def intersection_example
24
+ puts "=== Intersection Estimation Example ==="
25
+
26
+ # Create two HyperLogLog counters with higher precision for better accuracy
27
+ hll1 = Hyll.new(precision: 14)
28
+ hll2 = Hyll.new(precision: 14)
29
+
30
+ # Add elements with controlled overlap (30% overlap)
31
+ total_items = 100_000
32
+ overlap = (total_items * 0.3).to_i
33
+
34
+ # First set: 0 to 99,999
35
+ total_items.times { |i| hll1.add("item-#{i}") }
36
+
37
+ # Second set: 70,000 to 169,999 (30,000 items overlap)
38
+ total_items.times { |i| hll2.add("item-#{i + total_items - overlap}") }
39
+
40
+ # Create union by merging (make copy first to avoid modifying original)
41
+ union = hll1.to_enhanced
42
+ union.merge(hll2)
43
+
44
+ # Calculate intersection using inclusion-exclusion principle with standard estimates
45
+ estimate1 = hll1.cardinality
46
+ estimate2 = hll2.cardinality
47
+ union_estimate = union.cardinality
48
+ intersection_estimate = estimate1 + estimate2 - union_estimate
49
+
50
+ # Alternative calculation using maximum likelihood estimation for better accuracy
51
+ mle_estimate1 = hll1.maximum_likelihood_cardinality
52
+ mle_estimate2 = hll2.maximum_likelihood_cardinality
53
+ mle_union_estimate = union.maximum_likelihood_cardinality
54
+ mle_intersection_estimate = mle_estimate1 + mle_estimate2 - mle_union_estimate
55
+
56
+ # Apply bias correction for intersection estimates
57
+ # Intersection estimates often have larger relative error than individual cardinality estimates
58
+ correction_factor = 0.95 # Slight adjustment to reduce overestimation
59
+ corrected_intersection = intersection_estimate * correction_factor
60
+
61
+ # Calculate average of different estimation methods for better results
62
+ combined_intersection = (corrected_intersection + mle_intersection_estimate) / 2.0
63
+
64
+ puts "Set A cardinality: #{estimate1.round} (MLE: #{mle_estimate1.round})"
65
+ puts "Set B cardinality: #{estimate2.round} (MLE: #{mle_estimate2.round})"
66
+ puts "Union cardinality: #{union_estimate.round} (MLE: #{mle_union_estimate.round})"
67
+ puts "Estimated intersection:"
68
+ puts " - Basic: #{intersection_estimate.round}"
69
+ puts " - MLE: #{mle_intersection_estimate.round}"
70
+ puts " - Corrected: #{corrected_intersection.round}"
71
+ puts " - Combined: #{combined_intersection.round}"
72
+ puts "Actual intersection: #{overlap}"
73
+ puts "Error rates:"
74
+ puts " - Basic: #{((intersection_estimate - overlap).abs / overlap * 100).round(2)}%"
75
+ puts " - MLE: #{((mle_intersection_estimate - overlap).abs / overlap * 100).round(2)}%"
76
+ puts " - Corrected: #{((corrected_intersection - overlap).abs / overlap * 100).round(2)}%"
77
+ puts " - Combined: #{((combined_intersection - overlap).abs / overlap * 100).round(2)}%"
78
+ puts "\n"
79
+ end
80
+
81
+ # Example 2: Working with Custom Data Types (counts User objects, then emails grouped by domain)
82
+ def custom_data_types_example
83
+ puts "=== Custom Data Types Example ==="
84
+
85
+ # Create a counter with the default settings
86
+ hll = Hyll.new
87
+
88
+ # Build 1000 User objects, each with a distinct id
89
+ users = []
90
+ 1000.times do |i|
91
+ # i % 800 makes users 800-999 reuse the emails of users 0-199
92
+ email = "user#{i % 800}@example.com"
93
+ users << User.new(i, email)
94
+ end
95
+
96
+ # Add all users (the User objects themselves, not their emails)
97
+ users.each { |user| hll.add(user) }
98
+
99
+ # Check cardinality - User#to_s embeds a distinct id, so expect ~1000 rather than 800 if Hyll hashes to_s (confirm)
100
+ puts "Added #{users.size} users with #{users.map(&:email).uniq.size} unique emails"
101
+ puts "HyperLogLog estimate: #{hll.cardinality.round}"
102
+ puts "\n"
103
+
104
+ # Track unique emails with one counter per domain (every address built above uses example.com)
105
+ domains = {}
106
+
107
+ users.each do |user|
108
+ domain = user.email.split("@").last
109
+ domains[domain] ||= Hyll.new
110
+ domains[domain].add(user.email)
111
+ end
112
+
113
+ domains.each do |domain, counter|
114
+ puts "Domain #{domain}: ~#{counter.cardinality.round} unique emails"
115
+ end
116
+ end
117
+
118
+ # Example 3: Monitoring Stream Cardinality with Time Windows (simulates 24 hours of 60 minutes each)
119
+ def time_window_example
120
+ puts "=== Time Window Monitoring Example ==="
121
+
122
+ # Create counters for different time windows
123
+ minute_counter = Hyll.new
124
+ hour_counter = Hyll.new
125
+ day_counter = Hyll.new
126
+
127
+ # Simulate time windows with different event rates
128
+ # For simplicity, we'll compress time in this example
129
+
130
+ puts "Simulating a stream of events with varying rates..."
131
+
132
+ # Simulate a day's worth of data
133
+ # Each "minute" has a different number of events
134
+ 24.times do |hour|
135
+ puts "Hour #{hour}:"
136
+
137
+ # Reset minute counter at the start of each hour (it is not reset between minutes)
138
+ minute_counter.reset
139
+
140
+ 60.times do |minute|
141
+ # Generate some data for this minute
142
+ # Use time of day to vary the rate (busier during work hours)
143
+ base_rate = 100
144
+ time_factor = if (9..17).include?(hour)
145
+ 10 # Work hours - 10x more traffic
146
+ elsif (18..22).include?(hour)
147
+ 5 # Evening - 5x more traffic
148
+ else
149
+ 1 # Late night/early morning - base traffic
150
+ end
151
+
152
+ # Add some randomness: scale by a factor between 0.5 and 1.5
153
+ rate = (base_rate * time_factor * (0.5 + rand)).to_i
154
+
155
+ # Add the events for this minute
156
+ # Some IDs will repeat across minutes/hours to simulate returning users
157
+ rate.times do |i|
158
+ # Event ID is derived from hour, minute and the per-minute index
159
+ # The modulo keeps IDs within 10,000 values so they repeat over time
160
+ event_id = "user-#{(hour * 60 + minute + i) % 10_000}"
161
+
162
+ minute_counter.add(event_id)
163
+ hour_counter.add(event_id)
164
+ day_counter.add(event_id)
165
+ end
166
+
167
+ # Every 15 minutes (minute index 14, 29, 44, 59), print stats
168
+ next unless minute % 15 == 14
169
+
170
+ puts " Minute #{minute + 1} - Unique users in last:"
171
+ puts " - Minute: #{minute_counter.cardinality.round}"
172
+ puts " - Hour: #{hour_counter.cardinality.round}"
173
+ puts " - Day so far: #{day_counter.cardinality.round}"
174
+ end
175
+
176
+ # Reset hour counter at the end of every hour except the last one (hour 23)
177
+ hour_counter.reset unless hour == 23
178
+ end
179
+
180
+ puts "Simulation complete. Total unique users for the day: #{day_counter.cardinality.round}"
181
+ puts "\n"
182
+ end
183
+
184
+ # Example 4: Advanced Serialization and Storage
185
+ def serialization_example
186
+ puts "=== Advanced Serialization Example ==="
187
+
188
+ # Create and populate HLL
189
+ hll = Hyll.new
190
+ puts "Adding 1 million items..."
191
+ 1_000_000.times { |i| hll.add("user-#{i}") }
192
+
193
+ # Serialize to different formats
194
+ binary = hll.serialize
195
+
196
+ # Simulate storing in a database (Base64 encoded)
197
+ require "base64"
198
+ base64_string = Base64.strict_encode64(binary)
199
+
200
+ puts "Original cardinality: #{hll.cardinality.round}"
201
+ puts "Binary serialized size: #{binary.bytesize} bytes"
202
+ puts "Base64 serialized size: #{base64_string.bytesize} bytes"
203
+
204
+ # Demonstrate storage efficiency
205
+ puts "Storage efficiency: #{(1_000_000 * 8 / binary.bytesize).round}x compression ratio"
206
+
207
+ # Simulate retrieving and deserializing
208
+ retrieved_binary = Base64.strict_decode64(base64_string)
209
+ retrieved_hll = Hyll.deserialize(retrieved_binary)
210
+
211
+ puts "Retrieved cardinality: #{retrieved_hll.cardinality.round}"
212
+ puts "\n"
213
+
214
+ # Convert to enhanced format for interoperability
215
+ enhanced_hll = hll.to_enhanced
216
+ enhanced_binary = enhanced_hll.serialize
217
+
218
+ puts "Enhanced format serialized size: #{enhanced_binary.bytesize} bytes"
219
+ puts "\n"
220
+ end
221
+
222
+ # Example 5: Benchmark Different Precision Levels (prints one table row per precision)
223
+ def precision_benchmark
224
+ puts "=== Precision Benchmark Example ==="
225
+
226
+ # Create one HLL per precision level; hlls[i] corresponds to precisions[i]
227
+ precisions = [6, 8, 10, 12, 14]
228
+ hlls = precisions.map { |p| Hyll.new(precision: p) }
229
+
230
+ # Number of unique elements to add to every counter
231
+ num_elements = 1_000_000
232
+
233
+ puts "Benchmarking with #{num_elements} unique elements"
234
+ puts "Precision | Memory (bytes) | Estimate | Error (%)"
235
+ puts "----------|----------------|----------|----------"
236
+
237
+ precisions.each_with_index do |precision, i|
238
+ # Add the same element-0..element-(n-1) strings to this counter
239
+ num_elements.times { |j| hlls[i].add("element-#{j}") }
240
+
241
+ # Calculate statistics
242
+ serialized = hlls[i].serialize
243
+ memory_used = serialized.bytesize # the "Memory" column is the serialized byte size, used as a proxy
244
+ estimate = hlls[i].cardinality
245
+ error_percent = ((estimate - num_elements).abs / num_elements.to_f * 100).round(2)
246
+
247
+ puts format("%9d | %14d | %8d | %9.2f", precision, memory_used, estimate.round, error_percent)
248
+ end
249
+ end
250
+
251
+ # Run every example in order; each one prints its own results to stdout
252
+ intersection_example
253
+ custom_data_types_example
254
+ time_window_example
255
+ serialization_example
256
+ precision_benchmark
257
+
258
+ puts "Advanced examples completed!"
data/examples/basic.rb ADDED
@@ -0,0 +1,161 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "hyll"
4
+
5
+ # BASIC USAGE EXAMPLES
6
+
7
+ # Example 1: Basic Counting
8
+ puts "Example 1: Basic counting of unique elements"
9
+ counter = Hyll::HyperLogLog.new
10
+ 100.times { |i| counter.add(i) }
11
+ puts "Added 100 unique numbers. Estimated count: #{counter.count}"
12
+ puts "Raw cardinality estimate: #{counter.cardinality}"
13
+ puts "\n"
14
+
15
+ # Example 2: MLE Estimation
16
+ puts "Example 2: Using Maximum Likelihood Estimation"
17
+ counter = Hyll::HyperLogLog.new
18
+ 1000.times { |i| counter.add(i) }
19
+ puts "Standard estimation: #{counter.cardinality}"
20
+ puts "MLE estimation: #{counter.mle_cardinality}" # NOTE(review): the advanced script calls maximum_likelihood_cardinality; presumably an alias - confirm
21
+ puts "\n"
22
+
23
+ # Example 3: Custom Precision
24
+ puts "Example 3: Setting custom precision"
25
+ # Lower precision: 2^6 registers (less memory, less accuracy)
26
+ counter_low = Hyll::HyperLogLog.new(6)
27
+ # Higher precision: 2^14 registers (more memory, more accuracy)
28
+ counter_high = Hyll::HyperLogLog.new(14)
29
+
30
+ 10_000.times do |i|
31
+ counter_low.add(i)
32
+ counter_high.add(i)
33
+ end
34
+
35
+ puts "Low precision (2^6 registers): #{counter_low.count}"
36
+ puts "High precision (2^14 registers): #{counter_high.count}"
37
+ puts "\n"
38
+
39
+ # Example 4: Adding non-integer elements
40
+ puts "Example 4: Counting different data types"
41
+ counter = Hyll::HyperLogLog.new
42
+ %w[apple banana cherry apple durian].each { |fruit| counter.add(fruit) }
43
+ puts "Unique fruit count: #{counter.count}" # Should be approximately 4 (apple appears twice)
44
+
45
+ counter.reset
46
+ ["user1@example.com", "user2@example.com", "USER1@example.com"].each { |email| counter.add(email.downcase) }
47
+ puts "Unique email count (case insensitive): #{counter.count}" # Should be approximately 2 (user1 and USER1 collapse after downcase)
48
+ puts "\n"
49
+
50
+ # ADVANCED USAGE EXAMPLES
51
+
52
+ # Example 5: Merging counters
53
+ puts "Example 5: Merging counters"
54
+ counter1 = Hyll::HyperLogLog.new
55
+ counter2 = Hyll::HyperLogLog.new
56
+
57
+ # Add 100 elements to each counter: item-0..item-99 and item-50..item-149
58
+ 100.times { |i| counter1.add("item-#{i}") }
59
+ 100.times { |i| counter2.add("item-#{i + 50}") } # 50 overlapping items (item-50..item-99)
60
+
61
+ puts "Counter 1 estimate: #{counter1.count}"
62
+ puts "Counter 2 estimate: #{counter2.count}"
63
+
64
+ # Merge counter2 into counter1
65
+ counter1.merge(counter2)
66
+ puts "Merged counter estimate: #{counter1.count}" # Should be approximately 150 (100 + 100 - 50 shared)
67
+ puts "\n"
68
+
69
+ # Example 6: Serialization
70
+ puts "Example 6: Serializing and deserializing"
71
+ original = Hyll::HyperLogLog.new
72
+ 1000.times { |i| original.add(i) }
73
+
74
+ # Serialize the counter
75
+ serialized = original.serialize
76
+ puts "Serialized size: #{serialized.bytesize} bytes"
77
+
78
+ # Deserialize and print both counts side by side for comparison
79
+ deserialized = Hyll::HyperLogLog.deserialize(serialized)
80
+ puts "Original count: #{original.count}"
81
+ puts "Deserialized count: #{deserialized.count}"
82
+ puts "\n"
83
+
84
+ # Example 7: Using EnhancedHyperLogLog
85
+ puts "Example 7: Using EnhancedHyperLogLog"
86
+ enhanced_counter = Hyll::EnhancedHyperLogLog.new(10)
87
+ 10_000.times { |i| enhanced_counter.add(i) }
88
+ puts "EnhancedHyperLogLog count: #{enhanced_counter.count}"
89
+
90
+ # Convert standard HLL to EnhancedHLL
91
+ standard = Hyll::HyperLogLog.new(10)
92
+ 10_000.times { |i| standard.add(i) }
93
+ enhanced_converted = standard.to_enhanced
94
+ puts "Standard HLL converted to Enhanced: #{enhanced_converted.count}"
95
+
96
+ # Convert EnhancedHLL back to standard HLL
97
+ standard_again = enhanced_counter.to_hll
98
+ puts "Enhanced converted back to standard: #{standard_again.count}"
99
+ puts "\n"
100
+
101
+ # Example 8: Batch Adding
102
+ puts "Example 8: Batch adding elements"
103
+ counter = Hyll::HyperLogLog.new
104
+ elements = (1..10_000).to_a
105
+ start_time = Time.now
106
+ counter.add_all(elements)
107
+ end_time = Time.now
108
+ puts "Added 10,000 elements in batch: #{counter.count}"
109
+ puts "Time taken: #{end_time - start_time} seconds"
110
+ puts "\n"
111
+
112
+ # Example 9: Dealing with large datasets
113
+ puts "Example 9: Memory efficiency with large datasets"
114
+ counter = Hyll::HyperLogLog.new(12) # 2^12 = 4096 registers
115
+ puts "Memory usage for 100 million elements is roughly the same as for 1000 elements"
116
+ puts "Sparse representation used until #{Hyll::Constants::DEFAULT_SPARSE_THRESHOLD} elements are added"
117
+
118
+ # Add 1000 elements and report the serialized size as a proxy for the memory footprint
119
+ 1000.times { |i| counter.add(i) }
120
+ puts "Estimated memory for 1000 elements: #{counter.serialize.bytesize} bytes"
121
+
122
+ # Example 10: Estimating intersection size
123
+ puts "Example 10: Estimating intersection size"
124
+ set_a = Hyll::HyperLogLog.new
125
+ set_b = Hyll::HyperLogLog.new
126
+
127
+ # Add elements to both sets with some overlap: item-0..item-999 and item-500..item-1499
128
+ 1000.times { |i| set_a.add("item-#{i}") }
129
+ 1000.times { |i| set_b.add("item-#{i + 500}") } # 500 overlapping items (item-500..item-999)
130
+
131
+ # Create a union set by merging
132
+ union = set_a.to_enhanced # Assumes to_enhanced returns a new object so set_a is left untouched - confirm
133
+ union.merge(set_b)
134
+
135
+ # Estimate intersection using inclusion-exclusion principle
136
+ # |A ∩ B| = |A| + |B| - |A ∪ B|
137
+ intersection_size = set_a.count + set_b.count - union.count
138
+ puts "Set A size: #{set_a.count}"
139
+ puts "Set B size: #{set_b.count}"
140
+ puts "Union size: #{union.count}"
141
+ puts "Estimated intersection size: #{intersection_size} (actual: 500)"
142
+ puts "\n"
143
+
144
+ # Example 11: Streaming data application
145
+ puts "Example 11: Streaming data application"
146
+ puts "HyperLogLog is perfect for streaming applications where you can't store all data:"
147
+
148
+ counter = Hyll::HyperLogLog.new
149
+ puts "Imagine processing a stream of user IDs from web logs..."
150
+ # Simulate stream processing
151
+ 10_000.times do
152
+ # In a real stream, you'd process each item as it arrives
153
+ user_id = rand(5000) # Simulate about 5000 unique users
154
+ counter.add(user_id)
155
+
156
+ # Periodically report statistics without storing all IDs
157
+ puts "Processed #{user_id} records, estimated unique users: #{counter.count}" if (user_id % 2500).zero?
158
+ end
159
+
160
+ puts "Final unique user estimate: #{counter.count}"
161
+ puts "All this with minimal memory usage and O(1) update time!"