hyperloglog-redis 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -16,15 +16,6 @@ instance is used for storing the counters. A simple example:
16
16
 
17
17
  puts "There are approximately #{counter.count('beatles')} distinct Beatles"
18
18
 
19
- You can also ask for an estimate from multiple counters and you'll get
20
- an estimate of the size of their union:
21
-
22
- ['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wing_member|
23
- counter.add('wings', wing_member)
24
- end
25
-
26
- puts "There are approximately #{counter.count('beatles', 'wings')} people who were in the Beatles or Wings"
27
-
28
19
  Each HyperLogLog counter uses a small, fixed amount of space but can
29
20
  estimate the cardinality of any set of up to around a billion values with
30
21
  relative error of about 1.04 / Math.sqrt(2 ** b), where b is a parameter
@@ -49,6 +40,34 @@ algorithm"](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf)
49
40
  by Flajolet, Fusy, Gandouet, and Meunier. Our implementation closely
50
41
  follows the program described in Section 4 of that paper.
51
42
 
43
+ Unions and intersections
44
+ ========================
45
+
46
+ You can also ask for an estimate of the union of multiple counters:
47
+
48
+ ['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wings_member|
49
+ counter.add('wings', wings_member)
50
+ end
51
+
52
+ puts "There are approximately #{counter.union('beatles', 'wings')} people who were in the Beatles or Wings"
53
+
54
+ The same relative error guarantee above applies to unions: a union of
55
+ size N can be estimated to within N * (1.04 / Math.sqrt(2 ** b)) elements,
56
+ regardless of how many HyperLogLog counters that union spans.
57
+
58
+ Intersections can also be estimated:
59
+
60
+ puts "There are approximately #{counter.intersection('beatles', 'wings')} people who were in both the Beatles and Wings"
61
+
62
+ However, intersections of HyperLogLog counters are calculated indirectly via the
63
+ [inclusion/exclusion principle](http://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle)
64
+ as a sum of unions and there aren't good theoretical bounds on the error of that sum. In
65
+ practice, the estimates that come out of small intersections tend to follow the
66
+ same relative error patterns, but beware using this type of estimation on large
67
+ intersections, both because the errors can be much larger than those guaranteed
68
+ for unions and because the complexity of computing intersections grows exponentially with
69
+ the number of counters being intersected.
70
+
52
71
  Installation
53
72
  ============
54
73
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.3.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "hyperloglog-redis"
8
- s.version = "0.2.0"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Aaron Windsor"]
12
- s.date = "2012-09-27"
12
+ s.date = "2012-09-28"
13
13
  s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
14
14
  s.email = "aaron.windsor@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -26,7 +26,19 @@ class HyperLogLog
26
26
  @redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
27
27
  end
28
28
 
29
- def count(*counter_names)
29
+ def count(counter_name)
30
+ union_helper([counter_name])
31
+ end
32
+
33
+ def union(*counter_names)
34
+ union_helper(counter_names)
35
+ end
36
+
37
+ def intersection(*counter_names)
38
+ [intersection_helper(counter_names, {}), 0].max
39
+ end
40
+
41
+ def union_helper(counter_names)
30
42
  all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
31
43
  .reduce(:concat)
32
44
  .group_by{ |value, score| value }
@@ -46,6 +58,15 @@ class HyperLogLog
46
58
  end
47
59
  end
48
60
 
61
+ def intersection_helper(counter_names, cache)
62
+ sum = union_helper(counter_names) - (1...counter_names.length).map do |k|
63
+ ((-1) ** (k + 1)) * counter_names.combination(k).map do |group|
64
+ cache[group] ||= intersection_helper(group, cache)
65
+ end.inject(0, :+)
66
+ end.inject(0, :+)
67
+ ((-1) ** (counter_names.length + 1)) * sum
68
+ end
69
+
49
70
  # rho(i) is the position of the first 1 in the binary representation of i,
50
71
  # reading from most significant to least significant bits. Some examples:
51
72
  # rho(1...) = 1, rho(001...) = 3, rho(000...0) = @bits_in_hash + 1
@@ -95,7 +95,7 @@ describe HyperLogLog do
95
95
  value3 = Digest::MD5.hexdigest("this is value#{i}")
96
96
  counter.add("mycounter3", value3)
97
97
  actual = 3 * (i + 1)
98
- approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
98
+ approximate = counter.union("mycounter1", "mycounter2", "mycounter3")
99
99
  relative_error = (actual - approximate).abs / Float(actual)
100
100
  bad_estimates += 1 if relative_error > expected_relative_error * 2
101
101
  very_bad_estimates += 1 if relative_error > expected_relative_error * 3
@@ -126,7 +126,7 @@ describe HyperLogLog do
126
126
  value3 = Digest::MD5.hexdigest("this is value#{i}")
127
127
  counter.add("mycounter3", value3)
128
128
  actual = 3 * (i + 1) + intersection_size
129
- approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
129
+ approximate = counter.union("mycounter1", "mycounter2", "mycounter3")
130
130
  relative_error = (actual - approximate).abs / Float(actual)
131
131
  bad_estimates += 1 if relative_error > expected_relative_error * 2
132
132
  very_bad_estimates += 1 if relative_error > expected_relative_error * 3
@@ -136,4 +136,56 @@ describe HyperLogLog do
136
136
  very_bad_estimates.should == 0
137
137
  end
138
138
 
139
+ # There are no good theoretical guarantees that I know of for arbitrary
140
+ # intersection estimation, since it's expressed as the sum of unions of
141
+ # HyperLogLog counters, but it tends to work okay in practice, as seen below.
142
+
143
+ it "produces decent estimates for intersections" do
144
+ b, max_items = 6, 1000
145
+ counter = HyperLogLog.new(Redis.new, b)
146
+ expected_relative_error = 1.04 / Math.sqrt(2 ** b)
147
+
148
+ max_items.times do |i|
149
+ value1 = Digest::MD5.hexdigest("first-value#{i}")
150
+ value2 = Digest::MD5.hexdigest("second-value#{i}")
151
+ value3 = Digest::MD5.hexdigest("third-value#{i}")
152
+ value4 = Digest::MD5.hexdigest("fourth-value#{i}")
153
+ counter.add("mycounter1", value1)
154
+ counter.add("mycounter2", value2)
155
+ counter.add("mycounter3", value3)
156
+ counter.add("mycounter4", value4)
157
+ [value1, value2, value3, value4].each{ |value| counter.add("mycounter5", value) }
158
+ end
159
+
160
+ small_counters = ['mycounter1', 'mycounter2', 'mycounter3', 'mycounter4']
161
+
162
+ small_counters.each do |counter_name|
163
+ intersection_estimate = counter.intersection(counter_name, 'mycounter5')
164
+ intersection_estimate.should > 0
165
+ (intersection_estimate - counter.count(counter_name)).abs.should < max_items * expected_relative_error
166
+ end
167
+
168
+ [2,3].each do |intersection_size|
169
+ small_counters.combination(intersection_size).each do |counter_names|
170
+ intersection_estimate = counter.intersection(*counter_names)
171
+ intersection_estimate.should >= 0
172
+ intersection_estimate.should < intersection_size * max_items * expected_relative_error
173
+ end
174
+ end
175
+
176
+ 100.times do |i|
177
+ value = Digest::MD5.hexdigest("somethingintheintersection#{i}")
178
+ small_counters.each { |counter_name| counter.add(counter_name, value) }
179
+ end
180
+
181
+ [2,3,4].each do |intersection_size|
182
+ small_counters.combination(intersection_size).each do |counter_names|
183
+ intersection_estimate = counter.intersection(*counter_names)
184
+ intersection_estimate.should >= 0
185
+ (intersection_estimate - 100).abs.should < intersection_size * (max_items + 100) * expected_relative_error
186
+ end
187
+ end
188
+
189
+ end
190
+
139
191
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hyperloglog-redis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-27 00:00:00.000000000Z
12
+ date: 2012-09-28 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3
16
- requirement: &2172688860 !ruby/object:Gem::Requirement
16
+ requirement: &2157248980 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.1.3
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2172688860
24
+ version_requirements: *2157248980
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &2172688380 !ruby/object:Gem::Requirement
27
+ requirement: &2157248500 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 3.0.1
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2172688380
35
+ version_requirements: *2157248500
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: jeweler
38
- requirement: &2172687900 !ruby/object:Gem::Requirement
38
+ requirement: &2157248020 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.8.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2172687900
46
+ version_requirements: *2157248020
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rake
49
- requirement: &2172687420 !ruby/object:Gem::Requirement
49
+ requirement: &2157247540 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 0.9.2.2
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2172687420
57
+ version_requirements: *2157247540
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &2172686940 !ruby/object:Gem::Requirement
60
+ requirement: &2157247060 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: 2.11.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2172686940
68
+ version_requirements: *2157247060
69
69
  description: An implementation of the HyperLogLog set cardinality estimation algorithm
70
70
  in Ruby using Redis as a back-end
71
71
  email: aaron.windsor@gmail.com
@@ -103,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
103
103
  version: '0'
104
104
  segments:
105
105
  - 0
106
- hash: -1980876148320618453
106
+ hash: 1810415248006536813
107
107
  required_rubygems_version: !ruby/object:Gem::Requirement
108
108
  none: false
109
109
  requirements: