hyperloglog-redis 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +28 -9
- data/VERSION +1 -1
- data/hyperloglog-redis.gemspec +2 -2
- data/lib/hyper_log_log.rb +22 -1
- data/spec/hyper_log_log_spec.rb +54 -2
- metadata +13 -13
data/README.md
CHANGED
@@ -16,15 +16,6 @@ instance is used for storing the counters. A simple example:
|
|
16
16
|
|
17
17
|
puts "There are approximately #{counter.count('beatles')} distinct Beatles"
|
18
18
|
|
19
|
-
You can also ask for an estimate from multiple counters and you'll get
|
20
|
-
an estimate of the size of their union:
|
21
|
-
|
22
|
-
['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wing_member|
|
23
|
-
counter.add('wings', wing_member)
|
24
|
-
end
|
25
|
-
|
26
|
-
puts "There are approximately #{counter.count('beatles', 'wings')} people who were in the Beatles or Wings"
|
27
|
-
|
28
19
|
Each HyperLogLog counter uses a small, fixed amount of space but can
|
29
20
|
estimate the cardinality of any set of up to around a billion values with
|
30
21
|
relative error of about 1.04 / Math.sqrt(2 ** b), where b is a parameter
|
@@ -49,6 +40,34 @@ algorithm"](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf)
|
|
49
40
|
by Flajolet, Fusy, Gandouet, and Meunier. Our implementation closely
|
50
41
|
follows the program described in Section 4 of that paper.
|
51
42
|
|
43
|
+
Unions and intersections
|
44
|
+
========================
|
45
|
+
|
46
|
+
You can also ask for an estimate of the union from multiple counters:
|
47
|
+
|
48
|
+
['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wings_member|
|
49
|
+
counter.add('wings', wings_member)
|
50
|
+
end
|
51
|
+
|
52
|
+
puts "There are approximately #{counter.union('beatles', 'wings')} people who were in the Beatles or Wings"
|
53
|
+
|
54
|
+
The same relative error guarantee above applies to unions: a union of
|
55
|
+
size N can be estimated to within N * (1.04 / Math.sqrt(2 ** b)) elements,
|
56
|
+
regardless of how many HyperLogLog counters that union spans.
|
57
|
+
|
58
|
+
Intersections can also be estimated:
|
59
|
+
|
60
|
+
puts "There are approximately #{counter.intersection('beatles', 'wings')} people who were in both the Beatles and Wings"
|
61
|
+
|
62
|
+
However, intersections of HyperLogLog counters are calculated indirectly via the
|
63
|
+
[inclusion/exclusion principle](http://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle)
|
64
|
+
as a sum of unions, and there aren't good theoretical bounds on the error of that sum. In
|
65
|
+
practice, the estimates that come out of small intersections tend to follow the
|
66
|
+
same relative error patterns, but beware of using this type of estimation on large
|
67
|
+
intersections, both because the errors can be much larger than those guaranteed
|
68
|
+
for unions and because the complexity of computing intersections grows exponentially with
|
69
|
+
the number of counters being intersected.
|
70
|
+
|
52
71
|
Installation
|
53
72
|
============
|
54
73
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.3.0
|
data/hyperloglog-redis.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "hyperloglog-redis"
|
8
|
-
s.version = "0.2.0"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Aaron Windsor"]
|
12
|
-
s.date = "2012-09-
|
12
|
+
s.date = "2012-09-28"
|
13
13
|
s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
|
14
14
|
s.email = "aaron.windsor@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/hyper_log_log.rb
CHANGED
@@ -26,7 +26,19 @@ class HyperLogLog
|
|
26
26
|
@redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
|
27
27
|
end
|
28
28
|
|
29
|
-
def count(
|
29
|
+
def count(counter_name)
|
30
|
+
union_helper([counter_name])
|
31
|
+
end
|
32
|
+
|
33
|
+
def union(*counter_names)
|
34
|
+
union_helper(counter_names)
|
35
|
+
end
|
36
|
+
|
37
|
+
def intersection(*counter_names)
|
38
|
+
[intersection_helper(counter_names, {}), 0].max
|
39
|
+
end
|
40
|
+
|
41
|
+
def union_helper(counter_names)
|
30
42
|
all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
|
31
43
|
.reduce(:concat)
|
32
44
|
.group_by{ |value, score| value }
|
@@ -46,6 +58,15 @@ class HyperLogLog
|
|
46
58
|
end
|
47
59
|
end
|
48
60
|
|
61
|
+
def intersection_helper(counter_names, cache)
|
62
|
+
sum = union_helper(counter_names) - (1...counter_names.length).map do |k|
|
63
|
+
((-1) ** (k + 1)) * counter_names.combination(k).map do |group|
|
64
|
+
cache[group] ||= intersection_helper(group, cache)
|
65
|
+
end.inject(0, :+)
|
66
|
+
end.inject(0, :+)
|
67
|
+
((-1) ** (counter_names.length + 1)) * sum
|
68
|
+
end
|
69
|
+
|
49
70
|
# rho(i) is the position of the first 1 in the binary representation of i,
|
50
71
|
# reading from most significant to least significant bits. Some examples:
|
51
72
|
# rho(1...) = 1, rho(001...) = 3, rho(000...0) = @bits_in_hash + 1
|
data/spec/hyper_log_log_spec.rb
CHANGED
@@ -95,7 +95,7 @@ describe HyperLogLog do
|
|
95
95
|
value3 = Digest::MD5.hexdigest("this is value#{i}")
|
96
96
|
counter.add("mycounter3", value3)
|
97
97
|
actual = 3 * (i + 1)
|
98
|
-
approximate = counter.
|
98
|
+
approximate = counter.union("mycounter1", "mycounter2", "mycounter3")
|
99
99
|
relative_error = (actual - approximate).abs / Float(actual)
|
100
100
|
bad_estimates += 1 if relative_error > expected_relative_error * 2
|
101
101
|
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
|
@@ -126,7 +126,7 @@ describe HyperLogLog do
|
|
126
126
|
value3 = Digest::MD5.hexdigest("this is value#{i}")
|
127
127
|
counter.add("mycounter3", value3)
|
128
128
|
actual = 3 * (i + 1) + intersection_size
|
129
|
-
approximate = counter.
|
129
|
+
approximate = counter.union("mycounter1", "mycounter2", "mycounter3")
|
130
130
|
relative_error = (actual - approximate).abs / Float(actual)
|
131
131
|
bad_estimates += 1 if relative_error > expected_relative_error * 2
|
132
132
|
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
|
@@ -136,4 +136,56 @@ describe HyperLogLog do
|
|
136
136
|
very_bad_estimates.should == 0
|
137
137
|
end
|
138
138
|
|
139
|
+
# There are no good theoretical guarantees that I know of for arbitrary
|
140
|
+
# intersection estimation, since it's expressed as the sum of unions of
|
141
|
+
# HyperLogLog counters, but it tends to work okay in practice, as seen below.
|
142
|
+
|
143
|
+
it "produces decent estimates for intersections" do
|
144
|
+
b, max_items = 6, 1000
|
145
|
+
counter = HyperLogLog.new(Redis.new, b)
|
146
|
+
expected_relative_error = 1.04 / Math.sqrt(2 ** b)
|
147
|
+
|
148
|
+
max_items.times do |i|
|
149
|
+
value1 = Digest::MD5.hexdigest("first-value#{i}")
|
150
|
+
value2 = Digest::MD5.hexdigest("second-value#{i}")
|
151
|
+
value3 = Digest::MD5.hexdigest("third-value#{i}")
|
152
|
+
value4 = Digest::MD5.hexdigest("fourth-value#{i}")
|
153
|
+
counter.add("mycounter1", value1)
|
154
|
+
counter.add("mycounter2", value2)
|
155
|
+
counter.add("mycounter3", value3)
|
156
|
+
counter.add("mycounter4", value4)
|
157
|
+
[value1, value2, value3, value4].each{ |value| counter.add("mycounter5", value) }
|
158
|
+
end
|
159
|
+
|
160
|
+
small_counters = ['mycounter1', 'mycounter2', 'mycounter3', 'mycounter4']
|
161
|
+
|
162
|
+
small_counters.each do |counter_name|
|
163
|
+
intersection_estimate = counter.intersection(counter_name, 'mycounter5')
|
164
|
+
intersection_estimate.should > 0
|
165
|
+
(intersection_estimate - counter.count(counter_name)).abs.should < max_items * expected_relative_error
|
166
|
+
end
|
167
|
+
|
168
|
+
[2,3].each do |intersection_size|
|
169
|
+
small_counters.combination(intersection_size).each do |counter_names|
|
170
|
+
intersection_estimate = counter.intersection(*counter_names)
|
171
|
+
intersection_estimate.should >= 0
|
172
|
+
intersection_estimate.should < intersection_size * max_items * expected_relative_error
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
100.times do |i|
|
177
|
+
value = Digest::MD5.hexdigest("somethingintheintersection#{i}")
|
178
|
+
small_counters.each { |counter_name| counter.add(counter_name, value) }
|
179
|
+
end
|
180
|
+
|
181
|
+
[2,3,4].each do |intersection_size|
|
182
|
+
small_counters.combination(intersection_size).each do |counter_names|
|
183
|
+
intersection_estimate = counter.intersection(*counter_names)
|
184
|
+
intersection_estimate.should >= 0
|
185
|
+
(intersection_estimate - 100).abs.should < intersection_size * (max_items + 100) * expected_relative_error
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
end
|
190
|
+
|
139
191
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hyperloglog-redis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-28 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|
16
|
-
requirement: &
|
16
|
+
requirement: &2157248980 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.1.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2157248980
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &2157248500 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 3.0.1
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2157248500
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &2157248020 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.8.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2157248020
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &2157247540 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.9.2.2
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2157247540
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &2157247060 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: 2.11.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2157247060
|
69
69
|
description: An implementation of the HyperLogLog set cardinality estimation algorithm
|
70
70
|
in Ruby using Redis as a back-end
|
71
71
|
email: aaron.windsor@gmail.com
|
@@ -103,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
103
103
|
version: '0'
|
104
104
|
segments:
|
105
105
|
- 0
|
106
|
-
hash:
|
106
|
+
hash: 1810415248006536813
|
107
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
108
|
none: false
|
109
109
|
requirements:
|