hyperloglog-redis 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +28 -9
- data/VERSION +1 -1
- data/hyperloglog-redis.gemspec +2 -2
- data/lib/hyper_log_log.rb +22 -1
- data/spec/hyper_log_log_spec.rb +54 -2
- metadata +13 -13
data/README.md
CHANGED
@@ -16,15 +16,6 @@ instance is used for storing the counters. A simple example:
|
|
16
16
|
|
17
17
|
puts "There are approximately #{counter.count('beatles')} distinct Beatles"
|
18
18
|
|
19
|
-
You can also ask for an estimate from multiple counters and you'll get
|
20
|
-
an estimate of the size of their union:
|
21
|
-
|
22
|
-
['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wing_member|
|
23
|
-
counter.add('wings', wing_member)
|
24
|
-
end
|
25
|
-
|
26
|
-
puts "There are approximately #{counter.count('beatles', 'wings')} people who were in the Beatles or Wings"
|
27
|
-
|
28
19
|
Each HyperLogLog counter uses a small, fixed amount of space but can
|
29
20
|
estimate the cardinality of any set of up to around a billion values with
|
30
21
|
relative error of about 1.04 / Math.sqrt(2 ** b), where b is a parameter
|
@@ -49,6 +40,34 @@ algorithm"](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf)
|
|
49
40
|
by Flajolet, Fusy, Gandouet, and Meunier. Our implementation closely
|
50
41
|
follows the program described in Section 4 of that paper.
|
51
42
|
|
43
|
+
Unions and intersections
|
44
|
+
========================
|
45
|
+
|
46
|
+
You can also ask for an estimate of the union from multiple counters:
|
47
|
+
|
48
|
+
['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wings_member|
|
49
|
+
counter.add('wings', wings_member)
|
50
|
+
end
|
51
|
+
|
52
|
+
puts "There are approximately #{counter.union('beatles', 'wings')} people who were in the Beatles or Wings"
|
53
|
+
|
54
|
+
The same relative error guarantee above applies to unions: a union of
|
55
|
+
size N can be estimated to within N * (1.04 / Math.sqrt(2 ** b)) elements,
|
56
|
+
regardless of how many HyperLogLog counters that union spans.
|
57
|
+
|
58
|
+
Intersections can also be estimated:
|
59
|
+
|
60
|
+
puts "There are approximately #{counter.intersection('beatles', 'wings')} people who were in both the Beatles and Wings"
|
61
|
+
|
62
|
+
However, intersections of HyperLogLog counters are calculated indirectly via the
|
63
|
+
[inclusion/exclusion principle](http://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle)
|
64
|
+
as a sum of unions and there aren't good theoretical bounds on the error of that sum. In
|
65
|
+
practice, the estimates that come out of small intersections tend to follow the
|
66
|
+
same relative error patterns, but beware using this type of estimation on large
|
67
|
+
intersections, both because the errors can be much larger than those guaranteed
|
68
|
+
for unions and because the complexity of computing intersections grows exponentially with
|
69
|
+
the number of counters being intersected.
|
70
|
+
|
52
71
|
Installation
|
53
72
|
============
|
54
73
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.3.0
|
data/hyperloglog-redis.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "hyperloglog-redis"
|
8
|
-
s.version = "0.2.0"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Aaron Windsor"]
|
12
|
-
s.date = "2012-09-
|
12
|
+
s.date = "2012-09-28"
|
13
13
|
s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
|
14
14
|
s.email = "aaron.windsor@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/hyper_log_log.rb
CHANGED
@@ -26,7 +26,19 @@ class HyperLogLog
|
|
26
26
|
@redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
|
27
27
|
end
|
28
28
|
|
29
|
-
def count(*counter_names)
|
29
|
+
def count(counter_name)
|
30
|
+
union_helper([counter_name])
|
31
|
+
end
|
32
|
+
|
33
|
+
def union(*counter_names)
|
34
|
+
union_helper(counter_names)
|
35
|
+
end
|
36
|
+
|
37
|
+
def intersection(*counter_names)
|
38
|
+
[intersection_helper(counter_names, {}), 0].max
|
39
|
+
end
|
40
|
+
|
41
|
+
def union_helper(counter_names)
|
30
42
|
all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
|
31
43
|
.reduce(:concat)
|
32
44
|
.group_by{ |value, score| value }
|
@@ -46,6 +58,15 @@ class HyperLogLog
|
|
46
58
|
end
|
47
59
|
end
|
48
60
|
|
61
|
+
def intersection_helper(counter_names, cache)
|
62
|
+
sum = union_helper(counter_names) - (1...counter_names.length).map do |k|
|
63
|
+
((-1) ** (k + 1)) * counter_names.combination(k).map do |group|
|
64
|
+
cache[group] ||= intersection_helper(group, cache)
|
65
|
+
end.inject(0, :+)
|
66
|
+
end.inject(0, :+)
|
67
|
+
((-1) ** (counter_names.length + 1)) * sum
|
68
|
+
end
|
69
|
+
|
49
70
|
# rho(i) is the position of the first 1 in the binary representation of i,
|
50
71
|
# reading from most significant to least significant bits. Some examples:
|
51
72
|
# rho(1...) = 1, rho(001...) = 3, rho(000...0) = @bits_in_hash + 1
|
data/spec/hyper_log_log_spec.rb
CHANGED
@@ -95,7 +95,7 @@ describe HyperLogLog do
|
|
95
95
|
value3 = Digest::MD5.hexdigest("this is value#{i}")
|
96
96
|
counter.add("mycounter3", value3)
|
97
97
|
actual = 3 * (i + 1)
|
98
|
-
approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
|
98
|
+
approximate = counter.union("mycounter1", "mycounter2", "mycounter3")
|
99
99
|
relative_error = (actual - approximate).abs / Float(actual)
|
100
100
|
bad_estimates += 1 if relative_error > expected_relative_error * 2
|
101
101
|
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
|
@@ -126,7 +126,7 @@ describe HyperLogLog do
|
|
126
126
|
value3 = Digest::MD5.hexdigest("this is value#{i}")
|
127
127
|
counter.add("mycounter3", value3)
|
128
128
|
actual = 3 * (i + 1) + intersection_size
|
129
|
-
approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
|
129
|
+
approximate = counter.union("mycounter1", "mycounter2", "mycounter3")
|
130
130
|
relative_error = (actual - approximate).abs / Float(actual)
|
131
131
|
bad_estimates += 1 if relative_error > expected_relative_error * 2
|
132
132
|
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
|
@@ -136,4 +136,56 @@ describe HyperLogLog do
|
|
136
136
|
very_bad_estimates.should == 0
|
137
137
|
end
|
138
138
|
|
139
|
+
# There are no good theoretical guarantees that I know of for arbitrary
|
140
|
+
# intersection estimation, since it's expressed as the sum of unions of
|
141
|
+
# HyperLogLog counters, but it tends to work okay in practice, as seen below.
|
142
|
+
|
143
|
+
it "produces decent estimates for intersections" do
|
144
|
+
b, max_items = 6, 1000
|
145
|
+
counter = HyperLogLog.new(Redis.new, b)
|
146
|
+
expected_relative_error = 1.04 / Math.sqrt(2 ** b)
|
147
|
+
|
148
|
+
max_items.times do |i|
|
149
|
+
value1 = Digest::MD5.hexdigest("first-value#{i}")
|
150
|
+
value2 = Digest::MD5.hexdigest("second-value#{i}")
|
151
|
+
value3 = Digest::MD5.hexdigest("third-value#{i}")
|
152
|
+
value4 = Digest::MD5.hexdigest("fourth-value#{i}")
|
153
|
+
counter.add("mycounter1", value1)
|
154
|
+
counter.add("mycounter2", value2)
|
155
|
+
counter.add("mycounter3", value3)
|
156
|
+
counter.add("mycounter4", value4)
|
157
|
+
[value1, value2, value3, value4].each{ |value| counter.add("mycounter5", value) }
|
158
|
+
end
|
159
|
+
|
160
|
+
small_counters = ['mycounter1', 'mycounter2', 'mycounter3', 'mycounter4']
|
161
|
+
|
162
|
+
small_counters.each do |counter_name|
|
163
|
+
intersection_estimate = counter.intersection(counter_name, 'mycounter5')
|
164
|
+
intersection_estimate.should > 0
|
165
|
+
(intersection_estimate - counter.count(counter_name)).abs.should < max_items * expected_relative_error
|
166
|
+
end
|
167
|
+
|
168
|
+
[2,3].each do |intersection_size|
|
169
|
+
small_counters.combination(intersection_size).each do |counter_names|
|
170
|
+
intersection_estimate = counter.intersection(*counter_names)
|
171
|
+
intersection_estimate.should >= 0
|
172
|
+
intersection_estimate.should < intersection_size * max_items * expected_relative_error
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
100.times do |i|
|
177
|
+
value = Digest::MD5.hexdigest("somethingintheintersection#{i}")
|
178
|
+
small_counters.each { |counter_name| counter.add(counter_name, value) }
|
179
|
+
end
|
180
|
+
|
181
|
+
[2,3,4].each do |intersection_size|
|
182
|
+
small_counters.combination(intersection_size).each do |counter_names|
|
183
|
+
intersection_estimate = counter.intersection(*counter_names)
|
184
|
+
intersection_estimate.should >= 0
|
185
|
+
(intersection_estimate - 100).abs.should < intersection_size * (max_items + 100) * expected_relative_error
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
end
|
190
|
+
|
139
191
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hyperloglog-redis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-28 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|
16
|
-
requirement: &
|
16
|
+
requirement: &2157248980 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.1.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2157248980
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &2157248500 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 3.0.1
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2157248500
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &2157248020 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.8.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2157248020
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &2157247540 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.9.2.2
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2157247540
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &2157247060 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: 2.11.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2157247060
|
69
69
|
description: An implementation of the HyperLogLog set cardinality estimation algorithm
|
70
70
|
in Ruby using Redis as a back-end
|
71
71
|
email: aaron.windsor@gmail.com
|
@@ -103,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
103
103
|
version: '0'
|
104
104
|
segments:
|
105
105
|
- 0
|
106
|
-
hash:
|
106
|
+
hash: 1810415248006536813
|
107
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
108
|
none: false
|
109
109
|
requirements:
|