hyperloglog-redis 0.3.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/HISTORY.md +17 -0
- data/README.md +6 -1
- data/VERSION +1 -1
- data/hyperloglog-redis.gemspec +3 -2
- data/lib/hyper_log_log.rb +33 -17
- data/spec/hyper_log_log_spec.rb +33 -0
- metadata +14 -13
data/HISTORY.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
## 1.0.0 (10/26/2012)
|
2
|
+
|
3
|
+
* Changed the underlying storage from Redis sorted sets to Redis hashes. This
|
4
|
+
is a breaking change; if you have existing counters stored from earlier
|
5
|
+
versions of this library, you can upgrade them with something like the
|
6
|
+
following method:
|
7
|
+
|
8
|
+
def upgrade(counter, redis)
|
9
|
+
return if redis.type(counter) == "hash"
|
10
|
+
values = redis.zrange(counter, 0, -1, {withscores: true})
|
11
|
+
redis.del(counter)
|
12
|
+
values.each { |key, value| redis.hset(counter, key, value.to_i) }
|
13
|
+
end
|
14
|
+
|
15
|
+
* Added union_store command, which stores the results of a union for querying
|
16
|
+
or combining with other sets later
|
17
|
+
|
data/README.md
CHANGED
@@ -53,7 +53,12 @@ You can also ask for an estimate of the union from multiple counters:
|
|
53
53
|
|
54
54
|
The same relative error guarantee above applies to unions: a union of
|
55
55
|
size N can be estimated to within N * (1.04 / Math.sqrt(2 ** b)) elements,
|
56
|
-
regardless of how many HyperLogLog counters that union spans.
|
56
|
+
regardless of how many HyperLogLog counters that union spans. You can store
|
57
|
+
a unioned counter for querying or combining later with `union_store`:
|
58
|
+
|
59
|
+
counter.union_store('all_beatles_and_wings_members', 'beatles', 'wings')
|
60
|
+
|
61
|
+
puts "There are approximately #{counter.count('all_beatles_and_wings_members')} people who were in the Beatles or Wings"
|
57
62
|
|
58
63
|
Intersections can also be estimated:
|
59
64
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
1.0.0
|
data/hyperloglog-redis.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "hyperloglog-redis"
|
8
|
-
s.version = "0.3.0"
|
8
|
+
s.version = "1.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Aaron Windsor"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-10-26"
|
13
13
|
s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
|
14
14
|
s.email = "aaron.windsor@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -21,6 +21,7 @@ Gem::Specification.new do |s|
|
|
21
21
|
".rspec",
|
22
22
|
"Gemfile",
|
23
23
|
"Gemfile.lock",
|
24
|
+
"HISTORY.md",
|
24
25
|
"LICENSE.txt",
|
25
26
|
"README.md",
|
26
27
|
"Rakefile",
|
data/lib/hyper_log_log.rb
CHANGED
@@ -19,30 +19,47 @@ class HyperLogLog
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def add(counter_name, value)
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
22
|
+
hash = MurmurHash3::V32.murmur3_32_str_hash(value)
|
23
|
+
function_name = hash % @m
|
24
|
+
w = hash / @m
|
25
|
+
existing_value = (@redis.hget(counter_name, function_name) || 0).to_i
|
26
|
+
new_value = [existing_value, rho(w)].max
|
27
|
+
@redis.hset(counter_name, function_name, new_value) if new_value > existing_value
|
27
28
|
end
|
28
29
|
|
30
|
+
# Estimate the cardinality of a single set
|
29
31
|
def count(counter_name)
|
30
32
|
union_helper([counter_name])
|
31
33
|
end
|
32
34
|
|
35
|
+
# Estimate the cardinality of the union of several sets
|
33
36
|
def union(*counter_names)
|
34
37
|
union_helper(counter_names)
|
35
38
|
end
|
36
39
|
|
40
|
+
# Store the union of several sets in *destination* so that it can be used as
|
41
|
+
# a HyperLogLog counter later.
|
42
|
+
def union_store(destination, *counter_names)
|
43
|
+
raw_union(counter_names).each do |key, count|
|
44
|
+
@redis.hset(destination, key, count)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Estimate the cardinality of the intersection of several sets. We do this by
|
49
|
+
# using the principle of inclusion and exclusion to represent the size of the
|
50
|
+
# intersection as the alternating sum of an exponential number of
|
51
|
+
# cardinalities of unions of smaller sets.
|
37
52
|
def intersection(*counter_names)
|
38
|
-
|
53
|
+
icount = (1..counter_names.length).map do |k|
|
54
|
+
counter_names.combination(k).map do |group|
|
55
|
+
((k % 2 == 0) ? -1 : 1) * union_helper(group)
|
56
|
+
end.inject(0, :+)
|
57
|
+
end.inject(0, :+)
|
58
|
+
[icount, 0].max
|
39
59
|
end
|
40
60
|
|
41
61
|
def union_helper(counter_names)
|
42
|
-
all_estimates = counter_names.map{ |
|
43
|
-
.reduce(:concat)
|
44
|
-
.group_by{ |value, score| value }
|
45
|
-
.map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
|
62
|
+
all_estimates = raw_union(counter_names).map{ |value, score| 2 ** -score }
|
46
63
|
estimate_sum = all_estimates.reduce(:+) || 0
|
47
64
|
estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
|
48
65
|
if estimate <= 2.5 * @m
|
@@ -58,13 +75,11 @@ class HyperLogLog
|
|
58
75
|
end
|
59
76
|
end
|
60
77
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
end.inject(0, :+)
|
67
|
-
((-1) ** (counter_names.length + 1)) * sum
|
78
|
+
def raw_union(counter_names)
|
79
|
+
counter_names.map{ |counter_name| @redis.hgetall(counter_name).map{ |x,y| [x, y.to_i] } }
|
80
|
+
.reduce(:concat)
|
81
|
+
.group_by{ |key, count| key }
|
82
|
+
.map{ |key, counters| [key, counters.map{ |x| x.last }.max] }
|
68
83
|
end
|
69
84
|
|
70
85
|
# rho(i) is the position of the first 1 in the binary representation of i,
|
@@ -77,4 +92,5 @@ class HyperLogLog
|
|
77
92
|
@bits_in_hash - Math.log(i, 2).floor
|
78
93
|
end
|
79
94
|
end
|
95
|
+
|
80
96
|
end
|
data/spec/hyper_log_log_spec.rb
CHANGED
@@ -39,6 +39,39 @@ describe HyperLogLog do
|
|
39
39
|
counter.count("counter1").should > counter.count("counter2")
|
40
40
|
end
|
41
41
|
|
42
|
+
it "can exactly count small sets" do
|
43
|
+
redis = Redis.new
|
44
|
+
counter = HyperLogLog.new(redis, 11)
|
45
|
+
10.times { |i| counter.add("mycounter", i.to_s) }
|
46
|
+
counter.count("mycounter").should == 10
|
47
|
+
end
|
48
|
+
|
49
|
+
it "can exactly count small unions" do
|
50
|
+
redis = Redis.new
|
51
|
+
counter = HyperLogLog.new(redis, 11)
|
52
|
+
(1..8).each { |i| counter.add("mycounter1", i.to_s) }
|
53
|
+
(5..12).each { |i| counter.add("mycounter2", i.to_s) }
|
54
|
+
counter.union("mycounter1", "mycounter2").should == 12
|
55
|
+
end
|
56
|
+
|
57
|
+
it "can exactly count small intersections" do
|
58
|
+
redis = Redis.new
|
59
|
+
counter = HyperLogLog.new(redis, 11)
|
60
|
+
(1..8).each { |i| counter.add("mycounter1", i.to_s) }
|
61
|
+
(5..12).each { |i| counter.add("mycounter2", i.to_s) }
|
62
|
+
counter.intersection("mycounter1", "mycounter2").should == 4
|
63
|
+
end
|
64
|
+
|
65
|
+
it "can store unions for querying later" do
|
66
|
+
redis = Redis.new
|
67
|
+
counter = HyperLogLog.new(redis, 11)
|
68
|
+
(1..10).each { |i| counter.add("mycounter1", i.to_s) }
|
69
|
+
(5..15).each { |i| counter.add("mycounter2", i.to_s) }
|
70
|
+
(15..25).each { |i| counter.add("mycounter3", i.to_s) }
|
71
|
+
(20..50).each { |i| counter.add("mycounter4", i.to_s) }
|
72
|
+
counter.union_store("aggregate_counter", "mycounter1", "mycounter2", "mycounter3", "mycounter4")
|
73
|
+
counter.union("mycounter1", "mycounter2", "mycounter3", "mycounter4").should == counter.count("aggregate_counter")
|
74
|
+
end
|
42
75
|
|
43
76
|
# With parameter b, HyperLogLog should produce estimates that have
|
44
77
|
# relative error of 1.04 / Math.sqrt(2 ** b). Of course, this analysis
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hyperloglog-redis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-26 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|
16
|
-
requirement: &
|
16
|
+
requirement: &2172774780 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.1.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2172774780
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &2172774300 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 3.0.1
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2172774300
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &2172773820 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.8.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2172773820
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &2172773340 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.9.2.2
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2172773340
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &2172789220 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: 2.11.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2172789220
|
69
69
|
description: An implementation of the HyperLogLog set cardinality estimation algorithm
|
70
70
|
in Ruby using Redis as a back-end
|
71
71
|
email: aaron.windsor@gmail.com
|
@@ -79,6 +79,7 @@ files:
|
|
79
79
|
- .rspec
|
80
80
|
- Gemfile
|
81
81
|
- Gemfile.lock
|
82
|
+
- HISTORY.md
|
82
83
|
- LICENSE.txt
|
83
84
|
- README.md
|
84
85
|
- Rakefile
|
@@ -103,7 +104,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
103
104
|
version: '0'
|
104
105
|
segments:
|
105
106
|
- 0
|
106
|
-
hash:
|
107
|
+
hash: 2426569210961737114
|
107
108
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
109
|
none: false
|
109
110
|
requirements:
|