hyperloglog-redis 0.3.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HISTORY.md +17 -0
- data/README.md +6 -1
- data/VERSION +1 -1
- data/hyperloglog-redis.gemspec +3 -2
- data/lib/hyper_log_log.rb +33 -17
- data/spec/hyper_log_log_spec.rb +33 -0
- metadata +14 -13
data/HISTORY.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
## 1.0.0 (10/26/2012)
|
2
|
+
|
3
|
+
* Changed the underlying storage from Redis sorted sets to Redis hashes. This
|
4
|
+
is a breaking change: if you have existing counters stored from earlier
|
5
|
+
versions of this library, you can upgrade them with something like the
|
6
|
+
following method:
|
7
|
+
|
8
|
+
def upgrade(counter, redis)
|
9
|
+
return if redis.type(counter) == "hash"
|
10
|
+
values = redis.zrange(counter, 0, -1, {withscores: true})
|
11
|
+
redis.del(counter)
|
12
|
+
values.each { |key, value| redis.hset(counter, key, value.to_i) }
|
13
|
+
end
|
14
|
+
|
15
|
+
* Added union_store command, which stores the results of a union for querying
|
16
|
+
or combining with other sets later
|
17
|
+
|
data/README.md
CHANGED
@@ -53,7 +53,12 @@ You can also ask for an estimate of the union from multiple counters:
|
|
53
53
|
|
54
54
|
The same relative error guarantee above applies to unions: a union of
|
55
55
|
size N can be estimated to within N * (1.04 / Math.sqrt(2 ** b)) elements,
|
56
|
-
regardless of how many HyperLogLog counters that union spans.
|
56
|
+
regardless of how many HyperLogLog counters that union spans. You can store
|
57
|
+
a unioned counter for querying or combining later with `union_store`:
|
58
|
+
|
59
|
+
counter.union_store('all_beatles_and_wings_members', 'beatles', 'wings')
|
60
|
+
|
61
|
+
puts "There are approximately #{counter.count('all_beatles_and_wings_members')} people who were in the Beatles or Wings"
|
57
62
|
|
58
63
|
Intersections can also be estimated:
|
59
64
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
1.0.0
|
data/hyperloglog-redis.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "hyperloglog-redis"
|
8
|
-
s.version = "0.3.0"
|
8
|
+
s.version = "1.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Aaron Windsor"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-10-26"
|
13
13
|
s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
|
14
14
|
s.email = "aaron.windsor@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -21,6 +21,7 @@ Gem::Specification.new do |s|
|
|
21
21
|
".rspec",
|
22
22
|
"Gemfile",
|
23
23
|
"Gemfile.lock",
|
24
|
+
"HISTORY.md",
|
24
25
|
"LICENSE.txt",
|
25
26
|
"README.md",
|
26
27
|
"Rakefile",
|
data/lib/hyper_log_log.rb
CHANGED
@@ -19,30 +19,47 @@ class HyperLogLog
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def add(counter_name, value)
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
22
|
+
hash = MurmurHash3::V32.murmur3_32_str_hash(value)
|
23
|
+
function_name = hash % @m
|
24
|
+
w = hash / @m
|
25
|
+
existing_value = (@redis.hget(counter_name, function_name) || 0).to_i
|
26
|
+
new_value = [existing_value, rho(w)].max
|
27
|
+
@redis.hset(counter_name, function_name, new_value) if new_value > existing_value
|
27
28
|
end
|
28
29
|
|
30
|
+
# Estimate the cardinality of a single set
|
29
31
|
def count(counter_name)
|
30
32
|
union_helper([counter_name])
|
31
33
|
end
|
32
34
|
|
35
|
+
# Estimate the cardinality of the union of several sets
|
33
36
|
def union(*counter_names)
|
34
37
|
union_helper(counter_names)
|
35
38
|
end
|
36
39
|
|
40
|
+
# Store the union of several sets in *destination* so that it can be used as
|
41
|
+
# a HyperLogLog counter later.
|
42
|
+
def union_store(destination, *counter_names)
|
43
|
+
raw_union(counter_names).each do |key, count|
|
44
|
+
@redis.hset(destination, key, count)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Estimate the cardinality of the intersection of several sets. We do this by
|
49
|
+
# using the principle of inclusion and exclusion to represent the size of the
|
50
|
+
# intersection as the alternating sum of an exponential number of
|
51
|
+
# cardinalities of unions of smaller sets.
|
37
52
|
def intersection(*counter_names)
|
38
|
-
|
53
|
+
icount = (1..counter_names.length).map do |k|
|
54
|
+
counter_names.combination(k).map do |group|
|
55
|
+
((k % 2 == 0) ? -1 : 1) * union_helper(group)
|
56
|
+
end.inject(0, :+)
|
57
|
+
end.inject(0, :+)
|
58
|
+
[icount, 0].max
|
39
59
|
end
|
40
60
|
|
41
61
|
def union_helper(counter_names)
|
42
|
-
all_estimates = counter_names.map{ |
|
43
|
-
.reduce(:concat)
|
44
|
-
.group_by{ |value, score| value }
|
45
|
-
.map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
|
62
|
+
all_estimates = raw_union(counter_names).map{ |value, score| 2 ** -score }
|
46
63
|
estimate_sum = all_estimates.reduce(:+) || 0
|
47
64
|
estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
|
48
65
|
if estimate <= 2.5 * @m
|
@@ -58,13 +75,11 @@ class HyperLogLog
|
|
58
75
|
end
|
59
76
|
end
|
60
77
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
end.inject(0, :+)
|
67
|
-
((-1) ** (counter_names.length + 1)) * sum
|
78
|
+
def raw_union(counter_names)
|
79
|
+
counter_names.map{ |counter_name| @redis.hgetall(counter_name).map{ |x,y| [x, y.to_i] } }
|
80
|
+
.reduce(:concat)
|
81
|
+
.group_by{ |key, count| key }
|
82
|
+
.map{ |key, counters| [key, counters.map{ |x| x.last }.max] }
|
68
83
|
end
|
69
84
|
|
70
85
|
# rho(i) is the position of the first 1 in the binary representation of i,
|
@@ -77,4 +92,5 @@ class HyperLogLog
|
|
77
92
|
@bits_in_hash - Math.log(i, 2).floor
|
78
93
|
end
|
79
94
|
end
|
95
|
+
|
80
96
|
end
|
data/spec/hyper_log_log_spec.rb
CHANGED
@@ -39,6 +39,39 @@ describe HyperLogLog do
|
|
39
39
|
counter.count("counter1").should > counter.count("counter2")
|
40
40
|
end
|
41
41
|
|
42
|
+
it "can exactly count small sets" do
|
43
|
+
redis = Redis.new
|
44
|
+
counter = HyperLogLog.new(redis, 11)
|
45
|
+
10.times { |i| counter.add("mycounter", i.to_s) }
|
46
|
+
counter.count("mycounter").should == 10
|
47
|
+
end
|
48
|
+
|
49
|
+
it "can exactly count small unions" do
|
50
|
+
redis = Redis.new
|
51
|
+
counter = HyperLogLog.new(redis, 11)
|
52
|
+
(1..8).each { |i| counter.add("mycounter1", i.to_s) }
|
53
|
+
(5..12).each { |i| counter.add("mycounter2", i.to_s) }
|
54
|
+
counter.union("mycounter1", "mycounter2").should == 12
|
55
|
+
end
|
56
|
+
|
57
|
+
it "can exactly count small intersections" do
|
58
|
+
redis = Redis.new
|
59
|
+
counter = HyperLogLog.new(redis, 11)
|
60
|
+
(1..8).each { |i| counter.add("mycounter1", i.to_s) }
|
61
|
+
(5..12).each { |i| counter.add("mycounter2", i.to_s) }
|
62
|
+
counter.intersection("mycounter1", "mycounter2").should == 4
|
63
|
+
end
|
64
|
+
|
65
|
+
it "can store unions for querying later" do
|
66
|
+
redis = Redis.new
|
67
|
+
counter = HyperLogLog.new(redis, 11)
|
68
|
+
(1..10).each { |i| counter.add("mycounter1", i.to_s) }
|
69
|
+
(5..15).each { |i| counter.add("mycounter2", i.to_s) }
|
70
|
+
(15..25).each { |i| counter.add("mycounter3", i.to_s) }
|
71
|
+
(20..50).each { |i| counter.add("mycounter4", i.to_s) }
|
72
|
+
counter.union_store("aggregate_counter", "mycounter1", "mycounter2", "mycounter3", "mycounter4")
|
73
|
+
counter.union("mycounter1", "mycounter2", "mycounter3", "mycounter4").should == counter.count("aggregate_counter")
|
74
|
+
end
|
42
75
|
|
43
76
|
# With parameter b, HyperLogLog should produce estimates that have
|
44
77
|
# relative error of 1.04 / Math.sqrt(2 ** b). Of course, this analysis
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hyperloglog-redis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-26 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|
16
|
-
requirement: &
|
16
|
+
requirement: &2172774780 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.1.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2172774780
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &2172774300 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 3.0.1
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2172774300
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &2172773820 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.8.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2172773820
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &2172773340 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.9.2.2
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2172773340
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &2172789220 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: 2.11.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2172789220
|
69
69
|
description: An implementation of the HyperLogLog set cardinality estimation algorithm
|
70
70
|
in Ruby using Redis as a back-end
|
71
71
|
email: aaron.windsor@gmail.com
|
@@ -79,6 +79,7 @@ files:
|
|
79
79
|
- .rspec
|
80
80
|
- Gemfile
|
81
81
|
- Gemfile.lock
|
82
|
+
- HISTORY.md
|
82
83
|
- LICENSE.txt
|
83
84
|
- README.md
|
84
85
|
- Rakefile
|
@@ -103,7 +104,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
103
104
|
version: '0'
|
104
105
|
segments:
|
105
106
|
- 0
|
106
|
-
hash:
|
107
|
+
hash: 2426569210961737114
|
107
108
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
109
|
none: false
|
109
110
|
requirements:
|