hyperloglog-redis 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +10 -1
- data/VERSION +1 -1
- data/hyperloglog-redis.gemspec +64 -0
- data/lib/hyper_log_log.rb +6 -3
- data/spec/hyper_log_log_spec.rb +56 -1
- data/spec/spec_helper.rb +3 -0
- metadata +14 -13
data/README.md
CHANGED
@@ -14,7 +14,16 @@ instance is used for storing the counters. A simple example:
|
|
14
14
|
counter.add('beatles', beatle)
|
15
15
|
end
|
16
16
|
|
17
|
-
puts "There are approximately #{counter.count('beatles')} distinct
|
17
|
+
puts "There are approximately #{counter.count('beatles')} distinct Beatles"
|
18
|
+
|
19
|
+
You can also ask for an estimate from multiple counters and you'll get
|
20
|
+
an estimate of the size of their union:
|
21
|
+
|
22
|
+
['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wing_member|
|
23
|
+
counter.add('wings', wing_member)
|
24
|
+
end
|
25
|
+
|
26
|
+
puts "There are approximately #{counter.count('beatles', 'wings')} people who were in the Beatles or Wings"
|
18
27
|
|
19
28
|
Each HyperLogLog counter uses a small, fixed amount of space but can
|
20
29
|
estimate the cardinality of any set of up to around a billion values with
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/hyperloglog-redis.gemspec
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "hyperloglog-redis"
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Aaron Windsor"]
|
12
|
+
s.date = "2012-09-27"
|
13
|
+
s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
|
14
|
+
s.email = "aaron.windsor@gmail.com"
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README.md"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".rspec",
|
22
|
+
"Gemfile",
|
23
|
+
"Gemfile.lock",
|
24
|
+
"LICENSE.txt",
|
25
|
+
"README.md",
|
26
|
+
"Rakefile",
|
27
|
+
"VERSION",
|
28
|
+
"hyperloglog-redis.gemspec",
|
29
|
+
"lib/hyper_log_log.rb",
|
30
|
+
"lib/hyperloglog-redis.rb",
|
31
|
+
"spec/hyper_log_log_spec.rb",
|
32
|
+
"spec/spec_helper.rb"
|
33
|
+
]
|
34
|
+
s.homepage = "http://github.com/aaw/hyperloglog-redis"
|
35
|
+
s.licenses = ["MIT"]
|
36
|
+
s.require_paths = ["lib"]
|
37
|
+
s.rubygems_version = "1.8.10"
|
38
|
+
s.summary = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
|
39
|
+
|
40
|
+
if s.respond_to? :specification_version then
|
41
|
+
s.specification_version = 3
|
42
|
+
|
43
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
44
|
+
s.add_runtime_dependency(%q<murmurhash3>, ["~> 0.1.3"])
|
45
|
+
s.add_runtime_dependency(%q<redis>, ["~> 3.0.1"])
|
46
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
|
47
|
+
s.add_development_dependency(%q<rake>, ["~> 0.9.2.2"])
|
48
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.11.0"])
|
49
|
+
else
|
50
|
+
s.add_dependency(%q<murmurhash3>, ["~> 0.1.3"])
|
51
|
+
s.add_dependency(%q<redis>, ["~> 3.0.1"])
|
52
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
53
|
+
s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
|
54
|
+
s.add_dependency(%q<rspec>, ["~> 2.11.0"])
|
55
|
+
end
|
56
|
+
else
|
57
|
+
s.add_dependency(%q<murmurhash3>, ["~> 0.1.3"])
|
58
|
+
s.add_dependency(%q<redis>, ["~> 3.0.1"])
|
59
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
60
|
+
s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
|
61
|
+
s.add_dependency(%q<rspec>, ["~> 2.11.0"])
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
data/lib/hyper_log_log.rb
CHANGED
@@ -26,9 +26,12 @@ class HyperLogLog
|
|
26
26
|
@redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
|
27
27
|
end
|
28
28
|
|
29
|
-
def count(counter_name)
|
30
|
-
all_estimates = @redis.zrange(counter_name, 0, -1, {withscores: true})
|
31
|
-
|
29
|
+
def count(*counter_names)
|
30
|
+
all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
|
31
|
+
.reduce(:concat)
|
32
|
+
.group_by{ |value, score| value }
|
33
|
+
.map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
|
34
|
+
estimate_sum = all_estimates.reduce(:+) || 0
|
32
35
|
estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
|
33
36
|
if estimate <= 2.5 * @m
|
34
37
|
if all_estimates.length == @m
|
data/spec/hyper_log_log_spec.rb
CHANGED
@@ -58,7 +58,7 @@ describe HyperLogLog do
|
|
58
58
|
# implementation, since it exercises all of the cases in HyperLogLog's
|
59
59
|
# count method except for the correction for very large set sizes.
|
60
60
|
|
61
|
-
it "produces acceptable estimates" do
|
61
|
+
it "produces acceptable estimates for counts" do
|
62
62
|
max_items = 1000
|
63
63
|
redis = Redis.new
|
64
64
|
(6..16).each do |b|
|
@@ -81,4 +81,59 @@ describe HyperLogLog do
|
|
81
81
|
end
|
82
82
|
end
|
83
83
|
|
84
|
+
it "produces acceptable estimates for unions with few elements in common" do
|
85
|
+
b, max_items = 10, 2000
|
86
|
+
counter = HyperLogLog.new(Redis.new, b)
|
87
|
+
bad_estimates = 0
|
88
|
+
very_bad_estimates = 0
|
89
|
+
expected_relative_error = 1.04 / Math.sqrt(2 ** b)
|
90
|
+
max_items.times do |i|
|
91
|
+
value1 = Digest::MD5.hexdigest("value#{i}")
|
92
|
+
counter.add("mycounter1", value1)
|
93
|
+
value2 = Digest::MD5.hexdigest("value#{i}incounter2")
|
94
|
+
counter.add("mycounter2", value2)
|
95
|
+
value3 = Digest::MD5.hexdigest("this is value#{i}")
|
96
|
+
counter.add("mycounter3", value3)
|
97
|
+
actual = 3 * (i + 1)
|
98
|
+
approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
|
99
|
+
relative_error = (actual - approximate).abs / Float(actual)
|
100
|
+
bad_estimates += 1 if relative_error > expected_relative_error * 2
|
101
|
+
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
|
102
|
+
end
|
103
|
+
bad_estimates.should < (3 * max_items) / 100.00
|
104
|
+
very_bad_estimates.should == 0
|
105
|
+
end
|
106
|
+
|
107
|
+
it "produces acceptable estimates for unions with many elements in common" do
|
108
|
+
b, max_items, intersection_size = 10, 1000, 2000
|
109
|
+
counter = HyperLogLog.new(Redis.new, b)
|
110
|
+
bad_estimates = 0
|
111
|
+
very_bad_estimates = 0
|
112
|
+
expected_relative_error = 1.04 / Math.sqrt(2 ** b)
|
113
|
+
|
114
|
+
intersection_size.times do |i|
|
115
|
+
value = Digest::MD5.hexdigest("test#{i}value")
|
116
|
+
['mycounter1', 'mycounter2', 'mycounter3'].each do |counter_name|
|
117
|
+
counter.add(counter_name, value)
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
max_items.times do |i|
|
122
|
+
value1 = Digest::MD5.hexdigest("value#{i}")
|
123
|
+
counter.add("mycounter1", value1)
|
124
|
+
value2 = Digest::MD5.hexdigest("value#{i}isincounter2")
|
125
|
+
counter.add("mycounter2", value2)
|
126
|
+
value3 = Digest::MD5.hexdigest("this is value#{i}")
|
127
|
+
counter.add("mycounter3", value3)
|
128
|
+
actual = 3 * (i + 1) + intersection_size
|
129
|
+
approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
|
130
|
+
relative_error = (actual - approximate).abs / Float(actual)
|
131
|
+
bad_estimates += 1 if relative_error > expected_relative_error * 2
|
132
|
+
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
|
133
|
+
end
|
134
|
+
|
135
|
+
bad_estimates.should < ((3 * max_items) + intersection_size) / 100.00
|
136
|
+
very_bad_estimates.should == 0
|
137
|
+
end
|
138
|
+
|
84
139
|
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hyperloglog-redis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-27 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|
16
|
-
requirement: &
|
16
|
+
requirement: &2172688860 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.1.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2172688860
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &2172688380 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 3.0.1
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2172688380
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &2172687900 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.8.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2172687900
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &2172687420 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.9.2.2
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2172687420
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &2172686940 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: 2.11.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2172686940
|
69
69
|
description: An implementation of the HyperLogLog set cardinality estimation algorithm
|
70
70
|
in Ruby using Redis as a back-end
|
71
71
|
email: aaron.windsor@gmail.com
|
@@ -83,6 +83,7 @@ files:
|
|
83
83
|
- README.md
|
84
84
|
- Rakefile
|
85
85
|
- VERSION
|
86
|
+
- hyperloglog-redis.gemspec
|
86
87
|
- lib/hyper_log_log.rb
|
87
88
|
- lib/hyperloglog-redis.rb
|
88
89
|
- spec/hyper_log_log_spec.rb
|
@@ -102,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
102
103
|
version: '0'
|
103
104
|
segments:
|
104
105
|
- 0
|
105
|
-
hash:
|
106
|
+
hash: -1980876148320618453
|
106
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
107
108
|
none: false
|
108
109
|
requirements:
|