hyperloglog-redis 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +10 -1
- data/VERSION +1 -1
- data/hyperloglog-redis.gemspec +64 -0
- data/lib/hyper_log_log.rb +6 -3
- data/spec/hyper_log_log_spec.rb +56 -1
- data/spec/spec_helper.rb +3 -0
- metadata +14 -13
data/README.md
CHANGED
@@ -14,7 +14,16 @@ instance is used for storing the counters. A simple example:
|
|
14
14
|
counter.add('beatles', beatle)
|
15
15
|
end
|
16
16
|
|
17
|
-
puts "There are approximately #{counter.count('beatles')} distinct
|
17
|
+
puts "There are approximately #{counter.count('beatles')} distinct Beatles"
|
18
|
+
|
19
|
+
You can also ask for an estimate from multiple counters and you'll get
|
20
|
+
an estimate of the size of their union:
|
21
|
+
|
22
|
+
['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wing_member|
|
23
|
+
counter.add('wings', wing_member)
|
24
|
+
end
|
25
|
+
|
26
|
+
puts "There are approximately #{counter.count('beatles', 'wings')} people who were in the Beatles or Wings"
|
18
27
|
|
19
28
|
Each HyperLogLog counter uses a small, fixed amount of space but can
|
20
29
|
estimate the cardinality of any set of up to around a billion values with
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/hyperloglog-redis.gemspec
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "hyperloglog-redis"
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Aaron Windsor"]
|
12
|
+
s.date = "2012-09-27"
|
13
|
+
s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
|
14
|
+
s.email = "aaron.windsor@gmail.com"
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README.md"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".rspec",
|
22
|
+
"Gemfile",
|
23
|
+
"Gemfile.lock",
|
24
|
+
"LICENSE.txt",
|
25
|
+
"README.md",
|
26
|
+
"Rakefile",
|
27
|
+
"VERSION",
|
28
|
+
"hyperloglog-redis.gemspec",
|
29
|
+
"lib/hyper_log_log.rb",
|
30
|
+
"lib/hyperloglog-redis.rb",
|
31
|
+
"spec/hyper_log_log_spec.rb",
|
32
|
+
"spec/spec_helper.rb"
|
33
|
+
]
|
34
|
+
s.homepage = "http://github.com/aaw/hyperloglog-redis"
|
35
|
+
s.licenses = ["MIT"]
|
36
|
+
s.require_paths = ["lib"]
|
37
|
+
s.rubygems_version = "1.8.10"
|
38
|
+
s.summary = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
|
39
|
+
|
40
|
+
if s.respond_to? :specification_version then
|
41
|
+
s.specification_version = 3
|
42
|
+
|
43
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
44
|
+
s.add_runtime_dependency(%q<murmurhash3>, ["~> 0.1.3"])
|
45
|
+
s.add_runtime_dependency(%q<redis>, ["~> 3.0.1"])
|
46
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
|
47
|
+
s.add_development_dependency(%q<rake>, ["~> 0.9.2.2"])
|
48
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.11.0"])
|
49
|
+
else
|
50
|
+
s.add_dependency(%q<murmurhash3>, ["~> 0.1.3"])
|
51
|
+
s.add_dependency(%q<redis>, ["~> 3.0.1"])
|
52
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
53
|
+
s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
|
54
|
+
s.add_dependency(%q<rspec>, ["~> 2.11.0"])
|
55
|
+
end
|
56
|
+
else
|
57
|
+
s.add_dependency(%q<murmurhash3>, ["~> 0.1.3"])
|
58
|
+
s.add_dependency(%q<redis>, ["~> 3.0.1"])
|
59
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
60
|
+
s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
|
61
|
+
s.add_dependency(%q<rspec>, ["~> 2.11.0"])
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
data/lib/hyper_log_log.rb
CHANGED
@@ -26,9 +26,12 @@ class HyperLogLog
|
|
26
26
|
@redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
|
27
27
|
end
|
28
28
|
|
29
|
-
def count(counter_name)
|
30
|
-
all_estimates = @redis.zrange(counter_name, 0, -1, {withscores: true})
|
31
|
-
|
29
|
+
def count(*counter_names)
|
30
|
+
all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
|
31
|
+
.reduce(:concat)
|
32
|
+
.group_by{ |value, score| value }
|
33
|
+
.map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
|
34
|
+
estimate_sum = all_estimates.reduce(:+) || 0
|
32
35
|
estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
|
33
36
|
if estimate <= 2.5 * @m
|
34
37
|
if all_estimates.length == @m
|
data/spec/hyper_log_log_spec.rb
CHANGED
@@ -58,7 +58,7 @@ describe HyperLogLog do
|
|
58
58
|
# implementation, since it exercises all of the cases in HyperLogLog's
|
59
59
|
# count method except for the correction for very large set sizes.
|
60
60
|
|
61
|
-
it "produces acceptable estimates" do
|
61
|
+
it "produces acceptable estimates for counts" do
|
62
62
|
max_items = 1000
|
63
63
|
redis = Redis.new
|
64
64
|
(6..16).each do |b|
|
@@ -81,4 +81,59 @@ describe HyperLogLog do
|
|
81
81
|
end
|
82
82
|
end
|
83
83
|
|
84
|
+
it "produces acceptable estimates for unions with few elements in common" do
|
85
|
+
b, max_items = 10, 2000
|
86
|
+
counter = HyperLogLog.new(Redis.new, b)
|
87
|
+
bad_estimates = 0
|
88
|
+
very_bad_estimates = 0
|
89
|
+
expected_relative_error = 1.04 / Math.sqrt(2 ** b)
|
90
|
+
max_items.times do |i|
|
91
|
+
value1 = Digest::MD5.hexdigest("value#{i}")
|
92
|
+
counter.add("mycounter1", value1)
|
93
|
+
value2 = Digest::MD5.hexdigest("value#{i}incounter2")
|
94
|
+
counter.add("mycounter2", value2)
|
95
|
+
value3 = Digest::MD5.hexdigest("this is value#{i}")
|
96
|
+
counter.add("mycounter3", value3)
|
97
|
+
actual = 3 * (i + 1)
|
98
|
+
approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
|
99
|
+
relative_error = (actual - approximate).abs / Float(actual)
|
100
|
+
bad_estimates += 1 if relative_error > expected_relative_error * 2
|
101
|
+
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
|
102
|
+
end
|
103
|
+
bad_estimates.should < (3 * max_items) / 100.00
|
104
|
+
very_bad_estimates.should == 0
|
105
|
+
end
|
106
|
+
|
107
|
+
it "produces acceptable estimates for unions with many elements in common" do
|
108
|
+
b, max_items, intersection_size = 10, 1000, 2000
|
109
|
+
counter = HyperLogLog.new(Redis.new, b)
|
110
|
+
bad_estimates = 0
|
111
|
+
very_bad_estimates = 0
|
112
|
+
expected_relative_error = 1.04 / Math.sqrt(2 ** b)
|
113
|
+
|
114
|
+
intersection_size.times do |i|
|
115
|
+
value = Digest::MD5.hexdigest("test#{i}value")
|
116
|
+
['mycounter1', 'mycounter2', 'mycounter3'].each do |counter_name|
|
117
|
+
counter.add(counter_name, value)
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
max_items.times do |i|
|
122
|
+
value1 = Digest::MD5.hexdigest("value#{i}")
|
123
|
+
counter.add("mycounter1", value1)
|
124
|
+
value2 = Digest::MD5.hexdigest("value#{i}isincounter2")
|
125
|
+
counter.add("mycounter2", value2)
|
126
|
+
value3 = Digest::MD5.hexdigest("this is value#{i}")
|
127
|
+
counter.add("mycounter3", value3)
|
128
|
+
actual = 3 * (i + 1) + intersection_size
|
129
|
+
approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
|
130
|
+
relative_error = (actual - approximate).abs / Float(actual)
|
131
|
+
bad_estimates += 1 if relative_error > expected_relative_error * 2
|
132
|
+
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
|
133
|
+
end
|
134
|
+
|
135
|
+
bad_estimates.should < ((3 * max_items) + intersection_size) / 100.00
|
136
|
+
very_bad_estimates.should == 0
|
137
|
+
end
|
138
|
+
|
84
139
|
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hyperloglog-redis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-27 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|
16
|
-
requirement: &
|
16
|
+
requirement: &2172688860 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.1.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2172688860
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &2172688380 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 3.0.1
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2172688380
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &2172687900 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.8.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2172687900
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &2172687420 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.9.2.2
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2172687420
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &2172686940 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: 2.11.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2172686940
|
69
69
|
description: An implementation of the HyperLogLog set cardinality estimation algorithm
|
70
70
|
in Ruby using Redis as a back-end
|
71
71
|
email: aaron.windsor@gmail.com
|
@@ -83,6 +83,7 @@ files:
|
|
83
83
|
- README.md
|
84
84
|
- Rakefile
|
85
85
|
- VERSION
|
86
|
+
- hyperloglog-redis.gemspec
|
86
87
|
- lib/hyper_log_log.rb
|
87
88
|
- lib/hyperloglog-redis.rb
|
88
89
|
- spec/hyper_log_log_spec.rb
|
@@ -102,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
102
103
|
version: '0'
|
103
104
|
segments:
|
104
105
|
- 0
|
105
|
-
hash:
|
106
|
+
hash: -1980876148320618453
|
106
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
107
108
|
none: false
|
108
109
|
requirements:
|