hyperloglog-redis 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -14,7 +14,16 @@ instance is used for storing the counters. A simple example:
14
14
  counter.add('beatles', beatle)
15
15
  end
16
16
 
17
- puts "There are approximately #{counter.count('beatles')} distinct beatles!"
17
+ puts "There are approximately #{counter.count('beatles')} distinct Beatles"
18
+
19
+ You can also ask for an estimate from multiple counters and you'll get
20
+ an estimate of the size of their union:
21
+
22
+ ['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wing_member|
23
+ counter.add('wings', wing_member)
24
+ end
25
+
26
+ puts "There are approximately #{counter.count('beatles', 'wings')} people who were in the Beatles or Wings"
18
27
 
19
28
  Each HyperLogLog counter uses a small, fixed amount of space but can
20
29
  estimate the cardinality of any set of up to around a billion values with
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
@@ -0,0 +1,64 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "hyperloglog-redis"
8
+ s.version = "0.2.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Aaron Windsor"]
12
+ s.date = "2012-09-27"
13
+ s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
14
+ s.email = "aaron.windsor@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "LICENSE.txt",
25
+ "README.md",
26
+ "Rakefile",
27
+ "VERSION",
28
+ "hyperloglog-redis.gemspec",
29
+ "lib/hyper_log_log.rb",
30
+ "lib/hyperloglog-redis.rb",
31
+ "spec/hyper_log_log_spec.rb",
32
+ "spec/spec_helper.rb"
33
+ ]
34
+ s.homepage = "http://github.com/aaw/hyperloglog-redis"
35
+ s.licenses = ["MIT"]
36
+ s.require_paths = ["lib"]
37
+ s.rubygems_version = "1.8.10"
38
+ s.summary = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
39
+
40
+ if s.respond_to? :specification_version then
41
+ s.specification_version = 3
42
+
43
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
44
+ s.add_runtime_dependency(%q<murmurhash3>, ["~> 0.1.3"])
45
+ s.add_runtime_dependency(%q<redis>, ["~> 3.0.1"])
46
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
47
+ s.add_development_dependency(%q<rake>, ["~> 0.9.2.2"])
48
+ s.add_development_dependency(%q<rspec>, ["~> 2.11.0"])
49
+ else
50
+ s.add_dependency(%q<murmurhash3>, ["~> 0.1.3"])
51
+ s.add_dependency(%q<redis>, ["~> 3.0.1"])
52
+ s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
53
+ s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
54
+ s.add_dependency(%q<rspec>, ["~> 2.11.0"])
55
+ end
56
+ else
57
+ s.add_dependency(%q<murmurhash3>, ["~> 0.1.3"])
58
+ s.add_dependency(%q<redis>, ["~> 3.0.1"])
59
+ s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
60
+ s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
61
+ s.add_dependency(%q<rspec>, ["~> 2.11.0"])
62
+ end
63
+ end
64
+
@@ -26,9 +26,12 @@ class HyperLogLog
26
26
  @redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
27
27
  end
28
28
 
29
- def count(counter_name)
30
- all_estimates = @redis.zrange(counter_name, 0, -1, {withscores: true})
31
- estimate_sum = all_estimates.map{ |f, score| 2 ** -score }.reduce(:+) || 0
29
+ def count(*counter_names)
30
+ all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
31
+ .reduce(:concat)
32
+ .group_by{ |value, score| value }
33
+ .map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
34
+ estimate_sum = all_estimates.reduce(:+) || 0
32
35
  estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
33
36
  if estimate <= 2.5 * @m
34
37
  if all_estimates.length == @m
@@ -58,7 +58,7 @@ describe HyperLogLog do
58
58
  # implementation, since it exercises all of the cases in HyperLogLog's
59
59
  # count method except for the correction for very large set sizes.
60
60
 
61
- it "produces acceptable estimates" do
61
+ it "produces acceptable estimates for counts" do
62
62
  max_items = 1000
63
63
  redis = Redis.new
64
64
  (6..16).each do |b|
@@ -81,4 +81,59 @@ describe HyperLogLog do
81
81
  end
82
82
  end
83
83
 
84
+ it "produces acceptable estimates for unions with few elements in common" do
85
+ b, max_items = 10, 2000
86
+ counter = HyperLogLog.new(Redis.new, b)
87
+ bad_estimates = 0
88
+ very_bad_estimates = 0
89
+ expected_relative_error = 1.04 / Math.sqrt(2 ** b)
90
+ max_items.times do |i|
91
+ value1 = Digest::MD5.hexdigest("value#{i}")
92
+ counter.add("mycounter1", value1)
93
+ value2 = Digest::MD5.hexdigest("value#{i}incounter2")
94
+ counter.add("mycounter2", value2)
95
+ value3 = Digest::MD5.hexdigest("this is value#{i}")
96
+ counter.add("mycounter3", value3)
97
+ actual = 3 * (i + 1)
98
+ approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
99
+ relative_error = (actual - approximate).abs / Float(actual)
100
+ bad_estimates += 1 if relative_error > expected_relative_error * 2
101
+ very_bad_estimates += 1 if relative_error > expected_relative_error * 3
102
+ end
103
+ bad_estimates.should < (3 * max_items) / 100.00
104
+ very_bad_estimates.should == 0
105
+ end
106
+
107
+ it "produces acceptable estimates for unions with many elements in common" do
108
+ b, max_items, intersection_size = 10, 1000, 2000
109
+ counter = HyperLogLog.new(Redis.new, b)
110
+ bad_estimates = 0
111
+ very_bad_estimates = 0
112
+ expected_relative_error = 1.04 / Math.sqrt(2 ** b)
113
+
114
+ intersection_size.times do |i|
115
+ value = Digest::MD5.hexdigest("test#{i}value")
116
+ ['mycounter1', 'mycounter2', 'mycounter3'].each do |counter_name|
117
+ counter.add(counter_name, value)
118
+ end
119
+ end
120
+
121
+ max_items.times do |i|
122
+ value1 = Digest::MD5.hexdigest("value#{i}")
123
+ counter.add("mycounter1", value1)
124
+ value2 = Digest::MD5.hexdigest("value#{i}isincounter2")
125
+ counter.add("mycounter2", value2)
126
+ value3 = Digest::MD5.hexdigest("this is value#{i}")
127
+ counter.add("mycounter3", value3)
128
+ actual = 3 * (i + 1) + intersection_size
129
+ approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
130
+ relative_error = (actual - approximate).abs / Float(actual)
131
+ bad_estimates += 1 if relative_error > expected_relative_error * 2
132
+ very_bad_estimates += 1 if relative_error > expected_relative_error * 3
133
+ end
134
+
135
+ bad_estimates.should < ((3 * max_items) + intersection_size) / 100.00
136
+ very_bad_estimates.should == 0
137
+ end
138
+
84
139
  end
@@ -23,4 +23,7 @@ RSpec.configure do |config|
23
23
  config.before(:each) do
24
24
  Redis.new.flushdb
25
25
  end
26
+ config.after(:each) do
27
+ Redis.new.flushdb
28
+ end
26
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hyperloglog-redis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-07 00:00:00.000000000Z
12
+ date: 2012-09-27 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3
16
- requirement: &2159101840 !ruby/object:Gem::Requirement
16
+ requirement: &2172688860 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.1.3
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2159101840
24
+ version_requirements: *2172688860
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &2159101360 !ruby/object:Gem::Requirement
27
+ requirement: &2172688380 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 3.0.1
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2159101360
35
+ version_requirements: *2172688380
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: jeweler
38
- requirement: &2159100880 !ruby/object:Gem::Requirement
38
+ requirement: &2172687900 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.8.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2159100880
46
+ version_requirements: *2172687900
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rake
49
- requirement: &2159100400 !ruby/object:Gem::Requirement
49
+ requirement: &2172687420 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 0.9.2.2
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2159100400
57
+ version_requirements: *2172687420
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &2159099920 !ruby/object:Gem::Requirement
60
+ requirement: &2172686940 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: 2.11.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2159099920
68
+ version_requirements: *2172686940
69
69
  description: An implementation of the HyperLogLog set cardinality estimation algorithm
70
70
  in Ruby using Redis as a back-end
71
71
  email: aaron.windsor@gmail.com
@@ -83,6 +83,7 @@ files:
83
83
  - README.md
84
84
  - Rakefile
85
85
  - VERSION
86
+ - hyperloglog-redis.gemspec
86
87
  - lib/hyper_log_log.rb
87
88
  - lib/hyperloglog-redis.rb
88
89
  - spec/hyper_log_log_spec.rb
@@ -102,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
102
103
  version: '0'
103
104
  segments:
104
105
  - 0
105
- hash: 4418438529594871493
106
+ hash: -1980876148320618453
106
107
  required_rubygems_version: !ruby/object:Gem::Requirement
107
108
  none: false
108
109
  requirements: