hyperloglog-redis 0.1.0 → 0.2.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -14,7 +14,16 @@ instance is used for storing the counters. A simple example:
14
14
  counter.add('beatles', beatle)
15
15
  end
16
16
 
17
- puts "There are approximately #{counter.count('beatles')} distinct beatles!"
17
+ puts "There are approximately #{counter.count('beatles')} distinct Beatles"
18
+
19
+ You can also ask for an estimate from multiple counters and you'll get
20
+ an estimate of the size of their union:
21
+
22
+ ['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wing_member|
23
+ counter.add('wings', wing_member)
24
+ end
25
+
26
+ puts "There are approximately #{counter.count('beatles', 'wings')} people who were in the Beatles or Wings"
18
27
 
19
28
  Each HyperLogLog counter uses a small, fixed amount of space but can
20
29
  estimate the cardinality of any set of up to around a billion values with
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
@@ -0,0 +1,64 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "hyperloglog-redis"
8
+ s.version = "0.2.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Aaron Windsor"]
12
+ s.date = "2012-09-27"
13
+ s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
14
+ s.email = "aaron.windsor@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "LICENSE.txt",
25
+ "README.md",
26
+ "Rakefile",
27
+ "VERSION",
28
+ "hyperloglog-redis.gemspec",
29
+ "lib/hyper_log_log.rb",
30
+ "lib/hyperloglog-redis.rb",
31
+ "spec/hyper_log_log_spec.rb",
32
+ "spec/spec_helper.rb"
33
+ ]
34
+ s.homepage = "http://github.com/aaw/hyperloglog-redis"
35
+ s.licenses = ["MIT"]
36
+ s.require_paths = ["lib"]
37
+ s.rubygems_version = "1.8.10"
38
+ s.summary = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
39
+
40
+ if s.respond_to? :specification_version then
41
+ s.specification_version = 3
42
+
43
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
44
+ s.add_runtime_dependency(%q<murmurhash3>, ["~> 0.1.3"])
45
+ s.add_runtime_dependency(%q<redis>, ["~> 3.0.1"])
46
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
47
+ s.add_development_dependency(%q<rake>, ["~> 0.9.2.2"])
48
+ s.add_development_dependency(%q<rspec>, ["~> 2.11.0"])
49
+ else
50
+ s.add_dependency(%q<murmurhash3>, ["~> 0.1.3"])
51
+ s.add_dependency(%q<redis>, ["~> 3.0.1"])
52
+ s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
53
+ s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
54
+ s.add_dependency(%q<rspec>, ["~> 2.11.0"])
55
+ end
56
+ else
57
+ s.add_dependency(%q<murmurhash3>, ["~> 0.1.3"])
58
+ s.add_dependency(%q<redis>, ["~> 3.0.1"])
59
+ s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
60
+ s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
61
+ s.add_dependency(%q<rspec>, ["~> 2.11.0"])
62
+ end
63
+ end
64
+
@@ -26,9 +26,12 @@ class HyperLogLog
26
26
  @redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
27
27
  end
28
28
 
29
- def count(counter_name)
30
- all_estimates = @redis.zrange(counter_name, 0, -1, {withscores: true})
31
- estimate_sum = all_estimates.map{ |f, score| 2 ** -score }.reduce(:+) || 0
29
+ def count(*counter_names)
30
+ all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
31
+ .reduce(:concat)
32
+ .group_by{ |value, score| value }
33
+ .map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
34
+ estimate_sum = all_estimates.reduce(:+) || 0
32
35
  estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
33
36
  if estimate <= 2.5 * @m
34
37
  if all_estimates.length == @m
@@ -58,7 +58,7 @@ describe HyperLogLog do
58
58
  # implementation, since it exercises all of the cases in HyperLogLog's
59
59
  # count method except for the correction for very large set sizes.
60
60
 
61
- it "produces acceptable estimates" do
61
+ it "produces acceptable estimates for counts" do
62
62
  max_items = 1000
63
63
  redis = Redis.new
64
64
  (6..16).each do |b|
@@ -81,4 +81,59 @@ describe HyperLogLog do
81
81
  end
82
82
  end
83
83
 
84
+ it "produces acceptable estimates for unions with few elements in common" do
85
+ b, max_items = 10, 2000
86
+ counter = HyperLogLog.new(Redis.new, b)
87
+ bad_estimates = 0
88
+ very_bad_estimates = 0
89
+ expected_relative_error = 1.04 / Math.sqrt(2 ** b)
90
+ max_items.times do |i|
91
+ value1 = Digest::MD5.hexdigest("value#{i}")
92
+ counter.add("mycounter1", value1)
93
+ value2 = Digest::MD5.hexdigest("value#{i}incounter2")
94
+ counter.add("mycounter2", value2)
95
+ value3 = Digest::MD5.hexdigest("this is value#{i}")
96
+ counter.add("mycounter3", value3)
97
+ actual = 3 * (i + 1)
98
+ approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
99
+ relative_error = (actual - approximate).abs / Float(actual)
100
+ bad_estimates += 1 if relative_error > expected_relative_error * 2
101
+ very_bad_estimates += 1 if relative_error > expected_relative_error * 3
102
+ end
103
+ bad_estimates.should < (3 * max_items) / 100.00
104
+ very_bad_estimates.should == 0
105
+ end
106
+
107
+ it "produces acceptable estimates for unions with many elements in common" do
108
+ b, max_items, intersection_size = 10, 1000, 2000
109
+ counter = HyperLogLog.new(Redis.new, b)
110
+ bad_estimates = 0
111
+ very_bad_estimates = 0
112
+ expected_relative_error = 1.04 / Math.sqrt(2 ** b)
113
+
114
+ intersection_size.times do |i|
115
+ value = Digest::MD5.hexdigest("test#{i}value")
116
+ ['mycounter1', 'mycounter2', 'mycounter3'].each do |counter_name|
117
+ counter.add(counter_name, value)
118
+ end
119
+ end
120
+
121
+ max_items.times do |i|
122
+ value1 = Digest::MD5.hexdigest("value#{i}")
123
+ counter.add("mycounter1", value1)
124
+ value2 = Digest::MD5.hexdigest("value#{i}isincounter2")
125
+ counter.add("mycounter2", value2)
126
+ value3 = Digest::MD5.hexdigest("this is value#{i}")
127
+ counter.add("mycounter3", value3)
128
+ actual = 3 * (i + 1) + intersection_size
129
+ approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
130
+ relative_error = (actual - approximate).abs / Float(actual)
131
+ bad_estimates += 1 if relative_error > expected_relative_error * 2
132
+ very_bad_estimates += 1 if relative_error > expected_relative_error * 3
133
+ end
134
+
135
+ bad_estimates.should < ((3 * max_items) + intersection_size) / 100.00
136
+ very_bad_estimates.should == 0
137
+ end
138
+
84
139
  end
@@ -23,4 +23,7 @@ RSpec.configure do |config|
23
23
  config.before(:each) do
24
24
  Redis.new.flushdb
25
25
  end
26
+ config.after(:each) do
27
+ Redis.new.flushdb
28
+ end
26
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hyperloglog-redis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-07 00:00:00.000000000Z
12
+ date: 2012-09-27 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3
16
- requirement: &2159101840 !ruby/object:Gem::Requirement
16
+ requirement: &2172688860 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.1.3
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2159101840
24
+ version_requirements: *2172688860
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &2159101360 !ruby/object:Gem::Requirement
27
+ requirement: &2172688380 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 3.0.1
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2159101360
35
+ version_requirements: *2172688380
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: jeweler
38
- requirement: &2159100880 !ruby/object:Gem::Requirement
38
+ requirement: &2172687900 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.8.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2159100880
46
+ version_requirements: *2172687900
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rake
49
- requirement: &2159100400 !ruby/object:Gem::Requirement
49
+ requirement: &2172687420 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 0.9.2.2
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2159100400
57
+ version_requirements: *2172687420
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &2159099920 !ruby/object:Gem::Requirement
60
+ requirement: &2172686940 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: 2.11.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2159099920
68
+ version_requirements: *2172686940
69
69
  description: An implementation of the HyperLogLog set cardinality estimation algorithm
70
70
  in Ruby using Redis as a back-end
71
71
  email: aaron.windsor@gmail.com
@@ -83,6 +83,7 @@ files:
83
83
  - README.md
84
84
  - Rakefile
85
85
  - VERSION
86
+ - hyperloglog-redis.gemspec
86
87
  - lib/hyper_log_log.rb
87
88
  - lib/hyperloglog-redis.rb
88
89
  - spec/hyper_log_log_spec.rb
@@ -102,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
102
103
  version: '0'
103
104
  segments:
104
105
  - 0
105
- hash: 4418438529594871493
106
+ hash: -1980876148320618453
106
107
  required_rubygems_version: !ruby/object:Gem::Requirement
107
108
  none: false
108
109
  requirements: