hyperloglog-redis 0.3.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ ## 1.0.0 (10/26/2012)
2
+
3
+ * Changed the underlying storage from Redis sorted sets to Redis hashes. This
4
+ is a breaking change; if you have existing counters stored from earlier
5
+ versions of this library, you can upgrade them with something like the
6
+ following method:
7
+
8
+ def upgrade(counter, redis)
9
+ return if redis.type(counter) == "hash"
10
+ values = redis.zrange(counter, 0, -1, {withscores: true})
11
+ redis.del(counter)
12
+ values.each { |key, value| redis.hset(counter, key, value.to_i) }
13
+ end
14
+
15
+ * Added union_store command, which stores the results of a union for querying
16
+ or combining with other sets later
17
+
data/README.md CHANGED
@@ -53,7 +53,12 @@ You can also ask for an estimate of the union from multiple counters:
53
53
 
54
54
  The same relative error guarantee above applies to unions: a union of
55
55
  size N can be estimated to within N * (1.04 / Math.sqrt(2 ** b)) elements,
56
- regardless of how many HyperLogLog counters that union spans.
56
+ regardless of how many HyperLogLog counters that union spans. You can store
57
+ a unioned counter for querying or combining later with `union_store`:
58
+
59
+ counter.union_store('all_beatles_and_wings_members', 'beatles', 'wings')
60
+
61
+ puts "There are approximately #{counter.count('all_beatles_and_wings_members')} people who were in the Beatles or Wings"
57
62
 
58
63
  Intersections can also be estimated:
59
64
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 1.0.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "hyperloglog-redis"
8
- s.version = "0.3.0"
8
+ s.version = "1.0.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Aaron Windsor"]
12
- s.date = "2012-09-28"
12
+ s.date = "2012-10-26"
13
13
  s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
14
14
  s.email = "aaron.windsor@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -21,6 +21,7 @@ Gem::Specification.new do |s|
21
21
  ".rspec",
22
22
  "Gemfile",
23
23
  "Gemfile.lock",
24
+ "HISTORY.md",
24
25
  "LICENSE.txt",
25
26
  "README.md",
26
27
  "Rakefile",
@@ -19,30 +19,47 @@ class HyperLogLog
19
19
  end
20
20
 
21
21
  def add(counter_name, value)
22
- hash = MurmurHash3::V32.murmur3_32_str_hash(value)
23
- function_name = (hash % @m).to_s
24
- w = hash / @m
25
- max_run_of_zeros = @redis.zscore(counter_name, function_name)
26
- @redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
22
+ hash = MurmurHash3::V32.murmur3_32_str_hash(value)
23
+ function_name = hash % @m
24
+ w = hash / @m
25
+ existing_value = (@redis.hget(counter_name, function_name) || 0).to_i
26
+ new_value = [existing_value, rho(w)].max
27
+ @redis.hset(counter_name, function_name, new_value) if new_value > existing_value
27
28
  end
28
29
 
30
+ # Estimate the cardinality of a single set
29
31
  def count(counter_name)
30
32
  union_helper([counter_name])
31
33
  end
32
34
 
35
+ # Estimate the cardinality of the union of several sets
33
36
  def union(*counter_names)
34
37
  union_helper(counter_names)
35
38
  end
36
39
 
40
+ # Store the union of several sets in *destination* so that it can be used as
41
+ # a HyperLogLog counter later.
42
+ def union_store(destination, *counter_names)
43
+ raw_union(counter_names).each do |key, count|
44
+ @redis.hset(destination, key, count)
45
+ end
46
+ end
47
+
48
+ # Estimate the cardinality of the intersection of several sets. We do this by
49
+ # using the principle of inclusion and exclusion to represent the size of the
50
+ # intersection as the alternating sum of an exponential number of
51
+ # cardinalities of unions of smaller sets.
37
52
  def intersection(*counter_names)
38
- [intersection_helper(counter_names, {}), 0].max
53
+ icount = (1..counter_names.length).map do |k|
54
+ counter_names.combination(k).map do |group|
55
+ ((k % 2 == 0) ? -1 : 1) * union_helper(group)
56
+ end.inject(0, :+)
57
+ end.inject(0, :+)
58
+ [icount, 0].max
39
59
  end
40
60
 
41
61
  def union_helper(counter_names)
42
- all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
43
- .reduce(:concat)
44
- .group_by{ |value, score| value }
45
- .map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
62
+ all_estimates = raw_union(counter_names).map{ |value, score| 2 ** -score }
46
63
  estimate_sum = all_estimates.reduce(:+) || 0
47
64
  estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
48
65
  if estimate <= 2.5 * @m
@@ -58,13 +75,11 @@ class HyperLogLog
58
75
  end
59
76
  end
60
77
 
61
- def intersection_helper(counter_names, cache)
62
- sum = union_helper(counter_names) - (1...counter_names.length).map do |k|
63
- ((-1) ** (k + 1)) * counter_names.combination(k).map do |group|
64
- cache[group] ||= intersection_helper(group, cache)
65
- end.inject(0, :+)
66
- end.inject(0, :+)
67
- ((-1) ** (counter_names.length + 1)) * sum
78
+ def raw_union(counter_names)
79
+ counter_names.map{ |counter_name| @redis.hgetall(counter_name).map{ |x,y| [x, y.to_i] } }
80
+ .reduce(:concat)
81
+ .group_by{ |key, count| key }
82
+ .map{ |key, counters| [key, counters.map{ |x| x.last }.max] }
68
83
  end
69
84
 
70
85
  # rho(i) is the position of the first 1 in the binary representation of i,
@@ -77,4 +92,5 @@ class HyperLogLog
77
92
  @bits_in_hash - Math.log(i, 2).floor
78
93
  end
79
94
  end
95
+
80
96
  end
@@ -39,6 +39,39 @@ describe HyperLogLog do
39
39
  counter.count("counter1").should > counter.count("counter2")
40
40
  end
41
41
 
42
+ it "can exactly count small sets" do
43
+ redis = Redis.new
44
+ counter = HyperLogLog.new(redis, 11)
45
+ 10.times { |i| counter.add("mycounter", i.to_s) }
46
+ counter.count("mycounter").should == 10
47
+ end
48
+
49
+ it "can exactly count small unions" do
50
+ redis = Redis.new
51
+ counter = HyperLogLog.new(redis, 11)
52
+ (1..8).each { |i| counter.add("mycounter1", i.to_s) }
53
+ (5..12).each { |i| counter.add("mycounter2", i.to_s) }
54
+ counter.union("mycounter1", "mycounter2").should == 12
55
+ end
56
+
57
+ it "can exactly count small intersections" do
58
+ redis = Redis.new
59
+ counter = HyperLogLog.new(redis, 11)
60
+ (1..8).each { |i| counter.add("mycounter1", i.to_s) }
61
+ (5..12).each { |i| counter.add("mycounter2", i.to_s) }
62
+ counter.intersection("mycounter1", "mycounter2").should == 4
63
+ end
64
+
65
+ it "can store unions for querying later" do
66
+ redis = Redis.new
67
+ counter = HyperLogLog.new(redis, 11)
68
+ (1..10).each { |i| counter.add("mycounter1", i.to_s) }
69
+ (5..15).each { |i| counter.add("mycounter2", i.to_s) }
70
+ (15..25).each { |i| counter.add("mycounter3", i.to_s) }
71
+ (20..50).each { |i| counter.add("mycounter4", i.to_s) }
72
+ counter.union_store("aggregate_counter", "mycounter1", "mycounter2", "mycounter3", "mycounter4")
73
+ counter.union("mycounter1", "mycounter2", "mycounter3", "mycounter4").should == counter.count("aggregate_counter")
74
+ end
42
75
 
43
76
  # With parameter b, HyperLogLog should produce estimates that have
44
77
  # relative error of 1.04 / Math.sqrt(2 ** b). Of course, this analysis
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hyperloglog-redis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 1.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-28 00:00:00.000000000Z
12
+ date: 2012-10-26 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3
16
- requirement: &2157248980 !ruby/object:Gem::Requirement
16
+ requirement: &2172774780 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.1.3
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2157248980
24
+ version_requirements: *2172774780
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &2157248500 !ruby/object:Gem::Requirement
27
+ requirement: &2172774300 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 3.0.1
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2157248500
35
+ version_requirements: *2172774300
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: jeweler
38
- requirement: &2157248020 !ruby/object:Gem::Requirement
38
+ requirement: &2172773820 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.8.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2157248020
46
+ version_requirements: *2172773820
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rake
49
- requirement: &2157247540 !ruby/object:Gem::Requirement
49
+ requirement: &2172773340 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 0.9.2.2
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2157247540
57
+ version_requirements: *2172773340
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &2157247060 !ruby/object:Gem::Requirement
60
+ requirement: &2172789220 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: 2.11.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2157247060
68
+ version_requirements: *2172789220
69
69
  description: An implementation of the HyperLogLog set cardinality estimation algorithm
70
70
  in Ruby using Redis as a back-end
71
71
  email: aaron.windsor@gmail.com
@@ -79,6 +79,7 @@ files:
79
79
  - .rspec
80
80
  - Gemfile
81
81
  - Gemfile.lock
82
+ - HISTORY.md
82
83
  - LICENSE.txt
83
84
  - README.md
84
85
  - Rakefile
@@ -103,7 +104,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
103
104
  version: '0'
104
105
  segments:
105
106
  - 0
106
- hash: 1810415248006536813
107
+ hash: 2426569210961737114
107
108
  required_rubygems_version: !ruby/object:Gem::Requirement
108
109
  none: false
109
110
  requirements: