hyperloglog-redis 0.3.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ ## 1.0.0 (10/26/2012)
2
+
3
+ * Changed the underlying storage from Redis sorted sets to Redis hashes. This
4
+ is a breaking change; if you have existing counters stored from earlier
5
+ versions of this library, you can upgrade them with something like the
6
+ following method:
7
+
8
+ def upgrade(counter, redis)
9
+ return if redis.type(counter) == "hash"
10
+ values = redis.zrange(counter, 0, -1, {withscores: true})
11
+ redis.del(counter)
12
+ values.each { |key, value| redis.hset(counter, key, value.to_i) }
13
+ end
14
+
15
+ * Added union_store command, which stores the results of a union for querying
16
+ or combining with other sets later
17
+
data/README.md CHANGED
@@ -53,7 +53,12 @@ You can also ask for an estimate of the union from multiple counters:
53
53
 
54
54
  The same relative error guarantee above applies to unions: a union of
55
55
  size N can be estimated to within N * (1.04 / Math.sqrt(2 ** b)) elements,
56
- regardless of how many HyperLogLog counters that union spans.
56
+ regardless of how many HyperLogLog counters that union spans. You can store
57
+ a unioned counter for querying or combining later with `union_store`:
58
+
59
+ counter.union_store('all_beatles_and_wings_members', 'beatles', 'wings')
60
+
61
+ puts "There are approximately #{counter.count('all_beatles_and_wings_members')} people who were in the Beatles or Wings"
57
62
 
58
63
  Intersections can also be estimated:
59
64
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 1.0.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "hyperloglog-redis"
8
- s.version = "0.3.0"
8
+ s.version = "1.0.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Aaron Windsor"]
12
- s.date = "2012-09-28"
12
+ s.date = "2012-10-26"
13
13
  s.description = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"
14
14
  s.email = "aaron.windsor@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -21,6 +21,7 @@ Gem::Specification.new do |s|
21
21
  ".rspec",
22
22
  "Gemfile",
23
23
  "Gemfile.lock",
24
+ "HISTORY.md",
24
25
  "LICENSE.txt",
25
26
  "README.md",
26
27
  "Rakefile",
@@ -19,30 +19,47 @@ class HyperLogLog
19
19
  end
20
20
 
21
21
  def add(counter_name, value)
22
- hash = MurmurHash3::V32.murmur3_32_str_hash(value)
23
- function_name = (hash % @m).to_s
24
- w = hash / @m
25
- max_run_of_zeros = @redis.zscore(counter_name, function_name)
26
- @redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
22
+ hash = MurmurHash3::V32.murmur3_32_str_hash(value)
23
+ function_name = hash % @m
24
+ w = hash / @m
25
+ existing_value = (@redis.hget(counter_name, function_name) || 0).to_i
26
+ new_value = [existing_value, rho(w)].max
27
+ @redis.hset(counter_name, function_name, new_value) if new_value > existing_value
27
28
  end
28
29
 
30
+ # Estimate the cardinality of a single set
29
31
  def count(counter_name)
30
32
  union_helper([counter_name])
31
33
  end
32
34
 
35
+ # Estimate the cardinality of the union of several sets
33
36
  def union(*counter_names)
34
37
  union_helper(counter_names)
35
38
  end
36
39
 
40
+ # Store the union of several sets in *destination* so that it can be used as
41
+ # a HyperLogLog counter later.
42
+ def union_store(destination, *counter_names)
43
+ raw_union(counter_names).each do |key, count|
44
+ @redis.hset(destination, key, count)
45
+ end
46
+ end
47
+
48
+ # Estimate the cardinality of the intersection of several sets. We do this by
49
+ # using the principle of inclusion and exclusion to represent the size of the
50
+ # intersection as the alternating sum of an exponential number of
51
+ # cardinalities of unions of smaller sets.
37
52
  def intersection(*counter_names)
38
- [intersection_helper(counter_names, {}), 0].max
53
+ icount = (1..counter_names.length).map do |k|
54
+ counter_names.combination(k).map do |group|
55
+ ((k % 2 == 0) ? -1 : 1) * union_helper(group)
56
+ end.inject(0, :+)
57
+ end.inject(0, :+)
58
+ [icount, 0].max
39
59
  end
40
60
 
41
61
  def union_helper(counter_names)
42
- all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
43
- .reduce(:concat)
44
- .group_by{ |value, score| value }
45
- .map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
62
+ all_estimates = raw_union(counter_names).map{ |value, score| 2 ** -score }
46
63
  estimate_sum = all_estimates.reduce(:+) || 0
47
64
  estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
48
65
  if estimate <= 2.5 * @m
@@ -58,13 +75,11 @@ class HyperLogLog
58
75
  end
59
76
  end
60
77
 
61
- def intersection_helper(counter_names, cache)
62
- sum = union_helper(counter_names) - (1...counter_names.length).map do |k|
63
- ((-1) ** (k + 1)) * counter_names.combination(k).map do |group|
64
- cache[group] ||= intersection_helper(group, cache)
65
- end.inject(0, :+)
66
- end.inject(0, :+)
67
- ((-1) ** (counter_names.length + 1)) * sum
78
+ def raw_union(counter_names)
79
+ counter_names.map{ |counter_name| @redis.hgetall(counter_name).map{ |x,y| [x, y.to_i] } }
80
+ .reduce(:concat)
81
+ .group_by{ |key, count| key }
82
+ .map{ |key, counters| [key, counters.map{ |x| x.last }.max] }
68
83
  end
69
84
 
70
85
  # rho(i) is the position of the first 1 in the binary representation of i,
@@ -77,4 +92,5 @@ class HyperLogLog
77
92
  @bits_in_hash - Math.log(i, 2).floor
78
93
  end
79
94
  end
95
+
80
96
  end
@@ -39,6 +39,39 @@ describe HyperLogLog do
39
39
  counter.count("counter1").should > counter.count("counter2")
40
40
  end
41
41
 
42
+ it "can exactly count small sets" do
43
+ redis = Redis.new
44
+ counter = HyperLogLog.new(redis, 11)
45
+ 10.times { |i| counter.add("mycounter", i.to_s) }
46
+ counter.count("mycounter").should == 10
47
+ end
48
+
49
+ it "can exactly count small unions" do
50
+ redis = Redis.new
51
+ counter = HyperLogLog.new(redis, 11)
52
+ (1..8).each { |i| counter.add("mycounter1", i.to_s) }
53
+ (5..12).each { |i| counter.add("mycounter2", i.to_s) }
54
+ counter.union("mycounter1", "mycounter2").should == 12
55
+ end
56
+
57
+ it "can exactly count small intersections" do
58
+ redis = Redis.new
59
+ counter = HyperLogLog.new(redis, 11)
60
+ (1..8).each { |i| counter.add("mycounter1", i.to_s) }
61
+ (5..12).each { |i| counter.add("mycounter2", i.to_s) }
62
+ counter.intersection("mycounter1", "mycounter2").should == 4
63
+ end
64
+
65
+ it "can store unions for querying later" do
66
+ redis = Redis.new
67
+ counter = HyperLogLog.new(redis, 11)
68
+ (1..10).each { |i| counter.add("mycounter1", i.to_s) }
69
+ (5..15).each { |i| counter.add("mycounter2", i.to_s) }
70
+ (15..25).each { |i| counter.add("mycounter3", i.to_s) }
71
+ (20..50).each { |i| counter.add("mycounter4", i.to_s) }
72
+ counter.union_store("aggregate_counter", "mycounter1", "mycounter2", "mycounter3", "mycounter4")
73
+ counter.union("mycounter1", "mycounter2", "mycounter3", "mycounter4").should == counter.count("aggregate_counter")
74
+ end
42
75
 
43
76
  # With parameter b, HyperLogLog should produce estimates that have
44
77
  # relative error of 1.04 / Math.sqrt(2 ** b). Of course, this analysis
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hyperloglog-redis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 1.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-28 00:00:00.000000000Z
12
+ date: 2012-10-26 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3
16
- requirement: &2157248980 !ruby/object:Gem::Requirement
16
+ requirement: &2172774780 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.1.3
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2157248980
24
+ version_requirements: *2172774780
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &2157248500 !ruby/object:Gem::Requirement
27
+ requirement: &2172774300 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 3.0.1
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2157248500
35
+ version_requirements: *2172774300
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: jeweler
38
- requirement: &2157248020 !ruby/object:Gem::Requirement
38
+ requirement: &2172773820 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.8.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2157248020
46
+ version_requirements: *2172773820
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rake
49
- requirement: &2157247540 !ruby/object:Gem::Requirement
49
+ requirement: &2172773340 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 0.9.2.2
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2157247540
57
+ version_requirements: *2172773340
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &2157247060 !ruby/object:Gem::Requirement
60
+ requirement: &2172789220 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: 2.11.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2157247060
68
+ version_requirements: *2172789220
69
69
  description: An implementation of the HyperLogLog set cardinality estimation algorithm
70
70
  in Ruby using Redis as a back-end
71
71
  email: aaron.windsor@gmail.com
@@ -79,6 +79,7 @@ files:
79
79
  - .rspec
80
80
  - Gemfile
81
81
  - Gemfile.lock
82
+ - HISTORY.md
82
83
  - LICENSE.txt
83
84
  - README.md
84
85
  - Rakefile
@@ -103,7 +104,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
103
104
  version: '0'
104
105
  segments:
105
106
  - 0
106
- hash: 1810415248006536813
107
+ hash: 2426569210961737114
107
108
  required_rubygems_version: !ruby/object:Gem::Requirement
108
109
  none: false
109
110
  requirements: