hyperloglog-redis 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +19 -0
- data/README.md +46 -0
- data/Rakefile +34 -0
- data/VERSION +1 -0
- data/lib/hyper_log_log.rb +56 -0
- data/lib/hyperloglog-redis.rb +1 -0
- data/spec/hyper_log_log_spec.rb +84 -0
- data/spec/spec_helper.rb +26 -0
- metadata +119 -0
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.8.4)
|
7
|
+
bundler (~> 1.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rdoc
|
11
|
+
json (1.7.5)
|
12
|
+
murmurhash3 (0.1.3)
|
13
|
+
rake (0.9.2.2)
|
14
|
+
rdoc (3.12)
|
15
|
+
json (~> 1.4)
|
16
|
+
redis (3.0.1)
|
17
|
+
rspec (2.11.0)
|
18
|
+
rspec-core (~> 2.11.0)
|
19
|
+
rspec-expectations (~> 2.11.0)
|
20
|
+
rspec-mocks (~> 2.11.0)
|
21
|
+
rspec-core (2.11.1)
|
22
|
+
rspec-expectations (2.11.3)
|
23
|
+
diff-lcs (~> 1.1.3)
|
24
|
+
rspec-mocks (2.11.2)
|
25
|
+
|
26
|
+
PLATFORMS
|
27
|
+
ruby
|
28
|
+
|
29
|
+
DEPENDENCIES
|
30
|
+
jeweler (~> 1.8.4)
|
31
|
+
murmurhash3 (~> 0.1.3)
|
32
|
+
rake (~> 0.9.2.2)
|
33
|
+
redis (~> 3.0.1)
|
34
|
+
rspec (~> 2.11.0)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2012 Art.sy, Inc.
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
4
|
+
this software and associated documentation files (the "Software"), to deal in
|
5
|
+
the Software without restriction, including without limitation the rights to
|
6
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
7
|
+
of the Software, and to permit persons to whom the Software is furnished to do
|
8
|
+
so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
hyperloglog-redis
|
2
|
+
=================
|
3
|
+
|
4
|
+
This gem is an implementation of the HyperLogLog algorithm for estimating
|
5
|
+
cardinalities of sets observed via a stream of events. A [Redis](http://redis.io)
|
6
|
+
instance is used for storing the counters. A simple example:
|
7
|
+
|
8
|
+
require 'redis'
|
9
|
+
require 'hyperloglog-redis'
|
10
|
+
|
11
|
+
redis = Redis.new
|
12
|
+
counter = HyperLogLog.new(redis)
|
13
|
+
['john', 'paul', 'george', 'ringo', 'john', 'paul'].each do |beatle|
|
14
|
+
counter.add('beatles', beatle)
|
15
|
+
end
|
16
|
+
|
17
|
+
puts "There are approximately #{counter.count('beatles')} distinct beatles!"
|
18
|
+
|
19
|
+
Each HyperLogLog counter uses a small, fixed amount of space but can
|
20
|
+
estimate the cardinality of any set of up to around a billion values with
|
21
|
+
relative error of about 1.04 / Math.sqrt(2 ** b), where b is a parameter
|
22
|
+
passed to the HyperLogLog initializer that defaults to 10. With b = 10,
|
23
|
+
each counter is represented by a Redis sorted set with 2 ** b = 1024 values
|
24
|
+
(a few KB of space) and we get an expected relative error of 3%. Contrast this
|
25
|
+
with the amount of space needed to compute set cardinality exactly, which is
|
26
|
+
over 100 MB even for a bit vector representing a set with a billion values.
|
27
|
+
|
28
|
+
The basic idea of HyperLogLog (and its predecessors PCSA and LogLog) is to apply
|
29
|
+
a good hash function to each value you see in the stream and record the longest
|
30
|
+
run of zeros that you've seen as a prefix of any hashed value. If the hash
|
31
|
+
function is good, you'd expect that its bits are statistically independent, so
|
32
|
+
seeing a value that starts with exactly X zeros should happen with probability
|
33
|
+
2 ** -(X + 1). So if you've seen a run of 5 zeros in one of your hash values,
|
34
|
+
you're likely to have around 2 ** 6 = 64 values in the underlying set. The actual
|
35
|
+
implementation and analysis are much more advanced than this, but that's the idea.
|
36
|
+
|
37
|
+
The HyperLogLog algorithm is described and analyzed in the paper
|
38
|
+
["HyperLogLog: the analysis of a near-optimal cardinality estimation
|
39
|
+
algorithm"](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf)
|
40
|
+
by Flajolet, Fusy, Gandouet, and Meunier. Our implementation closely
|
41
|
+
follows the program described in Section 4 of that paper.
|
42
|
+
|
43
|
+
Installation
|
44
|
+
============
|
45
|
+
|
46
|
+
gem install hyperloglog-redis
|
data/Rakefile
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
require 'bundler'

# Activate the default and development gem groups; when Bundler cannot,
# report why and exit with the status code Bundler supplies.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message, "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gemspec|
  # gemspec is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  blurb = "An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end"

  gemspec.name        = "hyperloglog-redis"
  gemspec.homepage    = "http://github.com/aaw/hyperloglog-redis"
  gemspec.license     = "MIT"
  gemspec.summary     = blurb
  gemspec.description = blurb
  gemspec.email       = "aaron.windsor@gmail.com"
  gemspec.authors     = ["Aaron Windsor"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'

# `rake spec` runs every *_spec.rb file found under spec/.
RSpec::Core::RakeTask.new(:spec) { |t| t.pattern = FileList['spec/**/*_spec.rb'] }

# A bare `rake` runs the specs.
task default: :spec
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'redis'
|
2
|
+
require 'murmurhash3'
|
3
|
+
|
4
|
+
# Approximate distinct-value counter implementing the HyperLogLog algorithm.
#
# Each named counter is stored as one Redis sorted set: a member is a register
# index (as a string) and its score is the largest rho value observed for that
# register. Registers that have never been written are simply absent.
class HyperLogLog
  # redis - client object; must respond to zscore, zadd and zrange.
  # b     - number of hash bits used to select a register, between 4 and 16.
  #         The counter uses 2 ** b registers.
  #
  # Raises RuntimeError when b is outside 4..16.
  def initialize(redis, b=10)
    raise "Accuracy not supported. Please choose a value of b between 4 and 16" if b < 4 || b > 16
    @redis = redis
    @bits_in_hash = 32 - b   # bits of the 32-bit hash left over for rho
    @m = (2 ** b).to_i       # number of registers
    # Bias-correction constant; fixed values for the three smallest register
    # counts not rejected above, a closed formula for everything larger.
    @alpha = case @m
             when 16 then 0.673
             when 32 then 0.697
             when 64 then 0.709
             else 0.7213 / (1 + 1.079 / @m)
             end
  end

  # Records one observation of value in the counter stored at counter_name.
  #
  # The low bits of the hash (hash % m) pick the register; rho of the remaining
  # high bits is the candidate score. The register is only written when the
  # candidate beats the stored score, so re-adding known values costs a single
  # read and no write.
  #
  # Returns what zadd returns when a write happens, false otherwise (the member
  # already existed, which is also what zadd reports for an existing member).
  #
  # NOTE: the read and the write are two separate commands, not one atomic
  # operation.
  def add(counter_name, value)
    hash = MurmurHash3::V32.murmur3_32_str_hash(value)
    register = (hash % @m).to_s
    candidate = rho(hash / @m)
    stored = @redis.zscore(counter_name, register)
    return false if stored && stored >= candidate
    @redis.zadd(counter_name, candidate, register)
  end

  # Returns the rounded cardinality estimate for the counter at counter_name.
  # A counter with no registers yields 0.
  def count(counter_name)
    registers = @redis.zrange(counter_name, 0, -1, withscores: true)
    unused = @m - registers.length
    # Sum of 2 ** -score over every register; an unused register has an
    # implicit score of 0 and therefore contributes exactly 1.
    inverse_sum = registers.inject(0) { |sum, (_member, score)| sum + 2 ** -score } + unused
    estimate = @alpha * @m * @m / inverse_sum

    if estimate <= 2.5 * @m
      if unused == 0
        estimate.round
      else # Correction for small sets
        (@m * Math.log(Float(@m) / unused)).round
      end
    elsif estimate <= 2 ** 32 / 30.0
      estimate.round
    else # Correction for large sets
      # NOTE(review): an estimate of 2 ** 32 or more makes the logarithm's
      # argument non-positive; that case is not handled here.
      (-(2 ** 32) * Math.log(1 - estimate / (2.0 ** 32))).round
    end
  end

  # rho(i) is the position of the first 1 in the binary representation of i,
  # reading from most significant to least significant bits, where i is taken
  # to be @bits_in_hash bits wide. Some examples:
  # rho(1...) = 1, rho(001...) = 3, rho(000...0) = @bits_in_hash + 1
  #
  # Computed from the length of i's binary representation so the result is
  # exact and does not depend on floating-point rounding.
  def rho(i)
    return @bits_in_hash + 1 if i == 0
    @bits_in_hash + 1 - i.to_s(2).length
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require "hyper_log_log"
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
# Digest::MD5 is used below to generate test values; require it explicitly
# instead of relying on another library having loaded it.
require 'digest/md5'

describe HyperLogLog do

  # Re-adding values that were already observed must leave the estimate as is.
  it "doesn't change its count when it sees values that it's already seen" do
    redis = Redis.new
    counter = HyperLogLog.new(redis, 10)
    test_set = (1..100).map{ |x| x.to_s }
    test_set.each{ |value| counter.add("mycounter", value) }
    original_estimate = counter.count("mycounter")
    5.times do
      test_set.each do |value|
        counter.add("mycounter", value)
        counter.count("mycounter").should == original_estimate
      end
    end
  end

  # Adding to one counter name must never move the estimate of another.
  it "can maintain more than one logically distinct counter" do
    redis = Redis.new
    counter = HyperLogLog.new(redis, 10)
    other_estimate = counter.count("counter2")
    (1..100).each do |i|
      counter.add("counter1", i.to_s)
      counter.count("counter2").should == other_estimate
    end
    other_estimate = counter.count("counter1")
    (101..200).each do |i|
      counter.add("counter2", i.to_s)
      counter.count("counter1").should == other_estimate
    end
    other_estimate = counter.count("counter2")
    (201..300).each do |i|
      counter.add("counter1", i.to_s)
      counter.count("counter2").should == other_estimate
    end
    counter.count("counter1").should > 100
    counter.count("counter2").should > 50
    counter.count("counter1").should > counter.count("counter2")
  end


  # With parameter b, HyperLogLog should produce estimates that have
  # relative error of 1.04 / Math.sqrt(2 ** b). Of course, this analysis
  # is based on assumptions that aren't necessarily true in practice and
  # the observed relative error will depend on the distribution of data
  # we receive as well as the interaction of the murmur hash implementation
  # with that data. Keeping that in mind, the following spec makes sure
  # that in the process of adding 1000 values to a set, HyperLogLog only
  # gives bad estimates (more than twice the expected relative error) in
  # less than 1% of the cases and never gives very bad estimates (more than
  # three times the expected relative error.)
  #
  # It's fine to fudge these numbers a little if the implementation changes,
  # since you can clearly find a different set of values that make this test
  # fail even without changing the implementation. But it should serve as a
  # good indication that there aren't any logical errors in the HyperLogLog
  # implementation, since it exercises all of the cases in HyperLogLog's
  # count method except for the correction for very large set sizes.

  it "produces acceptable estimates" do
    max_items = 1000
    redis = Redis.new
    (6..16).each do |b|
      counter = HyperLogLog.new(redis, b)
      # Each accuracy setting starts from an empty counter.
      redis.del('mycounter')
      bad_estimates = 0
      very_bad_estimates = 0
      expected_relative_error = 1.04 / Math.sqrt(2 ** b)
      max_items.times do |i|
        value = Digest::MD5.hexdigest("value#{i}")
        counter.add("mycounter", value)
        actual = i + 1
        approximate = counter.count("mycounter")
        relative_error = (actual - approximate).abs / Float(actual)
        bad_estimates += 1 if relative_error > expected_relative_error * 2
        very_bad_estimates += 1 if relative_error > expected_relative_error * 3
      end
      bad_estimates.should < max_items / 100.00
      very_bad_estimates.should == 0
    end
  end

end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Make the gem's lib directory and this spec directory requirable.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))
$LOAD_PATH.unshift(spec_dir)
require 'rspec'
require 'redis'
require 'hyperloglog-redis'

# The specs run against a numbered database on the local Redis instance
# (15 unless REDIS_TEST_DATABASE is set) and refuse to start when that
# database already contains keys.
db_number = ENV.fetch('REDIS_TEST_DATABASE', '15')
ENV['REDIS_URL'] = "redis://localhost:6379/#{db_number}"
redis = Redis.new
unless redis.keys('*').empty?
  puts "Warning! These specs use database #{db_number} on your local redis instance",
       "running on port 6379. Your database #{db_number} seems to have keys in it.",
       "Please clear them before running the specs or set the environment",
       "variable REDIS_TEST_DATABASE to use a different database number."
  raise SystemExit
end

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each do |support_file|
  require support_file
end

# Flush the test database before every example.
RSpec.configure do |config|
  config.before(:each) { Redis.new.flushdb }
end
|
metadata
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hyperloglog-redis
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aaron Windsor
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-07 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: murmurhash3
|
16
|
+
requirement: &2159101840 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.1.3
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *2159101840
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: redis
|
27
|
+
requirement: &2159101360 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 3.0.1
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *2159101360
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: jeweler
|
38
|
+
requirement: &2159100880 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.8.4
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *2159100880
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rake
|
49
|
+
requirement: &2159100400 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.9.2.2
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *2159100400
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: rspec
|
60
|
+
requirement: &2159099920 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ~>
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 2.11.0
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *2159099920
|
69
|
+
description: An implementation of the HyperLogLog set cardinality estimation algorithm
|
70
|
+
in Ruby using Redis as a back-end
|
71
|
+
email: aaron.windsor@gmail.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files:
|
75
|
+
- LICENSE.txt
|
76
|
+
- README.md
|
77
|
+
files:
|
78
|
+
- .document
|
79
|
+
- .rspec
|
80
|
+
- Gemfile
|
81
|
+
- Gemfile.lock
|
82
|
+
- LICENSE.txt
|
83
|
+
- README.md
|
84
|
+
- Rakefile
|
85
|
+
- VERSION
|
86
|
+
- lib/hyper_log_log.rb
|
87
|
+
- lib/hyperloglog-redis.rb
|
88
|
+
- spec/hyper_log_log_spec.rb
|
89
|
+
- spec/spec_helper.rb
|
90
|
+
homepage: http://github.com/aaw/hyperloglog-redis
|
91
|
+
licenses:
|
92
|
+
- MIT
|
93
|
+
post_install_message:
|
94
|
+
rdoc_options: []
|
95
|
+
require_paths:
|
96
|
+
- lib
|
97
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
99
|
+
requirements:
|
100
|
+
- - ! '>='
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
segments:
|
104
|
+
- 0
|
105
|
+
hash: 4418438529594871493
|
106
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
107
|
+
none: false
|
108
|
+
requirements:
|
109
|
+
- - ! '>='
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
requirements: []
|
113
|
+
rubyforge_project:
|
114
|
+
rubygems_version: 1.8.10
|
115
|
+
signing_key:
|
116
|
+
specification_version: 3
|
117
|
+
summary: An implementation of the HyperLogLog set cardinality estimation algorithm
|
118
|
+
in Ruby using Redis as a back-end
|
119
|
+
test_files: []
|