space-saver-redis 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +20 -0
- data/README.md +40 -0
- data/Rakefile +49 -0
- data/VERSION +1 -0
- data/lib/space-saver-redis.rb +22 -0
- data/spec/space-saver-redis_spec.rb +46 -0
- data/spec/spec_helper.rb +29 -0
- metadata +118 -0
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.8.4)
|
7
|
+
bundler (~> 1.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rdoc
|
11
|
+
json (1.7.5)
|
12
|
+
rake (0.9.2.2)
|
13
|
+
rdoc (3.12)
|
14
|
+
json (~> 1.4)
|
15
|
+
redis (3.0.2)
|
16
|
+
rspec (2.11.0)
|
17
|
+
rspec-core (~> 2.11.0)
|
18
|
+
rspec-expectations (~> 2.11.0)
|
19
|
+
rspec-mocks (~> 2.11.0)
|
20
|
+
rspec-core (2.11.1)
|
21
|
+
rspec-expectations (2.11.3)
|
22
|
+
diff-lcs (~> 1.1.3)
|
23
|
+
rspec-mocks (2.11.3)
|
24
|
+
timecop (0.5.4)
|
25
|
+
|
26
|
+
PLATFORMS
|
27
|
+
ruby
|
28
|
+
|
29
|
+
DEPENDENCIES
|
30
|
+
jeweler (~> 1.8.4)
|
31
|
+
rake (~> 0.9.2.2)
|
32
|
+
redis (~> 3.0.1)
|
33
|
+
rspec (~> 2.11.0)
|
34
|
+
timecop (~> 0.5.3)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Art.sy
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
space-saver-redis
|
2
|
+
=================
|
3
|
+
|
4
|
+
This gem is a pure Ruby implementation of Metwally, Agrawal, and El Abbadi's
|
5
|
+
[SpaceSaver algorithm](http://www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf)
|
6
|
+
for estimating the top K elements in a data stream. A [Redis](http://redis.io)
|
7
|
+
instance is used for storage.
|
8
|
+
|
9
|
+
Here's an example:
|
10
|
+
|
11
|
+
require 'redis'
|
12
|
+
require 'space-saver-redis'
|
13
|
+
|
14
|
+
# Estimate the top 10 most frequent items seen in a data stream
|
15
|
+
space_saver = SpaceSaver.new(Redis.new, 10)
|
16
|
+
|
17
|
+
urls_visited.each { |url| space_saver.increment("urls", url) }
|
18
|
+
|
19
|
+
After the above code executes, you can query `space_saver.leaders("urls")` to get
|
20
|
+
an estimate of the top 10 most frequent URLs visited along with their estimated
|
21
|
+
counts. The `SpaceSaver` instance uses only a Redis sorted set with at most K
|
22
|
+
elements (10, in this case) at any time to make this estimation.
|
23
|
+
|
24
|
+
Obviously, since the data structure uses only a small, fixed amount of space,
|
25
|
+
there are some data distributions that can cause the top K elements returned to
|
26
|
+
be completely incorrect, but for a lot of data distributions the results are
|
27
|
+
worth the savings in space. In particular, for a `SpaceSaver` instance initialized
|
28
|
+
with parameter K that observes a data stream of N items, any item that occurs more
|
29
|
+
than N/K times is guaranteed to be in the list of estimated leaders.
|
30
|
+
|
31
|
+
One way to cope with the error involved in this kind of estimation is to use a K
|
32
|
+
bigger than you actually need and then truncate the number of leaders returned
|
33
|
+
at query time. You can pass an additional parameter to the call to `leaders` to
|
34
|
+
do this; for example, `space_saver.leaders("urls", 3)` will return only the top
|
35
|
+
3 of the 10 estimated most frequent items.
|
36
|
+
|
37
|
+
Installation
|
38
|
+
============
|
39
|
+
|
40
|
+
gem install space-saver-redis
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8

# Rake tasks for the space-saver-redis gem: packaging (via Jeweler), specs,
# coverage and RDoc generation.

require 'rubygems'
require 'bundler'
begin
  # Make the gems from the :default and :development groups loadable.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "space-saver-redis"
  gem.homepage = "http://github.com/aaw/space-saver-redis"
  gem.license = "MIT"
  gem.summary = %Q{A pure Ruby implementation of the SpaceSaver algorithm for approximating the top K elements in a data stream, backed by Redis}
  gem.description = %Q{A pure Ruby implementation of the SpaceSaver algorithm for approximating the top K elements in a data stream, backed by Redis}
  gem.email = "aaron.windsor@gmail.com"
  gem.authors = ["Aaron Windsor"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

# 'rake/rdoctask' is deprecated in Rake 0.9 and gone in Rake 10; the task
# class shipped with RDoc itself is the supported replacement.
require 'rdoc/task'
RDoc::Task.new do |rdoc|
  # strip so a trailing newline in VERSION does not leak into the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "space-saver-redis #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Estimates the most frequent items seen in a stream while storing at most K
# members per leaderboard. Each leaderboard is a sorted set held by the
# Redis client handed to the constructor.
class SpaceSaver
  # redis - client object; this class calls zscore, zcard, zincrby, zrange,
  #         zadd, zrem and zrevrange on it.
  # k     - maximum number of members kept in any one leaderboard; also the
  #         default number of leaders returned by #leaders.
  def initialize(redis, k)
    @redis = redis
    @k = k
  end

  # Records one occurrence of +value+ in +leaderboard+.
  #
  # If the value is already tracked, or the leaderboard still holds fewer
  # than K members, its score is bumped by one. Otherwise the lowest-scored
  # member is removed and +value+ takes its place with that member's score
  # plus one.
  #
  # Returns the result of zincrby on the first path, and the Integer
  # replacement score on the second.
  #
  # NOTE(review): the lookup and the update are separate calls on the client,
  # so concurrent writers can interleave between them.
  def increment(leaderboard, value)
    score = @redis.zscore(leaderboard, value)
    if score || @redis.zcard(leaderboard) < @k
      @redis.zincrby(leaderboard, 1, value)
    else
      # lowest-ranked member and its score; this is the one that gets evicted
      item, score = @redis.zrange(leaderboard, 0, 0, withscores: true).first
      new_score = score.to_i + 1
      # only insert when the eviction actually removed something, so the set
      # never grows past K
      @redis.zadd(leaderboard, new_score, value) if @redis.zrem(leaderboard, item)
      new_score
    end
  end

  # Returns up to +k+ [member, score] pairs from +leaderboard+, highest score
  # first. +k+ defaults to the K given to the constructor.
  #
  # A non-positive +k+ yields an empty list: without this guard the stop
  # index k-1 would be negative, which Redis reads as an offset from the end
  # of the set (k = 0 would return every member).
  def leaders(leaderboard, k=@k)
    return [] if k <= 0

    @redis.zrevrange(leaderboard, 0, k-1, withscores: true)
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

# Behavioural specs for SpaceSaver, run against the Redis instance that
# Redis.new connects to.
describe SpaceSaver do
  it "returns an empty list when you ask about an empty leaderboard" do
    s = SpaceSaver.new(Redis.new, 10)
    s.leaders('unknown-leaderboard').should be_empty
  end
  it "can store different leaderboards in the same Redis database" do
    s = SpaceSaver.new(Redis.new, 10)
    s.increment('a','a_item')
    s.increment('b','b_item')
    s.leaders('a').should == [["a_item", 1.0]]
    s.leaders('b').should == [["b_item", 1.0]]
  end
  it "uses a Redis sorted set of cardinality K if you pass K to the SpaceSaver constructor" do
    r = Redis.new
    [3,4,5].each do |k|
      s = SpaceSaver.new(r, k)
      30.times do |i|
        s.increment("leaderboard#{k}", "item#{i}")
        # the set must never exceed k members, even mid-stream
        r.zcard("leaderboard#{k}").should <= k
      end
      r.zcard("leaderboard#{k}").should == k
    end
  end
  it "can return less than k leaders" do
    s = SpaceSaver.new(Redis.new, 20)
    100.times { |i| s.increment("leaderboard", "item#{i}") }
    [5,6,7,8].each do |i|
      s.leaders("leaderboard", i).length.should == i
      # a longer prefix of the ranking must contain every shorter one
      s.leaders("leaderboard", i+1).should include(*s.leaders("leaderboard", i))
    end
  end
  it "stores any element that occurs at least n/k times" do
    s = SpaceSaver.new(Redis.new, 5)
    # 1000 increments in total with k = 5; foo, bar and baz each get 200
    200.times { s.increment("leaderboard", "foo") }
    400.times do |i|
      s.increment("leaderboard", "bar") if i % 2 == 0
      s.increment("leaderboard", "item#{i}")
    end
    200.times { s.increment("leaderboard", "baz") }
    expected_leaders = ['foo','bar','baz']
    s.leaders("leaderboard").map{ |x| x.first }.should include(*expected_leaders)
  end

end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Shared setup for the spec suite: puts lib/ on the load path, points the
# Redis client at a dedicated test database and wipes that database around
# every example.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'rspec'
require 'redis'
require 'space-saver-redis'

# Database number is overridable so the specs never have to share a database
# that holds real data.
db_number = ENV['REDIS_TEST_DATABASE'] || '15'
ENV['REDIS_URL'] = "redis://localhost:6379/#{db_number}"
redis = Redis.new
if redis.keys('*').length > 0
  puts "Warning! These specs use database #{db_number} on your local redis instance"
  puts "running on port 6379. Your database #{db_number} seems to have keys in it."
  puts "Please clear them before running the specs or set the environment"
  puts "variable REDIS_TEST_DATABASE to use a different database number."
  # A bare `raise SystemExit` carries status 0, which would report the
  # aborted run as a success; exit non-zero instead.
  exit 1
end

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}

RSpec.configure do |config|
  # The examples flush the whole database, which is why the emptiness check
  # above refuses to run against one that already has keys.
  config.before(:each) do
    Redis.new.flushdb
  end
  config.after(:each) do
    Redis.new.flushdb
  end
end
|
metadata
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: space-saver-redis
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aaron Windsor
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-12-18 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: redis
|
16
|
+
requirement: &2169308260 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 3.0.1
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *2169308260
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: jeweler
|
27
|
+
requirement: &2169307780 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.8.4
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *2169307780
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rake
|
38
|
+
requirement: &2169307300 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.9.2.2
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *2169307300
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rspec
|
49
|
+
requirement: &2169306820 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.11.0
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *2169306820
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: timecop
|
60
|
+
requirement: &2169306340 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ~>
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 0.5.3
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *2169306340
|
69
|
+
description: A pure Ruby implementation of the SpaceSaver algorithm for approximating
|
70
|
+
the top K elements in a data stream, backed by Redis
|
71
|
+
email: aaron.windsor@gmail.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files:
|
75
|
+
- LICENSE.txt
|
76
|
+
- README.md
|
77
|
+
files:
|
78
|
+
- .document
|
79
|
+
- .rspec
|
80
|
+
- Gemfile
|
81
|
+
- Gemfile.lock
|
82
|
+
- LICENSE.txt
|
83
|
+
- README.md
|
84
|
+
- Rakefile
|
85
|
+
- VERSION
|
86
|
+
- lib/space-saver-redis.rb
|
87
|
+
- spec/space-saver-redis_spec.rb
|
88
|
+
- spec/spec_helper.rb
|
89
|
+
homepage: http://github.com/aaw/space-saver-redis
|
90
|
+
licenses:
|
91
|
+
- MIT
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
segments:
|
103
|
+
- 0
|
104
|
+
hash: 300893669371900743
|
105
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ! '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
requirements: []
|
112
|
+
rubyforge_project:
|
113
|
+
rubygems_version: 1.8.10
|
114
|
+
signing_key:
|
115
|
+
specification_version: 3
|
116
|
+
summary: A pure Ruby implementation of the SpaceSaver algorithm for approximating
|
117
|
+
the top K elements in a data stream, backed by Redis
|
118
|
+
test_files: []
|