count_min_sketch 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +71 -0
- data/Rakefile +3 -0
- data/count_min_sketch.gemspec +26 -0
- data/lib/count_min_sketch.rb +1 -0
- data/lib/count_min_sketch/counter.rb +34 -0
- data/lib/count_min_sketch/version.rb +3 -0
- data/spec/count_min_sketch_spec.rb +88 -0
- data/spec/fixtures/band_names.yml +60 -0
- data/spec/spec_helper.rb +3 -0
- data/tasks/rspec.rake +3 -0
- metadata +118 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2073d6bc85d3bf50b04a6db9adfa374c54586713
|
4
|
+
data.tar.gz: 3585ef0b618354e3955f06942726435a8d0865b5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ec86bfdc98e374d9d405f354ed370ed4b34cc2012218cda28804a3cb222a5442c6428bb799d8810689832b59abaac24baf9dda76eb70b44ccedac7cc153bf761
|
7
|
+
data.tar.gz: 0534b13ad4b0733c629c349b291c034b92fde7d836faf9fe8701262094da715b310e193072c30dcc659b629d28c7d6f9414001388631824168d00eabd2abdcaa
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Katie Hoffman
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# Count-Min Sketch
|
2
|
+
|
3
|
+
A Ruby implementation of Graham Cormode and S. Muthu Muthukrishnan's probabilistic sub-linear space streaming algorithm. Relies on [nashby's](https://github.com/nashby) [CityHash gem](https://github.com/nashby/cityhash). To read more about count-min sketches, see the [Wiki page](http://en.wikipedia.org/wiki/Count%E2%80%93min_sketch).
|
4
|
+
|
5
|
+
## Install
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'count_min_sketch'
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install count_min_sketch
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
`CountMinSketch::Counter.new` accepts two optional arguments: `k`, the number of hash functions, and `m`, the column size. These default to 10 and 100000 respectively.
|
24
|
+
|
25
|
+
**Without Arguments**
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
require 'count_min_sketch'
|
29
|
+
|
30
|
+
sketch = CountMinSketch::Counter.new
|
31
|
+
|
32
|
+
sketch.k
|
33
|
+
# => 10
|
34
|
+
sketch.m
|
35
|
+
# => 100000
|
36
|
+
```
|
37
|
+
|
38
|
+
**With Arguments**
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
require 'count_min_sketch'
|
42
|
+
|
43
|
+
name = "Guy Fieri"
|
44
|
+
restaurant = "Guy's American Kitchen and Bar"
|
45
|
+
|
46
|
+
k = 30
|
47
|
+
m = 128
|
48
|
+
sketch = CountMinSketch::Counter.new(k, m)
|
49
|
+
|
50
|
+
sketch.k
|
51
|
+
# => 30
|
52
|
+
sketch.m
|
53
|
+
# => 128
|
54
|
+
|
55
|
+
sketch.insert(name)
|
56
|
+
sketch.insert(restaurant, 2)
|
57
|
+
|
58
|
+
sketch.get_count(name)
|
59
|
+
# => 1
|
60
|
+
|
61
|
+
sketch.get_count(restaurant)
|
62
|
+
# => 2
|
63
|
+
```
|
64
|
+
|
65
|
+
## Contributing
|
66
|
+
|
67
|
+
1. Fork it ( https://github.com/kthffmn/count_min_sketch/fork )
|
68
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
69
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
70
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
71
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/count_min_sketch.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for count_min_sketch.

# Put this checkout's lib/ directory on the load path (once) so the version
# constant below is read from the working tree.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'count_min_sketch/version'

Gem::Specification.new do |gem|
  # Identity and descriptive metadata.
  gem.name        = "count_min_sketch"
  gem.version     = CountMinSketch::VERSION
  gem.authors     = ["Katie Hoffman"]
  gem.email       = ["ktahoffman@gmail.com"]
  gem.summary     = "Count-Min Sketch implementation that relies on nashby's CityHash gem."
  gem.description = "Implementation of Graham Cormode and S. Muthu Muthukrishnan's probabilistic sub-linear space streaming algorithm."
  gem.homepage    = "https://github.com/kthffmn/count_min_sketch"
  gem.license     = "MIT"

  # Package every file tracked by git; `-z` emits NUL-separated paths, hence
  # the split on "\x0".
  gem.files         = `git ls-files -z`.split("\x0")
  # Executables are the basenames of anything under bin/.
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Development-only tooling.
  gem.add_development_dependency "bundler", "~> 1.7.3"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rspec", "~> 3.1.0"

  # Runtime dependency.
  gem.add_dependency 'cityhash', "~> 0.8.1"
end
|
data/lib/count_min_sketch.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'count_min_sketch/counter'
|
data/lib/count_min_sketch/counter.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'cityhash'
|
2
|
+
require "count_min_sketch/version"
|
3
|
+
|
4
|
+
module CountMinSketch
  # A count-min sketch: +k+ rows of +m+ integer counters, one randomly seeded
  # hash per row. Inserting an item bumps one counter in every row; the
  # estimated count of an item is the minimum of its counters across rows.
  class Counter

    # Inclusive upper bound for the random per-row seeds: 2**(bits - 2) - 1,
    # where bits is the size of a native Integer (0.size bytes).
    MAX_FIXNUM = 2**(0.size*8 - 2) - 1

    attr_reader :k, :m, :data

    # Builds an empty sketch.
    #
    # k - number of hash functions (rows); must be a positive integer.
    # m - number of counters per row (columns); must be a positive integer.
    #
    # Raises ArgumentError when k or m is not positive. A sketch with no rows
    # has no minimum to report, and one with no columns would divide by zero
    # on the first insert, so both are rejected up front. Raises TypeError
    # when k or m cannot be converted by Kernel#Integer.
    def initialize(k=10, m=100000)
      # Normalise to Integer so the column index below is always computed
      # with exact integer arithmetic.
      @k = Integer(k)
      @m = Integer(m)
      raise ArgumentError, "k must be a positive integer (got #{k.inspect})" unless @k > 0
      raise ArgumentError, "m must be a positive integer (got #{m.inspect})" unless @m > 0

      @data = Array.new(@k) { Array.new(@m, 0) }
      @seeds = Array.new(@k) { rand(MAX_FIXNUM + 1) }
    end

    # Returns the estimated count for x without modifying the sketch.
    def get_count(x)
      columns_for(x).each_with_index.map { |j, i| @data[i][j] }.min
    end

    # Adds n (default 1) to x's counter in every row and returns the new
    # estimated count for x, i.e. the smallest of the updated counters.
    def insert(x, n=1)
      columns_for(x).each_with_index.map { |j, i| @data[i][j] += n }.min
    end

    private

    # Column index of x in each row, in row order: the row's seeded 64-bit
    # CityHash of x reduced modulo the row width.
    def columns_for(x)
      @seeds.map { |seed| CityHash.hash64(x, seed) % @m }
    end

  end
end
|
data/spec/count_min_sketch_spec.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# Behavioural specs for CountMinSketch::Counter. A small sketch (30 rows of
# 128 columns) is rebuilt for every example.
describe 'CountMinSketch' do
  describe 'Counter' do

    let(:count_min_sketch) { CountMinSketch::Counter.new(30, 128) }
    let(:bands)            { YAML.load_file('spec/fixtures/band_names.yml') }
    let(:band)             { "Sylvan Esso" }
    let(:singer)           { "Valerie June" }

    describe "MAX_FIXNUM" do
      # Fixnum and Bignum no longer exist as separate classes on current
      # Ruby, so check the magnitude of the value instead of its class.
      it "is the largest value in the platform's Fixnum range" do
        max  = CountMinSketch::Counter::MAX_FIXNUM
        bits = 0.size * 8 - 2

        expect(max).to be_a(Integer)
        expect(max.bit_length).to eq(bits)
        expect((max + 1).bit_length).to eq(bits + 1)
      end
    end

    describe "#initialize" do
      it "does not raise an error initialized with two arguments, 'k' and 'm'" do
        expect(count_min_sketch.k).to eq(30)
        expect(count_min_sketch.m).to eq(128)
      end
    end

    describe "#insert" do
      it "returns a count of one when a string is added once" do
        expect(count_min_sketch.insert(singer)).to eq(1)
      end

      it "returns a count of two when a string is added once, then a second time" do
        count_min_sketch.insert(band)
        expect(count_min_sketch.insert(band)).to eq(2)
      end

      it "returns a count of two when a string is added with optional times argument of 2" do
        expect(count_min_sketch.insert(singer, 2)).to eq(2)
      end

      it "returns a count of ten when a string is added with optional times argument of 10" do
        expect(count_min_sketch.insert(band, 10)).to eq(10)
      end
    end

    describe "#get_count" do
      it "remembers when a string is added once" do
        count_min_sketch.insert(band)
        expect(count_min_sketch.get_count(band)).to eq(1)
      end

      it "remembers when the same string is added twice" do
        2.times { count_min_sketch.insert(singer) }
        expect(count_min_sketch.get_count(singer)).to eq(2)
      end

      it "returns a count of zero for strings that have not been added" do
        expect(count_min_sketch.get_count("Rhye")).to eq(0)
      end

      it "returns a zero count for a string that has not been added regardless of other added strings" do
        2.times do
          count_min_sketch.insert(singer)
          count_min_sketch.insert(band)
        end
        expect(count_min_sketch.get_count("Rhye")).to eq(0)
      end

      it "returns the correct count of one regardless of other added strings" do
        count_min_sketch.insert(band)
        expect(count_min_sketch.get_count(band)).to eq(1)
        2.times { count_min_sketch.insert(singer) }
        expect(count_min_sketch.get_count(band)).to eq(1)
      end

      it "returns the correct count of two regardless of other added strings" do
        2.times { count_min_sketch.insert(singer) }
        expect(count_min_sketch.get_count(singer)).to eq(2)
        3.times { count_min_sketch.insert(band) }
        expect(count_min_sketch.get_count(singer)).to eq(2)
      end

      it "returns the correct count of four regardless of other added strings" do
        # The block parameter is deliberately not called `band`, to avoid
        # shadowing the let of the same name.
        bands.each { |name| count_min_sketch.insert(name) }
        expect(count_min_sketch.get_count("Chvrches")).to eq(4)
        expect(count_min_sketch.get_count("St. Vincent")).to eq(4)
      end
    end

  end
end
|
data/spec/fixtures/band_names.yml
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
- The Flaming Lips
|
2
|
+
- Valerie June
|
3
|
+
- Miranda Lambert
|
4
|
+
- St. Vincent
|
5
|
+
- Chvrches
|
6
|
+
- Katy B
|
7
|
+
- Sturgill Simpson
|
8
|
+
- St. Vincent
|
9
|
+
- Freddie Gibbs and Madlib
|
10
|
+
- Chvrches
|
11
|
+
- St. Vincent
|
12
|
+
- Sharon Van Etten
|
13
|
+
- Chvrches
|
14
|
+
- EMA
|
15
|
+
- Future Islands
|
16
|
+
- Lake Street Dive
|
17
|
+
- St. Vincent
|
18
|
+
- Chvrches
|
19
|
+
- Swans
|
20
|
+
- Hurray for the Riff Raff
|
21
|
+
- My Bloody Valentine
|
22
|
+
- Skrillex
|
23
|
+
- Jack White
|
24
|
+
- Eric Church
|
25
|
+
- Against Me!
|
26
|
+
- J. Cole
|
27
|
+
- Kurt Vile
|
28
|
+
- Waxahatchee
|
29
|
+
- Bruce Springsteen
|
30
|
+
- Parquet Courts
|
31
|
+
- Best Coast
|
32
|
+
- Earl Sweatshirt
|
33
|
+
- Haim
|
34
|
+
- Beck
|
35
|
+
- Damon Albarn
|
36
|
+
- Tegan & Sara
|
37
|
+
- Kacey Musgraves
|
38
|
+
- Chance The Rapper
|
39
|
+
- The Black Keys
|
40
|
+
- The Roots
|
41
|
+
- Phoenix
|
42
|
+
- Sky Ferreira
|
43
|
+
- Laura Marling
|
44
|
+
- Future
|
45
|
+
- Sia
|
46
|
+
- Ashley Monroe
|
47
|
+
- Danny Brown
|
48
|
+
- David Bowie
|
49
|
+
- Stephen Malkmus and the Jicks
|
50
|
+
- Conor Oberst
|
51
|
+
- Drake
|
52
|
+
- Disclosure
|
53
|
+
- Arctic Monkeys
|
54
|
+
- Pharrell Williams
|
55
|
+
- Tune-Yards
|
56
|
+
- The National
|
57
|
+
- Lorde
|
58
|
+
- Arcade Fire
|
59
|
+
- Schoolboy Q
|
60
|
+
- Calle 13
|
data/spec/spec_helper.rb
ADDED
data/tasks/rspec.rake
ADDED
metadata
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: count_min_sketch
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Katie Hoffman
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-10-24 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.7.3
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.7.3
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 3.1.0
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 3.1.0
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: cityhash
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.8.1
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.8.1
|
69
|
+
description: Implementation of Graham Cormode and S. Muthu Muthukrishnan's probabilistic
|
70
|
+
sub-linear space streaming algorithm.
|
71
|
+
email:
|
72
|
+
- ktahoffman@gmail.com
|
73
|
+
executables: []
|
74
|
+
extensions: []
|
75
|
+
extra_rdoc_files: []
|
76
|
+
files:
|
77
|
+
- ".gitignore"
|
78
|
+
- ".rspec"
|
79
|
+
- Gemfile
|
80
|
+
- LICENSE.txt
|
81
|
+
- README.md
|
82
|
+
- Rakefile
|
83
|
+
- count_min_sketch.gemspec
|
84
|
+
- lib/count_min_sketch.rb
|
85
|
+
- lib/count_min_sketch/counter.rb
|
86
|
+
- lib/count_min_sketch/version.rb
|
87
|
+
- spec/count_min_sketch_spec.rb
|
88
|
+
- spec/fixtures/band_names.yml
|
89
|
+
- spec/spec_helper.rb
|
90
|
+
- tasks/rspec.rake
|
91
|
+
homepage: https://github.com/kthffmn/count_min_sketch
|
92
|
+
licenses:
|
93
|
+
- MIT
|
94
|
+
metadata: {}
|
95
|
+
post_install_message:
|
96
|
+
rdoc_options: []
|
97
|
+
require_paths:
|
98
|
+
- lib
|
99
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: '0'
|
109
|
+
requirements: []
|
110
|
+
rubyforge_project:
|
111
|
+
rubygems_version: 2.2.2
|
112
|
+
signing_key:
|
113
|
+
specification_version: 4
|
114
|
+
summary: Count-Min Sketch implementation that relies on nashby's CityHash gem.
|
115
|
+
test_files:
|
116
|
+
- spec/count_min_sketch_spec.rb
|
117
|
+
- spec/fixtures/band_names.yml
|
118
|
+
- spec/spec_helper.rb
|