count_min_sketch 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2073d6bc85d3bf50b04a6db9adfa374c54586713
4
+ data.tar.gz: 3585ef0b618354e3955f06942726435a8d0865b5
5
+ SHA512:
6
+ metadata.gz: ec86bfdc98e374d9d405f354ed370ed4b34cc2012218cda28804a3cb222a5442c6428bb799d8810689832b59abaac24baf9dda76eb70b44ccedac7cc153bf761
7
+ data.tar.gz: 0534b13ad4b0733c629c349b291c034b92fde7d836faf9fe8701262094da715b310e193072c30dcc659b629d28c7d6f9414001388631824168d00eabd2abdcaa
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format documentation
3
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in count_min_sketch.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Katie Hoffman
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,71 @@
1
+ # Count-Min Sketch
2
+
3
+ A Ruby implementation of Graham Cormode and S. Muthu Muthukrishnan's probabilistic sub-linear space streaming algorithm. Relies on [nashby's](https://github.com/nashby) [CityHash gem](https://github.com/nashby/cityhash). To read more about count-min sketches, see the [Wikipedia article](http://en.wikipedia.org/wiki/Count%E2%80%93min_sketch).
4
+
5
+ ## Install
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'count_min_sketch'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install count_min_sketch
20
+
21
+ ## Usage
22
+
23
+ `CountMinSketch::Counter.new` accepts two optional arguments: `k`, the number of hash functions, and `m`, the column size. These default to 10 and 100000, respectively.
24
+
25
+ **Without Arguments**
26
+
27
+ ```ruby
28
+ require 'count_min_sketch'
29
+
30
+ sketch = CountMinSketch::Counter.new
31
+
32
+ sketch.k
33
+ # => 10
34
+ sketch.m
35
+ # => 100000
36
+ ```
37
+
38
+ **With Arguments**
39
+
40
+ ```ruby
41
+ require 'count_min_sketch'
42
+
43
+ name = "Guy Fieri"
44
+ restaurant = "Guy's American Kitchen and Bar"
45
+
46
+ k = 30
47
+ m = 128
48
+ sketch = CountMinSketch::Counter.new(k, m)
49
+
50
+ sketch.k
51
+ # => 30
52
+ sketch.m
53
+ # => 128
54
+
55
+ sketch.insert(name)
56
+ sketch.insert(restaurant, 2)
57
+
58
+ sketch.get_count(name)
59
+ # => 1
60
+
61
+ sketch.get_count(restaurant)
62
+ # => 2
63
+ ```
64
+
65
+ ## Contributing
66
+
67
+ 1. Fork it ( https://github.com/kthffmn/count_min_sketch/fork )
68
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
69
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
70
+ 4. Push to the branch (`git push origin my-new-feature`)
71
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ Dir.glob('tasks/**/*.rake').each(&method(:import)) # passes every .rake path found (recursively) under tasks/ to import
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'count_min_sketch/version'
5
+
6
+ Gem::Specification.new do |spec| # gem packaging metadata for count_min_sketch
7
+ spec.name = "count_min_sketch"
8
+ spec.version = CountMinSketch::VERSION
9
+ spec.authors = ["Katie Hoffman"]
10
+ spec.email = ["ktahoffman@gmail.com"]
11
+ spec.summary = "Count-Min Sketch implementation that relies on nashby's CityHash gem."
12
+ spec.description = "Implementation of Graham Cormode and S. Muthu Muthukrishnan's probabilistic sub-linear space streaming algorithm."
13
+ spec.homepage = "https://github.com/kthffmn/count_min_sketch"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0") # shells out to git; -z emits NUL-separated paths, split here on NUL
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } # basenames of listed files under bin/
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) # listed files under test/, spec/ or features/
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.7.3" # pessimistic constraint: >= 1.7.3 and < 1.8
22
+ spec.add_development_dependency "rake", "~> 10.0" # >= 10.0 and < 11
23
+ spec.add_development_dependency "rspec", "~> 3.1.0" # >= 3.1.0 and < 3.2
24
+
25
+ spec.add_dependency 'cityhash', "~> 0.8.1" # runtime dependency: >= 0.8.1 and < 0.9
26
+ end
@@ -0,0 +1 @@
1
+ require 'count_min_sketch/counter'
@@ -0,0 +1,34 @@
1
+ require 'cityhash'
2
+ require "count_min_sketch/version"
3
+
4
+ module CountMinSketch
5
+ class Counter # count-min sketch: k rows of m counters, one seeded CityHash per row
6
+
7
+ MAX_FIXNUM = 2**(0.size*8 - 2) - 1 # 2**(bits - 2) - 1, where bits = 0.size * 8
8
+
9
+ attr_reader :k, :m, :data # row count, column count and the raw counter table; @seeds has no reader
10
+
11
+ def initialize(k=10, m=100000) # k: number of hash functions (rows); m: number of columns in each row
12
+ @k = k
13
+ @m = m
14
+ @data = Array.new(k) { Array.new(m,0) } # block form builds a separate zero-filled array for each row
15
+ @seeds = Array.new(k) { rand(MAX_FIXNUM + 1) } # one random integer seed in 0..MAX_FIXNUM per row
16
+ end
17
+
18
+ def get_count(x) # returns the current estimate for x
19
+ insert(x, 0) # adding 0 leaves each counter's value unchanged and returns the minimum over x's cells
20
+ end
21
+
22
+ def insert(x, n=1) # adds n to x's cell in every row; returns the smallest resulting cell value
23
+ min_count = Float::INFINITY # remains Infinity only when there are no rows to visit
24
+ hashes_of_x = @seeds.map { |s| CityHash.hash64(x, s) } # one hash of x per seed, i.e. per row
25
+ hashes_of_x.each_with_index do |hash, i|
26
+ j = hash % m # column index of x within row i
27
+ count = @data[i][j] += n # update the cell, then keep its new value
28
+ min_count = count if count < min_count
29
+ end
30
+ min_count
31
+ end
32
+
33
+ end
34
+ end
@@ -0,0 +1,3 @@
1
+ module CountMinSketch
2
+ VERSION = "0.0.4"
3
+ end
@@ -0,0 +1,88 @@
1
+ describe 'CountMinSketch' do
2
+ describe 'Counter' do
3
+
4
+ let(:count_min_sketch) { CountMinSketch::Counter.new(30, 128) } # k = 30 rows, m = 128 columns
5
+ let(:bands) { YAML.load_file('spec/fixtures/band_names.yml') } # NOTE(review): path is relative to the working directory — presumably specs must run from the gem root
6
+ let(:band) { "Sylvan Esso" }
7
+ let(:singer) { "Valerie June" }
8
+
9
+ describe "MAX_FIXNUM" do
10
+ it "is the maximum Fixnum on the machine" do
11
+ expect(CountMinSketch::Counter::MAX_FIXNUM.class).to eq(Fixnum) # NOTE(review): Fixnum and Bignum were deprecated in Ruby 2.4 and removed in 3.2, where this raises NameError
12
+ expect((CountMinSketch::Counter::MAX_FIXNUM + 1).class).to eq(Bignum)
13
+ end
14
+ end
15
+
16
+ describe "#initialize" do
17
+ it "does not raise an error initialized with two arguments, 'k' and 'm'" do
18
+ expect(count_min_sketch.k).to eq(30)
19
+ expect(count_min_sketch.m).to eq(128)
20
+ end
21
+ end
22
+
23
+ describe "#insert" do
24
+ it "returns a count of one when a string is added once" do
25
+ expect(count_min_sketch.insert(singer)).to eq(1)
26
+ end
27
+
28
+ it "returns a count of two when a string is added once, then a second time" do
29
+ count_min_sketch.insert(band)
30
+ expect(count_min_sketch.insert(band)).to eq(2)
31
+ end
32
+
33
+ it "returns a count of two when a string is added with optional times argument of 2" do
34
+ expect(count_min_sketch.insert(singer, 2)).to eq(2)
35
+ end
36
+
37
+ it "returns a count of ten when a string is added with optional times argument of 10" do
38
+ expect(count_min_sketch.insert(band, 10)).to eq(10)
39
+ end
40
+ end
41
+
42
+ describe "#get_count" do
43
+ it "remembers when a string is added once" do
44
+ count_min_sketch.insert(band)
45
+ expect(count_min_sketch.get_count(band)).to eq(1)
46
+ end
47
+
48
+ it "remembers when the same string is added twice" do
49
+ 2.times { count_min_sketch.insert(singer) }
50
+ expect(count_min_sketch.get_count(singer)).to eq(2)
51
+ end
52
+
53
+ it "returns a count of zero for strings that have not been added" do
54
+ expect(count_min_sketch.get_count("Rhye")).to eq(0)
55
+ end
56
+
57
+ it "returns a zero count for a string that has not been added
58
+ regardless of other added strings" do
59
+ 2.times do
60
+ count_min_sketch.insert(singer)
61
+ count_min_sketch.insert(band)
62
+ end
63
+ expect(count_min_sketch.get_count("Rhye")).to eq(0) # NOTE(review): exact expectation presumably relies on "Rhye" missing the other strings' cells in at least one of the 30 rows
64
+ end
65
+
66
+ it "returns the correct count of one regardless of other added strings" do
67
+ count_min_sketch.insert(band)
68
+ expect(count_min_sketch.get_count(band)).to eq(1)
69
+ 2.times { count_min_sketch.insert(singer) }
70
+ expect(count_min_sketch.get_count(band)).to eq(1)
71
+ end
72
+
73
+ it "returns the correct count of two regardless of other added strings" do
74
+ 2.times { count_min_sketch.insert(singer) }
75
+ expect(count_min_sketch.get_count(singer)).to eq(2)
76
+ 3.times { count_min_sketch.insert(band) }
77
+ expect(count_min_sketch.get_count(singer)).to eq(2)
78
+ end
79
+
80
+ it "returns the correct count of four regardless of other added strings" do
81
+ bands.each { |band| count_min_sketch.insert(band) } # block parameter shadows the let(:band) helper inside this block
82
+ expect(count_min_sketch.get_count("Chvrches")).to eq(4) # "Chvrches" is listed four times in the fixture
83
+ expect(count_min_sketch.get_count("St. Vincent")).to eq(4) # "St. Vincent" is listed four times in the fixture
84
+ end
85
+ end
86
+
87
+ end
88
+ end
@@ -0,0 +1,60 @@
1
+ - The Flaming Lips
2
+ - Valerie June
3
+ - Miranda Lambert
4
+ - St. Vincent
5
+ - Chvrches
6
+ - Katy B
7
+ - Sturgill Simpson
8
+ - St. Vincent
9
+ - Freddie Gibbs and Madlib
10
+ - Chvrches
11
+ - St. Vincent
12
+ - Sharon Van Etten
13
+ - Chvrches
14
+ - EMA
15
+ - Future Islands
16
+ - Lake Street Dive
17
+ - St. Vincent
18
+ - Chvrches
19
+ - Swans
20
+ - Hurray for the Riff Raff
21
+ - My Bloody Valentine
22
+ - Skrillex
23
+ - Jack White
24
+ - Eric Church
25
+ - Against Me!
26
+ - J. Cole
27
+ - Kurt Vile
28
+ - Waxahatchee
29
+ - Bruce Springsteen
30
+ - Parquet Courts
31
+ - Best Coast
32
+ - Earl Sweatshirt
33
+ - Haim
34
+ - Beck
35
+ - Damon Albarn
36
+ - Tegan & Sara
37
+ - Kacey Musgraves
38
+ - Chance The Rapper
39
+ - The Black Keys
40
+ - The Roots
41
+ - Phoenix
42
+ - Sky Ferreira
43
+ - Laura Marling
44
+ - Future
45
+ - Sia
46
+ - Ashley Monroe
47
+ - Danny Brown
48
+ - David Bowie
49
+ - Stephen Malkmus and the Jicks
50
+ - Conor Oberst
51
+ - Drake
52
+ - Disclosure
53
+ - Arctic Monkeys
54
+ - Pharrell Williams
55
+ - Tune-Yards
56
+ - The National
57
+ - Lorde
58
+ - Arcade Fire
59
+ - Schoolboy Q
60
+ - Calle 13
@@ -0,0 +1,3 @@
1
+ require_relative '../lib/count_min_sketch'
2
+ require 'cityhash'
3
+ require 'yaml'
data/tasks/rspec.rake ADDED
@@ -0,0 +1,3 @@
1
+ require 'rspec/core/rake_task'
2
+
3
+ RSpec::Core::RakeTask.new(:spec) # registers a rake task named spec
metadata ADDED
@@ -0,0 +1,118 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: count_min_sketch
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Katie Hoffman
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.7.3
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 1.7.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 3.1.0
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 3.1.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: cityhash
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.8.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.8.1
69
+ description: Implementation of Graham Cormode and S. Muthu Muthukrishnan's probabilistic
70
+ sub-linear space streaming algorithm.
71
+ email:
72
+ - ktahoffman@gmail.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - ".rspec"
79
+ - Gemfile
80
+ - LICENSE.txt
81
+ - README.md
82
+ - Rakefile
83
+ - count_min_sketch.gemspec
84
+ - lib/count_min_sketch.rb
85
+ - lib/count_min_sketch/counter.rb
86
+ - lib/count_min_sketch/version.rb
87
+ - spec/count_min_sketch_spec.rb
88
+ - spec/fixtures/band_names.yml
89
+ - spec/spec_helper.rb
90
+ - tasks/rspec.rake
91
+ homepage: https://github.com/kthffmn/count_min_sketch
92
+ licenses:
93
+ - MIT
94
+ metadata: {}
95
+ post_install_message:
96
+ rdoc_options: []
97
+ require_paths:
98
+ - lib
99
+ required_ruby_version: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: '0'
109
+ requirements: []
110
+ rubyforge_project:
111
+ rubygems_version: 2.2.2
112
+ signing_key:
113
+ specification_version: 4
114
+ summary: Count-Min Sketch implementation that relies on nashby's CityHash gem.
115
+ test_files:
116
+ - spec/count_min_sketch_spec.rb
117
+ - spec/fixtures/band_names.yml
118
+ - spec/spec_helper.rb