bloomer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ # Fetch gems over HTTPS rather than plain HTTP.
2
+ source "https://rubygems.org"
3
+ # Runtime dependencies are declared in bloomer.gemspec.
4
+ gemspec
5
+
6
+ # Development tools used by the Rakefile and the specs.
7
+ gem "rake"
8
+ gem "yard"
9
+ gem "rspec", '~> 2.7.0'
data/MIT-LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2012 Matthew McEachen
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # Bloomer: a pure-ruby bloom filter with no extra fluff
2
+
3
+
4
+ [Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
5
+ a given string has been seen before--in constant time, and using a fixed amount of RAM.
6
+
7
+ This implementation is the Nth bloom filter gem written in ruby -- but, at the time of conception, the only one that
8
+
9
+ * uses a robust set of hashing functions
10
+ * can marshal state quickly
11
+ * does not require EM or Redis or something else unrelated to simply implementing a bloom filter
12
+
13
+ ## Usage
14
+
15
+ ```ruby
16
+ expected_size = 10_000
17
+ false_positive_probability = 0.01
18
+ b = Bloomer.new(expected_size, false_positive_probability)
19
+ b.add "cat"
20
+ b.include? "cat"
21
+ #=> true
22
+ b.include? "dog"
23
+ #=> false
24
+ ```
25
+
26
+ Serialization is through [Marshal](http://ruby-doc.org/core-1.8.7/Marshal.html):
27
+
28
+ ```ruby
29
+ b = Bloomer.new(10)
30
+ b.add("a")
31
+ s = Marshal.dump(b)
32
+ new_b = Marshal.load(s)
33
+ new_b.include? "a"
34
+ #=> true
35
+ ```
36
+
37
+ ## History
38
+
39
+ * 0.0.1 Bloom, there it is.
40
+
41
+
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ require "bundler/gem_tasks"
2
+ require "yard"
3
+ require "rspec/core/rake_task"
4
+
5
+ # Generate API documentation from the library sources and the README.
6
+ YARD::Rake::YardocTask.new { |yardoc| yardoc.files = %w[lib/**/*.rb README.md] }
7
+
8
+ # `rake spec` runs the RSpec suite.
9
+ RSpec::Core::RakeTask.new(:spec)
10
+
11
+ # A bare `rake` runs the specs.
12
+ task :default => [:spec]
data/bloomer.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
+ # Read the version from the source text instead of requiring the library:
5
+ # lib/bloomer.rb requires the bitarray gem, which may not be installed yet
6
+ # when this gemspec is evaluated (for example by the Gemfile's `gemspec` line).
7
+ version_source = File.read(File.expand_path("../lib/bloomer.rb", __FILE__))
8
+ version = version_source[/^\s*VERSION\s*=\s*"([^"]+)"/, 1]
9
+ raise "could not find Bloomer::VERSION in lib/bloomer.rb" unless version
10
+
11
+ Gem::Specification.new do |s|
12
+   s.name = "bloomer"
13
+   s.version = version
14
+   s.authors = ["Matthew McEachen"]
15
+   s.email = ["matthew+github@mceachen.org"]
16
+   s.homepage = ""
17
+   s.summary = %q{Pure-ruby bloom filter with minimal dependencies}
18
+   s.description = %q{Pure-ruby bloom filter with minimal dependencies}
19
+
20
+   s.rubyforge_project = "bloomer"
21
+
22
+   # package exactly the files tracked by git
23
+   s.files = `git ls-files`.split("\n")
24
+   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
25
+   s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
26
+   s.require_paths = ["lib"]
27
+
28
+   s.add_dependency "bitarray"
29
+ end
data/lib/bloomer.rb ADDED
@@ -0,0 +1,174 @@
1
+ require 'bitarray'
2
+
3
+ # A Bloom filter over strings, backed by a BitArray and a family of hash lambdas.
4
+ #
5
+ # A membership test can return a false positive, but a string that was added
6
+ # to a filter is always reported as included by that same filter.
7
+ class Bloomer
8
+   VERSION = "0.0.1"
9
+
10
+   # Builds an empty filter, or wraps restored state when opts carries it.
11
+   #
12
+   # @param expected_size [Numeric] how many distinct strings the filter should hold
13
+   # @param false_positive_probability [Numeric] target false-positive rate, strictly between 0 and 1
14
+   # @param opts [Hash] :ba supplies a ready-made bit array and :k the number of
15
+   #   hash functions; when both are present the first two arguments are not read
16
+   # @raise [ArgumentError] if a bit array must be sized and a sizing argument is out of range
17
+   def initialize(expected_size, false_positive_probability = 0.001, opts = {})
18
+     @ba = opts[:ba] || begin
19
+       check_sizing!(expected_size, false_positive_probability)
20
+       # m is the required number of bits in the array: -n * ln(p) / (ln 2)^2
21
+       m = -(expected_size * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
22
+       # at least one bit, because indices are taken modulo the array size
23
+       BitArray.new([m.round, 1].max)
24
+     end
25
+
26
+     # k is the number of hash functions that minimizes the probability of false positives:
27
+     # ln(2) * m / n, computed in floating point so that m / n is not truncated
28
+     k = opts[:k] || [(Math.log(2) * @ba.size / expected_size).round, 1].max
29
+     @hashes = Hashes.build(k.round)
30
+   end
31
+
32
+   # Sets every bit that the hash functions select for +string+.
33
+   def add(string)
34
+     indices(string).each { |ea| @ba[ea] = 1 }
35
+   end
36
+
37
+   # @return [Boolean] true if every bit selected for +string+ is set
38
+   def include?(string)
39
+     indices(string).all? { |ea| @ba[ea] != 0 }
40
+   end
41
+
42
+   # Marshal hook. Only the hash count and the bit array are written; the
43
+   # hash lambdas are rebuilt from the count by Bloomer._load.
44
+   def _dump(depth)
45
+     [@hashes.size, Marshal.dump(@ba)].join("\n")
46
+   end
47
+
48
+   # Marshal hook: rebuilds a filter from the string written by #_dump.
49
+   # NOTE(review): this runs Marshal.load on the payload; only load trusted data.
50
+   def self._load(data)
51
+     # split once: the marshaled bit array may itself contain newline bytes
52
+     k, ba = data.split("\n", 2)
53
+     new(nil, nil, :k => k.to_i, :ba => Marshal.load(ba))
54
+   end
55
+
56
+   private
57
+
58
+   # Rejects sizing arguments for which the bit-array formula is undefined.
59
+   def check_sizing!(expected_size, false_positive_probability)
60
+     unless expected_size.is_a?(Numeric) && expected_size > 0
61
+       raise ArgumentError, "expected_size must be a positive number"
62
+     end
63
+     prob = false_positive_probability
64
+     unless prob.is_a?(Numeric) && prob > 0 && prob < 1
65
+       raise ArgumentError, "false_positive_probability must be between 0 and 1 (exclusive)"
66
+     end
67
+   end
68
+
69
+   # Bit positions for +string+, one per hash function.
70
+   def indices(string)
71
+     @hashes.collect do |h|
72
+       h.call(string) % @ba.size
73
+     end
74
+   end
75
+
76
+   # Array whose elements can be handed out round-robin.
77
+   class CircularQueue < Array
78
+     # Moves the first element to the end and returns it.
79
+     def rot!
80
+       first = self.shift
81
+       self.push(first)
82
+       first
83
+     end
84
+   end
85
+
86
+   # Builds the hash lambdas. Each generator returns a lambda that maps a
87
+   # String to an Integer; a and b seed the generator where it uses them.
88
+   class Hashes
89
+     PRIMES = [3571, 4219, 4447, 5167, 5419, 6211, 7057, 7351, 8269, 9241, 10267, 11719, 12097, 13267, 13669, 16651, 19441, 19927, 22447, 23497, 24571, 25117, 26227, 27361, 33391, 35317].freeze
90
+
91
+     MAX = 2**31 - 1
92
+
93
+     # 32-bit FNV-1a parameters
94
+     FNV_OFFSET_BASIS = 2166136261
95
+     FNV_PRIME = 16777619
96
+     FNV_MASK = 0xffffffff
97
+
98
+     # generators, in the order their hashes are handed out
99
+     GENERATORS = [:djb_hash, :js_hash, :rs_hash, :knr_hash, :fnv_hash].freeze
100
+
101
+     # Returns the first number_of_hashes lambdas: one default-seeded hash per
102
+     # generator, then further rounds seeded with pairs drawn from PRIMES.
103
+     def self.build(number_of_hashes)
104
+       hashes = GENERATORS.collect { |ea| send(ea) }
105
+       primes = CircularQueue.new PRIMES
106
+       while number_of_hashes > hashes.size
107
+         hashes += GENERATORS.collect do |ea|
108
+           send(ea, primes.rot!, primes.rot!)
109
+         end
110
+       end
111
+       hashes.first(number_of_hashes)
112
+     end
113
+
114
+     # written by Professor Daniel J. Bernstein from comp.lang.c (b is not used)
115
+     def self.djb_hash(a = 5381, b = nil)
116
+       lambda do |data|
117
+         data.each_byte.inject(a) do |hash, ea|
118
+           ((hash << 5) + hash + ea) % MAX
119
+         end
120
+       end
121
+     end
122
+
123
+     # bitwise hash function written by Justin Sobel (b is not used)
124
+     def self.js_hash(a = 1315423911, b = nil)
125
+       lambda do |data|
126
+         data.each_byte.inject(a) do |hash, ea|
127
+           (hash ^ ((hash << 5) + ea + (hash >> 2))) % MAX
128
+         end
129
+       end
130
+     end
131
+
132
+     # simple hash function from Robert Sedgewick's "Algorithms in C" book
133
+     def self.rs_hash(a = 63689, b = 378551)
134
+       lambda do |data|
135
+         i, j = a, b
136
+         data.each_byte.inject(0) do |hash, ea|
137
+           i = (i * j) % MAX
138
+           (hash * i + ea) % MAX
139
+         end
140
+       end
141
+     end
142
+
143
+     # From Kernighan and Ritchie's "The C Programming Language"
144
+     def self.knr_hash(a = 1619, b = 911)
145
+       lambda do |data|
146
+         data.each_byte.inject(a) do |hash, ea|
147
+           ((hash * b) + ea) % MAX
148
+         end
149
+       end
150
+     end
151
+
152
+     # 32-bit FNV-1a with the offset basis scaled by a (b is not used).
153
+     # It depends only on the bytes of the string, so a filter restored in
154
+     # another Ruby process selects the same bits.
155
+     def self.fnv_hash(a = 1, b = nil)
156
+       seed = (FNV_OFFSET_BASIS * a) & FNV_MASK
157
+       lambda do |data|
158
+         digest = data.each_byte.inject(seed) do |hash, ea|
159
+           ((hash ^ ea) * FNV_PRIME) & FNV_MASK
160
+         end
161
+         digest % MAX
162
+       end
163
+     end
164
+
165
+     # String#hash based generator (b is not used). No longer handed out by
166
+     # .build: String#hash may differ between Ruby processes, which breaks
167
+     # filters restored with Marshal. Kept for code that calls it directly.
168
+     def self.ruby_hash(a = 1, b = 1)
169
+       lambda do |data|
170
+         (data.hash * a) % MAX
171
+       end
172
+     end
173
+   end
174
+ end
@@ -0,0 +1,45 @@
1
+ require "spec_helper"
2
+
3
+ # Random string of +size+ ASCII letters (upper and lower case).
4
+ def rand_alpha(size)
5
+   chars = ('a'..'z').to_a + ('A'..'Z').to_a
6
+   (0...size).collect { chars[Kernel.rand(chars.length)] }.join
7
+ end
8
+
9
+ describe Bloomer do
10
+
11
+   it "should work trivially" do
12
+     b = Bloomer.new(10, 0.001)
13
+     b.add("a")
14
+     b.should include("a")
15
+     b.should_not include("")
16
+     b.should_not include("b")
17
+     b.add("b")
18
+     b.should include("b")
19
+     b.should_not include("")
20
+     b.add("")
21
+     b.should include("")
22
+   end
23
+
24
+   it "should find random strings" do
25
+     b = Bloomer.new(5_000, 0.001)
26
+     inputs = 1000.times.collect { rand_alpha(Kernel.rand(50)) }
27
+     inputs.each { |ea| b.add(ea) }
28
+     inputs.each { |ea| b.include?(ea).should be_true }
29
+     # the filter must agree with the input list on fresh random strings
30
+     5000.times.each do
31
+       s = rand_alpha(Kernel.rand(50))
32
+       b.include?(s).should == inputs.include?(s)
33
+     end
34
+   end
35
+
36
+   it "should marshal state correctly" do
37
+     b = Bloomer.new(10, 0.001)
38
+     # %w builds an Array of words; %q would build the single String "a b c d"
39
+     inputs = %w(a b c d)
40
+     inputs.each { |ea| b.add(ea) }
41
+     s = Marshal.dump(b)
42
+     new_b = Marshal.load(s)
43
+     inputs.each { |ea| new_b.should include(ea) }
44
+   end
45
+ end
@@ -0,0 +1,7 @@
1
+ require 'rspec'
2
+
3
+ # Shared RSpec settings: colored output in the documentation format.
4
+ RSpec.configure do |c|
5
+   c.color_enabled = true
6
+   c.formatter = 'documentation'
7
+ end
metadata ADDED
@@ -0,0 +1,90 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bloomer
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Matthew McEachen
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-01-18 00:00:00 -08:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 3
29
+ segments:
30
+ - 0
31
+ version: "0"
32
+ type: :runtime
33
+ name: bitarray
34
+ version_requirements: *id001
35
+ description: Pure-ruby bloom filter with minimal dependencies
36
+ email:
37
+ - matthew+github@mceachen.org
38
+ executables: []
39
+
40
+ extensions: []
41
+
42
+ extra_rdoc_files: []
43
+
44
+ files:
45
+ - .gitignore
46
+ - Gemfile
47
+ - MIT-LICENSE
48
+ - README.md
49
+ - Rakefile
50
+ - bloomer.gemspec
51
+ - lib/bloomer.rb
52
+ - spec/bloomer_spec.rb
53
+ - spec/spec_helper.rb
54
+ has_rdoc: true
55
+ homepage: ""
56
+ licenses: []
57
+
58
+ post_install_message:
59
+ rdoc_options: []
60
+
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ hash: 3
69
+ segments:
70
+ - 0
71
+ version: "0"
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ hash: 3
78
+ segments:
79
+ - 0
80
+ version: "0"
81
+ requirements: []
82
+
83
+ rubyforge_project: bloomer
84
+ rubygems_version: 1.6.2
85
+ signing_key:
86
+ specification_version: 3
87
+ summary: Pure-ruby bloom filter with minimal dependencies
88
+ test_files:
89
+ - spec/bloomer_spec.rb
90
+ - spec/spec_helper.rb