bloomer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Pull in the dependencies declared in bloomer.gemspec.
gemspec

# Development and test tooling.
gem "rake"
gem "yard"
gem "rspec", '~> 2.7.0'
data/MIT-LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2012 Matthew McEachen
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # Bloomer: a pure-ruby bloom filter with no extra fluff
2
+
3
+
4
+ [Bloom filters](http://en.wikipedia.org/wiki/Bloom_filter) are great for quickly checking to see if
5
+ a given string has been seen before--in constant time, and using a fixed amount of RAM.
6
+
7
+ This implementation is the Nth bloom filter gem written in ruby -- but, at the time of conception, the only one that
8
+
9
+ * uses a robust set of hashing functions
10
+ * can marshal state quickly
11
+ * does not require EM or Redis or something else unrelated to simply implementing a bloom filter
12
+
13
+ ## Usage
14
+
15
+ ```ruby
16
+ expected_size = 10_000
17
+ false_positive_probability = 0.01
18
+ b = Bloomer.new(expected_size, false_positive_probability)
19
+ b.add "cat"
20
+ b.include? "cat"
21
+ #=> true
22
+ b.include? "dog"
23
+ #=> false
24
+ ```
25
+
26
+ Serialization is through [Marshal](http://ruby-doc.org/core-1.8.7/Marshal.html):
27
+
28
+ ```ruby
29
+ b = Bloomer.new(10)
30
+ b.add("a")
31
+ s = Marshal.dump(b)
32
+ new_b = Marshal.load(s)
33
+ new_b.include? "a"
34
+ #=> true
35
+ ```
36
+
37
+ ## History
38
+
39
+ * 0.0.1 Bloom, there it is.
40
+
41
+
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
require "bundler/gem_tasks"

# API documentation task (YARD), covering the library sources and the README.
require "yard"
YARD::Rake::YardocTask.new { |yardoc| yardoc.files = %w(lib/**/*.rb README.md) }

# Spec task; it is also what a bare `rake` runs.
require "rspec/core/rake_task"
RSpec::Core::RakeTask.new(:spec)

task :default => :spec
data/bloomer.gemspec ADDED
@@ -0,0 +1,23 @@
1
# -*- encoding: utf-8 -*-
# Make lib/ loadable so the version constant can be read from the library itself.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)

require "bloomer"

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "bloomer"
  spec.version     = Bloomer::VERSION
  spec.authors     = ["Matthew McEachen"]
  spec.email       = ["matthew+github@mceachen.org"]
  spec.homepage    = ""
  spec.summary     = "Pure-ruby bloom filter with minimal dependencies"
  spec.description = "Pure-ruby bloom filter with minimal dependencies"

  spec.rubyforge_project = "bloomer"

  # Package contents are whatever git tracks.
  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_bins  = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependency providing the BitArray class used by the filter.
  spec.add_dependency "bitarray"
end
data/lib/bloomer.rb ADDED
@@ -0,0 +1,112 @@
1
+ require 'bitarray'
2
+
3
# A Bloom filter over strings, backed by a bit array.
#
# Membership answers may be false positives but are never false negatives:
# once a string has been added, +include?+ returns true for it.
class Bloomer
  VERSION = "0.0.1"

  # @param expected_size [Integer] number of distinct strings expected to be added
  #   (may be nil when both :ba and :k are supplied, as done by +_load+)
  # @param false_positive_probability [Float] target false-positive rate, used to size the bit array
  # @param opts [Hash] internal overrides:
  #   :ba => an existing bit array to reuse instead of allocating one,
  #   :k  => the number of hash functions to use instead of computing it
  def initialize(expected_size, false_positive_probability = 0.001, opts = {})
    @ba = opts[:ba] || begin
      # m is the required number of bits in the array
      m = -(expected_size * Math.log(false_positive_probability)) / (Math.log(2) ** 2)
      BitArray.new(m.round)
    end

    # k is the number of hash functions that minimizes the probability of false positives.
    # The bits-per-element ratio must be computed in floating point: integer division
    # truncates it and yields too few hash functions. At least one hash function is
    # always used, otherwise every lookup would trivially report a match.
    k = opts[:k] || [Math.log(2) * (@ba.size.to_f / expected_size), 1].max
    @hashes = Hashes.build(k.round)
  end

  # Records +string+ in the filter.
  #
  # @param string [String]
  # @return [Array<Integer>] the bit positions that were set
  def add(string)
    indicies(string).each { |ea| @ba[ea] = 1 }
  end

  # @param string [String]
  # @return [Boolean] false if +string+ was definitely never added; true if it
  #   was added (or is a false positive)
  def include?(string)
    indicies(string).all? { |ea| @ba[ea] != 0 }
  end

  # Marshal hook: serializes as "<number of hashes>\n<marshaled bit array>".
  def _dump(depth)
    [@hashes.size, Marshal.dump(@ba)].join("\n")
  end

  # Marshal hook: rebuilds a filter from the string produced by +_dump+.
  def self._load(data)
    # Split only on the first newline; the marshaled bit array may itself contain newlines.
    k, ba = data.split("\n", 2)
    new(nil, nil, :k => k.to_i, :ba => Marshal.load(ba))
  end

  private

  # Bit positions for +string+, one per configured hash function.
  def indicies(string)
    @hashes.collect do |h|
      h.call(string) % @ba.size
    end
  end

  # Array that can hand out its elements in an endless cycle.
  class CircularQueue < Array
    # Moves the first element to the back and returns it.
    def rot!
      first = shift
      push(first)
      first
    end
  end

  # Factory for the hash functions used by the filter. Each function is a
  # lambda mapping a string to an Integer in 0...MAX.
  class Hashes
    # Seeds used to derive additional variants of the base hash functions.
    PRIMES = [3571, 4219, 4447, 5167, 5419, 6211, 7057, 7351, 8269, 9241, 10267, 11719, 12097, 13267, 13669, 16651, 19441, 19927, 22447, 23497, 24571, 25117, 26227, 27361, 33391, 35317].freeze

    # Builds +number_of_hashes+ hash lambdas.
    #
    # The first five are the base functions with their default seeds; further
    # ones are the same five functions re-seeded with successive PRIMES. The
    # sequence is deterministic, so the same count always yields the same list.
    #
    # @param number_of_hashes [Integer]
    # @return [Array<Proc>]
    def self.build(number_of_hashes)
      hashes = [djb_hash, js_hash, rs_hash, knr_hash, ruby_hash]
      primes = CircularQueue.new(PRIMES) # copy, so PRIMES itself is never rotated
      while number_of_hashes > hashes.size
        hashes += [:djb_hash, :js_hash, :rs_hash, :knr_hash, :ruby_hash].collect do |ea|
          send(ea, primes.rot!, primes.rot!)
        end
      end
      hashes.first(number_of_hashes)
    end

    # Every hash value is reduced modulo this bound.
    MAX = 2**31 - 1

    # written by Professor Daniel J. Bernstein from comp.lang.c
    # (+b+ is accepted for a uniform signature but unused)
    def self.djb_hash(a = 5381, b = nil)
      lambda do |data|
        data.each_byte.inject(a) do |hash, ea|
          ((hash << 5) + hash + ea) % MAX
        end
      end
    end

    # bitwise hash function written by Justin Sobel
    # (+b+ is accepted for a uniform signature but unused)
    def self.js_hash(a = 1315423911, b = nil)
      lambda do |data|
        data.each_byte.inject(a) do |hash, ea|
          (hash ^ ((hash << 5) + ea + (hash >> 2))) % MAX
        end
      end
    end

    # simple hash function from Robert Sedgewick's Algorithms in C book
    def self.rs_hash(a = 63689, b = 378551)
      lambda do |data|
        # i is local to each call, so repeated calls give the same result
        i, j = a, b
        data.each_byte.inject(0) do |hash, ea|
          i = (i * j) % MAX
          (hash * i + ea) % MAX
        end
      end
    end

    # From Kernighan and Ritchie's "The C Programming Language"
    def self.knr_hash(a = 1619, b = 911)
      lambda do |data|
        data.each_byte.inject(a) do |hash, ea|
          ((hash * b) + ea) % MAX
        end
      end
    end

    # default hash (+b+ is accepted for a uniform signature but unused)
    #
    # NOTE(review): on Rubies where String#hash is seeded per process, this
    # function gives different values in different processes, so a filter
    # marshaled in one process may not match in another -- confirm before
    # relying on cross-process serialization.
    def self.ruby_hash(a = 1, b = 1)
      lambda do |data|
        (data.hash * a) % MAX
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require "spec_helper"
2
+
3
# Returns a random string of +size+ characters drawn from a-z and A-Z.
def rand_alpha(size)
  alphabet = [*'a'..'z', *'A'..'Z']
  letters = (0...size).map { alphabet[Kernel.rand(alphabet.length)] }
  letters.join
end
7
+
8
describe Bloomer do

  # Exercises add/include? on a tiny filter, including the empty string.
  it "should work trivially" do
    b = Bloomer.new(10, 0.001)
    b.add("a")
    b.should include("a")
    b.should_not include("")
    b.should_not include("b")
    b.add("b")
    b.should include("b")
    b.should_not include("")
    b.add("")
    b.should include("")
  end

  # Every added string must be found; other random strings are expected to
  # match only if they happen to equal an added one.
  it "should find random strings" do
    b = Bloomer.new(5_000, 0.001)
    inputs = 1000.times.collect { rand_alpha(Kernel.rand(50)) }
    inputs.each { |ea| b.add(ea) }
    inputs.each { |ea| b.include?(ea).should be_true }
    5000.times.each do
      s = rand_alpha(Kernel.rand(50))
      b.include?(s).should == inputs.include?(s)
    end
  end

  # A filter restored through Marshal must still contain what was added.
  it "should marshal state correctly" do
    b = Bloomer.new(10, 0.001)
    # %w builds an Array of words; %q would build one String, which has no #each.
    inputs = %w(a b c d)
    inputs.each { |ea| b.add(ea) }
    s = Marshal.dump(b)
    new_b = Marshal.load(s)
    inputs.each { |ea| new_b.should include(ea) }
  end
end
@@ -0,0 +1,6 @@
1
+ require 'rspec'
2
+
3
# Shared spec settings: colored output with the documentation formatter.
RSpec.configure do |rspec|
  rspec.color_enabled = true
  rspec.formatter = 'documentation'
end
metadata ADDED
@@ -0,0 +1,90 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bloomer
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Matthew McEachen
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-01-18 00:00:00 -08:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 3
29
+ segments:
30
+ - 0
31
+ version: "0"
32
+ type: :runtime
33
+ name: bitarray
34
+ version_requirements: *id001
35
+ description: Pure-ruby bloom filter with minimal dependencies
36
+ email:
37
+ - matthew+github@mceachen.org
38
+ executables: []
39
+
40
+ extensions: []
41
+
42
+ extra_rdoc_files: []
43
+
44
+ files:
45
+ - .gitignore
46
+ - Gemfile
47
+ - MIT-LICENSE
48
+ - README.md
49
+ - Rakefile
50
+ - bloomer.gemspec
51
+ - lib/bloomer.rb
52
+ - spec/bloomer_spec.rb
53
+ - spec/spec_helper.rb
54
+ has_rdoc: true
55
+ homepage: ""
56
+ licenses: []
57
+
58
+ post_install_message:
59
+ rdoc_options: []
60
+
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ hash: 3
69
+ segments:
70
+ - 0
71
+ version: "0"
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ hash: 3
78
+ segments:
79
+ - 0
80
+ version: "0"
81
+ requirements: []
82
+
83
+ rubyforge_project: bloomer
84
+ rubygems_version: 1.6.2
85
+ signing_key:
86
+ specification_version: 3
87
+ summary: Pure-ruby bloom filter with minimal dependencies
88
+ test_files:
89
+ - spec/bloomer_spec.rb
90
+ - spec/spec_helper.rb