qbloom_filter 0.1.0 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +20 -4
- data/bin/console +1 -1
- data/lib/qbloom_filter.rb +115 -0
- data/lib/{bloom_filter → qbloom_filter}/version.rb +1 -1
- data/{bloom_filter.gemspec → qbloom_filter.gemspec} +5 -5
- metadata +10 -10
- data/lib/bloom_filter.rb +0 -63
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d3107815ff8ebf8aa327a4f2cd2cfb80d9f0aefbffd84e428adcfc504295d42c
|
4
|
+
data.tar.gz: fb29b08c1f68f5f50a06dfbc3b395c43ff9be36fbe0e2741cecf47c7ccf75f1e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2e6ded653a4c765234dd4f38e874efb50de33952fa7faef0ae9c9f97ad0142eabe5bae0c63029f621c9224f456a5bc04d7de12839d28f2748539f7b0d7d53bab
|
7
|
+
data.tar.gz: 40670964f95d373b7c4d413ae60ad5369760a34bcf511195b4e31a38fbc46782e76f8b8b9745607c010dfb32330e9803388d8e08c7f5ec95fc7ec30ffd514631
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -7,7 +7,7 @@ A Bloom filter is a space-efficient probabilistic data structure
|
|
7
7
|
Add this line to your application's Gemfile:
|
8
8
|
|
9
9
|
```ruby
|
10
|
-
gem '
|
10
|
+
gem 'qbloom_filter'
|
11
11
|
```
|
12
12
|
|
13
13
|
And then execute:
|
@@ -16,7 +16,7 @@ And then execute:
|
|
16
16
|
|
17
17
|
Or install it yourself as:
|
18
18
|
|
19
|
-
$ gem install
|
19
|
+
$ gem install qbloom_filter
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
@@ -30,7 +30,7 @@ And two parameters can be used to describe the bloom filter:
|
|
30
30
|
bloom_filter = BloomFilter::Filter.new(1000, 0.001)
|
31
31
|
```
|
32
32
|
|
33
|
-
####
|
33
|
+
#### Methods
|
34
34
|
__add(value)__ - add item into filter
|
35
35
|
|
36
36
|
__includes?(value)__ - check if filter includes the value
|
@@ -39,7 +39,23 @@ __contains?(value)__ - alias of __includes?(value)__
|
|
39
39
|
|
40
40
|
__count__ - returns number of inserted items
|
41
41
|
|
42
|
+
__capacity__ - returns initial capacity
|
43
|
+
|
44
|
+
__probability__ - returns initial probability
|
45
|
+
|
46
|
+
__bit_size__ - returns number of bits in the bit array
|
47
|
+
|
48
|
+
__get_bit(position)__ - returns the value of a bit (true/false) in the bit array; raises an error if the position is out of range of the bit array
|
49
|
+
|
50
|
+
__set_bit(position)__ - sets a bit to TRUE in the bit array; raises an error if the position is out of range of the bit array
|
51
|
+
|
52
|
+
__clear_bit(position)__ - sets a bit to FALSE in the bit array; raises an error if the position is out of range of the bit array
|
53
|
+
|
54
|
+
__union_with(bloom_filter)__ - unions current bloom filter with another one, bloom filters should be the instances of this module and have the same initial params(capacity, probability)
|
55
|
+
|
56
|
+
__intersect_with(bloom_filter)__ - intersects current bloom filter with another one, bloom filters should be the instances of this module and have the same initial params(capacity, probability)
|
57
|
+
|
42
58
|
## Contributing
|
43
59
|
|
44
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/superedriver/
|
60
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/superedriver/qbloom_filter
|
45
61
|
|
data/bin/console
CHANGED
@@ -0,0 +1,115 @@
|
|
1
|
+
require "bloom_filter/version"
|
2
|
+
require "bitset"
|
3
|
+
require 'digest/md5'
|
4
|
+
|
5
|
+
# Bloom filter: a space-efficient probabilistic set. Membership tests may
# report false positives but never false negatives for values that were added.
#
# NOTE(review): this file requires "bloom_filter/version" while the gemspec
# loads 'lib/qbloom_filter/version' - confirm the require path after the rename.
module BloomFilter
  # Modulus used by the hash family ((a * x + b) % PRIME) % bit_size.
  PRIME = 100_000_000_003
  # Not referenced inside this module; kept because it is a public constant.
  MAX_HASH_PARAM = 1000
  # Message raised when a bit position lies outside the bit array.
  OUT_OF_RANGE = "Position is out of range".freeze
  # Message raised when two filters cannot be combined.
  DIFFERENT_INITIAL_PARAMS = "Bloom filters have different initial params".freeze

  class Filter
    attr_reader :count, :capacity, :probability

    # @param capacity [Numeric] expected number of inserted items (must be > 0)
    # @param probability [Numeric] target false-positive probability,
    #   strictly between 0 and 1
    # @raise [ArgumentError] if either argument is outside its valid range
    def initialize(capacity = 100, probability = 0.01)
      raise ArgumentError, "capacity must be positive" unless capacity > 0
      unless probability > 0 && probability < 1
        raise ArgumentError, "probability must be between 0 and 1 (exclusive)"
      end

      # Number of calls to #add that set at least one new bit.
      @count = 0

      # Initial params; used to check compatibility with other filters.
      @capacity = capacity
      @probability = probability

      # Number of bits in the array: m = -n * ln(p) / (ln 2)^2
      @m = (-(capacity * Math.log(probability)) / (Math.log(2)**2)).ceil

      @bitset = Bitset.new(@m)

      # Number of hash functions: k = (m / n) * ln 2.
      # Float division is required: with integer division m / n truncates to 0
      # whenever m < n, which yields k == 0 and a filter that matches everything.
      @k = (Math.log(2) * (@m.to_f / capacity)).ceil
    end

    # Adds +value+ to the filter.
    #
    # +count+ is incremented only when at least one bit was newly set, so
    # re-adding a value (or adding one whose bits are all already set) does
    # not change it.
    #
    # @param value [#to_s] item to insert
    # @return [Object] the value that was passed in
    def add(value)
      x = get_hash(value)
      newly_set = false
      @k.times do |i|
        a, b = get_hash_params(i)
        position = get_position(a, b, x)
        newly_set = true unless get_bit(position)
        set_bit(position)
      end
      @count += 1 if newly_set
      value
    end

    # Checks whether +value+ may have been added.
    #
    # @param value [#to_s] item to look up
    # @return [Boolean] false if the value was definitely never added,
    #   true if every probed bit is set
    def contains?(value)
      x = get_hash(value)
      # all? stops at the first clear bit.
      (0...@k).all? do |i|
        a, b = get_hash_params(i)
        get_bit(get_position(a, b, x))
      end
    end
    alias :includes? :contains?

    # @return [Integer] number of bits in the bit array
    def bit_size
      @m
    end

    # @param position [Integer] zero-based bit index
    # @return the stored bit at +position+
    # @raise [RuntimeError] if +position+ is outside 0...bit_size
    def get_bit(position)
      valid_position?(position)
      @bitset[position]
    end

    # Sets the bit at +position+ to true.
    #
    # @param position [Integer] zero-based bit index
    # @raise [RuntimeError] if +position+ is outside 0...bit_size
    def set_bit(position)
      valid_position?(position)
      @bitset[position] = true
    end

    # Sets the bit at +position+ to false.
    #
    # @param position [Integer] zero-based bit index
    # @raise [RuntimeError] if +position+ is outside 0...bit_size
    def clear_bit(position)
      valid_position?(position)
      @bitset[position] = false
    end

    # Merges +bloom_filter+ into this filter in place (bitwise OR).
    # Does not update +count+.
    #
    # @param bloom_filter [Filter] filter with the same capacity and probability
    # @raise [RuntimeError] if the filters have different class or params
    def union_with(bloom_filter)
      same_params?(bloom_filter)

      @m.times do |i|
        @bitset[i] = get_bit(i) || bloom_filter.get_bit(i)
      end
    end

    # Intersects this filter with +bloom_filter+ in place (bitwise AND).
    # Does not update +count+.
    #
    # @param bloom_filter [Filter] filter with the same capacity and probability
    # @raise [RuntimeError] if the filters have different class or params
    def intersect_with(bloom_filter)
      same_params?(bloom_filter)

      @m.times do |i|
        @bitset[i] = get_bit(i) && bloom_filter.get_bit(i)
      end
    end

    private

    # Maps the hashed value to a bit index using the (a, b) hash parameters.
    def get_position(a, b, val)
      ((a * val + b) % PRIME) % @m
    end

    # Integer form of the MD5 hex digest of value.to_s.
    def get_hash(value)
      Digest::MD5.hexdigest(value.to_s).to_i(16)
    end

    # Raises unless +position+ is inside 0...@m; negative positions are
    # rejected as well as positions past the end.
    def valid_position?(position)
      raise OUT_OF_RANGE if position < 0 || position >= @m
      true
    end

    # Raises unless +bf+ has the same class, capacity and probability.
    # The class check comes first so the readers are never called on a
    # foreign object.
    def same_params?(bf)
      compatible = bf.class == self.class &&
                   bf.capacity == @capacity &&
                   bf.probability == @probability
      raise DIFFERENT_INITIAL_PARAMS unless compatible
      true
    end

    # Deterministic (a, b) pair for the i-th hash function, so that filters
    # built with the same params probe the same positions.
    def get_hash_params(i)
      [2 * i + 1, 2 * i + 2]
    end
  end
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require_relative 'lib/
|
1
|
+
require_relative 'lib/qbloom_filter/version'
|
2
2
|
|
3
3
|
Gem::Specification.new do |spec|
|
4
4
|
spec.name = "qbloom_filter"
|
@@ -8,15 +8,15 @@ Gem::Specification.new do |spec|
|
|
8
8
|
|
9
9
|
spec.licenses = ['MIT']
|
10
10
|
spec.summary = %q{Bloom Filter}
|
11
|
-
spec.description = %q{
|
12
|
-
spec.homepage = "https://github.com/superedriver/
|
11
|
+
spec.description = %q{Bloom Filter with union and intersection}
|
12
|
+
spec.homepage = "https://github.com/superedriver/qbloom_filter"
|
13
13
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
14
14
|
|
15
15
|
spec.metadata["allowed_push_host"] = "https://rubygems.org"
|
16
16
|
|
17
17
|
spec.metadata["homepage_uri"] = spec.homepage
|
18
|
-
spec.metadata["source_code_uri"] = "https://github.com/superedriver/
|
19
|
-
spec.metadata["changelog_uri"] = "https://github.com/superedriver/
|
18
|
+
spec.metadata["source_code_uri"] = "https://github.com/superedriver/qbloom_filter"
|
19
|
+
spec.metadata["changelog_uri"] = "https://github.com/superedriver/qbloom_filter"
|
20
20
|
|
21
21
|
# Specify which files should be added to the gem when it is released.
|
22
22
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: qbloom_filter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- qaz
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description:
|
13
|
+
description: Bloom Filter with union and intersection
|
14
14
|
email:
|
15
15
|
- qaz@qaz.qaz
|
16
16
|
executables: []
|
@@ -30,17 +30,17 @@ files:
|
|
30
30
|
- bin/rake
|
31
31
|
- bin/rspec
|
32
32
|
- bin/setup
|
33
|
-
-
|
34
|
-
- lib/
|
35
|
-
-
|
36
|
-
homepage: https://github.com/superedriver/
|
33
|
+
- lib/qbloom_filter.rb
|
34
|
+
- lib/qbloom_filter/version.rb
|
35
|
+
- qbloom_filter.gemspec
|
36
|
+
homepage: https://github.com/superedriver/qbloom_filter
|
37
37
|
licenses:
|
38
38
|
- MIT
|
39
39
|
metadata:
|
40
40
|
allowed_push_host: https://rubygems.org
|
41
|
-
homepage_uri: https://github.com/superedriver/
|
42
|
-
source_code_uri: https://github.com/superedriver/
|
43
|
-
changelog_uri: https://github.com/superedriver/
|
41
|
+
homepage_uri: https://github.com/superedriver/qbloom_filter
|
42
|
+
source_code_uri: https://github.com/superedriver/qbloom_filter
|
43
|
+
changelog_uri: https://github.com/superedriver/qbloom_filter
|
44
44
|
post_install_message:
|
45
45
|
rdoc_options: []
|
46
46
|
require_paths:
|
data/lib/bloom_filter.rb
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
require "bloom_filter/version"
|
2
|
-
require "bitset"
|
3
|
-
require 'digest/md5'
|
4
|
-
|
5
|
-
module BloomFilter
|
6
|
-
PRIME = 100_000_000_003
|
7
|
-
MAX_HASH_PARAM = 1000
|
8
|
-
class Filter
|
9
|
-
attr_reader :count
|
10
|
-
|
11
|
-
def initialize(capacity = 100, probability = 0.01)
|
12
|
-
# amount of inserted elements
|
13
|
-
@count = 0
|
14
|
-
|
15
|
-
#number of bits in the array
|
16
|
-
@m = (-(capacity * Math.log(probability)) / (Math.log(2) ** 2)).ceil
|
17
|
-
|
18
|
-
@bitset = Bitset.new(@m)
|
19
|
-
|
20
|
-
#number of hash functions that minimizes the probability of false positives
|
21
|
-
@k = (Math.log(2) * (@m / capacity)).ceil
|
22
|
-
|
23
|
-
# a, b params for hash functions
|
24
|
-
@hash_params = []
|
25
|
-
@k.times { @hash_params.push([rand(1000), rand(1000)]) }
|
26
|
-
end
|
27
|
-
|
28
|
-
def add(value)
|
29
|
-
x = get_hash(value)
|
30
|
-
was_inserted = true
|
31
|
-
@k.times do |i|
|
32
|
-
a, b = @hash_params[i]
|
33
|
-
position = get_position(a, b, x)
|
34
|
-
was_inserted = false unless @bitset[position]
|
35
|
-
@bitset[position] = true
|
36
|
-
end
|
37
|
-
@count += 1 unless was_inserted
|
38
|
-
value
|
39
|
-
end
|
40
|
-
|
41
|
-
def contains?(value)
|
42
|
-
x = get_hash(value)
|
43
|
-
result = true
|
44
|
-
@k.times do |i|
|
45
|
-
a, b = @hash_params[i]
|
46
|
-
result = false unless @bitset[get_position(a, b, x)]
|
47
|
-
end
|
48
|
-
|
49
|
-
result
|
50
|
-
end
|
51
|
-
alias :includes? :contains?
|
52
|
-
|
53
|
-
private
|
54
|
-
|
55
|
-
def get_position(a, b, val)
|
56
|
-
((a * val + b) % PRIME) % @m
|
57
|
-
end
|
58
|
-
|
59
|
-
def get_hash(value)
|
60
|
-
Digest::MD5.hexdigest(value.to_s).to_i(16)
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end
|