aho_corasick_matcher 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +38 -0
- data/lib/aho_corasick_matcher.rb +95 -0
- data/spec/aho_corasick_matcher_spec.rb +81 -0
- data/spec/spec_helper.rb +14 -0
- metadata +66 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cc70023665fc36d0b86508ef0d7c3e330beda705
|
4
|
+
data.tar.gz: e22d0d38e7305f540a6aa3a6f07ca259931eba3a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 62ba80e800c76fd5bc0db9157746413659b9141f63e17e5d0d7b5c7a6736a13dee2e4aee20adcd41cbc8f57257fbd87646a3f6b93620ef0d2c27010d0548a8c3
|
7
|
+
data.tar.gz: 693edb06c4dfd18e8bcbeac19c723bfcf5ec75eaa752388e0cbb55a306f2cb1206581ca151067252646ac3c8c59eb19bf722678c700658703ab6f3b3a1e5b9f2
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Altmetric LLP
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# AhoCorasickMatcher [Build Status](https://travis-ci.org/altmetric/aho_corasick_matcher)
|
2
|
+
|
3
|
+
A Ruby gem for finding strings in text using the [Aho-Corasick string matching search](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.96.4671&rep=rep1&type=pdf).
|
4
|
+
|
5
|
+
Aho-Corasick is `O(n + m)` where `n` is the size of the string to be searched
|
6
|
+
and `m` is the size of the dictionary. This means it's particularly suited for
|
7
|
+
searching for occurrences of words using large dictionaries, as the runtime
|
8
|
+
increases only linearly.
|
9
|
+
|
10
|
+
It's quite memory-intensive, and building a matcher is expensive – but once it's
|
11
|
+
been built, matching terms is very fast.
|
12
|
+
|
13
|
+
**Current version:** 1.0.0
|
14
|
+
**Supported Ruby versions:** 1.9.2, 1.9.3, 2.0, 2.1, 2.2, jruby-1.7, rbx-2.2, rbx-2.4
|
15
|
+
|
16
|
+
## Usage
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
require 'aho_corasick_matcher'
|
20
|
+
|
21
|
+
matcher = AhoCorasickMatcher.new(['a', 'b', 'ab'])
|
22
|
+
matcher.match('aba')
|
23
|
+
#=> ['a', 'ab', 'b', 'a']
|
24
|
+
|
25
|
+
matcher = AhoCorasickMatcher.new(["thistle", "sift", "thistles"])
|
26
|
+
matcher.match("Theophilus thistle, the successful thistle sifter, in sifting a sieve full of un-sifted thistles, thrust three thousand thistles through the thick of his thumb.")
|
27
|
+
#=> ["thistle", "thistle", "sift", "sift", "sift", "thistle", "thistles", "thistle", "thistles"]
|
28
|
+
```
|
29
|
+
|
30
|
+
## Thanks
|
31
|
+
|
32
|
+
Loosely based on Tim Cowlishaw's implementation of the same algorithm https://github.com/timcowlishaw/aho_corasick
|
33
|
+
|
34
|
+
## License
|
35
|
+
|
36
|
+
Copyright © 2015 Altmetric LLP
|
37
|
+
|
38
|
+
Distributed under the MIT License.
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
# Finds every occurrence of any dictionary entry inside a string using an
# Aho-Corasick automaton: a trie of the dictionary entries whose nodes carry
# "suffix" (failure) links to the longest proper suffix that is also a path
# in the trie.
#
#   matcher = AhoCorasickMatcher.new(['a', 'b', 'ab'])
#   matcher.match('aba') #=> ['a', 'ab', 'b', 'a']
class AhoCorasickMatcher
  attr_reader :root
  private :root

  # Builds the automaton for the given dictionary.
  #
  # @param dictionary [Enumerable<String>] the strings to search for
  def initialize(dictionary)
    @root = Node.new

    build_trie(dictionary)
    build_suffix_map
  end

  # Scans +string+ one character at a time and collects every dictionary
  # entry that ends at each position. For a single position, the entry that
  # ends at the reached trie node comes first, followed by the entries
  # inherited through its suffix links.
  #
  # @param string [String] the text to search
  # @return [Array<String>] all matches, in the order they were found
  def match(string)
    matches = []
    node = root

    string.each_char do |char|
      child = node.search(char.intern)

      if child
        # concat avoids splatting a potentially large array into an
        # argument list.
        matches.concat(child.matches)
        node = child
      else
        # No transition anywhere on the suffix chain: restart from the root.
        node = root
      end
    end

    matches
  end

  private

  # Inserts every dictionary entry into the trie, recording the entry on the
  # node reached by its last character.
  def build_trie(dictionary)
    dictionary.each do |string|
      node = root
      string.each_char { |char| node = node.child_or_create(char.intern) }
      node.matches << string
    end
  end

  # Assigns suffix links breadth-first, so the suffix target of a node (which
  # is always shallower) is complete before the node itself is processed.
  def build_suffix_map
    # Node#children returns a fresh array, so it is safe to use as the queue.
    pending = root.children
    pending.each { |child| child.suffix = root }

    until pending.empty?
      node = pending.shift
      pending.concat(node.children)
      node.build_child_suffixes
    end
  end

  # A single state of the automaton.
  class Node
    attr_reader :matches, :child_map, :parent
    attr_accessor :suffix

    # @param parent [Node, nil] the node this one hangs off; nil for the root
    def initialize(parent = nil)
      @matches = []
      @child_map = {}
      @parent = parent
    end

    # Follows the transition for +char+, falling back along the suffix links
    # until some node has a child for it.
    #
    # Implemented as a loop rather than recursion so that long suffix chains
    # (produced by long dictionary entries) cannot exhaust the call stack.
    #
    # @param char [Symbol] the interned character
    # @return [Node, nil] the next state, or nil when no node on the chain
    #   has a child for +char+
    def search(char)
      node = self

      while node
        child = node.child_map[char]
        return child if child

        node = node.suffix
      end

      nil
    end

    # Returns the child for +char+, creating it when absent.
    def child_or_create(char)
      @child_map[char] ||= self.class.new(self)
    end

    # @return [Array<Node>] a new array holding the direct children
    def children
      @child_map.values
    end

    # A node without a parent is the root.
    def root?
      parent.nil?
    end

    # Sets the suffix link of every direct child and appends the matches
    # reachable through that link to the child's own matches. Expects this
    # node's own suffix link to be set already.
    def build_child_suffixes
      child_map.each do |char, child|
        child_suffix = suffix.search(char)

        if child_suffix
          child.suffix = child_suffix
          child.matches.concat(child_suffix.matches)
        else
          # Nothing on the suffix chain continues with +char+, so
          # find_failure_node walks up to the root.
          child.suffix = find_failure_node(char)
        end
      end
    end

    # Returns this node's suffix when +char+ can be followed from it (directly
    # or through further suffix links); otherwise the root reached by walking
    # the suffix links upwards.
    #
    # The chain is searched only once: if that search fails, no node further
    # up the chain can succeed either, so the walk goes straight to the root.
    def find_failure_node(char)
      failure = suffix
      return failure if failure.search(char)

      failure = failure.suffix until failure.root?
      failure
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'aho_corasick_matcher'
|
2
|
+
|
3
|
+
# Behavioural examples for AhoCorasickMatcher#match. Each context supplies its
# own +dict+, from which the matcher under test is built.
RSpec.describe AhoCorasickMatcher do
  subject(:matcher) { described_class.new(dict) }

  # Runs the matcher for the current context over +text+.
  def matches_in(text)
    matcher.match(text)
  end

  context 'with an empty dictionary' do
    let(:dict) { [] }

    it 'finds no strings' do
      expect(matches_in('I am a test string')).to be_empty
    end
  end

  context 'with a single-entry dictionary' do
    let(:dict) { ['TestString'] }

    it 'finds matching strings' do
      expect(matches_in('I am a TestString')).to eq(['TestString'])
    end

    it 'does not find non-matching strings' do
      expect(matches_in('I am a different string')).to be_empty
    end

    it 'finds all occurrences of strings' do
      found = matches_in('I am a TestString and I say TestString twice')

      expect(found).to eq(['TestString', 'TestString'])
    end
  end

  context 'with a multiple-entry dictionary' do
    let(:dict) { ['TestString1', 'TestString2'] }

    it 'finds all matching strings' do
      found = matches_in('I am both a TestString1 and a TestString2')

      expect(found).to eq(['TestString1', 'TestString2'])
    end

    it 'finds partial matching strings' do
      found = matches_in('I am a TestString1 but do not contain the other one')

      expect(found).to eq(['TestString1'])
    end

    it 'does not find non-matching strings' do
      expect(matches_in('I am a different string')).to be_empty
    end
  end

  context 'with a multiple-entry dictionary including prefixes' do
    let(:dict) { ['TestString', 'TestStringExtended'] }

    it 'finds all matching strings' do
      found = matches_in('I contain TestStringExtended')

      expect(found).to eq(['TestString', 'TestStringExtended'])
    end

    it 'finds partial matching strings' do
      found = matches_in('I am a TestString but do not contain the other one')

      expect(found).to eq(['TestString'])
    end

    it 'does not find non-matching strings' do
      expect(matches_in('I am a different string')).to be_empty
    end
  end

  context 'with an overlapping dictionary' do
    let(:dict) { ['Test', 'String', 'TestString'] }

    it 'finds all matching strings' do
      found = matches_in('TestStringTest')

      # Entries are reported as their last character is reached, so the
      # overlapping 'TestString' and 'String' both appear at the same point.
      expect(found).to eq(['Test', 'TestString', 'String', 'Test'])
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Shared RSpec settings for the spec suite.
RSpec.configure do |rspec|
  # Run only examples tagged :focus, or everything when none are tagged.
  rspec.filter_run(:focus)
  rspec.run_all_when_everything_filtered = true

  rspec.disable_monkey_patching!
  rspec.warnings = true

  # Randomise example order and seed Kernel's RNG with the same seed RSpec
  # uses.
  rspec.order = :random
  Kernel.srand(rspec.seed)

  # Switch to the 'doc' formatter when exactly one spec file is run.
  if rspec.files_to_run.one?
    rspec.default_formatter = 'doc'
  end

  rspec.expect_with(:rspec) do |expect_config|
    expect_config.include_chain_clauses_in_custom_matcher_descriptions = true
  end
end
|
metadata
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: aho_corasick_matcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Matthew MacLeod
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-06-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.2'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.2'
|
27
|
+
description: |2
|
28
|
+
Uses the fast Aho-Corasick text search system to find occurrences of any of
|
29
|
+
a dictionary of strings across an input string.
|
30
|
+
email: support@altmetric.com
|
31
|
+
executables: []
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- LICENSE
|
36
|
+
- README.md
|
37
|
+
- lib/aho_corasick_matcher.rb
|
38
|
+
- spec/aho_corasick_matcher_spec.rb
|
39
|
+
- spec/spec_helper.rb
|
40
|
+
homepage: https://github.com/altmetric/aho_corasick_matcher
|
41
|
+
licenses:
|
42
|
+
- MIT
|
43
|
+
metadata: {}
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
requirements: []
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 2.4.5
|
61
|
+
signing_key:
|
62
|
+
specification_version: 4
|
63
|
+
summary: A library to search text for occurrences of a list of strings
|
64
|
+
test_files:
|
65
|
+
- spec/aho_corasick_matcher_spec.rb
|
66
|
+
- spec/spec_helper.rb
|