aho_corasick_matcher 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +38 -0
- data/lib/aho_corasick_matcher.rb +95 -0
- data/spec/aho_corasick_matcher_spec.rb +81 -0
- data/spec/spec_helper.rb +14 -0
- metadata +66 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cc70023665fc36d0b86508ef0d7c3e330beda705
|
4
|
+
data.tar.gz: e22d0d38e7305f540a6aa3a6f07ca259931eba3a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 62ba80e800c76fd5bc0db9157746413659b9141f63e17e5d0d7b5c7a6736a13dee2e4aee20adcd41cbc8f57257fbd87646a3f6b93620ef0d2c27010d0548a8c3
|
7
|
+
data.tar.gz: 693edb06c4dfd18e8bcbeac19c723bfcf5ec75eaa752388e0cbb55a306f2cb1206581ca151067252646ac3c8c59eb19bf722678c700658703ab6f3b3a1e5b9f2
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Altmetric LLP
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# AhoCorasickMatcher [![Build Status](https://travis-ci.org/altmetric/aho_corasick_matcher.svg?branch=master)](https://travis-ci.org/altmetric/aho_corasick_matcher)
|
2
|
+
|
3
|
+
A Ruby gem for finding strings in text using the [Aho-Corasick string matching search](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.96.4671&rep=rep1&type=pdf).
|
4
|
+
|
5
|
+
Aho-Corasick is `O(n + m)` where `n` is the size of the string to be searched
|
6
|
+
and `m` is the size of the dictionary. This means it's particularly suited for
|
7
|
+
searching for occurrences of words using large dictionaries, as the runtime
|
8
|
+
increases only linearly.
|
9
|
+
|
10
|
+
It's quite memory-intensive, and building a matcher is expensive – but once it's
|
11
|
+
been built, matching terms is very fast.
|
12
|
+
|
13
|
+
**Current version:** 1.0.0
|
14
|
+
**Supported Ruby versions:** 1.9.2, 1.9.3, 2.0, 2.1, 2.2, jruby-1.7, rbx-2.2, rbx-2.4
|
15
|
+
|
16
|
+
## Usage
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
require 'aho_corasick_matcher'
|
20
|
+
|
21
|
+
matcher = AhoCorasickMatcher.new(['a', 'b', 'ab'])
|
22
|
+
matcher.match('aba')
|
23
|
+
#=> ['a', 'ab', 'b', 'a']
|
24
|
+
|
25
|
+
matcher = AhoCorasickMatcher.new(["thistle", "sift", "thistles"])
|
26
|
+
matcher.match("Theophilus thistle, the successful thistle sifter, in sifting a sieve full of un-sifted thistles, thrust three thousand thistles through the thick of his thumb.")
|
27
|
+
#=> ["thistle", "thistle", "sift", "sift", "sift", "thistle", "thistles", "thistle", "thistles"]
|
28
|
+
```
|
29
|
+
|
30
|
+
## Thanks
|
31
|
+
|
32
|
+
Loosely based on Tim Cowlishaw's implementation of the same algorithm https://github.com/timcowlishaw/aho_corasick
|
33
|
+
|
34
|
+
## License
|
35
|
+
|
36
|
+
Copyright © 2015 Altmetric LLP
|
37
|
+
|
38
|
+
Distributed under the MIT License.
|
data/lib/aho_corasick_matcher.rb
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
# Finds every occurrence of a fixed dictionary of strings inside a text using
# the Aho-Corasick automaton: a trie of the dictionary entries in which every
# node also carries a "suffix" (failure) link to the node spelling the longest
# proper suffix of its own path that is still a path in the trie.
#
#   matcher = AhoCorasickMatcher.new(['a', 'b', 'ab'])
#   matcher.match('aba') #=> ['a', 'ab', 'b', 'a']
class AhoCorasickMatcher
  attr_reader :root
  private :root

  # Builds the automaton.
  #
  # @param dictionary [Enumerable<String>] strings to search for. Empty
  #   strings are ignored, and an entry listed more than once is reported only
  #   once per occurrence in the text.
  def initialize(dictionary)
    @root = Node.new

    build_trie(dictionary)
    build_suffix_map
  end

  # Scans +string+ once, character by character.
  #
  # @param string [String] the text to search
  # @return [Array<String>] every dictionary entry found, ordered by the
  #   position at which the occurrence ends; entries ending at the same
  #   position are listed longest first. Overlapping occurrences are included.
  def match(string)
    matches = []
    node = root

    string.each_char do |char|
      found = node.search(char)
      # concat instead of push(*array): no splat, so the size of a match list
      # is never limited by how many arguments a call can take.
      matches.concat(found.matches) if found
      # No transition at all (not even from the root): restart at the root.
      node = found || root
    end

    matches
  end

  private

  # Inserts every dictionary entry into the trie and records the entry on the
  # node reached by its last character.
  def build_trie(dictionary)
    dictionary.each do |string|
      # An empty entry would be recorded on the root, which is never reported.
      next if string.empty?

      # Characters are used as hash keys directly rather than interned, so
      # building and matching never create symbols from arbitrary text.
      terminal = string.each_char.reduce(root) do |node, char|
        node.child_or_create(char)
      end

      # Guard against duplicated dictionary entries being reported twice.
      terminal.matches << string unless terminal.matches.include?(string)
    end
  end

  # Assigns suffix links breadth-first, so that the suffix link (and merged
  # match list) of every shallower node is final before a deeper node uses it.
  def build_suffix_map
    queue = []

    root.children.each do |child|
      child.suffix = root
      queue << child
    end

    until queue.empty?
      node = queue.shift
      node.build_child_suffixes
      queue.concat(node.children)
    end
  end

  # A single state of the automaton.
  class Node
    attr_reader :matches, :child_map, :parent
    attr_accessor :suffix

    # @param parent [Node, nil] the node this one hangs off; nil for the root
    def initialize(parent = nil)
      @matches = []
      @child_map = {}
      @parent = parent
      @suffix = nil
    end

    # Follows the transition for +char+, falling back along the suffix links
    # until some node has a child for it.
    #
    # Implemented as a loop rather than recursion so that a long suffix chain
    # (one hop per character of a long dictionary entry) cannot exhaust the
    # call stack.
    #
    # @return [Node, nil] the node reached, or nil when no node on the chain
    #   has a child for +char+
    def search(char)
      node = self

      while node
        child = node.child_map[char]
        return child if child

        node = node.suffix
      end

      nil
    end

    # @return [Node] the existing child for +char+, created if missing
    def child_or_create(char)
      @child_map[char] ||= self.class.new(self)
    end

    # @return [Array<Node>] the direct children of this node
    def children
      @child_map.values
    end

    # @return [Boolean] true for the node that has no parent
    def root?
      parent.nil?
    end

    # Sets the suffix link of each direct child and merges the matches of the
    # linked node into the child, so a child reports its own entry followed by
    # every dictionary entry that is a proper suffix of its path.
    #
    # Requires this node's own suffix link to be set already, so it must not
    # be called on the root.
    def build_child_suffixes
      child_map.each do |char, child|
        failure = find_failure_node(char)
        child_suffix = failure.child_map[char]

        if child_suffix
          child.suffix = child_suffix
          child.matches.concat(child_suffix.matches)
        else
          # find_failure_node only stops on a node without a child for +char+
          # when that node is the root.
          child.suffix = failure
        end
      end
    end

    # Walks this node's suffix chain, inspecting each node's own children
    # only, so the walk is linear in the length of the chain.
    #
    # @return [Node] the nearest node on the chain that has a direct child
    #   for +char+, or the root when none has
    def find_failure_node(char)
      failure = suffix
      failure = failure.suffix until failure.child_map.key?(char) || failure.root?
      failure
    end
  end
end
|
data/spec/aho_corasick_matcher_spec.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'aho_corasick_matcher'
|
2
|
+
|
3
|
+
# Examples for AhoCorasickMatcher#match. Each context supplies its own
# dictionary through `dict`; the matcher under test is built from it lazily.
RSpec.describe AhoCorasickMatcher do
  subject(:matcher) { described_class.new(dict) }

  context 'with an empty dictionary' do
    let(:dict) { [] }

    it 'finds no strings' do
      found = matcher.match('I am a test string')

      expect(found).to be_empty
    end
  end

  context 'with a single-entry dictionary' do
    let(:dict) { ['TestString'] }

    it 'finds matching strings' do
      found = matcher.match('I am a TestString')

      expect(found).to eq(['TestString'])
    end

    it 'does not find non-matching strings' do
      found = matcher.match('I am a different string')

      expect(found).to be_empty
    end

    it 'finds all occurrences of strings' do
      found = matcher.match('I am a TestString and I say TestString twice')

      expect(found).to eq(['TestString', 'TestString'])
    end
  end

  context 'with a multiple-entry dictionary' do
    let(:dict) { ['TestString1', 'TestString2'] }

    it 'finds all matching strings' do
      found = matcher.match('I am both a TestString1 and a TestString2')

      expect(found).to eq(['TestString1', 'TestString2'])
    end

    it 'finds partial matching strings' do
      found = matcher.match('I am a TestString1 but do not contain the other one')

      expect(found).to eq(['TestString1'])
    end

    it 'does not find non-matching strings' do
      found = matcher.match('I am a different string')

      expect(found).to be_empty
    end
  end

  context 'with a multiple-entry dictionary including prefixes' do
    let(:dict) { ['TestString', 'TestStringExtended'] }

    it 'finds all matching strings' do
      found = matcher.match('I contain TestStringExtended')

      # The shorter entry ends first, so it is reported first.
      expect(found).to eq(['TestString', 'TestStringExtended'])
    end

    it 'finds partial matching strings' do
      found = matcher.match('I am a TestString but do not contain the other one')

      expect(found).to eq(['TestString'])
    end

    it 'does not find non-matching strings' do
      found = matcher.match('I am a different string')

      expect(found).to be_empty
    end
  end

  context 'with an overlapping dictionary' do
    let(:dict) { ['Test', 'String', 'TestString'] }

    it 'finds all matching strings' do
      found = matcher.match('TestStringTest')

      # 'TestString' and 'String' end at the same position; the longer entry
      # is expected first.
      expect(found).to eq(['Test', 'TestString', 'String', 'Test'])
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Shared RSpec settings for the gem's specs.
RSpec.configure do |config|
  # Run only examples tagged :focus, or everything when nothing is tagged.
  config.filter_run(:focus)
  config.run_all_when_everything_filtered = true

  # Use only the RSpec.describe / expect syntax; no methods added to Object.
  config.disable_monkey_patching!

  # Surface Ruby warnings raised while the specs run.
  config.warnings = true

  # Randomise example order, and seed Kernel's generator from the same seed.
  config.order = :random
  Kernel.srand(config.seed)

  # Use the verbose formatter when a single spec file is being run.
  config.default_formatter = 'doc' if config.files_to_run.one?

  config.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end
end
|
metadata
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: aho_corasick_matcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Matthew MacLeod
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-06-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.2'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.2'
|
27
|
+
description: |2
|
28
|
+
Uses the fast Aho-Corasick text search system to find occurrences of any of
|
29
|
+
a dictionary of strings across an input string.
|
30
|
+
email: support@altmetric.com
|
31
|
+
executables: []
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- LICENSE
|
36
|
+
- README.md
|
37
|
+
- lib/aho_corasick_matcher.rb
|
38
|
+
- spec/aho_corasick_matcher_spec.rb
|
39
|
+
- spec/spec_helper.rb
|
40
|
+
homepage: https://github.com/altmetric/aho_corasick_matcher
|
41
|
+
licenses:
|
42
|
+
- MIT
|
43
|
+
metadata: {}
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
requirements: []
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 2.4.5
|
61
|
+
signing_key:
|
62
|
+
specification_version: 4
|
63
|
+
summary: A library to search text for occurrences of a list of strings
|
64
|
+
test_files:
|
65
|
+
- spec/aho_corasick_matcher_spec.rb
|
66
|
+
- spec/spec_helper.rb
|