aho_corasick_matcher 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cc70023665fc36d0b86508ef0d7c3e330beda705
4
+ data.tar.gz: e22d0d38e7305f540a6aa3a6f07ca259931eba3a
5
+ SHA512:
6
+ metadata.gz: 62ba80e800c76fd5bc0db9157746413659b9141f63e17e5d0d7b5c7a6736a13dee2e4aee20adcd41cbc8f57257fbd87646a3f6b93620ef0d2c27010d0548a8c3
7
+ data.tar.gz: 693edb06c4dfd18e8bcbeac19c723bfcf5ec75eaa752388e0cbb55a306f2cb1206581ca151067252646ac3c8c59eb19bf722678c700658703ab6f3b3a1e5b9f2
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Altmetric LLP
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # AhoCorasickMatcher [![Build Status](https://travis-ci.org/altmetric/aho_corasick_matcher.svg?branch=master)](https://travis-ci.org/altmetric/aho_corasick_matcher)
2
+
3
+ A Ruby gem for finding strings in text using the [Aho-Corasick string matching search](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.96.4671&rep=rep1&type=pdf).
4
+
5
+ Aho-Corasick is `O(n + m + z)` where `n` is the size of the string to be searched,
6
+ `m` is the size of the dictionary and `z` is the number of matches found. This means it's particularly suited for
7
+ searching for occurrences of words using large dictionaries, as the runtime
8
+ increases only linearly.
9
+
10
+ It's quite memory-intensive, and building a matcher is expensive – but once it's
11
+ been built, matching terms is very fast.
12
+
13
+ **Current version:** 1.0.0
14
+ **Supported Ruby versions:** 1.9.2, 1.9.3, 2.0, 2.1, 2.2, jruby-1.7, rbx-2.2, rbx-2.4
15
+
16
+ ## Usage
17
+
18
+ ```ruby
19
+ require 'aho_corasick_matcher'
20
+
21
+ matcher = AhoCorasickMatcher.new(['a', 'b', 'ab'])
22
+ matcher.match('aba')
23
+ #=> ['a', 'ab', 'b', 'a']
24
+
25
+ matcher = AhoCorasickMatcher.new(["thistle", "sift", "thistles"])
26
+ matcher.match("Theophilus thistle, the successful thistle sifter, in sifting a sieve full of un-sifted thistles, thrust three thousand thistles through the thick of his thumb.")
27
+ #=> ["thistle", "thistle", "sift", "sift", "sift", "thistle", "thistles", "thistle", "thistles"]
28
+ ```
29
+
30
+ ## Thanks
31
+
32
+ Loosely based on [Tim Cowlishaw's implementation](https://github.com/timcowlishaw/aho_corasick) of the same algorithm.
33
+
34
+ ## License
35
+
36
+ Copyright © 2015 Altmetric LLP
37
+
38
+ Distributed under the MIT License.
@@ -0,0 +1,95 @@
1
+ require 'thread'
2
+
3
# Searches a text for every occurrence of any string in a fixed dictionary.
#
# The dictionary is compiled once into a trie whose nodes carry "suffix"
# links: the node to fall back to when the current node has no child for the
# next character. Matching then makes one pass over the text.
#
# Characters are used directly as hash keys. They are deliberately not
# converted to symbols, so scanning arbitrary text never grows the symbol
# table.
class AhoCorasickMatcher
  attr_reader :root
  private :root

  # @param dictionary [#each] the strings to search for; each entry must
  #   respond to +each_char+
  def initialize(dictionary)
    @root = Node.new

    build_trie(dictionary)
    build_suffix_map
  end

  # Scans +string+ and collects every dictionary entry found in it.
  #
  # @param string [String] the text to scan
  # @return [Array<String>] the matched entries, in the order their final
  #   character is reached; entries ending at the same position are listed
  #   longest first
  def match(string)
    matches = []
    node = root

    string.each_char do |char|
      child = node.search(char)
      # Only nodes reached by consuming a character contribute matches; the
      # root's own list is never reported.
      matches.concat(child.matches) if child
      # No transition anywhere on the suffix chain: start again at the root.
      node = child || root
    end

    matches
  end

  private

  # Inserts each dictionary entry into the trie, one character per level,
  # and records the entry on the node where it ends.
  def build_trie(dictionary)
    dictionary.each do |string|
      last = string.each_char.reduce(root) do |node, char|
        node.child_or_create(char)
      end
      last.matches << string
    end
  end

  # Assigns suffix links breadth-first, so the links of every shallower node
  # are already in place when a deeper node is processed.
  def build_suffix_map
    queue = []

    root.children.each do |child|
      child.suffix = root
      queue << child
    end

    until queue.empty?
      node = queue.shift
      queue.concat(node.children)
      node.build_child_suffixes
    end
  end

  # A single trie state.
  class Node
    attr_reader :matches, :child_map, :suffix, :parent
    attr_writer :suffix

    # @param parent [Node, nil] the node this one hangs off; nil for the root
    def initialize(parent = nil)
      @matches = []
      @child_map = {}
      @parent = parent
    end

    # Finds the node to move to on +char+: this node's own child if it has
    # one, otherwise the child of the nearest node on the suffix chain that
    # has one. Implemented as a loop rather than recursion so that long
    # suffix chains cannot exhaust the stack.
    #
    # @return [Node, nil] nil when no node on the chain has a child for +char+
    def search(char)
      node = self
      while node
        child = node.child_map[char]
        return child if child

        node = node.suffix
      end
      nil
    end

    # Returns the child for +char+, creating it first if necessary.
    def child_or_create(char)
      @child_map[char] ||= self.class.new(self)
    end

    def children
      @child_map.values
    end

    def root?
      parent.nil?
    end

    # Sets the suffix link of every child of this node and appends the
    # matches of the linked node to the child's own, so that reaching the
    # child also reports every shorter entry ending at the same position.
    # This node's own suffix link must already be set.
    def build_child_suffixes
      child_map.each do |char, child|
        failure = find_failure_node(char)
        child_suffix = failure.child_map[char]

        if child_suffix
          child.suffix = child_suffix
          child.matches.concat(child_suffix.matches)
        else
          # find_failure_node only stops on a node lacking +char+ when that
          # node is the root.
          child.suffix = failure
        end
      end
    end

    # Walks this node's suffix chain and returns the first node that has a
    # direct child for +char+, or the root if none does.
    def find_failure_node(char)
      failure = suffix
      failure = failure.suffix until failure.child_map.key?(char) || failure.root?
      failure
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'aho_corasick_matcher'
2
+
3
# Specification for AhoCorasickMatcher#match. Each context builds a matcher
# from a different kind of dictionary and checks the exact, ordered list of
# entries reported for a sample text.
RSpec.describe AhoCorasickMatcher do
  subject(:matcher) { described_class.new(dictionary) }

  context 'with an empty dictionary' do
    let(:dictionary) { [] }

    it 'finds no strings' do
      found = matcher.match('I am a test string')

      expect(found).to be_empty
    end
  end

  context 'with a single-entry dictionary' do
    let(:dictionary) { %w[TestString] }

    it 'finds matching strings' do
      found = matcher.match('I am a TestString')

      expect(found).to eq(%w[TestString])
    end

    it 'does not find non-matching strings' do
      found = matcher.match('I am a different string')

      expect(found).to be_empty
    end

    it 'finds all occurrences of strings' do
      found = matcher.match('I am a TestString and I say TestString twice')

      expect(found).to eq(%w[TestString TestString])
    end
  end

  context 'with a multiple-entry dictionary' do
    let(:dictionary) { %w[TestString1 TestString2] }

    it 'finds all matching strings' do
      found = matcher.match('I am both a TestString1 and a TestString2')

      expect(found).to eq(%w[TestString1 TestString2])
    end

    it 'finds partial matching strings' do
      found = matcher.match('I am a TestString1 but do not contain the other one')

      expect(found).to eq(%w[TestString1])
    end

    it 'does not find non-matching strings' do
      found = matcher.match('I am a different string')

      expect(found).to be_empty
    end
  end

  context 'with a multiple-entry dictionary including prefixes' do
    let(:dictionary) { %w[TestString TestStringExtended] }

    it 'finds all matching strings' do
      found = matcher.match('I contain TestStringExtended')

      expect(found).to eq(%w[TestString TestStringExtended])
    end

    it 'finds partial matching strings' do
      found = matcher.match('I am a TestString but do not contain the other one')

      expect(found).to eq(%w[TestString])
    end

    it 'does not find non-matching strings' do
      found = matcher.match('I am a different string')

      expect(found).to be_empty
    end
  end

  context 'with an overlapping dictionary' do
    let(:dictionary) { %w[Test String TestString] }

    it 'finds all matching strings' do
      found = matcher.match('TestStringTest')

      expect(found).to eq(%w[Test TestString String Test])
    end
  end
end
@@ -0,0 +1,14 @@
1
# Shared RSpec settings for the suite.
RSpec.configure do |config|
  # Run only examples tagged :focus, or the whole suite when none are tagged.
  config.filter_run(:focus)
  config.run_all_when_everything_filtered = true

  config.disable_monkey_patching!
  config.warnings = true

  # Randomise example order and seed Kernel's generator with the same seed.
  config.order = :random
  Kernel.srand(config.seed)

  # Use the documentation formatter when exactly one spec file is being run.
  config.default_formatter = 'doc' if config.files_to_run.one?

  config.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end
end
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: aho_corasick_matcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Matthew MacLeod
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-06-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.2'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.2'
27
+ description: |2
28
+ Uses the fast Aho-Corasick text search system to find occurrences of any of
29
+ a dictionary of strings across an input string.
30
+ email: support@altmetric.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - LICENSE
36
+ - README.md
37
+ - lib/aho_corasick_matcher.rb
38
+ - spec/aho_corasick_matcher_spec.rb
39
+ - spec/spec_helper.rb
40
+ homepage: https://github.com/altmetric/aho_corasick_matcher
41
+ licenses:
42
+ - MIT
43
+ metadata: {}
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubyforge_project:
60
+ rubygems_version: 2.4.5
61
+ signing_key:
62
+ specification_version: 4
63
+ summary: A library to search text for occurrences of a list of strings
64
+ test_files:
65
+ - spec/aho_corasick_matcher_spec.rb
66
+ - spec/spec_helper.rb