aho_corasick_matcher 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cc70023665fc36d0b86508ef0d7c3e330beda705
4
+ data.tar.gz: e22d0d38e7305f540a6aa3a6f07ca259931eba3a
5
+ SHA512:
6
+ metadata.gz: 62ba80e800c76fd5bc0db9157746413659b9141f63e17e5d0d7b5c7a6736a13dee2e4aee20adcd41cbc8f57257fbd87646a3f6b93620ef0d2c27010d0548a8c3
7
+ data.tar.gz: 693edb06c4dfd18e8bcbeac19c723bfcf5ec75eaa752388e0cbb55a306f2cb1206581ca151067252646ac3c8c59eb19bf722678c700658703ab6f3b3a1e5b9f2
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Altmetric LLP
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # AhoCorasickMatcher [![Build Status](https://travis-ci.org/altmetric/aho_corasick_matcher.svg?branch=master)](https://travis-ci.org/altmetric/aho_corasick_matcher)
2
+
3
+ A Ruby gem for finding strings in text using the [Aho-Corasick string matching search](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.96.4671&rep=rep1&type=pdf).
4
+
5
+ Aho-Corasick is `O(n + m + z)` where `n` is the size of the string to be searched,
6
+ `m` is the total size of the dictionary and `z` is the number of matches found.
7
+ This makes it particularly suited to searching for occurrences of words using
8
+ large dictionaries, as the runtime increases only linearly.
9
+
10
+ It's quite memory-intensive, and building a matcher is expensive – but once it's
11
+ been built, matching terms is very fast.
12
+
13
+ **Current version:** 1.0.0
14
+ **Supported Ruby versions:** 1.9.2, 1.9.3, 2.0, 2.1, 2.2, jruby-1.7, rbx-2.2, rbx-2.4
15
+
16
+ ## Usage
17
+
18
+ ```ruby
19
+ require 'aho_corasick_matcher'
20
+
21
+ matcher = AhoCorasickMatcher.new(['a', 'b', 'ab'])
22
+ matcher.match('aba')
23
+ #=> ['a', 'ab', 'b', 'a']
24
+
25
+ matcher = AhoCorasickMatcher.new(["thistle", "sift", "thistles"])
26
+ matcher.match("Theophilus thistle, the successful thistle sifter, in sifting a sieve full of un-sifted thistles, thrust three thousand thistles through the thick of his thumb.")
27
+ #=> ["thistle", "thistle", "sift", "sift", "sift", "thistle", "thistles", "thistle", "thistles"]
28
+ ```
29
+
30
+ ## Thanks
31
+
32
+ Loosely based on Tim Cowlishaw's implementation of the same algorithm: <https://github.com/timcowlishaw/aho_corasick>.
33
+
34
+ ## License
35
+
36
+ Copyright © 2015 Altmetric LLP
37
+
38
+ Distributed under the MIT License.
@@ -0,0 +1,95 @@
1
+ require 'thread'
2
+
3
# Finds every occurrence of a fixed dictionary of strings inside a text.
#
# The dictionary is compiled once into a trie whose nodes also carry a
# "suffix" link to the node spelling the longest proper suffix of their own
# path that is itself a path in the trie. Matching then walks the text one
# character at a time, following suffix links whenever no child fits.
class AhoCorasickMatcher
  attr_reader :root
  private :root

  # Compiles +dictionary+ into a matcher.
  #
  # @param dictionary [Enumerable<String>] the strings to look for. An empty
  #   string is accepted but is never reported by {#match}.
  def initialize(dictionary)
    @root = Node.new

    build_trie(dictionary)
    build_suffix_map
  end

  # Scans +string+ and collects every dictionary entry occurring in it.
  #
  # @param string [String] the text to search
  # @return [Array<String>] one element per occurrence, ordered by the
  #   position at which the occurrence ends; entries ending at the same
  #   position are listed longest first
  def match(string)
    matches = []
    node = root

    string.each_char do |char|
      node = node.search(char)

      if node
        # concat rather than push(*list): no argument splat of the whole list.
        matches.concat(node.matches)
      else
        # Nothing in the trie continues with this character: restart at root.
        node = root
      end
    end

    matches
  end

  private

  # Inserts every dictionary entry into the trie and records the entry on the
  # node that terminates its path.
  def build_trie(dictionary)
    dictionary.each do |string|
      terminal = string.each_char.reduce(root) do |node, char|
        node.child_or_create(char)
      end

      terminal.matches << string
    end
  end

  # Assigns suffix links breadth-first, so that every node's suffix (which is
  # always shallower than the node itself) is complete before it is needed.
  def build_suffix_map
    pending = root.children
    pending.each { |child| child.suffix = root }

    until pending.empty?
      node = pending.shift
      pending.concat(node.children)
      node.build_child_suffixes
    end
  end

  # A single trie state. Children are keyed by the one-character String read
  # from the input, so no Symbol is ever created from searched text.
  class Node
    attr_reader :matches, :child_map, :parent
    attr_accessor :suffix

    # @param parent [Node, nil] the node this one hangs off; nil for the root
    def initialize(parent = nil)
      @matches = []
      @child_map = {}
      @parent = parent
      @suffix = nil
    end

    # Finds the state reached by reading +char+ from this node: this node's
    # own child if it has one, otherwise the first such child met while
    # following suffix links. Iterative, so long suffix chains cannot exhaust
    # the call stack.
    #
    # @return [Node, nil] nil when no node on the suffix chain has the child
    def search(char)
      node = self

      while node
        child = node.child_map[char]
        return child if child

        node = node.suffix
      end

      nil
    end

    # @return [Node] the existing child for +char+, created if missing
    def child_or_create(char)
      @child_map[char] ||= self.class.new(self)
    end

    # @return [Array<Node>] a fresh array of this node's direct children
    def children
      @child_map.values
    end

    # @return [Boolean] true for the node that has no parent
    def root?
      parent.nil?
    end

    # Sets the suffix link of every direct child and appends the matches of
    # that suffix to the child's own, so a single lookup at match time yields
    # every entry ending at the child.
    def build_child_suffixes
      child_map.each do |char, child|
        failure = find_failure_node(char)
        child_suffix = failure.search(char)

        if child_suffix
          child.suffix = child_suffix
          child.matches.concat(child_suffix.matches)
        else
          # find_failure_node only gives up at the root, so this is the root.
          child.suffix = failure
        end
      end
    end

    # Returns the node on this node's suffix chain from which +char+ can be
    # read, or the root when there is none.
    #
    # Because #search already follows suffix links, a miss on the first link
    # means a miss on every later one; in that case walk straight to the root
    # instead of searching again at each step.
    def find_failure_node(char)
      failure = suffix
      return failure if failure.search(char)

      failure = failure.suffix until failure.root?
      failure
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'aho_corasick_matcher'
2
+
3
# Behavioural examples for AhoCorasickMatcher#match. Each context supplies its
# own +dict+, from which the shared +matcher+ subject is built.
RSpec.describe AhoCorasickMatcher do
  subject(:matcher) { described_class.new(dict) }

  context 'with an empty dictionary' do
    let(:dict) { [] }

    it 'finds no strings' do
      found = matcher.match('I am a test string')

      expect(found).to be_empty
    end
  end

  context 'with a single-entry dictionary' do
    let(:dict) { %w[TestString] }

    it 'finds matching strings' do
      found = matcher.match('I am a TestString')

      expect(found).to eq(%w[TestString])
    end

    it 'does not find non-matching strings' do
      found = matcher.match('I am a different string')

      expect(found).to be_empty
    end

    it 'finds all occurrences of strings' do
      found = matcher.match('I am a TestString and I say TestString twice')

      expect(found).to eq(%w[TestString TestString])
    end
  end

  context 'with a multiple-entry dictionary' do
    let(:dict) { %w[TestString1 TestString2] }

    it 'finds all matching strings' do
      found = matcher.match('I am both a TestString1 and a TestString2')

      expect(found).to eq(%w[TestString1 TestString2])
    end

    it 'finds partial matching strings' do
      found = matcher.match('I am a TestString1 but do not contain the other one')

      expect(found).to eq(%w[TestString1])
    end

    it 'does not find non-matching strings' do
      found = matcher.match('I am a different string')

      expect(found).to be_empty
    end
  end

  # One entry is a prefix of the other: both must be reported.
  context 'with a multiple-entry dictionary including prefixes' do
    let(:dict) { %w[TestString TestStringExtended] }

    it 'finds all matching strings' do
      found = matcher.match('I contain TestStringExtended')

      expect(found).to eq(%w[TestString TestStringExtended])
    end

    it 'finds partial matching strings' do
      found = matcher.match('I am a TestString but do not contain the other one')

      expect(found).to eq(%w[TestString])
    end

    it 'does not find non-matching strings' do
      found = matcher.match('I am a different string')

      expect(found).to be_empty
    end
  end

  # Entries that share text with each other: every occurrence is reported,
  # in the order in which the occurrences end.
  context 'with an overlapping dictionary' do
    let(:dict) { %w[Test String TestString] }

    it 'finds all matching strings' do
      found = matcher.match('TestStringTest')

      expect(found).to eq(%w[Test TestString String Test])
    end
  end
end
@@ -0,0 +1,14 @@
1
# Suite-wide RSpec settings.
RSpec.configure do |rspec|
  # Run only examples tagged :focus, or the whole suite when none are tagged.
  rspec.filter_run(:focus)
  rspec.run_all_when_everything_filtered = true

  rspec.disable_monkey_patching!
  rspec.warnings = true

  # Randomise example order and seed Kernel's RNG from the same seed.
  rspec.order = :random
  Kernel.srand(rspec.seed)

  # Use the documentation formatter when exactly one file is being run.
  if rspec.files_to_run.one?
    rspec.default_formatter = 'doc'
  end

  rspec.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end
end
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: aho_corasick_matcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Matthew MacLeod
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-06-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.2'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.2'
27
+ description: |2
28
+ Uses the fast Aho-Corasick text search system to find occurrences of any of
29
+ a dictionary of strings across an input string.
30
+ email: support@altmetric.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - LICENSE
36
+ - README.md
37
+ - lib/aho_corasick_matcher.rb
38
+ - spec/aho_corasick_matcher_spec.rb
39
+ - spec/spec_helper.rb
40
+ homepage: https://github.com/altmetric/aho_corasick_matcher
41
+ licenses:
42
+ - MIT
43
+ metadata: {}
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubyforge_project:
60
+ rubygems_version: 2.4.5
61
+ signing_key:
62
+ specification_version: 4
63
+ summary: A library to search text for occurrences of a list of strings
64
+ test_files:
65
+ - spec/aho_corasick_matcher_spec.rb
66
+ - spec/spec_helper.rb