ahocorasick 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/ahocorasick.rb +291 -0
- metadata +52 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b52f23ad37cb73fd57f7dcacc0dca1fcc19c5f59
|
4
|
+
data.tar.gz: dcf6946c522cfee562659568a5fe8a8d00ba8029
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 258ca8778739da6f927998c9474d0812c1a9c5e00c74d73cae2151a649c552c45410b3565a63ba26e41999f4a49df236d3a4d68545fed3e1cd4ea2ee423b5b8b
|
7
|
+
data.tar.gz: 15ad5d4d259a965e5992d89c4730d993f7283f0fc01d5411aa279a70698c851c63b51d28df058e1a0238842030b5866a4f6b875dd5b7b23999a81b53d8fad876
|
data/lib/ahocorasick.rb
ADDED
@@ -0,0 +1,291 @@
|
|
1
|
+
#==Aho-Corasick Trie Data Structure
|
2
|
+
#:title:Aho-Corasick Module
|
3
|
+
#
|
4
|
+
#A set of classes that implement an automaton for the Aho-Corasick algorithm.
|
5
|
+
#
|
6
|
+
#The Aho-Corasick algorithm provides a linear-time lookup solution to the exact
|
7
|
+
#set matching problem. (i.e. locating all occurrences of a finite set of
|
8
|
+
#patterns within an input target) The algorithm processes the input target in
|
9
|
+
#a single pass, rather than making a separate pass for each pattern.
|
10
|
+
#
|
11
|
+
#An example use might be to search for known sequences in a DNA string.
|
12
|
+
#Although typically used to parse character strings, this implementation
|
13
|
+
#is type independent. It can be used to parse any collection where
|
14
|
+
#the items that make up that collection can be iterated.
|
15
|
+
#
|
16
|
+
#The implementation supports both a Non-deterministic and Deterministic
|
17
|
+
#Finite Automaton construction. The NFA will consume less memory, but
|
18
|
+
#will require more transitions than its DFA counterpart.
|
19
|
+
#
|
20
|
+
#<b>Author: Alexander Nick</b>
|
21
|
+
#
|
22
|
+
#<b>Date: 04/05/2011</b>
|
23
|
+
module AhoC

  #==Aho-Corasick Trie Class
  #
  #====Description
  #A class for constructing and accessing an Aho-Corasick trie
  #data structure. Strings are processed character by character; any
  #other pattern must support the "each" method (e.g. arrays).
  #
  #===Example Usage:
  #====Building an Aho-Corasick Trie
  # myTrie = AhoC::Trie.new
  # myTrie.add("he")
  # myTrie.add("she")
  # myTrie.add("hers")
  # myTrie.add("his")
  # myTrie.build
  #
  #====Looking up a string
  # myTrie.lookup("ushers")
  #
  #====Output from Lookup
  # ["she", "he", "hers"]
  class Trie

    # Creates an empty AhoC Trie with only a root node.
    #
    # Accepts an optional argument (either :DFA or :NFA)
    # indicating the type of automaton to build. If no
    # argument (or a nil/false argument) is passed an NFA will be built.
    # Only the first argument is inspected.
    #
    # Raises a RuntimeError for any other value.
    def initialize(*arg)
      @root = Node.new

      case arg[0]
      when nil, false, :NFA
        @type = :NFA
      when :DFA
        @type = :DFA
      else
        raise "Only :DFA or :NFA accepted as arguments"
      end
    end

    # Adds a pattern to the Trie, creating any missing nodes along its path.
    #
    # By default the output stored at the pattern's final node is
    # <tt>[pattern]</tt>. If a block is given, the pattern is yielded to it
    # and the block's return value is stored as the node output instead.
    # Any output previously stored at that node is replaced.
    #
    # Returns the output that was stored.
    def add(pattern)
      node = @root

      each_item(pattern) do |item|
        # Reuse the existing transition when there is one; otherwise grow the trie.
        node = node.goto(item) || node.add(item)
      end

      node.output = block_given? ? yield(pattern) : [pattern]
    end

    # Sets the failure transitions and output for each node.
    # Call this method once all the patterns have been added to the Trie.
    #
    # When a node's failure node is not the root, the node's output is
    # combined with the failure node's output. Without a block the two
    # outputs are joined with <tt>+</tt> (a missing output on either side is
    # simply skipped). With a block, the block is yielded
    # <tt>(node_output, failure_output)</tt> -- either may be nil -- and its
    # return value becomes the node's output.
    #
    # If the Trie was created with :DFA, the automaton is converted to a
    # DFA once the failure transitions are in place.
    def build
      queue = []

      # Nodes directly below the root always fail back to the root.
      @root.get.each_value do |child|
        child.failure = @root
        queue.push(child)
      end

      # Breadth-first traversal: every node visited here already has its
      # failure set, and so does every node closer to the root. A failure
      # identifies the deepest node that is a proper suffix of the current node.
      until queue.empty?
        parent = queue.shift

        parent.get.each_pair do |item, child|
          # Queue the child so its own children are processed later.
          queue.push(child)

          # Follow the failures until we find a goto transition
          # or arrive back at the root node.
          fallback = parent.failure
          fallback = fallback.failure until fallback.goto(item) || fallback.eql?(@root)

          target = fallback.goto(item)
          if target
            child.failure = target
            child.output =
              if block_given?
                yield(child.output, target.output)
              else
                merge_outputs(child.output, target.output)
              end
          else
            # Only reachable at the root: no suffix of this node is in the trie.
            child.failure = @root
          end
        end
      end

      build_dfa if @type == :DFA
    end

    # Finds all occurrences of patterns in the Trie contained in target.
    # Returns the accumulated output (by default an array of the patterns
    # found, in the order their matches end).
    #
    # The Trie must have been built (see #build) before calling this method,
    # because the search follows the failure transitions set there.
    #
    # Accepts an optional argument used as the initial output value in place
    # of a new Array (e.g. String.new).
    # Accepts an optional block that is yielded
    # <tt>(output_so_far, node_output)</tt> for every node with output that is
    # reached; its return value becomes the new accumulated output. Without a
    # block the node output is appended with <tt>+</tt>, so an object passed
    # as the initial output is not mutated.
    def lookup(target, *arg)
      output = arg[0] || []
      node = @root

      each_item(target) do |item|
        # Follow the failures until a goto transition is found
        # or we return to the root node.
        node = node.failure until node.goto(item) || node.eql?(@root)

        # No transition here means we are at the root and the item
        # starts no pattern: stay put and move on to the next item.
        successor = node.goto(item)
        next unless successor

        node = successor
        matched = node.output
        next unless matched

        output = block_given? ? yield(output, matched) : output + matched
      end

      output
    end

    private

    # Builds a Deterministic Finite Automaton from the NFA already constructed.
    #
    # Each node receives the goto transitions of its failure node (without
    # overwriting its own), and every node's transition table gets the root
    # as its default, so a lookup never has to follow a failure transition.
    def build_dfa
      queue = @root.get.values

      @root.get.default = @root

      until queue.empty?
        node = queue.shift

        # Queue the real trie children first, before the failure node's
        # transitions are copied into this node's table below.
        node.get.each_value { |child| queue.push(child) }

        # Assign transitions at the failure node as goto transitions of
        # the current node. The default is still unset at this point, so
        # goto returns nil for items this node has no transition for.
        node.failure.get.each_pair do |item, successor|
          # Don't overwrite an already existing transition
          node.set(item, successor) unless node.goto(item)
        end

        node.get.default = @root
      end
    end

    # Yields each element of sequence: the characters of a String, or the
    # items produced by "each" for anything else.
    def each_item(sequence, &block)
      if sequence.is_a?(String)
        sequence.each_char(&block)
      else
        sequence.each(&block)
      end
    end

    # Combines a node's own output with the output inherited from its
    # failure node. Returns own unchanged when nothing is inherited.
    def merge_outputs(own, inherited)
      return own unless inherited

      own ? own + inherited : inherited
    end

  end

  #==Aho-Corasick Node Class
  #
  #====Description
  #Class for creating and accessing nodes that are added to an Aho-Corasick Trie.
  class Node

    # failure: gets and sets the node to go to when no goto transition exists for an item.
    attr_accessor :failure
    # output: gets and sets the output at the node.
    attr_accessor :output

    # Creates an empty hash table to store goto transitions for this node.
    def initialize
      @hash = {}
    end

    # Creates a node in the hash table for "item". This represents
    # a goto transition in the Aho-Corasick automaton for the item.
    # Any node already stored for "item" is replaced.
    #
    # Returns the newly created node.
    def add(item)
      @hash[item] = Node.new
    end

    # Return the hash that contains all nodes pointed to by this node.
    # This is the live table, not a copy: changes made to it (including
    # its default value) affect the node.
    def get
      @hash
    end

    # Returns the node pointed to by item.
    # If no node exists the hash's default value is returned
    # (nil unless a default has been assigned).
    def goto(item)
      @hash[item] || @hash.default
    end

    # Assigns a node to the key "item" in the hash table.
    def set(item, node)
      @hash[item] = node
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ahocorasick
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Alexander H. Nick, III
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2013-03-20 00:00:00 Z
|
13
|
+
dependencies: []
|
14
|
+
|
15
|
+
description: This ruby implementation of the Aho-Corasick algorithm focuses on memory efficiency, type independence, and output customization
|
16
|
+
email: xan.nick@gmail.com
|
17
|
+
executables: []
|
18
|
+
|
19
|
+
extensions: []
|
20
|
+
|
21
|
+
extra_rdoc_files: []
|
22
|
+
|
23
|
+
files:
|
24
|
+
- lib/ahocorasick.rb
|
25
|
+
homepage: https://github.com/ahnick/ahocorasick
|
26
|
+
licenses: []
|
27
|
+
|
28
|
+
metadata: {}
|
29
|
+
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
|
33
|
+
require_paths:
|
34
|
+
- lib
|
35
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- &id001
|
38
|
+
- ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: "0"
|
41
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- *id001
|
44
|
+
requirements: []
|
45
|
+
|
46
|
+
rubyforge_project:
|
47
|
+
rubygems_version: 2.0.3
|
48
|
+
signing_key:
|
49
|
+
specification_version: 4
|
50
|
+
summary: A memory efficient, pure Ruby implementation of the Aho-Corasick algorithm.
|
51
|
+
test_files: []
|
52
|
+
|