ahocorasick 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/ahocorasick.rb +291 -0
- metadata +52 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b52f23ad37cb73fd57f7dcacc0dca1fcc19c5f59
|
4
|
+
data.tar.gz: dcf6946c522cfee562659568a5fe8a8d00ba8029
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 258ca8778739da6f927998c9474d0812c1a9c5e00c74d73cae2151a649c552c45410b3565a63ba26e41999f4a49df236d3a4d68545fed3e1cd4ea2ee423b5b8b
|
7
|
+
data.tar.gz: 15ad5d4d259a965e5992d89c4730d993f7283f0fc01d5411aa279a70698c851c63b51d28df058e1a0238842030b5866a4f6b875dd5b7b23999a81b53d8fad876
|
data/lib/ahocorasick.rb
ADDED
@@ -0,0 +1,291 @@
|
|
1
|
+
#==Aho-Corasick Trie Data Structure
|
2
|
+
#:title:Aho-Corasick Module
|
3
|
+
#
|
4
|
+
#A set of classes that implement an automaton for the Aho-Corasick algorithm.
|
5
|
+
#
|
6
|
+
#The Aho-Corasick algorithm provides a linear-time lookup solution to the exact
|
7
|
+
#set matching problem. (i.e. locating all occurrences of a finite set of
|
8
|
+
#patterns within an input target) The algorithm processes the input target in
|
9
|
+
#a single pass, rather than making a separate pass for each pattern.
|
10
|
+
#
|
11
|
+
#An example use might be to search for known sequences in a DNA string.
|
12
|
+
#Although typically used to parse character strings, this implementation
|
13
|
+
#is type independent. It can be used to parse any collection where
|
14
|
+
#the items that make up that collection can be iterated.
|
15
|
+
#
|
16
|
+
#The implementation supports both a Non-deterministic and Deterministic
|
17
|
+
#Finite Automaton construction. The NFA will consume less memory, but
|
18
|
+
#will require more transitions than its DFA counterpart.
|
19
|
+
#
|
20
|
+
#<b>Author: Alexander Nick</b>
|
21
|
+
#
|
22
|
+
#<b>Date: 04/05/2011</b>
|
23
|
+
module AhoC
|
24
|
+
|
25
|
+
#==Aho-Corasick Trie Class
|
26
|
+
#
|
27
|
+
#====Description
|
28
|
+
#A class for constructing and accessing an Aho-Corasick trie
|
29
|
+
#data structure. Any pattern that supports the "each" method
|
30
|
+
#in ruby can be added to the Trie. (e.g. strings or arrays)
|
31
|
+
#
|
32
|
+
#
|
33
|
+
#
|
34
|
+
#===Example Usage:
|
35
|
+
#====Building an Aho-Corasick Trie
|
36
|
+
# myTrie = AhoC::Trie.new
|
37
|
+
# myTrie.add("he")
|
38
|
+
# myTrie.add("she")
|
39
|
+
# myTrie.add("hers")
|
40
|
+
# myTrie.add("his")
|
41
|
+
# myTrie.build
|
42
|
+
#
|
43
|
+
#====Looking up a string
|
44
|
+
# myTrie.lookup("ushers")
|
45
|
+
#
|
46
|
+
#====Output from Lookup
|
47
|
+
# ["she", "he", "hers"]
|
48
|
+
class Trie

  # Creates an empty AhoC Trie with only a root node.
  #
  # Accepts an optional argument (either :DFA or :NFA) indicating the type
  # of automaton to build. If no argument is passed an NFA will be built.
  # Any other argument raises a RuntimeError.
  def initialize *arg
    @root = Node.new

    # Output given to a node by #add, keyed by node identity. #build merges
    # failure outputs into node.output, so the untouched value is kept here
    # to let a later #build start from a clean slate.
    @own_outputs = {}.compare_by_identity
    @built = false      # true once #build has run at least once
    @dfa_built = false  # true while DFA shortcut transitions are installed

    requested = arg[0]
    if !requested || requested == :NFA
      @type = :NFA
    elsif requested == :DFA
      @type = :DFA
    else
      raise "Only :DFA or :NFA accepted as arguments"
    end
  end

  # Add a pattern to the Trie. Strings are split into characters; any other
  # pattern is walked with its "each" method.
  #
  # Accepts an optional block that receives the pattern and returns the
  # output to store at the pattern's final node (default: [pattern]).
  #
  # Returns the stored output, or nil for an empty pattern. Empty patterns
  # are ignored: they would attach output to the root, which a DFA lookup
  # emits on every unmatched item while an NFA lookup never emits it.
  #
  # Call #build again after adding patterns to an already built Trie.
  def add pattern
    # DFA shortcut transitions and hash defaults would send the walk below
    # to nodes that are not on this pattern's path, so remove them first.
    prune_to_trie if @dfa_built

    node = @root
    each_item(pattern) do |item|
      node = node.goto(item) || node.add(item)
    end
    return nil if node.equal?(@root)

    own = block_given? ? yield(pattern) : [pattern]
    @own_outputs[node] = own
    node.output = own
  end

  # Sets the failure transitions and output for each node.
  # Call this method once all the patterns have been added to the Trie.
  # It may be called again after further #add calls; every call recomputes
  # the automaton from the patterns alone.
  #
  # Accepts an optional block that can be used to modify the output
  # constructed from the node and its failure nodes. It receives the node's
  # output and the failure node's output and returns the merged output.
  def build
    if @built
      # Undo the previous build: drop DFA shortcuts and put back the outputs
      # assigned by #add, otherwise inherited outputs are merged twice.
      prune_to_trie { |node| node.output = @own_outputs[node] }
    end
    @built = true

    queue = []

    # Nodes directly below the root always fail back to the root.
    @root.get.each_value do |child|
      child.failure = @root
      queue.push child
    end

    # Breadth-first, so a node's failure node (always shallower) already
    # carries its complete output when the node itself is processed.
    until queue.empty?
      parent = queue.shift

      parent.get.each_pair do |item, child|
        queue.push child

        # Follow failures until a goto transition for item exists
        # or the root is reached.
        fallback = parent.failure
        fallback = fallback.failure until fallback.goto(item) || fallback.equal?(@root)

        suffix_node = fallback.goto(item)
        if suffix_node.nil?
          child.failure = @root
          next
        end

        child.failure = suffix_node
        inherited = suffix_node.output

        if block_given?
          child.output = yield child.output, inherited
        elsif child.output && inherited
          child.output = child.output + inherited
        elsif inherited
          child.output = inherited
        end
      end
    end

    build_dfa if @type == :DFA
  end

  # Finds all occurrences of patterns in the Trie contained in target.
  # Outputs an array of patterns contained within the target.
  #
  # Accepts an optional argument to override the type of the output (e.g. String.new)
  # Accepts an optional block that can modify how the output is built from each node;
  # it receives the output so far and the node's output and returns the new output.
  def lookup target, *arg
    output = arg[0] || []
    node = @root

    each_item(target) do |item|
      # Follow the failures until a goto transition is found
      # or we return to the root node.
      node = node.failure until node.goto(item) || node.equal?(@root)

      # No transition here means we are at the root; stay there.
      successor = node.goto(item)
      next unless successor

      node = successor
      next unless node.output

      output = block_given? ? yield(output, node.output) : output + node.output
    end

    output
  end

  private

  # Builds a Deterministic Finite Automaton from the NFA already constructed:
  # every node receives its failure node's transitions as its own, and any
  # remaining item leads back to the root through the hash default.
  def build_dfa
    @dfa_built = true

    queue = @root.get.values
    @root.get.default = @root

    until queue.empty?
      node = queue.shift

      # Only trie children are present at this point; the shortcuts for this
      # node are added just below.
      queue.concat(node.get.values)

      # Assign transitions at the failure node as goto transitions of the
      # current node, without overwriting an already existing transition.
      node.failure.get.each_pair do |item, successor|
        node.set(item, successor) unless node.goto(item)
      end

      node.get.default = @root
    end
  end

  # Yields each item of sequence: the characters of a String, otherwise
  # whatever the sequence's "each" method produces.
  def each_item(sequence, &visit)
    if String === sequence
      sequence.each_char(&visit)
    else
      sequence.each(&visit)
    end
  end

  # Removes everything build_dfa added (shortcut transitions and hash
  # defaults), visiting nodes breadth-first from the root. Yields each node
  # when a block is given.
  #
  # A shortcut always points to a node no deeper than its source, and all
  # such nodes have been reached by the time the source is processed, so
  # any transition to an already seen node is a shortcut.
  def prune_to_trie
    seen = {}.compare_by_identity
    seen[@root] = true
    queue = [@root]

    until queue.empty?
      node = queue.shift
      table = node.get
      table.default = nil

      table.delete_if do |_item, successor|
        if seen[successor]
          true
        else
          seen[successor] = true
          queue.push successor
          false
        end
      end

      yield node if block_given?
    end

    @dfa_built = false
  end

end
|
251
|
+
|
252
|
+
#==Aho-Corasick Node Class
|
253
|
+
#
|
254
|
+
#====Description
|
255
|
+
#Class for creating and accessing nodes that are added to an Aho-Corasick Trie.
|
256
|
+
class Node

  # failure: the node to continue from when no goto transition exists for an item.
  # output:  whatever should be reported when this node is reached.
  attr_accessor :failure, :output

  # Starts the node with an empty table of goto transitions.
  def initialize
    @hash = Hash.new
  end

  # Creates a fresh node, stores it as the goto transition for "item"
  # (replacing any transition already stored for it) and returns it.
  def add(item)
    child = Node.new
    @hash[item] = child
    child
  end

  # Returns the transition table itself (item => node), not a copy.
  def get
    return @hash
  end

  # Returns the node stored for item. When nothing (or a falsy value) is
  # stored, the table's default value is returned instead.
  def goto(item)
    found = @hash[item]
    return found if found

    @hash.default
  end

  # Stores node as the goto transition for "item" and returns node.
  def set(item, node)
    @hash.store(item, node)
  end
end
|
290
|
+
|
291
|
+
end
|
metadata
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ahocorasick
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Alexander H. Nick, III
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2013-03-20 00:00:00 Z
|
13
|
+
dependencies: []
|
14
|
+
|
15
|
+
description: This ruby implementation of the Aho-Corasick algorithm focuses on memory efficiency, type independence, and output customization
|
16
|
+
email: xan.nick@gmail.com
|
17
|
+
executables: []
|
18
|
+
|
19
|
+
extensions: []
|
20
|
+
|
21
|
+
extra_rdoc_files: []
|
22
|
+
|
23
|
+
files:
|
24
|
+
- lib/ahocorasick.rb
|
25
|
+
homepage: https://github.com/ahnick/ahocorasick
|
26
|
+
licenses: []
|
27
|
+
|
28
|
+
metadata: {}
|
29
|
+
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
|
33
|
+
require_paths:
|
34
|
+
- lib
|
35
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- &id001
|
38
|
+
- ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: "0"
|
41
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- *id001
|
44
|
+
requirements: []
|
45
|
+
|
46
|
+
rubyforge_project:
|
47
|
+
rubygems_version: 2.0.3
|
48
|
+
signing_key:
|
49
|
+
specification_version: 4
|
50
|
+
summary: A memory efficient, pure Ruby implementation of the Aho-Corasick algorithm.
|
51
|
+
test_files: []
|
52
|
+
|