ahocorasick 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/ahocorasick.rb +291 -0
  3. metadata +52 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b52f23ad37cb73fd57f7dcacc0dca1fcc19c5f59
4
+ data.tar.gz: dcf6946c522cfee562659568a5fe8a8d00ba8029
5
+ SHA512:
6
+ metadata.gz: 258ca8778739da6f927998c9474d0812c1a9c5e00c74d73cae2151a649c552c45410b3565a63ba26e41999f4a49df236d3a4d68545fed3e1cd4ea2ee423b5b8b
7
+ data.tar.gz: 15ad5d4d259a965e5992d89c4730d993f7283f0fc01d5411aa279a70698c851c63b51d28df058e1a0238842030b5866a4f6b875dd5b7b23999a81b53d8fad876
@@ -0,0 +1,291 @@
1
+ #==Aho-Corasick Trie Data Structure
2
+ #:title:Aho-Corasick Module
3
+ #
4
+ #A set of classes that implement an automaton for the Aho-Corasick algorithm.
5
+ #
6
+ #The Aho-Corasick algorithm provides a linear-time lookup solution to the exact
7
+ #set matching problem. (i.e. locating all occurrences of a finite set of
8
+ #patterns within an input target) The algorithm processes the input target in
9
+ #a single pass versus multiple passes for a pattern.
10
+ #
11
+ #An example use might be to search for known sequences in a DNA string.
12
+ #Although typically used to parse character strings, this implementation
13
+ #is type independent. It can be used to parse any collection where
14
+ #the items that make up that collection can be iterated.
15
+ #
16
+ #The implementation supports both a Non-deterministic and Deterministic
17
+ #Finite Automaton construction. The NFA will consume less memory, but
18
+ #will require more transitions than its DFA counterpart.
19
+ #
20
+ #<b>Author: Alexander Nick</b>
21
+ #
22
+ #<b>Date: 04/05/2011</b>
23
module AhoC

  #==Aho-Corasick Trie Class
  #
  #====Description
  #A class for constructing and accessing an Aho-Corasick trie
  #data structure. Strings are processed character by character; any
  #other pattern must support the "each" method (e.g. arrays).
  #
  #Patterns may be added after a build; call build again before the
  #next lookup so the new nodes receive failure transitions.
  #
  #===Example Usage:
  #====Building an Aho-Corasick Trie
  # myTrie = AhoC::Trie.new
  # myTrie.add("he")
  # myTrie.add("she")
  # myTrie.add("hers")
  # myTrie.add("his")
  # myTrie.build
  #
  #====Looking up a string
  # myTrie.lookup("ushers")
  #
  #====Output from Lookup
  # ["she", "he", "hers"]
  class Trie

    # Creates an empty AhoC Trie with only a root node.
    #
    # Accepts an optional argument (either :DFA or :NFA)
    # indicating the type of automaton to build. If no
    # argument is passed an NFA will be built.
    #
    # Raises a RuntimeError for any other argument.
    def initialize *arg
      @root = Node.new

      case arg[0]
      when nil, false, :NFA
        @type = :NFA
      when :DFA
        @type = :DFA
      else
        raise "Only :DFA or :NFA accepted as arguments"
      end
    end

    # Add a pattern to the Trie.
    #
    # Accepts an optional block that can be used
    # to modify the node output. The block receives the pattern and
    # its result is stored as the output of the pattern's final node;
    # without a block the output is the pattern wrapped in an array.
    #
    # Returns the output stored at the final node.
    def add pattern
      node = @root

      # Only follow real trie edges (Node#child), never the derived
      # transitions a previous DFA build may have left in the node.
      each_item(pattern) do |item|
        node = node.child(item) || node.add(item)
      end

      value = block_given? ? yield(pattern) : [pattern]

      # own_output keeps the pre-build value so that build can be
      # repeated without stacking failure outputs on top of each other.
      node.own_output = value
      node.output = value
    end

    # Sets the failure transitions and output for each node.
    # Call this method once all the patterns have been added to the Trie.
    # It may be called again after further patterns are added; every
    # call starts from the plain trie, so outputs are not duplicated.
    #
    # Accepts an optional block that can be used to modify the output
    # constructed from the node and its failure nodes. The block receives
    # the node's own output and the output of its failure node (either
    # may be nil) and its result becomes the node's output. It is only
    # called for nodes whose failure node is reached through a goto
    # transition, not for nodes that simply fall back to the root.
    def build
      reset_nodes

      queue = []

      # Nodes directly below the root always fail back to the root.
      @root.get.each_value do |child|
        child.failure = @root
        queue.push child
      end

      # Breadth-first order guarantees that the failure node of a parent
      # (and that node's final output) is known before its children are
      # processed. A failure identifies the deepest node that is a
      # proper suffix of the current node.
      until queue.empty?
        parent = queue.shift

        parent.get.each_pair do |item, child|
          queue.push child

          # Follow the failures until we find a goto transition
          # or arrive back at the root node.
          fallback = parent.failure
          fallback = fallback.failure until fallback.goto(item) || fallback.equal?(@root)

          target = fallback.goto(item)

          if target.nil?
            # Only reachable at the root: no suffix continues with item.
            child.failure = @root
          else
            child.failure = target

            if block_given?
              child.output = yield child.output, target.output
            elsif target.output
              child.output = child.output ? child.output + target.output : target.output
            end
          end
        end
      end

      build_dfa if @type == :DFA
    end

    # Finds all occurrences of patterns in the Trie contained in target.
    # Outputs an array of patterns contained within the target.
    #
    # Accepts an optional argument to override the initial output value
    # (e.g. String.new); without a block the node outputs are appended
    # to it with "+".
    # Accepts an optional block that can modify how the output is built
    # from each node. The block receives the output so far and the
    # matching node's output, and returns the new output.
    def lookup target, *arg
      output = arg[0] ? arg[0] : []
      node = @root

      each_item(target) do |item|
        # Follow the failures until a goto transition is found
        # or we return to the root node.
        node = node.failure until node.goto(item) || node.equal?(@root)

        # No transition here means we are at the root and stay there.
        following = node.goto(item)
        next unless following

        node = following
        next unless node.output

        output = block_given? ? yield(output, node.output) : output + node.output
      end

      output
    end

    private

    # Yields each element of sequence: the characters of a String, or
    # whatever "each" produces for any other collection.
    def each_item sequence, &block
      if sequence.is_a?(String)
        sequence.each_char(&block)
      else
        sequence.each(&block)
      end
    end

    # Returns every node to its plain-trie state (see Node#reset) so a
    # build always starts from the same data.
    def reset_nodes
      pending = [@root]

      until pending.empty?
        node = pending.pop
        node.reset
        # After the reset only real children remain, so this walk
        # cannot run into the cycles created by DFA transitions.
        pending.concat node.get.values
      end
    end

    # Builds a Deterministic Finite Automaton from the NFA already constructed.
    def build_dfa
      queue = @root.get.values

      # Any item without a transition leads back to the root.
      @root.get.default = @root

      until queue.empty?
        node = queue.shift

        # Queue the children first: at this point the node still holds
        # only its original trie edges.
        queue.concat node.get.values

        # Assign transitions at the failure node as goto transitions of
        # our current node. The failure node is shallower, so its own
        # table has already been completed.
        node.failure.get.each_pair do |item, destination|
          # Don't overwrite an already existing transition
          node.set(item, destination) unless node.get.key?(item)
        end

        node.get.default = @root
      end
    end

  end

  #==Aho-Corasick Node Class
  #
  #====Description
  #Class for creating and accessing nodes that are added to an Aho-Corasick Trie.
  class Node

    # failure: gets and sets the node to go to when no goto transition exists for an item.
    attr_accessor :failure
    # output: gets and sets the output at the node.
    attr_accessor :output
    # own_output: gets and sets the output assigned to this node itself,
    # before any output of failure nodes is merged in. Used by reset.
    attr_accessor :own_output
    # parent: the node whose add call created this node (nil otherwise).
    attr_reader :parent

    # Creates an empty hash table to store goto transitions for this node.
    #
    # Accepts an optional parent node; Node#add passes itself.
    def initialize(parent = nil)
      @hash = {}
      @parent = parent
    end

    # Creates a node in the hash table for "item". This represents
    # a goto transition in the Aho-Corasick automaton for the item.
    # Returns the new node.
    def add(item)
      @hash[item] = Node.new(self)
    end

    # Return the hash that contains all nodes pointed to by this node.
    def get
      @hash
    end

    # Returns the node pointed to by item.
    # If no node exists the default value is returned.
    def goto(item)
      # Hash#[] already falls back to the hash's default value.
      @hash[item]
    end

    # Assigns a node to the key "item" in the hash table.
    def set(item, node)
      @hash[item] = node
    end

    # Returns the node stored under item only if this node created it
    # with add; returns nil for a missing item, for the hash default and
    # for nodes stored with set.
    def child(item)
      node = @hash.fetch(item, nil)
      node if node && node.parent.equal?(self)
    end

    # Restores the node to its plain-trie state: clears the default
    # transition, drops every transition to a node this node did not
    # create with add, and resets output to own_output.
    def reset
      @hash.default = nil
      @hash.delete_if { |_item, node| !node.parent.equal?(self) }
      @output = @own_output
    end
  end

end
metadata ADDED
@@ -0,0 +1,52 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ahocorasick
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Alexander H. Nick, III
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2013-03-20 00:00:00 Z
13
+ dependencies: []
14
+
15
+ description: This ruby implementation of the Aho-Corasick algorithm focuses on memory efficiency, type independence, and output customization
16
+ email: xan.nick@gmail.com
17
+ executables: []
18
+
19
+ extensions: []
20
+
21
+ extra_rdoc_files: []
22
+
23
+ files:
24
+ - lib/ahocorasick.rb
25
+ homepage: https://github.com/ahnick/ahocorasick
26
+ licenses: []
27
+
28
+ metadata: {}
29
+
30
+ post_install_message:
31
+ rdoc_options: []
32
+
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - &id001
38
+ - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: "0"
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - *id001
44
+ requirements: []
45
+
46
+ rubyforge_project:
47
+ rubygems_version: 2.0.3
48
+ signing_key:
49
+ specification_version: 4
50
+ summary: A memory-efficient, pure Ruby implementation of the Aho-Corasick algorithm.
51
+ test_files: []
52
+