ahocorasick 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/ahocorasick.rb +291 -0
  3. metadata +52 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b52f23ad37cb73fd57f7dcacc0dca1fcc19c5f59
4
+ data.tar.gz: dcf6946c522cfee562659568a5fe8a8d00ba8029
5
+ SHA512:
6
+ metadata.gz: 258ca8778739da6f927998c9474d0812c1a9c5e00c74d73cae2151a649c552c45410b3565a63ba26e41999f4a49df236d3a4d68545fed3e1cd4ea2ee423b5b8b
7
+ data.tar.gz: 15ad5d4d259a965e5992d89c4730d993f7283f0fc01d5411aa279a70698c851c63b51d28df058e1a0238842030b5866a4f6b875dd5b7b23999a81b53d8fad876
@@ -0,0 +1,291 @@
1
+ #==Aho-Corasick Trie Data Structure
2
+ #:title:Aho-Corasick Module
3
+ #
4
+ #A set of classes that implement an automaton for the Aho-Corasick algorithm.
5
+ #
6
+ #The Aho-Corasick algorithm provides a linear-time lookup solution to the exact
7
+ #set matching problem. (i.e. locating all occurrences of a finite set of
8
+ #patterns within an input target) The algorithm processes the input target in
9
+ #a single pass, rather than making a separate pass for each pattern.
10
+ #
11
+ #An example use might be to search for known sequences in a DNA string.
12
+ #Although typically used to parse character strings, this implementation
13
+ #is type independent. It can be used to parse any collection where
14
+ #the items that make up that collection can be iterated.
15
+ #
16
+ #The implementation supports both a Non-deterministic and Deterministic
17
+ #Finite Automaton construction. The NFA will consume less memory, but
18
+ #will require more transitions than its DFA counterpart.
19
+ #
20
+ #<b>Author: Alexander Nick</b>
21
+ #
22
+ #<b>Date: 04/05/2011</b>
23
+ module AhoC
24
+
25
+ #==Aho-Corasick Trie Class
26
+ #
27
+ #====Description
28
+ #A class for constructing and accessing an Aho-Corasick trie
29
+ #data structure. Any pattern that supports the "each" method
30
+ #in ruby can be added to the Trie. (e.g. strings or arrays)
31
+ #
32
+ #
33
+ #
34
+ #===Example Usage:
35
+ #====Building an Aho-Corasick Trie
36
+ # myTrie = AhoC::Trie.new
37
+ # myTrie.add("he")
38
+ # myTrie.add("she")
39
+ # myTrie.add("hers")
40
+ # myTrie.add("his")
41
+ # myTrie.build
42
+ #
43
+ #====Looking up a string
44
+ # myTrie.lookup("ushers")
45
+ #
46
+ #====Output from Lookup
47
+ # ["she", "he", "hers"]
48
+ class Trie
49
+
50
+ # Creates an empty AhoC Trie with only a root node
51
+ #
52
+ # Accepts an optional argument (either :DFA or :NFA)
53
+ # indicating the type of automaton to build. If no
54
+ # argument is passed an NFA will be built.
55
+ def initialize *arg
56
+ @root = Node.new
57
+
58
+ if !arg[0] || arg[0] == :NFA
59
+ @type = :NFA
60
+ elsif arg[0] == :DFA
61
+ @type = :DFA
62
+ else
63
+ raise "Only :DFA or :NFA accepted as arguments"
64
+ end
65
+ end
66
+
67
+ # Add a pattern to the Trie.
68
+ #
69
+ # Accepts an optional block that receives the pattern; the block's
70
+ # return value is stored as the node output instead of [pattern].
71
+ def add pattern
72
+ node = @root
73
+
74
+ # If this is a string process each character
75
+ if String(pattern) == pattern
76
+ pattern.each_char do |char|
77
+ if node.goto(char) == nil
78
+ node = node.add(char)
79
+ else
80
+ node = node.goto(char)
81
+ end
82
+ end
83
+ else # Otherwise, pattern should support "each" method.
84
+ for item in pattern
85
+ if node.goto(item) == nil
86
+ node = node.add(item)
87
+ else
88
+ node = node.goto(item)
89
+ end
90
+ end
91
+ end
92
+
93
+ if block_given?
94
+ node.output = yield pattern
95
+ else
96
+ node.output = [pattern]
97
+ end
98
+ end
99
+
100
+ # Sets the failure transitions and output for each node.
101
+ # Call this method once all the patterns have been added to the Trie.
102
+ #
103
+ # Accepts an optional block that receives a node's output and its failure
104
+ # node's output; the block's return value becomes the node's new output.
105
+ def build
106
+ fifo_q = Array.new
107
+
108
+ # Set the failures for the nodes coming out of the root node.
109
+ @root.get.each_pair do |item, node|
110
+ node.failure = @root
111
+ fifo_q.push node
112
+ end
113
+
114
+ # Set the failures in breadth-first search order
115
+ # using a FIFO queue. A node's failure is the deepest node whose path
116
+ # from the root is a proper suffix of the current node's path.
117
+ while !fifo_q.empty?
118
+ p_node = fifo_q.shift
119
+ if p_node.get
120
+ p_node.get.each_pair do |item, node|
121
+ # Push the current node onto the queue, so any child
122
+ # nodes can be processed later.
123
+ fifo_q.push node
124
+
125
+ f_node = p_node.failure
126
+
127
+ # Follow the failures until we find a goto transition
128
+ # or arrive back at the root node
129
+ while f_node.goto(item) == nil and !f_node.eql? @root
130
+ f_node = f_node.failure
131
+ end
132
+
133
+ if f_node.eql? @root and f_node.goto(item) == nil
134
+ node.failure = @root
135
+ else
136
+ node.failure = f_node.goto(item)
137
+ if block_given?
138
+ node.output = yield node.output, (node.failure).output
139
+ else
140
+ if node.output && (node.failure).output
141
+ node.output = node.output + (node.failure).output
142
+ elsif (node.failure).output
143
+ node.output = (node.failure).output
144
+ end
145
+ end
146
+ end
147
+ end
148
+ end
149
+ end
150
+
151
+ build_dfa if @type == :DFA
152
+
153
+ end
154
+
155
+ # Finds all occurrences of patterns in the Trie contained in target.
156
+ # Outputs an array of patterns contained within the target.
157
+ #
158
+ # Accepts an optional argument to override the type of the output (e.g. String.new)
159
+ # Accepts an optional block that receives the accumulated output and a node's output; its return value becomes the new accumulated output.
160
+ def lookup target, *arg
161
+ output = arg[0] ? arg[0] : Array.new
162
+ node = @root
163
+
164
+ # If this is a string process each character
165
+ if String(target) == target
166
+ target.each_char do |char|
167
+ # Follow the failures until a goto transition is found
168
+ # or we return to the root node.
169
+ while(!node.goto(char) and !node.eql? @root)
170
+ node = node.failure
171
+ end
172
+
173
+ # If there is a goto transition follow it; otherwise,
174
+ # we can assume we are at the root node.
175
+ if node.goto(char)
176
+ node = node.goto(char)
177
+
178
+ if node.output
179
+ if block_given?
180
+ output = yield output, node.output
181
+ else
182
+ output = output + node.output
183
+ end
184
+ end
185
+
186
+ end
187
+ end
188
+ else # Otherwise, target should support "each" method.
189
+ for item in target
190
+ # Follow the failures until a goto transition is found
191
+ # or we return to the root node.
192
+ while(!node.goto(item) and !node.eql? @root)
193
+ node = node.failure
194
+ end
195
+
196
+ # If there is a goto transition follow it; otherwise,
197
+ # we can assume we are at the root node.
198
+ if node.goto(item)
199
+ node = node.goto(item)
200
+
201
+ if node.output
202
+ if block_given?
203
+ output = yield output, node.output
204
+ else
205
+ output = output + node.output
206
+ end
207
+ end
208
+
209
+ end
210
+ end
211
+ end
212
+
213
+ return output
214
+ end
215
+
216
+ # Builds a Deterministic Finite Automaton from the NFA already constructed.
217
+ private
218
+ def build_dfa
219
+ fifo_q = Array.new
220
+
221
+ @root.get.each_pair do |item, node|
222
+ fifo_q.push node
223
+ end
224
+
225
+ @root.get.default = @root
226
+
227
+ while !fifo_q.empty?
228
+ node = fifo_q.shift
229
+
230
+ if node.get
231
+ node.get.each_pair do |item, node|
232
+ fifo_q.push node
233
+ end
234
+ end
235
+
236
+ # Assign transitions at the failure node as
237
+ # goto transitions of our current node
238
+ if (node.failure).get
239
+ (node.failure).get.each_pair do |f_item,f_node|
240
+
241
+ # Don't overwrite an already existing transition
242
+ node.set(f_item, f_node) unless node.goto(f_item)
243
+ end
244
+ end
245
+
246
+ node.get.default = @root
247
+ end
248
+ end
249
+
250
+ end
251
+
252
+ #==Aho-Corasick Node Class
253
+ #
254
+ #====Description
255
+ #Class for creating and accessing nodes that are added to an Aho-Corasick Trie.
256
+ class Node
257
+
258
+ # failure: gets and sets the node to go to when no goto transition exists for an item.
259
+ attr_accessor :failure
260
+ # output: gets and sets the output at the node.
261
+ attr_accessor :output
262
+
263
+ # Creates an empty hash table to store goto transitions for this node.
264
+ def initialize
265
+ @hash = {}
266
+ end
267
+
268
+ # Creates a node in the hash table for "item". This represents
269
+ # a goto transition in the Aho-Corasick automaton for the item.
270
+ def add(item)
271
+ @hash[item] = Node.new
272
+ end
273
+
274
+ # Return the hash that contains all nodes pointed to by this node.
275
+ def get
276
+ @hash
277
+ end
278
+
279
+ # Returns the node pointed to by item.
280
+ # If no node exists the default value is returned.
281
+ def goto(item)
282
+ @hash[item] ? @hash[item] : @hash.default
283
+ end
284
+
285
+ # Assigns a node to the key "item" in the hash table.
286
+ def set(item, node)
287
+ @hash[item] = node
288
+ end
289
+ end
290
+
291
+ end
metadata ADDED
@@ -0,0 +1,52 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ahocorasick
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Alexander H. Nick, III
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2013-03-20 00:00:00 Z
13
+ dependencies: []
14
+
15
+ description: This ruby implementation of the Aho-Corasick algorithm focuses on memory efficiency, type independence, and output customization
16
+ email: xan.nick@gmail.com
17
+ executables: []
18
+
19
+ extensions: []
20
+
21
+ extra_rdoc_files: []
22
+
23
+ files:
24
+ - lib/ahocorasick.rb
25
+ homepage: https://github.com/ahnick/ahocorasick
26
+ licenses: []
27
+
28
+ metadata: {}
29
+
30
+ post_install_message:
31
+ rdoc_options: []
32
+
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - &id001
38
+ - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: "0"
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - *id001
44
+ requirements: []
45
+
46
+ rubyforge_project:
47
+ rubygems_version: 2.0.3
48
+ signing_key:
49
+ specification_version: 4
50
+ summary: A memory efficient, pure Ruby implementation of the Aho-Corasick algorithm.
51
+ test_files: []
52
+