editalign 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,65 @@
1
+ = Edit Alignment Module
2
+
3
+ The module implements a dynamic programming string alignment algorithm
4
+ that produces both alignments and edit distances between source and
5
+ destination arrays or strings.
6
+
7
+ An alignment between the arrays is represented as a sequence of
8
+ insert, delete, and substitute operations that transform individual
9
+ source items into destination items. For example, the following is an
10
+ alignment between the letters in the words 'captained' and 'caspian':
11
+
12
+ c a - p t a i n e d
13
+ c a s p i a - n - -
14
+
15
+ Here an 's' was inserted, an 'i' was substituted for a 't', and an
16
+ 'i', an 'e', and a 'd' were deleted.
17
+
18
+ The module exports an Alignment class which assigns numeric costs to
19
+ each of these operations and finds the alignment that incurs the
20
+ minimum cost or edit distance. The Alignment class uses Dijkstra's
21
+ algorithm to efficiently find edit distances for partially-aligned
22
+ arrays. See the EditAlign::DijkstraSearch class for details.
23
+
24
+ irb(main):001:0> require 'editalign'
25
+ => true
26
+ irb(main):002:0> a = EditAlign::Alignment.new('captained', 'caspian')
27
+ => <Alignment: 5>
28
+ irb(main):003:0> a.edit_distance
29
+ => 5
30
+ irb(main):004:0> a.edit_operations
31
+ => [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
32
+ irb(main):005:0> a.source_alignment
33
+ => ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
34
+ irb(main):006:0> a.dest_alignment
35
+ => ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]
36
+ irb(main):007:0> puts a
37
+ ca-ptained
38
+ caspia-n--
39
+   I S D DD
40
+ 5
41
+ => nil
42
+
43
+ = History
44
+
45
+ * 1-0-0 ... First version
46
+
47
+ = See Also
48
+
49
+ * The Levenshtein[http://po-ru.com/projects/levenshtein/] module computes a Levenshtein edit distance using an exhaustive search.
50
+
51
+ = Acknowledgments
52
+
53
+ Thanks to Jeremy G. Kahn for suggesting a diagonal-hugging search
54
+ strategy and optimizations to the one implemented here.
55
+
56
+ = Copyright
57
+
58
+ Copyright 2006, William Patrick McNeill
59
+
60
+ This program is distributed under the GNU General Public License.
61
+
62
+ = Author
63
+
64
+ W.P. McNeill mailto:billmcn@u.washington.edu
65
+
@@ -0,0 +1,72 @@
1
+ #!/bin/env ruby
2
+
3
+ #--
4
+ # Copyright 2006 William Patrick McNeill
5
+ #
6
+ # This file is part of Editalign.
7
+ #
8
+ # Editalign is free software; you can redistribute it and/or modify it
9
+ # under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # Editalign is distributed in the hope that it will be useful, but
14
+ # WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ # General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with editalign; if not, write to the Free Software Foundation,
20
+ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #
22
+ #++
23
+
24
+ # Print the character alignment between two strings passed in on the
25
+ # command line.
26
+
27
+ require 'getoptlong'
28
+ require 'editalign'
29
+
30
# A Sellers alignment whose search step comes from
# EditAlign::ExhaustiveSearch rather than the search that
# EditAlign::SellersAlignment inherits.
class ExhaustiveSellersAlignment < EditAlign::SellersAlignment
  include(EditAlign::ExhaustiveSearch)
end
33
+
34
# Process command line options.
#
#   --match, -m COST      cost of substituting an item with an equal item
#   --nomatch, -n COST    cost of substituting an item with a different item
#   --insert, -i COST     cost of an insertion
#   --delete, -d COST     cost of a deletion
#   --exhaustive, -e      additionally run the exhaustive search
opts = GetoptLong.new(
  ["--match", "-m", GetoptLong::REQUIRED_ARGUMENT],
  ["--nomatch", "-n", GetoptLong::REQUIRED_ARGUMENT],
  ["--insert", "-i", GetoptLong::REQUIRED_ARGUMENT],
  ["--delete", "-d", GetoptLong::REQUIRED_ARGUMENT],
  ["--exhaustive", "-e", GetoptLong::NO_ARGUMENT]
)

match = 0
mismatch = 1
insert = 1
delete = 1
exhaustive = false
opts.each do |opt, arg|
  # Float() rejects malformed numbers instead of silently reading them
  # as 0.0 the way String#to_f does.
  case opt
  when "--match"
    match = Float(arg)
  when "--nomatch"
    mismatch = Float(arg)
  when "--insert"
    insert = Float(arg)
  when "--delete"
    delete = Float(arg)
  when "--exhaustive"
    exhaustive = true
  end
end

# Both strings are required; without them the alignment classes would be
# handed nil.
if ARGV.length < 2
  abort "Usage: #{File.basename($0)} [-m COST] [-n COST] [-i COST] [-d COST] [-e] SOURCE DEST"
end

source = ARGV[0]
dest = ARGV[1]

# Do alignments and print results.
alignments = [EditAlign::SellersAlignment]
alignments << ExhaustiveSellersAlignment if exhaustive

alignments.each do |align_class|
  a = align_class.new(source, dest, match, mismatch, insert, delete)
  # Stringify first: the lowest-cost path that to_grid puts in brackets
  # is only built once the edit sequence has been walked.
  summary = a.to_s
  puts summary, a.to_grid
end
@@ -0,0 +1,165 @@
1
+ #!/bin/env ruby
2
+
3
+ #--
4
+ # Copyright 2006 William Patrick McNeill
5
+ #
6
+ # This file is part of Editalign.
7
+ #
8
+ # Editalign is free software; you can redistribute it and/or modify it
9
+ # under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # Editalign is distributed in the hope that it will be useful, but
14
+ # WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ # General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with editalign; if not, write to the Free Software Foundation,
20
+ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #
22
+ #++
23
+
24
+ require "getoptlong"
25
+ require "editalign"
26
+
27
# Alignment that builds its search agenda with RubyPriorityQueue instead
# of the queue created by EditAlign::Alignment.
class PureRubyEditAlign < EditAlign::Alignment
  # Loads the pure-Ruby queue on first use and returns a new, empty one.
  def priority_queue_factory
    require 'priority_queue/ruby_priority_queue'
    RubyPriorityQueue.new
  end
end
33
+
34
# Alignment that uses the bare cost as the queue priority.
class IntegerPriorityEditAlign < EditAlign::Alignment
  # The priority of a cell is simply the cost to reach it; the cell
  # itself is ignored.
  def priority_factory(cost, cell)
    cost
  end
end
39
+
40
+
41
# Read this script's switches and positional arguments.
#
# Three positional arguments are required, in this order:
#
# * the number of alignments to perform
# * the length of the strings to align
# * the number of edits
#
# Three switches are optional:
#
# * verbose   ... prints strings and edit distances
# * pure-ruby ... selects PureRubyEditAlign
# * int-cost  ... selects IntegerPriorityEditAlign (ignored when
#   pure-ruby is also given)
#
# Returns [trials, length, edits, klass, verbose].
def parse_command_line
  flags = { "--verbose" => false, "--pure-ruby" => false, "--int-cost" => false }

  parser = GetoptLong.new(["--verbose", "-v", GetoptLong::NO_ARGUMENT],
                          ["--pure-ruby", "-p", GetoptLong::NO_ARGUMENT],
                          ["--int-cost", "-i", GetoptLong::NO_ARGUMENT])
  parser.each do |name, _value|
    flags[name] = true if flags.key?(name)
  end

  # Integer() is applied to each positional argument in turn, so a
  # missing or malformed one raises.
  trials, length, edits = [0, 1, 2].map { |index| Integer(ARGV[index]) }

  klass =
    if flags["--pure-ruby"]
      PureRubyEditAlign
    elsif flags["--int-cost"]
      IntegerPriorityEditAlign
    else
      EditAlign::Alignment
    end

  [trials, length, edits, klass, flags["--verbose"]]
end
86
+
87
# Perform +trials+ alignments of freshly generated string pairs.
#
# Each trial asks create_strings for a source and a randomly edited
# destination, then aligns them with +klass+. When +verbose+ is set the
# trial number and the alignment are printed.
def run_stress_test(trials, length, edits, klass, verbose)
  (1..trials).each do |trial|
    puts "Trial #{trial}" if verbose

    # A new random pair for every trial.
    pair = create_strings(length, edits)

    # The alignment is computed by the constructor.
    result = klass.new(*pair)
    print(result.to_s + "\n\n") if verbose
  end
end
101
+
102
# Create a source sequence and a randomly edited copy of it.
#
# length:: number of characters in the source
# edits::  number of edit operations to apply; capped at +length+
#
# Returns [source, dest]. +source+ is an Array of single-character
# strings cycling through the lowercase alphabet; +dest+ is a String
# built from +source+ with the edits applied. Substituted and inserted
# characters are uppercase letters.
def create_strings(length, edits)
  # The alphabet used for inserts and substitutions. Use to_a: a
  # block-less collect returns an Enumerator on Ruby 1.9 and later, and
  # an Enumerator supports neither #length nor #[].
  alphabet = ('A'..'Z').to_a

  # Source contains a repeating lowercase alphabet length characters
  # long.
  last = length - 1
  source = (0..last).map { |x| (97 + x % 26).chr }

  # There can only be as many edits as there are characters.
  edits = length if edits > length

  # Create a roughly even number of substitutions, inserts, and
  # deletes.
  n_subs = 0
  n_ins = 0
  n_dels = 0
  (1..edits).each do
    case rand(3)
    when 0
      n_subs += 1
    when 1
      n_ins += 1
    when 2
      n_dels += 1
    end
  end

  # Distribute edit operations randomly throughout the string. Each
  # position receives at most one operation.
  unchanged_pos = (0..last).to_a
  edit_op = [nil] * length

  [:substitute, :insert, :delete].zip([n_subs, n_ins, n_dels]) do |op, n|
    n.times do
      i = rand(unchanged_pos.length)
      edit_op[unchanged_pos.delete_at(i)] = op
    end
  end

  # Use the random edit operations to create a destination string.
  pieces = edit_op.zip(source).map do |op, item|
    case op
    when :substitute
      alphabet[rand(alphabet.length)]
    when :insert
      alphabet[rand(alphabet.length)] + item
    when :delete
      ''
    else
      item
    end
  end

  [source, pieces.join]
end
160
+
161
+
162
# Run the stress test only when this file is executed directly; the
# parsed settings are handed over in the order run_stress_test expects.
run_stress_test(*parse_command_line) if __FILE__ == $0
data/lib/editalign.rb ADDED
@@ -0,0 +1,516 @@
1
+ # Copyright 2006 William Patrick McNeill
2
+ #
3
+ # Editalign is free software; you can redistribute it and/or modify it
4
+ # under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation; either version 2 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # Editalign is distributed in the hope that it will be useful, but
9
+ # WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with editalign; if not, write to the Free Software Foundation,
15
+ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ # EditAlign is the namespace that contains all edit alignment
18
+ # functions.
19
module EditAlign

  # This module employs Dijkstra's algorithm to find the lowest-cost
  # sequence of edit operations that will transform the source array
  # into the destination array.  The alignment grid is treated as a
  # directed acyclic graph where each cell in the grid is a vertex.
  # Edges in the graph correspond to substitution, deletion and
  # insertion operations.  The edge weights come from the weighting
  # methods #substitute, #insert, and #delete.
  #
  # Generally speaking, the strategy is to search the diagonal of the
  # alignment grid before the corners.  For partially-aligned arrays,
  # this strategy can result in fewer calls to the weighting
  # functions.
  module DijkstraSearch

    # Cells to be searched in the Dijkstra priority queue are ordered
    # by SearchPriority.  SearchPriority orders cells first by cost,
    # and then if the costs are equal by smallest number of hops to
    # the end cell.  All things being equal, the latter comparison
    # makes the algorithm search cells near the diagonal first.  This
    # can help in instances where the beginnings of the arrays are
    # unaligned but the ends are aligned.
    class SearchPriority

      # The cost to reach the current cell
      attr_reader :cost

      # The minimum path length to the final cell
      attr_reader :dist

      include Comparable

      # Specify the cost to reach a cell, the cell, and the final
      # cell.
      def initialize(cost, cell, end_cell)
        @cost = cost
        # A diagonal step advances both coordinates at once, so the
        # shortest remaining path is the larger of the two gaps.
        @dist = [end_cell.source - cell.source, end_cell.dest - cell.dest].max
      end

      # Order by cost and then by distance to the end cell.
      def <=>(other)
        comp = (cost <=> other.cost)
        comp = (dist <=> other.dist) if comp == 0
        comp
      end

      # Interactive stringification
      def inspect
        "Pri(#{cost}, #{dist})"
      end
    end

    # Search the alignment grid filling in <em>@cost</em> and
    # <em>@backtrace</em>.
    def find_lowest_cost_alignment
      agenda = priority_queue_factory

      agenda[@start] = priority_factory(0, @start)
      @backtrace = {}

      until agenda.empty?
        cell, _priority = agenda.delete_min
        cost = @cost[cell]
        # Nothing left on the agenda can beat the best known cost of
        # reaching the end cell.
        break if cost >= @cost[@end]
        outgoing(cell) do |next_cell, next_cost|
          next_cost += @cost[cell]
          next unless next_cost < @cost[next_cell]
          @cost[next_cell] = next_cost
          @backtrace[next_cell] = cell
          agenda[next_cell] = priority_factory(next_cost, next_cell) \
            unless next_cost >= @cost[@end]
        end
      end
    end

    # An enumeration of all the cells adjacent to the specified cell
    # and the costs of transitioning to them.  Adjacent cells are
    # reached by performing substitute, delete, and insertion
    # operations.  A weighting method is only called for a neighbor
    # whose known cost is higher than the cost of the current cell.
    def outgoing(cell) # :yields: cell, cost
      # Substitute
      if cell.source < @source.length - 1 && cell.dest < @dest.length - 1
        next_cell = Alignment::Cell.new(cell.source + 1, cell.dest + 1)
        if @cost[next_cell] > @cost[cell]
          cost = substitute(@source[next_cell.source], @dest[next_cell.dest])
          yield next_cell, cost
        end
      end
      # Delete
      if cell.source < @source.length - 1
        next_cell = Alignment::Cell.new(cell.source + 1, cell.dest)
        if @cost[next_cell] > @cost[cell]
          cost = delete(@source[next_cell.source])
          yield next_cell, cost
        end
      end
      # Insert
      if cell.dest < @dest.length - 1
        next_cell = Alignment::Cell.new(cell.source, cell.dest + 1)
        if @cost[next_cell] > @cost[cell]
          cost = insert(@dest[next_cell.dest])
          yield next_cell, cost
        end
      end
    end

    # Create the priority queue used by the search.  By default
    # EditAlign uses the C extension version of the
    # priority_queue[http://rubyforge.org/projects/priority-queue/]
    # library.  If you wish to use a different priority queue
    # implementation you may overload this function in a derived
    # class.
    def priority_queue_factory
      require "priority_queue"
      PriorityQueue.new

      # Uncomment the following lines to use the pure-Ruby
      # implementation of priority_queue.
      # require "priority_queue/ruby_priority_queue"
      # RubyPriorityQueue.new
    end

    # Create a new search priority for the queue.  The priority must
    # define the <em><=></em> operator.  Cells with lower priority
    # value will be searched first.  If you wish to use a different
    # prioritization scheme you may overload this function in a
    # derived class.
    def priority_factory(cost, cell)
      SearchPriority.new(cost, cell, @end)
    end

    private :outgoing
    protected :find_lowest_cost_alignment, :priority_queue_factory
    protected :priority_factory
  end


  # This module employs an exhaustive search to find the lowest-cost
  # sequence of edit operations that will transform the source array
  # into the destination array.  It finds the lowest cost alignment by
  # filling in every cell in the costs table.
  #
  # This algorithm will return the same edit distance as Dijkstra's
  # algorithm though it is less efficient for nearly-aligned strings.
  # Nevertheless, this search algorithm is commonly cited when
  # describing alignments, and so is implemented here for the sake of
  # documentation and for comparison with the Dijkstra.
  module ExhaustiveSearch

    # An incoming cell and its associated cost.
    IncomingCost = Struct.new("IncomingCost", :cell, :cost)

    # Search the alignment grid filling in @cost and @backtrace.
    #
    # @source and @dest carry a leading nil placeholder, so the real
    # items live at indexes 1 through length-1.  Every cell other than
    # the start cell receives a backtrace link, including those in the
    # first row and column; without them an alignment that begins with
    # deletions or insertions cannot be traced back to the start.
    def find_lowest_cost_alignment
      @backtrace = {}

      # Fill in the top row of the table (deletions only).
      (1...@source.length).each do |source|
        previous = Alignment::Cell.new(source - 1, 0)
        current = Alignment::Cell.new(source, 0)
        @cost[current] = @cost[previous] + delete(@source[source])
        @backtrace[current] = previous
      end
      # Fill in the first column of the table (insertions only).
      (1...@dest.length).each do |dest|
        previous = Alignment::Cell.new(0, dest - 1)
        current = Alignment::Cell.new(0, dest)
        @cost[current] = @cost[previous] + insert(@dest[dest])
        @backtrace[current] = previous
      end
      # Fill in all the remaining cells in the table.
      (1...@source.length).each do |source|
        (1...@dest.length).each do |dest|
          incoming = []
          c = Alignment::Cell.new(source - 1, dest)
          incoming << IncomingCost.new(c, @cost[c] + delete(@source[source]))
          c = Alignment::Cell.new(source, dest - 1)
          incoming << IncomingCost.new(c, @cost[c] + insert(@dest[dest]))
          c = Alignment::Cell.new(source - 1, dest - 1)
          incoming << \
            IncomingCost.new(c, @cost[c] + substitute(@source[source],
                                                      @dest[dest]))
          best = incoming.min { |a, b| a.cost <=> b.cost }
          current = Alignment::Cell.new(source, dest)
          @cost[current] = best.cost
          @backtrace[current] = best.cell
        end
      end
    end

    protected :find_lowest_cost_alignment
  end


  # The Alignment class is given a source and destination array at
  # construction time.  It does a dynamic programming alignment
  # between them and makes the results of that alignment available
  # through instance methods.
  #
  # If there are multiple alignments with equal edit distances
  # Alignment will find one of them.  Which one is undefined.
  #
  # Alignment works by constructing a matrix with dimensions equal to
  # the length of the source and destination arrays.  Moving
  # horizontally and vertically in the matrix represents insertion and
  # deletion operations, respectively, while moving diagonally
  # represents substitution.  Each cell of the matrix contains the
  # minimum cost it takes to reach that cell.  The algorithm fills in
  # cells in the matrix until it reaches the furthest corner.
  #
  # The search is done using Dijkstra's algorithm as implemented in
  # the DijkstraSearch.  A different search algorithm may be specified
  # by including a mixin that redefines the
  # #find_lowest_cost_alignment function.
  #
  # This class uses Levenshtein weighting scheme.  Levenshtein assigns
  # a cost of 1 to insertions and deletions.  It assigns a cost of 1
  # to substitutions when the items are different and 0 when they are
  # the same.  Different weighting schemes may be specified by
  # overloading the #insert, #delete, and #substitute functions.  The
  # costs must be non-negative numbers.
  class Alignment
    include DijkstraSearch

    # A location in the alignment grid.
    Cell = Struct.new("Cell", :source, :dest)

    # The caller specifies a source and destination array.  The object
    # performs the alignment at construction time.
    #
    # Optionally either <em>source</em> or <em>dest</em> may be
    # strings, in which case they will be treated as arrays of
    # characters.  The arguments themselves are never modified.
    def initialize(source, dest)
      # Prepend empty elements to private copies of the source and
      # destination arrays to handle insertions and deletions at the
      # front of the alignment.
      @source = to_items(source).dup.unshift(nil)
      @dest = to_items(dest).dup.unshift(nil)

      # The start and end cells of the search.
      @start = Cell.new(0, 0)
      @end = Cell.new(@source.length - 1, @dest.length - 1)

      # The lowest known cost to reach a cell.  Unexplored cells have
      # an infinite cost.
      @cost = Hash.new { 1.0 / 0.0 }
      @cost[@start] = 0

      # Fill in the @cost matrix including @cost[@end] and create a
      # lowest-cost sequence of cells in @backtrace.
      find_lowest_cost_alignment
    end

    # The minimum edit distance
    def edit_distance
      @cost[@end]
    end

    # The lowest-cost list of edit operations.  This is a list of
    # <em>:substitute</em>, <em>:insert</em>, <em>:delete</em> symbols
    # for operations that changed an array element and
    # <em>marker</em> for operations that did not.
    def edit_operations(marker = nil)
      ops = []
      edit_sequence do |cell, operation|
        if operation == :substitute
          ops << \
            (@source[cell.source] == @dest[cell.dest] ? marker : :substitute)
        else
          ops << operation
        end
      end
      ops
    end

    # The source items with <em>marker</em> inserted where an
    # insertion took place.
    def source_alignment(marker = nil)
      source = []
      edit_sequence do |cell, operation|
        source << (operation == :insert ? marker : @source[cell.source])
      end
      source
    end

    # The destination items with <em>marker</em> inserted where a
    # deletion took place.
    def dest_alignment(marker = nil)
      dest = []
      edit_sequence do |cell, operation|
        dest << (operation == :delete ? marker : @dest[cell.dest])
      end
      dest
    end

    # Enumerate the minimum-cost sequence of edit operations.
    def edit_sequence # :yields: cell, {:substitute, :insert, :delete}
      # Walk forwards through the path.
      prev_cell = @start
      lowest_cost_path[1..-1].each do |cell|
        delta_source = cell.source - prev_cell.source
        delta_dest = cell.dest - prev_cell.dest
        if delta_source == 1 && delta_dest == 1
          yield cell, :substitute
        elsif delta_source == 1
          yield cell, :delete
        elsif delta_dest == 1
          yield cell, :insert
        else
          raise "Invalid path link #{prev_cell}->#{cell}"
        end
        prev_cell = cell
      end
    end

    # The cells of the lowest-cost path, first cell first.  The path is
    # built on the first call by following @backtrace from the end
    # cell until a cell without a predecessor is reached, and is then
    # cached in @path.
    def lowest_cost_path
      @path ||= begin
        cells = [@end]
        while (previous = @backtrace[cells.last])
          cells << previous
        end
        cells.reverse
      end
    end

    # Convert a String into an array of one-character strings; any
    # other sequence is returned as is.  Characters are rebuilt with
    # pack('U') because Integer#chr rejects code points above 255.
    def to_items(sequence)
      return sequence unless sequence.is_a?(String)
      sequence.unpack('U*').map { |code_point| [code_point].pack('U') }
    end

    # Interactive stringification
    def inspect
      "<Alignment: #{edit_distance}>"
    end

    # The cost of substituting <em>source_item</em> with
    # <em>dest_item</em>.
    def substitute(source_item, dest_item)
      source_item == dest_item ? 0 : 1
    end

    # The cost of deleting <em>source_item</em>.
    def delete(source_item)
      1
    end

    # The cost of inserting <em>dest_item</em>.
    def insert(dest_item)
      1
    end

    # The string representation of the alignment consists of four lines:
    #
    # 1. The source array
    # 2. The destination array
    # 3. An annotation line with S, I, D or nothing for aligned elements.
    # 4. The edit distance
    #
    # Items are converted with to_s, so arrays of non-string items can
    # be printed too.
    def to_s
      # Create the source and destination lines.
      s_line = source_alignment('-').map { |item| item.to_s }
      d_line = dest_alignment('-').map { |item| item.to_s }
      # Create short mnemonics for the edit operations.
      ops = edit_operations.map do |op|
        case op
        when nil
          " "
        when :substitute
          "S"
        when :insert
          "I"
        when :delete
          "D"
        end
      end
      # Find the longest element in all the lines.
      longest = (s_line + d_line + ops).map { |e| e.length }.max || 0
      # Center each array element over a field of that width.
      lines = [s_line, d_line, ops].map do |list|
        list.map { |c| c.center(longest) }.join
      end
      (lines + [edit_distance]).join("\n")
    end

    # This returns a grid of all the costs.  Cells that were not
    # visited because they could not contribute to the lowest-cost
    # path are marked with an asterisk.  Cells that are in the lowest
    # cost path are highlighted with square brackets.
    #
    #  irb(main):001:0> require 'editalign'
    #  => true
    #  irb(main):002:0> a = EditAlign::Alignment.new('captained', 'caspian')
    #  => <Alignment: 5>
    #  irb(main):003:0> puts a.to_grid
    #  - c a p t a i n e d
    #  - [0.00] 1.00 2.00 3.00 4.00 5.00 * * * *
    #  c 1.00 [0.00] 1.00 2.00 3.00 4.00 5.00 * * *
    #  a 2.00 1.00 [0.00] 1.00 2.00 3.00 4.00 5.00 * *
    #  s 3.00 2.00 [1.00] 1.00 2.00 3.00 4.00 5.00 * *
    #  p 4.00 3.00 2.00 [1.00] 2.00 3.00 4.00 5.00 * *
    #  i 5.00 4.00 3.00 2.00 [2.00] 3.00 3.00 4.00 5.00 *
    #  a * 5.00 4.00 3.00 3.00 [2.00] [3.00] 4.00 5.00 *
    #  n * * 5.00 4.00 4.00 3.00 3.00 [3.00] [4.00] [5.00]
    #  => nil
    def to_grid
      # Make the columns wide enough to accommodate the widest header
      # or value.
      widest_header = @source.find_all { |x| x }.map { |x| x.to_s.length }.max
      widest_cost = @cost.values.map { |c| sprintf("%.2f", c).length + 2 }.max
      # The conditional handles empty alignments.
      col_width = (widest_header && widest_cost) ? \
        [widest_header, widest_cost].max : 1

      # Create the header row.
      header = [""] + @source.map { |x| x ? x.to_s : "-" }
      header = header.map { |x| x.center(col_width) }

      # Make note of which cells are on the lowest-cost path.  The path
      # is computed here if no other method has needed it yet.
      path_cells = {}
      lowest_cost_path.each { |cell| path_cells[cell] = true }

      # Enumerate the destination, creating the cost rows.
      table = [header]
      (0..@dest.length - 1).each do |dest|
        x = @dest[dest]
        label = x ? x.to_s : "-"
        row = [sprintf("%-#{col_width}s", label)]
        (0..@source.length - 1).each do |source|
          cell = Cell.new(source, dest)
          c = @cost[cell]
          if c == 1.0 / 0.0
            # Center the * character in the column.
            value = "*".center(col_width)
          else
            # Put brackets around cells in the best path.
            value = sprintf(path_cells[cell] ? "[%.2f]" : " %.2f ", c)
            value = sprintf("%#{col_width}s", value)
          end
          row << value
        end
        table << row
      end
      # Combine the rows into a single string table.
      col_spc = " " * 4
      table.map { |row| row.join(col_spc) }.join("\n")
    end

    private :edit_sequence, :lowest_cost_path, :to_items
    protected :substitute, :delete, :insert
  end

  # The Levenshtein alignment yields a cost of 1 for insertions,
  # deletions, and item mismatch.  This class is a synonym for
  # Alignment.
  class LevenshteinAlignment < Alignment
  end

  # The Sellers alignment (aka the Needleman-Wunsch alignment) allows
  # different constant costs to be specified for insertion, deletion,
  # and substitute match and mismatch operations.
  class SellersAlignment < Alignment

    # The costs for insert, delete, and substitution match and
    # mismatch are specified here.  The default values for these costs
    # yield a Levenshtein alignment.
    def initialize(source, dest, match = 0, mismatch = 1,
                   insert = 1, delete = 1)
      # The costs must be in place before super runs the search.
      @match = match
      @mismatch = mismatch
      @insert = insert
      @delete = delete
      super(source, dest)
    end

    # The cost of substituting <em>source_item</em> with
    # <em>dest_item</em>.
    def substitute(source_item, dest_item)
      source_item == dest_item ? @match : @mismatch
    end

    # The cost of deleting <em>source_item</em>.
    def delete(source_item)
      @delete
    end

    # The cost of inserting <em>dest_item</em>.
    def insert(dest_item)
      @insert
    end

    protected :substitute, :delete, :insert
  end

  # The Wagner-Fischer alignment specifies the same cost for insertion
  # and deletion operations, another for item match, and another for
  # item mismatch.
  class WagnerFischerAlignment < SellersAlignment

    # The costs for insert/delete operations and character match and
    # mismatch are specified here.  The default values for these costs
    # yield a Levenshtein alignment.
    def initialize(source, dest, match = 0, mismatch = 1, insert_delete = 1)
      super(source, dest, match, mismatch, insert_delete, insert_delete)
    end
  end

end
@@ -0,0 +1,167 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #--
4
+ # Copyright 2006 William Patrick McNeill
5
+ #
6
+ # This file is part of Editalign.
7
+ #
8
+ # Editalign is free software; you can redistribute it and/or modify it
9
+ # under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # Editalign is distributed in the hope that it will be useful, but
14
+ # WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ # General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with editalign; if not, write to the Free Software Foundation,
20
+ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #
22
+ #++
23
+
24
+ # Test cases for the EditAlign module
25
+
26
+ require 'test/unit'
27
+ require 'editalign'
28
+
29
+
30
# The default Alignment applied to the strings "captained" and "caspian".
class LevenshteinStringAlignments < Test::Unit::TestCase
  # Checks the distance, the operation list, and both padded alignments.
  def test_captained_caspian
    alignment = EditAlign::Alignment.new("captained", "caspian")
    expected_operations = [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 5, alignment.edit_distance
    assert_equal expected_operations, alignment.edit_operations
    assert_equal expected_source, alignment.source_alignment
    assert_equal expected_dest, alignment.dest_alignment
  end
end
40
+
41
+
42
# SellersAlignment exercised with its default costs and with custom
# floating-point costs.
class SellersAlignments < Test::Unit::TestCase
  # Tolerance used when comparing an edit distance built from Float costs.
  FLOAT_TOLERANCE = 1e-9

  # Default costs: the expected distance is 5 with one insertion, one
  # substitution, and three deletions.
  def test_captained_caspian
    a = EditAlign::SellersAlignment.new("captained", "caspian")
    assert_kind_of EditAlign::Alignment, a
    assert_equal 5, a.edit_distance
    assert_equal [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete], a.edit_operations
    assert_equal ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"], a.source_alignment
    assert_equal ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil], a.dest_alignment
  end

  # Custom costs: match 0.1, mismatch 0.5, insert 0.8, delete 0.9.
  def test_captained_caspian_custom_weights
    a = EditAlign::SellersAlignment.new("captained", "caspian", 0.1, 0.5, 0.8, 0.9)
    assert_kind_of EditAlign::Alignment, a
    # Expected distance: 3 matches (0.1) + 4 mismatches (0.5) +
    # 2 deletions (0.9) = 4.1. The costs are Floats, so the accumulated
    # sum is compared within a tolerance instead of for exact equality.
    assert_in_delta 4.1, a.edit_distance, FLOAT_TOLERANCE
    assert_equal [nil, nil, :substitute, :substitute, :delete, nil, :substitute, :delete, :substitute], a.edit_operations
    assert_equal ["c", "a", "p", "t", "a", "i", "n", "e", "d"], a.source_alignment
    assert_equal ["c", "a", "s", "p", nil, "i", "a", nil, "n"], a.dest_alignment
  end
end
61
+
62
+
63
# WagnerFischerAlignment with its default costs, and with a mismatch
# cost of 20 for which the expected operations contain no :substitute.
class WagnerFischerAlignments < Test::Unit::TestCase
  # Default costs: same expectations as the plain Levenshtein case.
  def test_captained_caspian
    alignment = EditAlign::WagnerFischerAlignment.new("captained", "caspian")
    expected_operations = [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 5, alignment.edit_distance
    assert_equal expected_operations, alignment.edit_operations
    assert_equal expected_source, alignment.source_alignment
    assert_equal expected_dest, alignment.dest_alignment
  end

  # Match 0, mismatch 20, insert/delete 1.
  def test_captained_caspian_high_substitution_cost
    alignment = EditAlign::WagnerFischerAlignment.new("captained", "caspian", 0, 20, 1)
    expected_operations = [nil, nil, :insert, nil, :delete, :delete, nil, :insert, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", nil, "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", nil, nil, "i", "a", "n", nil, nil]

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 6, alignment.edit_distance
    assert_equal expected_operations, alignment.edit_operations
    assert_equal expected_source, alignment.source_alignment
    assert_equal expected_dest, alignment.dest_alignment
  end
end
82
+
83
+
84
# Alternative ways of calling Alignment: array arguments instead of
# strings, and an explicit marker argument to the accessor methods.
class ParameterOptions < Test::Unit::TestCase
  # Arrays of one-character strings give the same result as the strings.
  def test_array_init
    source_items = %w[c a p t a i n e d]
    dest_items = %w[c a s p i a n]
    alignment = EditAlign::Alignment.new(source_items, dest_items)

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 5, alignment.edit_distance
    assert_equal [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete],
                 alignment.edit_operations
    assert_equal ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"],
                 alignment.source_alignment
    assert_equal ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil],
                 alignment.dest_alignment
  end

  # The marker passed to each accessor appears where nil appears by default.
  def test_marker
    alignment = EditAlign::Alignment.new("captained", "caspian")

    marked_operations = alignment.edit_operations("X")
    assert_equal ["X", "X", :insert, "X", :substitute, "X", :delete, "X", :delete, :delete],
                 marked_operations

    marked_source = alignment.source_alignment('X')
    assert_equal ["c", "a", "X", "p", "t", "a", "i", "n", "e", "d"], marked_source

    marked_dest = alignment.dest_alignment('X')
    assert_equal ["c", "a", "s", "p", "i", "a", "X", "n", "X", "X"], marked_dest
  end
end
101
+
102
+
103
# Edge cases: one or both inputs empty, and identical inputs.
class BoundaryConditions < Test::Unit::TestCase
  # Two empty arrays: distance 0 and empty results everywhere.
  def test_empty_alignment
    alignment = EditAlign::Alignment.new([], [])

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 0, alignment.edit_distance
    assert_equal [], alignment.source_alignment
    assert_equal [], alignment.dest_alignment
    assert_equal [], alignment.edit_operations
  end

  # Empty destination: every source item is deleted.
  def test_nonempty_empty_alignment
    alignment = EditAlign::Alignment.new(%w[A B C], [])

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 3, alignment.edit_distance
    assert_equal [:delete] * 3, alignment.edit_operations
    assert_equal %w[A B C], alignment.source_alignment
    assert_equal [nil] * 3, alignment.dest_alignment
  end

  # Empty source: every destination item is inserted.
  def test_empty_nonempty_alignment
    alignment = EditAlign::Alignment.new([], %w[A B C])

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 3, alignment.edit_distance
    assert_equal [:insert] * 3, alignment.edit_operations
    assert_equal [nil] * 3, alignment.source_alignment
    assert_equal %w[A B C], alignment.dest_alignment
  end

  # Identical inputs: distance 0 and no operations.
  def test_identical
    alignment = EditAlign::Alignment.new(%w[A B C], %w[A B C])

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 0, alignment.edit_distance
    assert_equal [nil] * 3, alignment.edit_operations
    assert_equal %w[A B C], alignment.source_alignment
    assert_equal %w[A B C], alignment.dest_alignment
  end
end
140
+
141
# Checks Alignment#to_s for "captained" -> "caspian" against a four-line
# expected string: the two aligned strings, a line of operation markers,
# and the edit distance.
# NOTE(review): this rendering appears to have collapsed whitespace, so
# the marker line ("I S D DD") may be missing leading spaces -- confirm
# against the real file before editing the literal.
+ class Stringification < Test::Unit::TestCase
142
+ def test_captained_caspian
143
+ a = EditAlign::Alignment.new("captained", "caspian")
144
+ s = \
145
+ "ca-ptained
146
+ caspia-n--
147
+ I S D DD
148
+ 5"
149
+ assert_equal s, a.to_s
150
+ end
151
+ end
152
+
153
# Cross-checks the default Alignment against an alignment that mixes in
# EditAlign::ExhaustiveSearch.
class CompareSearchAlgorithm < Test::Unit::TestCase

  # A SellersAlignment (default costs) with ExhaustiveSearch mixed in.
  class ExhaustiveAlignment < EditAlign::SellersAlignment
    include EditAlign::ExhaustiveSearch
  end

  # Both alignments must agree on every reported result.
  def test_dijkstra_exhaustive
    default_alignment = EditAlign::Alignment.new("captained", "caspian")
    exhaustive_alignment = ExhaustiveAlignment.new("captained", "caspian")

    compared = [:edit_distance, :edit_operations, :source_alignment, :dest_alignment]
    compared.each do |reader|
      assert_equal default_alignment.send(reader), exhaustive_alignment.send(reader)
    end
  end
end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: editalign
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2006-05-30 00:00:00 -07:00
8
+ summary: Edit alignments between arrays
9
+ require_paths:
10
+ - lib
11
+ email: billmcn@u.washington.edu
12
+ homepage: http://staff.washington.edu/billmcn/index.shtml
13
+ rubyforge_project:
14
+ description: This module performs edit alignments between arrays. It returns alignments and edit distances.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - W.P. McNeill
30
+ files:
31
+ - test/test_editalign.rb
32
+ - lib/editalign.rb
33
+ - examples/align-strings
34
+ - examples/stress-test
35
+ - README
36
+ test_files:
37
+ - test/test_editalign.rb
38
+ rdoc_options:
39
+ - --title
40
+ - EditAlign -- Ruby Edit Alignment
41
+ - --main
42
+ - README
43
+ - --line-numbers
44
+ - --inline-source
45
+ extra_rdoc_files:
46
+ - README
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ requirements: []
52
+
53
+ dependencies:
54
+ - !ruby/object:Gem::Dependency
55
+ name: PriorityQueue
56
+ version_requirement:
57
+ version_requirements: !ruby/object:Gem::Version::Requirement
58
+ requirements:
59
+ - - ">"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.0.0
62
+ version: