editalign 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,65 @@
1
+ = Edit Alignment Module
2
+
3
+ The module implements a dynamic programming string alignment algorithm
4
+ that produces both alignments and edit distances between source and
5
+ destination arrays or strings.
6
+
7
+ An alignment between the arrays is represented as a sequence of
8
+ insert, delete, and substitute operations that transform individual
9
+ source items into destination items. For example, the following is an
10
+ alignment between the letters in the words 'captained' and 'caspian':
11
+
12
+ c a - p t a i n e d
13
+ c a s p i a - n - -
14
+
15
+ Here an 's' was inserted, an 'i' was substituted for a 't', and an
16
+ 'i', an 'e', and a 'd' were deleted.
17
+
18
+ The module exports an Alignment class which assigns numeric costs to
19
+ each of these operations and finds the alignment that incurs the
20
+ minimum cost or edit distance. The Alignment class uses Dijkstra's
21
+ algorithm to efficiently find edit distances for partially-aligned
22
+ arrays. See the EditAlign::DijkstraSearch class for details.
23
+
24
+ irb(main):001:0> require 'editalign'
25
+ => true
26
+ irb(main):002:0> a = EditAlign::Alignment.new('captained', 'caspian')
27
+ => <Alignment: 5>
28
+ irb(main):003:0> a.edit_distance
29
+ => 5
30
+ irb(main):004:0> a.edit_operations
31
+ => [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
32
+ irb(main):005:0> a.source_alignment
33
+ => ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
34
+ irb(main):006:0> a.dest_alignment
35
+ => ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]
36
+ irb(main):007:0> puts a
37
+ ca-ptained
38
+ caspia-n--
39
+   I S D DD
40
+ 5
41
+ => nil
42
+
43
+ = History
44
+
45
+ * 1-0-0 ... First version
46
+
47
+ = See Also
48
+
49
+ * The Levenshtein[http://po-ru.com/projects/levenshtein/] module computes a Levenshtein edit distance using an exhaustive search.
50
+
51
+ = Acknowledgments
52
+
53
+ Thanks to Jeremy G. Kahn for suggesting a diagonal-hugging search
54
+ strategy and optimizations to the one implemented here.
55
+
56
+ = Copyright
57
+
58
+ Copyright 2006, William Patrick McNeill
59
+
60
+ This program is distributed under the GNU General Public License.
61
+
62
+ = Author
63
+
64
+ W.P. McNeill mailto:billmcn@u.washington.edu
65
+
@@ -0,0 +1,72 @@
1
+ #!/bin/env ruby
2
+
3
+ #--
4
+ # Copyright 2006 William Patrick McNeill
5
+ #
6
+ # This file is part of Editalign.
7
+ #
8
+ # Editalign is free software; you can redistribute it and/or modify it
9
+ # under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # Editalign is distributed in the hope that it will be useful, but
14
+ # WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ # General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with editalign; if not, write to the Free Software Foundation,
20
+ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #
22
+ #++
23
+
24
+ # Print the character alignment between two strings passed in on the
25
+ # command line.
26
+
27
+ require 'getoptlong'
28
+ require 'editalign'
29
+
30
+ class ExhaustiveSellersAlignment < EditAlign::SellersAlignment
31
+ include EditAlign::ExhaustiveSearch
32
+ end
33
+
34
+ # Process command line options.
35
+ opts = GetoptLong.new(["--match", "-m", GetoptLong::REQUIRED_ARGUMENT],
36
+ ["--nomatch", "-n", GetoptLong::REQUIRED_ARGUMENT],
37
+ ["--insert", "-i",GetoptLong::REQUIRED_ARGUMENT],
38
+ ["--delete", "-d", GetoptLong::REQUIRED_ARGUMENT],
39
+ ["--exhaustive", "-e", GetoptLong::NO_ARGUMENT]
40
+ )
41
+
42
+ match = 0
43
+ mismatch = 1
44
+ insert = 1
45
+ delete = 1
46
+ exhaustive = false
47
+ opts.each do |opt, arg|
48
+ case opt
49
+ when "--match"
50
+ match = arg.to_f
51
+ when "--nomatch"
52
+ mismatch = arg.to_f
53
+ when "--insert"
54
+ insert = arg.to_f
55
+ when "--delete"
56
+ delete = arg.to_f
57
+ when "--exhaustive"
58
+ exhaustive = true
59
+ end
60
+ end
61
+
62
+ source = ARGV[0]
63
+ dest = ARGV[1]
64
+
65
+ # Do alignments and print results.
66
+ alignments = [EditAlign::SellersAlignment]
67
+ alignments << ExhaustiveSellersAlignment if exhaustive
68
+
69
+ alignments.each do |align_class|
70
+ a = align_class.new(source, dest, match, mismatch, insert, delete)
71
+ puts a, a.to_grid
72
+ end
@@ -0,0 +1,165 @@
1
+ #!/bin/env ruby
2
+
3
+ #--
4
+ # Copyright 2006 William Patrick McNeill
5
+ #
6
+ # This file is part of Editalign.
7
+ #
8
+ # Editalign is free software; you can redistribute it and/or modify it
9
+ # under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # Editalign is distributed in the hope that it will be useful, but
14
+ # WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ # General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with editalign; if not, write to the Free Software Foundation,
20
+ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #
22
+ #++
23
+
24
+ require "getoptlong"
25
+ require "editalign"
26
+
27
+ class PureRubyEditAlign < EditAlign::Alignment
28
+ def priority_queue_factory
29
+ require "priority_queue/ruby_priority_queue"
30
+ RubyPriorityQueue.new
31
+ end
32
+ end
33
+
34
+ class IntegerPriorityEditAlign < EditAlign::Alignment
35
+ def priority_factory(cost, cell)
36
+ cost
37
+ end
38
+ end
39
+
40
+
41
+ # Process this script's command line options.
42
+ #
43
+ # There are three required arguments.
44
+ #
45
+ # * The number of alignments to perform
46
+ # * The length of the strings to align
47
+ # * The number of edits
48
+ #
49
+ # There are three optional switches.
50
+ #
51
+ # * verbose ... prints strings and edit distances
52
+ # * pure-ruby ... uses the pure Ruby priority queue instead of the C extension
53
+ # * int-cost ... uses integer costs
54
+ def parse_command_line
55
+ opts = GetoptLong.new(["--verbose", "-v", GetoptLong::NO_ARGUMENT],
56
+ ["--pure-ruby", "-p", GetoptLong::NO_ARGUMENT],
57
+ ["--int-cost", "-i", GetoptLong::NO_ARGUMENT])
58
+ verbose = false
59
+ pure_ruby = false
60
+ int_cost = false
61
+ opts.each do |opt, arg|
62
+ case opt
63
+ when "--verbose"
64
+ verbose = true
65
+ when "--pure-ruby"
66
+ pure_ruby = true
67
+ when "--int-cost"
68
+ int_cost = true
69
+ end
70
+ end
71
+
72
+ trials = Integer(ARGV[0])
73
+ length = Integer(ARGV[1])
74
+ edits = Integer(ARGV[2])
75
+
76
+ if pure_ruby
77
+ klass = PureRubyEditAlign
78
+ elsif int_cost
79
+ klass = IntegerPriorityEditAlign
80
+ else
81
+ klass = EditAlign::Alignment
82
+ end
83
+
84
+ [trials, length, edits, klass, verbose]
85
+ end
86
+
87
+ # Run a number of alignments of the same size.
88
+ def run_stress_test(trials, length, edits, klass, verbose)
89
+ (1..trials).each do |i|
90
+ puts "Trial #{i}" if verbose
91
+
92
+ # Generate a destination string with random differences from the
93
+ # source string.
94
+ source, dest = create_strings(length, edits)
95
+
96
+ # Align the source string with its randomly edited copy.
97
+ alignment = klass.new(source, dest)
98
+ print "#{alignment}\n\n" if verbose
99
+ end
100
+ end
101
+
102
# Create two unaligned sequences for the stress test.
#
# <em>length</em> is the number of items in the source and <em>edits</em>
# is the number of source positions to alter.  <em>edits</em> is capped at
# <em>length</em> because each position receives at most one edit.
#
# Returns a two-element array <em>[source, dest]</em>.  <em>source</em>
# is an array of single lowercase letters (the alphabet repeated as
# needed) and <em>dest</em> is a string built from it by random
# substitutions, insertions and deletions drawn from an uppercase
# alphabet.
def create_strings(length, edits)
  # The alphabet used for inserts and substitutions.  Range#collect
  # without a block returns an Enumerator on Ruby 1.9+, which supports
  # neither #length nor #[], so materialize an Array explicitly.
  alphabet = ('A'..'Z').to_a

  # Source contains a repeating lowercase alphabet length characters
  # long.
  last = length - 1
  source = (0..last).map {|x| (97 + x % 26).chr}

  # There can only be as many edits as there are characters.
  edits = length if edits > length

  # Create a roughly even number of substitutions, inserts, and
  # deletes.
  n_subs = 0
  n_ins = 0
  n_dels = 0
  (1..edits).each do
    case rand(3)
    when 0
      n_subs += 1
    when 1
      n_ins += 1
    when 2
      n_dels += 1
    end
  end

  # Distribute edit operations randomly throughout the string.  Each
  # chosen position is removed from the pool so it is edited only once.
  unchanged_pos = (0..last).to_a
  edit_op = [nil] * length

  [:substitute, :insert, :delete].zip([n_subs, n_ins, n_dels]) do |op, n|
    (1..n).each do
      i = rand(unchanged_pos.length)
      edit_op[unchanged_pos[i]] = op
      unchanged_pos.delete_at(i)
    end
  end

  # Use the random edit operations to create a destination string.
  dest = ''
  edit_op.each_index do |i|
    case edit_op[i]
    when nil
      dest += source[i]
    when :substitute
      dest += alphabet[rand(alphabet.length)]
    when :insert
      # The new character goes in front of the untouched source item.
      dest += alphabet[rand(alphabet.length)] + source[i]
    when :delete
      # Do nothing
    end
  end

  [source, dest]
end
160
+
161
+
162
+ if __FILE__ == $0
163
+ trials, length, edits, klass, verbose = parse_command_line
164
+ run_stress_test(trials, length, edits, klass, verbose)
165
+ end
data/lib/editalign.rb ADDED
@@ -0,0 +1,516 @@
1
+ # Copyright 2006 William Patrick McNeill
2
+ #
3
+ # Editalign is free software; you can redistribute it and/or modify it
4
+ # under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation; either version 2 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # Editalign is distributed in the hope that it will be useful, but
9
+ # WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with editalign; if not, write to the Free Software Foundation,
15
+ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ # EditAlign is the namespace that contains all edit alignment
18
+ # functions.
19
+ module EditAlign
20
+
21
+ # This module employs Dijkstra's algorithm to find the lowest-cost
22
+ # sequence of edit operations that will transform the source array
23
+ # into the destination array. The alignment grid is treated as a
24
+ # directed acyclic graph where each cell in the grid is a vertex.
25
+ # Edges in the graph correspond to substitution, deletion and
26
+ # insertion operations. The edge weights come from the weighting
27
+ # methods #substitute, #insert, and #delete.
28
+ #
29
+ # Generally speaking, the strategy is to search the diagonal of the
30
+ # alignment grid before the corners. For partially-aligned arrays,
31
+ # this strategy can result in fewer calls to the weighting
32
+ # functions.
33
+ module DijkstraSearch
34
+
35
+ # Cells to be searched in the Dijkstra priority queue are ordered
36
+ # by SearchPriority. SearchPriority orders cells first by cost,
37
+ # and then if the costs are equal by smallest number of hops to
38
+ # the end cell. All things being equal, the latter comparison
39
+ # makes the algorithm search cells near the diagonal first. This
40
+ # can help in instances where the beginnings of the arrays are
41
+ # unaligned but the ends are aligned.
42
+ class SearchPriority
43
+
44
+ # The cost to reach the current cell
45
+ attr_reader :cost
46
+
47
+ # The minimum path length to the final cell
48
+ attr_reader :dist
49
+
50
+ include Comparable
51
+
52
+ # Specify the cost to reach a cell, the cell, and the final
53
+ # cell.
54
+ def initialize(cost, cell, end_cell)
55
+ @cost = cost
56
+ @dist = [end_cell.source - cell.source, end_cell.dest - cell.dest].max
57
+ end
58
+
59
+ # Order by cost and then by distance to the end cell.
60
+ def <=>(other)
61
+ comp = (cost <=> other.cost)
62
+ comp = (dist <=> other.dist) if comp == 0
63
+ comp
64
+ end
65
+
66
+ # Interactive stringification
67
+ def inspect
68
+ "Pri(#{cost}, #{dist})"
69
+ end
70
+ end
71
+
72
+ # Search the alignment grid filling in <em>@cost</em> and
72
+ # <em>@backtrace</em>. Cells are taken from the agenda in priority order; the search stops once the cheapest queued cell costs at least as much as the best known cost of <em>@end</em>, and cells that cannot beat that cost are never queued.
74
+ def find_lowest_cost_alignment
75
+ agenda = priority_queue_factory
76
+
77
+ agenda[@start] = priority_factory(0, @start)
78
+ @backtrace = {}
79
+
80
+ until agenda.empty?
81
+ cell, priority = agenda.delete_min # priority itself is not used below
82
+ cost = @cost[cell]
83
+ break if cost >= @cost[@end] # nothing left in the agenda can improve on the end cell
84
+ outgoing(cell) do |next_cell, next_cost|
85
+ next_cost += @cost[cell] # edge cost plus the cost of reaching cell
86
+ next unless next_cost < @cost[next_cell] # keep only strict improvements
87
+ @cost[next_cell] = next_cost
88
+ @backtrace[next_cell] = cell # remember the predecessor on the cheaper route
89
+ agenda[next_cell] = priority_factory(next_cost, next_cell) \
90
+ unless next_cost >= @cost[@end]
91
+ end
92
+ end
93
+ end
94
+
95
+ # An enumeration of all the cells adjacent to the specified cell
96
+ # and the costs of transitioning to them. Adjacent cells are
97
+ # reached by performing substitute, delete, and insertion
98
+ # operations.
99
+ def outgoing(cell) # :yields: cell, cost
100
+ # Substitute
101
+ if cell.source < @source.length-1 and cell.dest < @dest.length-1
102
+ next_cell = Alignment::Cell.new(cell.source+1, cell.dest+1)
103
+ if @cost[next_cell] > @cost[cell]
104
+ cost = substitute(@source[next_cell.source], @dest[next_cell.dest])
105
+ yield next_cell, cost
106
+ end
107
+ end
108
+ # Delete
109
+ if cell.source < @source.length-1
110
+ next_cell = Alignment::Cell.new(cell.source+1, cell.dest)
111
+ if @cost[next_cell] > @cost[cell]
112
+ cost = delete(@source[next_cell.source])
113
+ yield next_cell, cost
114
+ end
115
+ end
116
+ # Insert
117
+ if cell.dest < @dest.length-1
118
+ next_cell = Alignment::Cell.new(cell.source, cell.dest+1)
119
+ if @cost[next_cell] > @cost[cell]
120
+ cost = insert(@dest[next_cell.dest])
121
+ yield next_cell, cost
122
+ end
123
+ end
124
+ end
125
+
126
+ # Create the priority queue used by the search. By default
127
+ # EditAlign uses the C extension version of the
128
+ # priority_queue[http://rubyforge.org/projects/priority-queue/]
129
+ # library. If you wish to use a different priority queue
130
+ # implementation you may overload this function in a derived
131
+ # class.
132
+ def priority_queue_factory
133
+ require "priority_queue"
134
+ PriorityQueue.new
135
+
136
+ # Uncomment the following lines to use the pure-Ruby
137
+ # implementation of priority_queue.
138
+ # require "priority_queue/ruby_priority_queue"
139
+ # RubyPriorityQueue.new
140
+ end
141
+
142
+ # Create a new search priority for the queue. The priority must
143
+ # define the <em><=></em> operator. Cells with lower priority
144
+ # value will be searched first. If you wish to use a different
145
+ # prioritization scheme you may overload this function in a
146
+ # derived class.
147
+ def priority_factory(cost, cell)
148
+ SearchPriority.new(cost, cell, @end)
149
+ end
150
+
151
+ private :outgoing
152
+ protected :find_lowest_cost_alignment, :priority_queue_factory
153
+ protected :priority_factory
154
+ end
155
+
156
+
157
+ # This module employs an exhaustive search to find the lowest-cost
158
+ # sequence of edit operations that will transform the source array
159
+ # into the destination array. It finds the lowest cost alignment by
160
+ # filling in every cell in the costs table.
161
+ #
162
+ # This algorithm will return the same results as Dijkstra's
163
+ # algorithm though it is less efficient for nearly-aligned strings.
164
+ # Nevertheless, this search algorithm is commonly cited when
165
+ # describing alignments, and so is implemented here for the sake of
166
+ # documentation and for comparison with the Dijkstra search.
167
module ExhaustiveSearch

  # An incoming cell and its associated cost.
  IncomingCost = Struct.new("IncomingCost", :cell, :cost)

  # Search the alignment grid filling in @cost and @backtrace.
  #
  # The including class must provide @source and @dest (each with a
  # leading nil placeholder at index 0), a @cost table that already holds
  # the cost of the start cell, and the #insert, #delete and #substitute
  # weighting methods.
  #
  # Because of the leading placeholder the real items live at indices
  # 1..length-1, so every range below excludes the array length itself.
  # Every cell other than the start cell receives a @backtrace entry,
  # including the cells of the first row and column, so that a path which
  # runs along a border can be followed all the way back to the start.
  def find_lowest_cost_alignment
    @backtrace = {}

    # Fill in the top row of the table (deletions only).
    (1...@source.length).each do |source|
      prev = Alignment::Cell.new(source-1, 0)
      cell = Alignment::Cell.new(source, 0)
      @cost[cell] = @cost[prev] + delete(@source[source])
      @backtrace[cell] = prev
    end
    # Fill in the first column of the table (insertions only).
    (1...@dest.length).each do |dest|
      prev = Alignment::Cell.new(0, dest-1)
      cell = Alignment::Cell.new(0, dest)
      @cost[cell] = @cost[prev] + insert(@dest[dest])
      @backtrace[cell] = prev
    end
    # Fill in all the remaining cells in the table.
    (1...@source.length).each do |source|
      (1...@dest.length).each do |dest|
        # Candidates are listed as delete, insert, substitute; on a tie
        # the earliest candidate in this list wins.
        incoming = []
        c = Alignment::Cell.new(source-1, dest)
        incoming << IncomingCost.new(c, @cost[c] + delete(@source[source]))
        c = Alignment::Cell.new(source, dest-1)
        incoming << IncomingCost.new(c, @cost[c] + insert(@dest[dest]))
        c = Alignment::Cell.new(source-1, dest-1)
        incoming << IncomingCost.new(c, @cost[c] + substitute(@source[source],
                                                              @dest[dest]))
        best = incoming.min {|a,b| a.cost <=> b.cost}
        cell = Alignment::Cell.new(source, dest)
        @cost[cell] = best.cost
        @backtrace[cell] = best.cell
      end
    end
  end

  protected :find_lowest_cost_alignment
end
207
+
208
+
209
+ # The Alignment class is given a source and destination array at
210
+ # construction time. It does a dynamic programming alignment
211
+ # between them and makes the results of that alignment available
212
+ # through instance methods.
213
+ #
214
+ # If there are multiple alignments with equal edit distances
215
+ # Alignment will find one of them. Which one is undefined.
216
+ #
217
+ # Alignment works by constructing a matrix with dimensions equal to
218
+ # the length of the source and destination arrays. Moving
219
+ # horizontally and vertically in the matrix represents insertion and
220
+ # deletion operations, respectively, while moving diagonally
221
+ # represents substitution. Each cell of the matrix contains the
222
+ # minimum cost it takes to reach that cell. The algorithm fills in
223
+ # cells in the matrix until it reaches the furthest corner.
224
+ #
225
+ # The search is done using Dijkstra's algorithm as implemented in
226
+ # the DijkstraSearch module. A different search algorithm may be specified
227
+ # by including a mixin that redefines the
228
+ # #find_lowest_cost_alignment function.
229
+ #
230
+ # This class uses the Levenshtein weighting scheme. Levenshtein assigns
231
+ # a cost of 1 to insertions and deletions. It assigns a cost of 1
232
+ # to substitutions when the items are different and 0 when they are
233
+ # the same. Different weighting schemes may be specified by
234
+ # overloading the #insert, #delete, and #substitute functions. The
235
+ # costs must be non-negative numbers.
236
+ class Alignment
237
+ include DijkstraSearch
238
+
239
+ # A location in the alignment grid.
240
+ Cell = Struct.new("Cell", :source, :dest)
241
+
242
# The caller specifies a source and destination array.  The object
# performs the alignment at construction time.
#
# Optionally either <em>source</em> or <em>dest</em> may be strings, in
# which case they will be treated as arrays of characters.
#
# Neither argument is modified.
def initialize(source, dest)
  # Convert strings into arrays of single-character strings.  Each code
  # point is re-packed with the 'U' directive rather than converted with
  # Integer#chr, which raises RangeError for code points above 255 on
  # Ruby 1.9 and later.
  source = source.unpack('U*').collect {|c| [c].pack('U')} if source.is_a?(String)
  dest = dest.unpack('U*').collect {|c| [c].pack('U')} if dest.is_a?(String)

  # Put an empty element in front of the source and destination items to
  # handle insertions and deletions at the front of the alignment.  New
  # arrays are built so that the caller's arrays are left untouched.
  @source = [nil] + source.to_a
  @dest = [nil] + dest.to_a

  # The start and end cells of the search.
  @start = Cell.new(0, 0)
  @end = Cell.new(@source.length-1, @dest.length-1)

  # The lowest known cost to reach a cell.  Unexplored cells have
  # an infinite cost.
  @cost = Hash.new{1.0/0.0}
  @cost[@start] = 0

  # Fill in the @cost matrix including @cost[@end] and create a
  # lowest-cost sequence of cells in @backtrace.
  find_lowest_cost_alignment
end
270
+
271
+ # The minimum edit distance
272
+ def edit_distance
273
+ @cost[@end]
274
+ end
275
+
276
+ # The lowest-cost list of edit operations. This is a list of
277
+ # <em>:substitute</em>, <em>:insert</em>, <em>:delete</em> symbols
278
+ # for operations that changed an array element and
279
+ # <em>marker</em> for operations that did not.
280
+ def edit_operations(marker = nil)
281
+ ops = []
282
+ edit_sequence do |cell, operation|
283
+ if operation == :substitute
284
+ ops << \
285
+ (@source[cell.source] == @dest[cell.dest] ? marker: :substitute)
286
+ else
287
+ ops << operation
288
+ end
289
+ end
290
+ ops
291
+ end
292
+
293
+ # The source items with <em>marker</em> inserted where an
294
+ # insertion took place.
295
+ def source_alignment(marker = nil)
296
+ source = []
297
+ edit_sequence do |cell, operation|
298
+ source << (operation == :insert ? marker:@source[cell.source])
299
+ end
300
+ source
301
+ end
302
+
303
+ # The destination items with <em>marker</em> inserted where a
304
+ # deletion took place.
305
+ def dest_alignment(marker = nil)
306
+ dest = []
307
+ edit_sequence do |cell, operation|
308
+ dest << (operation == :delete ? marker:@dest[cell.dest])
309
+ end
310
+ dest
311
+ end
312
+
313
+ # Enumerate the minimum-cost sequence of edit operations. Each cell on the path after @start is yielded with the operation that reached it; a diagonal step is always reported as :substitute, whether or not the two items match.
314
+ def edit_sequence # :yields: cell, {:substitute, :insert, :delete}
315
+ # The first time this function is called, walk backwards through
316
+ # the backtrace to create the @path instance variable.
317
+ if not @path
318
+ @path = [@end]
319
+ while cell = @backtrace[@path[0]] # stops at the first cell with no backtrace entry
320
+ @path.unshift(cell)
321
+ end
322
+ end
323
+ # Walk forwards through the path. The first path entry is skipped and the walk starts from @start.
324
+ prev_cell = @start
325
+ @path[1..-1].each do |cell|
326
+ delta_source = cell.source - prev_cell.source
327
+ delta_dest = cell.dest - prev_cell.dest
328
+ if delta_source == 1 and delta_dest == 1
329
+ yield cell, :substitute
330
+ elsif delta_source == 1
331
+ yield cell, :delete
332
+ elsif delta_dest == 1
333
+ yield cell, :insert
334
+ else
335
+ raise "Invalid path link #{prev_cell}->#{cell}"
336
+ end
337
+ prev_cell = cell
338
+ end
339
+ end
340
+
341
+ # Interactive stringification
342
+ def inspect
343
+ "<Alignment: #{edit_distance}>"
344
+ end
345
+
346
+ # The cost of substituting <em>source_item</em> with
347
+ # <em>dest_item</em>.
348
+ def substitute(source_item, dest_item)
349
+ source_item == dest_item ? 0:1
350
+ end
351
+
352
+ # The cost of deleting <em>source_item</em>.
353
+ def delete(source_item)
354
+ 1
355
+ end
356
+
357
+ # The cost of inserting <em>dest_item</em>.
358
+ def insert(dest_item)
359
+ 1
360
+ end
361
+
362
# The string representation of the alignment consists of four lines:
#
# 1. The source array
# 2. The destination array
# 3. An annotation line with S, I, D or nothing for aligned elements.
# 4. The edit distance
#
# Items are converted with #to_s before they are measured and padded, so
# arrays of non-string items (integers, for example) can be printed too.
# Gaps are shown as '-'.
def to_s
  # Create the source and destination lines.
  s_line = source_alignment('-').map {|item| item.to_s}
  d_line = dest_alignment('-').map {|item| item.to_s}
  # Create short mnemonics for the edit operations.
  ops = edit_operations.map do |op|
    case op
    when nil
      " "
    when :substitute
      "S"
    when :insert
      "I"
    when :delete
      "D"
    end
  end
  rows = [s_line, d_line, ops]
  # Find the longest element in all the lines.  This is nil for an empty
  # alignment, in which case there is nothing to pad.
  longest = rows.flatten.map {|e| e.length}.max
  # Center each array element over a field of that width.
  lines = rows.map do |list|
    list.map {|c| c.center(longest)}.join
  end
  (lines + [edit_distance]).join("\n")
end
393
+
394
# This returns a grid of all the costs as a string.  Source items label
# the columns and destination items label the rows.  Cells that were not
# visited because they could not contribute to the lowest-cost path are
# marked with an asterisk.  Cells that are in the lowest cost path are
# highlighted with square brackets.
#
# The example below is shown with the column padding collapsed.
#
#   irb(main):001:0> require 'editalign'
#   => true
#   irb(main):002:0> a = EditAlign::Alignment.new('captained', 'caspian')
#   => <Alignment: 5>
#   irb(main):003:0> puts a.to_grid
#   - c a p t a i n e d
#   - [0.00] 1.00 2.00 3.00 4.00 5.00 * * * *
#   c 1.00 [0.00] 1.00 2.00 3.00 4.00 5.00 * * *
#   a 2.00 1.00 [0.00] 1.00 2.00 3.00 4.00 5.00 * *
#   s 3.00 2.00 [1.00] 1.00 2.00 3.00 4.00 5.00 * *
#   p 4.00 3.00 2.00 [1.00] 2.00 3.00 4.00 5.00 * *
#   i 5.00 4.00 3.00 2.00 [2.00] 3.00 3.00 4.00 5.00 *
#   a * 5.00 4.00 3.00 3.00 [2.00] [3.00] 4.00 5.00 *
#   n * * 5.00 4.00 4.00 3.00 3.00 [3.00] [4.00] [5.00]
#   => nil
def to_grid
  # @path is built lazily by edit_sequence.  Force it here so that the
  # best path is bracketed even when no other method has been called yet.
  edit_sequence {|cell, operation| } unless @path

  # Make the columns wide enough to accommodate the widest header
  # or value.  Headers are measured through #to_s so that non-string
  # items are supported.
  widest_header = @source.find_all {|x| x}.map {|x| x.to_s.length}.max
  widest_cost = @cost.values.map {|c| (sprintf "%.2f", c).length + 2}.max
  # The conditional handles empty alignments.
  col_width = (widest_header and widest_cost) ?
    [widest_header, widest_cost].max : 1

  # Create the header row.  The leading placeholder is shown as "-".
  header = [""] + @source.map {|x| x ? x.to_s : "-"}
  header = header.map {|x| x.center(col_width)}

  # Make note of which cells are on the lowest-cost path.
  path_cells = {}
  @path.each {|cell| path_cells[cell] = true} if @path

  # Enumerate the destination, creating the cost rows.
  table = [header]
  (0..@dest.length-1).each do |dest|
    x = @dest[dest]
    x = "-" if not x
    row = [sprintf("%-#{col_width}s", x)]
    (0..@source.length-1).each do |source|
      cell = Cell.new(source, dest)
      c = @cost[cell]
      if c == 1.0/0.0
        # Center the * character in the column.
        value = "*".center(col_width)
      else
        # Put brackets around cells in the best path.
        value = sprintf(path_cells[cell] ? "[%.2f]" : " %.2f ", c)
        value = sprintf("%#{col_width}s", value)
      end
      row << value
    end
    table << row
  end
  # Combine the rows into a single string table.
  col_spc = " " * 4
  table.map {|row| row.join(col_spc)}.join("\n")
end
456
+
457
+ private :edit_sequence
458
+ protected :substitute, :delete, :insert
459
+ end
460
+
461
+ # The Levenshtein alignment yields a cost of 1 for insertions,
462
+ # deletions, and item mismatch. This class is a synonym for
463
+ # Alignment.
464
+ class LevenshteinAlignment < Alignment
465
+ end
466
+
467
+ # The Sellers alignment (aka the Needleman-Wunsch alignment) allows
468
+ # different constant costs to be specified for insertion, deletion,
469
+ # and substitute match and mismatch operations.
470
+ class SellersAlignment < Alignment
471
+
472
+ # The costs for insert, delete, and substitution match and
473
+ # mismatch are specified here. The default values for these costs
474
+ # yield a Levenshtein alignment.
475
+ def initialize(source, dest, match = 0, mismatch = 1,
476
+ insert = 1, delete = 1)
477
+ @match = match
478
+ @mismatch = mismatch
479
+ @insert = insert
480
+ @delete = delete
481
+ super(source, dest)
482
+ end
483
+
484
+ # The cost of substituting <em>source_item</em> with
485
+ # <em>dest_item</em>.
486
+ def substitute(source_item, dest_item)
487
+ source_item == dest_item ? @match:@mismatch
488
+ end
489
+
490
+ # The cost of deleting <em>source_item</em>.
491
+ def delete(source_item)
492
+ @delete
493
+ end
494
+
495
+ # The cost of inserting <em>dest_item</em>.
496
+ def insert(dest_item)
497
+ @insert
498
+ end
499
+
500
+ protected :substitute, :delete, :insert
501
+ end
502
+
503
+ # The Wagner-Fischer alignment specifies the same cost for insertion
504
+ # and deletion operations, another for item match, and another for
505
+ # item mismatch.
506
+ class WagnerFischerAlignment < SellersAlignment
507
+
508
+ # The costs for insert/delete operations and character match and
509
+ # mismatch are specified here. The default values for these costs
510
+ # yield a Levenshtein alignment.
511
+ def initialize(source, dest, match = 0, mismatch = 1, insert_delete = 1)
512
+ super(source, dest, match, mismatch, insert_delete, insert_delete)
513
+ end
514
+ end
515
+
516
+ end
@@ -0,0 +1,167 @@
1
+ #!/bin/env ruby
2
+
3
+ #--
4
+ # Copyright 2006 William Patrick McNeill
5
+ #
6
+ # This file is part of Editalign.
7
+ #
8
+ # Editalign is free software; you can redistribute it and/or modify it
9
+ # under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # Editalign is distributed in the hope that it will be useful, but
14
+ # WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ # General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with editalign; if not, write to the Free Software Foundation,
20
+ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #
22
+ #++
23
+
24
+ # Test cases for the EditAlign module
25
+
26
+ require 'test/unit'
27
+ require 'editalign'
28
+
29
+
30
# Alignment of two strings with the default costs of EditAlign::Alignment.
class LevenshteinStringAlignments < Test::Unit::TestCase
  # "captained" -> "caspian" is expected to take one insert, one
  # substitute and three deletes, for an edit distance of 5.
  def test_captained_caspian
    alignment = EditAlign::Alignment.new("captained", "caspian")
    expected_operations = [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 5, alignment.edit_distance
    assert_equal expected_operations, alignment.edit_operations
    assert_equal expected_source, alignment.source_alignment
    assert_equal expected_dest, alignment.dest_alignment
  end
end
40
+
41
+
42
# Alignments produced by EditAlign::SellersAlignment, first with its
# default costs and then with caller-supplied fractional costs.
class SellersAlignments < Test::Unit::TestCase
  # Largest absolute error tolerated when a floating-point edit distance
  # is compared with its expected value.
  FLOAT_TOLERANCE = 1e-9

  # With no costs given, the result matches the default Alignment:
  # distance 5 with one insert, one substitute and three deletes.
  def test_captained_caspian
    a = EditAlign::SellersAlignment.new("captained", "caspian")
    assert_kind_of EditAlign::Alignment, a
    assert_equal 5, a.edit_distance
    assert_equal [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete], a.edit_operations
    assert_equal ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"], a.source_alignment
    assert_equal ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil], a.dest_alignment
  end

  # Costs: match 0.1, mismatch 0.5, insert 0.8, delete 0.9. The expected
  # operations below contain three nil entries (matches), four
  # substitutes and two deletes: 3 * 0.1 + 4 * 0.5 + 2 * 0.9 = 4.1.
  def test_captained_caspian_custom_weights
    a = EditAlign::SellersAlignment.new("captained", "caspian", 0.1, 0.5, 0.8, 0.9)
    assert_kind_of EditAlign::Alignment, a
    # The distance is a sum of binary floating-point fractions, whose
    # rounding depends on the order of the additions, so it is compared
    # within a tolerance instead of for exact equality.
    assert_in_delta 4.1, a.edit_distance, FLOAT_TOLERANCE
    assert_equal [nil, nil, :substitute, :substitute, :delete, nil, :substitute, :delete, :substitute], a.edit_operations
    assert_equal ["c", "a", "p", "t", "a", "i", "n", "e", "d"], a.source_alignment
    assert_equal ["c", "a", "s", "p", nil, "i", "a", nil, "n"], a.dest_alignment
  end
end
61
+
62
+
63
# Alignments produced by EditAlign::WagnerFischerAlignment.
class WagnerFischerAlignments < Test::Unit::TestCase
  # With no costs given, the result matches the default Alignment:
  # distance 5 with one insert, one substitute and three deletes.
  def test_captained_caspian
    alignment = EditAlign::WagnerFischerAlignment.new("captained", "caspian")
    expected_operations = [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 5, alignment.edit_distance
    assert_equal expected_operations, alignment.edit_operations
    assert_equal expected_source, alignment.source_alignment
    assert_equal expected_dest, alignment.dest_alignment
  end

  # Costs: match 0, mismatch 20, insert/delete 1. The expected alignment
  # contains no :substitute at all: two inserts and four deletes give a
  # distance of 6.
  def test_captained_caspian_high_substitution_cost
    alignment = EditAlign::WagnerFischerAlignment.new("captained", "caspian", 0, 20, 1)
    expected_operations = [nil, nil, :insert, nil, :delete, :delete, nil, :insert, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", nil, "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", nil, nil, "i", "a", "n", nil, nil]

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 6, alignment.edit_distance
    assert_equal expected_operations, alignment.edit_operations
    assert_equal expected_source, alignment.source_alignment
    assert_equal expected_dest, alignment.dest_alignment
  end
end
82
+
83
+
84
# Argument variations: array input to Alignment.new and the optional
# marker argument of the alignment accessors.
class ParameterOptions < Test::Unit::TestCase
  # Arrays of one-character strings are expected to align exactly like
  # the strings "captained" and "caspian".
  def test_array_init
    source = %w[c a p t a i n e d]
    dest = %w[c a s p i a n]
    alignment = EditAlign::Alignment.new(source, dest)

    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 5, alignment.edit_distance
    assert_equal [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete], alignment.edit_operations
    assert_equal ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"], alignment.source_alignment
    assert_equal ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil], alignment.dest_alignment
  end

  # A marker passed to edit_operations, source_alignment or
  # dest_alignment is expected in every position that holds nil when no
  # marker is given.
  def test_marker
    alignment = EditAlign::Alignment.new("captained", "caspian")
    marked_operations = ["X", "X", :insert, "X", :substitute, "X", :delete, "X", :delete, :delete]
    marked_source = ["c", "a", "X", "p", "t", "a", "i", "n", "e", "d"]
    marked_dest = ["c", "a", "s", "p", "i", "a", "X", "n", "X", "X"]

    assert_equal marked_operations, alignment.edit_operations("X")
    assert_equal marked_source, alignment.source_alignment('X')
    assert_equal marked_dest, alignment.dest_alignment('X')
  end
end
101
+
102
+
103
# Degenerate inputs: empty arrays on either or both sides, and two
# identical arrays.
class BoundaryConditions < Test::Unit::TestCase
  # Two empty arrays: distance 0 and every alignment array empty.
  def test_empty_alignment
    alignment = EditAlign::Alignment.new([], [])
    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 0, alignment.edit_distance
    assert_equal [], alignment.source_alignment
    assert_equal [], alignment.dest_alignment
    assert_equal [], alignment.edit_operations
  end

  # Empty destination: every source item is deleted.
  def test_nonempty_empty_alignment
    alignment = EditAlign::Alignment.new(%w[A B C], [])
    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 3, alignment.edit_distance
    assert_equal Array.new(3, :delete), alignment.edit_operations
    assert_equal %w[A B C], alignment.source_alignment
    assert_equal Array.new(3), alignment.dest_alignment
  end

  # Empty source: every destination item is inserted.
  def test_empty_nonempty_alignment
    alignment = EditAlign::Alignment.new([], %w[A B C])
    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 3, alignment.edit_distance
    assert_equal Array.new(3, :insert), alignment.edit_operations
    assert_equal Array.new(3), alignment.source_alignment
    assert_equal %w[A B C], alignment.dest_alignment
  end

  # Equal source and destination: distance 0 and no edit operations.
  def test_identical
    alignment = EditAlign::Alignment.new(%w[A B C], %w[A B C])
    assert_kind_of EditAlign::Alignment, alignment
    assert_equal 0, alignment.edit_distance
    assert_equal Array.new(3), alignment.edit_operations
    assert_equal %w[A B C], alignment.source_alignment
    assert_equal %w[A B C], alignment.dest_alignment
  end
end
140
+
141
# Checks the string form of an alignment (Alignment#to_s).
+ class Stringification < Test::Unit::TestCase
142
# Compares a.to_s for "captained" -> "caspian" with a four-line literal:
# the source row, the destination row, a row of operation letters, and
# the number 5 (presumably the edit distance -- confirm against to_s).
# NOTE(review): whitespace inside the literal is significant to
# assert_equal; verify it against the real file, since this rendering
# may have collapsed leading spaces.
+ def test_captained_caspian
143
+ a = EditAlign::Alignment.new("captained", "caspian")
144
+ s = \
145
+ "ca-ptained
146
+ caspia-n--
147
+ I S D DD
148
+ 5"
149
+ assert_equal s, a.to_s
150
+ end
151
+ end
152
+
153
# Cross-checks two ways of computing the same alignment.
class CompareSearchAlgorithm < Test::Unit::TestCase

  # A SellersAlignment with EditAlign::ExhaustiveSearch mixed in.
  # NOTE(review): presumably the mixin replaces the default search
  # strategy -- confirm against the ExhaustiveSearch module.
  class ExhaustiveAlignment < EditAlign::SellersAlignment
    include EditAlign::ExhaustiveSearch
  end

  # The default Alignment and the ExhaustiveAlignment must agree on the
  # distance, the operations and both aligned sequences.
  def test_dijkstra_exhaustive
    dijkstra = EditAlign::Alignment.new("captained", "caspian")
    exhaustive = ExhaustiveAlignment.new("captained", "caspian")

    assert_equal dijkstra.edit_distance, exhaustive.edit_distance
    assert_equal dijkstra.edit_operations, exhaustive.edit_operations
    assert_equal dijkstra.source_alignment, exhaustive.source_alignment
    assert_equal dijkstra.dest_alignment, exhaustive.dest_alignment
  end
end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: editalign
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2006-05-30 00:00:00 -07:00
8
+ summary: Edit alignments between arrays
9
+ require_paths:
10
+ - lib
11
+ email: billmcn@u.washington.edu
12
+ homepage: http://staff.washington.edu/billmcn/index.shtml
13
+ rubyforge_project:
14
+ description: This module performs edit alignments between arrays. It returns alignments and edit distances.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - W.P. McNeill
30
+ files:
31
+ - test/test_editalign.rb
32
+ - lib/editalign.rb
33
+ - examples/align-strings
34
+ - examples/stress-test
35
+ - README
36
+ test_files:
37
+ - test/test_editalign.rb
38
+ rdoc_options:
39
+ - --title
40
+ - EditAlign -- Ruby Edit Alignment
41
+ - --main
42
+ - README
43
+ - --line-numbers
44
+ - --inline-source
45
+ extra_rdoc_files:
46
+ - README
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ requirements: []
52
+
53
+ dependencies:
54
+ - !ruby/object:Gem::Dependency
55
+ name: PriorityQueue
56
+ version_requirement:
57
+ version_requirements: !ruby/object:Gem::Version::Requirement
58
+ requirements:
59
+ - - ">"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.0.0
62
+ version: