editalign 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +65 -0
- data/examples/align-strings +72 -0
- data/examples/stress-test +165 -0
- data/lib/editalign.rb +516 -0
- data/test/test_editalign.rb +167 -0
- metadata +62 -0
data/README
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
= Edit Alignment Module
|
2
|
+
|
3
|
+
The module implements a dynamic programming string alignment algorithm
|
4
|
+
that produces both alignments and edit distances between source and
|
5
|
+
destination arrays or strings.
|
6
|
+
|
7
|
+
An alignment between the arrays is represented as a sequence of
|
8
|
+
insert, delete, and substitute operations that transform individual
|
9
|
+
source items into destination items. For example, the following is an
|
10
|
+
alignment between the letters in the words 'captained' and 'caspian':
|
11
|
+
|
12
|
+
c a - p t a i n e d
|
13
|
+
c a s p i a - n - -
|
14
|
+
|
15
|
+
Here an 's' was inserted, an 'i' was substituted for a 't', and an
|
16
|
+
'i', an 'e', and a 'd' were deleted.
|
17
|
+
|
18
|
+
The module exports an Alignment class which assigns numeric costs to
|
19
|
+
each of these operations and finds the alignment that incurs the
|
20
|
+
minimum cost or edit distance. The Alignment class uses Dijkstra's
|
21
|
+
algorithm to efficiently find edit distances for partially-aligned
|
22
|
+
arrays. See the EditAlign::DijkstraSearch module for details.
|
23
|
+
|
24
|
+
irb(main):001:0> require 'editalign'
|
25
|
+
=> true
|
26
|
+
irb(main):002:0> a = EditAlign::Alignment.new('captained', 'caspian')
|
27
|
+
=> <Alignment: 5>
|
28
|
+
irb(main):003:0> a.edit_distance
|
29
|
+
=> 5
|
30
|
+
irb(main):004:0> a.edit_operations
|
31
|
+
=> [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
|
32
|
+
irb(main):005:0> a.source_alignment
|
33
|
+
=> ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
|
34
|
+
irb(main):006:0> a.dest_alignment
|
35
|
+
=> ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]
|
36
|
+
irb(main):007:0> puts a
|
37
|
+
ca-ptained
|
38
|
+
caspia-n--
|
39
|
+
  I S D DD
|
40
|
+
5
|
41
|
+
=> nil
|
42
|
+
|
43
|
+
= History
|
44
|
+
|
45
|
+
* 1-0-0 ... First version
|
46
|
+
|
47
|
+
= See Also
|
48
|
+
|
49
|
+
* The Levenshtein[http://po-ru.com/projects/levenshtein/] module computes a Levenshtein edit distance using an exhaustive search.
|
50
|
+
|
51
|
+
= Acknowledgments
|
52
|
+
|
53
|
+
Thanks to Jeremy G. Kahn for suggesting a diagonal-hugging search
|
54
|
+
strategy and optimizations to the one implemented here.
|
55
|
+
|
56
|
+
= Copyright
|
57
|
+
|
58
|
+
Copyright 2006, William Patrick McNeill
|
59
|
+
|
60
|
+
This program is distributed under the GNU General Public License.
|
61
|
+
|
62
|
+
= Author
|
63
|
+
|
64
|
+
W.P. McNeill mailto:billmcn@u.washington.edu
|
65
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Editalign.
|
7
|
+
#
|
8
|
+
# Editalign is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Editalign is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with editalign; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
# Print the character alignment between two strings passed in on the
|
25
|
+
# command line.
|
26
|
+
|
27
|
+
require 'getoptlong'
|
28
|
+
require 'editalign'
|
29
|
+
|
30
|
+
# A SellersAlignment subclass that mixes in EditAlign::ExhaustiveSearch.
# It is added to the list of alignment classes to run when the
# --exhaustive switch is given.
class ExhaustiveSellersAlignment < EditAlign::SellersAlignment
|
31
|
+
include EditAlign::ExhaustiveSearch
|
32
|
+
end
|
33
|
+
|
34
|
+
# Convert the argument of a cost option into a Float, exiting with a
# readable message when it is not numeric.
def parse_cost(option, value)
  Float(value)
rescue ArgumentError
  abort "#{option} requires a numeric cost, got #{value.inspect}"
end

# Process command line options.
opts = GetoptLong.new(
  ["--match", "-m", GetoptLong::REQUIRED_ARGUMENT],
  ["--nomatch", "-n", GetoptLong::REQUIRED_ARGUMENT],
  ["--insert", "-i", GetoptLong::REQUIRED_ARGUMENT],
  ["--delete", "-d", GetoptLong::REQUIRED_ARGUMENT],
  ["--exhaustive", "-e", GetoptLong::NO_ARGUMENT]
)

# Default costs yield a Levenshtein alignment.
match = 0
mismatch = 1
insert = 1
delete = 1
exhaustive = false
opts.each do |opt, arg|
  case opt
  when "--match"
    match = parse_cost(opt, arg)
  when "--nomatch"
    mismatch = parse_cost(opt, arg)
  when "--insert"
    insert = parse_cost(opt, arg)
  when "--delete"
    delete = parse_cost(opt, arg)
  when "--exhaustive"
    exhaustive = true
  end
end

# GetoptLong has removed the switches from ARGV, so what remains must
# be the two strings to align.
if ARGV.length < 2
  abort "Usage: #{File.basename($0)} [--match N] [--nomatch N] " \
        "[--insert N] [--delete N] [--exhaustive] source dest"
end
source = ARGV[0]
dest = ARGV[1]

# Do alignments and print results.  The Dijkstra-based alignment is
# always run; the exhaustive one is added on request for comparison.
alignments = [EditAlign::SellersAlignment]
alignments << ExhaustiveSellersAlignment if exhaustive

alignments.each do |align_class|
  a = align_class.new(source, dest, match, mismatch, insert, delete)
  puts a, a.to_grid
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Editalign.
|
7
|
+
#
|
8
|
+
# Editalign is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Editalign is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with editalign; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
require "getoptlong"
|
25
|
+
require "editalign"
|
26
|
+
|
27
|
+
# An Alignment whose priority_queue_factory loads
# priority_queue/ruby_priority_queue and returns a RubyPriorityQueue.
# Selected by the --pure-ruby switch.
class PureRubyEditAlign < EditAlign::Alignment
|
28
|
+
def priority_queue_factory
|
29
|
+
require "priority_queue/ruby_priority_queue"
|
30
|
+
RubyPriorityQueue.new
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
# An Alignment whose priority_factory returns the bare cost and ignores
# the cell.  Selected by the --int-cost switch.
class IntegerPriorityEditAlign < EditAlign::Alignment
|
35
|
+
def priority_factory(cost, cell)
|
36
|
+
cost
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
# Process this script's command line options.
#
# There are three required arguments.
#
# * The number of alignments to perform
# * The length of the strings to align
# * The number of edits
#
# There are three optional switches.
#
# * verbose ... prints strings and edit distances
# * pure-ruby ... uses the pure Ruby priority queue instead of the C extension
# * int-cost ... uses integer costs
#
# When both pure-ruby and int-cost are given, pure-ruby wins.
#
# Returns [trials, length, edits, klass, verbose].  Exits with a usage
# message when fewer than three positional arguments are supplied;
# Integer() raises ArgumentError for a non-numeric argument.
def parse_command_line
  opts = GetoptLong.new(["--verbose", "-v", GetoptLong::NO_ARGUMENT],
                        ["--pure-ruby", "-p", GetoptLong::NO_ARGUMENT],
                        ["--int-cost", "-i", GetoptLong::NO_ARGUMENT])
  verbose = false
  pure_ruby = false
  int_cost = false
  opts.each do |opt, _arg|
    case opt
    when "--verbose"
      verbose = true
    when "--pure-ruby"
      pure_ruby = true
    when "--int-cost"
      int_cost = true
    end
  end

  # GetoptLong has removed the switches from ARGV; the three numbers
  # must be what is left.
  if ARGV.length < 3
    abort "Usage: #{File.basename($0)} [--verbose] [--pure-ruby] " \
          "[--int-cost] trials length edits"
  end

  trials = Integer(ARGV[0])
  length = Integer(ARGV[1])
  edits = Integer(ARGV[2])

  klass = if pure_ruby
            PureRubyEditAlign
          elsif int_cost
            IntegerPriorityEditAlign
          else
            EditAlign::Alignment
          end

  [trials, length, edits, klass, verbose]
end
|
86
|
+
|
87
|
+
# Run a number of alignments of the same size.
#
# For each of the +trials+ rounds a fresh source/destination pair is
# produced by create_strings(length, edits) and aligned by
# instantiating +klass+.  With +verbose+ set, the trial number and the
# alignment are printed.
def run_stress_test(trials, length, edits, klass, verbose)
  (1..trials).each do |trial|
    puts "Trial #{trial}" if verbose

    # A source string and a randomly edited copy of it.
    pair = create_strings(length, edits)

    # The alignment is computed when the object is constructed.
    result = klass.new(pair[0], pair[1])
    next unless verbose

    print "#{result}\n\n"
  end
end
|
101
|
+
|
102
|
+
# Create two unaligned strings.
#
# +length+ is the number of items in the source and +edits+ the number
# of positions to alter (capped at +length+).  Each edit is randomly a
# substitution, an insertion, or a deletion, applied at a distinct
# random position.
#
# Returns [source, dest], where +source+ is an Array of one-character
# lowercase strings and +dest+ is a String.
def create_strings(length, edits)
  # The alphabet used for inserts and substitutions.  Range#collect
  # without a block returns an Enumerator on Ruby 1.9+, so build the
  # Array explicitly.
  alphabet = ('A'..'Z').to_a

  # Source contains a repeating lowercase alphabet length characters
  # long.
  last = length - 1
  source = (0..last).map { |x| (97 + x % 26).chr }

  # There can only be as many edits as there are characters.
  edits = length if edits > length

  # Create a roughly even number of substitutions, inserts, and
  # deletes.
  n_subs = 0
  n_ins = 0
  n_dels = 0
  (1..edits).each do
    case rand(3)
    when 0
      n_subs += 1
    when 1
      n_ins += 1
    when 2
      n_dels += 1
    end
  end

  # Distribute edit operations randomly throughout the string.  Each
  # chosen position is removed from the pool so it is edited only once.
  unchanged_pos = (0..last).to_a
  edit_op = [nil] * length

  [:substitute, :insert, :delete].zip([n_subs, n_ins, n_dels]) do |op, n|
    (1..n).each do
      i = rand(unchanged_pos.length)
      edit_op[unchanged_pos.delete_at(i)] = op
    end
  end

  # Use the random edit operations to create a destination string.
  pieces = edit_op.each_with_index.map do |op, i|
    case op
    when nil
      source[i]
    when :substitute
      alphabet[rand(alphabet.length)]
    when :insert
      alphabet[rand(alphabet.length)] + source[i]
    when :delete
      ''
    end
  end

  [source, pieces.join]
end
|
160
|
+
|
161
|
+
|
162
|
+
# Run the stress test only when this file is executed directly.
if __FILE__ == $0
|
163
|
+
trials, length, edits, klass, verbose = parse_command_line
|
164
|
+
run_stress_test(trials, length, edits, klass, verbose)
|
165
|
+
end
|
data/lib/editalign.rb
ADDED
@@ -0,0 +1,516 @@
|
|
1
|
+
# Copyright 2006 William Patrick McNeill
|
2
|
+
#
|
3
|
+
# Editalign is free software; you can redistribute it and/or modify it
|
4
|
+
# under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation; either version 2 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# Editalign is distributed in the hope that it will be useful, but
|
9
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with editalign; if not, write to the Free Software Foundation,
|
15
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
# EditAlign is the namespace that contains all edit alignment
# functions.
module EditAlign

  # This module employs Dijkstra's algorithm to find the lowest-cost
  # sequence of edit operations that will transform the source array
  # into the destination array.  The alignment grid is treated as a
  # directed acyclic graph where each cell in the grid is a vertex.
  # Edges in the graph correspond to substitution, deletion and
  # insertion operations.  The edge weights come from the weighting
  # methods #substitute, #insert, and #delete.
  #
  # Generally speaking, the strategy is to search the diagonal of the
  # alignment grid before the corners.  For partially-aligned arrays,
  # this strategy can result in fewer calls to the weighting
  # functions.
  module DijkstraSearch

    # Cells to be searched in the Dijkstra priority queue are ordered
    # by SearchPriority.  SearchPriority orders cells first by cost,
    # and then if the costs are equal by smallest number of hops to
    # the end cell.  All things being equal, the latter comparison
    # makes the algorithm search cells near the diagonal first.  This
    # can help in instances where the beginnings of the arrays are
    # unaligned but the ends are aligned.
    class SearchPriority
      include Comparable

      # The cost to reach the current cell
      attr_reader :cost

      # The minimum path length to the final cell
      attr_reader :dist

      # Specify the cost to reach a cell, the cell, and the final
      # cell.
      def initialize(cost, cell, end_cell)
        @cost = cost
        # A diagonal step advances both coordinates, so the larger
        # remaining coordinate is the fewest hops left.
        @dist = [end_cell.source - cell.source, end_cell.dest - cell.dest].max
      end

      # Order by cost and then by distance to the end cell.
      def <=>(other)
        comp = (cost <=> other.cost)
        comp = (dist <=> other.dist) if comp == 0
        comp
      end

      # Interactive stringification
      def inspect
        "Pri(#{cost}, #{dist})"
      end
    end

    # Search the alignment grid filling in <em>@cost</em> and
    # <em>@backtrace</em>.
    def find_lowest_cost_alignment
      agenda = priority_queue_factory

      agenda[@start] = priority_factory(0, @start)
      @backtrace = {}

      until agenda.empty?
        cell, _priority = agenda.delete_min
        cost = @cost[cell]
        # Cells come off the agenda in order of increasing cost, so
        # nothing left can improve on the cost already found for @end.
        break if cost >= @cost[@end]
        outgoing(cell) do |next_cell, next_cost|
          next_cost += @cost[cell]
          next unless next_cost < @cost[next_cell]
          @cost[next_cell] = next_cost
          @backtrace[next_cell] = cell
          # Only queue cells that could still lead to a cheaper path.
          if next_cost < @cost[@end]
            agenda[next_cell] = priority_factory(next_cost, next_cell)
          end
        end
      end
    end

    # An enumeration of all the cells adjacent to the specified cell
    # and the costs of transitioning to them.  Adjacent cells are
    # reached by performing substitute, delete, and insertion
    # operations.  A weighting method is only called when the adjacent
    # cell's known cost is higher than this cell's, i.e. when the move
    # could possibly improve it.
    def outgoing(cell) # :yields: cell, cost
      # Substitute
      if cell.source < @source.length - 1 && cell.dest < @dest.length - 1
        next_cell = Alignment::Cell.new(cell.source + 1, cell.dest + 1)
        if @cost[next_cell] > @cost[cell]
          cost = substitute(@source[next_cell.source], @dest[next_cell.dest])
          yield next_cell, cost
        end
      end
      # Delete
      if cell.source < @source.length - 1
        next_cell = Alignment::Cell.new(cell.source + 1, cell.dest)
        if @cost[next_cell] > @cost[cell]
          cost = delete(@source[next_cell.source])
          yield next_cell, cost
        end
      end
      # Insert
      if cell.dest < @dest.length - 1
        next_cell = Alignment::Cell.new(cell.source, cell.dest + 1)
        if @cost[next_cell] > @cost[cell]
          cost = insert(@dest[next_cell.dest])
          yield next_cell, cost
        end
      end
    end

    # Create the priority queue used by the search.  By default
    # EditAlign uses the C extension version of the
    # priority_queue[http://rubyforge.org/projects/priority-queue/]
    # library.  If you wish to use a different priority queue
    # implementation you may overload this function in a derived
    # class.  The queue must support <em>[]=</em>, <em>empty?</em> and
    # <em>delete_min</em>.
    def priority_queue_factory
      # Loaded lazily so that a derived class which overrides this
      # method does not need the library installed.
      require "priority_queue"
      PriorityQueue.new

      # Uncomment the following lines to use the pure-Ruby
      # implementation of priority_queue.
      # require "priority_queue/ruby_priority_queue"
      # RubyPriorityQueue.new
    end

    # Create a new search priority for the queue.  The priority must
    # define the <em><=></em> operator.  Cells with lower priority
    # value will be searched first.  If you wish to use a different
    # prioritization scheme you may overload this function in a
    # derived class.
    def priority_factory(cost, cell)
      SearchPriority.new(cost, cell, @end)
    end

    private :outgoing
    protected :find_lowest_cost_alignment, :priority_queue_factory
    protected :priority_factory
  end


  # This module employs an exhaustive search to find the lowest-cost
  # sequence of edit operations that will transform the source array
  # into the destination array.  It finds the lowest cost alignment by
  # filling in every cell in the costs table.
  #
  # This algorithm will return the same results as Dijkstra's
  # algorithm though it is less efficient for nearly-aligned strings.
  # Nevertheless, this search algorithm is commonly cited when
  # describing alignments, and so is implemented here for the sake of
  # documentation and for comparison with the Dijkstra search.
  module ExhaustiveSearch

    # An incoming cell and its associated cost.
    IncomingCost = Struct.new("IncomingCost", :cell, :cost)

    # Search the alignment grid filling in @cost and @backtrace.
    def find_lowest_cost_alignment
      @backtrace = {}

      # @source and @dest carry a leading nil, so the last real index
      # is length - 1.
      last_source = @source.length - 1
      last_dest = @dest.length - 1

      # Fill in the top row of the table.  These cells can only be
      # reached by deletions; record the backtrace so that paths
      # running along the border lead back to the start cell.
      (1..last_source).each do |source|
        previous = Alignment::Cell.new(source - 1, 0)
        cell = Alignment::Cell.new(source, 0)
        @cost[cell] = @cost[previous] + delete(@source[source])
        @backtrace[cell] = previous
      end
      # Fill in the first column of the table.  These cells can only
      # be reached by insertions.
      (1..last_dest).each do |dest|
        previous = Alignment::Cell.new(0, dest - 1)
        cell = Alignment::Cell.new(0, dest)
        @cost[cell] = @cost[previous] + insert(@dest[dest])
        @backtrace[cell] = previous
      end
      # Fill in all the remaining cells in the table.
      (1..last_source).each do |source|
        (1..last_dest).each do |dest|
          incoming = []
          c = Alignment::Cell.new(source - 1, dest)
          incoming << IncomingCost.new(c, @cost[c] + delete(@source[source]))
          c = Alignment::Cell.new(source, dest - 1)
          incoming << IncomingCost.new(c, @cost[c] + insert(@dest[dest]))
          c = Alignment::Cell.new(source - 1, dest - 1)
          incoming << IncomingCost.new(
            c, @cost[c] + substitute(@source[source], @dest[dest])
          )
          best = incoming.min { |a, b| a.cost <=> b.cost }
          cell = Alignment::Cell.new(source, dest)
          @cost[cell] = best.cost
          @backtrace[cell] = best.cell
        end
      end
    end

    protected :find_lowest_cost_alignment
  end


  # The Alignment class is given a source and destination array at
  # construction time.  It does a dynamic programming alignment
  # between them and makes the results of that alignment available
  # through instance methods.
  #
  # If there are multiple alignments with equal edit distances
  # Alignment will find one of them.  Which one is undefined.
  #
  # Alignment works by constructing a matrix with dimensions equal to
  # the length of the source and destination arrays.  Moving
  # horizontally and vertically in the matrix represents insertion and
  # deletion operations, respectively, while moving diagonally
  # represents substitution.  Each cell of the matrix contains the
  # minimum cost it takes to reach that cell.  The algorithm fills in
  # cells in the matrix until it reaches the furthest corner.
  #
  # The search is done using Dijkstra's algorithm as implemented in
  # the DijkstraSearch.  A different search algorithm may be specified
  # by including a mixin that redefines the
  # #find_lowest_cost_alignment function.
  #
  # This class uses the Levenshtein weighting scheme.  Levenshtein
  # assigns a cost of 1 to insertions and deletions.  It assigns a cost
  # of 1 to substitutions when the items are different and 0 when they
  # are the same.  Different weighting schemes may be specified by
  # overloading the #insert, #delete, and #substitute functions.  The
  # costs must be non-negative numbers.
  class Alignment
    include DijkstraSearch

    # A location in the alignment grid.
    Cell = Struct.new("Cell", :source, :dest)

    # One-character annotations used by #to_s for each edit operation.
    OPERATION_MARKS = {
      nil => " ", :substitute => "S", :insert => "I", :delete => "D"
    }.freeze

    # The caller specifies a source and destination array.  The object
    # performs the alignment at construction time.  The arrays passed
    # in are not modified.
    #
    # Optionally either <em>source</em> or <em>dest</em> may be
    # strings, in which case they will be treated as arrays of
    # characters.
    def initialize(source, dest)
      # Prepend empty elements to copies of the source and destination
      # arrays to handle insertions and deletions at the front of the
      # alignment.
      @source = [nil] + items_of(source)
      @dest = [nil] + items_of(dest)

      # The start and end cells of the search.
      @start = Cell.new(0, 0)
      @end = Cell.new(@source.length - 1, @dest.length - 1)

      # The lowest known cost to reach a cell.  Unexplored cells have
      # an infinite cost.
      @cost = Hash.new { 1.0 / 0.0 }
      @cost[@start] = 0

      # Fill in the @cost matrix including @cost[@end] and create a
      # lowest-cost sequence of cells in @backtrace.
      find_lowest_cost_alignment
    end

    # The minimum edit distance
    def edit_distance
      @cost[@end]
    end

    # The lowest-cost list of edit operations.  This is a list of
    # <em>:substitute</em>, <em>:insert</em>, <em>:delete</em> symbols
    # for operations that changed an array element and
    # <em>marker</em> for operations that did not.
    def edit_operations(marker = nil)
      ops = []
      edit_sequence do |cell, operation|
        if operation == :substitute
          # A diagonal step over equal items is a match, not an edit.
          matched = @source[cell.source] == @dest[cell.dest]
          ops << (matched ? marker : :substitute)
        else
          ops << operation
        end
      end
      ops
    end

    # The source items with <em>marker</em> inserted where an
    # insertion took place.
    def source_alignment(marker = nil)
      source = []
      edit_sequence do |cell, operation|
        source << (operation == :insert ? marker : @source[cell.source])
      end
      source
    end

    # The destination items with <em>marker</em> inserted where a
    # deletion took place.
    def dest_alignment(marker = nil)
      dest = []
      edit_sequence do |cell, operation|
        dest << (operation == :delete ? marker : @dest[cell.dest])
      end
      dest
    end

    # Convert a string into an array of one-character UTF-8 strings;
    # any other object is returned unchanged.
    def items_of(sequence)
      return sequence unless sequence.is_a?(String)
      # pack('U') rebuilds each code point as UTF-8; Integer#chr cannot
      # represent code points above 255.
      sequence.unpack('U*').map { |codepoint| [codepoint].pack('U') }
    end

    # The lowest-cost sequence of cells from the start cell to the end
    # cell.  It is built from @backtrace on first use and cached in
    # @path.
    def lowest_cost_path
      unless @path
        reversed = [@end]
        while (cell = @backtrace[reversed.last])
          reversed << cell
        end
        @path = reversed.reverse
      end
      @path
    end

    # Enumerate the minimum-cost sequence of edit operations.
    def edit_sequence # :yields: cell, {:substitute, :insert, :delete}
      # Walk forwards through the path, classifying each step by the
      # direction moved in the grid.
      prev_cell = @start
      lowest_cost_path[1..-1].each do |cell|
        delta_source = cell.source - prev_cell.source
        delta_dest = cell.dest - prev_cell.dest
        if delta_source == 1 && delta_dest == 1
          yield cell, :substitute
        elsif delta_source == 1
          yield cell, :delete
        elsif delta_dest == 1
          yield cell, :insert
        else
          raise "Invalid path link #{prev_cell}->#{cell}"
        end
        prev_cell = cell
      end
    end

    # Interactive stringification
    def inspect
      "<Alignment: #{edit_distance}>"
    end

    # The cost of substituting <em>source_item</em> with
    # <em>dest_item</em>.
    def substitute(source_item, dest_item)
      source_item == dest_item ? 0 : 1
    end

    # The cost of deleting <em>source_item</em>.
    def delete(source_item)
      1
    end

    # The cost of inserting <em>dest_item</em>.
    def insert(dest_item)
      1
    end

    # The string representation of the alignment consists of four lines:
    #
    # 1. The source array
    # 2. The destination array
    # 3. An annotation line with S, I, D or nothing for aligned elements.
    # 4. The edit distance
    #
    # Items are rendered with <em>to_s</em>, so the arrays need not
    # contain strings.
    def to_s
      # Create the source and destination lines.
      s_line = source_alignment('-').map { |item| item.to_s }
      d_line = dest_alignment('-').map { |item| item.to_s }
      # Create short mnemonics for the edit operations.
      ops = edit_operations.map { |op| OPERATION_MARKS[op] }
      # Find the longest element in all the lines.  An empty alignment
      # has no elements at all.
      longest = [s_line, d_line, ops].flatten.map { |e| e.length }.max || 0
      # Center each array element over a field of that width.
      lines = [s_line, d_line, ops].map do |list|
        list.map { |c| c.center(longest) }.join
      end
      (lines + [edit_distance]).join("\n")
    end

    # This prints a grid of all the costs.  Cells that were not
    # visited because they could not contribute to the lowest-cost
    # path are marked with an asterisk.  Cells that are in the lowest
    # cost path are highlighted with square brackets.
    #
    #   irb(main):001:0> require 'editalign'
    #   => true
    #   irb(main):002:0> a = EditAlign::Alignment.new('captained', 'caspian')
    #   => <Alignment: 5>
    #   irb(main):003:0> puts a.to_grid
    #               -         c         a         p         t         a         i         n         e         d
    #   -         [0.00]     1.00      2.00      3.00      4.00      5.00       *         *         *         *
    #   c          1.00     [0.00]     1.00      2.00      3.00      4.00      5.00       *         *         *
    #   a          2.00      1.00     [0.00]     1.00      2.00      3.00      4.00      5.00       *         *
    #   s          3.00      2.00     [1.00]     1.00      2.00      3.00      4.00      5.00       *         *
    #   p          4.00      3.00      2.00     [1.00]     2.00      3.00      4.00      5.00       *         *
    #   i          5.00      4.00      3.00      2.00     [2.00]     3.00      3.00      4.00      5.00       *
    #   a           *        5.00      4.00      3.00      3.00     [2.00]    [3.00]     4.00      5.00       *
    #   n           *         *        5.00      4.00      4.00      3.00      3.00     [3.00]    [4.00]    [5.00]
    #   => nil
    def to_grid
      # Make the columns wide enough to accommodate the widest header
      # or value.
      widest_header = @source.select { |x| x }.map { |x| x.to_s.length }.max
      widest_cost = @cost.values.map { |c| (sprintf "%.2f", c).length + 2 }.max
      # The conditional handles empty alignments.
      col_width = if widest_header && widest_cost
                    [widest_header, widest_cost].max
                  else
                    1
                  end

      # Create the header row.
      header = [""] + @source.map { |x| x ? x.to_s : "-" }
      header = header.map { |x| x.center(col_width) }

      # Make note of which cells are on the lowest-cost path.  The path
      # is computed here if no other method has needed it yet.
      path_cells = {}
      lowest_cost_path.each { |cell| path_cells[cell] = true }

      # Enumerate the destination, creating the cost rows.
      table = [header]
      (0..@dest.length - 1).each do |dest|
        x = @dest[dest]
        x = "-" unless x
        row = [sprintf("%-#{col_width}s", x)]
        (0..@source.length - 1).each do |source|
          cell = Cell.new(source, dest)
          c = @cost[cell]
          if c == 1.0 / 0.0
            # Center the * character in the column.
            value = "*".center(col_width)
          else
            # Put brackets around cells in the best path.
            value = sprintf(path_cells[cell] ? "[%.2f]" : " %.2f ", c)
            value = sprintf("%#{col_width}s", value)
          end
          row << value
        end
        table << row
      end
      # Combine the rows into a single string table.
      col_spc = " " * 4
      table.map { |row| row.join(col_spc) }.join("\n")
    end

    private :edit_sequence, :items_of, :lowest_cost_path
    protected :substitute, :delete, :insert
  end

  # The Levenshtein alignment yields a cost of 1 for insertions,
  # deletions, and item mismatch.  This class is a synonym for
  # Alignment.
  class LevenshteinAlignment < Alignment
  end

  # The Sellers alignment (aka the Needleman-Wunsch alignment) allows
  # different constant costs to be specified for insertion, deletion,
  # and substitute match and mismatch operations.
  class SellersAlignment < Alignment

    # The costs for insert, delete, and substitution match and
    # mismatch are specified here.  The default values for these costs
    # yield a Levenshtein alignment.
    def initialize(source, dest, match = 0, mismatch = 1,
                   insert = 1, delete = 1)
      # The costs must be in place before calling super, because the
      # parent constructor performs the alignment.
      @match = match
      @mismatch = mismatch
      @insert = insert
      @delete = delete
      super(source, dest)
    end

    # The cost of substituting <em>source_item</em> with
    # <em>dest_item</em>.
    def substitute(source_item, dest_item)
      source_item == dest_item ? @match : @mismatch
    end

    # The cost of deleting <em>source_item</em>.
    def delete(source_item)
      @delete
    end

    # The cost of inserting <em>dest_item</em>.
    def insert(dest_item)
      @insert
    end

    protected :substitute, :delete, :insert
  end

  # The Wagner-Fischer alignment specifies the same cost for insertion
  # and deletion operations, another for item match, and another for
  # item mismatch.
  class WagnerFischerAlignment < SellersAlignment

    # The costs for insert/delete operations and character match and
    # mismatch are specified here.  The default values for these costs
    # yield a Levenshtein alignment.
    def initialize(source, dest, match = 0, mismatch = 1, insert_delete = 1)
      super(source, dest, match, mismatch, insert_delete, insert_delete)
    end
  end

end
|
@@ -0,0 +1,167 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Editalign.
|
7
|
+
#
|
8
|
+
# Editalign is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Editalign is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with editalign; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
# Test cases for the EditAlign module
|
25
|
+
|
26
|
+
require 'test/unit'
|
27
|
+
require 'editalign'
|
28
|
+
|
29
|
+
|
30
|
+
# Aligns the strings "captained" and "caspian" using default costs, once
# through the base Alignment class and once through LevenshteinAlignment.
class LevenshteinStringAlignments < Test::Unit::TestCase
  # Default-cost alignment built directly from EditAlign::Alignment.
  def test_captained_caspian
    assert_captained_caspian(EditAlign::Alignment)
  end

  # LevenshteinAlignment is a subclass of Alignment with an empty body, so
  # it is expected to produce exactly the same alignment as the base class.
  def test_captained_caspian_levenshtein_class
    assert_captained_caspian(EditAlign::LevenshteinAlignment)
  end

  private

  # Builds <em>alignment_class</em> for "captained" -> "caspian" and checks
  # the edit distance, the operation list and both aligned sequences.
  # Not picked up as a test case because its name does not start with
  # "test".
  def assert_captained_caspian(alignment_class)
    a = alignment_class.new("captained", "caspian")
    assert_kind_of EditAlign::Alignment, a
    assert_equal 5, a.edit_distance
    assert_equal [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete], a.edit_operations
    assert_equal ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"], a.source_alignment
    assert_equal ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil], a.dest_alignment
  end
end
|
40
|
+
|
41
|
+
|
42
|
+
# Exercises EditAlign::SellersAlignment with default and with custom costs.
class SellersAlignments < Test::Unit::TestCase
  # With no cost arguments the Sellers alignment uses match 0 and a cost of
  # 1 for every other operation.
  def test_captained_caspian
    a = EditAlign::SellersAlignment.new("captained", "caspian")
    assert_kind_of EditAlign::Alignment, a
    assert_equal 5, a.edit_distance
    assert_equal [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete], a.edit_operations
    assert_equal ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"], a.source_alignment
    assert_equal ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil], a.dest_alignment
  end

  # Costs are passed positionally as match 0.1, mismatch 0.5, insert 0.8,
  # delete 0.9.  The expected alignment has 3 matches, 4 mismatched
  # substitutions and 2 deletions: 3 * 0.1 + 4 * 0.5 + 2 * 0.9 = 4.1.
  def test_captained_caspian_custom_weights
    a = EditAlign::SellersAlignment.new("captained", "caspian", 0.1, 0.5, 0.8, 0.9)
    assert_kind_of EditAlign::Alignment, a
    # The distance is a sum of floats, so its last bits depend on the order
    # in which the costs were added; compare with a tolerance rather than
    # with exact equality.
    assert_in_delta 4.1, a.edit_distance, 1e-9
    assert_equal [nil, nil, :substitute, :substitute, :delete, nil, :substitute, :delete, :substitute], a.edit_operations
    assert_equal ["c", "a", "p", "t", "a", "i", "n", "e", "d"], a.source_alignment
    assert_equal ["c", "a", "s", "p", nil, "i", "a", nil, "n"], a.dest_alignment
  end
end
|
61
|
+
|
62
|
+
|
63
|
+
# Exercises EditAlign::WagnerFischerAlignment with default costs and with a
# mismatch cost far above the insert/delete cost.
class WagnerFischerAlignments < Test::Unit::TestCase
  # No cost arguments: match 0, mismatch 1, insert/delete 1.
  def test_captained_caspian
    alignment = EditAlign::WagnerFischerAlignment.new('captained', 'caspian')

    expected_operations = [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
    expected_source = ['c', 'a', nil, 'p', 't', 'a', 'i', 'n', 'e', 'd']
    expected_dest = ['c', 'a', 's', 'p', 'i', 'a', nil, 'n', nil, nil]

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(5, alignment.edit_distance)
    assert_equal(expected_operations, alignment.edit_operations)
    assert_equal(expected_source, alignment.source_alignment)
    assert_equal(expected_dest, alignment.dest_alignment)
  end

  # Costs are match 0, mismatch 20, insert/delete 1.  The expected
  # operation list contains only inserts and deletes, no substitutions.
  def test_captained_caspian_high_substitution_cost
    alignment = EditAlign::WagnerFischerAlignment.new('captained', 'caspian', 0, 20, 1)

    expected_operations = [nil, nil, :insert, nil, :delete, :delete, nil, :insert, nil, :delete, :delete]
    expected_source = ['c', 'a', nil, 'p', 't', 'a', 'i', nil, 'n', 'e', 'd']
    expected_dest = ['c', 'a', 's', 'p', nil, nil, 'i', 'a', 'n', nil, nil]

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(6, alignment.edit_distance)
    assert_equal(expected_operations, alignment.edit_operations)
    assert_equal(expected_source, alignment.source_alignment)
    assert_equal(expected_dest, alignment.dest_alignment)
  end
end
|
82
|
+
|
83
|
+
|
84
|
+
# Covers the argument variations accepted by EditAlign::Alignment: arrays
# instead of strings, and a custom marker for the accessor methods.
class ParameterOptions < Test::Unit::TestCase
  # Arrays of single-character strings are aligned with the same expected
  # results as the string test cases use for "captained" and "caspian".
  def test_array_init
    source_items = %w[c a p t a i n e d]
    dest_items = %w[c a s p i a n]
    alignment = EditAlign::Alignment.new(source_items, dest_items)

    expected_operations = [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
    expected_source = ['c', 'a', nil, 'p', 't', 'a', 'i', 'n', 'e', 'd']
    expected_dest = ['c', 'a', 's', 'p', 'i', 'a', nil, 'n', nil, nil]

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(5, alignment.edit_distance)
    assert_equal(expected_operations, alignment.edit_operations)
    assert_equal(expected_source, alignment.source_alignment)
    assert_equal(expected_dest, alignment.dest_alignment)
  end

  # Passing a marker to the accessors puts that marker in every position
  # that is nil when no argument is given.
  def test_marker
    marker = 'X'
    alignment = EditAlign::Alignment.new('captained', 'caspian')

    expected_operations = ['X', 'X', :insert, 'X', :substitute, 'X', :delete, 'X', :delete, :delete]
    expected_source = ['c', 'a', 'X', 'p', 't', 'a', 'i', 'n', 'e', 'd']
    expected_dest = ['c', 'a', 's', 'p', 'i', 'a', 'X', 'n', 'X', 'X']

    assert_equal(expected_operations, alignment.edit_operations(marker))
    assert_equal(expected_source, alignment.source_alignment(marker))
    assert_equal(expected_dest, alignment.dest_alignment(marker))
  end
end
|
101
|
+
|
102
|
+
|
103
|
+
# Edge cases: one or both inputs empty, and two identical inputs.
class BoundaryConditions < Test::Unit::TestCase
  # Two empty arrays: distance 0 and every result array empty.
  def test_empty_alignment
    alignment = EditAlign::Alignment.new([], [])

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(0, alignment.edit_distance)
    assert_equal([], alignment.source_alignment)
    assert_equal([], alignment.dest_alignment)
    assert_equal([], alignment.edit_operations)
  end

  # Three source items against an empty destination: three deletes.
  def test_nonempty_empty_alignment
    alignment = EditAlign::Alignment.new(%w[A B C], [])

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(3, alignment.edit_distance)
    assert_equal([:delete] * 3, alignment.edit_operations)
    assert_equal(%w[A B C], alignment.source_alignment)
    assert_equal([nil] * 3, alignment.dest_alignment)
  end

  # Empty source against three destination items: three inserts.
  def test_empty_nonempty_alignment
    alignment = EditAlign::Alignment.new([], %w[A B C])

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(3, alignment.edit_distance)
    assert_equal([:insert] * 3, alignment.edit_operations)
    assert_equal([nil] * 3, alignment.source_alignment)
    assert_equal(%w[A B C], alignment.dest_alignment)
  end

  # Identical inputs: distance 0 and no operation in any position.
  def test_identical
    items = %w[A B C]
    alignment = EditAlign::Alignment.new(items, %w[A B C])

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(0, alignment.edit_distance)
    assert_equal([nil] * 3, alignment.edit_operations)
    assert_equal(%w[A B C], alignment.source_alignment)
    assert_equal(%w[A B C], alignment.dest_alignment)
  end
end
|
140
|
+
|
141
|
+
# Checks the multi-line string returned by Alignment#to_s.
class Stringification < Test::Unit::TestCase
  # The expected text has four rows: the aligned source, the aligned
  # destination, one letter per insert/substitute/delete operation, and the
  # edit distance.
  def test_captained_caspian
    alignment = EditAlign::Alignment.new('captained', 'caspian')
    # NOTE(review): the operations row is assumed to begin with two spaces
    # (the first two columns carry no operation) -- confirm against the
    # original fixture.
    expected_rows = [
      'ca-ptained',
      'caspia-n--',
      '  I S D DD',
      '5'
    ]
    assert_equal(expected_rows.join("\n"), alignment.to_s)
  end
end
|
152
|
+
|
153
|
+
# Compares the result of the default Alignment with the result of an
# alignment that mixes in EditAlign::ExhaustiveSearch.
class CompareSearchAlgorithm < Test::Unit::TestCase

  # Sellers alignment (default costs) with the ExhaustiveSearch module
  # included.
  class ExhaustiveAlignment < EditAlign::SellersAlignment
    include EditAlign::ExhaustiveSearch
  end

  # Both alignments of "captained" and "caspian" must agree on the edit
  # distance, the operation list and both aligned sequences.
  def test_dijkstra_exhaustive
    dijkstra = EditAlign::Alignment.new('captained', 'caspian')
    exhaustive = ExhaustiveAlignment.new('captained', 'caspian')

    [:edit_distance, :edit_operations, :source_alignment, :dest_alignment].each do |reader|
      assert_equal(dijkstra.send(reader), exhaustive.send(reader))
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
!ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: editalign
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2006-05-30 00:00:00 -07:00
|
8
|
+
summary: Edit alignments between arrays
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: billmcn@u.washington.edu
|
12
|
+
homepage: http://staff.washington.edu/billmcn/index.shtml
|
13
|
+
rubyforge_project:
|
14
|
+
description: This module performs edit alignments between arrays. It returns alignments and edit distances.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- W.P. McNeill
|
30
|
+
files:
|
31
|
+
- test/test_editalign.rb
|
32
|
+
- lib/editalign.rb
|
33
|
+
- examples/align-strings
|
34
|
+
- examples/stress-test
|
35
|
+
- README
|
36
|
+
test_files:
|
37
|
+
- test/test_editalign.rb
|
38
|
+
rdoc_options:
|
39
|
+
- --title
|
40
|
+
- EditAlign -- Ruby Edit Alignment
|
41
|
+
- --main
|
42
|
+
- README
|
43
|
+
- --line-numbers
|
44
|
+
- --inline-source
|
45
|
+
extra_rdoc_files:
|
46
|
+
- README
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
requirements: []
|
52
|
+
|
53
|
+
dependencies:
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: PriorityQueue
|
56
|
+
version_requirement:
|
57
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.0.0
|
62
|
+
version:
|