editalign 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +65 -0
- data/examples/align-strings +72 -0
- data/examples/stress-test +165 -0
- data/lib/editalign.rb +516 -0
- data/test/test_editalign.rb +167 -0
- metadata +62 -0
data/README
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
= Edit Alignment Module
|
2
|
+
|
3
|
+
The module implements a dynamic programming string alignment algorithm
|
4
|
+
that produces both alignments and edit distances between source and
|
5
|
+
destination arrays or strings.
|
6
|
+
|
7
|
+
An alignment between the arrays is represented as a sequence of
|
8
|
+
insert, delete, and substitute operations that transform individual
|
9
|
+
source items into destination items. For example, the following is an
|
10
|
+
alignment between the letters in the words 'captained' and 'caspian':
|
11
|
+
|
12
|
+
c a - p t a i n e d
|
13
|
+
c a s p i a - n - -
|
14
|
+
|
15
|
+
Here an 's' was inserted, an 'i' was substituted for a 't', and an
|
16
|
+
'i', an 'e', and a 'd' were deleted.
|
17
|
+
|
18
|
+
The module exports an Alignment class which assigns numeric costs to
|
19
|
+
each of these operations and finds the alignment that incurs the
|
20
|
+
minimum cost or edit distance. The Alignment class uses Dijkstra's
|
21
|
+
algorithm to efficiently find edit distances for partially-aligned
|
22
|
+
arrays. See the EditAlign::DijkstraSearch class for details.
|
23
|
+
|
24
|
+
irb(main):001:0> require 'editalign'
|
25
|
+
=> true
|
26
|
+
irb(main):002:0> a = EditAlign::Alignment.new('captained', 'caspian')
|
27
|
+
=> <Alignment: 5>
|
28
|
+
irb(main):003:0> a.edit_distance
|
29
|
+
=> 5
|
30
|
+
irb(main):004:0> a.edit_operations
|
31
|
+
=> [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
|
32
|
+
irb(main):005:0> a.source_alignment
|
33
|
+
=> ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
|
34
|
+
irb(main):006:0> a.dest_alignment
|
35
|
+
=> ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]
|
36
|
+
irb(main):007:0> puts a
|
37
|
+
ca-ptained
|
38
|
+
caspia-n--
|
39
|
+
I S D DD
|
40
|
+
5
|
41
|
+
=> nil
|
42
|
+
|
43
|
+
= History
|
44
|
+
|
45
|
+
* 1-0-0 ... First version
|
46
|
+
|
47
|
+
= See Also
|
48
|
+
|
49
|
+
* The Levenshtein[http://po-ru.com/projects/levenshtein/] module generates a Levenshtein edit distance doing an exhaustive search.
|
50
|
+
|
51
|
+
= Acknowledgments
|
52
|
+
|
53
|
+
Thanks to Jeremy G. Kahn for suggesting a diagonal-hugging search
|
54
|
+
strategy and optimizations to the one implemented here.
|
55
|
+
|
56
|
+
= Copyright
|
57
|
+
|
58
|
+
Copyright 2006, William Patrick McNeill
|
59
|
+
|
60
|
+
This program is distributed under the GNU General Public License.
|
61
|
+
|
62
|
+
= Author
|
63
|
+
|
64
|
+
W.P. McNeill mailto:billmcn@u.washington.edu
|
65
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Editalign.
|
7
|
+
#
|
8
|
+
# Editalign is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Editalign is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with editalign; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
# Print the character alignment between two strings passed in on the
|
25
|
+
# command line.
|
26
|
+
|
27
|
+
require 'getoptlong'
|
28
|
+
require 'editalign'
|
29
|
+
|
30
|
+
class ExhaustiveSellersAlignment < EditAlign::SellersAlignment
|
31
|
+
include EditAlign::ExhaustiveSearch
|
32
|
+
end
|
33
|
+
|
34
|
+
# Process command line options.
|
35
|
+
opts = GetoptLong.new(["--match", "-m", GetoptLong::REQUIRED_ARGUMENT],
|
36
|
+
["--nomatch", "-n", GetoptLong::REQUIRED_ARGUMENT],
|
37
|
+
["--insert", "-i",GetoptLong::REQUIRED_ARGUMENT],
|
38
|
+
["--delete", "-d", GetoptLong::REQUIRED_ARGUMENT],
|
39
|
+
["--exhaustive", "-e", GetoptLong::NO_ARGUMENT]
|
40
|
+
)
|
41
|
+
|
42
|
+
match = 0
|
43
|
+
mismatch = 1
|
44
|
+
insert = 1
|
45
|
+
delete = 1
|
46
|
+
exhaustive = false
|
47
|
+
opts.each do |opt, arg|
|
48
|
+
case opt
|
49
|
+
when "--match"
|
50
|
+
match = arg.to_f
|
51
|
+
when "--nomatch"
|
52
|
+
mismatch = arg.to_f
|
53
|
+
when "--insert"
|
54
|
+
insert = arg.to_f
|
55
|
+
when "--delete"
|
56
|
+
delete = arg.to_f
|
57
|
+
when "--exhaustive"
|
58
|
+
exhaustive = true
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
source = ARGV[0]
|
63
|
+
dest = ARGV[1]
|
64
|
+
|
65
|
+
# Do alignments and print results.
|
66
|
+
alignments = [EditAlign::SellersAlignment]
|
67
|
+
alignments << ExhaustiveSellersAlignment if exhaustive
|
68
|
+
|
69
|
+
alignments.each do |align_class|
|
70
|
+
a = align_class.new(source, dest, match, mismatch, insert, delete)
|
71
|
+
puts a, a.to_grid
|
72
|
+
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Editalign.
|
7
|
+
#
|
8
|
+
# Editalign is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Editalign is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with editalign; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
require "getoptlong"
|
25
|
+
require "editalign"
|
26
|
+
|
27
|
+
class PureRubyEditAlign < EditAlign::Alignment
|
28
|
+
def priority_queue_factory
|
29
|
+
require "priority_queue/ruby_priority_queue"
|
30
|
+
RubyPriorityQueue.new
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
class IntegerPriorityEditAlign < EditAlign::Alignment
|
35
|
+
def priority_factory(cost, cell)
|
36
|
+
cost
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
# Process this script's command line options.
|
42
|
+
#
|
43
|
+
# There are three required arguments.
|
44
|
+
#
|
45
|
+
# * The number of alignments to perform
|
46
|
+
# * The length of the strings to align
|
47
|
+
# * The number of edits
|
48
|
+
#
|
49
|
+
# There are three optional switches.
|
50
|
+
#
|
51
|
+
# * verbose ... prints strings and edit distances
|
52
|
+
# * pure-ruby ... uses the pure Ruby priority queue instead of the C extension
|
53
|
+
# * int-cost ... uses integer costs
|
54
|
+
def parse_command_line
|
55
|
+
opts = GetoptLong.new(["--verbose", "-v", GetoptLong::NO_ARGUMENT],
|
56
|
+
["--pure-ruby", "-p", GetoptLong::NO_ARGUMENT],
|
57
|
+
["--int-cost", "-i", GetoptLong::NO_ARGUMENT])
|
58
|
+
verbose = false
|
59
|
+
pure_ruby = false
|
60
|
+
int_cost = false
|
61
|
+
opts.each do |opt, arg|
|
62
|
+
case opt
|
63
|
+
when "--verbose"
|
64
|
+
verbose = true
|
65
|
+
when "--pure-ruby"
|
66
|
+
pure_ruby = true
|
67
|
+
when "--int-cost"
|
68
|
+
int_cost = true
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
trials = Integer(ARGV[0])
|
73
|
+
length = Integer(ARGV[1])
|
74
|
+
edits = Integer(ARGV[2])
|
75
|
+
|
76
|
+
if pure_ruby
|
77
|
+
klass = PureRubyEditAlign
|
78
|
+
elsif int_cost
|
79
|
+
klass = IntegerPriorityEditAlign
|
80
|
+
else
|
81
|
+
klass = EditAlign::Alignment
|
82
|
+
end
|
83
|
+
|
84
|
+
[trials, length, edits, klass, verbose]
|
85
|
+
end
|
86
|
+
|
87
|
+
# Run a number of alignments of the same size.
|
88
|
+
def run_stress_test(trials, length, edits, klass, verbose)
|
89
|
+
(1..trials).each do |i|
|
90
|
+
puts "Trial #{i}" if verbose
|
91
|
+
|
92
|
+
# Generate a destination string with random differences from the
|
93
|
+
# source string.
|
94
|
+
source, dest = create_strings(length, edits)
|
95
|
+
|
96
|
+
# Align the source string with the randomly edited destination string.
|
97
|
+
alignment = klass.new(source, dest)
|
98
|
+
print "#{alignment}\n\n" if verbose
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
# Create two unaligned strings.
|
103
|
+
def create_strings(length, edits)
|
104
|
+
# The alphabet used for inserts and substitutions.
|
105
|
+
alphabet = ('A'..'Z').collect
|
106
|
+
|
107
|
+
# Source contains a repeating lowercase alphabet length characters
|
108
|
+
# long.
|
109
|
+
last = length - 1
|
110
|
+
source = (0..last).map {|x| (97 + x % 26).chr}
|
111
|
+
|
112
|
+
# There can only be as many edits as there are characters.
|
113
|
+
edits = length if edits > length
|
114
|
+
|
115
|
+
# Create a roughly even number of substitutions, inserts, and
|
116
|
+
# deletes.
|
117
|
+
n_subs = 0
|
118
|
+
n_ins = 0
|
119
|
+
n_dels = 0
|
120
|
+
(1..edits).each do
|
121
|
+
case rand(3)
|
122
|
+
when 0
|
123
|
+
n_subs += 1
|
124
|
+
when 1
|
125
|
+
n_ins += 1
|
126
|
+
when 2
|
127
|
+
n_dels += 1
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
# Distribute edit operations randomly throughout the string.
|
132
|
+
unchanged_pos = (0..last).collect
|
133
|
+
edit_op = [nil] * length
|
134
|
+
|
135
|
+
[:substitute, :insert, :delete].zip([n_subs, n_ins, n_dels]) do |op, n|
|
136
|
+
(1..n).each do
|
137
|
+
i = rand(unchanged_pos.length)
|
138
|
+
edit_op[unchanged_pos[i]] = op
|
139
|
+
unchanged_pos.delete_at(i)
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
# Use the random edit operations to create a destination string.
|
144
|
+
dest = ''
|
145
|
+
edit_op.each_index do |i|
|
146
|
+
case edit_op[i]
|
147
|
+
when nil
|
148
|
+
dest += source[i]
|
149
|
+
when :substitute
|
150
|
+
dest += alphabet[rand(alphabet.length)]
|
151
|
+
when :insert
|
152
|
+
dest += alphabet[rand(alphabet.length)] + source[i]
|
153
|
+
when :delete
|
154
|
+
# Do nothing
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
[source, dest]
|
159
|
+
end
|
160
|
+
|
161
|
+
|
162
|
+
if __FILE__ == $0
|
163
|
+
trials, length, edits, klass, verbose = parse_command_line
|
164
|
+
run_stress_test(trials, length, edits, klass, verbose)
|
165
|
+
end
|
data/lib/editalign.rb
ADDED
@@ -0,0 +1,516 @@
|
|
1
|
+
# Copyright 2006 William Patrick McNeill
|
2
|
+
#
|
3
|
+
# Editalign is free software; you can redistribute it and/or modify it
|
4
|
+
# under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation; either version 2 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# Editalign is distributed in the hope that it will be useful, but
|
9
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with editalign; if not, write to the Free Software Foundation,
|
15
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
# EditAlign is the namespace that contains all edit alignment
|
18
|
+
# functions.
|
19
|
+
module EditAlign
|
20
|
+
|
21
|
+
# This module employs Dijkstra's algorithm to find the lowest-cost
|
22
|
+
# sequence of edit operations that will transform the source array
|
23
|
+
# into the destination array. The alignment grid is treated as a
|
24
|
+
# directed acyclic graph where each cell in the grid is a vertex.
|
25
|
+
# Edges in the graph correspond to substitution, deletion and
|
26
|
+
# insertion operations. The edge weights come from the weighting
|
27
|
+
# methods #substitute, #insert, and #delete.
|
28
|
+
#
|
29
|
+
# Generally speaking, the strategy is to search the diagonal of the
|
30
|
+
# alignment grid before the corners. For partially-aligned arrays,
|
31
|
+
# this strategy can result in fewer calls to the weighting
|
32
|
+
# functions.
|
33
|
+
module DijkstraSearch
|
34
|
+
|
35
|
+
# Cells to be searched in the Dijkstra priority queue are ordered
|
36
|
+
# by SearchPriority. SearchPriority orders cells first by cost,
|
37
|
+
# and then if the costs are equal by smallest number of hops to
|
38
|
+
# the end cell. All things being equal, the latter comparison
|
39
|
+
# makes the algorithm search cells near the diagonal first. This
|
40
|
+
# can help in instances where the beginnings of the arrays are
|
41
|
+
# unaligned but the ends are aligned.
|
42
|
+
class SearchPriority
|
43
|
+
|
44
|
+
# The cost to reach the current cell
|
45
|
+
attr_reader :cost
|
46
|
+
|
47
|
+
# The minimum path length to the final cell
|
48
|
+
attr_reader :dist
|
49
|
+
|
50
|
+
include Comparable
|
51
|
+
|
52
|
+
# Specify the cost to reach a cell, the cell, and the final
|
53
|
+
# cell.
|
54
|
+
def initialize(cost, cell, end_cell)
|
55
|
+
@cost = cost
|
56
|
+
@dist = [end_cell.source - cell.source, end_cell.dest - cell.dest].max
|
57
|
+
end
|
58
|
+
|
59
|
+
# Order by cost and then by distance to the end cell.
|
60
|
+
def <=>(other)
|
61
|
+
comp = (cost <=> other.cost)
|
62
|
+
comp = (dist <=> other.dist) if comp == 0
|
63
|
+
comp
|
64
|
+
end
|
65
|
+
|
66
|
+
# Interactive stringification
|
67
|
+
def inspect
|
68
|
+
"Pri(#{cost}, #{dist})"
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# Search the alignment grid filling in <em>@cost</em> and
|
73
|
+
# <em>@backtrace</em>.
|
74
|
+
def find_lowest_cost_alignment
|
75
|
+
agenda = priority_queue_factory
|
76
|
+
|
77
|
+
agenda[@start] = priority_factory(0, @start)
|
78
|
+
@backtrace = {}
|
79
|
+
|
80
|
+
until agenda.empty?
|
81
|
+
cell, priority = agenda.delete_min
|
82
|
+
cost = @cost[cell]
|
83
|
+
break if cost >= @cost[@end]
|
84
|
+
outgoing(cell) do |next_cell, next_cost|
|
85
|
+
next_cost += @cost[cell]
|
86
|
+
next unless next_cost < @cost[next_cell]
|
87
|
+
@cost[next_cell] = next_cost
|
88
|
+
@backtrace[next_cell] = cell
|
89
|
+
agenda[next_cell] = priority_factory(next_cost, next_cell) \
|
90
|
+
unless next_cost >= @cost[@end]
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
# An enumeration of all the cells adjacent to the specified cell
|
96
|
+
# and the costs of transitioning to them. Adjacent cells are
|
97
|
+
# reached by performing substitute, delete, and insertion
|
98
|
+
# operations.
|
99
|
+
def outgoing(cell) # :yields: cell, cost
|
100
|
+
# Substitute
|
101
|
+
if cell.source < @source.length-1 and cell.dest < @dest.length-1
|
102
|
+
next_cell = Alignment::Cell.new(cell.source+1, cell.dest+1)
|
103
|
+
if @cost[next_cell] > @cost[cell]
|
104
|
+
cost = substitute(@source[next_cell.source], @dest[next_cell.dest])
|
105
|
+
yield next_cell, cost
|
106
|
+
end
|
107
|
+
end
|
108
|
+
# Delete
|
109
|
+
if cell.source < @source.length-1
|
110
|
+
next_cell = Alignment::Cell.new(cell.source+1, cell.dest)
|
111
|
+
if @cost[next_cell] > @cost[cell]
|
112
|
+
cost = delete(@source[next_cell.source])
|
113
|
+
yield next_cell, cost
|
114
|
+
end
|
115
|
+
end
|
116
|
+
# Insert
|
117
|
+
if cell.dest < @dest.length-1
|
118
|
+
next_cell = Alignment::Cell.new(cell.source, cell.dest+1)
|
119
|
+
if @cost[next_cell] > @cost[cell]
|
120
|
+
cost = insert(@dest[next_cell.dest])
|
121
|
+
yield next_cell, cost
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# Create the priority queue used by the search. By default
|
127
|
+
# EditAlign uses the C extension version of the
|
128
|
+
# priority_queue[http://rubyforge.org/projects/priority-queue/]
|
129
|
+
# library. If you wish to use a different priority queue
|
130
|
+
# implementation you may overload this function in a derived
|
131
|
+
# class.
|
132
|
+
def priority_queue_factory
|
133
|
+
require "priority_queue"
|
134
|
+
PriorityQueue.new
|
135
|
+
|
136
|
+
# Uncomment the following lines to use the pure-Ruby
|
137
|
+
# implementation of priority_queue.
|
138
|
+
# require "priority_queue/ruby_priority_queue"
|
139
|
+
# RubyPriorityQueue.new
|
140
|
+
end
|
141
|
+
|
142
|
+
# Create a new search priority for the queue. The priority must
|
143
|
+
# define the <em><=></em> operator. Cells with lower priority
|
144
|
+
# value will be searched first. If you wish to use a different
|
145
|
+
# prioritization scheme you may overload this function in a
|
146
|
+
# derived class.
|
147
|
+
def priority_factory(cost, cell)
|
148
|
+
SearchPriority.new(cost, cell, @end)
|
149
|
+
end
|
150
|
+
|
151
|
+
private :outgoing
|
152
|
+
protected :find_lowest_cost_alignment, :priority_queue_factory
|
153
|
+
protected :priority_factory
|
154
|
+
end
|
155
|
+
|
156
|
+
|
157
|
+
# This module employs an exhaustive search to find the lowest-cost
|
158
|
+
# sequence of edit operations that will transform the source array
|
159
|
+
# into the destination array. It finds the lowest cost alignment by
|
160
|
+
# filling in every cell in the costs table.
|
161
|
+
#
|
162
|
+
# This algorithm will return the same results as Dijkstra's
|
163
|
+
# algorithm though it is less efficient for nearly-aligned strings.
|
164
|
+
# Nevertheless, this search algorithm is commonly cited when
|
165
|
+
# describing alignments, and so is implemented here for the sake of
|
166
|
+
# documentation and for comparison with the Dijkstra search.
|
167
|
+
module ExhaustiveSearch
|
168
|
+
|
169
|
+
# An incoming cell and its associated cost.
|
170
|
+
IncomingCost = Struct.new("IncomingCost", :cell, :cost)
|
171
|
+
|
172
|
+
# Search the alignment grid filling in @cost and @backtrace.
|
173
|
+
def find_lowest_cost_alignment
|
174
|
+
@backtrace = {}
|
175
|
+
|
176
|
+
# Fill in the top row of the table.
|
177
|
+
(1..@source.length).each do |source|
|
178
|
+
@cost[Alignment::Cell.new(source, 0)] = \
|
179
|
+
@cost[Alignment::Cell.new(source-1, 0)] + delete(@source[source])
|
180
|
+
end
|
181
|
+
# Fill in the first column of the table.
|
182
|
+
(1..@dest.length).each do |dest|
|
183
|
+
@cost[Alignment::Cell.new(0, dest)] = \
|
184
|
+
@cost[Alignment::Cell.new(0, dest-1)] + insert(@dest[dest])
|
185
|
+
end
|
186
|
+
# Fill in all the remaining cells in the table.
|
187
|
+
(1..@source.length).each do |source|
|
188
|
+
(1..@dest.length).each do |dest|
|
189
|
+
incoming = []
|
190
|
+
c = Alignment::Cell.new(source-1, dest)
|
191
|
+
incoming << IncomingCost.new(c, @cost[c] + delete(@source[source]))
|
192
|
+
c = Alignment::Cell.new(source, dest-1)
|
193
|
+
incoming << IncomingCost.new(c, @cost[c] + insert(@dest[dest]))
|
194
|
+
c = Alignment::Cell.new(source-1, dest-1)
|
195
|
+
incoming << \
|
196
|
+
IncomingCost.new(c, @cost[c] + substitute(@source[source],
|
197
|
+
@dest[dest]))
|
198
|
+
best = incoming.min {|a,b| a.cost <=> b.cost}
|
199
|
+
@cost[Alignment::Cell.new(source, dest)] = best.cost
|
200
|
+
@backtrace[Alignment::Cell.new(source, dest)] = best.cell
|
201
|
+
end
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
205
|
+
protected :find_lowest_cost_alignment
|
206
|
+
end
|
207
|
+
|
208
|
+
|
209
|
+
# The Alignment class is given a source and destination array at
|
210
|
+
# construction time. It does a dynamic programming alignment
|
211
|
+
# between them and makes the results of that alignment available
|
212
|
+
# through instance methods.
|
213
|
+
#
|
214
|
+
# If there are multiple alignments with equal edit distances
|
215
|
+
# Alignment will find one of them. Which one is undefined.
|
216
|
+
#
|
217
|
+
# Alignment works by constructing a matrix with dimensions equal to
|
218
|
+
# the length of the source and destination arrays. Moving
|
219
|
+
# horizontally and vertically in the matrix represents insertion and
|
220
|
+
# deletion operations, respectively, while moving diagonally
|
221
|
+
# represents substitution. Each cell of the matrix contains the
|
222
|
+
# minimum cost it takes to reach that cell. The algorithm fills in
|
223
|
+
# cells in the matrix until it reaches the furthest corner.
|
224
|
+
#
|
225
|
+
# The search is done using Dijkstra's algorithm as implemented in
|
226
|
+
# the DijkstraSearch. A different search algorithm may be specified
|
227
|
+
# by including a mixin that redefines the
|
228
|
+
# #find_lowest_cost_alignment function.
|
229
|
+
#
|
230
|
+
# This class uses the Levenshtein weighting scheme. Levenshtein assigns
|
231
|
+
# a cost of 1 to insertions and deletions. It assigns a cost of 1
|
232
|
+
# to substitutions when the items are different and 0 when they are
|
233
|
+
# the same. Different weighting schemes may be specified by
|
234
|
+
# overloading the #insert, #delete, and #substitute functions. The
|
235
|
+
# costs must be non-negative numbers.
|
236
|
+
class Alignment
|
237
|
+
include DijkstraSearch
|
238
|
+
|
239
|
+
# A location in the alignment grid.
|
240
|
+
Cell = Struct.new("Cell", :source, :dest)
|
241
|
+
|
242
|
+
# The caller specifies a source and destination array. The object
|
243
|
+
# performs the alignment at construction time.
|
244
|
+
#
|
245
|
+
# Optionally either <em>source</em> or <em>dest</em> may be
|
246
|
+
# strings, in which case they will be treated as arrays of characters.
|
247
|
+
def initialize(source, dest)
|
248
|
+
# Convert strings into arrays.
|
249
|
+
source = source.unpack('U*').collect {|c| c.chr} if source.class == String
|
250
|
+
dest = dest.unpack('U*').collect {|c| c.chr} if dest.class == String
|
251
|
+
|
252
|
+
# Prepend empty elements to the source and destination arrays to
|
253
|
+
# handle insertions and deletions at the front of the alignment.
|
254
|
+
@source = source.unshift(nil)
|
255
|
+
@dest = dest.unshift(nil)
|
256
|
+
|
257
|
+
# The start and end cells of the search.
|
258
|
+
@start = Cell.new(0, 0)
|
259
|
+
@end = Cell.new(@source.length-1, @dest.length-1)
|
260
|
+
|
261
|
+
# The lowest known cost to reach a cell. Unexplored cells have
|
262
|
+
# an infinite cost.
|
263
|
+
@cost = Hash.new{1.0/0.0}
|
264
|
+
@cost[@start] = 0
|
265
|
+
|
266
|
+
# Fill in the @cost matrix including @cost[@end] and create a
|
267
|
+
# lowest-cost sequence of cells in @backtrace.
|
268
|
+
find_lowest_cost_alignment
|
269
|
+
end
|
270
|
+
|
271
|
+
# The minimum edit distance
|
272
|
+
def edit_distance
|
273
|
+
@cost[@end]
|
274
|
+
end
|
275
|
+
|
276
|
+
# The lowest-cost list of edit operations. This is a list of
|
277
|
+
# <em>:substitute</em>, <em>:insert</em>, <em>:delete</em> symbols
|
278
|
+
# for operations that changed an array element and
|
279
|
+
# <em>marker</em> for operations that did not.
|
280
|
+
def edit_operations(marker = nil)
|
281
|
+
ops = []
|
282
|
+
edit_sequence do |cell, operation|
|
283
|
+
if operation == :substitute
|
284
|
+
ops << \
|
285
|
+
(@source[cell.source] == @dest[cell.dest] ? marker: :substitute)
|
286
|
+
else
|
287
|
+
ops << operation
|
288
|
+
end
|
289
|
+
end
|
290
|
+
ops
|
291
|
+
end
|
292
|
+
|
293
|
+
# The source items with <em>marker</em> inserted where an
|
294
|
+
# insertion took place.
|
295
|
+
def source_alignment(marker = nil)
|
296
|
+
source = []
|
297
|
+
edit_sequence do |cell, operation|
|
298
|
+
source << (operation == :insert ? marker:@source[cell.source])
|
299
|
+
end
|
300
|
+
source
|
301
|
+
end
|
302
|
+
|
303
|
+
# The destination items with <em>marker</em> inserted where a
|
304
|
+
# deletion took place.
|
305
|
+
def dest_alignment(marker = nil)
|
306
|
+
dest = []
|
307
|
+
edit_sequence do |cell, operation|
|
308
|
+
dest << (operation == :delete ? marker:@dest[cell.dest])
|
309
|
+
end
|
310
|
+
dest
|
311
|
+
end
|
312
|
+
|
313
|
+
# Enumerate the minimum-cost sequence of edit operations.
|
314
|
+
def edit_sequence # :yields: cell, {:substitute, :insert, :delete}
|
315
|
+
# The first time this function is called, walk backwards through
|
316
|
+
# the backtrace to create the @path instance variable.
|
317
|
+
if not @path
|
318
|
+
@path = [@end]
|
319
|
+
while cell = @backtrace[@path[0]]
|
320
|
+
@path.unshift(cell)
|
321
|
+
end
|
322
|
+
end
|
323
|
+
# Walk forwards through the path.
|
324
|
+
prev_cell = @start
|
325
|
+
@path[1..-1].each do |cell|
|
326
|
+
delta_source = cell.source - prev_cell.source
|
327
|
+
delta_dest = cell.dest - prev_cell.dest
|
328
|
+
if delta_source == 1 and delta_dest == 1
|
329
|
+
yield cell, :substitute
|
330
|
+
elsif delta_source == 1
|
331
|
+
yield cell, :delete
|
332
|
+
elsif delta_dest == 1
|
333
|
+
yield cell, :insert
|
334
|
+
else
|
335
|
+
raise "Invalid path link #{prev_cell}->#{cell}"
|
336
|
+
end
|
337
|
+
prev_cell = cell
|
338
|
+
end
|
339
|
+
end
|
340
|
+
|
341
|
+
# Interactive stringification
|
342
|
+
def inspect
|
343
|
+
"<Alignment: #{edit_distance}>"
|
344
|
+
end
|
345
|
+
|
346
|
+
# The cost of substituting <em>source_item</em> with
|
347
|
+
# <em>dest_item</em>.
|
348
|
+
def substitute(source_item, dest_item)
|
349
|
+
source_item == dest_item ? 0:1
|
350
|
+
end
|
351
|
+
|
352
|
+
# The cost of deleting <em>source_item</em>.
|
353
|
+
def delete(source_item)
|
354
|
+
1
|
355
|
+
end
|
356
|
+
|
357
|
+
# The cost of inserting <em>dest_item</em>.
|
358
|
+
def insert(dest_item)
|
359
|
+
1
|
360
|
+
end
|
361
|
+
|
362
|
+
# The string representation of the alignment consists of four lines:
|
363
|
+
#
|
364
|
+
# 1. The source array
|
365
|
+
# 2. The destination array
|
366
|
+
# 3. An annotation line with S, I, D or nothing for aligned elements.
|
367
|
+
# 4. The edit distance
|
368
|
+
def to_s
|
369
|
+
# Create the source and destination lines.
|
370
|
+
s_line = source_alignment('-')
|
371
|
+
d_line = dest_alignment('-')
|
372
|
+
# Create short mnemonics for the edit operations.
|
373
|
+
ops = edit_operations.map do |op|
|
374
|
+
case op
|
375
|
+
when nil
|
376
|
+
c = " "
|
377
|
+
when :substitute
|
378
|
+
c = "S"
|
379
|
+
when :insert
|
380
|
+
c = "I"
|
381
|
+
when :delete
|
382
|
+
c = "D"
|
383
|
+
end
|
384
|
+
end
|
385
|
+
# Find the longest element in all the lines.
|
386
|
+
longest = [s_line, d_line, ops].map{|l| l.map{|e| e.length}.max}.max
|
387
|
+
# Center each array element over a field of that width.
|
388
|
+
lines = [s_line, d_line, ops].map do |list|
|
389
|
+
list.map{|c| c.center(longest)}.join
|
390
|
+
end
|
391
|
+
(lines + [edit_distance]).join("\n")
|
392
|
+
end
|
393
|
+
|
394
|
+
# This prints a grid of all the costs. Cells that were not
|
395
|
+
# visited because they could not contribute to the lowest-cost
|
396
|
+
# path are marked with an asterisk. Cells that are in the lowest
|
397
|
+
# cost path are highlighted with square brackets.
|
398
|
+
#
|
399
|
+
# irb(main):001:0> require 'editalign'
|
400
|
+
# => true
|
401
|
+
# irb(main):002:0> a = EditAlign::Alignment.new('captained', 'caspian')
|
402
|
+
# => <Alignment: 5>
|
403
|
+
# irb(main):003:0> puts a.to_grid
|
404
|
+
# - c a p t a i n e d
|
405
|
+
# - [0.00] 1.00 2.00 3.00 4.00 5.00 * * * *
|
406
|
+
# c 1.00 [0.00] 1.00 2.00 3.00 4.00 5.00 * * *
|
407
|
+
# a 2.00 1.00 [0.00] 1.00 2.00 3.00 4.00 5.00 * *
|
408
|
+
# s 3.00 2.00 [1.00] 1.00 2.00 3.00 4.00 5.00 * *
|
409
|
+
# p 4.00 3.00 2.00 [1.00] 2.00 3.00 4.00 5.00 * *
|
410
|
+
# i 5.00 4.00 3.00 2.00 [2.00] 3.00 3.00 4.00 5.00 *
|
411
|
+
# a * 5.00 4.00 3.00 3.00 [2.00] [3.00] 4.00 5.00 *
|
412
|
+
# n * * 5.00 4.00 4.00 3.00 3.00 [3.00] [4.00] [5.00]
|
413
|
+
# => nil
|
414
|
+
def to_grid
|
415
|
+
# Make the columns wide enough to accommodate the widest header
|
416
|
+
# or value.
|
417
|
+
widest_header = @source.find_all{|x| x}.map{|x| x.length}.max
|
418
|
+
widest_cost = @cost.values.map{|c| (sprintf "%.2f", c).length + 2}.max
|
419
|
+
# The conditional handles empty alignments.
|
420
|
+
col_width = (widest_header and widest_cost) ? \
|
421
|
+
[widest_header, widest_cost].max: 1
|
422
|
+
|
423
|
+
# Create the header row.
|
424
|
+
header = [""] + @source.map{|x| x ? x: "-"}
|
425
|
+
header = header.map {|x| x.center(col_width)}
|
426
|
+
|
427
|
+
# Make note of which cells are on the lowest-cost path.
|
428
|
+
path_cells = {}
|
429
|
+
@path.each {|cell| path_cells[cell] = true} if @path
|
430
|
+
|
431
|
+
# Enumerate the destination, creating the cost rows.
|
432
|
+
table = [header]
|
433
|
+
(0..@dest.length-1).each do |dest|
|
434
|
+
x = @dest[dest]
|
435
|
+
x = "-" if not x
|
436
|
+
row = [sprintf("%-#{col_width}s", x)]
|
437
|
+
(0..@source.length-1).each do |source|
|
438
|
+
cell = Cell.new(source, dest)
|
439
|
+
c = @cost[cell]
|
440
|
+
if c == 1.0/0.0
|
441
|
+
# Center the * character in the column.
|
442
|
+
value = "*".center(col_width)
|
443
|
+
else
|
444
|
+
# Put brackets around cells in the best path.
|
445
|
+
value = sprintf path_cells[cell] ? "[%.2f]":" %.2f ", c
|
446
|
+
value = sprintf "%#{col_width}s", value
|
447
|
+
end
|
448
|
+
row << value
|
449
|
+
end
|
450
|
+
table << row
|
451
|
+
end
|
452
|
+
# Combine the rows into a single string table.
|
453
|
+
col_spc = " " * 4
|
454
|
+
table.map{|row| row.join(col_spc)}.join("\n")
|
455
|
+
end
|
456
|
+
|
457
|
+
private :edit_sequence
|
458
|
+
protected :substitute, :delete, :insert
|
459
|
+
end
|
460
|
+
|
461
|
+
# The Levenshtein alignment yields a cost of 1 for insertions,
|
462
|
+
# deletions, and item mismatch. This class is a synonym for
|
463
|
+
# Alignment.
|
464
|
+
class LevenshteinAlignment < Alignment
|
465
|
+
end
|
466
|
+
|
467
|
+
# The Sellers alignment (aka the Needleman-Wunsch alignment) allows
|
468
|
+
# different constant costs to be specified for insertion, deletion,
|
469
|
+
# and substitute match and mismatch operations.
|
470
|
+
class SellersAlignment < Alignment
|
471
|
+
|
472
|
+
# The costs for insert, delete, and substitution match and
|
473
|
+
# mismatch are specified here. The default values for these costs
|
474
|
+
# yield a Levenshtein alignment.
|
475
|
+
def initialize(source, dest, match = 0, mismatch = 1,
|
476
|
+
insert = 1, delete = 1)
|
477
|
+
@match = match
|
478
|
+
@mismatch = mismatch
|
479
|
+
@insert = insert
|
480
|
+
@delete = delete
|
481
|
+
super(source, dest)
|
482
|
+
end
|
483
|
+
|
484
|
+
# The cost of substituting <em>source_item</em> with
|
485
|
+
# <em>dest_item</em>.
|
486
|
+
def substitute(source_item, dest_item)
|
487
|
+
source_item == dest_item ? @match:@mismatch
|
488
|
+
end
|
489
|
+
|
490
|
+
# The cost of deleting <em>source_item</em>.
|
491
|
+
def delete(source_item)
|
492
|
+
@delete
|
493
|
+
end
|
494
|
+
|
495
|
+
# The cost of inserting <em>dest_item</em>.
|
496
|
+
def insert(dest_item)
|
497
|
+
@insert
|
498
|
+
end
|
499
|
+
|
500
|
+
protected :substitute, :delete, :insert
|
501
|
+
end
|
502
|
+
|
503
|
+
# The Wagner-Fischer alignment specifies the same cost for insertion
|
504
|
+
# and deletion operations, another for item match, and another for
|
505
|
+
# item mismatch.
|
506
|
+
class WagnerFischerAlignment < SellersAlignment
|
507
|
+
|
508
|
+
# The costs for insert/delete operations and character match and
|
509
|
+
# mismatch are specified here. The default values for these costs
|
510
|
+
# yield a Levenshtein alignment.
|
511
|
+
def initialize(source, dest, match = 0, mismatch = 1, insert_delete = 1)
|
512
|
+
super(source, dest, match, mismatch, insert_delete, insert_delete)
|
513
|
+
end
|
514
|
+
end
|
515
|
+
|
516
|
+
end
|
@@ -0,0 +1,167 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Editalign.
|
7
|
+
#
|
8
|
+
# Editalign is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Editalign is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with editalign; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
# Test cases for the EditAlign module
|
25
|
+
|
26
|
+
require 'test/unit'
|
27
|
+
require 'editalign'
|
28
|
+
|
29
|
+
|
30
|
+
# Alignment of two strings with the base Alignment class.
class LevenshteinStringAlignments < Test::Unit::TestCase
  # Aligns "captained" with "caspian" and checks the edit distance, the
  # operation list, and both padded item rows.
  def test_captained_caspian
    alignment = EditAlign::Alignment.new("captained", "caspian")

    expected_operations = [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(5, alignment.edit_distance)
    assert_equal(expected_operations, alignment.edit_operations)
    assert_equal(expected_source, alignment.source_alignment)
    assert_equal(expected_dest, alignment.dest_alignment)
  end
end
|
40
|
+
|
41
|
+
|
42
|
+
# Tests for EditAlign::SellersAlignment with default and custom costs.
class SellersAlignments < Test::Unit::TestCase
  # With no costs given, "captained" -> "caspian" is expected to have
  # distance 5 and the operation list and padded rows asserted below.
  def test_captained_caspian
    a = EditAlign::SellersAlignment.new("captained", "caspian")
    assert_kind_of EditAlign::Alignment, a
    assert_equal 5, a.edit_distance
    assert_equal [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete], a.edit_operations
    assert_equal ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"], a.source_alignment
    assert_equal ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil], a.dest_alignment
  end

  # Passes four non-integer costs (0.1, 0.5, 0.8, 0.9) after the two
  # strings and checks the resulting distance, operations, and rows.
  def test_captained_caspian_custom_weights
    a = EditAlign::SellersAlignment.new("captained", "caspian", 0.1, 0.5, 0.8, 0.9)
    assert_kind_of EditAlign::Alignment, a
    # The expected distance is a Float derived from non-integer costs, so
    # it is compared within a small tolerance instead of for exact
    # equality; binary floating point need not reproduce 4.1 bit-for-bit.
    assert_in_delta 4.1, a.edit_distance, 1e-9
    assert_equal [nil, nil, :substitute, :substitute, :delete, nil, :substitute, :delete, :substitute], a.edit_operations
    assert_equal ["c", "a", "p", "t", "a", "i", "n", "e", "d"], a.source_alignment
    assert_equal ["c", "a", "s", "p", nil, "i", "a", nil, "n"], a.dest_alignment
  end
end
|
61
|
+
|
62
|
+
|
63
|
+
# Tests for EditAlign::WagnerFischerAlignment.
class WagnerFischerAlignments < Test::Unit::TestCase
  # Default costs: "captained" -> "caspian" is expected to give distance
  # 5 with one substitution in the operation list.
  def test_captained_caspian
    alignment = EditAlign::WagnerFischerAlignment.new("captained", "caspian")

    expected_operations = [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(5, alignment.edit_distance)
    assert_equal(expected_operations, alignment.edit_operations)
    assert_equal(expected_source, alignment.source_alignment)
    assert_equal(expected_dest, alignment.dest_alignment)
  end

  # Costs 0, 20, 1 are passed after the strings; the expected operation
  # list then contains only inserts and deletes, with distance 6.
  def test_captained_caspian_high_substitution_cost
    alignment = EditAlign::WagnerFischerAlignment.new("captained", "caspian", 0, 20, 1)

    expected_operations = [nil, nil, :insert, nil, :delete, :delete, nil, :insert, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", nil, "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", nil, nil, "i", "a", "n", nil, nil]

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(6, alignment.edit_distance)
    assert_equal(expected_operations, alignment.edit_operations)
    assert_equal(expected_source, alignment.source_alignment)
    assert_equal(expected_dest, alignment.dest_alignment)
  end
end
|
82
|
+
|
83
|
+
|
84
|
+
# Tests for alternative argument forms: arrays as input, and an explicit
# marker argument to the alignment accessors.
class ParameterOptions < Test::Unit::TestCase
  # Arrays of one-character strings are expected to align exactly like
  # the strings "captained" and "caspian".
  def test_array_init
    alignment = EditAlign::Alignment.new(%w[c a p t a i n e d], %w[c a s p i a n])

    expected_operations = [nil, nil, :insert, nil, :substitute, nil, :delete, nil, :delete, :delete]
    expected_source = ["c", "a", nil, "p", "t", "a", "i", "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", "i", "a", nil, "n", nil, nil]

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(5, alignment.edit_distance)
    assert_equal(expected_operations, alignment.edit_operations)
    assert_equal(expected_source, alignment.source_alignment)
    assert_equal(expected_dest, alignment.dest_alignment)
  end

  # When "X" is passed to the accessors, the expected arrays hold "X" in
  # the positions that hold nil in the no-argument form.
  def test_marker
    alignment = EditAlign::Alignment.new("captained", "caspian")
    marker = "X"

    expected_operations = ["X", "X", :insert, "X", :substitute, "X", :delete, "X", :delete, :delete]
    expected_source = ["c", "a", "X", "p", "t", "a", "i", "n", "e", "d"]
    expected_dest = ["c", "a", "s", "p", "i", "a", "X", "n", "X", "X"]

    assert_equal(expected_operations, alignment.edit_operations(marker))
    assert_equal(expected_source, alignment.source_alignment(marker))
    assert_equal(expected_dest, alignment.dest_alignment(marker))
  end
end
|
101
|
+
|
102
|
+
|
103
|
+
# Edge cases: empty inputs on either or both sides, and identical inputs.
class BoundaryConditions < Test::Unit::TestCase
  # Two empty arrays: distance 0 and every result array empty.
  def test_empty_alignment
    alignment = EditAlign::Alignment.new([], [])

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(0, alignment.edit_distance)
    assert_equal([], alignment.source_alignment)
    assert_equal([], alignment.dest_alignment)
    assert_equal([], alignment.edit_operations)
  end

  # Three source items against an empty destination: three deletes.
  def test_nonempty_empty_alignment
    alignment = EditAlign::Alignment.new(%w[A B C], [])

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(3, alignment.edit_distance)
    assert_equal([:delete] * 3, alignment.edit_operations)
    assert_equal(%w[A B C], alignment.source_alignment)
    assert_equal([nil] * 3, alignment.dest_alignment)
  end

  # An empty source against three destination items: three inserts.
  def test_empty_nonempty_alignment
    alignment = EditAlign::Alignment.new([], %w[A B C])

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(3, alignment.edit_distance)
    assert_equal([:insert] * 3, alignment.edit_operations)
    assert_equal([nil] * 3, alignment.source_alignment)
    assert_equal(%w[A B C], alignment.dest_alignment)
  end

  # Equal inputs: distance 0 and a nil entry for every operation.
  def test_identical
    alignment = EditAlign::Alignment.new(%w[A B C], %w[A B C])

    assert_kind_of(EditAlign::Alignment, alignment)
    assert_equal(0, alignment.edit_distance)
    assert_equal([nil] * 3, alignment.edit_operations)
    assert_equal(%w[A B C], alignment.source_alignment)
    assert_equal(%w[A B C], alignment.dest_alignment)
  end
end
|
140
|
+
|
141
|
+
# Checks the string form returned by Alignment#to_s.
class Stringification < Test::Unit::TestCase
|
142
|
+
# Builds the "captained"/"caspian" alignment and compares a.to_s with the
# expected multi-line string assigned to s below.
# NOTE(review): that literal is whitespace-sensitive; confirm the leading
# spaces of its third line against the actual Alignment#to_s output.
def test_captained_caspian
|
143
|
+
a = EditAlign::Alignment.new("captained", "caspian")
|
144
|
+
s = \
|
145
|
+
"ca-ptained
|
146
|
+
caspia-n--
|
147
|
+
I S D DD
|
148
|
+
5"
|
149
|
+
assert_equal s, a.to_s
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
# Cross-checks the default Alignment against a variant that mixes in
# EditAlign::ExhaustiveSearch.
class CompareSearchAlgorithm < Test::Unit::TestCase

  # A SellersAlignment with EditAlign::ExhaustiveSearch mixed in.
  # NOTE(review): presumably the mixin replaces the default search
  # strategy -- confirm against the ExhaustiveSearch module.
  class ExhaustiveAlignment < EditAlign::SellersAlignment
    include EditAlign::ExhaustiveSearch
  end

  # Both alignments of "captained"/"caspian" must agree on the distance,
  # the operation list, and both padded item rows.
  def test_dijkstra_exhaustive
    default_alignment = EditAlign::Alignment.new("captained", "caspian")
    exhaustive_alignment = ExhaustiveAlignment.new("captained", "caspian")

    [:edit_distance, :edit_operations, :source_alignment, :dest_alignment].each do |accessor|
      assert_equal(default_alignment.send(accessor), exhaustive_alignment.send(accessor))
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
!ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: editalign
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2006-05-30 00:00:00 -07:00
|
8
|
+
summary: Edit alignments between arrays
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: billmcn@u.washington.edu
|
12
|
+
homepage: http://staff.washington.edu/billmcn/index.shtml
|
13
|
+
rubyforge_project:
|
14
|
+
description: This module performs edit alignments between arrays. It returns alignments and edit distances.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- W.P. McNeill
|
30
|
+
files:
|
31
|
+
- test/test_editalign.rb
|
32
|
+
- lib/editalign.rb
|
33
|
+
- examples/align-strings
|
34
|
+
- examples/stress-test
|
35
|
+
- README
|
36
|
+
test_files:
|
37
|
+
- test/test_editalign.rb
|
38
|
+
rdoc_options:
|
39
|
+
- --title
|
40
|
+
- EditAlign -- Ruby Edit Alignment
|
41
|
+
- --main
|
42
|
+
- README
|
43
|
+
- --line-numbers
|
44
|
+
- --inline-source
|
45
|
+
extra_rdoc_files:
|
46
|
+
- README
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
requirements: []
|
52
|
+
|
53
|
+
dependencies:
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: PriorityQueue
|
56
|
+
version_requirement:
|
57
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.0.0
|
62
|
+
version:
|