editalign 1.0.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +4 -2
- data/examples/align-strings +34 -28
- data/examples/stress-test +23 -16
- data/lib/editalign.rb +30 -26
- data/test/test_editalign.rb +2 -2
- metadata +17 -24
data/README
CHANGED
@@ -43,6 +43,8 @@ arrays. See the EditAlign::DijkstraSearch class for details.
|
|
43
43
|
= History
|
44
44
|
|
45
45
|
* 1-0-0 ... First version
|
46
|
+
* 1-1-0 ... Added EditAlign::VERSION
|
47
|
+
* 1-1-1 ... Improved command-line handling in align-strings and stress-test
|
46
48
|
|
47
49
|
= See Also
|
48
50
|
|
@@ -55,11 +57,11 @@ strategy and optimizations to the one implemented here.
|
|
55
57
|
|
56
58
|
= Copyright
|
57
59
|
|
58
|
-
Copyright 2006, William Patrick McNeill
|
60
|
+
Copyright 2006-2009, William Patrick McNeill
|
59
61
|
|
60
62
|
This program is distributed under the GNU General Public License.
|
61
63
|
|
62
64
|
= Author
|
63
65
|
|
64
|
-
W.P. McNeill mailto:billmcn@
|
66
|
+
W.P. McNeill mailto:billmcn@gmail.com
|
65
67
|
|
data/examples/align-strings
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
#!/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
#--
|
4
|
-
# Copyright 2006 William Patrick McNeill
|
4
|
+
# Copyright 2006-2009 William Patrick McNeill
|
5
5
|
#
|
6
6
|
# This file is part of Editalign.
|
7
7
|
#
|
@@ -24,49 +24,55 @@
|
|
24
24
|
# Print the character alignment between two strings passed in on the
|
25
25
|
# command line.
|
26
26
|
|
27
|
-
require 'getoptlong'
|
27
|
+
require 'optparse'
|
28
28
|
require 'editalign'
|
29
29
|
|
30
30
|
class ExhaustiveSellersAlignment < EditAlign::SellersAlignment
|
31
31
|
include EditAlign::ExhaustiveSearch
|
32
32
|
end
|
33
33
|
|
34
|
-
# Process command line options.
|
35
|
-
opts = GetoptLong.new(["--match", "-m", GetoptLong::REQUIRED_ARGUMENT],
|
36
|
-
["--nomatch", "-n", GetoptLong::REQUIRED_ARGUMENT],
|
37
|
-
["--insert", "-i",GetoptLong::REQUIRED_ARGUMENT],
|
38
|
-
["--delete", "-d", GetoptLong::REQUIRED_ARGUMENT],
|
39
|
-
["--exhaustive", "-e", GetoptLong::NO_ARGUMENT]
|
40
|
-
)
|
41
|
-
|
42
34
|
match = 0
|
43
|
-
|
35
|
+
substitute = 1
|
44
36
|
insert = 1
|
45
37
|
delete = 1
|
46
38
|
exhaustive = false
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
exhaustive = true
|
39
|
+
OptionParser.new do |opts|
|
40
|
+
opts.banner =<<-EOTEXT
|
41
|
+
#{File.basename(__FILE__)} [OPTION] source dest
|
42
|
+
|
43
|
+
Perform an edit distance alignment between the source and dest strings.
|
44
|
+
|
45
|
+
The options may be used to specify match, substitution, insertion, and
|
46
|
+
deletion costs.
|
47
|
+
EOTEXT
|
48
|
+
opts.on("-m", "--match COST", Float, "Match cost (default 0)") do |value|
|
49
|
+
match = value
|
59
50
|
end
|
60
|
-
|
51
|
+
opts.on("-s", "--substitution COST", Float, "Substitution cost (default 1)") do |value|
|
52
|
+
substitute = value
|
53
|
+
end
|
54
|
+
opts.on("-i", "--insertion COST", Float, "Insertion cost (default 1)") do |value|
|
55
|
+
insert = value
|
56
|
+
end
|
57
|
+
opts.on("-d", "--deletion COST", Float, "Deletion cost (default 1)") do |value|
|
58
|
+
delete = value
|
59
|
+
end
|
60
|
+
opts.on("-e", "--exhaustive", "Do exhaustive search for comparison (default false)") do |value|
|
61
|
+
exhaustive = value
|
62
|
+
end
|
63
|
+
end.parse!
|
61
64
|
|
62
|
-
|
63
|
-
|
65
|
+
if not ARGV.length == 2
|
66
|
+
puts "Incorrect number of arguments."
|
67
|
+
exit(0)
|
68
|
+
end
|
69
|
+
source, dest = ARGV
|
64
70
|
|
65
71
|
# Do alignments and print results.
|
66
72
|
alignments = [EditAlign::SellersAlignment]
|
67
73
|
alignments << ExhaustiveSellersAlignment if exhaustive
|
68
74
|
|
69
75
|
alignments.each do |align_class|
|
70
|
-
a = align_class.new(source, dest, match,
|
76
|
+
a = align_class.new(source, dest, match, substitute, insert, delete)
|
71
77
|
puts a, a.to_grid
|
72
78
|
end
|
data/examples/stress-test
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
#!/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
#--
|
4
|
-
# Copyright 2006 William Patrick McNeill
|
4
|
+
# Copyright 2006-2009 William Patrick McNeill
|
5
5
|
#
|
6
6
|
# This file is part of Editalign.
|
7
7
|
#
|
@@ -21,7 +21,7 @@
|
|
21
21
|
#
|
22
22
|
#++
|
23
23
|
|
24
|
-
require "getoptlong"
|
24
|
+
require "optparse"
|
25
25
|
require "editalign"
|
26
26
|
|
27
27
|
class PureRubyEditAlign < EditAlign::Alignment
|
@@ -51,24 +51,31 @@ end
|
|
51
51
|
# * verbose ... prints strings and edit distances
|
52
52
|
# * pure-ruby ... uses the pure Ruby priority queue instead of the C extension
|
53
53
|
# * int-cost ... uses integer costs
|
54
|
-
def parse_command_line
|
55
|
-
opts = GetoptLong.new(["--verbose", "-v", GetoptLong::NO_ARGUMENT],
|
56
|
-
["--pure-ruby", "-p", GetoptLong::NO_ARGUMENT],
|
57
|
-
["--int-cost", "-i", GetoptLong::NO_ARGUMENT])
|
54
|
+
def parse_command_line
|
58
55
|
verbose = false
|
59
56
|
pure_ruby = false
|
60
57
|
int_cost = false
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
int_cost =
|
58
|
+
OptionParser.new do |opts|
|
59
|
+
opts.banner =<<-EOTEXT
|
60
|
+
#{File.basename(__FILE__)} [OPTION] trials length edits
|
61
|
+
|
62
|
+
Perform a stress test on the alignment code.
|
63
|
+
EOTEXT
|
64
|
+
opts.on("-i", "--int-cost", "Use integer costs") do |value|
|
65
|
+
int_cost = value
|
69
66
|
end
|
67
|
+
opts.on("-p", "--pure-ruby", "Use the pure Ruby priority queue instead of the C extension") do |value|
|
68
|
+
pure_ruby = value
|
69
|
+
end
|
70
|
+
opts.on("-v", "--verbose", "Print strings and edit distances") do |value|
|
71
|
+
verbose = value
|
72
|
+
end
|
73
|
+
end.parse!
|
74
|
+
|
75
|
+
if not ARGV.length == 3
|
76
|
+
puts "Incorrect number of arguments."
|
77
|
+
exit(0)
|
70
78
|
end
|
71
|
-
|
72
79
|
trials = Integer(ARGV[0])
|
73
80
|
length = Integer(ARGV[1])
|
74
81
|
edits = Integer(ARGV[2])
|
data/lib/editalign.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2006 William Patrick McNeill
|
1
|
+
# Copyright 2006-2009 William Patrick McNeill
|
2
2
|
#
|
3
3
|
# Editalign is free software; you can redistribute it and/or modify it
|
4
4
|
# under the terms of the GNU General Public License as published by
|
@@ -18,6 +18,8 @@
|
|
18
18
|
# functions.
|
19
19
|
module EditAlign
|
20
20
|
|
21
|
+
VERSION = "1.1.1"
|
22
|
+
|
21
23
|
# This module employs Dijkstra's algorithm to find the lowest-cost
|
22
24
|
# sequence of edit operations that will transform the source array
|
23
25
|
# into the destination array. The alignment grid is treated as a
|
@@ -86,8 +88,9 @@ module EditAlign
|
|
86
88
|
next unless next_cost < @cost[next_cell]
|
87
89
|
@cost[next_cell] = next_cost
|
88
90
|
@backtrace[next_cell] = cell
|
89
|
-
agenda[next_cell] = priority_factory(next_cost, next_cell) \
|
90
91
|
unless next_cost >= @cost[@end]
|
92
|
+
agenda[next_cell] = priority_factory(next_cost, next_cell)
|
93
|
+
end
|
91
94
|
end
|
92
95
|
end
|
93
96
|
end
|
@@ -207,32 +210,31 @@ module EditAlign
|
|
207
210
|
|
208
211
|
|
209
212
|
# The Alignment class is given a source and destination array at
|
210
|
-
# construction time. It does a dynamic programming alignment
|
211
|
-
#
|
212
|
-
#
|
213
|
+
# construction time. It does a dynamic programming alignment between them
|
214
|
+
# and makes the results of that alignment available through instance
|
215
|
+
# methods.
|
213
216
|
#
|
214
|
-
# If there are multiple alignments with equal edit distances
|
215
|
-
#
|
217
|
+
# If there are multiple alignments with equal edit distances, Alignment will
|
218
|
+
# find one of them. Which one is undefined.
|
216
219
|
#
|
217
|
-
# Alignment works by constructing a matrix with dimensions equal to
|
218
|
-
#
|
219
|
-
#
|
220
|
-
#
|
221
|
-
#
|
222
|
-
#
|
223
|
-
#
|
220
|
+
# Alignment works by constructing a matrix with dimensions equal to the
|
221
|
+
# length of the source and destination arrays. Moving horizontally and
|
222
|
+
# vertically in the matrix represents insertion and deletion operations,
|
223
|
+
# respectively, while moving diagonally represents substitution. Each cell
|
224
|
+
# of the matrix contains the minimum cost it takes to reach that cell. The
|
225
|
+
# algorithm fills in cells in the matrix until it reaches the furthest
|
226
|
+
# corner.
|
224
227
|
#
|
225
|
-
# The search is done using Dijkstra's algorithm as implemented in
|
226
|
-
#
|
227
|
-
#
|
228
|
-
# #find_lowest_cost_alignment function.
|
228
|
+
# The search is done using Dijkstra's algorithm as implemented in the
|
229
|
+
# DijkstraSearch module. A different search algorithm may be specified by
|
230
|
+
# including a mixin that redefines the #find_lowest_cost_alignment function.
|
229
231
|
#
|
230
|
-
# This class uses Levenshtein weighting scheme. Levenshtein assigns
|
231
|
-
#
|
232
|
-
#
|
233
|
-
#
|
234
|
-
#
|
235
|
-
#
|
232
|
+
# This class uses the Levenshtein weighting scheme. Levenshtein assigns a
|
233
|
+
# cost of 1 to insertions and deletions. It assigns a cost of 1 to
|
234
|
+
# substitutions when the items are different and 0 when they are the same.
|
235
|
+
# Different weighting schemes may be specified by overriding the #insert,
|
236
|
+
# #delete, and #substitute functions. The costs must be non-negative
|
237
|
+
# numbers.
|
236
238
|
class Alignment
|
237
239
|
include DijkstraSearch
|
238
240
|
|
@@ -245,6 +247,8 @@ module EditAlign
|
|
245
247
|
# Optionally either <em>source</em> or <em>dest</em> may be
|
246
248
|
# strings, in which case they will be treated as arrays of characters.
|
247
249
|
def initialize(source, dest)
|
250
|
+
@path = nil
|
251
|
+
|
248
252
|
# Convert strings into arrays.
|
249
253
|
source = source.unpack('U*').collect {|c| c.chr} if source.class == String
|
250
254
|
dest = dest.unpack('U*').collect {|c| c.chr} if dest.class == String
|
@@ -314,7 +318,7 @@ module EditAlign
|
|
314
318
|
def edit_sequence # :yields: cell, {:substitute, :insert, :delete}
|
315
319
|
# The first time this function is called, walk backwards through
|
316
320
|
# the backtrace to create the @path instance variable.
|
317
|
-
if
|
321
|
+
if @path.nil?
|
318
322
|
@path = [@end]
|
319
323
|
while cell = @backtrace[@path[0]]
|
320
324
|
@path.unshift(cell)
|
@@ -400,7 +404,7 @@ module EditAlign
|
|
400
404
|
# => true
|
401
405
|
# irb(main):002:0> a = EditAlign::Alignment.new('captained', 'caspian')
|
402
406
|
# => <Alignment: 5>
|
403
|
-
# irb(main):003:0> puts a
|
407
|
+
# irb(main):003:0> puts a.to_grid
|
404
408
|
# - c a p t a i n e d
|
405
409
|
# - [0.00] 1.00 2.00 3.00 4.00 5.00 * * * *
|
406
410
|
# c 1.00 [0.00] 1.00 2.00 3.00 4.00 5.00 * * *
|
data/test/test_editalign.rb
CHANGED
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
|
-
!ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.2
|
3
3
|
specification_version: 1
|
4
4
|
name: editalign
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.0.0
|
7
|
-
date:
|
8
|
-
summary:
|
6
|
+
version: 1.1.1
|
7
|
+
date: 2009-04-22 00:00:00 -07:00
|
8
|
+
summary: EditAlign calculates dynamic programming alignments between arrays
|
9
9
|
require_paths:
|
10
10
|
- lib
|
11
|
-
email: billmcn@
|
12
|
-
homepage: http://
|
13
|
-
rubyforge_project:
|
11
|
+
email: billmcn@gmail.com
|
12
|
+
homepage: http://editalign.rubyforge.org/
|
13
|
+
rubyforge_project: editalign
|
14
14
|
description: This module performs edit alignments between arrays. It returns alignments and edit distances.
|
15
15
|
autorequire:
|
16
16
|
default_executable:
|
@@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
+
post_install_message:
|
28
29
|
authors:
|
29
30
|
- W.P. McNeill
|
30
31
|
files:
|
@@ -36,12 +37,12 @@ files:
|
|
36
37
|
test_files:
|
37
38
|
- test/test_editalign.rb
|
38
39
|
rdoc_options:
|
39
|
-
- --title
|
40
|
-
- EditAlign --
|
41
|
-
- --main
|
42
|
-
- README
|
43
|
-
- --line-numbers
|
44
|
-
- --inline-source
|
40
|
+
- - --title
|
41
|
+
- EditAlign -- Edit Alignment
|
42
|
+
- --main
|
43
|
+
- README
|
44
|
+
- --line-numbers
|
45
|
+
- --inline-source
|
45
46
|
extra_rdoc_files:
|
46
47
|
- README
|
47
48
|
executables: []
|
@@ -50,13 +51,5 @@ extensions: []
|
|
50
51
|
|
51
52
|
requirements: []
|
52
53
|
|
53
|
-
dependencies:
|
54
|
-
|
55
|
-
name: PriorityQueue
|
56
|
-
version_requirement:
|
57
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">"
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: 0.0.0
|
62
|
-
version:
|
54
|
+
dependencies: []
|
55
|
+
|