text_alignment 0.4 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +32 -37
- data/lib/text_alignment/anchor_finder.rb +2 -5
- data/lib/text_alignment/approximate_fit.rb +4 -6
- data/lib/text_alignment/constants.rb +7 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +0 -2
- data/lib/text_alignment/lcs_comparison.rb +2 -1
- data/lib/text_alignment/mixed_alignment.rb +1 -54
- data/lib/text_alignment/text_alignment.rb +193 -13
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07e02285cce988b857421b5eae20c6b39394dbd0c904de2f416344cad69b725f
|
4
|
+
data.tar.gz: 6ccdf9930bc97fc5bc6fc2a2d92f732867744342a6c828eaa3ac8029339f33c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 477e77a9349857cd9ab69b5da4d048dd2b36b1d51a7af742a9d05cb1ee20eba8c6d67cd4882c380f5e2737be1ca0d6fa13c0a81b2c3b257a7f249ccb1f4e589c
|
7
|
+
data.tar.gz: 904fe737512c8774a03c23ae7229df55dd0727fd1a41d3a73031b96e6458920c78c02c66990577624c830525734918c54a8821d1e700eb55d7c8b871ecd6edf3
|
data/bin/align_annotations
CHANGED
@@ -35,6 +35,10 @@ def align_mdoc(source_annotations, target_annotations)
|
|
35
35
|
source_annotations.each do |annotations|
|
36
36
|
alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
|
37
37
|
|
38
|
+
puts alignment.alignment_show
|
39
|
+
puts "-----"
|
40
|
+
puts
|
41
|
+
|
38
42
|
# alignment.block_alignments.each do |a|
|
39
43
|
# p {source:a[:source], target:a[:target]}
|
40
44
|
# puts "--"
|
@@ -103,48 +107,39 @@ target_annotations = if source_annotations.class == Array
|
|
103
107
|
else
|
104
108
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
109
|
|
106
|
-
pp alignment
|
110
|
+
# pp alignment
|
107
111
|
|
108
112
|
# verification
|
109
|
-
source_text = source_annotations[:text]
|
110
|
-
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.rstrip.length).each do |p|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
end
|
119
|
-
puts
|
120
|
-
puts "=====END"
|
121
|
-
|
122
|
-
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.rstrip.length).each do |p|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
else
|
128
|
-
print '.'
|
129
|
-
end
|
130
|
-
end
|
131
|
-
puts
|
132
|
-
puts "=====END"
|
133
|
-
|
134
|
-
# alignment.block_alignments.each do |a|
|
135
|
-
# if a[:alignment].nil? || a[:alignment] == :empty
|
136
|
-
# # p [a[:source], a[:target]]
|
137
|
-
# # p a[:alignment]
|
113
|
+
# source_text = source_annotations[:text]
|
114
|
+
# puts "=====BEGIN"
|
115
|
+
# (0 ... source_text.rstrip.length).each do |p|
|
116
|
+
# t = alignment.transform_begin_position(p)
|
117
|
+
# if t.nil?
|
118
|
+
# print source_text[p]
|
119
|
+
# else
|
120
|
+
# print '.'
|
121
|
+
# end
|
122
|
+
# end
|
123
|
+
# puts
|
124
|
+
# puts "=====END"
|
125
|
+
|
126
|
+
# puts "=====BEGIN"
|
127
|
+
# (0 .. source_text.rstrip.length).each do |p|
|
128
|
+
# t = alignment.transform_end_position(p)
|
129
|
+
# if t.nil?
|
130
|
+
# print source_text[p]
|
138
131
|
# else
|
139
|
-
#
|
140
|
-
# p a[:alignment].similarity
|
141
|
-
# puts "--"
|
142
|
-
# puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
143
|
-
# puts "--"
|
144
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
145
|
-
# puts "======"
|
132
|
+
# print '.'
|
146
133
|
# end
|
147
134
|
# end
|
135
|
+
# puts
|
136
|
+
# puts "=====END"
|
137
|
+
|
138
|
+
source_text = source_annotations[:text]
|
139
|
+
|
140
|
+
puts "[block alignment]"
|
141
|
+
puts alignment.alignment_show
|
142
|
+
puts "====="
|
148
143
|
# exit
|
149
144
|
|
150
145
|
# verification of source denotations
|
@@ -1,18 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
|
-
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
-
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
|
-
|
10
7
|
class TextAlignment::AnchorFinder
|
11
8
|
|
12
9
|
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
13
10
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
11
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
15
|
-
@sim_threshold =
|
12
|
+
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
16
13
|
|
17
14
|
@reverse = (target_str.length < source_str.length)
|
18
15
|
|
@@ -1,13 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
7
|
# approximate the location of str1 in str2
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
9
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
10
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
11
9
|
|
12
10
|
class << TextAlignment
|
13
11
|
|
@@ -16,8 +14,8 @@ class << TextAlignment
|
|
16
14
|
raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
|
17
15
|
return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
18
16
|
|
19
|
-
ngram1 = (0 .. str1.length - TextAlignment::
|
20
|
-
ngram2 = (0 .. str2.length - TextAlignment::
|
17
|
+
ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
|
18
|
+
ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
|
21
19
|
ngram_shared = ngram1 & ngram2
|
22
20
|
|
23
21
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
@@ -45,7 +43,7 @@ class << TextAlignment
|
|
45
43
|
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
44
|
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
47
45
|
|
48
|
-
break if text_similarity > TextAlignment::
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
49
47
|
fit_begin, fit_end = nil, nil
|
50
48
|
end
|
51
49
|
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment

# Shared default values for the TextAlignment namespace.
#
# Every assignment is guarded by `unless defined?`, so a value that already
# exists when this file is loaded is kept rather than overwritten.

# Presumably the n-gram length used when comparing texts -- confirm against callers.
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM

# Presumably the size of the comparison window, in characters -- confirm against callers.
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW

# NOTE(review): the uses of BUFFER_RATE and BUFFER_MIN are not visible here;
# the names suggest a relative margin and its lower bound -- confirm.
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN

# Presumably the similarity score above which two texts are treated as matching -- confirm.
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
|
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
12
|
-
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
13
|
-
|
14
12
|
class TextAlignment::GLCSTextAlignment
|
15
13
|
attr_reader :position_map_begin, :position_map_end
|
16
14
|
attr_reader :common_elements, :mapped_elements
|
@@ -33,7 +33,8 @@ class TextAlignment::LCSComparison
|
|
33
33
|
@str2_match_initial = sdiff[match_initial].new_position
|
34
34
|
@str1_match_final = sdiff[match_final].old_position
|
35
35
|
@str2_match_final = sdiff[match_final].new_position
|
36
|
-
|
36
|
+
mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
37
|
+
@similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
|
37
38
|
else
|
38
39
|
@str1_match_initial = 0
|
39
40
|
@str2_match_initial = 0
|
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
|
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
13
|
-
TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
|
14
|
-
|
15
13
|
class TextAlignment::MixedAlignment
|
16
14
|
attr_reader :sdiff
|
17
15
|
attr_reader :position_map_begin, :position_map_end
|
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
|
|
21
19
|
|
22
20
|
def initialize(str1, str2, mappings = [])
|
23
21
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
24
|
-
|
25
|
-
|
26
|
-
## preprocessing
|
27
|
-
str1 = str1.dup
|
28
|
-
str2 = str2.dup
|
29
|
-
mappings = mappings.dup
|
30
|
-
|
31
|
-
## find the first nomatch character
|
32
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
33
|
-
if str2.index(c).nil?
|
34
|
-
@nomatch_char1 = c
|
35
|
-
break
|
36
|
-
end
|
37
|
-
end
|
38
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
|
39
|
-
|
40
|
-
## find the first nomatch character
|
41
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
42
|
-
if c != @nomatch_char1 && str1.index(c).nil?
|
43
|
-
@nomatch_char2 = c
|
44
|
-
break
|
45
|
-
end
|
46
|
-
end
|
47
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
|
48
|
-
|
49
|
-
# single character mappings
|
50
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
51
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
52
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
53
|
-
characters_to.gsub!(/-/, '\-')
|
54
|
-
|
55
|
-
str1.tr!(characters_from, characters_to)
|
56
|
-
str2.tr!(characters_from, characters_to)
|
57
|
-
|
58
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
59
|
-
|
60
|
-
# ASCII foldings
|
61
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
62
|
-
ascii_foldings.each do |f|
|
63
|
-
from = f[1]
|
64
|
-
|
65
|
-
if str2.index(f[0])
|
66
|
-
to = f[0] + (@nomatch_char1 * (f[1].length - 1))
|
67
|
-
str1.gsub!(from, to)
|
68
|
-
end
|
69
|
-
|
70
|
-
if str1.index(f[0])
|
71
|
-
to = f[0] + (@nomatch_char2 * (f[1].length - 1))
|
72
|
-
str2.gsub!(from, to)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
22
|
+
mappings ||= []
|
76
23
|
|
77
24
|
_compute_mixed_alignment(str1, str2, mappings)
|
78
25
|
end
|
@@ -1,32 +1,40 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'text_alignment/anchor_finder'
|
3
4
|
require 'text_alignment/mixed_alignment'
|
4
5
|
|
5
6
|
module TextAlignment; end unless defined? TextAlignment
|
6
7
|
|
7
|
-
TextAlignment::
|
8
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
10
|
-
|
8
|
+
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
11
9
|
|
12
10
|
class TextAlignment::TextAlignment
|
13
11
|
attr_reader :block_alignments
|
14
12
|
attr_reader :similarity
|
15
13
|
attr_reader :lost_annotations
|
16
14
|
|
17
|
-
def initialize(
|
18
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
|
+
|
18
|
+
@ostr1 = _str1
|
19
|
+
@ostr2 = _str2
|
19
20
|
|
20
|
-
mappings
|
21
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
21
22
|
|
22
23
|
# try exact match
|
23
24
|
block_begin = str2.index(str1)
|
24
25
|
unless block_begin.nil?
|
25
|
-
@block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
|
26
|
+
@block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
27
|
+
return @block_alignments
|
28
|
+
end
|
29
|
+
|
30
|
+
# try case-insensitive exact match
|
31
|
+
block_begin = str2.downcase.index(str1.downcase)
|
32
|
+
unless block_begin.nil?
|
33
|
+
@block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
34
|
return @block_alignments
|
27
35
|
end
|
28
36
|
|
29
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
|
37
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
30
38
|
|
31
39
|
# To collect matched blocks
|
32
40
|
mblocks = []
|
@@ -93,7 +101,7 @@ class TextAlignment::TextAlignment
|
|
93
101
|
end
|
94
102
|
end
|
95
103
|
end
|
96
|
-
@block_alignments << mblocks[0]
|
104
|
+
@block_alignments << mblocks[0].merge(alignment: :block)
|
97
105
|
|
98
106
|
(1 ... mblocks.length).each do |i|
|
99
107
|
b1 = mblocks[i - 1][:source][:end]
|
@@ -114,7 +122,7 @@ class TextAlignment::TextAlignment
|
|
114
122
|
end
|
115
123
|
end
|
116
124
|
end
|
117
|
-
@block_alignments << mblocks[i]
|
125
|
+
@block_alignments << mblocks[i].merge(alignment: :block)
|
118
126
|
end
|
119
127
|
|
120
128
|
# Final step
|
@@ -156,7 +164,7 @@ class TextAlignment::TextAlignment
|
|
156
164
|
i = @block_alignments.index{|b| b[:source][:end] > begin_position}
|
157
165
|
block_alignment = @block_alignments[i]
|
158
166
|
|
159
|
-
b = if block_alignment[:alignment]
|
167
|
+
b = if block_alignment[:alignment] == :block
|
160
168
|
begin_position + block_alignment[:delta]
|
161
169
|
elsif block_alignment[:alignment] == :empty
|
162
170
|
if begin_position == block_alignment[:source][:begin]
|
@@ -175,7 +183,7 @@ class TextAlignment::TextAlignment
|
|
175
183
|
i = @block_alignments.index{|b| b[:source][:end] >= end_position}
|
176
184
|
block_alignment = @block_alignments[i]
|
177
185
|
|
178
|
-
e = if block_alignment[:alignment]
|
186
|
+
e = if block_alignment[:alignment] == :block
|
179
187
|
end_position + block_alignment[:delta]
|
180
188
|
elsif block_alignment[:alignment] == :empty
|
181
189
|
if end_position == block_alignment[:source][:end]
|
@@ -232,4 +240,176 @@ class TextAlignment::TextAlignment
|
|
232
240
|
r
|
233
241
|
end
|
234
242
|
|
243
|
+
# Renders @block_alignments as an HTML table and returns it as a String.
#
# For every block alignment a header row (from alignment_table_th) is emitted,
# followed by one content row whose shape depends on a[:alignment]:
#
# * :block -- a single cell spanning both columns, holding the source slice
#             of @ostr1.
# * :empty -- two cells: the source slice of @ostr1 and the target slice of
#             @ostr2.
# * otherwise a[:alignment] must respond to #sdiff; each entry's #action
#   ('=', '+', '-', '!') decides which character is shown on each side, with
#   '_' used as a placeholder where one side has no counterpart.
#
# NOTE(review): the text is inserted into the markup without HTML escaping --
# confirm whether callers need '<', '>' and '&' to be escaped.
#
# @return [String] the HTML table
def alignment_table
  html = [
    "<table class='text_alignment_table'>",
    "<thead>",
    "<tr>",
    "<th class='text_alignment_left' style='width:50%'>Text 1</th>",
    "<th class='text_alignment_right'>Text 2</th>",
    "</tr>",
    "</thead>",
    "<tbody>"
  ].join("\n") << "\n"

  @block_alignments.each do |a|
    source_span = a[:source]
    target_span = a[:target]

    cells =
      case a[:alignment]
      when :block
        "<td colspan='2' class='text_alignment_common'>" +
          @ostr1[source_span[:begin]...source_span[:end]] +
          "</td>\n"
      when :empty
        "<td class='text_alignment_left'>" + @ostr1[source_span[:begin]...source_span[:end]] + "</td>\n" +
          "<td class='text_alignment_right'>" + @ostr2[target_span[:begin]...target_span[:end]] + "</td>\n"
      else
        changes = a[:alignment].sdiff

        # sdiff positions are relative to the block, so shift them by the
        # block's begin offset to index into the original strings.
        left = changes.map do |c|
          case c.action
          when '=', '-' then @ostr1[c.old_position + source_span[:begin]]
          when '+' then '_'
          when '!' then @ostr1[c.old_position + source_span[:begin]] + '_'
          end
        end.join

        right = changes.map do |c|
          case c.action
          when '=', '+' then @ostr2[c.new_position + target_span[:begin]]
          when '-' then '_'
          when '!' then '_' + @ostr2[c.new_position + target_span[:begin]]
          end
        end.join

        "<td class='text_alignment_left'>" + left + "</td>\n" +
          "<td class='text_alignment_right'>" + right + "</td>\n"
      end

    html << alignment_table_th(a) << "<tr>\n" << cells << "</tr>\n"
  end

  html << '</tbody></table>'
end
|
300
|
+
|
301
|
+
# Builds the header row shown above one block alignment in the HTML table.
#
# The left cell shows the begin and end offsets of the source span, the right
# cell those of the target span.
#
# @param a [Hash] a block alignment; a[:source] and a[:target] are read for
#   their :begin and :end values
# @return [String] a single "<tr>...</tr>" fragment with no trailing newline
def alignment_table_th(a)
  source_span = a[:source]
  target_span = a[:target]

  "<tr>" \
    "<th class='text_alignment_left'>#{source_span[:begin]} - #{source_span[:end]}</th>" \
    "<th class='text_alignment_right'>#{target_span[:begin]} - #{target_span[:end]}</th>" \
    "</tr>"
end
|
307
|
+
|
308
|
+
# Renders @block_alignments as plain text and returns it as a String.
#
# Each block alignment contributes one section, chosen by a[:alignment]:
#
# * :block -- "===== common =====" followed by the source slice of @ostr1.
# * :empty -- the source slice of @ostr1 under "<<<<< string 1" and the
#             target slice of @ostr2 under ">>>>> string 2".
# * otherwise a[:alignment] must respond to #sdiff; the two sides are printed
#   in brackets under "***** local mismatch", with '_' as a placeholder where
#   one side has no counterpart.
#
# @return [String] the concatenated sections ('' when there are no blocks)
def alignment_show
  sections = @block_alignments.map do |a|
    source_span = a[:source]
    target_span = a[:target]

    case a[:alignment]
    when :block
      "===== common =====\n" +
        @ostr1[source_span[:begin]...source_span[:end]] + "\n\n"
    when :empty
      "<<<<< string 1\n" +
        @ostr1[source_span[:begin]...source_span[:end]] + "\n\n" +
        ">>>>> string 2\n" +
        @ostr2[target_span[:begin]...target_span[:end]] + "\n\n"
    else
      changes = a[:alignment].sdiff

      # sdiff positions are relative to the block, so shift them by the
      # block's begin offset to index into the original strings.
      left = changes.map do |c|
        case c.action
        when '=', '-' then @ostr1[c.old_position + source_span[:begin]]
        when '+' then '_'
        when '!' then @ostr1[c.old_position + source_span[:begin]] + '_'
        end
      end.join

      right = changes.map do |c|
        case c.action
        when '=', '+' then @ostr2[c.new_position + target_span[:begin]]
        when '-' then '_'
        when '!' then '_' + @ostr2[c.new_position + target_span[:begin]]
        end
      end.join

      "***** local mismatch\n" +
        "[#{left}]\n" +
        "[#{right}]\n\n"
    end
  end

  sections.join
end
|
359
|
+
|
360
|
+
private
|
361
|
+
|
362
|
+
# Normalises copies of the two strings using TextAlignment::MAPPINGS before
# they are aligned. The arguments themselves are not modified.
#
# Two kinds of mappings are applied here and removed from the returned list:
#
# 1. One-character to one-character mappings: applied to both strings with
#    String#tr!.
# 2. One-character to multi-character mappings ("ASCII foldings"): the
#    multi-character form is replaced by the single character followed by
#    padding letters, so the replacement has the same length as the text it
#    replaces. Each string gets its own padding letter, chosen from
#    TextAlignment::PADDING_LETTERS so that it does not occur in the other
#    string.
#
# @param _str1 [String] first text
# @param _str2 [String] second text
# @return [Array(String, String, Array)] the processed copy of _str1, the
#   processed copy of _str2, and the mappings that were not applied here
# @raise [RuntimeError] when no usable padding letter is available
def string_preprocessing(_str1, _str2)
  str1 = _str1.dup
  str2 = _str2.dup

  single_char = ->(m) { m[0].length == 1 && m[1].length == 1 }
  folding = ->(m) { m[0].length == 1 && m[1].length > 1 }

  ## single character mappings
  character_mappings, mappings = TextAlignment::MAPPINGS.partition(&single_char)

  # String#tr gives '\', '^' and '-' a special meaning. Both sets have to be
  # escaped; otherwise a '-' sitting between two mapped characters is read as
  # a range and the two sets fall out of step.
  tr_escape = ->(chars) { chars.gsub(/[\\^-]/) { |c| "\\#{c}" } }
  characters_from = tr_escape.call(character_mappings.map { |m| m[0] }.join)
  characters_to = tr_escape.call(character_mappings.map { |m| m[1] }.join)

  str1.tr!(characters_from, characters_to)
  str2.tr!(characters_from, characters_to)

  ## long to one character mappings
  pletters = TextAlignment::PADDING_LETTERS

  # the padding letter for str1 must not occur in str2
  padding_letter1 = pletters.find { |l| str2.index(l).nil? }
  raise RuntimeError, "Could not find a padding letter for str1" if padding_letter1.nil?

  # the padding letter for str2 must differ from the first one and must not occur in str1
  padding_letter2 = pletters.find { |l| l != padding_letter1 && str1.index(l).nil? }
  raise RuntimeError, "Could not find a padding letter for str2" if padding_letter2.nil?

  # ASCII foldings
  ascii_foldings, mappings = mappings.partition(&folding)
  ascii_foldings.each do |f|
    char = f[0]
    expansion = f[1]
    padding_length = expansion.length - 1

    str1.gsub!(expansion, char + (padding_letter1 * padding_length)) if str2.index(char)

    # Evaluated after str1 may have been rewritten just above, so a character
    # introduced by that rewrite also counts here (statement order kept as is).
    str2.gsub!(expansion, char + (padding_letter2 * padding_length)) if str1.index(char)
  end

  [str1, str2, mappings]
end
|
414
|
+
|
235
415
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,6 +77,7 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/constants.rb
|
80
81
|
- lib/text_alignment/find_divisions.rb
|
81
82
|
- lib/text_alignment/glcs_alignment.rb
|
82
83
|
- lib/text_alignment/glcs_alignment_fast.rb
|