text_alignment 0.4 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +32 -37
- data/lib/text_alignment/anchor_finder.rb +2 -5
- data/lib/text_alignment/approximate_fit.rb +4 -6
- data/lib/text_alignment/constants.rb +7 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +0 -2
- data/lib/text_alignment/lcs_comparison.rb +2 -1
- data/lib/text_alignment/mixed_alignment.rb +1 -54
- data/lib/text_alignment/text_alignment.rb +193 -13
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07e02285cce988b857421b5eae20c6b39394dbd0c904de2f416344cad69b725f
|
4
|
+
data.tar.gz: 6ccdf9930bc97fc5bc6fc2a2d92f732867744342a6c828eaa3ac8029339f33c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 477e77a9349857cd9ab69b5da4d048dd2b36b1d51a7af742a9d05cb1ee20eba8c6d67cd4882c380f5e2737be1ca0d6fa13c0a81b2c3b257a7f249ccb1f4e589c
|
7
|
+
data.tar.gz: 904fe737512c8774a03c23ae7229df55dd0727fd1a41d3a73031b96e6458920c78c02c66990577624c830525734918c54a8821d1e700eb55d7c8b871ecd6edf3
|
data/bin/align_annotations
CHANGED
@@ -35,6 +35,10 @@ def align_mdoc(source_annotations, target_annotations)
|
|
35
35
|
source_annotations.each do |annotations|
|
36
36
|
alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
|
37
37
|
|
38
|
+
puts alignment.alignment_show
|
39
|
+
puts "-----"
|
40
|
+
puts
|
41
|
+
|
38
42
|
# alignment.block_alignments.each do |a|
|
39
43
|
# p {source:a[:source], target:a[:target]}
|
40
44
|
# puts "--"
|
@@ -103,48 +107,39 @@ target_annotations = if source_annotations.class == Array
|
|
103
107
|
else
|
104
108
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
109
|
|
106
|
-
pp alignment
|
110
|
+
# pp alignment
|
107
111
|
|
108
112
|
# verification
|
109
|
-
source_text = source_annotations[:text]
|
110
|
-
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.rstrip.length).each do |p|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
end
|
119
|
-
puts
|
120
|
-
puts "=====END"
|
121
|
-
|
122
|
-
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.rstrip.length).each do |p|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
else
|
128
|
-
print '.'
|
129
|
-
end
|
130
|
-
end
|
131
|
-
puts
|
132
|
-
puts "=====END"
|
133
|
-
|
134
|
-
# alignment.block_alignments.each do |a|
|
135
|
-
# if a[:alignment].nil? || a[:alignment] == :empty
|
136
|
-
# # p [a[:source], a[:target]]
|
137
|
-
# # p a[:alignment]
|
113
|
+
# source_text = source_annotations[:text]
|
114
|
+
# puts "=====BEGIN"
|
115
|
+
# (0 ... source_text.rstrip.length).each do |p|
|
116
|
+
# t = alignment.transform_begin_position(p)
|
117
|
+
# if t.nil?
|
118
|
+
# print source_text[p]
|
119
|
+
# else
|
120
|
+
# print '.'
|
121
|
+
# end
|
122
|
+
# end
|
123
|
+
# puts
|
124
|
+
# puts "=====END"
|
125
|
+
|
126
|
+
# puts "=====BEGIN"
|
127
|
+
# (0 .. source_text.rstrip.length).each do |p|
|
128
|
+
# t = alignment.transform_end_position(p)
|
129
|
+
# if t.nil?
|
130
|
+
# print source_text[p]
|
138
131
|
# else
|
139
|
-
#
|
140
|
-
# p a[:alignment].similarity
|
141
|
-
# puts "--"
|
142
|
-
# puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
143
|
-
# puts "--"
|
144
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
145
|
-
# puts "======"
|
132
|
+
# print '.'
|
146
133
|
# end
|
147
134
|
# end
|
135
|
+
# puts
|
136
|
+
# puts "=====END"
|
137
|
+
|
138
|
+
source_text = source_annotations[:text]
|
139
|
+
|
140
|
+
puts "[block alignment]"
|
141
|
+
puts alignment.alignment_show
|
142
|
+
puts "====="
|
148
143
|
# exit
|
149
144
|
|
150
145
|
# verification of source denotations
|
@@ -1,18 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
|
-
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
-
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
|
-
|
10
7
|
class TextAlignment::AnchorFinder
|
11
8
|
|
12
9
|
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
13
10
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
11
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
15
|
-
@sim_threshold =
|
12
|
+
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
16
13
|
|
17
14
|
@reverse = (target_str.length < source_str.length)
|
18
15
|
|
@@ -1,13 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
7
|
# approximate the location of str1 in str2
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
9
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
10
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
11
9
|
|
12
10
|
class << TextAlignment
|
13
11
|
|
@@ -16,8 +14,8 @@ class << TextAlignment
|
|
16
14
|
raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
|
17
15
|
return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
18
16
|
|
19
|
-
ngram1 = (0 .. str1.length - TextAlignment::
|
20
|
-
ngram2 = (0 .. str2.length - TextAlignment::
|
17
|
+
ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
|
18
|
+
ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
|
21
19
|
ngram_shared = ngram1 & ngram2
|
22
20
|
|
23
21
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
@@ -45,7 +43,7 @@ class << TextAlignment
|
|
45
43
|
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
44
|
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
47
45
|
|
48
|
-
break if text_similarity > TextAlignment::
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
49
47
|
fit_begin, fit_end = nil, nil
|
50
48
|
end
|
51
49
|
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
2
|
+
|
3
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
+
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
|
5
|
+
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
|
+
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
|
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
12
|
-
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
13
|
-
|
14
12
|
class TextAlignment::GLCSTextAlignment
|
15
13
|
attr_reader :position_map_begin, :position_map_end
|
16
14
|
attr_reader :common_elements, :mapped_elements
|
@@ -33,7 +33,8 @@ class TextAlignment::LCSComparison
|
|
33
33
|
@str2_match_initial = sdiff[match_initial].new_position
|
34
34
|
@str1_match_final = sdiff[match_final].old_position
|
35
35
|
@str2_match_final = sdiff[match_final].new_position
|
36
|
-
|
36
|
+
mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
37
|
+
@similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
|
37
38
|
else
|
38
39
|
@str1_match_initial = 0
|
39
40
|
@str2_match_initial = 0
|
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
|
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
13
|
-
TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
|
14
|
-
|
15
13
|
class TextAlignment::MixedAlignment
|
16
14
|
attr_reader :sdiff
|
17
15
|
attr_reader :position_map_begin, :position_map_end
|
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
|
|
21
19
|
|
22
20
|
def initialize(str1, str2, mappings = [])
|
23
21
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
24
|
-
|
25
|
-
|
26
|
-
## preprocessing
|
27
|
-
str1 = str1.dup
|
28
|
-
str2 = str2.dup
|
29
|
-
mappings = mappings.dup
|
30
|
-
|
31
|
-
## find the first nomatch character
|
32
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
33
|
-
if str2.index(c).nil?
|
34
|
-
@nomatch_char1 = c
|
35
|
-
break
|
36
|
-
end
|
37
|
-
end
|
38
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
|
39
|
-
|
40
|
-
## find the first nomatch character
|
41
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
42
|
-
if c != @nomatch_char1 && str1.index(c).nil?
|
43
|
-
@nomatch_char2 = c
|
44
|
-
break
|
45
|
-
end
|
46
|
-
end
|
47
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
|
48
|
-
|
49
|
-
# single character mappings
|
50
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
51
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
52
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
53
|
-
characters_to.gsub!(/-/, '\-')
|
54
|
-
|
55
|
-
str1.tr!(characters_from, characters_to)
|
56
|
-
str2.tr!(characters_from, characters_to)
|
57
|
-
|
58
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
59
|
-
|
60
|
-
# ASCII foldings
|
61
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
62
|
-
ascii_foldings.each do |f|
|
63
|
-
from = f[1]
|
64
|
-
|
65
|
-
if str2.index(f[0])
|
66
|
-
to = f[0] + (@nomatch_char1 * (f[1].length - 1))
|
67
|
-
str1.gsub!(from, to)
|
68
|
-
end
|
69
|
-
|
70
|
-
if str1.index(f[0])
|
71
|
-
to = f[0] + (@nomatch_char2 * (f[1].length - 1))
|
72
|
-
str2.gsub!(from, to)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
22
|
+
mappings ||= []
|
76
23
|
|
77
24
|
_compute_mixed_alignment(str1, str2, mappings)
|
78
25
|
end
|
@@ -1,32 +1,40 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'text_alignment/anchor_finder'
|
3
4
|
require 'text_alignment/mixed_alignment'
|
4
5
|
|
5
6
|
module TextAlignment; end unless defined? TextAlignment
|
6
7
|
|
7
|
-
TextAlignment::
|
8
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
10
|
-
|
8
|
+
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
11
9
|
|
12
10
|
class TextAlignment::TextAlignment
|
13
11
|
attr_reader :block_alignments
|
14
12
|
attr_reader :similarity
|
15
13
|
attr_reader :lost_annotations
|
16
14
|
|
17
|
-
def initialize(
|
18
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
|
+
|
18
|
+
@ostr1 = _str1
|
19
|
+
@ostr2 = _str2
|
19
20
|
|
20
|
-
mappings
|
21
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
21
22
|
|
22
23
|
# try exact match
|
23
24
|
block_begin = str2.index(str1)
|
24
25
|
unless block_begin.nil?
|
25
|
-
@block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
|
26
|
+
@block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
27
|
+
return @block_alignments
|
28
|
+
end
|
29
|
+
|
30
|
+
# try exact match
|
31
|
+
block_begin = str2.downcase.index(str1.downcase)
|
32
|
+
unless block_begin.nil?
|
33
|
+
@block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
34
|
return @block_alignments
|
27
35
|
end
|
28
36
|
|
29
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
|
37
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
30
38
|
|
31
39
|
# To collect matched blocks
|
32
40
|
mblocks = []
|
@@ -93,7 +101,7 @@ class TextAlignment::TextAlignment
|
|
93
101
|
end
|
94
102
|
end
|
95
103
|
end
|
96
|
-
@block_alignments << mblocks[0]
|
104
|
+
@block_alignments << mblocks[0].merge(alignment: :block)
|
97
105
|
|
98
106
|
(1 ... mblocks.length).each do |i|
|
99
107
|
b1 = mblocks[i - 1][:source][:end]
|
@@ -114,7 +122,7 @@ class TextAlignment::TextAlignment
|
|
114
122
|
end
|
115
123
|
end
|
116
124
|
end
|
117
|
-
@block_alignments << mblocks[i]
|
125
|
+
@block_alignments << mblocks[i].merge(alignment: :block)
|
118
126
|
end
|
119
127
|
|
120
128
|
# Final step
|
@@ -156,7 +164,7 @@ class TextAlignment::TextAlignment
|
|
156
164
|
i = @block_alignments.index{|b| b[:source][:end] > begin_position}
|
157
165
|
block_alignment = @block_alignments[i]
|
158
166
|
|
159
|
-
b = if block_alignment[:alignment]
|
167
|
+
b = if block_alignment[:alignment] == :block
|
160
168
|
begin_position + block_alignment[:delta]
|
161
169
|
elsif block_alignment[:alignment] == :empty
|
162
170
|
if begin_position == block_alignment[:source][:begin]
|
@@ -175,7 +183,7 @@ class TextAlignment::TextAlignment
|
|
175
183
|
i = @block_alignments.index{|b| b[:source][:end] >= end_position}
|
176
184
|
block_alignment = @block_alignments[i]
|
177
185
|
|
178
|
-
e = if block_alignment[:alignment]
|
186
|
+
e = if block_alignment[:alignment] == :block
|
179
187
|
end_position + block_alignment[:delta]
|
180
188
|
elsif block_alignment[:alignment] == :empty
|
181
189
|
if end_position == block_alignment[:source][:end]
|
@@ -232,4 +240,176 @@ class TextAlignment::TextAlignment
|
|
232
240
|
r
|
233
241
|
end
|
234
242
|
|
243
|
+
def alignment_table
|
244
|
+
table = <<-TABLE
|
245
|
+
<table class='text_alignment_table'>
|
246
|
+
<thead>
|
247
|
+
<tr>
|
248
|
+
<th class='text_alignment_left' style='width:50%'>Text 1</th>
|
249
|
+
<th class='text_alignment_rigt'>Text 2</th>
|
250
|
+
</tr>
|
251
|
+
</thead>
|
252
|
+
<tbody>
|
253
|
+
TABLE
|
254
|
+
|
255
|
+
@block_alignments.each do |a|
|
256
|
+
table += alignment_table_th(a)
|
257
|
+
table += "<tr>\n" + case a[:alignment]
|
258
|
+
when :block
|
259
|
+
"<td colspan='2' class='text_alignment_common'>" +
|
260
|
+
@ostr1[a[:source][:begin] ... a[:source][:end]] +
|
261
|
+
"</td>\n"
|
262
|
+
when :empty
|
263
|
+
"<td class='text_alignment_left'>" + @ostr1[a[:source][:begin] ... a[:source][:end]] + "</td>\n" +
|
264
|
+
"<td class='text_alignment_right'>" + @ostr2[a[:target][:begin] ... a[:target][:end]] + "</td>\n"
|
265
|
+
else
|
266
|
+
base = a[:source][:begin]
|
267
|
+
astr1 = a[:alignment].sdiff.map do |c|
|
268
|
+
case c.action
|
269
|
+
when '='
|
270
|
+
@ostr1[c.old_position + base]
|
271
|
+
when '+'
|
272
|
+
'_'
|
273
|
+
when '-'
|
274
|
+
@ostr1[c.old_position + base]
|
275
|
+
when '!'
|
276
|
+
@ostr1[c.old_position + base] + '_'
|
277
|
+
end
|
278
|
+
end.join('')
|
279
|
+
|
280
|
+
base = a[:target][:begin]
|
281
|
+
astr2 = a[:alignment].sdiff.map do |c|
|
282
|
+
case c.action
|
283
|
+
when '='
|
284
|
+
@ostr2[c.new_position + base]
|
285
|
+
when '+'
|
286
|
+
@ostr2[c.new_position + base]
|
287
|
+
when '-'
|
288
|
+
'_'
|
289
|
+
when '!'
|
290
|
+
'_' + @ostr2[c.new_position + base]
|
291
|
+
end
|
292
|
+
end.join('')
|
293
|
+
|
294
|
+
"<td class='text_alignment_left'>" + astr1 + "</td>\n" +
|
295
|
+
"<td class='text_alignment_right'>" + astr2 + "</td>\n"
|
296
|
+
end + "</tr>\n"
|
297
|
+
end
|
298
|
+
table += '</tbody></table>'
|
299
|
+
end
|
300
|
+
|
301
|
+
def alignment_table_th(a)
|
302
|
+
"<tr>" +
|
303
|
+
"<th class='text_alignment_left'>#{a[:source][:begin]} - #{a[:source][:end]}</th>" +
|
304
|
+
"<th class='text_alignment_right'>#{a[:target][:begin]} - #{a[:target][:end]}</th>" +
|
305
|
+
"</tr>"
|
306
|
+
end
|
307
|
+
|
308
|
+
def alignment_show
|
309
|
+
show = ''
|
310
|
+
@block_alignments.each do |a|
|
311
|
+
show += case a[:alignment]
|
312
|
+
when :block
|
313
|
+
"===== common =====\n" +
|
314
|
+
@ostr1[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
315
|
+
when :empty
|
316
|
+
"<<<<< string 1\n" +
|
317
|
+
@ostr1[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
318
|
+
">>>>> string 2\n" +
|
319
|
+
@ostr2[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
320
|
+
else
|
321
|
+
astr1 = ''
|
322
|
+
astr2 = ''
|
323
|
+
|
324
|
+
base = a[:source][:begin]
|
325
|
+
astr1 = a[:alignment].sdiff.map do |c|
|
326
|
+
case c.action
|
327
|
+
when '='
|
328
|
+
@ostr1[c.old_position + base]
|
329
|
+
when '+'
|
330
|
+
'_'
|
331
|
+
when '-'
|
332
|
+
@ostr1[c.old_position + base]
|
333
|
+
when '!'
|
334
|
+
@ostr1[c.old_position + base] + '_'
|
335
|
+
end
|
336
|
+
end.join('')
|
337
|
+
|
338
|
+
base = a[:target][:begin]
|
339
|
+
astr2 = a[:alignment].sdiff.map do |c|
|
340
|
+
case c.action
|
341
|
+
when '='
|
342
|
+
@ostr2[c.new_position + base]
|
343
|
+
when '+'
|
344
|
+
@ostr2[c.new_position + base]
|
345
|
+
when '-'
|
346
|
+
'_'
|
347
|
+
when '!'
|
348
|
+
'_' + @ostr2[c.new_position + base]
|
349
|
+
end
|
350
|
+
end.join('')
|
351
|
+
|
352
|
+
"***** local mismatch\n" +
|
353
|
+
"[#{astr1}]\n" +
|
354
|
+
"[#{astr2}]\n\n"
|
355
|
+
end
|
356
|
+
end
|
357
|
+
show
|
358
|
+
end
|
359
|
+
|
360
|
+
private
|
361
|
+
|
362
|
+
def string_preprocessing(_str1, _str2)
|
363
|
+
str1 = _str1.dup
|
364
|
+
str2 = _str2.dup
|
365
|
+
mappings = TextAlignment::MAPPINGS.dup
|
366
|
+
|
367
|
+
## single character mappings
|
368
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
369
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
370
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
371
|
+
characters_to.gsub!(/-/, '\-')
|
372
|
+
|
373
|
+
str1.tr!(characters_from, characters_to)
|
374
|
+
str2.tr!(characters_from, characters_to)
|
375
|
+
|
376
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
377
|
+
|
378
|
+
## long to one character mappings
|
379
|
+
pletters = TextAlignment::PADDING_LETTERS
|
380
|
+
|
381
|
+
# find the padding letter for str1
|
382
|
+
padding_letter1 = begin
|
383
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
384
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
385
|
+
TextAlignment::PADDING_LETTERS[i]
|
386
|
+
end
|
387
|
+
|
388
|
+
# find the padding letter for str2
|
389
|
+
padding_letter2 = begin
|
390
|
+
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
391
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
392
|
+
TextAlignment::PADDING_LETTERS[i]
|
393
|
+
end
|
394
|
+
|
395
|
+
# ASCII foldings
|
396
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
397
|
+
ascii_foldings.each do |f|
|
398
|
+
from = f[1]
|
399
|
+
|
400
|
+
if str2.index(f[0])
|
401
|
+
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
402
|
+
str1.gsub!(from, to)
|
403
|
+
end
|
404
|
+
|
405
|
+
if str1.index(f[0])
|
406
|
+
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
407
|
+
str2.gsub!(from, to)
|
408
|
+
end
|
409
|
+
end
|
410
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
411
|
+
|
412
|
+
[str1, str2, mappings]
|
413
|
+
end
|
414
|
+
|
235
415
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,6 +77,7 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/constants.rb
|
80
81
|
- lib/text_alignment/find_divisions.rb
|
81
82
|
- lib/text_alignment/glcs_alignment.rb
|
82
83
|
- lib/text_alignment/glcs_alignment_fast.rb
|