text_alignment 0.5.2 → 0.6.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_alignment/lcs_comparison.rb +1 -0
- data/lib/text_alignment/mixed_alignment.rb +74 -4
- data/lib/text_alignment/text_alignment.rb +68 -187
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 87f945e356349ed709996d88ed39c8ba5b83622bde1c7fd7b9e5ff63504615c2
|
4
|
+
data.tar.gz: acb6e716113238c39b59a8358928de1bd936382308961a57e2c60e7bc462726f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d5b862bb50b4111c6bd390e458d6761303dc394f2fa7dc9d6b821ee7461541705aecac925f700e5124eb282112567e52a51a9f15b84fa8349da25baaf68fdd9
|
7
|
+
data.tar.gz: a044608a58181e98664a26f410a7d59927dc4d39db8d49a147666f64254e23728ceccaa781a590712b7a74b57222cc449c37eb43a709d3f16da60aa3a55c2e6f
|
@@ -35,6 +35,7 @@ class TextAlignment::LCSComparison
|
|
35
35
|
@str2_match_final = sdiff[match_final].new_position
|
36
36
|
mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
37
37
|
@similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
|
38
|
+
# @similarity = 2 * lcs / (str1[@str1_match_initial .. @str1_match_final].length + str2[@str2_match_initial .. @str2_match_final].length).to_f
|
38
39
|
else
|
39
40
|
@str1_match_initial = 0
|
40
41
|
@str2_match_initial = 0
|
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(
|
21
|
-
raise ArgumentError, "nil string" if
|
22
|
-
|
20
|
+
def initialize(_str1, _str2)
|
21
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
|
+
|
23
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
23
24
|
|
24
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
25
26
|
end
|
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
62
63
|
end
|
63
64
|
|
64
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
65
|
-
@similarity =
|
66
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
67
|
@str1_match_initial = cmp.str1_match_initial
|
67
68
|
@str1_match_final = cmp.str1_match_final
|
68
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
|
|
137
138
|
@position_map_begin = posmap_begin.sort.to_h
|
138
139
|
@position_map_end = posmap_end.sort.to_h
|
139
140
|
end
|
141
|
+
|
142
|
+
private
|
143
|
+
|
144
|
+
def string_preprocessing(_str1, _str2)
|
145
|
+
str1 = _str1.dup
|
146
|
+
str2 = _str2.dup
|
147
|
+
mappings = TextAlignment::MAPPINGS.dup
|
148
|
+
|
149
|
+
## single character mappings
|
150
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
+
characters_to.gsub!(/-/, '\-')
|
154
|
+
|
155
|
+
str1.tr!(characters_from, characters_to)
|
156
|
+
str2.tr!(characters_from, characters_to)
|
157
|
+
|
158
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
+
|
160
|
+
## long to one character mappings
|
161
|
+
pletters = TextAlignment::PADDING_LETTERS
|
162
|
+
|
163
|
+
# find the padding letter for str1
|
164
|
+
@padding_letter1 = begin
|
165
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
+
TextAlignment::PADDING_LETTERS[i]
|
168
|
+
end
|
169
|
+
|
170
|
+
# find the padding letter for str2
|
171
|
+
@padding_letter2 = begin
|
172
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
+
TextAlignment::PADDING_LETTERS[i]
|
175
|
+
end
|
176
|
+
|
177
|
+
# ASCII foldings
|
178
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
+
ascii_foldings.each do |f|
|
180
|
+
from = f[1]
|
181
|
+
|
182
|
+
if str2.index(f[0])
|
183
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
+
str1.gsub!(from, to)
|
185
|
+
end
|
186
|
+
|
187
|
+
if str1.index(f[0])
|
188
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
+
str2.gsub!(from, to)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
+
|
194
|
+
[str1, str2, mappings]
|
195
|
+
end
|
196
|
+
|
197
|
+
def compute_similarity(_s1, _s2, sdiff)
|
198
|
+
return 0 if sdiff.nil?
|
199
|
+
|
200
|
+
# compute the lcs only with non-whitespace letters
|
201
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
+
return 0 if lcs == 0
|
203
|
+
|
204
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
+
|
207
|
+
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
+
end
|
209
|
+
|
140
210
|
end
|
@@ -8,30 +8,27 @@ module TextAlignment; end unless defined? TextAlignment
|
|
8
8
|
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
9
|
|
10
10
|
class TextAlignment::TextAlignment
|
11
|
-
attr_reader :
|
11
|
+
attr_reader :block_alignment
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
|
-
@
|
19
|
-
@ostr2 = _str2
|
20
|
-
|
21
|
-
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
18
|
+
@block_alignment = {source_text:str1, target_text:str2}
|
22
19
|
|
23
20
|
# try exact match
|
24
21
|
block_begin = str2.index(str1)
|
25
22
|
unless block_begin.nil?
|
26
|
-
@
|
27
|
-
return @
|
23
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
24
|
+
return @block_alignment
|
28
25
|
end
|
29
26
|
|
30
27
|
# try exact match
|
31
28
|
block_begin = str2.downcase.index(str1.downcase)
|
32
29
|
unless block_begin.nil?
|
33
|
-
@
|
34
|
-
return @
|
30
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
31
|
+
return @block_alignment
|
35
32
|
end
|
36
33
|
|
37
34
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
@@ -64,7 +61,7 @@ class TextAlignment::TextAlignment
|
|
64
61
|
# puts
|
65
62
|
|
66
63
|
## To find block alignments
|
67
|
-
@
|
64
|
+
@block_alignment[:blocks] = []
|
68
65
|
return if mblocks.empty?
|
69
66
|
|
70
67
|
# Initial step
|
@@ -73,35 +70,35 @@ class TextAlignment::TextAlignment
|
|
73
70
|
e2 = mblocks[0][:target][:begin]
|
74
71
|
|
75
72
|
if mblocks[0][:target][:begin] == 0
|
76
|
-
@
|
73
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
|
77
74
|
else
|
78
75
|
_str1 = str1[0 ... e1]
|
79
76
|
_str2 = str2[0 ... e2]
|
80
77
|
|
81
78
|
unless _str1.strip.empty?
|
82
79
|
if _str2.strip.empty?
|
83
|
-
@
|
80
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
84
81
|
else
|
85
82
|
len_min = [_str1.length, _str2.length].min
|
86
83
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
87
84
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
88
85
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
89
86
|
|
90
|
-
@
|
87
|
+
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
91
88
|
|
92
89
|
_str1 = str1[b1 ... e1]
|
93
90
|
_str2 = str2[b2 ... e2]
|
94
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
95
|
-
if alignment.similarity < 0.
|
96
|
-
@
|
91
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
92
|
+
if alignment.similarity < 0.5
|
93
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
97
94
|
else
|
98
|
-
@
|
95
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
99
96
|
end
|
100
97
|
end
|
101
98
|
end
|
102
99
|
end
|
103
100
|
end
|
104
|
-
@
|
101
|
+
@block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
|
105
102
|
|
106
103
|
(1 ... mblocks.length).each do |i|
|
107
104
|
b1 = mblocks[i - 1][:source][:end]
|
@@ -112,17 +109,17 @@ class TextAlignment::TextAlignment
|
|
112
109
|
_str2 = str2[b2 ... e2]
|
113
110
|
unless _str1.strip.empty?
|
114
111
|
if _str2.strip.empty?
|
115
|
-
@
|
112
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
116
113
|
else
|
117
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
118
|
-
if alignment.similarity < 0.
|
119
|
-
@
|
114
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
115
|
+
if alignment.similarity < 0.5
|
116
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
120
117
|
else
|
121
|
-
@
|
118
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
122
119
|
end
|
123
120
|
end
|
124
121
|
end
|
125
|
-
@
|
122
|
+
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
126
123
|
end
|
127
124
|
|
128
125
|
# Final step
|
@@ -134,7 +131,7 @@ class TextAlignment::TextAlignment
|
|
134
131
|
|
135
132
|
unless _str1.strip.empty?
|
136
133
|
if _str2.strip.empty?
|
137
|
-
@
|
134
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
138
135
|
else
|
139
136
|
len_min = [_str1.length, _str2.length].min
|
140
137
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
@@ -143,58 +140,58 @@ class TextAlignment::TextAlignment
|
|
143
140
|
_str1 = str1[b1 ... e1]
|
144
141
|
_str2 = str2[b2 ... e2]
|
145
142
|
|
146
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
147
|
-
if alignment.similarity < 0.
|
148
|
-
@
|
143
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
144
|
+
if alignment.similarity < 0.5
|
145
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
149
146
|
else
|
150
|
-
@
|
147
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
151
148
|
end
|
152
149
|
|
153
|
-
@
|
150
|
+
@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
|
154
151
|
end
|
155
152
|
end
|
156
153
|
end
|
157
154
|
|
158
|
-
@
|
155
|
+
@block_alignment[:blocks].each do |a|
|
159
156
|
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
160
157
|
end
|
161
158
|
end
|
162
159
|
|
163
160
|
def transform_begin_position(begin_position)
|
164
|
-
i = @
|
165
|
-
|
166
|
-
|
167
|
-
b = if
|
168
|
-
begin_position +
|
169
|
-
elsif
|
170
|
-
if begin_position ==
|
171
|
-
|
161
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
162
|
+
block = @block_alignment[:blocks][i]
|
163
|
+
|
164
|
+
b = if block[:alignment] == :block
|
165
|
+
begin_position + block[:delta]
|
166
|
+
elsif block[:alignment] == :empty
|
167
|
+
if begin_position == block[:source][:begin]
|
168
|
+
block[:target][:begin]
|
172
169
|
else
|
173
170
|
# raise "lost annotation"
|
174
171
|
nil
|
175
172
|
end
|
176
173
|
else
|
177
|
-
r =
|
178
|
-
r.nil? ? nil : r +
|
174
|
+
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
175
|
+
r.nil? ? nil : r + block[:target][:begin]
|
179
176
|
end
|
180
177
|
end
|
181
178
|
|
182
179
|
def transform_end_position(end_position)
|
183
|
-
i = @
|
184
|
-
|
185
|
-
|
186
|
-
e = if
|
187
|
-
end_position +
|
188
|
-
elsif
|
189
|
-
if end_position ==
|
190
|
-
|
180
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
181
|
+
block = @block_alignment[:blocks][i]
|
182
|
+
|
183
|
+
e = if block[:alignment] == :block
|
184
|
+
end_position + block[:delta]
|
185
|
+
elsif block[:alignment] == :empty
|
186
|
+
if end_position == block[:source][:end]
|
187
|
+
block[:target][:end]
|
191
188
|
else
|
192
189
|
# raise "lost annotation"
|
193
190
|
nil
|
194
191
|
end
|
195
192
|
else
|
196
|
-
r =
|
197
|
-
r.nil? ? nil : r +
|
193
|
+
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
194
|
+
r.nil? ? nil : r + block[:target][:begin]
|
198
195
|
end
|
199
196
|
end
|
200
197
|
|
@@ -240,83 +237,22 @@ class TextAlignment::TextAlignment
|
|
240
237
|
r
|
241
238
|
end
|
242
239
|
|
243
|
-
def alignment_table
|
244
|
-
table = <<-TABLE
|
245
|
-
<table class='text_alignment_table'>
|
246
|
-
<thead>
|
247
|
-
<tr>
|
248
|
-
<th class='text_alignment_left' style='width:50%'>Text 1</th>
|
249
|
-
<th class='text_alignment_rigt'>Text 2</th>
|
250
|
-
</tr>
|
251
|
-
</thead>
|
252
|
-
<tbody>
|
253
|
-
TABLE
|
254
|
-
|
255
|
-
@block_alignments.each do |a|
|
256
|
-
table += alignment_table_th(a)
|
257
|
-
table += "<tr>\n" + case a[:alignment]
|
258
|
-
when :block
|
259
|
-
"<td colspan='2' class='text_alignment_common'>" +
|
260
|
-
@ostr1[a[:source][:begin] ... a[:source][:end]] +
|
261
|
-
"</td>\n"
|
262
|
-
when :empty
|
263
|
-
"<td class='text_alignment_left'>" + @ostr1[a[:source][:begin] ... a[:source][:end]] + "</td>\n" +
|
264
|
-
"<td class='text_alignment_right'>" + @ostr2[a[:target][:begin] ... a[:target][:end]] + "</td>\n"
|
265
|
-
else
|
266
|
-
base = a[:source][:begin]
|
267
|
-
astr1 = a[:alignment].sdiff.map do |c|
|
268
|
-
case c.action
|
269
|
-
when '='
|
270
|
-
@ostr1[c.old_position + base]
|
271
|
-
when '+'
|
272
|
-
'_'
|
273
|
-
when '-'
|
274
|
-
@ostr1[c.old_position + base]
|
275
|
-
when '!'
|
276
|
-
@ostr1[c.old_position + base] + '_'
|
277
|
-
end
|
278
|
-
end.join('')
|
279
|
-
|
280
|
-
base = a[:target][:begin]
|
281
|
-
astr2 = a[:alignment].sdiff.map do |c|
|
282
|
-
case c.action
|
283
|
-
when '='
|
284
|
-
@ostr2[c.new_position + base]
|
285
|
-
when '+'
|
286
|
-
@ostr2[c.new_position + base]
|
287
|
-
when '-'
|
288
|
-
'_'
|
289
|
-
when '!'
|
290
|
-
'_' + @ostr2[c.new_position + base]
|
291
|
-
end
|
292
|
-
end.join('')
|
293
|
-
|
294
|
-
"<td class='text_alignment_left'>" + astr1 + "</td>\n" +
|
295
|
-
"<td class='text_alignment_right'>" + astr2 + "</td>\n"
|
296
|
-
end + "</tr>\n"
|
297
|
-
end
|
298
|
-
table += '</tbody></table>'
|
299
|
-
end
|
300
|
-
|
301
|
-
def alignment_table_th(a)
|
302
|
-
"<tr>" +
|
303
|
-
"<th class='text_alignment_left'>#{a[:source][:begin]} - #{a[:source][:end]}</th>" +
|
304
|
-
"<th class='text_alignment_right'>#{a[:target][:begin]} - #{a[:target][:end]}</th>" +
|
305
|
-
"</tr>"
|
306
|
-
end
|
307
|
-
|
308
240
|
def alignment_show
|
241
|
+
stext = @block_alignment[:source_text]
|
242
|
+
ttext = @block_alignment[:target_text]
|
243
|
+
|
309
244
|
show = ''
|
310
|
-
@
|
245
|
+
@block_alignment[:blocks].each do |a|
|
311
246
|
show += case a[:alignment]
|
312
247
|
when :block
|
313
|
-
"===== common
|
314
|
-
|
248
|
+
"===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
249
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
315
250
|
when :empty
|
316
|
-
"
|
317
|
-
|
318
|
-
|
319
|
-
|
251
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
252
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
253
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
254
|
+
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
255
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
320
256
|
else
|
321
257
|
astr1 = ''
|
322
258
|
astr2 = ''
|
@@ -325,13 +261,13 @@ class TextAlignment::TextAlignment
|
|
325
261
|
astr1 = a[:alignment].sdiff.map do |c|
|
326
262
|
case c.action
|
327
263
|
when '='
|
328
|
-
|
264
|
+
stext[c.old_position + base]
|
329
265
|
when '+'
|
330
266
|
'_'
|
331
267
|
when '-'
|
332
|
-
|
268
|
+
stext[c.old_position + base]
|
333
269
|
when '!'
|
334
|
-
|
270
|
+
stext[c.old_position + base] + '_'
|
335
271
|
end
|
336
272
|
end.join('')
|
337
273
|
|
@@ -339,17 +275,17 @@ class TextAlignment::TextAlignment
|
|
339
275
|
astr2 = a[:alignment].sdiff.map do |c|
|
340
276
|
case c.action
|
341
277
|
when '='
|
342
|
-
|
278
|
+
ttext[c.new_position + base]
|
343
279
|
when '+'
|
344
|
-
|
280
|
+
ttext[c.new_position + base]
|
345
281
|
when '-'
|
346
282
|
'_'
|
347
283
|
when '!'
|
348
|
-
'_' +
|
284
|
+
'_' + ttext[c.new_position + base]
|
349
285
|
end
|
350
286
|
end.join('')
|
351
287
|
|
352
|
-
"***** local mismatch\n" +
|
288
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
353
289
|
"[#{astr1}]\n" +
|
354
290
|
"[#{astr2}]\n\n"
|
355
291
|
end
|
@@ -357,59 +293,4 @@ class TextAlignment::TextAlignment
|
|
357
293
|
show
|
358
294
|
end
|
359
295
|
|
360
|
-
private
|
361
|
-
|
362
|
-
def string_preprocessing(_str1, _str2)
|
363
|
-
str1 = _str1.dup
|
364
|
-
str2 = _str2.dup
|
365
|
-
mappings = TextAlignment::MAPPINGS.dup
|
366
|
-
|
367
|
-
## single character mappings
|
368
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
369
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
370
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
371
|
-
characters_to.gsub!(/-/, '\-')
|
372
|
-
|
373
|
-
str1.tr!(characters_from, characters_to)
|
374
|
-
str2.tr!(characters_from, characters_to)
|
375
|
-
|
376
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
377
|
-
|
378
|
-
## long to one character mappings
|
379
|
-
pletters = TextAlignment::PADDING_LETTERS
|
380
|
-
|
381
|
-
# find the padding letter for str1
|
382
|
-
padding_letter1 = begin
|
383
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
384
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
385
|
-
TextAlignment::PADDING_LETTERS[i]
|
386
|
-
end
|
387
|
-
|
388
|
-
# find the padding letter for str2
|
389
|
-
padding_letter2 = begin
|
390
|
-
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
391
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
392
|
-
TextAlignment::PADDING_LETTERS[i]
|
393
|
-
end
|
394
|
-
|
395
|
-
# ASCII foldings
|
396
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
397
|
-
ascii_foldings.each do |f|
|
398
|
-
from = f[1]
|
399
|
-
|
400
|
-
if str2.index(f[0])
|
401
|
-
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
402
|
-
str1.gsub!(from, to)
|
403
|
-
end
|
404
|
-
|
405
|
-
if str1.index(f[0])
|
406
|
-
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
407
|
-
str2.gsub!(from, to)
|
408
|
-
end
|
409
|
-
end
|
410
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
411
|
-
|
412
|
-
[str1, str2, mappings]
|
413
|
-
end
|
414
|
-
|
415
296
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|