text_alignment 0.5.1 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +14 -14
- data/lib/text_alignment/lcs_comparison.rb +3 -1
- data/lib/text_alignment/mixed_alignment.rb +74 -4
- data/lib/text_alignment/text_alignment.rb +68 -187
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6bed1eba72da626227ab727ce22129d226539bcfae5ca22006ac26258b184d8c
|
4
|
+
data.tar.gz: d2c121ea072186fd25fd61fb90c5ffacb886c1d109b82c044a1666220b8f7d8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6e526995325e79fdde8ecd729c04e2e6a21e13f0166acc39b341133055275a1bbd5a3318f78dd5af4a72237c140fa8eb06270441a16e2426e58a57183b91ca6a
|
7
|
+
data.tar.gz: ec423d59036b1ee5595141428fe320f0e9ca16b8b2660d46a0f59f376c3845ad70196d006c2f83390ac12f98b35ff14a1098fcd24cda0ee1c6534f36915def81
|
data/bin/align_annotations
CHANGED
@@ -137,26 +137,26 @@ else
|
|
137
137
|
|
138
138
|
source_text = source_annotations[:text]
|
139
139
|
|
140
|
-
|
140
|
+
puts "[block alignment]"
|
141
141
|
puts alignment.alignment_show
|
142
|
-
|
142
|
+
puts "====="
|
143
143
|
# exit
|
144
144
|
|
145
145
|
# verification of source denotations
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
146
|
+
puts "[Invalid source denotations]"
|
147
|
+
source_annotations[:denotations] do |d|
|
148
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
149
|
+
end
|
150
|
+
puts "====="
|
151
|
+
puts
|
152
152
|
|
153
153
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
154
|
+
puts "[Invalid transformation]"
|
155
|
+
denotations.each do |d|
|
156
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
157
|
+
end
|
158
|
+
puts "====="
|
159
|
+
puts
|
160
160
|
|
161
161
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
162
162
|
|
@@ -33,7 +33,9 @@ class TextAlignment::LCSComparison
|
|
33
33
|
@str2_match_initial = sdiff[match_initial].new_position
|
34
34
|
@str1_match_final = sdiff[match_final].old_position
|
35
35
|
@str2_match_final = sdiff[match_final].new_position
|
36
|
-
|
36
|
+
mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
37
|
+
@similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
|
38
|
+
# @similarity = 2 * lcs / (str1[@str1_match_initial .. @str1_match_final].length + str2[@str2_match_initial .. @str2_match_final].length).to_f
|
37
39
|
else
|
38
40
|
@str1_match_initial = 0
|
39
41
|
@str2_match_initial = 0
|
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(
|
21
|
-
raise ArgumentError, "nil string" if
|
22
|
-
|
20
|
+
def initialize(_str1, _str2)
|
21
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
|
+
|
23
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
23
24
|
|
24
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
25
26
|
end
|
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
62
63
|
end
|
63
64
|
|
64
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
65
|
-
@similarity =
|
66
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
67
|
@str1_match_initial = cmp.str1_match_initial
|
67
68
|
@str1_match_final = cmp.str1_match_final
|
68
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
|
|
137
138
|
@position_map_begin = posmap_begin.sort.to_h
|
138
139
|
@position_map_end = posmap_end.sort.to_h
|
139
140
|
end
|
141
|
+
|
142
|
+
private
|
143
|
+
|
144
|
+
def string_preprocessing(_str1, _str2)
|
145
|
+
str1 = _str1.dup
|
146
|
+
str2 = _str2.dup
|
147
|
+
mappings = TextAlignment::MAPPINGS.dup
|
148
|
+
|
149
|
+
## single character mappings
|
150
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
+
characters_to.gsub!(/-/, '\-')
|
154
|
+
|
155
|
+
str1.tr!(characters_from, characters_to)
|
156
|
+
str2.tr!(characters_from, characters_to)
|
157
|
+
|
158
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
+
|
160
|
+
## long to one character mappings
|
161
|
+
pletters = TextAlignment::PADDING_LETTERS
|
162
|
+
|
163
|
+
# find the padding letter for str1
|
164
|
+
@padding_letter1 = begin
|
165
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
+
TextAlignment::PADDING_LETTERS[i]
|
168
|
+
end
|
169
|
+
|
170
|
+
# find the padding letter for str2
|
171
|
+
@padding_letter2 = begin
|
172
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
+
TextAlignment::PADDING_LETTERS[i]
|
175
|
+
end
|
176
|
+
|
177
|
+
# ASCII foldings
|
178
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
+
ascii_foldings.each do |f|
|
180
|
+
from = f[1]
|
181
|
+
|
182
|
+
if str2.index(f[0])
|
183
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
+
str1.gsub!(from, to)
|
185
|
+
end
|
186
|
+
|
187
|
+
if str1.index(f[0])
|
188
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
+
str2.gsub!(from, to)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
+
|
194
|
+
[str1, str2, mappings]
|
195
|
+
end
|
196
|
+
|
197
|
+
def compute_similarity(_s1, _s2, sdiff)
|
198
|
+
return 0 if sdiff.nil?
|
199
|
+
|
200
|
+
# compute the lcs only with non-whitespace letters
|
201
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
+
return 0 if lcs == 0
|
203
|
+
|
204
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
+
|
207
|
+
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
+
end
|
209
|
+
|
140
210
|
end
|
@@ -8,30 +8,27 @@ module TextAlignment; end unless defined? TextAlignment
|
|
8
8
|
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
9
|
|
10
10
|
class TextAlignment::TextAlignment
|
11
|
-
attr_reader :
|
11
|
+
attr_reader :block_alignment
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
|
-
@
|
19
|
-
@ostr2 = _str2
|
20
|
-
|
21
|
-
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
18
|
+
@block_alignment = {source_text:str1, target_text:str2}
|
22
19
|
|
23
20
|
# try exact match
|
24
21
|
block_begin = str2.index(str1)
|
25
22
|
unless block_begin.nil?
|
26
|
-
@
|
27
|
-
return @
|
23
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
24
|
+
return @block_alignment
|
28
25
|
end
|
29
26
|
|
30
27
|
# try exact match
|
31
28
|
block_begin = str2.downcase.index(str1.downcase)
|
32
29
|
unless block_begin.nil?
|
33
|
-
@
|
34
|
-
return @
|
30
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
31
|
+
return @block_alignment
|
35
32
|
end
|
36
33
|
|
37
34
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
@@ -64,7 +61,7 @@ class TextAlignment::TextAlignment
|
|
64
61
|
# puts
|
65
62
|
|
66
63
|
## To find block alignments
|
67
|
-
@
|
64
|
+
@block_alignment[:blocks] = []
|
68
65
|
return if mblocks.empty?
|
69
66
|
|
70
67
|
# Initial step
|
@@ -73,35 +70,35 @@ class TextAlignment::TextAlignment
|
|
73
70
|
e2 = mblocks[0][:target][:begin]
|
74
71
|
|
75
72
|
if mblocks[0][:target][:begin] == 0
|
76
|
-
@
|
73
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
|
77
74
|
else
|
78
75
|
_str1 = str1[0 ... e1]
|
79
76
|
_str2 = str2[0 ... e2]
|
80
77
|
|
81
78
|
unless _str1.strip.empty?
|
82
79
|
if _str2.strip.empty?
|
83
|
-
@
|
80
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
84
81
|
else
|
85
82
|
len_min = [_str1.length, _str2.length].min
|
86
83
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
87
84
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
88
85
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
89
86
|
|
90
|
-
@
|
87
|
+
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
91
88
|
|
92
89
|
_str1 = str1[b1 ... e1]
|
93
90
|
_str2 = str2[b2 ... e2]
|
94
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
95
|
-
if alignment.similarity < 0.
|
96
|
-
@
|
91
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
92
|
+
if alignment.similarity < 0.5
|
93
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
97
94
|
else
|
98
|
-
@
|
95
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
99
96
|
end
|
100
97
|
end
|
101
98
|
end
|
102
99
|
end
|
103
100
|
end
|
104
|
-
@
|
101
|
+
@block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
|
105
102
|
|
106
103
|
(1 ... mblocks.length).each do |i|
|
107
104
|
b1 = mblocks[i - 1][:source][:end]
|
@@ -112,17 +109,17 @@ class TextAlignment::TextAlignment
|
|
112
109
|
_str2 = str2[b2 ... e2]
|
113
110
|
unless _str1.strip.empty?
|
114
111
|
if _str2.strip.empty?
|
115
|
-
@
|
112
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
116
113
|
else
|
117
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
118
|
-
if alignment.similarity < 0.
|
119
|
-
@
|
114
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
115
|
+
if alignment.similarity < 0.5
|
116
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
120
117
|
else
|
121
|
-
@
|
118
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
122
119
|
end
|
123
120
|
end
|
124
121
|
end
|
125
|
-
@
|
122
|
+
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
126
123
|
end
|
127
124
|
|
128
125
|
# Final step
|
@@ -134,7 +131,7 @@ class TextAlignment::TextAlignment
|
|
134
131
|
|
135
132
|
unless _str1.strip.empty?
|
136
133
|
if _str2.strip.empty?
|
137
|
-
@
|
134
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
138
135
|
else
|
139
136
|
len_min = [_str1.length, _str2.length].min
|
140
137
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
@@ -143,58 +140,58 @@ class TextAlignment::TextAlignment
|
|
143
140
|
_str1 = str1[b1 ... e1]
|
144
141
|
_str2 = str2[b2 ... e2]
|
145
142
|
|
146
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
147
|
-
if alignment.similarity < 0.
|
148
|
-
@
|
143
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
144
|
+
if alignment.similarity < 0.5
|
145
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
149
146
|
else
|
150
|
-
@
|
147
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
151
148
|
end
|
152
149
|
|
153
|
-
@
|
150
|
+
@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
|
154
151
|
end
|
155
152
|
end
|
156
153
|
end
|
157
154
|
|
158
|
-
@
|
155
|
+
@block_alignment[:blocks].each do |a|
|
159
156
|
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
160
157
|
end
|
161
158
|
end
|
162
159
|
|
163
160
|
def transform_begin_position(begin_position)
|
164
|
-
i = @
|
165
|
-
|
166
|
-
|
167
|
-
b = if
|
168
|
-
begin_position +
|
169
|
-
elsif
|
170
|
-
if begin_position ==
|
171
|
-
|
161
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
162
|
+
block = @block_alignment[:blocks][i]
|
163
|
+
|
164
|
+
b = if block[:alignment] == :block
|
165
|
+
begin_position + block[:delta]
|
166
|
+
elsif block[:alignment] == :empty
|
167
|
+
if begin_position == block[:source][:begin]
|
168
|
+
block[:target][:begin]
|
172
169
|
else
|
173
170
|
# raise "lost annotation"
|
174
171
|
nil
|
175
172
|
end
|
176
173
|
else
|
177
|
-
r =
|
178
|
-
r.nil? ? nil : r +
|
174
|
+
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
175
|
+
r.nil? ? nil : r + block[:target][:begin]
|
179
176
|
end
|
180
177
|
end
|
181
178
|
|
182
179
|
def transform_end_position(end_position)
|
183
|
-
i = @
|
184
|
-
|
185
|
-
|
186
|
-
e = if
|
187
|
-
end_position +
|
188
|
-
elsif
|
189
|
-
if end_position ==
|
190
|
-
|
180
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
181
|
+
block = @block_alignment[:blocks][i]
|
182
|
+
|
183
|
+
e = if block[:alignment] == :block
|
184
|
+
end_position + block[:delta]
|
185
|
+
elsif block[:alignment] == :empty
|
186
|
+
if end_position == block[:source][:end]
|
187
|
+
block[:target][:end]
|
191
188
|
else
|
192
189
|
# raise "lost annotation"
|
193
190
|
nil
|
194
191
|
end
|
195
192
|
else
|
196
|
-
r =
|
197
|
-
r.nil? ? nil : r +
|
193
|
+
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
194
|
+
r.nil? ? nil : r + block[:target][:begin]
|
198
195
|
end
|
199
196
|
end
|
200
197
|
|
@@ -240,83 +237,22 @@ class TextAlignment::TextAlignment
|
|
240
237
|
r
|
241
238
|
end
|
242
239
|
|
243
|
-
def alignment_table
|
244
|
-
table = <<-TABLE
|
245
|
-
<table class='text_alignment_table'>
|
246
|
-
<thead>
|
247
|
-
<tr>
|
248
|
-
<th class='text_alignment_left' style='width:50%'>Text 1</th>
|
249
|
-
<th class='text_alignment_rigt'>Text 2</th>
|
250
|
-
</tr>
|
251
|
-
</thead>
|
252
|
-
<tbody>
|
253
|
-
TABLE
|
254
|
-
|
255
|
-
@block_alignments.each do |a|
|
256
|
-
table += alignment_table_th(a)
|
257
|
-
table += "<tr>\n" + case a[:alignment]
|
258
|
-
when :block
|
259
|
-
"<td colspan='2' class='text_alignment_common'>" +
|
260
|
-
@ostr1[a[:source][:begin] ... a[:source][:end]] +
|
261
|
-
"</td>\n"
|
262
|
-
when :empty
|
263
|
-
"<td class='text_alignment_left'>" + @ostr1[a[:source][:begin] ... a[:source][:end]] + "</td>\n" +
|
264
|
-
"<td class='text_alignment_right'>" + @ostr2[a[:target][:begin] ... a[:target][:end]] + "</td>\n"
|
265
|
-
else
|
266
|
-
base = a[:source][:begin]
|
267
|
-
astr1 = a[:alignment].sdiff.map do |c|
|
268
|
-
case c.action
|
269
|
-
when '='
|
270
|
-
@ostr1[c.old_position + base]
|
271
|
-
when '+'
|
272
|
-
'_'
|
273
|
-
when '-'
|
274
|
-
@ostr1[c.old_position + base]
|
275
|
-
when '!'
|
276
|
-
@ostr1[c.old_position + base] + '_'
|
277
|
-
end
|
278
|
-
end.join('')
|
279
|
-
|
280
|
-
base = a[:target][:begin]
|
281
|
-
astr2 = a[:alignment].sdiff.map do |c|
|
282
|
-
case c.action
|
283
|
-
when '='
|
284
|
-
@ostr2[c.new_position + base]
|
285
|
-
when '+'
|
286
|
-
@ostr2[c.new_position + base]
|
287
|
-
when '-'
|
288
|
-
'_'
|
289
|
-
when '!'
|
290
|
-
'_' + @ostr2[c.new_position + base]
|
291
|
-
end
|
292
|
-
end.join('')
|
293
|
-
|
294
|
-
"<td class='text_alignment_left'>" + astr1 + "</td>\n" +
|
295
|
-
"<td class='text_alignment_right'>" + astr2 + "</td>\n"
|
296
|
-
end + "</tr>\n"
|
297
|
-
end
|
298
|
-
table += '</tbody></table>'
|
299
|
-
end
|
300
|
-
|
301
|
-
def alignment_table_th(a)
|
302
|
-
"<tr>" +
|
303
|
-
"<th class='text_alignment_left'>#{a[:source][:begin]} - #{a[:source][:end]}</th>" +
|
304
|
-
"<th class='text_alignment_right'>#{a[:target][:begin]} - #{a[:target][:end]}</th>" +
|
305
|
-
"</tr>"
|
306
|
-
end
|
307
|
-
|
308
240
|
def alignment_show
|
241
|
+
stext = @block_alignment[:source_text]
|
242
|
+
ttext = @block_alignment[:target_text]
|
243
|
+
|
309
244
|
show = ''
|
310
|
-
@
|
245
|
+
@block_alignment[:blocks].each do |a|
|
311
246
|
show += case a[:alignment]
|
312
247
|
when :block
|
313
|
-
"===== common
|
314
|
-
|
248
|
+
"===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
249
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
315
250
|
when :empty
|
316
|
-
"
|
317
|
-
|
318
|
-
|
319
|
-
|
251
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
252
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
253
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
254
|
+
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
255
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
320
256
|
else
|
321
257
|
astr1 = ''
|
322
258
|
astr2 = ''
|
@@ -325,13 +261,13 @@ class TextAlignment::TextAlignment
|
|
325
261
|
astr1 = a[:alignment].sdiff.map do |c|
|
326
262
|
case c.action
|
327
263
|
when '='
|
328
|
-
|
264
|
+
stext[c.old_position + base]
|
329
265
|
when '+'
|
330
266
|
'_'
|
331
267
|
when '-'
|
332
|
-
|
268
|
+
stext[c.old_position + base]
|
333
269
|
when '!'
|
334
|
-
|
270
|
+
stext[c.old_position + base] + '_'
|
335
271
|
end
|
336
272
|
end.join('')
|
337
273
|
|
@@ -339,17 +275,17 @@ class TextAlignment::TextAlignment
|
|
339
275
|
astr2 = a[:alignment].sdiff.map do |c|
|
340
276
|
case c.action
|
341
277
|
when '='
|
342
|
-
|
278
|
+
ttext[c.new_position + base]
|
343
279
|
when '+'
|
344
|
-
|
280
|
+
ttext[c.new_position + base]
|
345
281
|
when '-'
|
346
282
|
'_'
|
347
283
|
when '!'
|
348
|
-
'_' +
|
284
|
+
'_' + ttext[c.new_position + base]
|
349
285
|
end
|
350
286
|
end.join('')
|
351
287
|
|
352
|
-
"***** local mismatch\n" +
|
288
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
353
289
|
"[#{astr1}]\n" +
|
354
290
|
"[#{astr2}]\n\n"
|
355
291
|
end
|
@@ -357,59 +293,4 @@ class TextAlignment::TextAlignment
|
|
357
293
|
show
|
358
294
|
end
|
359
295
|
|
360
|
-
private
|
361
|
-
|
362
|
-
def string_preprocessing(_str1, _str2)
|
363
|
-
str1 = _str1.dup
|
364
|
-
str2 = _str2.dup
|
365
|
-
mappings = TextAlignment::MAPPINGS.dup
|
366
|
-
|
367
|
-
## single character mappings
|
368
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
369
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
370
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
371
|
-
characters_to.gsub!(/-/, '\-')
|
372
|
-
|
373
|
-
str1.tr!(characters_from, characters_to)
|
374
|
-
str2.tr!(characters_from, characters_to)
|
375
|
-
|
376
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
377
|
-
|
378
|
-
## long to one character mappings
|
379
|
-
pletters = TextAlignment::PADDING_LETTERS
|
380
|
-
|
381
|
-
# find the padding letter for str1
|
382
|
-
padding_letter1 = begin
|
383
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
384
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
385
|
-
TextAlignment::PADDING_LETTERS[i]
|
386
|
-
end
|
387
|
-
|
388
|
-
# find the padding letter for str2
|
389
|
-
padding_letter2 = begin
|
390
|
-
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
391
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
392
|
-
TextAlignment::PADDING_LETTERS[i]
|
393
|
-
end
|
394
|
-
|
395
|
-
# ASCII foldings
|
396
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
397
|
-
ascii_foldings.each do |f|
|
398
|
-
from = f[1]
|
399
|
-
|
400
|
-
if str2.index(f[0])
|
401
|
-
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
402
|
-
str1.gsub!(from, to)
|
403
|
-
end
|
404
|
-
|
405
|
-
if str1.index(f[0])
|
406
|
-
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
407
|
-
str2.gsub!(from, to)
|
408
|
-
end
|
409
|
-
end
|
410
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
411
|
-
|
412
|
-
[str1, str2, mappings]
|
413
|
-
end
|
414
|
-
|
415
296
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.6.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|