text_alignment 0.5.2 → 0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_alignment/lcs_comparison.rb +1 -0
- data/lib/text_alignment/text_alignment.rb +55 -118
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc820991f5f694f154b94c369158909ccba3760829e0d881c7fd2e6ef7ddd149
|
4
|
+
data.tar.gz: 40ae6f2e388405426a77682bd1a3fb7a3c853076eced9b7301b632081dfd0a57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5802241b4a8394d3c570c1d4b8f5e1d7706c72852e2d6e6fb23bda2f6e2972fa09f7001695db026667144e2af982eeb91ed0b700bd8151af6df794c98e3c069b
|
7
|
+
data.tar.gz: 8d7c93acbef6ab12bb2a0291444a7bcc73b0236bb5b0d06d274e95aa30c9ffc829965653b58270686147a9ac30ccf570518b3ad266120b320dfb20cd1620f5f9
|
@@ -35,6 +35,7 @@ class TextAlignment::LCSComparison
|
|
35
35
|
@str2_match_final = sdiff[match_final].new_position
|
36
36
|
mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
37
37
|
@similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
|
38
|
+
# @similarity = 2 * lcs / (str1[@str1_match_initial .. @str1_match_final].length + str2[@str2_match_initial .. @str2_match_final].length).to_f
|
38
39
|
else
|
39
40
|
@str1_match_initial = 0
|
40
41
|
@str2_match_initial = 0
|
@@ -8,30 +8,29 @@ module TextAlignment; end unless defined? TextAlignment
|
|
8
8
|
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
9
|
|
10
10
|
class TextAlignment::TextAlignment
|
11
|
-
attr_reader :
|
11
|
+
attr_reader :block_alignment
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
15
|
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
16
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
17
|
|
18
|
-
@
|
19
|
-
@ostr2 = _str2
|
18
|
+
@block_alignment = {source_text:_str1, target_text:_str2}
|
20
19
|
|
21
20
|
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
22
21
|
|
23
22
|
# try exact match
|
24
23
|
block_begin = str2.index(str1)
|
25
24
|
unless block_begin.nil?
|
26
|
-
@
|
27
|
-
return @
|
25
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
|
+
return @block_alignment
|
28
27
|
end
|
29
28
|
|
30
29
|
# try exact match
|
31
30
|
block_begin = str2.downcase.index(str1.downcase)
|
32
31
|
unless block_begin.nil?
|
33
|
-
@
|
34
|
-
return @
|
32
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
33
|
+
return @block_alignment
|
35
34
|
end
|
36
35
|
|
37
36
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
@@ -64,7 +63,7 @@ class TextAlignment::TextAlignment
|
|
64
63
|
# puts
|
65
64
|
|
66
65
|
## To find block alignments
|
67
|
-
@
|
66
|
+
@block_alignment[:blocks] = []
|
68
67
|
return if mblocks.empty?
|
69
68
|
|
70
69
|
# Initial step
|
@@ -73,35 +72,35 @@ class TextAlignment::TextAlignment
|
|
73
72
|
e2 = mblocks[0][:target][:begin]
|
74
73
|
|
75
74
|
if mblocks[0][:target][:begin] == 0
|
76
|
-
@
|
75
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
|
77
76
|
else
|
78
77
|
_str1 = str1[0 ... e1]
|
79
78
|
_str2 = str2[0 ... e2]
|
80
79
|
|
81
80
|
unless _str1.strip.empty?
|
82
81
|
if _str2.strip.empty?
|
83
|
-
@
|
82
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
84
83
|
else
|
85
84
|
len_min = [_str1.length, _str2.length].min
|
86
85
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
87
86
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
88
87
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
89
88
|
|
90
|
-
@
|
89
|
+
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
91
90
|
|
92
91
|
_str1 = str1[b1 ... e1]
|
93
92
|
_str2 = str2[b2 ... e2]
|
94
93
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
95
94
|
if alignment.similarity < 0.6
|
96
|
-
@
|
95
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
97
96
|
else
|
98
|
-
@
|
97
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
99
98
|
end
|
100
99
|
end
|
101
100
|
end
|
102
101
|
end
|
103
102
|
end
|
104
|
-
@
|
103
|
+
@block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
|
105
104
|
|
106
105
|
(1 ... mblocks.length).each do |i|
|
107
106
|
b1 = mblocks[i - 1][:source][:end]
|
@@ -112,17 +111,17 @@ class TextAlignment::TextAlignment
|
|
112
111
|
_str2 = str2[b2 ... e2]
|
113
112
|
unless _str1.strip.empty?
|
114
113
|
if _str2.strip.empty?
|
115
|
-
@
|
114
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
116
115
|
else
|
117
116
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
118
117
|
if alignment.similarity < 0.6
|
119
|
-
@
|
118
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
120
119
|
else
|
121
|
-
@
|
120
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
122
121
|
end
|
123
122
|
end
|
124
123
|
end
|
125
|
-
@
|
124
|
+
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
126
125
|
end
|
127
126
|
|
128
127
|
# Final step
|
@@ -134,7 +133,7 @@ class TextAlignment::TextAlignment
|
|
134
133
|
|
135
134
|
unless _str1.strip.empty?
|
136
135
|
if _str2.strip.empty?
|
137
|
-
@
|
136
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
138
137
|
else
|
139
138
|
len_min = [_str1.length, _str2.length].min
|
140
139
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
@@ -145,56 +144,56 @@ class TextAlignment::TextAlignment
|
|
145
144
|
|
146
145
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
147
146
|
if alignment.similarity < 0.6
|
148
|
-
@
|
147
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
149
148
|
else
|
150
|
-
@
|
149
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
151
150
|
end
|
152
151
|
|
153
|
-
@
|
152
|
+
@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
|
154
153
|
end
|
155
154
|
end
|
156
155
|
end
|
157
156
|
|
158
|
-
@
|
157
|
+
@block_alignment[:blocks].each do |a|
|
159
158
|
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
160
159
|
end
|
161
160
|
end
|
162
161
|
|
163
162
|
def transform_begin_position(begin_position)
|
164
|
-
i = @
|
165
|
-
|
166
|
-
|
167
|
-
b = if
|
168
|
-
begin_position +
|
169
|
-
elsif
|
170
|
-
if begin_position ==
|
171
|
-
|
163
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
164
|
+
block = @block_alignment[:blocks][i]
|
165
|
+
|
166
|
+
b = if block[:alignment] == :block
|
167
|
+
begin_position + block[:delta]
|
168
|
+
elsif block[:alignment] == :empty
|
169
|
+
if begin_position == block[:source][:begin]
|
170
|
+
block[:target][:begin]
|
172
171
|
else
|
173
172
|
# raise "lost annotation"
|
174
173
|
nil
|
175
174
|
end
|
176
175
|
else
|
177
|
-
r =
|
178
|
-
r.nil? ? nil : r +
|
176
|
+
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
177
|
+
r.nil? ? nil : r + block[:target][:begin]
|
179
178
|
end
|
180
179
|
end
|
181
180
|
|
182
181
|
def transform_end_position(end_position)
|
183
|
-
i = @
|
184
|
-
|
185
|
-
|
186
|
-
e = if
|
187
|
-
end_position +
|
188
|
-
elsif
|
189
|
-
if end_position ==
|
190
|
-
|
182
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
183
|
+
block = @block_alignment[:blocks][i]
|
184
|
+
|
185
|
+
e = if block[:alignment] == :block
|
186
|
+
end_position + block[:delta]
|
187
|
+
elsif block[:alignment] == :empty
|
188
|
+
if end_position == block[:source][:end]
|
189
|
+
block[:target][:end]
|
191
190
|
else
|
192
191
|
# raise "lost annotation"
|
193
192
|
nil
|
194
193
|
end
|
195
194
|
else
|
196
|
-
r =
|
197
|
-
r.nil? ? nil : r +
|
195
|
+
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
196
|
+
r.nil? ? nil : r + block[:target][:begin]
|
198
197
|
end
|
199
198
|
end
|
200
199
|
|
@@ -240,83 +239,21 @@ class TextAlignment::TextAlignment
|
|
240
239
|
r
|
241
240
|
end
|
242
241
|
|
243
|
-
def alignment_table
|
244
|
-
table = <<-TABLE
|
245
|
-
<table class='text_alignment_table'>
|
246
|
-
<thead>
|
247
|
-
<tr>
|
248
|
-
<th class='text_alignment_left' style='width:50%'>Text 1</th>
|
249
|
-
<th class='text_alignment_rigt'>Text 2</th>
|
250
|
-
</tr>
|
251
|
-
</thead>
|
252
|
-
<tbody>
|
253
|
-
TABLE
|
254
|
-
|
255
|
-
@block_alignments.each do |a|
|
256
|
-
table += alignment_table_th(a)
|
257
|
-
table += "<tr>\n" + case a[:alignment]
|
258
|
-
when :block
|
259
|
-
"<td colspan='2' class='text_alignment_common'>" +
|
260
|
-
@ostr1[a[:source][:begin] ... a[:source][:end]] +
|
261
|
-
"</td>\n"
|
262
|
-
when :empty
|
263
|
-
"<td class='text_alignment_left'>" + @ostr1[a[:source][:begin] ... a[:source][:end]] + "</td>\n" +
|
264
|
-
"<td class='text_alignment_right'>" + @ostr2[a[:target][:begin] ... a[:target][:end]] + "</td>\n"
|
265
|
-
else
|
266
|
-
base = a[:source][:begin]
|
267
|
-
astr1 = a[:alignment].sdiff.map do |c|
|
268
|
-
case c.action
|
269
|
-
when '='
|
270
|
-
@ostr1[c.old_position + base]
|
271
|
-
when '+'
|
272
|
-
'_'
|
273
|
-
when '-'
|
274
|
-
@ostr1[c.old_position + base]
|
275
|
-
when '!'
|
276
|
-
@ostr1[c.old_position + base] + '_'
|
277
|
-
end
|
278
|
-
end.join('')
|
279
|
-
|
280
|
-
base = a[:target][:begin]
|
281
|
-
astr2 = a[:alignment].sdiff.map do |c|
|
282
|
-
case c.action
|
283
|
-
when '='
|
284
|
-
@ostr2[c.new_position + base]
|
285
|
-
when '+'
|
286
|
-
@ostr2[c.new_position + base]
|
287
|
-
when '-'
|
288
|
-
'_'
|
289
|
-
when '!'
|
290
|
-
'_' + @ostr2[c.new_position + base]
|
291
|
-
end
|
292
|
-
end.join('')
|
293
|
-
|
294
|
-
"<td class='text_alignment_left'>" + astr1 + "</td>\n" +
|
295
|
-
"<td class='text_alignment_right'>" + astr2 + "</td>\n"
|
296
|
-
end + "</tr>\n"
|
297
|
-
end
|
298
|
-
table += '</tbody></table>'
|
299
|
-
end
|
300
|
-
|
301
|
-
def alignment_table_th(a)
|
302
|
-
"<tr>" +
|
303
|
-
"<th class='text_alignment_left'>#{a[:source][:begin]} - #{a[:source][:end]}</th>" +
|
304
|
-
"<th class='text_alignment_right'>#{a[:target][:begin]} - #{a[:target][:end]}</th>" +
|
305
|
-
"</tr>"
|
306
|
-
end
|
307
|
-
|
308
242
|
def alignment_show
|
243
|
+
stext = @block_alignment[:source_text]
|
244
|
+
ttext = @block_alignment[:target_text]
|
245
|
+
|
309
246
|
show = ''
|
310
|
-
@
|
247
|
+
@block_alignment[:blocks].each do |a|
|
311
248
|
show += case a[:alignment]
|
312
249
|
when :block
|
313
250
|
"===== common =====\n" +
|
314
|
-
|
251
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
315
252
|
when :empty
|
316
253
|
"<<<<< string 1\n" +
|
317
|
-
|
254
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
318
255
|
">>>>> string 2\n" +
|
319
|
-
|
256
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
320
257
|
else
|
321
258
|
astr1 = ''
|
322
259
|
astr2 = ''
|
@@ -325,13 +262,13 @@ class TextAlignment::TextAlignment
|
|
325
262
|
astr1 = a[:alignment].sdiff.map do |c|
|
326
263
|
case c.action
|
327
264
|
when '='
|
328
|
-
|
265
|
+
stext[c.old_position + base]
|
329
266
|
when '+'
|
330
267
|
'_'
|
331
268
|
when '-'
|
332
|
-
|
269
|
+
stext[c.old_position + base]
|
333
270
|
when '!'
|
334
|
-
|
271
|
+
stext[c.old_position + base] + '_'
|
335
272
|
end
|
336
273
|
end.join('')
|
337
274
|
|
@@ -339,13 +276,13 @@ class TextAlignment::TextAlignment
|
|
339
276
|
astr2 = a[:alignment].sdiff.map do |c|
|
340
277
|
case c.action
|
341
278
|
when '='
|
342
|
-
|
279
|
+
ttext[c.new_position + base]
|
343
280
|
when '+'
|
344
|
-
|
281
|
+
ttext[c.new_position + base]
|
345
282
|
when '-'
|
346
283
|
'_'
|
347
284
|
when '!'
|
348
|
-
'_' +
|
285
|
+
'_' + ttext[c.new_position + base]
|
349
286
|
end
|
350
287
|
end.join('')
|
351
288
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.6'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|