text_alignment 0.5.2 → 0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/lcs_comparison.rb +1 -0
- data/lib/text_alignment/text_alignment.rb +55 -118
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc820991f5f694f154b94c369158909ccba3760829e0d881c7fd2e6ef7ddd149
|
4
|
+
data.tar.gz: 40ae6f2e388405426a77682bd1a3fb7a3c853076eced9b7301b632081dfd0a57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5802241b4a8394d3c570c1d4b8f5e1d7706c72852e2d6e6fb23bda2f6e2972fa09f7001695db026667144e2af982eeb91ed0b700bd8151af6df794c98e3c069b
|
7
|
+
data.tar.gz: 8d7c93acbef6ab12bb2a0291444a7bcc73b0236bb5b0d06d274e95aa30c9ffc829965653b58270686147a9ac30ccf570518b3ad266120b320dfb20cd1620f5f9
|
@@ -35,6 +35,7 @@ class TextAlignment::LCSComparison
|
|
35
35
|
@str2_match_final = sdiff[match_final].new_position
|
36
36
|
mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
37
37
|
@similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
|
38
|
+
# @similarity = 2 * lcs / (str1[@str1_match_initial .. @str1_match_final].length + str2[@str2_match_initial .. @str2_match_final].length).to_f
|
38
39
|
else
|
39
40
|
@str1_match_initial = 0
|
40
41
|
@str2_match_initial = 0
|
@@ -8,30 +8,29 @@ module TextAlignment; end unless defined? TextAlignment
|
|
8
8
|
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
9
|
|
10
10
|
class TextAlignment::TextAlignment
|
11
|
-
attr_reader :
|
11
|
+
attr_reader :block_alignment
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
15
|
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
16
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
17
|
|
18
|
-
@
|
19
|
-
@ostr2 = _str2
|
18
|
+
@block_alignment = {source_text:_str1, target_text:_str2}
|
20
19
|
|
21
20
|
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
22
21
|
|
23
22
|
# try exact match
|
24
23
|
block_begin = str2.index(str1)
|
25
24
|
unless block_begin.nil?
|
26
|
-
@
|
27
|
-
return @
|
25
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
|
+
return @block_alignment
|
28
27
|
end
|
29
28
|
|
30
29
|
# try exact match
|
31
30
|
block_begin = str2.downcase.index(str1.downcase)
|
32
31
|
unless block_begin.nil?
|
33
|
-
@
|
34
|
-
return @
|
32
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
33
|
+
return @block_alignment
|
35
34
|
end
|
36
35
|
|
37
36
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
@@ -64,7 +63,7 @@ class TextAlignment::TextAlignment
|
|
64
63
|
# puts
|
65
64
|
|
66
65
|
## To find block alignments
|
67
|
-
@
|
66
|
+
@block_alignment[:blocks] = []
|
68
67
|
return if mblocks.empty?
|
69
68
|
|
70
69
|
# Initial step
|
@@ -73,35 +72,35 @@ class TextAlignment::TextAlignment
|
|
73
72
|
e2 = mblocks[0][:target][:begin]
|
74
73
|
|
75
74
|
if mblocks[0][:target][:begin] == 0
|
76
|
-
@
|
75
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
|
77
76
|
else
|
78
77
|
_str1 = str1[0 ... e1]
|
79
78
|
_str2 = str2[0 ... e2]
|
80
79
|
|
81
80
|
unless _str1.strip.empty?
|
82
81
|
if _str2.strip.empty?
|
83
|
-
@
|
82
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
84
83
|
else
|
85
84
|
len_min = [_str1.length, _str2.length].min
|
86
85
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
87
86
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
88
87
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
89
88
|
|
90
|
-
@
|
89
|
+
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
91
90
|
|
92
91
|
_str1 = str1[b1 ... e1]
|
93
92
|
_str2 = str2[b2 ... e2]
|
94
93
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
95
94
|
if alignment.similarity < 0.6
|
96
|
-
@
|
95
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
97
96
|
else
|
98
|
-
@
|
97
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
99
98
|
end
|
100
99
|
end
|
101
100
|
end
|
102
101
|
end
|
103
102
|
end
|
104
|
-
@
|
103
|
+
@block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
|
105
104
|
|
106
105
|
(1 ... mblocks.length).each do |i|
|
107
106
|
b1 = mblocks[i - 1][:source][:end]
|
@@ -112,17 +111,17 @@ class TextAlignment::TextAlignment
|
|
112
111
|
_str2 = str2[b2 ... e2]
|
113
112
|
unless _str1.strip.empty?
|
114
113
|
if _str2.strip.empty?
|
115
|
-
@
|
114
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
116
115
|
else
|
117
116
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
118
117
|
if alignment.similarity < 0.6
|
119
|
-
@
|
118
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
120
119
|
else
|
121
|
-
@
|
120
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
122
121
|
end
|
123
122
|
end
|
124
123
|
end
|
125
|
-
@
|
124
|
+
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
126
125
|
end
|
127
126
|
|
128
127
|
# Final step
|
@@ -134,7 +133,7 @@ class TextAlignment::TextAlignment
|
|
134
133
|
|
135
134
|
unless _str1.strip.empty?
|
136
135
|
if _str2.strip.empty?
|
137
|
-
@
|
136
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
138
137
|
else
|
139
138
|
len_min = [_str1.length, _str2.length].min
|
140
139
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
@@ -145,56 +144,56 @@ class TextAlignment::TextAlignment
|
|
145
144
|
|
146
145
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
147
146
|
if alignment.similarity < 0.6
|
148
|
-
@
|
147
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
149
148
|
else
|
150
|
-
@
|
149
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
151
150
|
end
|
152
151
|
|
153
|
-
@
|
152
|
+
@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
|
154
153
|
end
|
155
154
|
end
|
156
155
|
end
|
157
156
|
|
158
|
-
@
|
157
|
+
@block_alignment[:blocks].each do |a|
|
159
158
|
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
160
159
|
end
|
161
160
|
end
|
162
161
|
|
163
162
|
def transform_begin_position(begin_position)
|
164
|
-
i = @
|
165
|
-
|
166
|
-
|
167
|
-
b = if
|
168
|
-
begin_position +
|
169
|
-
elsif
|
170
|
-
if begin_position ==
|
171
|
-
|
163
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
164
|
+
block = @block_alignment[:blocks][i]
|
165
|
+
|
166
|
+
b = if block[:alignment] == :block
|
167
|
+
begin_position + block[:delta]
|
168
|
+
elsif block[:alignment] == :empty
|
169
|
+
if begin_position == block[:source][:begin]
|
170
|
+
block[:target][:begin]
|
172
171
|
else
|
173
172
|
# raise "lost annotation"
|
174
173
|
nil
|
175
174
|
end
|
176
175
|
else
|
177
|
-
r =
|
178
|
-
r.nil? ? nil : r +
|
176
|
+
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
177
|
+
r.nil? ? nil : r + block[:target][:begin]
|
179
178
|
end
|
180
179
|
end
|
181
180
|
|
182
181
|
def transform_end_position(end_position)
|
183
|
-
i = @
|
184
|
-
|
185
|
-
|
186
|
-
e = if
|
187
|
-
end_position +
|
188
|
-
elsif
|
189
|
-
if end_position ==
|
190
|
-
|
182
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
183
|
+
block = @block_alignment[:blocks][i]
|
184
|
+
|
185
|
+
e = if block[:alignment] == :block
|
186
|
+
end_position + block[:delta]
|
187
|
+
elsif block[:alignment] == :empty
|
188
|
+
if end_position == block[:source][:end]
|
189
|
+
block[:target][:end]
|
191
190
|
else
|
192
191
|
# raise "lost annotation"
|
193
192
|
nil
|
194
193
|
end
|
195
194
|
else
|
196
|
-
r =
|
197
|
-
r.nil? ? nil : r +
|
195
|
+
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
196
|
+
r.nil? ? nil : r + block[:target][:begin]
|
198
197
|
end
|
199
198
|
end
|
200
199
|
|
@@ -240,83 +239,21 @@ class TextAlignment::TextAlignment
|
|
240
239
|
r
|
241
240
|
end
|
242
241
|
|
243
|
-
def alignment_table
|
244
|
-
table = <<-TABLE
|
245
|
-
<table class='text_alignment_table'>
|
246
|
-
<thead>
|
247
|
-
<tr>
|
248
|
-
<th class='text_alignment_left' style='width:50%'>Text 1</th>
|
249
|
-
<th class='text_alignment_rigt'>Text 2</th>
|
250
|
-
</tr>
|
251
|
-
</thead>
|
252
|
-
<tbody>
|
253
|
-
TABLE
|
254
|
-
|
255
|
-
@block_alignments.each do |a|
|
256
|
-
table += alignment_table_th(a)
|
257
|
-
table += "<tr>\n" + case a[:alignment]
|
258
|
-
when :block
|
259
|
-
"<td colspan='2' class='text_alignment_common'>" +
|
260
|
-
@ostr1[a[:source][:begin] ... a[:source][:end]] +
|
261
|
-
"</td>\n"
|
262
|
-
when :empty
|
263
|
-
"<td class='text_alignment_left'>" + @ostr1[a[:source][:begin] ... a[:source][:end]] + "</td>\n" +
|
264
|
-
"<td class='text_alignment_right'>" + @ostr2[a[:target][:begin] ... a[:target][:end]] + "</td>\n"
|
265
|
-
else
|
266
|
-
base = a[:source][:begin]
|
267
|
-
astr1 = a[:alignment].sdiff.map do |c|
|
268
|
-
case c.action
|
269
|
-
when '='
|
270
|
-
@ostr1[c.old_position + base]
|
271
|
-
when '+'
|
272
|
-
'_'
|
273
|
-
when '-'
|
274
|
-
@ostr1[c.old_position + base]
|
275
|
-
when '!'
|
276
|
-
@ostr1[c.old_position + base] + '_'
|
277
|
-
end
|
278
|
-
end.join('')
|
279
|
-
|
280
|
-
base = a[:target][:begin]
|
281
|
-
astr2 = a[:alignment].sdiff.map do |c|
|
282
|
-
case c.action
|
283
|
-
when '='
|
284
|
-
@ostr2[c.new_position + base]
|
285
|
-
when '+'
|
286
|
-
@ostr2[c.new_position + base]
|
287
|
-
when '-'
|
288
|
-
'_'
|
289
|
-
when '!'
|
290
|
-
'_' + @ostr2[c.new_position + base]
|
291
|
-
end
|
292
|
-
end.join('')
|
293
|
-
|
294
|
-
"<td class='text_alignment_left'>" + astr1 + "</td>\n" +
|
295
|
-
"<td class='text_alignment_right'>" + astr2 + "</td>\n"
|
296
|
-
end + "</tr>\n"
|
297
|
-
end
|
298
|
-
table += '</tbody></table>'
|
299
|
-
end
|
300
|
-
|
301
|
-
def alignment_table_th(a)
|
302
|
-
"<tr>" +
|
303
|
-
"<th class='text_alignment_left'>#{a[:source][:begin]} - #{a[:source][:end]}</th>" +
|
304
|
-
"<th class='text_alignment_right'>#{a[:target][:begin]} - #{a[:target][:end]}</th>" +
|
305
|
-
"</tr>"
|
306
|
-
end
|
307
|
-
|
308
242
|
def alignment_show
|
243
|
+
stext = @block_alignment[:source_text]
|
244
|
+
ttext = @block_alignment[:target_text]
|
245
|
+
|
309
246
|
show = ''
|
310
|
-
@
|
247
|
+
@block_alignment[:blocks].each do |a|
|
311
248
|
show += case a[:alignment]
|
312
249
|
when :block
|
313
250
|
"===== common =====\n" +
|
314
|
-
|
251
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
315
252
|
when :empty
|
316
253
|
"<<<<< string 1\n" +
|
317
|
-
|
254
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
318
255
|
">>>>> string 2\n" +
|
319
|
-
|
256
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
320
257
|
else
|
321
258
|
astr1 = ''
|
322
259
|
astr2 = ''
|
@@ -325,13 +262,13 @@ class TextAlignment::TextAlignment
|
|
325
262
|
astr1 = a[:alignment].sdiff.map do |c|
|
326
263
|
case c.action
|
327
264
|
when '='
|
328
|
-
|
265
|
+
stext[c.old_position + base]
|
329
266
|
when '+'
|
330
267
|
'_'
|
331
268
|
when '-'
|
332
|
-
|
269
|
+
stext[c.old_position + base]
|
333
270
|
when '!'
|
334
|
-
|
271
|
+
stext[c.old_position + base] + '_'
|
335
272
|
end
|
336
273
|
end.join('')
|
337
274
|
|
@@ -339,13 +276,13 @@ class TextAlignment::TextAlignment
|
|
339
276
|
astr2 = a[:alignment].sdiff.map do |c|
|
340
277
|
case c.action
|
341
278
|
when '='
|
342
|
-
|
279
|
+
ttext[c.new_position + base]
|
343
280
|
when '+'
|
344
|
-
|
281
|
+
ttext[c.new_position + base]
|
345
282
|
when '-'
|
346
283
|
'_'
|
347
284
|
when '!'
|
348
|
-
'_' +
|
285
|
+
'_' + ttext[c.new_position + base]
|
349
286
|
end
|
350
287
|
end.join('')
|
351
288
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.6'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|