text_alignment 0.4.3 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +40 -49
- data/lib/text_alignment/text_alignment.rb +133 -5
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9f5f7f27c8628123530d51d0a68060aa6fb850bcef8c7089c8bf990f7257a80b
|
4
|
+
data.tar.gz: 45f768df4e7d89c931295985adb31df9e725156e5a85d7f78a5b7cd26d00be4d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4004293fa10eb247110764d16a24900a10e714072227dd6e5626d2123deca4bcec00b7255e1affaba4d7dda75d7e42049aabf6b1e6c51f4c005124443b5f9ffc
|
7
|
+
data.tar.gz: eb5c8a2c89c8973242bb77457e0f8d9922486d9743e7c39ee12ea4025ece6d888be5308eaddec076e5a21c84149dcb31bc0ed6fcd68881fb6c8c4063e49fb64d
|
data/bin/align_annotations
CHANGED
@@ -103,65 +103,56 @@ target_annotations = if source_annotations.class == Array
|
|
103
103
|
else
|
104
104
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
105
|
|
106
|
-
pp alignment
|
106
|
+
# pp alignment
|
107
107
|
|
108
108
|
# verification
|
109
|
-
source_text = source_annotations[:text]
|
110
|
-
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.rstrip.length).each do |p|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
end
|
119
|
-
puts
|
120
|
-
puts "=====END"
|
121
|
-
|
122
|
-
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.rstrip.length).each do |p|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
else
|
128
|
-
print '.'
|
129
|
-
end
|
130
|
-
end
|
131
|
-
puts
|
132
|
-
puts "=====END"
|
133
|
-
|
134
|
-
# alignment.block_alignments.each do |a|
|
135
|
-
# if a[:alignment].nil? || a[:alignment] == :empty
|
136
|
-
# # p [a[:source], a[:target]]
|
137
|
-
# # p a[:alignment]
|
109
|
+
# source_text = source_annotations[:text]
|
110
|
+
# puts "=====BEGIN"
|
111
|
+
# (0 ... source_text.rstrip.length).each do |p|
|
112
|
+
# t = alignment.transform_begin_position(p)
|
113
|
+
# if t.nil?
|
114
|
+
# print source_text[p]
|
115
|
+
# else
|
116
|
+
# print '.'
|
117
|
+
# end
|
118
|
+
# end
|
119
|
+
# puts
|
120
|
+
# puts "=====END"
|
121
|
+
|
122
|
+
# puts "=====BEGIN"
|
123
|
+
# (0 .. source_text.rstrip.length).each do |p|
|
124
|
+
# t = alignment.transform_end_position(p)
|
125
|
+
# if t.nil?
|
126
|
+
# print source_text[p]
|
138
127
|
# else
|
139
|
-
#
|
140
|
-
# p a[:alignment].similarity
|
141
|
-
# puts "--"
|
142
|
-
# puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
143
|
-
# puts "--"
|
144
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
145
|
-
# puts "======"
|
128
|
+
# print '.'
|
146
129
|
# end
|
147
130
|
# end
|
131
|
+
# puts
|
132
|
+
# puts "=====END"
|
133
|
+
|
134
|
+
source_text = source_annotations[:text]
|
135
|
+
|
136
|
+
# puts "[block alignment]"
|
137
|
+
puts alignment.alignment_table
|
138
|
+
# puts "====="
|
148
139
|
# exit
|
149
140
|
|
150
141
|
# verification of source denotations
|
151
|
-
puts "[Invalid source denotations]"
|
152
|
-
source_annotations[:denotations] do |d|
|
153
|
-
|
154
|
-
end
|
155
|
-
puts "====="
|
156
|
-
puts
|
142
|
+
# puts "[Invalid source denotations]"
|
143
|
+
# source_annotations[:denotations] do |d|
|
144
|
+
# p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
145
|
+
# end
|
146
|
+
# puts "====="
|
147
|
+
# puts
|
157
148
|
|
158
149
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
159
|
-
puts "[Invalid transformation]"
|
160
|
-
denotations.each do |d|
|
161
|
-
|
162
|
-
end
|
163
|
-
puts "====="
|
164
|
-
puts
|
150
|
+
# puts "[Invalid transformation]"
|
151
|
+
# denotations.each do |d|
|
152
|
+
# p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
153
|
+
# end
|
154
|
+
# puts "====="
|
155
|
+
# puts
|
165
156
|
|
166
157
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
167
158
|
|
data/lib/text_alignment/text_alignment.rb
CHANGED
@@ -15,6 +15,9 @@ class TextAlignment::TextAlignment
|
|
15
15
|
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
16
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
17
|
|
18
|
+
@ostr1 = _str1
|
19
|
+
@ostr2 = _str2
|
20
|
+
|
18
21
|
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
19
22
|
|
20
23
|
# try exact match
|
@@ -24,6 +27,13 @@ class TextAlignment::TextAlignment
|
|
24
27
|
return @block_alignments
|
25
28
|
end
|
26
29
|
|
30
|
+
# try exact match
|
31
|
+
block_begin = str2.downcase.index(str1.downcase)
|
32
|
+
unless block_begin.nil?
|
33
|
+
@block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
|
34
|
+
return @block_alignments
|
35
|
+
end
|
36
|
+
|
27
37
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
28
38
|
|
29
39
|
# To collect matched blocks
|
@@ -91,7 +101,7 @@ class TextAlignment::TextAlignment
|
|
91
101
|
end
|
92
102
|
end
|
93
103
|
end
|
94
|
-
@block_alignments << mblocks[0]
|
104
|
+
@block_alignments << mblocks[0].merge(alignment: :block)
|
95
105
|
|
96
106
|
(1 ... mblocks.length).each do |i|
|
97
107
|
b1 = mblocks[i - 1][:source][:end]
|
@@ -112,7 +122,7 @@ class TextAlignment::TextAlignment
|
|
112
122
|
end
|
113
123
|
end
|
114
124
|
end
|
115
|
-
@block_alignments << mblocks[i]
|
125
|
+
@block_alignments << mblocks[i].merge(alignment: :block)
|
116
126
|
end
|
117
127
|
|
118
128
|
# Final step
|
@@ -154,7 +164,7 @@ class TextAlignment::TextAlignment
|
|
154
164
|
i = @block_alignments.index{|b| b[:source][:end] > begin_position}
|
155
165
|
block_alignment = @block_alignments[i]
|
156
166
|
|
157
|
-
b = if block_alignment[:alignment]
|
167
|
+
b = if block_alignment[:alignment] == :block
|
158
168
|
begin_position + block_alignment[:delta]
|
159
169
|
elsif block_alignment[:alignment] == :empty
|
160
170
|
if begin_position == block_alignment[:source][:begin]
|
@@ -173,7 +183,7 @@ class TextAlignment::TextAlignment
|
|
173
183
|
i = @block_alignments.index{|b| b[:source][:end] >= end_position}
|
174
184
|
block_alignment = @block_alignments[i]
|
175
185
|
|
176
|
-
e = if block_alignment[:alignment]
|
186
|
+
e = if block_alignment[:alignment] == :block
|
177
187
|
end_position + block_alignment[:delta]
|
178
188
|
elsif block_alignment[:alignment] == :empty
|
179
189
|
if end_position == block_alignment[:source][:end]
|
@@ -230,8 +240,126 @@ class TextAlignment::TextAlignment
|
|
230
240
|
r
|
231
241
|
end
|
232
242
|
|
233
|
-
|
243
|
+
def alignment_table
|
244
|
+
table = <<-TABLE
|
245
|
+
<table class='text_alignment_table'>
|
246
|
+
<thead>
|
247
|
+
<tr>
|
248
|
+
<th class='text_alignment_left' style='width:50%'>Text 1</th>
|
249
|
+
<th class='text_alignment_rigt'>Text 2</th>
|
250
|
+
</tr>
|
251
|
+
</thead>
|
252
|
+
<tbody>
|
253
|
+
TABLE
|
254
|
+
|
255
|
+
@block_alignments.each do |a|
|
256
|
+
table += alignment_table_th(a)
|
257
|
+
table += "<tr>\n" + case a[:alignment]
|
258
|
+
when :block
|
259
|
+
"<td colspan='2' class='text_alignment_common'>" +
|
260
|
+
@ostr1[a[:source][:begin] ... a[:source][:end]] +
|
261
|
+
"</td>\n"
|
262
|
+
when :empty
|
263
|
+
"<td class='text_alignment_left'>" + @ostr1[a[:source][:begin] ... a[:source][:end]] + "</td>\n" +
|
264
|
+
"<td class='text_alignment_right'>" + @ostr2[a[:target][:begin] ... a[:target][:end]] + "</td>\n"
|
265
|
+
else
|
266
|
+
base = a[:source][:begin]
|
267
|
+
astr1 = a[:alignment].sdiff.map do |c|
|
268
|
+
case c.action
|
269
|
+
when '='
|
270
|
+
@ostr1[c.old_position + base]
|
271
|
+
when '+'
|
272
|
+
'_'
|
273
|
+
when '-'
|
274
|
+
@ostr1[c.old_position + base]
|
275
|
+
when '!'
|
276
|
+
@ostr1[c.old_position + base] + '_'
|
277
|
+
end
|
278
|
+
end.join('')
|
279
|
+
|
280
|
+
base = a[:target][:begin]
|
281
|
+
astr2 = a[:alignment].sdiff.map do |c|
|
282
|
+
case c.action
|
283
|
+
when '='
|
284
|
+
@ostr2[c.new_position + base]
|
285
|
+
when '+'
|
286
|
+
@ostr2[c.new_position + base]
|
287
|
+
when '-'
|
288
|
+
'_'
|
289
|
+
when '!'
|
290
|
+
'_' + @ostr2[c.new_position + base]
|
291
|
+
end
|
292
|
+
end.join('')
|
293
|
+
|
294
|
+
"<td class='text_alignment_left'>" + astr1 + "</td>\n" +
|
295
|
+
"<td class='text_alignment_right'>" + astr2 + "</td>\n"
|
296
|
+
end + "</tr>\n"
|
297
|
+
end
|
298
|
+
table += '</tbody></table>'
|
299
|
+
end
|
300
|
+
|
301
|
+
def alignment_table_th(a)
|
302
|
+
"<tr>" +
|
303
|
+
"<th class='text_alignment_left'>#{a[:source][:begin]} - #{a[:source][:end]}</th>" +
|
304
|
+
"<th class='text_alignment_right'>#{a[:target][:begin]} - #{a[:target][:end]}</th>" +
|
305
|
+
"</tr>"
|
306
|
+
end
|
307
|
+
|
308
|
+
def alignment_show
|
309
|
+
show = ''
|
310
|
+
@block_alignments.each do |a|
|
311
|
+
show += case a[:alignment]
|
312
|
+
when :block
|
313
|
+
"===== common =====\n" +
|
314
|
+
@ostr1[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
315
|
+
when :empty
|
316
|
+
puts "<<<<< string 1"
|
317
|
+
p @ostr1[a[:source][:begin] ... a[:source][:end]]
|
318
|
+
puts
|
319
|
+
puts ">>>>> string 2"
|
320
|
+
p @ostr2[a[:target][:begin] ... a[:target][:end]]
|
321
|
+
puts
|
322
|
+
else
|
323
|
+
puts "***** local mismatch"
|
324
|
+
astr1 = ''
|
325
|
+
astr2 = ''
|
326
|
+
|
327
|
+
base = a[:source][:begin]
|
328
|
+
astr1 = a[:alignment].sdiff.map do |c|
|
329
|
+
case c.action
|
330
|
+
when '='
|
331
|
+
@ostr1[c.old_position + base]
|
332
|
+
when '+'
|
333
|
+
'_'
|
334
|
+
when '-'
|
335
|
+
@ostr1[c.old_position + base]
|
336
|
+
when '!'
|
337
|
+
@ostr1[c.old_position + base] + '_'
|
338
|
+
end
|
339
|
+
end.join('')
|
340
|
+
|
341
|
+
base = a[:target][:begin]
|
342
|
+
astr2 = a[:alignment].sdiff.map do |c|
|
343
|
+
case c.action
|
344
|
+
when '='
|
345
|
+
@ostr2[c.new_position + base]
|
346
|
+
when '+'
|
347
|
+
@ostr2[c.new_position + base]
|
348
|
+
when '-'
|
349
|
+
'_'
|
350
|
+
when '!'
|
351
|
+
'_' + @ostr2[c.new_position + base]
|
352
|
+
end
|
353
|
+
end.join('')
|
234
354
|
|
355
|
+
puts '[' + astr1 + ']'
|
356
|
+
puts '[' + astr2 + ']'
|
357
|
+
puts
|
358
|
+
end
|
359
|
+
end
|
360
|
+
end
|
361
|
+
|
362
|
+
private
|
235
363
|
|
236
364
|
def string_preprocessing(_str1, _str2)
|
237
365
|
str1 = _str1.dup
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.3
|
4
|
+
version: '0.5'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|