text_alignment 0.6 → 0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +4 -19
- data/lib/text_alignment/mixed_alignment.rb +74 -4
- data/lib/text_alignment/text_alignment.rb +199 -168
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
|
4
|
+
data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
|
7
|
+
data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
|
data/bin/align_annotations
CHANGED
@@ -105,9 +105,7 @@ lost_annotations = []
|
|
105
105
|
target_annotations = if source_annotations.class == Array
|
106
106
|
align_mdoc(source_annotations, {text: target_text})
|
107
107
|
else
|
108
|
-
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
109
|
-
|
110
|
-
# pp alignment
|
108
|
+
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
|
111
109
|
|
112
110
|
# verification
|
113
111
|
# source_text = source_annotations[:text]
|
@@ -142,22 +140,7 @@ else
|
|
142
140
|
puts "====="
|
143
141
|
# exit
|
144
142
|
|
145
|
-
# verification of source denotations
|
146
|
-
puts "[Invalid source denotations]"
|
147
|
-
source_annotations[:denotations] do |d|
|
148
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
149
|
-
end
|
150
|
-
puts "====="
|
151
|
-
puts
|
152
|
-
|
153
143
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
154
|
-
puts "[Invalid transformation]"
|
155
|
-
denotations.each do |d|
|
156
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
157
|
-
end
|
158
|
-
puts "====="
|
159
|
-
puts
|
160
|
-
|
161
144
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
162
145
|
|
163
146
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
|
|
194
177
|
|
195
178
|
if lost_annotations
|
196
179
|
warn "\n[lost annotations]"
|
197
|
-
|
180
|
+
lost_annotations.each do |a|
|
181
|
+
p a
|
182
|
+
end
|
198
183
|
end
|
199
184
|
|
200
185
|
#puts target_annotations.to_json
|
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(
|
21
|
-
raise ArgumentError, "nil string" if
|
22
|
-
|
20
|
+
def initialize(_str1, _str2)
|
21
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
|
+
|
23
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
23
24
|
|
24
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
25
26
|
end
|
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
62
63
|
end
|
63
64
|
|
64
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
65
|
-
@similarity =
|
66
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
67
|
@str1_match_initial = cmp.str1_match_initial
|
67
68
|
@str1_match_final = cmp.str1_match_final
|
68
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
|
|
137
138
|
@position_map_begin = posmap_begin.sort.to_h
|
138
139
|
@position_map_end = posmap_end.sort.to_h
|
139
140
|
end
|
141
|
+
|
142
|
+
private
|
143
|
+
|
144
|
+
def string_preprocessing(_str1, _str2)
|
145
|
+
str1 = _str1.dup
|
146
|
+
str2 = _str2.dup
|
147
|
+
mappings = TextAlignment::MAPPINGS.dup
|
148
|
+
|
149
|
+
## single character mappings
|
150
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
+
characters_to.gsub!(/-/, '\-')
|
154
|
+
|
155
|
+
str1.tr!(characters_from, characters_to)
|
156
|
+
str2.tr!(characters_from, characters_to)
|
157
|
+
|
158
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
+
|
160
|
+
## long to one character mappings
|
161
|
+
pletters = TextAlignment::PADDING_LETTERS
|
162
|
+
|
163
|
+
# find the padding letter for str1
|
164
|
+
@padding_letter1 = begin
|
165
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
+
TextAlignment::PADDING_LETTERS[i]
|
168
|
+
end
|
169
|
+
|
170
|
+
# find the padding letter for str2
|
171
|
+
@padding_letter2 = begin
|
172
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
+
TextAlignment::PADDING_LETTERS[i]
|
175
|
+
end
|
176
|
+
|
177
|
+
# ASCII foldings
|
178
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
+
ascii_foldings.each do |f|
|
180
|
+
from = f[1]
|
181
|
+
|
182
|
+
if str2.index(f[0])
|
183
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
+
str1.gsub!(from, to)
|
185
|
+
end
|
186
|
+
|
187
|
+
if str1.index(f[0])
|
188
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
+
str2.gsub!(from, to)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
+
|
194
|
+
[str1, str2, mappings]
|
195
|
+
end
|
196
|
+
|
197
|
+
def compute_similarity(_s1, _s2, sdiff)
|
198
|
+
return 0 if sdiff.nil?
|
199
|
+
|
200
|
+
# compute the lcs only with non-whitespace letters
|
201
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
+
return 0 if lcs == 0
|
203
|
+
|
204
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
+
|
207
|
+
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
+
end
|
209
|
+
|
140
210
|
end
|
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
|
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
|
-
@block_alignment = {source_text:
|
18
|
+
@block_alignment = {source_text:str1, target_text:str2}
|
19
|
+
@str1 = str1
|
20
|
+
@str2 = str2
|
19
21
|
|
20
|
-
|
21
|
-
|
22
|
-
# try exact match
|
22
|
+
## Block exact match
|
23
23
|
block_begin = str2.index(str1)
|
24
24
|
unless block_begin.nil?
|
25
25
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
|
-
return
|
26
|
+
return
|
27
27
|
end
|
28
28
|
|
29
|
-
# try exact match
|
30
29
|
block_begin = str2.downcase.index(str1.downcase)
|
31
30
|
unless block_begin.nil?
|
32
31
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
33
|
-
return
|
32
|
+
return
|
34
33
|
end
|
35
34
|
|
35
|
+
|
36
|
+
## to find block alignments
|
36
37
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
37
38
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
last
|
42
|
-
|
43
|
-
last[:
|
44
|
-
last[:target][:end] = anchor[:target][:end]
|
39
|
+
blocks = []
|
40
|
+
while block = anchor_finder.get_next_anchor
|
41
|
+
last = blocks.last
|
42
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
43
|
+
last[:source][:end] = block[:source][:end]
|
44
|
+
last[:target][:end] = block[:target][:end]
|
45
45
|
else
|
46
|
-
|
46
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
50
|
-
# pp
|
50
|
+
# pp blocks
|
51
51
|
# puts "-----"
|
52
52
|
# puts
|
53
|
-
#
|
53
|
+
# exit
|
54
|
+
# blocks.each do |b|
|
54
55
|
# p [b[:source], b[:target]]
|
55
56
|
# puts "---"
|
56
57
|
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
@@ -62,114 +63,196 @@ class TextAlignment::TextAlignment
|
|
62
63
|
# puts "-=-=-=-=-"
|
63
64
|
# puts
|
64
65
|
|
65
|
-
##
|
66
|
-
|
67
|
-
|
66
|
+
## to fill the gaps
|
67
|
+
last_block = nil
|
68
|
+
blocks2 = blocks.inject([]) do |sum, block|
|
69
|
+
b1 = last_block ? last_block[:source][:end] : 0
|
70
|
+
e1 = block[:source][:begin]
|
68
71
|
|
69
|
-
|
70
|
-
|
71
|
-
e1 = mblocks[0][:source][:begin]
|
72
|
-
e2 = mblocks[0][:target][:begin]
|
73
|
-
|
74
|
-
if mblocks[0][:target][:begin] == 0
|
75
|
-
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
|
72
|
+
sum += if b1 == e1
|
73
|
+
[block]
|
76
74
|
else
|
77
|
-
|
78
|
-
|
75
|
+
b2 = last_block ? last_block[:target][:end] : 0
|
76
|
+
e2 = block[:target][:begin]
|
77
|
+
|
78
|
+
if b2 == e2
|
79
|
+
[
|
80
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
81
|
+
block
|
82
|
+
]
|
83
|
+
else
|
84
|
+
if b1 == 0 && b2 == 0
|
85
|
+
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
+
b2 = e2 - len_buffer if e2 > len_buffer
|
87
|
+
end
|
88
|
+
|
89
|
+
_str1 = str1[b1 ... e1]
|
90
|
+
_str2 = str2[b2 ... e2]
|
79
91
|
|
80
|
-
|
81
|
-
|
82
|
-
|
92
|
+
if _str1.strip.empty? || _str2.strip.empty?
|
93
|
+
[
|
94
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
95
|
+
block
|
96
|
+
]
|
83
97
|
else
|
84
|
-
|
85
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
-
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
87
|
-
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
88
|
-
|
89
|
-
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
90
|
-
|
91
|
-
_str1 = str1[b1 ... e1]
|
92
|
-
_str2 = str2[b2 ... e2]
|
93
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
94
|
-
if alignment.similarity < 0.6
|
95
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
96
|
-
else
|
97
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
98
|
-
end
|
98
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
99
99
|
end
|
100
100
|
end
|
101
101
|
end
|
102
|
+
|
103
|
+
last_block = block
|
104
|
+
sum
|
102
105
|
end
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
if
|
114
|
-
|
106
|
+
|
107
|
+
# the last step
|
108
|
+
blocks2 += if last_block.nil?
|
109
|
+
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
110
|
+
else
|
111
|
+
b1 = last_block[:source][:end]
|
112
|
+
if b1 < str1.length
|
113
|
+
e1 = str1.length
|
114
|
+
|
115
|
+
b2 = last_block[:target][:end]
|
116
|
+
if b2 < str2.length
|
117
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
118
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
119
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
115
120
|
else
|
116
|
-
|
117
|
-
if alignment.similarity < 0.6
|
118
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
119
|
-
else
|
120
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
121
|
-
end
|
121
|
+
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
122
122
|
end
|
123
123
|
end
|
124
|
-
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
125
124
|
end
|
126
125
|
|
127
|
-
|
128
|
-
|
129
|
-
b1 = mblocks[-1][:source][:end]
|
130
|
-
b2 = mblocks[-1][:target][:end]
|
131
|
-
_str1 = str1[b1 ... str1.length]
|
132
|
-
_str2 = str2[b2 ... str2.length]
|
126
|
+
@block_alignment[:blocks] = blocks2
|
127
|
+
end
|
133
128
|
|
134
|
-
|
135
|
-
|
136
|
-
|
129
|
+
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
130
|
+
block2 = str2[b2 ... e2]
|
131
|
+
|
132
|
+
## term-based alignment
|
133
|
+
tblocks = if denotations
|
134
|
+
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
135
|
+
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
136
|
+
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
137
|
+
|
138
|
+
position = 0
|
139
|
+
tblocks = ds_in_scope.map do |term|
|
140
|
+
lex = term[:lex]
|
141
|
+
r = block2.index(lex, position)
|
142
|
+
if r.nil?
|
143
|
+
position = nil
|
144
|
+
break
|
145
|
+
end
|
146
|
+
position = r + lex.length
|
147
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
148
|
+
end
|
149
|
+
|
150
|
+
# missing term found
|
151
|
+
tblocks = [] if position.nil?
|
152
|
+
|
153
|
+
# redundant matching found
|
154
|
+
unless position.nil?
|
155
|
+
ds_in_scope.each do |term|
|
156
|
+
lex = term[:lex]
|
157
|
+
look_forward = block2.index(lex, position)
|
158
|
+
unless look_forward.nil?
|
159
|
+
puts lex
|
160
|
+
tblocks = []
|
161
|
+
break
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
tblocks
|
167
|
+
end
|
168
|
+
|
169
|
+
if tblocks.empty?
|
170
|
+
if b1 == 0 && e1 == str1.length
|
171
|
+
if str2.length > 2000
|
172
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
137
173
|
else
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
174
|
+
block1 = str1[b1 ... e1]
|
175
|
+
block2 = str2[b2 ... e2]
|
176
|
+
|
177
|
+
## character-based alignment
|
178
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
179
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
|
180
|
+
# alignment = :alignment
|
181
|
+
# [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
|
182
|
+
end
|
183
|
+
else
|
184
|
+
block1 = str1[b1 ... e1]
|
185
|
+
block2 = str2[b2 ... e2]
|
186
|
+
|
187
|
+
## character-based alignment
|
188
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
189
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
|
190
|
+
# alignmnet = :alignment
|
191
|
+
# [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
|
192
|
+
end
|
193
|
+
else
|
194
|
+
last_tblock = nil
|
195
|
+
lblocks = tblocks.inject([]) do |sum, tblock|
|
196
|
+
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
197
|
+
te1 = tblock[:source][:begin]
|
144
198
|
|
145
|
-
|
146
|
-
|
147
|
-
|
199
|
+
sum += if te1 == tb1
|
200
|
+
[tblock]
|
201
|
+
else
|
202
|
+
tb2 = last_tblock ? tlast_block[:target][:end] : b2
|
203
|
+
te2 = tblock[:target][:begin]
|
204
|
+
|
205
|
+
if b2 == e2
|
206
|
+
[
|
207
|
+
{source:{begin:tb1, end:te1}, alignment: :empty},
|
208
|
+
tblock
|
209
|
+
]
|
148
210
|
else
|
149
|
-
|
211
|
+
[
|
212
|
+
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
213
|
+
tblock
|
214
|
+
]
|
150
215
|
end
|
216
|
+
end
|
151
217
|
|
152
|
-
|
218
|
+
last_tblock = tblock
|
219
|
+
sum
|
220
|
+
end
|
221
|
+
|
222
|
+
if last_tblock[:source][:end] < e1
|
223
|
+
if last_tblock[:target][:end] < e2
|
224
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
225
|
+
else
|
226
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
153
227
|
end
|
154
228
|
end
|
155
|
-
end
|
156
229
|
|
157
|
-
|
158
|
-
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
230
|
+
lblocks
|
159
231
|
end
|
160
232
|
end
|
161
233
|
|
234
|
+
|
235
|
+
def indices(str, target)
|
236
|
+
position = 0
|
237
|
+
len = target.len
|
238
|
+
Enumerator.new do |yielder|
|
239
|
+
while idx = str.index(target, position)
|
240
|
+
yielder << idx
|
241
|
+
position = idx + len
|
242
|
+
end
|
243
|
+
end
|
244
|
+
end
|
245
|
+
|
162
246
|
def transform_begin_position(begin_position)
|
163
247
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
164
248
|
block = @block_alignment[:blocks][i]
|
165
249
|
|
166
|
-
b = if block[:alignment] == :block
|
250
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
167
251
|
begin_position + block[:delta]
|
168
252
|
elsif block[:alignment] == :empty
|
169
253
|
if begin_position == block[:source][:begin]
|
170
254
|
block[:target][:begin]
|
171
255
|
else
|
172
|
-
# raise "lost annotation"
|
173
256
|
nil
|
174
257
|
end
|
175
258
|
else
|
@@ -182,13 +265,12 @@ class TextAlignment::TextAlignment
|
|
182
265
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
183
266
|
block = @block_alignment[:blocks][i]
|
184
267
|
|
185
|
-
e = if block[:alignment] == :block
|
268
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
186
269
|
end_position + block[:delta]
|
187
270
|
elsif block[:alignment] == :empty
|
188
271
|
if end_position == block[:source][:end]
|
189
272
|
block[:target][:end]
|
190
273
|
else
|
191
|
-
# raise "lost annotation"
|
192
274
|
nil
|
193
275
|
end
|
194
276
|
else
|
@@ -210,14 +292,14 @@ class TextAlignment::TextAlignment
|
|
210
292
|
@lost_annotations = []
|
211
293
|
|
212
294
|
denotations.each do |d|
|
213
|
-
begin
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
end
|
295
|
+
source = {begin:d.begin, end:d.end}
|
296
|
+
d.begin = transform_begin_position(d.begin);
|
297
|
+
d.end = transform_end_position(d.end);
|
298
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
|
299
|
+
rescue
|
300
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
301
|
+
d.begin = nil
|
302
|
+
d.end = nil
|
221
303
|
end
|
222
304
|
|
223
305
|
@lost_annotations
|
@@ -228,12 +310,12 @@ class TextAlignment::TextAlignment
|
|
228
310
|
@lost_annotations = []
|
229
311
|
|
230
312
|
r = hdenotations.collect do |d|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
313
|
+
t = transform_a_span(d[:span])
|
314
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
|
315
|
+
new_d = d.dup.merge({span:t})
|
316
|
+
rescue
|
317
|
+
@lost_annotations << {source: d[:span], target:t}
|
318
|
+
nil
|
237
319
|
end.compact
|
238
320
|
|
239
321
|
r
|
@@ -247,12 +329,16 @@ class TextAlignment::TextAlignment
|
|
247
329
|
@block_alignment[:blocks].each do |a|
|
248
330
|
show += case a[:alignment]
|
249
331
|
when :block
|
250
|
-
"===== common
|
332
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
333
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
334
|
+
when :term
|
335
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
251
336
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
252
337
|
when :empty
|
253
|
-
"
|
338
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
339
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
254
340
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
255
|
-
">>>>> string 2\n" +
|
341
|
+
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
256
342
|
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
257
343
|
else
|
258
344
|
astr1 = ''
|
@@ -286,7 +372,7 @@ class TextAlignment::TextAlignment
|
|
286
372
|
end
|
287
373
|
end.join('')
|
288
374
|
|
289
|
-
"***** local mismatch\n" +
|
375
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
290
376
|
"[#{astr1}]\n" +
|
291
377
|
"[#{astr2}]\n\n"
|
292
378
|
end
|
@@ -294,59 +380,4 @@ class TextAlignment::TextAlignment
|
|
294
380
|
show
|
295
381
|
end
|
296
382
|
|
297
|
-
private
|
298
|
-
|
299
|
-
def string_preprocessing(_str1, _str2)
|
300
|
-
str1 = _str1.dup
|
301
|
-
str2 = _str2.dup
|
302
|
-
mappings = TextAlignment::MAPPINGS.dup
|
303
|
-
|
304
|
-
## single character mappings
|
305
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
306
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
307
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
308
|
-
characters_to.gsub!(/-/, '\-')
|
309
|
-
|
310
|
-
str1.tr!(characters_from, characters_to)
|
311
|
-
str2.tr!(characters_from, characters_to)
|
312
|
-
|
313
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
314
|
-
|
315
|
-
## long to one character mappings
|
316
|
-
pletters = TextAlignment::PADDING_LETTERS
|
317
|
-
|
318
|
-
# find the padding letter for str1
|
319
|
-
padding_letter1 = begin
|
320
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
321
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
322
|
-
TextAlignment::PADDING_LETTERS[i]
|
323
|
-
end
|
324
|
-
|
325
|
-
# find the padding letter for str2
|
326
|
-
padding_letter2 = begin
|
327
|
-
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
328
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
329
|
-
TextAlignment::PADDING_LETTERS[i]
|
330
|
-
end
|
331
|
-
|
332
|
-
# ASCII foldings
|
333
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
334
|
-
ascii_foldings.each do |f|
|
335
|
-
from = f[1]
|
336
|
-
|
337
|
-
if str2.index(f[0])
|
338
|
-
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
339
|
-
str1.gsub!(from, to)
|
340
|
-
end
|
341
|
-
|
342
|
-
if str1.index(f[0])
|
343
|
-
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
344
|
-
str2.gsub!(from, to)
|
345
|
-
end
|
346
|
-
end
|
347
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
348
|
-
|
349
|
-
[str1, str2, mappings]
|
350
|
-
end
|
351
|
-
|
352
383
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.7'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|