text_alignment 0.6.2 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +4 -19
- data/lib/text_alignment/mixed_alignment.rb +74 -4
- data/lib/text_alignment/text_alignment.rb +203 -182
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 972c5735de6aa85f5f9cd289e965f3ec3b8c38c492085e203686bc0ea897a293
|
4
|
+
data.tar.gz: fc0abe3043562c82af5a3c0cf1178586ffcee7921d7f11dbd5cdb93311cbd52a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cfb1e21285616819cea937dce0f8422cddcd2ddb6ccf70d19bf2fd5851a33eede0760b4ed956049dfb3fb1cdfb7758d5bfbf19cff14ffedc2e1ffd80928200e0
|
7
|
+
data.tar.gz: 3fb72b7abe05c1a67db6c18448a0f601260b7d3f733e9b5e9fbe3ba5d9ec791e940bbdf70e00193658139480d320c2f9675426faff5e7e90d80eb9d8b07b074a
|
data/bin/align_annotations
CHANGED
@@ -105,9 +105,7 @@ lost_annotations = []
|
|
105
105
|
target_annotations = if source_annotations.class == Array
|
106
106
|
align_mdoc(source_annotations, {text: target_text})
|
107
107
|
else
|
108
|
-
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
109
|
-
|
110
|
-
# pp alignment
|
108
|
+
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
|
111
109
|
|
112
110
|
# verification
|
113
111
|
# source_text = source_annotations[:text]
|
@@ -142,22 +140,7 @@ else
|
|
142
140
|
puts "====="
|
143
141
|
# exit
|
144
142
|
|
145
|
-
# verification of source denotations
|
146
|
-
puts "[Invalid source denotations]"
|
147
|
-
source_annotations[:denotations] do |d|
|
148
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
149
|
-
end
|
150
|
-
puts "====="
|
151
|
-
puts
|
152
|
-
|
153
143
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
154
|
-
puts "[Invalid transformation]"
|
155
|
-
denotations.each do |d|
|
156
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
157
|
-
end
|
158
|
-
puts "====="
|
159
|
-
puts
|
160
|
-
|
161
144
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
162
145
|
|
163
146
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
|
|
194
177
|
|
195
178
|
if lost_annotations
|
196
179
|
warn "\n[lost annotations]"
|
197
|
-
|
180
|
+
lost_annotations.each do |a|
|
181
|
+
p a
|
182
|
+
end
|
198
183
|
end
|
199
184
|
|
200
185
|
#puts target_annotations.to_json
|
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(
|
21
|
-
raise ArgumentError, "nil string" if
|
22
|
-
|
20
|
+
def initialize(_str1, _str2)
|
21
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
|
+
|
23
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
23
24
|
|
24
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
25
26
|
end
|
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
62
63
|
end
|
63
64
|
|
64
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
65
|
-
@similarity =
|
66
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
67
|
@str1_match_initial = cmp.str1_match_initial
|
67
68
|
@str1_match_final = cmp.str1_match_final
|
68
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
|
|
137
138
|
@position_map_begin = posmap_begin.sort.to_h
|
138
139
|
@position_map_end = posmap_end.sort.to_h
|
139
140
|
end
|
141
|
+
|
142
|
+
private
|
143
|
+
|
144
|
+
def string_preprocessing(_str1, _str2)
|
145
|
+
str1 = _str1.dup
|
146
|
+
str2 = _str2.dup
|
147
|
+
mappings = TextAlignment::MAPPINGS.dup
|
148
|
+
|
149
|
+
## single character mappings
|
150
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
+
characters_to.gsub!(/-/, '\-')
|
154
|
+
|
155
|
+
str1.tr!(characters_from, characters_to)
|
156
|
+
str2.tr!(characters_from, characters_to)
|
157
|
+
|
158
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
+
|
160
|
+
## long to one character mappings
|
161
|
+
pletters = TextAlignment::PADDING_LETTERS
|
162
|
+
|
163
|
+
# find the padding letter for str1
|
164
|
+
@padding_letter1 = begin
|
165
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
+
TextAlignment::PADDING_LETTERS[i]
|
168
|
+
end
|
169
|
+
|
170
|
+
# find the padding letter for str2
|
171
|
+
@padding_letter2 = begin
|
172
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
+
TextAlignment::PADDING_LETTERS[i]
|
175
|
+
end
|
176
|
+
|
177
|
+
# ASCII foldings
|
178
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
+
ascii_foldings.each do |f|
|
180
|
+
from = f[1]
|
181
|
+
|
182
|
+
if str2.index(f[0])
|
183
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
+
str1.gsub!(from, to)
|
185
|
+
end
|
186
|
+
|
187
|
+
if str1.index(f[0])
|
188
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
+
str2.gsub!(from, to)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
+
|
194
|
+
[str1, str2, mappings]
|
195
|
+
end
|
196
|
+
|
197
|
+
def compute_similarity(_s1, _s2, sdiff)
|
198
|
+
return 0 if sdiff.nil?
|
199
|
+
|
200
|
+
# compute the lcs only with non-whitespace letters
|
201
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
+
return 0 if lcs == 0
|
203
|
+
|
204
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
+
|
207
|
+
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
+
end
|
209
|
+
|
140
210
|
end
|
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
|
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
|
-
@block_alignment = {source_text:
|
18
|
+
@block_alignment = {source_text:str1, target_text:str2}
|
19
|
+
@str1 = str1
|
20
|
+
@str2 = str2
|
19
21
|
|
20
|
-
|
21
|
-
|
22
|
-
# try exact match
|
22
|
+
## Block exact match
|
23
23
|
block_begin = str2.index(str1)
|
24
24
|
unless block_begin.nil?
|
25
25
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
|
-
return
|
26
|
+
return
|
27
27
|
end
|
28
28
|
|
29
|
-
# try exact match
|
30
29
|
block_begin = str2.downcase.index(str1.downcase)
|
31
30
|
unless block_begin.nil?
|
32
31
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
33
|
-
return
|
32
|
+
return
|
34
33
|
end
|
35
34
|
|
35
|
+
|
36
|
+
## to find block alignments
|
36
37
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
37
38
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
last
|
42
|
-
|
43
|
-
last[:
|
44
|
-
last[:target][:end] = anchor[:target][:end]
|
39
|
+
blocks = []
|
40
|
+
while block = anchor_finder.get_next_anchor
|
41
|
+
last = blocks.last
|
42
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
43
|
+
last[:source][:end] = block[:source][:end]
|
44
|
+
last[:target][:end] = block[:target][:end]
|
45
45
|
else
|
46
|
-
|
46
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
50
|
-
# pp
|
50
|
+
# pp blocks
|
51
51
|
# puts "-----"
|
52
52
|
# puts
|
53
|
-
#
|
53
|
+
# exit
|
54
|
+
# blocks.each do |b|
|
54
55
|
# p [b[:source], b[:target]]
|
55
56
|
# puts "---"
|
56
57
|
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
@@ -62,117 +63,202 @@ class TextAlignment::TextAlignment
|
|
62
63
|
# puts "-=-=-=-=-"
|
63
64
|
# puts
|
64
65
|
|
65
|
-
##
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
if mblocks[0][:source][:begin] > 0
|
71
|
-
e1 = mblocks[0][:source][:begin]
|
72
|
-
e2 = mblocks[0][:target][:begin]
|
66
|
+
## to fill the gaps
|
67
|
+
last_block = nil
|
68
|
+
blocks2 = blocks.inject([]) do |sum, block|
|
69
|
+
b1 = last_block ? last_block[:source][:end] : 0
|
70
|
+
e1 = block[:source][:begin]
|
73
71
|
|
74
|
-
if
|
75
|
-
|
72
|
+
sum += if b1 == e1
|
73
|
+
[block]
|
76
74
|
else
|
77
|
-
|
78
|
-
|
75
|
+
b2 = last_block ? last_block[:target][:end] : 0
|
76
|
+
e2 = block[:target][:begin]
|
77
|
+
|
78
|
+
if b2 == e2
|
79
|
+
[
|
80
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
81
|
+
block
|
82
|
+
]
|
83
|
+
else
|
84
|
+
if b1 == 0 && b2 == 0
|
85
|
+
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
+
b2 = e2 - len_buffer if e2 > len_buffer
|
87
|
+
end
|
79
88
|
|
80
|
-
|
81
|
-
|
82
|
-
|
89
|
+
_str1 = str1[b1 ... e1]
|
90
|
+
_str2 = str2[b2 ... e2]
|
91
|
+
|
92
|
+
if _str1.strip.empty? || _str2.strip.empty?
|
93
|
+
[
|
94
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
95
|
+
block
|
96
|
+
]
|
83
97
|
else
|
84
|
-
|
85
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
-
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
87
|
-
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
88
|
-
|
89
|
-
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
90
|
-
|
91
|
-
_str1 = str1[b1 ... e1]
|
92
|
-
_str2 = str2[b2 ... e2]
|
93
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
94
|
-
similarity = alignment_similarity(_str1, _str2, alignment)
|
95
|
-
if similarity < 0.6
|
96
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
|
97
|
-
else
|
98
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
99
|
-
end
|
98
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
100
99
|
end
|
101
100
|
end
|
102
101
|
end
|
102
|
+
|
103
|
+
last_block = block
|
104
|
+
sum
|
103
105
|
end
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
if
|
115
|
-
|
106
|
+
|
107
|
+
# the last step
|
108
|
+
blocks2 += if last_block.nil?
|
109
|
+
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
110
|
+
else
|
111
|
+
b1 = last_block[:source][:end]
|
112
|
+
if b1 < str1.length
|
113
|
+
e1 = str1.length
|
114
|
+
|
115
|
+
b2 = last_block[:target][:end]
|
116
|
+
if b2 < str2.length
|
117
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
118
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
119
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
116
120
|
else
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
121
|
+
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
122
|
+
end
|
123
|
+
else
|
124
|
+
[]
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
@block_alignment[:blocks] = blocks2
|
129
|
+
end
|
130
|
+
|
131
|
+
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
132
|
+
block2 = str2[b2 ... e2]
|
133
|
+
|
134
|
+
## term-based alignment
|
135
|
+
tblocks = if denotations
|
136
|
+
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
137
|
+
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
138
|
+
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
139
|
+
|
140
|
+
position = 0
|
141
|
+
tblocks = ds_in_scope.map do |term|
|
142
|
+
lex = term[:lex]
|
143
|
+
r = block2.index(lex, position)
|
144
|
+
if r.nil?
|
145
|
+
position = nil
|
146
|
+
break
|
147
|
+
end
|
148
|
+
position = r + lex.length
|
149
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
150
|
+
end
|
151
|
+
|
152
|
+
# missing term found
|
153
|
+
tblocks = [] if position.nil?
|
154
|
+
|
155
|
+
# redundant matching found
|
156
|
+
unless position.nil?
|
157
|
+
ds_in_scope.each do |term|
|
158
|
+
lex = term[:lex]
|
159
|
+
look_forward = block2.index(lex, position)
|
160
|
+
unless look_forward.nil?
|
161
|
+
puts lex
|
162
|
+
tblocks = []
|
163
|
+
break
|
123
164
|
end
|
124
165
|
end
|
125
166
|
end
|
126
|
-
|
167
|
+
|
168
|
+
tblocks
|
127
169
|
end
|
128
170
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
171
|
+
if tblocks.empty?
|
172
|
+
if b1 == 0 && e1 == str1.length
|
173
|
+
if (e1 > 1000) || (e2 > 1000)
|
174
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
175
|
+
else
|
176
|
+
block1 = str1[b1 ... e1]
|
177
|
+
block2 = str2[b2 ... e2]
|
178
|
+
|
179
|
+
## character-based alignment
|
180
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
181
|
+
if alignment.sdiff.nil?
|
182
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
183
|
+
else
|
184
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
185
|
+
end
|
186
|
+
end
|
187
|
+
else
|
188
|
+
block1 = str1[b1 ... e1]
|
189
|
+
block2 = str2[b2 ... e2]
|
135
190
|
|
136
|
-
|
137
|
-
|
138
|
-
|
191
|
+
## character-based alignment
|
192
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
193
|
+
if alignment.sdiff.nil?
|
194
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
139
195
|
else
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
196
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
197
|
+
end
|
198
|
+
end
|
199
|
+
else
|
200
|
+
last_tblock = nil
|
201
|
+
lblocks = tblocks.inject([]) do |sum, tblock|
|
202
|
+
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
203
|
+
te1 = tblock[:source][:begin]
|
146
204
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
205
|
+
sum += if te1 == tb1
|
206
|
+
[tblock]
|
207
|
+
else
|
208
|
+
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
209
|
+
te2 = tblock[:target][:begin]
|
210
|
+
|
211
|
+
if b2 == e2
|
212
|
+
[
|
213
|
+
{source:{begin:tb1, end:te1}, alignment: :empty},
|
214
|
+
tblock
|
215
|
+
]
|
151
216
|
else
|
152
|
-
|
217
|
+
[
|
218
|
+
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
219
|
+
tblock
|
220
|
+
]
|
153
221
|
end
|
222
|
+
end
|
154
223
|
|
155
|
-
|
224
|
+
last_tblock = tblock
|
225
|
+
sum
|
226
|
+
end
|
227
|
+
|
228
|
+
if last_tblock[:source][:end] < e1
|
229
|
+
if last_tblock[:target][:end] < e2
|
230
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
231
|
+
else
|
232
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
156
233
|
end
|
157
234
|
end
|
158
|
-
end
|
159
235
|
|
160
|
-
|
161
|
-
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
236
|
+
lblocks
|
162
237
|
end
|
163
238
|
end
|
164
239
|
|
240
|
+
|
241
|
+
def indices(str, target)
|
242
|
+
position = 0
|
243
|
+
len = target.len
|
244
|
+
Enumerator.new do |yielder|
|
245
|
+
while idx = str.index(target, position)
|
246
|
+
yielder << idx
|
247
|
+
position = idx + len
|
248
|
+
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
|
165
252
|
def transform_begin_position(begin_position)
|
166
253
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
167
254
|
block = @block_alignment[:blocks][i]
|
168
255
|
|
169
|
-
b = if block[:alignment] == :block
|
256
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
170
257
|
begin_position + block[:delta]
|
171
258
|
elsif block[:alignment] == :empty
|
172
259
|
if begin_position == block[:source][:begin]
|
173
260
|
block[:target][:begin]
|
174
261
|
else
|
175
|
-
# raise "lost annotation"
|
176
262
|
nil
|
177
263
|
end
|
178
264
|
else
|
@@ -185,13 +271,12 @@ class TextAlignment::TextAlignment
|
|
185
271
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
186
272
|
block = @block_alignment[:blocks][i]
|
187
273
|
|
188
|
-
e = if block[:alignment] == :block
|
274
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
189
275
|
end_position + block[:delta]
|
190
276
|
elsif block[:alignment] == :empty
|
191
277
|
if end_position == block[:source][:end]
|
192
278
|
block[:target][:end]
|
193
279
|
else
|
194
|
-
# raise "lost annotation"
|
195
280
|
nil
|
196
281
|
end
|
197
282
|
else
|
@@ -213,14 +298,14 @@ class TextAlignment::TextAlignment
|
|
213
298
|
@lost_annotations = []
|
214
299
|
|
215
300
|
denotations.each do |d|
|
216
|
-
begin
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
end
|
301
|
+
source = {begin:d.begin, end:d.end}
|
302
|
+
d.begin = transform_begin_position(d.begin);
|
303
|
+
d.end = transform_end_position(d.end);
|
304
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
|
305
|
+
rescue
|
306
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
307
|
+
d.begin = nil
|
308
|
+
d.end = nil
|
224
309
|
end
|
225
310
|
|
226
311
|
@lost_annotations
|
@@ -231,12 +316,12 @@ class TextAlignment::TextAlignment
|
|
231
316
|
@lost_annotations = []
|
232
317
|
|
233
318
|
r = hdenotations.collect do |d|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
319
|
+
t = transform_a_span(d[:span])
|
320
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
|
321
|
+
new_d = d.dup.merge({span:t})
|
322
|
+
rescue
|
323
|
+
@lost_annotations << {source: d[:span], target:t}
|
324
|
+
nil
|
240
325
|
end.compact
|
241
326
|
|
242
327
|
r
|
@@ -250,13 +335,16 @@ class TextAlignment::TextAlignment
|
|
250
335
|
@block_alignment[:blocks].each do |a|
|
251
336
|
show += case a[:alignment]
|
252
337
|
when :block
|
253
|
-
"===== common
|
338
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
339
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
340
|
+
when :term
|
341
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
254
342
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
255
343
|
when :empty
|
256
344
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
257
|
-
"<<<<< string 1\n" +
|
345
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
258
346
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
259
|
-
">>>>> string 2\n" +
|
347
|
+
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
260
348
|
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
261
349
|
else
|
262
350
|
astr1 = ''
|
@@ -290,7 +378,7 @@ class TextAlignment::TextAlignment
|
|
290
378
|
end
|
291
379
|
end.join('')
|
292
380
|
|
293
|
-
"***** local mismatch\n" +
|
381
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
294
382
|
"[#{astr1}]\n" +
|
295
383
|
"[#{astr2}]\n\n"
|
296
384
|
end
|
@@ -298,71 +386,4 @@ class TextAlignment::TextAlignment
|
|
298
386
|
show
|
299
387
|
end
|
300
388
|
|
301
|
-
private
|
302
|
-
|
303
|
-
def string_preprocessing(_str1, _str2)
|
304
|
-
str1 = _str1.dup
|
305
|
-
str2 = _str2.dup
|
306
|
-
mappings = TextAlignment::MAPPINGS.dup
|
307
|
-
|
308
|
-
## single character mappings
|
309
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
310
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
311
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
312
|
-
characters_to.gsub!(/-/, '\-')
|
313
|
-
|
314
|
-
str1.tr!(characters_from, characters_to)
|
315
|
-
str2.tr!(characters_from, characters_to)
|
316
|
-
|
317
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
318
|
-
|
319
|
-
## long to one character mappings
|
320
|
-
pletters = TextAlignment::PADDING_LETTERS
|
321
|
-
|
322
|
-
# find the padding letter for str1
|
323
|
-
@padding_letter1 = begin
|
324
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
325
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
326
|
-
TextAlignment::PADDING_LETTERS[i]
|
327
|
-
end
|
328
|
-
|
329
|
-
# find the padding letter for str2
|
330
|
-
@padding_letter2 = begin
|
331
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
332
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
333
|
-
TextAlignment::PADDING_LETTERS[i]
|
334
|
-
end
|
335
|
-
|
336
|
-
# ASCII foldings
|
337
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
338
|
-
ascii_foldings.each do |f|
|
339
|
-
from = f[1]
|
340
|
-
|
341
|
-
if str2.index(f[0])
|
342
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
343
|
-
str1.gsub!(from, to)
|
344
|
-
end
|
345
|
-
|
346
|
-
if str1.index(f[0])
|
347
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
348
|
-
str2.gsub!(from, to)
|
349
|
-
end
|
350
|
-
end
|
351
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
352
|
-
|
353
|
-
[str1, str2, mappings]
|
354
|
-
end
|
355
|
-
|
356
|
-
def alignment_similarity(_s1, _s2, alignment)
|
357
|
-
return 0 if alignment.sdiff.nil?
|
358
|
-
|
359
|
-
# compute the lcs only with non-whitespace letters
|
360
|
-
lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
361
|
-
|
362
|
-
s1 = _s1.tr(@padding_letter1, ' ')
|
363
|
-
s2 = _s2.tr(@padding_letter2, ' ')
|
364
|
-
|
365
|
-
similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
|
366
|
-
end
|
367
|
-
|
368
389
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|