text_alignment 0.6.2 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +4 -19
- data/lib/text_alignment/mixed_alignment.rb +74 -4
- data/lib/text_alignment/text_alignment.rb +203 -182
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 972c5735de6aa85f5f9cd289e965f3ec3b8c38c492085e203686bc0ea897a293
|
4
|
+
data.tar.gz: fc0abe3043562c82af5a3c0cf1178586ffcee7921d7f11dbd5cdb93311cbd52a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cfb1e21285616819cea937dce0f8422cddcd2ddb6ccf70d19bf2fd5851a33eede0760b4ed956049dfb3fb1cdfb7758d5bfbf19cff14ffedc2e1ffd80928200e0
|
7
|
+
data.tar.gz: 3fb72b7abe05c1a67db6c18448a0f601260b7d3f733e9b5e9fbe3ba5d9ec791e940bbdf70e00193658139480d320c2f9675426faff5e7e90d80eb9d8b07b074a
|
data/bin/align_annotations
CHANGED
@@ -105,9 +105,7 @@ lost_annotations = []
|
|
105
105
|
target_annotations = if source_annotations.class == Array
|
106
106
|
align_mdoc(source_annotations, {text: target_text})
|
107
107
|
else
|
108
|
-
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
109
|
-
|
110
|
-
# pp alignment
|
108
|
+
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
|
111
109
|
|
112
110
|
# verification
|
113
111
|
# source_text = source_annotations[:text]
|
@@ -142,22 +140,7 @@ else
|
|
142
140
|
puts "====="
|
143
141
|
# exit
|
144
142
|
|
145
|
-
# verification of source denotations
|
146
|
-
puts "[Invalid source denotations]"
|
147
|
-
source_annotations[:denotations] do |d|
|
148
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
149
|
-
end
|
150
|
-
puts "====="
|
151
|
-
puts
|
152
|
-
|
153
143
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
154
|
-
puts "[Invalid transformation]"
|
155
|
-
denotations.each do |d|
|
156
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
157
|
-
end
|
158
|
-
puts "====="
|
159
|
-
puts
|
160
|
-
|
161
144
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
162
145
|
|
163
146
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
|
|
194
177
|
|
195
178
|
if lost_annotations
|
196
179
|
warn "\n[lost annotations]"
|
197
|
-
|
180
|
+
lost_annotations.each do |a|
|
181
|
+
p a
|
182
|
+
end
|
198
183
|
end
|
199
184
|
|
200
185
|
#puts target_annotations.to_json
|
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(
|
21
|
-
raise ArgumentError, "nil string" if
|
22
|
-
|
20
|
+
def initialize(_str1, _str2)
|
21
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
|
+
|
23
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
23
24
|
|
24
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
25
26
|
end
|
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
62
63
|
end
|
63
64
|
|
64
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
65
|
-
@similarity =
|
66
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
67
|
@str1_match_initial = cmp.str1_match_initial
|
67
68
|
@str1_match_final = cmp.str1_match_final
|
68
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
|
|
137
138
|
@position_map_begin = posmap_begin.sort.to_h
|
138
139
|
@position_map_end = posmap_end.sort.to_h
|
139
140
|
end
|
141
|
+
|
142
|
+
private
|
143
|
+
|
144
|
+
def string_preprocessing(_str1, _str2)
|
145
|
+
str1 = _str1.dup
|
146
|
+
str2 = _str2.dup
|
147
|
+
mappings = TextAlignment::MAPPINGS.dup
|
148
|
+
|
149
|
+
## single character mappings
|
150
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
+
characters_to.gsub!(/-/, '\-')
|
154
|
+
|
155
|
+
str1.tr!(characters_from, characters_to)
|
156
|
+
str2.tr!(characters_from, characters_to)
|
157
|
+
|
158
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
+
|
160
|
+
## long to one character mappings
|
161
|
+
pletters = TextAlignment::PADDING_LETTERS
|
162
|
+
|
163
|
+
# find the padding letter for str1
|
164
|
+
@padding_letter1 = begin
|
165
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
+
TextAlignment::PADDING_LETTERS[i]
|
168
|
+
end
|
169
|
+
|
170
|
+
# find the padding letter for str2
|
171
|
+
@padding_letter2 = begin
|
172
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
+
TextAlignment::PADDING_LETTERS[i]
|
175
|
+
end
|
176
|
+
|
177
|
+
# ASCII foldings
|
178
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
+
ascii_foldings.each do |f|
|
180
|
+
from = f[1]
|
181
|
+
|
182
|
+
if str2.index(f[0])
|
183
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
+
str1.gsub!(from, to)
|
185
|
+
end
|
186
|
+
|
187
|
+
if str1.index(f[0])
|
188
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
+
str2.gsub!(from, to)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
+
|
194
|
+
[str1, str2, mappings]
|
195
|
+
end
|
196
|
+
|
197
|
+
def compute_similarity(_s1, _s2, sdiff)
|
198
|
+
return 0 if sdiff.nil?
|
199
|
+
|
200
|
+
# compute the lcs only with non-whitespace letters
|
201
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
+
return 0 if lcs == 0
|
203
|
+
|
204
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
+
|
207
|
+
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
+
end
|
209
|
+
|
140
210
|
end
|
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
|
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
|
-
@block_alignment = {source_text:
|
18
|
+
@block_alignment = {source_text:str1, target_text:str2}
|
19
|
+
@str1 = str1
|
20
|
+
@str2 = str2
|
19
21
|
|
20
|
-
|
21
|
-
|
22
|
-
# try exact match
|
22
|
+
## Block exact match
|
23
23
|
block_begin = str2.index(str1)
|
24
24
|
unless block_begin.nil?
|
25
25
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
|
-
return
|
26
|
+
return
|
27
27
|
end
|
28
28
|
|
29
|
-
# try exact match
|
30
29
|
block_begin = str2.downcase.index(str1.downcase)
|
31
30
|
unless block_begin.nil?
|
32
31
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
33
|
-
return
|
32
|
+
return
|
34
33
|
end
|
35
34
|
|
35
|
+
|
36
|
+
## to find block alignments
|
36
37
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
37
38
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
last
|
42
|
-
|
43
|
-
last[:
|
44
|
-
last[:target][:end] = anchor[:target][:end]
|
39
|
+
blocks = []
|
40
|
+
while block = anchor_finder.get_next_anchor
|
41
|
+
last = blocks.last
|
42
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
43
|
+
last[:source][:end] = block[:source][:end]
|
44
|
+
last[:target][:end] = block[:target][:end]
|
45
45
|
else
|
46
|
-
|
46
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
50
|
-
# pp
|
50
|
+
# pp blocks
|
51
51
|
# puts "-----"
|
52
52
|
# puts
|
53
|
-
#
|
53
|
+
# exit
|
54
|
+
# blocks.each do |b|
|
54
55
|
# p [b[:source], b[:target]]
|
55
56
|
# puts "---"
|
56
57
|
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
@@ -62,117 +63,202 @@ class TextAlignment::TextAlignment
|
|
62
63
|
# puts "-=-=-=-=-"
|
63
64
|
# puts
|
64
65
|
|
65
|
-
##
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
if mblocks[0][:source][:begin] > 0
|
71
|
-
e1 = mblocks[0][:source][:begin]
|
72
|
-
e2 = mblocks[0][:target][:begin]
|
66
|
+
## to fill the gaps
|
67
|
+
last_block = nil
|
68
|
+
blocks2 = blocks.inject([]) do |sum, block|
|
69
|
+
b1 = last_block ? last_block[:source][:end] : 0
|
70
|
+
e1 = block[:source][:begin]
|
73
71
|
|
74
|
-
if
|
75
|
-
|
72
|
+
sum += if b1 == e1
|
73
|
+
[block]
|
76
74
|
else
|
77
|
-
|
78
|
-
|
75
|
+
b2 = last_block ? last_block[:target][:end] : 0
|
76
|
+
e2 = block[:target][:begin]
|
77
|
+
|
78
|
+
if b2 == e2
|
79
|
+
[
|
80
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
81
|
+
block
|
82
|
+
]
|
83
|
+
else
|
84
|
+
if b1 == 0 && b2 == 0
|
85
|
+
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
+
b2 = e2 - len_buffer if e2 > len_buffer
|
87
|
+
end
|
79
88
|
|
80
|
-
|
81
|
-
|
82
|
-
|
89
|
+
_str1 = str1[b1 ... e1]
|
90
|
+
_str2 = str2[b2 ... e2]
|
91
|
+
|
92
|
+
if _str1.strip.empty? || _str2.strip.empty?
|
93
|
+
[
|
94
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
95
|
+
block
|
96
|
+
]
|
83
97
|
else
|
84
|
-
|
85
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
-
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
87
|
-
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
88
|
-
|
89
|
-
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
90
|
-
|
91
|
-
_str1 = str1[b1 ... e1]
|
92
|
-
_str2 = str2[b2 ... e2]
|
93
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
94
|
-
similarity = alignment_similarity(_str1, _str2, alignment)
|
95
|
-
if similarity < 0.6
|
96
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
|
97
|
-
else
|
98
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
99
|
-
end
|
98
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
100
99
|
end
|
101
100
|
end
|
102
101
|
end
|
102
|
+
|
103
|
+
last_block = block
|
104
|
+
sum
|
103
105
|
end
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
if
|
115
|
-
|
106
|
+
|
107
|
+
# the last step
|
108
|
+
blocks2 += if last_block.nil?
|
109
|
+
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
110
|
+
else
|
111
|
+
b1 = last_block[:source][:end]
|
112
|
+
if b1 < str1.length
|
113
|
+
e1 = str1.length
|
114
|
+
|
115
|
+
b2 = last_block[:target][:end]
|
116
|
+
if b2 < str2.length
|
117
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
118
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
119
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
116
120
|
else
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
121
|
+
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
122
|
+
end
|
123
|
+
else
|
124
|
+
[]
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
@block_alignment[:blocks] = blocks2
|
129
|
+
end
|
130
|
+
|
131
|
+
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
132
|
+
block2 = str2[b2 ... e2]
|
133
|
+
|
134
|
+
## term-based alignment
|
135
|
+
tblocks = if denotations
|
136
|
+
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
137
|
+
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
138
|
+
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
139
|
+
|
140
|
+
position = 0
|
141
|
+
tblocks = ds_in_scope.map do |term|
|
142
|
+
lex = term[:lex]
|
143
|
+
r = block2.index(lex, position)
|
144
|
+
if r.nil?
|
145
|
+
position = nil
|
146
|
+
break
|
147
|
+
end
|
148
|
+
position = r + lex.length
|
149
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
150
|
+
end
|
151
|
+
|
152
|
+
# missing term found
|
153
|
+
tblocks = [] if position.nil?
|
154
|
+
|
155
|
+
# redundant matching found
|
156
|
+
unless position.nil?
|
157
|
+
ds_in_scope.each do |term|
|
158
|
+
lex = term[:lex]
|
159
|
+
look_forward = block2.index(lex, position)
|
160
|
+
unless look_forward.nil?
|
161
|
+
puts lex
|
162
|
+
tblocks = []
|
163
|
+
break
|
123
164
|
end
|
124
165
|
end
|
125
166
|
end
|
126
|
-
|
167
|
+
|
168
|
+
tblocks
|
127
169
|
end
|
128
170
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
171
|
+
if tblocks.empty?
|
172
|
+
if b1 == 0 && e1 == str1.length
|
173
|
+
if (e1 > 1000) || (e2 > 1000)
|
174
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
175
|
+
else
|
176
|
+
block1 = str1[b1 ... e1]
|
177
|
+
block2 = str2[b2 ... e2]
|
178
|
+
|
179
|
+
## character-based alignment
|
180
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
181
|
+
if alignment.sdiff.nil?
|
182
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
183
|
+
else
|
184
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
185
|
+
end
|
186
|
+
end
|
187
|
+
else
|
188
|
+
block1 = str1[b1 ... e1]
|
189
|
+
block2 = str2[b2 ... e2]
|
135
190
|
|
136
|
-
|
137
|
-
|
138
|
-
|
191
|
+
## character-based alignment
|
192
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
193
|
+
if alignment.sdiff.nil?
|
194
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
139
195
|
else
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
196
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
197
|
+
end
|
198
|
+
end
|
199
|
+
else
|
200
|
+
last_tblock = nil
|
201
|
+
lblocks = tblocks.inject([]) do |sum, tblock|
|
202
|
+
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
203
|
+
te1 = tblock[:source][:begin]
|
146
204
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
205
|
+
sum += if te1 == tb1
|
206
|
+
[tblock]
|
207
|
+
else
|
208
|
+
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
209
|
+
te2 = tblock[:target][:begin]
|
210
|
+
|
211
|
+
if b2 == e2
|
212
|
+
[
|
213
|
+
{source:{begin:tb1, end:te1}, alignment: :empty},
|
214
|
+
tblock
|
215
|
+
]
|
151
216
|
else
|
152
|
-
|
217
|
+
[
|
218
|
+
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
219
|
+
tblock
|
220
|
+
]
|
153
221
|
end
|
222
|
+
end
|
154
223
|
|
155
|
-
|
224
|
+
last_tblock = tblock
|
225
|
+
sum
|
226
|
+
end
|
227
|
+
|
228
|
+
if last_tblock[:source][:end] < e1
|
229
|
+
if last_tblock[:target][:end] < e2
|
230
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
231
|
+
else
|
232
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
156
233
|
end
|
157
234
|
end
|
158
|
-
end
|
159
235
|
|
160
|
-
|
161
|
-
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
236
|
+
lblocks
|
162
237
|
end
|
163
238
|
end
|
164
239
|
|
240
|
+
|
241
|
+
def indices(str, target)
|
242
|
+
position = 0
|
243
|
+
len = target.len
|
244
|
+
Enumerator.new do |yielder|
|
245
|
+
while idx = str.index(target, position)
|
246
|
+
yielder << idx
|
247
|
+
position = idx + len
|
248
|
+
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
|
165
252
|
def transform_begin_position(begin_position)
|
166
253
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
167
254
|
block = @block_alignment[:blocks][i]
|
168
255
|
|
169
|
-
b = if block[:alignment] == :block
|
256
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
170
257
|
begin_position + block[:delta]
|
171
258
|
elsif block[:alignment] == :empty
|
172
259
|
if begin_position == block[:source][:begin]
|
173
260
|
block[:target][:begin]
|
174
261
|
else
|
175
|
-
# raise "lost annotation"
|
176
262
|
nil
|
177
263
|
end
|
178
264
|
else
|
@@ -185,13 +271,12 @@ class TextAlignment::TextAlignment
|
|
185
271
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
186
272
|
block = @block_alignment[:blocks][i]
|
187
273
|
|
188
|
-
e = if block[:alignment] == :block
|
274
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
189
275
|
end_position + block[:delta]
|
190
276
|
elsif block[:alignment] == :empty
|
191
277
|
if end_position == block[:source][:end]
|
192
278
|
block[:target][:end]
|
193
279
|
else
|
194
|
-
# raise "lost annotation"
|
195
280
|
nil
|
196
281
|
end
|
197
282
|
else
|
@@ -213,14 +298,14 @@ class TextAlignment::TextAlignment
|
|
213
298
|
@lost_annotations = []
|
214
299
|
|
215
300
|
denotations.each do |d|
|
216
|
-
begin
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
end
|
301
|
+
source = {begin:d.begin, end:d.end}
|
302
|
+
d.begin = transform_begin_position(d.begin);
|
303
|
+
d.end = transform_end_position(d.end);
|
304
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
|
305
|
+
rescue
|
306
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
307
|
+
d.begin = nil
|
308
|
+
d.end = nil
|
224
309
|
end
|
225
310
|
|
226
311
|
@lost_annotations
|
@@ -231,12 +316,12 @@ class TextAlignment::TextAlignment
|
|
231
316
|
@lost_annotations = []
|
232
317
|
|
233
318
|
r = hdenotations.collect do |d|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
319
|
+
t = transform_a_span(d[:span])
|
320
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
|
321
|
+
new_d = d.dup.merge({span:t})
|
322
|
+
rescue
|
323
|
+
@lost_annotations << {source: d[:span], target:t}
|
324
|
+
nil
|
240
325
|
end.compact
|
241
326
|
|
242
327
|
r
|
@@ -250,13 +335,16 @@ class TextAlignment::TextAlignment
|
|
250
335
|
@block_alignment[:blocks].each do |a|
|
251
336
|
show += case a[:alignment]
|
252
337
|
when :block
|
253
|
-
"===== common
|
338
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
339
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
340
|
+
when :term
|
341
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
254
342
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
255
343
|
when :empty
|
256
344
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
257
|
-
"<<<<< string 1\n" +
|
345
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
258
346
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
259
|
-
">>>>> string 2\n" +
|
347
|
+
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
260
348
|
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
261
349
|
else
|
262
350
|
astr1 = ''
|
@@ -290,7 +378,7 @@ class TextAlignment::TextAlignment
|
|
290
378
|
end
|
291
379
|
end.join('')
|
292
380
|
|
293
|
-
"***** local mismatch\n" +
|
381
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
294
382
|
"[#{astr1}]\n" +
|
295
383
|
"[#{astr2}]\n\n"
|
296
384
|
end
|
@@ -298,71 +386,4 @@ class TextAlignment::TextAlignment
|
|
298
386
|
show
|
299
387
|
end
|
300
388
|
|
301
|
-
private
|
302
|
-
|
303
|
-
def string_preprocessing(_str1, _str2)
|
304
|
-
str1 = _str1.dup
|
305
|
-
str2 = _str2.dup
|
306
|
-
mappings = TextAlignment::MAPPINGS.dup
|
307
|
-
|
308
|
-
## single character mappings
|
309
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
310
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
311
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
312
|
-
characters_to.gsub!(/-/, '\-')
|
313
|
-
|
314
|
-
str1.tr!(characters_from, characters_to)
|
315
|
-
str2.tr!(characters_from, characters_to)
|
316
|
-
|
317
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
318
|
-
|
319
|
-
## long to one character mappings
|
320
|
-
pletters = TextAlignment::PADDING_LETTERS
|
321
|
-
|
322
|
-
# find the padding letter for str1
|
323
|
-
@padding_letter1 = begin
|
324
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
325
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
326
|
-
TextAlignment::PADDING_LETTERS[i]
|
327
|
-
end
|
328
|
-
|
329
|
-
# find the padding letter for str2
|
330
|
-
@padding_letter2 = begin
|
331
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
332
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
333
|
-
TextAlignment::PADDING_LETTERS[i]
|
334
|
-
end
|
335
|
-
|
336
|
-
# ASCII foldings
|
337
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
338
|
-
ascii_foldings.each do |f|
|
339
|
-
from = f[1]
|
340
|
-
|
341
|
-
if str2.index(f[0])
|
342
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
343
|
-
str1.gsub!(from, to)
|
344
|
-
end
|
345
|
-
|
346
|
-
if str1.index(f[0])
|
347
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
348
|
-
str2.gsub!(from, to)
|
349
|
-
end
|
350
|
-
end
|
351
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
352
|
-
|
353
|
-
[str1, str2, mappings]
|
354
|
-
end
|
355
|
-
|
356
|
-
def alignment_similarity(_s1, _s2, alignment)
|
357
|
-
return 0 if alignment.sdiff.nil?
|
358
|
-
|
359
|
-
# compute the lcs only with non-whitespace letters
|
360
|
-
lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
361
|
-
|
362
|
-
s1 = _s1.tr(@padding_letter1, ' ')
|
363
|
-
s2 = _s2.tr(@padding_letter2, ' ')
|
364
|
-
|
365
|
-
similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
|
366
|
-
end
|
367
|
-
|
368
389
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|