text_alignment 0.6.1 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +4 -19
- data/lib/text_alignment/mixed_alignment.rb +74 -4
- data/lib/text_alignment/text_alignment.rb +199 -179
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '088ea92f4ca68c574cbd04bbf932aa70014b58cbbad82028b7161e0af35cdb4c'
|
|
4
|
+
data.tar.gz: 455e6b53a846e7ebf0a90e724b93b78b61025ccb2d02c1e167f1969946b292e2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 02cdb75cc9b95415c0e86d78bc0278ab8a0cf5a7afa1870ba5c10f0137e59ea782b92e8130ea2f9b3e16fd43b08c8c72da0d2b2ecf4546cb6a46a72ad62957ef
|
|
7
|
+
data.tar.gz: 1216190e0b3880acedc6b735b70a3611469f4fcb4506f059271acbef8d84ffdee793234029c8e5f9e9d1f10cdf088ed44a8c54c93986b19757036aceaf938247
|
data/bin/align_annotations
CHANGED
|
@@ -105,9 +105,7 @@ lost_annotations = []
|
|
|
105
105
|
target_annotations = if source_annotations.class == Array
|
|
106
106
|
align_mdoc(source_annotations, {text: target_text})
|
|
107
107
|
else
|
|
108
|
-
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
|
109
|
-
|
|
110
|
-
# pp alignment
|
|
108
|
+
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
|
|
111
109
|
|
|
112
110
|
# verification
|
|
113
111
|
# source_text = source_annotations[:text]
|
|
@@ -142,22 +140,7 @@ else
|
|
|
142
140
|
puts "====="
|
|
143
141
|
# exit
|
|
144
142
|
|
|
145
|
-
# verification of source denotations
|
|
146
|
-
puts "[Invalid source denotations]"
|
|
147
|
-
source_annotations[:denotations] do |d|
|
|
148
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
|
149
|
-
end
|
|
150
|
-
puts "====="
|
|
151
|
-
puts
|
|
152
|
-
|
|
153
143
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
|
154
|
-
puts "[Invalid transformation]"
|
|
155
|
-
denotations.each do |d|
|
|
156
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
|
157
|
-
end
|
|
158
|
-
puts "====="
|
|
159
|
-
puts
|
|
160
|
-
|
|
161
144
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
|
162
145
|
|
|
163
146
|
source_annotations.merge({text:target_text, denotations:denotations})
|
|
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
|
|
|
194
177
|
|
|
195
178
|
if lost_annotations
|
|
196
179
|
warn "\n[lost annotations]"
|
|
197
|
-
|
|
180
|
+
lost_annotations.each do |a|
|
|
181
|
+
p a
|
|
182
|
+
end
|
|
198
183
|
end
|
|
199
184
|
|
|
200
185
|
#puts target_annotations.to_json
|
|
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
|
|
|
17
17
|
attr_reader :similarity
|
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
|
19
19
|
|
|
20
|
-
def initialize(
|
|
21
|
-
raise ArgumentError, "nil string" if
|
|
22
|
-
|
|
20
|
+
def initialize(_str1, _str2)
|
|
21
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
|
22
|
+
|
|
23
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
|
23
24
|
|
|
24
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
|
25
26
|
end
|
|
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
|
62
63
|
end
|
|
63
64
|
|
|
64
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
|
65
|
-
@similarity =
|
|
66
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
|
66
67
|
@str1_match_initial = cmp.str1_match_initial
|
|
67
68
|
@str1_match_final = cmp.str1_match_final
|
|
68
69
|
@str2_match_initial = cmp.str2_match_initial
|
|
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
|
|
|
137
138
|
@position_map_begin = posmap_begin.sort.to_h
|
|
138
139
|
@position_map_end = posmap_end.sort.to_h
|
|
139
140
|
end
|
|
141
|
+
|
|
142
|
+
private
|
|
143
|
+
|
|
144
|
+
def string_preprocessing(_str1, _str2)
|
|
145
|
+
str1 = _str1.dup
|
|
146
|
+
str2 = _str2.dup
|
|
147
|
+
mappings = TextAlignment::MAPPINGS.dup
|
|
148
|
+
|
|
149
|
+
## single character mappings
|
|
150
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
|
151
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
|
152
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
|
153
|
+
characters_to.gsub!(/-/, '\-')
|
|
154
|
+
|
|
155
|
+
str1.tr!(characters_from, characters_to)
|
|
156
|
+
str2.tr!(characters_from, characters_to)
|
|
157
|
+
|
|
158
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
|
159
|
+
|
|
160
|
+
## long to one character mappings
|
|
161
|
+
pletters = TextAlignment::PADDING_LETTERS
|
|
162
|
+
|
|
163
|
+
# find the padding letter for str1
|
|
164
|
+
@padding_letter1 = begin
|
|
165
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
|
166
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
|
167
|
+
TextAlignment::PADDING_LETTERS[i]
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
# find the padding letter for str2
|
|
171
|
+
@padding_letter2 = begin
|
|
172
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
|
173
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
|
174
|
+
TextAlignment::PADDING_LETTERS[i]
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
# ASCII foldings
|
|
178
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
|
179
|
+
ascii_foldings.each do |f|
|
|
180
|
+
from = f[1]
|
|
181
|
+
|
|
182
|
+
if str2.index(f[0])
|
|
183
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
|
184
|
+
str1.gsub!(from, to)
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
if str1.index(f[0])
|
|
188
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
|
189
|
+
str2.gsub!(from, to)
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
|
193
|
+
|
|
194
|
+
[str1, str2, mappings]
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
def compute_similarity(_s1, _s2, sdiff)
|
|
198
|
+
return 0 if sdiff.nil?
|
|
199
|
+
|
|
200
|
+
# compute the lcs only with non-whitespace letters
|
|
201
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
|
202
|
+
return 0 if lcs == 0
|
|
203
|
+
|
|
204
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
|
205
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
|
206
|
+
|
|
207
|
+
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
|
208
|
+
end
|
|
209
|
+
|
|
140
210
|
end
|
|
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
|
|
|
12
12
|
attr_reader :similarity
|
|
13
13
|
attr_reader :lost_annotations
|
|
14
14
|
|
|
15
|
-
def initialize(
|
|
16
|
-
raise ArgumentError, "nil string" if
|
|
15
|
+
def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
|
16
|
+
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
|
17
17
|
|
|
18
|
-
@block_alignment = {source_text:
|
|
18
|
+
@block_alignment = {source_text:str1, target_text:str2}
|
|
19
|
+
@str1 = str1
|
|
20
|
+
@str2 = str2
|
|
19
21
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
# try exact match
|
|
22
|
+
## Block exact match
|
|
23
23
|
block_begin = str2.index(str1)
|
|
24
24
|
unless block_begin.nil?
|
|
25
25
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
|
26
|
-
return
|
|
26
|
+
return
|
|
27
27
|
end
|
|
28
28
|
|
|
29
|
-
# try exact match
|
|
30
29
|
block_begin = str2.downcase.index(str1.downcase)
|
|
31
30
|
unless block_begin.nil?
|
|
32
31
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
|
33
|
-
return
|
|
32
|
+
return
|
|
34
33
|
end
|
|
35
34
|
|
|
35
|
+
|
|
36
|
+
## to find block alignments
|
|
36
37
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
|
37
38
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
last
|
|
42
|
-
|
|
43
|
-
last[:
|
|
44
|
-
last[:target][:end] = anchor[:target][:end]
|
|
39
|
+
blocks = []
|
|
40
|
+
while block = anchor_finder.get_next_anchor
|
|
41
|
+
last = blocks.last
|
|
42
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
|
43
|
+
last[:source][:end] = block[:source][:end]
|
|
44
|
+
last[:target][:end] = block[:target][:end]
|
|
45
45
|
else
|
|
46
|
-
|
|
46
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
|
47
47
|
end
|
|
48
48
|
end
|
|
49
49
|
|
|
50
|
-
# pp
|
|
50
|
+
# pp blocks
|
|
51
51
|
# puts "-----"
|
|
52
52
|
# puts
|
|
53
|
-
#
|
|
53
|
+
# exit
|
|
54
|
+
# blocks.each do |b|
|
|
54
55
|
# p [b[:source], b[:target]]
|
|
55
56
|
# puts "---"
|
|
56
57
|
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
|
@@ -62,117 +63,198 @@ class TextAlignment::TextAlignment
|
|
|
62
63
|
# puts "-=-=-=-=-"
|
|
63
64
|
# puts
|
|
64
65
|
|
|
65
|
-
##
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
if mblocks[0][:source][:begin] > 0
|
|
71
|
-
e1 = mblocks[0][:source][:begin]
|
|
72
|
-
e2 = mblocks[0][:target][:begin]
|
|
66
|
+
## to fill the gaps
|
|
67
|
+
last_block = nil
|
|
68
|
+
blocks2 = blocks.inject([]) do |sum, block|
|
|
69
|
+
b1 = last_block ? last_block[:source][:end] : 0
|
|
70
|
+
e1 = block[:source][:begin]
|
|
73
71
|
|
|
74
|
-
if
|
|
75
|
-
|
|
72
|
+
sum += if b1 == e1
|
|
73
|
+
[block]
|
|
76
74
|
else
|
|
77
|
-
|
|
78
|
-
|
|
75
|
+
b2 = last_block ? last_block[:target][:end] : 0
|
|
76
|
+
e2 = block[:target][:begin]
|
|
77
|
+
|
|
78
|
+
if b2 == e2
|
|
79
|
+
[
|
|
80
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
|
81
|
+
block
|
|
82
|
+
]
|
|
83
|
+
else
|
|
84
|
+
if b1 == 0 && b2 == 0
|
|
85
|
+
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
|
86
|
+
b2 = e2 - len_buffer if e2 > len_buffer
|
|
87
|
+
end
|
|
79
88
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
89
|
+
_str1 = str1[b1 ... e1]
|
|
90
|
+
_str2 = str2[b2 ... e2]
|
|
91
|
+
|
|
92
|
+
if _str1.strip.empty? || _str2.strip.empty?
|
|
93
|
+
[
|
|
94
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
|
95
|
+
block
|
|
96
|
+
]
|
|
83
97
|
else
|
|
84
|
-
|
|
85
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
|
86
|
-
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
|
87
|
-
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
|
88
|
-
|
|
89
|
-
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
|
90
|
-
|
|
91
|
-
_str1 = str1[b1 ... e1]
|
|
92
|
-
_str2 = str2[b2 ... e2]
|
|
93
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
|
94
|
-
similarity = alignment_similarity(_str1, _str2, alignment)
|
|
95
|
-
if similarity < 0.6
|
|
96
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
|
|
97
|
-
else
|
|
98
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
|
99
|
-
end
|
|
98
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
|
100
99
|
end
|
|
101
100
|
end
|
|
102
101
|
end
|
|
102
|
+
|
|
103
|
+
last_block = block
|
|
104
|
+
sum
|
|
103
105
|
end
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
if
|
|
115
|
-
|
|
106
|
+
|
|
107
|
+
# the last step
|
|
108
|
+
blocks2 += if last_block.nil?
|
|
109
|
+
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
|
110
|
+
else
|
|
111
|
+
b1 = last_block[:source][:end]
|
|
112
|
+
if b1 < str1.length
|
|
113
|
+
e1 = str1.length
|
|
114
|
+
|
|
115
|
+
b2 = last_block[:target][:end]
|
|
116
|
+
if b2 < str2.length
|
|
117
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
|
118
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
|
119
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
|
116
120
|
else
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
121
|
+
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
|
122
|
+
end
|
|
123
|
+
else
|
|
124
|
+
[]
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
@block_alignment[:blocks] = blocks2
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
|
132
|
+
block2 = str2[b2 ... e2]
|
|
133
|
+
|
|
134
|
+
## term-based alignment
|
|
135
|
+
tblocks = if denotations
|
|
136
|
+
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
|
137
|
+
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
|
138
|
+
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
|
139
|
+
|
|
140
|
+
position = 0
|
|
141
|
+
tblocks = ds_in_scope.map do |term|
|
|
142
|
+
lex = term[:lex]
|
|
143
|
+
r = block2.index(lex, position)
|
|
144
|
+
if r.nil?
|
|
145
|
+
position = nil
|
|
146
|
+
break
|
|
147
|
+
end
|
|
148
|
+
position = r + lex.length
|
|
149
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
# missing term found
|
|
153
|
+
tblocks = [] if position.nil?
|
|
154
|
+
|
|
155
|
+
# redundant matching found
|
|
156
|
+
unless position.nil?
|
|
157
|
+
ds_in_scope.each do |term|
|
|
158
|
+
lex = term[:lex]
|
|
159
|
+
look_forward = block2.index(lex, position)
|
|
160
|
+
unless look_forward.nil?
|
|
161
|
+
puts lex
|
|
162
|
+
tblocks = []
|
|
163
|
+
break
|
|
123
164
|
end
|
|
124
165
|
end
|
|
125
166
|
end
|
|
126
|
-
|
|
167
|
+
|
|
168
|
+
tblocks
|
|
127
169
|
end
|
|
128
170
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
_str1 = str1[b1 ... str1.length]
|
|
134
|
-
_str2 = str2[b2 ... str2.length]
|
|
171
|
+
if tblocks.empty?
|
|
172
|
+
if b1 == 0 && e1 == str1.length
|
|
173
|
+
block1 = str1[b1 ... e1]
|
|
174
|
+
block2 = str2[b2 ... e2]
|
|
135
175
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
176
|
+
## character-based alignment
|
|
177
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
|
178
|
+
if alignment.sdiff.nil?
|
|
179
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
|
139
180
|
else
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
181
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
|
182
|
+
end
|
|
183
|
+
else
|
|
184
|
+
block1 = str1[b1 ... e1]
|
|
185
|
+
block2 = str2[b2 ... e2]
|
|
186
|
+
|
|
187
|
+
## character-based alignment
|
|
188
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
|
189
|
+
if alignment.sdiff.nil?
|
|
190
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
|
191
|
+
else
|
|
192
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
|
193
|
+
end
|
|
194
|
+
end
|
|
195
|
+
else
|
|
196
|
+
last_tblock = nil
|
|
197
|
+
lblocks = tblocks.inject([]) do |sum, tblock|
|
|
198
|
+
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
|
199
|
+
te1 = tblock[:source][:begin]
|
|
146
200
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
201
|
+
sum += if te1 == tb1
|
|
202
|
+
[tblock]
|
|
203
|
+
else
|
|
204
|
+
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
|
205
|
+
te2 = tblock[:target][:begin]
|
|
206
|
+
|
|
207
|
+
if b2 == e2
|
|
208
|
+
[
|
|
209
|
+
{source:{begin:tb1, end:te1}, alignment: :empty},
|
|
210
|
+
tblock
|
|
211
|
+
]
|
|
151
212
|
else
|
|
152
|
-
|
|
213
|
+
[
|
|
214
|
+
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
|
215
|
+
tblock
|
|
216
|
+
]
|
|
153
217
|
end
|
|
218
|
+
end
|
|
154
219
|
|
|
155
|
-
|
|
220
|
+
last_tblock = tblock
|
|
221
|
+
sum
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
if last_tblock[:source][:end] < e1
|
|
225
|
+
if last_tblock[:target][:end] < e2
|
|
226
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
|
227
|
+
else
|
|
228
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
|
156
229
|
end
|
|
157
230
|
end
|
|
158
|
-
end
|
|
159
231
|
|
|
160
|
-
|
|
161
|
-
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
|
232
|
+
lblocks
|
|
162
233
|
end
|
|
163
234
|
end
|
|
164
235
|
|
|
236
|
+
|
|
237
|
+
def indices(str, target)
|
|
238
|
+
position = 0
|
|
239
|
+
len = target.len
|
|
240
|
+
Enumerator.new do |yielder|
|
|
241
|
+
while idx = str.index(target, position)
|
|
242
|
+
yielder << idx
|
|
243
|
+
position = idx + len
|
|
244
|
+
end
|
|
245
|
+
end
|
|
246
|
+
end
|
|
247
|
+
|
|
165
248
|
def transform_begin_position(begin_position)
|
|
166
249
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
|
167
250
|
block = @block_alignment[:blocks][i]
|
|
168
251
|
|
|
169
|
-
b = if block[:alignment] == :block
|
|
252
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
|
170
253
|
begin_position + block[:delta]
|
|
171
254
|
elsif block[:alignment] == :empty
|
|
172
255
|
if begin_position == block[:source][:begin]
|
|
173
256
|
block[:target][:begin]
|
|
174
257
|
else
|
|
175
|
-
# raise "lost annotation"
|
|
176
258
|
nil
|
|
177
259
|
end
|
|
178
260
|
else
|
|
@@ -185,13 +267,12 @@ class TextAlignment::TextAlignment
|
|
|
185
267
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
|
186
268
|
block = @block_alignment[:blocks][i]
|
|
187
269
|
|
|
188
|
-
e = if block[:alignment] == :block
|
|
270
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
|
189
271
|
end_position + block[:delta]
|
|
190
272
|
elsif block[:alignment] == :empty
|
|
191
273
|
if end_position == block[:source][:end]
|
|
192
274
|
block[:target][:end]
|
|
193
275
|
else
|
|
194
|
-
# raise "lost annotation"
|
|
195
276
|
nil
|
|
196
277
|
end
|
|
197
278
|
else
|
|
@@ -213,14 +294,14 @@ class TextAlignment::TextAlignment
|
|
|
213
294
|
@lost_annotations = []
|
|
214
295
|
|
|
215
296
|
denotations.each do |d|
|
|
216
|
-
begin
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
end
|
|
297
|
+
source = {begin:d.begin, end:d.end}
|
|
298
|
+
d.begin = transform_begin_position(d.begin);
|
|
299
|
+
d.end = transform_end_position(d.end);
|
|
300
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
|
|
301
|
+
rescue
|
|
302
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
|
303
|
+
d.begin = nil
|
|
304
|
+
d.end = nil
|
|
224
305
|
end
|
|
225
306
|
|
|
226
307
|
@lost_annotations
|
|
@@ -231,12 +312,12 @@ class TextAlignment::TextAlignment
|
|
|
231
312
|
@lost_annotations = []
|
|
232
313
|
|
|
233
314
|
r = hdenotations.collect do |d|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
315
|
+
t = transform_a_span(d[:span])
|
|
316
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
|
|
317
|
+
new_d = d.dup.merge({span:t})
|
|
318
|
+
rescue
|
|
319
|
+
@lost_annotations << {source: d[:span], target:t}
|
|
320
|
+
nil
|
|
240
321
|
end.compact
|
|
241
322
|
|
|
242
323
|
r
|
|
@@ -250,13 +331,16 @@ class TextAlignment::TextAlignment
|
|
|
250
331
|
@block_alignment[:blocks].each do |a|
|
|
251
332
|
show += case a[:alignment]
|
|
252
333
|
when :block
|
|
253
|
-
"===== common
|
|
334
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
|
335
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
|
336
|
+
when :term
|
|
337
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
|
254
338
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
|
255
339
|
when :empty
|
|
256
340
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
|
257
|
-
"<<<<< string 1\n" +
|
|
341
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
|
258
342
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
|
259
|
-
">>>>> string 2\n" +
|
|
343
|
+
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
|
260
344
|
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
|
261
345
|
else
|
|
262
346
|
astr1 = ''
|
|
@@ -290,7 +374,7 @@ class TextAlignment::TextAlignment
|
|
|
290
374
|
end
|
|
291
375
|
end.join('')
|
|
292
376
|
|
|
293
|
-
"***** local mismatch\n" +
|
|
377
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
|
294
378
|
"[#{astr1}]\n" +
|
|
295
379
|
"[#{astr2}]\n\n"
|
|
296
380
|
end
|
|
@@ -298,68 +382,4 @@ class TextAlignment::TextAlignment
|
|
|
298
382
|
show
|
|
299
383
|
end
|
|
300
384
|
|
|
301
|
-
private
|
|
302
|
-
|
|
303
|
-
def string_preprocessing(_str1, _str2)
|
|
304
|
-
str1 = _str1.dup
|
|
305
|
-
str2 = _str2.dup
|
|
306
|
-
mappings = TextAlignment::MAPPINGS.dup
|
|
307
|
-
|
|
308
|
-
## single character mappings
|
|
309
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
|
310
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
|
311
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
|
312
|
-
characters_to.gsub!(/-/, '\-')
|
|
313
|
-
|
|
314
|
-
str1.tr!(characters_from, characters_to)
|
|
315
|
-
str2.tr!(characters_from, characters_to)
|
|
316
|
-
|
|
317
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
|
318
|
-
|
|
319
|
-
## long to one character mappings
|
|
320
|
-
pletters = TextAlignment::PADDING_LETTERS
|
|
321
|
-
|
|
322
|
-
# find the padding letter for str1
|
|
323
|
-
@padding_letter1 = begin
|
|
324
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
|
325
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
|
326
|
-
TextAlignment::PADDING_LETTERS[i]
|
|
327
|
-
end
|
|
328
|
-
|
|
329
|
-
# find the padding letter for str2
|
|
330
|
-
@padding_letter2 = begin
|
|
331
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
|
332
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
|
333
|
-
TextAlignment::PADDING_LETTERS[i]
|
|
334
|
-
end
|
|
335
|
-
|
|
336
|
-
# ASCII foldings
|
|
337
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
|
338
|
-
ascii_foldings.each do |f|
|
|
339
|
-
from = f[1]
|
|
340
|
-
|
|
341
|
-
if str2.index(f[0])
|
|
342
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
|
343
|
-
str1.gsub!(from, to)
|
|
344
|
-
end
|
|
345
|
-
|
|
346
|
-
if str1.index(f[0])
|
|
347
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
|
348
|
-
str2.gsub!(from, to)
|
|
349
|
-
end
|
|
350
|
-
end
|
|
351
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
|
352
|
-
|
|
353
|
-
[str1, str2, mappings]
|
|
354
|
-
end
|
|
355
|
-
|
|
356
|
-
def alignment_similarity(_s1, _s2, alignment)
|
|
357
|
-
# compute the lcs only with non-whitespace letters
|
|
358
|
-
lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
|
359
|
-
|
|
360
|
-
s1 = _s1.tr(@padding_letter1, ' ')
|
|
361
|
-
s2 = _s2.tr(@padding_letter2, ' ')
|
|
362
|
-
similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
|
|
363
|
-
end
|
|
364
|
-
|
|
365
385
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: text_alignment
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.7.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jin-Dong Kim
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2020-10-
|
|
11
|
+
date: 2020-10-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: ruby-dictionary
|