text_alignment 0.3.13 → 0.3.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +41 -0
- data/lib/text_alignment/anchor_finder.rb +5 -4
- data/lib/text_alignment/text_alignment.rb +15 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5b6821076dd780721d97403f3ca49568a66100f34f7fffe32acbf28572c36429
|
4
|
+
data.tar.gz: 0d29d5b64897b4030de7479eaa8fc3990e13f2135298315a18b42b06ee3aa300
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: efb442cee5dfa76e7428d516b5e8ba768cbf80229580f396d02fcd5aa99f5573f88b2af662d82159669fc2c58f84ad77f6bde72696162f3c0a7fb74a75d4f7e0
|
7
|
+
data.tar.gz: d6fe000e32862fea2c511fb7937c85ca65e75dd53fe33214f0a637cd7b081a947f268919dd050e425d4fa2fca4268125dcffcb1a54fb933c6eb0bd3c4971b058
|
data/bin/align_annotations
CHANGED
@@ -105,6 +105,32 @@ else
|
|
105
105
|
|
106
106
|
pp alignment
|
107
107
|
|
108
|
+
# verification
|
109
|
+
source_text = source_annotations[:text]
|
110
|
+
puts "=====BEGIN"
|
111
|
+
(0 ... source_text.length).each do |p|
|
112
|
+
t = alignment.transform_begin_position(p)
|
113
|
+
if t.nil?
|
114
|
+
print source_text[p]
|
115
|
+
else
|
116
|
+
print '.'
|
117
|
+
end
|
118
|
+
end
|
119
|
+
puts
|
120
|
+
puts "=====END"
|
121
|
+
|
122
|
+
puts "=====BEGIN"
|
123
|
+
(0 .. source_text.length).each do |p|
|
124
|
+
t = alignment.transform_end_position(p)
|
125
|
+
if t.nil?
|
126
|
+
print source_text[p]
|
127
|
+
else
|
128
|
+
print '.'
|
129
|
+
end
|
130
|
+
end
|
131
|
+
puts
|
132
|
+
puts "=====END"
|
133
|
+
|
108
134
|
# alignment.block_alignments.each do |a|
|
109
135
|
# if a[:alignment].nil? || a[:alignment] == :empty
|
110
136
|
# # p [a[:source], a[:target]]
|
@@ -121,7 +147,22 @@ else
|
|
121
147
|
# end
|
122
148
|
# exit
|
123
149
|
|
150
|
+
# verification of source denotations
|
151
|
+
puts "[Invalid source denotations]"
|
152
|
+
source_annotations[:denotations] do |d|
|
153
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
154
|
+
end
|
155
|
+
puts "====="
|
156
|
+
puts
|
157
|
+
|
124
158
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
159
|
+
puts "[Invalid transformation]"
|
160
|
+
denotations.each do |d|
|
161
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
162
|
+
end
|
163
|
+
puts "====="
|
164
|
+
puts
|
165
|
+
|
125
166
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
126
167
|
|
127
168
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -4,7 +4,7 @@ require 'string-similarity'
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
6
|
TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
-
TextAlignment::SIZE_WINDOW =
|
7
|
+
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
8
|
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
@@ -23,6 +23,7 @@ class TextAlignment::AnchorFinder
|
|
23
23
|
|
24
24
|
# current position in s1
|
25
25
|
@beg_s1 = 0
|
26
|
+
@end_s1_prev = 0
|
26
27
|
@end_s2_prev = 0
|
27
28
|
end
|
28
29
|
|
@@ -31,8 +32,8 @@ class TextAlignment::AnchorFinder
|
|
31
32
|
while @beg_s1 < (@s1.length - @size_ngram)
|
32
33
|
anchor = @s1[@beg_s1, @size_ngram]
|
33
34
|
|
34
|
-
search_position = 0
|
35
|
-
|
35
|
+
# search_position = 0
|
36
|
+
search_position = @end_s2_prev
|
36
37
|
while @beg_s2 = @s2.index(anchor, search_position)
|
37
38
|
# if both the beginning points are sufficiently close to the end points of the last match
|
38
39
|
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
@@ -56,7 +57,7 @@ class TextAlignment::AnchorFinder
|
|
56
57
|
# extend the block
|
57
58
|
b1 = @beg_s1
|
58
59
|
b2 = @beg_s2
|
59
|
-
while b1
|
60
|
+
while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
|
60
61
|
b1 -= 1; b2 -= 1
|
61
62
|
end
|
62
63
|
b1 += 1; b2 += 1
|
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
+
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
|
10
|
+
|
9
11
|
|
10
12
|
class TextAlignment::TextAlignment
|
11
13
|
attr_reader :block_alignments
|
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
|
|
70
72
|
@block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
71
73
|
else
|
72
74
|
len_min = [_str1.length, _str2.length].min
|
73
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
|
75
|
+
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
74
76
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
75
77
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
76
78
|
|
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
|
|
135
137
|
@block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
136
138
|
else
|
137
139
|
len_min = [_str1.length, _str2.length].min
|
138
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
|
140
|
+
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
139
141
|
e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
|
140
|
-
e2 = _str2.length < len_buffer ?
|
142
|
+
e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
|
143
|
+
_str1 = str1[b1 ... e1]
|
144
|
+
_str2 = str2[b2 ... e2]
|
141
145
|
|
142
146
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
143
147
|
if alignment.similarity < 0.6
|
@@ -166,10 +170,12 @@ class TextAlignment::TextAlignment
|
|
166
170
|
if begin_position == block_alignment[:source][:begin]
|
167
171
|
block_alignment[:target][:begin]
|
168
172
|
else
|
169
|
-
raise "lost annotation"
|
173
|
+
# raise "lost annotation"
|
174
|
+
nil
|
170
175
|
end
|
171
176
|
else
|
172
|
-
block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
|
177
|
+
r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
|
178
|
+
r.nil? ? nil : r + block_alignment[:target][:begin]
|
173
179
|
end
|
174
180
|
end
|
175
181
|
|
@@ -183,10 +189,12 @@ class TextAlignment::TextAlignment
|
|
183
189
|
if end_position == block_alignment[:source][:end]
|
184
190
|
block_alignment[:target][:end]
|
185
191
|
else
|
186
|
-
raise "lost annotation"
|
192
|
+
# raise "lost annotation"
|
193
|
+
nil
|
187
194
|
end
|
188
195
|
else
|
189
|
-
block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
|
196
|
+
r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
|
197
|
+
r.nil? ? nil : r + block_alignment[:target][:begin]
|
190
198
|
end
|
191
199
|
end
|
192
200
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.13
|
4
|
+
version: 0.3.18
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|