text_alignment 0.3.11 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +44 -1
- data/lib/text_alignment/anchor_finder.rb +12 -8
- data/lib/text_alignment/text_alignment.rb +15 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5fc0292470086de4cc05275d8d19dcd534b1bfe66c5560d2043451530a657c5e
|
4
|
+
data.tar.gz: c6eee42e0b2b2111f063abd4bd37cbd83a6eff7c236eb032832a23fe77bb7ed5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 22d172f3fdaa4549edd4f9e4541c03b947c6ae8bab61e46f55c0378120b632639c70d7a30361b321065a978be6930844aedbce6c5e7591521c016c429f4eeb14
|
7
|
+
data.tar.gz: 3babca5408aa2dd7ce23e40dbb7909cb4b60f492583c48cb9a6fe085f84e7cf1d4f9b07ea3942455ca984c98dc3d91fce425b8dc116c98a5b8d0ff753fc232c6
|
data/bin/align_annotations
CHANGED
@@ -103,6 +103,34 @@ target_annotations = if source_annotations.class == Array
|
|
103
103
|
else
|
104
104
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
105
|
|
106
|
+
pp alignment
|
107
|
+
|
108
|
+
# verification
|
109
|
+
source_text = source_annotations[:text]
|
110
|
+
puts "=====BEGIN"
|
111
|
+
(0 ... source_text.length).each do |p|
|
112
|
+
t = alignment.transform_begin_position(p)
|
113
|
+
if t.nil?
|
114
|
+
print source_text[p]
|
115
|
+
else
|
116
|
+
print '.'
|
117
|
+
end
|
118
|
+
end
|
119
|
+
puts
|
120
|
+
puts "=====END"
|
121
|
+
|
122
|
+
puts "=====BEGIN"
|
123
|
+
(0 .. source_text.length).each do |p|
|
124
|
+
t = alignment.transform_end_position(p)
|
125
|
+
if t.nil?
|
126
|
+
print source_text[p]
|
127
|
+
else
|
128
|
+
print '.'
|
129
|
+
end
|
130
|
+
end
|
131
|
+
puts
|
132
|
+
puts "=====END"
|
133
|
+
|
106
134
|
# alignment.block_alignments.each do |a|
|
107
135
|
# if a[:alignment].nil? || a[:alignment] == :empty
|
108
136
|
# # p [a[:source], a[:target]]
|
@@ -119,7 +147,22 @@ else
|
|
119
147
|
# end
|
120
148
|
# exit
|
121
149
|
|
150
|
+
# verification of source denotations
|
151
|
+
puts "[Invalid source denotations]"
|
152
|
+
source_annotations[:denotations] do |d|
|
153
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
154
|
+
end
|
155
|
+
puts "====="
|
156
|
+
puts
|
157
|
+
|
122
158
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
159
|
+
puts "[Invalid transformation]"
|
160
|
+
denotations.each do |d|
|
161
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
162
|
+
end
|
163
|
+
puts "====="
|
164
|
+
puts
|
165
|
+
|
123
166
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
124
167
|
|
125
168
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -159,7 +202,7 @@ if lost_annotations
|
|
159
202
|
warn "#{lost_annotations.length}"
|
160
203
|
end
|
161
204
|
|
162
|
-
puts target_annotations.to_json
|
205
|
+
#puts target_annotations.to_json
|
163
206
|
|
164
207
|
# denotations = anns1[:denotations]
|
165
208
|
|
@@ -3,9 +3,9 @@ require 'string-similarity'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
TextAlignment::SIZE_NGRAM =
|
7
|
-
TextAlignment::SIZE_WINDOW =
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.
|
6
|
+
TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
+
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
+
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
11
11
|
|
@@ -23,6 +23,8 @@ class TextAlignment::AnchorFinder
|
|
23
23
|
|
24
24
|
# current position in s1
|
25
25
|
@beg_s1 = 0
|
26
|
+
@end_s1_prev = 0
|
27
|
+
@end_s2_prev = 0
|
26
28
|
end
|
27
29
|
|
28
30
|
def get_next_anchor
|
@@ -31,15 +33,16 @@ class TextAlignment::AnchorFinder
|
|
31
33
|
anchor = @s1[@beg_s1, @size_ngram]
|
32
34
|
|
33
35
|
search_position = 0
|
36
|
+
# search_position = @end_s2_prev
|
34
37
|
while @beg_s2 = @s2.index(anchor, search_position)
|
35
38
|
# if both the beginning points are sufficiently close to the end points of the last match
|
36
|
-
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
|
39
|
+
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
37
40
|
|
38
41
|
left_window_s1, left_window_s2 = get_left_windows
|
39
|
-
break if left_window_s1
|
42
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
40
43
|
|
41
44
|
right_window_s1, right_window_s2 = get_right_windows
|
42
|
-
break if right_window_s2 && text_similarity(right_window_s1,
|
45
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
43
46
|
|
44
47
|
search_position = @beg_s2 + 1
|
45
48
|
end
|
@@ -54,7 +57,7 @@ class TextAlignment::AnchorFinder
|
|
54
57
|
# extend the block
|
55
58
|
b1 = @beg_s1
|
56
59
|
b2 = @beg_s2
|
57
|
-
while b1
|
60
|
+
while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
|
58
61
|
b1 -= 1; b2 -= 1
|
59
62
|
end
|
60
63
|
b1 += 1; b2 += 1
|
@@ -107,7 +110,7 @@ class TextAlignment::AnchorFinder
|
|
107
110
|
end
|
108
111
|
|
109
112
|
def get_right_windows
|
110
|
-
return if (@beg_s1 + @size_ngram
|
113
|
+
return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
111
114
|
|
112
115
|
window_s1 = ''
|
113
116
|
loc = @beg_s1 + @size_ngram
|
@@ -137,6 +140,7 @@ class TextAlignment::AnchorFinder
|
|
137
140
|
end
|
138
141
|
|
139
142
|
def text_similarity(str1, str2, ngram_order = 2)
|
143
|
+
return 0 if str1.nil? || str2.nil?
|
140
144
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
141
145
|
end
|
142
146
|
|
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
+
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
|
10
|
+
|
9
11
|
|
10
12
|
class TextAlignment::TextAlignment
|
11
13
|
attr_reader :block_alignments
|
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
|
|
70
72
|
@block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
71
73
|
else
|
72
74
|
len_min = [_str1.length, _str2.length].min
|
73
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
|
75
|
+
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
74
76
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
75
77
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
76
78
|
|
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
|
|
135
137
|
@block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
136
138
|
else
|
137
139
|
len_min = [_str1.length, _str2.length].min
|
138
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
|
140
|
+
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
139
141
|
e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
|
140
|
-
e2 = _str2.length < len_buffer ?
|
142
|
+
e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
|
143
|
+
_str1 = str1[b1 ... e1]
|
144
|
+
_str2 = str2[b2 ... e2]
|
141
145
|
|
142
146
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
143
147
|
if alignment.similarity < 0.6
|
@@ -166,10 +170,12 @@ class TextAlignment::TextAlignment
|
|
166
170
|
if begin_position == block_alignment[:source][:begin]
|
167
171
|
block_alignment[:target][:begin]
|
168
172
|
else
|
169
|
-
raise "lost annotation"
|
173
|
+
# raise "lost annotation"
|
174
|
+
nil
|
170
175
|
end
|
171
176
|
else
|
172
|
-
block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
|
177
|
+
r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
|
178
|
+
r.nil? ? nil : r + block_alignment[:target][:begin]
|
173
179
|
end
|
174
180
|
end
|
175
181
|
|
@@ -183,10 +189,12 @@ class TextAlignment::TextAlignment
|
|
183
189
|
if end_position == block_alignment[:source][:end]
|
184
190
|
block_alignment[:target][:end]
|
185
191
|
else
|
186
|
-
raise "lost annotation"
|
192
|
+
# raise "lost annotation"
|
193
|
+
nil
|
187
194
|
end
|
188
195
|
else
|
189
|
-
block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
|
196
|
+
r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
|
197
|
+
r.nil? ? nil : r + block_alignment[:target][:begin]
|
190
198
|
end
|
191
199
|
end
|
192
200
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|