text_alignment 0.3.10 → 0.3.16
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +45 -1
- data/lib/text_alignment/anchor_finder.rb +23 -25
- data/lib/text_alignment/text_alignment.rb +24 -16
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 61f98e83ee9c1d220dd228be6bb708b79f36d1c691f04dcb14d4af55f398b6da
|
4
|
+
data.tar.gz: f692e98a27a555baab2797ebe37407ad7133916db172a977e95415b9004e471c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2dd8f865c245601c362e335df4e26413501fa682a97010b8aebd3ebc01864ae4772f6e716725331f9c6bc8f688818d665ef7a21384906211efc0e630b46f2313
|
7
|
+
data.tar.gz: 4c43199f474b94c825d8ec8ca2085b06107ec34df3d5ee988294f7423caef317c893bbba9879637d197fb2e7ae426c9e43c67cf042b7339959b638d5e5f60d01
|
data/bin/align_annotations
CHANGED
@@ -103,6 +103,35 @@ target_annotations = if source_annotations.class == Array
|
|
103
103
|
else
|
104
104
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
105
|
|
106
|
+
# verification
|
107
|
+
source_text = source_annotations[:text]
|
108
|
+
puts "=====BEGIN"
|
109
|
+
(0 ... source_text.length).each do |p|
|
110
|
+
t = alignment.transform_begin_position(p)
|
111
|
+
if t.nil?
|
112
|
+
print source_text[p]
|
113
|
+
else
|
114
|
+
print '.'
|
115
|
+
end
|
116
|
+
end
|
117
|
+
puts
|
118
|
+
puts "=====END"
|
119
|
+
|
120
|
+
puts "=====BEGIN"
|
121
|
+
(0 .. source_text.length).each do |p|
|
122
|
+
t = alignment.transform_end_position(p)
|
123
|
+
if t.nil?
|
124
|
+
print source_text[p]
|
125
|
+
else
|
126
|
+
print '.'
|
127
|
+
end
|
128
|
+
end
|
129
|
+
puts
|
130
|
+
puts "=====END"
|
131
|
+
|
132
|
+
pp alignment
|
133
|
+
|
134
|
+
exit
|
106
135
|
# alignment.block_alignments.each do |a|
|
107
136
|
# if a[:alignment].nil? || a[:alignment] == :empty
|
108
137
|
# # p [a[:source], a[:target]]
|
@@ -119,7 +148,22 @@ else
|
|
119
148
|
# end
|
120
149
|
# exit
|
121
150
|
|
151
|
+
# verification of source denotations
|
152
|
+
puts "[Invalid source denotations]"
|
153
|
+
source_annotations[:denotations] do |d|
|
154
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
155
|
+
end
|
156
|
+
puts "====="
|
157
|
+
puts
|
158
|
+
|
122
159
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
160
|
+
puts "[Invalid transformation]"
|
161
|
+
denotations.each do |d|
|
162
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
163
|
+
end
|
164
|
+
puts "====="
|
165
|
+
puts
|
166
|
+
|
123
167
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
124
168
|
|
125
169
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -159,7 +203,7 @@ if lost_annotations
|
|
159
203
|
warn "#{lost_annotations.length}"
|
160
204
|
end
|
161
205
|
|
162
|
-
puts target_annotations.to_json
|
206
|
+
#puts target_annotations.to_json
|
163
207
|
|
164
208
|
# denotations = anns1[:denotations]
|
165
209
|
|
@@ -3,9 +3,9 @@ require 'string-similarity'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
TextAlignment::SIZE_NGRAM =
|
7
|
-
TextAlignment::SIZE_WINDOW =
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.
|
6
|
+
TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
+
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
+
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
11
11
|
|
@@ -23,44 +23,41 @@ class TextAlignment::AnchorFinder
|
|
23
23
|
|
24
24
|
# current position in s1
|
25
25
|
@beg_s1 = 0
|
26
|
+
@end_s1_prev = 0
|
27
|
+
@end_s2_prev = 0
|
26
28
|
end
|
27
29
|
|
28
30
|
def get_next_anchor
|
29
31
|
# find the position of an anchor ngram in s1 and s2
|
30
|
-
@beg_s2 = nil
|
31
32
|
while @beg_s1 < (@s1.length - @size_ngram)
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
break unless @beg_s2.nil?
|
40
|
-
@beg_s1 += 1
|
41
|
-
end
|
33
|
+
anchor = @s1[@beg_s1, @size_ngram]
|
34
|
+
|
35
|
+
search_position = 0
|
36
|
+
# search_position = @end_s2_prev
|
37
|
+
while @beg_s2 = @s2.index(anchor, search_position)
|
38
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
39
|
+
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
42
40
|
|
43
|
-
|
44
|
-
|
41
|
+
left_window_s1, left_window_s2 = get_left_windows
|
42
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
45
43
|
|
46
|
-
|
47
|
-
|
44
|
+
right_window_s1, right_window_s2 = get_right_windows
|
45
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
48
46
|
|
49
|
-
|
50
|
-
|
47
|
+
search_position = @beg_s2 + 1
|
48
|
+
end
|
51
49
|
|
52
|
-
|
53
|
-
break if right_window_s2 && text_similarity(right_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
50
|
+
break unless @beg_s2.nil?
|
54
51
|
|
55
52
|
@beg_s1 += 1
|
56
53
|
end
|
57
54
|
|
58
|
-
return nil if @
|
55
|
+
return nil if @beg_s1 >= (@s1.length - @size_ngram)
|
59
56
|
|
60
57
|
# extend the block
|
61
58
|
b1 = @beg_s1
|
62
59
|
b2 = @beg_s2
|
63
|
-
while b1
|
60
|
+
while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
|
64
61
|
b1 -= 1; b2 -= 1
|
65
62
|
end
|
66
63
|
b1 += 1; b2 += 1
|
@@ -113,7 +110,7 @@ class TextAlignment::AnchorFinder
|
|
113
110
|
end
|
114
111
|
|
115
112
|
def get_right_windows
|
116
|
-
return if (@beg_s1 + @size_ngram
|
113
|
+
return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
117
114
|
|
118
115
|
window_s1 = ''
|
119
116
|
loc = @beg_s1 + @size_ngram
|
@@ -143,6 +140,7 @@ class TextAlignment::AnchorFinder
|
|
143
140
|
end
|
144
141
|
|
145
142
|
def text_similarity(str1, str2, ngram_order = 2)
|
143
|
+
return 0 if str1.nil? || str2.nil?
|
146
144
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
147
145
|
end
|
148
146
|
|
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
+
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
|
10
|
+
|
9
11
|
|
10
12
|
class TextAlignment::TextAlignment
|
11
13
|
attr_reader :block_alignments
|
@@ -38,17 +40,17 @@ class TextAlignment::TextAlignment
|
|
38
40
|
end
|
39
41
|
end
|
40
42
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
43
|
+
mblocks.each do |b|
|
44
|
+
p [b[:source], b[:target]]
|
45
|
+
puts "---"
|
46
|
+
puts str1[b[:source][:begin] ... b[:source][:end]]
|
47
|
+
puts "---"
|
48
|
+
puts str2[b[:target][:begin] ... b[:target][:end]]
|
49
|
+
puts "====="
|
50
|
+
puts
|
51
|
+
end
|
52
|
+
puts "-=-=-=-=-"
|
53
|
+
puts
|
52
54
|
|
53
55
|
## To find block alignments
|
54
56
|
@block_alignments = []
|
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
|
|
70
72
|
@block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
71
73
|
else
|
72
74
|
len_min = [_str1.length, _str2.length].min
|
73
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
|
75
|
+
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
74
76
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
75
77
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
76
78
|
|
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
|
|
135
137
|
@block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
136
138
|
else
|
137
139
|
len_min = [_str1.length, _str2.length].min
|
138
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
|
140
|
+
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
139
141
|
e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
|
140
|
-
e2 = _str2.length < len_buffer ?
|
142
|
+
e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
|
143
|
+
_str1 = str1[b1 ... e1]
|
144
|
+
_str2 = str2[b2 ... e2]
|
141
145
|
|
142
146
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
143
147
|
if alignment.similarity < 0.6
|
@@ -166,9 +170,12 @@ class TextAlignment::TextAlignment
|
|
166
170
|
if begin_position == block_alignment[:source][:begin]
|
167
171
|
block_alignment[:target][:begin]
|
168
172
|
else
|
169
|
-
raise "lost annotation"
|
173
|
+
# raise "lost annotation"
|
174
|
+
nil
|
170
175
|
end
|
171
176
|
else
|
177
|
+
p begin_position
|
178
|
+
puts "-----"
|
172
179
|
block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
|
173
180
|
end
|
174
181
|
end
|
@@ -183,7 +190,8 @@ class TextAlignment::TextAlignment
|
|
183
190
|
if end_position == block_alignment[:source][:end]
|
184
191
|
block_alignment[:target][:end]
|
185
192
|
else
|
186
|
-
raise "lost annotation"
|
193
|
+
# raise "lost annotation"
|
194
|
+
nil
|
187
195
|
end
|
188
196
|
else
|
189
197
|
block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.16
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|