text_alignment 0.3.10 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +45 -1
- data/lib/text_alignment/anchor_finder.rb +23 -25
- data/lib/text_alignment/text_alignment.rb +24 -16
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 61f98e83ee9c1d220dd228be6bb708b79f36d1c691f04dcb14d4af55f398b6da
|
4
|
+
data.tar.gz: f692e98a27a555baab2797ebe37407ad7133916db172a977e95415b9004e471c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2dd8f865c245601c362e335df4e26413501fa682a97010b8aebd3ebc01864ae4772f6e716725331f9c6bc8f688818d665ef7a21384906211efc0e630b46f2313
|
7
|
+
data.tar.gz: 4c43199f474b94c825d8ec8ca2085b06107ec34df3d5ee988294f7423caef317c893bbba9879637d197fb2e7ae426c9e43c67cf042b7339959b638d5e5f60d01
|
data/bin/align_annotations
CHANGED
@@ -103,6 +103,35 @@ target_annotations = if source_annotations.class == Array
|
|
103
103
|
else
|
104
104
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
105
|
|
106
|
+
# verification
|
107
|
+
source_text = source_annotations[:text]
|
108
|
+
puts "=====BEGIN"
|
109
|
+
(0 ... source_text.length).each do |p|
|
110
|
+
t = alignment.transform_begin_position(p)
|
111
|
+
if t.nil?
|
112
|
+
print source_text[p]
|
113
|
+
else
|
114
|
+
print '.'
|
115
|
+
end
|
116
|
+
end
|
117
|
+
puts
|
118
|
+
puts "=====END"
|
119
|
+
|
120
|
+
puts "=====BEGIN"
|
121
|
+
(0 .. source_text.length).each do |p|
|
122
|
+
t = alignment.transform_end_position(p)
|
123
|
+
if t.nil?
|
124
|
+
print source_text[p]
|
125
|
+
else
|
126
|
+
print '.'
|
127
|
+
end
|
128
|
+
end
|
129
|
+
puts
|
130
|
+
puts "=====END"
|
131
|
+
|
132
|
+
pp alignment
|
133
|
+
|
134
|
+
exit
|
106
135
|
# alignment.block_alignments.each do |a|
|
107
136
|
# if a[:alignment].nil? || a[:alignment] == :empty
|
108
137
|
# # p [a[:source], a[:target]]
|
@@ -119,7 +148,22 @@ else
|
|
119
148
|
# end
|
120
149
|
# exit
|
121
150
|
|
151
|
+
# verification of source denotations
|
152
|
+
puts "[Invalid source denotations]"
|
153
|
+
source_annotations[:denotations] do |d|
|
154
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
155
|
+
end
|
156
|
+
puts "====="
|
157
|
+
puts
|
158
|
+
|
122
159
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
160
|
+
puts "[Invalid transformation]"
|
161
|
+
denotations.each do |d|
|
162
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
163
|
+
end
|
164
|
+
puts "====="
|
165
|
+
puts
|
166
|
+
|
123
167
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
124
168
|
|
125
169
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -159,7 +203,7 @@ if lost_annotations
|
|
159
203
|
warn "#{lost_annotations.length}"
|
160
204
|
end
|
161
205
|
|
162
|
-
puts target_annotations.to_json
|
206
|
+
#puts target_annotations.to_json
|
163
207
|
|
164
208
|
# denotations = anns1[:denotations]
|
165
209
|
|
@@ -3,9 +3,9 @@ require 'string-similarity'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
TextAlignment::SIZE_NGRAM =
|
7
|
-
TextAlignment::SIZE_WINDOW =
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.
|
6
|
+
TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
+
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
+
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
11
11
|
|
@@ -23,44 +23,41 @@ class TextAlignment::AnchorFinder
|
|
23
23
|
|
24
24
|
# current position in s1
|
25
25
|
@beg_s1 = 0
|
26
|
+
@end_s1_prev = 0
|
27
|
+
@end_s2_prev = 0
|
26
28
|
end
|
27
29
|
|
28
30
|
def get_next_anchor
|
29
31
|
# find the position of an anchor ngram in s1 and s2
|
30
|
-
@beg_s2 = nil
|
31
32
|
while @beg_s1 < (@s1.length - @size_ngram)
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
break unless @beg_s2.nil?
|
40
|
-
@beg_s1 += 1
|
41
|
-
end
|
33
|
+
anchor = @s1[@beg_s1, @size_ngram]
|
34
|
+
|
35
|
+
search_position = 0
|
36
|
+
# search_position = @end_s2_prev
|
37
|
+
while @beg_s2 = @s2.index(anchor, search_position)
|
38
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
39
|
+
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
42
40
|
|
43
|
-
|
44
|
-
|
41
|
+
left_window_s1, left_window_s2 = get_left_windows
|
42
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
45
43
|
|
46
|
-
|
47
|
-
|
44
|
+
right_window_s1, right_window_s2 = get_right_windows
|
45
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
48
46
|
|
49
|
-
|
50
|
-
|
47
|
+
search_position = @beg_s2 + 1
|
48
|
+
end
|
51
49
|
|
52
|
-
|
53
|
-
break if right_window_s2 && text_similarity(right_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
50
|
+
break unless @beg_s2.nil?
|
54
51
|
|
55
52
|
@beg_s1 += 1
|
56
53
|
end
|
57
54
|
|
58
|
-
return nil if @
|
55
|
+
return nil if @beg_s1 >= (@s1.length - @size_ngram)
|
59
56
|
|
60
57
|
# extend the block
|
61
58
|
b1 = @beg_s1
|
62
59
|
b2 = @beg_s2
|
63
|
-
while b1
|
60
|
+
while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
|
64
61
|
b1 -= 1; b2 -= 1
|
65
62
|
end
|
66
63
|
b1 += 1; b2 += 1
|
@@ -113,7 +110,7 @@ class TextAlignment::AnchorFinder
|
|
113
110
|
end
|
114
111
|
|
115
112
|
def get_right_windows
|
116
|
-
return if (@beg_s1 + @size_ngram
|
113
|
+
return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
117
114
|
|
118
115
|
window_s1 = ''
|
119
116
|
loc = @beg_s1 + @size_ngram
|
@@ -143,6 +140,7 @@ class TextAlignment::AnchorFinder
|
|
143
140
|
end
|
144
141
|
|
145
142
|
def text_similarity(str1, str2, ngram_order = 2)
|
143
|
+
return 0 if str1.nil? || str2.nil?
|
146
144
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
147
145
|
end
|
148
146
|
|
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
+
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
|
10
|
+
|
9
11
|
|
10
12
|
class TextAlignment::TextAlignment
|
11
13
|
attr_reader :block_alignments
|
@@ -38,17 +40,17 @@ class TextAlignment::TextAlignment
|
|
38
40
|
end
|
39
41
|
end
|
40
42
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
43
|
+
mblocks.each do |b|
|
44
|
+
p [b[:source], b[:target]]
|
45
|
+
puts "---"
|
46
|
+
puts str1[b[:source][:begin] ... b[:source][:end]]
|
47
|
+
puts "---"
|
48
|
+
puts str2[b[:target][:begin] ... b[:target][:end]]
|
49
|
+
puts "====="
|
50
|
+
puts
|
51
|
+
end
|
52
|
+
puts "-=-=-=-=-"
|
53
|
+
puts
|
52
54
|
|
53
55
|
## To find block alignments
|
54
56
|
@block_alignments = []
|
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
|
|
70
72
|
@block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
71
73
|
else
|
72
74
|
len_min = [_str1.length, _str2.length].min
|
73
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
|
75
|
+
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
74
76
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
75
77
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
76
78
|
|
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
|
|
135
137
|
@block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
136
138
|
else
|
137
139
|
len_min = [_str1.length, _str2.length].min
|
138
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
|
140
|
+
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
139
141
|
e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
|
140
|
-
e2 = _str2.length < len_buffer ?
|
142
|
+
e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
|
143
|
+
_str1 = str1[b1 ... e1]
|
144
|
+
_str2 = str2[b2 ... e2]
|
141
145
|
|
142
146
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
143
147
|
if alignment.similarity < 0.6
|
@@ -166,9 +170,12 @@ class TextAlignment::TextAlignment
|
|
166
170
|
if begin_position == block_alignment[:source][:begin]
|
167
171
|
block_alignment[:target][:begin]
|
168
172
|
else
|
169
|
-
raise "lost annotation"
|
173
|
+
# raise "lost annotation"
|
174
|
+
nil
|
170
175
|
end
|
171
176
|
else
|
177
|
+
p begin_position
|
178
|
+
puts "-----"
|
172
179
|
block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
|
173
180
|
end
|
174
181
|
end
|
@@ -183,7 +190,8 @@ class TextAlignment::TextAlignment
|
|
183
190
|
if end_position == block_alignment[:source][:end]
|
184
191
|
block_alignment[:target][:end]
|
185
192
|
else
|
186
|
-
raise "lost annotation"
|
193
|
+
# raise "lost annotation"
|
194
|
+
nil
|
187
195
|
end
|
188
196
|
else
|
189
197
|
block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.16
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|