text_alignment 0.3.14 → 0.3.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +10 -2
- data/lib/text_alignment/anchor_finder.rb +6 -5
- data/lib/text_alignment/text_alignment.rb +12 -4
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea57d01970fdb56a95a7929803949a553965692fb3f4748eec72fe026f9a79cf
|
4
|
+
data.tar.gz: 96397baa91646b3eb05a346ff699930b6dacf7d38075273b64ce7916f32d6275
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1d1e7650c35d9bae35a7f1dc2948dbf97fa8f71a86f1f83c5dda9cb64b7179e96ae219db88951e65c2988f078900d960784c4c74489a074654ed893c408be97f
|
7
|
+
data.tar.gz: 49afaec2b6332dc4038c1f0d0e930bf20ce61a6d4933ff3758bbbfea3e05cf347277aefc341d9c3b3d1f8a4692cec24b901528c2875875bacecc1caa7cf9159c
|
data/bin/align_annotations
CHANGED
@@ -103,6 +103,8 @@ target_annotations = if source_annotations.class == Array
|
|
103
103
|
else
|
104
104
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
105
|
|
106
|
+
pp alignment
|
107
|
+
|
106
108
|
# verification
|
107
109
|
source_text = source_annotations[:text]
|
108
110
|
puts "=====BEGIN"
|
@@ -129,8 +131,6 @@ else
|
|
129
131
|
puts
|
130
132
|
puts "=====END"
|
131
133
|
|
132
|
-
# pp alignment
|
133
|
-
|
134
134
|
# alignment.block_alignments.each do |a|
|
135
135
|
# if a[:alignment].nil? || a[:alignment] == :empty
|
136
136
|
# # p [a[:source], a[:target]]
|
@@ -153,8 +153,16 @@ else
|
|
153
153
|
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
154
154
|
end
|
155
155
|
puts "====="
|
156
|
+
puts
|
156
157
|
|
157
158
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
159
|
+
puts "[Invalid transformation]"
|
160
|
+
denotations.each do |d|
|
161
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
162
|
+
end
|
163
|
+
puts "====="
|
164
|
+
puts
|
165
|
+
|
158
166
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
159
167
|
|
160
168
|
source_annotations.merge({text:target_text, denotations:denotations})
|
data/lib/text_alignment/anchor_finder.rb
CHANGED
@@ -3,8 +3,8 @@ require 'string-similarity'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
TextAlignment::SIZE_NGRAM =
|
7
|
-
TextAlignment::SIZE_WINDOW =
|
6
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
+
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
8
|
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
@@ -23,6 +23,7 @@ class TextAlignment::AnchorFinder
|
|
23
23
|
|
24
24
|
# current position in s1
|
25
25
|
@beg_s1 = 0
|
26
|
+
@end_s1_prev = 0
|
26
27
|
@end_s2_prev = 0
|
27
28
|
end
|
28
29
|
|
@@ -31,8 +32,8 @@ class TextAlignment::AnchorFinder
|
|
31
32
|
while @beg_s1 < (@s1.length - @size_ngram)
|
32
33
|
anchor = @s1[@beg_s1, @size_ngram]
|
33
34
|
|
34
|
-
search_position = 0
|
35
|
-
|
35
|
+
# search_position = 0
|
36
|
+
search_position = @end_s2_prev
|
36
37
|
while @beg_s2 = @s2.index(anchor, search_position)
|
37
38
|
# if both the begining points are sufficiantly close to the end points of the last match
|
38
39
|
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
@@ -56,7 +57,7 @@ class TextAlignment::AnchorFinder
|
|
56
57
|
# extend the block
|
57
58
|
b1 = @beg_s1
|
58
59
|
b2 = @beg_s2
|
59
|
-
while b1
|
60
|
+
while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
|
60
61
|
b1 -= 1; b2 -= 1
|
61
62
|
end
|
62
63
|
b1 += 1; b2 += 1
|
data/lib/text_alignment/text_alignment.rb
CHANGED
@@ -40,6 +40,10 @@ class TextAlignment::TextAlignment
|
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
+
pp mblocks
|
44
|
+
puts "-----"
|
45
|
+
puts
|
46
|
+
|
43
47
|
# mblocks.each do |b|
|
44
48
|
# p [b[:source], b[:target]]
|
45
49
|
# puts "---"
|
@@ -170,10 +174,12 @@ class TextAlignment::TextAlignment
|
|
170
174
|
if begin_position == block_alignment[:source][:begin]
|
171
175
|
block_alignment[:target][:begin]
|
172
176
|
else
|
173
|
-
raise "lost annotation"
|
177
|
+
# raise "lost annotation"
|
178
|
+
nil
|
174
179
|
end
|
175
180
|
else
|
176
|
-
block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
|
181
|
+
r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
|
182
|
+
r.nil? ? nil : r + block_alignment[:target][:begin]
|
177
183
|
end
|
178
184
|
end
|
179
185
|
|
@@ -187,10 +193,12 @@ class TextAlignment::TextAlignment
|
|
187
193
|
if end_position == block_alignment[:source][:end]
|
188
194
|
block_alignment[:target][:end]
|
189
195
|
else
|
190
|
-
raise "lost annotation"
|
196
|
+
# raise "lost annotation"
|
197
|
+
nil
|
191
198
|
end
|
192
199
|
else
|
193
|
-
block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
|
200
|
+
r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
|
201
|
+
r.nil? ? nil : r + block_alignment[:target][:begin]
|
194
202
|
end
|
195
203
|
end
|
196
204
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.14
|
4
|
+
version: 0.3.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|