text_alignment 0.3.15 → 0.3.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +2 -3
- data/lib/text_alignment/anchor_finder.rb +5 -4
- data/lib/text_alignment/text_alignment.rb +9 -15
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 179976ef6ac286a34343f99eef5cf8ee3d26997c1a8e1c8e9348793773ac044a
|
4
|
+
data.tar.gz: e15a57b2460a21d2607e3e4775ad32c5a760d9f00067c898b966e21777c241d4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3742565325c8ce8b4ce35093b4524b1d9182b51e332f50d9376ce2c22af8918a168b1fcc5764e4313e819c546684a3fc9411d8ee39607d95557924975bd4143
|
7
|
+
data.tar.gz: 6f68f68799075990fce62f4454c2ef1f1ef4ff260eac44a5332744a9c21605ac14eb35c9d6d24bedb31681f0995222699d8018727f7e70a45c11076df1c82212
|
data/bin/align_annotations
CHANGED
@@ -103,6 +103,8 @@ target_annotations = if source_annotations.class == Array
|
|
103
103
|
else
|
104
104
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
105
|
|
106
|
+
pp alignment
|
107
|
+
|
106
108
|
# verification
|
107
109
|
source_text = source_annotations[:text]
|
108
110
|
puts "=====BEGIN"
|
@@ -129,9 +131,6 @@ else
|
|
129
131
|
puts
|
130
132
|
puts "=====END"
|
131
133
|
|
132
|
-
pp alignment
|
133
|
-
|
134
|
-
exit
|
135
134
|
# alignment.block_alignments.each do |a|
|
136
135
|
# if a[:alignment].nil? || a[:alignment] == :empty
|
137
136
|
# # p [a[:source], a[:target]]
|
data/lib/text_alignment/anchor_finder.rb
CHANGED
@@ -4,7 +4,7 @@ require 'string-similarity'
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
6
|
TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
-
TextAlignment::SIZE_WINDOW =
|
7
|
+
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
8
|
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
@@ -23,6 +23,7 @@ class TextAlignment::AnchorFinder
|
|
23
23
|
|
24
24
|
# current position in s1
|
25
25
|
@beg_s1 = 0
|
26
|
+
@end_s1_prev = 0
|
26
27
|
@end_s2_prev = 0
|
27
28
|
end
|
28
29
|
|
@@ -31,8 +32,8 @@ class TextAlignment::AnchorFinder
|
|
31
32
|
while @beg_s1 < (@s1.length - @size_ngram)
|
32
33
|
anchor = @s1[@beg_s1, @size_ngram]
|
33
34
|
|
34
|
-
search_position = 0
|
35
|
-
|
35
|
+
# search_position = 0
|
36
|
+
search_position = @end_s2_prev
|
36
37
|
while @beg_s2 = @s2.index(anchor, search_position)
|
37
38
|
# if both the begining points are sufficiantly close to the end points of the last match
|
38
39
|
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
@@ -56,7 +57,7 @@ class TextAlignment::AnchorFinder
|
|
56
57
|
# extend the block
|
57
58
|
b1 = @beg_s1
|
58
59
|
b2 = @beg_s2
|
59
|
-
while b1
|
60
|
+
while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
|
60
61
|
b1 -= 1; b2 -= 1
|
61
62
|
end
|
62
63
|
b1 += 1; b2 += 1
|
data/lib/text_alignment/text_alignment.rb
CHANGED
@@ -40,6 +40,9 @@ class TextAlignment::TextAlignment
|
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
+
# pp mblocks
|
44
|
+
# puts "-----"
|
45
|
+
# puts
|
43
46
|
# mblocks.each do |b|
|
44
47
|
# p [b[:source], b[:target]]
|
45
48
|
# puts "---"
|
@@ -78,6 +81,8 @@ class TextAlignment::TextAlignment
|
|
78
81
|
|
79
82
|
@block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
80
83
|
|
84
|
+
_str1 = str1[b1 ... e1]
|
85
|
+
_str2 = str2[b2 ... e2]
|
81
86
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
82
87
|
if alignment.similarity < 0.6
|
83
88
|
@block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
@@ -113,19 +118,6 @@ class TextAlignment::TextAlignment
|
|
113
118
|
end
|
114
119
|
|
115
120
|
# Final step
|
116
|
-
if mblocks[-1][:source][:end] < str1.length
|
117
|
-
b1 = mblocks[-1][:source][:end]
|
118
|
-
b2 = mblocks[-1][:target][:end]
|
119
|
-
|
120
|
-
if mblocks[-1][:target][:end] < str2.length
|
121
|
-
|
122
|
-
else
|
123
|
-
e1 = str1.length
|
124
|
-
e2 = str2.length
|
125
|
-
@block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
121
|
if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
|
130
122
|
b1 = mblocks[-1][:source][:end]
|
131
123
|
b2 = mblocks[-1][:target][:end]
|
@@ -174,7 +166,8 @@ class TextAlignment::TextAlignment
|
|
174
166
|
nil
|
175
167
|
end
|
176
168
|
else
|
177
|
-
block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
|
169
|
+
r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
|
170
|
+
r.nil? ? nil : r + block_alignment[:target][:begin]
|
178
171
|
end
|
179
172
|
end
|
180
173
|
|
@@ -192,7 +185,8 @@ class TextAlignment::TextAlignment
|
|
192
185
|
nil
|
193
186
|
end
|
194
187
|
else
|
195
|
-
block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
|
188
|
+
r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
|
189
|
+
r.nil? ? nil : r + block_alignment[:target][:begin]
|
196
190
|
end
|
197
191
|
end
|
198
192
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.15
|
4
|
+
version: 0.3.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|