text_alignment 0.3.19 → 0.3.24
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +2 -2
- data/lib/text_alignment/anchor_finder.rb +11 -4
- data/lib/text_alignment/text_alignment.rb +8 -20
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd59fb1ad977d3286f358d8c08315824e86d99cc6f9d72814adb760f2e680107
|
4
|
+
data.tar.gz: baafd1b76f6c6447a5763ff731b77ad07e449fc87ef94abed74a978376f6334e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d417878396803e1169a24fae67a9a4b0d4e84948d0c6bc678626641d6f1b6ac1fe16c94e9f07ee747b98f83e4305fa553220a607475363378f32fac4d43a65c7
|
7
|
+
data.tar.gz: efaf3640a67be46dddcf7dda9d819cbcb47828a3966d305887b6024eee6ba517a597e9303790f584264b8995f69671d90bdc0c7883fc9244f3b1746695960bf8
|
data/bin/align_annotations
CHANGED
@@ -108,7 +108,7 @@ else
|
|
108
108
|
# verification
|
109
109
|
source_text = source_annotations[:text]
|
110
110
|
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.length).each do |p|
|
111
|
+
(0 ... source_text.rstrip.length).each do |p|
|
112
112
|
t = alignment.transform_begin_position(p)
|
113
113
|
if t.nil?
|
114
114
|
print source_text[p]
|
@@ -120,7 +120,7 @@ else
|
|
120
120
|
puts "=====END"
|
121
121
|
|
122
122
|
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.length).each do |p|
|
123
|
+
(0 .. source_text.rstrip.length).each do |p|
|
124
124
|
t = alignment.transform_end_position(p)
|
125
125
|
if t.nil?
|
126
126
|
print source_text[p]
|
@@ -30,13 +30,17 @@ class TextAlignment::AnchorFinder
|
|
30
30
|
def get_next_anchor
|
31
31
|
# find the position of an anchor ngram in s1 and s2
|
32
32
|
while @beg_s1 < (@s1.length - @size_ngram)
|
33
|
+
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
34
|
+
@beg_s1 += 1
|
35
|
+
next
|
36
|
+
end
|
33
37
|
anchor = @s1[@beg_s1, @size_ngram]
|
34
38
|
|
35
39
|
# search_position = 0
|
36
40
|
search_position = @end_s2_prev
|
37
41
|
while @beg_s2 = @s2.index(anchor, search_position)
|
38
42
|
# if both the beginning points are sufficiently close to the end points of the last match
|
39
|
-
break if @
|
43
|
+
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
40
44
|
|
41
45
|
left_window_s1, left_window_s2 = get_left_windows
|
42
46
|
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
@@ -57,9 +61,10 @@ class TextAlignment::AnchorFinder
|
|
57
61
|
# extend the block
|
58
62
|
b1 = @beg_s1
|
59
63
|
b2 = @beg_s2
|
60
|
-
while b1 >= @end_s1_prev && b2
|
64
|
+
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
61
65
|
b1 -= 1; b2 -= 1
|
62
66
|
end
|
67
|
+
|
63
68
|
b1 += 1; b2 += 1
|
64
69
|
|
65
70
|
e1 = @beg_s1 + @size_ngram
|
@@ -82,7 +87,8 @@ class TextAlignment::AnchorFinder
|
|
82
87
|
private
|
83
88
|
|
84
89
|
def get_left_windows
|
85
|
-
|
90
|
+
# commented out below on the assumption that the beginning of a document gives significant locational information
|
91
|
+
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
86
92
|
|
87
93
|
window_s1 = ''
|
88
94
|
loc = @beg_s1 - 1
|
@@ -110,7 +116,8 @@ class TextAlignment::AnchorFinder
|
|
110
116
|
end
|
111
117
|
|
112
118
|
def get_right_windows
|
113
|
-
|
119
|
+
# commented out below on the assumption that the end of a document gives significant locational information
|
120
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
114
121
|
|
115
122
|
window_s1 = ''
|
116
123
|
loc = @beg_s1 + @size_ngram
|
@@ -6,7 +6,7 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN =
|
9
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
10
10
|
|
11
11
|
|
12
12
|
class TextAlignment::TextAlignment
|
@@ -40,10 +40,9 @@ class TextAlignment::TextAlignment
|
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
-
pp mblocks
|
44
|
-
puts "-----"
|
45
|
-
puts
|
46
|
-
|
43
|
+
# pp mblocks
|
44
|
+
# puts "-----"
|
45
|
+
# puts
|
47
46
|
# mblocks.each do |b|
|
48
47
|
# p [b[:source], b[:target]]
|
49
48
|
# puts "---"
|
@@ -82,6 +81,8 @@ class TextAlignment::TextAlignment
|
|
82
81
|
|
83
82
|
@block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
84
83
|
|
84
|
+
_str1 = str1[b1 ... e1]
|
85
|
+
_str2 = str2[b2 ... e2]
|
85
86
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
86
87
|
if alignment.similarity < 0.6
|
87
88
|
@block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
@@ -117,24 +118,11 @@ class TextAlignment::TextAlignment
|
|
117
118
|
end
|
118
119
|
|
119
120
|
# Final step
|
120
|
-
if mblocks[-1][:source][:end] < str1.length
|
121
|
-
b1 = mblocks[-1][:source][:end]
|
122
|
-
b2 = mblocks[-1][:target][:end]
|
123
|
-
|
124
|
-
if mblocks[-1][:target][:end] < str2.length
|
125
|
-
|
126
|
-
else
|
127
|
-
e1 = str1.length
|
128
|
-
e2 = str2.length
|
129
|
-
@block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
121
|
if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
|
134
122
|
b1 = mblocks[-1][:source][:end]
|
135
123
|
b2 = mblocks[-1][:target][:end]
|
136
|
-
_str1 = str1[b1 ...
|
137
|
-
_str2 = str2[b2 ...
|
124
|
+
_str1 = str1[b1 ... str1.length]
|
125
|
+
_str2 = str2[b2 ... str2.length]
|
138
126
|
|
139
127
|
unless _str1.strip.empty?
|
140
128
|
if _str2.strip.empty?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|