text_alignment 0.3.20 → 0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +2 -2
- data/lib/text_alignment/anchor_finder.rb +16 -8
- data/lib/text_alignment/text_alignment.rb +3 -3
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 35f83f76cd3e9ca59604327710d2eac4684b8abca9dbd167b2b391d98ea561f9
|
4
|
+
data.tar.gz: 77007a1bfcf72d0681ac9d21b35221e7d519a291383f3d0d8fa0574d631b65ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f936569e072e6693f279b5d3a349965b286192d8f694fffbbf41110f66afbd5d80f865a9232c2bef60ec94920e0a814142c8756f0ba3ff29a109cbbe1f7abec
|
7
|
+
data.tar.gz: 955b85aa639f733361f354cc68ac3d8b14a17a14784f1e6ec24f240934ef0d160c6b8cf01232e1e4243c2eecffcaf1665999cc3e455bfb0e61c0dd905d2cb8b4
|
data/bin/align_annotations
CHANGED
@@ -108,7 +108,7 @@ else
|
|
108
108
|
# verification
|
109
109
|
source_text = source_annotations[:text]
|
110
110
|
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.length).each do |p|
|
111
|
+
(0 ... source_text.rstrip.length).each do |p|
|
112
112
|
t = alignment.transform_begin_position(p)
|
113
113
|
if t.nil?
|
114
114
|
print source_text[p]
|
@@ -120,7 +120,7 @@ else
|
|
120
120
|
puts "=====END"
|
121
121
|
|
122
122
|
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.length).each do |p|
|
123
|
+
(0 .. source_text.rstrip.length).each do |p|
|
124
124
|
t = alignment.transform_end_position(p)
|
125
125
|
if t.nil?
|
126
126
|
print source_text[p]
|
@@ -3,15 +3,16 @@ require 'string-similarity'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
TextAlignment::SIZE_NGRAM =
|
6
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
7
|
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
8
|
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
11
11
|
|
12
|
-
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
|
12
|
+
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
13
13
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
14
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
15
|
+
@sim_threshold = _size_window || TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
15
16
|
|
16
17
|
@reverse = (target_str.length < source_str.length)
|
17
18
|
|
@@ -30,19 +31,23 @@ class TextAlignment::AnchorFinder
|
|
30
31
|
def get_next_anchor
|
31
32
|
# find the position of an anchor ngram in s1 and s2
|
32
33
|
while @beg_s1 < (@s1.length - @size_ngram)
|
34
|
+
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
35
|
+
@beg_s1 += 1
|
36
|
+
next
|
37
|
+
end
|
33
38
|
anchor = @s1[@beg_s1, @size_ngram]
|
34
39
|
|
35
40
|
# search_position = 0
|
36
41
|
search_position = @end_s2_prev
|
37
42
|
while @beg_s2 = @s2.index(anchor, search_position)
|
38
43
|
# if both the beginning points are sufficiently close to the end points of the last match
|
39
|
-
break if @
|
44
|
+
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
40
45
|
|
41
46
|
left_window_s1, left_window_s2 = get_left_windows
|
42
|
-
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) >
|
47
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
43
48
|
|
44
49
|
right_window_s1, right_window_s2 = get_right_windows
|
45
|
-
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) >
|
50
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
46
51
|
|
47
52
|
search_position = @beg_s2 + 1
|
48
53
|
end
|
@@ -57,9 +62,10 @@ class TextAlignment::AnchorFinder
|
|
57
62
|
# extend the block
|
58
63
|
b1 = @beg_s1
|
59
64
|
b2 = @beg_s2
|
60
|
-
while b1 >= @end_s1_prev && b2
|
65
|
+
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
61
66
|
b1 -= 1; b2 -= 1
|
62
67
|
end
|
68
|
+
|
63
69
|
b1 += 1; b2 += 1
|
64
70
|
|
65
71
|
e1 = @beg_s1 + @size_ngram
|
@@ -82,7 +88,8 @@ class TextAlignment::AnchorFinder
|
|
82
88
|
private
|
83
89
|
|
84
90
|
def get_left_windows
|
85
|
-
|
91
|
+
# the guard below is commented out on the assumption that the beginning of a document gives significant locational information
|
92
|
+
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
86
93
|
|
87
94
|
window_s1 = ''
|
88
95
|
loc = @beg_s1 - 1
|
@@ -110,7 +117,8 @@ class TextAlignment::AnchorFinder
|
|
110
117
|
end
|
111
118
|
|
112
119
|
def get_right_windows
|
113
|
-
|
120
|
+
# the guard below is commented out on the assumption that the end of a document gives significant locational information
|
121
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
114
122
|
|
115
123
|
window_s1 = ''
|
116
124
|
loc = @beg_s1 + @size_ngram
|
@@ -6,7 +6,7 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN =
|
9
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
10
10
|
|
11
11
|
|
12
12
|
class TextAlignment::TextAlignment
|
@@ -121,8 +121,8 @@ class TextAlignment::TextAlignment
|
|
121
121
|
if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
|
122
122
|
b1 = mblocks[-1][:source][:end]
|
123
123
|
b2 = mblocks[-1][:target][:end]
|
124
|
-
_str1 = str1[b1 ...
|
125
|
-
_str2 = str2[b2 ...
|
124
|
+
_str1 = str1[b1 ... str1.length]
|
125
|
+
_str2 = str2[b2 ... str2.length]
|
126
126
|
|
127
127
|
unless _str1.strip.empty?
|
128
128
|
if _str2.strip.empty?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.4'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|