text_alignment 0.3.20 → 0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +2 -2
- data/lib/text_alignment/anchor_finder.rb +16 -8
- data/lib/text_alignment/text_alignment.rb +3 -3
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 35f83f76cd3e9ca59604327710d2eac4684b8abca9dbd167b2b391d98ea561f9
|
4
|
+
data.tar.gz: 77007a1bfcf72d0681ac9d21b35221e7d519a291383f3d0d8fa0574d631b65ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f936569e072e6693f279b5d3a349965b286192d8f694fffbbf41110f66afbd5d80f865a9232c2bef60ec94920e0a814142c8756f0ba3ff29a109cbbe1f7abec
|
7
|
+
data.tar.gz: 955b85aa639f733361f354cc68ac3d8b14a17a14784f1e6ec24f240934ef0d160c6b8cf01232e1e4243c2eecffcaf1665999cc3e455bfb0e61c0dd905d2cb8b4
|
data/bin/align_annotations
CHANGED
@@ -108,7 +108,7 @@ else
|
|
108
108
|
# verification
|
109
109
|
source_text = source_annotations[:text]
|
110
110
|
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.length).each do |p|
|
111
|
+
(0 ... source_text.rstrip.length).each do |p|
|
112
112
|
t = alignment.transform_begin_position(p)
|
113
113
|
if t.nil?
|
114
114
|
print source_text[p]
|
@@ -120,7 +120,7 @@ else
|
|
120
120
|
puts "=====END"
|
121
121
|
|
122
122
|
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.length).each do |p|
|
123
|
+
(0 .. source_text.rstrip.length).each do |p|
|
124
124
|
t = alignment.transform_end_position(p)
|
125
125
|
if t.nil?
|
126
126
|
print source_text[p]
|
@@ -3,15 +3,16 @@ require 'string-similarity'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
TextAlignment::SIZE_NGRAM =
|
6
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
7
|
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
8
|
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
11
11
|
|
12
|
-
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
|
12
|
+
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
13
13
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
14
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
15
|
+
@sim_threshold = _size_window || TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
15
16
|
|
16
17
|
@reverse = (target_str.length < source_str.length)
|
17
18
|
|
@@ -30,19 +31,23 @@ class TextAlignment::AnchorFinder
|
|
30
31
|
def get_next_anchor
|
31
32
|
# find the position of an anchor ngram in s1 and s2
|
32
33
|
while @beg_s1 < (@s1.length - @size_ngram)
|
34
|
+
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
35
|
+
@beg_s1 += 1
|
36
|
+
next
|
37
|
+
end
|
33
38
|
anchor = @s1[@beg_s1, @size_ngram]
|
34
39
|
|
35
40
|
# search_position = 0
|
36
41
|
search_position = @end_s2_prev
|
37
42
|
while @beg_s2 = @s2.index(anchor, search_position)
|
38
43
|
# if both the beginning points are sufficiently close to the end points of the last match
|
39
|
-
break if @
|
44
|
+
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
40
45
|
|
41
46
|
left_window_s1, left_window_s2 = get_left_windows
|
42
|
-
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) >
|
47
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
43
48
|
|
44
49
|
right_window_s1, right_window_s2 = get_right_windows
|
45
|
-
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) >
|
50
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
46
51
|
|
47
52
|
search_position = @beg_s2 + 1
|
48
53
|
end
|
@@ -57,9 +62,10 @@ class TextAlignment::AnchorFinder
|
|
57
62
|
# extend the block
|
58
63
|
b1 = @beg_s1
|
59
64
|
b2 = @beg_s2
|
60
|
-
while b1 >= @end_s1_prev && b2
|
65
|
+
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
61
66
|
b1 -= 1; b2 -= 1
|
62
67
|
end
|
68
|
+
|
63
69
|
b1 += 1; b2 += 1
|
64
70
|
|
65
71
|
e1 = @beg_s1 + @size_ngram
|
@@ -82,7 +88,8 @@ class TextAlignment::AnchorFinder
|
|
82
88
|
private
|
83
89
|
|
84
90
|
def get_left_windows
|
85
|
-
|
91
|
+
# commented out below, on the assumption that the beginning of a document gives significant locational information
|
92
|
+
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
86
93
|
|
87
94
|
window_s1 = ''
|
88
95
|
loc = @beg_s1 - 1
|
@@ -110,7 +117,8 @@ class TextAlignment::AnchorFinder
|
|
110
117
|
end
|
111
118
|
|
112
119
|
def get_right_windows
|
113
|
-
|
120
|
+
# commented out below, on the assumption that the end of a document gives significant locational information
|
121
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
114
122
|
|
115
123
|
window_s1 = ''
|
116
124
|
loc = @beg_s1 + @size_ngram
|
@@ -6,7 +6,7 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN =
|
9
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
10
10
|
|
11
11
|
|
12
12
|
class TextAlignment::TextAlignment
|
@@ -121,8 +121,8 @@ class TextAlignment::TextAlignment
|
|
121
121
|
if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
|
122
122
|
b1 = mblocks[-1][:source][:end]
|
123
123
|
b2 = mblocks[-1][:target][:end]
|
124
|
-
_str1 = str1[b1 ...
|
125
|
-
_str2 = str2[b2 ...
|
124
|
+
_str1 = str1[b1 ... str1.length]
|
125
|
+
_str2 = str2[b2 ... str2.length]
|
126
126
|
|
127
127
|
unless _str1.strip.empty?
|
128
128
|
if _str2.strip.empty?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.4'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|