text_alignment 0.3.18 → 0.3.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +2 -2
- data/lib/text_alignment/anchor_finder.rb +12 -5
- data/lib/text_alignment/text_alignment.rb +7 -15
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7678cbfd749833e03a17dcf0b06699def0215e7c39b981d2ff8ddfb3a79b2ea4
|
4
|
+
data.tar.gz: 0251b99cbd984dcd8e23ea38e6190a449a1f83f98119e68251f1f17e5854ac73
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '08055cb7659458e4a5834a193442c7592f9eecab09e206a8e5f20aab658c26cb24cda2949804db3569c160f03c6e71e9a28bdcdb352f8d207f1eb31aa427d963'
|
7
|
+
data.tar.gz: 43f0c0111662212c049a8b4baf2eeb7b2cc15a01d9f02d7cfbc501a9b016fa1344516685b8052c862227c72f8f9866b3232425e06d862899003860ad4700fc70
|
data/bin/align_annotations
CHANGED
@@ -108,7 +108,7 @@ else
|
|
108
108
|
# verification
|
109
109
|
source_text = source_annotations[:text]
|
110
110
|
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.length).each do |p|
|
111
|
+
(0 ... source_text.rstrip.length).each do |p|
|
112
112
|
t = alignment.transform_begin_position(p)
|
113
113
|
if t.nil?
|
114
114
|
print source_text[p]
|
@@ -120,7 +120,7 @@ else
|
|
120
120
|
puts "=====END"
|
121
121
|
|
122
122
|
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.length).each do |p|
|
123
|
+
(0 .. source_text.rstrip.length).each do |p|
|
124
124
|
t = alignment.transform_end_position(p)
|
125
125
|
if t.nil?
|
126
126
|
print source_text[p]
|
data/lib/text_alignment/anchor_finder.rb
CHANGED
@@ -3,7 +3,7 @@ require 'string-similarity'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
TextAlignment::SIZE_NGRAM =
|
6
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
7
|
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
8
|
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
@@ -30,13 +30,17 @@ class TextAlignment::AnchorFinder
|
|
30
30
|
def get_next_anchor
|
31
31
|
# find the position of an anchor ngram in s1 and s2
|
32
32
|
while @beg_s1 < (@s1.length - @size_ngram)
|
33
|
+
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
34
|
+
@beg_s1 += 1
|
35
|
+
next
|
36
|
+
end
|
33
37
|
anchor = @s1[@beg_s1, @size_ngram]
|
34
38
|
|
35
39
|
# search_position = 0
|
36
40
|
search_position = @end_s2_prev
|
37
41
|
while @beg_s2 = @s2.index(anchor, search_position)
|
38
42
|
# if both the beginning points are sufficiently close to the end points of the last match
|
39
|
-
break if @
|
43
|
+
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
40
44
|
|
41
45
|
left_window_s1, left_window_s2 = get_left_windows
|
42
46
|
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
@@ -57,9 +61,10 @@ class TextAlignment::AnchorFinder
|
|
57
61
|
# extend the block
|
58
62
|
b1 = @beg_s1
|
59
63
|
b2 = @beg_s2
|
60
|
-
while b1 >= @end_s1_prev && b2
|
64
|
+
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
61
65
|
b1 -= 1; b2 -= 1
|
62
66
|
end
|
67
|
+
|
63
68
|
b1 += 1; b2 += 1
|
64
69
|
|
65
70
|
e1 = @beg_s1 + @size_ngram
|
@@ -82,7 +87,8 @@ class TextAlignment::AnchorFinder
|
|
82
87
|
private
|
83
88
|
|
84
89
|
def get_left_windows
|
85
|
-
|
90
|
+
# commented out below, on the assumption that the beginning of a document gives significant locational information
|
91
|
+
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
86
92
|
|
87
93
|
window_s1 = ''
|
88
94
|
loc = @beg_s1 - 1
|
@@ -110,7 +116,8 @@ class TextAlignment::AnchorFinder
|
|
110
116
|
end
|
111
117
|
|
112
118
|
def get_right_windows
|
113
|
-
|
119
|
+
# commented out below, on the assumption that the end of a document gives significant locational information
|
120
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
114
121
|
|
115
122
|
window_s1 = ''
|
116
123
|
loc = @beg_s1 + @size_ngram
|
data/lib/text_alignment/text_alignment.rb
CHANGED
@@ -40,6 +40,9 @@ class TextAlignment::TextAlignment
|
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
+
# pp mblocks
|
44
|
+
# puts "-----"
|
45
|
+
# puts
|
43
46
|
# mblocks.each do |b|
|
44
47
|
# p [b[:source], b[:target]]
|
45
48
|
# puts "---"
|
@@ -78,6 +81,8 @@ class TextAlignment::TextAlignment
|
|
78
81
|
|
79
82
|
@block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
80
83
|
|
84
|
+
_str1 = str1[b1 ... e1]
|
85
|
+
_str2 = str2[b2 ... e2]
|
81
86
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
82
87
|
if alignment.similarity < 0.6
|
83
88
|
@block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
@@ -113,24 +118,11 @@ class TextAlignment::TextAlignment
|
|
113
118
|
end
|
114
119
|
|
115
120
|
# Final step
|
116
|
-
if mblocks[-1][:source][:end] < str1.length
|
117
|
-
b1 = mblocks[-1][:source][:end]
|
118
|
-
b2 = mblocks[-1][:target][:end]
|
119
|
-
|
120
|
-
if mblocks[-1][:target][:end] < str2.length
|
121
|
-
|
122
|
-
else
|
123
|
-
e1 = str1.length
|
124
|
-
e2 = str2.length
|
125
|
-
@block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
121
|
if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
|
130
122
|
b1 = mblocks[-1][:source][:end]
|
131
123
|
b2 = mblocks[-1][:target][:end]
|
132
|
-
_str1 = str1[b1 ...
|
133
|
-
_str2 = str2[b2 ...
|
124
|
+
_str1 = str1[b1 ... str1.length]
|
125
|
+
_str2 = str2[b2 ... str2.length]
|
134
126
|
|
135
127
|
unless _str1.strip.empty?
|
136
128
|
if _str2.strip.empty?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.18
|
4
|
+
version: 0.3.23
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|