text_alignment 0.3.21 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +2 -2
- data/lib/text_alignment/anchor_finder.rb +15 -10
- data/lib/text_alignment/approximate_fit.rb +4 -6
- data/lib/text_alignment/constants.rb +7 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +0 -2
- data/lib/text_alignment/text_alignment.rb +7 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d56b44ee3cc3451b78ddb87134fefd6c043b4730f1170b07069a417bb162bad1
|
4
|
+
data.tar.gz: 67318d70bee5bb82127c98e79e0b45e5e337832a2b8d23469474215168579f20
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60ae2292556d15d860e0c7377be9c1fd5aec6741b26f562fee8c0c2a9a123d0b8c656b2f7f34b8f74f83ea9ac4878b53f17454e3b25d27d00cd98ecf8175a387
|
7
|
+
data.tar.gz: fe8fe545101cddb638c00ffb6f6785f8b2f5ddb109b4010739e399a2c50fe41164d791fe4a5599a76fca5553c494f78fd49630368d48e406a8211fc0516916f5
|
data/bin/align_annotations
CHANGED
@@ -108,7 +108,7 @@ else
|
|
108
108
|
# verification
|
109
109
|
source_text = source_annotations[:text]
|
110
110
|
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.length).each do |p|
|
111
|
+
(0 ... source_text.rstrip.length).each do |p|
|
112
112
|
t = alignment.transform_begin_position(p)
|
113
113
|
if t.nil?
|
114
114
|
print source_text[p]
|
@@ -120,7 +120,7 @@ else
|
|
120
120
|
puts "=====END"
|
121
121
|
|
122
122
|
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.length).each do |p|
|
123
|
+
(0 .. source_text.rstrip.length).each do |p|
|
124
124
|
t = alignment.transform_end_position(p)
|
125
125
|
if t.nil?
|
126
126
|
print source_text[p]
|
data/lib/text_alignment/anchor_finder.rb
CHANGED
@@ -1,17 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
|
-
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
-
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
|
-
|
10
7
|
class TextAlignment::AnchorFinder
|
11
8
|
|
12
|
-
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
|
9
|
+
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
13
10
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
11
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
12
|
+
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
15
13
|
|
16
14
|
@reverse = (target_str.length < source_str.length)
|
17
15
|
|
@@ -30,6 +28,10 @@ class TextAlignment::AnchorFinder
|
|
30
28
|
def get_next_anchor
|
31
29
|
# find the position of an anchor ngram in s1 and s2
|
32
30
|
while @beg_s1 < (@s1.length - @size_ngram)
|
31
|
+
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
32
|
+
@beg_s1 += 1
|
33
|
+
next
|
34
|
+
end
|
33
35
|
anchor = @s1[@beg_s1, @size_ngram]
|
34
36
|
|
35
37
|
# search_position = 0
|
@@ -39,10 +41,10 @@ class TextAlignment::AnchorFinder
|
|
39
41
|
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
40
42
|
|
41
43
|
left_window_s1, left_window_s2 = get_left_windows
|
42
|
-
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
44
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
43
45
|
|
44
46
|
right_window_s1, right_window_s2 = get_right_windows
|
45
|
-
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
47
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
46
48
|
|
47
49
|
search_position = @beg_s2 + 1
|
48
50
|
end
|
@@ -57,9 +59,10 @@ class TextAlignment::AnchorFinder
|
|
57
59
|
# extend the block
|
58
60
|
b1 = @beg_s1
|
59
61
|
b2 = @beg_s2
|
60
|
-
while b1 >= @end_s1_prev && b2
|
62
|
+
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
61
63
|
b1 -= 1; b2 -= 1
|
62
64
|
end
|
65
|
+
|
63
66
|
b1 += 1; b2 += 1
|
64
67
|
|
65
68
|
e1 = @beg_s1 + @size_ngram
|
@@ -82,7 +85,8 @@ class TextAlignment::AnchorFinder
|
|
82
85
|
private
|
83
86
|
|
84
87
|
def get_left_windows
|
85
|
-
|
88
|
+
# commented out below on the assumption that the beginning of a document gives significant locational information
|
89
|
+
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
86
90
|
|
87
91
|
window_s1 = ''
|
88
92
|
loc = @beg_s1 - 1
|
@@ -110,7 +114,8 @@ class TextAlignment::AnchorFinder
|
|
110
114
|
end
|
111
115
|
|
112
116
|
def get_right_windows
|
113
|
-
|
117
|
+
# commented out below on the assumption that the end of a document gives significant locational information
|
118
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
114
119
|
|
115
120
|
window_s1 = ''
|
116
121
|
loc = @beg_s1 + @size_ngram
|
data/lib/text_alignment/approximate_fit.rb
CHANGED
@@ -1,13 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
7
|
# approximate the location of str1 in str2
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
9
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
10
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
11
9
|
|
12
10
|
class << TextAlignment
|
13
11
|
|
@@ -16,8 +14,8 @@ class << TextAlignment
|
|
16
14
|
raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
|
17
15
|
return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
18
16
|
|
19
|
-
ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
|
20
|
-
ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
|
17
|
+
ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
|
18
|
+
ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
|
21
19
|
ngram_shared = ngram1 & ngram2
|
22
20
|
|
23
21
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
@@ -45,7 +43,7 @@ class << TextAlignment
|
|
45
43
|
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
44
|
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
47
45
|
|
48
|
-
break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
49
47
|
fit_begin, fit_end = nil, nil
|
50
48
|
end
|
51
49
|
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
data/lib/text_alignment/constants.rb
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
2
|
+
|
3
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
+
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
|
5
|
+
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
|
+
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.87 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
data/lib/text_alignment/glcs_alignment_fast.rb
CHANGED
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
|
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
12
|
-
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
13
|
-
|
14
12
|
class TextAlignment::GLCSTextAlignment
|
15
13
|
attr_reader :position_map_begin, :position_map_end
|
16
14
|
attr_reader :common_elements, :mapped_elements
|
data/lib/text_alignment/text_alignment.rb
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'text_alignment/anchor_finder'
|
3
4
|
require 'text_alignment/mixed_alignment'
|
4
5
|
|
5
6
|
module TextAlignment; end unless defined? TextAlignment
|
6
7
|
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
|
10
|
-
|
11
|
-
|
12
8
|
class TextAlignment::TextAlignment
|
13
9
|
attr_reader :block_alignments
|
14
10
|
attr_reader :similarity
|
15
11
|
attr_reader :lost_annotations
|
16
12
|
|
17
|
-
def initialize(str1, str2,
|
13
|
+
def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
18
14
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
19
15
|
|
16
|
+
size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
17
|
+
size_window = _size_window || TextAlignment::SIZE_WINDOW
|
18
|
+
sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
19
|
+
|
20
20
|
mappings ||= TextAlignment::MAPPINGS
|
21
21
|
|
22
22
|
# try exact match
|
@@ -26,7 +26,7 @@ class TextAlignment::TextAlignment
|
|
26
26
|
return @block_alignments
|
27
27
|
end
|
28
28
|
|
29
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
|
29
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, size_ngram, size_window, sim_threshold)
|
30
30
|
|
31
31
|
# To collect matched blocks
|
32
32
|
mblocks = []
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.21
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,6 +77,7 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/constants.rb
|
80
81
|
- lib/text_alignment/find_divisions.rb
|
81
82
|
- lib/text_alignment/glcs_alignment.rb
|
82
83
|
- lib/text_alignment/glcs_alignment_fast.rb
|