text_alignment 0.3.21 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +2 -2
- data/lib/text_alignment/anchor_finder.rb +15 -10
- data/lib/text_alignment/approximate_fit.rb +4 -6
- data/lib/text_alignment/constants.rb +7 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +0 -2
- data/lib/text_alignment/text_alignment.rb +7 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d56b44ee3cc3451b78ddb87134fefd6c043b4730f1170b07069a417bb162bad1
|
4
|
+
data.tar.gz: 67318d70bee5bb82127c98e79e0b45e5e337832a2b8d23469474215168579f20
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60ae2292556d15d860e0c7377be9c1fd5aec6741b26f562fee8c0c2a9a123d0b8c656b2f7f34b8f74f83ea9ac4878b53f17454e3b25d27d00cd98ecf8175a387
|
7
|
+
data.tar.gz: fe8fe545101cddb638c00ffb6f6785f8b2f5ddb109b4010739e399a2c50fe41164d791fe4a5599a76fca5553c494f78fd49630368d48e406a8211fc0516916f5
|
data/bin/align_annotations
CHANGED
@@ -108,7 +108,7 @@ else
|
|
108
108
|
# verification
|
109
109
|
source_text = source_annotations[:text]
|
110
110
|
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.length).each do |p|
|
111
|
+
(0 ... source_text.rstrip.length).each do |p|
|
112
112
|
t = alignment.transform_begin_position(p)
|
113
113
|
if t.nil?
|
114
114
|
print source_text[p]
|
@@ -120,7 +120,7 @@ else
|
|
120
120
|
puts "=====END"
|
121
121
|
|
122
122
|
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.length).each do |p|
|
123
|
+
(0 .. source_text.rstrip.length).each do |p|
|
124
124
|
t = alignment.transform_end_position(p)
|
125
125
|
if t.nil?
|
126
126
|
print source_text[p]
|
data/lib/text_alignment/anchor_finder.rb
CHANGED
@@ -1,17 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
|
-
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
-
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
|
-
|
10
7
|
class TextAlignment::AnchorFinder
|
11
8
|
|
12
|
-
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
|
9
|
+
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
13
10
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
11
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
12
|
+
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
15
13
|
|
16
14
|
@reverse = (target_str.length < source_str.length)
|
17
15
|
|
@@ -30,6 +28,10 @@ class TextAlignment::AnchorFinder
|
|
30
28
|
def get_next_anchor
|
31
29
|
# find the position of an anchor ngram in s1 and s2
|
32
30
|
while @beg_s1 < (@s1.length - @size_ngram)
|
31
|
+
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
32
|
+
@beg_s1 += 1
|
33
|
+
next
|
34
|
+
end
|
33
35
|
anchor = @s1[@beg_s1, @size_ngram]
|
34
36
|
|
35
37
|
# search_position = 0
|
@@ -39,10 +41,10 @@ class TextAlignment::AnchorFinder
|
|
39
41
|
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
40
42
|
|
41
43
|
left_window_s1, left_window_s2 = get_left_windows
|
42
|
-
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) >
|
44
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
43
45
|
|
44
46
|
right_window_s1, right_window_s2 = get_right_windows
|
45
|
-
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) >
|
47
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
46
48
|
|
47
49
|
search_position = @beg_s2 + 1
|
48
50
|
end
|
@@ -57,9 +59,10 @@ class TextAlignment::AnchorFinder
|
|
57
59
|
# extend the block
|
58
60
|
b1 = @beg_s1
|
59
61
|
b2 = @beg_s2
|
60
|
-
while b1 >= @end_s1_prev && b2
|
62
|
+
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
61
63
|
b1 -= 1; b2 -= 1
|
62
64
|
end
|
65
|
+
|
63
66
|
b1 += 1; b2 += 1
|
64
67
|
|
65
68
|
e1 = @beg_s1 + @size_ngram
|
@@ -82,7 +85,8 @@ class TextAlignment::AnchorFinder
|
|
82
85
|
private
|
83
86
|
|
84
87
|
def get_left_windows
|
85
|
-
|
88
|
+
# the check below is commented out on the assumption that the beginning of a document gives significant locational information
|
89
|
+
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
86
90
|
|
87
91
|
window_s1 = ''
|
88
92
|
loc = @beg_s1 - 1
|
@@ -110,7 +114,8 @@ class TextAlignment::AnchorFinder
|
|
110
114
|
end
|
111
115
|
|
112
116
|
def get_right_windows
|
113
|
-
|
117
|
+
# the check below is commented out on the assumption that the end of a document gives significant locational information
|
118
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
114
119
|
|
115
120
|
window_s1 = ''
|
116
121
|
loc = @beg_s1 + @size_ngram
|
data/lib/text_alignment/approximate_fit.rb
CHANGED
@@ -1,13 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
7
|
# approximate the location of str1 in str2
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
9
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
10
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
11
9
|
|
12
10
|
class << TextAlignment
|
13
11
|
|
@@ -16,8 +14,8 @@ class << TextAlignment
|
|
16
14
|
raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
|
17
15
|
return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
18
16
|
|
19
|
-
ngram1 = (0 .. str1.length - TextAlignment::
|
20
|
-
ngram2 = (0 .. str2.length - TextAlignment::
|
17
|
+
ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
|
18
|
+
ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
|
21
19
|
ngram_shared = ngram1 & ngram2
|
22
20
|
|
23
21
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
@@ -45,7 +43,7 @@ class << TextAlignment
|
|
45
43
|
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
44
|
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
47
45
|
|
48
|
-
break if text_similarity > TextAlignment::
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
49
47
|
fit_begin, fit_end = nil, nil
|
50
48
|
end
|
51
49
|
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
data/lib/text_alignment/constants.rb
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
2
|
+
|
3
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
+
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
|
5
|
+
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
|
+
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.87 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
data/lib/text_alignment/glcs_alignment_fast.rb
CHANGED
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
|
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
12
|
-
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
13
|
-
|
14
12
|
class TextAlignment::GLCSTextAlignment
|
15
13
|
attr_reader :position_map_begin, :position_map_end
|
16
14
|
attr_reader :common_elements, :mapped_elements
|
data/lib/text_alignment/text_alignment.rb
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'text_alignment/anchor_finder'
|
3
4
|
require 'text_alignment/mixed_alignment'
|
4
5
|
|
5
6
|
module TextAlignment; end unless defined? TextAlignment
|
6
7
|
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
|
10
|
-
|
11
|
-
|
12
8
|
class TextAlignment::TextAlignment
|
13
9
|
attr_reader :block_alignments
|
14
10
|
attr_reader :similarity
|
15
11
|
attr_reader :lost_annotations
|
16
12
|
|
17
|
-
def initialize(str1, str2,
|
13
|
+
def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
18
14
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
19
15
|
|
16
|
+
size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
17
|
+
size_window = _size_window || TextAlignment::SIZE_WINDOW
|
18
|
+
sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
19
|
+
|
20
20
|
mappings ||= TextAlignment::MAPPINGS
|
21
21
|
|
22
22
|
# try exact match
|
@@ -26,7 +26,7 @@ class TextAlignment::TextAlignment
|
|
26
26
|
return @block_alignments
|
27
27
|
end
|
28
28
|
|
29
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
|
29
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, size_ngram, size_window, sim_threshold)
|
30
30
|
|
31
31
|
# To collect matched blocks
|
32
32
|
mblocks = []
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.21
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,6 +77,7 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/constants.rb
|
80
81
|
- lib/text_alignment/find_divisions.rb
|
81
82
|
- lib/text_alignment/glcs_alignment.rb
|
82
83
|
- lib/text_alignment/glcs_alignment_fast.rb
|