text_alignment 0.3.22 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_alignment/anchor_finder.rb +9 -9
- data/lib/text_alignment/approximate_fit.rb +4 -6
- data/lib/text_alignment/constants.rb +7 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +0 -2
- data/lib/text_alignment/mixed_alignment.rb +1 -54
- data/lib/text_alignment/text_alignment.rb +62 -8
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
|
4
|
+
data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
|
7
|
+
data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
|
@@ -1,17 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
|
-
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
-
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
|
-
|
10
7
|
class TextAlignment::AnchorFinder
|
11
8
|
|
12
|
-
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
|
9
|
+
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
13
10
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
11
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
12
|
+
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
15
13
|
|
16
14
|
@reverse = (target_str.length < source_str.length)
|
17
15
|
|
@@ -43,10 +41,10 @@ class TextAlignment::AnchorFinder
|
|
43
41
|
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
44
42
|
|
45
43
|
left_window_s1, left_window_s2 = get_left_windows
|
46
|
-
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) >
|
44
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
47
45
|
|
48
46
|
right_window_s1, right_window_s2 = get_right_windows
|
49
|
-
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) >
|
47
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
50
48
|
|
51
49
|
search_position = @beg_s2 + 1
|
52
50
|
end
|
@@ -87,7 +85,8 @@ class TextAlignment::AnchorFinder
|
|
87
85
|
private
|
88
86
|
|
89
87
|
def get_left_windows
|
90
|
-
|
88
|
+
# the guard below is commented out on the assumption that the beginning of a document gives significant locational information
|
89
|
+
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
91
90
|
|
92
91
|
window_s1 = ''
|
93
92
|
loc = @beg_s1 - 1
|
@@ -115,7 +114,8 @@ class TextAlignment::AnchorFinder
|
|
115
114
|
end
|
116
115
|
|
117
116
|
def get_right_windows
|
118
|
-
|
117
|
+
# the guard below is commented out on the assumption that the end of a document gives significant locational information
|
118
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
119
119
|
|
120
120
|
window_s1 = ''
|
121
121
|
loc = @beg_s1 + @size_ngram
|
@@ -1,13 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
7
|
# approximate the location of str1 in str2
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
9
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
10
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
11
9
|
|
12
10
|
class << TextAlignment
|
13
11
|
|
@@ -16,8 +14,8 @@ class << TextAlignment
|
|
16
14
|
raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
|
17
15
|
return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
18
16
|
|
19
|
-
ngram1 = (0 .. str1.length - TextAlignment::
|
20
|
-
ngram2 = (0 .. str2.length - TextAlignment::
|
17
|
+
ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
|
18
|
+
ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
|
21
19
|
ngram_shared = ngram1 & ngram2
|
22
20
|
|
23
21
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
@@ -45,7 +43,7 @@ class << TextAlignment
|
|
45
43
|
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
44
|
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
47
45
|
|
48
|
-
break if text_similarity > TextAlignment::
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
49
47
|
fit_begin, fit_end = nil, nil
|
50
48
|
end
|
51
49
|
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
2
|
+
|
3
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
+
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
|
5
|
+
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
|
+
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
|
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
12
|
-
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
13
|
-
|
14
12
|
class TextAlignment::GLCSTextAlignment
|
15
13
|
attr_reader :position_map_begin, :position_map_end
|
16
14
|
attr_reader :common_elements, :mapped_elements
|
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
|
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
13
|
-
TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
|
14
|
-
|
15
13
|
class TextAlignment::MixedAlignment
|
16
14
|
attr_reader :sdiff
|
17
15
|
attr_reader :position_map_begin, :position_map_end
|
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
|
|
21
19
|
|
22
20
|
def initialize(str1, str2, mappings = [])
|
23
21
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
24
|
-
|
25
|
-
|
26
|
-
## preprocessing
|
27
|
-
str1 = str1.dup
|
28
|
-
str2 = str2.dup
|
29
|
-
mappings = mappings.dup
|
30
|
-
|
31
|
-
## find the first nomatch character
|
32
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
33
|
-
if str2.index(c).nil?
|
34
|
-
@nomatch_char1 = c
|
35
|
-
break
|
36
|
-
end
|
37
|
-
end
|
38
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
|
39
|
-
|
40
|
-
## find the first nomatch character
|
41
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
42
|
-
if c != @nomatch_char1 && str1.index(c).nil?
|
43
|
-
@nomatch_char2 = c
|
44
|
-
break
|
45
|
-
end
|
46
|
-
end
|
47
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
|
48
|
-
|
49
|
-
# single character mappings
|
50
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
51
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
52
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
53
|
-
characters_to.gsub!(/-/, '\-')
|
54
|
-
|
55
|
-
str1.tr!(characters_from, characters_to)
|
56
|
-
str2.tr!(characters_from, characters_to)
|
57
|
-
|
58
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
59
|
-
|
60
|
-
# ASCII foldings
|
61
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
62
|
-
ascii_foldings.each do |f|
|
63
|
-
from = f[1]
|
64
|
-
|
65
|
-
if str2.index(f[0])
|
66
|
-
to = f[0] + (@nomatch_char1 * (f[1].length - 1))
|
67
|
-
str1.gsub!(from, to)
|
68
|
-
end
|
69
|
-
|
70
|
-
if str1.index(f[0])
|
71
|
-
to = f[0] + (@nomatch_char2 * (f[1].length - 1))
|
72
|
-
str2.gsub!(from, to)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
22
|
+
mappings ||= []
|
76
23
|
|
77
24
|
_compute_mixed_alignment(str1, str2, mappings)
|
78
25
|
end
|
@@ -1,23 +1,21 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'text_alignment/anchor_finder'
|
3
4
|
require 'text_alignment/mixed_alignment'
|
4
5
|
|
5
6
|
module TextAlignment; end unless defined? TextAlignment
|
6
7
|
|
7
|
-
TextAlignment::
|
8
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
|
10
|
-
|
8
|
+
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
11
9
|
|
12
10
|
class TextAlignment::TextAlignment
|
13
11
|
attr_reader :block_alignments
|
14
12
|
attr_reader :similarity
|
15
13
|
attr_reader :lost_annotations
|
16
14
|
|
17
|
-
def initialize(
|
18
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
19
17
|
|
20
|
-
mappings
|
18
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
21
19
|
|
22
20
|
# try exact match
|
23
21
|
block_begin = str2.index(str1)
|
@@ -26,7 +24,7 @@ class TextAlignment::TextAlignment
|
|
26
24
|
return @block_alignments
|
27
25
|
end
|
28
26
|
|
29
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
|
27
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
30
28
|
|
31
29
|
# To collect matched blocks
|
32
30
|
mblocks = []
|
@@ -232,4 +230,60 @@ class TextAlignment::TextAlignment
|
|
232
230
|
r
|
233
231
|
end
|
234
232
|
|
233
|
+
private
|
234
|
+
|
235
|
+
|
236
|
+
def string_preprocessing(_str1, _str2)
|
237
|
+
str1 = _str1.dup
|
238
|
+
str2 = _str2.dup
|
239
|
+
mappings = TextAlignment::MAPPINGS.dup
|
240
|
+
|
241
|
+
## single character mappings
|
242
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
243
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
244
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
245
|
+
characters_to.gsub!(/-/, '\-')
|
246
|
+
|
247
|
+
str1.tr!(characters_from, characters_to)
|
248
|
+
str2.tr!(characters_from, characters_to)
|
249
|
+
|
250
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
251
|
+
|
252
|
+
## long to one character mappings
|
253
|
+
pletters = TextAlignment::PADDING_LETTERS
|
254
|
+
|
255
|
+
# find the padding letter for str1
|
256
|
+
padding_letter1 = begin
|
257
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
258
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
259
|
+
TextAlignment::PADDING_LETTERS[i]
|
260
|
+
end
|
261
|
+
|
262
|
+
# find the padding letter for str2
|
263
|
+
padding_letter2 = begin
|
264
|
+
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
265
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
266
|
+
TextAlignment::PADDING_LETTERS[i]
|
267
|
+
end
|
268
|
+
|
269
|
+
# ASCII foldings
|
270
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
271
|
+
ascii_foldings.each do |f|
|
272
|
+
from = f[1]
|
273
|
+
|
274
|
+
if str2.index(f[0])
|
275
|
+
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
276
|
+
str1.gsub!(from, to)
|
277
|
+
end
|
278
|
+
|
279
|
+
if str1.index(f[0])
|
280
|
+
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
281
|
+
str2.gsub!(from, to)
|
282
|
+
end
|
283
|
+
end
|
284
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
285
|
+
|
286
|
+
[str1, str2, mappings]
|
287
|
+
end
|
288
|
+
|
235
289
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.22
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,6 +77,7 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/constants.rb
|
80
81
|
- lib/text_alignment/find_divisions.rb
|
81
82
|
- lib/text_alignment/glcs_alignment.rb
|
82
83
|
- lib/text_alignment/glcs_alignment_fast.rb
|