text_alignment 0.3.22 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/anchor_finder.rb +9 -9
- data/lib/text_alignment/approximate_fit.rb +4 -6
- data/lib/text_alignment/constants.rb +7 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +0 -2
- data/lib/text_alignment/mixed_alignment.rb +1 -54
- data/lib/text_alignment/text_alignment.rb +62 -8
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
|
4
|
+
data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
|
7
|
+
data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
|
@@ -1,17 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
|
-
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
-
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
|
-
|
10
7
|
class TextAlignment::AnchorFinder
|
11
8
|
|
12
|
-
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
|
9
|
+
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
13
10
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
11
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
12
|
+
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
15
13
|
|
16
14
|
@reverse = (target_str.length < source_str.length)
|
17
15
|
|
@@ -43,10 +41,10 @@ class TextAlignment::AnchorFinder
|
|
43
41
|
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
44
42
|
|
45
43
|
left_window_s1, left_window_s2 = get_left_windows
|
46
|
-
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
44
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
47
45
|
|
48
46
|
right_window_s1, right_window_s2 = get_right_windows
|
49
|
-
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
47
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
50
48
|
|
51
49
|
search_position = @beg_s2 + 1
|
52
50
|
end
|
@@ -87,7 +85,8 @@ class TextAlignment::AnchorFinder
|
|
87
85
|
private
|
88
86
|
|
89
87
|
def get_left_windows
|
90
|
-
|
88
|
+
# commend below with the assumption that the beginning of a document gives a significant locational information
|
89
|
+
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
91
90
|
|
92
91
|
window_s1 = ''
|
93
92
|
loc = @beg_s1 - 1
|
@@ -115,7 +114,8 @@ class TextAlignment::AnchorFinder
|
|
115
114
|
end
|
116
115
|
|
117
116
|
def get_right_windows
|
118
|
-
|
117
|
+
# commend below with the assumption that the end of a document gives a significant locational
|
118
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
119
119
|
|
120
120
|
window_s1 = ''
|
121
121
|
loc = @beg_s1 + @size_ngram
|
@@ -1,13 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
7
|
# approximate the location of str1 in str2
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
9
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
10
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
11
9
|
|
12
10
|
class << TextAlignment
|
13
11
|
|
@@ -16,8 +14,8 @@ class << TextAlignment
|
|
16
14
|
raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
|
17
15
|
return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
18
16
|
|
19
|
-
ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
|
20
|
-
ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
|
17
|
+
ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
|
18
|
+
ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
|
21
19
|
ngram_shared = ngram1 & ngram2
|
22
20
|
|
23
21
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
@@ -45,7 +43,7 @@ class << TextAlignment
|
|
45
43
|
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
44
|
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
47
45
|
|
48
|
-
break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
49
47
|
fit_begin, fit_end = nil, nil
|
50
48
|
end
|
51
49
|
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
2
|
+
|
3
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
+
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
|
5
|
+
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
|
+
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
|
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
12
|
-
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
13
|
-
|
14
12
|
class TextAlignment::GLCSTextAlignment
|
15
13
|
attr_reader :position_map_begin, :position_map_end
|
16
14
|
attr_reader :common_elements, :mapped_elements
|
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
|
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
13
|
-
TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
|
14
|
-
|
15
13
|
class TextAlignment::MixedAlignment
|
16
14
|
attr_reader :sdiff
|
17
15
|
attr_reader :position_map_begin, :position_map_end
|
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
|
|
21
19
|
|
22
20
|
def initialize(str1, str2, mappings = [])
|
23
21
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
24
|
-
|
25
|
-
|
26
|
-
## preprocessing
|
27
|
-
str1 = str1.dup
|
28
|
-
str2 = str2.dup
|
29
|
-
mappings = mappings.dup
|
30
|
-
|
31
|
-
## find the first nomatch character
|
32
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
33
|
-
if str2.index(c).nil?
|
34
|
-
@nomatch_char1 = c
|
35
|
-
break
|
36
|
-
end
|
37
|
-
end
|
38
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
|
39
|
-
|
40
|
-
## find the first nomatch character
|
41
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
42
|
-
if c != @nomatch_char1 && str1.index(c).nil?
|
43
|
-
@nomatch_char2 = c
|
44
|
-
break
|
45
|
-
end
|
46
|
-
end
|
47
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
|
48
|
-
|
49
|
-
# single character mappings
|
50
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
51
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
52
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
53
|
-
characters_to.gsub!(/-/, '\-')
|
54
|
-
|
55
|
-
str1.tr!(characters_from, characters_to)
|
56
|
-
str2.tr!(characters_from, characters_to)
|
57
|
-
|
58
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
59
|
-
|
60
|
-
# ASCII foldings
|
61
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
62
|
-
ascii_foldings.each do |f|
|
63
|
-
from = f[1]
|
64
|
-
|
65
|
-
if str2.index(f[0])
|
66
|
-
to = f[0] + (@nomatch_char1 * (f[1].length - 1))
|
67
|
-
str1.gsub!(from, to)
|
68
|
-
end
|
69
|
-
|
70
|
-
if str1.index(f[0])
|
71
|
-
to = f[0] + (@nomatch_char2 * (f[1].length - 1))
|
72
|
-
str2.gsub!(from, to)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
22
|
+
mappings ||= []
|
76
23
|
|
77
24
|
_compute_mixed_alignment(str1, str2, mappings)
|
78
25
|
end
|
@@ -1,23 +1,21 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'text_alignment/anchor_finder'
|
3
4
|
require 'text_alignment/mixed_alignment'
|
4
5
|
|
5
6
|
module TextAlignment; end unless defined? TextAlignment
|
6
7
|
|
7
|
-
TextAlignment::
|
8
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
|
10
|
-
|
8
|
+
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
11
9
|
|
12
10
|
class TextAlignment::TextAlignment
|
13
11
|
attr_reader :block_alignments
|
14
12
|
attr_reader :similarity
|
15
13
|
attr_reader :lost_annotations
|
16
14
|
|
17
|
-
def initialize(
|
18
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
19
17
|
|
20
|
-
mappings
|
18
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
21
19
|
|
22
20
|
# try exact match
|
23
21
|
block_begin = str2.index(str1)
|
@@ -26,7 +24,7 @@ class TextAlignment::TextAlignment
|
|
26
24
|
return @block_alignments
|
27
25
|
end
|
28
26
|
|
29
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
|
27
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
30
28
|
|
31
29
|
# To collect matched blocks
|
32
30
|
mblocks = []
|
@@ -232,4 +230,60 @@ class TextAlignment::TextAlignment
|
|
232
230
|
r
|
233
231
|
end
|
234
232
|
|
233
|
+
private
|
234
|
+
|
235
|
+
|
236
|
+
def string_preprocessing(_str1, _str2)
|
237
|
+
str1 = _str1.dup
|
238
|
+
str2 = _str2.dup
|
239
|
+
mappings = TextAlignment::MAPPINGS.dup
|
240
|
+
|
241
|
+
## single character mappings
|
242
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
243
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
244
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
245
|
+
characters_to.gsub!(/-/, '\-')
|
246
|
+
|
247
|
+
str1.tr!(characters_from, characters_to)
|
248
|
+
str2.tr!(characters_from, characters_to)
|
249
|
+
|
250
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
251
|
+
|
252
|
+
## long to one character mappings
|
253
|
+
pletters = TextAlignment::PADDING_LETTERS
|
254
|
+
|
255
|
+
# find the padding letter for str1
|
256
|
+
padding_letter1 = begin
|
257
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
258
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
259
|
+
TextAlignment::PADDING_LETTERS[i]
|
260
|
+
end
|
261
|
+
|
262
|
+
# find the padding letter for str2
|
263
|
+
padding_letter2 = begin
|
264
|
+
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
265
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
266
|
+
TextAlignment::PADDING_LETTERS[i]
|
267
|
+
end
|
268
|
+
|
269
|
+
# ASCII foldings
|
270
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
271
|
+
ascii_foldings.each do |f|
|
272
|
+
from = f[1]
|
273
|
+
|
274
|
+
if str2.index(f[0])
|
275
|
+
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
276
|
+
str1.gsub!(from, to)
|
277
|
+
end
|
278
|
+
|
279
|
+
if str1.index(f[0])
|
280
|
+
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
281
|
+
str2.gsub!(from, to)
|
282
|
+
end
|
283
|
+
end
|
284
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
285
|
+
|
286
|
+
[str1, str2, mappings]
|
287
|
+
end
|
288
|
+
|
235
289
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.22
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,6 +77,7 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/constants.rb
|
80
81
|
- lib/text_alignment/find_divisions.rb
|
81
82
|
- lib/text_alignment/glcs_alignment.rb
|
82
83
|
- lib/text_alignment/glcs_alignment_fast.rb
|