text_alignment 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/approximate_fit.rb +7 -4
- data/lib/text_alignment/find_divisions.rb +112 -12
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8c1c45ed630cdd60291606b59e1944f0b854a689cfa0d281ae8b8879bf01e806
|
4
|
+
data.tar.gz: 9b33688f3a08f9110f556b4357fbe598d42d1147bc06933323cae4df7187341f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 856a8fca63f80be4cea7f6beff85dcf475d9237d68bc96728bcfcc030397f414637d2a8e32b139a6fdbccfd8603d327738ff7f3f59d80dc9e61e55a11a04bf20
|
7
|
+
data.tar.gz: 6055f50827354461f194a50da74259e8d97473be9b085634df270964bb815d96d9f916bbad2ef3961ddd845291a80e13b1680f1c42bafeab156b0d59fa3ba952
|
@@ -8,7 +8,7 @@ module TextAlignment
|
|
8
8
|
SIGNATURE_NGRAM = 5
|
9
9
|
MIN_LENGTH_FOR_APPROXIMATION = 50
|
10
10
|
BUFFER_RATE = 0.1
|
11
|
-
TEXT_SIMILARITY_TRESHOLD = 0.
|
11
|
+
TEXT_SIMILARITY_TRESHOLD = 0.7
|
12
12
|
end
|
13
13
|
|
14
14
|
class << TextAlignment
|
@@ -28,6 +28,7 @@ class << TextAlignment
|
|
28
28
|
signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
|
29
29
|
return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
|
30
30
|
|
31
|
+
cache = {}
|
31
32
|
fit_begin, fit_end = nil, nil
|
32
33
|
signature_ngrams.each do |signature_ngram|
|
33
34
|
loc_signature_ngram_in_str1 = str1.index(signature_ngram)
|
@@ -42,13 +43,15 @@ class << TextAlignment
|
|
42
43
|
fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
|
43
44
|
fit_end = str2.length if fit_end > str2.length
|
44
45
|
|
46
|
+
next if cache.has_key?("#{fit_begin}-#{fit_end}")
|
45
47
|
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
48
|
+
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
49
|
+
|
46
50
|
break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
47
51
|
fit_begin, fit_end = nil, nil
|
48
52
|
end
|
49
|
-
|
50
|
-
return nil, nil
|
51
|
-
return fit_begin, fit_end
|
53
|
+
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
54
|
+
return nil, nil
|
52
55
|
end
|
53
56
|
|
54
57
|
private
|
@@ -8,7 +8,7 @@ module TextAlignment; end unless defined? TextAlignment
|
|
8
8
|
# to assume that there is no bag representation to this method
|
9
9
|
|
10
10
|
module TextAlignment
|
11
|
-
TextAlignment::SIMILARITY_THRESHOLD = 0.
|
11
|
+
TextAlignment::SIMILARITY_THRESHOLD = 0.7
|
12
12
|
end
|
13
13
|
|
14
14
|
class << TextAlignment
|
@@ -28,12 +28,102 @@ class << TextAlignment
|
|
28
28
|
target.tr!(characters_from, characters_to)
|
29
29
|
sources.each{|source| source[:text].tr!(characters_from, characters_to)}
|
30
30
|
|
31
|
+
# to process smaller ones first
|
31
32
|
sources.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}
|
32
33
|
|
33
34
|
TextAlignment._find_divisions(target, sources)
|
34
35
|
end
|
35
36
|
|
36
|
-
def _find_divisions(
|
37
|
+
# Iteratively locates each source text within the target text.
#
# On every round, each remaining source is compared against what is left of
# the target. The candidate with the highest similarity is accepted, its
# matched region is cut out of the working copy of the target, and the
# source is removed from the working list. The loop stops when the target
# is blank, no sources remain, or no source qualifies as a candidate.
#
# _target  - the target text (String); it is not modified (a copy is used).
# _sources - Array of Hashes, each with :text (String) and :divid; the array
#            itself is not modified (a shallow copy is used).
#
# Returns an Array of Hashes. Each matched source yields
# {divid: <divid>, region: [begin, end]} with the region expressed in
# coordinates of the original target. If anything is left over, a final
# {divid: nil} entry carries :remaining_target and/or :remaining_sources
# (the divids of the unmatched sources).
def _find_divisions(_target, _sources)
  indice = []
  history = [] # regions removed so far, in the coordinates of the target at removal time
  cache = {}   # comparison results, keyed by the pair of compared strings
  target = _target.dup
  sources = _sources.dup

  until target.strip.empty? || sources.empty?
    candidates = []

    sources.each_with_index do |source, i|
      # The shorter text (str1) is always searched for within the longer one (str2).
      mode, str1, str2 = if target.size < source[:text].size
        [:t_in_s, target, source[:text]]
      else
        [:s_in_t, source[:text], target]
      end

      len1 = str1.length
      len2 = str2.length

      # Narrow str2 down to an approximate window only when it is
      # considerably longer than str1; otherwise compare against all of it.
      offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
        approximate_fit(str1, str2)
      else
        [0, -1]
      end

      # approximate_fit signals "no fit" with nil offsets.
      next if offset_begin.nil?

      # NOTE(review): offset_end is used as an inclusive bound here — confirm
      # this agrees with the bound convention of approximate_fit.
      fragment = str2[offset_begin .. offset_end]
      key = str1 + ' _:_ ' + fragment
      cmp = (cache[key] ||= TextAlignment::LCSComparison.new(str1, fragment))

      similar_enough = cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD
      unmatched_in_str1 = len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)
      if similar_enough && unmatched_in_str1 < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
        candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
      end
    end

    break if candidates.empty?

    # Pick the most similar candidate. (The comparator used previously
    # compared an element with itself, so the first candidate always won.)
    choice = candidates.max_by{|c| c[:cmp].similarity}
    m = choice[:idx]

    index = if choice[:mode] == :t_in_s
      # The whole remaining target lies within this source.
      {divid:sources[m][:divid], region:[0, target.size]}
    else # :s_in_t
      cmp = choice[:cmp]
      offset = choice[:offset]
      {divid:sources[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
    end

    # Cut the matched region out of the working target.
    target = target[0 ... index[:region][0]] + target[index[:region][1] .. -1]
    history << index[:region].dup

    # Map the region back to coordinates of the original target by
    # re-inserting the previously removed regions, most recent first.
    history[0 ... -1].reverse_each do |h|
      gap = h[1] - h[0]
      index[:region][0] += gap if index[:region][0] >= h[0]
      index[:region][1] += gap if index[:region][1] > h[0]
    end

    indice << index

    sources.delete_at(m)
  end

  # Report whatever could not be aligned.
  unless target.strip.empty? && sources.empty?
    index = {divid:nil}
    index[:remaining_target] = target unless target.strip.empty?
    index[:remaining_sources] = sources.collect{|s| s[:divid]} unless sources.empty?
    indice << index
  end

  indice
end
|
125
|
+
|
126
|
+
def _find_divisions_old(target, sources)
|
37
127
|
mode, m, c, offset_begin = nil, nil, nil, nil
|
38
128
|
|
39
129
|
sources.each_with_index do |source, i|
|
@@ -88,6 +178,7 @@ class << TextAlignment
|
|
88
178
|
return [index] + more_index
|
89
179
|
end
|
90
180
|
end
|
181
|
+
|
91
182
|
end
|
92
183
|
|
93
184
|
if __FILE__ == $0
|
@@ -98,20 +189,29 @@ if __FILE__ == $0
|
|
98
189
|
|
99
190
|
sources = JSON.parse File.read(ARGV[1]), :symbolize_names => true
|
100
191
|
div_index = TextAlignment::find_divisions(target_text, sources)
|
192
|
+
pp div_index
|
101
193
|
|
102
194
|
# str1 = File.read(ARGV[0]).strip
|
103
195
|
# str2 = File.read(ARGV[1]).strip
|
104
196
|
# div_index = TextAlignment::find_divisions(str1, [str2])
|
105
197
|
|
106
|
-
puts "target length: #{target_text.length}"
|
107
|
-
div_index.each do |i|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
198
|
+
# puts "target length: #{target_text.length}"
|
199
|
+
# div_index.each do |i|
|
200
|
+
# unless i[:divid].nil?
|
201
|
+
# puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
|
202
|
+
# puts target_text[i[:region][0] ... i[:region][1]]
|
203
|
+
# puts "=========="
|
204
|
+
# else
|
205
|
+
# p i
|
206
|
+
# end
|
207
|
+
|
208
|
+
# # if i[0] >= 0
|
209
|
+
# # puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
|
210
|
+
# # puts target_text[i[1][0] ... i[1][1]]
|
211
|
+
# # puts "=========="
|
212
|
+
# # else
|
213
|
+
# # p i
|
214
|
+
# # end
|
215
|
+
# end
|
116
216
|
end
|
117
217
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|