text_alignment 0.2.8 → 0.2.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2dba8334b3fbdc77976ae32ed2e5844a716f954c850bf24c0937462c1cd2b220
4
- data.tar.gz: 806ac498264b81111ef1055dbb8592fa5ec7fd4755f0b2106c2851c45c6eb498
3
+ metadata.gz: 8c1c45ed630cdd60291606b59e1944f0b854a689cfa0d281ae8b8879bf01e806
4
+ data.tar.gz: 9b33688f3a08f9110f556b4357fbe598d42d1147bc06933323cae4df7187341f
5
5
  SHA512:
6
- metadata.gz: 4732107de89daff9e8bbe89254e0d138db517396958e75b56f7f0697e3ff9d38e6b64082bbc84482bf12a91da7a03cfacdb3691729aaebd3eeac6aa836bf07c5
7
- data.tar.gz: e101fd3c1f5b8a5d9604f4998218a816d0eeecd5e7afbed23bf7504403d97de5e282f0dcad37abb7a46804f1f0749f6b996430bc47e93ae0b4947a916c25f40d
6
+ metadata.gz: 856a8fca63f80be4cea7f6beff85dcf475d9237d68bc96728bcfcc030397f414637d2a8e32b139a6fdbccfd8603d327738ff7f3f59d80dc9e61e55a11a04bf20
7
+ data.tar.gz: 6055f50827354461f194a50da74259e8d97473be9b085634df270964bb815d96d9f916bbad2ef3961ddd845291a80e13b1680f1c42bafeab156b0d59fa3ba952
@@ -8,7 +8,7 @@ module TextAlignment
8
8
  SIGNATURE_NGRAM = 5
9
9
  MIN_LENGTH_FOR_APPROXIMATION = 50
10
10
  BUFFER_RATE = 0.1
11
- TEXT_SIMILARITY_TRESHOLD = 0.8
11
+ TEXT_SIMILARITY_TRESHOLD = 0.7
12
12
  end
13
13
 
14
14
  class << TextAlignment
@@ -28,6 +28,7 @@ class << TextAlignment
28
28
  signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
29
29
  return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
30
30
 
31
+ cache = {}
31
32
  fit_begin, fit_end = nil, nil
32
33
  signature_ngrams.each do |signature_ngram|
33
34
  loc_signature_ngram_in_str1 = str1.index(signature_ngram)
@@ -42,13 +43,15 @@ class << TextAlignment
42
43
  fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
43
44
  fit_end = str2.length if fit_end > str2.length
44
45
 
46
+ next if cache.has_key?("#{fit_begin}-#{fit_end}")
45
47
  text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
48
+ cache["#{fit_begin}-#{fit_end}"] = text_similarity
49
+
46
50
  break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
47
51
  fit_begin, fit_end = nil, nil
48
52
  end
49
-
50
- return nil, nil if fit_begin >= fit_end
51
- return fit_begin, fit_end
53
+ return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
54
+ return nil, nil
52
55
  end
53
56
 
54
57
  private
@@ -8,7 +8,7 @@ module TextAlignment; end unless defined? TextAlignment
8
8
  # to assume that there is no bag representation to this method
9
9
 
10
10
  module TextAlignment
11
- TextAlignment::SIMILARITY_THRESHOLD = 0.8
11
+ TextAlignment::SIMILARITY_THRESHOLD = 0.7
12
12
  end
13
13
 
14
14
  class << TextAlignment
@@ -28,12 +28,102 @@ class << TextAlignment
28
28
  target.tr!(characters_from, characters_to)
29
29
  sources.each{|source| source[:text].tr!(characters_from, characters_to)}
30
30
 
31
+ # to process smaller ones first
31
32
  sources.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}
32
33
 
33
34
  TextAlignment._find_divisions(target, sources)
34
35
  end
35
36
 
36
- def _find_divisions(target, sources)
37
# Greedily locates each source text inside the target text.
#
# On every pass all remaining sources are compared against what is left of
# the target. The candidate with the highest similarity is accepted, its
# region is cut out of the working target, and the source is dropped from
# the pool. The loop ends when the target is blank, the sources run out, or
# no source passes the similarity criteria.
#
# @param _target [String] the text to be divided; not modified (a copy is used)
# @param _sources [Array<Hash>] hashes with at least :text and :divid keys;
#   the array itself is not modified (a shallow copy is used)
# @return [Array<Hash>] one {divid:, region: [begin, end]} entry per matched
#   source, with regions expressed in coordinates of the original target,
#   followed (when something is left over) by a final
#   {divid: nil, remaining_target:, remaining_sources:} entry
def _find_divisions(_target, _sources)
  threshold = TextAlignment::SIMILARITY_THRESHOLD
  tolerance = 1 - threshold

  indice = []
  history = [] # regions already cut out, each in the coordinates current at cut time
  cache = {}   # comparison results, reused across passes for identical string pairs
  target = _target.dup
  sources = _sources.dup

  until target.strip.empty? || sources.empty?
    candidates = []

    sources.each_with_index do |source, i|
      # The shorter string (str1) is always searched for inside the longer one (str2).
      if target.size < source[:text].size
        mode, str1, str2 = :t_in_s, target, source[:text]
      else
        mode, str1, str2 = :s_in_t, source[:text], target
      end

      len1 = str1.length
      len2 = str2.length

      # Narrow str2 down first only when it is substantially longer than str1;
      # otherwise compare against the whole of str2.
      offset_begin, offset_end =
        if (len2 - len1) > len1 * tolerance
          approximate_fit(str1, str2)
        else
          [0, -1]
        end

      # approximate_fit yields nil offsets when it finds no plausible window.
      next if offset_begin.nil?

      # NOTE(review): approximate_fit appears to treat its end offset as
      # exclusive, while this range is inclusive (one extra character).
      # Kept as in the original - confirm whether that is intended.
      segment = str2[offset_begin..offset_end]
      cmp = (cache[[str1, segment]] ||= TextAlignment::LCSComparison.new(str1, segment))

      # Accept only if the overall similarity is high enough AND the matched
      # span covers nearly all of str1.
      if cmp.similarity > threshold &&
         (len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * tolerance
        candidates << { idx: i, offset: offset_begin, mode: mode, cmp: cmp }
      end
    end

    break if candidates.empty?

    # Pick the most similar candidate. (The previous comparator compared a
    # candidate with itself, so the first candidate always won.)
    choice = candidates.max_by { |c| c[:cmp].similarity }
    m = choice[:idx]

    region =
      if choice[:mode] == :t_in_s
        # The whole remaining target lies inside this source.
        [0, target.size]
      else # :s_in_t
        cmp = choice[:cmp]
        offset = choice[:offset]
        [cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]
      end
    index = { divid: sources[m][:divid], region: region }

    # Cut the matched region out of the working target.
    target = target[0...region[0]] + target[region[1]..-1].to_s

    # Translate the region back into original-target coordinates by re-adding
    # the widths of previously removed regions, most recent first.
    history.reverse_each do |h|
      gap = h[1] - h[0]
      region[0] += gap if region[0] >= h[0]
      region[1] += gap if region[1] > h[0]
    end
    # NOTE(review): the region is recorded after translation; the original
    # recorded it before. Earlier cuts are only described as being in the
    # coordinates current at cut time, so confirm which form later passes need.
    history << region.dup

    indice << index
    sources.delete_at(m)
  end

  # Report whatever could not be aligned.
  unless target.strip.empty? && sources.empty?
    index = { divid: nil }
    index[:remaining_target] = target unless target.strip.empty?
    index[:remaining_sources] = sources.map { |s| s[:divid] } unless sources.empty?
    indice << index
  end

  indice
end
125
+
126
+ def _find_divisions_old(target, sources)
37
127
  mode, m, c, offset_begin = nil, nil, nil, nil
38
128
 
39
129
  sources.each_with_index do |source, i|
@@ -88,6 +178,7 @@ class << TextAlignment
88
178
  return [index] + more_index
89
179
  end
90
180
  end
181
+
91
182
  end
92
183
 
93
184
  if __FILE__ == $0
@@ -98,20 +189,29 @@ if __FILE__ == $0
98
189
 
99
190
  sources = JSON.parse File.read(ARGV[1]), :symbolize_names => true
100
191
  div_index = TextAlignment::find_divisions(target_text, sources)
192
+ pp div_index
101
193
 
102
194
  # str1 = File.read(ARGV[0]).strip
103
195
  # str2 = File.read(ARGV[1]).strip
104
196
  # div_index = TextAlignment::find_divisions(str1, [str2])
105
197
 
106
- puts "target length: #{target_text.length}"
107
- div_index.each do |i|
108
- if i[0] >= 0
109
- puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
110
- puts target_text[i[1][0] ... i[1][1]]
111
- puts "=========="
112
- else
113
- p i
114
- end
115
- end
198
+ # puts "target length: #{target_text.length}"
199
+ # div_index.each do |i|
200
+ # unless i[:divid].nil?
201
+ # puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
202
+ # puts target_text[i[:region][0] ... i[:region][1]]
203
+ # puts "=========="
204
+ # else
205
+ # p i
206
+ # end
207
+
208
+ # # if i[0] >= 0
209
+ # # puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
210
+ # # puts target_text[i[1][0] ... i[1][1]]
211
+ # # puts "=========="
212
+ # # else
213
+ # # p i
214
+ # # end
215
+ # end
116
216
  end
117
217
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.2.8'
2
+ VERSION = '0.2.9'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.8
4
+ version: 0.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-04 00:00:00.000000000 Z
11
+ date: 2020-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary