text_alignment 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2dba8334b3fbdc77976ae32ed2e5844a716f954c850bf24c0937462c1cd2b220
4
- data.tar.gz: 806ac498264b81111ef1055dbb8592fa5ec7fd4755f0b2106c2851c45c6eb498
3
+ metadata.gz: 8c1c45ed630cdd60291606b59e1944f0b854a689cfa0d281ae8b8879bf01e806
4
+ data.tar.gz: 9b33688f3a08f9110f556b4357fbe598d42d1147bc06933323cae4df7187341f
5
5
  SHA512:
6
- metadata.gz: 4732107de89daff9e8bbe89254e0d138db517396958e75b56f7f0697e3ff9d38e6b64082bbc84482bf12a91da7a03cfacdb3691729aaebd3eeac6aa836bf07c5
7
- data.tar.gz: e101fd3c1f5b8a5d9604f4998218a816d0eeecd5e7afbed23bf7504403d97de5e282f0dcad37abb7a46804f1f0749f6b996430bc47e93ae0b4947a916c25f40d
6
+ metadata.gz: 856a8fca63f80be4cea7f6beff85dcf475d9237d68bc96728bcfcc030397f414637d2a8e32b139a6fdbccfd8603d327738ff7f3f59d80dc9e61e55a11a04bf20
7
+ data.tar.gz: 6055f50827354461f194a50da74259e8d97473be9b085634df270964bb815d96d9f916bbad2ef3961ddd845291a80e13b1680f1c42bafeab156b0d59fa3ba952
@@ -8,7 +8,7 @@ module TextAlignment
8
8
  SIGNATURE_NGRAM = 5
9
9
  MIN_LENGTH_FOR_APPROXIMATION = 50
10
10
  BUFFER_RATE = 0.1
11
- TEXT_SIMILARITY_TRESHOLD = 0.8
11
+ TEXT_SIMILARITY_TRESHOLD = 0.7
12
12
  end
13
13
 
14
14
  class << TextAlignment
@@ -28,6 +28,7 @@ class << TextAlignment
28
28
  signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
29
29
  return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
30
30
 
31
+ cache = {}
31
32
  fit_begin, fit_end = nil, nil
32
33
  signature_ngrams.each do |signature_ngram|
33
34
  loc_signature_ngram_in_str1 = str1.index(signature_ngram)
@@ -42,13 +43,15 @@ class << TextAlignment
42
43
  fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
43
44
  fit_end = str2.length if fit_end > str2.length
44
45
 
46
+ next if cache.has_key?("#{fit_begin}-#{fit_end}")
45
47
  text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
48
+ cache["#{fit_begin}-#{fit_end}"] = text_similarity
49
+
46
50
  break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
47
51
  fit_begin, fit_end = nil, nil
48
52
  end
49
-
50
- return nil, nil if fit_begin >= fit_end
51
- return fit_begin, fit_end
53
+ return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
54
+ return nil, nil
52
55
  end
53
56
 
54
57
  private
@@ -8,7 +8,7 @@ module TextAlignment; end unless defined? TextAlignment
8
8
  # to assume that there is no bag representation to this method
9
9
 
10
10
  module TextAlignment
11
- TextAlignment::SIMILARITY_THRESHOLD = 0.8
11
+ TextAlignment::SIMILARITY_THRESHOLD = 0.7
12
12
  end
13
13
 
14
14
  class << TextAlignment
@@ -28,12 +28,102 @@ class << TextAlignment
28
28
  target.tr!(characters_from, characters_to)
29
29
  sources.each{|source| source[:text].tr!(characters_from, characters_to)}
30
30
 
31
+ # to process smaller ones first
31
32
  sources.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}
32
33
 
33
34
  TextAlignment._find_divisions(target, sources)
34
35
  end
35
36
 
36
- def _find_divisions(target, sources)
37
+ def _find_divisions(_target, _sources)
38
+ indice = []
39
+ history = []
40
+ cache = {}
41
+ target = _target.dup
42
+ sources = _sources.dup
43
+ until target.strip.empty? || sources.empty?
44
+ mode, cmp = nil, nil
45
+ candidates = []
46
+ sources.each_with_index do |source, i|
47
+ if target.size < source[:text].size
48
+ mode = :t_in_s
49
+ str1 = target
50
+ str2 = source[:text]
51
+ else
52
+ mode = :s_in_t
53
+ str1 = source[:text]
54
+ str2 = target
55
+ end
56
+
57
+ len1 = str1.length
58
+ len2 = str2.length
59
+
60
+ offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
61
+ approximate_fit(str1, str2)
62
+ else
63
+ # the whole target
64
+ [0, -1]
65
+ end
66
+
67
+ unless offset_begin.nil?
68
+ key = str1 + ' _:_ ' + str2[offset_begin .. offset_end]
69
+ cmp = if cache.has_key? key
70
+ cache[key]
71
+ else
72
+ cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
73
+ end
74
+ cache[key] = cmp
75
+
76
+ if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
77
+ candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
78
+ end
79
+ end
80
+ end
81
+
82
+ # return remaining target and sources if m.nil?
83
+ break if candidates.empty?
84
+
85
+ choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity}
86
+ m = choice[:idx]
87
+ mode = choice[:mode]
88
+
89
+ index = if mode == :t_in_s
90
+ {divid:sources[m][:divid], region:[0, target.size]}
91
+ else # :s_in_t
92
+ cmp = choice[:cmp]
93
+ offset = choice[:offset]
94
+ {divid:sources[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
95
+ end
96
+
97
+ target = target[0 ... index[:region][0]] + target[index[:region][1] .. -1]
98
+ history << index[:region].dup
99
+
100
+ before_begin = index[:region][0]
101
+ before_end = index[:region][1]
102
+
103
+ rhistory = history.reverse
104
+ rhistory.shift
105
+ rhistory.each do |h|
106
+ gap = h[1] - h[0]
107
+ index[:region][0] += gap if index[:region][0] >= h[0]
108
+ index[:region][1] += gap if index[:region][1] > h[0]
109
+ end
110
+
111
+ indice << index
112
+
113
+ sources.delete_at(m)
114
+ end
115
+
116
+ unless target.strip.empty? && sources.empty?
117
+ index = {divid:nil}
118
+ index[:remaining_target] = target unless target.strip.empty?
119
+ index[:remaining_sources] = sources.collect{|s| s[:divid]} unless sources.empty?
120
+ indice << index
121
+ end
122
+
123
+ indice
124
+ end
125
+
126
+ def _find_divisions_old(target, sources)
37
127
  mode, m, c, offset_begin = nil, nil, nil, nil
38
128
 
39
129
  sources.each_with_index do |source, i|
@@ -88,6 +178,7 @@ class << TextAlignment
88
178
  return [index] + more_index
89
179
  end
90
180
  end
181
+
91
182
  end
92
183
 
93
184
  if __FILE__ == $0
@@ -98,20 +189,29 @@ if __FILE__ == $0
98
189
 
99
190
  sources = JSON.parse File.read(ARGV[1]), :symbolize_names => true
100
191
  div_index = TextAlignment::find_divisions(target_text, sources)
192
+ pp div_index
101
193
 
102
194
  # str1 = File.read(ARGV[0]).strip
103
195
  # str2 = File.read(ARGV[1]).strip
104
196
  # div_index = TextAlignment::find_divisions(str1, [str2])
105
197
 
106
- puts "target length: #{target_text.length}"
107
- div_index.each do |i|
108
- if i[0] >= 0
109
- puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
110
- puts target_text[i[1][0] ... i[1][1]]
111
- puts "=========="
112
- else
113
- p i
114
- end
115
- end
198
+ # puts "target length: #{target_text.length}"
199
+ # div_index.each do |i|
200
+ # unless i[:divid].nil?
201
+ # puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
202
+ # puts target_text[i[:region][0] ... i[:region][1]]
203
+ # puts "=========="
204
+ # else
205
+ # p i
206
+ # end
207
+
208
+ # # if i[0] >= 0
209
+ # # puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
210
+ # # puts target_text[i[1][0] ... i[1][1]]
211
+ # # puts "=========="
212
+ # # else
213
+ # # p i
214
+ # # end
215
+ # end
116
216
  end
117
217
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.2.8'
2
+ VERSION = '0.2.9'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.8
4
+ version: 0.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-04 00:00:00.000000000 Z
11
+ date: 2020-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary