text_alignment 0.2.4 → 0.2.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c61a711930d19f5a72bd4a4128f5f36038a185eeb203b9ca68afce143694dbd3
4
- data.tar.gz: 173fa1ed0277f0384f0804e7a1ea02c3bd1a1eda20a5d3f57523028f815e69ac
3
+ metadata.gz: 8c1c45ed630cdd60291606b59e1944f0b854a689cfa0d281ae8b8879bf01e806
4
+ data.tar.gz: 9b33688f3a08f9110f556b4357fbe598d42d1147bc06933323cae4df7187341f
5
5
  SHA512:
6
- metadata.gz: 8f2a018145a07ec1d9a5c2277fd4d1e4bdb694726cf0936c92e8f25addb325e485545b7a1373a265e3b60fe2da0e39cdccbad586968aef093ed2154070242173
7
- data.tar.gz: 488390234e0e3b9d7d67389f303aed6e59bb0e16b1ad94908c8020be64b9c0bac9a9f404e42944fd37ece188377c930fd43596678a80ee24f3b00174c9e33aa4
6
+ metadata.gz: 856a8fca63f80be4cea7f6beff85dcf475d9237d68bc96728bcfcc030397f414637d2a8e32b139a6fdbccfd8603d327738ff7f3f59d80dc9e61e55a11a04bf20
7
+ data.tar.gz: 6055f50827354461f194a50da74259e8d97473be9b085634df270964bb815d96d9f916bbad2ef3961ddd845291a80e13b1680f1c42bafeab156b0d59fa3ba952
data/Gemfile CHANGED
@@ -1,8 +1,9 @@
1
1
  source 'https://rubygems.org'
2
- ruby '2.3.4'
2
+ ruby '2.5.5'
3
3
 
4
4
  gem 'diff-lcs', '~> 1.3'
5
5
  gem 'ruby-dictionary', '~>1.1', '>=1.1.1'
6
+ gem 'string-similarity', '~> 2.1'
6
7
 
7
8
  group :test do
8
9
  gem 'rspec', '~>3.0'
@@ -15,6 +15,7 @@ GEM
15
15
  rspec-support (~> 3.0.0)
16
16
  rspec-support (3.0.4)
17
17
  ruby-dictionary (1.1.1)
18
+ string-similarity (2.1.0)
18
19
 
19
20
  PLATFORMS
20
21
  ruby
@@ -23,9 +24,10 @@ DEPENDENCIES
23
24
  diff-lcs (~> 1.3)
24
25
  rspec (~> 3.0)
25
26
  ruby-dictionary (~> 1.1, >= 1.1.1)
27
+ string-similarity (~> 2.1)
26
28
 
27
29
  RUBY VERSION
28
- ruby 2.3.4p301
30
+ ruby 2.5.5p157
29
31
 
30
32
  BUNDLED WITH
31
33
  1.17.3
@@ -17,14 +17,35 @@ str2 = anns2[:text]
17
17
 
18
18
  denotations = anns1[:denotations]
19
19
 
20
+ puts "[Alignment1]====="
20
21
  align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
21
22
  puts TextAlignment::sdiff2cdiff(align.sdiff)
22
- puts "\n=====\n\n"
23
-
23
+ puts
24
+ puts "[Similarity]\n#{align.similarity}"
25
+ puts
26
+ puts '[Denotations original]'
27
+ pp denotations
28
+ puts
29
+ puts '[Denotations transformed]'
30
+ new_denotations = align.transform_hdenotations(denotations)
31
+ pp new_denotations
32
+ puts
33
+ puts "[Alignment2 (downcased)]====="
34
+ align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
35
+ puts TextAlignment::sdiff2cdiff(align.sdiff)
36
+ puts
37
+ puts "[Similarity]\n#{align.similarity}"
38
+ puts
39
+ puts '[Denotations original]'
24
40
  pp denotations
25
- puts "-----"
41
+ puts
42
+ puts '[Denotations transformed]'
26
43
  new_denotations = align.transform_hdenotations(denotations)
27
44
  pp new_denotations
45
+ puts
46
+ puts '[Annotations transformed]'
47
+ anns2[:denotations] = new_denotations
48
+ puts anns2.to_json
28
49
 
29
50
  # p align.common_elements
30
51
  # puts "---------------"
@@ -1,11 +1,14 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'string-similarity'
3
+
2
4
  module TextAlignment; end unless defined? TextAlignment
3
5
 
4
6
  # approximate the location of str1 in str2
5
7
  module TextAlignment
6
8
  SIGNATURE_NGRAM = 5
7
9
  MIN_LENGTH_FOR_APPROXIMATION = 50
8
- BUFFER_RATE = 0.2
10
+ BUFFER_RATE = 0.1
11
+ TEXT_SIMILARITY_TRESHOLD = 0.7
9
12
  end
10
13
 
11
14
  class << TextAlignment
@@ -22,29 +25,43 @@ class << TextAlignment
22
25
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
23
26
  return nil, nil if ngram_shared.empty?
24
27
 
25
- # approximate the beginning of the fit
26
- signature_ngram = ngram_shared.detect{|g| ngram2.count(g) == 1}
28
+ signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
29
+ return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
30
+
31
+ cache = {}
32
+ fit_begin, fit_end = nil, nil
33
+ signature_ngrams.each do |signature_ngram|
34
+ loc_signature_ngram_in_str1 = str1.index(signature_ngram)
35
+ loc_signature_ngram_in_str2 = str2.index(signature_ngram)
36
+
37
+ # approximate the beginning of the fit
38
+ fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
39
+ fit_begin = 0 if fit_begin < 0
27
40
 
28
- return nil, nil if signature_ngram.nil? #raise "no signature ngram"
29
- offset = str1.index(signature_ngram)
30
- fit_begin = str2.index(signature_ngram) - offset - (offset * TextAlignment::BUFFER_RATE).to_i
31
- fit_begin = 0 if fit_begin < 0
41
+ # approximate the end of the fit
42
+ offset_end = str1.length - loc_signature_ngram_in_str1
43
+ fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
44
+ fit_end = str2.length if fit_end > str2.length
32
45
 
33
- # to change the order according to ngram2
34
- ngram_shared = ngram2 & ngram1
46
+ next if cache.has_key?("#{fit_begin}-#{fit_end}")
47
+ text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
48
+ cache["#{fit_begin}-#{fit_end}"] = text_similarity
35
49
 
36
- # approximate the end of the fit
37
- ngram_shared_reverse = ngram_shared.reverse
38
- ngram2_reverse = ngram2.reverse
39
- signature_ngram = ngram_shared_reverse.detect{|g| ngram2_reverse.count(g) == 1}
40
- return nil, nil if signature_ngram.nil? # raise "no signature ngram"
41
- offset = str1.length - str1.rindex(signature_ngram)
42
- fit_end = str2.rindex(signature_ngram) + offset + (offset * TextAlignment::BUFFER_RATE).to_i
43
- fit_end = str2.length if fit_end > str2.length
50
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
51
+ fit_begin, fit_end = nil, nil
52
+ end
53
+ return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
54
+ return nil, nil
55
+ end
56
+
57
+ private
44
58
 
45
- return nil, nil if fit_begin >= fit_end
46
- return fit_begin, fit_end
59
+ def text_similarity(str1, str2, ngram_order = 3)
60
+ _str1 = str1.delete(" \t\r\n")
61
+ _str2 = str2.delete(" \t\r\n")
62
+ String::Similarity.cosine(_str1, _str2, ngram:2)
47
63
  end
64
+
48
65
  end
49
66
 
50
67
  if __FILE__ == $0
@@ -8,7 +8,7 @@ module TextAlignment; end unless defined? TextAlignment
8
8
  # to assume that there is no bag representation to this method
9
9
 
10
10
  module TextAlignment
11
- TextAlignment::SIMILARITY_THRESHOLD = 0.8
11
+ TextAlignment::SIMILARITY_THRESHOLD = 0.7
12
12
  end
13
13
 
14
14
  class << TextAlignment
@@ -28,12 +28,102 @@ class << TextAlignment
28
28
  target.tr!(characters_from, characters_to)
29
29
  sources.each{|source| source[:text].tr!(characters_from, characters_to)}
30
30
 
31
+ # to process smaller ones first
31
32
  sources.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}
32
33
 
33
34
  TextAlignment._find_divisions(target, sources)
34
35
  end
35
36
 
36
- def _find_divisions(target, sources)
37
+ def _find_divisions(_target, _sources)
38
+ indice = []
39
+ history = []
40
+ cache = {}
41
+ target = _target.dup
42
+ sources = _sources.dup
43
+ until target.strip.empty? || sources.empty?
44
+ mode, cmp = nil, nil
45
+ candidates = []
46
+ sources.each_with_index do |source, i|
47
+ if target.size < source[:text].size
48
+ mode = :t_in_s
49
+ str1 = target
50
+ str2 = source[:text]
51
+ else
52
+ mode = :s_in_t
53
+ str1 = source[:text]
54
+ str2 = target
55
+ end
56
+
57
+ len1 = str1.length
58
+ len2 = str2.length
59
+
60
+ offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
61
+ approximate_fit(str1, str2)
62
+ else
63
+ # the whole target
64
+ [0, -1]
65
+ end
66
+
67
+ unless offset_begin.nil?
68
+ key = str1 + ' _:_ ' + str2[offset_begin .. offset_end]
69
+ cmp = if cache.has_key? key
70
+ cache[key]
71
+ else
72
+ cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
73
+ end
74
+ cache[key] = cmp
75
+
76
+ if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
77
+ candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
78
+ end
79
+ end
80
+ end
81
+
82
+ # return remaining target and sources if m.nil?
83
+ break if candidates.empty?
84
+
85
+ choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity}
86
+ m = choice[:idx]
87
+ mode = choice[:mode]
88
+
89
+ index = if mode == :t_in_s
90
+ {divid:sources[m][:divid], region:[0, target.size]}
91
+ else # :s_in_t
92
+ cmp = choice[:cmp]
93
+ offset = choice[:offset]
94
+ {divid:sources[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
95
+ end
96
+
97
+ target = target[0 ... index[:region][0]] + target[index[:region][1] .. -1]
98
+ history << index[:region].dup
99
+
100
+ before_begin = index[:region][0]
101
+ before_end = index[:region][1]
102
+
103
+ rhistory = history.reverse
104
+ rhistory.shift
105
+ rhistory.each do |h|
106
+ gap = h[1] - h[0]
107
+ index[:region][0] += gap if index[:region][0] >= h[0]
108
+ index[:region][1] += gap if index[:region][1] > h[0]
109
+ end
110
+
111
+ indice << index
112
+
113
+ sources.delete_at(m)
114
+ end
115
+
116
+ unless target.strip.empty? && sources.empty?
117
+ index = {divid:nil}
118
+ index[:remaining_target] = target unless target.strip.empty?
119
+ index[:remaining_sources] = sources.collect{|s| s[:divid]} unless sources.empty?
120
+ indice << index
121
+ end
122
+
123
+ indice
124
+ end
125
+
126
+ def _find_divisions_old(target, sources)
37
127
  mode, m, c, offset_begin = nil, nil, nil, nil
38
128
 
39
129
  sources.each_with_index do |source, i|
@@ -88,6 +178,7 @@ class << TextAlignment
88
178
  return [index] + more_index
89
179
  end
90
180
  end
181
+
91
182
  end
92
183
 
93
184
  if __FILE__ == $0
@@ -98,20 +189,29 @@ if __FILE__ == $0
98
189
 
99
190
  sources = JSON.parse File.read(ARGV[1]), :symbolize_names => true
100
191
  div_index = TextAlignment::find_divisions(target_text, sources)
192
+ pp div_index
101
193
 
102
194
  # str1 = File.read(ARGV[0]).strip
103
195
  # str2 = File.read(ARGV[1]).strip
104
196
  # div_index = TextAlignment::find_divisions(str1, [str2])
105
197
 
106
- puts "target length: #{target_text.length}"
107
- div_index.each do |i|
108
- if i[0] >= 0
109
- puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
110
- puts target_text[i[1][0] ... i[1][1]]
111
- puts "=========="
112
- else
113
- p i
114
- end
115
- end
198
+ # puts "target length: #{target_text.length}"
199
+ # div_index.each do |i|
200
+ # unless i[:divid].nil?
201
+ # puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
202
+ # puts target_text[i[:region][0] ... i[:region][1]]
203
+ # puts "=========="
204
+ # else
205
+ # p i
206
+ # end
207
+
208
+ # # if i[0] >= 0
209
+ # # puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
210
+ # # puts target_text[i[1][0] ... i[1][1]]
211
+ # # puts "=========="
212
+ # # else
213
+ # # p i
214
+ # # end
215
+ # end
116
216
  end
117
217
  end
@@ -37,7 +37,7 @@ class << TextAlignment
37
37
  end
38
38
  end
39
39
 
40
- cdiff_str1.gsub(/\n/, ' ') + "\n" + cdiff_str2.gsub(/\n/, ' ')
40
+ cdiff_str1.gsub(/\n/, ' ') + "\n>>>>><<<<<\n" + cdiff_str2.gsub(/\n/, ' ')
41
41
  end
42
42
 
43
43
  end
@@ -79,7 +79,7 @@ class TextAlignment::TextAlignment
79
79
  end
80
80
 
81
81
  def transform_a_span(span)
82
- {:begin=>@position_map_begin[span[:begin]], :end=>@position_map_end[span[:end]]}
82
+ {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
83
83
  end
84
84
 
85
85
  def transform_spans(spans)
@@ -91,11 +91,8 @@ class TextAlignment::TextAlignment
91
91
  end
92
92
 
93
93
  def transform_hdenotations(hdenotations)
94
- unless hdenotations.nil?
95
- hdenotations_new = Array.new(hdenotations)
96
- (0...hdenotations.length).each {|i| hdenotations_new[i][:span] = transform_a_span(hdenotations[i][:span])}
97
- hdenotations_new
98
- end
94
+ return nil if hdenotations.nil?
95
+ hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
99
96
  end
100
97
 
101
98
  private
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.2.4'
2
+ VERSION = '0.2.9'
3
3
  end
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ['lib']
19
19
 
20
- gem.add_development_dependency 'ruby-dictionary', '~>1.1', '>=1.1.1'
20
+ gem.add_runtime_dependency 'ruby-dictionary', '~>1.1', '>=1.1.1'
21
+ gem.add_runtime_dependency 'string-similarity', '~> 2.1'
21
22
  gem.add_development_dependency 'rspec', '~>3.0'
22
23
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-06-11 00:00:00.000000000 Z
11
+ date: 2020-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -20,7 +20,7 @@ dependencies:
20
20
  - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.1.1
23
- type: :development
23
+ type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
@@ -30,6 +30,20 @@ dependencies:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.1.1
33
+ - !ruby/object:Gem::Dependency
34
+ name: string-similarity
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.1'
40
+ type: :runtime
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.1'
33
47
  - !ruby/object:Gem::Dependency
34
48
  name: rspec
35
49
  requirement: !ruby/object:Gem::Requirement
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
112
  - !ruby/object:Gem::Version
99
113
  version: '0'
100
114
  requirements: []
101
- rubygems_version: 3.0.3
115
+ rubygems_version: 3.0.8
102
116
  signing_key:
103
117
  specification_version: 4
104
118
  summary: Ruby class for aligning two character strings