text_alignment 0.2.4 → 0.2.9

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c61a711930d19f5a72bd4a4128f5f36038a185eeb203b9ca68afce143694dbd3
4
- data.tar.gz: 173fa1ed0277f0384f0804e7a1ea02c3bd1a1eda20a5d3f57523028f815e69ac
3
+ metadata.gz: 8c1c45ed630cdd60291606b59e1944f0b854a689cfa0d281ae8b8879bf01e806
4
+ data.tar.gz: 9b33688f3a08f9110f556b4357fbe598d42d1147bc06933323cae4df7187341f
5
5
  SHA512:
6
- metadata.gz: 8f2a018145a07ec1d9a5c2277fd4d1e4bdb694726cf0936c92e8f25addb325e485545b7a1373a265e3b60fe2da0e39cdccbad586968aef093ed2154070242173
7
- data.tar.gz: 488390234e0e3b9d7d67389f303aed6e59bb0e16b1ad94908c8020be64b9c0bac9a9f404e42944fd37ece188377c930fd43596678a80ee24f3b00174c9e33aa4
6
+ metadata.gz: 856a8fca63f80be4cea7f6beff85dcf475d9237d68bc96728bcfcc030397f414637d2a8e32b139a6fdbccfd8603d327738ff7f3f59d80dc9e61e55a11a04bf20
7
+ data.tar.gz: 6055f50827354461f194a50da74259e8d97473be9b085634df270964bb815d96d9f916bbad2ef3961ddd845291a80e13b1680f1c42bafeab156b0d59fa3ba952
data/Gemfile CHANGED
@@ -1,8 +1,9 @@
1
1
  source 'https://rubygems.org'
2
- ruby '2.3.4'
2
+ ruby '2.5.5'
3
3
 
4
4
  gem 'diff-lcs', '~> 1.3'
5
5
  gem 'ruby-dictionary', '~>1.1', '>=1.1.1'
6
+ gem 'string-similarity', '~> 2.1'
6
7
 
7
8
  group :test do
8
9
  gem 'rspec', '~>3.0'
@@ -15,6 +15,7 @@ GEM
15
15
  rspec-support (~> 3.0.0)
16
16
  rspec-support (3.0.4)
17
17
  ruby-dictionary (1.1.1)
18
+ string-similarity (2.1.0)
18
19
 
19
20
  PLATFORMS
20
21
  ruby
@@ -23,9 +24,10 @@ DEPENDENCIES
23
24
  diff-lcs (~> 1.3)
24
25
  rspec (~> 3.0)
25
26
  ruby-dictionary (~> 1.1, >= 1.1.1)
27
+ string-similarity (~> 2.1)
26
28
 
27
29
  RUBY VERSION
28
- ruby 2.3.4p301
30
+ ruby 2.5.5p157
29
31
 
30
32
  BUNDLED WITH
31
33
  1.17.3
@@ -17,14 +17,35 @@ str2 = anns2[:text]
17
17
 
18
18
  denotations = anns1[:denotations]
19
19
 
20
+ puts "[Alignment1]====="
20
21
  align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
21
22
  puts TextAlignment::sdiff2cdiff(align.sdiff)
22
- puts "\n=====\n\n"
23
-
23
+ puts
24
+ puts "[Similarity]\n#{align.similarity}"
25
+ puts
26
+ puts '[Denotations original]'
27
+ pp denotations
28
+ puts
29
+ puts '[Denotations transformed]'
30
+ new_denotations = align.transform_hdenotations(denotations)
31
+ pp new_denotations
32
+ puts
33
+ puts "[Alignment2 (downcased)]====="
34
+ align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
35
+ puts TextAlignment::sdiff2cdiff(align.sdiff)
36
+ puts
37
+ puts "[Similarity]\n#{align.similarity}"
38
+ puts
39
+ puts '[Denotations original]'
24
40
  pp denotations
25
- puts "-----"
41
+ puts
42
+ puts '[Denotations transformed]'
26
43
  new_denotations = align.transform_hdenotations(denotations)
27
44
  pp new_denotations
45
+ puts
46
+ puts '[Annotations transformed]'
47
+ anns2[:denotations] = new_denotations
48
+ puts anns2.to_json
28
49
 
29
50
  # p align.common_elements
30
51
  # puts "---------------"
@@ -1,11 +1,14 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'string-similarity'
3
+
2
4
  module TextAlignment; end unless defined? TextAlignment
3
5
 
4
6
  # approximate the location of str1 in str2
5
7
  module TextAlignment
6
8
  SIGNATURE_NGRAM = 5
7
9
  MIN_LENGTH_FOR_APPROXIMATION = 50
8
- BUFFER_RATE = 0.2
10
+ BUFFER_RATE = 0.1
11
+ TEXT_SIMILARITY_TRESHOLD = 0.7
9
12
  end
10
13
 
11
14
  class << TextAlignment
@@ -22,29 +25,43 @@ class << TextAlignment
22
25
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
23
26
  return nil, nil if ngram_shared.empty?
24
27
 
25
- # approximate the beginning of the fit
26
- signature_ngram = ngram_shared.detect{|g| ngram2.count(g) == 1}
28
+ signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
29
+ return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
30
+
31
+ cache = {}
32
+ fit_begin, fit_end = nil, nil
33
+ signature_ngrams.each do |signature_ngram|
34
+ loc_signature_ngram_in_str1 = str1.index(signature_ngram)
35
+ loc_signature_ngram_in_str2 = str2.index(signature_ngram)
36
+
37
+ # approximate the beginning of the fit
38
+ fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
39
+ fit_begin = 0 if fit_begin < 0
27
40
 
28
- return nil, nil if signature_ngram.nil? #raise "no signature ngram"
29
- offset = str1.index(signature_ngram)
30
- fit_begin = str2.index(signature_ngram) - offset - (offset * TextAlignment::BUFFER_RATE).to_i
31
- fit_begin = 0 if fit_begin < 0
41
+ # approximate the end of the fit
42
+ offset_end = str1.length - loc_signature_ngram_in_str1
43
+ fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
44
+ fit_end = str2.length if fit_end > str2.length
32
45
 
33
- # to change the order according to ngram2
34
- ngram_shared = ngram2 & ngram1
46
+ next if cache.has_key?("#{fit_begin}-#{fit_end}")
47
+ text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
48
+ cache["#{fit_begin}-#{fit_end}"] = text_similarity
35
49
 
36
- # approximate the end of the fit
37
- ngram_shared_reverse = ngram_shared.reverse
38
- ngram2_reverse = ngram2.reverse
39
- signature_ngram = ngram_shared_reverse.detect{|g| ngram2_reverse.count(g) == 1}
40
- return nil, nil if signature_ngram.nil? # raise "no signature ngram"
41
- offset = str1.length - str1.rindex(signature_ngram)
42
- fit_end = str2.rindex(signature_ngram) + offset + (offset * TextAlignment::BUFFER_RATE).to_i
43
- fit_end = str2.length if fit_end > str2.length
50
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
51
+ fit_begin, fit_end = nil, nil
52
+ end
53
+ return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
54
+ return nil, nil
55
+ end
56
+
57
+ private
44
58
 
45
- return nil, nil if fit_begin >= fit_end
46
- return fit_begin, fit_end
59
+ def text_similarity(str1, str2, ngram_order = 3)
60
+ _str1 = str1.delete(" \t\r\n")
61
+ _str2 = str2.delete(" \t\r\n")
62
+ String::Similarity.cosine(_str1, _str2, ngram:2)
47
63
  end
64
+
48
65
  end
49
66
 
50
67
  if __FILE__ == $0
@@ -8,7 +8,7 @@ module TextAlignment; end unless defined? TextAlignment
8
8
  # to assume that there is no bag representation to this method
9
9
 
10
10
  module TextAlignment
11
- TextAlignment::SIMILARITY_THRESHOLD = 0.8
11
+ TextAlignment::SIMILARITY_THRESHOLD = 0.7
12
12
  end
13
13
 
14
14
  class << TextAlignment
@@ -28,12 +28,102 @@ class << TextAlignment
28
28
  target.tr!(characters_from, characters_to)
29
29
  sources.each{|source| source[:text].tr!(characters_from, characters_to)}
30
30
 
31
+ # to process smaller ones first
31
32
  sources.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}
32
33
 
33
34
  TextAlignment._find_divisions(target, sources)
34
35
  end
35
36
 
36
- def _find_divisions(target, sources)
37
+ def _find_divisions(_target, _sources)
38
+ indice = []
39
+ history = []
40
+ cache = {}
41
+ target = _target.dup
42
+ sources = _sources.dup
43
+ until target.strip.empty? || sources.empty?
44
+ mode, cmp = nil, nil
45
+ candidates = []
46
+ sources.each_with_index do |source, i|
47
+ if target.size < source[:text].size
48
+ mode = :t_in_s
49
+ str1 = target
50
+ str2 = source[:text]
51
+ else
52
+ mode = :s_in_t
53
+ str1 = source[:text]
54
+ str2 = target
55
+ end
56
+
57
+ len1 = str1.length
58
+ len2 = str2.length
59
+
60
+ offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
61
+ approximate_fit(str1, str2)
62
+ else
63
+ # the whole target
64
+ [0, -1]
65
+ end
66
+
67
+ unless offset_begin.nil?
68
+ key = str1 + ' _:_ ' + str2[offset_begin .. offset_end]
69
+ cmp = if cache.has_key? key
70
+ cache[key]
71
+ else
72
+ cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
73
+ end
74
+ cache[key] = cmp
75
+
76
+ if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
77
+ candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
78
+ end
79
+ end
80
+ end
81
+
82
+ # return remaining target and sources if m.nil?
83
+ break if candidates.empty?
84
+
85
+ choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity}
86
+ m = choice[:idx]
87
+ mode = choice[:mode]
88
+
89
+ index = if mode == :t_in_s
90
+ {divid:sources[m][:divid], region:[0, target.size]}
91
+ else # :s_in_t
92
+ cmp = choice[:cmp]
93
+ offset = choice[:offset]
94
+ {divid:sources[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
95
+ end
96
+
97
+ target = target[0 ... index[:region][0]] + target[index[:region][1] .. -1]
98
+ history << index[:region].dup
99
+
100
+ before_begin = index[:region][0]
101
+ before_end = index[:region][1]
102
+
103
+ rhistory = history.reverse
104
+ rhistory.shift
105
+ rhistory.each do |h|
106
+ gap = h[1] - h[0]
107
+ index[:region][0] += gap if index[:region][0] >= h[0]
108
+ index[:region][1] += gap if index[:region][1] > h[0]
109
+ end
110
+
111
+ indice << index
112
+
113
+ sources.delete_at(m)
114
+ end
115
+
116
+ unless target.strip.empty? && sources.empty?
117
+ index = {divid:nil}
118
+ index[:remaining_target] = target unless target.strip.empty?
119
+ index[:remaining_sources] = sources.collect{|s| s[:divid]} unless sources.empty?
120
+ indice << index
121
+ end
122
+
123
+ indice
124
+ end
125
+
126
+ def _find_divisions_old(target, sources)
37
127
  mode, m, c, offset_begin = nil, nil, nil, nil
38
128
 
39
129
  sources.each_with_index do |source, i|
@@ -88,6 +178,7 @@ class << TextAlignment
88
178
  return [index] + more_index
89
179
  end
90
180
  end
181
+
91
182
  end
92
183
 
93
184
  if __FILE__ == $0
@@ -98,20 +189,29 @@ if __FILE__ == $0
98
189
 
99
190
  sources = JSON.parse File.read(ARGV[1]), :symbolize_names => true
100
191
  div_index = TextAlignment::find_divisions(target_text, sources)
192
+ pp div_index
101
193
 
102
194
  # str1 = File.read(ARGV[0]).strip
103
195
  # str2 = File.read(ARGV[1]).strip
104
196
  # div_index = TextAlignment::find_divisions(str1, [str2])
105
197
 
106
- puts "target length: #{target_text.length}"
107
- div_index.each do |i|
108
- if i[0] >= 0
109
- puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
110
- puts target_text[i[1][0] ... i[1][1]]
111
- puts "=========="
112
- else
113
- p i
114
- end
115
- end
198
+ # puts "target length: #{target_text.length}"
199
+ # div_index.each do |i|
200
+ # unless i[:divid].nil?
201
+ # puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
202
+ # puts target_text[i[:region][0] ... i[:region][1]]
203
+ # puts "=========="
204
+ # else
205
+ # p i
206
+ # end
207
+
208
+ # # if i[0] >= 0
209
+ # # puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
210
+ # # puts target_text[i[1][0] ... i[1][1]]
211
+ # # puts "=========="
212
+ # # else
213
+ # # p i
214
+ # # end
215
+ # end
116
216
  end
117
217
  end
@@ -37,7 +37,7 @@ class << TextAlignment
37
37
  end
38
38
  end
39
39
 
40
- cdiff_str1.gsub(/\n/, ' ') + "\n" + cdiff_str2.gsub(/\n/, ' ')
40
+ cdiff_str1.gsub(/\n/, ' ') + "\n>>>>><<<<<\n" + cdiff_str2.gsub(/\n/, ' ')
41
41
  end
42
42
 
43
43
  end
@@ -79,7 +79,7 @@ class TextAlignment::TextAlignment
79
79
  end
80
80
 
81
81
  def transform_a_span(span)
82
- {:begin=>@position_map_begin[span[:begin]], :end=>@position_map_end[span[:end]]}
82
+ {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
83
83
  end
84
84
 
85
85
  def transform_spans(spans)
@@ -91,11 +91,8 @@ class TextAlignment::TextAlignment
91
91
  end
92
92
 
93
93
  def transform_hdenotations(hdenotations)
94
- unless hdenotations.nil?
95
- hdenotations_new = Array.new(hdenotations)
96
- (0...hdenotations.length).each {|i| hdenotations_new[i][:span] = transform_a_span(hdenotations[i][:span])}
97
- hdenotations_new
98
- end
94
+ return nil if hdenotations.nil?
95
+ hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
99
96
  end
100
97
 
101
98
  private
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.2.4'
2
+ VERSION = '0.2.9'
3
3
  end
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ['lib']
19
19
 
20
- gem.add_development_dependency 'ruby-dictionary', '~>1.1', '>=1.1.1'
20
+ gem.add_runtime_dependency 'ruby-dictionary', '~>1.1', '>=1.1.1'
21
+ gem.add_runtime_dependency 'string-similarity', '~> 2.1'
21
22
  gem.add_development_dependency 'rspec', '~>3.0'
22
23
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-06-11 00:00:00.000000000 Z
11
+ date: 2020-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -20,7 +20,7 @@ dependencies:
20
20
  - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.1.1
23
- type: :development
23
+ type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
@@ -30,6 +30,20 @@ dependencies:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.1.1
33
+ - !ruby/object:Gem::Dependency
34
+ name: string-similarity
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.1'
40
+ type: :runtime
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.1'
33
47
  - !ruby/object:Gem::Dependency
34
48
  name: rspec
35
49
  requirement: !ruby/object:Gem::Requirement
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
112
  - !ruby/object:Gem::Version
99
113
  version: '0'
100
114
  requirements: []
101
- rubygems_version: 3.0.3
115
+ rubygems_version: 3.0.8
102
116
  signing_key:
103
117
  specification_version: 4
104
118
  summary: Ruby class for aligning two character strings