text_alignment 0.2.4 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -1
- data/Gemfile.lock +3 -1
- data/bin/align_annotations +24 -3
- data/lib/text_alignment/approximate_fit.rb +36 -19
- data/lib/text_alignment/find_divisions.rb +112 -12
- data/lib/text_alignment/lcs_cdiff.rb +1 -1
- data/lib/text_alignment/text_alignment.rb +3 -6
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +2 -1
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8c1c45ed630cdd60291606b59e1944f0b854a689cfa0d281ae8b8879bf01e806
|
4
|
+
data.tar.gz: 9b33688f3a08f9110f556b4357fbe598d42d1147bc06933323cae4df7187341f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 856a8fca63f80be4cea7f6beff85dcf475d9237d68bc96728bcfcc030397f414637d2a8e32b139a6fdbccfd8603d327738ff7f3f59d80dc9e61e55a11a04bf20
|
7
|
+
data.tar.gz: 6055f50827354461f194a50da74259e8d97473be9b085634df270964bb815d96d9f916bbad2ef3961ddd845291a80e13b1680f1c42bafeab156b0d59fa3ba952
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -15,6 +15,7 @@ GEM
|
|
15
15
|
rspec-support (~> 3.0.0)
|
16
16
|
rspec-support (3.0.4)
|
17
17
|
ruby-dictionary (1.1.1)
|
18
|
+
string-similarity (2.1.0)
|
18
19
|
|
19
20
|
PLATFORMS
|
20
21
|
ruby
|
@@ -23,9 +24,10 @@ DEPENDENCIES
|
|
23
24
|
diff-lcs (~> 1.3)
|
24
25
|
rspec (~> 3.0)
|
25
26
|
ruby-dictionary (~> 1.1, >= 1.1.1)
|
27
|
+
string-similarity (~> 2.1)
|
26
28
|
|
27
29
|
RUBY VERSION
|
28
|
-
ruby 2.
|
30
|
+
ruby 2.5.5p157
|
29
31
|
|
30
32
|
BUNDLED WITH
|
31
33
|
1.17.3
|
data/bin/align_annotations
CHANGED
@@ -17,14 +17,35 @@ str2 = anns2[:text]
|
|
17
17
|
|
18
18
|
denotations = anns1[:denotations]
|
19
19
|
|
20
|
+
puts "[Alignment1]====="
|
20
21
|
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
|
21
22
|
puts TextAlignment::sdiff2cdiff(align.sdiff)
|
22
|
-
puts
|
23
|
-
|
23
|
+
puts
|
24
|
+
puts "[Similarity]\n#{align.similarity}"
|
25
|
+
puts
|
26
|
+
puts '[Denotations original]'
|
27
|
+
pp denotations
|
28
|
+
puts
|
29
|
+
puts '[Denotations transformed]'
|
30
|
+
new_denotations = align.transform_hdenotations(denotations)
|
31
|
+
pp new_denotations
|
32
|
+
puts
|
33
|
+
puts "[Alignment2 (downcased)]====="
|
34
|
+
align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
|
35
|
+
puts TextAlignment::sdiff2cdiff(align.sdiff)
|
36
|
+
puts
|
37
|
+
puts "[Similarity]\n#{align.similarity}"
|
38
|
+
puts
|
39
|
+
puts '[Denotations original]'
|
24
40
|
pp denotations
|
25
|
-
puts
|
41
|
+
puts
|
42
|
+
puts '[Denotations transformed]'
|
26
43
|
new_denotations = align.transform_hdenotations(denotations)
|
27
44
|
pp new_denotations
|
45
|
+
puts
|
46
|
+
puts '[Annotations transformed]'
|
47
|
+
anns2[:denotations] = new_denotations
|
48
|
+
puts anns2.to_json
|
28
49
|
|
29
50
|
# p align.common_elements
|
30
51
|
# puts "---------------"
|
@@ -1,11 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'string-similarity'
|
3
|
+
|
2
4
|
module TextAlignment; end unless defined? TextAlignment
|
3
5
|
|
4
6
|
# approximate the location of str1 in str2
|
5
7
|
module TextAlignment
|
6
8
|
SIGNATURE_NGRAM = 5
|
7
9
|
MIN_LENGTH_FOR_APPROXIMATION = 50
|
8
|
-
BUFFER_RATE = 0.
|
10
|
+
BUFFER_RATE = 0.1
|
11
|
+
TEXT_SIMILARITY_TRESHOLD = 0.7
|
9
12
|
end
|
10
13
|
|
11
14
|
class << TextAlignment
|
@@ -22,29 +25,43 @@ class << TextAlignment
|
|
22
25
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
23
26
|
return nil, nil if ngram_shared.empty?
|
24
27
|
|
25
|
-
|
26
|
-
|
28
|
+
signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
|
29
|
+
return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
|
30
|
+
|
31
|
+
cache = {}
|
32
|
+
fit_begin, fit_end = nil, nil
|
33
|
+
signature_ngrams.each do |signature_ngram|
|
34
|
+
loc_signature_ngram_in_str1 = str1.index(signature_ngram)
|
35
|
+
loc_signature_ngram_in_str2 = str2.index(signature_ngram)
|
36
|
+
|
37
|
+
# approximate the beginning of the fit
|
38
|
+
fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
|
39
|
+
fit_begin = 0 if fit_begin < 0
|
27
40
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
41
|
+
# approximate the end of the fit
|
42
|
+
offset_end = str1.length - loc_signature_ngram_in_str1
|
43
|
+
fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
|
44
|
+
fit_end = str2.length if fit_end > str2.length
|
32
45
|
|
33
|
-
|
34
|
-
|
46
|
+
next if cache.has_key?("#{fit_begin}-#{fit_end}")
|
47
|
+
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
48
|
+
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
35
49
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
return nil, nil
|
41
|
-
|
42
|
-
|
43
|
-
|
50
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
51
|
+
fit_begin, fit_end = nil, nil
|
52
|
+
end
|
53
|
+
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
54
|
+
return nil, nil
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
44
58
|
|
45
|
-
|
46
|
-
|
59
|
+
# Scores how alike two strings are, ignoring whitespace.
#
# Spaces, tabs, carriage returns and newlines are deleted from both strings
# before the comparison, so differences in line wrapping or spacing do not
# lower the score. The score itself is delegated to
# String::Similarity.cosine from the string-similarity gem.
#
# str1        - first String to compare.
# str2        - second String to compare.
# ngram_order - n-gram size handed to String::Similarity.cosine. It used to be
#               accepted but ignored (2 was hard-coded); it is now honoured,
#               and defaults to 2 so that existing two-argument calls produce
#               exactly the same score as before.
#
# Returns whatever String::Similarity.cosine returns for the two stripped
# strings (presumably a Float between 0 and 1 - confirm against the gem docs).
def text_similarity(str1, str2, ngram_order = 2)
  whitespace = " \t\r\n"
  String::Similarity.cosine(str1.delete(whitespace), str2.delete(whitespace), ngram: ngram_order)
end
|
64
|
+
|
48
65
|
end
|
49
66
|
|
50
67
|
if __FILE__ == $0
|
@@ -8,7 +8,7 @@ module TextAlignment; end unless defined? TextAlignment
|
|
8
8
|
# to assume that there is no bag representation to this method
|
9
9
|
|
10
10
|
module TextAlignment
|
11
|
-
TextAlignment::SIMILARITY_THRESHOLD = 0.
|
11
|
+
TextAlignment::SIMILARITY_THRESHOLD = 0.7
|
12
12
|
end
|
13
13
|
|
14
14
|
class << TextAlignment
|
@@ -28,12 +28,102 @@ class << TextAlignment
|
|
28
28
|
target.tr!(characters_from, characters_to)
|
29
29
|
sources.each{|source| source[:text].tr!(characters_from, characters_to)}
|
30
30
|
|
31
|
+
# to process smaller ones first
|
31
32
|
sources.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}
|
32
33
|
|
33
34
|
TextAlignment._find_divisions(target, sources)
|
34
35
|
end
|
35
36
|
|
36
|
-
def _find_divisions(
|
37
|
+
# Finds, for each source text, the region of the target text it corresponds to.
#
# On every pass each remaining source is compared with what is left of the
# target. The shorter of the two strings is searched for inside the longer
# one: :s_in_t when the source fits in the target, :t_in_s when the target
# fits in the source. The candidate with the highest similarity is accepted,
# its region is cut out of the working target, and the source is dropped.
# The loop stops when nothing is left to match or no candidate passes
# TextAlignment::SIMILARITY_THRESHOLD.
#
# _target  - String to be divided. It is duplicated, not modified.
# _sources - Array of Hashes, each read through its :text and :divid keys.
#            The array is duplicated, so the caller's array keeps its elements.
#
# Returns an Array of Hashes of the form {divid:, region: [begin, end]},
# in the order the matches were accepted. Regions are expressed in the
# coordinates of the original target. If target text or sources remain
# unmatched, a final Hash {divid: nil} is appended, carrying
# :remaining_target and/or :remaining_sources (the divids of the leftovers).
def _find_divisions(_target, _sources)
  threshold = TextAlignment::SIMILARITY_THRESHOLD
  indice = []
  history = [] # regions already cut out of the working target, oldest first
  cache = {}   # comparisons already computed, keyed by the compared string pair
  target = _target.dup
  sources = _sources.dup

  until target.strip.empty? || sources.empty?
    candidates = []

    sources.each_with_index do |source, i|
      # str1 is always the shorter string, searched for inside str2.
      mode, str1, str2 =
        if target.size < source[:text].size
          [:t_in_s, target, source[:text]]
        else
          [:s_in_t, source[:text], target]
        end

      len1 = str1.length
      len2 = str2.length
      tolerance = len1 * (1 - threshold)

      # Narrow str2 down only when it is considerably longer than str1;
      # otherwise compare against the whole of str2.
      offset_begin, offset_end =
        if (len2 - len1) > tolerance
          approximate_fit(str1, str2)
        else
          [0, -1]
        end

      # approximate_fit yields nil offsets when it finds no plausible location.
      next if offset_begin.nil?

      str2_part = str2[offset_begin..offset_end]
      key = str1 + ' _:_ ' + str2_part
      cmp = (cache[key] ||= TextAlignment::LCSComparison.new(str1, str2_part))

      # The second test is only evaluated when the first one passes, so the
      # match positions are never read for a comparison that is too dissimilar.
      # NOTE(review): str1_match_initial/final look like inclusive positions of
      # the matched span in str1 - confirm against LCSComparison.
      if cmp.similarity > threshold &&
         (len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < tolerance
        candidates << {idx: i, offset: offset_begin, mode: mode, cmp: cmp}
      end
    end

    break if candidates.empty?

    # Pick the most similar candidate. The previous comparator compared each
    # candidate with itself, so the first candidate always won.
    choice = candidates.max_by { |candidate| candidate[:cmp].similarity }
    m = choice[:idx]

    region =
      if choice[:mode] == :t_in_s
        # The whole remaining target lies inside the source.
        [0, target.size]
      else # :s_in_t
        cmp = choice[:cmp]
        offset = choice[:offset]
        [cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]
      end
    index = {divid: sources[m][:divid], region: region}

    # Cut the matched region out of the working target.
    target = target[0...region[0]] + target[region[1]..-1]
    history << region.dup

    # The region was measured on the shrunken target. Walk the earlier cuts
    # from newest to oldest and shift the region back into the coordinates of
    # the original target.
    history[0...-1].reverse_each do |h|
      gap = h[1] - h[0]
      region[0] += gap if region[0] >= h[0]
      region[1] += gap if region[1] > h[0]
    end

    indice << index
    sources.delete_at(m)
  end

  # Report whatever could not be matched.
  unless target.strip.empty? && sources.empty?
    index = {divid: nil}
    index[:remaining_target] = target unless target.strip.empty?
    index[:remaining_sources] = sources.collect { |s| s[:divid] } unless sources.empty?
    indice << index
  end

  indice
end
|
125
|
+
|
126
|
+
def _find_divisions_old(target, sources)
|
37
127
|
mode, m, c, offset_begin = nil, nil, nil, nil
|
38
128
|
|
39
129
|
sources.each_with_index do |source, i|
|
@@ -88,6 +178,7 @@ class << TextAlignment
|
|
88
178
|
return [index] + more_index
|
89
179
|
end
|
90
180
|
end
|
181
|
+
|
91
182
|
end
|
92
183
|
|
93
184
|
if __FILE__ == $0
|
@@ -98,20 +189,29 @@ if __FILE__ == $0
|
|
98
189
|
|
99
190
|
sources = JSON.parse File.read(ARGV[1]), :symbolize_names => true
|
100
191
|
div_index = TextAlignment::find_divisions(target_text, sources)
|
192
|
+
pp div_index
|
101
193
|
|
102
194
|
# str1 = File.read(ARGV[0]).strip
|
103
195
|
# str2 = File.read(ARGV[1]).strip
|
104
196
|
# div_index = TextAlignment::find_divisions(str1, [str2])
|
105
197
|
|
106
|
-
puts "target length: #{target_text.length}"
|
107
|
-
div_index.each do |i|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
198
|
+
# puts "target length: #{target_text.length}"
|
199
|
+
# div_index.each do |i|
|
200
|
+
# unless i[:divid].nil?
|
201
|
+
# puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
|
202
|
+
# puts target_text[i[:region][0] ... i[:region][1]]
|
203
|
+
# puts "=========="
|
204
|
+
# else
|
205
|
+
# p i
|
206
|
+
# end
|
207
|
+
|
208
|
+
# # if i[0] >= 0
|
209
|
+
# # puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
|
210
|
+
# # puts target_text[i[1][0] ... i[1][1]]
|
211
|
+
# # puts "=========="
|
212
|
+
# # else
|
213
|
+
# # p i
|
214
|
+
# # end
|
215
|
+
# end
|
116
216
|
end
|
117
217
|
end
|
@@ -79,7 +79,7 @@ class TextAlignment::TextAlignment
|
|
79
79
|
end
|
80
80
|
|
81
81
|
def transform_a_span(span)
|
82
|
-
{:
|
82
|
+
{begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
|
83
83
|
end
|
84
84
|
|
85
85
|
def transform_spans(spans)
|
@@ -91,11 +91,8 @@ class TextAlignment::TextAlignment
|
|
91
91
|
end
|
92
92
|
|
93
93
|
def transform_hdenotations(hdenotations)
|
94
|
-
|
95
|
-
|
96
|
-
(0...hdenotations.length).each {|i| hdenotations_new[i][:span] = transform_a_span(hdenotations[i][:span])}
|
97
|
-
hdenotations_new
|
98
|
-
end
|
94
|
+
return nil if hdenotations.nil?
|
95
|
+
hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
|
99
96
|
end
|
100
97
|
|
101
98
|
private
|
data/text_alignment.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ['lib']
|
19
19
|
|
20
|
-
gem.
|
20
|
+
gem.add_runtime_dependency 'ruby-dictionary', '~>1.1', '>=1.1.1'
|
21
|
+
gem.add_runtime_dependency 'string-similarity', '~> 2.1'
|
21
22
|
gem.add_development_dependency 'rspec', '~>3.0'
|
22
23
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -20,7 +20,7 @@ dependencies:
|
|
20
20
|
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 1.1.1
|
23
|
-
type: :
|
23
|
+
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
@@ -30,6 +30,20 @@ dependencies:
|
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.1.1
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: string-similarity
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '2.1'
|
40
|
+
type: :runtime
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '2.1'
|
33
47
|
- !ruby/object:Gem::Dependency
|
34
48
|
name: rspec
|
35
49
|
requirement: !ruby/object:Gem::Requirement
|
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
112
|
- !ruby/object:Gem::Version
|
99
113
|
version: '0'
|
100
114
|
requirements: []
|
101
|
-
rubygems_version: 3.0.
|
115
|
+
rubygems_version: 3.0.8
|
102
116
|
signing_key:
|
103
117
|
specification_version: 4
|
104
118
|
summary: Ruby class for aligning two character strings
|