text_alignment 0.2.9 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,211 +7,209 @@ module TextAlignment; end unless defined? TextAlignment
7
7
  # to work on the hash representation of denotations
8
8
  # to assume that there is no bag representation to this method
9
9
 
10
- module TextAlignment
11
- TextAlignment::SIMILARITY_THRESHOLD = 0.7
12
- end
10
+ TextAlignment::SIMILARITY_THRESHOLD = 0.7 unless defined? TextAlignment::SIMILARITY_THRESHOLD
13
11
 
14
12
  class << TextAlignment
15
13
 
16
- # It finds, among the sources, the right divisions for the taraget text to fit in.
17
- def find_divisions(target, sources, mappings = [])
18
- raise ArgumentError, "nil target" if target == nil
19
- raise ArgumentError, "nil or empty sources" if sources == nil || sources.empty?
20
- raise ArgumentError, "nil mappings" if mappings == nil
21
-
22
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
23
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
24
- characters_from = character_mappings.collect{|m| m[0]}.join
25
- characters_to = character_mappings.collect{|m| m[1]}.join
26
- characters_to.gsub!(/-/, '\-')
27
-
28
- target.tr!(characters_from, characters_to)
29
- sources.each{|source| source[:text].tr!(characters_from, characters_to)}
30
-
31
- # to process smaller ones first
32
- sources.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}
33
-
34
- TextAlignment._find_divisions(target, sources)
35
- end
36
-
37
- def _find_divisions(_target, _sources)
38
- indice = []
39
- history = []
40
- cache = {}
41
- target = _target.dup
42
- sources = _sources.dup
43
- until target.strip.empty? || sources.empty?
44
- mode, cmp = nil, nil
45
- candidates = []
46
- sources.each_with_index do |source, i|
47
- if target.size < source[:text].size
48
- mode = :t_in_s
49
- str1 = target
50
- str2 = source[:text]
51
- else
52
- mode = :s_in_t
53
- str1 = source[:text]
54
- str2 = target
55
- end
56
-
57
- len1 = str1.length
58
- len2 = str2.length
59
-
60
- offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
61
- approximate_fit(str1, str2)
62
- else
63
- # the whole target
64
- [0, -1]
65
- end
66
-
67
- unless offset_begin.nil?
68
- key = str1 + ' _:_ ' + str2[offset_begin .. offset_end]
69
- cmp = if cache.has_key? key
70
- cache[key]
71
- else
72
- cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
73
- end
74
- cache[key] = cmp
75
-
76
- if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
77
- candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
78
- end
79
- end
80
- end
81
-
82
- # return remaining target and sources if m.nil?
83
- break if candidates.empty?
84
-
85
- choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity}
86
- m = choice[:idx]
87
- mode = choice[:mode]
88
-
89
- index = if mode == :t_in_s
90
- {divid:sources[m][:divid], region:[0, target.size]}
91
- else # :s_in_t
92
- cmp = choice[:cmp]
93
- offset = choice[:offset]
94
- {divid:sources[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
95
- end
96
-
97
- target = target[0 ... index[:region][0]] + target[index[:region][1] .. -1]
98
- history << index[:region].dup
99
-
100
- before_begin = index[:region][0]
101
- before_end = index[:region][1]
102
-
103
- rhistory = history.reverse
104
- rhistory.shift
105
- rhistory.each do |h|
106
- gap = h[1] - h[0]
107
- index[:region][0] += gap if index[:region][0] >= h[0]
108
- index[:region][1] += gap if index[:region][1] > h[0]
109
- end
110
-
111
- indice << index
112
-
113
- sources.delete_at(m)
114
- end
115
-
116
- unless target.strip.empty? && sources.empty?
117
- index = {divid:nil}
118
- index[:remaining_target] = target unless target.strip.empty?
119
- index[:remaining_sources] = sources.collect{|s| s[:divid]} unless sources.empty?
120
- indice << index
121
- end
122
-
123
- indice
124
- end
125
-
126
- def _find_divisions_old(target, sources)
127
- mode, m, c, offset_begin = nil, nil, nil, nil
128
-
129
- sources.each_with_index do |source, i|
130
- if target.size < source[:text].size
131
- mode = :t_in_s
132
- str1 = target
133
- str2 = source[:text]
134
- else
135
- mode = :s_in_t
136
- str1 = source[:text]
137
- str2 = target
138
- end
139
-
140
- len1 = str1.length
141
- len2 = str2.length
142
-
143
- offset_begin, offset_end = 0, -1
144
- offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
145
-
146
- unless offset_begin.nil?
147
- c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
148
- if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
149
- m = i
150
- break
151
- end
152
- end
153
- end
154
-
155
- # return remaining target and sources if m.nil?
156
- return [[-1, [target, sources.collect{|s| s[:divid]}]]] if m.nil?
157
-
158
- index = if mode == :t_in_s
159
- [sources[m][:divid], [0, target.size]]
160
- else # :s_in_t
161
- [sources[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
162
- end
163
-
164
- next_target = target[0 ... index[1][0]] + target[index[1][1] .. -1]
165
- sources.delete_at(m)
166
-
167
- if next_target.strip.empty? || sources.empty?
168
- return [index]
169
- else
170
- more_index = _find_divisions(next_target, sources)
171
- gap = index[1][1] - index[1][0]
172
- more_index.each do |i|
173
- if (i[0] > -1)
174
- i[1][0] += gap if i[1][0] >= index[1][0]
175
- i[1][1] += gap if i[1][1] > index[1][0]
176
- end
177
- end
178
- return [index] + more_index
179
- end
180
- end
14
# Finds, among the targets, the right divisions for the source text to fit in.
#
# source   - the text (String) to be divided. NOTE: it is normalized in place
#            (String#tr!) with the single-character mappings.
# targets  - a non-empty Array of Hashes, each with a :text (String) and a
#            :divid. NOTE: each :text is normalized in place and the array
#            itself is sorted in place by text size.
# mappings - an Array of [from, to] String pairs. Pairs in which both sides
#            are one character long are applied as a character translation and
#            are REMOVED from the given array (callers keep the remaining,
#            multi-character mappings).
#
# Returns the result of TextAlignment._find_divisions(source, targets).
# Raises ArgumentError when source or mappings is nil, or targets is nil/empty.
def find_divisions(source, targets, mappings = [])
  raise ArgumentError, "nil source" if source.nil?
  raise ArgumentError, "nil or empty targets" if targets.nil? || targets.empty?
  raise ArgumentError, "nil mappings" if mappings.nil?

  single_character = lambda { |m| m[0].length == 1 && m[1].length == 1 }
  character_mappings = mappings.select(&single_character)
  mappings.delete_if(&single_character)

  # String#tr gives '\', '^' and '-' a special meaning, so they are escaped in
  # BOTH sets; otherwise a '-' in the middle of the from-set is read as a range.
  escape_for_tr = lambda { |char| char.gsub(/[\\^-]/) { |special| "\\#{special}" } }
  characters_from = character_mappings.map { |m| escape_for_tr.call(m[0]) }.join
  characters_to = character_mappings.map { |m| escape_for_tr.call(m[1]) }.join

  source.tr!(characters_from, characters_to)
  targets.each { |target| target[:text].tr!(characters_from, characters_to) }

  # to process smaller ones first
  targets.sort! { |t1, t2| t1[:text].size <=> t2[:text].size }

  TextAlignment._find_divisions(source, targets)
end
34
+
35
# Iteratively assigns regions of the source text to the target divisions.
#
# In every round each remaining target is compared with the remaining source
# (via approximate_fit and TextAlignment::LCSComparison); the most similar
# acceptable target is chosen, its region is cut out of the working copy of
# the source, and the region is translated back to coordinates of the
# original source.
#
# _source  - the source text (String); it is not modified (a copy is used).
# _targets - an Array of Hashes with :text and :divid; the array is not
#            modified (a shallow copy is used).
#
# Returns an Array of Hashes {divid:, region: [begin, end]}. When some source
# text and/or some targets are left unassigned, a final Hash with divid: nil
# and :remaining_source and/or :remaining_targets (divids) is appended.
def _find_divisions(_source, _targets)
  indice = []
  history = []
  cache = {}
  source = _source.dup
  targets = _targets.dup
  threshold = TextAlignment::SIMILARITY_THRESHOLD

  until source.strip.empty? || targets.empty?
    candidates = []

    targets.each_with_index do |target, i|
      # str1 is always the shorter text, str2 the longer one
      if source.size < target[:text].size
        mode = :source_in_target
        str1 = source
        str2 = target[:text]
      else
        mode = :target_in_source
        str1 = target[:text]
        str2 = source
      end

      len1 = str1.length
      len2 = str2.length

      offset_begin, offset_end = if (len2 - len1) > len1 * (1 - threshold)
        approximate_fit(str1, str2)
      else
        # the sizes are close enough: compare against the whole str2
        [0, -1]
      end

      next if offset_begin.nil?

      fragment = str2[offset_begin..offset_end]
      key = str1 + ' _:_ ' + fragment
      cmp = (cache[key] ||= TextAlignment::LCSComparison.new(str1, fragment))

      matched_length = cmp.str1_match_final - cmp.str1_match_initial + 1
      if cmp.similarity > threshold && (len1 - matched_length) < len1 * (1 - threshold)
        candidates << {idx: i, offset: offset_begin, mode: mode, cmp: cmp}
      end
    end

    # nothing acceptable is left: report the rest as remaining
    break if candidates.empty?

    # pick the most similar candidate (ties keep the first one found)
    choice = candidates.max_by { |candidate| candidate[:cmp].similarity }
    m = choice[:idx]

    index = if choice[:mode] == :source_in_target
      {divid: targets[m][:divid], region: [0, source.size]}
    else
      cmp = choice[:cmp]
      offset = choice[:offset]
      {divid: targets[m][:divid], region: [cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
    end

    # cut the assigned region out of the working copy of the source
    source = source[0...index[:region][0]] + source[index[:region][1]..-1]
    history << index[:region].dup

    # shift the region by every earlier cut (most recent first) so that it
    # refers to positions in the original source
    history[0...-1].reverse_each do |h|
      gap = h[1] - h[0]
      index[:region][0] += gap if index[:region][0] >= h[0]
      index[:region][1] += gap if index[:region][1] > h[0]
    end

    indice << index

    targets.delete_at(m)
  end

  unless source.strip.empty? && targets.empty?
    index = {divid: nil}
    index[:remaining_source] = source unless source.strip.empty?
    index[:remaining_targets] = targets.collect { |t| t[:divid] } unless targets.empty?
    indice << index
  end

  indice
end
123
+
124
# Legacy, recursive variant of _find_divisions.
#
# It takes the FIRST target that is similar enough to the source, cuts its
# region out of the source, and recurses on the rest. It recurses into itself:
# _find_divisions returns Hashes, not the [divid, [begin, end]] pairs that the
# position adjustment below expects.
#
# source  - the source text (String).
# targets - an Array of Hashes with :text and :divid. NOTE: matched targets
#           are removed from this array in place.
#
# Returns an Array of [divid, [begin, end]] pairs. When nothing matches, the
# single entry [-1, [remaining_source, remaining_divids]] is returned instead.
def _find_divisions_old(source, targets)
  mode, m, c, offset_begin = nil, nil, nil, nil
  threshold = TextAlignment::SIMILARITY_THRESHOLD

  targets.each_with_index do |target, i|
    # str1 is always the shorter text, str2 the longer one
    if source.size < target[:text].size
      mode = :source_in_target
      str1 = source
      str2 = target[:text]
    else
      mode = :target_in_source
      str1 = target[:text]
      str2 = source
    end

    len1 = str1.length
    len2 = str2.length

    offset_begin, offset_end = 0, -1
    offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - threshold)

    next if offset_begin.nil?

    c = TextAlignment::LCSComparison.new(str1, str2[offset_begin..offset_end])
    matched_length = c.str1_match_final - c.str1_match_initial + 1
    if c.similarity > threshold && (len1 - matched_length) < len1 * (1 - threshold)
      m = i
      break
    end
  end

  # nothing matched: return the remaining source and targets
  return [[-1, [source, targets.collect { |t| t[:divid] }]]] if m.nil?

  index = if mode == :source_in_target
    [targets[m][:divid], [0, source.size]]
  else
    [targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
  end

  next_source = source[0...index[1][0]] + source[index[1][1]..-1]
  targets.delete_at(m)

  return [index] if next_source.strip.empty? || targets.empty?

  more_index = _find_divisions_old(next_source, targets)

  # shift the later regions by the size of the region that was cut out here
  gap = index[1][1] - index[1][0]
  more_index.each do |later|
    if (later[0] > -1)
      later[1][0] += gap if later[1][0] >= index[1][0]
      later[1][1] += gap if later[1][1] > index[1][0]
    end
  end

  [index] + more_index
end
181
179
 
182
180
  end
183
181
 
184
182
  if __FILE__ == $0
185
- require 'json'
186
- if ARGV.length == 2
187
- target = JSON.parse File.read(ARGV[0]), :symbolize_names => true
188
- target_text = target[:text].strip
189
-
190
- sources = JSON.parse File.read(ARGV[1]), :symbolize_names => true
191
- div_index = TextAlignment::find_divisions(target_text, sources)
192
- pp div_index
193
-
194
- # str1 = File.read(ARGV[0]).strip
195
- # str2 = File.read(ARGV[1]).strip
196
- # div_index = TextAlignment::find_divisions(str1, [str2])
197
-
198
- # puts "target length: #{target_text.length}"
199
- # div_index.each do |i|
200
- # unless i[:divid].nil?
201
- # puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
202
- # puts target_text[i[:region][0] ... i[:region][1]]
203
- # puts "=========="
204
- # else
205
- # p i
206
- # end
207
-
208
- # # if i[0] >= 0
209
- # # puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
210
- # # puts target_text[i[1][0] ... i[1][1]]
211
- # # puts "=========="
212
- # # else
213
- # # p i
214
- # # end
215
- # end
216
- end
183
# Command-line entry: ruby <this file> SOURCE.json TARGETS.json
# SOURCE.json is parsed as an object whose :text is the source text;
# TARGETS.json is parsed and passed on as the targets.
require 'json'

if ARGV.length == 2
  source_path, targets_path = ARGV

  source = JSON.parse(File.read(source_path), :symbolize_names => true)
  source_text = source[:text].strip

  targets = JSON.parse(File.read(targets_path), :symbolize_names => true)
  div_index = TextAlignment::find_divisions(source_text, targets)
  pp div_index

  # Alternative invocation with two plain-text files:
  # str1 = File.read(ARGV[0]).strip
  # str2 = File.read(ARGV[1]).strip
  # div_index = TextAlignment::find_divisions(str1, [str2])

  # Human-readable dump of the result:
  # puts "source length: #{source_text.length}"
  # div_index.each do |i|
  #   unless i[:divid].nil?
  #     puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
  #     puts source_text[i[:region][0] ... i[:region][1]]
  #     puts "=========="
  #   else
  #     p i
  #   end
  # end
end
217
215
  end
@@ -6,306 +6,306 @@ module TextAlignment; end unless defined? TextAlignment
6
6
  # An instance of this class holds the results of generalized LCS computation for the two strings str1 and str2.
7
7
  # An optional dictionary is used for generalized suffix comparison.
8
8
  class TextAlignment::GLCSAlignment
9
- # The mapping function from str1 to str2
10
- attr_reader :position_map_begin, :position_map_end
11
-
12
- # The position initial and final position of matching on str1 and str2
13
- attr_reader :str1_match_begin, :str1_match_end, :str2_match_begin, :str2_match_end
14
-
15
- # The length of GLCS
16
- attr_reader :length
17
-
18
- # the elements that are common in the two strings, str1 and str2
19
- attr_reader :common_elements
20
-
21
- # the elements that are mapped to each other in the two strings, str1 and str2
22
- attr_reader :mapped_elements
23
-
24
- # the string of non-mapped characters
25
- attr_reader :diff_strings
26
-
27
- attr_reader :similarity
28
-
29
- # It initializes the GLCS table for the given two strings, str1 and str2.
30
- # When the array, mappings, is given, general suffix comparision is performed based on the mappings.
31
- # Exception is raised when nil given passed to either str1, str2 or dictionary
32
- def initialize(str1, str2, mappings = [])
33
- raise ArgumentError, "nil string" if str1 == nil || str2 == nil
34
- raise ArgumentError, "nil dictionary" if mappings == nil
35
-
36
- # index the mappings in hash.
37
- @dic = (mappings + mappings.map{|e| e.reverse}).to_h
38
-
39
- # prefix dictionary
40
- @pdic = Dictionary.new(mappings.flatten)
41
-
42
- @len1 = str1.length
43
- @len2 = str2.length
44
-
45
- # add a final marker to the end of the strings
46
- @str1 = str1 + '_'
47
- @str2 = str2 + '_'
48
-
49
- # compute the GLCS table
50
- @glcs = _compute_glcs_table
51
- @length = @glcs[0][0]
52
-
53
- _trace_glcs_table
54
- end
55
-
56
- # Prints the GLCS table
57
- def show_glcs
58
- puts "\t\t" + @str2.split(//).join("\t")
59
- @glcs.each_with_index do |row, i|
60
- h = (@str1[i].nil?)? '' : @str1[i]
61
- puts i.to_s + "\t" + h + "\t" + row.join("\t")
62
- end
63
- end
64
-
65
- # Returns the character-by-character difference
66
- def cdiff
67
- cdiff1, cdiff2 = '', ''
68
- p1, p2 = 0, 0
69
- begin
70
- s1, s2 = _prefix_eq(@str1[p1...@len1], @str2[p2...@len2])
71
- if s1 != nil
72
- l1, l2 = s1.length, s2.length
73
-
74
- cdiff1 += s1; cdiff2 += s2
75
- if l1 > l2 then cdiff2 += ' ' * (l1 - l2) else cdiff1 += ' ' * (l2 - l1) end
76
- p1 += s1.length; p2 += s2.length
77
- elsif p2 < @len2 && (p1 == @len1 or @glcs[p1][p2 + 1] > @glcs[p1 + 1][p2])
78
- cdiff1 += ' '
79
- cdiff2 += @str2[p2]
80
- p2 += 1
81
- elsif p1 < @len1 && (p2 == @len2 or @glcs[p1][p2 + 1] <= @glcs[p1 + 1][p2])
82
- cdiff1 += @str1[p1]
83
- cdiff2 += ' '
84
- p1 += 1
85
- end
86
- end until p1 == @len1 && p2 == @len2
87
-
88
- return [cdiff1, cdiff2]
89
- end
90
-
91
-
92
- # Computes the similarity of the two strings
93
- def similarity(cut = false)
94
- c = @length
95
-
96
- l1 = c + @diff_strings[0].length
97
- l2 = c + @diff_strings[1].length
98
-
99
- if cut
100
- l1 -= front_overflow if front_overflow > 0
101
- l1 -= rear_overflow if rear_overflow > 0
102
- l1 += front_overflow if front_overflow < 0
103
- l1 += rear_overflow if rear_overflow < 0
104
- end
105
-
106
- similarity = 2 * c / (l1 + l2).to_f
107
- end
108
-
109
- def transform_a_span(span)
110
- {:begin=>@position_map_begin[span[:begin]], :end=>@position_map_end[span[:end]]}
111
- end
112
-
113
- def transform_spans(spans)
114
- spans.map{|span| transform_a_span(span)}
115
- end
116
-
117
-
118
- private
119
-
120
- # Computes the GLCS table for the two strings, @str1 and @str2.
121
- # Unlike normal LCS algorithms, the computation is performed from the end to the beginning of the strings.
122
- def _compute_glcs_table
123
- glcs = Array.new(@len1 + 1) { Array.new(@len2 + 1) }
124
-
125
- # initialize the final row and the final column
126
- (0..@len1).each {|p| glcs[p][@len2] = 0}
127
- (0..@len2).each {|p| glcs[@len1][p] = 0}
128
-
129
- # compute the GLCS table
130
- str1_reverse_iteration = (0...@len1).to_a.reverse
131
- str2_reverse_iteration = (0...@len2).to_a.reverse
132
-
133
- str1_reverse_iteration.each do |p1|
134
- str2_reverse_iteration.each do |p2|
135
- s1, s2 = _prefix_eq(@str1[p1...@len1], @str2[p2...@len2])
136
- unless s1 == nil
137
- glcs[p1][p2] = glcs[p1 + s1.length][p2 + s2.length] + 1
138
- else
139
- glcs[p1][p2] = (glcs[p1][p2 + 1] > glcs[p1 + 1][p2])? glcs[p1][p2 + 1] : glcs[p1 + 1][p2]
140
- end
141
- end
142
- end
143
-
144
- glcs
145
- end
146
-
147
- # Backtrace the GLCS table, computing the mapping function from str1 to str2
148
- # As its side effect, it updates four global variables
149
- # * front_overflow: the length of the front part of str1 that cannot fit in str2.
150
- # * rear_overflow: the length of the rear part of str1 that cannot fit in str2.
151
- # * common_elements: an array which stores the common elements in the two strings.
152
- # * mapped_elements: an array which stores the mapped elements in the two strings.
153
- def _trace_glcs_table
154
- @front_overflow, @rear_overflow = 0, 0
155
- @common_elements, @mapped_elements = [], []
156
- diff_string1, diff_string2 = '', ''
157
-
158
- @position_map_begin, @position_map_end = {}, {}
159
- addition, deletion = [], []
160
- p1, p2 = 0, 0
161
-
162
- while p1 <= @len1 && p2 <= @len2
163
- s1, s2 = _prefix_eq(@str1[p1..@len1], @str2[p2..@len2])
164
- if s1 != nil
165
- l1, l2 = s1.length, s2.length
166
-
167
- @position_map_begin[p1], @position_map_end[p1] = p2, p2
168
- (p1 + 1 ... p1 + l1).each{|i| @position_map_begin[i], @position_map_end[i] = nil, nil}
169
-
170
- @common_elements << [s1, s2]
171
-
172
- if !addition.empty? && deletion.empty?
173
- # If an addition is found in the front or the rear, it is a case of underflow
174
- @str2_match_begin = addition.length if p1 == 0
175
- @str2_match_end = l2 - addition.length if p1 == @len1
176
-
177
- if p1 == 0
178
- # leave as it is
179
- elsif p1 == @len1
180
- # retract from the end
181
- @position_map_begin[p1] = p2 - addition.length
182
- @position_map_end[p1] = @position_map_begin[p1]
183
- else
184
- # correct the position for end
185
- @position_map_end[p1] = p2 - addition.length
186
- end
187
- elsif addition.empty? && !deletion.empty?
188
- # If a deletion is found in the front or the rear, it is a case of overflow
189
- @str1_match_begin = deletion.length if p1 == deletion.length
190
- @str1_match_end = l1 - deletion.length if p1 == @len1
191
-
192
- deletion.each{|p| @position_map_begin[p], @position_map_end[p] = p2, p2}
193
- elsif !addition.empty? && !deletion.empty?
194
- # If an addition and a deletion are both found in the front or the rear,
195
- # the overflow/underflow is approximated to the difference.
196
- al, dl = addition.length, deletion.length
197
- @front_overflow = dl - al if p1 == dl
198
- @rear_overflow = dl - al if p1 == @len1
199
-
200
- @mapped_elements << [@str1[deletion[0], dl], @str2[addition[0], al]]
201
-
202
- @position_map_begin[deletion[0]], @position_map_end[deletion[0]] = addition[0], addition[0]
203
- deletion[1..-1].each{|p| @position_map_begin[p], @position_map_end[p] = nil, nil}
204
- end
205
-
206
- addition.clear; deletion.clear
207
- p1 += l1; p2 += l2
208
-
209
- elsif p2 < @len2 && (p1 == @len1 || @glcs[p1][p2 + 1] > @glcs[p1 + 1][p2])
210
- diff_string2 += @str2[p2]
211
-
212
- addition << p2
213
- p2 += 1
214
- elsif p1 < @len1 && (p2 == @len2 || @glcs[p1][p2 + 1] <= @glcs[p1 + 1][p2])
215
- diff_string1 += @str1[p1]
216
-
217
- deletion << p1
218
- p1 += 1
219
- end
220
- end
221
-
222
- @common_elements.pop
223
- @diff_strings = [diff_string1, diff_string2]
224
- end
225
-
226
- # General prefix comparison is performed based on the dictionary.
227
- # The pair of matched suffixes are returned when found.
228
- # Otherwise, the pair of nil values are returned.
229
- def _prefix_eq(str1, str2)
230
- return nil, nil if str1.empty? || str2.empty?
231
- prefixes1 = @pdic.prefixes(str1)
232
- prefixes1.each {|p1| p2 = @dic[p1]; return p1, p2 if str2.start_with?(p2)}
233
- return str1[0], str2[0] if (str1[0] == str2[0])
234
- return nil, nil
235
- end
9
# The mapping function from positions of str1 to positions of str2
attr_reader :position_map_begin, :position_map_end

# The begin and end positions of the matching on str1 and str2
# (assigned in _trace_glcs_table only when a front/rear addition or deletion is seen)
attr_reader :str1_match_begin, :str1_match_end, :str2_match_begin, :str2_match_end

# The length of GLCS
attr_reader :length

# the elements that are common in the two strings, str1 and str2
attr_reader :common_elements

# the elements that are mapped to each other in the two strings, str1 and str2
attr_reader :mapped_elements

# the strings of non-mapped characters, [from str1, from str2]
attr_reader :diff_strings

# the front/rear overflow computed in _trace_glcs_table; read by #similarity
# and by the demo at the bottom of the file
attr_reader :front_overflow, :rear_overflow

# NOTE: there is intentionally no attr_reader for :similarity; #similarity is
# defined explicitly below and @similarity is never assigned.
28
+
29
# Builds the GLCS table for str1 and str2 and traces it.
# When the array, mappings, is given, generalized suffix comparison is
# performed based on those [a, b] pairs (looked up in both directions).
# ArgumentError is raised when nil is passed as str1, str2 or mappings.
def initialize(str1, str2, mappings = [])
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
  raise ArgumentError, "nil dictionary" if mappings.nil?

  # index the mappings in a hash, in both directions
  reversed_pairs = mappings.map { |pair| pair.reverse }
  @dic = (mappings + reversed_pairs).to_h

  # prefix dictionary over every word that appears in the mappings
  @pdic = Dictionary.new(mappings.flatten)

  @len1, @len2 = str1.length, str2.length

  # a final marker is appended to both strings
  @str1 = str1 + '_'
  @str2 = str2 + '_'

  # compute the GLCS table; its top-left cell is the GLCS length
  @glcs = _compute_glcs_table
  @length = @glcs[0][0]

  _trace_glcs_table
end
55
+
56
# Prints the GLCS table, tab-separated: a header row with the characters of
# @str2, then one row per table row labelled with its index and the
# corresponding character of @str1.
def show_glcs
  header_cells = @str2.split(//).join("\t")
  puts "\t\t" + header_cells

  @glcs.each_with_index do |cells, row_no|
    label = @str1[row_no] || ''
    puts "#{row_no}\t#{label}\t#{cells.join("\t")}"
  end
end
64
+
65
# Returns the character-by-character difference as a pair of strings
# [line for str1, line for str2]; a blank is put on the opposite line for
# every unmatched character and to pad a shorter matched element.
def cdiff
  line1 = ''
  line2 = ''
  i = 0
  j = 0

  loop do
    e1, e2 = _prefix_eq(@str1[i...@len1], @str2[j...@len2])

    if !e1.nil?
      w1 = e1.length
      w2 = e2.length

      line1 += e1
      line2 += e2
      if w1 > w2
        line2 += ' ' * (w1 - w2)
      else
        line1 += ' ' * (w2 - w1)
      end

      i += w1
      j += w2
    elsif j < @len2 && (i == @len1 || @glcs[i][j + 1] > @glcs[i + 1][j])
      # a character only in str2
      line1 += ' '
      line2 += @str2[j]
      j += 1
    elsif i < @len1 && (j == @len2 || @glcs[i][j + 1] <= @glcs[i + 1][j])
      # a character only in str1
      line1 += @str1[i]
      line2 += ' '
      i += 1
    end

    # the body always runs at least once, also for two empty strings
    break if i == @len1 && j == @len2
  end

  [line1, line2]
end
90
+
91
+
92
# Computes the similarity of the two strings as
#   2 * (GLCS length) / (l1 + l2)
# where l1 and l2 are the GLCS length plus the number of non-mapped
# characters of str1 and str2, respectively.
#
# cut - when true, the magnitudes of the front and rear overflow (as set by
#       _trace_glcs_table) are subtracted from l1 before the computation.
#
# Returns a Float; it is NaN when both l1 and l2 are 0.
def similarity(cut = false)
  c = @length

  l1 = c + @diff_strings[0].length
  l2 = c + @diff_strings[1].length

  if cut
    # The instance variables are read directly: the class may not expose
    # front_overflow / rear_overflow readers.
    # A positive overflow is subtracted and a negative one is added, i.e. l1
    # shrinks by the magnitude either way.
    # NOTE(review): a negative overflow presumably means an underflow on the
    # str2 side; confirm that l1 (and not l2) is the one to adjust.
    l1 -= @front_overflow.abs + @rear_overflow.abs
  end

  2 * c / (l1 + l2).to_f
end
108
+
109
# Maps a span on str1 to a span on str2: :begin is looked up in
# @position_map_begin and :end in @position_map_end.
def transform_a_span(span)
  mapped_begin = @position_map_begin[span[:begin]]
  mapped_end = @position_map_end[span[:end]]
  {:begin => mapped_begin, :end => mapped_end}
end
112
+
113
# Maps every span in the given collection with transform_a_span and returns
# the results as an Array, in the same order.
def transform_spans(spans)
  spans.map(&method(:transform_a_span))
end
116
+
117
+
118
+ private
119
+
120
# Computes the GLCS table for the two strings, @str1 and @str2.
# Unlike normal LCS algorithms, the table is filled from the end of the
# strings towards the beginning: cell [p1][p2] holds the GLCS length of the
# suffixes starting at p1 and p2.
def _compute_glcs_table
  # the final row and the final column are 0; the other cells are filled below
  table = Array.new(@len1 + 1) do |row|
    Array.new(@len2 + 1) { |col| (row == @len1 || col == @len2) ? 0 : nil }
  end

  (@len1 - 1).downto(0) do |p1|
    (@len2 - 1).downto(0) do |p2|
      e1, e2 = _prefix_eq(@str1[p1...@len1], @str2[p2...@len2])

      table[p1][p2] =
        if e1.nil?
          # no element matches here: carry over the better neighbour
          [table[p1][p2 + 1], table[p1 + 1][p2]].max
        else
          # a (possibly multi-character) element matches: jump over it
          table[p1 + e1.length][p2 + e2.length] + 1
        end
    end
  end

  table
end
146
+
147
# Traces the GLCS table from the beginning of both strings, computing the
# mapping function from str1 to str2 (@position_map_begin / @position_map_end).
# As its side effect, it also sets these instance variables:
# * @front_overflow: set to (deletions - additions) when both kinds of
#   unmatched characters precede the first match.
# * @rear_overflow: the same difference for the characters after the last match.
# * @common_elements: an array which stores the common elements in the two strings.
# * @mapped_elements: an array which stores the mapped elements in the two strings.
# * @diff_strings: the non-matched characters of str1 and of str2.
# * @str1_match_begin/_end, @str2_match_begin/_end: assigned only in the
#   front/rear addition-only or deletion-only cases below.
def _trace_glcs_table
  @front_overflow = 0
  @rear_overflow = 0
  @common_elements = []
  @mapped_elements = []
  @position_map_begin = {}
  @position_map_end = {}

  unmatched1 = ''
  unmatched2 = ''
  added = []    # positions of str2 skipped since the last match
  removed = []  # positions of str1 skipped since the last match
  i = 0
  j = 0

  # the ranges include the final markers, so the last iteration matches them
  while i <= @len1 && j <= @len2
    e1, e2 = _prefix_eq(@str1[i..@len1], @str2[j..@len2])

    if e1.nil?
      if j < @len2 && (i == @len1 || @glcs[i][j + 1] > @glcs[i + 1][j])
        unmatched2 += @str2[j]
        added << j
        j += 1
      elsif i < @len1 && (j == @len2 || @glcs[i][j + 1] <= @glcs[i + 1][j])
        unmatched1 += @str1[i]
        removed << i
        i += 1
      end
      next
    end

    w1 = e1.length
    w2 = e2.length

    # the first character of the matched element maps to j; the rest map to nil
    @position_map_begin[i] = j
    @position_map_end[i] = j
    ((i + 1)...(i + w1)).each do |k|
      @position_map_begin[k] = nil
      @position_map_end[k] = nil
    end

    @common_elements << [e1, e2]

    if removed.empty?
      unless added.empty?
        # If an addition is found in the front or the rear, it is a case of underflow
        skipped = added.length
        @str2_match_begin = skipped if i == 0
        @str2_match_end = w2 - skipped if i == @len1

        # at the very front the mapping is left as it is
        unless i == 0
          # at the rear both ends retract; elsewhere only the end is corrected
          @position_map_begin[i] = j - skipped if i == @len1
          @position_map_end[i] = j - skipped
        end
      end
    elsif added.empty?
      # If a deletion is found in the front or the rear, it is a case of overflow
      dropped = removed.length
      @str1_match_begin = dropped if i == dropped
      @str1_match_end = w1 - dropped if i == @len1

      removed.each do |k|
        @position_map_begin[k] = j
        @position_map_end[k] = j
      end
    else
      # If an addition and a deletion are both found in the front or the rear,
      # the overflow/underflow is approximated to the difference.
      al = added.length
      dl = removed.length
      @front_overflow = dl - al if i == dl
      @rear_overflow = dl - al if i == @len1

      @mapped_elements << [@str1[removed.first, dl], @str2[added.first, al]]

      @position_map_begin[removed.first] = added.first
      @position_map_end[removed.first] = added.first
      removed.drop(1).each do |k|
        @position_map_begin[k] = nil
        @position_map_end[k] = nil
      end
    end

    added.clear
    removed.clear
    i += w1
    j += w2
  end

  # drop the last recorded element (the match of the final markers)
  @common_elements.pop
  @diff_strings = [unmatched1, unmatched2]
end
225
+
226
# Generalized prefix comparison, based on the dictionary.
# For every dictionary word that is a prefix of str1 (as reported by
# @pdic.prefixes), its counterpart in @dic is tried as a prefix of str2; the
# first such pair is returned. Otherwise the pair of first characters is
# returned when they are equal, and a pair of nil values when they are not
# (or when either string is empty).
def _prefix_eq(str1, str2)
  return [nil, nil] if str1.empty? || str2.empty?

  @pdic.prefixes(str1).each do |known|
    counterpart = @dic[known]
    return [known, counterpart] if str2.start_with?(counterpart)
  end

  head1 = str1[0]
  head2 = str2[0]
  head1 == head2 ? [head1, head2] : [nil, nil]
end
236
236
 
237
237
  end
238
238
 
239
239
  if __FILE__ == $0
240
240
 
241
- dictionary = [
242
- ["×", "x"], #U+00D7 (multiplication sign)
243
- ["•", "*"], #U+2022 (bullet)
244
- ["Δ", "delta"], #U+0394 (greek capital letter delta)
245
- ["Φ", "phi"], #U+03A6 (greek capital letter phi)
246
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
247
- ["β", "beta"], #U+03B2 (greek small letter beta)
248
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
249
- ["δ", "delta"], #U+03B4 (greek small letter delta)
250
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
251
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
252
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
253
- ["μ", "mu"], #U+03BC (greek small letter mu)
254
- ["χ", "chi"], #U+03C7 (greek small letter chi)
255
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
256
- [" ", " "], #U+2009 (thin space)
257
- [" ", " "], #U+200A (hair space)
258
- [" ", " "], #U+00A0 (no-break space)
259
- [" ", " "], #U+3000 (ideographic space)
260
- ["−", "-"], #U+2212 (minus sign)
261
- ["–", "-"], #U+2013 (en dash)
262
- ["′", "'"], #U+2032 (prime)
263
- ["‘", "'"], #U+2018 (left single quotation mark)
264
- ["’", "'"], #U+2019 (right single quotation mark)
265
- ["“", '"'], #U+201C (left double quotation mark)
266
- ["”", '"'] #U+201D (right double quotation mark)
267
- ]
268
-
269
- # str1 = "-betakappaxyz-"
270
- # str2 = "-ijkβκ-"
271
-
272
- # str1 = "-βκ-β-z-xy"
273
- # str2 = "abc-betakappa-beta-z"
274
-
275
- # str1 = "-βκ-z-xy"
276
- # str2 = "abc-betakappa-z"
277
-
278
- # str1 = "abc-βκ-β-z"
279
- # str2 = "-betakappa-beta-z-xyz"
280
-
281
- # str1 = "-β-"
282
- # str2 = "-beta-"
283
-
284
- # str1 = "-κ-"
285
- # str2 = "-kappa-"
286
-
287
- # str1 = File.read(ARGV[0]).strip
288
- # str2 = File.read(ARGV[1]).strip
289
-
290
- str1 = "beta"
291
- str2 = "β***"
292
-
293
- # puts "str1: #{str1}"
294
- # puts "str2: #{str2}"
295
- sa = TextAlignment::GLCSAlignment.new(str1, str2, dictionary)
296
- sa.position_map_begin.each {|h| p h}
297
- puts '-----'
298
- sa.position_map_end.each {|h| p h}
299
- puts '-----'
300
- puts "common_elements: #{sa.common_elements}"
301
- puts '-----'
302
- puts "mapped_elements: #{sa.mapped_elements}"
303
- puts '-----'
304
- # puts "diff_string1: #{sa.diff_strings[0]}"
305
- # puts "diff_string2: #{sa.diff_strings[1]}"
306
- puts "front_overflow: #{sa.front_overflow}"
307
- puts "rear_overflow : #{sa.rear_overflow}"
308
- puts '-----'
309
- puts "similarity : #{sa.similarity}"
310
- puts "similarity(cut): #{sa.similarity(true)}"
241
+ dictionary = [
242
+ ["×", "x"], #U+00D7 (multiplication sign)
243
+ ["•", "*"], #U+2022 (bullet)
244
+ ["Δ", "delta"], #U+0394 (greek capital letter delta)
245
+ ["Φ", "phi"], #U+03A6 (greek capital letter phi)
246
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
247
+ ["β", "beta"], #U+03B2 (greek small letter beta)
248
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
249
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
250
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
251
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
252
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
253
+ ["μ", "mu"], #U+03BC (greek small letter mu)
254
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
255
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
256
+ [" ", " "], #U+2009 (thin space)
257
+ [" ", " "], #U+200A (hair space)
258
+ [" ", " "], #U+00A0 (no-break space)
259
+ [" ", " "], #U+3000 (ideographic space)
260
+ ["−", "-"], #U+2212 (minus sign)
261
+ ["–", "-"], #U+2013 (en dash)
262
+ ["′", "'"], #U+2032 (prime)
263
+ ["‘", "'"], #U+2018 (left single quotation mark)
264
+ ["’", "'"], #U+2019 (right single quotation mark)
265
+ ["“", '"'], #U+201C (left double quotation mark)
266
+ ["”", '"'] #U+201D (right double quotation mark)
267
+ ]
268
+
269
+ # str1 = "-betakappaxyz-"
270
+ # str2 = "-ijkβκ-"
271
+
272
+ # str1 = "-βκ-β-z-xy"
273
+ # str2 = "abc-betakappa-beta-z"
274
+
275
+ # str1 = "-βκ-z-xy"
276
+ # str2 = "abc-betakappa-z"
277
+
278
+ # str1 = "abc-βκ-β-z"
279
+ # str2 = "-betakappa-beta-z-xyz"
280
+
281
+ # str1 = "-β-"
282
+ # str2 = "-beta-"
283
+
284
+ # str1 = "-κ-"
285
+ # str2 = "-kappa-"
286
+
287
+ # str1 = File.read(ARGV[0]).strip
288
+ # str2 = File.read(ARGV[1]).strip
289
+
290
# Demo run: aligns a spelled-out Greek letter name with the letter itself,
# using the dictionary defined above, and prints the results.
# NOTE(review): the front_overflow / rear_overflow calls require
# GLCSAlignment to expose readers of those names; verify against the class.
str1 = "beta"
str2 = "β***"

# puts "str1: #{str1}"
# puts "str2: #{str2}"
sa = TextAlignment::GLCSAlignment.new(str1, str2, dictionary)
separator = '-----'

sa.position_map_begin.each { |pair| p pair }
puts separator
sa.position_map_end.each { |pair| p pair }
puts separator
puts "common_elements: #{sa.common_elements}"
puts separator
puts "mapped_elements: #{sa.mapped_elements}"
puts separator
# puts "diff_string1: #{sa.diff_strings[0]}"
# puts "diff_string2: #{sa.diff_strings[1]}"
puts "front_overflow: #{sa.front_overflow}"
puts "rear_overflow : #{sa.rear_overflow}"
puts separator
puts "similarity : #{sa.similarity}"
puts "similarity(cut): #{sa.similarity(true)}"
311
311
  end