text_alignment 0.2.9 → 0.3.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,211 +7,209 @@ module TextAlignment; end unless defined? TextAlignment
7
7
  # to work on the hash representation of denotations
8
8
  # to assume that there is no bag representation to this method
9
9
 
10
- module TextAlignment
11
- TextAlignment::SIMILARITY_THRESHOLD = 0.7
12
- end
10
+ TextAlignment::SIMILARITY_THRESHOLD = 0.7 unless defined? TextAlignment::SIMILARITY_THRESHOLD
13
11
 
14
12
  class << TextAlignment
15
13
 
16
- # It finds, among the sources, the right divisions for the taraget text to fit in.
17
- def find_divisions(target, sources, mappings = [])
18
- raise ArgumentError, "nil target" if target == nil
19
- raise ArgumentError, "nil or empty sources" if sources == nil || sources.empty?
20
- raise ArgumentError, "nil mappings" if mappings == nil
21
-
22
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
23
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
24
- characters_from = character_mappings.collect{|m| m[0]}.join
25
- characters_to = character_mappings.collect{|m| m[1]}.join
26
- characters_to.gsub!(/-/, '\-')
27
-
28
- target.tr!(characters_from, characters_to)
29
- sources.each{|source| source[:text].tr!(characters_from, characters_to)}
30
-
31
- # to process smaller ones first
32
- sources.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}
33
-
34
- TextAlignment._find_divisions(target, sources)
35
- end
36
-
37
- def _find_divisions(_target, _sources)
38
- indice = []
39
- history = []
40
- cache = {}
41
- target = _target.dup
42
- sources = _sources.dup
43
- until target.strip.empty? || sources.empty?
44
- mode, cmp = nil, nil
45
- candidates = []
46
- sources.each_with_index do |source, i|
47
- if target.size < source[:text].size
48
- mode = :t_in_s
49
- str1 = target
50
- str2 = source[:text]
51
- else
52
- mode = :s_in_t
53
- str1 = source[:text]
54
- str2 = target
55
- end
56
-
57
- len1 = str1.length
58
- len2 = str2.length
59
-
60
- offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
61
- approximate_fit(str1, str2)
62
- else
63
- # the whole target
64
- [0, -1]
65
- end
66
-
67
- unless offset_begin.nil?
68
- key = str1 + ' _:_ ' + str2[offset_begin .. offset_end]
69
- cmp = if cache.has_key? key
70
- cache[key]
71
- else
72
- cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
73
- end
74
- cache[key] = cmp
75
-
76
- if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
77
- candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
78
- end
79
- end
80
- end
81
-
82
- # return remaining target and sources if m.nil?
83
- break if candidates.empty?
84
-
85
- choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity}
86
- m = choice[:idx]
87
- mode = choice[:mode]
88
-
89
- index = if mode == :t_in_s
90
- {divid:sources[m][:divid], region:[0, target.size]}
91
- else # :s_in_t
92
- cmp = choice[:cmp]
93
- offset = choice[:offset]
94
- {divid:sources[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
95
- end
96
-
97
- target = target[0 ... index[:region][0]] + target[index[:region][1] .. -1]
98
- history << index[:region].dup
99
-
100
- before_begin = index[:region][0]
101
- before_end = index[:region][1]
102
-
103
- rhistory = history.reverse
104
- rhistory.shift
105
- rhistory.each do |h|
106
- gap = h[1] - h[0]
107
- index[:region][0] += gap if index[:region][0] >= h[0]
108
- index[:region][1] += gap if index[:region][1] > h[0]
109
- end
110
-
111
- indice << index
112
-
113
- sources.delete_at(m)
114
- end
115
-
116
- unless target.strip.empty? && sources.empty?
117
- index = {divid:nil}
118
- index[:remaining_target] = target unless target.strip.empty?
119
- index[:remaining_sources] = sources.collect{|s| s[:divid]} unless sources.empty?
120
- indice << index
121
- end
122
-
123
- indice
124
- end
125
-
126
- def _find_divisions_old(target, sources)
127
- mode, m, c, offset_begin = nil, nil, nil, nil
128
-
129
- sources.each_with_index do |source, i|
130
- if target.size < source[:text].size
131
- mode = :t_in_s
132
- str1 = target
133
- str2 = source[:text]
134
- else
135
- mode = :s_in_t
136
- str1 = source[:text]
137
- str2 = target
138
- end
139
-
140
- len1 = str1.length
141
- len2 = str2.length
142
-
143
- offset_begin, offset_end = 0, -1
144
- offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
145
-
146
- unless offset_begin.nil?
147
- c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
148
- if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
149
- m = i
150
- break
151
- end
152
- end
153
- end
154
-
155
- # return remaining target and sources if m.nil?
156
- return [[-1, [target, sources.collect{|s| s[:divid]}]]] if m.nil?
157
-
158
- index = if mode == :t_in_s
159
- [sources[m][:divid], [0, target.size]]
160
- else # :s_in_t
161
- [sources[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
162
- end
163
-
164
- next_target = target[0 ... index[1][0]] + target[index[1][1] .. -1]
165
- sources.delete_at(m)
166
-
167
- if next_target.strip.empty? || sources.empty?
168
- return [index]
169
- else
170
- more_index = _find_divisions(next_target, sources)
171
- gap = index[1][1] - index[1][0]
172
- more_index.each do |i|
173
- if (i[0] > -1)
174
- i[1][0] += gap if i[1][0] >= index[1][0]
175
- i[1][1] += gap if i[1][1] > index[1][0]
176
- end
177
- end
178
- return [index] + more_index
179
- end
180
- end
14
+ # It finds, among the targets, the right divisions for the taraget text to fit in.
15
+ def find_divisions(source, targets, mappings = [])
16
+ raise ArgumentError, "nil source" if source == nil
17
+ raise ArgumentError, "nil or empty targets" if targets == nil || targets.empty?
18
+ raise ArgumentError, "nil mappings" if mappings == nil
19
+
20
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
21
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
22
+ characters_from = character_mappings.collect{|m| m[0]}.join
23
+ characters_to = character_mappings.collect{|m| m[1]}.join
24
+ characters_to.gsub!(/-/, '\-')
25
+
26
+ source.tr!(characters_from, characters_to)
27
+ targets.each{|target| target[:text].tr!(characters_from, characters_to)}
28
+
29
+ # to process smaller ones first
30
+ targets.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}
31
+
32
+ TextAlignment._find_divisions(source, targets)
33
+ end
34
+
35
+ def _find_divisions(_source, _targets)
36
+ indice = []
37
+ history = []
38
+ cache = {}
39
+ source = _source.dup
40
+ targets = _targets.dup
41
+ until source.strip.empty? || targets.empty?
42
+ mode, cmp = nil, nil
43
+ candidates = []
44
+ targets.each_with_index do |target, i|
45
+ if source.size < target[:text].size
46
+ mode = :t_in_s
47
+ str1 = source
48
+ str2 = target[:text]
49
+ else
50
+ mode = :s_in_t
51
+ str1 = target[:text]
52
+ str2 = source
53
+ end
54
+
55
+ len1 = str1.length
56
+ len2 = str2.length
57
+
58
+ offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
59
+ approximate_fit(str1, str2)
60
+ else
61
+ # the whole source
62
+ [0, -1]
63
+ end
64
+
65
+ unless offset_begin.nil?
66
+ key = str1 + ' _:_ ' + str2[offset_begin .. offset_end]
67
+ cmp = if cache.has_key? key
68
+ cache[key]
69
+ else
70
+ cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
71
+ end
72
+ cache[key] = cmp
73
+
74
+ if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
75
+ candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
76
+ end
77
+ end
78
+ end
79
+
80
+ # return remaining source and targets if m.nil?
81
+ break if candidates.empty?
82
+
83
+ choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity}
84
+ m = choice[:idx]
85
+ mode = choice[:mode]
86
+
87
+ index = if mode == :t_in_s
88
+ {divid:targets[m][:divid], region:[0, source.size]}
89
+ else # :s_in_t
90
+ cmp = choice[:cmp]
91
+ offset = choice[:offset]
92
+ {divid:targets[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
93
+ end
94
+
95
+ source = source[0 ... index[:region][0]] + source[index[:region][1] .. -1]
96
+ history << index[:region].dup
97
+
98
+ before_begin = index[:region][0]
99
+ before_end = index[:region][1]
100
+
101
+ rhistory = history.reverse
102
+ rhistory.shift
103
+ rhistory.each do |h|
104
+ gap = h[1] - h[0]
105
+ index[:region][0] += gap if index[:region][0] >= h[0]
106
+ index[:region][1] += gap if index[:region][1] > h[0]
107
+ end
108
+
109
+ indice << index
110
+
111
+ targets.delete_at(m)
112
+ end
113
+
114
+ unless source.strip.empty? && targets.empty?
115
+ index = {divid:nil}
116
+ index[:remaining_source] = source unless source.strip.empty?
117
+ index[:remaining_targets] = targets.collect{|s| s[:divid]} unless targets.empty?
118
+ indice << index
119
+ end
120
+
121
+ indice
122
+ end
123
+
124
+ def _find_divisions_old(source, targets)
125
+ mode, m, c, offset_begin = nil, nil, nil, nil
126
+
127
+ targets.each_with_index do |target, i|
128
+ if source.size < target[:text].size
129
+ mode = :t_in_s
130
+ str1 = source
131
+ str2 = target[:text]
132
+ else
133
+ mode = :s_in_t
134
+ str1 = target[:text]
135
+ str2 = source
136
+ end
137
+
138
+ len1 = str1.length
139
+ len2 = str2.length
140
+
141
+ offset_begin, offset_end = 0, -1
142
+ offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
143
+
144
+ unless offset_begin.nil?
145
+ c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
146
+ if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
147
+ m = i
148
+ break
149
+ end
150
+ end
151
+ end
152
+
153
+ # return remaining source and targets if m.nil?
154
+ return [[-1, [source, targets.collect{|s| s[:divid]}]]] if m.nil?
155
+
156
+ index = if mode == :t_in_s
157
+ [targets[m][:divid], [0, source.size]]
158
+ else # :s_in_t
159
+ [targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
160
+ end
161
+
162
+ next_source = source[0 ... index[1][0]] + source[index[1][1] .. -1]
163
+ targets.delete_at(m)
164
+
165
+ if next_source.strip.empty? || targets.empty?
166
+ return [index]
167
+ else
168
+ more_index = _find_divisions(next_source, targets)
169
+ gap = index[1][1] - index[1][0]
170
+ more_index.each do |i|
171
+ if (i[0] > -1)
172
+ i[1][0] += gap if i[1][0] >= index[1][0]
173
+ i[1][1] += gap if i[1][1] > index[1][0]
174
+ end
175
+ end
176
+ return [index] + more_index
177
+ end
178
+ end
181
179
 
182
180
  end
183
181
 
184
182
  if __FILE__ == $0
185
- require 'json'
186
- if ARGV.length == 2
187
- target = JSON.parse File.read(ARGV[0]), :symbolize_names => true
188
- target_text = target[:text].strip
189
-
190
- sources = JSON.parse File.read(ARGV[1]), :symbolize_names => true
191
- div_index = TextAlignment::find_divisions(target_text, sources)
192
- pp div_index
193
-
194
- # str1 = File.read(ARGV[0]).strip
195
- # str2 = File.read(ARGV[1]).strip
196
- # div_index = TextAlignment::find_divisions(str1, [str2])
197
-
198
- # puts "target length: #{target_text.length}"
199
- # div_index.each do |i|
200
- # unless i[:divid].nil?
201
- # puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
202
- # puts target_text[i[:region][0] ... i[:region][1]]
203
- # puts "=========="
204
- # else
205
- # p i
206
- # end
207
-
208
- # # if i[0] >= 0
209
- # # puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
210
- # # puts target_text[i[1][0] ... i[1][1]]
211
- # # puts "=========="
212
- # # else
213
- # # p i
214
- # # end
215
- # end
216
- end
183
+ require 'json'
184
+ if ARGV.length == 2
185
+ source = JSON.parse File.read(ARGV[0]), :symbolize_names => true
186
+ source_text = source[:text].strip
187
+
188
+ targets = JSON.parse File.read(ARGV[1]), :symbolize_names => true
189
+ div_index = TextAlignment::find_divisions(source_text, targets)
190
+ pp div_index
191
+
192
+ # str1 = File.read(ARGV[0]).strip
193
+ # str2 = File.read(ARGV[1]).strip
194
+ # div_index = TextAlignment::find_divisions(str1, [str2])
195
+
196
+ # puts "source length: #{source_text.length}"
197
+ # div_index.each do |i|
198
+ # unless i[:divid].nil?
199
+ # puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
200
+ # puts source_text[i[:region][0] ... i[:region][1]]
201
+ # puts "=========="
202
+ # else
203
+ # p i
204
+ # end
205
+
206
+ # # if i[0] >= 0
207
+ # # puts "[Div: #{i[0]}] (#{i[1][0]}, #{i[1][1]})"
208
+ # # puts source_text[i[1][0] ... i[1][1]]
209
+ # # puts "=========="
210
+ # # else
211
+ # # p i
212
+ # # end
213
+ # end
214
+ end
217
215
  end
@@ -6,306 +6,306 @@ module TextAlignment; end unless defined? TextAlignment
6
6
  # An instance of this class holds the results of generalized LCS computation for the two strings str1 and str2.
7
7
  # an optional dictionary is used for generalized suffix comparision.
8
8
  class TextAlignment::GLCSAlignment
9
- # The mapping function from str1 to str2
10
- attr_reader :position_map_begin, :position_map_end
11
-
12
- # The position initial and final position of matching on str1 and str2
13
- attr_reader :str1_match_begin, :str1_match_end, :str2_match_begin, :str2_match_end
14
-
15
- # The length of GLCS
16
- attr_reader :length
17
-
18
- # the elements that are common in the two strings, str1 and str2
19
- attr_reader :common_elements
20
-
21
- # the elements that are mapped to each other in the two strings, str1 and str2
22
- attr_reader :mapped_elements
23
-
24
- # the string of non-mapped characters
25
- attr_reader :diff_strings
26
-
27
- attr_reader :similarity
28
-
29
- # It initializes the GLCS table for the given two strings, str1 and str2.
30
- # When the array, mappings, is given, general suffix comparision is performed based on the mappings.
31
- # Exception is raised when nil given passed to either str1, str2 or dictionary
32
- def initialize(str1, str2, mappings = [])
33
- raise ArgumentError, "nil string" if str1 == nil || str2 == nil
34
- raise ArgumentError, "nil dictionary" if mappings == nil
35
-
36
- # index the mappings in hash.
37
- @dic = (mappings + mappings.map{|e| e.reverse}).to_h
38
-
39
- # prefix dictionary
40
- @pdic = Dictionary.new(mappings.flatten)
41
-
42
- @len1 = str1.length
43
- @len2 = str2.length
44
-
45
- # add a final marker to the end of the strings
46
- @str1 = str1 + '_'
47
- @str2 = str2 + '_'
48
-
49
- # compute the GLCS table
50
- @glcs = _compute_glcs_table
51
- @length = @glcs[0][0]
52
-
53
- _trace_glcs_table
54
- end
55
-
56
- # Prints the GLCS table
57
- def show_glcs
58
- puts "\t\t" + @str2.split(//).join("\t")
59
- @glcs.each_with_index do |row, i|
60
- h = (@str1[i].nil?)? '' : @str1[i]
61
- puts i.to_s + "\t" + h + "\t" + row.join("\t")
62
- end
63
- end
64
-
65
- # Returns the character-by-character difference
66
- def cdiff
67
- cdiff1, cdiff2 = '', ''
68
- p1, p2 = 0, 0
69
- begin
70
- s1, s2 = _prefix_eq(@str1[p1...@len1], @str2[p2...@len2])
71
- if s1 != nil
72
- l1, l2 = s1.length, s2.length
73
-
74
- cdiff1 += s1; cdiff2 += s2
75
- if l1 > l2 then cdiff2 += ' ' * (l1 - l2) else cdiff1 += ' ' * (l2 - l1) end
76
- p1 += s1.length; p2 += s2.length
77
- elsif p2 < @len2 && (p1 == @len1 or @glcs[p1][p2 + 1] > @glcs[p1 + 1][p2])
78
- cdiff1 += ' '
79
- cdiff2 += @str2[p2]
80
- p2 += 1
81
- elsif p1 < @len1 && (p2 == @len2 or @glcs[p1][p2 + 1] <= @glcs[p1 + 1][p2])
82
- cdiff1 += @str1[p1]
83
- cdiff2 += ' '
84
- p1 += 1
85
- end
86
- end until p1 == @len1 && p2 == @len2
87
-
88
- return [cdiff1, cdiff2]
89
- end
90
-
91
-
92
- # Computes the similarity of the two strings
93
- def similarity(cut = false)
94
- c = @length
95
-
96
- l1 = c + @diff_strings[0].length
97
- l2 = c + @diff_strings[1].length
98
-
99
- if cut
100
- l1 -= front_overflow if front_overflow > 0
101
- l1 -= rear_overflow if rear_overflow > 0
102
- l1 += front_overflow if front_overflow < 0
103
- l1 += rear_overflow if rear_overflow < 0
104
- end
105
-
106
- similarity = 2 * c / (l1 + l2).to_f
107
- end
108
-
109
- def transform_a_span(span)
110
- {:begin=>@position_map_begin[span[:begin]], :end=>@position_map_end[span[:end]]}
111
- end
112
-
113
- def transform_spans(spans)
114
- spans.map{|span| transform_a_span(span)}
115
- end
116
-
117
-
118
- private
119
-
120
- # Computes the GLCS table for the two strings, @str1 and @str2.
121
- # Unlike normal LCS algorithms, the computation is performed from the end to the beginning of the strings.
122
- def _compute_glcs_table
123
- glcs = Array.new(@len1 + 1) { Array.new(@len2 + 1) }
124
-
125
- # initialize the final row and the final column
126
- (0..@len1).each {|p| glcs[p][@len2] = 0}
127
- (0..@len2).each {|p| glcs[@len1][p] = 0}
128
-
129
- # compute the GLCS table
130
- str1_reverse_iteration = (0...@len1).to_a.reverse
131
- str2_reverse_iteration = (0...@len2).to_a.reverse
132
-
133
- str1_reverse_iteration.each do |p1|
134
- str2_reverse_iteration.each do |p2|
135
- s1, s2 = _prefix_eq(@str1[p1...@len1], @str2[p2...@len2])
136
- unless s1 == nil
137
- glcs[p1][p2] = glcs[p1 + s1.length][p2 + s2.length] + 1
138
- else
139
- glcs[p1][p2] = (glcs[p1][p2 + 1] > glcs[p1 + 1][p2])? glcs[p1][p2 + 1] : glcs[p1 + 1][p2]
140
- end
141
- end
142
- end
143
-
144
- glcs
145
- end
146
-
147
- # Backtrace the GLCS table, computing the mapping function from str1 to str2
148
- # As its side effect, it updates four global variables
149
- # * front_overflow: the length of the front part of str1 that cannot fit in str2.
150
- # * rear_overflow: the length of the rear part of str1 that cannot fit in str2.
151
- # * common_elements: an array which stores the common elements in the two strings.
152
- # * mapped_elements: an array which stores the mapped elements in the two strings.
153
- def _trace_glcs_table
154
- @front_overflow, @rear_overflow = 0, 0
155
- @common_elements, @mapped_elements = [], []
156
- diff_string1, diff_string2 = '', ''
157
-
158
- @position_map_begin, @position_map_end = {}, {}
159
- addition, deletion = [], []
160
- p1, p2 = 0, 0
161
-
162
- while p1 <= @len1 && p2 <= @len2
163
- s1, s2 = _prefix_eq(@str1[p1..@len1], @str2[p2..@len2])
164
- if s1 != nil
165
- l1, l2 = s1.length, s2.length
166
-
167
- @position_map_begin[p1], @position_map_end[p1] = p2, p2
168
- (p1 + 1 ... p1 + l1).each{|i| @position_map_begin[i], @position_map_end[i] = nil, nil}
169
-
170
- @common_elements << [s1, s2]
171
-
172
- if !addition.empty? && deletion.empty?
173
- # If an addition is found in the front or the rear, it is a case of underflow
174
- @str2_match_begin = addition.length if p1 == 0
175
- @str2_match_end = l2 - addition.length if p1 == @len1
176
-
177
- if p1 == 0
178
- # leave as it is
179
- elsif p1 == @len1
180
- # retract from the end
181
- @position_map_begin[p1] = p2 - addition.length
182
- @position_map_end[p1] = @position_map_begin[p1]
183
- else
184
- # correct the position for end
185
- @position_map_end[p1] = p2 - addition.length
186
- end
187
- elsif addition.empty? && !deletion.empty?
188
- # If a deletion is found in the front or the rear, it is a case of overflow
189
- @str1_match_begin = deletion.length if p1 == deletion.length
190
- @str1_match_end = l1 - deletion.length if p1 == @len1
191
-
192
- deletion.each{|p| @position_map_begin[p], @position_map_end[p] = p2, p2}
193
- elsif !addition.empty? && !deletion.empty?
194
- # If an addition and a deletion are both found in the front or the rear,
195
- # the overflow/underflow is approximated to the difference.
196
- al, dl = addition.length, deletion.length
197
- @front_overflow = dl - al if p1 == dl
198
- @rear_overflow = dl - al if p1 == @len1
199
-
200
- @mapped_elements << [@str1[deletion[0], dl], @str2[addition[0], al]]
201
-
202
- @position_map_begin[deletion[0]], @position_map_end[deletion[0]] = addition[0], addition[0]
203
- deletion[1..-1].each{|p| @position_map_begin[p], @position_map_end[p] = nil, nil}
204
- end
205
-
206
- addition.clear; deletion.clear
207
- p1 += l1; p2 += l2
208
-
209
- elsif p2 < @len2 && (p1 == @len1 || @glcs[p1][p2 + 1] > @glcs[p1 + 1][p2])
210
- diff_string2 += @str2[p2]
211
-
212
- addition << p2
213
- p2 += 1
214
- elsif p1 < @len1 && (p2 == @len2 || @glcs[p1][p2 + 1] <= @glcs[p1 + 1][p2])
215
- diff_string1 += @str1[p1]
216
-
217
- deletion << p1
218
- p1 += 1
219
- end
220
- end
221
-
222
- @common_elements.pop
223
- @diff_strings = [diff_string1, diff_string2]
224
- end
225
-
226
- # General prefix comparison is performed based on the dictionary.
227
- # The pair of matched suffixes are returned when found.
228
- # Otherwise, the pair of nil values are returned.
229
- def _prefix_eq(str1, str2)
230
- return nil, nil if str1.empty? || str2.empty?
231
- prefixes1 = @pdic.prefixes(str1)
232
- prefixes1.each {|p1| p2 = @dic[p1]; return p1, p2 if str2.start_with?(p2)}
233
- return str1[0], str2[0] if (str1[0] == str2[0])
234
- return nil, nil
235
- end
9
+ # The mapping function from str1 to str2
10
+ attr_reader :position_map_begin, :position_map_end
11
+
12
+ # The position initial and final position of matching on str1 and str2
13
+ attr_reader :str1_match_begin, :str1_match_end, :str2_match_begin, :str2_match_end
14
+
15
+ # The length of GLCS
16
+ attr_reader :length
17
+
18
+ # the elements that are common in the two strings, str1 and str2
19
+ attr_reader :common_elements
20
+
21
+ # the elements that are mapped to each other in the two strings, str1 and str2
22
+ attr_reader :mapped_elements
23
+
24
+ # the string of non-mapped characters
25
+ attr_reader :diff_strings
26
+
27
+ attr_reader :similarity
28
+
29
+ # It initializes the GLCS table for the given two strings, str1 and str2.
30
+ # When the array, mappings, is given, general suffix comparision is performed based on the mappings.
31
+ # Exception is raised when nil given passed to either str1, str2 or dictionary
32
+ def initialize(str1, str2, mappings = [])
33
+ raise ArgumentError, "nil string" if str1 == nil || str2 == nil
34
+ raise ArgumentError, "nil dictionary" if mappings == nil
35
+
36
+ # index the mappings in hash.
37
+ @dic = (mappings + mappings.map{|e| e.reverse}).to_h
38
+
39
+ # prefix dictionary
40
+ @pdic = Dictionary.new(mappings.flatten)
41
+
42
+ @len1 = str1.length
43
+ @len2 = str2.length
44
+
45
+ # add a final marker to the end of the strings
46
+ @str1 = str1 + '_'
47
+ @str2 = str2 + '_'
48
+
49
+ # compute the GLCS table
50
+ @glcs = _compute_glcs_table
51
+ @length = @glcs[0][0]
52
+
53
+ _trace_glcs_table
54
+ end
55
+
56
+ # Prints the GLCS table
57
+ def show_glcs
58
+ puts "\t\t" + @str2.split(//).join("\t")
59
+ @glcs.each_with_index do |row, i|
60
+ h = (@str1[i].nil?)? '' : @str1[i]
61
+ puts i.to_s + "\t" + h + "\t" + row.join("\t")
62
+ end
63
+ end
64
+
65
+ # Returns the character-by-character difference
66
+ def cdiff
67
+ cdiff1, cdiff2 = '', ''
68
+ p1, p2 = 0, 0
69
+ begin
70
+ s1, s2 = _prefix_eq(@str1[p1...@len1], @str2[p2...@len2])
71
+ if s1 != nil
72
+ l1, l2 = s1.length, s2.length
73
+
74
+ cdiff1 += s1; cdiff2 += s2
75
+ if l1 > l2 then cdiff2 += ' ' * (l1 - l2) else cdiff1 += ' ' * (l2 - l1) end
76
+ p1 += s1.length; p2 += s2.length
77
+ elsif p2 < @len2 && (p1 == @len1 or @glcs[p1][p2 + 1] > @glcs[p1 + 1][p2])
78
+ cdiff1 += ' '
79
+ cdiff2 += @str2[p2]
80
+ p2 += 1
81
+ elsif p1 < @len1 && (p2 == @len2 or @glcs[p1][p2 + 1] <= @glcs[p1 + 1][p2])
82
+ cdiff1 += @str1[p1]
83
+ cdiff2 += ' '
84
+ p1 += 1
85
+ end
86
+ end until p1 == @len1 && p2 == @len2
87
+
88
+ return [cdiff1, cdiff2]
89
+ end
90
+
91
+
92
+ # Computes the similarity of the two strings
93
+ def similarity(cut = false)
94
+ c = @length
95
+
96
+ l1 = c + @diff_strings[0].length
97
+ l2 = c + @diff_strings[1].length
98
+
99
+ if cut
100
+ l1 -= front_overflow if front_overflow > 0
101
+ l1 -= rear_overflow if rear_overflow > 0
102
+ l1 += front_overflow if front_overflow < 0
103
+ l1 += rear_overflow if rear_overflow < 0
104
+ end
105
+
106
+ similarity = 2 * c / (l1 + l2).to_f
107
+ end
108
+
109
+ def transform_a_span(span)
110
+ {:begin=>@position_map_begin[span[:begin]], :end=>@position_map_end[span[:end]]}
111
+ end
112
+
113
+ def transform_spans(spans)
114
+ spans.map{|span| transform_a_span(span)}
115
+ end
116
+
117
+
118
+ private
119
+
120
+ # Computes the GLCS table for the two strings, @str1 and @str2.
121
+ # Unlike normal LCS algorithms, the computation is performed from the end to the beginning of the strings.
122
+ def _compute_glcs_table
123
+ glcs = Array.new(@len1 + 1) { Array.new(@len2 + 1) }
124
+
125
+ # initialize the final row and the final column
126
+ (0..@len1).each {|p| glcs[p][@len2] = 0}
127
+ (0..@len2).each {|p| glcs[@len1][p] = 0}
128
+
129
+ # compute the GLCS table
130
+ str1_reverse_iteration = (0...@len1).to_a.reverse
131
+ str2_reverse_iteration = (0...@len2).to_a.reverse
132
+
133
+ str1_reverse_iteration.each do |p1|
134
+ str2_reverse_iteration.each do |p2|
135
+ s1, s2 = _prefix_eq(@str1[p1...@len1], @str2[p2...@len2])
136
+ unless s1 == nil
137
+ glcs[p1][p2] = glcs[p1 + s1.length][p2 + s2.length] + 1
138
+ else
139
+ glcs[p1][p2] = (glcs[p1][p2 + 1] > glcs[p1 + 1][p2])? glcs[p1][p2 + 1] : glcs[p1 + 1][p2]
140
+ end
141
+ end
142
+ end
143
+
144
+ glcs
145
+ end
146
+
147
+ # Backtrace the GLCS table, computing the mapping function from str1 to str2
148
+ # As its side effect, it updates four global variables
149
+ # * front_overflow: the length of the front part of str1 that cannot fit in str2.
150
+ # * rear_overflow: the length of the rear part of str1 that cannot fit in str2.
151
+ # * common_elements: an array which stores the common elements in the two strings.
152
+ # * mapped_elements: an array which stores the mapped elements in the two strings.
153
+ def _trace_glcs_table
154
+ @front_overflow, @rear_overflow = 0, 0
155
+ @common_elements, @mapped_elements = [], []
156
+ diff_string1, diff_string2 = '', ''
157
+
158
+ @position_map_begin, @position_map_end = {}, {}
159
+ addition, deletion = [], []
160
+ p1, p2 = 0, 0
161
+
162
+ while p1 <= @len1 && p2 <= @len2
163
+ s1, s2 = _prefix_eq(@str1[p1..@len1], @str2[p2..@len2])
164
+ if s1 != nil
165
+ l1, l2 = s1.length, s2.length
166
+
167
+ @position_map_begin[p1], @position_map_end[p1] = p2, p2
168
+ (p1 + 1 ... p1 + l1).each{|i| @position_map_begin[i], @position_map_end[i] = nil, nil}
169
+
170
+ @common_elements << [s1, s2]
171
+
172
+ if !addition.empty? && deletion.empty?
173
+ # If an addition is found in the front or the rear, it is a case of underflow
174
+ @str2_match_begin = addition.length if p1 == 0
175
+ @str2_match_end = l2 - addition.length if p1 == @len1
176
+
177
+ if p1 == 0
178
+ # leave as it is
179
+ elsif p1 == @len1
180
+ # retract from the end
181
+ @position_map_begin[p1] = p2 - addition.length
182
+ @position_map_end[p1] = @position_map_begin[p1]
183
+ else
184
+ # correct the position for end
185
+ @position_map_end[p1] = p2 - addition.length
186
+ end
187
+ elsif addition.empty? && !deletion.empty?
188
+ # If a deletion is found in the front or the rear, it is a case of overflow
189
+ @str1_match_begin = deletion.length if p1 == deletion.length
190
+ @str1_match_end = l1 - deletion.length if p1 == @len1
191
+
192
+ deletion.each{|p| @position_map_begin[p], @position_map_end[p] = p2, p2}
193
+ elsif !addition.empty? && !deletion.empty?
194
+ # If an addition and a deletion are both found in the front or the rear,
195
+ # the overflow/underflow is approximated to the difference.
196
+ al, dl = addition.length, deletion.length
197
+ @front_overflow = dl - al if p1 == dl
198
+ @rear_overflow = dl - al if p1 == @len1
199
+
200
+ @mapped_elements << [@str1[deletion[0], dl], @str2[addition[0], al]]
201
+
202
+ @position_map_begin[deletion[0]], @position_map_end[deletion[0]] = addition[0], addition[0]
203
+ deletion[1..-1].each{|p| @position_map_begin[p], @position_map_end[p] = nil, nil}
204
+ end
205
+
206
+ addition.clear; deletion.clear
207
+ p1 += l1; p2 += l2
208
+
209
+ elsif p2 < @len2 && (p1 == @len1 || @glcs[p1][p2 + 1] > @glcs[p1 + 1][p2])
210
+ diff_string2 += @str2[p2]
211
+
212
+ addition << p2
213
+ p2 += 1
214
+ elsif p1 < @len1 && (p2 == @len2 || @glcs[p1][p2 + 1] <= @glcs[p1 + 1][p2])
215
+ diff_string1 += @str1[p1]
216
+
217
+ deletion << p1
218
+ p1 += 1
219
+ end
220
+ end
221
+
222
+ @common_elements.pop
223
+ @diff_strings = [diff_string1, diff_string2]
224
+ end
225
+
226
+ # General prefix comparison is performed based on the dictionary.
227
+ # The pair of matched suffixes are returned when found.
228
+ # Otherwise, the pair of nil values are returned.
229
+ def _prefix_eq(str1, str2)
230
+ return nil, nil if str1.empty? || str2.empty?
231
+ prefixes1 = @pdic.prefixes(str1)
232
+ prefixes1.each {|p1| p2 = @dic[p1]; return p1, p2 if str2.start_with?(p2)}
233
+ return str1[0], str2[0] if (str1[0] == str2[0])
234
+ return nil, nil
235
+ end
236
236
 
237
237
  end
238
238
 
239
239
  if __FILE__ == $0
240
240
 
241
- dictionary = [
242
- ["×", "x"], #U+00D7 (multiplication sign)
243
- ["•", "*"], #U+2022 (bullet)
244
- ["Δ", "delta"], #U+0394 (greek capital letter delta)
245
- ["Φ", "phi"], #U+03A6 (greek capital letter phi)
246
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
247
- ["β", "beta"], #U+03B2 (greek small letter beta)
248
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
249
- ["δ", "delta"], #U+03B4 (greek small letter delta)
250
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
251
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
252
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
253
- ["μ", "mu"], #U+03BC (greek small letter mu)
254
- ["χ", "chi"], #U+03C7 (greek small letter chi)
255
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
256
- [" ", " "], #U+2009 (thin space)
257
- [" ", " "], #U+200A (hair space)
258
- [" ", " "], #U+00A0 (no-break space)
259
- [" ", " "], #U+3000 (ideographic space)
260
- ["−", "-"], #U+2212 (minus sign)
261
- ["–", "-"], #U+2013 (en dash)
262
- ["′", "'"], #U+2032 (prime)
263
- ["‘", "'"], #U+2018 (left single quotation mark)
264
- ["’", "'"], #U+2019 (right single quotation mark)
265
- ["“", '"'], #U+201C (left double quotation mark)
266
- ["”", '"'] #U+201D (right double quotation mark)
267
- ]
268
-
269
- # str1 = "-betakappaxyz-"
270
- # str2 = "-ijkβκ-"
271
-
272
- # str1 = "-βκ-β-z-xy"
273
- # str2 = "abc-betakappa-beta-z"
274
-
275
- # str1 = "-βκ-z-xy"
276
- # str2 = "abc-betakappa-z"
277
-
278
- # str1 = "abc-βκ-β-z"
279
- # str2 = "-betakappa-beta-z-xyz"
280
-
281
- # str1 = "-β-"
282
- # str2 = "-beta-"
283
-
284
- # str1 = "-κ-"
285
- # str2 = "-kappa-"
286
-
287
- # str1 = File.read(ARGV[0]).strip
288
- # str2 = File.read(ARGV[1]).strip
289
-
290
- str1 = "beta"
291
- str2 = "β***"
292
-
293
- # puts "str1: #{str1}"
294
- # puts "str2: #{str2}"
295
- sa = TextAlignment::GLCSAlignment.new(str1, str2, dictionary)
296
- sa.position_map_begin.each {|h| p h}
297
- puts '-----'
298
- sa.position_map_end.each {|h| p h}
299
- puts '-----'
300
- puts "common_elements: #{sa.common_elements}"
301
- puts '-----'
302
- puts "mapped_elements: #{sa.mapped_elements}"
303
- puts '-----'
304
- # puts "diff_string1: #{sa.diff_strings[0]}"
305
- # puts "diff_string2: #{sa.diff_strings[1]}"
306
- puts "front_overflow: #{sa.front_overflow}"
307
- puts "rear_overflow : #{sa.rear_overflow}"
308
- puts '-----'
309
- puts "similarity : #{sa.similarity}"
310
- puts "similarity(cut): #{sa.similarity(true)}"
241
+ dictionary = [
242
+ ["×", "x"], #U+00D7 (multiplication sign)
243
+ ["•", "*"], #U+2022 (bullet)
244
+ ["Δ", "delta"], #U+0394 (greek capital letter delta)
245
+ ["Φ", "phi"], #U+03A6 (greek capital letter phi)
246
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
247
+ ["β", "beta"], #U+03B2 (greek small letter beta)
248
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
249
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
250
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
251
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
252
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
253
+ ["μ", "mu"], #U+03BC (greek small letter mu)
254
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
255
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
256
+ [" ", " "], #U+2009 (thin space)
257
+ [" ", " "], #U+200A (hair space)
258
+ [" ", " "], #U+00A0 (no-break space)
259
+ [" ", " "], #U+3000 (ideographic space)
260
+ ["−", "-"], #U+2212 (minus sign)
261
+ ["–", "-"], #U+2013 (en dash)
262
+ ["′", "'"], #U+2032 (prime)
263
+ ["‘", "'"], #U+2018 (left single quotation mark)
264
+ ["’", "'"], #U+2019 (right single quotation mark)
265
+ ["“", '"'], #U+201C (left double quotation mark)
266
+ ["”", '"'] #U+201D (right double quotation mark)
267
+ ]
268
+
269
+ # str1 = "-betakappaxyz-"
270
+ # str2 = "-ijkβκ-"
271
+
272
+ # str1 = "-βκ-β-z-xy"
273
+ # str2 = "abc-betakappa-beta-z"
274
+
275
+ # str1 = "-βκ-z-xy"
276
+ # str2 = "abc-betakappa-z"
277
+
278
+ # str1 = "abc-βκ-β-z"
279
+ # str2 = "-betakappa-beta-z-xyz"
280
+
281
+ # str1 = "-β-"
282
+ # str2 = "-beta-"
283
+
284
+ # str1 = "-κ-"
285
+ # str2 = "-kappa-"
286
+
287
+ # str1 = File.read(ARGV[0]).strip
288
+ # str2 = File.read(ARGV[1]).strip
289
+
290
+ str1 = "beta"
291
+ str2 = "β***"
292
+
293
+ # puts "str1: #{str1}"
294
+ # puts "str2: #{str2}"
295
+ sa = TextAlignment::GLCSAlignment.new(str1, str2, dictionary)
296
+ sa.position_map_begin.each {|h| p h}
297
+ puts '-----'
298
+ sa.position_map_end.each {|h| p h}
299
+ puts '-----'
300
+ puts "common_elements: #{sa.common_elements}"
301
+ puts '-----'
302
+ puts "mapped_elements: #{sa.mapped_elements}"
303
+ puts '-----'
304
+ # puts "diff_string1: #{sa.diff_strings[0]}"
305
+ # puts "diff_string2: #{sa.diff_strings[1]}"
306
+ puts "front_overflow: #{sa.front_overflow}"
307
+ puts "rear_overflow : #{sa.rear_overflow}"
308
+ puts '-----'
309
+ puts "similarity : #{sa.similarity}"
310
+ puts "similarity(cut): #{sa.similarity(true)}"
311
311
  end