text_alignment 0.2.9 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +225 -39
- data/lib/text_alignment/anchor_finder.rb +146 -0
- data/lib/text_alignment/approximate_fit.rb +50 -52
- data/lib/text_alignment/find_divisions.rb +198 -200
- data/lib/text_alignment/glcs_alignment.rb +297 -297
- data/lib/text_alignment/glcs_alignment_fast.rb +94 -94
- data/lib/text_alignment/glcs_required.rb +50 -50
- data/lib/text_alignment/lcs_alignment.rb +115 -115
- data/lib/text_alignment/lcs_cdiff.rb +46 -48
- data/lib/text_alignment/lcs_comparison.rb +53 -53
- data/lib/text_alignment/lcs_min.rb +144 -138
- data/lib/text_alignment/mappings.rb +68 -69
- data/lib/text_alignment/mixed_alignment.rb +193 -0
- data/lib/text_alignment/text_alignment.rb +232 -174
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +1 -1
- metadata +5 -13
- data/spec/spec_helper.rb +0 -1
- data/spec/text_alignment/glcs_alignment_spec.rb +0 -302
- data/spec/text_alignment/lcs_alignment_spec.rb +0 -98
- data/spec/text_alignment/lcs_comparision_spec.rb +0 -322
- data/spec/text_alignment/text_alignment_spec.rb +0 -302
@@ -7,211 +7,209 @@ module TextAlignment; end unless defined? TextAlignment
|
|
7
7
|
# to work on the hash representation of denotations
|
8
8
|
# to assume that there is no bag representation to this method
|
9
9
|
|
10
|
-
|
11
|
-
TextAlignment::SIMILARITY_THRESHOLD = 0.7
|
12
|
-
end
|
10
|
+
TextAlignment::SIMILARITY_THRESHOLD = 0.7 unless defined? TextAlignment::SIMILARITY_THRESHOLD
|
13
11
|
|
14
12
|
class << TextAlignment

  # Finds, among the targets, the divisions that the source text fits in.
  #
  # source   - String to be located. It is modified in place: single-character
  #            mappings are applied to it with String#tr!.
  # targets  - non-empty Array of Hashes; each is read through :text (String)
  #            and :divid. The :text values are modified in place with the same
  #            character mappings, and the array itself is sorted in place by
  #            text length.
  # mappings - Array of [from, to] String pairs. Pairs whose both sides are one
  #            character long are applied as a character translation and are
  #            removed from the array (in place).
  #
  # Returns the result of _find_divisions (see there for the shape).
  # Raises ArgumentError when source or mappings is nil, or when targets is
  # nil or empty.
  def find_divisions(source, targets, mappings = [])
    raise ArgumentError, "nil source" if source.nil?
    raise ArgumentError, "nil or empty targets" if targets.nil? || targets.empty?
    raise ArgumentError, "nil mappings" if mappings.nil?

    character_mappings = mappings.select { |m| m[0].length == 1 && m[1].length == 1 }
    mappings.delete_if { |m| m[0].length == 1 && m[1].length == 1 }
    characters_from = character_mappings.map { |m| m[0] }.join
    characters_to   = character_mappings.map { |m| m[1] }.join
    # a bare '-' would be read by String#tr as a range operator
    characters_to.gsub!(/-/, '\-')

    source.tr!(characters_from, characters_to)
    targets.each { |target| target[:text].tr!(characters_from, characters_to) }

    # to process smaller ones first
    targets.sort! { |s1, s2| s1[:text].size <=> s2[:text].size }

    TextAlignment._find_divisions(source, targets)
  end

  # Repeatedly picks the target that is most similar to (a part of) the
  # remaining source, records the region it occupies, and removes that region
  # from the source until either the source or the targets are exhausted, or
  # no target is similar enough.
  #
  # Works on copies: neither _source nor the _targets array is modified.
  #
  # Returns an Array of Hashes {divid:, region: [begin, end]} whose regions are
  # expressed in coordinates of the original source. When source text or
  # targets are left over, a final Hash {divid: nil} is appended, carrying
  # :remaining_source and/or :remaining_targets (the divids).
  def _find_divisions(_source, _targets)
    indice = []
    history = []
    cache = {}
    source = _source.dup
    targets = _targets.dup
    threshold = TextAlignment::SIMILARITY_THRESHOLD

    until source.strip.empty? || targets.empty?
      candidates = []

      targets.each_with_index do |target, i|
        # str1 is always the shorter string, str2 the longer one
        if source.size < target[:text].size
          mode, str1, str2 = :t_in_s, source, target[:text]
        else
          mode, str1, str2 = :s_in_t, target[:text], source
        end

        len1 = str1.length
        len2 = str2.length

        # narrow str2 down only when it is considerably longer than str1
        offset_begin, offset_end = if (len2 - len1) > len1 * (1 - threshold)
          approximate_fit(str1, str2)
        else
          # the whole source
          [0, -1]
        end

        next if offset_begin.nil?

        fragment = str2[offset_begin .. offset_end]
        key = str1 + ' _:_ ' + fragment
        cmp = (cache[key] ||= TextAlignment::LCSComparison.new(str1, fragment))

        unmatched = len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)
        if (cmp.similarity > threshold) && (unmatched < len1 * (1 - threshold))
          candidates << {idx: i, offset: offset_begin, mode: mode, cmp: cmp}
        end
      end

      # nothing similar enough is left: report the remainder below
      break if candidates.empty?

      # the most similar candidate wins (the first one in case of a tie)
      choice = candidates.max_by { |candidate| candidate[:cmp].similarity }
      m = choice[:idx]

      index = if choice[:mode] == :t_in_s
        {divid: targets[m][:divid], region: [0, source.size]}
      else # :s_in_t
        cmp = choice[:cmp]
        offset = choice[:offset]
        {divid: targets[m][:divid], region: [cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
      end

      # cut the matched region out of the remaining source
      source = source[0 ... index[:region][0]] + source[index[:region][1] .. -1]
      history << index[:region].dup

      # translate the region back to the coordinates of the original source
      # by re-inserting the gaps of the earlier cuts, the latest one first
      history[0...-1].reverse_each do |h|
        gap = h[1] - h[0]
        index[:region][0] += gap if index[:region][0] >= h[0]
        index[:region][1] += gap if index[:region][1] > h[0]
      end

      indice << index

      targets.delete_at(m)
    end

    unless source.strip.empty? && targets.empty?
      index = {divid: nil}
      index[:remaining_source] = source unless source.strip.empty?
      index[:remaining_targets] = targets.map { |s| s[:divid] } unless targets.empty?
      indice << index
    end

    indice
  end

  # Legacy recursive implementation, kept for reference.
  #
  # It takes the first target that is similar enough (not the best one) and
  # recurses on the rest of the source. Note that it removes matched entries
  # from the given targets array.
  #
  # Returns an Array of [divid, [begin, end]] pairs. When nothing matches, the
  # entry is [-1, [remaining_source, remaining_divids]].
  def _find_divisions_old(source, targets)
    mode, m, c, offset_begin = nil, nil, nil, nil
    threshold = TextAlignment::SIMILARITY_THRESHOLD

    targets.each_with_index do |target, i|
      if source.size < target[:text].size
        mode, str1, str2 = :t_in_s, source, target[:text]
      else
        mode, str1, str2 = :s_in_t, target[:text], source
      end

      len1 = str1.length
      len2 = str2.length

      offset_begin, offset_end = 0, -1
      offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - threshold)

      next if offset_begin.nil?

      c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
      if (c.similarity > threshold) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - threshold))
        m = i
        break
      end
    end

    # return remaining source and targets if m.nil?
    return [[-1, [source, targets.map { |s| s[:divid] }]]] if m.nil?

    index = if mode == :t_in_s
      [targets[m][:divid], [0, source.size]]
    else # :s_in_t
      [targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
    end

    next_source = source[0 ... index[1][0]] + source[index[1][1] .. -1]
    targets.delete_at(m)

    return [index] if next_source.strip.empty? || targets.empty?

    # Recurse into this legacy implementation: the entries are indexed as
    # arrays below, which is the shape only this method returns.
    more_index = _find_divisions_old(next_source, targets)
    gap = index[1][1] - index[1][0]
    more_index.each do |i|
      if i[0] > -1
        i[1][0] += gap if i[1][0] >= index[1][0]
        i[1][1] += gap if i[1][1] > index[1][0]
      end
    end
    [index] + more_index
  end

end
|
183
181
|
|
184
182
|
# Command-line demo: prints the division index of a source document against
# a list of target divisions.
#
# ARGV[0] - JSON file holding an object with a "text" member (the source).
# ARGV[1] - JSON file holding the targets passed to find_divisions.
if __FILE__ == $0
  require 'json'
  require 'pp'

  if ARGV.length == 2
    source = JSON.parse File.read(ARGV[0]), :symbolize_names => true
    source_text = source[:text].strip

    targets = JSON.parse File.read(ARGV[1]), :symbolize_names => true
    div_index = TextAlignment::find_divisions(source_text, targets)
    pp div_index

    # To print the matched regions instead of the raw index:
    #
    # puts "source length: #{source_text.length}"
    # div_index.each do |i|
    #   unless i[:divid].nil?
    #     puts "[Div: #{i[:divid]}] (#{i[:region][0]}, #{i[:region][1]})"
    #     puts source_text[i[:region][0] ... i[:region][1]]
    #     puts "=========="
    #   else
    #     p i
    #   end
    # end
  else
    # previously the script exited silently on a wrong number of arguments
    warn "usage: #{File.basename($0)} source.json targets.json"
  end
end
|
@@ -6,306 +6,306 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
# An instance of this class holds the results of generalized LCS computation for the two strings str1 and str2.
|
7
7
|
# An optional dictionary is used for generalized suffix comparison.
|
8
8
|
class TextAlignment::GLCSAlignment
  # The mapping function from str1 to str2
  attr_reader :position_map_begin, :position_map_end

  # The initial and final positions of matching on str1 and str2.
  # They are set only when the trace finds an addition or a deletion before
  # a match at the corresponding end; otherwise they stay nil.
  attr_reader :str1_match_begin, :str1_match_end, :str2_match_begin, :str2_match_end

  # The length of GLCS
  attr_reader :length

  # the elements that are common in the two strings, str1 and str2
  attr_reader :common_elements

  # the elements that are mapped to each other in the two strings, str1 and str2
  attr_reader :mapped_elements

  # the strings of non-mapped characters, as [of_str1, of_str2]
  attr_reader :diff_strings

  # The approximated lengths of the front and the rear part of str1 that
  # cannot fit in str2 (negative in the case of underflow).
  # They are read by #similarity when it is called with cut = true.
  attr_reader :front_overflow, :rear_overflow

  # It initializes the GLCS table for the given two strings, str1 and str2.
  # When the array, mappings, is given, general suffix comparison is performed
  # based on the mappings.
  #
  # str1, str2 - the Strings to align.
  # mappings   - Array of [String, String] pairs regarded as equivalent.
  #
  # Raises ArgumentError when nil is passed as str1, str2 or mappings.
  def initialize(str1, str2, mappings = [])
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?
    raise ArgumentError, "nil dictionary" if mappings.nil?

    # index the mappings in hash, in both directions.
    @dic = (mappings + mappings.map { |e| e.reverse }).to_h

    # prefix dictionary
    @pdic = Dictionary.new(mappings.flatten)

    @len1 = str1.length
    @len2 = str2.length

    # add a final marker to the end of the strings
    @str1 = str1 + '_'
    @str2 = str2 + '_'

    # compute the GLCS table
    @glcs = _compute_glcs_table
    @length = @glcs[0][0]

    _trace_glcs_table
  end

  # Prints the GLCS table, tab-separated, with str2 as the column header
  # and the characters of str1 as row labels.
  def show_glcs
    puts "\t\t" + @str2.split(//).join("\t")
    @glcs.each_with_index do |row, i|
      h = @str1[i] || ''
      puts i.to_s + "\t" + h + "\t" + row.join("\t")
    end
  end

  # Returns the character-by-character difference as a pair of strings,
  # padded with spaces so that corresponding elements line up.
  def cdiff
    cdiff1, cdiff2 = '', ''
    p1, p2 = 0, 0

    # the body has to run at least once, also for two empty strings
    loop do
      s1, s2 = _prefix_eq(@str1[p1...@len1], @str2[p2...@len2])
      if s1
        l1, l2 = s1.length, s2.length

        cdiff1 += s1
        cdiff2 += s2
        # pad the shorter side of a mapped pair
        if l1 > l2
          cdiff2 += ' ' * (l1 - l2)
        else
          cdiff1 += ' ' * (l2 - l1)
        end
        p1 += l1
        p2 += l2
      elsif p2 < @len2 && (p1 == @len1 || @glcs[p1][p2 + 1] > @glcs[p1 + 1][p2])
        # a character only in str2
        cdiff1 += ' '
        cdiff2 += @str2[p2]
        p2 += 1
      elsif p1 < @len1 && (p2 == @len2 || @glcs[p1][p2 + 1] <= @glcs[p1 + 1][p2])
        # a character only in str1
        cdiff1 += @str1[p1]
        cdiff2 += ' '
        p1 += 1
      end

      break if p1 == @len1 && p2 == @len2
    end

    [cdiff1, cdiff2]
  end

  # Computes the similarity of the two strings, as
  # 2 * GLCS length / (length of str1 side + length of str2 side).
  #
  # cut - when true, the magnitude of the front and rear overflow is
  #       subtracted from the str1 side before computing.
  #
  # Returns a Float.
  def similarity(cut = false)
    c = @length

    l1 = c + @diff_strings[0].length
    l2 = c + @diff_strings[1].length

    # positive overflows are subtracted and negative ones are added,
    # i.e. the absolute values are subtracted
    l1 -= front_overflow.abs + rear_overflow.abs if cut

    2 * c / (l1 + l2).to_f
  end

  # Maps a span {:begin, :end} on str1 to the corresponding span on str2.
  def transform_a_span(span)
    {:begin => @position_map_begin[span[:begin]], :end => @position_map_end[span[:end]]}
  end

  # Maps an array of spans on str1 to spans on str2.
  def transform_spans(spans)
    spans.map { |span| transform_a_span(span) }
  end

  private

  # Computes the GLCS table for the two strings, @str1 and @str2.
  # Unlike normal LCS algorithms, the computation is performed from the end
  # to the beginning of the strings.
  def _compute_glcs_table
    glcs = Array.new(@len1 + 1) { Array.new(@len2 + 1) }

    # initialize the final row and the final column
    (0..@len1).each { |p| glcs[p][@len2] = 0 }
    (0..@len2).each { |p| glcs[@len1][p] = 0 }

    # compute the GLCS table
    (@len1 - 1).downto(0) do |p1|
      (@len2 - 1).downto(0) do |p2|
        s1, s2 = _prefix_eq(@str1[p1...@len1], @str2[p2...@len2])
        glcs[p1][p2] = if s1
          glcs[p1 + s1.length][p2 + s2.length] + 1
        else
          [glcs[p1][p2 + 1], glcs[p1 + 1][p2]].max
        end
      end
    end

    glcs
  end

  # Backtraces the GLCS table, computing the mapping function from str1 to str2.
  # As its side effect, it sets the instance variables behind
  # * front_overflow: the length of the front part of str1 that cannot fit in str2.
  # * rear_overflow: the length of the rear part of str1 that cannot fit in str2.
  # * common_elements: an array which stores the common elements in the two strings.
  # * mapped_elements: an array which stores the mapped elements in the two strings.
  # as well as the position maps, the match positions and diff_strings.
  def _trace_glcs_table
    @front_overflow, @rear_overflow = 0, 0
    @common_elements, @mapped_elements = [], []
    diff_string1, diff_string2 = '', ''

    @position_map_begin, @position_map_end = {}, {}
    # positions of the unmatched characters seen since the last match
    addition, deletion = [], []
    p1, p2 = 0, 0

    # the final markers are included, so that the trace ends with a match
    while p1 <= @len1 && p2 <= @len2
      s1, s2 = _prefix_eq(@str1[p1..@len1], @str2[p2..@len2])
      if s1
        l1, l2 = s1.length, s2.length

        @position_map_begin[p1], @position_map_end[p1] = p2, p2
        (p1 + 1...p1 + l1).each { |i| @position_map_begin[i], @position_map_end[i] = nil, nil }

        @common_elements << [s1, s2]

        if !addition.empty? && deletion.empty?
          # If an addition is found in the front or the rear, it is a case of underflow
          @str2_match_begin = addition.length if p1 == 0
          @str2_match_end = l2 - addition.length if p1 == @len1

          if p1 == 0
            # leave as it is
          elsif p1 == @len1
            # retract from the end
            @position_map_begin[p1] = p2 - addition.length
            @position_map_end[p1] = @position_map_begin[p1]
          else
            # correct the position for end
            @position_map_end[p1] = p2 - addition.length
          end
        elsif addition.empty? && !deletion.empty?
          # If a deletion is found in the front or the rear, it is a case of overflow
          @str1_match_begin = deletion.length if p1 == deletion.length
          @str1_match_end = l1 - deletion.length if p1 == @len1

          deletion.each { |p| @position_map_begin[p], @position_map_end[p] = p2, p2 }
        elsif !addition.empty? && !deletion.empty?
          # If an addition and a deletion are both found in the front or the rear,
          # the overflow/underflow is approximated to the difference.
          al, dl = addition.length, deletion.length
          @front_overflow = dl - al if p1 == dl
          @rear_overflow = dl - al if p1 == @len1

          @mapped_elements << [@str1[deletion[0], dl], @str2[addition[0], al]]

          @position_map_begin[deletion[0]], @position_map_end[deletion[0]] = addition[0], addition[0]
          deletion[1..-1].each { |p| @position_map_begin[p], @position_map_end[p] = nil, nil }
        end

        addition.clear
        deletion.clear
        p1 += l1
        p2 += l2

      elsif p2 < @len2 && (p1 == @len1 || @glcs[p1][p2 + 1] > @glcs[p1 + 1][p2])
        diff_string2 += @str2[p2]

        addition << p2
        p2 += 1
      elsif p1 < @len1 && (p2 == @len2 || @glcs[p1][p2 + 1] <= @glcs[p1 + 1][p2])
        diff_string1 += @str1[p1]

        deletion << p1
        p1 += 1
      end
    end

    # drop the last entry, i.e. the match of the final markers
    @common_elements.pop
    @diff_strings = [diff_string1, diff_string2]
  end

  # General prefix comparison is performed based on the dictionary.
  # The pair of matched prefixes is returned when found: either a dictionary
  # entry prefixing str1 whose counterpart prefixes str2, or the identical
  # first characters. Otherwise, the pair of nil values is returned.
  def _prefix_eq(str1, str2)
    return nil, nil if str1.empty? || str2.empty?

    @pdic.prefixes(str1).each do |p1|
      p2 = @dic[p1]
      return p1, p2 if str2.start_with?(p2)
    end

    return str1[0], str2[0] if str1[0] == str2[0]
    return nil, nil
  end

end
|
238
238
|
|
239
239
|
# Demo: aligns two sample strings with a dictionary of equivalent
# expressions and prints the resulting maps and statistics.
if __FILE__ == $0

  # The whitespace entries are written as escapes, because the characters
  # are invisible and easily lost or normalised when the file is edited.
  dictionary = [
    ["×", "x"],         #U+00D7 (multiplication sign)
    ["•", "*"],         #U+2022 (bullet)
    ["Δ", "delta"],     #U+0394 (greek capital letter delta)
    ["Φ", "phi"],       #U+03A6 (greek capital letter phi)
    ["α", "alpha"],     #U+03B1 (greek small letter alpha)
    ["β", "beta"],      #U+03B2 (greek small letter beta)
    ["γ", "gamma"],     #U+03B3 (greek small letter gamma)
    ["δ", "delta"],     #U+03B4 (greek small letter delta)
    ["ε", "epsilon"],   #U+03B5 (greek small letter epsilon)
    ["κ", "kappa"],     #U+03BA (greek small letter kappa)
    ["λ", "lambda"],    #U+03BB (greek small letter lambda)
    ["μ", "mu"],        #U+03BC (greek small letter mu)
    ["χ", "chi"],       #U+03C7 (greek small letter chi)
    ["ϕ", "phi"],       #U+03D5 (greek phi symbol)
    ["\u2009", " "],    #U+2009 (thin space)
    ["\u200A", " "],    #U+200A (hair space)
    ["\u00A0", " "],    #U+00A0 (no-break space)
    ["\u3000", " "],    #U+3000 (ideographic space)
    ["−", "-"],         #U+2212 (minus sign)
    ["–", "-"],         #U+2013 (en dash)
    ["′", "'"],         #U+2032 (prime)
    ["‘", "'"],         #U+2018 (left single quotation mark)
    ["’", "'"],         #U+2019 (right single quotation mark)
    ["“", '"'],         #U+201C (left double quotation mark)
    ["”", '"']          #U+201D (right double quotation mark)
  ]

  # Other sample inputs:
  #
  # str1 = "-betakappaxyz-"
  # str2 = "-ijkβκ-"

  # str1 = "-βκ-β-z-xy"
  # str2 = "abc-betakappa-beta-z"

  # str1 = "-βκ-z-xy"
  # str2 = "abc-betakappa-z"

  # str1 = "abc-βκ-β-z"
  # str2 = "-betakappa-beta-z-xyz"

  # str1 = "-β-"
  # str2 = "-beta-"

  # str1 = "-κ-"
  # str2 = "-kappa-"

  # str1 = File.read(ARGV[0]).strip
  # str2 = File.read(ARGV[1]).strip

  str1 = "beta"
  str2 = "β***"

  # puts "str1: #{str1}"
  # puts "str2: #{str2}"
  sa = TextAlignment::GLCSAlignment.new(str1, str2, dictionary)
  sa.position_map_begin.each { |h| p h }
  puts '-----'
  sa.position_map_end.each { |h| p h }
  puts '-----'
  puts "common_elements: #{sa.common_elements}"
  puts '-----'
  puts "mapped_elements: #{sa.mapped_elements}"
  puts '-----'
  # puts "diff_string1: #{sa.diff_strings[0]}"
  # puts "diff_string2: #{sa.diff_strings[1]}"
  puts "front_overflow: #{sa.front_overflow}"
  puts "rear_overflow : #{sa.rear_overflow}"
  puts '-----'
  puts "similarity     : #{sa.similarity}"
  puts "similarity(cut): #{sa.similarity(true)}"
end
|