text_alignment 0.6.4 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 87f945e356349ed709996d88ed39c8ba5b83622bde1c7fd7b9e5ff63504615c2
4
- data.tar.gz: acb6e716113238c39b59a8358928de1bd936382308961a57e2c60e7bc462726f
3
+ metadata.gz: 0c49793b84e9ca5606a8fe9151530f6732337c2b92cfd1af1549a56ea3c2f39e
4
+ data.tar.gz: 4a0b8328d4c6de43af50bd8c278f83facef311c74516dc3a9a0c9dd5f91fbfc0
5
5
  SHA512:
6
- metadata.gz: 4d5b862bb50b4111c6bd390e458d6761303dc394f2fa7dc9d6b821ee7461541705aecac925f700e5124eb282112567e52a51a9f15b84fa8349da25baaf68fdd9
7
- data.tar.gz: a044608a58181e98664a26f410a7d59927dc4d39db8d49a147666f64254e23728ceccaa781a590712b7a74b57222cc449c37eb43a709d3f16da60aa3a55c2e6f
6
+ metadata.gz: ad70de67b4a7b38290a59d89bed7cd9fa343ce3db62bb67b779e51b499929b9d7c4697871cf25017fa59a34c0525741d5da679559fc2f33d902fcda692f9f1ac
7
+ data.tar.gz: 6242b35cbb8f53effc477b508d428d9060cba2e93895496134963755cff956f3931411a87a7bc0d33b7b91459f1a27eb67c422897693320e1b2ceb06eff6e22b
@@ -26,33 +26,43 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_mdoc(source_annotations, target_annotations)
29
+ def align_denotations(denotations, source_text, target_text, debug = false)
30
+ alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
31
+ new_denotations = alignment.transform_hdenotations(denotations)
32
+
33
+ if debug
34
+ warn "[block alignment]"
35
+ warn alignment.alignment_show
36
+ warn "-----"
37
+ end
38
+
39
+ lost_annotations = alignment.lost_annotations
40
+ unless lost_annotations.empty?
41
+ warn "\n[lost annotations] #{lost_annotations.length}"
42
+ lost_annotations.each do |a|
43
+ warn "#{a}"
44
+ end
45
+ warn "====="
46
+ end
47
+ warn
48
+
49
+ # return target annotations
50
+ new_denotations
51
+ end
52
+
53
+ def align_mannotations(source_annotations, target_text, debug = false)
54
+ target_annotations = {text:target_text}
55
+
30
56
  idnum_denotations = 0
31
57
  idnum_relations = 0
32
58
  idnum_attributes = 0
33
59
  idnum_modifications = 0
34
60
 
35
- source_annotations.each do |annotations|
36
- alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
-
38
- puts alignment.alignment_show
39
- puts "-----"
40
- puts
41
-
42
- # alignment.block_alignments.each do |a|
43
- # p {source:a[:source], target:a[:target]}
44
- # puts "--"
45
- # p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
46
- # puts "--"
47
- # puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
48
- # puts "--"
49
- # puts target_text[a[:target][:begin] ... a[:target][:end]]
50
- # puts "======"
51
- # end
52
-
61
+ source_annotations.each_with_index do |annotations, i|
53
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
54
63
  ididx = {}
55
- denotations = alignment.transform_hdenotations(annotations[:denotations])
64
+ warn "[#{i}]-=-=-=-=-"
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
56
66
  denotations.each do |d|
57
67
  reid = 'T' + (idnum_denotations += 1).to_s
58
68
  ididx[d[:id]] = reid
@@ -101,141 +111,11 @@ end
101
111
  source_annotations = read_annotations(ARGV[0])
102
112
  target_text = read_text(ARGV[1])
103
113
 
104
- lost_annotations = []
105
114
  target_annotations = if source_annotations.class == Array
106
- align_mdoc(source_annotations, {text: target_text})
115
+ align_mannotations(source_annotations, target_text, false)
107
116
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
109
-
110
- # pp alignment
111
-
112
- # verification
113
- # source_text = source_annotations[:text]
114
- # puts "=====BEGIN"
115
- # (0 ... source_text.rstrip.length).each do |p|
116
- # t = alignment.transform_begin_position(p)
117
- # if t.nil?
118
- # print source_text[p]
119
- # else
120
- # print '.'
121
- # end
122
- # end
123
- # puts
124
- # puts "=====END"
125
-
126
- # puts "=====BEGIN"
127
- # (0 .. source_text.rstrip.length).each do |p|
128
- # t = alignment.transform_end_position(p)
129
- # if t.nil?
130
- # print source_text[p]
131
- # else
132
- # print '.'
133
- # end
134
- # end
135
- # puts
136
- # puts "=====END"
137
-
138
- source_text = source_annotations[:text]
139
-
140
- puts "[block alignment]"
141
- puts alignment.alignment_show
142
- puts "====="
143
- # exit
144
-
145
- # verification of source denotations
146
- puts "[Invalid source denotations]"
147
- source_annotations[:denotations] do |d|
148
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
149
- end
150
- puts "====="
151
- puts
152
-
153
- denotations = alignment.transform_hdenotations(source_annotations[:denotations])
154
- puts "[Invalid transformation]"
155
- denotations.each do |d|
156
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
157
- end
158
- puts "====="
159
- puts
160
-
161
- lost_annotations += alignment.lost_annotations if alignment.lost_annotations
162
-
117
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
163
118
  source_annotations.merge({text:target_text, denotations:denotations})
164
119
  end
165
120
 
166
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
167
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
168
- source_annotations.each do |annotations|
169
- num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
170
- num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
171
- num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
172
- num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
173
- end
174
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
175
- else
176
- num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
177
- num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
178
- num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
179
- num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
180
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
181
- end
182
-
183
- warn "[source]"
184
- warn "denotations:\t#{num_denotations_source}"
185
- # warn "relations:\t#{num_relations_source}"
186
- # warn "attributes:\t#{num_attributes_source}"
187
- # warn "modifications:\t#{num_modifications_source}"
188
-
189
- warn "\n[target]"
190
- warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
191
- # warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
192
- # warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
193
- # warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
194
-
195
- if lost_annotations
196
- warn "\n[lost annotations]"
197
- warn "#{lost_annotations.length}"
198
- end
199
-
200
- #puts target_annotations.to_json
201
-
202
- # denotations = anns1[:denotations]
203
-
204
- # puts "[Alignment1]====="
205
- # align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
206
-
207
- # align.alignment.each do |a|
208
- # p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
209
- # end
210
-
211
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
212
- # puts
213
- # puts "[Similarity]\n#{align.similarity}"
214
- # puts
215
- # puts '[Denotations original]'
216
- # pp denotations
217
- # puts
218
- # puts '[Denotations transformed]'
219
- # new_denotations = align.transform_hdenotations(denotations)
220
- # pp new_denotations
221
- # puts
222
- # puts "[Alignment2 (downcased)]====="
223
- # align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
224
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
225
- # puts
226
- # puts "[Similarity]\n#{align.similarity}"
227
- # puts
228
- # puts '[Denotations original]'
229
- # pp denotations
230
- # puts
231
- # puts '[Denotations transformed]'
232
- # new_denotations = align.transform_hdenotations(denotations)
233
- # pp new_denotations
234
- # puts
235
- # puts '[Annotations transformed]'
236
- # anns2[:denotations] = new_denotations
237
- # puts anns2.to_json
238
-
239
- # p align.common_elements
240
- # puts "---------------"
241
- # p align.mapped_elements
121
+ # puts target_annotations.to_json
@@ -1,7 +1,7 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
- TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
4
+ TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
7
  TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -1,74 +1,172 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::MAPPINGS = [
4
- ["©", "(c)"], #U+00A9 (Copyright Sign)
5
-
6
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
- ["β", "beta"], #U+03B2 (greek small letter beta)
8
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
- ["δ", "delta"], #U+03B4 (greek small letter delta)
10
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
- ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
- ["η", "eta"], #U+03B7 (greek small letter eta)
13
- ["θ", "theta"], #U+03B8 (greek small letter theta)
14
- ["ι", "iota"], #U+03B9 (greek small letter iota)
15
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
- ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
- ["μ", "mu"], #U+03BC (greek small letter mu)
19
- ["ν", "nu"], #U+03BD (greek small letter nu)
20
- ["ξ", "xi"], #U+03BE (greek small letter xi)
21
- ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
- ["π", "pi"], #U+03C0 (greek small letter pi)
23
- ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
- ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
- ["τ", "tau"], #U+03C4 (greek small letter tau)
26
- ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
- ["φ", "phi"], #U+03C6 (greek small letter phi)
28
- ["χ", "chi"], #U+03C7 (greek small letter chi)
29
- ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
- ["ω", "omega"], #U+03C9 (greek small letter omega)
31
-
32
- ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
- ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
- ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
- ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
- ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
- ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
- ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
- ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
- ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
- ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
- ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
- ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
- ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
- ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
- ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
- ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
- ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
- ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
- ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
- ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
- ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
- ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
- ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
- ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
- ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
-
58
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
-
60
- ["×", "x"], #U+00D7 (multiplication sign)
61
- ["•", "*"], #U+2022 (bullet)
62
- [" ", " "], #U+2009 (thin space)
63
- [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
65
- [" ", " "], #U+3000 (ideographic space)
66
- ["−", "-"], #U+2212 (minus sign)
67
- ["–", "-"], #U+2013 (en dash)
68
- ["′", "'"], #U+2032 (prime)
69
- ["‘", "'"], #U+2018 (left single quotation mark)
70
- ["’", "'"], #U+2019 (right single quotation mark)
71
- ["“", '"'], #U+201C (left double quotation mark)
72
- ["”", '"'], #U+201D (right double quotation mark)
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
+
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
+
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
+
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
+
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (no-break space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
67
+ ["−", "-"], #U+2212 (minus sign)
68
+ ["–", "-"], #U+2013 (en dash)
69
+ ["′", "'"], #U+2032 (prime)
70
+ ["‘", "'"], #U+2018 (left single quotation mark)
71
+ ["’", "'"], #U+2019 (right single quotation mark)
72
+ ["“", '"'], #U+201C (left double quotation mark)
73
+ ["”", '"'], #U+201D (right double quotation mark)
73
74
  ['"', "''"]
74
- ]
75
+ ]
76
+
77
+
78
+ TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
79
+
80
+
81
+ class << TextAlignment
82
+ def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
+ _mappings ||= TextAlignment::MAPPINGS
84
+
85
+ character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
+ if character_mappings.empty?
87
+ [_str1, _str2, _mappings]
88
+ else
89
+ characters_from = character_mappings.collect{|m| m[0]}.join
90
+ characters_to = character_mappings.collect{|m| m[1]}.join
91
+ characters_to.gsub!(/-/, '\-')
92
+
93
+ str1 = _str1.tr(characters_from, characters_to)
94
+ str2 = _str2.tr(characters_from, characters_to)
95
+
96
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
97
+
98
+ [str1, str2, mappings]
99
+ end
100
+ end
101
+
102
+ def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
+ _mappings ||= TextAlignment::MAPPINGS
104
+
105
+ long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
+ if long_to_one_mappings.empty?
107
+ [_str1, _str2, _mappings]
108
+ else
109
+ ## long to one character mappings
110
+ pletters = TextAlignment::PADDING_LETTERS
111
+
112
+ # find the padding letter for str1
113
+ @padding_letter1 = begin
114
+ i = pletters.index{|l| _str2.index(l).nil?}
115
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
+ TextAlignment::PADDING_LETTERS[i]
117
+ end
118
+
119
+ # find the padding letter for str2
120
+ @padding_letter2 = begin
121
+ i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
+ TextAlignment::PADDING_LETTERS[i]
124
+ end
125
+
126
+ str1 = str2 = nil
127
+ long_to_one_mappings.each do |f|
128
+ from = f[1]
129
+
130
+ str1 = if _str2.index(f[0])
131
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
+ _str1.gsub(from, to)
133
+ else
134
+ _str1
135
+ end
136
+
137
+ str2 = if _str1.index(f[0])
138
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
+ _str2.gsub(from, to)
140
+ else
141
+ _str2
142
+ end
143
+ end
144
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
145
+
146
+ [str1, str2, mappings]
147
+ end
148
+ end
149
+
150
+ def compute_similarity(_s1, _s2, sdiff)
151
+ return 0 if sdiff.nil?
152
+
153
+ # compute the lcs only with non-whitespace letters
154
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
+ return 0 if lcs == 0
156
+
157
+ s1 = if @padding_letter1
158
+ _s1.tr(@padding_letter1, ' ')
159
+ else
160
+ _s1
161
+ end
162
+
163
+ s2 = if @padding_letter2
164
+ _s2.tr(@padding_letter2, ' ')
165
+ else
166
+ _s2
167
+ end
168
+
169
+ similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
170
+ end
171
+
172
+ end
@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(_str1, _str2)
20
+ def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = string_preprocessing(_str1, _str2)
23
+ str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
24
24
 
25
25
  _compute_mixed_alignment(str1, str2, mappings)
26
26
  end
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
63
63
  end
64
64
 
65
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = compute_similarity(str1, str2, @sdiff)
66
+ @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
67
67
  @str1_match_initial = cmp.str1_match_initial
68
68
  @str1_match_final = cmp.str1_match_final
69
69
  @str2_match_initial = cmp.str2_match_initial
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
139
139
  @position_map_end = posmap_end.sort.to_h
140
140
  end
141
141
 
142
- private
143
-
144
- def string_preprocessing(_str1, _str2)
145
- str1 = _str1.dup
146
- str2 = _str2.dup
147
- mappings = TextAlignment::MAPPINGS.dup
148
-
149
- ## single character mappings
150
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
- characters_from = character_mappings.collect{|m| m[0]}.join
152
- characters_to = character_mappings.collect{|m| m[1]}.join
153
- characters_to.gsub!(/-/, '\-')
154
-
155
- str1.tr!(characters_from, characters_to)
156
- str2.tr!(characters_from, characters_to)
157
-
158
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
-
160
- ## long to one character mappings
161
- pletters = TextAlignment::PADDING_LETTERS
162
-
163
- # find the padding letter for str1
164
- @padding_letter1 = begin
165
- i = pletters.index{|l| str2.index(l).nil?}
166
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
- TextAlignment::PADDING_LETTERS[i]
168
- end
169
-
170
- # find the padding letter for str2
171
- @padding_letter2 = begin
172
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
- TextAlignment::PADDING_LETTERS[i]
175
- end
176
-
177
- # ASCII foldings
178
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
- ascii_foldings.each do |f|
180
- from = f[1]
181
-
182
- if str2.index(f[0])
183
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
- str1.gsub!(from, to)
185
- end
186
-
187
- if str1.index(f[0])
188
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
- str2.gsub!(from, to)
190
- end
191
- end
192
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
-
194
- [str1, str2, mappings]
195
- end
196
-
197
- def compute_similarity(_s1, _s2, sdiff)
198
- return 0 if sdiff.nil?
199
-
200
- # compute the lcs only with non-whitespace letters
201
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
- return 0 if lcs == 0
203
-
204
- s1 = _s1.tr(@padding_letter1, ' ')
205
- s2 = _s2.tr(@padding_letter2, ' ')
206
-
207
- similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
- end
209
-
210
142
  end
@@ -5,50 +5,44 @@ require 'text_alignment/mixed_alignment'
5
5
 
6
6
  module TextAlignment; end unless defined? TextAlignment
7
7
 
8
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
-
10
8
  class TextAlignment::TextAlignment
11
9
  attr_reader :block_alignment
12
10
  attr_reader :similarity
13
11
  attr_reader :lost_annotations
14
12
 
15
- def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
13
+ def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
17
15
 
18
- @block_alignment = {source_text:str1, target_text:str2}
16
+ @block_alignment = {source_text:_str1, target_text:_str2}
17
+ @original_str1 = _str1
18
+ @original_str2 = _str2
19
19
 
20
- # try exact match
21
- block_begin = str2.index(str1)
22
- unless block_begin.nil?
23
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
24
- return @block_alignment
25
- end
20
+ str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
26
21
 
27
- # try exact match
28
- block_begin = str2.downcase.index(str1.downcase)
29
- unless block_begin.nil?
30
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
31
- return @block_alignment
22
+ if r = whole_block_alignment(str1, str2)
23
+ @block_alignment[:blocks] = r
24
+ return
32
25
  end
33
26
 
27
+ ## to find block alignments
34
28
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
35
29
 
36
- # To collect matched blocks
37
- mblocks = []
38
- while anchor = anchor_finder.get_next_anchor
39
- last = mblocks.last
40
- if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
41
- last[:source][:end] = anchor[:source][:end]
42
- last[:target][:end] = anchor[:target][:end]
30
+ blocks = []
31
+ while block = anchor_finder.get_next_anchor
32
+ last = blocks.last
33
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
34
+ last[:source][:end] = block[:source][:end]
35
+ last[:target][:end] = block[:target][:end]
43
36
  else
44
- mblocks << anchor
37
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
45
38
  end
46
39
  end
47
40
 
48
- # pp mblocks
41
+ # pp blocks
49
42
  # puts "-----"
50
43
  # puts
51
- # mblocks.each do |b|
44
+ # exit
45
+ # blocks.each do |b|
52
46
  # p [b[:source], b[:target]]
53
47
  # puts "---"
54
48
  # puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -60,114 +54,218 @@ class TextAlignment::TextAlignment
60
54
  # puts "-=-=-=-=-"
61
55
  # puts
62
56
 
63
- ## To find block alignments
64
- @block_alignment[:blocks] = []
65
- return if mblocks.empty?
66
-
67
- # Initial step
68
- if mblocks[0][:source][:begin] > 0
69
- e1 = mblocks[0][:source][:begin]
70
- e2 = mblocks[0][:target][:begin]
57
+ ## to fill the gaps
58
+ last_block = nil
59
+ blocks2 = blocks.inject([]) do |sum, block|
60
+ b1 = last_block ? last_block[:source][:end] : 0
61
+ e1 = block[:source][:begin]
71
62
 
72
- if mblocks[0][:target][:begin] == 0
73
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
63
+ sum += if b1 == e1
64
+ [block]
74
65
  else
75
- _str1 = str1[0 ... e1]
76
- _str2 = str2[0 ... e2]
66
+ b2 = last_block ? last_block[:target][:end] : 0
67
+ e2 = block[:target][:begin]
68
+
69
+ if b2 == e2
70
+ [
71
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
72
+ block
73
+ ]
74
+ else
75
+ if b1 == 0 && b2 == 0
76
+ len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
77
+ b2 = e2 - len_buffer if e2 > len_buffer
78
+ end
77
79
 
78
- unless _str1.strip.empty?
79
- if _str2.strip.empty?
80
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
80
+ _str1 = str1[b1 ... e1]
81
+ _str2 = str2[b2 ... e2]
82
+
83
+ if _str1.strip.empty? || _str2.strip.empty?
84
+ [
85
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
86
+ block
87
+ ]
81
88
  else
82
- len_min = [_str1.length, _str2.length].min
83
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
84
- b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
85
- b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
86
-
87
- @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
88
-
89
- _str1 = str1[b1 ... e1]
90
- _str2 = str2[b2 ... e2]
91
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
92
- if alignment.similarity < 0.5
93
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
94
- else
95
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
96
- end
89
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
97
90
  end
98
91
  end
99
92
  end
93
+
94
+ last_block = block
95
+ sum
100
96
  end
101
- @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
102
-
103
- (1 ... mblocks.length).each do |i|
104
- b1 = mblocks[i - 1][:source][:end]
105
- b2 = mblocks[i - 1][:target][:end]
106
- e1 = mblocks[i][:source][:begin]
107
- e2 = mblocks[i][:target][:begin]
108
- _str1 = str1[b1 ... e1]
109
- _str2 = str2[b2 ... e2]
110
- unless _str1.strip.empty?
111
- if _str2.strip.empty?
112
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
97
+
98
+ # the last step
99
+ blocks2 += if last_block.nil?
100
+ local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
101
+ else
102
+ b1 = last_block[:source][:end]
103
+ if b1 < str1.length
104
+ e1 = str1.length
105
+
106
+ b2 = last_block[:target][:end]
107
+ if b2 < str2.length
108
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
109
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
110
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
113
111
  else
114
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
115
- if alignment.similarity < 0.5
116
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
117
- else
118
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
112
+ [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
113
+ end
114
+ else
115
+ []
116
+ end
117
+ end
118
+
119
+ @block_alignment[:blocks] = blocks2
120
+ end
121
+
122
+ def whole_block_alignment(str1, str2)
123
+ ## Block exact match
124
+ block_begin = str2.index(str1)
125
+ unless block_begin.nil?
126
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
+ end
128
+
129
+ block_begin = str2.downcase.index(str1.downcase)
130
+ unless block_begin.nil?
131
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
132
+ end
133
+
134
+ nil
135
+ end
136
+
137
+ def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
138
+ block2 = str2[b2 ... e2]
139
+
140
+ ## term-based alignment
141
+ tblocks = if denotations
142
+ ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
143
+ sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
144
+ map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
145
+
146
+ position = 0
147
+ tblocks = ds_in_scope.map do |term|
148
+ lex = term[:lex]
149
+ r = block2.index(lex, position)
150
+ if r.nil?
151
+ position = nil
152
+ break
153
+ end
154
+ position = r + lex.length
155
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
156
+ end
157
+
158
+ # missing term found
159
+ tblocks = [] if position.nil?
160
+
161
+ # redundant matching found
162
+ unless position.nil?
163
+ ds_in_scope.each do |term|
164
+ lex = term[:lex]
165
+ look_forward = block2.index(lex, position)
166
+ unless look_forward.nil?
167
+ tblocks = []
168
+ break
119
169
  end
120
170
  end
121
171
  end
122
- @block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
172
+
173
+ tblocks
174
+ else
175
+ []
123
176
  end
124
177
 
125
- # Final step
126
- if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
127
- b1 = mblocks[-1][:source][:end]
128
- b2 = mblocks[-1][:target][:end]
129
- _str1 = str1[b1 ... str1.length]
130
- _str2 = str2[b2 ... str2.length]
178
+ if tblocks.empty?
179
+ if b1 == 0 && e1 == str1.length
180
+ if (e1 > 2000) || (e2 > 2000)
181
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
182
+ else
183
+ block1 = str1[b1 ... e1]
184
+ block2 = str2[b2 ... e2]
131
185
 
132
- unless _str1.strip.empty?
133
- if _str2.strip.empty?
134
- @block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
186
+ ## character-based alignment
187
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
188
+ if alignment.sdiff.nil?
189
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
190
+ else
191
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
192
+ end
193
+ end
194
+ else
195
+ block1 = str1[b1 ... e1]
196
+ block2 = str2[b2 ... e2]
197
+
198
+ ## character-based alignment
199
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
200
+ if alignment.sdiff.nil?
201
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
135
202
  else
136
- len_min = [_str1.length, _str2.length].min
137
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
138
- e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
139
- e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
140
- _str1 = str1[b1 ... e1]
141
- _str2 = str2[b2 ... e2]
203
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
204
+ end
205
+ end
206
+ else
207
+ last_tblock = nil
208
+ lblocks = tblocks.inject([]) do |sum, tblock|
209
+ tb1 = last_tblock ? last_tblock[:source][:end] : b1
210
+ te1 = tblock[:source][:begin]
142
211
 
143
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
144
- if alignment.similarity < 0.5
145
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
212
+ sum += if te1 == tb1
213
+ [tblock]
214
+ else
215
+ tb2 = last_tblock ? last_tblock[:target][:end] : b2
216
+ te2 = tblock[:target][:begin]
217
+
218
+ if b2 == e2
219
+ [
220
+ {source:{begin:tb1, end:te1}, alignment: :empty},
221
+ tblock
222
+ ]
146
223
  else
147
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
224
+ [
225
+ {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
226
+ tblock
227
+ ]
148
228
  end
229
+ end
230
+
231
+ last_tblock = tblock
232
+ sum
233
+ end
149
234
 
150
- @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
235
+ if last_tblock[:source][:end] < e1
236
+ if last_tblock[:target][:end] < e2
237
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
238
+ else
239
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
151
240
  end
152
241
  end
153
- end
154
242
 
155
- @block_alignment[:blocks].each do |a|
156
- a[:delta] = a[:target][:begin] - a[:source][:begin]
243
+ lblocks
157
244
  end
158
245
  end
159
246
 
247
+
248
+ def indices(str, target)
249
+ position = 0
250
+ len = target.len
251
+ Enumerator.new do |yielder|
252
+ while idx = str.index(target, position)
253
+ yielder << idx
254
+ position = idx + len
255
+ end
256
+ end
257
+ end
258
+
160
259
  def transform_begin_position(begin_position)
161
260
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
162
261
  block = @block_alignment[:blocks][i]
163
262
 
164
- b = if block[:alignment] == :block
263
+ b = if block[:alignment] == :block || block[:alignment] == :term
165
264
  begin_position + block[:delta]
166
265
  elsif block[:alignment] == :empty
167
266
  if begin_position == block[:source][:begin]
168
267
  block[:target][:begin]
169
268
  else
170
- # raise "lost annotation"
171
269
  nil
172
270
  end
173
271
  else
@@ -180,13 +278,12 @@ class TextAlignment::TextAlignment
180
278
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
181
279
  block = @block_alignment[:blocks][i]
182
280
 
183
- e = if block[:alignment] == :block
281
+ e = if block[:alignment] == :block || block[:alignment] == :term
184
282
  end_position + block[:delta]
185
283
  elsif block[:alignment] == :empty
186
284
  if end_position == block[:source][:end]
187
285
  block[:target][:end]
188
286
  else
189
- # raise "lost annotation"
190
287
  nil
191
288
  end
192
289
  else
@@ -208,14 +305,14 @@ class TextAlignment::TextAlignment
208
305
  @lost_annotations = []
209
306
 
210
307
  denotations.each do |d|
211
- begin
212
- d.begin = transform_begin_position(d.begin);
213
- d.end = transform_end_position(d.end);
214
- rescue
215
- @lost_annotations << d
216
- d.begin = nil
217
- d.end = nil
218
- end
308
+ source = {begin:d.begin, end:d.end}
309
+ d.begin = transform_begin_position(d.begin);
310
+ d.end = transform_end_position(d.end);
311
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
312
+ rescue
313
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
314
+ d.begin = nil
315
+ d.end = nil
219
316
  end
220
317
 
221
318
  @lost_annotations
@@ -226,12 +323,12 @@ class TextAlignment::TextAlignment
226
323
  @lost_annotations = []
227
324
 
228
325
  r = hdenotations.collect do |d|
229
- new_d = begin
230
- d.dup.merge({span:transform_a_span(d[:span])})
231
- rescue
232
- @lost_annotations << d
233
- nil
234
- end
326
+ t = transform_a_span(d[:span])
327
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
328
+ new_d = d.dup.merge({span:t})
329
+ rescue
330
+ @lost_annotations << {source: d[:span], target:t}
331
+ nil
235
332
  end.compact
236
333
 
237
334
  r
@@ -245,14 +342,22 @@ class TextAlignment::TextAlignment
245
342
  @block_alignment[:blocks].each do |a|
246
343
  show += case a[:alignment]
247
344
  when :block
248
- "===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
345
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
346
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
347
+ when :term
348
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
249
349
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
250
350
  when :empty
251
351
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
252
352
  "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
253
353
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
254
- ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
255
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
354
+ ">>>>> string 2 " +
355
+ if a[:target]
356
+ "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
357
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
358
+ else
359
+ "[-]\n\n"
360
+ end
256
361
  else
257
362
  astr1 = ''
258
363
  astr2 = ''
@@ -292,5 +397,4 @@ class TextAlignment::TextAlignment
292
397
  end
293
398
  show
294
399
  end
295
-
296
400
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6.4'
2
+ VERSION = '0.8.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.4
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2020-10-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary