text_alignment 0.7.1 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '088ea92f4ca68c574cbd04bbf932aa70014b58cbbad82028b7161e0af35cdb4c'
4
- data.tar.gz: 455e6b53a846e7ebf0a90e724b93b78b61025ccb2d02c1e167f1969946b292e2
3
+ metadata.gz: bc60f2422e09058c8abc037a5f4c7e28a2c26c4b0defa3e157a478f6c691e85e
4
+ data.tar.gz: 3732d51c46d0597cec005396c13e5aa7c84c766232f5de0c5b90e789a2fa77f1
5
5
  SHA512:
6
- metadata.gz: 02cdb75cc9b95415c0e86d78bc0278ab8a0cf5a7afa1870ba5c10f0137e59ea782b92e8130ea2f9b3e16fd43b08c8c72da0d2b2ecf4546cb6a46a72ad62957ef
7
- data.tar.gz: 1216190e0b3880acedc6b735b70a3611469f4fcb4506f059271acbef8d84ffdee793234029c8e5f9e9d1f10cdf088ed44a8c54c93986b19757036aceaf938247
6
+ metadata.gz: '0095e5682996e5ccb6d6cc7529c40901656f169670e49d26331acad139964b528a6b3ae9c48f32844fbe2a8737f0ab66fdc4f4da51dc37808bed65e7a7447f37'
7
+ data.tar.gz: b8e00566dbcba94fbfd1d84bd7d10ac6ba7677124aa8a0676797223d4969e76917ea21013cb509762a46d14324eb28e38b1d6ad7dc26cd0fcb2a30af573e6612
@@ -26,33 +26,43 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_mdoc(source_annotations, target_annotations)
29
+ def align_denotations(denotations, source_text, target_text, debug = false)
30
+ alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
31
+ new_denotations = alignment.transform_hdenotations(denotations)
32
+
33
+ if debug
34
+ warn "[block alignment]"
35
+ warn alignment.alignment_show
36
+ warn "-----"
37
+ end
38
+
39
+ lost_annotations = alignment.lost_annotations
40
+ unless lost_annotations.empty?
41
+ warn "\n[lost annotations] #{lost_annotations.length}"
42
+ lost_annotations.each do |a|
43
+ warn "#{a}"
44
+ end
45
+ warn "====="
46
+ end
47
+ warn
48
+
49
+ # return target annotations
50
+ new_denotations
51
+ end
52
+
53
+ def align_mannotations(source_annotations, target_text, debug = false)
54
+ target_annotations = {text:target_text}
55
+
30
56
  idnum_denotations = 0
31
57
  idnum_relations = 0
32
58
  idnum_attributes = 0
33
59
  idnum_modifications = 0
34
60
 
35
- source_annotations.each do |annotations|
36
- alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
-
38
- puts alignment.alignment_show
39
- puts "-----"
40
- puts
41
-
42
- # alignment.block_alignments.each do |a|
43
- # p {source:a[:source], target:a[:target]}
44
- # puts "--"
45
- # p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
46
- # puts "--"
47
- # puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
48
- # puts "--"
49
- # puts target_text[a[:target][:begin] ... a[:target][:end]]
50
- # puts "======"
51
- # end
52
-
61
+ source_annotations.each_with_index do |annotations, i|
53
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
54
63
  ididx = {}
55
- denotations = alignment.transform_hdenotations(annotations[:denotations])
64
+ warn "[#{i}]-=-=-=-=-"
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
56
66
  denotations.each do |d|
57
67
  reid = 'T' + (idnum_denotations += 1).to_s
58
68
  ididx[d[:id]] = reid
@@ -101,126 +111,11 @@ end
101
111
  source_annotations = read_annotations(ARGV[0])
102
112
  target_text = read_text(ARGV[1])
103
113
 
104
- lost_annotations = []
105
114
  target_annotations = if source_annotations.class == Array
106
- align_mdoc(source_annotations, {text: target_text})
115
+ align_mannotations(source_annotations, target_text, false)
107
116
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
109
-
110
- # verification
111
- # source_text = source_annotations[:text]
112
- # puts "=====BEGIN"
113
- # (0 ... source_text.rstrip.length).each do |p|
114
- # t = alignment.transform_begin_position(p)
115
- # if t.nil?
116
- # print source_text[p]
117
- # else
118
- # print '.'
119
- # end
120
- # end
121
- # puts
122
- # puts "=====END"
123
-
124
- # puts "=====BEGIN"
125
- # (0 .. source_text.rstrip.length).each do |p|
126
- # t = alignment.transform_end_position(p)
127
- # if t.nil?
128
- # print source_text[p]
129
- # else
130
- # print '.'
131
- # end
132
- # end
133
- # puts
134
- # puts "=====END"
135
-
136
- source_text = source_annotations[:text]
137
-
138
- puts "[block alignment]"
139
- puts alignment.alignment_show
140
- puts "====="
141
- # exit
142
-
143
- denotations = alignment.transform_hdenotations(source_annotations[:denotations])
144
- lost_annotations += alignment.lost_annotations if alignment.lost_annotations
145
-
117
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
146
118
  source_annotations.merge({text:target_text, denotations:denotations})
147
119
  end
148
120
 
149
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
150
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
151
- source_annotations.each do |annotations|
152
- num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
153
- num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
154
- num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
155
- num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
156
- end
157
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
158
- else
159
- num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
160
- num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
161
- num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
162
- num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
163
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
164
- end
165
-
166
- warn "[source]"
167
- warn "denotations:\t#{num_denotations_source}"
168
- # warn "relations:\t#{num_relations_source}"
169
- # warn "attributes:\t#{num_attributes_source}"
170
- # warn "modifications:\t#{num_modifications_source}"
171
-
172
- warn "\n[target]"
173
- warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
174
- # warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
175
- # warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
176
- # warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
177
-
178
- if lost_annotations
179
- warn "\n[lost annotations]"
180
- lost_annotations.each do |a|
181
- p a
182
- end
183
- end
184
-
185
- #puts target_annotations.to_json
186
-
187
- # denotations = anns1[:denotations]
188
-
189
- # puts "[Alignment1]====="
190
- # align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
191
-
192
- # align.alignment.each do |a|
193
- # p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
194
- # end
195
-
196
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
197
- # puts
198
- # puts "[Similarity]\n#{align.similarity}"
199
- # puts
200
- # puts '[Denotations original]'
201
- # pp denotations
202
- # puts
203
- # puts '[Denotations transformed]'
204
- # new_denotations = align.transform_hdenotations(denotations)
205
- # pp new_denotations
206
- # puts
207
- # puts "[Alignment2 (downcased)]====="
208
- # align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
209
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
210
- # puts
211
- # puts "[Similarity]\n#{align.similarity}"
212
- # puts
213
- # puts '[Denotations original]'
214
- # pp denotations
215
- # puts
216
- # puts '[Denotations transformed]'
217
- # new_denotations = align.transform_hdenotations(denotations)
218
- # pp new_denotations
219
- # puts
220
- # puts '[Annotations transformed]'
221
- # anns2[:denotations] = new_denotations
222
- # puts anns2.to_json
223
-
224
- # p align.common_elements
225
- # puts "---------------"
226
- # p align.mapped_elements
121
+ # puts target_annotations.to_json
@@ -1,7 +1,7 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
- TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
4
+ TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
7
  TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -1,74 +1,172 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::MAPPINGS = [
4
- ["©", "(c)"], #U+00A9 (Copyright Sign)
5
-
6
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
- ["β", "beta"], #U+03B2 (greek small letter beta)
8
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
- ["δ", "delta"], #U+03B4 (greek small letter delta)
10
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
- ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
- ["η", "eta"], #U+03B7 (greek small letter eta)
13
- ["θ", "theta"], #U+03B7 (greek small letter eta)
14
- ["ι", "iota"], #U+03B7 (greek small letter eta)
15
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
- ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
- ["μ", "mu"], #U+03BC (greek small letter mu)
19
- ["ν", "nu"], #U+03BD (greek small letter nu)
20
- ["ξ", "xi"], #U+03BE (greek small letter xi)
21
- ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
- ["π", "pi"], #U+03C0 (greek small letter pi)
23
- ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
- ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
- ["τ", "tau"], #U+03C4 (greek small letter tau)
26
- ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
- ["φ", "phi"], #U+03C6 (greek small letter phi)
28
- ["χ", "chi"], #U+03C7 (greek small letter chi)
29
- ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
- ["ω", "omega"], #U+03C9 (greek small letter omega)
31
-
32
- ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
- ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
- ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
- ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
- ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
- ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
- ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
- ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
- ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
- ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
- ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
- ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
- ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
- ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
- ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
- ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
- ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
- ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
- ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
- ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
- ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
- ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
- ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
- ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
- ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
-
58
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
-
60
- ["×", "x"], #U+00D7 (multiplication sign)
61
- ["•", "*"], #U+2022 (bullet)
62
- [" ", " "], #U+2009 (thin space)
63
- [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
65
- [" ", " "], #U+3000 (ideographic space)
66
- ["", "-"], #U+2212 (minus sign)
67
- ["", "-"], #U+2013 (en dash)
68
- ["", "'"], #U+2032 (prime)
69
- ["", "'"], #U+2018 (left single quotation mark)
70
- ["", "'"], #U+2019 (right single quotation mark)
71
- ["", '"'], #U+201C (left double quotation mark)
72
- ["", '"'], #U+201D (right double quotation mark)
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
+
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
+
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
+
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
+
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (no-break space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["", "-"], #U+2011 (Non-Breaking Hyphen)
67
+ ["", "-"], #U+2212 (minus sign)
68
+ ["", "-"], #U+2013 (en dash)
69
+ ["", "'"], #U+2032 (prime)
70
+ ["", "'"], #U+2018 (left single quotation mark)
71
+ ["", "'"], #U+2019 (right single quotation mark)
72
+ ["", '"'], #U+201C (left double quotation mark)
73
+ ["”", '"'], #U+201D (right double quotation mark)
73
74
  ['"', "''"]
74
- ]
75
+ ]
76
+
77
+
78
+ TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
79
+
80
+
81
+ class << TextAlignment
82
+ def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
+ _mappings ||= TextAlignment::MAPPINGS
84
+
85
+ character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
+ if character_mappings.empty?
87
+ [_str1, _str2, _mappings]
88
+ else
89
+ characters_from = character_mappings.collect{|m| m[0]}.join
90
+ characters_to = character_mappings.collect{|m| m[1]}.join
91
+ characters_to.gsub!(/-/, '\-')
92
+
93
+ str1 = _str1.tr(characters_from, characters_to)
94
+ str2 = _str2.tr(characters_from, characters_to)
95
+
96
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
97
+
98
+ [str1, str2, mappings]
99
+ end
100
+ end
101
+
102
+ def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
+ _mappings ||= TextAlignment::MAPPINGS
104
+
105
+ long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
+ if long_to_one_mappings.empty?
107
+ [_str1, _str2, _mappings]
108
+ else
109
+ ## long to one character mappings
110
+ pletters = TextAlignment::PADDING_LETTERS
111
+
112
+ # find the padding letter for str1
113
+ @padding_letter1 = begin
114
+ i = pletters.index{|l| _str2.index(l).nil?}
115
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
+ TextAlignment::PADDING_LETTERS[i]
117
+ end
118
+
119
+ # find the padding letter for str2
120
+ @padding_letter2 = begin
121
+ i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
+ TextAlignment::PADDING_LETTERS[i]
124
+ end
125
+
126
+ str1 = str2 = nil
127
+ long_to_one_mappings.each do |f|
128
+ from = f[1]
129
+
130
+ str1 = if _str2.index(f[0])
131
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
+ _str1.gsub(from, to)
133
+ else
134
+ _str1
135
+ end
136
+
137
+ str2 = if _str1.index(f[0])
138
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
+ _str2.gsub(from, to)
140
+ else
141
+ _str2
142
+ end
143
+ end
144
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
145
+
146
+ [str1, str2, mappings]
147
+ end
148
+ end
149
+
150
+ def compute_similarity(_s1, _s2, sdiff)
151
+ return 0 if sdiff.nil?
152
+
153
+ # compute the lcs only with non-whitespace letters
154
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
+ return 0 if lcs == 0
156
+
157
+ s1 = if @padding_letter1
158
+ _s1.tr(@padding_letter1, ' ')
159
+ else
160
+ _s1
161
+ end
162
+
163
+ s2 = if @padding_letter2
164
+ _s2.tr(@padding_letter2, ' ')
165
+ else
166
+ _s2
167
+ end
168
+
169
+ similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
170
+ end
171
+
172
+ end
@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(_str1, _str2)
20
+ def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = string_preprocessing(_str1, _str2)
23
+ str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
24
24
 
25
25
  _compute_mixed_alignment(str1, str2, mappings)
26
26
  end
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
63
63
  end
64
64
 
65
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = compute_similarity(str1, str2, @sdiff)
66
+ @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
67
67
  @str1_match_initial = cmp.str1_match_initial
68
68
  @str1_match_final = cmp.str1_match_final
69
69
  @str2_match_initial = cmp.str2_match_initial
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
139
139
  @position_map_end = posmap_end.sort.to_h
140
140
  end
141
141
 
142
- private
143
-
144
- def string_preprocessing(_str1, _str2)
145
- str1 = _str1.dup
146
- str2 = _str2.dup
147
- mappings = TextAlignment::MAPPINGS.dup
148
-
149
- ## single character mappings
150
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
- characters_from = character_mappings.collect{|m| m[0]}.join
152
- characters_to = character_mappings.collect{|m| m[1]}.join
153
- characters_to.gsub!(/-/, '\-')
154
-
155
- str1.tr!(characters_from, characters_to)
156
- str2.tr!(characters_from, characters_to)
157
-
158
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
-
160
- ## long to one character mappings
161
- pletters = TextAlignment::PADDING_LETTERS
162
-
163
- # find the padding letter for str1
164
- @padding_letter1 = begin
165
- i = pletters.index{|l| str2.index(l).nil?}
166
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
- TextAlignment::PADDING_LETTERS[i]
168
- end
169
-
170
- # find the padding letter for str2
171
- @padding_letter2 = begin
172
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
- TextAlignment::PADDING_LETTERS[i]
175
- end
176
-
177
- # ASCII foldings
178
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
- ascii_foldings.each do |f|
180
- from = f[1]
181
-
182
- if str2.index(f[0])
183
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
- str1.gsub!(from, to)
185
- end
186
-
187
- if str1.index(f[0])
188
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
- str2.gsub!(from, to)
190
- end
191
- end
192
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
-
194
- [str1, str2, mappings]
195
- end
196
-
197
- def compute_similarity(_s1, _s2, sdiff)
198
- return 0 if sdiff.nil?
199
-
200
- # compute the lcs only with non-whitespace letters
201
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
- return 0 if lcs == 0
203
-
204
- s1 = _s1.tr(@padding_letter1, ' ')
205
- s2 = _s2.tr(@padding_letter2, ' ')
206
-
207
- similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
- end
209
-
210
142
  end
@@ -5,34 +5,25 @@ require 'text_alignment/mixed_alignment'
5
5
 
6
6
  module TextAlignment; end unless defined? TextAlignment
7
7
 
8
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
-
10
8
  class TextAlignment::TextAlignment
11
9
  attr_reader :block_alignment
12
10
  attr_reader :similarity
13
11
  attr_reader :lost_annotations
14
12
 
15
- def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
13
+ def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
17
15
 
18
- @block_alignment = {source_text:str1, target_text:str2}
19
- @str1 = str1
20
- @str2 = str2
16
+ @block_alignment = {source_text: _str1, target_text: _str2, denotations: denotations}
17
+ @original_str1 = _str1
18
+ @original_str2 = _str2
21
19
 
22
- ## Block exact match
23
- block_begin = str2.index(str1)
24
- unless block_begin.nil?
25
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return
27
- end
20
+ str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
28
21
 
29
- block_begin = str2.downcase.index(str1.downcase)
30
- unless block_begin.nil?
31
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
22
+ if r = whole_block_alignment(str1, str2)
23
+ @block_alignment[:blocks] = r
32
24
  return
33
25
  end
34
26
 
35
-
36
27
  ## to find block alignments
37
28
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
38
29
 
@@ -128,6 +119,21 @@ class TextAlignment::TextAlignment
128
119
  @block_alignment[:blocks] = blocks2
129
120
  end
130
121
 
122
+ def whole_block_alignment(str1, str2)
123
+ ## Block exact match
124
+ block_begin = str2.index(str1)
125
+ unless block_begin.nil?
126
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
+ end
128
+
129
+ block_begin = str2.downcase.index(str1.downcase)
130
+ unless block_begin.nil?
131
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
132
+ end
133
+
134
+ nil
135
+ end
136
+
131
137
  def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
132
138
  block2 = str2[b2 ... e2]
133
139
 
@@ -146,7 +152,7 @@ class TextAlignment::TextAlignment
146
152
  break
147
153
  end
148
154
  position = r + lex.length
149
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
155
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r + b2 - term[:span][:begin]}
150
156
  end
151
157
 
152
158
  # missing term found
@@ -158,7 +164,6 @@ class TextAlignment::TextAlignment
158
164
  lex = term[:lex]
159
165
  look_forward = block2.index(lex, position)
160
166
  unless look_forward.nil?
161
- puts lex
162
167
  tblocks = []
163
168
  break
164
169
  end
@@ -166,26 +171,32 @@ class TextAlignment::TextAlignment
166
171
  end
167
172
 
168
173
  tblocks
174
+ else
175
+ []
169
176
  end
170
177
 
171
178
  if tblocks.empty?
172
179
  if b1 == 0 && e1 == str1.length
173
- block1 = str1[b1 ... e1]
174
- block2 = str2[b2 ... e2]
175
-
176
- ## character-based alignment
177
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
178
- if alignment.sdiff.nil?
180
+ if (e1 > 2000) || (e2 > 2000)
179
181
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
180
182
  else
181
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
183
+ block1 = str1[b1 ... e1]
184
+ block2 = str2[b2 ... e2]
185
+
186
+ ## character-based alignment
187
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
188
+ if alignment.sdiff.nil?
189
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
190
+ else
191
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
192
+ end
182
193
  end
183
194
  else
184
195
  block1 = str1[b1 ... e1]
185
196
  block2 = str2[b2 ... e2]
186
197
 
187
198
  ## character-based alignment
188
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
199
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
189
200
  if alignment.sdiff.nil?
190
201
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
191
202
  else
@@ -297,7 +308,7 @@ class TextAlignment::TextAlignment
297
308
  source = {begin:d.begin, end:d.end}
298
309
  d.begin = transform_begin_position(d.begin);
299
310
  d.end = transform_end_position(d.end);
300
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
311
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
301
312
  rescue
302
313
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
303
314
  d.begin = nil
@@ -313,7 +324,7 @@ class TextAlignment::TextAlignment
313
324
 
314
325
  r = hdenotations.collect do |d|
315
326
  t = transform_a_span(d[:span])
316
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
327
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
317
328
  new_d = d.dup.merge({span:t})
318
329
  rescue
319
330
  @lost_annotations << {source: d[:span], target:t}
@@ -340,8 +351,13 @@ class TextAlignment::TextAlignment
340
351
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
341
352
  "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
342
353
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
343
- ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
344
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
354
+ ">>>>> string 2 " +
355
+ if a[:target]
356
+ "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
357
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
358
+ else
359
+ "[-]\n\n"
360
+ end
345
361
  else
346
362
  astr1 = ''
347
363
  astr2 = ''
@@ -381,5 +397,4 @@ class TextAlignment::TextAlignment
381
397
  end
382
398
  show
383
399
  end
384
-
385
400
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.7.1'
2
+ VERSION = '0.9.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-12 00:00:00.000000000 Z
11
+ date: 2021-01-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary