text_alignment 0.7 → 0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
4
- data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
3
+ metadata.gz: 1c44cc3036273c8c34800d8f78a79316c53efb80fe45ad81092a6172da3b03c6
4
+ data.tar.gz: 50ab44cc66b50bf732e99f900c10584025c6ed498603ccf3afd75de90cac4b79
5
5
  SHA512:
6
- metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
7
- data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
6
+ metadata.gz: 98645c1ba4566c822d1e6ba6488e4ecdfe100c30923cc7effe7d2a4390ebb6901707e8c9f6a12145e2f98515bc6792afef4f9bfa5fcd683c77d3a5cf599094c7
7
+ data.tar.gz: 11657abdb8acb64c8edfd5271bbf78d2a75024753180988030c5ce6722b4da2781760e583ca6e33ed469cca85e4a2f8e28af6ef4dc62029ada5bd8a184200dfb
@@ -26,33 +26,43 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_mdoc(source_annotations, target_annotations)
29
+ def align_denotations(denotations, source_text, target_text, debug = false)
30
+ alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
31
+ new_denotations = alignment.transform_hdenotations(denotations)
32
+
33
+ if debug
34
+ warn "[block alignment]"
35
+ warn alignment.alignment_show
36
+ warn "-----"
37
+ end
38
+
39
+ lost_annotations = alignment.lost_annotations
40
+ unless lost_annotations.empty?
41
+ warn "\n[lost annotations] #{lost_annotations.length}"
42
+ lost_annotations.each do |a|
43
+ warn "#{a}"
44
+ end
45
+ warn "====="
46
+ end
47
+ warn
48
+
49
+ # return target annotations
50
+ new_denotations
51
+ end
52
+
53
+ def align_mannotations(source_annotations, target_text, debug = false)
54
+ target_annotations = {text:target_text}
55
+
30
56
  idnum_denotations = 0
31
57
  idnum_relations = 0
32
58
  idnum_attributes = 0
33
59
  idnum_modifications = 0
34
60
 
35
- source_annotations.each do |annotations|
36
- alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
-
38
- puts alignment.alignment_show
39
- puts "-----"
40
- puts
41
-
42
- # alignment.block_alignments.each do |a|
43
- # p {source:a[:source], target:a[:target]}
44
- # puts "--"
45
- # p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
46
- # puts "--"
47
- # puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
48
- # puts "--"
49
- # puts target_text[a[:target][:begin] ... a[:target][:end]]
50
- # puts "======"
51
- # end
52
-
61
+ source_annotations.each_with_index do |annotations, i|
53
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
54
63
  ididx = {}
55
- denotations = alignment.transform_hdenotations(annotations[:denotations])
64
+ warn "[#{i}]-=-=-=-=-"
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
56
66
  denotations.each do |d|
57
67
  reid = 'T' + (idnum_denotations += 1).to_s
58
68
  ididx[d[:id]] = reid
@@ -101,126 +111,11 @@ end
101
111
  source_annotations = read_annotations(ARGV[0])
102
112
  target_text = read_text(ARGV[1])
103
113
 
104
- lost_annotations = []
105
114
  target_annotations = if source_annotations.class == Array
106
- align_mdoc(source_annotations, {text: target_text})
115
+ align_mannotations(source_annotations, target_text, false)
107
116
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
109
-
110
- # verification
111
- # source_text = source_annotations[:text]
112
- # puts "=====BEGIN"
113
- # (0 ... source_text.rstrip.length).each do |p|
114
- # t = alignment.transform_begin_position(p)
115
- # if t.nil?
116
- # print source_text[p]
117
- # else
118
- # print '.'
119
- # end
120
- # end
121
- # puts
122
- # puts "=====END"
123
-
124
- # puts "=====BEGIN"
125
- # (0 .. source_text.rstrip.length).each do |p|
126
- # t = alignment.transform_end_position(p)
127
- # if t.nil?
128
- # print source_text[p]
129
- # else
130
- # print '.'
131
- # end
132
- # end
133
- # puts
134
- # puts "=====END"
135
-
136
- source_text = source_annotations[:text]
137
-
138
- puts "[block alignment]"
139
- puts alignment.alignment_show
140
- puts "====="
141
- # exit
142
-
143
- denotations = alignment.transform_hdenotations(source_annotations[:denotations])
144
- lost_annotations += alignment.lost_annotations if alignment.lost_annotations
145
-
117
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
146
118
  source_annotations.merge({text:target_text, denotations:denotations})
147
119
  end
148
120
 
149
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
150
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
151
- source_annotations.each do |annotations|
152
- num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
153
- num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
154
- num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
155
- num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
156
- end
157
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
158
- else
159
- num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
160
- num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
161
- num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
162
- num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
163
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
164
- end
165
-
166
- warn "[source]"
167
- warn "denotations:\t#{num_denotations_source}"
168
- # warn "relations:\t#{num_relations_source}"
169
- # warn "attributes:\t#{num_attributes_source}"
170
- # warn "modifications:\t#{num_modifications_source}"
171
-
172
- warn "\n[target]"
173
- warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
174
- # warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
175
- # warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
176
- # warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
177
-
178
- if lost_annotations
179
- warn "\n[lost annotations]"
180
- lost_annotations.each do |a|
181
- p a
182
- end
183
- end
184
-
185
- #puts target_annotations.to_json
186
-
187
- # denotations = anns1[:denotations]
188
-
189
- # puts "[Alignment1]====="
190
- # align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
191
-
192
- # align.alignment.each do |a|
193
- # p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
194
- # end
195
-
196
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
197
- # puts
198
- # puts "[Similarity]\n#{align.similarity}"
199
- # puts
200
- # puts '[Denotations original]'
201
- # pp denotations
202
- # puts
203
- # puts '[Denotations transformed]'
204
- # new_denotations = align.transform_hdenotations(denotations)
205
- # pp new_denotations
206
- # puts
207
- # puts "[Alignment2 (downcased)]====="
208
- # align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
209
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
210
- # puts
211
- # puts "[Similarity]\n#{align.similarity}"
212
- # puts
213
- # puts '[Denotations original]'
214
- # pp denotations
215
- # puts
216
- # puts '[Denotations transformed]'
217
- # new_denotations = align.transform_hdenotations(denotations)
218
- # pp new_denotations
219
- # puts
220
- # puts '[Annotations transformed]'
221
- # anns2[:denotations] = new_denotations
222
- # puts anns2.to_json
223
-
224
- # p align.common_elements
225
- # puts "---------------"
226
- # p align.mapped_elements
121
+ # puts target_annotations.to_json
@@ -1,7 +1,7 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
- TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
4
+ TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
7
  TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -1,74 +1,172 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::MAPPINGS = [
4
- ["©", "(c)"], #U+00A9 (Copyright Sign)
5
-
6
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
- ["β", "beta"], #U+03B2 (greek small letter beta)
8
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
- ["δ", "delta"], #U+03B4 (greek small letter delta)
10
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
- ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
- ["η", "eta"], #U+03B7 (greek small letter eta)
13
- ["θ", "theta"], #U+03B7 (greek small letter eta)
14
- ["ι", "iota"], #U+03B7 (greek small letter eta)
15
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
- ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
- ["μ", "mu"], #U+03BC (greek small letter mu)
19
- ["ν", "nu"], #U+03BD (greek small letter nu)
20
- ["ξ", "xi"], #U+03BE (greek small letter xi)
21
- ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
- ["π", "pi"], #U+03C0 (greek small letter pi)
23
- ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
- ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
- ["τ", "tau"], #U+03C4 (greek small letter tau)
26
- ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
- ["φ", "phi"], #U+03C6 (greek small letter phi)
28
- ["χ", "chi"], #U+03C7 (greek small letter chi)
29
- ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
- ["ω", "omega"], #U+03C9 (greek small letter omega)
31
-
32
- ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
- ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
- ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
- ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
- ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
- ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
- ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
- ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
- ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
- ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
- ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
- ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
- ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
- ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
- ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
- ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
- ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
- ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
- ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
- ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
- ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
- ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
- ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
- ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
- ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
-
58
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
-
60
- ["×", "x"], #U+00D7 (multiplication sign)
61
- ["•", "*"], #U+2022 (bullet)
62
- [" ", " "], #U+2009 (thin space)
63
- [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
65
- [" ", " "], #U+3000 (ideographic space)
66
- ["−", "-"], #U+2212 (minus sign)
67
- ["–", "-"], #U+2013 (en dash)
68
- ["′", "'"], #U+2032 (prime)
69
- ["‘", "'"], #U+2018 (left single quotation mark)
70
- ["’", "'"], #U+2019 (right single quotation mark)
71
- ["“", '"'], #U+201C (left double quotation mark)
72
- ["”", '"'], #U+201D (right double quotation mark)
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
+
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B7 (greek small letter eta)
14
+ ["ι", "iota"], #U+03B7 (greek small letter eta)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
+
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
+
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
+
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (no-break space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["‑", "-"], #U+2211 (Non-Breaking Hyphen)
67
+ ["−", "-"], #U+2212 (minus sign)
68
+ ["–", "-"], #U+2013 (en dash)
69
+ ["′", "'"], #U+2032 (prime)
70
+ ["‘", "'"], #U+2018 (left single quotation mark)
71
+ ["’", "'"], #U+2019 (right single quotation mark)
72
+ ["“", '"'], #U+201C (left double quotation mark)
73
+ ["”", '"'], #U+201D (right double quotation mark)
73
74
  ['"', "''"]
74
- ]
75
+ ]
76
+
77
+
78
+ TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
79
+
80
+
81
+ class << TextAlignment
82
+ def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
+ _mappings ||= TextAlignment::MAPPINGS
84
+
85
+ character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
+ if character_mappings.empty?
87
+ [_str1, _str2, _mappings]
88
+ else
89
+ characters_from = character_mappings.collect{|m| m[0]}.join
90
+ characters_to = character_mappings.collect{|m| m[1]}.join
91
+ characters_to.gsub!(/-/, '\-')
92
+
93
+ str1 = _str1.tr(characters_from, characters_to)
94
+ str2 = _str2.tr(characters_from, characters_to)
95
+
96
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
97
+
98
+ [str1, str2, mappings]
99
+ end
100
+ end
101
+
102
+ def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
+ _mappings ||= TextAlignment::MAPPINGS
104
+
105
+ long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
+ if long_to_one_mappings.empty?
107
+ [_str1, _str2, _mappings]
108
+ else
109
+ ## long to one character mappings
110
+ pletters = TextAlignment::PADDING_LETTERS
111
+
112
+ # find the padding letter for str1
113
+ @padding_letter1 = begin
114
+ i = pletters.index{|l| _str2.index(l).nil?}
115
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
+ TextAlignment::PADDING_LETTERS[i]
117
+ end
118
+
119
+ # find the padding letter for str2
120
+ @padding_letter2 = begin
121
+ i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
+ TextAlignment::PADDING_LETTERS[i]
124
+ end
125
+
126
+ str1 = str2 = nil
127
+ long_to_one_mappings.each do |f|
128
+ from = f[1]
129
+
130
+ str1 = if _str2.index(f[0])
131
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
+ _str1.gsub(from, to)
133
+ else
134
+ _str1
135
+ end
136
+
137
+ str2 = if _str1.index(f[0])
138
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
+ _str2.gsub(from, to)
140
+ else
141
+ _str2
142
+ end
143
+ end
144
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
145
+
146
+ [str1, str2, mappings]
147
+ end
148
+ end
149
+
150
+ def compute_similarity(_s1, _s2, sdiff)
151
+ return 0 if sdiff.nil?
152
+
153
+ # compute the lcs only with non-whitespace letters
154
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
+ return 0 if lcs == 0
156
+
157
+ s1 = if @padding_letter1
158
+ _s1.tr(@padding_letter1, ' ')
159
+ else
160
+ _s1
161
+ end
162
+
163
+ s2 = if @padding_letter2
164
+ _s2.tr(@padding_letter2, ' ')
165
+ else
166
+ _s2
167
+ end
168
+
169
+ similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
170
+ end
171
+
172
+ end
@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(_str1, _str2)
20
+ def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = string_preprocessing(_str1, _str2)
23
+ str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
24
24
 
25
25
  _compute_mixed_alignment(str1, str2, mappings)
26
26
  end
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
63
63
  end
64
64
 
65
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = compute_similarity(str1, str2, @sdiff)
66
+ @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
67
67
  @str1_match_initial = cmp.str1_match_initial
68
68
  @str1_match_final = cmp.str1_match_final
69
69
  @str2_match_initial = cmp.str2_match_initial
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
139
139
  @position_map_end = posmap_end.sort.to_h
140
140
  end
141
141
 
142
- private
143
-
144
- def string_preprocessing(_str1, _str2)
145
- str1 = _str1.dup
146
- str2 = _str2.dup
147
- mappings = TextAlignment::MAPPINGS.dup
148
-
149
- ## single character mappings
150
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
- characters_from = character_mappings.collect{|m| m[0]}.join
152
- characters_to = character_mappings.collect{|m| m[1]}.join
153
- characters_to.gsub!(/-/, '\-')
154
-
155
- str1.tr!(characters_from, characters_to)
156
- str2.tr!(characters_from, characters_to)
157
-
158
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
-
160
- ## long to one character mappings
161
- pletters = TextAlignment::PADDING_LETTERS
162
-
163
- # find the padding letter for str1
164
- @padding_letter1 = begin
165
- i = pletters.index{|l| str2.index(l).nil?}
166
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
- TextAlignment::PADDING_LETTERS[i]
168
- end
169
-
170
- # find the padding letter for str2
171
- @padding_letter2 = begin
172
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
- TextAlignment::PADDING_LETTERS[i]
175
- end
176
-
177
- # ASCII foldings
178
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
- ascii_foldings.each do |f|
180
- from = f[1]
181
-
182
- if str2.index(f[0])
183
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
- str1.gsub!(from, to)
185
- end
186
-
187
- if str1.index(f[0])
188
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
- str2.gsub!(from, to)
190
- end
191
- end
192
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
-
194
- [str1, str2, mappings]
195
- end
196
-
197
- def compute_similarity(_s1, _s2, sdiff)
198
- return 0 if sdiff.nil?
199
-
200
- # compute the lcs only with non-whitespace letters
201
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
- return 0 if lcs == 0
203
-
204
- s1 = _s1.tr(@padding_letter1, ' ')
205
- s2 = _s2.tr(@padding_letter2, ' ')
206
-
207
- similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
- end
209
-
210
142
  end
@@ -5,34 +5,25 @@ require 'text_alignment/mixed_alignment'
5
5
 
6
6
  module TextAlignment; end unless defined? TextAlignment
7
7
 
8
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
-
10
8
  class TextAlignment::TextAlignment
11
9
  attr_reader :block_alignment
12
10
  attr_reader :similarity
13
11
  attr_reader :lost_annotations
14
12
 
15
- def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
13
+ def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
17
15
 
18
- @block_alignment = {source_text:str1, target_text:str2}
19
- @str1 = str1
20
- @str2 = str2
16
+ @block_alignment = {source_text: _str1, target_text: _str2, denotations: denotations}
17
+ @original_str1 = _str1
18
+ @original_str2 = _str2
21
19
 
22
- ## Block exact match
23
- block_begin = str2.index(str1)
24
- unless block_begin.nil?
25
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return
27
- end
20
+ str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
28
21
 
29
- block_begin = str2.downcase.index(str1.downcase)
30
- unless block_begin.nil?
31
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
22
+ if r = whole_block_alignment(str1, str2)
23
+ @block_alignment[:blocks] = r
32
24
  return
33
25
  end
34
26
 
35
-
36
27
  ## to find block alignments
37
28
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
38
29
 
@@ -120,12 +111,29 @@ class TextAlignment::TextAlignment
120
111
  else
121
112
  [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
122
113
  end
114
+ else
115
+ []
123
116
  end
124
117
  end
125
118
 
126
119
  @block_alignment[:blocks] = blocks2
127
120
  end
128
121
 
122
+ def whole_block_alignment(str1, str2)
123
+ ## Block exact match
124
+ block_begin = str2.index(str1)
125
+ unless block_begin.nil?
126
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
+ end
128
+
129
+ block_begin = str2.downcase.index(str1.downcase)
130
+ unless block_begin.nil?
131
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
132
+ end
133
+
134
+ nil
135
+ end
136
+
129
137
  def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
130
138
  block2 = str2[b2 ... e2]
131
139
 
@@ -156,7 +164,6 @@ class TextAlignment::TextAlignment
156
164
  lex = term[:lex]
157
165
  look_forward = block2.index(lex, position)
158
166
  unless look_forward.nil?
159
- puts lex
160
167
  tblocks = []
161
168
  break
162
169
  end
@@ -164,31 +171,37 @@ class TextAlignment::TextAlignment
164
171
  end
165
172
 
166
173
  tblocks
174
+ else
175
+ []
167
176
  end
168
177
 
169
178
  if tblocks.empty?
170
179
  if b1 == 0 && e1 == str1.length
171
- if str2.length > 2000
180
+ if (e1 > 2000) || (e2 > 2000)
172
181
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
173
182
  else
174
183
  block1 = str1[b1 ... e1]
175
184
  block2 = str2[b2 ... e2]
176
185
 
177
186
  ## character-based alignment
178
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
179
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
180
- # alignment = :alignment
181
- # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
187
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
188
+ if alignment.sdiff.nil?
189
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
190
+ else
191
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
192
+ end
182
193
  end
183
194
  else
184
195
  block1 = str1[b1 ... e1]
185
196
  block2 = str2[b2 ... e2]
186
197
 
187
198
  ## character-based alignment
188
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
189
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
190
- # alignmnet = :alignment
191
- # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
199
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
200
+ if alignment.sdiff.nil?
201
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
202
+ else
203
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
204
+ end
192
205
  end
193
206
  else
194
207
  last_tblock = nil
@@ -199,7 +212,7 @@ class TextAlignment::TextAlignment
199
212
  sum += if te1 == tb1
200
213
  [tblock]
201
214
  else
202
- tb2 = last_tblock ? tlast_block[:target][:end] : b2
215
+ tb2 = last_tblock ? last_tblock[:target][:end] : b2
203
216
  te2 = tblock[:target][:begin]
204
217
 
205
218
  if b2 == e2
@@ -295,7 +308,7 @@ class TextAlignment::TextAlignment
295
308
  source = {begin:d.begin, end:d.end}
296
309
  d.begin = transform_begin_position(d.begin);
297
310
  d.end = transform_end_position(d.end);
298
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
311
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
299
312
  rescue
300
313
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
301
314
  d.begin = nil
@@ -311,7 +324,7 @@ class TextAlignment::TextAlignment
311
324
 
312
325
  r = hdenotations.collect do |d|
313
326
  t = transform_a_span(d[:span])
314
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
327
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
315
328
  new_d = d.dup.merge({span:t})
316
329
  rescue
317
330
  @lost_annotations << {source: d[:span], target:t}
@@ -338,8 +351,13 @@ class TextAlignment::TextAlignment
338
351
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
339
352
  "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
340
353
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
341
- ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
342
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
354
+ ">>>>> string 2 " +
355
+ if a[:target]
356
+ "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
357
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
358
+ else
359
+ "[-]\n\n"
360
+ end
343
361
  else
344
362
  astr1 = ''
345
363
  astr2 = ''
@@ -379,5 +397,4 @@ class TextAlignment::TextAlignment
379
397
  end
380
398
  show
381
399
  end
382
-
383
400
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.7'
2
+ VERSION = '0.9'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.7'
4
+ version: '0.9'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-11 00:00:00.000000000 Z
11
+ date: 2020-11-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary