text_alignment 0.7.3 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0788bdb6d161499f5a5258757b9f61faee96b60246b422b57b17ba953b4a2c87'
4
- data.tar.gz: 4564fd15e1e1d673932438206989dc706aa67f2467698f16946e2635c562ec90
3
+ metadata.gz: 0c49793b84e9ca5606a8fe9151530f6732337c2b92cfd1af1549a56ea3c2f39e
4
+ data.tar.gz: 4a0b8328d4c6de43af50bd8c278f83facef311c74516dc3a9a0c9dd5f91fbfc0
5
5
  SHA512:
6
- metadata.gz: c93882cc28e8bfdbdeed325b282b63fd3f3644d1739ed979ef92a7d8a133e26f4a8ffd0bda22da0fc2e0a31c77c6d49ce89f8f0fae802c9ae2041a7db60a2a4e
7
- data.tar.gz: 238ed44ac0c0a178a64743846550639c626d3a300a79f05fedb016e71a890f11025b52d260ce2f46f00d68b1c4b9c13c48a8be8712f737bb2b57b0274b174b8b
6
+ metadata.gz: ad70de67b4a7b38290a59d89bed7cd9fa343ce3db62bb67b779e51b499929b9d7c4697871cf25017fa59a34c0525741d5da679559fc2f33d902fcda692f9f1ac
7
+ data.tar.gz: 6242b35cbb8f53effc477b508d428d9060cba2e93895496134963755cff956f3931411a87a7bc0d33b7b91459f1a27eb67c422897693320e1b2ceb06eff6e22b
@@ -26,33 +26,43 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_mdoc(source_annotations, target_annotations)
29
+ def align_denotations(denotations, source_text, target_text, debug = false)
30
+ alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
31
+ new_denotations = alignment.transform_hdenotations(denotations)
32
+
33
+ if debug
34
+ warn "[block alignment]"
35
+ warn alignment.alignment_show
36
+ warn "-----"
37
+ end
38
+
39
+ lost_annotations = alignment.lost_annotations
40
+ unless lost_annotations.empty?
41
+ warn "\n[lost annotations] #{lost_annotations.length}"
42
+ lost_annotations.each do |a|
43
+ warn "#{a}"
44
+ end
45
+ warn "====="
46
+ end
47
+ warn
48
+
49
+ # return target annotations
50
+ new_denotations
51
+ end
52
+
53
+ def align_mannotations(source_annotations, target_text, debug = false)
54
+ target_annotations = {text:target_text}
55
+
30
56
  idnum_denotations = 0
31
57
  idnum_relations = 0
32
58
  idnum_attributes = 0
33
59
  idnum_modifications = 0
34
60
 
35
- source_annotations.each do |annotations|
36
- alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
-
38
- puts alignment.alignment_show
39
- puts "-----"
40
- puts
41
-
42
- # alignment.block_alignments.each do |a|
43
- # p {source:a[:source], target:a[:target]}
44
- # puts "--"
45
- # p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
46
- # puts "--"
47
- # puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
48
- # puts "--"
49
- # puts target_text[a[:target][:begin] ... a[:target][:end]]
50
- # puts "======"
51
- # end
52
-
61
+ source_annotations.each_with_index do |annotations, i|
53
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
54
63
  ididx = {}
55
- denotations = alignment.transform_hdenotations(annotations[:denotations])
64
+ warn "[#{i}]-=-=-=-=-"
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
56
66
  denotations.each do |d|
57
67
  reid = 'T' + (idnum_denotations += 1).to_s
58
68
  ididx[d[:id]] = reid
@@ -101,126 +111,11 @@ end
101
111
  source_annotations = read_annotations(ARGV[0])
102
112
  target_text = read_text(ARGV[1])
103
113
 
104
- lost_annotations = []
105
114
  target_annotations = if source_annotations.class == Array
106
- align_mdoc(source_annotations, {text: target_text})
115
+ align_mannotations(source_annotations, target_text, false)
107
116
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
109
-
110
- # verification
111
- # source_text = source_annotations[:text]
112
- # puts "=====BEGIN"
113
- # (0 ... source_text.rstrip.length).each do |p|
114
- # t = alignment.transform_begin_position(p)
115
- # if t.nil?
116
- # print source_text[p]
117
- # else
118
- # print '.'
119
- # end
120
- # end
121
- # puts
122
- # puts "=====END"
123
-
124
- # puts "=====BEGIN"
125
- # (0 .. source_text.rstrip.length).each do |p|
126
- # t = alignment.transform_end_position(p)
127
- # if t.nil?
128
- # print source_text[p]
129
- # else
130
- # print '.'
131
- # end
132
- # end
133
- # puts
134
- # puts "=====END"
135
-
136
- source_text = source_annotations[:text]
137
-
138
- puts "[block alignment]"
139
- puts alignment.alignment_show
140
- puts "====="
141
- # exit
142
-
143
- denotations = alignment.transform_hdenotations(source_annotations[:denotations])
144
- lost_annotations += alignment.lost_annotations if alignment.lost_annotations
145
-
117
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
146
118
  source_annotations.merge({text:target_text, denotations:denotations})
147
119
  end
148
120
 
149
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
150
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
151
- source_annotations.each do |annotations|
152
- num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
153
- num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
154
- num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
155
- num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
156
- end
157
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
158
- else
159
- num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
160
- num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
161
- num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
162
- num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
163
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
164
- end
165
-
166
- warn "[source]"
167
- warn "denotations:\t#{num_denotations_source}"
168
- # warn "relations:\t#{num_relations_source}"
169
- # warn "attributes:\t#{num_attributes_source}"
170
- # warn "modifications:\t#{num_modifications_source}"
171
-
172
- warn "\n[target]"
173
- warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
174
- # warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
175
- # warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
176
- # warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
177
-
178
- if lost_annotations
179
- warn "\n[lost annotations]"
180
- lost_annotations.each do |a|
181
- p a
182
- end
183
- end
184
-
185
- #puts target_annotations.to_json
186
-
187
- # denotations = anns1[:denotations]
188
-
189
- # puts "[Alignment1]====="
190
- # align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
191
-
192
- # align.alignment.each do |a|
193
- # p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
194
- # end
195
-
196
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
197
- # puts
198
- # puts "[Similarity]\n#{align.similarity}"
199
- # puts
200
- # puts '[Denotations original]'
201
- # pp denotations
202
- # puts
203
- # puts '[Denotations transformed]'
204
- # new_denotations = align.transform_hdenotations(denotations)
205
- # pp new_denotations
206
- # puts
207
- # puts "[Alignment2 (downcased)]====="
208
- # align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
209
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
210
- # puts
211
- # puts "[Similarity]\n#{align.similarity}"
212
- # puts
213
- # puts '[Denotations original]'
214
- # pp denotations
215
- # puts
216
- # puts '[Denotations transformed]'
217
- # new_denotations = align.transform_hdenotations(denotations)
218
- # pp new_denotations
219
- # puts
220
- # puts '[Annotations transformed]'
221
- # anns2[:denotations] = new_denotations
222
- # puts anns2.to_json
223
-
224
- # p align.common_elements
225
- # puts "---------------"
226
- # p align.mapped_elements
121
+ # puts target_annotations.to_json
@@ -1,7 +1,7 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
- TextAlignment::SIZE_WINDOW = 30 unless defined? TextAlignment::SIZE_WINDOW
4
+ TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
7
  TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -1,75 +1,172 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::MAPPINGS = [
4
- ["©", "(c)"], #U+00A9 (Copyright Sign)
5
-
6
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
- ["β", "beta"], #U+03B2 (greek small letter beta)
8
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
- ["δ", "delta"], #U+03B4 (greek small letter delta)
10
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
- ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
- ["η", "eta"], #U+03B7 (greek small letter eta)
13
- ["θ", "theta"], #U+03B8 (greek small letter theta)
14
- ["ι", "iota"], #U+03B9 (greek small letter iota)
15
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
- ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
- ["μ", "mu"], #U+03BC (greek small letter mu)
19
- ["ν", "nu"], #U+03BD (greek small letter nu)
20
- ["ξ", "xi"], #U+03BE (greek small letter xi)
21
- ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
- ["π", "pi"], #U+03C0 (greek small letter pi)
23
- ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
- ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
- ["τ", "tau"], #U+03C4 (greek small letter tau)
26
- ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
- ["φ", "phi"], #U+03C6 (greek small letter phi)
28
- ["χ", "chi"], #U+03C7 (greek small letter chi)
29
- ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
- ["ω", "omega"], #U+03C9 (greek small letter omega)
31
-
32
- ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
- ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
- ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
- ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
- ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
- ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
- ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
- ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
- ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
- ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
- ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
- ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
- ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
- ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
- ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
- ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
- ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
- ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
- ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
- ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
- ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
- ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
- ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
- ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
- ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
-
58
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
-
60
- ["×", "x"], #U+00D7 (multiplication sign)
61
- ["•", "*"], #U+2022 (bullet)
62
- [" ", " "], #U+2009 (thin space)
63
- [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
65
- [" ", " "], #U+3000 (ideographic space)
66
- ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
67
- ["−", "-"], #U+2212 (minus sign)
68
- ["–", "-"], #U+2013 (en dash)
69
- ["′", "'"], #U+2032 (prime)
70
- ["‘", "'"], #U+2018 (left single quotation mark)
71
- ["’", "'"], #U+2019 (right single quotation mark)
72
- ["“", '"'], #U+201C (left double quotation mark)
73
- ["”", '"'], #U+201D (right double quotation mark)
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
+
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
+
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
+
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
+
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (no-break space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
67
+ ["−", "-"], #U+2212 (minus sign)
68
+ ["–", "-"], #U+2013 (en dash)
69
+ ["′", "'"], #U+2032 (prime)
70
+ ["‘", "'"], #U+2018 (left single quotation mark)
71
+ ["’", "'"], #U+2019 (right single quotation mark)
72
+ ["“", '"'], #U+201C (left double quotation mark)
73
+ ["”", '"'], #U+201D (right double quotation mark)
74
74
  ['"', "''"]
75
- ]
75
+ ]
76
+
77
+
78
+ TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
79
+
80
+
81
+ class << TextAlignment
82
+ def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
+ _mappings ||= TextAlignment::MAPPINGS
84
+
85
+ character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
+ if character_mappings.empty?
87
+ [_str1, _str2, _mappings]
88
+ else
89
+ characters_from = character_mappings.collect{|m| m[0]}.join
90
+ characters_to = character_mappings.collect{|m| m[1]}.join
91
+ characters_to.gsub!(/-/, '\-')
92
+
93
+ str1 = _str1.tr(characters_from, characters_to)
94
+ str2 = _str2.tr(characters_from, characters_to)
95
+
96
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
97
+
98
+ [str1, str2, mappings]
99
+ end
100
+ end
101
+
102
+ def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
+ _mappings ||= TextAlignment::MAPPINGS
104
+
105
+ long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
+ if long_to_one_mappings.empty?
107
+ [_str1, _str2, _mappings]
108
+ else
109
+ ## long to one character mappings
110
+ pletters = TextAlignment::PADDING_LETTERS
111
+
112
+ # find the padding letter for str1
113
+ @padding_letter1 = begin
114
+ i = pletters.index{|l| _str2.index(l).nil?}
115
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
+ TextAlignment::PADDING_LETTERS[i]
117
+ end
118
+
119
+ # find the padding letter for str2
120
+ @padding_letter2 = begin
121
+ i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
+ TextAlignment::PADDING_LETTERS[i]
124
+ end
125
+
126
+ str1 = str2 = nil
127
+ long_to_one_mappings.each do |f|
128
+ from = f[1]
129
+
130
+ str1 = if _str2.index(f[0])
131
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
+ _str1.gsub(from, to)
133
+ else
134
+ _str1
135
+ end
136
+
137
+ str2 = if _str1.index(f[0])
138
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
+ _str2.gsub(from, to)
140
+ else
141
+ _str2
142
+ end
143
+ end
144
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
145
+
146
+ [str1, str2, mappings]
147
+ end
148
+ end
149
+
150
+ def compute_similarity(_s1, _s2, sdiff)
151
+ return 0 if sdiff.nil?
152
+
153
+ # compute the lcs only with non-whitespace letters
154
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
+ return 0 if lcs == 0
156
+
157
+ s1 = if @padding_letter1
158
+ _s1.tr(@padding_letter1, ' ')
159
+ else
160
+ _s1
161
+ end
162
+
163
+ s2 = if @padding_letter2
164
+ _s2.tr(@padding_letter2, ' ')
165
+ else
166
+ _s2
167
+ end
168
+
169
+ similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
170
+ end
171
+
172
+ end
@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(_str1, _str2)
20
+ def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = string_preprocessing(_str1, _str2)
23
+ str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
24
24
 
25
25
  _compute_mixed_alignment(str1, str2, mappings)
26
26
  end
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
63
63
  end
64
64
 
65
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = compute_similarity(str1, str2, @sdiff)
66
+ @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
67
67
  @str1_match_initial = cmp.str1_match_initial
68
68
  @str1_match_final = cmp.str1_match_final
69
69
  @str2_match_initial = cmp.str2_match_initial
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
139
139
  @position_map_end = posmap_end.sort.to_h
140
140
  end
141
141
 
142
- private
143
-
144
- def string_preprocessing(_str1, _str2)
145
- str1 = _str1.dup
146
- str2 = _str2.dup
147
- mappings = TextAlignment::MAPPINGS.dup
148
-
149
- ## single character mappings
150
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
- characters_from = character_mappings.collect{|m| m[0]}.join
152
- characters_to = character_mappings.collect{|m| m[1]}.join
153
- characters_to.gsub!(/-/, '\-')
154
-
155
- str1.tr!(characters_from, characters_to)
156
- str2.tr!(characters_from, characters_to)
157
-
158
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
-
160
- ## long to one character mappings
161
- pletters = TextAlignment::PADDING_LETTERS
162
-
163
- # find the padding letter for str1
164
- @padding_letter1 = begin
165
- i = pletters.index{|l| str2.index(l).nil?}
166
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
- TextAlignment::PADDING_LETTERS[i]
168
- end
169
-
170
- # find the padding letter for str2
171
- @padding_letter2 = begin
172
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
- TextAlignment::PADDING_LETTERS[i]
175
- end
176
-
177
- # ASCII foldings
178
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
- ascii_foldings.each do |f|
180
- from = f[1]
181
-
182
- if str2.index(f[0])
183
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
- str1.gsub!(from, to)
185
- end
186
-
187
- if str1.index(f[0])
188
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
- str2.gsub!(from, to)
190
- end
191
- end
192
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
-
194
- [str1, str2, mappings]
195
- end
196
-
197
- def compute_similarity(_s1, _s2, sdiff)
198
- return 0 if sdiff.nil?
199
-
200
- # compute the lcs only with non-whitespace letters
201
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
- return 0 if lcs == 0
203
-
204
- s1 = _s1.tr(@padding_letter1, ' ')
205
- s2 = _s2.tr(@padding_letter2, ' ')
206
-
207
- similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
- end
209
-
210
142
  end
@@ -5,34 +5,25 @@ require 'text_alignment/mixed_alignment'
5
5
 
6
6
  module TextAlignment; end unless defined? TextAlignment
7
7
 
8
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
-
10
8
  class TextAlignment::TextAlignment
11
9
  attr_reader :block_alignment
12
10
  attr_reader :similarity
13
11
  attr_reader :lost_annotations
14
12
 
15
- def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
13
+ def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
17
15
 
18
- @block_alignment = {source_text:str1, target_text:str2}
19
- @str1 = str1
20
- @str2 = str2
16
+ @block_alignment = {source_text:_str1, target_text:_str2}
17
+ @original_str1 = _str1
18
+ @original_str2 = _str2
21
19
 
22
- ## Block exact match
23
- block_begin = str2.index(str1)
24
- unless block_begin.nil?
25
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return
27
- end
20
+ str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
28
21
 
29
- block_begin = str2.downcase.index(str1.downcase)
30
- unless block_begin.nil?
31
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
22
+ if r = whole_block_alignment(str1, str2)
23
+ @block_alignment[:blocks] = r
32
24
  return
33
25
  end
34
26
 
35
-
36
27
  ## to find block alignments
37
28
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
38
29
 
@@ -128,6 +119,21 @@ class TextAlignment::TextAlignment
128
119
  @block_alignment[:blocks] = blocks2
129
120
  end
130
121
 
122
+ def whole_block_alignment(str1, str2)
123
+ ## Block exact match
124
+ block_begin = str2.index(str1)
125
+ unless block_begin.nil?
126
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
+ end
128
+
129
+ block_begin = str2.downcase.index(str1.downcase)
130
+ unless block_begin.nil?
131
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
132
+ end
133
+
134
+ nil
135
+ end
136
+
131
137
  def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
132
138
  block2 = str2[b2 ... e2]
133
139
 
@@ -158,7 +164,6 @@ class TextAlignment::TextAlignment
158
164
  lex = term[:lex]
159
165
  look_forward = block2.index(lex, position)
160
166
  unless look_forward.nil?
161
- puts lex
162
167
  tblocks = []
163
168
  break
164
169
  end
@@ -166,6 +171,8 @@ class TextAlignment::TextAlignment
166
171
  end
167
172
 
168
173
  tblocks
174
+ else
175
+ []
169
176
  end
170
177
 
171
178
  if tblocks.empty?
@@ -177,7 +184,7 @@ class TextAlignment::TextAlignment
177
184
  block2 = str2[b2 ... e2]
178
185
 
179
186
  ## character-based alignment
180
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
187
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
181
188
  if alignment.sdiff.nil?
182
189
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
183
190
  else
@@ -189,7 +196,7 @@ class TextAlignment::TextAlignment
189
196
  block2 = str2[b2 ... e2]
190
197
 
191
198
  ## character-based alignment
192
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
199
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
193
200
  if alignment.sdiff.nil?
194
201
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
195
202
  else
@@ -301,7 +308,7 @@ class TextAlignment::TextAlignment
301
308
  source = {begin:d.begin, end:d.end}
302
309
  d.begin = transform_begin_position(d.begin);
303
310
  d.end = transform_end_position(d.end);
304
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
311
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
305
312
  rescue
306
313
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
307
314
  d.begin = nil
@@ -317,7 +324,7 @@ class TextAlignment::TextAlignment
317
324
 
318
325
  r = hdenotations.collect do |d|
319
326
  t = transform_a_span(d[:span])
320
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
327
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
321
328
  new_d = d.dup.merge({span:t})
322
329
  rescue
323
330
  @lost_annotations << {source: d[:span], target:t}
@@ -344,8 +351,13 @@ class TextAlignment::TextAlignment
344
351
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
345
352
  "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
346
353
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
347
- ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
348
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
354
+ ">>>>> string 2 " +
355
+ if a[:target]
356
+ "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
357
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
358
+ else
359
+ "[-]\n\n"
360
+ end
349
361
  else
350
362
  astr1 = ''
351
363
  astr2 = ''
@@ -385,5 +397,4 @@ class TextAlignment::TextAlignment
385
397
  end
386
398
  show
387
399
  end
388
-
389
400
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.7.3'
2
+ VERSION = '0.8.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.3
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-12 00:00:00.000000000 Z
11
+ date: 2020-10-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary