text_alignment 0.7 → 0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
4
- data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
3
+ metadata.gz: 1c44cc3036273c8c34800d8f78a79316c53efb80fe45ad81092a6172da3b03c6
4
+ data.tar.gz: 50ab44cc66b50bf732e99f900c10584025c6ed498603ccf3afd75de90cac4b79
5
5
  SHA512:
6
- metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
7
- data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
6
+ metadata.gz: 98645c1ba4566c822d1e6ba6488e4ecdfe100c30923cc7effe7d2a4390ebb6901707e8c9f6a12145e2f98515bc6792afef4f9bfa5fcd683c77d3a5cf599094c7
7
+ data.tar.gz: 11657abdb8acb64c8edfd5271bbf78d2a75024753180988030c5ce6722b4da2781760e583ca6e33ed469cca85e4a2f8e28af6ef4dc62029ada5bd8a184200dfb
@@ -26,33 +26,43 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_mdoc(source_annotations, target_annotations)
29
+ def align_denotations(denotations, source_text, target_text, debug = false)
30
+ alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
31
+ new_denotations = alignment.transform_hdenotations(denotations)
32
+
33
+ if debug
34
+ warn "[block alignment]"
35
+ warn alignment.alignment_show
36
+ warn "-----"
37
+ end
38
+
39
+ lost_annotations = alignment.lost_annotations
40
+ unless lost_annotations.empty?
41
+ warn "\n[lost annotations] #{lost_annotations.length}"
42
+ lost_annotations.each do |a|
43
+ warn "#{a}"
44
+ end
45
+ warn "====="
46
+ end
47
+ warn
48
+
49
+ # return target annotations
50
+ new_denotations
51
+ end
52
+
53
+ def align_mannotations(source_annotations, target_text, debug = false)
54
+ target_annotations = {text:target_text}
55
+
30
56
  idnum_denotations = 0
31
57
  idnum_relations = 0
32
58
  idnum_attributes = 0
33
59
  idnum_modifications = 0
34
60
 
35
- source_annotations.each do |annotations|
36
- alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
-
38
- puts alignment.alignment_show
39
- puts "-----"
40
- puts
41
-
42
- # alignment.block_alignments.each do |a|
43
- # p {source:a[:source], target:a[:target]}
44
- # puts "--"
45
- # p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
46
- # puts "--"
47
- # puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
48
- # puts "--"
49
- # puts target_text[a[:target][:begin] ... a[:target][:end]]
50
- # puts "======"
51
- # end
52
-
61
+ source_annotations.each_with_index do |annotations, i|
53
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
54
63
  ididx = {}
55
- denotations = alignment.transform_hdenotations(annotations[:denotations])
64
+ warn "[#{i}]-=-=-=-=-"
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
56
66
  denotations.each do |d|
57
67
  reid = 'T' + (idnum_denotations += 1).to_s
58
68
  ididx[d[:id]] = reid
@@ -101,126 +111,11 @@ end
101
111
  source_annotations = read_annotations(ARGV[0])
102
112
  target_text = read_text(ARGV[1])
103
113
 
104
- lost_annotations = []
105
114
  target_annotations = if source_annotations.class == Array
106
- align_mdoc(source_annotations, {text: target_text})
115
+ align_mannotations(source_annotations, target_text, false)
107
116
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
109
-
110
- # verification
111
- # source_text = source_annotations[:text]
112
- # puts "=====BEGIN"
113
- # (0 ... source_text.rstrip.length).each do |p|
114
- # t = alignment.transform_begin_position(p)
115
- # if t.nil?
116
- # print source_text[p]
117
- # else
118
- # print '.'
119
- # end
120
- # end
121
- # puts
122
- # puts "=====END"
123
-
124
- # puts "=====BEGIN"
125
- # (0 .. source_text.rstrip.length).each do |p|
126
- # t = alignment.transform_end_position(p)
127
- # if t.nil?
128
- # print source_text[p]
129
- # else
130
- # print '.'
131
- # end
132
- # end
133
- # puts
134
- # puts "=====END"
135
-
136
- source_text = source_annotations[:text]
137
-
138
- puts "[block alignment]"
139
- puts alignment.alignment_show
140
- puts "====="
141
- # exit
142
-
143
- denotations = alignment.transform_hdenotations(source_annotations[:denotations])
144
- lost_annotations += alignment.lost_annotations if alignment.lost_annotations
145
-
117
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
146
118
  source_annotations.merge({text:target_text, denotations:denotations})
147
119
  end
148
120
 
149
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
150
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
151
- source_annotations.each do |annotations|
152
- num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
153
- num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
154
- num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
155
- num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
156
- end
157
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
158
- else
159
- num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
160
- num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
161
- num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
162
- num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
163
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
164
- end
165
-
166
- warn "[source]"
167
- warn "denotations:\t#{num_denotations_source}"
168
- # warn "relations:\t#{num_relations_source}"
169
- # warn "attributes:\t#{num_attributes_source}"
170
- # warn "modifications:\t#{num_modifications_source}"
171
-
172
- warn "\n[target]"
173
- warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
174
- # warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
175
- # warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
176
- # warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
177
-
178
- if lost_annotations
179
- warn "\n[lost annotations]"
180
- lost_annotations.each do |a|
181
- p a
182
- end
183
- end
184
-
185
- #puts target_annotations.to_json
186
-
187
- # denotations = anns1[:denotations]
188
-
189
- # puts "[Alignment1]====="
190
- # align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
191
-
192
- # align.alignment.each do |a|
193
- # p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
194
- # end
195
-
196
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
197
- # puts
198
- # puts "[Similarity]\n#{align.similarity}"
199
- # puts
200
- # puts '[Denotations original]'
201
- # pp denotations
202
- # puts
203
- # puts '[Denotations transformed]'
204
- # new_denotations = align.transform_hdenotations(denotations)
205
- # pp new_denotations
206
- # puts
207
- # puts "[Alignment2 (downcased)]====="
208
- # align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
209
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
210
- # puts
211
- # puts "[Similarity]\n#{align.similarity}"
212
- # puts
213
- # puts '[Denotations original]'
214
- # pp denotations
215
- # puts
216
- # puts '[Denotations transformed]'
217
- # new_denotations = align.transform_hdenotations(denotations)
218
- # pp new_denotations
219
- # puts
220
- # puts '[Annotations transformed]'
221
- # anns2[:denotations] = new_denotations
222
- # puts anns2.to_json
223
-
224
- # p align.common_elements
225
- # puts "---------------"
226
- # p align.mapped_elements
121
+ # puts target_annotations.to_json
@@ -1,7 +1,7 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
- TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
4
+ TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
7
  TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -1,74 +1,172 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::MAPPINGS = [
4
- ["©", "(c)"], #U+00A9 (Copyright Sign)
5
-
6
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
- ["β", "beta"], #U+03B2 (greek small letter beta)
8
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
- ["δ", "delta"], #U+03B4 (greek small letter delta)
10
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
- ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
- ["η", "eta"], #U+03B7 (greek small letter eta)
13
- ["θ", "theta"], #U+03B8 (greek small letter theta)
14
- ["ι", "iota"], #U+03B9 (greek small letter iota)
15
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
- ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
- ["μ", "mu"], #U+03BC (greek small letter mu)
19
- ["ν", "nu"], #U+03BD (greek small letter nu)
20
- ["ξ", "xi"], #U+03BE (greek small letter xi)
21
- ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
- ["π", "pi"], #U+03C0 (greek small letter pi)
23
- ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
- ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
- ["τ", "tau"], #U+03C4 (greek small letter tau)
26
- ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
- ["φ", "phi"], #U+03C6 (greek small letter phi)
28
- ["χ", "chi"], #U+03C7 (greek small letter chi)
29
- ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
- ["ω", "omega"], #U+03C9 (greek small letter omega)
31
-
32
- ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
- ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
- ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
- ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
- ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
- ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
- ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
- ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
- ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
- ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
- ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
- ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
- ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
- ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
- ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
- ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
- ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
- ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
- ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
- ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
- ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
- ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
- ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
- ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
- ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
-
58
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
-
60
- ["×", "x"], #U+00D7 (multiplication sign)
61
- ["•", "*"], #U+2022 (bullet)
62
- [" ", " "], #U+2009 (thin space)
63
- [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
65
- [" ", " "], #U+3000 (ideographic space)
66
- ["", "-"], #U+2212 (minus sign)
67
- ["", "-"], #U+2013 (en dash)
68
- ["", "'"], #U+2032 (prime)
69
- ["", "'"], #U+2018 (left single quotation mark)
70
- ["", "'"], #U+2019 (right single quotation mark)
71
- ["", '"'], #U+201C (left double quotation mark)
72
- ["", '"'], #U+201D (right double quotation mark)
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
+
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
+
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
+
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
+
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (no-break space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["", "-"], #U+2011 (Non-Breaking Hyphen)
67
+ ["", "-"], #U+2212 (minus sign)
68
+ ["", "-"], #U+2013 (en dash)
69
+ ["", "'"], #U+2032 (prime)
70
+ ["", "'"], #U+2018 (left single quotation mark)
71
+ ["", "'"], #U+2019 (right single quotation mark)
72
+ ["", '"'], #U+201C (left double quotation mark)
73
+ ["”", '"'], #U+201D (right double quotation mark)
73
74
  ['"', "''"]
74
- ]
75
+ ]
76
+
77
+
78
+ TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
79
+
80
+
81
+ class << TextAlignment
82
+ def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
+ _mappings ||= TextAlignment::MAPPINGS
84
+
85
+ character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
+ if character_mappings.empty?
87
+ [_str1, _str2, _mappings]
88
+ else
89
+ characters_from = character_mappings.collect{|m| m[0]}.join
90
+ characters_to = character_mappings.collect{|m| m[1]}.join
91
+ characters_to.gsub!(/-/, '\-')
92
+
93
+ str1 = _str1.tr(characters_from, characters_to)
94
+ str2 = _str2.tr(characters_from, characters_to)
95
+
96
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
97
+
98
+ [str1, str2, mappings]
99
+ end
100
+ end
101
+
102
+ def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
+ _mappings ||= TextAlignment::MAPPINGS
104
+
105
+ long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
+ if long_to_one_mappings.empty?
107
+ [_str1, _str2, _mappings]
108
+ else
109
+ ## long to one character mappings
110
+ pletters = TextAlignment::PADDING_LETTERS
111
+
112
+ # find the padding letter for str1
113
+ @padding_letter1 = begin
114
+ i = pletters.index{|l| _str2.index(l).nil?}
115
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
+ TextAlignment::PADDING_LETTERS[i]
117
+ end
118
+
119
+ # find the padding letter for str2
120
+ @padding_letter2 = begin
121
+ i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
+ TextAlignment::PADDING_LETTERS[i]
124
+ end
125
+
126
+ str1 = str2 = nil
127
+ long_to_one_mappings.each do |f|
128
+ from = f[1]
129
+
130
+ str1 = if _str2.index(f[0])
131
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
+ _str1.gsub(from, to)
133
+ else
134
+ _str1
135
+ end
136
+
137
+ str2 = if _str1.index(f[0])
138
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
+ _str2.gsub(from, to)
140
+ else
141
+ _str2
142
+ end
143
+ end
144
+ mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
145
+
146
+ [str1, str2, mappings]
147
+ end
148
+ end
149
+
150
+ def compute_similarity(_s1, _s2, sdiff)
151
+ return 0 if sdiff.nil?
152
+
153
+ # compute the lcs only with non-whitespace letters
154
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
+ return 0 if lcs == 0
156
+
157
+ s1 = if @padding_letter1
158
+ _s1.tr(@padding_letter1, ' ')
159
+ else
160
+ _s1
161
+ end
162
+
163
+ s2 = if @padding_letter2
164
+ _s2.tr(@padding_letter2, ' ')
165
+ else
166
+ _s2
167
+ end
168
+
169
+ similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
170
+ end
171
+
172
+ end
@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(_str1, _str2)
20
+ def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = string_preprocessing(_str1, _str2)
23
+ str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
24
24
 
25
25
  _compute_mixed_alignment(str1, str2, mappings)
26
26
  end
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
63
63
  end
64
64
 
65
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = compute_similarity(str1, str2, @sdiff)
66
+ @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
67
67
  @str1_match_initial = cmp.str1_match_initial
68
68
  @str1_match_final = cmp.str1_match_final
69
69
  @str2_match_initial = cmp.str2_match_initial
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
139
139
  @position_map_end = posmap_end.sort.to_h
140
140
  end
141
141
 
142
- private
143
-
144
- def string_preprocessing(_str1, _str2)
145
- str1 = _str1.dup
146
- str2 = _str2.dup
147
- mappings = TextAlignment::MAPPINGS.dup
148
-
149
- ## single character mappings
150
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
- characters_from = character_mappings.collect{|m| m[0]}.join
152
- characters_to = character_mappings.collect{|m| m[1]}.join
153
- characters_to.gsub!(/-/, '\-')
154
-
155
- str1.tr!(characters_from, characters_to)
156
- str2.tr!(characters_from, characters_to)
157
-
158
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
-
160
- ## long to one character mappings
161
- pletters = TextAlignment::PADDING_LETTERS
162
-
163
- # find the padding letter for str1
164
- @padding_letter1 = begin
165
- i = pletters.index{|l| str2.index(l).nil?}
166
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
- TextAlignment::PADDING_LETTERS[i]
168
- end
169
-
170
- # find the padding letter for str2
171
- @padding_letter2 = begin
172
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
- TextAlignment::PADDING_LETTERS[i]
175
- end
176
-
177
- # ASCII foldings
178
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
- ascii_foldings.each do |f|
180
- from = f[1]
181
-
182
- if str2.index(f[0])
183
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
- str1.gsub!(from, to)
185
- end
186
-
187
- if str1.index(f[0])
188
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
- str2.gsub!(from, to)
190
- end
191
- end
192
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
-
194
- [str1, str2, mappings]
195
- end
196
-
197
- def compute_similarity(_s1, _s2, sdiff)
198
- return 0 if sdiff.nil?
199
-
200
- # compute the lcs only with non-whitespace letters
201
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
- return 0 if lcs == 0
203
-
204
- s1 = _s1.tr(@padding_letter1, ' ')
205
- s2 = _s2.tr(@padding_letter2, ' ')
206
-
207
- similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
- end
209
-
210
142
  end
@@ -5,34 +5,25 @@ require 'text_alignment/mixed_alignment'
5
5
 
6
6
  module TextAlignment; end unless defined? TextAlignment
7
7
 
8
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
-
10
8
  class TextAlignment::TextAlignment
11
9
  attr_reader :block_alignment
12
10
  attr_reader :similarity
13
11
  attr_reader :lost_annotations
14
12
 
15
- def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
13
+ def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
17
15
 
18
- @block_alignment = {source_text:str1, target_text:str2}
19
- @str1 = str1
20
- @str2 = str2
16
+ @block_alignment = {source_text: _str1, target_text: _str2, denotations: denotations}
17
+ @original_str1 = _str1
18
+ @original_str2 = _str2
21
19
 
22
- ## Block exact match
23
- block_begin = str2.index(str1)
24
- unless block_begin.nil?
25
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return
27
- end
20
+ str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
28
21
 
29
- block_begin = str2.downcase.index(str1.downcase)
30
- unless block_begin.nil?
31
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
22
+ if r = whole_block_alignment(str1, str2)
23
+ @block_alignment[:blocks] = r
32
24
  return
33
25
  end
34
26
 
35
-
36
27
  ## to find block alignments
37
28
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
38
29
 
@@ -120,12 +111,29 @@ class TextAlignment::TextAlignment
120
111
  else
121
112
  [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
122
113
  end
114
+ else
115
+ []
123
116
  end
124
117
  end
125
118
 
126
119
  @block_alignment[:blocks] = blocks2
127
120
  end
128
121
 
122
+ def whole_block_alignment(str1, str2)
123
+ ## Block exact match
124
+ block_begin = str2.index(str1)
125
+ unless block_begin.nil?
126
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
+ end
128
+
129
+ block_begin = str2.downcase.index(str1.downcase)
130
+ unless block_begin.nil?
131
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
132
+ end
133
+
134
+ nil
135
+ end
136
+
129
137
  def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
130
138
  block2 = str2[b2 ... e2]
131
139
 
@@ -156,7 +164,6 @@ class TextAlignment::TextAlignment
156
164
  lex = term[:lex]
157
165
  look_forward = block2.index(lex, position)
158
166
  unless look_forward.nil?
159
- puts lex
160
167
  tblocks = []
161
168
  break
162
169
  end
@@ -164,31 +171,37 @@ class TextAlignment::TextAlignment
164
171
  end
165
172
 
166
173
  tblocks
174
+ else
175
+ []
167
176
  end
168
177
 
169
178
  if tblocks.empty?
170
179
  if b1 == 0 && e1 == str1.length
171
- if str2.length > 2000
180
+ if (e1 > 2000) || (e2 > 2000)
172
181
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
173
182
  else
174
183
  block1 = str1[b1 ... e1]
175
184
  block2 = str2[b2 ... e2]
176
185
 
177
186
  ## character-based alignment
178
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
179
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
180
- # alignment = :alignment
181
- # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
187
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
188
+ if alignment.sdiff.nil?
189
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
190
+ else
191
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
192
+ end
182
193
  end
183
194
  else
184
195
  block1 = str1[b1 ... e1]
185
196
  block2 = str2[b2 ... e2]
186
197
 
187
198
  ## character-based alignment
188
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
189
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
190
- # alignmnet = :alignment
191
- # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
199
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
200
+ if alignment.sdiff.nil?
201
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
202
+ else
203
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
204
+ end
192
205
  end
193
206
  else
194
207
  last_tblock = nil
@@ -199,7 +212,7 @@ class TextAlignment::TextAlignment
199
212
  sum += if te1 == tb1
200
213
  [tblock]
201
214
  else
202
- tb2 = last_tblock ? tlast_block[:target][:end] : b2
215
+ tb2 = last_tblock ? last_tblock[:target][:end] : b2
203
216
  te2 = tblock[:target][:begin]
204
217
 
205
218
  if b2 == e2
@@ -295,7 +308,7 @@ class TextAlignment::TextAlignment
295
308
  source = {begin:d.begin, end:d.end}
296
309
  d.begin = transform_begin_position(d.begin);
297
310
  d.end = transform_end_position(d.end);
298
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
311
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
299
312
  rescue
300
313
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
301
314
  d.begin = nil
@@ -311,7 +324,7 @@ class TextAlignment::TextAlignment
311
324
 
312
325
  r = hdenotations.collect do |d|
313
326
  t = transform_a_span(d[:span])
314
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
327
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
315
328
  new_d = d.dup.merge({span:t})
316
329
  rescue
317
330
  @lost_annotations << {source: d[:span], target:t}
@@ -338,8 +351,13 @@ class TextAlignment::TextAlignment
338
351
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
339
352
  "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
340
353
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
341
- ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
342
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
354
+ ">>>>> string 2 " +
355
+ if a[:target]
356
+ "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
357
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
358
+ else
359
+ "[-]\n\n"
360
+ end
343
361
  else
344
362
  astr1 = ''
345
363
  astr2 = ''
@@ -379,5 +397,4 @@ class TextAlignment::TextAlignment
379
397
  end
380
398
  show
381
399
  end
382
-
383
400
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.7'
2
+ VERSION = '0.9'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.7'
4
+ version: '0.9'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-11 00:00:00.000000000 Z
11
+ date: 2020-11-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary