text_alignment 0.2.9 → 0.3.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,75 +1,74 @@
1
- module TextAlignment
1
+ module TextAlignment; end unless defined? TextAlignment
2
2
 
3
- TextAlignment::MAPPINGS = [
4
- ["©", "(c)"], #U+00A9 (Copyright Sign)
3
+ TextAlignment::MAPPINGS = [
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
5
 
6
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
- ["β", "beta"], #U+03B2 (greek small letter beta)
8
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
- ["δ", "delta"], #U+03B4 (greek small letter delta)
10
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
- ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
- ["η", "eta"], #U+03B7 (greek small letter eta)
13
- ["θ", "theta"], #U+03B7 (greek small letter eta)
14
- ["ι", "iota"], #U+03B7 (greek small letter eta)
15
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
- ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
- ["μ", "mu"], #U+03BC (greek small letter mu)
19
- ["ν", "nu"], #U+03BD (greek small letter nu)
20
- ["ξ", "xi"], #U+03BE (greek small letter xi)
21
- ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
- ["π", "pi"], #U+03C0 (greek small letter pi)
23
- ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
- ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
- ["τ", "tau"], #U+03C4 (greek small letter tau)
26
- ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
- ["φ", "phi"], #U+03C6 (greek small letter phi)
28
- ["χ", "chi"], #U+03C7 (greek small letter chi)
29
- ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
- ["ω", "omega"], #U+03C9 (greek small letter omega)
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
31
 
32
- ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
- ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
- ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
- ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
- ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
- ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
- ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
- ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
- ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
- ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
- ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
- ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
- ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
- ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
- ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
- ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
- ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
- ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
- ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
- ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
- ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
- ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
- ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
- ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
- ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
57
 
58
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
59
 
60
- ["×", "x"], #U+00D7 (multiplication sign)
61
- ["•", "*"], #U+2022 (bullet)
62
- [" ", " "], #U+2009 (thin space)
63
- [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
65
- [" ", " "], #U+3000 (ideographic space)
66
- ["−", "-"], #U+2212 (minus sign)
67
- ["–", "-"], #U+2013 (en dash)
68
- ["′", "'"], #U+2032 (prime)
69
- ["‘", "'"], #U+2018 (left single quotation mark)
70
- ["’", "'"], #U+2019 (right single quotation mark)
71
- ["“", '"'], #U+201C (left double quotation mark)
72
- ["”", '"'], #U+201D (right double quotation mark)
73
- ['"', "''"]
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (no-break space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["−", "-"], #U+2212 (minus sign)
67
+ ["–", "-"], #U+2013 (en dash)
68
+ ["′", "'"], #U+2032 (prime)
69
+ ["‘", "'"], #U+2018 (left single quotation mark)
70
+ ["’", "'"], #U+2019 (right single quotation mark)
71
+ ["“", '"'], #U+201C (left double quotation mark)
72
+ ["”", '"'], #U+201D (right double quotation mark)
73
+ ['"', "''"]
74
74
  ]
75
- end
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env ruby
2
+ require 'diff-lcs'
3
+ require 'text_alignment/lcs_min'
4
+ require 'text_alignment/find_divisions'
5
+ require 'text_alignment/lcs_comparison'
6
+ require 'text_alignment/lcs_alignment'
7
+ require 'text_alignment/lcs_cdiff'
8
+ require 'text_alignment/glcs_alignment'
9
+ require 'text_alignment/mappings'
10
+
11
+ module TextAlignment; end unless defined? TextAlignment
12
+
13
+ TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
14
+
15
+ class TextAlignment::MixedAlignment
16
+ attr_reader :sdiff
17
+ attr_reader :position_map_begin, :position_map_end
18
+ attr_reader :common_elements, :mapped_elements
19
+ attr_reader :similarity
20
+ attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
21
+
22
+ def initialize(str1, str2, mappings = [])
23
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
24
+ raise ArgumentError, "nil mappings" if mappings.nil?
25
+
26
+ ## preprocessing
27
+ str1 = str1.dup
28
+ str2 = str2.dup
29
+ mappings = mappings.dup
30
+
31
+ ## find the first nomatch character
32
+ TextAlignment::NOMATCH_CHARS.each_char do |c|
33
+ if str2.index(c).nil?
34
+ @nomatch_char1 = c
35
+ break
36
+ end
37
+ end
38
+ raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
39
+
40
+ ## find the second nomatch character (different from the first one, and absent from str1)
41
+ TextAlignment::NOMATCH_CHARS.each_char do |c|
42
+ if c != @nomatch_char1 && str1.index(c).nil?
43
+ @nomatch_char2 = c
44
+ break
45
+ end
46
+ end
47
+ raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
48
+
49
+ # single character mappings
50
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
51
+ characters_from = character_mappings.collect{|m| m[0]}.join
52
+ characters_to = character_mappings.collect{|m| m[1]}.join
53
+ characters_to.gsub!(/-/, '\-')
54
+
55
+ str1.tr!(characters_from, characters_to)
56
+ str2.tr!(characters_from, characters_to)
57
+
58
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
59
+
60
+ # ASCII foldings
61
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
62
+ ascii_foldings.each do |f|
63
+ from = f[1]
64
+
65
+ if str2.index(f[0])
66
+ to = f[0] + (@nomatch_char1 * (f[1].length - 1))
67
+ str1.gsub!(from, to)
68
+ end
69
+
70
+ if str1.index(f[0])
71
+ to = f[0] + (@nomatch_char2 * (f[1].length - 1))
72
+ str2.gsub!(from, to)
73
+ end
74
+ end
75
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
76
+
77
+ _compute_mixed_alignment(str1, str2, mappings)
78
+ end
79
+
80
+ def transform_begin_position(begin_position)
81
+ @position_map_begin[begin_position]
82
+ end
83
+
84
+ def transform_end_position(end_position)
85
+ @position_map_end[end_position]
86
+ end
87
+
88
+ def transform_a_span(span)
89
+ {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
90
+ end
91
+
92
+ def transform_spans(spans)
93
+ spans.map{|span| transform_a_span(span)}
94
+ end
95
+
96
+ def transform_denotations!(denotations)
97
+ denotations.map!{|d| d.begin = @position_map_begin[d.begin]; d.end = @position_map_end[d.end]; d} unless denotations.nil?
98
+ end
99
+
100
+ def transform_hdenotations(hdenotations)
101
+ return nil if hdenotations.nil?
102
+ hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
103
+ end
104
+
105
+ private
106
+
107
+ def _compute_mixed_alignment(str1, str2, mappings = [])
108
+ lcsmin = TextAlignment::LCSMin.new(str1, str2)
109
+ lcs = lcsmin.lcs
110
+ @sdiff = lcsmin.sdiff
111
+
112
+ if @sdiff.nil?
113
+ @similarity = 0
114
+ return
115
+ end
116
+
117
+ cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
118
+ @similarity = cmp.similarity
119
+ @str1_match_initial = cmp.str1_match_initial
120
+ @str1_match_final = cmp.str1_match_final
121
+ @str2_match_initial = cmp.str2_match_initial
122
+ @str2_match_final = cmp.str2_match_final
123
+
124
+ posmap_begin, posmap_end = {}, {}
125
+ @common_elements, @mapped_elements = [], []
126
+
127
+ addition, deletion = [], []
128
+
129
+ @sdiff.each do |h|
130
+ case h.action
131
+ when '='
132
+ p1, p2 = h.old_position, h.new_position
133
+
134
+ @common_elements << [str1[p1], str2[p2]]
135
+ posmap_begin[p1], posmap_end[p1] = p2, p2
136
+
137
+ if !addition.empty? && deletion.empty?
138
+ posmap_end[p1] = p2 - addition.length unless p1 == 0
139
+ elsif addition.empty? && !deletion.empty?
140
+ deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
141
+ elsif !addition.empty? && !deletion.empty?
142
+ if addition.length > 1 || deletion.length > 1
143
+ galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
144
+ galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
145
+ galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
146
+ posmap_begin[p1], posmap_end[p1] = p2, p2
147
+ @common_elements += galign.common_elements
148
+ @mapped_elements += galign.mapped_elements
149
+ else
150
+ posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
151
+ deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
152
+ @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
153
+ end
154
+ end
155
+
156
+ addition.clear; deletion.clear
157
+
158
+ when '!'
159
+ deletion << h.old_position
160
+ addition << h.new_position
161
+ when '-'
162
+ deletion << h.old_position
163
+ when '+'
164
+ addition << h.new_position
165
+ end
166
+ end
167
+
168
+ p1, p2 = str1.length, str2.length
169
+ posmap_begin[p1], posmap_end[p1] = p2, p2
170
+
171
+ if !addition.empty? && deletion.empty?
172
+ posmap_end[p1] = p2 - addition.length unless p1 == 0
173
+ elsif addition.empty? && !deletion.empty?
174
+ deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
175
+ elsif !addition.empty? && !deletion.empty?
176
+ if addition.length > 1 && deletion.length > 1
177
+ galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
178
+ galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
179
+ galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
180
+ posmap_begin[p1], posmap_end[p1] = p2, p2
181
+ @common_elements += galign.common_elements
182
+ @mapped_elements += galign.mapped_elements
183
+ else
184
+ posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
185
+ deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
186
+ @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
187
+ end
188
+ end
189
+
190
+ @position_map_begin = posmap_begin.sort.to_h
191
+ @position_map_end = posmap_end.sort.to_h
192
+ end
193
+ end
@@ -1,181 +1,239 @@
1
1
  #!/usr/bin/env ruby
2
- require 'diff-lcs'
3
- require 'text_alignment/lcs_min'
4
- require 'text_alignment/find_divisions'
5
- require 'text_alignment/lcs_comparison'
6
- require 'text_alignment/lcs_alignment'
7
- require 'text_alignment/lcs_cdiff'
8
- require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
2
+ require 'text_alignment/anchor_finder'
3
+ require 'text_alignment/mixed_alignment'
10
4
 
11
5
  module TextAlignment; end unless defined? TextAlignment
12
6
 
13
- TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
14
- TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
7
+ TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
+ TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
15
9
 
16
10
  class TextAlignment::TextAlignment
17
- attr_reader :sdiff
18
- attr_reader :position_map_begin, :position_map_end
19
- attr_reader :common_elements, :mapped_elements
20
- attr_reader :similarity
21
- attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
22
-
23
- def initialize(str1, str2, mappings = [])
24
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
25
- raise ArgumentError, "nil mappings" if mappings.nil?
26
-
27
- ## preprocessing
28
- str1 = str1.dup
29
- str2 = str2.dup
30
- mappings = mappings.dup
31
-
32
- ## find the first nomatch character
33
- TextAlignment::NOMATCH_CHARS.each_char do |c|
34
- if str2.index(c).nil?
35
- @nomatch_char1 = c
36
- break
37
- end
38
- end
39
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
40
-
41
- ## find the first nomatch character
42
- TextAlignment::NOMATCH_CHARS.each_char do |c|
43
- if c != @nomatch_char1 && str1.index(c).nil?
44
- @nomatch_char2 = c
45
- break
46
- end
47
- end
48
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
49
-
50
- # single character mappings
51
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
52
- characters_from = character_mappings.collect{|m| m[0]}.join
53
- characters_to = character_mappings.collect{|m| m[1]}.join
54
- characters_to.gsub!(/-/, '\-')
55
-
56
- str1.tr!(characters_from, characters_to)
57
- str2.tr!(characters_from, characters_to)
58
-
59
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
60
-
61
- # ASCII foldings
62
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
63
- ascii_foldings.each do |f|
64
- from = f[1]
65
-
66
- if str2.index(f[0])
67
- to = f[0] + (@nomatch_char1 * (f[1].length - 1))
68
- str1.gsub!(from, to)
69
- end
70
-
71
- if str1.index(f[0])
72
- to = f[0] + (@nomatch_char2 * (f[1].length - 1))
73
- str2.gsub!(from, to)
74
- end
75
- end
76
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
77
-
78
- _compute_mixed_alignment(str1, str2, mappings)
79
- end
80
-
81
- def transform_a_span(span)
82
- {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
83
- end
84
-
85
- def transform_spans(spans)
86
- spans.map{|span| transform_a_span(span)}
87
- end
88
-
89
- def transform_denotations!(denotations)
90
- denotations.map!{|d| d.begin = @position_map_begin[d.begin]; d.end = @position_map_end[d.end]; d} unless denotations.nil?
91
- end
92
-
93
- def transform_hdenotations(hdenotations)
94
- return nil if hdenotations.nil?
95
- hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
96
- end
97
-
98
- private
99
-
100
- def _compute_mixed_alignment(str1, str2, mappings = [])
101
- lcsmin = TextAlignment::LCSMin.new(str1, str2)
102
- lcs = lcsmin.lcs
103
- @sdiff = lcsmin.sdiff
104
-
105
- cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
106
- @similarity = cmp.similarity
107
- @str1_match_initial = cmp.str1_match_initial
108
- @str1_match_final = cmp.str1_match_final
109
- @str2_match_initial = cmp.str2_match_initial
110
- @str2_match_final = cmp.str2_match_final
111
-
112
- posmap_begin, posmap_end = {}, {}
113
- @common_elements, @mapped_elements = [], []
114
-
115
- addition, deletion = [], []
116
-
117
- @sdiff.each do |h|
118
- case h.action
119
- when '='
120
- p1, p2 = h.old_position, h.new_position
121
-
122
- @common_elements << [str1[p1], str2[p2]]
123
- posmap_begin[p1], posmap_end[p1] = p2, p2
124
-
125
- if !addition.empty? && deletion.empty?
126
- posmap_end[p1] = p2 - addition.length unless p1 == 0
127
- elsif addition.empty? && !deletion.empty?
128
- deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
129
- elsif !addition.empty? && !deletion.empty?
130
- if addition.length > 1 || deletion.length > 1
131
- galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
132
- galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
133
- galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
134
- posmap_begin[p1], posmap_end[p1] = p2, p2
135
- @common_elements += galign.common_elements
136
- @mapped_elements += galign.mapped_elements
137
- else
138
- posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
139
- deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
140
- @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
141
- end
142
- end
143
-
144
- addition.clear; deletion.clear
145
-
146
- when '!'
147
- deletion << h.old_position
148
- addition << h.new_position
149
- when '-'
150
- deletion << h.old_position
151
- when '+'
152
- addition << h.new_position
153
- end
154
- end
155
-
156
- p1, p2 = str1.length, str2.length
157
- posmap_begin[p1], posmap_end[p1] = p2, p2
158
-
159
- if !addition.empty? && deletion.empty?
160
- posmap_end[p1] = p2 - addition.length unless p1 == 0
161
- elsif addition.empty? && !deletion.empty?
162
- deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
163
- elsif !addition.empty? && !deletion.empty?
164
- if addition.length > 1 && deletion.length > 1
165
- galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
166
- galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
167
- galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
168
- posmap_begin[p1], posmap_end[p1] = p2, p2
169
- @common_elements += galign.common_elements
170
- @mapped_elements += galign.mapped_elements
171
- else
172
- posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
173
- deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
174
- @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
175
- end
176
- end
177
-
178
- @position_map_begin = posmap_begin.sort.to_h
179
- @position_map_end = posmap_end.sort.to_h
180
- end
11
+ attr_reader :block_alignments
12
+ attr_reader :similarity
13
+ attr_reader :lost_annotations
14
+
15
+ def initialize(str1, str2, mappings = nil)
16
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
+
18
+ mappings ||= TextAlignment::MAPPINGS
19
+
20
+ # try exact match
21
+ block_begin = str2.index(str1)
22
+ unless block_begin.nil?
23
+ @block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
24
+ return @block_alignments
25
+ end
26
+
27
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
28
+
29
+ # To collect matched blocks
30
+ mblocks = []
31
+ while anchor = anchor_finder.get_next_anchor
32
+ last = mblocks.last
33
+ if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
34
+ last[:source][:end] = anchor[:source][:end]
35
+ last[:target][:end] = anchor[:target][:end]
36
+ else
37
+ mblocks << anchor
38
+ end
39
+ end
40
+
41
+ # mblocks.each do |b|
42
+ # p [b[:source], b[:target]]
43
+ # puts "---"
44
+ # puts str1[b[:source][:begin] ... b[:source][:end]]
45
+ # puts "---"
46
+ # puts str2[b[:target][:begin] ... b[:target][:end]]
47
+ # puts "====="
48
+ # puts
49
+ # end
50
+ # puts "-=-=-=-=-"
51
+ # puts
52
+
53
+ ## To find block alignments
54
+ @block_alignments = []
55
+ return if mblocks.empty?
56
+
57
+ # Initial step
58
+ if mblocks[0][:source][:begin] > 0
59
+ e1 = mblocks[0][:source][:begin]
60
+ e2 = mblocks[0][:target][:begin]
61
+
62
+ if mblocks[0][:target][:begin] == 0
63
+ @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
64
+ else
65
+ _str1 = str1[0 ... e1]
66
+ _str2 = str2[0 ... e2]
67
+
68
+ unless _str1.strip.empty?
69
+ if _str2.strip.empty?
70
+ @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
71
+ else
72
+ len_min = [_str1.length, _str2.length].min
73
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
74
+ b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
75
+ b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
76
+
77
+ @block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
78
+
79
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
80
+ if alignment.similarity < 0.6
81
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
82
+ else
83
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
84
+ end
85
+ end
86
+ end
87
+ end
88
+ end
89
+ @block_alignments << mblocks[0]
90
+
91
+ (1 ... mblocks.length).each do |i|
92
+ b1 = mblocks[i - 1][:source][:end]
93
+ b2 = mblocks[i - 1][:target][:end]
94
+ e1 = mblocks[i][:source][:begin]
95
+ e2 = mblocks[i][:target][:begin]
96
+ _str1 = str1[b1 ... e1]
97
+ _str2 = str2[b2 ... e2]
98
+ unless _str1.strip.empty?
99
+ if _str2.strip.empty?
100
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
101
+ else
102
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
103
+ if alignment.similarity < 0.6
104
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
105
+ else
106
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
107
+ end
108
+ end
109
+ end
110
+ @block_alignments << mblocks[i]
111
+ end
112
+
113
+ # Final step
114
+ if mblocks[-1][:source][:end] < str1.length
115
+ b1 = mblocks[-1][:source][:end]
116
+ b2 = mblocks[-1][:target][:end]
117
+
118
+ if mblocks[-1][:target][:end] < str2.length
119
+
120
+ else
121
+ e1 = str1.length
122
+ e2 = str2.length
123
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
124
+ end
125
+ end
126
+
127
+ if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
128
+ b1 = mblocks[-1][:source][:end]
129
+ b2 = mblocks[-1][:target][:end]
130
+ _str1 = str1[b1 ... -1]
131
+ _str2 = str2[b2 ... -1]
132
+
133
+ unless _str1.strip.empty?
134
+ if _str2.strip.empty?
135
+ @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
136
+ else
137
+ len_min = [_str1.length, _str2.length].min
138
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
139
+ e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
140
+ e2 = _str2.length < len_buffer ? str1.length : b2 + len_buffer
141
+
142
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
143
+ if alignment.similarity < 0.6
144
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
145
+ else
146
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
147
+ end
148
+
149
+ @block_alignments << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
150
+ end
151
+ end
152
+ end
153
+
154
+ @block_alignments.each do |a|
155
+ a[:delta] = a[:target][:begin] - a[:source][:begin]
156
+ end
157
+ end
158
+
159
+ def transform_begin_position(begin_position)
160
+ i = @block_alignments.index{|b| b[:source][:end] > begin_position}
161
+ block_alignment = @block_alignments[i]
162
+
163
+ b = if block_alignment[:alignment].nil?
164
+ begin_position + block_alignment[:delta]
165
+ elsif block_alignment[:alignment] == :empty
166
+ if begin_position == block_alignment[:source][:begin]
167
+ block_alignment[:target][:begin]
168
+ else
169
+ raise "lost annotation"
170
+ end
171
+ else
172
+ block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
173
+ end
174
+ end
175
+
176
+ def transform_end_position(end_position)
177
+ i = @block_alignments.index{|b| b[:source][:end] >= end_position}
178
+ block_alignment = @block_alignments[i]
179
+
180
+ e = if block_alignment[:alignment].nil?
181
+ end_position + block_alignment[:delta]
182
+ elsif block_alignment[:alignment] == :empty
183
+ if end_position == block_alignment[:source][:end]
184
+ block_alignment[:target][:end]
185
+ else
186
+ raise "lost annotation"
187
+ end
188
+ else
189
+ block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
190
+ end
191
+ end
192
+
193
+ def transform_a_span(span)
194
+ {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
195
+ end
196
+
197
+ def transform_spans(spans)
198
+ spans.map{|span| transform_a_span(span)}
199
+ end
200
+
201
+ def transform_denotations!(denotations)
202
+ puts "hereherehere========"
203
+ return nil if denotations.nil?
204
+ @lost_annotations = []
205
+
206
+ pp denotations
207
+ puts "-----"
208
+
209
+ denotations.each do |d|
210
+ begin
211
+ d.begin = transform_begin_position(d.begin);
212
+ d.end = transform_end_position(d.end);
213
+ rescue
214
+ @lost_annotations << d
215
+ d.begin = nil
216
+ d.end = nil
217
+ end
218
+ end
219
+
220
+ pp denotations
221
+ end
222
+
223
+ def transform_hdenotations(hdenotations)
224
+ return nil if hdenotations.nil?
225
+ @lost_annotations = []
226
+
227
+ r = hdenotations.collect do |d|
228
+ new_d = begin
229
+ d.dup.merge({span:transform_a_span(d[:span])})
230
+ rescue
231
+ @lost_annotations << d
232
+ nil
233
+ end
234
+ end.compact
235
+
236
+ r
237
+ end
238
+
181
239
  end