text_alignment 0.2.9 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,75 +1,74 @@
1
- module TextAlignment
1
+ module TextAlignment; end unless defined? TextAlignment
2
2
 
3
- TextAlignment::MAPPINGS = [
4
- ["©", "(c)"], #U+00A9 (Copyright Sign)
3
+ TextAlignment::MAPPINGS = [
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
5
 
6
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
- ["β", "beta"], #U+03B2 (greek small letter beta)
8
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
- ["δ", "delta"], #U+03B4 (greek small letter delta)
10
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
- ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
- ["η", "eta"], #U+03B7 (greek small letter eta)
13
- ["θ", "theta"], #U+03B8 (greek small letter theta)
14
- ["ι", "iota"], #U+03B9 (greek small letter iota)
15
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
- ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
- ["μ", "mu"], #U+03BC (greek small letter mu)
19
- ["ν", "nu"], #U+03BD (greek small letter nu)
20
- ["ξ", "xi"], #U+03BE (greek small letter xi)
21
- ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
- ["π", "pi"], #U+03C0 (greek small letter pi)
23
- ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
- ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
- ["τ", "tau"], #U+03C4 (greek small letter tau)
26
- ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
- ["φ", "phi"], #U+03C6 (greek small letter phi)
28
- ["χ", "chi"], #U+03C7 (greek small letter chi)
29
- ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
- ["ω", "omega"], #U+03C9 (greek small letter omega)
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
31
 
32
- ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
- ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
- ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
- ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
- ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
- ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
- ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
- ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
- ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
- ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
- ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
- ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
- ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
- ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
- ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
- ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
- ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
- ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
- ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
- ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
- ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
- ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
- ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
- ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
- ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
57
 
58
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
59
 
60
- ["×", "x"], #U+00D7 (multiplication sign)
61
- ["•", "*"], #U+2022 (bullet)
62
- [" ", " "], #U+2009 (thin space)
63
- [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
65
- [" ", " "], #U+3000 (ideographic space)
66
- ["−", "-"], #U+2212 (minus sign)
67
- ["–", "-"], #U+2013 (en dash)
68
- ["′", "'"], #U+2032 (prime)
69
- ["‘", "'"], #U+2018 (left single quotation mark)
70
- ["’", "'"], #U+2019 (right single quotation mark)
71
- ["“", '"'], #U+201C (left double quotation mark)
72
- ["”", '"'], #U+201D (right double quotation mark)
73
- ['"', "''"]
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (no-break space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["−", "-"], #U+2212 (minus sign)
67
+ ["–", "-"], #U+2013 (en dash)
68
+ ["′", "'"], #U+2032 (prime)
69
+ ["‘", "'"], #U+2018 (left single quotation mark)
70
+ ["’", "'"], #U+2019 (right single quotation mark)
71
+ ["“", '"'], #U+201C (left double quotation mark)
72
+ ["”", '"'], #U+201D (right double quotation mark)
73
+ ['"', "''"]
74
74
  ]
75
- end
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env ruby
2
+ require 'diff-lcs'
3
+ require 'text_alignment/lcs_min'
4
+ require 'text_alignment/find_divisions'
5
+ require 'text_alignment/lcs_comparison'
6
+ require 'text_alignment/lcs_alignment'
7
+ require 'text_alignment/lcs_cdiff'
8
+ require 'text_alignment/glcs_alignment'
9
+ require 'text_alignment/mappings'
10
+
11
+ module TextAlignment; end unless defined? TextAlignment
12
+
13
+ TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
14
+
15
+ class TextAlignment::MixedAlignment
16
+ attr_reader :sdiff
17
+ attr_reader :position_map_begin, :position_map_end
18
+ attr_reader :common_elements, :mapped_elements
19
+ attr_reader :similarity
20
+ attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
21
+
22
+ def initialize(str1, str2, mappings = [])
23
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
24
+ raise ArgumentError, "nil mappings" if mappings.nil?
25
+
26
+ ## preprocessing
27
+ str1 = str1.dup
28
+ str2 = str2.dup
29
+ mappings = mappings.dup
30
+
31
+ ## find the first nomatch character
32
+ TextAlignment::NOMATCH_CHARS.each_char do |c|
33
+ if str2.index(c).nil?
34
+ @nomatch_char1 = c
35
+ break
36
+ end
37
+ end
38
+ raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
39
+
40
+ ## find the second nomatch character (absent from str1 and distinct from the first)
41
+ TextAlignment::NOMATCH_CHARS.each_char do |c|
42
+ if c != @nomatch_char1 && str1.index(c).nil?
43
+ @nomatch_char2 = c
44
+ break
45
+ end
46
+ end
47
+ raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
48
+
49
+ # single character mappings
50
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
51
+ characters_from = character_mappings.collect{|m| m[0]}.join
52
+ characters_to = character_mappings.collect{|m| m[1]}.join
53
+ characters_to.gsub!(/-/, '\-')
54
+
55
+ str1.tr!(characters_from, characters_to)
56
+ str2.tr!(characters_from, characters_to)
57
+
58
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
59
+
60
+ # ASCII foldings
61
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
62
+ ascii_foldings.each do |f|
63
+ from = f[1]
64
+
65
+ if str2.index(f[0])
66
+ to = f[0] + (@nomatch_char1 * (f[1].length - 1))
67
+ str1.gsub!(from, to)
68
+ end
69
+
70
+ if str1.index(f[0])
71
+ to = f[0] + (@nomatch_char2 * (f[1].length - 1))
72
+ str2.gsub!(from, to)
73
+ end
74
+ end
75
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
76
+
77
+ _compute_mixed_alignment(str1, str2, mappings)
78
+ end
79
+
80
+ def transform_begin_position(begin_position)
81
+ @position_map_begin[begin_position]
82
+ end
83
+
84
+ def transform_end_position(end_position)
85
+ @position_map_end[end_position]
86
+ end
87
+
88
+ def transform_a_span(span)
89
+ {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
90
+ end
91
+
92
+ def transform_spans(spans)
93
+ spans.map{|span| transform_a_span(span)}
94
+ end
95
+
96
+ def transform_denotations!(denotations)
97
+ denotations.map!{|d| d.begin = @position_map_begin[d.begin]; d.end = @position_map_end[d.end]; d} unless denotations.nil?
98
+ end
99
+
100
+ def transform_hdenotations(hdenotations)
101
+ return nil if hdenotations.nil?
102
+ hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
103
+ end
104
+
105
+ private
106
+
107
+ def _compute_mixed_alignment(str1, str2, mappings = [])
108
+ lcsmin = TextAlignment::LCSMin.new(str1, str2)
109
+ lcs = lcsmin.lcs
110
+ @sdiff = lcsmin.sdiff
111
+
112
+ if @sdiff.nil?
113
+ @similarity = 0
114
+ return
115
+ end
116
+
117
+ cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
118
+ @similarity = cmp.similarity
119
+ @str1_match_initial = cmp.str1_match_initial
120
+ @str1_match_final = cmp.str1_match_final
121
+ @str2_match_initial = cmp.str2_match_initial
122
+ @str2_match_final = cmp.str2_match_final
123
+
124
+ posmap_begin, posmap_end = {}, {}
125
+ @common_elements, @mapped_elements = [], []
126
+
127
+ addition, deletion = [], []
128
+
129
+ @sdiff.each do |h|
130
+ case h.action
131
+ when '='
132
+ p1, p2 = h.old_position, h.new_position
133
+
134
+ @common_elements << [str1[p1], str2[p2]]
135
+ posmap_begin[p1], posmap_end[p1] = p2, p2
136
+
137
+ if !addition.empty? && deletion.empty?
138
+ posmap_end[p1] = p2 - addition.length unless p1 == 0
139
+ elsif addition.empty? && !deletion.empty?
140
+ deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
141
+ elsif !addition.empty? && !deletion.empty?
142
+ if addition.length > 1 || deletion.length > 1
143
+ galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
144
+ galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
145
+ galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
146
+ posmap_begin[p1], posmap_end[p1] = p2, p2
147
+ @common_elements += galign.common_elements
148
+ @mapped_elements += galign.mapped_elements
149
+ else
150
+ posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
151
+ deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
152
+ @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
153
+ end
154
+ end
155
+
156
+ addition.clear; deletion.clear
157
+
158
+ when '!'
159
+ deletion << h.old_position
160
+ addition << h.new_position
161
+ when '-'
162
+ deletion << h.old_position
163
+ when '+'
164
+ addition << h.new_position
165
+ end
166
+ end
167
+
168
+ p1, p2 = str1.length, str2.length
169
+ posmap_begin[p1], posmap_end[p1] = p2, p2
170
+
171
+ if !addition.empty? && deletion.empty?
172
+ posmap_end[p1] = p2 - addition.length unless p1 == 0
173
+ elsif addition.empty? && !deletion.empty?
174
+ deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
175
+ elsif !addition.empty? && !deletion.empty?
176
+ if addition.length > 1 && deletion.length > 1
177
+ galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
178
+ galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
179
+ galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
180
+ posmap_begin[p1], posmap_end[p1] = p2, p2
181
+ @common_elements += galign.common_elements
182
+ @mapped_elements += galign.mapped_elements
183
+ else
184
+ posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
185
+ deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
186
+ @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
187
+ end
188
+ end
189
+
190
+ @position_map_begin = posmap_begin.sort.to_h
191
+ @position_map_end = posmap_end.sort.to_h
192
+ end
193
+ end
@@ -1,181 +1,239 @@
1
1
  #!/usr/bin/env ruby
2
- require 'diff-lcs'
3
- require 'text_alignment/lcs_min'
4
- require 'text_alignment/find_divisions'
5
- require 'text_alignment/lcs_comparison'
6
- require 'text_alignment/lcs_alignment'
7
- require 'text_alignment/lcs_cdiff'
8
- require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
2
+ require 'text_alignment/anchor_finder'
3
+ require 'text_alignment/mixed_alignment'
10
4
 
11
5
  module TextAlignment; end unless defined? TextAlignment
12
6
 
13
- TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
14
- TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
7
+ TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
+ TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
15
9
 
16
10
  class TextAlignment::TextAlignment
17
- attr_reader :sdiff
18
- attr_reader :position_map_begin, :position_map_end
19
- attr_reader :common_elements, :mapped_elements
20
- attr_reader :similarity
21
- attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
22
-
23
- def initialize(str1, str2, mappings = [])
24
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
25
- raise ArgumentError, "nil mappings" if mappings.nil?
26
-
27
- ## preprocessing
28
- str1 = str1.dup
29
- str2 = str2.dup
30
- mappings = mappings.dup
31
-
32
- ## find the first nomatch character
33
- TextAlignment::NOMATCH_CHARS.each_char do |c|
34
- if str2.index(c).nil?
35
- @nomatch_char1 = c
36
- break
37
- end
38
- end
39
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
40
-
41
- ## find the second nomatch character (absent from str1 and distinct from the first)
42
- TextAlignment::NOMATCH_CHARS.each_char do |c|
43
- if c != @nomatch_char1 && str1.index(c).nil?
44
- @nomatch_char2 = c
45
- break
46
- end
47
- end
48
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
49
-
50
- # single character mappings
51
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
52
- characters_from = character_mappings.collect{|m| m[0]}.join
53
- characters_to = character_mappings.collect{|m| m[1]}.join
54
- characters_to.gsub!(/-/, '\-')
55
-
56
- str1.tr!(characters_from, characters_to)
57
- str2.tr!(characters_from, characters_to)
58
-
59
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
60
-
61
- # ASCII foldings
62
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
63
- ascii_foldings.each do |f|
64
- from = f[1]
65
-
66
- if str2.index(f[0])
67
- to = f[0] + (@nomatch_char1 * (f[1].length - 1))
68
- str1.gsub!(from, to)
69
- end
70
-
71
- if str1.index(f[0])
72
- to = f[0] + (@nomatch_char2 * (f[1].length - 1))
73
- str2.gsub!(from, to)
74
- end
75
- end
76
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
77
-
78
- _compute_mixed_alignment(str1, str2, mappings)
79
- end
80
-
81
- def transform_a_span(span)
82
- {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
83
- end
84
-
85
- def transform_spans(spans)
86
- spans.map{|span| transform_a_span(span)}
87
- end
88
-
89
- def transform_denotations!(denotations)
90
- denotations.map!{|d| d.begin = @position_map_begin[d.begin]; d.end = @position_map_end[d.end]; d} unless denotations.nil?
91
- end
92
-
93
- def transform_hdenotations(hdenotations)
94
- return nil if hdenotations.nil?
95
- hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
96
- end
97
-
98
- private
99
-
100
- def _compute_mixed_alignment(str1, str2, mappings = [])
101
- lcsmin = TextAlignment::LCSMin.new(str1, str2)
102
- lcs = lcsmin.lcs
103
- @sdiff = lcsmin.sdiff
104
-
105
- cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
106
- @similarity = cmp.similarity
107
- @str1_match_initial = cmp.str1_match_initial
108
- @str1_match_final = cmp.str1_match_final
109
- @str2_match_initial = cmp.str2_match_initial
110
- @str2_match_final = cmp.str2_match_final
111
-
112
- posmap_begin, posmap_end = {}, {}
113
- @common_elements, @mapped_elements = [], []
114
-
115
- addition, deletion = [], []
116
-
117
- @sdiff.each do |h|
118
- case h.action
119
- when '='
120
- p1, p2 = h.old_position, h.new_position
121
-
122
- @common_elements << [str1[p1], str2[p2]]
123
- posmap_begin[p1], posmap_end[p1] = p2, p2
124
-
125
- if !addition.empty? && deletion.empty?
126
- posmap_end[p1] = p2 - addition.length unless p1 == 0
127
- elsif addition.empty? && !deletion.empty?
128
- deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
129
- elsif !addition.empty? && !deletion.empty?
130
- if addition.length > 1 || deletion.length > 1
131
- galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
132
- galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
133
- galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
134
- posmap_begin[p1], posmap_end[p1] = p2, p2
135
- @common_elements += galign.common_elements
136
- @mapped_elements += galign.mapped_elements
137
- else
138
- posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
139
- deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
140
- @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
141
- end
142
- end
143
-
144
- addition.clear; deletion.clear
145
-
146
- when '!'
147
- deletion << h.old_position
148
- addition << h.new_position
149
- when '-'
150
- deletion << h.old_position
151
- when '+'
152
- addition << h.new_position
153
- end
154
- end
155
-
156
- p1, p2 = str1.length, str2.length
157
- posmap_begin[p1], posmap_end[p1] = p2, p2
158
-
159
- if !addition.empty? && deletion.empty?
160
- posmap_end[p1] = p2 - addition.length unless p1 == 0
161
- elsif addition.empty? && !deletion.empty?
162
- deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
163
- elsif !addition.empty? && !deletion.empty?
164
- if addition.length > 1 && deletion.length > 1
165
- galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
166
- galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
167
- galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
168
- posmap_begin[p1], posmap_end[p1] = p2, p2
169
- @common_elements += galign.common_elements
170
- @mapped_elements += galign.mapped_elements
171
- else
172
- posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
173
- deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
174
- @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
175
- end
176
- end
177
-
178
- @position_map_begin = posmap_begin.sort.to_h
179
- @position_map_end = posmap_end.sort.to_h
180
- end
11
+ attr_reader :block_alignments
12
+ attr_reader :similarity
13
+ attr_reader :lost_annotations
14
+
15
+ def initialize(str1, str2, mappings = nil)
16
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
+
18
+ mappings ||= TextAlignment::MAPPINGS
19
+
20
+ # try exact match
21
+ block_begin = str2.index(str1)
22
+ unless block_begin.nil?
23
+ @block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
24
+ return @block_alignments
25
+ end
26
+
27
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
28
+
29
+ # To collect matched blocks
30
+ mblocks = []
31
+ while anchor = anchor_finder.get_next_anchor
32
+ last = mblocks.last
33
+ if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
34
+ last[:source][:end] = anchor[:source][:end]
35
+ last[:target][:end] = anchor[:target][:end]
36
+ else
37
+ mblocks << anchor
38
+ end
39
+ end
40
+
41
+ # mblocks.each do |b|
42
+ # p [b[:source], b[:target]]
43
+ # puts "---"
44
+ # puts str1[b[:source][:begin] ... b[:source][:end]]
45
+ # puts "---"
46
+ # puts str2[b[:target][:begin] ... b[:target][:end]]
47
+ # puts "====="
48
+ # puts
49
+ # end
50
+ # puts "-=-=-=-=-"
51
+ # puts
52
+
53
+ ## To find block alignments
54
+ @block_alignments = []
55
+ return if mblocks.empty?
56
+
57
+ # Initial step
58
+ if mblocks[0][:source][:begin] > 0
59
+ e1 = mblocks[0][:source][:begin]
60
+ e2 = mblocks[0][:target][:begin]
61
+
62
+ if mblocks[0][:target][:begin] == 0
63
+ @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
64
+ else
65
+ _str1 = str1[0 ... e1]
66
+ _str2 = str2[0 ... e2]
67
+
68
+ unless _str1.strip.empty?
69
+ if _str2.strip.empty?
70
+ @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
71
+ else
72
+ len_min = [_str1.length, _str2.length].min
73
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
74
+ b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
75
+ b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
76
+
77
+ @block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
78
+
79
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
80
+ if alignment.similarity < 0.6
81
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
82
+ else
83
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
84
+ end
85
+ end
86
+ end
87
+ end
88
+ end
89
+ @block_alignments << mblocks[0]
90
+
91
+ (1 ... mblocks.length).each do |i|
92
+ b1 = mblocks[i - 1][:source][:end]
93
+ b2 = mblocks[i - 1][:target][:end]
94
+ e1 = mblocks[i][:source][:begin]
95
+ e2 = mblocks[i][:target][:begin]
96
+ _str1 = str1[b1 ... e1]
97
+ _str2 = str2[b2 ... e2]
98
+ unless _str1.strip.empty?
99
+ if _str2.strip.empty?
100
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
101
+ else
102
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
103
+ if alignment.similarity < 0.6
104
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
105
+ else
106
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
107
+ end
108
+ end
109
+ end
110
+ @block_alignments << mblocks[i]
111
+ end
112
+
113
+ # Final step
114
+ if mblocks[-1][:source][:end] < str1.length
115
+ b1 = mblocks[-1][:source][:end]
116
+ b2 = mblocks[-1][:target][:end]
117
+
118
+ if mblocks[-1][:target][:end] < str2.length
119
+
120
+ else
121
+ e1 = str1.length
122
+ e2 = str2.length
123
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
124
+ end
125
+ end
126
+
127
+ if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
128
+ b1 = mblocks[-1][:source][:end]
129
+ b2 = mblocks[-1][:target][:end]
130
+ _str1 = str1[b1 ... -1]
131
+ _str2 = str2[b2 ... -1]
132
+
133
+ unless _str1.strip.empty?
134
+ if _str2.strip.empty?
135
+ @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
136
+ else
137
+ len_min = [_str1.length, _str2.length].min
138
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
139
+ e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
140
+ e2 = _str2.length < len_buffer ? str1.length : b2 + len_buffer
141
+
142
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
143
+ if alignment.similarity < 0.6
144
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
145
+ else
146
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
147
+ end
148
+
149
+ @block_alignments << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
150
+ end
151
+ end
152
+ end
153
+
154
+ @block_alignments.each do |a|
155
+ a[:delta] = a[:target][:begin] - a[:source][:begin]
156
+ end
157
+ end
158
+
159
+ def transform_begin_position(begin_position)
160
+ i = @block_alignments.index{|b| b[:source][:end] > begin_position}
161
+ block_alignment = @block_alignments[i]
162
+
163
+ b = if block_alignment[:alignment].nil?
164
+ begin_position + block_alignment[:delta]
165
+ elsif block_alignment[:alignment] == :empty
166
+ if begin_position == block_alignment[:source][:begin]
167
+ block_alignment[:target][:begin]
168
+ else
169
+ raise "lost annotation"
170
+ end
171
+ else
172
+ block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
173
+ end
174
+ end
175
+
176
+ def transform_end_position(end_position)
177
+ i = @block_alignments.index{|b| b[:source][:end] >= end_position}
178
+ block_alignment = @block_alignments[i]
179
+
180
+ e = if block_alignment[:alignment].nil?
181
+ end_position + block_alignment[:delta]
182
+ elsif block_alignment[:alignment] == :empty
183
+ if end_position == block_alignment[:source][:end]
184
+ block_alignment[:target][:end]
185
+ else
186
+ raise "lost annotation"
187
+ end
188
+ else
189
+ block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
190
+ end
191
+ end
192
+
193
+ def transform_a_span(span)
194
+ {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
195
+ end
196
+
197
+ def transform_spans(spans)
198
+ spans.map{|span| transform_a_span(span)}
199
+ end
200
+
201
+ def transform_denotations!(denotations)
202
+ puts "hereherehere========"
203
+ return nil if denotations.nil?
204
+ @lost_annotations = []
205
+
206
+ pp denotations
207
+ puts "-----"
208
+
209
+ denotations.each do |d|
210
+ begin
211
+ d.begin = transform_begin_position(d.begin);
212
+ d.end = transform_end_position(d.end);
213
+ rescue
214
+ @lost_annotations << d
215
+ d.begin = nil
216
+ d.end = nil
217
+ end
218
+ end
219
+
220
+ pp denotations
221
+ end
222
+
223
+ def transform_hdenotations(hdenotations)
224
+ return nil if hdenotations.nil?
225
+ @lost_annotations = []
226
+
227
+ r = hdenotations.collect do |d|
228
+ new_d = begin
229
+ d.dup.merge({span:transform_a_span(d[:span])})
230
+ rescue
231
+ @lost_annotations << d
232
+ nil
233
+ end
234
+ end.compact
235
+
236
+ r
237
+ end
238
+
181
239
  end