text_alignment 0.2.9 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,75 +1,74 @@
1
- module TextAlignment
1
+ module TextAlignment; end unless defined? TextAlignment
2
2
 
3
- TextAlignment::MAPPINGS = [
4
- ["©", "(c)"], #U+00A9 (Copyright Sign)
3
+ TextAlignment::MAPPINGS = [
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
5
 
6
- ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
- ["β", "beta"], #U+03B2 (greek small letter beta)
8
- ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
- ["δ", "delta"], #U+03B4 (greek small letter delta)
10
- ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
- ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
- ["η", "eta"], #U+03B7 (greek small letter eta)
13
- ["θ", "theta"], #U+03B7 (greek small letter eta)
14
- ["ι", "iota"], #U+03B7 (greek small letter eta)
15
- ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
- ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
- ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
- ["μ", "mu"], #U+03BC (greek small letter mu)
19
- ["ν", "nu"], #U+03BD (greek small letter nu)
20
- ["ξ", "xi"], #U+03BE (greek small letter xi)
21
- ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
- ["π", "pi"], #U+03C0 (greek small letter pi)
23
- ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
- ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
- ["τ", "tau"], #U+03C4 (greek small letter tau)
26
- ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
- ["φ", "phi"], #U+03C6 (greek small letter phi)
28
- ["χ", "chi"], #U+03C7 (greek small letter chi)
29
- ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
- ["ω", "omega"], #U+03C9 (greek small letter omega)
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
31
 
32
- ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
- ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
- ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
- ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
- ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
- ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
- ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
- ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
- ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
- ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
- ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
- ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
- ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
- ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
- ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
- ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
- ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
- ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
- ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
- ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
- ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
- ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
- ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
- ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
- ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
57
 
58
- ["ϕ", "phi"], #U+03D5 (greek phi symbol)
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
59
 
60
- ["×", "x"], #U+00D7 (multiplication sign)
61
- ["•", "*"], #U+2022 (bullet)
62
- [" ", " "], #U+2009 (thin space)
63
- [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
65
- [" ", " "], #U+3000 (ideographic space)
66
- ["−", "-"], #U+2212 (minus sign)
67
- ["–", "-"], #U+2013 (en dash)
68
- ["′", "'"], #U+2032 (prime)
69
- ["‘", "'"], #U+2018 (left single quotation mark)
70
- ["’", "'"], #U+2019 (right single quotation mark)
71
- ["“", '"'], #U+201C (left double quotation mark)
72
- ["”", '"'], #U+201D (right double quotation mark)
73
- ['"', "''"]
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (no-break space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["−", "-"], #U+2212 (minus sign)
67
+ ["–", "-"], #U+2013 (en dash)
68
+ ["′", "'"], #U+2032 (prime)
69
+ ["‘", "'"], #U+2018 (left single quotation mark)
70
+ ["’", "'"], #U+2019 (right single quotation mark)
71
+ ["“", '"'], #U+201C (left double quotation mark)
72
+ ["”", '"'], #U+201D (right double quotation mark)
73
+ ['"', "''"]
74
74
  ]
75
- end
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env ruby
2
+ require 'diff-lcs'
3
+ require 'text_alignment/lcs_min'
4
+ require 'text_alignment/find_divisions'
5
+ require 'text_alignment/lcs_comparison'
6
+ require 'text_alignment/lcs_alignment'
7
+ require 'text_alignment/lcs_cdiff'
8
+ require 'text_alignment/glcs_alignment'
9
+ require 'text_alignment/mappings'
10
+
11
+ module TextAlignment; end unless defined? TextAlignment
12
+
13
+ TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
14
+
15
+ class TextAlignment::MixedAlignment
16
+ attr_reader :sdiff
17
+ attr_reader :position_map_begin, :position_map_end
18
+ attr_reader :common_elements, :mapped_elements
19
+ attr_reader :similarity
20
+ attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
21
+
22
+ def initialize(str1, str2, mappings = [])
23
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
24
+ raise ArgumentError, "nil mappings" if mappings.nil?
25
+
26
+ ## preprocessing
27
+ str1 = str1.dup
28
+ str2 = str2.dup
29
+ mappings = mappings.dup
30
+
31
+ ## find the first nomatch character
32
+ TextAlignment::NOMATCH_CHARS.each_char do |c|
33
+ if str2.index(c).nil?
34
+ @nomatch_char1 = c
35
+ break
36
+ end
37
+ end
38
+ raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
39
+
40
+ ## find the second nomatch character (different from the first one, and absent from str1)
41
+ TextAlignment::NOMATCH_CHARS.each_char do |c|
42
+ if c != @nomatch_char1 && str1.index(c).nil?
43
+ @nomatch_char2 = c
44
+ break
45
+ end
46
+ end
47
+ raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
48
+
49
+ # single character mappings
50
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
51
+ characters_from = character_mappings.collect{|m| m[0]}.join
52
+ characters_to = character_mappings.collect{|m| m[1]}.join
53
+ characters_to.gsub!(/-/, '\-')
54
+
55
+ str1.tr!(characters_from, characters_to)
56
+ str2.tr!(characters_from, characters_to)
57
+
58
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
59
+
60
+ # ASCII foldings
61
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
62
+ ascii_foldings.each do |f|
63
+ from = f[1]
64
+
65
+ if str2.index(f[0])
66
+ to = f[0] + (@nomatch_char1 * (f[1].length - 1))
67
+ str1.gsub!(from, to)
68
+ end
69
+
70
+ if str1.index(f[0])
71
+ to = f[0] + (@nomatch_char2 * (f[1].length - 1))
72
+ str2.gsub!(from, to)
73
+ end
74
+ end
75
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
76
+
77
+ _compute_mixed_alignment(str1, str2, mappings)
78
+ end
79
+
80
+ def transform_begin_position(begin_position)
81
+ @position_map_begin[begin_position]
82
+ end
83
+
84
+ def transform_end_position(end_position)
85
+ @position_map_end[end_position]
86
+ end
87
+
88
+ def transform_a_span(span)
89
+ {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
90
+ end
91
+
92
+ def transform_spans(spans)
93
+ spans.map{|span| transform_a_span(span)}
94
+ end
95
+
96
+ def transform_denotations!(denotations)
97
+ denotations.map!{|d| d.begin = @position_map_begin[d.begin]; d.end = @position_map_end[d.end]; d} unless denotations.nil?
98
+ end
99
+
100
+ def transform_hdenotations(hdenotations)
101
+ return nil if hdenotations.nil?
102
+ hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
103
+ end
104
+
105
+ private
106
+
107
+ def _compute_mixed_alignment(str1, str2, mappings = [])
108
+ lcsmin = TextAlignment::LCSMin.new(str1, str2)
109
+ lcs = lcsmin.lcs
110
+ @sdiff = lcsmin.sdiff
111
+
112
+ if @sdiff.nil?
113
+ @similarity = 0
114
+ return
115
+ end
116
+
117
+ cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
118
+ @similarity = cmp.similarity
119
+ @str1_match_initial = cmp.str1_match_initial
120
+ @str1_match_final = cmp.str1_match_final
121
+ @str2_match_initial = cmp.str2_match_initial
122
+ @str2_match_final = cmp.str2_match_final
123
+
124
+ posmap_begin, posmap_end = {}, {}
125
+ @common_elements, @mapped_elements = [], []
126
+
127
+ addition, deletion = [], []
128
+
129
+ @sdiff.each do |h|
130
+ case h.action
131
+ when '='
132
+ p1, p2 = h.old_position, h.new_position
133
+
134
+ @common_elements << [str1[p1], str2[p2]]
135
+ posmap_begin[p1], posmap_end[p1] = p2, p2
136
+
137
+ if !addition.empty? && deletion.empty?
138
+ posmap_end[p1] = p2 - addition.length unless p1 == 0
139
+ elsif addition.empty? && !deletion.empty?
140
+ deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
141
+ elsif !addition.empty? && !deletion.empty?
142
+ if addition.length > 1 || deletion.length > 1
143
+ galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
144
+ galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
145
+ galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
146
+ posmap_begin[p1], posmap_end[p1] = p2, p2
147
+ @common_elements += galign.common_elements
148
+ @mapped_elements += galign.mapped_elements
149
+ else
150
+ posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
151
+ deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
152
+ @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
153
+ end
154
+ end
155
+
156
+ addition.clear; deletion.clear
157
+
158
+ when '!'
159
+ deletion << h.old_position
160
+ addition << h.new_position
161
+ when '-'
162
+ deletion << h.old_position
163
+ when '+'
164
+ addition << h.new_position
165
+ end
166
+ end
167
+
168
+ p1, p2 = str1.length, str2.length
169
+ posmap_begin[p1], posmap_end[p1] = p2, p2
170
+
171
+ if !addition.empty? && deletion.empty?
172
+ posmap_end[p1] = p2 - addition.length unless p1 == 0
173
+ elsif addition.empty? && !deletion.empty?
174
+ deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
175
+ elsif !addition.empty? && !deletion.empty?
176
+ if addition.length > 1 && deletion.length > 1
177
+ galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
178
+ galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
179
+ galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
180
+ posmap_begin[p1], posmap_end[p1] = p2, p2
181
+ @common_elements += galign.common_elements
182
+ @mapped_elements += galign.mapped_elements
183
+ else
184
+ posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
185
+ deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
186
+ @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
187
+ end
188
+ end
189
+
190
+ @position_map_begin = posmap_begin.sort.to_h
191
+ @position_map_end = posmap_end.sort.to_h
192
+ end
193
+ end
@@ -1,181 +1,239 @@
1
1
  #!/usr/bin/env ruby
2
- require 'diff-lcs'
3
- require 'text_alignment/lcs_min'
4
- require 'text_alignment/find_divisions'
5
- require 'text_alignment/lcs_comparison'
6
- require 'text_alignment/lcs_alignment'
7
- require 'text_alignment/lcs_cdiff'
8
- require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
2
+ require 'text_alignment/anchor_finder'
3
+ require 'text_alignment/mixed_alignment'
10
4
 
11
5
  module TextAlignment; end unless defined? TextAlignment
12
6
 
13
- TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
14
- TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
7
+ TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
+ TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
+ TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
+
15
11
 
16
12
  class TextAlignment::TextAlignment
17
- attr_reader :sdiff
18
- attr_reader :position_map_begin, :position_map_end
19
- attr_reader :common_elements, :mapped_elements
20
- attr_reader :similarity
21
- attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
22
-
23
- def initialize(str1, str2, mappings = [])
24
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
25
- raise ArgumentError, "nil mappings" if mappings.nil?
26
-
27
- ## preprocessing
28
- str1 = str1.dup
29
- str2 = str2.dup
30
- mappings = mappings.dup
31
-
32
- ## find the first nomatch character
33
- TextAlignment::NOMATCH_CHARS.each_char do |c|
34
- if str2.index(c).nil?
35
- @nomatch_char1 = c
36
- break
37
- end
38
- end
39
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
40
-
41
- ## find the first nomatch character
42
- TextAlignment::NOMATCH_CHARS.each_char do |c|
43
- if c != @nomatch_char1 && str1.index(c).nil?
44
- @nomatch_char2 = c
45
- break
46
- end
47
- end
48
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
49
-
50
- # single character mappings
51
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
52
- characters_from = character_mappings.collect{|m| m[0]}.join
53
- characters_to = character_mappings.collect{|m| m[1]}.join
54
- characters_to.gsub!(/-/, '\-')
55
-
56
- str1.tr!(characters_from, characters_to)
57
- str2.tr!(characters_from, characters_to)
58
-
59
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
60
-
61
- # ASCII foldings
62
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
63
- ascii_foldings.each do |f|
64
- from = f[1]
65
-
66
- if str2.index(f[0])
67
- to = f[0] + (@nomatch_char1 * (f[1].length - 1))
68
- str1.gsub!(from, to)
69
- end
70
-
71
- if str1.index(f[0])
72
- to = f[0] + (@nomatch_char2 * (f[1].length - 1))
73
- str2.gsub!(from, to)
74
- end
75
- end
76
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
77
-
78
- _compute_mixed_alignment(str1, str2, mappings)
79
- end
80
-
81
- def transform_a_span(span)
82
- {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
83
- end
84
-
85
- def transform_spans(spans)
86
- spans.map{|span| transform_a_span(span)}
87
- end
88
-
89
- def transform_denotations!(denotations)
90
- denotations.map!{|d| d.begin = @position_map_begin[d.begin]; d.end = @position_map_end[d.end]; d} unless denotations.nil?
91
- end
92
-
93
- def transform_hdenotations(hdenotations)
94
- return nil if hdenotations.nil?
95
- hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
96
- end
97
-
98
- private
99
-
100
- def _compute_mixed_alignment(str1, str2, mappings = [])
101
- lcsmin = TextAlignment::LCSMin.new(str1, str2)
102
- lcs = lcsmin.lcs
103
- @sdiff = lcsmin.sdiff
104
-
105
- cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
106
- @similarity = cmp.similarity
107
- @str1_match_initial = cmp.str1_match_initial
108
- @str1_match_final = cmp.str1_match_final
109
- @str2_match_initial = cmp.str2_match_initial
110
- @str2_match_final = cmp.str2_match_final
111
-
112
- posmap_begin, posmap_end = {}, {}
113
- @common_elements, @mapped_elements = [], []
114
-
115
- addition, deletion = [], []
116
-
117
- @sdiff.each do |h|
118
- case h.action
119
- when '='
120
- p1, p2 = h.old_position, h.new_position
121
-
122
- @common_elements << [str1[p1], str2[p2]]
123
- posmap_begin[p1], posmap_end[p1] = p2, p2
124
-
125
- if !addition.empty? && deletion.empty?
126
- posmap_end[p1] = p2 - addition.length unless p1 == 0
127
- elsif addition.empty? && !deletion.empty?
128
- deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
129
- elsif !addition.empty? && !deletion.empty?
130
- if addition.length > 1 || deletion.length > 1
131
- galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
132
- galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
133
- galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
134
- posmap_begin[p1], posmap_end[p1] = p2, p2
135
- @common_elements += galign.common_elements
136
- @mapped_elements += galign.mapped_elements
137
- else
138
- posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
139
- deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
140
- @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
141
- end
142
- end
143
-
144
- addition.clear; deletion.clear
145
-
146
- when '!'
147
- deletion << h.old_position
148
- addition << h.new_position
149
- when '-'
150
- deletion << h.old_position
151
- when '+'
152
- addition << h.new_position
153
- end
154
- end
155
-
156
- p1, p2 = str1.length, str2.length
157
- posmap_begin[p1], posmap_end[p1] = p2, p2
158
-
159
- if !addition.empty? && deletion.empty?
160
- posmap_end[p1] = p2 - addition.length unless p1 == 0
161
- elsif addition.empty? && !deletion.empty?
162
- deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
163
- elsif !addition.empty? && !deletion.empty?
164
- if addition.length > 1 && deletion.length > 1
165
- galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
166
- galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
167
- galign.position_map_end.each {|k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
168
- posmap_begin[p1], posmap_end[p1] = p2, p2
169
- @common_elements += galign.common_elements
170
- @mapped_elements += galign.mapped_elements
171
- else
172
- posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
173
- deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
174
- @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
175
- end
176
- end
177
-
178
- @position_map_begin = posmap_begin.sort.to_h
179
- @position_map_end = posmap_end.sort.to_h
180
- end
13
+ attr_reader :block_alignments
14
+ attr_reader :similarity
15
+ attr_reader :lost_annotations
16
+
17
+ def initialize(str1, str2, mappings = nil)
18
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
19
+
20
+ mappings ||= TextAlignment::MAPPINGS
21
+
22
+ # try exact match
23
+ block_begin = str2.index(str1)
24
+ unless block_begin.nil?
25
+ @block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
26
+ return @block_alignments
27
+ end
28
+
29
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
30
+
31
+ # To collect matched blocks
32
+ mblocks = []
33
+ while anchor = anchor_finder.get_next_anchor
34
+ last = mblocks.last
35
+ if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
36
+ last[:source][:end] = anchor[:source][:end]
37
+ last[:target][:end] = anchor[:target][:end]
38
+ else
39
+ mblocks << anchor
40
+ end
41
+ end
42
+
43
+ # mblocks.each do |b|
44
+ # p [b[:source], b[:target]]
45
+ # puts "---"
46
+ # puts str1[b[:source][:begin] ... b[:source][:end]]
47
+ # puts "---"
48
+ # puts str2[b[:target][:begin] ... b[:target][:end]]
49
+ # puts "====="
50
+ # puts
51
+ # end
52
+ # puts "-=-=-=-=-"
53
+ # puts
54
+
55
+ ## To find block alignments
56
+ @block_alignments = []
57
+ return if mblocks.empty?
58
+
59
+ # Initial step
60
+ if mblocks[0][:source][:begin] > 0
61
+ e1 = mblocks[0][:source][:begin]
62
+ e2 = mblocks[0][:target][:begin]
63
+
64
+ if mblocks[0][:target][:begin] == 0
65
+ @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
66
+ else
67
+ _str1 = str1[0 ... e1]
68
+ _str2 = str2[0 ... e2]
69
+
70
+ unless _str1.strip.empty?
71
+ if _str2.strip.empty?
72
+ @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
73
+ else
74
+ len_min = [_str1.length, _str2.length].min
75
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
76
+ b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
77
+ b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
78
+
79
+ @block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
80
+
81
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
82
+ if alignment.similarity < 0.6
83
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
84
+ else
85
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
86
+ end
87
+ end
88
+ end
89
+ end
90
+ end
91
+ @block_alignments << mblocks[0]
92
+
93
+ (1 ... mblocks.length).each do |i|
94
+ b1 = mblocks[i - 1][:source][:end]
95
+ b2 = mblocks[i - 1][:target][:end]
96
+ e1 = mblocks[i][:source][:begin]
97
+ e2 = mblocks[i][:target][:begin]
98
+ _str1 = str1[b1 ... e1]
99
+ _str2 = str2[b2 ... e2]
100
+ unless _str1.strip.empty?
101
+ if _str2.strip.empty?
102
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
103
+ else
104
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
105
+ if alignment.similarity < 0.6
106
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
107
+ else
108
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
109
+ end
110
+ end
111
+ end
112
+ @block_alignments << mblocks[i]
113
+ end
114
+
115
+ # Final step
116
+ if mblocks[-1][:source][:end] < str1.length
117
+ b1 = mblocks[-1][:source][:end]
118
+ b2 = mblocks[-1][:target][:end]
119
+
120
+ if mblocks[-1][:target][:end] < str2.length
121
+
122
+ else
123
+ e1 = str1.length
124
+ e2 = str2.length
125
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
126
+ end
127
+ end
128
+
129
+ if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
130
+ b1 = mblocks[-1][:source][:end]
131
+ b2 = mblocks[-1][:target][:end]
132
+ _str1 = str1[b1 ... str1.length] # whole remaining tail; an exclusive range ending at -1 would drop the last character
133
+ _str2 = str2[b2 ... str2.length] # whole remaining tail of the target text
134
+
135
+ unless _str1.strip.empty?
136
+ if _str2.strip.empty?
137
+ @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
138
+ else
139
+ len_min = [_str1.length, _str2.length].min
140
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
141
+ e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
142
+ e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
143
+ _str1 = str1[b1 ... e1]
144
+ _str2 = str2[b2 ... e2]
145
+
146
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
147
+ if alignment.similarity < 0.6
148
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
149
+ else
150
+ @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
151
+ end
152
+
153
+ @block_alignments << {source:{begin:e1, end:str1.length}, target:{begin:e2, end:str2.length}, alignment: :empty} if e1 < str1.length # real end offsets so the position lookups (which compare against source end) can select this block
154
+ end
155
+ end
156
+ end
157
+
158
+ @block_alignments.each do |a|
159
+ a[:delta] = a[:target][:begin] - a[:source][:begin]
160
+ end
161
+ end
162
+
163
+ def transform_begin_position(begin_position)
164
+ i = @block_alignments.index{|b| b[:source][:end] > begin_position}
165
+ block_alignment = @block_alignments[i]
166
+
167
+ b = if block_alignment[:alignment].nil?
168
+ begin_position + block_alignment[:delta]
169
+ elsif block_alignment[:alignment] == :empty
170
+ if begin_position == block_alignment[:source][:begin]
171
+ block_alignment[:target][:begin]
172
+ else
173
+ raise "lost annotation"
174
+ end
175
+ else
176
+ block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
177
+ end
178
+ end
179
+
180
+ def transform_end_position(end_position)
181
+ i = @block_alignments.index{|b| b[:source][:end] >= end_position}
182
+ block_alignment = @block_alignments[i]
183
+
184
+ e = if block_alignment[:alignment].nil?
185
+ end_position + block_alignment[:delta]
186
+ elsif block_alignment[:alignment] == :empty
187
+ if end_position == block_alignment[:source][:end]
188
+ block_alignment[:target][:end]
189
+ else
190
+ raise "lost annotation"
191
+ end
192
+ else
193
+ block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
194
+ end
195
+ end
196
+
197
+ def transform_a_span(span)
198
+ {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
199
+ end
200
+
201
+ def transform_spans(spans)
202
+ spans.map{|span| transform_a_span(span)}
203
+ end
204
+
205
+ def transform_denotations!(denotations)
206
+ return nil if denotations.nil?
207
+ @lost_annotations = []
208
+
209
+ denotations.each do |d|
210
+ begin
211
+ d.begin = transform_begin_position(d.begin);
212
+ d.end = transform_end_position(d.end);
213
+ rescue
214
+ @lost_annotations << d
215
+ d.begin = nil
216
+ d.end = nil
217
+ end
218
+ end
219
+
220
+ @lost_annotations
221
+ end
222
+
223
+ def transform_hdenotations(hdenotations)
224
+ return nil if hdenotations.nil?
225
+ @lost_annotations = []
226
+
227
+ r = hdenotations.collect do |d|
228
+ new_d = begin
229
+ d.dup.merge({span:transform_a_span(d[:span])})
230
+ rescue
231
+ @lost_annotations << d
232
+ nil
233
+ end
234
+ end.compact
235
+
236
+ r
237
+ end
238
+
181
239
  end