text_alignment 0.2.9 → 0.3.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +225 -39
- data/lib/text_alignment/anchor_finder.rb +146 -0
- data/lib/text_alignment/approximate_fit.rb +50 -52
- data/lib/text_alignment/find_divisions.rb +198 -200
- data/lib/text_alignment/glcs_alignment.rb +297 -297
- data/lib/text_alignment/glcs_alignment_fast.rb +94 -94
- data/lib/text_alignment/glcs_required.rb +50 -50
- data/lib/text_alignment/lcs_alignment.rb +115 -115
- data/lib/text_alignment/lcs_cdiff.rb +46 -48
- data/lib/text_alignment/lcs_comparison.rb +53 -53
- data/lib/text_alignment/lcs_min.rb +144 -138
- data/lib/text_alignment/mappings.rb +68 -69
- data/lib/text_alignment/mixed_alignment.rb +193 -0
- data/lib/text_alignment/text_alignment.rb +232 -174
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +1 -1
- metadata +5 -13
- data/spec/spec_helper.rb +0 -1
- data/spec/text_alignment/glcs_alignment_spec.rb +0 -302
- data/spec/text_alignment/lcs_alignment_spec.rb +0 -98
- data/spec/text_alignment/lcs_comparision_spec.rb +0 -322
- data/spec/text_alignment/text_alignment_spec.rb +0 -302
@@ -1,75 +1,74 @@
|
|
1
|
-
module TextAlignment
|
1
|
+
module TextAlignment; end unless defined? TextAlignment

# Table of [character, ASCII rendering] pairs.
#
# Every first element is a single character. The second element is either a
# single character (a plain character substitution) or a longer string (an
# ASCII spelling of the character). Invisible space characters are written as
# \u escapes so that they survive editors and copy/paste unchanged.
TextAlignment::MAPPINGS = [
  ["©", "(c)"],        # U+00A9 (copyright sign)

  ["α", "alpha"],      # U+03B1 (greek small letter alpha)
  ["β", "beta"],       # U+03B2 (greek small letter beta)
  ["γ", "gamma"],      # U+03B3 (greek small letter gamma)
  ["δ", "delta"],      # U+03B4 (greek small letter delta)
  ["ε", "epsilon"],    # U+03B5 (greek small letter epsilon)
  ["ζ", "zeta"],       # U+03B6 (greek small letter zeta)
  ["η", "eta"],        # U+03B7 (greek small letter eta)
  ["θ", "theta"],      # U+03B8 (greek small letter theta)
  ["ι", "iota"],       # U+03B9 (greek small letter iota)
  ["κ", "kappa"],      # U+03BA (greek small letter kappa)
  ["λ", "lambda"],     # U+03BB (greek small letter lambda)
  ["λ", "lamda"],      # U+03BB (greek small letter lambda), alternative spelling
  ["μ", "mu"],         # U+03BC (greek small letter mu)
  ["ν", "nu"],         # U+03BD (greek small letter nu)
  ["ξ", "xi"],         # U+03BE (greek small letter xi)
  ["ο", "omicron"],    # U+03BF (greek small letter omicron)
  ["π", "pi"],         # U+03C0 (greek small letter pi)
  ["ρ", "rho"],        # U+03C1 (greek small letter rho)
  ["σ", "sigma"],      # U+03C3 (greek small letter sigma)
  ["τ", "tau"],        # U+03C4 (greek small letter tau)
  ["υ", "upsilon"],    # U+03C5 (greek small letter upsilon)
  ["φ", "phi"],        # U+03C6 (greek small letter phi)
  ["χ", "chi"],        # U+03C7 (greek small letter chi)
  ["ψ", "psi"],        # U+03C8 (greek small letter psi)
  ["ω", "omega"],      # U+03C9 (greek small letter omega)

  ["Α", "Alpha"],      # U+0391 (greek capital letter alpha)
  ["Β", "Beta"],       # U+0392 (greek capital letter beta)
  ["Γ", "Gamma"],      # U+0393 (greek capital letter gamma)
  ["Δ", "Delta"],      # U+0394 (greek capital letter delta)
  ["Ε", "Epsilon"],    # U+0395 (greek capital letter epsilon)
  ["Ζ", "Zeta"],       # U+0396 (greek capital letter zeta)
  ["Η", "Eta"],        # U+0397 (greek capital letter eta)
  ["Θ", "Theta"],      # U+0398 (greek capital letter theta)
  ["Ι", "Iota"],       # U+0399 (greek capital letter iota)
  ["Κ", "Kappa"],      # U+039A (greek capital letter kappa)
  ["Λ", "Lambda"],     # U+039B (greek capital letter lambda)
  ["Λ", "Lamda"],      # U+039B (greek capital letter lambda), alternative spelling
  ["Μ", "Mu"],         # U+039C (greek capital letter mu)
  ["Ν", "Nu"],         # U+039D (greek capital letter nu)
  ["Ξ", "Xi"],         # U+039E (greek capital letter xi)
  ["Ο", "Omicron"],    # U+039F (greek capital letter omicron)
  ["Π", "Pi"],         # U+03A0 (greek capital letter pi)
  ["Ρ", "Rho"],        # U+03A1 (greek capital letter rho)
  ["Σ", "Sigma"],      # U+03A3 (greek capital letter sigma)
  ["Τ", "Tau"],        # U+03A4 (greek capital letter tau)
  ["Υ", "Upsilon"],    # U+03A5 (greek capital letter upsilon)
  ["Φ", "Phi"],        # U+03A6 (greek capital letter phi)
  ["Χ", "Chi"],        # U+03A7 (greek capital letter chi)
  ["Ψ", "Psi"],        # U+03A8 (greek capital letter psi)
  ["Ω", "Omega"],      # U+03A9 (greek capital letter omega)

  ["ϕ", "phi"],        # U+03D5 (greek phi symbol)

  ["×", "x"],          # U+00D7 (multiplication sign)
  ["•", "*"],          # U+2022 (bullet)
  ["\u2009", " "],     # U+2009 (thin space)
  ["\u200A", " "],     # U+200A (hair space)
  ["\u00A0", " "],     # U+00A0 (no-break space)
  ["\u3000", " "],     # U+3000 (ideographic space)
  ["−", "-"],          # U+2212 (minus sign)
  ["–", "-"],          # U+2013 (en dash)
  ["′", "'"],          # U+2032 (prime)
  ["‘", "'"],          # U+2018 (left single quotation mark)
  ["’", "'"],          # U+2019 (right single quotation mark)
  ["“", '"'],          # U+201C (left double quotation mark)
  ["”", '"'],          # U+201D (right double quotation mark)
  ['"', "''"]          # U+0022 (quotation mark), rendered as two apostrophes
]
|
75
|
-
end
|
@@ -0,0 +1,193 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'diff-lcs'
|
3
|
+
require 'text_alignment/lcs_min'
|
4
|
+
require 'text_alignment/find_divisions'
|
5
|
+
require 'text_alignment/lcs_comparison'
|
6
|
+
require 'text_alignment/lcs_alignment'
|
7
|
+
require 'text_alignment/lcs_cdiff'
|
8
|
+
require 'text_alignment/glcs_alignment'
|
9
|
+
require 'text_alignment/mappings'
|
10
|
+
|
11
|
+
module TextAlignment; end unless defined? TextAlignment

# Candidate placeholder characters. For each alignment two of them are picked:
# one that does not occur in str2 and a different one that does not occur in str1.
TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS

# Aligns two strings by combining an LCS-based diff (TextAlignment::LCSMin)
# with TextAlignment::GLCSAlignment for the stretches the diff could not match.
#
# After construction, #position_map_begin and #position_map_end map character
# positions of str1 to positions of str2 (a value may be nil), and #similarity
# holds the score reported by TextAlignment::LCSComparison. When LCSMin yields
# no diff, #similarity is 0 and the other readers stay nil.
class TextAlignment::MixedAlignment
  attr_reader :sdiff
  attr_reader :position_map_begin, :position_map_end
  attr_reader :common_elements, :mapped_elements
  attr_reader :similarity
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final

  # @param str1 [String] source text (not modified; a copy is normalized)
  # @param str2 [String] target text (not modified; a copy is normalized)
  # @param mappings [Array<Array(String, String)>] [character, rendering] pairs
  # @raise [ArgumentError] if a string or the mappings are nil
  # @raise [RuntimeError] if no unused placeholder character can be found
  def initialize(str1, str2, mappings = [])
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?
    raise ArgumentError, "nil mappings" if mappings.nil?

    ## preprocessing (work on copies so that the caller's objects stay untouched)
    str1 = str1.dup
    str2 = str2.dup
    mappings = mappings.dup

    ## find the first nomatch character: one that does not occur in str2
    @nomatch_char1 = TextAlignment::NOMATCH_CHARS.each_char.find { |c| str2.index(c).nil? }
    raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?

    ## find the second nomatch character: a different one that does not occur in str1
    @nomatch_char2 = TextAlignment::NOMATCH_CHARS.each_char.find { |c| c != @nomatch_char1 && str1.index(c).nil? }
    raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?

    # single character mappings: applied to both strings in one String#tr pass
    character_mappings, mappings = mappings.partition { |m| m[0].length == 1 && m[1].length == 1 }
    characters_from = _escape_for_tr(character_mappings.map { |m| m[0] }.join)
    characters_to = _escape_for_tr(character_mappings.map { |m| m[1] }.join)

    str1.tr!(characters_from, characters_to)
    str2.tr!(characters_from, characters_to)

    # ASCII foldings: when one string contains the character itself, the spelled-out
    # form in the other string is replaced by the character padded with placeholder
    # characters, so that the string keeps its length and positions stay valid.
    ascii_foldings, mappings = mappings.partition { |m| m[0].length == 1 && m[1].length > 1 }
    ascii_foldings.each do |char, spelling|
      padding = spelling.length - 1

      str1.gsub!(spelling, char + (@nomatch_char1 * padding)) if str2.index(char)
      # evaluated after str1 was rewritten above, so a folding just applied to str1 counts here
      str2.gsub!(spelling, char + (@nomatch_char2 * padding)) if str1.index(char)
    end

    _compute_mixed_alignment(str1, str2, mappings)
  end

  # @return [Integer, nil] the entry of #position_map_begin for the given str1 position
  def transform_begin_position(begin_position)
    @position_map_begin[begin_position]
  end

  # @return [Integer, nil] the entry of #position_map_end for the given str1 position
  def transform_end_position(end_position)
    @position_map_end[end_position]
  end

  # @param span [Hash] a hash with :begin and :end positions in str1
  # @return [Hash] a new hash with the mapped :begin and :end
  def transform_a_span(span)
    {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
  end

  def transform_spans(spans)
    spans.map { |span| transform_a_span(span) }
  end

  # Rewrites #begin and #end of each denotation in place; does nothing for nil.
  def transform_denotations!(denotations)
    return if denotations.nil?

    denotations.map! do |d|
      d.begin = @position_map_begin[d.begin]
      d.end = @position_map_end[d.end]
      d
    end
  end

  # @param hdenotations [Array<Hash>, nil] hashes carrying a :span entry
  # @return [Array<Hash>, nil] copies with the :span transformed; nil for nil input
  def transform_hdenotations(hdenotations)
    return nil if hdenotations.nil?

    hdenotations.map { |d| d.merge(span: transform_a_span(d[:span])) }
  end

  private

  # Escapes the characters that String#tr treats as set syntax (backslash, caret
  # and hyphen) so that every character of a mapping is taken literally.
  def _escape_for_tr(characters)
    characters.gsub(/[\\\^\-]/) { |c| "\\" + c }
  end

  # Builds the position maps from the diff of the two (already normalized) strings.
  def _compute_mixed_alignment(str1, str2, mappings = [])
    lcsmin = TextAlignment::LCSMin.new(str1, str2)
    lcs = lcsmin.lcs
    @sdiff = lcsmin.sdiff

    if @sdiff.nil?
      @similarity = 0
      return
    end

    cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
    @similarity = cmp.similarity
    @str1_match_initial = cmp.str1_match_initial
    @str1_match_final = cmp.str1_match_final
    @str2_match_initial = cmp.str2_match_initial
    @str2_match_final = cmp.str2_match_final

    posmap_begin, posmap_end = {}, {}
    @common_elements, @mapped_elements = [], []

    # positions of str2 / str1 collected since the last matched character
    addition, deletion = [], []

    # Maps the matched pair (p1, p2) and resolves the unmatched run collected before it.
    # use_glcs decides whether a run unmatched on both sides is re-aligned with GLCSAlignment.
    resolve_gap = lambda do |p1, p2, use_glcs|
      posmap_begin[p1], posmap_end[p1] = p2, p2

      if deletion.empty?
        # only str2 has extra characters: a span ending at p1 ends before them
        posmap_end[p1] = p2 - addition.length unless addition.empty? || p1 == 0
      elsif addition.empty?
        # only str1 has extra characters: they all collapse onto p2
        deletion.each { |p| posmap_begin[p], posmap_end[p] = p2, p2 }
      elsif use_glcs
        d0, a0 = deletion.first, addition.first
        galign = TextAlignment::GLCSAlignment.new(str1[d0 .. deletion.last], str2[a0 .. addition.last], mappings)
        galign.position_map_begin.each { |k, v| posmap_begin[k + d0] = v && (v + a0) }
        galign.position_map_end.each { |k, v| posmap_end[k + d0] = v && (v + a0) }
        # restore the entry of the matched pair in case the sub-alignment wrote to it
        posmap_begin[p1], posmap_end[p1] = p2, p2
        @common_elements += galign.common_elements
        @mapped_elements += galign.mapped_elements
      else
        posmap_begin[deletion.first], posmap_end[deletion.first] = addition.first, addition.first
        deletion[1..-1].each { |p| posmap_begin[p], posmap_end[p] = nil, nil }
        @mapped_elements << [str1[deletion.first, deletion.length], str2[addition.first, addition.length]]
      end

      addition.clear
      deletion.clear
    end

    @sdiff.each do |h|
      case h.action
      when '='
        p1, p2 = h.old_position, h.new_position
        @common_elements << [str1[p1], str2[p2]]
        resolve_gap.call(p1, p2, addition.length > 1 || deletion.length > 1)
      when '!'
        deletion << h.old_position
        addition << h.new_position
      when '-'
        deletion << h.old_position
      when '+'
        addition << h.new_position
      end
    end

    # The end of both strings acts as a final matched pair.
    # NOTE(review): here GLCSAlignment is used only when BOTH runs are longer than one
    # character, whereas inside the loop EITHER one suffices; kept as found - confirm
    # which of the two conditions is intended.
    resolve_gap.call(str1.length, str2.length, addition.length > 1 && deletion.length > 1)

    @position_map_begin = posmap_begin.sort.to_h
    @position_map_end = posmap_end.sort.to_h
  end
end
|
@@ -1,181 +1,239 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
require '
|
3
|
-
require 'text_alignment/
|
4
|
-
require 'text_alignment/find_divisions'
|
5
|
-
require 'text_alignment/lcs_comparison'
|
6
|
-
require 'text_alignment/lcs_alignment'
|
7
|
-
require 'text_alignment/lcs_cdiff'
|
8
|
-
require 'text_alignment/glcs_alignment'
|
9
|
-
require 'text_alignment/mappings'
|
2
|
+
require 'text_alignment/anchor_finder'
|
3
|
+
require 'text_alignment/mixed_alignment'
|
10
4
|
|
11
5
|
module TextAlignment; end unless defined? TextAlignment

# Not referenced in this class; presumably read by the anchor finder - TODO confirm.
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
# The stretch aligned before the first / after the last anchor is limited to
# (shorter side * (1 + BUFFER_RATE)) + BUFFER_MIN characters.
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN

# Aligns str1 (source) to str2 (target) block by block.
#
# Anchors delivered by TextAlignment::AnchorFinder become blocks that are mapped
# by a constant offset; the text between anchors is aligned with
# TextAlignment::MixedAlignment. Each entry of #block_alignments is a hash with
#   :source, :target - {begin:, end:} ranges (end exclusive) in str1 / str2
#   :delta           - target begin minus source begin
#   :alignment       - absent for anchor blocks, :empty when no usable alignment
#                      exists, otherwise the MixedAlignment of the block
#
# #similarity is declared but never assigned by this class.
class TextAlignment::TextAlignment
  attr_reader :block_alignments
  attr_reader :similarity
  attr_reader :lost_annotations

  # @param str1 [String] source text
  # @param str2 [String] target text
  # @param mappings [Array, nil] character mappings; TextAlignment::MAPPINGS when nil
  # @raise [ArgumentError] if either string is nil
  def initialize(str1, str2, mappings = nil)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?

    mappings ||= TextAlignment::MAPPINGS

    # try exact match: str1 occurs verbatim in str2
    block_begin = str2.index(str1)
    unless block_begin.nil?
      @block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
      return @block_alignments
    end

    anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)

    # To collect matched blocks, merging an anchor into the previous one when it
    # starts one position after the previous end on both sides
    mblocks = []
    while anchor = anchor_finder.get_next_anchor
      last = mblocks.last
      if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
        last[:source][:end] = anchor[:source][:end]
        last[:target][:end] = anchor[:target][:end]
      else
        mblocks << anchor
      end
    end

    ## To find block alignments
    @block_alignments = []
    return if mblocks.empty?

    # Initial step: the text before the first anchor
    e1 = mblocks[0][:source][:begin]
    e2 = mblocks[0][:target][:begin]
    if e1 > 0
      if e2 == 0
        @block_alignments << _empty_block(0, e1, 0, 0)
      else
        _str1 = str1[0 ... e1]
        _str2 = str2[0 ... e2]

        unless _str1.strip.empty?
          if _str2.strip.empty?
            @block_alignments << _empty_block(0, e1, 0, e2)
          else
            # align only a buffer-limited stretch right before the anchor
            len_buffer = _buffer_length(_str1, _str2)
            b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
            b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer

            @block_alignments << _empty_block(0, b1, 0, b2) if b1 > 0
            # the alignment and the recorded ranges must cover the same stretch,
            # otherwise positions relative to the block begin are shifted
            @block_alignments << _aligned_block(str1, str2, b1, e1, b2, e2, mappings)
          end
        end
      end
    end
    @block_alignments << mblocks[0]

    # Middle steps: the text between two consecutive anchors
    (1 ... mblocks.length).each do |i|
      b1 = mblocks[i - 1][:source][:end]
      b2 = mblocks[i - 1][:target][:end]
      e1 = mblocks[i][:source][:begin]
      e2 = mblocks[i][:target][:begin]

      unless str1[b1 ... e1].strip.empty?
        @block_alignments << if str2[b2 ... e2].strip.empty?
          _empty_block(b1, e1, b2, e2)
        else
          _aligned_block(str1, str2, b1, e1, b2, e2, mappings)
        end
      end
      @block_alignments << mblocks[i]
    end

    # Final step: the text after the last anchor
    b1 = mblocks[-1][:source][:end]
    b2 = mblocks[-1][:target][:end]
    if b1 < str1.length
      if b2 < str2.length
        # inclusive range up to the last character
        _str1 = str1[b1 .. -1]
        _str2 = str2[b2 .. -1]

        unless _str1.strip.empty?
          if _str2.strip.empty?
            @block_alignments << _empty_block(b1, str1.length, b2, str2.length)
          else
            # align only a buffer-limited stretch right after the anchor
            len_buffer = _buffer_length(_str1, _str2)
            e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
            e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer

            @block_alignments << _aligned_block(str1, str2, b1, e1, b2, e2, mappings)
            # whatever remains of the source is recorded with real end positions
            @block_alignments << _empty_block(e1, str1.length, e2, str2.length) if e1 < str1.length
          end
        end
      else
        # the target is exhausted: the source tail has nothing to align to
        @block_alignments << _empty_block(b1, str1.length, b2, str2.length)
      end
    end

    @block_alignments.each do |a|
      a[:delta] = a[:target][:begin] - a[:source][:begin]
    end
  end

  # Maps a begin position of str1 to str2, using the first block whose source
  # range ends after the position.
  #
  # @raise [RuntimeError] "lost annotation" when the position cannot be mapped
  def transform_begin_position(begin_position)
    block_alignment = @block_alignments.find { |b| b[:source][:end] > begin_position }
    raise "lost annotation" if block_alignment.nil?

    alignment = block_alignment[:alignment]
    if alignment.nil?
      begin_position + block_alignment[:delta]
    elsif alignment == :empty
      # inside an unaligned block only its own begin has a counterpart
      raise "lost annotation" unless begin_position == block_alignment[:source][:begin]
      block_alignment[:target][:begin]
    else
      position = alignment.transform_begin_position(begin_position - block_alignment[:source][:begin])
      raise "lost annotation" if position.nil?
      position + block_alignment[:target][:begin]
    end
  end

  # Maps an end position of str1 to str2, using the first block whose source
  # range ends at or after the position.
  #
  # @raise [RuntimeError] "lost annotation" when the position cannot be mapped
  def transform_end_position(end_position)
    block_alignment = @block_alignments.find { |b| b[:source][:end] >= end_position }
    raise "lost annotation" if block_alignment.nil?

    alignment = block_alignment[:alignment]
    if alignment.nil?
      end_position + block_alignment[:delta]
    elsif alignment == :empty
      # inside an unaligned block only its own end has a counterpart
      raise "lost annotation" unless end_position == block_alignment[:source][:end]
      block_alignment[:target][:end]
    else
      position = alignment.transform_end_position(end_position - block_alignment[:source][:begin])
      raise "lost annotation" if position.nil?
      position + block_alignment[:target][:begin]
    end
  end

  # @param span [Hash] a hash with :begin and :end positions in str1
  # @return [Hash] a new hash with the mapped :begin and :end
  def transform_a_span(span)
    {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
  end

  def transform_spans(spans)
    spans.map { |span| transform_a_span(span) }
  end

  # Rewrites #begin and #end of each denotation in place. A denotation that cannot
  # be mapped gets nil for both and is collected in #lost_annotations.
  #
  # @return [Array, nil] the lost denotations; nil for nil input
  def transform_denotations!(denotations)
    return nil if denotations.nil?
    @lost_annotations = []

    denotations.each do |d|
      begin
        d.begin = transform_begin_position(d.begin)
        d.end = transform_end_position(d.end)
      rescue StandardError
        @lost_annotations << d
        d.begin = nil
        d.end = nil
      end
    end

    @lost_annotations
  end

  # Returns copies of the given hashes with their :span mapped. Hashes whose span
  # cannot be mapped are left out and collected in #lost_annotations.
  #
  # @return [Array<Hash>, nil] nil for nil input
  def transform_hdenotations(hdenotations)
    return nil if hdenotations.nil?
    @lost_annotations = []

    transformed = hdenotations.map do |d|
      begin
        d.merge(span: transform_a_span(d[:span]))
      rescue StandardError
        @lost_annotations << d
        nil
      end
    end

    transformed.compact
  end

  private

  # Length of the stretch to align next to an outermost anchor.
  def _buffer_length(_str1, _str2)
    len_min = [_str1.length, _str2.length].min
    (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
  end

  # A block for which no alignment is available.
  def _empty_block(b1, e1, b2, e2)
    {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
  end

  # Aligns str1[b1...e1] to str2[b2...e2] (case-insensitively) and returns the
  # block; the alignment is discarded as :empty when its similarity is below 0.6.
  def _aligned_block(str1, str2, b1, e1, b2, e2, mappings)
    alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase, mappings)
    alignment = :empty if alignment.similarity < 0.6
    {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
  end
end
|