text_alignment 0.6.4 → 0.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +33 -153
- data/lib/text_alignment/constants.rb +1 -1
- data/lib/text_alignment/mappings.rb +168 -70
- data/lib/text_alignment/mixed_alignment.rb +3 -71
- data/lib/text_alignment/text_alignment.rb +223 -119
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c49793b84e9ca5606a8fe9151530f6732337c2b92cfd1af1549a56ea3c2f39e
|
4
|
+
data.tar.gz: 4a0b8328d4c6de43af50bd8c278f83facef311c74516dc3a9a0c9dd5f91fbfc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad70de67b4a7b38290a59d89bed7cd9fa343ce3db62bb67b779e51b499929b9d7c4697871cf25017fa59a34c0525741d5da679559fc2f33d902fcda692f9f1ac
|
7
|
+
data.tar.gz: 6242b35cbb8f53effc477b508d428d9060cba2e93895496134963755cff956f3931411a87a7bc0d33b7b91459f1a27eb67c422897693320e1b2ceb06eff6e22b
|
data/bin/align_annotations
CHANGED
@@ -26,33 +26,43 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def
|
29
|
+
def align_denotations(denotations, source_text, target_text, debug = false)
|
30
|
+
alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
|
31
|
+
new_denotations = alignment.transform_hdenotations(denotations)
|
32
|
+
|
33
|
+
if debug
|
34
|
+
warn "[block alignment]"
|
35
|
+
warn alignment.alignment_show
|
36
|
+
warn "-----"
|
37
|
+
end
|
38
|
+
|
39
|
+
lost_annotations = alignment.lost_annotations
|
40
|
+
unless lost_annotations.empty?
|
41
|
+
warn "\n[lost annotations] #{lost_annotations.length}"
|
42
|
+
lost_annotations.each do |a|
|
43
|
+
warn "#{a}"
|
44
|
+
end
|
45
|
+
warn "====="
|
46
|
+
end
|
47
|
+
warn
|
48
|
+
|
49
|
+
# return target annotations
|
50
|
+
new_denotations
|
51
|
+
end
|
52
|
+
|
53
|
+
def align_mannotations(source_annotations, target_text, debug = false)
|
54
|
+
target_annotations = {text:target_text}
|
55
|
+
|
30
56
|
idnum_denotations = 0
|
31
57
|
idnum_relations = 0
|
32
58
|
idnum_attributes = 0
|
33
59
|
idnum_modifications = 0
|
34
60
|
|
35
|
-
source_annotations.
|
36
|
-
alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
|
37
|
-
|
38
|
-
puts alignment.alignment_show
|
39
|
-
puts "-----"
|
40
|
-
puts
|
41
|
-
|
42
|
-
# alignment.block_alignments.each do |a|
|
43
|
-
# p {source:a[:source], target:a[:target]}
|
44
|
-
# puts "--"
|
45
|
-
# p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
|
46
|
-
# puts "--"
|
47
|
-
# puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
48
|
-
# puts "--"
|
49
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
50
|
-
# puts "======"
|
51
|
-
# end
|
52
|
-
|
61
|
+
source_annotations.each_with_index do |annotations, i|
|
53
62
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
54
63
|
ididx = {}
|
55
|
-
|
64
|
+
warn "[#{i}]-=-=-=-=-"
|
65
|
+
denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
|
56
66
|
denotations.each do |d|
|
57
67
|
reid = 'T' + (idnum_denotations += 1).to_s
|
58
68
|
ididx[d[:id]] = reid
|
@@ -101,141 +111,11 @@ end
|
|
101
111
|
source_annotations = read_annotations(ARGV[0])
|
102
112
|
target_text = read_text(ARGV[1])
|
103
113
|
|
104
|
-
lost_annotations = []
|
105
114
|
target_annotations = if source_annotations.class == Array
|
106
|
-
|
115
|
+
align_mannotations(source_annotations, target_text, false)
|
107
116
|
else
|
108
|
-
|
109
|
-
|
110
|
-
# pp alignment
|
111
|
-
|
112
|
-
# verification
|
113
|
-
# source_text = source_annotations[:text]
|
114
|
-
# puts "=====BEGIN"
|
115
|
-
# (0 ... source_text.rstrip.length).each do |p|
|
116
|
-
# t = alignment.transform_begin_position(p)
|
117
|
-
# if t.nil?
|
118
|
-
# print source_text[p]
|
119
|
-
# else
|
120
|
-
# print '.'
|
121
|
-
# end
|
122
|
-
# end
|
123
|
-
# puts
|
124
|
-
# puts "=====END"
|
125
|
-
|
126
|
-
# puts "=====BEGIN"
|
127
|
-
# (0 .. source_text.rstrip.length).each do |p|
|
128
|
-
# t = alignment.transform_end_position(p)
|
129
|
-
# if t.nil?
|
130
|
-
# print source_text[p]
|
131
|
-
# else
|
132
|
-
# print '.'
|
133
|
-
# end
|
134
|
-
# end
|
135
|
-
# puts
|
136
|
-
# puts "=====END"
|
137
|
-
|
138
|
-
source_text = source_annotations[:text]
|
139
|
-
|
140
|
-
puts "[block alignment]"
|
141
|
-
puts alignment.alignment_show
|
142
|
-
puts "====="
|
143
|
-
# exit
|
144
|
-
|
145
|
-
# verification of source denotations
|
146
|
-
puts "[Invalid source denotations]"
|
147
|
-
source_annotations[:denotations] do |d|
|
148
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
149
|
-
end
|
150
|
-
puts "====="
|
151
|
-
puts
|
152
|
-
|
153
|
-
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
154
|
-
puts "[Invalid transformation]"
|
155
|
-
denotations.each do |d|
|
156
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
157
|
-
end
|
158
|
-
puts "====="
|
159
|
-
puts
|
160
|
-
|
161
|
-
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
162
|
-
|
117
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
|
163
118
|
source_annotations.merge({text:target_text, denotations:denotations})
|
164
119
|
end
|
165
120
|
|
166
|
-
|
167
|
-
num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
|
168
|
-
source_annotations.each do |annotations|
|
169
|
-
num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
|
170
|
-
num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
|
171
|
-
num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
|
172
|
-
num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
|
173
|
-
end
|
174
|
-
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
175
|
-
else
|
176
|
-
num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
|
177
|
-
num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
|
178
|
-
num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
|
179
|
-
num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
|
180
|
-
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
181
|
-
end
|
182
|
-
|
183
|
-
warn "[source]"
|
184
|
-
warn "denotations:\t#{num_denotations_source}"
|
185
|
-
# warn "relations:\t#{num_relations_source}"
|
186
|
-
# warn "attributes:\t#{num_attributes_source}"
|
187
|
-
# warn "modifications:\t#{num_modifications_source}"
|
188
|
-
|
189
|
-
warn "\n[target]"
|
190
|
-
warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
|
191
|
-
# warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
|
192
|
-
# warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
|
193
|
-
# warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
|
194
|
-
|
195
|
-
if lost_annotations
|
196
|
-
warn "\n[lost annotations]"
|
197
|
-
warn "#{lost_annotations.length}"
|
198
|
-
end
|
199
|
-
|
200
|
-
#puts target_annotations.to_json
|
201
|
-
|
202
|
-
# denotations = anns1[:denotations]
|
203
|
-
|
204
|
-
# puts "[Alignment1]====="
|
205
|
-
# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
|
206
|
-
|
207
|
-
# align.alignment.each do |a|
|
208
|
-
# p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
|
209
|
-
# end
|
210
|
-
|
211
|
-
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
212
|
-
# puts
|
213
|
-
# puts "[Similarity]\n#{align.similarity}"
|
214
|
-
# puts
|
215
|
-
# puts '[Denotations original]'
|
216
|
-
# pp denotations
|
217
|
-
# puts
|
218
|
-
# puts '[Denotations transformed]'
|
219
|
-
# new_denotations = align.transform_hdenotations(denotations)
|
220
|
-
# pp new_denotations
|
221
|
-
# puts
|
222
|
-
# puts "[Alignment2 (downcased)]====="
|
223
|
-
# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
|
224
|
-
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
225
|
-
# puts
|
226
|
-
# puts "[Similarity]\n#{align.similarity}"
|
227
|
-
# puts
|
228
|
-
# puts '[Denotations original]'
|
229
|
-
# pp denotations
|
230
|
-
# puts
|
231
|
-
# puts '[Denotations transformed]'
|
232
|
-
# new_denotations = align.transform_hdenotations(denotations)
|
233
|
-
# pp new_denotations
|
234
|
-
# puts
|
235
|
-
# puts '[Annotations transformed]'
|
236
|
-
# anns2[:denotations] = new_denotations
|
237
|
-
# puts anns2.to_json
|
238
|
-
|
239
|
-
# p align.common_elements
|
240
|
-
# puts "---------------"
|
241
|
-
# p align.mapped_elements
|
121
|
+
# puts target_annotations.to_json
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
3
|
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
-
TextAlignment::SIZE_WINDOW =
|
4
|
+
TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
|
5
5
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
6
|
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
7
|
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -1,74 +1,172 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
3
|
TextAlignment::MAPPINGS = [
|
4
|
-
["©", "(c)"],
|
5
|
-
|
6
|
-
["α", "alpha"],
|
7
|
-
["β", "beta"],
|
8
|
-
["γ", "gamma"],
|
9
|
-
["δ", "delta"],
|
10
|
-
["ε", "epsilon"],
|
11
|
-
["ζ", "zeta"],
|
12
|
-
["η", "eta"],
|
13
|
-
["θ", "theta"],
|
14
|
-
["ι", "iota"],
|
15
|
-
["κ", "kappa"],
|
16
|
-
["λ", "lambda"],
|
17
|
-
["λ", "lamda"],
|
18
|
-
["μ", "mu"],
|
19
|
-
["ν", "nu"],
|
20
|
-
["ξ", "xi"],
|
21
|
-
["ο", "omicron"],
|
22
|
-
["π", "pi"],
|
23
|
-
["ρ", "rho"],
|
24
|
-
["σ", "sigma"],
|
25
|
-
["τ", "tau"],
|
26
|
-
["υ", "upsilon"],
|
27
|
-
["φ", "phi"],
|
28
|
-
["χ", "chi"],
|
29
|
-
["ψ", "psi"],
|
30
|
-
["ω", "omega"],
|
31
|
-
|
32
|
-
["Α", "Alpha"],
|
33
|
-
["Β", "Beta"],
|
34
|
-
["Γ", "Gamma"],
|
35
|
-
["Δ", "Delta"],
|
36
|
-
["Ε", "Epsilon"],
|
37
|
-
["Ζ", "Zeta"],
|
38
|
-
["Η", "Eta"],
|
39
|
-
["Θ", "Theta"],
|
40
|
-
["Ι", "Iota"],
|
41
|
-
["Κ", "Kappa"],
|
42
|
-
["Λ", "Lambda"],
|
43
|
-
["Λ", "Lamda"],
|
44
|
-
["Μ", "Mu"],
|
45
|
-
["Ν", "Nu"],
|
46
|
-
["Ξ", "Xi"],
|
47
|
-
["Ο", "Omicron"],
|
48
|
-
["Π", "Pi"],
|
49
|
-
["Ρ", "Rho"],
|
50
|
-
["Σ", "Sigma"],
|
51
|
-
["Τ", "Tau"],
|
52
|
-
["Υ", "Upsilon"],
|
53
|
-
["Φ", "Phi"],
|
54
|
-
["Χ", "Chi"],
|
55
|
-
["Ψ", "Psi"],
|
56
|
-
["Ω", "Omega"],
|
57
|
-
|
58
|
-
["ϕ", "phi"],
|
59
|
-
|
60
|
-
["×", "x"],
|
61
|
-
["•", "*"],
|
62
|
-
[" ", " "],
|
63
|
-
[" ", " "],
|
64
|
-
[" ", " "],
|
65
|
-
[" ", " "],
|
66
|
-
["
|
67
|
-
["
|
68
|
-
["
|
69
|
-
["
|
70
|
-
["
|
71
|
-
["
|
72
|
-
["
|
4
|
+
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
|
+
|
6
|
+
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
7
|
+
["β", "beta"], #U+03B2 (greek small letter beta)
|
8
|
+
["γ", "gamma"], #U+03B3 (greek small letter gamma)
|
9
|
+
["δ", "delta"], #U+03B4 (greek small letter delta)
|
10
|
+
["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
|
11
|
+
["ζ", "zeta"], #U+03B6 (greek small letter zeta)
|
12
|
+
["η", "eta"], #U+03B7 (greek small letter eta)
|
13
|
+
["θ", "theta"], #U+03B8 (greek small letter theta)
|
14
|
+
["ι", "iota"], #U+03B9 (greek small letter iota)
|
15
|
+
["κ", "kappa"], #U+03BA (greek small letter kappa)
|
16
|
+
["λ", "lambda"], #U+03BB (greek small letter lambda)
|
17
|
+
["λ", "lamda"], #U+03BB (greek small letter lambda)
|
18
|
+
["μ", "mu"], #U+03BC (greek small letter mu)
|
19
|
+
["ν", "nu"], #U+03BD (greek small letter nu)
|
20
|
+
["ξ", "xi"], #U+03BE (greek small letter xi)
|
21
|
+
["ο", "omicron"], #U+03BF (greek small letter omicron)
|
22
|
+
["π", "pi"], #U+03C0 (greek small letter pi)
|
23
|
+
["ρ", "rho"], #U+03C1 (greek small letter rho)
|
24
|
+
["σ", "sigma"], #U+03C3 (greek small letter sigma)
|
25
|
+
["τ", "tau"], #U+03C4 (greek small letter tau)
|
26
|
+
["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
|
27
|
+
["φ", "phi"], #U+03C6 (greek small letter phi)
|
28
|
+
["χ", "chi"], #U+03C7 (greek small letter chi)
|
29
|
+
["ψ", "psi"], #U+03C8 (greek small letter psi)
|
30
|
+
["ω", "omega"], #U+03C9 (greek small letter omega)
|
31
|
+
|
32
|
+
["Α", "Alpha"], #U+0391 (greek capital letter alpha)
|
33
|
+
["Β", "Beta"], #U+0392 (greek capital letter beta)
|
34
|
+
["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
|
35
|
+
["Δ", "Delta"], #U+0394 (greek capital letter delta)
|
36
|
+
["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
|
37
|
+
["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
|
38
|
+
["Η", "Eta"], #U+0397 (greek capital letter eta)
|
39
|
+
["Θ", "Theta"], #U+0398 (greek capital letter theta)
|
40
|
+
["Ι", "Iota"], #U+0399 (greek capital letter iota)
|
41
|
+
["Κ", "Kappa"], #U+039A (greek capital letter kappa)
|
42
|
+
["Λ", "Lambda"], #U+039B (greek capital letter lambda)
|
43
|
+
["Λ", "Lamda"], #U+039B (greek capital letter lambda)
|
44
|
+
["Μ", "Mu"], #U+039C (greek capital letter mu)
|
45
|
+
["Ν", "Nu"], #U+039D (greek capital letter nu)
|
46
|
+
["Ξ", "Xi"], #U+039E (greek capital letter xi)
|
47
|
+
["Ο", "Omicron"], #U+039F (greek capital letter omicron)
|
48
|
+
["Π", "Pi"], #U+03A0 (greek capital letter pi)
|
49
|
+
["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
|
50
|
+
["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
|
51
|
+
["Τ", "Tau"], #U+03A4 (greek capital letter tau)
|
52
|
+
["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
|
53
|
+
["Φ", "Phi"], #U+03A6 (greek capital letter phi)
|
54
|
+
["Χ", "Chi"], #U+03A7 (greek capital letter chi)
|
55
|
+
["Ψ", "Psi"], #U+03A8 (greek capital letter psi)
|
56
|
+
["Ω", "Omega"], #U+03A9 (greek capital letter omega)
|
57
|
+
|
58
|
+
["ϕ", "phi"], #U+03D5 (greek phi symbol)
|
59
|
+
|
60
|
+
["×", "x"], #U+00D7 (multiplication sign)
|
61
|
+
["•", "*"], #U+2022 (bullet)
|
62
|
+
[" ", " "], #U+2009 (thin space)
|
63
|
+
[" ", " "], #U+200A (hair space)
|
64
|
+
[" ", " "], #U+00A0 (no-break space)
|
65
|
+
[" ", " "], #U+3000 (ideographic space)
|
66
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
67
|
+
["−", "-"], #U+2212 (minus sign)
|
68
|
+
["–", "-"], #U+2013 (en dash)
|
69
|
+
["′", "'"], #U+2032 (prime)
|
70
|
+
["‘", "'"], #U+2018 (left single quotation mark)
|
71
|
+
["’", "'"], #U+2019 (right single quotation mark)
|
72
|
+
["“", '"'], #U+201C (left double quotation mark)
|
73
|
+
["”", '"'], #U+201D (right double quotation mark)
|
73
74
|
['"', "''"]
|
74
|
-
|
75
|
+
]
|
76
|
+
|
77
|
+
|
78
|
+
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
79
|
+
|
80
|
+
|
81
|
+
class << TextAlignment
|
82
|
+
def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
|
83
|
+
_mappings ||= TextAlignment::MAPPINGS
|
84
|
+
|
85
|
+
character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
86
|
+
if character_mappings.empty?
|
87
|
+
[_str1, _str2, _mappings]
|
88
|
+
else
|
89
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
90
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
91
|
+
characters_to.gsub!(/-/, '\-')
|
92
|
+
|
93
|
+
str1 = _str1.tr(characters_from, characters_to)
|
94
|
+
str2 = _str2.tr(characters_from, characters_to)
|
95
|
+
|
96
|
+
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
|
97
|
+
|
98
|
+
[str1, str2, mappings]
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
|
103
|
+
_mappings ||= TextAlignment::MAPPINGS
|
104
|
+
|
105
|
+
long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
106
|
+
if long_to_one_mappings.empty?
|
107
|
+
[_str1, _str2, _mappings]
|
108
|
+
else
|
109
|
+
## long to one character mappings
|
110
|
+
pletters = TextAlignment::PADDING_LETTERS
|
111
|
+
|
112
|
+
# find the padding letter for str1
|
113
|
+
@padding_letter1 = begin
|
114
|
+
i = pletters.index{|l| _str2.index(l).nil?}
|
115
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
116
|
+
TextAlignment::PADDING_LETTERS[i]
|
117
|
+
end
|
118
|
+
|
119
|
+
# find the padding letter for str2
|
120
|
+
@padding_letter2 = begin
|
121
|
+
i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
|
122
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
123
|
+
TextAlignment::PADDING_LETTERS[i]
|
124
|
+
end
|
125
|
+
|
126
|
+
str1 = str2 = nil
|
127
|
+
long_to_one_mappings.each do |f|
|
128
|
+
from = f[1]
|
129
|
+
|
130
|
+
str1 = if _str2.index(f[0])
|
131
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
132
|
+
_str1.gsub(from, to)
|
133
|
+
else
|
134
|
+
_str1
|
135
|
+
end
|
136
|
+
|
137
|
+
str2 = if _str1.index(f[0])
|
138
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
139
|
+
_str2.gsub(from, to)
|
140
|
+
else
|
141
|
+
_str2
|
142
|
+
end
|
143
|
+
end
|
144
|
+
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
|
145
|
+
|
146
|
+
[str1, str2, mappings]
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
def compute_similarity(_s1, _s2, sdiff)
|
151
|
+
return 0 if sdiff.nil?
|
152
|
+
|
153
|
+
# compute the lcs only with non-whitespace letters
|
154
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
155
|
+
return 0 if lcs == 0
|
156
|
+
|
157
|
+
s1 = if @padding_letter1
|
158
|
+
_s1.tr(@padding_letter1, ' ')
|
159
|
+
else
|
160
|
+
_s1
|
161
|
+
end
|
162
|
+
|
163
|
+
s2 = if @padding_letter2
|
164
|
+
_s2.tr(@padding_letter2, ' ')
|
165
|
+
else
|
166
|
+
_s2
|
167
|
+
end
|
168
|
+
|
169
|
+
similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
|
170
|
+
end
|
171
|
+
|
172
|
+
end
|
@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(_str1, _str2)
|
20
|
+
def initialize(_str1, _str2, _mappings = nil)
|
21
21
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
22
|
|
23
|
-
str1, str2, mappings =
|
23
|
+
str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
|
24
24
|
|
25
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
26
26
|
end
|
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
63
63
|
end
|
64
64
|
|
65
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
66
|
-
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
|
+
@similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
|
67
67
|
@str1_match_initial = cmp.str1_match_initial
|
68
68
|
@str1_match_final = cmp.str1_match_final
|
69
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
|
|
139
139
|
@position_map_end = posmap_end.sort.to_h
|
140
140
|
end
|
141
141
|
|
142
|
-
private
|
143
|
-
|
144
|
-
def string_preprocessing(_str1, _str2)
|
145
|
-
str1 = _str1.dup
|
146
|
-
str2 = _str2.dup
|
147
|
-
mappings = TextAlignment::MAPPINGS.dup
|
148
|
-
|
149
|
-
## single character mappings
|
150
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
-
characters_to.gsub!(/-/, '\-')
|
154
|
-
|
155
|
-
str1.tr!(characters_from, characters_to)
|
156
|
-
str2.tr!(characters_from, characters_to)
|
157
|
-
|
158
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
-
|
160
|
-
## long to one character mappings
|
161
|
-
pletters = TextAlignment::PADDING_LETTERS
|
162
|
-
|
163
|
-
# find the padding letter for str1
|
164
|
-
@padding_letter1 = begin
|
165
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
-
TextAlignment::PADDING_LETTERS[i]
|
168
|
-
end
|
169
|
-
|
170
|
-
# find the padding letter for str2
|
171
|
-
@padding_letter2 = begin
|
172
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
-
TextAlignment::PADDING_LETTERS[i]
|
175
|
-
end
|
176
|
-
|
177
|
-
# ASCII foldings
|
178
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
-
ascii_foldings.each do |f|
|
180
|
-
from = f[1]
|
181
|
-
|
182
|
-
if str2.index(f[0])
|
183
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
-
str1.gsub!(from, to)
|
185
|
-
end
|
186
|
-
|
187
|
-
if str1.index(f[0])
|
188
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
-
str2.gsub!(from, to)
|
190
|
-
end
|
191
|
-
end
|
192
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
-
|
194
|
-
[str1, str2, mappings]
|
195
|
-
end
|
196
|
-
|
197
|
-
def compute_similarity(_s1, _s2, sdiff)
|
198
|
-
return 0 if sdiff.nil?
|
199
|
-
|
200
|
-
# compute the lcs only with non-whitespace letters
|
201
|
-
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
-
return 0 if lcs == 0
|
203
|
-
|
204
|
-
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
-
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
-
|
207
|
-
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
-
end
|
209
|
-
|
210
142
|
end
|
@@ -5,50 +5,44 @@ require 'text_alignment/mixed_alignment'
|
|
5
5
|
|
6
6
|
module TextAlignment; end unless defined? TextAlignment
|
7
7
|
|
8
|
-
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
|
-
|
10
8
|
class TextAlignment::TextAlignment
|
11
9
|
attr_reader :block_alignment
|
12
10
|
attr_reader :similarity
|
13
11
|
attr_reader :lost_annotations
|
14
12
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
13
|
+
def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
14
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
15
|
|
18
|
-
@block_alignment = {source_text:
|
16
|
+
@block_alignment = {source_text:_str1, target_text:_str2}
|
17
|
+
@original_str1 = _str1
|
18
|
+
@original_str2 = _str2
|
19
19
|
|
20
|
-
|
21
|
-
block_begin = str2.index(str1)
|
22
|
-
unless block_begin.nil?
|
23
|
-
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
24
|
-
return @block_alignment
|
25
|
-
end
|
20
|
+
str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
|
26
21
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
31
|
-
return @block_alignment
|
22
|
+
if r = whole_block_alignment(str1, str2)
|
23
|
+
@block_alignment[:blocks] = r
|
24
|
+
return
|
32
25
|
end
|
33
26
|
|
27
|
+
## to find block alignments
|
34
28
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
35
29
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
last
|
40
|
-
|
41
|
-
last[:
|
42
|
-
last[:target][:end] = anchor[:target][:end]
|
30
|
+
blocks = []
|
31
|
+
while block = anchor_finder.get_next_anchor
|
32
|
+
last = blocks.last
|
33
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
34
|
+
last[:source][:end] = block[:source][:end]
|
35
|
+
last[:target][:end] = block[:target][:end]
|
43
36
|
else
|
44
|
-
|
37
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
45
38
|
end
|
46
39
|
end
|
47
40
|
|
48
|
-
# pp
|
41
|
+
# pp blocks
|
49
42
|
# puts "-----"
|
50
43
|
# puts
|
51
|
-
#
|
44
|
+
# exit
|
45
|
+
# blocks.each do |b|
|
52
46
|
# p [b[:source], b[:target]]
|
53
47
|
# puts "---"
|
54
48
|
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
@@ -60,114 +54,218 @@ class TextAlignment::TextAlignment
|
|
60
54
|
# puts "-=-=-=-=-"
|
61
55
|
# puts
|
62
56
|
|
63
|
-
##
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
if mblocks[0][:source][:begin] > 0
|
69
|
-
e1 = mblocks[0][:source][:begin]
|
70
|
-
e2 = mblocks[0][:target][:begin]
|
57
|
+
## to fill the gaps
|
58
|
+
last_block = nil
|
59
|
+
blocks2 = blocks.inject([]) do |sum, block|
|
60
|
+
b1 = last_block ? last_block[:source][:end] : 0
|
61
|
+
e1 = block[:source][:begin]
|
71
62
|
|
72
|
-
if
|
73
|
-
|
63
|
+
sum += if b1 == e1
|
64
|
+
[block]
|
74
65
|
else
|
75
|
-
|
76
|
-
|
66
|
+
b2 = last_block ? last_block[:target][:end] : 0
|
67
|
+
e2 = block[:target][:begin]
|
68
|
+
|
69
|
+
if b2 == e2
|
70
|
+
[
|
71
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
72
|
+
block
|
73
|
+
]
|
74
|
+
else
|
75
|
+
if b1 == 0 && b2 == 0
|
76
|
+
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
77
|
+
b2 = e2 - len_buffer if e2 > len_buffer
|
78
|
+
end
|
77
79
|
|
78
|
-
|
79
|
-
|
80
|
-
|
80
|
+
_str1 = str1[b1 ... e1]
|
81
|
+
_str2 = str2[b2 ... e2]
|
82
|
+
|
83
|
+
if _str1.strip.empty? || _str2.strip.empty?
|
84
|
+
[
|
85
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
86
|
+
block
|
87
|
+
]
|
81
88
|
else
|
82
|
-
|
83
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
84
|
-
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
85
|
-
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
86
|
-
|
87
|
-
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
88
|
-
|
89
|
-
_str1 = str1[b1 ... e1]
|
90
|
-
_str2 = str2[b2 ... e2]
|
91
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
92
|
-
if alignment.similarity < 0.5
|
93
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
94
|
-
else
|
95
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
96
|
-
end
|
89
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
97
90
|
end
|
98
91
|
end
|
99
92
|
end
|
93
|
+
|
94
|
+
last_block = block
|
95
|
+
sum
|
100
96
|
end
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
if
|
112
|
-
|
97
|
+
|
98
|
+
# the last step
|
99
|
+
blocks2 += if last_block.nil?
|
100
|
+
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
101
|
+
else
|
102
|
+
b1 = last_block[:source][:end]
|
103
|
+
if b1 < str1.length
|
104
|
+
e1 = str1.length
|
105
|
+
|
106
|
+
b2 = last_block[:target][:end]
|
107
|
+
if b2 < str2.length
|
108
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
109
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
110
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
113
111
|
else
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
112
|
+
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
113
|
+
end
|
114
|
+
else
|
115
|
+
[]
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
@block_alignment[:blocks] = blocks2
|
120
|
+
end
|
121
|
+
|
122
|
+
def whole_block_alignment(str1, str2)
|
123
|
+
## Block exact match
|
124
|
+
block_begin = str2.index(str1)
|
125
|
+
unless block_begin.nil?
|
126
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
127
|
+
end
|
128
|
+
|
129
|
+
block_begin = str2.downcase.index(str1.downcase)
|
130
|
+
unless block_begin.nil?
|
131
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
132
|
+
end
|
133
|
+
|
134
|
+
nil
|
135
|
+
end
|
136
|
+
|
137
|
+
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
138
|
+
block2 = str2[b2 ... e2]
|
139
|
+
|
140
|
+
## term-based alignment
|
141
|
+
tblocks = if denotations
|
142
|
+
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
143
|
+
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
144
|
+
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
145
|
+
|
146
|
+
position = 0
|
147
|
+
tblocks = ds_in_scope.map do |term|
|
148
|
+
lex = term[:lex]
|
149
|
+
r = block2.index(lex, position)
|
150
|
+
if r.nil?
|
151
|
+
position = nil
|
152
|
+
break
|
153
|
+
end
|
154
|
+
position = r + lex.length
|
155
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
156
|
+
end
|
157
|
+
|
158
|
+
# missing term found
|
159
|
+
tblocks = [] if position.nil?
|
160
|
+
|
161
|
+
# redundant matching found
|
162
|
+
unless position.nil?
|
163
|
+
ds_in_scope.each do |term|
|
164
|
+
lex = term[:lex]
|
165
|
+
look_forward = block2.index(lex, position)
|
166
|
+
unless look_forward.nil?
|
167
|
+
tblocks = []
|
168
|
+
break
|
119
169
|
end
|
120
170
|
end
|
121
171
|
end
|
122
|
-
|
172
|
+
|
173
|
+
tblocks
|
174
|
+
else
|
175
|
+
[]
|
123
176
|
end
|
124
177
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
178
|
+
if tblocks.empty?
|
179
|
+
if b1 == 0 && e1 == str1.length
|
180
|
+
if (e1 > 2000) || (e2 > 2000)
|
181
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
182
|
+
else
|
183
|
+
block1 = str1[b1 ... e1]
|
184
|
+
block2 = str2[b2 ... e2]
|
131
185
|
|
132
|
-
|
133
|
-
|
134
|
-
|
186
|
+
## character-based alignment
|
187
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
188
|
+
if alignment.sdiff.nil?
|
189
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
190
|
+
else
|
191
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
192
|
+
end
|
193
|
+
end
|
194
|
+
else
|
195
|
+
block1 = str1[b1 ... e1]
|
196
|
+
block2 = str2[b2 ... e2]
|
197
|
+
|
198
|
+
## character-based alignment
|
199
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
200
|
+
if alignment.sdiff.nil?
|
201
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
135
202
|
else
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
203
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
204
|
+
end
|
205
|
+
end
|
206
|
+
else
|
207
|
+
last_tblock = nil
|
208
|
+
lblocks = tblocks.inject([]) do |sum, tblock|
|
209
|
+
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
210
|
+
te1 = tblock[:source][:begin]
|
142
211
|
|
143
|
-
|
144
|
-
|
145
|
-
|
212
|
+
sum += if te1 == tb1
|
213
|
+
[tblock]
|
214
|
+
else
|
215
|
+
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
216
|
+
te2 = tblock[:target][:begin]
|
217
|
+
|
218
|
+
if b2 == e2
|
219
|
+
[
|
220
|
+
{source:{begin:tb1, end:te1}, alignment: :empty},
|
221
|
+
tblock
|
222
|
+
]
|
146
223
|
else
|
147
|
-
|
224
|
+
[
|
225
|
+
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
226
|
+
tblock
|
227
|
+
]
|
148
228
|
end
|
229
|
+
end
|
230
|
+
|
231
|
+
last_tblock = tblock
|
232
|
+
sum
|
233
|
+
end
|
149
234
|
|
150
|
-
|
235
|
+
if last_tblock[:source][:end] < e1
|
236
|
+
if last_tblock[:target][:end] < e2
|
237
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
238
|
+
else
|
239
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
151
240
|
end
|
152
241
|
end
|
153
|
-
end
|
154
242
|
|
155
|
-
|
156
|
-
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
243
|
+
lblocks
|
157
244
|
end
|
158
245
|
end
|
159
246
|
|
247
|
+
|
248
|
+
# Lazily enumerates the start offsets of every non-overlapping occurrence
# of +target+ within +str+, scanning left to right.
#
# @param str [String] the text to search in
# @param target [String] the substring to look for
# @return [Enumerator] yields each match offset (Integer, character index)
#   in ascending order; yields nothing when +target+ does not occur
def indices(str, target)
  # String has no #len method; the original call raised NoMethodError.
  # Step by at least one character so that an empty target (which matches
  # at every position) cannot stall the scan in an endless loop.
  step = [target.length, 1].max

  Enumerator.new do |yielder|
    position = 0
    while (idx = str.index(target, position))
      yielder << idx
      # resume the search right after the current match (non-overlapping)
      position = idx + step
    end
  end
end
|
258
|
+
|
160
259
|
def transform_begin_position(begin_position)
|
161
260
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
162
261
|
block = @block_alignment[:blocks][i]
|
163
262
|
|
164
|
-
b = if block[:alignment] == :block
|
263
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
165
264
|
begin_position + block[:delta]
|
166
265
|
elsif block[:alignment] == :empty
|
167
266
|
if begin_position == block[:source][:begin]
|
168
267
|
block[:target][:begin]
|
169
268
|
else
|
170
|
-
# raise "lost annotation"
|
171
269
|
nil
|
172
270
|
end
|
173
271
|
else
|
@@ -180,13 +278,12 @@ class TextAlignment::TextAlignment
|
|
180
278
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
181
279
|
block = @block_alignment[:blocks][i]
|
182
280
|
|
183
|
-
e = if block[:alignment] == :block
|
281
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
184
282
|
end_position + block[:delta]
|
185
283
|
elsif block[:alignment] == :empty
|
186
284
|
if end_position == block[:source][:end]
|
187
285
|
block[:target][:end]
|
188
286
|
else
|
189
|
-
# raise "lost annotation"
|
190
287
|
nil
|
191
288
|
end
|
192
289
|
else
|
@@ -208,14 +305,14 @@ class TextAlignment::TextAlignment
|
|
208
305
|
@lost_annotations = []
|
209
306
|
|
210
307
|
denotations.each do |d|
|
211
|
-
begin
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
end
|
308
|
+
source = {begin:d.begin, end:d.end}
|
309
|
+
d.begin = transform_begin_position(d.begin);
|
310
|
+
d.end = transform_end_position(d.end);
|
311
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
|
312
|
+
rescue
|
313
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
314
|
+
d.begin = nil
|
315
|
+
d.end = nil
|
219
316
|
end
|
220
317
|
|
221
318
|
@lost_annotations
|
@@ -226,12 +323,12 @@ class TextAlignment::TextAlignment
|
|
226
323
|
@lost_annotations = []
|
227
324
|
|
228
325
|
r = hdenotations.collect do |d|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
326
|
+
t = transform_a_span(d[:span])
|
327
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
|
328
|
+
new_d = d.dup.merge({span:t})
|
329
|
+
rescue
|
330
|
+
@lost_annotations << {source: d[:span], target:t}
|
331
|
+
nil
|
235
332
|
end.compact
|
236
333
|
|
237
334
|
r
|
@@ -245,14 +342,22 @@ class TextAlignment::TextAlignment
|
|
245
342
|
@block_alignment[:blocks].each do |a|
|
246
343
|
show += case a[:alignment]
|
247
344
|
when :block
|
248
|
-
"===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
345
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
346
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
347
|
+
when :term
|
348
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
249
349
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
250
350
|
when :empty
|
251
351
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
252
352
|
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
253
353
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
254
|
-
">>>>> string 2
|
255
|
-
|
354
|
+
">>>>> string 2 " +
|
355
|
+
if a[:target]
|
356
|
+
"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
357
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
358
|
+
else
|
359
|
+
"[-]\n\n"
|
360
|
+
end
|
256
361
|
else
|
257
362
|
astr1 = ''
|
258
363
|
astr2 = ''
|
@@ -292,5 +397,4 @@ class TextAlignment::TextAlignment
|
|
292
397
|
end
|
293
398
|
show
|
294
399
|
end
|
295
|
-
|
296
400
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.4
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|