text_alignment 0.6.4 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +33 -153
- data/lib/text_alignment/constants.rb +1 -1
- data/lib/text_alignment/mappings.rb +168 -70
- data/lib/text_alignment/mixed_alignment.rb +3 -71
- data/lib/text_alignment/text_alignment.rb +223 -119
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c49793b84e9ca5606a8fe9151530f6732337c2b92cfd1af1549a56ea3c2f39e
|
4
|
+
data.tar.gz: 4a0b8328d4c6de43af50bd8c278f83facef311c74516dc3a9a0c9dd5f91fbfc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad70de67b4a7b38290a59d89bed7cd9fa343ce3db62bb67b779e51b499929b9d7c4697871cf25017fa59a34c0525741d5da679559fc2f33d902fcda692f9f1ac
|
7
|
+
data.tar.gz: 6242b35cbb8f53effc477b508d428d9060cba2e93895496134963755cff956f3931411a87a7bc0d33b7b91459f1a27eb67c422897693320e1b2ceb06eff6e22b
|
data/bin/align_annotations
CHANGED
@@ -26,33 +26,43 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def
|
29
|
+
def align_denotations(denotations, source_text, target_text, debug = false)
|
30
|
+
alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
|
31
|
+
new_denotations = alignment.transform_hdenotations(denotations)
|
32
|
+
|
33
|
+
if debug
|
34
|
+
warn "[block alignment]"
|
35
|
+
warn alignment.alignment_show
|
36
|
+
warn "-----"
|
37
|
+
end
|
38
|
+
|
39
|
+
lost_annotations = alignment.lost_annotations
|
40
|
+
unless lost_annotations.empty?
|
41
|
+
warn "\n[lost annotations] #{lost_annotations.length}"
|
42
|
+
lost_annotations.each do |a|
|
43
|
+
warn "#{a}"
|
44
|
+
end
|
45
|
+
warn "====="
|
46
|
+
end
|
47
|
+
warn
|
48
|
+
|
49
|
+
# return target annotations
|
50
|
+
new_denotations
|
51
|
+
end
|
52
|
+
|
53
|
+
def align_mannotations(source_annotations, target_text, debug = false)
|
54
|
+
target_annotations = {text:target_text}
|
55
|
+
|
30
56
|
idnum_denotations = 0
|
31
57
|
idnum_relations = 0
|
32
58
|
idnum_attributes = 0
|
33
59
|
idnum_modifications = 0
|
34
60
|
|
35
|
-
source_annotations.
|
36
|
-
alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
|
37
|
-
|
38
|
-
puts alignment.alignment_show
|
39
|
-
puts "-----"
|
40
|
-
puts
|
41
|
-
|
42
|
-
# alignment.block_alignments.each do |a|
|
43
|
-
# p {source:a[:source], target:a[:target]}
|
44
|
-
# puts "--"
|
45
|
-
# p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
|
46
|
-
# puts "--"
|
47
|
-
# puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
48
|
-
# puts "--"
|
49
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
50
|
-
# puts "======"
|
51
|
-
# end
|
52
|
-
|
61
|
+
source_annotations.each_with_index do |annotations, i|
|
53
62
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
54
63
|
ididx = {}
|
55
|
-
|
64
|
+
warn "[#{i}]-=-=-=-=-"
|
65
|
+
denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
|
56
66
|
denotations.each do |d|
|
57
67
|
reid = 'T' + (idnum_denotations += 1).to_s
|
58
68
|
ididx[d[:id]] = reid
|
@@ -101,141 +111,11 @@ end
|
|
101
111
|
source_annotations = read_annotations(ARGV[0])
|
102
112
|
target_text = read_text(ARGV[1])
|
103
113
|
|
104
|
-
lost_annotations = []
|
105
114
|
target_annotations = if source_annotations.class == Array
|
106
|
-
|
115
|
+
align_mannotations(source_annotations, target_text, false)
|
107
116
|
else
|
108
|
-
|
109
|
-
|
110
|
-
# pp alignment
|
111
|
-
|
112
|
-
# verification
|
113
|
-
# source_text = source_annotations[:text]
|
114
|
-
# puts "=====BEGIN"
|
115
|
-
# (0 ... source_text.rstrip.length).each do |p|
|
116
|
-
# t = alignment.transform_begin_position(p)
|
117
|
-
# if t.nil?
|
118
|
-
# print source_text[p]
|
119
|
-
# else
|
120
|
-
# print '.'
|
121
|
-
# end
|
122
|
-
# end
|
123
|
-
# puts
|
124
|
-
# puts "=====END"
|
125
|
-
|
126
|
-
# puts "=====BEGIN"
|
127
|
-
# (0 .. source_text.rstrip.length).each do |p|
|
128
|
-
# t = alignment.transform_end_position(p)
|
129
|
-
# if t.nil?
|
130
|
-
# print source_text[p]
|
131
|
-
# else
|
132
|
-
# print '.'
|
133
|
-
# end
|
134
|
-
# end
|
135
|
-
# puts
|
136
|
-
# puts "=====END"
|
137
|
-
|
138
|
-
source_text = source_annotations[:text]
|
139
|
-
|
140
|
-
puts "[block alignment]"
|
141
|
-
puts alignment.alignment_show
|
142
|
-
puts "====="
|
143
|
-
# exit
|
144
|
-
|
145
|
-
# verification of source denotations
|
146
|
-
puts "[Invalid source denotations]"
|
147
|
-
source_annotations[:denotations] do |d|
|
148
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
149
|
-
end
|
150
|
-
puts "====="
|
151
|
-
puts
|
152
|
-
|
153
|
-
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
154
|
-
puts "[Invalid transformation]"
|
155
|
-
denotations.each do |d|
|
156
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
157
|
-
end
|
158
|
-
puts "====="
|
159
|
-
puts
|
160
|
-
|
161
|
-
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
162
|
-
|
117
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
|
163
118
|
source_annotations.merge({text:target_text, denotations:denotations})
|
164
119
|
end
|
165
120
|
|
166
|
-
|
167
|
-
num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
|
168
|
-
source_annotations.each do |annotations|
|
169
|
-
num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
|
170
|
-
num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
|
171
|
-
num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
|
172
|
-
num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
|
173
|
-
end
|
174
|
-
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
175
|
-
else
|
176
|
-
num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
|
177
|
-
num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
|
178
|
-
num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
|
179
|
-
num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
|
180
|
-
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
181
|
-
end
|
182
|
-
|
183
|
-
warn "[source]"
|
184
|
-
warn "denotations:\t#{num_denotations_source}"
|
185
|
-
# warn "relations:\t#{num_relations_source}"
|
186
|
-
# warn "attributes:\t#{num_attributes_source}"
|
187
|
-
# warn "modifications:\t#{num_modifications_source}"
|
188
|
-
|
189
|
-
warn "\n[target]"
|
190
|
-
warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
|
191
|
-
# warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
|
192
|
-
# warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
|
193
|
-
# warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
|
194
|
-
|
195
|
-
if lost_annotations
|
196
|
-
warn "\n[lost annotations]"
|
197
|
-
warn "#{lost_annotations.length}"
|
198
|
-
end
|
199
|
-
|
200
|
-
#puts target_annotations.to_json
|
201
|
-
|
202
|
-
# denotations = anns1[:denotations]
|
203
|
-
|
204
|
-
# puts "[Alignment1]====="
|
205
|
-
# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
|
206
|
-
|
207
|
-
# align.alignment.each do |a|
|
208
|
-
# p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
|
209
|
-
# end
|
210
|
-
|
211
|
-
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
212
|
-
# puts
|
213
|
-
# puts "[Similarity]\n#{align.similarity}"
|
214
|
-
# puts
|
215
|
-
# puts '[Denotations original]'
|
216
|
-
# pp denotations
|
217
|
-
# puts
|
218
|
-
# puts '[Denotations transformed]'
|
219
|
-
# new_denotations = align.transform_hdenotations(denotations)
|
220
|
-
# pp new_denotations
|
221
|
-
# puts
|
222
|
-
# puts "[Alignment2 (downcased)]====="
|
223
|
-
# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
|
224
|
-
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
225
|
-
# puts
|
226
|
-
# puts "[Similarity]\n#{align.similarity}"
|
227
|
-
# puts
|
228
|
-
# puts '[Denotations original]'
|
229
|
-
# pp denotations
|
230
|
-
# puts
|
231
|
-
# puts '[Denotations transformed]'
|
232
|
-
# new_denotations = align.transform_hdenotations(denotations)
|
233
|
-
# pp new_denotations
|
234
|
-
# puts
|
235
|
-
# puts '[Annotations transformed]'
|
236
|
-
# anns2[:denotations] = new_denotations
|
237
|
-
# puts anns2.to_json
|
238
|
-
|
239
|
-
# p align.common_elements
|
240
|
-
# puts "---------------"
|
241
|
-
# p align.mapped_elements
|
121
|
+
# puts target_annotations.to_json
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
3
|
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
-
TextAlignment::SIZE_WINDOW =
|
4
|
+
TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
|
5
5
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
6
|
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
7
|
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -1,74 +1,172 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
3
|
TextAlignment::MAPPINGS = [
|
4
|
-
["©", "(c)"],
|
5
|
-
|
6
|
-
["α", "alpha"],
|
7
|
-
["β", "beta"],
|
8
|
-
["γ", "gamma"],
|
9
|
-
["δ", "delta"],
|
10
|
-
["ε", "epsilon"],
|
11
|
-
["ζ", "zeta"],
|
12
|
-
["η", "eta"],
|
13
|
-
["θ", "theta"],
|
14
|
-
["ι", "iota"],
|
15
|
-
["κ", "kappa"],
|
16
|
-
["λ", "lambda"],
|
17
|
-
["λ", "lamda"],
|
18
|
-
["μ", "mu"],
|
19
|
-
["ν", "nu"],
|
20
|
-
["ξ", "xi"],
|
21
|
-
["ο", "omicron"],
|
22
|
-
["π", "pi"],
|
23
|
-
["ρ", "rho"],
|
24
|
-
["σ", "sigma"],
|
25
|
-
["τ", "tau"],
|
26
|
-
["υ", "upsilon"],
|
27
|
-
["φ", "phi"],
|
28
|
-
["χ", "chi"],
|
29
|
-
["ψ", "psi"],
|
30
|
-
["ω", "omega"],
|
31
|
-
|
32
|
-
["Α", "Alpha"],
|
33
|
-
["Β", "Beta"],
|
34
|
-
["Γ", "Gamma"],
|
35
|
-
["Δ", "Delta"],
|
36
|
-
["Ε", "Epsilon"],
|
37
|
-
["Ζ", "Zeta"],
|
38
|
-
["Η", "Eta"],
|
39
|
-
["Θ", "Theta"],
|
40
|
-
["Ι", "Iota"],
|
41
|
-
["Κ", "Kappa"],
|
42
|
-
["Λ", "Lambda"],
|
43
|
-
["Λ", "Lamda"],
|
44
|
-
["Μ", "Mu"],
|
45
|
-
["Ν", "Nu"],
|
46
|
-
["Ξ", "Xi"],
|
47
|
-
["Ο", "Omicron"],
|
48
|
-
["Π", "Pi"],
|
49
|
-
["Ρ", "Rho"],
|
50
|
-
["Σ", "Sigma"],
|
51
|
-
["Τ", "Tau"],
|
52
|
-
["Υ", "Upsilon"],
|
53
|
-
["Φ", "Phi"],
|
54
|
-
["Χ", "Chi"],
|
55
|
-
["Ψ", "Psi"],
|
56
|
-
["Ω", "Omega"],
|
57
|
-
|
58
|
-
["ϕ", "phi"],
|
59
|
-
|
60
|
-
["×", "x"],
|
61
|
-
["•", "*"],
|
62
|
-
[" ", " "],
|
63
|
-
[" ", " "],
|
64
|
-
[" ", " "],
|
65
|
-
[" ", " "],
|
66
|
-
["
|
67
|
-
["
|
68
|
-
["
|
69
|
-
["
|
70
|
-
["
|
71
|
-
["
|
72
|
-
["
|
4
|
+
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
|
+
|
6
|
+
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
7
|
+
["β", "beta"], #U+03B2 (greek small letter beta)
|
8
|
+
["γ", "gamma"], #U+03B3 (greek small letter gamma)
|
9
|
+
["δ", "delta"], #U+03B4 (greek small letter delta)
|
10
|
+
["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
|
11
|
+
["ζ", "zeta"], #U+03B6 (greek small letter zeta)
|
12
|
+
["η", "eta"], #U+03B7 (greek small letter eta)
|
13
|
+
["θ", "theta"], #U+03B8 (greek small letter theta)
|
14
|
+
["ι", "iota"], #U+03B9 (greek small letter iota)
|
15
|
+
["κ", "kappa"], #U+03BA (greek small letter kappa)
|
16
|
+
["λ", "lambda"], #U+03BB (greek small letter lambda)
|
17
|
+
["λ", "lamda"], #U+03BB (greek small letter lambda)
|
18
|
+
["μ", "mu"], #U+03BC (greek small letter mu)
|
19
|
+
["ν", "nu"], #U+03BD (greek small letter nu)
|
20
|
+
["ξ", "xi"], #U+03BE (greek small letter xi)
|
21
|
+
["ο", "omicron"], #U+03BF (greek small letter omicron)
|
22
|
+
["π", "pi"], #U+03C0 (greek small letter pi)
|
23
|
+
["ρ", "rho"], #U+03C1 (greek small letter rho)
|
24
|
+
["σ", "sigma"], #U+03C3 (greek small letter sigma)
|
25
|
+
["τ", "tau"], #U+03C4 (greek small letter tau)
|
26
|
+
["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
|
27
|
+
["φ", "phi"], #U+03C6 (greek small letter phi)
|
28
|
+
["χ", "chi"], #U+03C7 (greek small letter chi)
|
29
|
+
["ψ", "psi"], #U+03C8 (greek small letter psi)
|
30
|
+
["ω", "omega"], #U+03C9 (greek small letter omega)
|
31
|
+
|
32
|
+
["Α", "Alpha"], #U+0391 (greek capital letter alpha)
|
33
|
+
["Β", "Beta"], #U+0392 (greek capital letter beta)
|
34
|
+
["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
|
35
|
+
["Δ", "Delta"], #U+0394 (greek capital letter delta)
|
36
|
+
["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
|
37
|
+
["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
|
38
|
+
["Η", "Eta"], #U+0397 (greek capital letter eta)
|
39
|
+
["Θ", "Theta"], #U+0398 (greek capital letter theta)
|
40
|
+
["Ι", "Iota"], #U+0399 (greek capital letter iota)
|
41
|
+
["Κ", "Kappa"], #U+039A (greek capital letter kappa)
|
42
|
+
["Λ", "Lambda"], #U+039B (greek capital letter lambda)
|
43
|
+
["Λ", "Lamda"], #U+039B (greek capital letter lambda)
|
44
|
+
["Μ", "Mu"], #U+039C (greek capital letter mu)
|
45
|
+
["Ν", "Nu"], #U+039D (greek capital letter nu)
|
46
|
+
["Ξ", "Xi"], #U+039E (greek capital letter xi)
|
47
|
+
["Ο", "Omicron"], #U+039F (greek capital letter omicron)
|
48
|
+
["Π", "Pi"], #U+03A0 (greek capital letter pi)
|
49
|
+
["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
|
50
|
+
["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
|
51
|
+
["Τ", "Tau"], #U+03A4 (greek capital letter tau)
|
52
|
+
["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
|
53
|
+
["Φ", "Phi"], #U+03A6 (greek capital letter phi)
|
54
|
+
["Χ", "Chi"], #U+03A7 (greek capital letter chi)
|
55
|
+
["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
|
56
|
+
["Ω", "Omega"], #U+03A9 (greek capital letter omega)
|
57
|
+
|
58
|
+
["ϕ", "phi"], #U+03D5 (greek phi symbol)
|
59
|
+
|
60
|
+
["×", "x"], #U+00D7 (multiplication sign)
|
61
|
+
["•", "*"], #U+2022 (bullet)
|
62
|
+
[" ", " "], #U+2009 (thin space)
|
63
|
+
[" ", " "], #U+200A (hair space)
|
64
|
+
[" ", " "], #U+00A0 (no-break space)
|
65
|
+
[" ", " "], #U+3000 (ideographic space)
|
66
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
67
|
+
["−", "-"], #U+2212 (minus sign)
|
68
|
+
["–", "-"], #U+2013 (en dash)
|
69
|
+
["′", "'"], #U+2032 (prime)
|
70
|
+
["‘", "'"], #U+2018 (left single quotation mark)
|
71
|
+
["’", "'"], #U+2019 (right single quotation mark)
|
72
|
+
["“", '"'], #U+201C (left double quotation mark)
|
73
|
+
["”", '"'], #U+201D (right double quotation mark)
|
73
74
|
['"', "''"]
|
74
|
-
|
75
|
+
]
|
76
|
+
|
77
|
+
|
78
|
+
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
79
|
+
|
80
|
+
|
81
|
+
class << TextAlignment
|
82
|
+
def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
|
83
|
+
_mappings ||= TextAlignment::MAPPINGS
|
84
|
+
|
85
|
+
character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
86
|
+
if character_mappings.empty?
|
87
|
+
[_str1, _str2, _mappings]
|
88
|
+
else
|
89
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
90
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
91
|
+
characters_to.gsub!(/-/, '\-')
|
92
|
+
|
93
|
+
str1 = _str1.tr(characters_from, characters_to)
|
94
|
+
str2 = _str2.tr(characters_from, characters_to)
|
95
|
+
|
96
|
+
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
|
97
|
+
|
98
|
+
[str1, str2, mappings]
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
|
103
|
+
_mappings ||= TextAlignment::MAPPINGS
|
104
|
+
|
105
|
+
long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
106
|
+
if long_to_one_mappings.empty?
|
107
|
+
[_str1, _str2, _mappings]
|
108
|
+
else
|
109
|
+
## long to one character mappings
|
110
|
+
pletters = TextAlignment::PADDING_LETTERS
|
111
|
+
|
112
|
+
# find the padding letter for str1
|
113
|
+
@padding_letter1 = begin
|
114
|
+
i = pletters.index{|l| _str2.index(l).nil?}
|
115
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
116
|
+
TextAlignment::PADDING_LETTERS[i]
|
117
|
+
end
|
118
|
+
|
119
|
+
# find the padding letter for str2
|
120
|
+
@padding_letter2 = begin
|
121
|
+
i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
|
122
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
123
|
+
TextAlignment::PADDING_LETTERS[i]
|
124
|
+
end
|
125
|
+
|
126
|
+
str1 = str2 = nil
|
127
|
+
long_to_one_mappings.each do |f|
|
128
|
+
from = f[1]
|
129
|
+
|
130
|
+
str1 = if _str2.index(f[0])
|
131
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
132
|
+
_str1.gsub(from, to)
|
133
|
+
else
|
134
|
+
_str1
|
135
|
+
end
|
136
|
+
|
137
|
+
str2 = if _str1.index(f[0])
|
138
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
139
|
+
_str2.gsub(from, to)
|
140
|
+
else
|
141
|
+
_str2
|
142
|
+
end
|
143
|
+
end
|
144
|
+
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
|
145
|
+
|
146
|
+
[str1, str2, mappings]
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
def compute_similarity(_s1, _s2, sdiff)
|
151
|
+
return 0 if sdiff.nil?
|
152
|
+
|
153
|
+
# compute the lcs only with non-whitespace letters
|
154
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
155
|
+
return 0 if lcs == 0
|
156
|
+
|
157
|
+
s1 = if @padding_letter1
|
158
|
+
_s1.tr(@padding_letter1, ' ')
|
159
|
+
else
|
160
|
+
_s1
|
161
|
+
end
|
162
|
+
|
163
|
+
s2 = if @padding_letter2
|
164
|
+
_s2.tr(@padding_letter2, ' ')
|
165
|
+
else
|
166
|
+
_s2
|
167
|
+
end
|
168
|
+
|
169
|
+
similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
|
170
|
+
end
|
171
|
+
|
172
|
+
end
|
@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(_str1, _str2)
|
20
|
+
def initialize(_str1, _str2, _mappings = nil)
|
21
21
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
22
|
|
23
|
-
str1, str2, mappings =
|
23
|
+
str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
|
24
24
|
|
25
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
26
26
|
end
|
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
63
63
|
end
|
64
64
|
|
65
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
66
|
-
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
|
+
@similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
|
67
67
|
@str1_match_initial = cmp.str1_match_initial
|
68
68
|
@str1_match_final = cmp.str1_match_final
|
69
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
|
|
139
139
|
@position_map_end = posmap_end.sort.to_h
|
140
140
|
end
|
141
141
|
|
142
|
-
private
|
143
|
-
|
144
|
-
def string_preprocessing(_str1, _str2)
|
145
|
-
str1 = _str1.dup
|
146
|
-
str2 = _str2.dup
|
147
|
-
mappings = TextAlignment::MAPPINGS.dup
|
148
|
-
|
149
|
-
## single character mappings
|
150
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
-
characters_to.gsub!(/-/, '\-')
|
154
|
-
|
155
|
-
str1.tr!(characters_from, characters_to)
|
156
|
-
str2.tr!(characters_from, characters_to)
|
157
|
-
|
158
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
-
|
160
|
-
## long to one character mappings
|
161
|
-
pletters = TextAlignment::PADDING_LETTERS
|
162
|
-
|
163
|
-
# find the padding letter for str1
|
164
|
-
@padding_letter1 = begin
|
165
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
-
TextAlignment::PADDING_LETTERS[i]
|
168
|
-
end
|
169
|
-
|
170
|
-
# find the padding letter for str2
|
171
|
-
@padding_letter2 = begin
|
172
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
-
TextAlignment::PADDING_LETTERS[i]
|
175
|
-
end
|
176
|
-
|
177
|
-
# ASCII foldings
|
178
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
-
ascii_foldings.each do |f|
|
180
|
-
from = f[1]
|
181
|
-
|
182
|
-
if str2.index(f[0])
|
183
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
-
str1.gsub!(from, to)
|
185
|
-
end
|
186
|
-
|
187
|
-
if str1.index(f[0])
|
188
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
-
str2.gsub!(from, to)
|
190
|
-
end
|
191
|
-
end
|
192
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
-
|
194
|
-
[str1, str2, mappings]
|
195
|
-
end
|
196
|
-
|
197
|
-
def compute_similarity(_s1, _s2, sdiff)
|
198
|
-
return 0 if sdiff.nil?
|
199
|
-
|
200
|
-
# compute the lcs only with non-whitespace letters
|
201
|
-
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
-
return 0 if lcs == 0
|
203
|
-
|
204
|
-
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
-
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
-
|
207
|
-
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
-
end
|
209
|
-
|
210
142
|
end
|
@@ -5,50 +5,44 @@ require 'text_alignment/mixed_alignment'
|
|
5
5
|
|
6
6
|
module TextAlignment; end unless defined? TextAlignment
|
7
7
|
|
8
|
-
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
|
-
|
10
8
|
class TextAlignment::TextAlignment
|
11
9
|
attr_reader :block_alignment
|
12
10
|
attr_reader :similarity
|
13
11
|
attr_reader :lost_annotations
|
14
12
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
13
|
+
def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
14
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
15
|
|
18
|
-
@block_alignment = {source_text:
|
16
|
+
@block_alignment = {source_text:_str1, target_text:_str2}
|
17
|
+
@original_str1 = _str1
|
18
|
+
@original_str2 = _str2
|
19
19
|
|
20
|
-
|
21
|
-
block_begin = str2.index(str1)
|
22
|
-
unless block_begin.nil?
|
23
|
-
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
24
|
-
return @block_alignment
|
25
|
-
end
|
20
|
+
str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
|
26
21
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
31
|
-
return @block_alignment
|
22
|
+
if r = whole_block_alignment(str1, str2)
|
23
|
+
@block_alignment[:blocks] = r
|
24
|
+
return
|
32
25
|
end
|
33
26
|
|
27
|
+
## to find block alignments
|
34
28
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
35
29
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
last
|
40
|
-
|
41
|
-
last[:
|
42
|
-
last[:target][:end] = anchor[:target][:end]
|
30
|
+
blocks = []
|
31
|
+
while block = anchor_finder.get_next_anchor
|
32
|
+
last = blocks.last
|
33
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
34
|
+
last[:source][:end] = block[:source][:end]
|
35
|
+
last[:target][:end] = block[:target][:end]
|
43
36
|
else
|
44
|
-
|
37
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
45
38
|
end
|
46
39
|
end
|
47
40
|
|
48
|
-
# pp
|
41
|
+
# pp blocks
|
49
42
|
# puts "-----"
|
50
43
|
# puts
|
51
|
-
#
|
44
|
+
# exit
|
45
|
+
# blocks.each do |b|
|
52
46
|
# p [b[:source], b[:target]]
|
53
47
|
# puts "---"
|
54
48
|
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
@@ -60,114 +54,218 @@ class TextAlignment::TextAlignment
|
|
60
54
|
# puts "-=-=-=-=-"
|
61
55
|
# puts
|
62
56
|
|
63
|
-
##
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
if mblocks[0][:source][:begin] > 0
|
69
|
-
e1 = mblocks[0][:source][:begin]
|
70
|
-
e2 = mblocks[0][:target][:begin]
|
57
|
+
## to fill the gaps
|
58
|
+
last_block = nil
|
59
|
+
blocks2 = blocks.inject([]) do |sum, block|
|
60
|
+
b1 = last_block ? last_block[:source][:end] : 0
|
61
|
+
e1 = block[:source][:begin]
|
71
62
|
|
72
|
-
if
|
73
|
-
|
63
|
+
sum += if b1 == e1
|
64
|
+
[block]
|
74
65
|
else
|
75
|
-
|
76
|
-
|
66
|
+
b2 = last_block ? last_block[:target][:end] : 0
|
67
|
+
e2 = block[:target][:begin]
|
68
|
+
|
69
|
+
if b2 == e2
|
70
|
+
[
|
71
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
72
|
+
block
|
73
|
+
]
|
74
|
+
else
|
75
|
+
if b1 == 0 && b2 == 0
|
76
|
+
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
77
|
+
b2 = e2 - len_buffer if e2 > len_buffer
|
78
|
+
end
|
77
79
|
|
78
|
-
|
79
|
-
|
80
|
-
|
80
|
+
_str1 = str1[b1 ... e1]
|
81
|
+
_str2 = str2[b2 ... e2]
|
82
|
+
|
83
|
+
if _str1.strip.empty? || _str2.strip.empty?
|
84
|
+
[
|
85
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
86
|
+
block
|
87
|
+
]
|
81
88
|
else
|
82
|
-
|
83
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
84
|
-
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
85
|
-
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
86
|
-
|
87
|
-
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
88
|
-
|
89
|
-
_str1 = str1[b1 ... e1]
|
90
|
-
_str2 = str2[b2 ... e2]
|
91
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
92
|
-
if alignment.similarity < 0.5
|
93
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
94
|
-
else
|
95
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
96
|
-
end
|
89
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
97
90
|
end
|
98
91
|
end
|
99
92
|
end
|
93
|
+
|
94
|
+
last_block = block
|
95
|
+
sum
|
100
96
|
end
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
if
|
112
|
-
|
97
|
+
|
98
|
+
# the last step
|
99
|
+
blocks2 += if last_block.nil?
|
100
|
+
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
101
|
+
else
|
102
|
+
b1 = last_block[:source][:end]
|
103
|
+
if b1 < str1.length
|
104
|
+
e1 = str1.length
|
105
|
+
|
106
|
+
b2 = last_block[:target][:end]
|
107
|
+
if b2 < str2.length
|
108
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
109
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
110
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
113
111
|
else
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
112
|
+
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
113
|
+
end
|
114
|
+
else
|
115
|
+
[]
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
@block_alignment[:blocks] = blocks2
|
120
|
+
end
|
121
|
+
|
122
|
+
def whole_block_alignment(str1, str2)
|
123
|
+
## Block exact match
|
124
|
+
block_begin = str2.index(str1)
|
125
|
+
unless block_begin.nil?
|
126
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
127
|
+
end
|
128
|
+
|
129
|
+
block_begin = str2.downcase.index(str1.downcase)
|
130
|
+
unless block_begin.nil?
|
131
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
132
|
+
end
|
133
|
+
|
134
|
+
nil
|
135
|
+
end
|
136
|
+
|
137
|
+
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
138
|
+
block2 = str2[b2 ... e2]
|
139
|
+
|
140
|
+
## term-based alignment
|
141
|
+
tblocks = if denotations
|
142
|
+
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
143
|
+
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
144
|
+
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
145
|
+
|
146
|
+
position = 0
|
147
|
+
tblocks = ds_in_scope.map do |term|
|
148
|
+
lex = term[:lex]
|
149
|
+
r = block2.index(lex, position)
|
150
|
+
if r.nil?
|
151
|
+
position = nil
|
152
|
+
break
|
153
|
+
end
|
154
|
+
position = r + lex.length
|
155
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
156
|
+
end
|
157
|
+
|
158
|
+
# missing term found
|
159
|
+
tblocks = [] if position.nil?
|
160
|
+
|
161
|
+
# redundant matching found
|
162
|
+
unless position.nil?
|
163
|
+
ds_in_scope.each do |term|
|
164
|
+
lex = term[:lex]
|
165
|
+
look_forward = block2.index(lex, position)
|
166
|
+
unless look_forward.nil?
|
167
|
+
tblocks = []
|
168
|
+
break
|
119
169
|
end
|
120
170
|
end
|
121
171
|
end
|
122
|
-
|
172
|
+
|
173
|
+
tblocks
|
174
|
+
else
|
175
|
+
[]
|
123
176
|
end
|
124
177
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
178
|
+
if tblocks.empty?
|
179
|
+
if b1 == 0 && e1 == str1.length
|
180
|
+
if (e1 > 2000) || (e2 > 2000)
|
181
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
182
|
+
else
|
183
|
+
block1 = str1[b1 ... e1]
|
184
|
+
block2 = str2[b2 ... e2]
|
131
185
|
|
132
|
-
|
133
|
-
|
134
|
-
|
186
|
+
## character-based alignment
|
187
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
188
|
+
if alignment.sdiff.nil?
|
189
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
190
|
+
else
|
191
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
192
|
+
end
|
193
|
+
end
|
194
|
+
else
|
195
|
+
block1 = str1[b1 ... e1]
|
196
|
+
block2 = str2[b2 ... e2]
|
197
|
+
|
198
|
+
## character-based alignment
|
199
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
200
|
+
if alignment.sdiff.nil?
|
201
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
135
202
|
else
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
203
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
204
|
+
end
|
205
|
+
end
|
206
|
+
else
|
207
|
+
last_tblock = nil
|
208
|
+
lblocks = tblocks.inject([]) do |sum, tblock|
|
209
|
+
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
210
|
+
te1 = tblock[:source][:begin]
|
142
211
|
|
143
|
-
|
144
|
-
|
145
|
-
|
212
|
+
sum += if te1 == tb1
|
213
|
+
[tblock]
|
214
|
+
else
|
215
|
+
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
216
|
+
te2 = tblock[:target][:begin]
|
217
|
+
|
218
|
+
if b2 == e2
|
219
|
+
[
|
220
|
+
{source:{begin:tb1, end:te1}, alignment: :empty},
|
221
|
+
tblock
|
222
|
+
]
|
146
223
|
else
|
147
|
-
|
224
|
+
[
|
225
|
+
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
226
|
+
tblock
|
227
|
+
]
|
148
228
|
end
|
229
|
+
end
|
230
|
+
|
231
|
+
last_tblock = tblock
|
232
|
+
sum
|
233
|
+
end
|
149
234
|
|
150
|
-
|
235
|
+
if last_tblock[:source][:end] < e1
|
236
|
+
if last_tblock[:target][:end] < e2
|
237
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
238
|
+
else
|
239
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
151
240
|
end
|
152
241
|
end
|
153
|
-
end
|
154
242
|
|
155
|
-
|
156
|
-
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
243
|
+
lblocks
|
157
244
|
end
|
158
245
|
end
|
159
246
|
|
247
|
+
|
248
|
+
# Lazily enumerates the start offsets of the non-overlapping occurrences
# of +target+ within +str+, scanning from left to right.
#
# @param str [String] the text to search in
# @param target [String] the substring to look for
# @return [Enumerator<Integer>] yields each match offset in ascending order
def indices(str, target)
  # String has no #len method; #length is the character count that
  # String#index offsets are expressed in.
  len = target.length
  # Always move forward by at least one character so that an empty target
  # cannot pin the scan at the same offset forever.
  step = len.zero? ? 1 : len

  Enumerator.new do |yielder|
    position = 0
    while (idx = str.index(target, position))
      yielder << idx
      # Resume after the current match so matches never overlap.
      position = idx + step
    end
  end
end
|
258
|
+
|
160
259
|
def transform_begin_position(begin_position)
|
161
260
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
162
261
|
block = @block_alignment[:blocks][i]
|
163
262
|
|
164
|
-
b = if block[:alignment] == :block
|
263
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
165
264
|
begin_position + block[:delta]
|
166
265
|
elsif block[:alignment] == :empty
|
167
266
|
if begin_position == block[:source][:begin]
|
168
267
|
block[:target][:begin]
|
169
268
|
else
|
170
|
-
# raise "lost annotation"
|
171
269
|
nil
|
172
270
|
end
|
173
271
|
else
|
@@ -180,13 +278,12 @@ class TextAlignment::TextAlignment
|
|
180
278
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
181
279
|
block = @block_alignment[:blocks][i]
|
182
280
|
|
183
|
-
e = if block[:alignment] == :block
|
281
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
184
282
|
end_position + block[:delta]
|
185
283
|
elsif block[:alignment] == :empty
|
186
284
|
if end_position == block[:source][:end]
|
187
285
|
block[:target][:end]
|
188
286
|
else
|
189
|
-
# raise "lost annotation"
|
190
287
|
nil
|
191
288
|
end
|
192
289
|
else
|
@@ -208,14 +305,14 @@ class TextAlignment::TextAlignment
|
|
208
305
|
@lost_annotations = []
|
209
306
|
|
210
307
|
denotations.each do |d|
|
211
|
-
begin
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
end
|
308
|
+
source = {begin:d.begin, end:d.end}
|
309
|
+
d.begin = transform_begin_position(d.begin);
|
310
|
+
d.end = transform_end_position(d.end);
|
311
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
|
312
|
+
rescue
|
313
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
314
|
+
d.begin = nil
|
315
|
+
d.end = nil
|
219
316
|
end
|
220
317
|
|
221
318
|
@lost_annotations
|
@@ -226,12 +323,12 @@ class TextAlignment::TextAlignment
|
|
226
323
|
@lost_annotations = []
|
227
324
|
|
228
325
|
r = hdenotations.collect do |d|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
326
|
+
t = transform_a_span(d[:span])
|
327
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
|
328
|
+
new_d = d.dup.merge({span:t})
|
329
|
+
rescue
|
330
|
+
@lost_annotations << {source: d[:span], target:t}
|
331
|
+
nil
|
235
332
|
end.compact
|
236
333
|
|
237
334
|
r
|
@@ -245,14 +342,22 @@ class TextAlignment::TextAlignment
|
|
245
342
|
@block_alignment[:blocks].each do |a|
|
246
343
|
show += case a[:alignment]
|
247
344
|
when :block
|
248
|
-
"===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
345
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
346
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
347
|
+
when :term
|
348
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
249
349
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
250
350
|
when :empty
|
251
351
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
252
352
|
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
253
353
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
254
|
-
">>>>> string 2
|
255
|
-
|
354
|
+
">>>>> string 2 " +
|
355
|
+
if a[:target]
|
356
|
+
"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
357
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
358
|
+
else
|
359
|
+
"[-]\n\n"
|
360
|
+
end
|
256
361
|
else
|
257
362
|
astr1 = ''
|
258
363
|
astr2 = ''
|
@@ -292,5 +397,4 @@ class TextAlignment::TextAlignment
|
|
292
397
|
end
|
293
398
|
show
|
294
399
|
end
|
295
|
-
|
296
400
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|