text_alignment 0.7 → 0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +33 -138
- data/lib/text_alignment/constants.rb +1 -1
- data/lib/text_alignment/mappings.rb +168 -70
- data/lib/text_alignment/mixed_alignment.rb +3 -71
- data/lib/text_alignment/text_alignment.rb +50 -33
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1c44cc3036273c8c34800d8f78a79316c53efb80fe45ad81092a6172da3b03c6
|
4
|
+
data.tar.gz: 50ab44cc66b50bf732e99f900c10584025c6ed498603ccf3afd75de90cac4b79
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98645c1ba4566c822d1e6ba6488e4ecdfe100c30923cc7effe7d2a4390ebb6901707e8c9f6a12145e2f98515bc6792afef4f9bfa5fcd683c77d3a5cf599094c7
|
7
|
+
data.tar.gz: 11657abdb8acb64c8edfd5271bbf78d2a75024753180988030c5ce6722b4da2781760e583ca6e33ed469cca85e4a2f8e28af6ef4dc62029ada5bd8a184200dfb
|
data/bin/align_annotations
CHANGED
@@ -26,33 +26,43 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def
|
29
|
+
def align_denotations(denotations, source_text, target_text, debug = false)
|
30
|
+
alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
|
31
|
+
new_denotations = alignment.transform_hdenotations(denotations)
|
32
|
+
|
33
|
+
if debug
|
34
|
+
warn "[block alignment]"
|
35
|
+
warn alignment.alignment_show
|
36
|
+
warn "-----"
|
37
|
+
end
|
38
|
+
|
39
|
+
lost_annotations = alignment.lost_annotations
|
40
|
+
unless lost_annotations.empty?
|
41
|
+
warn "\n[lost annotations] #{lost_annotations.length}"
|
42
|
+
lost_annotations.each do |a|
|
43
|
+
warn "#{a}"
|
44
|
+
end
|
45
|
+
warn "====="
|
46
|
+
end
|
47
|
+
warn
|
48
|
+
|
49
|
+
# return target annotations
|
50
|
+
new_denotations
|
51
|
+
end
|
52
|
+
|
53
|
+
def align_mannotations(source_annotations, target_text, debug = false)
|
54
|
+
target_annotations = {text:target_text}
|
55
|
+
|
30
56
|
idnum_denotations = 0
|
31
57
|
idnum_relations = 0
|
32
58
|
idnum_attributes = 0
|
33
59
|
idnum_modifications = 0
|
34
60
|
|
35
|
-
source_annotations.
|
36
|
-
alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
|
37
|
-
|
38
|
-
puts alignment.alignment_show
|
39
|
-
puts "-----"
|
40
|
-
puts
|
41
|
-
|
42
|
-
# alignment.block_alignments.each do |a|
|
43
|
-
# p {source:a[:source], target:a[:target]}
|
44
|
-
# puts "--"
|
45
|
-
# p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
|
46
|
-
# puts "--"
|
47
|
-
# puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
48
|
-
# puts "--"
|
49
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
50
|
-
# puts "======"
|
51
|
-
# end
|
52
|
-
|
61
|
+
source_annotations.each_with_index do |annotations, i|
|
53
62
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
54
63
|
ididx = {}
|
55
|
-
|
64
|
+
warn "[#{i}]-=-=-=-=-"
|
65
|
+
denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
|
56
66
|
denotations.each do |d|
|
57
67
|
reid = 'T' + (idnum_denotations += 1).to_s
|
58
68
|
ididx[d[:id]] = reid
|
@@ -101,126 +111,11 @@ end
|
|
101
111
|
source_annotations = read_annotations(ARGV[0])
|
102
112
|
target_text = read_text(ARGV[1])
|
103
113
|
|
104
|
-
lost_annotations = []
|
105
114
|
target_annotations = if source_annotations.class == Array
|
106
|
-
|
115
|
+
align_mannotations(source_annotations, target_text, false)
|
107
116
|
else
|
108
|
-
|
109
|
-
|
110
|
-
# verification
|
111
|
-
# source_text = source_annotations[:text]
|
112
|
-
# puts "=====BEGIN"
|
113
|
-
# (0 ... source_text.rstrip.length).each do |p|
|
114
|
-
# t = alignment.transform_begin_position(p)
|
115
|
-
# if t.nil?
|
116
|
-
# print source_text[p]
|
117
|
-
# else
|
118
|
-
# print '.'
|
119
|
-
# end
|
120
|
-
# end
|
121
|
-
# puts
|
122
|
-
# puts "=====END"
|
123
|
-
|
124
|
-
# puts "=====BEGIN"
|
125
|
-
# (0 .. source_text.rstrip.length).each do |p|
|
126
|
-
# t = alignment.transform_end_position(p)
|
127
|
-
# if t.nil?
|
128
|
-
# print source_text[p]
|
129
|
-
# else
|
130
|
-
# print '.'
|
131
|
-
# end
|
132
|
-
# end
|
133
|
-
# puts
|
134
|
-
# puts "=====END"
|
135
|
-
|
136
|
-
source_text = source_annotations[:text]
|
137
|
-
|
138
|
-
puts "[block alignment]"
|
139
|
-
puts alignment.alignment_show
|
140
|
-
puts "====="
|
141
|
-
# exit
|
142
|
-
|
143
|
-
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
144
|
-
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
145
|
-
|
117
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
|
146
118
|
source_annotations.merge({text:target_text, denotations:denotations})
|
147
119
|
end
|
148
120
|
|
149
|
-
|
150
|
-
num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
|
151
|
-
source_annotations.each do |annotations|
|
152
|
-
num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
|
153
|
-
num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
|
154
|
-
num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
|
155
|
-
num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
|
156
|
-
end
|
157
|
-
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
158
|
-
else
|
159
|
-
num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
|
160
|
-
num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
|
161
|
-
num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
|
162
|
-
num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
|
163
|
-
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
164
|
-
end
|
165
|
-
|
166
|
-
warn "[source]"
|
167
|
-
warn "denotations:\t#{num_denotations_source}"
|
168
|
-
# warn "relations:\t#{num_relations_source}"
|
169
|
-
# warn "attributes:\t#{num_attributes_source}"
|
170
|
-
# warn "modifications:\t#{num_modifications_source}"
|
171
|
-
|
172
|
-
warn "\n[target]"
|
173
|
-
warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
|
174
|
-
# warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
|
175
|
-
# warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
|
176
|
-
# warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
|
177
|
-
|
178
|
-
if lost_annotations
|
179
|
-
warn "\n[lost annotations]"
|
180
|
-
lost_annotations.each do |a|
|
181
|
-
p a
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
|
-
#puts target_annotations.to_json
|
186
|
-
|
187
|
-
# denotations = anns1[:denotations]
|
188
|
-
|
189
|
-
# puts "[Alignment1]====="
|
190
|
-
# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
|
191
|
-
|
192
|
-
# align.alignment.each do |a|
|
193
|
-
# p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
|
194
|
-
# end
|
195
|
-
|
196
|
-
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
197
|
-
# puts
|
198
|
-
# puts "[Similarity]\n#{align.similarity}"
|
199
|
-
# puts
|
200
|
-
# puts '[Denotations original]'
|
201
|
-
# pp denotations
|
202
|
-
# puts
|
203
|
-
# puts '[Denotations transformed]'
|
204
|
-
# new_denotations = align.transform_hdenotations(denotations)
|
205
|
-
# pp new_denotations
|
206
|
-
# puts
|
207
|
-
# puts "[Alignment2 (downcased)]====="
|
208
|
-
# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
|
209
|
-
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
210
|
-
# puts
|
211
|
-
# puts "[Similarity]\n#{align.similarity}"
|
212
|
-
# puts
|
213
|
-
# puts '[Denotations original]'
|
214
|
-
# pp denotations
|
215
|
-
# puts
|
216
|
-
# puts '[Denotations transformed]'
|
217
|
-
# new_denotations = align.transform_hdenotations(denotations)
|
218
|
-
# pp new_denotations
|
219
|
-
# puts
|
220
|
-
# puts '[Annotations transformed]'
|
221
|
-
# anns2[:denotations] = new_denotations
|
222
|
-
# puts anns2.to_json
|
223
|
-
|
224
|
-
# p align.common_elements
|
225
|
-
# puts "---------------"
|
226
|
-
# p align.mapped_elements
|
121
|
+
# puts target_annotations.to_json
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
3
|
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
-
TextAlignment::SIZE_WINDOW =
|
4
|
+
TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
|
5
5
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
6
|
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
7
|
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -1,74 +1,172 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
3
|
TextAlignment::MAPPINGS = [
|
4
|
-
["©", "(c)"],
|
5
|
-
|
6
|
-
["α", "alpha"],
|
7
|
-
["β", "beta"],
|
8
|
-
["γ", "gamma"],
|
9
|
-
["δ", "delta"],
|
10
|
-
["ε", "epsilon"],
|
11
|
-
["ζ", "zeta"],
|
12
|
-
["η", "eta"],
|
13
|
-
["θ", "theta"],
|
14
|
-
["ι", "iota"],
|
15
|
-
["κ", "kappa"],
|
16
|
-
["λ", "lambda"],
|
17
|
-
["λ", "lamda"],
|
18
|
-
["μ", "mu"],
|
19
|
-
["ν", "nu"],
|
20
|
-
["ξ", "xi"],
|
21
|
-
["ο", "omicron"],
|
22
|
-
["π", "pi"],
|
23
|
-
["ρ", "rho"],
|
24
|
-
["σ", "sigma"],
|
25
|
-
["τ", "tau"],
|
26
|
-
["υ", "upsilon"],
|
27
|
-
["φ", "phi"],
|
28
|
-
["χ", "chi"],
|
29
|
-
["ψ", "psi"],
|
30
|
-
["ω", "omega"],
|
31
|
-
|
32
|
-
["Α", "Alpha"],
|
33
|
-
["Β", "Beta"],
|
34
|
-
["Γ", "Gamma"],
|
35
|
-
["Δ", "Delta"],
|
36
|
-
["Ε", "Epsilon"],
|
37
|
-
["Ζ", "Zeta"],
|
38
|
-
["Η", "Eta"],
|
39
|
-
["Θ", "Theta"],
|
40
|
-
["Ι", "Iota"],
|
41
|
-
["Κ", "Kappa"],
|
42
|
-
["Λ", "Lambda"],
|
43
|
-
["Λ", "Lamda"],
|
44
|
-
["Μ", "Mu"],
|
45
|
-
["Ν", "Nu"],
|
46
|
-
["Ξ", "Xi"],
|
47
|
-
["Ο", "Omicron"],
|
48
|
-
["Π", "Pi"],
|
49
|
-
["Ρ", "Rho"],
|
50
|
-
["Σ", "Sigma"],
|
51
|
-
["Τ", "Tau"],
|
52
|
-
["Υ", "Upsilon"],
|
53
|
-
["Φ", "Phi"],
|
54
|
-
["Χ", "Chi"],
|
55
|
-
["Ψ", "Psi"],
|
56
|
-
["Ω", "Omega"],
|
57
|
-
|
58
|
-
["ϕ", "phi"],
|
59
|
-
|
60
|
-
["×", "x"],
|
61
|
-
["•", "*"],
|
62
|
-
[" ", " "],
|
63
|
-
[" ", " "],
|
64
|
-
[" ", " "],
|
65
|
-
[" ", " "],
|
66
|
-
["
|
67
|
-
["
|
68
|
-
["
|
69
|
-
["
|
70
|
-
["
|
71
|
-
["
|
72
|
-
["
|
4
|
+
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
|
+
|
6
|
+
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
7
|
+
["β", "beta"], #U+03B2 (greek small letter beta)
|
8
|
+
["γ", "gamma"], #U+03B3 (greek small letter gamma)
|
9
|
+
["δ", "delta"], #U+03B4 (greek small letter delta)
|
10
|
+
["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
|
11
|
+
["ζ", "zeta"], #U+03B6 (greek small letter zeta)
|
12
|
+
["η", "eta"], #U+03B7 (greek small letter eta)
|
13
|
+
["θ", "theta"], #U+03B8 (greek small letter theta)
|
14
|
+
["ι", "iota"], #U+03B9 (greek small letter iota)
|
15
|
+
["κ", "kappa"], #U+03BA (greek small letter kappa)
|
16
|
+
["λ", "lambda"], #U+03BB (greek small letter lambda)
|
17
|
+
["λ", "lamda"], #U+03BB (greek small letter lambda)
|
18
|
+
["μ", "mu"], #U+03BC (greek small letter mu)
|
19
|
+
["ν", "nu"], #U+03BD (greek small letter nu)
|
20
|
+
["ξ", "xi"], #U+03BE (greek small letter xi)
|
21
|
+
["ο", "omicron"], #U+03BF (greek small letter omicron)
|
22
|
+
["π", "pi"], #U+03C0 (greek small letter pi)
|
23
|
+
["ρ", "rho"], #U+03C1 (greek small letter rho)
|
24
|
+
["σ", "sigma"], #U+03C3 (greek small letter sigma)
|
25
|
+
["τ", "tau"], #U+03C4 (greek small letter tau)
|
26
|
+
["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
|
27
|
+
["φ", "phi"], #U+03C6 (greek small letter phi)
|
28
|
+
["χ", "chi"], #U+03C7 (greek small letter chi)
|
29
|
+
["ψ", "psi"], #U+03C8 (greek small letter psi)
|
30
|
+
["ω", "omega"], #U+03C9 (greek small letter omega)
|
31
|
+
|
32
|
+
["Α", "Alpha"], #U+0391 (greek capital letter alpha)
|
33
|
+
["Β", "Beta"], #U+0392 (greek capital letter beta)
|
34
|
+
["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
|
35
|
+
["Δ", "Delta"], #U+0394 (greek capital letter delta)
|
36
|
+
["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
|
37
|
+
["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
|
38
|
+
["Η", "Eta"], #U+0397 (greek capital letter eta)
|
39
|
+
["Θ", "Theta"], #U+0398 (greek capital letter theta)
|
40
|
+
["Ι", "Iota"], #U+0399 (greek capital letter iota)
|
41
|
+
["Κ", "Kappa"], #U+039A (greek capital letter kappa)
|
42
|
+
["Λ", "Lambda"], #U+039B (greek capital letter lambda)
|
43
|
+
["Λ", "Lamda"], #U+039B (greek capital letter lambda)
|
44
|
+
["Μ", "Mu"], #U+039C (greek capital letter mu)
|
45
|
+
["Ν", "Nu"], #U+039D (greek capital letter nu)
|
46
|
+
["Ξ", "Xi"], #U+039E (greek capital letter xi)
|
47
|
+
["Ο", "Omicron"], #U+039F (greek capital letter omicron)
|
48
|
+
["Π", "Pi"], #U+03A0 (greek capital letter pi)
|
49
|
+
["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
|
50
|
+
["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
|
51
|
+
["Τ", "Tau"], #U+03A4 (greek capital letter tau)
|
52
|
+
["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
|
53
|
+
["Φ", "Phi"], #U+03A6 (greek capital letter phi)
|
54
|
+
["Χ", "Chi"], #U+03A7 (greek capital letter chi)
|
55
|
+
["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
|
56
|
+
["Ω", "Omega"], #U+03A9 (greek capital letter omega)
|
57
|
+
|
58
|
+
["ϕ", "phi"], #U+03D5 (greek phi symbol)
|
59
|
+
|
60
|
+
["×", "x"], #U+00D7 (multiplication sign)
|
61
|
+
["•", "*"], #U+2022 (bullet)
|
62
|
+
[" ", " "], #U+2009 (thin space)
|
63
|
+
[" ", " "], #U+200A (hair space)
|
64
|
+
[" ", " "], #U+00A0 (no-break space)
|
65
|
+
[" ", " "], #U+3000 (ideographic space)
|
66
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
67
|
+
["−", "-"], #U+2212 (minus sign)
|
68
|
+
["–", "-"], #U+2013 (en dash)
|
69
|
+
["′", "'"], #U+2032 (prime)
|
70
|
+
["‘", "'"], #U+2018 (left single quotation mark)
|
71
|
+
["’", "'"], #U+2019 (right single quotation mark)
|
72
|
+
["“", '"'], #U+201C (left double quotation mark)
|
73
|
+
["”", '"'], #U+201D (right double quotation mark)
|
73
74
|
['"', "''"]
|
74
|
-
|
75
|
+
]
|
76
|
+
|
77
|
+
|
78
|
+
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
79
|
+
|
80
|
+
|
81
|
+
class << TextAlignment
|
82
|
+
def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
|
83
|
+
_mappings ||= TextAlignment::MAPPINGS
|
84
|
+
|
85
|
+
character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
86
|
+
if character_mappings.empty?
|
87
|
+
[_str1, _str2, _mappings]
|
88
|
+
else
|
89
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
90
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
91
|
+
characters_to.gsub!(/-/, '\-')
|
92
|
+
|
93
|
+
str1 = _str1.tr(characters_from, characters_to)
|
94
|
+
str2 = _str2.tr(characters_from, characters_to)
|
95
|
+
|
96
|
+
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
|
97
|
+
|
98
|
+
[str1, str2, mappings]
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
|
103
|
+
_mappings ||= TextAlignment::MAPPINGS
|
104
|
+
|
105
|
+
long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
106
|
+
if long_to_one_mappings.empty?
|
107
|
+
[_str1, _str2, _mappings]
|
108
|
+
else
|
109
|
+
## long to one character mappings
|
110
|
+
pletters = TextAlignment::PADDING_LETTERS
|
111
|
+
|
112
|
+
# find the padding letter for str1
|
113
|
+
@padding_letter1 = begin
|
114
|
+
i = pletters.index{|l| _str2.index(l).nil?}
|
115
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
116
|
+
TextAlignment::PADDING_LETTERS[i]
|
117
|
+
end
|
118
|
+
|
119
|
+
# find the padding letter for str2
|
120
|
+
@padding_letter2 = begin
|
121
|
+
i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
|
122
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
123
|
+
TextAlignment::PADDING_LETTERS[i]
|
124
|
+
end
|
125
|
+
|
126
|
+
str1 = str2 = nil
|
127
|
+
long_to_one_mappings.each do |f|
|
128
|
+
from = f[1]
|
129
|
+
|
130
|
+
str1 = if _str2.index(f[0])
|
131
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
132
|
+
_str1.gsub(from, to)
|
133
|
+
else
|
134
|
+
_str1
|
135
|
+
end
|
136
|
+
|
137
|
+
str2 = if _str1.index(f[0])
|
138
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
139
|
+
_str2.gsub(from, to)
|
140
|
+
else
|
141
|
+
_str2
|
142
|
+
end
|
143
|
+
end
|
144
|
+
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
|
145
|
+
|
146
|
+
[str1, str2, mappings]
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
def compute_similarity(_s1, _s2, sdiff)
|
151
|
+
return 0 if sdiff.nil?
|
152
|
+
|
153
|
+
# compute the lcs only with non-whitespace letters
|
154
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
155
|
+
return 0 if lcs == 0
|
156
|
+
|
157
|
+
s1 = if @padding_letter1
|
158
|
+
_s1.tr(@padding_letter1, ' ')
|
159
|
+
else
|
160
|
+
_s1
|
161
|
+
end
|
162
|
+
|
163
|
+
s2 = if @padding_letter2
|
164
|
+
_s2.tr(@padding_letter2, ' ')
|
165
|
+
else
|
166
|
+
_s2
|
167
|
+
end
|
168
|
+
|
169
|
+
similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
|
170
|
+
end
|
171
|
+
|
172
|
+
end
|
@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(_str1, _str2)
|
20
|
+
def initialize(_str1, _str2, _mappings = nil)
|
21
21
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
22
|
|
23
|
-
str1, str2, mappings =
|
23
|
+
str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
|
24
24
|
|
25
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
26
26
|
end
|
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
63
63
|
end
|
64
64
|
|
65
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
66
|
-
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
|
+
@similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
|
67
67
|
@str1_match_initial = cmp.str1_match_initial
|
68
68
|
@str1_match_final = cmp.str1_match_final
|
69
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
|
|
139
139
|
@position_map_end = posmap_end.sort.to_h
|
140
140
|
end
|
141
141
|
|
142
|
-
private
|
143
|
-
|
144
|
-
def string_preprocessing(_str1, _str2)
|
145
|
-
str1 = _str1.dup
|
146
|
-
str2 = _str2.dup
|
147
|
-
mappings = TextAlignment::MAPPINGS.dup
|
148
|
-
|
149
|
-
## single character mappings
|
150
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
-
characters_to.gsub!(/-/, '\-')
|
154
|
-
|
155
|
-
str1.tr!(characters_from, characters_to)
|
156
|
-
str2.tr!(characters_from, characters_to)
|
157
|
-
|
158
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
-
|
160
|
-
## long to one character mappings
|
161
|
-
pletters = TextAlignment::PADDING_LETTERS
|
162
|
-
|
163
|
-
# find the padding letter for str1
|
164
|
-
@padding_letter1 = begin
|
165
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
-
TextAlignment::PADDING_LETTERS[i]
|
168
|
-
end
|
169
|
-
|
170
|
-
# find the padding letter for str2
|
171
|
-
@padding_letter2 = begin
|
172
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
-
TextAlignment::PADDING_LETTERS[i]
|
175
|
-
end
|
176
|
-
|
177
|
-
# ASCII foldings
|
178
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
-
ascii_foldings.each do |f|
|
180
|
-
from = f[1]
|
181
|
-
|
182
|
-
if str2.index(f[0])
|
183
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
-
str1.gsub!(from, to)
|
185
|
-
end
|
186
|
-
|
187
|
-
if str1.index(f[0])
|
188
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
-
str2.gsub!(from, to)
|
190
|
-
end
|
191
|
-
end
|
192
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
-
|
194
|
-
[str1, str2, mappings]
|
195
|
-
end
|
196
|
-
|
197
|
-
def compute_similarity(_s1, _s2, sdiff)
|
198
|
-
return 0 if sdiff.nil?
|
199
|
-
|
200
|
-
# compute the lcs only with non-whitespace letters
|
201
|
-
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
-
return 0 if lcs == 0
|
203
|
-
|
204
|
-
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
-
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
-
|
207
|
-
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
-
end
|
209
|
-
|
210
142
|
end
|
@@ -5,34 +5,25 @@ require 'text_alignment/mixed_alignment'
|
|
5
5
|
|
6
6
|
module TextAlignment; end unless defined? TextAlignment
|
7
7
|
|
8
|
-
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
|
-
|
10
8
|
class TextAlignment::TextAlignment
|
11
9
|
attr_reader :block_alignment
|
12
10
|
attr_reader :similarity
|
13
11
|
attr_reader :lost_annotations
|
14
12
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
13
|
+
def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
14
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
15
|
|
18
|
-
@block_alignment = {source_text:
|
19
|
-
@
|
20
|
-
@
|
16
|
+
@block_alignment = {source_text: _str1, target_text: _str2, denotations: denotations}
|
17
|
+
@original_str1 = _str1
|
18
|
+
@original_str2 = _str2
|
21
19
|
|
22
|
-
|
23
|
-
block_begin = str2.index(str1)
|
24
|
-
unless block_begin.nil?
|
25
|
-
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
|
-
return
|
27
|
-
end
|
20
|
+
str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
|
28
21
|
|
29
|
-
|
30
|
-
|
31
|
-
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
22
|
+
if r = whole_block_alignment(str1, str2)
|
23
|
+
@block_alignment[:blocks] = r
|
32
24
|
return
|
33
25
|
end
|
34
26
|
|
35
|
-
|
36
27
|
## to find block alignments
|
37
28
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
38
29
|
|
@@ -120,12 +111,29 @@ class TextAlignment::TextAlignment
|
|
120
111
|
else
|
121
112
|
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
122
113
|
end
|
114
|
+
else
|
115
|
+
[]
|
123
116
|
end
|
124
117
|
end
|
125
118
|
|
126
119
|
@block_alignment[:blocks] = blocks2
|
127
120
|
end
|
128
121
|
|
122
|
+
def whole_block_alignment(str1, str2)
|
123
|
+
## Block exact match
|
124
|
+
block_begin = str2.index(str1)
|
125
|
+
unless block_begin.nil?
|
126
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
127
|
+
end
|
128
|
+
|
129
|
+
block_begin = str2.downcase.index(str1.downcase)
|
130
|
+
unless block_begin.nil?
|
131
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
132
|
+
end
|
133
|
+
|
134
|
+
nil
|
135
|
+
end
|
136
|
+
|
129
137
|
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
130
138
|
block2 = str2[b2 ... e2]
|
131
139
|
|
@@ -156,7 +164,6 @@ class TextAlignment::TextAlignment
|
|
156
164
|
lex = term[:lex]
|
157
165
|
look_forward = block2.index(lex, position)
|
158
166
|
unless look_forward.nil?
|
159
|
-
puts lex
|
160
167
|
tblocks = []
|
161
168
|
break
|
162
169
|
end
|
@@ -164,31 +171,37 @@ class TextAlignment::TextAlignment
|
|
164
171
|
end
|
165
172
|
|
166
173
|
tblocks
|
174
|
+
else
|
175
|
+
[]
|
167
176
|
end
|
168
177
|
|
169
178
|
if tblocks.empty?
|
170
179
|
if b1 == 0 && e1 == str1.length
|
171
|
-
if
|
180
|
+
if (e1 > 2000) || (e2 > 2000)
|
172
181
|
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
173
182
|
else
|
174
183
|
block1 = str1[b1 ... e1]
|
175
184
|
block2 = str2[b2 ... e2]
|
176
185
|
|
177
186
|
## character-based alignment
|
178
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
179
|
-
|
180
|
-
|
181
|
-
|
187
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
188
|
+
if alignment.sdiff.nil?
|
189
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
190
|
+
else
|
191
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
192
|
+
end
|
182
193
|
end
|
183
194
|
else
|
184
195
|
block1 = str1[b1 ... e1]
|
185
196
|
block2 = str2[b2 ... e2]
|
186
197
|
|
187
198
|
## character-based alignment
|
188
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
189
|
-
|
190
|
-
|
191
|
-
|
199
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
200
|
+
if alignment.sdiff.nil?
|
201
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
202
|
+
else
|
203
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
204
|
+
end
|
192
205
|
end
|
193
206
|
else
|
194
207
|
last_tblock = nil
|
@@ -199,7 +212,7 @@ class TextAlignment::TextAlignment
|
|
199
212
|
sum += if te1 == tb1
|
200
213
|
[tblock]
|
201
214
|
else
|
202
|
-
tb2 = last_tblock ?
|
215
|
+
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
203
216
|
te2 = tblock[:target][:begin]
|
204
217
|
|
205
218
|
if b2 == e2
|
@@ -295,7 +308,7 @@ class TextAlignment::TextAlignment
|
|
295
308
|
source = {begin:d.begin, end:d.end}
|
296
309
|
d.begin = transform_begin_position(d.begin);
|
297
310
|
d.end = transform_end_position(d.end);
|
298
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @
|
311
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
|
299
312
|
rescue
|
300
313
|
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
301
314
|
d.begin = nil
|
@@ -311,7 +324,7 @@ class TextAlignment::TextAlignment
|
|
311
324
|
|
312
325
|
r = hdenotations.collect do |d|
|
313
326
|
t = transform_a_span(d[:span])
|
314
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @
|
327
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
|
315
328
|
new_d = d.dup.merge({span:t})
|
316
329
|
rescue
|
317
330
|
@lost_annotations << {source: d[:span], target:t}
|
@@ -338,8 +351,13 @@ class TextAlignment::TextAlignment
|
|
338
351
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
339
352
|
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
340
353
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
341
|
-
">>>>> string 2
|
342
|
-
|
354
|
+
">>>>> string 2 " +
|
355
|
+
if a[:target]
|
356
|
+
"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
357
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
358
|
+
else
|
359
|
+
"[-]\n\n"
|
360
|
+
end
|
343
361
|
else
|
344
362
|
astr1 = ''
|
345
363
|
astr2 = ''
|
@@ -379,5 +397,4 @@ class TextAlignment::TextAlignment
|
|
379
397
|
end
|
380
398
|
show
|
381
399
|
end
|
382
|
-
|
383
400
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.9'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-11-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|