shalmaneser 0.0.1.alpha
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +284 -0
@@ -0,0 +1,1324 @@
# Salsa packages
require "frprep/ISO-8859-1"
require "frprep/Parser"
require "frprep/RegXML"
require "frprep/SalsaTigerRegXML"
require "frprep/SalsaTigerXMLHelper"
require "frprep/TabFormat"
require "frprep/ruby_class_extensions"
require "frprep/AbstractSynInterface"

#############################################
# Module FrprepHelper:
#
# diverse transformation methods for frprep.rb
# moved over here to make the main file less crowded
module FrprepHelper

  ####
  # transform a file to UTF-8 from a given encoding
  def FrprepHelper.to_utf8_file(input_filename,  # string: name of input file
                                output_filename, # string: name of output file
                                encoding)        # string: "iso", "hex"
    begin
      infile = File.new(input_filename)
      outfile = File.new(output_filename, "w")
    rescue
      raise "Could not read #{input_filename}, or could not write to #{output_filename}."
    end

    while (line = infile.gets())
      case encoding
      when "iso"
        outfile.puts UtfIso.from_iso_8859_1(line)
      when "hex"
        outfile.puts UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line))
      else
        raise "Shouldn't be here."
      end
    end
    infile.close()
    outfile.close()
  end
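
  # --- Editorial usage sketch (not part of the original file); the file
  # names are hypothetical. "iso" treats the input as ISO-8859-1, "hex"
  # additionally decodes hex character entities via Ampersand.hex_to_iso:
  #
  #   FrprepHelper.to_utf8_file("corpus.iso.txt", "corpus.utf8.txt", "iso")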

  ####
  # transform BNC format file to plaintext file
  def FrprepHelper.bnc_to_plain_file(input_filename,  # string: name of input file
                                     output_filename) # string: name of output file
    begin
      infile = File.new(input_filename)
      outfile = File.new(output_filename, "w")
    rescue
      raise "Could not read #{input_filename}, or could not write to #{output_filename}."
    end

    infile.each { |line|
      # does this line contain a sentence?
      if line =~ /^\s*<s\s+n=/
        # remove all tags, replace by spaces,
        # then remove superfluous spaces
        textline = line.gsub(/<.+?>/, " ").strip().squeeze(" ")

        # replace BNC SGML character entities by ASCII equivalents
        textline.gsub!(/&bquo;/, '"')
        textline.gsub!(/&equo;/, '"')
        textline.gsub!(/&mdash;/, "-")
        textline.gsub!(/&ndash;/, "-")
        textline.gsub!(/&percnt;/, "%")
        textline.gsub!(/&pound;/, " pounds ")
        textline.gsub!(/&amp;/, " and ")
        textline.gsub!(/&hellip;/, "...")
        textline.gsub!(/&copy;/, "(copyright)")
        textline.gsub!(/&eacute;/, "e")
        textline.gsub!(/&bull;/, "*")
        textline.gsub!(/&dollar;/, "$")
        textline.gsub!(/&deg;/, " degree ")

        textline.gsub!(/&frac12;/, "1/2")
        textline.gsub!(/&frac34;/, "3/4")

        textline.gsub!(/&lsqb;/, "[")
        textline.gsub!(/&rsqb;/, "]")

        textline.gsub!(/&ins;/, "i")
        textline.gsub!(/&ft;/, "ft")

        textline.gsub!(/&rarr;/, ">")
        textline.gsub!(/&larr;/, "<")

        # strip accents from Latin-1 letters
        textline.gsub!(/&aacute;/, "a")
        textline.gsub!(/&auml;/, "a")
        textline.gsub!(/&agrave;/, "a")
        textline.gsub!(/&atilde;/, "a")
        textline.gsub!(/&acirc;/, "a")
        textline.gsub!(/&Aacute;/, "A")
        textline.gsub!(/&Auml;/, "A")
        textline.gsub!(/&Agrave;/, "A")
        textline.gsub!(/&Atilde;/, "A")
        textline.gsub!(/&Acirc;/, "A")

        textline.gsub!(/&eacute;/, "e")
        textline.gsub!(/&egrave;/, "e")
        textline.gsub!(/&ecirc;/, "e")
        textline.gsub!(/&euml;/, "e")
        textline.gsub!(/&Eacute;/, "E")
        textline.gsub!(/&Egrave;/, "E")
        textline.gsub!(/&Ecirc;/, "E")
        textline.gsub!(/&Euml;/, "E")

        textline.gsub!(/&iacute;/, "i")
        textline.gsub!(/&igrave;/, "i")
        textline.gsub!(/&icirc;/, "i")
        textline.gsub!(/&iuml;/, "i")
        textline.gsub!(/&Iacute;/, "I")
        textline.gsub!(/&Igrave;/, "I")
        textline.gsub!(/&Icirc;/, "I")

        textline.gsub!(/&oacute;/, "o")
        textline.gsub!(/&ograve;/, "o")
        textline.gsub!(/&ocirc;/, "o")
        textline.gsub!(/&ouml;/, "o")
        textline.gsub!(/&Oacute;/, "O")
        textline.gsub!(/&Ograve;/, "O")
        textline.gsub!(/&Ocirc;/, "O")
        textline.gsub!(/&Ouml;/, "O")

        textline.gsub!(/&uacute;/, "u")
        textline.gsub!(/&ugrave;/, "u")
        textline.gsub!(/&ucirc;/, "u")
        textline.gsub!(/&uuml;/, "u")
        textline.gsub!(/&Uacute;/, "U")
        textline.gsub!(/&Ugrave;/, "U")
        textline.gsub!(/&Ucirc;/, "U")
        textline.gsub!(/&Uuml;/, "U")

        textline.gsub!(/&yuml;/, "y")
        textline.gsub!(/&Yuml;/, "Y")

        textline.gsub!(/&ntilde;/, "n")
        textline.gsub!(/&Ntilde;/, "N")

        textline.gsub!(/&ccedil;/, "c")
        textline.gsub!(/&Ccedil;/, "C")

        outfile.puts textline
      end
    }
    infile.close()
    outfile.close()
  end
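
  # --- Editorial usage sketch (not part of the original file); the file
  # names are hypothetical. Extracts the <s n=...> sentence lines from a
  # BNC SGML file and writes one plain-text sentence per line:
  #
  #   FrprepHelper.bnc_to_plain_file("A00.sgml", "A00.plain.txt")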

  ####
  # transform plaintext file to Tab format file
  def FrprepHelper.plain_to_tab_file(input_filename,  # string: name of input file
                                     output_filename) # string: name of output file
    begin
      infile = File.new(input_filename)
      outfile = File.new(output_filename, "w")
    rescue
      raise "Could not read #{input_filename}, or could not write to #{output_filename}."
    end

    filename_core = File.basename(input_filename, ".txt")

    # array(string): keep the words of each sentence
    sentence = Array.new
    # sentence number for making the sentence ID:
    # global count, over all input files
    sentno = 0

    while (line = infile.gets())

      # make a sentence ID for the next sentence: running number
      sentid = filename_core + "_" + sentno.to_s
      sentno += 1

      # read words into the sentence array,
      # separating out punctuation attached to the beginning or end of words
      sentence.clear()
      line.split.each { |word|
        # punctuation at the beginning of the word
        #if word =~ /^([\(\[`'\"-]+)(.*)$/
        if word =~ /^([\(\[`\"-]+)(.*)$/
          punct = $1
          word = $2
          punct.scan(/./) { |single_punct|
            sentence << single_punct
          }

        end
        # punctuation at the end of the word
        # (hyphen last in the character class so it is literal, not a range)
        #if word =~ /[,:;-\`?!'\"\.\)\]]+$/
        if word =~ /[,:;\`?!\"\.\)\]-]+$/
          sentence << $` # part before the match: the word
          punct = $&
          punct.scan(/./) { |single_punct|
            sentence << single_punct
          }

        else
          # no punctuation recognized
          sentence << word
        end
      }

      # remove empty words
      sentence.reject! { |word| word.nil? or word.strip.empty? }

      # write words to tab file
      # KE Dec 06: TabFormat changed
      sentence.each { |word|
        # for each word, one line, entries in the line tab-separated
        # the 'word' entry is the word, the 'sent_id' entry is the sentence ID sentid,
        # all other entries (gf, pt, frame etc.) are not set
        outfile.puts FNTabFormatFile.format_str({
                                                  "word" => word,
                                                  "sent_id" => sentid
                                                })
      }
      outfile.puts
    end
    outfile.close()
  end
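
  # --- Editorial usage sketch (not part of the original file); the file
  # names are hypothetical. Produces one tab line per token, a blank line
  # between sentences, and sentence IDs like "A00.plain_0", "A00.plain_1":
  #
  #   FrprepHelper.plain_to_tab_file("A00.plain.txt", "A00.tab")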

  ###########
  #
  # class method split_dir:
  # read all files in one directory and produce chunk files *#{suffix} in outdir
  # with a certain number of sentences in them (sent_num).
  # Optionally, cut off all sentences longer than sent_leng
  #
  # produces output files 0.<suffix>, 1.<suffix>, etc.
  #
  # assumes TabFormat sentences
  #
  # example: split_dir("/tmp/in","/tmp/out",".tab",2000,80)

  def FrprepHelper.split_dir(indir,
                             outdir,
                             suffix,
                             sent_num,
                             sent_leng=nil)

    unless indir[-1,1] == "/"
      indir += "/"
    end
    unless outdir[-1,1] == "/"
      outdir += "/"
    end

    outfile_counter = 0
    line_stack = Array.new
    sent_stack = Array.new

    Dir[indir+"*#{suffix}"].each {|infilename|
      STDERR.puts "Now splitting #{infilename}"
      infile = File.new(infilename)

      while line = infile.gets
        line.chomp!
        case line
        when "" # end of sentence
          if !(sent_leng.nil? or line_stack.length < sent_leng) # sentence too long?
            # suppress multiple empty lines
            # to avoid problems with lemmatiser
            # only record sent_stack if it is not empty.

            # change (sp 15 01 07): just cut off sentence at sent_leng.

            STDERR.puts "Cutting off long sentence #{line_stack.last.split("\t").last}"
            line_stack = line_stack[0..sent_leng-1]
          end
          unless line_stack.empty?
            sent_stack << line_stack
            # reset line_stack
            line_stack = Array.new
          end

          # check if we have to empty the sent stack
          if sent_stack.length == sent_num # enough sentences for new outfile?
            outfile = File.new(outdir+outfile_counter.to_s+"#{suffix}","w")
            sent_stack.each {|l_stack|
              outfile.puts l_stack.join("\n")
              outfile.puts
            }
            outfile.close
            outfile_counter += 1
            sent_stack = Array.new
          end

        else # for any other line
          line_stack << line
        end
      end
      infile.close
    }
    # the last remaining sentences
    unless sent_stack.empty?
      outfile = File.new(outdir+outfile_counter.to_s+"#{suffix}","w")
      sent_stack.each {|l_stack|
        l_stack << "\n"
        outfile.puts l_stack.join("\n")
      }
      outfile.close
    end
  end
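
  # --- Editorial usage sketch (not part of the original file), matching
  # the example in the header comment: split all *.tab files under /tmp/in
  # into chunks of 2000 sentences, truncating sentences to at most 80
  # tokens, writing /tmp/out/0.tab, /tmp/out/1.tab, ...
  #
  #   FrprepHelper.split_dir("/tmp/in", "/tmp/out", ".tab", 2000, 80)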

  ####
  # note salsa targetlemma
  #
  # old_dir contains xml files whose name starts with the
  # target lemma for all frames in the file
  # record that target lemma in the <target> element of each frame
  def FrprepHelper.note_salsa_targetlemma(old_dir, # string ending in /
                                          new_dir) # string ending in /

    # each input file: extract target lemma from filename,
    # note this lemma in the <target> element of each frame
    Dir[old_dir + "*.xml"].each { |filename|
      changedfilename = new_dir + File.basename(filename)

      if File.basename(filename) =~ /^(.*?)[_\.]/
        lemma = $1

        infile = FilePartsParser.new(filename)
        outfile = File.new(changedfilename, "w")

        # write header
        outfile.puts infile.head()

        # iterate through sentences, yield as SalsaTigerSentence objects
        infile.scan_s() { |sent_string|
          sent = SalsaTigerSentence.new(sent_string)
          sent.each_frame { |frame|
            frame.target.set_attribute("lemma", lemma)
          }

          # write changed sentence
          outfile.puts sent.get()
        } # each sentence

        # write footer
        outfile.puts infile.tail()
        infile.close()
        outfile.close()

      else
        # couldn't determine lemma
        # just copy the file
        `cp #{filename} #{changedfilename}`
      end
    }
  end

  ####
  # stxml_split_dir
  #
  # split SalsaTigerXML files into new files of given length,
  # skipping sentences that are too long
  #
  # At the same time, sentences that occur several times (i.e. sentences which are
  # annotated by SALSA for more than one predicate) are compacted into one occurrence
  # with combined semantics.
  #
  # assumes that all files in input_dir with
  # extension .xml are SalsaTigerXML files
  def FrprepHelper.stxml_split_dir(input_dir,   # string: input directory with STXML files
                                   split_dir,   # string: output directory
                                   max_sentnum, # integer: max num of sentences per file
                                   max_sentlen) # integer: max num of terminals per sentence

    filenames = Dir[input_dir+"*.xml"].to_a

    graph_hash = Hash.new # for each sentence id, keep <s...</graph>
    frame_hash = Hash.new # for each sentence id, keep the <frame...</frame> string
    uspfes_hash = Hash.new # for each sentence id, keep the uspfes stuff
    uspframes_hash = Hash.new # for each sentence id, keep the uspframes stuff

    ########################
    # Traversal of file(s): compute an index of all frames for each sentence, with unique identifiers

    filenames.each {|filename|

      infile = FilePartsParser.new(filename)
      infile.scan_s {|sent_str|

        sentlen = 0
        sent_str.delete("\n").scan(/<t\s/) { |occ| sentlen += 1}
        if sentlen > max_sentlen
          sent = RegXML.new(sent_str)
          # revisit handling of long sentences
          # $stderr.puts "I would have skipped overly long sentence " + sent.attributes["id"]+" but Sebastian forbade me.".to_s
          # next
        end

        # substitute old frame identifiers with new, unique ones

        # problem: we may have several frames per sentence, and need to keep track of them
        # if we rename e.g. sxx_f1 to sxx_f2 and there is already a sxx_f2, then
        # we cannot distinguish between these frames

        # therefore, we substitute temporary identifiers until we have substituted
        # all ids with temporary ones, and re-substitute final ones at the end.

        this_frames = Array.new

        temp_subs = Array.new
        final_subs = Array.new

        sent = RegXML.new(sent_str)
        sentid = sent.attributes["id"].to_s
        if sentid.empty? # to_s turns a missing id into ""
          STDERR.puts "[frprep] Warning: cannot find sentence id, skipping sentence:"
          STDERR.puts sent_str
          # strange sentence, no ID? skip
          next
        end

        unless frame_hash.key? sentid
          frame_hash[sentid] = Array.new
          uspfes_hash[sentid] = Array.new
          uspframes_hash[sentid] = Array.new
        end

        # find everything up to and including the graph
        sent_children = sent.children_and_text()
        graph = sent_children.detect { |child| child.name == "graph" }
        graph_hash[sentid] = "<s " +
                             sent.attributes.to_a.map { |at, val| "#{at}=\'#{val}\'" }.join(" ") +
                             ">" +
                             graph.to_s

        # find the usp block

        sem = sent_children.detect { |child| child.name == "sem"}
        usp = ""
        if sem
          usp = sem.children_and_text.detect { |child| child.name == "usp" }
          usp = usp.to_s
        end

        # find all frames
        if sem
          frames = sem.children_and_text.detect { |child| child.name == "frames" }
          if frames
            frames.children_and_text.each { |frame|
              unless frame.name == "frame"
                next
              end
              frameid = frame.attributes["id"]

              temp_frameid = "#{sentid}_temp_f#{frame_hash[sentid].length+this_frames.length+1}"
              final_frameid = "#{sentid}_f#{frame_hash[sentid].length+this_frames.length+1}"

              temp_subs << [frameid,temp_frameid]
              final_subs << [temp_frameid,final_frameid]

              this_frames << frame.to_s
            }
          end
        end

        # now first rename all the frames to temporary names

        temp_subs.each {|orig_frameid, temp_frameid|
          this_frames.map! {|frame_str|
            #print "orig ", orig_frameid, " temp ", temp_frameid, "\n"
            frame_str.gsub(orig_frameid,temp_frameid)
          }

          usp.gsub!(orig_frameid,temp_frameid)
        }

        # and re-rename the temporary names

        final_subs.each {|temp_frameid, final_frameid|
          this_frames.map! {|frame_str|
            frame_str.gsub(temp_frameid,final_frameid)
          }
          usp.gsub!(temp_frameid, final_frameid)
        }

        # store frames in data structure
        this_frames.each {|frame_str|
          frame_hash[sentid] << frame_str
        }

        # store uspfes in data structure
        unless usp.empty?
          usp_elt = RegXML.new(usp)
          uspfes = usp_elt.children_and_text.detect { |child| child.name == "uspfes" }
          uspfes.children_and_text.each { |child|
            unless child.name == "uspblock"
              next
            end
            uspfes_hash[sentid] << child.to_s
          }

          # store uspframes in data structure
          uspframes = usp_elt.children_and_text.detect { |child| child.name == "uspframes" }
          uspframes.children_and_text.each { |child|
            unless child.name == "uspblock"
              next
            end
            uspframes_hash[sentid] << child.to_s
          }
        end
      }
    }

    # now write everything in the data structure back to a file

    filecounter = 0
    sentcounter = 0
    outfile = nil
    sent_stack = Array.new

    graph_hash.sort {|a,b| a[0].to_i <=> b[0].to_i}.each {|sentid,graph_str|

      if sentcounter == max_sentnum
        outfile.puts SalsaTigerXMLHelper.get_footer
        outfile.close
        outfile = nil
      end

      unless outfile
        outfile = File.new(split_dir+filecounter.to_s+".xml","w")
        outfile.puts SalsaTigerXMLHelper.get_header
        filecounter += 1
        sentcounter = 0
      end

      xml = Array.new
      xml << graph_str
      xml << "<sem>"
      xml << "<globals>"
      xml << "</globals>"
      xml << "<frames>"
      frame_hash[sentid].each {|frame_str|
        xml << frame_str
      }
      xml << "</frames>"
      xml << "<usp>"
      xml << "<uspframes>"
      uspframes_hash[sentid].each {|uspblock_str|
        xml << uspblock_str
      }
      xml << "</uspframes>"
      xml << "<uspfes>"
      uspfes_hash[sentid].each {|uspblock_str|
        xml << uspblock_str
      }
      xml << "</uspfes>"
      xml << "</usp>"
      xml << "</sem>"
      xml << "</s>"

      outfile.puts xml.join("\n")
      sentcounter += 1
    }

    if outfile
      outfile.puts SalsaTigerXMLHelper.get_footer
      outfile.close
      outfile = nil
    end

  end
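
  # --- Editorial usage sketch (not part of the original file); the
  # directory names and limits are hypothetical:
  #
  #   FrprepHelper.stxml_split_dir("/tmp/stxml/", "/tmp/split/", 100, 80)
  #
  # yields /tmp/split/0.xml, /tmp/split/1.xml, ... with at most 100
  # sentences each, and multiple annotations of one sentence merged under
  # a single <s> element.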

  ####
  # transform SalsaTigerXML file to Tab format file
  def FrprepHelper.stxml_to_tab_file(input_filename,  # string: name of input file
                                     output_filename, # string: name of output file
                                     exp)             # FrprepConfigData
    infile = FilePartsParser.new(input_filename)
    begin
      outfile = File.new(output_filename,"w")
    rescue
      raise "Stxml to tab: could not write to tab file #{output_filename}"
    end

    infile.scan_s {|sent_string|

      # determine sentence ID
      sentid = RegXML.new(sent_string).attributes["id"]
      unless sentid
        $stderr.puts "No sentence ID in sentence:\n "+ sent_string
        $stderr.puts "Making a new one up."
        sentid = Time.new().to_f.to_s
      end

      # find terminals and process them
      unless sent_string.delete("\n") =~ /<terminals[ >].+<\/terminals>/
        $stderr.puts "Warning: could not find terminals in sentence:"
        $stderr.puts sent_string
        $stderr.puts "Skipping"
        next
      end

      # modified by ines, 27/08/08
      # for Berkeley => convert ( ) to -LRB- -RRB-

      text = $& # the <terminals>...</terminals> block matched above
      if exp.get("parser") == "berkeley"
        text.gsub!(/word='\('/, "word='*LRB*'")
        text.gsub!(/word='\)'/, "word='*RRB*'")
        text.gsub!(/word=['"]``['"]/, "word='\"'")
        text.gsub!(/word=['"]''['"]/, "word='\"'")
        text.gsub!(/word=['"]\'\'['"]/, "word='\"'")
        #text.gsub!(/word=['"]\(['"]/, "word='-LRB-'")
        #text.gsub!(/word=['"]\)['"]/, "word='-RRB-'")

      end
      terminals = text
      #terminals = sent_string
      terminals = RegXML.new(terminals)
      terminals.children_and_text.each { |terminal|

        unless terminal.name == "t"
          # not a terminal after all
          next
        end

        outfile.puts FNTabFormatFile.format_str({
                                                  "word" => SalsaTigerXMLHelper.unescape(terminal.attributes["word"]),
                                                  "sent_id" => sentid
                                                })
      } # each terminal
      outfile.puts
    } # each sentence
    outfile.close
  end
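
  # --- Editorial usage sketch (not part of the original file). exp is the
  # FrprepConfigData experiment object used throughout frprep; the file
  # names here are hypothetical:
  #
  #   FrprepHelper.stxml_to_tab_file("0.xml", "0.tab", exp)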

  ###
  # add semantics from tab:
  #
  # add information about semantics from a FN tab sentence
  # to a SalsaTigerSentence object:
  # - frames (one frame per sentence)
  # - roles
  # - FrameNet grammatical functions
  # - FrameNet POS of target
  def FrprepHelper.add_semantics_from_tab(st_sent,           # SalsaTigerSentence object
                                          tab_sent,          # FNTabFormatSentence object
                                          mapping,           # hash: tab lineno -> array:SynNode
                                          interpreter_class, # SynInterpreter class
                                          exp)               # FrprepConfigData

    if tab_sent.nil?
      # tab sentence not found
      return
    end

    # iterate through frames in the tabsent
    frame_index = 0
    tab_sent.each_frame { |tab_frame_obj|
      frame_name = tab_frame_obj.get_frame() # string

      if frame_name.nil? or frame_name =~ /^-*$/
        # weird: a frame entry without a frame name
        $stderr.puts "Warning: frame entry without a frame in tab sentence #{st_sent.id}."
        $stderr.puts "Skipping"
        next
      end

      frame_node = st_sent.add_frame(frame_name, tab_sent.get_sent_id() + "_f#{frame_index}")
      frame_index += 1

      # target
      target_nodes = Array.new
      tab_frame_obj.get_target_indices.each {|terminal_id|
        if mapping[terminal_id]
          target_nodes.concat mapping[terminal_id]
        end
      }

      # let the interpreter class decide on how to determine the maximum constituents
      target_maxnodes = interpreter_class.max_constituents(target_nodes, st_sent)
      if target_maxnodes.empty?
        STDERR.puts "Warning: no target in frame entry, sentence #{st_sent.id}."
        $stderr.puts "frame is #{frame_name}, frame no #{frame_index}"
        $stderr.puts "Skipping."
        $stderr.puts "target indices: " + tab_frame_obj.get_target_indices.join(", ")
        #tab_sent.each_line { |line|
        #  $stderr.puts line
        #  $stderr.puts "--"
        #}
        next
      end
      frame_node.add_fe("target",target_maxnodes)

      # set features on target: target lemma, target POS
      target_lemma = tab_frame_obj.get_target()
      target_pos = nil
      if target_lemma
        if exp.get("origin") == "FrameNet"
          # FrameNet data: here the lemma in the tab file has the form
          # <lemma>.<POS>
          # separate the two
          if target_lemma =~ /^(.*)\.(.*)$/
            target_lemma = $1
            target_pos = $2
          end
        end
        frame_node.target.set_attribute("lemma", target_lemma)
        if target_pos
          frame_node.target.set_attribute("pos", target_pos)
        end
      end

      # roles, GF, PT
      # layer_synnode_label:
      # hash "role" | "gf" | "pt" -> SynNode -> array: label(string)
      layer_synnode_label = Hash.new
      ["gf", "pt", "role"].each {|layer|
        termids2labels = tab_frame_obj.markables(layer)

        unless layer_synnode_label[layer]
          layer_synnode_label[layer] = Hash.new
        end

        termids2labels.each {|terminal_indices, label|
          terminal_indices.each { |t_i|

            if (nodes = mapping[t_i])

              nodes.each { |node|
                unless layer_synnode_label[layer][node]
                  layer_synnode_label[layer][node] = Array.new
                end

                layer_synnode_label[layer][node] << label
              } # each node that t_i maps to
            end # if t_i maps to anything

          } # each terminal index
        } # each mapping terminal indices -> label
      } # each layer

      # 'stuff' (Support and other things)
      layer_synnode_label["stuff"] = Hash.new
      tab_frame_obj.each_line_parsed { |line_obj|
        if (label = line_obj.get("stuff")) != "-"
          if (nodes = mapping[line_obj.get("lineno")])
            nodes.each { |node|
              unless layer_synnode_label["stuff"][node]
                layer_synnode_label["stuff"][node] = Array.new
              end
              layer_synnode_label["stuff"][node] << label
            }
          end
        end
      }

      # reencode:
      # hash role_label(string) -> array of tuples [synnodes, gflabels, ptlabels]
      # synnodes: array:SynNode. gflabels, ptlabels: array:String
      #
      # note that in this step, any gf or pt labels that have been
      # assigned to a SynNode that has not also been assigned a role
      # will be lost
      role2nodes_labels = Hash.new
      layer_synnode_label["role"].each_pair { |synnode, labels|
        labels.each { |rolelabel|
          unless role2nodes_labels[rolelabel]
            role2nodes_labels[rolelabel] = Array.new
          end

          role2nodes_labels[rolelabel] << [
            synnode,
            layer_synnode_label["gf"][synnode],
            layer_synnode_label["pt"][synnode]
          ]
        } # each role label
      } # each pair SynNode/role labels

      # reencode "stuff", but only the support cases
      role2nodes_labels["Support"] = Array.new()

      layer_synnode_label["stuff"].each_pair { |synnode, labels|
        labels.each { |stufflabel|
          if stufflabel =~ /Supp/
            # some sort of support
            role2nodes_labels["Support"] << [synnode, nil, nil]
          end
        }
      }

      ##
      # each role label:
      # make FeNode for the current frame
      role2nodes_labels.each_pair { |rolelabel, node_gf_pt|

        # get list of syn nodes, GF and PT labels for this role
        # shortcut for GF and PT labels: take any labels that have
        # been assigned for _some_ SynNode of this role
        synnodes = node_gf_pt.map { |ngp| ngp[0] }
        gflabels = node_gf_pt.map { |ngp| ngp[1] }.compact.flatten.uniq
        ptlabels = node_gf_pt.map { |ngp| ngp[2] }.compact.flatten.uniq

        # let the interpreter class decide on how to
        # determine the maximum constituents
        maxnodes = interpreter_class.max_constituents(synnodes, st_sent)

        fe_node = st_sent.add_fe(frame_node, rolelabel, maxnodes)
        unless gflabels.empty?
          fe_node.set_attribute("gf", gflabels.join(","))
        end
        unless ptlabels.empty?
          fe_node.set_attribute("pt", ptlabels.join(","))
        end
      } # each role label
    } # each frame
  end
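
  # --- Editorial note (not part of the original file): a typical call,
  # with variables as used elsewhere in frprep; the mapping is the one
  # produced by SynInterfaceSTXML.standard_mapping(st_sent, tab_sent):
  #
  #   FrprepHelper.add_semantics_from_tab(st_sent, tab_sent, mapping,
  #                                       interpreter_class, exp)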

  ######
  # handle multiword targets:
  # if you find a verb with a separate prefix,
  # change the verb's lemma information accordingly
  # and add an attribute "other_words" to the verb node
  # pointing to the other node
  #
  # In general, it will be assumed that "other_words" contains
  # a list of node IDs for other nodes belonging to the same
  # group, node IDs separated by spaces, and that
  # each node of a group has the "other_words" attribute.
  #
  def FrprepHelper.handle_multiword_targets(sent,        # SalsaTigerSentence object
                                            interpreter, # SynInterpreter object
                                            language)    # string: en, de
    ##
    # only retain the interesting words of the sentence:
    # content words and prepositions
    if sent.nil?
      return
    end

    nodes = sent.terminals.select { |node|
      [
        "adj", "adv", "card", "noun", "part", "prep", "verb"
      ].include? interpreter.category(node)
    }

    ##
    # group:
    # group verbs with their separate particles
    # (at a later point, other types of grouping can be inserted here)
    groups = FrprepHelper.group_words(nodes, interpreter)

    ##
    # record grouping information as attributes on the terminals.
    groups.each { |descr, group_of_nodes|
      case descr
      when "none"
        # no grouping
      when "part"
        # separate particle belonging to a verb

        # group_of_nodes is a pair [verb, particle]
        verb, particle = group_of_nodes

        verb.set_attribute("other_words", particle.id())
        particle.set_attribute("other_words", verb.id())

        if verb.get_attribute("lemma") and particle.get_attribute("lemma")
          case language
          when "de"
            # German: prepend SVP to get the real lemma of the verb
            verb.set_attribute("lemma",
                               particle.get_attribute("lemma") +
                               verb.get_attribute("lemma"))
          when "en"
            # English: append particle as separate word after the lemma of the verb
            verb.set_attribute("lemma",
                               verb.get_attribute("lemma") + " " +
                               particle.get_attribute("lemma"))
          else
            # default
            verb.set_attribute("lemma",
                               verb.get_attribute("lemma") + " " +
                               particle.get_attribute("lemma"))
          end
        end

      else
        raise "Shouldn't be here: unexpected description #{descr}"
      end
    }
  end
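
  # --- Editorial example (not part of the original file): for a German
  # separable-prefix verb, the particle lemma is prepended to the verb
  # lemma ("an" + "fangen" gives "anfangen"); for English "give up", the
  # result is the two-word lemma "give up". Verb and particle point at
  # each other via their "other_words" attributes.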

  ########################
  # group_words
  #
  # auxiliary of handle_multiword_targets
  #
  # Group terminals:
  # At the moment, just find separate prefixes and particles
  # for verbs
  #
  # returns: list of pairs [descr, nodes]
  # descr: string, "none" (no group), "part" (separate verb particle)
  # nodes: array:SynNode
  def FrprepHelper.group_words(nodes,       # array: SynNode
                               interpreter) # SynInterpreter object

    retv = Array.new # array of groups, array:array:SynNode
    done = Array.new # remember nodes already covered

    nodes.each { |terminal_node|
      if done.include? terminal_node
        # we have already included this node in one of the groups
        next
      end

      if (svp = interpreter.particle_of_verb(terminal_node, nodes))
        retv << ["part", [terminal_node, svp]]
        done << terminal_node
        done << svp
      else
        retv << ["none", [terminal_node]]
        done << terminal_node
      end

    }

    return retv
  end
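
  # --- Editorial sketch of the return shape (not part of the original file):
  #
  #   FrprepHelper.group_words(nodes, interpreter)
  #   # => [ ["part", [verb_node, particle_node]],
  #   #      ["none", [some_other_node]],
  #   #      ... ]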

  ######
  # handle unknown framenames
  #
  # For all frames with names matching Unknown\d+,
  # rename them to <lemma>_Unknown\d+
  def FrprepHelper.handle_unknown_framenames(sent,        # SalsaTigerSentence
                                             interpreter) # SynInterpreter class
    if sent.nil?
      return
    end

    sent.each_frame { |frame|
      if frame.name() =~ /^Unknown/
        if frame.target
          maintarget = interpreter.main_node_of_expr(frame.target.children(), "no_mwe")
        else
          maintarget = nil
        end
        unless maintarget
          $stderr.puts "Warning: Unknown frame, and I could not determine the target lemma: Frame #{frame.id()}"
          $stderr.puts "Cannot repair frame name, leaving it as is."
          return
        end

        # get lemma, if it exists, otherwise get word
        # also, if the lemmatizer has returned a disjunction of lemmas,
        # get the first disjunct
        lemma = interpreter.lemma_backoff(maintarget)
        if lemma
          # we have a lemma
          frame.set_name(lemma + "_" + frame.name())
        else
          # the main target word has no lemma attribute,
          # and somehow I couldn't even get the target word
          $stderr.puts "Warning: Salsa 'Unknown' frame."
          $stderr.puts "Trying to make its lemma-specificity explicit, but"
          $stderr.puts "I could not determine the target lemma nor the target word: frame #{frame.id()}"
          $stderr.puts "Leaving 'Unknown' as it is."
        end
      end
    }
  end
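
  # --- Editorial example (not part of the original file): a frame named
  # "Unknown123" whose target lemma resolves to "run" is renamed to
  # "run_Unknown123", so that unknown frames stay lemma-specific.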

  #####################
  #
  # Integrate the semantic annotation of an old sentence
  # into the corresponding new sentence
  # At the same time, integrate the lemma information from the
  # old sentence into the new sentence
  def FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent,
                                                        newsent,
                                                        interpreter_class,
                                                        exp)
    if oldsent.nil? or newsent.nil?
      return
    end
    ##
    # match old and new sentence via terminals
    newterminals = newsent.terminals_sorted()
    oldterminals = oldsent.terminals_sorted()
    # sanity check: exact match on terminals?
    newterminals.interleave(oldterminals).each { |newnode, oldnode|
      #print "old ", oldnode.word, " ", newnode.word, "\n"
      # new and old word: use both unescaped and escaped variant
      if newnode
        newwords = [ newnode.word, SalsaTigerXMLHelper.escape(newnode.word) ]
      else
        newwords = [nil, nil]
      end
      if oldnode
        oldwords = [ oldnode.word, SalsaTigerXMLHelper.escape(oldnode.word) ]
      else
        oldwords = [ nil, nil]
      end

      if (newwords & oldwords).empty?
        # old and new word don't match, either escaped or non-escaped

        $stderr.puts "Warning: could not match terminals of sentence #{newsent.id()}"
        $stderr.puts "This means that I cannot match the semantic annotation"
        $stderr.puts "to the newly parsed sentence. Skipping."
        #$stderr.puts "Old sentence: "
        #$stderr.puts oldterminals.map { |n| n.word }.join("--")
        #$stderr.puts "New sentence: "
        #$stderr.puts newterminals.map { |n| n.word }.join("--")
        return false
      end
    }

    ##
    # copy lemma information
    oldterminals.each_with_index { |oldnode, ix|
      newnode = newterminals[ix]
      if oldnode.get_attribute("lemma")
        newnode.set_attribute("lemma", oldnode.get_attribute("lemma"))
      end
    }

    ##
    # copy frames
    oldsent.each_frame { |oldframe|
      # make new frame with same ID
      newframe = newsent.add_frame(oldframe.name, oldframe.id())
      # copy FEs
      oldframe.each_child { |oldfe|
        # new nodes: map old terminals to new terminals,
        # then find max constituents covering them
        newnodes = oldfe.descendants.select { |n|
          n.is_terminal?
        }.map { |n|
          oldterminals.index(n)
        }.map { |ix|
          newterminals[ix]
        }

        # let the interpreter class decide on how to determine the maximum constituents
        newnodes = interpreter_class.max_constituents(newnodes, newsent)

        # make new FE with same ID
        new_fe = newsent.add_fe(newframe, oldfe.name(), newnodes, oldfe.id())
        # keep all attributes of the FE
        if oldfe.get_f("attributes")
          oldfe.get_f("attributes").each_pair { |attr, value|
            new_fe.set_attribute(attr, value)
          }
        end
      }
    }

    ##
    ### changed by ines => appears twice in stxml file

    # copy underspecification
    # keep as is, since we've kept all frame and FE IDs
    oldsent.each_usp_frameblock { |olduspframe|
      newuspframe = newsent.add_usp("frame")
      olduspframe.each_child { |oldnode|
        newnode = newsent.sem_node_with_id(oldnode.id())
        if newnode
          newuspframe.add_child(newnode)
        else
          $stderr.puts "Error: unknown frame with ID #{oldnode.id()}"
        end
      }
    }
    oldsent.each_usp_feblock { |olduspfe|
      newuspfe = newsent.add_usp("fe")
      olduspfe.each_child { |oldnode|
        newnode = newsent.sem_node_with_id(oldnode.id())
        if newnode
          newuspfe.add_child(newnode)
        else
          $stderr.puts "Error: unknown FE with ID #{oldnode.id()}"
        end
      }
    }

  end

  ####################
  # add head attributes to each nonterminal
  # in a given SalsaTigerSentence object

  def FrprepHelper.add_head_attributes(st_sent,     # SalsaTigerSentence object
                                       interpreter) # SynInterpreter class
    st_sent.each_nonterminal {|nt_node|
      head_term = interpreter.head_terminal(nt_node)
      if head_term and head_term.word()
        nt_node.set_attribute("head", head_term.word())
      else
        nt_node.set_attribute("head", "--")
      end
    } # each nonterminal
  end

  # add lemma information to each terminal in a given SalsaTigerSentence object
  def FrprepHelper.add_lemmas_from_tab(st_sent,  # SalsaTigerSentence object
                                       tab_sent, # FNTabFormatSentence object
                                       mapping)  # hash: tab lineno -> array:SynNode
    if tab_sent.nil?
      # tab sentence not found
      return
    end

    # produce list with word, lemma pairs
    lemmat = Array.new
    tab_sent.each_line_parsed {|line|
      word = line.get("word")
      lemma = line.get("lemma")
      lemmat << [word,lemma]
    }

    # match with st_sent terminal list and add lemma attributes
    # KE Jan 07: if word mismatch,
    # set to Lemmatizer file version,
    # but count mismatches
    word_mismatches = Array.new()

    st_sent.each_terminal_sorted {|t|
      matching_lineno = (0..lemmat.length()-1).to_a.detect { |tab_lineno|
        mapping[tab_lineno].include? t
      }
      unless matching_lineno
        next
      end
      word, lemma = lemmat[matching_lineno]

      # transform characters to XML-friendly form
      # for comparison with st_word, which is also escaped
      word = SalsaTigerXMLHelper.escape(word)
      st_word = t.word()
      if word != st_word and
        word != SalsaTigerXMLHelper.escape(st_word)
        # true mismatch.
        # use the Lemmatizer version of the word, remember the mismatch
        word_mismatches << [st_word, word]
        t.set_attribute("word", word)
      end

      if lemma
        # we actually do have lemma information
        lemmatised_head = SalsaTigerXMLHelper.escape(lemma)
        t.set_attribute("lemma",lemmatised_head)
      end
    } # each terminal

    # did we have mismatches? then report them
    unless word_mismatches.empty?
      $stderr.puts "Warning: Word mismatches found between Lemmatizer file and SalsaTigerXML file generated from parser output."
      $stderr.puts "(May be due to failed reencoding of special characters in the parser output.)"
      $stderr.puts "I am using the Lemmatizer version by default."
      $stderr.puts "Version used:"
      $stderr.print "\t"
      st_sent.each_terminal_sorted { |t| $stderr.print ">>#{t}<<" }
      $stderr.puts
      $stderr.print "SalsaTigerXML file had: "
      $stderr.print word_mismatches.map { |st_word, tab_word|
        "#{st_word} instead of #{tab_word}"
      }.join(", ")
      $stderr.puts
    end
  end
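
  # --- Editorial usage sketch (not part of the original file); st_sent,
  # tab_sent and mapping as in add_semantics_from_tab above. Words that
  # mismatch are overwritten with the Lemmatizer-file version:
  #
  #   FrprepHelper.add_lemmas_from_tab(st_sent, tab_sent, mapping)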

  ####################
  # given a SalsaTigerSentence,
  # look for FrameNet frames that are
  # test frames, and remove them
  def FrprepHelper.remove_deprecated_frames(sent, # SalsaTigerSentence
                                            exp)  # FrprepConfigData

    unless exp.get("origin") == "FrameNet"
      return
    end

    sent.frames.each { |frame_obj|
      if frame_obj.name() == "Boulder" or
        frame_obj.name() =~ /^Test/
        sent.remove_frame(frame_obj)
      end
    }
  end

end

#############################################
# Class FrprepFlatSyntax:
#
# given a FNTabFormat file,
# yield each of its sentences in SalsaTigerXML,
# constructing a flat syntax
class FrprepFlatSyntax
  def initialize(tabfilename,   # string: name of tab file
                 postag_suffix, # postag file suffix (or nil)
                 lemma_suffix)  # lemmatisation file suffix (or nil)

    @tabfilename = tabfilename
    @pos_suffix = postag_suffix
    @lemma_suffix = lemma_suffix
  end

  # yield each non-parse sentence as a tuple
  # [ salsa/tiger xml sentence, tab format sentence, mapping]
  # of a SalsaTigerSentence object, a FNTabSentence object,
  # and a hash: FNTab sentence lineno(integer) -> array:SynNode
  # pointing each tab word to one or more SalsaTigerSentence terminals
  def each_sentence(dummy)

    # read tab file with lemma and POS info
    tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)

    tabfile.each_sentence() { |tabsent|
      # start new, empty sentence with "failed" attribute (i.e. no parse)
      # and with the ID of the corresponding TabFormat sentence
      sentid = tabsent.get_sent_id()
      if sentid.nil? or sentid =~ /^-*$/
        $stderr.puts "No sentence ID for sentence:"
        tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " "}
        $stderr.puts
        sentid = Time.new().to_f.to_s
      end
      sent = SalsaTigerSentence.new("<s id=\"#{SalsaTigerXMLHelper.escape(sentid)}\" failed=\"true\"></s>")

      # add single nonterminal node, category "S"
      single_nonterminal_id = SalsaTigerXMLHelper.escape(sentid.to_s + "_NT")
      vroot = sent.add_syn("nt", "S", # category
                           nil,       # word
                           nil,       # pos
                           single_nonterminal_id)

      # add terminals
      tabsent.each_line_parsed() { |line_obj|
        # make terminal node with tab sent info
        node_id = sentid.to_s + "_" + line_obj.get("lineno").to_s
        word = line_obj.get("word")
        unless word
          word = ""
        end
        word = SalsaTigerXMLHelper.escape(word)
        pos = line_obj.get("pos")
        unless pos
          pos = ""
        end
        pos = SalsaTigerXMLHelper.escape(pos)
        terminal = sent.add_syn("t", nil, # category
                                word, pos,
                                node_id)

        if line_obj.get("lemma")
          # lemma
          terminal.set_attribute("lemma", SalsaTigerXMLHelper.escape(line_obj.get("lemma")))
        end

        # add new terminal as child of vroot
        vroot.add_child(terminal, nil)
        terminal.add_parent(vroot, nil)
      } # each line of tab file

      # yield newly constructed SalsaTigerXML sentence plus tab sentence
      yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
    }
  end
end
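
# --- Editorial usage sketch (not part of the original file); the file name
# and suffixes are hypothetical:
#
#   reader = FrprepFlatSyntax.new("corpus.tab", ".pos", ".lemma")
#   reader.each_sentence(nil) { |st_sent, tabsent, mapping|
#     # st_sent: flat SalsaTigerSentence, tabsent: tab sentence,
#     # mapping: tab lineno -> array of terminal SynNodes
#   }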

#############################################
# Class FrprepReadStxml
#
# given a STXML file,
# yield each of its sentences
class FrprepReadStxml
  def initialize(stxmlfilename, # string: name of SalsaTigerXML file
                 tabfilename,   # string: name of corresponding tab file (or nil)
                 postag_suffix, # POS tag file suffix (or nil)
                 lemma_suffix)  # lemmatization file suffix (or nil)

    @stxmlfilename = stxmlfilename
    @tabfilename = tabfilename
    @pos_suffix = postag_suffix
    @lemma_suffix = lemma_suffix
  end
  # yield each non-parse sentence as a tuple
  # [ salsa/tiger xml sentence, tab format sentence, mapping]
  # of a SalsaTigerSentence object, a FNTabSentence object,
  # and a hash: FNTab sentence lineno(integer) -> array:SynNode
  # pointing each tab word to one or more SalsaTigerSentence terminals
  def each_sentence(dummy)
    # read corresponding tab file?
    tab_sents = Array.new()
    if File.exists? @tabfilename
      tabfile = FNTabFormatFile.new(@tabfilename,@pos_suffix,@lemma_suffix)
      tabfile.each_sentence { |tabsent|
        tab_sents << tabsent
      }
    end

    # read STXML file
    infile = FilePartsParser.new(@stxmlfilename)
    index = 0
    infile.scan_s { |sent_string|
      sent = SalsaTigerSentence.new(sent_string)
      yield [sent, tab_sents.at(index), SynInterfaceSTXML.standard_mapping(sent, tab_sents.at(index))]
      index += 1
    }
  end
end