frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,686 @@
|
|
1
|
+
require 'frprep/do_parses'
|
2
|
+
require 'frprep/FrprepHelper'
|
3
|
+
require 'frprep/FixSynSemMapping'
|
4
|
+
|
5
|
+
##############################
|
6
|
+
# The class that does all the work
|
7
|
+
module FrPrep
|
8
|
+
class FrPrep
|
9
|
+
|
10
|
+
# Set up the preprocessor for one experiment.
#
# @param exp [FrprepConfigData] experiment configuration, queried via #get
# @raise [RuntimeError] if 'frprep_directory' is missing from the experiment file
def initialize(exp)
  @exp = exp

  # AB: move to FRprepOptionParser
  unless @exp.get("frprep_directory")
    raise "Please set 'frprep_directory', the frprep internal data directory,\n" \
          "in the experiment file."
  end

  # Experiment directory: the frprep internal data directory, with one
  # subdirectory per experiment ID. File.new_dir is called for its effect
  # only (presumably it creates the directory -- TODO confirm against the
  # File extension); its return value is not needed here.
  # Removing the previous contents of that directory is currently disabled:
  #   %x{rm -rf #{exp_dir}}
  File.new_dir(@exp.get("frprep_directory"),
               @exp.get("prep_experiment_ID"))

  # suffixes for different types of output files
  @file_suffixes = {
    "lemma" => ".lemma",
    "pos" => ".pos",
    "tab" => ".tab",
    "stxml" => ".xml"
  }
end
|
32
|
+
|
33
|
+
# Run the preprocessing pipeline configured in the experiment file (@exp).
#
# The input is optionally re-encoded to UTF-8 and then converted step by
# step, each step writing into its own directory:
#   BNC -> Plain -> SalsaTab -> SalsaTabWithPos -> Done (SalsaTigerXML)
#   FNXml / FNCorpusXml -> SalsaTab -> ...
#   SalsaTigerXML -> Done
# Processing stops at SalsaTabWithPos when 'tabformat_output' is set.
# Progress messages are written to $stderr.
#
# Exits the process with status 1 if 'directory_input' or
# 'directory_preprocessed' is missing, or if both 'tabformat_output' and
# 'do_parse' are set.
#
# @raise [RuntimeError] if the current format is not one of the known formats
def transform
  # AB: Debugging.
  # Only enter the debugger if one has actually been loaded; otherwise
  # running Ruby with $DEBUG set would die here with a NameError.
  debugger if $DEBUG && respond_to?(:debugger, true)

  current_format = @exp.get("format")

  # AB: move to FRprepOptionParser
  unless @exp.get("directory_input")
    $stderr.puts "Please specify 'directory_input' in the experiment file."
    exit 1
  end
  # AB: move to FRprepOptionParser
  unless @exp.get("directory_preprocessed")
    $stderr.puts "Please specify 'directory_preprocessed' in the experiment file."
    exit 1
  end

  ##
  # input and output directories.
  #
  # sanity check: output in tab format will not work
  # if we also do a parse
  if @exp.get("tabformat_output") && @exp.get("do_parse")
    $stderr.puts "Error: Cannot do Tab format output"
    $stderr.puts "when the input text is being parsed."
    $stderr.puts "Please set either 'tabformat_output' or 'do_parse' to false."
    exit 1
  end
  input_dir = File.existing_dir(@exp.get("directory_input"))
  output_dir = File.new_dir(@exp.get("directory_preprocessed"))
  # with Tab output, the split/tagged files are the final result
  split_dir = if @exp.get("tabformat_output")
                output_dir
              else
                frprep_dirname("split", "new")
              end

  ####
  # transform data to UTF-8
  if ["iso", "hex"].include?(@exp.get("encoding"))
    # transform ISO -> UTF-8 or Hex -> UTF-8
    # write result to encoding_dir,
    # then set encoding_dir to be the new input_dir
    encoding_dir = frprep_dirname("encoding", "new")
    $stderr.puts "Frprep: Transforming to UTF-8."
    Dir[input_dir + "*"].each do |filename|
      # not a file? then skip
      next unless File.file?(filename)

      outfilename = encoding_dir + File.basename(filename)
      FrprepHelper.to_utf8_file(filename, outfilename, @exp.get("encoding"))
    end

    input_dir = encoding_dir
  end

  ####
  # transform data all the way to the output format,
  # which is SalsaTigerXML by default,
  # except when tabformat_output has been set, in which case it's
  # Tab format.
  current_dir = input_dir
  done_format = @exp.get("tabformat_output") ? "SalsaTabWithPos" : "Done"

  # NOTE(review): with 'tabformat_output' set and a 'format' that is already
  # SalsaTigerXML, the loop reaches "Done" without ever matching done_format
  # and then raises "Experiment file problem" -- confirm whether that
  # combination should be rejected up front instead.
  until current_format == done_format
    case current_format

    when "BNC"
      # basically plain, plus some tags to be removed
      plain_dir = frprep_dirname("plain", "new")

      $stderr.puts "Frprep: Transforming BNC format text in #{current_dir} to plain format."
      $stderr.puts "Storing the result in #{plain_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_bncformat_dir(current_dir, plain_dir)

      current_dir = plain_dir
      current_format = "Plain"

    when "Plain"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming plain text in #{current_dir} to SalsaTab format."
      $stderr.puts "Storing the result in #{tab_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_plain_dir(current_dir, tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir

      fndata = FNDatabase.new(current_dir)
      fndata.extract_everything(tab_dir)
      # argument-list form: the path is never interpreted by a shell
      Kernel.system("chmod", "-R", "g+rx", tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNCorpusXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir
      # assuming that all XML files in the current directory are FN Corpus XML files
      Dir[current_dir + "*.xml"].each do |fncorpusfilename|
        corpus = FNCorpusXMLFile.new(fncorpusfilename)
        tabfilename = tab_dir + File.basename(fncorpusfilename, ".xml") + ".tab"
        # block form closes the file even if writing raises
        File.open(tabfilename, "w") do |outfile|
          corpus.print_conll_style(outfile)
        end
      end

      Kernel.system("chmod", "-R", "g+rx", tab_dir)
      current_dir = tab_dir
      current_format = "SalsaTab"

    when "SalsaTab"
      # lemmatize and POStag
      $stderr.puts "Frprep: Lemmatizing and parsing text in #{current_dir}."
      $stderr.puts "Storing the result in #{split_dir}."
      transform_pos_and_lemmatize(current_dir, split_dir)

      current_dir = split_dir
      current_format = "SalsaTabWithPos"

    when "SalsaTabWithPos"
      # parse
      parse_dir = frprep_dirname("parse", "new")

      $stderr.puts "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format."
      $stderr.puts "Storing the result in #{parse_dir}."

      transform_salsatab_dir(current_dir, parse_dir, output_dir)

      current_dir = output_dir
      current_format = "Done"

    when "SalsaTigerXML"
      parse_dir = frprep_dirname("parse", "new")
      print "Transform parser output into stxml\n"
      transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir, @exp)
      current_dir = output_dir
      current_format = "Done"

    else
      $stderr.puts "Unknown data format #{current_format}"
      $stderr.puts "Please check the 'format' entry in your experiment file."
      raise "Experiment file problem"
    end
  end

  $stderr.puts "Frprep: Done preprocessing."
end
|
210
|
+
|
211
|
+
############################################################################3
|
212
|
+
private
|
213
|
+
############################################################################3
|
214
|
+
|
215
|
+
###############
|
216
|
+
# frprep_dirname:
|
217
|
+
# make directory name for frprep-internal data
|
218
|
+
# of a certain kind described in <subdir>
|
219
|
+
#
|
220
|
+
# frprep_directory has one subdirectory for each experiment ID,
|
221
|
+
# and below that there is one subdir per subtask
|
222
|
+
#
|
223
|
+
# If this is a new directory, it is constructed,
|
224
|
+
# if it should be an existing directory, its existence is checked.
|
225
|
+
# @param subdir [String] designator of the subtask subdirectory
# @param new [Object, nil] non-nil: this may be a new directory
# @return whatever File.new_dir / File.existing_dir returns for the path
#
# NOTE(review): File.new_dir is always called first, even when +new+ is nil.
# If it creates the directory (as its name suggests -- TODO confirm), the
# existence check in the File.existing_dir branch can never fail.
def frprep_dirname(subdir, new = nil)
  path_parts = [@exp.get("frprep_directory"),
                @exp.get("prep_experiment_ID"),
                subdir]
  dirname = File.new_dir(*path_parts)

  return File.existing_dir(dirname) unless new

  File.new_dir(dirname)
end
|
239
|
+
|
240
|
+
|
241
|
+
|
242
|
+
###############
|
243
|
+
# transform_bncformat_dir:
|
244
|
+
#
|
245
|
+
# transformation for BNC format:
|
246
|
+
#
|
247
|
+
# transform to plain format, removing <> elements
|
248
|
+
# Convert every regular file directly inside +input_dir+ from BNC format to
# plain text via FrprepHelper.bnc_to_plain_file.
#
# @param input_dir [String] input directory; the file glob is appended as-is
# @param output_dir [String] output directory; the file name is appended as-is
def transform_bncformat_dir(input_dir, output_dir)
  Dir[input_dir + "*"].each do |bncfilename|
    # skip subdirectories and other non-files, as the UTF-8 conversion
    # loop in transform does
    next unless File.file?(bncfilename)

    # the plain-text output keeps the input file's base name unchanged
    outfilename = output_dir + File.basename(bncfilename)
    FrprepHelper.bnc_to_plain_file(bncfilename, outfilename)
  end
end
|
259
|
+
|
260
|
+
|
261
|
+
###############
|
262
|
+
# transform_plain_dir:
|
263
|
+
#
|
264
|
+
# transformation for plaintext:
|
265
|
+
#
|
266
|
+
# transform to Tab format, separating punctuation from adjacent words
|
267
|
+
# Convert every regular file directly inside +input_dir+ from plain text to
# Tab format via FrprepHelper.plain_to_tab_file.
#
# @param input_dir [String] input directory; the file glob is appended as-is
# @param output_dir [String] output directory; the file name is appended as-is
def transform_plain_dir(input_dir, output_dir)
  Dir[input_dir + "*"].each do |plainfilename|
    # skip subdirectories and other non-files, as the UTF-8 conversion
    # loop in transform does
    next unless File.file?(plainfilename)

    # end output file name in "tab" because that is, at the moment, required
    outfilename = output_dir + File.basename(plainfilename) + @file_suffixes["tab"]
    FrprepHelper.plain_to_tab_file(plainfilename, outfilename)
  end
end
|
278
|
+
|
279
|
+
###############
|
280
|
+
# transform_pos_and_lemmatize
|
281
|
+
#
|
282
|
+
# transformation for Tab format files:
|
283
|
+
#
|
284
|
+
# - Split into parser-size chunks
|
285
|
+
# - POS-tag, lemmatize
|
286
|
+
# @param input_dir [String] directory holding the Tab format input files
# @param output_dir [String] directory receiving the split files; the tagger
#   and lemmatizer are run with it as both source and target directory
# @raise [RuntimeError] if a requested service lacks its experiment file
#   entries, or if no interface class is found for it
def transform_pos_and_lemmatize(input_dir, output_dir)
  ##
  # split the TabFormatFile into chunks of max_sent_num size
  FrprepHelper.split_dir(input_dir, output_dir, @file_suffixes["tab"],
                         @exp.get("parser_max_sent_num"),
                         @exp.get("parser_max_sent_len"))

  ##
  # POS-Tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    tagger_configured = @exp.get("pos_tagger_path") && @exp.get("pos_tagger")
    raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file." unless tagger_configured

    tagger_class = SynInterfaces.get_interface("pos_tagger", @exp.get("pos_tagger"))
    print "pos tagger interface: ", tagger_class, "\n"
    raise "Shouldn't be here" unless tagger_class

    tagger = tagger_class.new(@exp.get("pos_tagger_path"),
                              @file_suffixes["tab"],
                              @file_suffixes["pos"])
    tagger.process_dir(output_dir, output_dir)
  end

  ##
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    lemmatizer_configured = @exp.get("lemmatizer_path") && @exp.get("lemmatizer")
    raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file." unless lemmatizer_configured

    lemmatizer_class = SynInterfaces.get_interface("lemmatizer", @exp.get("lemmatizer"))
    # AB: make this exception explicit.
    raise 'I got a empty interface class for the lemmatizer!' unless lemmatizer_class

    lemmatizer = lemmatizer_class.new(@exp.get("lemmatizer_path"),
                                      @file_suffixes["tab"],
                                      @file_suffixes["lemma"])
    lemmatizer.process_dir(output_dir, output_dir)
  end
end
|
335
|
+
|
336
|
+
###############
|
337
|
+
# transform_salsatab
|
338
|
+
#
|
339
|
+
# transformation for Tab format files:
|
340
|
+
#
|
341
|
+
# - parse
|
342
|
+
# - Transform parser output to SalsaTigerXML
|
343
|
+
# If no parsing, make flat syntactic structure.
|
344
|
+
# Write one SalsaTigerXML file into +output_dir+ for every parsed file that
# DoParses yields for the Tab format files in +input_dir+.
#
# Each output file is closed as soon as it has been written (or when
# processing it raises), so no file handles are left open and all buffered
# output reaches the disk.
#
# @param input_dir [String] input directory with Tab format files
# @param parse_dir [String] output directory for parses
# @param output_dir [String] global output directory; file names are appended as-is
# @raise [RuntimeError] if no interpreter class is found, or an output file
#   cannot be opened for writing
def transform_salsatab_dir(input_dir, parse_dir, output_dir)
  ##
  # (Parse and) transform to SalsaTigerXML

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
  raise "Shouldn't be here" unless interpreter_class

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => input_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue StandardError
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      outfile.puts SalsaTigerXMLHelper.get_header()
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence do |st_sent, tabformat_sent, mapping|
        # parsed: add headwords using parse tree
        if @exp.get("do_parse")
          FrprepHelper.add_head_attributes(st_sent, interpreter_class)
        end

        # add lemmas, if they are there. If they are not, don't print out a warning.
        if @exp.get("do_lemmatize")
          FrprepHelper.add_lemmas_from_tab(st_sent, tabformat_sent, mapping)
        end

        # add semantics
        # we can use the method in SalsaTigerXMLHelper
        # that reads semantic information from the tab file
        # and combines all targets of a sentence into one frame
        FrprepHelper.add_semantics_from_tab(st_sent, tabformat_sent, mapping,
                                            interpreter_class, @exp)

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # handle multiword targets
        FrprepHelper.handle_multiword_targets(st_sent,
                                              interpreter_class, @exp.get("language"))

        # handle Unknown frame names
        FrprepHelper.handle_unknown_framenames(st_sent, interpreter_class)

        outfile.puts st_sent.get()
      end
      outfile.puts SalsaTigerXMLHelper.get_footer()
    ensure
      # the original never closed the output file
      outfile.close
    end
  end
end
|
409
|
+
|
410
|
+
#############################################
|
411
|
+
# transform_stxml
|
412
|
+
#
|
413
|
+
# transformation for SalsaTigerXML data
|
414
|
+
#
|
415
|
+
# - If the input format was SalsaTigerXML:
|
416
|
+
# - Tag, lemmatize and parse, if the experiment file tells you so
|
417
|
+
#
|
418
|
+
# - If the origin is the Salsa corpus:
|
419
|
+
# Change frame names from Unknown\d+ to lemma_Unknown\d+
|
420
|
+
#
|
421
|
+
# - fix multiword lemmas, or at least try
|
422
|
+
# - transform to UTF 8
|
423
|
+
# Produce the final SalsaTigerXML files for input that already is in
# SalsaTigerXML format.
#
# Steps, each depending on the experiment settings:
# - with 'do_parse': split the input into the "stxml_split" directory;
#   otherwise copy the input *.xml files to +parse_dir+
# - if any of 'do_parse', 'do_lemmatize', 'do_postag' is set: write a Tab
#   format file per SalsaTigerXML file into +tab_dir+
# - POS-tag and lemmatize the files in +tab_dir+ if requested
# - for each parsed file yielded by DoParses: with 'do_parse', carry over
#   semantics and lemmas from the old sentences; remove deprecated frames;
#   optionally repair the syn/sem mapping; write the result to +output_dir+
#
# Output files and the old-file parser are closed once each file is done.
# The input copy runs without a shell, so paths containing spaces work.
#
# @param parse_dir [String] name of directory for parse data
# @param tab_dir [String] name of directory for split/tab data
# @param input_dir [String] name of input directory
# @param output_dir [String] name of final output directory
# @param exp [FrprepConfigData] experiment configuration
# @raise [RuntimeError] if a requested service is not configured, no
#   interface/interpreter class is found, or an output file cannot be opened
def transform_stxml_dir(parse_dir, tab_dir, input_dir, output_dir, exp)
  ####
  # Data preparation

  # Data with Salsa as origin:
  # remember the target lemma as an attribute on the
  # <target> elements
  #
  # currently deactivated: encoding problems
  # if @exp.get("origin") == "SalsaTiger"
  #   $stderr.puts "Frprep: noting target lemmas"
  #   changed_input_dir = frprep_dirname("salsalemma", "new")
  #   FrprepHelper.note_salsa_targetlemmas(input_dir, changed_input_dir)
  #
  #   # remember changed input dir as input dir
  #   input_dir = changed_input_dir
  # end

  # If data is to be parsed, split and tabify input files
  # else copy data to stxml_indir.

  # stxml_dir: directory where SalsaTiger data is situated
  if @exp.get("do_parse")
    # split data
    stxml_dir = frprep_dirname("stxml_split", "new")

    $stderr.puts "Frprep: splitting data"
    FrprepHelper.stxml_split_dir(input_dir, stxml_dir,
                                 @exp.get("parser_max_sent_num"),
                                 @exp.get("parser_max_sent_len"))
  else
    # no parsing: copy data to split dir
    stxml_dir = parse_dir
    $stderr.puts "Frprep: Copying data to #{stxml_dir}"
    Dir[input_dir + "*.xml"].each do |filename|
      # argument-list form: file names are never interpreted by a shell
      Kernel.system("cp", filename, stxml_dir + File.basename(filename))
    end
  end

  # Some syntactic processing will take place:
  # tabify data
  if @exp.get("do_parse") || @exp.get("do_lemmatize") || @exp.get("do_postag")
    $stderr.puts "Frprep: making input for syn. processing"

    Dir[stxml_dir + "*" + @file_suffixes["stxml"]].each do |stxmlfilename|
      tabfilename = tab_dir + File.basename(stxmlfilename, @file_suffixes["stxml"]) + @file_suffixes["tab"]
      FrprepHelper.stxml_to_tab_file(stxmlfilename, tabfilename, exp)
    end
  end

  ###
  # POS-tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    unless @exp.get("pos_tagger_path") && @exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("pos_tagger",
                                            @exp.get("pos_tagger"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("pos_tagger_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["pos"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    unless @exp.get("lemmatizer_path") && @exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("lemmatizer",
                                            @exp.get("lemmatizer"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("lemmatizer_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["lemma"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Parsing, production of SalsaTigerXML output

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  sys_class_names = {}
  [["do_postag", "pos_tagger"],
   ["do_lemmatize", "lemmatizer"],
   ["do_parse", "parser"]].each do |service, system_name|
    # yes, perform this service
    sys_class_names[system_name] = @exp.get(system_name) if @exp.get(service)
  end
  interpreter_class = SynInterfaces.get_interpreter(sys_class_names)
  raise "Shouldn't be here" unless interpreter_class

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => tab_dir,
                           "stxml_dir" => stxml_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue StandardError
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      oldxml = [] # array of sentence strings
      if @exp.get("do_parse")
        # read old SalsaTigerXML file
        # so we can integrate the old file's semantics later.
        # we assume that the old and the new file have the same name,
        # ending in .xml.
        oldxmlfile = FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
        oldxmlfile.scan_s { |sent_string| oldxml << sent_string }
        # all sentences are in memory now; release the parser, as
        # change_each_stxml_file_in_dir does once it has read a file
        oldxmlfile.close()
      end

      outfile.puts SalsaTigerXMLHelper.get_header()
      index = 0
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence do |st_sent, _tabformat_sent, _mapping|
        # parsed? then integrate semantics and lemmas from old file
        if @exp.get("do_parse")
          oldsent_string = oldxml[index]
          index += 1
          if oldsent_string
            stxml_berkeley_brackets!(oldsent_string, exp)

            # we have both an old and a new sentence, so integrate semantics
            oldsent = SalsaTigerSentence.new(oldsent_string)
            next if st_sent.nil?

            if FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp) == false
              # integration was rejected: try once more with the next old sentence
              oldsent_string = oldxml[index]
              index += 1
              if oldsent_string
                stxml_berkeley_brackets!(oldsent_string, exp)

                # we have both an old and a new sentence, so integrate semantics
                oldsent = SalsaTigerSentence.new(oldsent_string)
                FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
              end
            end
          else
            # no corresponding old sentence for this new sentence
            $stderr.puts "Warning: transporting semantics -- missing source sentence, skipping"
          end
        end

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # repair syn/sem mapping problems?
        if @exp.get("fe_syn_repair") || @exp.get("fe_rel_repair")
          FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
        end

        outfile.puts st_sent.get()
      end # each ST sentence
      outfile.puts SalsaTigerXMLHelper.get_footer()
    ensure
      # the original never closed the output file
      outfile.close
    end
  end # each file parsed
end

# Helper for transform_stxml_dir (modified by ines, 27/08/08):
# for the Berkeley parser, substitute *LRB* / *RRB* for ( and ) in the word
# attributes of an old sentence string. Changes +sent_string+ in place and
# does nothing for any other parser setting.
def stxml_berkeley_brackets!(sent_string, exp)
  return unless exp.get("parser") == "berkeley"

  sent_string.gsub!(/word='\('/, "word='*LRB*'")
  sent_string.gsub!(/word='\)'/, "word='*RRB*'")
  sent_string.gsub!(/word=\"\(\"/, "word='*LRB*'")
  sent_string.gsub!(/word=\"\)\"/, "word='*RRB*'")
end
|
636
|
+
|
637
|
+
|
638
|
+
###################################
|
639
|
+
# general file iterators
|
640
|
+
|
641
|
+
# yields pairs of [infile name, outfile stream]
|
642
|
+
# For each file in +dir+ matching "*<suffix>", yield the pair
# [infile name, temp file stream]. After the block returns, the original
# file is copied to "<filename>.bak" and then replaced by the temp file.
#
# The copy and move run without a shell, so file names containing spaces or
# shell metacharacters are handled. If the block raises, the original file
# is left untouched and the temp file is removed.
#
# @param dir [String] directory name; the glob pattern is appended as-is
# @param suffix [String] filename pattern, e.g. "*.xml"; it is appended to
#   a leading "*" to form the glob
# @yield [Array] a two-element array: file name, open Tempfile to write to
def change_each_file_in_dir(dir, suffix)
  Dir[dir + "*#{suffix}"].each do |filename|
    tempfile = Tempfile.new("FrprepHelper")
    begin
      yield [filename, tempfile]

      # flush and close (without unlinking) so the content can be moved
      tempfile.close()
      # keep a backup, then move temp file to original file location
      Kernel.system("cp", filename, "#{filename}.bak")
      Kernel.system("mv", tempfile.path(), filename)
    ensure
      # remove the temp file if it is still there
      tempfile.close(true)
    end
  end # each file
end
|
655
|
+
|
656
|
+
#######
|
657
|
+
# change_each_stxml_file_in_dir
|
658
|
+
#
|
659
|
+
# use change_each_file_in_dir, but assume that the files
|
660
|
+
# are SalsaTigerXML files: Keep file headers and footers,
|
661
|
+
# and just offer individual sentences for changing
|
662
|
+
#
|
663
|
+
# Yields SalsaTigerSentence objects, each sentence to be changed
|
664
|
+
# @param dir [String] directory name
# @yield [SalsaTigerSentence] each sentence of each file; the sentence is
#   written to the replacement file after the block returns
def change_each_stxml_file_in_dir(dir)
  change_each_file_in_dir(dir, "*.xml") do |xml_path, replacement|
    parts = FilePartsParser.new(xml_path)

    # the file header is carried over as it is
    replacement.puts parts.head()

    # hand out every sentence for changing, then store what it has become
    parts.scan_s() do |raw_sentence|
      sentence = SalsaTigerSentence.new(raw_sentence)
      yield sentence
      replacement.puts sentence.get()
    end

    # the file footer is carried over as it is
    replacement.puts parts.tail()
    parts.close()
  end
end
|
685
|
+
end
|
686
|
+
end
|