shalmaneser-prep 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,143 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ # AB, 2010-11-25
4
+
5
+ ##############################
6
+ # class for managing parses:
7
+ #
8
+ # Given either a directory with tab format files or
9
+ # a directory with SalsaTigerXML files (or both) and
10
+ # a directory for putting parse files:
11
+ # - parse, unless no parsing set in the experiment file
12
+ # - for each parsed file: yield one OneParsedFile object
13
+ require 'frprep/one_parsed_file'
14
+
15
# Manages the parses of one frprep run.
#
# Depending on the experiment settings the class either
# - runs the configured parser on a directory of tab format files,
# - reuses parser output that already exists ("directory_parserout"), or
# - builds pseudo-parses from SalsaTigerXML files or from tab files.
# In every case each resulting file is handed to the caller as one
# OneParsedFile object.
class DoParses
  # @param exp [#get] experiment configuration (FrPrepConfigData object)
  # @param file_suffixes [Hash{String=>String}] file type -> file name suffix
  # @param parse_dir [String] name of the directory to put parses
  # @param var_hash [Hash] further directories; the keys "tab_dir" and
  #   "stxml_dir" are read
  def initialize(exp, file_suffixes, parse_dir, var_hash = {})
    @exp = exp
    @file_suffixes = file_suffixes
    @parse_dir = parse_dir
    @tab_dir = var_hash["tab_dir"]
    @stxml_dir = var_hash["stxml_dir"]

    # pre-parsed data available?
    @parsed_files = @exp.get("directory_parserout")
  end

  # Parses (unless parsing is switched off in the experiment file) and
  # yields one OneParsedFile object per resulting file.
  #
  # @yieldparam parsed_file [OneParsedFile]
  # @return [Enumerator] if no block is given
  # @raise [RuntimeError] if a required setting or directory is missing
  def each_parsed_file(&block)
    return enum_for(:each_parsed_file) unless block_given?

    # suffixes of the POS and lemma files; nil when the step is switched off
    postag_suffix = @exp.get("do_postag") ? @file_suffixes["pos"] : nil
    lemma_suffix = @exp.get("do_lemmatize") ? @file_suffixes["lemma"] : nil

    if @exp.get("do_parse")
      each_parser_output(postag_suffix, lemma_suffix, &block)
    elsif @stxml_dir
      each_stxml_pseudo_parse(postag_suffix, lemma_suffix, &block)
    else
      each_flat_pseudo_parse(postag_suffix, lemma_suffix, &block)
    end
  end

  private

  # Real parses: either reuse pre-computed parser output or run the parser.
  def each_parser_output(postag_suffix, lemma_suffix)
    # get parser interface
    parser_name = @exp.get("parser")
    sys_class = SynInterfaces.get_interface("parser", parser_name)
    unless sys_class
      raise "Parsing: no parser interface found for parser '#{parser_name}'"
    end

    parse_suffix = "." + sys_class.name
    sys = sys_class.new(@exp.get("parser_path"),
                        @file_suffixes["tab"],
                        parse_suffix,
                        @file_suffixes["stxml"],
                        "pos_suffix" => postag_suffix,
                        "lemma_suffix" => lemma_suffix,
                        "tab_dir" => @tab_dir)

    if @parsed_files
      # reuse old parses
      $stderr.puts "Frprep: using pre-computed parses in " + @parsed_files.to_s
      $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

      Dir[@parsed_files + "*"].each do |parsefilename|
        # something other than a file (or a dangling link): skip it
        next unless File.file?(parsefilename)

        # core filename: remove directory and anything after the last "."
        filename_core = File.basename(parsefilename, ".*")
        yield OneParsedFile.new(filename_core, parsefilename, sys)
      end
    else
      # do new parses
      $stderr.puts "Frprep: Parsing"

      # sanity checks
      unless @exp.get("parser_path")
        raise "Parsing: I need 'parser_path' in the experiment file"
      end
      raise "Cannot parse without tab files" unless @tab_dir

      # AB: NOTE This is the position where a parser is invoked.
      sys.process_dir(@tab_dir, @parse_dir)

      $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

      Dir[@parse_dir + "*" + parse_suffix].each do |parsefilename|
        filename_core = File.basename(parsefilename, parse_suffix)
        yield OneParsedFile.new(filename_core, parsefilename, sys)
      end
    end
  end

  # No parse: pseudo-parse trees read from existing SalsaTigerXML files.
  def each_stxml_pseudo_parse(postag_suffix, lemma_suffix)
    Dir[@stxml_dir + "*.xml"].each do |stxmlfilename|
      filename_core = File.basename(stxmlfilename, ".xml")

      # the matching tab file is only known if we know the tab directory
      tabfilename = @tab_dir ? @tab_dir + filename_core + @file_suffixes["tab"] : nil
      each_sentence_obj = FrprepReadStxml.new(stxmlfilename, tabfilename,
                                              postag_suffix, lemma_suffix)

      yield OneParsedFile.new(filename_core, stxmlfilename, each_sentence_obj)
    end
  end

  # No parse: construct flat SalsaTigerXML structures from tab files.
  def each_flat_pseudo_parse(postag_suffix, lemma_suffix)
    # without this check the glob below fails with a NoMethodError on nil
    unless @tab_dir
      raise "Cannot build pseudo-parses: neither SalsaTigerXML files nor tab files are available"
    end

    tab_suffix = @file_suffixes["tab"]
    Dir[@tab_dir + "*" + tab_suffix].each do |tabfilename|
      each_sentence_obj = FrprepFlatSyntax.new(tabfilename, postag_suffix, lemma_suffix)
      filename_core = File.basename(tabfilename, tab_suffix)
      yield OneParsedFile.new(filename_core, tabfilename, each_sentence_obj)
    end
  end
end
@@ -0,0 +1,693 @@
1
+ require 'frprep/do_parses'
2
+ require 'common/prep_helper'
3
+ require 'common/FixSynSemMapping'
4
+ # For FN input.
5
+ require 'frprep/FNCorpusXML'
6
+ require 'frprep/FNDatabase'
7
+
8
+ ##############################
9
+ # The class that does all the work
10
+ module FrPrep
11
+ class FrPrep
12
+ # @param exp [FrprepConfigData] Configuration object
13
# Stores the experiment configuration and the suffixes used for the
# different kinds of intermediate files.
#
# @param exp [FrprepConfigData] Configuration object
# @raise [RuntimeError] if 'frprep_directory' is not set in the experiment file
def initialize(exp)
  @exp = exp

  # AB: move to FRprepOptionParser
  unless exp.get("frprep_directory")
    raise "Please set 'frprep_directory', the frprep internal data directory,\n" \
          "in the experiment file."
  end

  # experiment directory:
  # frprep internal data directory, subdir according to experiment ID.
  # The call is made for its effect only, the returned name is not needed here.
  # NOTE(review): File.new_dir is defined outside this file; presumably it
  # creates the directory -- confirm.
  File.new_dir(@exp.get("frprep_directory"), @exp.get("prep_experiment_ID"))

  # suffixes for different types of output files
  @file_suffixes = {
    "lemma" => ".lemma",
    "pos" => ".pos",
    "tab" => ".tab",
    "stxml" => ".xml"
  }
end
35
+
36
# Runs the preprocessing pipeline: starting from the input format named in
# the experiment file ('format'), the data is converted step by step
# (BNC -> Plain -> SalsaTab -> SalsaTabWithPos -> SalsaTigerXML) until the
# output format is reached. That is SalsaTigerXML by default and Tab format
# if 'tabformat_output' is set.
#
# Terminates the process with exit status 1 if required settings are
# missing or contradictory.
#
# @raise [RuntimeError] if the data format is unknown
def transform
  # AB: Debugging. Only enter the debugger if one has actually been loaded,
  # otherwise running with $DEBUG set would end in a NameError.
  debugger if $DEBUG && respond_to?(:debugger, true)

  # AB: move to FRprepOptionParser
  unless @exp.get("directory_input")
    $stderr.puts "Please specify 'directory_input' in the experiment file."
    exit 1
  end
  # AB: move to FRprepOptionParser
  unless @exp.get("directory_preprocessed")
    $stderr.puts "Please specify 'directory_preprocessed' in the experiment file."
    exit 1
  end

  ##
  # input and output directories.
  #
  # sanity check: output in tab format will not work
  # if we also do a parse
  if @exp.get("tabformat_output") && @exp.get("do_parse")
    $stderr.puts "Error: Cannot do Tab format output"
    $stderr.puts "when the input text is being parsed."
    $stderr.puts "Please set either 'tabformat_output' or 'do_parse' to false."
    exit 1
  end

  input_dir = File.existing_dir(@exp.get("directory_input"))
  output_dir = File.new_dir(@exp.get("directory_preprocessed"))
  split_dir = @exp.get("tabformat_output") ? output_dir : frprep_dirname("split", "new")

  ####
  # transform data to UTF-8
  encoding = @exp.get("encoding")
  if ["iso", "hex"].include?(encoding)
    # transform ISO -> UTF-8 or Hex -> UTF-8,
    # write result to encoding_dir,
    # then set encoding_dir to be the new input_dir
    encoding_dir = frprep_dirname("encoding", "new")
    $stderr.puts "Frprep: Transforming to UTF-8."
    Dir[input_dir + "*"].each do |filename|
      # not a file? then skip
      next unless File.file?(filename)

      outfilename = encoding_dir + File.basename(filename)
      FrprepHelper.to_utf8_file(filename, outfilename, encoding)
    end

    input_dir = encoding_dir
  end

  ####
  # transform data all the way to the output format,
  # which is SalsaTigerXML by default,
  # except when tabformat_output has been set, in which case it's
  # Tab format.
  #
  # NOTE(review): with 'tabformat_output' set and SalsaTigerXML input the
  # loop reaches "Done", which differs from done_format, and the next round
  # ends in "Unknown data format Done" -- confirm whether that is intended.
  current_dir = input_dir
  done_format = @exp.get("tabformat_output") ? 'SalsaTabWithPos' : 'Done'
  current_format = @exp.get("format")

  while current_format != done_format
    case current_format

    when "BNC"
      # basically plain, plus some tags to be removed
      plain_dir = frprep_dirname("plain", "new")

      $stderr.puts "Frprep: Transforming BNC format text in #{current_dir} to plain format."
      $stderr.puts "Storing the result in #{plain_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_bncformat_dir(current_dir, plain_dir)

      current_dir = plain_dir
      current_format = "Plain"

    when "Plain"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming plain text in #{current_dir} to SalsaTab format."
      $stderr.puts "Storing the result in #{tab_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_plain_dir(current_dir, tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir

      fndata = FNDatabase.new(current_dir)
      fndata.extract_everything(tab_dir)
      # argument list form: no shell involved, so any directory name is safe
      Kernel.system("chmod", "-R", "g+rx", tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNCorpusXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir
      # assuming that all XML files in the current directory are FN Corpus XML files
      Dir[current_dir + "*.xml"].each do |fncorpusfilename|
        corpus = FNCorpusXMLFile.new(fncorpusfilename)
        tabfilename = tab_dir + File.basename(fncorpusfilename, ".xml") + ".tab"
        # block form closes the file even if the conversion fails
        File.open(tabfilename, "w") { |outfile| corpus.print_conll_style(outfile) }
      end

      # argument list form: no shell involved, so any directory name is safe
      Kernel.system("chmod", "-R", "g+rx", tab_dir)
      current_dir = tab_dir
      current_format = "SalsaTab"

    when "SalsaTab"
      # lemmatize and POStag
      $stderr.puts "Frprep: Lemmatizing and parsing text in #{current_dir}."
      $stderr.puts "Storing the result in #{split_dir}."
      transform_pos_and_lemmatize(current_dir, split_dir)

      current_dir = split_dir
      current_format = "SalsaTabWithPos"

    when "SalsaTabWithPos"
      # parse
      parse_dir = frprep_dirname("parse", "new")

      $stderr.puts "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format."
      $stderr.puts "Storing the result in #{parse_dir}."

      transform_salsatab_dir(current_dir, parse_dir, output_dir)

      current_dir = output_dir
      current_format = "Done"

    when "SalsaTigerXML"
      parse_dir = frprep_dirname("parse", "new")
      print "Transform parser output into stxml\n"
      transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir, @exp)
      current_dir = output_dir
      current_format = "Done"

    else
      STDERR.puts "Done format is: #{done_format}"
      $stderr.puts "Unknown data format #{current_format}"
      $stderr.puts "Please check the 'format' entry in your experiment file."
      raise "Experiment file problem"
    end
  end

  STDERR.puts "FrPrep: Done preprocessing."
end
214
+
215
+ ############################################################################
216
+ private
217
+
218
+ ###############
219
+ # frprep_dirname:
220
+ # make directory name for frprep-internal data
221
+ # of a certain kind described in <subdir>
222
+ #
223
+ # frprep_directory has one subdirectory for each experiment ID,
224
+ # and below that there is one subdir per subtask
225
+ #
226
+ # If this is a new directory, it is constructed,
227
+ # if it should be an existing directory, its existence is checked.
228
+ # @param subdir [String] designator of a subdirectory
229
+ # @param neu [Nil] non-nil This may be a new directory
230
# Builds the name of the frprep-internal directory for one subtask:
# <frprep_directory>/<prep_experiment_ID>/<subdir>.
#
# @param subdir [String] designator of a subdirectory
# @param neu [Object, nil] non-nil: the directory may be new and is passed
#   to File.new_dir; nil: it is passed to File.existing_dir
# @return whatever File.new_dir / File.existing_dir returns for the name
def frprep_dirname(subdir, neu = nil)
  path_parts = [@exp.get("frprep_directory"),
                @exp.get("prep_experiment_ID"),
                subdir]
  dirname = File.new_dir(*path_parts)

  return File.new_dir(dirname) if neu

  File.existing_dir(dirname)
end
238
+
239
+
240
+
241
+ ###############
242
+ # transform_bncformat_dir:
243
+ #
244
+ # transformation for BNC format:
245
+ #
246
+ # transform to plain format, removing <> elements
247
# Converts every BNC format file in input_dir to plain format. Each output
# file gets the same base name as its input file.
#
# @param input_dir [String] input directory (name is used as a prefix,
#   so it has to end in a path separator)
# @param output_dir [String] output directory (same convention)
def transform_bncformat_dir(input_dir, output_dir)
  Dir[input_dir + "*"].each do |bncfilename|
    # the glob also returns subdirectories: only regular files are converted
    next unless File.file?(bncfilename)

    outfilename = output_dir + File.basename(bncfilename)
    FrprepHelper.bnc_to_plain_file(bncfilename, outfilename)
  end
end
258
+
259
+
260
+ ###############
261
+ # transform_plain_dir:
262
+ #
263
+ # transformation for plaintext:
264
+ #
265
+ # transform to Tab format, separating punctuation from adjacent words
266
+ # @param input_dir [String] input directory
267
+ # @param output_dir [String] output directory
268
# Converts every plain text file in input_dir to Tab format.
#
# @param input_dir [String] input directory (name is used as a prefix,
#   so it has to end in a path separator)
# @param output_dir [String] output directory (same convention)
def transform_plain_dir(input_dir, output_dir)
  Dir[input_dir + "*"].each do |plainfilename|
    # the glob also returns subdirectories: only regular files are converted
    next unless File.file?(plainfilename)

    # end output file name in "tab" because that is, at the moment, required
    outfilename = output_dir + File.basename(plainfilename) + @file_suffixes["tab"]
    FrprepHelper.plain_to_tab_file(plainfilename, outfilename)
  end
end
276
+
277
+ ###############
278
+ # transform_pos_and_lemmatize
279
+ #
280
+ # transformation for Tab format files:
281
+ #
282
+ # - Split into parser-size chunks
283
+ # - POS-tag, lemmatize
284
# Transformation for Tab format files:
# - split the files in input_dir into parser-size chunks, written to output_dir
# - POS-tag and lemmatize the chunks in output_dir, if the experiment file
#   asks for it ('do_postag', 'do_lemmatize')
#
# @param input_dir [String] input directory
# @param output_dir [String] output directory
# @raise [RuntimeError] if a tagger or lemmatizer setting is missing or no
#   interface is registered for the configured system
def transform_pos_and_lemmatize(input_dir, output_dir)
  ##
  # split the TabFormatFile into chunks of max_sent_num size
  FrprepHelper.split_dir(input_dir, output_dir, @file_suffixes["tab"],
                         @exp.get("parser_max_sent_num"),
                         @exp.get("parser_max_sent_len"))

  ##
  # POS-Tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."

    # AB: TODO Move it to OptionParser.
    unless @exp.get("pos_tagger_path") && @exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    tagger_class = SynInterfaces.get_interface("pos_tagger", @exp.get("pos_tagger"))
    print "pos tagger interface: ", tagger_class, "\n"

    unless tagger_class
      raise "POS-tagging: no interface found for pos_tagger '#{@exp.get('pos_tagger')}'."
    end

    tagger = tagger_class.new(@exp.get("pos_tagger_path"),
                              @file_suffixes["tab"],
                              @file_suffixes["pos"])
    # AB: We're working on the <split> dir and writing there.
    tagger.process_dir(output_dir, output_dir)
  end

  ##
  # Lemmatization
  # AB: We're working on the <split> dir and writing there.
  if @exp.get("do_lemmatize")
    STDERR.puts 'Frprep: Lemmatizing.'

    # AB: TODO Move it to OptionParser.
    unless @exp.get("lemmatizer_path") && @exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    lemmatizer_class = SynInterfaces.get_interface("lemmatizer", @exp.get("lemmatizer"))
    unless lemmatizer_class
      raise "Lemmatization: no interface found for lemmatizer '#{@exp.get('lemmatizer')}'."
    end

    lemmatizer = lemmatizer_class.new(@exp.get("lemmatizer_path"),
                                      @file_suffixes["tab"],
                                      @file_suffixes["lemma"])
    lemmatizer.process_dir(output_dir, output_dir)
  end
end
342
+
343
+ ###############
344
+ # transform_salsatab
345
+ #
346
+ # transformation for Tab format files:
347
+ #
348
+ # - parse
349
+ # - Transform parser output to SalsaTigerXML
350
+ # If no parsing, make flat syntactic structure.
351
# Transformation for Tab format files:
# - parse (via DoParses; without parsing a flat structure is produced there)
# - write one SalsaTigerXML file per parsed file to output_dir, after adding
#   head attributes, lemmas and semantics to every sentence
#
# @param input_dir [String] input directory with the tab files
# @param parse_dir [String] output directory for parses
# @param output_dir [String] global output directory
# @raise [RuntimeError] if no interpreter class is found or an output file
#   cannot be opened for writing
def transform_salsatab_dir(input_dir, parse_dir, output_dir)
  ##
  # (Parse and) transform to SalsaTigerXML

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
  unless interpreter_class
    raise "No interpreter class found for the parser/lemmatizer/POS tagger combination " \
          "of the experiment file"
  end

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => input_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue StandardError
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      outfile.puts SalsaTigerXMLHelper.get_header
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence do |st_sent, tabformat_sent, mapping|
        # parsed: add headwords using parse tree
        FrprepHelper.add_head_attributes(st_sent, interpreter_class) if @exp.get("do_parse")

        # add lemmas, if they are there. If they are not, don't print out a warning.
        if @exp.get("do_lemmatize")
          FrprepHelper.add_lemmas_from_tab(st_sent, tabformat_sent, mapping)
        end

        # add semantics: read semantic information from the tab file
        FrprepHelper.add_semantics_from_tab(st_sent, tabformat_sent, mapping,
                                            interpreter_class, @exp)

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # handle multiword targets
        FrprepHelper.handle_multiword_targets(st_sent,
                                              interpreter_class, @exp.get("language"))

        # handle Unknown frame names
        FrprepHelper.handle_unknown_framenames(st_sent, interpreter_class)

        outfile.puts st_sent.get
      end
      outfile.puts SalsaTigerXMLHelper.get_footer
    ensure
      # flush and release the handle, also when a sentence fails
      outfile.close
    end
  end
end
416
+
417
+ #############################################
418
+ # transform_stxml
419
+ #
420
+ # transformation for SalsaTigerXML data
421
+ #
422
+ # - If the input format was SalsaTigerXML:
423
+ # - Tag, lemmatize and parse, if the experiment file tells you so
424
+ #
425
+ # - If the origin is the Salsa corpus:
426
+ # Change frame names from Unknown\d+ to lemma_Unknown\d+
427
+ #
428
+ # - fix multiword lemmas, or at least try
429
+ # - transform to UTF 8
430
# Transformation for SalsaTigerXML data:
# - if the data is to be parsed, split the input files, else copy them to
#   parse_dir
# - tag, lemmatize and parse, if the experiment file says so
# - with parsing: carry semantics and lemmas of the old sentences over to
#   the new ones
# - remove deprecated frames, optionally repair the syn/sem mapping
# - write one SalsaTigerXML file per parsed file to output_dir
#
# @param parse_dir [String] name of directory for parse data
# @param tab_dir [String] name of directory for split/tab data
# @param input_dir [String] name of input directory
# @param output_dir [String] name of final output directory
# @param exp [FrprepConfigData] experiment configuration
# @raise [RuntimeError] if a setting or an interface is missing or an output
#   file cannot be opened for writing
def transform_stxml_dir(parse_dir, tab_dir, input_dir, output_dir, exp)
  ####
  # Data preparation
  #
  # Remembering the target lemma of Salsa data as an attribute on the
  # <target> elements is currently deactivated: encoding problems.

  # stxml_dir: directory where SalsaTiger data is situated
  if @exp.get("do_parse")
    # split data
    stxml_splitdir = frprep_dirname("stxml_split", "new")
    stxml_dir = stxml_splitdir

    $stderr.puts "Frprep: splitting data"
    FrprepHelper.stxml_split_dir(input_dir, stxml_splitdir,
                                 @exp.get("parser_max_sent_num"),
                                 @exp.get("parser_max_sent_len"))
  else
    # no parsing: copy data to the parse directory
    stxml_dir = parse_dir
    $stderr.puts "Frprep: Copying data to #{stxml_dir}"
    Dir[input_dir + "*.xml"].each do |filename|
      # argument list form: no shell involved, so any file name is safe
      Kernel.system("cp", filename, stxml_dir + File.basename(filename))
    end
  end

  # Some syntactic processing will take place:
  # tabify data
  if @exp.get("do_parse") || @exp.get("do_lemmatize") || @exp.get("do_postag")
    $stderr.puts "Frprep: making input for syn. processing"

    Dir[stxml_dir + "*" + @file_suffixes["stxml"]].each do |stxmlfilename|
      tabfilename = tab_dir + File.basename(stxmlfilename, @file_suffixes["stxml"]) +
                    @file_suffixes["tab"]
      FrprepHelper.stxml_to_tab_file(stxmlfilename, tabfilename, exp)
    end
  end

  ###
  # POS-tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    unless @exp.get("pos_tagger_path") && @exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    tagger_class = SynInterfaces.get_interface("pos_tagger", @exp.get("pos_tagger"))
    unless tagger_class
      raise "POS-tagging: no interface found for pos_tagger '#{@exp.get('pos_tagger')}'."
    end

    tagger = tagger_class.new(@exp.get("pos_tagger_path"),
                              @file_suffixes["tab"],
                              @file_suffixes["pos"])
    tagger.process_dir(tab_dir, tab_dir)
  end

  ###
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    unless @exp.get("lemmatizer_path") && @exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    lemmatizer_class = SynInterfaces.get_interface("lemmatizer", @exp.get("lemmatizer"))
    unless lemmatizer_class
      raise "Lemmatization: no interface found for lemmatizer '#{@exp.get('lemmatizer')}'."
    end

    lemmatizer = lemmatizer_class.new(@exp.get("lemmatizer_path"),
                                      @file_suffixes["tab"],
                                      @file_suffixes["lemma"])
    lemmatizer.process_dir(tab_dir, tab_dir)
  end

  ###
  # Parsing, production of SalsaTigerXML output

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  sys_class_names = {}
  [["do_postag", "pos_tagger"],
   ["do_lemmatize", "lemmatizer"],
   ["do_parse", "parser"]].each do |service, system_name|
    # yes, perform this service
    sys_class_names[system_name] = @exp.get(system_name) if @exp.get(service)
  end
  interpreter_class = SynInterfaces.get_interpreter(sys_class_names)
  unless interpreter_class
    raise "No interpreter class found for the systems #{sys_class_names.inspect}"
  end

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => tab_dir,
                           "stxml_dir" => stxml_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue StandardError
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      # array of sentence strings of the old SalsaTigerXML file
      oldxml = []
      if @exp.get("do_parse")
        # read old SalsaTigerXML file
        # so we can integrate the old file's semantics later.
        # we assume that the old and the new file have the same name,
        # ending in .xml.
        oldxmlfile = FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
        oldxmlfile.scan_s { |sent_string| oldxml << sent_string }
      end

      outfile.puts SalsaTigerXMLHelper.get_header
      index = 0
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence do |st_sent, _tabformat_sent, _mapping|
        # parsed? then integrate semantics and lemmas from old file
        if @exp.get("do_parse")
          oldsent_string = oldxml[index]
          index += 1
          if oldsent_string
            escape_berkeley_brackets(oldsent_string) if exp.get("parser") == "berkeley"

            # we have both an old and a new sentence, so integrate semantics
            oldsent = SalsaTigerSentence.new(oldsent_string)
            next if st_sent.nil?

            # only an explicit false triggers the second attempt with the
            # following old sentence
            if FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent,
                                                                  interpreter_class, @exp) == false
              oldsent_string = oldxml[index]
              index += 1
              if oldsent_string
                escape_berkeley_brackets(oldsent_string) if exp.get("parser") == "berkeley"

                oldsent = SalsaTigerSentence.new(oldsent_string)
                FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent,
                                                                  interpreter_class, @exp)
              end
            end
          else
            # no corresponding old sentence for this new sentence
            $stderr.puts "Warning: transporting semantics -- missing source sentence, skipping"
          end
        end

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # repair syn/sem mapping problems?
        if @exp.get("fe_syn_repair") || @exp.get("fe_rel_repair")
          FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
        end

        outfile.puts st_sent.get
      end # each ST sentence
      outfile.puts SalsaTigerXMLHelper.get_footer
    ensure
      # flush and release the handle, also when a sentence fails
      outfile.close
    end
  end # each file parsed
end

# Replaces the words ( and ) in a sentence string by *LRB* and *RRB*
# (modified by ines, 27/08/08, for the Berkeley parser). Changes the string
# in place.
def escape_berkeley_brackets(sent_string)
  sent_string.gsub!(/word='\('/, "word='*LRB*'")
  sent_string.gsub!(/word='\)'/, "word='*RRB*'")
  sent_string.gsub!(/word=\"\(\"/, "word='*LRB*'")
  sent_string.gsub!(/word=\"\)\"/, "word='*RRB*'")
  sent_string
end
643
+
644
+
645
+ ###################################
646
+ # general file iterators
647
+
648
+ # yields pairs of [infile name, outfile stream]
649
# Iterates over the files in dir whose names end in suffix and lets the
# caller rewrite each of them: the block receives the pair
# [infile name, outfile stream], where the stream is a temporary file.
# Afterwards the original is kept as <filename>.bak and the temporary file
# takes the place of the original.
#
# @param dir [String] directory name (used as a prefix of the glob pattern)
# @param suffix [String] end of the file names to match; "*" is prepended
# @yieldparam pair [Array(String, Tempfile)] infile name and outfile stream
def change_each_file_in_dir(dir, suffix)
  Dir[dir + "*#{suffix}"].each do |filename|
    tempfile = Tempfile.new("FrprepHelper")
    yield [filename, tempfile]

    # move temp file to original file location;
    # argument list form: no shell involved, so any file name is safe
    tempfile.close
    Kernel.system("cp", filename, "#{filename}.bak")
    Kernel.system("mv", tempfile.path, filename)
    tempfile.close(true)
  end # each file
end
662
+
663
+ #######
664
+ # change_each_stxml_file_in_dir
665
+ #
666
+ # use change_each_file_in_dir, but assume that the files
667
+ # are SalsaTigerXML files: Keep file headers and footers,
668
+ # and just offer individual sentences for changing
669
+ #
670
+ # Yields SalsaTigerSentence objects, each sentence to be changed
671
# Variant of change_each_file_in_dir for SalsaTigerXML files: header and
# footer of every file are copied unchanged, and only the individual
# sentences are offered to the caller for changing.
#
# @param dir [String] directory name
# @yieldparam sentence [SalsaTigerSentence] each sentence to be changed
def change_each_stxml_file_in_dir(dir)
  change_each_file_in_dir(dir, "*.xml") do |source_name, target|
    parts = FilePartsParser.new(source_name)

    # header first
    target.puts parts.head

    # every sentence goes to the caller as SalsaTigerSentence object
    # and is written back after the caller has changed it
    parts.scan_s do |raw_sentence|
      SalsaTigerSentence.new(raw_sentence).tap do |sentence|
        yield sentence
        target.puts sentence.get
      end
    end

    # footer last
    target.puts parts.tail
    parts.close
  end
end
692
+ end
693
+ end