shalmaneser-prep 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,143 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ # AB, 2010-11-25
4
+
5
+ ##############################
6
+ # class for managing parses:
7
+ #
8
+ # Given either a directory with tab format files or
9
+ # a directory with SalsaTigerXML files (or both) and
10
+ # a directory for putting parse files:
11
+ # - parse, unless no parsing set in the experiment file
12
+ # - for each parsed file: yield one OneParsedFile object
13
+ require 'frprep/one_parsed_file'
14
+
15
# Manages the parse step of frprep.
#
# Depending on the experiment settings, each_parsed_file either
# - runs the configured parser on the tab files,
# - reuses pre-computed parses found in 'directory_parserout', or
# - builds pseudo-parses from SalsaTigerXML files or from tab files,
# and yields one OneParsedFile object per file.
#
# Directory names are concatenated with file names as-is, so they have to
# end in a path separator.
class DoParses
  # @param exp [#get] experiment configuration (FrPrepConfigData object)
  # @param file_suffixes [Hash{String=>String}] file type -> file suffix;
  #   the keys "pos", "lemma", "tab" and "stxml" are read
  # @param parse_dir [String] name of the directory to put parses in
  # @param var_hash [Hash{String=>String}] further directories;
  #   the keys "tab_dir" and "stxml_dir" are read
  def initialize(exp, file_suffixes, parse_dir, var_hash = {})
    @exp = exp
    @file_suffixes = file_suffixes
    @parse_dir = parse_dir
    @tab_dir = var_hash["tab_dir"]
    @stxml_dir = var_hash["stxml_dir"]

    # pre-parsed data available?
    @parsed_files = @exp.get("directory_parserout")
  end

  # Iterate over all (pseudo-)parsed files.
  #
  # @yield [OneParsedFile] one object per parsed file
  # @return [Enumerator] if no block is given
  # @raise [RuntimeError] if no parser interface is available, or if a new
  #   parse is requested without 'parser_path' or without a tab directory
  def each_parsed_file(&block)
    return enum_for(:each_parsed_file) unless block

    # suffixes are only passed on for the services that are switched on
    postag_suffix = @exp.get("do_postag") ? @file_suffixes["pos"] : nil
    lemma_suffix = @exp.get("do_lemmatize") ? @file_suffixes["lemma"] : nil

    if @exp.get("do_parse")
      each_real_parse(postag_suffix, lemma_suffix, &block)
    elsif @stxml_dir
      # no parse: pseudo-parse tree from existing SalsaTigerXML files
      each_stxml_pseudo_parse(postag_suffix, lemma_suffix, &block)
    else
      # no parse: construct SalsaTigerXML from tab files
      each_flat_pseudo_parse(postag_suffix, lemma_suffix, &block)
    end
  end

  private

  # Instantiate the parser interface, then either reuse old parses or parse anew.
  def each_real_parse(postag_suffix, lemma_suffix, &block)
    sys_class = SynInterfaces.get_interface("parser", @exp.get("parser"))
    raise "Shouldn't be here" unless sys_class

    parse_suffix = "." + sys_class.name
    sys = sys_class.new(@exp.get("parser_path"),
                        @file_suffixes["tab"],
                        parse_suffix,
                        @file_suffixes["stxml"],
                        "pos_suffix" => postag_suffix,
                        "lemma_suffix" => lemma_suffix,
                        "tab_dir" => @tab_dir)

    if @parsed_files
      each_precomputed_parse(sys, &block)
    else
      each_new_parse(sys, parse_suffix, &block)
    end
  end

  # Yield one OneParsedFile per regular file in the pre-computed parse directory.
  def each_precomputed_parse(sys)
    $stderr.puts "Frprep: using pre-computed parses in " + @parsed_files.to_s
    $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

    Dir[@parsed_files + "*"].each do |parsefilename|
      # Skip everything that is not a regular file. File.file? also returns
      # false for dangling symlinks, where File.stat would raise.
      next unless File.file?(parsefilename)

      # core filename: remove directory and anything after the last "."
      filename_core = File.basename(parsefilename, ".*")
      yield OneParsedFile.new(filename_core, parsefilename, sys)
    end
  end

  # Run the parser on the tab directory and yield one OneParsedFile per parse file.
  def each_new_parse(sys, parse_suffix)
    $stderr.puts "Frprep: Parsing"

    # sanity check
    raise "Parsing: I need 'parser_path' in the experiment file" unless @exp.get("parser_path")
    raise "Cannot parse without tab files" unless @tab_dir

    # AB: NOTE This is the position where a parser is invoked.
    sys.process_dir(@tab_dir, @parse_dir)

    $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

    Dir[@parse_dir + "*" + parse_suffix].each do |parsefilename|
      filename_core = File.basename(parsefilename, parse_suffix)
      yield OneParsedFile.new(filename_core, parsefilename, sys)
    end
  end

  # Yield one OneParsedFile per existing SalsaTigerXML file.
  def each_stxml_pseudo_parse(postag_suffix, lemma_suffix)
    Dir[@stxml_dir + "*.xml"].each do |stxmlfilename|
      filename_core = File.basename(stxmlfilename, ".xml")
      # the matching tab file is only known if a tab directory was given
      tabfilename = @tab_dir ? @tab_dir + filename_core + @file_suffixes["tab"] : nil
      each_sentence_obj = FrprepReadStxml.new(stxmlfilename, tabfilename,
                                              postag_suffix, lemma_suffix)

      yield OneParsedFile.new(filename_core, stxmlfilename, each_sentence_obj)
    end
  end

  # Yield one OneParsedFile with flat syntax per tab file.
  def each_flat_pseudo_parse(postag_suffix, lemma_suffix)
    Dir[@tab_dir + "*" + @file_suffixes["tab"]].each do |tabfilename|
      each_sentence_obj = FrprepFlatSyntax.new(tabfilename, postag_suffix, lemma_suffix)
      filename_core = File.basename(tabfilename, @file_suffixes["tab"])
      yield OneParsedFile.new(filename_core, tabfilename, each_sentence_obj)
    end
  end
end
@@ -0,0 +1,693 @@
1
+ require 'frprep/do_parses'
2
+ require 'common/prep_helper'
3
+ require 'common/FixSynSemMapping'
4
+ # For FN input.
5
+ require 'frprep/FNCorpusXML'
6
+ require 'frprep/FNDatabase'
7
+
8
+ ##############################
9
+ # The class that does all the work
10
+ module FrPrep
11
+ class FrPrep
12
+ # @param exp [FrprepConfigData] Configuration object
13
# Store the experiment configuration, set up the experiment's internal
# data directory and define the suffixes of the different output files.
#
# @param exp [FrprepConfigData] Configuration object
# @raise [RuntimeError] if 'frprep_directory' is missing in the experiment file
def initialize(exp)
  @exp = exp

  # AB: move to FRprepOptionParser
  raise "Please set 'frprep_directory', the frprep internal data directory,\nin the experiment file." unless exp.get("frprep_directory")

  # experiment directory:
  # frprep internal data directory, subdir according to experiment ID.
  # Only the call matters here; its result is not used.
  # NOTE(review): File.new_dir is not a core method -- presumably a project
  # extension that creates the directory; confirm.
  File.new_dir(@exp.get("frprep_directory"), @exp.get("prep_experiment_ID"))

  # suffixes for different types of output files
  @file_suffixes = {
    "lemma" => ".lemma",
    "pos" => ".pos",
    "tab" => ".tab",
    "stxml" => ".xml"
  }
end
35
+
36
# Run the preprocessing pipeline described by the experiment file.
#
# Starting from the format named in 'format', the data is converted step by
# step (BNC -> Plain -> SalsaTab -> SalsaTabWithPos -> SalsaTigerXML) until
# the target format is reached: Tab format with POS if 'tabformat_output'
# is set, SalsaTigerXML otherwise.
#
# Exits with status 1 if 'directory_input' or 'directory_preprocessed' is
# missing, or if both 'tabformat_output' and 'do_parse' are set.
#
# @raise [RuntimeError] if the current data format is not known
def transform
  # AB: Debugging. Only enter the debugger if one is actually loaded,
  # otherwise running with $DEBUG set dies with a NameError.
  debugger if $DEBUG && respond_to?(:debugger, true)

  # AB: move to FRprepOptionParser
  unless @exp.get("directory_input")
    $stderr.puts "Please specify 'directory_input' in the experiment file."
    exit 1
  end
  # AB: move to FRprepOptionParser
  unless @exp.get("directory_preprocessed")
    $stderr.puts "Please specify 'directory_preprocessed' in the experiment file."
    exit 1
  end

  ##
  # input and output directories.
  #
  # sanity check: output in tab format will not work
  # if we also do a parse
  if @exp.get("tabformat_output") && @exp.get("do_parse")
    $stderr.puts "Error: Cannot do Tab format output"
    $stderr.puts "when the input text is being parsed."
    $stderr.puts "Please set either 'tabformat_output' or 'do_parse' to false."
    exit 1
  end
  input_dir = File.existing_dir(@exp.get("directory_input"))
  output_dir = File.new_dir(@exp.get("directory_preprocessed"))
  split_dir = if @exp.get("tabformat_output")
                output_dir
              else
                frprep_dirname("split", "new")
              end

  ####
  # transform data to UTF-8
  if ["iso", "hex"].include? @exp.get("encoding")
    # transform ISO -> UTF-8 or Hex -> UTF-8,
    # write the result to encoding_dir,
    # then make encoding_dir the new input_dir
    encoding_dir = frprep_dirname("encoding", "new")
    $stderr.puts "Frprep: Transforming to UTF-8."
    Dir[input_dir + "*"].each do |filename|
      # not a file? then skip
      next unless File.file?(filename)

      outfilename = encoding_dir + File.basename(filename)
      FrprepHelper.to_utf8_file(filename, outfilename, @exp.get("encoding"))
    end

    input_dir = encoding_dir
  end

  ####
  # transform data all the way to the output format,
  # which is SalsaTigerXML by default,
  # except when tabformat_output has been set, in which case it's
  # Tab format.
  current_dir = input_dir
  done_format = @exp.get("tabformat_output") ? 'SalsaTabWithPos' : 'Done'
  current_format = @exp.get("format")

  # NOTE(review): with 'tabformat_output' set, a branch that ends in "Done"
  # never equals done_format, so the next iteration raises
  # "Experiment file problem" -- confirm whether that is intended.
  while current_format != done_format
    # AB: DEBUG Remove it after debugging
    STDERR.puts "#{current_format} - #{done_format}"

    case current_format
    when "BNC"
      # basically plain, plus some tags to be removed
      plain_dir = frprep_dirname("plain", "new")

      $stderr.puts "Frprep: Transforming BNC format text in #{current_dir} to plain format."
      $stderr.puts "Storing the result in #{plain_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_bncformat_dir(current_dir, plain_dir)

      current_dir = plain_dir
      current_format = "Plain"

    when "Plain"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming plain text in #{current_dir} to SalsaTab format."
      $stderr.puts "Storing the result in #{tab_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_plain_dir(current_dir, tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir

      fndata = FNDatabase.new(current_dir)
      fndata.extract_everything(tab_dir)
      # argument list form: no shell involved, so directory names with
      # spaces or shell metacharacters are handled correctly
      Kernel.system("chmod", "-R", "g+rx", tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNCorpusXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir
      # assuming that all XML files in the current directory are FN Corpus XML files
      Dir[current_dir + "*.xml"].each do |fncorpusfilename|
        corpus = FNCorpusXMLFile.new(fncorpusfilename)
        tabfilename = tab_dir + File.basename(fncorpusfilename, ".xml") + ".tab"
        # block form closes the file even if printing raises
        File.open(tabfilename, "w") { |outfile| corpus.print_conll_style(outfile) }
      end

      # argument list form, see the FNXml branch
      Kernel.system("chmod", "-R", "g+rx", tab_dir)
      current_dir = tab_dir
      current_format = "SalsaTab"

    when "SalsaTab"
      # lemmatize and POStag
      $stderr.puts "Frprep: Lemmatizing and parsing text in #{current_dir}."
      $stderr.puts "Storing the result in #{split_dir}."
      transform_pos_and_lemmatize(current_dir, split_dir)

      current_dir = split_dir
      current_format = "SalsaTabWithPos"

    when "SalsaTabWithPos"
      # parse
      parse_dir = frprep_dirname("parse", "new")

      $stderr.puts "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format."
      $stderr.puts "Storing the result in #{parse_dir}."

      transform_salsatab_dir(current_dir, parse_dir, output_dir)

      current_dir = output_dir
      current_format = "Done"

    when "SalsaTigerXML"
      parse_dir = frprep_dirname("parse", "new")
      print "Transform parser output into stxml\n"
      transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir, @exp)
      current_dir = output_dir
      current_format = "Done"

    else
      STDERR.puts "Done format is: #{done_format}"
      $stderr.puts "Unknown data format #{current_format}"
      $stderr.puts "Please check the 'format' entry in your experiment file."
      raise "Experiment file problem"
    end
  end

  STDERR.puts "FrPrep: Done preprocessing."
end
214
+
215
+ ############################################################################
216
+ private
217
+
218
+ ###############
219
+ # frprep_dirname:
220
+ # make directory name for frprep-internal data
221
+ # of a certain kind described in <subdir>
222
+ #
223
+ # frprep_directory has one subdirectory for each experiment ID,
224
+ # and below that there is one subdir per subtask
225
+ #
226
+ # If this is a new directory, it is constructed,
227
+ # if it should be an existing directory, its existence is checked.
228
+ # @param subdir [String] designator of a subdirectory
229
+ # @param neu [Nil] non-nil This may be a new directory
230
# Build the name of the frprep-internal directory for one subtask:
# <frprep_directory>/<prep_experiment_ID>/<subdir>.
#
# NOTE(review): File.new_dir / File.existing_dir are not core methods --
# presumably project extensions that create resp. check a directory; confirm.
#
# @param subdir [String] designator of a subdirectory
# @param neu [Object, nil] if truthy the directory is handed to File.new_dir,
#   otherwise to File.existing_dir
# @return whatever File.new_dir resp. File.existing_dir returns
def frprep_dirname(subdir, neu = nil)
  path = File.new_dir(@exp.get("frprep_directory"),
                      @exp.get("prep_experiment_ID"),
                      subdir)

  return File.new_dir(path) if neu

  File.existing_dir(path)
end
238
+
239
+
240
+
241
+ ###############
242
+ # transform_bncformat_dir:
243
+ #
244
+ # transformation for BNC format:
245
+ #
246
+ # transform to plain format, removing <> elements
247
# Convert every entry of a BNC format directory to plain format.
# Each output file keeps the base name of its input file.
#
# @param input_dir [String] input directory (concatenated as-is with "*")
# @param output_dir [String] output directory (concatenated as-is with the file name)
def transform_bncformat_dir(input_dir, output_dir)
  Dir.glob(input_dir + "*").each do |bnc_path|
    plain_path = output_dir + File.basename(bnc_path)
    FrprepHelper.bnc_to_plain_file(bnc_path, plain_path)
  end
end
258
+
259
+
260
+ ###############
261
+ # transform_plain:
262
+ #
263
+ # transformation for plaintext:
264
+ #
265
+ # transform to Tab format, separating punctuation from adjacent words
266
+ # @param input_dir [String] input directory
267
+ # @param output_dir [String] output directory
268
# Convert every entry of a plain text directory to Tab format.
# The output file name is the input base name plus the "tab" suffix,
# because that suffix is, at the moment, required.
#
# @param input_dir [String] input directory
# @param output_dir [String] output directory
def transform_plain_dir(input_dir, output_dir)
  Dir.glob(input_dir + "*").each { |plain_path|
    tab_path = output_dir + File.basename(plain_path) + @file_suffixes["tab"]
    FrprepHelper.plain_to_tab_file(plain_path, tab_path)
  }
end
276
+
277
+ ###############
278
+ # transform_pos_and_lemmatize
279
+ #
280
+ # transformation for Tab format files:
281
+ #
282
+ # - Split into parser-size chunks
283
+ # - POS-tag, lemmatize
284
# Split Tab format files into parser-size chunks, then POS-tag and
# lemmatize the chunks in place if the experiment file asks for it.
#
# @param input_dir [String] input directory
# @param output_dir [String] output directory; tagger and lemmatizer read
#   from and write to this directory
# @raise [RuntimeError] if a requested service is not fully configured or
#   no interface class is available for it
def transform_pos_and_lemmatize(input_dir, output_dir)
  # chunks of at most parser_max_sent_num sentences
  FrprepHelper.split_dir(input_dir, output_dir, @file_suffixes["tab"],
                         @exp.get("parser_max_sent_num"),
                         @exp.get("parser_max_sent_len"))

  # POS-Tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."

    # AB: TODO Move it to OptionParser.
    if !@exp.get("pos_tagger_path") || !@exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    tagger_class = SynInterfaces.get_interface("pos_tagger", @exp.get("pos_tagger"))
    print "pos tagger interface: ", tagger_class, "\n"

    # AB: TODO Remove it.
    raise "Shouldn't be here" unless tagger_class

    tagger = tagger_class.new(@exp.get("pos_tagger_path"),
                              @file_suffixes["tab"],
                              @file_suffixes["pos"])
    tagger.process_dir(output_dir, output_dir)
  end

  # Lemmatization
  # AB: We're working on the <split> dir and writing there.
  return unless @exp.get("do_lemmatize")

  STDERR.puts 'Frprep: Lemmatizing.'

  # AB: TODO Move it to OptionParser.
  if !@exp.get("lemmatizer_path") || !@exp.get("lemmatizer")
    raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
  end

  lemmatizer_class = SynInterfaces.get_interface("lemmatizer", @exp.get("lemmatizer"))
  # AB: TODO make this exception explicit.
  raise 'I got a empty interface class for the lemmatizer!' unless lemmatizer_class

  lemmatizer = lemmatizer_class.new(@exp.get("lemmatizer_path"),
                                    @file_suffixes["tab"],
                                    @file_suffixes["lemma"])
  lemmatizer.process_dir(output_dir, output_dir)
end
342
+
343
+ ###############
344
+ # transform_salsatab
345
+ #
346
+ # transformation for Tab format files:
347
+ #
348
+ # - parse
349
+ # - Transform parser output to SalsaTigerXML
350
+ # If no parsing, make flat syntactic structure.
351
# Transform Tab format files to SalsaTigerXML:
# (parse and) write one <filename>.xml per parsed file to output_dir.
# If no parsing is done, DoParses provides a flat syntactic structure.
#
# @param input_dir [String] input directory with tab files
# @param parse_dir [String] output directory for parses
# @param output_dir [String] global output directory
# @raise [RuntimeError] if no interpreter class is available or an output
#   file cannot be opened for writing
def transform_salsatab_dir(input_dir, parse_dir, output_dir)
  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
  raise "Shouldn't be here" unless interpreter_class

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => input_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      outfile.puts SalsaTigerXMLHelper.get_header
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence do |st_sent, tabformat_sent, mapping|
        # parsed: add headwords using parse tree
        FrprepHelper.add_head_attributes(st_sent, interpreter_class) if @exp.get("do_parse")

        # add lemmas, if they are there. If they are not, don't print out a warning.
        FrprepHelper.add_lemmas_from_tab(st_sent, tabformat_sent, mapping) if @exp.get("do_lemmatize")

        # add semantics: read semantic information from the tab file
        # and combine all targets of a sentence into one frame
        FrprepHelper.add_semantics_from_tab(st_sent, tabformat_sent, mapping,
                                            interpreter_class, @exp)

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # handle multiword targets
        FrprepHelper.handle_multiword_targets(st_sent,
                                              interpreter_class, @exp.get("language"))

        # handle Unknown frame names
        FrprepHelper.handle_unknown_framenames(st_sent, interpreter_class)

        outfile.puts st_sent.get
      end
      outfile.puts SalsaTigerXMLHelper.get_footer
    ensure
      # close (and thereby flush) the output file, also on errors
      outfile.close
    end
  end
end
416
+
417
+ #############################################
418
+ # transform_stxml
419
+ #
420
+ # transformation for SalsaTigerXML data
421
+ #
422
+ # - If the input format was SalsaTigerXML:
423
+ # - Tag, lemmatize and parse, if the experiment file tells you so
424
+ #
425
+ # - If the origin is the Salsa corpus:
426
+ # Change frame names from Unknown\d+ to lemma_Unknown\d+
427
+ #
428
+ # - fix multiword lemmas, or at least try
429
+ # - transform to UTF 8
430
# Transformation for SalsaTigerXML input data:
# - tag, lemmatize and parse, if the experiment file says so
# - when parsing, integrate semantics and lemmas of the old sentences
#   into the newly parsed sentences
# - remove deprecated frames, optionally repair syn/sem mapping problems
# - write one <filename>.xml per file to output_dir
#
# @param parse_dir [String] name of directory for parse data
# @param tab_dir [String] name of directory for split/tab data
# @param input_dir [String] name of input directory
# @param output_dir [String] name of final output directory
# @param exp [FrprepConfigData] experiment configuration
# @raise [RuntimeError] if a requested service is not fully configured, no
#   interface/interpreter class is available, or an output file cannot be
#   opened for writing
def transform_stxml_dir(parse_dir, tab_dir, input_dir, output_dir, exp)
  ####
  # Data preparation
  #
  # (Noting the target lemma on <target> elements for data with Salsa as
  # origin is currently deactivated: encoding problems.)

  # If data is to be parsed, split the input files,
  # else copy the data to the parse directory.
  # stxml_dir: directory where SalsaTiger data is situated
  if @exp.get("do_parse")
    # split data
    stxml_splitdir = frprep_dirname("stxml_split", "new")
    stxml_dir = stxml_splitdir

    $stderr.puts "Frprep: splitting data"
    FrprepHelper.stxml_split_dir(input_dir, stxml_splitdir,
                                 @exp.get("parser_max_sent_num"),
                                 @exp.get("parser_max_sent_len"))
  else
    # no parsing: copy data
    stxml_dir = parse_dir
    $stderr.puts "Frprep: Copying data to #{stxml_dir}"
    Dir[input_dir + "*.xml"].each do |filename|
      # argument list form: no shell involved, so file names with spaces
      # or shell metacharacters are copied correctly
      system("cp", filename, "#{stxml_dir}#{File.basename(filename)}")
    end
  end

  # Some syntactic processing will take place:
  # tabify data
  if @exp.get("do_parse") || @exp.get("do_lemmatize") || @exp.get("do_postag")
    $stderr.puts "Frprep: making input for syn. processing"

    Dir[stxml_dir + "*" + @file_suffixes["stxml"]].each do |stxmlfilename|
      tabfilename = tab_dir + File.basename(stxmlfilename, @file_suffixes["stxml"]) + @file_suffixes["tab"]
      FrprepHelper.stxml_to_tab_file(stxmlfilename, tabfilename, exp)
    end
  end

  ###
  # POS-tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    unless @exp.get("pos_tagger_path") && @exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("pos_tagger", @exp.get("pos_tagger"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("pos_tagger_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["pos"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    unless @exp.get("lemmatizer_path") && @exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("lemmatizer", @exp.get("lemmatizer"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("lemmatizer_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["lemma"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Parsing, production of SalsaTigerXML output

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  sys_class_names = {}
  [["do_postag", "pos_tagger"],
   ["do_lemmatize", "lemmatizer"],
   ["do_parse", "parser"]].each do |service, system_name|
    # only services that are switched on take part
    sys_class_names[system_name] = @exp.get(system_name) if @exp.get(service)
  end
  interpreter_class = SynInterfaces.get_interpreter(sys_class_names)
  raise "Shouldn't be here" unless interpreter_class

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => tab_dir,
                           "stxml_dir" => stxml_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      oldxml = nil
      if @exp.get("do_parse")
        # read old SalsaTigerXML file
        # so we can integrate the old file's semantics later.
        # We assume that the old and the new file have the same name,
        # ending in .xml.
        oldxml = [] # array of sentence strings
        oldxmlfile = FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
        oldxmlfile.scan_s { |sent_string| oldxml << sent_string }
      end

      outfile.puts SalsaTigerXMLHelper.get_header
      index = 0
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence do |st_sent, tabformat_sent, mapping|
        # parsed? then integrate semantics and lemmas from old file
        if @exp.get("do_parse")
          oldsent_string = oldxml[index]
          index += 1
          if oldsent_string
            # we have both an old and a new sentence, so integrate semantics
            oldsent = stxml_old_sentence(oldsent_string, exp)
            next if st_sent.nil?

            if FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp) == false
              # integration was rejected: try once more with the
              # following old sentence
              oldsent_string = oldxml[index]
              index += 1
              if oldsent_string
                oldsent = stxml_old_sentence(oldsent_string, exp)
                FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
              end
            end
          else
            # no corresponding old sentence for this new sentence
            $stderr.puts "Warning: transporting semantics -- missing source sentence, skipping"
          end
        end

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # repair syn/sem mapping problems?
        if @exp.get("fe_syn_repair") || @exp.get("fe_rel_repair")
          FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
        end

        outfile.puts st_sent.get
      end # each ST sentence
      outfile.puts SalsaTigerXMLHelper.get_footer
    ensure
      # close (and thereby flush) the output file, also on errors
      outfile.close
    end
  end # each file parsed
end

# Turn an old sentence string into a SalsaTigerSentence.
# For the Berkeley parser, ( and ) are substituted in place by
# *LRB* and *RRB* first (modified by ines, 27/08/08).
def stxml_old_sentence(oldsent_string, exp)
  if exp.get("parser") == "berkeley"
    oldsent_string.gsub!(/word='\('/, "word='*LRB*'")
    oldsent_string.gsub!(/word='\)'/, "word='*RRB*'")
    oldsent_string.gsub!(/word=\"\(\"/, "word='*LRB*'")
    oldsent_string.gsub!(/word=\"\)\"/, "word='*RRB*'")
  end

  SalsaTigerSentence.new(oldsent_string)
end
643
+
644
+
645
+ ###################################
646
+ # general file iterators
647
+
648
+ # yields pairs of [infile name, outfile stream]
649
# Rewrite every matching file of a directory via a temporary file.
#
# For each file the block receives the pair [infile name, outfile stream];
# whatever the block writes to the stream replaces the file afterwards.
# The previous content is kept as <filename>.bak.
#
# @param dir [String] directory name (concatenated as-is with the pattern)
# @param suffix [String] filename suffix/pattern, appended to "*", e.g. ".xml"
# @yield [Array(String, Tempfile)] infile name and outfile stream
def change_each_file_in_dir(dir, suffix)
  Dir[dir + "*#{suffix}"].each do |filename|
    tempfile = Tempfile.new("FrprepHelper")
    begin
      yield [filename, tempfile]

      # move temp file to original file location;
      # argument list form: no shell involved, so file names with spaces
      # or shell metacharacters are handled correctly
      tempfile.close
      system("cp", filename, "#{filename}.bak")
      system("mv", tempfile.path, filename)
    ensure
      # also removes the temp file if the block raised
      tempfile.close(true)
    end
  end # each file
end
662
+
663
+ #######
664
+ # change_each_stxml_file_in_dir
665
+ #
666
+ # use change_each_file_in_dir, but assume that the files
667
+ # are SalsaTigerXML files: Keep file headers and footers,
668
+ # and just offer individual sentences for changing
669
+ #
670
+ # Yields SalsaTigerSentence objects, each sentence to be changed
671
# Rewrite every "*.xml" file of a directory, treating it as SalsaTigerXML:
# file header and footer are copied unchanged, each sentence is offered to
# the block for changing and written back afterwards.
#
# @param dir [String] directory name
# @yield [SalsaTigerSentence] each sentence, to be changed by the block
def change_each_stxml_file_in_dir(dir)
  change_each_file_in_dir(dir, "*.xml") do |stxml_path, out|
    parts = FilePartsParser.new(stxml_path)

    # header first
    out.puts parts.head

    # then every sentence, after the caller had a chance to change it
    parts.scan_s do |raw_sentence|
      sentence = SalsaTigerSentence.new(raw_sentence)
      yield sentence
      out.puts sentence.get
    end

    # footer last
    out.puts parts.tail
    parts.close
  end
end
692
+ end
693
+ end