shalmaneser-frappe 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/frappe/Ampersand.rb +41 -0
  7. data/lib/frappe/file_parser.rb +126 -0
  8. data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
  9. data/lib/frappe/frappe.rb +217 -0
  10. data/lib/frappe/frappe_flat_syntax.rb +89 -0
  11. data/lib/frappe/frappe_read_stxml.rb +48 -0
  12. data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
  13. data/lib/frappe/interfaces/collins_interface.rb +340 -0
  14. data/lib/frappe/interfaces/counter.rb +19 -0
  15. data/lib/frappe/interfaces/stanford_interface.rb +353 -0
  16. data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
  17. data/lib/frappe/interfaces/treetagger_module.rb +111 -0
  18. data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
  19. data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
  20. data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
  21. data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
  22. data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
  23. data/lib/frappe/interpreters/headz.rb +265 -0
  24. data/lib/frappe/interpreters/headz_helpers.rb +54 -0
  25. data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
  26. data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
  27. data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
  28. data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
  29. data/lib/frappe/one_parsed_file.rb +31 -0
  30. data/lib/frappe/opt_parser.rb +92 -0
  31. data/lib/frappe/path.rb +199 -0
  32. data/lib/frappe/plain_converter.rb +59 -0
  33. data/lib/frappe/salsa_tab_converter.rb +154 -0
  34. data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
  35. data/lib/frappe/stxml_converter.rb +666 -0
  36. data/lib/frappe/syn_interface.rb +76 -0
  37. data/lib/frappe/syn_interface_stxml.rb +173 -0
  38. data/lib/frappe/syn_interface_tab.rb +39 -0
  39. data/lib/frappe/utf_iso.rb +27 -0
  40. data/lib/shalmaneser/frappe.rb +1 -0
  41. metadata +130 -0
@@ -0,0 +1,666 @@
1
+ require 'logging'
2
+ require 'fileutils'
3
+ require 'external_systems'
4
+ require 'frappe/file_parser'
5
+ require 'salsa_tiger_xml/file_parts_parser'
6
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
7
+ require 'salsa_tiger_xml/salsa_tiger_xml_helper'
8
+ require 'salsa_tiger_xml/corpus'
9
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
10
+ require 'tabular_format/fn_tab_format_file'
11
+ require 'frappe/fix_syn_sem_mapping'
12
+
13
+ module Shalmaneser
14
+ module Frappe
15
+ class STXMLConverter
16
# Create a converter bound to one experiment configuration.
#
# @param exp [FrappeConfigData] experiment configuration object
def initialize(exp)
  @exp = exp
  # @todo Implement the logger as a mixin for all classes.
  @logger = LOGGER
  # Suffixes for the different kinds of intermediate/output files.
  @file_suffixes = {
    "lemma" => ".lemma",
    "pos" => ".pos",
    "tab" => ".tab",
    "stxml" => ".xml"
  }
end
23
#############################################
# transform_stxml_dir
#
# Transformation for SalsaTigerXML data.
#
# - If the input format was SalsaTigerXML:
#   - Tag, lemmatize and parse, if the experiment file tells you so.
#
# - If the origin is the Salsa corpus:
#   Change frame names from Unknown\d+ to lemma_Unknown\d+
#
# - fix multiword lemmas, or at least try
# - transform to UTF 8
#
# @param parse_dir [String] name of directory for parse data
# @param tab_dir [String] name of directory for split/tab data
# @param input_dir [String] name of input directory
# @param output_dir [String] name of final output directory
def transform_stxml_dir(parse_dir, tab_dir, input_dir, output_dir)
  ####
  # Data preparation

  # Data with Salsa as origin:
  # remember the target lemma as an attribute on the
  # <target> elements
  #
  # currently deactivated: encoding problems
  # if @exp.get("origin") == "SalsaTiger"
  #   $stderr.puts "Frprep: noting target lemmas"
  #   changed_input_dir = frprep_dirname("salsalemma", "new")
  #   note_salsa_targetlemmas(input_dir, changed_input_dir)

  #   # remember changed input dir as input dir
  #   input_dir = changed_input_dir
  # end

  # If data is to be parsed, split input files,
  # else copy data to stxml_indir.
  # stxml_dir: directory where SalsaTiger data is situated
  if @exp.get("do_parse")
    # split data
    stxml_splitdir = frprep_dirname("stxml_split", "new")
    stxml_dir = stxml_splitdir

    LOGGER.info "#{PROGRAM_NAME}: Splitting the input data into #{stxml_dir}."

    stxml_split_dir(input_dir, stxml_splitdir, @exp.get("parser_max_sent_num"), @exp.get("parser_max_sent_len"))
  else
    # no parsing: copy data to split dir
    stxml_dir = parse_dir

    LOGGER.info "#{PROGRAM_NAME}: Copying data to #{stxml_dir}"

    Dir[input_dir + "*.xml"].each { |f| FileUtils.cp(f, stxml_dir) }
  end

  # Some syntactic processing will take place:
  # tabify data
  if @exp.get("do_parse") || @exp.get("do_lemmatize") || @exp.get("do_postag")
    LOGGER.info "#{PROGRAM_NAME}: Making input for syn. processing."

    Dir[stxml_dir + "*" + @file_suffixes["stxml"]].each do |stxmlfilename|
      tabfilename = tab_dir + File.basename(stxmlfilename, @file_suffixes["stxml"]) + @file_suffixes["tab"]
      stxml_to_tab_file(stxmlfilename, tabfilename)
    end
  end

  ###
  # POS-tagging
  if @exp.get("do_postag")
    LOGGER.info "#{PROGRAM_NAME}: Tagging."
    sys_class = ExternalSystems.get_interface("pos_tagger", @exp.get("pos_tagger"))
    sys = sys_class.new(@exp.get("pos_tagger_path"), @file_suffixes["tab"], @file_suffixes["pos"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Lemmatization
  if @exp.get("do_lemmatize")
    LOGGER.info "#{PROGRAM_NAME}: Lemmatizing."
    sys_class = ExternalSystems.get_interface("lemmatizer", @exp.get("lemmatizer"))
    sys = sys_class.new(@exp.get("lemmatizer_path"), @file_suffixes["tab"], @file_suffixes["lemma"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Parsing, production of SalsaTigerXML output

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  sys_class_names = {}

  [["do_postag", "pos_tagger"], ["do_lemmatize", "lemmatizer"], ["do_parse", "parser"]].each do |service, system_name|
    # yes, perform this service
    if @exp.get(service)
      sys_class_names[system_name] = @exp.get(system_name)
    end
  end

  interpreter_class = ExternalSystems.get_interpreter(sys_class_names)

  unless interpreter_class
    raise "Shouldn't be here"
  end

  parse_obj = FileParser.new(@exp, @file_suffixes, parse_dir, "tab_dir" => tab_dir, "stxml_dir" => stxml_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    LOGGER.debug "Writing #{outfilename}."
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end
    if @exp.get("do_parse")
      # read old SalsaTigerXML file
      # so we can integrate the old file's semantics later
      # array of sentence strings
      oldxml = []
      # we assume that the old and the new file have the same name,
      # ending in .xml.
      oldxmlfile = STXML::FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
      oldxmlfile.scan_s do |sent_string|
        # remember this sentence by its position in the file
        oldxml << sent_string
      end
    end
    outfile.puts STXML::SalsaTigerXMLHelper.get_header
    index = 0
    # work with triples
    # SalsaTigerSentence, FNTabSentence,
    # hash: tab sentence index(integer) -> array:SynNode
    parsed_file_obj.each_sentence do |st_sent, tabformat_sent, mapping|
      # parsed? then integrate semantics and lemmas from old file
      if @exp.get("do_parse")
        oldsent_string = oldxml[index]
        index += 1
        if oldsent_string
          oldsent_string = escape_berkeley_chars(oldsent_string)
          # we have both an old and a new sentence, so integrate semantics
          oldsent = STXML::SalsaTigerSentence.new(oldsent_string)

          next if st_sent.nil?

          # NOTE(review): when integration fails, this retries once with the
          # NEXT old sentence (index has already advanced) — presumably a
          # resync attempt for old files containing an extra sentence;
          # confirm against upstream history before touching this logic.
          unless integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
            oldsent_string = oldxml[index]
            index += 1
            if oldsent_string
              oldsent_string = escape_berkeley_chars(oldsent_string)
              # we have both an old and a new sentence, so integrate semantics
              oldsent = STXML::SalsaTigerSentence.new(oldsent_string)

              integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
            end
          end
        else
          # no corresponding old sentence for this new sentence
          @logger.warn "Warning: Transporting semantics - missing source sentence, skipping"
        end
      end
      # remove pseudo-frames from FrameNet data
      remove_deprecated_frames(st_sent, @exp)
      # repair syn/sem mapping problems?
      if @exp.get("fe_syn_repair") || @exp.get("fe_rel_repair")
        FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
      end

      outfile.puts st_sent.get
    end # each ST sentence
    outfile.puts STXML::SalsaTigerXMLHelper.get_footer
  end # each file parsed
end
194
+
195
####
# Transform a SalsaTigerXML file into a tab-format file:
# one line per terminal token, one blank line per sentence.
#
# @param input_filename [String] Name of input file.
# @param output_filename [String] Name of output file.
def stxml_to_tab_file(input_filename, output_filename)
  corpus = STXML::Corpus.new(input_filename)

  File.open(output_filename, 'w') do |out|
    corpus.each_sentence do |sentence|
      # the corpus reader is expected to yield raw XML elements
      raise 'Interface changed!!!' unless sentence.is_a?(Nokogiri::XML::Element)

      sentence_id = sentence.attributes['id'].value

      # one tab line per terminal (<t>) node
      sentence.xpath('.//t').each do |terminal|
        token = STXML::SalsaTigerXMLHelper.unescape(terminal.attributes['word'].value)

        # @todo AB: I don't know why the Berkeley Parser wants this.
        #   Investigate if every grammar needs this conversion and try to
        #   move it from FrappeHelper to BerkeleyInterface.
        if @exp.get("parser") == "berkeley"
          token.gsub!(/\(/, "*LRB*")
          token.gsub!(/\)/, "*RRB*")
          token.gsub!(/``/, '"')
          token.gsub!(/''/, '"')
          token.gsub!(%r{\&apos;\&apos;}, '"')
        end

        out.puts FNTabFormatFile.format_str('word' => token, 'sent_id' => sentence_id)
      end

      # blank line terminates the sentence block
      out.puts
    end
  end
end
228
###############
# frprep_dirname:
# Build the directory name for frprep-internal data
# of the kind described by +subdir+.
#
# frprep_directory has one subdirectory for each experiment ID,
# and below that there is one subdir per subtask.
#
# If this is a new directory, it is constructed;
# if it should be an existing directory, its existence is checked.
#
# @param subdir [String] designator of a subdirectory
# @param neu [Nil] non-nil: this may be a new directory
def frprep_dirname(subdir, neu = nil)
  path = File.new_dir(@exp.get("frprep_directory"),
                      @exp.get("prep_experiment_ID"),
                      subdir)

  if neu
    File.new_dir(path)
  else
    File.existing_dir(path)
  end
end
245
+
246
+
247
# Berkeley-parser compatibility: rewrite parenthesis tokens inside
# word attributes of a SalsaTigerXML sentence string to the
# *LRB*/*RRB* convention. No-op for any other parser.
# (modified by ines, 27/08/08)
#
# @note AB: Duplicated code!! Move it to the Berkeley Interface.
# @param str [String] sentence string; modified in place
# @return [String] the (possibly modified) input string
def escape_berkeley_chars(str)
  if @exp.get("parser") == "berkeley"
    replacements = {
      /word='\('/ => "word='*LRB*'",
      /word='\)'/ => "word='*RRB*'",
      /word=\"\(\"/ => "word='*LRB*'",
      /word=\"\)\"/ => "word='*RRB*'"
    }
    replacements.each { |pattern, substitute| str.gsub!(pattern, substitute) }
  end

  str
end
260
+
261
+
262
####
# stxml_split_dir
#
# Split SalsaTigerXML files into new files of given length,
# skipping sentences that are too long.
#
# At the same time, sentences that occur several times (i.e. sentences which are
# annotated by SALSA for more than one predicate) are compacted into one occurrence
# with combined semantics.
#
# Assumes that all files in input_dir with
# extension .xml are SalsaTigerXML files.
#
# @param input_dir [String] input directory with STXML files
# @param split_dir [String] output directory
# @param max_sentnum [Integer] max num of sentences per file (unused while splitting is disabled)
# @param max_sentlen [Integer] max num of terminals per sentence (unused while splitting is disabled)
def stxml_split_dir(input_dir, # string: input directory with STXML files
                    split_dir, # string: output directory
                    max_sentnum, # integer: max num of sentences per file
                    max_sentlen) # integer: max num of terminals per sentence

  # @note AB: Effectively copying all files.
  Dir["#{input_dir}*.xml"].each do |file|
    FileUtils.cp file, split_dir
  end

  # @note AB: Splitting is switched off for now (the algorithms are weird);
  #   the disabled implementation is kept below for reference.
=begin
  $stderr.puts "Frprep: splitting data"

  filenames = Dir[input_dir + "*.xml"].to_a

  graph_hash = {} # for each sentence id, keep <s...</graph>
  frame_hash = {} # for each sentence id , keep the <frame... </frame> string
  uspfes_hash = {} # for each sentence id, keep the uspfes stuff
  uspframes_hash = {} # for each sentence id, keep the uspframes stuff

  ########################
  # Traverse of file(s): compute an index of all frames for each sentence, with unique identifiers

  filenames.each { |filename|

    infile = STXML::FilePartsParser.new(filename)
    infile.scan_s { |sent_str|

      sentlen = 0
      sent_str.delete("\n").scan(/<t\s/) { |occ| sentlen += 1}
      if sentlen > max_sentlen
        sent = STXML::RegXML.new(sent_str)
        # revisit handling of long sentences
        # $stderr.puts "I would have skipped overly long sentence " + sent.attributes["id"]+" but Sebastian forbade me.".to_s
        # next
      end

      # substitute old frame identifiers with new, unique ones

      # problem: we may have several frames per sentence, and need to keep track of them
      # if we rename etc sxx_f1 to sxx_f2 and there is already a sxx_f2, then
      # we cannot distinguish between these frames

      # therefore, we substitute temporary identifiers until we have substituted
      # all ids with temporary ones, and re-substitute final ones at the end.

      this_frames = []

      temp_subs = []
      final_subs = []

      sent = STXML::RegXML.new(sent_str)
      sentid = sent.attributes["id"].to_s
      if sentid.nil?
        STDERR.puts "[frprep] Warning: cannot find sentence id, skipping sentence:"
        STDERR.puts sent_str
        # strange sentence, no ID? skip
        next
      end

      unless frame_hash.key? sentid
        frame_hash[sentid] = []
        uspfes_hash[sentid] = []
        uspframes_hash[sentid] = []
      end

      # find everything up to and including the graph
      sent_children = sent.children_and_text
      graph = sent_children.detect { |child| child.name == "graph" }
      graph_hash[sentid] = "<s " +
                           sent.attributes.to_a.map { |at, val| "#{at}=\'#{val}\'" }.join(" ") +
                           ">" +
                           graph.to_s

      # find the usp block

      sem = sent_children.detect { |child| child.name == "sem"}
      usp = ""
      if sem
        usp = sem.children_and_text.detect { |child| child.name == "usp" }
        usp = usp.to_s
      end

      # find all frames
      if sem
        frames = sem.children_and_text.detect { |child| child.name == "frames" }
        if frames
          frames.children_and_text.each { |frame|
            unless frame.name == "frame"
              next
            end
            frameid = frame.attributes["id"]

            temp_frameid = "#{sentid}_temp_f#{frame_hash[sentid].length + this_frames.length + 1}"
            final_frameid = "#{sentid}_f#{frame_hash[sentid].length + this_frames.length + 1}"

            temp_subs << [frameid, temp_frameid]
            final_subs << [temp_frameid, final_frameid]

            this_frames << frame.to_s
          }
        end
      end

      # now first rename all the frames to temporary names

      temp_subs.each {|orig_frameid, temp_frameid|
        this_frames.map! {|frame_str|
          #print "orig ", orig_frameid, " temp ", temp_frameid, "\n"
          frame_str.gsub(orig_frameid,temp_frameid)
        }

        usp.gsub!(orig_frameid,temp_frameid)
      }

      # and re-rename the temporary names

      final_subs.each {|temp_frameid, final_frameid|
        this_frames.map! {|frame_str|
          frame_str.gsub(temp_frameid,final_frameid)
        }
        usp.gsub!(temp_frameid, final_frameid)
      }

      # store frames in data structure
      this_frames.each {|frame_str|
        frame_hash[sentid] << frame_str
      }

      # store uspfes in data structure
      unless usp.empty?
        usp_elt = STXML::RegXML.new(usp)
        uspfes = usp_elt.children_and_text.detect { |child| child.name == "uspfes" }
        uspfes.children_and_text.each { |child|
          unless child.name == "uspblock"
            next
          end
          uspfes_hash[sentid] << child.to_s
        }

        # store uspframes in data structure
        uspframes = usp_elt.children_and_text.detect { |child| child.name == "uspframes" }
        uspframes.children_and_text.each { |child|
          unless child.name == "uspblock"
            next
          end
          uspframes_hash[sentid] << child.to_s
        }
      end
    }
  }

  # now write everything in the data structure back to a file

  filecounter = 0
  sentcounter = 0
  outfile = nil
  sent_stack = []

  graph_hash = graph_hash.sort { |a, b| a[0].to_i <=> b[0].to_i }

  graph_hash.each do |sentid, graph_str|
    unless outfile
      outfile = File.new(split_dir + filecounter.to_s + ".xml", "w")
      outfile.puts STXML::SalsaTigerXMLHelper.get_header
      filecounter += 1
      sentcounter = 0
    end

    xml = []
    xml << graph_str
    xml << "<sem>"
    xml << "<globals>"
    xml << "</globals>"
    xml << "<frames>"

    frame_hash[sentid].each { |frame_str| xml << frame_str }

    xml << "</frames>"
    xml << "<usp>"
    xml << "<uspframes>"

    uspframes_hash[sentid].each { |uspblock_str| xml << uspblock_str }

    xml << "</uspframes>"
    xml << "<uspfes>"

    uspfes_hash[sentid].each { |uspblock_str| xml << uspblock_str }

    xml << "</uspfes>"
    xml << "</usp>"
    xml << "</sem>"
    xml << "</s>"

    outfile.puts xml.join("\n")
    sentcounter += 1
  end

  if outfile
    outfile.puts STXML::SalsaTigerXMLHelper.get_footer
    outfile.close
    outfile = nil
  end
=end
end
482
+
483
+
484
#####################
#
# Integrate the semantic annotation of an old sentence
# into the corresponding new sentence.
# At the same time, integrate the lemma information from the
# old sentence into the new sentence.
#
# @param oldsent [STXML::SalsaTigerSentence] sentence carrying semantics and lemmas
# @param newsent [STXML::SalsaTigerSentence] freshly parsed sentence to enrich
# @param interpreter_class [Class] interpreter matching the parser/tagger combination
# @param exp [FrappeConfigData] experiment configuration (currently unused here)
# @return [false, nil] false when old and new terminals cannot be matched
#   (nothing is copied then); otherwise nil — the method works by side
#   effect on newsent
def integrate_stxml_semantics_and_lemmas(oldsent,
                                         newsent,
                                         interpreter_class,
                                         exp)
  if oldsent.nil? or newsent.nil?
    return
  end
  ##
  # match old and new sentence via terminals
  newterminals = newsent.terminals_sorted
  oldterminals = oldsent.terminals_sorted
  # sanity check: exact match on terminals?
  # NOTE(review): `interleave` is a project Array extension — presumably it
  # pairs elements positionally, padding the shorter list with nil; confirm
  # in the project's core extensions.
  newterminals.interleave(oldterminals).each { |newnode, oldnode|
    #print "old ", oldnode.word, " ", newnode.word, "\n"
    # new and old word: use both unescaped and escaped variant
    if newnode
      newwords = [ newnode.word, STXML::SalsaTigerXMLHelper.escape(newnode.word) ]
    else
      newwords = [nil, nil]
    end
    if oldnode
      oldwords = [ oldnode.word, STXML::SalsaTigerXMLHelper.escape(oldnode.word) ]
    else
      oldwords = [ nil, nil]
    end

    if (newwords & oldwords).empty?
      # old and new word don't match, either escaped or non-escaped:
      # give up on this sentence pair (returns from the whole method)

      $stderr.puts "Warning: could not match terminals of sentence #{newsent.id}"
      $stderr.puts "This means that I cannot match the semantic annotation"
      $stderr.puts "to the newly parsed sentence. Skipping."
      #$stderr.puts "Old sentence: "
      #$stderr.puts oldterminals.map { |n| n.word }.join("--")
      #$stderr.puts "New sentence: "
      #$stderr.puts newterminals.map { |n| n.word }.join("--")
      return false
    end
  }

  ##
  # copy lemma information (terminals correspond by position)
  oldterminals.each_with_index { |oldnode, ix|
    newnode = newterminals[ix]
    if oldnode.get_attribute("lemma")
      newnode.set_attribute("lemma", oldnode.get_attribute("lemma"))
    end
  }

  ##
  # copy frames
  oldsent.each_frame { |oldframe|
    # make new frame with same ID
    newframe = newsent.add_frame(oldframe.name, oldframe.id)
    # copy FEs
    oldframe.each_child { |oldfe|
      # new nodes: map old terminals to new terminals,
      # then find max constituents covering them
      newnodes = oldfe.descendants.select { |n|
        n.is_terminal?
      }.map { |n|
        oldterminals.index(n)
      }.map { |ix|
        newterminals[ix]
      }

      # let the interpreter class decide on how to determine the maximum constituents
      newnodes = interpreter_class.max_constituents(newnodes, newsent)

      # make new FE with same ID
      new_fe = newsent.add_fe(newframe, oldfe.name, newnodes, oldfe.id)
      # keep all attributes of the FE
      if oldfe.get_f("attributes")
        oldfe.get_f("attributes").each_pair { |attr, value|
          new_fe.set_attribute(attr, value)
        }
      end
    }
  }

  ##
  ### changed by ines => appears twice in stxml file

  # copy underspecification
  # keep as is, since we've kept all frame and FE IDs
  oldsent.each_usp_frameblock { |olduspframe|
    newuspframe = newsent.add_usp("frame")
    olduspframe.each_child { |oldnode|
      newnode = newsent.sem_node_with_id(oldnode.id)
      if newnode
        newuspframe.add_child(newnode)
      else
        $stderr.puts "Error: unknown frame with ID #{oldnode.id}"
      end
    }
  }
  oldsent.each_usp_feblock { |olduspfe|
    newuspfe = newsent.add_usp("fe")
    olduspfe.each_child { |oldnode|
      newnode = newsent.sem_node_with_id(oldnode.id)
      if newnode
        newuspfe.add_child(newnode)
      else
        $stderr.puts "Error: unknown FE with ID #{oldnode.id}"
      end
    }
  }

end
599
####
# note salsa targetlemma
#
# old_dir contains xml files whose name starts with the
# target lemma for all frames in the file;
# record that target lemma in the <target> element of each frame.
#
# @param old_dir [String] input directory, path ending in /
# @param new_dir [String] output directory, path ending in /
def note_salsa_targetlemma(old_dir, # string ending in /
                           new_dir) # string ending in /

  # each input file: extract target lemma from filename,
  # note this lemma in the <target> element of each frame
  Dir[old_dir + "*.xml"].each { |filename|
    changedfilename = new_dir + File.basename(filename)

    # lemma = filename part before the first "_" or "."
    if File.basename(filename) =~ /^(.*?)[_\.]/
      lemma = $1

      infile = STXML::FilePartsParser.new(filename)
      outfile = File.new(changedfilename, "w")

      # write header
      outfile.puts infile.head

      # iterate through sentences, yield as SalsaTigerSentence objects
      infile.scan_s { |sent_string|
        sent = STXML::SalsaTigerSentence.new(sent_string)
        sent.each_frame { |frame|
          frame.target.set_attribute("lemma", lemma)
        }

        # write changed sentence
        outfile.puts sent.get
      } # each sentence

      # write footer
      outfile.puts infile.tail
      infile.close
      outfile.close
    else
      # couldn't determine lemma: just copy the file.
      # BUGFIX: this was a shell-out with broken interpolation
      # (`cp #(unknown) #{changedfilename}` — "#(" is not interpolated in
      # Ruby, so the shell saw a comment starting at "#" and cp never
      # received a source operand; the copy silently failed).
      FileUtils.cp(filename, changedfilename)
    end
  }
end
646
+
647
###################
# Given a SalsaTigerSentence, remove all FrameNet frames that are
# merely test frames: "Boulder" or any frame whose name starts with "Test".
#
# Does nothing unless the experiment origin is "FrameNet".
#
# @param sent [SalsaTigerSentence]
# @param exp [FrprepConfigData]
def remove_deprecated_frames(sent, exp)
  # only FrameNet data contains these pseudo-frames
  return unless exp.get("origin") == "FrameNet"

  sent.frames.each do |frame|
    pseudo_frame = frame.name == "Boulder" || frame.name =~ /^Test/
    sent.remove_frame(frame) if pseudo_frame
  end
end
664
+ end
665
+ end
666
+ end