shalmaneser-frappe 1.2.rc5

Files changed (41)
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/frappe/Ampersand.rb +41 -0
  7. data/lib/frappe/file_parser.rb +126 -0
  8. data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
  9. data/lib/frappe/frappe.rb +217 -0
  10. data/lib/frappe/frappe_flat_syntax.rb +89 -0
  11. data/lib/frappe/frappe_read_stxml.rb +48 -0
  12. data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
  13. data/lib/frappe/interfaces/collins_interface.rb +340 -0
  14. data/lib/frappe/interfaces/counter.rb +19 -0
  15. data/lib/frappe/interfaces/stanford_interface.rb +353 -0
  16. data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
  17. data/lib/frappe/interfaces/treetagger_module.rb +111 -0
  18. data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
  19. data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
  20. data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
  21. data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
  22. data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
  23. data/lib/frappe/interpreters/headz.rb +265 -0
  24. data/lib/frappe/interpreters/headz_helpers.rb +54 -0
  25. data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
  26. data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
  27. data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
  28. data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
  29. data/lib/frappe/one_parsed_file.rb +31 -0
  30. data/lib/frappe/opt_parser.rb +92 -0
  31. data/lib/frappe/path.rb +199 -0
  32. data/lib/frappe/plain_converter.rb +59 -0
  33. data/lib/frappe/salsa_tab_converter.rb +154 -0
  34. data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
  35. data/lib/frappe/stxml_converter.rb +666 -0
  36. data/lib/frappe/syn_interface.rb +76 -0
  37. data/lib/frappe/syn_interface_stxml.rb +173 -0
  38. data/lib/frappe/syn_interface_tab.rb +39 -0
  39. data/lib/frappe/utf_iso.rb +27 -0
  40. data/lib/shalmaneser/frappe.rb +1 -0
  41. metadata +130 -0
data/lib/frappe/salsa_tab_converter.rb
@@ -0,0 +1,154 @@
+ require 'logging'
+ require 'fileutils' # FileUtils.cp is used in split_dir below
+ require 'external_systems'
+
+ module Shalmaneser
+   module Frappe
+     class SalsaTabConverter
+       def initialize(exp)
+         @exp = exp
+         # suffixes for the different types of output files
+         @file_suffixes = { "lemma" => ".lemma", "pos" => ".pos", "tab" => ".tab", "stxml" => ".xml" }
+       end
+
+       ###############
+       # transform_pos_and_lemmatize
+       #
+       # transformation for Tab format files:
+       #
+       # - split into parser-size chunks
+       # - POS-tag, lemmatize
+       # @param [String] input_dir Input directory.
+       # @param [String] output_dir Output directory.
+       def transform_pos_and_lemmatize(input_dir, output_dir)
+         ##
+         # split the TabFormatFile into chunks of max_sent_num size
+         split_dir(input_dir, output_dir, @file_suffixes["tab"],
+                   @exp.get("parser_max_sent_num"), @exp.get("parser_max_sent_len"))
+
+         ##
+         # POS-tagging
+         if @exp.get("do_postag")
+           LOGGER.info "#{PROGRAM_NAME}: Tagging."
+
+           sys_class = ExternalSystems.get_interface("pos_tagger", @exp.get("pos_tagger"))
+
+           # AB: TODO Remove it.
+           unless sys_class
+             raise "Shouldn't be here"
+           end
+
+           LOGGER.debug "POS tagger interface: #{sys_class}."
+           sys = sys_class.new(@exp.get("pos_tagger_path"), @file_suffixes["tab"], @file_suffixes["pos"])
+           sys.process_dir(output_dir, output_dir)
+         end
+
+         ##
+         # Lemmatization
+         # AB: We're working on the <split> dir and writing there.
+         if @exp.get("do_lemmatize")
+           LOGGER.info "#{PROGRAM_NAME}: Lemmatizing."
+
+           sys_class = ExternalSystems.get_interface("lemmatizer", @exp.get("lemmatizer"))
+           # AB: TODO Make this exception explicit.
+           unless sys_class
+             raise 'I got an empty interface class for the lemmatizer!'
+           end
+
+           sys = sys_class.new(@exp.get("lemmatizer_path"), @file_suffixes["tab"], @file_suffixes["lemma"])
+           sys.process_dir(output_dir, output_dir)
+         end
+       end
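
A minimal usage sketch for the method above. The Hash-backed config class below is an illustrative stand-in for the gem's real experiment-file object (only #get is assumed, since that is all the converter calls on it); the tool names and paths are placeholders.

require 'shalmaneser/frappe'

# illustrative stand-in for the experiment configuration
# (FrprepConfigData in the real gem)
class FakeExp
  def initialize(settings)
    @settings = settings
  end

  def get(key)
    @settings[key]
  end
end

exp = FakeExp.new(
  "parser_max_sent_num" => 2000,
  "parser_max_sent_len" => 80,
  "do_postag"           => true,
  "pos_tagger"          => "treetagger",
  "pos_tagger_path"     => "/opt/treetagger",
  "do_lemmatize"        => true,
  "lemmatizer"          => "treetagger",
  "lemmatizer_path"     => "/opt/treetagger"
)

converter = Shalmaneser::Frappe::SalsaTabConverter.new(exp)
# copies the *.tab files into the split directory, then runs the
# configured POS tagger and lemmatizer over them
converter.transform_pos_and_lemmatize("corpus/tab/", "corpus/split/")
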
+
+
+       ###########
+       #
+       # split_dir:
+       # read all files in one directory and produce chunk files with _suffix_ in outdir,
+       # each containing at most sent_num sentences.
+       # Optionally, cut off all sentences longer than sent_leng.
+       #
+       # produces output files 0.<suffix>, 1.<suffix>, etc.
+       #
+       # assumes TabFormat sentences
+       #
+       # example: split_dir("/tmp/in", "/tmp/out", ".tab", 2000, 80)
+       def split_dir(indir, outdir, suffix, sent_num, sent_leng = nil)
+         unless indir[-1, 1] == "/"
+           indir += "/"
+         end
+         unless outdir[-1, 1] == "/"
+           outdir += "/"
+         end
+
+         # @note AB: A dummy reimplementation.
+         #   Not doing splitting at all;
+         #   I want to preserve the original file names.
+         Dir["#{indir}*#{suffix}"].each do |file|
+           FileUtils.cp file, outdir
+         end
+         # @note AB: Not doing splitting for now.
+ =begin
+         outfile_counter = 0
+         line_stack = []
+         sent_stack = []
+
+         Dir[indir + "*#{suffix}"].each do |infilename|
+           LOGGER.info "Now splitting #{infilename}."
+
+           infile = File.new(infilename)
+
+           while (line = infile.gets)
+             line.chomp!
+             case line
+             when "" # end of sentence
+               unless sent_leng.nil? || line_stack.length < sent_leng
+                 # change (sp 15 01 07): just cut off the sentence at sent_leng
+                 STDERR.puts "Cutting off long sentence #{line_stack.last.split("\t").last}"
+                 line_stack = line_stack[0...sent_leng]
+               end
+
+               # suppress multiple empty lines to avoid problems with the
+               # lemmatizer: only record sent_stack if it is not empty
+               unless line_stack.empty?
+                 sent_stack << line_stack
+                 # reset line_stack
+                 line_stack = []
+               end
+
+               # check whether we have to empty the sent stack
+               if sent_stack.length == sent_num # enough sentences for a new outfile?
+                 outfile = File.new(outdir + outfile_counter.to_s + suffix, "w")
+
+                 sent_stack.each do |l_stack|
+                   outfile.puts l_stack.join("\n")
+                   outfile.puts
+                 end
+
+                 outfile.close
+                 outfile_counter += 1
+                 sent_stack = []
+               end
+             else # any other line
+               line_stack << line
+             end
+           end
+           infile.close
+         end
+
+         # the last remaining sentences
+         unless sent_stack.empty?
+           File.open(outdir + outfile_counter.to_s + suffix, "w") do |outfile|
+             sent_stack.each do |l_stack|
+               l_stack << "\n"
+               outfile.puts l_stack.join("\n")
+             end
+           end
+         end
+ =end
+       end
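
Since the real chunking logic above is commented out, here is a compact, self-contained sketch of the same idea using only the standard library: blank-line-separated sentence blocks are truncated at sent_leng lines and written out in chunks of sent_num sentences, named 0.tab, 1.tab, and so on. The helper name split_tab_file is hypothetical and not part of the gem.

require 'fileutils'

# Split one file of blank-line-separated sentence blocks into chunk
# files 0<suffix>, 1<suffix>, ... with at most sent_num sentences each;
# sentences longer than sent_leng lines are cut off.
def split_tab_file(infile, outdir, suffix, sent_num, sent_leng = nil)
  FileUtils.mkdir_p(outdir)
  sentences = File.read(infile).split(/\n\s*\n/).map { |s| s.split("\n") }
  sentences.map! { |lines| sent_leng ? lines.first(sent_leng) : lines }
  sentences.each_slice(sent_num).with_index do |chunk, i|
    File.write(File.join(outdir, "#{i}#{suffix}"),
               chunk.map { |lines| lines.join("\n") + "\n" }.join("\n"))
  end
end

split_tab_file("corpus.tab", "chunks", ".tab", 2000, 80)
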
+
+     end
+   end
+ end

data/lib/frappe/salsa_tab_with_pos_converter.rb
@@ -0,0 +1,531 @@
+ require 'logging'
+
+ require 'salsa_tiger_xml/salsa_tiger_xml_helper'
+ require 'frappe/file_parser'
+ require 'external_systems'
+
+ module Shalmaneser
+   module Frappe
+     class SalsaTabWithPOSConverter
+       def initialize(exp)
+         @exp = exp
+         # suffixes for the different types of output files
+         @file_suffixes = { "lemma" => ".lemma", "pos" => ".pos", "tab" => ".tab", "stxml" => ".xml" }
+       end
+
+       ###############
+       # transform_salsatab
+       #
+       # transformation for Tab format files:
+       #
+       # - parse
+       # - transform parser output to SalsaTigerXML;
+       #   if there is no parsing, make a flat syntactic structure
+       # @param [String] input_dir Input directory.
+       # @param [String] parse_dir Output directory for parses.
+       # @param [String] output_dir Global output directory.
+       def transform_salsatab_dir(input_dir, parse_dir, output_dir)
+         ##
+         # (Parse and) transform to SalsaTigerXML:
+         # get the interpreter class for this
+         # parser/lemmatizer/POS tagger combination
+         interpreter_class = ExternalSystems.get_interpreter_according_to_exp(@exp)
+
+         unless interpreter_class
+           raise "Shouldn't be here"
+         end
+
+         parse_obj = FileParser.new(@exp, @file_suffixes, parse_dir, "tab_dir" => input_dir)
+
+         parse_obj.each_parsed_file do |parsed_file_obj|
+           outfilename = output_dir + parsed_file_obj.filename + ".xml"
+           LOGGER.debug "Writing #{outfilename}."
+
+           begin
+             outfile = File.new(outfilename, "w")
+           rescue
+             raise "Cannot write to SalsaTigerXML output file #{outfilename}"
+           end
+
+           outfile.puts STXML::SalsaTigerXMLHelper.get_header
+           # work with triples:
+           # SalsaTigerSentence, FNTabSentence,
+           # hash: tab sentence index (integer) -> array:SynNode
+           parsed_file_obj.each_sentence do |st_sent, tabformat_sent, mapping|
+             # parsed: add headwords using the parse tree
+             if @exp.get("do_parse")
+               add_head_attributes(st_sent, interpreter_class)
+             end
+
+             # add lemmas, if they are there; if they are not, don't print a warning
+             if @exp.get("do_lemmatize")
+               add_lemmas_from_tab(st_sent, tabformat_sent, mapping)
+             end
+
+             # add semantics:
+             # read semantic information from the tab file
+             # and combine all targets of a sentence into one frame
+             add_semantics_from_tab(st_sent, tabformat_sent, mapping, interpreter_class, @exp)
+
+             # remove pseudo-frames from FrameNet data
+             remove_deprecated_frames(st_sent, @exp)
+
+             # handle multiword targets
+             handle_multiword_targets(st_sent, interpreter_class, @exp.get("language"))
+
+             # handle unknown frame names
+             handle_unknown_framenames(st_sent, interpreter_class)
+
+             outfile.puts st_sent.get
+           end
+           outfile.puts STXML::SalsaTigerXMLHelper.get_footer
+           outfile.close
+         end
+       end
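
An illustrative call of the method above, reusing the FakeExp stand-in from the earlier sketch. The keys shown are the ones visible in this file; the FileParser step will read further parser settings from the same config object. Directory names are placeholders.

exp = FakeExp.new(
  "do_parse"     => true,
  "do_lemmatize" => true,
  "origin"       => "FrameNet",
  "language"     => "en"
)

converter = Shalmaneser::Frappe::SalsaTabWithPOSConverter.new(exp)
# reads tab files from corpus/split/, keeps parses in corpus/parses/,
# and writes one SalsaTigerXML file per input file to corpus/stxml/
converter.transform_salsatab_dir("corpus/split/", "corpus/parses/", "corpus/stxml/")
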
+
+       # add lemma information to each terminal in a given SalsaTigerSentence object
+       # @param [SalsaTigerSentence] st_sent
+       # @param [FNTabFormatSentence] tab_sent
+       # @param [Hash] mapping hash: tab lineno -> array:SynNode
+       def add_lemmas_from_tab(st_sent, tab_sent, mapping)
+         if tab_sent.nil?
+           # tab sentence not found
+           return
+         end
+
+         # produce a list of word/lemma pairs
+         lemmat = []
+         tab_sent.each_line_parsed do |line|
+           word = line.get("word")
+           lemma = line.get("lemma")
+           lemmat << [word, lemma]
+         end
+
+         # match with the st_sent terminal list and add lemma attributes.
+         # KE Jan 07: if words mismatch,
+         # use the Lemmatizer file version,
+         # but count the mismatches
+         word_mismatches = []
+
+         st_sent.each_terminal_sorted do |t|
+           matching_lineno = (0...lemmat.length).to_a.detect do |tab_lineno|
+             mapping[tab_lineno].include? t
+           end
+           unless matching_lineno
+             next
+           end
+           word, lemma = lemmat[matching_lineno]
+
+           # transform characters to their XML-friendly form
+           # for comparison with st_word, which is also escaped
+           word = STXML::SalsaTigerXMLHelper.escape(word)
+           st_word = t.word
+           if word != st_word && word != STXML::SalsaTigerXMLHelper.escape(st_word)
+             # true mismatch:
+             # use the Lemmatizer version of the word, remember the mismatch
+             word_mismatches << [st_word, word]
+             t.set_attribute("word", word)
+           end
+
+           if lemma
+             # we actually do have lemma information
+             lemmatised_head = STXML::SalsaTigerXMLHelper.escape(lemma)
+             t.set_attribute("lemma", lemmatised_head)
+           end
+         end # each terminal
+
+         # did we have mismatches? then report them
+         unless word_mismatches.empty?
+           $stderr.puts "Warning: word mismatches found between the Lemmatizer file and the SalsaTigerXML file generated from the parser output."
+           $stderr.puts "(May be due to failed re-encoding of special characters in the parser output.)"
+           $stderr.puts "I am using the Lemmatizer version by default."
+           $stderr.puts "Version used:"
+           $stderr.print "\t"
+           st_sent.each_terminal_sorted { |t| $stderr.print ">>#{t}<<" }
+           $stderr.puts
+           $stderr.print "SalsaTigerXML file had: "
+           $stderr.print word_mismatches.map { |st_word, tab_word|
+             "#{st_word} instead of #{tab_word}"
+           }.join(", ")
+           $stderr.puts
+         end
+       end
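
The comparison logic above can be reproduced in isolation. The escape function below is a simplified stand-in for STXML::SalsaTigerXMLHelper.escape, and the data is made up; a mismatch is recorded only when neither the raw nor the escaped form of the SalsaTigerXML word matches the lemmatizer word.

# simplified stand-in for STXML::SalsaTigerXMLHelper.escape
def escape(str)
  str.gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
end

# word/lemma pairs as they would come from the lemmatizer file
lemmat = [["AT&T", "AT&T"], ["rose", "rise"]]
# words as they appear on the SalsaTigerXML terminals (already escaped)
st_words = ["AT&amp;T", "roses"]

mismatches = []
lemmat.each_with_index do |(word, _lemma), i|
  word = escape(word)
  st_word = st_words[i]
  # a true mismatch only if neither form agrees
  mismatches << [st_word, word] if word != st_word && word != escape(st_word)
end
p mismatches # => [["roses", "rose"]]
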
+
+
+       ###
+       # add semantics from tab:
+       #
+       # add information about semantics from a FN tab sentence
+       # to a SalsaTigerSentence object:
+       # - frames (one frame per sentence)
+       # - roles
+       # - FrameNet grammatical functions
+       # - FrameNet POS of target
+       # @param [SalsaTigerSentence] st_sent
+       # @param [FNTabFormatSentence] tab_sent
+       # @param [Hash] mapping hash: tab lineno -> array:SynNode
+       # @param [SynInterpreter] interpreter_class interpreter class
+       # @param [FrprepConfigData] exp
+       def add_semantics_from_tab(st_sent, tab_sent, mapping, interpreter_class, exp)
+         if tab_sent.nil?
+           # tab sentence not found
+           return
+         end
+
+         # iterate through the frames in the tab sentence
+         frame_index = 0
+         tab_sent.each_frame do |tab_frame_obj|
+           frame_name = tab_frame_obj.get_frame # string
+
+           if frame_name.nil? || frame_name =~ /^-*$/
+             # weird: a frame entry without a frame
+             $stderr.puts "Warning: frame entry without a frame in tab sentence #{st_sent.id}."
+             $stderr.puts "Skipping."
+             next
+           end
+
+           frame_node = st_sent.add_frame(frame_name, tab_sent.get_sent_id + "_f#{frame_index}")
+           frame_index += 1
+
+           # target
+           target_nodes = []
+           tab_frame_obj.get_target_indices.each do |terminal_id|
+             if mapping[terminal_id]
+               target_nodes.concat mapping[terminal_id]
+             end
+           end
+
+           # let the interpreter class decide how to determine the maximum constituents
+           target_maxnodes = interpreter_class.max_constituents(target_nodes, st_sent)
+           if target_maxnodes.empty?
+             $stderr.puts "Warning: no target in frame entry, sentence #{st_sent.id}."
+             $stderr.puts "Frame is #{frame_name}, frame no. #{frame_index}."
+             $stderr.puts "Skipping."
+             $stderr.puts "Target indices: " + tab_frame_obj.get_target_indices.join(", ")
+             next
+           end
+           frame_node.add_fe("target", target_maxnodes)
+
+           # set features on the target: target lemma, target POS
+           target_lemma = tab_frame_obj.get_target
+           target_pos = nil
+           if target_lemma
+             if exp.get("origin") == "FrameNet"
+               # FrameNet data: here the lemma in the tab file has the form
+               # <lemma>.<POS>
+               # separate the two
+               if target_lemma =~ /^(.*)\.(.*)$/
+                 target_lemma = $1
+                 target_pos = $2
+               end
+             end
+             frame_node.target.set_attribute("lemma", target_lemma)
+             if target_pos
+               frame_node.target.set_attribute("pos", target_pos)
+             end
+           end
+
+           # roles, GF, PT
+           # synnode_markable_label:
+           # hash "role" | "gf" | "pt" -> SynNode -> array: label (string)
+           layer_synnode_label = {}
+           ["gf", "pt", "role"].each do |layer|
+             termids2labels = tab_frame_obj.markables(layer)
+
+             unless layer_synnode_label[layer]
+               layer_synnode_label[layer] = {}
+             end
+
+             termids2labels.each do |terminal_indices, label|
+               terminal_indices.each do |t_i|
+                 if (nodes = mapping[t_i])
+                   nodes.each do |node|
+                     unless layer_synnode_label[layer][node]
+                       layer_synnode_label[layer][node] = []
+                     end
+
+                     layer_synnode_label[layer][node] << label
+                   end # each node that t_i maps to
+                 end # if t_i maps to anything
+               end # each terminal index
+             end # each mapping terminal indices -> label
+           end # each layer
+
+           # 'stuff' (Support and other things)
+           layer_synnode_label["stuff"] = {}
+           tab_frame_obj.each_line_parsed do |line_obj|
+             if (label = line_obj.get("stuff")) != "-"
+               if (nodes = mapping[line_obj.get("lineno")])
+                 nodes.each do |node|
+                   unless layer_synnode_label["stuff"][node]
+                     layer_synnode_label["stuff"][node] = []
+                   end
+                   layer_synnode_label["stuff"][node] << label
+                 end
+               end
+             end
+           end
+
+           # re-encode:
+           # hash role_label (string) -> array of tuples [synnodes, gflabels, ptlabels]
+           # synnodes: array:SynNode; gflabels, ptlabels: array:String
+           #
+           # note that in this step, any gf or pt labels that have been
+           # assigned to a SynNode that has not also been assigned a role
+           # will be lost
+           role2nodes_labels = {}
+           layer_synnode_label["role"].each_pair do |synnode, labels|
+             labels.each do |rolelabel|
+               unless role2nodes_labels[rolelabel]
+                 role2nodes_labels[rolelabel] = []
+               end
+
+               role2nodes_labels[rolelabel] << [
+                 synnode,
+                 layer_synnode_label["gf"][synnode],
+                 layer_synnode_label["pt"][synnode]
+               ]
+             end # each role label
+           end # each pair SynNode/role labels
+
+           # re-encode "stuff", but only the Support cases
+           role2nodes_labels["Support"] = []
+
+           layer_synnode_label["stuff"].each_pair do |synnode, labels|
+             labels.each do |stufflabel|
+               if stufflabel =~ /Supp/
+                 # some sort of support
+                 role2nodes_labels["Support"] << [synnode, nil, nil]
+               end
+             end
+           end
+
+           ##
+           # each role label:
+           # make an FeNode for the current frame
+           role2nodes_labels.each_pair do |rolelabel, node_gf_pt|
+             # get the list of syn nodes, GF and PT labels for this role.
+             # shortcut for GF and PT labels: take any labels that have
+             # been assigned for _some_ SynNode of this role
+             synnodes = node_gf_pt.map { |ngp| ngp[0] }
+             gflabels = node_gf_pt.map { |ngp| ngp[1] }.compact.flatten.uniq
+             ptlabels = node_gf_pt.map { |ngp| ngp[2] }.compact.flatten.uniq
+
+             # let the interpreter class decide how to
+             # determine the maximum constituents
+             maxnodes = interpreter_class.max_constituents(synnodes, st_sent)
+
+             fe_node = st_sent.add_fe(frame_node, rolelabel, maxnodes)
+             unless gflabels.empty?
+               fe_node.set_attribute("gf", gflabels.join(","))
+             end
+             unless ptlabels.empty?
+               fe_node.set_attribute("pt", ptlabels.join(","))
+             end
+           end # each role label
+         end # each frame
+       end
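
A toy run of the re-encoding step above, with symbols standing in for SynNode objects: the per-layer node-to-label hashes become role-to-tuple lists, and any gf/pt labels on nodes that carry no role label are dropped, exactly as the comment in the source warns.

# layer -> node -> labels, as built above (symbols stand in for SynNodes)
layer = {
  "role" => { n1: ["Agent"], n2: ["Theme"] },
  "gf"   => { n1: ["Ext"] },
  "pt"   => { n1: ["NP"], n2: ["NP"] }
}

role2nodes_labels = {}
layer["role"].each_pair do |node, labels|
  labels.each do |rolelabel|
    (role2nodes_labels[rolelabel] ||= []) << [node, layer["gf"][node], layer["pt"][node]]
  end
end

p role2nodes_labels
# => {"Agent"=>[[:n1, ["Ext"], ["NP"]]], "Theme"=>[[:n2, nil, ["NP"]]]}
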
+
+
+       ######
+       # handle multiword targets:
+       # if you find a verb with a separate prefix,
+       # change the verb's lemma information accordingly
+       # and add an attribute "other_words" to the verb node
+       # pointing to the other node
+       #
+       # In general, it is assumed that "other_words" contains
+       # a list of node IDs for the other nodes belonging to the same
+       # group, node IDs separated by spaces, and that
+       # each node of a group has the "other_words" attribute.
+       # @param [SalsaTigerSentence] sent
+       # @param [SynInterpreter] interpreter interpreter class
+       # @param [String] language "en" or "de"
+       def handle_multiword_targets(sent, interpreter, language)
+         if sent.nil?
+           return
+         end
+
+         ##
+         # only retain the interesting words of the sentence:
+         # content words and prepositions
+         nodes = sent.terminals.select do |node|
+           [
+             "adj", "adv", "card", "noun", "part", "prep", "verb"
+           ].include? interpreter.category(node)
+         end
+
+         ##
+         # group:
+         # group verbs with their separate particles
+         # (at a later point, other types of grouping can be inserted here)
+         groups = group_words(nodes, interpreter)
+
+         ##
+         # record the grouping information as attributes on the terminals
+         groups.each do |descr, group_of_nodes|
+           case descr
+           when "none"
+             # no grouping
+           when "part"
+             # separate particle belonging to a verb
+
+             # group_of_nodes is a pair [verb, particle]
+             verb, particle = group_of_nodes
+
+             verb.set_attribute("other_words", particle.id)
+             particle.set_attribute("other_words", verb.id)
+
+             if verb.get_attribute("lemma") && particle.get_attribute("lemma")
+               case language
+               when "de"
+                 # German: prepend the separate verb prefix (SVP)
+                 # to get the real lemma of the verb
+                 verb.set_attribute("lemma",
+                                    particle.get_attribute("lemma") +
+                                    verb.get_attribute("lemma"))
+               else
+                 # English and default: append the particle as a separate
+                 # word after the lemma of the verb
+                 verb.set_attribute("lemma",
+                                    verb.get_attribute("lemma") + " " +
+                                    particle.get_attribute("lemma"))
+               end
+             end
+           else
+             raise "Shouldn't be here: unexpected description #{descr}"
+           end
+         end
+       end
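
The language-specific lemma composition above, extracted into a hypothetical helper so the string logic can be seen on its own: German separable prefixes are glued onto the front of the verb lemma, while English (and the default case) appends the particle as a separate word.

# hypothetical helper mirroring the branching above
def compose_particle_verb_lemma(verb_lemma, particle_lemma, language)
  case language
  when "de" then particle_lemma + verb_lemma # separable prefix rejoined
  else verb_lemma + " " + particle_lemma     # particle kept as its own word
  end
end

p compose_particle_verb_lemma("fangen", "an", "de") # => "anfangen"
p compose_particle_verb_lemma("give", "up", "en")   # => "give up"
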
+
+       ########################
+       # group_words
+       #
+       # auxiliary of handle_multiword_targets
+       #
+       # Group terminals:
+       # at the moment, just find separate prefixes and particles
+       # for verbs.
+       #
+       # returns: list of pairs [descr, nodes]
+       # descr: string, "none" (no group) or "part" (separate verb particle)
+       # nodes: array:SynNode
+       # @param [Array<SynNode>] nodes
+       # @param [SynInterpreter] interpreter interpreter class
+       def group_words(nodes, interpreter)
+         retv = [] # array of groups, array:array:SynNode
+         done = [] # remember nodes already covered
+
+         nodes.each do |terminal_node|
+           if done.include? terminal_node
+             # we have already included this node in one of the groups
+             next
+           end
+
+           if (svp = interpreter.particle_of_verb(terminal_node, nodes))
+             retv << ["part", [terminal_node, svp]]
+             done << terminal_node
+             done << svp
+           else
+             retv << ["none", [terminal_node]]
+             done << terminal_node
+           end
+         end
+
+         retv
+       end
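
A sketch of what group_words produces, with plain strings standing in for SynNode objects and a stub interpreter whose particle_of_verb pairs one fixed verb with one fixed particle. This assumes group_words is callable from outside the class, which the source neither declares private nor documents.

# stub standing in for a SynInterpreter class
class StubInterpreter
  def self.particle_of_verb(node, nodes)
    node == "picked" && nodes.include?("up") ? "up" : nil
  end
end

nodes = ["Peter", "picked", "up", "the", "book"]
conv = Shalmaneser::Frappe::SalsaTabWithPOSConverter.new(nil)
p conv.group_words(nodes, StubInterpreter)
# => [["none", ["Peter"]], ["part", ["picked", "up"]],
#     ["none", ["the"]], ["none", ["book"]]]
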
+
+       ######
+       # handle unknown frame names
+       #
+       # For all frames with names matching Unknown\d+,
+       # rename them to <lemma>_Unknown\d+
+       # @param [SalsaTigerSentence] sent
+       # @param [SynInterpreter] interpreter interpreter class
+       def handle_unknown_framenames(sent, interpreter)
+         if sent.nil?
+           return
+         end
+
+         sent.each_frame do |frame|
+           if frame.name =~ /^Unknown/
+             maintarget = if frame.target
+                            interpreter.main_node_of_expr(frame.target.children, "no_mwe")
+                          end
+
+             unless maintarget
+               $stderr.puts "Warning: Unknown frame, and I could not determine the target lemma: frame #{frame.id}"
+               $stderr.puts "Cannot repair the frame name, leaving it as is."
+               return
+             end
+
+             # get the lemma if it exists, otherwise get the word;
+             # also, if the lemmatizer has returned a disjunction of lemmas,
+             # get the first disjunct
+             lemma = interpreter.lemma_backoff(maintarget)
+             if lemma
+               # we have a lemma
+               frame.set_name(lemma + "_" + frame.name)
+             else
+               # the main target word has no lemma attribute,
+               # and somehow I couldn't even get the target word
+               $stderr.puts "Warning: Salsa 'Unknown' frame."
+               $stderr.puts "Trying to make its lemma-specificity explicit, but"
+               $stderr.puts "I could not determine the target lemma nor the target word: frame #{frame.id}"
+               $stderr.puts "Leaving 'Unknown' as it is."
+             end
+           end
+         end
+       end
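
The renaming rule in isolation: a frame named Unknown<n> gets the recovered target lemma prefixed. The values below are made up; in the real method the lemma comes from interpreter.lemma_backoff.

frame_name = "Unknown123"
lemma = "run" # stand-in for what lemma_backoff would return
frame_name = "#{lemma}_#{frame_name}" if lemma && frame_name =~ /^Unknown/
p frame_name # => "run_Unknown123"
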
+
+
+       ####################
+       # add a head attribute to each nonterminal
+       # of a SalsaTigerSentence
+       # @param [SalsaTigerSentence] st_sent
+       # @param [SynInterpreter] interpreter
+       def add_head_attributes(st_sent, interpreter)
+         st_sent.each_nonterminal do |nt_node|
+           head_term = interpreter.head_terminal(nt_node)
+           if head_term && head_term.word
+             nt_node.set_attribute("head", head_term.word)
+           else
+             nt_node.set_attribute("head", "--")
+           end
+         end # each nonterminal
+       end
+
+       ###################
+       # given a SalsaTigerSentence,
+       # look for FrameNet frames that are
+       # test frames, and remove them
+       # @param [SalsaTigerSentence] sent
+       # @param [FrprepConfigData] exp
+       def remove_deprecated_frames(sent, exp)
+         unless exp.get("origin") == "FrameNet"
+           return
+         end
+
+         sent.frames.each do |frame_obj|
+           if frame_obj.name == "Boulder" || frame_obj.name =~ /^Test/
+             sent.remove_frame(frame_obj)
+           end
+         end
+       end
+     end
+   end
+ end