shalmaneser 0.0.1.alpha → 1.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +2 -2
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +49 -0
  6. data/bin/fred +18 -0
  7. data/bin/frprep +34 -0
  8. data/bin/rosy +17 -0
  9. data/lib/common/AbstractSynInterface.rb +35 -33
  10. data/lib/common/Mallet.rb +236 -0
  11. data/lib/common/Maxent.rb +26 -12
  12. data/lib/common/Parser.rb +5 -5
  13. data/lib/common/SynInterfaces.rb +13 -6
  14. data/lib/common/TabFormat.rb +7 -6
  15. data/lib/common/Tiger.rb +4 -4
  16. data/lib/common/Timbl.rb +144 -0
  17. data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
  18. data/lib/common/headz.rb +1 -1
  19. data/lib/common/ruby_class_extensions.rb +3 -3
  20. data/lib/fred/FredBOWContext.rb +14 -2
  21. data/lib/fred/FredDetermineTargets.rb +4 -9
  22. data/lib/fred/FredEval.rb +1 -1
  23. data/lib/fred/FredFeatureExtractors.rb +4 -3
  24. data/lib/fred/FredFeaturize.rb +1 -1
  25. data/lib/frprep/CollinsInterface.rb +6 -6
  26. data/lib/frprep/MiniparInterface.rb +5 -5
  27. data/lib/frprep/SleepyInterface.rb +7 -7
  28. data/lib/frprep/TntInterface.rb +1 -1
  29. data/lib/frprep/TreetaggerInterface.rb +29 -5
  30. data/lib/frprep/do_parses.rb +1 -0
  31. data/lib/frprep/frprep.rb +36 -32
  32. data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
  33. data/lib/frprep/interfaces/stanford_interface.rb +353 -0
  34. data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
  35. data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
  36. data/lib/frprep/opt_parser.rb +2 -2
  37. data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
  38. data/lib/rosy/RosyIterator.rb +11 -10
  39. data/lib/rosy/rosy.rb +1 -0
  40. data/lib/shalmaneser/version.rb +1 -1
  41. data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
  42. data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
  43. data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
  44. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
  45. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
  46. data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
  47. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
  48. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
  49. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
  50. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
  51. data/test/functional/test_frprep.rb +3 -3
  52. data/test/functional/test_rosy.rb +20 -0
  53. metadata +215 -224
  54. data/CHANGELOG.rdoc +0 -0
  55. data/LICENSE.rdoc +0 -0
  56. data/README.rdoc +0 -0
  57. data/lib/common/CollinsInterface.rb +0 -1165
  58. data/lib/common/MiniparInterface.rb +0 -1388
  59. data/lib/common/SleepyInterface.rb +0 -384
  60. data/lib/common/TntInterface.rb +0 -44
  61. data/lib/common/TreetaggerInterface.rb +0 -303
  62. data/lib/frprep/AbstractSynInterface.rb +0 -1227
  63. data/lib/frprep/BerkeleyInterface.rb +0 -375
  64. data/lib/frprep/ConfigData.rb +0 -694
  65. data/lib/frprep/FixSynSemMapping.rb +0 -196
  66. data/lib/frprep/FrPrepConfigData.rb +0 -66
  67. data/lib/frprep/FrprepHelper.rb +0 -1324
  68. data/lib/frprep/ISO-8859-1.rb +0 -24
  69. data/lib/frprep/Parser.rb +0 -213
  70. data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
  71. data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
  72. data/lib/frprep/SynInterfaces.rb +0 -275
  73. data/lib/frprep/TabFormat.rb +0 -720
  74. data/lib/frprep/Tiger.rb +0 -1448
  75. data/lib/frprep/Tree.rb +0 -61
  76. data/lib/frprep/headz.rb +0 -338
@@ -1,1227 +0,0 @@
1
- # Katrin Erk Oct/Nov 05
2
- #
3
- # Abstract classes for interfaces for systems that provide syntactic
4
- # analysis.
5
- #
6
- # There are two types of interfaces to syntactic analysis systems:
7
- # - interfaces:
8
- # offer methods for syntactic analysis.
9
- #
10
- # SynInterfaceTab:
11
- # input and output format is (FN)TabFormat.
12
- # SynInterfaceSTXML:
13
- # input format is TabFormat, output format is
14
- # Salsa/Tiger XML, also provided as
15
- # SalsaTigerSentence objects
16
- #
17
- # - interpreters:
18
- # interpret the resulting Salsa/Tiger XML (represented as
19
- # SalsaTigerSentence and SynNode objects), e.g.
20
- # generalize over part of speech;
21
- # describe the path between a pair of nodes both as a path
22
- # and (potentially) as a grammatical function of one of the nodes;
23
- # determine whether a node describes a verb, and in which voice;
24
- # determine the head of a constituent
25
-
26
- require "tempfile"
27
-
28
- require "frprep/ruby_class_extensions"
29
-
30
- require "frprep/ISO-8859-1"
31
- require "frprep/Parser"
32
- require "frprep/SalsaTigerRegXML"
33
- require "frprep/TabFormat"
34
-
35
- #############################
36
- # abstract class, to be inherited:
37
- #
38
- # tabular format or SalsaTigerXML interface for modules
39
- # offering POS tagging, lemmatization, parsing etc.
40
- class SynInterface
41
-
42
- ###
43
- # returns a string: the name of the system
44
- # e.g. "Collins" or "TNT"
45
- def SynInterface.system()
46
- raise "Overwrite me"
47
- end
48
-
49
- ###
50
- # returns a string: the service offered
51
- # one of "lemmatizer", "parser", "pos tagger"
52
- def SynInterface.service()
53
- raise "Overwrite me"
54
- end
55
-
56
- ###
57
- # initialize to set values for all subsequent processing
58
- def initialize(program_path, # string: path to system
59
- insuffix, # string: suffix of input files
60
- outsuffix, # string: suffix for processed files
61
- var_hash = {}) # optional arguments in a hash
62
-
63
- @program_path = program_path
64
- @insuffix = insuffix
65
- @outsuffix = outsuffix
66
- end
67
-
68
- ###
69
- # process each file in in_dir with matching suffix,
70
- # producing a file in out_dir with same name but the suffix replaced
71
- #
72
- # returns: nothing
73
- def process_dir(in_dir, # string: name of input directory
74
- out_dir) # string: name of output directory
75
-
76
- Dir[in_dir+"*#{@insuffix}"].each {|infilename|
77
- outfilename = out_dir + File.basename(infilename, @insuffix) + @outsuffix
78
- process_file(infilename,outfilename)
79
- }
80
- end
81
-
82
- ###
83
- # process one file, writing the result to outfilename
84
- #
85
- # returns: nothing
86
- def process_file(infilename, # string: name of input file
87
- outfilename)
88
- raise "Overwrite me"
89
- end
90
-
91
- ######
92
- protected
93
-
94
- def SynInterface.announce_me()
95
- if defined?(SynInterfaces)
96
- # yup, we have a class to which we can announce ourselves
97
- SynInterfaces.add_interface(eval(self.name()))
98
- else
99
- # no interface collector class
100
- $stderr.puts "Interface #{self.name()} not announced: no SynInterfaces."
101
- end
102
- end
103
- end
104
-
105
- #############################
106
- # abstract class, to be inherited:
107
- #
108
- # SalsaTigerXML interface for modules
109
- # offering parsing etc.
110
- #
111
- # The input format for these classes is TabFormat or FNTabFormat
112
- class SynInterfaceSTXML < SynInterface
113
- ###
114
- # initialize to set values for all subsequent processing
115
- def initialize(program_path, # string: path to system
116
- insuffix, # string: suffix of input files
117
- outsuffix, # string: suffix for processed files
118
- stsuffix, # string: suffix for Salsa/Tiger XML files
119
- var_hash = {}) # optional arguments in a hash
120
- super(program_path, insuffix, outsuffix, var_hash)
121
- @stsuffix = stsuffix
122
- end
123
-
124
- def to_stxml_dir(in_dir, # string: name of dir with parse files
125
- out_dir) # string: name of output dir
126
-
127
- Dir[in_dir+"*#{@outsuffix}"].each { |parsefilename|
128
- stxmlfilename = out_dir + File.basename(parsefilename, @outsuffix) + @stsuffix
129
- to_stxml_file(parsefilename, stxmlfilename)
130
- }
131
- end
132
-
133
- def to_stxml_file(infilename,
134
- outfilename)
135
- raise "Overwrite me"
136
- end
137
-
138
- ###
139
- # standard mapping:
140
- #
141
- # to be used as the mapping from tab sentence words to
142
- # SalsaTigerSentence nodes returned by each_sentence():
143
- # map the n-th word of the tab sentence to the n-th terminal of
144
- # the SalsaTigerSentence
145
- def SynInterfaceSTXML.standard_mapping(sent, tabsent)
146
- retv = Hash.new
147
- if sent.nil?
148
- return nil
149
- end
150
- terminals = sent.terminals_sorted()
151
- if tabsent
152
- tabsent.each_line_parsed { |l|
153
- if (t = terminals[l.get("lineno")])
154
- retv[l.get("lineno")] = [t]
155
- else
156
- retv[l.get("lineno")] = []
157
- end
158
- }
159
- end
160
- return retv
161
- end
162
-
163
-
164
- ###
165
- # for a given processed file:
166
- # yield each sentence as a tuple
167
- # [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
168
- # of
169
- # - the sentence in SalsaTigerXML,
170
- # - the matching tab format sentence
171
- # - a mapping of terminals:
172
- # hash: line in tab sentence(integer) -> array:SynNode
173
- # mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
174
- #
175
- # default version: write Salsa/Tiger XML to tempfile, read back in
176
- # and assume that each sentence in the tab file has a correspondent
177
- # in the processed file (may not hold e.g. if the parser leaves out
178
- # sentences it cannot process)
179
- def each_sentence(infilename, # string: name of processed file
180
- tab_dir = nil) # string: name of dir with input files
181
- # (set either here or on initialization)
182
- if tab_dir
183
- @tab_dir = tab_dir
184
- end
185
-
186
- # write Salsa/Tiger XML to tempfile
187
- tf = Tempfile.new("SynInterface")
188
- tf.close()
189
- to_stxml_file(infilename, tf.path)
190
- tf.flush()
191
-
192
- # get matching tab file, read
193
- tab_reader = get_tab_reader(infilename)
194
- tab_sentences = Array.new
195
- tab_reader.each_sentence { |s| tab_sentences << s }
196
-
197
- # read Salsa/Tiger sentences and yield them
198
- reader = FilePartsParser.new(tf.path)
199
- sent_index = 0
200
- reader.scan_s { |sent_string|
201
- yield [
202
- SalsaTigerSentence.new(sent_string, tab_sentences[sent_index]),
203
- tab_sentences[sent_index],
204
- SynInterfaceSTXML.standard_mapping(sent, tab_sentences[sent_index])
205
- ]
206
- sent_index += 1
207
- }
208
-
209
- # remove tempfile
210
- tf.close(true)
211
- end
212
-
213
- #####################
214
- protected
215
-
216
-
217
- ###
218
- # get tab format file for a given processed file
219
- def get_tab_reader(infilename) # string: name of processed file
220
- # find matching non-processed file for processed file
221
- # assumption: directory with non-processed files
222
- # has been set as @tab_dir
223
-
224
- # sanity checks
225
- unless @tab_dir
226
- raise "Need to set tab directory"
227
- end
228
-
229
- # get matching tab file for this parser output file
230
- tabfilename = @tab_dir+File.basename(infilename, @outsuffix)+ @insuffix
231
- return FNTabFormatFile.new(tabfilename)
232
- end
233
-
234
-
235
- ###
236
- # provide a XML representation for a sentence that couldn't be analyzed
237
- # assuming a flat structure of all terminals, adding a virtual top node
238
- def SynInterfaceSTXML.failed_sentence(tab_sent,sentid)
239
-
240
- sent_obj = SalsaTigerSentence.empty_sentence(sentid.to_s)
241
-
242
- sent_obj.set_attribute("failed","true")
243
-
244
- topnode = sent_obj.add_syn("nt",
245
- "NONE", # cat
246
- nil, # word (doesn't matter)
247
- nil, # pos (doesn't matter)
248
- "500") # nonterminal counter
249
-
250
- t_counter = 0
251
-
252
- tab_sent.each_line_parsed {|line|
253
- t_counter += 1
254
- word = line.get("word")
255
- pos = line.get("pos")
256
- node = sent_obj.add_syn("t",
257
- nil, # cat (doesn't matter here)
258
- SalsaTigerXMLHelper.escape(word), # word
259
- pos, # pos
260
- t_counter.to_s)
261
- topnode.add_child(node,nil)
262
- node.add_parent(topnode, nil)
263
- }
264
- return sent_obj
265
- end
266
- end
267
-
268
- #############################
269
- # abstract class, to be inherited:
270
- #
271
- # tabular format interface for modules
272
- # offering POS tagging, lemmatization etc.
273
- class SynInterfaceTab < SynInterface
274
-
275
- ##########
276
- protected
277
-
278
- # fntab_words_for_file:
279
- # given a file in tab format, columns as in FNTabFormat,
280
- # get the "word" entries and write them to a given file,
281
- # one word per line, as input for processing
282
- def SynInterfaceTab.fntab_words_to_file(infilename, # string: name of input file
283
- outfile, # stream: output file
284
- sent_marker = "", # string: mark end of sentence how?
285
- iso = nil) # non-nil: assume utf-8, transform to iso-8859-1
286
- corpusfile = FNTabFormatFile.new(infilename)
287
- corpusfile.each_sentence {|s|
288
- s.each_line_parsed {|line_obj|
289
- if iso
290
- outfile.puts UtfIso.to_iso_8859_1(line_obj.get("word"))
291
- else
292
- outfile.puts line_obj.get("word")
293
- end
294
- }
295
- outfile.puts sent_marker
296
- }
297
- end
298
- end
299
-
300
- #############################
301
- # class describing a path between two nodes
302
- #
303
- # provides access and output facilities for different aspects of the path
304
- #
305
- # this is the return value of SynInterpreter.path_between()
306
- class Path
307
- attr_reader :startnode
308
-
309
- ###
310
- # initialize to empty path
311
- def initialize(startnode)
312
- @path = Array.new
313
- @cutoff_last_pt = false
314
- set_startnode(startnode)
315
- end
316
-
317
- ###
318
- # deep_clone:
319
- # return clone of this path object,
320
- # with clone of this path rather than the same path
321
- def deep_clone()
322
- new_path = self.clone()
323
- new_path.set_path(@path.clone())
324
-
325
- return new_path
326
- end
327
-
328
- ###
329
- def set_startnode(startnode)
330
- @startnode = startnode
331
-
332
- return self
333
- end
334
-
335
- ###
336
- # iterate through the current path
337
- #
338
- # yield tuples
339
- # [direction, edgelabel, nodelabel, endnode]
340
- # direction: string, U/D
341
- # edgelabel: string
342
- # nodelabel: string
343
- # endnode: SynNode
344
- def each_step()
345
- @path.each { |step|
346
- yield step
347
- }
348
- end
349
-
350
- ###
351
- # empty?
352
- # any steps in here?
353
- def empty?
354
- return @path.empty?
355
- end
356
-
357
- ###
358
- # add one step to the beginning of the current path
359
- def add_first_step(start_node,#SynNode
360
- direction, # string: U, D
361
- gf, # string: edge label
362
- pt)
363
- @path.prepend([direction, gf, pt, @startnode])
364
- set_startnode(start_node)
365
-
366
- return self
367
- end
368
-
369
-
370
- ###
371
- # add one step to the end of the current path
372
- def add_last_step(direction, # string: U, D
373
- gf, # string: edge label
374
- pt, # string: node label (of end_node)
375
- end_node) # SynNode
376
- @path << [direction, gf, pt, end_node]
377
-
378
- return self
379
- end
380
-
381
- ###
382
- # path length
383
- def length()
384
- return @path.length()
385
- end
386
-
387
- ###
388
- #
389
- def print(print_direction, # boolean. true: print direction
390
- print_gf, # boolean. true: print edgelabel
391
- print_pt) # boolean. true: print nodelabel
392
-
393
- return print_aux(@path, print_direction, print_gf, print_pt)
394
- end
395
-
396
- ###
397
- # print path from roof node to end
398
- def print_downpart(print_direction,
399
- print_gf,
400
- print_pt)
401
-
402
- roof, roof_index = compute_roof()
403
- if roof.nil? or @path.empty?
404
- # no roof set
405
- return ""
406
-
407
- else
408
- # roof node is in the middle
409
- return print_aux(@path[roof_index..-1],
410
- print_direction, print_gf, print_pt)
411
- end
412
- end
413
-
414
- ###
415
- def lca()
416
- return compute_roof().first
417
- end
418
-
419
- ###
420
- # cut off last node label in print() and print_downpart()?
421
- def set_cutoff_last_pt_on_printing(bool) # Boolean
422
- @cutoff_last_pt = bool
423
- end
424
-
425
- ########
426
- protected
427
-
428
- def set_path(new_path)
429
- @path = new_path
430
- end
431
-
432
-
433
- ########
434
- private
435
-
436
- ###
437
- # step through the path as long as direction is up.
438
- # when direction starts to go "D", take current node as roof node
439
- #
440
- # returns: pair [roof node, roof node index] (SynNode, integer)
441
- def compute_roof()
442
- node = @startnode
443
- index = 0
444
-
445
- each_step { |direction, edgelabel, nodelabel, endnode|
446
- if direction =~ /D/
447
- # down! the previous node was roof
448
- return [node, index]
449
- else
450
- node = endnode
451
- index += 1
452
- end
453
- }
454
-
455
- # last node is roof
456
- return [node, index]
457
-
458
- end
459
-
460
- ###
461
- def print_aux(path,
462
- print_direction,
463
- print_gf,
464
- print_pt)
465
- retv = ""
466
- path.each { |step|
467
- direction, gf, pt, node = step.map { |entry|
468
- if entry.nil?
469
- "-"
470
- else
471
- entry
472
- end
473
- }
474
- if print_direction
475
- retv << direction + " "
476
- end
477
- if print_gf
478
- retv << gf + " "
479
- end
480
- if print_pt
481
- retv << pt + " "
482
- end
483
- }
484
-
485
- if @cutoff_last_pt and print_pt and
486
- retv =~ /^(.+ )\w+ $/
487
- return $1
488
- else
489
- return retv
490
- end
491
- end
492
-
493
- end
494
-
495
-
496
- #############################
497
- # abstract class, to be inherited:
498
- #
499
- # interpretation for a POS tagger/lemmatizer/parser combination
500
- class SynInterpreter
501
-
502
- ###
503
- # systems interpreted by this class:
504
- # returns a hash service(string) -> system name (string),
505
- # e.g.
506
- # { "parser" => "collins", "lemmatizer" => "treetagger" }
507
- def SynInterpreter.systems()
508
- raise "Overwrite me"
509
- end
510
-
511
- ###
512
- # names of additional systems that may be interpreted by this class
513
- # returns a hash service(string) -> system name(string)
514
- # same as names()
515
- def SynInterpreter.optional_systems()
516
- raise "Overwrite me"
517
- end
518
-
519
- ###
520
- # generalize over POS tags.
521
- #
522
- # returns one of:
523
- #
524
- # adj: adjective (phrase)
525
- # adv: adverb (phrase)
526
- # card: numbers, quantity phrases
527
- # con: conjunction
528
- # det: determiner, including possessive/demonstrative pronouns etc.
529
- # for: foreign material
530
- # noun: noun (phrase), including personal pronouns, proper names, expletives
531
- # part: particles, truncated words (German compound parts)
532
- # prep: preposition (phrase)
533
- # pun: punctuation, brackets, etc.
534
- # sent: sentence
535
- # top: top node of a sentence
536
- # verb: verb (phrase)
537
- # nil: something went wrong
538
- #
539
- # default: return phrase type as is
540
- #
541
- # returns: string or nil
542
- def SynInterpreter.category(node) # SynNode
543
- unless node.kind_of? SynNode
544
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
545
- return nil
546
- end
547
-
548
- return eval(self.name()).pt(node)
549
- end
550
-
551
- ###
552
- # is relative pronoun?
553
- #
554
- # default: false
555
- def SynInterpreter.relative_pronoun?(node) # SynNode
556
- return false
557
- end
558
-
559
- ###
560
- # lemma_backoff:
561
- #
562
- # if we have lemma information, return that,
563
- # and failing that, return the word
564
- #
565
- # returns: string or nil
566
- def SynInterpreter.lemma_backoff(node)
567
- unless node.kind_of? SynNode
568
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
569
- return nil
570
- end
571
-
572
- lemma = node.get_attribute("lemma")
573
- if (lemma.nil? or lemma =~ /unknown/) and
574
- node.is_terminal?
575
- return node.word()
576
- else
577
- return lemma
578
- end
579
- end
580
-
581
- ###
582
- # phrase type:
583
- # constituent label for nonterminals,
584
- # part of speech for terminals
585
- #
586
- # returns: string
587
- def SynInterpreter.pt(node)
588
- unless node.kind_of? SynNode
589
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
590
- return nil
591
- end
592
-
593
- if node.is_terminal?
594
- return node.part_of_speech
595
- else
596
- return node.category
597
- end
598
- end
599
-
600
- ###
601
- # simplified phrase type:
602
- # like phrase type, but may simplify
603
- # the constituent label
604
- # default: just the same as pt()
605
- #
606
- # returns: string or nil
607
- def SynInterpreter.simplified_pt(node)
608
- return eval(self.name()).pt(node)
609
- end
610
-
611
- ###
612
- # particle_of_verb:
613
- #
614
- # given a node and a nodelist,
615
- # if the node represents a verb:
616
- # see if the verb has a particle among the nodes in nodelist
617
- # if so, return it
618
- # default: no recognition of separate particles
619
- #
620
- # returns: SynNode object if successful, else nil
621
- def SynInterpreter.particle_of_verb(node,
622
- node_list)
623
- return nil
624
- end
625
-
626
- ###
627
- # auxiliary?
628
- #
629
- # returns true if the given node is an auxiliary
630
- # default: no recognition of auxiliaries
631
- #
632
- # returns: boolean
633
- def SynInterpreter.auxiliary?(node)
634
- return false
635
- end
636
-
637
- ###
638
- # modal?
639
- #
640
- # returns true if the given node is a modal verb
641
- # default: no recognition of modals
642
- #
643
- # returns: boolean
644
- def SynInterpreter.modal?(node)
645
- return false
646
- end
647
-
648
- ###
649
- # head_terminal
650
- #
651
- # given a constituent, return the terminal node
652
- # that describes its headword
653
- # default: a heuristic that assumes the existence of a 'head'
654
- # attribute on nodes:
655
- # find the first node in my yield corresponding to my head attribute..
656
- #
657
- # returns: a SynNode object if successful, else nil
658
- def SynInterpreter.head_terminal(node)
659
- unless node.kind_of? SynNode
660
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
661
- return nil
662
- end
663
-
664
- if node.is_terminal?
665
- return node
666
- end
667
-
668
- head = node.get_attribute("head")
669
- unless head
670
- return nil
671
- end
672
-
673
- return node.yield_nodes.detect { |t|
674
- t.get_attribute("word") == head
675
- }
676
- end
677
-
678
- ###
679
- # voice
680
- #
681
- # given a constituent, return
682
- # - "active"/"passive" if it is a verb
683
- # - nil, else
684
- #
685
- # default: treat all as active
686
- def SynInterpreter.voice(node)
687
- unless node.kind_of? SynNode
688
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
689
- return nil
690
- end
691
-
692
- if eval(self.name()).category(node) == "verb"
693
- return "active"
694
- else
695
- return nil
696
- end
697
- end
698
-
699
- ###
700
- # gfs
701
- #
702
- # grammatical functions of a constituent:
703
- #
704
- # returns: a list of pairs [relation(string), node(SynNode)]
705
- # where <node> stands in the relation <relation> to the parameter
706
- # that the method was called with
707
- #
708
- # default: children of this node, with edge labels as relations,
709
- # prepositions tacked on for pps
710
- def SynInterpreter.gfs(node, # SynNode
711
- sent) # SalsaTigerSentence
712
- unless node.kind_of? SynNode
713
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
714
- return nil
715
- end
716
-
717
- return node.children_with_edgelabel().map { |rel, gf_node|
718
-
719
- if eval(self.name()).category(gf_node) == "prep"
720
- [rel + "-" + eval(self.name()).preposition(gf_node).to_s, gf_node]
721
-
722
- else
723
- [rel, gf_node]
724
- end
725
- }
726
- end
727
-
728
- ###
729
- # informative_content_node
730
- #
731
- # for most constituents: the head
732
- # for a PP, the NP
733
- # for an SBAR, the VP
734
- # for a VP, the embedded VP
735
- #
736
- # Default: returns the first non-head child
737
- def SynInterpreter.informative_content_node(node)
738
- unless node.kind_of? SynNode
739
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
740
- return nil
741
- end
742
-
743
- headlemma = eval(self.name()).lemma_backoff(node)
744
-
745
- first_nonhead_child = node.children().detect { |n|
746
- nnh = eval(self.name()).head_terminal(n)
747
- nnh and
748
- eval(self.name()).lemma_backoff(nnh) != headlemma
749
- }
750
-
751
- return first_nonhead_child
752
- end
753
-
754
- #####################################
755
- # verbs(sent) sent is a sentence in SalsaTigerSentence format
756
- #
757
- # return a list of the nodes of full verbs in a given sentence:
758
- # it is a list of lists. An item in that list is
759
- # - either a pair [verb, svp]
760
- # of the node of a verb with separable prefix
761
- # and the node of its separate prefix
762
- # - or a singleton [verb]
763
- # of the node of a verb without separate prefix
764
- def SynInterpreter.verbs(sent)
765
-
766
- return sent.syn_nodes.select { |node|
767
- eval(self.name()).category(node) == "verb"
768
- }.map { |node|
769
- [node]
770
- }
771
- end
772
-
773
- ###
774
- # governing verbs
775
- #
776
- # returns a list of pairs [rel, verb_node]
777
- # such that the given node fills the grammatical function rel
778
- # for this verb_node
779
- # or an empty list if there is no such verb
780
- def SynInterpreter.governing_verbs(node,
781
- sent)
782
- unless node.kind_of? SynNode
783
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
784
- return nil
785
- end
786
-
787
- retv = Array.new
788
-
789
- # each verb of the sentence:
790
- eval(self.name()).verbs(sent).each { |verb_node, prefix_node|
791
- # each gf of this verb:
792
- eval(self.name()).gfs(verb_node, sent).each { |rel, other_node|
793
- # if it points to the given node, record
794
- if other_node == node or
795
- eval(self.name()).informative_content_node(other_node) == node
796
- retv << [rel, verb_node]
797
- break
798
- end
799
- }
800
- }
801
-
802
- return retv
803
- end
804
-
805
- ###
806
- # path_between
807
- #
808
- # construct path in syntactic structure between two nodes,
809
- # using
810
- # - node labels
811
- # - edge labels
812
- # - direction Up, Down
813
- #
814
- # use_nontree_edges: set to true to use coreference edges
815
- # and other non-tree edges returned by the parser
816
- # in path computation. (Will produce no change if the parser
817
- # does not produce any non-tree edges.)
818
- #
819
- # returns: Path object
820
- def SynInterpreter.path_between(from_node, # SynNode
821
- to_node, # SynNode
822
- use_nontree_edges = false) # boolean
823
-
824
- unless from_node.kind_of? SynNode and to_node.kind_of? SynNode
825
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
826
- return nil
827
- end
828
-
829
- path = eval(self.name()).search_up(from_node,to_node, nil)
830
-
831
- if path.nil?
832
- # no path found
833
- # STDERR.puts "Warning: no path found between #{to_node.id} and #{from_node.id}"
834
- end
835
-
836
- return path
837
- end
838
-
839
- ###
840
- # surrounding_nodes:
841
- #
842
- # construct paths in syntactic structure between a node and each of its neighbors
843
- # path construction as in path_between.
844
- # Neighbors: parent, child, plus potentially neighbors by nontree edges
845
- # use_nontree_edges: again, same as in path_between
846
- #
847
- # returns: list of pairs [neighbor(SynNode), path(Path)]
848
- def SynInterpreter.surrounding_nodes(node, # SynNode
849
- use_nontree_edges = false) # boolean
850
-
851
- unless node.kind_of? SynNode
852
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
853
- return nil
854
- end
855
-
856
- retv = Array.new
857
-
858
- # parent
859
- if (p = node.parent)
860
- retv << [
861
- p,
862
- Path.new(node).add_last_step("U", node.parent_label(),
863
- eval(self.name()).simplified_pt(p), p)
864
- ]
865
- end
866
-
867
- # children
868
- node.each_child_with_edgelabel { |label, c|
869
- retv << [
870
- c,
871
- Path.new(node).add_last_step("D", label,
872
- eval(self.name()).simplified_pt(c), c)
873
- ]
874
- }
875
-
876
- return retv
877
- end
878
-
879
- ###
880
- # relative_position
881
- # of a node with respect to an (anchor) node:
882
- # left, right, dom
883
- def SynInterpreter.relative_position(node, # SynNode
884
- anchor_node) # SynNode
885
-
886
- unless node.kind_of? SynNode and anchor_node.kind_of? SynNode
887
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
888
- return nil
889
- end
890
-
891
- # compute up to a root node
892
- root = node
893
- while (p = root.parent())
894
- root = p
895
- end
896
-
897
- # determine position of {leftmost, rightmost} terminal of
898
- # {node, anchor_node} in the list of all terminals
899
- all_yieldnodes = root.yield_nodes_ordered()
900
-
901
- pos_nodefirst = all_yieldnodes.index(eval(self.name()).leftmost_terminal(node))
902
- pos_anchorfirst = all_yieldnodes.index(eval(self.name()).leftmost_terminal(anchor_node))
903
- pos_nodelast = all_yieldnodes.index(eval(self.name()).rightmost_terminal(node))
904
- pos_anchorlast = all_yieldnodes.index(eval(self.name()).rightmost_terminal(anchor_node))
905
-
906
- # determine relative position
907
- if pos_nodefirst and pos_anchorfirst and pos_nodefirst < pos_anchorfirst
908
- return "LEFT"
909
- elsif pos_nodelast and pos_anchorlast and pos_anchorlast < pos_nodelast
910
- return "RIGHT"
911
- else
912
- return "DOM"
913
- end
914
- end
915
-
916
- ###
917
- # leftmost_terminal
918
- #
919
- # given a constituent, determine its leftmost terminal,
920
- # excluding punctuation
921
- def SynInterpreter.leftmost_terminal(node)
922
- leftmost = node.yield_nodes_ordered().detect {|n| eval(self.name()).category(n) != "pun"}
923
- unless leftmost
924
- leftmost = node.yield_nodes_ordered().first
925
- end
926
- return leftmost
927
- end
928
-
929
- ###
930
- # rightmost_terminal
931
- #
932
- # given a constituent, determine its rightmost terminal,
933
- # excluding punctuation
934
- def SynInterpreter.rightmost_terminal(node)
935
- rightmost = node.yield_nodes_ordered().reverse.detect {|n| eval(self.name()).category(n) != "pun"}
936
- unless rightmost
937
- rightmost = node.yield_nodes_ordered().last
938
- end
939
- return rightmost
940
- end
941
-
942
- ###
943
- # preposition
944
- #
945
- # if the given node represents a PP, return the preposition
946
- #
947
- # default: assume that either the PP node will have the preposition as its lemma,
948
- # or that the head terminal of the PP will be the preposition
949
- def SynInterpreter.preposition(node)
950
- unless node.kind_of? SynNode
951
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
952
- return nil
953
- end
954
-
955
- # preposition as lemma of this node?
956
- if eval(self.name()).category(node) == "prep" and
957
- (lemma = eval(self.name()).lemma_backoff(node)) and
958
- not(lemma.empty?)
959
- return lemma
960
- end
961
-
962
- # head terminal is preposition and has a lemma?
963
- hl = eval(self.name()).head_terminal(node)
964
- if hl and
965
- eval(self.name()).category(hl) == "prep" and
966
- (lemma = eval(self.name()).lemma_backoff(hl)) and
967
- not(lemma.empty?)
968
- return lemma
969
- end
970
-
971
- # no luck
972
- return nil
973
- end
974
-
975
-
976
- ###
977
- # main node of expression
978
- #
979
- # returns: SynNode, main node, if found
980
- # else nil
981
# Picks one node to stand for an expression given as a list of nodes.
#
# The steps, in order (the first one that succeeds decides):
#  1. the expression covers exactly one terminal: that terminal
#  2. nodelist has a single entry with a head terminal: that head
#  3. after dropping auxiliaries and modals, two terminals remain, one
#     of them a verb, and particle_of_verb accepts the pair: the verb
#  4. (stop here with nil if no_mwes is set)
#  5. the head terminal of the lowest ancestor covering all remaining
#     terminals, if that head is itself one of them
#  6. the first entry of nodelist with category "verb", else "noun",
#     else "adj"
#  7. the first entry of nodelist
#
# nodelist: list of nodes responding to yield_nodes
# no_mwes:  truthy: only handle verbs with separate particles,
#           i.e. stop after step 3
# returns: a node, or nil
def SynInterpreter.main_node_of_expr(nodelist,
                                     no_mwes = nil) # truthy: don't handle multiword expressions beyond verbs with separate particles
  # All helper calls below go to self (the class this method was
  # invoked on). The previous eval(self.name()) resolved that same
  # class from its name string, once per call and inside per-node
  # blocks, and raised when the class had no name.

  # map nodes to terminals
  terminals = nodelist.map { |n| n.yield_nodes }.flatten

  # single terminal? return it
  return terminals.first if terminals.length == 1

  # more than one word:
  # see if we can get a headword of a single constituent
  if nodelist.length == 1 && (headword = head_terminal(nodelist.first))
    return headword
  end

  # filter out auxiliaries and modals
  content = terminals.reject { |t| auxiliary?(t) || modal?(t) }

  # one verb, one prep or particle? then assume we have a
  # separate verb prefix, and take the verb as main node
  if content.length == 2
    verbs = content.select { |t| category(t) == "verb" }
    if verbs.length == 1 && particle_of_verb(verbs.first, content)
      return verbs.first
    end
  end

  # caller asked for separate verb particles only
  return nil if no_mwes

  # filtered out everything? fall back to the unfiltered terminals
  content = terminals if content.empty?

  # climb from the first terminal until an ancestor's yield contains
  # every terminal in content; lca ends up nil if no ancestor does
  lca = content.first
  while lca
    covered = lca.yield_nodes
    break if content.big_and { |t| covered.include? t }

    lca = lca.parent
  end

  # content includes that ancestor's head terminal? then return it
  if lca && (head = head_terminal(lca)) && content.include?(head)
    return head
  end

  # try first verb, then first noun, then first adjective
  # (note: this scans the nodes as passed in, not their terminals)
  %w[verb noun adj].each do |cat|
    nodelist.each { |n| return n if category(n) == cat }
  end

  # last resort: first node
  nodelist.first
end
1066
-
1067
- ########
1068
- # max constituents:
1069
- # given a set of nodes, compute the maximal constituents
1070
- # that exactly cover them
1071
- #
1072
- # If include_single_missing_children is set to true,
1073
- # then a node that has at least one child whose yield is in nodelist,
1074
- # and has only one child whose yield is not in nodelist,
1075
- # will be considered as having its yield in nodelist.
1076
- #
1077
- # Optionally, a procedure accept_anyway_proc can be given.
1078
- # Like the option include_single_missing_children, it can lead to nodes being
1079
- # included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYAAYNN)
1080
- # even though not all of their yield nodes are yield nodes of the node_list.
1081
- # accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
1082
- # The procedure is called with three arguments:
1083
- # accept_anyway_proc(node, ch_in, ch_out)
1084
- # node is a SynNode that would not normally be in NYAAYNN.
1085
- # ch_in is the list of its children that are in NYAAYNN.
1086
- # ch_out is the list of its children that are not.
1087
- # If the procedure exists and returns true, node is put into NYAAYNN.
1088
- #
1089
- #
1090
- # default: use the SalsaTigerSentence method for this
1091
# Delegates the computation of maximal constituents to the sentence
# object.
#
# Without idealize_maxconst, the result is
# sent.max_constituents_for_nodes(nodeset). With it, the result is
# sent.max_constituents_smc, which additionally receives the flag
# itself, a literal false (do not ignore empty terminals) and
# accept_anyway_proc.
def SynInterpreter.max_constituents(nodeset,                   # Array:SynNode
                                    sent,                      # SalsaTigerSentence
                                    idealize_maxconst = false, # boolean
                                    accept_anyway_proc = nil)  # procedure
  # plain case first: exact cover, no idealization
  return sent.max_constituents_for_nodes(nodeset) unless idealize_maxconst

  ignore_empty_terminals = false
  return sent.max_constituents_smc(nodeset, idealize_maxconst,
                                   ignore_empty_terminals,
                                   accept_anyway_proc)
end
1104
-
1105
- ########
1106
- # prune?
1107
- # given a target node t and another node n of the syntactic structure,
1108
- # decide whether n is likely to instantiate a semantic role
1109
- # of t. If not, recommend n for pruning.
1110
- #
1111
- # This method is supposed to implement a method similar
1112
- # to the one proposed by Xue and Palmer (EMNLP 2004).
1113
- #
1114
- # returns: true to recommend n for pruning, else false
1115
- #
1116
- # Since the implementation is highly parser-specific,
1117
- # all that we can do in the default method is
1118
- # always to return false.
1119
# Default pruning decision: never recommend a node for pruning.
#
# paths_to_target and terminal_index are accepted but not used by
# this default implementation.
#
# returns: false for any SynNode;
#          nil (after a warning on $stderr) for any other input
def SynInterpreter.prune?(node,            # SynNode
                          paths_to_target, # hash: node ID -> Path object: paths from nodes to target
                          terminal_index)  # hash: terminal node -> word index in sentence
  # regular case: a proper node is never pruned by default
  return false if node.kind_of? SynNode

  # anything else is reported and answered with nil
  $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
  nil
end
1130
-
1131
-
1132
- ####################
1133
- protected
1134
-
1135
# Registers the class this method is called on with SynInterfaces,
# if a constant of that name is defined; otherwise writes a notice
# to $stderr.
#
# NOTE(review): this is defined as `def SynInterpreter.announce_me`,
# i.e. as a singleton method; a bare `protected`/`private` marker
# earlier in the class body does not change the visibility of such
# methods -- confirm whether restricted visibility was intended.
def SynInterpreter.announce_me()
  if defined?(SynInterfaces)
    # yup, we have a class to which we can announce ourselves.
    # Pass self directly: eval(self.name()) only looked the same class
    # up again by its name, and raised for a class without a name.
    SynInterfaces.add_interpreter(self)
  else
    # no interface collector class
    $stderr.puts "Interface #{self.name()} not announced: no SynInterfaces."
  end
end
1144
-
1145
- ####################
1146
- private
1147
-
1148
- ###
1149
- # search upward:
1150
- # look for path from from_node to to_node
1151
- # already_covered is either nil or
1152
- # a node whose subtree we have already searched
1153
# Searches for a path from from_node to to_node: first downward in
# from_node's subtree (via search_down), then, if that fails, by
# repeating the search from from_node's parent.
#
# already_covered: nil, or a child of from_node whose subtree has
#                  been searched before and is skipped by search_down
# returns: a single value -- the path object built by search_down,
#          extended by one "U" step per level climbed -- or nil if
#          the root is reached without finding to_node
def SynInterpreter.search_up(from_node,       # SynNode
                             to_node,         # SynNode
                             already_covered) # SynNode
  # Helper calls go to self (the class this method was invoked on);
  # the former eval(self.name()) resolved the same class from a string
  # on every recursion step.
  path = search_down(from_node, to_node, already_covered)

  # search down successful
  return path unless path.nil?

  # search down unsuccessful: try one level higher
  parent = from_node.parent
  edgelabel = from_node.parent_label

  # at the root: no path found
  return nil if parent.nil?

  # from_node's subtree is done, so the parent may skip it
  path = search_up(parent, to_node, from_node)

  # no path found further up either
  return nil if path.nil?

  # search up was successful: prepend the step from_node -> parent
  parent_pt = simplified_pt(parent)
  path.add_first_step(from_node, "U", edgelabel, parent_pt)
  path
end
1194
-
1195
- ###
1196
- # search in tree
1197
# Depth-first search for to_node in the subtree below from_node.
#
# already_explored: a node whose subtree is skipped wherever it
#                   appears as a child (may be nil)
# returns: Path.new(to_node) extended by one "D" step per level
#          between from_node and to_node, or nil if to_node is not
#          found
def SynInterpreter.search_down(from_node,        # SynNode
                               to_node,          # SynNode
                               already_explored) # SynNode
  # arrived: start a fresh path at the target
  return Path.new(from_node) if from_node == to_node

  from_node.children.each do |child|
    # we have done this subtree, don't do it again
    next if child == already_explored

    # Recurse on self (the class this method was invoked on) rather
    # than on eval(self.name()), which re-resolved the same class from
    # a string for every visited node.
    path = search_down(child, to_node, already_explored)
    next if path.nil?

    # found below this child: prepend the step from_node -> child
    child_pt = simplified_pt(child)
    path.add_first_step(from_node, "D", child.parent_label, child_pt)
    return path
  end

  # no path found for any of the children
  nil
end
1227
- end