shalmaneser 0.0.1.alpha → 1.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +2 -2
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +49 -0
  6. data/bin/fred +18 -0
  7. data/bin/frprep +34 -0
  8. data/bin/rosy +17 -0
  9. data/lib/common/AbstractSynInterface.rb +35 -33
  10. data/lib/common/Mallet.rb +236 -0
  11. data/lib/common/Maxent.rb +26 -12
  12. data/lib/common/Parser.rb +5 -5
  13. data/lib/common/SynInterfaces.rb +13 -6
  14. data/lib/common/TabFormat.rb +7 -6
  15. data/lib/common/Tiger.rb +4 -4
  16. data/lib/common/Timbl.rb +144 -0
  17. data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
  18. data/lib/common/headz.rb +1 -1
  19. data/lib/common/ruby_class_extensions.rb +3 -3
  20. data/lib/fred/FredBOWContext.rb +14 -2
  21. data/lib/fred/FredDetermineTargets.rb +4 -9
  22. data/lib/fred/FredEval.rb +1 -1
  23. data/lib/fred/FredFeatureExtractors.rb +4 -3
  24. data/lib/fred/FredFeaturize.rb +1 -1
  25. data/lib/frprep/CollinsInterface.rb +6 -6
  26. data/lib/frprep/MiniparInterface.rb +5 -5
  27. data/lib/frprep/SleepyInterface.rb +7 -7
  28. data/lib/frprep/TntInterface.rb +1 -1
  29. data/lib/frprep/TreetaggerInterface.rb +29 -5
  30. data/lib/frprep/do_parses.rb +1 -0
  31. data/lib/frprep/frprep.rb +36 -32
  32. data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
  33. data/lib/frprep/interfaces/stanford_interface.rb +353 -0
  34. data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
  35. data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
  36. data/lib/frprep/opt_parser.rb +2 -2
  37. data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
  38. data/lib/rosy/RosyIterator.rb +11 -10
  39. data/lib/rosy/rosy.rb +1 -0
  40. data/lib/shalmaneser/version.rb +1 -1
  41. data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
  42. data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
  43. data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
  44. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
  45. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
  46. data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
  47. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
  48. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
  49. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
  50. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
  51. data/test/functional/test_frprep.rb +3 -3
  52. data/test/functional/test_rosy.rb +20 -0
  53. metadata +215 -224
  54. data/CHANGELOG.rdoc +0 -0
  55. data/LICENSE.rdoc +0 -0
  56. data/README.rdoc +0 -0
  57. data/lib/common/CollinsInterface.rb +0 -1165
  58. data/lib/common/MiniparInterface.rb +0 -1388
  59. data/lib/common/SleepyInterface.rb +0 -384
  60. data/lib/common/TntInterface.rb +0 -44
  61. data/lib/common/TreetaggerInterface.rb +0 -303
  62. data/lib/frprep/AbstractSynInterface.rb +0 -1227
  63. data/lib/frprep/BerkeleyInterface.rb +0 -375
  64. data/lib/frprep/ConfigData.rb +0 -694
  65. data/lib/frprep/FixSynSemMapping.rb +0 -196
  66. data/lib/frprep/FrPrepConfigData.rb +0 -66
  67. data/lib/frprep/FrprepHelper.rb +0 -1324
  68. data/lib/frprep/ISO-8859-1.rb +0 -24
  69. data/lib/frprep/Parser.rb +0 -213
  70. data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
  71. data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
  72. data/lib/frprep/SynInterfaces.rb +0 -275
  73. data/lib/frprep/TabFormat.rb +0 -720
  74. data/lib/frprep/Tiger.rb +0 -1448
  75. data/lib/frprep/Tree.rb +0 -61
  76. data/lib/frprep/headz.rb +0 -338
data/CHANGELOG.rdoc DELETED
File without changes
data/LICENSE.rdoc DELETED
File without changes
data/README.rdoc DELETED
File without changes
@@ -1,1165 +0,0 @@
1
- ####
2
- # sp 15 04 05
3
- #
4
- # modified ke 30 10 05: adapted to fit into SynInterface
5
- #
6
- # represents a file containing Collins parses
7
- #
8
- # underlying data structure for individual sentences: SalsaTigerSentence
9
-
10
-
11
- require "tempfile"
12
- require "common/TabFormat"
13
- require "common/SalsaTigerRegXML"
14
- require "common/SalsaTigerXMLHelper"
15
- require "common/Counter"
16
-
17
- require "common/AbstractSynInterface"
18
-
19
- ################################################
20
- # Interface class
21
# Interface to the Collins parser (model 3).
#
# Runs the parser over TabFormat files and converts its bracketed output
# into SalsaTigerSentence objects.
class CollinsInterface < SynInterfaceSTXML
  CollinsInterface.announce_me()

  ###
  # returns: string, name of the system wrapped by this interface
  def self.system
    "collins"
  end

  ###
  # returns: string, name of the service this interface provides
  def self.service
    "parser"
  end

  ###
  # initialize to set values for all subsequent processing
  #
  # program_path: string, path to system (a trailing "/" is appended if missing)
  # insuffix:     string, suffix of tab files
  # outsuffix:    string, suffix for parsed files
  # stsuffix:     string, suffix for Salsa/TIGER XML files
  # var_hash:     optional arguments; the keys read here are
  #               "pos_suffix", "lemma_suffix" and "tab_dir"
  def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
    super(program_path, insuffix, outsuffix, stsuffix, var_hash)

    # the command lines built in process_dir append to the program path directly
    @program_path += "/" unless @program_path =~ /\/$/

    @pos_suffix = var_hash["pos_suffix"]
    @lemma_suffix = var_hash["lemma_suffix"]
    @tab_dir = var_hash["tab_dir"]
  end

  ###
  # parse a bunch of TabFormat files (*.<insuffix>) with Collins model 3
  # required: POS tags must be present
  # produced: in out_dir, files *.<outsuffix>
  #
  # The input files are assumed to be small enough for Collins to parse
  # in one go (i.e. they have already been split).
  #
  # in_dir:  string, name of input directory
  # out_dir: string, name of output directory
  #
  # raises: RuntimeError if no POS suffix was given on initialization
  def process_dir(in_dir, out_dir)
    print "parsing ", in_dir, " and writing to ", out_dir, "\n"

    raise "Collins interface: need suffix for POS files" unless @pos_suffix

    collins_prog = "gunzip -c #{@program_path}models/model3/events.gz | nice #{@program_path}code/parser"
    collins_params = " #{@program_path}models/model3/grammar 10000 1 1 1 1"

    Dir[in_dir + "*" + @insuffix].each do |inputfilename|
      STDERR.puts "*** Parsing #{inputfilename} with Collins"

      corpusfilename = File.basename(inputfilename, @insuffix)
      parsefilename = out_dir + corpusfilename + @outsuffix
      tempfile = Tempfile.new(corpusfilename)

      begin
        # we need part of speech tags (but no lemmas at this point);
        # they are read by FNTabFormatFile from the POS-suffixed file
        tabfile = FNTabFormatFile.new(inputfilename, @pos_suffix)

        CollinsInterface.produce_collins_input(tabfile, tempfile)
        tempfile.close

        command = collins_prog + " " + tempfile.path + " " + collins_params + " > " + parsefilename
        print command
        Kernel.system(command)
      ensure
        # remove the temporary file even if input generation or parsing failed
        tempfile.close(true)
      end
    end
  end

  ###
  # for a given parsed file:
  # yield each sentence as a triple
  #   [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
  # where mapping is the result of CollinsInterface.standard_mapping
  # for the two sentence objects.
  #
  # If a parse has failed, the first element is the object returned by
  # CollinsInterface.failed_sentence, to allow more detailed accounting
  # for failed parses.
  #
  # raises: RuntimeError if the tab directory was not set on initialization,
  #         or if parse file and corpus file do not contain the same
  #         number of sentences
  def each_sentence(parsefilename)
    # sanity check
    raise "Need to set tab directory on initialization" unless @tab_dir

    # block form: the parse file is closed on every exit path
    File.open(parsefilename) do |parserfile|
      # get matching tab file for this parser output file
      tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
      corpusfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

      corpusfile.each_sentence do |tab_sent|
        my_sent_id = tab_sent.get_sent_id()

        # skip ahead to the next "(TOP" line, or to the end of the file
        line = parserfile.gets
        line = parserfile.gets until line.nil? || line =~ /^\(TOP/

        STDERR.puts line

        # the corpus still has sentences, but the parse file is over
        raise "Error: premature end of parser file!" if line.nil?

        line.chomp!

        if line =~ /^\(TOP~/
          # successful parse
          st_sent = SalsaTigerSentence.empty_sentence(my_sent_id.to_s)
          build_salsatiger(line, st_sent)
          yield [st_sent, tab_sent, CollinsInterface.standard_mapping(st_sent, tab_sent)]
        else
          # failed parse: hand out a "failed" sentence object instead
          sent = CollinsInterface.failed_sentence(tab_sent, my_sent_id)
          yield [sent, tab_sent, CollinsInterface.standard_mapping(sent, tab_sent)]
        end
      end

      # after the end of the corpus file, no parses may be left over
      while (line = parserfile.gets)
        raise "Error: premature end of corpus file!" if line =~ /^\(TOP/
      end
    end
  end

  ###
  # write Salsa/TIGER XML output to file
  #
  # infilename:  string, name of parse file
  # outfilename: string, name of output stxml file
  def to_stxml_file(infilename, outfilename)
    # block form: the output file is closed even if conversion raises
    File.open(outfilename, "w") do |outfile|
      outfile.puts SalsaTigerXMLHelper.get_header()
      each_sentence(infilename) do |st_sent, _tabsent|
        outfile.puts st_sent.get()
      end
      outfile.puts SalsaTigerXMLHelper.get_footer()
    end
  end

  ########################
  private

  # Build a SalsaTigerSentence corresponding to the Collins parse in argument string.
  #
  # The parse is read left to right with a stack: "(" pushes a nonterminal
  # label, a word/POS token pushes a terminal node, and ")" pops nodes down
  # to the nearest label and combines them into a nonterminal node.
  #
  # Special features: removes unary nodes and traces
  #
  # raises: RuntimeError / StandardError on malformed parser output
  def build_salsatiger(string, st_sent)
    nt_c = Counter.new(500)
    t_c = Counter.new(0)

    position = 0
    stack = []

    while position < string.length
      case string[position, 1]
      when "(" # push nonterminal label
        nextspace = next_space_index(string, position)
        stack.push string[position + 1..nextspace - 1]
        position = nextspace + 1

      when ")" # reduce stack
        tempstack = []
        while true
          # move nodes from the stack to tempstack until a String turns up,
          # which is the label of a not-yet existing nonterminal
          object = stack.pop
          if object.kind_of? SynNode
            tempstack.push(object) # terminal or subtree
            next
          end

          if tempstack.length == 1
            # skip unary nodes: drop the label, write the single child back
            stack += tempstack
            break
          end

          nt_a = object.split("~")
          unless nt_a.length == 4
            if nt_a.length > 4
              # assume a "~" inside the head word; re-join the middle pieces
              nt_a = [nt_a[0], nt_a[1..-3].join("~"), nt_a[-2], nt_a[-1]]
            else
              # fewer pieces than expected: cannot recover
              $stderr.puts "Collins parse tree translation nonrecoverable error:"
              $stderr.puts "Unexpectedly too few components in nonterminal " + nt_a.join("~")
              raise StandardError.new("nonrecoverable error")
            end
          end

          # construct a new nonterminal
          node = st_sent.add_syn("nt",
                                 SalsaTigerXMLHelper.escape(nt_a[0].strip), # cat
                                 nil, # word (doesn't matter)
                                 nil, # pos (doesn't matter)
                                 nt_c.next.to_s)
          node.set_attribute("head", SalsaTigerXMLHelper.escape(nt_a[1].strip))
          tempstack.reverse.each do |child|
            node.add_child(child, nil)
            child.set_parent(node, nil)
          end
          stack.push(node)
          break
        end
        position += 2 # skip ")" and the space after it

      else # terminal
        nextspace = next_space_index(string, position)
        terminal = string[position..nextspace].strip
        t_a = terminal.split("/")
        unless t_a.length == 2
          raise "[collins] Cannot split terminal #{terminal} into word and POS!"
        end

        word, pos = t_a

        unless pos =~ /TRACE/
          # construct a new terminal
          node = st_sent.add_syn("t",
                                 nil,
                                 SalsaTigerXMLHelper.escape(CollinsInterface.unescape(word)), # word
                                 SalsaTigerXMLHelper.escape(pos), # pos
                                 t_c.next.to_s)
          stack.push(node)
        end
        position = nextspace + 1
      end
    end

    # at the very end, we need to have exactly one syntactic root
    if stack.length != 1
      raise "[collins] Error: Sentence has #{stack.length} roots"
    end
  end

  # Index of the next space at or after position.
  #
  # raises: RuntimeError if there is none, so that truncated parser output
  # is reported as such instead of failing on nil arithmetic
  def next_space_index(string, position)
    nextspace = string.index(" ", position)
    if nextspace.nil?
      raise "[collins] Error: malformed parse, no space after position #{position}"
    end
    nextspace
  end

  # NOTE: "private" above does not apply to the singleton methods below;
  # they are called with an explicit receiver from process_dir and
  # build_salsatiger.

  ####
  # extract the Collins parser input format from a TabFormat object
  # that includes part-of-speech (pos)
  #
  # writes one line per sentence to tempfile:
  #   <number of tokens> <word> <tag> <word> <tag> ...
  #
  # raises: RuntimeError if a token has no POS tag
  def self.produce_collins_input(corpusfile, tempfile)
    corpusfile.each_sentence do |s|
      words = []
      s.each_line_parsed do |line_obj|
        word = line_obj.get("word")
        tag = line_obj.get("pos")
        raise "Error: FNTabFormat object not tagged!" if tag.nil?

        word_tag_pair = CollinsInterface.escape(word, tag)
        # report tokens that still contain a closing bracket after escaping
        if word_tag_pair =~ /\)/
          puts word_tag_pair
          puts s.to_s
        end
        words << word_tag_pair
      end
      tempfile.puts words.length.to_s + " " + words.join(" ")
    end
  end

  ####
  # Escape a word for the Collins parser.
  #
  # A word that is exactly one bracket is replaced by a token
  # {L,R}R{B,S,C} (bracket, square, curly); its POS becomes -LRB- for
  # opening and -RRB- for closing brackets, whatever pos was passed in.
  # In any other word, brackets and slashes inside the word are replaced.
  #
  # The word passed in is not modified.
  #
  # returns: string word+" "+pos
  def self.escape(word, pos)
    case word
    when "(" then "LRB -LRB-"
    when "[" then "LRS -LRB-"
    when "{" then "LRC -LRB-"
    when ")" then "RRB -RRB-"
    when "]" then "RRS -RRB-"
    when "}" then "RRC -RRB-"
    else
      # non-destructive gsub: the caller's string must stay untouched
      escaped = word.gsub(/\(/, "LRB").
                     gsub(/\)/, "RRB").
                     gsub(/\[/, "LRS").
                     gsub(/\]/, "RRS").
                     gsub(/\{/, "LRC").
                     gsub(/\}/, "RRC").
                     gsub(/\//, "&Slash;")
      escaped + " " + pos
    end
  end

  ####
  # replace replacements with original values
  #
  # returns: string, the word with every escape token turned back into
  # the bracket or slash it stands for
  def self.unescape(word)
    word.gsub(/LRB/, "(").
         gsub(/RRB/, ")").
         gsub(/LRS/, "[").
         gsub(/RRS/, "]").
         gsub(/LRC/, "{").
         gsub(/RRC/, "}").
         gsub(/&Slash;/, "/")
  end
end
343
-
344
- ################################################
345
- # Interpreter class
346
- class CollinsTntInterpreter < SynInterpreter
347
- CollinsTntInterpreter.announce_me()
348
-
349
- ###
350
- # names of the systems interpreted by this class:
351
- # returns a hash service(string) -> system name (string),
352
- # e.g.
353
- # { "parser" => "collins", "lemmatizer" => "treetagger" }
354
# Systems whose output this interpreter handles.
#
# returns: hash service(string) -> system name(string)
def CollinsTntInterpreter.systems()
  required = {}
  required["pos_tagger"] = "treetagger"
  required["parser"] = "collins"
  required
end
360
-
361
- ###
362
- # names of additional systems that may be interpreted by this class
363
- # returns a hash service(string) -> system name(string)
364
- # same as names()
365
# Additional systems this interpreter can handle.
#
# returns: hash service(string) -> system name(string)
def CollinsTntInterpreter.optional_systems()
  optional = {}
  optional["lemmatizer"] = "treetagger"
  optional
end
370
-
371
- ###
372
- # generalize over POS tags.
373
- #
374
- # returns one of:
375
- #
376
- # adj: adjective (phrase)
377
- # adv: adverb (phrase)
378
- # card: numbers, quantity phrases
379
- # con: conjunction
380
- # det: determiner, including possessive/demonstrative pronouns etc.
381
- # for: foreign material
382
- # noun: noun (phrase), including personal pronouns, proper names, expletives
383
- # part: particles, truncated words (German compound parts)
384
- # prep: preposition (phrase)
385
- # pun: punctuation, brackets, etc.
386
- # sent: sentence
387
- # top: top node of a sentence
388
- # verb: verb (phrase)
389
- # nil: something went wrong
390
- #
391
- # returns: string, or nil
392
# Generalize over POS tags and constituent labels.
#
# The label comes from simplified_pt; only the part before the first "-"
# is looked at. The branches are tried top to bottom, so their order
# matters (e.g. PRP is a noun before PR(T|N) can make it a preposition).
#
# returns: one of "adj", "adv", "card", "con", "det", "for", "noun",
# "prep", "pun", "sent", "top", "trace", "verb"; nil if the phrase type
# could not be determined or is not covered
def CollinsTntInterpreter.category(node) # SynNode
  pt = CollinsTntInterpreter.simplified_pt(node)
  # phrase type could not be determined
  return nil if pt.nil?

  label = pt.to_s.strip[/^([^-]*)/, 1]

  case label
  when /^JJ/, /(WH)?ADJP/, /^PDT/
    "adj"
  when /^RB/, /(WH)?ADVP/, /^UH/
    "adv"
  when /^CD/, /^QP/
    "card"
  when /^CC/, /^WRB/, /^CONJP/
    "con"
  when /^DT/, /^POS/
    "det"
  when /^FW/, /^SYM/
    "for"
  when /^N/, "WHAD", "WDT", /^PRP/, /^WHNP/, /^EX/, /^WP/
    "noun"
  when /^IN/, /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/
    "prep"
  when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/
    "pun"
  when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/
    "sent"
  when /^TOP/
    "top"
  when /^TRACE/
    "trace"
  when /^V/, /^MD/
    "verb"
  else
    # unknown category/POS
    nil
  end
end
419
-
420
-
421
- ###
422
- # is relative pronoun?
423
- #
424
# Is the node a relative pronoun?
#
# returns: true if the simplified phrase type (up to the first "-")
# starts with WDT, WHAD, WHNP or WP, false if it does not,
# nil if the phrase type could not be determined
def CollinsTntInterpreter.relative_pronoun?(node) # SynNode
  pt = CollinsTntInterpreter.simplified_pt(node)
  # phrase type could not be determined
  return nil if pt.nil?

  label = pt.to_s.strip[/^([^-]*)/, 1]
  [/^WDT/, /^WHAD/, /^WHNP/, /^WP/].any? { |pattern| pattern === label }
end
439
-
440
- ###
441
- # lemma_backoff:
442
- #
443
- # if we have lemma information, return that,
444
- # and failing that, return the word
445
- #
446
- # returns: string, or nil
447
# lemma_backoff:
#
# Takes whatever the superclass lemma_backoff returns for the node.
# If that value lists several alternatives separated by "|", only the
# first alternative is kept.
#
# The separator is escaped in the pattern: an unescaped "|" is regex
# alternation with the empty pattern, which matches every string and
# turns an empty lemma (or one starting with "|") into nil.
#
# returns: string, or nil
def CollinsTntInterpreter.lemma_backoff(node)
  lemma = super(node)
  # lemmatizer has returned more than one possible lemma form:
  # just accept the first
  return lemma unless lemma =~ /^([^|]+)\|/
  $1
end
457
-
458
-
459
- ###
460
- # simplified phrase type:
461
- # like phrase type, but may simplify
462
- # the constituent label
463
- #
464
- # returns: string
465
# simplified phrase type:
# the leading run of word characters of the phrase type,
# i.e. the label without any "-X" suffixes
#
# returns: string, or nil if the phrase type does not start with a
# word character (or is nil)
def CollinsTntInterpreter.simplified_pt(node)
  label = CollinsTntInterpreter.pt(node)
  return nil unless label =~ /^(\w+)(-\w)*/
  Regexp.last_match(1)
end
469
-
470
- ###
471
- # verb_with_particle:
472
- #
473
- # given a node and a nodelist,
474
- # if the node represents a verb:
475
- # see if the verb has a particle among the nodes in nodelist
476
- # if so, return it
477
- #
478
- # returns: SynNode object if successful, else nil
479
# particle_of_verb:
#
# given a node and a node list,
# if the node represents a verb:
# see if the verb has a particle among the nodes in node_list
# if so, return it
#
# A particle is looked for among the children of those sisters of the
# verb whose category is "part"; the child itself must also have category
# "part" and be contained in node_list.
#
# node:      SynNode
# node_list: collection of SynNode objects, queried with include?
#
# returns: SynNode object if successful, else nil
def CollinsTntInterpreter.particle_of_verb(node,
                                           node_list)
  # must be verb
  return nil unless CollinsTntInterpreter.category(node) == "verb"

  # must have parent
  return nil unless node.parent

  # sisters of the verb node that have the particle category
  particle_sisters = node.parent.children.select { |sister|
    CollinsTntInterpreter.category(sister) == "part"
  }

  # children of those sisters that are particles and are in the node list
  # (this filter used to reference an undefined local "nodelist")
  particles = particle_sisters.map { |sister| sister.children }.flatten.select { |niece|
    node_list.include?(niece) and
      CollinsTntInterpreter.category(niece) == "part"
  }

  # first hit, or nil if there is none
  particles.first
end
507
-
508
- ###
509
- # auxiliary?
510
- #
511
- # returns true if the given node is an auxiliary
512
- # else false
513
# auxiliary?
#
# A node counts as an auxiliary if it is a verb whose parent is a VP
# that also has a VP-A child containing a verb:
#
#          ---VP---
#          |      |
#  the given node VP-A
#                 |
#             verb node
#
# returns: true if the given node is an auxiliary, else false
def CollinsTntInterpreter.auxiliary?(node)
  return false unless CollinsTntInterpreter.category(node) == "verb"

  parent = node.parent
  return false unless parent and parent.category() == "VP"

  vpa_node = parent.children.detect { |sibling| sibling.category() == "VP-A" }
  return false unless vpa_node

  vpa_node.children.any? { |embedded| CollinsTntInterpreter.category(embedded) == "verb" }
end
540
-
541
- ###
542
- # modal?
543
- #
544
- # returns true if the given node is a modal verb,
545
- # else false
546
# modal?
#
# returns: true if the part of speech of the given node starts with MD,
# else false
def CollinsTntInterpreter.modal?(node)
  (node.part_of_speech() =~ /^MD/) ? true : false
end
553
-
554
- ###
555
- # voice
556
- #
557
- # given a constituent, return
558
- # - "active"/"passive" if it is a verb
559
- # - nil, else
560
# voice
#
# given a constituent, return
# - "active"/"passive" if it is a verb
# - nil, else (also for a verb without parent or grandparent, and for a
#   verb under a non-verbal grandparent whose tag is neither VBN nor VBD)
def CollinsTntInterpreter.voice(node) # SynNode
  tobe = ["be", "am", "is", "are", "was", "were"]

  return nil unless CollinsTntInterpreter.category(node) == "verb"

  # a gerund, a present tense form, or an infinitive is certainly active
  if ["VBG", "VBP", "VBZ", "VB"].include? CollinsTntInterpreter.pt(node)
    return "active"
  end

  # Many word forms are ambiguous between VBN (past participle - passive)
  # and VBD (past tense - active). For these, the decision is based on
  # whether a form of "to be" is found on the way up the tree.
  # (An earlier variant that collected the lemmas of all c-commanding
  # verbs and also checked for "to have" was disabled in the original and
  # is not reproduced here.)

  parent = node.parent
  return nil if parent.nil?

  gp = parent.parent
  return nil if gp.nil?

  # case 1: the grandparent is verbal or sentential
  if ["verb", "sent"].include? CollinsTntInterpreter.category(gp)
    current_node = node.parent
    while current_node
      # stop climbing as soon as we leave the verbal/sentential spine
      break unless ["verb", "sent"].include? CollinsTntInterpreter.category(current_node)

      has_tobe = current_node.children.detect { |nephew|
        tobe.include? CollinsTntInterpreter.lemma_backoff(nephew)
      }
      return "passive" if has_tobe

      current_node = current_node.parent
    end
    # no "to be" has been found
    return "active"
  end

  # case 2: the grandparent is something else (e.g. a noun phrase).
  # Simple past forms are often mis-tagged as passives here; the cases
  # are rare, so the POS tag is relied on anyway.
  if ["VBN", "VBD"].include? CollinsTntInterpreter.pt(node)
    "passive"
  else
    # this must be some kind of error
    nil
  end
end
653
-
654
- ###
655
- # gfs
656
- #
657
- # grammatical functions of a constituent:
658
- #
659
- # returns: a list of pairs [relation(string), node(SynNode)]
660
- # where <node> stands in the relation <relation> to the parameter
661
- # that the method was called with
662
# gfs
#
# grammatical functions of a constituent:
#
# Every syntactic node of the sentence is tested against the anchor node
# with gf_adj, gf_verb or gf_noun, depending on the category of the
# anchor node; anchors of any other category have no grammatical functions.
#
# anchor_node: SynNode
# sent:        SalsaTigerSentence
#
# returns: a list of pairs [relation(string), node(SynNode)]
# where <node> stands in the relation <relation> to the anchor node
def CollinsTntInterpreter.gfs(anchor_node, # SynNode
                              sent) # SalsaTigerSentence
  pairs = []

  sent.syn_nodes.each do |gf_node|
    rel =
      case CollinsTntInterpreter.category(anchor_node)
      when "adj"
        CollinsTntInterpreter.gf_adj(anchor_node, gf_node)
      when "verb"
        CollinsTntInterpreter.gf_verb(anchor_node, gf_node)
      when "noun"
        CollinsTntInterpreter.gf_noun(anchor_node, gf_node)
      end

    # keep only nodes for which a relation was found
    pairs << [rel, gf_node] if rel
  end

  pairs
end
683
-
684
- ###
685
- # informative_content_node
686
- #
687
- # for most constituents: nil
688
- # for a PP, the NP
689
- # for an SBAR, the VP
690
- # for a VP, the embedded VP
691
# informative_content_node
#
# for most constituents: nil
# for a PP, the NP
# for an SBAR, the VP
# for a VP, the embedded VP
#
# Candidates are the children that have a head terminal whose lemma
# differs from the lemma of the node's own head terminal. A single
# candidate is returned as it is; among several, the first one whose
# head POS starts with VB (SBAR, VP) or NN (PP) is preferred, and the
# first candidate is the fallback.
#
# returns: SynNode, or nil
def CollinsTntInterpreter.informative_content_node(node)
  this_pt = CollinsTntInterpreter.simplified_pt(node)
  return nil unless ["SBAR", "VP", "PP"].include? this_pt

  nh = CollinsTntInterpreter.head_terminal(node)
  return nil unless nh

  headlemma = CollinsTntInterpreter.lemma_backoff(nh)

  nonhead_children = node.children().select do |child|
    child_head = CollinsTntInterpreter.head_terminal(child)
    child_head and CollinsTntInterpreter.lemma_backoff(child_head) != headlemma
  end

  return nonhead_children.first if nonhead_children.length() == 1

  # more than one child (or none): pick by the POS of the child's head
  head_pos_pattern =
    case this_pt
    when "SBAR", "VP"
      /^VB/
    when "PP"
      /^NN/
    else
      raise "Shouldn't be here"
    end

  icont_child = nonhead_children.detect do |child|
    h = CollinsTntInterpreter.head_terminal(child)
    h and h.part_of_speech() =~ head_pos_pattern
  end

  icont_child || nonhead_children.first
end
737
-
738
-
739
-
740
-
741
- ########
742
- # prune?
743
- # given a target node t and another node n of the syntactic structure,
744
- # decide whether n is likely to instantiate a semantic role
745
- # of t. If not, recommend n for pruning.
746
- #
747
- # This method implements a slight variant of Xue and Palmer (EMNLP 2004).
748
- # Pruning according to Xue & Palmer, EMNLP 2004:
749
- # "Step 1: Designate the predicate as the current node and
750
- # collect its sisters (constituents attached at the same level
751
- # as the predicate) unless its sisters are coordinated with the
752
- # predicate. If a sister is a PP, also collect its immediate
753
- # children.
754
- # Step 2: Reset the current node to its parent and repeat Step 1
755
- # till it reaches the top level node.
756
- #
757
- # Modifications made here:
758
- # - paths of length 0 accepted in any case
759
- #
760
- # returns: false to recommend n for pruning, else true
761
# prune?
#
# given a target node t and another node n of the syntactic structure,
# decide whether n is likely to instantiate a semantic role of t.
# Implements a variant of the pruning of Xue and Palmer (EMNLP 2004);
# paths of length 0 are accepted in any case.
#
# node:            SynNode
# paths_to_target: hash: node ID -> Path object: paths from target to node
# terminal_index:  hash: terminal node -> word index in sentence
#
# returns: the integer 0 to recommend the node for pruning, the integer 1
# to keep it.
# NOTE(review): the older header comment promises false/true, but the
# code returns 0/1, and both integers are truthy in Ruby. The values are
# kept as they are; confirm how callers test the result before changing.
def CollinsTntInterpreter.prune?(node, # SynNode
                                 paths_to_target, # hash: node ID -> Path object
                                 terminal_index) # hash: terminal node -> word index
  path_to_target = paths_to_target[node.id()]

  # no path from target to node: suggest for pruning
  return 0 unless path_to_target

  # target may be its own role: definite accept
  return 1 if path_to_target.length == 0

  # count number of up and down steps in path to target
  num_up = 0
  num_down = 0
  path_to_target.each_step do |direction, _edgelabel, _nodelabel, _endnode|
    case direction
    when /U/
      num_up += 1
    when /D/
      num_down += 1
    end
  end

  # coordination sister between node and target?
  conj_sister_between = CollinsTntInterpreter.conj_sister_between?(node, paths_to_target,
                                                                   terminal_index)

  # coordination between me and the target -- drop
  return 0 if conj_sister_between

  # everything that is kept needs at least one Up step
  return 0 unless num_up >= 1

  # (1) exactly one Down step: keep
  return 1 if num_down == 1

  # (2) exactly two Down steps and the node's parent is a PP: keep
  if num_down == 2
    parent = node.parent()
    return 1 if parent and CollinsTntInterpreter.category(parent) == "prep"
  end

  # (3) else discard
  0
end
822
-
823
-
824
- ###
825
- private
826
-
827
-
828
- ###
829
- # given an anchor node and another node that may be some
830
- # grammatical function of the anchor node:
831
- # return the grammatical function (string) if found,
832
- # else nil.
833
- #
834
- # here: anchor node is verb.
835
# given an anchor node and another node that may be some
# grammatical function of the anchor node:
# return the grammatical function (string) if found, else nil.
#
# here: anchor node is verb.
#
# The decision combines the category of gf_node with a coarse class of
# the printed path from the anchor node to gf_node.
def CollinsTntInterpreter.gf_verb(anchor_node, # SynNode
                                  gf_node) # SynNode
  # first classification: according to constituent type
  cat = CollinsTntInterpreter.category(gf_node)
  return nil if cat.nil?

  # second classification: according to path
  path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
  # no path between anchor node and gf node
  return nil if path.nil?

  path.set_cutoff_last_pt_on_printing(true)
  path_string = path.print(true, false, true)

  categ2 =
    case path_string
    when "U VP D ", "U SG D "
      "inside"
    when /^U (VP U )*S(BAR)? D $/, /^U (VP U )*VP D ADVP D $/
      "external"
    else
      ""
    end

  # now evaluate based on both
  case cat + "+" + categ2
  when "noun+inside"
    # direct object
    "OA"

  when "noun+external"
    if CollinsTntInterpreter.relative_position(gf_node, anchor_node) != "LEFT"
      nil
    elsif CollinsTntInterpreter.voice(anchor_node) == "passive"
      "OA"
    else
      "SB"
    end

  when "prep+inside"
    if CollinsTntInterpreter.voice(anchor_node) == "passive" and
       CollinsTntInterpreter.preposition(gf_node) == "by"
      "SB"
    else
      "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
    end

  when "sent+inside", "sent+external"
    "OC"

  else
    nil
  end
end
900
-
901
- ###
902
- # given an anchor node and another node that may be some
903
- # grammatical function of the anchor node:
904
- # return the grammatical function (string) if found,
905
- # else nil.
906
- #
907
- # here: anchor node is noun.
908
# given an anchor node and another node that may be some
# grammatical function of the anchor node:
# return the grammatical function (string) if found, else nil.
#
# here: anchor node is noun.
#
# The decision combines the category of gf_node with a coarse class of
# the printed path from the anchor node to gf_node.
def CollinsTntInterpreter.gf_noun(anchor_node, # SynNode
                                  gf_node) # SynNode
  # first classification: according to constituent type
  cat = CollinsTntInterpreter.category(gf_node)
  return nil if cat.nil?

  # second classification: according to path
  path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
  # no path between anchor node and gf node
  return nil if path.nil?

  path.set_cutoff_last_pt_on_printing(true)
  path_string = path.print(true, false, true)

  # The "beyond-*" classes are computed but not used by any branch below.
  # NOTE(review): the second optional group of the last pattern lacks the
  # trailing space its neighbours have; kept exactly as it was.
  categ2 =
    case path_string
    when "U NPB D "
      "np-neighbor"
    when "U NPB U NP D "
      "np-parent"
    when "U NP D "
      "np-a"
    when /^U NPB (U NP )?(U NP )?U S(BAR)? D( VP D)? $/
      "beyond-s"
    when /^U NP(B)? (U NP )?U VP D $/
      "beyond-vp"
    when /^U NPB (U NP )?(U NP)?U PP U VP(-A)? D $/
      "beyond-pp-vp"
    else
      ""
    end

  # now evaluate based on both.
  # The relation of the anchor noun to its governing verb is not covered here.
  case cat + "+" + categ2
  when "noun+np-neighbor"
    "AG"
  when "sent+np-parent"
    "OC"
  when "prep+np-parent", "prep+np-a"
    "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
  else
    nil
  end
end
967
-
968
-
969
- ###
970
- # given an anchor node and another node that may be some
971
- # grammatical function of the anchor node:
972
- # return the grammatical function (string) if found,
973
- # else nil.
974
- #
975
- # here: anchor node is adjective.
976
###
# Grammatical function of gf_node relative to an adjective anchor.
#
# anchor_node: SynNode, the adjective
# gf_node:     SynNode, the candidate dependent
#
# returns: "HD", "OC" or "MO-<preposition>" (string),
#          nil if there is no category, no path, or no matching rule
def CollinsTntInterpreter.gf_adj(anchor_node, # SynNode
                                 gf_node)     # SynNode
  # constituent type of the candidate
  constituent = CollinsTntInterpreter.category(gf_node)
  return nil if constituent.nil?

  # path from the anchor to the candidate; nil means they are not connected
  route = CollinsTntInterpreter.path_between(anchor_node, gf_node)
  return nil if route.nil?

  route.set_cutoff_last_pt_on_printing(true)
  route_string = route.print(true, false, true)

  # ordered path patterns: the first one that matches (via ===) wins
  path_classes = [
    [/^(U ADJP )?U NPB D $/,             "nnpath"],
    ["U ADJP D ",                        "adjp-neighbor"],
    [/^(U ADJP )?U (VP U )?S(BAR)? D $/, "s"],
    [/^U (ADJP U )?VP D $/,              "vp"]
  ]
  _pattern, path_class = path_classes.find { |pattern, _label| pattern === route_string }
  path_class ||= ""

  # decide on the combination of constituent type and path class
  signature = constituent + "+" + path_class
  return "HD" if signature == "noun+nnpath"
  return "OC" if signature == "verb+adjp-neighbor"

  if ["prep+vp", "prep+adjp-neighbor"].include?(signature)
    return "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
  end

  nil
end
1020
-
1021
- ####
1022
- # auxiliary of prune?:
1023
- #
1024
- # given a node and a hash mapping node IDs to paths to target:
1025
- # Does that node have a sister that is a coordination and that
1026
- # is between it and the target?
1027
- #
1028
####
# Does node have a coordination sister (category "con") that lies
# between node and some other sister that is closer to the target?
#
# node:            SynNode
# paths_to_target: Hash: node ID -> Path obj: path from node to target
# ti:              hash: terminal node -> word index in sentence
#
# returns: true or false
def CollinsTntInterpreter.conj_sister_between?(node,            # SynNode
                                               paths_to_target, # Hash: node ID -> Path obj
                                               ti)              # hash: terminal node -> word index
  parent = node.parent
  return false unless parent

  # sisters that represent coordination
  conj_nodes = parent.children.select { |sib|
    sib != node and CollinsTntInterpreter.category(sib) == "con"
  }
  return false if conj_nodes.empty?

  # represent a node as a triple
  # [node, leftmost terminal index(node), rightmost terminal index(node)]
  to_triple = lambda { |n|
    [n, CollinsTntInterpreter.lti(n, ti), CollinsTntInterpreter.rti(n, ti)]
  }

  conj_triples = conj_nodes.map { |n| to_triple.call(n) }
  this_triple = to_triple.call(node)

  # Sisters closer to the target than node, also as triples.
  # Coordination sisters are excluded by checking the node list: the
  # triple list holds arrays, so a node would never be found in it.
  closer_triples = parent.children.select { |sib|
    sib != node and
      not conj_nodes.include?(sib) and
      paths_to_target[sib.id] and
      paths_to_target[sib.id].length < paths_to_target[node.id].length
  }.map { |n| to_triple.call(n) }
  return false if closer_triples.empty?

  # is any coordination sister in between this node and some sister
  # that is closer to the target, on either side?
  conj_triples.any? { |conj_triple|
    (CollinsTntInterpreter.leftof(conj_triple, this_triple) &&
      closer_triples.any? { |s| CollinsTntInterpreter.leftof(s, conj_triple) }) ||
      (CollinsTntInterpreter.rightof(conj_triple, this_triple) &&
        closer_triples.any? { |s| CollinsTntInterpreter.rightof(s, conj_triple) })
  }
end
1086
-
1087
- ###
1088
- # lti, rti: terminal index of the leftmost/rightmost terminal of
1089
- # a given node (SynNode)
1090
- #
1091
- # auxiliary of conj_sister_between?
1092
# Word index of the leftmost terminal of node.
#
# node:           SynNode
# terminal_index: hash: terminal node -> word index in sentence
#
# returns: the entry of terminal_index for the leftmost terminal,
#          nil if no leftmost terminal is found
def CollinsTntInterpreter.lti(node,           # SynNode
                              terminal_index) # hash: terminal node -> word index in sentence
  leftmost = CollinsTntInterpreter.leftmost_terminal(node)
  leftmost ? terminal_index[leftmost] : nil
end
1101
-
1102
# Word index of the rightmost terminal of node.
#
# node:           SynNode
# terminal_index: hash: terminal node -> word index in sentence
#
# returns: the entry of terminal_index for the rightmost terminal,
#          nil if no rightmost terminal is found
def CollinsTntInterpreter.rti(node,           # SynNode
                              terminal_index) # hash: terminal node -> word index in sentence
  rightmost = CollinsTntInterpreter.rightmost_terminal(node)
  rightmost ? terminal_index[rightmost] : nil
end
1111
-
1112
- ###
1113
- # leftof, rightof: given 2 triples
1114
- # [node(SynNode), index of leftmost terminal(integer/nil), index of rightmost terminal(integer/nil)]
1115
- #
1116
- # auxiliaries of conj_sister_between?
1117
- #
1118
- # return true if both leftmost and rightmost terminal indices of the first triple are
1119
- # smaller than (for leftof) / bigger than (for rightof) the
1120
- # corresponding indices of the second triple
1121
- #
1122
- # return false if some index is nil
1123
# Is the first triple strictly to the left of the second?
#
# triple1, triple2: [node, leftmost terminal index, rightmost terminal index]
#
# returns: true if both indices of triple1 are smaller than the
#          corresponding indices of triple2; false otherwise, and
#          false whenever one of the four indices is nil
def CollinsTntInterpreter.leftof(triple1,
                                 triple2)
  _node1, left1, right1 = triple1
  _node2, left2, right2 = triple2

  # an unknown position cannot be compared
  return false if [left1, right1, left2, right2].any?(&:nil?)

  left1 < left2 && right1 < right2
end
1136
-
1137
# Is the first triple strictly to the right of the second?
#
# triple1, triple2: [node, leftmost terminal index, rightmost terminal index]
#
# returns: true if both indices of triple1 are bigger than the
#          corresponding indices of triple2; false otherwise, and
#          false whenever one of the four indices is nil
def CollinsTntInterpreter.rightof(triple1,
                                  triple2)
  _node1, left1, right1 = triple1
  _node2, left2, right2 = triple2

  # an unknown position cannot be compared
  return false if [left1, right1, left2, right2].any?(&:nil?)

  left1 > left2 && right1 > right2
end
1150
- end
1151
-
1152
-
1153
- # use TreeTagger as replacement for TnT; re-use everything, but use treetagger as POS tagger
1154
-
1155
# Same interpreter as CollinsTntInterpreter (everything is inherited),
# except that it declares TreeTagger as the POS tagger.
class CollinsTreeTaggerInterpreter < CollinsTntInterpreter
  # NOTE(review): announce_me is defined outside this chunk; presumably
  # it registers this interpreter class -- confirm at its definition.
  announce_me

  # returns: hash naming the systems this interpreter handles
  def self.systems
    {
      "pos_tagger" => "treetagger",
      "parser" => "collins"
    }
  end
end
1165
-