shalmaneser 0.0.1.alpha

Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +284 -0
@@ -0,0 +1,375 @@
1
+ # -*- coding: utf-8 -*-
2
+ ####
3
+ # sp 21 07 05
4
+ #
5
+ # modified ke 30 10 05: adapted to fit into SynInterface
6
+ #
7
+ # represents a file containing Berkeley parses
8
+ #
9
+ # underlying data structure for individual sentences: SalsaTigerSentence
10
+ require "tempfile"
11
+
12
+ require "common/SalsaTigerRegXML"
13
+ require "common/SalsaTigerXMLHelper"
14
+ require "common/TabFormat"
15
+ require "common/Counter"
16
+
17
+ require "common/AbstractSynInterface"
18
+ require "common/Tiger.rb"
19
+
20
+ ################################################
21
+ # Interface class
22
+ class BerkeleyInterface < SynInterfaceSTXML
23
+ $stderr.puts 'Announcing Berkeley Interface' if $DEBUG
24
+ BerkeleyInterface.announce_me()
25
+
26
+ ###
27
+ def BerkeleyInterface.system()
28
+ return "berkeley"
29
+ end
30
+
31
+ ###
32
+ def BerkeleyInterface.service()
33
+ return "parser"
34
+ end
35
+
36
+ ###
37
+ # initialize to set values for all subsequent processing
38
+ def initialize(program_path, # string: path to system
39
+ insuffix, # string: suffix of tab files
40
+ outsuffix, # string: suffix for parsed files
41
+ stsuffix, # string: suffix for Salsa/TIGER XML files
42
+ var_hash = {}) # optional arguments in a hash
43
+
44
+ super(program_path, insuffix, outsuffix, stsuffix, var_hash)
45
+ unless @program_path =~ /\/$/
46
+ @program_path = @program_path + "/"
47
+ end
48
+
49
+ # new: evaluate var hash
50
+ @pos_suffix = var_hash["pos_suffix"]
51
+ @lemma_suffix = var_hash["lemma_suffix"]
52
+ @tab_dir = var_hash["tab_dir"]
53
+ end
54
+
55
+ ####
56
+ # parse a directory with TabFormat files and write the parse trees to outputdir
57
+ # I assume that each file in inputdir contains no more sentences
58
+ # than Berkeley can parse in one go
59
+ # (i.e. that the input has already been split accordingly)
60
+ def process_dir(in_dir, # string: input directory name
61
+ out_dir) # string: output directory name
62
+
63
+ # not using x64 arch, adjusting for 32 bit
64
+ # berkeley_prog = "java -d64 -Xmx10000m -jar #{@program_path}berkeley-parser.jar -gr #{@program_path}gerNegra.01.utf8 "
65
+ berkeley_prog = "java -Xmx2000m -jar #{@program_path}berkeleyParser.jar -gr #{@program_path}ger_sm5.gr"
66
+
67
+ berkeley_prog = "java -jar #{@program_path}berkeley-parser.jar -gr #{@program_path}gerNegra.01.utf8 "
68
+ Dir[in_dir + "*" + @insuffix].each {|inputfilename|
69
+ STDERR.puts "*** Parsing #{inputfilename} with Berkeley"
70
+ corpusfilename = File.basename(inputfilename, @insuffix)
71
+ parsefilename = out_dir + corpusfilename + @outsuffix
72
+ tempfile = Tempfile.new(corpusfilename)
73
+
74
+ # we need neither lemmata nor POS tags; berkeley can do with the words
75
+ corpusfile = FNTabFormatFile.new(inputfilename,nil, nil)
76
+ corpusfile.each_sentence {|sentence|
77
+ #puts sentence.to_s
78
+ tempfile.puts sentence.to_s
79
+ }
80
+ tempfile.close
81
+ # parse and remove comments in the parser output
82
+ STDERR.puts "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"
83
+
84
+ # AB: for testing we leave this step out, it takes too much time.
85
+ # Please keep the <parsefile> intact!!!
86
+ Kernel.system("#{berkeley_prog} < #{tempfile.path} > #{parsefilename}")
87
+
88
+ }
89
+ end
90
+
91
+ ###
92
+ # for a given parsed file:
93
+ # yield each sentence as a triple
94
+ # [SalsaTigerSentence object, FNTabFormatSentence object, node mapping]
95
+ # of the sentence in SalsaTigerXML, the matching tab format sentence, and the mapping between their nodes
96
+ #
97
+ # If a parse has failed, returns
98
+ # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
99
+ # to allow more detailed accounting for failed parses
100
+ # (basically just a flat structure with a failed=true attribute
101
+ # at the sentence node)
102
+ def each_sentence(parsefilename)
103
+ # sanity checks
104
+ unless @tab_dir
105
+ raise "Need to set tab directory on initialization"
106
+ end
107
+
108
+ # get matching tab file for this parser output file
109
+ parsefile = File.new(parsefilename)
110
+ tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
111
+ tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)
112
+
113
+ sentid = 0
114
+ tabfile.each_sentence {|tab_sent| # iterate over corpus sentences
115
+
116
+ sentence_str = ""
117
+ status = true # remains true as long as no error has been encountered
118
+ # assemble next sentence in Berkeley file by reading lines from parsefile
119
+ # for berkeley:
120
+ while true
121
+ line = parsefile.gets
122
+
123
+ # search for the next "relevant" line or the end of the file
124
+ if line.nil? or line=~/^\( *\(TOP/ or line=~/^\(\(\)/
125
+ break
126
+ end
127
+ sentid +=1
128
+
129
+ end
130
+
131
+
132
+ if line.nil? # the parse file ended while we were still looking for a parse
133
+ raise "Error: premature end of parser file!"
134
+ end
135
+
136
+
137
+ # Berkeley parser output: strip the outer bracket pair, separate closing brackets, and rewrite CAT_GF labels as CAT-GF
138
+ line.sub!(/^\( */, '')
139
+ line.sub!(/ *\) *$/, '')
140
+ line.gsub!(/\)\)/, ') )')
141
+ line.gsub!(/\)\)/, ') )') # second pass: gsub does not rescan its own replacements, so adjacent brackets may remain after one pass
142
+ line.gsub!(/(\([A-Z]+)_/, '\1-')
143
+
144
+ sentence_str = line.chomp!
145
+
146
+ # if we are here, we have a sentence_str to work on
147
+ # hopefully, our status is OK
148
+ case status
149
+ when true
150
+ if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
151
+ my_sent_id = tab_sent.get_sent_id()
152
+ else
153
+ my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
154
+ end
155
+
156
+ st_sent = build_salsatiger(" " + sentence_str + " ", 0,
157
+ Array.new, Counter.new(0),
158
+ Counter.new(500),
159
+ SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
160
+ if st_sent.nil?
161
+ next
162
+ end
163
+ yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
164
+ else # i.e. when "failed"
165
+ #raise "Hunh? This is a failed parse, but still we have a parse tree? Look again."
166
+ end
167
+
168
+ }
169
+
170
+ # we don't have a sentence: hopefully, this is because parsing has failed
171
+
172
+
173
+ # all TabFile sentences are consumed:
174
+ # now we may just encounter comments, garbage, empty lines etc.
175
+
176
+ while not parsefile.eof?
177
+
178
+ case parsefile.gets
179
+ when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
180
+ else
181
+ raise "Error: premature end of tab file!"
182
+ end
183
+ end
184
+ end
185
+
186
+
187
+ ###
188
+ # write Salsa/TIGER XML output to file
189
+ def to_stxml_file(infilename, # string: name of parse file
190
+ outfilename) # string: name of output stxml file
191
+
192
+ outfile = File.new(outfilename, "w")
193
+
194
+ outfile.puts SalsaTigerXMLHelper.get_header()
195
+ each_sentence(infilename) { |st_sent, tabsent|
196
+ outfile.puts st_sent.get()
197
+ }
198
+ outfile.puts SalsaTigerXMLHelper.get_footer()
199
+ outfile.close()
200
+ end
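An illustrative usage sketch of this interface (not part of the gem's source; the paths, suffixes and tab_dir value are invented, and the gem's lib directory is assumed to be on the load path so that class registration via announce_me works):

    require "common/BerkeleyInterface"

    # Hypothetical paths and suffixes, for illustration only.
    parser = BerkeleyInterface.new("/opt/berkeley/",          # program_path
                                   ".tab",                    # insuffix
                                   ".parsed",                 # outsuffix
                                   ".xml",                    # stsuffix
                                   "tab_dir" => "/data/tab/")
    parser.process_dir("/data/tab/", "/data/parses/")         # parse all *.tab files
    parser.to_stxml_file("/data/parses/corpus.parsed",        # convert one parse file
                         "/data/stxml/corpus.xml")            # to Salsa/TIGER XML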
201
+
202
+
203
+
204
+ ########################
205
+ private
206
+
207
+ ###
208
+ # Recursive function for parsing a Berkeley parse tree and
209
+ # building a SalsaTigerSentence recursively
210
+ #
211
+ # Algorithm: manage stack which contains, for the current constituent,
212
+ # child constituents (if a nonterminal), and the category label.
213
+ # When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
214
+ # All children and the category label are popped from the stack and integrated into the
215
+ # TigerSalsa data structure. The new node is re-pushed onto the stack.
216
+ def build_salsatiger(sentence, # string
217
+ pos, # position in string (index): integer
218
+ stack, # stack with incomplete nodes: Array
219
+ termc, # terminal counter
220
+ nontc, # nonterminal counter
221
+ sent_obj) # SalsaTigerSentence
222
+
223
+
224
+
225
+ if sentence =~ /\(\)/
226
+ return nil
227
+ end
228
+
229
+ # main case distinction: match the beginning of our string
230
+ # (i.e. what follows our current position in the string)
231
+ case sentence[pos..-1]
232
+
233
+ when /^ *$/ # nothing -> whole sentence parsed
234
+ if stack.length == 1
235
+ # the parser always delivers exactly one "top" node; if we don't get just one
236
+ # node, something has gone wrong
237
+ node = stack.pop
238
+ node.del_attribute("gf")
239
+ return sent_obj
240
+ else
241
+ raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
242
+ end
243
+
244
+ when /^\s*\(([^ )]+) /
245
+ # match the beginning of a new constituent
246
+ # (opening bracket + category + space, may not contain closing bracket)
247
+ cat = $1
248
+ if cat.nil? or cat == ""
249
+ raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
250
+ end
251
+ # STDERR.puts "new const #{cat}"
252
+ stack.push cat # throw the category label on the stack
253
+ return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
254
+
255
+ when /^\s*(\S+)\) /
256
+ # match the end of a terminal constituent (something before a closing bracket + space)
257
+ word = $1
258
+
259
+ comb_cat = stack.pop
260
+ if comb_cat.to_s == ""
261
+ raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
262
+ end
263
+
264
+ cat,gf = split_cat(comb_cat)
265
+ node = sent_obj.add_syn("t",
266
+ nil, # cat (doesn't matter here)
267
+ SalsaTigerXMLHelper.escape(word), # word
268
+ cat, # pos
269
+ termc.next.to_s)
270
+ node.set_attribute("gf",gf)
271
+ # STDERR.puts "completed terminal #{cat}, #{word}"
272
+ stack.push node
273
+ return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
274
+
275
+ when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
276
+ # now collect children:
277
+ # pop items from the stack until you find the category
278
+ children = Array.new
279
+ while true
280
+ if stack.empty?
281
+ raise "Error: stack empty; cannot find more children"
282
+ end
283
+ item = stack.pop
284
+ case item.class.to_s
285
+ when "SynNode" # this is a child
286
+ children.push item
287
+ when "String" # this is the category label
288
+ if item.to_s == ""
289
+ raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
290
+ end
291
+ cat,gf = split_cat(item)
292
+ break
293
+ else
294
+ raise "Error: unknown item class #{item.class.to_s}"
295
+ end
296
+ end
297
+ # now add a nonterminal node to the sentence object and
298
+ # register the children nodes
299
+ node = sent_obj.add_syn("nt",
300
+ cat, # cat
301
+ nil, # word (doesn't matter)
302
+ nil, # pos (doesn't matter)
303
+ nontc.next.to_s)
304
+ children.each {|child|
305
+ child_gf = child.get_attribute("gf")
306
+ child.del_attribute("gf")
307
+ node.add_child(child,child_gf)
308
+ child.add_parent(node, child_gf)
309
+ }
310
+ node.set_attribute("gf",gf)
311
+ # STDERR.puts "Completed nonterm #{cat}, #{children.length} children."
312
+ stack.push node
313
+
314
+ return build_salsatiger(sentence,pos+$&.length, stack,termc,nontc,sent_obj)
315
+ else
316
+ raise "Error: cannot analyse sentence at pos #{pos}: #{sentence[pos..-1]}. Complete sentence: \n#{sentence}"
317
+ end
318
+ end
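As a worked example of the recursion above (input invented for illustration): for the already preprocessed string " (S (NN word) ) ", the constituent case first pushes the label "S", then "NN"; the token "word) " matches the terminal case, so "NN" is popped, split into category "NN" and an empty grammatical function, and a terminal SynNode is pushed back; the remaining ") " matches the nonterminal case, which pops the terminal child and the label "S" and pushes the finished S node; the empty rest of the string then matches the termination case with exactly one node left on the stack.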
319
+
320
+
321
+
322
+
323
+ ###
324
+ # Berkeley delivers node labels as "phrase type"-"grammatical function"
325
+ # but the GF may not be present.
326
+
327
+ def split_cat(cat)
328
+
329
+ cat =~ /^([^-]*)(-([^-]*))?$/
330
+ unless $1
331
+ raise "Error: could not identify category in #{cat}"
332
+ end
333
+
334
+ proper_cat = $1
335
+
336
+ if $3
337
+ gf = $3
338
+ else
339
+ gf = ""
340
+ end
341
+
342
+ return [proper_cat,gf]
343
+
344
+ end
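For illustration (labels invented), the regular expression above splits node labels as follows:

    split_cat("NP-SB")   # => ["NP", "SB"]   category "NP", grammatical function "SB"
    split_cat("VP")      # => ["VP", ""]     no grammatical function suffix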
345
+ end
346
+
347
+
348
+
349
+ ################################################
350
+ # Interpreter class
351
+ class BerkeleyInterpreter < Tiger
352
+ BerkeleyInterpreter.announce_me()
353
+
354
+ ###
355
+ # names of the systems interpreted by this class:
356
+ # returns a hash service(string) -> system name (string),
357
+ # e.g.
358
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
359
+ def BerkeleyInterpreter.systems()
360
+ return {
361
+ "parser" => "berkeley"
362
+ }
363
+ end
364
+
365
+ ###
366
+ # names of additional systems that may be interpreted by this class
367
+ # returns a hash service(string) -> system name(string)
368
+ # same as names()
369
+ def BerkeleyInterpreter.optional_systems()
370
+ return {
371
+ "lemmatizer" => "treetagger"
372
+ }
373
+ end
374
+
375
+ end
@@ -0,0 +1,1165 @@
1
+ ####
2
+ # sp 15 04 05
3
+ #
4
+ # modified ke 30 10 05: adapted to fit into SynInterface
5
+ #
6
+ # represents a file containing Collins parses
7
+ #
8
+ # underlying data structure for individual sentences: SalsaTigerSentence
9
+
10
+
11
+ require "tempfile"
12
+ require "common/TabFormat"
13
+ require "common/SalsaTigerRegXML"
14
+ require "common/SalsaTigerXMLHelper"
15
+ require "common/Counter"
16
+
17
+ require "common/AbstractSynInterface"
18
+
19
+ ################################################
20
+ # Interface class
21
+ class CollinsInterface < SynInterfaceSTXML
22
+ CollinsInterface.announce_me()
23
+
24
+ ###
25
+ def CollinsInterface.system()
26
+ return "collins"
27
+ end
28
+
29
+ ###
30
+ def CollinsInterface.service()
31
+ return "parser"
32
+ end
33
+
34
+ ###
35
+ # initialize to set values for all subsequent processing
36
+ def initialize(program_path, # string: path to system
37
+ insuffix, # string: suffix of tab files
38
+ outsuffix, # string: suffix for parsed files
39
+ stsuffix, # string: suffix for Salsa/TIGER XML files
40
+ var_hash = {}) # optional arguments in a hash
41
+
42
+ super(program_path, insuffix, outsuffix, stsuffix, var_hash)
43
+ # I am not expecting any parameters, but I need
44
+ # the program path to end in a /.
45
+ unless @program_path =~ /\/$/
46
+ @program_path = @program_path + "/"
47
+ end
48
+
49
+ # new: evaluate var hash
50
+ @pos_suffix = var_hash["pos_suffix"]
51
+ @lemma_suffix = var_hash["lemma_suffix"]
52
+ @tab_dir = var_hash["tab_dir"]
53
+ end
54
+
55
+
56
+ ###
57
+ # parse a bunch of TabFormat files (*.<insuffix>) with Collins model 3
58
+ # required: POS tags must be present
59
+ # produced: in outputdir, files *.<outsuffix>
60
+ # I assume that each file in inputdir contains no more sentences
61
+ # than Collins can parse in one go
62
+ # (i.e. that the input has already been split), so I don't have to care
63
+ def process_dir(in_dir, # string: name of input directory
64
+ out_dir) # string: name of output directory
65
+ print "parsing ", in_dir, " and writing to ", out_dir, "\n"
66
+
67
+ unless @pos_suffix
68
+ raise "Collins interface: need suffix for POS files"
69
+ end
70
+
71
+ collins_prog = "gunzip -c #{@program_path}models/model3/events.gz | nice #{@program_path}code/parser"
72
+ collins_params = " #{@program_path}models/model3/grammar 10000 1 1 1 1"
73
+
74
+ Dir[in_dir+ "*" + @insuffix].each { |inputfilename|
75
+
76
+ STDERR.puts "*** Parsing #{inputfilename} with Collins"
77
+
78
+ corpusfilename = File.basename(inputfilename, @insuffix)
79
+ parsefilename = out_dir+corpusfilename+ @outsuffix
80
+ tempfile = Tempfile.new(corpusfilename)
81
+
82
+ # we need to have part of speech tags (but no lemmas at this point)
83
+ # included automatically by FNTabFormatFile initialize from *.pos
84
+ tabfile = FNTabFormatFile.new(inputfilename,@pos_suffix)
85
+
86
+ CollinsInterface.produce_collins_input(tabfile,tempfile)
87
+ tempfile.close
88
+ print collins_prog+" "+tempfile.path+" "+ collins_params+" > "+parsefilename
89
+ Kernel.system(collins_prog+" "+tempfile.path+" "+
90
+ collins_params+" > "+parsefilename)
91
+ tempfile.close(true)
92
+ }
93
+ end
94
+
95
+ ###
96
+ # for a given parsed file:
97
+ # yield each sentence as a triple
98
+ # [SalsaTigerSentence object, FNTabFormatSentence object, node mapping]
99
+ # of the sentence in SalsaTigerXML, the matching tab format sentence, and the mapping between their nodes
100
+ #
101
+ # If a parse has failed, returns
102
+ # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
103
+ # to allow more detailed accounting for failed parses
104
+ def each_sentence(parsefilename)
105
+
106
+ # sanity checks
107
+ unless @tab_dir
108
+ raise "Need to set tab directory on initialization"
109
+ end
110
+
111
+ # get matching tab file for this parser output file
112
+ parserfile = File.new(parsefilename)
113
+ tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
114
+
115
+ corpusfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)
116
+
117
+ corpusfile.each_sentence {|tab_sent| # iterate over corpus sentences
118
+
119
+ my_sent_id = tab_sent.get_sent_id()
120
+
121
+ while true # find next matching line in parse file
122
+ line = parserfile.gets
123
+ # search for the next "relevant" line or the end of the file
124
+ if line.nil? or line=~/^\(TOP/
125
+ break
126
+ end
127
+ end
128
+ STDERR.puts line
129
+ # the parse file ended while we were still looking for a parse
130
+ if line.nil?
131
+ raise "Error: premature end of parser file!"
132
+ end
133
+
134
+ line.chomp!
135
+
136
+ # it now holds that line =~ ^(TOP
137
+
138
+ case line
139
+ when /^\(TOP~/ # successful parse
140
+
141
+ st_sent = SalsaTigerSentence.empty_sentence(my_sent_id.to_s)
142
+
143
+ build_salsatiger(line,st_sent)
144
+
145
+ yield [st_sent, tab_sent, CollinsInterface.standard_mapping(st_sent, tab_sent)]
146
+
147
+ else
148
+ # failed parse: create a "failed" parse object
149
+ # with one nonterminal node and all the terminals
150
+
151
+ sent = CollinsInterface.failed_sentence(tab_sent,my_sent_id)
152
+ yield [sent, tab_sent, CollinsInterface.standard_mapping(sent, tab_sent)]
153
+
154
+ end
155
+ }
156
+ # after the end of the corpusfile, check if there are any parses left
157
+ while true
158
+ line = parserfile.gets
159
+ if line.nil? # if there are none, everything is fine
160
+ break
161
+ elsif line =~ /^\(TOP/ # if there are, raise an exception
162
+ raise "Error: premature end of corpus file!"
163
+ end
164
+ end
165
+ end
166
+
167
+ ###
168
+ # write Salsa/TIGER XML output to file
169
+ def to_stxml_file(infilename, # string: name of parse file
170
+ outfilename) # string: name of output stxml file
171
+
172
+ outfile = File.new(outfilename, "w")
173
+ outfile.puts SalsaTigerXMLHelper.get_header()
174
+ each_sentence(infilename) { |st_sent, tabsent|
175
+ outfile.puts st_sent.get()
176
+ }
177
+ outfile.puts SalsaTigerXMLHelper.get_footer()
178
+ outfile.close()
179
+ end
180
+
181
+
182
+ ########################
183
+ private
184
+
185
+ # Build a SalsaTigerSentence corresponding to the Collins parse in argument string.
186
+ #
187
+ # Special features: removes unary nodes and traces
188
+ def build_salsatiger(string,st_sent)
189
+
190
+ nt_c = Counter.new(500)
191
+ t_c = Counter.new(0)
192
+
193
+ position = 0
194
+ stack = Array.new
195
+
196
+ while position < string.length
197
+ if string[position,1] == "(" # push nonterminal
198
+ nextspace = string.index(" ",position)
199
+ nonterminal = string[position+1..nextspace-1]
200
+ stack.push nonterminal
201
+ position = nextspace+1
202
+ elsif string[position,1] == ")" # reduce stack
203
+ tempstack = Array.new
204
+ while true
205
+ # get all Nodes from the stack and put them on a tempstack,
206
+ # until you find a String, i.e. the label of a nonterminal that has not been built yet
207
+ object = stack.pop
208
+ if object.kind_of? SynNode
209
+ tempstack.push(object) # terminal or subtree
210
+ else # string (nonterminal label)
211
+ if tempstack.length == 1 # skip unary nodes: do nothing and write tempstack back to stack
212
+ stack += tempstack
213
+ break
214
+ # puts "Unary node #{object}"
215
+ end
216
+ nt_a = object.split("~")
217
+ unless nt_a.length == 4
218
+ # something went wrong. maybe it's about character encoding
219
+ if nt_a.length() > 4
220
+ # yes, assume it's about character encoding
221
+ nt_a = [nt_a[0], nt_a[1..-3].join("~"), nt_a[-2], nt_a[-1]]
222
+ else
223
+ # whoa, _fewer_ pieces than expected: problem.
224
+ $stderr.puts "Collins parse tree translation nonrecoverable error:"
225
+ $stderr.puts "Unexpectedly too few components in nonterminal " + nt_a.join("~")
226
+ raise StandardError.new("nonrecoverable error")
227
+ end
228
+ end
229
+
230
+ # construct a new nonterminal
231
+ node = st_sent.add_syn("nt",
232
+ SalsaTigerXMLHelper.escape(nt_a[0].strip), # cat
233
+ nil, # word (doesn't matter)
234
+ nil, # pos (doesn't matter)
235
+ nt_c.next.to_s)
236
+ node.set_attribute("head",SalsaTigerXMLHelper.escape(nt_a[1].strip))
237
+ tempstack.reverse.each {|child|
238
+ node.add_child(child,nil)
239
+ child.set_parent(node,nil)
240
+ }
241
+ stack.push(node)
242
+ break # while
243
+ end
244
+ end
245
+ position = position+2 # == nextspace+1
246
+ else # terminal
247
+ nextspace = string.index(" ",position)
248
+ terminal = string[position..nextspace].strip
249
+ t_a = terminal.split("/")
250
+ unless t_a.length == 2
251
+ raise "[collins] Cannot split terminal #{terminal} into word and POS!"
252
+ end
253
+
254
+ word = t_a[0]
255
+ pos = t_a[1]
256
+
257
+ unless pos =~ /TRACE/
258
+ # construct a new terminal
259
+ node = st_sent.add_syn("t",
260
+ nil,
261
+ SalsaTigerXMLHelper.escape(CollinsInterface.unescape(word)), # word
262
+ SalsaTigerXMLHelper.escape(pos), # pos
263
+ t_c.next.to_s)
264
+ stack.push(node)
265
+ end
266
+ position = nextspace+1
267
+ end
268
+ end
269
+
270
+ # at the very end, we need to have exactly one syntactic root
271
+
272
+ if stack.length != 1
273
+ raise "[collins] Error: Sentence has #{stack.length} roots"
274
+ end
275
+ end
276
+
277
+
278
+ ####
279
+ # extract the Collins parser input format from a TabFormat object
280
+ # that includes part-of-speech (pos)
281
+ #
282
+ def CollinsInterface.produce_collins_input(corpusfile,tempfile)
283
+ corpusfile.each_sentence {|s|
284
+ words = Array.new
285
+ s.each_line_parsed {|line_obj|
286
+ word = line_obj.get("word")
287
+ tag = line_obj.get("pos")
288
+ if tag.nil?
289
+ raise "Error: FNTabFormat object not tagged!"
290
+ end
291
+ word_tag_pair = CollinsInterface.escape(word,tag)
292
+ if word_tag_pair =~ /\)/
293
+ puts word_tag_pair
294
+ puts s.to_s
295
+ end
296
+ words << word_tag_pair
297
+ }
298
+ tempfile.puts words.length.to_s+" "+words.join(" ")
299
+ }
300
+ end
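Illustrative only (tokens and tags invented): the line written to the Collins input file is the token count followed by word/POS pairs, e.g. for a three-token sentence:

    words = ["The DT", "house NN", "stands VBZ"]        # each entry is "word POS", as assembled above
    puts words.length.to_s + " " + words.join(" ")
    # => 3 The DT house NN stands VBZ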
301
+
302
+ ####
303
+ def CollinsInterface.escape(word,pos) # returns string: word + " " + pos, with brackets and slashes escaped
304
+ case word
305
+
306
+ # replace opening or closing brackets
307
+ # word representation is {L,R}R{B,S,C} (bracket, square, curly)
308
+ # POS for opening brackets is LRB, closing brackets RRB
309
+
310
+ when "("
311
+ return "LRB -LRB-"
312
+ when "["
313
+ return "LRS -LRB-"
314
+ when "{"
315
+ return "LRC -LRB-"
316
+
317
+ when ")"
318
+ return "RRB -RRB-"
319
+ when "]"
320
+ return "RRS -RRB-"
321
+ when "}"
322
+ return "RRC -RRB-"
323
+
324
+ # catch those brackets or slashes inside words
325
+ else
326
+ word.gsub!(/\(/,"LRB")
327
+ word.gsub!(/\)/,"RRB")
328
+ word.gsub!(/\[/,"LRS")
329
+ word.gsub!(/\]/,"RRS")
330
+ word.gsub!(/\{/,"LRC")
331
+ word.gsub!(/\}/,"RRC")
332
+ word.gsub!(/\//,"&Slash;")
333
+ return word+" "+pos
334
+ end
335
+ end
336
+
337
+ ####
338
+ # replace replacements with original values
339
+ def CollinsInterface.unescape(word)
340
+ return word.gsub(/LRB/,"(").gsub(/RRB/,")").gsub(/LRS/,"[").gsub(/RRS/,"]").gsub(/LRC/,"{").gsub(/RRC/,"}").gsub(/&Slash;/,"/")
341
+ end
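Illustrative only (inputs invented), showing the bracket escaping and its inverse:

    CollinsInterface.escape("(", "(")      # => "LRB -LRB-"    bare opening bracket
    CollinsInterface.escape("a(b", "NN")   # => "aLRBb NN"     bracket inside a word
    CollinsInterface.unescape("aLRBb")     # => "a(b"          restores the original form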
342
+ end
343
+
344
+ ################################################
345
+ # Interpreter class
346
+ class CollinsTntInterpreter < SynInterpreter
347
+ CollinsTntInterpreter.announce_me()
348
+
349
+ ###
350
+ # names of the systems interpreted by this class:
351
+ # returns a hash service(string) -> system name (string),
352
+ # e.g.
353
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
354
+ def CollinsTntInterpreter.systems()
355
+ return {
356
+ "pos_tagger" => "treetagger",
357
+ "parser" => "collins"
358
+ }
359
+ end
360
+
361
+ ###
362
+ # names of additional systems that may be interpreted by this class
363
+ # returns a hash service(string) -> system name(string)
364
+ # same as names()
365
+ def CollinsTntInterpreter.optional_systems()
366
+ return {
367
+ "lemmatizer" => "treetagger"
368
+ }
369
+ end
370
+
371
+ ###
372
+ # generalize over POS tags.
373
+ #
374
+ # returns one of:
375
+ #
376
+ # adj: adjective (phrase)
377
+ # adv: adverb (phrase)
378
+ # card: numbers, quantity phrases
379
+ # con: conjunction
380
+ # det: determiner, including possessive/demonstrative pronouns etc.
381
+ # for: foreign material
382
+ # noun: noun (phrase), including personal pronouns, proper names, expletives
383
+ # part: particles, truncated words (German compound parts)
384
+ # prep: preposition (phrase)
385
+ # pun: punctuation, brackets, etc.
386
+ # sent: sentence
387
+ # top: top node of a sentence
388
+ # verb: verb (phrase)
389
+ # nil: something went wrong
390
+ #
391
+ # returns: string, or nil
392
+ def CollinsTntInterpreter.category(node) # SynNode
393
+ pt = CollinsTntInterpreter.simplified_pt(node)
394
+ if pt.nil?
395
+ # phrase type could not be determined
396
+ return nil
397
+ end
398
+
399
+ pt.to_s.strip() =~ /^([^-]*)/
400
+ case $1
401
+ when /^JJ/ ,/(WH)?ADJP/, /^PDT/ then return "adj"
402
+ when /^RB/, /(WH)?ADVP/, /^UH/ then return "adv"
403
+ when /^CD/, /^QP/ then return "card"
404
+ when /^CC/, /^WRB/, /^CONJP/ then return "con"
405
+ when /^DT/, /^POS/ then return "det"
406
+ when /^FW/, /^SYM/ then return "for"
407
+ when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/ then return "noun"
408
+ when /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then return "prep"
409
+ when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then return "pun"
410
+ when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then return "sent"
411
+ when /^TOP/ then return "top"
412
+ when /^TRACE/ then return "trace"
413
+ when /^V/ , /^MD/ then return "verb"
414
+ else
415
+ # $stderr.puts "WARNING: Unknown category/POS "+c.to_s + " (English data)"
416
+ return nil
417
+ end
418
+ end
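For example, a node whose simplified phrase type is NN falls into the /^N/ branch and yields "noun", VBD matches /^V/ and yields "verb", IN yields "prep", and a comma is caught by the punctuation branch and yields "pun".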
419
+
420
+
421
+ ###
422
+ # is relative pronoun?
423
+ #
424
+ def CollinsTntInterpreter.relative_pronoun?(node) # SynNode
425
+ pt = CollinsTntInterpreter.simplified_pt(node)
426
+ if pt.nil?
427
+ # phrase type could not be determined
428
+ return nil
429
+ end
430
+
431
+ pt.to_s.strip() =~ /^([^-]*)/
432
+ case $1
433
+ when /^WDT/, /^WHAD/, /^WHNP/, /^WP/
434
+ return true
435
+ else
436
+ return false
437
+ end
438
+ end
439
+
440
+ ###
441
+ # lemma_backoff:
442
+ #
443
+ # if we have lemma information, return that,
444
+ # and failing that, return the word
445
+ #
446
+ # returns: string, or nil
447
+ def CollinsTntInterpreter.lemma_backoff(node)
448
+ lemma = super(node)
449
+ # lemmatizer has returned more than one possible lemma form:
450
+ # just accept the first
451
+ if lemma =~ /^([^|]+)\|/ # escaped pipe: match only if there really is a "|" separator
452
+ return $1
453
+ else
454
+ return lemma
455
+ end
456
+ end
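Illustrative only (lemma strings invented): with the escaped pipe, a multi-lemma answer from the lemmatizer is reduced to its first alternative, while a single lemma falls through to the else branch:

    "go|going" =~ /^([^|]+)\|/   # matches, $1 == "go"
    "went"     =~ /^([^|]+)\|/   # no match, so the lemma is returned unchanged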
457
+
458
+
459
+ ###
460
+ # simplified phrase type:
461
+ # like phrase type, but may simplify
462
+ # the constituent label
463
+ #
464
+ # returns: string
465
+ def CollinsTntInterpreter.simplified_pt(node)
466
+ CollinsTntInterpreter.pt(node) =~ /^(\w+)(-\w)*/
467
+ return $1
468
+ end
469
+
470
+ ###
471
+ # verb_with_particle:
472
+ #
473
+ # given a node and a nodelist,
474
+ # if the node represents a verb:
475
+ # see if the verb has a particle among the nodes in nodelist
476
+ # if so, return it
477
+ #
478
+ # returns: SynNode object if successful, else nil
479
+ def CollinsTntInterpreter.particle_of_verb(node,
480
+ node_list)
481
+
482
+ # must be verb
483
+ unless CollinsTntInterpreter.category(node) == "verb"
484
+ return nil
485
+ end
486
+
487
+ # must have parent
488
+ unless node.parent
489
+ return nil
490
+ end
491
+
492
+ # look for sisters of the verb node that have the particle category
493
+ particles = node.parent.children.select { |sister|
494
+ CollinsTntInterpreter.category(sister) == "part"
495
+ }.map { |n| n.children}.flatten.select { |niece|
496
+ # now look for children of those nodes that are particles and are in the nodelist
497
+ node_list.include? niece and
498
+ CollinsTntInterpreter.category(niece) == "part"
499
+ }
500
+
501
+ if particles.length == 0
502
+ return nil
503
+ else
504
+ return particles.first
505
+ end
506
+ end
507
+
508
+ ###
509
+ # auxiliary?
510
+ #
511
+ # returns true if the given node is an auxiliary
512
+ # else false
513
+ def CollinsTntInterpreter.auxiliary?(node)
514
+
515
+ # look for
516
+ # ---VP---
517
+ # | |
518
+ # the given node VP-A
519
+ # |
520
+ # verb node
521
+ # verb?
522
+ unless CollinsTntInterpreter.category(node) == "verb"
523
+ return false
524
+ end
525
+
526
+ unless (parent = node.parent) and
527
+ parent.category() == "VP"
528
+ return false
529
+ end
530
+ unless (vpa_node = parent.children.detect { |other_child| other_child.category() == "VP-A" })
531
+ return false
532
+ end
533
+ unless vpa_node.children.detect { |other_node| CollinsTntInterpreter.category(other_node) == "verb" }
534
+ return false
535
+ end
536
+
537
+ return true
538
+
539
+ end
540
+
541
+ ###
542
+ # modal?
543
+ #
544
+ # returns true if the given node is a modal verb,
545
+ # else false
546
+ def CollinsTntInterpreter.modal?(node)
547
+ if node.part_of_speech() =~ /^MD/
548
+ return true
549
+ else
550
+ return false
551
+ end
552
+ end
553
+
554
+ ###
555
+ # voice
556
+ #
557
+ # given a constituent, return
558
+ # - "active"/"passive" if it is a verb
559
+ # - nil, else
560
+ def CollinsTntInterpreter.voice(node) # SynNode
561
+
562
+ tobe = ["be","am","is","are","was","were"]
563
+
564
+ unless CollinsTntInterpreter.category(node) == "verb"
565
+ return nil
566
+ end
567
+
568
+ # if we have a gerund, a present tense, or an infinitive
569
+ # then we are sure that we have an active form
570
+ case CollinsTntInterpreter.pt(node)
571
+ when "VBG","VBP", "VBZ", "VB"
572
+ return "active"
573
+ end
574
+
575
+
576
+ # There is an ambiguity for many word forms between VBN (past participle - passive)
577
+ # and VBD (past tense - active)
578
+
579
+ # so for these, we only say something if we can exclude one possibility,
580
+ # this is the case
581
+ # (a) when there is a c-commanding "to be" somewhere. -> passive
582
+ # (b) when there is no "to be", but a "to have" somewhere. -> active
583
+
584
+ # collect lemmas of c-commanding verbs.
585
+
586
+ parent = node.parent
587
+ if parent.nil?
588
+ return nil
589
+ end
590
+ gp = parent.parent
591
+ if gp.nil?
592
+ return nil
593
+ end
594
+
595
+ # other_verbs = Array.new
596
+ #
597
+ # current_node = node
598
+ # while current_node = current_node.parent
599
+ # pt = CollinsTntInterpreter.category(current_node)
600
+ # unless ["verb","sentence"].include? pt
601
+ # break
602
+ # end
603
+ # current_node.children.each {|child|
604
+ # if CollinsTntInterpreter.category(child) == "verb"
605
+ # other_verbs << CollinsTntInterpreter.lemma_backoff(nephew)
606
+ # end
607
+ # }
608
+ # end
609
+ #
610
+ # unless (tobe & other_verbs).empty?
611
+ # puts "passive "+node.id
612
+ # return "passive"
613
+ # end
614
+ # unless (tohave & other_verbs).empty?
615
+ # return "active"
616
+ # end
617
+
618
+ if CollinsTntInterpreter.category(gp) == "verb" or CollinsTntInterpreter.category(gp) == "sent"
619
+
620
+ current_node = node
621
+
622
+ while current_node = current_node.parent
623
+ pt = CollinsTntInterpreter.category(current_node)
624
+ unless ["verb","sent"].include? pt
625
+ break
626
+ end
627
+ if current_node.children.detect {|nephew| tobe.include? CollinsTntInterpreter.lemma_backoff(nephew)}
628
+ return "passive"
629
+ end
630
+ end
631
+ # if no "to be" has been found...
632
+ return "active"
633
+ end
634
+
635
+ # case 2: the grandparent is something else (e.g. a noun phrase)
636
+ # here, simple past forms are often mis-tagged as passives
637
+ #
638
+
639
+ # if we were cautious, we would return "dontknow" here;
640
+ # however, these cases are so rare that it is unlikely that
641
+ # assignments would be more reliable; so we rely on the
642
+ # POS tag anyway.
643
+
644
+
645
+ case CollinsTntInterpreter.pt(node)
646
+ when "VBN","VBD"
647
+ return "passive"
648
+ # this must be some kind of error...
649
+ else
650
+ return nil
651
+ end
652
+ end
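As a worked example (sentences and their parses invented): in "the house was built in 1950", "built" is tagged VBN and the loop above finds the c-commanding form "was" of "to be", so the method returns "passive"; in "they built the house", "built" is tagged VBD, its grandparent is a sentence node, no form of "to be" is found, and the method returns "active".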
653
+
654
+ ###
655
+ # gfs
656
+ #
657
+ # grammatical functions of a constituent:
658
+ #
659
+ # returns: a list of pairs [relation(string), node(SynNode)]
660
+ # where <node> stands in the relation <relation> to the parameter
661
+ # that the method was called with
662
+ def CollinsTntInterpreter.gfs(anchor_node, # SynNode
663
+ sent) # SalsaTigerSentence
664
+
665
+ return sent.syn_nodes.map { |gf_node|
666
+
667
+ case CollinsTntInterpreter.category(anchor_node)
668
+ when "adj"
669
+ rel = CollinsTntInterpreter.gf_adj(anchor_node, gf_node)
670
+ when "verb"
671
+ rel = CollinsTntInterpreter.gf_verb(anchor_node, gf_node)
672
+ when "noun"
673
+ rel = CollinsTntInterpreter.gf_noun(anchor_node, gf_node)
674
+ end
675
+
676
+ if rel
677
+ [rel, gf_node]
678
+ else
679
+ nil
680
+ end
681
+ }.compact()
682
+ end
683
+
684
+ ###
685
+ # informative_content_node
686
+ #
687
+ # for most constituents: nil
688
+ # for a PP, the NP
689
+ # for an SBAR, the VP
690
+ # for a VP, the embedded VP
691
+ def CollinsTntInterpreter.informative_content_node(node)
692
+ this_pt = CollinsTntInterpreter.simplified_pt(node)
693
+
694
+ unless ["SBAR", "VP", "PP"].include? this_pt
695
+ return nil
696
+ end
697
+
698
+ nh = CollinsTntInterpreter.head_terminal(node)
699
+ unless nh
700
+ return nil
701
+ end
702
+ headlemma = CollinsTntInterpreter.lemma_backoff(nh)
703
+
704
+ nonhead_children = node.children().reject { |n|
705
+ nnh = CollinsTntInterpreter.head_terminal(n)
706
+ not(nnh) or
707
+ CollinsTntInterpreter.lemma_backoff(nnh) == headlemma
708
+ }
709
+ if nonhead_children.length() == 1
710
+ return nonhead_children.first
711
+ end
712
+
713
+ # more than one child:
714
+ # for SBAR and VP take child with head POS starting in VB,
715
+ # for PP child with head POS starting in NN
716
+ case this_pt
717
+ when "SBAR", "VP"
718
+ icont_child = nonhead_children.detect { |n|
719
+ h = CollinsTntInterpreter.head_terminal(n)
720
+ h and h.part_of_speech() =~ /^VB/
721
+ }
722
+ when "PP"
723
+ icont_child = nonhead_children.detect { |n|
724
+ h = CollinsTntInterpreter.head_terminal(n)
725
+ h and h.part_of_speech() =~ /^NN/
726
+ }
727
+ else
728
+ raise "Shouldn't be here"
729
+ end
730
+
731
+ if icont_child
732
+ return icont_child
733
+ else
734
+ return nonhead_children.first
735
+ end
736
+ end
737
+
738
+
739
+
740
+
741
+ ########
742
+ # prune?
743
+ # given a target node t and another node n of the syntactic structure,
744
+ # decide whether n is likely to instantiate a semantic role
745
+ # of t. If not, recommend n for pruning.
746
+ #
747
+ # This method implements a slight variant of Xue and Palmer (EMNLP 2004).
748
+ # Pruning according to Xue & Palmer, EMNLP 2004:
749
+ # "Step 1: Designate the predicate as the current node and
750
+ # collect its sisters (constituents attached at the same level
751
+ # as the predicate) unless its sisters are coordinated with the
752
+ # predicate. If a sister is a PP, also collect its immediate
753
+ # children.
754
+ # Step 2: Reset the current node to its parent and repeat Step 1
755
+ # till it reaches the top level node.
756
+ #
757
+ # Modifications made here:
758
+ # - paths of length 0 accepted in any case
759
+ #
760
+ # returns: 0 to recommend n for pruning, else 1 (keep)
761
+ def CollinsTntInterpreter.prune?(node, # SynNode
762
+ paths_to_target, # hash: node ID -> Path object: paths from target to node
763
+ terminal_index) # hash: terminal node -> word index in sentence
764
+
765
+ path_to_target = paths_to_target[node.id()]
766
+
767
+ if not path_to_target
768
+ # no path from target to node: suggest for pruning
769
+
770
+ return 0
771
+
772
+ elsif path_to_target.length == 0
773
+ # target may be its own role: definite accept
774
+
775
+ return 1
776
+
777
+ else
778
+ # consider path from target to node.
779
+ # (1) If the path to the current node includes at least one Up
780
+ # and exactly one Down, keep.
781
+ # (2) Else, if the path includes at least one Up and exactly two Down,
782
+ # and the current node's parent is a PP, keep
783
+ # (3) else discard
784
+
785
+ # count number of up and down steps in path to target
786
+ num_up = 0
787
+ num_down = 0
788
+ path_to_target.each_step { |direction, edgelabel, nodelabel, endnode|
789
+ case direction
790
+ when /U/
791
+ num_up += 1
792
+ when /D/
793
+ num_down += 1
794
+ end
795
+ }
796
+
797
+ # coordination sister between node and target?
798
+ conj_sister_between = CollinsTntInterpreter.conj_sister_between?(node, paths_to_target,
799
+ terminal_index)
800
+
801
+
802
+ if conj_sister_between
803
+ # coordination between me and the target -- drop
804
+ return 0
805
+
806
+ elsif num_up >= 1 and num_down == 1
807
+ # case (1)
808
+ return 1
809
+
810
+ elsif num_up >= 1 and num_down == 2 and
811
+ (p = node.parent()) and CollinsTntInterpreter.category(p) == "prep"
812
+
813
+ # case (2)
814
+ return 1
815
+
816
+ else
817
+ # case (3)
818
+ return 0
819
+ end
820
+ end
821
+ end
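A standalone sketch of the Up/Down counting used above; it works on a plain printed path string rather than the gem's Path object, and the path is invented:

    path_tokens = "U VP U S D NP".split
    num_up      = path_tokens.count("U")    # => 2
    num_down    = path_tokens.count("D")    # => 1
    # at least one Up and exactly one Down => case (1) above: keep the node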
822
+
823
+
824
+ ###
825
+ private
826
+
827
+
828
+ ###
829
+ # given an anchor node and another node that may be some
830
+ # grammatical function of the anchor node:
831
+ # return the grammatical function (string) if found,
832
+ # else nil.
833
+ #
834
+ # here: anchor node is verb.
835
+ def CollinsTntInterpreter.gf_verb(anchor_node, # SynNode
836
+ gf_node) # SynNode
837
+
838
+ # first classification: according to constituent type
839
+ cat = CollinsTntInterpreter.category(gf_node)
840
+ if cat.nil?
841
+ return nil
842
+ end
843
+
844
+ # second classification: according to path
845
+ path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
846
+ if path.nil?
847
+ # no path between anchor node and gf node
848
+ return nil
849
+ end
850
+
851
+ path.set_cutoff_last_pt_on_printing(true)
852
+ path_string = path.print(true,false,true)
853
+
854
+ case path_string
855
+ when "U VP D ", "U SG D "
856
+ categ2 = "inside"
857
+ when /^U (VP U )*S(BAR)? D $/
858
+ categ2 = "external"
859
+ when /^U (VP U )*VP D ADVP D $/
860
+ categ2 = "external"
861
+ else
862
+ categ2 = ""
863
+ end
864
+
865
+ # now evaluate based on both
866
+ case cat+ "+" + categ2
867
+ when "noun+inside"
868
+ # direct object
869
+ return "OA"
870
+
871
+ when "noun+external"
872
+ unless CollinsTntInterpreter.relative_position(gf_node, anchor_node) == "LEFT"
873
+ return nil
874
+ end
875
+
876
+ if CollinsTntInterpreter.voice(anchor_node) == "passive"
877
+ return "OA"
878
+ else
879
+ return "SB"
880
+ end
881
+
882
+ when "prep+inside"
883
+ if CollinsTntInterpreter.voice(anchor_node) == "passive" and
884
+ CollinsTntInterpreter.preposition(gf_node) == "by"
885
+ return "SB"
886
+ else
887
+ return "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
888
+ end
889
+
890
+ when "sent+inside"
891
+ return "OC"
892
+
893
+ when "sent+external"
894
+ return "OC"
895
+
896
+ else
897
+ return nil
898
+ end
899
+ end
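For example (configuration invented): an NP that is a sister of the verb inside its VP has the printed path "U VP D "; this yields categ2 = "inside", the combination "noun+inside" above, and thus the grammatical function "OA" (direct object).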
900
+
901
+ ###
902
+ # given an anchor node and another node that may be some
903
+ # grammatical function of the anchor node:
904
+ # return the grammatical function (string) if found,
905
+ # else nil.
906
+ #
907
+ # here: anchor node is noun.
908
+ def CollinsTntInterpreter.gf_noun(anchor_node, # SynNode
909
+ gf_node) # SynNode
910
+
911
+ # first classification: according to constituent type
912
+ cat = CollinsTntInterpreter.category(gf_node)
913
+ if cat.nil?
914
+ return nil
915
+ end
916
+
917
+ # second classification: according to path
918
+ path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
919
+ if path.nil?
920
+ # no path between anchor node and gf node
921
+ return nil
922
+ end
923
+
924
+ path.set_cutoff_last_pt_on_printing(true)
925
+ path_string = path.print(true,false,true)
926
+
927
+ case path_string
928
+ when "U NPB D "
929
+ categ2 = "np-neighbor"
930
+ when "U NPB U NP D "
931
+ categ2 = "np-parent"
932
+ when "U NP D "
933
+ categ2 = "np-a"
934
+ when /^U NPB (U NP )?(U NP )?U S(BAR)? D( VP D)? $/
935
+ categ2 = "beyond-s"
936
+ when /^U NP(B)? (U NP )?U VP D $/
937
+ categ2 = "beyond-vp"
938
+ when /^U NPB (U NP )?(U NP)?U PP U VP(-A)? D $/
939
+ categ2 = "beyond-pp-vp"
940
+ else
941
+ categ2 = ""
942
+ end
943
+
944
+ # now evaluate based on both
945
+ case cat + "+" + categ2
946
+ when "noun+np-neighbor"
947
+ return "AG"
948
+
949
+ when "sent+np-parent"
950
+ return "OC"
951
+
952
+ when "prep+np-parent", "prep+np-a"
953
+ return "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
954
+ # relation of anchor noun to governing verb not covered by "gfs" method
955
+ # when "verb+beyond-s"
956
+ # return "SB-of"
957
+
958
+ # when "verb+beyond-vp"
959
+ # return "OA-of"
960
+
961
+ # when "verb+beyond-pp-vp"
962
+ # return "MO-of"
963
+ else
964
+ return nil
965
+ end
966
+ end
967
+
968
+
969
+ ###
970
+ # given an anchor node and another node that may be some
971
+ # grammatical function of the anchor node:
972
+ # return the grammatical function (string) if found,
973
+ # else nil.
974
+ #
975
+ # here: anchor node is adjective.
976
+ def CollinsTntInterpreter.gf_adj(anchor_node, # SynNode
977
+ gf_node) # SynNode
978
+
979
+ # first classification: according to constituent type
980
+ cat = CollinsTntInterpreter.category(gf_node)
981
+ if cat.nil?
982
+ return nil
983
+ end
984
+
985
+ # second classification: according to path
986
+ path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
987
+ if path.nil?
988
+ # no path between anchor node and gf node
989
+ return nil
990
+ end
991
+
992
+ path.set_cutoff_last_pt_on_printing(true)
993
+ path_string = path.print(true,false,true)
994
+
995
+ case path_string
996
+ when /^(U ADJP )?U NPB D $/
997
+ categ2 = "nnpath"
998
+ when "U ADJP D "
999
+ categ2 = "adjp-neighbor"
1000
+ when /^(U ADJP )?U (VP U )?S(BAR)? D $/
1001
+ categ2 = "s"
1002
+ when /^U (ADJP U )?VP D $/
1003
+ categ2 = "vp"
1004
+ else
1005
+ categ2 = ""
1006
+ end
1007
+
1008
+ # now evaluate based on both
1009
+ case cat + "+" + categ2
1010
+ when "noun+nnpath"
1011
+ return "HD"
1012
+ when "verb+adjp-neighbor"
1013
+ return "OC"
1014
+ when "prep+vp", "prep+adjp-neighbor"
1015
+ return "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
1016
+ else
1017
+ return nil
1018
+ end
1019
+ end
1020
+
1021
+ ####
1022
+ # auxiliary of prune?:
1023
+ #
1024
+ # given a node and a hash mapping node IDs to paths to target:
1025
+ # Does that node have a sister that is a coordination and that
1026
+ # is between it and the target?
1027
+ #
1028
+ def CollinsTntInterpreter.conj_sister_between?(node, # SynNode
1029
+ paths_to_target, # Hash: node ID -> Path obj: path from node to target
1030
+ ti) # hash: terminal node -> word index in sentence
1031
+
1032
+ # does node have sisters that represent coordination?
1033
+ unless (p = node.parent())
1034
+ return false
1035
+ end
1036
+
1037
+ unless (conj_sisters = p.children.select { |sib|
1038
+ sib != node and CollinsTntInterpreter.category(sib) == "con"
1039
+ } ) and
1040
+ not (conj_sisters.empty?)
1041
+ return false
1042
+ end
1043
+
1044
+ # represent each coordination sister, and the node itself,
1045
+ # as a triple [node, leftmost terminal index(node), rightmost terminal index(node)]
1046
+ conj_sisters = conj_sisters.map { |n|
1047
+ [n, CollinsTntInterpreter.lti(n, ti), CollinsTntInterpreter.rti(n, ti)]
1048
+ }
1049
+
1050
+ this_triple = [node, CollinsTntInterpreter.lti(node, ti), CollinsTntInterpreter.rti(node, ti)]
1051
+
1052
+ # sisters closer to the target than node:
1053
+ # also map to triples
1054
+ sisters_closer_to_target = p.children.select { |sib|
1055
+ sib != node and
1056
+ not(conj_sisters.include? sib) and
1057
+ paths_to_target[sib.id()] and
1058
+ paths_to_target[sib.id()].length() < paths_to_target[node.id()].length
1059
+ }.map { |n|
1060
+ [n, CollinsTntInterpreter.lti(n, ti), CollinsTntInterpreter.rti(n, ti)]
1061
+ }
1062
+
1063
+ if sisters_closer_to_target.empty?
1064
+ return false
1065
+ end
1066
+
1067
+ # is there any coordination sister that is inbetween this node
1068
+ # and some sister that is closer to the target?
1069
+ # if so, return true
1070
+ conj_sisters.each { |conj_triple|
1071
+ if leftof(conj_triple, this_triple) and
1072
+ sisters_closer_to_target.detect { |s| CollinsTntInterpreter.leftof(s, conj_triple) }
1073
+
1074
+ return true
1075
+
1076
+ elsif rightof(conj_triple, this_triple) and
1077
+ sisters_closer_to_target.detect { |s| CollinsTntInterpreter.rightof(s, conj_triple) }
1078
+
1079
+ return true
1080
+ end
1081
+ }
1082
+
1083
+ # else return false
1084
+ return false
1085
+ end
1086
+
1087
+ ###
1088
+ # lti, rti: terminal index of the leftmost/rightmost terminal of
1089
+ # a given node (SynNode)
1090
+ #
1091
+ # auxiliary of conj_sister_between?
1092
+ def CollinsTntInterpreter.lti(node, # SynNode
1093
+ terminal_index) # hash: terminal node -> word index in sentence
1094
+ lt = CollinsTntInterpreter.leftmost_terminal(node)
1095
+ unless lt
1096
+ return nil
1097
+ end
1098
+
1099
+ return terminal_index[lt]
1100
+ end
1101
+
1102
+ def CollinsTntInterpreter.rti(node, # SynNode
1103
+ terminal_index) # hash: terminal node -> word index in sentence
1104
+ rt = CollinsTntInterpreter.rightmost_terminal(node)
1105
+ unless rt
1106
+ return nil
1107
+ end
1108
+
1109
+ return terminal_index[rt]
1110
+ end
1111
+
1112
+ ###
1113
+ # leftof, rightof: given 2 triples
1114
+ # [node(SynNode), index of leftmost terminal(integer/nil), index of rightmost terminal(integer/nil)]
1115
+ #
1116
+ # auxiliaries of conj_sister_between?
1117
+ #
1118
+ # return true if both leftmost and rightmost terminal indices of the first triple are
1119
+ # smaller than (for leftof) / bigger than (for rightof) the
1120
+ # corresponding indices of the second triple
1121
+ #
1122
+ # return false if some index is nil
1123
+ def CollinsTntInterpreter.leftof(triple1,
1124
+ triple2)
1125
+ dummy, lm1, rm1 = triple1
1126
+ dummy, lm2, rm2 = triple2
1127
+
1128
+ if lm1.nil? or rm1.nil? or lm2.nil? or rm2.nil?
1129
+ return false
1130
+ elsif lm1 < lm2 and rm1 < rm2
1131
+ return true
1132
+ else
1133
+ return false
1134
+ end
1135
+ end
1136
+
1137
+ def CollinsTntInterpreter.rightof(triple1,
1138
+ triple2)
1139
+ dummy, lm1, rm1 = triple1
1140
+ dummy, lm2, rm2 = triple2
1141
+
1142
+ if lm1.nil? or rm1.nil? or lm2.nil? or rm2.nil?
1143
+ return false
1144
+ elsif lm1 > lm2 and rm1 > rm2
1145
+ return true
1146
+ else
1147
+ return false
1148
+ end
1149
+ end
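Illustrative only (nodes and indices invented): each triple is [node, leftmost terminal index, rightmost terminal index], and the node element is never inspected by these helpers:

    a = [:node_a, 2, 4]                      # covers word indices 2..4
    b = [:node_b, 5, 7]                      # covers word indices 5..7
    CollinsTntInterpreter.leftof(a, b)       # => true   (2 < 5 and 4 < 7)
    CollinsTntInterpreter.rightof(a, b)      # => false
    # any nil index makes both methods return false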
1150
+ end
1151
+
1152
+
1153
+ # use TreeTagger as replacement for TnT; re-use everything, but use treetagger as POS tagger
1154
+
1155
+ class CollinsTreeTaggerInterpreter < CollinsTntInterpreter
1156
+ CollinsTreeTaggerInterpreter.announce_me()
1157
+
1158
+ def CollinsTreeTaggerInterpreter.systems()
1159
+ return {
1160
+ "pos_tagger" => "treetagger",
1161
+ "parser" => "collins"
1162
+ }
1163
+ end
1164
+ end
1165
+