shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
data/bin/fred DELETED
@@ -1,16 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: utf-8 -*-
3
-
4
- # @author Andrei Beliankou, 2011-11-13
5
- # @author Katrin Erk, April 05
6
- #
7
- # Frame disambiguation system:
8
- # frame assignment as word sense disambiguation
9
-
10
- require 'fred/opt_parser'
11
- require 'fred/fred'
12
-
13
- options = Fred::OptParser.parse(ARGV)
14
-
15
- fred = Fred::Fred.new(options)
16
- fred.assign
data/bin/frprep DELETED
@@ -1,34 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: utf-8 -*-
3
-
4
- # AB, 2010-11-25
5
-
6
- # frprep
7
- # Katrin Erk July 05
8
- #
9
- # Preprocessing for Fred and Rosy:
10
- # accept input as plain text,
11
- # FrameNet XML, Salsa-tabular format,
12
- # or SalsaTigerXML,
13
- # lemmatize, POS-tag and parse
14
- # (if asked to do so)
15
- # and in any case produce output in
16
- # SalsaTigerXML.
17
- #
18
- # Extensions to SalsaTigerXML introduced by frprep:
19
- #
20
- # - "lemma": lemma. Attribute of terminals.
21
- # - "head": head word (not lemma!) of constituent.Attribute of nonterminals.
22
- # - "fn_gf": FrameNet grammatical function label, attached to the maximal
23
- # constituents covering the terminals labeled with that label
24
-
25
-
26
- require 'frprep/frprep'
27
- require 'frprep/opt_parser'
28
-
29
-
30
- options = FrPrep::OptParser.parse(ARGV)
31
-
32
-
33
- preprocessor = FrPrep::FrPrep.new(options)
34
- preprocessor.transform
data/bin/rosy DELETED
@@ -1,17 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: utf-8 -*-
3
-
4
- # AB: 2011-11-14
5
- # rosy.rb
6
- # KE, SP April 05
7
- #
8
- # Main file of the Rosy role assignment system.
9
-
10
-
11
- require 'rosy/opt_parser'
12
- require 'rosy/rosy'
13
-
14
- options = Rosy::OptParser.parse(ARGV)
15
-
16
- rosy = Rosy::Rosy.new(options)
17
- rosy.assign
@@ -1,1229 +0,0 @@
1
- # Katrin Erk Oct/Nov 05
2
- #
3
- # Abstract classes for interfaces for systems that provide syntactic
4
- # analysis.
5
- #
6
- # There are two types of interfaces to syntactic analysis systems:
7
- # - interfaces:
8
- # offer methods for syntactic analysis.
9
- #
10
- # SynInterfaceTab:
11
- # input and output format is (FN)TabFormat.
12
- # SynInterfaceSTXML:
13
- # input format is TabFormat, output format is
14
- # Salsa/Tiger XML, also provided as
15
- # SalsaTigerSentence objects
16
- #
17
- # - interpreters:
18
- # interpret the resulting Salsa/Tiger XML (represented as
19
- # SalsaTigerSentence and SynNode objects), e.g.
20
- # generalize over part of speech;
21
- # describe the path between a pair of nodes both as a path
22
- # and (potentially) as a grammatical function of one of the nodes;
23
- # determine whether a node describes a verb, and in which voice;
24
- # determine the head of a constituent
25
-
26
- require "tempfile"
27
-
28
- require 'common/ruby_class_extensions'
29
-
30
- require 'common/ISO-8859-1'
31
- require 'common/Parser'
32
- require "common/SalsaTigerRegXML"
33
- require "common/TabFormat"
34
-
35
- #############################
36
- # abstract class, to be inherited:
37
- #
38
- # tabular format or SalsaTigerXML interface for modules
39
- # offering POS tagging, lemmatization, parsing etc.
40
- class SynInterface
41
-
42
- ###
43
- # returns a string: the name of the system
44
- # e.g. "Collins" or "TNT"
45
- def self.system
46
- raise "Overwrite me"
47
- end
48
-
49
- ###
50
- # returns a string: the service offered
51
- # one of "lemmatizer", "parser", "pos tagger"
52
- def self.service
53
- raise "Overwrite me"
54
- end
55
-
56
- ###
57
- # initialize to set values for all subsequent processing
58
- def initialize(program_path, # string: path to system
59
- insuffix, # string: suffix of input files
60
- outsuffix, # string: suffix for processed files
61
- var_hash = {}) # optional arguments in a hash
62
-
63
- @program_path = program_path
64
- @insuffix = insuffix
65
- @outsuffix = outsuffix
66
- end
67
-
68
- ###
69
- # process each file in in_dir with matching suffix,
70
- # producing a file in out_dir with same name but the suffix replaced
71
- #
72
- # returns: nothing
73
- def process_dir(in_dir, # string: name of input directory
74
- out_dir) # string: name of output directory
75
-
76
- Dir["#{in_dir}*#{@insuffix}"].each do |infilename|
77
- outfilename = "#{out_dir}#{File.basename(infilename, @insuffix)}#{@outsuffix}"
78
- process_file(infilename, outfilename)
79
- end
80
- end
81
-
82
- ###
83
- # process one file, writing the result to outfilename
84
- #
85
- # returns: nothing
86
- def process_file(infilename, # string: name of input file
87
- outfilename)
88
- raise "Overwrite me"
89
- end
90
-
91
- ######
92
- protected
93
-
94
- def self.announce_me
95
- if defined?(SynInterfaces)
96
- # yup, we have a class to which we can announce ourselves
97
- SynInterfaces.add_interface(eval(self.name))
98
- else
99
- # no interface collector class
100
- STDERR.puts "Interface #{self.name} not announced: no SynInterfaces."
101
- end
102
- end
103
- end
104
-
105
- #############################
106
- # abstract class, to be inherited:
107
- #
108
- # SalsaTigerXML interface for modules
109
- # offering parsing etc.
110
- #
111
- # The input format for these classes is TabFormat or FNTabFormat
112
- class SynInterfaceSTXML < SynInterface
113
- ###
114
- # initialize to set values for all subsequent processing
115
- def initialize(program_path, # string: path to system
116
- insuffix, # string: suffix of input files
117
- outsuffix, # string: suffix for processed files
118
- stsuffix, # string: suffix for Salsa/Tiger XML files
119
- var_hash = {}) # optional arguments in a hash
120
- super(program_path, insuffix, outsuffix, var_hash)
121
- @stsuffix = stsuffix
122
- end
123
-
124
- def to_stxml_dir(in_dir, # string: name of dir with parse files
125
- out_dir) # string: name of output dir
126
-
127
- Dir["#{in_dir}*#{@outsuffix}"].each do |parsefilename|
128
- stxmlfilename = "#{out_dir}#{File.basename(parsefilename, @outsuffix)}#{@stsuffix}"
129
- to_stxml_file(parsefilename, stxmlfilename)
130
- end
131
- end
132
-
133
- def to_stxml_file(infilename, outfilename)
134
- raise "Overwrite me"
135
- end
136
-
137
- ###
138
- # standard mapping:
139
- #
140
- # to be used as the mapping from tab sentence words to
141
- # SalsaTigerSentence nodes returned by each_sentence():
142
- # map the n-th word of the tab sentence to the n-th terminal of
143
- # the SalsaTigerSentence
144
- def self.standard_mapping(sent, tabsent)
145
- retv = {}
146
-
147
- if sent.nil?
148
- retv = nil
149
- else
150
- terminals = sent.terminals_sorted
151
- if tabsent
152
- tabsent.each_line_parsed do |l|
153
- if (t = terminals[l.get("lineno")])
154
- retv[l.get("lineno")] = [t]
155
- else
156
- retv[l.get("lineno")] = []
157
- end
158
- end
159
- end
160
- end
161
-
162
- retv
163
- end
164
-
165
-
166
- ###
167
- # for a given processed file:
168
- # yield each sentence as a tuple
169
- # [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
170
- # of
171
- # - the sentence in SalsaTigerXML,
172
- # - the matching tab format sentence
173
- # - a mapping of terminals:
174
- # hash: line in tab sentence(integer) -> array:SynNode
175
- # mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
176
- #
177
- # default version: write Salsa/Tiger XML to tempfile, read back in
178
- # and assume that each sentence in the tab file has a correspondent
179
- # in the processed file (may not hold e.g. if the parser leaves out
180
- # sentences it cannot process)
181
- def each_sentence(infilename, # string: name of processed file
182
- tab_dir = nil) # string: name of dir with input files
183
- # (set either here or on initialization)
184
- if tab_dir
185
- @tab_dir = tab_dir
186
- end
187
-
188
- # write Salsa/Tiger XML to tempfile
189
- tf = Tempfile.new("SynInterface")
190
- tf.close
191
- to_stxml_file(infilename, tf.path)
192
- tf.flush
193
-
194
- # get matching tab file, read
195
- tab_reader = get_tab_reader(infilename)
196
- tab_sentences = []
197
- tab_reader.each_sentence { |s| tab_sentences << s }
198
-
199
- # read Salsa/Tiger sentences and yield them
200
- reader = FilePartsParser.new(tf.path)
201
- sent_index = 0
202
- reader.scan_s { |sent_string|
203
- yield [
204
- SalsaTigerSentence.new(sent_string, tab_sentences[sent_index]),
205
- tab_sentences[sent_index],
206
- SynInterfaceSTXML.standard_mapping(sent, tab_sentences[sent_index])
207
- ]
208
- sent_index += 1
209
- }
210
-
211
- # remove tempfile
212
- tf.close(true)
213
- end
214
-
215
- #####################
216
- protected
217
-
218
-
219
- ###
220
- # get tab format file for a given processed file
221
- def get_tab_reader(infilename) # string: name of processed file
222
- # find matching non-processed file for processed file
223
- # assumption: directory with non-processed files
224
- # has been set as @tab_dir
225
-
226
- # sanity checks
227
- unless @tab_dir
228
- raise "Need to set tab directory"
229
- end
230
-
231
- # get matching tab file for this parser output file
232
- tabfilename = @tab_dir+File.basename(infilename, @outsuffix)+ @insuffix
233
- return FNTabFormatFile.new(tabfilename)
234
- end
235
-
236
-
237
- ###
238
- # provide a XML representation for a sentence that couldn't be analyzed
239
- # assuming a flat structure of all terminals, adding a virtual top node
240
- def SynInterfaceSTXML.failed_sentence(tab_sent,sentid)
241
-
242
- sent_obj = SalsaTigerSentence.empty_sentence(sentid.to_s)
243
-
244
- sent_obj.set_attribute("failed","true")
245
-
246
- topnode = sent_obj.add_syn("nt",
247
- "NONE", # cat
248
- nil, # word (doesn't matter)
249
- nil, # pos (doesn't matter)
250
- "500") # nonterminal counter
251
-
252
- t_counter = 0
253
-
254
- tab_sent.each_line_parsed {|line|
255
- t_counter += 1
256
- word = line.get("word")
257
- pos = line.get("pos")
258
- node = sent_obj.add_syn("t",
259
- nil, # cat (doesn't matter here)
260
- SalsaTigerXMLHelper.escape(word), # word
261
- pos, # pos
262
- t_counter.to_s)
263
- topnode.add_child(node,nil)
264
- node.add_parent(topnode, nil)
265
- }
266
- return sent_obj
267
- end
268
- end
269
-
270
- #############################
271
- # abstract class, to be inherited:
272
- #
273
- # tabular format interface for modules
274
- # offering POS tagging, lemmatization etc.
275
- class SynInterfaceTab < SynInterface
276
-
277
- ##########
278
- protected
279
-
280
- # fntab_words_for_file:
281
- # given a file in tab format, columns as in FNTabFormat,
282
- # get the "word" entries and write them to a given file,
283
- # one word per line, as input for processing
284
- def SynInterfaceTab.fntab_words_to_file(infilename, # string: name of input file
285
- outfile, # stream: output file
286
- sent_marker = "", # string: mark end of sentence how?
287
- iso = nil) # non-nil: assume utf-8, transform to iso-8859-1
288
- corpusfile = FNTabFormatFile.new(infilename)
289
- corpusfile.each_sentence {|s|
290
- s.each_line_parsed {|line_obj|
291
- if iso
292
- outfile.puts UtfIso.to_iso_8859_1(line_obj.get("word"))
293
- else
294
- outfile.puts line_obj.get("word")
295
- end
296
- }
297
- outfile.puts sent_marker
298
- }
299
- end
300
- end
301
-
302
- #############################
303
- # class describing a path between two nodes
304
- #
305
- # provides access and output facilities for different aspects of the path
306
- #
307
- # this is the return value of SynInterpreter.path_between()
308
- class Path
309
- attr_reader :startnode
310
-
311
- ###
312
- # initialize to empty path
313
- def initialize(startnode)
314
- @path = Array.new
315
- @cutoff_last_pt = false
316
- set_startnode(startnode)
317
- end
318
-
319
- ###
320
- # deep_clone:
321
- # return clone of this path object,
322
- # with clone of this path rather than the same path
323
- def deep_clone()
324
- new_path = self.clone()
325
- new_path.set_path(@path.clone())
326
-
327
- return new_path
328
- end
329
-
330
- ###
331
- def set_startnode(startnode)
332
- @startnode = startnode
333
-
334
- return self
335
- end
336
-
337
- ###
338
- # iterate through the current path
339
- #
340
- # yield tuples
341
- # [direction, edgelabel, nodelabel, endnode]
342
- # direction: string, U/D
343
- # edgelabel: string
344
- # nodelabel: string
345
- # endnode: SynNode
346
- def each_step()
347
- @path.each { |step|
348
- yield step
349
- }
350
- end
351
-
352
- ###
353
- # empty?
354
- # any steps in here?
355
- def empty?
356
- return @path.empty?
357
- end
358
-
359
- ###
360
- # add one step to the beginning of the current path
361
- def add_first_step(start_node,#SynNode
362
- direction, # string: U, D
363
- gf, # string: edge label
364
- pt)
365
- @path.prepend([direction, gf, pt, @startnode])
366
- set_startnode(start_node)
367
-
368
- return self
369
- end
370
-
371
-
372
- ###
373
- # add one step to the end of the current path
374
- def add_last_step(direction, # string: U, D
375
- gf, # string: edge label
376
- pt, # string: node label (of end_node)
377
- end_node) # SynNode
378
- @path << [direction, gf, pt, end_node]
379
-
380
- return self
381
- end
382
-
383
- ###
384
- # path length
385
- def length()
386
- return @path.length()
387
- end
388
-
389
- ###
390
- #
391
- def print(print_direction, # boolean. true: print direction
392
- print_gf, # boolean. true: print edgelabel
393
- print_pt) # boolean. true: print nodelabel
394
-
395
- return print_aux(@path, print_direction, print_gf, print_pt)
396
- end
397
-
398
- ###
399
- # print path from roof node to end
400
- def print_downpart(print_direction,
401
- print_gf,
402
- print_pt)
403
-
404
- roof, roof_index = compute_roof()
405
- if roof.nil? or @path.empty?
406
- # no roof set
407
- return ""
408
-
409
- else
410
- # roof node is in the middle
411
- return print_aux(@path[roof_index..-1],
412
- print_direction, print_gf, print_pt)
413
- end
414
- end
415
-
416
- ###
417
- def lca()
418
- return compute_roof().first
419
- end
420
-
421
- ###
422
- # cut off last node label in print() and print_downpart()?
423
- def set_cutoff_last_pt_on_printing(bool) # Boolean
424
- @cutoff_last_pt = bool
425
- end
426
-
427
- ########
428
- protected
429
-
430
- def set_path(new_path)
431
- @path = new_path
432
- end
433
-
434
-
435
- ########
436
- private
437
-
438
- ###
439
- # step through the path as long as direction is up.
440
- # when direction starts to go "D", take current node as roof node
441
- #
442
- # returns: pair [roof node, roof node index] (SynNode, integer)
443
- def compute_roof()
444
- node = @startnode
445
- index = 0
446
-
447
- each_step { |direction, edgelabel, nodelabel, endnode|
448
- if direction =~ /D/
449
- # down! the previous node was roof
450
- return [node, index]
451
- else
452
- node = endnode
453
- index += 1
454
- end
455
- }
456
-
457
- # last node is roof
458
- return [node, index]
459
-
460
- end
461
-
462
- ###
463
- def print_aux(path,
464
- print_direction,
465
- print_gf,
466
- print_pt)
467
- retv = ""
468
- path.each { |step|
469
- direction, gf, pt, node = step.map { |entry|
470
- if entry.nil?
471
- "-"
472
- else
473
- entry
474
- end
475
- }
476
- if print_direction
477
- retv << direction + " "
478
- end
479
- if print_gf
480
- retv << gf + " "
481
- end
482
- if print_pt
483
- retv << pt + " "
484
- end
485
- }
486
-
487
- if @cutoff_last_pt and print_pt and
488
- retv =~ /^(.+ )\w+ $/
489
- return $1
490
- else
491
- return retv
492
- end
493
- end
494
-
495
- end
496
-
497
-
498
- #############################
499
- # abstract class, to be inherited:
500
- #
501
- # interpretation for a POS tagger/lemmatizer/parser combination
502
- class SynInterpreter
503
-
504
- ###
505
- # systems interpreted by this class:
506
- # returns a hash service(string) -> system name (string),
507
- # e.g.
508
- # { "parser" => "collins", "lemmatizer" => "treetagger" }
509
- def SynInterpreter.systems()
510
- raise "Overwrite me"
511
- end
512
-
513
- ###
514
- # names of additional systems that may be interpreted by this class
515
- # returns a hash service(string) -> system name(string)
516
- # same as names()
517
- def SynInterpreter.optional_systems()
518
- raise "Overwrite me"
519
- end
520
-
521
- ###
522
- # generalize over POS tags.
523
- #
524
- # returns one of:
525
- #
526
- # adj: adjective (phrase)
527
- # adv: adverb (phrase)
528
- # card: numbers, quantity phrases
529
- # con: conjunction
530
- # det: determiner, including possessive/demonstrative pronouns etc.
531
- # for: foreign material
532
- # noun: noun (phrase), including personal pronouns, proper names, expletives
533
- # part: particles, truncated words (German compound parts)
534
- # prep: preposition (phrase)
535
- # pun: punctuation, brackets, etc.
536
- # sent: sentence
537
- # top: top node of a sentence
538
- # verb: verb (phrase)
539
- # nil: something went wrong
540
- #
541
- # default: return phrase type as is
542
- #
543
- # returns: string or nil
544
- def SynInterpreter.category(node) # SynNode
545
- unless node.kind_of? SynNode
546
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
547
- return nil
548
- end
549
-
550
- return eval(self.name()).pt(node)
551
- end
552
-
553
- ###
554
- # is relative pronoun?
555
- #
556
- # default: false
557
- def SynInterpreter.relative_pronoun?(node) # SynNode
558
- return false
559
- end
560
-
561
- ###
562
- # lemma_backoff:
563
- #
564
- # if we have lemma information, return that,
565
- # and failing that, return the word
566
- #
567
- # returns: string or nil
568
- def SynInterpreter.lemma_backoff(node)
569
- unless node.kind_of? SynNode
570
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
571
- return nil
572
- end
573
-
574
- lemma = node.get_attribute("lemma")
575
- if (lemma.nil? or lemma =~ /unknown/) and
576
- node.is_terminal?
577
- return node.word()
578
- else
579
- return lemma
580
- end
581
- end
582
-
583
- ###
584
- # phrase type:
585
- # constituent label for nonterminals,
586
- # part of speech for terminals
587
- #
588
- # returns: string
589
- def SynInterpreter.pt(node)
590
- unless node.kind_of? SynNode
591
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
592
- return nil
593
- end
594
-
595
- if node.is_terminal?
596
- return node.part_of_speech
597
- else
598
- return node.category
599
- end
600
- end
601
-
602
- ###
603
- # simplified phrase type:
604
- # like phrase type, but may simplify
605
- # the constituent label
606
- # default: just the same as pt()
607
- #
608
- # returns: string or nil
609
- def SynInterpreter.simplified_pt(node)
610
- return eval(self.name()).pt(node)
611
- end
612
-
613
- ###
614
- # particle_of_verb:
615
- #
616
- # given a node and a nodelist,
617
- # if the node represents a verb:
618
- # see if the verb has a particle among the nodes in nodelist
619
- # if so, return it
620
- # default: no recognition of separate particles
621
- #
622
- # returns: SynNode object if successful, else nil
623
- def SynInterpreter.particle_of_verb(node,
624
- node_list)
625
- return nil
626
- end
627
-
628
- ###
629
- # auxiliary?
630
- #
631
- # returns true if the given node is an auxiliary
632
- # default: no recognition of auxiliaries
633
- #
634
- # returns: boolean
635
- def SynInterpreter.auxiliary?(node)
636
- return false
637
- end
638
-
639
- ###
640
- # modal?
641
- #
642
- # returns true if the given node is a modal verb
643
- # default: no recognition of modals
644
- #
645
- # returns: boolean
646
- def SynInterpreter.modal?(node)
647
- return false
648
- end
649
-
650
- ###
651
- # head_terminal
652
- #
653
- # given a constituent, return the terminal node
654
- # that describes its headword
655
- # default: a heuristic that assumes the existence of a 'head'
656
- # attribute on nodes:
657
- # find the first node in my yield corresponding to my head attribute..
658
- #
659
- # returns: a SynNode object if successful, else nil
660
- def SynInterpreter.head_terminal(node)
661
- unless node.kind_of? SynNode
662
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
663
- return nil
664
- end
665
-
666
- if node.is_terminal?
667
- return node
668
- end
669
-
670
- head = node.get_attribute("head")
671
- unless head
672
- return nil
673
- end
674
-
675
- return node.yield_nodes.detect { |t|
676
- t.get_attribute("word") == head
677
- }
678
- end
679
-
680
- ###
681
- # voice
682
- #
683
- # given a constituent, return
684
- # - "active"/"passive" if it is a verb
685
- # - nil, else
686
- #
687
- # default: treat all as active
688
- def SynInterpreter.voice(node)
689
- unless node.kind_of? SynNode
690
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
691
- return nil
692
- end
693
-
694
- if eval(self.name()).category(node) == "verb"
695
- return "active"
696
- else
697
- return nil
698
- end
699
- end
700
-
701
- ###
702
- # gfs
703
- #
704
- # grammatical functions of a constituent:
705
- #
706
- # returns: a list of pairs [relation(string), node(SynNode)]
707
- # where <node> stands in the relation <relation> to the parameter
708
- # that the method was called with
709
- #
710
- # default: children of this node, with edge labels as relations,
711
- # prepositions tacked on for pps
712
- def SynInterpreter.gfs(node, # SynNode
713
- sent) # SalsaTigerSentence
714
- unless node.kind_of? SynNode
715
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
716
- return nil
717
- end
718
-
719
- return node.children_with_edgelabel().map { |rel, gf_node|
720
-
721
- if eval(self.name()).category(gf_node) == "prep"
722
- [rel + "-" + eval(self.name()).preposition(gf_node).to_s, gf_node]
723
-
724
- else
725
- [rel, gf_node]
726
- end
727
- }
728
- end
729
-
730
- ###
731
- # informative_content_node
732
- #
733
- # for most constituents: the head
734
- # for a PP, the NP
735
- # for an SBAR, the VP
736
- # for a VP, the embedded VP
737
- #
738
- # Default: returns the first non-head child
739
- def SynInterpreter.informative_content_node(node)
740
- unless node.kind_of? SynNode
741
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
742
- return nil
743
- end
744
-
745
- headlemma = eval(self.name()).lemma_backoff(node)
746
-
747
- first_nonhead_child = node.children().detect { |n|
748
- nnh = eval(self.name()).head_terminal(n)
749
- nnh and
750
- eval(self.name()).lemma_backoff(nnh) != headlemma
751
- }
752
-
753
- return first_nonhead_child
754
- end
755
-
756
- #####################################
757
- # verbs(sent) sent is a sentence in SalsaTigerSentence format
758
- #
759
- # return a list of the nodes of full verbs in a given sentence:
760
- # it is a list of lists. An item in that list is
761
- # - either a pair [verb, svp]
762
- # of the node of a verb with separable prefix
763
- # and the node of its separate prefix
764
- # - or a singleton [verb]
765
- # of the node of a verb without separate prefix
766
- def SynInterpreter.verbs(sent)
767
-
768
- return sent.syn_nodes.select { |node|
769
- eval(self.name()).category(node) == "verb"
770
- }.map { |node|
771
- [node]
772
- }
773
- end
774
-
775
- ###
776
- # governing verbs
777
- #
778
- # returns a list of pairs [rel, verb_node]
779
- # such that the given node fills the grammatical function rel
780
- # for this verb_node
781
- # or an empty list if there is no such verb
782
- def SynInterpreter.governing_verbs(node,
783
- sent)
784
- unless node.kind_of? SynNode
785
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
786
- return nil
787
- end
788
-
789
- retv = Array.new
790
-
791
- # each verb of the sentence:
792
- eval(self.name()).verbs(sent).each { |verb_node, prefix_node|
793
- # each gf of this verb:
794
- eval(self.name()).gfs(verb_node, sent).each { |rel, other_node|
795
- # if it points to the given node, record
796
- if other_node == node or
797
- eval(self.name()).informative_content_node(other_node) == node
798
- retv << [rel, verb_node]
799
- break
800
- end
801
- }
802
- }
803
-
804
- return retv
805
- end
806
-
807
- ###
808
- # path_between
809
- #
810
- # construct path in syntactic structure between two nodes,
811
- # using
812
- # - node labels
813
- # - edge labels
814
- # - direction Up, Down
815
- #
816
- # use_nontree_edges: set to true to use coreference edges
817
- # and other non-tree edges returned by the parser
818
- # in path computation. (Will produce no change if the parser
819
- # does not produce any non-tree edges.)
820
- #
821
- # returns: Path object
822
- def SynInterpreter.path_between(from_node, # SynNode
823
- to_node, # SynNode
824
- use_nontree_edges = false) # boolean
825
-
826
- unless from_node.kind_of? SynNode and to_node.kind_of? SynNode
827
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
828
- return nil
829
- end
830
-
831
- path = eval(self.name()).search_up(from_node,to_node, nil)
832
-
833
- if path.nil?
834
- # no path found
835
- # STDERR.puts "Warning: no path found between #{to_node.id} and #{from_node.id}"
836
- end
837
-
838
- return path
839
- end
840
-
841
- ###
842
- # surrounding_nodes:
843
- #
844
- # construct paths in syntactic structure between a node and each of its neighbors
845
- # path construction as in path_between.
846
- # Neighbors: parent, child, plus potentially neighbors by nontree edges
847
- # use_nontree_edges: again, same as in path_between
848
- #
849
- # returns: list of pairs [neighbor(SynNode), path(Path)]
850
- def SynInterpreter.surrounding_nodes(node, # SynNode
851
- use_nontree_edges = false) # boolean
852
-
853
- unless node.kind_of? SynNode
854
- $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
855
- return nil
856
- end
857
-
858
- retv = Array.new
859
-
860
- # parent
861
- if (p = node.parent)
862
- retv << [
863
- p,
864
- Path.new(node).add_last_step("U", node.parent_label(),
865
- eval(self.name()).simplified_pt(p), p)
866
- ]
867
- end
868
-
869
- # children
870
- node.each_child_with_edgelabel { |label, c|
871
- retv << [
872
- c,
873
- Path.new(node).add_last_step("D", label,
874
- eval(self.name()).simplified_pt(c), c)
875
- ]
876
- }
877
-
878
- return retv
879
- end
880
-
881
- ###
882
- # relative_position
883
- # of a node with respect to an (anchor) node:
884
- # left, right, dom
885
def SynInterpreter.relative_position(node, # SynNode
                                     anchor_node) # SynNode
  # Classifies where node lies relative to anchor_node.
  #
  # returns: "LEFT", "RIGHT" or "DOM",
  #          or nil (after a warning on stderr) if either argument is not a SynNode
  # NOTE(review): the warning only reports node.class, even when it is
  # anchor_node that has the unexpected class.
  unless node.kind_of?(SynNode) && anchor_node.kind_of?(SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # climb from node up to the root of its tree
  root = node
  while (parent = root.parent)
    root = parent
  end

  # determine position of {leftmost, rightmost} terminal of
  # {node, anchor_node} in the list of all terminals below the root.
  # The terminal helpers are dispatched on self (the receiving class).
  all_yieldnodes = root.yield_nodes_ordered

  pos_nodefirst = all_yieldnodes.index(leftmost_terminal(node))
  pos_anchorfirst = all_yieldnodes.index(leftmost_terminal(anchor_node))
  pos_nodelast = all_yieldnodes.index(rightmost_terminal(node))
  pos_anchorlast = all_yieldnodes.index(rightmost_terminal(anchor_node))

  # determine relative position; a position is nil when the terminal was not
  # found in the root's yield, in which case that comparison is skipped
  if pos_nodefirst && pos_anchorfirst && pos_nodefirst < pos_anchorfirst
    "LEFT"
  elsif pos_nodelast && pos_anchorlast && pos_anchorlast < pos_nodelast
    "RIGHT"
  else
    "DOM"
  end
end
917
-
918
- ###
919
- # leftmost_terminal
920
- #
921
- # given a constituent, determine its leftmost terminal,
922
- # excluding punctuation
923
def SynInterpreter.leftmost_terminal(node)
  # Returns the first terminal in node's ordered yield whose category is not
  # "pun"; if every terminal is punctuation, returns the first terminal
  # (nil when the yield is empty).
  # The yield is computed once and category is dispatched on self, instead
  # of eval-ing the class name for every terminal inspected.
  # NOTE(review): assumes yield_nodes_ordered gives the same list on
  # repeated calls — confirm.
  terminals = node.yield_nodes_ordered
  terminals.find { |t| category(t) != "pun" } || terminals.first
end
930
-
931
- ###
932
- # rightmost_terminal
933
- #
934
- # given a constituent, determine its rightmost terminal,
935
- # excluding punctuation
936
def SynInterpreter.rightmost_terminal(node)
  # Returns the last terminal in node's ordered yield whose category is not
  # "pun"; if every terminal is punctuation, returns the last terminal
  # (nil when the yield is empty).
  # The yield is computed once and scanned back to front without building a
  # reversed copy; category is dispatched on self rather than via eval.
  # NOTE(review): assumes yield_nodes_ordered gives the same list on
  # repeated calls — confirm.
  terminals = node.yield_nodes_ordered
  terminals.reverse_each.find { |t| category(t) != "pun" } || terminals.last
end
943
-
944
- ###
945
- # preposition
946
- #
947
- # if the given node represents a PP, return the preposition
948
- #
949
- # default: assume that either the PP node will have the preposition as its lemma,
950
- # or that the head terminal of the PP will be the preposition
951
def SynInterpreter.preposition(node)
  # If node represents a PP, returns its preposition as a lemma string.
  #
  # returns: the non-empty lemma of node itself (if node has category
  #          "prep"), else the non-empty lemma of node's head terminal (if
  #          that has category "prep"), else nil.
  #          Also nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # preposition as lemma of this node?
  # (category / lemma_backoff / head_terminal are dispatched on self, the
  # receiving class, so subclass overrides apply)
  if category(node) == "prep"
    lemma = lemma_backoff(node)
    return lemma if lemma && !lemma.empty?
  end

  # head terminal is preposition and has a lemma?
  head = head_terminal(node)
  if head && category(head) == "prep"
    lemma = lemma_backoff(head)
    return lemma if lemma && !lemma.empty?
  end

  # no luck
  nil
end
976
-
977
-
978
- ###
979
- # main node of expression
980
- #
981
- # returns: SynNode, main node, if found
982
- # else nil
983
def SynInterpreter.main_node_of_expr(nodelist,
                                     no_mwes = nil) # truthy: don't handle multiword expressions beyond verbs with separate particles
  # Picks the main node of an expression given as a list of nodes.
  # The heuristics are tried in order; the first one that succeeds wins:
  #   1. the expression covers exactly one terminal
  #   2. nodelist is a single constituent that has a head terminal
  #   3. verb + separate particle (after dropping auxiliaries and modals)
  #   -- stop here and return nil if no_mwes is set --
  #   4. head terminal of the lowest common ancestor, if it is part of the
  #      expression
  #   5. first verb, else first noun, else first adjective in nodelist
  #   6. first node of nodelist
  #
  # returns: SynNode, main node, if found
  #          else nil
  #
  # All interpreter helpers are dispatched on self (the receiving class)
  # rather than through eval of the class name, which was re-parsed for
  # every terminal inside the blocks below.

  # map nodes to terminals
  terminals = nodelist.map { |n| n.yield_nodes }.flatten

  # single terminal? return it
  return terminals.first if terminals.length == 1

  # more than one word

  # see if we can get a headword of a single constituent
  if nodelist.length == 1 && (headword = head_terminal(nodelist.first))
    return headword
  end

  # filter out auxiliaries and modals, see if only one node remains
  content_terminals = terminals.reject { |t| auxiliary?(t) || modal?(t) }

  # one verb, one prep or particle? then
  # assume we have a separate verb prefix, and take the verb as main node
  if content_terminals.length == 2
    verbs = content_terminals.select { |t| category(t) == "verb" }
    if verbs.length == 1 && particle_of_verb(verbs.first, content_terminals)
      return verbs.first
    end
  end

  # I was told only to look for separate verb particles,
  # not for anything else, so return nil at this point
  return nil if no_mwes

  # filtered out everything? oops -- return to previous node list
  content_terminals = terminals if content_terminals.empty?

  # if the nodelist describes an mwe, try to find its headword:
  # walk upward from the first terminal until we reach a node whose yield
  # includes all remaining terminals -- the lowest common ancestor
  lca = content_terminals.first
  lca_found = false
  while lca && !lca_found
    yn = lca.yield_nodes
    if content_terminals.big_and { |t| yn.include? t }
      lca_found = true
    else
      lca = lca.parent
    end
  end

  # the expression includes lca's head terminal? then return that
  if lca_found && (h = head_terminal(lca)) && content_terminals.include?(h)
    return h
  end

  # try first verb, then first noun, then first adjective.
  # Note that this scans the nodes of nodelist as given, not the terminals.
  ["verb", "noun", "adj"].each do |cat|
    nodelist.each do |t|
      return t if category(t) == cat
    end
  end

  # return first node
  nodelist.first
end
1068
-
1069
- ########
1070
- # max constituents:
1071
- # given a set of nodes, compute the maximal constituents
1072
- # that exactly cover them
1073
- #
1074
- # If include_single_missing_children is set to true,
1075
- # then a node that has at least one child whose yield is in nodelist,
1076
- # and has only one child whose yield is not in nodelist,
1077
- # will be considered as having its yield in nodelist.
1078
- #
1079
- # Optionally, a procedure accept_anyway_proc can be given.
1080
- # Like the option include_single_missing_children, it can lead to nodes being
1081
- # included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYAAYNN)
1082
- # even though not all of their yield nodes are yield nodes of the node_list.
1083
- # accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
1084
- # The procedure is called with three arguments:
1085
- # accept_anyway_proc(node, ch_in, ch_out)
1086
- # node is a SynNode that would not normally be in NYAAYNN.
1087
- # ch_in is the list of its children that are in NYAAYNN.
1088
- # ch_out is the list of its children that are not.
1089
- # If the procedure exists and returns true, node is put into NYAAYNN.
1090
- #
1091
- #
1092
- # default: use the SalsaTigerSentence method for this
1093
def SynInterpreter.max_constituents(nodeset, # Array:SynNode
                                    sent, # SalsaTigerSentence
                                    idealize_maxconst = false, # boolean
                                    accept_anyway_proc = nil) # procedure
  # Delegates the computation of maximal constituents to the sentence object.
  # Without idealization the plain per-node method is used and
  # accept_anyway_proc is ignored.
  return sent.max_constituents_for_nodes(nodeset) unless idealize_maxconst

  # Idealized variant; the third argument is false so that
  # empty terminals are not ignored.
  sent.max_constituents_smc(nodeset, idealize_maxconst, false, accept_anyway_proc)
end
1106
-
1107
- ########
1108
- # prune?
1109
- # given a target node t and another node n of the syntactic structure,
1110
- # decide whether n is likely to instantiate a semantic role
1111
- # of t. If not, recommend n for pruning.
1112
- #
1113
- # This method is supposed to implement a method similar
1114
- # to the one proposed by Xue and Palmer (EMNLP 2004).
1115
- #
1116
- # returns: true to recommend n for pruning, else false
1117
- #
1118
- # Since the implementation is highly parser-specific,
1119
- # all that we can do in the default method is
1120
- # always to return false.
1121
def SynInterpreter.prune?(node, # SynNode
                          paths_to_target, # hash: node ID -> Path object: paths from nodes to target
                          terminal_index) # hash: terminal node -> word index in sentence
  # Default pruning decision: never recommend pruning.
  # paths_to_target and terminal_index are not consulted here.
  #
  # returns: false for any SynNode,
  #          nil (after a warning on stderr) for any other input
  if node.kind_of? SynNode
    false
  else
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    nil
  end
end
1132
-
1133
-
1134
- ####################3
1135
- protected
1136
-
1137
def SynInterpreter.announce_me
  # Registers the receiving class with SynInterfaces, if that collector
  # class has been loaded; otherwise reports on stderr that the interface
  # was not announced.
  if defined?(SynInterfaces)
    # yup, we have a class to which we can announce ourselves.
    # self is the class this method was called on, so no eval of the class
    # name is needed to obtain it.
    SynInterfaces.add_interpreter(self)
  else
    # no interface collector class
    $stderr.puts "Interface #{self.name()} not announced: no SynInterfaces."
  end
end
1146
-
1147
- ####################3
1148
- private
1149
-
1150
- ###
1151
- # search upward:
1152
- # look for path from from_node to to_node
1153
- # already_covered is either nil or
1154
- # a node whose subtree we have already searched
1155
def SynInterpreter.search_up(from_node, # SynNode
                             to_node, # SynNode
                             already_covered) # SynNode or nil
  # Looks for to_node in the subtree below from_node first; if it is not
  # there, recurses on from_node's parent, passing from_node as the subtree
  # that has already been searched.
  #
  # returns: the Path from from_node to to_node, or nil if no path exists
  #
  # NOTE: a bare `private` does not change the visibility of singleton
  # methods defined as `def SynInterpreter.…`, so this method is callable
  # with an explicit receiver.

  # search_down / search_up / simplified_pt are dispatched on self (the
  # receiving class) instead of eval-ing the class name at every step.
  path = search_down(from_node, to_node, already_covered)

  # search down successful
  return path unless path.nil?

  # search down unsuccessful: go up one level
  parent = from_node.parent
  edgelabel = from_node.parent_label

  # reached the root: no path found
  return nil if parent.nil?

  path = search_up(parent, to_node, from_node)

  # no path found further up either
  return nil if path.nil?

  # search up was successful: prepend the step from from_node to its parent
  parent_pt = simplified_pt(parent)
  path.add_first_step(from_node, "U", edgelabel, parent_pt)
  path
end
1196
-
1197
- ###
1198
- # search in tree
1199
def SynInterpreter.search_down(from_node, # SynNode
                               to_node, # SynNode
                               already_explored) # SynNode or nil
  # Depth-first search for to_node in the subtree rooted at from_node,
  # skipping the child subtree already_explored.
  #
  # returns: the Path from from_node down to to_node, or nil if to_node is
  #          not found below from_node

  # found the target: a path consisting of just this node
  return Path.new(from_node) if from_node == to_node

  from_node.children.each do |child|
    # we have done this subtree, don't do it again
    next if child == already_explored

    # recursion is dispatched on self (the receiving class) instead of
    # eval-ing the class name for every node visited
    path = search_down(child, to_node, already_explored)
    next if path.nil?

    # the target is below this child: prepend the step from_node -> child
    child_pt = simplified_pt(child)
    path.add_first_step(from_node, "D", child.parent_label, child_pt)
    return path
  end

  # no path found for any of the children
  nil
end
1229
- end