frprep 0.0.1.prealpha

Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
data/lib/fred/fred.rb
@@ -0,0 +1,45 @@
1
+ # AB: 2011-11-13
2
+ # Initial import done, need to reimplement the whole interface.
3
+ require 'fred/FredFeaturize'
4
+ require 'fred/FredSplit'
5
+ require 'fred/FredTrain'
6
+ require 'fred/FredTest'
7
+ require 'fred/FredEval'
8
+
9
+ module Fred
10
+ class Fred
11
+
12
+ def initialize(options)
13
+ @exp, @opts = options
14
+ @task = @opts['--task']
15
+ end
16
+ ##
17
+ # now perform the given task
18
+
19
+ def assign
20
+
21
+ # initialize task object
22
+ case @task
23
+ when "featurize"
24
+ task_obj = FredFeaturize.new(@exp, @opts)
25
+ when "refeaturize"
26
+ task_obj = FredFeaturize.new(@exp, @opts, "refeaturize" => true)
27
+ when "split"
28
+ task_obj = FredSplit.new(@exp, @opts)
29
+ when "train"
30
+ task_obj = FredTrain.new(@exp, @opts)
31
+ when "test"
32
+ task_obj = FredTest.new(@exp, @opts)
33
+ when "eval"
34
+ task_obj = FredEval.new(@exp, @opts)
35
+ else
36
+ raise "Shouldn't be here"
37
+ end
38
+
39
+ task_obj.compute
40
+
41
+ $stderr.puts "Fred: Done."
42
+
43
+ end
44
+ end # class Fred
45
+ end # module Fred
data/lib/fred/md5.rb
@@ -0,0 +1,23 @@
1
+ # just for compatibility; requiring "md5" is obsoleted
2
+ #
3
+ # $RoughId: md5.rb,v 1.4 2001/07/13 15:38:27 knu Exp $
4
+ # $Id: md5.rb 12008 2007-03-06 10:12:12Z knu $
5
+
6
+ require 'digest/md5'
7
+
8
+ class MD5 < Digest::MD5
9
+ class << self
10
+ alias orig_new new
11
+ def new(str = nil)
12
+ if str
13
+ orig_new.update(str)
14
+ else
15
+ orig_new
16
+ end
17
+ end
18
+
19
+ def md5(*args)
20
+ new(*args)
21
+ end
22
+ end
23
+ end
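
The shim only restores the old calling convention in which MD5.new (and MD5.md5) could take the input string directly; the digest itself is still computed by Digest::MD5. For illustration (the require path assumes the gem's lib directory is on the load path):

require 'fred/md5'

# Both forms produce the same digest; the subclass merely forwards the
# constructor argument to Digest::MD5#update.
MD5.new("shalmaneser").hexdigest == Digest::MD5.hexdigest("shalmaneser")  # => true
MD5.md5("shalmaneser").hexdigest == Digest::MD5.hexdigest("shalmaneser")  # => true
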
data/lib/fred/opt_parser.rb
@@ -0,0 +1,250 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ # AB, 2011-11-13
4
+
5
+ #require 'optparse' # for reimplementation
6
+ require 'getoptlong'
7
+ require "fred/FredConfigData"
8
+
9
+ module Fred
10
+
11
+ # This class parses options for Fred.
12
+ class OptParser
13
+ def self.parse(cmd_opts)
14
+ tasks = {
15
+ "featurize" => [ [ '--dataset', '-d', GetoptLong::REQUIRED_ARGUMENT], # set to featurize: 'train' or 'test'
16
+ [ "--append", "-A", GetoptLong::NO_ARGUMENT]
17
+ ],
18
+ "refeaturize" => [ [ '--dataset', '-d', GetoptLong::REQUIRED_ARGUMENT], # set to featurize: 'train' or 'test'
19
+ [ "--append", "-A", GetoptLong::NO_ARGUMENT]
20
+ ],
21
+ "split" => [ ['--logID', '-i', GetoptLong::REQUIRED_ARGUMENT], # splitlog ID, required, no default
22
+ [ '--trainpercent', '-r', GetoptLong::REQUIRED_ARGUMENT] # percentage training data, default: 90
23
+ ],
24
+ "train" => [ ['--logID', '-i', GetoptLong::REQUIRED_ARGUMENT] # splitlog ID; if given, will train on split
25
+ # rather than all training data
26
+ ],
27
+ "test" => [ ['--logID', '-i', GetoptLong::REQUIRED_ARGUMENT], # splitlog ID: if given, test on this split of
28
+ # the training data
29
+ [ '--baseline', '-b', GetoptLong::NO_ARGUMENT], # set this to compute baseline rather than
30
+ # apply classifiers
31
+ [ '--nooutput', '-N', GetoptLong::NO_ARGUMENT] # set this to prevent output of disambiguated
32
+ # test data
33
+
34
+ ],
35
+ "eval" => [['--logID', '-i', GetoptLong::REQUIRED_ARGUMENT], # splitlog ID: if given, evaluate this split.
36
+ ['--printLog', '-l', GetoptLong::NO_ARGUMENT]
37
+ ]
38
+ }
39
+
40
+ # general options
41
+ optnames = [[ '--help', '-h', GetoptLong::NO_ARGUMENT], # get help
42
+ [ '--expfile', '-e', GetoptLong::REQUIRED_ARGUMENT], # experiment file name (and path), no default
43
+ [ '--task', '-t', GetoptLong::REQUIRED_ARGUMENT ], # task to perform: one of task.keys, no default
44
+ ]
45
+ # append task-specific to general options
46
+ tasks.values.each { |more_optnames|
47
+ optnames.concat more_optnames
48
+ }
49
+ optnames.uniq!
50
+
51
+ # asterisk: "explode" array into individual parameters
52
+ begin
53
+ opts = options_hash(GetoptLong.new(*optnames))
54
+ rescue
55
+ $stderr.puts "Error: unknown command line option: " + $!
56
+ exit 1
57
+ end
58
+
59
+ experiment_filename = nil
60
+
61
+ ##
62
+ # are we being asked for help?
63
+ if opts['--help']
64
+ help()
65
+ exit(0)
66
+ end
67
+
68
+ ##
69
+ # now find the task
70
+ task = opts['--task']
71
+ # sanity checks for task
72
+ if task.nil?
73
+ help()
74
+ exit(0)
75
+ end
76
+ unless tasks.keys.include? task
77
+ $stderr.puts "Sorry, I don't know the task " + task
78
+ exit 1
79
+ end
80
+
81
+ ##
82
+ # now evaluate the rest of the options
83
+ opts.each_pair { |opt,arg|
84
+ case opt
85
+ when '--help', '--task'
86
+ # we already handled this
87
+ when '--expfile'
88
+ experiment_filename = arg
89
+ else
90
+ # do we know this option?
91
+ unless tasks[task].assoc(opt)
92
+ $stderr.puts "Sorry, I don't know the option " + opt + " for task " + task
93
+ exit 1
94
+ end
95
+ end
96
+ }
97
+
98
+
99
+
100
+ unless experiment_filename
101
+ $stderr.puts "I need an experiment file name, option --expfile|-e"
102
+ exit 1
103
+ end
104
+
105
+ ##
106
+ # open config file
107
+
108
+ exp = FredConfigData.new(experiment_filename)
109
+
110
+ # sanity checks
111
+ unless exp.get("experiment_ID") =~ /^[A-Za-z0-9_]+$/
112
+ raise "Please choose an experiment ID consisting only of the letters A-Za-z0-9_."
113
+ end
114
+
115
+ # enduser mode?
116
+ $ENDUSER_MODE = exp.get("enduser_mode")
117
+
118
+ # set defaults
119
+ unless exp.get("handle_multilabel")
120
+ if exp.get("binary_classifiers")
121
+ exp.set_entry("handle_multilabel", "binarize")
122
+ else
123
+ exp.set_entry("handle_multilabel", "repeat")
124
+ end
125
+ end
126
+ # sanity check: if we're using option 'binarize' for handling items
127
+ # with multiple labels, we have to have binary classifiers
128
+ if exp.get("handle_multilabel") == "binarize" and not(exp.get("binary_classifiers"))
129
+ $stderr.puts "Error: cannot use 'handle_multilabel=binarize' with n-ary classifiers."
130
+ exit(1)
131
+ end
132
+ unless exp.get("numerical_features")
133
+ exp.set_entry("numerical_features", "bin")
134
+ end
135
+
136
+ [exp, opts]
137
+ end
138
+ private
139
+ ###
140
+ # options_hash:
141
+ #
142
+ # GetoptLong only allows you to access options via each(),
143
+ # not individually, and it only allows you to cycle through the options once.
144
+ # So we re-code the options as a hash
145
+ def self.options_hash(opts_obj) # GetoptLong object
146
+ opt_hash = Hash.new
147
+
148
+ opts_obj.each do |opt, arg|
149
+ opt_hash[opt] = arg
150
+ end
151
+
152
+ return opt_hash
153
+ end
154
+ def self.help
155
+ $stderr.puts "
156
+ Fred: FRamE Disambiguation System Version 0.3
157
+
158
+ Usage:
159
+ ----------------
160
+
161
+ ruby fred.rb --help|-h
162
+ Gets you this text.
163
+
164
+
165
+ ruby fred.rb --task|-t featurize --expfile|-e <e> --dataset|-d <d>
166
+ [--append|-A]
167
+ Featurizes input data and stores it in feature files.
168
+ Feature files are stored in
169
+ <fred_directory>/<experiment_ID>/<train/test>/features
170
+ Enduser mode: dataset has to be test (preset as default), no --append.
171
+
172
+ --expfile <e> Use <e> as the experiment description and configuration file
173
+
174
+ --dataset <d> Set to featurize: <d> is either 'train' or 'test'
175
+ Accordingly, either the directory dir_train or dir_test (as
176
+ specified in the experiment file) is used to store the features
177
+
178
+ --append Do not overwrite previously computed features for this experiment.
179
+ Rather, append the new features to the old featurization files.
180
+ Default: overwrite
181
+
182
+ ruby fred.rb --task|-t split --expfile|-e <e> --logID|-i <i>
183
+ [--trainpercent|-r <r>]
184
+ Produces a new train/test split on the training data of the experiment.
185
+ Split logs are stored in <fred_directory>/<experiment_ID>/split/<splitlog ID>
186
+ Not available in enduser mode.
187
+
188
+ --expfile <e> Use <e> as the experiment description and configuration file
189
+
190
+ --logID <l> Use <l> as the ID for storing this new split
191
+
192
+ --trainpercent <r> Allocate <r> percent of the data as train,
193
+ and 100-<r> as test.
194
+ default: <r>=90
195
+
196
+ ruby fred.rb --task|-t train --expfile|-e <e>
197
+ [--logID|-i <i> ]
198
+ Train classifier(s) on the training data (or a split of it)
199
+ Classifiers are stored in
200
+ <fred_directory>/<experiment_ID>/classifiers/<classifier_name>
201
+ Not available in enduser mode.
202
+
203
+ --expfile <e> Use <e> as the experiment description and configuration file
204
+
205
+ --logID <l> Train not on the whole training data but
206
+ on the split with ID <l>
207
+
208
+ ruby fred.rb --task|-t test --expfile|-e <e>
209
+ [--logID|-i <i>] [--baseline|-b]
210
+ [--nooutput|-N]
211
+ Apply classifier(s) to the test data (or a split of the training data)
212
+ Classification results are stored in
213
+ <fred_directory>/<experiment_ID>/results/main or
214
+ <fred_directory>/<experiment_ID>/results/baseline for the baseline.
215
+ If you are using classifier combination, individual classification results
216
+ are stored in <fred_directory>/<experiment_ID>/results/<classifier_name>
217
+ System output (disambiguated text in SalsaTigerXML format) is stored in
218
+ <fred_directory>/<experiment_ID>/output/stxml
219
+ or <directory_output>, if that has been specified.
220
+
221
+ --expfile <e> Use <e> as the experiment description and configuration file
222
+
223
+ --logID <l> Test on a split of the training data with ID <l>
224
+
225
+ --baseline Compute the baseline: Always assign most frequent sense.
226
+ Default: use the trained classifiers
227
+
228
+ --nooutput Do not produce an output of the disambiguated test data
229
+ in SalsaTigerXML format. This is useful if you just want
230
+ to evaluate the system.
231
+ Default: output is produced.
232
+
233
+ ruby fred.rb --task|-t eval --expfile|-e <e>
234
+ [--logID|-i <i>] [--printLog|-l]
235
+ Evaluate the performance of Fred on the test data
236
+ (or on a split of the training data).
237
+ Evaluation file is written to <fred_directory>/<experiment_ID>/eval/eval
238
+ Not available in enduser mode.
239
+
240
+ --expfile <e> Use <e> as the experiment description and configuration file
241
+
242
+ --logID <l> Evaluate a split of the training data with ID <l>
243
+
244
+ --printLog Also print logfile detailing evaluation of every instance.
245
+ Log file is written to <fred_directory>/eval/log
246
+
247
+ "
248
+ end
249
+ end # class OptParser
250
+ end # module Fred
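
For concreteness, a call such as ruby fred.rb --task featurize --expfile fred.salsa --dataset train comes back from OptParser.parse roughly in the shape sketched below (the experiment file name is invented; the hash keys are always the long option names, because that is what options_hash records from GetoptLong):

exp, opts = Fred::OptParser.parse(ARGV)
# exp  => FredConfigData built from fred.salsa, with the defaults for
#         handle_multilabel and numerical_features filled in as above
# opts => { "--task"    => "featurize",
#           "--expfile" => "fred.salsa",
#           "--dataset" => "train" }
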
data/lib/frprep/AbstractSynInterface.rb
@@ -0,0 +1,1227 @@
1
+ # Katrin Erk Oct/Nov 05
2
+ #
3
+ # Abstract classes for interfaces for systems that provide syntactic
4
+ # analysis.
5
+ #
6
+ # There are two types of interfaces to syntactic analysis systems:
7
+ # - interfaces:
8
+ # offer methods for syntactic analysis.
9
+ #
10
+ # SynInterfaceTab:
11
+ # input and output format is (FN)TabFormat.
12
+ # SynInterfaceSTXML:
13
+ # input format is TabFormat, output format is
14
+ # Salsa/Tiger XML, also provided as
15
+ # SalsaTigerSentence objects
16
+ #
17
+ # - interpreters:
18
+ # interpret the resulting Salsa/Tiger XML (represented as
19
+ # SalsaTigerSentence and SynNode objects), e.g.
20
+ # generalize over part of speech;
21
+ # describe the path between a pair of nodes both as a path
22
+ # and (potentially) as a grammatical function of one of the nodes;
23
+ # determine whether a node describes a verb, and in which voice;
24
+ # determine the head of a constituent
25
+
26
+ require "tempfile"
27
+
28
+ require "frprep/ruby_class_extensions"
29
+
30
+ require "frprep/ISO-8859-1"
31
+ require "frprep/Parser"
32
+ require "frprep/SalsaTigerRegXML"
33
+ require "frprep/TabFormat"
34
+
35
+ #############################
36
+ # abstract class, to be inherited:
37
+ #
38
+ # tabular format or SalsaTigerXML interface for modules
39
+ # offering POS tagging, lemmatization, parsing etc.
40
+ class SynInterface
41
+
42
+ ###
43
+ # returns a string: the name of the system
44
+ # e.g. "Collins" or "TNT"
45
+ def SynInterface.system()
46
+ raise "Overwrite me"
47
+ end
48
+
49
+ ###
50
+ # returns a string: the service offered
51
+ # one of "lemmatizer", "parser", "pos tagger"
52
+ def SynInterface.service()
53
+ raise "Overwrite me"
54
+ end
55
+
56
+ ###
57
+ # initialize to set values for all subsequent processing
58
+ def initialize(program_path, # string: path to system
59
+ insuffix, # string: suffix of input files
60
+ outsuffix, # string: suffix for processed files
61
+ var_hash = {}) # optional arguments in a hash
62
+
63
+ @program_path = program_path
64
+ @insuffix = insuffix
65
+ @outsuffix = outsuffix
66
+ end
67
+
68
+ ###
69
+ # process each file in in_dir with matching suffix,
70
+ # producing a file in out_dir with same name but the suffix replaced
71
+ #
72
+ # returns: nothing
73
+ def process_dir(in_dir, # string: name of input directory
74
+ out_dir) # string: name of output directory
75
+
76
+ Dir[in_dir+"*#{@insuffix}"].each {|infilename|
77
+ outfilename = out_dir + File.basename(infilename, @insuffix) + @outsuffix
78
+ process_file(infilename,outfilename)
79
+ }
80
+ end
81
+
82
+ ###
83
+ # process one file, writing the result to outfilename
84
+ #
85
+ # returns: nothing
86
+ def process_file(infilename, # string: name of input file
87
+ outfilename)
88
+ raise "Overwrite me"
89
+ end
90
+
91
+ ######
92
+ protected
93
+
94
+ def SynInterface.announce_me()
95
+ if defined?(SynInterfaces)
96
+ # yup, we have a class to which we can announce ourselves
97
+ SynInterfaces.add_interface(eval(self.name()))
98
+ else
99
+ # no interface collector class
100
+ $stderr.puts "Interface #{self.name()} not announced: no SynInterfaces."
101
+ end
102
+ end
103
+ end
104
+
105
+ #############################
106
+ # abstract class, to be inherited:
107
+ #
108
+ # SalsaTigerXML interface for modules
109
+ # offering parsing etc.
110
+ #
111
+ # The input format for these classes is TabFormat or FNTabFormat
112
+ class SynInterfaceSTXML < SynInterface
113
+ ###
114
+ # initialize to set values for all subsequent processing
115
+ def initialize(program_path, # string: path to system
116
+ insuffix, # string: suffix of input files
117
+ outsuffix, # string: suffix for processed files
118
+ stsuffix, # string: suffix for Salsa/Tiger XML files
119
+ var_hash = {}) # optional arguments in a hash
120
+ super(program_path, insuffix, outsuffix, var_hash)
121
+ @stsuffix = stsuffix
122
+ end
123
+
124
+ def to_stxml_dir(in_dir, # string: name of dir with parse files
125
+ out_dir) # string: name of output dir
126
+
127
+ Dir[in_dir+"*#{@outsuffix}"].each { |parsefilename|
128
+ stxmlfilename = out_dir + File.basename(parsefilename, @outsuffix) + @stsuffix
129
+ to_stxml_file(parsefilename, stxmlfilename)
130
+ }
131
+ end
132
+
133
+ def to_stxml_file(infilename,
134
+ outfilename)
135
+ raise "Overwrite me"
136
+ end
137
+
138
+ ###
139
+ # standard mapping:
140
+ #
141
+ # to be used as the mapping from tab sentence words to
142
+ # SalsaTigerSentence nodes returned by each_sentence():
143
+ # map the n-th word of the tab sentence to the n-th terminal of
144
+ # the SalsaTigerSentence
145
+ def SynInterfaceSTXML.standard_mapping(sent, tabsent)
146
+ retv = Hash.new
147
+ if sent.nil?
148
+ return nil
149
+ end
150
+ terminals = sent.terminals_sorted()
151
+ if tabsent
152
+ tabsent.each_line_parsed { |l|
153
+ if (t = terminals[l.get("lineno")])
154
+ retv[l.get("lineno")] = [t]
155
+ else
156
+ retv[l.get("lineno")] = []
157
+ end
158
+ }
159
+ end
160
+ return retv
161
+ end
162
+
163
+
164
+ ###
165
+ # for a given processed file:
166
+ # yield each sentence as a tuple
167
+ # [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
168
+ # of
169
+ # - the sentence in SalsaTigerXML,
170
+ # - the matching tab format sentence
171
+ # - a mapping of terminals:
172
+ # hash: line in tab sentence(integer) -> array:SynNode
173
+ # mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
174
+ #
175
+ # default version: write Salsa/Tiger XML to tempfile, read back in
176
+ # and assume that each sentence in the tab file has a correspondent
177
+ # in the processed file (may not hold e.g. if the parser leaves out
178
+ # sentences it cannot process)
179
+ def each_sentence(infilename, # string: name of processed file
180
+ tab_dir = nil) # string: name of dir with input files
181
+ # (set either here or on initialization)
182
+ if tab_dir
183
+ @tab_dir = tab_dir
184
+ end
185
+
186
+ # write Salsa/Tiger XML to tempfile
187
+ tf = Tempfile.new("SynInterface")
188
+ tf.close()
189
+ to_stxml_file(infilename, tf.path)
190
+ tf.flush()
191
+
192
+ # get matching tab file, read
193
+ tab_reader = get_tab_reader(infilename)
194
+ tab_sentences = Array.new
195
+ tab_reader.each_sentence { |s| tab_sentences << s }
196
+
197
+ # read Salsa/Tiger sentences and yield them
198
+ reader = FilePartsParser.new(tf.path)
199
+ sent_index = 0
200
+ reader.scan_s { |sent_string|
201
+ yield [
202
+ SalsaTigerSentence.new(sent_string, tab_sentences[sent_index]),
203
+ tab_sentences[sent_index],
204
+ SynInterfaceSTXML.standard_mapping(sent, tab_sentences[sent_index])
205
+ ]
206
+ sent_index += 1
207
+ }
208
+
209
+ # remove tempfile
210
+ tf.close(true)
211
+ end
212
+
213
+ #####################
214
+ protected
215
+
216
+
217
+ ###
218
+ # get tab format file for a given processed file
219
+ def get_tab_reader(infilename) # string: name of processed file
220
+ # find matching non-processed file for processed file
221
+ # assumption: directory with non-processed files
222
+ # has been set as @tab_dir
223
+
224
+ # sanity checks
225
+ unless @tab_dir
226
+ raise "Need to set tab directory"
227
+ end
228
+
229
+ # get matching tab file for this parser output file
230
+ tabfilename = @tab_dir+File.basename(infilename, @outsuffix)+ @insuffix
231
+ return FNTabFormatFile.new(tabfilename)
232
+ end
233
+
234
+
235
+ ###
236
+ # provide a XML representation for a sentence that couldn't be analyzed
237
+ # assuming a flat structure of all terminals, adding a virtual top node
238
+ def SynInterfaceSTXML.failed_sentence(tab_sent,sentid)
239
+
240
+ sent_obj = SalsaTigerSentence.empty_sentence(sentid.to_s)
241
+
242
+ sent_obj.set_attribute("failed","true")
243
+
244
+ topnode = sent_obj.add_syn("nt",
245
+ "NONE", # cat
246
+ nil, # word (doesn't matter)
247
+ nil, # pos (doesn't matter)
248
+ "500") # nonterminal counter
249
+
250
+ t_counter = 0
251
+
252
+ tab_sent.each_line_parsed {|line|
253
+ t_counter += 1
254
+ word = line.get("word")
255
+ pos = line.get("pos")
256
+ node = sent_obj.add_syn("t",
257
+ nil, # cat (doesn't matter here)
258
+ SalsaTigerXMLHelper.escape(word), # word
259
+ pos, # pos
260
+ t_counter.to_s)
261
+ topnode.add_child(node,nil)
262
+ node.add_parent(topnode, nil)
263
+ }
264
+ return sent_obj
265
+ end
266
+ end
267
+
268
+ #############################
269
+ # abstract class, to be inherited:
270
+ #
271
+ # tabular format interface for modules
272
+ # offering POS tagging, lemmatization etc.
273
+ class SynInterfaceTab < SynInterface
274
+
275
+ ##########
276
+ protected
277
+
278
+ # fntab_words_for_file:
279
+ # given a file in tab format, columns as in FNTabFormat,
280
+ # get the "word" entries and write them to a given file,
281
+ # one word per line, as input for processing
282
+ def SynInterfaceTab.fntab_words_to_file(infilename, # string: name of input file
283
+ outfile, # stream: output file
284
+ sent_marker = "", # string: mark end of sentence how?
285
+ iso = nil) # non-nil: assume utf-8, transform to iso-8859-1
286
+ corpusfile = FNTabFormatFile.new(infilename)
287
+ corpusfile.each_sentence {|s|
288
+ s.each_line_parsed {|line_obj|
289
+ if iso
290
+ outfile.puts UtfIso.to_iso_8859_1(line_obj.get("word"))
291
+ else
292
+ outfile.puts line_obj.get("word")
293
+ end
294
+ }
295
+ outfile.puts sent_marker
296
+ }
297
+ end
298
+ end
299
+
300
+ #############################
301
+ # class describing a path between two nodes
302
+ #
303
+ # provides access and output facilities for different aspects of the path
304
+ #
305
+ # this is the return value of SynInterpreter.path_between()
306
+ class Path
307
+ attr_reader :startnode
308
+
309
+ ###
310
+ # initialize to empty path
311
+ def initialize(startnode)
312
+ @path = Array.new
313
+ @cutoff_last_pt = false
314
+ set_startnode(startnode)
315
+ end
316
+
317
+ ###
318
+ # deep_clone:
319
+ # return clone of this path object,
320
+ # with clone of this path rather than the same path
321
+ def deep_clone()
322
+ new_path = self.clone()
323
+ new_path.set_path(@path.clone())
324
+
325
+ return new_path
326
+ end
327
+
328
+ ###
329
+ def set_startnode(startnode)
330
+ @startnode = startnode
331
+
332
+ return self
333
+ end
334
+
335
+ ###
336
+ # iterate through the current path
337
+ #
338
+ # yield tuples
339
+ # [direction, edgelabel, nodelabel, endnode]
340
+ # direction: string, U/D
341
+ # edgelabel: string
342
+ # nodelabel: string
343
+ # endnode: SynNode
344
+ def each_step()
345
+ @path.each { |step|
346
+ yield step
347
+ }
348
+ end
349
+
350
+ ###
351
+ # empty?
352
+ # any steps in here?
353
+ def empty?
354
+ return @path.empty?
355
+ end
356
+
357
+ ###
358
+ # add one step to the beginning of the current path
359
+ def add_first_step(start_node,#SynNode
360
+ direction, # string: U, D
361
+ gf, # string: edge label
362
+ pt)
363
+ @path.prepend([direction, gf, pt, @startnode])
364
+ set_startnode(start_node)
365
+
366
+ return self
367
+ end
368
+
369
+
370
+ ###
371
+ # add one step to the end of the current path
372
+ def add_last_step(direction, # string: U, D
373
+ gf, # string: edge label
374
+ pt, # string: node label (of end_node)
375
+ end_node) # SynNode
376
+ @path << [direction, gf, pt, end_node]
377
+
378
+ return self
379
+ end
380
+
381
+ ###
382
+ # path length
383
+ def length()
384
+ return @path.length()
385
+ end
386
+
387
+ ###
388
+ #
389
+ def print(print_direction, # boolean. true: print direction
390
+ print_gf, # boolean. true: print edgelabel
391
+ print_pt) # boolean. true: print nodelabel
392
+
393
+ return print_aux(@path, print_direction, print_gf, print_pt)
394
+ end
395
+
396
+ ###
397
+ # print path from roof node to end
398
+ def print_downpart(print_direction,
399
+ print_gf,
400
+ print_pt)
401
+
402
+ roof, roof_index = compute_roof()
403
+ if roof.nil? or @path.empty?
404
+ # no roof set
405
+ return ""
406
+
407
+ else
408
+ # roof node is in the middle
409
+ return print_aux(@path[roof_index..-1],
410
+ print_direction, print_gf, print_pt)
411
+ end
412
+ end
413
+
414
+ ###
415
+ def lca()
416
+ return compute_roof().first
417
+ end
418
+
419
+ ###
420
+ # cut off last node label in print() and print_downpart()?
421
+ def set_cutoff_last_pt_on_printing(bool) # Boolean
422
+ @cutoff_last_pt = bool
423
+ end
424
+
425
+ ########
426
+ protected
427
+
428
+ def set_path(new_path)
429
+ @path = new_path
430
+ end
431
+
432
+
433
+ ########
434
+ private
435
+
436
+ ###
437
+ # step through the path as long as direction is up.
438
+ # when direction starts to go "D", take current node as roof node
439
+ #
440
+ # returns: pair [roof node, roof node index] (SynNode, integer)
441
+ def compute_roof()
442
+ node = @startnode
443
+ index = 0
444
+
445
+ each_step { |direction, edgelabel, nodelabel, endnode|
446
+ if direction =~ /D/
447
+ # down! the previous node was roof
448
+ return [node, index]
449
+ else
450
+ node = endnode
451
+ index += 1
452
+ end
453
+ }
454
+
455
+ # last node is roof
456
+ return [node, index]
457
+
458
+ end
459
+
460
+ ###
461
+ def print_aux(path,
462
+ print_direction,
463
+ print_gf,
464
+ print_pt)
465
+ retv = ""
466
+ path.each { |step|
467
+ direction, gf, pt, node = step.map { |entry|
468
+ if entry.nil?
469
+ "-"
470
+ else
471
+ entry
472
+ end
473
+ }
474
+ if print_direction
475
+ retv << direction + " "
476
+ end
477
+ if print_gf
478
+ retv << gf + " "
479
+ end
480
+ if print_pt
481
+ retv << pt + " "
482
+ end
483
+ }
484
+
485
+ if @cutoff_last_pt and print_pt and
486
+ retv =~ /^(.+ )\w+ $/
487
+ return $1
488
+ else
489
+ return retv
490
+ end
491
+ end
492
+
493
+ end
494
+
495
+
496
+ #############################
497
+ # abstract class, to be inherited:
498
+ #
499
+ # interpretation for a POS tagger/lemmatizer/parser combination
500
+ class SynInterpreter
501
+
502
+ ###
503
+ # systems interpreted by this class:
504
+ # returns a hash service(string) -> system name (string),
505
+ # e.g.
506
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
507
+ def SynInterpreter.systems()
508
+ raise "Overwrite me"
509
+ end
510
+
511
+ ###
512
+ # names of additional systems that may be interpreted by this class
513
+ # returns a hash service(string) -> system name(string)
514
+ # same as names()
515
+ def SynInterpreter.optional_systems()
516
+ raise "Overwrite me"
517
+ end
518
+
519
+ ###
520
+ # generalize over POS tags.
521
+ #
522
+ # returns one of:
523
+ #
524
+ # adj: adjective (phrase)
525
+ # adv: adverb (phrase)
526
+ # card: numbers, quantity phrases
527
+ # con: conjunction
528
+ # det: determiner, including possessive/demonstrative pronouns etc.
529
+ # for: foreign material
530
+ # noun: noun (phrase), including personal pronouns, proper names, expletives
531
+ # part: particles, truncated words (German compound parts)
532
+ # prep: preposition (phrase)
533
+ # pun: punctuation, brackets, etc.
534
+ # sent: sentence
535
+ # top: top node of a sentence
536
+ # verb: verb (phrase)
537
+ # nil: something went wrong
538
+ #
539
+ # default: return phrase type as is
540
+ #
541
+ # returns: string or nil
542
+ def SynInterpreter.category(node) # SynNode
543
+ unless node.kind_of? SynNode
544
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
545
+ return nil
546
+ end
547
+
548
+ return eval(self.name()).pt(node)
549
+ end
550
+
551
+ ###
552
+ # is relative pronoun?
553
+ #
554
+ # default: false
555
+ def SynInterpreter.relative_pronoun?(node) # SynNode
556
+ return false
557
+ end
558
+
559
+ ###
560
+ # lemma_backoff:
561
+ #
562
+ # if we have lemma information, return that,
563
+ # and failing that, return the word
564
+ #
565
+ # returns: string or nil
566
+ def SynInterpreter.lemma_backoff(node)
567
+ unless node.kind_of? SynNode
568
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
569
+ return nil
570
+ end
571
+
572
+ lemma = node.get_attribute("lemma")
573
+ if (lemma.nil? or lemma =~ /unknown/) and
574
+ node.is_terminal?
575
+ return node.word()
576
+ else
577
+ return lemma
578
+ end
579
+ end
580
+
581
+ ###
582
+ # phrase type:
583
+ # constituent label for nonterminals,
584
+ # part of speech for terminals
585
+ #
586
+ # returns: string
587
+ def SynInterpreter.pt(node)
588
+ unless node.kind_of? SynNode
589
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
590
+ return nil
591
+ end
592
+
593
+ if node.is_terminal?
594
+ return node.part_of_speech
595
+ else
596
+ return node.category
597
+ end
598
+ end
599
+
600
+ ###
601
+ # simplified phrase type:
602
+ # like phrase type, but may simplify
603
+ # the constituent label
604
+ # default: just the same as pt()
605
+ #
606
+ # returns: string or nil
607
+ def SynInterpreter.simplified_pt(node)
608
+ return eval(self.name()).pt(node)
609
+ end
610
+
611
+ ###
612
+ # particle_of_verb:
613
+ #
614
+ # given a node and a nodelist,
615
+ # if the node represents a verb:
616
+ # see if the verb has a particle among the nodes in nodelist
617
+ # if so, return it
618
+ # default: no recognition of separate particles
619
+ #
620
+ # returns: SynNode object if successful, else nil
621
+ def SynInterpreter.particle_of_verb(node,
622
+ node_list)
623
+ return nil
624
+ end
625
+
626
+ ###
627
+ # auxiliary?
628
+ #
629
+ # returns true if the given node is an auxiliary
630
+ # default: no recognition of auxiliaries
631
+ #
632
+ # returns: boolean
633
+ def SynInterpreter.auxiliary?(node)
634
+ return false
635
+ end
636
+
637
+ ###
638
+ # modal?
639
+ #
640
+ # returns true if the given node is a modal verb
641
+ # default: no recognition of modals
642
+ #
643
+ # returns: boolean
644
+ def SynInterpreter.modal?(node)
645
+ return false
646
+ end
647
+
648
+ ###
649
+ # head_terminal
650
+ #
651
+ # given a constituent, return the terminal node
652
+ # that describes its headword
653
+ # default: a heuristic that assumes the existence of a 'head'
654
+ # attribute on nodes:
655
+ # find the first node in my yield corresponding to my head attribute.
656
+ #
657
+ # returns: a SynNode object if successful, else nil
658
+ def SynInterpreter.head_terminal(node)
659
+ unless node.kind_of? SynNode
660
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
661
+ return nil
662
+ end
663
+
664
+ if node.is_terminal?
665
+ return node
666
+ end
667
+
668
+ head = node.get_attribute("head")
669
+ unless head
670
+ return nil
671
+ end
672
+
673
+ return node.yield_nodes.detect { |t|
674
+ t.get_attribute("word") == head
675
+ }
676
+ end
677
+
678
+ ###
679
+ # voice
680
+ #
681
+ # given a constituent, return
682
+ # - "active"/"passive" if it is a verb
683
+ # - nil, else
684
+ #
685
+ # default: treat all as active
686
+ def SynInterpreter.voice(node)
687
+ unless node.kind_of? SynNode
688
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
689
+ return nil
690
+ end
691
+
692
+ if eval(self.name()).category(node) == "verb"
693
+ return "active"
694
+ else
695
+ return nil
696
+ end
697
+ end
698
+
699
+ ###
700
+ # gfs
701
+ #
702
+ # grammatical functions of a constituent:
703
+ #
704
+ # returns: a list of pairs [relation(string), node(SynNode)]
705
+ # where <node> stands in the relation <relation> to the parameter
706
+ # that the method was called with
707
+ #
708
+ # default: children of this node, with edge labels as relations,
709
+ # prepositions tacked on for pps
710
+ def SynInterpreter.gfs(node, # SynNode
711
+ sent) # SalsaTigerSentence
712
+ unless node.kind_of? SynNode
713
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
714
+ return nil
715
+ end
716
+
717
+ return node.children_with_edgelabel().map { |rel, gf_node|
718
+
719
+ if eval(self.name()).category(gf_node) == "prep"
720
+ [rel + "-" + eval(self.name()).preposition(gf_node).to_s, gf_node]
721
+
722
+ else
723
+ [rel, gf_node]
724
+ end
725
+ }
726
+ end
727
+
728
+ ###
729
+ # informative_content_node
730
+ #
731
+ # for most constituents: the head
732
+ # for a PP, the NP
733
+ # for an SBAR, the VP
734
+ # for a VP, the embedded VP
735
+ #
736
+ # Default: returns the first non-head child
737
+ def SynInterpreter.informative_content_node(node)
738
+ unless node.kind_of? SynNode
739
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
740
+ return nil
741
+ end
742
+
743
+ headlemma = eval(self.name()).lemma_backoff(node)
744
+
745
+ first_nonhead_child = node.children().detect { |n|
746
+ nnh = eval(self.name()).head_terminal(n)
747
+ nnh and
748
+ eval(self.name()).lemma_backoff(nnh) != headlemma
749
+ }
750
+
751
+ return first_nonhead_child
752
+ end
753
+
754
+ #####################################
755
+ # verbs(sent) sent is a sentence in SalsaTigerSentence format
756
+ #
757
+ # return a list of the nodes of full verbs in a given sentence:
758
+ # it is a list of lists. An item in that list is
759
+ # - either a pair [verb, svp]
760
+ # of the node of a verb with separable prefix
761
+ # and the node of its separate prefix
762
+ # - or a singleton [verb]
763
+ # of the node of a verb without separate prefix
764
+ def SynInterpreter.verbs(sent)
765
+
766
+ return sent.syn_nodes.select { |node|
767
+ eval(self.name()).category(node) == "verb"
768
+ }.map { |node|
769
+ [node]
770
+ }
771
+ end
772
+
773
+ ###
774
+ # governing verbs
775
+ #
776
+ # returns a list of pairs [rel, verb_node]
777
+ # such that the given node fills the grammatical function rel
778
+ # for this verb_node
779
+ # or an empty list if there is no such verb
780
+ def SynInterpreter.governing_verbs(node,
781
+ sent)
782
+ unless node.kind_of? SynNode
783
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
784
+ return nil
785
+ end
786
+
787
+ retv = Array.new
788
+
789
+ # each verb of the sentence:
790
+ eval(self.name()).verbs(sent).each { |verb_node, prefix_node|
791
+ # each gf of this verb:
792
+ eval(self.name()).gfs(verb_node, sent).each { |rel, other_node|
793
+ # if it points to the given node, record
794
+ if other_node == node or
795
+ eval(self.name()).informative_content_node(other_node) == node
796
+ retv << [rel, verb_node]
797
+ break
798
+ end
799
+ }
800
+ }
801
+
802
+ return retv
803
+ end
804
+
805
+ ###
806
+ # path_between
807
+ #
808
+ # construct path in syntactic structure between two nodes,
809
+ # using
810
+ # - node labels
811
+ # - edge labels
812
+ # - direction Up, Down
813
+ #
814
+ # use_nontree_edges: set to true to use coreference edges
815
+ # and other non-tree edges returned by the parser
816
+ # in path computation. (Will produce no change if the parser
817
+ # does not produce any non-tree edges.)
818
+ #
819
+ # returns: Path object
820
+ def SynInterpreter.path_between(from_node, # SynNode
821
+ to_node, # SynNode
822
+ use_nontree_edges = false) # boolean
823
+
824
+ unless from_node.kind_of? SynNode and to_node.kind_of? SynNode
825
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
826
+ return nil
827
+ end
828
+
829
+ path = eval(self.name()).search_up(from_node,to_node, nil)
830
+
831
+ if path.nil?
832
+ # no path found
833
+ # STDERR.puts "Warning: no path found between #{to_node.id} and #{from_node.id}"
834
+ end
835
+
836
+ return path
837
+ end
838
+
839
+ ###
840
+ # surrounding_nodes:
841
+ #
842
+ # construct paths in syntactic structure between a node and each of its neighbors
843
+ # path construction as in path_between.
844
+ # Neighbors: parent, child, plus potentially neighbors by nontree edges
845
+ # use_nontree_edges: again, same as in path_between
846
+ #
847
+ # returns: list of pairs [neighbor(SynNode), path(Path)]
848
+ def SynInterpreter.surrounding_nodes(node, # SynNode
849
+ use_nontree_edges = false) # boolean
850
+
851
+ unless node.kind_of? SynNode
852
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
853
+ return nil
854
+ end
855
+
856
+ retv = Array.new
857
+
858
+ # parent
859
+ if (p = node.parent)
860
+ retv << [
861
+ p,
862
+ Path.new(node).add_last_step("U", node.parent_label(),
863
+ eval(self.name()).simplified_pt(p), p)
864
+ ]
865
+ end
866
+
867
+ # children
868
+ node.each_child_with_edgelabel { |label, c|
869
+ retv << [
870
+ c,
871
+ Path.new(node).add_last_step("D", label,
872
+ eval(self.name()).simplified_pt(c), c)
873
+ ]
874
+ }
875
+
876
+ return retv
877
+ end
878
+
879
+ ###
880
+ # relative_position
881
+ # of a node with respect to an (anchor) node:
882
+ # left, right, dom
883
+ def SynInterpreter.relative_position(node, # SynNode
884
+ anchor_node) # SynNode
885
+
886
+ unless node.kind_of? SynNode and anchor_node.kind_of? SynNode
887
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
888
+ return nil
889
+ end
890
+
891
+ # compute up to a root node
892
+ root = node
893
+ while (p = root.parent())
894
+ root = p
895
+ end
896
+
897
+ # determine position of {leftmost, rightmost} terminal of
898
+ # {node, anchor_node} in the list of all terminals
899
+ all_yieldnodes = root.yield_nodes_ordered()
900
+
901
+ pos_nodefirst = all_yieldnodes.index(eval(self.name()).leftmost_terminal(node))
902
+ pos_anchorfirst = all_yieldnodes.index(eval(self.name()).leftmost_terminal(anchor_node))
903
+ pos_nodelast = all_yieldnodes.index(eval(self.name()).rightmost_terminal(node))
904
+ pos_anchorlast = all_yieldnodes.index(eval(self.name()).rightmost_terminal(anchor_node))
905
+
906
+ # determine relative position
907
+ if pos_nodefirst and pos_anchorfirst and pos_nodefirst < pos_anchorfirst
908
+ return "LEFT"
909
+ elsif pos_nodelast and pos_anchorlast and pos_anchorlast < pos_nodelast
910
+ return "RIGHT"
911
+ else
912
+ return "DOM"
913
+ end
914
+ end
915
+
916
+ ###
917
+ # leftmost_terminal
918
+ #
919
+ # given a constituent, determine its leftmost terminal,
920
+ # excluding punctuation
921
+ def SynInterpreter.leftmost_terminal(node)
922
+ leftmost = node.yield_nodes_ordered().detect {|n| eval(self.name()).category(n) != "pun"}
923
+ unless leftmost
924
+ leftmost = node.yield_nodes_ordered().first
925
+ end
926
+ return leftmost
927
+ end
928
+
929
+ ###
930
+ # rightmost_terminal
931
+ #
932
+ # given a constituent, determine its rightmost terminal,
933
+ # excluding punctuation
934
+ def SynInterpreter.rightmost_terminal(node)
935
+ rightmost = node.yield_nodes_ordered().reverse.detect {|n| eval(self.name()).category(n) != "pun"}
936
+ unless rightmost
937
+ rightmost = node.yield_nodes_ordered().last
938
+ end
939
+ return rightmost
940
+ end
941
+
942
+ ###
943
+ # preposition
944
+ #
945
+ # if the given node represents a PP, return the preposition
946
+ #
947
+ # default: assume that either the PP node will have the preposition as its lemma,
948
+ # or that the head terminal of the PP will be the preposition
949
+ def SynInterpreter.preposition(node)
950
+ unless node.kind_of? SynNode
951
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
952
+ return nil
953
+ end
954
+
955
+ # preposition as lemma of this node?
956
+ if eval(self.name()).category(node) == "prep" and
957
+ (lemma = eval(self.name()).lemma_backoff(node)) and
958
+ not(lemma.empty?)
959
+ return lemma
960
+ end
961
+
962
+ # head terminal is preposition and has a lemma?
963
+ hl = eval(self.name()).head_terminal(node)
964
+ if hl and
965
+ eval(self.name()).category(hl) == "prep" and
966
+ (lemma = eval(self.name()).lemma_backoff(hl)) and
967
+ not(lemma.empty?)
968
+ return lemma
969
+ end
970
+
971
+ # no luck
972
+ return nil
973
+ end
974
+
975
+
976
+ ###
977
+ # main node of expression
978
+ #
979
+ # returns: SynNode, main node, if found
980
+ # else nil
981
+ def SynInterpreter.main_node_of_expr(nodelist,
982
+ no_mwes = nil) # non-nil: don't handle multiword expressions beyond verbs with separate particles
983
+
984
+ # map nodes to terminals
985
+ nodelist1 = nodelist.map { |n| n.yield_nodes() }.flatten
986
+
987
+ # single node? return it
988
+ if nodelist1.length == 1
989
+ return nodelist1.first
990
+ end
991
+
992
+ # more than one word
993
+
994
+ # see if we can get a headword of a single constituent
995
+ if nodelist.length() == 1 and
996
+ (headword = eval(self.name()).head_terminal(nodelist.first()))
997
+ return headword
998
+ end
999
+
1000
+ # filter out auxiliaries and modals, see if only one node remains
1001
+ nodelist2 = nodelist1.reject { |t|
1002
+ eval(self.name()).auxiliary?(t) or
1003
+ eval(self.name()).modal?(t)
1004
+ }
1005
+
1006
+ # one verb, one prep or particle? then
1007
+ # assume we have a separate verb prefix, and take the lemma of the verb
1008
+ if nodelist2.length == 2
1009
+ verbs = nodelist2.select { |t| eval(self.name()).category(t) == "verb"}
1010
+ if verbs.length() == 1
1011
+ # found exactly one verb, so we have one verb, one other
1012
+ if eval(self.name()).particle_of_verb(verbs.first, nodelist2)
1013
+ # we have found a particle/separate verb prefix
1014
+ # take verb as main node
1015
+ return verbs.first
1016
+ end
1017
+ end
1018
+ end
1019
+
1020
+ if no_mwes
1021
+ # I was told only to look for separate verb particles,
1022
+ # not for anything else, so return nil at this point
1023
+ return nil
1024
+ end
1025
+
1026
+ # filtered out everything? oops -- return to previous node list
1027
+ if nodelist2.empty?
1028
+ nodelist2 = nodelist1
1029
+ end
1030
+
1031
+ # if the nodelist describes an mwe, try to find its headword:
1032
+ # look for the lowest common ancestor of all nodes in nodelist2
1033
+ # if its head terminal is in nodelist2, return that
1034
+ lca = nodelist2.first
1035
+ lca_found = false
1036
+ while lca and not(lca_found)
1037
+ yn = lca.yield_nodes()
1038
+ # lca's yield nodes include all nodes in nodelist2?
1039
+ # then lca is indeed the lowest common ancestor
1040
+ if nodelist2.big_and { |t| yn.include? t }
1041
+ lca_found = true
1042
+ else
1043
+ lca = lca.parent()
1044
+ end
1045
+ end
1046
+ # nodelist2 includes lca's head terminal? then return that
1047
+ if lca_found and
1048
+ (h = eval(self.name()).head_terminal(lca)) and
1049
+ nodelist2.include? h
1050
+ return h
1051
+ end
1052
+
1053
+
1054
+ # try first verb, then first noun, then first adjective
1055
+ ["verb", "noun", "adj"].each { |cat|
1056
+ nodelist.each { |t|
1057
+ if eval(self.name()).category(t) == cat
1058
+ return t
1059
+ end
1060
+ }
1061
+ }
1062
+
1063
+ # return first node
1064
+ return nodelist.first
1065
+ end
1066
+
1067
+ ########
1068
+ # max constituents:
1069
+ # given a set of nodes, compute the maximal constituents
1070
+ # that exactly cover them
1071
+ #
1072
+ # If include_single_missing_children is set to true,
1073
+ # then a node that has at least one child whose yield is in nodelist,
1074
+ # and has only one child whose yield is not in nodelist,
1075
+ # will be considered as having its yield in nodelist.
1076
+ #
1077
+ # Optionally, a procedure accept_anyway_proc can be given.
1078
+ # Like the option include_single_missing_children, it can lead to nodes being
1079
+ # included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYNAAYNN)
1080
+ # even though not all of their yield nodes are yield nodes of the node_list.
1081
+ # accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
1082
+ # The procedure is called with three arguments:
1083
+ # accept_anyway_proc(node, ch_in, ch_out)
1084
+ # node is a SynNode that would not normally be in NYAAYNN.
1085
+ # ch_in is the list of its children that are in NYAAYNN.
1086
+ # ch_out is the list of its children that are not.
1087
+ # If the procedure exists and returns true, node is put into NYAAYNN.
1088
+ #
1089
+ #
1090
+ # default: use the SalsaTigerSentence method for this
1091
+ def SynInterpreter.max_constituents(nodeset, # Array:SynNode
1092
+ sent, # SalsaTigerSentence
1093
+ idealize_maxconst = false, # boolean
1094
+ accept_anyway_proc = nil) # procedure
1095
+
1096
+ if idealize_maxconst
1097
+ return sent.max_constituents_smc(nodeset, idealize_maxconst,
1098
+ false, # do not ignore empty terminals
1099
+ accept_anyway_proc)
1100
+ else
1101
+ return sent.max_constituents_for_nodes(nodeset)
1102
+ end
1103
+ end
1104
+
1105
+ ########
1106
+ # prune?
1107
+ # given a target node t and another node n of the syntactic structure,
1108
+ # decide whether n is likely to instantiate a semantic role
1109
+ # of t. If not, recommend n for pruning.
1110
+ #
1111
+ # This method is supposed to implement a method similar
1112
+ # to the one proposed by Xue and Palmer (EMNLP 2004).
1113
+ #
1114
+ # returns: true to recommend n for pruning, else false
1115
+ #
1116
+ # Since the implementation is highly parser-specific,
1117
+ # all that we can do in the default method is
1118
+ # always to return false.
1119
+ def SynInterpreter.prune?(node, # SynNode
1120
+ paths_to_target, # hash: node ID -> Path object: paths from nodes to target
1121
+ terminal_index) # hash: terminal node -> word index in sentence
1122
+
1123
+ unless node.kind_of? SynNode
1124
+ $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
1125
+ return nil
1126
+ end
1127
+
1128
+ return false
1129
+ end
1130
+
1131
+
1132
+ ####################3
1133
+ protected
1134
+
1135
+ def SynInterpreter.announce_me()
1136
+ if defined?(SynInterfaces)
1137
+ # yup, we have a class to which we can announce ourselves
1138
+ SynInterfaces.add_interpreter(eval(self.name()))
1139
+ else
1140
+ # no interface collector class
1141
+ $stderr.puts "Interface #{self.name()} not announced: no SynInterfaces."
1142
+ end
1143
+ end
1144
+
1145
+ ####################3
1146
+ private
1147
+
1148
+ ###
1149
+ # search upward:
1150
+ # look for path from from_node to to_node
1151
+ # already_covered is either nil or
1152
+ # a node whose subtree we have already searched
1153
+ def SynInterpreter.search_up(from_node, # SynNode
1154
+ to_node, # SynNode
1155
+ already_covered) # SynNode
1156
+ # returns: a Path object leading from from_node to to_node,
1157
+ # or nil if no path could be found
1158
+ # (the lca and the down-part can be read off the returned Path object)
1159
+
1160
+ path = eval(self.name()).search_down(from_node,to_node, already_covered)
1161
+
1162
+ if path.nil?
1163
+ # search down unsuccessful
1164
+
1165
+ parent = from_node.parent
1166
+ edgelabel = from_node.parent_label
1167
+ # puts "Going up from "+from_node.id.to_s+" to "+parent.id.to_s
1168
+
1169
+ if parent.nil?
1170
+ # no path found
1171
+ return nil
1172
+
1173
+ else
1174
+ # search up
1175
+ path = eval(self.name()).search_up(parent,to_node, from_node)
1176
+
1177
+ if path.nil?
1178
+ # no path found
1179
+ return nil
1180
+
1181
+ else
1182
+ # search up was successful
1183
+ parent_pt = eval(self.name()).simplified_pt(parent)
1184
+ path.add_first_step(from_node, "U", edgelabel, parent_pt)
1185
+ return path
1186
+ end
1187
+ end
1188
+
1189
+ else
1190
+ # search down successful
1191
+ return path
1192
+ end
1193
+ end
1194
+
1195
+ ###
1196
+ # search in tree
1197
+ def SynInterpreter.search_down(from_node, # SynNode
1198
+ to_node, # SynNode
1199
+ already_explored) # SynNode
1200
+
1201
+ if from_node == to_node
1202
+ return Path.new(from_node)
1203
+
1204
+ else
1205
+
1206
+ from_node.children.each {|c|
1207
+
1208
+ if c == already_explored
1209
+ # we have done this subtree,
1210
+ # don't do it again
1211
+ next
1212
+ end
1213
+
1214
+ path = eval(self.name()).search_down(c, to_node, already_explored)
1215
+
1216
+ unless path.nil?
1217
+ c_pt = eval(self.name()).simplified_pt(c)
1218
+ path.add_first_step(from_node, "D", c.parent_label(), c_pt)
1219
+ return path
1220
+ end
1221
+ }
1222
+
1223
+ # no path found for any of the children
1224
+ return nil
1225
+ end
1226
+ end
1227
+ end
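
To make the plug-in contract of these abstract classes concrete, here is a minimal sketch of a tab-format preprocessor written against the API above. The class name, system string, and behaviour are invented for illustration; real implementations such as TreetaggerInterface or TntInterface live in the files listed further up, and the require assumes the gem's lib directory is on the load path.

require "frprep/AbstractSynInterface"

# Hypothetical lemmatizer wrapper built on SynInterfaceTab.
class ToyLemmatizerInterface < SynInterfaceTab
  ToyLemmatizerInterface.announce_me  # register with SynInterfaces, if present

  def ToyLemmatizerInterface.system
    "toylemmatizer"
  end

  def ToyLemmatizerInterface.service
    "lemmatizer"
  end

  # read FNTabFormat input and write one token per line to outfilename;
  # a real interface would pipe this through the external tool afterwards
  def process_file(infilename, outfilename)
    File.open(outfilename, "w") do |out|
      SynInterfaceTab.fntab_words_to_file(infilename, out)
    end
  end
end
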