shalmaneser 0.0.1.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +284 -0
@@ -0,0 +1,215 @@
1
+ # wrapper script for the OpenNLP Maxent classifier
2
+
3
+ # sp July 2007
4
+
5
+
6
+ require "tempfile"
7
+ require 'fileutils'
8
+
9
# Wrapper around the command-line OpenNLP Maxent classifier.
#
# Training and classification are done by running the Java classes
# Train and Classify from the interface directory. Models are always
# stored under a file name containing "Model.bin.gz", because the
# OpenNLP MaxEnt system chooses the storage format based on the file name.
class Maxent

  ###
  # program_path: string, location of the maxent system itself
  # parameters:   array; its first element is the location of the interface
  #               (the directory the Java commands are run from)
  #
  # Prints an error message and exits the process if parameters is empty.
  def initialize(program_path,parameters)

    if parameters.empty?
      puts "Error: The OpenNLP maxent system needs two paths (first the location of maxent itself and then the location of the interface, usually program/tools/maxent)."
      puts "I got only the program path."
      Kernel.exit
    end

    @maxentpath = program_path
    @interface_path = parameters.first
    unless @maxentpath =~ /\/$/
      @maxentpath = @maxentpath + "/"
    end

    # classpath for maxent
    @cp = "#{ENV["CLASSPATH"]}:#{@maxentpath}:#{@maxentpath}lib:#{@maxentpath}lib/trove.jar:#{@maxentpath}output/maxent-2.4.0.jar"
  end

  ###
  # Train a classifier on the C4.5-style (comma-separated) file infilename.
  #
  # classifier_file: where to store the model; if nil, the model is stored
  # next to the temporary training file.
  # The model location is remembered for later calls to write() and apply().
  #
  # returns: true on success, false if the training command failed
  def train(infilename,classifier_file)
    trainfile = Tempfile.new(File.basename(infilename)+".maxenttrain")
    # block form: the input file is closed even if the conversion raises
    File.open(infilename) { |infile|
      c45_to_maxent(infile,trainfile)
    }
    trainfile.close

    if classifier_file
      @classifier_location = classifier_file
    else
      @classifier_location = trainfile.path+"Model.bin.gz"
    end

    @classifier_location = enforce_compact_storage(@classifier_location)

    # store model in binary, gzipped form...
    command = ["cd #{@interface_path}; ",
               "java -cp #{@cp} -Xmx1000m Train",
               trainfile.path,
               @classifier_location].join(" ")

    unless successfully_run(command)
      return false
    end

    # success: the converted training data is not needed any more
    trainfile.close(true)
    return true
  end

  ###
  # Copy the current classifier (from train() or read()) to classifier_file.
  #
  # returns: nil (after printing an error) if there is no current classifier
  def write(classifier_file)

    classifier_file = enforce_compact_storage(classifier_file)

    if @classifier_location
      @classifier_location = enforce_compact_storage(@classifier_location)
      # NOTE(review): both paths are passed to the shell unquoted
      %x{cp #{@classifier_location} #{classifier_file}} # store classifier
    else
      $stderr.puts "Maxent error: no classifier has been trained or read, cannot write #{classifier_file}."
      return nil
    end
  end

  ###
  # returns: true iff the (compact-storage) classifier file exists
  def exists?(classifier_file)
    classifier_file = enforce_compact_storage(classifier_file)
    return File.exist?(classifier_file)
  end

  ###
  # Remember classifier_file as the current classifier.
  #
  # return true iff reading the classifier has had success
  def read(classifier_file)

    classifier_file = enforce_compact_storage(classifier_file)

    if exists?(classifier_file)
      @classifier_location = classifier_file
      return true
    else
      $stderr.puts "No classifier file "+classifier_file
      return false
    end
  end

  ###
  # Classify the C4.5-style instances in infilename with the current
  # classifier, writing the classifier output to outfilename.
  #
  # returns: true on success, false if there is no current classifier,
  # if the classification command failed, or if no output file exists
  def apply(infilename,outfilename)

    # check before normalizing the name: there may be no classifier at all
    unless @classifier_location
      return false
    end
    @classifier_location = enforce_compact_storage(@classifier_location)

    testfile = Tempfile.new(File.basename(infilename)+".maxenttrain")
    File.open(infilename) { |infile|
      c45_to_maxent(infile,testfile)
    }
    testfile.close

    command = ["cd #{@interface_path}; ",
               "java -cp #{@cp} -Xmx1000m Classify ",
               testfile.path,
               @classifier_location,
               ">",
               outfilename].join(" ")

    # classify
    unless successfully_run(command)
      return false
    end

    # some error in classification
    unless File.exist?(outfilename)
      return false
    end

    # no errors = success
    testfile.close(true)
    return true
  end

  #####
  # format of Maxent result file:
  # <best label>[<confidence>] <secondbest_label>[<confidence>] ....
  #
  # returns a list of instance_results
  # where an instance_result is a list of pairs [label, confidence]
  # where the pairs are sorted by confidence (most confident first),
  # or nil (after printing an error) if the file cannot be read
  def read_resultfile(filename)
    begin
      lines = File.readlines(filename)
    rescue
      $stderr.puts "Maxent error: cannot read Maxent result file #{filename}."
      return nil
    end

    retv = Array.new()

    lines.each { |line|
      line_results = Array.new()

      line.split().each { |piece| # split at whitespace
        # a piece that does not have the form label[confidence]
        # is recorded as [nil, 0.0]
        md = /(\S+)\[(.+)\]/.match(piece)
        label = md ? md[1] : nil
        confidence = (md ? md[2] : nil).to_f

        line_results << [label, confidence]
      }

      # sort: most confident label first
      retv << line_results.sort {|a,b| b[1] <=> a[1]}
    }
    return retv
  end


  ###################################
  private

  ###
  # produce input file for maxent learner: make attribute-value pairs
  # where attribute == featureX=
  #
  # Input lines are comma-separated with the label last; a trailing "."
  # on the label is removed. Empty lines are skipped.
  def c45_to_maxent(inpipe,outpipe)
    while (line = inpipe.gets)
      line.chomp!
      la = line.split(",")
      # nothing on this line: no label to pop
      next if la.empty?

      label = la.pop
      if label[-1,1] == "."
        label.chop!
      end
      la.each_index {|i|
        la[i] = i.to_s() + "=" + la[i]
      }
      la.push(label)
      outpipe.puts la.join(" ")
    end
  end

  # since the OpenNLP MaxEnt system determines storage based on filename,
  # make sure that all models are stored internally as binary, gzipped files.
  def enforce_compact_storage(filename)
    if filename =~ /Model.bin.gz/
      return filename
    else
      return filename+"Model.bin.gz"
    end
  end

  ###
  # Run command via Kernel.system. On failure, report the command on
  # stderr but do not abort.
  #
  # returns: the result of Kernel.system
  def successfully_run(command)
    retv = Kernel.system(command)
    unless retv
      $stderr.puts "Error running classifier. Continuing."
      $stderr.puts "Offending command: "+command
    end
    return retv
  end
end
@@ -0,0 +1,1388 @@
1
+ ####
2
+ # KE Nov 2005
3
+ #
4
+ # Interface for use of the Minipar parser:
5
+ # parsing with Salsa/Tiger XML output format,
6
+ # class for interpreting the Salsa/Tiger XML data structures
7
+
8
+ require "tempfile"
9
+ require "common/TabFormat"
10
+ require "common/SalsaTigerRegXML"
11
+ require "common/SalsaTigerXMLHelper"
12
+
13
+ require "common/AbstractSynInterface"
14
+
15
+ #########################################
16
+ # MiniparSentence class
17
+ #
18
+ # analyze one minipar output sentence,
19
+ # provide access
20
+ #
21
+ # hash representation of a node:
22
+ # keys are
23
+ # index, word , lemma, pos, parent_index, edgelabel, governing_lemma, antecedent_index
24
+ #
25
+ # other access: as SalsaTigerSentence object
26
# One sentence of minipar output.
#
# Each minipar output line is turned into a node hash with (a subset of)
# the keys
#   index, word, lemma, pos, parent_index, edgelabel,
#   governing_lemma, antecedent_index
# The sentence can also be converted to a SalsaTigerSentence object.
class MiniparSentence

  ########
  # sentence: array of strings, one minipar node per string.
  # Raises a RuntimeError if a line cannot be parsed.
  def initialize(sentence) # array:string, one minipar node per string
    @nodes = Array.new

    sentence.each { |line_string|
      @nodes << analyze_line(line_string)
    }
    # sort nodes by line index -- sometimes nodes with lower index are mentioned later in the sentence
    @nodes.sort! { |a, b| a["index"].to_i <=> b["index"].to_i }

    @tabsent = nil
    # nodehash_mapping: hash tabindex -> array:nodehashes
    @nodehash_mapping = nil
  end

  #####
  # returns: frozen copy of the array of node hashes, sorted by index
  def nodes()
    return @nodes.clone.freeze()
  end

  #####
  # stxml:
  #
  # make SalsaTigerSentence object from this sentence,
  # one node per minipar node.
  # if it is a nonterminal, duplicate it as a terminal
  #
  # return: pair [SalsaTigerSentence, mapping]:
  # if we have a tab sent, mapping is a mapping from tab word indices to SynNode objects
  # of the minipar sentence representation; otherwise mapping is nil
  def stxml(sentence_id)
    return salsatigerxml_output(sentence_id)
  end

  #####
  # set tabsent:
  # set this tab format sentence, which has entries "word", "lineno",
  # as the sentence matching this minipar output sentence.
  #
  # On success, remember the tab sentence as well as the mapping
  # between fntab sentence indices and minipar node hash indices
  #
  # returns true on success
  # or false if matching failed (this includes the case that the
  # minipar sentence is empty or that none of its nodes has a word)
  def set_tabsent(tabsent, # TabFileFormat object
                  sloppy = true) # not nil or false: allow sloppy match

    # empty minipar sentence? then no match
    if @nodes.empty?
      return false
    end

    # tabwords: array:string
    tabwords = Array.new
    tabsent.each_line_parsed { |l| tabwords << l.get("word") }

    # main data structure: a chart of partial mappings fn_index -> minipar_index
    # represented as an array of partial mappings
    # each partial mapping is an array of triples [fn_index, min_index, "full"|"partial"]

    # enter data for 1st minipar node that has a word into the chart
    first_node_no = @nodes.index { |node| not(node["word"].nil?) }
    unless first_node_no
      # no minipar node has a word: nothing to match
      return false
    end

    old_chart = fnw_minw_match(tabwords, @nodes[first_node_no]["word"]).map { |fnw_index, match_how|
      [[fnw_index, first_node_no, match_how]]
    }

    if old_chart.empty?
      # unmatched single word in minipar sentence
      return false
    end

    # enter data for the rest of the minipar nodes into the chart
    (first_node_no + 1).upto(@nodes.length - 1) { |node_no|
      unless @nodes[node_no]["word"]
        # minipar node with empty word, skip
        next
      end
      new_chart = Array.new

      # each partial mapping found up to now:
      # try to extend it, record results in new_chart
      old_chart.each { |partial_mapping|
        prev_fnw_index, prev_mw_index, match_how = partial_mapping.last

        # where do we start looking in tabwords? same word as before, or advance one?
        case match_how
        when "full"
          fnw_index = prev_fnw_index + 1
        when "partial"
          fnw_index = prev_fnw_index
        else
          raise "Shouldn't be here"
        end

        # match offsets are relative to the slice, hence fnw_index + match_offset
        fnw_minw_match(tabwords[fnw_index..tabwords.length()-1],
                       @nodes[node_no]["word"]).each { |match_offset, new_match_how|
          new_chart.push partial_mapping + [[fnw_index + match_offset, node_no, new_match_how]]
        }
      }

      if new_chart.empty?
        # no partial mappings found that would work up to this minipar node:
        # matching failed
        return false
      end

      old_chart = new_chart
    }

    # filter chart: if some fntab sent words are only matched partially, discard
    if sloppy
      chart = old_chart
    else
      chart = old_chart.select { |mapping|

        mapping_ok = true
        tabwords.each_with_index { |fnw, fnw_index|

          tuples = mapping.select { |other_fnw_index, mnode_no, how| other_fnw_index == fnw_index }

          unless tuples.empty?
            # the minipar words mapped to this tab word, concatenated,
            # have to yield exactly the tab word
            word = tuples.map { |ix, mnode_no, how| @nodes[mnode_no]["word"] }.join()

            unless word == fnw
              mapping_ok = false
              break
            end
          end
        }
        mapping_ok
      }
    end

    if chart.empty?
      return false
    end
    # if more than one mapping has been found, the first one is used

    # success: found mapping
    # nodehash_mapping: hash tab sentence word index -> array: node hashes
    @tabsent = tabsent
    @nodehash_mapping = Hash.new
    chart.first.each { |tabindex, mindex, match_how|
      unless @nodehash_mapping[tabindex]
        @nodehash_mapping[tabindex] = Array.new
      end
      @nodehash_mapping[tabindex] << @nodes[mindex]
    }
    return true
  end

  # nodehash_mapping: hash tabindex -> array:nodehashes
  # (frozen copy), or nil if no tab sentence has been set successfully
  def nodehash_mapping()
    if @nodehash_mapping
      return @nodehash_mapping.clone.freeze()
    else
      return nil
    end
  end


  ################################################
  ################################################
  private

  ###########
  # analyze one line of the sentence array.
  #
  # line format: index, a tab, then a parenthesized, tab-separated
  # node description:
  #   word, "lemma POS", parent index, edge label, (gov ...), (antecedent ...)
  #
  # examples of possible entries:
  # E1 (() fin C E4 )
  # 3 (them ~ N 2 obj (gov call))
  # E5 (() they N 2 subj (gov call) (antecedent 1))
  #
  # returns: node hash
  # raises a RuntimeError if the line does not have the expected format
  def analyze_line(line)
    retv = Hash.new()

    unless line =~ /^(\w+)\t\((.+)\)\s*$/
      raise "Cannot parse line: #{line}"
    end

    # line structure:
    # index ( node descr )
    retv["index"] = $1

    descr = $2
    word, lemma_pos, parentindex, edgelabel, governor, antecedent = descr.split("\t")

    # word
    if word
      if word =~ /^['"](.+)['"]$/
        # quoted? remove quotes
        word = $1
      end
      unless word == "()"
        retv["word"] = word
      end
    end

    # lemma, POS
    if lemma_pos
      lemma_pos.strip!
      if lemma_pos == "U"
        # neither lemma nor POS for this node
      else
        # we have both lemma and POS

        if lemma_pos =~ /^(.+)\s(.+)$/
          # lemma may be "...." with spaces in.
          # this regexp. uses the last space to separate lemma and POS
          retv["lemma"] = $1
          retv["pos"] = $2

          if retv["lemma"] =~ /^"(.+)"$/
            # remove quotes around lemma
            retv["lemma"] = $1

          elsif retv["lemma"] == "~"
            # lemma same as word
            retv["lemma"] = retv["word"]
          end
        elsif lemma_pos.strip().split().length() == 1
          # only pos given
          retv["pos"] = lemma_pos.strip()
        else
          $stderr.puts "cannot parse lemma_pos pair " + lemma_pos
        end
      end
    end

    # parent index
    if parentindex.nil? or parentindex == "*"
      # root
    else
      retv["parent_index"] = parentindex
    end

    # edge label
    if edgelabel.nil? or edgelabel.strip.empty?
      # no edge label given
    else
      retv["edgelabel"] = edgelabel
    end

    # governing word
    if governor and not(governor.strip.empty?)
      # expected format:
      # (gov <governing_lemma>)
      if governor =~ /^\(gov\s(.+)\)$/
        retv["governing_lemma"] = $1
      elsif governor == "(gov )"
        # okay, no governor given
      else
        $stderr.puts "cannot parse governor "+ governor
      end
    end

    # antecedent
    if antecedent and not(antecedent.strip.empty?)
      # expected format:
      # (antecedent <index>)
      if antecedent =~ /^\(antecedent\s(.+)\)$/
        retv["antecedent_index"] = $1
      else
        $stderr.puts "cannot parse antecedent "+ antecedent
      end
    end

    return retv
  end

  ###########
  # returns: pair [SalsaTigerSentence object describing this minipar parse,
  #                mapping as computed by construct_tabsent_mapping_stxml]
  def salsatigerxml_output(sentence_id)

    # start sentence object
    sent_obj = SalsaTigerSentence.empty_sentence(sentence_id)

    # determine children of each node
    # so we'll know which nodes to make terminal and which to make nonterminal
    i_have_children = Hash.new
    @nodes.each { |node|
      if (parent_ix = node["parent_index"])
        # node has parent. record the parent as having children
        i_have_children[parent_ix] = true
      end
    }

    # make SynNode objects for each minipar node
    # minipar terminal: one SynNode terminal
    # minipar nonterminal: one SynNode nonterminal, plus one SynNode terminal
    # duplicating the word, lemma and POS info
    # to keep with the SalsaTigerSentence assumptions that
    # the sentence can be read off from the terminals
    index_to_synnode = Hash.new
    @nodes.each { |minipar_node|
      node_id = minipar_node["index"]
      if minipar_node["word"]
        word = SalsaTigerXMLHelper.escape(minipar_node["word"])
      elsif not(i_have_children[minipar_node["index"]])
        # node without word and children: probably has an antecedent
        # add an empty word so the Salsa tool can represent the node with the antecedent
        word = ""
      else
        word = nil
      end

      if word
        # make a terminal SynNode for this minipar node
        # only if it has a word, otherwise it's not much use as a terminal
        t_node = sent_obj.add_syn("t",
                                  nil, # category
                                  word, # word
                                  minipar_node["pos"], # POS
                                  node_id) # node ID
        if minipar_node["lemma"]
          t_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
        end

        # remember this node
        index_to_synnode[minipar_node["index"]] = t_node
      else
        t_node = nil
      end

      if i_have_children[minipar_node["index"]] or not(word)
        # does this minipar node have children, or
        # does it lack a word? then add a (second) nonterminal SynNode for it
        node_id = node_id + "nt"
        nt_node = sent_obj.add_syn("nt",
                                   minipar_node["pos"], # category
                                   word, # word
                                   minipar_node["pos"], # POS
                                   node_id) # node ID
        if minipar_node["lemma"]
          nt_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
        end

        # link t node to nt node
        if t_node
          nt_node.add_child(t_node, "Head")
          t_node.add_parent(nt_node, "Head")
        end

        # just terminal node: remember it
        # both terminal and nonterminal:remember just the nonterminal
        index_to_synnode[minipar_node["index"]] = nt_node
      end

    }

    # link SynNodes
    @nodes.each { |minipar_node|
      # find my syn node
      my_synnode = index_to_synnode[minipar_node["index"]]
      unless my_synnode
        raise "Error: no syn node constructed for index #{minipar_node["index"]} in sentence #{sentence_id}"
      end

      # link to parent syn node
      if (parent_ix = minipar_node["parent_index"])
        parent_synnode = index_to_synnode[parent_ix]
        unless parent_synnode
          raise "Error: no syn node constructed for parent index #{parent_ix} in sentence #{sentence_id}"
        end

        parent_synnode.add_child(my_synnode, minipar_node["edgelabel"])
        my_synnode.add_parent(parent_synnode, minipar_node["edgelabel"])
      end

      # remember antecedent: both the node itself and its index, the latter as an attribute
      # this way, we have
      # - easy access to the antecedent via the node itself
      # - a record of the antecedent in the SalsaTigerXML output
      if (antecedent_ix = minipar_node["antecedent_index"])
        antecedent_synnode = index_to_synnode[antecedent_ix]
        unless antecedent_synnode
          raise "Error: no syn node constructed for antecedent index #{antecedent_ix} in sentence #{sentence_id}"
        end

        my_synnode.set_f("antecedent", antecedent_synnode)
        my_synnode.set_attribute("antecedent", antecedent_synnode.id())
      end
    }

    return [sent_obj, construct_tabsent_mapping_stxml(sent_obj)]
  end

  ###########
  # construct mapping fntab line -> array of SynNodes
  # and add fntab words not present in minipar as children of the
  # SalsaTigerSentence object's root
  #
  # returns: the mapping, or nil if no tab sentence has been set
  def construct_tabsent_mapping_stxml(sent)
    unless @tabsent
      return nil
    end

    retv = Hash.new
    prev_minipar_index = nil

    @tabsent.each_line_parsed { |tabline|
      retv[tabline.get("lineno")] = Array.new

      # nodehash_mapping: hash tabsent lineno -> array: member of @nodes
      if (nodehashes = @nodehash_mapping[tabline.get("lineno")])
        nodehashes.each { |nodehash|
          prev_minipar_index = nodehash["index"]

          # this tabsent word has a corresponding minipar node
          # enter it in tabsent_mapping
          if (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"]))
            # terminal matching this fntab word
            retv[tabline.get("lineno")] << node
          elsif (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"] + "nt"))
            # we have a nonterminal matching this fntab word
            retv[tabline.get("lineno")] << node
          else
            # no match after all?
            raise "missing: SalsaTigerSentence node for minipar node with index #{nodehash["index"]}"
          end
        }

      else
        # this tabsent word has no corresponding minipar node yet
        # make one. See to it that it occurs in the right spot in sent.terminals_ordered.
        parent = sent.syn_roots.first
        node = sent.add_syn("t", # terminal
                            "", # category
                            tabline.get("word"), # word
                            "", # part of speech
                            (prev_minipar_index.to_i + 1).to_s) # ID
        parent.add_child(node, "-")
        node.add_parent(parent, "-")

        retv[tabline.get("lineno")] = [node]
      end
    }

    return retv
  end

  ######
  # return a list of pairs [fntab word index, match type]
  # with an entry for each fntab word on fnw_list that matches minw,
  # either fnw == minw (match_type "full") or minw part_of fnw (match_type "partial")
  #
  # the index is the position of the word within fnw_list
  def fnw_minw_match(fnw_list, minw)
    retv = Array.new

    fnw_list.each_with_index { |fnw, fnw_index|
      if fnw == minw
        # words identical
        retv << [fnw_index, "full"]
      elsif fnw.index(minw)
        # fn word includes minipar word
        retv << [fnw_index, "partial"]
      end
    }

    return retv
  end
end
508
+
509
+
510
+
511
+ ################################################
512
+ # Interface class
513
+ class MiniparInterface < SynInterfaceSTXML
514
+ MiniparInterface.announce_me()
515
+
516
+ ###
517
# name of the system this interface wraps
def MiniparInterface.system()
  "minipar"
end
520
+
521
+ ###
522
# kind of service this interface provides
def MiniparInterface.service()
  "parser"
end
525
+
526
+ ###
527
+ # initialize to set values for all subsequent processing
528
# Set values for all subsequent processing.
#
# program_path: string, path to system
# insuffix:     string, suffix of tab files
# outsuffix:    string, suffix for parsed files
# stsuffix:     string, suffix for Salsa/TIGER XML files
# var_hash:     optional arguments in a hash; the entries read here are
#               "pos_suffix", "lemma_suffix" and "tab_dir"
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # entries that are not given end up as nil
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash["pos_suffix"], var_hash["lemma_suffix"], var_hash["tab_dir"]
end
541
+
542
+
543
+ ###
544
+ # process one file, writing the result to outfilename
545
+ # input format is FNTabFormat, output format is
546
+ # Minipar format
547
+ #
548
+ # returns: nothing
549
# Process one file, writing the result to outfilename.
#
# The words of each sentence of the input file (FNTabFormat) are written
# to a temporary file, one sentence per line, words separated by a space.
# That file is fed to the program at @program_path on standard input, and
# the program's standard output is redirected to outfilename.
#
# infilename:  string, name of input file
# outfilename: string, name of output file
#
# NOTE(review): @program_path and outfilename are passed to the shell unquoted.
def process_file(infilename, outfilename)
  tf = Tempfile.new("minipar")
  begin
    reader = FNTabFormatFile.new(infilename)
    reader.each_sentence { |sent|
      sent.each_line_parsed { |line|
        tf.print line.get("word"), " "
      }
      tf.puts
    }

    # flush and close before the parser reads the file
    tf.close()
    %x{#{@program_path} < #{tf.path()} > #{outfilename}}
  ensure
    # remove the temporary file right away, also if reading the input failed
    tf.close(true)
  end
end
564
+
565
+ #########3
566
+ # yields tuples
567
+ # [ minipar output sentence, tab sentence, mapping]
568
+ #
569
+ # minipar output sentence is
570
+ # - either an array of hashes, each describing one node;
571
+ # - or a SalsaTigerSentence object
572
+ # - or a MiniparSentence object
573
+ # (which has methods that return the sentence as either a
574
+ # nodehash array or a SalsaTigerSentence)
575
+ #
576
+ # tab sentence: matching tab sentence, if tab file has been given on initialization
577
+ #
578
+ # mapping: hash: line in tab sentence(integer) -> array:SynNode
579
+ # mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
580
+ #
581
+ # If a parse has failed, yields
582
+ # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence, mapping]
583
+ # to allow more detailed accounting for failed parses
584
# Iterate over the sentences of a minipar output file.
#
# parsefilename: name of minipar output file
# format: format to yield the parses in, one of
#   "nodehashes": yields [array of node hashes, tab sentence, nodehash mapping]
#   "stxml":      yields [SalsaTigerSentence, tab sentence, mapping]
#   "objects":    yields [MiniparSentence, tab sentence]
# For a parse without matching tab sentence, the tab sentence is nil.
#
# After all parses, each tab sentence that was not matched by any parse is
# yielded as [failed sentence, tab sentence, standard mapping].
#
# Raises a RuntimeError if no tab directory was given on initialization
# or if the format is unknown.
def each_sentence(parsefilename, format = "stxml")
  # sanity checks
  unless @tab_dir
    raise "Need to set tab directory on initialization"
  end

  # get matching tab file for this parser output file,
  # read its contents
  # (plain concatenation: @tab_dir has to end in a path separator)
  tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
  @tab_sentences = Array.new
  reader = FNTabFormatFile.new(tabfilename)
  reader.each_sentence { |sent_obj| @tab_sentences << sent_obj }

  stream = open_minipar_outfile(parsefilename)

  sentno = 0
  tab_sentno = 0
  matched_tabsent = Hash.new()

  begin
    each_miniparsent_obj(stream) { |parse|

      if (matching_tab_sentno = matching_tabsent(parse, tab_sentno))
        # found matching tab sentence
        tabsent = @tab_sentences[matching_tab_sentno]
        # continue the search after this tab sentence
        tab_sentno = matching_tab_sentno + 1
        matched_tabsent[matching_tab_sentno] = true
      else
        tabsent = nil
      end

      # yield minipar parse in the required format
      case format
      when "nodehashes"
        yield [parse.nodes(), tabsent, parse.nodehash_mapping()]
      when "stxml"
        sent, mapping = parse.stxml(@filename_core + sentno.to_s)
        yield [sent, tabsent, mapping]
      when "objects"
        yield [parse, tabsent]
      else
        raise "Unknown each_sentence format #{format}"
      end

      sentno += 1
    }
  ensure
    # release the file handle / zcat pipe, unless it has been closed already
    stream.close if stream and not(stream.closed?)
  end

  ##
  # each unmatched tab sentence: yield as failed parse object
  @tab_sentences.each_with_index { |tabsent, index|
    unless matched_tabsent[index]
      # spotted an unmatched sentence
      sent = MiniparInterface.failed_sentence(tabsent,tabsent.get_sent_id())
      yield [sent, tabsent, MiniparInterface.standard_mapping(sent, tabsent)]
    end
  }
end
641
+
642
+ ###
643
+ # write Salsa/TIGER XML output to file
644
def to_stxml_file(infilename, # string: name of parse file
                  outfilename) # string: name of output stxml file
  # Writes header, one entry per sentence yielded by each_sentence
  # (default format), and footer to outfilename.
  # The block form of File.open closes the file even when writing or
  # parsing raises half-way through.
  File.open(outfilename, "w") { |outfile|
    outfile.puts SalsaTigerXMLHelper.get_header()
    each_sentence(infilename) { |st_sent, _tabsent|
      outfile.puts st_sent.get()
    }
    outfile.puts SalsaTigerXMLHelper.get_footer()
  }
  nil
end
655
+
656
+
657
+ #####################3
658
+ private
659
+
660
+ ###
661
+ # open minipar outfile
662
+ #
663
+ # return: IO stream for reading minipar outfile
664
def open_minipar_outfile(filename)
  # Opens a minipar output file for reading and remembers its basename
  # (without a ".gz" suffix) in @filename_core.
  #
  # returns: IO stream for reading the file
  # raises:  RuntimeError if a plain (non-gzipped) file cannot be opened

  if filename =~ /\.gz$/
    # zipped: read through an external zcat process.
    # The command is given as an argument array so that no shell is
    # involved: file names containing blanks or shell metacharacters
    # are passed to zcat verbatim.
    @filename_core = File.basename(filename, ".gz")
    return IO.popen(["zcat", filename])
  end

  @filename_core = File.basename(filename)
  begin
    File.new(filename)
  rescue StandardError
    raise "Couldn't read minipar file #{filename}"
  end
end
681
+
682
+ ###
683
+ # each_miniparsent_obj
684
+ # read minipar output from stream,
685
+ # yield sentence-wise as MiniparSentence objects
686
def each_miniparsent_obj(stream) # IO object: stream to read from
  # Reads the stream line by line and yields one MiniparSentence per
  # sentence found. A sentence starts after a line consisting of "(" or
  # "> (" and ends at a line consisting of ")"; both are compared after
  # stripping surrounding whitespace. Lines outside sentences are skipped,
  # and a sentence still open at end of input is dropped.
  #
  # Every sentence gets its own array of lines. Reusing (and clearing) one
  # shared buffer would empty or overwrite the lines of any previously
  # yielded object that keeps a reference to the array it was built from.

  # sentence: nil while outside a sentence,
  #           else array of strings, one per line read so far
  sentence = nil

  while (line = stream.gets())
    content = line.strip()

    if sentence.nil?
      # outside: start of sentence?
      sentence = Array.new() if ["(", "> ("].include?(content)
    elsif content == ")"
      # end of sentence
      yield MiniparSentence.new(sentence)
      sentence = nil
    else
      # inside sentence
      sentence << content
    end
  end
end
719
+
720
+ ###
721
+ # matching_tabsent
722
+ #
723
+ # if we have tab sentences, and if there is
724
+ # a tab sentence matching the given minipar sentence,
725
+ # return its index, else return nil
726
+ #
727
+ # If there is a matching tabsent,
728
+ # the MiniparSentence will remember it (and the terminal mapping)
729
def matching_tabsent(parse, # MiniparSentence object
                     tabsent_no) # integer: starting point in @tab_sentences array
  # Looks for the first tab sentence at index >= tabsent_no that the parse
  # accepts via set_tabsent. If none is accepted, retries the sentence at
  # tabsent_no in "sloppy" mode.
  #
  # returns: index of the matching tab sentence, or nil

  # No tab sentences at all, or all of them already consumed: nothing to
  # match. Without this check the sloppy match below would be attempted
  # against nil.
  if @tab_sentences.empty? or tabsent_no >= @tab_sentences.length()
    return nil
  end

  tabsent_no.upto(@tab_sentences.length() - 1) { |index|
    return index if parse.set_tabsent(@tab_sentences[index])
  }

  # no match found up to now. so try sloppy match
  return tabsent_no if parse.set_tabsent(@tab_sentences[tabsent_no], "sloppy")

  nil
end
759
+ end
760
+
761
+ ################################################
762
+ # Interpreter class
763
+ class MiniparInterpreter < SynInterpreter
764
+ MiniparInterpreter.announce_me()
765
+
766
+ ###
767
+ # names of the systems interpreted by this class:
768
+ # returns a hash service(string) -> system name (string),
769
+ # e.g.
770
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
771
def MiniparInterpreter.systems()
  # the only service this interpreter covers: minipar as parser
  { "parser" => "minipar" }
end
776
+
777
+ ###
778
+ # names of additional systems that may be interpreted by this class
779
+ # returns a hash service(string) -> system name(string)
780
+ # (same format as the return value of systems())
781
def MiniparInterpreter.optional_systems()
  # no optional systems: empty hash
  Hash.new
end
784
+
785
+ ###
786
+ # generalize over POS tags.
787
+ #
788
+ # returns one of:
789
+ #
790
+ # adj: adjective (phrase)
791
+ # adv: adverb (phrase)
792
+ # card: numbers, quantity phrases
793
+ # con: conjunction
794
+ # det: determiner, including possessive/demonstrative pronouns etc.
795
+ # for: foreign material
796
+ # noun: noun (phrase), including personal pronouns, proper names, expletives
797
+ # part: particles, truncated words (German compound parts)
798
+ # prep: preposition (phrase)
799
+ # pun: punctuation, brackets, etc.
800
+ # sent: sentence
801
+ # top: top node of a sentence
802
+ # verb: verb (phrase)
803
+ # nil: something went wrong
804
+ #
805
+ # returns: string, or nil
806
def MiniparInterpreter.category(node) # SynNode
  # Maps a node to a coarse category string. The checks run in this order:
  # lemma containing NUM -> "card"; "U" node attached by "lex-mod" to a
  # verb -> "part"; word made up of punctuation only -> "pun";
  # node without parent -> "top"; otherwise decided by part of speech.
  # Returns nil for any part of speech not listed below.
  node = MiniparInterpreter.ensure_upper(node)

  return "card" if node.get_attribute("lemma") =~ /NUM/

  # particle that is part of a complex verb
  verb_particle =
    node.part_of_speech() == "U" &&
    node.parent_label() == "lex-mod" &&
    node.parent &&
    MiniparInterpreter.category(node.parent) == "verb"
  return "part" if verb_particle

  return "pun" if node.word =~ /^[!?;`'",(){}\[\]\.\:]+$/

  return "top" if node.parent.nil?

  case node.part_of_speech()
  when "A"
    # same POS for adjectives and adverbs:
    # adverb if the parent is a verb, adjective otherwise
    upper = node.parent
    if upper && MiniparInterpreter.category(upper) == "verb"
      "adv"
    else
      "adj"
    end
  when "Det" then "det"
  when "N" then "noun"
  when "Prep" then "prep"
  when "C" then "sent"
  when /^V/ then "verb"
  end
end
860
+
861
+ ###
862
+ # is relative pronoun?
863
+ #
864
def MiniparInterpreter.relative_pronoun?(node) # SynNode
  # true iff the label of the edge to the parent starts with "wh"
  (node.parent_label() =~ /^wh/) ? true : false
end
871
+
872
+ ###
873
+ # phrase type:
874
+ # constituent label for nonterminals,
875
+ # part of speech for terminals
876
+ #
877
+ # returns: string
878
def MiniparInterpreter.pt(node)
  # the phrase type is simply the node's part of speech
  node.part_of_speech()
end
881
+
882
+ ###
883
+ # auxiliary?
884
+ #
885
+ # returns true if the given node is an auxiliary
886
+ #
887
+ # returns: boolean
888
def MiniparInterpreter.auxiliary?(node)
  # auxiliary = passes the shared aux/modal test but is not a modal
  return false unless MiniparInterpreter.aux_or_modal?(node)

  !MiniparInterpreter.modal?(node)
end
896
+
897
+ ###
898
+ # modal?
899
+ #
900
+ # returns true if the given node is a modal verb
901
+ #
902
+ # returns: boolean
903
def MiniparInterpreter.modal?(node)
  # modal = passes the shared aux/modal test and its word is one of
  # the forms listed here
  return false unless MiniparInterpreter.aux_or_modal?(node)

  modal_forms = ["can", "could", "must", "should", "shall"]
  modal_forms.include?(node.word())
end
916
+
917
+ ###
918
+ # head_terminal
919
+ #
920
+ # given a constituent, return the terminal node
921
+ # that describes its headword
922
+ #
923
+ # returns: a SynNode object if successful, else nil
924
def MiniparInterpreter.head_terminal(node)
  # a terminal is its own head terminal;
  # otherwise take the first child attached by a "Head" edge (if any)
  return node if node.is_terminal?

  node.children_by_edgelabels(["Head"]).first
end
931
+
932
+ ###
933
+ # voice
934
+ #
935
+ # given a constituent, return
936
+ # - "active"/"passive" if it is a verb
937
+ # - nil, else
938
def MiniparInterpreter.voice(verb_node)
  # Returns "passive" or "active" for nodes with part of speech V or VBE,
  # nil for everything else. Passive is assumed as soon as one of the
  # cues below is found; active is the fallback.

  # a terminal hanging below its node by a Head edge: use the upper node
  verb_node = MiniparInterpreter.ensure_upper(verb_node)

  return nil unless ["V", "VBE"].include?(verb_node.part_of_speech())

  # cue 1: outgoing edge "by_subj"
  return "passive" unless verb_node.children_by_edgelabels(["by_subj"]).empty?

  # cue 2: outgoing edge to auxiliary "be", and word not ending in "ing"
  has_be_child = !verb_node.children_by_edgelabels(["be"]).empty?
  return "passive" if has_be_child && verb_node.word !~ /ing$/

  # cue 3: incoming edge "vrel"
  return "passive" if verb_node.parent_label() == "vrel"

  # cue 4: antecedent of the first obj child is the first s child
  object_child = verb_node.children_by_edgelabels(["obj"]).first
  if object_child
    s_child = verb_node.children_by_edgelabels(["s"]).first
    return "passive" if s_child && object_child.get_f("antecedent") == s_child
  end

  "active"
end
984
+
985
+ ###
986
+ # gfs
987
+ #
988
+ # grammatical functions of a constituent:
989
+ #
990
+ # returns: a list of pairs [relation(string), node(SynNode)]
991
+ # where <node> stands in the relation <relation> to the parameter
992
+ # that the method was called with
993
def MiniparInterpreter.gfs(start_node, # SynNode
                           sent) # SalsaTigerSentence
  # Collects [edgelabel, node] pairs for the children of start_node,
  # leaving out edges that do not carry a grammatical function.
  # Children with an "antecedent" attribute are replaced by the node
  # the antecedent chain leads to.

  start_node = MiniparInterpreter.ensure_upper(start_node)

  # "Head": head of the target node -- not really bearer of a GF
  skipped_labels = ["Head", "-", "aux", "have", "be"]

  retv = []
  start_node.children_with_edgelabel.each { |edgelabel, node|
    next if skipped_labels.include?(edgelabel)

    # follow antecedent ids (empty nodes, relative pronouns) for as
    # long as they resolve to a node of the sentence
    while (ant_id = node.get_attribute("antecedent"))
      referent = sent.syn_node_with_id(ant_id)
      # id not matching any node: stop seeking
      break unless referent
      node = referent
    end

    # PP, i.e. "mod" edge to a Prep node:
    # append the preposition to the edge label
    if edgelabel == "mod" and node.part_of_speech() == "Prep"
      edgelabel = "#{edgelabel}-#{node.word()}"
    end

    retv << [edgelabel, node]
  }

  # duplicate entries? s is often coreferent with subj (active voice)
  # or with obj (passive voice); drop the s entry in that case
  twin_label = {
    "active" => "subj",
    "passive" => "obj"
  }[MiniparInterpreter.voice(start_node)]

  if twin_label
    s_entry = retv.assoc("s")
    twin_entry = retv.assoc(twin_label)
    if s_entry and twin_entry and s_entry.last == twin_entry.last
      retv.delete(s_entry)
    end
  end

  retv
end
1051
+
1052
+ ###
1053
+ # informative_content_node
1054
+ #
1055
+ # for most constituents: the head
1056
+ # for a PP, the NP
1057
+ # for an SBAR, the VP
1058
+ # for a VP, the embedded VP
1059
def MiniparInterpreter.informative_content_node(node)
  # Prep node:        first child by pcomp-n / vpsc_pcomp-c / pcomp-c
  # SentAdjunct node: first child by comp1
  # node without word: first "i" child, else first "nn" child,
  #                    else its antecedent
  # Returns nil if nothing suitable is found, and also for every node
  # that falls into none of the three cases.
  node = MiniparInterpreter.ensure_upper(node)

  if node.part_of_speech() == "Prep"
    # use complement of this constituent
    complements = node.children_by_edgelabels(["pcomp-n",
                                               "vpsc_pcomp-c",
                                               "pcomp-c"])
    return complements.empty? ? nil : complements.first
  end

  if node.part_of_speech() == "SentAdjunct"
    # use complement of this constituent
    complements = node.children_by_edgelabels(["comp1"])
    return complements.empty? ? nil : complements.first
  end

  # from here on only nodes without a word are handled
  return nil unless node.word().nil? or node.word().empty?

  # no word for this node: use child instead, "i" edges before "nn" edges
  ["i", "nn"].each { |label|
    candidates = node.children_by_edgelabels([label])
    return candidates.first if candidates.length() > 0
  }

  # no suitable child for this node: try antecedent
  referent = node.get_f("antecedent")
  referent ? referent : nil
end
1134
+
1135
+ ###
1136
+ # path_between
1137
+ #
1138
+ # construct path in syntactic structure between two nodes,
1139
+ # using
1140
+ # - node labels
1141
+ # - edge labels
1142
+ # - direction Up, Down
1143
+ #
1144
+ # use_nontree_edges: set to true to use coreference edges
1145
+ # and other non-tree edges returned by the parser
1146
+ # in path computation.
1147
+ #
1148
+ # returns: Path object
1149
def MiniparInterpreter.path_between(from_node, # SynNode
                                    to_node, # SynNode
                                    use_nontree_edges = false) # boolean
  # Both nodes are first moved up across a Head edge if they have one.
  # Tree edges only: delegate to the inherited implementation.
  # With non-tree edges: explore outward from from_node and return the
  # first path recorded for to_node once it is reached.
  from_node = MiniparInterpreter.ensure_upper(from_node)
  to_node = MiniparInterpreter.ensure_upper(to_node)

  return super(from_node, to_node) unless use_nontree_edges

  MiniparInterpreter.each_reachable_node(from_node) { |reached, _antecedents, routes, _previous|
    return routes.first if reached == to_node

    # each_reachable_node wants a boolean telling it whether to
    # continue the path beyond this node: always continue
    true
  }
end
1167
+
1168
+ ###
1169
+ # surrounding_nodes:
1170
+ #
1171
+ # construct paths in syntactic structure between a node and each of its neighbors
1172
+ # path construction as in path_between.
1173
+ # Neighbors: parent, child, plus potentially neighbors by nontree edges
1174
+ # use_nontree_edges: again, same as in path_between
1175
+ #
1176
+ # returns: list of pairs [neighbor(SynNode), path(Path)]
1177
def MiniparInterpreter.surrounding_nodes(node, # SynNode
                                         use_nontree_edges = false) # boolean
  # Takes the neighbors computed by the inherited implementation and adds,
  # for each of them, every node on its antecedent chain; an added node
  # is paired with the path of the neighbor it was reached from.
  direct = super(node, use_nontree_edges)

  via_antecedent = Array.new
  direct.each { |neighbor, path|
    current = neighbor
    while (referent = current.get_f("antecedent"))
      via_antecedent << [referent, path]
      current = referent
    end
  }

  direct + via_antecedent
end
1190
+
1191
+
1192
+ # ###
1193
+ # # main node of expression
1194
+ # #
1195
+ # # 2nd argument non-nil:
1196
+ # # don't handle multiword expressions beyond verbs with separate particles
1197
+ # #
1198
+ # # returns: SynNode, main node, if found
1199
+ # # else nil
1200
+ # def MiniparInterpreter.main_node_of_expr(nodelist,
1201
+ # no_mwes = nil)
1202
+
1203
+ # nodelist = nodelist.map { |n| MiniparInterpreter.ensure_upper(n) }.uniq()
1204
+
1205
+ # # main reason we are overwriting the parent method:
1206
+ # # don't go to terminal nodes right away.
1207
+ # # If we have a single nonterminal, stay with it.
1208
+ # # Otherwise, use parent method
1209
+ # if nodelist.length() == 1
1210
+ # return nodelist.first
1211
+ # end
1212
+
1213
+ # return super(nodelist, no_mwes)
1214
+ # end
1215
+
1216
+ ########
1217
+ # max constituents:
1218
+ # given a set of nodes, compute the maximal constituents
1219
+ # that exactly cover them
1220
+ #
1221
+ # overwrite default: ignore empty terminals, both in nodeset
1222
+ # and in the nodes that are tested as potential maximal constituents
1223
def MiniparInterpreter.max_constituents(nodeset, # Array:SynNode
                                        sent, # SalsaTigerSentence
                                        idealize_maxconst = false) # boolean
  # Empty terminals are removed from the node set, then the computation
  # is handed to the sentence object; which of its two methods is used
  # depends on idealize_maxconst.
  filled_nodes = nodeset.reject { |n| MiniparInterpreter.empty_terminal?(n) }

  unless idealize_maxconst
    return sent.max_constituents_for_nodes(filled_nodes, true)
  end

  sent.max_constituents_smc(filled_nodes, idealize_maxconst, true)
end
1234
+
1235
+
1236
+ ###
1237
+ # for all nodes reachable from a given from_node:
1238
+ # compute the path from from_node,
1239
+ # using both tree edges and coreference edges
1240
+ #
1241
+ # compute a widening circle of nodes from from_node outward,
1242
+ # following all antecedent links as 0-length paths.
1243
+ #
1244
+ # yields tuples
1245
+ # [
1246
+ # minipar node,
1247
+ # array: other minipar node(s) reached from this one solely via antecedent edges,
1248
+ # array: minimal paths from start_node to this node as Path objects
1249
+ # minipar node 2: last stop on path from start_node to minipar_node
1250
+ # ]
1251
def MiniparInterpreter.each_reachable_node(from_node) # SynNode
  # Explores the structure outward from from_node (moved up across a Head
  # edge first), over parent and child edges. A neighbor that has an
  # antecedent is recorded under the last node of its antecedent chain.
  #
  # For each neighbor that is not discarded, yields
  #   [neighbor node,
  #    array of the nodes on its antecedent chain,
  #    array of Path objects leading to it through the current node,
  #    the current node]
  # and continues the exploration beyond it only if the block returns
  # a true value.

  from_node = MiniparInterpreter.ensure_upper(from_node)

  # rim: array:SynNode, current outermost nodes
  rim = [from_node]
  # seen: hash SynNode -> array:Path, paths recorded for each node so far
  seen = { from_node => [Path.new(from_node)] }

  until rim.empty?
    # remove node from the beginning of the rim
    minipar_node = rim.shift()

    # tuples [ "D" (down, child) or "U" (up, parent),
    #          the child or parent,
    #          edge label between minipar_node and it,
    #          its part of speech ]
    surrounding_n = minipar_node.children.map { |child|
      ["D", child, minipar_node.child_label(child), child.part_of_speech()]
    }
    if minipar_node.parent
      surrounding_n.push(["U",
                          minipar_node.parent,
                          minipar_node.parent_label(),
                          minipar_node.parent.part_of_speech()])
    end

    surrounding_n.each { |direction, new_node, edgelabel, nodelabel|

      # node we are actually using: the end of the antecedent chain,
      # if there is one (the chain may be longer than one step)
      actual_new_node = new_node
      antecedents = []
      while (referent = actual_new_node.get_f("antecedent"))
        antecedents << referent
        actual_new_node = referent
      end

      # node seen before, and seen with a shorter path? then discard.
      # Only the first recorded path of each node is compared.
      if seen[actual_new_node] and
         seen[actual_new_node].first.length() < seen[minipar_node].first.length() + 1
        next
      end

      # preposition? then its lemma becomes part of the edge label.
      # to_s: a missing lemma attribute must not abort the traversal
      # with a TypeError on string concatenation.
      step_label = edgelabel
      if new_node.part_of_speech() == "Prep"
        step_label = edgelabel + "-" + new_node.get_attribute("lemma").to_s
      end

      # extend a copy of every path to minipar_node by the step to new_node
      paths = seen[minipar_node].map { |previous_path|
        new_path = previous_path.deep_clone
        new_path.add_last_step(direction, step_label, nodelabel, new_node)
        new_path
      }

      # record the new paths (creating the entry for unseen nodes)
      seen[actual_new_node] = Array.new unless seen[actual_new_node]
      seen[actual_new_node].concat paths

      keepthisnode = yield(new_node, antecedents, paths, minipar_node)

      if keepthisnode and not(rim.include?(actual_new_node))
        rim.push actual_new_node
      end
    } # each parent or child of the current rim node
  end # until no new rim nodes are discovered
end
1335
+
1336
+ #####################33
1337
+ private
1338
+
1339
+ ###
1340
+ # auxiliaries and modals share this characteristic
1341
def MiniparInterpreter.aux_or_modal?(node)
  # true iff the node (moved up across a Head edge if it has one) is
  # attached by a "be", "have" or "aux" edge to a parent of category "verb"
  node = MiniparInterpreter.ensure_upper(node)

  incoming = node.parent_label()
  return false unless incoming && ["be", "have", "aux"].include?(incoming)

  governor = node.parent()
  return false unless governor

  MiniparInterpreter.category(governor) == "verb" ? true : false
end
1353
+
1354
+ ###
1355
+ # given a node: if it has a Head child, return that,
1356
+ # else return the node
1357
def MiniparInterpreter.ensure_terminal(node)
  # first "Head" child if there is one, else the node itself
  heads = node.children_by_edgelabels(["Head"])
  (heads && !heads.empty?) ? heads.first : node
end
1365
+
1366
+ ###
1367
+ # given a node: if it is a terminal that is linked to its
1368
+ # parent by a Head edge, return the parent,
1369
+ # else return the node
1370
def MiniparInterpreter.ensure_upper(node)
  # attached to its parent by a "Head" edge: use the parent instead
  node.parent_label() == "Head" ? node.parent : node
end
1377
+
1378
+ ###
1379
+ # is this an empty terminal?
1380
def MiniparInterpreter.empty_terminal?(node)
  # true iff the node is a terminal without a word.
  # A nil word counts as empty (informative_content_node in this class
  # treats nil and "" alike); calling empty? on nil would raise.
  #
  # returns: boolean
  return false unless node.is_terminal?

  word = node.word()
  word.nil? || word.empty?
end
1387
+
1388
+ end