frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,44 @@
1
+ require "tempfile"
2
+ require "common/AbstractSynInterface"
3
+
4
+ ################################################
5
+ # Interface class
6
# Interface to the TnT part-of-speech tagger.
#
# Registers itself via announce_me and offers the service "pos_tagger"
# for the system "tnt".
class TntInterface < SynInterfaceTab
  TntInterface.announce_me()

  # Shell filter appended to the tagger call:
  # 1. grep removes commentary lines (those starting with "%%")
  # 2. sed extracts the tag from each remaining line:
  #    - match one or more non-spaces
  #    - match one or more spaces
  #    - match one or more non-spaces and keep only that second group
  TAG_FILTER = ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\''.freeze

  # returns: string, name of the wrapped system
  def TntInterface.system()
    return "tnt"
  end

  # returns: string, name of the service this interface offers
  def TntInterface.service()
    return "pos_tagger"
  end

  # Runs TnT on the words of 'infilename' and writes one tag line per
  # input line to 'outfilename'.
  #
  # This assumes that the experiment file entry for pos_tagger_path
  # has the form
  #   pos_tagger_path = <program_name> <model>
  #
  # infilename:  string, name of input file
  # outfilename: string, name of output file
  #
  # raises RuntimeError if the pipeline cannot be started or if the tagged
  # file and the corpus file differ in their number of lines
  def process_file(infilename, # string: name of input file
                   outfilename) # string: name of output file
    tempfile = Tempfile.new("Tnt")
    begin
      TntInterface.fntab_words_to_file(infilename, tempfile)
      tempfile.close

      # The output file is attached through a spawn redirection rather than
      # being spliced into the command string, so its name never reaches
      # the shell.
      status = Kernel.system(@program_path + " " + tempfile.path + TAG_FILTER,
                             :out => outfilename)
      if status.nil?
        raise "Error: could not execute the TnT tagger pipeline"
      end
    ensure
      tempfile.close(true) # delete tempfile, also when an error occurred
    end

    unless newline_count(infilename) == newline_count(outfilename)
      raise "Error: tagged file has different line number from corpus file!"
    end
  end

  private

  # Number of newline-terminated lines in 'filename'
  # (the same quantity that `wc -l` reports).
  def newline_count(filename)
    File.open(filename, "rb") do |file|
      file.each_line.count { |line| line.end_with?("\n") }
    end
  end
end
44
+
@@ -0,0 +1,61 @@
1
+ require 'common/Graph'
2
+
3
# A graph node restricted to at most one parent.
#
# The ancestor-related accessors of GraphNode are complemented by singular
# variants that return the first (only) entry, and add_parent/set_parent
# replace the current parent instead of accumulating several.
class TreeNode < GraphNode

  def initialize(id)
    super(id)
  end

  # returns: the parent node, or nil if parents() yields nil
  def parent()
    all_parents = parents()
    all_parents.nil? ? nil : all_parents.first
  end

  # returns: the label of the edge to the parent,
  # or nil if parent_labels() yields nil
  def parent_label()
    all_labels = parent_labels()
    all_labels.nil? ? nil : all_labels.first
  end

  # returns: the first entry of parents_with_edgelabel(),
  # or nil if that method yields nil
  def parent_with_edgelabel()
    all_pairs = parents_with_edgelabel()
    all_pairs.nil? ? nil : all_pairs.first
  end

  # In a tree, adding a parent means replacing the current one.
  def add_parent(parent, edgelabel, varhash={})
    set_parent(parent, edgelabel, varhash)
  end

  # Makes 'parent' the only parent of this node, reached via 'edgelabel'.
  #
  # varhash: options; if varhash["pointer_insteadof_edge"] is set,
  #          this node is not registered as a child of the new parent
  def set_parent(parent, edgelabel, varhash={})
    # Collect the old parents first and remove them afterwards.
    # The block variable must not be called 'parent': that would shadow
    # (and, with Ruby 1.8 block semantics, overwrite) the new parent.
    # NOTE(review): remove_parent presumably edits @parents, which is why
    # removal is kept out of the iteration -- confirm against GraphNode.
    previous = []
    each_parent_with_edgelabel { |old_label, old_parent|
      previous << [old_label, old_parent]
    }
    previous.each { |old_label, old_parent|
      remove_parent(old_parent, old_label, varhash)
    }

    # set new parent
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"]
      unless parent.children_with_edgelabel().include? [edgelabel, self]
        parent.add_child(self, edgelabel)
      end
    end
  end
end
@@ -0,0 +1,303 @@
1
+ # sp 30 11 06
2
+ # extended by TreeTaggerPOSInterface
3
+
4
+ require "tempfile"
5
+
6
+ require "common/AbstractSynInterface"
7
+
8
+ ###########
9
+ # KE dec 7, 06
10
+ # common mixin for both Treetagger modules, doing the actual processing
11
module TreetaggerModule
  ###
  # Treetagger does both lemmatization and POS-tagging.
  # However, the way the SynInterface system is set up in Shalmaneser,
  # each SynInterface can offer only _one_ service.
  # This means that we cannot do a SynInterface that writes
  # both a POS file and a lemma file.
  # Instead, both will include this module, which does the
  # actual TreeTagger call and then stores the result in a file
  # of its own, named like the 'outfilename' given to process_file
  # but with the extension .TreeTagger.
  # really_process_file checks for existence of this file because,
  # if the TreeTagger lemmatization and POS-tagging classes are called
  # separately, one of them will go first, and the 2nd one will not need
  # to do the TreeTagger call anymore.
  #
  # infilename:  string, name of input file
  # outfilename: string, name of the file that the caller is to produce
  # make_new_outfile_anyway: Boolean, run TreeTagger in any case?
  #
  # returns: string, the name of the file containing the TreeTagger output
  # with both POS tags and lemma information
  #
  # raises RuntimeError if TreeTagger cannot be started or if its output
  # does not line up with the input file
  def really_process_file(infilename, # string: name of input file
                          outfilename,# string: name of file that the caller is to produce
                          make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?

    # fabricate the filename in which the
    # actual TreeTagger output will be placed:
    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
    # File.extname copes with names that have no suffix at all and with
    # dots that occur only in the directory part.
    current_suffix = File.extname(outfilename)
    my_outfilename = File.dirname(outfilename) + "/" +
                     File.basename(outfilename, current_suffix) +
                     ".TreeTagger"

    ##
    # does it exist? then just return it
    if !make_new_outfile_anyway && File.exist?(my_outfilename)
      return my_outfilename
    end

    ##
    # else construct it, then return it
    tempfile = Tempfile.new("Treetagger")
    begin
      TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
      tempfile.close

      # call TreeTagger; the output file is attached by redirection so
      # that its name is never interpreted by a shell
      status = Kernel.system(@program_path + " " + tempfile.path,
                             :out => my_outfilename)
      if status.nil?
        raise "Error: could not execute TreeTagger"
      end
    ensure
      tempfile.close(true) # delete tempfile, also when an error occurred
    end

    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
    # resulting in a .tagged file missing the last (blank) line

    original_length = treetagger_newline_count(infilename)
    puts infilename
    lemmatised_length = treetagger_newline_count(my_outfilename)

    case original_length - lemmatised_length
    when 0
      # everything ok, don't do anything
    when 1
      # add one more newline to the .tagged file
      File.open(my_outfilename, "a") { |file| file.puts }
    else
      # this is a "real" error
      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
      STDERR.puts "Error: lemmatiser/tagger output for #{File.basename(infilename)}"
      $stderr.puts "has different line number from corpus file!"
      raise "Error: lemmatiser/tagger output has different line number from corpus file!"
    end

    return my_outfilename
  end

  private

  # Number of newline-terminated lines in 'filename'
  # (the same quantity that `wc -l` reports).
  def treetagger_newline_count(filename)
    File.open(filename, "rb") do |file|
      file.each_line.count { |line| line.end_with?("\n") }
    end
  end
end
88
+
89
+ #######################################
90
# Lemmatization with TreeTagger.
#
# Registers itself via announce_me and offers the service "lemmatizer"
# for the system "treetagger". The TreeTagger call itself is done by
# really_process_file from TreetaggerModule.
class TreetaggerInterface < SynInterfaceTab
  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  # returns: string, name of the wrapped system
  def TreetaggerInterface.system()
    return "treetagger"
  end

  ###
  # returns: string, name of the service this interface offers
  def TreetaggerInterface.service()
    return "lemmatizer"
  end

  ###
  # Rewrites brackets and quotation marks in a TreeTagger output line:
  # "(" becomes "-LRB-", ")" becomes "-RRB-", and both '' and `` become
  # a double quote.
  #
  # line: string, possibly ending in a line terminator
  # returns: new string without the line terminator; 'line' itself
  #          is left untouched
  def convert_to_berkeley(line)
    return line.chomp.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
  end

  ###
  # Writes the lemma column of the TreeTagger output for 'infilename'
  # to 'outfilename', one line per input line.
  #
  # raises RuntimeError if 'outfilename' cannot be opened for writing
  def process_file(infilename, # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    ttfilename = really_process_file(infilename, outfilename)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    begin
      tempfile2.close()

      # use sed to drop the <EOS> markers and cut to get the actual
      # lemmatisation (third column); input and output are attached by
      # redirection, so the filenames are never interpreted by a shell
      Kernel.system("sed -e's/<EOS>//' | cut -f3",
                    :in => ttfilename, :out => tempfile2.path())

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open
        # AB: Internally all the flow is an utf-8 encoded stream.
        # TreeTagger consumes one byte encodings (but we should provide a
        # utf-8 model for German). So we convert utf-8 to latin1, then
        # process the text and convert it back to utf-8.
        while (line = tempfile2.gets)
          utf8line = UtfIso.from_iso_8859_1(line)
          outfile.puts convert_to_berkeley(utf8line)
        end
      ensure
        # finalize output file, also when the conversion failed
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
156
+
157
+
158
+ # sp 30 11 06
159
+ #
160
+ # using TreeTagger for POS tagging of English text
161
+ #
162
+ # copy-and-paste from lemmatisation
163
+ #
164
+ # differences:
165
+ # 1. use field 2 and not 3 from the output
166
+ # 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
167
+ #
168
+ # KE 7 12 06
169
+ # change interface such that TreeTagger is called only once
170
+ # and both POS tags and lemma are read from the same files,
171
+ # rather than calling the tagger twice
172
# POS tagging with TreeTagger.
#
# Registers itself via announce_me and offers the service "pos_tagger"
# for the system "treetagger". The TreeTagger call itself is done by
# really_process_file from TreetaggerModule.
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  # returns: string, name of the wrapped system
  def TreetaggerPOSInterface.system()
    return "treetagger"
  end

  ###
  # returns: string, name of the service this interface offers
  def TreetaggerPOSInterface.service()
    return "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  #
  # Rewrites the beginning of a tag line: PP -> PRP, NP -> NNP,
  # VV -> VB, VH -> VB, SENT -> "."
  #
  # line: string, possibly ending in a line terminator
  # returns: new string without the line terminator; 'line' itself
  #          is left untouched
  def convert_to_collins(line)
    return line.chomp.gsub(/^PP/,"PRP").gsub(/^NP/,"NNP").gsub(/^VV/,"VB").gsub(/^VH/,"VB").gsub(/^SENT/,".")
  end

  ###
  # Writes the POS column of the TreeTagger output for 'infilename'
  # to 'outfilename', one line per input line. TreeTagger is always
  # (re-)run, even if an output file from an earlier call exists.
  #
  # raises RuntimeError if 'outfilename' cannot be opened for writing
  def process_file(infilename, # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    tt_filename = really_process_file(infilename, outfilename, true)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    begin
      tempfile2.close()

      # use sed to drop the <EOS> markers and cut to get the actual
      # POS tags (second column); input and output are attached by
      # redirection, so the filenames are never interpreted by a shell
      Kernel.system("sed -e's/<EOS>//' | cut -f2",
                    :in => tt_filename, :out => tempfile2.path())

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open()
        while (line = tempfile2.gets())
          outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
        end
      ensure
        # finalize output file, also when the conversion failed
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
228
+
229
+ ###############
230
+ # an interpreter that only has Treetagger, no parser
231
# An interpreter for analyses that consist of TreeTagger output only,
# without a parser.
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # names of the systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  def TreetaggerInterpreter.systems()
    { "pos_tagger" => "treetagger" }
  end

  ###
  # names of additional systems that may be interpreted by this class
  # returns a hash service(string) -> system name(string),
  # in the same format as systems()
  def TreetaggerInterpreter.optional_systems()
    { "lemmatizer" => "treetagger" }
  end

  ###
  # generalize over POS tags.
  #
  # returns one of:
  #
  # adj:   adjective (phrase)
  # adv:   adverb (phrase)
  # card:  numbers, quantity phrases
  # con:   conjunction
  # det:   determiner, including possessive/demonstrative pronouns etc.
  # for:   foreign material
  # noun:  noun (phrase), including personal pronouns, proper names, expletives
  # part:  particles, truncated words (German compound parts)
  # prep:  preposition (phrase)
  # pun:   punctuation, brackets, etc.
  # sent:  sentence
  # top:   top node of a sentence
  # trace: trace
  # verb:  verb (phrase)
  # nil:   something went wrong
  #
  # The rules are tried from top to bottom and the first match wins.
  # Only the part of the label that precedes the first "-" is looked at.
  #
  # returns: string, or nil
  def TreetaggerInterpreter.category(node) # SynNode
    pt = TreetaggerInterpreter.pt(node)
    # phrase type could not be determined
    return nil if pt.nil?

    label = pt.to_s.strip()[/^([^-]*)/, 1]

    case label
    when /^JJ/ ,/(WH)?ADJP/, /^PDT/ then "adj"
    when /^RB/, /(WH)?ADVP/, /^UH/ then "adv"
    when /^CD/, /^QP/ then "card"
    when /^CC/, /^WRB/, /^CONJP/ then "con"
    when /^DT/, /^POS/ then "det"
    when /^FW/, /^SYM/ then "for"
    when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/ then "noun"
    when /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then "prep"
    when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then "pun"
    when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then "sent"
    when /^TOP/ then "top"
    when /^TRACE/ then "trace"
    when /^V/ , /^MD/ then "verb"
    else
      # unknown category/POS (English data)
      nil
    end
  end
end
@@ -0,0 +1,338 @@
1
+ # name: Module Headz
2
+ # auth: albu@coli.uni-sb.de
3
+ #
4
+ # modified KE Sept 04:
5
+ # changed from old Sentence pkg to new SalsaTigerSentence pkg
6
+ #
7
+ # modified KE April 05:
8
+ # suppress the flood of warnings
9
+ #
10
+ # modified SP June 05: added some more cases; change to SalsTigerRegXML
11
+ #
12
+ #
13
+ # INIT: REXML TIGER sentence,
14
+ # FUNC: syn_nodes(term/non_term) -> heads
15
+ #
16
+ #
17
+ # usage:
18
+ #
19
+ # h = Headz.new()
20
+ #
21
+ # hash = h.get_sem_head(node) # node is a SalsaTigerXmlNode obj
22
+ #
23
+ # head = hash["head"]
24
+ # prep = hash["prep"]
25
+ #
26
+ # if h.complex(head)
27
+ # print "preposition of conjunction involved"
28
+ # end
29
+
30
+ require "common/SalsaTigerRegXML"
31
+
32
# Determines semantic heads of syntax nodes.
#
# All head descriptions are Hashes. The key 'head' holds the head node;
# depending on the construction, further keys are added: 'prep', 'conj',
# 'pncs', 'svp', 'oc', 'da', 'oa'.
class Headz

  def initialize()
    @helpers = HeadzHelpers.new()
    @verbose = false #KE 13.4.05: please not that many messages!
  end

  # head of one node
  #
  # returns: Hash (empty if no head could be determined), or nil
  def get_sem_head(node)
    gsh(node)
  end

  # all headz of top-nodes covering fe
  #
  # returns: Array with one head description per child of fe;
  #          empty Array (plus a warning on stderr) if fe has no children
  def get_fe_heads(fe)
    const = fe.children()
    unless const
      $stderr.puts "Headz.get_fe_heads: no children for FE #{fe}"
      return []
    end

    const.map { |node| get_sem_head(node) }
  end

  # Recursive work horse behind get_sem_head:
  # dispatches on the category of a nonterminal node.
  #
  # returns: Hash; {} if node is missing or there is no rule for its
  #          category; nil where a rule finds nothing to descend into
  def gsh(node)
    unless node
      $stderr.puts "Headz.gsh: no input node" if @verbose
      return {}
    end

    # a terminal is its own head
    return { 'head' => node } if node.is_terminal?

    case node.category
    when 'AP', 'AVP', 'VZ'
      gsh(@helpers.get_dtr(node, 'HD'))

    when 'CAP', 'CAVP', 'CNP', 'CPP', 'CS', 'CVP'
      # coordination: the first conjunct provides the head,
      # the remaining ones are listed under "conj"
      conjs = @helpers.get_conjuncts(node)
      head = gsh(conjs.shift)
      head.update("conj" => gsh_conjs(conjs)) if head
      head

    when 'NM'
      gsh(@helpers.get_rightmost_dtr(node, 'NMC'))

    when 'NP'
      nk = @helpers.get_rightmost_dtr(node, 'NK')
      gsh(nk || @helpers.get_rightmost_dtr(node, "NN"))

    when 'PN'
      pncs = @helpers.get_dtrs(node, 'PNC')
      head = gsh(pncs.last)
      head.update("pncs" => pncs) if head
      head

    when 'PP'
      pp(node)

    when 'S'
      s(node)

    when 'VROOT'
      dtrs = @helpers.get_dtrs(node, '--')

      # discourse level node with sentence nodes below?
      # or conjunction with sentence nodes below?
      discourselevel_dtr = dtrs.detect { |n| n.category == "DL" }
      co_dtr = dtrs.detect { |n| n.category == "CO" }
      if discourselevel_dtr
        dtrs = discourselevel_dtr.children()
      elsif co_dtr
        dtrs = co_dtr.children()
      end

      # take first sentence node
      sent_dtr = dtrs.detect { |n| n.category =~ /^C?S/ }
      sent_dtr ? gsh(sent_dtr) : nil

    when 'VP'
      vp(node)

    when 'MTA'
      gsh(@helpers.get_rightmost_dtr(node, 'ADC'))

    else
      $stderr.puts " Headz.gsh: no rule for #{node.category}" if @verbose
      {}
    end
  end

  # flatten the processed conjs to a list of (head) Hashes
  # containing no conj features themselves
  def gsh_conjs(conjs)
    flat = []
    conjs.each { |conj| @helpers.descend(gsh(conj), flat) }
    flat
  end

  #####################################
  # Head of a PP: the head of its rightmost NK daughter, or else of its
  # RE daughter, extended by 'prep' if a preposition-like terminal is
  # found below the node.
  #
  # NOTE: as a method of this class, this takes precedence over Kernel#pp.
  #
  # returns: Hash, or nil if neither daughter exists
  def pp(node)
    prep = node.terminals_sorted().detect { |n|
      (pt = n.part_of_speech()) &&
        (pt =~ /^APPR/ || pt =~ /^PWAV/ || pt =~ /^C?PP/)
    }

    content = @helpers.get_rightmost_dtr(node, 'NK') ||
              @helpers.get_dtr(node, 'RE')
    unless content
      $stderr.puts " pp: no rule for #{node}" if @verbose
      return nil
    end

    head = gsh(content)
    head.update('prep' => prep) if head && prep
    head
  end

  ################
  # Head of a sentence node, determined via its HD daughter.
  #
  # returns: Hash; {} if the sentence has no HD daughter;
  #          nil if no rule applies
  def s(node)
    head = @helpers.get_dtr(node, 'HD')
    return {} unless head
    return gsh(head) if head.outdeg() == 0

    oc = @helpers.get_dtr(node, 'OC')
    case head.category
    when 'VVFIN'
      svp = @helpers.get_dtr(node, 'SVP')
      return gsh(head) unless svp

      h = gsh(head)
      return h unless h
      h.update('svp' => gsh(svp), 'oc' => gsh(oc))

    when 'VAFIN'
      # auxiliary: the head comes from the clausal object ...
      oc_head = oc && @helpers.get_dtr(oc, 'HD')
      if oc_head
        h = gsh(oc_head)
        return h unless h
        return h.update('oc' => gsh(oc))
      end

      # ... or else from the predicative. The PD daughter has to be
      # looked up before its HD daughter can be asked for.
      pd = @helpers.get_dtr(node, 'PD')
      pd_head = pd && @helpers.get_dtr(pd, 'HD')
      return gsh(pd_head) if pd_head

      $stderr.puts " s: no rule for #{node}" if @verbose
      nil

    else
      $stderr.puts " s: no rule for #{node}" if @verbose
      nil
    end
  end

  ################
  # Head of a VP: the head of its HD daughter, extended by the heads of
  # the DA and OA daughters (keys 'da' and 'oa') where present.
  #
  # returns: Hash
  def vp(node)
    head = gsh(@helpers.get_dtr(node, 'HD'))

    # no messages while looking at the objects;
    # the previous setting is restored in any case
    saved_verbose = @verbose
    @verbose = false
    objects = {}
    begin
      %w[da oa].each { |type|
        dtr = @helpers.get_dtr(node, type.upcase)
        objects[type] = gsh(dtr) if dtr
      }
    ensure
      @verbose = saved_verbose
    end

    head ? head.update(objects) : objects
  end

  ################
  # Access

  # returns: the head node stored in head description h
  def head(h)
    h['head']
  end

  # returns: true-ish if a preposition or a conjunction is involved
  def complex(h)
    prep(h) || conj(h)
  end

  # returns: the preposition stored in head description h, or nil
  def prep(h)
    h['prep']
  end

  # returns: the list of further conjuncts stored in h, or nil
  def conj(h)
    h['conj']
  end

end # Class Headz
269
+
270
+
271
# Tree access helpers for Headz.
#
# Nodes handed to these methods must respond to
# children_by_edgelabels(list of edge labels).
class HeadzHelpers

  # verbose: Boolean, report missing daughters on stderr?
  # The default keeps the helpers silent. (A '@Verbose = true' in the
  # class body would only set a variable of the class object, which the
  # instance methods below never see.)
  def initialize(verbose = false)
    @verbose = verbose
  end

  # Conjunction

  # returns: the CJ daughters of node
  def get_conjuncts(node)
    get_dtrs(node, 'CJ')
  end

  # flatten:
  # appends head description 'current' and, recursively, everything listed
  # under its "conj" key to the Array 'flat'. The "conj" entries are
  # removed from the descriptions on the way.
  #
  # returns: flat
  def descend(current, flat)
    return flat if current.nil?

    nested = current.delete("conj")
    flat.push current
    nested.each { |item| descend(item, flat) } if nested
    flat
  end

  # Access

  # returns: the first daughter of node reached via edge 'label', or nil
  def get_dtr(node, label)
    dtrs = node.children_by_edgelabels([label])
    return dtrs.first if dtrs

    $stderr.puts " SelectHeadDtr: no #{label} dtr for #{node}" if @verbose
    nil
  end

  # returns: all daughters of node reached via edge 'label';
  #          nil if children_by_edgelabels yields nothing
  def get_dtrs(node, label)
    dtrs = node.children_by_edgelabels([label])
    return dtrs if dtrs

    $stderr.puts " SelectHeadDtr: no #{label} dtr for #{node}" if @verbose
    nil
  end

  # returns: the last daughter of node reached via edge 'label', or nil
  def get_rightmost_dtr(node, label)
    children = node.children_by_edgelabels([label])
    # same guard as in get_dtr/get_dtrs, in case nothing is returned
    rightmost = children && children.last
    return rightmost if rightmost

    $stderr.puts " SelectHeadDtr: no #{label} dtrs for #{node}" if @verbose
    nil
  end

  # def l2h(list)
  #   h = Hash.new
  #   while (list.length > 1) do
  #     h[list.shift] = list.shift
  #   end
  #   if list.length == 1 then
  #     $stderr.puts "l2h: odd number of elems: " + list.join(" / ")
  #   end
  #   h
  # end

end # Class HeadzHelpers
336
+
337
+
338
+