frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,44 @@
1
+ require "tempfile"
2
+ require "common/AbstractSynInterface"
3
+
4
+ ################################################
5
+ # Interface class
6
# SynInterface wrapper around the TnT part-of-speech tagger.
#
# The class registers itself via announce_me() and reports the
# system name "tnt" for the service "pos_tagger".
class TntInterface < SynInterfaceTab
  TntInterface.announce_me()

  # returns: string, the name of the wrapped system
  def TntInterface.system()
    return "tnt"
  end

  # returns: string, the name of the service offered by this interface
  def TntInterface.service()
    return "pos_tagger"
  end

  # Run the tagger on the words of 'infilename' and write the extracted
  # tags to 'outfilename', one per input line.
  #
  # This assumes that the experiment file entry for pos_tagger_path
  # has the form
  #   pos_tagger_path = <program_name> <model>
  # (@program_path is not set in this class -- presumably the superclass
  # stores it; TODO confirm).
  #
  # raises: RuntimeError if the tagged file and the corpus file do not
  #         contain the same number of lines
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # counts newline characters, i.e. the same figure "wc -l" reports,
    # without handing the file name to a shell; a missing file counts as 0
    newline_count = lambda { |filename|
      total = 0
      if File.file?(filename)
        File.open(filename, "rb") { |file|
          while (chunk = file.read(65536))
            total += chunk.count("\n")
          end
        }
      end
      total
    }

    tempfile = Tempfile.new("Tnt")
    begin
      TntInterface.fntab_words_to_file(infilename, tempfile)
      tempfile.close

      # 1. use grep to remove commentaries from the tagger output
      # 2. use sed to extract the tag from each line:
      #    - match one or more non-spaces
      #    - match one or more spaces
      #    - match one or more non-spaces and write them to outfilename
      Kernel.system(@program_path + " " + tempfile.path +
                    ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\' > '+outfilename)
    ensure
      # delete the tempfile even if one of the steps above raised
      tempfile.close(true)
    end

    unless newline_count.call(infilename) == newline_count.call(outfilename)
      raise "Error: tagged file has different line number from corpus file!"
    end
  end
end
44
+
@@ -0,0 +1,61 @@
1
+ require 'common/Graph'
2
+
3
# A GraphNode that keeps at most one parent: setting a parent first
# removes whatever parent was registered before.
class TreeNode < GraphNode

  def initialize(id)
    super(id)
  end

  # redo the ancestor-related methods,
  # since here we only have one parent per node

  # returns: the first entry of parents(), or nil if parents() is nil
  def parent()
    retv = parents()
    return retv.nil? ? nil : retv.first
  end

  # returns: the first entry of parent_labels(), or nil if that is nil
  def parent_label()
    retv = parent_labels()
    return retv.nil? ? nil : retv.first
  end

  # returns: the first entry of parents_with_edgelabel(),
  # or nil if that is nil
  def parent_with_edgelabel()
    retv = parents_with_edgelabel()
    return retv.nil? ? nil : retv.first
  end

  # same as set_parent: a tree node cannot accumulate parents
  def add_parent(parent, edgelabel, varhash={})
    set_parent(parent, edgelabel, varhash)
  end

  # Replace the current parent (if any) by 'parent', linked via 'edgelabel'.
  #
  # If varhash["pointer_insteadof_edge"] is set, self is NOT registered
  # as a child of the new parent; otherwise it is added as a child
  # unless the parent already lists [edgelabel, self].
  def set_parent(parent, edgelabel, varhash={})
    # remove old parent
    # (the block variables must not be called 'parent': under Ruby 1.8 a
    # block parameter of that name would overwrite the method argument,
    # under later versions it shadows it)
    each_parent_with_edgelabel { |old_label, old_parent|
      remove_parent(old_parent, old_label, varhash)
    }

    # set new parent
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"]
      unless parent.children_with_edgelabel().include? [edgelabel, self]
        parent.add_child(self, edgelabel)
      end
    end
  end
end
@@ -0,0 +1,303 @@
1
+ # sp 30 11 06
2
+ # extended by TreeTaggerPOSInterface
3
+
4
+ require "tempfile"
5
+
6
+ require "common/AbstractSynInterface"
7
+
8
+ ###########
9
+ # KE dec 7, 06
10
+ # common mixin for both Treetagger modules, doing the actual processing
11
module TreetaggerModule
  ###
  # Treetagger does both lemmatization and POS-tagging.
  # However, the way the SynInterface system is set up in Shalmaneser,
  # each SynInterface can offer only _one_ service.
  # This means that we cannot do a SynInterface that writes
  # both a POS file and a lemma file.
  # Instead, both will include this module, which does the
  # actual TreeTagger call and then stores the result in a file
  # of its own, named like the 'outfilename' given to process_file
  # but with the extension ".TreeTagger".
  # really_process_file checks for existence of this file because,
  # if the TreeTagger lemmatization and POS-tagging classes are called
  # separately, one of them will go first, and the 2nd one will not need
  # to do the TreeTagger call anymore.
  #
  # returns: string, the name of the file containing the TreeTagger output
  #
  # raises: RuntimeError if the TreeTagger output differs from the input
  #         file by anything other than one missing final line
  #
  # The last suffix of outfilename (if any) is replaced by .TreeTagger.
  def really_process_file(infilename, # string: name of input file
                          outfilename,# string: name of file that the caller is to produce
                          make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?

    # fabricate the filename in which the
    # actual TreeTagger output will be placed:
    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
    # (File.extname also copes with names that contain no "." at all)
    current_suffix = File.extname(outfilename)
    my_outfilename = File.dirname(outfilename) + "/" +
                     File.basename(outfilename, current_suffix) +
                     ".TreeTagger"

    ##
    # does it exist? then just return it
    if !make_new_outfile_anyway && File.exist?(my_outfilename)
      return my_outfilename
    end

    ##
    # else construct it, then return it
    tempfile = Tempfile.new("Treetagger")
    begin
      TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
      tempfile.close

      # call TreeTagger
      Kernel.system(@program_path+" "+tempfile.path +
                    " > " + my_outfilename)
    ensure
      # delete the tempfile even if one of the steps above raised
      tempfile.close(true)
    end

    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
    # resulting on a .tagged file missing the last (blank) line

    original_length = treetagger_count_newlines(infilename)
    puts infilename
    lemmatised_length = treetagger_count_newlines(my_outfilename)

    case original_length - lemmatised_length
    when 0
      # everything ok, don't do anything
    when 1
      # add one more newline to the .tagged file
      File.open(my_outfilename, "a") { |file| file.puts }
    else
      # this is "real" error
      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
      STDERR.puts "Error: lemmatiser/tagger output for #{File.basename(infilename)}"
      $stderr.puts "has different line number from corpus file!"
      raise "Error: lemmatiser/tagger output for #{File.basename(infilename)} " +
            "has different line number from corpus file!"
    end

    return my_outfilename
  end

  private

  # Count the newline characters of a file (the figure "wc -l" reports)
  # without passing the file name through a shell.
  # A file that does not exist counts as 0 lines.
  #
  # returns: integer
  def treetagger_count_newlines(filename) # string: name of the file
    total = 0
    return total unless File.file?(filename)

    File.open(filename, "rb") { |file|
      while (chunk = file.read(65536))
        total += chunk.count("\n")
      end
    }
    return total
  end
end
88
+
89
+ #######################################
90
# SynInterface offering TreeTagger as "lemmatizer".
#
# The TreeTagger run itself is done by TreetaggerModule#really_process_file;
# this class extracts the third column of that output and writes it,
# converted to UTF-8, to the requested output file.
class TreetaggerInterface < SynInterfaceTab
  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  # returns: string, the name of the wrapped system
  def TreetaggerInterface.system()
    return "treetagger"
  end

  ###
  # returns: string, the name of the service offered by this interface
  def TreetaggerInterface.service()
    return "lemmatizer"
  end

  ###
  # Rewrite brackets and quote pairs in one output line:
  # "(" becomes "-LRB-", ")" becomes "-RRB-", and both '' and ``
  # become a double quote. The trailing newline is dropped.
  # The argument itself is left untouched.
  #
  # returns: string, the converted line
  def convert_to_berkeley(line)
    return line.chomp.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
  end

  ###
  # Write the lemmas for 'infilename' to 'outfilename'.
  #
  # raises: RuntimeError if 'outfilename' cannot be opened for writing
  def process_file(infilename, # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    ttfilename = really_process_file(infilename, outfilename)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    tempfile2.close()

    begin
      # drop the <EOS> markers, then use cut to get the lemma (field 3)
      Kernel.system("cat " + ttfilename +
                    ' | sed -e\'s/<EOS>//\' | cut -f3 > '+tempfile2.path())

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open
        # AB: Internally all the flow is an utf-8 encoded stream.
        # TreeTagger consumes one byte encodings (but we should provide a
        # utf-8 model for German). So we convert utf-8 to latin1, then
        # process the text and convert it back to utf-8.
        while line = tempfile2.gets
          utf8line = UtfIso.from_iso_8859_1(line)
          outfile.puts convert_to_berkeley(utf8line)
        end
      ensure
        # finalize the output file even if the conversion raised
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
156
+
157
+
158
+ # sp 30 11 06
159
+ #
160
+ # using TreeTagger for POS tagging of English text
161
+ #
162
+ # copy-and-paste from lemmatisation
163
+ #
164
+ # differences:
165
+ # 1. use field 2 and not 3 from the output
166
+ # 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
167
+ #
168
+ # KE 7 12 06
169
+ # change interface such that TreeTagger is called only once
170
+ # and both POS tags and lemma are read from the same files,
171
+ # rather than calling the tagger twice
172
# SynInterface offering TreeTagger as "pos_tagger".
#
# The TreeTagger run itself is done by TreetaggerModule#really_process_file;
# this class extracts the second column of that output, maps the tags
# with convert_to_collins and writes them, converted to UTF-8, to the
# requested output file.
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  # returns: string, the name of the wrapped system
  def TreetaggerPOSInterface.system()
    return "treetagger"
  end

  ###
  # returns: string, the name of the service offered by this interface
  def TreetaggerPOSInterface.service()
    return "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  #
  # Rewrites a tag prefix at the start of the line: PP -> PRP, NP -> NNP,
  # VV -> VB, VH -> VB, SENT -> "." (applied in this order).
  # The trailing newline is dropped; the argument itself is left untouched.
  #
  # returns: string, the converted line
  def convert_to_collins(line)
    return line.chomp.gsub(/^PP/,"PRP").gsub(/^NP/,"NNP").gsub(/^VV/,"VB").gsub(/^VH/,"VB").gsub(/^SENT/,".")
  end

  ###
  # Write the POS tags for 'infilename' to 'outfilename'.
  #
  # raises: RuntimeError if 'outfilename' cannot be opened for writing
  def process_file(infilename, # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    # (third argument true: TreeTagger is run even if its output file
    # already exists)
    tt_filename = really_process_file(infilename, outfilename, true)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    tempfile2.close()

    begin
      # drop the <EOS> markers, then use cut to get the POS tag (field 2)
      Kernel.system("cat " + tt_filename +
                    ' | sed -e\'s/<EOS>//\' | cut -f2 > '+tempfile2.path())

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open()
        while (line = tempfile2.gets())
          outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
        end
      ensure
        # finalize the output file even if the conversion raised
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
228
+
229
+ ###############
230
+ # an interpreter that only has Treetagger, no parser
231
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # Services this interpreter requires, as a hash
  # service(string) -> system name(string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  def self.systems
    required = {}
    required["pos_tagger"] = "treetagger"
    required
  end

  ###
  # Services this interpreter can additionally handle, as a hash
  # service(string) -> system name(string), same format as systems()
  def self.optional_systems
    optional = {}
    optional["lemmatizer"] = "treetagger"
    optional
  end

  ###
  # Map the phrase type / POS tag of a node to a coarse category.
  #
  # Only the part of the label before the first "-" is inspected.
  # The rules are tried top to bottom; the first rule with a matching
  # pattern decides.
  #
  # returns one of:
  #
  # adj:   adjective (phrase)
  # adv:   adverb (phrase)
  # card:  numbers, quantity phrases
  # con:   conjunction
  # det:   determiner, including possessive/demonstrative pronouns etc.
  # for:   foreign material
  # noun:  noun (phrase), including personal pronouns, proper names, expletives
  # prep:  preposition (phrase)
  # pun:   punctuation, brackets, etc.
  # sent:  sentence
  # top:   top node of a sentence
  # trace: trace
  # verb:  verb (phrase)
  # nil:   no phrase type available, or no rule matched
  #
  # returns: string, or nil
  def self.category(node) # SynNode
    phrase_type = TreetaggerInterpreter.pt(node)
    # phrase type could not be determined
    return nil if phrase_type.nil?

    label = phrase_type.to_s.strip()[/^([^-]*)/, 1]

    rules = [
      [[/^JJ/, /(WH)?ADJP/, /^PDT/], "adj"],
      [[/^RB/, /(WH)?ADVP/, /^UH/], "adv"],
      [[/^CD/, /^QP/], "card"],
      [[/^CC/, /^WRB/, /^CONJP/], "con"],
      [[/^DT/, /^POS/], "det"],
      [[/^FW/, /^SYM/], "for"],
      [[/^N/, "WHAD", "WDT", /^PRP/, /^WHNP/, /^EX/, /^WP/], "noun"],
      [[/^IN/, /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/], "prep"],
      [[/^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/], "pun"],
      [[/^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/], "sent"],
      [[/^TOP/], "top"],
      [[/^TRACE/], "trace"],
      [[/^V/, /^MD/], "verb"]
    ]

    rules.each { |patterns, name|
      return name if patterns.any? { |pattern| pattern === label }
    }

    # unknown category/POS
    nil
  end
end
@@ -0,0 +1,338 @@
1
+ # name: Module Headz
2
+ # auth: albu@coli.uni-sb.de
3
+ #
4
+ # modified KE Sept 04:
5
+ # changed from old Sentence pkg to new SalsaTigerSentence pkg
6
+ #
7
+ # modified KE April 05:
8
+ # suppress the flood of warnings
9
+ #
10
+ # modified SP June 05: added some more cases; change to SalsTigerRegXML
11
+ #
12
+ #
13
+ # INIT: REXML TIGER sentence,
14
+ # FUNC: syn_nodes(term/non_term) -> heads
15
+ #
16
+ #
17
+ # usage:
18
+ #
19
+ # h = Headz.new()
20
+ #
21
+ # hash = h.get_sem_head(node) # node is a SalsaTigerXmlNode obj
22
+ #
23
+ # head = hash["head"]
24
+ # prep = hash["prep"]
25
+ #
26
+ # if h.complex(head)
27
+ # print "preposition or conjunction involved"
28
+ # end
29
+
30
+ require "common/SalsaTigerRegXML"
31
+
32
# Determines semantic heads of syntax nodes.
#
# Results are "head hashes": hashes with the key 'head' (the head node)
# and, depending on the construction, further keys such as 'prep',
# 'conj', 'pncs', 'svp', 'oc', 'da', 'oa'.
# Nodes only need to respond to the methods called here (is_terminal?,
# category, children, outdeg, terminals_sorted, ...); daughters are
# looked up through a HeadzHelpers object.
class Headz

  def initialize()
    @Helpers = HeadzHelpers.new()
    @Verbose = false #KE 13.4.05: please not that many messages!
  end

  # head of one node
  #
  # returns: head hash, see gsh
  def get_sem_head(node)
    gsh(node)
  end

  # all headz of top-nodes covering fe
  #
  # returns: array of head hashes, one per child of fe;
  # empty array (plus a warning on stderr) if fe.children() is nil/false
  def get_fe_heads(fe)
    if (const = fe.children())
      const.map { |node| get_sem_head(node) }
    else
      $stderr.puts "Headz.get_fe_heads: no children for FE #{fe}"
      []
    end
  end

  # Compute the head hash of a node, dispatching on node.category.
  #
  # returns: {} for a missing node or a category without rule,
  # {'head' => node} for a terminal, otherwise whatever the rule for the
  # category yields (which may be nil, e.g. for a VROOT without sentence)
  def gsh (node)
    if !node
      if @Verbose then $stderr.puts "Headz.gsh: no input node" end
      return {}
    end

    return Hash['head'=>node] if node.is_terminal?

    case node.category
    when 'AP', 'AVP', 'VZ'
      return gsh(@Helpers.get_dtr(node,'HD'))

    when 'CAP', 'CAVP', 'CNP', 'CPP', 'CS', 'CVP'
      # coordination: head of the first conjunct, the remaining
      # conjuncts are stored (flattened) under "conj"
      conjs = @Helpers.get_conjuncts(node)
      head = gsh(conjs.shift)
      if head
        head.update(Hash["conj"=>gsh_conjs(conjs)])
      end
      return head

    when 'NM'
      return gsh(@Helpers.get_rightmost_dtr(node,'NMC'))

    when 'NP'
      nk = @Helpers.get_rightmost_dtr(node,'NK')
      if nk
        return gsh(nk)
      else
        return gsh(@Helpers.get_rightmost_dtr(node, "NN"))
      end

    when 'PN'
      pncs = @Helpers.get_dtrs(node,'PNC')
      head = gsh(pncs.last)
      if head
        head.update(Hash["pncs"=>pncs])
      end
      return head

    when 'PP'
      return pp(node)

    when 'S'
      return s(node)

    when 'VROOT'
      dtrs = @Helpers.get_dtrs(node,'--')

      # discourse level node with sentence nodes below?
      # or conjunction with sentence nodes below?
      discourselevel_dtr = dtrs.detect { |n| n.category == "DL" }
      co_dtr = dtrs.detect { |n| n.category == "CO" }
      if discourselevel_dtr
        dtrs = discourselevel_dtr.children()
      elsif co_dtr
        dtrs = co_dtr.children()
      end

      # take first sentence node
      sent_dtr = dtrs.detect { |n| n.category =~ /^C?S/ }
      if sent_dtr
        return gsh(sent_dtr)
      else
        # no sentence found below VROOT
        return nil
      end

    when 'VP'
      return vp(node)

    when 'MTA'
      return gsh(@Helpers.get_rightmost_dtr(node,'ADC'))

    else
      if @Verbose
        $stderr.puts " Headz.gsh: no rule for #{node.category}"
      end
      return {}
    end
  end

  # flatten the processed conjs to a list of (head) Hashes
  # containing no conj features themselves
  def gsh_conjs(conjs)
    flat = Array.new

    conjs.each { |conj|
      current = gsh(conj)
      @Helpers.descend(current,flat)
    }

    flat
  end

  #####################################
  # Head of a PP: head of the rightmost NK daughter or, failing that,
  # of the RE daughter. The first terminal whose part of speech matches
  # /^APPR/, /^PWAV/ or /^C?PP/ is stored under 'prep'.
  #
  # returns: head hash, or nil if there is neither an NK nor an RE daughter
  def pp(node)

    prep = node.terminals_sorted().detect { |n|
      (pt = n.part_of_speech()) and
      (pt =~ /^APPR/ or
       pt =~ /^PWAV/ or
       pt =~ /^C?PP/
      )
    }

    head = nil
    dtr = @Helpers.get_rightmost_dtr(node,'NK') || @Helpers.get_dtr(node,'RE')
    if dtr
      head = gsh(dtr)
      if head and prep
        head.update(Hash['prep'=>prep])
      end
    else
      if @Verbose then $stderr.puts " pp: no rule for #{node}" end
    end

    head
  end

  ################
  # Head of a sentence node.
  #
  # returns: {} if there is no HD daughter, nil if no rule applies,
  # otherwise a head hash
  def s(node)
    head = @Helpers.get_dtr(node,'HD')
    if !head
      return Hash[]
    end

    if head.outdeg() == 0
      return gsh(head)
    end

    oc = @Helpers.get_dtr(node,'OC')
    case head.category
    when 'VVFIN'
      if (svp = @Helpers.get_dtr(node,'SVP'))
        h = gsh(head)
        if h
          return h.update(Hash['svp'=>gsh(svp), 'oc'=>gsh(oc)])
        else
          return h
        end
      else
        return gsh(head)
      end

    when 'VAFIN'
      if oc && (oc_head = @Helpers.get_dtr(oc,'HD'))
        h = gsh(oc_head)
        if h
          return h.update(Hash['oc'=>gsh(oc)])
        else
          return h
        end

      # both assignments are parenthesized: without the parentheses,
      # "pd = a && head = get_dtr(pd, ...)" assigns the whole conjunction
      # to pd and looks up the HD daughter of a pd that is still nil
      elsif (pd = @Helpers.get_dtr(node,'PD')) && (pd_head = @Helpers.get_dtr(pd,'HD'))
        return gsh(pd_head)

      else
        if @Verbose then $stderr.puts " s: no rule for #{node}" end
        return nil
      end

    else
      if @Verbose then $stderr.puts " s: no rule for #{node}" end
      return nil
    end
  end

  ################
  # Head of a VP: head hash of the HD daughter, extended by the head
  # hashes of the DA and OA daughters under the keys "da" and "oa".
  #
  # returns: hash (only the "da"/"oa" entries if the HD lookup yields nil)
  def vp(node)
    head = gsh(@Helpers.get_dtr(node,'HD'))
    newHash = Hash.new

    # messages are switched off while the objects are looked up;
    # the previous setting is restored even if a lookup raises
    tmp = @Verbose
    @Verbose = false
    begin
      ["da","oa"].each { |type|
        if (dtr = @Helpers.get_dtr(node,type.upcase))
          newHash[type] = gsh(dtr)
        end
      }
    ensure
      @Verbose = tmp
    end

    if head
      return head.update(newHash)
    else
      return newHash
    end
  end

  ################
  # Access

  # returns: the 'head' entry of a head hash
  def head(h)
    return h['head']
  end

  # returns: the 'prep' entry if present, else the 'conj' entry
  # (i.e. something true iff a preposition or conjunction is involved)
  def complex(h)
    prep(h) or conj(h)
  end

  # returns: the 'prep' entry of a head hash
  def prep(h)
    return h['prep']
  end

  # returns: the 'conj' entry of a head hash
  def conj(h)
    return h['conj']
  end

end # Class Headz
269
+
270
+
271
# Daughter lookup and flattening helpers used by Headz.
#
# Nodes only need to respond to children_by_edgelabels(array of labels).
class HeadzHelpers

  # verbose: Boolean, write a message to stderr when a lookup finds
  # no daughters. Defaults to false.
  # (A "@Verbose = true" in the class body would set a variable of the
  # class object, which the instance methods below never see.)
  def initialize(verbose = false)
    @Verbose = verbose
  end

  # Conjunction

  # returns: the daughters of node with edge label 'CJ', see get_dtrs
  def get_conjuncts(node)
    get_dtrs(node,'CJ')
  end

  # flatten:
  # push the head hash 'current' onto the array 'flat', followed
  # (recursively) by the hashes stored under its "conj" key.
  # The "conj" key is removed from every hash that is pushed.
  #
  # returns: flat
  def descend(current,flat)
    if current.nil?
      return flat
    end

    nested = current.has_key?("conj") ? current.delete("conj") : []
    flat.push current
    nested.each { |item| descend(item,flat) }

    flat
  end

  # Access

  # returns: the first daughter of node with the given edge label,
  # or nil if there is none
  def get_dtr(node,label)
    if (dtrs = node.children_by_edgelabels([label]))
      dtrs.first
    else
      if @Verbose then $stderr.puts " SelectHeadDtr: no #{label} dtr for #{node}" end
      nil
    end
  end

  # returns: whatever children_by_edgelabels yields for the given edge
  # label, or nil if that is nil/false
  def get_dtrs(node,label)
    dtrs = node.children_by_edgelabels([label])
    unless dtrs
      if @Verbose then $stderr.puts " SelectHeadDtr: no #{label} dtr for #{node}" end
      return nil
    end

    dtrs
  end

  # returns: the last daughter of node with the given edge label,
  # or nil if there is none
  def get_rightmost_dtr(node,label)
    children = node.children_by_edgelabels([label])
    # guard against a nil result, like get_dtr and get_dtrs do
    rightmost = children && children.last
    return rightmost if rightmost

    if @Verbose then $stderr.puts " SelectHeadDtr: no #{label} dtrs for #{node}" end
    nil
  end

  # def l2h(list)
  #   h = Hash.new
  #   while (list.length > 1) do
  #     h[list.shift] = list.shift
  #   end
  #   if list.length == 1 then
  #     $stderr.puts "l2h: odd number of elems: " + list.join(" / ")
  #   end
  #   h
  # end

end # Class HeadzHelpers
336
+
337
+
338
+