frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
data/lib/common/TntInterface.rb
ADDED
@@ -0,0 +1,44 @@
require "tempfile"
require "common/AbstractSynInterface"

################################################
# Interface class
class TntInterface < SynInterfaceTab
  TntInterface.announce_me()

  def TntInterface.system()
    return "tnt"
  end

  def TntInterface.service()
    return "pos_tagger"
  end

  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    tempfile = Tempfile.new("Tnt")
    TntInterface.fntab_words_to_file(infilename, tempfile)
    tempfile.close

    # 1. use grep to remove commentaries from file
    # 2. use sed to extract tags tag list:
    # - match one or more non-spaces
    # - match one or more spaces
    # - match one or more non-spaces and write to outfilename

    # This assumes that the experiment file entry for pos_tagger_path
    # has the form
    # pos_tagger_path = <program_name> <model>

    Kernel.system(@program_path + " " + tempfile.path +
                  ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\' > '+outfilename)

    tempfile.close(true) # delete tempfile
    unless `cat #{infilename} | wc -l`.strip ==
           `cat #{outfilename} | wc -l`.strip
      raise "Error: tagged file has different line number from corpus file!"
    end
  end
end
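A note on the shell pipeline above: it strips TnT's "%%" comment lines and keeps only the tag column of TnT's two-column output. The following is a minimal, self-contained Ruby sketch of that post-processing step, added here for illustration only; the sample TnT output is invented, and the class itself shells out to grep and sed instead.

# Illustration only: mimics the grep/sed post-processing in TntInterface#process_file.
# The sample TnT output lines are invented for this sketch.
sample_tnt_output = [
  "%% a TnT comment line",
  "the\tDT",
  "dog\tNN",
  "barks\tVBZ"
]

tag_lines = sample_tnt_output.reject { |l| l.start_with?("%%") }  # grep -v -E "^%%"
tags      = tag_lines.map { |l| l.split(/\s+/)[1] }               # sed: keep only the tag column

puts tags  # prints DT, NN and VBZ, one per line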
data/lib/common/Tree.rb
ADDED
@@ -0,0 +1,61 @@
require 'common/Graph'

class TreeNode < GraphNode

  def initialize(id)
    super(id)
  end

  # redo the ancestor-related methods,
  # since here we only have one parent per node
  def parent()
    retv = parents()
    if retv.nil?
      return nil
    else
      return retv.first
    end
  end

  def parent_label()
    retv = parent_labels()
    if retv.nil?
      return nil
    else
      return retv.first
    end
  end


  def parent_with_edgelabel()
    retv = parents_with_edgelabel()

    if retv.nil?
      return nil
    else
      return retv.first
    end
  end


  def add_parent(parent, edgelabel, varhash={})
    set_parent(parent, edgelabel, varhash)
  end

  def set_parent(parent, edgelabel, varhash={})
    # remove old parent
    each_parent_with_edgelabel { |label, parent|
      remove_parent(parent, label, varhash)
    }

    # set new parent
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"]
      unless parent.children_with_edgelabel().include? [edgelabel, self]
        parent.add_child(self, edgelabel)
      end
    end
  end
end
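TreeNode above specializes GraphNode for single-parent structures: set_parent discards any existing parent and also registers the node as a child on the parent's side. A minimal usage sketch, assuming the gem's lib directory is on the load path and that GraphNode (from common/Graph) provides the parents/parent_labels/add_child bookkeeping the methods above rely on; the node ids are invented.

require 'common/Tree'

root  = TreeNode.new("n0")
child = TreeNode.new("n1")

# attach child under root with edge label "HD";
# set_parent also calls root.add_child(child, "HD") for the reverse link
child.add_parent(root, "HD")

child.parent        # => root
child.parent_label  # => "HD"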
data/lib/common/TreetaggerInterface.rb
ADDED
@@ -0,0 +1,303 @@
# sp 30 11 06
# extended by TreeTaggerPOSInterface

require "tempfile"

require "common/AbstractSynInterface"

###########
# KE dec 7, 06
# common mixin for both Treetagger modules, doing the actual processing
module TreetaggerModule
  ###
  # Treetagger does both lemmatization and POS-tagging.
  # However, the way the SynInterface system is set up in Shalmaneser,
  # each SynInterface can offer only _one_ service.
  # This means that we cannot do a SynInterface that writes
  # both a POS file and a lemma file.
  # Instead, both will include this module, which does the
  # actual TreeTagger call and then stores the result in a file
  # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
  # but with a separate extension.
  # really_process_file checks for existence of this file because,
  # if the TreeTagger lemmatization and POS-tagging classes are called separately,
  # one of them will go first, and the 2nd one will not need to do the
  # TreeTagger call anymore
  #
  # really_process_file returns a filename, the name of the file containing
  # the TreeTagger output with both POS tags and lemma information
  #
  # WARNING: this method assumes that outfilename contains a suffix
  # that can be replaced by .TreeTagger
  def really_process_file(infilename, # string: name of input file
                          outfilename,# string: name of file that the caller is to produce
                          make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?

    # fabricate the filename in which the
    # actual TreeTagger output will be placed:
    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
    current_suffix = outfilename[outfilename.rindex(".")..-1]
    my_outfilename = File.dirname(outfilename) + "/" +
                     File.basename(outfilename, current_suffix) +
                     ".TreeTagger"

    ##
    # does it exist? then just return it
    if not(make_new_outfile_anyway) and File.exists?(my_outfilename)
      return my_outfilename
    end

    ##
    # else construct it, then return it
    tempfile = Tempfile.new("Treetagger")
    TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
    tempfile.close

    # call TreeTagger
    Kernel.system(@program_path+" "+tempfile.path +
                  " > " + my_outfilename)
    tempfile.close(true) # delete first tempfile

    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
    # resulting on a .tagged file missing the last (blank) line

    original_length = `cat #{infilename} | wc -l`.strip.to_i
    puts infilename
    lemmatised_length = `cat #{my_outfilename} | wc -l`.strip.to_i

    # `cp #{tempfile2.path()} /tmp/lout`

    case original_length - lemmatised_length
    when 0
      # everything ok, don't do anything
    when 1
      # add one more newline to the .tagged file
      `echo "" >> #{my_outfilename}`
    else
      # this is "real" error
      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
      STDERR.puts "Error: lemmatiser/tagger output for for #{File.basename(infilename)}"
      $stderr.puts "has different line number from corpus file!"
      raise
    end


    return my_outfilename
  end
end

#######################################
class TreetaggerInterface < SynInterfaceTab
  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  def TreetaggerInterface.system()
    return "treetagger"
  end

  ###
  def TreetaggerInterface.service()
    return "lemmatizer"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*

  def convert_to_berkeley(line)
    line.chomp!
    return line.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
  end


  ###
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    ttfilename = really_process_file(infilename, outfilename)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    tempfile2.close()

    # 2. use cut to get the actual lemmtisation

    Kernel.system("cat " + ttfilename +
                  ' | sed -e\'s/<EOS>//\' | cut -f3 > '+tempfile2.path())

    # transform ISO-8859-1 back to UTF-8,
    # write to 'outfilename'
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end
    tempfile2.open
    # AB: Internally all the flow is an utf-8 encoded stream.
    # TreeTagger consumes one byte encodings (but we should provide a
    # utf-8 model for German). So we convert utf-8 to latin1, then
    # process the text and convert it back to utf-8.
    #
    while line = tempfile2.gets
      #outfile.puts UtfIso.from_iso_8859_1(line)
      utf8line = UtfIso.from_iso_8859_1(line)
      outfile.puts convert_to_berkeley(utf8line)
    end

    # remove second tempfile, finalize output file
    tempfile2.close(true)
    outfile.close()

  end
end


# sp 30 11 06
#
# using TreeTagger for POS tagging of English text
#
# copy-and-paste from lemmatisation
#
# differences:
# 1. use field 2 and not 3 from the output
# 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
#
# KE 7 12 06
# change interface such that TreeTagger is called only once
# and both POS tags and lemma are read from the same files,
# rather than calling the tagger twice
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  def TreetaggerPOSInterface.system()
    return "treetagger"
  end

  ###
  def TreetaggerPOSInterface.service()
    return "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*

  def convert_to_collins(line)
    line.chomp!
    return line.gsub(/^PP/,"PRP").gsub(/^NP/,"NNP").gsub(/^VV/,"VB").gsub(/^VH/,"VB").gsub(/^SENT/,".")
  end

  ###
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    tt_filename = really_process_file(infilename, outfilename, true)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    tempfile2.close()

    # 2. use cut to get the actual lemmtisation

    Kernel.system("cat " + tt_filename +
                  ' | sed -e\'s/<EOS>//\' | cut -f2 > '+tempfile2.path())

    # transform ISO-8859-1 back to UTF-8,
    # write to 'outfilename'
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end
    tempfile2.open()
    while (line = tempfile2.gets())
      outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
    end

    # remove second tempfile, finalize output file
    tempfile2.close(true)
    outfile.close()
  end
end

###############
# an interpreter that only has Treetagger, no parser
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # names of the systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  def TreetaggerInterpreter.systems()
    return {
      "pos_tagger" => "treetagger",
    }
  end

  ###
  # names of additional systems that may be interpreted by this class
  # returns a hash service(string) -> system name(string)
  # same as names()
  def TreetaggerInterpreter.optional_systems()
    return {
      "lemmatizer" => "treetagger"
    }
  end

  ###
  # generalize over POS tags.
  #
  # returns one of:
  #
  # adj: adjective (phrase)
  # adv: adverb (phrase)
  # card: numbers, quantity phrases
  # con: conjunction
  # det: determiner, including possessive/demonstrative pronouns etc.
  # for: foreign material
  # noun: noun (phrase), including personal pronouns, proper names, expletives
  # part: particles, truncated words (German compound parts)
  # prep: preposition (phrase)
  # pun: punctuation, brackets, etc.
  # sent: sentence
  # top: top node of a sentence
  # verb: verb (phrase)
  # nil: something went wrong
  #
  # returns: string, or nil
  def TreetaggerInterpreter.category(node) # SynNode
    pt = TreetaggerInterpreter.pt(node)
    if pt.nil?
      # phrase type could not be determined
      return nil
    end

    pt.to_s.strip() =~ /^([^-]*)/
    case $1
    when /^JJ/ ,/(WH)?ADJP/, /^PDT/ then return "adj"
    when /^RB/, /(WH)?ADVP/, /^UH/ then return "adv"
    when /^CD/, /^QP/ then return "card"
    when /^CC/, /^WRB/, /^CONJP/ then return "con"
    when /^DT/, /^POS/ then return "det"
    when /^FW/, /^SYM/ then return "for"
    when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/ then return "noun"
    when /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then return "prep"
    when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then return "pun"
    when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then return "sent"
    when /^TOP/ then return "top"
    when /^TRACE/ then return "trace"
    when /^V/ , /^MD/ then return "verb"
    else
      # $stderr.puts "WARNING: Unknown category/POS "+c.to_s + " (English data)"
      return nil
    end
  end
end
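really_process_file caches the raw TreeTagger output next to the requested output file, replacing the final suffix with ".TreeTagger" (hence the WARNING about the suffix). Below is a small self-contained sketch of that name derivation, lifted from the lines above; the example path is invented.

# How TreetaggerModule#really_process_file derives its cache file name.
outfilename = "parses/corpus01.lemma"   # invented example path

current_suffix = outfilename[outfilename.rindex(".")..-1]   # ".lemma"
cache_name = File.dirname(outfilename) + "/" +
             File.basename(outfilename, current_suffix) +
             ".TreeTagger"

puts cache_name   # => parses/corpus01.TreeTagger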
data/lib/common/headz.rb
ADDED
@@ -0,0 +1,338 @@
# name: Module Headz
# auth: albu@coli.uni-sb.de
#
# modified KE Sept 04:
# changed from old Sentence pkg to new SalsaTigerSentence pkg
#
# modified KE April 05:
# suppress the flood of warnings
#
# modified SP June 05: added some more cases; change to SalsTigerRegXML
#
#
# INIT: REXML TIGER sentence,
# FUNC: syn_nodes(term/non_term) -> heads
#
#
# usage:
#
# h = Headz.new()
#
# hash = h.get_sem_head(node) # node is a SalsaTigerXmlNode obj
#
# head = hash["head"]
# prep = hash["prep"]
#
# if h.complex(head)
# print "preposition of conjunction involved"
# end

require "common/SalsaTigerRegXML"

class Headz

  def initialize()
    @Helpers = HeadzHelpers.new()
    @Verbose = false #KE 13.4.05: please not that many messages!
  end

  # head of one node
  def get_sem_head(node)
    gsh(node)
  end

  # all headz of top-nodes covering fe
  def get_fe_heads(fe)
    if (const = fe.children())
      const.map { |node|
        get_sem_head(node)
      }
    else
      $stderr.puts "Headz.get_sem_head: no children for FE #{fe}"
      []
    end
  end

  def gsh (node)
    if !node then
      if @Verbose then $stderr.puts "Headz.gsh: no input node" end
      return {}

    elsif node.is_terminal? then return Hash['head'=>node]

    else
      case node.category
      when 'AP'
        return gsh(@Helpers.get_dtr(node,'HD'))

      when 'AVP'
        return gsh(@Helpers.get_dtr(node,'HD'))
      when 'CAP', 'CAVP', 'CNP', 'CPP', 'CS', 'CVP'
        conjs = @Helpers.get_conjuncts(node)
        head = gsh(conjs.shift)
        if head
          head.update(Hash["conj"=>gsh_conjs(conjs)])
        end
        return head

      when 'NM'
        return gsh(@Helpers.get_rightmost_dtr(node,'NMC'))
      when 'NP'
        nk = @Helpers.get_rightmost_dtr(node,'NK')
        if nk
          return gsh(nk)
        else
          return gsh(@Helpers.get_rightmost_dtr(node, "NN"))
        end

      when 'PN'
        pncs = @Helpers.get_dtrs(node,'PNC')
        head = gsh(pncs.last)
        if head
          head.update(Hash["pncs"=>pncs])
        end
        return head

      when 'PP'
        return pp(node)

      when 'S'
        return s(node)
      when 'VROOT'
        dtrs = @Helpers.get_dtrs(node,'--')

        # discourse level node with sentence nodes below?
        # or conjunction with sentence nodes below?
        discourselevel_dtr = dtrs.detect { |n| n.category == "DL"}
        co_dtr = dtrs.detect { |n| n.category == "CO" }
        if discourselevel_dtr
          dtrs = discourselevel_dtr.children()
        elsif co_dtr
          dtrs = co_dtr.children()
        end


        # take first sentence node
        sent_dtr = dtrs.detect {|n| n.category =~ /^C?S/}
        if sent_dtr
          return gsh(sent_dtr)
        else
          # $stderr.puts "headz Warning: no sentence found below VROOT! Node #{node.id()}"
          return nil
        end

      when 'VP'
        return vp(node)

      when 'MTA'
        return gsh(@Helpers.get_rightmost_dtr(node,'ADC'))

      when 'VZ'
        return gsh(@Helpers.get_dtr(node,'HD'))
      else
        if @Verbose
          $stderr.puts " Headz.gsh: no rule for #{node.category}"
        end
        {}
      end
    end
  end

  # flatten the processed conjs to a list of (head) Hashes
  # containing no conj features themselves
  def gsh_conjs(conjs)
    flat = Array.new

    conjs.each {|conj|
      current = gsh(conj)
      @Helpers.descend(current,flat)
    }

    flat
  end

  #####################################3
  def pp(node)

    prep = node.terminals_sorted().detect { |n|
      (pt = n.part_of_speech()) and
      (pt =~ /^APPR/ or
       pt =~ /^PWAV/ or
       pt =~ /^C?PP/
      )
    }

    if (lastnk = @Helpers.get_rightmost_dtr(node,'NK'))
      head = gsh(lastnk)
      if head and prep
        head.update(Hash['prep'=>prep])
      end

    elsif (re = @Helpers.get_dtr(node,'RE'))
      head = gsh(re)
      if head and prep
        head.update(Hash['prep'=>prep])
      end
    else
      if @Verbose then $stderr.puts " pp: no rule for #{node}" end
    end

    head
  end

  ################
  def s(node)
    head = @Helpers.get_dtr(node,'HD')
    if !head
      # $stderr.puts " s: no head for #{node}"
      return Hash[]
    end

    if head.outdeg() == 0
      return gsh(head)
    end

    oc = @Helpers.get_dtr(node,'OC')
    case head.category
    when 'VVFIN'
      if svp = @Helpers.get_dtr(node,'SVP') then
        h = gsh(head)
        if h
          return h.update(Hash['svp'=>gsh(svp), 'oc'=>gsh(oc)])
        else
          return h
        end
      else
        return gsh(head)
      end

    when 'VAFIN'
      if oc && headd = @Helpers.get_dtr(oc,'HD')
        h = gsh(headd)
        if h
          return h.update(Hash['oc'=>gsh(oc)])
        else
          return h
        end

      elsif pd = @Helpers.get_dtr(node,'PD') && head = @Helpers.get_dtr(pd,'HD')
        return gsh(head)

      else
        if @Verbose then $stderr.puts " s: no rule for #{node}" end
      end
    else
      if @Verbose then $stderr.puts " s: no rule for #{node}" end
    end
  end

  ################
  def vp(node)
    head = gsh(@Helpers.get_dtr(node,'HD'))
    tmp = @Verbose
    @Verbose = false
    newHash = Hash.new
    ["da","oa"].each { |type|
      if (dtr = @Helpers.get_dtr(node,type.upcase))
        newHash[type] = gsh(dtr)
      end
    }
    @Verbose = tmp
    if head
      return head.update(newHash)
    else
      return newHash
    end
  end

  ################
  # Access
  def head(h)
    return h['head']
  end

  def complex(h)
    prep(h) or conj(h)
  end

  def prep(h)
    return h['prep']
  end

  def conj(h)
    return h['conj']
  end



end # Class Headz


class HeadzHelpers
  @Verbose = true

  # Conjunction

  def get_conjuncts(node)
    conjuncts = get_dtrs(node,'CJ')
  end

  # flatten
  def descend(current,flat)
    if current.nil?
      return flat
    end

    if current.has_key?("conj") then
      tmp = current.delete("conj")
      flat.push current
      tmp.each {|item|
        descend(item,flat)}
    else
      flat.push current
    end
  end

  # Zugriff

  def get_dtr(node,label)
    if (dtrs = node.children_by_edgelabels([label]))
      dtrs.first
    else
      if @Verbose then $stderr.puts " SelectHeadDtr: no #{label} dtr for #{node}" end
      nil
    end
  end

  def get_dtrs(node,label)
    if ! dtrs = node.children_by_edgelabels([label])
      if @Verbose then $stderr.puts " SelectHeadDtr: no #{label} dtr for #{node}" end
    else
      dtrs
    end
  end

  def get_rightmost_dtr(node,label)
    children = node.children_by_edgelabels([label])
    if re = children.last then re
    else
      if @Verbose then $stderr.puts " SelectHeadDtr: no #{label} dtrs for #{node}" end
      nil
    end
  end

  # def l2h(list)
  # h = Hash.new
  # while (list.length > 1) do
  # h[list.shift] = list.shift
  # end
  # if list.length == 1 then
  # $stderr.puts "l2h: odd number of elems: " + list.join(" / ")
  # end
  # h
  # end

end # Class HeadzHelpers
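Headz.get_sem_head returns a plain Hash whose "head" entry is the head node and which may also carry "prep", "conj" or other entries; the accessors at the end of the class read those keys. Below is a minimal sketch of that hash shape and the accessors, with a hand-built hash standing in for a real result (real values are SalsaTigerRegXML nodes, not strings) and assuming the gem's lib directory is on the load path.

require "common/headz"   # load-path assumption; path follows the gem's other requires

h = Headz.new()

# hand-built stand-in for a hash returned by h.get_sem_head(node)
head_hash = { "head" => "Haus", "prep" => "in" }

h.head(head_hash)      # => "Haus"
h.prep(head_hash)      # => "in"
h.conj(head_hash)      # => nil
h.complex(head_hash)   # => truthy, because a preposition is involved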