frprep 0.0.1.prealpha
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
data/lib/fred/FredConfigData.rb
@@ -0,0 +1,182 @@
+# FredConfigData
+# Katrin Erk April 05
+#
+# Frame disambiguation system:
+# access to a configuration and experiment description file
+
+require "common/ConfigData"
+
+##############################
+# Class FredConfigData
+#
+# inherits from ConfigData,
+# sets variable names appropriate to WSD task
+
+class FredConfigData < ConfigData
+  def initialize(filename)
+
+    # initialize config data object
+    super(filename, # config file
+          {
+            "experiment_ID" => "string",  # experiment ID
+            "enduser_mode" => "bool",     # work in enduser mode? (disallowing many things)
+
+            "preproc_descr_file_train" => "string", # path to preprocessing files
+            "preproc_descr_file_test" => "string",
+            "directory_output" => "string", # path to Salsa/Tiger XML output directory
+
+            "verbose" => "bool",          # print diagnostic messages?
+            "apply_to_all_known_targets" => "bool", # apply to all known targets rather than the ones with a frame?
+
+            "fred_directory" => "string",   # directory for internal info
+            "classifier_dir" => "string",   # write classifiers here
+
+            "classifier" => "list",         # classifiers
+
+            "dbtype" => "string",           # "mysql" or "sqlite"
+
+            "host" => "string",             # DB access: mysql only
+            "user" => "string",
+            "passwd" => "string",
+            "dbname" => "string",
+
+            # featurization info
+            "feature" => "list",            # which features to use for the classifier?
+            "binary_classifiers" => "bool", # make binary rather than n-ary classifiers?
+            "negsense" => "string",         # binary classifier: negative sense is..?
+            "numerical_features" => "string", # do what with numerical features?
+
+            # what to do with items that have multiple senses?
+            # 'binarize': binary classifiers, and consider positive
+            #             if the sense is among the gold senses
+            # 'join'    : make one joint sense
+            # 'repeat'  : make multiple occurrences of the item, one sense per occurrence
+            # 'keep'    : keep as separate labels
+            #
+            # multilabel: consider as assigned all labels
+            #             above a certain confidence threshold?
+            "handle_multilabel" => "string",
+            "assignment_confidence_threshold" => "float",
+
+            # single-sentence context?
+            "single_sent_context" => "bool",
+
+            # noncontiguous input? then we need access to a larger corpus
+            "noncontiguous_input" => "bool",
+            "larger_corpus_dir" => "string",
+            "larger_corpus_format" => "string",
+            "larger_corpus_encoding" => "string"
+          },
+          [ # variables
+            "train",
+            "exp_ID"
+          ]
+         )
+
+    # set access functions for list features
+    set_list_feature_access("classifier",
+                            method("access_classifier"))
+    set_list_feature_access("feature",
+                            method("access_feature"))
+  end
+
+  ###
+  # protected
+
+  #####
+  # access_feature
+  #
+  # access function for feature 'feature'
+  #
+  # assumed format:
+  #
+  #   feature = context 50
+  #   feature = context 2
+  #   feature = syn
+  #
+  # i.e. first the name of the feature type to use, then
+  # optionally a parameter,
+  # and the same feature can occur more than once (which makes sense
+  # only in case of parameters)
+  #
+  # returns:
+  # - If a feature is given as a parameter:
+  #   - If the feature is not set in the experiment file, nil
+  #   - If the feature is set and has a parameter, the list of
+  #     parameter values set for it. It is assumed that the parameters
+  #     are integers, and they are returned as integers
+  #   - If the feature is set and has no parameter, true
+  # - If no feature is given as parameter:
+  #   a list of all features that have been set in the experiment file.
+  #   Each feature is given as a tuple: the first element is the feature (a string),
+  #   all further elements are options (integers)
+  def access_feature(val_list,      # array:array:string: list of tuples defined in config file
+                                    #   for feature 'feature'
+                     feature = nil) # string: feature type name
+
+    if feature
+      # access options for this feature
+
+      # get the right tuples
+      positives = val_list.select { |entries|
+        entries.first() == feature
+      }.map { |entries|
+        entries[1]
+      }
+
+      if positives.empty?
+        # feature not defined
+        return nil
+
+      elsif positives.compact().empty?
+        # feature defined, but no parameters
+        return true
+
+      else
+        # feature defined, and has values
+        return positives.map { |par| par.to_i() }
+      end
+
+    else
+      # return all features that have been set
+      return val_list.map { |feature_name, *options|
+        [feature_name] + options.map { |o| o.to_i() }
+      }
+    end
+  end
+
+  #####
+  # access_classifier
+  #
+  # access function for feature 'classifier'
+  #
+  # assumed format in the config file:
+  #
+  #   feature = path [option]*
+  #
+  # i.e. first the name of the feature type to use, then
+  # optionally options associated with that feature,
+  # e.g. 'argrec': use that feature only when computing argrec
+  #
+  # the access function is called with parameter val_list, an array of
+  # string tuples, one string tuple for each feature defined.
+  # the first string in the tuple is the feature name, the rest are the options
+  #
+  # returns: a list of pairs [feature_name(string), options(array:string)]
+  # of defined features
+  def access_classifier(val_list) # array:array:string: list of tuples defined in config file
+                                  #   for feature 'feature'
+    if val_list.nil?
+      return []
+    else
+      return val_list.map { |cl_descr_tuple|
+        [cl_descr_tuple.first, cl_descr_tuple[1..-1]]
+      }
+    end
+  end
+
+end
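
To make the contract of access_feature concrete, here is a small hand-traced sketch. The feature names and parameter values are made up for illustration and are not taken from a real experiment file; the tuple list is the shape described in the comments above, as handed to the accessor registered via set_list_feature_access.

  # Hand-traced sketch of the access_feature contract (values are illustrative).
  # For experiment-file lines
  #
  #   feature = context 50
  #   feature = context 2
  #   feature = syn
  #
  # the registered accessor receives one string tuple per line:
  val_list = [["context", "50"], ["context", "2"], ["syn"]]

  # access_feature(val_list, "context") -> [50, 2]   parameters, as integers
  # access_feature(val_list, "syn")     -> true      feature set, no parameter
  # access_feature(val_list, "grfunc")  -> nil       feature not set at all (made-up name)
  # access_feature(val_list)            -> [["context", 50], ["context", 2], ["syn"]]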
data/lib/fred/FredConventions.rb
@@ -0,0 +1,232 @@
+# FredConventions
+# Katrin Erk June 05
+#
+# several small things that should be uniform
+# throughout the system
+
+require "common/ruby_class_extensions"
+
+require "common/EnduserMode"
+class Object
+
+  ###
+  # joining and breaking up senses
+  def fred_join_senses(senses)
+    return senses.sort().join("++")
+  end
+
+  def fred_split_sense(joined_senses)
+    return joined_senses.split("++")
+  end
+
+  ###
+  # fred_dirname
+  #
+  # constructs a directory name:
+  #   fred data directory / experiment ID / maindir / subdir
+  #
+  # if is_existing == "existing", the directory is checked for existence,
+  # if is_existing == "new", it is created if necessary
+  #
+  # returns: a string
+  def fred_dirname(exp,     # FredConfigData object
+                   maindir, # string: main part of directory name
+                   subdir,  # string: subpart of directory name
+                   is_existing = "existing") # string: "existing" or "new", default: existing
+
+    case is_existing
+    when "existing"
+      return File.existing_dir(exp.get("fred_directory"),
+                               exp.get("experiment_ID"),
+                               maindir,
+                               subdir)
+    when "new"
+      return File.new_dir(exp.get("fred_directory"),
+                          exp.get("experiment_ID"),
+                          maindir,
+                          subdir)
+    else
+      raise "Shouldn't be here: #{is_existing}"
+    end
+  end
+
+  ####
+  # filenames for feature files
+  def fred_feature_filename(lemma, sense = nil,
+                            do_binary = false)
+    if do_binary
+      return "fred.features.#{lemma}.SENSE.#{sense}"
+    else
+      return "fred.features.#{lemma}"
+    end
+  end
+
+  ####
+  # filenames for split files
+  def fred_split_filename(lemma)
+    return "fred.split.#{lemma}"
+  end
+
+  ###
+  # deconstruct split filename
+  # returns: lemma
+  def deconstruct_fred_split_filename(filename)
+    basename = File.basename(filename)
+    if basename =~ /^fred\.split\.(.*)/
+      return $1
+    else
+      return nil
+    end
+  end
+
+  ###
+  # deconstruct feature file name
+  # returns: hash with keys
+  #   "lemma"
+  #   "sense"
+  def deconstruct_fred_feature_filename(filename)
+
+    basename = File.basename(filename)
+    retv = Hash.new()
+    # binary:
+    # fred.features.#{lemma}.SENSE.#{sense}
+    if basename =~ /^fred\.features\.(.*)\.SENSE\.(.*)$/
+      retv["lemma"] = $1
+      retv["sense"] = $2
+    elsif basename =~ /^fred\.features\.(.*)/
+      # fred.features.#{lemma}
+      retv["lemma"] = $1
+
+    else
+      # complete mismatch
+      return nil
+    end
+
+    return retv
+  end
+
+  ####
+  # filename for answer key files
+  def fred_answerkey_filename(lemma)
+    return "fred.answerkey.#{lemma}"
+  end
+
+  ###
+  # classifier directory
+  def fred_classifier_directory(exp,           # FredConfigData object
+                                splitID = nil) # string or nil
+
+    if exp.get("classifier_dir")
+      # user-specified classifier directory
+
+      if splitID
+        return File.new_dir(exp.get("classifier_dir"), splitID)
+      else
+        return File.new_dir(exp.get("classifier_dir"))
+      end
+
+    else
+      # my classifier directory
+      if splitID
+        return fred_dirname(exp, "classifiers", splitID, "new")
+      else
+        return fred_dirname(exp, "classifiers", "all", "new")
+      end
+    end
+  end
+
+  ###
+  # classifier file
+  def fred_classifier_filename(classifier, lemma, sense = nil)
+    if sense
+      return "fred.classif.#{classifier}.LEMMA.#{lemma}.SENSE.#{sense}"
+    else
+      return "fred.classif.#{classifier}.LEMMA.#{lemma}"
+    end
+  end
+
+  def deconstruct_fred_classifier_filename(filename)
+    retv = Hash.new()
+    if filename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)\.SENSE\.(.*)$/
+      retv["lemma"] = $2
+      retv["sense"] = $3
+    elsif filename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)$/
+      retv["lemma"] = $2
+    end
+    return retv
+  end
+
+  ###
+  # result file
+  def fred_result_filename(lemma)
+    return "fred.result.#{lemma.gsub(/\./, "_")}"
+  end
+
+  ##########
+  # lemma and POS: combine into string separated by
+  # a separator character
+  #
+  # fred_lemmapos_combine: take two strings, return combined string
+  #   if POS is nil, returns lemma<separator character>
+  # fred_lemmapos_separate: take one string, return two strings
+  #   if no POS could be retrieved, returns nil as POS and the whole string as lemma
+  def fred_lemmapos_combine(lemma, # string
+                            pos)   # string
+    return lemma.to_s + "." + pos.to_s.gsub(/\./, "DOT")
+  end
+
+  ###
+  def fred_lemmapos_separate(lemmapos) # string
+    pieces = lemmapos.split(".")
+    if pieces.length() > 1
+      return [ pieces[0..-2].join("."), pieces[-1] ]
+    else
+      # no POS found, treat all of lemmapos as lemma
+      return [ lemmapos, nil ]
+    end
+  end
+end
+
+########################################
+# given a SynNode object representing a terminal,
+# return:
+# - the word
+# - the lemma
+# - the part of speech
+# - the named entity (if any)
+#
+# as a tuple
+#
+# WARNING: word and lemma are turned to lowercase
+module WordLemmaPosNe
+  def word_lemma_pos_ne(syn_obj, # SynNode object
+                        i)       # SynInterpreter class
+    unless syn_obj.is_terminal?
+      $stderr.puts "Featurization warning: unexpectedly received non-terminal"
+      return [ nil, nil, nil, nil ]
+    end
+
+    word = syn_obj.word()
+    if word
+      word.downcase!
+    end
+
+    lemma = i.lemma_backoff(syn_obj)
+    if lemma and SalsaTigerXMLHelper.unescape(lemma) == "<unknown>"
+      lemma = nil
+    end
+    if lemma
+      lemma.downcase!
+    end
+
+    pos = syn_obj.part_of_speech()
+
+    ne = syn_obj.get_attribute("ne")
+    unless ne
+      ne = syn_obj.get_attribute("headof_ne")
+    end
+
+    return [word, lemma, pos, ne]
+  end
+end
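
The naming helpers above are plain string functions, so their behaviour can be shown directly. The lemma, POS, sense and classifier values in this short sketch are made-up examples:

  require "fred/FredConventions"

  fred_lemmapos_combine("run", "v")   # => "run.v"
  fred_lemmapos_separate("run.v")     # => ["run", "v"]
  fred_lemmapos_separate("run")       # => ["run", nil]  (no POS found)

  name = fred_feature_filename("run.v", "Self_motion", true)
  # => "fred.features.run.v.SENSE.Self_motion"
  deconstruct_fred_feature_filename(name)
  # => {"lemma" => "run.v", "sense" => "Self_motion"}

  fred_classifier_filename("timbl", "run.v")
  # => "fred.classif.timbl.LEMMA.run.v"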
data/lib/fred/FredDetermineTargets.rb
@@ -0,0 +1,324 @@
+require "fred/FileZipped"
+
+require "fred/FredConfigData"
+require "common/SynInterfaces"
+require "fred/FredConventions"
+
+
+########################################
+# target determination classes:
+# either determine targets from existing annotation
+# with frames,
+# or use all known targets.
+class Targets
+  attr_reader :targets_okay
+
+  ###
+  def initialize(exp,               # experiment file object
+                 interpreter_class, # SynInterpreter class, or nil
+                 mode)              # string: "r", "w", "a", as in files
+    @exp = exp
+    @interpreter_class = interpreter_class
+
+    # keep recorded targets here.
+    # try to read old list now.
+    @targets = Hash.new()
+
+    # write target info in the classifier directory.
+    # This is _not_ dependent on a potential split ID
+    @dir = File.new_dir(fred_classifier_directory(@exp), "targets")
+
+    @targets_okay = true
+    case mode
+    when "w"
+      # start from scratch, no list of targets
+    when "a", "r"
+      # read existing file containing targets
+      begin
+        file = FileZipped.new(@dir + "targets.txt.gz")
+      rescue
+        # no stored target list present: signal this
+        @targets_okay = false
+        return
+      end
+      file.each { |line|
+        line.chomp!
+        if line =~ /^LEMMA (.+) SENSES (.+)$/
+          lemmapos = $1
+          senses = $2.split()
+          lemmapos.gsub!(/ /, '_')
+          # lemmapos.gsub!(/\.[A-Z]\./, '.')
+          @targets[lemmapos] = senses
+        end
+      }
+
+    else
+      $stderr.puts "Error: shouldn't be here."
+      exit 1
+    end
+
+    if ["w", "a"].include? mode
+      @record_targets = true
+    else
+      @record_targets = false
+    end
+  end
+
+  ###
+  # determine_targets:
+  # for a given SalsaTigerSentence,
+  # determine all targets,
+  # each as a _single_ main terminal node
+  #
+  # We need a single terminal node in order
+  # to compute the context window
+  #
+  # returns:
+  # hash: target_IDs -> list of senses
+  # where target_IDs is a pair [list of terminal IDs, main terminal ID]
+  #
+  # where a sense is represented as a hash:
+  #   "sense": sense, a string
+  #   "obj": FrameNode object
+  #   "all_targets": list of node IDs, may comprise more than a single node
+  #   "lex": lemma, or multiword expression in canonical form
+  #   "sid": sentence ID
+  def determine_targets(sent)
+    raise "overwrite me"
+  end
+
+  ##
+  # returns a list of lemma-pos combined strings
+  def get_lemmas()
+    return @targets.keys()
+  end
+
+  ##
+  # access to lemmas and POS, returns a list of pairs [lemma, pos] (string*string)
+  def get_lemma_pos()
+
+    return @targets.keys().map { |lemmapos| fred_lemmapos_separate(lemmapos) }
+  end
+
+  ##
+  # access to senses
+  def get_senses(lemmapos) # string, result of fred_lemmapos_combine
+
+    if @targets[lemmapos]
+      return @targets[lemmapos]
+    else
+      return []
+    end
+  end
+
+  ##
+  # write file
+  def done_reading_targets()
+    begin
+      file = FileZipped.new(@dir + "targets.txt.gz", "w")
+    rescue
+      $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
+      exit 1
+    end
+
+    @targets.each_pair { |lemma, senses|
+      file.puts "LEMMA #{lemma} SENSES " + senses.join(" ")
+    }
+
+    file.close()
+  end
+
+  ###############################
+  protected
+
+  ##
+  # record: record occurrence of a lemma/sense pair
+  # in the @targets data structure
+  def record(target_info)
+    lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
+    unless @targets[lemmapos]
+      @targets[lemmapos] = Array.new
+    end
+
+    unless @targets[lemmapos].include? target_info["sense"]
+      @targets[lemmapos] << target_info["sense"]
+    end
+  end
+end
+
+########################################
+class FindTargetsFromFrames < Targets
+  ###
+  # determine_targets:
+  # use existing frames to find targets
+  #
+  # returns:
+  # hash: target_IDs -> list of senses
+  # where target_IDs is a pair [list of terminal IDs, main terminal ID]
+  #
+  # where a sense is represented as a hash:
+  #   "sense": sense, a string
+  #   "obj": FrameNode object
+  #   "all_targets": list of node IDs, may comprise more than a single node
+  #   "lex": lemma, or multiword expression in canonical form
+  #   "sid": sentence ID
+  def determine_targets(st_sent) # SalsaTigerSentence object
+    retv = Hash.new()
+    st_sent.each_frame { |frame_obj|
+      # instance-specific computation:
+      # target and target positions
+      # WARNING: at this moment, we are
+      # not considering true multiword targets for German.
+      # Remove the "no_mwe" parameter in main_node_of_expr
+      # to change this
+      term = nil
+      all_targets = nil
+      if frame_obj.target.nil? or frame_obj.target.children.empty?
+        # no target, nothing to record
+
+      elsif @exp.get("language") == "de"
+        # don't consider true multiword targets for German
+        all_targets = frame_obj.target.children()
+        term = @interpreter_class.main_node_of_expr(all_targets, "no_mwe")
+
+      else
+        # for all other languages: try to figure out the head target word
+        # anyway
+        all_targets = frame_obj.target.children()
+        term = @interpreter_class.main_node_of_expr(all_targets)
+      end
+
+      if term and term.is_splitword?
+        # don't use parts of a word as main node
+        term = term.parent()
+      end
+      if term and term.is_terminal?
+        key = [all_targets.map { |t| t.id() }, term.id()]
+
+        unless retv[key]
+          retv[key] = Array.new()
+        end
+
+        pos = frame_obj.target().get_attribute("pos")
+        # gold POS available, may be in wrong form,
+        # i.e. not the same strings that @interpreter_class.category()
+        # would return
+        case pos
+        when /^[Vv]$/
+          pos = "verb"
+        when /^[Nn]$/
+          pos = "noun"
+        when /^[Aa]$/
+          pos = "adj"
+        when nil
+          pos = @interpreter_class.category(term)
+        end
+
+        target_info = {
+          "sense" => frame_obj.name(),
+          "obj" => frame_obj,
+          "all_targets" => frame_obj.target.children().map { |ch| ch.id() },
+          "lex" => frame_obj.target().get_attribute("lemma"),
+          "pos" => pos,
+          "sid" => st_sent.id()
+        }
+        # print "lex ", frame_obj.target(), " und ", frame_obj.target().get_attribute("lemma"), "\n"
+        retv[key] << target_info
+        if @record_targets
+          record(target_info)
+        end
+      end
+    }
+    return retv
+  end
+end
+
+########################################
+class FindAllTargets < Targets
+  ###
+  # determine_targets:
+  # use all known lemmas, minus stopwords
+  def initialize(exp,
+                 interpreter_class)
+    # read target info from file
+    super(exp, interpreter_class, "r")
+    @training_lemmapos_pairs = get_lemma_pos()
+
+    get_senses(@training_lemmapos_pairs)
+    # list of words to exclude from assignment, for now
+    @stoplemmas = [
+      "have",
+      "do",
+      "be"
+      # "make"
+    ]
+
+  end
+
+  ####
+  #
+  # returns:
+  # hash: target_IDs -> list of senses
+  # where target_IDs is a pair [list of terminal IDs, main terminal ID]
+  #
+  # where a sense is represented as a hash:
+  #   "sense": sense, a string
+  #   "obj": FrameNode object
+  #   "all_targets": list of node IDs, may comprise more than a single node
+  #   "lex": lemma, or multiword expression in canonical form
+  #   "sid": sentence ID
+  def determine_targets(sent) # SalsaTigerSentence object
+    # map target IDs to list of senses, in our case always [ nil ]
+    # because we assume that the senses of the targets we point out
+    # are unknown
+    retv = Hash.new()
+    # iterate through terminals of the sentence, check for inclusion
+    # of their lemma in @training_lemmas
+    sent.each_terminal { |node|
+      # we know this lemma from the training data,
+      # and it is not an auxiliary,
+      # and it is not in the stopword list,
+      # and the node does not represent a preposition
+
+      ### modified by ines, 17.10.2008
+      lemma = @interpreter_class.lemma_backoff(node)
+      pos = @interpreter_class.category(node)
+
+      # print "lemma ", lemma, " pos ", pos, "\n"
+      # reg = /\.[ANV]/
+      # if !reg.match(lemma)
+      #   if /verb/.match(pos)
+      #     lemma = lemma + ".V"
+      #   elsif /noun/.match(pos)
+      #     lemma = lemma + ".N"
+      #   elsif /adj/.match(pos)
+      #     lemma = lemma + ".A"
+      #   end
+      #   print "LEMMA ", lemma, " POS ", pos, "\n"
+      # end
+
+      if (@training_lemmapos_pairs.include? [lemma, pos] and
+          not(@interpreter_class.auxiliary?(node)) and
+          not(@stoplemmas.include? lemma) and
+          not(pos == "prep"))
+        key = [ [ node.id() ], node.id() ]
+
+        # take this as a target.
+        retv[ key ] = [
+          {
+            "sense" => nil,
+            "obj" => nil,
+            "all_targets" => [ node.id() ],
+            "lex" => lemma,
+            "pos" => pos,
+            "sid" => sent.id()
+          } ]
+        # no recording of target info,
+        # since we haven't determined anything new
+      end
+    }
+
+    return retv
+  end
+end
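
A minimal sketch of how these target-determination classes are driven, assuming the lib/ layout listed above is on the load path. The experiment file name is hypothetical, and the interpreter class is left as a placeholder (in the full system it is a SynInterpreter class chosen via SynInterfaces); this is an illustration of the mode protocol, not the system's actual driver code.

  require "fred/FredConfigData"
  require "fred/FredDetermineTargets"

  exp = FredConfigData.new("fred.exp")  # hypothetical experiment file
  interpreter_class = nil               # placeholder; normally a SynInterpreter class

  # Training direction ("w"): collect targets from gold frame annotation,
  # then persist targets.txt.gz in the classifier directory.
  targets = FindTargetsFromFrames.new(exp, interpreter_class, "w")
  # for each SalsaTigerSentence object sent in the corpus:
  #   senses_by_target = targets.determine_targets(sent)
  targets.done_reading_targets()

  # Application direction ("r"): reuse the recorded target list.
  finder = FindAllTargets.new(exp, interpreter_class)
  unless finder.targets_okay
    $stderr.puts "No recorded target list found"
  end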