frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,182 @@
|
|
1
|
+
# FredConfigData
|
2
|
+
# Katrin Erk April 05
|
3
|
+
#
|
4
|
+
# Frame disambiguation system:
|
5
|
+
# access to a configuration and experiment description file
|
6
|
+
|
7
|
+
require "common/ConfigData"
|
8
|
+
|
9
|
+
##############################
|
10
|
+
# Class FredConfigData
|
11
|
+
#
|
12
|
+
# inherits from ConfigData,
|
13
|
+
# sets variable names appropriate to WSD task
|
14
|
+
|
15
|
+
class FredConfigData < ConfigData
|
16
|
+
# Builds the Fred configuration object: declares every parameter the
# experiment file may contain (name => value type), passes them to the
# ConfigData constructor and registers the accessors for the two
# list-valued parameters.
def initialize(filename) # string: path of the config / experiment file
  parameter_types = {
    "experiment_ID" => "string", # experiment ID
    "enduser_mode" => "bool", # work in enduser mode? (disallowing many things)

    "preproc_descr_file_train" => "string", # path to preprocessing files
    "preproc_descr_file_test" => "string",
    "directory_output" => "string", # path to Salsa/Tiger XML output directory

    "verbose" => "bool", # print diagnostic messages?
    "apply_to_all_known_targets" => "bool", # apply to all known targets rather than the ones with a frame?

    "fred_directory" => "string", # directory for internal info
    "classifier_dir" => "string", # write classifiers here

    "classifier" => "list", # classifiers

    "dbtype" => "string", # "mysql" or "sqlite"

    # DB access.
    # NOTE(review): the original comment said "sqlite only"; host/user/passwd
    # look like settings for a server-based DB (mysql) -- confirm.
    "host" => "string",
    "user" => "string",
    "passwd" => "string",
    "dbname" => "string",

    # featurization info
    "feature" => "list", # which features to use for the classifier?
    "binary_classifiers" => "bool", # make binary rather than n-ary classifiers?
    "negsense" => "string", # binary classifier: negative sense is..?
    "numerical_features" => "string", # do what with numerical features?

    # what to do with items that have multiple senses?
    # 'binarize': binary classifiers, and consider positive
    #             if the sense is among the gold senses
    # 'join'    : make one joint sense
    # 'repeat'  : make multiple occurrences of the item, one sense per occ
    # 'keep'    : keep as separate labels
    #
    # multilabel: consider as assigned all labels
    # above a certain confidence threshold?
    "handle_multilabel" => "string",
    "assignment_confidence_threshold" => "float",

    # single-sentence context?
    "single_sent_context" => "bool",

    # noncontiguous input? then we need access to a larger corpus
    "noncontiguous_input" => "bool",
    "larger_corpus_dir" => "string",
    "larger_corpus_format" => "string",
    "larger_corpus_encoding" => "string"
  }

  # variables
  variables = ["train", "exp_ID"]

  # initialize config data object
  super(filename, parameter_types, variables)

  # set access functions for list features
  set_list_feature_access("classifier", method(:access_classifier))
  set_list_feature_access("feature", method(:access_feature))
end
|
82
|
+
|
83
|
+
###
|
84
|
+
# protected
|
85
|
+
|
86
|
+
#####
|
87
|
+
# access_feature
|
88
|
+
#
|
89
|
+
# access function for feature 'feature'
|
90
|
+
#
|
91
|
+
# assumed format:
|
92
|
+
#
|
93
|
+
# feature = context 50
|
94
|
+
# feature = context 2
|
95
|
+
# feature = syn
|
96
|
+
#
|
97
|
+
# i.e. first the name of the feature type to use, then
|
98
|
+
# optionally a parameter,
|
99
|
+
# and the same feature can occur more than once (which makes sense
|
100
|
+
# only in case of parameters)
|
101
|
+
#
|
102
|
+
#
|
103
|
+
# returns:
|
104
|
+
# - If a feature is given as a parameter,
|
105
|
+
# - If the feature is not set in the experiment file, nil
|
106
|
+
# - If the feature is set and has a parameter, the list of
|
107
|
+
# parameter values set for it. It is assumed that the parameters
|
108
|
+
# are integers, and they are returned as integers
|
109
|
+
# - If the feature is set and has no parameter, true
|
110
|
+
# - If no feature is given as parameter:
|
111
|
+
# a list of all features that have been set in the experiment file
|
112
|
+
# Each feature is given as a tuple: the first element is the feature (a string),
|
113
|
+
# all further elements are options (integers)
|
114
|
+
def access_feature(val_list, # array:array:string: list of tuples defined in config file
                             # for feature 'feature'; nil is treated as "no entries"
                   feature=nil) # string: feature type name
  # Tolerate a missing list (no 'feature' line in the experiment file),
  # in the same way access_classifier does, instead of failing on nil.
  tuples = val_list || []

  unless feature
    # no feature type requested: return all features that have been set,
    # each as [name, option, option, ...] with the options as integers
    return tuples.map { |feature_name, *options|
      [feature_name] + options.map { |o| o.to_i() }
    }
  end

  # first parameter (or nil) of every entry for the requested feature type
  parameters = tuples.select { |tuple| tuple.first() == feature }.
                      map { |tuple| tuple[1] }

  # feature not defined
  return nil if parameters.empty?

  # feature defined, but no parameters
  return true if parameters.compact().empty?

  # feature defined, and has values
  parameters.map { |par| par.to_i() }
end
|
148
|
+
|
149
|
+
#####
|
150
|
+
# access_classifier
|
151
|
+
#
|
152
|
+
# access function for feature 'classifier'
|
153
|
+
#
|
154
|
+
# assumed format in the config file:
|
155
|
+
#
|
156
|
+
# feature = path [option]*
|
157
|
+
#
|
158
|
+
# i.e. first the name of the feature type to use, then
|
159
|
+
# optionally options associated with that feature,
|
160
|
+
# e.g. 'argrec': use that feature only when computing argrec
|
161
|
+
#
|
162
|
+
# the access function is called with parameter val_list, an array of
|
163
|
+
# string tuples, one string tuple for each feature defined.
|
164
|
+
# the first string in the tuple is the feature name, the rest are the options
|
165
|
+
#
|
166
|
+
# returns: a list of pairs [feature_name(string), options(array:string)]
|
167
|
+
# of defined features
|
168
|
+
def access_classifier(val_list) # array:array:string: list of tuples defined in config file
                                # for feature 'classifier', or nil
  # nothing configured: empty list
  return [] if val_list.nil?

  # split every tuple into its head (the name) and its tail (the options)
  val_list.map do |descr|
    name = descr.first
    options = descr[1..-1]
    [name, options]
  end
end
|
178
|
+
|
179
|
+
end
|
180
|
+
|
181
|
+
|
182
|
+
|
@@ -0,0 +1,232 @@
|
|
1
|
+
# FredConventions
|
2
|
+
# Katrin Erk June 05
|
3
|
+
#
|
4
|
+
# several small things that should be uniform
|
5
|
+
# throughout the system
|
6
|
+
|
7
|
+
require "common/ruby_class_extensions"
|
8
|
+
|
9
|
+
require "common/EnduserMode"
|
10
|
+
class Object
|
11
|
+
|
12
|
+
###
|
13
|
+
# joining and breaking up senses
|
14
|
+
# Combines several senses into one label: the senses are sorted and
# glued together with "++".
def fred_join_senses(senses) # array:string
  ordered = senses.sort
  ordered.join("++")
end
|
17
|
+
|
18
|
+
# Breaks a "++"-joined sense label back into the list of its parts.
def fred_split_sense(joined_senses) # string
  separator = "++"
  joined_senses.split(separator)
end
|
21
|
+
|
22
|
+
###
|
23
|
+
# fred_dirname
|
24
|
+
#
|
25
|
+
# constructs a directory name:
|
26
|
+
# fred data directory / experiment ID / maindir / subdir
|
27
|
+
#
|
28
|
+
# if is_existing == "existing", the directory is checked for existence,
|
29
|
+
# if is_existing == "new", it is created if necessary
|
30
|
+
#
|
31
|
+
# returns: a string
|
32
|
+
def fred_dirname(exp, # FredConfigData object
                 maindir, # string: main part of directory name
                 subdir, # string: subpart of directory name
                 is_existing = "existing") # string: "existing" or "new", default: existing
  # anything but the two known modes is a programming error
  unless is_existing == "existing" || is_existing == "new"
    raise "Shouldn't be here: #{is_existing}"
  end

  # fred data directory / experiment ID / maindir / subdir
  path_parts = [
    exp.get("fred_directory"),
    exp.get("experiment_ID"),
    maindir,
    subdir
  ]

  if is_existing == "existing"
    File.existing_dir(*path_parts)
  else
    File.new_dir(*path_parts)
  end
end
|
52
|
+
|
53
|
+
####
|
54
|
+
# filenames for feature files
|
55
|
+
# Name of the feature file for a lemma; with do_binary set, the name
# additionally carries the sense the binary classifier is for.
def fred_feature_filename(lemma, sense = nil,
                          do_binary = false)
  name = "fred.features.#{lemma}"
  name += ".SENSE.#{sense}" if do_binary
  name
end
|
63
|
+
|
64
|
+
####
|
65
|
+
# filenames for split files
|
66
|
+
# Name of the split file for a lemma: the fixed prefix "fred.split."
# followed by the lemma.
def fred_split_filename(lemma)
  "fred.split." + lemma.to_s
end
|
69
|
+
|
70
|
+
###
|
71
|
+
# deconstruct split filename
|
72
|
+
# returns: lemma
|
73
|
+
def deconstruct_fred_split_filename(filename) # string: file name, with or without directory
  # whatever follows the "fred.split." prefix of the base name is the lemma;
  # the slice is nil when the base name does not carry that prefix
  File.basename(filename)[/^fred\.split\.(.*)/, 1]
end
|
81
|
+
|
82
|
+
###
|
83
|
+
# deconstruct feature file name
|
84
|
+
# returns: hash with keys
|
85
|
+
# "lemma"
|
86
|
+
# "sense"
|
87
|
+
def deconstruct_fred_feature_filename(filename) # string: file name, with or without directory
  basename = File.basename(filename)

  # binary:
  # fred.features.#{lemma}.SENSE.#{sense}
  binary = /^fred\.features\.(.*)\.SENSE\.(.*)$/.match(basename)
  return { "lemma" => binary[1], "sense" => binary[2] } if binary

  # fred.features.#{lemma}
  plain = /^fred\.features\.(.*)/.match(basename)
  return { "lemma" => plain[1] } if plain

  # complete mismatch
  nil
end
|
107
|
+
|
108
|
+
####
|
109
|
+
# filename for answer key files
|
110
|
+
# Name of the answer key file for a lemma: the fixed prefix
# "fred.answerkey." followed by the lemma.
def fred_answerkey_filename(lemma)
  "fred.answerkey." + lemma.to_s
end
|
113
|
+
|
114
|
+
###
|
115
|
+
# classifier directory
|
116
|
+
def fred_classifier_directory(exp, # FredConfigData object
                              splitID = nil) # string or nil
  user_dir = exp.get("classifier_dir")

  if user_dir
    # user-specified classifier directory, optionally with a
    # subdirectory for the split
    return splitID ? File.new_dir(user_dir, splitID) : File.new_dir(user_dir)
  end

  # my classifier directory: "classifiers" below the experiment directory,
  # with subdirectory "all" when no split is given
  fred_dirname(exp, "classifiers", splitID || "all", "new")
end
|
137
|
+
|
138
|
+
###
|
139
|
+
# classifier file
|
140
|
+
# Name of the classifier file for a classifier type and lemma; a sense
# part is appended only when a sense is given.
def fred_classifier_filename(classifier, lemma, sense=nil)
  base = "fred.classif.#{classifier}.LEMMA.#{lemma}"
  sense ? "#{base}.SENSE.#{sense}" : base
end
|
147
|
+
|
148
|
+
###
# deconstruct classifier file name
# returns: hash with keys
# "lemma"
# "sense" (only for binary classifiers)
# The hash is empty if the name does not look like a classifier file name.
def deconstruct_fred_classifier_filename(filename) # string: file name, with or without directory
  # Strip a directory part first, like the other deconstruct_* functions do;
  # the patterns below are anchored at the start of the name and would
  # otherwise never match a name that comes with its path.
  basename = File.basename(filename)

  retv = Hash.new()

  # binary:
  # fred.classif.#{classifier}.LEMMA.#{lemma}.SENSE.#{sense}
  binary = /^fred\.classif\.(.*)\.LEMMA\.(.*)\.SENSE\.(.*)$/.match(basename)
  if binary
    retv["lemma"] = binary[2]
    retv["sense"] = binary[3]
    return retv
  end

  # fred.classif.#{classifier}.LEMMA.#{lemma}
  plain = /^fred\.classif\.(.*)\.LEMMA\.(.*)$/.match(basename)
  retv["lemma"] = plain[2] if plain

  retv
end
|
158
|
+
|
159
|
+
###
|
160
|
+
# result file
|
161
|
+
# Name of the result file for a lemma; every "." in the lemma is
# replaced by "_".
def fred_result_filename(lemma) # string
  "fred.result." + lemma.tr(".", "_")
end
|
164
|
+
|
165
|
+
##########
|
166
|
+
# lemma and POS: combine into string separated by
|
167
|
+
# a separator character
|
168
|
+
#
|
169
|
+
# fred_lemmapos_combine: take two strings, return combined string
|
170
|
+
# if POS is nil, returns lemma<separator character>
|
171
|
+
# fred_lemmapos_separate: take one string, return two strings
|
172
|
+
# if no POS could be retrieved, returns nil as POS and the whole string as lemma
|
173
|
+
def fred_lemmapos_combine(lemma, # string
                          pos) # string, or nil
  # the "." is the separator, so dots inside the POS are spelled out as "DOT"
  escaped_pos = pos.to_s.gsub(/\./, "DOT")
  "#{lemma}.#{escaped_pos}"
end
|
177
|
+
|
178
|
+
###
|
179
|
+
def fred_lemmapos_separate(lemmapos) # string
  # the last dot-separated piece is the POS, everything before it the lemma
  *lemma_pieces, pos = lemmapos.split(".")

  # no POS found, treat all of lemmapos as lemma
  return [ lemmapos, nil ] if lemma_pieces.empty?

  [ lemma_pieces.join("."), pos ]
end
|
188
|
+
end
|
189
|
+
|
190
|
+
########################################
|
191
|
+
# given a SynNode object representing a terminal,
|
192
|
+
# return:
|
193
|
+
# - the word
|
194
|
+
# - the lemma
|
195
|
+
# - the part of speech
|
196
|
+
# - the named entity (if any)
|
197
|
+
#
|
198
|
+
# as a tuple
|
199
|
+
#
|
200
|
+
# WARNING: word and lemma are turned to lowercase
|
201
|
+
module WordLemmaPosNe
|
202
|
+
def word_lemma_pos_ne(syn_obj, # SynNode object
                      i) # SynInterpreter class
  unless syn_obj.is_terminal?
    $stderr.puts "Featurization warning: unexpectedly received non-terminal"
    return [ nil, nil, nil, nil ]
  end

  # Lowercase copies: the strings handed out by the node and the interpreter
  # are left untouched (an in-place downcase! would change them for every
  # other user of the node, and fails on frozen strings).
  word = syn_obj.word()
  word = word.downcase if word

  lemma = i.lemma_backoff(syn_obj)
  # an "<unknown>" lemma counts as no lemma at all
  if lemma and SalsaTigerXMLHelper.unescape(lemma) == "<unknown>"
    lemma = nil
  end
  lemma = lemma.downcase if lemma

  pos = syn_obj.part_of_speech()

  # named entity: attribute "ne", falling back to "headof_ne"
  ne = syn_obj.get_attribute("ne") || syn_obj.get_attribute("headof_ne")

  [word, lemma, pos, ne]
end
|
231
|
+
end
|
232
|
+
|
@@ -0,0 +1,324 @@
|
|
1
|
+
require "fred/FileZipped"
|
2
|
+
|
3
|
+
require "fred/FredConfigData"
|
4
|
+
require "common/SynInterfaces"
|
5
|
+
require "fred/FredConventions"
|
6
|
+
|
7
|
+
|
8
|
+
########################################
|
9
|
+
# target determination classes:
|
10
|
+
# either determine targets from existing annotation
|
11
|
+
# with frames,
|
12
|
+
# or use all known targets.
|
13
|
+
class Targets
  # false if mode was "r" or "a" and the stored target list could not be opened
  attr_reader :targets_okay

  ###
  # Sets up the target store. In mode "w" the store starts out empty;
  # in modes "a" and "r" the previously written target list is read.
  # Targets found later are recorded only in modes "w" and "a".
  def initialize(exp, # experiment file object
                 interpreter_class, # SynInterpreter class, or nil
                 mode) # string: "r", "w", "a", as in files
    @exp = exp
    @interpreter_class = interpreter_class

    # keep recorded targets here: lemmapos string -> list of senses
    @targets = Hash.new()

    # write target info in the classifier directory.
    # This is _not_ dependent on a potential split ID
    @dir = File.new_dir(fred_classifier_directory(@exp), "targets")

    @targets_okay = true
    # defined on every path, including the early return below
    @record_targets = false

    case mode
    when "w"
      # start from scratch, no list of targets
    when "a", "r"
      # read existing file containing targets
      begin
        file = FileZipped.new(@dir + "targets.txt.gz")
      rescue
        # no stored target list present: signal this
        @targets_okay = false
        return
      end

      begin
        file.each { |line|
          entry = /^LEMMA (.+) SENSES (.+)$/.match(line.chomp)
          next unless entry

          # blanks inside the lemma part are stored as underscores
          lemmapos = entry[1].gsub(/ /, '_')
          @targets[lemmapos] = entry[2].split()
        }
      ensure
        # release the read handle even if a line cannot be processed
        file.close() if file.respond_to?(:close)
      end

    else
      $stderr.puts "Error: shouldn't be here."
      exit 1
    end

    @record_targets = ["w", "a"].include?(mode)
  end

  ###
  # determine_targets:
  # for a given SalsaTigerSentence,
  # determine all targets,
  # each as a _single_ main terminal node
  #
  # We need a single terminal node in order
  # to compute the context window
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": sense, a string
  #   "obj":   FrameNode object
  #   "all_targets": list of node IDs, may comprise more than a single node
  #   "lex":   lemma, or multiword expression in canonical form
  #   "sid": sentence ID
  #
  # To be overwritten by the subclasses; this version always raises.
  def determine_targets(sent)
    raise "overwrite me"
  end

  ##
  # returns a list of lemma-pos combined strings
  def get_lemmas()
    @targets.keys()
  end

  ##
  # access to lemmas and POS, returns a list of pairs [lemma, pos] (string*string)
  def get_lemma_pos()
    @targets.keys().map { |lemmapos| fred_lemmapos_separate(lemmapos) }
  end

  ##
  # access to senses: the senses recorded for this lemma-pos string,
  # or an empty list if there are none
  def get_senses(lemmapos) # string, result of fred_lemmapos_combine
    @targets[lemmapos] || []
  end

  ##
  # write file: stores the current target list as targets.txt.gz,
  # one "LEMMA <lemmapos> SENSES <sense> <sense> ..." line per entry.
  # Prints an error and exits if the file cannot be opened for writing.
  def done_reading_targets()
    begin
      file = FileZipped.new(@dir + "targets.txt.gz", "w")
    rescue
      $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
      exit 1
    end

    begin
      @targets.each_pair { |lemma, senses|
        file.puts "LEMMA #{lemma} SENSES "+ senses.join(" ")
      }
    ensure
      # close the handle even if writing fails half-way
      file.close()
    end
  end

  ###############################
  protected

  ##
  # record: record occurrence of a lemma/sense pair
  # in the @targets data structure; each sense is kept once per lemma
  def record(target_info) # hash with keys "lex", "pos", "sense"
    lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
    senses = (@targets[lemmapos] ||= Array.new)

    sense = target_info["sense"]
    senses << sense unless senses.include? sense
  end
end
|
148
|
+
|
149
|
+
########################################
|
150
|
+
class FindTargetsFromFrames < Targets
|
151
|
+
###
|
152
|
+
# determine_targets:
|
153
|
+
# use existing frames to find targets
|
154
|
+
#
|
155
|
+
# returns:
|
156
|
+
# hash: target_IDs -> list of senses
|
157
|
+
# where target_IDs is a pair [list of terminal IDs, main terminal ID]
|
158
|
+
#
|
159
|
+
# where a sense is represented as a hash:
|
160
|
+
# "sense": sense, a string
|
161
|
+
# "obj": FrameNode object
|
162
|
+
# "all_targets": list of node IDs, may comprise more than a single node
|
163
|
+
# "lex": lemma, or multiword expression in canonical form
|
164
|
+
# "sid": sentence ID
|
165
|
+
def determine_targets(st_sent) #SalsaTigerSentence object
  found = Hash.new()

  st_sent.each_frame do |frame|
    # no target, nothing to record
    next if frame.target.nil? || frame.target.children.empty?

    # instance-specific computation:
    # target and target positions
    # WARNING: at this moment, we are
    # not considering true multiword targets for German.
    # Remove the "no_mwe" parameter in main_node_of_expr
    # to change this
    # NOTE(review): "language" is not among the parameters that
    # FredConfigData declares -- confirm that @exp.get("language") is valid.
    german = @exp.get("language") == "de"
    target_nodes = frame.target.children()
    main_node =
      if german
        # don't consider true multiword targets for German
        @interpreter_class.main_node_of_expr(target_nodes, "no_mwe")
      else
        # for all other languages: try to figure out the head target word
        # anyway
        @interpreter_class.main_node_of_expr(target_nodes)
      end

    # don't use parts of a word as main node
    main_node = main_node.parent() if main_node && main_node.is_splitword?

    # only a terminal can serve as the single main node
    next unless main_node && main_node.is_terminal?

    key = [target_nodes.map { |node| node.id() }, main_node.id()]
    found[key] ||= Array.new()

    # gold POS available, may be in wrong form,
    # i.e. not the same strings that @interpreter_class.category()
    # would return
    gold_pos = frame.target().get_attribute("pos")
    pos =
      case gold_pos
      when /^[Vv]$/ then "verb"
      when /^[Nn]$/ then "noun"
      when /^[Aa]$/ then "adj"
      when nil then @interpreter_class.category(main_node)
      else gold_pos
      end

    target_info = {
      "sense" => frame.name(),
      "obj" => frame,
      "all_targets" => frame.target.children().map { |child| child.id() },
      "lex" => frame.target().get_attribute("lemma"),
      "pos" => pos,
      "sid" => st_sent.id()
    }

    found[key] << target_info
    record(target_info) if @record_targets
  end

  found
end
|
234
|
+
end
|
235
|
+
|
236
|
+
########################################
|
237
|
+
class FindAllTargets < Targets
|
238
|
+
###
|
239
|
+
# determine_targets:
|
240
|
+
# use all known lemmas, minus stopwords
|
241
|
+
def initialize(exp, # experiment file object
               interpreter_class) # SynInterpreter class
  # read target info from file
  super(exp, interpreter_class, "r")

  # [lemma, pos] pairs of all targets in the stored target list
  @training_lemmapos_pairs = get_lemma_pos()

  # (A call get_senses(@training_lemmapos_pairs) used to follow here. It
  # passed the whole pair list where a single lemma-pos string is expected
  # and threw the result away, so it had no effect and has been dropped.)

  # list of words to exclude from assignment, for now
  # ("make" was listed as a further candidate, but is not excluded)
  @stoplemmas = [
    "have",
    "do",
    "be"
  ]
end
|
257
|
+
|
258
|
+
####
|
259
|
+
#
|
260
|
+
# returns:
|
261
|
+
# hash: target_IDs -> list of senses
|
262
|
+
# where target_IDs is a pair [list of terminal IDs, main terminal ID]
|
263
|
+
#
|
264
|
+
# where a sense is represented as a hash:
|
265
|
+
# "sense": sense, a string
|
266
|
+
# "obj": FrameNode object
|
267
|
+
# "all_targets": list of node IDs, may comprise more than a single node
|
268
|
+
# "lex": lemma, or multiword expression in canonical form
|
269
|
+
# "sid": sentence ID
|
270
|
+
def determine_targets(sent) #SalsaTigerSentence object
  # map target IDs to list of senses, in our case always a single entry
  # with sense nil, because we assume that the senses of the targets
  # we point out are unknown
  targets = Hash.new()

  # iterate through terminals of the sentence, check for inclusion
  # of their [lemma, pos] pair in @training_lemmapos_pairs
  sent.each_terminal do |node|
    lemma = @interpreter_class.lemma_backoff(node)
    pos = @interpreter_class.category(node)

    # we know this lemma from the training data,
    next unless @training_lemmapos_pairs.include? [lemma, pos]
    # and it is not an auxiliary,
    next if @interpreter_class.auxiliary?(node)
    # and it is not in the stopword list
    next if @stoplemmas.include? lemma
    # and the node does not represent a preposition
    next if pos == "prep"

    # take this as a target.
    node_id = node.id()
    targets[ [ [ node_id ], node_id ] ] = [
      {
        "sense" => nil,
        "obj" => nil,
        "all_targets" => [ node_id ],
        "lex" => lemma,
        "pos" => pos,
        "sid" => sent.id()
      }
    ]
    # no recording of target info,
    # since we haven't determined anything new
  end

  targets
end
|
323
|
+
end
|
324
|
+
|