shalmaneser-fred 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,47 @@
1
+ # AB: 2011-11-13
2
+ # Initial import done, need to reimplement the whole interface.
3
+ require 'fred/FredFeaturize'
4
+ require 'fred/FredSplit'
5
+ require 'fred/FredTrain'
6
+ require 'fred/FredTest'
7
+ require 'fred/FredEval'
8
+
9
module Fred
  # Top-level driver: picks the task object that matches the '--task'
  # command line option and runs it.
  class Fred
    # Names of all tasks that #assign can dispatch.
    KNOWN_TASKS = %w[featurize refeaturize split train test eval].freeze

    # @param options [Array] a two-element array [exp, opts]: the experiment
    #   configuration object and the hash of command line options
    #   (presumably the pair returned by Fred::OptParser.parse -- confirm
    #   against the caller). opts must respond to [] and hold the task name
    #   under the key '--task'.
    def initialize(options)
      @exp, @opts = options
      @task = @opts['--task']
    end

    ##
    # Performs the task given on the command line: builds the matching task
    # object, calls its #compute method and reports completion on stderr.
    #
    # @raise [RuntimeError] if the task name is not one of KNOWN_TASKS
    def assign
      # initialize task object
      task_obj =
        case @task
        when "featurize"
          FredFeaturize.new(@exp, @opts)
        when "refeaturize"
          # same class as "featurize", flagged as a re-featurization run
          FredFeaturize.new(@exp, @opts, "refeaturize" => true)
        when "split"
          FredSplit.new(@exp, @opts)
        when "train"
          FredTrain.new(@exp, @opts)
        when "test"
          FredTest.new(@exp, @opts)
        when "eval"
          FredEval.new(@exp, @opts)
        else
          # @todo AB: this <else> condition should be impossible;
          #   validate the task in the option parser instead.
          raise "Fred: unknown task #{@task.inspect}. " \
                "Known tasks: #{KNOWN_TASKS.join(', ')}"
        end

      task_obj.compute

      $stderr.puts "Fred: Done."
    end
  end # class Fred
end # module Fred
@@ -0,0 +1,185 @@
1
+ # FredConfigData
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system:
5
+ # access to a configuration and experiment description file
6
+
7
+ require "common/config_data"
8
+
9
+ ##############################
10
+ # Class FredConfigData
11
+ #
12
+ # inherits from ConfigData,
13
+ # sets variable names appropriate to WSD task
14
+
15
# Configuration / experiment file access for the frame disambiguation
# system. Inherits from ConfigData and registers the configuration keys
# used by the WSD task, plus access functions for the two list-valued
# keys "classifier" and "feature".
class FredConfigData < ConfigData
  # Configuration key => name of its value type, handed to ConfigData.
  CONFIG_DEFS = {
    "experiment_ID" => "string", # experiment ID
    "enduser_mode" => "bool", # work in enduser mode? (disallowing many things)

    "preproc_descr_file_train" => "string", # path to preprocessing files
    "preproc_descr_file_test" => "string",
    "directory_output" => "string", # path to Salsa/Tiger XML output directory

    "verbose" => "bool", # print diagnostic messages?
    "apply_to_all_known_targets" => "bool", # apply to all known targets rather than the ones with a frame?

    "fred_directory" => "string", # directory for internal info
    "classifier_dir" => "string", # write classifiers here

    "classifier" => "list", # classifiers

    "dbtype" => "string", # "mysql" or "sqlite"

    # DB access.
    # NOTE(review): the original comment read "sqlite only", but
    # host/user/passwd look like MySQL connection settings -- confirm.
    "host" => "string",
    "user" => "string",
    "passwd" => "string",
    "dbname" => "string",

    # featurization info
    "feature" => "list", # which features to use for the classifier?
    "binary_classifiers" => "bool", # make binary rather than n-ary classifiers?
    "negsense" => "string", # binary classifier: negative sense is..?
    "numerical_features" => "string", # do what with numerical features?

    # what to do with items that have multiple senses?
    # 'binarize': binary classifiers, and consider positive
    #             if the sense is among the gold senses
    # 'join'    : make one joint sense
    # 'repeat'  : make multiple occurrences of the item, one sense per occ
    # 'keep'    : keep as separate labels
    #
    # multilabel: consider as assigned all labels
    # above a certain confidence threshold?
    "handle_multilabel" => "string",
    "assignment_confidence_threshold" => "float",

    # single-sentence context?
    "single_sent_context" => "bool",

    # noncontiguous input? then we need access to a larger corpus
    "noncontiguous_input" => "bool",
    "larger_corpus_dir" => "string",
    "larger_corpus_format" => "string",
    "larger_corpus_encoding" => "string",
    # Imported from PrepConfigData
    'do_postag' => 'bool',
    'do_lemmatize' => 'bool',
    'do_parse' => 'bool',
    'pos_tagger' => 'string',
    'lemmatizer' => 'string',
    'parser' => 'string',
    'directory_preprocessed' => 'string',
    'language' => 'string'
  }

  # @param filename [String] path of the experiment file, passed on to
  #   ConfigData together with CONFIG_DEFS.
  def initialize(filename)
    # NOTE(review): the meaning of the third argument is defined by
    # ConfigData, which is not visible here -- confirm.
    super(filename, CONFIG_DEFS, ["train", "exp_ID"])

    # set access functions for list features
    set_list_feature_access("classifier", method("access_classifier"))
    set_list_feature_access("feature", method("access_feature"))
  end

  ###
  # protected

  #####
  # access_feature
  #
  # access function for feature 'feature'
  #
  # assumed format:
  #
  # feature = context 50
  # feature = context 2
  # feature = syn
  #
  # i.e. first the name of the feature type to use, then
  # optionally a parameter,
  # and the same feature can occur more than once (which makes sense
  # only in case of parameters)
  #
  # A nil val_list is treated like an empty list, in line with
  # access_classifier.
  #
  # @param val_list [Array<Array<String>>, nil] list of tuples defined in
  #   the config file for feature 'feature'
  # @param feature [String, nil] feature type name
  # @return [nil, true, Array]
  #   - If a feature is given as a parameter:
  #     - nil if the feature is not set in the experiment file
  #     - true if the feature is set and none of its entries has a parameter
  #     - otherwise the list of first parameters of its entries, converted
  #       with to_i (an entry without a parameter yields 0)
  #   - If no feature is given as parameter:
  #     a list of all features that have been set in the experiment file.
  #     Each feature is given as a tuple: the first element is the feature
  #     (a string), all further elements are options (integers)
  def access_feature(val_list, feature = nil)
    tuples = val_list || []

    if feature
      # first parameter (or nil) of every entry for this feature
      positives = tuples.select { |entries|
        entries.first == feature
      }.map { |entries|
        entries[1]
      }

      # feature not defined
      return nil if positives.empty?
      # feature defined, but no parameters
      return true if positives.compact.empty?

      # feature defined, and has values
      positives.map { |par| par.to_i }
    else
      # return all features that have been set
      tuples.map { |feature_name, *options|
        [feature_name] + options.map { |o| o.to_i }
      }
    end
  end

  #####
  # access_classifier
  #
  # access function for feature 'classifier'
  #
  # The access function is called with parameter val_list, an array of
  # string tuples, one tuple per 'classifier' entry in the config file.
  # The first string in a tuple is the name, the rest are its options.
  #
  # @param val_list [Array<Array<String>>, nil] list of tuples defined
  #   in the config file for feature 'classifier'
  # @return [Array] a list of pairs [name(string), options(array:string)];
  #   empty if val_list is nil
  def access_classifier(val_list)
    return [] if val_list.nil?

    val_list.map do |cl_descr_tuple|
      [cl_descr_tuple.first, cl_descr_tuple[1..-1]]
    end
  end
end
183
+
184
+
185
+
@@ -0,0 +1,23 @@
1
+ # Kept only for compatibility; requiring "md5" is obsolete, use "digest/md5" instead.
2
+ #
3
+ # $RoughId: md5.rb,v 1.4 2001/07/13 15:38:27 knu Exp $
4
+ # $Id: md5.rb 12008 2007-03-06 10:12:12Z knu $
5
+
6
+ require 'digest/md5'
7
+
8
# Compatibility shim for the old "md5" library: a Digest::MD5 subclass whose
# constructor optionally takes the initial data to digest.
class MD5 < Digest::MD5
  class << self
    # Keep the inherited constructor reachable under its historical name.
    alias orig_new new

    # Creates a digest object; when +str+ is given (and is not nil/false)
    # the digest is updated with it before being returned.
    def new(str = nil)
      digest = orig_new
      return digest unless str

      digest.update(str)
    end

    # Alternative spelling of MD5.new.
    def md5(*args)
      new(*args)
    end
  end
end
@@ -0,0 +1,250 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ # AB, 2011-11-13
4
+
5
+ #require 'optparse' # for reimplementation
6
+ require 'getoptlong'
7
+ require "fred/fred_config_data"
8
+
9
module Fred

  # This class parses options for Fred.
  #
  # NOTE: the original code placed a bare `private` in front of
  # self.options_hash and self.help. `private` has no effect on methods
  # defined with `def self.`, so both have always been publicly callable;
  # they are deliberately left public so that existing callers keep working.
  class OptParser
    # Parses the command line, opens the experiment file and fills in
    # defaults for some experiment file entries.
    #
    # The options are read through GetoptLong, which is created without an
    # explicit argument list and therefore processes ARGV; +cmd_opts+ is
    # accepted but not consulted.
    #
    # Side effects: sets the global $ENDUSER_MODE from the experiment file
    # entry "enduser_mode"; prints the help text and exits with status 0 if
    # --help is given or --task is missing; prints a message to stderr and
    # exits with status 1 on an unknown option, an unknown task, an option
    # that does not belong to the chosen task, a missing --expfile, or
    # 'handle_multilabel=binarize' without binary classifiers.
    #
    # @param cmd_opts [Object] unused
    # @return [Array] pair [exp, opts]: the FredConfigData object and a hash
    #   from long option name to option argument
    # @raise [RuntimeError] if the experiment ID contains characters other
    #   than A-Za-z0-9_
    def self.parse(cmd_opts)
      tasks = {
        "featurize" => [
          ['--dataset', '-d', GetoptLong::REQUIRED_ARGUMENT], # set to featurize: 'train' or 'test'
          ["--append", "-A", GetoptLong::NO_ARGUMENT]
        ],
        "refeaturize" => [
          ['--dataset', '-d', GetoptLong::REQUIRED_ARGUMENT], # set to featurize: 'train' or 'test'
          ["--append", "-A", GetoptLong::NO_ARGUMENT]
        ],
        "split" => [
          ['--logID', '-i', GetoptLong::REQUIRED_ARGUMENT], # splitlog ID, required, no default
          ['--trainpercent', '-r', GetoptLong::REQUIRED_ARGUMENT] # percentage training data, default: 90
        ],
        "train" => [
          # splitlog ID; if given, will train on split
          # rather than all training data
          ['--logID', '-i', GetoptLong::REQUIRED_ARGUMENT]
        ],
        "test" => [
          # splitlog ID: if given, test on this split of the training data
          ['--logID', '-i', GetoptLong::REQUIRED_ARGUMENT],
          # set this to compute baseline rather than apply classifiers
          ['--baseline', '-b', GetoptLong::NO_ARGUMENT],
          # set this to prevent output of disambiguated test data
          ['--nooutput', '-N', GetoptLong::NO_ARGUMENT]
        ],
        "eval" => [
          ['--logID', '-i', GetoptLong::REQUIRED_ARGUMENT], # splitlog ID: if given, evaluate this split.
          ['--printLog', '-l', GetoptLong::NO_ARGUMENT]
        ]
      }

      # general options
      optnames = [
        ['--help', '-h', GetoptLong::NO_ARGUMENT], # get help
        ['--expfile', '-e', GetoptLong::REQUIRED_ARGUMENT], # experiment file name (and path), no default
        ['--task', '-t', GetoptLong::REQUIRED_ARGUMENT] # task to perform: one of tasks.keys, no default
      ]
      # append task-specific to general options; several tasks share
      # options (e.g. --logID), so drop the duplicates
      tasks.values.each { |more_optnames|
        optnames.concat more_optnames
      }
      optnames.uniq!

      # asterisk: "explode" array into individual parameters
      begin
        opts = options_hash(GetoptLong.new(*optnames))
      rescue => e
        # The exception must be converted to text explicitly: String#+ with
        # an exception object raises TypeError instead of reporting the error.
        $stderr.puts "Error: unknown command line option: #{e.message}"
        exit 1
      end

      ##
      # are we being asked for help?
      if opts['--help']
        help()
        exit(0)
      end

      ##
      # now find the task
      task = opts['--task']
      # sanity checks for task
      if task.nil?
        help()
        exit(0)
      end
      unless tasks.key?(task)
        $stderr.puts "Sorry, I don't know the task " + task
        exit 1
      end

      ##
      # now evaluate the rest of the options:
      # every option that is not a general one must belong to the chosen task
      opts.each_key { |opt|
        next if ['--help', '--task', '--expfile'].include?(opt)

        unless tasks[task].assoc(opt)
          $stderr.puts "Sorry, I don't know the option " + opt + " for task " + task
          exit 1
        end
      }

      experiment_filename = opts['--expfile']
      unless experiment_filename
        $stderr.puts "I need an experiment file name, option --expfile|-e"
        exit 1
      end

      ##
      # open config file
      exp = FredConfigData.new(experiment_filename)

      # sanity checks
      # \A and \z anchor the whole string; ^ and $ would accept any
      # multi-line value that merely contains one well-formed line.
      unless exp.get("experiment_ID") =~ /\A[A-Za-z0-9_]+\z/
        raise "Please choose an experiment ID consisting only of the letters A-Za-z0-9_."
      end

      # enduser mode?
      $ENDUSER_MODE = exp.get("enduser_mode")

      # set defaults
      unless exp.get("handle_multilabel")
        if exp.get("binary_classifiers")
          exp.set_entry("handle_multilabel", "binarize")
        else
          exp.set_entry("handle_multilabel", "repeat")
        end
      end
      # sanity check: if we're using option 'binarize' for handling items
      # with multiple labels, we have to have binary classifiers
      if exp.get("handle_multilabel") == "binarize" && !exp.get("binary_classifiers")
        $stderr.puts "Error: cannot use 'handle_multilabel=binarize' with n-ary classifiers."
        exit(1)
      end
      unless exp.get("numerical_features")
        exp.set_entry("numerical_features", "bin")
      end

      [exp, opts]
    end

    ###
    # options_hash:
    #
    # GetoptLong only allows you to access options via each(),
    # not individually, and it only allows you to cycle through the options once.
    # So we re-code the options as a hash
    #
    # @param opts_obj [GetoptLong] option parser to read from
    # @return [Hash] option name => option argument
    def self.options_hash(opts_obj)
      opt_hash = {}

      opts_obj.each do |opt, arg|
        opt_hash[opt] = arg
      end

      opt_hash
    end

    # Prints the usage text to stderr.
    def self.help
      $stderr.puts "
Fred: FRamE Disambiguation System Version 0.3

Usage:
----------------

ruby fred.rb --help|-h
Gets you this text.


ruby fred.rb --task|-t featurize --expfile|-e <e> --dataset|-d <d>
[--append|-A]
Featurizes input data and stores it in feature files.
Feature files are stored in
<fred_directory>/<experiment_ID>/<train/test>/features
Enduser mode: dataset has to be test (preset as default), no --append.

--expfile <e> Use <e> as the experiment description and configuation file

--dataset <d> Set to featurize: <d> is either 'train' or 'test'
Accordingly, either the directory dir_train or dir_test (as
specified in the experiment file) is used to store the features

--append Do not overwrite previously computed features for this experiment.
Rather, append the new features to the old featurization files.
Default: overwrite

ruby fred.rb --task|-t split --expfile|-e <e> --logID|-i <i>
[--trainpercent|-r <r>]
Produces a new train/test split on the training data of the experiment.
Split logs are stored in <fred_directory>/<experiment_ID>/split/<splitlog ID>
Not available in enduser mode.

--expfile <e> Use <e> as the experiment description and configuation file

--logID <l> Use <l> as the ID for storing this new split

--trainpercent <r> Allocate <r> percent of the data as train,
and 100-<r> as test.
default: <r>=90

ruby fred.rb --task|-t train --expfile|-e <e>
[--logID|-i <i> ]
Train classifier(s) on the training data (or a split of it)
Classifiers are stored in
<fred_directory>/<experiment_ID>/classifiers/<classifier_name>
Not available in enduser mode.

--expfile <e> Use <e> as the experiment description and configuation file

--logID <l> Train not on the whole training data but
on the split with ID <l>

ruby fred.rb --task|-t test --expfile|-e <e>
[--logID|-i <i>] [--baseline|-b]
[--nooutput|-N]
Apply classifier(s) to the test data (or a split of the training data)
Classification results are stored in
<fred_directory>/<experiment_ID>/results/main or
<fred_directory>/<experiment_ID>/results/baseline for the baseline.
If you are using classifier combination, individual classification results
are stored in <fred_directory>/<experiment_ID>/results/<classifier_name>
System output (disambiguated text in SalsaTigerXML format) is stored in
<fred_directory>/<experiment_ID>/output/stxml
or <directory_output>, if that has been specified.

--expfile <e> Use <e> as the experiment description and configuation file

--logID <l> Test on a split of the training data with ID <l>

--baseline Compute the baseline: Always assign most frequent sense.
Default: use the trained classifiers

--nooutput Do not produce an output of the disambiguated test data
in SalsaTigerXML format. This is useful if you just want
to evaluate the system.
Default: output is produced.

ruby fred.rb --task|-t eval --expfile|-e <e>
[--logID|-i <i>] [--printLog|-l]
Evaluate the performance of Fred on the test data
(or on a split of the training data).
Evaluation file is written to <fred_directory>/<experiment_ID>/eval/eval
Not available in enduser mode.

--expfile <e> Use <e> as the experiment description and configuation file

--logID <l> Evaluate a split of the training data with ID <l>

--printLog Also print logfile detailing evaluation of every instance.
Log file is written to <fred_directory>/eval/log

"
    end
  end # class OptParser
end # module Fred