shalmaneser-lib 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/configuration/config_data.rb +457 -0
- data/lib/configuration/config_format_element.rb +210 -0
- data/lib/configuration/configuration_error.rb +15 -0
- data/lib/configuration/external_config_data.rb +56 -0
- data/lib/configuration/frappe_config_data.rb +134 -0
- data/lib/configuration/fred_config_data.rb +199 -0
- data/lib/configuration/rosy_config_data.rb +126 -0
- data/lib/db/db_interface.rb +50 -0
- data/lib/db/db_mysql.rb +141 -0
- data/lib/db/db_sqlite.rb +280 -0
- data/lib/db/db_table.rb +237 -0
- data/lib/db/db_view.rb +416 -0
- data/lib/db/db_wrapper.rb +175 -0
- data/lib/db/select_table_and_columns.rb +10 -0
- data/lib/db/sql_query.rb +243 -0
- data/lib/definitions.rb +19 -0
- data/lib/eval.rb +482 -0
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/external_systems.rb +251 -0
- data/lib/framenet_format/fn_corpus_aset.rb +209 -0
- data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
- data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
- data/lib/framenet_format/fn_database.rb +143 -0
- data/lib/framenet_format/frame_xml_file.rb +104 -0
- data/lib/framenet_format/frame_xml_sentence.rb +411 -0
- data/lib/logging.rb +25 -0
- data/lib/ml/classifier.rb +189 -0
- data/lib/ml/mallet.rb +236 -0
- data/lib/ml/maxent.rb +229 -0
- data/lib/ml/optimize.rb +195 -0
- data/lib/ml/timbl.rb +140 -0
- data/lib/monkey_patching/array.rb +82 -0
- data/lib/monkey_patching/enumerable_bool.rb +24 -0
- data/lib/monkey_patching/enumerable_distribute.rb +18 -0
- data/lib/monkey_patching/file.rb +131 -0
- data/lib/monkey_patching/subsumed.rb +24 -0
- data/lib/ruby_class_extensions.rb +4 -0
- data/lib/salsa_tiger_xml/corpus.rb +24 -0
- data/lib/salsa_tiger_xml/fe_node.rb +98 -0
- data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
- data/lib/salsa_tiger_xml/frame_node.rb +145 -0
- data/lib/salsa_tiger_xml/graph_node.rb +347 -0
- data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
- data/lib/salsa_tiger_xml/sem_node.rb +58 -0
- data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
- data/lib/salsa_tiger_xml/syn_node.rb +169 -0
- data/lib/salsa_tiger_xml/tree_node.rb +59 -0
- data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
- data/lib/salsa_tiger_xml/usp_node.rb +72 -0
- data/lib/salsa_tiger_xml/xml_node.rb +163 -0
- data/lib/shalmaneser/lib.rb +1 -0
- data/lib/tabular_format/fn_tab_format_file.rb +38 -0
- data/lib/tabular_format/fn_tab_frame.rb +67 -0
- data/lib/tabular_format/fn_tab_sentence.rb +169 -0
- data/lib/tabular_format/tab_format_file.rb +91 -0
- data/lib/tabular_format/tab_format_named_args.rb +184 -0
- data/lib/tabular_format/tab_format_sentence.rb +119 -0
- data/lib/value_restriction.rb +49 -0
- metadata +131 -0
@@ -0,0 +1,210 @@
|
|
1
|
+
##############################
|
2
|
+
# ConfigFormatElement is an auxiliary class
|
3
|
+
# of ConfigData.
|
4
|
+
# It keeps track of feature patterns with variables in them
|
5
|
+
# that can be instantiated.
|
6
|
+
# @author Andrei Beliankou
|
7
|
+
#
|
8
|
+
|
9
|
+
require_relative 'configuration_error'
|
10
|
+
|
11
|
+
module Shalmaneser
|
12
|
+
module Configuration
|
13
|
+
class ConfigFormatElement
|
14
|
+
|
15
|
+
# Parses a feature-name pattern into fixed parts and variable references.
#
# A variable reference is written as <name> inside +string+. The parsed
# pattern is stored in @pattern as an array of pairs:
#   [text, "string"]    for a fixed part
#   [name, "variable"]  for a variable reference
# Finally a regular expression for the pattern is built with make_regexp
# and stored in @regexp.
#
# @param string [String] feature name, may include variable names in <>
# @param variables [Array<String>] variable names that can occur
# @raise [ConfigurationError] for a "<" inside a variable reference,
#   a ">" outside of one, an unclosed "<", or a variable name that is
#   not contained in +variables+
def initialize(string, variables)
  @variables = variables
  @pattern = []

  # true while we are between "<" and ">"
  inside_variable = false
  # characters collected for the part currently being read
  buffer = +""

  string.each_char do |char|
    if inside_variable
      case char
      when "<"
        raise ConfigurationError, "Duplicate < in #{string}."
      when ">"
        # end of a variable reference: the name must be a known variable
        unless @variables.include?(buffer)
          raise ConfigurationError, "Unknown variable #{buffer}."
        end
        @pattern << [buffer, "variable"]
        buffer = +""
        inside_variable = false
      else
        buffer << char
      end
    elsif char == "<"
      # start of a variable reference: flush the fixed part read so far
      unless buffer.empty?
        @pattern << [buffer, "string"]
        buffer = +""
      end
      inside_variable = true
    elsif char == ">"
      raise ConfigurationError, "Unexpected > in #{string}."
    else
      buffer << char
    end
  end

  # the whole string has been read, so no variable reference may be open
  raise ConfigurationError, "Unclosed < in #{string}." if inside_variable

  # record a trailing fixed part, if any
  @pattern << [buffer, "string"] unless buffer.empty?

  # regexp for matching strings against this pattern
  @regexp = make_regexp(@pattern)
end
|
86
|
+
|
87
|
+
# Instantiates @pattern: fixed parts are copied as they are, every variable
# is replaced by the value stored for its name in +var_hash+, and the
# pieces are concatenated.
#
# @param var_hash [Hash] variable name (string) => variable value (string)
# @return [String] the instantiated pattern
# @raise [ConfigurationError] if +var_hash+ has no (non-nil) value for a
#   variable of the pattern, or if @pattern contains an entry that is
#   neither "string" nor "variable"
def instantiate(var_hash)
  pieces = []

  @pattern.each do |text, kind|
    if kind == "string"
      pieces << text
    elsif kind == "variable"
      value = var_hash[text]
      if value.nil?
        raise ConfigurationError, "Missing variable instantiation: #{text}."
      end
      pieces << value
    else
      raise ConfigurationError, "Shouldn't be here!"
    end
  end

  pieces.join
end
|
108
|
+
|
109
|
+
# Tries to match +string+ against @pattern.
#
# If +fillers+ is given, a regular expression is built in which the
# variables named there are fixed to the given values; otherwise the
# precomputed @regexp is used. Every variable that is not covered by
# +fillers+ is read from the capture groups of the match, in the order
# in which the variables occur in @pattern.
#
# @param string [String] the string to match
# @param fillers [Hash, nil] variable name (string) => value (string)
# @return [Hash, false] on success a hash variable name => value holding
#   the given fillers plus the values of all other variables of the
#   pattern; false if the string does not match, or if a variable that
#   occurs several times would get two different values
# @raise [ConfigurationError] if a capture group expected for a variable
#   is missing from the match
def match(string, fillers = nil)
  # have we been given partial info about variables?
  regexp = fillers ? make_regexp(@pattern, fillers) : @regexp
  matched = regexp.match(string)
  return false if matched.nil?

  # bindings: variable name(string) => value(string), seeded with the fillers
  bindings = {}
  fillers.each_pair { |name, val| bindings[name] = val } if fillers

  # variables of the pattern that the fillers did not provide a value for
  unfilled = @pattern.select do |name, kind|
    kind == "variable" && (fillers.nil? || fillers[name].nil?)
  end

  # capture groups are numbered from 1, one group per unfilled variable
  unfilled.each.with_index(1) do |(name, _kind), group|
    captured = matched[group]
    if captured.nil?
      raise ConfigurationError, "Match, but not enough matched elements? Strange."
    end

    if bindings[name].nil?
      bindings[name] = captured
    elsif bindings[name] != captured
      # same variable occurs again with a different value: no match
      return false
    end
  end

  bindings
end
|
173
|
+
|
174
|
+
# Lists the variables referenced by @pattern, in order of occurrence.
# A variable that occurs several times is listed several times.
#
# @return [Array<String>] names of the variables used in @pattern
def used_variables
  @pattern.each_with_object([]) do |(name, kind), names|
    names << name if kind == "variable"
  end
end
|
182
|
+
|
183
|
+
####################
|
184
|
+
private
|
185
|
+
|
186
|
+
# Builds a regular expression from a pattern together with optional
# variable fillers.
#
# Fixed parts and filled variables are matched literally (escaped);
# every variable without a filler becomes one capture group "(.+)".
#
# NOTE(review): the expression is anchored with ^ and $, which match at
# line boundaries; \A and \z would anchor to the whole string. Left as is
# because callers may rely on the current behaviour.
#
# @param [Array] pattern An array of pairs [string, "string"] or [string, "variable"]
# @param [Hash] fillers A Hash variable name(string) => value(string)
# @return [Regexp] object
# @raise [ConfigurationError] if a pair is neither "string" nor "variable"
def make_regexp(pattern, fillers = nil)
  source = pattern.map do |item, string_or_var|
    case string_or_var
    when "variable"
      fillers && fillers[item] ? Regexp.escape(fillers[item]) : '(.+)'
    when "string"
      Regexp.escape(item)
    else
      # The constant used to be misspelled here, which turned this
      # branch into a NameError instead of a ConfigurationError.
      raise ConfigurationError, "Shouldn't be here"
    end
  end.join

  Regexp.new("^#{source}$")
end
|
208
|
+
end
|
209
|
+
end
|
210
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Shalmaneser
  module Configuration
    # Error raised for problems with configuration data.
    class ConfigurationError < StandardError
      # Builds the exception message. When a nested exception is given,
      # its class and message are put on a line in front of +msg+.
      #
      # @param [String] msg A custom message for this exception.
      # @param [Exception] nested_exception An external exception
      #   which is reused to provide more information.
      def initialize(msg = nil, nested_exception = nil)
        text =
          if nested_exception
            "#{nested_exception.class}: #{nested_exception.message}\n#{msg}"
          else
            msg
          end

        super(text)
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# ExternalConfigData
|
2
|
+
# Katrin Erk January 2006
|
3
|
+
#
|
4
|
+
# All scripts that compute additional external knowledge sources
|
5
|
+
# for Fred and Rosy:
|
6
|
+
# access to configuration and experiment description file
|
7
|
+
|
8
|
+
require_relative 'config_data'
|
9
|
+
|
10
|
+
##############################
|
11
|
+
# Class ExternalConfigData
|
12
|
+
#
|
13
|
+
# inherits from ConfigData,
|
14
|
+
# sets variable names appropriate to tasks of external knowledge sources
|
15
|
+
module Shalmaneser
|
16
|
+
module Configuration
|
17
|
+
class ExternalConfigData < ConfigData
|
18
|
+
# Sets up the configuration object for external knowledge sources.
#
# Hands the config file name, the feature definitions and an empty list
# of variables to the superclass constructor, then registers
# access_as_stringlist as access function for the list feature
# "gfmap_remove_gf".
#
# @param filename [String] config file
def initialize(filename)
  # feature name => feature type
  feature_types = {
    "directory" => "string",

    "experiment_id" => "string",

    "gfmap_restrict_to_downpath" => "bool",
    "gfmap_restrict_pathlen" => "integer",
    "gfmap_remove_gf" => "list"
  }
  # no variables are used in feature names here
  variables = []

  super(filename, feature_types, variables)

  # set access functions for list features
  stringlist_access = method("access_as_stringlist")
  set_list_feature_access("gfmap_remove_gf", stringlist_access)
end
|
36
|
+
|
37
|
+
###
|
38
|
+
protected
|
39
|
+
#####
# access_as_stringlist
#
# assumed format of each entry in the config file:
#
# lhs = rhs1 rhs2 ... rhsN
#
# Each entry arrives in val_list as a string tuple [rhs1,...,rhsN].
# The strings of every tuple are joined by single spaces.
#
# @param val_list [Array<Array<String>>] one string tuple per entry
# @return [Array<String>] one string "rhs1 rhs2 ... rhsN" per tuple
def access_as_stringlist(val_list)
  val_list.each_with_object([]) do |rhs, joined|
    joined << rhs.join(" ")
  end
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
# FrappeConfigData
|
2
|
+
# Katrin Erk July 05
|
3
|
+
#
|
4
|
+
# Preprocessing for Fred and Rosy:
|
5
|
+
# access to a configuration and experiment description file
|
6
|
+
|
7
|
+
require_relative 'config_data'
|
8
|
+
|
9
|
+
##############################
|
10
|
+
# Class FrappeConfigData
|
11
|
+
#
|
12
|
+
# inherits from ConfigData,
|
13
|
+
# sets variable names appropriate to preprocessing task
|
14
|
+
module Shalmaneser
|
15
|
+
module Configuration
|
16
|
+
class FrappeConfigData < ConfigData
|
17
|
+
# Values accepted for "encoding"; the nil entry lets a missing value pass.
VALID_ENCODINGS = %w[hex iso utf8].push(nil)

# Values accepted for "format".
VALID_INPUT_FORMATS = %w[Plain SalsaTab FNXml FNCorpusXml SalsaTigerXML]

# Parameter name => parameter type for the preprocessing experiment file.
CONFIG_DEFS = {
  'prep_experiment_ID' => 'string',     # experiment identifier
  'frprep_directory' => 'string',       # dir for frprep internal data

  # --- information about the dataset ---
  'language' => 'string',               # en, de
  'origin' => 'string',                 # FrameNet, Salsa, or nothing
  'format' => 'string',                 # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
  'encoding' => 'string',               # utf8, iso, hex, or nothing

  # --- directories ---
  'directory_input' => 'string',        # dir with input data
  'directory_preprocessed' => 'string', # dir with output Salsa/Tiger XML data
  'directory_parserout' => 'string',    # dir with parser output for the parser named below

  # --- syntactic processing ---
  'pos_tagger' => 'string',             # name of POS tagger
  'lemmatizer' => 'string',             # name of lemmatizer
  'parser' => 'string',                 # name of parser
  'pos_tagger_path' => 'string',        # path to POS tagger
  'lemmatizer_path' => 'string',        # path to lemmatizer
  'parser_path' => 'string',            # path to parser
  'parser_max_sent_num' => 'integer',   # max number of sentences per parser input file
  'parser_max_sent_len' => 'integer',   # max sentence length the parser handles

  'do_parse' => 'bool',                 # use parser?
  'do_lemmatize' => 'bool',             # use lemmatizer?
  'do_postag' => 'bool',                # use POS tagger?

  # --- output format ---
  # if tabformat_output == true, output in Tab format rather than
  # Salsa/Tiger XML (this will not work if do_parse == true)
  'tabformat_output' => 'bool',

  # --- syntactic repairs, dependent on existing semantic role annotation ---
  'fe_syn_repair' => 'bool',            # map words to constituents for FEs: idealize?
  'fe_rel_repair' => 'bool'             # FEs: include non-included relative clauses into FEs
}
|
56
|
+
|
57
|
+
# Reads the experiment file through the superclass constructor, using
# CONFIG_DEFS as parameter definitions and no variables, and validates
# the values afterwards.
#
# @param filename [String] path to a config file
# @raise [ConfigurationError] if validation finds a problem
def initialize(filename)
  no_variables = []
  super(filename, CONFIG_DEFS, no_variables)

  validate
end
|
64
|
+
|
65
|
+
# Shall we convert our input files into the target encoding?
# That is the case for every "encoding" value other than 'utf8',
# including a missing one.
#
# @return [True, False]
def convert_encoding?
  configured = get('encoding')
  !(configured == 'utf8')
end
|
70
|
+
|
71
|
+
private
|
72
|
+
|
73
|
+
# Validates semantically the input values from the experiment file.
#
# All problems are collected first and reported together, one message
# per line, in a single exception.
#
# @todo Rework the whole validation engine, the parameter definitions
#   should entail the information about: optional, obligatory,
#   in combination with. This information should be stored in external
#   resource files to easily change them.
# @return [nil] if no problem was found
# @raise [ConfigurationError] if at least one check fails
def validate
  msg = []

  # obligatory directories
  unless get('frprep_directory')
    msg << 'Please set <frprep_directory>, the Frappe internal data '\
           'directory, in the experiment file.'
  end

  unless get('directory_input')
    msg << 'Please specify <directory_input> in the Frappe experiment file.'
  end

  unless get('directory_preprocessed')
    msg << 'Please specify <directory_preprocessed> in the experiment file.'
  end

  # sanity check: output in tab format will not work
  # if we also do a parse
  if get('tabformat_output') && get('do_parse')
    # the space before "parsed" was missing in the original message
    msg << 'Error: Cannot do Tab format output when the input text is being '\
           'parsed. Please set either <tabformat_output> or <do_parse> to false.'
  end

  # every enabled processing step needs both the tool name and its path
  if get('do_postag') && !(get('pos_tagger_path') && get('pos_tagger'))
    msg << 'POS Tagging: I need <pos_tagger> and <pos_tagger_path> '\
           'in the experiment file.'
  end

  if get('do_lemmatize') && !(get('lemmatizer_path') && get('lemmatizer'))
    msg << 'Lemmatization: I need <lemmatizer> and <lemmatizer_path> in the experiment file.'
  end

  if get('do_parse') && !(get('parser_path') && get('parser'))
    msg << 'Parsing: I need <parser> and <parser_path> in the experiment file.'
  end

  unless VALID_ENCODINGS.include?(get('encoding'))
    # compact: the nil entry of VALID_ENCODINGS would be printed as "<>"
    msg << 'Please define a correct encoding in the configuration file: '\
           "<#{VALID_ENCODINGS.compact.join('>, <')}>!"
  end

  unless VALID_INPUT_FORMATS.include?(get('format'))
    msg << 'Please define a correct input format in the configuration file: '\
           "<#{VALID_INPUT_FORMATS.join('>, <')}>!"
  end

  # \A and \z anchor the whole value; ^ and $ only anchor a line, so a
  # multi-line value with an alphanumeric first line used to be accepted
  unless get("prep_experiment_ID") =~ /\A[A-Za-z0-9_]+\z/
    msg << 'Please choose an alphanumeric experiment ID! '\
           "You provided: #{get('prep_experiment_ID')}"
  end

  raise(ConfigurationError, msg.join("\n")) if msg.any?
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
# FredConfigData
|
2
|
+
# Katrin Erk April 05
|
3
|
+
#
|
4
|
+
# Frame disambiguation system:
|
5
|
+
# access to a configuration and experiment description file
|
6
|
+
|
7
|
+
require_relative 'config_data'
|
8
|
+
require 'definitions'
|
9
|
+
require 'logging'
|
10
|
+
|
11
|
+
##############################
|
12
|
+
# Class FredConfigData
|
13
|
+
#
|
14
|
+
# inherits from ConfigData,
|
15
|
+
# sets variable names appropriate to WSD task
|
16
|
+
module Shalmaneser
|
17
|
+
module Configuration
|
18
|
+
class FredConfigData < ConfigData
|
19
|
+
# Task names known to Fred.
VALID_TASKS = %w[featurize refeaturize split test eval]

# Parameter name => parameter type for the Fred experiment file.
CONFIG_DEFS = {
  'experiment_ID' => 'string',              # experiment ID
  'preproc_descr_file_train' => 'string',   # path to preprocessing files
  'preproc_descr_file_test' => 'string',
  'directory_output' => 'string',           # path to Salsa/Tiger XML output directory

  # @todo Verbosity should be handled by the Logger and only via cmd switches.
  'verbose' => 'bool',                      # print diagnostic messages?
  'apply_to_all_known_targets' => 'bool',   # apply to all known targets rather than the ones with a frame?

  'fred_directory' => 'string',             # directory for internal info
  'classifier_dir' => 'string',             # write classifiers here

  'classifier' => 'list',                   # classifiers

  'dbtype' => 'string',                     # "mysql" or "sqlite"

  # DB access.
  # NOTE(review): the original comment read "sqlite only"; host/user/passwd
  # look like settings of a database server - confirm which dbtype uses them.
  'host' => 'string',
  'user' => 'string',
  'passwd' => 'string',
  'dbname' => 'string',

  # --- featurization info ---
  'feature' => 'list',                      # which features to use for the classifier?
  'binary_classifiers' => 'bool',           # make binary rather than n-ary classifiers?
  'negsense' => 'string',                   # binary classifier: negative sense is..?
  'numerical_features' => 'string',         # do what with numerical features?

  # what to do with items that have multiple senses?
  # 'binarize': binary classifiers, and consider positive
  #             if the sense is among the gold senses
  # 'join'    : make one joint sense
  # 'repeat'  : make multiple occurrences of the item, one sense per occ
  # 'keep'    : keep as separate labels
  #
  # multilabel: consider as assigned all labels
  # above a certain confidence threshold?
  'handle_multilabel' => 'string',
  'assignment_confidence_threshold' => 'float',

  # single-sentence context?
  'single_sent_context' => 'bool',

  # noncontiguous input? then we need access to a larger corpus
  'noncontiguous_input' => 'bool',
  'larger_corpus_dir' => 'string',
  'larger_corpus_format' => 'string',
  'larger_corpus_encoding' => 'string',

  # --- imported from PrepConfigData ---
  'do_postag' => 'bool',
  'do_lemmatize' => 'bool',
  'do_parse' => 'bool',
  'pos_tagger' => 'string',
  'lemmatizer' => 'string',
  'parser' => 'string',
  'directory_preprocessed' => 'string',
  'language' => 'string'
}
|
79
|
+
|
80
|
+
# Reads the experiment file through the superclass constructor with
# CONFIG_DEFS as parameter definitions, registers the access functions
# for the list features "classifier" and "feature", and validates.
#
# @param filename [String] name of the experiment file
def initialize(filename)
  # third argument of the superclass constructor
  # (presumably the variable names allowed in patterns - confirm in ConfigData)
  pattern_variables = ["train", "exp_ID"]
  super(filename, CONFIG_DEFS, pattern_variables)

  # set access functions for list features
  {
    "classifier" => "access_classifier",
    "feature" => "access_feature"
  }.each do |list_feature, accessor|
    set_list_feature_access(list_feature, method(accessor))
  end

  validate
end
|
87
|
+
|
88
|
+
###
|
89
|
+
# protected
|
90
|
+
|
91
|
+
#####
# access_feature
#
# access function for feature 'feature'
#
# assumed format in the config file:
#
# feature = context 50
# feature = context 2
# feature = syn
#
# i.e. the name of the feature type, optionally followed by a parameter.
# The same feature type may be listed more than once.
#
# @param val_list [Array<Array<String>>] tuples defined in the config file
#   for feature 'feature'; the first element of a tuple is the feature type
# @param feature [String, nil] feature type name
# @return [nil, true, Array]
#   with +feature+ given:
#   - nil if no tuple starts with +feature+
#   - true if tuples start with +feature+ but none has a second element
#   - otherwise the second element of each such tuple, converted with to_i
#   without +feature+:
#   - one tuple per entry of val_list: the feature type (string) followed
#     by its options converted with to_i
def access_feature(val_list, feature = nil)
  unless feature
    # no feature type asked for: return all features that have been set
    return val_list.map do |feature_name, *options|
      [feature_name, *options.map(&:to_i)]
    end
  end

  # first parameter of every tuple defined for this feature type
  parameters = val_list.each_with_object([]) do |entries, found|
    found << entries[1] if entries.first == feature
  end

  # feature not defined
  return nil if parameters.empty?
  # feature defined, but no parameters
  return true if parameters.compact.empty?

  # feature defined, and has values
  parameters.map(&:to_i)
end
|
153
|
+
|
154
|
+
#####
# access_classifier
#
# access function for feature 'classifier'
#
# The function is called with val_list, an array of string tuples, one
# tuple for each 'classifier' entry of the config file. The first string
# of a tuple is taken as the name, all remaining strings as its options.
#
# @param val_list [Array, nil] array:array:string: list of tuples defined
#   in the config file
# @return [Array] a list of pairs [name(string), options(array:string)];
#   empty if val_list is nil
def access_classifier(val_list)
  return [] if val_list.nil?

  val_list.map do |cl_descr_tuple|
    name = cl_descr_tuple.first
    options = cl_descr_tuple[1..-1]
    [name, options]
  end
end
|
184
|
+
|
185
|
+
private
|
186
|
+
|
187
|
+
# Validation hook called at the end of initialize.
# No check is active at the moment, so nothing is ever raised.
#
# @return [nil]
# @raise [ConfigurationError] if a problem has been recorded
def validate
  problems = []

  # Disabled check, kept for reference (it was commented out with
  # =begin/=end in the original):
  #
  #   unless VALID_TASKS.include?(get('encoding'))
  #     msg << 'Please define a correct encoding in the configuration file: '\
  #            "<#{VALID_ENCODINGS.join('>, <')}>!"
  #   end
  #
  # NOTE(review): it compares the 'encoding' value against VALID_TASKS and
  # prints VALID_ENCODINGS, which this class does not define - rework
  # before enabling.

  raise(ConfigurationError, problems.join("\n")) unless problems.empty?
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|