frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,176 @@
|
|
1
|
+
###########################
|
2
|
+
# DBWrapper:
|
3
|
+
# abstract class wrapping database interfaces,
|
4
|
+
# so we can have both an interface to an SQL server
|
5
|
+
# and an interface to SQLite in Shalmaneser
|
6
|
+
class DBWrapper
  # name of the table this object is bound to;
  # nil until initialize_temp_table() has been run on this object
  attr_reader :table_name

  ###
  # exp: RosyConfigData experiment file object
  def initialize(exp)
    # remember experiment file
    @exp = exp

    # handle of the open database:
    # nil here, please set to some other value in subclass initialization
    @database = nil

    # name of default table to access: none
    @table_name = nil
  end

  ###
  # close DB access
  #
  # Does nothing (and returns nil) if no database handle has been set,
  # so closing a wrapper whose subclass never opened a database
  # does not fail with a NoMethodError on nil.
  #
  # returns: whatever the close() method of the database handle returns
  def close()
    @database.close() if @database
  end

  ####
  # querying the database: abstract, to be overwritten in subclasses
  #
  # returns: a DBResult object
  # raises:  RuntimeError in this base class
  def query(query)
    raise "Overwrite me"
  end

  ####
  # querying the database, no result value:
  # abstract, to be overwritten in subclasses
  #
  # raises: RuntimeError in this base class
  def query_noretv(query)
    raise "Overwrite me"
  end

  ###
  # list all tables in the database:
  # no default here, to be overwritten in subclasses
  #
  # returns: list of strings
  # raises:  RuntimeError in this base class
  def list_tables()
    raise "Overwrite me"
  end

  ###
  # make a table: abstract, to be overwritten in subclasses
  #
  # returns: nothing
  # raises:  RuntimeError in this base class
  def create_table(table_name,         # string
                   column_formats,     # array: array: string*string [column_name,column_format]
                   index_column_names, # array: string: column_name
                   indexname)          # string: name of automatically created index column
    raise "overwrite me"
  end

  ###
  # remove a table:
  # sends a DROP TABLE statement through query_noretv()
  #
  # NOTE(review): table_name is pasted into the SQL statement unescaped;
  # presumably it always comes from trusted code -- confirm against callers.
  def drop_table(table_name)
    query_noretv("DROP TABLE " + table_name)
  end

  ###
  # list all column names of a table
  # (derived from list_column_formats(), which has no default)
  #
  # returns: array of strings
  def list_column_names(table_name)
    return list_column_formats(table_name).map { |col_name, _col_format| col_name }
  end

  #####
  # list_column_formats
  #
  # list column names and column types of this table:
  # abstract, to be overwritten in subclasses
  #
  # returns: array:string*string, list of pairs [column name, column format]
  # raises:  RuntimeError in this base class
  def list_column_formats(table_name)
    raise "Overwrite me"
  end

  ####
  # num_rows
  #
  # determine the number of rows in a table:
  # abstract, to be overwritten in subclasses
  #
  # returns: integer
  # raises:  RuntimeError in this base class
  def num_rows(table_name)
    raise "Overwrite me"
  end

  ####
  # make a temporary table: basically just make a table
  #
  # The returned object is a clone of this one, so it refers to
  # the same @database handle; only its @table_name differs.
  #
  # returns: DBWrapper object (or object of current subclass)
  # that has the @table_name attribute set to the name of a temporary DB
  def make_temp_table(column_formats,     # array: string*string [column_name,column_format]
                      index_column_names, # array: string: column_name
                      indexname)          # string: name of autoincrement primary index
    temp_obj = self.clone()
    temp_obj.initialize_temp_table(column_formats, index_column_names, indexname)
    return temp_obj
  end

  ###
  # remove the table named by @table_name
  #
  # raises: RuntimeError if this object has no @table_name
  def drop_temp_table()
    unless @table_name
      raise "can only do drop_temp_table() for objects that have a temp table"
    end
    drop_table(@table_name)
  end

  ##############################
  protected

  ###
  # choose a table name ("t" followed by the digits of the current time
  # as a float, decimal point removed), remember it in @table_name,
  # and create the table.
  def initialize_temp_table(column_formats, index_column_names, indexname)
    @table_name = "t" + Time.new().to_f().to_s().delete(".")
    create_table(@table_name, column_formats, index_column_names, indexname)
  end
end
|
124
|
+
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
######################################################################
|
129
|
+
# DBResult:
|
130
|
+
# abstract class keeping query results
|
131
|
+
#
|
132
|
+
# instantiate for the DB package used
|
133
|
+
class DBResult
  ###
  # initialize with query result, and keep it
  #
  # value: the result object of the DB package used; this class calls
  #        num_rows, each, each_hash, free and fetch_row on it
  def initialize(value)
    @result = value
  end

  # column names: NO DEFAULT, to be overwritten in subclasses
  #
  # raises: RuntimeError in this base class
  def list_column_names()
    raise "Overwrite me"
  end

  # number of rows: returns an integer
  # (delegated to the wrapped result object)
  def num_rows()
    return @result.num_rows
  end

  # yields each row as an array of values
  #
  # Without a block, returns an enumerator over the rows
  # instead of failing with a LocalJumpError on the first row.
  def each()
    return enum_for(:each) unless block_given?
    @result.each { |row| yield row }
  end

  # yields each row as a hash: column name=> column value
  #
  # Without a block, returns an enumerator over the row hashes
  # instead of failing with a LocalJumpError on the first row.
  def each_hash()
    return enum_for(:each_hash) unless block_given?
    @result.each_hash { |row_hash| yield row_hash }
  end

  # reset object, such that each() can be run again
  # DEFAULT DOES NOTHING, PLEASE OVERWRITE
  def reset()
  end

  # free result object
  # (delegated to the wrapped result object)
  def free()
    @result.free()
  end

  # returns row as an array of column contents
  # (delegated to the wrapped result object)
  def fetch_row()
    return @result.fetch_row()
  end
end
|
176
|
+
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# ExternalConfigData
|
2
|
+
# Katrin Erk January 2006
|
3
|
+
#
|
4
|
+
# All scripts that compute additional external knowledge sources
|
5
|
+
# for Fred and Rosy:
|
6
|
+
# access to configuration and experiment description file
|
7
|
+
|
8
|
+
require 'common/ConfigData'
|
9
|
+
|
10
|
+
##############################
|
11
|
+
# Class ExternalConfigData
|
12
|
+
#
|
13
|
+
# inherits from ConfigData,
|
14
|
+
# sets variable names appropriate to tasks of external knowledge sources
|
15
|
+
|
16
|
+
class ExternalConfigData < ConfigData
  ###
  # filename: name of the config file, handed on to ConfigData
  #
  # Declares the features known to this kind of experiment file
  # together with their types, declares no variables,
  # and registers the access function for the one list feature.
  def initialize(filename)
    # feature name => feature type
    feature_types = {
      "directory" => "string",
      "experiment_id" => "string",
      "gfmap_restrict_to_downpath" => "bool",
      "gfmap_restrict_pathlen" => "integer",
      "gfmap_remove_gf" => "list"
    }

    # no variables
    variable_names = []

    # initialize config data object
    super(filename, feature_types, variable_names)

    # set access functions for list features
    stringlist_access = method("access_as_stringlist")
    set_list_feature_access("gfmap_remove_gf", stringlist_access)
  end

  ###
  protected

  #####
  # access_as_stringlist
  #
  # assumed format of an entry in the experiment file:
  #
  # lhs = rhs1 rhs2 ... rhsN
  #
  # given in val_list as string tuples [rhs1,...,rhsN]
  #
  # Each tuple is joined by single spaces into "rhs1 rhs2 ... rhsN".
  #
  # returns: array of strings, one joined string per tuple in val_list
  def access_as_stringlist(val_list) # array:array:string
    joined = val_list.map do |rhs_tuple|
      rhs_tuple.join(" ")
    end
    return joined
  end
end
|
56
|
+
|
57
|
+
|
58
|
+
|
@@ -0,0 +1,130 @@
|
|
1
|
+
# Failed Parses
|
2
|
+
#
|
3
|
+
# SP May 05
|
4
|
+
#
|
5
|
+
# Administration of information about failed parses;
|
6
|
+
# - sentence ID
|
7
|
+
# - frame
|
8
|
+
# - missed FE markables
|
9
|
+
#
|
10
|
+
# this class is pretty much a glorified hash table with methods to
|
11
|
+
# - read FailedParses from a file and to write them to a file
|
12
|
+
# - access info in a frame-specific way
|
13
|
+
|
14
|
+
class FailedParses

  ###
  # initialize
  #
  # starts out with an empty list of failed parses
  def initialize()
    # list of entries [sent_id, frame, target, target_pos, fe_list]
    @failed_parses = []
  end

  ###
  # register
  #
  # register new failed parse by specifying
  # - its sentence id (any object)
  # - its frame (String)
  # - its target and the target's part of speech
  # - its FE list (String Array)
  #
  # A sentence ID that has been registered before is NOT skipped:
  # every call adds one more entry.
  #
  # returns: the internal list of entries
  def register(sent_id,    # object
               frame,      # string: frame name
               target,     # string?
               target_pos, # string: target POS
               fe_list)    # array:string
    @failed_parses << [sent_id, frame, target, target_pos, fe_list]
  end

  ###
  # make_split
  #
  # produce a random "split" of the failed parses into a train and a test section
  # parameter: train_percentage, Integer between 0 and 100
  #
  # Each entry goes to the train section with probability
  # train_percentage/100, so 0 puts everything into test
  # and 100 puts everything into train.
  #
  # returns an Array with two FailedParses objects, the first for the
  # train data, the second for the test data
  #
  # raises: RuntimeError if train_percentage is not an Integer in 0..100
  def make_split(train_percentage)
    # is_a? rather than a comparison of classes: on Rubies where
    # integers are direct instances of Integer, "Integer < Integer" is false
    unless train_percentage.is_a?(Integer) && train_percentage.between?(0, 100)
      raise "Need Integer between 0 and 100 as training percentage."
    end

    train_failed = FailedParses.new()
    test_failed = FailedParses.new()
    @failed_parses.each { |sent_id, frame, target, target_pos, fe_list|
      # rand(100) is one of 0..99, so exactly train_percentage
      # of the 100 possible values are below train_percentage
      section = rand(100) < train_percentage ? train_failed : test_failed
      section.register(sent_id, frame, target, target_pos, fe_list)
    }
    return [train_failed, test_failed]
  end

  ###
  # Access information
  #
  # failed_sent: number of failed sentences
  # failed_fes:  Hash that maps FE names [String] onto numbers of failed FEs [Int]
  #              (unknown FE names map to 0)
  #
  # optional parameters: frame, target, target_pos : if not specified or nil, marginal
  # frequencies are counted (sum over all values)

  def failed_sent(frame_spec=nil, target_spec=nil, target_pos_spec=nil)
    return @failed_parses.count { |_sent_id, frame, target, target_pos, _fe_list|
      entry_selected?(frame, target, target_pos,
                      frame_spec, target_spec, target_pos_spec)
    }
  end

  def failed_fes(frame_spec=nil, target_spec=nil, target_pos_spec=nil)
    fe_hash = Hash.new(0)
    @failed_parses.each { |_sent_id, frame, target, target_pos, fe_list|
      next unless entry_selected?(frame, target, target_pos,
                                  frame_spec, target_spec, target_pos_spec)
      fe_list.each { |fe_label| fe_hash[fe_label] += 1 }
    }
    return fe_hash
  end

  ###
  # Marshalling:
  #
  # save - save info about failed parses to file
  # load - load info about failed parses from file
  #
  # Both return nil.

  def save(filename)
    # binary mode, since Marshal data is not text;
    # the block form closes the file even if dumping fails
    File.open(filename, "wb") { |io_obj|
      Marshal.dump(@failed_parses, io_obj)
    }
    return nil
  end

  # If the file cannot be read or unmarshalled, a warning is printed
  # to stderr and the current contents of this object are kept.
  #
  # NOTE(review): Marshal.load can instantiate arbitrary objects;
  # only use this on files written by save(), never on untrusted input.
  def load(filename)
    begin
      File.open(filename, "rb") { |io_obj|
        @failed_parses = Marshal.load(io_obj)
      }
    rescue StandardError
      $stderr.puts "WARNING: couldn't read failed parses file #{filename}."
      $stderr.puts "I'll assume that there are no failed parses."
    end
    return nil
  end

  ###
  private

  # true if an entry with the given frame, target and target POS
  # is selected by the three specifications; a nil specification selects everything
  def entry_selected?(frame, target, target_pos,
                      frame_spec, target_spec, target_pos_spec)
    (frame_spec.nil? || frame_spec == frame) &&
      (target_spec.nil? || target_spec == target) &&
      (target_pos_spec.nil? || target_pos_spec == target_pos)
  end
end
|
@@ -0,0 +1,242 @@
|
|
1
|
+
require 'common/ruby_class_extensions'
|
2
|
+
|
3
|
+
class RosyFeatureInfo
  ###
  # class variable:
  # list of all known extractors (classes)
  # add to it using add_feature()
  @@extractors = Array.new

  # boolean. set to true after warning messages have been given once
  @@warned = false

  ###
  # add interface/interpreter
  #
  # class_name: Class object of a feature extractor.
  # This class calls designator(), feature_type(), feature_names(),
  # sql_type(), info(), phase(), is_computable() and new() on it.
  def RosyFeatureInfo.add_feature(class_name) # Class object
    @@extractors << class_name
  end

  ###
  # exp: experiment file object; get_lf("feature") is read from it
  #
  # Determines the extractors that are in use (user-chosen plus
  # system-required ones), drops those that cannot be computed,
  # and makes the list of features they provide.
  # Warnings are only printed for the first object that is made.
  def initialize(exp)
    @exp = exp

    ##
    # list of extractors that are
    # either required by the user
    # or needed by the system,
    # as hashes "extractor" -> extractor class,
    #           "step"      -> argrec, arglab, onestep, dontuse, or nil
    @current_extractors = Array.new

    add_user_extractors(exp)
    add_system_extractors()
    remove_uncomputable_extractors()

    @features = make_feature_list()

    # do not print warnings again if another RosyFeatureInfo object is made
    @@warned = true
  end

  ###
  # get_column_formats
  #
  # returns a list of pairs [feature_name(string), sql_column_format(string)]:
  # all features to be computed, with their SQL column formats
  #
  # phase: if given, only features of that phase are returned
  def get_column_formats(phase = nil) # string: phase 1 or phase 2
    return features_of_phase(phase).map { |feature_descr|
      [feature_descr["feature_name"], feature_descr["sql_type"]]
    }
  end

  ###
  # get_column_names
  #
  # returns a list of feature names (strings)
  # all features to be computed
  #
  # phase: if given, only features of that phase are returned
  def get_column_names(phase = nil) # string: phase 1 or phase 2
    return features_of_phase(phase).map { |feature_descr|
      feature_descr["feature_name"]
    }
  end

  ###
  # get_index_columns
  #
  # returns a list of feature (column) names as Strings
  # consisting of all features of extractors in use
  # whose info() includes "index"
  def get_index_columns()
    return @features.select { |feature_descr|
      feature_descr["is_index"]
    }.map { |feature_descr|
      feature_descr["feature_name"]
    }
  end

  ###
  # get_model_features
  #
  # returns a list of feature (column) names as strings
  # consisting of all the features to be used for the modeling
  #
  # step: argrec, arglab, onestep
  def get_model_features(step)
    return @features.select { |feature_descr|
      # features for the current step
      # feature_descr["step"] is argrec, arglab, onestep, dontuse, or nil
      # nil matches all steps
      # 'dontuse' matches no step, so these features will never be returned here
      feature_descr["step"].nil? || feature_descr["step"] == step
    }.reject { |feature_descr|
      # leave out admin features and the gold label
      ["admin", "gold"].include? feature_descr["type"]
    }.map { |feature_descr|
      # use just the names of the features
      feature_descr["feature_name"]
    }
  end

  ###
  # get_extractor_objects
  #
  # returns two lists of feature extractor objects,
  # covering all features of the given phase:
  # the first list contains the extractors whose class info() includes "rosy",
  # the second list contains the others.
  #
  # The split is done with distribute(), which is not a core Array method;
  # presumably it comes from common/ruby_class_extensions.
  #
  # raises: RuntimeError if phase is neither "phase 1" nor "phase 2"
  def get_extractor_objects(phase,             # string: "phase 1" or "phase 2"
                            interpreter_class) # SynInterpreter class
    unless ["phase 1", "phase 2"].include? phase
      # interpolation rather than "+", so that a nil or non-string phase
      # yields this error rather than a TypeError from the concatenation
      raise "Shouldn't be here: #{phase}"
    end

    return @current_extractors.select { |descr|
      # select extractors of the right phase
      descr["extractor"].phase() == phase
    }.map { |descr|
      # make objects from extractor classes
      descr["extractor"].new(@exp, interpreter_class)
    }.distribute { |extractor_obj|
      # distribute extractors in two bins:
      # first, rosy extractors
      # second, others
      extractor_obj.class.info().include? "rosy"
    }
  end

  ###
  private

  ###
  # print a warning to stderr,
  # unless warnings have been given by an earlier object already
  def warn_once(message)
    $stderr.puts message unless @@warned
  end

  ###
  # user-chosen extractors:
  # exp.get_lf("feature") returns an array of pairs
  # [feature group designator(string), options(array:string)].
  # Pairs for which no registered extractor has that designator are skipped.
  def add_user_extractors(exp)
    exp.get_lf("feature").each { |extractor_name, options|
      extractor = @@extractors.detect { |e| e.designator() == extractor_name }
      unless extractor
        # no extractor found matching the given designator
        warn_once("Warning: Could not find a feature extractor for #{extractor_name}: skipping.")
        next
      end

      @current_extractors << {
        "extractor" => extractor,
        "step" => read_step_option(extractor_name, options)
      }
    }
  end

  ###
  # read and check the options of one user-chosen feature
  #
  # returns: the step option among the options, or nil if there is none.
  # Unknown options are skipped with a warning.
  # More than one step option terminates the program with exit status 1.
  def read_step_option(extractor_name, options)
    step = nil

    options.each { |option|
      case option
      when "dontuse", "argrec", "arglab", "onestep"
        if step
          # step has already been set
          $stderr.puts "ERROR in feature #{extractor_name}: Please set only one of the options dontuse, argrec, arglab, onestep"
          exit 1
        end
        step = option
      else
        warn_once("Warning: Unknown option for feature #{extractor_name}: #{option}. Skipping")
      end
    }

    return step
  end

  ###
  # extractors needed by the system: admin features and gold feature
  def add_system_extractors()
    @@extractors.select { |e|
      ["admin", "gold"].include? e.feature_type()
    }.each { |extractor|
      # if we have already added that extractor, remove it
      # and add it with our own options
      @current_extractors.delete_if { |descr|
        descr["extractor"].designator() == extractor.designator()
      }

      @current_extractors << {
        "extractor" => extractor,
        "step" => "dontuse"
      }
    }
  end

  ###
  # make sure that all extractors are computable in the current model
  # (i.e. check dependencies): remove those that are not.
  #
  # An extractor is checked against the designators of all extractors
  # that are used in its step; extractors without a step are used in every step.
  # These designator lists are made once, before anything is removed.
  def remove_uncomputable_extractors()
    # step (nil for "no step") => designators of extractors used in that step
    available = Hash.new
    [nil, "argrec", "arglab", "onestep"].each { |step|
      available[step] = @current_extractors.select { |descr|
        descr["step"].nil? || descr["step"] == step
      }.map { |descr| descr["extractor"].designator() }
    }

    @current_extractors.delete_if { |descr|
      step = descr["step"]

      # "dontuse": either an admin feature or a user feature
      # not to be used this time, never checked
      computable = (step == "dontuse") ||
                   (available.key?(step) && descr["extractor"].is_computable(available[step]))

      unless computable
        warn_once("Warning: Feature extractor #{descr["extractor"].designator()} cannot be computed: skipping.")
      end

      # true, i.e. delete, if not computable
      !computable
    }
  end

  ###
  # make list of all features as hashes
  # "feature_name" -> string,
  # "sql_type" -> string,
  # "is_index" -> boolean,
  # "step" -> string: argrec, arglab, onestep, dontuse, or nil
  # "type" -> string
  # "phase" -> string: phase 1 or phase 2
  def make_feature_list()
    features = Array.new
    @current_extractors.each { |descr|
      extractor = descr["extractor"]
      extractor.feature_names.each { |feature_name|
        features << {
          "feature_name" => feature_name,
          "sql_type" => extractor.sql_type(),
          "is_index" => extractor.info().include?("index"),
          "step" => descr["step"],
          "type" => extractor.feature_type(),
          "phase" => extractor.phase()
        }
      }
    }
    return features
  end

  ###
  # feature descriptions of the given phase, all of them if phase is nil
  def features_of_phase(phase)
    return @features.select { |feature_descr|
      phase.nil? || feature_descr["phase"] == phase
    }
  end
end
|