frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,176 @@
|
|
1
|
+
###########################
|
2
|
+
# DBWrapper:
|
3
|
+
# abstract class wrapping database interfaces,
|
4
|
+
# so we can have both an interface to an SQL server
|
5
|
+
# and an interface to SQLite in Shalmaneser
|
6
|
+
# Abstract base class for database back ends. Concrete subclasses supply
# the actual database handle and overwrite every method that raises
# "Overwrite me" here.
class DBWrapper
  # Name of the table this object is bound to. It is nil except on the
  # objects returned by make_temp_table.
  attr_reader :table_name

  # exp: experiment file object (a RosyConfigData object according to the
  #      original comment); it is only stored here.
  def initialize(exp)
    @exp = exp

    # Database handle: subclasses are expected to set this to an open
    # handle in their own initialization.
    @database = nil

    # No default table.
    @table_name = nil
  end

  # Close database access.
  #
  # Does nothing if no database handle has been set, so closing a wrapper
  # whose handle was never opened does not fail with a NoMethodError on nil.
  def close()
    @database.close() if @database
  end

  # Run a query that produces a result.
  # Abstract: the intended return value is a DBResult object.
  def query(query)
    raise "Overwrite me"
  end

  # Run a query that produces no result value.
  # Abstract.
  def query_noretv(query)
    raise "Overwrite me"
  end

  # List all tables in the database.
  # Abstract: the intended return value is a list of strings.
  def list_tables()
    raise "Overwrite me"
  end

  # Make a table.
  # Abstract: the intended return value is nothing.
  #
  # table_name:         string
  # column_formats:     array of pairs [column_name, column_format] (strings)
  # index_column_names: array of column names (strings)
  # indexname:          string, name of the automatically created index column
  def create_table(table_name, column_formats, index_column_names, indexname)
    raise "overwrite me"
  end

  # Remove a table by issuing a DROP TABLE statement through query_noretv.
  # NOTE(review): table_name is concatenated into the statement unescaped;
  # callers must not pass untrusted input.
  def drop_table(table_name)
    query_noretv("DROP TABLE " + table_name)
  end

  # List all column names of a table.
  # Derived from list_column_formats, which subclasses must provide.
  #
  # returns: array of strings
  def list_column_names(table_name)
    list_column_formats(table_name).map { |col_name, _col_format| col_name }
  end

  # List column names and column types of a table.
  # Abstract: the intended return value is a list of pairs
  # [column name, column format] (strings).
  def list_column_formats(table_name)
    raise "Overwrite me"
  end

  # Determine the number of rows in a table.
  # Abstract: the intended return value is an integer.
  def num_rows(table_name)
    raise "Overwrite me"
  end

  # Make a temporary table, which is simply a table with a generated name.
  #
  # column_formats:     array of pairs [column_name, column_format] (strings)
  # index_column_names: array of column names (strings)
  # indexname:          string, name of the autoincrement primary index
  #
  # returns: a clone of this object (so it has the class of the current
  # subclass) whose @table_name is set to the name of the new table.
  # The receiver itself is left unchanged.
  def make_temp_table(column_formats, index_column_names, indexname)
    temp_obj = self.clone()
    temp_obj.initialize_temp_table(column_formats, index_column_names, indexname)
    temp_obj
  end

  # Drop the temporary table this object is bound to.
  # Raises a RuntimeError if this object has no table name set.
  def drop_temp_table()
    unless @table_name
      raise "can only do drop_temp_table() for objects that have a temp table"
    end
    drop_table(@table_name)
  end

  ##############################
  protected

  # Generate a table name from the current time ("t" followed by the
  # digits of the float timestamp with the decimal point removed),
  # remember it in @table_name, and create the table.
  def initialize_temp_table(column_formats, index_column_names, indexname)
    @table_name = "t" + Time.new().to_f().to_s().gsub(/\./, "")
    create_table(@table_name, column_formats, index_column_names, indexname)
  end
end
|
124
|
+
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
######################################################################
|
129
|
+
# DBResult:
|
130
|
+
# abstract class keeping query results
|
131
|
+
#
|
132
|
+
# instantiate for the DB package used
|
133
|
+
# Abstract holder for the result of a database query.
# Every method except list_column_names and reset simply forwards to the
# wrapped result object; subclass it for the DB package actually used.
class DBResult
  # value: the query result object to wrap; it is kept as-is.
  def initialize(value)
    @result = value
  end

  # Column names of the result. No default implementation.
  def list_column_names
    raise "Overwrite me"
  end

  # Number of rows, as reported by the wrapped result object.
  def num_rows
    @result.num_rows
  end

  # Yields each row of the wrapped result to the given block
  # (per the original comment: as an array of values).
  def each
    @result.each do |row|
      yield row
    end
  end

  # Yields each row of the wrapped result to the given block
  # (per the original comment: as a hash column name => column value).
  def each_hash
    @result.each_hash do |row_hash|
      yield row_hash
    end
  end

  # Reset the object such that each() can be run again.
  # The default does nothing; subclasses should overwrite it.
  def reset
  end

  # Free the wrapped result object.
  def free
    @result.free()
  end

  # Fetch a row from the wrapped result object
  # (per the original comment: as an array of column contents).
  def fetch_row
    @result.fetch_row()
  end
end
|
176
|
+
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# ExternalConfigData
|
2
|
+
# Katrin Erk January 2006
|
3
|
+
#
|
4
|
+
# All scripts that compute additional external knowledge sources
|
5
|
+
# for Fred and Rosy:
|
6
|
+
# access to configuration and experiment description file
|
7
|
+
|
8
|
+
require 'common/ConfigData'
|
9
|
+
|
10
|
+
##############################
|
11
|
+
# Class ExternalConfigData
|
12
|
+
#
|
13
|
+
# inherits from ConfigData,
|
14
|
+
# sets variable names appropriate to tasks of external knowledge sources
|
15
|
+
|
16
|
+
# Configuration/experiment file access for the scripts that compute
# external knowledge sources. Specializes ConfigData with the feature
# names used by those scripts.
class ExternalConfigData < ConfigData
  # filename: the config file, handed on to ConfigData together with the
  #           feature table and an empty list of variables.
  def initialize(filename)
    # feature name => feature type
    feature_types = {
      "directory"                  => "string",

      "experiment_id"              => "string",

      "gfmap_restrict_to_downpath" => "bool",
      "gfmap_restrict_pathlen"     => "integer",
      "gfmap_remove_gf"            => "list"
    }
    variables = []

    super(filename, feature_types, variables)

    # list features need an access function: register ours for the
    # one list feature declared above
    set_list_feature_access("gfmap_remove_gf",
                            method("access_as_stringlist"))
  end

  ###
  protected

  # access_as_stringlist
  #
  # assumed format of a list feature line:
  #
  #   lhs = rhs1 rhs2 ... rhsN
  #
  # val_list holds one string tuple [rhs1,...,rhsN] per such entry
  # (array of arrays of strings).
  #
  # returns: an array with one string per entry, each being that entry's
  # rhs strings joined by single spaces: "rhs1 rhs2 ... rhsN"
  def access_as_stringlist(val_list)
    val_list.map do |rhs|
      rhs.join(" ")
    end
  end
end
|
56
|
+
|
57
|
+
|
58
|
+
|
@@ -0,0 +1,130 @@
|
|
1
|
+
# Failed Parses
|
2
|
+
#
|
3
|
+
# SP May 05
|
4
|
+
#
|
5
|
+
# Administration of information about failed parses;
|
6
|
+
# - sentence ID
|
7
|
+
# - frame
|
8
|
+
# - missed FE markables
|
9
|
+
#
|
10
|
+
# this class is pretty much a glorified hash table with methods to
|
11
|
+
# - read FailedParses from a file and to write them to a file
|
12
|
+
# - access info in a frame-specific way
|
13
|
+
|
14
|
+
# Bookkeeping for failed parses. Each entry is stored as the array
#   [sent_id, frame, target, target_pos, fe_list]
# in the order in which it was registered.
class FailedParses

  # Start with an empty list of failed parses.
  def initialize()
    @failed_parses = Array.new
  end

  # Register a new failed parse.
  #
  # sent_id:    sentence ID (any object)
  # frame:      string: frame name
  # target:     string (presumably the target lemma -- not checked here)
  # target_pos: string: target POS
  # fe_list:    array of strings: the FE labels that were missed
  #
  # The same sentence ID may be registered more than once; every call
  # appends an entry. (The original code looked up the ID first but its
  # reaction was commented out, so the lookup was dropped.)
  def register(sent_id, frame, target, target_pos, fe_list)
    @failed_parses << [sent_id, frame, target, target_pos, fe_list]
  end

  # Randomly split the failed parses into a train and a test section.
  #
  # train_percentage: Integer between 0 and 100 (inclusive); each entry
  #   lands in the train section with this probability, so 0 sends
  #   everything to test and 100 sends everything to train.
  #
  # returns: array [train, test] of two new FailedParses objects
  # raises:  RuntimeError if train_percentage is not an Integer in 0..100
  def make_split(train_percentage)
    # is_a? rather than comparing classes with <: since Fixnum and Integer
    # were unified, an integer's class is Integer itself, and
    # Integer < Integer is false.
    unless train_percentage.is_a?(Integer) && train_percentage >= 0 && train_percentage <= 100
      raise "Need Integer between 0 and 100 as training percentage."
    end

    train_failed = FailedParses.new()
    test_failed = FailedParses.new()
    @failed_parses.each do |sent_id, frame, target, target_pos, fe_list|
      # rand(100) is uniform over 0..99, so exactly train_percentage of
      # the 100 possible values are below the threshold.
      section = rand(100) < train_percentage ? train_failed : test_failed
      section.register(sent_id, frame, target, target_pos, fe_list)
    end
    [train_failed, test_failed]
  end

  # Number of failed sentences.
  #
  # frame_spec, target_spec, target_pos_spec: optional filters; a filter
  #   that is not given or nil matches every value, so that marginal
  #   frequencies are counted.
  #
  # returns: integer
  def failed_sent(frame_spec=nil, target_spec=nil, target_pos_spec=nil)
    @failed_parses.count do |entry|
      matches_spec?(entry, frame_spec, target_spec, target_pos_spec)
    end
  end

  # Number of failed FEs per FE label.
  #
  # Filters work as in failed_sent.
  #
  # returns: hash mapping FE names (strings) onto the number of failed FEs
  #   (integers); labels that never occur map to 0.
  def failed_fes(frame_spec=nil, target_spec=nil, target_pos_spec=nil)
    fe_hash = Hash.new(0)
    @failed_parses.each do |entry|
      next unless matches_spec?(entry, frame_spec, target_spec, target_pos_spec)
      entry[4].each { |fe_label| fe_hash[fe_label] += 1 }
    end
    fe_hash
  end

  # Save the info about failed parses to a file (Marshal format).
  # The file is opened in binary mode and closed even if dumping fails.
  def save(filename)
    File.open(filename, "wb") do |io_obj|
      Marshal.dump(@failed_parses, io_obj)
    end
  end

  # Load the info about failed parses from a file written by save.
  #
  # If the file cannot be read or unmarshalled, a warning is printed to
  # stderr and the current contents of this object are left untouched.
  #
  # NOTE(review): Marshal.load must only be used on trusted files, since
  # it can instantiate arbitrary objects.
  def load(filename)
    begin
      File.open(filename, "rb") do |io_obj|
        @failed_parses = Marshal.load(io_obj)
      end
    rescue StandardError
      $stderr.puts "WARNING: couldn't read failed parses file #{filename}."
      $stderr.puts "I'll assume that there are no failed parses."
    end
  end

  private

  # True if the entry agrees with every filter that is not nil.
  def matches_spec?(entry, frame_spec, target_spec, target_pos_spec)
    _sent_id, frame, target, target_pos, _fe_list = entry
    (frame_spec.nil? || frame_spec == frame) &&
      (target_spec.nil? || target_spec == target) &&
      (target_pos_spec.nil? || target_pos_spec == target_pos)
  end
end
|
@@ -0,0 +1,242 @@
|
|
1
|
+
require 'common/ruby_class_extensions'
|
2
|
+
|
3
|
+
# Knows which feature extractors are active in the current experiment and
# answers questions about the features (DB columns) they produce.
#
# Extractor classes register themselves via RosyFeatureInfo.add_feature.
# From the calls made here, an extractor class is expected to respond to
# designator, feature_type, is_computable(list of designators),
# feature_names, sql_type, info, phase and new(exp, interpreter_class).
class RosyFeatureInfo
  # class variable: list of all known extractor classes;
  # add to it using add_feature()
  @@extractors = Array.new

  # boolean, set to true after the first object has been constructed so
  # that warning messages are given only once
  @@warned = false

  # Register a feature extractor class.
  #
  # class_name: Class object
  def RosyFeatureInfo.add_feature(class_name)
    @@extractors << class_name
  end

  # Determine the active extractors and the features they provide.
  #
  # exp: experiment file object; exp.get_lf("feature") must yield pairs
  #      [feature group designator (string), options (array of strings)]
  #
  # Exits the process with status 1 if a feature is given more than one
  # of the options dontuse, argrec, arglab, onestep.
  def initialize(exp)
    @exp = exp

    # list of hashes "extractor" -> extractor class,
    #                "step"      -> argrec, arglab, onestep, dontuse, or nil
    @current_extractors = Array.new

    add_user_extractors(exp)
    add_system_extractors
    remove_uncomputable_extractors

    # list of all features as hashes
    #   "feature_name" -> string,
    #   "sql_type"     -> string,
    #   "is_index"     -> boolean,
    #   "step"         -> string: argrec, arglab, onestep, dontuse, or nil
    #   "type"         -> string
    #   "phase"        -> string: phase 1 or phase 2
    @features = build_feature_list

    # do not print warnings again if another RosyFeatureInfo object is made
    @@warned = true
  end

  # get_column_formats
  #
  # phase: string "phase 1" or "phase 2", or nil for all phases
  #
  # returns: list of pairs [feature_name (string), sql_column_format (string)]
  # for all features to be computed
  def get_column_formats(phase = nil)
    features_of_phase(phase).map do |feature_descr|
      [feature_descr["feature_name"], feature_descr["sql_type"]]
    end
  end

  # get_column_names
  #
  # phase: string "phase 1" or "phase 2", or nil for all phases
  #
  # returns: list of feature names (strings) for all features to be computed
  def get_column_names(phase = nil)
    features_of_phase(phase).map { |feature_descr| feature_descr["feature_name"] }
  end

  # get_index_columns
  #
  # returns: list of feature (column) names as strings, consisting of all
  # features whose extractor lists "index" in its info()
  def get_index_columns()
    @features.select { |feature_descr|
      feature_descr["is_index"]
    }.map { |feature_descr|
      feature_descr["feature_name"]
    }
  end

  # get_model_features
  #
  # step: argrec, arglab, onestep
  #
  # returns: list of feature (column) names as strings, consisting of all
  # the features to be used for modeling in the given step. A feature
  # whose step is nil is used in all steps, one whose step is "dontuse"
  # in none; admin and gold features are never returned.
  def get_model_features(step)
    @features.select { |feature_descr|
      feature_descr["step"].nil? || feature_descr["step"] == step
    }.reject { |feature_descr|
      ["admin", "gold"].include? feature_descr["type"]
    }.map { |feature_descr|
      feature_descr["feature_name"]
    }
  end

  # get_extractor_objects
  #
  # phase:             string: "phase 1" or "phase 2"
  # interpreter_class: SynInterpreter class, handed to each extractor
  #
  # returns: two lists of newly made feature extractor objects covering all
  # extractors of the given phase: first those whose class lists "rosy" in
  # its info(), second the others. (The split relies on the project's
  # distribute extension to arrays.)
  #
  # raises: RuntimeError for any other phase value
  def get_extractor_objects(phase, interpreter_class)
    unless ["phase 1", "phase 2"].include? phase
      # interpolate instead of concatenating, so that a non-string phase
      # (e.g. nil) still yields this error rather than a TypeError
      raise "Shouldn't be here: #{phase}"
    end

    @current_extractors.select { |descr|
      # select extractors of the right phase
      descr["extractor"].phase() == phase
    }.map { |descr|
      # make objects from extractor classes
      descr["extractor"].new(@exp, interpreter_class)
    }.distribute { |extractor_obj|
      # first bin: rosy extractors, second bin: others
      extractor_obj.class.info().include? "rosy"
    }
  end

  private

  # Print a warning to stderr unless warnings were already given by an
  # earlier RosyFeatureInfo object.
  def warn_once(message)
    $stderr.puts message unless @@warned
  end

  # Add the extractors chosen by the user in the experiment file,
  # skipping designators for which no extractor is registered.
  def add_user_extractors(exp)
    exp.get_lf("feature").each do |extractor_name, options|
      extractor = @@extractors.detect { |e| e.designator() == extractor_name }
      unless extractor
        warn_once("Warning: Could not find a feature extractor for #{extractor_name}: skipping.")
        next
      end

      @current_extractors << {
        "extractor" => extractor,
        "step" => read_step_option(extractor_name, options)
      }
    end
  end

  # Read and check the options of one feature; returns the step option
  # (dontuse, argrec, arglab, onestep) or nil if none was given.
  def read_step_option(extractor_name, options)
    step = nil
    options.each do |option|
      case option
      when "dontuse", "argrec", "arglab", "onestep"
        if step
          # step has already been set
          $stderr.puts "ERROR in feature #{extractor_name}: Please set only one of the options dontuse, argrec, arglab, onestep"
          exit 1
        end
        step = option
      else
        warn_once("Warning: Unknown option for feature #{extractor_name}: #{option}. Skipping")
      end
    end
    step
  end

  # Add the extractors needed by the system (admin and gold features).
  # If the user listed one of them, that entry is replaced so that the
  # extractor is always present with step "dontuse".
  def add_system_extractors
    @@extractors.select { |e|
      ["admin", "gold"].include? e.feature_type()
    }.each do |extractor|
      @current_extractors.delete_if { |descr| descr["extractor"].designator() == extractor.designator() }

      @current_extractors << {
        "extractor" => extractor,
        "step" => "dontuse"
      }
    end
  end

  # Designators of the current extractors usable in the given step:
  # those without a step restriction plus those restricted to that step.
  def designators_for(step)
    @current_extractors.select { |descr|
      descr["step"].nil? || descr["step"] == step
    }.map { |descr| descr["extractor"].designator() }
  end

  # Make sure that all extractors are computable in the current model
  # (i.e. check dependencies); remove those that are not.
  def remove_uncomputable_extractors
    # computed up front, before anything is removed
    available = {
      nil => designators_for(nil),
      "argrec" => designators_for("argrec"),
      "arglab" => designators_for("arglab"),
      "onestep" => designators_for("onestep")
    }

    @current_extractors.delete_if do |descr|
      step = descr["step"]
      computable =
        if step == "dontuse"
          # either an admin feature or a user feature not to be used this time
          true
        elsif available.key?(step)
          descr["extractor"].is_computable(available[step])
        end

      if computable
        false # i.e. don't delete
      else
        warn_once("Warning: Feature extractor #{descr["extractor"].designator()} cannot be computed: skipping.")
        true
      end
    end
  end

  # One feature description hash per feature name of each current extractor.
  def build_feature_list
    features = Array.new
    @current_extractors.each do |descr|
      extractor = descr["extractor"]
      extractor.feature_names.each do |feature_name|
        features << {
          "feature_name" => feature_name,
          "sql_type" => extractor.sql_type(),
          "is_index" => extractor.info().include?("index"),
          "step" => descr["step"],
          "type" => extractor.feature_type(),
          "phase" => extractor.phase()
        }
      end
    end
    features
  end

  # Feature descriptions of the given phase; all of them if phase is nil.
  def features_of_phase(phase)
    @features.select do |feature_descr|
      phase.nil? || feature_descr["phase"] == phase
    end
  end
end
|