shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,242 +0,0 @@
1
- # Katrin Erk November 05
2
- #
3
- # Abstract classes for
4
- # - Rosy features
5
- # - Rosy interface for external knowledge sources.
6
-
7
- require 'rosy/ExternalConfigData'
8
-
9
- ####
10
- # Feature Extractor:
11
- # computes one or more features for a node (a SynNode object) out of
12
- # a SalsaTigerSentence
13
- class AbstractFeatureExtractor
14
- @@sent = nil # SalsaTigerSentence: sentence of the current instance
15
- @@frame = nil # FrameNode: frame of the current instance
16
- @@node = nil # SynNode: constituent that is the current instance
17
- @@interpreter_class = nil # SynInterpreter class
18
- @@instance_ok = true
19
-
20
- ###
21
- # returns a string: the designator for this feature extractor
22
- # (an extractor may compute several features, but
23
- # in the experiment file it is chosen by a single designator)
24
- def AbstractFeatureExtractor.designator()
25
- raise "Overwrite me"
26
- end
27
-
28
- ###
29
- # returns an array of feature names, the names of the
30
- # features that it can compute.
31
- # The number of features that the extractor computes must be fixed.
32
- def AbstractFeatureExtractor.feature_names()
33
- raise "Overwrite me."
34
- end
35
-
36
- ###
37
- # returns a string: the data type for the feature
38
- # to be passed on to the MySQL database,
39
- # e.g. VARCHAR(10), INT
40
- def AbstractFeatureExtractor.sql_type()
41
- raise "Overwrite me"
42
- end
43
-
44
- ###
45
- # returns a string: the feature type
46
- # (the same for all features computed by this extractor)
47
- # possible values:
48
- # - gold: gold label
49
- # - admin: administrative feature, do not pass this on to the learner
50
- # - syn: feature computed from syntactic characteristics of the instance
51
- # - sem: feature involving semantic characteristics of the instance
52
- # - sentlevel: this feature is the same for all instances of a sentence
53
- def AbstractFeatureExtractor.feature_type()
54
- raise "Overwrite me"
55
- end
56
-
57
- ###
58
- # returns a string: "phase 1" or "phase 2",
59
- # depending on whether the feature is computed
60
- # directly from the SalsaTigerSentence and the SynNode objects
61
- # or whether it is computed from the phase 1 features
62
- def AbstractFeatureExtractor.phase()
63
- raise "Overwrite me."
64
- end
65
-
66
- ###
67
- # returns an array of strings, providing information about
68
- # the feature extractor
69
- def AbstractFeatureExtractor.info()
70
- return []
71
- end
72
-
73
- ###
74
- # set sentence, set node, set other settings:
75
- # this is done prior to
76
- # feature computation using compute_feature()
77
- # such that computations that stay the same for
78
- # several features can be done in advance
79
- #
80
- # This is just relevant for Phase 1
81
- #
82
- # returns: false/nil if there was a problem
83
- def AbstractFeatureExtractor.set_sentence(sent, # SalsaTigerSentence object
84
- frame) # FrameNode object
85
- @@sent = sent
86
- @@frame = frame
87
-
88
- return true
89
- end
90
-
91
- def AbstractFeatureExtractor.set_node(node) # SynNode of the sentence set in set_sentence
92
- @@node = node
93
-
94
- return true
95
- end
96
-
97
- ###
98
- # set sentence, set node, set general settings: this is done prior to
99
- # feature computation using compute_feature_value()
100
- # such that computations that stay the same for
101
- # several features can be done in advance
102
- def AbstractFeatureExtractor.set(var_hash = {})
103
- # no settings at this point
104
-
105
- return true
106
- end
107
- # test during initialisation whether a feature is computable
108
- # gives the feature the possibility to specify additional constraints
109
- # e.g. for phase2 features : specify which extractors from phase 1 are presupposed
110
- def AbstractFeatureExtractor.is_computable(extractor_list) # bool
111
- return true
112
- end
113
-
114
- ###
115
- # @param exp [ConfigData] Experiment file information
116
- # @param interpreter_class [Class]
117
- def initialize(exp, interpreter_class)
118
- @exp = exp
119
- @@interpreter_class = interpreter_class
120
- end
121
-
122
- ###
123
- # compute: compute features
124
- #
125
- # returns an array of features (strings), length the same as the
126
- # length of feature_names()
127
- def compute_features()
128
- raise "overwrite me"
129
- end
130
-
131
- ###
132
- # phase 2 extractors:
133
- # compute features for a complete view
134
- #
135
- # returns: an array of columns,
136
- # where a column is an array of feature values.
137
- # returns one column per entry in feature_names()
138
- def compute_features_on_view(view) # DBView object
139
- raise "overwrite me"
140
- end
141
-
142
- # At this place, we had abstract methods for "training" phase 2 features
143
- # Since this involves introducing a "state" that is nontrivial to preserve
144
- # for a standalone version of the classifiers, without keeping the training data,
145
- # we decided to remove this functionality (30.11.05).
146
- # Features which rely on learning patterns from the training data and applying them
147
- # to the test data will from now on be implemented as externals.
148
-
149
- ######
150
- protected
151
-
152
- def AbstractFeatureExtractor.announce_me()
153
- # AB: In 1.9 constants are symbols.
154
- if Module.constants.include?("RosyFeatureInfo") or Module.constants.include?(:RosyFeatureInfo)
155
- # yup, we have a class to which we can announce ourselves
156
- RosyFeatureInfo.add_feature(eval(self.name()))
157
- else
158
- # no interface collector class
159
- # $stderr.puts "Feature #{self.name()} not announced: no RosyFeatureInfo."
160
- end
161
- end
162
- end
163
-
164
- ################################################################
165
- # Wrapper class for extractors that compute a single feature
166
- class AbstractSingleFeatureExtractor < AbstractFeatureExtractor
167
-
168
- ###
169
- # returns a string: the designator for this feature extractor
170
- # (an extractor may compute several features, but
171
- # in the experiment file it is chosen by a single designator)
172
- #
173
- # here: single feature, and the feature name is the designator
174
- def AbstractFeatureExtractor.designator()
175
- return eval(self.name()).feature_name()
176
- end
177
-
178
- ###
179
- def AbstractSingleFeatureExtractor.feature_names()
180
- return [eval(self.name()).feature_name()]
181
- end
182
-
183
- ###
184
- def compute_features()
185
- return [compute_feature()]
186
- end
187
-
188
- def compute_features_on_view(view) # DBView object
189
- return [compute_feature_on_view(view)]
190
- end
191
-
192
-
193
- ######
194
- # Single-feature methods
195
-
196
- ###
197
- def AbstractSingleFeatureExtractor.feature_name()
198
- raise "Overwrite me."
199
- end
200
-
201
- ###
202
- def compute_feature()
203
- raise "Overwrite me"
204
- end
205
-
206
- ###
207
- def compute_feature_on_view(view) # DBView object
208
- raise "Overwrite me"
209
- end
210
- end
211
-
212
- ######################################################
213
-
214
- class ExternalFeatureExtractor < AbstractFeatureExtractor
215
-
216
- @@warning_uttered = false
217
-
218
- ####
219
- # initialization:
220
- #
221
- # read experiment file for external interfaces
222
- def initialize(exp, # RosyConfigData object
223
- interpreter_class)
224
-
225
- @exp_rosy = exp
226
- @@interpreter_class = interpreter_class
227
-
228
- unless @exp_rosy.get("external_descr_file")
229
- unless @@warning_uttered
230
- $stderr.puts "Warning: Cannot compute external feature"
231
- $stderr.puts "since 'external_descr_file' has not been set"
232
- $stderr.puts "in the Rosy experiment file."
233
- @@warning_uttered = true
234
- end
235
-
236
- @exp_external = nil
237
- return
238
- end
239
-
240
- @exp_external = ExternalConfigData.new(@exp_rosy.get("external_descr_file"))
241
- end
242
- end
@@ -1,58 +0,0 @@
1
- # ExternalConfigData
2
- # Katrin Erk January 2006
3
- #
4
- # All scripts that compute additional external knowledge sources
5
- # for Fred and Rosy:
6
- # access to configuration and experiment description file
7
-
8
- require 'common/config_data'
9
-
10
- ##############################
11
- # Class ExternalConfigData
12
- #
13
- # inherits from ConfigData,
14
- # sets variable names appropriate to tasks of external knowledge sources
15
-
16
- class ExternalConfigData < ConfigData
17
- def initialize(filename)
18
-
19
- # initialize config data object
20
- super(filename, # config file
21
- { "directory" => "string", # features
22
-
23
- "experiment_id" => "string",
24
-
25
- "gfmap_restrict_to_downpath" => "bool",
26
- "gfmap_restrict_pathlen" => "integer",
27
- "gfmap_remove_gf" => "list"
28
- },
29
- [] # variables
30
- )
31
-
32
- # set access functions for list features
33
- set_list_feature_access("gfmap_remove_gf",
34
- method("access_as_stringlist"))
35
- end
36
-
37
- ###
38
- protected
39
-
40
- #####
41
- # access_as_stringlist
42
- #
43
- # assumed format:
44
- #
45
- # lhs = rhs1 rhs2 ... rhsN
46
- #
47
- # given in val_list as string tuples [rhs1,...,rhsN]
48
- #
49
- # join the rhs strings by spaces, return as string
50
- # "rhs1 rhs2 ... rhsN"
51
- #
52
- def access_as_stringlist(val_list) # array:array:string
53
- return val_list.map { |rhs| rhs.join(" ") }
54
- end
55
- end
56
-
57
-
58
-
@@ -1,130 +0,0 @@
1
- # Failed Parses
2
- #
3
- # SP May 05
4
- #
5
- # Administration of information about failed parses;
6
- # - sentence ID
7
- # - frame
8
- # - missed FE markables
9
- #
10
- # this class is pretty much a glorified hash table with methods to
11
- # - read FailedParses from a file and to write them to a file
12
- # - access info in a frame-specific way
13
-
14
- class FailedParses
15
-
16
- ###
17
- # initialize
18
- #
19
- # nothing much happens here
20
- def initialize()
21
- @failed_parses = Array.new
22
- end
23
-
24
- ###
25
- # register
26
- #
27
- # register new failed parse by specifying
28
- # - its sentence id (any object)
29
- # - its frame (String)
30
- # - its FE list (String Array)
31
-
32
- def register(sent_id, # object
33
- frame, # string: frame name
34
- target, # string?
35
- target_pos, # string: target POS
36
- fe_list) # array:string
37
- if @failed_parses.assoc sent_id
38
- # $stderr.puts "Error: trying to register sentence id #{sent_id} twice!"
39
- # $stderr.puts "Skipping second occurrence."
40
- end
41
- @failed_parses << [sent_id,frame,target,target_pos,fe_list]
42
- end
43
-
44
- ###
45
- # make_split
46
- #
47
- # produce a "split" of the failed parses into a train and a test section
48
- # parameter: train_percentage, Integer between 0 and 100
49
- #
50
- # returns an Array with two FailedParses objects, the first for the
51
- # train data, the second for the test data
52
-
53
- def make_split(train_percentage)
54
- unless train_percentage.class < Integer and train_percentage >= 0 and train_percentage <= 100
55
- raise "Need Integer between 0 and 100 as training percentage."
56
- end
57
- train_failed = FailedParses.new()
58
- test_failed = FailedParses.new()
59
- @failed_parses.each {|sent_id,frame,target,target_pos,fe_list|
60
- if rand(100) > train_percentage
61
- test_failed.register(sent_id,frame,target,target_pos,fe_list)
62
- else
63
- train_failed.register(sent_id,frame,target,target_pos,fe_list)
64
- end
65
- }
66
- return [train_failed, test_failed]
67
- end
68
-
69
- ###
70
- # Access information
71
- #
72
- # failed_sent: number of failed sentences
73
- # failed_fes: Hash that maps FE names [String] onto numbers of failed FEs [Int]
74
- #
75
- # optional parameters: frame, target, target_pos : if not specified or nil, marginal
76
- # frequencies are counted (sum over all values)
77
-
78
-
79
- def failed_sent(frame_spec=nil,target_spec=nil,target_pos_spec=nil)
80
- counter = 0
81
- @failed_parses.each {|sent_id,frame,target,target_pos,fe_list|
82
- if ((frame_spec.nil? or frame_spec == frame) and
83
- (target_spec.nil? or target_spec == target) and
84
- (target_pos_spec.nil? or target_pos_spec == target_pos))
85
- counter += 1
86
- end
87
- }
88
- return counter
89
- end
90
-
91
- def failed_fes(frame_spec=nil,target_spec=nil,target_pos_spec=nil)
92
- fe_hash = Hash.new(0)
93
- @failed_parses.each {|sent_id,frame,target,target_pos,fe_list|
94
- if ((frame_spec.nil? or frame_spec == frame) and
95
- (target_spec.nil? or target_spec == target) and
96
- (target_pos_spec.nil? or target_pos_spec == target))
97
- fe_list.each {|fe_label|
98
- fe_hash[fe_label] += 1
99
- }
100
- end
101
- }
102
- return fe_hash
103
- end
104
-
105
-
106
- ###
107
- # Marshalling:
108
- #
109
- # save - save info about failed parses to file
110
- # load - load info about failed parses from file
111
-
112
- def save(filename)
113
- io_obj = File.new(filename,"w")
114
- Marshal.dump(@failed_parses,io_obj)
115
- io_obj.close
116
- end
117
-
118
- def load(filename)
119
- begin
120
- io_obj = File.new(filename)
121
- @failed_parses = Marshal.load(io_obj)
122
- io_obj.close
123
- rescue
124
- $stderr.puts "WARNING: couldn't read failed parses file #{filename}."
125
- $stderr.puts "I'll assume that there are no failed parses."
126
- end
127
- end
128
-
129
-
130
- end
@@ -1,242 +0,0 @@
1
- require 'common/ruby_class_extensions'
2
-
3
- class RosyFeatureInfo
4
- ###
5
- # class variable:
6
- # list of all known extractors
7
- # add to it using add_feature()
8
- @@extractors = Array.new
9
-
10
- # boolean. set to true after warning messages have been given once
11
- @@warned = false
12
-
13
- ###
14
- # add interface/interpreter
15
- def RosyFeatureInfo.add_feature(class_name) # Class object
16
- @@extractors << class_name
17
- end
18
-
19
- ###
20
- def initialize(exp)
21
-
22
- ##
23
- # make list of extractors that are
24
- # either required by the user
25
- # or needed by the system
26
- @current_extractors = Array.new
27
- @exp = exp
28
-
29
- # user-chosen extractors:
30
- # returns array of pairs [feature group designator(string), options(array:string)]
31
- exp.get_lf("feature").each { |extractor_name, options|
32
- extractor = @@extractors.detect { |e| e.designator() == extractor_name }
33
- unless extractor
34
- # no extractor found matching the given designator
35
- unless @@warned
36
- $stderr.puts "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
37
- end
38
- next
39
- end
40
-
41
- # read and check options
42
- step = nil
43
-
44
- options.each { |option|
45
- case option
46
- when "dontuse", "argrec", "arglab", "onestep"
47
-
48
- if step
49
- # step has already been set
50
- $stderr.puts "ERROR in feature #{extractor_name}: Please set only one of the options dontuse, argrec, arglab, onestep"
51
- exit 1
52
- end
53
-
54
- step = option
55
-
56
- else
57
- unless @@warned
58
- $stderr.puts "Warning: Unknown option for feature #{extractor_name}: #{option}. Skipping"
59
- end
60
- end
61
- }
62
-
63
- @current_extractors << {
64
- "extractor" => extractor,
65
- "step" => step
66
- }
67
- }
68
-
69
- # extractors needed by the system
70
- @@extractors.select { |e|
71
- # select admin features and gold feature
72
- ["admin", "gold"].include? e.feature_type()
73
- }.each { |extractor|
74
-
75
- # if we have already added that extractor, remove it
76
- # and add it with our own options
77
- @current_extractors.delete_if { |descr| descr["extractor"].designator() == extractor.designator() }
78
-
79
- @current_extractors << {
80
- "extractor"=> extractor,
81
- "step" => "dontuse"
82
- }
83
- }
84
-
85
- # make sure that all extractors are computable in the current model
86
- # (i.e. check dependencies)
87
-
88
- allstep_extractors = @current_extractors.find_all {|e_hash| e_hash["step"].nil?
89
- }.map { |e| e["extractor"].designator() }
90
- argrec_extractors = @current_extractors.find_all {|e_hash| e_hash["step"].nil? or e_hash["step"] == "argrec"
91
- }.map { |e| e["extractor"].designator() }
92
- arglab_extractors = @current_extractors.find_all {|e_hash| e_hash["step"].nil? or e_hash["step"] == "arglab"
93
- }.map { |e| e["extractor"].designator() }
94
- onestep_extractors = @current_extractors.find_all {|e_hash| e_hash["step"].nil? or e_hash["step"] == "onestep"
95
- }.map { |e| e["extractor"].designator() }
96
-
97
- @current_extractors.delete_if {|extractor_hash|
98
- case extractor_hash["step"]
99
- when nil
100
- computable = extractor_hash["extractor"].is_computable(allstep_extractors)
101
- when "argrec"
102
- computable = extractor_hash["extractor"].is_computable(argrec_extractors)
103
- when "arglab"
104
- computable = extractor_hash["extractor"].is_computable(arglab_extractors)
105
- when "onestep"
106
- computable = extractor_hash["extractor"].is_computable(onestep_extractors)
107
- when "dontuse"
108
- # either an admin feature or a user feature not to be used this time
109
- computable = true
110
- end
111
-
112
- if computable
113
- false # i.e. don't delete
114
- else
115
- unless @@warned
116
- $stderr.puts "Warning: Feature extractor #{extractor_hash["extractor"].designator()} cannot be computed: skipping."
117
- end
118
- true
119
- end
120
- }
121
-
122
- # make list of all features as hashes
123
- # "feature_name" -> string,
124
- # "sql_type" -> string,
125
- # "is_index" -> boolean,
126
- # "step" -> string: argrec, arglab, onestep, or nil
127
- # "type" -> string
128
- # "phase" -> string: phase 1 or phase 2
129
- @features = Array.new
130
- @current_extractors.each { |descr|
131
- extractor = descr["extractor"]
132
- extractor.feature_names.each { |feature_name|
133
- @features << {
134
- "feature_name" => feature_name,
135
- "sql_type" => extractor.sql_type(),
136
- "is_index" => extractor.info().include?("index"),
137
- "step" => descr["step"],
138
- "type" => extractor.feature_type(),
139
- "phase" => extractor.phase()
140
- }
141
- }
142
- }
143
-
144
- # do not print warnings again if another RosyFeatureInfo object is made
145
- @@warned = true
146
- end
147
-
148
- ###
149
- # get_column_formats
150
- #
151
- # returns a list of pairs [feature_name(string), sql_column_format(string)]:
152
- # all features to be computed, with their SQL column formats
153
- def get_column_formats(phase = nil) # string: phase 1 or phase 2
154
- return @features.select { |feature_descr|
155
- phase.nil? or
156
- feature_descr["phase"] == phase
157
- }.map { |feature_descr|
158
- [feature_descr["feature_name"], feature_descr["sql_type"]]
159
- }
160
- end
161
-
162
- ###
163
- # get_column_names
164
- #
165
- # returns a list of feature names (strings)
166
- # all features to be computed
167
- def get_column_names(phase = nil) # string: phase 1 or phase 2
168
- return @features.select { |feature_descr|
169
- phase.nil? or
170
- feature_descr["phase"] == phase
171
- }.map { |feature_descr|
172
- feature_descr["feature_name"]
173
- }
174
- end
175
-
176
- ###
177
- # get_index_columns
178
- #
179
- # returns a list of feature (column) names as Strings
180
- # consisting of all features that have been requested as index features
181
- # in the experiment file or in the list of @@extractors above
182
- def get_index_columns()
183
- return @features.select { |feature_descr|
184
- feature_descr["is_index"]
185
- }.map {|feature_descr|
186
- feature_descr["feature_name"]
187
- }
188
- end
189
-
190
- ###
191
- # get_model_features
192
- #
193
- # returns a list of feature (column) names as strings
194
- # consisting of all the features to be used for the modeling
195
- #
196
- # step: argrec, arglab, onestep
197
- def get_model_features(step)
198
-
199
- return @features.select { |feature_descr|
200
- # features for the current step
201
- # feature_descr["step"] is argrec, arglab, onestep, dontuse, or nil
202
- # nil matches all steps
203
- # 'dontuse' matches no step, so these features will never be returned here
204
- feature_descr["step"].nil? or
205
- feature_descr["step"] == step
206
- }.reject { |feature_descr|
207
- # that are not admin features or the gold label
208
- ["admin", "gold"].include? feature_descr["type"]
209
- }.map { |feature_descr|
210
- # use just the names of the features
211
- feature_descr["feature_name"]
212
- }
213
- end
214
-
215
- ###
216
- # get_extractor_objects
217
- #
218
- # returns two lists of feature extractor objects,
219
- # covering all features of the given phase:
220
- # the first list contains RosyFeatureExtractor extractors,
221
- # the second list contains the others.
222
- def get_extractor_objects(phase, # string: "phase 1" or "phase 2"
223
- interpreter_class) # SynInterpreter class
224
- unless ["phase 1", "phase 2"].include? phase
225
- raise "Shouldn't be here: " + phase
226
- end
227
-
228
- return @current_extractors.select { |descr|
229
- # select extractors of the right phase
230
- descr["extractor"].phase() == phase
231
- }.map { |descr|
232
-
233
- # make objects from extractor classes
234
- descr["extractor"].new(@exp, interpreter_class)
235
- }.distribute { |extractor_obj|
236
- # distribute extractors in two bins:
237
- # first, rosy extractors
238
- # second, others
239
- extractor_obj.class.info().include? "rosy"
240
- }
241
- end
242
- end