frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,176 @@
1
+ ###########################
2
+ # DBWrapper:
3
+ # abstract class wrapping database interfaces,
4
+ # so we can have both an interface to an SQL server
5
+ # and an interface to SQLite in Shalmaneser
6
class DBWrapper
  # name of the table this object is bound to;
  # nil unless the object was produced by make_temp_table()
  attr_reader :table_name

  ###
  # exp: RosyConfigData experiment file object
  def initialize(exp)
    # remember experiment file
    @exp = exp

    # database handle:
    # subclasses are expected to set this to an open connection
    @database = nil

    # name of default table to access: none
    @table_name = nil
  end

  ###
  # close DB access
  #
  # does nothing if no database handle has been set
  # (the base class leaves @database at nil)
  def close()
    @database.close() if @database
  end

  ####
  # querying the database:
  # returns a DBResult object
  #
  # abstract: raises RuntimeError unless overwritten in a subclass
  def query(query)
    raise "Overwrite me"
  end

  ####
  # querying the database:
  # no result value
  #
  # abstract: raises RuntimeError unless overwritten in a subclass
  def query_noretv(query)
    raise "Overwrite me"
  end

  ###
  # list all tables in the database:
  # no default here
  #
  # returns: list of strings
  def list_tables()
    raise "Overwrite me"
  end

  ###
  # make a table (no default here)
  #
  # returns: nothing
  def create_table(table_name,         # string
                   column_formats,     # array: array: string*string [column_name,column_format]
                   index_column_names, # array: string: column_name
                   indexname)          # string: name of automatically created index column
    raise "overwrite me"
  end

  ###
  # remove a table
  #
  # sends a DROP TABLE statement through query_noretv();
  # table_name has to be a string
  def drop_table(table_name)
    query_noretv("DROP TABLE " + table_name)
  end

  ###
  # list all column names of a table
  # (derived from list_column_formats(), which has no default)
  #
  # returns: array of strings
  def list_column_names(table_name)
    return list_column_formats(table_name).map { |col_name, _col_format| col_name }
  end

  #####
  # list_column_formats
  #
  # list column names and column types of this table
  #
  # returns: array:string*string, list of pairs [column name, column format]
  def list_column_formats(table_name)
    raise "Overwrite me"
  end

  ####
  # num_rows
  #
  # determine the number of rows in a table
  # returns: integer
  def num_rows(table_name)
    raise "Overwrite me"
  end

  ####
  # make a temporary table: basically just make a table
  #
  # returns: DBWrapper object (or object of current subclass)
  # that has the @table_name attribute set to the name of a temporary DB
  #
  # the returned object is a shallow clone of this one,
  # so both share the same @database handle
  def make_temp_table(column_formats,     # array: string*string [column_name,column_format]
                      index_column_names, # array: string: column_name
                      indexname)          # string: name of autoincrement primary index
    temp_obj = self.clone()
    temp_obj.initialize_temp_table(column_formats, index_column_names, indexname)
    return temp_obj
  end

  ###
  # remove the temporary table this object is bound to
  #
  # raises RuntimeError if this object has no temp table
  def drop_temp_table()
    unless @table_name
      raise "can only do drop_temp_table() for objects that have a temp table"
    end
    drop_table(@table_name)
  end

  ##############################
  protected

  ###
  # choose a name for the temp table and create the table.
  # The name is "t" followed by the digits of the current time in seconds
  # (float, decimal point removed).
  def initialize_temp_table(column_formats, index_column_names, indexname)
    @table_name = "t" + Time.new().to_f().to_s().gsub(/\./, "")
    create_table(@table_name, column_formats, index_column_names, indexname)
  end
end
124
+
125
+
126
+
127
+
128
+ ######################################################################
129
+ # DBResult:
130
+ # abstract class keeping query results
131
+ #
132
+ # instantiate for the DB package used
133
class DBResult
  ###
  # initialize with query result, and keep it
  #
  # value: result object of the DB package used; the default methods below
  # simply forward to it (num_rows, each, each_hash, free, fetch_row)
  def initialize(value)
    @result = value
  end

  # column names: NO DEFAULT
  # raises RuntimeError unless overwritten in a subclass
  def list_column_names()
    raise "Overwrite me"
  end

  # number of rows: returns an integer
  def num_rows()
    return @result.num_rows
  end

  # yields each row as an array of values
  #
  # without a block, returns an Enumerator over the rows
  # instead of failing at the first yield
  def each()
    return enum_for(:each) unless block_given?
    @result.each { |row| yield row }
  end

  # yields each row as a hash: column name=> column value
  #
  # without a block, returns an Enumerator over the row hashes
  def each_hash()
    return enum_for(:each_hash) unless block_given?
    @result.each_hash { |row_hash| yield row_hash }
  end

  # reset object, such that each() can be run again
  # DEFAULT DOES NOTHING, PLEASE OVERWRITE
  def reset()
  end

  # free result object
  def free()
    @result.free()
  end

  # returns row as an array of column contents
  def fetch_row()
    return @result.fetch_row()
  end

end
176
+
@@ -0,0 +1,58 @@
1
+ # ExternalConfigData
2
+ # Katrin Erk January 2006
3
+ #
4
+ # All scripts that compute additional external knowledge sources
5
+ # for Fred and Rosy:
6
+ # access to configuration and experiment description file
7
+
8
+ require 'common/ConfigData'
9
+
10
+ ##############################
11
+ # Class ExternalConfigData
12
+ #
13
+ # inherits from ConfigData,
14
+ # sets variable names appropriate to tasks of external knowledge sources
15
+
16
class ExternalConfigData < ConfigData
  ###
  # filename: name of the config file, handed on to ConfigData
  def initialize(filename)
    # features known for external knowledge sources: feature name => type
    feature_types = {
      "directory" => "string",

      "experiment_id" => "string",

      "gfmap_restrict_to_downpath" => "bool",
      "gfmap_restrict_pathlen" => "integer",
      "gfmap_remove_gf" => "list"
    }

    # no variables for this kind of config file
    variables = []

    # initialize config data object
    super(filename, feature_types, variables)

    # set access functions for list features
    stringlist_access = method("access_as_stringlist")
    set_list_feature_access("gfmap_remove_gf", stringlist_access)
  end

  ###
  protected

  #####
  # access_as_stringlist
  #
  # assumed format of a list feature in the config file:
  #
  #    lhs = rhs1 rhs2 ... rhsN
  #
  # val_list holds one string tuple [rhs1,...,rhsN] per entry
  #
  # returns: array of strings; each tuple is joined by single spaces
  # into "rhs1 rhs2 ... rhsN"
  def access_as_stringlist(val_list) # array:array:string
    joined = val_list.map { |rhs_tuple| rhs_tuple.join(" ") }
    return joined
  end
end
56
+
57
+
58
+
@@ -0,0 +1,130 @@
1
+ # Failed Parses
2
+ #
3
+ # SP May 05
4
+ #
5
+ # Administration of information about failed parses;
6
+ # - sentence ID
7
+ # - frame
8
+ # - missed FE markables
9
+ #
10
+ # this class is pretty much a glorified hash table with methods to
11
+ # - read FailedParses from a file and to write them to a file
12
+ # - access info in a frame-specific way
13
+
14
class FailedParses

  ###
  # initialize
  #
  # starts with an empty list of failed parses;
  # each entry is an array [sent_id, frame, target, target_pos, fe_list]
  def initialize()
    @failed_parses = Array.new
  end

  ###
  # register
  #
  # register new failed parse by specifying
  # - its sentence id (any object)
  # - its frame (String)
  # - its target and target POS
  # - its FE list (String Array)
  #
  # A sentence id may be registered more than once:
  # every call adds a new entry, no duplicate check is made.
  def register(sent_id,    # object
               frame,      # string: frame name
               target,     # string?
               target_pos, # string: target POS
               fe_list)    # array:string
    @failed_parses << [sent_id, frame, target, target_pos, fe_list]
  end

  ###
  # make_split
  #
  # produce a random "split" of the failed parses into a train and a test section
  # parameter: train_percentage, Integer between 0 and 100
  #
  # returns an Array with two FailedParses objects, the first for the
  # train data, the second for the test data
  #
  # raises RuntimeError if train_percentage is not an Integer in 0..100
  def make_split(train_percentage)
    unless train_percentage.is_a?(Integer) && train_percentage >= 0 && train_percentage <= 100
      raise "Need Integer between 0 and 100 as training percentage."
    end
    train_failed = FailedParses.new()
    test_failed = FailedParses.new()
    @failed_parses.each { |sent_id, frame, target, target_pos, fe_list|
      # rand(100) is uniform over 0..99, so exactly train_percentage
      # of these values are smaller than train_percentage:
      # 0 sends everything to test, 100 sends everything to train
      if rand(100) >= train_percentage
        test_failed.register(sent_id, frame, target, target_pos, fe_list)
      else
        train_failed.register(sent_id, frame, target, target_pos, fe_list)
      end
    }
    return [train_failed, test_failed]
  end

  ###
  # Access information
  #
  # failed_sent: number of failed sentences
  # failed_fes: Hash that maps FE names [String] onto numbers of failed FEs [Int]
  #
  # optional parameters: frame, target, target_pos : if not specified or nil, marginal
  # frequencies are counted (sum over all values)

  def failed_sent(frame_spec=nil,target_spec=nil,target_pos_spec=nil)
    counter = 0
    each_matching(frame_spec, target_spec, target_pos_spec) { |fe_list|
      counter += 1
    }
    return counter
  end

  def failed_fes(frame_spec=nil,target_spec=nil,target_pos_spec=nil)
    # missing FE labels count as 0
    fe_hash = Hash.new(0)
    each_matching(frame_spec, target_spec, target_pos_spec) { |fe_list|
      fe_list.each { |fe_label|
        fe_hash[fe_label] += 1
      }
    }
    return fe_hash
  end

  ###
  # Marshalling:
  #
  # save - save info about failed parses to file
  # load - load info about failed parses from file
  #
  # Files are opened in binary mode, as Marshal data is binary,
  # and are closed even if dumping or loading fails.

  def save(filename)
    File.open(filename, "wb") { |io_obj|
      Marshal.dump(@failed_parses, io_obj)
    }
  end

  # If the file cannot be read, a warning is printed and
  # the current list of failed parses is left as it is.
  #
  # NOTE(review): Marshal.load must only be used on trusted files,
  # since it can instantiate arbitrary objects.
  def load(filename)
    begin
      File.open(filename, "rb") { |io_obj|
        @failed_parses = Marshal.load(io_obj)
      }
    rescue
      $stderr.puts "WARNING: couldn't read failed parses file #{filename}."
      $stderr.puts "I'll assume that there are no failed parses."
    end
  end

  ###
  private

  ###
  # yields the FE list of every registered failed parse that agrees with
  # all given specifications; a nil specification matches everything
  def each_matching(frame_spec, target_spec, target_pos_spec)
    @failed_parses.each { |sent_id, frame, target, target_pos, fe_list|
      next unless frame_spec.nil? || frame_spec == frame
      next unless target_spec.nil? || target_spec == target
      next unless target_pos_spec.nil? || target_pos_spec == target_pos
      yield fe_list
    }
  end

end
@@ -0,0 +1,242 @@
1
+ require 'common/ruby_class_extensions'
2
+
3
class RosyFeatureInfo
  ###
  # class variable:
  # list of all known extractors (extractor classes)
  # add to it using add_feature()
  @@extractors = Array.new

  # boolean. set to true after warning messages have been given once
  @@warned = false

  ###
  # add interface/interpreter
  def RosyFeatureInfo.add_feature(class_name) # Class object
    @@extractors << class_name
  end

  ###
  # exp: experiment file object; its list feature "feature" is read as
  #      pairs [feature group designator(string), options(array:string)]
  #
  # Builds the list of extractors in use and the list of their features.
  # Warnings are printed only for the first RosyFeatureInfo object made.
  # Exits the program if a feature has more than one step option.
  def initialize(exp)
    @exp = exp

    ##
    # list of extractors that are
    # either required by the user
    # or needed by the system.
    # Each entry is a hash "extractor" -> extractor class, "step" -> string or nil
    @current_extractors = Array.new

    add_user_extractors(exp)
    add_system_extractors()

    # make sure that all extractors are computable in the current model
    # (i.e. check dependencies)
    remove_uncomputable_extractors()

    @features = collect_features()

    # do not print warnings again if another RosyFeatureInfo object is made
    @@warned = true
  end

  ###
  # get_column_formats
  #
  # returns a list of pairs [feature_name(string), sql_column_format(string)]:
  # all features to be computed, with their SQL column formats.
  # If a phase is given, only features of that phase are returned.
  def get_column_formats(phase = nil) # string: phase 1 or phase 2
    return features_of_phase(phase).map { |feature_descr|
      [feature_descr["feature_name"], feature_descr["sql_type"]]
    }
  end

  ###
  # get_column_names
  #
  # returns a list of feature names (strings)
  # all features to be computed.
  # If a phase is given, only features of that phase are returned.
  def get_column_names(phase = nil) # string: phase 1 or phase 2
    return features_of_phase(phase).map { |feature_descr|
      feature_descr["feature_name"]
    }
  end

  ###
  # get_index_columns
  #
  # returns a list of feature (column) names as Strings
  # consisting of all features whose extractor lists "index" in its info()
  def get_index_columns()
    return @features.select { |feature_descr|
      feature_descr["is_index"]
    }.map { |feature_descr|
      feature_descr["feature_name"]
    }
  end

  ###
  # get_model_features
  #
  # returns a list of feature (column) names as strings
  # consisting of all the features to be used for the modeling
  #
  # step: argrec, arglab, onestep
  def get_model_features(step)
    return @features.select { |feature_descr|
      # features for the current step
      # feature_descr["step"] is argrec, arglab, onestep, dontuse, or nil
      # nil matches all steps
      # 'dontuse' matches no step, so these features will never be returned here
      feature_descr["step"].nil? or
        feature_descr["step"] == step
    }.reject { |feature_descr|
      # that are not admin features or the gold label
      ["admin", "gold"].include? feature_descr["type"]
    }.map { |feature_descr|
      # use just the names of the features
      feature_descr["feature_name"]
    }
  end

  ###
  # get_extractor_objects
  #
  # returns two lists of feature extractor objects,
  # covering all features of the given phase:
  # the first list contains RosyFeatureExtractor extractors,
  # the second list contains the others.
  #
  # raises RuntimeError if phase is neither "phase 1" nor "phase 2"
  def get_extractor_objects(phase,             # string: "phase 1" or "phase 2"
                            interpreter_class) # SynInterpreter class
    unless ["phase 1", "phase 2"].include? phase
      # interpolate rather than concatenate, so that a non-string phase
      # (e.g. nil) still produces this error rather than a TypeError
      raise "Shouldn't be here: #{phase}"
    end

    return @current_extractors.select { |descr|
      # select extractors of the right phase
      descr["extractor"].phase() == phase
    }.map { |descr|
      # make objects from extractor classes
      descr["extractor"].new(@exp, interpreter_class)
    }.distribute { |extractor_obj|
      # distribute extractors in two bins:
      # first, rosy extractors
      # second, others
      # NOTE(review): distribute() is not a Ruby core method; presumably it
      # comes from common/ruby_class_extensions -- confirm
      extractor_obj.class.info().include? "rosy"
    }
  end

  ###
  private

  ###
  # user-chosen extractors: add an entry to @current_extractors for each
  # feature named in the experiment file for which an extractor is known
  def add_user_extractors(exp)
    exp.get_lf("feature").each { |extractor_name, options|
      extractor = @@extractors.detect { |e| e.designator() == extractor_name }
      unless extractor
        # no extractor found matching the given designator
        unless @@warned
          $stderr.puts "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
        end
        next
      end

      @current_extractors << {
        "extractor" => extractor,
        "step" => read_step_option(extractor_name, options)
      }
    }
  end

  ###
  # read and check the options given for one feature
  #
  # returns: the step option (dontuse, argrec, arglab, onestep),
  # or nil if none was given. Unknown options are skipped with a warning;
  # more than one step option ends the program.
  def read_step_option(extractor_name, options)
    step = nil

    options.each { |option|
      case option
      when "dontuse", "argrec", "arglab", "onestep"
        if step
          # step has already been set
          $stderr.puts "ERROR in feature #{extractor_name}: Please set only one of the options dontuse, argrec, arglab, onestep"
          exit 1
        end

        step = option
      else
        unless @@warned
          $stderr.puts "Warning: Unknown option for feature #{extractor_name}: #{option}. Skipping"
        end
      end
    }

    return step
  end

  ###
  # extractors needed by the system: admin features and gold feature.
  # If the user has already listed such an extractor, that entry is removed,
  # and the extractor is added with step "dontuse".
  def add_system_extractors()
    @@extractors.select { |e|
      ["admin", "gold"].include? e.feature_type()
    }.each { |extractor|
      @current_extractors.delete_if { |descr|
        descr["extractor"].designator() == extractor.designator()
      }

      @current_extractors << {
        "extractor" => extractor,
        "step" => "dontuse"
      }
    }
  end

  ###
  # returns: designators of all current extractors that are available
  # in the given step, i.e. those without a step restriction plus those
  # restricted to exactly this step (step nil: only the unrestricted ones)
  def designators_for_step(step)
    return @current_extractors.select { |descr|
      descr["step"].nil? or descr["step"] == step
    }.map { |descr| descr["extractor"].designator() }
  end

  ###
  # remove all extractors from @current_extractors whose is_computable()
  # is not true, given the designators of the extractors available in their step.
  # "dontuse" extractors (admin features, or user features
  # not to be used this time) are always kept.
  def remove_uncomputable_extractors()
    # determine available designators per step before anything is removed
    available = Hash.new
    [nil, "argrec", "arglab", "onestep"].each { |step|
      available[step] = designators_for_step(step)
    }

    @current_extractors.delete_if { |descr|
      step = descr["step"]
      computable =
        if step == "dontuse"
          true
        elsif available.has_key?(step)
          descr["extractor"].is_computable(available[step])
        end

      unless computable or @@warned
        $stderr.puts "Warning: Feature extractor #{descr["extractor"].designator()} cannot be computed: skipping."
      end

      # delete exactly the extractors that are not computable
      not computable
    }
  end

  ###
  # make list of all features as hashes
  # "feature_name" -> string,
  # "sql_type" -> string,
  # "is_index" -> boolean,
  # "step" -> string: argrec, arglab, onestep, dontuse, or nil
  # "type" -> string
  # "phase" -> string: phase 1 or phase 2
  def collect_features()
    features = Array.new
    @current_extractors.each { |descr|
      extractor = descr["extractor"]
      extractor.feature_names.each { |feature_name|
        features << {
          "feature_name" => feature_name,
          "sql_type" => extractor.sql_type(),
          "is_index" => extractor.info().include?("index"),
          "step" => descr["step"],
          "type" => extractor.feature_type(),
          "phase" => extractor.phase()
        }
      }
    }
    return features
  end

  ###
  # returns: the feature hashes of the given phase, all of them if phase is nil
  def features_of_phase(phase)
    return @features.select { |feature_descr|
      phase.nil? or feature_descr["phase"] == phase
    }
  end
end