frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,176 @@
1
+ ###########################
2
+ # DBWrapper:
3
+ # abstract class wrapping database interfaces,
4
+ # so we can have both an interface to an SQL server
5
+ # and an interface to SQLite in Shalmaneser
6
class DBWrapper
  # name of the table this object is bound to; nil unless the object
  # was produced by make_temp_table()
  attr_reader :table_name

  ###
  # exp: experiment file object (RosyConfigData), kept in @exp
  #
  # @database starts out as nil: subclasses are expected to replace it
  # with an open database handle in their own initialization.
  def initialize(exp)
    @exp = exp
    @database = nil
    @table_name = nil
  end

  ###
  # close DB access by calling close() on the database handle
  def close
    @database.close
  end

  ####
  # run a query that yields a result
  #
  # abstract: subclasses return a DBResult object
  def query(query)
    raise "Overwrite me"
  end

  ####
  # run a query for which no result value is needed
  #
  # abstract
  def query_noretv(query)
    raise "Overwrite me"
  end

  ###
  # list all tables in the database
  #
  # abstract: subclasses return a list of strings
  def list_tables
    raise "Overwrite me"
  end

  ###
  # make a table
  #
  # table_name:         string
  # column_formats:     array of pairs [column_name, column_format] (strings)
  # index_column_names: array of column names (strings)
  # indexname:          string, name of the automatically created index column
  #
  # abstract: returns nothing in subclasses
  def create_table(table_name, column_formats, index_column_names, indexname)
    raise "overwrite me"
  end

  ###
  # remove a table by issuing a DROP TABLE statement via query_noretv()
  def drop_table(table_name)
    statement = "DROP TABLE " + table_name
    query_noretv(statement)
  end

  ###
  # list all column names of a table
  #
  # derived from list_column_formats(), keeping only the name of each
  # [column name, column format] pair
  #
  # returns: array of strings
  def list_column_names(table_name)
    formats = list_column_formats(table_name)
    formats.map { |name, _format| name }
  end

  #####
  # list column names and column types of a table
  #
  # abstract: subclasses return a list of pairs [column name, column format]
  def list_column_formats(table_name)
    raise "Overwrite me"
  end

  ####
  # determine the number of rows in a table
  #
  # abstract: subclasses return an integer
  def num_rows(table_name)
    raise "Overwrite me"
  end

  ####
  # make a temporary table: basically just make a table
  #
  # column_formats:     array of pairs [column_name, column_format] (strings)
  # index_column_names: array of column names (strings)
  # indexname:          string, name of the autoincrement primary index
  #
  # returns: a clone of this object (so of the current subclass) whose
  # @table_name is set to the name of the new temporary table.
  # clone() copies instance variable references, so the returned object
  # refers to the same @database object as this one.
  def make_temp_table(column_formats, index_column_names, indexname)
    copy = clone
    copy.initialize_temp_table(column_formats, index_column_names, indexname)
    copy
  end

  ###
  # drop the temporary table this object was made for
  #
  # raises a RuntimeError if this object has no @table_name
  def drop_temp_table
    raise "can only do drop_temp_table() for objects that have a temp table" unless @table_name

    drop_table(@table_name)
  end

  ##############################
  protected

  ###
  # choose a table name ("t" followed by the digits of the current
  # time as a float), remember it in @table_name, and create the table
  def initialize_temp_table(column_formats, index_column_names, indexname)
    timestamp_digits = Time.new.to_f.to_s.delete(".")
    @table_name = "t" + timestamp_digits
    create_table(@table_name, column_formats, index_column_names, indexname)
  end
end
124
+
125
+
126
+
127
+
128
+ ######################################################################
129
+ # DBResult:
130
+ # abstract class keeping query results
131
+ #
132
+ # instantiate for the DB package used
133
class DBResult
  ###
  # value: the query result object of the DB package in use;
  # it is kept in @result and all default methods delegate to it
  def initialize(value)
    @result = value
  end

  # column names: NO DEFAULT, must be provided by the subclass
  def list_column_names
    raise "Overwrite me"
  end

  # number of rows, as reported by the wrapped result object
  def num_rows
    @result.num_rows
  end

  # yields each row of the wrapped result to the given block
  # (as an array of values, according to the original documentation)
  def each
    @result.each do |row|
      yield row
    end
  end

  # yields each row of the wrapped result as a hash:
  # column name => column value
  def each_hash
    @result.each_hash do |row_hash|
      yield row_hash
    end
  end

  # reset object, such that each() can be run again
  # DEFAULT DOES NOTHING, PLEASE OVERWRITE
  def reset
    nil
  end

  # free the wrapped result object
  def free
    @result.free
  end

  # returns the next row as delivered by the wrapped result object
  # (an array of column contents, according to the original documentation)
  def fetch_row
    @result.fetch_row
  end
end
176
+
@@ -0,0 +1,58 @@
1
+ # ExternalConfigData
2
+ # Katrin Erk January 2006
3
+ #
4
+ # All scripts that compute additional external knowledge sources
5
+ # for Fred and Rosy:
6
+ # access to configuration and experiment description file
7
+
8
+ require 'common/ConfigData'
9
+
10
+ ##############################
11
+ # Class ExternalConfigData
12
+ #
13
+ # inherits from ConfigData,
14
+ # sets variable names appropriate to tasks of external knowledge sources
15
+
16
class ExternalConfigData < ConfigData
  ###
  # filename: name of the config file, handed on to ConfigData
  #
  # Declares the features known to this kind of config file together
  # with their value types, and no variables.
  def initialize(filename)
    # features: name => type
    feature_types = {
      "directory" => "string",

      "experiment_id" => "string",

      "gfmap_restrict_to_downpath" => "bool",
      "gfmap_restrict_pathlen" => "integer",
      "gfmap_remove_gf" => "list"
    }
    # variables: none
    variables = []

    # initialize config data object
    super(filename, feature_types, variables)

    # set access function for the list feature
    stringlist_reader = method("access_as_stringlist")
    set_list_feature_access("gfmap_remove_gf", stringlist_reader)
  end

  ###
  protected

  #####
  # access_as_stringlist
  #
  # assumed format of the config file entries:
  #
  # lhs = rhs1 rhs2 ... rhsN
  #
  # val_list holds one string tuple [rhs1,...,rhsN] per entry
  # (array:array:string).
  #
  # Each tuple is joined by single spaces; the result is an array with
  # one string "rhs1 rhs2 ... rhsN" per tuple, in the original order.
  def access_as_stringlist(val_list)
    val_list.map do |rhs_tuple|
      rhs_tuple.join(" ")
    end
  end
end
56
+
57
+
58
+
@@ -0,0 +1,130 @@
1
+ # Failed Parses
2
+ #
3
+ # SP May 05
4
+ #
5
+ # Administration of information about failed parses;
6
+ # - sentence ID
7
+ # - frame
8
+ # - missed FE markables
9
+ #
10
+ # this class is pretty much a glorified hash table with methods to
11
+ # - read FailedParses from a file and to write them to a file
12
+ # - access info in a frame-specific way
13
+
14
class FailedParses

  ###
  # initialize
  #
  # start with an empty list of failed parses;
  # each entry is an array [sent_id, frame, target, target_pos, fe_list]
  def initialize
    @failed_parses = []
  end

  ###
  # register
  #
  # register a new failed parse by specifying
  # - sent_id:    its sentence id (any object)
  # - frame:      its frame name (String)
  # - target:     its target
  # - target_pos: the POS of the target (String)
  # - fe_list:    the list of its FE names (String Array)
  #
  # A sentence id may be registered more than once; every registration
  # is kept as a separate entry.
  def register(sent_id, frame, target, target_pos, fe_list)
    @failed_parses << [sent_id, frame, target, target_pos, fe_list]
  end

  ###
  # make_split
  #
  # produce a random "split" of the failed parses into a train and a
  # test section
  #
  # parameter: train_percentage, Integer between 0 and 100.
  # Each entry independently lands in the train section with a
  # probability of train_percentage percent, so 0 puts everything into
  # the test section and 100 puts everything into the train section.
  #
  # returns an Array with two FailedParses objects, the first for the
  # train data, the second for the test data
  #
  # raises a RuntimeError if train_percentage is not an Integer
  # between 0 and 100
  def make_split(train_percentage)
    unless train_percentage.is_a?(Integer) && train_percentage.between?(0, 100)
      raise "Need Integer between 0 and 100 as training percentage."
    end

    train_failed = FailedParses.new
    test_failed = FailedParses.new
    @failed_parses.each do |sent_id, frame, target, target_pos, fe_list|
      # rand(100) is uniform over 0..99, so exactly train_percentage
      # of the 100 possible values are smaller than train_percentage
      section = rand(100) < train_percentage ? train_failed : test_failed
      section.register(sent_id, frame, target, target_pos, fe_list)
    end
    [train_failed, test_failed]
  end

  ###
  # Access information
  #
  # failed_sent: number of failed sentences
  # failed_fes: Hash that maps FE names [String] onto numbers of failed FEs [Int]
  #
  # optional parameters: frame, target, target_pos: if not specified or
  # nil, marginal frequencies are counted (sum over all values)

  def failed_sent(frame_spec = nil, target_spec = nil, target_pos_spec = nil)
    counter = 0
    each_matching(frame_spec, target_spec, target_pos_spec) { |_fe_list| counter += 1 }
    counter
  end

  def failed_fes(frame_spec = nil, target_spec = nil, target_pos_spec = nil)
    fe_hash = Hash.new(0)
    each_matching(frame_spec, target_spec, target_pos_spec) do |fe_list|
      fe_list.each { |fe_label| fe_hash[fe_label] += 1 }
    end
    fe_hash
  end

  ###
  # Marshalling:
  #
  # save - save info about failed parses to file
  # load - load info about failed parses from file

  # Write the list of failed parses to filename in Marshal format.
  # The block form of File.open closes the file even if dumping fails.
  def save(filename)
    File.open(filename, "wb") { |io_obj| Marshal.dump(@failed_parses, io_obj) }
    nil
  end

  # Replace the list of failed parses by the one stored in filename.
  # If the file cannot be read, a warning is printed to $stderr and the
  # current list is left as it is.
  #
  # NOTE(review): Marshal.load can instantiate arbitrary objects;
  # only use this on files written by save(), not on untrusted input.
  def load(filename)
    @failed_parses = File.open(filename, "rb") { |io_obj| Marshal.load(io_obj) }
    nil
  rescue StandardError
    $stderr.puts "WARNING: couldn't read failed parses file #{filename}."
    $stderr.puts "I'll assume that there are no failed parses."
    nil
  end

  ###
  private

  # Yield the FE list of every registered entry that agrees with all
  # specifications that are not nil.
  def each_matching(frame_spec, target_spec, target_pos_spec)
    @failed_parses.each do |_sent_id, frame, target, target_pos, fe_list|
      next unless frame_spec.nil? || frame_spec == frame
      next unless target_spec.nil? || target_spec == target
      next unless target_pos_spec.nil? || target_pos_spec == target_pos

      yield fe_list
    end
  end
end
@@ -0,0 +1,242 @@
1
+ require 'common/ruby_class_extensions'
2
+
3
class RosyFeatureInfo
  ###
  # class variable:
  # list of all known extractors (Class objects)
  # add to it using add_feature()
  @@extractors = Array.new

  # boolean. set to true after warning messages have been given once
  @@warned = false

  ###
  # add interface/interpreter
  #
  # class_name: Class object of a feature extractor. The methods used
  # on it in this class are designator(), feature_type(),
  # is_computable(), feature_names(), sql_type(), info(), phase()
  # and new().
  def RosyFeatureInfo.add_feature(class_name)
    @@extractors << class_name
  end

  ###
  # exp: experiment file object; exp.get_lf("feature") has to return
  # an array of pairs [feature group designator(string), options(array:string)]
  #
  # Determines the extractors to use (chosen by the user, plus admin and
  # gold extractors needed by the system), removes the ones that cannot
  # be computed, and makes the list of features they provide.
  #
  # Terminates the program (exit 1) if a feature has more than one of
  # the options dontuse, argrec, arglab, onestep.
  def initialize(exp)
    @exp = exp

    # list of hashes "extractor" -> extractor class,
    #                "step" -> string: argrec, arglab, onestep, dontuse, or nil
    @current_extractors = user_chosen_extractors(exp)
    add_system_extractors
    remove_uncomputable_extractors

    @features = make_feature_list

    # do not print warnings again if another RosyFeatureInfo object is made
    @@warned = true
  end

  ###
  # get_column_formats
  #
  # phase: string "phase 1" or "phase 2", or nil for all phases
  #
  # returns a list of pairs [feature_name(string), sql_column_format(string)]:
  # all features to be computed, with their SQL column formats
  def get_column_formats(phase = nil)
    features_of_phase(phase).map do |feature_descr|
      [feature_descr["feature_name"], feature_descr["sql_type"]]
    end
  end

  ###
  # get_column_names
  #
  # phase: string "phase 1" or "phase 2", or nil for all phases
  #
  # returns a list of feature names (strings):
  # all features to be computed
  def get_column_names(phase = nil)
    features_of_phase(phase).map { |feature_descr| feature_descr["feature_name"] }
  end

  ###
  # get_index_columns
  #
  # returns a list of feature (column) names as Strings
  # consisting of all features whose extractor lists "index" in its info()
  def get_index_columns
    @features.select { |feature_descr|
      feature_descr["is_index"]
    }.map { |feature_descr|
      feature_descr["feature_name"]
    }
  end

  ###
  # get_model_features
  #
  # returns a list of feature (column) names as strings
  # consisting of all the features to be used for the modeling
  #
  # step: argrec, arglab, onestep
  def get_model_features(step)
    @features.select { |feature_descr|
      # features for the current step
      # feature_descr["step"] is argrec, arglab, onestep, dontuse, or nil
      # nil matches all steps
      # 'dontuse' matches no step, so these features will never be returned here
      feature_descr["step"].nil? || feature_descr["step"] == step
    }.reject { |feature_descr|
      # admin features and the gold label are not used for modeling
      ["admin", "gold"].include? feature_descr["type"]
    }.map { |feature_descr|
      # use just the names of the features
      feature_descr["feature_name"]
    }
  end

  ###
  # get_extractor_objects
  #
  # phase:             string: "phase 1" or "phase 2"
  # interpreter_class: SynInterpreter class, handed on to the extractors
  #
  # returns two lists of feature extractor objects,
  # covering all features of the given phase:
  # the first list contains RosyFeatureExtractor extractors,
  # the second list contains the others.
  #
  # raises a RuntimeError for any other value of phase
  def get_extractor_objects(phase, interpreter_class)
    unless ["phase 1", "phase 2"].include? phase
      # interpolate rather than concatenate, so that a phase that is
      # not a string (e.g. nil) still produces this error
      raise "Shouldn't be here: #{phase}"
    end

    @current_extractors.select { |descr|
      # select extractors of the right phase
      descr["extractor"].phase() == phase
    }.map { |descr|
      # make objects from extractor classes
      descr["extractor"].new(@exp, interpreter_class)
    }.distribute { |extractor_obj|
      # distribute extractors in two bins:
      # first, rosy extractors
      # second, others
      extractor_obj.class.info().include? "rosy"
    }
  end

  ###
  private

  # print a warning to $stderr, but only while the first
  # RosyFeatureInfo object is being made
  def warn_once(message)
    $stderr.puts message unless @@warned
  end

  # user-chosen extractors: look up the extractor class for each
  # feature group named in the experiment file and read its options.
  # Designators without a matching extractor are skipped with a warning.
  #
  # returns a list of hashes "extractor" -> class, "step" -> string or nil
  def user_chosen_extractors(exp)
    chosen = []
    exp.get_lf("feature").each do |extractor_name, options|
      extractor = @@extractors.detect { |e| e.designator() == extractor_name }
      unless extractor
        # no extractor found matching the given designator
        warn_once "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
        next
      end

      chosen << {
        "extractor" => extractor,
        "step" => read_step_option(extractor_name, options)
      }
    end
    chosen
  end

  # read and check the options given for one feature group
  #
  # returns the step option (dontuse, argrec, arglab, onestep) or nil
  # if none was given; exits if more than one step option is found
  def read_step_option(extractor_name, options)
    step = nil
    options.each do |option|
      case option
      when "dontuse", "argrec", "arglab", "onestep"
        if step
          # step has already been set
          $stderr.puts "ERROR in feature #{extractor_name}: Please set only one of the options dontuse, argrec, arglab, onestep"
          exit 1
        end
        step = option
      else
        warn_once "Warning: Unknown option for feature #{extractor_name}: #{option}. Skipping"
      end
    end
    step
  end

  # extractors needed by the system: all known admin and gold
  # extractors are used with step "dontuse", replacing any entry the
  # user has made for the same designator
  def add_system_extractors
    @@extractors.select { |e|
      ["admin", "gold"].include? e.feature_type()
    }.each do |extractor|
      @current_extractors.delete_if { |descr| descr["extractor"].designator() == extractor.designator() }
      @current_extractors << {
        "extractor" => extractor,
        "step" => "dontuse"
      }
    end
  end

  # make sure that all extractors are computable in the current model
  # (i.e. check dependencies): each extractor is asked whether it is
  # computable given the designators of the extractors available in
  # its step; extractors that are not are removed with a warning
  def remove_uncomputable_extractors
    # step (nil = all steps) -> designators of the extractors usable
    # in that step, i.e. with that step or without step restriction.
    # Computed up front, before anything is removed.
    available = {}
    [nil, "argrec", "arglab", "onestep"].each do |step|
      available[step] = @current_extractors.select { |descr|
        descr["step"].nil? || descr["step"] == step
      }.map { |descr| descr["extractor"].designator() }
    end

    @current_extractors.delete_if do |descr|
      step = descr["step"]
      computable =
        if step == "dontuse"
          # either an admin feature or a user feature not to be used this time
          true
        elsif available.key?(step)
          descr["extractor"].is_computable(available[step])
        end

      unless computable
        warn_once "Warning: Feature extractor #{descr["extractor"].designator()} cannot be computed: skipping."
      end
      !computable
    end
  end

  # make list of all features as hashes
  # "feature_name" -> string,
  # "sql_type" -> string,
  # "is_index" -> boolean,
  # "step" -> string: argrec, arglab, onestep, dontuse, or nil
  # "type" -> string
  # "phase" -> string: phase 1 or phase 2
  def make_feature_list
    features = []
    @current_extractors.each do |descr|
      extractor = descr["extractor"]
      extractor.feature_names.each do |feature_name|
        features << {
          "feature_name" => feature_name,
          "sql_type" => extractor.sql_type(),
          "is_index" => extractor.info().include?("index"),
          "step" => descr["step"],
          "type" => extractor.feature_type(),
          "phase" => extractor.phase()
        }
      end
    end
    features
  end

  # feature descriptions of the given phase, or all of them if phase is nil
  def features_of_phase(phase)
    @features.select { |feature_descr| phase.nil? || feature_descr["phase"] == phase }
  end
end