svmlab 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,337 @@
1
+ require 'yaml'
2
+ #require 'forkoff'
3
+ require 'svmlab-config.rb'
4
+
5
+ # An SVMFeature object is initialized with either a string in YAML format
6
+ # or a file object pointing to a file with configuration given in YAML format.
7
+ #
8
+ # The SVMFeature class is the 'middleware' between software that calculates
9
+ # features and the SVMLab class. SVMFeature's features are :
10
+ # - Maintenance of a database of calculated features. This is done to
11
+ # minimize CPU time needed when developing an SVM experiment. The motto
12
+ # is that a calculation should be done only once and then never again.
13
+ # - Meta features can be defined that use other features to calculate
14
+ # a feature. In these cases, a structure similar to the entire configuration
15
+ # should be given for this feature.
16
+ #
17
+ # ---CONFIGURATION---
18
+ #
19
+ # SVMFeature is initiated by a configuration file in YAML format.
20
+ # Required fields in configuration are:
21
+ # (all paths can be given either absolute or relative to BaseDir)
22
+ #
23
+ # ---
24
+ # Features:
25
+ # - <targetfeature>
26
+ # - <feature1>
27
+ # - <feature2>
28
+ # ...
29
+ # BaseDir: <base directory>
30
+ # DataSet: <path of file giving the dataset>
31
+ # OR
32
+ # <prefix of a set of files giving the dataset>
33
+ # Example: If Dataset is not given, this gives name(s) of examples
34
+ # to use.
35
+ # Groups: <range (n1..n2) or (n1...n2) in example name to use for grouping>
36
+ # OR
37
+ # <file prefix relative to BaseDir for files giving groups>
38
+ # Methods: <path of .rb file holding all feature calculation methods>
39
+ # --------------------------------------------------------------
40
+ # targetfeature:
41
+ # HomeDir: <home directory>
42
+ # If HomeDir is not given, it will be set to BaseDir/featurename
43
+ # Method: <the method calculating this feature>
44
+ # If Method is not given, an attempt is made to acquire
45
+ # the feature from the database. If it fails, ERROR is reported.
46
+ # Dimensions: <the number of dimensions in this feature>
47
+ # If Dimensions is not given, it will be assumed to be 1
48
+ # and only the first value for each example will be used
49
+ # <Further specific configuration of this feature>
50
+ # feature1:
51
+ # HomeDir: <home directory>
52
+ # Method: <the method calculating this feature>
53
+ # Dimensions: <the number of dimensions in this feature>
54
+ # <Further specific configuration of this feature>
55
+ # feature2:
56
+ # ...
57
+ # featureN:
58
+ # If featureN configuration is not given, then all default settings
59
+ # will apply to this feature.
60
+ # ...
61
+ #
62
+ #
63
+
64
# Middleware between feature-calculation methods and the SVMLab class.
#
# The object itself is a Hash that #initialize fills with
#   example name => array of feature values
# (or the exception raised while collecting them). Calculated values are
# stored per feature in YAML files below that feature's HomeDir and indexed
# by HomeDir/filemap.yml, so a value is looked up rather than recalculated.
class SVMFeature < Hash

  # NOTE(review): @dim is never assigned in this class, so this reader
  # returns nil; #to_s reads the dimension count from @config.dim instead.
  attr_reader :dim

  # config is either a file object or a string object (YAML); it is handed
  # to SVMFeaturesConfig unchanged.
  #
  # For every example, all configured features are collected. If collecting
  # fails for an example, the exception is printed to STDERR and stored as
  # that example's value.
  def initialize(config)
    @config = SVMFeaturesConfig.new(config)
    @examples = loadExamples
    # Cache of database contents: feature name => { example => value string }
    @feature = {}
    #collected = @examples.forkoff :processes => 1 do |ex|
    #collected = @examples.forkmap 1 do |ex|
    collected = @examples.map do |ex|
      begin
        getExAllFeatures(ex)
      rescue
        STDERR.puts $!
        $!
      end
    end
    @examples.zip(collected).each { |name, values| self[name] = values }
  end

  # Returns a string giving the path to the file holding
  # the feature for the current settings.
  # Returns nil if the current settings do not match any file, if there is
  # no filemap.yml yet, or if filemap.yml is still empty.
  def getDataFile(feature)
    home = @config[feature]['HomeDir']
    filemapfname = home + 'filemap.yml'
    return nil unless File.file?(filemapfname)
    dfile = nil
    File.open(filemapfname, 'r') do |f|
      f.flock(File::LOCK_SH)
      begin
        # An empty file loads as false; treat it as an empty map.
        filemap = YAML.load(f) || []
        entry = filemap.find { |c| c['Config'] == @config[feature] }
        dfile = home + entry['FeatureFile'] if entry
      ensure
        f.flock(File::LOCK_UN)
      end
    end
    dfile
  end

  # Returns an array of floats giving the selected feature of the selected
  # example.
  # Raises RuntimeError if no value is available, if the stored value is an
  # ERROR string, or if the number of values differs from the feature's
  # configured Dimensions.
  def getExFeature(example, feature)
    x = getExFeatureInternal(example, feature)
    raise "ERROR: #{feature} is nil." unless x
    if x =~ /^ERROR/
      raise "ERROR (#{feature}): #{x.split[1..-1].join(' ')}"
    end
    values = x.split
    if values.size != @config[feature]['Dimensions']
      raise "ERROR (#{feature}): Number of dimensions (#{values.size})" +
            " for #{example} is not correct"
    end
    values.map { |v| Float(v) }
  end

  # Returns a string giving the selected feature of the selected example.
  # Lookup order: in-memory cache, feature database file, calculation.
  # A calculated result (including an 'ERROR:...' string) is appended to the
  # database file and reported on standard output.
  def getExFeatureInternal(example, feature)
    # 0. Check the hash
    cached = @feature[feature]
    return cached[example] if cached and cached[example]

    # 1. Look in database if value is available
    val = readStoredValue(example, feature)
    return val if val

    # 2. Calculate the value
    calhash = calculateFeature(example, feature)
    val = calhash[example]

    # 3. Update filemap.yml if needed, then add all outcome of the
    #    calculation to the database
    registerDataFile(feature) unless getDataFile(feature)
    appendToDataFile(getDataFile(feature), calhash)

    # Time#strftime gives the same ISO 8601 layout as DateTime#to_s without
    # depending on the 'date' library being loaded.
    stamp = Time.now.strftime('%Y-%m-%dT%H:%M:%S%:z')
    puts "#{stamp}, Calculated #{example}, #{feature} = #{val}"
    val
  end

  # Returns an array of floats containing all features for given example.
  # If PosClassFrom is configured, the first feature is replaced by the class
  # label [1] (first value >= PosClassFrom) or [-1].
  def getExAllFeatures(example)
    @config['Features'].inject([]) do |array, feature|
      f = getExFeature(example, feature)
      if array.empty? and (c = @config['PosClassFrom'])
        f.first >= c ? [1] : [-1]
      else
        array + f
      end
    end
  end

  # Returns a hash for all examples in the data set
  #   key   : example name
  #   value : array of values (float)
  # getAllFeatures is dependent on a data set being given in the configuration.
  # Examples whose features cannot be collected are left out on purpose.
  def getAllFeatures()
    @examples.inject({}) do |output, example|
      begin
        output[example] = getExAllFeatures(example)
      rescue
        # Deliberate best effort: the example is excluded from the output.
        #puts "Excluding #{example}"
      end
      output
    end
  end

  # Prints all examples and their features, one "name v1 v2 ..." line each.
  # Prints to a file if given, otherwise to standard output. Returns nil.
  def printFeatures(file = nil)
    features = getAllFeatures
    if file
      File.open(file, 'w') do |f|
        features.each { |k, v| f.puts k + ' ' + v.join(' ') }
      end
    else
      features.each { |k, v| puts k + ' ' + v.join(' ') }
    end
    nil
  end

  # Returns a string of the n examples with highest target feature (the
  # first value of each entry), one "name<TAB>value" line per example.
  # With the default n = 0 the slice is 0..-1, i.e. all examples are listed.
  def getTopRanking(n = 0)
    ranked = self.sort { |(_, v1), (_, v2)| v1[0] <=> v2[0] }.reverse
    ranked[0..(n - 1)].map { |name, values| "#{name}\t#{values.first}" }.join("\n")
  end

  # Returns the name of the feature column at the given index: the feature
  # name itself for one-dimensional features, "<feature>_<offset>" otherwise.
  # Returns '' if the index lies beyond the last column.
  def featname(index)
    offset = 0
    @config['Features'].each do |feature|
      dims = @config[feature]['Dimensions']
      if offset + dims > index
        return dims == 1 ? feature : "#{feature}_#{index - offset}"
      end
      offset += dims
    end
    ''
  end

  # Returns a header line with all column names followed by one
  # "key v1 v2 ..." line per example, sorted by key.
  def to_s
    header = (0...@config.dim).map { |i| self.featname(i) }.join(' ')
    rows = self.keys.sort.map { |key| "#{key} #{self[key].join(' ')}" }
    header + "\n" + rows.join("\n")
  end

  # --- [] ---
  # If indexing with a regular expression, a new SVMPrediction object is created
  # containing all elements with matching keys. Any other key is looked up as
  # in Hash.
  # NOTE(review): SVMPrediction is not defined in this file - confirm it is
  # loaded before a Regexp is used here.
  def [](expr)
    if expr.is_a? Regexp
      subs = SVMPrediction.new
      self.find_all { |(k, v)| k =~ expr }.each do |k, v|
        subs[k] = v
      end
      subs
    else
      super(expr)
    end
  end

  # Returns the array of example names taken from the configuration:
  # the whitespace-separated content of the DataSet file, or of all files in
  # the DataSet's directory whose names start with the DataSet's base name,
  # or the configured Example(s). Returns [] if none of these yields examples.
  def loadExamples
    if (dataset = @config['DataSet'])
      return File.read(dataset).split if File.file?(dataset)
      dir = File.dirname(dataset)
      prefix = File.basename(dataset)
      # Plain prefix comparison: the name is not interpreted as a regexp.
      Dir.entries(dir).
        select { |name| name.start_with?(prefix) && File.file?(File.join(dir, name)) }.
        inject([]) { |all, name| all + File.read(File.join(dir, name)).split }
    elsif (example = @config['Example'])
      example.is_a?(Array) ? example : [example]
    else
      []
    end
  end

  # Loads the feature's database file into the cache and returns the stored
  # string for the example, or nil if there is no file or no entry.
  def readStoredValue(example, feature)
    dfile = getDataFile(feature)
    # A data file can be registered in filemap.yml before it is first written.
    return nil unless dfile and File.file?(dfile)
    File.open(dfile, 'r') do |f|
      f.flock(File::LOCK_SH)
      begin
        @feature[feature] = YAML.load(f)
      ensure
        f.flock(File::LOCK_UN)
      end
    end
    @feature[feature] ? @feature[feature][example] : nil
  end

  # Runs the feature's configured Method and returns its result hash
  # (example => value string), which always has an entry for example.
  # ArgumentError is passed on, NameError is re-raised as "Method ... not
  # found.", any other error becomes an 'ERROR:...' value for the example.
  def calculateFeature(example, feature)
    method = @config[feature]['Method']
    calhash =
      begin
        raise ArgumentError, "No method given for calculation" if !method
        # SECURITY NOTE: the method name comes from the configuration and is
        # evaluated as Ruby code; only use configurations you trust.
        # The evaluated string refers to the locals `feature` and `example`.
        result = eval("#{method}(@config[feature],example)")
        if !result.is_a? Hash
          raise "Incorrect output format from #{method}"
        end
        result.each do |k, v|
          if !v.is_a? String
            raise "Incorrect output class (#{v.class})" +
                  "from #{method}"
          end
        end
        result
      rescue ArgumentError
        raise
      rescue NameError
        raise NameError, "Method #{method} not found."
      rescue
        # Keep the text after the last colon of the first message line.
        first_line = $!.to_s.split(/\n/).first.to_s
        { example => 'ERROR:' + first_line.split(/\:/).last.to_s }
      end
    calhash[example] ||= 'ERROR: No output from method'
    calhash
  end

  # Adds an entry for the feature's current configuration to
  # HomeDir/filemap.yml, creating the file if necessary.
  def registerDataFile(feature)
    filemapfname = @config[feature]['HomeDir'] + 'filemap.yml'
    # RDWR|CREAT creates the file without truncating one that already exists.
    File.open(filemapfname, File::RDWR | File::CREAT) do |f|
      f.flock(File::LOCK_EX)
      begin
        filemap = YAML.load(f) || []
        # Checked again under the exclusive lock so that the configuration
        # is not registered twice.
        unless filemap.find { |c| c['Config'] == @config[feature] }
          datafile = feature + filemap.size.to_s + '.yml'
          filemap.push({ 'FeatureFile' => datafile,
                         'Config' => @config[feature] })
          f.rewind
          YAML.dump(filemap, f)
          f.flush
          f.truncate(f.pos)
        end
      ensure
        f.flock(File::LOCK_UN)
      end
    end
  end

  # Appends every entry of calhash that the database file does not hold yet.
  def appendToDataFile(dfile, calhash)
    File.open(dfile, 'a+') do |f|
      f.flock(File::LOCK_EX)
      begin
        oldhash = YAML.load(f)
        f.puts '--- ' if !oldhash
        calhash.each do |k, v|
          next if oldhash and oldhash[k]
          # Strip the document header whatever its exact length is
          # ("--- \n" or "---\n" depending on the YAML engine).
          f.puts({ k => v }.to_yaml.sub(/\A---[^\n]*\n/, ''))
        end
      ensure
        f.flock(File::LOCK_UN)
      end
    end
  end

  private :loadExamples, :readStoredValue, :calculateFeature,
          :registerDataFile, :appendToDataFile

end
@@ -0,0 +1,98 @@
1
# Read-only view of the feature databases.
#
# The object is a Hash that #updateFromDatabases! fills with
#   example name => array with one stored value string per configured feature
# for every example that has a non-ERROR value in all feature databases.
class SVMFeature < Hash

  attr_reader :cfg

  # config is either an SVMFeaturesConfig, or a file object or a string
  # object that is handed to SVMFeaturesConfig.new.
  def initialize(config)
    @cfg = config.is_a?(SVMFeaturesConfig) ? config : SVMFeaturesConfig.new(config)
    updateFromDatabases!
  end

  # Returns the values stored for key joined by single spaces, or the string
  # "I don't have it!" if there is no entry for key.
  def [](key)
    values = super
    values ? values.join(' ') : "I don't have it!"
  end

  #def []=(key,val)
  #  STDERR.puts "You can't just PUT a value in SVMFeature..."
  #  nil
  #end

  # Reads the database of every configured feature and stores, for each
  # example present without error in all of them, the array of its values.
  def updateFromDatabases!
    @features = @cfg['Features'].map { |feature| readDatabase(feature) }
    return if @features.empty?
    # An example must be in every database, so the first smallest database
    # is enough to enumerate the candidates.
    sizes = @features.map { |h| h.size }
    candidates = @features[sizes.index(sizes.min)]
    candidates.each do |k, _|
      # to_s keeps the ERROR test working for non-string values.
      next if @features.find { |h| !h[k] or h[k].to_s =~ /ERROR/ }
      self[k] = @features.map { |h| h[k] }#.join(' ')
    end
  end

  # Returns the content of the feature's database file as loaded by YAML.
  # Returns {} if no file matches the current settings, if the file does not
  # exist yet, or if it is empty.
  def readDatabase(feature)
    file = getDataFileToRead(feature)
    return {} unless file and File.file?(file)
    File.open(file, 'r') do |f|
      f.flock(File::LOCK_SH)
      begin
        # An empty file loads as false.
        YAML.load(f) || {}
      ensure
        f.flock(File::LOCK_UN)
      end
    end
  end

  # Returns a string giving the path to the file holding
  # the feature for the current settings.
  # Returns nil if the current settings do not match any file.
  def getDataFileToRead(feature)
    homedir = @cfg[feature]['HomeDir']
    filemapfname = File.join(homedir, 'filemap.yml')
    return nil unless File.file?(filemapfname)
    dfile = nil
    File.open(filemapfname, 'r') do |f|
      f.flock(File::LOCK_SH)
      begin
        # An empty filemap loads as false; treat it as an empty map.
        filemap = YAML.load(f) || []
        entry = filemap.find { |c| c['Config'] == @cfg[feature] }
        dfile = File.join(homedir, entry['FeatureFile']) if entry
      ensure
        f.flock(File::LOCK_UN)
      end
    end
    dfile
  end

  # Returns the path of the data file for the feature's current settings.
  # If filemap.yml has no entry for these settings yet, a new entry
  # "<feature><number of entries>.yml" is added to it first (filemap.yml is
  # created if needed). The data file itself is not created here.
  def getDataFileToWrite(feature)
    existing = getDataFileToRead(feature)
    return existing if existing
    homedir = @cfg[feature]['HomeDir']
    filemapfname = File.join(homedir, 'filemap.yml')
    datafile = nil
    # RDWR|CREAT creates the file without truncating one that already exists.
    File.open(filemapfname, File::RDWR | File::CREAT) do |f|
      f.flock(File::LOCK_EX)
      begin
        filemap = YAML.load(f) || []
        # Checked again under the exclusive lock so that the configuration
        # is not registered twice.
        entry = filemap.find { |c| c['Config'] == @cfg[feature] }
        if entry
          datafile = entry['FeatureFile']
        else
          datafile = feature + filemap.size.to_s + '.yml'
          filemap.push({ 'FeatureFile' => datafile,
                         'Config' => @cfg[feature] })
          f.rewind
          YAML.dump(filemap, f)
          f.flush
          f.truncate(f.pos)
        end
      ensure
        f.flock(File::LOCK_UN)
      end
    end
    File.join(homedir, datafile)
  end

end
@@ -0,0 +1,215 @@
1
+ # SVMLabConfig
2
+ # is created by giving a text string as argument.
3
+
4
+ require 'yaml'
5
+
6
# Tells whether the receiver has a method called arg.
# Returns true if Kernel#method can look the name up, and false if that
# lookup raises any StandardError (unknown name, unusable argument).
def method?(arg)
  method(arg)
  true
rescue StandardError
  false
end
14
+
15
+ # ---SVMFeature configuration---
16
+ #
17
+ # SVMFeature is initiated by a configuration file in YAML format.
18
+ # Required fields in configuration are:
19
+ # (all paths can be given either absolute or relative to BaseDir)
20
+ #
21
+ # ---
22
+ # Features:
23
+ # - <targetfeature>
24
+ # - <feature1>
25
+ # - <feature2>
26
+ # ...
27
+ # BaseDir: <base directory>
28
+ # DataSet: <path of file giving the dataset>
29
+ # OR
30
+ # <prefix of a set of files giving the dataset>
31
+ # Example: If Dataset is not given, this gives name(s) of examples
32
+ # to use.
33
+ # Groups: <range (n1..n2) or (n1...n2) in example name to use for grouping>
34
+ # OR
35
+ # <file prefix relative to BaseDir for files giving groups>
36
+ # Methods: <path of .rb file holding all feature calculation methods>
37
+ # --------------------------------------------------------------
38
+ # targetfeature:
39
+ # HomeDir: <home directory>
40
+ # If HomeDir is not given, it will be set to BaseDir/featurename
41
+ # Method: <the method calculating this feature>
42
+ # If Method is not given, an attempt is made to acquire
43
+ # the feature from the database. If it fails, ERROR is reported.
44
+ # Dimensions: <the number of dimensions in this feature>
45
+ # If Dimensions is not given, it will be assumed to be 1
46
+ # and only the first value for each example will be used
47
+ # <Further specific configuration of this feature>
48
+ # feature1:
49
+ # HomeDir: <home directory>
50
+ # Method: <the method calculating this feature>
51
+ # Dimensions: <the number of dimensions in this feature>
52
+ # <Further specific configuration of this feature>
53
+ # feature2:
54
+ # ...
55
+ # featureN:
56
+ # If featureN configuration is not given, then all default settings
57
+ # will apply to this feature.
58
+ # ...
59
+ #
60
+ #
61
# Configuration of a feature set, held as a Hash of the YAML document's
# top-level entries. After initialization the entry of every name listed
# under 'Features' is an SVMFeatureConfig, relative paths are resolved
# against 'BaseDir', and #dim is the sum of all features' Dimensions.
class SVMFeaturesConfig < Hash

  # NOTE(review): @cfg is never assigned in this class, so #cfg returns nil.
  attr_reader :cfg, :dim

  #----------------SVMFeatureConfig--------------------------------------------
  # Configuration of a single feature, copied from the main configuration
  # and completed with defaults.
  class SVMFeatureConfig < Hash
    # maincfg is the enclosing SVMFeaturesConfig, feature the feature name.
    # Raises ArgumentError if the feature has no configuration, no Method,
    # or a Method that method? cannot find.
    # Creates the feature's HomeDir if it does not exist.
    def initialize(maincfg, feature)
      cfg = maincfg[feature]
      raise ArgumentError, "#{feature} configuration not given." unless cfg
      cfg.each { |k, v| self[k] = v }
      # Method
      name = self['Method']
      raise ArgumentError, "Method for #{feature} not given." unless name
      unless method? name
        raise ArgumentError, "Method #{name} for #{feature} not found."
      end
      # HomeDir: defaults to the feature name, always ends with '/'
      dir = self['HomeDir'] || feature.dup
      dir << '/' unless dir[-1..-1] == '/'
      self['HomeDir'] = maincfg.setupDirInfo(dir)
      Dir.mkdir(self['HomeDir']) unless File.directory?(self['HomeDir'])
      # Dimensions
      self['Dimensions'] ||= 1
      # Meta-feature: a feature with its own 'Features' gets the BaseDir too
      self['BaseDir'] = maincfg['BaseDir'] if self['Features']
    end

    # Returns the configuration in YAML format.
    def to_s
      self.to_yaml
    end
  end
  #----------------------------------------------------------------------------

  # arg is a YAML string or an IO object giving the configuration.
  # Raises ArgumentError if the document is not a mapping, if the DataSet
  # path does not exist, if Features is missing, or if a feature's
  # configuration is rejected. A LoadError from requiring a Methods file is
  # passed on.
  def initialize(arg)
    cfg = YAML.load(arg)
    unless cfg.is_a? Hash
      raise ArgumentError, "Configuration is not a YAML mapping."
    end
    cfg.each { |k, v| self[k] = v }
    # Set up dataset file
    if self['DataSet']
      self['DataSet'] = setupDirInfo(self['DataSet'])
      # File.exist? is used because File.exists? is gone in current Ruby.
      unless File.exist? self['DataSet']
        raise ArgumentError, "Cannot find dataset file #{self['DataSet']}."
      end
    end
    # Get feature methods
    sources = self['Methods']
    case sources
    when Array  then sources.each { |file| require setupDirInfo(file) }
    when String then require setupDirInfo(sources)
    end
    # Set up each feature's configuration
    raise ArgumentError, "Features not given." if !self['Features']
    @dim = 0
    self['Features'].each do |feature|
      self[feature] = SVMFeatureConfig.new(self, feature)
      @dim += self[feature]['Dimensions']
    end
  end

  # Makes a path absolute: a path that does not start with '/' is prefixed
  # with BaseDir and '/', and the first '//' of the result is reduced to '/'.
  # The given string is modified in place and also returned.
  # Raises ArgumentError if a relative path is given and BaseDir is not set.
  def setupDirInfo(path)
    unless path.start_with?('/')
      base = self['BaseDir']
      raise ArgumentError, "BaseDir not given." if !base
      # Built by concatenation, so characters such as '+' or '(' in the
      # path are taken literally.
      path.replace((base + '/' + path).sub(/\/\//, '/'))
    end
    path
  end

end
147
+
148
# Holds an SVMLab configuration loaded from YAML and gives access to its
# top-level entries and to a few settings below 'SVM'.
class SVMLabConfig

  # cfg is a string: a one-line string is taken as the path of a YAML file,
  # any other string as the YAML text itself.
  def initialize(cfg)
    @cfg = if cfg.split("\n").size == 1
             # Block form so that the file is closed after loading.
             File.open(cfg) { |f| YAML.load(f) }
           else
             YAML.load(cfg)
           end
  end

  # Returns the top-level configuration entry a.
  def [](a)
    @cfg[a]
  end

  # Sets every element of every entry below SVM/Scale to scale.
  # A String is duplicated for each element so that no two elements share
  # one object.
  def scales=(scale)
    @cfg['SVM']['Scale'].each do |_, elements|
      elements.size.times do |i|
        elements[i] = scale.is_a?(String) ? scale.dup : scale
      end
    end
  end

  # Sets SVM/C.
  def C=(arg)
    @cfg['SVM']['C'] = arg
  end

  # Sets SVM/e.
  def e=(arg)
    @cfg['SVM']['e'] = arg
  end

  # Returns the configuration in YAML format.
  def to_s
    @cfg.to_yaml
  end

  # Prints a greeting to standard output.
  def sayhi
    puts "Hello! I'm a SVMLabConfig object!"
  end

  #  SVMLabConfigStructure =
  #    '---
  #Feature:
  #  Features:
  #    - exA
  #  exA:
  #    Method: exB
  #    Dimensions: exC
  #  Methods: exD
  #  BaseDir: exE
  #  Groups: ExF
  #  DataSet: exG
  #SVM:
  #  PosClassFrom: Float
  #  PShalf: Integer
  #  C: Float
  #  ScaleMethod:
  #    Name: String
  #  Nhalf: Integer
  #  StepMethod: String
  #  e: Float
  #  Scale:
  #    exA:
  #      - exH
  #'
end