svmlab 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,170 @@
1
+ # ---------------------------------------------------------------------------------------
2
+ # Plot methods
3
+
4
+ require 'gnuplot'
5
+
6
+
7
# Renders a set of datasets through gnuplot.
#
# plotdata - Array of Gnuplot::DataSet objects; each element becomes one plot.
# file     - Output file name. A name ending in "png" or "ps" selects the
#            matching terminal and sends the plot to that file. For any other
#            name neither the terminal nor the output is set.
# title    - Plot title.
# xtitle   - X axis label.
# ytitle   - Y axis label.
#
# Returns whatever Gnuplot.open returns.
def genericplot(plotdata, file, title='Plot', xtitle='X', ytitle='Y')
  # Only the file name's ending decides the terminal. The alternation is
  # grouped per suffix so that a name merely containing "png" (for example
  # "pngdata.txt") no longer gets plot.output set without a terminal.
  terminal =
    case file
    when /png\z/
      # Remember to add the following line to your .bashrc file:
      #   export GDFONTPATH=/usr/share/fonts/truetype/ttf-bitstream-vera/
      "png size 800,600 font Vera 16"
    when /ps\z/
      "postscript color \"Helvetica\" 16"
    end

  Gnuplot.open do |gp| # This could be either a file or the gnuplot process that we pipe to
    Gnuplot::Plot.new(gp) do |plot|
      plot.title title
      plot.xlabel xtitle
      plot.ylabel ytitle
      plot.set "grid"
      if terminal
        plot.terminal terminal
        plot.output file
      end
      plot.data = plotdata
    end
  end
end
27
+
28
# --- predplot ---
# PredictionPlot: plots the true value on the X axis vs. the predicted value
# on the Y axis, together with the diagonal on which perfect predictions lie.
#
# predarr - One prediction set, or an Array of prediction sets. Each set is
#           enumerated as (example, value) pairs, where value['truth'] and
#           value['pred'] hold the numbers to plot.
# legends - One legend per prediction set. When empty, "Sample 1",
#           "Sample 2", ... are generated.
# title   - Plot title.
# err     - Optional name of an error measure (e.g. 'rmsd'). It is called on
#           each prediction set and the result is appended to its legend.
# file    - Output file name handed on to genericplot.
#
# Returns nil.
# Raises ArgumentError when the prediction sets contain no values at all.
def predplot(predarr, legends = [], title = 'SVM Prediction', err = nil, file = '')
  predarr = [predarr] unless predarr.is_a? Array
  dataarr = predarr.map do |predictions|
    predictions.inject([[], []]) do |data, (_example, val)|
      data[0] << val['truth']
      data[1] << val['pred']
      data
    end
  end

  # The diagonal spans the integer range enclosing every plotted value.
  values = dataarr.flat_map { |truths, preds| truths + preds }
  raise ArgumentError, 'predplot: no predictions to plot' if values.empty?
  from = values.min.floor
  to = values.max.ceil

  # Fiddling with legends
  if legends.size == 0
    legends = dataarr.each_index.map { |idx| "Sample #{idx + 1}" }
  end
  if err
    legends = legends.zip(predarr).map do |legend, pred|
      score =
        if err.to_s =~ /\A[A-Za-z_]\w*[?!]?\z/
          # A plain method name needs no eval.
          pred.public_send(err)
        else
          # SECURITY: anything else (e.g. a call with arguments) is still
          # evaluated as Ruby code; never pass untrusted text as err.
          eval("pred.#{err}")
        end
      legend + " (#{err} = ".upcase + "%.2f" % score + ')'
    end
  end

  # Setting plotdata
  point_set = lambda do |data, legend|
    Gnuplot::DataSet.new(data) do |ds|
      ds.using = '1:2'
      ds.with = "points"
      ds.title = legend
      ds.linewidth = 2
      ds.matrix = nil
    end
  end
  first_set = point_set.call(dataarr.first, legends.first)
  diagonal = Gnuplot::DataSet.new([[from, to], [from, to]]) do |ds|
    ds.using = '1:2'
    ds.with = "lines"
    ds.title = "Correct diagonal"
    ds.linewidth = 1
    ds.matrix = nil
  end
  other_sets = dataarr[1..-1].zip(legends[1..-1]).map do |data, legend|
    point_set.call(data, legend)
  end

  genericplot([first_set, diagonal] + other_sets, file, title,
              'Experimental value', 'Predicted value')
  nil
end
83
+
84
+ class SVMLab
85
+
86
# --- featurecorrelationplot ---
# Scatter plot of the target feature (index 0, Y axis) against the feature at
# index +feature+ (X axis). Values are mapped back with
# value / @scale[idx] + @center[idx] before plotting.
#
# NOTE(review): @scale and @center are not assigned anywhere in the code
# visible here - confirm they are set before this method is called.
#
# feature - Integer index into each example vector.
# file    - Output file name handed on to genericplot.
# title   - Plot title.
#
# Raises RuntimeError when +feature+ is outside an example's index range.
def featurecorrelationplot( feature, file = '', title = 'Feature correlation')
  xs = []
  ys = []
  @examples.each do |_name, vector|
    unless (0...vector.size) === feature
      raise "#{feature} outside feature range"
    end
    xs << vector[feature] / @scale[feature] + @center[feature]
    ys << vector[0] / @scale[0] + @center[0]
  end

  series = Gnuplot::DataSet.new([xs, ys]) do |ds|
    ds.using = '1:2'
    ds.with = "points"
    ds.title = "Feature #{feature} vs target feature"
    ds.linewidth = 1
    ds.matrix = nil
  end
  genericplot([series], file, title, "Feature #{feature}", "Target feature")
end
103
+
104
# --- predplotgroups ---
# Draws one prediction plot per example group.
#
# The group of an example is example_name[<Groups>], where <Groups> is the
# Ruby expression stored in @cfg['Feature']['Groups'].
#
# predarr - Array of prediction sets, each enumerable as (name, value) pairs.
# file    - Output file name. For a non-empty name the group is inserted
#           before the extension ("out.png" -> "out.<group>.png").
# legends - Legends handed on to predplot.
# title   - Title prefix; " on <group>" is appended for each plot.
# err     - Error measure name handed on to predplot.
#
# Returns the Array of group names.
def predplotgroups(predarr, file = '', legends = [], title = 'SVM Prediction', err = nil)
  substr = @cfg['Feature']['Groups']
  # SECURITY: substr comes from the configuration and is evaluated as Ruby
  # code; the configuration must be trusted. The block parameters keep the
  # names k and v because the evaluated expression may refer to them.
  groups = @examples.map { |k, v| k[(eval substr)] }.uniq
  # For each group
  groups.each do |group|
    predarr2 = predarr.map do |preds|
      members = preds.find_all { |k, v| k[(eval substr)] == group }
      members.each_with_object({}) { |(name, value), subset| subset[name] = value }
    end
    groupfile =
      if file.size == 0
        file
      else
        ext = File.extname(file)
        "#{file[0, file.size - ext.size]}.#{group}#{ext}"
      end
    # Arguments follow predplot's order: (predarr, legends, title, err, file).
    predplot(predarr2, legends, title + " on #{group}", err, groupfile)
  end
end
121
+
122
+
123
+
124
# --- onefeatureplot ---
# Plots the model's prediction as a curve over the range of feature 1,
# together with the examples themselves as points.
#
# For every example, element 1 of its vector is used as X and element 0 as Y.
# The curve is sampled at 1001 evenly spaced X positions between the smallest
# and largest example X; each sample is @model.predict([x]) mapped back with
# / @scale[0] + @center[0].
#
# NOTE(review): @model, @scale and @center are not assigned anywhere in the
# code visible here - confirm they are set before this method is called.
#
# file  - Output file name. A name ending in "png" or "ps" selects the
#         matching terminal and writes to that file.
# title - Plot title.
def onefeatureplot(file='', title = 'SVM Prediction')
  xt = []
  yt = []
  @examples.each do |_example, features|
    xt.push(features[1].to_f)
    yt.push(features[0].to_f)
  end

  # The sampling range does not change while sampling, so compute it once.
  lowest = xt.min
  span = xt.max - lowest
  xp = []
  yp = []
  (0..1000).each do |i|
    x = (i * span / 1000 + lowest).to_f
    xp.push(x)
    yp.push(@model.predict([x]) / @scale[0] + @center[0])
  end

  # Only the file name's ending decides the terminal; a name that merely
  # contains "png" no longer gets an output file without a terminal.
  terminal =
    case file
    when /png\z/ then "png size 800,600 small"
    when /ps\z/ then "postscript"
    end

  Gnuplot.open do |gp| # This could be either a file or the gnuplot process that we pipe to
    Gnuplot::Plot.new(gp) do |plot|
      plot.title title
      plot.xlabel "Truth"
      plot.ylabel "Prediction"
      plot.set "grid"
      if terminal
        plot.terminal terminal
        plot.output file
      end
      plot.data = [
        Gnuplot::DataSet.new([xp, yp]) { |ds|
          ds.using = '1:2'
          ds.with = "lines"
          ds.title = "SVM prediction"
          ds.linewidth = 1
          ds.matrix = nil
        },
        Gnuplot::DataSet.new([xt, yt]) { |ds|
          ds.using = '1:2'
          ds.with = "points"
          ds.title = "Correct prediction"
          ds.linewidth = 1
          ds.matrix = nil
        }
      ]
    end
  end
end
169
+
170
+ end
@@ -0,0 +1,365 @@
1
+ require 'SVM'
2
+ require 'yaml'
3
+ require 'rubygems'
4
+ require 'tempfile'
5
+ #require 'forkoff'
6
+
7
+ require 'svmfeature.rb'
8
+ require 'svmlab-optim.rb'
9
+ require 'svmlab-plot.rb'
10
+ require 'svmprediction.rb'
11
+ require 'svmlab-config.rb'
12
+
13
+ # An SVMLab object is created giving the configuration either as a file
14
+ # object or as a string. The configuration is in YAML format:
15
+ #
16
+ # ---
17
+ # Feature:
18
+ # <See SVMFeature class documentation>
19
+ # SVM:
20
+ # C: <parameter C>
21
+ # g: <RBF kernel's gamma>
22
+ # e: <epsilon for regression>
23
+ # Scale:
24
+ # <Feature1>:
25
+ # - <Scale1>
26
+ # - <Scale2>
27
+ # - ...
28
+ # - <ScaleN>
29
+ # <Feature2>: <Scale>
30
+ #
31
+ # The Scale setup has to match the features given in Feature configuration
32
+ # and each scale can be given as scalar or as array.
33
+ #
34
+
35
+ class SVMLab
36
+
37
+ attr_reader :cfg, :pslog, :features
38
+
39
# Builds the lab from a configuration.
#
# The configuration may be given as
#   1) an SVMLabConfig object (used as is), or
#   2) anything SVMLabConfig.new accepts (the class documentation names a
#      configuration File object or a configuration String).
#
# All examples are fetched from SVMFeature, then centered and scaled; the
# centered/scaled vectors are kept in @examples. The centering and scaling
# values themselves are stored in the @cfg['SVM'] part of the configuration.
# Finally the cross-validation groups are computed into @groups.
def initialize(cfg)
  @cfg = cfg.is_a?(SVMLabConfig) ? cfg : SVMLabConfig.new(cfg)
  @features = SVMFeature.new(@cfg['Feature'].to_yaml)
  @examples = @features.getAllFeatures
  @ndimensions = nil

  # Order matters: the scales are derived from the raw examples before the
  # examples are overwritten with their scaled versions.
  checkScales(@cfg)
  checkOptimization(@cfg)
  scaleExamples
  @groups = setGroups()
end
59
+
60
# --- setGroups ---
# Computes the cross-validation groups from @cfg['Feature']['Groups'].
#
# Return value:
#   groups hash:
#     key   : group name
#     value : array of example names
#
# Three cases are handled:
#   * Groups not set: every example forms its own group (leave-one-out
#     crossvalidation).
#   * "(n1..n2)" / "(n1...n2)" with single digits: the group name is that
#     substring of the example name.
#   * File prefix: every file in @cfg['Feature']['BaseDir'] named
#     <Groups><digits> lists one example name per line; the digits are the
#     group name.
#
# Returns nil if Groups is set but matches neither syntax.
def setGroups()
  groups = @cfg['Feature']['Groups']
  unless groups
    return @examples.each_with_object({}) { |(key, _value), hash| hash[key] = [key] }
  end

  # The range syntax is parsed rather than evaluated as Ruby code.
  range_syntax = /\A\((\d)(\.{2,3})(\d)\)\z/
  if range_syntax =~ groups
    first, dots, last = Regexp.last_match.captures
    span = Range.new(first.to_i, last.to_i, dots.size == 3)
    return @examples.map { |exname, _val| exname }.group_by { |exname| exname[span] }
  end

  # File prefix syntax. The prefix is escaped so that it is matched literally.
  basedir = @cfg['Feature']['BaseDir']
  group_file = /\A#{Regexp.escape(groups)}(\d+)\z/
  gfiles = Dir.entries(basedir).grep(group_file)
  return nil if gfiles.empty?

  gfiles.each_with_object({}) do |fname, hash|
    hash[fname[group_file, 1]] = File.read(basedir + fname).split(/\n/)
  end
end
89
+
90
# Stores the penalty factor C, converted with to_f, under @cfg['SVM']['C'].
def C=(arg)
  svm_cfg = @cfg['SVM']
  svm_cfg['C'] = arg.to_f
end
94
+
95
# Stores epsilon for Support Vector Regression, converted with to_f, under
# @cfg['SVM']['e'].
def e=(arg)
  svm_cfg = @cfg['SVM']
  svm_cfg['e'] = arg.to_f
end
99
+
100
# Stores gamma for the RBF kernel, converted with to_f, under
# @cfg['SVM']['g'].
def g=(arg)
  svm_cfg = @cfg['SVM']
  svm_cfg['g'] = arg.to_f
end
104
+
105
# Describes the examples of @examples that lie closest to +example+.
#
# Examples are ranked by the distance between their vectors with element 0
# left out. The first n+1 entries of that ranking are reported (indices
# 0..n). One String is built per entry: a header line with name and distance,
# then one line per configured feature giving the per-feature distance and
# the feature values from @features.getExFeature. The first configured
# feature is marked with '***', all others with '---'.
#
# The original author flagged this method as possibly broken: check the
# @features data if an erroneous example shows up.
#
# example - Name of the reference example.
# n       - Upper index of the ranking to report.
#
# Returns an Array of Strings.
def getNeighbors(example, n = 1)
  arr = @examples[example]
  ranked = @examples.sort_by do |entry|
    dist(arr[1...arr.size], entry[1][1...entry[1].size])
  end
  ranked[0..n].map do |entry|
    name = entry[0]
    vector = entry[1]
    header = name + ' : ' + "%.3f \n" % dist(arr[1...arr.size], vector[1...vector.size])
    # offset walks through the vector, one feature slice at a time
    offset = 0
    details = ''
    @cfg['Feature']['Features'].each do |feature|
      nvector = @features.getExFeature(name, feature)
      featdist = dist(arr[offset...offset + nvector.size],
                      @examples[name][offset...offset + nvector.size])
      offset += nvector.size
      marker = feature == @cfg['Feature']['Features'][0] ? ' *** ' : ' --- '
      details += marker + "(%.2f)" % featdist + marker +
                 feature + " : " +
                 nvector.join(' ') + "\n"
    end
    header + details
  end
end
128
+
129
# Euclidean distance between two equally sized numeric sequences.
#
# Raises RuntimeError when the sizes differ.
def dist(a,b)
  raise "Cannot calculate distance" unless a.size == b.size
  squares = 0
  a.each_with_index do |ai, idx|
    squares += (ai - b[idx]).abs**2
  end
  Math.sqrt(squares)
end
133
+
134
# Finds the examples whose prediction is furthest from the true value and
# reports each of them together with its closest neighbors.
#
# n           - Rank or Range of ranks to report; rank 1 is the largest
#               absolute error.
# n2          - Neighbor count handed on to getNeighbors.
# predictions - Predictions enumerable as (name, value) pairs with
#               value['pred'] and value['truth']. When not given, the result
#               of crossvalidate is used.
#
# Returns a String with one section per reported rank.
def getOutliers(n = (1..1), n2 = 3, predictions = nil)
  predictions ||= crossvalidate
  sortedpred = predictions.sort_by { |(_name, v)| -(v['pred'] - v['truth']).abs }
  # Integer replaces Fixnum, which Ruby 3.2 removed.
  ranks = n.is_a?(Integer) ? (n..n) : n
  ranks.map do |i|
    name, result = sortedpred[i - 1]
    "OUTLIER %d : \n" % i +
      name + " was predicted %.3f" % result['pred'] +
      " but the truth is %.3f :\n" % result['truth'] +
      getNeighbors(name, n2).join('')
  end.join("\n")
end
151
+
152
# Renders all example vectors as text: one line per example, the values
# separated by single spaces. Example names are not included.
#
# Returns a String.
def printExamples
  text = ''
  @examples.each do |_exname, vector|
    text += vector.map(&:to_s).join(' ') + "\n"
  end
  text
end
158
+
159
# An outer binding for the RubySVM predict function. It undoes the centering
# and scaling of the target feature so that a real prediction value is
# returned.
#
# examples - One example name (String) or a collection of names. A name not
#            present in @examples gets its features fetched through a fresh
#            SVMFeature and scaled with scaleExample.
# model    - Trained model; when omitted a model is trained with train.
#
# With @cfg['Feature']['PosClassFrom'] set, the rounded raw model output is
# returned. Otherwise the output is divided by the target feature's scale and
# shifted by its center.
#
# Any StandardError raised while predicting one example is not propagated:
# the exception object itself takes the place of that prediction.
#
# Returns the single prediction when exactly one example was given, otherwise
# the Array of predictions.
def predict(examples, model = nil)
  model = train unless model
  examples = [ examples ] if examples.is_a? String
  predictions = examples.map do |example|
    begin
      vector = @examples[example]
      unless vector
        # Deep copy, so the stored feature configuration stays untouched.
        fcfg = Marshal.load(Marshal.dump(@cfg['Feature']))
        fcfg.delete('DataSet')
        fcfg['Example'] = example
        vector = scaleExample(SVMFeature.new(fcfg.to_yaml).getExAllFeatures(example))
      end
      if @cfg['Feature']['PosClassFrom']
        model.predict(vector[1..-1]).round
      else
        model.predict(vector[1..-1]) /
          @cfg['SVM']['Scale'][ @cfg['Feature']['Features'][0] ][0] +
          @cfg['SVM']['Center'][ @cfg['Feature']['Features'][0] ][0]
      end
    rescue => error
      error
    end
  end
  predictions.size == 1 ? predictions[0] : predictions
end
190
+
191
# An outer binding for the RubySVM training function.
#
# examples - Names of the examples to train on. Names that are not present in
#            @examples are skipped. When omitted, all examples in @examples
#            are used.
#
# Element 0 of each example vector is passed as the label, the remaining
# elements as the feature vector. @ndimensions is set from the first vector
# seen, unless it is already set. STDOUT and STDERR are redirected to the
# null device while the model is built and restored afterwards.
#
# Parameters are taken from @cfg['SVM']: 'C', 'e' and 'g'; gamma defaults to
# 1.0 / @ndimensions. The parameter object is kept in @par.
#
# Returns the new SVM::Model.
def train(examples = nil)
  svm = SVM::Problem.new
  if examples
    examples.each do |exname|
      vector = @examples[exname]
      # Check for a missing example before touching the vector; the size was
      # previously read first, which failed on an unknown name.
      next unless vector
      @ndimensions ||= vector.size - 1
      svm.addExample(vector[0], vector[1..-1])
    end
  else
    @examples.each do |_name, vector|
      @ndimensions ||= vector.size - 1
      svm.addExample(vector[0], vector[1..-1])
    end
  end

  saved_err = STDERR.clone
  saved_out = STDOUT.clone
  begin
    File.open(File::NULL, 'w') do |devnull|
      STDERR.reopen(devnull)
      STDOUT.reopen(devnull)
      @par = SVM::Parameter.new
      # NOTE(review): 0 and 3 presumably are libsvm's C_SVC and EPSILON_SVR
      # type codes - confirm against the SVM binding.
      @par.svm_type = @cfg['Feature']['PosClassFrom'] ? 0 : 3
      if c = @cfg['SVM']['C'] then @par.C = c.to_f end
      # NOTE(review): the class documentation calls 'e' the epsilon for
      # regression; confirm that `eps` is the matching libsvm parameter.
      if e = @cfg['SVM']['e'] then @par.eps = e.to_f end
      @par.gamma = if g = @cfg['SVM']['g'] then g.to_f
                   else 1.0 / @ndimensions end
      SVM::Model.new(svm, @par)
    end
  ensure
    STDERR.reopen(saved_err)
    STDOUT.reopen(saved_out)
    # The duplicated descriptors are no longer needed once restored.
    saved_err.close
    saved_out.close
  end
end
221
+
222
# Crossvalidation on the grouping stored in @groups (made from "Groups" in
# cfg). Each group in turn is left out of training and its members are
# predicted with the model trained on all other groups. Members that have no
# entry in @examples are skipped.
#
# Return values:
# - Predictions (SVMPrediction) :
#    key   : example name
#    value : 'truth' => the true value
#            'pred'  => the predicted value
#
# The true value is the rounded element 0 of the example vector when
# @cfg['Feature']['PosClassFrom'] is set; otherwise element 0 divided by the
# target feature's scale plus its center.
#--
# Remaining issues:
# 2) No of parallel computations should be in cfg
#++
def crossvalidate()
  per_group = @groups.keys.map do |group|
    members = @groups[group]
    trainingex = []
    @groups.each do |trgroup, trmem|
      trainingex += trmem unless trgroup == group
    end
    model = train(trainingex)

    # Predict each member of the group left out of training
    pred = {}
    members.each do |predname|
      next unless @examples[predname]
      truth =
        if @cfg['Feature']['PosClassFrom']
          @examples[predname][0].round
        else
          @examples[predname][0] /
            @cfg['SVM']['Scale'][@cfg['Feature']['Features'][0]][0] +
            @cfg['SVM']['Center'][@cfg['Feature']['Features'][0]][0]
        end
      pred[predname] = { 'truth' => truth, 'pred' => predict(predname, model) }
    end
    pred
  end

  merged = SVMPrediction.new
  per_group.each do |predhash|
    predhash.each { |exname, phash| merged[exname] = phash }
  end
  merged
end
256
+
257
# Same as crossvalidate, but also writes the configuration and the result to
# a YAML file.
#
# The file holds the current time, the evaluation measures RMSD and CC, the
# configuration (@cfg) and the predictions.
#
# path - File to write. Missing parent directories are created, for relative
#        as well as absolute paths.
#
# Returns the predictions object produced by crossvalidate.
def publish_crossvalidate(path)
  # Loaded here so the method also works when nothing else has required them.
  require 'date'
  require 'fileutils'

  predictions = crossvalidate
  info = {
    'Time' => DateTime.now,
    'Evaluation' => {
      'RMSD' => predictions.rmsd,
      'CC' => predictions.cc },
    'Configuration' => @cfg,
    'Predictions' => predictions.predictions
  }
  # mkdir_p creates every missing level. The former hand-written loop skipped
  # the first component of a relative path and relied on File.exists?, which
  # Ruby 3.2 removed.
  FileUtils.mkdir_p(File.dirname(path))
  File.open(path, 'w') { |f| YAML.dump(info, f) }
  predictions
end
283
+
284
+ private
285
+
286
# Goes through each feature dimension and completes its centering and scaling
# instructions in cfg['SVM']['Center'] and cfg['SVM']['Scale'].
#
# For the target feature (the first one) with @cfg['Feature']['PosClassFrom']
# set, center 0 and scale 1 are forced. Otherwise:
#   * A missing center becomes the mean of that column over all examples.
#   * A missing scale becomes 1 / (largest absolute deviation from center).
#   * A String scale containing 'max', 'avg' or 'std' has that keyword
#     replaced by the computed factor:
#       max : 1 / largest absolute deviation
#       avg : number of examples / sum of absolute deviations
#       std : sqrt(number of examples / sum of squared deviations)
#     If the result is a plain number it is converted to Float; any other
#     result (e.g. "2*0.5") is kept as a String.
#   * Any other scale value is left untouched.
# A column without any deviation from its center gets the factor 1.
#
# All statistics are computed in floating point, so integer-valued features
# are no longer truncated, and only Strings are matched against patterns.
#
# cfg - The configuration; modified in place.
def checkScales(cfg)
  cfg['SVM']['Center'] ||= {}
  cfg['SVM']['Scale'] ||= {}
  plain_number = /\A\d+\.?\d*(?:e[-+]?\d+)?\z/i
  count = @examples.size
  dim0 = 0
  cfg['Feature']['Features'].each_with_index do |feature, index|
    dim = @cfg['Feature'][feature]['Dimensions'] || 1 # If Dimensions not given
    center = (cfg['SVM']['Center'][feature] ||= [])
    sc = (cfg['SVM']['Scale'][feature] ||= [])
    (0...dim).each do |i|
      if index == 0 && @cfg['Feature']['PosClassFrom']
        center[i] = 0
        sc[i] = 1
        next
      end

      # Check centering
      column = @examples.map { |_exname, vector| vector[dim0 + i] }
      center[i] ||= column.inject(0.0) { |s, v| s + v } / count

      # Check scaling
      spec = sc[i]
      keyword = %w[max avg std].find { |k| spec.include?(k) } if spec.is_a?(String)
      # An explicit scale without keyword is kept as given.
      next if spec && !keyword

      deviations = column.map { |v| (v - center[i]).abs }
      scale =
        case keyword
        when 'avg'
          abssum = deviations.inject(0.0) { |s, d| s + d }
          abssum != 0 ? count / abssum : 1
        when 'std'
          sqsum = deviations.inject(0.0) { |s, d| s + d**2 }
          sqsum != 0 ? Math.sqrt(count / sqsum) : 1
        else # 'max', or no scale given at all
          absmax = deviations.max || 0
          absmax != 0 ? 1.0 / absmax : 1
        end

      if keyword
        # sub (not sub!) so that a frozen configuration string also works.
        replaced = spec.sub(keyword, scale.to_s)
        sc[i] = replaced =~ plain_number ? replaced.to_f : replaced
      else
        sc[i] = scale
      end
    end
    dim0 += dim
  end
end
340
+
341
# Replaces every vector in @examples by its centered and scaled version, as
# computed by scaleExample from the center term and scaling factor that the
# configuration holds for each feature.
def scaleExamples()
  @examples.each do |entry|
    name, raw = entry
    @examples[name] = scaleExample(raw)
  end
end
349
+
350
# Centers and scales one example vector.
#
# The vector is laid out feature by feature in the order of
# @cfg['Feature']['Features']; a feature occupies 'Dimensions' consecutive
# slots (1 when not given). Slot j of a feature becomes
#   (value - Center[feature][j]) * Scale[feature][j].to_f
#
# vector - The raw example vector.
#
# Returns a new Array; +vector+ itself is not modified.
def scaleExample(vector)
  scaledvector = []
  offset = 0
  @cfg['Feature']['Features'].each do |feature|
    dim = @cfg['Feature'][feature]['Dimensions'] || 1
    (0...dim).each do |j|
      slot = offset + j
      shifted = vector[slot] - @cfg['SVM']['Center'][feature][j]
      scaledvector[slot] = shifted * @cfg['SVM']['Scale'][feature][j].to_f
    end
    offset += dim
  end
  scaledvector
end
363
+
364
+ end
365
+