svmlab 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,170 @@
1
+ # ---------------------------------------------------------------------------------------
2
+ # Plot methods
3
+
4
+ require 'gnuplot'
5
+
6
+
7
# Draws the given data sets in one gnuplot plot with a title, axis labels
# and a grid.
#
# plotdata - Array of Gnuplot::DataSet objects; each one is drawn as its own
#            series, so pass several to get more than one plot.
# file     - Output file name. A name ending in "png" or "ps" selects the
#            matching terminal and sends the plot to that file. For any other
#            name neither terminal nor output is set.
# title    - Plot title (default 'Plot').
# xtitle   - X axis label (default 'X').
# ytitle   - Y axis label (default 'Y').
#
# Returns whatever Gnuplot.open returns.
def genericplot(plotdata, file, title='Plot', xtitle='X', ytitle='Y')
  # Each suffix is anchored on its own. The earlier combined pattern
  # /(png)|(ps)$/ only anchored "ps", so a name merely containing "png"
  # was used as output file without any terminal being selected.
  terminal =
    case file
    when /png\z/
      # Remember to add the following line to your .bashrc file:
      #   export GDFONTPATH=/usr/share/fonts/truetype/ttf-bitstream-vera/
      "png size 800,600 font Vera 16"
    when /ps\z/
      "postscript color \"Helvetica\" 16"
    end

  Gnuplot.open do |gp| # This could be either a file or the gnuplot process that we pipe to
    Gnuplot::Plot.new(gp) do |plot|
      plot.title title
      plot.xlabel xtitle
      plot.ylabel ytitle
      plot.set "grid"
      if terminal
        plot.terminal terminal
        plot.output file
      end
      plot.data = plotdata
    end
  end
end
27
+
28
# --- predplot ---
# Prediction plot: true value on the X axis against predicted value on the
# Y axis, together with the diagonal on which perfect predictions would lie.
#
# predarr - One prediction set or an Array of them. Each set is iterated as
#           (example name, value) pairs where value['truth'] and
#           value['pred'] hold the numbers to plot.
# legends - One legend per prediction set; "Sample N" is used when empty.
# title   - Plot title.
# err     - Optional method name (as a String) that is called on each
#           prediction set; its value is appended to the legend.
# file    - Output file name, handed on to genericplot.
#
# Returns nil.
def predplot(predarr, legends = [], title = 'SVM Prediction', err = nil, file = '')
  predarr = [predarr] unless predarr.is_a? Array

  # One [truths, predictions] pair of parallel arrays per prediction set.
  dataarr = predarr.map do |predictions|
    truths = []
    guesses = []
    predictions.each do |_example, val|
      truths << val['truth']
      guesses << val['pred']
    end
    [truths, guesses]
  end

  # Integer bounds enclosing every plotted value; they span the diagonal.
  seed = dataarr[0][0][0]
  lowest = dataarr.inject(seed) { |m, (truths, guesses)| [m, truths.min, guesses.min].min }
  highest = dataarr.inject(seed) { |m, (truths, guesses)| [m, truths.max, guesses.max].max }
  from = lowest.floor
  to = highest.ceil

  # Fiddling with legends
  if legends.size == 0
    legends = (1..dataarr.size).map { |number| "Sample #{number}" }
  end
  if err
    legends = legends.zip(predarr).map do |legend, pred|
      # NOTE(review): err is spliced into code that is eval'ed, so it must
      # never come from untrusted input. The block variable has to stay
      # named `pred` because the evaluated string refers to it.
      legend + " (#{err} = ".upcase + "%.2f" % eval("pred.#{err}") + ')'
    end
  end

  # Setting plotdata: first sample, then the diagonal, then remaining samples.
  scatter = lambda do |data, legend|
    Gnuplot::DataSet.new(data) do |ds|
      ds.using = '1:2'
      ds.with = "points"
      ds.title = legend
      ds.linewidth = 2
      ds.matrix = nil
    end
  end
  diagonal = Gnuplot::DataSet.new([[from, to], [from, to]]) do |ds|
    ds.using = '1:2'
    ds.with = "lines"
    ds.title = "Correct diagonal"
    ds.linewidth = 1
    ds.matrix = nil
  end
  remaining = dataarr[1..-1].zip(legends[1..-1]).map do |(truths, guesses), legend|
    scatter.call([truths, guesses], legend)
  end
  plotdata = [scatter.call(dataarr.first, legends.first), diagonal] + remaining

  genericplot(plotdata, file, title, 'Experimental value', 'Predicted value')
  nil
end
83
+
84
+ class SVMLab
85
+
86
# --- featurecorrelationplot ---
# Plots the target feature (vector index 0) on the Y axis against the
# selected feature on the X axis, one point per example.
#
# feature - Index into each example vector; must lie within the vector.
# file    - Output file name, handed on to genericplot.
# title   - Plot title.
#
# Raises RuntimeError when feature is outside an example's index range.
# NOTE(review): reads @scale and @center, which are not assigned anywhere in
# the code visible here - confirm they are set before this is called.
def featurecorrelationplot( feature, file = '', title = 'Feature correlation')
  xs = []
  ys = []
  @examples.each do |_example, val|
    raise "#{feature} outside feature range" unless (0...val.size).include?(feature)
    # Undo scaling and centering to get back to original units.
    xs << val[feature] / @scale[feature] + @center[feature]
    ys << val[0] / @scale[0] + @center[0]
  end

  series = Gnuplot::DataSet.new([xs, ys]) do |ds|
    ds.using = '1:2'
    ds.with = "points"
    ds.title = "Feature #{feature} vs target feature"
    ds.linewidth = 1
    ds.matrix = nil
  end
  genericplot([series], file, title, "Feature #{feature}", "Target feature")
end
103
+
104
# --- predplotgroups ---
# Draws one prediction plot per example group by calling predplot with the
# predictions restricted to that group.
#
# predarr - Array of prediction sets (each iterated as name/value pairs).
# file    - Base output name. For a non-empty name the group is inserted
#           before the last dot-separated part, e.g. "out.png" -> "out.<group>.png".
# legends, title, err - Handed on to predplot; " on <group>" is appended to
#                       the title.
#
# NOTE(review): the 'Groups' configuration string is eval'ed and the result is
# used to index each example name, so it must be trusted input.
def predplotgroups(predarr, file = '', legends = [], title = 'SVM Prediction', err = nil)
  substr = @cfg['Feature']['Groups']
  # The eval stays inside each block so the evaluated code sees exactly the
  # same local variables (k, v, substr) as before.
  groups = @examples.map { |k, v| k[(eval substr)] }.uniq

  groups.each do |group|
    # Keep only the predictions whose example name belongs to this group.
    predarr2 = predarr.map do |preds|
      subset = {}
      preds.find_all { |k, v| k[(eval substr)] == group }.each do |pair|
        subset[pair[0]] = pair[1]
      end
      subset
    end

    groupfile =
      if file.size == 0
        file
      else
        parts = file.split(/\./)
        # The nested array is intentional: join flattens it while joining.
        [parts[0...-1], group, parts.last].join('.')
      end

    predplot(predarr2, groupfile, legends, title + " on #{group}", err)
  end
end
121
+
122
+
123
+
124
# --- onefeatureplot ---
# Plots the model's prediction as a curve over the range of the single input
# feature (vector index 1), together with the stored examples as points.
#
# file  - Output file name. A name ending in "png" or "ps" selects the
#         matching terminal and writes to that file; for any other name
#         neither terminal nor output is set.
# title - Plot title.
#
# The curve is sampled at 1001 evenly spaced positions between the smallest
# and the largest feature value.
# NOTE(review): reads @model, @scale and @center, none of which is assigned in
# the code visible here - confirm they are set before this is called.
def onefeatureplot(file='', title = 'SVM Prediction')
  # Separate arrays on purpose: x = y = [] would alias one array.
  xt = []
  yt = []
  @examples.each do |_example, features|
    xt.push(features[1].to_f)
    yt.push(features[0].to_f)
  end

  # The range of the feature does not change while sampling, so compute it once.
  xmin = xt.min
  xspan = xt.max - xmin
  xp = (0..1000).map { |i| (i * xspan / 1000 + xmin).to_f }
  yp = xp.map { |x| @model.predict([x]) / @scale[0] + @center[0] }

  # Each suffix is anchored on its own; /(png)|(ps)$/ anchored only "ps".
  terminal =
    case file
    when /png\z/ then "png size 800,600 small"
    when /ps\z/ then "postscript"
    end

  Gnuplot.open do |gp| # This could be either a file or the gnuplot process that we pipe to
    Gnuplot::Plot.new(gp) do |plot|
      plot.title title
      plot.xlabel "Truth"
      plot.ylabel "Prediction"
      plot.set "grid"
      if terminal
        plot.terminal terminal
        plot.output file
      end
      plot.data = [
        Gnuplot::DataSet.new([xp, yp]) { |ds|
          ds.using = '1:2'
          ds.with = "lines"
          ds.title = "SVM prediction"
          ds.linewidth = 1
          ds.matrix = nil
        },
        Gnuplot::DataSet.new([xt, yt]) { |ds|
          ds.using = '1:2'
          ds.with = "points"
          ds.title = "Correct prediction"
          ds.linewidth = 1
          ds.matrix = nil
        }
      ]
    end
  end
end
169
+
170
+ end
@@ -0,0 +1,365 @@
1
require 'SVM'
require 'yaml'
require 'rubygems'
require 'tempfile'
require 'date'
require 'fileutils'
#require 'forkoff'

require 'svmfeature.rb'
require 'svmlab-optim.rb'
require 'svmlab-plot.rb'
require 'svmprediction.rb'
require 'svmlab-config.rb'
12
+
13
+ # An SVMLab object is created giving the configuration either as a file
14
+ # object or as a string. The configuration is in YAML format:
15
+ #
16
+ # ---
17
+ # Feature:
18
+ # <See SVMFeature class documentation>
19
+ # SVM:
20
+ # C: <parameter C>
21
+ # g: <RBF kernel's gamma>
22
+ # e: <epsilon for regression>
23
+ # Scale:
24
+ # <Feature1>:
25
+ # - <Scale1>
26
+ # - <Scale2>
27
+ # - ...
28
+ # - <ScaleN>
29
+ # <Feature2>: <Scale>
30
+ #
31
+ # The Scale setup has to match the features given in Feature configuration
32
+ # and each scale can be given as scalar or as array.
33
+ #
34
+
35
+ class SVMLab
36
+
37
+ attr_reader :cfg, :pslog, :features
38
+
39
# Builds the lab from a configuration and prepares all examples.
#
# All examples are centered and scaled, and the centered/scaled examples are
# stored in @examples. The centering/scaling information ends up in the
# @cfg['SVM'] part of the configuration.
#
# cfg can be given in three ways:
# 1) as an SVMLabConfig object, which is used as is
# 2) as a configuration file File object
# 3) as a String giving the configuration
# The last two are handed to SVMLabConfig.new.
def initialize(cfg)
  @cfg = cfg.is_a?(SVMLabConfig) ? cfg : SVMLabConfig.new(cfg)
  @features = SVMFeature.new(@cfg['Feature'].to_yaml)
  @examples = @features.getAllFeatures
  # Filled in by the first call to train.
  @ndimensions = nil

  # Order matters: the scales must be settled before the examples are scaled.
  checkScales(@cfg)
  checkOptimization(@cfg)
  scaleExamples
  @groups = setGroups
end
59
+
60
# --- setGroups ---
# Builds the cross-validation groups from @cfg['Feature']['Groups'].
#
# Return value:
#   groups hash:
#     key   : group name
#     value : array of example names
#
# Three configurations are handled:
# * Groups not set: every example forms its own group (leave-one-out).
# * Groups of the form "(n1..n2)" or "(n1...n2)": the group name of an
#   example is that character range of its name. Bounds may have any number
#   of digits.
# * Otherwise Groups is a file name prefix: every file in
#   @cfg['Feature']['BaseDir'] named <prefix><digits> defines one group,
#   named by the digits, listing one example name per line.
#
# Returns nil when a prefix is given but no such file exists.
def setGroups()
  groups = @cfg['Feature']['Groups']

  unless groups # If no groups set, use leave-one-out crossvalidation
    return @examples.inject({}) { |hash, (key, _value)|
      hash[key] = [ key ]
      hash
    }
  end

  # \A and \z anchor the whole string. With ^ and $ a multi-line value whose
  # first line looked like a range was accepted and then evaluated as code.
  if groups =~ /\A\((\d+)(\.{2,3})(\d+)\)\z/
    # Build the range directly instead of eval'ing the configuration string.
    first = Regexp.last_match(1).to_i
    last = Regexp.last_match(3).to_i
    exclusive = Regexp.last_match(2).size == 3
    range = Range.new(first, last, exclusive)
    @examples.inject({}) { |hash, (exname, _val)|
      (hash[exname[range]] ||= []) << exname
      hash
    }
  else
    basedir = @cfg['Feature']['BaseDir']
    # The prefix is matched literally, in line with how it is stripped below.
    gfiles = Dir.entries(basedir).grep(/\A#{Regexp.escape(groups)}\d+\z/)
    return nil if gfiles.empty?
    gfiles.inject({}) { |hash, gfile|
      key = gfile[groups.size..-1]
      hash[key] = File.read(File.join(basedir, gfile)).split(/\n/)
      hash
    }
  end
end
89
+
90
# Stores the penalty factor C, converted with to_f, in the 'SVM' section of
# the configuration.
def C=(arg)
  svm_settings = @cfg['SVM']
  svm_settings['C'] = arg.to_f
end
94
+
95
# Stores epsilon for Support Vector Regression, converted with to_f, in the
# 'SVM' section of the configuration.
def e=(arg)
  svm_settings = @cfg['SVM']
  svm_settings['e'] = arg.to_f
end
99
+
100
# Stores gamma for the RBF kernel, converted with to_f, in the 'SVM' section
# of the configuration.
def g=(arg)
  svm_settings = @cfg['SVM']
  svm_settings['g'] = arg.to_f
end
104
+
105
# Describes the examples that lie closest to the given example.
#
# Closeness is the distance (see dist) between the vectors without their
# first element, i.e. without the target value. The n + 1 closest entries of
# @examples are reported; the example itself has distance 0 to itself.
#
# Returns an Array of Strings, one per reported example: a header line with
# name and distance, followed by one line per configured feature giving the
# distance within that feature and the feature values. The first configured
# feature is marked with '***', all others with '---'.
#
# Possibly broken - check the @features hash in case of an erroneous example.
def getNeighbors(example, n = 1)
  arr = @examples[example]
  ranked = @examples.sort_by do |_name, vector|
    dist(arr[1...arr.size], vector[1...vector.size])
  end

  ranked[0..n].map do |name, vector|
    report = name + ' : ' + # Name
             ("%.3f \n" % dist(arr[1...arr.size], vector[1...vector.size])) # Distance
    offset = 0 # position of the current feature inside the vectors
    @cfg['Feature']['Features'].each do |feature|
      nvector = @features.getExFeature(name, feature)
      window = offset...(offset + nvector.size)
      featdist = dist(arr[window], @examples[name][window])
      offset += nvector.size
      pretty = (feature == @cfg['Feature']['Features'][0]) ? ' *** ' : ' --- '
      report += pretty + ("(%.2f)" % featdist) + pretty +
                feature + " : " +
                nvector.join(' ') + "\n"
    end
    report
  end
end
128
+
129
# Euclidean distance between two equally long numeric vectors.
#
# Raises RuntimeError ("Cannot calculate distance") when the sizes differ.
def dist(a,b)
  raise "Cannot calculate distance" unless a.size == b.size
  squares = 0
  a.each_with_index do |ai, k|
    squares += (ai - b[k]).abs**2
  end
  Math.sqrt(squares)
end
133
+
134
# Finds the examples whose prediction is furthest from the true value and
# describes each of them together with its closest neighbors.
#
# n           - Which outliers to report, counted from 1 for the worst
#               prediction: either one Integer rank or a Range of ranks.
# n2          - Number of neighbors requested from getNeighbors.
# predictions - Predictions keyed by example name, each value holding
#               'pred' and 'truth'. A cross-validation is run when omitted.
#
# Returns a String with one section per requested outlier, the sections
# separated by a newline.
def getOutliers(n = (1..1), n2 = 3, predictions = nil)
  predictions ||= crossvalidate
  # Largest absolute error first.
  sortedpred = predictions.sort_by { |(_name, v)| -(v['pred'] - v['truth']).abs }
  # Integer rather than Fixnum: Fixnum no longer exists in current Ruby.
  ranks = n.is_a?(Integer) ? (n..n) : n
  ranks.map do |i|
    name, outcome = sortedpred[i - 1]
    "OUTLIER %d : \n" % i +
      name + " was predicted %.3f" % outcome['pred'] +
      " but the truth is %.3f :\n" % outcome['truth'] +
      getNeighbors(name, n2).join('')
  end.join("\n")
end
151
+
152
# Returns a String with one line per example: the values of its stored
# vector separated by single spaces. Example names are not included.
def printExamples
  lines = @examples.map do |_exname, vector|
    vector.map { |value| value.to_s }.join(' ') + "\n"
  end
  lines.join
end
158
+
159
# An outer binding for the RubySVM predict function that undoes the centering
# and scaling of the target so that a real prediction value is returned.
#
# examples - One example name (String) or a collection of names.
# model    - Model to predict with; a model is trained on all data when
#            omitted.
#
# A name found in @examples uses its stored vector. For any other name the
# features are fetched through a fresh SVMFeature built from a copy of the
# feature configuration and then scaled. When 'PosClassFrom' is configured
# the rounded model output is returned instead of a rescaled value.
#
# Returns a single value when exactly one example was given, otherwise an
# Array. An example whose prediction raised an error yields that exception
# object in place of a value.
def predict(examples, model = nil)
  model ||= self.train
  examples = [ examples ] if examples.is_a? String

  predictions = examples.map do |example|
    begin
      vector = @examples[example]
      unless vector
        # Deep copy so the shared configuration is left untouched.
        fcfg = Marshal.load(Marshal.dump(@cfg['Feature']))
        fcfg.delete('DataSet')
        fcfg['Example'] = example
        vector = scaleExample(SVMFeature.new(fcfg.to_yaml).getExAllFeatures(example))
      end

      if @cfg['Feature']['PosClassFrom']
        model.predict(vector[1..-1]).round
      else
        # The first configured feature is the target; invert its scaling.
        model.predict(vector[1..-1]) /
          @cfg['SVM']['Scale'][ @cfg['Feature']['Features'][0] ][0] +
          @cfg['SVM']['Center'][ @cfg['Feature']['Features'][0] ][0]
      end
    rescue => problem
      problem
    end
  end

  predictions.size == 1 ? predictions[0] : predictions
end
190
+
191
# An outer binding for the RubySVM training function.
#
# examples - Names of the examples to train on. All data in the dataset is
#            used when omitted. Names without a stored vector are skipped.
#
# The first element of every vector is passed as the target value and the
# rest as its features. C, e and g are taken from the 'SVM' configuration
# when set; gamma falls back to 1 / number of feature dimensions. The SVM
# type is 0 when 'PosClassFrom' is configured and 3 otherwise (presumably
# libsvm's C-SVC and epsilon-SVR - confirm against the SVM binding).
#
# STDOUT and STDERR are redirected to the null device while the model is
# built and restored afterwards, also when building fails.
#
# Returns the SVM::Model.
def train(examples = nil)
  svm = SVM::Problem.new
  vectors =
    if examples
      # Resolve names first so an unknown name is dropped before its vector
      # is used; previously it could be dereferenced while still nil.
      examples.map { |exname| @examples[exname] }.compact
    else
      @examples.map { |_name, vector| vector }
    end
  vectors.each do |vector|
    @ndimensions ||= vector.size - 1
    svm.addExample( vector[0], vector[1..-1] )
  end

  # Keep copies of the real streams so they can be restored.
  errout = STDERR.clone
  out = STDOUT.clone
  begin
    # The block form closes the null device handle again.
    File.open(File::NULL, 'w') do |devnull|
      STDERR.reopen(devnull)
      STDOUT.reopen(devnull)
      @par = SVM::Parameter.new
      @par.svm_type = @cfg['Feature']['PosClassFrom'] ? 0 : 3
      if c = @cfg['SVM']['C'] then @par.C = c.to_f end
      if e = @cfg['SVM']['e'] then @par.eps = e.to_f end
      @par.gamma = if g = @cfg['SVM']['g'] then g.to_f
                   else 1.0 / @ndimensions end
      SVM::Model.new(svm, @par)
    end
  ensure
    STDERR.reopen(errout)
    STDOUT.reopen(out)
    errout.close
    out.close
  end
end
221
+
222
# Cross-validation over the groups in @groups: each group in turn is left
# out of training and then predicted with the model trained on the others.
#
# Return value:
# - Predictions (an SVMPrediction) :
#     key   : example name
#     value : 'truth' => the true value
#             'pred'  => the predicted value
# Group members without an entry in @examples are left out of the result.
# With 'PosClassFrom' configured the truth is the rounded stored target;
# otherwise the stored target is scaled back to its original units.
#--
# Remaining issues:
# 2) No of parallel computations should be in cfg
#++
def crossvalidate()
  per_group = @groups.keys.map do |group|
    members = @groups[group]
    # Everything outside the current group is training data.
    trainingex = []
    @groups.each do |trgroup, trmem|
      trainingex += trmem unless trgroup == group
    end
    model = self.train(trainingex)

    # Predict each member of the group left out of training
    outcome = {}
    members.each do |predname|
      next unless @examples[predname]
      truth =
        if @cfg['Feature']['PosClassFrom']
          @examples[predname][0].round
        else
          @examples[predname][0] /
            @cfg['SVM']['Scale'][@cfg['Feature']['Features'][0]][0] +
            @cfg['SVM']['Center'][@cfg['Feature']['Features'][0]][0]
        end
      outcome[predname] = { 'truth' => truth,
                            'pred' => self.predict(predname, model) }
    end
    outcome
  end

  # Merge the per-group results into one prediction object.
  per_group.inject(SVMPrediction.new) do |merged, predhash|
    predhash.each { |exname, phash| merged[exname] = phash }
    merged
  end
end
256
+
257
# Same as crossvalidate, but also writes configuration and result to a file.
#
# path - File to write. Missing parent directories are created.
#
# The file is a YAML document with the entries 'Time' (time of writing),
# 'Evaluation' ('RMSD' and 'CC' as reported by the predictions object),
# 'Configuration' (@cfg) and 'Predictions'.
#
# Returns the predictions object produced by crossvalidate.
def publish_crossvalidate(path)
  predictions = self.crossvalidate
  info = {
    'Time' => DateTime.now,
    'Evaluation' => {
      'RMSD' => predictions.rmsd,
      #'MeanErr' => predictions.meanerr,
      'CC' => predictions.cc },
      #'AUC' => auc(predictions,1),
      #'PBRMSD' => pbrmsd(predictions, @cfg['Feature']['Groups']),
      #'WRMSD' => wrmsd(predictions,1),
      #'F1' => f1(predictions,2) },
    'Configuration' => @cfg,
    'Predictions' => predictions.predictions
  }
  # Creates every missing directory level. The previous hand-written loop
  # skipped the first component of a relative path and relied on
  # File.exists?, which current Ruby no longer provides.
  FileUtils.mkdir_p(File.dirname(path))
  File.open(path, 'w') { |f| YAML.dump(info, f) }
  predictions
end
283
+
284
+ private
285
+
286
# Goes through every dimension of every configured feature and settles its
# centering and scaling instructions in cfg['SVM']['Center'] and
# cfg['SVM']['Scale'] (both are created when missing).
#
# cfg - Configuration to complete. Feature dimensions and 'PosClassFrom' are
#       read from @cfg, as before.
#
# Center: a value already configured is kept; otherwise the mean of that
# column over all examples is stored.
#
# Scale:
# * not configured      - 1 / (largest absolute deviation from the center),
#                         or 1 when all deviations are zero
# * String with 'max'   - 'max' is replaced by that same factor
# * String with 'avg'   - 'avg' is replaced by 1 / (mean absolute deviation)
# * String with 'std'   - 'std' is replaced by 1 / (root mean squared deviation)
# * anything else       - left untouched
# A String that is a plain decimal number after the replacement is converted
# to a Float; other Strings are left as Strings.
# NOTE(review): a String such as "2*max" therefore stays an unevaluated
# expression - confirm how it is meant to be interpreted.
#
# When 'PosClassFrom' is configured, the first feature (the target) is
# always given center 0 and scale 1.
#
# Divisions are carried out in floating point, so integer-valued features no
# longer get a truncated center or a scale of 0, and a column without any
# spread gets scale 1 for 'avg' and 'std' as well.
def checkScales(cfg)
  cfg['SVM']['Center'] ||= {}
  cfg['SVM']['Scale'] ||= {}
  dim0 = 0 # index of the current feature's first column in the vectors
  cfg['Feature']['Features'].each_with_index do |feature, index|
    dim = @cfg['Feature'][feature]['Dimensions'] || 1 # If Dimensions not given
    cfg['SVM']['Center'][feature] ||= []
    cfg['SVM']['Scale'][feature] ||= []
    centers = cfg['SVM']['Center'][feature]
    sc = cfg['SVM']['Scale'][feature]
    fixed = (index == 0 && @cfg['Feature']['PosClassFrom'])

    (0...dim).each do |i|
      # Check centering
      if fixed
        centers[i] = 0
      elsif !centers[i]
        sum = @examples.inject(0) { |s, (_exname, vector)| s + vector[dim0 + i] }
        centers[i] = sum.to_f / @examples.size
      end

      # Check scaling
      if fixed
        sc[i] = 1
        next
      end
      keyword =
        if !sc[i]
          'max'
        elsif sc[i].is_a? String
          # Same precedence as before: max, then avg, then std.
          %w[max avg std].find { |word| sc[i].include?(word) }
        end
      next unless keyword

      deviations = @examples.map { |_exname, vector| (vector[dim0 + i] - centers[i]).abs }
      spread =
        case keyword
        when 'max' then deviations.inject(0) { |m, d| [m, d].max }
        when 'avg' then deviations.inject(0) { |s, d| s + d } / @examples.size.to_f
        when 'std' then Math.sqrt(deviations.inject(0) { |s, d| s + d**2 } / @examples.size.to_f)
        end
      scale = spread != 0 ? 1.0 / spread : 1

      if sc[i].is_a? String
        sc[i].sub!(keyword, scale.to_s)
        # Only Strings are pattern matched; numbers do not respond to =~.
        sc[i] = sc[i].to_f if sc[i] =~ /^\d+\.*\d*$/
      else
        sc[i] = scale
      end
    end
    dim0 += dim
  end
  #puts 'CENTER',@cfg['SVM']['Center'].to_yaml
  #puts 'SCALE',@cfg['SVM']['Scale'].to_yaml
end
340
+
341
# Replaces every vector stored in @examples by its centered and scaled
# version as computed by scaleExample, i.e. each feature gets its center
# term subtracted and its scaling factor applied according to the
# configuration.
def scaleExamples()
  @examples.each { |exname, raw|
    @examples[exname] = scaleExample(raw)
  }
end
349
+
350
# Centers and scales one example vector.
#
# The configured features are walked in order; each occupies 'Dimensions'
# consecutive positions of the vector (1 when not given). Every value has the
# matching entry of @cfg['SVM']['Center'] subtracted and is then multiplied
# by the matching entry of @cfg['SVM']['Scale'], converted with to_f.
#
# Returns a new Array; the given vector is not modified.
def scaleExample(vector)
  offset = 0 # position of the current feature's first value
  scaledvector = []
  @cfg['Feature']['Features'].each do |feature|
    dim = @cfg['Feature'][feature]['Dimensions'] || 1
    dim.times do |j|
      centered = vector[offset + j] - @cfg['SVM']['Center'][feature][j]
      scaledvector[offset + j] = centered * @cfg['SVM']['Scale'][feature][j].to_f
    end
    offset += dim
  end
  scaledvector
end
363
+
364
+ end
365
+