opentox-ruby 3.0.1 → 3.1.0

@@ -70,7 +70,7 @@ module OpenTox
 
  begin
  #LOGGER.debug "RestCall: "+rest_call.to_s+" "+uri.to_s+" "+headers.inspect+" "+payload.inspect
- resource = RestClient::Resource.new(uri,{:timeout => 60})
+ resource = RestClient::Resource.new(uri,{:timeout => 600})
  if rest_call=="post" || rest_call=="put"
  result = resource.send(rest_call, payload, headers)
  else
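
Note: the only change in this hunk raises the rest-client read timeout from 60 to 600 seconds, so long-running service calls (e.g. model building or validation) no longer abort prematurely. A minimal sketch of the setting, assuming the rest-client gem; the URI is a placeholder:

    require 'rest_client'
    # :timeout is the read timeout in seconds; the call fails only after
    # the server has been silent for that long.
    resource = RestClient::Resource.new("http://example.org/task/1", :timeout => 600)
    result = resource.get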
data/lib/serializer.rb CHANGED
@@ -55,7 +55,7 @@ module OpenTox
  OT.predictedVariables => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
  OT.paramValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
 
- #object props for validation#
+ #object props for validation#
  OT.model => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
  OT.trainingDataset => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
  OT.predictionFeature => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
@@ -87,7 +87,7 @@ module OpenTox
  OT.percentageCompleted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
  OT.acceptValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
 
- # annotation props for validation
+ # annotation props for validation
  OT.numUnpredicted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
  OT.crossvalidationFold => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
  OT.numInstances => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
@@ -143,8 +143,8 @@ module OpenTox
  @data_entries = {}
  @values_id = 0
  @parameter_id = 0
-
- @classes = Set.new
+
+ @classes = Set.new
  @object_properties = Set.new
  @annotation_properties = Set.new
  @datatype_properties = Set.new
@@ -208,7 +208,7 @@ module OpenTox
  @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => OT.Task }] }
  add_metadata uri, metadata
  end
-
+
  # Add a resource defined by resource_class and content
  # (see documentation of add_content for example)
  # @param [String] uri of resource
@@ -223,10 +223,10 @@ module OpenTox
  def add_uri(uri,type)
  @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => type }] }
  end
-
+
  private
  @@content_id = 1
-
+
  #Recursiv function to add content
  #@example
  # { DC.description => "bla",
@@ -244,7 +244,7 @@ module OpenTox
  hash.each do |u,v|
  if v.is_a? Hash
  # value is again a hash, i.e. a new owl class is added
- # first make sure type (==class) is set
+ # first make sure type (==class) is set
  type = v[RDF.type]
  raise "type missing for "+u.to_s+" content:\n"+v.inspect unless type
  raise "class unknown "+type.to_s+" (for "+u.to_s+")" unless @object.has_key?(type)
@@ -256,7 +256,7 @@ module OpenTox
  # add content to new class
  add_content(genid,v)
  elsif v.is_a? Array
- # value is an array, i.e. a list of values with property is added
+ # value is an array, i.e. a list of values with property is added
  v.each{ |vv| add_content( uri, { u => vv } ) }
  else # v.is_a? String
  # simple string value
@@ -268,7 +268,7 @@ module OpenTox
  end
  end
  end
-
+
  public
 
  # Add metadata
@@ -329,7 +329,7 @@ module OpenTox
  v = [{ "type" => "uri", "value" => value}]
  when "literal"
  v = [{ "type" => "literal", "value" => value, "datatype" => datatype(value) }]
- else
+ else
  raise "Illegal type #{type(value)} for #{value}."
  end
  @object[values] = {
@@ -342,7 +342,7 @@ module OpenTox
  end
 
  # Serializers
-
+
  # Convert to N-Triples
  # @return [text/plain] Object OWL-DL in N-Triples format
  def to_ntriples
@@ -353,7 +353,7 @@ module OpenTox
  entry.each do |p,objects|
  p = url(p)
  objects.each do |o|
- case o["type"]
+ case o["type"]
  when "uri"
  o = url(o["value"])
  when "literal"
@@ -371,9 +371,15 @@ module OpenTox
  # Convert to RDF/XML
  # @return [text/plain] Object OWL-DL in RDF/XML format
  def to_rdfxml
- Tempfile.open("owl-serializer"){|f| f.write(self.to_ntriples); @path = f.path}
+ tmpf = Tempfile.open("owl-serializer")
+ tmpf.write(self.to_ntriples)
+ tmpf.flush
+ @path = tmpf.path
  # TODO: add base uri for ist services
- `rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+ res=`rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+ tmpf.close
+ tmpf.delete
+ res
  end
 
  # Convert to JSON as specified in http://n2.talis.com/wiki/RDF_JSON_Specification
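
Note: the old block form left the Tempfile object unreferenced after the block, so the file could be garbage-collected (and unlinked) before rapper ran; the rewrite keeps an explicit handle, flushes before shelling out, deletes deterministically, and returns rapper's output instead of relying on side effects. A minimal standalone sketch of why the flush matters (not from the gem):

    require 'tempfile'
    tmpf = Tempfile.open("demo")
    tmpf.write("some ntriples payload")
    tmpf.flush                   # make buffered bytes visible to child processes
    puts `wc -c #{tmpf.path}`    # reports the full size only after the flush
    tmpf.close
    tmpf.delete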
@@ -427,20 +433,20 @@ module OpenTox
  end
 
  def literal(value,type)
- # concat and << are faster string concatination operators than +
+ # concat and << are faster string concatination operators than +
  '"'.concat(value.to_s).concat('"^^<').concat(type).concat('>')
  end
 
  def url(uri)
- # concat and << are faster string concatination operators than +
+ # concat and << are faster string concatination operators than +
  '<'.concat(uri).concat('>')
  end
 
  def rdf_types
- @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
- @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
- @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
- @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
+ @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
+ @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
+ @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
+ @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
  end
 
  end
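
Note: a whitespace-only hunk around the N-Triples helpers. For reference, a sketch of the strings they produce (placeholder URIs; the helpers are private in the gem):

    url("http://example.org/r")
    # => "<http://example.org/r>"
    literal("5.2", "http://www.w3.org/2001/XMLSchema#float")
    # => "\"5.2\"^^<http://www.w3.org/2001/XMLSchema#float>"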
@@ -457,35 +463,46 @@ module OpenTox
  @rows.first << features
  @rows.first.flatten!
  dataset.data_entries.each do |compound,entries|
- smiles = Compound.new(compound).to_smiles
+ cmpd = Compound.new(compound)
+ smiles = cmpd.to_smiles
+ inchi = URI.encode_www_form_component(cmpd.to_inchi)
+ row_container = Array.new
  row = Array.new(@rows.first.size)
- row[0] = smiles
+ row_container << row
+ #row[0] = smiles
+ row[0] = inchi
  entries.each do |feature, values|
  i = features.index(feature)+1
  values.each do |value|
- if row[i]
- row[i] = "#{row[i]} #{value}" # multiple values
+ if row_container[0][i]
+ #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
+ row_container << row_container.last.collect
+ row_container.last[i] = value
+ #LOGGER.debug "RC: #{row_container.to_yaml}"
  else
- row[i] = value
+ row_container.each { |r| r[i] = value }
  end
  end
  end
- @rows << row
+ row_container.each { |r| @rows << r }
  end
  end
 
  # Convert to CSV string
  # @return [String] CSV string
  def to_csv
- @rows.collect{|r| r.join(", ")}.join("\n")
+ rows = @rows.collect
+ result = ""
+ result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
+ result << rows.collect{ |r| r.join(",") }.join("\n")
  end
 
  # Convert to spreadsheet workbook
  # @return [Spreadsheet::Workbook] Workbook object (use the spreadsheet gemc to write a file)
- def to_spreadsheet
+ def to_spreadsheet(sheetname="sheet1")
  Spreadsheet.client_encoding = 'UTF-8'
  book = Spreadsheet::Workbook.new
- sheet = book.create_worksheet(:name => '')
+ sheet = book.create_worksheet(:name => "#{sheetname}")
  sheet.column(0).width = 100
  i = 0
  @rows.each do |row|
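
Note: the exporter previously merged multiple values of one feature into a single cell ("#{row[i]} #{value}"); it now keys rows by URL-encoded InChI instead of SMILES and emits one copy of the row per extra value, while to_csv shortens feature URIs to their last path segment for the header. A minimal sketch of the row-duplication idea with plain arrays (not from the gem):

    rows = [["inchi-placeholder", nil, nil]]
    { 1 => [0.5, 0.7], 2 => [1.0] }.each do |i, values|
      values.each do |value|
        if rows[0][i]                 # cell already filled: fork a new row
          rows << rows.last.dup
          rows.last[i] = value
        else                          # first value: write into every row copy
          rows.each { |r| r[i] = value }
        end
      end
    end
    # rows => [["inchi-placeholder", 0.5, 1.0], ["inchi-placeholder", 0.7, 1.0]]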
@@ -0,0 +1,201 @@
+
+ nominal_to_binary <- function( data )
+ {
+ result = NULL
+ for (i in 1:ncol(data))
+ {
+ #print(i)
+ if (is.numeric( data[,i] ) )
+ {
+ if (is.null(result))
+ result = data.frame(data[,i])
+ else
+ result = data.frame(result, data[,i])
+ colnames(result)[ncol(result)] <- colnames(data)[i]
+ }
+ else
+ {
+ vals = unique(data[,i])
+ for (j in 1:length(vals))
+ {
+ #print(j)
+ bins = c()
+ for (k in 1:nrow(data))
+ {
+ if(data[,i][k] == vals[j])
+ bins = c(bins,1)
+ else
+ bins = c(bins,0)
+ }
+ #print(bins)
+ if (is.null(result))
+ result = data.frame(bins)
+ else
+ result = data.frame(result, bins)
+ colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
+ if (length(vals)==2) break
+ }
+ }
+ }
+ #print(head(result))
+ result
+ }
+
+ process_data <- function( data )
+ {
+ data.num <- as.data.frame(data)
+ if (!is.numeric(data.num))
+ {
+ data.num = nominal_to_binary(data.num)
+ }
+ if(any(is.na(data.num)))
+ {
+ require("gam")
+ data.repl = na.gam.replace(data.num)
+ }
+ else
+ data.repl = data.num
+ data.repl
+ }
+
+ cluster <- function( data, min=10, max=15 )
+ {
+ require("vegan")
+ max <- min(max,nrow(unique(data)))
+ max <- min(max,nrow(data)-1)
+ if (min>max)
+ min=max
+ print(paste("cascade k-means ",min," - ",max))
+ s = cascadeKM(data,min,max,iter=30)
+ m = max.col(s$results)[2]
+ print(paste("best k-means clustering result: ",((m-1)+min)," num clusters"))
+ cbind(s$partition[,m])
+ }
+
+ stratified_split <- function( data, ratio=0.3, method="cluster" )
+ {
+ data.processed = as.matrix(process_data( data ))
+ if (method == "samplecube")
+ {
+ require("sampling")
+ # adjust ratio to make samplecube return exact number of samples
+ ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+ pik = rep(ratio,times=nrow(data.processed))
+ data.strat = cbind(pik,data.processed)
+ samplecube(data.strat,pik,order=2,comment=F)
+ }
+ else if (method == "cluster")
+ {
+ cl = cluster(data.processed)
+ # require("caret")
+ # res = createDataPartition(cl,p=ratio)
+ # split = rep(1, times=nrow(data))
+ # for (j in 1:nrow(data))
+ # if ( is.na(match(j,res$Resample1)) )
+ # split[j]=0
+ # split
+ require("sampling")
+ stratified_split(cl,ratio,"samplecube")
+ }
+ else
+ stop("unknown method")
+ }
+
+ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+ {
+ print(paste(num_folds,"-fold-split, data-size",nrow(data)))
+ data.processed = as.matrix(process_data( data ))
+ if (method == "samplecube")
+ {
+ folds = rep(0, times=nrow(data))
+ for (i in 1:(num_folds-1))
+ {
+ require("sampling")
+ prop = 1/(num_folds-(i-1))
+ print(paste("fold",i,"/",num_folds," prop",prop))
+ pik = rep(prop,times=nrow(data))
+ for (j in 1:nrow(data))
+ if(folds[j]!=0)
+ pik[j]=0
+ data.strat = cbind(pik,data.processed)
+ s<-samplecube(data.strat,pik,order=2,comment=F)
+ print(paste("fold size: ",sum(s)))
+ for (j in 1:nrow(data))
+ if (s[j] == 1)
+ folds[j]=i
+ }
+ for (j in 1:nrow(data))
+ if (folds[j] == 0)
+ folds[j]=num_folds
+ folds
+ }
+ else if (method == "cluster")
+ {
+ require("TunePareto")
+ cl = cluster(data.processed)
+ res = generateCVRuns(cl,ntimes=1,nfold=3)
+ folds = rep(0, times=nrow(data))
+ for (i in 1:num_folds)
+ for(j in 1:length(res[[1]][[i]]))
+ folds[res[[1]][[i]][j]]=i
+ folds
+ }
+ else
+ stop("unknown method")
+ }
+
+ plot_pre_process <- function( data, method="pca" )
+ {
+ data.processed = process_data( data )
+ if (method == "pca")
+ {
+ data.pca <- prcomp(data.processed, scale=TRUE)
+ as.data.frame(data.pca$x)[1:2]
+ }
+ else if (method == "smacof")
+ {
+ require("smacof")
+ data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
+ data.emb$conf
+ }
+ else
+ stop("unknown method")
+ }
+
+ plot_split <- function( data, split, names=NULL, ... )
+ {
+ if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
+ stop("data not suitable for plotting, plot_pre_process() first")
+
+ plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
+ if (is.null(names))
+ names <- c("split 1","split 2")
+ colos = as.double(rep(2:(max(split)+2)))
+ legend("topleft",names,pch=2,col=colos)
+
+ for (j in max(split):0)
+ {
+ set = c()
+ for (i in 1:nrow(data))
+ if (split[i] == j)
+ set = c(set,i)
+ points(data[set,], pch = 2, col=(j+2))
+ }
+ }
+
+ #a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
+ #b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+ #data<-rbind(a,b)
+ #c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
+ #data<-rbind(data,c)
+ #data=iris
+ #split = stratified_k_fold_split(data, num_folds=3)
+ #split = stratified_split(data, ratio=0.33, method="cluster")
+ #print(sum(split))
+ #plot_split(plot_pre_process(data),split,c("training","test"))
+
+ #cl = cluster(data)
+
+
+
+
data/lib/task.rb CHANGED
@@ -242,16 +242,20 @@ module OpenTox
  # waits for a task, unless time exceeds or state is no longer running
  # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
  # @param [optional,Numeric] dur seconds pausing before cheking again for completion
- def wait_for_completion( waiting_task=nil, dur=0.3)
+ def wait_for_completion( waiting_task=nil)
 
  waiting_task.waiting_for(self.uri) if waiting_task
  due_to_time = Time.new + DEFAULT_TASK_MAX_DURATION
+ start_time = Time.new
+ dur = 0
  LOGGER.debug "start waiting for task "+@uri.to_s+" at: "+Time.new.to_s+", waiting at least until "+due_to_time.to_s
 
  load_metadata # for extremely fast tasks
  check_state
  while self.running? or self.queued?
  sleep dur
+ dur = [[(Time.new - start_time)/20.0,0.3].max,300.0].min
+ #LOGGER.debug "task-object-id: #{self.object_id} - wait: #{"%.2f"%(Time.new - start_time)} - dur: #{"%.2f"%dur}"
  load_metadata
  # if another (sub)task is waiting for self, set progress accordingly
  waiting_task.progress(@metadata[OT.percentageCompleted].to_f) if waiting_task
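
Note: wait_for_completion drops the fixed dur parameter in favor of an adaptive polling interval: one twentieth of the elapsed wait, clamped to [0.3, 300] seconds, so fresh tasks are polled quickly and long-running ones back off. The clamp in isolation:

    def poll_interval(elapsed_seconds)
      [[elapsed_seconds / 20.0, 0.3].max, 300.0].min
    end
    poll_interval(2)      # => 0.3   (floor while the task is young)
    poll_interval(120)    # => 6.0   (grows with elapsed time)
    poll_interval(10_000) # => 300.0 (ceiling after 100 minutes of waiting)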
data/lib/transform.rb ADDED
@@ -0,0 +1,520 @@
+ module OpenTox
+ module Transform
+ # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+
+ # LogAutoScaler for GSL vectors.
+ # Take log and scale.
+ class LogAutoScale
+ attr_accessor :vs, :offset, :autoscaler
+
+ # @param [GSL::Vector] Values to transform using LogAutoScaling.
+ def initialize values
+ @distance_to_zero = 1.0
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ @offset = vs.min - @distance_to_zero
+ @autoscaler = OpenTox::Transform::AutoScale.new mvlog(vs)
+ @vs = @autoscaler.vs
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to restore.
+ # @return [GSL::Vector] transformed values.
+ def restore values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ rv = @autoscaler.restore(vs)
+ rv.to_a.collect { |v| (10**v) + @offset }.to_gv
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to transform.
+ # @return [GSL::Vector] transformed values.
+ def mvlog values
+ values.to_a.collect { |v| Math::log10(v - @offset) }.to_gv
+ end
+
+ end
+
+
+ # Auto-Scaler for GSL vectors.
+ # Center on mean and divide by standard deviation.
+ class AutoScale
+ attr_accessor :vs, :mean, :stdev
+
+ # @param [GSL::Vector] values to transform using AutoScaling.
+ def initialize values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ @mean = vs.to_scale.mean
+ @stdev = vs.to_scale.standard_deviation_population
+ @stdev = 0.0 if @stdev.nan?
+ @vs = transform vs
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to transform.
+ # @return [GSL::Vector] transformed values.
+ def transform values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ autoscale values.clone
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] Values to restore.
+ # @return [GSL::Vector] transformed values.
+ def restore values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
+ (rv_ss + @mean).to_gsl
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to transform.
+ # @return [GSL::Vector] transformed values.
+ def autoscale values
+ vs_ss = values.clone.to_scale - @mean
+ @stdev == 0.0 ? vs_ss.to_gsl : ( vs_ss * ( 1 / @stdev) ).to_gsl
+ end
+
+ end
+
+
+ # Principal Components Analysis.
+ class PCA
+ attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+ # Creates a transformed dataset as GSL::Matrix.
+ #
+ # @param [GSL::Matrix] Data matrix.
+ # @param [Float] Compression ratio from [0,1], default 0.05.
+ # @return [GSL::Matrix] Data transformed matrix.
+ def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0)
+ begin
+ @data_matrix = data_matrix.clone
+ @compression = compression.to_f
+ @mean = Array.new
+ @autoscaler = Array.new
+ @cols = Array.new
+ @maxcols = maxcols
+
+ # Objective Feature Selection
+ raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+ @data_matrix_selected = nil
+ (0..@data_matrix.size2-1).each { |i|
+ if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+ if @data_matrix_selected.nil?
+ @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+ @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+ else
+ @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+ end
+ @cols << i
+ end
+ }
+ raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+ # PCA uses internal centering on 0
+ @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
+ (0..@cols.size-1).each { |i|
+ as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i))
+ @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
+ @mean << as.mean
+ @autoscaler << as
+ }
+
+ # PCA
+ data_matrix_hash = Hash.new
+ (0..@cols.size-1).each { |i|
+ column_view = @data_matrix_scaled.col(i)
+ data_matrix_hash[i] = column_view.to_scale
+ }
+ dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+ cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+ pca=Statsample::Factor::PCA.new(cor_matrix)
+
+ # Select best eigenvectors
+ pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+ @eigenvalue_sums = Array.new
+ (0..@cols.size-1).each { |i|
+ @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+ }
+ eigenvectors_selected = Array.new
+ pca.eigenvectors.each_with_index { |ev, i|
+ if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
+ eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
+ end
+ }
+ @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
+ @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # Transforms data to feature space found by PCA.
+ #
+ # @param [GSL::Matrix] Data matrix.
+ # @return [GSL::Matrix] Transformed data matrix.
+ def transform values
+ begin
+ vs = values.clone
+ raise "Error! Too few columns for transformation." if vs.size2 < @cols.max
+ data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
+ @cols.each_with_index { |i,j|
+ data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
+ }
+ (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # Restores data in the original feature space (possibly with compression loss).
+ #
+ # @param [GSL::Matrix] Transformed data matrix.
+ # @return [GSL::Matrix] Data matrix.
+ def restore
+ begin
+ data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+ # reverse scaling
+ (0..@cols.size-1).each { |i|
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+ }
+ data_matrix_restored
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ end
+
+
+ # Singular Value Decomposition
+ class SVD
+ attr_accessor :data_matrix, :compression, :data_transformed_matrix, :uk, :vk, :eigk, :eigk_inv
+
+ # Creates a transformed dataset as GSL::Matrix.
+ #
+ # @param [GSL::Matrix] Data matrix
+ # @param [Float] Compression ratio from [0,1], default 0.05
+ # @return [GSL::Matrix] Data transformed matrix
+
+ def initialize data_matrix, compression=0.05
+ begin
+ @data_matrix = data_matrix.clone
+ @compression = compression
+
+ # Compute the SV Decomposition X=USV
+ # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
+ u, vt, s = data_matrix.SV_decomp
+
+ # Determine cutoff index
+ s2 = s.mul(s) ; s2_sum = s2.sum
+ s2_run = 0
+ k = s2.size - 1
+ s2.to_a.reverse.each { |v|
+ s2_run += v
+ frac = s2_run / s2_sum
+ break if frac > compression
+ k -= 1
+ }
+ k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)
+
+ # Take the k-rank approximation of the Matrix
+ # - Take first k columns of u
+ # - Take first k columns of vt
+ # - Take the first k eigenvalues
+ @uk = u.submatrix(nil, (0..k)) # used to transform column format data
+ @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
+ s = GSL::Matrix.diagonal(s)
+ @eigk = s.submatrix((0..k), (0..k))
+ @eigk_inv = @eigk.inv
+
+ # Transform data
+ @data_transformed_matrix = @uk # = u for all SVs
+ # NOTE: @data_transformed_matrix is also equal to
+ # @data_matrix * @vk * @eigk_inv
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ # Transforms data instance (1 row) to feature space found by SVD.
+ #
+ # @param [GSL::Matrix] Data matrix (1 x m).
+ # @return [GSL::Matrix] Transformed data matrix.
+ def transform_instance values
+ begin
+ values * @vk * @eigk_inv
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+ alias :transform :transform_instance # make this the default (see PCA interface)
+
+ # Transforms data feature (1 column) to feature space found by SVD.
+ #
+ # @param [GSL::Matrix] Data matrix (1 x n).
+ # @return [GSL::Matrix] Transformed data matrix.
+ def transform_feature values
+ begin
+ values * @uk * @eigk_inv
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ # Restores data in the original feature space (possibly with compression loss).
+ #
+ # @param [GSL::Matrix] Transformed data matrix.
+ # @return [GSL::Matrix] Data matrix.
+ def restore
+ begin
+ @data_transformed_matrix * @eigk * @vk.transpose # reverse svd
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ end
+
+
+
+ # Attaches transformations to an OpenTox::Model
+ # Stores props, sims, performs similarity calculations
+ class ModelTransformer
+ attr_accessor :model, :similarity_algorithm, :acts, :sims
+
+ # @params[OpenTox::Model] model to transform
+ def initialize model
+ @model = model
+ @similarity_algorithm = @model.similarity_algorithm
+ end
+
+ def transform
+ get_matrices # creates @n_prop, @q_prop, @acts from ordered fps
+ @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors
+
+ # Preprocessing
+ if (@model.similarity_algorithm == "Similarity.cosine")
+ # truncate nil-columns and -rows
+ LOGGER.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ while @q_prop.size>0
+ idx = @q_prop.index(nil)
+ break if idx.nil?
+ @q_prop.slice!(idx)
+ @n_prop.each { |r| r.slice!(idx) }
+ end
+ LOGGER.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ remove_nils # removes nil cells (for cosine); alters @n_props, @q_props, cuts down @ids to survivors
+ LOGGER.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+
+ # adjust rest
+ fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp
+ cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp
+ acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+ # scale and svd
+ nr_cases, nr_features = @n_prop.size, @n_prop[0].size
+ gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup
+ gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup
+ (0...nr_features).each { |i|
+ autoscaler = OpenTox::Transform::AutoScale.new(gsl_n_prop.col(i))
+ gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs
+ gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i)
+ }
+ svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0)
+ @n_prop = svd.data_transformed_matrix.to_a
+ @q_prop = svd.transform(gsl_q_prop).row(0).to_a
+ LOGGER.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ else
+ convert_nils # convert nil cells (for tanimoto); leave @n_props, @q_props, @ids untouched
+ end
+
+ # neighbor calculation
+ @ids = [] # surviving compounds become neighbors
+ @sims = [] # calculated by neighbor routine
+ neighbors
+ n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix
+ acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+
+ # Sims between neighbors, if necessary
+ gram_matrix = []
+ if !@model.parameter("propositionalized") # need gram matrix for standard setting (n. prop.)
+ @n_prop.each_index do |i|
+ gram_matrix[i] = [] unless gram_matrix[i]
+ @n_prop.each_index do |j|
+ if (j>i)
+ sim = eval("OpenTox::Algorithm::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])")
+ gram_matrix[i][j] = sim
+ gram_matrix[j] = [] unless gram_matrix[j]
+ gram_matrix[j][i] = gram_matrix[i][j]
+ end
+ end
+ gram_matrix[i][i] = 1.0
+ end
+ end
+
+ # reclaim original data (if svd was performed)
+ if svd
+ @n_prop = gsl_n_prop_orig.to_a
+ n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp
+ @q_prop = gsl_q_prop_orig.row(0).to_a
+ end
+
+ LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
+
+ @sims = [ gram_matrix, @sims ]
+
+ end
+
+
+
+
+ # Find neighbors and store them as object variable, access all compounds for that.
+ def neighbors
+ @model.neighbors = []
+ @n_prop.each_with_index do |fp, idx| # AM: access all compounds
+ add_neighbor fp, idx
+ end
+ end
+
+
+ # Adds a neighbor to @neighbors if it passes the similarity threshold
+ # adjusts @ids to signal the
+ def add_neighbor(training_props, idx)
+
+ sim = similarity(training_props)
+ if sim > @model.parameter("min_sim")
+ if @model.activities[@cmpds[idx]]
+ @model.activities[@cmpds[idx]].each do |act|
+ @model.neighbors << {
+ :compound => @cmpds[idx],
+ :similarity => sim,
+ :features => @fps[idx].keys,
+ :activity => act
+ }
+ @sims << sim
+ @ids << idx
+ end
+ end
+ end
+ end
+
+
+ # Removes nil entries from n_prop and q_prop.
+ # Matrix is a nested two-dimensional array.
+ # Removes iteratively rows or columns with the highest fraction of nil entries, until all nil entries are removed.
+ # Tie break: columns take precedence.
+ # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero.
+ # Enables the use of cosine similarity / SVD
+ def remove_nils
+ return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0)
+ col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+ row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+ m_cols = col_nr_nils.max
+ m_rows = row_nr_nils.max
+ idx_cols = col_nr_nils.index(m_cols)
+ idx_rows = row_nr_nils.index(m_rows)
+ while ((m_cols > 0) || (m_rows > 0)) do
+ if m_cols >= m_rows
+ @n_prop.each { |row| row.slice!(idx_cols) }
+ @q_prop.slice!(idx_cols)
+ else
+ @n_prop.slice!(idx_rows)
+ @ids.slice!(idx_rows)
+ end
+ break if (@n_prop.length == 0) || (@n_prop[0].length == 0)
+ col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+ row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+ m_cols = col_nr_nils.max
+ m_rows = row_nr_nils.max
+ idx_cols= col_nr_nils.index(m_cols)
+ idx_rows = row_nr_nils.index(m_rows)
+ end
+ end
+
+
+ # Replaces nils by zeroes in n_prop and q_prop
+ # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop)
+ def convert_nils
+ @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } }
+ @q_prop.collect! { |v| v.nil? ? 0 : v }
+ end
+
+
+ # Executes model similarity_algorithm
+ def similarity(training_props)
+ eval("OpenTox::Algorithm::#{@model.similarity_algorithm}(training_props, @q_prop)")
+ end
+
+
+ # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed.
+ # Same for compound fingerprints.
+ def get_matrices
+
+ @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
+
+ @model.fingerprints.each { |fp|
+ cmpd = fp[0]; fp = fp[1]
+ if @model.activities[cmpd] # row good
+ acts = @model.activities[cmpd]; @acts += acts
+ LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
+ row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
+ acts.size.times { # multiple additions for multiple activities
+ @n_prop << row.collect
+ @cmpds << cmpd
+ @fps << Marshal.load(Marshal.dump(fp))
+ }
+ else
+ LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
+ end
+ }
+
+ @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure
+
+ end
+
+ def props
+ @model.parameter("propositionalized") ? [ @n_prop, @q_prop ] : nil
+ end
+
+ end
+
+ end
+ end
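
Note: a usage sketch for the new scalers, assuming this file plus its gsl and statsample dependencies are loaded (Array#to_gv comes from the gsl bindings):

    vs = [1.0, 2.0, 4.0, 8.0].to_gv             # GSL::Vector
    as = OpenTox::Transform::AutoScale.new(vs)  # center on mean, divide by stdev
    scaled   = as.vs                            # zero-mean, unit-variance copy
    restored = as.restore(scaled)               # ~ [1.0, 2.0, 4.0, 8.0]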