opentox-ruby 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,7 +70,7 @@ module OpenTox
 
   begin
     #LOGGER.debug "RestCall: "+rest_call.to_s+" "+uri.to_s+" "+headers.inspect+" "+payload.inspect
-    resource = RestClient::Resource.new(uri,{:timeout => 60})
+    resource = RestClient::Resource.new(uri,{:timeout => 600})
     if rest_call=="post" || rest_call=="put"
       result = resource.send(rest_call, payload, headers)
     else
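
The only change in this hunk raises the client-side timeout from 60 to 600 seconds, so long-running service calls (e.g. model building) are no longer cut off after a minute. A minimal sketch of the rest-client API in question (the URI is illustrative, not from the gem):

    require 'rubygems'
    require 'rest_client'

    # :timeout is passed through to the underlying HTTP request;
    # 600 s matches the new value in 3.1.0.
    resource = RestClient::Resource.new("http://example.org/task/1", {:timeout => 600})
    result = resource.get   # GET; post/put additionally take (payload, headers)
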
data/lib/serializer.rb CHANGED
@@ -55,7 +55,7 @@ module OpenTox
   OT.predictedVariables => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
   OT.paramValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
 
-  #object props for validation#
+  #object props for validation#
   OT.model => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
   OT.trainingDataset => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
   OT.predictionFeature => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
@@ -87,7 +87,7 @@ module OpenTox
   OT.percentageCompleted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
   OT.acceptValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
 
-  # annotation props for validation
+  # annotation props for validation
   OT.numUnpredicted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
   OT.crossvalidationFold => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
   OT.numInstances => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
@@ -143,8 +143,8 @@ module OpenTox
   @data_entries = {}
   @values_id = 0
   @parameter_id = 0
-
-  @classes = Set.new
+
+  @classes = Set.new
   @object_properties = Set.new
   @annotation_properties = Set.new
   @datatype_properties = Set.new
@@ -208,7 +208,7 @@ module OpenTox
     @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => OT.Task }] }
     add_metadata uri, metadata
   end
-
+
   # Add a resource defined by resource_class and content
   # (see documentation of add_content for example)
   # @param [String] uri of resource
@@ -223,10 +223,10 @@ module OpenTox
   def add_uri(uri,type)
     @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => type }] }
   end
-
+
   private
   @@content_id = 1
-
+
   #Recursiv function to add content
   #@example
   #  { DC.description => "bla",
@@ -244,7 +244,7 @@ module OpenTox
   hash.each do |u,v|
     if v.is_a? Hash
       # value is again a hash, i.e. a new owl class is added
-      # first make sure type (==class) is set
+      # first make sure type (==class) is set
       type = v[RDF.type]
       raise "type missing for "+u.to_s+" content:\n"+v.inspect unless type
       raise "class unknown "+type.to_s+" (for "+u.to_s+")" unless @object.has_key?(type)
@@ -256,7 +256,7 @@ module OpenTox
       # add content to new class
       add_content(genid,v)
     elsif v.is_a? Array
-      # value is an array, i.e. a list of values with property is added
+      # value is an array, i.e. a list of values with property is added
       v.each{ |vv| add_content( uri, { u => vv } ) }
     else # v.is_a? String
       # simple string value
@@ -268,7 +268,7 @@ module OpenTox
       end
     end
   end
-
+
   public
 
   # Add metadata
@@ -329,7 +329,7 @@ module OpenTox
     v = [{ "type" => "uri", "value" => value}]
   when "literal"
     v = [{ "type" => "literal", "value" => value, "datatype" => datatype(value) }]
-  else
+  else
     raise "Illegal type #{type(value)} for #{value}."
   end
   @object[values] = {
@@ -342,7 +342,7 @@ module OpenTox
   end
 
   # Serializers
-
+
   # Convert to N-Triples
   # @return [text/plain] Object OWL-DL in N-Triples format
   def to_ntriples
@@ -353,7 +353,7 @@ module OpenTox
   entry.each do |p,objects|
     p = url(p)
     objects.each do |o|
-      case o["type"]
+      case o["type"]
       when "uri"
         o = url(o["value"])
       when "literal"
@@ -371,9 +371,15 @@ module OpenTox
   # Convert to RDF/XML
   # @return [text/plain] Object OWL-DL in RDF/XML format
   def to_rdfxml
-    Tempfile.open("owl-serializer"){|f| f.write(self.to_ntriples); @path = f.path}
+    tmpf = Tempfile.open("owl-serializer")
+    tmpf.write(self.to_ntriples)
+    tmpf.flush
+    @path = tmpf.path
     # TODO: add base uri for ist services
-    `rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+    res=`rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+    tmpf.close
+    tmpf.delete
+    res
   end
 
   # Convert to JSON as specified in http://n2.talis.com/wiki/RDF_JSON_Specification
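
The to_rdfxml rewrite keeps the Tempfile handle alive until the external rapper call has read the file, flushing explicitly before the shell-out and deleting afterwards; the old one-liner block could leave the buffer unflushed and the file eligible for finalization before rapper ran. A stdlib-only sketch of the pattern (the wc call stands in for rapper):

    require 'tempfile'

    tmpf = Tempfile.open("owl-serializer")
    tmpf.write("<a> <b> <c> .")  # payload for the external tool
    tmpf.flush                   # ensure bytes are on disk before shelling out
    res = `wc -c #{tmpf.path}`   # external process reads the file by path
    tmpf.close
    tmpf.delete                  # deterministic cleanup instead of waiting for GC
    res
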
@@ -427,20 +433,20 @@ module OpenTox
   end
 
   def literal(value,type)
-    # concat and << are faster string concatination operators than +
+    # concat and << are faster string concatination operators than +
     '"'.concat(value.to_s).concat('"^^<').concat(type).concat('>')
   end
 
   def url(uri)
-    # concat and << are faster string concatination operators than +
+    # concat and << are faster string concatination operators than +
     '<'.concat(uri).concat('>')
   end
 
   def rdf_types
-    @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
-    @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
-    @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
-    @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
+    @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
+    @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
+    @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
+    @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
   end
 
 end
@@ -457,35 +463,46 @@ module OpenTox
     @rows.first << features
     @rows.first.flatten!
     dataset.data_entries.each do |compound,entries|
-      smiles = Compound.new(compound).to_smiles
+      cmpd = Compound.new(compound)
+      smiles = cmpd.to_smiles
+      inchi = URI.encode_www_form_component(cmpd.to_inchi)
+      row_container = Array.new
       row = Array.new(@rows.first.size)
-      row[0] = smiles
+      row_container << row
+      #row[0] = smiles
+      row[0] = inchi
       entries.each do |feature, values|
         i = features.index(feature)+1
         values.each do |value|
-          if row[i]
-            row[i] = "#{row[i]} #{value}" # multiple values
+          if row_container[0][i]
+            #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
+            row_container << row_container.last.collect
+            row_container.last[i] = value
+            #LOGGER.debug "RC: #{row_container.to_yaml}"
           else
-            row[i] = value
+            row_container.each { |r| r[i] = value }
           end
         end
       end
-      @rows << row
+      row_container.each { |r| @rows << r }
     end
   end
 
   # Convert to CSV string
   # @return [String] CSV string
   def to_csv
-    @rows.collect{|r| r.join(", ")}.join("\n")
+    rows = @rows.collect
+    result = ""
+    result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
+    result << rows.collect{ |r| r.join(",") }.join("\n")
   end
 
   # Convert to spreadsheet workbook
   # @return [Spreadsheet::Workbook] Workbook object (use the spreadsheet gemc to write a file)
-  def to_spreadsheet
+  def to_spreadsheet(sheetname="sheet1")
     Spreadsheet.client_encoding = 'UTF-8'
     book = Spreadsheet::Workbook.new
-    sheet = book.create_worksheet(:name => '')
+    sheet = book.create_worksheet(:name => "#{sheetname}")
     sheet.column(0).width = 100
     i = 0
     @rows.each do |row|
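
This hunk changes several things at once: rows are keyed by URL-encoded InChI instead of SMILES; multiple values per feature now expand into one row per value (via row_container) instead of being joined into a single cell; to_csv emits only the last path segment of each feature URI as the header; and to_spreadsheet takes an optional worksheet name. A usage sketch of the spreadsheet-gem calls the new code relies on (dataset and output path are illustrative):

    require 'rubygems'
    require 'spreadsheet'

    Spreadsheet.client_encoding = 'UTF-8'
    book = Spreadsheet::Workbook.new
    sheet = book.create_worksheet(:name => "training")  # 3.1.0 names the sheet
    sheet.row(0).concat ["inchi", "feature_1"]
    book.write("/tmp/example.xls")
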
@@ -0,0 +1,201 @@
+
+ nominal_to_binary <- function( data )
+ {
+   result = NULL
+   for (i in 1:ncol(data))
+   {
+     #print(i)
+     if (is.numeric( data[,i] ) )
+     {
+       if (is.null(result))
+         result = data.frame(data[,i])
+       else
+         result = data.frame(result, data[,i])
+       colnames(result)[ncol(result)] <- colnames(data)[i]
+     }
+     else
+     {
+       vals = unique(data[,i])
+       for (j in 1:length(vals))
+       {
+         #print(j)
+         bins = c()
+         for (k in 1:nrow(data))
+         {
+           if(data[,i][k] == vals[j])
+             bins = c(bins,1)
+           else
+             bins = c(bins,0)
+         }
+         #print(bins)
+         if (is.null(result))
+           result = data.frame(bins)
+         else
+           result = data.frame(result, bins)
+         colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
+         if (length(vals)==2) break
+       }
+     }
+   }
+   #print(head(result))
+   result
+ }
+
+ process_data <- function( data )
+ {
+   data.num <- as.data.frame(data)
+   if (!is.numeric(data.num))
+   {
+     data.num = nominal_to_binary(data.num)
+   }
+   if(any(is.na(data.num)))
+   {
+     require("gam")
+     data.repl = na.gam.replace(data.num)
+   }
+   else
+     data.repl = data.num
+   data.repl
+ }
+
+ cluster <- function( data, min=10, max=15 )
+ {
+   require("vegan")
+   max <- min(max,nrow(unique(data)))
+   max <- min(max,nrow(data)-1)
+   if (min>max)
+     min=max
+   print(paste("cascade k-means ",min," - ",max))
+   s = cascadeKM(data,min,max,iter=30)
+   m = max.col(s$results)[2]
+   print(paste("best k-means clustering result: ",((m-1)+min)," num clusters"))
+   cbind(s$partition[,m])
+ }
+
+ stratified_split <- function( data, ratio=0.3, method="cluster" )
+ {
+   data.processed = as.matrix(process_data( data ))
+   if (method == "samplecube")
+   {
+     require("sampling")
+     # adjust ratio to make samplecube return exact number of samples
+     ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+     pik = rep(ratio,times=nrow(data.processed))
+     data.strat = cbind(pik,data.processed)
+     samplecube(data.strat,pik,order=2,comment=F)
+   }
+   else if (method == "cluster")
+   {
+     cl = cluster(data.processed)
+     # require("caret")
+     # res = createDataPartition(cl,p=ratio)
+     # split = rep(1, times=nrow(data))
+     # for (j in 1:nrow(data))
+     #   if ( is.na(match(j,res$Resample1)) )
+     #     split[j]=0
+     # split
+     require("sampling")
+     stratified_split(cl,ratio,"samplecube")
+   }
+   else
+     stop("unknown method")
+ }
+
+ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+ {
+   print(paste(num_folds,"-fold-split, data-size",nrow(data)))
+   data.processed = as.matrix(process_data( data ))
+   if (method == "samplecube")
+   {
+     folds = rep(0, times=nrow(data))
+     for (i in 1:(num_folds-1))
+     {
+       require("sampling")
+       prop = 1/(num_folds-(i-1))
+       print(paste("fold",i,"/",num_folds," prop",prop))
+       pik = rep(prop,times=nrow(data))
+       for (j in 1:nrow(data))
+         if(folds[j]!=0)
+           pik[j]=0
+       data.strat = cbind(pik,data.processed)
+       s<-samplecube(data.strat,pik,order=2,comment=F)
+       print(paste("fold size: ",sum(s)))
+       for (j in 1:nrow(data))
+         if (s[j] == 1)
+           folds[j]=i
+     }
+     for (j in 1:nrow(data))
+       if (folds[j] == 0)
+         folds[j]=num_folds
+     folds
+   }
+   else if (method == "cluster")
+   {
+     require("TunePareto")
+     cl = cluster(data.processed)
+     res = generateCVRuns(cl,ntimes=1,nfold=3)
+     folds = rep(0, times=nrow(data))
+     for (i in 1:num_folds)
+       for(j in 1:length(res[[1]][[i]]))
+         folds[res[[1]][[i]][j]]=i
+     folds
+   }
+   else
+     stop("unknown method")
+ }
+
+ plot_pre_process <- function( data, method="pca" )
+ {
+   data.processed = process_data( data )
+   if (method == "pca")
+   {
+     data.pca <- prcomp(data.processed, scale=TRUE)
+     as.data.frame(data.pca$x)[1:2]
+   }
+   else if (method == "smacof")
+   {
+     require("smacof")
+     data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
+     data.emb$conf
+   }
+   else
+     stop("unknown method")
+ }
+
+ plot_split <- function( data, split, names=NULL, ... )
+ {
+   if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
+     stop("data not suitable for plotting, plot_pre_process() first")
+
+   plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
+   if (is.null(names))
+     names <- c("split 1","split 2")
+   colos = as.double(rep(2:(max(split)+2)))
+   legend("topleft",names,pch=2,col=colos)
+
+   for (j in max(split):0)
+   {
+     set = c()
+     for (i in 1:nrow(data))
+       if (split[i] == j)
+         set = c(set,i)
+     points(data[set,], pch = 2, col=(j+2))
+   }
+ }
+
+ #a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
+ #b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+ #data<-rbind(a,b)
+ #c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
+ #data<-rbind(data,c)
+ #data=iris
+ #split = stratified_k_fold_split(data, num_folds=3)
+ #split = stratified_split(data, ratio=0.33, method="cluster")
+ #print(sum(split))
+ #plot_split(plot_pre_process(data),split,c("training","test"))
+
+ #cl = cluster(data)
+
+
+
+
data/lib/task.rb CHANGED
@@ -242,16 +242,20 @@ module OpenTox
   # waits for a task, unless time exceeds or state is no longer running
   # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
   # @param [optional,Numeric] dur seconds pausing before cheking again for completion
-  def wait_for_completion( waiting_task=nil, dur=0.3)
+  def wait_for_completion( waiting_task=nil)
 
     waiting_task.waiting_for(self.uri) if waiting_task
     due_to_time = Time.new + DEFAULT_TASK_MAX_DURATION
+    start_time = Time.new
+    dur = 0
     LOGGER.debug "start waiting for task "+@uri.to_s+" at: "+Time.new.to_s+", waiting at least until "+due_to_time.to_s
 
     load_metadata # for extremely fast tasks
     check_state
     while self.running? or self.queued?
       sleep dur
+      dur = [[(Time.new - start_time)/20.0,0.3].max,300.0].min
+      #LOGGER.debug "task-object-id: #{self.object_id} - wait: #{"%.2f"%(Time.new - start_time)} - dur: #{"%.2f"%dur}"
       load_metadata
       # if another (sub)task is waiting for self, set progress accordingly
       waiting_task.progress(@metadata[OT.percentageCompleted].to_f) if waiting_task
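
wait_for_completion drops the fixed dur parameter in favour of an adaptive poll interval: one twentieth of the elapsed waiting time, clamped to [0.3 s, 300 s], so fast tasks are polled quickly while long-running ones stop hammering the service. A standalone sketch of the schedule (the helper name is hypothetical; the formula is the one from the diff):

    # Poll interval as a function of elapsed seconds.
    def poll_interval(elapsed)
      [[elapsed / 20.0, 0.3].max, 300.0].min
    end

    [1, 10, 60, 600, 10_000].each do |t|
      puts "after #{t}s: sleep #{poll_interval(t)}s"
    end
    # => 0.3, 0.5, 3.0, 30.0, 300.0
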
data/lib/transform.rb ADDED
@@ -0,0 +1,520 @@
+ module OpenTox
+   module Transform
+     # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+
+     # LogAutoScaler for GSL vectors.
+     # Take log and scale.
+     class LogAutoScale
+       attr_accessor :vs, :offset, :autoscaler
+
+       # @param [GSL::Vector] Values to transform using LogAutoScaling.
+       def initialize values
+         @distance_to_zero = 1.0
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           vs = values.clone
+           @offset = vs.min - @distance_to_zero
+           @autoscaler = OpenTox::Transform::AutoScale.new mvlog(vs)
+           @vs = @autoscaler.vs
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] values to restore.
+       # @return [GSL::Vector] transformed values.
+       def restore values
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           vs = values.clone
+           rv = @autoscaler.restore(vs)
+           rv.to_a.collect { |v| (10**v) + @offset }.to_gv
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] values to transform.
+       # @return [GSL::Vector] transformed values.
+       def mvlog values
+         values.to_a.collect { |v| Math::log10(v - @offset) }.to_gv
+       end
+
+     end
+
+
+     # Auto-Scaler for GSL vectors.
+     # Center on mean and divide by standard deviation.
+     class AutoScale
+       attr_accessor :vs, :mean, :stdev
+
+       # @param [GSL::Vector] values to transform using AutoScaling.
+       def initialize values
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           vs = values.clone
+           @mean = vs.to_scale.mean
+           @stdev = vs.to_scale.standard_deviation_population
+           @stdev = 0.0 if @stdev.nan?
+           @vs = transform vs
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] values to transform.
+       # @return [GSL::Vector] transformed values.
+       def transform values
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           autoscale values.clone
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] Values to restore.
+       # @return [GSL::Vector] transformed values.
+       def restore values
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
+           (rv_ss + @mean).to_gsl
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] values to transform.
+       # @return [GSL::Vector] transformed values.
+       def autoscale values
+         vs_ss = values.clone.to_scale - @mean
+         @stdev == 0.0 ? vs_ss.to_gsl : ( vs_ss * ( 1 / @stdev) ).to_gsl
+       end
+
+     end
+
+
+     # Principal Components Analysis.
+     class PCA
+       attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+       # Creates a transformed dataset as GSL::Matrix.
+       #
+       # @param [GSL::Matrix] Data matrix.
+       # @param [Float] Compression ratio from [0,1], default 0.05.
+       # @return [GSL::Matrix] Data transformed matrix.
+       def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0)
+         begin
+           @data_matrix = data_matrix.clone
+           @compression = compression.to_f
+           @mean = Array.new
+           @autoscaler = Array.new
+           @cols = Array.new
+           @maxcols = maxcols
+
+           # Objective Feature Selection
+           raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+           @data_matrix_selected = nil
+           (0..@data_matrix.size2-1).each { |i|
+             if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+               if @data_matrix_selected.nil?
+                 @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+                 @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+               else
+                 @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+               end
+               @cols << i
+             end
+           }
+           raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+           # PCA uses internal centering on 0
+           @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
+           (0..@cols.size-1).each { |i|
+             as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i))
+             @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
+             @mean << as.mean
+             @autoscaler << as
+           }
+
+           # PCA
+           data_matrix_hash = Hash.new
+           (0..@cols.size-1).each { |i|
+             column_view = @data_matrix_scaled.col(i)
+             data_matrix_hash[i] = column_view.to_scale
+           }
+           dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+           cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+           pca=Statsample::Factor::PCA.new(cor_matrix)
+
+           # Select best eigenvectors
+           pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+           @eigenvalue_sums = Array.new
+           (0..@cols.size-1).each { |i|
+             @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+           }
+           eigenvectors_selected = Array.new
+           pca.eigenvectors.each_with_index { |ev, i|
+             if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
+               eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
+             end
+           }
+           @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
+           @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
+
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # Transforms data to feature space found by PCA.
+       #
+       # @param [GSL::Matrix] Data matrix.
+       # @return [GSL::Matrix] Transformed data matrix.
+       def transform values
+         begin
+           vs = values.clone
+           raise "Error! Too few columns for transformation." if vs.size2 < @cols.max
+           data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
+           @cols.each_with_index { |i,j|
+             data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
+           }
+           (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # Restores data in the original feature space (possibly with compression loss).
+       #
+       # @param [GSL::Matrix] Transformed data matrix.
+       # @return [GSL::Matrix] Data matrix.
+       def restore
+         begin
+           data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+           # reverse scaling
+           (0..@cols.size-1).each { |i|
+             data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+           }
+           data_matrix_restored
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+     end
+
+
+     # Singular Value Decomposition
+     class SVD
+       attr_accessor :data_matrix, :compression, :data_transformed_matrix, :uk, :vk, :eigk, :eigk_inv
+
+       # Creates a transformed dataset as GSL::Matrix.
+       #
+       # @param [GSL::Matrix] Data matrix
+       # @param [Float] Compression ratio from [0,1], default 0.05
+       # @return [GSL::Matrix] Data transformed matrix
+
+       def initialize data_matrix, compression=0.05
+         begin
+           @data_matrix = data_matrix.clone
+           @compression = compression
+
+           # Compute the SV Decomposition X=USV
+           # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
+           u, vt, s = data_matrix.SV_decomp
+
+           # Determine cutoff index
+           s2 = s.mul(s) ; s2_sum = s2.sum
+           s2_run = 0
+           k = s2.size - 1
+           s2.to_a.reverse.each { |v|
+             s2_run += v
+             frac = s2_run / s2_sum
+             break if frac > compression
+             k -= 1
+           }
+           k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)
+
+           # Take the k-rank approximation of the Matrix
+           # - Take first k columns of u
+           # - Take first k columns of vt
+           # - Take the first k eigenvalues
+           @uk = u.submatrix(nil, (0..k)) # used to transform column format data
+           @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
+           s = GSL::Matrix.diagonal(s)
+           @eigk = s.submatrix((0..k), (0..k))
+           @eigk_inv = @eigk.inv
+
+           # Transform data
+           @data_transformed_matrix = @uk # = u for all SVs
+           # NOTE: @data_transformed_matrix is also equal to
+           # @data_matrix * @vk * @eigk_inv
+
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+
+       # Transforms data instance (1 row) to feature space found by SVD.
+       #
+       # @param [GSL::Matrix] Data matrix (1 x m).
+       # @return [GSL::Matrix] Transformed data matrix.
+       def transform_instance values
+         begin
+           values * @vk * @eigk_inv
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+       alias :transform :transform_instance # make this the default (see PCA interface)
+
+       # Transforms data feature (1 column) to feature space found by SVD.
+       #
+       # @param [GSL::Matrix] Data matrix (1 x n).
+       # @return [GSL::Matrix] Transformed data matrix.
+       def transform_feature values
+         begin
+           values * @uk * @eigk_inv
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+
+       # Restores data in the original feature space (possibly with compression loss).
+       #
+       # @param [GSL::Matrix] Transformed data matrix.
+       # @return [GSL::Matrix] Data matrix.
+       def restore
+         begin
+           @data_transformed_matrix * @eigk * @vk.transpose # reverse svd
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+
+     end
+
+
+
+     # Attaches transformations to an OpenTox::Model
+     # Stores props, sims, performs similarity calculations
+     class ModelTransformer
+       attr_accessor :model, :similarity_algorithm, :acts, :sims
+
+       # @params[OpenTox::Model] model to transform
+       def initialize model
+         @model = model
+         @similarity_algorithm = @model.similarity_algorithm
+       end
+
+       def transform
+         get_matrices # creates @n_prop, @q_prop, @acts from ordered fps
+         @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors
+
+         # Preprocessing
+         if (@model.similarity_algorithm == "Similarity.cosine")
+           # truncate nil-columns and -rows
+           LOGGER.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+           while @q_prop.size>0
+             idx = @q_prop.index(nil)
+             break if idx.nil?
+             @q_prop.slice!(idx)
+             @n_prop.each { |r| r.slice!(idx) }
+           end
+           LOGGER.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+           remove_nils # removes nil cells (for cosine); alters @n_props, @q_props, cuts down @ids to survivors
+           LOGGER.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+
+           # adjust rest
+           fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp
+           cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp
+           acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+           # scale and svd
+           nr_cases, nr_features = @n_prop.size, @n_prop[0].size
+           gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup
+           gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup
+           (0...nr_features).each { |i|
+             autoscaler = OpenTox::Transform::AutoScale.new(gsl_n_prop.col(i))
+             gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs
+             gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i)
+           }
+           svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0)
+           @n_prop = svd.data_transformed_matrix.to_a
+           @q_prop = svd.transform(gsl_q_prop).row(0).to_a
+           LOGGER.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+         else
+           convert_nils # convert nil cells (for tanimoto); leave @n_props, @q_props, @ids untouched
+         end
+
+         # neighbor calculation
+         @ids = [] # surviving compounds become neighbors
+         @sims = [] # calculated by neighbor routine
+         neighbors
+         n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix
+         acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+
+         # Sims between neighbors, if necessary
+         gram_matrix = []
+         if !@model.parameter("propositionalized") # need gram matrix for standard setting (n. prop.)
+           @n_prop.each_index do |i|
+             gram_matrix[i] = [] unless gram_matrix[i]
+             @n_prop.each_index do |j|
+               if (j>i)
+                 sim = eval("OpenTox::Algorithm::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])")
+                 gram_matrix[i][j] = sim
+                 gram_matrix[j] = [] unless gram_matrix[j]
+                 gram_matrix[j][i] = gram_matrix[i][j]
+               end
+             end
+             gram_matrix[i][i] = 1.0
+           end
+         end
+
+         # reclaim original data (if svd was performed)
+         if svd
+           @n_prop = gsl_n_prop_orig.to_a
+           n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp
+           @q_prop = gsl_q_prop_orig.row(0).to_a
+         end
+
+         LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+         LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
+
+         @sims = [ gram_matrix, @sims ]
+
+       end
+
+
+
+
+       # Find neighbors and store them as object variable, access all compounds for that.
+       def neighbors
+         @model.neighbors = []
+         @n_prop.each_with_index do |fp, idx| # AM: access all compounds
+           add_neighbor fp, idx
+         end
+       end
+
+
+       # Adds a neighbor to @neighbors if it passes the similarity threshold
+       # adjusts @ids to signal the
+       def add_neighbor(training_props, idx)
+
+         sim = similarity(training_props)
+         if sim > @model.parameter("min_sim")
+           if @model.activities[@cmpds[idx]]
+             @model.activities[@cmpds[idx]].each do |act|
+               @model.neighbors << {
+                 :compound => @cmpds[idx],
+                 :similarity => sim,
+                 :features => @fps[idx].keys,
+                 :activity => act
+               }
+               @sims << sim
+               @ids << idx
+             end
+           end
+         end
+       end
+
+
+       # Removes nil entries from n_prop and q_prop.
+       # Matrix is a nested two-dimensional array.
+       # Removes iteratively rows or columns with the highest fraction of nil entries, until all nil entries are removed.
+       # Tie break: columns take precedence.
+       # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero.
+       # Enables the use of cosine similarity / SVD
+       def remove_nils
+         return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0)
+         col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+         row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+         m_cols = col_nr_nils.max
+         m_rows = row_nr_nils.max
+         idx_cols = col_nr_nils.index(m_cols)
+         idx_rows = row_nr_nils.index(m_rows)
+         while ((m_cols > 0) || (m_rows > 0)) do
+           if m_cols >= m_rows
+             @n_prop.each { |row| row.slice!(idx_cols) }
+             @q_prop.slice!(idx_cols)
+           else
+             @n_prop.slice!(idx_rows)
+             @ids.slice!(idx_rows)
+           end
+           break if (@n_prop.length == 0) || (@n_prop[0].length == 0)
+           col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+           row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+           m_cols = col_nr_nils.max
+           m_rows = row_nr_nils.max
+           idx_cols= col_nr_nils.index(m_cols)
+           idx_rows = row_nr_nils.index(m_rows)
+         end
+       end
+
+
+       # Replaces nils by zeroes in n_prop and q_prop
+       # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop)
+       def convert_nils
+         @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } }
+         @q_prop.collect! { |v| v.nil? ? 0 : v }
+       end
+
+
+       # Executes model similarity_algorithm
+       def similarity(training_props)
+         eval("OpenTox::Algorithm::#{@model.similarity_algorithm}(training_props, @q_prop)")
+       end
+
+
+       # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed.
+       # Same for compound fingerprints.
+       def get_matrices
+
+         @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
+
+         @model.fingerprints.each { |fp|
+           cmpd = fp[0]; fp = fp[1]
+           if @model.activities[cmpd] # row good
+             acts = @model.activities[cmpd]; @acts += acts
+             LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
+             row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
+             acts.size.times { # multiple additions for multiple activities
+               @n_prop << row.collect
+               @cmpds << cmpd
+               @fps << Marshal.load(Marshal.dump(fp))
+             }
+           else
+             LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
+           end
+         }
+
+         @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure
+
+       end
+
+       def props
+         @model.parameter("propositionalized") ? [ @n_prop, @q_prop ] : nil
+       end
+
+     end
+
+   end
+ end
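
The new Transform module wraps scaling, PCA, and SVD behind a uniform initialize/transform/restore interface. A minimal round-trip sketch for AutoScale, assuming the gsl and statsample gems the module builds on are installed and opentox-ruby is loaded:

    require 'rubygems'
    require 'opentox-ruby'

    values = GSL::Vector.alloc(1.0, 2.0, 3.0, 4.0)
    scaler = OpenTox::Transform::AutoScale.new(values)
    scaled = scaler.vs                 # centered on the mean, divided by stdev
    restored = scaler.restore(scaled)  # recovers the original vector
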