opentox-ruby 3.0.1 → 3.1.0
- data/ChangeLog +8 -0
- data/Rakefile +2 -3
- data/VERSION +1 -1
- data/lib/algorithm.rb +227 -675
- data/lib/authorization.rb +10 -8
- data/lib/compound.rb +47 -11
- data/lib/dataset.rb +50 -2
- data/lib/environment.rb +6 -1
- data/lib/model.rb +37 -72
- data/lib/opentox-ruby.rb +1 -1
- data/lib/parser.rb +115 -57
- data/lib/r-util.rb +354 -0
- data/lib/rest_client_wrapper.rb +1 -1
- data/lib/serializer.rb +47 -30
- data/lib/stratification.R +201 -0
- data/lib/task.rb +5 -1
- data/lib/transform.rb +520 -0
- data/lib/utils.rb +372 -0
- data/lib/validation.rb +52 -6
- metadata +413 -428
data/lib/rest_client_wrapper.rb
CHANGED
@@ -70,7 +70,7 @@ module OpenTox
       begin
         #LOGGER.debug "RestCall: "+rest_call.to_s+" "+uri.to_s+" "+headers.inspect+" "+payload.inspect
-        resource = RestClient::Resource.new(uri,{:timeout =>
+        resource = RestClient::Resource.new(uri,{:timeout => 600})
        if rest_call=="post" || rest_call=="put"
          result = resource.send(rest_call, payload, headers)
        else
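
The only change here raises the REST call timeout to 600 seconds. A minimal sketch of what the patched call does (the URI is a placeholder, not from the gem):

    require 'rest_client'

    # A resource built this way raises RestClient::RequestTimeout
    # if the server does not answer within 600 seconds.
    resource = RestClient::Resource.new("http://example.org/task/1", {:timeout => 600})
    result = resource.get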
data/lib/serializer.rb
CHANGED
@@ -55,7 +55,7 @@ module OpenTox
     OT.predictedVariables => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
     OT.paramValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,

-    #object props for validation#
+    #object props for validation#
     OT.model => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
     OT.trainingDataset => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
     OT.predictionFeature => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
@@ -87,7 +87,7 @@ module OpenTox
     OT.percentageCompleted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
     OT.acceptValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,

-    # annotation props for validation
+    # annotation props for validation
     OT.numUnpredicted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
     OT.crossvalidationFold => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
     OT.numInstances => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
@@ -143,8 +143,8 @@ module OpenTox
      @data_entries = {}
      @values_id = 0
      @parameter_id = 0
-
-      @classes = Set.new
+
+      @classes = Set.new
      @object_properties = Set.new
      @annotation_properties = Set.new
      @datatype_properties = Set.new
@@ -208,7 +208,7 @@ module OpenTox
      @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => OT.Task }] }
      add_metadata uri, metadata
    end
-
+
    # Add a resource defined by resource_class and content
    # (see documentation of add_content for example)
    # @param [String] uri of resource
@@ -223,10 +223,10 @@ module OpenTox
    def add_uri(uri,type)
      @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => type }] }
    end
-
+
    private
    @@content_id = 1
-
+
    #Recursiv function to add content
    #@example
    # { DC.description => "bla",
@@ -244,7 +244,7 @@ module OpenTox
      hash.each do |u,v|
        if v.is_a? Hash
          # value is again a hash, i.e. a new owl class is added
-          # first make sure type (==class) is set
+          # first make sure type (==class) is set
          type = v[RDF.type]
          raise "type missing for "+u.to_s+" content:\n"+v.inspect unless type
          raise "class unknown "+type.to_s+" (for "+u.to_s+")" unless @object.has_key?(type)
@@ -256,7 +256,7 @@ module OpenTox
          # add content to new class
          add_content(genid,v)
        elsif v.is_a? Array
-          # value is an array, i.e. a list of values with property is added
+          # value is an array, i.e. a list of values with property is added
          v.each{ |vv| add_content( uri, { u => vv } ) }
        else # v.is_a? String
          # simple string value
@@ -268,7 +268,7 @@ module OpenTox
        end
      end
    end
-
+
    public

    # Add metadata
@@ -329,7 +329,7 @@ module OpenTox
        v = [{ "type" => "uri", "value" => value}]
      when "literal"
        v = [{ "type" => "literal", "value" => value, "datatype" => datatype(value) }]
-      else
+      else
        raise "Illegal type #{type(value)} for #{value}."
      end
      @object[values] = {
@@ -342,7 +342,7 @@ module OpenTox
    end

    # Serializers
-
+
    # Convert to N-Triples
    # @return [text/plain] Object OWL-DL in N-Triples format
    def to_ntriples
@@ -353,7 +353,7 @@ module OpenTox
      entry.each do |p,objects|
        p = url(p)
        objects.each do |o|
-          case o["type"]
+          case o["type"]
          when "uri"
            o = url(o["value"])
          when "literal"
@@ -371,9 +371,15 @@ module OpenTox
    # Convert to RDF/XML
    # @return [text/plain] Object OWL-DL in RDF/XML format
    def to_rdfxml
-      Tempfile.open("owl-serializer")
+      tmpf = Tempfile.open("owl-serializer")
+      tmpf.write(self.to_ntriples)
+      tmpf.flush
+      @path = tmpf.path
      # TODO: add base uri for ist services
-
+      res=`rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+      tmpf.close
+      tmpf.delete
+      res
    end

    # Convert to JSON as specified in http://n2.talis.com/wiki/RDF_JSON_Specification
@@ -427,20 +433,20 @@ module OpenTox
    end

    def literal(value,type)
-      # concat and << are faster string concatination operators than +
+      # concat and << are faster string concatination operators than +
      '"'.concat(value.to_s).concat('"^^<').concat(type).concat('>')
    end

    def url(uri)
-      # concat and << are faster string concatination operators than +
+      # concat and << are faster string concatination operators than +
      '<'.concat(uri).concat('>')
    end

    def rdf_types
-      @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
-      @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
-      @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
-      @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
+      @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
+      @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
+      @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
+      @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
    end

  end
@@ -457,35 +463,46 @@ module OpenTox
      @rows.first << features
      @rows.first.flatten!
      dataset.data_entries.each do |compound,entries|
-
+        cmpd = Compound.new(compound)
+        smiles = cmpd.to_smiles
+        inchi = URI.encode_www_form_component(cmpd.to_inchi)
+        row_container = Array.new
        row = Array.new(@rows.first.size)
-
+        row_container << row
+        #row[0] = smiles
+        row[0] = inchi
        entries.each do |feature, values|
          i = features.index(feature)+1
          values.each do |value|
-            if
-
+            if row_container[0][i]
+              #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
+              row_container << row_container.last.collect
+              row_container.last[i] = value
+              #LOGGER.debug "RC: #{row_container.to_yaml}"
            else
-
+              row_container.each { |r| r[i] = value }
            end
          end
        end
-        @rows <<
+        row_container.each { |r| @rows << r }
      end
    end

    # Convert to CSV string
    # @return [String] CSV string
    def to_csv
-      @rows.collect
+      rows = @rows.collect
+      result = ""
+      result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
+      result << rows.collect{ |r| r.join(",") }.join("\n")
    end

    # Convert to spreadsheet workbook
    # @return [Spreadsheet::Workbook] Workbook object (use the spreadsheet gemc to write a file)
-    def to_spreadsheet
+    def to_spreadsheet(sheetname="sheet1")
      Spreadsheet.client_encoding = 'UTF-8'
      book = Spreadsheet::Workbook.new
-      sheet = book.create_worksheet(:name =>
+      sheet = book.create_worksheet(:name => "#{sheetname}")
      sheet.column(0).width = 100
      i = 0
      @rows.each do |row|
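
The CSV serializer now emits one row per activity value: when a compound already has a value in a cell, the row set is duplicated so each value lands in its own row. A minimal standalone sketch of that duplication logic (names are illustrative, not from the gem):

    # Start with one template row; fork it whenever a cell is already taken.
    row_container = [Array.new(3)]
    values = { 1 => ["a", "b"] }                  # feature index 1 has two values
    values.each do |i, vs|
      vs.each do |value|
        if row_container[0][i]                    # cell occupied: duplicate last row
          row_container << row_container.last.dup
          row_container.last[i] = value
        else                                      # cell free: fill it in every row
          row_container.each { |r| r[i] = value }
        end
      end
    end
    # => [[nil, "a", nil], [nil, "b", nil]]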
data/lib/stratification.R
ADDED
@@ -0,0 +1,201 @@
+
+nominal_to_binary <- function( data )
+{
+  result = NULL
+  for (i in 1:ncol(data))
+  {
+    #print(i)
+    if (is.numeric( data[,i] ) )
+    {
+      if (is.null(result))
+        result = data.frame(data[,i])
+      else
+        result = data.frame(result, data[,i])
+      colnames(result)[ncol(result)] <- colnames(data)[i]
+    }
+    else
+    {
+      vals = unique(data[,i])
+      for (j in 1:length(vals))
+      {
+        #print(j)
+        bins = c()
+        for (k in 1:nrow(data))
+        {
+          if(data[,i][k] == vals[j])
+            bins = c(bins,1)
+          else
+            bins = c(bins,0)
+        }
+        #print(bins)
+        if (is.null(result))
+          result = data.frame(bins)
+        else
+          result = data.frame(result, bins)
+        colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
+        if (length(vals)==2) break
+      }
+    }
+  }
+  #print(head(result))
+  result
+}
+
+process_data <- function( data )
+{
+  data.num <- as.data.frame(data)
+  if (!is.numeric(data.num))
+  {
+    data.num = nominal_to_binary(data.num)
+  }
+  if(any(is.na(data.num)))
+  {
+    require("gam")
+    data.repl = na.gam.replace(data.num)
+  }
+  else
+    data.repl = data.num
+  data.repl
+}
+
+cluster <- function( data, min=10, max=15 )
+{
+  require("vegan")
+  max <- min(max,nrow(unique(data)))
+  max <- min(max,nrow(data)-1)
+  if (min>max)
+    min=max
+  print(paste("cascade k-means ",min," - ",max))
+  s = cascadeKM(data,min,max,iter=30)
+  m = max.col(s$results)[2]
+  print(paste("best k-means clustering result: ",((m-1)+min)," num clusters"))
+  cbind(s$partition[,m])
+}
+
+stratified_split <- function( data, ratio=0.3, method="cluster" )
+{
+  data.processed = as.matrix(process_data( data ))
+  if (method == "samplecube")
+  {
+    require("sampling")
+    # adjust ratio to make samplecube return exact number of samples
+    ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+    pik = rep(ratio,times=nrow(data.processed))
+    data.strat = cbind(pik,data.processed)
+    samplecube(data.strat,pik,order=2,comment=F)
+  }
+  else if (method == "cluster")
+  {
+    cl = cluster(data.processed)
+#    require("caret")
+#    res = createDataPartition(cl,p=ratio)
+#    split = rep(1, times=nrow(data))
+#    for (j in 1:nrow(data))
+#      if ( is.na(match(j,res$Resample1)) )
+#        split[j]=0
+#    split
+    require("sampling")
+    stratified_split(cl,ratio,"samplecube")
+  }
+  else
+    stop("unknown method")
+}
+
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+{
+  print(paste(num_folds,"-fold-split, data-size",nrow(data)))
+  data.processed = as.matrix(process_data( data ))
+  if (method == "samplecube")
+  {
+    folds = rep(0, times=nrow(data))
+    for (i in 1:(num_folds-1))
+    {
+      require("sampling")
+      prop = 1/(num_folds-(i-1))
+      print(paste("fold",i,"/",num_folds," prop",prop))
+      pik = rep(prop,times=nrow(data))
+      for (j in 1:nrow(data))
+        if(folds[j]!=0)
+          pik[j]=0
+      data.strat = cbind(pik,data.processed)
+      s<-samplecube(data.strat,pik,order=2,comment=F)
+      print(paste("fold size: ",sum(s)))
+      for (j in 1:nrow(data))
+        if (s[j] == 1)
+          folds[j]=i
+    }
+    for (j in 1:nrow(data))
+      if (folds[j] == 0)
+        folds[j]=num_folds
+    folds
+  }
+  else if (method == "cluster")
+  {
+    require("TunePareto")
+    cl = cluster(data.processed)
+    res = generateCVRuns(cl,ntimes=1,nfold=3)
+    folds = rep(0, times=nrow(data))
+    for (i in 1:num_folds)
+      for(j in 1:length(res[[1]][[i]]))
+        folds[res[[1]][[i]][j]]=i
+    folds
+  }
+  else
+    stop("unknown method")
+}
+
+plot_pre_process <- function( data, method="pca" )
+{
+  data.processed = process_data( data )
+  if (method == "pca")
+  {
+    data.pca <- prcomp(data.processed, scale=TRUE)
+    as.data.frame(data.pca$x)[1:2]
+  }
+  else if (method == "smacof")
+  {
+    require("smacof")
+    data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
+    data.emb$conf
+  }
+  else
+    stop("unknown method")
+}
+
+plot_split <- function( data, split, names=NULL, ... )
+{
+  if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
+    stop("data not suitable for plotting, plot_pre_process() first")
+
+  plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
+  if (is.null(names))
+    names <- c("split 1","split 2")
+  colos = as.double(rep(2:(max(split)+2)))
+  legend("topleft",names,pch=2,col=colos)
+
+  for (j in max(split):0)
+  {
+    set = c()
+    for (i in 1:nrow(data))
+      if (split[i] == j)
+        set = c(set,i)
+    points(data[set,], pch = 2, col=(j+2))
+  }
+}
+
+#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
+#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+#data<-rbind(a,b)
+#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
+#data<-rbind(data,c)
+#data=iris
+#split = stratified_k_fold_split(data, num_folds=3)
+#split = stratified_split(data, ratio=0.33, method="cluster")
+#print(sum(split))
+#plot_split(plot_pre_process(data),split,c("training","test"))
+
+#cl = cluster(data)
+
+
+
+
data/lib/task.rb
CHANGED
@@ -242,16 +242,20 @@ module OpenTox
    # waits for a task, unless time exceeds or state is no longer running
    # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
    # @param [optional,Numeric] dur seconds pausing before cheking again for completion
-    def wait_for_completion( waiting_task=nil
+    def wait_for_completion( waiting_task=nil)

      waiting_task.waiting_for(self.uri) if waiting_task
      due_to_time = Time.new + DEFAULT_TASK_MAX_DURATION
+      start_time = Time.new
+      dur = 0
      LOGGER.debug "start waiting for task "+@uri.to_s+" at: "+Time.new.to_s+", waiting at least until "+due_to_time.to_s

      load_metadata # for extremely fast tasks
      check_state
      while self.running? or self.queued?
        sleep dur
+        dur = [[(Time.new - start_time)/20.0,0.3].max,300.0].min
+        #LOGGER.debug "task-object-id: #{self.object_id} - wait: #{"%.2f"%(Time.new - start_time)} - dur: #{"%.2f"%dur}"
        load_metadata
        # if another (sub)task is waiting for self, set progress accordingly
        waiting_task.progress(@metadata[OT.percentageCompleted].to_f) if waiting_task
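
The new polling loop replaces the fixed `dur` parameter with an adaptive interval: one twentieth of the elapsed wait time, clamped to [0.3, 300] seconds, so fast tasks are polled often while long-running tasks back off. A standalone sketch of the same formula (the method name is illustrative, not from the gem):

    # Adaptive poll interval: 5% of elapsed time, clamped to 0.3..300 s.
    def poll_interval(elapsed_seconds)
      [[elapsed_seconds / 20.0, 0.3].max, 300.0].min
    end

    poll_interval(1)      # => 0.3
    poll_interval(600)    # => 30.0
    poll_interval(10_000) # => 300.0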
data/lib/transform.rb
ADDED
@@ -0,0 +1,520 @@
+module OpenTox
+  module Transform
+    # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+
+    # LogAutoScaler for GSL vectors.
+    # Take log and scale.
+    class LogAutoScale
+      attr_accessor :vs, :offset, :autoscaler
+
+      # @param [GSL::Vector] Values to transform using LogAutoScaling.
+      def initialize values
+        @distance_to_zero = 1.0
+        begin
+          raise "Cannot transform, values empty." if values.size==0
+          vs = values.clone
+          @offset = vs.min - @distance_to_zero
+          @autoscaler = OpenTox::Transform::AutoScale.new mvlog(vs)
+          @vs = @autoscaler.vs
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+      # @param [GSL::Vector] values to restore.
+      # @return [GSL::Vector] transformed values.
+      def restore values
+        begin
+          raise "Cannot transform, values empty." if values.size==0
+          vs = values.clone
+          rv = @autoscaler.restore(vs)
+          rv.to_a.collect { |v| (10**v) + @offset }.to_gv
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+      # @param [GSL::Vector] values to transform.
+      # @return [GSL::Vector] transformed values.
+      def mvlog values
+        values.to_a.collect { |v| Math::log10(v - @offset) }.to_gv
+      end
+
+    end
+
+
+    # Auto-Scaler for GSL vectors.
+    # Center on mean and divide by standard deviation.
+    class AutoScale
+      attr_accessor :vs, :mean, :stdev
+
+      # @param [GSL::Vector] values to transform using AutoScaling.
+      def initialize values
+        begin
+          raise "Cannot transform, values empty." if values.size==0
+          vs = values.clone
+          @mean = vs.to_scale.mean
+          @stdev = vs.to_scale.standard_deviation_population
+          @stdev = 0.0 if @stdev.nan?
+          @vs = transform vs
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+      # @param [GSL::Vector] values to transform.
+      # @return [GSL::Vector] transformed values.
+      def transform values
+        begin
+          raise "Cannot transform, values empty." if values.size==0
+          autoscale values.clone
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+      # @param [GSL::Vector] Values to restore.
+      # @return [GSL::Vector] transformed values.
+      def restore values
+        begin
+          raise "Cannot transform, values empty." if values.size==0
+          rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
+          (rv_ss + @mean).to_gsl
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+      # @param [GSL::Vector] values to transform.
+      # @return [GSL::Vector] transformed values.
+      def autoscale values
+        vs_ss = values.clone.to_scale - @mean
+        @stdev == 0.0 ? vs_ss.to_gsl : ( vs_ss * ( 1 / @stdev) ).to_gsl
+      end
+
+    end
+
+
+    # Principal Components Analysis.
+    class PCA
+      attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+      # Creates a transformed dataset as GSL::Matrix.
+      #
+      # @param [GSL::Matrix] Data matrix.
+      # @param [Float] Compression ratio from [0,1], default 0.05.
+      # @return [GSL::Matrix] Data transformed matrix.
+      def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0)
+        begin
+          @data_matrix = data_matrix.clone
+          @compression = compression.to_f
+          @mean = Array.new
+          @autoscaler = Array.new
+          @cols = Array.new
+          @maxcols = maxcols
+
+          # Objective Feature Selection
+          raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+          @data_matrix_selected = nil
+          (0..@data_matrix.size2-1).each { |i|
+            if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+              if @data_matrix_selected.nil?
+                @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+                @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+              else
+                @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+              end
+              @cols << i
+            end
+          }
+          raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+          # PCA uses internal centering on 0
+          @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
+          (0..@cols.size-1).each { |i|
+            as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i))
+            @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
+            @mean << as.mean
+            @autoscaler << as
+          }
+
+          # PCA
+          data_matrix_hash = Hash.new
+          (0..@cols.size-1).each { |i|
+            column_view = @data_matrix_scaled.col(i)
+            data_matrix_hash[i] = column_view.to_scale
+          }
+          dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+          cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+          pca=Statsample::Factor::PCA.new(cor_matrix)
+
+          # Select best eigenvectors
+          pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+          @eigenvalue_sums = Array.new
+          (0..@cols.size-1).each { |i|
+            @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+          }
+          eigenvectors_selected = Array.new
+          pca.eigenvectors.each_with_index { |ev, i|
+            if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
+              eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
+            end
+          }
+          @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
+          @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
+
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+      # Transforms data to feature space found by PCA.
+      #
+      # @param [GSL::Matrix] Data matrix.
+      # @return [GSL::Matrix] Transformed data matrix.
+      def transform values
+        begin
+          vs = values.clone
+          raise "Error! Too few columns for transformation." if vs.size2 < @cols.max
+          data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
+          @cols.each_with_index { |i,j|
+            data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
+          }
+          (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+      # Restores data in the original feature space (possibly with compression loss).
+      #
+      # @param [GSL::Matrix] Transformed data matrix.
+      # @return [GSL::Matrix] Data matrix.
+      def restore
+        begin
+          data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+          # reverse scaling
+          (0..@cols.size-1).each { |i|
+            data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+          }
+          data_matrix_restored
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+    end
+
+
+    # Singular Value Decomposition
+    class SVD
+      attr_accessor :data_matrix, :compression, :data_transformed_matrix, :uk, :vk, :eigk, :eigk_inv
+
+      # Creates a transformed dataset as GSL::Matrix.
+      #
+      # @param [GSL::Matrix] Data matrix
+      # @param [Float] Compression ratio from [0,1], default 0.05
+      # @return [GSL::Matrix] Data transformed matrix
+
+      def initialize data_matrix, compression=0.05
+        begin
+          @data_matrix = data_matrix.clone
+          @compression = compression
+
+          # Compute the SV Decomposition X=USV
+          # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
+          u, vt, s = data_matrix.SV_decomp
+
+          # Determine cutoff index
+          s2 = s.mul(s) ; s2_sum = s2.sum
+          s2_run = 0
+          k = s2.size - 1
+          s2.to_a.reverse.each { |v|
+            s2_run += v
+            frac = s2_run / s2_sum
+            break if frac > compression
+            k -= 1
+          }
+          k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)
+
+          # Take the k-rank approximation of the Matrix
+          # - Take first k columns of u
+          # - Take first k columns of vt
+          # - Take the first k eigenvalues
+          @uk = u.submatrix(nil, (0..k)) # used to transform column format data
+          @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
+          s = GSL::Matrix.diagonal(s)
+          @eigk = s.submatrix((0..k), (0..k))
+          @eigk_inv = @eigk.inv
+
+          # Transform data
+          @data_transformed_matrix = @uk # = u for all SVs
+          # NOTE: @data_transformed_matrix is also equal to
+          # @data_matrix * @vk * @eigk_inv
+
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+
+      # Transforms data instance (1 row) to feature space found by SVD.
+      #
+      # @param [GSL::Matrix] Data matrix (1 x m).
+      # @return [GSL::Matrix] Transformed data matrix.
+      def transform_instance values
+        begin
+          values * @vk * @eigk_inv
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+      alias :transform :transform_instance # make this the default (see PCA interface)
+
+      # Transforms data feature (1 column) to feature space found by SVD.
+      #
+      # @param [GSL::Matrix] Data matrix (1 x n).
+      # @return [GSL::Matrix] Transformed data matrix.
+      def transform_feature values
+        begin
+          values * @uk * @eigk_inv
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+
+      # Restores data in the original feature space (possibly with compression loss).
+      #
+      # @param [GSL::Matrix] Transformed data matrix.
+      # @return [GSL::Matrix] Data matrix.
+      def restore
+        begin
+          @data_transformed_matrix * @eigk * @vk.transpose # reverse svd
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+      end
+
+
+    end
+
+
+
+    # Attaches transformations to an OpenTox::Model
+    # Stores props, sims, performs similarity calculations
+    class ModelTransformer
+      attr_accessor :model, :similarity_algorithm, :acts, :sims
+
+      # @params[OpenTox::Model] model to transform
+      def initialize model
+        @model = model
+        @similarity_algorithm = @model.similarity_algorithm
+      end
+
+      def transform
+        get_matrices # creates @n_prop, @q_prop, @acts from ordered fps
+        @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors
+
+        # Preprocessing
+        if (@model.similarity_algorithm == "Similarity.cosine")
+          # truncate nil-columns and -rows
+          LOGGER.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+          while @q_prop.size>0
+            idx = @q_prop.index(nil)
+            break if idx.nil?
+            @q_prop.slice!(idx)
+            @n_prop.each { |r| r.slice!(idx) }
+          end
+          LOGGER.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+          remove_nils # removes nil cells (for cosine); alters @n_props, @q_props, cuts down @ids to survivors
+          LOGGER.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+
+          # adjust rest
+          fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp
+          cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp
+          acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+          # scale and svd
+          nr_cases, nr_features = @n_prop.size, @n_prop[0].size
+          gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup
+          gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup
+          (0...nr_features).each { |i|
+            autoscaler = OpenTox::Transform::AutoScale.new(gsl_n_prop.col(i))
+            gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs
+            gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i)
+          }
+          svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0)
+          @n_prop = svd.data_transformed_matrix.to_a
+          @q_prop = svd.transform(gsl_q_prop).row(0).to_a
+          LOGGER.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+        else
+          convert_nils # convert nil cells (for tanimoto); leave @n_props, @q_props, @ids untouched
+        end
+
+        # neighbor calculation
+        @ids = [] # surviving compounds become neighbors
+        @sims = [] # calculated by neighbor routine
+        neighbors
+        n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix
+        acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+
+        # Sims between neighbors, if necessary
+        gram_matrix = []
+        if !@model.parameter("propositionalized") # need gram matrix for standard setting (n. prop.)
+          @n_prop.each_index do |i|
+            gram_matrix[i] = [] unless gram_matrix[i]
+            @n_prop.each_index do |j|
+              if (j>i)
+                sim = eval("OpenTox::Algorithm::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])")
+                gram_matrix[i][j] = sim
+                gram_matrix[j] = [] unless gram_matrix[j]
+                gram_matrix[j][i] = gram_matrix[i][j]
+              end
+            end
+            gram_matrix[i][i] = 1.0
+          end
+        end
+
+        # reclaim original data (if svd was performed)
+        if svd
+          @n_prop = gsl_n_prop_orig.to_a
+          n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp
+          @q_prop = gsl_q_prop_orig.row(0).to_a
+        end
+
+        LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+        LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
+
+        @sims = [ gram_matrix, @sims ]
+
+      end
+
+
+
+
+      # Find neighbors and store them as object variable, access all compounds for that.
+      def neighbors
+        @model.neighbors = []
+        @n_prop.each_with_index do |fp, idx| # AM: access all compounds
+          add_neighbor fp, idx
+        end
+      end
+
+
+      # Adds a neighbor to @neighbors if it passes the similarity threshold
+      # adjusts @ids to signal the
+      def add_neighbor(training_props, idx)
+
+        sim = similarity(training_props)
+        if sim > @model.parameter("min_sim")
+          if @model.activities[@cmpds[idx]]
+            @model.activities[@cmpds[idx]].each do |act|
+              @model.neighbors << {
+                :compound => @cmpds[idx],
+                :similarity => sim,
+                :features => @fps[idx].keys,
+                :activity => act
+              }
+              @sims << sim
+              @ids << idx
+            end
+          end
+        end
+      end
+
+
+      # Removes nil entries from n_prop and q_prop.
+      # Matrix is a nested two-dimensional array.
+      # Removes iteratively rows or columns with the highest fraction of nil entries, until all nil entries are removed.
+      # Tie break: columns take precedence.
+      # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero.
+      # Enables the use of cosine similarity / SVD
+      def remove_nils
+        return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0)
+        col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+        row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+        m_cols = col_nr_nils.max
+        m_rows = row_nr_nils.max
+        idx_cols = col_nr_nils.index(m_cols)
+        idx_rows = row_nr_nils.index(m_rows)
+        while ((m_cols > 0) || (m_rows > 0)) do
+          if m_cols >= m_rows
+            @n_prop.each { |row| row.slice!(idx_cols) }
+            @q_prop.slice!(idx_cols)
+          else
+            @n_prop.slice!(idx_rows)
+            @ids.slice!(idx_rows)
+          end
+          break if (@n_prop.length == 0) || (@n_prop[0].length == 0)
+          col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+          row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+          m_cols = col_nr_nils.max
+          m_rows = row_nr_nils.max
+          idx_cols= col_nr_nils.index(m_cols)
+          idx_rows = row_nr_nils.index(m_rows)
+        end
+      end
+
+
+      # Replaces nils by zeroes in n_prop and q_prop
+      # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop)
+      def convert_nils
+        @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } }
+        @q_prop.collect! { |v| v.nil? ? 0 : v }
+      end
+
+
+      # Executes model similarity_algorithm
+      def similarity(training_props)
+        eval("OpenTox::Algorithm::#{@model.similarity_algorithm}(training_props, @q_prop)")
+      end
+
+
+      # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed.
+      # Same for compound fingerprints.
+      def get_matrices
+
+        @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
+
+        @model.fingerprints.each { |fp|
+          cmpd = fp[0]; fp = fp[1]
+          if @model.activities[cmpd] # row good
+            acts = @model.activities[cmpd]; @acts += acts
+            LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
+            row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
+            acts.size.times { # multiple additions for multiple activities
+              @n_prop << row.collect
+              @cmpds << cmpd
+              @fps << Marshal.load(Marshal.dump(fp))
+            }
+          else
+            LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
+          end
+        }
+
+        @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure
+
+      end
+
+      def props
+        @model.parameter("propositionalized") ? [ @n_prop, @q_prop ] : nil
+      end
+
+    end
+
+  end
+end
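
For orientation, a minimal sketch of how the new scaler classes in transform.rb are meant to be used (assumes the gsl and statsample dependencies the gem already loads; the sample vector and the `to_gv` Array extension used throughout the file are taken from the diff, the concrete numbers are illustrative):

    require 'opentox-ruby'

    values = [1.0, 10.0, 100.0, 1000.0].to_gv            # GSL::Vector via to_gv
    scaler = OpenTox::Transform::LogAutoScale.new(values)
    scaled = scaler.vs                                    # log10-transformed, then autoscaled
    restored = scaler.restore(scaled)                     # back to the original scale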