opentox-ruby 3.0.1 → 3.1.0
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
- data/ChangeLog +8 -0
- data/Rakefile +2 -3
- data/VERSION +1 -1
- data/lib/algorithm.rb +227 -675
- data/lib/authorization.rb +10 -8
- data/lib/compound.rb +47 -11
- data/lib/dataset.rb +50 -2
- data/lib/environment.rb +6 -1
- data/lib/model.rb +37 -72
- data/lib/opentox-ruby.rb +1 -1
- data/lib/parser.rb +115 -57
- data/lib/r-util.rb +354 -0
- data/lib/rest_client_wrapper.rb +1 -1
- data/lib/serializer.rb +47 -30
- data/lib/stratification.R +201 -0
- data/lib/task.rb +5 -1
- data/lib/transform.rb +520 -0
- data/lib/utils.rb +372 -0
- data/lib/validation.rb +52 -6
- metadata +413 -428
data/lib/rest_client_wrapper.rb
CHANGED
@@ -70,7 +70,7 @@ module OpenTox
 
     begin
       #LOGGER.debug "RestCall: "+rest_call.to_s+" "+uri.to_s+" "+headers.inspect+" "+payload.inspect
-      resource = RestClient::Resource.new(uri,{:timeout =>
+      resource = RestClient::Resource.new(uri,{:timeout => 600})
       if rest_call=="post" || rest_call=="put"
         result = resource.send(rest_call, payload, headers)
       else
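The only substantive change here is a hard-coded 600-second timeout on every RestClient::Resource. A minimal sketch of what the wrapped call now amounts to; the URI and the explicit GET/POST calls are illustrative placeholders, not the gem's wrapper API:

require 'rest_client'

# Any request through the wrapper now waits up to 600 seconds before
# RestClient raises a timeout error, accommodating long-running OpenTox calls.
resource = RestClient::Resource.new("http://example.org/task/1", {:timeout => 600})
result = resource.get                        # GET
# result = resource.post(payload, headers)   # POST/PUT go through resource.send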
data/lib/serializer.rb
CHANGED
@@ -55,7 +55,7 @@ module OpenTox
     OT.predictedVariables => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
     OT.paramValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
 
-    #object props for validation#
+    #object props for validation#
     OT.model => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
     OT.trainingDataset => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
     OT.predictionFeature => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
@@ -87,7 +87,7 @@ module OpenTox
     OT.percentageCompleted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
     OT.acceptValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
 
-    # annotation props for validation
+    # annotation props for validation
     OT.numUnpredicted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
     OT.crossvalidationFold => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
     OT.numInstances => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
@@ -143,8 +143,8 @@ module OpenTox
       @data_entries = {}
       @values_id = 0
       @parameter_id = 0
-
-      @classes = Set.new
+
+      @classes = Set.new
       @object_properties = Set.new
       @annotation_properties = Set.new
       @datatype_properties = Set.new
@@ -208,7 +208,7 @@ module OpenTox
       @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => OT.Task }] }
       add_metadata uri, metadata
     end
-
+
     # Add a resource defined by resource_class and content
     # (see documentation of add_content for example)
     # @param [String] uri of resource
@@ -223,10 +223,10 @@ module OpenTox
     def add_uri(uri,type)
       @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => type }] }
     end
-
+
     private
     @@content_id = 1
-
+
     #Recursiv function to add content
     #@example
     # { DC.description => "bla",
@@ -244,7 +244,7 @@ module OpenTox
       hash.each do |u,v|
         if v.is_a? Hash
           # value is again a hash, i.e. a new owl class is added
-          # first make sure type (==class) is set
+          # first make sure type (==class) is set
           type = v[RDF.type]
           raise "type missing for "+u.to_s+" content:\n"+v.inspect unless type
           raise "class unknown "+type.to_s+" (for "+u.to_s+")" unless @object.has_key?(type)
@@ -256,7 +256,7 @@ module OpenTox
           # add content to new class
           add_content(genid,v)
         elsif v.is_a? Array
-          # value is an array, i.e. a list of values with property is added
+          # value is an array, i.e. a list of values with property is added
           v.each{ |vv| add_content( uri, { u => vv } ) }
         else # v.is_a? String
           # simple string value
@@ -268,7 +268,7 @@ module OpenTox
         end
       end
     end
-
+
     public
 
     # Add metadata
@@ -329,7 +329,7 @@ module OpenTox
         v = [{ "type" => "uri", "value" => value}]
       when "literal"
         v = [{ "type" => "literal", "value" => value, "datatype" => datatype(value) }]
-      else
+      else
         raise "Illegal type #{type(value)} for #{value}."
       end
       @object[values] = {
@@ -342,7 +342,7 @@ module OpenTox
     end
 
     # Serializers
-
+
     # Convert to N-Triples
     # @return [text/plain] Object OWL-DL in N-Triples format
     def to_ntriples
@@ -353,7 +353,7 @@ module OpenTox
       entry.each do |p,objects|
         p = url(p)
         objects.each do |o|
-          case o["type"]
+          case o["type"]
           when "uri"
             o = url(o["value"])
           when "literal"
@@ -371,9 +371,15 @@ module OpenTox
     # Convert to RDF/XML
     # @return [text/plain] Object OWL-DL in RDF/XML format
     def to_rdfxml
-      Tempfile.open("owl-serializer")
+      tmpf = Tempfile.open("owl-serializer")
+      tmpf.write(self.to_ntriples)
+      tmpf.flush
+      @path = tmpf.path
       # TODO: add base uri for ist services
-
+      res=`rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+      tmpf.close
+      tmpf.delete
+      res
     end
 
     # Convert to JSON as specified in http://n2.talis.com/wiki/RDF_JSON_Specification
@@ -427,20 +433,20 @@ module OpenTox
     end
 
     def literal(value,type)
-      # concat and << are faster string concatination operators than +
+      # concat and << are faster string concatination operators than +
       '"'.concat(value.to_s).concat('"^^<').concat(type).concat('>')
     end
 
     def url(uri)
-      # concat and << are faster string concatination operators than +
+      # concat and << are faster string concatination operators than +
       '<'.concat(uri).concat('>')
     end
 
     def rdf_types
-      @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
-      @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
-      @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
-      @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
+      @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
+      @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
+      @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
+      @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
     end
 
   end
@@ -457,35 +463,46 @@ module OpenTox
       @rows.first << features
       @rows.first.flatten!
       dataset.data_entries.each do |compound,entries|
-
+        cmpd = Compound.new(compound)
+        smiles = cmpd.to_smiles
+        inchi = URI.encode_www_form_component(cmpd.to_inchi)
+        row_container = Array.new
         row = Array.new(@rows.first.size)
-
+        row_container << row
+        #row[0] = smiles
+        row[0] = inchi
        entries.each do |feature, values|
          i = features.index(feature)+1
          values.each do |value|
-            if
-
+            if row_container[0][i]
+              #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
+              row_container << row_container.last.collect
+              row_container.last[i] = value
+              #LOGGER.debug "RC: #{row_container.to_yaml}"
           else
-
+              row_container.each { |r| r[i] = value }
           end
          end
        end
-        @rows <<
+        row_container.each { |r| @rows << r }
      end
    end
 
    # Convert to CSV string
    # @return [String] CSV string
    def to_csv
-      @rows.collect
+      rows = @rows.collect
+      result = ""
+      result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
+      result << rows.collect{ |r| r.join(",") }.join("\n")
    end
 
    # Convert to spreadsheet workbook
    # @return [Spreadsheet::Workbook] Workbook object (use the spreadsheet gemc to write a file)
-    def to_spreadsheet
+    def to_spreadsheet(sheetname="sheet1")
      Spreadsheet.client_encoding = 'UTF-8'
      book = Spreadsheet::Workbook.new
-      sheet = book.create_worksheet(:name =>
+      sheet = book.create_worksheet(:name => "#{sheetname}")
      sheet.column(0).width = 100
      i = 0
      @rows.each do |row|
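The CSV serializer now identifies compounds by URL-encoded InChI rather than SMILES, and duplicates a row whenever a compound carries several values for the same feature (the row_container logic above). A standalone sketch of that duplication idea with made-up data, not the gem's API:

# One compound whose feature at index 1 has two values -> two CSV rows.
rows = []
row_container = [["InChI=1S/CH4/h1H4", nil]]   # hypothetical compound row
[0.5, 0.7].each do |value|                     # two values for feature nr 1
  if row_container[0][1]                       # slot already taken: clone last row
    row_container << row_container.last.dup
    row_container.last[1] = value
  else                                         # slot free: fill it in every row
    row_container.each { |r| r[1] = value }
  end
end
row_container.each { |r| rows << r }
# rows => [["InChI=1S/CH4/h1H4", 0.5], ["InChI=1S/CH4/h1H4", 0.7]]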
data/lib/stratification.R
ADDED
@@ -0,0 +1,201 @@

nominal_to_binary <- function( data )
{
  result = NULL
  for (i in 1:ncol(data))
  {
    #print(i)
    if (is.numeric( data[,i] ) )
    {
      if (is.null(result))
        result = data.frame(data[,i])
      else
        result = data.frame(result, data[,i])
      colnames(result)[ncol(result)] <- colnames(data)[i]
    }
    else
    {
      vals = unique(data[,i])
      for (j in 1:length(vals))
      {
        #print(j)
        bins = c()
        for (k in 1:nrow(data))
        {
          if(data[,i][k] == vals[j])
            bins = c(bins,1)
          else
            bins = c(bins,0)
        }
        #print(bins)
        if (is.null(result))
          result = data.frame(bins)
        else
          result = data.frame(result, bins)
        colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
        if (length(vals)==2) break
      }
    }
  }
  #print(head(result))
  result
}

process_data <- function( data )
{
  data.num <- as.data.frame(data)
  if (!is.numeric(data.num))
  {
    data.num = nominal_to_binary(data.num)
  }
  if(any(is.na(data.num)))
  {
    require("gam")
    data.repl = na.gam.replace(data.num)
  }
  else
    data.repl = data.num
  data.repl
}

cluster <- function( data, min=10, max=15 )
{
  require("vegan")
  max <- min(max,nrow(unique(data)))
  max <- min(max,nrow(data)-1)
  if (min>max)
    min=max
  print(paste("cascade k-means ",min," - ",max))
  s = cascadeKM(data,min,max,iter=30)
  m = max.col(s$results)[2]
  print(paste("best k-means clustering result: ",((m-1)+min)," num clusters"))
  cbind(s$partition[,m])
}

stratified_split <- function( data, ratio=0.3, method="cluster" )
{
  data.processed = as.matrix(process_data( data ))
  if (method == "samplecube")
  {
    require("sampling")
    # adjust ratio to make samplecube return exact number of samples
    ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
    pik = rep(ratio,times=nrow(data.processed))
    data.strat = cbind(pik,data.processed)
    samplecube(data.strat,pik,order=2,comment=F)
  }
  else if (method == "cluster")
  {
    cl = cluster(data.processed)
#    require("caret")
#    res = createDataPartition(cl,p=ratio)
#    split = rep(1, times=nrow(data))
#    for (j in 1:nrow(data))
#      if ( is.na(match(j,res$Resample1)) )
#        split[j]=0
#    split
    require("sampling")
    stratified_split(cl,ratio,"samplecube")
  }
  else
    stop("unknown method")
}

stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
{
  print(paste(num_folds,"-fold-split, data-size",nrow(data)))
  data.processed = as.matrix(process_data( data ))
  if (method == "samplecube")
  {
    folds = rep(0, times=nrow(data))
    for (i in 1:(num_folds-1))
    {
      require("sampling")
      prop = 1/(num_folds-(i-1))
      print(paste("fold",i,"/",num_folds," prop",prop))
      pik = rep(prop,times=nrow(data))
      for (j in 1:nrow(data))
        if(folds[j]!=0)
          pik[j]=0
      data.strat = cbind(pik,data.processed)
      s<-samplecube(data.strat,pik,order=2,comment=F)
      print(paste("fold size: ",sum(s)))
      for (j in 1:nrow(data))
        if (s[j] == 1)
          folds[j]=i
    }
    for (j in 1:nrow(data))
      if (folds[j] == 0)
        folds[j]=num_folds
    folds
  }
  else if (method == "cluster")
  {
    require("TunePareto")
    cl = cluster(data.processed)
    res = generateCVRuns(cl,ntimes=1,nfold=3)
    folds = rep(0, times=nrow(data))
    for (i in 1:num_folds)
      for(j in 1:length(res[[1]][[i]]))
        folds[res[[1]][[i]][j]]=i
    folds
  }
  else
    stop("unknown method")
}

plot_pre_process <- function( data, method="pca" )
{
  data.processed = process_data( data )
  if (method == "pca")
  {
    data.pca <- prcomp(data.processed, scale=TRUE)
    as.data.frame(data.pca$x)[1:2]
  }
  else if (method == "smacof")
  {
    require("smacof")
    data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
    data.emb$conf
  }
  else
    stop("unknown method")
}

plot_split <- function( data, split, names=NULL, ... )
{
  if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
    stop("data not suitable for plotting, plot_pre_process() first")

  plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
  if (is.null(names))
    names <- c("split 1","split 2")
  colos = as.double(rep(2:(max(split)+2)))
  legend("topleft",names,pch=2,col=colos)

  for (j in max(split):0)
  {
    set = c()
    for (i in 1:nrow(data))
      if (split[i] == j)
        set = c(set,i)
    points(data[set,], pch = 2, col=(j+2))
  }
}

#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
#data<-rbind(a,b)
#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
#data<-rbind(data,c)
#data=iris
#split = stratified_k_fold_split(data, num_folds=3)
#split = stratified_split(data, ratio=0.33, method="cluster")
#print(sum(split))
#plot_split(plot_pre_process(data),split,c("training","test"))

#cl = cluster(data)
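stratification.R is new in this release and pairs with the new r-util.rb, so it is presumably driven from Ruby through an R bridge. A hedged sketch using the rinruby gem; the source path and the use of R's built-in iris data are illustrative assumptions:

require 'rinruby'

r = RinRuby.new(false, false)               # quiet, non-interactive R session
r.eval %q{source("lib/stratification.R")}   # path is an assumption
r.eval %q{split <- stratified_split(iris, ratio=0.33, method="cluster")}
partition = r.pull "split"                  # 0/1 vector marking the two partitions
r.eval %q{folds <- stratified_k_fold_split(iris, num_folds=3)}
folds = r.pull "folds"                      # fold index (1..3) per row
r.quit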
data/lib/task.rb
CHANGED
@@ -242,16 +242,20 @@ module OpenTox
     # waits for a task, unless time exceeds or state is no longer running
     # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
     # @param [optional,Numeric] dur seconds pausing before cheking again for completion
-    def wait_for_completion( waiting_task=nil
+    def wait_for_completion( waiting_task=nil)
 
       waiting_task.waiting_for(self.uri) if waiting_task
       due_to_time = Time.new + DEFAULT_TASK_MAX_DURATION
+      start_time = Time.new
+      dur = 0
       LOGGER.debug "start waiting for task "+@uri.to_s+" at: "+Time.new.to_s+", waiting at least until "+due_to_time.to_s
 
       load_metadata # for extremely fast tasks
       check_state
       while self.running? or self.queued?
         sleep dur
+        dur = [[(Time.new - start_time)/20.0,0.3].max,300.0].min
+        #LOGGER.debug "task-object-id: #{self.object_id} - wait: #{"%.2f"%(Time.new - start_time)} - dur: #{"%.2f"%dur}"
         load_metadata
         # if another (sub)task is waiting for self, set progress accordingly
         waiting_task.progress(@metadata[OT.percentageCompleted].to_f) if waiting_task
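The fixed polling pause is replaced by an adaptive one: one twentieth of the time already waited, clamped to [0.3, 300] seconds, so fast tasks are polled quickly while long-running tasks back off. The formula from the diff, isolated for illustration:

# Adaptive polling interval as introduced above (clamped to 0.3..300 s).
def poll_interval(elapsed_seconds)
  [[elapsed_seconds / 20.0, 0.3].max, 300.0].min
end

[1, 60, 600, 7200].each do |t|
  puts "waited %5ds -> next sleep %.1fs" % [t, poll_interval(t)]
end
# waited 1s -> 0.3s, 60s -> 3.0s, 600s -> 30.0s, 7200s -> 300.0s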
data/lib/transform.rb
ADDED
@@ -0,0 +1,520 @@
module OpenTox
  module Transform
    # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos

    # LogAutoScaler for GSL vectors.
    # Take log and scale.
    class LogAutoScale
      attr_accessor :vs, :offset, :autoscaler

      # @param [GSL::Vector] Values to transform using LogAutoScaling.
      def initialize values
        @distance_to_zero = 1.0
        begin
          raise "Cannot transform, values empty." if values.size==0
          vs = values.clone
          @offset = vs.min - @distance_to_zero
          @autoscaler = OpenTox::Transform::AutoScale.new mvlog(vs)
          @vs = @autoscaler.vs
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end

      # @param [GSL::Vector] values to restore.
      # @return [GSL::Vector] transformed values.
      def restore values
        begin
          raise "Cannot transform, values empty." if values.size==0
          vs = values.clone
          rv = @autoscaler.restore(vs)
          rv.to_a.collect { |v| (10**v) + @offset }.to_gv
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end

      # @param [GSL::Vector] values to transform.
      # @return [GSL::Vector] transformed values.
      def mvlog values
        values.to_a.collect { |v| Math::log10(v - @offset) }.to_gv
      end

    end


    # Auto-Scaler for GSL vectors.
    # Center on mean and divide by standard deviation.
    class AutoScale
      attr_accessor :vs, :mean, :stdev

      # @param [GSL::Vector] values to transform using AutoScaling.
      def initialize values
        begin
          raise "Cannot transform, values empty." if values.size==0
          vs = values.clone
          @mean = vs.to_scale.mean
          @stdev = vs.to_scale.standard_deviation_population
          @stdev = 0.0 if @stdev.nan?
          @vs = transform vs
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end

      # @param [GSL::Vector] values to transform.
      # @return [GSL::Vector] transformed values.
      def transform values
        begin
          raise "Cannot transform, values empty." if values.size==0
          autoscale values.clone
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end

      # @param [GSL::Vector] Values to restore.
      # @return [GSL::Vector] transformed values.
      def restore values
        begin
          raise "Cannot transform, values empty." if values.size==0
          rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
          (rv_ss + @mean).to_gsl
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end

      # @param [GSL::Vector] values to transform.
      # @return [GSL::Vector] transformed values.
      def autoscale values
        vs_ss = values.clone.to_scale - @mean
        @stdev == 0.0 ? vs_ss.to_gsl : ( vs_ss * ( 1 / @stdev) ).to_gsl
      end

    end


    # Principal Components Analysis.
    class PCA
      attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler

      # Creates a transformed dataset as GSL::Matrix.
      #
      # @param [GSL::Matrix] Data matrix.
      # @param [Float] Compression ratio from [0,1], default 0.05.
      # @return [GSL::Matrix] Data transformed matrix.
      def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0)
        begin
          @data_matrix = data_matrix.clone
          @compression = compression.to_f
          @mean = Array.new
          @autoscaler = Array.new
          @cols = Array.new
          @maxcols = maxcols

          # Objective Feature Selection
          raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
          @data_matrix_selected = nil
          (0..@data_matrix.size2-1).each { |i|
            if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
              if @data_matrix_selected.nil?
                @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
                @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
              else
                @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
              end
              @cols << i
            end
          }
          raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)

          # PCA uses internal centering on 0
          @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
          (0..@cols.size-1).each { |i|
            as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i))
            @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
            @mean << as.mean
            @autoscaler << as
          }

          # PCA
          data_matrix_hash = Hash.new
          (0..@cols.size-1).each { |i|
            column_view = @data_matrix_scaled.col(i)
            data_matrix_hash[i] = column_view.to_scale
          }
          dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
          cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
          pca=Statsample::Factor::PCA.new(cor_matrix)

          # Select best eigenvectors
          pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
          @eigenvalue_sums = Array.new
          (0..@cols.size-1).each { |i|
            @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
          }
          eigenvectors_selected = Array.new
          pca.eigenvectors.each_with_index { |ev, i|
            if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
              eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
            end
          }
          @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
          @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose

        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end

      # Transforms data to feature space found by PCA.
      #
      # @param [GSL::Matrix] Data matrix.
      # @return [GSL::Matrix] Transformed data matrix.
      def transform values
        begin
          vs = values.clone
          raise "Error! Too few columns for transformation." if vs.size2 < @cols.max
          data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
          @cols.each_with_index { |i,j|
            data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
          }
          (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end

      # Restores data in the original feature space (possibly with compression loss).
      #
      # @param [GSL::Matrix] Transformed data matrix.
      # @return [GSL::Matrix] Data matrix.
      def restore
        begin
          data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
          # reverse scaling
          (0..@cols.size-1).each { |i|
            data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
          }
          data_matrix_restored
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end

    end


    # Singular Value Decomposition
    class SVD
      attr_accessor :data_matrix, :compression, :data_transformed_matrix, :uk, :vk, :eigk, :eigk_inv

      # Creates a transformed dataset as GSL::Matrix.
      #
      # @param [GSL::Matrix] Data matrix
      # @param [Float] Compression ratio from [0,1], default 0.05
      # @return [GSL::Matrix] Data transformed matrix

      def initialize data_matrix, compression=0.05
        begin
          @data_matrix = data_matrix.clone
          @compression = compression

          # Compute the SV Decomposition X=USV
          # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
          u, vt, s = data_matrix.SV_decomp

          # Determine cutoff index
          s2 = s.mul(s) ; s2_sum = s2.sum
          s2_run = 0
          k = s2.size - 1
          s2.to_a.reverse.each { |v|
            s2_run += v
            frac = s2_run / s2_sum
            break if frac > compression
            k -= 1
          }
          k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)

          # Take the k-rank approximation of the Matrix
          # - Take first k columns of u
          # - Take first k columns of vt
          # - Take the first k eigenvalues
          @uk = u.submatrix(nil, (0..k)) # used to transform column format data
          @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
          s = GSL::Matrix.diagonal(s)
          @eigk = s.submatrix((0..k), (0..k))
          @eigk_inv = @eigk.inv

          # Transform data
          @data_transformed_matrix = @uk # = u for all SVs
          # NOTE: @data_transformed_matrix is also equal to
          # @data_matrix * @vk * @eigk_inv

        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end


      # Transforms data instance (1 row) to feature space found by SVD.
      #
      # @param [GSL::Matrix] Data matrix (1 x m).
      # @return [GSL::Matrix] Transformed data matrix.
      def transform_instance values
        begin
          values * @vk * @eigk_inv
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end
      alias :transform :transform_instance # make this the default (see PCA interface)

      # Transforms data feature (1 column) to feature space found by SVD.
      #
      # @param [GSL::Matrix] Data matrix (1 x n).
      # @return [GSL::Matrix] Transformed data matrix.
      def transform_feature values
        begin
          values * @uk * @eigk_inv
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end


      # Restores data in the original feature space (possibly with compression loss).
      #
      # @param [GSL::Matrix] Transformed data matrix.
      # @return [GSL::Matrix] Data matrix.
      def restore
        begin
          @data_transformed_matrix * @eigk * @vk.transpose # reverse svd
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        end
      end


    end


    # Attaches transformations to an OpenTox::Model
    # Stores props, sims, performs similarity calculations
    class ModelTransformer
      attr_accessor :model, :similarity_algorithm, :acts, :sims

      # @params[OpenTox::Model] model to transform
      def initialize model
        @model = model
        @similarity_algorithm = @model.similarity_algorithm
      end

      def transform
        get_matrices # creates @n_prop, @q_prop, @acts from ordered fps
        @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors

        # Preprocessing
        if (@model.similarity_algorithm == "Similarity.cosine")
          # truncate nil-columns and -rows
          LOGGER.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
          while @q_prop.size>0
            idx = @q_prop.index(nil)
            break if idx.nil?
            @q_prop.slice!(idx)
            @n_prop.each { |r| r.slice!(idx) }
          end
          LOGGER.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
          remove_nils # removes nil cells (for cosine); alters @n_props, @q_props, cuts down @ids to survivors
          LOGGER.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"

          # adjust rest
          fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp
          cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp
          acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp

          # scale and svd
          nr_cases, nr_features = @n_prop.size, @n_prop[0].size
          gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup
          gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup
          (0...nr_features).each { |i|
            autoscaler = OpenTox::Transform::AutoScale.new(gsl_n_prop.col(i))
            gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs
            gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i)
          }
          svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0)
          @n_prop = svd.data_transformed_matrix.to_a
          @q_prop = svd.transform(gsl_q_prop).row(0).to_a
          LOGGER.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
        else
          convert_nils # convert nil cells (for tanimoto); leave @n_props, @q_props, @ids untouched
        end

        # neighbor calculation
        @ids = [] # surviving compounds become neighbors
        @sims = [] # calculated by neighbor routine
        neighbors
        n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix
        acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp


        # Sims between neighbors, if necessary
        gram_matrix = []
        if !@model.parameter("propositionalized") # need gram matrix for standard setting (n. prop.)
          @n_prop.each_index do |i|
            gram_matrix[i] = [] unless gram_matrix[i]
            @n_prop.each_index do |j|
              if (j>i)
                sim = eval("OpenTox::Algorithm::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])")
                gram_matrix[i][j] = sim
                gram_matrix[j] = [] unless gram_matrix[j]
                gram_matrix[j][i] = gram_matrix[i][j]
              end
            end
            gram_matrix[i][i] = 1.0
          end
        end

        # reclaim original data (if svd was performed)
        if svd
          @n_prop = gsl_n_prop_orig.to_a
          n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp
          @q_prop = gsl_q_prop_orig.row(0).to_a
        end

        LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
        LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"

        @sims = [ gram_matrix, @sims ]

      end


      # Find neighbors and store them as object variable, access all compounds for that.
      def neighbors
        @model.neighbors = []
        @n_prop.each_with_index do |fp, idx| # AM: access all compounds
          add_neighbor fp, idx
        end
      end


      # Adds a neighbor to @neighbors if it passes the similarity threshold
      # adjusts @ids to signal the
      def add_neighbor(training_props, idx)

        sim = similarity(training_props)
        if sim > @model.parameter("min_sim")
          if @model.activities[@cmpds[idx]]
            @model.activities[@cmpds[idx]].each do |act|
              @model.neighbors << {
                :compound => @cmpds[idx],
                :similarity => sim,
                :features => @fps[idx].keys,
                :activity => act
              }
              @sims << sim
              @ids << idx
            end
          end
        end
      end


      # Removes nil entries from n_prop and q_prop.
      # Matrix is a nested two-dimensional array.
      # Removes iteratively rows or columns with the highest fraction of nil entries, until all nil entries are removed.
      # Tie break: columns take precedence.
      # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero.
      # Enables the use of cosine similarity / SVD
      def remove_nils
        return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0)
        col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
        row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
        m_cols = col_nr_nils.max
        m_rows = row_nr_nils.max
        idx_cols = col_nr_nils.index(m_cols)
        idx_rows = row_nr_nils.index(m_rows)
        while ((m_cols > 0) || (m_rows > 0)) do
          if m_cols >= m_rows
            @n_prop.each { |row| row.slice!(idx_cols) }
            @q_prop.slice!(idx_cols)
          else
            @n_prop.slice!(idx_rows)
            @ids.slice!(idx_rows)
          end
          break if (@n_prop.length == 0) || (@n_prop[0].length == 0)
          col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
          row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
          m_cols = col_nr_nils.max
          m_rows = row_nr_nils.max
          idx_cols= col_nr_nils.index(m_cols)
          idx_rows = row_nr_nils.index(m_rows)
        end
      end


      # Replaces nils by zeroes in n_prop and q_prop
      # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop)
      def convert_nils
        @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } }
        @q_prop.collect! { |v| v.nil? ? 0 : v }
      end


      # Executes model similarity_algorithm
      def similarity(training_props)
        eval("OpenTox::Algorithm::#{@model.similarity_algorithm}(training_props, @q_prop)")
      end


      # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed.
      # Same for compound fingerprints.
      def get_matrices

        @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []

        @model.fingerprints.each { |fp|
          cmpd = fp[0]; fp = fp[1]
          if @model.activities[cmpd] # row good
            acts = @model.activities[cmpd]; @acts += acts
            LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
            row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
            acts.size.times { # multiple additions for multiple activities
              @n_prop << row.collect
              @cmpds << cmpd
              @fps << Marshal.load(Marshal.dump(fp))
            }
          else
            LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
          end
        }

        @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure

      end

      def props
        @model.parameter("propositionalized") ? [ @n_prop, @q_prop ] : nil
      end

    end

  end
end
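To make the new transformations concrete, here is a hedged usage sketch for the three vector/matrix classes above. It assumes an opentox-ruby environment where the gsl and statsample gems are loaded (providing GSL::Matrix, Array#to_gv and #to_scale) and the library's LOGGER and Algorithm helpers are available; all values are made up:

# AutoScale: center a GSL vector on its mean, divide by population stdev.
vs = [2.0, 4.0, 6.0].to_gv
as = OpenTox::Transform::AutoScale.new(vs)
as.mean                      # => 4.0
as.vs                        # centered, unit-variance vector
as.restore(as.vs)            # ~ [2.0, 4.0, 6.0]

# PCA: rows are instances, columns features; components are kept until the
# cumulative eigenvalue mass reaches (1 - compression).
data = GSL::Matrix.alloc([1.0, 2.1, 2.0, 3.9, 3.1, 6.0, 4.0, 8.2], 4, 2)
pca = OpenTox::Transform::PCA.new(data, 0.05)
pca.data_transformed_matrix  # instances projected onto the kept components
pca.restore                  # back-projection, lossy if components were dropped

# SVD: k-rank approximation; transform projects a query row into the
# reduced space via query * Vk * Sk^-1 (compression 0.0 keeps all SVs).
svd = OpenTox::Transform::SVD.new(data, 0.0)
query = GSL::Matrix.alloc([1.5, 3.0], 1, 2)
svd.transform(query)         # query row in the reduced space
svd.restore                  # approximates the original data matrix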