lazar 0.9.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
data/lib/overwrite.rb
CHANGED
@@ -28,6 +28,11 @@ class Float
|
|
28
28
|
def signif(n)
|
29
29
|
Float("%.#{n}g" % self)
|
30
30
|
end
|
31
|
+
|
32
|
+
# converts -10 logarithmized values back
|
33
|
+
def delog10
|
34
|
+
10**(-1*self)
|
35
|
+
end
|
31
36
|
end
|
32
37
|
|
33
38
|
module Enumerable
|
@@ -101,19 +106,35 @@ class Array
|
|
101
106
|
end
|
102
107
|
|
103
108
|
def mean
|
104
|
-
self.inject{ |sum, el| sum + el }.to_f / self.size
|
109
|
+
self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
|
105
110
|
end
|
106
111
|
|
107
112
|
def sample_variance
|
108
113
|
m = self.mean
|
109
|
-
sum = self.inject(0){|accum, i| accum +(i-m)**2 }
|
110
|
-
sum/(self.length - 1).to_f
|
114
|
+
sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
|
115
|
+
sum/(self.compact.length - 1).to_f
|
111
116
|
end
|
112
117
|
|
113
118
|
def standard_deviation
|
114
119
|
Math.sqrt(self.sample_variance)
|
115
120
|
end
|
116
121
|
|
122
|
+
def for_R
|
123
|
+
if self.first.is_a?(String)
|
124
|
+
#"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets
|
125
|
+
"NA"
|
126
|
+
else
|
127
|
+
self.median
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
def collect_with_index
|
132
|
+
result = []
|
133
|
+
self.each_with_index do |elt, idx|
|
134
|
+
result << yield(elt, idx)
|
135
|
+
end
|
136
|
+
result
|
137
|
+
end
|
117
138
|
end
|
118
139
|
|
119
140
|
module URI
|
data/lib/physchem.rb
CHANGED
@@ -14,7 +14,7 @@ module OpenTox
|
|
14
14
|
JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
|
15
15
|
|
16
16
|
obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
|
17
|
-
|
17
|
+
OPENBABEL = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
|
18
18
|
name,description = d.split(/\s+/,2)
|
19
19
|
["Openbabel."+name,description] unless obexclude.include? name
|
20
20
|
end.compact.sort{|a,b| a[0] <=> b[0]}]
|
@@ -25,24 +25,24 @@ module OpenTox
|
|
25
25
|
prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'')
|
26
26
|
d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] }
|
27
27
|
end
|
28
|
-
|
28
|
+
CDK = cdkdescriptors
|
29
29
|
|
30
30
|
# exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
|
31
31
|
joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
|
32
32
|
# strip Joelib messages from stdout
|
33
|
-
|
33
|
+
JOELIB = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
|
34
34
|
name = d[:java_class].sub(/^joelib2.feature.types./,'')
|
35
35
|
["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name
|
36
36
|
end.compact.sort{|a,b| a[0] <=> b[0]}]
|
37
37
|
|
38
|
-
DESCRIPTORS =
|
38
|
+
DESCRIPTORS = OPENBABEL.merge(CDK.merge(JOELIB))
|
39
39
|
|
40
40
|
require_relative "unique_descriptors.rb"
|
41
41
|
|
42
42
|
def self.descriptors desc=DESCRIPTORS
|
43
43
|
desc.collect do |name,description|
|
44
44
|
lib,desc = name.split('.',2)
|
45
|
-
self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true
|
45
|
+
self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
@@ -54,26 +54,26 @@ module OpenTox
|
|
54
54
|
CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n|
|
55
55
|
dname = "#{name}.#{n}"
|
56
56
|
description = DESCRIPTORS[dname]
|
57
|
-
udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true
|
57
|
+
udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
|
58
58
|
end
|
59
59
|
else
|
60
60
|
description = DESCRIPTORS[name]
|
61
|
-
udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true
|
61
|
+
udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
|
62
62
|
end
|
63
63
|
end
|
64
64
|
udesc
|
65
65
|
end
|
66
66
|
|
67
67
|
def self.openbabel_descriptors
|
68
|
-
descriptors
|
68
|
+
descriptors OPENBABEL
|
69
69
|
end
|
70
70
|
|
71
71
|
def self.cdk_descriptors
|
72
|
-
descriptors
|
72
|
+
descriptors CDK
|
73
73
|
end
|
74
74
|
|
75
75
|
def self.joelib_descriptors
|
76
|
-
descriptors
|
76
|
+
descriptors JOELIB
|
77
77
|
end
|
78
78
|
|
79
79
|
def calculate compound
|
@@ -131,3 +131,4 @@ module OpenTox
|
|
131
131
|
end
|
132
132
|
|
133
133
|
end
|
134
|
+
OpenTox::PhysChem.descriptors # load descriptor features
|
data/lib/regression.rb
CHANGED
@@ -3,148 +3,18 @@ module OpenTox
|
|
3
3
|
|
4
4
|
class Regression
|
5
5
|
|
6
|
-
def self.
|
6
|
+
def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
|
7
|
+
# TODO: prediction_interval
|
7
8
|
weighted_sum = 0.0
|
8
9
|
sim_sum = 0.0
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
weighted_sum += sim*Math.log10(act)
|
15
|
-
sim_sum += sim
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
|
10
|
+
dependent_variables.each_with_index do |v,i|
|
11
|
+
weighted_sum += weights[i]*dependent_variables[i]
|
12
|
+
sim_sum += weights[i]
|
13
|
+
end if dependent_variables
|
14
|
+
sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
|
20
15
|
{:value => prediction}
|
21
16
|
end
|
22
17
|
|
23
|
-
# TODO explicit neighbors, also for physchem
|
24
|
-
def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
|
25
|
-
neighbors = params[:neighbors]
|
26
|
-
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
|
27
|
-
activities = []
|
28
|
-
fingerprints = {}
|
29
|
-
weights = []
|
30
|
-
fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
|
31
|
-
|
32
|
-
neighbors.each_with_index do |row,i|
|
33
|
-
neighbor = Compound.find row["_id"]
|
34
|
-
fingerprint = neighbor.fingerprint
|
35
|
-
if row["features"][params[:prediction_feature_id].to_s]
|
36
|
-
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
37
|
-
activities << Math.log10(act)
|
38
|
-
weights << row["tanimoto"]
|
39
|
-
fingerprint_ids.each_with_index do |id,j|
|
40
|
-
fingerprints[id] ||= []
|
41
|
-
fingerprints[id] << fingerprint.include?(id)
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
variables = []
|
48
|
-
data_frame = [activities]
|
49
|
-
fingerprints.each do |k,v|
|
50
|
-
unless v.uniq.size == 1
|
51
|
-
data_frame << v.collect{|m| m ? "T" : "F"}
|
52
|
-
variables << k
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
if variables.empty?
|
57
|
-
result = local_weighted_average(compound, params)
|
58
|
-
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
|
59
|
-
return result
|
60
|
-
|
61
|
-
else
|
62
|
-
compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
|
63
|
-
prediction = r_model_prediction method, data_frame, variables, weights, compound_features
|
64
|
-
if prediction.nil? or prediction[:value].nil?
|
65
|
-
prediction = local_weighted_average(compound, params)
|
66
|
-
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
|
67
|
-
return prediction
|
68
|
-
else
|
69
|
-
prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
|
70
|
-
prediction[:value] = 10**prediction[:value]
|
71
|
-
prediction[:rmse] = 10**prediction[:rmse]
|
72
|
-
prediction
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
end
|
77
|
-
|
78
|
-
def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4"
|
79
|
-
|
80
|
-
neighbors = params[:neighbors]
|
81
|
-
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
|
82
|
-
return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
|
83
|
-
|
84
|
-
activities = []
|
85
|
-
weights = []
|
86
|
-
physchem = {}
|
87
|
-
|
88
|
-
neighbors.each_with_index do |row,i|
|
89
|
-
neighbor = Compound.find row["_id"]
|
90
|
-
if row["features"][params[:prediction_feature_id].to_s]
|
91
|
-
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
92
|
-
activities << Math.log10(act)
|
93
|
-
weights << row["tanimoto"] # TODO cosine ?
|
94
|
-
neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
|
95
|
-
physchem[pid] ||= []
|
96
|
-
physchem[pid] << v
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
# remove properties with a single value
|
103
|
-
physchem.each do |pid,v|
|
104
|
-
physchem.delete(pid) if v.uniq.size <= 1
|
105
|
-
end
|
106
|
-
|
107
|
-
if physchem.empty?
|
108
|
-
result = local_weighted_average(compound, params)
|
109
|
-
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
|
110
|
-
return result
|
111
|
-
|
112
|
-
else
|
113
|
-
data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
|
114
|
-
prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
|
115
|
-
if prediction.nil?
|
116
|
-
prediction = local_weighted_average(compound, params)
|
117
|
-
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
|
118
|
-
return prediction
|
119
|
-
else
|
120
|
-
prediction[:value] = 10**prediction[:value]
|
121
|
-
prediction
|
122
|
-
end
|
123
|
-
end
|
124
|
-
|
125
|
-
end
|
126
|
-
|
127
|
-
def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
|
128
|
-
R.assign "weights", training_weights
|
129
|
-
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
|
130
|
-
R.eval "data <- #{r_data_frame}"
|
131
|
-
R.assign "features", training_features
|
132
|
-
R.eval "names(data) <- append(c('activities'),features)" #
|
133
|
-
begin
|
134
|
-
R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"
|
135
|
-
rescue
|
136
|
-
return nil
|
137
|
-
end
|
138
|
-
R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
|
139
|
-
R.eval "names(fingerprint) <- features"
|
140
|
-
R.eval "prediction <- predict(model,fingerprint)"
|
141
|
-
{
|
142
|
-
:value => R.eval("prediction").to_f,
|
143
|
-
:rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
|
144
|
-
:r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
|
145
|
-
}
|
146
|
-
end
|
147
|
-
|
148
18
|
end
|
149
19
|
end
|
150
20
|
end
|
data/lib/rest-client-wrapper.rb
CHANGED
@@ -55,14 +55,8 @@ module OpenTox
|
|
55
55
|
if [301, 302, 307].include? response.code and request.method == :get
|
56
56
|
response.follow_redirection(request, result)
|
57
57
|
elsif response.code >= 400 and !URI.task?(uri)
|
58
|
-
#TODO add parameters to error-report
|
59
|
-
#parameters = request.args
|
60
|
-
#parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid]
|
61
|
-
#parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url]
|
62
|
-
#message += "\nREST parameters:\n#{parameters.inspect}"
|
63
58
|
error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first
|
64
59
|
begin # errors are returned as error reports in json, try to parse
|
65
|
-
# TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task
|
66
60
|
content = JSON.parse(response)
|
67
61
|
msg = content["message"].to_s
|
68
62
|
cause = content["errorCause"].to_s
|
data/lib/similarity.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
module OpenTox
|
2
|
+
module Algorithm
|
3
|
+
|
4
|
+
class Vector
|
5
|
+
def self.dot_product(a, b)
|
6
|
+
products = a.zip(b).map{|a, b| a * b}
|
7
|
+
products.inject(0) {|s,p| s + p}
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.magnitude(point)
|
11
|
+
squares = point.map{|x| x ** 2}
|
12
|
+
Math.sqrt(squares.inject(0) {|s, c| s + c})
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class Similarity
|
17
|
+
|
18
|
+
def self.tanimoto fingerprints
|
19
|
+
( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
|
20
|
+
end
|
21
|
+
|
22
|
+
#def self.weighted_tanimoto fingerprints
|
23
|
+
#( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
|
24
|
+
#end
|
25
|
+
|
26
|
+
def self.euclid scaled_properties
|
27
|
+
sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
|
28
|
+
Math.sqrt(sq.inject(0) {|s,c| s + c})
|
29
|
+
end
|
30
|
+
|
31
|
+
# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
|
32
|
+
def self.cosine scaled_properties
|
33
|
+
scaled_properties = remove_nils scaled_properties
|
34
|
+
Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.weighted_cosine scaled_properties # [a,b,weights]
|
38
|
+
a,b,w = remove_nils scaled_properties
|
39
|
+
return cosine(scaled_properties) if w.uniq.size == 1
|
40
|
+
dot_product = 0
|
41
|
+
magnitude_a = 0
|
42
|
+
magnitude_b = 0
|
43
|
+
(0..a.size-1).each do |i|
|
44
|
+
dot_product += w[i].abs*a[i]*b[i]
|
45
|
+
magnitude_a += w[i].abs*a[i]**2
|
46
|
+
magnitude_b += w[i].abs*b[i]**2
|
47
|
+
end
|
48
|
+
dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
|
49
|
+
end
|
50
|
+
|
51
|
+
def self.remove_nils scaled_properties
|
52
|
+
a =[]; b = []; w = []
|
53
|
+
(0..scaled_properties.first.size-1).each do |i|
|
54
|
+
if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan?
|
55
|
+
a << scaled_properties[0][i]
|
56
|
+
b << scaled_properties[1][i]
|
57
|
+
w << scaled_properties[2][i]
|
58
|
+
end
|
59
|
+
end
|
60
|
+
[a,b,w]
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
data/lib/substance.rb
ADDED
data/lib/train-test-validation.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
module Validation
|
4
|
+
|
5
|
+
class TrainTest < Validation
|
6
|
+
|
7
|
+
field :training_dataset_id, type: BSON::ObjectId
|
8
|
+
field :test_dataset_id, type: BSON::ObjectId
|
9
|
+
|
10
|
+
def self.create model, training_set, test_set
|
11
|
+
|
12
|
+
validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
|
13
|
+
validation_model.save
|
14
|
+
predictions = validation_model.predict test_set.substances
|
15
|
+
nr_unpredicted = 0
|
16
|
+
predictions.each do |cid,prediction|
|
17
|
+
if prediction[:value]
|
18
|
+
prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id])
|
19
|
+
else
|
20
|
+
nr_unpredicted += 1
|
21
|
+
end
|
22
|
+
end
|
23
|
+
predictions.select!{|cid,p| p[:value] and p[:measurements]}
|
24
|
+
validation = self.new(
|
25
|
+
:model_id => validation_model.id,
|
26
|
+
:test_dataset_id => test_set.id,
|
27
|
+
:nr_instances => test_set.substances.size,
|
28
|
+
:nr_unpredicted => nr_unpredicted,
|
29
|
+
:predictions => predictions
|
30
|
+
)
|
31
|
+
validation.save
|
32
|
+
validation
|
33
|
+
end
|
34
|
+
|
35
|
+
def test_dataset
|
36
|
+
Dataset.find test_dataset_id
|
37
|
+
end
|
38
|
+
|
39
|
+
def training_dataset
|
40
|
+
Dataset.find training_dataset_id
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
class ClassificationTrainTest < TrainTest
|
46
|
+
include ClassificationStatistics
|
47
|
+
field :accept_values, type: Array
|
48
|
+
field :confusion_matrix, type: Array
|
49
|
+
field :weighted_confusion_matrix, type: Array
|
50
|
+
field :accuracy, type: Float
|
51
|
+
field :weighted_accuracy, type: Float
|
52
|
+
field :true_rate, type: Hash
|
53
|
+
field :predictivity, type: Hash
|
54
|
+
field :probability_plot_id, type: BSON::ObjectId
|
55
|
+
end
|
56
|
+
|
57
|
+
class RegressionTrainTest < TrainTest
|
58
|
+
include RegressionStatistics
|
59
|
+
field :rmse, type: Float, default:0
|
60
|
+
field :mae, type: Float, default:0
|
61
|
+
field :r_squared, type: Float
|
62
|
+
field :within_prediction_interval, type: Integer, default:0
|
63
|
+
field :out_of_prediction_interval, type: Integer, default:0
|
64
|
+
field :correlation_plot_id, type: BSON::ObjectId
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
data/lib/validation-statistics.rb
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
module OpenTox
|
2
|
+
module Validation
|
3
|
+
module ClassificationStatistics
|
4
|
+
|
5
|
+
def statistics
|
6
|
+
self.accept_values = model.prediction_feature.accept_values
|
7
|
+
self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
|
8
|
+
self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
|
9
|
+
nr_instances = 0
|
10
|
+
predictions.each do |cid,pred|
|
11
|
+
# TODO
|
12
|
+
# use predictions without probabilities (single neighbor)??
|
13
|
+
# use measured majority class??
|
14
|
+
if pred[:measurements].uniq.size == 1 and pred[:probabilities]
|
15
|
+
m = pred[:measurements].first
|
16
|
+
if pred[:value] == m
|
17
|
+
if pred[:value] == accept_values[0]
|
18
|
+
confusion_matrix[0][0] += 1
|
19
|
+
weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
|
20
|
+
nr_instances += 1
|
21
|
+
elsif pred[:value] == accept_values[1]
|
22
|
+
confusion_matrix[1][1] += 1
|
23
|
+
weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
|
24
|
+
nr_instances += 1
|
25
|
+
end
|
26
|
+
elsif pred[:value] != m
|
27
|
+
if pred[:value] == accept_values[0]
|
28
|
+
confusion_matrix[0][1] += 1
|
29
|
+
weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
|
30
|
+
nr_instances += 1
|
31
|
+
elsif pred[:value] == accept_values[1]
|
32
|
+
confusion_matrix[1][0] += 1
|
33
|
+
weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
|
34
|
+
nr_instances += 1
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
self.true_rate = {}
|
40
|
+
self.predictivity = {}
|
41
|
+
accept_values.each_with_index do |v,i|
|
42
|
+
self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
|
43
|
+
self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
|
44
|
+
end
|
45
|
+
confidence_sum = 0
|
46
|
+
weighted_confusion_matrix.each do |r|
|
47
|
+
r.each do |c|
|
48
|
+
confidence_sum += c
|
49
|
+
end
|
50
|
+
end
|
51
|
+
self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
|
52
|
+
self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
|
53
|
+
$logger.debug "Accuracy #{accuracy}"
|
54
|
+
save
|
55
|
+
{
|
56
|
+
:accept_values => accept_values,
|
57
|
+
:confusion_matrix => confusion_matrix,
|
58
|
+
:weighted_confusion_matrix => weighted_confusion_matrix,
|
59
|
+
:accuracy => accuracy,
|
60
|
+
:weighted_accuracy => weighted_accuracy,
|
61
|
+
:true_rate => self.true_rate,
|
62
|
+
:predictivity => self.predictivity,
|
63
|
+
}
|
64
|
+
end
|
65
|
+
|
66
|
+
def probability_plot format: "pdf"
|
67
|
+
#unless probability_plot_id
|
68
|
+
|
69
|
+
#tmpdir = File.join(ENV["HOME"], "tmp")
|
70
|
+
tmpdir = "/tmp"
|
71
|
+
#p tmpdir
|
72
|
+
FileUtils.mkdir_p tmpdir
|
73
|
+
tmpfile = File.join(tmpdir,"#{id.to_s}_probability.#{format}")
|
74
|
+
accuracies = []
|
75
|
+
probabilities = []
|
76
|
+
correct_predictions = 0
|
77
|
+
incorrect_predictions = 0
|
78
|
+
pp = []
|
79
|
+
predictions.values.select{|p| p["probabilities"]}.compact.each do |p|
|
80
|
+
p["measurements"].each do |m|
|
81
|
+
pp << [ p["probabilities"][p["value"]], p["value"] == m ]
|
82
|
+
end
|
83
|
+
end
|
84
|
+
pp.sort_by!{|p| 1-p.first}
|
85
|
+
pp.each do |p|
|
86
|
+
p[1] ? correct_predictions += 1 : incorrect_predictions += 1
|
87
|
+
accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
|
88
|
+
probabilities << p[0]
|
89
|
+
end
|
90
|
+
R.assign "accuracy", accuracies
|
91
|
+
R.assign "probability", probabilities
|
92
|
+
R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()"
|
93
|
+
R.eval "ggsave(file='#{tmpfile}', plot=image)"
|
94
|
+
file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg")
|
95
|
+
plot_id = $gridfs.insert_one(file)
|
96
|
+
update(:probability_plot_id => plot_id)
|
97
|
+
#end
|
98
|
+
$gridfs.find_one(_id: probability_plot_id).data
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
module RegressionStatistics
|
103
|
+
|
104
|
+
def statistics
|
105
|
+
self.rmse = 0
|
106
|
+
self.mae = 0
|
107
|
+
self.within_prediction_interval = 0
|
108
|
+
self.out_of_prediction_interval = 0
|
109
|
+
x = []
|
110
|
+
y = []
|
111
|
+
predictions.each do |cid,pred|
|
112
|
+
if pred[:value] and pred[:measurements]
|
113
|
+
x << pred[:measurements].median
|
114
|
+
y << pred[:value]
|
115
|
+
error = pred[:value]-pred[:measurements].median
|
116
|
+
self.rmse += error**2
|
117
|
+
self.mae += error.abs
|
118
|
+
if pred[:prediction_interval]
|
119
|
+
if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
|
120
|
+
self.within_prediction_interval += 1
|
121
|
+
else
|
122
|
+
self.out_of_prediction_interval += 1
|
123
|
+
end
|
124
|
+
end
|
125
|
+
else
|
126
|
+
warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
|
127
|
+
$logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
|
128
|
+
end
|
129
|
+
end
|
130
|
+
R.assign "measurement", x
|
131
|
+
R.assign "prediction", y
|
132
|
+
R.eval "r <- cor(measurement,prediction,use='pairwise')"
|
133
|
+
self.r_squared = R.eval("r").to_ruby**2
|
134
|
+
self.mae = self.mae/predictions.size
|
135
|
+
self.rmse = Math.sqrt(self.rmse/predictions.size)
|
136
|
+
$logger.debug "R^2 #{r_squared}"
|
137
|
+
$logger.debug "RMSE #{rmse}"
|
138
|
+
$logger.debug "MAE #{mae}"
|
139
|
+
$logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
|
140
|
+
save
|
141
|
+
{
|
142
|
+
:mae => mae,
|
143
|
+
:rmse => rmse,
|
144
|
+
:r_squared => r_squared,
|
145
|
+
:within_prediction_interval => within_prediction_interval,
|
146
|
+
:out_of_prediction_interval => out_of_prediction_interval,
|
147
|
+
}
|
148
|
+
end
|
149
|
+
|
150
|
+
def percent_within_prediction_interval
|
151
|
+
100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
|
152
|
+
end
|
153
|
+
|
154
|
+
def correlation_plot format: "png"
|
155
|
+
unless correlation_plot_id
|
156
|
+
tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
|
157
|
+
x = []
|
158
|
+
y = []
|
159
|
+
feature = Feature.find(predictions.first.last["prediction_feature_id"])
|
160
|
+
predictions.each do |sid,p|
|
161
|
+
x << p["measurements"].median
|
162
|
+
y << p["value"]
|
163
|
+
end
|
164
|
+
R.assign "measurement", x
|
165
|
+
R.assign "prediction", y
|
166
|
+
R.eval "all = c(measurement,prediction)"
|
167
|
+
R.eval "range = c(min(all), max(all))"
|
168
|
+
title = feature.name
|
169
|
+
title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
|
170
|
+
R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
|
171
|
+
R.eval "image = image + geom_abline(intercept=0, slope=1)"
|
172
|
+
R.eval "ggsave(file='#{tmpfile}', plot=image)"
|
173
|
+
file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
|
174
|
+
plot_id = $gridfs.insert_one(file)
|
175
|
+
update(:correlation_plot_id => plot_id)
|
176
|
+
end
|
177
|
+
$gridfs.find_one(_id: correlation_plot_id).data
|
178
|
+
end
|
179
|
+
|
180
|
+
def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
|
181
|
+
worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
|
182
|
+
worst_predictions.collect do |p|
|
183
|
+
substance = Substance.find(p.first)
|
184
|
+
prediction = p[1]
|
185
|
+
if show_neigbors
|
186
|
+
neighbors = prediction["neighbors"].collect do |n|
|
187
|
+
common_descriptors = []
|
188
|
+
if show_common_descriptors
|
189
|
+
common_descriptors = n["common_descriptors"].collect do |d|
|
190
|
+
f=Feature.find(d)
|
191
|
+
{
|
192
|
+
:id => f.id.to_s,
|
193
|
+
:name => "#{f.name} (#{f.conditions})",
|
194
|
+
:p_value => d[:p_value],
|
195
|
+
:r_squared => d[:r_squared],
|
196
|
+
}
|
197
|
+
end
|
198
|
+
else
|
199
|
+
common_descriptors = n["common_descriptors"].size
|
200
|
+
end
|
201
|
+
{
|
202
|
+
:name => Substance.find(n["_id"]).name,
|
203
|
+
:id => n["_id"].to_s,
|
204
|
+
:common_descriptors => common_descriptors
|
205
|
+
}
|
206
|
+
end
|
207
|
+
else
|
208
|
+
neighbors = prediction["neighbors"].size
|
209
|
+
end
|
210
|
+
{
|
211
|
+
:id => substance.id.to_s,
|
212
|
+
:name => substance.name,
|
213
|
+
:feature => Feature.find(prediction["prediction_feature_id"]).name,
|
214
|
+
:error => (prediction["value"] - prediction["measurements"].median).abs,
|
215
|
+
:prediction => prediction["value"],
|
216
|
+
:measurements => prediction["measurements"],
|
217
|
+
:neighbors => neighbors
|
218
|
+
}
|
219
|
+
end
|
220
|
+
end
|
221
|
+
end
|
222
|
+
end
|
223
|
+
end
|