lazar 0.9.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
data/lib/overwrite.rb
CHANGED
@@ -28,6 +28,11 @@ class Float
   def signif(n)
     Float("%.#{n}g" % self)
   end
+
+  # converts -10 logarithmized values back
+  def delog10
+    10**(-1*self)
+  end
 end

 module Enumerable
@@ -101,19 +106,35 @@ class Array
   end

   def mean
-    self.inject{ |sum, el| sum + el }.to_f / self.size
+    self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
   end

   def sample_variance
     m = self.mean
-    sum = self.inject(0){|accum, i| accum +(i-m)**2 }
-    sum/(self.length - 1).to_f
+    sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
+    sum/(self.compact.length - 1).to_f
   end

   def standard_deviation
     Math.sqrt(self.sample_variance)
   end

+  def for_R
+    if self.first.is_a?(String)
+      #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets
+      "NA"
+    else
+      self.median
+    end
+  end
+
+  def collect_with_index
+    result = []
+    self.each_with_index do |elt, idx|
+      result << yield(elt, idx)
+    end
+    result
+  end
 end

 module URI
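For orientation, the new core extensions behave as follows; a minimal sketch with return values worked out by hand from the patched methods above (loading them via require 'lazar' assumes the gem's usual environment):

    require 'lazar'   # loads lib/overwrite.rb core extensions

    3.0.delog10                              #=> 0.001 (10**(-3.0))
    [1.0, nil, 2.0, 3.0].mean                #=> 2.0, nils are now compacted away
    [1.0, nil, 3.0].sample_variance          #=> 2.0
    ["a", "b"].for_R                         #=> "NA" (string arrays cannot be summarized for R)
    [10, 20].collect_with_index{|e,i| e*i }  #=> [0, 20]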
data/lib/physchem.rb
CHANGED
@@ -14,7 +14,7 @@ module OpenTox
   JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")

   obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
-
+  OPENBABEL = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
     name,description = d.split(/\s+/,2)
     ["Openbabel."+name,description] unless obexclude.include? name
   end.compact.sort{|a,b| a[0] <=> b[0]}]
@@ -25,24 +25,24 @@ module OpenTox
     prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'')
     d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] }
   end
-
+  CDK = cdkdescriptors

   # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
   joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
   # strip Joelib messages from stdout
-
+  JOELIB = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
     name = d[:java_class].sub(/^joelib2.feature.types./,'')
     ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name
   end.compact.sort{|a,b| a[0] <=> b[0]}]

-  DESCRIPTORS =
+  DESCRIPTORS = OPENBABEL.merge(CDK.merge(JOELIB))

   require_relative "unique_descriptors.rb"

   def self.descriptors desc=DESCRIPTORS
     desc.collect do |name,description|
       lib,desc = name.split('.',2)
-      self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true
+      self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
     end
   end

@@ -54,26 +54,26 @@ module OpenTox
       CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n|
         dname = "#{name}.#{n}"
         description = DESCRIPTORS[dname]
-        udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true
+        udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
       end
     else
       description = DESCRIPTORS[name]
-      udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true
+      udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
     end
   end
   udesc
 end

 def self.openbabel_descriptors
-  descriptors
+  descriptors OPENBABEL
 end

 def self.cdk_descriptors
-  descriptors
+  descriptors CDK
 end

 def self.joelib_descriptors
-  descriptors
+  descriptors JOELIB
 end

 def calculate compound
@@ -131,3 +131,4 @@ module OpenTox
   end

 end
+OpenTox::PhysChem.descriptors # load descriptor features
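The renamed constants let callers restrict feature creation to one descriptor library. A minimal sketch, assuming a working MongoDB/R/Java setup as lazar requires; Compound.from_smiles comes from lib/compound.rb, not from this diff:

    require 'lazar'
    include OpenTox

    PhysChem::OPENBABEL.size            # number of Openbabel descriptor definitions
    PhysChem.openbabel_descriptors      # now returns only Openbabel features
    logp = PhysChem.find_or_create_by(:name => "Openbabel.logP")
    logp.calculate Compound.from_smiles("c1ccccc1")   # e.g. logP of benzene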
data/lib/regression.rb
CHANGED
@@ -3,148 +3,18 @@ module OpenTox

   class Regression

-    def self.
+    def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
+      # TODO: prediction_interval
       weighted_sum = 0.0
       sim_sum = 0.0
-
-
-
-
-
-          weighted_sum += sim*Math.log10(act)
-          sim_sum += sim
-        end
-      end
-      end
-      sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
+      dependent_variables.each_with_index do |v,i|
+        weighted_sum += weights[i]*dependent_variables[i]
+        sim_sum += weights[i]
+      end if dependent_variables
+      sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
       {:value => prediction}
     end

-    # TODO explicit neighbors, also for physchem
-    def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
-      neighbors = params[:neighbors]
-      return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
-      activities = []
-      fingerprints = {}
-      weights = []
-      fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
-
-      neighbors.each_with_index do |row,i|
-        neighbor = Compound.find row["_id"]
-        fingerprint = neighbor.fingerprint
-        if row["features"][params[:prediction_feature_id].to_s]
-          row["features"][params[:prediction_feature_id].to_s].each do |act|
-            activities << Math.log10(act)
-            weights << row["tanimoto"]
-            fingerprint_ids.each_with_index do |id,j|
-              fingerprints[id] ||= []
-              fingerprints[id] << fingerprint.include?(id)
-            end
-          end
-        end
-      end
-
-      variables = []
-      data_frame = [activities]
-      fingerprints.each do |k,v|
-        unless v.uniq.size == 1
-          data_frame << v.collect{|m| m ? "T" : "F"}
-          variables << k
-        end
-      end
-
-      if variables.empty?
-        result = local_weighted_average(compound, params)
-        result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
-        return result
-
-      else
-        compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
-        prediction = r_model_prediction method, data_frame, variables, weights, compound_features
-        if prediction.nil? or prediction[:value].nil?
-          prediction = local_weighted_average(compound, params)
-          prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
-          return prediction
-        else
-          prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
-          prediction[:value] = 10**prediction[:value]
-          prediction[:rmse] = 10**prediction[:rmse]
-          prediction
-        end
-      end
-
-    end
-
-    def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4"
-
-      neighbors = params[:neighbors]
-      return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
-      return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
-
-      activities = []
-      weights = []
-      physchem = {}
-
-      neighbors.each_with_index do |row,i|
-        neighbor = Compound.find row["_id"]
-        if row["features"][params[:prediction_feature_id].to_s]
-          row["features"][params[:prediction_feature_id].to_s].each do |act|
-            activities << Math.log10(act)
-            weights << row["tanimoto"] # TODO cosine ?
-            neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
-              physchem[pid] ||= []
-              physchem[pid] << v
-            end
-          end
-        end
-      end
-
-      # remove properties with a single value
-      physchem.each do |pid,v|
-        physchem.delete(pid) if v.uniq.size <= 1
-      end
-
-      if physchem.empty?
-        result = local_weighted_average(compound, params)
-        result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
-        return result
-
-      else
-        data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
-        prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
-        if prediction.nil?
-          prediction = local_weighted_average(compound, params)
-          prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
-          return prediction
-        else
-          prediction[:value] = 10**prediction[:value]
-          prediction
-        end
-      end
-
-    end
-
-    def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
-      R.assign "weights", training_weights
-      r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
-      R.eval "data <- #{r_data_frame}"
-      R.assign "features", training_features
-      R.eval "names(data) <- append(c('activities'),features)" #
-      begin
-        R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"
-      rescue
-        return nil
-      end
-      R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
-      R.eval "names(fingerprint) <- features"
-      R.eval "prediction <- predict(model,fingerprint)"
-      {
-        :value => R.eval("prediction").to_f,
-        :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
-        :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
-      }
-    end
-
   end
 end
 end
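Note the semantic shift: the old method logarithmized activities internally (Math.log10(act), 10**(weighted_sum/sim_sum)), while the new keyword-argument API averages the dependent variables exactly as given, so callers must transform beforehand (compare the new *_log10.csv test data above). A minimal sketch of the new call, with the result worked out by hand:

    require 'lazar'

    OpenTox::Algorithm::Regression.weighted_average(
      dependent_variables: [1.2, 1.5, 2.1],  # already -log10-transformed activities
      weights: [0.9, 0.7, 0.3]               # neighbor similarities
    )
    #=> {:value=>1.4526315789473685}   # (0.9*1.2 + 0.7*1.5 + 0.3*2.1) / 1.9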
data/lib/rest-client-wrapper.rb
CHANGED
@@ -55,14 +55,8 @@ module OpenTox
       if [301, 302, 307].include? response.code and request.method == :get
         response.follow_redirection(request, result)
       elsif response.code >= 400 and !URI.task?(uri)
-        #TODO add parameters to error-report
-        #parameters = request.args
-        #parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid]
-        #parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url]
-        #message += "\nREST parameters:\n#{parameters.inspect}"
         error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first
         begin # errors are returned as error reports in json, try to parse
-          # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task
           content = JSON.parse(response)
           msg = content["message"].to_s
           cause = content["errorCause"].to_s
data/lib/similarity.rb
ADDED
@@ -0,0 +1,65 @@
+module OpenTox
+  module Algorithm
+
+    class Vector
+      def self.dot_product(a, b)
+        products = a.zip(b).map{|a, b| a * b}
+        products.inject(0) {|s,p| s + p}
+      end
+
+      def self.magnitude(point)
+        squares = point.map{|x| x ** 2}
+        Math.sqrt(squares.inject(0) {|s, c| s + c})
+      end
+    end
+
+    class Similarity
+
+      def self.tanimoto fingerprints
+        ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
+      end
+
+      #def self.weighted_tanimoto fingerprints
+        #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
+      #end
+
+      def self.euclid scaled_properties
+        sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
+        Math.sqrt(sq.inject(0) {|s,c| s + c})
+      end
+
+      # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+      def self.cosine scaled_properties
+        scaled_properties = remove_nils scaled_properties
+        Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
+      end
+
+      def self.weighted_cosine scaled_properties # [a,b,weights]
+        a,b,w = remove_nils scaled_properties
+        return cosine(scaled_properties) if w.uniq.size == 1
+        dot_product = 0
+        magnitude_a = 0
+        magnitude_b = 0
+        (0..a.size-1).each do |i|
+          dot_product += w[i].abs*a[i]*b[i]
+          magnitude_a += w[i].abs*a[i]**2
+          magnitude_b += w[i].abs*b[i]**2
+        end
+        dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
+      end
+
+      def self.remove_nils scaled_properties
+        a =[]; b = []; w = []
+        (0..scaled_properties.first.size-1).each do |i|
+          if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan?
+            a << scaled_properties[0][i]
+            b << scaled_properties[1][i]
+            w << scaled_properties[2][i]
+          end
+        end
+        [a,b,w]
+      end
+
+    end
+  end
+end
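A minimal sketch of the new similarity functions, with values worked out by hand from the code above. Note that cosine delegates to remove_nils, which indexes a third weights array, so even the unweighted call expects an [a, b, weights] triple:

    require 'lazar'
    include OpenTox::Algorithm

    Similarity.tanimoto [[1,2,3,4], [2,3,5]]    #=> 0.4 (2 shared / 5 distinct bits)
    Similarity.euclid [[1.0, 2.0], [2.0, 4.0]]  #=> Math.sqrt(5.0)
    Similarity.weighted_cosine [[1.0, 2.0], [2.0, 4.0], [1.0, 1.0]]
    #=> 1.0 (uniform weights fall back to plain cosine; the vectors are parallel)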
data/lib/train-test-validation.rb
ADDED
@@ -0,0 +1,69 @@
+module OpenTox
+
+  module Validation
+
+    class TrainTest < Validation
+
+      field :training_dataset_id, type: BSON::ObjectId
+      field :test_dataset_id, type: BSON::ObjectId
+
+      def self.create model, training_set, test_set
+
+        validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
+        validation_model.save
+        predictions = validation_model.predict test_set.substances
+        nr_unpredicted = 0
+        predictions.each do |cid,prediction|
+          if prediction[:value]
+            prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id])
+          else
+            nr_unpredicted += 1
+          end
+        end
+        predictions.select!{|cid,p| p[:value] and p[:measurements]}
+        validation = self.new(
+          :model_id => validation_model.id,
+          :test_dataset_id => test_set.id,
+          :nr_instances => test_set.substances.size,
+          :nr_unpredicted => nr_unpredicted,
+          :predictions => predictions
+        )
+        validation.save
+        validation
+      end
+
+      def test_dataset
+        Dataset.find test_dataset_id
+      end
+
+      def training_dataset
+        Dataset.find training_dataset_id
+      end
+
+    end
+
+    class ClassificationTrainTest < TrainTest
+      include ClassificationStatistics
+      field :accept_values, type: Array
+      field :confusion_matrix, type: Array
+      field :weighted_confusion_matrix, type: Array
+      field :accuracy, type: Float
+      field :weighted_accuracy, type: Float
+      field :true_rate, type: Hash
+      field :predictivity, type: Hash
+      field :probability_plot_id, type: BSON::ObjectId
+    end
+
+    class RegressionTrainTest < TrainTest
+      include RegressionStatistics
+      field :rmse, type: Float, default:0
+      field :mae, type: Float, default:0
+      field :r_squared, type: Float
+      field :within_prediction_interval, type: Integer, default:0
+      field :out_of_prediction_interval, type: Integer, default:0
+      field :correlation_plot_id, type: BSON::ObjectId
+    end
+
+  end
+
+end
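A minimal usage sketch for the new TrainTest classes. TrainTest.create and the statistics mixins come from this diff; the model constructor and dataset helpers (Model::Lazar.create, Dataset.from_csv_file, a train/test split) are assumptions about the rest of the gem:

    require 'lazar'
    include OpenTox

    dataset = Dataset.from_csv_file "test/data/EPAFHM.medi_log10.csv"  # shipped with the gem
    training_set, test_set = dataset.split 0.8                  # hypothetical split helper
    model = Model::Lazar.create training_dataset: training_set  # assumed constructor
    validation = Validation::RegressionTrainTest.create model, training_set, test_set
    validation.statistics   # RegressionStatistics, defined in validation-statistics.rb below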
data/lib/validation-statistics.rb
ADDED
@@ -0,0 +1,223 @@
+module OpenTox
+  module Validation
+    module ClassificationStatistics
+
+      def statistics
+        self.accept_values = model.prediction_feature.accept_values
+        self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+        self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+        nr_instances = 0
+        predictions.each do |cid,pred|
+          # TODO
+          # use predictions without probabilities (single neighbor)??
+          # use measured majority class??
+          if pred[:measurements].uniq.size == 1 and pred[:probabilities]
+            m = pred[:measurements].first
+            if pred[:value] == m
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][0] += 1
+                weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][1] += 1
+                weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              end
+            elsif pred[:value] != m
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][1] += 1
+                weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][0] += 1
+                weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              end
+            end
+          end
+        end
+        self.true_rate = {}
+        self.predictivity = {}
+        accept_values.each_with_index do |v,i|
+          self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+          self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+        end
+        confidence_sum = 0
+        weighted_confusion_matrix.each do |r|
+          r.each do |c|
+            confidence_sum += c
+          end
+        end
+        self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
+        self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+        $logger.debug "Accuracy #{accuracy}"
+        save
+        {
+          :accept_values => accept_values,
+          :confusion_matrix => confusion_matrix,
+          :weighted_confusion_matrix => weighted_confusion_matrix,
+          :accuracy => accuracy,
+          :weighted_accuracy => weighted_accuracy,
+          :true_rate => self.true_rate,
+          :predictivity => self.predictivity,
+        }
+      end
+
+      def probability_plot format: "pdf"
+        #unless probability_plot_id
+
+          #tmpdir = File.join(ENV["HOME"], "tmp")
+          tmpdir = "/tmp"
+          #p tmpdir
+          FileUtils.mkdir_p tmpdir
+          tmpfile = File.join(tmpdir,"#{id.to_s}_probability.#{format}")
+          accuracies = []
+          probabilities = []
+          correct_predictions = 0
+          incorrect_predictions = 0
+          pp = []
+          predictions.values.select{|p| p["probabilities"]}.compact.each do |p|
+            p["measurements"].each do |m|
+              pp << [ p["probabilities"][p["value"]], p["value"] == m ]
+            end
+          end
+          pp.sort_by!{|p| 1-p.first}
+          pp.each do |p|
+            p[1] ? correct_predictions += 1 : incorrect_predictions += 1
+            accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+            probabilities << p[0]
+          end
+          R.assign "accuracy", accuracies
+          R.assign "probability", probabilities
+          R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()"
+          R.eval "ggsave(file='#{tmpfile}', plot=image)"
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg")
+          plot_id = $gridfs.insert_one(file)
+          update(:probability_plot_id => plot_id)
+        #end
+        $gridfs.find_one(_id: probability_plot_id).data
+      end
+    end
+
+    module RegressionStatistics
+
+      def statistics
+        self.rmse = 0
+        self.mae = 0
+        self.within_prediction_interval = 0
+        self.out_of_prediction_interval = 0
+        x = []
+        y = []
+        predictions.each do |cid,pred|
+          if pred[:value] and pred[:measurements]
+            x << pred[:measurements].median
+            y << pred[:value]
+            error = pred[:value]-pred[:measurements].median
+            self.rmse += error**2
+            self.mae += error.abs
+            if pred[:prediction_interval]
+              if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
+                self.within_prediction_interval += 1
+              else
+                self.out_of_prediction_interval += 1
+              end
+            end
+          else
+            warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+            $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          end
+        end
+        R.assign "measurement", x
+        R.assign "prediction", y
+        R.eval "r <- cor(measurement,prediction,use='pairwise')"
+        self.r_squared = R.eval("r").to_ruby**2
+        self.mae = self.mae/predictions.size
+        self.rmse = Math.sqrt(self.rmse/predictions.size)
+        $logger.debug "R^2 #{r_squared}"
+        $logger.debug "RMSE #{rmse}"
+        $logger.debug "MAE #{mae}"
+        $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
+        save
+        {
+          :mae => mae,
+          :rmse => rmse,
+          :r_squared => r_squared,
+          :within_prediction_interval => within_prediction_interval,
+          :out_of_prediction_interval => out_of_prediction_interval,
+        }
+      end
+
+      def percent_within_prediction_interval
+        100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
+      end
+
+      def correlation_plot format: "png"
+        unless correlation_plot_id
+          tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
+          x = []
+          y = []
+          feature = Feature.find(predictions.first.last["prediction_feature_id"])
+          predictions.each do |sid,p|
+            x << p["measurements"].median
+            y << p["value"]
+          end
+          R.assign "measurement", x
+          R.assign "prediction", y
+          R.eval "all = c(measurement,prediction)"
+          R.eval "range = c(min(all), max(all))"
+          title = feature.name
+          title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
+          R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
+          R.eval "image = image + geom_abline(intercept=0, slope=1)"
+          R.eval "ggsave(file='#{tmpfile}', plot=image)"
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
+          plot_id = $gridfs.insert_one(file)
+          update(:correlation_plot_id => plot_id)
+        end
+        $gridfs.find_one(_id: correlation_plot_id).data
+      end
+
+      def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
+        worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
+        worst_predictions.collect do |p|
+          substance = Substance.find(p.first)
+          prediction = p[1]
+          if show_neigbors
+            neighbors = prediction["neighbors"].collect do |n|
+              common_descriptors = []
+              if show_common_descriptors
+                common_descriptors = n["common_descriptors"].collect do |d|
+                  f=Feature.find(d)
+                  {
+                    :id => f.id.to_s,
+                    :name => "#{f.name} (#{f.conditions})",
+                    :p_value => d[:p_value],
+                    :r_squared => d[:r_squared],
+                  }
+                end
+              else
+                common_descriptors = n["common_descriptors"].size
+              end
+              {
+                :name => Substance.find(n["_id"]).name,
+                :id => n["_id"].to_s,
+                :common_descriptors => common_descriptors
+              }
+            end
+          else
+            neighbors = prediction["neighbors"].size
+          end
+          {
+            :id => substance.id.to_s,
+            :name => substance.name,
+            :feature => Feature.find(prediction["prediction_feature_id"]).name,
+            :error => (prediction["value"] - prediction["measurements"].median).abs,
+            :prediction => prediction["value"],
+            :measurements => prediction["measurements"],
+            :neighbors => neighbors
+          }
+        end
+      end
+    end
+  end
+end
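As a worked example of the weighted classification metrics above (plain Ruby, with made-up two-class counts):

    # prediction 1: value == measurement, probability 0.9 -> confusion_matrix[0][0], weighted [0][0]
    # prediction 2: value != measurement, probability 0.6 -> confusion_matrix[0][1], weighted [0][1]
    confusion_matrix          = [[1, 1], [0, 0]]
    weighted_confusion_matrix = [[0.9, 0.6], [0.0, 0.0]]
    nr_instances   = 2
    confidence_sum = weighted_confusion_matrix.flatten.inject(:+)                      # 1.5
    accuracy = (confusion_matrix[0][0] + confusion_matrix[1][1]) / nr_instances.to_f   # 0.5
    weighted_accuracy = (weighted_confusion_matrix[0][0] + weighted_confusion_matrix[1][1]) / confidence_sum.to_f
    #=> 0.6, the confident correct prediction counts for more than the less confident wrong one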