lazar 0.9.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
data/lib/overwrite.rb CHANGED
@@ -28,6 +28,11 @@ class Float
28
28
  def signif(n)
29
29
  Float("%.#{n}g" % self)
30
30
  end
31
+
32
# Reverses a negative decadic logarithm transformation: the receiver is
# treated as -log10(x) and x is returned.
# @return [Numeric] 10 raised to the negated receiver
def delog10
  10**(-self)
end
31
36
  end
32
37
 
33
38
  module Enumerable
@@ -101,19 +106,35 @@ class Array
101
106
  end
102
107
 
103
108
  def mean
104
- self.inject{ |sum, el| sum + el }.to_f / self.size
109
+ self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
105
110
  end
106
111
 
107
112
  def sample_variance
108
113
  m = self.mean
109
- sum = self.inject(0){|accum, i| accum +(i-m)**2 }
110
- sum/(self.length - 1).to_f
114
+ sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
115
+ sum/(self.compact.length - 1).to_f
111
116
  end
112
117
 
113
118
  def standard_deviation
114
119
  Math.sqrt(self.sample_variance)
115
120
  end
116
121
 
122
# Reduces the array to a single value that can be handed to R.
# Arrays whose first element is a String are not transferred and yield the
# literal "NA"; every other array is reduced to its median.
# (A disabled alternative quoted the strings and stripped square brackets.)
# @return [String, Object] "NA" or the result of #median
def for_R
  return "NA" if first.is_a?(String)
  median
end
130
+
131
# Maps every element together with its index through the given block.
# @yield [element, index] called once per element, in order
# @return [Array] the block results, or an Enumerator when no block is given
def collect_with_index
  # follow the Enumerable convention instead of failing on the first yield
  return to_enum(:collect_with_index) unless block_given?
  each_with_index.map { |elt, idx| yield(elt, idx) }
end
117
138
  end
118
139
 
119
140
  module URI
data/lib/physchem.rb CHANGED
@@ -14,7 +14,7 @@ module OpenTox
14
14
  JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
15
15
 
16
16
  obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
17
- OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
17
+ OPENBABEL = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
18
18
  name,description = d.split(/\s+/,2)
19
19
  ["Openbabel."+name,description] unless obexclude.include? name
20
20
  end.compact.sort{|a,b| a[0] <=> b[0]}]
@@ -25,24 +25,24 @@ module OpenTox
25
25
  prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'')
26
26
  d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] }
27
27
  end
28
- CDKDESCRIPTORS = cdkdescriptors
28
+ CDK = cdkdescriptors
29
29
 
30
30
  # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
31
31
  joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
32
32
  # strip Joelib messages from stdout
33
- JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
33
+ JOELIB = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
34
34
  name = d[:java_class].sub(/^joelib2.feature.types./,'')
35
35
  ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name
36
36
  end.compact.sort{|a,b| a[0] <=> b[0]}]
37
37
 
38
- DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
38
+ DESCRIPTORS = OPENBABEL.merge(CDK.merge(JOELIB))
39
39
 
40
40
  require_relative "unique_descriptors.rb"
41
41
 
42
42
  def self.descriptors desc=DESCRIPTORS
43
43
  desc.collect do |name,description|
44
44
  lib,desc = name.split('.',2)
45
- self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
45
+ self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
46
46
  end
47
47
  end
48
48
 
@@ -54,26 +54,26 @@ module OpenTox
54
54
  CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n|
55
55
  dname = "#{name}.#{n}"
56
56
  description = DESCRIPTORS[dname]
57
- udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
57
+ udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
58
58
  end
59
59
  else
60
60
  description = DESCRIPTORS[name]
61
- udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
61
+ udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
62
62
  end
63
63
  end
64
64
  udesc
65
65
  end
66
66
 
67
67
  def self.openbabel_descriptors
68
- descriptors OBDESCRIPTORS
68
+ descriptors OPENBABEL
69
69
  end
70
70
 
71
71
  def self.cdk_descriptors
72
- descriptors CDKDESCRIPTORS
72
+ descriptors CDK
73
73
  end
74
74
 
75
75
  def self.joelib_descriptors
76
- descriptors JOELIBDESCRIPTORS
76
+ descriptors JOELIB
77
77
  end
78
78
 
79
79
  def calculate compound
@@ -131,3 +131,4 @@ module OpenTox
131
131
  end
132
132
 
133
133
  end
134
+ OpenTox::PhysChem.descriptors # load descriptor features
data/lib/regression.rb CHANGED
@@ -3,148 +3,18 @@ module OpenTox
3
3
 
4
4
  class Regression
5
5
 
6
- def self.local_weighted_average compound, params
6
+ def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
7
+ # TODO: prediction_interval
7
8
  weighted_sum = 0.0
8
9
  sim_sum = 0.0
9
- neighbors = params[:neighbors]
10
- neighbors.each do |row|
11
- sim = row["tanimoto"]
12
- if row["features"][params[:prediction_feature_id].to_s]
13
- row["features"][params[:prediction_feature_id].to_s].each do |act|
14
- weighted_sum += sim*Math.log10(act)
15
- sim_sum += sim
16
- end
17
- end
18
- end
19
- sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
10
+ dependent_variables.each_with_index do |v,i|
11
+ weighted_sum += weights[i]*dependent_variables[i]
12
+ sim_sum += weights[i]
13
+ end if dependent_variables
14
+ sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
20
15
  {:value => prediction}
21
16
  end
22
17
 
23
- # TODO explicit neighbors, also for physchem
24
- def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
25
- neighbors = params[:neighbors]
26
- return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
27
- activities = []
28
- fingerprints = {}
29
- weights = []
30
- fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
31
-
32
- neighbors.each_with_index do |row,i|
33
- neighbor = Compound.find row["_id"]
34
- fingerprint = neighbor.fingerprint
35
- if row["features"][params[:prediction_feature_id].to_s]
36
- row["features"][params[:prediction_feature_id].to_s].each do |act|
37
- activities << Math.log10(act)
38
- weights << row["tanimoto"]
39
- fingerprint_ids.each_with_index do |id,j|
40
- fingerprints[id] ||= []
41
- fingerprints[id] << fingerprint.include?(id)
42
- end
43
- end
44
- end
45
- end
46
-
47
- variables = []
48
- data_frame = [activities]
49
- fingerprints.each do |k,v|
50
- unless v.uniq.size == 1
51
- data_frame << v.collect{|m| m ? "T" : "F"}
52
- variables << k
53
- end
54
- end
55
-
56
- if variables.empty?
57
- result = local_weighted_average(compound, params)
58
- result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
59
- return result
60
-
61
- else
62
- compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
63
- prediction = r_model_prediction method, data_frame, variables, weights, compound_features
64
- if prediction.nil? or prediction[:value].nil?
65
- prediction = local_weighted_average(compound, params)
66
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
67
- return prediction
68
- else
69
- prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
70
- prediction[:value] = 10**prediction[:value]
71
- prediction[:rmse] = 10**prediction[:rmse]
72
- prediction
73
- end
74
- end
75
-
76
- end
77
-
78
- def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4"
79
-
80
- neighbors = params[:neighbors]
81
- return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
82
- return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
83
-
84
- activities = []
85
- weights = []
86
- physchem = {}
87
-
88
- neighbors.each_with_index do |row,i|
89
- neighbor = Compound.find row["_id"]
90
- if row["features"][params[:prediction_feature_id].to_s]
91
- row["features"][params[:prediction_feature_id].to_s].each do |act|
92
- activities << Math.log10(act)
93
- weights << row["tanimoto"] # TODO cosine ?
94
- neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
95
- physchem[pid] ||= []
96
- physchem[pid] << v
97
- end
98
- end
99
- end
100
- end
101
-
102
- # remove properties with a single value
103
- physchem.each do |pid,v|
104
- physchem.delete(pid) if v.uniq.size <= 1
105
- end
106
-
107
- if physchem.empty?
108
- result = local_weighted_average(compound, params)
109
- result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
110
- return result
111
-
112
- else
113
- data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
114
- prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
115
- if prediction.nil?
116
- prediction = local_weighted_average(compound, params)
117
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
118
- return prediction
119
- else
120
- prediction[:value] = 10**prediction[:value]
121
- prediction
122
- end
123
- end
124
-
125
- end
126
-
127
- def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
128
- R.assign "weights", training_weights
129
- r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
130
- R.eval "data <- #{r_data_frame}"
131
- R.assign "features", training_features
132
- R.eval "names(data) <- append(c('activities'),features)" #
133
- begin
134
- R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"
135
- rescue
136
- return nil
137
- end
138
- R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
139
- R.eval "names(fingerprint) <- features"
140
- R.eval "prediction <- predict(model,fingerprint)"
141
- {
142
- :value => R.eval("prediction").to_f,
143
- :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
144
- :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
145
- }
146
- end
147
-
148
18
  end
149
19
  end
150
20
  end
data/lib/rest-client-wrapper.rb CHANGED
@@ -55,14 +55,8 @@ module OpenTox
55
55
  if [301, 302, 307].include? response.code and request.method == :get
56
56
  response.follow_redirection(request, result)
57
57
  elsif response.code >= 400 and !URI.task?(uri)
58
- #TODO add parameters to error-report
59
- #parameters = request.args
60
- #parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid]
61
- #parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url]
62
- #message += "\nREST parameters:\n#{parameters.inspect}"
63
58
  error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first
64
59
  begin # errors are returned as error reports in json, try to parse
65
- # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task
66
60
  content = JSON.parse(response)
67
61
  msg = content["message"].to_s
68
62
  cause = content["errorCause"].to_s
data/lib/similarity.rb ADDED
@@ -0,0 +1,65 @@
1
module OpenTox
  module Algorithm

    # Elementary vector arithmetic on plain arrays of numbers.
    class Vector
      # Dot product of two numeric arrays.
      # @param a [Array<Numeric>]
      # @param b [Array<Numeric>] must provide a value for every index of +a+
      # @return [Numeric] sum of the pairwise products (0 for empty input)
      def self.dot_product(a, b)
        a.zip(b).inject(0) { |sum, (x, y)| sum + x * y }
      end

      # Euclidean length of a numeric array.
      # @param point [Array<Numeric>]
      # @return [Float]
      def self.magnitude(point)
        Math.sqrt(point.inject(0) { |sum, x| sum + x**2 })
      end
    end

    # Similarity and distance measures. Every method receives ONE array that
    # bundles its operands: [a, b] or [a, b, weights].
    class Similarity

      # Tanimoto (Jaccard) coefficient of two fingerprints.
      # @param fingerprints [Array<Array>] [fingerprint_a, fingerprint_b]
      # @return [Float] |a & b| / |a | b|; 0.0 when both fingerprints are empty
      def self.tanimoto fingerprints
        union_size = (fingerprints[0] | fingerprints[1]).size
        # two empty fingerprints would otherwise give 0/0.0 => NaN
        return 0.0 if union_size.zero?
        (fingerprints[0] & fingerprints[1]).size / union_size.to_f
      end

      # Euclidean distance between two property vectors.
      # @param scaled_properties [Array<Array<Numeric>>] [a, b]
      # @return [Float]
      def self.euclid scaled_properties
        a, b = scaled_properties
        Math.sqrt(a.zip(b).inject(0) { |sum, (x, y)| sum + (x - y)**2 })
      end

      # Cosine similarity of two property vectors; positions where either
      # value is nil or NaN are ignored.
      # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
      # @param scaled_properties [Array<Array>] [a, b] (an optional third weights entry is ignored)
      # @return [Float] NaN if one of the filtered vectors has zero length
      def self.cosine scaled_properties
        a, b, = remove_nils scaled_properties
        Vector.dot_product(a, b) / (Vector.magnitude(a) * Vector.magnitude(b))
      end

      # Cosine similarity in which every dimension is scaled by the absolute
      # value of its weight. Falls back to the plain cosine when all retained
      # weights are identical or no weights are available.
      # @param scaled_properties [Array<Array>] [a, b, weights]
      # @return [Float]
      def self.weighted_cosine scaled_properties # [a,b,weights]
        a, b, w = remove_nils scaled_properties
        return cosine(scaled_properties) if w.uniq.size <= 1
        dot_product = 0
        magnitude_a = 0
        magnitude_b = 0
        a.each_index do |i|
          weight = w[i].abs
          dot_product += weight * a[i] * b[i]
          magnitude_a += weight * a[i]**2
          magnitude_b += weight * b[i]**2
        end
        dot_product / (Math.sqrt(magnitude_a) * Math.sqrt(magnitude_b))
      end

      # Keeps only the positions at which both vectors hold a usable number.
      # @param scaled_properties [Array<Array>] [a, b] or [a, b, weights]
      # @return [Array<Array>] [a, b, weights] restricted to the usable positions;
      #   weights is empty when none were supplied
      def self.remove_nils scaled_properties
        first, second, weights = scaled_properties
        a = []; b = []; w = []
        first.each_index do |i|
          next unless usable?(first[i]) && usable?(second[i])
          a << first[i]
          b << second[i]
          # cosine passes only two vectors, so the weights may be absent
          w << weights[i] if weights
        end
        [a, b, w]
      end

      # True for values that are neither nil/false nor NaN. Integers have no
      # #nan? method, hence the respond_to? check.
      def self.usable?(value)
        value && !(value.respond_to?(:nan?) && value.nan?)
      end
      private_class_method :usable?

    end
  end
end
data/lib/substance.rb ADDED
@@ -0,0 +1,8 @@
1
+ module OpenTox
2
+
3
# Adds two persisted attributes to Substance.
# NOTE(review): `field` is presumably the Mongoid field macro and the class is
# presumably declared as a document elsewhere — confirm against lib/lazar.rb.
class Substance
  # Hash attribute, empty by default
  field :properties, type: Hash, default: {}
  # Array attribute, empty by default; presumably ids of datasets containing the substance — verify against callers
  field :dataset_ids, type: Array, default: []
end
7
+
8
+ end
data/lib/train-test-validation.rb ADDED
@@ -0,0 +1,69 @@
1
+ module OpenTox
2
+
3
module Validation

  # Validation of a model that is rebuilt from a training dataset and
  # evaluated on a separate test dataset.
  class TrainTest < Validation

    field :training_dataset_id, type: BSON::ObjectId
    field :test_dataset_id, type: BSON::ObjectId

    # Builds a fresh model of the same class as +model+ on +training_set+,
    # predicts all substances of +test_set+ and stores the outcome.
    # @param model template supplying the model class, prediction feature and algorithms
    # @param training_set dataset the validation model is created from
    # @param test_set dataset whose substances are predicted
    # @return [TrainTest] the saved validation
    def self.create model, training_set, test_set

      validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
      validation_model.save
      predictions = validation_model.predict test_set.substances
      nr_unpredicted = 0
      predictions.each do |cid,prediction|
        if prediction[:value]
          # attach the values of the test set for comparison
          prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id])
        else
          nr_unpredicted += 1
        end
      end
      # keep only predictions that can be compared with measurements
      predictions.select!{|_cid,p| p[:value] and p[:measurements]}
      validation = self.new(
        :model_id => validation_model.id,
        # stored so that #training_dataset can resolve the dataset later
        :training_dataset_id => training_set.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.substances.size,
        :nr_unpredicted => nr_unpredicted,
        :predictions => predictions
      )
      validation.save
      validation
    end

    # @return the dataset referenced by test_dataset_id
    def test_dataset
      Dataset.find test_dataset_id
    end

    # @return the dataset referenced by training_dataset_id
    def training_dataset
      Dataset.find training_dataset_id
    end

  end

  # Train/test validation of classification models; statistics come from
  # ClassificationStatistics and are stored in the fields below.
  class ClassificationTrainTest < TrainTest
    include ClassificationStatistics
    field :accept_values, type: Array
    field :confusion_matrix, type: Array
    field :weighted_confusion_matrix, type: Array
    field :accuracy, type: Float
    field :weighted_accuracy, type: Float
    field :true_rate, type: Hash
    field :predictivity, type: Hash
    field :probability_plot_id, type: BSON::ObjectId
  end

  # Train/test validation of regression models; statistics come from
  # RegressionStatistics and are stored in the fields below.
  class RegressionTrainTest < TrainTest
    include RegressionStatistics
    field :rmse, type: Float, default:0
    field :mae, type: Float, default:0
    field :r_squared, type: Float
    field :within_prediction_interval, type: Integer, default:0
    field :out_of_prediction_interval, type: Integer, default:0
    field :correlation_plot_id, type: BSON::ObjectId
  end

end
68
+
69
+ end
data/lib/validation-statistics.rb ADDED
@@ -0,0 +1,223 @@
1
+ module OpenTox
2
+ module Validation
3
# Statistics for validations of (binary) classification models. The including
# class has to provide #predictions, #model, #save, #update, #id and accessors
# for the attributes assigned below.
module ClassificationStatistics

  # Computes confusion matrices, accuracies and per-class rates from the
  # stored predictions, saves them and returns them.
  # Only predictions with probabilities and a single distinct measured value
  # are counted. Rows of the matrices are indexed by the predicted class,
  # columns by the measured class; only the first two accept_values are
  # considered, and a measurement differing from the prediction is counted
  # as the other class.
  # @return [Hash] :accept_values, :confusion_matrix, :weighted_confusion_matrix,
  #   :accuracy, :weighted_accuracy, :true_rate, :predictivity
  def statistics
    self.accept_values = model.prediction_feature.accept_values
    self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
    self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
    nr_instances = 0
    predictions.each do |_cid,pred|
      # TODO
      # use predictions without probabilities (single neighbor)??
      # use measured majority class??
      next unless pred[:measurements].uniq.size == 1 and pred[:probabilities]
      predicted = accept_values[0,2].index(pred[:value])
      next unless predicted # predicted value is neither of the two classes
      measured = pred[:value] == pred[:measurements].first ? predicted : 1-predicted
      confusion_matrix[predicted][measured] += 1
      weighted_confusion_matrix[predicted][measured] += pred[:probabilities][pred[:value]]
      nr_instances += 1
    end
    # NOTE(review): with rows = predicted class, the row-based ratio below is
    # stored as true_rate and the column-based one as predictivity — confirm
    # that this naming is intended.
    self.true_rate = {}
    self.predictivity = {}
    accept_values.each_with_index do |v,i|
      self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
      self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
    end
    confidence_sum = weighted_confusion_matrix.flatten.inject(0){|sum,c| sum + c}
    self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
    self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
    $logger.debug "Accuracy #{accuracy}"
    save
    {
      :accept_values => accept_values,
      :confusion_matrix => confusion_matrix,
      :weighted_confusion_matrix => weighted_confusion_matrix,
      :accuracy => accuracy,
      :weighted_accuracy => weighted_accuracy,
      :true_rate => self.true_rate,
      :predictivity => self.predictivity,
    }
  end

  # Plots the accumulated accuracy against the prediction probability
  # (predictions ordered by decreasing probability), stores the image in
  # GridFS and returns its content. The plot is regenerated on every call.
  # @param format [String] file extension passed to ggsave (default "pdf")
  # @return the data of the stored plot file
  def probability_plot format: "pdf"
    tmpdir = "/tmp"
    FileUtils.mkdir_p tmpdir
    tmpfile = File.join(tmpdir,"#{id.to_s}_probability.#{format}")
    # one [probability of the predicted class, prediction correct?] pair per measurement
    pairs = []
    predictions.values.select{|p| p["probabilities"]}.each do |p|
      p["measurements"].each do |m|
        pairs << [ p["probabilities"][p["value"]], p["value"] == m ]
      end
    end
    pairs.sort_by!{|probability,_correct| 1-probability}
    accuracies = []
    probabilities = []
    correct_predictions = 0
    pairs.each_with_index do |(probability,correct),i|
      correct_predictions += 1 if correct
      accuracies << correct_predictions/(i+1).to_f
      probabilities << probability
    end
    R.assign "accuracy", accuracies
    R.assign "probability", probabilities
    R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()"
    R.eval "ggsave(file='#{tmpfile}', plot=image)"
    # name the stored file after the format that was actually written
    file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.#{format}")
    plot_id = $gridfs.insert_one(file)
    update(:probability_plot_id => plot_id)
    $gridfs.find_one(_id: probability_plot_id).data
  end
end
101
+
102
# Statistics for validations of regression models. The including class has to
# provide #predictions, #model, #warnings, #save, #update, #id and accessors
# for the attributes assigned below.
module RegressionStatistics

  # Computes MAE, RMSE, R^2 and the prediction interval counts from the stored
  # predictions, saves them and returns them. The median of the measurements
  # of a substance is compared with its predicted value; predictions without
  # value or measurements are reported as warnings and do not enter the
  # error averages.
  # @return [Hash] :mae, :rmse, :r_squared, :within_prediction_interval,
  #   :out_of_prediction_interval
  def statistics
    self.rmse = 0
    self.mae = 0
    self.within_prediction_interval = 0
    self.out_of_prediction_interval = 0
    x = []
    y = []
    predictions.each do |cid,pred|
      if pred[:value] and pred[:measurements]
        measured = pred[:measurements].median
        x << measured
        y << pred[:value]
        error = pred[:value]-measured
        self.rmse += error**2
        self.mae += error.abs
        interval = pred[:prediction_interval]
        if interval
          if measured >= interval[0] and measured <= interval[1]
            self.within_prediction_interval += 1
          else
            self.out_of_prediction_interval += 1
          end
        end
      else
        # cid is the key of the prediction at hand
        message = "No training activities for #{Compound.find(cid).smiles} in training dataset #{model.training_dataset_id}."
        warnings << message
        $logger.debug message
      end
    end
    R.assign "measurement", x
    R.assign "prediction", y
    R.eval "r <- cor(measurement,prediction,use='pairwise')"
    self.r_squared = R.eval("r").to_ruby**2
    # average over the predictions that contributed an error term
    evaluated = x.size.to_f
    self.mae = self.mae/evaluated
    self.rmse = Math.sqrt(self.rmse/evaluated)
    $logger.debug "R^2 #{r_squared}"
    $logger.debug "RMSE #{rmse}"
    $logger.debug "MAE #{mae}"
    $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
    save
    {
      :mae => mae,
      :rmse => rmse,
      :r_squared => r_squared,
      :within_prediction_interval => within_prediction_interval,
      :out_of_prediction_interval => out_of_prediction_interval,
    }
  end

  # @return [Float] percentage of interval-checked measurements that lie
  #   inside their prediction interval (NaN if none were checked)
  def percent_within_prediction_interval
    100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
  end

  # Returns a plot of measurements against predictions. The image is created
  # and stored in GridFS on the first call; later calls return the stored
  # file, whatever format is requested.
  # @param format [String] file extension passed to ggsave (default "png")
  # @return the data of the stored plot file
  def correlation_plot format: "png"
    unless correlation_plot_id
      tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
      feature = Feature.find(predictions.first.last["prediction_feature_id"])
      x = predictions.collect{|_sid,p| p["measurements"].median}
      y = predictions.collect{|_sid,p| p["value"]}
      R.assign "measurement", x
      R.assign "prediction", y
      R.eval "all = c(measurement,prediction)"
      R.eval "range = c(min(all), max(all))"
      title = feature.name
      title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
      R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
      R.eval "image = image + geom_abline(intercept=0, slope=1)"
      R.eval "ggsave(file='#{tmpfile}', plot=image)"
      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
      plot_id = $gridfs.insert_one(file)
      update(:correlation_plot_id => plot_id)
    end
    $gridfs.find_one(_id: correlation_plot_id).data
  end

  # Lists the predictions with the largest absolute error.
  # @param n [Integer] number of predictions to return
  # @param show_neigbors [Boolean] list the neighbors (true) or only count them
  # @param show_common_descriptors [Boolean] describe the common descriptors of
  #   every neighbor (true) or only count them
  # @return [Array<Hash>] one hash per prediction with :id, :name, :feature,
  #   :error, :prediction, :measurements and :neighbors
  def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
    worst = predictions.sort_by{|_sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
    worst.collect do |sid,prediction|
      substance = Substance.find(sid)
      if show_neigbors
        neighbors = prediction["neighbors"].collect do |neighbor|
          if show_common_descriptors
            # NOTE(review): d is handed to Feature.find and also indexed like a hash — confirm its shape
            common_descriptors = neighbor["common_descriptors"].collect do |d|
              f=Feature.find(d)
              {
                :id => f.id.to_s,
                :name => "#{f.name} (#{f.conditions})",
                :p_value => d[:p_value],
                :r_squared => d[:r_squared],
              }
            end
          else
            common_descriptors = neighbor["common_descriptors"].size
          end
          {
            :name => Substance.find(neighbor["_id"]).name,
            :id => neighbor["_id"].to_s,
            :common_descriptors => common_descriptors
          }
        end
      else
        neighbors = prediction["neighbors"].size
      end
      {
        :id => substance.id.to_s,
        :name => substance.name,
        :feature => Feature.find(prediction["prediction_feature_id"]).name,
        :error => (prediction["value"] - prediction["measurements"].median).abs,
        :prediction => prediction["value"],
        :measurements => prediction["measurements"],
        :neighbors => neighbors
      }
    end
  end
end
222
+ end
223
+ end