lazar 0.9.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
data/lib/overwrite.rb CHANGED
@@ -28,6 +28,11 @@ class Float
28
28
  def signif(n)
29
29
  Float("%.#{n}g" % self)
30
30
  end
31
+
32
+ # converts -10 logarithmized values back
33
+ def delog10
34
+ 10**(-1*self)
35
+ end
31
36
  end
32
37
 
33
38
  module Enumerable
@@ -101,19 +106,35 @@ class Array
101
106
  end
102
107
 
103
108
  def mean
104
- self.inject{ |sum, el| sum + el }.to_f / self.size
109
+ self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
105
110
  end
106
111
 
107
112
  def sample_variance
108
113
  m = self.mean
109
- sum = self.inject(0){|accum, i| accum +(i-m)**2 }
110
- sum/(self.length - 1).to_f
114
+ sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
115
+ sum/(self.compact.length - 1).to_f
111
116
  end
112
117
 
113
118
  def standard_deviation
114
119
  Math.sqrt(self.sample_variance)
115
120
  end
116
121
 
122
+ def for_R
123
+ if self.first.is_a?(String)
124
+ #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets
125
+ "NA"
126
+ else
127
+ self.median
128
+ end
129
+ end
130
+
131
+ def collect_with_index
132
+ result = []
133
+ self.each_with_index do |elt, idx|
134
+ result << yield(elt, idx)
135
+ end
136
+ result
137
+ end
117
138
  end
118
139
 
119
140
  module URI
data/lib/physchem.rb CHANGED
@@ -14,7 +14,7 @@ module OpenTox
14
14
  JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
15
15
 
16
16
  obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
17
- OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
17
+ OPENBABEL = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
18
18
  name,description = d.split(/\s+/,2)
19
19
  ["Openbabel."+name,description] unless obexclude.include? name
20
20
  end.compact.sort{|a,b| a[0] <=> b[0]}]
@@ -25,24 +25,24 @@ module OpenTox
25
25
  prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'')
26
26
  d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] }
27
27
  end
28
- CDKDESCRIPTORS = cdkdescriptors
28
+ CDK = cdkdescriptors
29
29
 
30
30
  # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
31
31
  joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
32
32
  # strip Joelib messages from stdout
33
- JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
33
+ JOELIB = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
34
34
  name = d[:java_class].sub(/^joelib2.feature.types./,'')
35
35
  ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name
36
36
  end.compact.sort{|a,b| a[0] <=> b[0]}]
37
37
 
38
- DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
38
+ DESCRIPTORS = OPENBABEL.merge(CDK.merge(JOELIB))
39
39
 
40
40
  require_relative "unique_descriptors.rb"
41
41
 
42
42
  def self.descriptors desc=DESCRIPTORS
43
43
  desc.collect do |name,description|
44
44
  lib,desc = name.split('.',2)
45
- self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
45
+ self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
46
46
  end
47
47
  end
48
48
 
@@ -54,26 +54,26 @@ module OpenTox
54
54
  CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n|
55
55
  dname = "#{name}.#{n}"
56
56
  description = DESCRIPTORS[dname]
57
- udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
57
+ udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
58
58
  end
59
59
  else
60
60
  description = DESCRIPTORS[name]
61
- udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
61
+ udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
62
62
  end
63
63
  end
64
64
  udesc
65
65
  end
66
66
 
67
67
  def self.openbabel_descriptors
68
- descriptors OBDESCRIPTORS
68
+ descriptors OPENBABEL
69
69
  end
70
70
 
71
71
  def self.cdk_descriptors
72
- descriptors CDKDESCRIPTORS
72
+ descriptors CDK
73
73
  end
74
74
 
75
75
  def self.joelib_descriptors
76
- descriptors JOELIBDESCRIPTORS
76
+ descriptors JOELIB
77
77
  end
78
78
 
79
79
  def calculate compound
@@ -131,3 +131,4 @@ module OpenTox
131
131
  end
132
132
 
133
133
  end
134
+ OpenTox::PhysChem.descriptors # load descriptor features
data/lib/regression.rb CHANGED
@@ -3,148 +3,18 @@ module OpenTox
3
3
 
4
4
  class Regression
5
5
 
6
- def self.local_weighted_average compound, params
6
+ def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
7
+ # TODO: prediction_interval
7
8
  weighted_sum = 0.0
8
9
  sim_sum = 0.0
9
- neighbors = params[:neighbors]
10
- neighbors.each do |row|
11
- sim = row["tanimoto"]
12
- if row["features"][params[:prediction_feature_id].to_s]
13
- row["features"][params[:prediction_feature_id].to_s].each do |act|
14
- weighted_sum += sim*Math.log10(act)
15
- sim_sum += sim
16
- end
17
- end
18
- end
19
- sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
10
+ dependent_variables.each_with_index do |v,i|
11
+ weighted_sum += weights[i]*dependent_variables[i]
12
+ sim_sum += weights[i]
13
+ end if dependent_variables
14
+ sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
20
15
  {:value => prediction}
21
16
  end
22
17
 
23
- # TODO explicit neighbors, also for physchem
24
- def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
25
- neighbors = params[:neighbors]
26
- return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
27
- activities = []
28
- fingerprints = {}
29
- weights = []
30
- fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
31
-
32
- neighbors.each_with_index do |row,i|
33
- neighbor = Compound.find row["_id"]
34
- fingerprint = neighbor.fingerprint
35
- if row["features"][params[:prediction_feature_id].to_s]
36
- row["features"][params[:prediction_feature_id].to_s].each do |act|
37
- activities << Math.log10(act)
38
- weights << row["tanimoto"]
39
- fingerprint_ids.each_with_index do |id,j|
40
- fingerprints[id] ||= []
41
- fingerprints[id] << fingerprint.include?(id)
42
- end
43
- end
44
- end
45
- end
46
-
47
- variables = []
48
- data_frame = [activities]
49
- fingerprints.each do |k,v|
50
- unless v.uniq.size == 1
51
- data_frame << v.collect{|m| m ? "T" : "F"}
52
- variables << k
53
- end
54
- end
55
-
56
- if variables.empty?
57
- result = local_weighted_average(compound, params)
58
- result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
59
- return result
60
-
61
- else
62
- compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
63
- prediction = r_model_prediction method, data_frame, variables, weights, compound_features
64
- if prediction.nil? or prediction[:value].nil?
65
- prediction = local_weighted_average(compound, params)
66
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
67
- return prediction
68
- else
69
- prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
70
- prediction[:value] = 10**prediction[:value]
71
- prediction[:rmse] = 10**prediction[:rmse]
72
- prediction
73
- end
74
- end
75
-
76
- end
77
-
78
- def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4"
79
-
80
- neighbors = params[:neighbors]
81
- return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
82
- return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
83
-
84
- activities = []
85
- weights = []
86
- physchem = {}
87
-
88
- neighbors.each_with_index do |row,i|
89
- neighbor = Compound.find row["_id"]
90
- if row["features"][params[:prediction_feature_id].to_s]
91
- row["features"][params[:prediction_feature_id].to_s].each do |act|
92
- activities << Math.log10(act)
93
- weights << row["tanimoto"] # TODO cosine ?
94
- neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
95
- physchem[pid] ||= []
96
- physchem[pid] << v
97
- end
98
- end
99
- end
100
- end
101
-
102
- # remove properties with a single value
103
- physchem.each do |pid,v|
104
- physchem.delete(pid) if v.uniq.size <= 1
105
- end
106
-
107
- if physchem.empty?
108
- result = local_weighted_average(compound, params)
109
- result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
110
- return result
111
-
112
- else
113
- data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
114
- prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
115
- if prediction.nil?
116
- prediction = local_weighted_average(compound, params)
117
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
118
- return prediction
119
- else
120
- prediction[:value] = 10**prediction[:value]
121
- prediction
122
- end
123
- end
124
-
125
- end
126
-
127
- def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
128
- R.assign "weights", training_weights
129
- r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
130
- R.eval "data <- #{r_data_frame}"
131
- R.assign "features", training_features
132
- R.eval "names(data) <- append(c('activities'),features)" #
133
- begin
134
- R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"
135
- rescue
136
- return nil
137
- end
138
- R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
139
- R.eval "names(fingerprint) <- features"
140
- R.eval "prediction <- predict(model,fingerprint)"
141
- {
142
- :value => R.eval("prediction").to_f,
143
- :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
144
- :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
145
- }
146
- end
147
-
148
18
  end
149
19
  end
150
20
  end
@@ -55,14 +55,8 @@ module OpenTox
55
55
  if [301, 302, 307].include? response.code and request.method == :get
56
56
  response.follow_redirection(request, result)
57
57
  elsif response.code >= 400 and !URI.task?(uri)
58
- #TODO add parameters to error-report
59
- #parameters = request.args
60
- #parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid]
61
- #parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url]
62
- #message += "\nREST parameters:\n#{parameters.inspect}"
63
58
  error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first
64
59
  begin # errors are returned as error reports in json, try to parse
65
- # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task
66
60
  content = JSON.parse(response)
67
61
  msg = content["message"].to_s
68
62
  cause = content["errorCause"].to_s
data/lib/similarity.rb ADDED
@@ -0,0 +1,65 @@
1
+ module OpenTox
2
+ module Algorithm
3
+
4
+ class Vector
5
+ def self.dot_product(a, b)
6
+ products = a.zip(b).map{|a, b| a * b}
7
+ products.inject(0) {|s,p| s + p}
8
+ end
9
+
10
+ def self.magnitude(point)
11
+ squares = point.map{|x| x ** 2}
12
+ Math.sqrt(squares.inject(0) {|s, c| s + c})
13
+ end
14
+ end
15
+
16
+ class Similarity
17
+
18
+ def self.tanimoto fingerprints
19
+ ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
20
+ end
21
+
22
+ #def self.weighted_tanimoto fingerprints
23
+ #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
24
+ #end
25
+
26
+ def self.euclid scaled_properties
27
+ sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
28
+ Math.sqrt(sq.inject(0) {|s,c| s + c})
29
+ end
30
+
31
+ # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
32
+ def self.cosine scaled_properties
33
+ scaled_properties = remove_nils scaled_properties
34
+ Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
35
+ end
36
+
37
+ def self.weighted_cosine scaled_properties # [a,b,weights]
38
+ a,b,w = remove_nils scaled_properties
39
+ return cosine(scaled_properties) if w.uniq.size == 1
40
+ dot_product = 0
41
+ magnitude_a = 0
42
+ magnitude_b = 0
43
+ (0..a.size-1).each do |i|
44
+ dot_product += w[i].abs*a[i]*b[i]
45
+ magnitude_a += w[i].abs*a[i]**2
46
+ magnitude_b += w[i].abs*b[i]**2
47
+ end
48
+ dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
49
+ end
50
+
51
+ def self.remove_nils scaled_properties
52
+ a =[]; b = []; w = []
53
+ (0..scaled_properties.first.size-1).each do |i|
54
+ if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan?
55
+ a << scaled_properties[0][i]
56
+ b << scaled_properties[1][i]
57
+ w << scaled_properties[2][i]
58
+ end
59
+ end
60
+ [a,b,w]
61
+ end
62
+
63
+ end
64
+ end
65
+ end
data/lib/substance.rb ADDED
@@ -0,0 +1,8 @@
1
+ module OpenTox
2
+
3
+ class Substance
4
+ field :properties, type: Hash, default: {}
5
+ field :dataset_ids, type: Array, default: []
6
+ end
7
+
8
+ end
@@ -0,0 +1,69 @@
1
+ module OpenTox
2
+
3
+ module Validation
4
+
5
+ class TrainTest < Validation
6
+
7
+ field :training_dataset_id, type: BSON::ObjectId
8
+ field :test_dataset_id, type: BSON::ObjectId
9
+
10
+ def self.create model, training_set, test_set
11
+
12
+ validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
13
+ validation_model.save
14
+ predictions = validation_model.predict test_set.substances
15
+ nr_unpredicted = 0
16
+ predictions.each do |cid,prediction|
17
+ if prediction[:value]
18
+ prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id])
19
+ else
20
+ nr_unpredicted += 1
21
+ end
22
+ end
23
+ predictions.select!{|cid,p| p[:value] and p[:measurements]}
24
+ validation = self.new(
25
+ :model_id => validation_model.id,
26
+ :test_dataset_id => test_set.id,
27
+ :nr_instances => test_set.substances.size,
28
+ :nr_unpredicted => nr_unpredicted,
29
+ :predictions => predictions
30
+ )
31
+ validation.save
32
+ validation
33
+ end
34
+
35
+ def test_dataset
36
+ Dataset.find test_dataset_id
37
+ end
38
+
39
+ def training_dataset
40
+ Dataset.find training_dataset_id
41
+ end
42
+
43
+ end
44
+
45
+ class ClassificationTrainTest < TrainTest
46
+ include ClassificationStatistics
47
+ field :accept_values, type: Array
48
+ field :confusion_matrix, type: Array
49
+ field :weighted_confusion_matrix, type: Array
50
+ field :accuracy, type: Float
51
+ field :weighted_accuracy, type: Float
52
+ field :true_rate, type: Hash
53
+ field :predictivity, type: Hash
54
+ field :probability_plot_id, type: BSON::ObjectId
55
+ end
56
+
57
+ class RegressionTrainTest < TrainTest
58
+ include RegressionStatistics
59
+ field :rmse, type: Float, default:0
60
+ field :mae, type: Float, default:0
61
+ field :r_squared, type: Float
62
+ field :within_prediction_interval, type: Integer, default:0
63
+ field :out_of_prediction_interval, type: Integer, default:0
64
+ field :correlation_plot_id, type: BSON::ObjectId
65
+ end
66
+
67
+ end
68
+
69
+ end
@@ -0,0 +1,223 @@
1
+ module OpenTox
2
+ module Validation
3
+ module ClassificationStatistics
4
+
5
+ def statistics
6
+ self.accept_values = model.prediction_feature.accept_values
7
+ self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
8
+ self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
9
+ nr_instances = 0
10
+ predictions.each do |cid,pred|
11
+ # TODO
12
+ # use predictions without probabilities (single neighbor)??
13
+ # use measured majority class??
14
+ if pred[:measurements].uniq.size == 1 and pred[:probabilities]
15
+ m = pred[:measurements].first
16
+ if pred[:value] == m
17
+ if pred[:value] == accept_values[0]
18
+ confusion_matrix[0][0] += 1
19
+ weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
20
+ nr_instances += 1
21
+ elsif pred[:value] == accept_values[1]
22
+ confusion_matrix[1][1] += 1
23
+ weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
24
+ nr_instances += 1
25
+ end
26
+ elsif pred[:value] != m
27
+ if pred[:value] == accept_values[0]
28
+ confusion_matrix[0][1] += 1
29
+ weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
30
+ nr_instances += 1
31
+ elsif pred[:value] == accept_values[1]
32
+ confusion_matrix[1][0] += 1
33
+ weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
34
+ nr_instances += 1
35
+ end
36
+ end
37
+ end
38
+ end
39
+ self.true_rate = {}
40
+ self.predictivity = {}
41
+ accept_values.each_with_index do |v,i|
42
+ self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
43
+ self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
44
+ end
45
+ confidence_sum = 0
46
+ weighted_confusion_matrix.each do |r|
47
+ r.each do |c|
48
+ confidence_sum += c
49
+ end
50
+ end
51
+ self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
52
+ self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
53
+ $logger.debug "Accuracy #{accuracy}"
54
+ save
55
+ {
56
+ :accept_values => accept_values,
57
+ :confusion_matrix => confusion_matrix,
58
+ :weighted_confusion_matrix => weighted_confusion_matrix,
59
+ :accuracy => accuracy,
60
+ :weighted_accuracy => weighted_accuracy,
61
+ :true_rate => self.true_rate,
62
+ :predictivity => self.predictivity,
63
+ }
64
+ end
65
+
66
+ def probability_plot format: "pdf"
67
+ #unless probability_plot_id
68
+
69
+ #tmpdir = File.join(ENV["HOME"], "tmp")
70
+ tmpdir = "/tmp"
71
+ #p tmpdir
72
+ FileUtils.mkdir_p tmpdir
73
+ tmpfile = File.join(tmpdir,"#{id.to_s}_probability.#{format}")
74
+ accuracies = []
75
+ probabilities = []
76
+ correct_predictions = 0
77
+ incorrect_predictions = 0
78
+ pp = []
79
+ predictions.values.select{|p| p["probabilities"]}.compact.each do |p|
80
+ p["measurements"].each do |m|
81
+ pp << [ p["probabilities"][p["value"]], p["value"] == m ]
82
+ end
83
+ end
84
+ pp.sort_by!{|p| 1-p.first}
85
+ pp.each do |p|
86
+ p[1] ? correct_predictions += 1 : incorrect_predictions += 1
87
+ accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
88
+ probabilities << p[0]
89
+ end
90
+ R.assign "accuracy", accuracies
91
+ R.assign "probability", probabilities
92
+ R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()"
93
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
94
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg")
95
+ plot_id = $gridfs.insert_one(file)
96
+ update(:probability_plot_id => plot_id)
97
+ #end
98
+ $gridfs.find_one(_id: probability_plot_id).data
99
+ end
100
+ end
101
+
102
+ module RegressionStatistics
103
+
104
+ def statistics
105
+ self.rmse = 0
106
+ self.mae = 0
107
+ self.within_prediction_interval = 0
108
+ self.out_of_prediction_interval = 0
109
+ x = []
110
+ y = []
111
+ predictions.each do |cid,pred|
112
+ if pred[:value] and pred[:measurements]
113
+ x << pred[:measurements].median
114
+ y << pred[:value]
115
+ error = pred[:value]-pred[:measurements].median
116
+ self.rmse += error**2
117
+ self.mae += error.abs
118
+ if pred[:prediction_interval]
119
+ if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
120
+ self.within_prediction_interval += 1
121
+ else
122
+ self.out_of_prediction_interval += 1
123
+ end
124
+ end
125
+ else
126
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
127
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
128
+ end
129
+ end
130
+ R.assign "measurement", x
131
+ R.assign "prediction", y
132
+ R.eval "r <- cor(measurement,prediction,use='pairwise')"
133
+ self.r_squared = R.eval("r").to_ruby**2
134
+ self.mae = self.mae/predictions.size
135
+ self.rmse = Math.sqrt(self.rmse/predictions.size)
136
+ $logger.debug "R^2 #{r_squared}"
137
+ $logger.debug "RMSE #{rmse}"
138
+ $logger.debug "MAE #{mae}"
139
+ $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
140
+ save
141
+ {
142
+ :mae => mae,
143
+ :rmse => rmse,
144
+ :r_squared => r_squared,
145
+ :within_prediction_interval => within_prediction_interval,
146
+ :out_of_prediction_interval => out_of_prediction_interval,
147
+ }
148
+ end
149
+
150
+ def percent_within_prediction_interval
151
+ 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
152
+ end
153
+
154
+ def correlation_plot format: "png"
155
+ unless correlation_plot_id
156
+ tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
157
+ x = []
158
+ y = []
159
+ feature = Feature.find(predictions.first.last["prediction_feature_id"])
160
+ predictions.each do |sid,p|
161
+ x << p["measurements"].median
162
+ y << p["value"]
163
+ end
164
+ R.assign "measurement", x
165
+ R.assign "prediction", y
166
+ R.eval "all = c(measurement,prediction)"
167
+ R.eval "range = c(min(all), max(all))"
168
+ title = feature.name
169
+ title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
170
+ R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
171
+ R.eval "image = image + geom_abline(intercept=0, slope=1)"
172
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
173
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
174
+ plot_id = $gridfs.insert_one(file)
175
+ update(:correlation_plot_id => plot_id)
176
+ end
177
+ $gridfs.find_one(_id: correlation_plot_id).data
178
+ end
179
+
180
+ def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
181
+ worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
182
+ worst_predictions.collect do |p|
183
+ substance = Substance.find(p.first)
184
+ prediction = p[1]
185
+ if show_neigbors
186
+ neighbors = prediction["neighbors"].collect do |n|
187
+ common_descriptors = []
188
+ if show_common_descriptors
189
+ common_descriptors = n["common_descriptors"].collect do |d|
190
+ f=Feature.find(d)
191
+ {
192
+ :id => f.id.to_s,
193
+ :name => "#{f.name} (#{f.conditions})",
194
+ :p_value => d[:p_value],
195
+ :r_squared => d[:r_squared],
196
+ }
197
+ end
198
+ else
199
+ common_descriptors = n["common_descriptors"].size
200
+ end
201
+ {
202
+ :name => Substance.find(n["_id"]).name,
203
+ :id => n["_id"].to_s,
204
+ :common_descriptors => common_descriptors
205
+ }
206
+ end
207
+ else
208
+ neighbors = prediction["neighbors"].size
209
+ end
210
+ {
211
+ :id => substance.id.to_s,
212
+ :name => substance.name,
213
+ :feature => Feature.find(prediction["prediction_feature_id"]).name,
214
+ :error => (prediction["value"] - prediction["measurements"].median).abs,
215
+ :prediction => prediction["value"],
216
+ :measurements => prediction["measurements"],
217
+ :neighbors => neighbors
218
+ }
219
+ end
220
+ end
221
+ end
222
+ end
223
+ end