lazar 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c17dc3fb7cae4c75aca1be7c0a6286cfbc3f22ce
4
- data.tar.gz: 5b9fb4bae6230e427188e0c8e34153fd5a6efa0a
3
+ metadata.gz: 698fd96821077269f4c31fdfca6bead6beab36f0
4
+ data.tar.gz: 2fd49abf99e8f5367b83764735c5c2e49caad4d2
5
5
  SHA512:
6
- metadata.gz: 7cae1ffb410cd9a2d1afd1516ebf99499e2b2447af8707a4381adb652cb59711e1875c11e80cec8fc101f8368224ab21bc378f685b0084ab29c631d798145dca
7
- data.tar.gz: d01273022852b6a0b59941a0e881a85ed1400a984912018d97fc137f8ab602cff1fd6f5fb42a65df5d9375cb43ca2809adeefbde7a0e385fd832c189df0da031
6
+ metadata.gz: ef5d243c765f5d1d6bb4c2dbd53bf2cdcca380df532cef8c48447b907ff01086f1b50c261e1f494827a30e52e7d8e62e2c077334cdbe30370546680ffd018886
7
+ data.tar.gz: d9bb905832388dcb44bb3211fb6976c4e0894d75eb02fdf60c17fe19f25dec73ba24b5585fea8cb85e77b6793b46518462cf1d9b2488c465332b9db3af132028
data/README.md CHANGED
@@ -59,7 +59,75 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
59
59
 
60
60
  #### Experiment with other algorithms
61
61
 
62
- You can pass algorithms parameters to the `Model::Validation.create_from_csv_file` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions.
62
+ You can pass algorithm specifications as parameters to the `Model::Validation.create_from_csv_file` and `Model::Lazar.create` commands. Algorithms for descriptors, similarity calculations, feature selection and local models are specified in the `algorithms` parameter. Unspecified algorithms and parameters are substituted by default values. The example below selects
63
+
64
+ - MP2D fingerprint descriptors
65
+ - Tanimoto similarity with a threshold of 0.1
66
+ - no feature selection
67
+ - weighted majority vote predictions
68
+
69
+ ```
70
+ algorithms = {
71
+ :descriptors => { # descriptor algorithm
72
+ :method => "fingerprint", # fingerprint descriptors
73
+ :type => "MP2D" # fingerprint type, e.g. FP4, MACCS
74
+ },
75
+ :similarity => { # similarity algorithm
76
+ :method => "Algorithm::Similarity.tanimoto",
77
+ :min => 0.1 # similarity threshold for neighbors
78
+ },
79
+ :feature_selection => nil, # no feature selection
80
+ :prediction => { # local modelling algorithm
81
+ :method => "Algorithm::Classification.weighted_majority_vote",
82
+ },
83
+ }
84
+
85
+ training_dataset = Dataset.from_csv_file "hamster_carcinogenicity.csv"
86
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
87
+ ```
88
+
89
+ The next example creates a regression model with
90
+
91
+ - calculated descriptors from OpenBabel libraries
92
+ - weighted cosine similarity and a threshold of 0.5
93
+ - descriptors that are correlated with the endpoint
94
+ - local partial least squares models from the R caret package
95
+
96
+ ```
97
+ algorithms = {
98
+ :descriptors => { # descriptor algorithm
99
+ :method => "calculate_properties",
100
+ :features => PhysChem.openbabel_descriptors,
101
+ },
102
+ :similarity => { # similarity algorithm
103
+ :method => "Algorithm::Similarity.weighted_cosine",
104
+ :min => 0.5
105
+ },
106
+ :feature_selection => { # feature selection algorithm
107
+ :method => "Algorithm::FeatureSelection.correlation_filter",
108
+ },
109
+ :prediction => { # local modelling algorithm
110
+ :method => "Algorithm::Caret.pls",
111
+ },
112
+ }
113
+ training_dataset = Dataset.from_csv_file "EPAFHM_log10.csv"
114
+ model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
115
+ ```
116
+
117
+ Please consult the [API documentation](http://rdoc.info/gems/lazar) and [source code](https://github.com/opentox/lazar) for up-to-date information about implemented algorithms:
118
+
119
+ - Descriptor algorithms
120
+ - [Compounds](http://www.rubydoc.info/gems/lazar/OpenTox/Compound)
121
+ - [Nanoparticles](http://www.rubydoc.info/gems/lazar/OpenTox/Nanoparticle)
122
+ - [Similarity algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Similarity)
123
+ - [Feature selection algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/FeatureSelection)
124
+ - Local models
125
+ - [Classification](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Classification)
126
+ - [Regression](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Regression)
127
+ - [R caret](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Caret)
128
+
129
+
130
+ You can find more working examples in the `lazar` `model-*.rb` and `validation-*.rb` [tests](https://github.com/opentox/lazar/tree/master/test).
63
131
 
64
132
  ### Create and use `lazar` nanoparticle models
65
133
 
@@ -87,7 +155,35 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
87
155
 
88
156
  #### Experiment with other datasets, endpoints and algorithms
89
157
 
90
- You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions. Detailed documentation and validation results can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf).
158
+ You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.from_enanomapper` command. Procedure and options are the same as for compounds. The following commands create and validate a `nano-lazar` model with
159
+
160
+ - measured P-CHEM properties as descriptors
161
+ - descriptors selected with correlation filter
162
+ - weighted cosine similarity with a threshold of 0.5
163
+ - Caret random forests
164
+
165
+ ```
166
+ algorithms = {
167
+ :descriptors => {
168
+ :method => "properties",
169
+ :categories => ["P-CHEM"],
170
+ },
171
+ :similarity => {
172
+ :method => "Algorithm::Similarity.weighted_cosine",
173
+ :min => 0.5
174
+ },
175
+ :feature_selection => {
176
+ :method => "Algorithm::FeatureSelection.correlation_filter",
177
+ },
178
+ :prediction => {
179
+ :method => "Algorithm::Caret.rf",
180
+ },
181
+ }
182
+ validation_model = Model::Validation.from_enanomapper algorithms: algorithms
183
+ ```
184
+
185
+
186
+ Detailed documentation and validation results for nanoparticle models can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf).
91
187
 
92
188
  Documentation
93
189
  -------------
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.1
1
+ 1.1.0
data/lib/caret.rb CHANGED
@@ -22,12 +22,11 @@ module OpenTox
22
22
  end
23
23
  if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
24
24
  prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
25
- prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
25
+ prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances."
26
26
  elsif
27
27
  dependent_variables.size < 3
28
28
  prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
29
- prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
30
-
29
+ prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
31
30
  else
32
31
  dependent_variables.each_with_index do |v,i|
33
32
  dependent_variables[i] = to_r(v)
@@ -52,7 +51,7 @@ module OpenTox
52
51
  $logger.debug dependent_variables
53
52
  $logger.debug independent_variables
54
53
  prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
55
- prediction[:warning] = "R caret model creation error. Using weighted average of similar substances."
54
+ prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances."
56
55
  return prediction
57
56
  end
58
57
  begin
@@ -73,12 +72,12 @@ module OpenTox
73
72
  $logger.debug "R caret prediction error for:"
74
73
  $logger.debug self.inspect
75
74
  prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
76
- prediction[:warning] = "R caret prediction error. Using weighted average of similar substances"
75
+ prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances"
77
76
  return prediction
78
77
  end
79
78
  if prediction.nil? or prediction[:value].nil?
80
79
  prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
81
- prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
80
+ prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances."
82
81
  end
83
82
  end
84
83
  prediction
@@ -18,6 +18,11 @@ module OpenTox
18
18
  class_weights.each do |a,w|
19
19
  probabilities[a] = w.sum/weights.sum
20
20
  end
21
+ # DG: hack to ensure always two probability values
22
+ if probabilities.keys.uniq.size == 1
23
+ missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0]
24
+ probabilities[missing_key] = 0.0
25
+ end
21
26
  probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
22
27
  p_max = probabilities.collect{|a,p| p}.max
23
28
  prediction = probabilities.key(p_max)
@@ -90,6 +90,7 @@ module OpenTox
90
90
  field :within_prediction_interval, type: Integer, default:0
91
91
  field :out_of_prediction_interval, type: Integer, default:0
92
92
  field :correlation_plot_id, type: BSON::ObjectId
93
+ field :warnings, type: Array
93
94
  end
94
95
 
95
96
  # Independent repeated crossvalidations
data/lib/dataset.rb CHANGED
@@ -46,7 +46,7 @@ module OpenTox
46
46
  if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
47
47
  data_entries[substance.to_s][feature.to_s]
48
48
  else
49
- nil
49
+ [nil]
50
50
  end
51
51
  end
52
52
 
data/lib/lazar.rb CHANGED
@@ -16,16 +16,19 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple
16
16
 
17
17
  ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"]
18
18
  ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
19
+ # search for a central mongo database in use
20
+ # http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment
21
+ CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
19
22
  Mongoid.load_configuration({
20
23
  :clients => {
21
24
  :default => {
22
25
  :database => ENV["LAZAR_ENV"],
23
- :hosts => ["localhost:27017"],
26
+ :hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
24
27
  }
25
28
  }
26
29
  })
27
30
  Mongoid.raise_not_found_error = false # return nil if no document is found
28
- $mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}")
31
+ $mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
29
32
  $gridfs = $mongo.database.fs
30
33
 
31
34
  # Logger setup
@@ -58,6 +58,7 @@ module OpenTox
58
58
  field :within_prediction_interval, type: Integer, default:0
59
59
  field :out_of_prediction_interval, type: Integer, default:0
60
60
  field :correlation_plot_id, type: BSON::ObjectId
61
+ field :warnings, type: Array
61
62
  end
62
63
 
63
64
  end
data/lib/model.rb CHANGED
@@ -57,7 +57,7 @@ module OpenTox
57
57
  model.version = {:warning => "git is not installed"}
58
58
  end
59
59
 
60
- # set defaults
60
+ # set defaults#
61
61
  substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
62
62
  bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
63
63
 
@@ -68,10 +68,6 @@ module OpenTox
68
68
  :method => "fingerprint",
69
69
  :type => "MP2D",
70
70
  },
71
- :similarity => {
72
- :method => "Algorithm::Similarity.tanimoto",
73
- :min => 0.1
74
- },
75
71
  :feature_selection => nil
76
72
  }
77
73
 
@@ -79,9 +75,17 @@ module OpenTox
79
75
  model.algorithms[:prediction] = {
80
76
  :method => "Algorithm::Classification.weighted_majority_vote",
81
77
  }
78
+ model.algorithms[:similarity] = {
79
+ :method => "Algorithm::Similarity.tanimoto",
80
+ :min => 0.1,
81
+ }
82
82
  elsif model.class == LazarRegression
83
83
  model.algorithms[:prediction] = {
84
- :method => "Algorithm::Caret.pls",
84
+ :method => "Algorithm::Caret.rf",
85
+ }
86
+ model.algorithms[:similarity] = {
87
+ :method => "Algorithm::Similarity.tanimoto",
88
+ :min => 0.5,
85
89
  }
86
90
  end
87
91
 
@@ -93,7 +97,7 @@ module OpenTox
93
97
  },
94
98
  :similarity => {
95
99
  :method => "Algorithm::Similarity.weighted_cosine",
96
- :min => 0.5
100
+ :min => 0.5,
97
101
  },
98
102
  :prediction => {
99
103
  :method => "Algorithm::Caret.rf",
@@ -141,7 +145,6 @@ module OpenTox
141
145
  end
142
146
  model.descriptor_ids = model.fingerprints.flatten.uniq
143
147
  model.descriptor_ids.each do |d|
144
- # resulting model may break BSON size limit (e.g. f Kazius dataset)
145
148
  model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
146
149
  end
147
150
  # calculate physchem properties
@@ -191,7 +194,7 @@ module OpenTox
191
194
  # Predict a substance (compound or nanoparticle)
192
195
  # @param [OpenTox::Substance]
193
196
  # @return [Hash]
194
- def predict_substance substance
197
+ def predict_substance substance, threshold = self.algorithms[:similarity][:min]
195
198
 
196
199
  @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
197
200
  case algorithms[:similarity][:method]
@@ -221,20 +224,19 @@ module OpenTox
221
224
  bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
222
225
  end
223
226
 
224
- prediction = {}
227
+ prediction = {:warnings => [], :measurements => []}
228
+ prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
225
229
  neighbor_ids = []
226
230
  neighbor_similarities = []
227
231
  neighbor_dependent_variables = []
228
232
  neighbor_independent_variables = []
229
233
 
230
- prediction = {}
231
234
  # find neighbors
232
235
  substance_ids.each_with_index do |s,i|
233
236
  # handle query substance
234
237
  if substance.id.to_s == s
235
- prediction[:measurements] ||= []
236
238
  prediction[:measurements] << dependent_variables[i]
237
- prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
239
+ prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
238
240
  else
239
241
  if fingerprints?
240
242
  neighbor_descriptors = fingerprints[i]
@@ -243,7 +245,7 @@ module OpenTox
243
245
  neighbor_descriptors = scaled_variables.collect{|v| v[i]}
244
246
  end
245
247
  sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
246
- if sim >= algorithms[:similarity][:min]
248
+ if sim >= threshold
247
249
  neighbor_ids << s
248
250
  neighbor_similarities << sim
249
251
  neighbor_dependent_variables << dependent_variables[i]
@@ -258,17 +260,27 @@ module OpenTox
258
260
  measurements = nil
259
261
 
260
262
  if neighbor_similarities.empty?
261
- prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
263
+ prediction[:value] = nil
264
+ prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
262
265
  elsif neighbor_similarities.size == 1
263
- prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
266
+ prediction[:value] = nil
267
+ prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
268
+ prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
264
269
  else
265
270
  query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
266
271
  # call prediction algorithm
267
272
  result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
268
273
  prediction.merge! result
269
274
  prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
275
+ #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
276
+ #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
277
+ #end
278
+ end
279
+ if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
280
+ prediction
281
+ else # try again with a lower threshold
282
+ predict_substance substance, 0.2
270
283
  end
271
- prediction
272
284
  end
273
285
 
274
286
  # Predict a substance (compound or nanoparticle), an array of substances or a dataset
@@ -300,7 +312,7 @@ module OpenTox
300
312
  # serialize result
301
313
  if object.is_a? Substance
302
314
  prediction = predictions[substances.first.id.to_s]
303
- prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
315
+ prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity
304
316
  return prediction
305
317
  elsif object.is_a? Array
306
318
  return predictions
data/lib/regression.rb CHANGED
@@ -17,7 +17,7 @@ module OpenTox
17
17
  sim_sum += weights[i]
18
18
  end if dependent_variables
19
19
  sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
20
- {:value => prediction}
20
+ {:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]}
21
21
  end
22
22
 
23
23
  end
@@ -27,6 +27,8 @@ module OpenTox
27
27
  end
28
28
  end
29
29
  predictions.select!{|cid,p| p[:value] and p[:measurements]}
30
+ # hack to avoid mongos file size limit error on large datasets
31
+ #predictions.each{|cid,p| p[:neighbors] = []} if model.training_dataset.name.match(/mutagenicity/i)
30
32
  validation = self.new(
31
33
  :model_id => validation_model.id,
32
34
  :test_dataset_id => test_set.id,
@@ -48,7 +48,8 @@ UNIQUEDESCRIPTORS = [
48
48
  #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
49
49
  #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
50
50
  "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
51
- "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
51
+ # TODO check why the next descriptor is not present in the CDK_DESCRIPTIONS variable.
52
+ #"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
52
53
  "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
53
54
  "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
54
55
  "Cdk.LargestChain", #Returns the number of atoms in the largest chain
@@ -111,6 +111,7 @@ module OpenTox
111
111
  # Get statistics
112
112
  # @return [Hash]
113
113
  def statistics
114
+ self.warnings = []
114
115
  self.rmse = 0
115
116
  self.mae = 0
116
117
  self.within_prediction_interval = 0
@@ -132,8 +133,10 @@ module OpenTox
132
133
  end
133
134
  end
134
135
  else
135
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
136
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
136
+ trd_id = model.training_dataset_id
137
+ smiles = Compound.find(cid).smiles
138
+ self.warnings << "No training activities for #{smiles} in training dataset #{trd_id}."
139
+ $logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
137
140
  end
138
141
  end
139
142
  R.assign "measurement", x
@@ -146,6 +149,7 @@ module OpenTox
146
149
  $logger.debug "RMSE #{rmse}"
147
150
  $logger.debug "MAE #{mae}"
148
151
  $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
152
+ $logger.debug "#{warnings}"
149
153
  save
150
154
  {
151
155
  :mae => mae,
@@ -179,8 +183,12 @@ module OpenTox
179
183
  R.assign "prediction", y
180
184
  R.eval "all = c(measurement,prediction)"
181
185
  R.eval "range = c(min(all), max(all))"
182
- title = feature.name
183
- title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
186
+ if feature.name.match /Net cell association/ # ad hoc fix for awkward units
187
+ title = "log2(Net cell association [mL/ug(Mg)])"
188
+ else
189
+ title = feature.name
190
+ title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank?
191
+ end
184
192
  R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
185
193
  R.eval "image = image + geom_abline(intercept=0, slope=1)"
186
194
  R.eval "ggsave(file='#{tmpfile}', plot=image)"
@@ -191,51 +199,21 @@ module OpenTox
191
199
  $gridfs.find_one(_id: correlation_plot_id).data
192
200
  end
193
201
 
194
- # Get predictions with the largest difference between predicted and measured values
195
- # @params [Fixnum] number of predictions
196
- # @params [TrueClass,FalseClass,nil] include neighbors
197
- # @params [TrueClass,FalseClass,nil] show common descriptors
202
+ # Get predictions with measurements outside of the prediction interval
198
203
  # @return [Hash]
199
- def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
200
- worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
201
- worst_predictions.collect do |p|
202
- substance = Substance.find(p.first)
203
- prediction = p[1]
204
- if show_neigbors
205
- neighbors = prediction["neighbors"].collect do |n|
206
- common_descriptors = []
207
- if show_common_descriptors
208
- common_descriptors = n["common_descriptors"].collect do |d|
209
- f=Feature.find(d)
210
- {
211
- :id => f.id.to_s,
212
- :name => "#{f.name} (#{f.conditions})",
213
- :p_value => d[:p_value],
214
- :r_squared => d[:r_squared],
215
- }
216
- end
217
- else
218
- common_descriptors = n["common_descriptors"].size
219
- end
220
- {
221
- :name => Substance.find(n["_id"]).name,
222
- :id => n["_id"].to_s,
223
- :common_descriptors => common_descriptors
224
- }
225
- end
226
- else
227
- neighbors = prediction["neighbors"].size
204
+ def worst_predictions
205
+ worst_predictions = predictions.select do |sid,p|
206
+ p["prediction_interval"] and p["value"] and (p["measurements"].max < p["prediction_interval"][0] or p["measurements"].min > p["prediction_interval"][1])
207
+ end.compact.to_h
208
+ worst_predictions.each do |sid,p|
209
+ p["error"] = (p["value"] - p["measurements"].median).abs
210
+ if p["measurements"].max < p["prediction_interval"][0]
211
+ p["distance_prediction_interval"] = (p["measurements"].max - p["prediction_interval"][0]).abs
212
+ elsif p["measurements"].min > p["prediction_interval"][1]
213
+ p["distance_prediction_interval"] = (p["measurements"].min - p["prediction_interval"][1]).abs
228
214
  end
229
- {
230
- :id => substance.id.to_s,
231
- :name => substance.name,
232
- :feature => Feature.find(prediction["prediction_feature_id"]).name,
233
- :error => (prediction["value"] - prediction["measurements"].median).abs,
234
- :prediction => prediction["value"],
235
- :measurements => prediction["measurements"],
236
- :neighbors => neighbors
237
- }
238
215
  end
216
+ worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h
239
217
  end
240
218
  end
241
219
  end
data/test/dataset.rb CHANGED
@@ -160,7 +160,7 @@ class DatasetTest < MiniTest::Test
160
160
  if v.numeric?
161
161
  assert_equal v.to_f, serialized[inchi][i].to_f
162
162
  else
163
- assert_equal v, serialized[inchi][i]
163
+ assert_equal v.to_s, serialized[inchi][i].to_s
164
164
  end
165
165
  end
166
166
 
data/test/feature.rb CHANGED
@@ -57,20 +57,20 @@ class FeatureTest < MiniTest::Test
57
57
  def test_physchem_description
58
58
  assert_equal 346, PhysChem.descriptors.size
59
59
  assert_equal 15, PhysChem.openbabel_descriptors.size
60
- assert_equal 295, PhysChem.cdk_descriptors.size
60
+ assert_equal 286, PhysChem.cdk_descriptors.size
61
61
  assert_equal 45, PhysChem.joelib_descriptors.size
62
- assert_equal 310, PhysChem.unique_descriptors.size
62
+ assert_equal 309, PhysChem.unique_descriptors.size
63
63
  end
64
64
 
65
65
  def test_physchem
66
66
  assert_equal 346, PhysChem.descriptors.size
67
67
  c = Compound.from_smiles "CC(=O)CC(C)C"
68
68
  logP = PhysChem.find_or_create_by :name => "Openbabel.logP"
69
- assert_equal 1.6215, logP.calculate(c)
69
+ assert_equal 1.6215, c.calculate_properties([logP]).first
70
70
  jlogP = PhysChem.find_or_create_by :name => "Joelib.LogP"
71
- assert_equal 3.5951, jlogP.calculate(c)
71
+ assert_equal 3.5951, c.calculate_properties([jlogP]).first
72
72
  alogP = PhysChem.find_or_create_by :name => "Cdk.ALOGP.ALogP"
73
- assert_equal 0.35380000000000034, alogP.calculate(c)
73
+ assert_equal 0.35380000000000034, c.calculate_properties([alogP]).first
74
74
  end
75
75
 
76
76
  end
@@ -46,12 +46,14 @@ class LazarClassificationTest < MiniTest::Test
46
46
  assert_equal compound_dataset.compounds, prediction_dataset.compounds
47
47
 
48
48
  cid = prediction_dataset.compounds[7].id.to_s
49
- assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning]
49
+ assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0]
50
+ expectations = ["Cannot create prediction: Only one similar compound in the training set.",
51
+ "Could not find similar substances with experimental data in the training dataset."]
50
52
  prediction_dataset.predictions.each do |cid,pred|
51
- assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil?
53
+ assert_includes expectations, pred[:warnings][0] if pred[:value].nil?
52
54
  end
53
55
  cid = Compound.from_smiles("CCOC(=O)N").id.to_s
54
- assert_match "excluded", prediction_dataset.predictions[cid][:warning]
56
+ assert_match "excluded", prediction_dataset.predictions[cid][:info]
55
57
  # cleanup
56
58
  [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
57
59
  end
@@ -10,21 +10,21 @@ class LazarRegressionTest < MiniTest::Test
10
10
  },
11
11
  :similarity => {
12
12
  :method => "Algorithm::Similarity.tanimoto",
13
- :min => 0.1
13
+ :min => 0.5
14
14
  },
15
15
  :prediction => {
16
- :method => "Algorithm::Caret.pls",
16
+ :method => "Algorithm::Caret.rf",
17
17
  },
18
18
  :feature_selection => nil,
19
19
  }
20
- training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
20
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv")
21
21
  model = Model::Lazar.create training_dataset: training_dataset
22
22
  assert_kind_of Model::LazarRegression, model
23
23
  assert_equal algorithms, model.algorithms
24
- substance = training_dataset.substances[10]
24
+ substance = training_dataset.substances[145]
25
25
  prediction = model.predict substance
26
26
  assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
27
- substance = Compound.from_smiles "NC(=O)OCCC"
27
+ substance = Compound.from_smiles "c1ccc(cc1)Oc1ccccc1"
28
28
  prediction = model.predict substance
29
29
  refute_nil prediction[:value]
30
30
  refute_nil prediction[:prediction_interval]
@@ -59,8 +59,8 @@ class LazarRegressionTest < MiniTest::Test
59
59
  model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
60
60
  compound = Compound.from_smiles "CCCSCCSCC"
61
61
  prediction = model.predict compound
62
- assert_equal 4, prediction[:neighbors].size
63
- assert_equal 1.37, prediction[:value].round(2)
62
+ assert_equal 3, prediction[:neighbors].size
63
+ assert prediction[:value].round(2) > 1.37, "Prediction value (#{prediction[:value].round(2)}) should be larger than 1.37."
64
64
  end
65
65
 
66
66
  def test_local_physchem_regression
@@ -112,12 +112,12 @@ class LazarRegressionTest < MiniTest::Test
112
112
  :method => "Algorithm::Similarity.cosine",
113
113
  }
114
114
  }
115
- training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
115
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
116
116
  model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
117
117
  assert_kind_of Model::LazarRegression, model
118
- assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
118
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
119
119
  assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method]
120
- assert_equal 0.1, model.algorithms[:similarity][:min]
120
+ assert_equal 0.5, model.algorithms[:similarity][:min]
121
121
  algorithms[:descriptors].delete :features
122
122
  assert_equal algorithms[:descriptors], model.algorithms[:descriptors]
123
123
  prediction = model.predict training_dataset.substances[10]
@@ -130,14 +130,14 @@ class LazarRegressionTest < MiniTest::Test
130
130
  :method => "Algorithm::FeatureSelection.correlation_filter",
131
131
  },
132
132
  }
133
- training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
133
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv")
134
134
  model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
135
135
  assert_kind_of Model::LazarRegression, model
136
- assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
136
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
137
137
  assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
138
- assert_equal 0.1, model.algorithms[:similarity][:min]
138
+ assert_equal 0.5, model.algorithms[:similarity][:min]
139
139
  assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
140
- prediction = model.predict training_dataset.substances[10]
140
+ prediction = model.predict training_dataset.substances[145]
141
141
  refute_nil prediction[:value]
142
142
  end
143
143
 
@@ -12,7 +12,7 @@ class ValidationModelTest < MiniTest::Test
12
12
  m.crossvalidations.each do |cv|
13
13
  assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. This may happen due to an unfavorable training/test set split."
14
14
  end
15
- prediction = m.predict Compound.from_smiles("CCCC(NN)C")
15
+ prediction = m.predict Compound.from_smiles("OCC(CN(CC(O)C)N=O)O")
16
16
  assert_equal "true", prediction[:value]
17
17
  m.delete
18
18
  end
data/test/setup.rb CHANGED
@@ -3,6 +3,8 @@ require 'minitest/autorun'
3
3
  require_relative '../lib/lazar.rb'
4
4
  #require 'lazar'
5
5
  include OpenTox
6
+ #$mongo.database.drop
7
+ #$gridfs = $mongo.database.fs # recreate GridFS indexes
6
8
  TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
7
9
  DATA_DIR ||= File.join(TEST_DIR,"data")
8
10
  training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
@@ -47,7 +47,7 @@ class ValidationClassificationTest < MiniTest::Test
47
47
  dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
48
48
  model = Model::Lazar.create training_dataset: dataset
49
49
  loo = ClassificationLeaveOneOut.create model
50
- assert_equal 14, loo.nr_unpredicted
50
+ assert_equal 24, loo.nr_unpredicted
51
51
  refute_empty loo.confusion_matrix
52
52
  assert loo.accuracy > 0.77
53
53
  assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})."
@@ -84,7 +84,7 @@ class ValidationRegressionTest < MiniTest::Test
84
84
  repeated_cv = RepeatedCrossValidation.create model
85
85
  repeated_cv.crossvalidations.each do |cv|
86
86
  assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
87
- assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
87
+ assert cv.rmse < 0.5, "RMSE (#{cv.rmse}) should be smaller than 0.5"
88
88
  end
89
89
  end
90
90
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lazar
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler,
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-01-18 00:00:00.000000000 Z
12
+ date: 2017-05-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler