lazar 0.9.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
data/lib/model.rb CHANGED
@@ -2,7 +2,8 @@ module OpenTox
2
2
 
3
3
  module Model
4
4
 
5
- class Model
5
+ class Lazar
6
+
6
7
  include OpenTox
7
8
  include Mongoid::Document
8
9
  include Mongoid::Timestamps
@@ -10,64 +11,247 @@ module OpenTox
10
11
 
11
12
  field :name, type: String
12
13
  field :creator, type: String, default: __FILE__
13
- # datasets
14
+ field :algorithms, type: Hash, default:{}
14
15
  field :training_dataset_id, type: BSON::ObjectId
15
- # algorithms
16
- field :prediction_algorithm, type: String
17
- # prediction feature
16
+ field :substance_ids, type: Array, default:[]
18
17
  field :prediction_feature_id, type: BSON::ObjectId
18
+ field :dependent_variables, type: Array, default:[]
19
+ field :descriptor_ids, type:Array, default:[]
20
+ field :independent_variables, type: Array, default:[]
21
+ field :fingerprints, type: Array, default:[]
22
+ field :descriptor_weights, type: Array, default:[]
23
+ field :descriptor_means, type: Array, default:[]
24
+ field :descriptor_sds, type: Array, default:[]
25
+ field :scaled_variables, type: Array, default:[]
26
+ field :version, type: Hash, default:{}
27
+
28
+ def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
29
+ bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
30
+ prediction_feature = training_dataset.features.first unless prediction_feature
31
+ # TODO: prediction_feature without training_dataset: use all available data
32
+
33
+ # guess model type
34
+ prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new
35
+
36
+ model.prediction_feature_id = prediction_feature.id
37
+ model.training_dataset_id = training_dataset.id
38
+ model.name = "#{prediction_feature.name} (#{training_dataset.name})"
39
+ # TODO: check if this works for gem version, add gem versioning?
40
+ dir = File.dirname(__FILE__)
41
+ commit = `cd #{dir}; git rev-parse HEAD`.chomp
42
+ branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp
43
+ url = `cd #{dir}; git config --get remote.origin.url`.chomp
44
+ if branch
45
+ model.version = {:url => url, :branch => branch, :commit => commit}
46
+ else
47
+ model.version = {:warning => "git is not installed"}
48
+ end
19
49
 
20
- def training_dataset
21
- Dataset.find(training_dataset_id)
50
+ # set defaults
51
+ substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
52
+ bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
53
+
54
+ if substance_classes.first == "OpenTox::Compound"
55
+
56
+ model.algorithms = {
57
+ :descriptors => {
58
+ :method => "fingerprint",
59
+ :type => "MP2D",
60
+ },
61
+ :similarity => {
62
+ :method => "Algorithm::Similarity.tanimoto",
63
+ :min => 0.1
64
+ },
65
+ :feature_selection => nil
66
+ }
67
+
68
+ if model.class == LazarClassification
69
+ model.algorithms[:prediction] = {
70
+ :method => "Algorithm::Classification.weighted_majority_vote",
71
+ }
72
+ elsif model.class == LazarRegression
73
+ model.algorithms[:prediction] = {
74
+ :method => "Algorithm::Caret.pls",
75
+ }
76
+ end
77
+
78
+ elsif substance_classes.first == "OpenTox::Nanoparticle"
79
+ model.algorithms = {
80
+ :descriptors => {
81
+ :method => "properties",
82
+ :categories => ["P-CHEM"],
83
+ },
84
+ :similarity => {
85
+ :method => "Algorithm::Similarity.weighted_cosine",
86
+ :min => 0.5
87
+ },
88
+ :prediction => {
89
+ :method => "Algorithm::Caret.rf",
90
+ },
91
+ :feature_selection => {
92
+ :method => "Algorithm::FeatureSelection.correlation_filter",
93
+ },
94
+ }
95
+ else
96
+ bad_request_error "Cannot create models for #{substance_classes.first}."
97
+ end
98
+
99
+ # overwrite defaults with explicit parameters
100
+ algorithms.each do |type,parameters|
101
+ if parameters and parameters.is_a? Hash
102
+ parameters.each do |p,v|
103
+ model.algorithms[type] ||= {}
104
+ model.algorithms[type][p] = v
105
+ model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
106
+ end
107
+ else
108
+ model.algorithms[type] = parameters
109
+ end
110
+ end if algorithms
111
+
112
+ # parse dependent_variables from training dataset
113
+ training_dataset.substances.each do |substance|
114
+ values = training_dataset.values(substance,model.prediction_feature_id)
115
+ values.each do |v|
116
+ model.substance_ids << substance.id.to_s
117
+ model.dependent_variables << v
118
+ end if values
119
+ end
120
+
121
+ descriptor_method = model.algorithms[:descriptors][:method]
122
+ case descriptor_method
123
+ # parse fingerprints
124
+ when "fingerprint"
125
+ type = model.algorithms[:descriptors][:type]
126
+ model.substances.each_with_index do |s,i|
127
+ model.fingerprints[i] ||= []
128
+ model.fingerprints[i] += s.fingerprint(type)
129
+ model.fingerprints[i].uniq!
130
+ end
131
+ model.descriptor_ids = model.fingerprints.flatten.uniq
132
+ model.descriptor_ids.each do |d|
133
+ # resulting model may break BSON size limit (e.g. for the Kazius dataset)
134
+ model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
135
+ end
136
+ # calculate physchem properties
137
+ when "calculate_properties"
138
+ features = model.algorithms[:descriptors][:features]
139
+ model.descriptor_ids = features.collect{|f| f.id.to_s}
140
+ model.algorithms[:descriptors].delete(:features)
141
+ model.algorithms[:descriptors].delete(:type)
142
+ model.substances.each_with_index do |s,i|
143
+ props = s.calculate_properties(features)
144
+ props.each_with_index do |v,j|
145
+ model.independent_variables[j] ||= []
146
+ model.independent_variables[j][i] = v
147
+ end if props and !props.empty?
148
+ end
149
+ # parse independent_variables
150
+ when "properties"
151
+ categories = model.algorithms[:descriptors][:categories]
152
+ feature_ids = []
153
+ categories.each do |category|
154
+ Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
155
+ end
156
+ properties = model.substances.collect { |s| s.properties }
157
+ property_ids = properties.collect{|p| p.keys}.flatten.uniq
158
+ model.descriptor_ids = feature_ids & property_ids
159
+ model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
160
+ else
161
+ bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
162
+ end
163
+
164
+ if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
165
+ model = Algorithm.run model.algorithms[:feature_selection][:method], model
166
+ end
167
+
168
+ # scale independent_variables
169
+ unless model.fingerprints?
170
+ model.independent_variables.each_with_index do |var,i|
171
+ model.descriptor_means[i] = var.mean
172
+ model.descriptor_sds[i] = var.standard_deviation
173
+ model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
174
+ end
175
+ end
176
+ model.save
177
+ model
22
178
  end
23
- end
24
179
 
25
- class Lazar < Model
26
-
27
- # algorithms
28
- field :neighbor_algorithm, type: String
29
- field :neighbor_algorithm_parameters, type: Hash, default: {}
30
-
31
- # Create a lazar model from a training_dataset and a feature_dataset
32
- # @param [OpenTox::Dataset] training_dataset
33
- # @return [OpenTox::Model::Lazar] Regression or classification model
34
- def initialize training_dataset, params={}
35
-
36
- super params
37
-
38
- # TODO document convention
39
- prediction_feature = training_dataset.features.first
40
- # set defaults for empty parameters
41
- self.prediction_feature_id ||= prediction_feature.id
42
- self.training_dataset_id ||= training_dataset.id
43
- self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
44
- self.neighbor_algorithm_parameters ||= {}
45
- self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
46
- save
47
- self
48
- end
49
-
50
- def predict_compound compound
51
- prediction_feature = Feature.find prediction_feature_id
52
- neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
53
- # remove neighbors without prediction_feature
54
- # check for database activities (neighbors may include query compound)
55
- database_activities = nil
180
+ def predict_substance substance
181
+
182
+ case algorithms[:similarity][:method]
183
+ when /tanimoto/ # binary features
184
+ similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
185
+ # TODO this excludes descriptors only present in the query substance
186
+ # use for applicability domain?
187
+ query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
188
+ when /euclid|cosine/ # quantitative features
189
+ if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
190
+ features = descriptor_ids.collect{|id| Feature.find(id)}
191
+ query_descriptors = substance.calculate_properties(features)
192
+ similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
193
+ else
194
+ similarity_descriptors = []
195
+ query_descriptors = []
196
+ descriptor_ids.each_with_index do |id,i|
197
+ prop = substance.properties[id]
198
+ prop = prop.median if prop.is_a? Array # measured
199
+ if prop
200
+ similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
201
+ query_descriptors[i] = prop
202
+ end
203
+ end
204
+ end
205
+ else
206
+ bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
207
+ end
208
+
56
209
  prediction = {}
57
- if neighbors.collect{|n| n["_id"]}.include? compound.id
210
+ neighbor_ids = []
211
+ neighbor_similarities = []
212
+ neighbor_dependent_variables = []
213
+ neighbor_independent_variables = []
58
214
 
59
- database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
60
- prediction[:database_activities] = database_activities
61
- prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
62
- neighbors.delete_if{|n| n["_id"] == compound.id}
215
+ prediction = {}
216
+ # find neighbors
217
+ substance_ids.each_with_index do |s,i|
218
+ # handle query substance
219
+ if substance.id.to_s == s
220
+ prediction[:measurements] ||= []
221
+ prediction[:measurements] << dependent_variables[i]
222
+ prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
223
+ else
224
+ if fingerprints?
225
+ neighbor_descriptors = fingerprints[i]
226
+ else
227
+ next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
228
+ neighbor_descriptors = scaled_variables.collect{|v| v[i]}
229
+ end
230
+ sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
231
+ if sim >= algorithms[:similarity][:min]
232
+ neighbor_ids << s
233
+ neighbor_similarities << sim
234
+ neighbor_dependent_variables << dependent_variables[i]
235
+ independent_variables.each_with_index do |c,j|
236
+ neighbor_independent_variables[j] ||= []
237
+ neighbor_independent_variables[j] << independent_variables[j][i]
238
+ end
239
+ end
240
+ end
63
241
  end
64
- neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
65
- if neighbors.empty?
66
- prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
242
+
243
+ measurements = nil
244
+
245
+ if neighbor_similarities.empty?
246
+ prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
247
+ elsif neighbor_similarities.size == 1
248
+ prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
67
249
  else
68
- prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
69
- prediction[:neighbors] = neighbors
70
- prediction[:neighbors] ||= []
250
+ query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
251
+ # call prediction algorithm
252
+ result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
253
+ prediction.merge! result
254
+ prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
71
255
  end
72
256
  prediction
73
257
  end
@@ -77,103 +261,81 @@ module OpenTox
77
261
  training_dataset = Dataset.find training_dataset_id
78
262
 
79
263
  # parse data
80
- compounds = []
81
- case object.class.to_s
82
- when "OpenTox::Compound"
83
- compounds = [object]
84
- when "Array"
85
- compounds = object
86
- when "OpenTox::Dataset"
87
- compounds = object.compounds
264
+ substances = []
265
+ if object.is_a? Substance
266
+ substances = [object]
267
+ elsif object.is_a? Array
268
+ substances = object
269
+ elsif object.is_a? Dataset
270
+ substances = object.substances
88
271
  else
89
- bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
272
+ bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
90
273
  end
91
274
 
92
275
  # make predictions
93
- predictions = []
94
- predictions = compounds.collect{|c| predict_compound c}
276
+ predictions = {}
277
+ substances.each do |c|
278
+ predictions[c.id.to_s] = predict_substance c
279
+ predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
280
+ end
95
281
 
96
282
  # serialize result
97
- case object.class.to_s
98
- when "OpenTox::Compound"
99
- prediction = predictions.first
283
+ if object.is_a? Substance
284
+ prediction = predictions[substances.first.id.to_s]
100
285
  prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
101
286
  return prediction
102
- when "Array"
287
+ elsif object.is_a? Array
103
288
  return predictions
104
- when "OpenTox::Dataset"
289
+ elsif object.is_a? Dataset
105
290
  # prepare prediction dataset
106
291
  measurement_feature = Feature.find prediction_feature_id
107
292
 
108
- prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
109
- prediction_dataset = LazarPrediction.new(
293
+ prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
294
+ prediction_dataset = LazarPrediction.create(
110
295
  :name => "Lazar prediction for #{prediction_feature.name}",
111
296
  :creator => __FILE__,
112
- :prediction_feature_id => prediction_feature.id
113
-
297
+ :prediction_feature_id => prediction_feature.id,
298
+ :predictions => predictions
114
299
  )
115
- confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
116
- warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
117
- prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
118
- prediction_dataset.compounds = compounds
119
- prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
120
- prediction_dataset.save
121
300
  return prediction_dataset
122
301
  end
123
302
 
124
303
  end
125
-
126
- def training_activities
127
- i = training_dataset.feature_ids.index prediction_feature_id
128
- training_dataset.data_entries.collect{|de| de[i]}
304
+
305
+ def training_dataset
306
+ Dataset.find(training_dataset_id)
307
+ end
308
+
309
+ def prediction_feature
310
+ Feature.find(prediction_feature_id)
311
+ end
312
+
313
+ def descriptors
314
+ descriptor_ids.collect{|id| Feature.find(id)}
315
+ end
316
+
317
+ def substances
318
+ substance_ids.collect{|id| Substance.find(id)}
319
+ end
320
+
321
+ def fingerprints?
322
+ algorithms[:descriptors][:method] == "fingerprint" ? true : false
129
323
  end
130
324
 
131
325
  end
132
326
 
133
327
  class LazarClassification < Lazar
134
-
135
- def self.create training_dataset, params={}
136
- model = self.new training_dataset, params
137
- model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
138
- model.neighbor_algorithm ||= "fingerprint_neighbors"
139
- model.neighbor_algorithm_parameters ||= {}
140
- {
141
- :type => "MP2D",
142
- :training_dataset_id => training_dataset.id,
143
- :min_sim => 0.1
144
- }.each do |key,value|
145
- model.neighbor_algorithm_parameters[key] ||= value
146
- end
147
- model.save
148
- model
149
- end
150
328
  end
151
329
 
152
330
  class LazarRegression < Lazar
153
-
154
- def self.create training_dataset, params={}
155
- model = self.new training_dataset, params
156
- model.neighbor_algorithm ||= "fingerprint_neighbors"
157
- model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
158
- model.neighbor_algorithm_parameters ||= {}
159
- {
160
- :type => "MP2D",
161
- :training_dataset_id => training_dataset.id,
162
- :min_sim => 0.1
163
- }.each do |key,value|
164
- model.neighbor_algorithm_parameters[key] ||= value
165
- end
166
- model.save
167
- model
168
- end
169
331
  end
170
332
 
171
- class Prediction
333
+ class Validation
334
+
172
335
  include OpenTox
173
336
  include Mongoid::Document
174
337
  include Mongoid::Timestamps
175
338
 
176
- # TODO field Validations
177
339
  field :endpoint, type: String
178
340
  field :species, type: String
179
341
  field :source, type: String
@@ -182,7 +344,7 @@ module OpenTox
182
344
  field :repeated_crossvalidation_id, type: BSON::ObjectId
183
345
 
184
346
  def predict object
185
- Lazar.find(model_id).predict object
347
+ model.predict object
186
348
  end
187
349
 
188
350
  def training_dataset
@@ -193,8 +355,16 @@ module OpenTox
193
355
  Lazar.find model_id
194
356
  end
195
357
 
358
+ def algorithms
359
+ model.algorithms
360
+ end
361
+
362
+ def prediction_feature
363
+ model.prediction_feature
364
+ end
365
+
196
366
  def repeated_crossvalidation
197
- RepeatedCrossValidation.find repeated_crossvalidation_id
367
+ OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
198
368
  end
199
369
 
200
370
  def crossvalidations
@@ -202,29 +372,50 @@ module OpenTox
202
372
  end
203
373
 
204
374
  def regression?
205
- training_dataset.features.first.numeric?
375
+ model.is_a? LazarRegression
206
376
  end
207
377
 
208
378
  def classification?
209
- training_dataset.features.first.nominal?
379
+ model.is_a? LazarClassification
210
380
  end
211
381
 
212
382
  def self.from_csv_file file
213
383
  metadata_file = file.sub(/csv$/,"json")
214
384
  bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
215
- prediction_model = self.new JSON.parse(File.read(metadata_file))
385
+ model_validation = self.new JSON.parse(File.read(metadata_file))
216
386
  training_dataset = Dataset.from_csv_file file
217
- model = nil
218
- if training_dataset.features.first.nominal?
219
- model = LazarClassification.create training_dataset
220
- elsif training_dataset.features.first.numeric?
221
- model = LazarRegression.create training_dataset
387
+ model = Lazar.create training_dataset: training_dataset
388
+ model_validation[:model_id] = model.id
389
+ model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required
390
+ model_validation.save
391
+ model_validation
392
+ end
393
+
394
+ def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
395
+
396
+ # find/import training_dataset
397
+ training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
398
+ unless training_dataset # try to import
399
+ Import::Enanomapper.import
400
+ training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
401
+ bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
222
402
  end
223
- prediction_model[:model_id] = model.id
224
- prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
225
- prediction_model.save
226
- prediction_model
403
+ prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
404
+
405
+ model_validation = self.new(
406
+ :endpoint => prediction_feature.name,
407
+ :source => prediction_feature.source,
408
+ :species => "A549 human lung epithelial carcinoma cells",
409
+ :unit => prediction_feature.unit
410
+ )
411
+ model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
412
+ model_validation[:model_id] = model.id
413
+ repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
414
+ model_validation[:repeated_crossvalidation_id] = repeated_cv.id
415
+ model_validation.save
416
+ model_validation
227
417
  end
418
+
228
419
  end
229
420
 
230
421
  end
@@ -0,0 +1,98 @@
1
+ module OpenTox
2
+
3
+ class Nanoparticle < Substance
4
+ include OpenTox
5
+
6
+ field :core_id, type: String, default: nil
7
+ field :coating_ids, type: Array, default: []
8
+
9
+ def core
10
+ Compound.find core_id
11
+ end
12
+
13
+ def coating
14
+ coating_ids.collect{|i| Compound.find i }
15
+ end
16
+
17
+ def fingerprint type=DEFAULT_FINGERPRINT
18
+ core_fp = core.fingerprint type
19
+ coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact
20
+ (core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact
21
+ end
22
+
23
+ def calculate_properties descriptors=PhysChem::OPENBABEL
24
+ if core.smiles and !coating.collect{|c| c.smiles}.compact.empty?
25
+ core_prop = core.calculate_properties descriptors
26
+ coating_prop = coating.collect{|c| c.calculate_properties descriptors if c.smiles}
27
+ descriptors.collect_with_index{|d,i| [core_prop[i],coating_prop.collect{|c| c[i] if c}]}
28
+ end
29
+ end
30
+
31
+ def add_feature feature, value, dataset
32
+ unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundant
33
+ case feature.category
34
+ when "P-CHEM"
35
+ properties[feature.id.to_s] ||= []
36
+ properties[feature.id.to_s] << value
37
+ properties[feature.id.to_s].uniq!
38
+ when "Proteomics"
39
+ properties[feature.id.to_s] ||= []
40
+ properties[feature.id.to_s] << value
41
+ properties[feature.id.to_s].uniq!
42
+ when "TOX"
43
+ if feature.name.match("Cell Viability Assay") and !feature.name.match("SLOPE") # -log10 transformation
44
+ value = -Math.log10(value)
45
+ feature.unit = "-log10(#{feature.unit})" unless feature.unit.match "log10"
46
+ feature.warnings += ["-log10 transformed values"] unless feature.warnings.include? "-log10 transformed values"
47
+ feature.save
48
+ end
49
+ dataset.add self, feature, value
50
+ else
51
+ warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
52
+ end
53
+ dataset_ids << dataset.id
54
+ dataset_ids.uniq!
55
+ end
56
+ end
57
+
58
+ def parse_ambit_value feature, v, dataset
59
+ # TODO add study id to warnings
60
+ v.delete "unit"
61
+ # TODO: ppm instead of weights
62
+ if v.keys == ["textValue"]
63
+ add_feature feature, v["textValue"], dataset
64
+ elsif v.keys == ["loValue"]
65
+ add_feature feature, v["loValue"], dataset
66
+ elsif v.keys.size == 2 and v["errorValue"]
67
+ add_feature feature, v["loValue"], dataset
68
+ #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
69
+ elsif v.keys.size == 2 and v["loQualifier"] == "mean"
70
+ add_feature feature, v["loValue"], dataset
71
+ #warn "'#{feature.name}' is a mean value. Original data is not available."
72
+ elsif v.keys.size == 2 and v["loQualifier"] #== ">="
73
+ #warn "Only min value available for '#{feature.name}', entry ignored"
74
+ elsif v.keys.size == 2 and v["upQualifier"] #== ">="
75
+ #warn "Only max value available for '#{feature.name}', entry ignored"
76
+ elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
77
+ add_feature feature, v["loValue"], dataset
78
+ #warn "loQualifier and upQualifier are empty."
79
+ elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == ""
80
+ add_feature feature, v["loValue"], dataset
81
+ #warn "loQualifier and upQualifier are empty."
82
+ elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
83
+ add_feature feature, v["loValue"], dataset
84
+ #warn "loQualifier and upQualifier are empty."
85
+ elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"]
86
+ #add_feature feature, [v["loValue"],v["upValue"]].mean, dataset
87
+ #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
88
+ elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"]
89
+ #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
90
+ add_feature feature, v["loValue"], dataset
91
+ elsif v == {} # do nothing
92
+ else
93
+ warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
94
+ end
95
+ end
96
+
97
+ end
98
+ end
data/lib/opentox.rb CHANGED
@@ -1,8 +1,6 @@
1
1
  module OpenTox
2
2
 
3
- # Ruby interface
4
-
5
- # create default OpenTox classes (defined in opentox-client.rb)
3
+ # create default OpenTox classes
6
4
  # provides Mongoid's query and persistence methods
7
5
  # http://mongoid.org/en/mongoid/docs/persistence.html
8
6
  # http://mongoid.org/en/mongoid/docs/querying.html
@@ -13,10 +11,15 @@ module OpenTox
13
11
  include Mongoid::Timestamps
14
12
  store_in collection: klass.downcase.pluralize
15
13
  field :name, type: String
14
+ field :source, type: String
16
15
  field :warnings, type: Array, default: []
16
+
17
+ def warn warning
18
+ $logger.warn warning
19
+ warnings << warning
20
+ end
17
21
  end
18
22
  OpenTox.const_set klass,c
19
23
  end
20
24
 
21
25
  end
22
-