lazar 0.9.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88):
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
data/lib/model.rb CHANGED
@@ -2,7 +2,8 @@ module OpenTox
2
2
 
3
3
  module Model
4
4
 
5
- class Model
5
+ class Lazar
6
+
6
7
  include OpenTox
7
8
  include Mongoid::Document
8
9
  include Mongoid::Timestamps
@@ -10,64 +11,247 @@ module OpenTox
10
11
 
11
12
  field :name, type: String
12
13
  field :creator, type: String, default: __FILE__
13
- # datasets
14
+ field :algorithms, type: Hash, default:{}
14
15
  field :training_dataset_id, type: BSON::ObjectId
15
- # algorithms
16
- field :prediction_algorithm, type: String
17
- # prediction feature
16
+ field :substance_ids, type: Array, default:[]
18
17
  field :prediction_feature_id, type: BSON::ObjectId
18
+ field :dependent_variables, type: Array, default:[]
19
+ field :descriptor_ids, type:Array, default:[]
20
+ field :independent_variables, type: Array, default:[]
21
+ field :fingerprints, type: Array, default:[]
22
+ field :descriptor_weights, type: Array, default:[]
23
+ field :descriptor_means, type: Array, default:[]
24
+ field :descriptor_sds, type: Array, default:[]
25
+ field :scaled_variables, type: Array, default:[]
26
+ field :version, type: Hash, default:{}
27
+
28
+ def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
29
+ bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
30
+ prediction_feature = training_dataset.features.first unless prediction_feature
31
+ # TODO: prediction_feature without training_dataset: use all available data
32
+
33
+ # guess model type
34
+ prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new
35
+
36
+ model.prediction_feature_id = prediction_feature.id
37
+ model.training_dataset_id = training_dataset.id
38
+ model.name = "#{prediction_feature.name} (#{training_dataset.name})"
39
+ # TODO: check if this works for gem version, add gem versioning?
40
+ dir = File.dirname(__FILE__)
41
+ commit = `cd #{dir}; git rev-parse HEAD`.chomp
42
+ branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp
43
+ url = `cd #{dir}; git config --get remote.origin.url`.chomp
44
+ if branch
45
+ model.version = {:url => url, :branch => branch, :commit => commit}
46
+ else
47
+ model.version = {:warning => "git is not installed"}
48
+ end
19
49
 
20
- def training_dataset
21
- Dataset.find(training_dataset_id)
50
+ # set defaults
51
+ substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
52
+ bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
53
+
54
+ if substance_classes.first == "OpenTox::Compound"
55
+
56
+ model.algorithms = {
57
+ :descriptors => {
58
+ :method => "fingerprint",
59
+ :type => "MP2D",
60
+ },
61
+ :similarity => {
62
+ :method => "Algorithm::Similarity.tanimoto",
63
+ :min => 0.1
64
+ },
65
+ :feature_selection => nil
66
+ }
67
+
68
+ if model.class == LazarClassification
69
+ model.algorithms[:prediction] = {
70
+ :method => "Algorithm::Classification.weighted_majority_vote",
71
+ }
72
+ elsif model.class == LazarRegression
73
+ model.algorithms[:prediction] = {
74
+ :method => "Algorithm::Caret.pls",
75
+ }
76
+ end
77
+
78
+ elsif substance_classes.first == "OpenTox::Nanoparticle"
79
+ model.algorithms = {
80
+ :descriptors => {
81
+ :method => "properties",
82
+ :categories => ["P-CHEM"],
83
+ },
84
+ :similarity => {
85
+ :method => "Algorithm::Similarity.weighted_cosine",
86
+ :min => 0.5
87
+ },
88
+ :prediction => {
89
+ :method => "Algorithm::Caret.rf",
90
+ },
91
+ :feature_selection => {
92
+ :method => "Algorithm::FeatureSelection.correlation_filter",
93
+ },
94
+ }
95
+ else
96
+ bad_request_error "Cannot create models for #{substance_classes.first}."
97
+ end
98
+
99
+ # overwrite defaults with explicit parameters
100
+ algorithms.each do |type,parameters|
101
+ if parameters and parameters.is_a? Hash
102
+ parameters.each do |p,v|
103
+ model.algorithms[type] ||= {}
104
+ model.algorithms[type][p] = v
105
+ model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
106
+ end
107
+ else
108
+ model.algorithms[type] = parameters
109
+ end
110
+ end if algorithms
111
+
112
+ # parse dependent_variables from training dataset
113
+ training_dataset.substances.each do |substance|
114
+ values = training_dataset.values(substance,model.prediction_feature_id)
115
+ values.each do |v|
116
+ model.substance_ids << substance.id.to_s
117
+ model.dependent_variables << v
118
+ end if values
119
+ end
120
+
121
+ descriptor_method = model.algorithms[:descriptors][:method]
122
+ case descriptor_method
123
+ # parse fingerprints
124
+ when "fingerprint"
125
+ type = model.algorithms[:descriptors][:type]
126
+ model.substances.each_with_index do |s,i|
127
+ model.fingerprints[i] ||= []
128
+ model.fingerprints[i] += s.fingerprint(type)
129
+ model.fingerprints[i].uniq!
130
+ end
131
+ model.descriptor_ids = model.fingerprints.flatten.uniq
132
+ model.descriptor_ids.each do |d|
133
+ # resulting model may break BSON size limit (e.g. f Kazius dataset)
134
+ model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
135
+ end
136
+ # calculate physchem properties
137
+ when "calculate_properties"
138
+ features = model.algorithms[:descriptors][:features]
139
+ model.descriptor_ids = features.collect{|f| f.id.to_s}
140
+ model.algorithms[:descriptors].delete(:features)
141
+ model.algorithms[:descriptors].delete(:type)
142
+ model.substances.each_with_index do |s,i|
143
+ props = s.calculate_properties(features)
144
+ props.each_with_index do |v,j|
145
+ model.independent_variables[j] ||= []
146
+ model.independent_variables[j][i] = v
147
+ end if props and !props.empty?
148
+ end
149
+ # parse independent_variables
150
+ when "properties"
151
+ categories = model.algorithms[:descriptors][:categories]
152
+ feature_ids = []
153
+ categories.each do |category|
154
+ Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
155
+ end
156
+ properties = model.substances.collect { |s| s.properties }
157
+ property_ids = properties.collect{|p| p.keys}.flatten.uniq
158
+ model.descriptor_ids = feature_ids & property_ids
159
+ model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
160
+ else
161
+ bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
162
+ end
163
+
164
+ if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
165
+ model = Algorithm.run model.algorithms[:feature_selection][:method], model
166
+ end
167
+
168
+ # scale independent_variables
169
+ unless model.fingerprints?
170
+ model.independent_variables.each_with_index do |var,i|
171
+ model.descriptor_means[i] = var.mean
172
+ model.descriptor_sds[i] = var.standard_deviation
173
+ model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
174
+ end
175
+ end
176
+ model.save
177
+ model
22
178
  end
23
- end
24
179
 
25
- class Lazar < Model
26
-
27
- # algorithms
28
- field :neighbor_algorithm, type: String
29
- field :neighbor_algorithm_parameters, type: Hash, default: {}
30
-
31
- # Create a lazar model from a training_dataset and a feature_dataset
32
- # @param [OpenTox::Dataset] training_dataset
33
- # @return [OpenTox::Model::Lazar] Regression or classification model
34
- def initialize training_dataset, params={}
35
-
36
- super params
37
-
38
- # TODO document convention
39
- prediction_feature = training_dataset.features.first
40
- # set defaults for empty parameters
41
- self.prediction_feature_id ||= prediction_feature.id
42
- self.training_dataset_id ||= training_dataset.id
43
- self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
44
- self.neighbor_algorithm_parameters ||= {}
45
- self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
46
- save
47
- self
48
- end
49
-
50
- def predict_compound compound
51
- prediction_feature = Feature.find prediction_feature_id
52
- neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
53
- # remove neighbors without prediction_feature
54
- # check for database activities (neighbors may include query compound)
55
- database_activities = nil
180
+ def predict_substance substance
181
+
182
+ case algorithms[:similarity][:method]
183
+ when /tanimoto/ # binary features
184
+ similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
185
+ # TODO this excludes descriptors only present in the query substance
186
+ # use for applicability domain?
187
+ query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
188
+ when /euclid|cosine/ # quantitative features
189
+ if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
190
+ features = descriptor_ids.collect{|id| Feature.find(id)}
191
+ query_descriptors = substance.calculate_properties(features)
192
+ similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
193
+ else
194
+ similarity_descriptors = []
195
+ query_descriptors = []
196
+ descriptor_ids.each_with_index do |id,i|
197
+ prop = substance.properties[id]
198
+ prop = prop.median if prop.is_a? Array # measured
199
+ if prop
200
+ similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
201
+ query_descriptors[i] = prop
202
+ end
203
+ end
204
+ end
205
+ else
206
+ bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
207
+ end
208
+
56
209
  prediction = {}
57
- if neighbors.collect{|n| n["_id"]}.include? compound.id
210
+ neighbor_ids = []
211
+ neighbor_similarities = []
212
+ neighbor_dependent_variables = []
213
+ neighbor_independent_variables = []
58
214
 
59
- database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
60
- prediction[:database_activities] = database_activities
61
- prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
62
- neighbors.delete_if{|n| n["_id"] == compound.id}
215
+ prediction = {}
216
+ # find neighbors
217
+ substance_ids.each_with_index do |s,i|
218
+ # handle query substance
219
+ if substance.id.to_s == s
220
+ prediction[:measurements] ||= []
221
+ prediction[:measurements] << dependent_variables[i]
222
+ prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
223
+ else
224
+ if fingerprints?
225
+ neighbor_descriptors = fingerprints[i]
226
+ else
227
+ next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
228
+ neighbor_descriptors = scaled_variables.collect{|v| v[i]}
229
+ end
230
+ sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
231
+ if sim >= algorithms[:similarity][:min]
232
+ neighbor_ids << s
233
+ neighbor_similarities << sim
234
+ neighbor_dependent_variables << dependent_variables[i]
235
+ independent_variables.each_with_index do |c,j|
236
+ neighbor_independent_variables[j] ||= []
237
+ neighbor_independent_variables[j] << independent_variables[j][i]
238
+ end
239
+ end
240
+ end
63
241
  end
64
- neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
65
- if neighbors.empty?
66
- prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
242
+
243
+ measurements = nil
244
+
245
+ if neighbor_similarities.empty?
246
+ prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
247
+ elsif neighbor_similarities.size == 1
248
+ prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
67
249
  else
68
- prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
69
- prediction[:neighbors] = neighbors
70
- prediction[:neighbors] ||= []
250
+ query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
251
+ # call prediction algorithm
252
+ result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
253
+ prediction.merge! result
254
+ prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
71
255
  end
72
256
  prediction
73
257
  end
@@ -77,103 +261,81 @@ module OpenTox
77
261
  training_dataset = Dataset.find training_dataset_id
78
262
 
79
263
  # parse data
80
- compounds = []
81
- case object.class.to_s
82
- when "OpenTox::Compound"
83
- compounds = [object]
84
- when "Array"
85
- compounds = object
86
- when "OpenTox::Dataset"
87
- compounds = object.compounds
264
+ substances = []
265
+ if object.is_a? Substance
266
+ substances = [object]
267
+ elsif object.is_a? Array
268
+ substances = object
269
+ elsif object.is_a? Dataset
270
+ substances = object.substances
88
271
  else
89
- bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
272
+ bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
90
273
  end
91
274
 
92
275
  # make predictions
93
- predictions = []
94
- predictions = compounds.collect{|c| predict_compound c}
276
+ predictions = {}
277
+ substances.each do |c|
278
+ predictions[c.id.to_s] = predict_substance c
279
+ predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
280
+ end
95
281
 
96
282
  # serialize result
97
- case object.class.to_s
98
- when "OpenTox::Compound"
99
- prediction = predictions.first
283
+ if object.is_a? Substance
284
+ prediction = predictions[substances.first.id.to_s]
100
285
  prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
101
286
  return prediction
102
- when "Array"
287
+ elsif object.is_a? Array
103
288
  return predictions
104
- when "OpenTox::Dataset"
289
+ elsif object.is_a? Dataset
105
290
  # prepare prediction dataset
106
291
  measurement_feature = Feature.find prediction_feature_id
107
292
 
108
- prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
109
- prediction_dataset = LazarPrediction.new(
293
+ prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
294
+ prediction_dataset = LazarPrediction.create(
110
295
  :name => "Lazar prediction for #{prediction_feature.name}",
111
296
  :creator => __FILE__,
112
- :prediction_feature_id => prediction_feature.id
113
-
297
+ :prediction_feature_id => prediction_feature.id,
298
+ :predictions => predictions
114
299
  )
115
- confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
116
- warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
117
- prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
118
- prediction_dataset.compounds = compounds
119
- prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
120
- prediction_dataset.save
121
300
  return prediction_dataset
122
301
  end
123
302
 
124
303
  end
125
-
126
- def training_activities
127
- i = training_dataset.feature_ids.index prediction_feature_id
128
- training_dataset.data_entries.collect{|de| de[i]}
304
+
305
+ def training_dataset
306
+ Dataset.find(training_dataset_id)
307
+ end
308
+
309
+ def prediction_feature
310
+ Feature.find(prediction_feature_id)
311
+ end
312
+
313
+ def descriptors
314
+ descriptor_ids.collect{|id| Feature.find(id)}
315
+ end
316
+
317
+ def substances
318
+ substance_ids.collect{|id| Substance.find(id)}
319
+ end
320
+
321
+ def fingerprints?
322
+ algorithms[:descriptors][:method] == "fingerprint" ? true : false
129
323
  end
130
324
 
131
325
  end
132
326
 
133
327
  class LazarClassification < Lazar
134
-
135
- def self.create training_dataset, params={}
136
- model = self.new training_dataset, params
137
- model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
138
- model.neighbor_algorithm ||= "fingerprint_neighbors"
139
- model.neighbor_algorithm_parameters ||= {}
140
- {
141
- :type => "MP2D",
142
- :training_dataset_id => training_dataset.id,
143
- :min_sim => 0.1
144
- }.each do |key,value|
145
- model.neighbor_algorithm_parameters[key] ||= value
146
- end
147
- model.save
148
- model
149
- end
150
328
  end
151
329
 
152
330
  class LazarRegression < Lazar
153
-
154
- def self.create training_dataset, params={}
155
- model = self.new training_dataset, params
156
- model.neighbor_algorithm ||= "fingerprint_neighbors"
157
- model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
158
- model.neighbor_algorithm_parameters ||= {}
159
- {
160
- :type => "MP2D",
161
- :training_dataset_id => training_dataset.id,
162
- :min_sim => 0.1
163
- }.each do |key,value|
164
- model.neighbor_algorithm_parameters[key] ||= value
165
- end
166
- model.save
167
- model
168
- end
169
331
  end
170
332
 
171
- class Prediction
333
+ class Validation
334
+
172
335
  include OpenTox
173
336
  include Mongoid::Document
174
337
  include Mongoid::Timestamps
175
338
 
176
- # TODO field Validations
177
339
  field :endpoint, type: String
178
340
  field :species, type: String
179
341
  field :source, type: String
@@ -182,7 +344,7 @@ module OpenTox
182
344
  field :repeated_crossvalidation_id, type: BSON::ObjectId
183
345
 
184
346
  def predict object
185
- Lazar.find(model_id).predict object
347
+ model.predict object
186
348
  end
187
349
 
188
350
  def training_dataset
@@ -193,8 +355,16 @@ module OpenTox
193
355
  Lazar.find model_id
194
356
  end
195
357
 
358
+ def algorithms
359
+ model.algorithms
360
+ end
361
+
362
+ def prediction_feature
363
+ model.prediction_feature
364
+ end
365
+
196
366
  def repeated_crossvalidation
197
- RepeatedCrossValidation.find repeated_crossvalidation_id
367
+ OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
198
368
  end
199
369
 
200
370
  def crossvalidations
@@ -202,29 +372,50 @@ module OpenTox
202
372
  end
203
373
 
204
374
  def regression?
205
- training_dataset.features.first.numeric?
375
+ model.is_a? LazarRegression
206
376
  end
207
377
 
208
378
  def classification?
209
- training_dataset.features.first.nominal?
379
+ model.is_a? LazarClassification
210
380
  end
211
381
 
212
382
  def self.from_csv_file file
213
383
  metadata_file = file.sub(/csv$/,"json")
214
384
  bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
215
- prediction_model = self.new JSON.parse(File.read(metadata_file))
385
+ model_validation = self.new JSON.parse(File.read(metadata_file))
216
386
  training_dataset = Dataset.from_csv_file file
217
- model = nil
218
- if training_dataset.features.first.nominal?
219
- model = LazarClassification.create training_dataset
220
- elsif training_dataset.features.first.numeric?
221
- model = LazarRegression.create training_dataset
387
+ model = Lazar.create training_dataset: training_dataset
388
+ model_validation[:model_id] = model.id
389
+ model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required
390
+ model_validation.save
391
+ model_validation
392
+ end
393
+
394
+ def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
395
+
396
+ # find/import training_dataset
397
+ training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
398
+ unless training_dataset # try to import
399
+ Import::Enanomapper.import
400
+ training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
401
+ bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
222
402
  end
223
- prediction_model[:model_id] = model.id
224
- prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
225
- prediction_model.save
226
- prediction_model
403
+ prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
404
+
405
+ model_validation = self.new(
406
+ :endpoint => prediction_feature.name,
407
+ :source => prediction_feature.source,
408
+ :species => "A549 human lung epithelial carcinoma cells",
409
+ :unit => prediction_feature.unit
410
+ )
411
+ model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
412
+ model_validation[:model_id] = model.id
413
+ repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
414
+ model_validation[:repeated_crossvalidation_id] = repeated_cv.id
415
+ model_validation.save
416
+ model_validation
227
417
  end
418
+
228
419
  end
229
420
 
230
421
  end
@@ -0,0 +1,98 @@
1
+ module OpenTox
2
+
3
+ class Nanoparticle < Substance
4
+ include OpenTox
5
+
6
+ field :core_id, type: String, default: nil
7
+ field :coating_ids, type: Array, default: []
8
+
9
+ def core
10
+ Compound.find core_id
11
+ end
12
+
13
+ def coating
14
+ coating_ids.collect{|i| Compound.find i }
15
+ end
16
+
17
+ def fingerprint type=DEFAULT_FINGERPRINT
18
+ core_fp = core.fingerprint type
19
+ coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact
20
+ (core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact
21
+ end
22
+
23
+ def calculate_properties descriptors=PhysChem::OPENBABEL
24
+ if core.smiles and !coating.collect{|c| c.smiles}.compact.empty?
25
+ core_prop = core.calculate_properties descriptors
26
+ coating_prop = coating.collect{|c| c.calculate_properties descriptors if c.smiles}
27
+ descriptors.collect_with_index{|d,i| [core_prop[i],coating_prop.collect{|c| c[i] if c}]}
28
+ end
29
+ end
30
+
31
+ def add_feature feature, value, dataset
32
+ unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand
33
+ case feature.category
34
+ when "P-CHEM"
35
+ properties[feature.id.to_s] ||= []
36
+ properties[feature.id.to_s] << value
37
+ properties[feature.id.to_s].uniq!
38
+ when "Proteomics"
39
+ properties[feature.id.to_s] ||= []
40
+ properties[feature.id.to_s] << value
41
+ properties[feature.id.to_s].uniq!
42
+ when "TOX"
43
+ if feature.name.match("Cell Viability Assay") and !feature.name.match("SLOPE") # -log10 transformation
44
+ value = -Math.log10(value)
45
+ feature.unit = "-log10(#{feature.unit})" unless feature.unit.match "log10"
46
+ feature.warnings += ["-log10 transformed values"] unless feature.warnings.include? "-log10 transformed values"
47
+ feature.save
48
+ end
49
+ dataset.add self, feature, value
50
+ else
51
+ warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
52
+ end
53
+ dataset_ids << dataset.id
54
+ dataset_ids.uniq!
55
+ end
56
+ end
57
+
58
+ def parse_ambit_value feature, v, dataset
59
+ # TODO add study id to warnings
60
+ v.delete "unit"
61
+ # TODO: ppm instead of weights
62
+ if v.keys == ["textValue"]
63
+ add_feature feature, v["textValue"], dataset
64
+ elsif v.keys == ["loValue"]
65
+ add_feature feature, v["loValue"], dataset
66
+ elsif v.keys.size == 2 and v["errorValue"]
67
+ add_feature feature, v["loValue"], dataset
68
+ #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
69
+ elsif v.keys.size == 2 and v["loQualifier"] == "mean"
70
+ add_feature feature, v["loValue"], dataset
71
+ #warn "'#{feature.name}' is a mean value. Original data is not available."
72
+ elsif v.keys.size == 2 and v["loQualifier"] #== ">="
73
+ #warn "Only min value available for '#{feature.name}', entry ignored"
74
+ elsif v.keys.size == 2 and v["upQualifier"] #== ">="
75
+ #warn "Only max value available for '#{feature.name}', entry ignored"
76
+ elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
77
+ add_feature feature, v["loValue"], dataset
78
+ #warn "loQualifier and upQualifier are empty."
79
+ elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == ""
80
+ add_feature feature, v["loValue"], dataset
81
+ #warn "loQualifier and upQualifier are empty."
82
+ elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
83
+ add_feature feature, v["loValue"], dataset
84
+ #warn "loQualifier and upQualifier are empty."
85
+ elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"]
86
+ #add_feature feature, [v["loValue"],v["upValue"]].mean, dataset
87
+ #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
88
+ elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"]
89
+ #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
90
+ add_feature feature, v["loValue"], dataset
91
+ elsif v == {} # do nothing
92
+ else
93
+ warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
94
+ end
95
+ end
96
+
97
+ end
98
+ end
data/lib/opentox.rb CHANGED
@@ -1,8 +1,6 @@
1
1
  module OpenTox
2
2
 
3
- # Ruby interface
4
-
5
- # create default OpenTox classes (defined in opentox-client.rb)
3
+ # create default OpenTox classes
6
4
  # provides Mongoid's query and persistence methods
7
5
  # http://mongoid.org/en/mongoid/docs/persistence.html
8
6
  # http://mongoid.org/en/mongoid/docs/querying.html
@@ -13,10 +11,15 @@ module OpenTox
13
11
  include Mongoid::Timestamps
14
12
  store_in collection: klass.downcase.pluralize
15
13
  field :name, type: String
14
+ field :source, type: String
16
15
  field :warnings, type: Array, default: []
16
+
17
+ def warn warning
18
+ $logger.warn warning
19
+ warnings << warning
20
+ end
17
21
  end
18
22
  OpenTox.const_set klass,c
19
23
  end
20
24
 
21
25
  end
22
-