lazar 0.9.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
data/lib/model.rb
CHANGED
@@ -2,7 +2,8 @@ module OpenTox
|
|
2
2
|
|
3
3
|
module Model
|
4
4
|
|
5
|
-
class
|
5
|
+
class Lazar
|
6
|
+
|
6
7
|
include OpenTox
|
7
8
|
include Mongoid::Document
|
8
9
|
include Mongoid::Timestamps
|
@@ -10,64 +11,247 @@ module OpenTox
|
|
10
11
|
|
11
12
|
field :name, type: String
|
12
13
|
field :creator, type: String, default: __FILE__
|
13
|
-
|
14
|
+
field :algorithms, type: Hash, default:{}
|
14
15
|
field :training_dataset_id, type: BSON::ObjectId
|
15
|
-
|
16
|
-
field :prediction_algorithm, type: String
|
17
|
-
# prediction feature
|
16
|
+
field :substance_ids, type: Array, default:[]
|
18
17
|
field :prediction_feature_id, type: BSON::ObjectId
|
18
|
+
field :dependent_variables, type: Array, default:[]
|
19
|
+
field :descriptor_ids, type:Array, default:[]
|
20
|
+
field :independent_variables, type: Array, default:[]
|
21
|
+
field :fingerprints, type: Array, default:[]
|
22
|
+
field :descriptor_weights, type: Array, default:[]
|
23
|
+
field :descriptor_means, type: Array, default:[]
|
24
|
+
field :descriptor_sds, type: Array, default:[]
|
25
|
+
field :scaled_variables, type: Array, default:[]
|
26
|
+
field :version, type: Hash, default:{}
|
27
|
+
|
28
|
+
def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
|
29
|
+
bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
|
30
|
+
prediction_feature = training_dataset.features.first unless prediction_feature
|
31
|
+
# TODO: prediction_feature without training_dataset: use all available data
|
32
|
+
|
33
|
+
# guess model type
|
34
|
+
prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new
|
35
|
+
|
36
|
+
model.prediction_feature_id = prediction_feature.id
|
37
|
+
model.training_dataset_id = training_dataset.id
|
38
|
+
model.name = "#{prediction_feature.name} (#{training_dataset.name})"
|
39
|
+
# TODO: check if this works for gem version, add gem versioning?
|
40
|
+
dir = File.dirname(__FILE__)
|
41
|
+
commit = `cd #{dir}; git rev-parse HEAD`.chomp
|
42
|
+
branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp
|
43
|
+
url = `cd #{dir}; git config --get remote.origin.url`.chomp
|
44
|
+
if branch
|
45
|
+
model.version = {:url => url, :branch => branch, :commit => commit}
|
46
|
+
else
|
47
|
+
model.version = {:warning => "git is not installed"}
|
48
|
+
end
|
19
49
|
|
20
|
-
|
21
|
-
|
50
|
+
# set defaults
|
51
|
+
substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
|
52
|
+
bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
|
53
|
+
|
54
|
+
if substance_classes.first == "OpenTox::Compound"
|
55
|
+
|
56
|
+
model.algorithms = {
|
57
|
+
:descriptors => {
|
58
|
+
:method => "fingerprint",
|
59
|
+
:type => "MP2D",
|
60
|
+
},
|
61
|
+
:similarity => {
|
62
|
+
:method => "Algorithm::Similarity.tanimoto",
|
63
|
+
:min => 0.1
|
64
|
+
},
|
65
|
+
:feature_selection => nil
|
66
|
+
}
|
67
|
+
|
68
|
+
if model.class == LazarClassification
|
69
|
+
model.algorithms[:prediction] = {
|
70
|
+
:method => "Algorithm::Classification.weighted_majority_vote",
|
71
|
+
}
|
72
|
+
elsif model.class == LazarRegression
|
73
|
+
model.algorithms[:prediction] = {
|
74
|
+
:method => "Algorithm::Caret.pls",
|
75
|
+
}
|
76
|
+
end
|
77
|
+
|
78
|
+
elsif substance_classes.first == "OpenTox::Nanoparticle"
|
79
|
+
model.algorithms = {
|
80
|
+
:descriptors => {
|
81
|
+
:method => "properties",
|
82
|
+
:categories => ["P-CHEM"],
|
83
|
+
},
|
84
|
+
:similarity => {
|
85
|
+
:method => "Algorithm::Similarity.weighted_cosine",
|
86
|
+
:min => 0.5
|
87
|
+
},
|
88
|
+
:prediction => {
|
89
|
+
:method => "Algorithm::Caret.rf",
|
90
|
+
},
|
91
|
+
:feature_selection => {
|
92
|
+
:method => "Algorithm::FeatureSelection.correlation_filter",
|
93
|
+
},
|
94
|
+
}
|
95
|
+
else
|
96
|
+
bad_request_error "Cannot create models for #{substance_classes.first}."
|
97
|
+
end
|
98
|
+
|
99
|
+
# overwrite defaults with explicit parameters
|
100
|
+
algorithms.each do |type,parameters|
|
101
|
+
if parameters and parameters.is_a? Hash
|
102
|
+
parameters.each do |p,v|
|
103
|
+
model.algorithms[type] ||= {}
|
104
|
+
model.algorithms[type][p] = v
|
105
|
+
model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
|
106
|
+
end
|
107
|
+
else
|
108
|
+
model.algorithms[type] = parameters
|
109
|
+
end
|
110
|
+
end if algorithms
|
111
|
+
|
112
|
+
# parse dependent_variables from training dataset
|
113
|
+
training_dataset.substances.each do |substance|
|
114
|
+
values = training_dataset.values(substance,model.prediction_feature_id)
|
115
|
+
values.each do |v|
|
116
|
+
model.substance_ids << substance.id.to_s
|
117
|
+
model.dependent_variables << v
|
118
|
+
end if values
|
119
|
+
end
|
120
|
+
|
121
|
+
descriptor_method = model.algorithms[:descriptors][:method]
|
122
|
+
case descriptor_method
|
123
|
+
# parse fingerprints
|
124
|
+
when "fingerprint"
|
125
|
+
type = model.algorithms[:descriptors][:type]
|
126
|
+
model.substances.each_with_index do |s,i|
|
127
|
+
model.fingerprints[i] ||= []
|
128
|
+
model.fingerprints[i] += s.fingerprint(type)
|
129
|
+
model.fingerprints[i].uniq!
|
130
|
+
end
|
131
|
+
model.descriptor_ids = model.fingerprints.flatten.uniq
|
132
|
+
model.descriptor_ids.each do |d|
|
133
|
+
# resulting model may break BSON size limit (e.g. f Kazius dataset)
|
134
|
+
model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
|
135
|
+
end
|
136
|
+
# calculate physchem properties
|
137
|
+
when "calculate_properties"
|
138
|
+
features = model.algorithms[:descriptors][:features]
|
139
|
+
model.descriptor_ids = features.collect{|f| f.id.to_s}
|
140
|
+
model.algorithms[:descriptors].delete(:features)
|
141
|
+
model.algorithms[:descriptors].delete(:type)
|
142
|
+
model.substances.each_with_index do |s,i|
|
143
|
+
props = s.calculate_properties(features)
|
144
|
+
props.each_with_index do |v,j|
|
145
|
+
model.independent_variables[j] ||= []
|
146
|
+
model.independent_variables[j][i] = v
|
147
|
+
end if props and !props.empty?
|
148
|
+
end
|
149
|
+
# parse independent_variables
|
150
|
+
when "properties"
|
151
|
+
categories = model.algorithms[:descriptors][:categories]
|
152
|
+
feature_ids = []
|
153
|
+
categories.each do |category|
|
154
|
+
Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
|
155
|
+
end
|
156
|
+
properties = model.substances.collect { |s| s.properties }
|
157
|
+
property_ids = properties.collect{|p| p.keys}.flatten.uniq
|
158
|
+
model.descriptor_ids = feature_ids & property_ids
|
159
|
+
model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
|
160
|
+
else
|
161
|
+
bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
|
162
|
+
end
|
163
|
+
|
164
|
+
if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
|
165
|
+
model = Algorithm.run model.algorithms[:feature_selection][:method], model
|
166
|
+
end
|
167
|
+
|
168
|
+
# scale independent_variables
|
169
|
+
unless model.fingerprints?
|
170
|
+
model.independent_variables.each_with_index do |var,i|
|
171
|
+
model.descriptor_means[i] = var.mean
|
172
|
+
model.descriptor_sds[i] = var.standard_deviation
|
173
|
+
model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
|
174
|
+
end
|
175
|
+
end
|
176
|
+
model.save
|
177
|
+
model
|
22
178
|
end
|
23
|
-
end
|
24
179
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
# check for database activities (neighbors may include query compound)
|
55
|
-
database_activities = nil
|
180
|
+
def predict_substance substance
|
181
|
+
|
182
|
+
case algorithms[:similarity][:method]
|
183
|
+
when /tanimoto/ # binary features
|
184
|
+
similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
|
185
|
+
# TODO this excludes descriptors only present in the query substance
|
186
|
+
# use for applicability domain?
|
187
|
+
query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
|
188
|
+
when /euclid|cosine/ # quantitative features
|
189
|
+
if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
|
190
|
+
features = descriptor_ids.collect{|id| Feature.find(id)}
|
191
|
+
query_descriptors = substance.calculate_properties(features)
|
192
|
+
similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
|
193
|
+
else
|
194
|
+
similarity_descriptors = []
|
195
|
+
query_descriptors = []
|
196
|
+
descriptor_ids.each_with_index do |id,i|
|
197
|
+
prop = substance.properties[id]
|
198
|
+
prop = prop.median if prop.is_a? Array # measured
|
199
|
+
if prop
|
200
|
+
similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
|
201
|
+
query_descriptors[i] = prop
|
202
|
+
end
|
203
|
+
end
|
204
|
+
end
|
205
|
+
else
|
206
|
+
bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
|
207
|
+
end
|
208
|
+
|
56
209
|
prediction = {}
|
57
|
-
|
210
|
+
neighbor_ids = []
|
211
|
+
neighbor_similarities = []
|
212
|
+
neighbor_dependent_variables = []
|
213
|
+
neighbor_independent_variables = []
|
58
214
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
215
|
+
prediction = {}
|
216
|
+
# find neighbors
|
217
|
+
substance_ids.each_with_index do |s,i|
|
218
|
+
# handle query substance
|
219
|
+
if substance.id.to_s == s
|
220
|
+
prediction[:measurements] ||= []
|
221
|
+
prediction[:measurements] << dependent_variables[i]
|
222
|
+
prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
|
223
|
+
else
|
224
|
+
if fingerprints?
|
225
|
+
neighbor_descriptors = fingerprints[i]
|
226
|
+
else
|
227
|
+
next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
|
228
|
+
neighbor_descriptors = scaled_variables.collect{|v| v[i]}
|
229
|
+
end
|
230
|
+
sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
|
231
|
+
if sim >= algorithms[:similarity][:min]
|
232
|
+
neighbor_ids << s
|
233
|
+
neighbor_similarities << sim
|
234
|
+
neighbor_dependent_variables << dependent_variables[i]
|
235
|
+
independent_variables.each_with_index do |c,j|
|
236
|
+
neighbor_independent_variables[j] ||= []
|
237
|
+
neighbor_independent_variables[j] << independent_variables[j][i]
|
238
|
+
end
|
239
|
+
end
|
240
|
+
end
|
63
241
|
end
|
64
|
-
|
65
|
-
|
66
|
-
|
242
|
+
|
243
|
+
measurements = nil
|
244
|
+
|
245
|
+
if neighbor_similarities.empty?
|
246
|
+
prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
|
247
|
+
elsif neighbor_similarities.size == 1
|
248
|
+
prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
|
67
249
|
else
|
68
|
-
|
69
|
-
prediction
|
70
|
-
prediction[:
|
250
|
+
query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
|
251
|
+
# call prediction algorithm
|
252
|
+
result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
|
253
|
+
prediction.merge! result
|
254
|
+
prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
|
71
255
|
end
|
72
256
|
prediction
|
73
257
|
end
|
@@ -77,103 +261,81 @@ module OpenTox
|
|
77
261
|
training_dataset = Dataset.find training_dataset_id
|
78
262
|
|
79
263
|
# parse data
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
compounds = object.compounds
|
264
|
+
substances = []
|
265
|
+
if object.is_a? Substance
|
266
|
+
substances = [object]
|
267
|
+
elsif object.is_a? Array
|
268
|
+
substances = object
|
269
|
+
elsif object.is_a? Dataset
|
270
|
+
substances = object.substances
|
88
271
|
else
|
89
|
-
bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::
|
272
|
+
bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
|
90
273
|
end
|
91
274
|
|
92
275
|
# make predictions
|
93
|
-
predictions =
|
94
|
-
|
276
|
+
predictions = {}
|
277
|
+
substances.each do |c|
|
278
|
+
predictions[c.id.to_s] = predict_substance c
|
279
|
+
predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
|
280
|
+
end
|
95
281
|
|
96
282
|
# serialize result
|
97
|
-
|
98
|
-
|
99
|
-
prediction = predictions.first
|
283
|
+
if object.is_a? Substance
|
284
|
+
prediction = predictions[substances.first.id.to_s]
|
100
285
|
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
|
101
286
|
return prediction
|
102
|
-
|
287
|
+
elsif object.is_a? Array
|
103
288
|
return predictions
|
104
|
-
|
289
|
+
elsif object.is_a? Dataset
|
105
290
|
# prepare prediction dataset
|
106
291
|
measurement_feature = Feature.find prediction_feature_id
|
107
292
|
|
108
|
-
prediction_feature =
|
109
|
-
prediction_dataset = LazarPrediction.
|
293
|
+
prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
|
294
|
+
prediction_dataset = LazarPrediction.create(
|
110
295
|
:name => "Lazar prediction for #{prediction_feature.name}",
|
111
296
|
:creator => __FILE__,
|
112
|
-
:prediction_feature_id => prediction_feature.id
|
113
|
-
|
297
|
+
:prediction_feature_id => prediction_feature.id,
|
298
|
+
:predictions => predictions
|
114
299
|
)
|
115
|
-
confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
|
116
|
-
warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
|
117
|
-
prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
|
118
|
-
prediction_dataset.compounds = compounds
|
119
|
-
prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
|
120
|
-
prediction_dataset.save
|
121
300
|
return prediction_dataset
|
122
301
|
end
|
123
302
|
|
124
303
|
end
|
125
|
-
|
126
|
-
def
|
127
|
-
|
128
|
-
|
304
|
+
|
305
|
+
def training_dataset
|
306
|
+
Dataset.find(training_dataset_id)
|
307
|
+
end
|
308
|
+
|
309
|
+
def prediction_feature
|
310
|
+
Feature.find(prediction_feature_id)
|
311
|
+
end
|
312
|
+
|
313
|
+
def descriptors
|
314
|
+
descriptor_ids.collect{|id| Feature.find(id)}
|
315
|
+
end
|
316
|
+
|
317
|
+
def substances
|
318
|
+
substance_ids.collect{|id| Substance.find(id)}
|
319
|
+
end
|
320
|
+
|
321
|
+
def fingerprints?
|
322
|
+
algorithms[:descriptors][:method] == "fingerprint" ? true : false
|
129
323
|
end
|
130
324
|
|
131
325
|
end
|
132
326
|
|
133
327
|
class LazarClassification < Lazar
|
134
|
-
|
135
|
-
def self.create training_dataset, params={}
|
136
|
-
model = self.new training_dataset, params
|
137
|
-
model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
|
138
|
-
model.neighbor_algorithm ||= "fingerprint_neighbors"
|
139
|
-
model.neighbor_algorithm_parameters ||= {}
|
140
|
-
{
|
141
|
-
:type => "MP2D",
|
142
|
-
:training_dataset_id => training_dataset.id,
|
143
|
-
:min_sim => 0.1
|
144
|
-
}.each do |key,value|
|
145
|
-
model.neighbor_algorithm_parameters[key] ||= value
|
146
|
-
end
|
147
|
-
model.save
|
148
|
-
model
|
149
|
-
end
|
150
328
|
end
|
151
329
|
|
152
330
|
class LazarRegression < Lazar
|
153
|
-
|
154
|
-
def self.create training_dataset, params={}
|
155
|
-
model = self.new training_dataset, params
|
156
|
-
model.neighbor_algorithm ||= "fingerprint_neighbors"
|
157
|
-
model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
|
158
|
-
model.neighbor_algorithm_parameters ||= {}
|
159
|
-
{
|
160
|
-
:type => "MP2D",
|
161
|
-
:training_dataset_id => training_dataset.id,
|
162
|
-
:min_sim => 0.1
|
163
|
-
}.each do |key,value|
|
164
|
-
model.neighbor_algorithm_parameters[key] ||= value
|
165
|
-
end
|
166
|
-
model.save
|
167
|
-
model
|
168
|
-
end
|
169
331
|
end
|
170
332
|
|
171
|
-
class
|
333
|
+
class Validation
|
334
|
+
|
172
335
|
include OpenTox
|
173
336
|
include Mongoid::Document
|
174
337
|
include Mongoid::Timestamps
|
175
338
|
|
176
|
-
# TODO field Validations
|
177
339
|
field :endpoint, type: String
|
178
340
|
field :species, type: String
|
179
341
|
field :source, type: String
|
@@ -182,7 +344,7 @@ module OpenTox
|
|
182
344
|
field :repeated_crossvalidation_id, type: BSON::ObjectId
|
183
345
|
|
184
346
|
def predict object
|
185
|
-
|
347
|
+
model.predict object
|
186
348
|
end
|
187
349
|
|
188
350
|
def training_dataset
|
@@ -193,8 +355,16 @@ module OpenTox
|
|
193
355
|
Lazar.find model_id
|
194
356
|
end
|
195
357
|
|
358
|
+
def algorithms
|
359
|
+
model.algorithms
|
360
|
+
end
|
361
|
+
|
362
|
+
def prediction_feature
|
363
|
+
model.prediction_feature
|
364
|
+
end
|
365
|
+
|
196
366
|
def repeated_crossvalidation
|
197
|
-
RepeatedCrossValidation.find repeated_crossvalidation_id
|
367
|
+
OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
|
198
368
|
end
|
199
369
|
|
200
370
|
def crossvalidations
|
@@ -202,29 +372,50 @@ module OpenTox
|
|
202
372
|
end
|
203
373
|
|
204
374
|
def regression?
|
205
|
-
|
375
|
+
model.is_a? LazarRegression
|
206
376
|
end
|
207
377
|
|
208
378
|
def classification?
|
209
|
-
|
379
|
+
model.is_a? LazarClassification
|
210
380
|
end
|
211
381
|
|
212
382
|
def self.from_csv_file file
|
213
383
|
metadata_file = file.sub(/csv$/,"json")
|
214
384
|
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
|
215
|
-
|
385
|
+
model_validation = self.new JSON.parse(File.read(metadata_file))
|
216
386
|
training_dataset = Dataset.from_csv_file file
|
217
|
-
model =
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
387
|
+
model = Lazar.create training_dataset: training_dataset
|
388
|
+
model_validation[:model_id] = model.id
|
389
|
+
model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required
|
390
|
+
model_validation.save
|
391
|
+
model_validation
|
392
|
+
end
|
393
|
+
|
394
|
+
def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
|
395
|
+
|
396
|
+
# find/import training_dataset
|
397
|
+
training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
|
398
|
+
unless training_dataset # try to import
|
399
|
+
Import::Enanomapper.import
|
400
|
+
training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
|
401
|
+
bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
|
222
402
|
end
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
403
|
+
prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
|
404
|
+
|
405
|
+
model_validation = self.new(
|
406
|
+
:endpoint => prediction_feature.name,
|
407
|
+
:source => prediction_feature.source,
|
408
|
+
:species => "A549 human lung epithelial carcinoma cells",
|
409
|
+
:unit => prediction_feature.unit
|
410
|
+
)
|
411
|
+
model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
|
412
|
+
model_validation[:model_id] = model.id
|
413
|
+
repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
|
414
|
+
model_validation[:repeated_crossvalidation_id] = repeated_cv.id
|
415
|
+
model_validation.save
|
416
|
+
model_validation
|
227
417
|
end
|
418
|
+
|
228
419
|
end
|
229
420
|
|
230
421
|
end
|
data/lib/nanoparticle.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
class Nanoparticle < Substance
|
4
|
+
include OpenTox
|
5
|
+
|
6
|
+
field :core_id, type: String, default: nil
|
7
|
+
field :coating_ids, type: Array, default: []
|
8
|
+
|
9
|
+
def core
|
10
|
+
Compound.find core_id
|
11
|
+
end
|
12
|
+
|
13
|
+
def coating
|
14
|
+
coating_ids.collect{|i| Compound.find i }
|
15
|
+
end
|
16
|
+
|
17
|
+
def fingerprint type=DEFAULT_FINGERPRINT
|
18
|
+
core_fp = core.fingerprint type
|
19
|
+
coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact
|
20
|
+
(core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact
|
21
|
+
end
|
22
|
+
|
23
|
+
def calculate_properties descriptors=PhysChem::OPENBABEL
|
24
|
+
if core.smiles and !coating.collect{|c| c.smiles}.compact.empty?
|
25
|
+
core_prop = core.calculate_properties descriptors
|
26
|
+
coating_prop = coating.collect{|c| c.calculate_properties descriptors if c.smiles}
|
27
|
+
descriptors.collect_with_index{|d,i| [core_prop[i],coating_prop.collect{|c| c[i] if c}]}
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def add_feature feature, value, dataset
|
32
|
+
unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand
|
33
|
+
case feature.category
|
34
|
+
when "P-CHEM"
|
35
|
+
properties[feature.id.to_s] ||= []
|
36
|
+
properties[feature.id.to_s] << value
|
37
|
+
properties[feature.id.to_s].uniq!
|
38
|
+
when "Proteomics"
|
39
|
+
properties[feature.id.to_s] ||= []
|
40
|
+
properties[feature.id.to_s] << value
|
41
|
+
properties[feature.id.to_s].uniq!
|
42
|
+
when "TOX"
|
43
|
+
if feature.name.match("Cell Viability Assay") and !feature.name.match("SLOPE") # -log10 transformation
|
44
|
+
value = -Math.log10(value)
|
45
|
+
feature.unit = "-log10(#{feature.unit})" unless feature.unit.match "log10"
|
46
|
+
feature.warnings += ["-log10 transformed values"] unless feature.warnings.include? "-log10 transformed values"
|
47
|
+
feature.save
|
48
|
+
end
|
49
|
+
dataset.add self, feature, value
|
50
|
+
else
|
51
|
+
warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
|
52
|
+
end
|
53
|
+
dataset_ids << dataset.id
|
54
|
+
dataset_ids.uniq!
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse_ambit_value feature, v, dataset
|
59
|
+
# TODO add study id to warnings
|
60
|
+
v.delete "unit"
|
61
|
+
# TODO: ppm instead of weights
|
62
|
+
if v.keys == ["textValue"]
|
63
|
+
add_feature feature, v["textValue"], dataset
|
64
|
+
elsif v.keys == ["loValue"]
|
65
|
+
add_feature feature, v["loValue"], dataset
|
66
|
+
elsif v.keys.size == 2 and v["errorValue"]
|
67
|
+
add_feature feature, v["loValue"], dataset
|
68
|
+
#warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
|
69
|
+
elsif v.keys.size == 2 and v["loQualifier"] == "mean"
|
70
|
+
add_feature feature, v["loValue"], dataset
|
71
|
+
#warn "'#{feature.name}' is a mean value. Original data is not available."
|
72
|
+
elsif v.keys.size == 2 and v["loQualifier"] #== ">="
|
73
|
+
#warn "Only min value available for '#{feature.name}', entry ignored"
|
74
|
+
elsif v.keys.size == 2 and v["upQualifier"] #== ">="
|
75
|
+
#warn "Only max value available for '#{feature.name}', entry ignored"
|
76
|
+
elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
|
77
|
+
add_feature feature, v["loValue"], dataset
|
78
|
+
#warn "loQualifier and upQualifier are empty."
|
79
|
+
elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == ""
|
80
|
+
add_feature feature, v["loValue"], dataset
|
81
|
+
#warn "loQualifier and upQualifier are empty."
|
82
|
+
elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
|
83
|
+
add_feature feature, v["loValue"], dataset
|
84
|
+
#warn "loQualifier and upQualifier are empty."
|
85
|
+
elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"]
|
86
|
+
#add_feature feature, [v["loValue"],v["upValue"]].mean, dataset
|
87
|
+
#warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
|
88
|
+
elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"]
|
89
|
+
#warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
|
90
|
+
add_feature feature, v["loValue"], dataset
|
91
|
+
elsif v == {} # do nothing
|
92
|
+
else
|
93
|
+
warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
end
|
98
|
+
end
|
data/lib/opentox.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
module OpenTox
|
2
2
|
|
3
|
-
#
|
4
|
-
|
5
|
-
# create default OpenTox classes (defined in opentox-client.rb)
|
3
|
+
# create default OpenTox classes
|
6
4
|
# provides Mongoid's query and persistence methods
|
7
5
|
# http://mongoid.org/en/mongoid/docs/persistence.html
|
8
6
|
# http://mongoid.org/en/mongoid/docs/querying.html
|
@@ -13,10 +11,15 @@ module OpenTox
|
|
13
11
|
include Mongoid::Timestamps
|
14
12
|
store_in collection: klass.downcase.pluralize
|
15
13
|
field :name, type: String
|
14
|
+
field :source, type: String
|
16
15
|
field :warnings, type: Array, default: []
|
16
|
+
|
17
|
+
def warn warning
|
18
|
+
$logger.warn warning
|
19
|
+
warnings << warning
|
20
|
+
end
|
17
21
|
end
|
18
22
|
OpenTox.const_set klass,c
|
19
23
|
end
|
20
24
|
|
21
25
|
end
|
22
|
-
|