lazar 0.9.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
data/lib/model.rb
CHANGED
@@ -2,7 +2,8 @@ module OpenTox
|
|
2
2
|
|
3
3
|
module Model
|
4
4
|
|
5
|
-
class
|
5
|
+
class Lazar
|
6
|
+
|
6
7
|
include OpenTox
|
7
8
|
include Mongoid::Document
|
8
9
|
include Mongoid::Timestamps
|
@@ -10,64 +11,247 @@ module OpenTox
|
|
10
11
|
|
11
12
|
field :name, type: String
|
12
13
|
field :creator, type: String, default: __FILE__
|
13
|
-
|
14
|
+
field :algorithms, type: Hash, default:{}
|
14
15
|
field :training_dataset_id, type: BSON::ObjectId
|
15
|
-
|
16
|
-
field :prediction_algorithm, type: String
|
17
|
-
# prediction feature
|
16
|
+
field :substance_ids, type: Array, default:[]
|
18
17
|
field :prediction_feature_id, type: BSON::ObjectId
|
18
|
+
field :dependent_variables, type: Array, default:[]
|
19
|
+
field :descriptor_ids, type:Array, default:[]
|
20
|
+
field :independent_variables, type: Array, default:[]
|
21
|
+
field :fingerprints, type: Array, default:[]
|
22
|
+
field :descriptor_weights, type: Array, default:[]
|
23
|
+
field :descriptor_means, type: Array, default:[]
|
24
|
+
field :descriptor_sds, type: Array, default:[]
|
25
|
+
field :scaled_variables, type: Array, default:[]
|
26
|
+
field :version, type: Hash, default:{}
|
27
|
+
|
28
|
+
# Build, train and persist a lazar model.
#
# Chooses LazarRegression for numeric prediction features and
# LazarClassification otherwise, installs default algorithms according to the
# substance class of the training dataset, applies the caller's overrides,
# extracts dependent and independent variables and saves the model.
#
# @param prediction_feature feature to predict; defaults to the first feature
#   of training_dataset
# @param training_dataset dataset providing the training substances.
#   NOTE(review): the first guard accepts a prediction_feature alone, but
#   training_dataset.id is read unconditionally below, so a dataset is
#   effectively required (see the TODO).
# @param algorithms [Hash, nil] overrides for the default settings, keyed by
#   algorithm type (:descriptors, :similarity, :prediction, :feature_selection)
# @return the saved model
def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
  bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature || training_dataset
  prediction_feature ||= training_dataset.features.first
  # TODO: prediction_feature without training_dataset: use all available data

  # guess model type
  model = prediction_feature.numeric? ? LazarRegression.new : LazarClassification.new

  model.prediction_feature_id = prediction_feature.id
  model.training_dataset_id = training_dataset.id
  model.name = "#{prediction_feature.name} (#{training_dataset.name})"
  # TODO: check if this works for gem version, add gem versioning?
  dir = File.dirname(__FILE__)
  commit = `cd #{dir}; git rev-parse HEAD`.chomp
  branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp
  url = `cd #{dir}; git config --get remote.origin.url`.chomp
  # Backticks always return a String (never nil), so a failed git call can
  # only be recognised by its empty output.
  model.version =
    if branch.empty?
      {:warning => "git is not installed"}
    else
      {:url => url, :branch => branch, :commit => commit}
    end

  # set defaults
  substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
  bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1

  case substance_classes.first
  when "OpenTox::Compound"

    model.algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :feature_selection => nil
    }

    if model.class == LazarClassification
      model.algorithms[:prediction] = {
        :method => "Algorithm::Classification.weighted_majority_vote",
      }
    elsif model.class == LazarRegression
      model.algorithms[:prediction] = {
        :method => "Algorithm::Caret.pls",
      }
    end

  when "OpenTox::Nanoparticle"
    model.algorithms = {
      :descriptors => {
        :method => "properties",
        :categories => ["P-CHEM"],
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
      :prediction => {
        :method => "Algorithm::Caret.rf",
      },
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
  else
    bad_request_error "Cannot create models for #{substance_classes.first}."
  end

  # overwrite defaults with explicit parameters
  (algorithms || {}).each do |type,parameters|
    if parameters.is_a? Hash
      parameters.each do |p,v|
        model.algorithms[type] ||= {}
        model.algorithms[type][p] = v
        # an explicit descriptor type replaces the default category selection
        model.algorithms[:descriptors].delete :categories if type == :descriptors && p == :type
      end
    else
      model.algorithms[type] = parameters
    end
  end

  # parse dependent_variables from training dataset
  # (one substance_ids entry per measured value, so both arrays stay aligned)
  training_dataset.substances.each do |substance|
    values = training_dataset.values(substance,model.prediction_feature_id)
    next unless values
    values.each do |v|
      model.substance_ids << substance.id.to_s
      model.dependent_variables << v
    end
  end

  descriptor_method = model.algorithms[:descriptors][:method]
  case descriptor_method
  # parse fingerprints
  when "fingerprint"
    type = model.algorithms[:descriptors][:type]
    model.substances.each_with_index do |s,i|
      model.fingerprints[i] ||= []
      model.fingerprints[i] += s.fingerprint(type)
      model.fingerprints[i].uniq!
    end
    model.descriptor_ids = model.fingerprints.flatten.uniq
    model.descriptor_ids.each do |d|
      # resulting model may break BSON size limit (e.g. for the Kazius dataset)
      model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match(/Caret/)
    end
  # calculate physchem properties
  when "calculate_properties"
    features = model.algorithms[:descriptors][:features]
    model.descriptor_ids = features.collect{|f| f.id.to_s}
    model.algorithms[:descriptors].delete(:features)
    model.algorithms[:descriptors].delete(:type)
    model.substances.each_with_index do |s,i|
      props = s.calculate_properties(features)
      next unless props && !props.empty?
      props.each_with_index do |v,j|
        model.independent_variables[j] ||= []
        model.independent_variables[j][i] = v
      end
    end
  # parse independent_variables
  when "properties"
    categories = model.algorithms[:descriptors][:categories]
    feature_ids = []
    categories.each do |category|
      Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
    end
    properties = model.substances.collect { |s| s.properties }
    property_ids = properties.collect{|p| p.keys}.flatten.uniq
    model.descriptor_ids = feature_ids & property_ids
    model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
  else
    bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
  end

  if model.algorithms[:feature_selection] && model.algorithms[:feature_selection][:method]
    model = Algorithm.run model.algorithms[:feature_selection][:method], model
  end

  # scale independent_variables
  unless model.fingerprints?
    model.independent_variables.each_with_index do |var,i|
      model.descriptor_means[i] = var.mean
      model.descriptor_sds[i] = var.standard_deviation
      model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
    end
  end
  model.save
  model
end
|
23
|
-
end
|
24
179
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
# check for database activities (neighbors may include query compound)
|
55
|
-
database_activities = nil
|
180
|
+
# Predict the activity of a single substance from similar training substances.
#
# Builds the query descriptors required by the configured similarity method,
# collects every training entry whose similarity reaches
# algorithms[:similarity][:min] (the query substance itself is excluded and
# its measurements are reported instead) and runs the configured prediction
# algorithm on those neighbors.
#
# @param substance query substance
# @return [Hash] prediction containing :value, :neighbors and, where
#   applicable, :warning, :measurements and the keys returned by the
#   prediction algorithm
def predict_substance substance
  similarity_method = algorithms[:similarity][:method]

  case similarity_method
  when /tanimoto/ # binary features
    similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
    # TODO this excludes descriptors only present in the query substance
    # use for applicability domain?
    query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
  when /euclid|cosine/ # quantitative features
    if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
      features = descriptor_ids.collect{|id| Feature.find(id)}
      query_descriptors = substance.calculate_properties(features)
      similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
    else
      similarity_descriptors = []
      query_descriptors = []
      descriptor_ids.each_with_index do |id,i|
        prop = substance.properties[id]
        prop = prop.median if prop.is_a? Array # measured
        if prop
          similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
          query_descriptors[i] = prop
        end
      end
    end
  else
    # report the configured settings; the former message referenced an
    # undefined local and raised NameError instead of this error
    bad_request_error "Unknown descriptor type '#{algorithms[:descriptors]}' for similarity method '#{similarity_method}'."
  end

  prediction = {}
  neighbor_ids = []
  neighbor_similarities = []
  neighbor_dependent_variables = []
  neighbor_independent_variables = []

  # find neighbors
  substance_ids.each_with_index do |s,i|
    # handle query substance
    if substance.id.to_s == s
      (prediction[:measurements] ||= []) << dependent_variables[i]
      prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
      next
    end

    if fingerprints?
      neighbor_descriptors = fingerprints[i]
    else
      # necessary for nanoparticle properties predictions
      next if substance.is_a?(Nanoparticle) && substance.core != Nanoparticle.find(s).core
      neighbor_descriptors = scaled_variables.collect{|v| v[i]}
    end

    sim = Algorithm.run similarity_method, [similarity_descriptors, neighbor_descriptors, descriptor_weights]
    next unless sim >= algorithms[:similarity][:min]

    neighbor_ids << s
    neighbor_similarities << sim
    neighbor_dependent_variables << dependent_variables[i]
    independent_variables.each_with_index do |column,j|
      neighbor_independent_variables[j] ||= []
      neighbor_independent_variables[j] << column[i]
    end
  end

  if neighbor_similarities.empty?
    prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
  elsif neighbor_similarities.size == 1
    # predict the value of the single neighbor (not the first training value)
    prediction.merge!({:value => neighbor_dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
  else
    query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] && algorithms[:descriptors][:method] == "fingerprint"
    # call prediction algorithm
    result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
    prediction.merge! result
    prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
  end
  prediction
end
|
@@ -77,103 +261,81 @@ module OpenTox
|
|
77
261
|
training_dataset = Dataset.find training_dataset_id
|
78
262
|
|
79
263
|
# parse data
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
compounds = object.compounds
|
264
|
+
substances = []
|
265
|
+
if object.is_a? Substance
|
266
|
+
substances = [object]
|
267
|
+
elsif object.is_a? Array
|
268
|
+
substances = object
|
269
|
+
elsif object.is_a? Dataset
|
270
|
+
substances = object.substances
|
88
271
|
else
|
89
|
-
bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::
|
272
|
+
bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
|
90
273
|
end
|
91
274
|
|
92
275
|
# make predictions
|
93
|
-
predictions =
|
94
|
-
|
276
|
+
predictions = {}
|
277
|
+
substances.each do |c|
|
278
|
+
predictions[c.id.to_s] = predict_substance c
|
279
|
+
predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
|
280
|
+
end
|
95
281
|
|
96
282
|
# serialize result
|
97
|
-
|
98
|
-
|
99
|
-
prediction = predictions.first
|
283
|
+
if object.is_a? Substance
|
284
|
+
prediction = predictions[substances.first.id.to_s]
|
100
285
|
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
|
101
286
|
return prediction
|
102
|
-
|
287
|
+
elsif object.is_a? Array
|
103
288
|
return predictions
|
104
|
-
|
289
|
+
elsif object.is_a? Dataset
|
105
290
|
# prepare prediction dataset
|
106
291
|
measurement_feature = Feature.find prediction_feature_id
|
107
292
|
|
108
|
-
prediction_feature =
|
109
|
-
prediction_dataset = LazarPrediction.
|
293
|
+
prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
|
294
|
+
prediction_dataset = LazarPrediction.create(
|
110
295
|
:name => "Lazar prediction for #{prediction_feature.name}",
|
111
296
|
:creator => __FILE__,
|
112
|
-
:prediction_feature_id => prediction_feature.id
|
113
|
-
|
297
|
+
:prediction_feature_id => prediction_feature.id,
|
298
|
+
:predictions => predictions
|
114
299
|
)
|
115
|
-
confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
|
116
|
-
warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
|
117
|
-
prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
|
118
|
-
prediction_dataset.compounds = compounds
|
119
|
-
prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
|
120
|
-
prediction_dataset.save
|
121
300
|
return prediction_dataset
|
122
301
|
end
|
123
302
|
|
124
303
|
end
|
125
|
-
|
126
|
-
def
|
127
|
-
|
128
|
-
|
304
|
+
|
305
|
+
# Look up the dataset referenced by training_dataset_id.
def training_dataset
  Dataset.find training_dataset_id
end
|
308
|
+
|
309
|
+
# Look up the feature referenced by prediction_feature_id.
def prediction_feature
  Feature.find prediction_feature_id
end
|
312
|
+
|
313
|
+
# Resolve every entry of descriptor_ids through Feature.find, in order.
def descriptors
  descriptor_ids.map do |descriptor_id|
    Feature.find(descriptor_id)
  end
end
|
316
|
+
|
317
|
+
# Resolve every entry of substance_ids through Substance.find, in order.
def substances
  substance_ids.map do |substance_id|
    Substance.find(substance_id)
  end
end
|
320
|
+
|
321
|
+
# True when the configured descriptor method is "fingerprint".
def fingerprints?
  descriptor_method = algorithms[:descriptors][:method]
  descriptor_method == "fingerprint"
end
|
130
324
|
|
131
325
|
end
|
132
326
|
|
133
327
|
# Classification model; adds no behaviour of its own on top of Lazar.
class LazarClassification < Lazar; end
|
151
329
|
|
152
330
|
# Regression model; adds no behaviour of its own on top of Lazar.
class LazarRegression < Lazar; end
|
170
332
|
|
171
|
-
class
|
333
|
+
class Validation
|
334
|
+
|
172
335
|
include OpenTox
|
173
336
|
include Mongoid::Document
|
174
337
|
include Mongoid::Timestamps
|
175
338
|
|
176
|
-
# TODO field Validations
|
177
339
|
field :endpoint, type: String
|
178
340
|
field :species, type: String
|
179
341
|
field :source, type: String
|
@@ -182,7 +344,7 @@ module OpenTox
|
|
182
344
|
field :repeated_crossvalidation_id, type: BSON::ObjectId
|
183
345
|
|
184
346
|
# Delegate the prediction to the underlying model.
def predict object
  model.predict(object)
end
|
187
349
|
|
188
350
|
def training_dataset
|
@@ -193,8 +355,16 @@ module OpenTox
|
|
193
355
|
Lazar.find model_id
|
194
356
|
end
|
195
357
|
|
358
|
+
# Algorithm settings of the underlying model.
def algorithms
  model.algorithms
end
|
361
|
+
|
362
|
+
# Prediction feature of the underlying model.
def prediction_feature
  model.prediction_feature
end
|
365
|
+
|
196
366
|
# Look up the repeated crossvalidation referenced by repeated_crossvalidation_id.
def repeated_crossvalidation
  # full class name required
  OpenTox::Validation::RepeatedCrossValidation.find(repeated_crossvalidation_id)
end
|
199
369
|
|
200
370
|
def crossvalidations
|
@@ -202,29 +372,50 @@ module OpenTox
|
|
202
372
|
end
|
203
373
|
|
204
374
|
# True when the underlying model is a LazarRegression.
def regression?
  model.kind_of?(LazarRegression)
end
|
207
377
|
|
208
378
|
# True when the underlying model is a LazarClassification.
def classification?
  model.kind_of?(LazarClassification)
end
|
211
381
|
|
212
382
|
# Create a validated model from a CSV training file.
#
# Metadata is read from a JSON file whose name is derived from the CSV name
# (trailing "csv" replaced by "json"); the CSV itself becomes the training
# dataset of a new Lazar model, which is then validated by a repeated
# crossvalidation.
#
# @param file [String] path of the CSV file
# @return the saved validation object
def self.from_csv_file file
  metadata_file = file.sub(/csv$/,"json")
  bad_request_error "No metadata file #{metadata_file}" unless File.exist?(metadata_file)
  metadata = JSON.parse(File.read(metadata_file))
  new(metadata).tap do |validation|
    dataset = Dataset.from_csv_file(file)
    lazar_model = Lazar.create(training_dataset: dataset)
    validation[:model_id] = lazar_model.id
    # full class name required
    repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create(lazar_model)
    validation[:repeated_crossvalidation_id] = repeated_cv.id
    validation.save
  end
end
|
393
|
+
|
394
|
+
# Create a validated regression model from eNanoMapper data.
#
# Unless given, the training dataset is looked up by name (and imported via
# Import::Enanomapper when missing) and the prediction feature defaults to
# the TOX feature "log2(Net cell association)".
#
# @param training_dataset dataset to train on (optional)
# @param prediction_feature feature to predict (optional)
# @param algorithms [Hash, nil] algorithm overrides handed to LazarRegression.create
# @return the saved validation object
def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
  dataset_name = "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles"

  # find/import training_dataset
  training_dataset ||= Dataset.where(:name => dataset_name).first
  unless training_dataset # try to import
    Import::Enanomapper.import
    training_dataset = Dataset.where(name: dataset_name).first
    bad_request_error "Cannot import '#{dataset_name}' dataset" unless training_dataset
  end
  prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
  # fail with a clear message instead of a NoMethodError on nil below
  bad_request_error "Cannot find prediction feature 'log2(Net cell association)' (category 'TOX')." unless prediction_feature

  model_validation = self.new(
    :endpoint => prediction_feature.name,
    :source => prediction_feature.source,
    :species => "A549 human lung epithelial carcinoma cells",
    :unit => prediction_feature.unit
  )
  model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
  model_validation[:model_id] = model.id
  repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
  model_validation[:repeated_crossvalidation_id] = repeated_cv.id
  model_validation.save
  model_validation
end
|
418
|
+
|
228
419
|
end
|
229
420
|
|
230
421
|
end
|
data/lib/nanoparticle.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
module OpenTox

  # Substance made of a core compound and a list of coating compounds, both
  # referenced by id.
  class Nanoparticle < Substance
    include OpenTox

    field :core_id, type: String, default: nil
    field :coating_ids, type: Array, default: []

    # Compound referenced by core_id.
    def core
      Compound.find(core_id)
    end

    # Compounds referenced by coating_ids, in order.
    def coating
      coating_ids.map { |coating_id| Compound.find(coating_id) }
    end

    # Combined fingerprint of core and coatings.
    # Returns an empty array when either the core or the coating fingerprint
    # is empty.
    def fingerprint type=DEFAULT_FINGERPRINT
      core_fp = core.fingerprint(type)
      coating_fp = coating.map { |c| c.fingerprint(type) }.flatten.uniq.compact
      return [] if core_fp.empty? || coating_fp.empty?
      (core_fp + coating_fp).uniq.compact
    end

    # Properties of core and coatings, one [core_value, [coating_values]]
    # pair per descriptor. Returns nil unless the core and at least one
    # coating have a smiles.
    def calculate_properties descriptors=PhysChem::OPENBABEL
      return unless core.smiles && !coating.map { |c| c.smiles }.compact.empty?
      core_prop = core.calculate_properties(descriptors)
      coating_prop = coating.map { |c| c.calculate_properties(descriptors) if c.smiles }
      descriptors.collect_with_index do |d,i|
        [core_prop[i], coating_prop.map { |c| c[i] if c }]
      end
    end

    # Store a feature value according to the feature's category:
    # P-CHEM and Proteomics values go into properties, TOX values into the
    # dataset; other categories only produce a warning.
    def add_feature feature, value, dataset
      # redundant features are skipped
      return if feature.name == "ATOMIC COMPOSITION" || feature.name == "FUNCTIONAL GROUP"

      case feature.category
      when "P-CHEM", "Proteomics"
        key = feature.id.to_s
        properties[key] ||= []
        properties[key] << value
        properties[key].uniq!
      when "TOX"
        # -log10 transformation
        if feature.name.match("Cell Viability Assay") && !feature.name.match("SLOPE")
          value = -Math.log10(value)
          feature.unit = "-log10(#{feature.unit})" unless feature.unit.match "log10"
          feature.warnings += ["-log10 transformed values"] unless feature.warnings.include? "-log10 transformed values"
          feature.save
        end
        dataset.add self, feature, value
      else
        warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
      end
      dataset_ids << dataset.id
      dataset_ids.uniq!
    end

    # Interpret a value hash and add the usable value via add_feature.
    # The "unit" key is removed from v before the remaining keys are inspected.
    def parse_ambit_value feature, v, dataset
      # TODO add study id to warnings
      v.delete "unit"
      # TODO: ppm instead of weights
      keys = v.keys
      count = keys.size
      case
      when keys == ["textValue"]
        add_feature feature, v["textValue"], dataset
      when keys == ["loValue"],
           (count == 2 && v["errorValue"]),        # errorValue is ignored
           (count == 2 && v["loQualifier"] == "mean") # mean value, original data not available
        add_feature feature, v["loValue"], dataset
      when (count == 2 && v["loQualifier"]), # only min value available, entry ignored
           (count == 2 && v["upQualifier"])  # only max value available, entry ignored
        # do nothing
      when (count == 3 && v["loValue"] && v["loQualifier"].nil? && v["upQualifier"].nil?),
           (count == 3 && v["loValue"] && v["loQualifier"] == "" && v["upQualifier"] == ""),
           (count == 4 && v["loValue"] && v["loQualifier"].nil? && v["upQualifier"].nil?)
        # loQualifier and upQualifier are empty
        add_feature feature, v["loValue"], dataset
      when count == 4 && v["loQualifier"] && v["upQualifier"] && v["loValue"] && v["upValue"]
        # value range, entry ignored
      when count == 4 && v["loQualifier"] == "mean" && v["errorValue"]
        # mean value, errorValue is ignored
        add_feature feature, v["loValue"], dataset
      when v == {}
        # do nothing
      else
        warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
      end
    end

  end
end
|
data/lib/opentox.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
module OpenTox
|
2
2
|
|
3
|
-
#
|
4
|
-
|
5
|
-
# create default OpenTox classes (defined in opentox-client.rb)
|
3
|
+
# create default OpenTox classes
|
6
4
|
# provides Mongoid's query and persistence methods
|
7
5
|
# http://mongoid.org/en/mongoid/docs/persistence.html
|
8
6
|
# http://mongoid.org/en/mongoid/docs/querying.html
|
@@ -13,10 +11,15 @@ module OpenTox
|
|
13
11
|
include Mongoid::Timestamps
|
14
12
|
store_in collection: klass.downcase.pluralize
|
15
13
|
field :name, type: String
|
14
|
+
field :source, type: String
|
16
15
|
field :warnings, type: Array, default: []
|
16
|
+
|
17
|
+
# Log a warning through $logger and append it to this object's warnings.
def warn(warning)
  $logger.warn(warning)
  warnings.push(warning)
end
|
17
21
|
end
|
18
22
|
OpenTox.const_set klass,c
|
19
23
|
end
|
20
24
|
|
21
25
|
end
|
22
|
-
|