lazar 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +64 -1
- data/VERSION +1 -1
- data/lib/algorithm.rb +1 -0
- data/lib/caret.rb +11 -2
- data/lib/classification.rb +6 -1
- data/lib/compound.rb +32 -23
- data/lib/crossvalidation.rb +22 -0
- data/lib/dataset.rb +30 -3
- data/lib/feature.rb +7 -0
- data/lib/feature_selection.rb +4 -1
- data/lib/import.rb +5 -1
- data/lib/leave-one-out-validation.rb +6 -0
- data/lib/model.rb +77 -3
- data/lib/nanoparticle.rb +19 -0
- data/lib/overwrite.rb +46 -11
- data/lib/physchem.rb +23 -5
- data/lib/regression.rb +5 -0
- data/lib/rest-client-wrapper.rb +1 -0
- data/lib/similarity.rb +22 -2
- data/lib/substance.rb +1 -0
- data/lib/train-test-validation.rb +12 -0
- data/lib/validation-statistics.rb +19 -0
- data/lib/validation.rb +3 -0
- data/test/feature.rb +2 -2
- data/test/model-nanoparticle.rb +7 -0
- data/test/nanomaterial-model-validation.rb +2 -3
- data/test/setup.rb +1 -5
- data/test/validation-regression.rb +2 -3
- metadata +50 -5
- data/lib/experiment.rb +0 -99
@@ -1,7 +1,10 @@
|
|
1
1
|
module OpenTox
|
2
2
|
module Validation
|
3
|
+
# Statistical evaluation of classification validations
|
3
4
|
module ClassificationStatistics
|
4
5
|
|
6
|
+
# Get statistics
|
7
|
+
# @return [Hash]
|
5
8
|
def statistics
|
6
9
|
self.accept_values = model.prediction_feature.accept_values
|
7
10
|
self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
|
@@ -63,6 +66,9 @@ module OpenTox
|
|
63
66
|
}
|
64
67
|
end
|
65
68
|
|
69
|
+
# Plot accuracy vs prediction probability
|
70
|
+
# @param [String,nil] format
|
71
|
+
# @return [Blob]
|
66
72
|
def probability_plot format: "pdf"
|
67
73
|
#unless probability_plot_id
|
68
74
|
|
@@ -99,8 +105,11 @@ module OpenTox
|
|
99
105
|
end
|
100
106
|
end
|
101
107
|
|
108
|
+
# Statistical evaluation of regression validations
|
102
109
|
module RegressionStatistics
|
103
110
|
|
111
|
+
# Get statistics
|
112
|
+
# @return [Hash]
|
104
113
|
def statistics
|
105
114
|
self.rmse = 0
|
106
115
|
self.mae = 0
|
@@ -147,10 +156,15 @@ module OpenTox
|
|
147
156
|
}
|
148
157
|
end
|
149
158
|
|
159
|
+
# Get percentage of measurements within the prediction interval
|
160
|
+
# @return [Float]
|
150
161
|
def percent_within_prediction_interval
|
151
162
|
100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
|
152
163
|
end
|
153
164
|
|
165
|
+
# Plot predicted vs measured values
|
166
|
+
# @param [String,nil] format
|
167
|
+
# @return [Blob]
|
154
168
|
def correlation_plot format: "png"
|
155
169
|
unless correlation_plot_id
|
156
170
|
tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
|
@@ -177,6 +191,11 @@ module OpenTox
|
|
177
191
|
$gridfs.find_one(_id: correlation_plot_id).data
|
178
192
|
end
|
179
193
|
|
194
|
+
# Get predictions with the largest difference between predicted and measured values
|
195
|
+
# @param [Fixnum] n number of predictions
|
196
|
+
# @param [TrueClass,FalseClass,nil] show_neigbors include neighbors
|
197
|
+
# @param [TrueClass,FalseClass,nil] show_common_descriptors show common descriptors
|
198
|
+
# @return [Hash]
|
180
199
|
def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
|
181
200
|
worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
|
182
201
|
worst_predictions.collect do |p|
|
data/lib/validation.rb
CHANGED
@@ -2,6 +2,7 @@ module OpenTox
|
|
2
2
|
|
3
3
|
module Validation
|
4
4
|
|
5
|
+
# Base validation class
|
5
6
|
class Validation
|
6
7
|
include OpenTox
|
7
8
|
include Mongoid::Document
|
@@ -14,6 +15,8 @@ module OpenTox
|
|
14
15
|
field :predictions, type: Hash, default: {}
|
15
16
|
field :finished_at, type: Time
|
16
17
|
|
18
|
+
# Get model
|
19
|
+
# @return [OpenTox::Model::Lazar]
|
17
20
|
def model
|
18
21
|
Model::Lazar.find model_id
|
19
22
|
end
|
data/test/feature.rb
CHANGED
@@ -55,7 +55,7 @@ class FeatureTest < MiniTest::Test
|
|
55
55
|
end
|
56
56
|
|
57
57
|
def test_physchem_description
|
58
|
-
assert_equal
|
58
|
+
assert_equal 346, PhysChem.descriptors.size
|
59
59
|
assert_equal 15, PhysChem.openbabel_descriptors.size
|
60
60
|
assert_equal 295, PhysChem.cdk_descriptors.size
|
61
61
|
assert_equal 45, PhysChem.joelib_descriptors.size
|
@@ -63,7 +63,7 @@ class FeatureTest < MiniTest::Test
|
|
63
63
|
end
|
64
64
|
|
65
65
|
def test_physchem
|
66
|
-
assert_equal
|
66
|
+
assert_equal 346, PhysChem.descriptors.size
|
67
67
|
c = Compound.from_smiles "CC(=O)CC(C)C"
|
68
68
|
logP = PhysChem.find_or_create_by :name => "Openbabel.logP"
|
69
69
|
assert_equal 1.6215, logP.calculate(c)
|
data/test/model-nanoparticle.rb
CHANGED
@@ -8,6 +8,13 @@ class NanoparticleModelTest < MiniTest::Test
|
|
8
8
|
@prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
|
9
9
|
end
|
10
10
|
|
11
|
+
def test_core_coating_source_uris
|
12
|
+
@training_dataset.nanoparticles.each do |np|
|
13
|
+
refute_nil np.core.source
|
14
|
+
np.coating.each{|c| refute_nil c.source}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
11
18
|
def test_nanoparticle_model
|
12
19
|
assert true, @prediction_feature.measured
|
13
20
|
model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
|
@@ -8,7 +8,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def test_default_nanomaterial_validation_model
|
11
|
-
validation_model = Model::
|
11
|
+
validation_model = Model::Validation.from_enanomapper
|
12
12
|
[:endpoint,:species,:source].each do |p|
|
13
13
|
refute_empty validation_model[p]
|
14
14
|
end
|
@@ -39,7 +39,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
|
|
39
39
|
:prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
|
40
40
|
:feature_selection => nil
|
41
41
|
}
|
42
|
-
validation_model = Model::
|
42
|
+
validation_model = Model::Validation.from_enanomapper algorithms: algorithms
|
43
43
|
assert validation_model.regression?
|
44
44
|
refute validation_model.classification?
|
45
45
|
validation_model.crossvalidations.each do |cv|
|
@@ -50,6 +50,5 @@ class NanomaterialValidationModelTest < MiniTest::Test
|
|
50
50
|
assert_includes nanoparticle.dataset_ids, @training_dataset.id
|
51
51
|
prediction = validation_model.predict nanoparticle
|
52
52
|
refute_nil prediction[:value]
|
53
|
-
assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
|
54
53
|
end
|
55
54
|
end
|
data/test/setup.rb
CHANGED
@@ -6,8 +6,4 @@ include OpenTox
|
|
6
6
|
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
|
7
7
|
DATA_DIR ||= File.join(TEST_DIR,"data")
|
8
8
|
training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
|
9
|
-
unless training_dataset
|
10
|
-
Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
|
11
|
-
end
|
12
|
-
#$mongo.database.drop
|
13
|
-
#$gridfs = $mongo.database.fs
|
9
|
+
Import::Enanomapper.import unless training_dataset
|
@@ -83,10 +83,9 @@ class ValidationRegressionTest < MiniTest::Test
|
|
83
83
|
model = Model::Lazar.create training_dataset: dataset
|
84
84
|
repeated_cv = RepeatedCrossValidation.create model
|
85
85
|
repeated_cv.crossvalidations.each do |cv|
|
86
|
-
|
87
|
-
|
86
|
+
assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
|
87
|
+
assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
|
88
88
|
end
|
89
|
-
File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot}
|
90
89
|
end
|
91
90
|
|
92
91
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lazar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler,
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2017-01-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -134,7 +134,6 @@ files:
|
|
134
134
|
- lib/crossvalidation.rb
|
135
135
|
- lib/dataset.rb
|
136
136
|
- lib/error.rb
|
137
|
-
- lib/experiment.rb
|
138
137
|
- lib/feature.rb
|
139
138
|
- lib/feature_selection.rb
|
140
139
|
- lib/import.rb
|
@@ -222,8 +221,54 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
222
221
|
version: '0'
|
223
222
|
requirements: []
|
224
223
|
rubyforge_project: lazar
|
225
|
-
rubygems_version: 2.5.
|
224
|
+
rubygems_version: 2.5.2
|
226
225
|
signing_key:
|
227
226
|
specification_version: 4
|
228
227
|
summary: Lazar framework
|
229
|
-
test_files:
|
228
|
+
test_files:
|
229
|
+
- test/all.rb
|
230
|
+
- test/compound.rb
|
231
|
+
- test/data/EPAFHM.csv
|
232
|
+
- test/data/EPAFHM.medi.csv
|
233
|
+
- test/data/EPAFHM.medi_log10.csv
|
234
|
+
- test/data/EPAFHM.mini.csv
|
235
|
+
- test/data/EPAFHM.mini_log10.csv
|
236
|
+
- test/data/EPAFHM_log10.csv
|
237
|
+
- test/data/ISSCAN-multi.csv
|
238
|
+
- test/data/LOAEL_mmol_corrected_smiles.csv
|
239
|
+
- test/data/acetaldehyde.sdf
|
240
|
+
- test/data/batch_prediction.csv
|
241
|
+
- test/data/batch_prediction_inchi_small.csv
|
242
|
+
- test/data/batch_prediction_smiles_small.csv
|
243
|
+
- test/data/hamster_carcinogenicity.csv
|
244
|
+
- test/data/hamster_carcinogenicity.json
|
245
|
+
- test/data/hamster_carcinogenicity.mini.bool_float.csv
|
246
|
+
- test/data/hamster_carcinogenicity.mini.bool_int.csv
|
247
|
+
- test/data/hamster_carcinogenicity.mini.bool_string.csv
|
248
|
+
- test/data/hamster_carcinogenicity.mini.csv
|
249
|
+
- test/data/hamster_carcinogenicity_with_errors.csv
|
250
|
+
- test/data/kazius.csv
|
251
|
+
- test/data/loael.csv
|
252
|
+
- test/data/loael_log10.csv
|
253
|
+
- test/data/multi_cell_call.csv
|
254
|
+
- test/data/multi_cell_call_no_dup.csv
|
255
|
+
- test/data/multicolumn.csv
|
256
|
+
- test/data/rat_feature_dataset.csv
|
257
|
+
- test/data/wrong_dataset.csv
|
258
|
+
- test/dataset.rb
|
259
|
+
- test/default_environment.rb
|
260
|
+
- test/descriptor.rb
|
261
|
+
- test/error.rb
|
262
|
+
- test/experiment.rb
|
263
|
+
- test/feature.rb
|
264
|
+
- test/gridfs.rb
|
265
|
+
- test/model-classification.rb
|
266
|
+
- test/model-nanoparticle.rb
|
267
|
+
- test/model-regression.rb
|
268
|
+
- test/model-validation.rb
|
269
|
+
- test/nanomaterial-model-validation.rb
|
270
|
+
- test/setup.rb
|
271
|
+
- test/test_environment.rb
|
272
|
+
- test/validation-classification.rb
|
273
|
+
- test/validation-nanoparticle.rb
|
274
|
+
- test/validation-regression.rb
|
data/lib/experiment.rb
DELETED
@@ -1,99 +0,0 @@
|
|
1
|
-
module OpenTox
|
2
|
-
|
3
|
-
class Experiment
|
4
|
-
field :dataset_ids, type: Array
|
5
|
-
field :model_settings, type: Array, default: []
|
6
|
-
field :results, type: Hash, default: {}
|
7
|
-
|
8
|
-
def run
|
9
|
-
dataset_ids.each do |dataset_id|
|
10
|
-
dataset = Dataset.find(dataset_id)
|
11
|
-
results[dataset_id.to_s] = []
|
12
|
-
model_settings.each do |setting|
|
13
|
-
setting = setting.dup
|
14
|
-
model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
|
15
|
-
model = Object.const_get(model_algorithm).create dataset, setting
|
16
|
-
$logger.debug model
|
17
|
-
model.save
|
18
|
-
repeated_crossvalidation = RepeatedCrossValidation.create model
|
19
|
-
results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
|
20
|
-
end
|
21
|
-
end
|
22
|
-
save
|
23
|
-
end
|
24
|
-
|
25
|
-
def report
|
26
|
-
# statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
|
27
|
-
report = {}
|
28
|
-
report[:name] = name
|
29
|
-
report[:experiment_id] = self.id.to_s
|
30
|
-
report[:results] = {}
|
31
|
-
parameters = []
|
32
|
-
dataset_ids.each do |dataset_id|
|
33
|
-
dataset_name = Dataset.find(dataset_id).name
|
34
|
-
report[:results][dataset_name] = {}
|
35
|
-
report[:results][dataset_name][:anova] = {}
|
36
|
-
report[:results][dataset_name][:data] = []
|
37
|
-
# TODO results[dataset_id.to_s] does not exist
|
38
|
-
results[dataset_id.to_s].each do |result|
|
39
|
-
model = Model::Lazar.find(result[:model_id])
|
40
|
-
repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
|
41
|
-
crossvalidations = repeated_cv.crossvalidations
|
42
|
-
if crossvalidations.first.is_a? ClassificationCrossValidation
|
43
|
-
parameters = [:accuracy,:true_rate,:predictivity]
|
44
|
-
elsif crossvalidations.first.is_a? RegressionCrossValidation
|
45
|
-
parameters = [:rmse,:mae,:r_squared]
|
46
|
-
end
|
47
|
-
summary = {}
|
48
|
-
[:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
|
49
|
-
summary[key] = model[key]
|
50
|
-
end
|
51
|
-
summary[:nr_instances] = crossvalidations.first.nr_instances
|
52
|
-
summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
|
53
|
-
summary[:time] = crossvalidations.collect{|cv| cv.time}
|
54
|
-
parameters.each do |param|
|
55
|
-
summary[param] = crossvalidations.collect{|cv| cv.send(param)}
|
56
|
-
end
|
57
|
-
report[:results][dataset_name][:data] << summary
|
58
|
-
end
|
59
|
-
end
|
60
|
-
report[:results].each do |dataset,results|
|
61
|
-
([:time,:nr_unpredicted]+parameters).each do |param|
|
62
|
-
experiments = []
|
63
|
-
outcome = []
|
64
|
-
results[:data].each_with_index do |result,i|
|
65
|
-
result[param].each do |p|
|
66
|
-
experiments << i
|
67
|
-
p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
|
68
|
-
outcome << p
|
69
|
-
end
|
70
|
-
end
|
71
|
-
begin
|
72
|
-
R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
|
73
|
-
R.eval "experiment_nr = factor(experiment_nr)"
|
74
|
-
R.assign "outcome", outcome
|
75
|
-
R.eval "data = data.frame(experiment_nr,outcome)"
|
76
|
-
# one-way ANOVA
|
77
|
-
R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
|
78
|
-
# http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
|
79
|
-
p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
|
80
|
-
# aequivalent
|
81
|
-
# sum = R.eval("summary(fit)")
|
82
|
-
#p_value = sum.to_ruby.first.last.first
|
83
|
-
rescue
|
84
|
-
p_value = nil
|
85
|
-
end
|
86
|
-
report[:results][dataset][:anova][param] = p_value
|
87
|
-
=begin
|
88
|
-
=end
|
89
|
-
end
|
90
|
-
end
|
91
|
-
report
|
92
|
-
end
|
93
|
-
|
94
|
-
def summary
|
95
|
-
report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
end
|