lazar 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +64 -1
- data/VERSION +1 -1
- data/lib/algorithm.rb +1 -0
- data/lib/caret.rb +11 -2
- data/lib/classification.rb +6 -1
- data/lib/compound.rb +32 -23
- data/lib/crossvalidation.rb +22 -0
- data/lib/dataset.rb +30 -3
- data/lib/feature.rb +7 -0
- data/lib/feature_selection.rb +4 -1
- data/lib/import.rb +5 -1
- data/lib/leave-one-out-validation.rb +6 -0
- data/lib/model.rb +77 -3
- data/lib/nanoparticle.rb +19 -0
- data/lib/overwrite.rb +46 -11
- data/lib/physchem.rb +23 -5
- data/lib/regression.rb +5 -0
- data/lib/rest-client-wrapper.rb +1 -0
- data/lib/similarity.rb +22 -2
- data/lib/substance.rb +1 -0
- data/lib/train-test-validation.rb +12 -0
- data/lib/validation-statistics.rb +19 -0
- data/lib/validation.rb +3 -0
- data/test/feature.rb +2 -2
- data/test/model-nanoparticle.rb +7 -0
- data/test/nanomaterial-model-validation.rb +2 -3
- data/test/setup.rb +1 -5
- data/test/validation-regression.rb +2 -3
- metadata +50 -5
- data/lib/experiment.rb +0 -99
@@ -1,7 +1,10 @@
|
|
1
1
|
module OpenTox
|
2
2
|
module Validation
|
3
|
+
# Statistical evaluation of classification validations
|
3
4
|
module ClassificationStatistics
|
4
5
|
|
6
|
+
# Get statistics
|
7
|
+
# @return [Hash]
|
5
8
|
def statistics
|
6
9
|
self.accept_values = model.prediction_feature.accept_values
|
7
10
|
self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
|
@@ -63,6 +66,9 @@ module OpenTox
|
|
63
66
|
}
|
64
67
|
end
|
65
68
|
|
69
|
+
# Plot accuracy vs prediction probability
|
70
|
+
# @param [String,nil] format
|
71
|
+
# @return [Blob]
|
66
72
|
def probability_plot format: "pdf"
|
67
73
|
#unless probability_plot_id
|
68
74
|
|
@@ -99,8 +105,11 @@ module OpenTox
|
|
99
105
|
end
|
100
106
|
end
|
101
107
|
|
108
|
+
# Statistical evaluation of regression validations
|
102
109
|
module RegressionStatistics
|
103
110
|
|
111
|
+
# Get statistics
|
112
|
+
# @return [Hash]
|
104
113
|
def statistics
|
105
114
|
self.rmse = 0
|
106
115
|
self.mae = 0
|
@@ -147,10 +156,15 @@ module OpenTox
|
|
147
156
|
}
|
148
157
|
end
|
149
158
|
|
159
|
+
# Get percentage of measurements within the prediction interval
|
160
|
+
# @return [Float]
|
150
161
|
def percent_within_prediction_interval
|
151
162
|
100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
|
152
163
|
end
|
153
164
|
|
165
|
+
# Plot predicted vs measured values
|
166
|
+
# @param [String,nil] format
|
167
|
+
# @return [Blob]
|
154
168
|
def correlation_plot format: "png"
|
155
169
|
unless correlation_plot_id
|
156
170
|
tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
|
@@ -177,6 +191,11 @@ module OpenTox
|
|
177
191
|
$gridfs.find_one(_id: correlation_plot_id).data
|
178
192
|
end
|
179
193
|
|
194
|
+
# Get predictions with the largest difference between predicted and measured values
|
195
|
+
# @param [Fixnum] n number of predictions
|
196
|
+
# @param [TrueClass,FalseClass,nil] show_neigbors include neighbors
|
197
|
+
# @param [TrueClass,FalseClass,nil] show_common_descriptors show common descriptors
|
198
|
+
# @return [Hash]
|
180
199
|
def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
|
181
200
|
worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
|
182
201
|
worst_predictions.collect do |p|
|
data/lib/validation.rb
CHANGED
@@ -2,6 +2,7 @@ module OpenTox
|
|
2
2
|
|
3
3
|
module Validation
|
4
4
|
|
5
|
+
# Base validation class
|
5
6
|
class Validation
|
6
7
|
include OpenTox
|
7
8
|
include Mongoid::Document
|
@@ -14,6 +15,8 @@ module OpenTox
|
|
14
15
|
field :predictions, type: Hash, default: {}
|
15
16
|
field :finished_at, type: Time
|
16
17
|
|
18
|
+
# Get model
|
19
|
+
# @return [OpenTox::Model::Lazar]
|
17
20
|
def model
|
18
21
|
Model::Lazar.find model_id
|
19
22
|
end
|
data/test/feature.rb
CHANGED
@@ -55,7 +55,7 @@ class FeatureTest < MiniTest::Test
|
|
55
55
|
end
|
56
56
|
|
57
57
|
def test_physchem_description
|
58
|
-
assert_equal
|
58
|
+
assert_equal 346, PhysChem.descriptors.size
|
59
59
|
assert_equal 15, PhysChem.openbabel_descriptors.size
|
60
60
|
assert_equal 295, PhysChem.cdk_descriptors.size
|
61
61
|
assert_equal 45, PhysChem.joelib_descriptors.size
|
@@ -63,7 +63,7 @@ class FeatureTest < MiniTest::Test
|
|
63
63
|
end
|
64
64
|
|
65
65
|
def test_physchem
|
66
|
-
assert_equal
|
66
|
+
assert_equal 346, PhysChem.descriptors.size
|
67
67
|
c = Compound.from_smiles "CC(=O)CC(C)C"
|
68
68
|
logP = PhysChem.find_or_create_by :name => "Openbabel.logP"
|
69
69
|
assert_equal 1.6215, logP.calculate(c)
|
data/test/model-nanoparticle.rb
CHANGED
@@ -8,6 +8,13 @@ class NanoparticleModelTest < MiniTest::Test
|
|
8
8
|
@prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
|
9
9
|
end
|
10
10
|
|
11
|
+
def test_core_coating_source_uris
|
12
|
+
@training_dataset.nanoparticles.each do |np|
|
13
|
+
refute_nil np.core.source
|
14
|
+
np.coating.each{|c| refute_nil c.source}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
11
18
|
def test_nanoparticle_model
|
12
19
|
assert true, @prediction_feature.measured
|
13
20
|
model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
|
@@ -8,7 +8,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def test_default_nanomaterial_validation_model
|
11
|
-
validation_model = Model::
|
11
|
+
validation_model = Model::Validation.from_enanomapper
|
12
12
|
[:endpoint,:species,:source].each do |p|
|
13
13
|
refute_empty validation_model[p]
|
14
14
|
end
|
@@ -39,7 +39,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
|
|
39
39
|
:prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
|
40
40
|
:feature_selection => nil
|
41
41
|
}
|
42
|
-
validation_model = Model::
|
42
|
+
validation_model = Model::Validation.from_enanomapper algorithms: algorithms
|
43
43
|
assert validation_model.regression?
|
44
44
|
refute validation_model.classification?
|
45
45
|
validation_model.crossvalidations.each do |cv|
|
@@ -50,6 +50,5 @@ class NanomaterialValidationModelTest < MiniTest::Test
|
|
50
50
|
assert_includes nanoparticle.dataset_ids, @training_dataset.id
|
51
51
|
prediction = validation_model.predict nanoparticle
|
52
52
|
refute_nil prediction[:value]
|
53
|
-
assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
|
54
53
|
end
|
55
54
|
end
|
data/test/setup.rb
CHANGED
@@ -6,8 +6,4 @@ include OpenTox
|
|
6
6
|
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
|
7
7
|
DATA_DIR ||= File.join(TEST_DIR,"data")
|
8
8
|
training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
|
9
|
-
unless training_dataset
|
10
|
-
Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
|
11
|
-
end
|
12
|
-
#$mongo.database.drop
|
13
|
-
#$gridfs = $mongo.database.fs
|
9
|
+
Import::Enanomapper.import unless training_dataset
|
@@ -83,10 +83,9 @@ class ValidationRegressionTest < MiniTest::Test
|
|
83
83
|
model = Model::Lazar.create training_dataset: dataset
|
84
84
|
repeated_cv = RepeatedCrossValidation.create model
|
85
85
|
repeated_cv.crossvalidations.each do |cv|
|
86
|
-
|
87
|
-
|
86
|
+
assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
|
87
|
+
assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
|
88
88
|
end
|
89
|
-
File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot}
|
90
89
|
end
|
91
90
|
|
92
91
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lazar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler,
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2017-01-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -134,7 +134,6 @@ files:
|
|
134
134
|
- lib/crossvalidation.rb
|
135
135
|
- lib/dataset.rb
|
136
136
|
- lib/error.rb
|
137
|
-
- lib/experiment.rb
|
138
137
|
- lib/feature.rb
|
139
138
|
- lib/feature_selection.rb
|
140
139
|
- lib/import.rb
|
@@ -222,8 +221,54 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
222
221
|
version: '0'
|
223
222
|
requirements: []
|
224
223
|
rubyforge_project: lazar
|
225
|
-
rubygems_version: 2.5.
|
224
|
+
rubygems_version: 2.5.2
|
226
225
|
signing_key:
|
227
226
|
specification_version: 4
|
228
227
|
summary: Lazar framework
|
229
|
-
test_files:
|
228
|
+
test_files:
|
229
|
+
- test/all.rb
|
230
|
+
- test/compound.rb
|
231
|
+
- test/data/EPAFHM.csv
|
232
|
+
- test/data/EPAFHM.medi.csv
|
233
|
+
- test/data/EPAFHM.medi_log10.csv
|
234
|
+
- test/data/EPAFHM.mini.csv
|
235
|
+
- test/data/EPAFHM.mini_log10.csv
|
236
|
+
- test/data/EPAFHM_log10.csv
|
237
|
+
- test/data/ISSCAN-multi.csv
|
238
|
+
- test/data/LOAEL_mmol_corrected_smiles.csv
|
239
|
+
- test/data/acetaldehyde.sdf
|
240
|
+
- test/data/batch_prediction.csv
|
241
|
+
- test/data/batch_prediction_inchi_small.csv
|
242
|
+
- test/data/batch_prediction_smiles_small.csv
|
243
|
+
- test/data/hamster_carcinogenicity.csv
|
244
|
+
- test/data/hamster_carcinogenicity.json
|
245
|
+
- test/data/hamster_carcinogenicity.mini.bool_float.csv
|
246
|
+
- test/data/hamster_carcinogenicity.mini.bool_int.csv
|
247
|
+
- test/data/hamster_carcinogenicity.mini.bool_string.csv
|
248
|
+
- test/data/hamster_carcinogenicity.mini.csv
|
249
|
+
- test/data/hamster_carcinogenicity_with_errors.csv
|
250
|
+
- test/data/kazius.csv
|
251
|
+
- test/data/loael.csv
|
252
|
+
- test/data/loael_log10.csv
|
253
|
+
- test/data/multi_cell_call.csv
|
254
|
+
- test/data/multi_cell_call_no_dup.csv
|
255
|
+
- test/data/multicolumn.csv
|
256
|
+
- test/data/rat_feature_dataset.csv
|
257
|
+
- test/data/wrong_dataset.csv
|
258
|
+
- test/dataset.rb
|
259
|
+
- test/default_environment.rb
|
260
|
+
- test/descriptor.rb
|
261
|
+
- test/error.rb
|
262
|
+
- test/experiment.rb
|
263
|
+
- test/feature.rb
|
264
|
+
- test/gridfs.rb
|
265
|
+
- test/model-classification.rb
|
266
|
+
- test/model-nanoparticle.rb
|
267
|
+
- test/model-regression.rb
|
268
|
+
- test/model-validation.rb
|
269
|
+
- test/nanomaterial-model-validation.rb
|
270
|
+
- test/setup.rb
|
271
|
+
- test/test_environment.rb
|
272
|
+
- test/validation-classification.rb
|
273
|
+
- test/validation-nanoparticle.rb
|
274
|
+
- test/validation-regression.rb
|
data/lib/experiment.rb
DELETED
@@ -1,99 +0,0 @@
|
|
1
|
-
module OpenTox
|
2
|
-
|
3
|
-
class Experiment
|
4
|
-
field :dataset_ids, type: Array
|
5
|
-
field :model_settings, type: Array, default: []
|
6
|
-
field :results, type: Hash, default: {}
|
7
|
-
|
8
|
-
def run
|
9
|
-
dataset_ids.each do |dataset_id|
|
10
|
-
dataset = Dataset.find(dataset_id)
|
11
|
-
results[dataset_id.to_s] = []
|
12
|
-
model_settings.each do |setting|
|
13
|
-
setting = setting.dup
|
14
|
-
model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
|
15
|
-
model = Object.const_get(model_algorithm).create dataset, setting
|
16
|
-
$logger.debug model
|
17
|
-
model.save
|
18
|
-
repeated_crossvalidation = RepeatedCrossValidation.create model
|
19
|
-
results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
|
20
|
-
end
|
21
|
-
end
|
22
|
-
save
|
23
|
-
end
|
24
|
-
|
25
|
-
def report
|
26
|
-
# statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
|
27
|
-
report = {}
|
28
|
-
report[:name] = name
|
29
|
-
report[:experiment_id] = self.id.to_s
|
30
|
-
report[:results] = {}
|
31
|
-
parameters = []
|
32
|
-
dataset_ids.each do |dataset_id|
|
33
|
-
dataset_name = Dataset.find(dataset_id).name
|
34
|
-
report[:results][dataset_name] = {}
|
35
|
-
report[:results][dataset_name][:anova] = {}
|
36
|
-
report[:results][dataset_name][:data] = []
|
37
|
-
# TODO results[dataset_id.to_s] does not exist
|
38
|
-
results[dataset_id.to_s].each do |result|
|
39
|
-
model = Model::Lazar.find(result[:model_id])
|
40
|
-
repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
|
41
|
-
crossvalidations = repeated_cv.crossvalidations
|
42
|
-
if crossvalidations.first.is_a? ClassificationCrossValidation
|
43
|
-
parameters = [:accuracy,:true_rate,:predictivity]
|
44
|
-
elsif crossvalidations.first.is_a? RegressionCrossValidation
|
45
|
-
parameters = [:rmse,:mae,:r_squared]
|
46
|
-
end
|
47
|
-
summary = {}
|
48
|
-
[:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
|
49
|
-
summary[key] = model[key]
|
50
|
-
end
|
51
|
-
summary[:nr_instances] = crossvalidations.first.nr_instances
|
52
|
-
summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
|
53
|
-
summary[:time] = crossvalidations.collect{|cv| cv.time}
|
54
|
-
parameters.each do |param|
|
55
|
-
summary[param] = crossvalidations.collect{|cv| cv.send(param)}
|
56
|
-
end
|
57
|
-
report[:results][dataset_name][:data] << summary
|
58
|
-
end
|
59
|
-
end
|
60
|
-
report[:results].each do |dataset,results|
|
61
|
-
([:time,:nr_unpredicted]+parameters).each do |param|
|
62
|
-
experiments = []
|
63
|
-
outcome = []
|
64
|
-
results[:data].each_with_index do |result,i|
|
65
|
-
result[param].each do |p|
|
66
|
-
experiments << i
|
67
|
-
p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
|
68
|
-
outcome << p
|
69
|
-
end
|
70
|
-
end
|
71
|
-
begin
|
72
|
-
R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
|
73
|
-
R.eval "experiment_nr = factor(experiment_nr)"
|
74
|
-
R.assign "outcome", outcome
|
75
|
-
R.eval "data = data.frame(experiment_nr,outcome)"
|
76
|
-
# one-way ANOVA
|
77
|
-
R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
|
78
|
-
# http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
|
79
|
-
p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
|
80
|
-
# aequivalent
|
81
|
-
# sum = R.eval("summary(fit)")
|
82
|
-
#p_value = sum.to_ruby.first.last.first
|
83
|
-
rescue
|
84
|
-
p_value = nil
|
85
|
-
end
|
86
|
-
report[:results][dataset][:anova][param] = p_value
|
87
|
-
=begin
|
88
|
-
=end
|
89
|
-
end
|
90
|
-
end
|
91
|
-
report
|
92
|
-
end
|
93
|
-
|
94
|
-
def summary
|
95
|
-
report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
end
|