lazar 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,10 @@
1
1
  module OpenTox
2
2
  module Validation
3
+ # Statistical evaluation of classification validations
3
4
  module ClassificationStatistics
4
5
 
6
+ # Get statistics
7
+ # @return [Hash]
5
8
  def statistics
6
9
  self.accept_values = model.prediction_feature.accept_values
7
10
  self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
@@ -63,6 +66,9 @@ module OpenTox
63
66
  }
64
67
  end
65
68
 
69
+ # Plot accuracy vs prediction probability
70
+ # @param [String,nil] format
71
+ # @return [Blob]
66
72
  def probability_plot format: "pdf"
67
73
  #unless probability_plot_id
68
74
 
@@ -99,8 +105,11 @@ module OpenTox
99
105
  end
100
106
  end
101
107
 
108
+ # Statistical evaluation of regression validations
102
109
  module RegressionStatistics
103
110
 
111
+ # Get statistics
112
+ # @return [Hash]
104
113
  def statistics
105
114
  self.rmse = 0
106
115
  self.mae = 0
@@ -147,10 +156,15 @@ module OpenTox
147
156
  }
148
157
  end
149
158
 
159
+ # Get percentage of measurements within the prediction interval
160
+ # @return [Float]
150
161
  def percent_within_prediction_interval
151
162
  100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
152
163
  end
153
164
 
165
+ # Plot predicted vs measured values
166
+ # @param [String,nil] format
167
+ # @return [Blob]
154
168
  def correlation_plot format: "png"
155
169
  unless correlation_plot_id
156
170
  tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
@@ -177,6 +191,11 @@ module OpenTox
177
191
  $gridfs.find_one(_id: correlation_plot_id).data
178
192
  end
179
193
 
194
+ # Get predictions with the largest difference between predicted and measured values
195
+ # @param [Fixnum] n number of predictions
196
+ # @param [TrueClass,FalseClass,nil] show_neigbors include neighbors
197
+ # @param [TrueClass,FalseClass,nil] show_common_descriptors show common descriptors
198
+ # @return [Hash]
180
199
  def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
181
200
  worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
182
201
  worst_predictions.collect do |p|
data/lib/validation.rb CHANGED
@@ -2,6 +2,7 @@ module OpenTox
2
2
 
3
3
  module Validation
4
4
 
5
+ # Base validation class
5
6
  class Validation
6
7
  include OpenTox
7
8
  include Mongoid::Document
@@ -14,6 +15,8 @@ module OpenTox
14
15
  field :predictions, type: Hash, default: {}
15
16
  field :finished_at, type: Time
16
17
 
18
+ # Get model
19
+ # @return [OpenTox::Model::Lazar]
17
20
  def model
18
21
  Model::Lazar.find model_id
19
22
  end
data/test/feature.rb CHANGED
@@ -55,7 +55,7 @@ class FeatureTest < MiniTest::Test
55
55
  end
56
56
 
57
57
  def test_physchem_description
58
- assert_equal 355, PhysChem.descriptors.size
58
+ assert_equal 346, PhysChem.descriptors.size
59
59
  assert_equal 15, PhysChem.openbabel_descriptors.size
60
60
  assert_equal 295, PhysChem.cdk_descriptors.size
61
61
  assert_equal 45, PhysChem.joelib_descriptors.size
@@ -63,7 +63,7 @@ class FeatureTest < MiniTest::Test
63
63
  end
64
64
 
65
65
  def test_physchem
66
- assert_equal 355, PhysChem.descriptors.size
66
+ assert_equal 346, PhysChem.descriptors.size
67
67
  c = Compound.from_smiles "CC(=O)CC(C)C"
68
68
  logP = PhysChem.find_or_create_by :name => "Openbabel.logP"
69
69
  assert_equal 1.6215, logP.calculate(c)
@@ -8,6 +8,13 @@ class NanoparticleModelTest < MiniTest::Test
8
8
  @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
9
9
  end
10
10
 
11
+ def test_core_coating_source_uris
12
+ @training_dataset.nanoparticles.each do |np|
13
+ refute_nil np.core.source
14
+ np.coating.each{|c| refute_nil c.source}
15
+ end
16
+ end
17
+
11
18
  def test_nanoparticle_model
12
19
  assert true, @prediction_feature.measured
13
20
  model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
@@ -8,7 +8,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
8
8
  end
9
9
 
10
10
  def test_default_nanomaterial_validation_model
11
- validation_model = Model::NanoValidation.create
11
+ validation_model = Model::Validation.from_enanomapper
12
12
  [:endpoint,:species,:source].each do |p|
13
13
  refute_empty validation_model[p]
14
14
  end
@@ -39,7 +39,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
39
39
  :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
40
40
  :feature_selection => nil
41
41
  }
42
- validation_model = Model::NanoValidation.create algorithms: algorithms
42
+ validation_model = Model::Validation.from_enanomapper algorithms: algorithms
43
43
  assert validation_model.regression?
44
44
  refute validation_model.classification?
45
45
  validation_model.crossvalidations.each do |cv|
@@ -50,6 +50,5 @@ class NanomaterialValidationModelTest < MiniTest::Test
50
50
  assert_includes nanoparticle.dataset_ids, @training_dataset.id
51
51
  prediction = validation_model.predict nanoparticle
52
52
  refute_nil prediction[:value]
53
- assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
54
53
  end
55
54
  end
data/test/setup.rb CHANGED
@@ -6,8 +6,4 @@ include OpenTox
6
6
  TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
7
7
  DATA_DIR ||= File.join(TEST_DIR,"data")
8
8
  training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
9
- unless training_dataset
10
- Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
11
- end
12
- #$mongo.database.drop
13
- #$gridfs = $mongo.database.fs
9
+ Import::Enanomapper.import unless training_dataset
@@ -83,10 +83,9 @@ class ValidationRegressionTest < MiniTest::Test
83
83
  model = Model::Lazar.create training_dataset: dataset
84
84
  repeated_cv = RepeatedCrossValidation.create model
85
85
  repeated_cv.crossvalidations.each do |cv|
86
- #assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
87
- #assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
86
+ assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
87
+ assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
88
88
  end
89
- File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot}
90
89
  end
91
90
 
92
91
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lazar
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler,
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-12-21 00:00:00.000000000 Z
12
+ date: 2017-01-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -134,7 +134,6 @@ files:
134
134
  - lib/crossvalidation.rb
135
135
  - lib/dataset.rb
136
136
  - lib/error.rb
137
- - lib/experiment.rb
138
137
  - lib/feature.rb
139
138
  - lib/feature_selection.rb
140
139
  - lib/import.rb
@@ -222,8 +221,54 @@ required_rubygems_version: !ruby/object:Gem::Requirement
222
221
  version: '0'
223
222
  requirements: []
224
223
  rubyforge_project: lazar
225
- rubygems_version: 2.5.1
224
+ rubygems_version: 2.5.2
226
225
  signing_key:
227
226
  specification_version: 4
228
227
  summary: Lazar framework
229
- test_files: []
228
+ test_files:
229
+ - test/all.rb
230
+ - test/compound.rb
231
+ - test/data/EPAFHM.csv
232
+ - test/data/EPAFHM.medi.csv
233
+ - test/data/EPAFHM.medi_log10.csv
234
+ - test/data/EPAFHM.mini.csv
235
+ - test/data/EPAFHM.mini_log10.csv
236
+ - test/data/EPAFHM_log10.csv
237
+ - test/data/ISSCAN-multi.csv
238
+ - test/data/LOAEL_mmol_corrected_smiles.csv
239
+ - test/data/acetaldehyde.sdf
240
+ - test/data/batch_prediction.csv
241
+ - test/data/batch_prediction_inchi_small.csv
242
+ - test/data/batch_prediction_smiles_small.csv
243
+ - test/data/hamster_carcinogenicity.csv
244
+ - test/data/hamster_carcinogenicity.json
245
+ - test/data/hamster_carcinogenicity.mini.bool_float.csv
246
+ - test/data/hamster_carcinogenicity.mini.bool_int.csv
247
+ - test/data/hamster_carcinogenicity.mini.bool_string.csv
248
+ - test/data/hamster_carcinogenicity.mini.csv
249
+ - test/data/hamster_carcinogenicity_with_errors.csv
250
+ - test/data/kazius.csv
251
+ - test/data/loael.csv
252
+ - test/data/loael_log10.csv
253
+ - test/data/multi_cell_call.csv
254
+ - test/data/multi_cell_call_no_dup.csv
255
+ - test/data/multicolumn.csv
256
+ - test/data/rat_feature_dataset.csv
257
+ - test/data/wrong_dataset.csv
258
+ - test/dataset.rb
259
+ - test/default_environment.rb
260
+ - test/descriptor.rb
261
+ - test/error.rb
262
+ - test/experiment.rb
263
+ - test/feature.rb
264
+ - test/gridfs.rb
265
+ - test/model-classification.rb
266
+ - test/model-nanoparticle.rb
267
+ - test/model-regression.rb
268
+ - test/model-validation.rb
269
+ - test/nanomaterial-model-validation.rb
270
+ - test/setup.rb
271
+ - test/test_environment.rb
272
+ - test/validation-classification.rb
273
+ - test/validation-nanoparticle.rb
274
+ - test/validation-regression.rb
data/lib/experiment.rb DELETED
@@ -1,99 +0,0 @@
1
- module OpenTox
2
-
3
- class Experiment
4
- field :dataset_ids, type: Array
5
- field :model_settings, type: Array, default: []
6
- field :results, type: Hash, default: {}
7
-
8
- def run
9
- dataset_ids.each do |dataset_id|
10
- dataset = Dataset.find(dataset_id)
11
- results[dataset_id.to_s] = []
12
- model_settings.each do |setting|
13
- setting = setting.dup
14
- model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
15
- model = Object.const_get(model_algorithm).create dataset, setting
16
- $logger.debug model
17
- model.save
18
- repeated_crossvalidation = RepeatedCrossValidation.create model
19
- results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
20
- end
21
- end
22
- save
23
- end
24
-
25
- def report
26
- # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
27
- report = {}
28
- report[:name] = name
29
- report[:experiment_id] = self.id.to_s
30
- report[:results] = {}
31
- parameters = []
32
- dataset_ids.each do |dataset_id|
33
- dataset_name = Dataset.find(dataset_id).name
34
- report[:results][dataset_name] = {}
35
- report[:results][dataset_name][:anova] = {}
36
- report[:results][dataset_name][:data] = []
37
- # TODO results[dataset_id.to_s] does not exist
38
- results[dataset_id.to_s].each do |result|
39
- model = Model::Lazar.find(result[:model_id])
40
- repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
41
- crossvalidations = repeated_cv.crossvalidations
42
- if crossvalidations.first.is_a? ClassificationCrossValidation
43
- parameters = [:accuracy,:true_rate,:predictivity]
44
- elsif crossvalidations.first.is_a? RegressionCrossValidation
45
- parameters = [:rmse,:mae,:r_squared]
46
- end
47
- summary = {}
48
- [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
49
- summary[key] = model[key]
50
- end
51
- summary[:nr_instances] = crossvalidations.first.nr_instances
52
- summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
53
- summary[:time] = crossvalidations.collect{|cv| cv.time}
54
- parameters.each do |param|
55
- summary[param] = crossvalidations.collect{|cv| cv.send(param)}
56
- end
57
- report[:results][dataset_name][:data] << summary
58
- end
59
- end
60
- report[:results].each do |dataset,results|
61
- ([:time,:nr_unpredicted]+parameters).each do |param|
62
- experiments = []
63
- outcome = []
64
- results[:data].each_with_index do |result,i|
65
- result[param].each do |p|
66
- experiments << i
67
- p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
68
- outcome << p
69
- end
70
- end
71
- begin
72
- R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
73
- R.eval "experiment_nr = factor(experiment_nr)"
74
- R.assign "outcome", outcome
75
- R.eval "data = data.frame(experiment_nr,outcome)"
76
- # one-way ANOVA
77
- R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
78
- # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
79
- p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
80
- # equivalent
81
- # sum = R.eval("summary(fit)")
82
- #p_value = sum.to_ruby.first.last.first
83
- rescue
84
- p_value = nil
85
- end
86
- report[:results][dataset][:anova][param] = p_value
87
- =begin
88
- =end
89
- end
90
- end
91
- report
92
- end
93
-
94
- def summary
95
- report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
96
- end
97
- end
98
-
99
- end