lazar 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,10 @@
1
1
  module OpenTox
2
2
  module Validation
3
+ # Statistical evaluation of classification validations
3
4
  module ClassificationStatistics
4
5
 
6
+ # Get statistics
7
+ # @return [Hash]
5
8
  def statistics
6
9
  self.accept_values = model.prediction_feature.accept_values
7
10
  self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
@@ -63,6 +66,9 @@ module OpenTox
63
66
  }
64
67
  end
65
68
 
69
+ # Plot accuracy vs prediction probability
70
+ # @param [String,nil] format
71
+ # @return [Blob]
66
72
  def probability_plot format: "pdf"
67
73
  #unless probability_plot_id
68
74
 
@@ -99,8 +105,11 @@ module OpenTox
99
105
  end
100
106
  end
101
107
 
108
+ # Statistical evaluation of regression validations
102
109
  module RegressionStatistics
103
110
 
111
+ # Get statistics
112
+ # @return [Hash]
104
113
  def statistics
105
114
  self.rmse = 0
106
115
  self.mae = 0
@@ -147,10 +156,15 @@ module OpenTox
147
156
  }
148
157
  end
149
158
 
159
+ # Get percentage of measurements within the prediction interval
160
+ # @return [Float]
150
161
  def percent_within_prediction_interval
151
162
  100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
152
163
  end
153
164
 
165
+ # Plot predicted vs measured values
166
+ # @param [String,nil] format
167
+ # @return [Blob]
154
168
  def correlation_plot format: "png"
155
169
  unless correlation_plot_id
156
170
  tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
@@ -177,6 +191,11 @@ module OpenTox
177
191
  $gridfs.find_one(_id: correlation_plot_id).data
178
192
  end
179
193
 
194
+ # Get predictions with the largest difference between predicted and measured values
195
+ # @param [Fixnum] n number of predictions
196
+ # @param [TrueClass,FalseClass,nil] show_neigbors include neighbors
197
+ # @param [TrueClass,FalseClass,nil] show_common_descriptors show common descriptors
198
+ # @return [Hash]
180
199
  def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
181
200
  worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
182
201
  worst_predictions.collect do |p|
data/lib/validation.rb CHANGED
@@ -2,6 +2,7 @@ module OpenTox
2
2
 
3
3
  module Validation
4
4
 
5
+ # Base validation class
5
6
  class Validation
6
7
  include OpenTox
7
8
  include Mongoid::Document
@@ -14,6 +15,8 @@ module OpenTox
14
15
  field :predictions, type: Hash, default: {}
15
16
  field :finished_at, type: Time
16
17
 
18
+ # Get model
19
+ # @return [OpenTox::Model::Lazar]
17
20
  def model
18
21
  Model::Lazar.find model_id
19
22
  end
data/test/feature.rb CHANGED
@@ -55,7 +55,7 @@ class FeatureTest < MiniTest::Test
55
55
  end
56
56
 
57
57
  def test_physchem_description
58
- assert_equal 355, PhysChem.descriptors.size
58
+ assert_equal 346, PhysChem.descriptors.size
59
59
  assert_equal 15, PhysChem.openbabel_descriptors.size
60
60
  assert_equal 295, PhysChem.cdk_descriptors.size
61
61
  assert_equal 45, PhysChem.joelib_descriptors.size
@@ -63,7 +63,7 @@ class FeatureTest < MiniTest::Test
63
63
  end
64
64
 
65
65
  def test_physchem
66
- assert_equal 355, PhysChem.descriptors.size
66
+ assert_equal 346, PhysChem.descriptors.size
67
67
  c = Compound.from_smiles "CC(=O)CC(C)C"
68
68
  logP = PhysChem.find_or_create_by :name => "Openbabel.logP"
69
69
  assert_equal 1.6215, logP.calculate(c)
@@ -8,6 +8,13 @@ class NanoparticleModelTest < MiniTest::Test
8
8
  @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
9
9
  end
10
10
 
11
+ def test_core_coating_source_uris
12
+ @training_dataset.nanoparticles.each do |np|
13
+ refute_nil np.core.source
14
+ np.coating.each{|c| refute_nil c.source}
15
+ end
16
+ end
17
+
11
18
  def test_nanoparticle_model
12
19
  assert true, @prediction_feature.measured
13
20
  model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
@@ -8,7 +8,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
8
8
  end
9
9
 
10
10
  def test_default_nanomaterial_validation_model
11
- validation_model = Model::NanoValidation.create
11
+ validation_model = Model::Validation.from_enanomapper
12
12
  [:endpoint,:species,:source].each do |p|
13
13
  refute_empty validation_model[p]
14
14
  end
@@ -39,7 +39,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
39
39
  :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
40
40
  :feature_selection => nil
41
41
  }
42
- validation_model = Model::NanoValidation.create algorithms: algorithms
42
+ validation_model = Model::Validation.from_enanomapper algorithms: algorithms
43
43
  assert validation_model.regression?
44
44
  refute validation_model.classification?
45
45
  validation_model.crossvalidations.each do |cv|
@@ -50,6 +50,5 @@ class NanomaterialValidationModelTest < MiniTest::Test
50
50
  assert_includes nanoparticle.dataset_ids, @training_dataset.id
51
51
  prediction = validation_model.predict nanoparticle
52
52
  refute_nil prediction[:value]
53
- assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
54
53
  end
55
54
  end
data/test/setup.rb CHANGED
@@ -6,8 +6,4 @@ include OpenTox
6
6
  TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
7
7
  DATA_DIR ||= File.join(TEST_DIR,"data")
8
8
  training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
9
- unless training_dataset
10
- Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
11
- end
12
- #$mongo.database.drop
13
- #$gridfs = $mongo.database.fs
9
+ Import::Enanomapper.import unless training_dataset
@@ -83,10 +83,9 @@ class ValidationRegressionTest < MiniTest::Test
83
83
  model = Model::Lazar.create training_dataset: dataset
84
84
  repeated_cv = RepeatedCrossValidation.create model
85
85
  repeated_cv.crossvalidations.each do |cv|
86
- #assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
87
- #assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
86
+ assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
87
+ assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
88
88
  end
89
- File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot}
90
89
  end
91
90
 
92
91
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lazar
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler,
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-12-21 00:00:00.000000000 Z
12
+ date: 2017-01-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -134,7 +134,6 @@ files:
134
134
  - lib/crossvalidation.rb
135
135
  - lib/dataset.rb
136
136
  - lib/error.rb
137
- - lib/experiment.rb
138
137
  - lib/feature.rb
139
138
  - lib/feature_selection.rb
140
139
  - lib/import.rb
@@ -222,8 +221,54 @@ required_rubygems_version: !ruby/object:Gem::Requirement
222
221
  version: '0'
223
222
  requirements: []
224
223
  rubyforge_project: lazar
225
- rubygems_version: 2.5.1
224
+ rubygems_version: 2.5.2
226
225
  signing_key:
227
226
  specification_version: 4
228
227
  summary: Lazar framework
229
- test_files: []
228
+ test_files:
229
+ - test/all.rb
230
+ - test/compound.rb
231
+ - test/data/EPAFHM.csv
232
+ - test/data/EPAFHM.medi.csv
233
+ - test/data/EPAFHM.medi_log10.csv
234
+ - test/data/EPAFHM.mini.csv
235
+ - test/data/EPAFHM.mini_log10.csv
236
+ - test/data/EPAFHM_log10.csv
237
+ - test/data/ISSCAN-multi.csv
238
+ - test/data/LOAEL_mmol_corrected_smiles.csv
239
+ - test/data/acetaldehyde.sdf
240
+ - test/data/batch_prediction.csv
241
+ - test/data/batch_prediction_inchi_small.csv
242
+ - test/data/batch_prediction_smiles_small.csv
243
+ - test/data/hamster_carcinogenicity.csv
244
+ - test/data/hamster_carcinogenicity.json
245
+ - test/data/hamster_carcinogenicity.mini.bool_float.csv
246
+ - test/data/hamster_carcinogenicity.mini.bool_int.csv
247
+ - test/data/hamster_carcinogenicity.mini.bool_string.csv
248
+ - test/data/hamster_carcinogenicity.mini.csv
249
+ - test/data/hamster_carcinogenicity_with_errors.csv
250
+ - test/data/kazius.csv
251
+ - test/data/loael.csv
252
+ - test/data/loael_log10.csv
253
+ - test/data/multi_cell_call.csv
254
+ - test/data/multi_cell_call_no_dup.csv
255
+ - test/data/multicolumn.csv
256
+ - test/data/rat_feature_dataset.csv
257
+ - test/data/wrong_dataset.csv
258
+ - test/dataset.rb
259
+ - test/default_environment.rb
260
+ - test/descriptor.rb
261
+ - test/error.rb
262
+ - test/experiment.rb
263
+ - test/feature.rb
264
+ - test/gridfs.rb
265
+ - test/model-classification.rb
266
+ - test/model-nanoparticle.rb
267
+ - test/model-regression.rb
268
+ - test/model-validation.rb
269
+ - test/nanomaterial-model-validation.rb
270
+ - test/setup.rb
271
+ - test/test_environment.rb
272
+ - test/validation-classification.rb
273
+ - test/validation-nanoparticle.rb
274
+ - test/validation-regression.rb
data/lib/experiment.rb DELETED
@@ -1,99 +0,0 @@
1
- module OpenTox
2
-
3
- class Experiment
4
- field :dataset_ids, type: Array
5
- field :model_settings, type: Array, default: []
6
- field :results, type: Hash, default: {}
7
-
8
- def run
9
- dataset_ids.each do |dataset_id|
10
- dataset = Dataset.find(dataset_id)
11
- results[dataset_id.to_s] = []
12
- model_settings.each do |setting|
13
- setting = setting.dup
14
- model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
15
- model = Object.const_get(model_algorithm).create dataset, setting
16
- $logger.debug model
17
- model.save
18
- repeated_crossvalidation = RepeatedCrossValidation.create model
19
- results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
20
- end
21
- end
22
- save
23
- end
24
-
25
- def report
26
- # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
27
- report = {}
28
- report[:name] = name
29
- report[:experiment_id] = self.id.to_s
30
- report[:results] = {}
31
- parameters = []
32
- dataset_ids.each do |dataset_id|
33
- dataset_name = Dataset.find(dataset_id).name
34
- report[:results][dataset_name] = {}
35
- report[:results][dataset_name][:anova] = {}
36
- report[:results][dataset_name][:data] = []
37
- # TODO results[dataset_id.to_s] does not exist
38
- results[dataset_id.to_s].each do |result|
39
- model = Model::Lazar.find(result[:model_id])
40
- repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
41
- crossvalidations = repeated_cv.crossvalidations
42
- if crossvalidations.first.is_a? ClassificationCrossValidation
43
- parameters = [:accuracy,:true_rate,:predictivity]
44
- elsif crossvalidations.first.is_a? RegressionCrossValidation
45
- parameters = [:rmse,:mae,:r_squared]
46
- end
47
- summary = {}
48
- [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
49
- summary[key] = model[key]
50
- end
51
- summary[:nr_instances] = crossvalidations.first.nr_instances
52
- summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
53
- summary[:time] = crossvalidations.collect{|cv| cv.time}
54
- parameters.each do |param|
55
- summary[param] = crossvalidations.collect{|cv| cv.send(param)}
56
- end
57
- report[:results][dataset_name][:data] << summary
58
- end
59
- end
60
- report[:results].each do |dataset,results|
61
- ([:time,:nr_unpredicted]+parameters).each do |param|
62
- experiments = []
63
- outcome = []
64
- results[:data].each_with_index do |result,i|
65
- result[param].each do |p|
66
- experiments << i
67
- p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
68
- outcome << p
69
- end
70
- end
71
- begin
72
- R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
73
- R.eval "experiment_nr = factor(experiment_nr)"
74
- R.assign "outcome", outcome
75
- R.eval "data = data.frame(experiment_nr,outcome)"
76
- # one-way ANOVA
77
- R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
78
- # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
79
- p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
80
- # equivalent
81
- # sum = R.eval("summary(fit)")
82
- #p_value = sum.to_ruby.first.last.first
83
- rescue
84
- p_value = nil
85
- end
86
- report[:results][dataset][:anova][param] = p_value
87
- =begin
88
- =end
89
- end
90
- end
91
- report
92
- end
93
-
94
- def summary
95
- report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
96
- end
97
- end
98
-
99
- end