lazar 0.9.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
@@ -0,0 +1,106 @@
1
+ require_relative "setup.rb"
2
+
3
# Tests for classification models created with Model::Lazar.create from CSV
# training datasets: default algorithms, user supplied algorithm parameters
# and a model trained on kazius.csv.
class LazarClassificationTest < MiniTest::Test

  # A model created without an algorithms argument has to report exactly the
  # algorithm settings listed below and reproduce the expected predictions for
  # single substances and for a whole dataset.
  def test_classification_default
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :prediction => {
        :method => "Algorithm::Classification.weighted_majority_vote",
      },
      :feature_selection => nil,
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
    model = Model::Lazar.create training_dataset: training_dataset
    assert_kind_of Model::LazarClassification, model
    assert_equal algorithms, model.algorithms
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_equal "false", prediction[:value]
    [ {
      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
      :prediction => "false",
    },{
      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
      :prediction => "false",
    } ].each do |example|
      prediction = model.predict example[:compound]
      assert_equal example[:prediction], prediction[:value]
    end

    compound = Compound.from_smiles "CCO"
    prediction = model.predict compound
    assert_equal "true", prediction[:value]
    assert_equal ["false"], prediction[:measurements]

    # make a dataset prediction
    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
    prediction_dataset = model.predict compound_dataset
    assert_equal compound_dataset.compounds, prediction_dataset.compounds

    no_neighbors_warning = "Could not find similar substances with experimental data in the training dataset."
    cid = prediction_dataset.compounds[7].id.to_s
    assert_equal no_neighbors_warning, prediction_dataset.predictions[cid][:warning]
    # every prediction without a value has to carry the same warning
    # (the key is not needed here, so it does not shadow the outer cid)
    prediction_dataset.predictions.each do |_id,pred|
      assert_equal no_neighbors_warning, pred[:warning] if pred[:value].nil?
    end
    cid = Compound.from_smiles("CCOC(=O)N").id.to_s
    assert_match "excluded", prediction_dataset.predictions[cid][:warning]
    # cleanup
    [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
  end

  # User supplied descriptor type and similarity threshold are kept, while the
  # prediction and similarity methods that were not supplied are filled in.
  def test_classification_parameters
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MACCS"
      },
      :similarity => {
        :min => 0.4
      },
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarClassification, model
    assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_equal "false", prediction[:value]
    assert_equal 4, prediction[:neighbors].size
  end

  # Builds a default model from kazius.csv and predicts the same compound
  # twice; both predictions have to yield "1".
  def test_kazius
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
    model = Model::Lazar.create training_dataset: training_dataset
    2.times do
      compound = Compound.from_smiles("Clc1ccccc1NN")
      prediction = model.predict compound
      assert_equal "1", prediction[:value]
    end
    training_dataset.delete
  end

  # Placeholder, not implemented yet.
  def test_caret_classification
    skip
  end

  # Placeholder, not implemented yet.
  def test_fingerprint_chisq_feature_selection
    skip
  end

  # Placeholder, not implemented yet.
  def test_physchem_classification
    skip
  end
end
@@ -0,0 +1,128 @@
1
+ require_relative "setup.rb"
2
+
3
# Tests for nanoparticle models trained on the protein corona dataset that is
# looked up by name in #setup.
class NanoparticleModelTest < MiniTest::Test
  include OpenTox::Validation

  # Looks up the training dataset by name and picks its
  # 'log2(Net cell association)' feature as prediction feature.
  def setup
    @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
    @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
  end

  # Model created without an algorithms argument: expects Caret.rf predictions
  # with weighted cosine similarity.
  def test_nanoparticle_model
    assert_prediction_feature_measured
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
    refute_empty_model_variables model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method]
    assert_training_substances_predicted model
    model.delete
  end

  # MP2D fingerprint descriptors with tanimoto similarity and feature selection
  # explicitly switched off.
  def test_nanoparticle_fingerprint_model
    assert_prediction_feature_measured
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :feature_selection => nil
    }
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
    refute_empty_model_variables model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_nil model.algorithms[:descriptors][:categories]
    assert_training_substances_predicted model
    model.delete
  end

  # Same fingerprint settings, but without a :feature_selection entry; the
  # created model is expected to report a non-empty feature selection.
  def test_nanoparticle_fingerprint_model_with_feature_selection
    assert_prediction_feature_measured
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
    }
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
    refute_empty model.algorithms[:feature_selection]
    refute_empty_model_variables model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_training_substances_predicted model
    model.delete
  end

  # Skipped. Everything below the skip call is currently not executed.
  # NOTE(review): the expected method names below do not match the algorithms
  # hash passed to the model - confirm them before enabling this test.
  def test_nanoparticle_calculated_properties_model
    skip "Nanoparticle calculate_properties similarity not yet implemented"
    assert_prediction_feature_measured
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
      :prediction => {
        :method => "Algorithm::Regression.weighted_average",
      },
    }
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
    refute_empty_model_variables model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.weighted", model.algorithms[:similarity][:method]
    assert_training_substances_predicted model
    model.delete
  end

  # Skipped, the import call below is currently not executed.
  def test_import_ld
    skip # Ambit JSON-LD export defunct
    Import::Enanomapper.import_ld
  end

  private

  # Fails unless the prediction feature reports a truthy measured value.
  # (The former `assert true, value` passed the value as failure message and
  # therefore could never fail.)
  def assert_prediction_feature_measured
    assert @prediction_feature.measured, "prediction feature should be flagged as measured"
  end

  # The model has to expose dependent variables, descriptor ids and
  # independent variables.
  def refute_empty_model_variables model
    refute_empty model.dependent_variables
    refute_empty model.descriptor_ids
    refute_empty model.independent_variables
  end

  # A prediction needs a value and the median of its measurements has to lie
  # within the prediction interval.
  def assert_measurement_within_interval prediction
    refute_nil prediction[:value]
    interval = prediction[:prediction_interval][0]..prediction[:prediction_interval][1]
    assert_includes interval, prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
  end

  # Predicts one nanoparticle and one substance of the training dataset and
  # checks both predictions against their measurements.
  def assert_training_substances_predicted model
    nanoparticle = @training_dataset.nanoparticles[-34]
    assert_includes nanoparticle.dataset_ids, @training_dataset.id
    assert_measurement_within_interval model.predict(nanoparticle)
    assert_measurement_within_interval model.predict(@training_dataset.substances[14])
  end
end
@@ -0,0 +1,171 @@
1
+ require_relative "setup.rb"
2
+
3
# Tests for regression models created with Model::Lazar.create from the
# EPAFHM CSV datasets, covering default settings and several descriptor,
# similarity, prediction and feature selection configurations.
class LazarRegressionTest < MiniTest::Test

  # A model created without an algorithms argument has to report exactly the
  # algorithm settings listed below.
  def test_default_regression
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :prediction => {
        :method => "Algorithm::Caret.pls",
      },
      :feature_selection => nil,
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset
    assert_kind_of Model::LazarRegression, model
    assert_equal algorithms, model.algorithms
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
    substance = Compound.from_smiles "NC(=O)OCCC"
    prediction = model.predict substance
    refute_nil prediction[:value]
    refute_nil prediction[:prediction_interval]
    refute_empty prediction[:neighbors]
  end

  # With a minimum similarity of 0 the number of neighbors has to equal the
  # number of substances known to the model.
  def test_weighted_average
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :similarity => {
        :min => 0
      },
      :prediction => {
        :method => "Algorithm::Regression.weighted_average",
      },
    }
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    compound = Compound.from_smiles "CC(C)(C)CN"
    prediction = model.predict compound
    # parentheses avoid Ruby's "ambiguous first argument" warning for -0.86
    assert_equal(-0.86, prediction[:value].round(2))
    assert_equal model.substance_ids.size, prediction[:neighbors].size
  end

  # Explicitly requested MP2D fingerprints.
  def test_mpd_fingerprints
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
    }
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    compound = Compound.from_smiles "CCCSCCSCC"
    prediction = model.predict compound
    assert_equal 4, prediction[:neighbors].size
    assert_equal 1.37, prediction[:value].round(2)
  end

  # Calculated OpenBabel descriptors with weighted cosine similarity.
  def test_local_physchem_regression
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
    }
    model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
    compound = Compound.from_smiles "NC(=O)OCCC"
    prediction = model.predict compound
    refute_nil prediction[:value]
  end

  # Same as test_local_physchem_regression, with a correlation filter as
  # feature selection.
  def test_local_physchem_regression_with_feature_selection
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
    model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
    compound = Compound.from_smiles "NC(=O)OCCC"
    prediction = model.predict compound
    refute_nil prediction[:value]
  end

  # Calculated descriptors with (unweighted) cosine similarity; prediction
  # method and minimum similarity are expected to be filled in.
  def test_unweighted_cosine_physchem_regression
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.cosine",
      }
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarRegression, model
    assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method]
    assert_equal 0.1, model.algorithms[:similarity][:min]
    # the model's descriptor settings are compared without the :features entry
    algorithms[:descriptors].delete :features
    assert_equal algorithms[:descriptors], model.algorithms[:descriptors]
    prediction = model.predict training_dataset.substances[10]
    refute_nil prediction[:value]
  end

  # Only a feature selection method is supplied; all other settings are
  # expected to be filled in.
  def test_regression_with_feature_selection
    algorithms = {
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarRegression, model
    assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_equal 0.1, model.algorithms[:similarity][:min]
    assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
    prediction = model.predict training_dataset.substances[10]
    refute_nil prediction[:value]
  end

  # Fully specified algorithms have to be taken over by the model.
  def test_regression_parameters
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.3
      },
      :prediction => {
        :method => "Algorithm::Regression.weighted_average",
      },
      :feature_selection => nil,
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarRegression, model
    assert_equal "Algorithm::Regression.weighted_average", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
    # no :parameters were supplied above, so the model must not report any
    assert_nil model.algorithms[:prediction][:parameters]
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_equal 0.83, prediction[:value].round(2)
  end

end
@@ -0,0 +1,19 @@
1
+ require_relative "setup.rb"
2
+
3
# Tests a validation model created directly from a CSV file.
class ValidationModelTest < MiniTest::Test

  # Creates a validation model from hamster_carcinogenicity.csv, checks its
  # metadata, its model type, the accuracy of its crossvalidations and one
  # prediction, and deletes the model afterwards.
  def test_validation_model
    m = Model::Validation.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    [:endpoint,:species,:source].each do |p|
      refute_empty m[p]
    end
    assert m.classification?
    refute m.regression?
    m.crossvalidations.each do |cv|
      # the message now names the threshold that is actually checked
      assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.74. This may happen due to an unfavorable training/test set split."
    end
    prediction = m.predict Compound.from_smiles("CCCC(NN)C")
    assert_equal "true", prediction[:value]
    m.delete
  end
end
@@ -0,0 +1,55 @@
1
+ require_relative "setup.rb"
2
+
3
# Tests nanomaterial validation models (Model::NanoValidation) with default
# and with user supplied algorithms.
class NanomaterialValidationModelTest < MiniTest::Test

  # Looks up the training dataset by name and picks its
  # 'log2(Net cell association)' feature.
  def setup
    @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
    @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
  end

  # Validation model created without arguments.
  def test_default_nanomaterial_validation_model
    validation_model = Model::NanoValidation.create
    [:endpoint,:species,:source].each do |p|
      refute_empty validation_model[p]
    end
    assert_regression_validation validation_model
    assert_training_nanoparticle_predicted validation_model
    validation_model.delete
  end

  # Validation model created with explicit fingerprint/tanimoto/weighted
  # average algorithms.
  def test_nanomaterial_validation_model_parameters
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
      :feature_selection => nil
    }
    validation_model = Model::NanoValidation.create algorithms: algorithms
    assert_regression_validation validation_model
    assert_training_nanoparticle_predicted validation_model
    # cleanup, consistent with test_default_nanomaterial_validation_model
    validation_model.delete
  end

  private

  # The validation model has to be a regression model and each of its
  # crossvalidations has to provide r_squared and rmse.
  def assert_regression_validation validation_model
    assert validation_model.regression?
    refute validation_model.classification?
    validation_model.crossvalidations.each do |cv|
      refute_nil cv.r_squared
      refute_nil cv.rmse
    end
  end

  # Predicts one nanoparticle of the training dataset; the prediction needs a
  # value and the median of its measurements has to lie within the prediction
  # interval.
  def assert_training_nanoparticle_predicted validation_model
    nanoparticle = @training_dataset.nanoparticles[-34]
    assert_includes nanoparticle.dataset_ids, @training_dataset.id
    prediction = validation_model.predict nanoparticle
    refute_nil prediction[:value]
    interval = prediction[:prediction_interval][0]..prediction[:prediction_interval][1]
    assert_includes interval, prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
  end
end
data/test/setup.rb CHANGED
@@ -1,9 +1,13 @@
1
1
  ENV["LAZAR_ENV"] = "development"
2
2
  require 'minitest/autorun'
3
- #require_relative '../lib/lazar.rb'
4
- require 'lazar'
3
+ require_relative '../lib/lazar.rb'
4
+ #require 'lazar'
5
5
  include OpenTox
6
6
  TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
7
7
  DATA_DIR ||= File.join(TEST_DIR,"data")
8
- $mongo.database.drop
9
- $gridfs = $mongo.database.fs
8
+ training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
9
+ unless training_dataset
10
+ Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
11
+ end
12
+ #$mongo.database.drop
13
+ #$gridfs = $mongo.database.fs
@@ -0,0 +1,67 @@
1
+ require_relative "setup.rb"
2
+
3
# Tests crossvalidation, leave-one-out validation and repeated crossvalidation
# of classification models trained on hamster_carcinogenicity.csv.
class ValidationClassificationTest < MiniTest::Test
  include OpenTox::Validation

  # defaults

  # Crossvalidates a default model, checks accuracy and weighted accuracy and
  # writes the probability plot as pdf and png to /tmp.
  def test_default_classification_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::Lazar.create training_dataset: dataset
    cv = ClassificationCrossValidation.create model
    assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split"
    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than accuracy (#{cv.accuracy})."
    ["pdf","png"].each do |format|
      file = "/tmp/tmp.#{format}"
      # presumably probability_plot returns raw image data (TODO confirm);
      # binwrite stores it unchanged, whereas puts on a text mode handle
      # appended a newline to the file
      File.binwrite file, cv.probability_plot(format:format)
      # print the detected file type for manual inspection
      p `file -b #{file}`
    end
  end

  # parameters

  # Crossvalidates a model with user supplied algorithms; each validation has
  # to use its own training dataset.
  def test_classification_crossvalidation_parameters
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    algorithms = {
      :similarity => { :min => 0.3, },
      :descriptors => { :type => "FP3" }
    }
    model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms
    cv = ClassificationCrossValidation.create model
    params = model.algorithms
    params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string

    cv.validations.each do |validation|
      validation_params = validation.model.algorithms
      refute_nil model.training_dataset_id
      refute_nil validation.model.training_dataset_id
      refute_equal model.training_dataset_id, validation.model.training_dataset_id
      # NOTE(review): the algorithms hash supplied above has no top level
      # "min_sim", "type" or "prediction_feature_id" entry, so this presumably
      # compares nil with nil and cannot fail - confirm the intended keys.
      ["min_sim","type","prediction_feature_id"].each do |k|
        assert_equal params[k], validation_params[k]
      end
    end
  end

  # LOO

  # Leave-one-out validation of a default model.
  def test_classification_loo_validation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::Lazar.create training_dataset: dataset
    loo = ClassificationLeaveOneOut.create model
    assert_equal 14, loo.nr_unpredicted
    refute_empty loo.confusion_matrix
    assert loo.accuracy > 0.77
    assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})."
  end

  # repeated CV

  # Every crossvalidation of a repeated crossvalidation has to reach an
  # accuracy above 0.7.
  def test_repeated_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::Lazar.create training_dataset: dataset
    repeated_cv = RepeatedCrossValidation.create model
    repeated_cv.crossvalidations.each do |cv|
      assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
    end
  end

end