lazar 0.9.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
@@ -0,0 +1,106 @@
1
+ require_relative "setup.rb"
2
+
3
+ class LazarClassificationTest < MiniTest::Test
4
+
5
+ def test_classification_default
6
+ algorithms = {
7
+ :descriptors => {
8
+ :method => "fingerprint",
9
+ :type => "MP2D"
10
+ },
11
+ :similarity => {
12
+ :method => "Algorithm::Similarity.tanimoto",
13
+ :min => 0.1
14
+ },
15
+ :prediction => {
16
+ :method => "Algorithm::Classification.weighted_majority_vote",
17
+ },
18
+ :feature_selection => nil,
19
+ }
20
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
21
+ model = Model::Lazar.create training_dataset: training_dataset
22
+ assert_kind_of Model::LazarClassification, model
23
+ assert_equal algorithms, model.algorithms
24
+ substance = training_dataset.substances[10]
25
+ prediction = model.predict substance
26
+ assert_equal "false", prediction[:value]
27
+ [ {
28
+ :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
29
+ :prediction => "false",
30
+ },{
31
+ :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
32
+ :prediction => "false",
33
+ } ].each do |example|
34
+ prediction = model.predict example[:compound]
35
+ assert_equal example[:prediction], prediction[:value]
36
+ end
37
+
38
+ compound = Compound.from_smiles "CCO"
39
+ prediction = model.predict compound
40
+ assert_equal "true", prediction[:value]
41
+ assert_equal ["false"], prediction[:measurements]
42
+
43
+ # make a dataset prediction
44
+ compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
45
+ prediction_dataset = model.predict compound_dataset
46
+ assert_equal compound_dataset.compounds, prediction_dataset.compounds
47
+
48
+ cid = prediction_dataset.compounds[7].id.to_s
49
+ assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning]
50
+ prediction_dataset.predictions.each do |cid,pred|
51
+ assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil?
52
+ end
53
+ cid = Compound.from_smiles("CCOC(=O)N").id.to_s
54
+ assert_match "excluded", prediction_dataset.predictions[cid][:warning]
55
+ # cleanup
56
+ [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
57
+ end
58
+
59
+ def test_classification_parameters
60
+ algorithms = {
61
+ :descriptors => {
62
+ :method => "fingerprint",
63
+ :type => "MACCS"
64
+ },
65
+ :similarity => {
66
+ :min => 0.4
67
+ },
68
+ }
69
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
70
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
71
+ assert_kind_of Model::LazarClassification, model
72
+ assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method]
73
+ assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
74
+ assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
75
+ substance = training_dataset.substances[10]
76
+ prediction = model.predict substance
77
+ assert_equal "false", prediction[:value]
78
+ assert_equal 4, prediction[:neighbors].size
79
+ end
80
+
81
+ def test_kazius
82
+ t = Time.now
83
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
84
+ t = Time.now
85
+ model = Model::Lazar.create training_dataset: training_dataset
86
+ t = Time.now
87
+ 2.times do
88
+ compound = Compound.from_smiles("Clc1ccccc1NN")
89
+ prediction = model.predict compound
90
+ assert_equal "1", prediction[:value]
91
+ end
92
+ training_dataset.delete
93
+ end
94
+
95
+ def test_caret_classification
96
+ skip
97
+ end
98
+
99
+ def test_fingerprint_chisq_feature_selection
100
+ skip
101
+ end
102
+
103
+ def test_physchem_classification
104
+ skip
105
+ end
106
+ end
@@ -0,0 +1,128 @@
1
+ require_relative "setup.rb"
2
+
3
+ class NanoparticleModelTest < MiniTest::Test
4
+ include OpenTox::Validation
5
+
6
+ def setup
7
+ @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
8
+ @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
9
+ end
10
+
11
+ def test_nanoparticle_model
12
+ assert true, @prediction_feature.measured
13
+ model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
14
+ refute_empty model.dependent_variables
15
+ refute_empty model.descriptor_ids
16
+ refute_empty model.independent_variables
17
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
18
+ assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method]
19
+ nanoparticle = @training_dataset.nanoparticles[-34]
20
+ assert_includes nanoparticle.dataset_ids, @training_dataset.id
21
+ prediction = model.predict nanoparticle
22
+ refute_nil prediction[:value]
23
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
24
+ prediction = model.predict @training_dataset.substances[14]
25
+ refute_nil prediction[:value]
26
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
27
+ model.delete
28
+ end
29
+
30
+ def test_nanoparticle_fingerprint_model
31
+ assert true, @prediction_feature.measured
32
+ algorithms = {
33
+ :descriptors => {
34
+ :method => "fingerprint",
35
+ :type => "MP2D",
36
+ },
37
+ :similarity => {
38
+ :method => "Algorithm::Similarity.tanimoto",
39
+ :min => 0.1
40
+ },
41
+ :feature_selection => nil
42
+ }
43
+ model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
44
+ refute_empty model.dependent_variables
45
+ refute_empty model.descriptor_ids
46
+ refute_empty model.independent_variables
47
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
48
+ assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
49
+ assert_nil model.algorithms[:descriptors][:categories]
50
+ nanoparticle = @training_dataset.nanoparticles[-34]
51
+ assert_includes nanoparticle.dataset_ids, @training_dataset.id
52
+ prediction = model.predict nanoparticle
53
+ refute_nil prediction[:value]
54
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
55
+ prediction = model.predict @training_dataset.substances[14]
56
+ refute_nil prediction[:value]
57
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
58
+ model.delete
59
+ end
60
+
61
+ def test_nanoparticle_fingerprint_model_with_feature_selection
62
+ assert true, @prediction_feature.measured
63
+ algorithms = {
64
+ :descriptors => {
65
+ :method => "fingerprint",
66
+ :type => "MP2D",
67
+ },
68
+ :similarity => {
69
+ :method => "Algorithm::Similarity.tanimoto",
70
+ :min => 0.1
71
+ },
72
+ }
73
+ model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
74
+ refute_empty model.algorithms[:feature_selection]
75
+ refute_empty model.dependent_variables
76
+ refute_empty model.descriptor_ids
77
+ refute_empty model.independent_variables
78
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
79
+ assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
80
+ nanoparticle = @training_dataset.nanoparticles[-34]
81
+ assert_includes nanoparticle.dataset_ids, @training_dataset.id
82
+ prediction = model.predict nanoparticle
83
+ refute_nil prediction[:value]
84
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
85
+ prediction = model.predict @training_dataset.substances[14]
86
+ refute_nil prediction[:value]
87
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
88
+ model.delete
89
+ end
90
+
91
+ def test_nanoparticle_calculated_properties_model
92
+ skip "Nanoparticle calculate_properties similarity not yet implemented"
93
+ assert true, @prediction_feature.measured
94
+ algorithms = {
95
+ :descriptors => {
96
+ :method => "calculate_properties",
97
+ :features => PhysChem.openbabel_descriptors,
98
+ },
99
+ :similarity => {
100
+ :method => "Algorithm::Similarity.weighted_cosine",
101
+ :min => 0.5
102
+ },
103
+ :prediction => {
104
+ :method => "Algorithm::Regression.weighted_average",
105
+ },
106
+ }
107
+ model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
108
+ refute_empty model.dependent_variables
109
+ refute_empty model.descriptor_ids
110
+ refute_empty model.independent_variables
111
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
112
+ assert_equal "Algorithm::Similarity.weighted", model.algorithms[:similarity][:method]
113
+ nanoparticle = @training_dataset.nanoparticles[-34]
114
+ assert_includes nanoparticle.dataset_ids, @training_dataset.id
115
+ prediction = model.predict nanoparticle
116
+ refute_nil prediction[:value]
117
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
118
+ prediction = model.predict @training_dataset.substances[14]
119
+ refute_nil prediction[:value]
120
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
121
+ model.delete
122
+ end
123
+
124
+ def test_import_ld
125
+ skip # Ambit JSON-LD export defunct
126
+ dataset_ids = Import::Enanomapper.import_ld
127
+ end
128
+ end
@@ -0,0 +1,171 @@
1
+ require_relative "setup.rb"
2
+
3
+ class LazarRegressionTest < MiniTest::Test
4
+
5
+ def test_default_regression
6
+ algorithms = {
7
+ :descriptors => {
8
+ :method => "fingerprint",
9
+ :type => "MP2D"
10
+ },
11
+ :similarity => {
12
+ :method => "Algorithm::Similarity.tanimoto",
13
+ :min => 0.1
14
+ },
15
+ :prediction => {
16
+ :method => "Algorithm::Caret.pls",
17
+ },
18
+ :feature_selection => nil,
19
+ }
20
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
21
+ model = Model::Lazar.create training_dataset: training_dataset
22
+ assert_kind_of Model::LazarRegression, model
23
+ assert_equal algorithms, model.algorithms
24
+ substance = training_dataset.substances[10]
25
+ prediction = model.predict substance
26
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
27
+ substance = Compound.from_smiles "NC(=O)OCCC"
28
+ prediction = model.predict substance
29
+ refute_nil prediction[:value]
30
+ refute_nil prediction[:prediction_interval]
31
+ refute_empty prediction[:neighbors]
32
+ end
33
+
34
+ def test_weighted_average
35
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
36
+ algorithms = {
37
+ :similarity => {
38
+ :min => 0
39
+ },
40
+ :prediction => {
41
+ :method => "Algorithm::Regression.weighted_average",
42
+ },
43
+ }
44
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
45
+ compound = Compound.from_smiles "CC(C)(C)CN"
46
+ prediction = model.predict compound
47
+ assert_equal -0.86, prediction[:value].round(2)
48
+ assert_equal model.substance_ids.size, prediction[:neighbors].size
49
+ end
50
+
51
+ def test_mpd_fingerprints
52
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
53
+ algorithms = {
54
+ :descriptors => {
55
+ :method => "fingerprint",
56
+ :type => "MP2D"
57
+ },
58
+ }
59
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
60
+ compound = Compound.from_smiles "CCCSCCSCC"
61
+ prediction = model.predict compound
62
+ assert_equal 4, prediction[:neighbors].size
63
+ assert_equal 1.37, prediction[:value].round(2)
64
+ end
65
+
66
+ def test_local_physchem_regression
67
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
68
+ algorithms = {
69
+ :descriptors => {
70
+ :method => "calculate_properties",
71
+ :features => PhysChem.openbabel_descriptors,
72
+ },
73
+ :similarity => {
74
+ :method => "Algorithm::Similarity.weighted_cosine",
75
+ :min => 0.5
76
+ },
77
+ }
78
+ model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
79
+ compound = Compound.from_smiles "NC(=O)OCCC"
80
+ prediction = model.predict compound
81
+ refute_nil prediction[:value]
82
+ end
83
+
84
+ def test_local_physchem_regression_with_feature_selection
85
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
86
+ algorithms = {
87
+ :descriptors => {
88
+ :method => "calculate_properties",
89
+ :features => PhysChem.openbabel_descriptors,
90
+ },
91
+ :similarity => {
92
+ :method => "Algorithm::Similarity.weighted_cosine",
93
+ :min => 0.5
94
+ },
95
+ :feature_selection => {
96
+ :method => "Algorithm::FeatureSelection.correlation_filter",
97
+ },
98
+ }
99
+ model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
100
+ compound = Compound.from_smiles "NC(=O)OCCC"
101
+ prediction = model.predict compound
102
+ refute_nil prediction[:value]
103
+ end
104
+
105
+ def test_unweighted_cosine_physchem_regression
106
+ algorithms = {
107
+ :descriptors => {
108
+ :method => "calculate_properties",
109
+ :features => PhysChem.openbabel_descriptors,
110
+ },
111
+ :similarity => {
112
+ :method => "Algorithm::Similarity.cosine",
113
+ }
114
+ }
115
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
116
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
117
+ assert_kind_of Model::LazarRegression, model
118
+ assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
119
+ assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method]
120
+ assert_equal 0.1, model.algorithms[:similarity][:min]
121
+ algorithms[:descriptors].delete :features
122
+ assert_equal algorithms[:descriptors], model.algorithms[:descriptors]
123
+ prediction = model.predict training_dataset.substances[10]
124
+ refute_nil prediction[:value]
125
+ end
126
+
127
+ def test_regression_with_feature_selection
128
+ algorithms = {
129
+ :feature_selection => {
130
+ :method => "Algorithm::FeatureSelection.correlation_filter",
131
+ },
132
+ }
133
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
134
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
135
+ assert_kind_of Model::LazarRegression, model
136
+ assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
137
+ assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
138
+ assert_equal 0.1, model.algorithms[:similarity][:min]
139
+ assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
140
+ prediction = model.predict training_dataset.substances[10]
141
+ refute_nil prediction[:value]
142
+ end
143
+
144
+ def test_regression_parameters
145
+ algorithms = {
146
+ :descriptors => {
147
+ :method => "fingerprint",
148
+ :type => "MP2D"
149
+ },
150
+ :similarity => {
151
+ :method => "Algorithm::Similarity.tanimoto",
152
+ :min => 0.3
153
+ },
154
+ :prediction => {
155
+ :method => "Algorithm::Regression.weighted_average",
156
+ },
157
+ :feature_selection => nil,
158
+ }
159
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
160
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
161
+ assert_kind_of Model::LazarRegression, model
162
+ assert_equal "Algorithm::Regression.weighted_average", model.algorithms[:prediction][:method]
163
+ assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
164
+ assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
165
+ assert_equal algorithms[:prediction][:parameters], model.algorithms[:prediction][:parameters]
166
+ substance = training_dataset.substances[10]
167
+ prediction = model.predict substance
168
+ assert_equal 0.83, prediction[:value].round(2)
169
+ end
170
+
171
+ end
@@ -0,0 +1,19 @@
1
+ require_relative "setup.rb"
2
+
3
+ class ValidationModelTest < MiniTest::Test
4
+
5
+ def test_validation_model
6
+ m = Model::Validation.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
7
+ [:endpoint,:species,:source].each do |p|
8
+ refute_empty m[p]
9
+ end
10
+ assert m.classification?
11
+ refute m.regression?
12
+ m.crossvalidations.each do |cv|
13
+ assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. This may happen due to an unfavorable training/test set split."
14
+ end
15
+ prediction = m.predict Compound.from_smiles("CCCC(NN)C")
16
+ assert_equal "true", prediction[:value]
17
+ m.delete
18
+ end
19
+ end
@@ -0,0 +1,55 @@
1
+ require_relative "setup.rb"
2
+
3
+ class NanomaterialValidationModelTest < MiniTest::Test
4
+
5
+ def setup
6
+ @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
7
+ @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
8
+ end
9
+
10
+ def test_default_nanomaterial_validation_model
11
+ validation_model = Model::NanoValidation.create
12
+ [:endpoint,:species,:source].each do |p|
13
+ refute_empty validation_model[p]
14
+ end
15
+ assert validation_model.regression?
16
+ refute validation_model.classification?
17
+ validation_model.crossvalidations.each do |cv|
18
+ refute_nil cv.r_squared
19
+ refute_nil cv.rmse
20
+ end
21
+ nanoparticle = @training_dataset.nanoparticles[-34]
22
+ assert_includes nanoparticle.dataset_ids, @training_dataset.id
23
+ prediction = validation_model.predict nanoparticle
24
+ refute_nil prediction[:value]
25
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
26
+ validation_model.delete
27
+ end
28
+
29
+ def test_nanomaterial_validation_model_parameters
30
+ algorithms = {
31
+ :descriptors => {
32
+ :method => "fingerprint",
33
+ :type => "MP2D",
34
+ },
35
+ :similarity => {
36
+ :method => "Algorithm::Similarity.tanimoto",
37
+ :min => 0.1
38
+ },
39
+ :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
40
+ :feature_selection => nil
41
+ }
42
+ validation_model = Model::NanoValidation.create algorithms: algorithms
43
+ assert validation_model.regression?
44
+ refute validation_model.classification?
45
+ validation_model.crossvalidations.each do |cv|
46
+ refute_nil cv.r_squared
47
+ refute_nil cv.rmse
48
+ end
49
+ nanoparticle = @training_dataset.nanoparticles[-34]
50
+ assert_includes nanoparticle.dataset_ids, @training_dataset.id
51
+ prediction = validation_model.predict nanoparticle
52
+ refute_nil prediction[:value]
53
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
54
+ end
55
+ end
data/test/setup.rb CHANGED
@@ -1,9 +1,13 @@
1
1
  ENV["LAZAR_ENV"] = "development"
2
2
  require 'minitest/autorun'
3
- #require_relative '../lib/lazar.rb'
4
- require 'lazar'
3
+ require_relative '../lib/lazar.rb'
4
+ #require 'lazar'
5
5
  include OpenTox
6
6
  TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
7
7
  DATA_DIR ||= File.join(TEST_DIR,"data")
8
- $mongo.database.drop
9
- $gridfs = $mongo.database.fs
8
+ training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
9
+ unless training_dataset
10
+ Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
11
+ end
12
+ #$mongo.database.drop
13
+ #$gridfs = $mongo.database.fs
@@ -0,0 +1,67 @@
1
+ require_relative "setup.rb"
2
+
3
+ class ValidationClassificationTest < MiniTest::Test
4
+ include OpenTox::Validation
5
+
6
+ # defaults
7
+
8
+ def test_default_classification_crossvalidation
9
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
10
+ model = Model::Lazar.create training_dataset: dataset
11
+ cv = ClassificationCrossValidation.create model
12
+ assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split"
13
+ assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than accuracy (#{cv.accuracy})."
14
+ File.open("/tmp/tmp.pdf","w+"){|f| f.puts cv.probability_plot(format:"pdf")}
15
+ p `file -b /tmp/tmp.pdf`
16
+ File.open("/tmp/tmp.png","w+"){|f| f.puts cv.probability_plot(format:"png")}
17
+ p `file -b /tmp/tmp.png`
18
+ end
19
+
20
+ # parameters
21
+
22
+ def test_classification_crossvalidation_parameters
23
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
24
+ algorithms = {
25
+ :similarity => { :min => 0.3, },
26
+ :descriptors => { :type => "FP3" }
27
+ }
28
+ model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms
29
+ cv = ClassificationCrossValidation.create model
30
+ params = model.algorithms
31
+ params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string
32
+
33
+ cv.validations.each do |validation|
34
+ validation_params = validation.model.algorithms
35
+ refute_nil model.training_dataset_id
36
+ refute_nil validation.model.training_dataset_id
37
+ refute_equal model.training_dataset_id, validation.model.training_dataset_id
38
+ ["min_sim","type","prediction_feature_id"].each do |k|
39
+ assert_equal params[k], validation_params[k]
40
+ end
41
+ end
42
+ end
43
+
44
+ # LOO
45
+
46
+ def test_classification_loo_validation
47
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
48
+ model = Model::Lazar.create training_dataset: dataset
49
+ loo = ClassificationLeaveOneOut.create model
50
+ assert_equal 14, loo.nr_unpredicted
51
+ refute_empty loo.confusion_matrix
52
+ assert loo.accuracy > 0.77
53
+ assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})."
54
+ end
55
+
56
+ # repeated CV
57
+
58
+ def test_repeated_crossvalidation
59
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
60
+ model = Model::Lazar.create training_dataset: dataset
61
+ repeated_cv = RepeatedCrossValidation.create model
62
+ repeated_cv.crossvalidations.each do |cv|
63
+ assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
64
+ end
65
+ end
66
+
67
+ end