lazar 0.9.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
@@ -0,0 +1,106 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for lazar classification models: default and user-supplied algorithm
# settings, predictions for single compounds and for whole datasets.
class LazarClassificationTest < MiniTest::Test

  # Creates a model without an explicit algorithms argument, checks that its
  # algorithm settings equal the hash below, and verifies predictions for
  # training substances, individual compounds and a complete dataset.
  def test_classification_default
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :prediction => {
        :method => "Algorithm::Classification.weighted_majority_vote",
      },
      :feature_selection => nil,
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR, "hamster_carcinogenicity.csv")
    model = Model::Lazar.create training_dataset: training_dataset
    assert_kind_of Model::LazarClassification, model
    assert_equal algorithms, model.algorithms

    # prediction for a substance taken from the training dataset
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_equal "false", prediction[:value]

    # predictions for compounds created from InChI and SMILES
    examples = [
      {
        :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
        :prediction => "false",
      },
      {
        :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
        :prediction => "false",
      },
    ]
    examples.each do |example|
      prediction = model.predict example[:compound]
      assert_equal example[:prediction], prediction[:value]
    end

    compound = Compound.from_smiles "CCO"
    prediction = model.predict compound
    assert_equal "true", prediction[:value]
    assert_equal ["false"], prediction[:measurements]

    # make a dataset prediction
    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR, "EPAFHM.mini_log10.csv")
    prediction_dataset = model.predict compound_dataset
    assert_equal compound_dataset.compounds, prediction_dataset.compounds

    cid = prediction_dataset.compounds[7].id.to_s
    assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning]
    # every entry without a predicted value has to carry the same warning
    # (block parameter deliberately not named cid, to avoid shadowing the local above)
    prediction_dataset.predictions.each do |_cid, pred|
      assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil?
    end
    cid = Compound.from_smiles("CCOC(=O)N").id.to_s
    assert_match "excluded", prediction_dataset.predictions[cid][:warning]

    # cleanup
    [training_dataset, model, compound_dataset, prediction_dataset].each { |o| o.delete }
  end

  # Creates a model with MACCS fingerprints and a custom similarity threshold;
  # settings that were not supplied must still be present on the model.
  def test_classification_parameters
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MACCS"
      },
      :similarity => {
        :min => 0.4
      },
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR, "hamster_carcinogenicity.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarClassification, model
    assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_equal "false", prediction[:value]
    assert_equal 4, prediction[:neighbors].size
  end

  # Builds a model from kazius.csv and predicts the same structure twice;
  # both predictions must yield "1".
  def test_kazius
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR, "kazius.csv")
    model = Model::Lazar.create training_dataset: training_dataset
    2.times do
      compound = Compound.from_smiles("Clc1ccccc1NN")
      prediction = model.predict compound
      assert_equal "1", prediction[:value]
    end
    training_dataset.delete
  end

  # Placeholder, skipped.
  def test_caret_classification
    skip
  end

  # Placeholder, skipped.
  def test_fingerprint_chisq_feature_selection
    skip
  end

  # Placeholder, skipped.
  def test_physchem_classification
    skip
  end
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for lazar models trained on the nanoparticle dataset loaded in #setup.
class NanoparticleModelTest < MiniTest::Test
  include OpenTox::Validation

  # Looks up the training dataset by name and picks the feature named
  # 'log2(Net cell association)' as prediction feature.
  def setup
    @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
    @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
  end

  # Model created with default algorithms for the prediction feature.
  def test_nanoparticle_model
    assert_prediction_feature_measured
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
    assert_model_variables_present model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method]
    assert_training_substances_predicted model
    model.delete
  end

  # Model with MP2D fingerprints, tanimoto similarity and feature selection
  # explicitly set to nil.
  def test_nanoparticle_fingerprint_model
    assert_prediction_feature_measured
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :feature_selection => nil
    }
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
    assert_model_variables_present model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_nil model.algorithms[:descriptors][:categories]
    assert_training_substances_predicted model
    model.delete
  end

  # Same fingerprint settings without a :feature_selection key; the model is
  # expected to end up with a non-empty feature selection setting.
  def test_nanoparticle_fingerprint_model_with_feature_selection
    assert_prediction_feature_measured
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
    }
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
    refute_empty model.algorithms[:feature_selection]
    assert_model_variables_present model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_training_substances_predicted model
    model.delete
  end

  # Skipped: everything after the skip call is currently not executed.
  def test_nanoparticle_calculated_properties_model
    skip "Nanoparticle calculate_properties similarity not yet implemented"
    assert_prediction_feature_measured
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
      :prediction => {
        :method => "Algorithm::Regression.weighted_average",
      },
    }
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
    assert_model_variables_present model
    # NOTE(review): the two expectations below do not match the algorithms
    # configured above (weighted_average / weighted_cosine) - confirm the
    # intended values before enabling this test.
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.weighted", model.algorithms[:similarity][:method]
    assert_training_substances_predicted model
    model.delete
  end

  # Skipped: the import call below is currently not executed.
  def test_import_ld
    skip # Ambit JSON-LD export defunct
    Import::Enanomapper.import_ld
  end

  private

  # Fails unless the prediction feature is flagged as measured.
  # (The former "assert true, value" form could never fail, because the flag
  # was passed as the failure message instead of the tested value.)
  def assert_prediction_feature_measured
    assert @prediction_feature.measured, "Prediction feature should be flagged as measured"
  end

  # Checks that the model exposes non-empty dependent variables, descriptor
  # ids and independent variables.
  def assert_model_variables_present(model)
    refute_empty model.dependent_variables
    refute_empty model.descriptor_ids
    refute_empty model.independent_variables
  end

  # Predicts one substance and checks that a value is returned and that the
  # median of its measurements lies within the prediction interval.
  def assert_prediction_within_interval(model, substance)
    prediction = model.predict substance
    refute_nil prediction[:value]
    assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
  end

  # Runs the interval check for two members of the training dataset:
  # nanoparticles[-34] (which must belong to the dataset) and substances[14].
  def assert_training_substances_predicted(model)
    nanoparticle = @training_dataset.nanoparticles[-34]
    assert_includes nanoparticle.dataset_ids, @training_dataset.id
    assert_prediction_within_interval model, nanoparticle
    assert_prediction_within_interval model, @training_dataset.substances[14]
  end
end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for lazar regression models with default and custom descriptor,
# similarity, prediction and feature selection settings.
class LazarRegressionTest < MiniTest::Test

  # A model created without an algorithms argument must carry exactly the
  # settings listed below and return value, interval and neighbors.
  def test_default_regression
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :prediction => {
        :method => "Algorithm::Caret.pls",
      },
      :feature_selection => nil,
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR, "EPAFHM.medi_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset
    assert_kind_of Model::LazarRegression, model
    assert_equal algorithms, model.algorithms
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
    substance = Compound.from_smiles "NC(=O)OCCC"
    prediction = model.predict substance
    refute_nil prediction[:value]
    refute_nil prediction[:prediction_interval]
    refute_empty prediction[:neighbors]
  end

  # Weighted average prediction with a similarity threshold of 0: the
  # number of neighbors must equal the number of model substances.
  def test_weighted_average
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :similarity => {
        :min => 0
      },
      :prediction => {
        :method => "Algorithm::Regression.weighted_average",
      },
    }
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    compound = Compound.from_smiles "CC(C)(C)CN"
    prediction = model.predict compound
    # parentheses avoid Ruby's "ambiguous first argument" warning for the
    # negative literal
    assert_equal(-0.86, prediction[:value].round(2))
    assert_equal model.substance_ids.size, prediction[:neighbors].size
  end

  # Explicit MP2D fingerprint descriptors.
  def test_mpd_fingerprints
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
    }
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    compound = Compound.from_smiles "CCCSCCSCC"
    prediction = model.predict compound
    assert_equal 4, prediction[:neighbors].size
    assert_equal 1.37, prediction[:value].round(2)
  end

  # Calculated OpenBabel properties with weighted cosine similarity.
  def test_local_physchem_regression
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
    }
    model = Model::Lazar.create(training_dataset: training_dataset, algorithms: algorithms)
    compound = Compound.from_smiles "NC(=O)OCCC"
    prediction = model.predict compound
    refute_nil prediction[:value]
  end

  # Same as above with the correlation filter for feature selection.
  def test_local_physchem_regression_with_feature_selection
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
    model = Model::Lazar.create(training_dataset: training_dataset, algorithms: algorithms)
    compound = Compound.from_smiles "NC(=O)OCCC"
    prediction = model.predict compound
    refute_nil prediction[:value]
  end

  # Calculated properties with plain cosine similarity; prediction method
  # and similarity threshold are not supplied and are checked on the model.
  def test_unweighted_cosine_physchem_regression
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.cosine",
      }
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR, "EPAFHM.mini_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarRegression, model
    assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method]
    assert_equal 0.1, model.algorithms[:similarity][:min]
    # the model's descriptor settings are compared without the :features entry
    algorithms[:descriptors].delete :features
    assert_equal algorithms[:descriptors], model.algorithms[:descriptors]
    prediction = model.predict training_dataset.substances[10]
    refute_nil prediction[:value]
  end

  # Only feature selection is supplied; the remaining settings are checked
  # on the model.
  def test_regression_with_feature_selection
    algorithms = {
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR, "EPAFHM.mini_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarRegression, model
    assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_equal 0.1, model.algorithms[:similarity][:min]
    assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
    prediction = model.predict training_dataset.substances[10]
    refute_nil prediction[:value]
  end

  # All algorithm sections supplied explicitly; they must be retained.
  def test_regression_parameters
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.3
      },
      :prediction => {
        :method => "Algorithm::Regression.weighted_average",
      },
      :feature_selection => nil,
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR, "EPAFHM.medi_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarRegression, model
    assert_equal "Algorithm::Regression.weighted_average", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
    # no :parameters were supplied above, so none are expected on the model;
    # assert_nil replaces an assert_equal whose expected value was nil
    assert_nil model.algorithms[:prediction][:parameters]
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_equal 0.83, prediction[:value].round(2)
  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for validation models created directly from a CSV file.
class ValidationModelTest < MiniTest::Test

  # Creates a validation model from the hamster carcinogenicity data, checks
  # its metadata, model type, crossvalidation accuracies and one prediction.
  def test_validation_model
    m = Model::Validation.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    [:endpoint, :species, :source].each do |p|
      refute_empty m[p]
    end
    assert m.classification?
    refute m.regression?
    # single source for the threshold, so that check and message cannot drift
    # apart (the message used to state 0.75 while 0.74 was checked)
    min_accuracy = 0.74
    m.crossvalidations.each do |cv|
      assert cv.accuracy > min_accuracy, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than #{min_accuracy}. This may happen due to an unfavorable training/test set split."
    end
    prediction = m.predict Compound.from_smiles("CCCC(NN)C")
    assert_equal "true", prediction[:value]
    m.delete
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for nanomaterial validation models (Model::NanoValidation).
class NanomaterialValidationModelTest < MiniTest::Test

  # Looks up the training dataset by name and the feature named
  # 'log2(Net cell association)'.
  def setup
    @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
    @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
  end

  # Validation model created without arguments.
  def test_default_nanomaterial_validation_model
    validation_model = Model::NanoValidation.create
    [:endpoint, :species, :source].each do |p|
      refute_empty validation_model[p]
    end
    assert_regression_crossvalidations validation_model
    assert_nanoparticle_predicted validation_model
    validation_model.delete
  end

  # Validation model created with explicit algorithm settings.
  def test_nanomaterial_validation_model_parameters
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
      :feature_selection => nil
    }
    validation_model = Model::NanoValidation.create algorithms: algorithms
    assert_regression_crossvalidations validation_model
    assert_nanoparticle_predicted validation_model
    # remove the model created by this test, as the default test does
    validation_model.delete
  end

  private

  # Checks that the model is a regression model and that each of its
  # crossvalidations reports r_squared and rmse.
  def assert_regression_crossvalidations(validation_model)
    assert validation_model.regression?
    refute validation_model.classification?
    validation_model.crossvalidations.each do |cv|
      refute_nil cv.r_squared
      refute_nil cv.rmse
    end
  end

  # Predicts nanoparticles[-34] of the training dataset and checks that the
  # median of its measurements lies within the prediction interval.
  def assert_nanoparticle_predicted(validation_model)
    nanoparticle = @training_dataset.nanoparticles[-34]
    assert_includes nanoparticle.dataset_ids, @training_dataset.id
    prediction = validation_model.predict nanoparticle
    refute_nil prediction[:value]
    assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
  end
end
|
data/test/setup.rb
CHANGED
@@ -1,9 +1,13 @@
|
|
1
1
|
ENV["LAZAR_ENV"] = "development"
|
2
2
|
require 'minitest/autorun'
|
3
|
-
|
4
|
-
require 'lazar'
|
3
|
+
require_relative '../lib/lazar.rb'
|
4
|
+
#require 'lazar'
|
5
5
|
include OpenTox
|
6
6
|
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
|
7
7
|
DATA_DIR ||= File.join(TEST_DIR,"data")
|
8
|
-
|
9
|
-
|
8
|
+
training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
|
9
|
+
unless training_dataset
|
10
|
+
Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
|
11
|
+
end
|
12
|
+
#$mongo.database.drop
|
13
|
+
#$gridfs = $mongo.database.fs
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for crossvalidation, leave-one-out and repeated crossvalidation of
# classification models.
class ValidationClassificationTest < MiniTest::Test
  include OpenTox::Validation

  # defaults

  # Crossvalidates a default model, checks accuracy and weighted accuracy,
  # and writes the probability plot as PDF and PNG to /tmp, printing the
  # detected file types.
  def test_default_classification_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::Lazar.create training_dataset: dataset
    cv = ClassificationCrossValidation.create model
    assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split"
    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than accuracy (#{cv.accuracy})."
    %w[pdf png].each do |format|
      path = "/tmp/tmp.#{format}"
      File.open(path, "w+") { |f| f.puts cv.probability_plot(format: format) }
      p `file -b #{path}`
    end
  end

  # parameters

  # Every fold model of a crossvalidation must use its own training dataset
  # and inherit the similarity threshold, descriptor type and prediction
  # feature of the crossvalidated model.
  def test_classification_crossvalidation_parameters
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    algorithms = {
      :similarity => { :min => 0.3, },
      :descriptors => { :type => "FP3" }
    }
    model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms
    cv = ClassificationCrossValidation.create model
    # keys are stringified on all levels so that the comparison does not
    # depend on symbol versus string keys
    params = deep_stringify_keys(model.algorithms)
    refute_nil model.training_dataset_id

    cv.validations.each do |validation|
      validation_params = deep_stringify_keys(validation.model.algorithms)
      refute_nil validation.model.training_dataset_id
      refute_equal model.training_dataset_id, validation.model.training_dataset_id
      # The former lookup of "min_sim", "type" and "prediction_feature_id" on
      # the top-level hash compared nil with nil; compare the nested settings.
      assert_equal params["similarity"]["min"], validation_params["similarity"]["min"]
      assert_equal params["descriptors"]["type"], validation_params["descriptors"]["type"]
      # NOTE(review): assumes models expose prediction_feature_id - confirm
      assert_equal model.prediction_feature_id, validation.model.prediction_feature_id
    end
  end

  # LOO

  # Leave-one-out validation of a default model.
  def test_classification_loo_validation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::Lazar.create training_dataset: dataset
    loo = ClassificationLeaveOneOut.create model
    assert_equal 14, loo.nr_unpredicted
    refute_empty loo.confusion_matrix
    assert loo.accuracy > 0.77
    assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})."
  end

  # repeated CV

  # Each crossvalidation of a repeated crossvalidation must exceed 0.7 accuracy.
  def test_repeated_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::Lazar.create training_dataset: dataset
    repeated_cv = RepeatedCrossValidation.create model
    repeated_cv.crossvalidations.each do |cv|
      assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
    end
  end

  private

  # Returns a copy of obj in which the keys of all (nested) hashes are
  # converted to strings; non-hash values are returned unchanged.
  def deep_stringify_keys(obj)
    return obj unless obj.is_a?(Hash)
    obj.each_with_object({}) { |(k, v), h| h[k.to_s] = deep_stringify_keys(v) }
  end

end
|