lazar 0.9.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
@@ -0,0 +1,106 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for lazar classification models: default settings, user-supplied
# algorithm parameters and predictions on a second dataset (kazius.csv).
#
# Uses Minitest::Test; the older MiniTest constant is only a compatibility
# alias and is no longer defined by default in recent Minitest releases.
class LazarClassificationTest < Minitest::Test

  # A model created without an algorithms hash must be a LazarClassification
  # that reports the settings below, and must predict a training substance,
  # individual compounds and a complete dataset.
  def test_classification_default
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :prediction => {
        :method => "Algorithm::Classification.weighted_majority_vote",
      },
      :feature_selection => nil,
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
    model = Model::Lazar.create training_dataset: training_dataset
    assert_kind_of Model::LazarClassification, model
    assert_equal algorithms, model.algorithms

    # prediction for a substance taken from the training data
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_equal "false", prediction[:value]

    # predictions for compounds created from InChI and SMILES
    [ {
      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
      :prediction => "false",
    },{
      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
      :prediction => "false",
    } ].each do |example|
      prediction = model.predict example[:compound]
      assert_equal example[:prediction], prediction[:value]
    end

    compound = Compound.from_smiles "CCO"
    prediction = model.predict compound
    assert_equal "true", prediction[:value]
    assert_equal ["false"], prediction[:measurements]

    # make a dataset prediction
    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
    prediction_dataset = model.predict compound_dataset
    assert_equal compound_dataset.compounds, prediction_dataset.compounds

    no_neighbors_warning = "Could not find similar substances with experimental data in the training dataset."
    cid = prediction_dataset.compounds[7].id.to_s
    assert_equal no_neighbors_warning, prediction_dataset.predictions[cid][:warning]
    # every prediction without a value must carry the same warning
    # (the id is not needed here, so it no longer shadows the outer `cid`)
    prediction_dataset.predictions.each do |_id,pred|
      assert_equal no_neighbors_warning, pred[:warning] if pred[:value].nil?
    end
    cid = Compound.from_smiles("CCOC(=O)N").id.to_s
    assert_match "excluded", prediction_dataset.predictions[cid][:warning]

    # cleanup
    [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
  end

  # User-supplied descriptor type and similarity threshold must be kept,
  # while the unspecified prediction and similarity methods fall back to
  # the values asserted below.
  def test_classification_parameters
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MACCS"
      },
      :similarity => {
        :min => 0.4
      },
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarClassification, model
    assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_equal "false", prediction[:value]
    assert_equal 4, prediction[:neighbors].size
  end

  # Builds a model from kazius.csv and predicts the same compound twice;
  # both predictions must yield "1".
  def test_kazius
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
    model = Model::Lazar.create training_dataset: training_dataset
    2.times do
      compound = Compound.from_smiles("Clc1ccccc1NN")
      prediction = model.predict compound
      assert_equal "1", prediction[:value]
    end
    # cleanup: remove the model as well, as test_classification_default does
    model.delete
    training_dataset.delete
  end

  # Placeholder, not implemented yet.
  def test_caret_classification
    skip
  end

  # Placeholder, not implemented yet.
  def test_fingerprint_chisq_feature_selection
    skip
  end

  # Placeholder, not implemented yet.
  def test_physchem_classification
    skip
  end

end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for lazar models trained on the nanoparticle dataset that setup
# looks up by name.
#
# Uses Minitest::Test; the older MiniTest constant is only a compatibility
# alias and is no longer defined by default in recent Minitest releases.
class NanoparticleModelTest < Minitest::Test
  include OpenTox::Validation

  # Loads the training dataset by name and selects its
  # 'log2(Net cell association)' feature as prediction feature.
  def setup
    @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
    @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
  end

  # Model created with default algorithms for this dataset.
  def test_nanoparticle_model
    assert_prediction_feature_measured
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
    assert_model_variables_present model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method]
    assert_predictions_cover_measurements model
    model.delete
  end

  # Fingerprint descriptors with tanimoto similarity and feature selection
  # explicitly switched off.
  def test_nanoparticle_fingerprint_model
    assert_prediction_feature_measured
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :feature_selection => nil
    }
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
    assert_model_variables_present model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_nil model.algorithms[:descriptors][:categories]
    assert_predictions_cover_measurements model
    model.delete
  end

  # Same fingerprint settings, but :feature_selection is left unspecified;
  # the created model must then report a non-empty feature selection.
  def test_nanoparticle_fingerprint_model_with_feature_selection
    assert_prediction_feature_measured
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
    }
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
    refute_empty model.algorithms[:feature_selection]
    assert_model_variables_present model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_predictions_cover_measurements model
    model.delete
  end

  # Currently skipped; everything after `skip` is not executed.
  # NOTE(review): the expected method names below ("Algorithm::Caret.rf",
  # "Algorithm::Similarity.weighted") do not match the algorithms hash
  # configured here (weighted_average, weighted_cosine) - confirm the
  # intended values before re-enabling this test.
  def test_nanoparticle_calculated_properties_model
    skip "Nanoparticle calculate_properties similarity not yet implemented"
    assert_prediction_feature_measured
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
      :prediction => {
        :method => "Algorithm::Regression.weighted_average",
      },
    }
    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
    assert_model_variables_present model
    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.weighted", model.algorithms[:similarity][:method]
    assert_predictions_cover_measurements model
    model.delete
  end

  # Currently skipped; the import call below is not executed.
  def test_import_ld
    skip "Ambit JSON-LD export defunct"
    Import::Enanomapper.import_ld
  end

  private

  # Fails unless the prediction feature is flagged as measured.
  # (The former `assert true, value` could never fail because the value
  # was passed as the failure message.)
  def assert_prediction_feature_measured
    assert @prediction_feature.measured, "Prediction feature should be flagged as measured."
  end

  # The model must expose non-empty dependent variables, descriptor ids
  # and independent variables.
  def assert_model_variables_present model
    refute_empty model.dependent_variables
    refute_empty model.descriptor_ids
    refute_empty model.independent_variables
  end

  # Predicts two members of the training dataset and checks that each
  # prediction has a value and that the median of the measurements lies
  # within the reported prediction interval.
  def assert_predictions_cover_measurements model
    nanoparticle = @training_dataset.nanoparticles[-34]
    assert_includes nanoparticle.dataset_ids, @training_dataset.id
    [nanoparticle, @training_dataset.substances[14]].each do |substance|
      prediction = model.predict substance
      refute_nil prediction[:value]
      interval = prediction[:prediction_interval]
      assert_includes interval[0]..interval[1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
    end
  end

end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for lazar regression models with default and user-supplied
# descriptor, similarity, prediction and feature selection settings.
#
# Uses Minitest::Test; the older MiniTest constant is only a compatibility
# alias and is no longer defined by default in recent Minitest releases.
class LazarRegressionTest < Minitest::Test

  # A model created without an algorithms hash must be a LazarRegression
  # that reports the settings listed below.
  def test_default_regression
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :prediction => {
        :method => "Algorithm::Caret.pls",
      },
      :feature_selection => nil,
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset
    assert_kind_of Model::LazarRegression, model
    assert_equal algorithms, model.algorithms
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    interval = prediction[:prediction_interval]
    assert_includes interval[0]..interval[1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
    substance = Compound.from_smiles "NC(=O)OCCC"
    prediction = model.predict substance
    refute_nil prediction[:value]
    refute_nil prediction[:prediction_interval]
    refute_empty prediction[:neighbors]
  end

  # Weighted average prediction with a similarity threshold of 0: the
  # neighbor count must equal the number of model substances.
  def test_weighted_average
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :similarity => {
        :min => 0
      },
      :prediction => {
        :method => "Algorithm::Regression.weighted_average",
      },
    }
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    compound = Compound.from_smiles "CC(C)(C)CN"
    prediction = model.predict compound
    # parentheses avoid Ruby's "ambiguous first argument" warning for a
    # negative literal passed to a command-style call
    assert_equal(-0.86, prediction[:value].round(2))
    assert_equal model.substance_ids.size, prediction[:neighbors].size
  end

  # Explicit MP2D fingerprint descriptors; all other settings unspecified.
  def test_mpd_fingerprints
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
    }
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    compound = Compound.from_smiles "CCCSCCSCC"
    prediction = model.predict compound
    assert_equal 4, prediction[:neighbors].size
    assert_equal 1.37, prediction[:value].round(2)
  end

  # Calculated OpenBabel properties with weighted cosine similarity.
  def test_local_physchem_regression
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
    }
    model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
    compound = Compound.from_smiles "NC(=O)OCCC"
    prediction = model.predict compound
    refute_nil prediction[:value]
  end

  # Same as test_local_physchem_regression, plus the correlation filter
  # as feature selection method.
  def test_local_physchem_regression_with_feature_selection
    training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
    model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
    compound = Compound.from_smiles "NC(=O)OCCC"
    prediction = model.predict compound
    refute_nil prediction[:value]
  end

  # Calculated properties with plain cosine similarity; the unspecified
  # prediction method and similarity threshold must take the values
  # asserted below.
  def test_unweighted_cosine_physchem_regression
    algorithms = {
      :descriptors => {
        :method => "calculate_properties",
        :features => PhysChem.openbabel_descriptors,
      },
      :similarity => {
        :method => "Algorithm::Similarity.cosine",
      }
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarRegression, model
    assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method]
    assert_equal 0.1, model.algorithms[:similarity][:min]
    # the comparison below is made without the :features entry
    algorithms[:descriptors].delete :features
    assert_equal algorithms[:descriptors], model.algorithms[:descriptors]
    prediction = model.predict training_dataset.substances[10]
    refute_nil prediction[:value]
  end

  # Only the feature selection method is supplied; the remaining settings
  # must take the values asserted below.
  def test_regression_with_feature_selection
    algorithms = {
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarRegression, model
    assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_equal 0.1, model.algorithms[:similarity][:min]
    assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
    prediction = model.predict training_dataset.substances[10]
    refute_nil prediction[:value]
  end

  # Fully specified algorithms hash; the model must keep the supplied
  # prediction method, similarity method and similarity threshold.
  def test_regression_parameters
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D"
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.3
      },
      :prediction => {
        :method => "Algorithm::Regression.weighted_average",
      },
      :feature_selection => nil,
    }
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
    model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
    assert_kind_of Model::LazarRegression, model
    assert_equal "Algorithm::Regression.weighted_average", model.algorithms[:prediction][:method]
    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
    assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
    # no :parameters were supplied above, so none are expected in the model;
    # assert_nil replaces an assert_equal whose expected value was nil
    assert_nil model.algorithms[:prediction][:parameters]
    substance = training_dataset.substances[10]
    prediction = model.predict substance
    assert_equal 0.83, prediction[:value].round(2)
  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests a validation model created directly from a CSV file.
#
# Uses Minitest::Test; the older MiniTest constant is only a compatibility
# alias and is no longer defined by default in recent Minitest releases.
class ValidationModelTest < Minitest::Test

  # The model must expose endpoint, species and source, be a classification
  # model, reach the accuracy threshold in each of its crossvalidations and
  # predict the example compound as "true".
  def test_validation_model
    m = Model::Validation.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    [:endpoint,:species,:source].each do |p|
      refute_empty m[p]
    end
    assert m.classification?
    refute m.regression?
    m.crossvalidations.each do |cv|
      # the failure message now states the threshold that is actually checked
      assert_operator cv.accuracy, :>, 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.74. This may happen due to an unfavorable training/test set split."
    end
    prediction = m.predict Compound.from_smiles("CCCC(NN)C")
    assert_equal "true", prediction[:value]
    m.delete
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests nanomaterial validation models created with default and with
# user-supplied algorithms.
#
# Uses Minitest::Test; the older MiniTest constant is only a compatibility
# alias and is no longer defined by default in recent Minitest releases.
class NanomaterialValidationModelTest < Minitest::Test

  # Loads the training dataset by name and selects its
  # 'log2(Net cell association)' feature.
  def setup
    @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
    @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
  end

  # Validation model created without arguments.
  def test_default_nanomaterial_validation_model
    validation_model = Model::NanoValidation.create
    [:endpoint,:species,:source].each do |p|
      refute_empty validation_model[p]
    end
    assert_regression_with_statistics validation_model
    assert_prediction_covers_measurements validation_model
    validation_model.delete
  end

  # Validation model created with an explicit algorithms hash.
  def test_nanomaterial_validation_model_parameters
    algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :similarity => {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1
      },
      :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
      :feature_selection => nil
    }
    validation_model = Model::NanoValidation.create algorithms: algorithms
    assert_regression_with_statistics validation_model
    assert_prediction_covers_measurements validation_model
    # cleanup, consistent with test_default_nanomaterial_validation_model
    validation_model.delete
  end

  private

  # The model must be a regression model and each of its crossvalidations
  # must report r_squared and rmse.
  def assert_regression_with_statistics validation_model
    assert validation_model.regression?
    refute validation_model.classification?
    validation_model.crossvalidations.each do |cv|
      refute_nil cv.r_squared
      refute_nil cv.rmse
    end
  end

  # Predicts one nanoparticle of the training dataset and checks that the
  # prediction has a value and that the median of the measurements lies
  # within the reported prediction interval.
  def assert_prediction_covers_measurements validation_model
    nanoparticle = @training_dataset.nanoparticles[-34]
    assert_includes nanoparticle.dataset_ids, @training_dataset.id
    prediction = validation_model.predict nanoparticle
    refute_nil prediction[:value]
    interval = prediction[:prediction_interval]
    assert_includes interval[0]..interval[1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
  end
end
|
data/test/setup.rb
CHANGED
@@ -1,9 +1,13 @@
|
|
1
1
|
ENV["LAZAR_ENV"] = "development"
|
2
2
|
require 'minitest/autorun'
|
3
|
-
|
4
|
-
require 'lazar'
|
3
|
+
require_relative '../lib/lazar.rb'
|
4
|
+
#require 'lazar'
|
5
5
|
include OpenTox
|
6
6
|
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
|
7
7
|
DATA_DIR ||= File.join(TEST_DIR,"data")
|
8
|
-
|
9
|
-
|
8
|
+
training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
|
9
|
+
unless training_dataset
|
10
|
+
Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
|
11
|
+
end
|
12
|
+
#$mongo.database.drop
|
13
|
+
#$gridfs = $mongo.database.fs
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests crossvalidation, leave-one-out validation and repeated
# crossvalidation of classification models.
#
# Uses Minitest::Test; the older MiniTest constant is only a compatibility
# alias and is no longer defined by default in recent Minitest releases.
class ValidationClassificationTest < Minitest::Test
  include OpenTox::Validation

  # defaults

  # Crossvalidation of a default model: checks accuracy thresholds and
  # writes the probability plot in two formats.
  def test_default_classification_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::Lazar.create training_dataset: dataset
    cv = ClassificationCrossValidation.create model
    assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split"
    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than accuracy (#{cv.accuracy})."
    # The plots are written to /tmp and the output of `file -b` is printed
    # for manual inspection; nothing is asserted about the plot content.
    %w[pdf png].each do |format|
      path = "/tmp/tmp.#{format}"
      File.open(path,"w+"){|f| f.puts cv.probability_plot(format:format)}
      p `file -b #{path}`
    end
  end

  # parameters

  # Each validation model of a crossvalidation must be trained on its own
  # dataset but with the same settings as the validated model.
  def test_classification_crossvalidation_parameters
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    algorithms = {
      :similarity => { :min => 0.3, },
      :descriptors => { :type => "FP3" }
    }
    model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms
    cv = ClassificationCrossValidation.create model
    refute_nil model.training_dataset_id

    # The former comparison looked up "min_sim", "type" and
    # "prediction_feature_id" at the top level of the algorithms hash, where
    # no such keys exist, so it only compared nil with nil. Compare the
    # supplied settings themselves instead.
    compared_settings = [[:similarity, :min], [:descriptors, :type]]
    cv.validations.each do |validation|
      refute_nil validation.model.training_dataset_id
      refute_equal model.training_dataset_id, validation.model.training_dataset_id
      compared_settings.each do |section, key|
        expected = algorithm_setting(model.algorithms, section, key)
        refute_nil expected, "Model has no #{section}/#{key} setting"
        assert_equal expected, algorithm_setting(validation.model.algorithms, section, key)
      end
      assert_equal model.prediction_feature_id, validation.model.prediction_feature_id
    end
  end

  # LOO

  # Leave-one-out validation of a default model.
  def test_classification_loo_validation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::Lazar.create training_dataset: dataset
    loo = ClassificationLeaveOneOut.create model
    assert_equal 14, loo.nr_unpredicted
    refute_empty loo.confusion_matrix
    assert loo.accuracy > 0.77
    assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})."
  end

  # repeated CV

  # Every crossvalidation of a repeated crossvalidation must exceed the
  # accuracy threshold.
  def test_repeated_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::Lazar.create training_dataset: dataset
    repeated_cv = RepeatedCrossValidation.create model
    repeated_cv.crossvalidations.each do |cv|
      assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
    end
  end

  private

  # Returns algorithms[section][key], trying symbol keys first and string
  # keys second; nil when the section or the key is missing.
  # NOTE(review): the string fallback assumes a model reloaded from the
  # database may return string-keyed hashes - confirm.
  def algorithm_setting algorithms, section, key
    settings = algorithms[section] || algorithms[section.to_s]
    return nil unless settings
    settings.key?(key) ? settings[key] : settings[key.to_s]
  end

end
|