lazar 0.9.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
data/test/regression.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
require_relative "setup.rb"
|
2
|
-
|
3
|
-
class LazarRegressionTest < MiniTest::Test
|
4
|
-
|
5
|
-
def test_weighted_average
|
6
|
-
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
7
|
-
model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average"}
|
8
|
-
compound = Compound.from_smiles "CC(C)(C)CN"
|
9
|
-
prediction = model.predict compound
|
10
|
-
assert_equal 7.2, prediction[:value].round(1)
|
11
|
-
assert_equal 88, prediction[:neighbors].size
|
12
|
-
end
|
13
|
-
|
14
|
-
def test_mpd_fingerprints
|
15
|
-
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
16
|
-
model = Model::LazarRegression.create training_dataset
|
17
|
-
model.neighbor_algorithm_parameters[:type] = "MP2D"
|
18
|
-
compound = Compound.from_smiles "CCCSCCSCC"
|
19
|
-
prediction = model.predict compound
|
20
|
-
assert_equal 0.04, prediction[:value].round(2)
|
21
|
-
assert_equal 3, prediction[:neighbors].size
|
22
|
-
end
|
23
|
-
|
24
|
-
def test_local_fingerprint_regression
|
25
|
-
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
26
|
-
model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
|
27
|
-
compound = Compound.from_smiles "NC(=O)OCCC"
|
28
|
-
prediction = model.predict compound
|
29
|
-
p prediction
|
30
|
-
refute_nil prediction[:value]
|
31
|
-
refute_nil prediction[:prediction_interval]
|
32
|
-
refute_empty prediction[:neighbors]
|
33
|
-
end
|
34
|
-
|
35
|
-
def test_local_physchem_regression
|
36
|
-
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
37
|
-
model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
|
38
|
-
compound = Compound.from_smiles "NC(=O)OCCC"
|
39
|
-
prediction = model.predict compound
|
40
|
-
refute_nil prediction[:value]
|
41
|
-
end
|
42
|
-
|
43
|
-
end
|
data/test/validation.rb
DELETED
@@ -1,108 +0,0 @@
|
|
1
|
-
require_relative "setup.rb"
|
2
|
-
|
3
|
-
class ValidationTest < MiniTest::Test
|
4
|
-
|
5
|
-
# defaults
|
6
|
-
|
7
|
-
def test_default_classification_crossvalidation
|
8
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
9
|
-
model = Model::LazarClassification.create dataset
|
10
|
-
cv = ClassificationCrossValidation.create model
|
11
|
-
assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
|
12
|
-
end
|
13
|
-
|
14
|
-
def test_default_regression_crossvalidation
|
15
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
16
|
-
model = Model::LazarRegression.create dataset
|
17
|
-
cv = RegressionCrossValidation.create model
|
18
|
-
assert cv.rmse < 1.5, "RMSE > 1.5"
|
19
|
-
assert cv.mae < 1
|
20
|
-
end
|
21
|
-
|
22
|
-
# parameters
|
23
|
-
|
24
|
-
def test_classification_crossvalidation_parameters
|
25
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
26
|
-
params = {
|
27
|
-
:training_dataset_id => dataset.id,
|
28
|
-
:neighbor_algorithm_parameters => {
|
29
|
-
:min_sim => 0.3,
|
30
|
-
:type => "FP3"
|
31
|
-
}
|
32
|
-
}
|
33
|
-
model = Model::LazarClassification.create dataset, params
|
34
|
-
model.save
|
35
|
-
cv = ClassificationCrossValidation.create model
|
36
|
-
params = model.neighbor_algorithm_parameters
|
37
|
-
params.delete :training_dataset_id
|
38
|
-
params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string
|
39
|
-
|
40
|
-
cv.validations.each do |validation|
|
41
|
-
validation_params = validation.model.neighbor_algorithm_parameters
|
42
|
-
validation_params.delete "training_dataset_id"
|
43
|
-
assert_equal params, validation_params
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def test_regression_crossvalidation_params
|
48
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
49
|
-
params = {
|
50
|
-
:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average",
|
51
|
-
:neighbor_algorithm => "fingerprint_neighbors",
|
52
|
-
:neighbor_algorithm_parameters => {
|
53
|
-
:type => "MACCS",
|
54
|
-
:min_sim => 0.7,
|
55
|
-
}
|
56
|
-
}
|
57
|
-
model = Model::LazarRegression.create dataset, params
|
58
|
-
cv = RegressionCrossValidation.create model
|
59
|
-
cv.validation_ids.each do |vid|
|
60
|
-
model = Model::Lazar.find(Validation.find(vid).model_id)
|
61
|
-
assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type]
|
62
|
-
assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim]
|
63
|
-
refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
|
64
|
-
end
|
65
|
-
|
66
|
-
refute_nil cv.rmse
|
67
|
-
refute_nil cv.mae
|
68
|
-
end
|
69
|
-
|
70
|
-
def test_physchem_regression_crossvalidation
|
71
|
-
|
72
|
-
training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
|
73
|
-
model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
|
74
|
-
cv = RegressionCrossValidation.create model
|
75
|
-
refute_nil cv.rmse
|
76
|
-
refute_nil cv.mae
|
77
|
-
end
|
78
|
-
|
79
|
-
# LOO
|
80
|
-
|
81
|
-
def test_classification_loo_validation
|
82
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
83
|
-
model = Model::LazarClassification.create dataset
|
84
|
-
loo = ClassificationLeaveOneOutValidation.create model
|
85
|
-
assert_equal 14, loo.nr_unpredicted
|
86
|
-
refute_empty loo.confusion_matrix
|
87
|
-
assert loo.accuracy > 0.77
|
88
|
-
end
|
89
|
-
|
90
|
-
def test_regression_loo_validation
|
91
|
-
dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
|
92
|
-
model = Model::LazarRegression.create dataset
|
93
|
-
loo = RegressionLeaveOneOutValidation.create model
|
94
|
-
assert loo.r_squared > 0.34
|
95
|
-
end
|
96
|
-
|
97
|
-
# repeated CV
|
98
|
-
|
99
|
-
def test_repeated_crossvalidation
|
100
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
101
|
-
model = Model::LazarClassification.create dataset
|
102
|
-
repeated_cv = RepeatedCrossValidation.create model
|
103
|
-
repeated_cv.crossvalidations.each do |cv|
|
104
|
-
assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
end
|