lazar 0.9.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
data/test/regression.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
require_relative "setup.rb"
|
2
|
-
|
3
|
-
class LazarRegressionTest < MiniTest::Test
|
4
|
-
|
5
|
-
def test_weighted_average
|
6
|
-
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
7
|
-
model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average"}
|
8
|
-
compound = Compound.from_smiles "CC(C)(C)CN"
|
9
|
-
prediction = model.predict compound
|
10
|
-
assert_equal 7.2, prediction[:value].round(1)
|
11
|
-
assert_equal 88, prediction[:neighbors].size
|
12
|
-
end
|
13
|
-
|
14
|
-
def test_mpd_fingerprints
|
15
|
-
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
16
|
-
model = Model::LazarRegression.create training_dataset
|
17
|
-
model.neighbor_algorithm_parameters[:type] = "MP2D"
|
18
|
-
compound = Compound.from_smiles "CCCSCCSCC"
|
19
|
-
prediction = model.predict compound
|
20
|
-
assert_equal 0.04, prediction[:value].round(2)
|
21
|
-
assert_equal 3, prediction[:neighbors].size
|
22
|
-
end
|
23
|
-
|
24
|
-
def test_local_fingerprint_regression
|
25
|
-
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
26
|
-
model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
|
27
|
-
compound = Compound.from_smiles "NC(=O)OCCC"
|
28
|
-
prediction = model.predict compound
|
29
|
-
p prediction
|
30
|
-
refute_nil prediction[:value]
|
31
|
-
refute_nil prediction[:prediction_interval]
|
32
|
-
refute_empty prediction[:neighbors]
|
33
|
-
end
|
34
|
-
|
35
|
-
def test_local_physchem_regression
|
36
|
-
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
37
|
-
model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
|
38
|
-
compound = Compound.from_smiles "NC(=O)OCCC"
|
39
|
-
prediction = model.predict compound
|
40
|
-
refute_nil prediction[:value]
|
41
|
-
end
|
42
|
-
|
43
|
-
end
|
data/test/validation.rb
DELETED
@@ -1,108 +0,0 @@
|
|
1
|
-
require_relative "setup.rb"
|
2
|
-
|
3
|
-
class ValidationTest < MiniTest::Test
|
4
|
-
|
5
|
-
# defaults
|
6
|
-
|
7
|
-
def test_default_classification_crossvalidation
|
8
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
9
|
-
model = Model::LazarClassification.create dataset
|
10
|
-
cv = ClassificationCrossValidation.create model
|
11
|
-
assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
|
12
|
-
end
|
13
|
-
|
14
|
-
def test_default_regression_crossvalidation
|
15
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
16
|
-
model = Model::LazarRegression.create dataset
|
17
|
-
cv = RegressionCrossValidation.create model
|
18
|
-
assert cv.rmse < 1.5, "RMSE > 1.5"
|
19
|
-
assert cv.mae < 1
|
20
|
-
end
|
21
|
-
|
22
|
-
# parameters
|
23
|
-
|
24
|
-
def test_classification_crossvalidation_parameters
|
25
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
26
|
-
params = {
|
27
|
-
:training_dataset_id => dataset.id,
|
28
|
-
:neighbor_algorithm_parameters => {
|
29
|
-
:min_sim => 0.3,
|
30
|
-
:type => "FP3"
|
31
|
-
}
|
32
|
-
}
|
33
|
-
model = Model::LazarClassification.create dataset, params
|
34
|
-
model.save
|
35
|
-
cv = ClassificationCrossValidation.create model
|
36
|
-
params = model.neighbor_algorithm_parameters
|
37
|
-
params.delete :training_dataset_id
|
38
|
-
params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string
|
39
|
-
|
40
|
-
cv.validations.each do |validation|
|
41
|
-
validation_params = validation.model.neighbor_algorithm_parameters
|
42
|
-
validation_params.delete "training_dataset_id"
|
43
|
-
assert_equal params, validation_params
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def test_regression_crossvalidation_params
|
48
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
|
49
|
-
params = {
|
50
|
-
:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average",
|
51
|
-
:neighbor_algorithm => "fingerprint_neighbors",
|
52
|
-
:neighbor_algorithm_parameters => {
|
53
|
-
:type => "MACCS",
|
54
|
-
:min_sim => 0.7,
|
55
|
-
}
|
56
|
-
}
|
57
|
-
model = Model::LazarRegression.create dataset, params
|
58
|
-
cv = RegressionCrossValidation.create model
|
59
|
-
cv.validation_ids.each do |vid|
|
60
|
-
model = Model::Lazar.find(Validation.find(vid).model_id)
|
61
|
-
assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type]
|
62
|
-
assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim]
|
63
|
-
refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
|
64
|
-
end
|
65
|
-
|
66
|
-
refute_nil cv.rmse
|
67
|
-
refute_nil cv.mae
|
68
|
-
end
|
69
|
-
|
70
|
-
def test_physchem_regression_crossvalidation
|
71
|
-
|
72
|
-
training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
|
73
|
-
model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
|
74
|
-
cv = RegressionCrossValidation.create model
|
75
|
-
refute_nil cv.rmse
|
76
|
-
refute_nil cv.mae
|
77
|
-
end
|
78
|
-
|
79
|
-
# LOO
|
80
|
-
|
81
|
-
def test_classification_loo_validation
|
82
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
83
|
-
model = Model::LazarClassification.create dataset
|
84
|
-
loo = ClassificationLeaveOneOutValidation.create model
|
85
|
-
assert_equal 14, loo.nr_unpredicted
|
86
|
-
refute_empty loo.confusion_matrix
|
87
|
-
assert loo.accuracy > 0.77
|
88
|
-
end
|
89
|
-
|
90
|
-
def test_regression_loo_validation
|
91
|
-
dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
|
92
|
-
model = Model::LazarRegression.create dataset
|
93
|
-
loo = RegressionLeaveOneOutValidation.create model
|
94
|
-
assert loo.r_squared > 0.34
|
95
|
-
end
|
96
|
-
|
97
|
-
# repeated CV
|
98
|
-
|
99
|
-
def test_repeated_crossvalidation
|
100
|
-
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
101
|
-
model = Model::LazarClassification.create dataset
|
102
|
-
repeated_cv = RepeatedCrossValidation.create model
|
103
|
-
repeated_cv.crossvalidations.each do |cv|
|
104
|
-
assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
end
|