lazar 0.9.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
data/test/regression.rb DELETED
@@ -1,43 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class LazarRegressionTest < MiniTest::Test
4
-
5
- def test_weighted_average
6
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
7
- model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average"}
8
- compound = Compound.from_smiles "CC(C)(C)CN"
9
- prediction = model.predict compound
10
- assert_equal 7.2, prediction[:value].round(1)
11
- assert_equal 88, prediction[:neighbors].size
12
- end
13
-
14
- def test_mpd_fingerprints
15
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
16
- model = Model::LazarRegression.create training_dataset
17
- model.neighbor_algorithm_parameters[:type] = "MP2D"
18
- compound = Compound.from_smiles "CCCSCCSCC"
19
- prediction = model.predict compound
20
- assert_equal 0.04, prediction[:value].round(2)
21
- assert_equal 3, prediction[:neighbors].size
22
- end
23
-
24
- def test_local_fingerprint_regression
25
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
26
- model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
27
- compound = Compound.from_smiles "NC(=O)OCCC"
28
- prediction = model.predict compound
29
- p prediction
30
- refute_nil prediction[:value]
31
- refute_nil prediction[:prediction_interval]
32
- refute_empty prediction[:neighbors]
33
- end
34
-
35
- def test_local_physchem_regression
36
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
37
- model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
38
- compound = Compound.from_smiles "NC(=O)OCCC"
39
- prediction = model.predict compound
40
- refute_nil prediction[:value]
41
- end
42
-
43
- end
data/test/validation.rb DELETED
@@ -1,108 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class ValidationTest < MiniTest::Test
4
-
5
- # defaults
6
-
7
- def test_default_classification_crossvalidation
8
- dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
9
- model = Model::LazarClassification.create dataset
10
- cv = ClassificationCrossValidation.create model
11
- assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
12
- end
13
-
14
- def test_default_regression_crossvalidation
15
- dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
16
- model = Model::LazarRegression.create dataset
17
- cv = RegressionCrossValidation.create model
18
- assert cv.rmse < 1.5, "RMSE > 1.5"
19
- assert cv.mae < 1
20
- end
21
-
22
- # parameters
23
-
24
- def test_classification_crossvalidation_parameters
25
- dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
26
- params = {
27
- :training_dataset_id => dataset.id,
28
- :neighbor_algorithm_parameters => {
29
- :min_sim => 0.3,
30
- :type => "FP3"
31
- }
32
- }
33
- model = Model::LazarClassification.create dataset, params
34
- model.save
35
- cv = ClassificationCrossValidation.create model
36
- params = model.neighbor_algorithm_parameters
37
- params.delete :training_dataset_id
38
- params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string
39
-
40
- cv.validations.each do |validation|
41
- validation_params = validation.model.neighbor_algorithm_parameters
42
- validation_params.delete "training_dataset_id"
43
- assert_equal params, validation_params
44
- end
45
- end
46
-
47
- def test_regression_crossvalidation_params
48
- dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
49
- params = {
50
- :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average",
51
- :neighbor_algorithm => "fingerprint_neighbors",
52
- :neighbor_algorithm_parameters => {
53
- :type => "MACCS",
54
- :min_sim => 0.7,
55
- }
56
- }
57
- model = Model::LazarRegression.create dataset, params
58
- cv = RegressionCrossValidation.create model
59
- cv.validation_ids.each do |vid|
60
- model = Model::Lazar.find(Validation.find(vid).model_id)
61
- assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type]
62
- assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim]
63
- refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
64
- end
65
-
66
- refute_nil cv.rmse
67
- refute_nil cv.mae
68
- end
69
-
70
- def test_physchem_regression_crossvalidation
71
-
72
- training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
73
- model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
74
- cv = RegressionCrossValidation.create model
75
- refute_nil cv.rmse
76
- refute_nil cv.mae
77
- end
78
-
79
- # LOO
80
-
81
- def test_classification_loo_validation
82
- dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
83
- model = Model::LazarClassification.create dataset
84
- loo = ClassificationLeaveOneOutValidation.create model
85
- assert_equal 14, loo.nr_unpredicted
86
- refute_empty loo.confusion_matrix
87
- assert loo.accuracy > 0.77
88
- end
89
-
90
- def test_regression_loo_validation
91
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
92
- model = Model::LazarRegression.create dataset
93
- loo = RegressionLeaveOneOutValidation.create model
94
- assert loo.r_squared > 0.34
95
- end
96
-
97
- # repeated CV
98
-
99
- def test_repeated_crossvalidation
100
- dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
101
- model = Model::LazarClassification.create dataset
102
- repeated_cv = RepeatedCrossValidation.create model
103
- repeated_cv.crossvalidations.each do |cv|
104
- assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
105
- end
106
- end
107
-
108
- end