lazar 0.9.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
data/test/regression.rb DELETED
@@ -1,43 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class LazarRegressionTest < MiniTest::Test
4
-
5
- def test_weighted_average
6
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
7
- model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average"}
8
- compound = Compound.from_smiles "CC(C)(C)CN"
9
- prediction = model.predict compound
10
- assert_equal 7.2, prediction[:value].round(1)
11
- assert_equal 88, prediction[:neighbors].size
12
- end
13
-
14
- def test_mpd_fingerprints
15
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
16
- model = Model::LazarRegression.create training_dataset
17
- model.neighbor_algorithm_parameters[:type] = "MP2D"
18
- compound = Compound.from_smiles "CCCSCCSCC"
19
- prediction = model.predict compound
20
- assert_equal 0.04, prediction[:value].round(2)
21
- assert_equal 3, prediction[:neighbors].size
22
- end
23
-
24
- def test_local_fingerprint_regression
25
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
26
- model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
27
- compound = Compound.from_smiles "NC(=O)OCCC"
28
- prediction = model.predict compound
29
- p prediction
30
- refute_nil prediction[:value]
31
- refute_nil prediction[:prediction_interval]
32
- refute_empty prediction[:neighbors]
33
- end
34
-
35
- def test_local_physchem_regression
36
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
37
- model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
38
- compound = Compound.from_smiles "NC(=O)OCCC"
39
- prediction = model.predict compound
40
- refute_nil prediction[:value]
41
- end
42
-
43
- end
data/test/validation.rb DELETED
@@ -1,108 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class ValidationTest < MiniTest::Test
4
-
5
- # defaults
6
-
7
- def test_default_classification_crossvalidation
8
- dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
9
- model = Model::LazarClassification.create dataset
10
- cv = ClassificationCrossValidation.create model
11
- assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
12
- end
13
-
14
- def test_default_regression_crossvalidation
15
- dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
16
- model = Model::LazarRegression.create dataset
17
- cv = RegressionCrossValidation.create model
18
- assert cv.rmse < 1.5, "RMSE > 1.5"
19
- assert cv.mae < 1
20
- end
21
-
22
- # parameters
23
-
24
- def test_classification_crossvalidation_parameters
25
- dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
26
- params = {
27
- :training_dataset_id => dataset.id,
28
- :neighbor_algorithm_parameters => {
29
- :min_sim => 0.3,
30
- :type => "FP3"
31
- }
32
- }
33
- model = Model::LazarClassification.create dataset, params
34
- model.save
35
- cv = ClassificationCrossValidation.create model
36
- params = model.neighbor_algorithm_parameters
37
- params.delete :training_dataset_id
38
- params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string
39
-
40
- cv.validations.each do |validation|
41
- validation_params = validation.model.neighbor_algorithm_parameters
42
- validation_params.delete "training_dataset_id"
43
- assert_equal params, validation_params
44
- end
45
- end
46
-
47
- def test_regression_crossvalidation_params
48
- dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
49
- params = {
50
- :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average",
51
- :neighbor_algorithm => "fingerprint_neighbors",
52
- :neighbor_algorithm_parameters => {
53
- :type => "MACCS",
54
- :min_sim => 0.7,
55
- }
56
- }
57
- model = Model::LazarRegression.create dataset, params
58
- cv = RegressionCrossValidation.create model
59
- cv.validation_ids.each do |vid|
60
- model = Model::Lazar.find(Validation.find(vid).model_id)
61
- assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type]
62
- assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim]
63
- refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
64
- end
65
-
66
- refute_nil cv.rmse
67
- refute_nil cv.mae
68
- end
69
-
70
- def test_physchem_regression_crossvalidation
71
-
72
- training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
73
- model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
74
- cv = RegressionCrossValidation.create model
75
- refute_nil cv.rmse
76
- refute_nil cv.mae
77
- end
78
-
79
- # LOO
80
-
81
- def test_classification_loo_validation
82
- dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
83
- model = Model::LazarClassification.create dataset
84
- loo = ClassificationLeaveOneOutValidation.create model
85
- assert_equal 14, loo.nr_unpredicted
86
- refute_empty loo.confusion_matrix
87
- assert loo.accuracy > 0.77
88
- end
89
-
90
- def test_regression_loo_validation
91
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
92
- model = Model::LazarRegression.create dataset
93
- loo = RegressionLeaveOneOutValidation.create model
94
- assert loo.r_squared > 0.34
95
- end
96
-
97
- # repeated CV
98
-
99
- def test_repeated_crossvalidation
100
- dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
101
- model = Model::LazarClassification.create dataset
102
- repeated_cv = RepeatedCrossValidation.create model
103
- repeated_cv.crossvalidations.each do |cv|
104
- assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
105
- end
106
- end
107
-
108
- end