lazar 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
data/lib/validation.rb ADDED
@@ -0,0 +1,114 @@
module OpenTox

  # Record of one validation run: which datasets were involved, how many
  # instances were tested / left unpredicted, and the individual predictions.
  #
  # NOTE(review): the `field` DSL is not defined in this file; it is presumably
  # provided by Mongoid::Document mixed in elsewhere - confirm.
  class Validation

    field :prediction_dataset_id, type: BSON::ObjectId
    field :test_dataset_id, type: BSON::ObjectId
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    # Rows of [compound_id, activity, prediction, confidence], sorted by
    # descending confidence (see the `create` methods of the subclasses).
    field :predictions, type: Array

    # @return the Dataset looked up by prediction_dataset_id
    def prediction_dataset
      Dataset.find prediction_dataset_id
    end

    # @return the Dataset looked up by test_dataset_id
    def test_dataset
      Dataset.find test_dataset_id
    end

  end

  # Validation of a two-class classification model, including plain and
  # confidence-weighted confusion matrices.
  class ClassificationValidation < Validation
    # Class labels; indexed as an array below, hence stored as Array.
    field :accept_values, type: Array
    # confusion_matrix[p][a]: p is the index of the predicted class in
    # accept_values; a equals p for correct predictions and is the other
    # index for incorrect ones.
    field :confusion_matrix, type: Array
    # Same layout as confusion_matrix, but summing confidences instead of counts.
    field :weighted_confusion_matrix, type: Array

    # Builds a fresh model of the same class as `model` from `training_set`,
    # predicts the compounds of `test_set` and stores the outcome.
    #
    # @param model model whose class is used to create the validation model
    # @param training_set dataset used to build the validation model
    # @param test_set dataset whose compounds are predicted and whose first
    #   data entry column is used as the measured activity
    # @return [ClassificationValidation] the saved validation
    def self.create(model, training_set, test_set)
      #feature_dataset = Dataset.find model.feature_dataset_id
      # TODO check and delegate to Algorithm
      #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
      validation_model = model.class.create training_set#, features
      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
      prediction_dataset = validation_model.predict test_set_without_activities
      accept_values = prediction_dataset.prediction_feature.accept_values
      # Block form only: combining a default value with a block triggers a
      # "block supersedes default value argument" warning.
      confusion_matrix = Array.new(accept_values.size) { Array.new(accept_values.size, 0) }
      weighted_confusion_matrix = Array.new(accept_values.size) { Array.new(accept_values.size, 0) }
      predictions = []
      nr_unpredicted = 0
      test_entries = test_set.data_entries
      prediction_dataset.data_entries.each_with_index do |pe, i|
        if pe[0] && pe[1] && pe[1].numeric?
          prediction = pe[0]
          # TODO prediction_feature, convention??
          # TODO generalize for multiple classes
          activity = test_entries[i].first
          confidence = pe[1]
          predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
          # Only the first two accept_values are considered (two-class case).
          row = [0, 1].find { |k| prediction == accept_values[k] }
          if row
            # Correct predictions go on the diagonal, incorrect ones into the
            # other column of the predicted class' row.
            col = prediction == activity ? row : 1 - row
            confusion_matrix[row][col] += 1
            weighted_confusion_matrix[row][col] += confidence
          end
        elsif pe[0].nil?
          # Entries that have a prediction but no numeric confidence are
          # skipped without being counted as unpredicted.
          nr_unpredicted += 1
        end
      end
      validation = self.new(
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
        :nr_unpredicted => nr_unpredicted,
        :accept_values => accept_values,
        :confusion_matrix => confusion_matrix,
        :weighted_confusion_matrix => weighted_confusion_matrix,
        :predictions => predictions.sort { |a, b| b[3] <=> a[3] } # sort according to confidence
      )
      validation.save
      validation
    end
  end

  # Validation of a regression model.
  class RegressionValidation < Validation
    # Builds a Model::LazarRegression from `training_set`, predicts the
    # compounds of `test_set` and stores the outcome.
    #
    # NOTE(review): `model` is accepted for signature parity with
    # ClassificationValidation.create but is not used here; the model class is
    # hard-coded - confirm this is intended.
    #
    # @param model unused
    # @param training_set dataset used to build the validation model
    # @param test_set dataset whose compounds are predicted and whose first
    #   data entry column is used as the measured activity
    # @return [RegressionValidation] the saved validation
    def self.create(model, training_set, test_set)
      validation_model = Model::LazarRegression.create training_set
      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
      prediction_dataset = validation_model.predict test_set_without_activities
      predictions = []
      nr_unpredicted = 0
      activities = test_set.data_entries.collect { |de| de.first }
      prediction_dataset.data_entries.each_with_index do |de, i|
        if de[0] && de[1] && de[1].numeric?
          activity = activities[i]
          prediction = de.first
          confidence = de[1]
          predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
        else
          # Unlike the classification case, every entry without a usable
          # prediction/confidence pair is counted.
          nr_unpredicted += 1
        end
      end
      validation = self.new(
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
        :nr_unpredicted => nr_unpredicted,
        :predictions => predictions.sort { |a, b| b[3] <=> a[3] } # sort according to confidence
      )
      validation.save
      validation
    end
  end

end
data/mongoid.yml ADDED
@@ -0,0 +1,8 @@
1
+ development:
2
+ clients:
3
+ default:
4
+ database: opentox
5
+ hosts:
6
+ - localhost:27017
7
+ options:
8
+ raise_not_found_error: false
data/test/all.rb ADDED
@@ -0,0 +1,5 @@
# Loads every test file that sits next to this script, except the shared
# setup and this runner itself.
#
# Paths are made absolute and filtered by basename so the script behaves the
# same no matter which directory it is started from; the glob result is sorted
# so the load order does not depend on the filesystem.
exclude = ["setup.rb", "all.rb"]
test_dir = File.expand_path(File.dirname(__FILE__))
Dir[File.join(test_dir, "*.rb")].sort.each do |test|
  next if exclude.include?(File.basename(test))
  p test
  require test
end
data/test/compound.rb ADDED
@@ -0,0 +1,100 @@
require_relative "setup.rb"

# Tests for OpenTox::Compound: construction from SMILES/InChI/name/SDF,
# identifier lookups, image and SDF export, fingerprints and neighbors.
class CompoundTest < MiniTest::Test

  # Benzene, used by several tests below.
  BENZENE_INCHI = "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H".freeze

  def test_0_compound_from_smiles
    c = OpenTox::Compound.from_smiles "F[B-](F)(F)F.[Na+]"
    assert_equal "InChI=1S/BF4.Na/c2-1(3,4)5;/q-1;+1", c.inchi.chomp
    assert_equal "F[B-](F)(F)F.[Na+]", c.smiles, "A failure here might be caused by a compound webservice running on 64bit architectures using an outdated version of OpenBabel. Please install OpenBabel version 2.3.2 or higher." # seems to be fixed in 2.3.2
  end

  def test_1_compound_from_smiles
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    assert_equal "InChI=1S/C6H9NO/c1-5(4-7)3-6(2)8/h5H,3H2,1-2H3", c.inchi
    assert_equal "CC(C#N)CC(=O)C", c.smiles
  end

  def test_2_compound_from_smiles
    c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
    assert_equal "InChI=1S/C6H5N2.BF4/c7-8-6-4-2-1-3-5-6;2-1(3,4)5/h1-5H;/q+1;-1", c.inchi
    assert_equal "F[B-](F)(F)F.N#[N+]c1ccccc1", c.smiles
  end

  def test_compound_from_name
    c = OpenTox::Compound.from_name "Benzene"
    assert_equal BENZENE_INCHI, c.inchi
    assert_equal "c1ccccc1", c.smiles
  end

  def test_compound_from_inchi
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    assert_equal "c1ccccc1", c.smiles
  end

  def test_sdf_import
    c = OpenTox::Compound.from_sdf File.read(File.join(DATA_DIR, "acetaldehyde.sdf"))
    assert_equal "InChI=1S/C2H4O/c1-2-3/h2H,1H3", c.inchi
    assert_equal "CC=O", c.smiles
    assert_includes c.names, "Acetylaldehyde"
  end

  def test_sdf_export
    c = OpenTox::Compound.from_smiles "CC=O"
    # The V2000 counts line is fixed-width, so the fields are separated by a
    # variable number of blanks; match any run of whitespace between them.
    assert_match(/7\s+6\s+0\s+0\s+0\s+0\s+0\s+0\s+0\s+0999 V2000/, c.sdf)
  end

  def test_compound_image
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    testbild = "/tmp/testbild.png"
    # Binary mode and `write`: `puts` would append a newline to the image data.
    File.open(testbild, "wb") { |f| f.write c.png }
    assert_match "image/png", `file -b --mime-type #{testbild}`
  ensure
    # Clean up even when the assertion above fails.
    File.unlink(testbild) if testbild && File.exist?(testbild)
  end

  def test_inchikey
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
  end

  def test_cid
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    assert_equal "241", c.cid
  end

  def test_chemblid
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    #assert_equal "CHEMBL277500", c.chemblid
    assert_equal "CHEMBL581676", c.chemblid
  end

  def test_sdf_storage
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    # Called for its effect on sdf_id; the return value is not needed here.
    c.sdf
    refute_nil c.sdf_id
  end

  def test_fingerprint
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"

    feature_names = c.fp4.collect { |fid| Feature.find(fid).name }
    assert_includes feature_names, "1,3-Tautomerizable"
    assert_equal c.fp4.size, c.fp4_size
  end

  def test_neighbors
    d = Dataset.from_csv_file File.join(DATA_DIR, "EPAFHM.csv")
    d.compounds.each do |compound|
      refute_nil compound.fp4
    end
    c = d.compounds[371]
    assert_operator c.neighbors.size, :>=, 19
  end

  def test_openbabel_segfault
    inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/b13-5-/t11-,12-,14-,18-,19?/m1/s1"

    c = Compound.from_inchi(inchi)
    assert_equal inchi, c.inchi
  end
end