lazar 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
data/lib/validation.rb ADDED
@@ -0,0 +1,114 @@
1
module OpenTox

  # Outcome of evaluating a model on a held-out test dataset.
  #
  # Stores the ids of the prediction and test datasets, the number of test
  # instances, the number of instances without a usable prediction, and the
  # individual predictions.
  #
  # NOTE(review): `field` is not defined in this file; presumably
  # Mongoid::Document is mixed into this class elsewhere -- confirm.
  class Validation

    field :prediction_dataset_id, type: BSON::ObjectId
    field :test_dataset_id, type: BSON::ObjectId
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    # Rows of [compound_id, activity, prediction, confidence], stored by the
    # subclasses in order of descending confidence.
    field :predictions, type: Array

    # @return the dataset looked up by prediction_dataset_id
    def prediction_dataset
      Dataset.find prediction_dataset_id
    end

    # @return the dataset looked up by test_dataset_id
    def test_dataset
      Dataset.find test_dataset_id
    end

  end

  # Validation of a classification model, including plain and
  # confidence-weighted confusion matrices.
  class ClassificationValidation < Validation
    # The values assigned here are indexed and sized like an Array, so the
    # field is declared as Array (a String field would not round-trip them).
    field :accept_values, type: Array
    field :confusion_matrix, type: Array
    field :weighted_confusion_matrix, type: Array

    # Builds a fresh model of the same class as +model+ from +training_set+,
    # predicts +test_set+ (with activities stripped) and saves the result.
    #
    # Matrices are indexed [predicted class][class on the other axis]; only
    # the first two accept values are handled.
    #
    # @param model the model whose class is used to create the validation model
    # @param training_set dataset used to create the validation model
    # @param test_set dataset providing compounds and measured activities
    # @return [ClassificationValidation] the saved validation
    def self.create(model, training_set, test_set)
      # TODO check and delegate feature calculation to Algorithm
      validation_model = model.class.create training_set
      # just to be sure that activities cannot be used
      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids)
      prediction_dataset = validation_model.predict test_set_without_activities
      accept_values = prediction_dataset.prediction_feature.accept_values

      # Block form only: every row must be its own Array instance.
      nr_classes = accept_values.size
      confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }
      weighted_confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }

      predictions = []
      nr_unpredicted = 0
      # Look these up once instead of on every iteration.
      test_entries = test_set.data_entries
      compound_ids = prediction_dataset.compound_ids

      prediction_dataset.data_entries.each_with_index do |pe, i|
        prediction, confidence = pe[0], pe[1]
        usable = prediction && confidence && confidence.numeric?
        unless usable
          # Every entry that does not end up in +predictions+ is counted,
          # matching RegressionValidation.
          nr_unpredicted += 1
          next
        end

        # TODO prediction_feature, convention??
        # TODO generalize for multiple classes
        activity = test_entries[i].first
        predictions << [compound_ids[i], activity, prediction, confidence]

        row = accept_values.first(2).index(prediction)
        next unless row # prediction is neither of the first two accept values

        # Correct predictions land on the diagonal, wrong ones in the other
        # column of the predicted row.
        col = prediction == activity ? row : 1 - row
        confusion_matrix[row][col] += 1
        weighted_confusion_matrix[row][col] += confidence
      end

      validation = self.new(
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
        :nr_unpredicted => nr_unpredicted,
        :accept_values => accept_values,
        :confusion_matrix => confusion_matrix,
        :weighted_confusion_matrix => weighted_confusion_matrix,
        :predictions => predictions.sort { |a, b| b[3] <=> a[3] } # sort according to confidence
      )
      validation.save
      validation
    end
  end

  # Validation of a regression model.
  class RegressionValidation < Validation

    # Builds a Model::LazarRegression from +training_set+, predicts
    # +test_set+ (with activities stripped) and saves the result.
    #
    # NOTE(review): +model+ is not used; unlike ClassificationValidation the
    # model class is fixed to Model::LazarRegression -- confirm intended.
    #
    # @param model unused, kept for a signature parallel to ClassificationValidation
    # @param training_set dataset used to create the validation model
    # @param test_set dataset providing compounds and measured activities
    # @return [RegressionValidation] the saved validation
    def self.create(model, training_set, test_set)
      validation_model = Model::LazarRegression.create training_set
      # just to be sure that activities cannot be used
      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids)
      prediction_dataset = validation_model.predict test_set_without_activities

      predictions = []
      nr_unpredicted = 0
      activities = test_set.data_entries.collect { |de| de.first }
      compound_ids = prediction_dataset.compound_ids

      prediction_dataset.data_entries.each_with_index do |de, i|
        prediction, confidence = de[0], de[1]
        if prediction && confidence && confidence.numeric?
          predictions << [compound_ids[i], activities[i], prediction, confidence]
        else
          nr_unpredicted += 1
        end
      end

      validation = self.new(
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
        :nr_unpredicted => nr_unpredicted,
        :predictions => predictions.sort { |a, b| b[3] <=> a[3] } # sort according to confidence
      )
      validation.save
      validation
    end
  end

end
data/mongoid.yml ADDED
@@ -0,0 +1,8 @@
1
# Mongoid configuration; only a "development" environment is defined here.
development:
  clients:
    default:
      # Database used by the default client.
      database: opentox
      hosts:
        - "localhost:27017"
  options:
    # NOTE(review): per the Mongoid documentation, false makes lookups of
    # missing documents return nil instead of raising -- confirm for the
    # Mongoid version in use.
    raise_not_found_error: false
data/test/all.rb ADDED
@@ -0,0 +1,5 @@
1
# Loads every test file that sits next to this script.
#
# Files are matched against the exclusion list by basename and required by
# absolute path, so the script works from any working directory (not only
# when started from inside the test directory).
exclude = ["setup.rb", "all.rb", File.basename(__FILE__)]
test_dir = File.expand_path(File.dirname(__FILE__))

# Sorted so the load order does not depend on the file system.
Dir[File.join(test_dir, "*.rb")].sort.each do |test|
  next if exclude.include? File.basename(test)
  p test
  require test
end
data/test/compound.rb ADDED
@@ -0,0 +1,100 @@
1
+ require_relative "setup.rb"
2
+
3
# Tests for OpenTox::Compound: construction from SMILES, names, InChI and
# SDF, identifier lookups, image rendering, fingerprints and neighbors.
class CompoundTest < Minitest::Test

  def test_0_compound_from_smiles
    c = OpenTox::Compound.from_smiles "F[B-](F)(F)F.[Na+]"
    assert_equal "InChI=1S/BF4.Na/c2-1(3,4)5;/q-1;+1", c.inchi.chomp
    assert_equal "F[B-](F)(F)F.[Na+]", c.smiles, "A failure here might be caused by a compound webservice running on 64bit architectures using an outdated version of OpenBabel. Please install OpenBabel version 2.3.2 or higher." # seems to be fixed in 2.3.2
  end

  def test_1_compound_from_smiles
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    assert_equal "InChI=1S/C6H9NO/c1-5(4-7)3-6(2)8/h5H,3H2,1-2H3", c.inchi
    assert_equal "CC(C#N)CC(=O)C", c.smiles
  end

  def test_2_compound_from_smiles
    c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
    assert_equal "InChI=1S/C6H5N2.BF4/c7-8-6-4-2-1-3-5-6;2-1(3,4)5/h1-5H;/q+1;-1", c.inchi
    assert_equal "F[B-](F)(F)F.N#[N+]c1ccccc1", c.smiles
  end

  def test_compound_from_name
    c = OpenTox::Compound.from_name "Benzene"
    assert_equal "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H", c.inchi
    assert_equal "c1ccccc1", c.smiles
  end

  def test_compound_from_inchi
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    assert_equal "c1ccccc1", c.smiles
  end

  def test_sdf_import
    c = OpenTox::Compound.from_sdf File.read(File.join(DATA_DIR, "acetaldehyde.sdf"))
    assert_equal "InChI=1S/C2H4O/c1-2-3/h2H,1H3", c.inchi
    assert_equal "CC=O", c.smiles
    assert_includes c.names, "Acetylaldehyde"
  end

  def test_sdf_export
    c = OpenTox::Compound.from_smiles "CC=O"
    # NOTE(review): the whitespace inside this pattern should be checked
    # against the column layout of a real SDF counts line.
    assert_match(/7 6 0 0 0 0 0 0 0 0999 V2000/, c.sdf)
  end

  def test_compound_image
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    testbild = "/tmp/testbild.png"
    # Binary write: no newline appended and no text-mode translation.
    File.binwrite(testbild, c.png)
    assert_match "image/png", `file -b --mime-type #{testbild}`
  ensure
    # Remove the temporary image even when an assertion above fails.
    File.unlink(testbild) if testbild && File.exist?(testbild)
  end

  def test_inchikey
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
  end

  def test_cid
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    assert_equal "241", c.cid
  end

  def test_chemblid
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    #assert_equal "CHEMBL277500", c.chemblid
    assert_equal "CHEMBL581676", c.chemblid
  end

  def test_sdf_storage
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    c.sdf
    refute_nil c.sdf_id
  end

  def test_fingerprint
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"

    feature_names = c.fp4.collect { |fid| Feature.find(fid).name }
    assert_includes feature_names, "1,3-Tautomerizable"
    assert_equal c.fp4.size, c.fp4_size
  end

  def test_neighbors
    d = Dataset.from_csv_file File.join(DATA_DIR, "EPAFHM.csv")
    d.compounds.each do |c|
      refute_nil c.fp4
    end
    c = d.compounds[371]
    assert_operator c.neighbors.size, :>=, 19
  end

  def test_openbabel_segfault
    inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/b13-5-/t11-,12-,14-,18-,19?/m1/s1"

    c = Compound.from_inchi(inchi)
    assert_equal inchi, c.inchi
  end
end