lazar 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
data/lib/validation.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
class Validation

  # Ids of the dataset holding the predictions and of the dataset that was
  # used as test set, plus bookkeeping counters and the raw prediction rows.
  field :prediction_dataset_id, type: BSON::ObjectId
  field :test_dataset_id, type: BSON::ObjectId
  field :nr_instances, type: Integer
  field :nr_unpredicted, type: Integer
  field :predictions, type: Array

  # Look up the dataset referenced by prediction_dataset_id.
  # @return the result of Dataset.find for the stored id
  def prediction_dataset
    Dataset.find(prediction_dataset_id)
  end

  # Look up the dataset referenced by test_dataset_id.
  # @return the result of Dataset.find for the stored id
  def test_dataset
    Dataset.find(test_dataset_id)
  end

end
|
20
|
+
|
21
|
+
class ClassificationValidation < Validation
  # accept_values is indexed and sized like an array in .create, so it is
  # stored as an Array (a String field would not hold the list of values).
  field :accept_values, type: Array
  field :confusion_matrix, type: Array
  field :weighted_confusion_matrix, type: Array

  # Build a model of the same class as +model+ from +training_set+, predict
  # the compounds of +test_set+ and store the outcome as a new validation.
  #
  # Both matrices are accept_values.size x accept_values.size. The first
  # index is the position of the predicted value in accept_values; the second
  # index equals the first for a correct prediction and is the other class
  # otherwise (only the first two accept values are handled).
  #
  # @param model        object whose class responds to .create(training_set)
  # @param training_set dataset used to build the validation model
  # @param test_set     dataset providing compound_ids and measured activities
  # @return the saved validation object
  def self.create model, training_set, test_set
    #feature_dataset = Dataset.find model.feature_dataset_id
    # TODO check and delegate to Algorithm
    #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
    validation_model = model.class.create training_set#, features
    # just to be sure that activities cannot be used
    test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids)
    prediction_dataset = validation_model.predict test_set_without_activities
    accept_values = prediction_dataset.prediction_feature.accept_values

    nr_classes = accept_values.size
    confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }
    weighted_confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }
    predictions = []
    nr_unpredicted = 0

    prediction_dataset.data_entries.each_with_index do |pe, i|
      prediction, confidence = pe[0], pe[1]
      unless prediction and confidence and confidence.numeric?
        # Only entries without any prediction count as unpredicted; entries
        # with a prediction but no numeric confidence are skipped silently.
        nr_unpredicted += 1 if prediction.nil?
        next
      end

      # TODO prediction_feature, convention??
      # TODO generalize for multiple classes
      activity = test_set.data_entries[i].first
      predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]

      # Predictions outside the first two accept values are recorded above
      # but do not contribute to the matrices.
      row = [0, 1].find { |k| prediction == accept_values[k] }
      next if row.nil?
      col = prediction == activity ? row : 1 - row
      confusion_matrix[row][col] += 1
      weighted_confusion_matrix[row][col] += confidence
    end

    validation = self.new(
      :prediction_dataset_id => prediction_dataset.id,
      :test_dataset_id => test_set.id,
      :nr_instances => test_set.compound_ids.size,
      :nr_unpredicted => nr_unpredicted,
      :accept_values => accept_values,
      :confusion_matrix => confusion_matrix,
      :weighted_confusion_matrix => weighted_confusion_matrix,
      :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
    )
    validation.save
    validation
  end
end
|
82
|
+
|
83
|
+
class RegressionValidation < Validation
  # Train a Model::LazarRegression on +training_set+, predict the compounds
  # of +test_set+ and persist the collected predictions as a new validation.
  #
  # NOTE(review): +model+ is accepted but never read here; the model class is
  # fixed to Model::LazarRegression -- confirm whether this is intended.
  #
  # @param model        unused
  # @param training_set dataset used to build the validation model
  # @param test_set     dataset providing compound_ids and measured activities
  # @return the saved validation object
  def self.create model, training_set, test_set

    validation_model = Model::LazarRegression.create(training_set)
    # just to be sure that activities cannot be used
    test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids)
    prediction_dataset = validation_model.predict(test_set_without_activities)

    rows = []
    missing = 0
    measured = test_set.data_entries.collect { |entry| entry.first }

    prediction_dataset.data_entries.each_with_index do |entry, idx|
      # An entry is usable only with a prediction and a numeric confidence;
      # everything else is counted as unpredicted.
      unless entry[0] and entry[1] and entry[1].numeric?
        missing += 1
        next
      end
      rows << [prediction_dataset.compound_ids[idx], measured[idx], entry.first, entry[1]]
    end

    validation = self.new(
      :prediction_dataset_id => prediction_dataset.id,
      :test_dataset_id => test_set.id,
      :nr_instances => test_set.compound_ids.size,
      :nr_unpredicted => missing,
      :predictions => rows.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
    )
    validation.save
    validation
  end
end
|
113
|
+
|
114
|
+
end
|
data/mongoid.yml
ADDED
data/test/all.rb
ADDED
data/test/compound.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
# Tests for OpenTox::Compound: construction from SMILES, names, InChI and
# SDF, identifier lookups, image rendering, fingerprints and neighbors.
class CompoundTest < MiniTest::Test

  def test_0_compound_from_smiles
    c = OpenTox::Compound.from_smiles "F[B-](F)(F)F.[Na+]"
    assert_equal "InChI=1S/BF4.Na/c2-1(3,4)5;/q-1;+1", c.inchi.chomp
    assert_equal "F[B-](F)(F)F.[Na+]", c.smiles, "A failure here might be caused by a compound webservice running on 64bit architectures using an outdated version of OpenBabel. Please install OpenBabel version 2.3.2 or higher." # seems to be fixed in 2.3.2
  end

  def test_1_compound_from_smiles
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    assert_equal "InChI=1S/C6H9NO/c1-5(4-7)3-6(2)8/h5H,3H2,1-2H3", c.inchi
    assert_equal "CC(C#N)CC(=O)C", c.smiles
  end

  def test_2_compound_from_smiles
    c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
    assert_equal "InChI=1S/C6H5N2.BF4/c7-8-6-4-2-1-3-5-6;2-1(3,4)5/h1-5H;/q+1;-1", c.inchi
    assert_equal "F[B-](F)(F)F.N#[N+]c1ccccc1", c.smiles
  end

  def test_compound_from_name
    c = OpenTox::Compound.from_name "Benzene"
    assert_equal "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H", c.inchi
    assert_equal "c1ccccc1", c.smiles
  end

  def test_compound_from_inchi
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    assert_equal "c1ccccc1", c.smiles
  end

  def test_sdf_import
    c = OpenTox::Compound.from_sdf File.read(File.join(DATA_DIR, "acetaldehyde.sdf"))
    assert_equal "InChI=1S/C2H4O/c1-2-3/h2H,1H3", c.inchi
    assert_equal "CC=O", c.smiles
    assert c.names.include?("Acetylaldehyde")
  end

  def test_sdf_export
    c = OpenTox::Compound.from_smiles "CC=O"
    # NOTE(review): the whitespace inside this pattern may have been collapsed
    # in transit -- confirm it against the counts line of a real SDF export.
    assert_match(/7 6 0 0 0 0 0 0 0 0999 V2000/, c.sdf)
  end

  def test_compound_image
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    testbild = "/tmp/testbild.png"
    begin
      # PNG data is binary: write it verbatim instead of using text-mode puts,
      # which would append a newline to the image.
      File.open(testbild, "wb") { |f| f.write c.png }
      assert_match "image/png", `file -b --mime-type #{testbild}`
    ensure
      # remove the temporary image even when the assertion fails
      File.unlink(testbild) if File.exist?(testbild)
    end
  end

  def test_inchikey
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
  end

  def test_cid
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    assert_equal "241", c.cid
  end

  def test_chemblid
    c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
    #assert_equal "CHEMBL277500", c.chemblid
    assert_equal "CHEMBL581676", c.chemblid
  end

  def test_sdf_storage
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    c.sdf # called before sdf_id is checked
    refute_nil c.sdf_id
  end

  def test_fingerprint
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"

    feature_names = c.fp4.collect { |fid| Feature.find(fid).name }
    assert_includes feature_names, "1,3-Tautomerizable"
    assert_equal c.fp4.size, c.fp4_size
  end

  def test_neighbors
    d = Dataset.from_csv_file File.join(DATA_DIR, "EPAFHM.csv")
    d.compounds.each do |c|
      refute_nil c.fp4
    end
    c = d.compounds[371]
    assert c.neighbors.size >= 19
  end

  # Round-trips an InChI through Compound.from_inchi (named after an
  # OpenBabel segfault).
  def test_openbabel_segfault
    inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/b13-5-/t11-,12-,14-,18-,19?/m1/s1"

    c = Compound.from_inchi(inchi)
    assert_equal inchi, c.inchi
  end
end
|