lazar 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
data/lib/validation.rb
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
module OpenTox
|
|
2
|
+
|
|
3
|
+
# Common base for validation result documents.
#
# Declares the attributes shared by every validation (ids of the involved
# datasets, instance counters and the list of individual predictions) and
# offers convenience readers that resolve the stored ids to Dataset objects.
# NOTE(review): the `field` macro is presumably provided by Mongoid; the
# mixin that supplies it is not visible in this file - confirm.
class Validation

  # id of the dataset that holds the model's predictions
  field :prediction_dataset_id, type: BSON::ObjectId
  # id of the dataset the model was tested against
  field :test_dataset_id, type: BSON::ObjectId
  # number of compounds in the test dataset
  field :nr_instances, type: Integer
  # number of test compounds that were counted as not predicted
  field :nr_unpredicted, type: Integer
  # one entry per predicted compound, filled in by the subclasses' .create
  field :predictions, type: Array

  # Looks up the prediction dataset via its stored id.
  # @return the result of Dataset.find for prediction_dataset_id
  def prediction_dataset
    Dataset.find(prediction_dataset_id)
  end

  # Looks up the test dataset via its stored id.
  # @return the result of Dataset.find for test_dataset_id
  def test_dataset
    Dataset.find(test_dataset_id)
  end

end
|
|
20
|
+
|
|
21
|
+
# Validation of a classification model on a held-out test set.
#
# Stores the class labels, a confusion matrix with plain counts and a
# second confusion matrix in which every prediction is weighted by its
# confidence.
class ClassificationValidation < Validation
  # Class labels of the prediction feature. .create reads them with #size and
  # #[] and passes them on unchanged, so they are declared as an Array
  # (the former String type did not match that usage).
  field :accept_values, type: Array
  # confusion_matrix[p][a] counts predictions, see .create for the indices
  field :confusion_matrix, type: Array
  # same layout as confusion_matrix, but sums up confidences
  field :weighted_confusion_matrix, type: Array

  # Trains a new model of the same class as +model+ on +training_set+,
  # predicts the compounds of +test_set+ and stores the resulting statistics.
  #
  # Matrix layout (only the first two accept values are evaluated):
  # the first index is the position of the predicted value in accept_values,
  # the second index is the same position for a correct prediction and the
  # other position for a wrong one.
  #
  # @param model         object whose class responds to .create(training_set)
  # @param training_set  dataset used to build the validation model
  # @param test_set      dataset providing compound_ids, data_entries and id
  # @return the saved validation document
  def self.create model, training_set, test_set
    #feature_dataset = Dataset.find model.feature_dataset_id
    # TODO check and delegate to Algorithm
    #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
    validation_model = model.class.create training_set#, features
    test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
    prediction_dataset = validation_model.predict test_set_without_activities
    accept_values = prediction_dataset.prediction_feature.accept_values

    # Passing a default value *and* a block to Array.new makes Ruby warn and
    # ignore the default, so only the block form is used for the outer array.
    size = accept_values.size
    confusion_matrix = Array.new(size) { Array.new(size, 0) }
    weighted_confusion_matrix = Array.new(size) { Array.new(size, 0) }

    predictions = []
    nr_unpredicted = 0
    prediction_dataset.data_entries.each_with_index do |pe, i|
      prediction = pe[0]
      confidence = pe[1]
      if prediction and confidence and confidence.numeric?
        # TODO prediction_feature, convention??
        # TODO generalize for multiple classes
        activity = test_set.data_entries[i].first
        predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
        # position of the predicted class among the first two accept values;
        # predictions matching neither value are not entered into the matrices
        row = [0, 1].find { |k| prediction == accept_values[k] }
        if row
          col = prediction == activity ? row : 1 - row
          confusion_matrix[row][col] += 1
          weighted_confusion_matrix[row][col] += confidence
        end
      else
        # NOTE(review): entries with a prediction but without a numeric
        # confidence are neither evaluated nor counted as unpredicted here,
        # whereas RegressionValidation counts every skipped entry - confirm
        # that this difference is intended.
        nr_unpredicted += 1 if prediction.nil?
      end
    end

    validation = self.new(
      :prediction_dataset_id => prediction_dataset.id,
      :test_dataset_id => test_set.id,
      :nr_instances => test_set.compound_ids.size,
      :nr_unpredicted => nr_unpredicted,
      :accept_values => accept_values,
      :confusion_matrix => confusion_matrix,
      :weighted_confusion_matrix => weighted_confusion_matrix,
      :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
    )
    validation.save
    validation
  end
end
|
|
82
|
+
|
|
83
|
+
# Validation of a regression model on a held-out test set.
class RegressionValidation < Validation

  # Trains a Model::LazarRegression on +training_set+, predicts the compounds
  # of +test_set+ and stores the individual predictions.
  #
  # Every stored prediction is a list of
  # [compound id, measured activity, predicted value, confidence];
  # the list is ordered by descending confidence.
  #
  # @param model         not used by this method
  # @param training_set  dataset used to build the validation model
  # @param test_set      dataset providing compound_ids, data_entries and id
  # @return the saved validation document
  def self.create model, training_set, test_set
    validation_model = Model::LazarRegression.create training_set
    # the model only gets the compounds, so that activities cannot be used
    blind_test_set = Dataset.new(:compound_ids => test_set.compound_ids)
    prediction_dataset = validation_model.predict blind_test_set

    rows = []
    skipped = 0
    measured = test_set.data_entries.map(&:first)
    prediction_dataset.data_entries.each_with_index do |entry, idx|
      predicted = entry[0]
      confidence = entry[1]
      # an entry is only usable with a prediction and a numeric confidence
      unless predicted and confidence and confidence.numeric?
        skipped += 1
        next
      end
      rows << [prediction_dataset.compound_ids[idx], measured[idx], predicted, confidence]
    end

    by_confidence = rows.sort { |a, b| b[3] <=> a[3] }
    validation = self.new(
      :prediction_dataset_id => prediction_dataset.id,
      :test_dataset_id => test_set.id,
      :nr_instances => test_set.compound_ids.size,
      :nr_unpredicted => skipped,
      :predictions => by_confidence
    )
    validation.save
    validation
  end
end
|
|
113
|
+
|
|
114
|
+
end
|
data/mongoid.yml
ADDED
data/test/all.rb
ADDED
data/test/compound.rb
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
require_relative "setup.rb"
|
|
2
|
+
|
|
3
|
+
# Tests for OpenTox::Compound: construction from different identifiers,
# conversion between representations, fingerprints and neighbor search.
# NOTE(review): several tests presumably need external services and a
# database (name/cid/chemblid lookups, Feature.find) - confirm before
# running them in an isolated environment.
class CompoundTest < Minitest::Test

  # InChI of benzene, shared by several tests
  BENZENE_INCHI = "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"

  # Salt given as SMILES: InChI and SMILES have to be reproduced.
  def test_0_compound_from_smiles
    c = OpenTox::Compound.from_smiles "F[B-](F)(F)F.[Na+]"
    assert_equal "InChI=1S/BF4.Na/c2-1(3,4)5;/q-1;+1", c.inchi.chomp
    assert_equal "F[B-](F)(F)F.[Na+]", c.smiles, "A failure here might be caused by a compound webservice running on 64bit architectures using an outdated version of OpenBabel. Please install OpenBabel version 2.3.2 or higher." # seems to be fixed in 2.3.2
  end

  # The returned SMILES differs from the input notation.
  def test_1_compound_from_smiles
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    assert_equal "InChI=1S/C6H9NO/c1-5(4-7)3-6(2)8/h5H,3H2,1-2H3", c.inchi
    assert_equal "CC(C#N)CC(=O)C", c.smiles
  end

  # Charged compound consisting of two fragments.
  def test_2_compound_from_smiles
    c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
    assert_equal "InChI=1S/C6H5N2.BF4/c7-8-6-4-2-1-3-5-6;2-1(3,4)5/h1-5H;/q+1;-1", c.inchi
    assert_equal "F[B-](F)(F)F.N#[N+]c1ccccc1", c.smiles
  end

  def test_compound_from_name
    c = OpenTox::Compound.from_name "Benzene"
    assert_equal BENZENE_INCHI, c.inchi
    assert_equal "c1ccccc1", c.smiles
  end

  def test_compound_from_inchi
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    assert_equal "c1ccccc1", c.smiles
  end

  def test_sdf_import
    c = OpenTox::Compound.from_sdf File.read(File.join DATA_DIR, "acetaldehyde.sdf")
    assert_equal "InChI=1S/C2H4O/c1-2-3/h2H,1H3", c.inchi
    assert_equal "CC=O", c.smiles
    assert_includes c.names, "Acetylaldehyde"
  end

  # Checks the counts line of the exported SDF (7 atoms, 6 bonds).
  def test_sdf_export
    c = OpenTox::Compound.from_smiles "CC=O"
    # The counts line is matched with flexible whitespace, so the test does
    # not depend on the exact column padding of the SDF writer.
    assert_match(/7\s+6(?:\s+0){7}\s+0999 V2000/, c.sdf)
  end

  # Writes the PNG to a file and lets the `file` utility detect its type.
  def test_compound_image
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    testbild = "/tmp/testbild.png"
    # binary mode and #write keep the image data unchanged
    File.open(testbild, "wb") { |f| f.write c.png }
    assert_match "image/png", `file -b --mime-type #{testbild}`
  ensure
    # remove the file even if the assertion above fails
    File.unlink(testbild) if testbild && File.exist?(testbild)
  end

  def test_inchikey
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
  end

  def test_cid
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    assert_equal "241", c.cid
  end

  def test_chemblid
    c = OpenTox::Compound.from_inchi BENZENE_INCHI
    #assert_equal "CHEMBL277500", c.chemblid
    assert_equal "CHEMBL581676", c.chemblid
  end

  # Requesting the SDF has to set sdf_id.
  def test_sdf_storage
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    c.sdf
    refute_nil c.sdf_id
  end

  def test_fingerprint
    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    feature_names = c.fp4.collect { |fid| Feature.find(fid).name }
    assert_includes feature_names, "1,3-Tautomerizable"
    assert_equal c.fp4.size, c.fp4_size
  end

  # Every compound of the dataset needs a fingerprint; the compound at
  # index 371 is expected to have at least 19 neighbors.
  def test_neighbors
    d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
    d.compounds.each do |c|
      refute_nil c.fp4
    end
    c = d.compounds[371]
    assert_operator c.neighbors.size, :>=, 19
  end

  # Round trip of an InChI; the test name suggests that this input once
  # crashed OpenBabel.
  def test_openbabel_segfault
    inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/b13-5-/t11-,12-,14-,18-,19?/m1/s1"

    c = OpenTox::Compound.from_inchi(inchi)
    assert_equal inchi, c.inchi
  end
end
|