lazar 0.0.7 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
data/lib/validation.rb
CHANGED
@@ -2,7 +2,9 @@ module OpenTox
|
|
2
2
|
|
3
3
|
class Validation
|
4
4
|
|
5
|
+
field :model_id, type: BSON::ObjectId
|
5
6
|
field :prediction_dataset_id, type: BSON::ObjectId
|
7
|
+
field :crossvalidation_id, type: BSON::ObjectId
|
6
8
|
field :test_dataset_id, type: BSON::ObjectId
|
7
9
|
field :nr_instances, type: Integer
|
8
10
|
field :nr_unpredicted, type: Integer
|
@@ -16,98 +18,90 @@ module OpenTox
|
|
16
18
|
Dataset.find test_dataset_id
|
17
19
|
end
|
18
20
|
|
19
|
-
|
21
|
+
def model
|
22
|
+
Model::Lazar.find model_id
|
23
|
+
end
|
20
24
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
+
def self.create model, training_set, test_set, crossvalidation=nil
|
26
|
+
|
27
|
+
atts = model.attributes.dup # do not modify attributes from original model
|
28
|
+
atts["_id"] = BSON::ObjectId.new
|
29
|
+
atts[:training_dataset_id] = training_set.id
|
30
|
+
validation_model = model.class.create training_set, atts
|
31
|
+
validation_model.save
|
32
|
+
cids = test_set.compound_ids
|
25
33
|
|
26
|
-
|
27
|
-
validation = self.class.new
|
28
|
-
#feature_dataset = Dataset.find model.feature_dataset_id
|
29
|
-
# TODO check and delegate to Algorithm
|
30
|
-
#features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
|
31
|
-
validation_model = model.class.create training_set#, features
|
32
|
-
test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
|
34
|
+
test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
|
33
35
|
prediction_dataset = validation_model.predict test_set_without_activities
|
34
|
-
accept_values = prediction_dataset.prediction_feature.accept_values
|
35
|
-
confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
36
|
-
weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
37
36
|
predictions = []
|
38
37
|
nr_unpredicted = 0
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
if prediction == accept_values[0]
|
49
|
-
confusion_matrix[0][0] += 1
|
50
|
-
weighted_confusion_matrix[0][0] += confidence
|
51
|
-
elsif prediction == accept_values[1]
|
52
|
-
confusion_matrix[1][1] += 1
|
53
|
-
weighted_confusion_matrix[1][1] += confidence
|
54
|
-
end
|
55
|
-
elsif prediction != activity
|
56
|
-
if prediction == accept_values[0]
|
57
|
-
confusion_matrix[0][1] += 1
|
58
|
-
weighted_confusion_matrix[0][1] += confidence
|
59
|
-
elsif prediction == accept_values[1]
|
60
|
-
confusion_matrix[1][0] += 1
|
61
|
-
weighted_confusion_matrix[1][0] += confidence
|
62
|
-
end
|
63
|
-
end
|
38
|
+
activities = test_set.data_entries.collect{|de| de.first}
|
39
|
+
prediction_dataset.data_entries.each_with_index do |de,i|
|
40
|
+
if de[0] #and de[1]
|
41
|
+
cid = prediction_dataset.compound_ids[i]
|
42
|
+
rows = cids.each_index.select{|r| cids[r] == cid }
|
43
|
+
activities = rows.collect{|r| test_set.data_entries[r][0]}
|
44
|
+
prediction = de.first
|
45
|
+
confidence = de[1]
|
46
|
+
predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
|
64
47
|
else
|
65
|
-
nr_unpredicted += 1
|
48
|
+
nr_unpredicted += 1
|
66
49
|
end
|
67
50
|
end
|
68
51
|
validation = self.new(
|
52
|
+
:model_id => validation_model.id,
|
69
53
|
:prediction_dataset_id => prediction_dataset.id,
|
70
54
|
:test_dataset_id => test_set.id,
|
71
55
|
:nr_instances => test_set.compound_ids.size,
|
72
56
|
:nr_unpredicted => nr_unpredicted,
|
73
|
-
:
|
74
|
-
:confusion_matrix => confusion_matrix,
|
75
|
-
:weighted_confusion_matrix => weighted_confusion_matrix,
|
76
|
-
:predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
|
57
|
+
:predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
|
77
58
|
)
|
59
|
+
validation.crossvalidation_id = crossvalidation.id if crossvalidation
|
78
60
|
validation.save
|
79
61
|
validation
|
80
62
|
end
|
63
|
+
|
64
|
+
end
|
65
|
+
|
66
|
+
class ClassificationValidation < Validation
|
81
67
|
end
|
82
68
|
|
83
69
|
class RegressionValidation < Validation
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
70
|
+
|
71
|
+
def statistics
|
72
|
+
rmse = 0
|
73
|
+
weighted_rmse = 0
|
74
|
+
rse = 0
|
75
|
+
weighted_rse = 0
|
76
|
+
mae = 0
|
77
|
+
weighted_mae = 0
|
78
|
+
confidence_sum = 0
|
79
|
+
predictions.each do |pred|
|
80
|
+
compound_id,activity,prediction,confidence = pred
|
81
|
+
if activity and prediction
|
82
|
+
error = Math.log10(prediction)-Math.log10(activity.median)
|
83
|
+
rmse += error**2
|
84
|
+
weighted_rmse += confidence*error**2
|
85
|
+
mae += error.abs
|
86
|
+
weighted_mae += confidence*error.abs
|
87
|
+
confidence_sum += confidence
|
98
88
|
else
|
99
|
-
|
89
|
+
warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
|
90
|
+
$logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
|
100
91
|
end
|
101
92
|
end
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
93
|
+
x = predictions.collect{|p| p[1].median}
|
94
|
+
y = predictions.collect{|p| p[2]}
|
95
|
+
R.assign "measurement", x
|
96
|
+
R.assign "prediction", y
|
97
|
+
R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
|
98
|
+
r = R.eval("r").to_ruby
|
99
|
+
|
100
|
+
mae = mae/predictions.size
|
101
|
+
weighted_mae = weighted_mae/confidence_sum
|
102
|
+
rmse = Math.sqrt(rmse/predictions.size)
|
103
|
+
weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
|
104
|
+
{ "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
|
111
105
|
end
|
112
106
|
end
|
113
107
|
|
data/test/all.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
1
|
+
# "./default_environment.rb" has to be executed separately
|
2
|
+
exclude = ["./setup.rb","./all.rb", "./default_environment.rb"]
|
2
3
|
(Dir[File.join(File.dirname(__FILE__),"*.rb")]-exclude).each do |test|
|
3
|
-
p test
|
4
4
|
require_relative test
|
5
5
|
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative "setup.rb"
|
2
|
+
|
3
|
+
class LazarClassificationTest < MiniTest::Test
|
4
|
+
|
5
|
+
def test_lazar_classification
|
6
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
|
7
|
+
model = Model::LazarClassification.create training_dataset
|
8
|
+
|
9
|
+
[ {
|
10
|
+
:compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
|
11
|
+
:prediction => "false",
|
12
|
+
:confidence => 0.25281385281385277,
|
13
|
+
:nr_neighbors => 11
|
14
|
+
},{
|
15
|
+
:compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
|
16
|
+
:prediction => "false",
|
17
|
+
:confidence => 0.3639589577089577,
|
18
|
+
:nr_neighbors => 14
|
19
|
+
} ].each do |example|
|
20
|
+
prediction = model.predict example[:compound]
|
21
|
+
assert_equal example[:prediction], prediction[:value]
|
22
|
+
#assert_equal example[:confidence], prediction[:confidence]
|
23
|
+
#assert_equal example[:nr_neighbors], prediction[:neighbors].size
|
24
|
+
end
|
25
|
+
|
26
|
+
compound = Compound.from_smiles "CCO"
|
27
|
+
prediction = model.predict compound
|
28
|
+
assert_equal ["false"], prediction[:database_activities]
|
29
|
+
assert_equal "true", prediction[:value]
|
30
|
+
|
31
|
+
# make a dataset prediction
|
32
|
+
compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
|
33
|
+
prediction = model.predict compound_dataset
|
34
|
+
assert_equal compound_dataset.compounds, prediction.compounds
|
35
|
+
|
36
|
+
assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3]
|
37
|
+
assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3]
|
38
|
+
# cleanup
|
39
|
+
[training_dataset,model,compound_dataset].each{|o| o.delete}
|
40
|
+
end
|
41
|
+
end
|
data/test/compound.rb
CHANGED
@@ -54,7 +54,6 @@ print c.sdf
|
|
54
54
|
|
55
55
|
def test_inchikey
|
56
56
|
c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
|
57
|
-
p c
|
58
57
|
assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
|
59
58
|
end
|
60
59
|
|
@@ -65,8 +64,7 @@ print c.sdf
|
|
65
64
|
|
66
65
|
def test_chemblid
|
67
66
|
c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
|
68
|
-
|
69
|
-
assert_equal "CHEMBL581676", c.chemblid
|
67
|
+
assert_equal "CHEMBL277500", c.chemblid
|
70
68
|
end
|
71
69
|
|
72
70
|
def test_sdf_storage
|
@@ -78,17 +76,17 @@ print c.sdf
|
|
78
76
|
def test_fingerprint
|
79
77
|
c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
|
80
78
|
|
81
|
-
|
82
|
-
assert_equal c.fp4.size, c.fp4_size
|
79
|
+
assert_equal 9, c.fingerprint("FP4").size
|
83
80
|
end
|
84
81
|
|
85
82
|
def test_neighbors
|
86
83
|
d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
|
87
84
|
d.compounds.each do |c|
|
88
|
-
refute_nil c.
|
85
|
+
refute_nil c.fingerprint("MP2D")
|
89
86
|
end
|
90
87
|
c = d.compounds[371]
|
91
|
-
|
88
|
+
n = c.fingerprint_neighbors({:type => "FP4", :min_sim => 0.7, :training_dataset_id => d.id })
|
89
|
+
assert n.size >= 18, "Neighbors size (#{n.size}) should be larger than 17"
|
92
90
|
end
|
93
91
|
|
94
92
|
def test_openbabel_segfault
|
@@ -97,4 +95,115 @@ print c.sdf
|
|
97
95
|
c = Compound.from_inchi(inchi)
|
98
96
|
assert_equal inchi, c.inchi
|
99
97
|
end
|
98
|
+
|
99
|
+
def test_openbabel_fingerprint
|
100
|
+
[
|
101
|
+
"CC(=O)CC(C)C#N",
|
102
|
+
"CC(=O)CC(C)C",
|
103
|
+
"C(=O)CC(C)C#N",
|
104
|
+
].each do |smi|
|
105
|
+
c = OpenTox::Compound.from_smiles smi
|
106
|
+
refute_nil c.fingerprint("FP4")
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
def test_fingerprint_neighbors
|
111
|
+
types = ["FP2", "FP3", "FP4", "MACCS"]
|
112
|
+
min_sim = 0.7
|
113
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
|
114
|
+
[
|
115
|
+
"CC(=O)CC(C)C#N",
|
116
|
+
"CC(=O)CC(C)C",
|
117
|
+
"C(=O)CC(C)C#N",
|
118
|
+
].each do |smi|
|
119
|
+
c = OpenTox::Compound.from_smiles smi
|
120
|
+
types.each do |type|
|
121
|
+
neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
|
122
|
+
unless type == "FP2" and smi == "CC(=O)CC(C)C#N" or smi == "C(=O)CC(C)C#N" and (type == "FP2" or type == "MACCS")
|
123
|
+
refute_empty neighbors
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
def test_mna
|
130
|
+
c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
|
131
|
+
assert_equal 18, c.fingerprint("MNA").size
|
132
|
+
assert_equal 9, c.fingerprint("MNA").uniq.size
|
133
|
+
end
|
134
|
+
|
135
|
+
def test_mpd
|
136
|
+
c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
|
137
|
+
assert 13, c.fingerprint("MP2D").size
|
138
|
+
assert 7, c.fingerprint("MP2D").uniq.size
|
139
|
+
end
|
140
|
+
|
141
|
+
def test_fingerprint_count_neighbors
|
142
|
+
types = ["MP2D", "MNA"]
|
143
|
+
min_sim = 0.0
|
144
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
|
145
|
+
[
|
146
|
+
"CC(=O)CC(C)C#N",
|
147
|
+
"CC(=O)CC(C)C",
|
148
|
+
"C(=O)CC(C)C#N",
|
149
|
+
].each do |smi|
|
150
|
+
c = OpenTox::Compound.from_smiles smi
|
151
|
+
types.each do |type|
|
152
|
+
neighbors = c.fingerprint_count_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
|
153
|
+
if type == "FP4"
|
154
|
+
fp4_neighbors = c.neighbors
|
155
|
+
neighbors.each do |n|
|
156
|
+
assert_includes fp4_neighbors, n
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
def test_fingerprint_db_neighbors
|
164
|
+
#skip
|
165
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
|
166
|
+
[
|
167
|
+
"CC(=O)CC(C)C#N",
|
168
|
+
"CC(=O)CC(C)C",
|
169
|
+
"C(=O)CC(C)C#N",
|
170
|
+
].each do |smi|
|
171
|
+
c = OpenTox::Compound.from_smiles smi
|
172
|
+
t = Time.now
|
173
|
+
neighbors = c.db_neighbors(:training_dataset_id => training_dataset.id, :min_sim => 0.2)
|
174
|
+
p Time.now - t
|
175
|
+
t = Time.now
|
176
|
+
neighbors2 = c.fingerprint_neighbors({:type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.2})
|
177
|
+
p Time.now - t
|
178
|
+
p neighbors.size
|
179
|
+
p neighbors2.size
|
180
|
+
#p neighbors
|
181
|
+
#p neighbors2
|
182
|
+
#p neighbors2 - neighbors
|
183
|
+
#assert_equal neighbors, neighbors2
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
def test_molecular_weight
|
188
|
+
c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
|
189
|
+
assert_equal 100.15888, c.molecular_weight
|
190
|
+
end
|
191
|
+
|
192
|
+
def test_mg_conversions
|
193
|
+
# TODO fix!
|
194
|
+
skip
|
195
|
+
c = OpenTox::Compound.from_smiles "O"
|
196
|
+
mw = c.molecular_weight
|
197
|
+
assert_equal 18.01528, mw
|
198
|
+
assert_equal 0.8105107141417474, c.logmmol_to_mg(4.34688225631145, mw)
|
199
|
+
assert_equal 9007.64, c.mmol_to_mg(500, mw)
|
200
|
+
assert_equal 2437.9999984148976, c.logmg_to_mg(3.387033701)
|
201
|
+
end
|
202
|
+
|
203
|
+
def test_physchem
|
204
|
+
c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
|
205
|
+
assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem.size
|
206
|
+
assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem(PhysChem.openbabel_descriptors).size
|
207
|
+
assert_equal PhysChem::unique_descriptors.size, c.physchem(PhysChem.unique_descriptors).size
|
208
|
+
end
|
100
209
|
end
|