lazar 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
data/lib/validation.rb
CHANGED
@@ -2,7 +2,9 @@ module OpenTox
 
   class Validation
 
+    field :model_id, type: BSON::ObjectId
     field :prediction_dataset_id, type: BSON::ObjectId
+    field :crossvalidation_id, type: BSON::ObjectId
     field :test_dataset_id, type: BSON::ObjectId
     field :nr_instances, type: Integer
     field :nr_unpredicted, type: Integer
@@ -16,98 +18,90 @@ module OpenTox
       Dataset.find test_dataset_id
     end
 
-
+    def model
+      Model::Lazar.find model_id
+    end
 
-
-
-
-
+    def self.create model, training_set, test_set, crossvalidation=nil
+
+      atts = model.attributes.dup # do not modify attributes from original model
+      atts["_id"] = BSON::ObjectId.new
+      atts[:training_dataset_id] = training_set.id
+      validation_model = model.class.create training_set, atts
+      validation_model.save
+      cids = test_set.compound_ids
 
-
-      validation = self.class.new
-      #feature_dataset = Dataset.find model.feature_dataset_id
-      # TODO check and delegate to Algorithm
-      #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
-      validation_model = model.class.create training_set#, features
-      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+      test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
       prediction_dataset = validation_model.predict test_set_without_activities
-      accept_values = prediction_dataset.prediction_feature.accept_values
-      confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
-      weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       predictions = []
       nr_unpredicted = 0
-
-
-
-
-
-
-
-
-
-          if prediction == accept_values[0]
-            confusion_matrix[0][0] += 1
-            weighted_confusion_matrix[0][0] += confidence
-          elsif prediction == accept_values[1]
-            confusion_matrix[1][1] += 1
-            weighted_confusion_matrix[1][1] += confidence
-          end
-        elsif prediction != activity
-          if prediction == accept_values[0]
-            confusion_matrix[0][1] += 1
-            weighted_confusion_matrix[0][1] += confidence
-          elsif prediction == accept_values[1]
-            confusion_matrix[1][0] += 1
-            weighted_confusion_matrix[1][0] += confidence
-          end
-        end
+      activities = test_set.data_entries.collect{|de| de.first}
+      prediction_dataset.data_entries.each_with_index do |de,i|
+        if de[0] #and de[1]
+          cid = prediction_dataset.compound_ids[i]
+          rows = cids.each_index.select{|r| cids[r] == cid }
+          activities = rows.collect{|r| test_set.data_entries[r][0]}
+          prediction = de.first
+          confidence = de[1]
+          predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
         else
-          nr_unpredicted += 1
+          nr_unpredicted += 1
        end
      end
      validation = self.new(
+        :model_id => validation_model.id,
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
        :nr_unpredicted => nr_unpredicted,
-        :
-        :confusion_matrix => confusion_matrix,
-        :weighted_confusion_matrix => weighted_confusion_matrix,
-        :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+        :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
      )
+      validation.crossvalidation_id = crossvalidation.id if crossvalidation
      validation.save
      validation
    end
+
+  end
+
+  class ClassificationValidation < Validation
  end
 
  class RegressionValidation < Validation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    def statistics
+      rmse = 0
+      weighted_rmse = 0
+      rse = 0
+      weighted_rse = 0
+      mae = 0
+      weighted_mae = 0
+      confidence_sum = 0
+      predictions.each do |pred|
+        compound_id,activity,prediction,confidence = pred
+        if activity and prediction
+          error = Math.log10(prediction)-Math.log10(activity.median)
+          rmse += error**2
+          weighted_rmse += confidence*error**2
+          mae += error.abs
+          weighted_mae += confidence*error.abs
+          confidence_sum += confidence
        else
-
+          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
        end
      end
-
-
-
-
-
-
-
-
-
+      x = predictions.collect{|p| p[1].median}
+      y = predictions.collect{|p| p[2]}
+      R.assign "measurement", x
+      R.assign "prediction", y
+      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
+      r = R.eval("r").to_ruby
+
+      mae = mae/predictions.size
+      weighted_mae = weighted_mae/confidence_sum
+      rmse = Math.sqrt(rmse/predictions.size)
+      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+      { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
    end
  end
 
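The new RegressionValidation#statistics delegates the correlation coefficient to R (the R.assign/R.eval calls) but aggregates RMSE and MAE in plain Ruby over prediction tuples of the form [compound_id, measured_activities, prediction, confidence]. Below is a minimal standalone sketch of that aggregation, with the median and the Pearson correlation computed in-process instead of through the R backend; the helper names regression_statistics and median_of are illustrative, not part of the gem.

# Sketch only: mirrors the RMSE/MAE/R^2 arithmetic of RegressionValidation#statistics
# without the R backend. regression_statistics and median_of are hypothetical helpers.
def median_of(values)
  sorted = values.sort
  sorted[sorted.size / 2] # simple upper median, good enough for a sketch
end

def regression_statistics(predictions)
  # predictions: [[compound_id, measured_activities, predicted_value, confidence], ...]
  valid = predictions.select { |_, activities, prediction, _| activities && prediction } # skip unpredicted entries
  errors = valid.collect do |_, activities, prediction, _|
    Math.log10(prediction) - Math.log10(median_of(activities))
  end
  rmse = Math.sqrt(errors.collect { |e| e**2 }.inject(:+) / errors.size)
  mae  = errors.collect(&:abs).inject(:+) / errors.size
  # Pearson correlation of the -log-transformed values, in place of R's cor()
  x = valid.collect { |_, activities, _, _| -Math.log10(median_of(activities)) }
  y = valid.collect { |_, _, prediction, _| -Math.log10(prediction) }
  mx = x.inject(:+) / x.size
  my = y.inject(:+) / y.size
  cov = x.zip(y).collect { |a, b| (a - mx) * (b - my) }.inject(:+)
  r = cov / Math.sqrt(x.collect { |a| (a - mx)**2 }.inject(:+) * y.collect { |b| (b - my)**2 }.inject(:+))
  { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
end

The only difference from the gem's code path is that the gem hands the -log-transformed vectors to an R session and reads back cor(..., use='complete'), while the sketch computes the equivalent correlation in Ruby.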
data/test/all.rb
CHANGED
@@ -1,5 +1,5 @@
-
+# "./default_environment.rb" has to be executed separately
+exclude = ["./setup.rb","./all.rb", "./default_environment.rb"]
 (Dir[File.join(File.dirname(__FILE__),"*.rb")]-exclude).each do |test|
-  p test
   require_relative test
 end
data/test/classification.rb
ADDED
@@ -0,0 +1,41 @@
+require_relative "setup.rb"
+
+class LazarClassificationTest < MiniTest::Test
+
+  def test_lazar_classification
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    model = Model::LazarClassification.create training_dataset
+
+    [ {
+      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
+      :prediction => "false",
+      :confidence => 0.25281385281385277,
+      :nr_neighbors => 11
+    },{
+      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
+      :prediction => "false",
+      :confidence => 0.3639589577089577,
+      :nr_neighbors => 14
+    } ].each do |example|
+      prediction = model.predict example[:compound]
+      assert_equal example[:prediction], prediction[:value]
+      #assert_equal example[:confidence], prediction[:confidence]
+      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
+    end
+
+    compound = Compound.from_smiles "CCO"
+    prediction = model.predict compound
+    assert_equal ["false"], prediction[:database_activities]
+    assert_equal "true", prediction[:value]
+
+    # make a dataset prediction
+    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
+    prediction = model.predict compound_dataset
+    assert_equal compound_dataset.compounds, prediction.compounds
+
+    assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3]
+    assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3]
+    # cleanup
+    [training_dataset,model,compound_dataset].each{|o| o.delete}
+  end
+end
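The commented-out confidence assertions in the test above suggest that Model::LazarClassification derives both a predicted value and a confidence from neighbor activities weighted by similarity. Purely as an illustration of that idea (the gem's actual weighting may differ, and weighted_vote is a hypothetical helper, not its API):

# Illustrative only: a similarity-weighted vote over neighbor activities.
# Model::LazarClassification's actual algorithm may weight differently.
def weighted_vote(neighbors)
  # neighbors: [[activity, similarity], ...]
  weights = Hash.new(0.0)
  neighbors.each { |activity, similarity| weights[activity] += similarity }
  total = weights.values.inject(:+)
  value, weight = weights.max_by { |_, w| w }
  { :value => value, :confidence => weight / total } # one plausible confidence definition
end

p weighted_vote([["false", 0.8], ["false", 0.6], ["true", 0.5]])
# value "false", confidence roughly 0.74 under this toy definition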
data/test/compound.rb
CHANGED
@@ -54,7 +54,6 @@ print c.sdf
 
   def test_inchikey
     c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
-    p c
     assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
   end
 
@@ -65,8 +64,7 @@ print c.sdf
 
   def test_chemblid
     c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
-
-    assert_equal "CHEMBL581676", c.chemblid
+    assert_equal "CHEMBL277500", c.chemblid
   end
 
   def test_sdf_storage
@@ -78,17 +76,17 @@ print c.sdf
   def test_fingerprint
     c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
 
-
-    assert_equal c.fp4.size, c.fp4_size
+    assert_equal 9, c.fingerprint("FP4").size
   end
 
   def test_neighbors
     d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
     d.compounds.each do |c|
-      refute_nil c.
+      refute_nil c.fingerprint("MP2D")
     end
     c = d.compounds[371]
-
+    n = c.fingerprint_neighbors({:type => "FP4", :min_sim => 0.7, :training_dataset_id => d.id })
+    assert n.size >= 18, "Neighbors size (#{n.size}) should be larger than 17"
   end
 
   def test_openbabel_segfault
@@ -97,4 +95,115 @@ print c.sdf
     c = Compound.from_inchi(inchi)
     assert_equal inchi, c.inchi
   end
+
+  def test_openbabel_fingerprint
+    [
+      "CC(=O)CC(C)C#N",
+      "CC(=O)CC(C)C",
+      "C(=O)CC(C)C#N",
+    ].each do |smi|
+      c = OpenTox::Compound.from_smiles smi
+      refute_nil c.fingerprint("FP4")
+    end
+  end
+
+  def test_fingerprint_neighbors
+    types = ["FP2", "FP3", "FP4", "MACCS"]
+    min_sim = 0.7
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
+    [
+      "CC(=O)CC(C)C#N",
+      "CC(=O)CC(C)C",
+      "C(=O)CC(C)C#N",
+    ].each do |smi|
+      c = OpenTox::Compound.from_smiles smi
+      types.each do |type|
+        neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
+        unless type == "FP2" and smi == "CC(=O)CC(C)C#N" or smi == "C(=O)CC(C)C#N" and (type == "FP2" or type == "MACCS")
+          refute_empty neighbors
+        end
+      end
+    end
+  end
+
+  def test_mna
+    c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
+    assert_equal 18, c.fingerprint("MNA").size
+    assert_equal 9, c.fingerprint("MNA").uniq.size
+  end
+
+  def test_mpd
+    c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
+    assert 13, c.fingerprint("MP2D").size
+    assert 7, c.fingerprint("MP2D").uniq.size
+  end
+
+  def test_fingerprint_count_neighbors
+    types = ["MP2D", "MNA"]
+    min_sim = 0.0
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
+    [
+      "CC(=O)CC(C)C#N",
+      "CC(=O)CC(C)C",
+      "C(=O)CC(C)C#N",
+    ].each do |smi|
+      c = OpenTox::Compound.from_smiles smi
+      types.each do |type|
+        neighbors = c.fingerprint_count_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
+        if type == "FP4"
+          fp4_neighbors = c.neighbors
+          neighbors.each do |n|
+            assert_includes fp4_neighbors, n
+          end
+        end
+      end
+    end
+  end
+
+  def test_fingerprint_db_neighbors
+    #skip
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
+    [
+      "CC(=O)CC(C)C#N",
+      "CC(=O)CC(C)C",
+      "C(=O)CC(C)C#N",
+    ].each do |smi|
+      c = OpenTox::Compound.from_smiles smi
+      t = Time.now
+      neighbors = c.db_neighbors(:training_dataset_id => training_dataset.id, :min_sim => 0.2)
+      p Time.now - t
+      t = Time.now
+      neighbors2 = c.fingerprint_neighbors({:type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.2})
+      p Time.now - t
+      p neighbors.size
+      p neighbors2.size
+      #p neighbors
+      #p neighbors2
+      #p neighbors2 - neighbors
+      #assert_equal neighbors, neighbors2
+    end
+  end
+
+  def test_molecular_weight
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
+    assert_equal 100.15888, c.molecular_weight
+  end
+
+  def test_mg_conversions
+    # TODO fix!
+    skip
+    c = OpenTox::Compound.from_smiles "O"
+    mw = c.molecular_weight
+    assert_equal 18.01528, mw
+    assert_equal 0.8105107141417474, c.logmmol_to_mg(4.34688225631145, mw)
+    assert_equal 9007.64, c.mmol_to_mg(500, mw)
+    assert_equal 2437.9999984148976, c.logmg_to_mg(3.387033701)
+  end
+
+  def test_physchem
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
+    assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem.size
+    assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem(PhysChem.openbabel_descriptors).size
+    assert_equal PhysChem::unique_descriptors.size, c.physchem(PhysChem.unique_descriptors).size
+  end
 end
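Most of the new compound tests hinge on a similarity cutoff between fingerprints (min_sim => 0.7 or 0.2). For binary fingerprints such as FP4 or MP2D this kind of cutoff is conventionally a Tanimoto coefficient over the sets of fragment keys; a small self-contained sketch, where the tanimoto helper and the fragment keys are illustrative rather than the gem's API:

require "set"

# Illustrative Tanimoto similarity between two fingerprint key sets,
# i.e. shared keys divided by all keys occurring in either fingerprint.
def tanimoto(keys_a, keys_b)
  a = keys_a.to_set
  b = keys_b.to_set
  (a & b).size.to_f / (a | b).size
end

fp1 = ["C-C", "C=O", "C-N"] # hypothetical fragment keys
fp2 = ["C-C", "C=O", "C-O"]
p tanimoto(fp1, fp2) # => 0.5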