lazar 0.9.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
data/test/dataset.rb
CHANGED
@@ -1,9 +1,16 @@
|
|
1
|
-
# TODO; check compound/data_entry sequences with missing and duplicated values
|
2
|
-
|
3
1
|
require_relative "setup.rb"
|
4
2
|
|
5
3
|
class DatasetTest < MiniTest::Test
|
6
4
|
|
5
|
+
# basics
|
6
|
+
|
7
|
+
def test_create_empty
|
8
|
+
d = Dataset.new
|
9
|
+
assert_equal Dataset, d.class
|
10
|
+
refute_nil d.id
|
11
|
+
assert_kind_of BSON::ObjectId, d.id
|
12
|
+
end
|
13
|
+
|
7
14
|
def test_all
|
8
15
|
d1 = Dataset.new
|
9
16
|
d1.save
|
@@ -12,145 +19,182 @@ class DatasetTest < MiniTest::Test
|
|
12
19
|
d1.delete
|
13
20
|
end
|
14
21
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
22
|
+
# real datasets
|
23
|
+
|
24
|
+
def test_upload_hamster
|
25
|
+
d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
26
|
+
assert_equal Dataset, d.class
|
27
|
+
assert_equal 1, d.features.size
|
28
|
+
assert_equal 85, d.compounds.size
|
29
|
+
csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
|
30
|
+
csv.shift
|
31
|
+
csv.each do |row|
|
32
|
+
c = Compound.from_smiles row.shift
|
33
|
+
assert_equal row, d.values(c,d.features.first)
|
24
34
|
end
|
35
|
+
d.delete
|
25
36
|
end
|
26
37
|
|
27
|
-
def
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
38
|
+
def test_upload_kazius
|
39
|
+
f = File.join DATA_DIR, "kazius.csv"
|
40
|
+
d = OpenTox::Dataset.from_csv_file f
|
41
|
+
csv = CSV.read f
|
42
|
+
assert_equal csv.size-1, d.compounds.size
|
43
|
+
assert_equal csv.first.size-1, d.features.size
|
44
|
+
assert_empty d.warnings
|
45
|
+
# 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1
|
46
|
+
c = d.compounds[491]
|
47
|
+
assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC"
|
48
|
+
assert_equal ["1"], d.values(c,d.features.first)
|
49
|
+
d.delete
|
32
50
|
end
|
33
51
|
|
34
|
-
def
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
d
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
+
def test_upload_multicell
|
53
|
+
duplicates = [
|
54
|
+
"InChI=1S/C6HCl5O/c7-1-2(8)4(10)6(12)5(11)3(1)9/h12H",
|
55
|
+
"InChI=1S/C12H8Cl6O/c13-8-9(14)11(16)5-3-1-2(6-7(3)19-6)4(5)10(8,15)12(11,17)18/h2-7H,1H2",
|
56
|
+
"InChI=1S/C2HCl3/c3-1-2(4)5/h1H",
|
57
|
+
"InChI=1S/C4H5Cl/c1-3-4(2)5/h3H,1-2H2",
|
58
|
+
"InChI=1S/C4H7Cl/c1-4(2)3-5/h1,3H2,2H3",
|
59
|
+
"InChI=1S/C8H14O4/c1-5-4-8(11-6(2)9)12-7(3)10-5/h5,7-8H,4H2,1-3H3",
|
60
|
+
"InChI=1S/C19H30O5/c1-3-5-7-20-8-9-21-10-11-22-14-17-13-19-18(23-15-24-19)12-16(17)6-4-2/h12-13H,3-11,14-15H2,1-2H3",
|
61
|
+
].collect{|inchi| Compound.from_inchi(inchi).smiles}
|
62
|
+
errors = ['O=P(H)(OC)OC', 'C=CCNN.HCl' ]
|
63
|
+
f = File.join DATA_DIR, "multi_cell_call.csv"
|
64
|
+
d = OpenTox::Dataset.from_csv_file f
|
65
|
+
csv = CSV.read f
|
66
|
+
assert_equal true, d.features.first.nominal?
|
67
|
+
assert_equal 1056, d.compounds.size
|
68
|
+
assert_equal csv.first.size-1, d.features.size
|
69
|
+
errors.each do |smi|
|
70
|
+
refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}}
|
71
|
+
end
|
72
|
+
duplicates.each do |smi|
|
73
|
+
refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}}
|
52
74
|
end
|
53
|
-
|
54
|
-
# wrong feature size
|
55
|
-
# << operator was removed for efficiency reasons (CH)
|
56
|
-
#assert_raises BadRequestError do
|
57
|
-
# d << [Compound.from_smiles("c1ccccc1NN"), 1,2,3]
|
58
|
-
#end
|
59
|
-
|
60
|
-
# manual low-level insertions without consistency checks for runtime efficiency
|
61
|
-
data_entries = []
|
62
|
-
d.compound_ids << Compound.from_smiles("c1ccccc1NN").id
|
63
|
-
data_entries << [1,2]
|
64
|
-
d.compound_ids << Compound.from_smiles("CC(C)N").id
|
65
|
-
data_entries << [4,5]
|
66
|
-
d.compound_ids << Compound.from_smiles("C1C(C)CCCC1").id
|
67
|
-
data_entries << [6,7]
|
68
|
-
d.data_entries = data_entries
|
69
|
-
assert_equal 3, d.compounds.size
|
70
|
-
assert_equal 2, d.features.size
|
71
|
-
assert_equal [[1,2],[4,5],[6,7]], d.data_entries
|
72
|
-
d.save
|
73
|
-
# check if dataset has been saved correctly
|
74
|
-
new_dataset = Dataset.find d.id
|
75
|
-
assert_equal 3, new_dataset.compounds.size
|
76
|
-
assert_equal 2, new_dataset.features.size
|
77
|
-
assert_equal [[1,2],[4,5],[6,7]], new_dataset.data_entries
|
78
75
|
d.delete
|
79
|
-
assert_nil Dataset.find d.id
|
80
|
-
assert_nil Dataset.find new_dataset.id
|
81
76
|
end
|
82
77
|
|
83
|
-
def
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
assert_equal "multicolumn", new_dataset.name
|
90
|
-
# get features
|
91
|
-
assert_equal 6, new_dataset.features.size
|
92
|
-
assert_equal 7, new_dataset.compounds.size
|
93
|
-
assert_equal ["1", nil, "false", nil, nil, 1.0], new_dataset.data_entries.last
|
78
|
+
def test_upload_isscan
|
79
|
+
f = File.join DATA_DIR, "ISSCAN-multi.csv"
|
80
|
+
d = OpenTox::Dataset.from_csv_file f
|
81
|
+
csv = CSV.read f
|
82
|
+
assert_equal csv.size-1, d.compounds.size
|
83
|
+
assert_equal csv.first.size-1, d.features.size
|
94
84
|
d.delete
|
95
85
|
end
|
96
86
|
|
97
|
-
def
|
98
|
-
|
87
|
+
def test_upload_epafhm
|
88
|
+
f = File.join DATA_DIR, "EPAFHM_log10.csv"
|
89
|
+
d = OpenTox::Dataset.from_csv_file f
|
99
90
|
assert_equal Dataset, d.class
|
100
|
-
|
101
|
-
|
102
|
-
assert_equal
|
103
|
-
d.
|
104
|
-
|
91
|
+
csv = CSV.read f
|
92
|
+
assert_equal csv.size-1, d.compounds.size
|
93
|
+
assert_equal csv.first.size-1, d.features.size
|
94
|
+
assert_match "EPAFHM_log10.csv", d.source
|
95
|
+
assert_equal "EPAFHM_log10", d.name
|
96
|
+
feature = d.features.first
|
97
|
+
assert_kind_of NumericFeature, feature
|
98
|
+
assert_equal -Math.log10(0.0113), d.values(d.compounds.first,feature).first
|
99
|
+
assert_equal -Math.log10(0.00323), d.values(d.compounds[4],feature).first
|
100
|
+
d2 = Dataset.find d.id
|
101
|
+
assert_equal -Math.log10(0.0113), d2.values(d2.compounds[0],feature).first
|
102
|
+
assert_equal -Math.log10(0.00323), d2.values(d2.compounds[4],feature).first
|
103
|
+
d.delete
|
105
104
|
end
|
106
105
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
106
|
+
# batch predictions
|
107
|
+
|
108
|
+
def test_create_without_features_smiles_and_inchi
|
109
|
+
["smiles", "inchi"].each do |type|
|
110
|
+
d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv"), true
|
111
|
+
assert_equal Dataset, d.class
|
112
|
+
refute_nil d.id
|
113
|
+
dataset = Dataset.find d.id
|
114
|
+
assert_equal 3, d.compounds.size
|
115
|
+
d.delete
|
116
|
+
end
|
112
117
|
end
|
113
118
|
|
114
|
-
|
119
|
+
# dataset operations
|
120
|
+
|
121
|
+
def test_folds
|
122
|
+
dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv")
|
123
|
+
dataset.folds(10).each do |fold|
|
124
|
+
fold.each do |d|
|
125
|
+
assert_operator d.compounds.size, :>=, d.compounds.uniq.size
|
126
|
+
end
|
127
|
+
assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size
|
128
|
+
assert_equal dataset.substances.size, fold.first.substances.size + fold.last.substances.size
|
129
|
+
assert_empty (fold.first.substances & fold.last.substances)
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
# serialisation
|
134
|
+
|
135
|
+
def test_to_csv
|
115
136
|
d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
|
116
137
|
refute_nil d.warnings
|
117
138
|
assert d.warnings.grep(/Duplicate compound/)
|
118
139
|
assert d.warnings.grep(/3, 5/)
|
119
140
|
assert_equal 6, d.features.size
|
120
|
-
assert_equal
|
141
|
+
assert_equal 5, d.compounds.uniq.size
|
121
142
|
assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size
|
122
|
-
assert_equal [["1", "1", "true", "true", "test", 1.1], ["1", "2", "false", "7.5", "test", 0.24], ["1", "3", "true", "5", "test", 3578.239], ["0", "4", "false", "false", "test", -2.35], ["1", "2", "true", "4", "test_2", 1], ["1", "2", "false", "false", "test", -1.5], ["1", nil, "false", nil, nil, 1.0]], d.data_entries
|
123
|
-
assert_equal "c1ccc[nH]1,1,,false,,,1.0", d.to_csv.split("\n")[7]
|
124
143
|
csv = CSV.parse(d.to_csv)
|
125
144
|
original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv")
|
126
145
|
csv.shift
|
127
146
|
original_csv.shift
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
147
|
+
original = {}
|
148
|
+
original_csv.each do |row|
|
149
|
+
c = Compound.from_smiles row.shift.strip
|
150
|
+
original[c.inchi] = row.collect{|v| v.strip}
|
151
|
+
end
|
152
|
+
serialized = {}
|
153
|
+
csv.each do |row|
|
154
|
+
c = Compound.from_smiles row.shift
|
155
|
+
serialized[c.inchi] = row
|
156
|
+
end
|
157
|
+
#puts serialized.to_yaml
|
158
|
+
original.each do |inchi,row|
|
159
|
+
row.each_with_index do |v,i|
|
133
160
|
if v.numeric?
|
134
|
-
assert_equal
|
161
|
+
assert_equal v.to_f, serialized[inchi][i].to_f
|
135
162
|
else
|
136
|
-
assert_equal
|
163
|
+
assert_equal v, serialized[inchi][i]
|
137
164
|
end
|
138
165
|
end
|
166
|
+
|
139
167
|
end
|
140
168
|
d.delete
|
141
169
|
end
|
142
170
|
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
csv.
|
151
|
-
assert_equal
|
152
|
-
|
153
|
-
|
171
|
+
# special cases/details
|
172
|
+
|
173
|
+
def test_dataset_accessors
|
174
|
+
d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
|
175
|
+
# create empty dataset
|
176
|
+
new_dataset = Dataset.find d.id
|
177
|
+
# get metadata
|
178
|
+
assert_match "multicolumn.csv", new_dataset.source
|
179
|
+
assert_equal "multicolumn", new_dataset.name
|
180
|
+
# get features
|
181
|
+
assert_equal 6, new_dataset.features.size
|
182
|
+
assert_equal 5, new_dataset.compounds.uniq.size
|
183
|
+
c = new_dataset.compounds.last
|
184
|
+
f = new_dataset.features.first
|
185
|
+
assert_equal ["1"], new_dataset.values(c,f)
|
186
|
+
f = new_dataset.features.last.id.to_s
|
187
|
+
assert_equal [1.0], new_dataset.values(c,f)
|
188
|
+
f = new_dataset.features[2]
|
189
|
+
assert_equal ["false"], new_dataset.values(c,f)
|
190
|
+
d.delete
|
191
|
+
end
|
192
|
+
|
193
|
+
def test_create_from_file_with_wrong_smiles_compound_entries
|
194
|
+
d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv")
|
195
|
+
refute_nil d.warnings
|
196
|
+
assert_match /2|3|4|5|6|7|8/, d.warnings.join
|
197
|
+
d.delete
|
154
198
|
end
|
155
199
|
|
156
200
|
def test_from_csv_classification
|
@@ -158,9 +202,9 @@ class DatasetTest < MiniTest::Test
|
|
158
202
|
d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv"
|
159
203
|
csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv")
|
160
204
|
csv.shift
|
161
|
-
|
162
|
-
|
163
|
-
assert_equal
|
205
|
+
csv.each do |row|
|
206
|
+
c = Compound.from_smiles row.shift
|
207
|
+
assert_equal row, d.values(c,d.features.first)
|
164
208
|
end
|
165
209
|
d.delete
|
166
210
|
end
|
@@ -169,7 +213,7 @@ class DatasetTest < MiniTest::Test
|
|
169
213
|
def test_from_csv2
|
170
214
|
File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") }
|
171
215
|
dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv"
|
172
|
-
assert_equal "Cannot parse SMILES compound '' at
|
216
|
+
assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.join
|
173
217
|
File.delete "#{DATA_DIR}/temp_test.csv"
|
174
218
|
dataset.features.each{|f| feature = Feature.find f.id; feature.delete}
|
175
219
|
dataset.delete
|
@@ -187,32 +231,50 @@ class DatasetTest < MiniTest::Test
|
|
187
231
|
datasets.each{|d| d.delete}
|
188
232
|
end
|
189
233
|
|
190
|
-
def
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
234
|
+
def test_simultanous_upload
|
235
|
+
threads = []
|
236
|
+
3.times do |t|
|
237
|
+
threads << Thread.new(t) do |up|
|
238
|
+
d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
239
|
+
assert_equal OpenTox::Dataset, d.class
|
240
|
+
assert_equal 1, d.features.size
|
241
|
+
assert_equal 85, d.compounds.size
|
242
|
+
csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
|
243
|
+
csv.shift
|
244
|
+
csv.each do |row|
|
245
|
+
c = Compound.from_smiles(row.shift)
|
246
|
+
assert_equal row, d.values(c,d.features.first)
|
247
|
+
end
|
248
|
+
d.delete
|
249
|
+
end
|
250
|
+
end
|
251
|
+
threads.each {|aThread| aThread.join}
|
204
252
|
end
|
205
253
|
|
206
|
-
def
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
254
|
+
def test_upload_feature_dataset
|
255
|
+
skip
|
256
|
+
t = Time.now
|
257
|
+
f = File.join DATA_DIR, "rat_feature_dataset.csv"
|
258
|
+
d = Dataset.from_csv_file f
|
259
|
+
assert_equal 458, d.features.size
|
260
|
+
d.save
|
261
|
+
#p "Upload: #{Time.now-t}"
|
262
|
+
d2 = Dataset.find d.id
|
263
|
+
t = Time.now
|
264
|
+
assert_equal d.features.size, d2.features.size
|
265
|
+
csv = CSV.read f
|
266
|
+
csv.shift # remove header
|
267
|
+
assert_empty d2.warnings
|
268
|
+
assert_equal csv.size, d2.compounds.size
|
269
|
+
assert_equal csv.first.size-1, d2.features.size
|
270
|
+
d2.compounds.each_with_index do |compound,i|
|
271
|
+
row = csv[i]
|
272
|
+
row.shift # remove compound
|
273
|
+
assert_equal row, d2.data_entries[i]
|
214
274
|
end
|
215
|
-
#
|
275
|
+
#p "Dowload: #{Time.now-t}"
|
276
|
+
d2.delete
|
277
|
+
assert_nil Dataset.find d.id
|
216
278
|
end
|
217
279
|
|
218
280
|
end
|
data/test/descriptor.rb
CHANGED
@@ -4,15 +4,17 @@ class DescriptorTest < MiniTest::Test
|
|
4
4
|
|
5
5
|
def test_list
|
6
6
|
# check available descriptors
|
7
|
-
assert_equal 355,PhysChem.descriptors.size,"incorrect number of physchem descriptors"
|
8
7
|
assert_equal 15,PhysChem.openbabel_descriptors.size,"incorrect number of Openbabel descriptors"
|
9
|
-
assert_equal 295,PhysChem.cdk_descriptors.size,"incorrect number of Cdk descriptors"
|
10
8
|
assert_equal 45,PhysChem.joelib_descriptors.size,"incorrect number of Joelib descriptors"
|
9
|
+
assert_equal 286,PhysChem.cdk_descriptors.size,"incorrect number of Cdk descriptors"
|
10
|
+
assert_equal 346,PhysChem.descriptors.size,"incorrect number of physchem descriptors"
|
11
11
|
end
|
12
12
|
|
13
13
|
def test_smarts
|
14
14
|
c = OpenTox::Compound.from_smiles "N=C=C1CCC(=F=FO)C1"
|
15
|
-
File.open("tmp.png","w+"){|f| f.puts c.png}
|
15
|
+
File.open("/tmp/tmp.png","w+"){|f| f.puts c.png}
|
16
|
+
assert_match /^PNG/,`file -b /tmp/tmp.png`
|
17
|
+
File.delete "/tmp/tmp.png"
|
16
18
|
s = Smarts.find_or_create_by(:smarts => "F=F")
|
17
19
|
result = c.smarts_match [s]
|
18
20
|
assert_equal [1], result
|
@@ -26,43 +28,50 @@ class DescriptorTest < MiniTest::Test
|
|
26
28
|
|
27
29
|
def test_compound_openbabel_single
|
28
30
|
c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
|
29
|
-
|
30
|
-
|
31
|
+
feature = PhysChem.find_or_create_by(:name => "Openbabel.logP")
|
32
|
+
result = c.calculate_properties([feature])
|
33
|
+
assert_equal 1.12518, result.first.round(5)
|
34
|
+
assert_equal 1.12518, c.properties[feature.id.to_s].round(5)
|
31
35
|
end
|
32
36
|
|
33
37
|
def test_compound_cdk_single
|
34
38
|
c = OpenTox::Compound.from_smiles "c1ccccc1"
|
35
|
-
|
36
|
-
|
39
|
+
feature = PhysChem.find_or_create_by(:name => "Cdk.AtomCount.nAtom")
|
40
|
+
result = c.calculate_properties([feature])
|
41
|
+
assert_equal 12, result.first
|
37
42
|
c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
|
38
|
-
|
39
|
-
|
43
|
+
feature = PhysChem.find_or_create_by(:name => "Cdk.AtomCount.nAtom")
|
44
|
+
result = c.calculate_properties([feature])
|
45
|
+
assert_equal 17, result.first
|
40
46
|
c_types = {"Cdk.CarbonTypes.C1SP1"=>1, "Cdk.CarbonTypes.C2SP1"=>0, "Cdk.CarbonTypes.C1SP2"=>0, "Cdk.CarbonTypes.C2SP2"=>1, "Cdk.CarbonTypes.C3SP2"=>0, "Cdk.CarbonTypes.C1SP3"=>2, "Cdk.CarbonTypes.C2SP3"=>1, "Cdk.CarbonTypes.C3SP3"=>1, "Cdk.CarbonTypes.C4SP3"=>0}
|
41
47
|
physchem_features = c_types.collect{|t,nr| PhysChem.find_or_create_by(:name => t)}
|
42
|
-
result = c.
|
43
|
-
assert_equal [1, 0, 0, 1, 0, 2, 1, 1, 0], result
|
48
|
+
result = c.calculate_properties physchem_features
|
49
|
+
assert_equal [1, 0, 0, 1, 0, 2, 1, 1, 0], result
|
44
50
|
end
|
45
51
|
|
46
52
|
def test_compound_joelib_single
|
47
53
|
c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
|
48
|
-
result = c.
|
49
|
-
assert_equal 2.65908, result.first
|
54
|
+
result = c.calculate_properties [PhysChem.find_or_create_by(:name => "Joelib.LogP")]
|
55
|
+
assert_equal 2.65908, result.first
|
50
56
|
end
|
51
57
|
|
52
58
|
def test_compound_all
|
53
59
|
c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
|
54
|
-
result = c.physchem PhysChem.descriptors
|
55
60
|
amr = PhysChem.find_or_create_by(:name => "Cdk.ALOGP.AMR", :library => "Cdk")
|
56
61
|
sbonds = PhysChem.find_by(:name => "Openbabel.sbonds")
|
57
|
-
|
58
|
-
assert_equal
|
62
|
+
result = c.calculate_properties([amr,sbonds])
|
63
|
+
assert_equal 30.8723, result[0]
|
64
|
+
assert_equal 5, result[1]
|
59
65
|
end
|
60
66
|
|
61
67
|
def test_compound_descriptor_parameters
|
68
|
+
PhysChem.descriptors
|
62
69
|
c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
|
63
|
-
result = c.
|
70
|
+
result = c.calculate_properties [ "Openbabel.logP", "Cdk.AtomCount.nAtom", "Joelib.LogP" ].collect{|d| PhysChem.find_or_create_by(:name => d)}
|
64
71
|
assert_equal 3, result.size
|
65
|
-
assert_equal
|
72
|
+
assert_equal 1.12518, result[0].round(5)
|
73
|
+
assert_equal 17.0, result[1].round(5)
|
74
|
+
assert_equal 2.65908, result[2].round(5)
|
66
75
|
end
|
67
76
|
|
68
77
|
end
|
data/test/error.rb
CHANGED
data/test/experiment.rb
CHANGED
@@ -5,7 +5,7 @@ class ExperimentTest < MiniTest::Test
|
|
5
5
|
def test_regression_experiment
|
6
6
|
skip
|
7
7
|
datasets = [
|
8
|
-
"EPAFHM.
|
8
|
+
"EPAFHM.medi_log10.csv",
|
9
9
|
#"EPAFHM.csv",
|
10
10
|
#"FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv",
|
11
11
|
"LOAEL_mmol_corrected_smiles.csv"
|
@@ -68,7 +68,7 @@ class ExperimentTest < MiniTest::Test
|
|
68
68
|
skip
|
69
69
|
#=begin
|
70
70
|
datasets = [
|
71
|
-
"EPAFHM.
|
71
|
+
"EPAFHM.medi_log10.csv",
|
72
72
|
#"LOAEL_mmol_corrected_smiles.csv"
|
73
73
|
]
|
74
74
|
min_sims = [0.3,0.7]
|
@@ -118,7 +118,7 @@ class ExperimentTest < MiniTest::Test
|
|
118
118
|
def test_mpd_fingerprints
|
119
119
|
skip
|
120
120
|
datasets = [
|
121
|
-
"EPAFHM.
|
121
|
+
"EPAFHM.medi_log10.csv",
|
122
122
|
]
|
123
123
|
types = ["FP2","MP2D"]
|
124
124
|
experiment = Experiment.create(
|
@@ -147,7 +147,7 @@ class ExperimentTest < MiniTest::Test
|
|
147
147
|
def test_multiple_datasets
|
148
148
|
skip
|
149
149
|
datasets = [
|
150
|
-
"EPAFHM.
|
150
|
+
"EPAFHM.medi_log10.csv",
|
151
151
|
"LOAEL_mmol_corrected_smiles.csv"
|
152
152
|
]
|
153
153
|
min_sims = [0.3]
|
data/test/feature.rb
CHANGED
@@ -32,10 +32,9 @@ class FeatureTest < MiniTest::Test
|
|
32
32
|
def test_duplicated_features
|
33
33
|
metadata = {
|
34
34
|
:name => "feature duplication test",
|
35
|
-
:nominal => true,
|
36
35
|
}
|
37
|
-
feature =
|
38
|
-
dup_feature =
|
36
|
+
feature = NumericFeature.find_or_create_by metadata
|
37
|
+
dup_feature = NumericFeature.find_or_create_by metadata
|
39
38
|
assert_kind_of Feature, feature
|
40
39
|
assert !feature.id.nil?, "No Feature ID in #{feature.inspect}"
|
41
40
|
assert !feature.id.nil?, "No Feature ID in #{dup_feature.inspect}"
|
data/test/gridfs.rb
ADDED