lazar 0.0.1
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
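The diffs below show the gem's test suite, which doubles as the only usage example in this release. As a quick orientation, here is a minimal, hypothetical sketch pieced together from calls that appear verbatim in the tests; the require line, the OpenTox include, and the DATA_DIR constant are assumptions borrowed from the gem layout and test setup, not documented API:

require "lazar"      # assumption: entry point provided by data/lib/lazar.rb
include OpenTox      # assumption: the tests use both OpenTox::Dataset and plain Dataset

# Import a CSV training set (SMILES column followed by feature columns),
# as exercised in data/test/dataset.rb below.
d = Dataset.from_csv_file File.join(DATA_DIR, "hamster_carcinogenicity.csv")
d.compounds.size     # => 85 in the tests: one compound per data row
d.features.size      # => 1: one feature per value column
d.data_entries.first # feature values of the first compound
d.warnings           # import warnings, e.g. for unparseable SMILES
d.delete             # remove the stored dataset again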
data/test/data/wrong_dataset.csv
ADDED
@@ -0,0 +1,8 @@
+SMILES,Wrong Dataset
+Tost,0
+Is,1
+A,0
+Wrong,1
+Dataset,0
+Entry,1
+O[C@@H]8[C@@H](O)[C@@H]1O[C@H](CO)[C@H]8O[C@H]7O[C@H](CO)[C@@H](O[C@H]6O[C@H](CO)[C@@H](O[C@H]5O[C@H](CO)[C@@H](O[C@H]4O[C@H](CO)[C@@H](O[C@H]3O[C@H](CO)[C@@H](O[C@H]2O[C@H](CO)[C@@H](O1)[C@H](O)[C@H]2O)[C@H](O)[C@H]3O)[C@H](O)[C@H]4O)[C@H](O)[C@H]5O)[C, 0
data/test/dataset-long.rb
ADDED
@@ -0,0 +1,117 @@
+require_relative "setup.rb"
+
+class DatasetLongTest < MiniTest::Test
+
+  def test_01_upload_epafhm
+    f = File.join DATA_DIR, "EPAFHM.csv"
+    d = OpenTox::Dataset.from_csv_file f
+    csv = CSV.read f
+    assert_equal csv.size-1, d.compounds.size
+    assert_equal csv.first.size-1, d.features.size
+    assert_equal csv.size-1, d.data_entries.size
+    d.delete
+  end
+
+=begin
+  # TODO catch OpenBabel segfaults and identify/remove cause
+  def test_02_upload_multicell
+    duplicates = [
+      "http://localhost:8082/compound/InChI=1S/C6HCl5O/c7-1-2(8)4(10)6(12)5(11)3(1)9/h12H",
+      "http://localhost:8082/compound/InChI=1S/C12H8Cl6O/c13-8-9(14)11(16)5-3-1-2(6-7(3)19-6)4(5)10(8,15)12(11,17)18/h2-7H,1H2",
+      "http://localhost:8082/compound/InChI=1S/C2HCl3/c3-1-2(4)5/h1H",
+      "http://localhost:8082/compound/InChI=1S/C4H5Cl/c1-3-4(2)5/h3H,1-2H2",
+      "http://localhost:8082/compound/InChI=1S/C4H7Cl/c1-4(2)3-5/h1,3H2,2H3",
+      "http://localhost:8082/compound/InChI=1S/C8H14O4/c1-5-4-8(11-6(2)9)12-7(3)10-5/h5,7-8H,4H2,1-3H3",
+      "http://localhost:8082/compound/InChI=1S/C19H30O5/c1-3-5-7-20-8-9-21-10-11-22-14-17-13-19-18(23-15-24-19)12-16(17)6-4-2/h12-13H,3-11,14-15H2,1-2H3",
+    ]
+    errors = ['O=P(H)(OC)OC', 'C=CCNN.HCl' ]
+    f = File.join DATA_DIR, "multi_cell_call.csv"
+    d = OpenTox::Dataset.from_csv_file f
+    csv = CSV.read f
+    assert_equal true, d.features.first.nominal
+    assert_nil d["index"]
+    assert_equal csv.size-1-errors.size, d.compounds.size
+    assert_equal csv.first.size-1, d.features.size
+    assert_equal csv.size-1-errors.size, d.data_entries.size
+    p d.warnings
+    (duplicates+errors).each do |uri|
+      assert d.warnings.grep %r{#{uri}}
+    end
+    d.delete
+  end
+=end
+
+  def test_03_upload_isscan
+    f = File.join DATA_DIR, "ISSCAN-multi.csv"
+    d = OpenTox::Dataset.from_csv_file f
+    csv = CSV.read f
+    assert_equal csv.size-1, d.compounds.size
+    assert_equal csv.first.size-1, d.features.size
+    assert_equal csv.size-1, d.data_entries.size
+    d.delete
+    #assert_equal false, URI.accessible?(d.uri)
+  end
+
+  def test_04_simultanous_upload
+    threads = []
+    3.times do |t|
+      threads << Thread.new(t) do |up|
+        d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+        assert_equal OpenTox::Dataset, d.class
+        assert_equal 1, d.features.size
+        assert_equal 85, d.compounds.size
+        assert_equal 85, d.data_entries.size
+        csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
+        csv.shift
+        assert_equal csv.collect{|r| r[1]}, d.data_entries.flatten
+        d.delete
+      end
+    end
+    threads.each {|aThread| aThread.join}
+  end
+
+  def test_05_upload_kazius
+    f = File.join DATA_DIR, "kazius.csv"
+    d = OpenTox::Dataset.from_csv_file f
+    csv = CSV.read f
+    assert_equal csv.size-1, d.compounds.size
+    assert_equal csv.first.size-1, d.features.size
+    assert_equal csv.size-1, d.data_entries.size
+    assert_empty d.warnings
+    # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1
+    c = d.compounds[491]
+    assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC"
+    assert_equal d.data_entries[491][0], "1"
+    d.delete
+  end
+
+  def test_upload_feature_dataset
+    t = Time.now
+    f = File.join DATA_DIR, "rat_feature_dataset.csv"
+    d = Dataset.from_csv_file f
+    assert_equal 458, d.features.size
+    d.save
+    p "Upload: #{Time.now-t}"
+    d2 = Dataset.find d.id
+    t = Time.now
+    assert_equal d.features.size, d2.features.size
+    csv = CSV.read f
+    csv.delete_at(248) # remove entry with InChi segfault
+    csv.shift # remove header
+    refute_empty d2.warnings
+    assert_match /249/, d2.warnings.join
+    assert_equal csv.size, d2.compounds.size
+    assert_equal csv.first.size-1, d2.features.size
+    d2.compounds.each_with_index do |compound,i|
+      row = csv[i]
+      row.shift # remove compound
+      assert_equal row, d2.data_entries[i]
+    end
+    p "Dowload: #{Time.now-t}"
+    d2.delete
+    assert_raises Mongoid::Errors::DocumentNotFound do
+      Dataset.find d.id
+    end
+  end
+
+end
data/test/dataset.rb
ADDED
@@ -0,0 +1,199 @@
+# TODO; check compound/data_entry sequences with missing and duplicated values
+
+require_relative "setup.rb"
+
+class DatasetTest < MiniTest::Test
+
+  def test_all
+    d1 = Dataset.new
+    d1.save
+    datasets = Dataset.all
+    assert_equal Dataset, datasets.first.class
+    d1.delete
+  end
+
+  def test_create_empty
+    d = Dataset.new
+    assert_equal Dataset, d.class
+    refute_nil d.id
+    assert_kind_of BSON::ObjectId, d.id
+  end
+
+  def test_client_create
+    d = Dataset.new
+    assert_equal Dataset, d.class
+    d.name = "Create dataset test"
+
+    # features not set
+    # << operator was removed for efficiency reasons (CH)
+    #assert_raises BadRequestError do
+    #  d << [Compound.from_smiles("c1ccccc1NN"), 1,2]
+    #end
+
+    # add data entries
+    d.features = ["test1", "test2"].collect do |title|
+      f = Feature.new
+      f.name = title
+      f.numeric = true
+      f.save
+      f
+    end
+
+    # wrong feature size
+    # << operator was removed for efficiency reasons (CH)
+    #assert_raises BadRequestError do
+    #  d << [Compound.from_smiles("c1ccccc1NN"), 1,2,3]
+    #end
+
+    # manual low-level insertions without consistency checks for runtime efficiency
+    data_entries = []
+    d.compound_ids << Compound.from_smiles("c1ccccc1NN").id
+    data_entries << [1,2]
+    d.compound_ids << Compound.from_smiles("CC(C)N").id
+    data_entries << [4,5]
+    d.compound_ids << Compound.from_smiles("C1C(C)CCCC1").id
+    data_entries << [6,7]
+    d.data_entries = data_entries
+    assert_equal 3, d.compounds.size
+    assert_equal 2, d.features.size
+    assert_equal [[1,2],[4,5],[6,7]], d.data_entries
+    d.save_all
+    # check if dataset has been saved correctly
+    new_dataset = Dataset.find d.id
+    assert_equal 3, new_dataset.compounds.size
+    assert_equal 2, new_dataset.features.size
+    assert_equal [[1,2],[4,5],[6,7]], new_dataset.data_entries
+    d.delete
+    assert_raises Mongoid::Errors::DocumentNotFound do
+      Dataset.find d.id
+    end
+    assert_raises Mongoid::Errors::DocumentNotFound do
+      Dataset.find new_dataset.id
+    end
+  end
+
+  def test_dataset_accessors
+    d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
+    # create empty dataset
+    new_dataset = Dataset.find d.id
+    # get metadata
+    assert_match "multicolumn.csv", new_dataset.source
+    assert_equal "multicolumn.csv", new_dataset.title
+    # get features
+    assert_equal 6, new_dataset.features.size
+    assert_equal 7, new_dataset.compounds.size
+    assert_equal ["1", nil, "false", nil, nil, 1.0], new_dataset.data_entries.last
+    d.delete
+  end
+
+  def test_create_from_file
+    d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
+    assert_equal Dataset, d.class
+    refute_nil d.warnings
+    assert_match "EPAFHM.mini.csv", d.source
+    assert_equal "EPAFHM.mini.csv", d.name
+    d.delete
+    #assert_equal false, URI.accessible?(d.uri)
+  end
+
+  def test_create_from_file_with_wrong_smiles_compound_entries
+    d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv")
+    refute_nil d.warnings
+    assert_match /2|3|4|5|6|7|8/, d.warnings.join
+    d.delete
+  end
+
+  def test_multicolumn_csv
+    d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
+    refute_nil d.warnings
+    assert d.warnings.grep(/Duplicate compound/)
+    assert d.warnings.grep(/3, 5/)
+    assert_equal 6, d.features.size
+    assert_equal 7, d.compounds.size
+    assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size
+    assert_equal [["1", "1", "true", "true", "test", 1.1], ["1", "2", "false", "7.5", "test", 0.24], ["1", "3", "true", "5", "test", 3578.239], ["0", "4", "false", "false", "test", -2.35], ["1", "2", "true", "4", "test_2", 1], ["1", "2", "false", "false", "test", -1.5], ["1", nil, "false", nil, nil, 1.0]], d.data_entries
+    assert_equal "c1ccc[nH]1,1,,false,,,1.0", d.to_csv.split("\n")[7]
+    csv = CSV.parse(d.to_csv)
+    original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv")
+    csv.shift
+    original_csv.shift
+    csv.each_with_index do |row,i|
+      compound = Compound.from_smiles row.shift
+      original_compound = Compound.from_smiles original_csv[i].shift
+      assert_equal original_compound.inchi, compound.inchi
+      row.each_with_index do |v,j|
+        if v.numeric?
+          assert_equal original_csv[i][j].strip.to_f, row[j].to_f
+        else
+          assert_equal original_csv[i][j].strip, row[j].to_s
+        end
+      end
+    end
+    d.delete
+  end
+
+  def test_from_csv
+    d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+    assert_equal Dataset, d.class
+    assert_equal 1, d.features.size
+    assert_equal 85, d.compounds.size
+    assert_equal 85, d.data_entries.size
+    csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
+    csv.shift
+    assert_equal csv.collect{|r| r[1]}, d.data_entries.flatten
+    d.delete
+    #assert_equal false, URI.accessible?(d.uri)
+  end
+
+  def test_from_csv_classification
+    ["int", "float", "string"].each do |mode|
+      d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv"
+      csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv")
+      csv.shift
+      entries = d.data_entries.flatten
+      csv.each_with_index do |r, i|
+        assert_equal r[1].to_s, entries[i]
+      end
+      d.delete
+    end
+  end
+
+  def test_from_csv2
+    File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") }
+    dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv"
+    assert_equal "Cannot parse SMILES compound ' ' at position 3, all entries are ignored.", dataset.warnings.join
+    File.delete "#{DATA_DIR}/temp_test.csv"
+    dataset.features.each{|f| feature = Feature.find f.id; feature.delete}
+    dataset.delete
+  end
+
+  def test_same_feature
+    datasets = []
+    features = []
+    2.times do |i|
+      d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.mini.csv"
+      features << d.features.first
+      assert features[0].id==features[-1].id,"re-upload should find old feature, but created new one"
+      datasets << d
+    end
+    datasets.each{|d| d.delete}
+  end
+
+  def test_create_from_file
+    d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
+    assert_equal Dataset, d.class
+    refute_nil d.warnings
+    assert_match /row 13/, d.warnings.join
+    assert_match "EPAFHM.mini.csv", d.source
+    assert_equal 1, d.features.size
+    feature = d.features.first
+    assert_kind_of NumericBioAssay, feature
+    assert_equal 0.0113, d.data_entries[0][0]
+    assert_equal 0.00323, d.data_entries[5][0]
+    d2 = Dataset.find d.id
+    assert_equal 0.0113, d2.data_entries[0][0]
+    assert_equal 0.00323, d2.data_entries[5][0]
+  end
+
+end
+
data/test/descriptor-long.rb
ADDED
@@ -0,0 +1,26 @@
+require_relative "setup.rb"
+class DescriptorLongTest < MiniTest::Test
+
+  def test_dataset_all
+    # TODO: improve CDK descriptor calculation speed or add timeout
+    skip "CDK descriptor calculation takes too long for some compounds"
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
+    d = OpenTox::Algorithm::Descriptor.physchem dataset
+    assert_equal dataset.compounds, d.compounds
+    assert_equal 332, d.features.size
+    assert_equal 332, d.data_entries.first.size
+    d.delete
+  end
+
+  def test_dataset_openbabel
+    # TODO: improve CDK descriptor calculation speed or add timeout
+    dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
+    d = Algorithm::Descriptor.physchem dataset, Algorithm::Descriptor::OBDESCRIPTORS.keys
+    assert_equal dataset.compounds, d.compounds
+    size = Algorithm::Descriptor::OBDESCRIPTORS.keys.size
+    assert_equal size, d.features.size
+    assert_equal size, d.data_entries.first.size
+    d.delete
+  end
+
+end
data/test/descriptor.rb
ADDED
@@ -0,0 +1,83 @@
+require_relative "setup.rb"
+
+class DescriptorTest < MiniTest::Test
+
+  def test_list
+    # check available descriptors
+    @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys
+    assert_equal 111,@descriptors.size,"wrong num physchem descriptors"
+    @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES
+    assert_equal 356,@descriptor_values.size,"wrong num physchem descriptors"
+    sum = 0
+    [ @descriptors, @descriptor_values ].each do |desc|
+      {"Openbabel"=>16,"Cdk"=>(desc==@descriptors ? 50 : 295),"Joelib"=>45}.each do |k,v|
+        assert_equal v,desc.select{|x| x=~/^#{k}\./}.size,"wrong num #{k} descriptors"
+        sum += v
+      end
+    end
+    assert_equal (111+356),sum
+  end
+
+  def test_smarts
+    c = OpenTox::Compound.from_smiles "N=C=C1CCC(=F=FO)C1"
+    File.open("tmp.png","w+"){|f| f.puts c.png}
+    s = Smarts.find_or_create_by(:smarts => "F=F")
+    result = OpenTox::Algorithm::Descriptor.smarts_match c, s
+    assert_equal [1], result
+    smarts = ["CC", "C", "C=C", "CO", "F=F", "C1CCCC1", "NN"].collect{|s| Smarts.find_or_create_by(:smarts => s)}
+    result = OpenTox::Algorithm::Descriptor.smarts_match c, smarts
+    assert_equal [1, 1, 1, 0, 1, 1, 0], result
+    smarts_count = [10, 6, 2, 0, 2, 10, 0]
+    result = OpenTox::Algorithm::Descriptor.smarts_count c, smarts
+    assert_equal smarts_count, result
+  end
+
+  def test_compound_openbabel_single
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Openbabel.logP"]
+    assert_equal 1.12518, result.first
+  end
+
+  def test_compound_cdk_single
+    c = OpenTox::Compound.from_smiles "c1ccccc1"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"]
+    assert_equal [12], result
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"]
+    assert_equal [17], result
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.CarbonTypes"]
+    c_types = {"Cdk.CarbonTypes.C1SP1"=>1, "Cdk.CarbonTypes.C2SP1"=>0, "Cdk.CarbonTypes.C1SP2"=>0, "Cdk.CarbonTypes.C2SP2"=>1, "Cdk.CarbonTypes.C3SP2"=>0, "Cdk.CarbonTypes.C1SP3"=>2, "Cdk.CarbonTypes.C2SP3"=>1, "Cdk.CarbonTypes.C3SP3"=>1, "Cdk.CarbonTypes.C4SP3"=>0}
+    assert_equal [1, 0, 0, 1, 0, 2, 1, 1, 0], result
+  end
+
+  def test_compound_joelib_single
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Joelib.LogP"]
+    assert_equal [2.65908], result
+  end
+
+  def test_compound_all
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c
+    assert_equal 332, result.size
+    assert_equal 30.8723, result[2]
+    assert_equal 1.12518, result[328]
+  end
+
+  def test_compound_descriptor_parameters
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ]#, true
+    assert_equal 12, result.size
+    assert_equal [1.12518, 17.0, 1, 0, 0, 1, 0, 2, 1, 1, 0, 2.65908], result#.last
+  end
+
+  def test_dataset_descriptor_parameters
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
+    d = OpenTox::Algorithm::Descriptor.physchem dataset, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ]
+    assert_kind_of Dataset, d
+    assert_equal dataset.compounds, d.compounds
+    assert_equal dataset.compounds.size, d.data_entries.size
+    assert_equal 12, d.data_entries.first.size
+  end
+
+end
data/test/error.rb
ADDED
@@ -0,0 +1,24 @@
+require_relative "setup.rb"
+
+class ErrorTest < MiniTest::Test
+
+  def test_bad_request
+    object = OpenTox::Feature.new
+    assert_raises Mongoid::Errors::DocumentNotFound do
+      response = OpenTox::Feature.find(object.id)
+    end
+  end
+
+  def test_error_methods
+    assert_raises OpenTox::ResourceNotFoundError do
+      resource_not_found_error "This is a test"
+    end
+  end
+
+  def test_exception
+    assert_raises Exception do
+      raise Exception.new "Basic Exception"
+    end
+  end
+
+end
data/test/feature.rb
ADDED
@@ -0,0 +1,65 @@
+require_relative "setup.rb"
+
+class FeatureTest < MiniTest::Test
+
+  def test_opentox_feature
+    @feature = OpenTox::Feature.create(:name => "tost")
+    assert_equal true, OpenTox::Feature.where(name: "tost").exists?, "#{@feature.id} is not accessible."
+    assert_equal true, OpenTox::Feature.where(id: @feature.id).exists?, "#{@feature.id} is not accessible."
+
+    list = OpenTox::Feature.all
+    listsize1 = list.length
+    assert_equal true, list.collect{|f| f.id}.include?(@feature.id)
+    # modify feature
+    @feature2 = OpenTox::Feature.find(@feature.id)
+    assert_equal "tost", @feature2[:name]
+    assert_equal "tost", @feature2.name
+    assert_kind_of Feature, @feature2
+
+    @feature2[:name] = "feature2"
+    @feature2.save
+    list = OpenTox::Feature.all
+    listsize2 = list.length
+    assert_match "feature2", @feature2.name
+    refute_match "tost", @feature2.name
+    assert_equal listsize1, listsize2
+
+    id = @feature2.id
+    @feature2.delete
+    assert_raises Mongoid::Errors::DocumentNotFound do
+      OpenTox::Feature.find(id)
+    end
+  end
+
+  def test_duplicated_features
+    metadata = {
+      :name => "feature duplication test",
+      :nominal => true,
+      :description => "feature duplication test"
+    }
+    feature = NumericBioAssay.find_or_create_by metadata
+    dup_feature = NumericBioAssay.find_or_create_by metadata
+    assert_kind_of Feature, feature
+    assert !feature.id.nil?, "No Feature ID in #{feature.inspect}"
+    assert !feature.id.nil?, "No Feature ID in #{dup_feature.inspect}"
+    assert_equal feature.id, dup_feature.id
+    feature.delete
+    assert_raises Mongoid::Errors::DocumentNotFound do
+      OpenTox::Feature.find(feature.id)
+    end
+    assert_raises Mongoid::Errors::DocumentNotFound do
+      OpenTox::Feature.find(dup_feature.id)
+    end
+  end
+
+  def test_smarts_feature
+    feature = Smarts.find_or_create_by(:smarts => "CN")
+    assert feature.smarts, "CN"
+    assert_kind_of Smarts, feature
+    feature.smarts = 'cc'
+    assert feature.smarts, "cc"
+    original = Feature.where(:smarts => 'CN').first
+    assert original.smarts, "CN"
+  end
+
+end
data/test/fminer-long.rb
ADDED
@@ -0,0 +1,38 @@
+require_relative "setup.rb"
+
+class FminerTest < MiniTest::Test
+
+  def test_fminer_multicell
+    #skip "multicell segfaults"
+    # TODO aborts, probably fminer
+    # or OpenBabel segfault
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv")
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
+    p feature_dataset.training_parameters
+    assert_equal dataset.compound_ids, feature_dataset.compound_ids
+    dataset.delete
+    feature_dataset.delete
+  end
+
+  def test_fminer_isscan
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
+    assert_equal feature_dataset.compounds.size, dataset.compounds.size
+    p feature_dataset.features.size
+    p feature_dataset.training_parameters
+    dataset.delete
+    feature_dataset.delete
+  end
+
+  def test_fminer_kazius
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
+    # TODO reactivate default settings
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
+    assert_equal feature_dataset.compounds.size, dataset.compounds.size
+    feature_dataset = Dataset.find feature_dataset.id
+    assert feature_dataset.data_entries.size, dataset.compounds.size
+    dataset.delete
+    feature_dataset.delete
+  end
+
+end
data/test/fminer.rb
ADDED
@@ -0,0 +1,52 @@
+require_relative "setup.rb"
+
+class FminerTest < MiniTest::Test
+
+  def test_fminer_bbrc
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    refute_nil dataset.id
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset
+    feature_dataset = Dataset.find feature_dataset.id
+    assert_equal dataset.compounds.size, feature_dataset.compounds.size
+    # TODO: fminer calculates 62 instead of 54 features
+    # it is unclear which commit changed the numbers (occurs with old libraries/mongodb branch too
+    # modification of Compound to use smiles instead of inchis seems to have no effect
+    #assert_equal 54, feature_dataset.features.size
+    #assert_equal "C-C-C=C", feature_dataset.features.first.smarts
+    compounds = feature_dataset.compounds
+    smarts = feature_dataset.features
+    smarts.each do |smart|
+      assert smart.p_value.round(2) >= 0.95
+    end
+    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
+    feature_dataset.data_entries.each_with_index do |fingerprint,i|
+      assert_equal match[i], fingerprint
+    end
+
+    dataset.delete
+    feature_dataset.delete
+  end
+
+  def test_fminer_last
+    skip "last features have to be activated"
+    dataset = OpenTox::Dataset.new
+    dataset.upload File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    feature_dataset = OpenTox::Algorithm::Fminer.last :dataset => dataset
+    assert_equal dataset.compounds.size, feature_dataset.compounds.size
+    assert_equal 21, feature_dataset.features.size
+    assert_equal '[#6&A]-[#6&a]:[#6&a]:[#6&a]:[#6&a]:[#6&a]', feature_dataset.features.first.smarts
+
+    compounds = feature_dataset.compounds
+    smarts = feature_dataset.features.collect{|f| f.smarts}
+    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
+    compounds.each_with_index do |c,i|
+      smarts.each_with_index do |s,j|
+        assert_equal match[i][j], feature_dataset.data_entries[i][j].to_i
+      end
+    end
+
+    dataset.delete
+    feature_dataset.delete
+  end
+
+end
data/test/lazar-fminer.rb
ADDED
@@ -0,0 +1,50 @@
+require_relative "setup.rb"
+
+class LazarFminerTest < MiniTest::Test
+
+  def test_lazar_fminer
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    model = Model::LazarFminerClassification.create training_dataset#, feature_dataset
+    feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
+    assert_equal training_dataset.compounds.size, feature_dataset.compounds.size
+    #TODO check fminer features, see fminer.rb
+    #assert_equal 54, feature_dataset.features.size
+    feature_dataset.data_entries.each do |e|
+      assert_equal e.size, feature_dataset.features.size
+    end
+    #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts
+
+    [ {
+      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
+      :prediction => "false",
+      :confidence => 0.25281385281385277,
+      :nr_neighbors => 11
+    },{
+      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
+      :prediction => "false",
+      :confidence => 0.3639589577089577,
+      :nr_neighbors => 14
+    }, {
+      :compound => Compound.from_smiles('OCCCCCCCC\C=C/CCCCCCCC'),
+      :prediction => "false",
+      :confidence => 0.5555555555555556,
+      :nr_neighbors => 1
+    }].each do |example|
+      prediction = model.predict example[:compound]
+
+      assert_equal example[:prediction], prediction[:value]
+      #assert_equal example[:confidence], prediction[:confidence]
+      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
+    end
+
+    # make a dataset prediction
+    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
+    prediction = model.predict compound_dataset
+    assert_equal compound_dataset.compounds, prediction.compounds
+
+    assert_equal "Cound not find similar compounds.", prediction.data_entries[7][2]
+    assert_equal "measured", prediction.data_entries[14][1]
+    # cleanup
+    [training_dataset,model,feature_dataset,compound_dataset].each{|o| o.delete}
+  end
+end
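For reference, lazar-fminer.rb above exercises the complete modelling workflow. A condensed, hypothetical sketch of that workflow, using only API calls that appear verbatim in the test (the require/include lines and DATA_DIR are assumptions, as noted earlier):

require "lazar"   # assumption: gem entry point
include OpenTox   # assumption: as in the test setup

training_dataset = Dataset.from_csv_file File.join(DATA_DIR, "hamster_carcinogenicity.csv")
model = Model::LazarFminerClassification.create training_dataset

# Predict a single compound; the test reads :value, :confidence and :neighbors.
prediction = model.predict Compound.from_smiles("c1ccccc1NN")
prediction[:value]       # predicted class, e.g. "false"
prediction[:confidence]  # prediction confidence
prediction[:neighbors]   # similar training compounds used for the prediction

# Predict a whole dataset; the result exposes compounds and data_entries like a Dataset.
dataset_prediction = model.predict Dataset.from_csv_file(File.join(DATA_DIR, "EPAFHM.mini.csv"))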