lazar 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
@@ -0,0 +1,8 @@
|
|
1
|
+
SMILES,Wrong Dataset
|
2
|
+
Tost,0
|
3
|
+
Is,1
|
4
|
+
A,0
|
5
|
+
Wrong,1
|
6
|
+
Dataset,0
|
7
|
+
Entry,1
|
8
|
+
O[C@@H]8[C@@H](O)[C@@H]1O[C@H](CO)[C@H]8O[C@H]7O[C@H](CO)[C@@H](O[C@H]6O[C@H](CO)[C@@H](O[C@H]5O[C@H](CO)[C@@H](O[C@H]4O[C@H](CO)[C@@H](O[C@H]3O[C@H](CO)[C@@H](O[C@H]2O[C@H](CO)[C@@H](O1)[C@H](O)[C@H]2O)[C@H](O)[C@H]3O)[C@H](O)[C@H]4O)[C@H](O)[C@H]5O)[C, 0
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require_relative "setup.rb"

# Long-running dataset upload tests: large CSV files and concurrent uploads.
class DatasetLongTest < MiniTest::Test

  def test_01_upload_epafhm
    path = File.join DATA_DIR, "EPAFHM.csv"
    dataset = OpenTox::Dataset.from_csv_file path
    rows = CSV.read path
    # one header row; one SMILES column per data row
    assert_equal rows.size - 1, dataset.compounds.size
    assert_equal rows.first.size - 1, dataset.features.size
    assert_equal rows.size - 1, dataset.data_entries.size
    dataset.delete
  end

=begin
  # TODO catch OpenBabel segfaults and identify/remove cause
  def test_02_upload_multicell
    duplicates = [
      "http://localhost:8082/compound/InChI=1S/C6HCl5O/c7-1-2(8)4(10)6(12)5(11)3(1)9/h12H",
      "http://localhost:8082/compound/InChI=1S/C12H8Cl6O/c13-8-9(14)11(16)5-3-1-2(6-7(3)19-6)4(5)10(8,15)12(11,17)18/h2-7H,1H2",
      "http://localhost:8082/compound/InChI=1S/C2HCl3/c3-1-2(4)5/h1H",
      "http://localhost:8082/compound/InChI=1S/C4H5Cl/c1-3-4(2)5/h3H,1-2H2",
      "http://localhost:8082/compound/InChI=1S/C4H7Cl/c1-4(2)3-5/h1,3H2,2H3",
      "http://localhost:8082/compound/InChI=1S/C8H14O4/c1-5-4-8(11-6(2)9)12-7(3)10-5/h5,7-8H,4H2,1-3H3",
      "http://localhost:8082/compound/InChI=1S/C19H30O5/c1-3-5-7-20-8-9-21-10-11-22-14-17-13-19-18(23-15-24-19)12-16(17)6-4-2/h12-13H,3-11,14-15H2,1-2H3",
    ]
    errors = ['O=P(H)(OC)OC', 'C=CCNN.HCl' ]
    f = File.join DATA_DIR, "multi_cell_call.csv"
    d = OpenTox::Dataset.from_csv_file f
    csv = CSV.read f
    assert_equal true, d.features.first.nominal
    assert_nil d["index"]
    assert_equal csv.size-1-errors.size, d.compounds.size
    assert_equal csv.first.size-1, d.features.size
    assert_equal csv.size-1-errors.size, d.data_entries.size
    p d.warnings
    (duplicates+errors).each do |uri|
      assert d.warnings.grep %r{#{uri}}
    end
    d.delete
  end
=end

  def test_03_upload_isscan
    path = File.join DATA_DIR, "ISSCAN-multi.csv"
    dataset = OpenTox::Dataset.from_csv_file path
    rows = CSV.read path
    assert_equal rows.size - 1, dataset.compounds.size
    assert_equal rows.first.size - 1, dataset.features.size
    assert_equal rows.size - 1, dataset.data_entries.size
    dataset.delete
    #assert_equal false, URI.accessible?(d.uri)
  end

  def test_04_simultanous_upload
    # upload the same file from three threads to check concurrent inserts
    threads = []
    3.times do |t|
      threads << Thread.new(t) do |up|
        dataset = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
        assert_equal OpenTox::Dataset, dataset.class
        assert_equal 1, dataset.features.size
        assert_equal 85, dataset.compounds.size
        assert_equal 85, dataset.data_entries.size
        rows = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
        rows.shift
        assert_equal rows.collect { |r| r[1] }, dataset.data_entries.flatten
        dataset.delete
      end
    end
    threads.each { |thread| thread.join }
  end

  def test_05_upload_kazius
    path = File.join DATA_DIR, "kazius.csv"
    dataset = OpenTox::Dataset.from_csv_file path
    rows = CSV.read path
    assert_equal rows.size - 1, dataset.compounds.size
    assert_equal rows.first.size - 1, dataset.features.size
    assert_equal rows.size - 1, dataset.data_entries.size
    assert_empty dataset.warnings
    # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1
    compound = dataset.compounds[491]
    assert_equal compound.smiles, "COc1cc(Cl)c(cc1Cl)OC"
    assert_equal dataset.data_entries[491][0], "1"
    dataset.delete
  end

  def test_upload_feature_dataset
    start = Time.now
    path = File.join DATA_DIR, "rat_feature_dataset.csv"
    dataset = Dataset.from_csv_file path
    assert_equal 458, dataset.features.size
    dataset.save
    p "Upload: #{Time.now - start}"
    reloaded = Dataset.find dataset.id
    start = Time.now
    assert_equal dataset.features.size, reloaded.features.size
    rows = CSV.read path
    rows.delete_at(248) # remove entry with InChi segfault
    rows.shift # remove header
    refute_empty reloaded.warnings
    assert_match(/249/, reloaded.warnings.join)
    assert_equal rows.size, reloaded.compounds.size
    assert_equal rows.first.size - 1, reloaded.features.size
    reloaded.compounds.each_with_index do |compound, i|
      row = rows[i]
      row.shift # remove compound
      assert_equal row, reloaded.data_entries[i]
    end
    p "Dowload: #{Time.now - start}"
    reloaded.delete
    assert_raises Mongoid::Errors::DocumentNotFound do
      Dataset.find dataset.id
    end
  end

end
|
data/test/dataset.rb
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
# TODO; check compound/data_entry sequences with missing and duplicated values

require_relative "setup.rb"

# Unit tests for Dataset CRUD, CSV import/export and feature handling.
class DatasetTest < MiniTest::Test

  def test_all
    d1 = Dataset.new
    d1.save
    datasets = Dataset.all
    assert_equal Dataset, datasets.first.class
    d1.delete
  end

  def test_create_empty
    d = Dataset.new
    assert_equal Dataset, d.class
    refute_nil d.id
    assert_kind_of BSON::ObjectId, d.id
  end

  def test_client_create
    d = Dataset.new
    assert_equal Dataset, d.class
    d.name = "Create dataset test"

    # features not set
    # << operator was removed for efficiency reasons (CH)
    #assert_raises BadRequestError do
    #  d << [Compound.from_smiles("c1ccccc1NN"), 1,2]
    #end

    # add data entries
    d.features = ["test1", "test2"].collect do |title|
      f = Feature.new
      f.name = title
      f.numeric = true
      f.save
      f
    end

    # wrong feature size
    # << operator was removed for efficiency reasons (CH)
    #assert_raises BadRequestError do
    #  d << [Compound.from_smiles("c1ccccc1NN"), 1,2,3]
    #end

    # manual low-level insertions without consistency checks for runtime efficiency
    data_entries = []
    d.compound_ids << Compound.from_smiles("c1ccccc1NN").id
    data_entries << [1,2]
    d.compound_ids << Compound.from_smiles("CC(C)N").id
    data_entries << [4,5]
    d.compound_ids << Compound.from_smiles("C1C(C)CCCC1").id
    data_entries << [6,7]
    d.data_entries = data_entries
    assert_equal 3, d.compounds.size
    assert_equal 2, d.features.size
    assert_equal [[1,2],[4,5],[6,7]], d.data_entries
    d.save_all
    # check if dataset has been saved correctly
    new_dataset = Dataset.find d.id
    assert_equal 3, new_dataset.compounds.size
    assert_equal 2, new_dataset.features.size
    assert_equal [[1,2],[4,5],[6,7]], new_dataset.data_entries
    d.delete
    assert_raises Mongoid::Errors::DocumentNotFound do
      Dataset.find d.id
    end
    assert_raises Mongoid::Errors::DocumentNotFound do
      Dataset.find new_dataset.id
    end
  end

  def test_dataset_accessors
    d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
    # reload the stored dataset
    new_dataset = Dataset.find d.id
    # get metadata
    assert_match "multicolumn.csv", new_dataset.source
    assert_equal "multicolumn.csv", new_dataset.title
    # get features
    assert_equal 6, new_dataset.features.size
    assert_equal 7, new_dataset.compounds.size
    assert_equal ["1", nil, "false", nil, nil, 1.0], new_dataset.data_entries.last
    d.delete
  end

  def test_create_from_file
    d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
    assert_equal Dataset, d.class
    refute_nil d.warnings
    assert_match "EPAFHM.mini.csv", d.source
    assert_equal "EPAFHM.mini.csv", d.name
    d.delete
    #assert_equal false, URI.accessible?(d.uri)
  end

  def test_create_from_file_with_wrong_smiles_compound_entries
    d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv")
    refute_nil d.warnings
    assert_match(/2|3|4|5|6|7|8/, d.warnings.join)
    d.delete
  end

  def test_multicolumn_csv
    d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
    refute_nil d.warnings
    assert d.warnings.grep(/Duplicate compound/)
    assert d.warnings.grep(/3, 5/)
    assert_equal 6, d.features.size
    assert_equal 7, d.compounds.size
    assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size
    assert_equal [["1", "1", "true", "true", "test", 1.1], ["1", "2", "false", "7.5", "test", 0.24], ["1", "3", "true", "5", "test", 3578.239], ["0", "4", "false", "false", "test", -2.35], ["1", "2", "true", "4", "test_2", 1], ["1", "2", "false", "false", "test", -1.5], ["1", nil, "false", nil, nil, 1.0]], d.data_entries
    assert_equal "c1ccc[nH]1,1,,false,,,1.0", d.to_csv.split("\n")[7]
    # round-trip: exported CSV must agree with the original file
    csv = CSV.parse(d.to_csv)
    original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv")
    csv.shift
    original_csv.shift
    csv.each_with_index do |row,i|
      compound = Compound.from_smiles row.shift
      original_compound = Compound.from_smiles original_csv[i].shift
      assert_equal original_compound.inchi, compound.inchi
      row.each_with_index do |v,j|
        if v.numeric?
          assert_equal original_csv[i][j].strip.to_f, row[j].to_f
        else
          assert_equal original_csv[i][j].strip, row[j].to_s
        end
      end
    end
    d.delete
  end

  def test_from_csv
    d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    assert_equal Dataset, d.class
    assert_equal 1, d.features.size
    assert_equal 85, d.compounds.size
    assert_equal 85, d.data_entries.size
    csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
    csv.shift
    assert_equal csv.collect{|r| r[1]}, d.data_entries.flatten
    d.delete
    #assert_equal false, URI.accessible?(d.uri)
  end

  def test_from_csv_classification
    # boolean class labels may arrive as int, float or string columns
    ["int", "float", "string"].each do |mode|
      d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv"
      csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv")
      csv.shift
      entries = d.data_entries.flatten
      csv.each_with_index do |r, i|
        assert_equal r[1].to_s, entries[i]
      end
      d.delete
    end
  end

  def test_from_csv2
    File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") }
    dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv"
    assert_equal "Cannot parse SMILES compound ' ' at position 3, all entries are ignored.", dataset.warnings.join
    File.delete "#{DATA_DIR}/temp_test.csv"
    dataset.features.each{|f| feature = Feature.find f.id; feature.delete}
    dataset.delete
  end

  def test_same_feature
    datasets = []
    features = []
    2.times do |i|
      d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.mini.csv"
      features << d.features.first
      assert features[0].id==features[-1].id,"re-upload should find old feature, but created new one"
      datasets << d
    end
    datasets.each{|d| d.delete}
  end

  # NOTE(fix): this method was previously also named test_create_from_file,
  # which silently redefined (and disabled) the earlier test of the same name.
  # Renamed so that BOTH tests are executed by the runner.
  def test_create_from_file_warnings
    d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
    assert_equal Dataset, d.class
    refute_nil d.warnings
    assert_match(/row 13/, d.warnings.join)
    assert_match "EPAFHM.mini.csv", d.source
    assert_equal 1, d.features.size
    feature = d.features.first
    assert_kind_of NumericBioAssay, feature
    assert_equal 0.0113, d.data_entries[0][0]
    assert_equal 0.00323, d.data_entries[5][0]
    d2 = Dataset.find d.id
    assert_equal 0.0113, d2.data_entries[0][0]
    assert_equal 0.00323, d2.data_entries[5][0]
  end

end
|
199
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative "setup.rb"

# Long-running descriptor calculations over whole datasets.
class DescriptorLongTest < MiniTest::Test

  def test_dataset_all
    # TODO: improve CDK descriptor calculation speed or add timeout
    skip "CDK descriptor calculation takes too long for some compounds"
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR, "hamster_carcinogenicity.mini.csv")
    result = OpenTox::Algorithm::Descriptor.physchem dataset
    assert_equal dataset.compounds, result.compounds
    assert_equal 332, result.features.size
    assert_equal 332, result.data_entries.first.size
    result.delete
  end

  def test_dataset_openbabel
    # TODO: improve CDK descriptor calculation speed or add timeout
    dataset = Dataset.from_csv_file File.join(DATA_DIR, "hamster_carcinogenicity.mini.csv")
    result = Algorithm::Descriptor.physchem dataset, Algorithm::Descriptor::OBDESCRIPTORS.keys
    assert_equal dataset.compounds, result.compounds
    descriptor_count = Algorithm::Descriptor::OBDESCRIPTORS.keys.size
    assert_equal descriptor_count, result.features.size
    assert_equal descriptor_count, result.data_entries.first.size
    result.delete
  end

end
|
data/test/descriptor.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
require_relative "setup.rb"

# Physicochemical descriptor calculation (OpenBabel, CDK, JOELib) and
# SMARTS substructure matching on single compounds and datasets.
class DescriptorTest < MiniTest::Test

  def test_list
    # check the number of available descriptors per backend
    @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys
    assert_equal 111, @descriptors.size, "wrong num physchem descriptors"
    @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES
    assert_equal 356, @descriptor_values.size, "wrong num physchem descriptors"
    sum = 0
    [@descriptors, @descriptor_values].each do |list|
      expected = { "Openbabel" => 16, "Cdk" => (list == @descriptors ? 50 : 295), "Joelib" => 45 }
      expected.each do |prefix, count|
        assert_equal count, list.select { |name| name =~ /^#{prefix}\./ }.size, "wrong num #{prefix} descriptors"
        sum += count
      end
    end
    assert_equal (111 + 356), sum
  end

  def test_smarts
    compound = OpenTox::Compound.from_smiles "N=C=C1CCC(=F=FO)C1"
    File.open("tmp.png", "w+") { |file| file.puts compound.png }
    pattern = Smarts.find_or_create_by(:smarts => "F=F")
    result = OpenTox::Algorithm::Descriptor.smarts_match compound, pattern
    assert_equal [1], result
    patterns = ["CC", "C", "C=C", "CO", "F=F", "C1CCCC1", "NN"].collect { |sm| Smarts.find_or_create_by(:smarts => sm) }
    result = OpenTox::Algorithm::Descriptor.smarts_match compound, patterns
    assert_equal [1, 1, 1, 0, 1, 1, 0], result
    expected_counts = [10, 6, 2, 0, 2, 10, 0]
    result = OpenTox::Algorithm::Descriptor.smarts_count compound, patterns
    assert_equal expected_counts, result
  end

  def test_compound_openbabel_single
    compound = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    result = OpenTox::Algorithm::Descriptor.physchem compound, ["Openbabel.logP"]
    assert_equal 1.12518, result.first
  end

  def test_compound_cdk_single
    compound = OpenTox::Compound.from_smiles "c1ccccc1"
    result = OpenTox::Algorithm::Descriptor.physchem compound, ["Cdk.AtomCount"]
    assert_equal [12], result
    compound = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    result = OpenTox::Algorithm::Descriptor.physchem compound, ["Cdk.AtomCount"]
    assert_equal [17], result
    result = OpenTox::Algorithm::Descriptor.physchem compound, ["Cdk.CarbonTypes"]
    # reference mapping of CarbonTypes sub-descriptors to expected values
    c_types = {"Cdk.CarbonTypes.C1SP1"=>1, "Cdk.CarbonTypes.C2SP1"=>0, "Cdk.CarbonTypes.C1SP2"=>0, "Cdk.CarbonTypes.C2SP2"=>1, "Cdk.CarbonTypes.C3SP2"=>0, "Cdk.CarbonTypes.C1SP3"=>2, "Cdk.CarbonTypes.C2SP3"=>1, "Cdk.CarbonTypes.C3SP3"=>1, "Cdk.CarbonTypes.C4SP3"=>0}
    assert_equal [1, 0, 0, 1, 0, 2, 1, 1, 0], result
  end

  def test_compound_joelib_single
    compound = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    result = OpenTox::Algorithm::Descriptor.physchem compound, ["Joelib.LogP"]
    assert_equal [2.65908], result
  end

  def test_compound_all
    compound = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    result = OpenTox::Algorithm::Descriptor.physchem compound
    assert_equal 332, result.size
    assert_equal 30.8723, result[2]
    assert_equal 1.12518, result[328]
  end

  def test_compound_descriptor_parameters
    compound = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
    result = OpenTox::Algorithm::Descriptor.physchem compound, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ]#, true
    assert_equal 12, result.size
    assert_equal [1.12518, 17.0, 1, 0, 0, 1, 0, 2, 1, 1, 0, 2.65908], result#.last
  end

  def test_dataset_descriptor_parameters
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR, "hamster_carcinogenicity.mini.csv")
    result = OpenTox::Algorithm::Descriptor.physchem dataset, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ]
    assert_kind_of Dataset, result
    assert_equal dataset.compounds, result.compounds
    assert_equal dataset.compounds.size, result.data_entries.size
    assert_equal 12, result.data_entries.first.size
  end

end
|
data/test/error.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require_relative "setup.rb"

# Error-handling behavior: missing documents and OpenTox error helpers.
class ErrorTest < MiniTest::Test

  def test_bad_request
    # an unsaved feature must not be retrievable from the database
    unsaved = OpenTox::Feature.new
    assert_raises Mongoid::Errors::DocumentNotFound do
      OpenTox::Feature.find(unsaved.id)
    end
  end

  def test_error_methods
    assert_raises OpenTox::ResourceNotFoundError do
      resource_not_found_error "This is a test"
    end
  end

  def test_exception
    assert_raises Exception do
      raise Exception.new "Basic Exception"
    end
  end

end
|
data/test/feature.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
require_relative "setup.rb"

# Feature CRUD, de-duplication on re-creation, and SMARTS features.
class FeatureTest < MiniTest::Test

  def test_opentox_feature
    @feature = OpenTox::Feature.create(:name => "tost")
    assert_equal true, OpenTox::Feature.where(name: "tost").exists?, "#{@feature.id} is not accessible."
    assert_equal true, OpenTox::Feature.where(id: @feature.id).exists?, "#{@feature.id} is not accessible."

    list = OpenTox::Feature.all
    listsize1 = list.length
    assert_equal true, list.collect{|f| f.id}.include?(@feature.id)
    # modify feature
    @feature2 = OpenTox::Feature.find(@feature.id)
    assert_equal "tost", @feature2[:name]
    assert_equal "tost", @feature2.name
    assert_kind_of Feature, @feature2

    @feature2[:name] = "feature2"
    @feature2.save
    list = OpenTox::Feature.all
    listsize2 = list.length
    assert_match "feature2", @feature2.name
    refute_match "tost", @feature2.name
    # renaming must not change the total number of features
    assert_equal listsize1, listsize2

    id = @feature2.id
    @feature2.delete
    assert_raises Mongoid::Errors::DocumentNotFound do
      OpenTox::Feature.find(id)
    end
  end

  def test_duplicated_features
    metadata = {
      :name => "feature duplication test",
      :nominal => true,
      :description => "feature duplication test"
    }
    feature = NumericBioAssay.find_or_create_by metadata
    dup_feature = NumericBioAssay.find_or_create_by metadata
    assert_kind_of Feature, feature
    assert !feature.id.nil?, "No Feature ID in #{feature.inspect}"
    # fix: previously re-checked feature.id — the message already referred to dup_feature
    assert !dup_feature.id.nil?, "No Feature ID in #{dup_feature.inspect}"
    # find_or_create_by must return the SAME document for identical metadata
    assert_equal feature.id, dup_feature.id
    feature.delete
    assert_raises Mongoid::Errors::DocumentNotFound do
      OpenTox::Feature.find(feature.id)
    end
    assert_raises Mongoid::Errors::DocumentNotFound do
      OpenTox::Feature.find(dup_feature.id)
    end
  end

  def test_smarts_feature
    feature = Smarts.find_or_create_by(:smarts => "CN")
    # fix: `assert x, "CN"` passed the expected value as the failure MESSAGE
    # and only checked truthiness; these are equality assertions
    assert_equal "CN", feature.smarts
    assert_kind_of Smarts, feature
    feature.smarts = 'cc'
    assert_equal "cc", feature.smarts
    original = Feature.where(:smarts => 'CN').first
    assert_equal "CN", original.smarts
  end

end
|
data/test/fminer-long.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require_relative "setup.rb"

# Long-running fminer (BBRC) feature mining tests on large datasets.
class FminerTest < MiniTest::Test

  def test_fminer_multicell
    #skip "multicell segfaults"
    # TODO aborts, probably fminer
    # or OpenBabel segfault
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv")
    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
    p feature_dataset.training_parameters
    assert_equal dataset.compound_ids, feature_dataset.compound_ids
    dataset.delete
    feature_dataset.delete
  end

  def test_fminer_isscan
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
    assert_equal feature_dataset.compounds.size, dataset.compounds.size
    p feature_dataset.features.size
    p feature_dataset.training_parameters
    dataset.delete
    feature_dataset.delete
  end

  def test_fminer_kazius
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
    # TODO reactivate default settings
    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
    assert_equal feature_dataset.compounds.size, dataset.compounds.size
    feature_dataset = Dataset.find feature_dataset.id
    # fix: was `assert a, b`, which only asserts truthiness of a and uses b as
    # the failure message — an equality check was clearly intended
    assert_equal dataset.compounds.size, feature_dataset.data_entries.size
    dataset.delete
    feature_dataset.delete
  end

end
|
data/test/fminer.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative "setup.rb"

# fminer BBRC/LAST feature mining on the hamster carcinogenicity dataset.
class FminerTest < MiniTest::Test

  def test_fminer_bbrc
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
    refute_nil dataset.id
    feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset
    feature_dataset = Dataset.find feature_dataset.id
    assert_equal dataset.compounds.size, feature_dataset.compounds.size
    # TODO: fminer calculates 62 instead of 54 features
    # it is unclear which commit changed the numbers (occurs with old libraries/mongodb branch too
    # modification of Compound to use smiles instead of inchis seems to have no effect
    #assert_equal 54, feature_dataset.features.size
    #assert_equal "C-C-C=C", feature_dataset.features.first.smarts
    compounds = feature_dataset.compounds
    smarts_features = feature_dataset.features
    # every mined substructure must be statistically significant
    smarts_features.each do |feature|
      assert feature.p_value.round(2) >= 0.95
    end
    matches = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts_features
    feature_dataset.data_entries.each_with_index do |fingerprint, i|
      assert_equal matches[i], fingerprint
    end

    dataset.delete
    feature_dataset.delete
  end

  def test_fminer_last
    skip "last features have to be activated"
    dataset = OpenTox::Dataset.new
    dataset.upload File.join(DATA_DIR,"hamster_carcinogenicity.csv")
    feature_dataset = OpenTox::Algorithm::Fminer.last :dataset => dataset
    assert_equal dataset.compounds.size, feature_dataset.compounds.size
    assert_equal 21, feature_dataset.features.size
    assert_equal '[#6&A]-[#6&a]:[#6&a]:[#6&a]:[#6&a]:[#6&a]', feature_dataset.features.first.smarts

    compounds = feature_dataset.compounds
    smarts_list = feature_dataset.features.collect { |f| f.smarts }
    matches = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts_list
    compounds.each_with_index do |compound, i|
      smarts_list.each_with_index do |pattern, j|
        assert_equal matches[i][j], feature_dataset.data_entries[i][j].to_i
      end
    end

    dataset.delete
    feature_dataset.delete
  end

end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require_relative "setup.rb"

# End-to-end test of the lazar/fminer classification model:
# training, single-compound predictions and dataset predictions.
class LazarFminerTest < MiniTest::Test

  def test_lazar_fminer
    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
    model = Model::LazarFminerClassification.create training_dataset#, feature_dataset
    feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
    assert_equal training_dataset.compounds.size, feature_dataset.compounds.size
    #TODO check fminer features, see fminer.rb
    #assert_equal 54, feature_dataset.features.size
    # every fingerprint row must have one entry per mined feature
    feature_dataset.data_entries.each do |entry|
      assert_equal entry.size, feature_dataset.features.size
    end
    #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts

    expectations = [
      {
        :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
        :prediction => "false",
        :confidence => 0.25281385281385277,
        :nr_neighbors => 11
      },
      {
        :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
        :prediction => "false",
        :confidence => 0.3639589577089577,
        :nr_neighbors => 14
      },
      {
        :compound => Compound.from_smiles('OCCCCCCCC\C=C/CCCCCCCC'),
        :prediction => "false",
        :confidence => 0.5555555555555556,
        :nr_neighbors => 1
      }
    ]
    expectations.each do |example|
      prediction = model.predict example[:compound]
      assert_equal example[:prediction], prediction[:value]
      #assert_equal example[:confidence], prediction[:confidence]
      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
    end

    # make a dataset prediction
    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
    prediction = model.predict compound_dataset
    assert_equal compound_dataset.compounds, prediction.compounds

    # NOTE(review): "Cound" matches the (presumably misspelled) message emitted
    # by the library — do not "correct" the expected string here
    assert_equal "Cound not find similar compounds.", prediction.data_entries[7][2]
    assert_equal "measured", prediction.data_entries[14][1]
    # cleanup
    [training_dataset, model, feature_dataset, compound_dataset].each { |object| object.delete }
  end
end
|