lazar 0.9.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
data/lib/dataset.rb
CHANGED
@@ -5,46 +5,49 @@ module OpenTox
|
|
5
5
|
|
6
6
|
class Dataset
|
7
7
|
|
8
|
-
|
9
|
-
field :feature_ids, type: Array, default: []
|
10
|
-
field :compound_ids, type: Array, default: []
|
11
|
-
field :data_entries, type: Array, default: []
|
12
|
-
field :source, type: String
|
8
|
+
field :data_entries, type: Hash, default: {}
|
13
9
|
|
14
10
|
# Readers
|
15
11
|
|
16
|
-
# Get all compounds
|
17
12
|
def compounds
|
18
|
-
|
19
|
-
|
13
|
+
substances.select{|s| s.is_a? Compound}
|
14
|
+
end
|
15
|
+
|
16
|
+
def nanoparticles
|
17
|
+
substances.select{|s| s.is_a? Nanoparticle}
|
18
|
+
end
|
19
|
+
|
20
|
+
# Get all substances
|
21
|
+
def substances
|
22
|
+
@substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq
|
23
|
+
@substances
|
20
24
|
end
|
21
25
|
|
22
26
|
# Get all features
|
23
27
|
def features
|
24
|
-
@features ||=
|
28
|
+
@features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq
|
25
29
|
@features
|
26
30
|
end
|
27
31
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
32
|
+
def values substance,feature
|
33
|
+
substance = substance.id if substance.is_a? Substance
|
34
|
+
feature = feature.id if feature.is_a? Feature
|
35
|
+
if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
|
36
|
+
data_entries[substance.to_s][feature.to_s]
|
37
|
+
else
|
38
|
+
nil
|
39
|
+
end
|
36
40
|
end
|
37
41
|
|
38
42
|
# Writers
|
39
43
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
self.feature_ids = features.collect{|f| f.id}
|
44
|
+
def add(substance,feature,value)
|
45
|
+
substance = substance.id if substance.is_a? Substance
|
46
|
+
feature = feature.id if feature.is_a? Feature
|
47
|
+
data_entries[substance.to_s] ||= {}
|
48
|
+
data_entries[substance.to_s][feature.to_s] ||= []
|
49
|
+
data_entries[substance.to_s][feature.to_s] << value
|
50
|
+
#data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source
|
48
51
|
end
|
49
52
|
|
50
53
|
# Dataset operations
|
@@ -53,13 +56,7 @@ module OpenTox
|
|
53
56
|
# @param [Integer] number of folds
|
54
57
|
# @return [Array] Array with folds [training_dataset,test_dataset]
|
55
58
|
def folds n
|
56
|
-
|
57
|
-
compound_ids.each_with_index do |cid,i|
|
58
|
-
unique_compound_data[cid] ||= []
|
59
|
-
unique_compound_data[cid] << data_entries[i]
|
60
|
-
end
|
61
|
-
unique_compound_ids = unique_compound_data.keys
|
62
|
-
len = unique_compound_ids.size
|
59
|
+
len = self.substances.size
|
63
60
|
indices = (0..len-1).to_a.shuffle
|
64
61
|
mid = (len/n)
|
65
62
|
chunks = []
|
@@ -68,22 +65,16 @@ module OpenTox
|
|
68
65
|
last = start+mid
|
69
66
|
last = last-1 unless len%n >= i
|
70
67
|
test_idxs = indices[start..last] || []
|
71
|
-
|
68
|
+
test_substances = test_idxs.collect{|i| substances[i]}
|
72
69
|
training_idxs = indices-test_idxs
|
73
|
-
|
74
|
-
chunk = [
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
end
|
83
|
-
dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
|
84
|
-
dataset.compounds.each do |compound|
|
85
|
-
compound.dataset_ids << dataset.id
|
86
|
-
compound.save
|
70
|
+
training_substances = training_idxs.collect{|i| substances[i]}
|
71
|
+
chunk = [training_substances,test_substances].collect do |substances|
|
72
|
+
dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id )
|
73
|
+
substances.each do |substance|
|
74
|
+
substance.dataset_ids << dataset.id
|
75
|
+
substance.dataset_ids.uniq!
|
76
|
+
substance.save
|
77
|
+
dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
|
87
78
|
end
|
88
79
|
dataset.save
|
89
80
|
dataset
|
@@ -94,41 +85,37 @@ module OpenTox
|
|
94
85
|
chunks
|
95
86
|
end
|
96
87
|
|
97
|
-
# Diagnostics
|
98
|
-
|
99
|
-
def duplicates feature=self.features.first
|
100
|
-
col = feature_ids.index feature.id
|
101
|
-
dups = {}
|
102
|
-
compound_ids.each_with_index do |cid,i|
|
103
|
-
rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
|
104
|
-
values = rows.collect{|row| data_entries[row][col]}
|
105
|
-
dups[cid] = values if values.size > 1
|
106
|
-
end
|
107
|
-
dups
|
108
|
-
end
|
109
|
-
|
110
|
-
def correlation_plot training_dataset
|
111
|
-
# TODO: create/store svg
|
112
|
-
R.assign "features", data_entries
|
113
|
-
R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
|
114
|
-
R.eval "featurePlot(features,activities)"
|
115
|
-
end
|
116
|
-
|
117
|
-
def density_plot
|
118
|
-
# TODO: create/store svg
|
119
|
-
R.assign "acts", data_entries.collect{|r| r.first }#.compact
|
120
|
-
R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
|
121
|
-
end
|
122
|
-
|
123
88
|
# Serialisation
|
124
89
|
|
125
90
|
# converts dataset to csv format including compound smiles as first column, other column headers are feature names
|
126
91
|
# @return [String]
|
127
92
|
def to_csv(inchi=false)
|
128
|
-
CSV.generate() do |csv|
|
129
|
-
|
130
|
-
|
131
|
-
csv << [inchi ?
|
93
|
+
CSV.generate() do |csv|
|
94
|
+
compound = substances.first.is_a? Compound
|
95
|
+
if compound
|
96
|
+
csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
|
97
|
+
else
|
98
|
+
csv << ["Name"] + features.collect{|f| f.name}
|
99
|
+
end
|
100
|
+
substances.each do |substance|
|
101
|
+
if compound
|
102
|
+
name = (inchi ? substance.inchi : substance.smiles)
|
103
|
+
else
|
104
|
+
name = substance.name
|
105
|
+
end
|
106
|
+
nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq
|
107
|
+
|
108
|
+
if nr_measurements.size > 1
|
109
|
+
warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
|
110
|
+
else
|
111
|
+
(0..nr_measurements.first-1).each do |i|
|
112
|
+
row = [name]
|
113
|
+
features.each do |f|
|
114
|
+
values(substance,f) ? row << values(substance,f)[i] : row << ""
|
115
|
+
end
|
116
|
+
csv << row
|
117
|
+
end
|
118
|
+
end
|
132
119
|
end
|
133
120
|
end
|
134
121
|
end
|
@@ -143,9 +130,8 @@ module OpenTox
|
|
143
130
|
#end
|
144
131
|
|
145
132
|
# Create a dataset from CSV file
|
146
|
-
|
147
|
-
|
148
|
-
source ||= file
|
133
|
+
def self.from_csv_file file, accept_empty_values=false
|
134
|
+
source = file
|
149
135
|
name = File.basename(file,".*")
|
150
136
|
dataset = self.find_by(:source => source, :name => name)
|
151
137
|
if dataset
|
@@ -154,171 +140,116 @@ module OpenTox
|
|
154
140
|
$logger.debug "Parsing #{file}."
|
155
141
|
table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
|
156
142
|
dataset = self.new(:source => source, :name => name)
|
157
|
-
dataset.parse_table table,
|
143
|
+
dataset.parse_table table, accept_empty_values
|
158
144
|
end
|
159
145
|
dataset
|
160
146
|
end
|
161
147
|
|
162
148
|
# parse data in tabular format (e.g. from csv)
|
163
149
|
# does a lot of guesswork in order to determine feature types
|
164
|
-
def parse_table table,
|
165
|
-
|
166
|
-
time = Time.now
|
150
|
+
def parse_table table, accept_empty_values
|
167
151
|
|
168
152
|
# features
|
169
153
|
feature_names = table.shift.collect{|f| f.strip}
|
170
|
-
warnings << "
|
154
|
+
warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
|
171
155
|
compound_format = feature_names.shift.strip
|
172
156
|
bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
|
173
|
-
|
174
157
|
numeric = []
|
158
|
+
features = []
|
175
159
|
# guess feature types
|
176
160
|
feature_names.each_with_index do |f,i|
|
177
161
|
metadata = {:name => f}
|
178
162
|
values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
|
179
163
|
types = values.collect{|v| v.numeric? ? true : false}.uniq
|
164
|
+
feature = nil
|
180
165
|
if values.size == 0 # empty feature
|
181
166
|
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
|
182
|
-
metadata["numeric"] = true
|
183
167
|
numeric[i] = true
|
168
|
+
feature = NumericFeature.find_or_create_by(metadata)
|
184
169
|
else
|
185
|
-
metadata["nominal"] = true
|
186
170
|
metadata["accept_values"] = values
|
187
171
|
numeric[i] = false
|
172
|
+
feature = NominalFeature.find_or_create_by(metadata)
|
188
173
|
end
|
189
|
-
if
|
190
|
-
if metadata["numeric"]
|
191
|
-
feature = NumericBioAssay.find_or_create_by(metadata)
|
192
|
-
elsif metadata["nominal"]
|
193
|
-
feature = NominalBioAssay.find_or_create_by(metadata)
|
194
|
-
end
|
195
|
-
else
|
196
|
-
metadata.merge({:measured => false, :calculated => true})
|
197
|
-
if metadata["numeric"]
|
198
|
-
feature = NumericFeature.find_or_create_by(metadata)
|
199
|
-
elsif metadata["nominal"]
|
200
|
-
feature = NominalFeature.find_or_create_by(metadata)
|
201
|
-
end
|
202
|
-
end
|
203
|
-
feature_ids << feature.id if feature
|
174
|
+
features << feature if feature
|
204
175
|
end
|
205
176
|
|
206
|
-
|
207
|
-
time = Time.now
|
208
|
-
|
209
|
-
r = -1
|
210
|
-
compound_time = 0
|
211
|
-
value_time = 0
|
212
|
-
|
213
|
-
# compounds and values
|
214
|
-
self.data_entries = []
|
177
|
+
# substances and values
|
215
178
|
|
179
|
+
all_substances = []
|
216
180
|
table.each_with_index do |vals,i|
|
217
|
-
ct = Time.now
|
218
181
|
identifier = vals.shift.strip
|
219
|
-
|
182
|
+
warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values
|
220
183
|
begin
|
221
184
|
case compound_format
|
222
185
|
when /SMILES/i
|
223
|
-
|
186
|
+
substance = OpenTox::Compound.from_smiles(identifier)
|
224
187
|
when /InChI/i
|
225
|
-
|
188
|
+
substance = OpenTox::Compound.from_inchi(identifier)
|
226
189
|
end
|
227
190
|
rescue
|
228
|
-
|
191
|
+
substance = nil
|
229
192
|
end
|
230
|
-
if
|
231
|
-
# compound
|
232
|
-
warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
|
193
|
+
if substance.nil? # compound parsers may return nil
|
194
|
+
warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
|
233
195
|
next
|
234
196
|
end
|
235
|
-
|
236
|
-
|
197
|
+
all_substances << substance
|
198
|
+
substance.dataset_ids << self.id
|
199
|
+
substance.dataset_ids.uniq!
|
200
|
+
substance.save
|
237
201
|
|
238
|
-
|
239
|
-
|
240
|
-
warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
|
202
|
+
unless vals.size == features.size
|
203
|
+
warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
|
241
204
|
next
|
242
205
|
end
|
243
206
|
|
244
|
-
compound_ids << compound.id
|
245
|
-
table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
|
246
|
-
|
247
207
|
vals.each_with_index do |v,j|
|
248
208
|
if v.blank?
|
249
|
-
|
209
|
+
warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'."
|
250
210
|
next
|
251
211
|
elsif numeric[j]
|
252
212
|
v = v.to_f
|
253
213
|
else
|
254
214
|
v = v.strip
|
255
215
|
end
|
256
|
-
|
257
|
-
#i = compound.feature_ids.index feature_ids[j]
|
258
|
-
compound.features[feature_ids[j].to_s] ||= []
|
259
|
-
compound.features[feature_ids[j].to_s] << v
|
260
|
-
compound.save
|
216
|
+
add substance, features[j], v
|
261
217
|
end
|
218
|
+
data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values
|
262
219
|
end
|
263
|
-
|
220
|
+
all_substances.duplicates.each do |substance|
|
264
221
|
positions = []
|
265
|
-
|
266
|
-
|
222
|
+
all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
|
223
|
+
warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
|
267
224
|
end
|
268
|
-
|
269
|
-
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
|
270
|
-
time = Time.now
|
271
225
|
save
|
272
|
-
$logger.debug "Saving: #{Time.now-time}"
|
273
|
-
|
274
226
|
end
|
275
227
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
(0 .. compound_ids.size-1).each do |i|
|
280
|
-
data_entries[i] ||= []
|
281
|
-
(0 .. feature_ids.size-1).each do |j|
|
282
|
-
data_entries[i][j] ||= n
|
283
|
-
end
|
284
|
-
end
|
228
|
+
def delete
|
229
|
+
compounds.each{|c| c.dataset_ids.delete id.to_s}
|
230
|
+
super
|
285
231
|
end
|
286
232
|
|
287
233
|
end
|
288
234
|
|
289
235
|
# Dataset for lazar predictions
|
290
|
-
class LazarPrediction
|
236
|
+
class LazarPrediction #< Dataset
|
291
237
|
field :creator, type: String
|
292
|
-
field :prediction_feature_id, type:
|
238
|
+
field :prediction_feature_id, type: BSON::ObjectId
|
239
|
+
field :predictions, type: Hash, default: {}
|
293
240
|
|
294
241
|
def prediction_feature
|
295
242
|
Feature.find prediction_feature_id
|
296
243
|
end
|
297
244
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
class DescriptorDataset < Dataset
|
302
|
-
field :feature_calculation_algorithm, type: String
|
303
|
-
|
304
|
-
end
|
305
|
-
|
306
|
-
class ScaledDataset < DescriptorDataset
|
307
|
-
|
308
|
-
field :centers, type: Array, default: []
|
309
|
-
field :scales, type: Array, default: []
|
245
|
+
def compounds
|
246
|
+
substances.select{|s| s.is_a? Compound}
|
247
|
+
end
|
310
248
|
|
311
|
-
def
|
312
|
-
|
249
|
+
def substances
|
250
|
+
predictions.keys.collect{|id| Substance.find id}
|
313
251
|
end
|
314
|
-
end
|
315
252
|
|
316
|
-
# Dataset for fminer descriptors
|
317
|
-
class FminerDataset < DescriptorDataset
|
318
|
-
field :training_algorithm, type: String
|
319
|
-
field :training_dataset_id, type: BSON::ObjectId
|
320
|
-
field :training_feature_id, type: BSON::ObjectId
|
321
|
-
field :training_parameters, type: Hash
|
322
253
|
end
|
323
254
|
|
324
255
|
end
|
data/lib/feature.rb
CHANGED
@@ -2,27 +2,28 @@ module OpenTox
|
|
2
2
|
|
3
3
|
# Basic feature class
|
4
4
|
class Feature
|
5
|
-
field :nominal, type: Boolean
|
6
|
-
field :numeric, type: Boolean
|
7
5
|
field :measured, type: Boolean
|
8
6
|
field :calculated, type: Boolean
|
7
|
+
field :category, type: String
|
8
|
+
field :unit, type: String
|
9
|
+
field :conditions, type: Hash
|
10
|
+
|
11
|
+
def nominal?
|
12
|
+
self.class == NominalFeature
|
13
|
+
end
|
14
|
+
|
15
|
+
def numeric?
|
16
|
+
self.class == NumericFeature
|
17
|
+
end
|
9
18
|
end
|
10
19
|
|
11
20
|
# Feature for categorical variables
|
12
21
|
class NominalFeature < Feature
|
13
22
|
field :accept_values, type: Array
|
14
|
-
def initialize params
|
15
|
-
super params
|
16
|
-
nominal = true
|
17
|
-
end
|
18
23
|
end
|
19
24
|
|
20
25
|
# Feature for quantitative variables
|
21
26
|
class NumericFeature < Feature
|
22
|
-
def initialize params
|
23
|
-
super params
|
24
|
-
numeric = true
|
25
|
-
end
|
26
27
|
end
|
27
28
|
|
28
29
|
# Feature for SMARTS fragments
|
@@ -34,12 +35,4 @@ module OpenTox
|
|
34
35
|
end
|
35
36
|
end
|
36
37
|
|
37
|
-
# Feature for categorical bioassay results
|
38
|
-
class NominalBioAssay < NominalFeature
|
39
|
-
end
|
40
|
-
|
41
|
-
# Feature for quantitative bioassay results
|
42
|
-
class NumericBioAssay < NumericFeature
|
43
|
-
end
|
44
|
-
|
45
38
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module OpenTox
|
2
|
+
module Algorithm
|
3
|
+
|
4
|
+
class FeatureSelection
|
5
|
+
|
6
|
+
def self.correlation_filter model
|
7
|
+
relevant_features = {}
|
8
|
+
R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
|
9
|
+
model.descriptor_weights = []
|
10
|
+
selected_variables = []
|
11
|
+
selected_descriptor_ids = []
|
12
|
+
model.independent_variables.each_with_index do |v,i|
|
13
|
+
v.collect!{|n| to_r(n)}
|
14
|
+
R.assign "independent", v
|
15
|
+
begin
|
16
|
+
R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')"
|
17
|
+
pvalue = R.eval("cor$p.value").to_ruby
|
18
|
+
if pvalue <= 0.05
|
19
|
+
model.descriptor_weights << R.eval("cor$estimate").to_ruby**2
|
20
|
+
selected_variables << v
|
21
|
+
selected_descriptor_ids << model.descriptor_ids[i]
|
22
|
+
end
|
23
|
+
rescue
|
24
|
+
warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
model.independent_variables = selected_variables
|
29
|
+
model.descriptor_ids = selected_descriptor_ids
|
30
|
+
model
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.to_r v
|
34
|
+
return 0 if v == false
|
35
|
+
return 1 if v == true
|
36
|
+
v
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
end
|
data/lib/import.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
module Import
|
4
|
+
|
5
|
+
class Enanomapper
|
6
|
+
include OpenTox
|
7
|
+
|
8
|
+
# time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
|
9
|
+
def self.import
|
10
|
+
datasets = {}
|
11
|
+
bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
|
12
|
+
bundles.each do |bundle|
|
13
|
+
datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip)
|
14
|
+
$logger.debug bundle["title"].strip
|
15
|
+
nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
|
16
|
+
nanoparticles.each_with_index do |np,n|
|
17
|
+
core_id = nil
|
18
|
+
coating_ids = []
|
19
|
+
np["composition"].each do |c|
|
20
|
+
uri = c["component"]["compound"]["URI"]
|
21
|
+
uri = CGI.escape File.join(uri,"&media=application/json")
|
22
|
+
data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
|
23
|
+
smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
|
24
|
+
names = []
|
25
|
+
names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
|
26
|
+
names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
|
27
|
+
if smiles
|
28
|
+
compound = Compound.find_or_create_by(:smiles => smiles)
|
29
|
+
compound.name = names.first
|
30
|
+
compound.names = names.compact
|
31
|
+
else
|
32
|
+
compound = Compound.find_or_create_by(:name => names.first,:names => names.compact)
|
33
|
+
end
|
34
|
+
compound.save
|
35
|
+
if c["relation"] == "HAS_CORE"
|
36
|
+
core_id = compound.id.to_s
|
37
|
+
elsif c["relation"] == "HAS_COATING"
|
38
|
+
coating_ids << compound.id.to_s
|
39
|
+
end
|
40
|
+
end if np["composition"]
|
41
|
+
nanoparticle = Nanoparticle.find_or_create_by(
|
42
|
+
:name => np["values"]["https://data.enanomapper.net/identifier/name"],
|
43
|
+
:source => np["compound"]["URI"],
|
44
|
+
:core_id => core_id,
|
45
|
+
:coating_ids => coating_ids
|
46
|
+
)
|
47
|
+
np["bundles"].keys.each do |bundle_uri|
|
48
|
+
nanoparticle.dataset_ids << datasets[bundle_uri].id
|
49
|
+
end
|
50
|
+
|
51
|
+
studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
|
52
|
+
studies.each do |study|
|
53
|
+
dataset = datasets[np["bundles"].keys.first]
|
54
|
+
proteomics_features = {}
|
55
|
+
category = study["protocol"]["topcategory"]
|
56
|
+
source = study["protocol"]["category"]["term"]
|
57
|
+
study["effects"].each do |effect|
|
58
|
+
|
59
|
+
effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
|
60
|
+
effect["conditions"].delete_if { |k, v| v.nil? }
|
61
|
+
|
62
|
+
if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
|
63
|
+
|
64
|
+
JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
|
65
|
+
proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
|
66
|
+
nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
|
67
|
+
end
|
68
|
+
else
|
69
|
+
name = effect["endpoint"]
|
70
|
+
unit = effect["result"]["unit"]
|
71
|
+
warnings = []
|
72
|
+
case name
|
73
|
+
when "Log2 transformed" # use a sensible name
|
74
|
+
name = "log2(Net cell association)"
|
75
|
+
warnings = ["Original name was 'Log2 transformed'"]
|
76
|
+
unit = "log2(mL/ug(Mg))"
|
77
|
+
when "Total protein (BCA assay)"
|
78
|
+
category = "P-CHEM"
|
79
|
+
warnings = ["Category changed from TOX to P-CHEM"]
|
80
|
+
end
|
81
|
+
feature = klass.find_or_create_by(
|
82
|
+
:name => name,
|
83
|
+
:unit => unit,
|
84
|
+
:category => category,
|
85
|
+
:conditions => effect["conditions"],
|
86
|
+
:source => study["protocol"]["category"]["term"],
|
87
|
+
:measured => true,
|
88
|
+
:warnings => warnings
|
89
|
+
)
|
90
|
+
nanoparticle.parse_ambit_value feature, effect["result"], dataset
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
nanoparticle.save
|
95
|
+
print "#{n}, "
|
96
|
+
end
|
97
|
+
puts
|
98
|
+
end
|
99
|
+
datasets.each { |u,d| d.save }
|
100
|
+
end
|
101
|
+
|
102
|
+
=begin
|
103
|
+
def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
|
104
|
+
#get list of bundle URIs
|
105
|
+
bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
|
106
|
+
datasets = []
|
107
|
+
bundles.each do |bundle|
|
108
|
+
uri = bundle["URI"]
|
109
|
+
study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
|
110
|
+
study["@graph"].each do |i|
|
111
|
+
puts i.to_yaml if i.keys.include? "sio:has-value"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
datasets.collect{|d| d.id}
|
115
|
+
end
|
116
|
+
=end
|
117
|
+
|
118
|
+
end
|
119
|
+
|
120
|
+
end
|
121
|
+
|
122
|
+
end
|
data/lib/lazar.rb
CHANGED
@@ -48,6 +48,7 @@ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
|
|
48
48
|
R = Rserve::Connection.new
|
49
49
|
R.eval "
|
50
50
|
suppressPackageStartupMessages({
|
51
|
+
library(labeling,lib=\"#{rlib}\")
|
51
52
|
library(iterators,lib=\"#{rlib}\")
|
52
53
|
library(foreach,lib=\"#{rlib}\")
|
53
54
|
library(ggplot2,lib=\"#{rlib}\")
|
@@ -56,12 +57,14 @@ suppressPackageStartupMessages({
|
|
56
57
|
library(pls,lib=\"#{rlib}\")
|
57
58
|
library(caret,lib=\"#{rlib}\")
|
58
59
|
library(doMC,lib=\"#{rlib}\")
|
60
|
+
library(randomForest,lib=\"#{rlib}\")
|
61
|
+
library(plyr,lib=\"#{rlib}\")
|
59
62
|
registerDoMC(#{NR_CORES})
|
60
63
|
})
|
61
64
|
"
|
62
65
|
|
63
66
|
# OpenTox classes and includes
|
64
|
-
CLASSES = ["Feature","
|
67
|
+
CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
|
65
68
|
|
66
69
|
[ # be aware of the require sequence as it affects class/method overwrites
|
67
70
|
"overwrite.rb",
|
@@ -70,15 +73,22 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
|
|
70
73
|
"opentox.rb",
|
71
74
|
"feature.rb",
|
72
75
|
"physchem.rb",
|
76
|
+
"substance.rb",
|
73
77
|
"compound.rb",
|
78
|
+
"nanoparticle.rb",
|
74
79
|
"dataset.rb",
|
75
80
|
"algorithm.rb",
|
81
|
+
"similarity.rb",
|
82
|
+
"feature_selection.rb",
|
76
83
|
"model.rb",
|
77
84
|
"classification.rb",
|
78
85
|
"regression.rb",
|
86
|
+
"caret.rb",
|
87
|
+
"validation-statistics.rb",
|
79
88
|
"validation.rb",
|
80
|
-
"
|
89
|
+
"train-test-validation.rb",
|
81
90
|
"leave-one-out-validation.rb",
|
82
|
-
"
|
91
|
+
"crossvalidation.rb",
|
92
|
+
#"experiment.rb",
|
93
|
+
"import.rb",
|
83
94
|
].each{ |f| require_relative f }
|
84
|
-
OpenTox::PhysChem.descriptors # load descriptor features
|