lazar 0.9.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
data/lib/dataset.rb
CHANGED
@@ -5,46 +5,49 @@ module OpenTox
|
|
5
5
|
|
6
6
|
class Dataset
|
7
7
|
|
8
|
-
|
9
|
-
field :feature_ids, type: Array, default: []
|
10
|
-
field :compound_ids, type: Array, default: []
|
11
|
-
field :data_entries, type: Array, default: []
|
12
|
-
field :source, type: String
|
8
|
+
field :data_entries, type: Hash, default: {}
|
13
9
|
|
14
10
|
# Readers
|
15
11
|
|
16
|
-
# Get all compounds
|
17
12
|
def compounds
|
18
|
-
|
19
|
-
|
13
|
+
substances.select{|s| s.is_a? Compound}
|
14
|
+
end
|
15
|
+
|
16
|
+
def nanoparticles
|
17
|
+
substances.select{|s| s.is_a? Nanoparticle}
|
18
|
+
end
|
19
|
+
|
20
|
+
# Get all substances
|
21
|
+
def substances
|
22
|
+
@substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq
|
23
|
+
@substances
|
20
24
|
end
|
21
25
|
|
22
26
|
# Get all features
|
23
27
|
def features
|
24
|
-
@features ||=
|
28
|
+
@features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq
|
25
29
|
@features
|
26
30
|
end
|
27
31
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
32
|
+
def values substance,feature
|
33
|
+
substance = substance.id if substance.is_a? Substance
|
34
|
+
feature = feature.id if feature.is_a? Feature
|
35
|
+
if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
|
36
|
+
data_entries[substance.to_s][feature.to_s]
|
37
|
+
else
|
38
|
+
nil
|
39
|
+
end
|
36
40
|
end
|
37
41
|
|
38
42
|
# Writers
|
39
43
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
self.feature_ids = features.collect{|f| f.id}
|
44
|
+
def add(substance,feature,value)
|
45
|
+
substance = substance.id if substance.is_a? Substance
|
46
|
+
feature = feature.id if feature.is_a? Feature
|
47
|
+
data_entries[substance.to_s] ||= {}
|
48
|
+
data_entries[substance.to_s][feature.to_s] ||= []
|
49
|
+
data_entries[substance.to_s][feature.to_s] << value
|
50
|
+
#data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source
|
48
51
|
end
|
49
52
|
|
50
53
|
# Dataset operations
|
@@ -53,13 +56,7 @@ module OpenTox
|
|
53
56
|
# @param [Integer] number of folds
|
54
57
|
# @return [Array] Array with folds [training_dataset,test_dataset]
|
55
58
|
def folds n
|
56
|
-
|
57
|
-
compound_ids.each_with_index do |cid,i|
|
58
|
-
unique_compound_data[cid] ||= []
|
59
|
-
unique_compound_data[cid] << data_entries[i]
|
60
|
-
end
|
61
|
-
unique_compound_ids = unique_compound_data.keys
|
62
|
-
len = unique_compound_ids.size
|
59
|
+
len = self.substances.size
|
63
60
|
indices = (0..len-1).to_a.shuffle
|
64
61
|
mid = (len/n)
|
65
62
|
chunks = []
|
@@ -68,22 +65,16 @@ module OpenTox
|
|
68
65
|
last = start+mid
|
69
66
|
last = last-1 unless len%n >= i
|
70
67
|
test_idxs = indices[start..last] || []
|
71
|
-
|
68
|
+
test_substances = test_idxs.collect{|i| substances[i]}
|
72
69
|
training_idxs = indices-test_idxs
|
73
|
-
|
74
|
-
chunk = [
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
end
|
83
|
-
dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
|
84
|
-
dataset.compounds.each do |compound|
|
85
|
-
compound.dataset_ids << dataset.id
|
86
|
-
compound.save
|
70
|
+
training_substances = training_idxs.collect{|i| substances[i]}
|
71
|
+
chunk = [training_substances,test_substances].collect do |substances|
|
72
|
+
dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id )
|
73
|
+
substances.each do |substance|
|
74
|
+
substance.dataset_ids << dataset.id
|
75
|
+
substance.dataset_ids.uniq!
|
76
|
+
substance.save
|
77
|
+
dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
|
87
78
|
end
|
88
79
|
dataset.save
|
89
80
|
dataset
|
@@ -94,41 +85,37 @@ module OpenTox
|
|
94
85
|
chunks
|
95
86
|
end
|
96
87
|
|
97
|
-
# Diagnostics
|
98
|
-
|
99
|
-
def duplicates feature=self.features.first
|
100
|
-
col = feature_ids.index feature.id
|
101
|
-
dups = {}
|
102
|
-
compound_ids.each_with_index do |cid,i|
|
103
|
-
rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
|
104
|
-
values = rows.collect{|row| data_entries[row][col]}
|
105
|
-
dups[cid] = values if values.size > 1
|
106
|
-
end
|
107
|
-
dups
|
108
|
-
end
|
109
|
-
|
110
|
-
def correlation_plot training_dataset
|
111
|
-
# TODO: create/store svg
|
112
|
-
R.assign "features", data_entries
|
113
|
-
R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
|
114
|
-
R.eval "featurePlot(features,activities)"
|
115
|
-
end
|
116
|
-
|
117
|
-
def density_plot
|
118
|
-
# TODO: create/store svg
|
119
|
-
R.assign "acts", data_entries.collect{|r| r.first }#.compact
|
120
|
-
R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
|
121
|
-
end
|
122
|
-
|
123
88
|
# Serialisation
|
124
89
|
|
125
90
|
# converts dataset to csv format including compound smiles as first column, other column headers are feature names
|
126
91
|
# @return [String]
|
127
92
|
def to_csv(inchi=false)
|
128
|
-
CSV.generate() do |csv|
|
129
|
-
|
130
|
-
|
131
|
-
csv << [inchi ?
|
93
|
+
CSV.generate() do |csv|
|
94
|
+
compound = substances.first.is_a? Compound
|
95
|
+
if compound
|
96
|
+
csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
|
97
|
+
else
|
98
|
+
csv << ["Name"] + features.collect{|f| f.name}
|
99
|
+
end
|
100
|
+
substances.each do |substance|
|
101
|
+
if compound
|
102
|
+
name = (inchi ? substance.inchi : substance.smiles)
|
103
|
+
else
|
104
|
+
name = substance.name
|
105
|
+
end
|
106
|
+
nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq
|
107
|
+
|
108
|
+
if nr_measurements.size > 1
|
109
|
+
warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
|
110
|
+
else
|
111
|
+
(0..nr_measurements.first-1).each do |i|
|
112
|
+
row = [name]
|
113
|
+
features.each do |f|
|
114
|
+
values(substance,f) ? row << values(substance,f)[i] : row << ""
|
115
|
+
end
|
116
|
+
csv << row
|
117
|
+
end
|
118
|
+
end
|
132
119
|
end
|
133
120
|
end
|
134
121
|
end
|
@@ -143,9 +130,8 @@ module OpenTox
|
|
143
130
|
#end
|
144
131
|
|
145
132
|
# Create a dataset from CSV file
|
146
|
-
|
147
|
-
|
148
|
-
source ||= file
|
133
|
+
def self.from_csv_file file, accept_empty_values=false
|
134
|
+
source = file
|
149
135
|
name = File.basename(file,".*")
|
150
136
|
dataset = self.find_by(:source => source, :name => name)
|
151
137
|
if dataset
|
@@ -154,171 +140,116 @@ module OpenTox
|
|
154
140
|
$logger.debug "Parsing #{file}."
|
155
141
|
table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
|
156
142
|
dataset = self.new(:source => source, :name => name)
|
157
|
-
dataset.parse_table table,
|
143
|
+
dataset.parse_table table, accept_empty_values
|
158
144
|
end
|
159
145
|
dataset
|
160
146
|
end
|
161
147
|
|
162
148
|
# parse data in tabular format (e.g. from csv)
|
163
149
|
# does a lot of guesswork in order to determine feature types
|
164
|
-
def parse_table table,
|
165
|
-
|
166
|
-
time = Time.now
|
150
|
+
def parse_table table, accept_empty_values
|
167
151
|
|
168
152
|
# features
|
169
153
|
feature_names = table.shift.collect{|f| f.strip}
|
170
|
-
warnings << "
|
154
|
+
warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
|
171
155
|
compound_format = feature_names.shift.strip
|
172
156
|
bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
|
173
|
-
|
174
157
|
numeric = []
|
158
|
+
features = []
|
175
159
|
# guess feature types
|
176
160
|
feature_names.each_with_index do |f,i|
|
177
161
|
metadata = {:name => f}
|
178
162
|
values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
|
179
163
|
types = values.collect{|v| v.numeric? ? true : false}.uniq
|
164
|
+
feature = nil
|
180
165
|
if values.size == 0 # empty feature
|
181
166
|
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
|
182
|
-
metadata["numeric"] = true
|
183
167
|
numeric[i] = true
|
168
|
+
feature = NumericFeature.find_or_create_by(metadata)
|
184
169
|
else
|
185
|
-
metadata["nominal"] = true
|
186
170
|
metadata["accept_values"] = values
|
187
171
|
numeric[i] = false
|
172
|
+
feature = NominalFeature.find_or_create_by(metadata)
|
188
173
|
end
|
189
|
-
if
|
190
|
-
if metadata["numeric"]
|
191
|
-
feature = NumericBioAssay.find_or_create_by(metadata)
|
192
|
-
elsif metadata["nominal"]
|
193
|
-
feature = NominalBioAssay.find_or_create_by(metadata)
|
194
|
-
end
|
195
|
-
else
|
196
|
-
metadata.merge({:measured => false, :calculated => true})
|
197
|
-
if metadata["numeric"]
|
198
|
-
feature = NumericFeature.find_or_create_by(metadata)
|
199
|
-
elsif metadata["nominal"]
|
200
|
-
feature = NominalFeature.find_or_create_by(metadata)
|
201
|
-
end
|
202
|
-
end
|
203
|
-
feature_ids << feature.id if feature
|
174
|
+
features << feature if feature
|
204
175
|
end
|
205
176
|
|
206
|
-
|
207
|
-
time = Time.now
|
208
|
-
|
209
|
-
r = -1
|
210
|
-
compound_time = 0
|
211
|
-
value_time = 0
|
212
|
-
|
213
|
-
# compounds and values
|
214
|
-
self.data_entries = []
|
177
|
+
# substances and values
|
215
178
|
|
179
|
+
all_substances = []
|
216
180
|
table.each_with_index do |vals,i|
|
217
|
-
ct = Time.now
|
218
181
|
identifier = vals.shift.strip
|
219
|
-
|
182
|
+
warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values
|
220
183
|
begin
|
221
184
|
case compound_format
|
222
185
|
when /SMILES/i
|
223
|
-
|
186
|
+
substance = OpenTox::Compound.from_smiles(identifier)
|
224
187
|
when /InChI/i
|
225
|
-
|
188
|
+
substance = OpenTox::Compound.from_inchi(identifier)
|
226
189
|
end
|
227
190
|
rescue
|
228
|
-
|
191
|
+
substance = nil
|
229
192
|
end
|
230
|
-
if
|
231
|
-
# compound
|
232
|
-
warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
|
193
|
+
if substance.nil? # compound parsers may return nil
|
194
|
+
warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
|
233
195
|
next
|
234
196
|
end
|
235
|
-
|
236
|
-
|
197
|
+
all_substances << substance
|
198
|
+
substance.dataset_ids << self.id
|
199
|
+
substance.dataset_ids.uniq!
|
200
|
+
substance.save
|
237
201
|
|
238
|
-
|
239
|
-
|
240
|
-
warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
|
202
|
+
unless vals.size == features.size
|
203
|
+
warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
|
241
204
|
next
|
242
205
|
end
|
243
206
|
|
244
|
-
compound_ids << compound.id
|
245
|
-
table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
|
246
|
-
|
247
207
|
vals.each_with_index do |v,j|
|
248
208
|
if v.blank?
|
249
|
-
|
209
|
+
warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'."
|
250
210
|
next
|
251
211
|
elsif numeric[j]
|
252
212
|
v = v.to_f
|
253
213
|
else
|
254
214
|
v = v.strip
|
255
215
|
end
|
256
|
-
|
257
|
-
#i = compound.feature_ids.index feature_ids[j]
|
258
|
-
compound.features[feature_ids[j].to_s] ||= []
|
259
|
-
compound.features[feature_ids[j].to_s] << v
|
260
|
-
compound.save
|
216
|
+
add substance, features[j], v
|
261
217
|
end
|
218
|
+
data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values
|
262
219
|
end
|
263
|
-
|
220
|
+
all_substances.duplicates.each do |substance|
|
264
221
|
positions = []
|
265
|
-
|
266
|
-
|
222
|
+
all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
|
223
|
+
warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
|
267
224
|
end
|
268
|
-
|
269
|
-
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
|
270
|
-
time = Time.now
|
271
225
|
save
|
272
|
-
$logger.debug "Saving: #{Time.now-time}"
|
273
|
-
|
274
226
|
end
|
275
227
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
(0 .. compound_ids.size-1).each do |i|
|
280
|
-
data_entries[i] ||= []
|
281
|
-
(0 .. feature_ids.size-1).each do |j|
|
282
|
-
data_entries[i][j] ||= n
|
283
|
-
end
|
284
|
-
end
|
228
|
+
def delete
|
229
|
+
compounds.each{|c| c.dataset_ids.delete id.to_s}
|
230
|
+
super
|
285
231
|
end
|
286
232
|
|
287
233
|
end
|
288
234
|
|
289
235
|
# Dataset for lazar predictions
|
290
|
-
class LazarPrediction
|
236
|
+
class LazarPrediction #< Dataset
|
291
237
|
field :creator, type: String
|
292
|
-
field :prediction_feature_id, type:
|
238
|
+
field :prediction_feature_id, type: BSON::ObjectId
|
239
|
+
field :predictions, type: Hash, default: {}
|
293
240
|
|
294
241
|
def prediction_feature
|
295
242
|
Feature.find prediction_feature_id
|
296
243
|
end
|
297
244
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
class DescriptorDataset < Dataset
|
302
|
-
field :feature_calculation_algorithm, type: String
|
303
|
-
|
304
|
-
end
|
305
|
-
|
306
|
-
class ScaledDataset < DescriptorDataset
|
307
|
-
|
308
|
-
field :centers, type: Array, default: []
|
309
|
-
field :scales, type: Array, default: []
|
245
|
+
def compounds
|
246
|
+
substances.select{|s| s.is_a? Compound}
|
247
|
+
end
|
310
248
|
|
311
|
-
def
|
312
|
-
|
249
|
+
def substances
|
250
|
+
predictions.keys.collect{|id| Substance.find id}
|
313
251
|
end
|
314
|
-
end
|
315
252
|
|
316
|
-
# Dataset for fminer descriptors
|
317
|
-
class FminerDataset < DescriptorDataset
|
318
|
-
field :training_algorithm, type: String
|
319
|
-
field :training_dataset_id, type: BSON::ObjectId
|
320
|
-
field :training_feature_id, type: BSON::ObjectId
|
321
|
-
field :training_parameters, type: Hash
|
322
253
|
end
|
323
254
|
|
324
255
|
end
|
data/lib/feature.rb
CHANGED
@@ -2,27 +2,28 @@ module OpenTox
|
|
2
2
|
|
3
3
|
# Basic feature class
|
4
4
|
class Feature
|
5
|
-
field :nominal, type: Boolean
|
6
|
-
field :numeric, type: Boolean
|
7
5
|
field :measured, type: Boolean
|
8
6
|
field :calculated, type: Boolean
|
7
|
+
field :category, type: String
|
8
|
+
field :unit, type: String
|
9
|
+
field :conditions, type: Hash
|
10
|
+
|
11
|
+
def nominal?
|
12
|
+
self.class == NominalFeature
|
13
|
+
end
|
14
|
+
|
15
|
+
def numeric?
|
16
|
+
self.class == NumericFeature
|
17
|
+
end
|
9
18
|
end
|
10
19
|
|
11
20
|
# Feature for categorical variables
|
12
21
|
class NominalFeature < Feature
|
13
22
|
field :accept_values, type: Array
|
14
|
-
def initialize params
|
15
|
-
super params
|
16
|
-
nominal = true
|
17
|
-
end
|
18
23
|
end
|
19
24
|
|
20
25
|
# Feature for quantitative variables
|
21
26
|
class NumericFeature < Feature
|
22
|
-
def initialize params
|
23
|
-
super params
|
24
|
-
numeric = true
|
25
|
-
end
|
26
27
|
end
|
27
28
|
|
28
29
|
# Feature for SMARTS fragments
|
@@ -34,12 +35,4 @@ module OpenTox
|
|
34
35
|
end
|
35
36
|
end
|
36
37
|
|
37
|
-
# Feature for categorical bioassay results
|
38
|
-
class NominalBioAssay < NominalFeature
|
39
|
-
end
|
40
|
-
|
41
|
-
# Feature for quantitative bioassay results
|
42
|
-
class NumericBioAssay < NumericFeature
|
43
|
-
end
|
44
|
-
|
45
38
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module OpenTox
|
2
|
+
module Algorithm
|
3
|
+
|
4
|
+
class FeatureSelection
|
5
|
+
|
6
|
+
def self.correlation_filter model
|
7
|
+
relevant_features = {}
|
8
|
+
R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
|
9
|
+
model.descriptor_weights = []
|
10
|
+
selected_variables = []
|
11
|
+
selected_descriptor_ids = []
|
12
|
+
model.independent_variables.each_with_index do |v,i|
|
13
|
+
v.collect!{|n| to_r(n)}
|
14
|
+
R.assign "independent", v
|
15
|
+
begin
|
16
|
+
R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')"
|
17
|
+
pvalue = R.eval("cor$p.value").to_ruby
|
18
|
+
if pvalue <= 0.05
|
19
|
+
model.descriptor_weights << R.eval("cor$estimate").to_ruby**2
|
20
|
+
selected_variables << v
|
21
|
+
selected_descriptor_ids << model.descriptor_ids[i]
|
22
|
+
end
|
23
|
+
rescue
|
24
|
+
warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
model.independent_variables = selected_variables
|
29
|
+
model.descriptor_ids = selected_descriptor_ids
|
30
|
+
model
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.to_r v
|
34
|
+
return 0 if v == false
|
35
|
+
return 1 if v == true
|
36
|
+
v
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
end
|
data/lib/import.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
module Import
|
4
|
+
|
5
|
+
class Enanomapper
|
6
|
+
include OpenTox
|
7
|
+
|
8
|
+
# time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
|
9
|
+
def self.import
|
10
|
+
datasets = {}
|
11
|
+
bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
|
12
|
+
bundles.each do |bundle|
|
13
|
+
datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip)
|
14
|
+
$logger.debug bundle["title"].strip
|
15
|
+
nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
|
16
|
+
nanoparticles.each_with_index do |np,n|
|
17
|
+
core_id = nil
|
18
|
+
coating_ids = []
|
19
|
+
np["composition"].each do |c|
|
20
|
+
uri = c["component"]["compound"]["URI"]
|
21
|
+
uri = CGI.escape File.join(uri,"&media=application/json")
|
22
|
+
data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
|
23
|
+
smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
|
24
|
+
names = []
|
25
|
+
names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
|
26
|
+
names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
|
27
|
+
if smiles
|
28
|
+
compound = Compound.find_or_create_by(:smiles => smiles)
|
29
|
+
compound.name = names.first
|
30
|
+
compound.names = names.compact
|
31
|
+
else
|
32
|
+
compound = Compound.find_or_create_by(:name => names.first,:names => names.compact)
|
33
|
+
end
|
34
|
+
compound.save
|
35
|
+
if c["relation"] == "HAS_CORE"
|
36
|
+
core_id = compound.id.to_s
|
37
|
+
elsif c["relation"] == "HAS_COATING"
|
38
|
+
coating_ids << compound.id.to_s
|
39
|
+
end
|
40
|
+
end if np["composition"]
|
41
|
+
nanoparticle = Nanoparticle.find_or_create_by(
|
42
|
+
:name => np["values"]["https://data.enanomapper.net/identifier/name"],
|
43
|
+
:source => np["compound"]["URI"],
|
44
|
+
:core_id => core_id,
|
45
|
+
:coating_ids => coating_ids
|
46
|
+
)
|
47
|
+
np["bundles"].keys.each do |bundle_uri|
|
48
|
+
nanoparticle.dataset_ids << datasets[bundle_uri].id
|
49
|
+
end
|
50
|
+
|
51
|
+
studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
|
52
|
+
studies.each do |study|
|
53
|
+
dataset = datasets[np["bundles"].keys.first]
|
54
|
+
proteomics_features = {}
|
55
|
+
category = study["protocol"]["topcategory"]
|
56
|
+
source = study["protocol"]["category"]["term"]
|
57
|
+
study["effects"].each do |effect|
|
58
|
+
|
59
|
+
effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
|
60
|
+
effect["conditions"].delete_if { |k, v| v.nil? }
|
61
|
+
|
62
|
+
if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
|
63
|
+
|
64
|
+
JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
|
65
|
+
proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
|
66
|
+
nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
|
67
|
+
end
|
68
|
+
else
|
69
|
+
name = effect["endpoint"]
|
70
|
+
unit = effect["result"]["unit"]
|
71
|
+
warnings = []
|
72
|
+
case name
|
73
|
+
when "Log2 transformed" # use a sensible name
|
74
|
+
name = "log2(Net cell association)"
|
75
|
+
warnings = ["Original name was 'Log2 transformed'"]
|
76
|
+
unit = "log2(mL/ug(Mg))"
|
77
|
+
when "Total protein (BCA assay)"
|
78
|
+
category = "P-CHEM"
|
79
|
+
warnings = ["Category changed from TOX to P-CHEM"]
|
80
|
+
end
|
81
|
+
feature = klass.find_or_create_by(
|
82
|
+
:name => name,
|
83
|
+
:unit => unit,
|
84
|
+
:category => category,
|
85
|
+
:conditions => effect["conditions"],
|
86
|
+
:source => study["protocol"]["category"]["term"],
|
87
|
+
:measured => true,
|
88
|
+
:warnings => warnings
|
89
|
+
)
|
90
|
+
nanoparticle.parse_ambit_value feature, effect["result"], dataset
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
nanoparticle.save
|
95
|
+
print "#{n}, "
|
96
|
+
end
|
97
|
+
puts
|
98
|
+
end
|
99
|
+
datasets.each { |u,d| d.save }
|
100
|
+
end
|
101
|
+
|
102
|
+
=begin
|
103
|
+
def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
|
104
|
+
#get list of bundle URIs
|
105
|
+
bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
|
106
|
+
datasets = []
|
107
|
+
bundles.each do |bundle|
|
108
|
+
uri = bundle["URI"]
|
109
|
+
study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
|
110
|
+
study["@graph"].each do |i|
|
111
|
+
puts i.to_yaml if i.keys.include? "sio:has-value"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
datasets.collect{|d| d.id}
|
115
|
+
end
|
116
|
+
=end
|
117
|
+
|
118
|
+
end
|
119
|
+
|
120
|
+
end
|
121
|
+
|
122
|
+
end
|
data/lib/lazar.rb
CHANGED
@@ -48,6 +48,7 @@ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
|
|
48
48
|
R = Rserve::Connection.new
|
49
49
|
R.eval "
|
50
50
|
suppressPackageStartupMessages({
|
51
|
+
library(labeling,lib=\"#{rlib}\")
|
51
52
|
library(iterators,lib=\"#{rlib}\")
|
52
53
|
library(foreach,lib=\"#{rlib}\")
|
53
54
|
library(ggplot2,lib=\"#{rlib}\")
|
@@ -56,12 +57,14 @@ suppressPackageStartupMessages({
|
|
56
57
|
library(pls,lib=\"#{rlib}\")
|
57
58
|
library(caret,lib=\"#{rlib}\")
|
58
59
|
library(doMC,lib=\"#{rlib}\")
|
60
|
+
library(randomForest,lib=\"#{rlib}\")
|
61
|
+
library(plyr,lib=\"#{rlib}\")
|
59
62
|
registerDoMC(#{NR_CORES})
|
60
63
|
})
|
61
64
|
"
|
62
65
|
|
63
66
|
# OpenTox classes and includes
|
64
|
-
CLASSES = ["Feature","
|
67
|
+
CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
|
65
68
|
|
66
69
|
[ # be aware of the require sequence as it affects class/method overwrites
|
67
70
|
"overwrite.rb",
|
@@ -70,15 +73,22 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
|
|
70
73
|
"opentox.rb",
|
71
74
|
"feature.rb",
|
72
75
|
"physchem.rb",
|
76
|
+
"substance.rb",
|
73
77
|
"compound.rb",
|
78
|
+
"nanoparticle.rb",
|
74
79
|
"dataset.rb",
|
75
80
|
"algorithm.rb",
|
81
|
+
"similarity.rb",
|
82
|
+
"feature_selection.rb",
|
76
83
|
"model.rb",
|
77
84
|
"classification.rb",
|
78
85
|
"regression.rb",
|
86
|
+
"caret.rb",
|
87
|
+
"validation-statistics.rb",
|
79
88
|
"validation.rb",
|
80
|
-
"
|
89
|
+
"train-test-validation.rb",
|
81
90
|
"leave-one-out-validation.rb",
|
82
|
-
"
|
91
|
+
"crossvalidation.rb",
|
92
|
+
#"experiment.rb",
|
93
|
+
"import.rb",
|
83
94
|
].each{ |f| require_relative f }
|
84
|
-
OpenTox::PhysChem.descriptors # load descriptor features
|