lazar 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
data/lib/dataset.rb
CHANGED
@@ -5,24 +5,11 @@ module OpenTox
|
|
5
5
|
|
6
6
|
class Dataset
|
7
7
|
|
8
|
-
attr_writer :data_entries
|
9
|
-
|
10
8
|
# associations like has_many, belongs_to deteriorate performance
|
11
9
|
field :feature_ids, type: Array, default: []
|
12
10
|
field :compound_ids, type: Array, default: []
|
13
|
-
field :
|
11
|
+
field :data_entries, type: Array, default: []
|
14
12
|
field :source, type: String
|
15
|
-
field :warnings, type: Array, default: []
|
16
|
-
|
17
|
-
# Save all data including data_entries
|
18
|
-
# Should be used instead of save
|
19
|
-
def save_all
|
20
|
-
dump = Marshal.dump(@data_entries)
|
21
|
-
file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
|
22
|
-
data_entries_id = $gridfs.insert_one(file)
|
23
|
-
update(:data_entries_id => data_entries_id)
|
24
|
-
save
|
25
|
-
end
|
26
13
|
|
27
14
|
# Readers
|
28
15
|
|
@@ -38,24 +25,6 @@ module OpenTox
|
|
38
25
|
@features
|
39
26
|
end
|
40
27
|
|
41
|
-
# Get all data_entries
|
42
|
-
def data_entries
|
43
|
-
unless @data_entries
|
44
|
-
t = Time.now
|
45
|
-
data_entry_file = $gridfs.find_one(_id: data_entries_id)
|
46
|
-
if data_entry_file.nil?
|
47
|
-
@data_entries = []
|
48
|
-
else
|
49
|
-
@data_entries = Marshal.load(data_entry_file.data)
|
50
|
-
bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
|
51
|
-
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
|
52
|
-
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
|
53
|
-
$logger.debug "Retrieving data: #{Time.now-t}"
|
54
|
-
end
|
55
|
-
end
|
56
|
-
@data_entries
|
57
|
-
end
|
58
|
-
|
59
28
|
# Find data entry values for a given compound and feature
|
60
29
|
# @param compound [OpenTox::Compound] OpenTox Compound object
|
61
30
|
# @param feature [OpenTox::Feature] OpenTox Feature object
|
@@ -84,7 +53,13 @@ module OpenTox
|
|
84
53
|
# @param [Integer] number of folds
|
85
54
|
# @return [Array] Array with folds [training_dataset,test_dataset]
|
86
55
|
def folds n
|
87
|
-
|
56
|
+
unique_compound_data = {}
|
57
|
+
compound_ids.each_with_index do |cid,i|
|
58
|
+
unique_compound_data[cid] ||= []
|
59
|
+
unique_compound_data[cid] << data_entries[i]
|
60
|
+
end
|
61
|
+
unique_compound_ids = unique_compound_data.keys
|
62
|
+
len = unique_compound_ids.size
|
88
63
|
indices = (0..len-1).to_a.shuffle
|
89
64
|
mid = (len/n)
|
90
65
|
chunks = []
|
@@ -93,22 +68,44 @@ module OpenTox
|
|
93
68
|
last = start+mid
|
94
69
|
last = last-1 unless len%n >= i
|
95
70
|
test_idxs = indices[start..last] || []
|
96
|
-
test_cids = test_idxs.collect{|i|
|
97
|
-
test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
|
98
|
-
test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
|
71
|
+
test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
|
99
72
|
training_idxs = indices-test_idxs
|
100
|
-
training_cids = training_idxs.collect{|i|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
73
|
+
training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
|
74
|
+
chunk = [training_cids,test_cids].collect do |unique_cids|
|
75
|
+
cids = []
|
76
|
+
data_entries = []
|
77
|
+
unique_cids.each do |cid|
|
78
|
+
unique_compound_data[cid].each do |de|
|
79
|
+
cids << cid
|
80
|
+
data_entries << de
|
81
|
+
end
|
82
|
+
end
|
83
|
+
dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
|
84
|
+
dataset.compounds.each do |compound|
|
85
|
+
compound.dataset_ids << dataset.id
|
86
|
+
compound.save
|
87
|
+
end
|
88
|
+
dataset.save
|
89
|
+
dataset
|
90
|
+
end
|
106
91
|
start = last+1
|
92
|
+
chunks << chunk
|
107
93
|
end
|
108
94
|
chunks
|
109
95
|
end
|
110
96
|
|
111
97
|
# Diagnostics
|
98
|
+
|
99
|
+
def duplicates feature=self.features.first
|
100
|
+
col = feature_ids.index feature.id
|
101
|
+
dups = {}
|
102
|
+
compound_ids.each_with_index do |cid,i|
|
103
|
+
rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
|
104
|
+
values = rows.collect{|row| data_entries[row][col]}
|
105
|
+
dups[cid] = values if values.size > 1
|
106
|
+
end
|
107
|
+
dups
|
108
|
+
end
|
112
109
|
|
113
110
|
def correlation_plot training_dataset
|
114
111
|
# TODO: create/store svg
|
@@ -120,23 +117,22 @@ module OpenTox
|
|
120
117
|
def density_plot
|
121
118
|
# TODO: create/store svg
|
122
119
|
R.assign "acts", data_entries.collect{|r| r.first }#.compact
|
123
|
-
R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')"
|
120
|
+
R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
|
124
121
|
end
|
125
122
|
|
126
123
|
# Serialisation
|
127
124
|
|
128
|
-
# converts dataset to csv format including compound smiles as first column, other column headers are feature
|
125
|
+
# converts dataset to csv format including compound smiles as first column, other column headers are feature names
|
129
126
|
# @return [String]
|
130
127
|
def to_csv(inchi=false)
|
131
128
|
CSV.generate() do |csv| #{:force_quotes=>true}
|
132
|
-
csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.
|
129
|
+
csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
|
133
130
|
compounds.each_with_index do |c,i|
|
134
131
|
csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
|
135
132
|
end
|
136
133
|
end
|
137
134
|
end
|
138
135
|
|
139
|
-
|
140
136
|
# Parsers
|
141
137
|
|
142
138
|
# Create a dataset from file (csv,sdf,...)
|
@@ -145,14 +141,21 @@ module OpenTox
|
|
145
141
|
# TODO
|
146
142
|
#def self.from_sdf_file
|
147
143
|
#end
|
148
|
-
|
144
|
+
|
149
145
|
# Create a dataset from CSV file
|
150
146
|
# TODO: document structure
|
151
|
-
def self.from_csv_file file, source=nil, bioassay=true
|
147
|
+
def self.from_csv_file file, source=nil, bioassay=true#, layout={}
|
152
148
|
source ||= file
|
153
|
-
|
154
|
-
dataset = self.
|
155
|
-
dataset
|
149
|
+
name = File.basename(file,".*")
|
150
|
+
dataset = self.find_by(:source => source, :name => name)
|
151
|
+
if dataset
|
152
|
+
$logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
|
153
|
+
else
|
154
|
+
$logger.debug "Parsing #{file}."
|
155
|
+
table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
|
156
|
+
dataset = self.new(:source => source, :name => name)
|
157
|
+
dataset.parse_table table, bioassay#, layout
|
158
|
+
end
|
156
159
|
dataset
|
157
160
|
end
|
158
161
|
|
@@ -197,7 +200,7 @@ module OpenTox
|
|
197
200
|
feature = NominalFeature.find_or_create_by(metadata)
|
198
201
|
end
|
199
202
|
end
|
200
|
-
feature_ids << feature.id
|
203
|
+
feature_ids << feature.id if feature
|
201
204
|
end
|
202
205
|
|
203
206
|
$logger.debug "Feature values: #{Time.now-time}"
|
@@ -208,11 +211,11 @@ module OpenTox
|
|
208
211
|
value_time = 0
|
209
212
|
|
210
213
|
# compounds and values
|
211
|
-
|
214
|
+
self.data_entries = []
|
212
215
|
|
213
216
|
table.each_with_index do |vals,i|
|
214
217
|
ct = Time.now
|
215
|
-
identifier = vals.shift
|
218
|
+
identifier = vals.shift.strip
|
216
219
|
warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
|
217
220
|
begin
|
218
221
|
case compound_format
|
@@ -229,7 +232,7 @@ module OpenTox
|
|
229
232
|
warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
|
230
233
|
next
|
231
234
|
end
|
232
|
-
|
235
|
+
compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
|
233
236
|
compound_time += Time.now-ct
|
234
237
|
|
235
238
|
r += 1
|
@@ -239,72 +242,48 @@ module OpenTox
|
|
239
242
|
end
|
240
243
|
|
241
244
|
compound_ids << compound.id
|
242
|
-
|
245
|
+
table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
|
243
246
|
|
244
247
|
vals.each_with_index do |v,j|
|
245
248
|
if v.blank?
|
246
249
|
warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
|
247
250
|
next
|
248
251
|
elsif numeric[j]
|
249
|
-
|
252
|
+
v = v.to_f
|
250
253
|
else
|
251
|
-
|
254
|
+
v = v.strip
|
252
255
|
end
|
256
|
+
self.data_entries.last[j] = v
|
257
|
+
#i = compound.feature_ids.index feature_ids[j]
|
258
|
+
compound.features[feature_ids[j].to_s] ||= []
|
259
|
+
compound.features[feature_ids[j].to_s] << v
|
260
|
+
compound.save
|
253
261
|
end
|
254
262
|
end
|
255
263
|
compounds.duplicates.each do |compound|
|
256
264
|
positions = []
|
257
|
-
compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
|
258
|
-
warnings << "Duplicate compound #{compound.
|
265
|
+
compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
|
266
|
+
warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
|
259
267
|
end
|
260
268
|
|
261
269
|
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
|
262
270
|
time = Time.now
|
263
|
-
|
271
|
+
save
|
264
272
|
$logger.debug "Saving: #{Time.now-time}"
|
265
273
|
|
266
274
|
end
|
267
275
|
|
268
|
-
=begin
|
269
|
-
# TODO remove
|
270
|
-
|
271
|
-
# Create a dataset with compounds and features
|
272
|
-
def self.create compounds, features, warnings=[], source=nil
|
273
|
-
dataset = Dataset.new(:warnings => warnings)
|
274
|
-
dataset.compounds = compounds
|
275
|
-
dataset.features = features
|
276
|
-
dataset
|
277
|
-
end
|
278
|
-
# merge dataset (i.e. append features)
|
279
|
-
def +(dataset)
|
280
|
-
bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
|
281
|
-
bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
|
282
|
-
self.feature_ids ||= []
|
283
|
-
self.feature_ids = self.feature_ids + dataset.feature_ids
|
284
|
-
@data_entries ||= Array.new(compound_ids.size){[]}
|
285
|
-
@data_entries.each_with_index do |row,i|
|
286
|
-
@data_entries[i] = row + dataset.fingerprint(compounds[i])
|
287
|
-
end
|
288
|
-
self
|
289
|
-
|
290
|
-
end
|
291
|
-
|
292
|
-
def fingerprint(compound)
|
293
|
-
i = compound_ids.index(compound.id)
|
294
|
-
i.nil? ? nil : data_entries[i]
|
295
|
-
end
|
296
|
-
=end
|
297
|
-
|
298
276
|
# Fill unset data entries
|
299
277
|
# @param any value
|
300
278
|
def fill_nil_with n
|
301
279
|
(0 .. compound_ids.size-1).each do |i|
|
302
|
-
|
280
|
+
data_entries[i] ||= []
|
303
281
|
(0 .. feature_ids.size-1).each do |j|
|
304
|
-
|
282
|
+
data_entries[i][j] ||= n
|
305
283
|
end
|
306
284
|
end
|
307
285
|
end
|
286
|
+
|
308
287
|
end
|
309
288
|
|
310
289
|
# Dataset for lazar predictions
|
@@ -321,6 +300,17 @@ module OpenTox
|
|
321
300
|
# Dataset for descriptors (physchem)
|
322
301
|
class DescriptorDataset < Dataset
|
323
302
|
field :feature_calculation_algorithm, type: String
|
303
|
+
|
304
|
+
end
|
305
|
+
|
306
|
+
class ScaledDataset < DescriptorDataset
|
307
|
+
|
308
|
+
field :centers, type: Array, default: []
|
309
|
+
field :scales, type: Array, default: []
|
310
|
+
|
311
|
+
def original_value value, i
|
312
|
+
value * scales[i] + centers[i]
|
313
|
+
end
|
324
314
|
end
|
325
315
|
|
326
316
|
# Dataset for fminer descriptors
|
data/lib/error.rb
CHANGED
@@ -58,7 +58,7 @@ module OpenTox
|
|
58
58
|
OpenTox.const_set error[:class],c
|
59
59
|
|
60
60
|
# define global methods for raising errors, eg. bad_request_error
|
61
|
-
Object.send(:define_method, error[:method]) do |message
|
61
|
+
Object.send(:define_method, error[:method]) do |message|
|
62
62
|
raise c.new(message)
|
63
63
|
end
|
64
64
|
end
|
data/lib/experiment.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
class Experiment
|
4
|
+
field :dataset_ids, type: Array
|
5
|
+
field :model_settings, type: Array, default: []
|
6
|
+
field :results, type: Hash, default: {}
|
7
|
+
|
8
|
+
def run
|
9
|
+
dataset_ids.each do |dataset_id|
|
10
|
+
dataset = Dataset.find(dataset_id)
|
11
|
+
results[dataset_id.to_s] = []
|
12
|
+
model_settings.each do |setting|
|
13
|
+
setting = setting.dup
|
14
|
+
model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
|
15
|
+
model = Object.const_get(model_algorithm).create dataset, setting
|
16
|
+
$logger.debug model
|
17
|
+
model.save
|
18
|
+
repeated_crossvalidation = RepeatedCrossValidation.create model
|
19
|
+
results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
|
20
|
+
end
|
21
|
+
end
|
22
|
+
save
|
23
|
+
end
|
24
|
+
|
25
|
+
def report
|
26
|
+
# statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
|
27
|
+
report = {}
|
28
|
+
report[:name] = name
|
29
|
+
report[:experiment_id] = self.id.to_s
|
30
|
+
report[:results] = {}
|
31
|
+
parameters = []
|
32
|
+
dataset_ids.each do |dataset_id|
|
33
|
+
dataset_name = Dataset.find(dataset_id).name
|
34
|
+
report[:results][dataset_name] = {}
|
35
|
+
report[:results][dataset_name][:anova] = {}
|
36
|
+
report[:results][dataset_name][:data] = []
|
37
|
+
# TODO results[dataset_id.to_s] does not exist
|
38
|
+
results[dataset_id.to_s].each do |result|
|
39
|
+
model = Model::Lazar.find(result[:model_id])
|
40
|
+
repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
|
41
|
+
crossvalidations = repeated_cv.crossvalidations
|
42
|
+
if crossvalidations.first.is_a? ClassificationCrossValidation
|
43
|
+
parameters = [:accuracy,:true_rate,:predictivity]
|
44
|
+
elsif crossvalidations.first.is_a? RegressionCrossValidation
|
45
|
+
parameters = [:rmse,:mae,:r_squared]
|
46
|
+
end
|
47
|
+
summary = {}
|
48
|
+
[:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
|
49
|
+
summary[key] = model[key]
|
50
|
+
end
|
51
|
+
summary[:nr_instances] = crossvalidations.first.nr_instances
|
52
|
+
summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
|
53
|
+
summary[:time] = crossvalidations.collect{|cv| cv.time}
|
54
|
+
parameters.each do |param|
|
55
|
+
summary[param] = crossvalidations.collect{|cv| cv.send(param)}
|
56
|
+
end
|
57
|
+
report[:results][dataset_name][:data] << summary
|
58
|
+
end
|
59
|
+
end
|
60
|
+
report[:results].each do |dataset,results|
|
61
|
+
([:time,:nr_unpredicted]+parameters).each do |param|
|
62
|
+
experiments = []
|
63
|
+
outcome = []
|
64
|
+
results[:data].each_with_index do |result,i|
|
65
|
+
result[param].each do |p|
|
66
|
+
experiments << i
|
67
|
+
p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
|
68
|
+
outcome << p
|
69
|
+
end
|
70
|
+
end
|
71
|
+
begin
|
72
|
+
R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
|
73
|
+
R.eval "experiment_nr = factor(experiment_nr)"
|
74
|
+
R.assign "outcome", outcome
|
75
|
+
R.eval "data = data.frame(experiment_nr,outcome)"
|
76
|
+
# one-way ANOVA
|
77
|
+
R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
|
78
|
+
# http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
|
79
|
+
p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
|
80
|
+
# aequivalent
|
81
|
+
# sum = R.eval("summary(fit)")
|
82
|
+
#p_value = sum.to_ruby.first.last.first
|
83
|
+
rescue
|
84
|
+
p_value = nil
|
85
|
+
end
|
86
|
+
report[:results][dataset][:anova][param] = p_value
|
87
|
+
=begin
|
88
|
+
=end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
report
|
92
|
+
end
|
93
|
+
|
94
|
+
def summary
|
95
|
+
report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
data/lib/feature.rb
CHANGED
@@ -2,15 +2,14 @@ module OpenTox
|
|
2
2
|
|
3
3
|
# Basic feature class
|
4
4
|
class Feature
|
5
|
-
field :name, as: :title, type: String
|
6
5
|
field :nominal, type: Boolean
|
7
6
|
field :numeric, type: Boolean
|
8
7
|
field :measured, type: Boolean
|
8
|
+
field :calculated, type: Boolean
|
9
9
|
end
|
10
10
|
|
11
11
|
# Feature for categorical variables
|
12
12
|
class NominalFeature < Feature
|
13
|
-
# TODO check if accept_values are still needed
|
14
13
|
field :accept_values, type: Array
|
15
14
|
def initialize params
|
16
15
|
super params
|
@@ -29,69 +28,18 @@ module OpenTox
|
|
29
28
|
# Feature for SMARTS fragments
|
30
29
|
class Smarts < NominalFeature
|
31
30
|
field :smarts, type: String
|
31
|
+
index "smarts" => 1
|
32
32
|
def self.from_smarts smarts
|
33
33
|
self.find_or_create_by :smarts => smarts
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
# Feature for supervised fragments from Fminer algorithm
|
38
|
-
class FminerSmarts < Smarts
|
39
|
-
field :p_value, type: Float
|
40
|
-
# TODO check if effect is used
|
41
|
-
field :effect, type: String
|
42
|
-
field :dataset_id
|
43
|
-
end
|
44
|
-
|
45
|
-
# Feature for database fingerprints
|
46
|
-
# needs count for efficient retrieval (see compound.rb)
|
47
|
-
class FingerprintSmarts < Smarts
|
48
|
-
field :count, type: Integer
|
49
|
-
def self.fingerprint
|
50
|
-
@@fp4 ||= OpenTox::FingerprintSmarts.all
|
51
|
-
unless @@fp4.size == 306
|
52
|
-
@@fp4 = []
|
53
|
-
# OpenBabel FP4 fingerprints
|
54
|
-
# OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
|
55
|
-
# TODO investigate other types of fingerprints (MACCS)
|
56
|
-
# OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
|
57
|
-
# http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
|
58
|
-
# OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
|
59
|
-
# Morgan ECFP, FCFP
|
60
|
-
# http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
|
61
|
-
# http://www.rdkit.org/docs/GettingStartedInPython.html
|
62
|
-
# Chemfp
|
63
|
-
# https://chemfp.readthedocs.org/en/latest/using-tools.html
|
64
|
-
# CACTVS/PubChem
|
65
|
-
|
66
|
-
File.open(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")).each do |l|
|
67
|
-
l.strip!
|
68
|
-
unless l.empty? or l.match /^#/
|
69
|
-
name,smarts = l.split(': ')
|
70
|
-
@@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
@@fp4
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
# Feature for physico-chemical descriptors
|
79
|
-
class PhysChemDescriptor < NumericFeature
|
80
|
-
field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
|
81
|
-
field :parameters, type: Hash
|
82
|
-
field :creator, type: String
|
83
|
-
end
|
84
|
-
|
85
37
|
# Feature for categorical bioassay results
|
86
38
|
class NominalBioAssay < NominalFeature
|
87
|
-
# TODO: needed? move to dataset?
|
88
|
-
field :description, type: String
|
89
39
|
end
|
90
40
|
|
91
41
|
# Feature for quantitative bioassay results
|
92
42
|
class NumericBioAssay < NumericFeature
|
93
|
-
# TODO: needed? move to dataset?
|
94
|
-
field :description, type: String
|
95
43
|
end
|
96
44
|
|
97
45
|
end
|
data/lib/lazar.rb
CHANGED
@@ -8,43 +8,58 @@ require 'mongoid'
|
|
8
8
|
require 'rserve'
|
9
9
|
require "nokogiri"
|
10
10
|
require "base64"
|
11
|
+
require 'openbabel'
|
11
12
|
|
13
|
+
# Environment setup
|
14
|
+
ENV["LAZAR_ENV"] ||= "production"
|
15
|
+
raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', please set it to 'production' or 'development'." unless ENV["LAZAR_ENV"].match(/production|development/)
|
12
16
|
|
13
|
-
|
14
|
-
#
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
17
|
+
ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"]
|
18
|
+
ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
|
19
|
+
Mongoid.load_configuration({
|
20
|
+
:clients => {
|
21
|
+
:default => {
|
22
|
+
:database => ENV["LAZAR_ENV"],
|
23
|
+
:hosts => ["localhost:27017"],
|
24
|
+
}
|
25
|
+
}
|
26
|
+
})
|
27
|
+
Mongoid.raise_not_found_error = false # return nil if no document is found
|
28
|
+
$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}")
|
21
29
|
$gridfs = $mongo.database.fs
|
22
30
|
|
23
|
-
# R setup
|
24
|
-
R = Rserve::Connection.new
|
25
|
-
|
26
31
|
# Logger setup
|
32
|
+
STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files
|
27
33
|
$logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
require_relative '../last-utils/lu.rb'
|
37
|
-
require_relative '../openbabel/lib/openbabel'
|
34
|
+
case ENV["LAZAR_ENV"]
|
35
|
+
when "production"
|
36
|
+
$logger.level = Logger::WARN
|
37
|
+
Mongo::Logger.level = Logger::WARN
|
38
|
+
when "development"
|
39
|
+
$logger.level = Logger::DEBUG
|
40
|
+
Mongo::Logger.level = Logger::WARN
|
41
|
+
end
|
38
42
|
|
39
|
-
#
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
43
|
+
# R setup
|
44
|
+
rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
|
45
|
+
# should work on POSIX including os x
|
46
|
+
# http://stackoverflow.com/questions/19619582/number-of-processors-cores-in-command-line
|
47
|
+
NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
|
48
|
+
R = Rserve::Connection.new
|
49
|
+
R.eval "
|
50
|
+
suppressPackageStartupMessages({
|
51
|
+
library(ggplot2,lib=\"#{rlib}\")
|
52
|
+
library(grid,lib=\"#{rlib}\")
|
53
|
+
library(gridExtra,lib=\"#{rlib}\")
|
54
|
+
library(pls,lib=\"#{rlib}\")
|
55
|
+
library(caret,lib=\"#{rlib}\")
|
56
|
+
library(doMC,lib=\"#{rlib}\")
|
57
|
+
registerDoMC(#{NR_CORES})
|
58
|
+
})
|
59
|
+
"
|
45
60
|
|
46
61
|
# OpenTox classes and includes
|
47
|
-
CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules
|
62
|
+
CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
|
48
63
|
|
49
64
|
[ # be aware of the require sequence as it affects class/method overwrites
|
50
65
|
"overwrite.rb",
|
@@ -52,18 +67,16 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor
|
|
52
67
|
"error.rb",
|
53
68
|
"opentox.rb",
|
54
69
|
"feature.rb",
|
70
|
+
"physchem.rb",
|
55
71
|
"compound.rb",
|
56
72
|
"dataset.rb",
|
57
|
-
"descriptor.rb",
|
58
73
|
"algorithm.rb",
|
59
|
-
"descriptor.rb",
|
60
|
-
"bbrc.rb",
|
61
74
|
"model.rb",
|
62
|
-
"similarity.rb",
|
63
|
-
"neighbor.rb",
|
64
75
|
"classification.rb",
|
65
76
|
"regression.rb",
|
66
77
|
"validation.rb",
|
67
78
|
"crossvalidation.rb",
|
79
|
+
"leave-one-out-validation.rb",
|
80
|
+
"experiment.rb",
|
68
81
|
].each{ |f| require_relative f }
|
69
|
-
|
82
|
+
OpenTox::PhysChem.descriptors # load descriptor features
|