lazar 0.0.7 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
data/lib/dataset.rb
CHANGED
@@ -5,24 +5,11 @@ module OpenTox
|
|
5
5
|
|
6
6
|
class Dataset
|
7
7
|
|
8
|
-
attr_writer :data_entries
|
9
|
-
|
10
8
|
# associations like has_many, belongs_to deteriorate performance
|
11
9
|
field :feature_ids, type: Array, default: []
|
12
10
|
field :compound_ids, type: Array, default: []
|
13
|
-
field :
|
11
|
+
field :data_entries, type: Array, default: []
|
14
12
|
field :source, type: String
|
15
|
-
field :warnings, type: Array, default: []
|
16
|
-
|
17
|
-
# Save all data including data_entries
|
18
|
-
# Should be used instead of save
|
19
|
-
def save_all
|
20
|
-
dump = Marshal.dump(@data_entries)
|
21
|
-
file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
|
22
|
-
data_entries_id = $gridfs.insert_one(file)
|
23
|
-
update(:data_entries_id => data_entries_id)
|
24
|
-
save
|
25
|
-
end
|
26
13
|
|
27
14
|
# Readers
|
28
15
|
|
@@ -38,24 +25,6 @@ module OpenTox
|
|
38
25
|
@features
|
39
26
|
end
|
40
27
|
|
41
|
-
# Get all data_entries
|
42
|
-
def data_entries
|
43
|
-
unless @data_entries
|
44
|
-
t = Time.now
|
45
|
-
data_entry_file = $gridfs.find_one(_id: data_entries_id)
|
46
|
-
if data_entry_file.nil?
|
47
|
-
@data_entries = []
|
48
|
-
else
|
49
|
-
@data_entries = Marshal.load(data_entry_file.data)
|
50
|
-
bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
|
51
|
-
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
|
52
|
-
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
|
53
|
-
$logger.debug "Retrieving data: #{Time.now-t}"
|
54
|
-
end
|
55
|
-
end
|
56
|
-
@data_entries
|
57
|
-
end
|
58
|
-
|
59
28
|
# Find data entry values for a given compound and feature
|
60
29
|
# @param compound [OpenTox::Compound] OpenTox Compound object
|
61
30
|
# @param feature [OpenTox::Feature] OpenTox Feature object
|
@@ -84,7 +53,13 @@ module OpenTox
|
|
84
53
|
# @param [Integer] number of folds
|
85
54
|
# @return [Array] Array with folds [training_dataset,test_dataset]
|
86
55
|
def folds n
|
87
|
-
|
56
|
+
unique_compound_data = {}
|
57
|
+
compound_ids.each_with_index do |cid,i|
|
58
|
+
unique_compound_data[cid] ||= []
|
59
|
+
unique_compound_data[cid] << data_entries[i]
|
60
|
+
end
|
61
|
+
unique_compound_ids = unique_compound_data.keys
|
62
|
+
len = unique_compound_ids.size
|
88
63
|
indices = (0..len-1).to_a.shuffle
|
89
64
|
mid = (len/n)
|
90
65
|
chunks = []
|
@@ -93,22 +68,44 @@ module OpenTox
|
|
93
68
|
last = start+mid
|
94
69
|
last = last-1 unless len%n >= i
|
95
70
|
test_idxs = indices[start..last] || []
|
96
|
-
test_cids = test_idxs.collect{|i|
|
97
|
-
test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
|
98
|
-
test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
|
71
|
+
test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
|
99
72
|
training_idxs = indices-test_idxs
|
100
|
-
training_cids = training_idxs.collect{|i|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
73
|
+
training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
|
74
|
+
chunk = [training_cids,test_cids].collect do |unique_cids|
|
75
|
+
cids = []
|
76
|
+
data_entries = []
|
77
|
+
unique_cids.each do |cid|
|
78
|
+
unique_compound_data[cid].each do |de|
|
79
|
+
cids << cid
|
80
|
+
data_entries << de
|
81
|
+
end
|
82
|
+
end
|
83
|
+
dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
|
84
|
+
dataset.compounds.each do |compound|
|
85
|
+
compound.dataset_ids << dataset.id
|
86
|
+
compound.save
|
87
|
+
end
|
88
|
+
dataset.save
|
89
|
+
dataset
|
90
|
+
end
|
106
91
|
start = last+1
|
92
|
+
chunks << chunk
|
107
93
|
end
|
108
94
|
chunks
|
109
95
|
end
|
110
96
|
|
111
97
|
# Diagnostics
|
98
|
+
|
99
|
+
def duplicates feature=self.features.first
|
100
|
+
col = feature_ids.index feature.id
|
101
|
+
dups = {}
|
102
|
+
compound_ids.each_with_index do |cid,i|
|
103
|
+
rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
|
104
|
+
values = rows.collect{|row| data_entries[row][col]}
|
105
|
+
dups[cid] = values if values.size > 1
|
106
|
+
end
|
107
|
+
dups
|
108
|
+
end
|
112
109
|
|
113
110
|
def correlation_plot training_dataset
|
114
111
|
# TODO: create/store svg
|
@@ -120,23 +117,22 @@ module OpenTox
|
|
120
117
|
def density_plot
|
121
118
|
# TODO: create/store svg
|
122
119
|
R.assign "acts", data_entries.collect{|r| r.first }#.compact
|
123
|
-
R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')"
|
120
|
+
R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
|
124
121
|
end
|
125
122
|
|
126
123
|
# Serialisation
|
127
124
|
|
128
|
-
# converts dataset to csv format including compound smiles as first column, other column headers are feature
|
125
|
+
# converts dataset to csv format including compound smiles as first column, other column headers are feature names
|
129
126
|
# @return [String]
|
130
127
|
def to_csv(inchi=false)
|
131
128
|
CSV.generate() do |csv| #{:force_quotes=>true}
|
132
|
-
csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.
|
129
|
+
csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
|
133
130
|
compounds.each_with_index do |c,i|
|
134
131
|
csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
|
135
132
|
end
|
136
133
|
end
|
137
134
|
end
|
138
135
|
|
139
|
-
|
140
136
|
# Parsers
|
141
137
|
|
142
138
|
# Create a dataset from file (csv,sdf,...)
|
@@ -145,14 +141,21 @@ module OpenTox
|
|
145
141
|
# TODO
|
146
142
|
#def self.from_sdf_file
|
147
143
|
#end
|
148
|
-
|
144
|
+
|
149
145
|
# Create a dataset from CSV file
|
150
146
|
# TODO: document structure
|
151
|
-
def self.from_csv_file file, source=nil, bioassay=true
|
147
|
+
def self.from_csv_file file, source=nil, bioassay=true#, layout={}
|
152
148
|
source ||= file
|
153
|
-
|
154
|
-
dataset = self.
|
155
|
-
dataset
|
149
|
+
name = File.basename(file,".*")
|
150
|
+
dataset = self.find_by(:source => source, :name => name)
|
151
|
+
if dataset
|
152
|
+
$logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
|
153
|
+
else
|
154
|
+
$logger.debug "Parsing #{file}."
|
155
|
+
table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
|
156
|
+
dataset = self.new(:source => source, :name => name)
|
157
|
+
dataset.parse_table table, bioassay#, layout
|
158
|
+
end
|
156
159
|
dataset
|
157
160
|
end
|
158
161
|
|
@@ -197,7 +200,7 @@ module OpenTox
|
|
197
200
|
feature = NominalFeature.find_or_create_by(metadata)
|
198
201
|
end
|
199
202
|
end
|
200
|
-
feature_ids << feature.id
|
203
|
+
feature_ids << feature.id if feature
|
201
204
|
end
|
202
205
|
|
203
206
|
$logger.debug "Feature values: #{Time.now-time}"
|
@@ -208,11 +211,11 @@ module OpenTox
|
|
208
211
|
value_time = 0
|
209
212
|
|
210
213
|
# compounds and values
|
211
|
-
|
214
|
+
self.data_entries = []
|
212
215
|
|
213
216
|
table.each_with_index do |vals,i|
|
214
217
|
ct = Time.now
|
215
|
-
identifier = vals.shift
|
218
|
+
identifier = vals.shift.strip
|
216
219
|
warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
|
217
220
|
begin
|
218
221
|
case compound_format
|
@@ -229,7 +232,7 @@ module OpenTox
|
|
229
232
|
warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
|
230
233
|
next
|
231
234
|
end
|
232
|
-
|
235
|
+
compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
|
233
236
|
compound_time += Time.now-ct
|
234
237
|
|
235
238
|
r += 1
|
@@ -239,72 +242,48 @@ module OpenTox
|
|
239
242
|
end
|
240
243
|
|
241
244
|
compound_ids << compound.id
|
242
|
-
|
245
|
+
table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
|
243
246
|
|
244
247
|
vals.each_with_index do |v,j|
|
245
248
|
if v.blank?
|
246
249
|
warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
|
247
250
|
next
|
248
251
|
elsif numeric[j]
|
249
|
-
|
252
|
+
v = v.to_f
|
250
253
|
else
|
251
|
-
|
254
|
+
v = v.strip
|
252
255
|
end
|
256
|
+
self.data_entries.last[j] = v
|
257
|
+
#i = compound.feature_ids.index feature_ids[j]
|
258
|
+
compound.features[feature_ids[j].to_s] ||= []
|
259
|
+
compound.features[feature_ids[j].to_s] << v
|
260
|
+
compound.save
|
253
261
|
end
|
254
262
|
end
|
255
263
|
compounds.duplicates.each do |compound|
|
256
264
|
positions = []
|
257
|
-
compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
|
258
|
-
warnings << "Duplicate compound #{compound.
|
265
|
+
compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
|
266
|
+
warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
|
259
267
|
end
|
260
268
|
|
261
269
|
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
|
262
270
|
time = Time.now
|
263
|
-
|
271
|
+
save
|
264
272
|
$logger.debug "Saving: #{Time.now-time}"
|
265
273
|
|
266
274
|
end
|
267
275
|
|
268
|
-
=begin
|
269
|
-
# TODO remove
|
270
|
-
|
271
|
-
# Create a dataset with compounds and features
|
272
|
-
def self.create compounds, features, warnings=[], source=nil
|
273
|
-
dataset = Dataset.new(:warnings => warnings)
|
274
|
-
dataset.compounds = compounds
|
275
|
-
dataset.features = features
|
276
|
-
dataset
|
277
|
-
end
|
278
|
-
# merge dataset (i.e. append features)
|
279
|
-
def +(dataset)
|
280
|
-
bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
|
281
|
-
bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
|
282
|
-
self.feature_ids ||= []
|
283
|
-
self.feature_ids = self.feature_ids + dataset.feature_ids
|
284
|
-
@data_entries ||= Array.new(compound_ids.size){[]}
|
285
|
-
@data_entries.each_with_index do |row,i|
|
286
|
-
@data_entries[i] = row + dataset.fingerprint(compounds[i])
|
287
|
-
end
|
288
|
-
self
|
289
|
-
|
290
|
-
end
|
291
|
-
|
292
|
-
def fingerprint(compound)
|
293
|
-
i = compound_ids.index(compound.id)
|
294
|
-
i.nil? ? nil : data_entries[i]
|
295
|
-
end
|
296
|
-
=end
|
297
|
-
|
298
276
|
# Fill unset data entries
|
299
277
|
# @param any value
|
300
278
|
def fill_nil_with n
|
301
279
|
(0 .. compound_ids.size-1).each do |i|
|
302
|
-
|
280
|
+
data_entries[i] ||= []
|
303
281
|
(0 .. feature_ids.size-1).each do |j|
|
304
|
-
|
282
|
+
data_entries[i][j] ||= n
|
305
283
|
end
|
306
284
|
end
|
307
285
|
end
|
286
|
+
|
308
287
|
end
|
309
288
|
|
310
289
|
# Dataset for lazar predictions
|
@@ -321,6 +300,17 @@ module OpenTox
|
|
321
300
|
# Dataset for descriptors (physchem)
|
322
301
|
class DescriptorDataset < Dataset
|
323
302
|
field :feature_calculation_algorithm, type: String
|
303
|
+
|
304
|
+
end
|
305
|
+
|
306
|
+
class ScaledDataset < DescriptorDataset
|
307
|
+
|
308
|
+
field :centers, type: Array, default: []
|
309
|
+
field :scales, type: Array, default: []
|
310
|
+
|
311
|
+
def original_value value, i
|
312
|
+
value * scales[i] + centers[i]
|
313
|
+
end
|
324
314
|
end
|
325
315
|
|
326
316
|
# Dataset for fminer descriptors
|
data/lib/error.rb
CHANGED
@@ -58,7 +58,7 @@ module OpenTox
|
|
58
58
|
OpenTox.const_set error[:class],c
|
59
59
|
|
60
60
|
# define global methods for raising errors, eg. bad_request_error
|
61
|
-
Object.send(:define_method, error[:method]) do |message
|
61
|
+
Object.send(:define_method, error[:method]) do |message|
|
62
62
|
raise c.new(message)
|
63
63
|
end
|
64
64
|
end
|
data/lib/experiment.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
class Experiment
|
4
|
+
field :dataset_ids, type: Array
|
5
|
+
field :model_settings, type: Array, default: []
|
6
|
+
field :results, type: Hash, default: {}
|
7
|
+
|
8
|
+
def run
|
9
|
+
dataset_ids.each do |dataset_id|
|
10
|
+
dataset = Dataset.find(dataset_id)
|
11
|
+
results[dataset_id.to_s] = []
|
12
|
+
model_settings.each do |setting|
|
13
|
+
setting = setting.dup
|
14
|
+
model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
|
15
|
+
model = Object.const_get(model_algorithm).create dataset, setting
|
16
|
+
$logger.debug model
|
17
|
+
model.save
|
18
|
+
repeated_crossvalidation = RepeatedCrossValidation.create model
|
19
|
+
results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
|
20
|
+
end
|
21
|
+
end
|
22
|
+
save
|
23
|
+
end
|
24
|
+
|
25
|
+
def report
|
26
|
+
# statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
|
27
|
+
report = {}
|
28
|
+
report[:name] = name
|
29
|
+
report[:experiment_id] = self.id.to_s
|
30
|
+
report[:results] = {}
|
31
|
+
parameters = []
|
32
|
+
dataset_ids.each do |dataset_id|
|
33
|
+
dataset_name = Dataset.find(dataset_id).name
|
34
|
+
report[:results][dataset_name] = {}
|
35
|
+
report[:results][dataset_name][:anova] = {}
|
36
|
+
report[:results][dataset_name][:data] = []
|
37
|
+
# TODO results[dataset_id.to_s] does not exist
|
38
|
+
results[dataset_id.to_s].each do |result|
|
39
|
+
model = Model::Lazar.find(result[:model_id])
|
40
|
+
repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
|
41
|
+
crossvalidations = repeated_cv.crossvalidations
|
42
|
+
if crossvalidations.first.is_a? ClassificationCrossValidation
|
43
|
+
parameters = [:accuracy,:true_rate,:predictivity]
|
44
|
+
elsif crossvalidations.first.is_a? RegressionCrossValidation
|
45
|
+
parameters = [:rmse,:mae,:r_squared]
|
46
|
+
end
|
47
|
+
summary = {}
|
48
|
+
[:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
|
49
|
+
summary[key] = model[key]
|
50
|
+
end
|
51
|
+
summary[:nr_instances] = crossvalidations.first.nr_instances
|
52
|
+
summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
|
53
|
+
summary[:time] = crossvalidations.collect{|cv| cv.time}
|
54
|
+
parameters.each do |param|
|
55
|
+
summary[param] = crossvalidations.collect{|cv| cv.send(param)}
|
56
|
+
end
|
57
|
+
report[:results][dataset_name][:data] << summary
|
58
|
+
end
|
59
|
+
end
|
60
|
+
report[:results].each do |dataset,results|
|
61
|
+
([:time,:nr_unpredicted]+parameters).each do |param|
|
62
|
+
experiments = []
|
63
|
+
outcome = []
|
64
|
+
results[:data].each_with_index do |result,i|
|
65
|
+
result[param].each do |p|
|
66
|
+
experiments << i
|
67
|
+
p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
|
68
|
+
outcome << p
|
69
|
+
end
|
70
|
+
end
|
71
|
+
begin
|
72
|
+
R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
|
73
|
+
R.eval "experiment_nr = factor(experiment_nr)"
|
74
|
+
R.assign "outcome", outcome
|
75
|
+
R.eval "data = data.frame(experiment_nr,outcome)"
|
76
|
+
# one-way ANOVA
|
77
|
+
R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
|
78
|
+
# http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
|
79
|
+
p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
|
80
|
+
# aequivalent
|
81
|
+
# sum = R.eval("summary(fit)")
|
82
|
+
#p_value = sum.to_ruby.first.last.first
|
83
|
+
rescue
|
84
|
+
p_value = nil
|
85
|
+
end
|
86
|
+
report[:results][dataset][:anova][param] = p_value
|
87
|
+
=begin
|
88
|
+
=end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
report
|
92
|
+
end
|
93
|
+
|
94
|
+
def summary
|
95
|
+
report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
data/lib/feature.rb
CHANGED
@@ -2,15 +2,14 @@ module OpenTox
|
|
2
2
|
|
3
3
|
# Basic feature class
|
4
4
|
class Feature
|
5
|
-
field :name, as: :title, type: String
|
6
5
|
field :nominal, type: Boolean
|
7
6
|
field :numeric, type: Boolean
|
8
7
|
field :measured, type: Boolean
|
8
|
+
field :calculated, type: Boolean
|
9
9
|
end
|
10
10
|
|
11
11
|
# Feature for categorical variables
|
12
12
|
class NominalFeature < Feature
|
13
|
-
# TODO check if accept_values are still needed
|
14
13
|
field :accept_values, type: Array
|
15
14
|
def initialize params
|
16
15
|
super params
|
@@ -29,69 +28,18 @@ module OpenTox
|
|
29
28
|
# Feature for SMARTS fragments
|
30
29
|
class Smarts < NominalFeature
|
31
30
|
field :smarts, type: String
|
31
|
+
index "smarts" => 1
|
32
32
|
def self.from_smarts smarts
|
33
33
|
self.find_or_create_by :smarts => smarts
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
# Feature for supervised fragments from Fminer algorithm
|
38
|
-
class FminerSmarts < Smarts
|
39
|
-
field :p_value, type: Float
|
40
|
-
# TODO check if effect is used
|
41
|
-
field :effect, type: String
|
42
|
-
field :dataset_id
|
43
|
-
end
|
44
|
-
|
45
|
-
# Feature for database fingerprints
|
46
|
-
# needs count for efficient retrieval (see compound.rb)
|
47
|
-
class FingerprintSmarts < Smarts
|
48
|
-
field :count, type: Integer
|
49
|
-
def self.fingerprint
|
50
|
-
@@fp4 ||= OpenTox::FingerprintSmarts.all
|
51
|
-
unless @@fp4.size == 306
|
52
|
-
@@fp4 = []
|
53
|
-
# OpenBabel FP4 fingerprints
|
54
|
-
# OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
|
55
|
-
# TODO investigate other types of fingerprints (MACCS)
|
56
|
-
# OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
|
57
|
-
# http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
|
58
|
-
# OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
|
59
|
-
# Morgan ECFP, FCFP
|
60
|
-
# http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
|
61
|
-
# http://www.rdkit.org/docs/GettingStartedInPython.html
|
62
|
-
# Chemfp
|
63
|
-
# https://chemfp.readthedocs.org/en/latest/using-tools.html
|
64
|
-
# CACTVS/PubChem
|
65
|
-
|
66
|
-
File.open(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")).each do |l|
|
67
|
-
l.strip!
|
68
|
-
unless l.empty? or l.match /^#/
|
69
|
-
name,smarts = l.split(': ')
|
70
|
-
@@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
@@fp4
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
# Feature for physico-chemical descriptors
|
79
|
-
class PhysChemDescriptor < NumericFeature
|
80
|
-
field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
|
81
|
-
field :parameters, type: Hash
|
82
|
-
field :creator, type: String
|
83
|
-
end
|
84
|
-
|
85
37
|
# Feature for categorical bioassay results
|
86
38
|
class NominalBioAssay < NominalFeature
|
87
|
-
# TODO: needed? move to dataset?
|
88
|
-
field :description, type: String
|
89
39
|
end
|
90
40
|
|
91
41
|
# Feature for quantitative bioassay results
|
92
42
|
class NumericBioAssay < NumericFeature
|
93
|
-
# TODO: needed? move to dataset?
|
94
|
-
field :description, type: String
|
95
43
|
end
|
96
44
|
|
97
45
|
end
|
data/lib/lazar.rb
CHANGED
@@ -8,43 +8,58 @@ require 'mongoid'
|
|
8
8
|
require 'rserve'
|
9
9
|
require "nokogiri"
|
10
10
|
require "base64"
|
11
|
+
require 'openbabel'
|
11
12
|
|
13
|
+
# Environment setup
|
14
|
+
ENV["LAZAR_ENV"] ||= "production"
|
15
|
+
raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', please set it to 'production' or 'development'." unless ENV["LAZAR_ENV"].match(/production|development/)
|
12
16
|
|
13
|
-
|
14
|
-
#
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
17
|
+
ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"]
|
18
|
+
ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
|
19
|
+
Mongoid.load_configuration({
|
20
|
+
:clients => {
|
21
|
+
:default => {
|
22
|
+
:database => ENV["LAZAR_ENV"],
|
23
|
+
:hosts => ["localhost:27017"],
|
24
|
+
}
|
25
|
+
}
|
26
|
+
})
|
27
|
+
Mongoid.raise_not_found_error = false # return nil if no document is found
|
28
|
+
$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}")
|
21
29
|
$gridfs = $mongo.database.fs
|
22
30
|
|
23
|
-
# R setup
|
24
|
-
R = Rserve::Connection.new
|
25
|
-
|
26
31
|
# Logger setup
|
32
|
+
STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files
|
27
33
|
$logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
require_relative '../last-utils/lu.rb'
|
37
|
-
require_relative '../openbabel/lib/openbabel'
|
34
|
+
case ENV["LAZAR_ENV"]
|
35
|
+
when "production"
|
36
|
+
$logger.level = Logger::WARN
|
37
|
+
Mongo::Logger.level = Logger::WARN
|
38
|
+
when "development"
|
39
|
+
$logger.level = Logger::DEBUG
|
40
|
+
Mongo::Logger.level = Logger::WARN
|
41
|
+
end
|
38
42
|
|
39
|
-
#
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
43
|
+
# R setup
|
44
|
+
rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
|
45
|
+
# should work on POSIX including os x
|
46
|
+
# http://stackoverflow.com/questions/19619582/number-of-processors-cores-in-command-line
|
47
|
+
NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
|
48
|
+
R = Rserve::Connection.new
|
49
|
+
R.eval "
|
50
|
+
suppressPackageStartupMessages({
|
51
|
+
library(ggplot2,lib=\"#{rlib}\")
|
52
|
+
library(grid,lib=\"#{rlib}\")
|
53
|
+
library(gridExtra,lib=\"#{rlib}\")
|
54
|
+
library(pls,lib=\"#{rlib}\")
|
55
|
+
library(caret,lib=\"#{rlib}\")
|
56
|
+
library(doMC,lib=\"#{rlib}\")
|
57
|
+
registerDoMC(#{NR_CORES})
|
58
|
+
})
|
59
|
+
"
|
45
60
|
|
46
61
|
# OpenTox classes and includes
|
47
|
-
CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules
|
62
|
+
CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
|
48
63
|
|
49
64
|
[ # be aware of the require sequence as it affects class/method overwrites
|
50
65
|
"overwrite.rb",
|
@@ -52,18 +67,16 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor
|
|
52
67
|
"error.rb",
|
53
68
|
"opentox.rb",
|
54
69
|
"feature.rb",
|
70
|
+
"physchem.rb",
|
55
71
|
"compound.rb",
|
56
72
|
"dataset.rb",
|
57
|
-
"descriptor.rb",
|
58
73
|
"algorithm.rb",
|
59
|
-
"descriptor.rb",
|
60
|
-
"bbrc.rb",
|
61
74
|
"model.rb",
|
62
|
-
"similarity.rb",
|
63
|
-
"neighbor.rb",
|
64
75
|
"classification.rb",
|
65
76
|
"regression.rb",
|
66
77
|
"validation.rb",
|
67
78
|
"crossvalidation.rb",
|
79
|
+
"leave-one-out-validation.rb",
|
80
|
+
"experiment.rb",
|
68
81
|
].each{ |f| require_relative f }
|
69
|
-
|
82
|
+
OpenTox::PhysChem.descriptors # load descriptor features
|