lazar 0.0.7 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
data/lib/dataset.rb CHANGED
@@ -5,24 +5,11 @@ module OpenTox
5
5
 
6
6
  class Dataset
7
7
 
8
- attr_writer :data_entries
9
-
10
8
  # associations like has_many, belongs_to deteriorate performance
11
9
  field :feature_ids, type: Array, default: []
12
10
  field :compound_ids, type: Array, default: []
13
- field :data_entries_id, type: BSON::ObjectId, default: []
11
+ field :data_entries, type: Array, default: []
14
12
  field :source, type: String
15
- field :warnings, type: Array, default: []
16
-
17
- # Save all data including data_entries
18
- # Should be used instead of save
19
- def save_all
20
- dump = Marshal.dump(@data_entries)
21
- file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
22
- data_entries_id = $gridfs.insert_one(file)
23
- update(:data_entries_id => data_entries_id)
24
- save
25
- end
26
13
 
27
14
  # Readers
28
15
 
@@ -38,24 +25,6 @@ module OpenTox
38
25
  @features
39
26
  end
40
27
 
41
- # Get all data_entries
42
- def data_entries
43
- unless @data_entries
44
- t = Time.now
45
- data_entry_file = $gridfs.find_one(_id: data_entries_id)
46
- if data_entry_file.nil?
47
- @data_entries = []
48
- else
49
- @data_entries = Marshal.load(data_entry_file.data)
50
- bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
51
- bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
52
- bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
53
- $logger.debug "Retrieving data: #{Time.now-t}"
54
- end
55
- end
56
- @data_entries
57
- end
58
-
59
28
  # Find data entry values for a given compound and feature
60
29
  # @param compound [OpenTox::Compound] OpenTox Compound object
61
30
  # @param feature [OpenTox::Feature] OpenTox Feature object
@@ -84,7 +53,13 @@ module OpenTox
84
53
  # @param [Integer] number of folds
85
54
  # @return [Array] Array with folds [training_dataset,test_dataset]
86
55
  def folds n
87
- len = self.compound_ids.size
56
+ unique_compound_data = {}
57
+ compound_ids.each_with_index do |cid,i|
58
+ unique_compound_data[cid] ||= []
59
+ unique_compound_data[cid] << data_entries[i]
60
+ end
61
+ unique_compound_ids = unique_compound_data.keys
62
+ len = unique_compound_ids.size
88
63
  indices = (0..len-1).to_a.shuffle
89
64
  mid = (len/n)
90
65
  chunks = []
@@ -93,22 +68,44 @@ module OpenTox
93
68
  last = start+mid
94
69
  last = last-1 unless len%n >= i
95
70
  test_idxs = indices[start..last] || []
96
- test_cids = test_idxs.collect{|i| self.compound_ids[i]}
97
- test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
98
- test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
71
+ test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
99
72
  training_idxs = indices-test_idxs
100
- training_cids = training_idxs.collect{|i| self.compound_ids[i]}
101
- training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
102
- training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
103
- test_dataset.save_all
104
- training_dataset.save_all
105
- chunks << [training_dataset,test_dataset]
73
+ training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
74
+ chunk = [training_cids,test_cids].collect do |unique_cids|
75
+ cids = []
76
+ data_entries = []
77
+ unique_cids.each do |cid|
78
+ unique_compound_data[cid].each do |de|
79
+ cids << cid
80
+ data_entries << de
81
+ end
82
+ end
83
+ dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
84
+ dataset.compounds.each do |compound|
85
+ compound.dataset_ids << dataset.id
86
+ compound.save
87
+ end
88
+ dataset.save
89
+ dataset
90
+ end
106
91
  start = last+1
92
+ chunks << chunk
107
93
  end
108
94
  chunks
109
95
  end
110
96
 
111
97
  # Diagnostics
98
+
99
+ def duplicates feature=self.features.first
100
+ col = feature_ids.index feature.id
101
+ dups = {}
102
+ compound_ids.each_with_index do |cid,i|
103
+ rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
104
+ values = rows.collect{|row| data_entries[row][col]}
105
+ dups[cid] = values if values.size > 1
106
+ end
107
+ dups
108
+ end
112
109
 
113
110
  def correlation_plot training_dataset
114
111
  # TODO: create/store svg
@@ -120,23 +117,22 @@ module OpenTox
120
117
  def density_plot
121
118
  # TODO: create/store svg
122
119
  R.assign "acts", data_entries.collect{|r| r.first }#.compact
123
- R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')"
120
+ R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
124
121
  end
125
122
 
126
123
  # Serialisation
127
124
 
128
- # converts dataset to csv format including compound smiles as first column, other column headers are feature titles
125
+ # converts dataset to csv format including compound smiles as first column, other column headers are feature names
129
126
  # @return [String]
130
127
  def to_csv(inchi=false)
131
128
  CSV.generate() do |csv| #{:force_quotes=>true}
132
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.title}
129
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
133
130
  compounds.each_with_index do |c,i|
134
131
  csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
135
132
  end
136
133
  end
137
134
  end
138
135
 
139
-
140
136
  # Parsers
141
137
 
142
138
  # Create a dataset from file (csv,sdf,...)
@@ -145,14 +141,21 @@ module OpenTox
145
141
  # TODO
146
142
  #def self.from_sdf_file
147
143
  #end
148
-
144
+
149
145
  # Create a dataset from CSV file
150
146
  # TODO: document structure
151
- def self.from_csv_file file, source=nil, bioassay=true
147
+ def self.from_csv_file file, source=nil, bioassay=true#, layout={}
152
148
  source ||= file
153
- table = CSV.read file, :skip_blanks => true
154
- dataset = self.new(:source => source, :name => File.basename(file))
155
- dataset.parse_table table, bioassay
149
+ name = File.basename(file,".*")
150
+ dataset = self.find_by(:source => source, :name => name)
151
+ if dataset
152
+ $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
153
+ else
154
+ $logger.debug "Parsing #{file}."
155
+ table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
156
+ dataset = self.new(:source => source, :name => name)
157
+ dataset.parse_table table, bioassay#, layout
158
+ end
156
159
  dataset
157
160
  end
158
161
 
@@ -197,7 +200,7 @@ module OpenTox
197
200
  feature = NominalFeature.find_or_create_by(metadata)
198
201
  end
199
202
  end
200
- feature_ids << feature.id
203
+ feature_ids << feature.id if feature
201
204
  end
202
205
 
203
206
  $logger.debug "Feature values: #{Time.now-time}"
@@ -208,11 +211,11 @@ module OpenTox
208
211
  value_time = 0
209
212
 
210
213
  # compounds and values
211
- @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)}
214
+ self.data_entries = []
212
215
 
213
216
  table.each_with_index do |vals,i|
214
217
  ct = Time.now
215
- identifier = vals.shift
218
+ identifier = vals.shift.strip
216
219
  warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
217
220
  begin
218
221
  case compound_format
@@ -229,7 +232,7 @@ module OpenTox
229
232
  warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
230
233
  next
231
234
  end
232
- # TODO insert empty compounds to keep positions?
235
+ compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
233
236
  compound_time += Time.now-ct
234
237
 
235
238
  r += 1
@@ -239,72 +242,48 @@ module OpenTox
239
242
  end
240
243
 
241
244
  compound_ids << compound.id
242
- @data_entries << Array.new(table.first.size-1)
245
+ table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
243
246
 
244
247
  vals.each_with_index do |v,j|
245
248
  if v.blank?
246
249
  warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
247
250
  next
248
251
  elsif numeric[j]
249
- @data_entries.last[j] = v.to_f
252
+ v = v.to_f
250
253
  else
251
- @data_entries.last[j] = v.strip
254
+ v = v.strip
252
255
  end
256
+ self.data_entries.last[j] = v
257
+ #i = compound.feature_ids.index feature_ids[j]
258
+ compound.features[feature_ids[j].to_s] ||= []
259
+ compound.features[feature_ids[j].to_s] << v
260
+ compound.save
253
261
  end
254
262
  end
255
263
  compounds.duplicates.each do |compound|
256
264
  positions = []
257
- compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
258
- warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
265
+ compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
266
+ warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
259
267
  end
260
268
 
261
269
  $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
262
270
  time = Time.now
263
- save_all
271
+ save
264
272
  $logger.debug "Saving: #{Time.now-time}"
265
273
 
266
274
  end
267
275
 
268
- =begin
269
- # TODO remove
270
-
271
- # Create a dataset with compounds and features
272
- def self.create compounds, features, warnings=[], source=nil
273
- dataset = Dataset.new(:warnings => warnings)
274
- dataset.compounds = compounds
275
- dataset.features = features
276
- dataset
277
- end
278
- # merge dataset (i.e. append features)
279
- def +(dataset)
280
- bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
281
- bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
282
- self.feature_ids ||= []
283
- self.feature_ids = self.feature_ids + dataset.feature_ids
284
- @data_entries ||= Array.new(compound_ids.size){[]}
285
- @data_entries.each_with_index do |row,i|
286
- @data_entries[i] = row + dataset.fingerprint(compounds[i])
287
- end
288
- self
289
-
290
- end
291
-
292
- def fingerprint(compound)
293
- i = compound_ids.index(compound.id)
294
- i.nil? ? nil : data_entries[i]
295
- end
296
- =end
297
-
298
276
  # Fill unset data entries
299
277
  # @param any value
300
278
  def fill_nil_with n
301
279
  (0 .. compound_ids.size-1).each do |i|
302
- @data_entries[i] ||= []
280
+ data_entries[i] ||= []
303
281
  (0 .. feature_ids.size-1).each do |j|
304
- @data_entries[i][j] ||= n
282
+ data_entries[i][j] ||= n
305
283
  end
306
284
  end
307
285
  end
286
+
308
287
  end
309
288
 
310
289
  # Dataset for lazar predictions
@@ -321,6 +300,17 @@ module OpenTox
321
300
  # Dataset for descriptors (physchem)
322
301
  class DescriptorDataset < Dataset
323
302
  field :feature_calculation_algorithm, type: String
303
+
304
+ end
305
+
306
+ class ScaledDataset < DescriptorDataset
307
+
308
+ field :centers, type: Array, default: []
309
+ field :scales, type: Array, default: []
310
+
311
+ def original_value value, i
312
+ value * scales[i] + centers[i]
313
+ end
324
314
  end
325
315
 
326
316
  # Dataset for fminer descriptors
data/lib/error.rb CHANGED
@@ -58,7 +58,7 @@ module OpenTox
58
58
  OpenTox.const_set error[:class],c
59
59
 
60
60
  # define global methods for raising errors, eg. bad_request_error
61
- Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil|
61
+ Object.send(:define_method, error[:method]) do |message|
62
62
  raise c.new(message)
63
63
  end
64
64
  end
data/lib/experiment.rb ADDED
@@ -0,0 +1,99 @@
1
+ module OpenTox
2
+
3
+ class Experiment
4
+ field :dataset_ids, type: Array
5
+ field :model_settings, type: Array, default: []
6
+ field :results, type: Hash, default: {}
7
+
8
+ def run
9
+ dataset_ids.each do |dataset_id|
10
+ dataset = Dataset.find(dataset_id)
11
+ results[dataset_id.to_s] = []
12
+ model_settings.each do |setting|
13
+ setting = setting.dup
14
+ model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
15
+ model = Object.const_get(model_algorithm).create dataset, setting
16
+ $logger.debug model
17
+ model.save
18
+ repeated_crossvalidation = RepeatedCrossValidation.create model
19
+ results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
20
+ end
21
+ end
22
+ save
23
+ end
24
+
25
+ def report
26
+ # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
27
+ report = {}
28
+ report[:name] = name
29
+ report[:experiment_id] = self.id.to_s
30
+ report[:results] = {}
31
+ parameters = []
32
+ dataset_ids.each do |dataset_id|
33
+ dataset_name = Dataset.find(dataset_id).name
34
+ report[:results][dataset_name] = {}
35
+ report[:results][dataset_name][:anova] = {}
36
+ report[:results][dataset_name][:data] = []
37
+ # TODO results[dataset_id.to_s] does not exist
38
+ results[dataset_id.to_s].each do |result|
39
+ model = Model::Lazar.find(result[:model_id])
40
+ repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
41
+ crossvalidations = repeated_cv.crossvalidations
42
+ if crossvalidations.first.is_a? ClassificationCrossValidation
43
+ parameters = [:accuracy,:true_rate,:predictivity]
44
+ elsif crossvalidations.first.is_a? RegressionCrossValidation
45
+ parameters = [:rmse,:mae,:r_squared]
46
+ end
47
+ summary = {}
48
+ [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
49
+ summary[key] = model[key]
50
+ end
51
+ summary[:nr_instances] = crossvalidations.first.nr_instances
52
+ summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
53
+ summary[:time] = crossvalidations.collect{|cv| cv.time}
54
+ parameters.each do |param|
55
+ summary[param] = crossvalidations.collect{|cv| cv.send(param)}
56
+ end
57
+ report[:results][dataset_name][:data] << summary
58
+ end
59
+ end
60
+ report[:results].each do |dataset,results|
61
+ ([:time,:nr_unpredicted]+parameters).each do |param|
62
+ experiments = []
63
+ outcome = []
64
+ results[:data].each_with_index do |result,i|
65
+ result[param].each do |p|
66
+ experiments << i
67
+ p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
68
+ outcome << p
69
+ end
70
+ end
71
+ begin
72
+ R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
73
+ R.eval "experiment_nr = factor(experiment_nr)"
74
+ R.assign "outcome", outcome
75
+ R.eval "data = data.frame(experiment_nr,outcome)"
76
+ # one-way ANOVA
77
+ R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
78
+ # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
79
+ p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
80
+ # aequivalent
81
+ # sum = R.eval("summary(fit)")
82
+ #p_value = sum.to_ruby.first.last.first
83
+ rescue
84
+ p_value = nil
85
+ end
86
+ report[:results][dataset][:anova][param] = p_value
87
+ =begin
88
+ =end
89
+ end
90
+ end
91
+ report
92
+ end
93
+
94
+ def summary
95
+ report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
96
+ end
97
+ end
98
+
99
+ end
data/lib/feature.rb CHANGED
@@ -2,15 +2,14 @@ module OpenTox
2
2
 
3
3
  # Basic feature class
4
4
  class Feature
5
- field :name, as: :title, type: String
6
5
  field :nominal, type: Boolean
7
6
  field :numeric, type: Boolean
8
7
  field :measured, type: Boolean
8
+ field :calculated, type: Boolean
9
9
  end
10
10
 
11
11
  # Feature for categorical variables
12
12
  class NominalFeature < Feature
13
- # TODO check if accept_values are still needed
14
13
  field :accept_values, type: Array
15
14
  def initialize params
16
15
  super params
@@ -29,69 +28,18 @@ module OpenTox
29
28
  # Feature for SMARTS fragments
30
29
  class Smarts < NominalFeature
31
30
  field :smarts, type: String
31
+ index "smarts" => 1
32
32
  def self.from_smarts smarts
33
33
  self.find_or_create_by :smarts => smarts
34
34
  end
35
35
  end
36
36
 
37
- # Feature for supervised fragments from Fminer algorithm
38
- class FminerSmarts < Smarts
39
- field :p_value, type: Float
40
- # TODO check if effect is used
41
- field :effect, type: String
42
- field :dataset_id
43
- end
44
-
45
- # Feature for database fingerprints
46
- # needs count for efficient retrieval (see compound.rb)
47
- class FingerprintSmarts < Smarts
48
- field :count, type: Integer
49
- def self.fingerprint
50
- @@fp4 ||= OpenTox::FingerprintSmarts.all
51
- unless @@fp4.size == 306
52
- @@fp4 = []
53
- # OpenBabel FP4 fingerprints
54
- # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
55
- # TODO investigate other types of fingerprints (MACCS)
56
- # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
57
- # http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
58
- # OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
59
- # Morgan ECFP, FCFP
60
- # http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
61
- # http://www.rdkit.org/docs/GettingStartedInPython.html
62
- # Chemfp
63
- # https://chemfp.readthedocs.org/en/latest/using-tools.html
64
- # CACTVS/PubChem
65
-
66
- File.open(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")).each do |l|
67
- l.strip!
68
- unless l.empty? or l.match /^#/
69
- name,smarts = l.split(': ')
70
- @@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
71
- end
72
- end
73
- end
74
- @@fp4
75
- end
76
- end
77
-
78
- # Feature for physico-chemical descriptors
79
- class PhysChemDescriptor < NumericFeature
80
- field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
81
- field :parameters, type: Hash
82
- field :creator, type: String
83
- end
84
-
85
37
  # Feature for categorical bioassay results
86
38
  class NominalBioAssay < NominalFeature
87
- # TODO: needed? move to dataset?
88
- field :description, type: String
89
39
  end
90
40
 
91
41
  # Feature for quantitative bioassay results
92
42
  class NumericBioAssay < NumericFeature
93
- # TODO: needed? move to dataset?
94
- field :description, type: String
95
43
  end
96
44
 
97
45
  end
data/lib/lazar.rb CHANGED
@@ -8,43 +8,58 @@ require 'mongoid'
8
8
  require 'rserve'
9
9
  require "nokogiri"
10
10
  require "base64"
11
+ require 'openbabel'
11
12
 
13
+ # Environment setup
14
+ ENV["LAZAR_ENV"] ||= "production"
15
+ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', please set it to 'production' or 'development'." unless ENV["LAZAR_ENV"].match(/production|development/)
12
16
 
13
- # Mongo setup
14
- # TODO retrieve correct environment from Rack/Sinatra
15
- ENV["MONGOID_ENV"] ||= "development"
16
- # TODO remove config files, change default via ENV or directly in Mongoid class
17
- Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}")
18
- # TODO get Mongo::Client from Mongoid
19
- $mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
20
- # TODO same for GridFS
17
+ ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"]
18
+ ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
19
+ Mongoid.load_configuration({
20
+ :clients => {
21
+ :default => {
22
+ :database => ENV["LAZAR_ENV"],
23
+ :hosts => ["localhost:27017"],
24
+ }
25
+ }
26
+ })
27
+ Mongoid.raise_not_found_error = false # return nil if no document is found
28
+ $mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}")
21
29
  $gridfs = $mongo.database.fs
22
30
 
23
- # R setup
24
- R = Rserve::Connection.new
25
-
26
31
  # Logger setup
32
+ STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files
27
33
  $logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
28
- $logger.level = Logger::DEBUG
29
- Mongo::Logger.logger = $logger
30
- Mongo::Logger.level = Logger::WARN
31
- #Mongoid.logger = $logger
32
-
33
- # Require sub-Repositories
34
- require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
35
- require_relative '../libfminer/liblast/last' #
36
- require_relative '../last-utils/lu.rb'
37
- require_relative '../openbabel/lib/openbabel'
34
+ case ENV["LAZAR_ENV"]
35
+ when "production"
36
+ $logger.level = Logger::WARN
37
+ Mongo::Logger.level = Logger::WARN
38
+ when "development"
39
+ $logger.level = Logger::DEBUG
40
+ Mongo::Logger.level = Logger::WARN
41
+ end
38
42
 
39
- # Fminer environment variables
40
- ENV['FMINER_SMARTS'] = 'true'
41
- ENV['FMINER_NO_AROMATIC'] = 'true'
42
- ENV['FMINER_PVALUES'] = 'true'
43
- ENV['FMINER_SILENT'] = 'true'
44
- ENV['FMINER_NR_HITS'] = 'true'
43
+ # R setup
44
+ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
45
+ # should work on POSIX including os x
46
+ # http://stackoverflow.com/questions/19619582/number-of-processors-cores-in-command-line
47
+ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
48
+ R = Rserve::Connection.new
49
+ R.eval "
50
+ suppressPackageStartupMessages({
51
+ library(ggplot2,lib=\"#{rlib}\")
52
+ library(grid,lib=\"#{rlib}\")
53
+ library(gridExtra,lib=\"#{rlib}\")
54
+ library(pls,lib=\"#{rlib}\")
55
+ library(caret,lib=\"#{rlib}\")
56
+ library(doMC,lib=\"#{rlib}\")
57
+ registerDoMC(#{NR_CORES})
58
+ })
59
+ "
45
60
 
46
61
  # OpenTox classes and includes
47
- CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules
62
+ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
48
63
 
49
64
  [ # be aware of the require sequence as it affects class/method overwrites
50
65
  "overwrite.rb",
@@ -52,18 +67,16 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor
52
67
  "error.rb",
53
68
  "opentox.rb",
54
69
  "feature.rb",
70
+ "physchem.rb",
55
71
  "compound.rb",
56
72
  "dataset.rb",
57
- "descriptor.rb",
58
73
  "algorithm.rb",
59
- "descriptor.rb",
60
- "bbrc.rb",
61
74
  "model.rb",
62
- "similarity.rb",
63
- "neighbor.rb",
64
75
  "classification.rb",
65
76
  "regression.rb",
66
77
  "validation.rb",
67
78
  "crossvalidation.rb",
79
+ "leave-one-out-validation.rb",
80
+ "experiment.rb",
68
81
  ].each{ |f| require_relative f }
69
-
82
+ OpenTox::PhysChem.descriptors # load descriptor features