lazar 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
data/lib/dataset.rb CHANGED
@@ -5,24 +5,11 @@ module OpenTox
5
5
 
6
6
  class Dataset
7
7
 
8
- attr_writer :data_entries
9
-
10
8
  # associations like has_many, belongs_to deteriorate performance
11
9
  field :feature_ids, type: Array, default: []
12
10
  field :compound_ids, type: Array, default: []
13
- field :data_entries_id, type: BSON::ObjectId, default: []
11
+ field :data_entries, type: Array, default: []
14
12
  field :source, type: String
15
- field :warnings, type: Array, default: []
16
-
17
- # Save all data including data_entries
18
- # Should be used instead of save
19
- def save_all
20
- dump = Marshal.dump(@data_entries)
21
- file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
22
- data_entries_id = $gridfs.insert_one(file)
23
- update(:data_entries_id => data_entries_id)
24
- save
25
- end
26
13
 
27
14
  # Readers
28
15
 
@@ -38,24 +25,6 @@ module OpenTox
38
25
  @features
39
26
  end
40
27
 
41
- # Get all data_entries
42
- def data_entries
43
- unless @data_entries
44
- t = Time.now
45
- data_entry_file = $gridfs.find_one(_id: data_entries_id)
46
- if data_entry_file.nil?
47
- @data_entries = []
48
- else
49
- @data_entries = Marshal.load(data_entry_file.data)
50
- bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
51
- bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
52
- bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
53
- $logger.debug "Retrieving data: #{Time.now-t}"
54
- end
55
- end
56
- @data_entries
57
- end
58
-
59
28
  # Find data entry values for a given compound and feature
60
29
  # @param compound [OpenTox::Compound] OpenTox Compound object
61
30
  # @param feature [OpenTox::Feature] OpenTox Feature object
@@ -84,7 +53,13 @@ module OpenTox
84
53
  # @param [Integer] number of folds
85
54
  # @return [Array] Array with folds [training_dataset,test_dataset]
86
55
  def folds n
87
- len = self.compound_ids.size
56
+ unique_compound_data = {}
57
+ compound_ids.each_with_index do |cid,i|
58
+ unique_compound_data[cid] ||= []
59
+ unique_compound_data[cid] << data_entries[i]
60
+ end
61
+ unique_compound_ids = unique_compound_data.keys
62
+ len = unique_compound_ids.size
88
63
  indices = (0..len-1).to_a.shuffle
89
64
  mid = (len/n)
90
65
  chunks = []
@@ -93,22 +68,44 @@ module OpenTox
93
68
  last = start+mid
94
69
  last = last-1 unless len%n >= i
95
70
  test_idxs = indices[start..last] || []
96
- test_cids = test_idxs.collect{|i| self.compound_ids[i]}
97
- test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
98
- test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
71
+ test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
99
72
  training_idxs = indices-test_idxs
100
- training_cids = training_idxs.collect{|i| self.compound_ids[i]}
101
- training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
102
- training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
103
- test_dataset.save_all
104
- training_dataset.save_all
105
- chunks << [training_dataset,test_dataset]
73
+ training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
74
+ chunk = [training_cids,test_cids].collect do |unique_cids|
75
+ cids = []
76
+ data_entries = []
77
+ unique_cids.each do |cid|
78
+ unique_compound_data[cid].each do |de|
79
+ cids << cid
80
+ data_entries << de
81
+ end
82
+ end
83
+ dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
84
+ dataset.compounds.each do |compound|
85
+ compound.dataset_ids << dataset.id
86
+ compound.save
87
+ end
88
+ dataset.save
89
+ dataset
90
+ end
106
91
  start = last+1
92
+ chunks << chunk
107
93
  end
108
94
  chunks
109
95
  end
110
96
 
111
97
  # Diagnostics
98
+
99
+ def duplicates feature=self.features.first
100
+ col = feature_ids.index feature.id
101
+ dups = {}
102
+ compound_ids.each_with_index do |cid,i|
103
+ rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
104
+ values = rows.collect{|row| data_entries[row][col]}
105
+ dups[cid] = values if values.size > 1
106
+ end
107
+ dups
108
+ end
112
109
 
113
110
  def correlation_plot training_dataset
114
111
  # TODO: create/store svg
@@ -120,23 +117,22 @@ module OpenTox
120
117
  def density_plot
121
118
  # TODO: create/store svg
122
119
  R.assign "acts", data_entries.collect{|r| r.first }#.compact
123
- R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')"
120
+ R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
124
121
  end
125
122
 
126
123
  # Serialisation
127
124
 
128
- # converts dataset to csv format including compound smiles as first column, other column headers are feature titles
125
+ # converts dataset to csv format including compound smiles as first column, other column headers are feature names
129
126
  # @return [String]
130
127
  def to_csv(inchi=false)
131
128
  CSV.generate() do |csv| #{:force_quotes=>true}
132
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.title}
129
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
133
130
  compounds.each_with_index do |c,i|
134
131
  csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
135
132
  end
136
133
  end
137
134
  end
138
135
 
139
-
140
136
  # Parsers
141
137
 
142
138
  # Create a dataset from file (csv,sdf,...)
@@ -145,14 +141,21 @@ module OpenTox
145
141
  # TODO
146
142
  #def self.from_sdf_file
147
143
  #end
148
-
144
+
149
145
  # Create a dataset from CSV file
150
146
  # TODO: document structure
151
- def self.from_csv_file file, source=nil, bioassay=true
147
+ def self.from_csv_file file, source=nil, bioassay=true#, layout={}
152
148
  source ||= file
153
- table = CSV.read file, :skip_blanks => true
154
- dataset = self.new(:source => source, :name => File.basename(file))
155
- dataset.parse_table table, bioassay
149
+ name = File.basename(file,".*")
150
+ dataset = self.find_by(:source => source, :name => name)
151
+ if dataset
152
+ $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
153
+ else
154
+ $logger.debug "Parsing #{file}."
155
+ table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
156
+ dataset = self.new(:source => source, :name => name)
157
+ dataset.parse_table table, bioassay#, layout
158
+ end
156
159
  dataset
157
160
  end
158
161
 
@@ -197,7 +200,7 @@ module OpenTox
197
200
  feature = NominalFeature.find_or_create_by(metadata)
198
201
  end
199
202
  end
200
- feature_ids << feature.id
203
+ feature_ids << feature.id if feature
201
204
  end
202
205
 
203
206
  $logger.debug "Feature values: #{Time.now-time}"
@@ -208,11 +211,11 @@ module OpenTox
208
211
  value_time = 0
209
212
 
210
213
  # compounds and values
211
- @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)}
214
+ self.data_entries = []
212
215
 
213
216
  table.each_with_index do |vals,i|
214
217
  ct = Time.now
215
- identifier = vals.shift
218
+ identifier = vals.shift.strip
216
219
  warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
217
220
  begin
218
221
  case compound_format
@@ -229,7 +232,7 @@ module OpenTox
229
232
  warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
230
233
  next
231
234
  end
232
- # TODO insert empty compounds to keep positions?
235
+ compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
233
236
  compound_time += Time.now-ct
234
237
 
235
238
  r += 1
@@ -239,72 +242,48 @@ module OpenTox
239
242
  end
240
243
 
241
244
  compound_ids << compound.id
242
- @data_entries << Array.new(table.first.size-1)
245
+ table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
243
246
 
244
247
  vals.each_with_index do |v,j|
245
248
  if v.blank?
246
249
  warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
247
250
  next
248
251
  elsif numeric[j]
249
- @data_entries.last[j] = v.to_f
252
+ v = v.to_f
250
253
  else
251
- @data_entries.last[j] = v.strip
254
+ v = v.strip
252
255
  end
256
+ self.data_entries.last[j] = v
257
+ #i = compound.feature_ids.index feature_ids[j]
258
+ compound.features[feature_ids[j].to_s] ||= []
259
+ compound.features[feature_ids[j].to_s] << v
260
+ compound.save
253
261
  end
254
262
  end
255
263
  compounds.duplicates.each do |compound|
256
264
  positions = []
257
- compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
258
- warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
265
+ compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
266
+ warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
259
267
  end
260
268
 
261
269
  $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
262
270
  time = Time.now
263
- save_all
271
+ save
264
272
  $logger.debug "Saving: #{Time.now-time}"
265
273
 
266
274
  end
267
275
 
268
- =begin
269
- # TODO remove
270
-
271
- # Create a dataset with compounds and features
272
- def self.create compounds, features, warnings=[], source=nil
273
- dataset = Dataset.new(:warnings => warnings)
274
- dataset.compounds = compounds
275
- dataset.features = features
276
- dataset
277
- end
278
- # merge dataset (i.e. append features)
279
- def +(dataset)
280
- bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
281
- bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
282
- self.feature_ids ||= []
283
- self.feature_ids = self.feature_ids + dataset.feature_ids
284
- @data_entries ||= Array.new(compound_ids.size){[]}
285
- @data_entries.each_with_index do |row,i|
286
- @data_entries[i] = row + dataset.fingerprint(compounds[i])
287
- end
288
- self
289
-
290
- end
291
-
292
- def fingerprint(compound)
293
- i = compound_ids.index(compound.id)
294
- i.nil? ? nil : data_entries[i]
295
- end
296
- =end
297
-
298
276
  # Fill unset data entries
299
277
  # @param any value
300
278
  def fill_nil_with n
301
279
  (0 .. compound_ids.size-1).each do |i|
302
- @data_entries[i] ||= []
280
+ data_entries[i] ||= []
303
281
  (0 .. feature_ids.size-1).each do |j|
304
- @data_entries[i][j] ||= n
282
+ data_entries[i][j] ||= n
305
283
  end
306
284
  end
307
285
  end
286
+
308
287
  end
309
288
 
310
289
  # Dataset for lazar predictions
@@ -321,6 +300,17 @@ module OpenTox
321
300
  # Dataset for descriptors (physchem)
322
301
  class DescriptorDataset < Dataset
323
302
  field :feature_calculation_algorithm, type: String
303
+
304
+ end
305
+
306
+ class ScaledDataset < DescriptorDataset
307
+
308
+ field :centers, type: Array, default: []
309
+ field :scales, type: Array, default: []
310
+
311
+ def original_value value, i
312
+ value * scales[i] + centers[i]
313
+ end
324
314
  end
325
315
 
326
316
  # Dataset for fminer descriptors
data/lib/error.rb CHANGED
@@ -58,7 +58,7 @@ module OpenTox
58
58
  OpenTox.const_set error[:class],c
59
59
 
60
60
  # define global methods for raising errors, eg. bad_request_error
61
- Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil|
61
+ Object.send(:define_method, error[:method]) do |message|
62
62
  raise c.new(message)
63
63
  end
64
64
  end
data/lib/experiment.rb ADDED
@@ -0,0 +1,99 @@
1
+ module OpenTox
2
+
3
+ class Experiment
4
+ field :dataset_ids, type: Array
5
+ field :model_settings, type: Array, default: []
6
+ field :results, type: Hash, default: {}
7
+
8
+ def run
9
+ dataset_ids.each do |dataset_id|
10
+ dataset = Dataset.find(dataset_id)
11
+ results[dataset_id.to_s] = []
12
+ model_settings.each do |setting|
13
+ setting = setting.dup
14
+ model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
15
+ model = Object.const_get(model_algorithm).create dataset, setting
16
+ $logger.debug model
17
+ model.save
18
+ repeated_crossvalidation = RepeatedCrossValidation.create model
19
+ results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
20
+ end
21
+ end
22
+ save
23
+ end
24
+
25
+ def report
26
+ # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
27
+ report = {}
28
+ report[:name] = name
29
+ report[:experiment_id] = self.id.to_s
30
+ report[:results] = {}
31
+ parameters = []
32
+ dataset_ids.each do |dataset_id|
33
+ dataset_name = Dataset.find(dataset_id).name
34
+ report[:results][dataset_name] = {}
35
+ report[:results][dataset_name][:anova] = {}
36
+ report[:results][dataset_name][:data] = []
37
+ # TODO results[dataset_id.to_s] does not exist
38
+ results[dataset_id.to_s].each do |result|
39
+ model = Model::Lazar.find(result[:model_id])
40
+ repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
41
+ crossvalidations = repeated_cv.crossvalidations
42
+ if crossvalidations.first.is_a? ClassificationCrossValidation
43
+ parameters = [:accuracy,:true_rate,:predictivity]
44
+ elsif crossvalidations.first.is_a? RegressionCrossValidation
45
+ parameters = [:rmse,:mae,:r_squared]
46
+ end
47
+ summary = {}
48
+ [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
49
+ summary[key] = model[key]
50
+ end
51
+ summary[:nr_instances] = crossvalidations.first.nr_instances
52
+ summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
53
+ summary[:time] = crossvalidations.collect{|cv| cv.time}
54
+ parameters.each do |param|
55
+ summary[param] = crossvalidations.collect{|cv| cv.send(param)}
56
+ end
57
+ report[:results][dataset_name][:data] << summary
58
+ end
59
+ end
60
+ report[:results].each do |dataset,results|
61
+ ([:time,:nr_unpredicted]+parameters).each do |param|
62
+ experiments = []
63
+ outcome = []
64
+ results[:data].each_with_index do |result,i|
65
+ result[param].each do |p|
66
+ experiments << i
67
+ p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
68
+ outcome << p
69
+ end
70
+ end
71
+ begin
72
+ R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
73
+ R.eval "experiment_nr = factor(experiment_nr)"
74
+ R.assign "outcome", outcome
75
+ R.eval "data = data.frame(experiment_nr,outcome)"
76
+ # one-way ANOVA
77
+ R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
78
+ # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
79
+ p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
80
+ # aequivalent
81
+ # sum = R.eval("summary(fit)")
82
+ #p_value = sum.to_ruby.first.last.first
83
+ rescue
84
+ p_value = nil
85
+ end
86
+ report[:results][dataset][:anova][param] = p_value
87
+ =begin
88
+ =end
89
+ end
90
+ end
91
+ report
92
+ end
93
+
94
+ def summary
95
+ report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
96
+ end
97
+ end
98
+
99
+ end
data/lib/feature.rb CHANGED
@@ -2,15 +2,14 @@ module OpenTox
2
2
 
3
3
  # Basic feature class
4
4
  class Feature
5
- field :name, as: :title, type: String
6
5
  field :nominal, type: Boolean
7
6
  field :numeric, type: Boolean
8
7
  field :measured, type: Boolean
8
+ field :calculated, type: Boolean
9
9
  end
10
10
 
11
11
  # Feature for categorical variables
12
12
  class NominalFeature < Feature
13
- # TODO check if accept_values are still needed
14
13
  field :accept_values, type: Array
15
14
  def initialize params
16
15
  super params
@@ -29,69 +28,18 @@ module OpenTox
29
28
  # Feature for SMARTS fragments
30
29
  class Smarts < NominalFeature
31
30
  field :smarts, type: String
31
+ index "smarts" => 1
32
32
  def self.from_smarts smarts
33
33
  self.find_or_create_by :smarts => smarts
34
34
  end
35
35
  end
36
36
 
37
- # Feature for supervised fragments from Fminer algorithm
38
- class FminerSmarts < Smarts
39
- field :p_value, type: Float
40
- # TODO check if effect is used
41
- field :effect, type: String
42
- field :dataset_id
43
- end
44
-
45
- # Feature for database fingerprints
46
- # needs count for efficient retrieval (see compound.rb)
47
- class FingerprintSmarts < Smarts
48
- field :count, type: Integer
49
- def self.fingerprint
50
- @@fp4 ||= OpenTox::FingerprintSmarts.all
51
- unless @@fp4.size == 306
52
- @@fp4 = []
53
- # OpenBabel FP4 fingerprints
54
- # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
55
- # TODO investigate other types of fingerprints (MACCS)
56
- # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
57
- # http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
58
- # OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
59
- # Morgan ECFP, FCFP
60
- # http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
61
- # http://www.rdkit.org/docs/GettingStartedInPython.html
62
- # Chemfp
63
- # https://chemfp.readthedocs.org/en/latest/using-tools.html
64
- # CACTVS/PubChem
65
-
66
- File.open(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")).each do |l|
67
- l.strip!
68
- unless l.empty? or l.match /^#/
69
- name,smarts = l.split(': ')
70
- @@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
71
- end
72
- end
73
- end
74
- @@fp4
75
- end
76
- end
77
-
78
- # Feature for physico-chemical descriptors
79
- class PhysChemDescriptor < NumericFeature
80
- field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
81
- field :parameters, type: Hash
82
- field :creator, type: String
83
- end
84
-
85
37
  # Feature for categorical bioassay results
86
38
  class NominalBioAssay < NominalFeature
87
- # TODO: needed? move to dataset?
88
- field :description, type: String
89
39
  end
90
40
 
91
41
  # Feature for quantitative bioassay results
92
42
  class NumericBioAssay < NumericFeature
93
- # TODO: needed? move to dataset?
94
- field :description, type: String
95
43
  end
96
44
 
97
45
  end
data/lib/lazar.rb CHANGED
@@ -8,43 +8,58 @@ require 'mongoid'
8
8
  require 'rserve'
9
9
  require "nokogiri"
10
10
  require "base64"
11
+ require 'openbabel'
11
12
 
13
+ # Environment setup
14
+ ENV["LAZAR_ENV"] ||= "production"
15
+ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', please set it to 'production' or 'development'." unless ENV["LAZAR_ENV"].match(/production|development/)
12
16
 
13
- # Mongo setup
14
- # TODO retrieve correct environment from Rack/Sinatra
15
- ENV["MONGOID_ENV"] ||= "development"
16
- # TODO remove config files, change default via ENV or directly in Mongoid class
17
- Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}")
18
- # TODO get Mongo::Client from Mongoid
19
- $mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
20
- # TODO same for GridFS
17
+ ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"]
18
+ ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
19
+ Mongoid.load_configuration({
20
+ :clients => {
21
+ :default => {
22
+ :database => ENV["LAZAR_ENV"],
23
+ :hosts => ["localhost:27017"],
24
+ }
25
+ }
26
+ })
27
+ Mongoid.raise_not_found_error = false # return nil if no document is found
28
+ $mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}")
21
29
  $gridfs = $mongo.database.fs
22
30
 
23
- # R setup
24
- R = Rserve::Connection.new
25
-
26
31
  # Logger setup
32
+ STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files
27
33
  $logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
28
- $logger.level = Logger::DEBUG
29
- Mongo::Logger.logger = $logger
30
- Mongo::Logger.level = Logger::WARN
31
- #Mongoid.logger = $logger
32
-
33
- # Require sub-Repositories
34
- require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
35
- require_relative '../libfminer/liblast/last' #
36
- require_relative '../last-utils/lu.rb'
37
- require_relative '../openbabel/lib/openbabel'
34
+ case ENV["LAZAR_ENV"]
35
+ when "production"
36
+ $logger.level = Logger::WARN
37
+ Mongo::Logger.level = Logger::WARN
38
+ when "development"
39
+ $logger.level = Logger::DEBUG
40
+ Mongo::Logger.level = Logger::WARN
41
+ end
38
42
 
39
- # Fminer environment variables
40
- ENV['FMINER_SMARTS'] = 'true'
41
- ENV['FMINER_NO_AROMATIC'] = 'true'
42
- ENV['FMINER_PVALUES'] = 'true'
43
- ENV['FMINER_SILENT'] = 'true'
44
- ENV['FMINER_NR_HITS'] = 'true'
43
+ # R setup
44
+ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
45
+ # should work on POSIX including os x
46
+ # http://stackoverflow.com/questions/19619582/number-of-processors-cores-in-command-line
47
+ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
48
+ R = Rserve::Connection.new
49
+ R.eval "
50
+ suppressPackageStartupMessages({
51
+ library(ggplot2,lib=\"#{rlib}\")
52
+ library(grid,lib=\"#{rlib}\")
53
+ library(gridExtra,lib=\"#{rlib}\")
54
+ library(pls,lib=\"#{rlib}\")
55
+ library(caret,lib=\"#{rlib}\")
56
+ library(doMC,lib=\"#{rlib}\")
57
+ registerDoMC(#{NR_CORES})
58
+ })
59
+ "
45
60
 
46
61
  # OpenTox classes and includes
47
- CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules
62
+ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
48
63
 
49
64
  [ # be aware of the require sequence as it affects class/method overwrites
50
65
  "overwrite.rb",
@@ -52,18 +67,16 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor
52
67
  "error.rb",
53
68
  "opentox.rb",
54
69
  "feature.rb",
70
+ "physchem.rb",
55
71
  "compound.rb",
56
72
  "dataset.rb",
57
- "descriptor.rb",
58
73
  "algorithm.rb",
59
- "descriptor.rb",
60
- "bbrc.rb",
61
74
  "model.rb",
62
- "similarity.rb",
63
- "neighbor.rb",
64
75
  "classification.rb",
65
76
  "regression.rb",
66
77
  "validation.rb",
67
78
  "crossvalidation.rb",
79
+ "leave-one-out-validation.rb",
80
+ "experiment.rb",
68
81
  ].each{ |f| require_relative f }
69
-
82
+ OpenTox::PhysChem.descriptors # load descriptor features