lazar 0.9.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
data/lib/dataset.rb CHANGED
@@ -5,46 +5,49 @@ module OpenTox
5
5
 
6
6
  class Dataset
7
7
 
8
- # associations like has_many, belongs_to deteriorate performance
9
- field :feature_ids, type: Array, default: []
10
- field :compound_ids, type: Array, default: []
11
- field :data_entries, type: Array, default: []
12
- field :source, type: String
8
+ field :data_entries, type: Hash, default: {}
13
9
 
14
10
  # Readers
15
11
 
16
- # Get all compounds
17
12
  def compounds
18
- @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
19
- @compounds
13
+ substances.select{|s| s.is_a? Compound}
14
+ end
15
+
16
+ def nanoparticles
17
+ substances.select{|s| s.is_a? Nanoparticle}
18
+ end
19
+
20
+ # Get all substances
21
+ def substances
22
+ @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq
23
+ @substances
20
24
  end
21
25
 
22
26
  # Get all features
23
27
  def features
24
- @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
28
+ @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq
25
29
  @features
26
30
  end
27
31
 
28
- # Find data entry values for a given compound and feature
29
- # @param compound [OpenTox::Compound] OpenTox Compound object
30
- # @param feature [OpenTox::Feature] OpenTox Feature object
31
- # @return [Array] Data entry values
32
- def values(compound, feature)
33
- rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
34
- col = feature_ids.index feature.id
35
- rows.collect{|row| data_entries[row][col]}
32
+ def values substance,feature
33
+ substance = substance.id if substance.is_a? Substance
34
+ feature = feature.id if feature.is_a? Feature
35
+ if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
36
+ data_entries[substance.to_s][feature.to_s]
37
+ else
38
+ nil
39
+ end
36
40
  end
37
41
 
38
42
  # Writers
39
43
 
40
- # Set compounds
41
- def compounds=(compounds)
42
- self.compound_ids = compounds.collect{|c| c.id}
43
- end
44
-
45
- # Set features
46
- def features=(features)
47
- self.feature_ids = features.collect{|f| f.id}
44
+ def add(substance,feature,value)
45
+ substance = substance.id if substance.is_a? Substance
46
+ feature = feature.id if feature.is_a? Feature
47
+ data_entries[substance.to_s] ||= {}
48
+ data_entries[substance.to_s][feature.to_s] ||= []
49
+ data_entries[substance.to_s][feature.to_s] << value
50
+ #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source
48
51
  end
49
52
 
50
53
  # Dataset operations
@@ -53,13 +56,7 @@ module OpenTox
53
56
  # @param [Integer] number of folds
54
57
  # @return [Array] Array with folds [training_dataset,test_dataset]
55
58
  def folds n
56
- unique_compound_data = {}
57
- compound_ids.each_with_index do |cid,i|
58
- unique_compound_data[cid] ||= []
59
- unique_compound_data[cid] << data_entries[i]
60
- end
61
- unique_compound_ids = unique_compound_data.keys
62
- len = unique_compound_ids.size
59
+ len = self.substances.size
63
60
  indices = (0..len-1).to_a.shuffle
64
61
  mid = (len/n)
65
62
  chunks = []
@@ -68,22 +65,16 @@ module OpenTox
68
65
  last = start+mid
69
66
  last = last-1 unless len%n >= i
70
67
  test_idxs = indices[start..last] || []
71
- test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
68
+ test_substances = test_idxs.collect{|i| substances[i]}
72
69
  training_idxs = indices-test_idxs
73
- training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
74
- chunk = [training_cids,test_cids].collect do |unique_cids|
75
- cids = []
76
- data_entries = []
77
- unique_cids.each do |cid|
78
- unique_compound_data[cid].each do |de|
79
- cids << cid
80
- data_entries << de
81
- end
82
- end
83
- dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
84
- dataset.compounds.each do |compound|
85
- compound.dataset_ids << dataset.id
86
- compound.save
70
+ training_substances = training_idxs.collect{|i| substances[i]}
71
+ chunk = [training_substances,test_substances].collect do |substances|
72
+ dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id )
73
+ substances.each do |substance|
74
+ substance.dataset_ids << dataset.id
75
+ substance.dataset_ids.uniq!
76
+ substance.save
77
+ dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
87
78
  end
88
79
  dataset.save
89
80
  dataset
@@ -94,41 +85,37 @@ module OpenTox
94
85
  chunks
95
86
  end
96
87
 
97
- # Diagnostics
98
-
99
- def duplicates feature=self.features.first
100
- col = feature_ids.index feature.id
101
- dups = {}
102
- compound_ids.each_with_index do |cid,i|
103
- rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
104
- values = rows.collect{|row| data_entries[row][col]}
105
- dups[cid] = values if values.size > 1
106
- end
107
- dups
108
- end
109
-
110
- def correlation_plot training_dataset
111
- # TODO: create/store svg
112
- R.assign "features", data_entries
113
- R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
114
- R.eval "featurePlot(features,activities)"
115
- end
116
-
117
- def density_plot
118
- # TODO: create/store svg
119
- R.assign "acts", data_entries.collect{|r| r.first }#.compact
120
- R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
121
- end
122
-
123
88
  # Serialisation
124
89
 
125
90
  # converts dataset to csv format including compound smiles as first column, other column headers are feature names
126
91
  # @return [String]
127
92
  def to_csv(inchi=false)
128
- CSV.generate() do |csv| #{:force_quotes=>true}
129
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
130
- compounds.each_with_index do |c,i|
131
- csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
93
+ CSV.generate() do |csv|
94
+ compound = substances.first.is_a? Compound
95
+ if compound
96
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
97
+ else
98
+ csv << ["Name"] + features.collect{|f| f.name}
99
+ end
100
+ substances.each do |substance|
101
+ if compound
102
+ name = (inchi ? substance.inchi : substance.smiles)
103
+ else
104
+ name = substance.name
105
+ end
106
+ nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq
107
+
108
+ if nr_measurements.size > 1
109
+ warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
110
+ else
111
+ (0..nr_measurements.first-1).each do |i|
112
+ row = [name]
113
+ features.each do |f|
114
+ values(substance,f) ? row << values(substance,f)[i] : row << ""
115
+ end
116
+ csv << row
117
+ end
118
+ end
132
119
  end
133
120
  end
134
121
  end
@@ -143,9 +130,8 @@ module OpenTox
143
130
  #end
144
131
 
145
132
  # Create a dataset from CSV file
146
- # TODO: document structure
147
- def self.from_csv_file file, source=nil, bioassay=true#, layout={}
148
- source ||= file
133
+ def self.from_csv_file file, accept_empty_values=false
134
+ source = file
149
135
  name = File.basename(file,".*")
150
136
  dataset = self.find_by(:source => source, :name => name)
151
137
  if dataset
@@ -154,171 +140,116 @@ module OpenTox
154
140
  $logger.debug "Parsing #{file}."
155
141
  table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
156
142
  dataset = self.new(:source => source, :name => name)
157
- dataset.parse_table table, bioassay#, layout
143
+ dataset.parse_table table, accept_empty_values
158
144
  end
159
145
  dataset
160
146
  end
161
147
 
162
148
  # parse data in tabular format (e.g. from csv)
163
149
  # does a lot of guesswork in order to determine feature types
164
- def parse_table table, bioassay=true
165
-
166
- time = Time.now
150
+ def parse_table table, accept_empty_values
167
151
 
168
152
  # features
169
153
  feature_names = table.shift.collect{|f| f.strip}
170
- warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
154
+ warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
171
155
  compound_format = feature_names.shift.strip
172
156
  bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
173
-
174
157
  numeric = []
158
+ features = []
175
159
  # guess feature types
176
160
  feature_names.each_with_index do |f,i|
177
161
  metadata = {:name => f}
178
162
  values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
179
163
  types = values.collect{|v| v.numeric? ? true : false}.uniq
164
+ feature = nil
180
165
  if values.size == 0 # empty feature
181
166
  elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
182
- metadata["numeric"] = true
183
167
  numeric[i] = true
168
+ feature = NumericFeature.find_or_create_by(metadata)
184
169
  else
185
- metadata["nominal"] = true
186
170
  metadata["accept_values"] = values
187
171
  numeric[i] = false
172
+ feature = NominalFeature.find_or_create_by(metadata)
188
173
  end
189
- if bioassay
190
- if metadata["numeric"]
191
- feature = NumericBioAssay.find_or_create_by(metadata)
192
- elsif metadata["nominal"]
193
- feature = NominalBioAssay.find_or_create_by(metadata)
194
- end
195
- else
196
- metadata.merge({:measured => false, :calculated => true})
197
- if metadata["numeric"]
198
- feature = NumericFeature.find_or_create_by(metadata)
199
- elsif metadata["nominal"]
200
- feature = NominalFeature.find_or_create_by(metadata)
201
- end
202
- end
203
- feature_ids << feature.id if feature
174
+ features << feature if feature
204
175
  end
205
176
 
206
- $logger.debug "Feature values: #{Time.now-time}"
207
- time = Time.now
208
-
209
- r = -1
210
- compound_time = 0
211
- value_time = 0
212
-
213
- # compounds and values
214
- self.data_entries = []
177
+ # substances and values
215
178
 
179
+ all_substances = []
216
180
  table.each_with_index do |vals,i|
217
- ct = Time.now
218
181
  identifier = vals.shift.strip
219
- warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
182
+ warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values
220
183
  begin
221
184
  case compound_format
222
185
  when /SMILES/i
223
- compound = OpenTox::Compound.from_smiles(identifier)
186
+ substance = OpenTox::Compound.from_smiles(identifier)
224
187
  when /InChI/i
225
- compound = OpenTox::Compound.from_inchi(identifier)
188
+ substance = OpenTox::Compound.from_inchi(identifier)
226
189
  end
227
190
  rescue
228
- compound = nil
191
+ substance = nil
229
192
  end
230
- if compound.nil?
231
- # compound parsers may return nil
232
- warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
193
+ if substance.nil? # compound parsers may return nil
194
+ warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
233
195
  next
234
196
  end
235
- compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
236
- compound_time += Time.now-ct
197
+ all_substances << substance
198
+ substance.dataset_ids << self.id
199
+ substance.dataset_ids.uniq!
200
+ substance.save
237
201
 
238
- r += 1
239
- unless vals.size == feature_ids.size # way cheaper than accessing features
240
- warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
202
+ unless vals.size == features.size
203
+ warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
241
204
  next
242
205
  end
243
206
 
244
- compound_ids << compound.id
245
- table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
246
-
247
207
  vals.each_with_index do |v,j|
248
208
  if v.blank?
249
- warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
209
+ warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'."
250
210
  next
251
211
  elsif numeric[j]
252
212
  v = v.to_f
253
213
  else
254
214
  v = v.strip
255
215
  end
256
- self.data_entries.last[j] = v
257
- #i = compound.feature_ids.index feature_ids[j]
258
- compound.features[feature_ids[j].to_s] ||= []
259
- compound.features[feature_ids[j].to_s] << v
260
- compound.save
216
+ add substance, features[j], v
261
217
  end
218
+ data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values
262
219
  end
263
- compounds.duplicates.each do |compound|
220
+ all_substances.duplicates.each do |substance|
264
221
  positions = []
265
- compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
266
- warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
222
+ all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
223
+ warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
267
224
  end
268
-
269
- $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
270
- time = Time.now
271
225
  save
272
- $logger.debug "Saving: #{Time.now-time}"
273
-
274
226
  end
275
227
 
276
- # Fill unset data entries
277
- # @param any value
278
- def fill_nil_with n
279
- (0 .. compound_ids.size-1).each do |i|
280
- data_entries[i] ||= []
281
- (0 .. feature_ids.size-1).each do |j|
282
- data_entries[i][j] ||= n
283
- end
284
- end
228
+ def delete
229
+ compounds.each{|c| c.dataset_ids.delete id.to_s}
230
+ super
285
231
  end
286
232
 
287
233
  end
288
234
 
289
235
  # Dataset for lazar predictions
290
- class LazarPrediction < Dataset
236
+ class LazarPrediction #< Dataset
291
237
  field :creator, type: String
292
- field :prediction_feature_id, type: String
238
+ field :prediction_feature_id, type: BSON::ObjectId
239
+ field :predictions, type: Hash, default: {}
293
240
 
294
241
  def prediction_feature
295
242
  Feature.find prediction_feature_id
296
243
  end
297
244
 
298
- end
299
-
300
- # Dataset for descriptors (physchem)
301
- class DescriptorDataset < Dataset
302
- field :feature_calculation_algorithm, type: String
303
-
304
- end
305
-
306
- class ScaledDataset < DescriptorDataset
307
-
308
- field :centers, type: Array, default: []
309
- field :scales, type: Array, default: []
245
+ def compounds
246
+ substances.select{|s| s.is_a? Compound}
247
+ end
310
248
 
311
- def original_value value, i
312
- value * scales[i] + centers[i]
249
+ def substances
250
+ predictions.keys.collect{|id| Substance.find id}
313
251
  end
314
- end
315
252
 
316
- # Dataset for fminer descriptors
317
- class FminerDataset < DescriptorDataset
318
- field :training_algorithm, type: String
319
- field :training_dataset_id, type: BSON::ObjectId
320
- field :training_feature_id, type: BSON::ObjectId
321
- field :training_parameters, type: Hash
322
253
  end
323
254
 
324
255
  end
data/lib/feature.rb CHANGED
@@ -2,27 +2,28 @@ module OpenTox
2
2
 
3
3
  # Basic feature class
4
4
  class Feature
5
- field :nominal, type: Boolean
6
- field :numeric, type: Boolean
7
5
  field :measured, type: Boolean
8
6
  field :calculated, type: Boolean
7
+ field :category, type: String
8
+ field :unit, type: String
9
+ field :conditions, type: Hash
10
+
11
+ def nominal?
12
+ self.class == NominalFeature
13
+ end
14
+
15
+ def numeric?
16
+ self.class == NumericFeature
17
+ end
9
18
  end
10
19
 
11
20
  # Feature for categorical variables
12
21
  class NominalFeature < Feature
13
22
  field :accept_values, type: Array
14
- def initialize params
15
- super params
16
- nominal = true
17
- end
18
23
  end
19
24
 
20
25
  # Feature for quantitative variables
21
26
  class NumericFeature < Feature
22
- def initialize params
23
- super params
24
- numeric = true
25
- end
26
27
  end
27
28
 
28
29
  # Feature for SMARTS fragments
@@ -34,12 +35,4 @@ module OpenTox
34
35
  end
35
36
  end
36
37
 
37
- # Feature for categorical bioassay results
38
- class NominalBioAssay < NominalFeature
39
- end
40
-
41
- # Feature for quantitative bioassay results
42
- class NumericBioAssay < NumericFeature
43
- end
44
-
45
38
  end
data/lib/feature_selection.rb ADDED
@@ -0,0 +1,42 @@
1
+ module OpenTox
2
+ module Algorithm
3
+
4
+ class FeatureSelection
5
+
6
+ def self.correlation_filter model
7
+ relevant_features = {}
8
+ R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
9
+ model.descriptor_weights = []
10
+ selected_variables = []
11
+ selected_descriptor_ids = []
12
+ model.independent_variables.each_with_index do |v,i|
13
+ v.collect!{|n| to_r(n)}
14
+ R.assign "independent", v
15
+ begin
16
+ R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')"
17
+ pvalue = R.eval("cor$p.value").to_ruby
18
+ if pvalue <= 0.05
19
+ model.descriptor_weights << R.eval("cor$estimate").to_ruby**2
20
+ selected_variables << v
21
+ selected_descriptor_ids << model.descriptor_ids[i]
22
+ end
23
+ rescue
24
+ warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
25
+ end
26
+ end
27
+
28
+ model.independent_variables = selected_variables
29
+ model.descriptor_ids = selected_descriptor_ids
30
+ model
31
+ end
32
+
33
+ def self.to_r v
34
+ return 0 if v == false
35
+ return 1 if v == true
36
+ v
37
+ end
38
+
39
+ end
40
+
41
+ end
42
+ end
data/lib/import.rb ADDED
@@ -0,0 +1,122 @@
1
+ module OpenTox
2
+
3
+ module Import
4
+
5
+ class Enanomapper
6
+ include OpenTox
7
+
8
+ # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
9
+ def self.import
10
+ datasets = {}
11
+ bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
12
+ bundles.each do |bundle|
13
+ datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip)
14
+ $logger.debug bundle["title"].strip
15
+ nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
16
+ nanoparticles.each_with_index do |np,n|
17
+ core_id = nil
18
+ coating_ids = []
19
+ np["composition"].each do |c|
20
+ uri = c["component"]["compound"]["URI"]
21
+ uri = CGI.escape File.join(uri,"&media=application/json")
22
+ data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
23
+ smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
24
+ names = []
25
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
26
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
27
+ if smiles
28
+ compound = Compound.find_or_create_by(:smiles => smiles)
29
+ compound.name = names.first
30
+ compound.names = names.compact
31
+ else
32
+ compound = Compound.find_or_create_by(:name => names.first,:names => names.compact)
33
+ end
34
+ compound.save
35
+ if c["relation"] == "HAS_CORE"
36
+ core_id = compound.id.to_s
37
+ elsif c["relation"] == "HAS_COATING"
38
+ coating_ids << compound.id.to_s
39
+ end
40
+ end if np["composition"]
41
+ nanoparticle = Nanoparticle.find_or_create_by(
42
+ :name => np["values"]["https://data.enanomapper.net/identifier/name"],
43
+ :source => np["compound"]["URI"],
44
+ :core_id => core_id,
45
+ :coating_ids => coating_ids
46
+ )
47
+ np["bundles"].keys.each do |bundle_uri|
48
+ nanoparticle.dataset_ids << datasets[bundle_uri].id
49
+ end
50
+
51
+ studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
52
+ studies.each do |study|
53
+ dataset = datasets[np["bundles"].keys.first]
54
+ proteomics_features = {}
55
+ category = study["protocol"]["topcategory"]
56
+ source = study["protocol"]["category"]["term"]
57
+ study["effects"].each do |effect|
58
+
59
+ effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
60
+ effect["conditions"].delete_if { |k, v| v.nil? }
61
+
62
+ if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
63
+
64
+ JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
65
+ proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
66
+ nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
67
+ end
68
+ else
69
+ name = effect["endpoint"]
70
+ unit = effect["result"]["unit"]
71
+ warnings = []
72
+ case name
73
+ when "Log2 transformed" # use a sensible name
74
+ name = "log2(Net cell association)"
75
+ warnings = ["Original name was 'Log2 transformed'"]
76
+ unit = "log2(mL/ug(Mg))"
77
+ when "Total protein (BCA assay)"
78
+ category = "P-CHEM"
79
+ warnings = ["Category changed from TOX to P-CHEM"]
80
+ end
81
+ feature = klass.find_or_create_by(
82
+ :name => name,
83
+ :unit => unit,
84
+ :category => category,
85
+ :conditions => effect["conditions"],
86
+ :source => study["protocol"]["category"]["term"],
87
+ :measured => true,
88
+ :warnings => warnings
89
+ )
90
+ nanoparticle.parse_ambit_value feature, effect["result"], dataset
91
+ end
92
+ end
93
+ end
94
+ nanoparticle.save
95
+ print "#{n}, "
96
+ end
97
+ puts
98
+ end
99
+ datasets.each { |u,d| d.save }
100
+ end
101
+
102
+ =begin
103
+ def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
104
+ #get list of bundle URIs
105
+ bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
106
+ datasets = []
107
+ bundles.each do |bundle|
108
+ uri = bundle["URI"]
109
+ study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
110
+ study["@graph"].each do |i|
111
+ puts i.to_yaml if i.keys.include? "sio:has-value"
112
+ end
113
+ end
114
+ datasets.collect{|d| d.id}
115
+ end
116
+ =end
117
+
118
+ end
119
+
120
+ end
121
+
122
+ end
data/lib/lazar.rb CHANGED
@@ -48,6 +48,7 @@ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
48
48
  R = Rserve::Connection.new
49
49
  R.eval "
50
50
  suppressPackageStartupMessages({
51
+ library(labeling,lib=\"#{rlib}\")
51
52
  library(iterators,lib=\"#{rlib}\")
52
53
  library(foreach,lib=\"#{rlib}\")
53
54
  library(ggplot2,lib=\"#{rlib}\")
@@ -56,12 +57,14 @@ suppressPackageStartupMessages({
56
57
  library(pls,lib=\"#{rlib}\")
57
58
  library(caret,lib=\"#{rlib}\")
58
59
  library(doMC,lib=\"#{rlib}\")
60
+ library(randomForest,lib=\"#{rlib}\")
61
+ library(plyr,lib=\"#{rlib}\")
59
62
  registerDoMC(#{NR_CORES})
60
63
  })
61
64
  "
62
65
 
63
66
  # OpenTox classes and includes
64
- CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
67
+ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
65
68
 
66
69
  [ # be aware of the require sequence as it affects class/method overwrites
67
70
  "overwrite.rb",
@@ -70,15 +73,22 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
70
73
  "opentox.rb",
71
74
  "feature.rb",
72
75
  "physchem.rb",
76
+ "substance.rb",
73
77
  "compound.rb",
78
+ "nanoparticle.rb",
74
79
  "dataset.rb",
75
80
  "algorithm.rb",
81
+ "similarity.rb",
82
+ "feature_selection.rb",
76
83
  "model.rb",
77
84
  "classification.rb",
78
85
  "regression.rb",
86
+ "caret.rb",
87
+ "validation-statistics.rb",
79
88
  "validation.rb",
80
- "crossvalidation.rb",
89
+ "train-test-validation.rb",
81
90
  "leave-one-out-validation.rb",
82
- "experiment.rb",
91
+ "crossvalidation.rb",
92
+ #"experiment.rb",
93
+ "import.rb",
83
94
  ].each{ |f| require_relative f }
84
- OpenTox::PhysChem.descriptors # load descriptor features