lazar 0.9.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
data/lib/dataset.rb CHANGED
@@ -5,46 +5,49 @@ module OpenTox
5
5
 
6
6
  class Dataset
7
7
 
8
- # associations like has_many, belongs_to deteriorate performance
9
- field :feature_ids, type: Array, default: []
10
- field :compound_ids, type: Array, default: []
11
- field :data_entries, type: Array, default: []
12
- field :source, type: String
8
+ field :data_entries, type: Hash, default: {}
13
9
 
14
10
  # Readers
15
11
 
16
- # Get all compounds
17
12
  def compounds
18
- @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
19
- @compounds
13
+ substances.select{|s| s.is_a? Compound}
14
+ end
15
+
16
+ def nanoparticles
17
+ substances.select{|s| s.is_a? Nanoparticle}
18
+ end
19
+
20
+ # Get all substances
21
+ def substances
22
+ @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq
23
+ @substances
20
24
  end
21
25
 
22
26
  # Get all features
23
27
  def features
24
- @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
28
+ @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq
25
29
  @features
26
30
  end
27
31
 
28
- # Find data entry values for a given compound and feature
29
- # @param compound [OpenTox::Compound] OpenTox Compound object
30
- # @param feature [OpenTox::Feature] OpenTox Feature object
31
- # @return [Array] Data entry values
32
- def values(compound, feature)
33
- rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
34
- col = feature_ids.index feature.id
35
- rows.collect{|row| data_entries[row][col]}
32
+ def values substance,feature
33
+ substance = substance.id if substance.is_a? Substance
34
+ feature = feature.id if feature.is_a? Feature
35
+ if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
36
+ data_entries[substance.to_s][feature.to_s]
37
+ else
38
+ nil
39
+ end
36
40
  end
37
41
 
38
42
  # Writers
39
43
 
40
- # Set compounds
41
- def compounds=(compounds)
42
- self.compound_ids = compounds.collect{|c| c.id}
43
- end
44
-
45
- # Set features
46
- def features=(features)
47
- self.feature_ids = features.collect{|f| f.id}
44
+ def add(substance,feature,value)
45
+ substance = substance.id if substance.is_a? Substance
46
+ feature = feature.id if feature.is_a? Feature
47
+ data_entries[substance.to_s] ||= {}
48
+ data_entries[substance.to_s][feature.to_s] ||= []
49
+ data_entries[substance.to_s][feature.to_s] << value
50
+ #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source
48
51
  end
49
52
 
50
53
  # Dataset operations
@@ -53,13 +56,7 @@ module OpenTox
53
56
  # @param [Integer] number of folds
54
57
  # @return [Array] Array with folds [training_dataset,test_dataset]
55
58
  def folds n
56
- unique_compound_data = {}
57
- compound_ids.each_with_index do |cid,i|
58
- unique_compound_data[cid] ||= []
59
- unique_compound_data[cid] << data_entries[i]
60
- end
61
- unique_compound_ids = unique_compound_data.keys
62
- len = unique_compound_ids.size
59
+ len = self.substances.size
63
60
  indices = (0..len-1).to_a.shuffle
64
61
  mid = (len/n)
65
62
  chunks = []
@@ -68,22 +65,16 @@ module OpenTox
68
65
  last = start+mid
69
66
  last = last-1 unless len%n >= i
70
67
  test_idxs = indices[start..last] || []
71
- test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
68
+ test_substances = test_idxs.collect{|i| substances[i]}
72
69
  training_idxs = indices-test_idxs
73
- training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
74
- chunk = [training_cids,test_cids].collect do |unique_cids|
75
- cids = []
76
- data_entries = []
77
- unique_cids.each do |cid|
78
- unique_compound_data[cid].each do |de|
79
- cids << cid
80
- data_entries << de
81
- end
82
- end
83
- dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
84
- dataset.compounds.each do |compound|
85
- compound.dataset_ids << dataset.id
86
- compound.save
70
+ training_substances = training_idxs.collect{|i| substances[i]}
71
+ chunk = [training_substances,test_substances].collect do |substances|
72
+ dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id )
73
+ substances.each do |substance|
74
+ substance.dataset_ids << dataset.id
75
+ substance.dataset_ids.uniq!
76
+ substance.save
77
+ dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
87
78
  end
88
79
  dataset.save
89
80
  dataset
@@ -94,41 +85,37 @@ module OpenTox
94
85
  chunks
95
86
  end
96
87
 
97
- # Diagnostics
98
-
99
- def duplicates feature=self.features.first
100
- col = feature_ids.index feature.id
101
- dups = {}
102
- compound_ids.each_with_index do |cid,i|
103
- rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
104
- values = rows.collect{|row| data_entries[row][col]}
105
- dups[cid] = values if values.size > 1
106
- end
107
- dups
108
- end
109
-
110
- def correlation_plot training_dataset
111
- # TODO: create/store svg
112
- R.assign "features", data_entries
113
- R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
114
- R.eval "featurePlot(features,activities)"
115
- end
116
-
117
- def density_plot
118
- # TODO: create/store svg
119
- R.assign "acts", data_entries.collect{|r| r.first }#.compact
120
- R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
121
- end
122
-
123
88
  # Serialisation
124
89
 
125
90
  # converts dataset to csv format including compound smiles as first column, other column headers are feature names
126
91
  # @return [String]
127
92
  def to_csv(inchi=false)
128
- CSV.generate() do |csv| #{:force_quotes=>true}
129
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
130
- compounds.each_with_index do |c,i|
131
- csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
93
+ CSV.generate() do |csv|
94
+ compound = substances.first.is_a? Compound
95
+ if compound
96
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
97
+ else
98
+ csv << ["Name"] + features.collect{|f| f.name}
99
+ end
100
+ substances.each do |substance|
101
+ if compound
102
+ name = (inchi ? substance.inchi : substance.smiles)
103
+ else
104
+ name = substance.name
105
+ end
106
+ nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq
107
+
108
+ if nr_measurements.size > 1
109
+ warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
110
+ else
111
+ (0..nr_measurements.first-1).each do |i|
112
+ row = [name]
113
+ features.each do |f|
114
+ values(substance,f) ? row << values(substance,f)[i] : row << ""
115
+ end
116
+ csv << row
117
+ end
118
+ end
132
119
  end
133
120
  end
134
121
  end
@@ -143,9 +130,8 @@ module OpenTox
143
130
  #end
144
131
 
145
132
  # Create a dataset from CSV file
146
- # TODO: document structure
147
- def self.from_csv_file file, source=nil, bioassay=true#, layout={}
148
- source ||= file
133
+ def self.from_csv_file file, accept_empty_values=false
134
+ source = file
149
135
  name = File.basename(file,".*")
150
136
  dataset = self.find_by(:source => source, :name => name)
151
137
  if dataset
@@ -154,171 +140,116 @@ module OpenTox
154
140
  $logger.debug "Parsing #{file}."
155
141
  table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
156
142
  dataset = self.new(:source => source, :name => name)
157
- dataset.parse_table table, bioassay#, layout
143
+ dataset.parse_table table, accept_empty_values
158
144
  end
159
145
  dataset
160
146
  end
161
147
 
162
148
  # parse data in tabular format (e.g. from csv)
163
149
  # does a lot of guesswork in order to determine feature types
164
- def parse_table table, bioassay=true
165
-
166
- time = Time.now
150
+ def parse_table table, accept_empty_values
167
151
 
168
152
  # features
169
153
  feature_names = table.shift.collect{|f| f.strip}
170
- warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
154
+ warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
171
155
  compound_format = feature_names.shift.strip
172
156
  bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
173
-
174
157
  numeric = []
158
+ features = []
175
159
  # guess feature types
176
160
  feature_names.each_with_index do |f,i|
177
161
  metadata = {:name => f}
178
162
  values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
179
163
  types = values.collect{|v| v.numeric? ? true : false}.uniq
164
+ feature = nil
180
165
  if values.size == 0 # empty feature
181
166
  elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
182
- metadata["numeric"] = true
183
167
  numeric[i] = true
168
+ feature = NumericFeature.find_or_create_by(metadata)
184
169
  else
185
- metadata["nominal"] = true
186
170
  metadata["accept_values"] = values
187
171
  numeric[i] = false
172
+ feature = NominalFeature.find_or_create_by(metadata)
188
173
  end
189
- if bioassay
190
- if metadata["numeric"]
191
- feature = NumericBioAssay.find_or_create_by(metadata)
192
- elsif metadata["nominal"]
193
- feature = NominalBioAssay.find_or_create_by(metadata)
194
- end
195
- else
196
- metadata.merge({:measured => false, :calculated => true})
197
- if metadata["numeric"]
198
- feature = NumericFeature.find_or_create_by(metadata)
199
- elsif metadata["nominal"]
200
- feature = NominalFeature.find_or_create_by(metadata)
201
- end
202
- end
203
- feature_ids << feature.id if feature
174
+ features << feature if feature
204
175
  end
205
176
 
206
- $logger.debug "Feature values: #{Time.now-time}"
207
- time = Time.now
208
-
209
- r = -1
210
- compound_time = 0
211
- value_time = 0
212
-
213
- # compounds and values
214
- self.data_entries = []
177
+ # substances and values
215
178
 
179
+ all_substances = []
216
180
  table.each_with_index do |vals,i|
217
- ct = Time.now
218
181
  identifier = vals.shift.strip
219
- warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
182
+ warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values
220
183
  begin
221
184
  case compound_format
222
185
  when /SMILES/i
223
- compound = OpenTox::Compound.from_smiles(identifier)
186
+ substance = OpenTox::Compound.from_smiles(identifier)
224
187
  when /InChI/i
225
- compound = OpenTox::Compound.from_inchi(identifier)
188
+ substance = OpenTox::Compound.from_inchi(identifier)
226
189
  end
227
190
  rescue
228
- compound = nil
191
+ substance = nil
229
192
  end
230
- if compound.nil?
231
- # compound parsers may return nil
232
- warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
193
+ if substance.nil? # compound parsers may return nil
194
+ warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
233
195
  next
234
196
  end
235
- compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
236
- compound_time += Time.now-ct
197
+ all_substances << substance
198
+ substance.dataset_ids << self.id
199
+ substance.dataset_ids.uniq!
200
+ substance.save
237
201
 
238
- r += 1
239
- unless vals.size == feature_ids.size # way cheaper than accessing features
240
- warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
202
+ unless vals.size == features.size
203
+ warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
241
204
  next
242
205
  end
243
206
 
244
- compound_ids << compound.id
245
- table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
246
-
247
207
  vals.each_with_index do |v,j|
248
208
  if v.blank?
249
- warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
209
+ warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'."
250
210
  next
251
211
  elsif numeric[j]
252
212
  v = v.to_f
253
213
  else
254
214
  v = v.strip
255
215
  end
256
- self.data_entries.last[j] = v
257
- #i = compound.feature_ids.index feature_ids[j]
258
- compound.features[feature_ids[j].to_s] ||= []
259
- compound.features[feature_ids[j].to_s] << v
260
- compound.save
216
+ add substance, features[j], v
261
217
  end
218
+ data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values
262
219
  end
263
- compounds.duplicates.each do |compound|
220
+ all_substances.duplicates.each do |substance|
264
221
  positions = []
265
- compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
266
- warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
222
+ all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
223
+ warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
267
224
  end
268
-
269
- $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
270
- time = Time.now
271
225
  save
272
- $logger.debug "Saving: #{Time.now-time}"
273
-
274
226
  end
275
227
 
276
- # Fill unset data entries
277
- # @param any value
278
- def fill_nil_with n
279
- (0 .. compound_ids.size-1).each do |i|
280
- data_entries[i] ||= []
281
- (0 .. feature_ids.size-1).each do |j|
282
- data_entries[i][j] ||= n
283
- end
284
- end
228
+ def delete
229
+ compounds.each{|c| c.dataset_ids.delete id.to_s}
230
+ super
285
231
  end
286
232
 
287
233
  end
288
234
 
289
235
  # Dataset for lazar predictions
290
- class LazarPrediction < Dataset
236
+ class LazarPrediction #< Dataset
291
237
  field :creator, type: String
292
- field :prediction_feature_id, type: String
238
+ field :prediction_feature_id, type: BSON::ObjectId
239
+ field :predictions, type: Hash, default: {}
293
240
 
294
241
  def prediction_feature
295
242
  Feature.find prediction_feature_id
296
243
  end
297
244
 
298
- end
299
-
300
- # Dataset for descriptors (physchem)
301
- class DescriptorDataset < Dataset
302
- field :feature_calculation_algorithm, type: String
303
-
304
- end
305
-
306
- class ScaledDataset < DescriptorDataset
307
-
308
- field :centers, type: Array, default: []
309
- field :scales, type: Array, default: []
245
+ def compounds
246
+ substances.select{|s| s.is_a? Compound}
247
+ end
310
248
 
311
- def original_value value, i
312
- value * scales[i] + centers[i]
249
+ def substances
250
+ predictions.keys.collect{|id| Substance.find id}
313
251
  end
314
- end
315
252
 
316
- # Dataset for fminer descriptors
317
- class FminerDataset < DescriptorDataset
318
- field :training_algorithm, type: String
319
- field :training_dataset_id, type: BSON::ObjectId
320
- field :training_feature_id, type: BSON::ObjectId
321
- field :training_parameters, type: Hash
322
253
  end
323
254
 
324
255
  end
data/lib/feature.rb CHANGED
@@ -2,27 +2,28 @@ module OpenTox
2
2
 
3
3
  # Basic feature class
4
4
  class Feature
5
- field :nominal, type: Boolean
6
- field :numeric, type: Boolean
7
5
  field :measured, type: Boolean
8
6
  field :calculated, type: Boolean
7
+ field :category, type: String
8
+ field :unit, type: String
9
+ field :conditions, type: Hash
10
+
11
+ def nominal?
12
+ self.class == NominalFeature
13
+ end
14
+
15
+ def numeric?
16
+ self.class == NumericFeature
17
+ end
9
18
  end
10
19
 
11
20
  # Feature for categorical variables
12
21
  class NominalFeature < Feature
13
22
  field :accept_values, type: Array
14
- def initialize params
15
- super params
16
- nominal = true
17
- end
18
23
  end
19
24
 
20
25
  # Feature for quantitative variables
21
26
  class NumericFeature < Feature
22
- def initialize params
23
- super params
24
- numeric = true
25
- end
26
27
  end
27
28
 
28
29
  # Feature for SMARTS fragments
@@ -34,12 +35,4 @@ module OpenTox
34
35
  end
35
36
  end
36
37
 
37
- # Feature for categorical bioassay results
38
- class NominalBioAssay < NominalFeature
39
- end
40
-
41
- # Feature for quantitative bioassay results
42
- class NumericBioAssay < NumericFeature
43
- end
44
-
45
38
  end
data/lib/feature_selection.rb ADDED
@@ -0,0 +1,42 @@
1
+ module OpenTox
2
+ module Algorithm
3
+
4
+ class FeatureSelection
5
+
6
+ def self.correlation_filter model
7
+ relevant_features = {}
8
+ R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
9
+ model.descriptor_weights = []
10
+ selected_variables = []
11
+ selected_descriptor_ids = []
12
+ model.independent_variables.each_with_index do |v,i|
13
+ v.collect!{|n| to_r(n)}
14
+ R.assign "independent", v
15
+ begin
16
+ R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')"
17
+ pvalue = R.eval("cor$p.value").to_ruby
18
+ if pvalue <= 0.05
19
+ model.descriptor_weights << R.eval("cor$estimate").to_ruby**2
20
+ selected_variables << v
21
+ selected_descriptor_ids << model.descriptor_ids[i]
22
+ end
23
+ rescue
24
+ warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
25
+ end
26
+ end
27
+
28
+ model.independent_variables = selected_variables
29
+ model.descriptor_ids = selected_descriptor_ids
30
+ model
31
+ end
32
+
33
+ def self.to_r v
34
+ return 0 if v == false
35
+ return 1 if v == true
36
+ v
37
+ end
38
+
39
+ end
40
+
41
+ end
42
+ end
data/lib/import.rb ADDED
@@ -0,0 +1,122 @@
1
+ module OpenTox
2
+
3
+ module Import
4
+
5
+ class Enanomapper
6
+ include OpenTox
7
+
8
+ # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
9
+ def self.import
10
+ datasets = {}
11
+ bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
12
+ bundles.each do |bundle|
13
+ datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip)
14
+ $logger.debug bundle["title"].strip
15
+ nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
16
+ nanoparticles.each_with_index do |np,n|
17
+ core_id = nil
18
+ coating_ids = []
19
+ np["composition"].each do |c|
20
+ uri = c["component"]["compound"]["URI"]
21
+ uri = CGI.escape File.join(uri,"&media=application/json")
22
+ data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
23
+ smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
24
+ names = []
25
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
26
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
27
+ if smiles
28
+ compound = Compound.find_or_create_by(:smiles => smiles)
29
+ compound.name = names.first
30
+ compound.names = names.compact
31
+ else
32
+ compound = Compound.find_or_create_by(:name => names.first,:names => names.compact)
33
+ end
34
+ compound.save
35
+ if c["relation"] == "HAS_CORE"
36
+ core_id = compound.id.to_s
37
+ elsif c["relation"] == "HAS_COATING"
38
+ coating_ids << compound.id.to_s
39
+ end
40
+ end if np["composition"]
41
+ nanoparticle = Nanoparticle.find_or_create_by(
42
+ :name => np["values"]["https://data.enanomapper.net/identifier/name"],
43
+ :source => np["compound"]["URI"],
44
+ :core_id => core_id,
45
+ :coating_ids => coating_ids
46
+ )
47
+ np["bundles"].keys.each do |bundle_uri|
48
+ nanoparticle.dataset_ids << datasets[bundle_uri].id
49
+ end
50
+
51
+ studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
52
+ studies.each do |study|
53
+ dataset = datasets[np["bundles"].keys.first]
54
+ proteomics_features = {}
55
+ category = study["protocol"]["topcategory"]
56
+ source = study["protocol"]["category"]["term"]
57
+ study["effects"].each do |effect|
58
+
59
+ effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
60
+ effect["conditions"].delete_if { |k, v| v.nil? }
61
+
62
+ if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
63
+
64
+ JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
65
+ proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
66
+ nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
67
+ end
68
+ else
69
+ name = effect["endpoint"]
70
+ unit = effect["result"]["unit"]
71
+ warnings = []
72
+ case name
73
+ when "Log2 transformed" # use a sensible name
74
+ name = "log2(Net cell association)"
75
+ warnings = ["Original name was 'Log2 transformed'"]
76
+ unit = "log2(mL/ug(Mg))"
77
+ when "Total protein (BCA assay)"
78
+ category = "P-CHEM"
79
+ warnings = ["Category changed from TOX to P-CHEM"]
80
+ end
81
+ feature = klass.find_or_create_by(
82
+ :name => name,
83
+ :unit => unit,
84
+ :category => category,
85
+ :conditions => effect["conditions"],
86
+ :source => study["protocol"]["category"]["term"],
87
+ :measured => true,
88
+ :warnings => warnings
89
+ )
90
+ nanoparticle.parse_ambit_value feature, effect["result"], dataset
91
+ end
92
+ end
93
+ end
94
+ nanoparticle.save
95
+ print "#{n}, "
96
+ end
97
+ puts
98
+ end
99
+ datasets.each { |u,d| d.save }
100
+ end
101
+
102
+ =begin
103
+ def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
104
+ #get list of bundle URIs
105
+ bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
106
+ datasets = []
107
+ bundles.each do |bundle|
108
+ uri = bundle["URI"]
109
+ study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
110
+ study["@graph"].each do |i|
111
+ puts i.to_yaml if i.keys.include? "sio:has-value"
112
+ end
113
+ end
114
+ datasets.collect{|d| d.id}
115
+ end
116
+ =end
117
+
118
+ end
119
+
120
+ end
121
+
122
+ end
data/lib/lazar.rb CHANGED
@@ -48,6 +48,7 @@ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
48
48
  R = Rserve::Connection.new
49
49
  R.eval "
50
50
  suppressPackageStartupMessages({
51
+ library(labeling,lib=\"#{rlib}\")
51
52
  library(iterators,lib=\"#{rlib}\")
52
53
  library(foreach,lib=\"#{rlib}\")
53
54
  library(ggplot2,lib=\"#{rlib}\")
@@ -56,12 +57,14 @@ suppressPackageStartupMessages({
56
57
  library(pls,lib=\"#{rlib}\")
57
58
  library(caret,lib=\"#{rlib}\")
58
59
  library(doMC,lib=\"#{rlib}\")
60
+ library(randomForest,lib=\"#{rlib}\")
61
+ library(plyr,lib=\"#{rlib}\")
59
62
  registerDoMC(#{NR_CORES})
60
63
  })
61
64
  "
62
65
 
63
66
  # OpenTox classes and includes
64
- CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
67
+ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
65
68
 
66
69
  [ # be aware of the require sequence as it affects class/method overwrites
67
70
  "overwrite.rb",
@@ -70,15 +73,22 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
70
73
  "opentox.rb",
71
74
  "feature.rb",
72
75
  "physchem.rb",
76
+ "substance.rb",
73
77
  "compound.rb",
78
+ "nanoparticle.rb",
74
79
  "dataset.rb",
75
80
  "algorithm.rb",
81
+ "similarity.rb",
82
+ "feature_selection.rb",
76
83
  "model.rb",
77
84
  "classification.rb",
78
85
  "regression.rb",
86
+ "caret.rb",
87
+ "validation-statistics.rb",
79
88
  "validation.rb",
80
- "crossvalidation.rb",
89
+ "train-test-validation.rb",
81
90
  "leave-one-out-validation.rb",
82
- "experiment.rb",
91
+ "crossvalidation.rb",
92
+ #"experiment.rb",
93
+ "import.rb",
83
94
  ].each{ |f| require_relative f }
84
- OpenTox::PhysChem.descriptors # load descriptor features