lazar 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (98)
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
@@ -0,0 +1,187 @@
1
+ module OpenTox
2
+
3
+ class CrossValidation
4
+ field :validation_ids, type: Array, default: []
5
+ field :folds, type: Integer
6
+ field :nr_instances, type: Integer
7
+ field :nr_unpredicted, type: Integer
8
+ field :predictions, type: Array
9
+ field :finished_at, type: Time
10
+ end
11
+
12
+ class ClassificationCrossValidation < CrossValidation
13
+
14
+ field :accept_values, type: Array
15
+ field :confusion_matrix, type: Array
16
+ field :weighted_confusion_matrix, type: Array
17
+ field :accuracy, type: Float
18
+ field :weighted_accuracy, type: Float
19
+ field :true_rate, type: Hash
20
+ field :predictivity, type: Hash
21
+ # TODO auc, f-measure (usability??)
22
+
23
+ def self.create model, n=10
24
+ cv = self.new
25
+ validation_ids = []
26
+ nr_instances = 0
27
+ nr_unpredicted = 0
28
+ predictions = []
29
+ validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
30
+ accept_values = Feature.find(model.prediction_feature_id).accept_values
31
+ confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
32
+ weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
33
+ true_rate = {}
34
+ predictivity = {}
35
+ fold_nr = 1
36
+ training_dataset = Dataset.find model.training_dataset_id
37
+ training_dataset.folds(n).each do |fold|
38
+ t = Time.now
39
+ $logger.debug "Fold #{fold_nr}"
40
+ validation = validation_class.create(model, fold[0], fold[1])
41
+ validation_ids << validation.id
42
+ nr_instances += validation.nr_instances
43
+ nr_unpredicted += validation.nr_unpredicted
44
+ predictions += validation.predictions
45
+ validation.confusion_matrix.each_with_index do |r,i|
46
+ r.each_with_index do |c,j|
47
+ confusion_matrix[i][j] += c
48
+ weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
49
+ end
50
+ end
51
+ $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
52
+ fold_nr +=1
53
+ end
54
+ true_rate = {}
55
+ predictivity = {}
56
+ accept_values.each_with_index do |v,i|
57
+ true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
58
+ predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
59
+ end
60
+ confidence_sum = 0
61
+ weighted_confusion_matrix.each do |r|
62
+ r.each do |c|
63
+ confidence_sum += c
64
+ end
65
+ end
66
+ cv.update_attributes(
67
+ nr_instances: nr_instances,
68
+ nr_unpredicted: nr_unpredicted,
69
+ accept_values: accept_values,
70
+ confusion_matrix: confusion_matrix,
71
+ weighted_confusion_matrix: weighted_confusion_matrix,
72
+ accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
73
+ weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
74
+ true_rate: true_rate,
75
+ predictivity: predictivity,
76
+ predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
77
+ finished_at: Time.now
78
+ )
79
+ cv.save
80
+ cv
81
+ end
82
+
83
+ #Average area under roc 0.646
84
+ #Area under roc 0.646
85
+ #F measure carcinogen: 0.769, noncarcinogen: 0.348
86
+ end
87
+
88
+ class RegressionCrossValidation < Validation
89
+
90
+ field :validation_ids, type: Array, default: []
91
+ field :folds, type: Integer
92
+ field :rmse, type: Float
93
+ field :mae, type: Float
94
+ field :weighted_rmse, type: Float
95
+ field :weighted_mae, type: Float
96
+
97
+ def self.create model, n=10
98
+ cv = self.new
99
+ validation_ids = []
100
+ nr_instances = 0
101
+ nr_unpredicted = 0
102
+ predictions = []
103
+ validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
104
+ fold_nr = 1
105
+ training_dataset = Dataset.find model.training_dataset_id
106
+ training_dataset.folds(n).each do |fold|
107
+ t = Time.now
108
+ $logger.debug "Predicting fold #{fold_nr}"
109
+
110
+ validation = validation_class.create(model, fold[0], fold[1])
111
+ validation_ids << validation.id
112
+ nr_instances += validation.nr_instances
113
+ nr_unpredicted += validation.nr_unpredicted
114
+ predictions += validation.predictions
115
+ $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
116
+ fold_nr +=1
117
+ end
118
+ rmse = 0
119
+ weighted_rmse = 0
120
+ rse = 0
121
+ weighted_rse = 0
122
+ mae = 0
123
+ weighted_mae = 0
124
+ rae = 0
125
+ weighted_rae = 0
126
+ n = 0
127
+ confidence_sum = 0
128
+ predictions.each do |pred|
129
+ compound_id,activity,prediction,confidence = pred
130
+ if activity and prediction
131
+ error = prediction-activity
132
+ rmse += error**2
133
+ weighted_rmse += confidence*error**2
134
+ mae += error.abs
135
+ weighted_mae += confidence*error.abs
136
+ n += 1
137
+ confidence_sum += confidence
138
+ else
139
+ # TODO: create warnings
140
+ p pred
141
+ end
142
+ end
143
+ mae = mae/n
144
+ weighted_mae = weighted_mae/confidence_sum
145
+ rmse = Math.sqrt(rmse/n)
146
+ weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
147
+ cv.update_attributes(
148
+ folds: n,
149
+ validation_ids: validation_ids,
150
+ nr_instances: nr_instances,
151
+ nr_unpredicted: nr_unpredicted,
152
+ predictions: predictions.sort{|a,b| b[3] <=> a[3]},
153
+ mae: mae,
154
+ rmse: rmse,
155
+ weighted_mae: weighted_mae,
156
+ weighted_rmse: weighted_rmse
157
+ )
158
+ cv.save
159
+ cv
160
+ end
161
+
162
+ def plot
163
+ # RMSE
164
+ x = predictions.collect{|p| p[1]}
165
+ y = predictions.collect{|p| p[2]}
166
+ R.assign "Measurement", x
167
+ R.assign "Prediction", y
168
+ R.eval "par(pty='s')" # sets the plot type to be square
169
+ #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
170
+ #R.eval "error <- log(Measurement)-log(Prediction)"
171
+ R.eval "error <- Measurement-Prediction"
172
+ R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
173
+ R.eval "mae <- mean( abs(error), na.rm = TRUE)"
174
+ R.eval "r <- cor(log(Prediction),log(Measurement))"
175
+ R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
176
+ R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
177
+ #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
178
+ #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
179
+ R.eval "abline(0,1,col='blue')"
180
+ #R.eval "abline(fitline,col='red')"
181
+ R.eval "dev.off()"
182
+ "/tmp/#{id.to_s}.svg"
183
+ end
184
+ end
185
+
186
+
187
+ end
data/lib/dataset.rb ADDED
@@ -0,0 +1,334 @@
1
+ require 'csv'
2
+ require 'tempfile'
3
+
4
+ module OpenTox
5
+
6
+ class Dataset
7
+
8
+ attr_writer :data_entries
9
+
10
+ # associations like has_many, belongs_to deteriorate performance
11
+ field :feature_ids, type: Array, default: []
12
+ field :compound_ids, type: Array, default: []
13
+ field :data_entries_id, type: BSON::ObjectId, default: []
14
+ field :source, type: String
15
+ field :warnings, type: Array, default: []
16
+
17
+ # Save all data including data_entries
18
+ # Should be used instead of save
19
+ def save_all
20
+ dump = Marshal.dump(@data_entries)
21
+ file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
22
+ data_entries_id = $gridfs.insert_one(file)
23
+ update(:data_entries_id => data_entries_id)
24
+ save
25
+ end
26
+
27
+ # Readers
28
+
29
+ # Get all compounds
30
+ def compounds
31
+ @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
32
+ @compounds
33
+ end
34
+
35
+ # Get all features
36
+ def features
37
+ @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
38
+ @features
39
+ end
40
+
41
+ # Get all data_entries
42
+ def data_entries
43
+ unless @data_entries
44
+ t = Time.now
45
+ data_entry_file = $gridfs.find_one(_id: data_entries_id)
46
+ if data_entry_file.nil?
47
+ @data_entries = []
48
+ else
49
+ @data_entries = Marshal.load(data_entry_file.data)
50
+ bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
51
+ bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
52
+ bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries..first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
53
+ $logger.debug "Retrieving data: #{Time.now-t}"
54
+ end
55
+ end
56
+ @data_entries
57
+ end
58
+
59
+ # Find data entry values for a given compound and feature
60
+ # @param compound [OpenTox::Compound] OpenTox Compound object
61
+ # @param feature [OpenTox::Feature] OpenTox Feature object
62
+ # @return [Array] Data entry values
63
+ def values(compound, feature)
64
+ rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
65
+ col = feature_ids.index feature.id
66
+ rows.collect{|row| data_entries[row][col]}
67
+ end
68
+
69
+ # Writers
70
+
71
+ # Set compounds
72
+ def compounds=(compounds)
73
+ self.compound_ids = compounds.collect{|c| c.id}
74
+ end
75
+
76
+ # Set features
77
+ def features=(features)
78
+ self.feature_ids = features.collect{|f| f.id}
79
+ end
80
+
81
+ # Dataset operations
82
+
83
+ # Split a dataset into n folds
84
+ # @param [Integer] number of folds
85
+ # @return [Array] Array with folds [training_dataset,test_dataset]
86
+ def folds n
87
+ len = self.compound_ids.size
88
+ indices = (0..len-1).to_a.shuffle
89
+ mid = (len/n)
90
+ chunks = []
91
+ start = 0
92
+ 1.upto(n) do |i|
93
+ last = start+mid
94
+ last = last-1 unless len%n >= i
95
+ test_idxs = indices[start..last] || []
96
+ test_cids = test_idxs.collect{|i| self.compound_ids[i]}
97
+ test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
98
+ test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
99
+ training_idxs = indices-test_idxs
100
+ training_cids = training_idxs.collect{|i| self.compound_ids[i]}
101
+ training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
102
+ training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
103
+ test_dataset.save_all
104
+ training_dataset.save_all
105
+ chunks << [training_dataset,test_dataset]
106
+ start = last+1
107
+ end
108
+ chunks
109
+ end
110
+
111
+ # Diagnostics
112
+
113
+ def correlation_plot training_dataset
114
+ # TODO: create/store svg
115
+ R.assign "features", data_entries
116
+ R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
117
+ R.eval "featurePlot(features,activities)"
118
+ end
119
+
120
+ def density_plot
121
+ # TODO: create/store svg
122
+ R.assign "acts", data_entries.collect{|r| r.first }#.compact
123
+ R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')"
124
+ end
125
+
126
+ # Serialisation
127
+
128
+ # converts dataset to csv format including compound smiles as first column, other column headers are feature titles
129
+ # @return [String]
130
+ def to_csv(inchi=false)
131
+ CSV.generate() do |csv| #{:force_quotes=>true}
132
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.title}
133
+ compounds.each_with_index do |c,i|
134
+ csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
135
+ end
136
+ end
137
+ end
138
+
139
+
140
+ # Parsers
141
+
142
+ # Create a dataset from file (csv,sdf,...)
143
+ # @param filename [String]
144
+ # @return [String] dataset uri
145
+ # TODO
146
+ #def self.from_sdf_file
147
+ #end
148
+
149
+ # Create a dataset from CSV file
150
+ # TODO: document structure
151
+ def self.from_csv_file file, source=nil, bioassay=true
152
+ source ||= file
153
+ table = CSV.read file, :skip_blanks => true
154
+ dataset = self.new(:source => source, :name => File.basename(file))
155
+ dataset.parse_table table, bioassay
156
+ dataset
157
+ end
158
+
159
+ # parse data in tabular format (e.g. from csv)
160
+ # does a lot of guesswork in order to determine feature types
161
+ def parse_table table, bioassay=true
162
+
163
+ time = Time.now
164
+
165
+ # features
166
+ feature_names = table.shift.collect{|f| f.strip}
167
+ warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
168
+ compound_format = feature_names.shift.strip
169
+ bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
170
+
171
+ numeric = []
172
+ # guess feature types
173
+ feature_names.each_with_index do |f,i|
174
+ metadata = {:name => f}
175
+ values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
176
+ types = values.collect{|v| v.numeric? ? true : false}.uniq
177
+ if values.size == 0 # empty feature
178
+ elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
179
+ metadata["numeric"] = true
180
+ numeric[i] = true
181
+ else
182
+ metadata["nominal"] = true
183
+ metadata["accept_values"] = values
184
+ numeric[i] = false
185
+ end
186
+ if bioassay
187
+ if metadata["numeric"]
188
+ feature = NumericBioAssay.find_or_create_by(metadata)
189
+ elsif metadata["nominal"]
190
+ feature = NominalBioAssay.find_or_create_by(metadata)
191
+ end
192
+ else
193
+ metadata.merge({:measured => false, :calculated => true})
194
+ if metadata["numeric"]
195
+ feature = NumericFeature.find_or_create_by(metadata)
196
+ elsif metadata["nominal"]
197
+ feature = NominalFeature.find_or_create_by(metadata)
198
+ end
199
+ end
200
+ feature_ids << feature.id
201
+ end
202
+
203
+ $logger.debug "Feature values: #{Time.now-time}"
204
+ time = Time.now
205
+
206
+ r = -1
207
+ compound_time = 0
208
+ value_time = 0
209
+
210
+ # compounds and values
211
+ @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)}
212
+
213
+ table.each_with_index do |vals,i|
214
+ ct = Time.now
215
+ identifier = vals.shift
216
+ warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
217
+ begin
218
+ case compound_format
219
+ when /SMILES/i
220
+ compound = OpenTox::Compound.from_smiles(identifier)
221
+ when /InChI/i
222
+ compound = OpenTox::Compound.from_inchi(identifier)
223
+ end
224
+ rescue
225
+ compound = nil
226
+ end
227
+ if compound.nil?
228
+ # compound parsers may return nil
229
+ warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
230
+ next
231
+ end
232
+ # TODO insert empty compounds to keep positions?
233
+ compound_time += Time.now-ct
234
+
235
+ r += 1
236
+ unless vals.size == feature_ids.size # way cheaper than accessing features
237
+ warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
238
+ next
239
+ end
240
+
241
+ compound_ids << compound.id
242
+ @data_entries << Array.new(table.first.size-1)
243
+
244
+ vals.each_with_index do |v,j|
245
+ if v.blank?
246
+ warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
247
+ next
248
+ elsif numeric[j]
249
+ @data_entries.last[j] = v.to_f
250
+ else
251
+ @data_entries.last[j] = v.strip
252
+ end
253
+ end
254
+ end
255
+ compounds.duplicates.each do |compound|
256
+ positions = []
257
+ compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
258
+ warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
259
+ end
260
+
261
+ $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
262
+ time = Time.now
263
+ save_all
264
+ $logger.debug "Saving: #{Time.now-time}"
265
+
266
+ end
267
+
268
+ =begin
269
+ # TODO remove
270
+
271
+ # Create a dataset with compounds and features
272
+ def self.create compounds, features, warnings=[], source=nil
273
+ dataset = Dataset.new(:warnings => warnings)
274
+ dataset.compounds = compounds
275
+ dataset.features = features
276
+ dataset
277
+ end
278
+ # merge dataset (i.e. append features)
279
+ def +(dataset)
280
+ bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
281
+ bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
282
+ self.feature_ids ||= []
283
+ self.feature_ids = self.feature_ids + dataset.feature_ids
284
+ @data_entries ||= Array.new(compound_ids.size){[]}
285
+ @data_entries.each_with_index do |row,i|
286
+ @data_entries[i] = row + dataset.fingerprint(compounds[i])
287
+ end
288
+ self
289
+
290
+ end
291
+
292
+ def fingerprint(compound)
293
+ i = compound_ids.index(compound.id)
294
+ i.nil? ? nil : data_entries[i]
295
+ end
296
+ =end
297
+
298
+ # Fill unset data entries
299
+ # @param any value
300
+ def fill_nil_with n
301
+ (0 .. compound_ids.size-1).each do |i|
302
+ @data_entries[i] ||= []
303
+ (0 .. feature_ids.size-1).each do |j|
304
+ @data_entries[i][j] ||= n
305
+ end
306
+ end
307
+ end
308
+ end
309
+
310
+ # Dataset for lazar predictions
311
+ class LazarPrediction < Dataset
312
+ field :creator, type: String
313
+ field :prediction_feature_id, type: String
314
+
315
+ def prediction_feature
316
+ Feature.find prediction_feature_id
317
+ end
318
+
319
+ end
320
+
321
+ # Dataset for descriptors (physchem)
322
+ class DescriptorDataset < Dataset
323
+ field :feature_calculation_algorithm, type: String
324
+ end
325
+
326
+ # Dataset for fminer descriptors
327
+ class FminerDataset < DescriptorDataset
328
+ field :training_algorithm, type: String
329
+ field :training_dataset_id, type: BSON::ObjectId
330
+ field :training_feature_id, type: BSON::ObjectId
331
+ field :training_parameters, type: Hash
332
+ end
333
+
334
+ end