lazar 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
@@ -0,0 +1,187 @@
1
+ module OpenTox
2
+
3
+ class CrossValidation
4
+ field :validation_ids, type: Array, default: []
5
+ field :folds, type: Integer
6
+ field :nr_instances, type: Integer
7
+ field :nr_unpredicted, type: Integer
8
+ field :predictions, type: Array
9
+ field :finished_at, type: Time
10
+ end
11
+
12
# Cross-validation for classification models.
class ClassificationCrossValidation < CrossValidation

  field :accept_values, type: Array
  field :confusion_matrix, type: Array
  field :weighted_confusion_matrix, type: Array
  field :accuracy, type: Float
  field :weighted_accuracy, type: Float
  field :true_rate, type: Hash
  field :predictivity, type: Hash
  # TODO auc, f-measure (usability??)

  # Run an n-fold cross-validation for a classification model.
  # @param model [OpenTox::Model] model with training_dataset_id and prediction_feature_id
  # @param n [Integer] number of folds
  # @return [ClassificationCrossValidation] the saved cross-validation
  def self.create model, n=10
    cv = self.new
    validation_ids = []
    nr_instances = 0
    nr_unpredicted = 0
    predictions = []
    # e.g. ClassificationCrossValidation -> ClassificationValidation
    validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
    accept_values = Feature.find(model.prediction_feature_id).accept_values
    # NOTE: was Array.new(size, 0){block}; the block supersedes the default
    # value argument (Ruby warns) — the block alone is sufficient
    confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
    weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
    fold_nr = 1
    training_dataset = Dataset.find model.training_dataset_id
    training_dataset.folds(n).each do |fold|
      t = Time.now
      $logger.debug "Fold #{fold_nr}"
      validation = validation_class.create(model, fold[0], fold[1])
      validation_ids << validation.id
      nr_instances += validation.nr_instances
      nr_unpredicted += validation.nr_unpredicted
      predictions += validation.predictions
      # accumulate the per-fold confusion matrices
      validation.confusion_matrix.each_with_index do |r,i|
        r.each_with_index do |c,j|
          confusion_matrix[i][j] += c
          weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
        end
      end
      $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
      fold_nr += 1
    end
    true_rate = {}
    predictivity = {}
    accept_values.each_with_index do |v,i|
      true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
      predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|row| row[i]}.reduce(:+).to_f
    end
    confidence_sum = 0
    weighted_confusion_matrix.each do |r|
      r.each { |c| confidence_sum += c }
    end
    # accuracy = trace of the confusion matrix / predicted instances
    # (was confusion_matrix[0][0]+confusion_matrix[1][1], which is wrong
    # for more than two classes; identical for the binary case)
    correct = (0...accept_values.size).inject(0){|sum,i| sum + confusion_matrix[i][i]}
    weighted_correct = (0...accept_values.size).inject(0){|sum,i| sum + weighted_confusion_matrix[i][i]}
    cv.update_attributes(
      folds: n,
      # was collected but never persisted (cf. RegressionCrossValidation)
      validation_ids: validation_ids,
      nr_instances: nr_instances,
      nr_unpredicted: nr_unpredicted,
      accept_values: accept_values,
      confusion_matrix: confusion_matrix,
      weighted_confusion_matrix: weighted_confusion_matrix,
      accuracy: correct/(nr_instances-nr_unpredicted).to_f,
      weighted_accuracy: weighted_correct/confidence_sum.to_f,
      true_rate: true_rate,
      predictivity: predictivity,
      predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
      finished_at: Time.now
    )
    cv.save
    cv
  end

  #Average area under roc 0.646
  #Area under roc 0.646
  #F measure carcinogen: 0.769, noncarcinogen: 0.348
end
87
+
88
# Cross-validation for regression models.
# NOTE(review): originally inherited from Validation while re-declaring
# validation_ids/folds and persisting predictions/finished_at — all of which
# CrossValidation provides; CrossValidation is the intended superclass
# (cf. ClassificationCrossValidation). Confirm no caller relies on the
# Validation ancestry.
class RegressionCrossValidation < CrossValidation

  field :rmse, type: Float
  field :mae, type: Float
  field :weighted_rmse, type: Float
  field :weighted_mae, type: Float

  # Run an n-fold cross-validation for a regression model.
  # @param model [OpenTox::Model] model with training_dataset_id
  # @param n [Integer] number of folds
  # @return [RegressionCrossValidation] the saved cross-validation
  def self.create model, n=10
    cv = self.new
    validation_ids = []
    nr_instances = 0
    nr_unpredicted = 0
    predictions = []
    validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
    fold_nr = 1
    training_dataset = Dataset.find model.training_dataset_id
    training_dataset.folds(n).each do |fold|
      t = Time.now
      $logger.debug "Predicting fold #{fold_nr}"
      validation = validation_class.create(model, fold[0], fold[1])
      validation_ids << validation.id
      nr_instances += validation.nr_instances
      nr_unpredicted += validation.nr_unpredicted
      predictions += validation.predictions
      $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
      fold_nr += 1
    end
    rmse = 0
    weighted_rmse = 0
    mae = 0
    weighted_mae = 0
    # BUG FIX: the counter was named `n`, clobbering the fold count, so
    # `folds: n` stored the number of predictions instead of the fold count
    predicted = 0
    confidence_sum = 0
    predictions.each do |pred|
      compound_id,activity,prediction,confidence = pred
      if activity and prediction
        error = prediction-activity
        rmse += error**2
        weighted_rmse += confidence*error**2
        mae += error.abs
        weighted_mae += confidence*error.abs
        predicted += 1
        confidence_sum += confidence
      else
        # TODO: create warnings
        $logger.debug "No activity or prediction for #{pred.inspect}"
      end
    end
    # guard against division by zero when nothing could be predicted
    if predicted > 0
      mae = mae/predicted
      weighted_mae = weighted_mae/confidence_sum
      rmse = Math.sqrt(rmse/predicted)
      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
    end
    cv.update_attributes(
      folds: n,
      validation_ids: validation_ids,
      nr_instances: nr_instances,
      nr_unpredicted: nr_unpredicted,
      predictions: predictions.sort{|a,b| b[3] <=> a[3]},
      mae: mae,
      rmse: rmse,
      weighted_mae: weighted_mae,
      weighted_rmse: weighted_rmse,
      finished_at: Time.now # was never set (cf. ClassificationCrossValidation)
    )
    cv.save
    cv
  end

  # Plot log(prediction) vs. log(measurement) with RMSE/MAE/r^2 annotation.
  # @return [String] path of the generated svg file
  def plot
    x = predictions.collect{|p| p[1]}
    y = predictions.collect{|p| p[2]}
    R.assign "Measurement", x
    R.assign "Prediction", y
    R.eval "par(pty='s')" # sets the plot type to be square
    R.eval "error <- Measurement-Prediction"
    R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
    R.eval "mae <- mean( abs(error), na.rm = TRUE)"
    R.eval "r <- cor(log(Prediction),log(Measurement))"
    R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
    R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
    R.eval "abline(0,1,col='blue')"
    R.eval "dev.off()"
    "/tmp/#{id.to_s}.svg"
  end
end
185
+
186
+
187
+ end
data/lib/dataset.rb ADDED
@@ -0,0 +1,334 @@
1
+ require 'csv'
2
+ require 'tempfile'
3
+
4
+ module OpenTox
5
+
6
+ class Dataset
7
+
8
attr_writer :data_entries

# associations like has_many, belongs_to deteriorate performance
field :feature_ids, type: Array, default: []
field :compound_ids, type: Array, default: []
# GridFS id of the marshalled data_entries blob.
# BUG FIX: default was `[]`, which is not a meaningful default for a
# BSON::ObjectId; leave it unset until save_all stores the blob.
field :data_entries_id, type: BSON::ObjectId
field :source, type: String
field :warnings, type: Array, default: []

# Save all data including data_entries (stored as a Marshal dump in GridFS)
# Should be used instead of save
def save_all
  dump = Marshal.dump(@data_entries)
  file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
  data_entries_id = $gridfs.insert_one(file)
  update(:data_entries_id => data_entries_id)
  save
end
26
+
27
+ # Readers
28
+
29
+ # Get all compounds
30
# All compounds of the dataset (memoized after the first call).
# @return [Array<OpenTox::Compound>]
def compounds
  @compounds ||= compound_ids.collect { |cid| OpenTox::Compound.find cid }
end
34
+
35
+ # Get all features
36
# All features of the dataset (memoized after the first call).
# @return [Array<OpenTox::Feature>]
def features
  @features ||= feature_ids.collect { |fid| OpenTox::Feature.find(fid) }
end
40
+
41
+ # Get all data_entries
42
# Lazily load the data_entries matrix from GridFS (memoized).
# @return [Array<Array>] 2D array: one row per compound, one column per feature
# @raise bad_request_error when the stored blob is inconsistent with the dataset
def data_entries
  unless @data_entries
    t = Time.now
    data_entry_file = $gridfs.find_one(_id: data_entries_id)
    if data_entry_file.nil?
      @data_entries = []
    else
      @data_entries = Marshal.load(data_entry_file.data)
      bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
      bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
      # BUG FIX: was `@data_entries..first.size` (double dot builds a Range
      # and raised at runtime instead of reporting the column count)
      bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
      $logger.debug "Retrieving data: #{Time.now-t}"
    end
  end
  @data_entries
end
58
+
59
+ # Find data entry values for a given compound and feature
60
+ # @param compound [OpenTox::Compound] OpenTox Compound object
61
+ # @param feature [OpenTox::Feature] OpenTox Feature object
62
+ # @return [Array] Data entry values
63
# Find data entry values for a given compound and feature
# @param compound [OpenTox::Compound] OpenTox Compound object
# @param feature [OpenTox::Feature] OpenTox Feature object
# @return [Array] one value per occurrence of the compound in the dataset
def values(compound, feature)
  col = feature_ids.index feature.id
  matching_rows = compound_ids.each_index.select { |r| compound_ids[r] == compound.id }
  matching_rows.map { |row| data_entries[row][col] }
end
68
+
69
+ # Writers
70
+
71
+ # Set compounds
72
# Set the dataset compounds (stored as their ids)
# @param compounds [Array<OpenTox::Compound>]
def compounds=(compounds)
  self.compound_ids = compounds.map(&:id)
end
75
+
76
+ # Set features
77
# Set the dataset features (stored as their ids)
# @param features [Array<OpenTox::Feature>]
def features=(features)
  self.feature_ids = features.map(&:id)
end
80
+
81
+ # Dataset operations
82
+
83
+ # Split a dataset into n folds
84
+ # @param [Integer] number of folds
85
+ # @return [Array] Array with folds [training_dataset,test_dataset]
86
# Split a dataset into n folds for cross-validation.
# Compounds are shuffled, then partitioned into n test chunks; each fold's
# training set is everything outside its test chunk. Both datasets of every
# fold are persisted via save_all.
# @param n [Integer] number of folds
# @return [Array] n pairs of [training_dataset, test_dataset]
def folds n
  len = compound_ids.size
  indices = (0..len-1).to_a.shuffle
  mid = (len/n)
  chunks = []
  start = 0
  1.upto(n) do |fold|
    # distribute the remainder (len % n) over the first folds
    last = start + mid
    last = last - 1 unless len % n >= fold
    test_idxs = indices[start..last] || []
    test_cids = test_idxs.collect { |idx| compound_ids[idx] }
    test_rows = test_idxs.collect { |idx| data_entries[idx] }
    test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => feature_ids, :data_entries => test_rows)
    training_idxs = indices - test_idxs
    training_cids = training_idxs.collect { |idx| compound_ids[idx] }
    training_rows = training_idxs.collect { |idx| data_entries[idx] }
    training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => feature_ids, :data_entries => training_rows)
    test_dataset.save_all
    training_dataset.save_all
    chunks << [training_dataset, test_dataset]
    start = last + 1
  end
  chunks
end
110
+
111
+ # Diagnostics
112
+
113
# Plot this dataset's feature values against the activities (first column)
# of the given training dataset.
# @param training_dataset [OpenTox::Dataset]
def correlation_plot training_dataset
  # TODO: create/store svg
  activities = training_dataset.data_entries.collect { |de| de.first }
  R.assign "features", data_entries
  R.assign "activities", activities
  R.eval "featurePlot(features,activities)"
end
119
+
120
# Density plot of the log-transformed activities (first feature column).
def density_plot
  # TODO: create/store svg
  activities = data_entries.collect { |row| row.first }
  R.assign "acts", activities
  R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')"
end
125
+
126
+ # Serialisation
127
+
128
+ # converts dataset to csv format including compound smiles as first column, other column headers are feature titles
129
+ # @return [String]
130
# Serialise the dataset as CSV: first column holds the compound identifier
# (SMILES by default, InChI on request), remaining columns the feature titles.
# @param inchi [Boolean] use InChI instead of SMILES identifiers
# @return [String] CSV document
def to_csv(inchi=false)
  CSV.generate do |csv|
    csv << [inchi ? "InChI" : "SMILES"] + features.collect { |f| f.title }
    compounds.each_with_index do |compound, row|
      csv << [inchi ? compound.inchi : compound.smiles] + data_entries[row]
    end
  end
end
138
+
139
+
140
+ # Parsers
141
+
142
+ # Create a dataset from file (csv,sdf,...)
143
+ # @param filename [String]
144
+ # @return [String] dataset uri
145
+ # TODO
146
+ #def self.from_sdf_file
147
+ #end
148
+
149
+ # Create a dataset from CSV file
150
+ # TODO: document structure
151
# Create a dataset from a CSV file
# TODO: document structure
# @param file [String] path of the CSV file
# @param source [String, nil] provenance, defaults to the file path
# @param bioassay [Boolean] create BioAssay features instead of plain features
# @return [OpenTox::Dataset]
def self.from_csv_file file, source=nil, bioassay=true
  source ||= file
  rows = CSV.read file, :skip_blanks => true
  dataset = self.new(:source => source, :name => File.basename(file))
  dataset.parse_table rows, bioassay
  dataset
end
158
+
159
+ # parse data in tabular format (e.g. from csv)
160
+ # does a lot of guesswork in order to determine feature types
161
# parse data in tabular format (e.g. from csv)
# does a lot of guesswork in order to determine feature types
# @param table [Array<Array<String>>] first row is the header
# @param bioassay [Boolean] create BioAssay features instead of plain features
def parse_table table, bioassay=true

  time = Time.now

  # features
  feature_names = table.shift.collect{|f| f.strip}
  warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
  compound_format = feature_names.shift.strip
  bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i

  numeric = []
  # guess feature types
  feature_names.each_with_index do |f,i|
    metadata = {:name => f}
    values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
    types = values.collect{|v| v.numeric? ? true : false}.uniq
    # BUG FIX: reset per column so an empty column cannot silently reuse the
    # previous iteration's feature (or crash on nil)
    feature = nil
    if values.size == 0 # empty feature
    elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
      metadata["numeric"] = true
      numeric[i] = true
    else
      metadata["nominal"] = true
      metadata["accept_values"] = values
      numeric[i] = false
    end
    if bioassay
      if metadata["numeric"]
        feature = NumericBioAssay.find_or_create_by(metadata)
      elsif metadata["nominal"]
        feature = NominalBioAssay.find_or_create_by(metadata)
      end
    else
      # BUG FIX: was `metadata.merge(...)`, whose result was discarded, so
      # :measured/:calculated were never applied
      metadata.merge!({:measured => false, :calculated => true})
      if metadata["numeric"]
        feature = NumericFeature.find_or_create_by(metadata)
      elsif metadata["nominal"]
        feature = NominalFeature.find_or_create_by(metadata)
      end
    end
    # empty columns produce no feature; TODO consider warning about them
    feature_ids << feature.id if feature
  end

  $logger.debug "Feature values: #{Time.now-time}"
  time = Time.now

  r = -1
  compound_time = 0

  # compounds and values
  @data_entries = []

  table.each_with_index do |vals,i|
    ct = Time.now
    identifier = vals.shift
    warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
    begin
      case compound_format
      when /SMILES/i
        compound = OpenTox::Compound.from_smiles(identifier)
      when /InChI/i
        compound = OpenTox::Compound.from_inchi(identifier)
      end
    rescue
      # compound parsers may also raise
      compound = nil
    end
    if compound.nil?
      # compound parsers may return nil
      warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
      next
    end
    # TODO insert empty compounds to keep positions?
    compound_time += Time.now-ct

    r += 1
    unless vals.size == feature_ids.size # way cheaper than accessing features
      # message now reports feature_ids.size instead of features.size, which
      # forced the expensive feature lookup the check above tries to avoid
      warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{feature_ids.size}), all entries are ignored."
      next
    end

    compound_ids << compound.id
    @data_entries << Array.new(feature_ids.size)

    vals.each_with_index do |v,j|
      if v.blank?
        warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
        next
      elsif numeric[j]
        @data_entries.last[j] = v.to_f
      else
        @data_entries.last[j] = v.strip
      end
    end
  end
  compounds.duplicates.each do |compound|
    positions = []
    compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
    warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
  end

  $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
  time = Time.now
  save_all
  $logger.debug "Saving: #{Time.now-time}"

end
267
+
268
+ =begin
269
+ # TODO remove
270
+
271
+ # Create a dataset with compounds and features
272
+ def self.create compounds, features, warnings=[], source=nil
273
+ dataset = Dataset.new(:warnings => warnings)
274
+ dataset.compounds = compounds
275
+ dataset.features = features
276
+ dataset
277
+ end
278
+ # merge dataset (i.e. append features)
279
+ def +(dataset)
280
+ bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
281
+ bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
282
+ self.feature_ids ||= []
283
+ self.feature_ids = self.feature_ids + dataset.feature_ids
284
+ @data_entries ||= Array.new(compound_ids.size){[]}
285
+ @data_entries.each_with_index do |row,i|
286
+ @data_entries[i] = row + dataset.fingerprint(compounds[i])
287
+ end
288
+ self
289
+
290
+ end
291
+
292
+ def fingerprint(compound)
293
+ i = compound_ids.index(compound.id)
294
+ i.nil? ? nil : data_entries[i]
295
+ end
296
+ =end
297
+
298
+ # Fill unset data entries
299
+ # @param any value
300
# Fill unset data entries
# @param n [Object] value to substitute for every nil entry
def fill_nil_with n
  # BUG FIX: trigger the lazy GridFS load so @data_entries is initialized;
  # calling this before data_entries raised NoMethodError on nil
  data_entries
  (0 .. compound_ids.size-1).each do |i|
    @data_entries[i] ||= []
    (0 .. feature_ids.size-1).each do |j|
      @data_entries[i][j] ||= n
    end
  end
end
308
+ end
309
+
310
+ # Dataset for lazar predictions
311
# Dataset for lazar predictions
class LazarPrediction < Dataset
  # who created the prediction
  field :creator, type: String
  # id of the predicted feature
  field :prediction_feature_id, type: String

  # @return [OpenTox::Feature] the feature that was predicted
  def prediction_feature
    Feature.find prediction_feature_id
  end
end
320
+
321
+ # Dataset for descriptors (physchem)
322
# Dataset for descriptors (physchem)
class DescriptorDataset < Dataset
  # name of the algorithm used to calculate the feature values
  field :feature_calculation_algorithm, type: String
end
325
+
326
+ # Dataset for fminer descriptors
327
# Dataset for fminer descriptors
class FminerDataset < DescriptorDataset
  # fminer algorithm used for training
  field :training_algorithm, type: String
  # provenance of the training run
  field :training_dataset_id, type: BSON::ObjectId
  field :training_feature_id, type: BSON::ObjectId
  field :training_parameters, type: Hash
end
333
+
334
+ end