lazar 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
module OpenTox

  # Base class for cross-validation results.
  # Holds the statistics that are aggregated over all folds; subclasses add
  # endpoint-specific metrics (classification / regression).
  class CrossValidation
    # ids of the per-fold Validation documents
    field :validation_ids, type: Array, default: []
    # number of folds (n)
    field :folds, type: Integer
    # total number of instances over all folds
    field :nr_instances, type: Integer
    # number of instances without a prediction
    field :nr_unpredicted, type: Integer
    # flat list of [compound_id, measured, predicted, confidence] entries
    field :predictions, type: Array
    field :finished_at, type: Time
  end
|
|
11
|
+
|
|
12
|
+
class ClassificationCrossValidation < CrossValidation
|
|
13
|
+
|
|
14
|
+
field :accept_values, type: Array
|
|
15
|
+
field :confusion_matrix, type: Array
|
|
16
|
+
field :weighted_confusion_matrix, type: Array
|
|
17
|
+
field :accuracy, type: Float
|
|
18
|
+
field :weighted_accuracy, type: Float
|
|
19
|
+
field :true_rate, type: Hash
|
|
20
|
+
field :predictivity, type: Hash
|
|
21
|
+
# TODO auc, f-measure (usability??)
|
|
22
|
+
|
|
23
|
+
def self.create model, n=10
|
|
24
|
+
cv = self.new
|
|
25
|
+
validation_ids = []
|
|
26
|
+
nr_instances = 0
|
|
27
|
+
nr_unpredicted = 0
|
|
28
|
+
predictions = []
|
|
29
|
+
validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
|
|
30
|
+
accept_values = Feature.find(model.prediction_feature_id).accept_values
|
|
31
|
+
confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
|
32
|
+
weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
|
33
|
+
true_rate = {}
|
|
34
|
+
predictivity = {}
|
|
35
|
+
fold_nr = 1
|
|
36
|
+
training_dataset = Dataset.find model.training_dataset_id
|
|
37
|
+
training_dataset.folds(n).each do |fold|
|
|
38
|
+
t = Time.now
|
|
39
|
+
$logger.debug "Fold #{fold_nr}"
|
|
40
|
+
validation = validation_class.create(model, fold[0], fold[1])
|
|
41
|
+
validation_ids << validation.id
|
|
42
|
+
nr_instances += validation.nr_instances
|
|
43
|
+
nr_unpredicted += validation.nr_unpredicted
|
|
44
|
+
predictions += validation.predictions
|
|
45
|
+
validation.confusion_matrix.each_with_index do |r,i|
|
|
46
|
+
r.each_with_index do |c,j|
|
|
47
|
+
confusion_matrix[i][j] += c
|
|
48
|
+
weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
$logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
|
|
52
|
+
fold_nr +=1
|
|
53
|
+
end
|
|
54
|
+
true_rate = {}
|
|
55
|
+
predictivity = {}
|
|
56
|
+
accept_values.each_with_index do |v,i|
|
|
57
|
+
true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
|
|
58
|
+
predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
|
|
59
|
+
end
|
|
60
|
+
confidence_sum = 0
|
|
61
|
+
weighted_confusion_matrix.each do |r|
|
|
62
|
+
r.each do |c|
|
|
63
|
+
confidence_sum += c
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
cv.update_attributes(
|
|
67
|
+
nr_instances: nr_instances,
|
|
68
|
+
nr_unpredicted: nr_unpredicted,
|
|
69
|
+
accept_values: accept_values,
|
|
70
|
+
confusion_matrix: confusion_matrix,
|
|
71
|
+
weighted_confusion_matrix: weighted_confusion_matrix,
|
|
72
|
+
accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
|
|
73
|
+
weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
|
|
74
|
+
true_rate: true_rate,
|
|
75
|
+
predictivity: predictivity,
|
|
76
|
+
predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
|
|
77
|
+
finished_at: Time.now
|
|
78
|
+
)
|
|
79
|
+
cv.save
|
|
80
|
+
cv
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
#Average area under roc 0.646
|
|
84
|
+
#Area under roc 0.646
|
|
85
|
+
#F measure carcinogen: 0.769, noncarcinogen: 0.348
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
class RegressionCrossValidation < Validation
|
|
89
|
+
|
|
90
|
+
field :validation_ids, type: Array, default: []
|
|
91
|
+
field :folds, type: Integer
|
|
92
|
+
field :rmse, type: Float
|
|
93
|
+
field :mae, type: Float
|
|
94
|
+
field :weighted_rmse, type: Float
|
|
95
|
+
field :weighted_mae, type: Float
|
|
96
|
+
|
|
97
|
+
def self.create model, n=10
|
|
98
|
+
cv = self.new
|
|
99
|
+
validation_ids = []
|
|
100
|
+
nr_instances = 0
|
|
101
|
+
nr_unpredicted = 0
|
|
102
|
+
predictions = []
|
|
103
|
+
validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
|
|
104
|
+
fold_nr = 1
|
|
105
|
+
training_dataset = Dataset.find model.training_dataset_id
|
|
106
|
+
training_dataset.folds(n).each do |fold|
|
|
107
|
+
t = Time.now
|
|
108
|
+
$logger.debug "Predicting fold #{fold_nr}"
|
|
109
|
+
|
|
110
|
+
validation = validation_class.create(model, fold[0], fold[1])
|
|
111
|
+
validation_ids << validation.id
|
|
112
|
+
nr_instances += validation.nr_instances
|
|
113
|
+
nr_unpredicted += validation.nr_unpredicted
|
|
114
|
+
predictions += validation.predictions
|
|
115
|
+
$logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
|
|
116
|
+
fold_nr +=1
|
|
117
|
+
end
|
|
118
|
+
rmse = 0
|
|
119
|
+
weighted_rmse = 0
|
|
120
|
+
rse = 0
|
|
121
|
+
weighted_rse = 0
|
|
122
|
+
mae = 0
|
|
123
|
+
weighted_mae = 0
|
|
124
|
+
rae = 0
|
|
125
|
+
weighted_rae = 0
|
|
126
|
+
n = 0
|
|
127
|
+
confidence_sum = 0
|
|
128
|
+
predictions.each do |pred|
|
|
129
|
+
compound_id,activity,prediction,confidence = pred
|
|
130
|
+
if activity and prediction
|
|
131
|
+
error = prediction-activity
|
|
132
|
+
rmse += error**2
|
|
133
|
+
weighted_rmse += confidence*error**2
|
|
134
|
+
mae += error.abs
|
|
135
|
+
weighted_mae += confidence*error.abs
|
|
136
|
+
n += 1
|
|
137
|
+
confidence_sum += confidence
|
|
138
|
+
else
|
|
139
|
+
# TODO: create warnings
|
|
140
|
+
p pred
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
mae = mae/n
|
|
144
|
+
weighted_mae = weighted_mae/confidence_sum
|
|
145
|
+
rmse = Math.sqrt(rmse/n)
|
|
146
|
+
weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
|
|
147
|
+
cv.update_attributes(
|
|
148
|
+
folds: n,
|
|
149
|
+
validation_ids: validation_ids,
|
|
150
|
+
nr_instances: nr_instances,
|
|
151
|
+
nr_unpredicted: nr_unpredicted,
|
|
152
|
+
predictions: predictions.sort{|a,b| b[3] <=> a[3]},
|
|
153
|
+
mae: mae,
|
|
154
|
+
rmse: rmse,
|
|
155
|
+
weighted_mae: weighted_mae,
|
|
156
|
+
weighted_rmse: weighted_rmse
|
|
157
|
+
)
|
|
158
|
+
cv.save
|
|
159
|
+
cv
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
def plot
|
|
163
|
+
# RMSE
|
|
164
|
+
x = predictions.collect{|p| p[1]}
|
|
165
|
+
y = predictions.collect{|p| p[2]}
|
|
166
|
+
R.assign "Measurement", x
|
|
167
|
+
R.assign "Prediction", y
|
|
168
|
+
R.eval "par(pty='s')" # sets the plot type to be square
|
|
169
|
+
#R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
|
|
170
|
+
#R.eval "error <- log(Measurement)-log(Prediction)"
|
|
171
|
+
R.eval "error <- Measurement-Prediction"
|
|
172
|
+
R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
|
|
173
|
+
R.eval "mae <- mean( abs(error), na.rm = TRUE)"
|
|
174
|
+
R.eval "r <- cor(log(Prediction),log(Measurement))"
|
|
175
|
+
R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
|
|
176
|
+
R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
|
|
177
|
+
#R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
|
|
178
|
+
#R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
|
|
179
|
+
R.eval "abline(0,1,col='blue')"
|
|
180
|
+
#R.eval "abline(fitline,col='red')"
|
|
181
|
+
R.eval "dev.off()"
|
|
182
|
+
"/tmp/#{id.to_s}.svg"
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
end
|
data/lib/dataset.rb
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
require 'csv'
|
|
2
|
+
require 'tempfile'
|
|
3
|
+
|
|
4
|
+
module OpenTox
|
|
5
|
+
|
|
6
|
+
class Dataset
|
|
7
|
+
|
|
8
|
+
attr_writer :data_entries
|
|
9
|
+
|
|
10
|
+
# associations like has_many, belongs_to deteriorate performance
|
|
11
|
+
field :feature_ids, type: Array, default: []
|
|
12
|
+
field :compound_ids, type: Array, default: []
|
|
13
|
+
field :data_entries_id, type: BSON::ObjectId, default: []
|
|
14
|
+
field :source, type: String
|
|
15
|
+
field :warnings, type: Array, default: []
|
|
16
|
+
|
|
17
|
+
# Save all data including data_entries
|
|
18
|
+
# Should be used instead of save
|
|
19
|
+
def save_all
|
|
20
|
+
dump = Marshal.dump(@data_entries)
|
|
21
|
+
file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
|
|
22
|
+
data_entries_id = $gridfs.insert_one(file)
|
|
23
|
+
update(:data_entries_id => data_entries_id)
|
|
24
|
+
save
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# Readers
|
|
28
|
+
|
|
29
|
+
# Get all compounds
|
|
30
|
+
def compounds
|
|
31
|
+
@compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
|
|
32
|
+
@compounds
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Get all features
|
|
36
|
+
def features
|
|
37
|
+
@features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
|
|
38
|
+
@features
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
# Get all data_entries
|
|
42
|
+
def data_entries
|
|
43
|
+
unless @data_entries
|
|
44
|
+
t = Time.now
|
|
45
|
+
data_entry_file = $gridfs.find_one(_id: data_entries_id)
|
|
46
|
+
if data_entry_file.nil?
|
|
47
|
+
@data_entries = []
|
|
48
|
+
else
|
|
49
|
+
@data_entries = Marshal.load(data_entry_file.data)
|
|
50
|
+
bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
|
|
51
|
+
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
|
|
52
|
+
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries..first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
|
|
53
|
+
$logger.debug "Retrieving data: #{Time.now-t}"
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
@data_entries
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Find data entry values for a given compound and feature
|
|
60
|
+
# @param compound [OpenTox::Compound] OpenTox Compound object
|
|
61
|
+
# @param feature [OpenTox::Feature] OpenTox Feature object
|
|
62
|
+
# @return [Array] Data entry values
|
|
63
|
+
def values(compound, feature)
|
|
64
|
+
rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
|
|
65
|
+
col = feature_ids.index feature.id
|
|
66
|
+
rows.collect{|row| data_entries[row][col]}
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# Writers
|
|
70
|
+
|
|
71
|
+
# Set compounds
|
|
72
|
+
def compounds=(compounds)
|
|
73
|
+
self.compound_ids = compounds.collect{|c| c.id}
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Set features
|
|
77
|
+
def features=(features)
|
|
78
|
+
self.feature_ids = features.collect{|f| f.id}
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Dataset operations
|
|
82
|
+
|
|
83
|
+
# Split a dataset into n folds
|
|
84
|
+
# @param [Integer] number of folds
|
|
85
|
+
# @return [Array] Array with folds [training_dataset,test_dataset]
|
|
86
|
+
def folds n
|
|
87
|
+
len = self.compound_ids.size
|
|
88
|
+
indices = (0..len-1).to_a.shuffle
|
|
89
|
+
mid = (len/n)
|
|
90
|
+
chunks = []
|
|
91
|
+
start = 0
|
|
92
|
+
1.upto(n) do |i|
|
|
93
|
+
last = start+mid
|
|
94
|
+
last = last-1 unless len%n >= i
|
|
95
|
+
test_idxs = indices[start..last] || []
|
|
96
|
+
test_cids = test_idxs.collect{|i| self.compound_ids[i]}
|
|
97
|
+
test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
|
|
98
|
+
test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
|
|
99
|
+
training_idxs = indices-test_idxs
|
|
100
|
+
training_cids = training_idxs.collect{|i| self.compound_ids[i]}
|
|
101
|
+
training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
|
|
102
|
+
training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
|
|
103
|
+
test_dataset.save_all
|
|
104
|
+
training_dataset.save_all
|
|
105
|
+
chunks << [training_dataset,test_dataset]
|
|
106
|
+
start = last+1
|
|
107
|
+
end
|
|
108
|
+
chunks
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
# Diagnostics
|
|
112
|
+
|
|
113
|
+
def correlation_plot training_dataset
|
|
114
|
+
# TODO: create/store svg
|
|
115
|
+
R.assign "features", data_entries
|
|
116
|
+
R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
|
|
117
|
+
R.eval "featurePlot(features,activities)"
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def density_plot
|
|
121
|
+
# TODO: create/store svg
|
|
122
|
+
R.assign "acts", data_entries.collect{|r| r.first }#.compact
|
|
123
|
+
R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')"
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Serialisation
|
|
127
|
+
|
|
128
|
+
# converts dataset to csv format including compound smiles as first column, other column headers are feature titles
|
|
129
|
+
# @return [String]
|
|
130
|
+
def to_csv(inchi=false)
|
|
131
|
+
CSV.generate() do |csv| #{:force_quotes=>true}
|
|
132
|
+
csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.title}
|
|
133
|
+
compounds.each_with_index do |c,i|
|
|
134
|
+
csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Parsers
|
|
141
|
+
|
|
142
|
+
# Create a dataset from file (csv,sdf,...)
|
|
143
|
+
# @param filename [String]
|
|
144
|
+
# @return [String] dataset uri
|
|
145
|
+
# TODO
|
|
146
|
+
#def self.from_sdf_file
|
|
147
|
+
#end
|
|
148
|
+
|
|
149
|
+
# Create a dataset from CSV file
|
|
150
|
+
# TODO: document structure
|
|
151
|
+
def self.from_csv_file file, source=nil, bioassay=true
|
|
152
|
+
source ||= file
|
|
153
|
+
table = CSV.read file, :skip_blanks => true
|
|
154
|
+
dataset = self.new(:source => source, :name => File.basename(file))
|
|
155
|
+
dataset.parse_table table, bioassay
|
|
156
|
+
dataset
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
# parse data in tabular format (e.g. from csv)
|
|
160
|
+
# does a lot of guesswork in order to determine feature types
|
|
161
|
+
def parse_table table, bioassay=true
|
|
162
|
+
|
|
163
|
+
time = Time.now
|
|
164
|
+
|
|
165
|
+
# features
|
|
166
|
+
feature_names = table.shift.collect{|f| f.strip}
|
|
167
|
+
warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
|
|
168
|
+
compound_format = feature_names.shift.strip
|
|
169
|
+
bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
|
|
170
|
+
|
|
171
|
+
numeric = []
|
|
172
|
+
# guess feature types
|
|
173
|
+
feature_names.each_with_index do |f,i|
|
|
174
|
+
metadata = {:name => f}
|
|
175
|
+
values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
|
|
176
|
+
types = values.collect{|v| v.numeric? ? true : false}.uniq
|
|
177
|
+
if values.size == 0 # empty feature
|
|
178
|
+
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
|
|
179
|
+
metadata["numeric"] = true
|
|
180
|
+
numeric[i] = true
|
|
181
|
+
else
|
|
182
|
+
metadata["nominal"] = true
|
|
183
|
+
metadata["accept_values"] = values
|
|
184
|
+
numeric[i] = false
|
|
185
|
+
end
|
|
186
|
+
if bioassay
|
|
187
|
+
if metadata["numeric"]
|
|
188
|
+
feature = NumericBioAssay.find_or_create_by(metadata)
|
|
189
|
+
elsif metadata["nominal"]
|
|
190
|
+
feature = NominalBioAssay.find_or_create_by(metadata)
|
|
191
|
+
end
|
|
192
|
+
else
|
|
193
|
+
metadata.merge({:measured => false, :calculated => true})
|
|
194
|
+
if metadata["numeric"]
|
|
195
|
+
feature = NumericFeature.find_or_create_by(metadata)
|
|
196
|
+
elsif metadata["nominal"]
|
|
197
|
+
feature = NominalFeature.find_or_create_by(metadata)
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
feature_ids << feature.id
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
$logger.debug "Feature values: #{Time.now-time}"
|
|
204
|
+
time = Time.now
|
|
205
|
+
|
|
206
|
+
r = -1
|
|
207
|
+
compound_time = 0
|
|
208
|
+
value_time = 0
|
|
209
|
+
|
|
210
|
+
# compounds and values
|
|
211
|
+
@data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)}
|
|
212
|
+
|
|
213
|
+
table.each_with_index do |vals,i|
|
|
214
|
+
ct = Time.now
|
|
215
|
+
identifier = vals.shift
|
|
216
|
+
warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
|
|
217
|
+
begin
|
|
218
|
+
case compound_format
|
|
219
|
+
when /SMILES/i
|
|
220
|
+
compound = OpenTox::Compound.from_smiles(identifier)
|
|
221
|
+
when /InChI/i
|
|
222
|
+
compound = OpenTox::Compound.from_inchi(identifier)
|
|
223
|
+
end
|
|
224
|
+
rescue
|
|
225
|
+
compound = nil
|
|
226
|
+
end
|
|
227
|
+
if compound.nil?
|
|
228
|
+
# compound parsers may return nil
|
|
229
|
+
warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
|
|
230
|
+
next
|
|
231
|
+
end
|
|
232
|
+
# TODO insert empty compounds to keep positions?
|
|
233
|
+
compound_time += Time.now-ct
|
|
234
|
+
|
|
235
|
+
r += 1
|
|
236
|
+
unless vals.size == feature_ids.size # way cheaper than accessing features
|
|
237
|
+
warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
|
|
238
|
+
next
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
compound_ids << compound.id
|
|
242
|
+
@data_entries << Array.new(table.first.size-1)
|
|
243
|
+
|
|
244
|
+
vals.each_with_index do |v,j|
|
|
245
|
+
if v.blank?
|
|
246
|
+
warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
|
|
247
|
+
next
|
|
248
|
+
elsif numeric[j]
|
|
249
|
+
@data_entries.last[j] = v.to_f
|
|
250
|
+
else
|
|
251
|
+
@data_entries.last[j] = v.strip
|
|
252
|
+
end
|
|
253
|
+
end
|
|
254
|
+
end
|
|
255
|
+
compounds.duplicates.each do |compound|
|
|
256
|
+
positions = []
|
|
257
|
+
compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
|
|
258
|
+
warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
|
|
262
|
+
time = Time.now
|
|
263
|
+
save_all
|
|
264
|
+
$logger.debug "Saving: #{Time.now-time}"
|
|
265
|
+
|
|
266
|
+
end
|
|
267
|
+
|
|
268
|
+
=begin
|
|
269
|
+
# TODO remove
|
|
270
|
+
|
|
271
|
+
# Create a dataset with compounds and features
|
|
272
|
+
def self.create compounds, features, warnings=[], source=nil
|
|
273
|
+
dataset = Dataset.new(:warnings => warnings)
|
|
274
|
+
dataset.compounds = compounds
|
|
275
|
+
dataset.features = features
|
|
276
|
+
dataset
|
|
277
|
+
end
|
|
278
|
+
# merge dataset (i.e. append features)
|
|
279
|
+
def +(dataset)
|
|
280
|
+
bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
|
|
281
|
+
bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
|
|
282
|
+
self.feature_ids ||= []
|
|
283
|
+
self.feature_ids = self.feature_ids + dataset.feature_ids
|
|
284
|
+
@data_entries ||= Array.new(compound_ids.size){[]}
|
|
285
|
+
@data_entries.each_with_index do |row,i|
|
|
286
|
+
@data_entries[i] = row + dataset.fingerprint(compounds[i])
|
|
287
|
+
end
|
|
288
|
+
self
|
|
289
|
+
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
def fingerprint(compound)
|
|
293
|
+
i = compound_ids.index(compound.id)
|
|
294
|
+
i.nil? ? nil : data_entries[i]
|
|
295
|
+
end
|
|
296
|
+
=end
|
|
297
|
+
|
|
298
|
+
# Fill unset data entries
|
|
299
|
+
# @param any value
|
|
300
|
+
def fill_nil_with n
|
|
301
|
+
(0 .. compound_ids.size-1).each do |i|
|
|
302
|
+
@data_entries[i] ||= []
|
|
303
|
+
(0 .. feature_ids.size-1).each do |j|
|
|
304
|
+
@data_entries[i][j] ||= n
|
|
305
|
+
end
|
|
306
|
+
end
|
|
307
|
+
end
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
# Dataset for lazar predictions
|
|
311
|
+
class LazarPrediction < Dataset
|
|
312
|
+
field :creator, type: String
|
|
313
|
+
field :prediction_feature_id, type: String
|
|
314
|
+
|
|
315
|
+
def prediction_feature
|
|
316
|
+
Feature.find prediction_feature_id
|
|
317
|
+
end
|
|
318
|
+
|
|
319
|
+
end
|
|
320
|
+
|
|
321
|
+
# Dataset for descriptors (physchem)
|
|
322
|
+
  # Dataset for descriptors (physchem)
  class DescriptorDataset < Dataset
    # name of the algorithm that calculated the descriptor values
    field :feature_calculation_algorithm, type: String
  end
|
|
325
|
+
|
|
326
|
+
# Dataset for fminer descriptors
|
|
327
|
+
  # Dataset for fminer descriptors
  class FminerDataset < DescriptorDataset
    # fminer variant used for training (e.g. bbrc, last)
    field :training_algorithm, type: String
    # dataset the fragments were mined from
    field :training_dataset_id, type: BSON::ObjectId
    # endpoint feature used for class-correlated mining
    field :training_feature_id, type: BSON::ObjectId
    # parameters passed to the mining algorithm
    field :training_parameters, type: Hash
  end
|
|
333
|
+
|
|
334
|
+
end
|