lazar 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
@@ -0,0 +1,187 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
# Base class for persisted cross-validation results.
# Fields are declared Mongoid-style; subclasses add metric-specific fields.
class CrossValidation
  # ids of the per-fold Validation documents
  field :validation_ids, type: Array, default: []
  # number of folds (k)
  field :folds, type: Integer
  # total number of instances over all folds
  field :nr_instances, type: Integer
  # instances for which no prediction could be made
  field :nr_unpredicted, type: Integer
  # collected prediction tuples; the regression code below destructures them
  # as [compound_id, activity, prediction, confidence]
  field :predictions, type: Array
  # timestamp set when the cross-validation run completes
  field :finished_at, type: Time
end
|
11
|
+
|
12
|
+
class ClassificationCrossValidation < CrossValidation

  field :accept_values, type: Array
  field :confusion_matrix, type: Array
  field :weighted_confusion_matrix, type: Array
  field :accuracy, type: Float
  field :weighted_accuracy, type: Float
  field :true_rate, type: Hash
  field :predictivity, type: Hash
  # TODO auc, f-measure (usability??)

  # Run an n-fold cross-validation for a classification model.
  # @param model [OpenTox::Model] model whose training dataset is split into folds
  # @param n [Integer] number of folds
  # @return [ClassificationCrossValidation] the saved cross-validation result
  def self.create model, n=10
    cv = self.new
    validation_ids = []
    nr_instances = 0
    nr_unpredicted = 0
    predictions = []
    # e.g. ClassificationCrossValidation -> ClassificationValidation
    validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
    accept_values = Feature.find(model.prediction_feature_id).accept_values
    # BUG FIX: Array.new(size, 0) { block } triggers Ruby's "block supersedes
    # default value argument" warning; the block alone is correct (and needed,
    # so each row is a distinct array).
    confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
    weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
    true_rate = {}
    predictivity = {}
    fold_nr = 1
    training_dataset = Dataset.find model.training_dataset_id
    training_dataset.folds(n).each do |fold|
      t = Time.now
      $logger.debug "Fold #{fold_nr}"
      validation = validation_class.create(model, fold[0], fold[1])
      validation_ids << validation.id
      nr_instances += validation.nr_instances
      nr_unpredicted += validation.nr_unpredicted
      predictions += validation.predictions
      # accumulate per-fold confusion matrices into the overall matrices
      validation.confusion_matrix.each_with_index do |r,i|
        r.each_with_index do |c,j|
          confusion_matrix[i][j] += c
          weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
        end
      end
      $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
      fold_nr += 1
    end
    # per-class sensitivity (row-wise) and predictivity (column-wise)
    accept_values.each_with_index do |v,i|
      true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
      # block param renamed from |n| which shadowed the method parameter n
      predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|row| row[i]}.reduce(:+).to_f
    end
    confidence_sum = 0
    weighted_confusion_matrix.each do |r|
      r.each do |c|
        confidence_sum += c
      end
    end
    cv.update_attributes(
      # BUG FIX: folds and validation_ids were collected but never persisted
      folds: n,
      validation_ids: validation_ids,
      nr_instances: nr_instances,
      nr_unpredicted: nr_unpredicted,
      accept_values: accept_values,
      confusion_matrix: confusion_matrix,
      weighted_confusion_matrix: weighted_confusion_matrix,
      # NOTE(review): accuracy sums only the first two diagonal entries, i.e.
      # it assumes a binary classification problem — confirm before reuse with
      # more than two accept_values.
      accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
      weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
      true_rate: true_rate,
      predictivity: predictivity,
      predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
      finished_at: Time.now
    )
    cv.save
    cv
  end

  #Average area under roc 0.646
  #Area under roc 0.646
  #F measure carcinogen: 0.769, noncarcinogen: 0.348
end
|
87
|
+
|
88
|
+
# NOTE(review): the sibling ClassificationCrossValidation inherits from
# CrossValidation; inheriting from Validation here looks inconsistent (the
# validation_ids/folds fields are re-declared below as a workaround) — confirm.
class RegressionCrossValidation < Validation

  field :validation_ids, type: Array, default: []
  field :folds, type: Integer
  field :rmse, type: Float
  field :mae, type: Float
  field :weighted_rmse, type: Float
  field :weighted_mae, type: Float

  # Run an n-fold cross-validation for a regression model.
  # @param model [OpenTox::Model] model whose training dataset is split into folds
  # @param n [Integer] number of folds
  # @return [RegressionCrossValidation] the saved cross-validation result
  def self.create model, n=10
    cv = self.new
    validation_ids = []
    nr_instances = 0
    nr_unpredicted = 0
    predictions = []
    # e.g. RegressionCrossValidation -> RegressionValidation
    validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
    fold_nr = 1
    training_dataset = Dataset.find model.training_dataset_id
    training_dataset.folds(n).each do |fold|
      t = Time.now
      $logger.debug "Predicting fold #{fold_nr}"
      validation = validation_class.create(model, fold[0], fold[1])
      validation_ids << validation.id
      nr_instances += validation.nr_instances
      nr_unpredicted += validation.nr_unpredicted
      predictions += validation.predictions
      $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
      fold_nr += 1
    end
    rmse = 0
    weighted_rmse = 0
    mae = 0
    weighted_mae = 0
    # BUG FIX: the original reused `n` as this counter, clobbering the fold
    # count, so `folds: n` below stored the number of predicted instances.
    predicted = 0
    confidence_sum = 0
    predictions.each do |pred|
      compound_id,activity,prediction,confidence = pred
      if activity and prediction
        error = prediction-activity
        rmse += error**2
        weighted_rmse += confidence*error**2
        mae += error.abs
        weighted_mae += confidence*error.abs
        predicted += 1
        confidence_sum += confidence
      else
        # TODO: create warnings
        p pred
      end
    end
    mae = mae/predicted
    weighted_mae = weighted_mae/confidence_sum
    rmse = Math.sqrt(rmse/predicted)
    weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
    cv.update_attributes(
      folds: n,
      validation_ids: validation_ids,
      nr_instances: nr_instances,
      nr_unpredicted: nr_unpredicted,
      predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
      mae: mae,
      rmse: rmse,
      weighted_mae: weighted_mae,
      weighted_rmse: weighted_rmse
    )
    cv.save
    cv
  end

  # Plot measured vs. predicted values (log scale) via the R bridge.
  # @return [String] path of the generated SVG file (/tmp/<id>.svg)
  def plot
    # RMSE
    x = predictions.collect{|p| p[1]}
    y = predictions.collect{|p| p[2]}
    R.assign "Measurement", x
    R.assign "Prediction", y
    R.eval "par(pty='s')" # sets the plot type to be square
    #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
    #R.eval "error <- log(Measurement)-log(Prediction)"
    R.eval "error <- Measurement-Prediction"
    R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
    R.eval "mae <- mean( abs(error), na.rm = TRUE)"
    R.eval "r <- cor(log(Prediction),log(Measurement))"
    R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
    R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
    #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
    #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
    R.eval "abline(0,1,col='blue')"
    #R.eval "abline(fitline,col='red')"
    R.eval "dev.off()"
    "/tmp/#{id.to_s}.svg"
  end
end
|
185
|
+
|
186
|
+
|
187
|
+
end
|
data/lib/dataset.rb
ADDED
@@ -0,0 +1,334 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'tempfile'
|
3
|
+
|
4
|
+
module OpenTox
|
5
|
+
|
6
|
+
class Dataset

  # allow parsers to inject the data matrix before it is persisted
  attr_writer :data_entries

  # associations like has_many, belongs_to deteriorate performance
  field :feature_ids, type: Array, default: []
  field :compound_ids, type: Array, default: []
  # GridFS id of the marshalled 2D data_entries array
  # NOTE(review): default [] is not a BSON::ObjectId — probably meant nil;
  # the nil-check in #data_entries relies on find_one returning nil. Confirm.
  field :data_entries_id, type: BSON::ObjectId, default: []
  field :source, type: String
  field :warnings, type: Array, default: []
|
16
|
+
|
17
|
+
# Save all data including data_entries
|
18
|
+
# Should be used instead of save
|
19
|
+
# Persist the dataset together with its data_entries matrix.
# The matrix is marshalled into GridFS and its id stored on the document.
# Should be used instead of save.
def save_all
  serialized = Marshal.dump(@data_entries)
  gridfs_file = Mongo::Grid::File.new(serialized, :filename => "#{self.id.to_s}.data_entries")
  data_entries_id = $gridfs.insert_one(gridfs_file)
  update(:data_entries_id => data_entries_id)
  save
end
|
26
|
+
|
27
|
+
# Readers
|
28
|
+
|
29
|
+
# Get all compounds
|
30
|
+
# All compounds of this dataset (memoized).
# @return [Array<OpenTox::Compound>]
def compounds
  @compounds ||= compound_ids.map { |cid| OpenTox::Compound.find(cid) }
end

# All features of this dataset (memoized).
# @return [Array<OpenTox::Feature>]
def features
  @features ||= feature_ids.map { |fid| OpenTox::Feature.find(fid) }
end
|
40
|
+
|
41
|
+
# Get all data_entries
|
42
|
+
# Lazily load the data_entries matrix from GridFS (memoized).
# @return [Array<Array>] 2D array: one row per compound, one column per feature
# @raise via bad_request_error if the stored data is not a 2D array or its
#   dimensions disagree with compound_ids/feature_ids
def data_entries
  unless @data_entries
    t = Time.now
    data_entry_file = $gridfs.find_one(_id: data_entries_id)
    if data_entry_file.nil?
      @data_entries = []
    else
      @data_entries = Marshal.load(data_entry_file.data)
      bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
      bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
      # BUG FIX: was "@data_entries..first.size" — the accidental `..` built a
      # Range and raised a NameError instead of reporting the column mismatch.
      bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
      $logger.debug "Retrieving data: #{Time.now-t}"
    end
  end
  @data_entries
end
|
58
|
+
|
59
|
+
# Find data entry values for a given compound and feature
# @param compound [OpenTox::Compound] OpenTox Compound object
# @param feature [OpenTox::Feature] OpenTox Feature object
# @return [Array] Data entry values (one per occurrence of the compound)
def values(compound, feature)
  column = feature_ids.index(feature.id)
  matching_rows = compound_ids.each_index.select { |idx| compound_ids[idx] == compound.id }
  matching_rows.map { |idx| data_entries[idx][column] }
end
|
68
|
+
|
69
|
+
# Writers
|
70
|
+
|
71
|
+
# Set compounds (stores only their ids)
def compounds=(compounds)
  self.compound_ids = compounds.map(&:id)
end

# Set features (stores only their ids)
def features=(features)
  self.feature_ids = features.map(&:id)
end
|
80
|
+
|
81
|
+
# Dataset operations
|
82
|
+
|
83
|
+
# Split a dataset into n folds
# @param n [Integer] number of folds
# @return [Array] Array with folds [training_dataset,test_dataset]
def folds n
  len = self.compound_ids.size
  indices = (0..len-1).to_a.shuffle
  mid = (len/n)
  chunks = []
  start = 0
  1.upto(n) do |i|
    last = start+mid
    # distribute the remainder (len % n) one extra element to the first folds
    last = last-1 unless len%n >= i
    test_idxs = indices[start..last] || []
    # block params renamed from |i|, which shadowed the fold counter above
    test_cids = test_idxs.collect{|idx| self.compound_ids[idx]}
    test_data_entries = test_idxs.collect{|idx| self.data_entries[idx]}
    test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
    training_idxs = indices-test_idxs
    training_cids = training_idxs.collect{|idx| self.compound_ids[idx]}
    training_data_entries = training_idxs.collect{|idx| self.data_entries[idx]}
    training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
    test_dataset.save_all
    training_dataset.save_all
    chunks << [training_dataset,test_dataset]
    start = last+1
  end
  chunks
end
|
110
|
+
|
111
|
+
# Diagnostics
|
112
|
+
|
113
|
+
# Plot this dataset's features against the first feature (activity column)
# of a training dataset via the R bridge.
# NOTE(review): featurePlot is presumably provided by R's caret package —
# confirm the R session loads it.
def correlation_plot training_dataset
  # TODO: create/store svg
  R.assign "features", data_entries
  R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
  R.eval "featurePlot(features,activities)"
end

# Density plot of the log-transformed first feature column.
def density_plot
  # TODO: create/store svg
  R.assign "acts", data_entries.collect{|r| r.first }#.compact
  R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')"
end
|
125
|
+
|
126
|
+
# Serialisation
|
127
|
+
|
128
|
+
# converts dataset to csv format including compound smiles as first column, other column headers are feature titles
# @param inchi [Boolean] use InChI instead of SMILES in the compound column
# @return [String]
def to_csv(inchi=false)
  CSV.generate() do |csv| #{:force_quotes=>true}
    # NOTE(review): parse_table creates features with :name, but #title is
    # used here — confirm Feature responds to #title (or should this be #name?)
    csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.title}
    compounds.each_with_index do |c,i|
      csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
    end
  end
end
|
138
|
+
|
139
|
+
|
140
|
+
# Parsers
|
141
|
+
|
142
|
+
# Create a dataset from file (csv,sdf,...)
|
143
|
+
# @param filename [String]
|
144
|
+
# @return [String] dataset uri
|
145
|
+
# TODO
|
146
|
+
#def self.from_sdf_file
|
147
|
+
#end
|
148
|
+
|
149
|
+
# Create a dataset from CSV file
# TODO: document structure
# @param file [String] path of the CSV file
# @param source [String] provenance (defaults to the file path)
# @param bioassay [Boolean] passed through to parse_table
# @return [Dataset] the parsed (and saved) dataset
def self.from_csv_file file, source=nil, bioassay=true
  source ||= file
  rows = CSV.read(file, :skip_blanks => true)
  dataset = new(:source => source, :name => File.basename(file))
  dataset.parse_table rows, bioassay
  dataset
end
|
158
|
+
|
159
|
+
# parse data in tabular format (e.g. from csv)
# does a lot of guesswork in order to determine feature types
# @param table [Array<Array>] first row is the header (compound format +
#   feature names), first column the compound identifier
# @param bioassay [Boolean] create BioAssay features (measured) instead of
#   plain calculated features
def parse_table table, bioassay=true

  time = Time.now

  # features
  feature_names = table.shift.collect{|f| f.strip}
  warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
  compound_format = feature_names.shift.strip
  bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i

  numeric = []
  # guess feature types
  feature_names.each_with_index do |f,i|
    metadata = {:name => f}
    values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
    types = values.collect{|v| v.numeric? ? true : false}.uniq
    if values.size == 0 # empty feature
    elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
      metadata["numeric"] = true
      numeric[i] = true
    else
      metadata["nominal"] = true
      metadata["accept_values"] = values
      numeric[i] = false
    end
    if bioassay
      if metadata["numeric"]
        feature = NumericBioAssay.find_or_create_by(metadata)
      elsif metadata["nominal"]
        feature = NominalBioAssay.find_or_create_by(metadata)
      end
    else
      # BUG FIX: merge returned a new hash that was discarded; merge! mutates
      # metadata so :measured/:calculated actually reach the feature
      metadata.merge!({:measured => false, :calculated => true})
      if metadata["numeric"]
        feature = NumericFeature.find_or_create_by(metadata)
      elsif metadata["nominal"]
        feature = NominalFeature.find_or_create_by(metadata)
      end
    end
    # BUG FIX: an empty column leaves feature nil and crashed here; skip it.
    # NOTE(review): skipping shifts feature_ids relative to the table columns,
    # so the per-row size check below will then reject every row — confirm
    # the desired handling of empty columns.
    feature_ids << feature.id if feature
  end

  $logger.debug "Feature values: #{Time.now-time}"
  time = Time.now

  r = -1
  compound_time = 0

  # compounds and values
  @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)}

  table.each_with_index do |vals,i|
    ct = Time.now
    identifier = vals.shift
    warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
    begin
      case compound_format
      when /SMILES/i
        compound = OpenTox::Compound.from_smiles(identifier)
      when /InChI/i
        compound = OpenTox::Compound.from_inchi(identifier)
      end
    rescue
      compound = nil
    end
    if compound.nil?
      # compound parsers may return nil
      warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
      next
    end
    # TODO insert empty compounds to keep positions?
    compound_time += Time.now-ct

    r += 1
    unless vals.size == feature_ids.size # way cheaper than accessing features
      warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
      next
    end

    compound_ids << compound.id
    @data_entries << Array.new(table.first.size-1)

    vals.each_with_index do |v,j|
      if v.blank?
        warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
        next
      elsif numeric[j]
        @data_entries.last[j] = v.to_f
      else
        @data_entries.last[j] = v.strip
      end
    end
  end
  # report duplicate structures (entries are kept, assumed independent)
  compounds.duplicates.each do |compound|
    positions = []
    compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
    warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
  end

  $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
  time = Time.now
  save_all
  $logger.debug "Saving: #{Time.now-time}"

end
|
267
|
+
|
268
|
+
=begin
|
269
|
+
# TODO remove
|
270
|
+
|
271
|
+
# Create a dataset with compounds and features
|
272
|
+
def self.create compounds, features, warnings=[], source=nil
|
273
|
+
dataset = Dataset.new(:warnings => warnings)
|
274
|
+
dataset.compounds = compounds
|
275
|
+
dataset.features = features
|
276
|
+
dataset
|
277
|
+
end
|
278
|
+
# merge dataset (i.e. append features)
|
279
|
+
def +(dataset)
|
280
|
+
bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
|
281
|
+
bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
|
282
|
+
self.feature_ids ||= []
|
283
|
+
self.feature_ids = self.feature_ids + dataset.feature_ids
|
284
|
+
@data_entries ||= Array.new(compound_ids.size){[]}
|
285
|
+
@data_entries.each_with_index do |row,i|
|
286
|
+
@data_entries[i] = row + dataset.fingerprint(compounds[i])
|
287
|
+
end
|
288
|
+
self
|
289
|
+
|
290
|
+
end
|
291
|
+
|
292
|
+
def fingerprint(compound)
|
293
|
+
i = compound_ids.index(compound.id)
|
294
|
+
i.nil? ? nil : data_entries[i]
|
295
|
+
end
|
296
|
+
=end
|
297
|
+
|
298
|
+
# Fill unset data entries
# @param n any value used to replace nil entries
def fill_nil_with n
  compound_ids.each_index do |row|
    @data_entries[row] ||= []
    feature_ids.each_index do |col|
      @data_entries[row][col] ||= n
    end
  end
end
|
308
|
+
end
|
309
|
+
|
310
|
+
# Dataset for lazar predictions
class LazarPrediction < Dataset
  # user/service that created the prediction
  field :creator, type: String
  # id of the predicted feature (stored as String here, not BSON::ObjectId)
  field :prediction_feature_id, type: String

  # @return [OpenTox::Feature] the feature that was predicted
  def prediction_feature
    Feature.find prediction_feature_id
  end

end
|
320
|
+
|
321
|
+
# Dataset for descriptors (physchem)
class DescriptorDataset < Dataset
  # name of the algorithm used to calculate the feature values
  field :feature_calculation_algorithm, type: String
end

# Dataset for fminer descriptors
class FminerDataset < DescriptorDataset
  field :training_algorithm, type: String
  field :training_dataset_id, type: BSON::ObjectId
  field :training_feature_id, type: BSON::ObjectId
  field :training_parameters, type: Hash
end
|
333
|
+
|
334
|
+
end
|