opentox-ruby 3.1.0 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +19 -9
- data/README.markdown +1 -1
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/lib/algorithm.rb +143 -37
- data/lib/compound.rb +66 -18
- data/lib/dataset.rb +38 -3
- data/lib/model.rb +36 -13
- data/lib/parser.rb +34 -19
- data/lib/r-util.rb +93 -34
- data/lib/serializer.rb +70 -22
- data/lib/stratification.R +71 -7
- data/lib/transform.rb +5 -3
- data/lib/utils.rb +356 -97
- data/lib/validation.rb +6 -4
- metadata +20 -4
data/lib/dataset.rb
CHANGED
@@ -197,7 +197,12 @@ module OpenTox
|
|
197
197
|
accept_values
|
198
198
|
end
|
199
199
|
|
200
|
-
# Detect feature type(
|
200
|
+
# Detect feature type (reduced to one across all features)
|
201
|
+
# Classification takes precedence over regression
|
202
|
+
# DEPRECATED --
|
203
|
+
# HAS NO SENSE FOR DATASETS WITH MORE THAN 1 FEATURE
|
204
|
+
# FEATURES CAN HAVE MULTIPLE TYPES
|
205
|
+
# Replacement: see feature_types()
|
201
206
|
# @return [String] "classification", "regression", "mixed" or "unknown"
|
202
207
|
def feature_type(subjectid=nil)
|
203
208
|
load_features(subjectid)
|
@@ -210,6 +215,24 @@ module OpenTox
|
|
210
215
|
"unknown"
|
211
216
|
end
|
212
217
|
end
|
218
|
+
|
219
|
+
|
220
|
+
# Detect feature types. A feature can have multiple types.
|
221
|
+
# Returns types hashed by feature URI, with missing features omitted.
|
222
|
+
# Example (YAML):
|
223
|
+
# http://toxcreate3.in-silico.ch:8082/dataset/152/feature/nHal:
|
224
|
+
# - http://www.opentox.org/api/1.1#NumericFeature
|
225
|
+
# - http://www.opentox.org/api/1.1#NominalFeature
|
226
|
+
# ...
|
227
|
+
#
|
228
|
+
# @return [Hash] Keys: feature URIs, Values: Array of types
|
229
|
+
def feature_types(subjectid=nil)
|
230
|
+
load_features(subjectid)
|
231
|
+
@features.inject({}){ |h,(f,metadata)|
|
232
|
+
h[f]=metadata[RDF.type] unless metadata[RDF.type][0].include? "MissingFeature"
|
233
|
+
h
|
234
|
+
}
|
235
|
+
end
|
213
236
|
=begin
|
214
237
|
=end
|
215
238
|
|
@@ -316,11 +339,14 @@ module OpenTox
|
|
316
339
|
end
|
317
340
|
|
318
341
|
# Complete feature values by adding zeroes
|
319
|
-
|
342
|
+
# @param [Hash] key: compound, value: duplicate sizes
|
343
|
+
def complete_data_entries(compound_sizes)
|
320
344
|
all_features = @features.keys
|
321
345
|
@data_entries.each { |c, e|
|
322
346
|
(Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
|
323
|
-
|
347
|
+
compound_sizes[c].times {
|
348
|
+
self.add(c,f,0)
|
349
|
+
}
|
324
350
|
}
|
325
351
|
}
|
326
352
|
end
|
@@ -454,6 +480,14 @@ module OpenTox
|
|
454
480
|
end
|
455
481
|
end
|
456
482
|
|
483
|
+
def value_map(prediction_feature_uri)
|
484
|
+
training_classes = accept_values(prediction_feature_uri).sort
|
485
|
+
value_map=Hash.new
|
486
|
+
training_classes.each_with_index { |c,i| value_map[i+1] = c }
|
487
|
+
value_map
|
488
|
+
end
|
489
|
+
|
490
|
+
|
457
491
|
private
|
458
492
|
# Copy a dataset (rewrites URI)
|
459
493
|
def copy(dataset)
|
@@ -504,6 +538,7 @@ module OpenTox
|
|
504
538
|
@data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/neighbor/)}.compact if @data_entries[compound.uri]
|
505
539
|
end
|
506
540
|
|
541
|
+
|
507
542
|
# def errors(compound)
|
508
543
|
# features = @data_entries[compound.uri].keys
|
509
544
|
# features.collect{|f| @features[f][OT.error]}.join(" ") if features
|
data/lib/model.rb
CHANGED
@@ -103,7 +103,7 @@ module OpenTox
|
|
103
103
|
include Model
|
104
104
|
|
105
105
|
|
106
|
-
attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
|
106
|
+
attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors, :compounds
|
107
107
|
def initialize(uri=nil)
|
108
108
|
|
109
109
|
if uri
|
@@ -169,12 +169,13 @@ module OpenTox
|
|
169
169
|
lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
|
170
170
|
lazar.subjectid = hash["subjectid"] if hash["subjectid"]
|
171
171
|
lazar.value_map = hash["value_map"] if hash["value_map"]
|
172
|
+
lazar.compounds = hash["compounds"] if hash["compounds"]
|
172
173
|
|
173
174
|
lazar
|
174
175
|
end
|
175
176
|
|
176
177
|
def to_json
|
177
|
-
Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
|
178
|
+
Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map, :compounds => @compounds})
|
178
179
|
end
|
179
180
|
|
180
181
|
def run( params, accept_header=nil, waiting_task=nil )
|
@@ -237,6 +238,7 @@ module OpenTox
|
|
237
238
|
|
238
239
|
@compound = Compound.new compound_uri
|
239
240
|
features = {}
|
241
|
+
|
240
242
|
#LOGGER.debug self.to_yaml
|
241
243
|
unless @prediction_dataset
|
242
244
|
@prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
|
@@ -247,19 +249,19 @@ module OpenTox
|
|
247
249
|
OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
|
248
250
|
} )
|
249
251
|
end
|
250
|
-
|
251
|
-
all_activities = []
|
252
|
-
all_activities = @activities.values.flatten.collect! { |i| i.to_f }
|
253
|
-
end
|
252
|
+
|
254
253
|
unless database_activity(subjectid) # adds database activity to @prediction_dataset
|
254
|
+
|
255
255
|
# Calculation of needed values for query compound
|
256
256
|
@compound_features = eval("#{@feature_calculation_algorithm}({
|
257
257
|
:compound => @compound,
|
258
258
|
:features => @features,
|
259
259
|
:feature_dataset_uri => @metadata[OT.featureDataset],
|
260
260
|
:pc_type => self.parameter(\"pc_type\"),
|
261
|
+
:lib => self.parameter(\"lib\"),
|
261
262
|
:subjectid => subjectid
|
262
263
|
})")
|
264
|
+
|
263
265
|
# Adding fingerprint of query compound with features and values(p_value*nr_hits)
|
264
266
|
@compound_fingerprints = {}
|
265
267
|
@compound_features.each do |feature, value| # value is nil if "Substructure.match"
|
@@ -314,6 +316,16 @@ module OpenTox
|
|
314
316
|
@prediction_dataset.add @compound.uri, feature_uri, true
|
315
317
|
f+=1
|
316
318
|
end
|
319
|
+
elsif @feature_calculation_algorithm == "Substructure.lookup"
|
320
|
+
f = 0
|
321
|
+
@compound_features.each do |feature, value|
|
322
|
+
features[feature] = feature
|
323
|
+
@prediction_dataset.add_feature(feature, {
|
324
|
+
RDF.type => [OT.NumericFeature]
|
325
|
+
})
|
326
|
+
@prediction_dataset.add @compound.uri, feature, value
|
327
|
+
f+=1
|
328
|
+
end
|
317
329
|
else
|
318
330
|
@compound_features.each do |feature|
|
319
331
|
features[feature] = feature
|
@@ -337,15 +349,26 @@ module OpenTox
|
|
337
349
|
else
|
338
350
|
feature_uri = feature
|
339
351
|
end
|
340
|
-
@
|
352
|
+
if @feature_calculation_algorithm == "Substructure.lookup"
|
353
|
+
@prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
|
354
|
+
else
|
355
|
+
@prediction_dataset.add neighbor[:compound], feature_uri, true
|
356
|
+
end
|
357
|
+
|
341
358
|
unless features.has_key? feature
|
342
359
|
features[feature] = feature_uri
|
343
|
-
@
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
360
|
+
if @feature_calculation_algorithm == "Substructure.lookup"
|
361
|
+
@prediction_dataset.add_feature(feature_uri, {
|
362
|
+
RDF.type => [OT.NumericFeature]
|
363
|
+
})
|
364
|
+
else
|
365
|
+
@prediction_dataset.add_feature(feature_uri, {
|
366
|
+
RDF.type => [OT.Substructure],
|
367
|
+
OT.smarts => feature,
|
368
|
+
OT.pValue => @p_values[feature],
|
369
|
+
OT.effect => @effects[feature]
|
370
|
+
})
|
371
|
+
end
|
349
372
|
f+=1
|
350
373
|
end
|
351
374
|
end
|
data/lib/parser.rb
CHANGED
@@ -349,11 +349,15 @@ module OpenTox
|
|
349
349
|
|
350
350
|
# Load CSV string (format specification: http://toxcreate.org/help)
|
351
351
|
# @param [String] csv CSV representation of the dataset
|
352
|
+
# @param [Boolean] drop_missing Whether completely missing rows should be dropped
|
353
|
+
# @param [Boolean] all_numeric Whether all features should be treated as numeric
|
354
|
+
# @param [Boolean] del_nominal All nominal features will be removed
|
352
355
|
# @return [OpenTox::Dataset] Dataset object with CSV data
|
353
|
-
def load_csv(csv, drop_missing=false)
|
356
|
+
def load_csv(csv, drop_missing=false, all_numeric=false)
|
354
357
|
row = 0
|
355
358
|
input = csv.split("\n")
|
356
359
|
headers = split_row(input.shift)
|
360
|
+
headers.collect! {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
|
357
361
|
add_features(headers)
|
358
362
|
value_maps = Array.new
|
359
363
|
regression_features=Array.new
|
@@ -362,7 +366,7 @@ module OpenTox
|
|
362
366
|
row = split_row(row)
|
363
367
|
value_maps = detect_new_values(row, value_maps)
|
364
368
|
value_maps.each_with_index { |vm,j|
|
365
|
-
if vm.size > @max_class_values # max @max_class_values classes.
|
369
|
+
if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
|
366
370
|
regression_features[j]=true
|
367
371
|
else
|
368
372
|
regression_features[j]=false
|
@@ -392,22 +396,30 @@ module OpenTox
|
|
392
396
|
|
393
397
|
def warnings
|
394
398
|
|
395
|
-
info = ''
|
399
|
+
info = '<br>'
|
396
400
|
@feature_types.each do |feature,types|
|
401
|
+
@dataset.add_feature_metadata(feature,{RDF.type => []})
|
397
402
|
if types.uniq.size == 0
|
398
|
-
|
399
|
-
|
400
|
-
|
403
|
+
@dataset.add_feature_metadata(
|
404
|
+
feature, {RDF.type => ( @dataset.features[feature][RDF.type] << "helper#MissingFeature" ) } # TODO: Fit to OT ontology!
|
405
|
+
)
|
406
|
+
info += "'#{@dataset.feature_name(feature)}' detected as 'MissingFeature'<br>"
|
401
407
|
else
|
402
|
-
|
408
|
+
info += "'#{@dataset.feature_name(feature)}' detected as "
|
409
|
+
types_arr = []
|
410
|
+
types.uniq.each { |t|
|
411
|
+
types_arr << t
|
412
|
+
info += "'#{t.split('#').last}', "
|
413
|
+
}
|
414
|
+
|
415
|
+
@dataset.add_feature_metadata(
|
416
|
+
feature, {RDF.type => types_arr.sort} # nominal should be first for downward compatibility
|
417
|
+
)
|
418
|
+
|
419
|
+
info.chop!.chop!
|
420
|
+
info += "<br>"
|
403
421
|
end
|
404
|
-
@dataset.add_feature_metadata(feature,{RDF.type => [type]})
|
405
|
-
info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
|
406
|
-
|
407
|
-
# TODO: rewrite feature values
|
408
|
-
# TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
|
409
422
|
end
|
410
|
-
|
411
423
|
@dataset.metadata[OT.Info] = info
|
412
424
|
|
413
425
|
warnings = ''
|
@@ -469,28 +481,31 @@ module OpenTox
|
|
469
481
|
unless @duplicate_feature_indices.include? i
|
470
482
|
|
471
483
|
value = row[i]
|
472
|
-
#LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
|
473
484
|
feature = @features[feature_idx]
|
474
485
|
|
475
486
|
type = feature_type(value) # May be NIL
|
476
|
-
type = OT.NominalFeature unless (type.nil? || regression_features[i])
|
477
487
|
@feature_types[feature] << type if type
|
488
|
+
# Add nominal type if the number of distinct values is <= @max_class_values
|
489
|
+
if type == OT.NumericFeature
|
490
|
+
@feature_types[feature] << OT.NominalFeature unless regression_features[i]
|
491
|
+
end
|
478
492
|
|
479
493
|
val = nil
|
480
494
|
case type
|
481
495
|
when OT.NumericFeature
|
482
496
|
val = value.to_f
|
497
|
+
val = nil if val.infinite?
|
483
498
|
when OT.NominalFeature
|
484
499
|
val = value.to_s
|
485
500
|
end
|
486
501
|
|
487
502
|
feature_idx += 1
|
488
503
|
|
489
|
-
if val != nil
|
504
|
+
if val != nil
|
490
505
|
@dataset.add(compound.uri, feature, val)
|
491
|
-
if
|
506
|
+
if @feature_types[feature].include? OT.NominalFeature
|
492
507
|
@dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
|
493
|
-
@dataset.features[feature][OT.acceptValue] << val
|
508
|
+
@dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val)
|
494
509
|
end
|
495
510
|
end
|
496
511
|
|
@@ -654,7 +669,7 @@ module OpenTox
|
|
654
669
|
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
|
655
670
|
table.data[compound.uri] = row
|
656
671
|
end
|
657
|
-
|
672
|
+
|
658
673
|
# find and remove ignored_features
|
659
674
|
@activity_errors = table.clean_features
|
660
675
|
table.add_to_dataset @dataset
|
data/lib/r-util.rb
CHANGED
@@ -8,6 +8,18 @@ PACKAGE_DIR = package_dir
|
|
8
8
|
|
9
9
|
require "tempfile"
|
10
10
|
|
11
|
+
class Array
|
12
|
+
|
13
|
+
def check_uniq
|
14
|
+
hash = {}
|
15
|
+
self.each do |x|
|
16
|
+
raise "duplicate #{x}" if hash[x]
|
17
|
+
hash[x] = true
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
|
11
23
|
module OpenTox
|
12
24
|
|
13
25
|
class RUtil
|
@@ -75,12 +87,10 @@ module OpenTox
|
|
75
87
|
end
|
76
88
|
|
77
89
|
# embeds feature values of two datasets into 2D and plots it
|
78
|
-
# fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
|
79
90
|
#
|
80
91
|
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
|
81
|
-
features=nil,
|
92
|
+
features=nil, subjectid=nil, waiting_task=nil)
|
82
93
|
|
83
|
-
raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
|
84
94
|
LOGGER.debug("r-util> create feature value plot")
|
85
95
|
d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
|
86
96
|
d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
|
@@ -102,17 +112,13 @@ module OpenTox
|
|
102
112
|
@r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
|
103
113
|
@r.names = [dataset_name1, dataset_name2]
|
104
114
|
LOGGER.debug("r-util> - convert data to 2d")
|
105
|
-
|
115
|
+
#@r.eval "save.image(\"/tmp/image.R\")"
|
116
|
+
@r.eval "df.2d <- plot_pre_process(df, method='sammon')"
|
106
117
|
waiting_task.progress(75) if waiting_task
|
107
118
|
|
108
|
-
if fast_plot
|
109
|
-
info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
|
110
|
-
else
|
111
|
-
info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
|
112
|
-
end
|
113
119
|
LOGGER.debug("r-util> - plot data")
|
114
120
|
plot_to_files(files) do |file|
|
115
|
-
@r.eval "plot_split( df.2d, split, names, #{
|
121
|
+
@r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
|
116
122
|
end
|
117
123
|
end
|
118
124
|
|
@@ -170,19 +176,68 @@ module OpenTox
|
|
170
176
|
end
|
171
177
|
end
|
172
178
|
|
173
|
-
# stratified splits a dataset into two dataset the feature values
|
179
|
+
# stratified splits a dataset into two datasets according to the feature values
|
180
|
+
# all features are taken into account unless <split_features> is given
|
181
|
+
# returns two datasets
|
182
|
+
def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
|
183
|
+
stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
|
184
|
+
end
|
185
|
+
|
186
|
+
# stratified splits a dataset into k datasets according to the feature values
|
174
187
|
# all features are taken into account unless <split_features> is given
|
175
|
-
|
188
|
+
# returns two arrays of datasets
|
189
|
+
def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
|
190
|
+
stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
|
191
|
+
end
|
192
|
+
|
193
|
+
private
|
194
|
+
def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
|
195
|
+
raise "internal error" if num_folds!=nil and pct!=nil
|
196
|
+
k_fold_split = num_folds!=nil
|
197
|
+
if k_fold_split
|
198
|
+
raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum)
|
199
|
+
else
|
200
|
+
raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric)
|
201
|
+
end
|
176
202
|
raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
|
203
|
+
raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0
|
204
|
+
raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String)
|
177
205
|
LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
|
178
206
|
|
179
|
-
df = dataset_to_dataframe( dataset, missing_values, subjectid
|
207
|
+
df = dataset_to_dataframe( dataset, missing_values, subjectid)
|
180
208
|
@r.eval "set.seed(#{seed})"
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
209
|
+
str_split_features = ""
|
210
|
+
if split_features
|
211
|
+
@r.split_features = split_features if split_features
|
212
|
+
str_split_features = "colnames=split_features"
|
213
|
+
end
|
214
|
+
#@r.eval "save.image(\"/tmp/image.R\")"
|
215
|
+
|
216
|
+
if k_fold_split
|
217
|
+
@r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})"
|
218
|
+
split = @r.pull 'split'
|
219
|
+
train = []
|
220
|
+
test = []
|
221
|
+
num_folds.times do |f|
|
222
|
+
datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s
|
223
|
+
metadata[DC.title] = "training "+datasetname
|
224
|
+
train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) }
|
225
|
+
metadata[DC.title] = "test "+datasetname
|
226
|
+
test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) }
|
227
|
+
end
|
228
|
+
return train, test
|
229
|
+
else
|
230
|
+
puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
|
231
|
+
@r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
|
232
|
+
split = @r.pull 'split'
|
233
|
+
metadata[DC.title] = "Training dataset split of "+dataset.uri
|
234
|
+
train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
|
235
|
+
metadata[DC.title] = "Test dataset split of "+dataset.uri
|
236
|
+
test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
|
237
|
+
return train, test
|
238
|
+
end
|
185
239
|
end
|
240
|
+
public
|
186
241
|
|
187
242
|
# dataset should be loaded completely (use Dataset.find)
|
188
243
|
# takes duplicates into account
|
@@ -212,9 +267,13 @@ module OpenTox
|
|
212
267
|
features = dataset.features.keys.sort
|
213
268
|
end
|
214
269
|
compounds = []
|
270
|
+
compound_names = []
|
215
271
|
dataset.compounds.each do |c|
|
272
|
+
count = 0
|
216
273
|
num_compounds[c].times do |i|
|
217
274
|
compounds << c
|
275
|
+
compound_names << "#{c}$#{count}"
|
276
|
+
count+=1
|
218
277
|
end
|
219
278
|
end
|
220
279
|
|
@@ -238,7 +297,7 @@ module OpenTox
|
|
238
297
|
end
|
239
298
|
end
|
240
299
|
df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
|
241
|
-
assign_dataframe(df_name,d_values,
|
300
|
+
assign_dataframe(df_name,d_values,compound_names,features)
|
242
301
|
|
243
302
|
# set dataframe column types accordingly
|
244
303
|
f_count = 1 #R starts at 1
|
@@ -264,25 +323,27 @@ module OpenTox
|
|
264
323
|
|
265
324
|
# converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
|
266
325
|
# this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
|
267
|
-
def dataframe_to_dataset( df, subjectid=nil )
|
268
|
-
dataframe_to_dataset_indices( df, subjectid, nil)
|
326
|
+
def dataframe_to_dataset( df, metadata={}, subjectid=nil )
|
327
|
+
dataframe_to_dataset_indices( df, metadata, subjectid, nil)
|
269
328
|
end
|
270
329
|
|
271
330
|
private
|
272
|
-
def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
|
331
|
+
def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
|
273
332
|
raise unless @@feats[df].size>0
|
274
|
-
values,
|
333
|
+
values, compound_names, features = pull_dataframe(df)
|
334
|
+
compounds = compound_names.collect{|c| c.split("$")[0]}
|
275
335
|
features.each{|f| raise unless @@feats[df][f]}
|
276
336
|
dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
|
337
|
+
dataset.add_metadata(metadata)
|
277
338
|
LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
|
278
339
|
compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
|
279
340
|
features.each{|f| dataset.add_feature(f,@@feats[df][f])}
|
280
341
|
features.size.times do |c|
|
281
342
|
feat = OpenTox::Feature.find(features[c],subjectid)
|
282
|
-
|
343
|
+
numeric = feat.metadata[RDF.type].to_a.flatten.include?(OT.NumericFeature)
|
283
344
|
compounds.size.times do |r|
|
284
345
|
if compound_indices==nil or compound_indices.include?(r)
|
285
|
-
dataset.add(compounds[r],features[c],
|
346
|
+
dataset.add(compounds[r],features[c],numeric ? values[r][c].to_f : values[r][c]) if values[r][c]!="NA"
|
286
347
|
end
|
287
348
|
end
|
288
349
|
end
|
@@ -290,16 +351,12 @@ module OpenTox
|
|
290
351
|
dataset
|
291
352
|
end
|
292
353
|
|
293
|
-
def
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
|
300
|
-
sets << dataset
|
301
|
-
end
|
302
|
-
sets
|
354
|
+
def split_to_dataset( df, split, metadata={}, subjectid=nil )
|
355
|
+
indices = []
|
356
|
+
split.size.times{|i| indices<<i if yield(split[i]) }
|
357
|
+
dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
|
358
|
+
LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
|
359
|
+
dataset
|
303
360
|
end
|
304
361
|
|
305
362
|
def pull_dataframe(df)
|
@@ -323,6 +380,8 @@ module OpenTox
|
|
323
380
|
end
|
324
381
|
|
325
382
|
def assign_dataframe(df,input,rownames,colnames)
|
383
|
+
rownames.check_uniq if rownames
|
384
|
+
colnames.check_uniq if colnames
|
326
385
|
tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
|
327
386
|
file = File.new(tmp, 'w')
|
328
387
|
input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
|