opentox-ruby 3.1.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +19 -9
- data/README.markdown +1 -1
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/lib/algorithm.rb +143 -37
- data/lib/compound.rb +66 -18
- data/lib/dataset.rb +38 -3
- data/lib/model.rb +36 -13
- data/lib/parser.rb +34 -19
- data/lib/r-util.rb +93 -34
- data/lib/serializer.rb +70 -22
- data/lib/stratification.R +71 -7
- data/lib/transform.rb +5 -3
- data/lib/utils.rb +356 -97
- data/lib/validation.rb +6 -4
- metadata +20 -4
data/lib/dataset.rb
CHANGED
@@ -197,7 +197,12 @@ module OpenTox
|
|
197
197
|
accept_values
|
198
198
|
end
|
199
199
|
|
200
|
-
# Detect feature type(
|
200
|
+
# Detect feature type (reduced to one across all features)
|
201
|
+
# Classification takes precedence over regression
|
202
|
+
# DEPRECATED --
|
203
|
+
# HAS NO SENSE FOR DATASETS WITH MORE THAN 1 FEATURE
|
204
|
+
# FEATURES CAN HAVE MULTIPLE TYPES
|
205
|
+
# Replacement: see feature_types()
|
201
206
|
# @return [String] "classification", "regression", "mixed" or "unknown"
|
202
207
|
def feature_type(subjectid=nil)
|
203
208
|
load_features(subjectid)
|
@@ -210,6 +215,24 @@ module OpenTox
|
|
210
215
|
"unknown"
|
211
216
|
end
|
212
217
|
end
|
218
|
+
|
219
|
+
|
220
|
+
# Detect feature types. A feature can have multiple types.
|
221
|
+
# Returns types hashed by feature URI, with missing features omitted.
|
222
|
+
# Example (YAML):
|
223
|
+
# http://toxcreate3.in-silico.ch:8082/dataset/152/feature/nHal:
|
224
|
+
# - http://www.opentox.org/api/1.1#NumericFeature
|
225
|
+
# - http://www.opentox.org/api/1.1#NominalFeature
|
226
|
+
# ...
|
227
|
+
#
|
228
|
+
# @return [Hash] Keys: feature URIs, Values: Array of types
|
229
|
+
def feature_types(subjectid=nil)
|
230
|
+
load_features(subjectid)
|
231
|
+
@features.inject({}){ |h,(f,metadata)|
|
232
|
+
h[f]=metadata[RDF.type] unless metadata[RDF.type][0].include? "MissingFeature"
|
233
|
+
h
|
234
|
+
}
|
235
|
+
end
|
213
236
|
=begin
|
214
237
|
=end
|
215
238
|
|
@@ -316,11 +339,14 @@ module OpenTox
|
|
316
339
|
end
|
317
340
|
|
318
341
|
# Complete feature values by adding zeroes
|
319
|
-
|
342
|
+
# @param [Hash] key: compound, value: duplicate sizes
|
343
|
+
def complete_data_entries(compound_sizes)
|
320
344
|
all_features = @features.keys
|
321
345
|
@data_entries.each { |c, e|
|
322
346
|
(Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
|
323
|
-
|
347
|
+
compound_sizes[c].times {
|
348
|
+
self.add(c,f,0)
|
349
|
+
}
|
324
350
|
}
|
325
351
|
}
|
326
352
|
end
|
@@ -454,6 +480,14 @@ module OpenTox
|
|
454
480
|
end
|
455
481
|
end
|
456
482
|
|
483
|
+
def value_map(prediction_feature_uri)
|
484
|
+
training_classes = accept_values(prediction_feature_uri).sort
|
485
|
+
value_map=Hash.new
|
486
|
+
training_classes.each_with_index { |c,i| value_map[i+1] = c }
|
487
|
+
value_map
|
488
|
+
end
|
489
|
+
|
490
|
+
|
457
491
|
private
|
458
492
|
# Copy a dataset (rewrites URI)
|
459
493
|
def copy(dataset)
|
@@ -504,6 +538,7 @@ module OpenTox
|
|
504
538
|
@data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/neighbor/)}.compact if @data_entries[compound.uri]
|
505
539
|
end
|
506
540
|
|
541
|
+
|
507
542
|
# def errors(compound)
|
508
543
|
# features = @data_entries[compound.uri].keys
|
509
544
|
# features.collect{|f| @features[f][OT.error]}.join(" ") if features
|
data/lib/model.rb
CHANGED
@@ -103,7 +103,7 @@ module OpenTox
|
|
103
103
|
include Model
|
104
104
|
|
105
105
|
|
106
|
-
attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
|
106
|
+
attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors, :compounds
|
107
107
|
def initialize(uri=nil)
|
108
108
|
|
109
109
|
if uri
|
@@ -169,12 +169,13 @@ module OpenTox
|
|
169
169
|
lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
|
170
170
|
lazar.subjectid = hash["subjectid"] if hash["subjectid"]
|
171
171
|
lazar.value_map = hash["value_map"] if hash["value_map"]
|
172
|
+
lazar.compounds = hash["compounds"] if hash["compounds"]
|
172
173
|
|
173
174
|
lazar
|
174
175
|
end
|
175
176
|
|
176
177
|
def to_json
|
177
|
-
Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
|
178
|
+
Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map, :compounds => @compounds})
|
178
179
|
end
|
179
180
|
|
180
181
|
def run( params, accept_header=nil, waiting_task=nil )
|
@@ -237,6 +238,7 @@ module OpenTox
|
|
237
238
|
|
238
239
|
@compound = Compound.new compound_uri
|
239
240
|
features = {}
|
241
|
+
|
240
242
|
#LOGGER.debug self.to_yaml
|
241
243
|
unless @prediction_dataset
|
242
244
|
@prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
|
@@ -247,19 +249,19 @@ module OpenTox
|
|
247
249
|
OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
|
248
250
|
} )
|
249
251
|
end
|
250
|
-
|
251
|
-
all_activities = []
|
252
|
-
all_activities = @activities.values.flatten.collect! { |i| i.to_f }
|
253
|
-
end
|
252
|
+
|
254
253
|
unless database_activity(subjectid) # adds database activity to @prediction_dataset
|
254
|
+
|
255
255
|
# Calculation of needed values for query compound
|
256
256
|
@compound_features = eval("#{@feature_calculation_algorithm}({
|
257
257
|
:compound => @compound,
|
258
258
|
:features => @features,
|
259
259
|
:feature_dataset_uri => @metadata[OT.featureDataset],
|
260
260
|
:pc_type => self.parameter(\"pc_type\"),
|
261
|
+
:lib => self.parameter(\"lib\"),
|
261
262
|
:subjectid => subjectid
|
262
263
|
})")
|
264
|
+
|
263
265
|
# Adding fingerprint of query compound with features and values(p_value*nr_hits)
|
264
266
|
@compound_fingerprints = {}
|
265
267
|
@compound_features.each do |feature, value| # value is nil if "Substructure.match"
|
@@ -314,6 +316,16 @@ module OpenTox
|
|
314
316
|
@prediction_dataset.add @compound.uri, feature_uri, true
|
315
317
|
f+=1
|
316
318
|
end
|
319
|
+
elsif @feature_calculation_algorithm == "Substructure.lookup"
|
320
|
+
f = 0
|
321
|
+
@compound_features.each do |feature, value|
|
322
|
+
features[feature] = feature
|
323
|
+
@prediction_dataset.add_feature(feature, {
|
324
|
+
RDF.type => [OT.NumericFeature]
|
325
|
+
})
|
326
|
+
@prediction_dataset.add @compound.uri, feature, value
|
327
|
+
f+=1
|
328
|
+
end
|
317
329
|
else
|
318
330
|
@compound_features.each do |feature|
|
319
331
|
features[feature] = feature
|
@@ -337,15 +349,26 @@ module OpenTox
|
|
337
349
|
else
|
338
350
|
feature_uri = feature
|
339
351
|
end
|
340
|
-
@
|
352
|
+
if @feature_calculation_algorithm == "Substructure.lookup"
|
353
|
+
@prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
|
354
|
+
else
|
355
|
+
@prediction_dataset.add neighbor[:compound], feature_uri, true
|
356
|
+
end
|
357
|
+
|
341
358
|
unless features.has_key? feature
|
342
359
|
features[feature] = feature_uri
|
343
|
-
@
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
360
|
+
if @feature_calculation_algorithm == "Substructure.lookup"
|
361
|
+
@prediction_dataset.add_feature(feature_uri, {
|
362
|
+
RDF.type => [OT.NumericFeature]
|
363
|
+
})
|
364
|
+
else
|
365
|
+
@prediction_dataset.add_feature(feature_uri, {
|
366
|
+
RDF.type => [OT.Substructure],
|
367
|
+
OT.smarts => feature,
|
368
|
+
OT.pValue => @p_values[feature],
|
369
|
+
OT.effect => @effects[feature]
|
370
|
+
})
|
371
|
+
end
|
349
372
|
f+=1
|
350
373
|
end
|
351
374
|
end
|
data/lib/parser.rb
CHANGED
@@ -349,11 +349,15 @@ module OpenTox
|
|
349
349
|
|
350
350
|
# Load CSV string (format specification: http://toxcreate.org/help)
|
351
351
|
# @param [String] csv CSV representation of the dataset
|
352
|
+
# @param [Boolean] drop_missing Whether completely missing rows should be dropped
|
353
|
+
# @param [Boolean] all_numeric Whether all features should be treated as numeric
|
354
|
+
# @param [Boolean] del_nominal All nominal features will be removed
|
352
355
|
# @return [OpenTox::Dataset] Dataset object with CSV data
|
353
|
-
def load_csv(csv, drop_missing=false)
|
356
|
+
def load_csv(csv, drop_missing=false, all_numeric=false)
|
354
357
|
row = 0
|
355
358
|
input = csv.split("\n")
|
356
359
|
headers = split_row(input.shift)
|
360
|
+
headers.collect! {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
|
357
361
|
add_features(headers)
|
358
362
|
value_maps = Array.new
|
359
363
|
regression_features=Array.new
|
@@ -362,7 +366,7 @@ module OpenTox
|
|
362
366
|
row = split_row(row)
|
363
367
|
value_maps = detect_new_values(row, value_maps)
|
364
368
|
value_maps.each_with_index { |vm,j|
|
365
|
-
if vm.size > @max_class_values # max @max_class_values classes.
|
369
|
+
if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
|
366
370
|
regression_features[j]=true
|
367
371
|
else
|
368
372
|
regression_features[j]=false
|
@@ -392,22 +396,30 @@ module OpenTox
|
|
392
396
|
|
393
397
|
def warnings
|
394
398
|
|
395
|
-
info = ''
|
399
|
+
info = '<br>'
|
396
400
|
@feature_types.each do |feature,types|
|
401
|
+
@dataset.add_feature_metadata(feature,{RDF.type => []})
|
397
402
|
if types.uniq.size == 0
|
398
|
-
|
399
|
-
|
400
|
-
|
403
|
+
@dataset.add_feature_metadata(
|
404
|
+
feature, {RDF.type => ( @dataset.features[feature][RDF.type] << "helper#MissingFeature" ) } # TODO: Fit to OT ontology!
|
405
|
+
)
|
406
|
+
info += "'#{@dataset.feature_name(feature)}' detected as 'MissingFeature'<br>"
|
401
407
|
else
|
402
|
-
|
408
|
+
info += "'#{@dataset.feature_name(feature)}' detected as "
|
409
|
+
types_arr = []
|
410
|
+
types.uniq.each { |t|
|
411
|
+
types_arr << t
|
412
|
+
info += "'#{t.split('#').last}', "
|
413
|
+
}
|
414
|
+
|
415
|
+
@dataset.add_feature_metadata(
|
416
|
+
feature, {RDF.type => types_arr.sort} # nominal should be first for downward compatibility
|
417
|
+
)
|
418
|
+
|
419
|
+
info.chop!.chop!
|
420
|
+
info += "<br>"
|
403
421
|
end
|
404
|
-
@dataset.add_feature_metadata(feature,{RDF.type => [type]})
|
405
|
-
info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
|
406
|
-
|
407
|
-
# TODO: rewrite feature values
|
408
|
-
# TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
|
409
422
|
end
|
410
|
-
|
411
423
|
@dataset.metadata[OT.Info] = info
|
412
424
|
|
413
425
|
warnings = ''
|
@@ -469,28 +481,31 @@ module OpenTox
|
|
469
481
|
unless @duplicate_feature_indices.include? i
|
470
482
|
|
471
483
|
value = row[i]
|
472
|
-
#LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
|
473
484
|
feature = @features[feature_idx]
|
474
485
|
|
475
486
|
type = feature_type(value) # May be NIL
|
476
|
-
type = OT.NominalFeature unless (type.nil? || regression_features[i])
|
477
487
|
@feature_types[feature] << type if type
|
488
|
+
# Add nominal type if number of distinct values <= @max_class_values
|
489
|
+
if type == OT.NumericFeature
|
490
|
+
@feature_types[feature] << OT.NominalFeature unless regression_features[i]
|
491
|
+
end
|
478
492
|
|
479
493
|
val = nil
|
480
494
|
case type
|
481
495
|
when OT.NumericFeature
|
482
496
|
val = value.to_f
|
497
|
+
val = nil if val.infinite?
|
483
498
|
when OT.NominalFeature
|
484
499
|
val = value.to_s
|
485
500
|
end
|
486
501
|
|
487
502
|
feature_idx += 1
|
488
503
|
|
489
|
-
if val != nil
|
504
|
+
if val != nil
|
490
505
|
@dataset.add(compound.uri, feature, val)
|
491
|
-
if
|
506
|
+
if @feature_types[feature].include? OT.NominalFeature
|
492
507
|
@dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
|
493
|
-
@dataset.features[feature][OT.acceptValue] << val
|
508
|
+
@dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val)
|
494
509
|
end
|
495
510
|
end
|
496
511
|
|
@@ -654,7 +669,7 @@ module OpenTox
|
|
654
669
|
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
|
655
670
|
table.data[compound.uri] = row
|
656
671
|
end
|
657
|
-
|
672
|
+
|
658
673
|
# find and remove ignored_features
|
659
674
|
@activity_errors = table.clean_features
|
660
675
|
table.add_to_dataset @dataset
|
data/lib/r-util.rb
CHANGED
@@ -8,6 +8,18 @@ PACKAGE_DIR = package_dir
|
|
8
8
|
|
9
9
|
require "tempfile"
|
10
10
|
|
11
|
+
class Array
|
12
|
+
|
13
|
+
def check_uniq
|
14
|
+
hash = {}
|
15
|
+
self.each do |x|
|
16
|
+
raise "duplicate #{x}" if hash[x]
|
17
|
+
hash[x] = true
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
|
11
23
|
module OpenTox
|
12
24
|
|
13
25
|
class RUtil
|
@@ -75,12 +87,10 @@ module OpenTox
|
|
75
87
|
end
|
76
88
|
|
77
89
|
# embeds feature values of two datasets into 2D and plots it
|
78
|
-
# fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
|
79
90
|
#
|
80
91
|
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
|
81
|
-
features=nil,
|
92
|
+
features=nil, subjectid=nil, waiting_task=nil)
|
82
93
|
|
83
|
-
raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
|
84
94
|
LOGGER.debug("r-util> create feature value plot")
|
85
95
|
d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
|
86
96
|
d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
|
@@ -102,17 +112,13 @@ module OpenTox
|
|
102
112
|
@r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
|
103
113
|
@r.names = [dataset_name1, dataset_name2]
|
104
114
|
LOGGER.debug("r-util> - convert data to 2d")
|
105
|
-
|
115
|
+
#@r.eval "save.image(\"/tmp/image.R\")"
|
116
|
+
@r.eval "df.2d <- plot_pre_process(df, method='sammon')"
|
106
117
|
waiting_task.progress(75) if waiting_task
|
107
118
|
|
108
|
-
if fast_plot
|
109
|
-
info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
|
110
|
-
else
|
111
|
-
info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
|
112
|
-
end
|
113
119
|
LOGGER.debug("r-util> - plot data")
|
114
120
|
plot_to_files(files) do |file|
|
115
|
-
@r.eval "plot_split( df.2d, split, names, #{
|
121
|
+
@r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
|
116
122
|
end
|
117
123
|
end
|
118
124
|
|
@@ -170,19 +176,68 @@ module OpenTox
|
|
170
176
|
end
|
171
177
|
end
|
172
178
|
|
173
|
-
# stratified splits a dataset into two dataset the feature values
|
179
|
+
# stratified splits a dataset into two datasets according to the feature values
|
180
|
+
# all features are taken into account unless <split_features> is given
|
181
|
+
# returns two datasets
|
182
|
+
def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
|
183
|
+
stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
|
184
|
+
end
|
185
|
+
|
186
|
+
# stratified splits a dataset into k datasets according to the feature values
|
174
187
|
# all features are taken into account unless <split_features> is given
|
175
|
-
|
188
|
+
# returns two arrays of datasets
|
189
|
+
def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
|
190
|
+
stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
|
191
|
+
end
|
192
|
+
|
193
|
+
private
|
194
|
+
def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
|
195
|
+
raise "internal error" if num_folds!=nil and pct!=nil
|
196
|
+
k_fold_split = num_folds!=nil
|
197
|
+
if k_fold_split
|
198
|
+
raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum)
|
199
|
+
else
|
200
|
+
raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric)
|
201
|
+
end
|
176
202
|
raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
|
203
|
+
raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0
|
204
|
+
raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String)
|
177
205
|
LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
|
178
206
|
|
179
|
-
df = dataset_to_dataframe( dataset, missing_values, subjectid
|
207
|
+
df = dataset_to_dataframe( dataset, missing_values, subjectid)
|
180
208
|
@r.eval "set.seed(#{seed})"
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
209
|
+
str_split_features = ""
|
210
|
+
if split_features
|
211
|
+
@r.split_features = split_features if split_features
|
212
|
+
str_split_features = "colnames=split_features"
|
213
|
+
end
|
214
|
+
#@r.eval "save.image(\"/tmp/image.R\")"
|
215
|
+
|
216
|
+
if k_fold_split
|
217
|
+
@r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})"
|
218
|
+
split = @r.pull 'split'
|
219
|
+
train = []
|
220
|
+
test = []
|
221
|
+
num_folds.times do |f|
|
222
|
+
datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s
|
223
|
+
metadata[DC.title] = "training "+datasetname
|
224
|
+
train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) }
|
225
|
+
metadata[DC.title] = "test "+datasetname
|
226
|
+
test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) }
|
227
|
+
end
|
228
|
+
return train, test
|
229
|
+
else
|
230
|
+
puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
|
231
|
+
@r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
|
232
|
+
split = @r.pull 'split'
|
233
|
+
metadata[DC.title] = "Training dataset split of "+dataset.uri
|
234
|
+
train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
|
235
|
+
metadata[DC.title] = "Test dataset split of "+dataset.uri
|
236
|
+
test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
|
237
|
+
return train, test
|
238
|
+
end
|
185
239
|
end
|
240
|
+
public
|
186
241
|
|
187
242
|
# dataset should be loaded completely (use Dataset.find)
|
188
243
|
# takes duplicates into account
|
@@ -212,9 +267,13 @@ module OpenTox
|
|
212
267
|
features = dataset.features.keys.sort
|
213
268
|
end
|
214
269
|
compounds = []
|
270
|
+
compound_names = []
|
215
271
|
dataset.compounds.each do |c|
|
272
|
+
count = 0
|
216
273
|
num_compounds[c].times do |i|
|
217
274
|
compounds << c
|
275
|
+
compound_names << "#{c}$#{count}"
|
276
|
+
count+=1
|
218
277
|
end
|
219
278
|
end
|
220
279
|
|
@@ -238,7 +297,7 @@ module OpenTox
|
|
238
297
|
end
|
239
298
|
end
|
240
299
|
df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
|
241
|
-
assign_dataframe(df_name,d_values,
|
300
|
+
assign_dataframe(df_name,d_values,compound_names,features)
|
242
301
|
|
243
302
|
# set dataframe column types accordingly
|
244
303
|
f_count = 1 #R starts at 1
|
@@ -264,25 +323,27 @@ module OpenTox
|
|
264
323
|
|
265
324
|
# converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
|
266
325
|
# this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
|
267
|
-
def dataframe_to_dataset( df, subjectid=nil )
|
268
|
-
dataframe_to_dataset_indices( df, subjectid, nil)
|
326
|
+
def dataframe_to_dataset( df, metadata={}, subjectid=nil )
|
327
|
+
dataframe_to_dataset_indices( df, metadata, subjectid, nil)
|
269
328
|
end
|
270
329
|
|
271
330
|
private
|
272
|
-
def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
|
331
|
+
def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
|
273
332
|
raise unless @@feats[df].size>0
|
274
|
-
values,
|
333
|
+
values, compound_names, features = pull_dataframe(df)
|
334
|
+
compounds = compound_names.collect{|c| c.split("$")[0]}
|
275
335
|
features.each{|f| raise unless @@feats[df][f]}
|
276
336
|
dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
|
337
|
+
dataset.add_metadata(metadata)
|
277
338
|
LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
|
278
339
|
compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
|
279
340
|
features.each{|f| dataset.add_feature(f,@@feats[df][f])}
|
280
341
|
features.size.times do |c|
|
281
342
|
feat = OpenTox::Feature.find(features[c],subjectid)
|
282
|
-
|
343
|
+
numeric = feat.metadata[RDF.type].to_a.flatten.include?(OT.NumericFeature)
|
283
344
|
compounds.size.times do |r|
|
284
345
|
if compound_indices==nil or compound_indices.include?(r)
|
285
|
-
dataset.add(compounds[r],features[c],
|
346
|
+
dataset.add(compounds[r],features[c],numeric ? values[r][c].to_f : values[r][c]) if values[r][c]!="NA"
|
286
347
|
end
|
287
348
|
end
|
288
349
|
end
|
@@ -290,16 +351,12 @@ module OpenTox
|
|
290
351
|
dataset
|
291
352
|
end
|
292
353
|
|
293
|
-
def
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
|
300
|
-
sets << dataset
|
301
|
-
end
|
302
|
-
sets
|
354
|
+
def split_to_dataset( df, split, metadata={}, subjectid=nil )
|
355
|
+
indices = []
|
356
|
+
split.size.times{|i| indices<<i if yield(split[i]) }
|
357
|
+
dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
|
358
|
+
LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
|
359
|
+
dataset
|
303
360
|
end
|
304
361
|
|
305
362
|
def pull_dataframe(df)
|
@@ -323,6 +380,8 @@ module OpenTox
|
|
323
380
|
end
|
324
381
|
|
325
382
|
def assign_dataframe(df,input,rownames,colnames)
|
383
|
+
rownames.check_uniq if rownames
|
384
|
+
colnames.check_uniq if colnames
|
326
385
|
tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
|
327
386
|
file = File.new(tmp, 'w')
|
328
387
|
input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
|