opentox-ruby 3.1.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ChangeLog CHANGED
@@ -1,13 +1,23 @@
+ v4.0.0 2012-07-12
+ * fminer addition of compounds fixed
+ * improved performance for CSV download
+ * switch to opentox-ruby version 4.0.0
+
+ 2012-04-20
+ * Support for joelib and openbabel descriptors in a completely unified interface with CDK (Ambit)
+ * Features can have multiple types (nominal and numeric), PC descriptors have detailed meta data
+ * Myriads of bugfixes to CSV download code (e.g. missing descriptors, handling of duplicates)
+
  v3.1.0 2012-02-24
- * utils.rb: added for special routines (e.g. descriptor calculation)
- * task.rb: Polling with increasing interval
- * parser.rb: CSV up and download fixed
- * transform.rb: routines to create machine learning data matrices
- * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
- gauss() removed
+ * utils.rb: added for special routines (e.g. descriptor calculation)
+ * task.rb: Polling with increasing interval
+ * parser.rb: CSV up and download fixed
+ * transform.rb: routines to create machine learning data matrices
+ * algorithm.rb: SVM parameter grid search, cos similarity as algorithm, gauss() removed

  v3.0.1 2011-10-19
- * feature: model registration to ontology service
- * ontology lib gets endpoints from ontology service
+ * feature: model registration to ontology service
+ * ontology lib gets endpoints from ontology service
+
  v3.0.0 2011-09-23
- * datasets stored as json (with Yajl) to improve performance
+ * datasets stored as json (with Yajl) to improve performance
@@ -38,4 +38,4 @@ This example shows how to create a lazar model and predict a compound, it assume
  Copyright
  ---------

- Copyright (c) 2009-2011 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
+ Copyright (c) 2009-2012 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
data/Rakefile CHANGED
@@ -42,9 +42,10 @@ begin
  gem.add_dependency "dm-migrations", "=1.1.0"
  gem.add_dependency "dm-validations", "=1.1.0"
  gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
- gem.add_dependency "ruby-plot", "=0.6.0"
+ gem.add_dependency "ruby-plot", "=0.6.1"
  gem.add_dependency "gsl", "=1.14.7"
  gem.add_dependency "statsample", "=1.1.0"
+ gem.add_dependency "redis", "=2.2.2"

  gem.add_development_dependency 'jeweler'
  gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
data/VERSION CHANGED
@@ -1 +1 @@
- 3.1.0
+ 4.0.0
@@ -56,25 +56,73 @@ module OpenTox

  def check_params(params,per_mil,subjectid=nil)
  raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
- raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
- @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
  @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
+
+ unless params[:prediction_feature] # try to read prediction_feature from dataset
+ raise OpenTox::NotFoundError.new "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1
+ prediction_feature = OpenTox::Feature.find(@training_dataset.features.keys.first,@subjectid)
+ params[:prediction_feature] = prediction_feature.uri
+ end
+ @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+
  raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])

  unless params[:min_frequency].nil?
- @minfreq=params[:min_frequency].to_i
- raise "Minimum frequency must be a number >0!" unless @minfreq>0
- else
- @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ # check for percentage
+ if params[:min_frequency].include? "pc"
+ per_mil=params[:min_frequency].gsub(/pc/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i * 10
+ else
+ bad_request=true
+ end
+ # check for per-mil
+ elsif params[:min_frequency].include? "pm"
+ per_mil=params[:min_frequency].gsub(/pm/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i
+ else
+ bad_request=true
+ end
+ # set minfreq directly
+ else
+ if OpenTox::Algorithm.numeric? params[:min_frequency]
+ @minfreq=params[:min_frequency].to_i
+ LOGGER.debug "min_frequency #{@minfreq}"
+ else
+ bad_request=true
+ end
+ end
+ raise OpenTox::BadRequestError.new "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm , with n greater 0" if bad_request
+ end
+ if @minfreq.nil?
+ @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil)
+ LOGGER.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
  end
  end

- def add_fminer_data(fminer_instance, params, value_map)
+ def add_fminer_data(fminer_instance, value_map)
+
+
+ # detect nr duplicates per compound
+ compound_sizes = {}
+ @training_dataset.compounds.each do |compound|
+ entries=@training_dataset.data_entries[compound]
+ entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = []
+ compound_sizes[compound] << values.size unless values.size == 0
+ end
+ compound_sizes[compound].uniq!
+ raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1
+ compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+ end

  id = 1 # fminer start id is not 0
- @training_dataset.data_entries.each do |compound,entry|
+
+ @training_dataset.compounds.each do |compound|
+ entry=@training_dataset.data_entries[compound]
  begin
- smiles = OpenTox::Compound.smiles(compound.to_s)
+ smiles = OpenTox::Compound.new(compound).to_smiles
  rescue
  LOGGER.warn "No resource for #{compound.to_s}"
  next
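The reworked check_params above accepts min_frequency either as an absolute count, as a percentage ("pc" suffix, converted to per-mil by multiplying by ten), or as a per-mil value ("pm" suffix). A minimal standalone Ruby sketch of those parsing rules (not library code; the method name and return convention are made up for illustration):

    # Returns [explicit_minfreq_or_nil, per_mil] following the rules in check_params.
    def parse_min_frequency(value, default_per_mil)
      return [nil, default_per_mil] if value.nil?            # derive minfreq from the dataset
      case value
      when /\A(\d+)pc\z/ then [nil, $1.to_i * 10]            # percentage -> per-mil
      when /\A(\d+)pm\z/ then [nil, $1.to_i]                 # per-mil as given
      when /\A\d+\z/     then [value.to_i, default_per_mil]  # absolute minimum frequency
      else raise "Minimum frequency must be [n], [n]pc or [n]pm"
      end
    end

    parse_min_frequency("8pc", 50)  # => [nil, 80]
    parse_min_frequency("10", 50)   # => [10, 50]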
@@ -84,32 +132,31 @@ module OpenTox
  next
  end

- value_map=params[:value_map] unless params[:value_map].nil?
  entry.each do |feature,values|
  if feature == @prediction_feature.uri
- values.each do |value|
- if value.nil?
+ (0...compound_sizes[compound]).each { |i|
+ if values[i].nil?
  LOGGER.warn "No #{feature} activity for #{compound.to_s}."
  else
  if @prediction_feature.feature_type == "classification"
- activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
+ activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n
  @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
  elsif @prediction_feature.feature_type == "regression"
- activity= value.to_f
+ activity= values[i].to_f
  end
  begin
- fminer_instance.AddCompound(smiles,id)
- fminer_instance.AddActivity(activity, id)
+ fminer_instance.AddCompound(smiles,id) if fminer_instance
+ fminer_instance.AddActivity(activity, id) if fminer_instance
  @all_activities[id]=activity # DV: insert global information
  @compounds[id] = compound
  @smi[id] = smiles
  id += 1
  rescue Exception => e
- LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer"
  LOGGER.warn e.backtrace
  end
  end
- end
+ }
  end
  end
  end
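For classification endpoints, add_fminer_data maps each activity value back to its 1..n code via value_map.invert before handing it to fminer. A small illustration with a hypothetical two-class value_map (not taken from the diff):

    value_map = { 1 => "inactive", 2 => "active" }   # hypothetical mapping, codes 1..n
    value_map.invert["active"]                       # => 2, the integer passed to AddActivity
    value_map.invert["inactive"]                     # => 1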
@@ -380,11 +427,11 @@ module OpenTox
  prediction = acts[0]
  else
  #LOGGER.debug gram_matrix.to_yaml
- @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
- @r.eval "set.seed(1)"
+ @r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
  @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
  @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore"
  @r.eval "registerDoMC()" # switch on parallel processing
+ @r.eval "set.seed(1)"
  begin

  # set data
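The prediction code drives R through RinRuby (the @r instance above). A minimal round trip, independent of the lazar code and assuming the rinruby gem is installed, looks roughly like this:

    require "rinruby"

    r = RinRuby.new(true, false)     # same constructor arguments as in the diff
    r.assign "y", [1.0, 2.0, 3.0]    # push a Ruby array into R
    r.eval "m <- mean(y)"            # run R code
    puts r.pull("m")                 # => 2.0, read the result back into Ruby
    r.quit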
@@ -400,7 +447,14 @@ module OpenTox

  # prepare data
  LOGGER.debug "Preparing R data ..."
- @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
+ @r.eval <<-EOR
+ weights=NULL
+ if (class(y) == 'character') {
+ y = factor(y)
+ suppressPackageStartupMessages(library('class'))
+ #weights=unlist(as.list(prop.table(table(y))))
+ }
+ EOR

  @r.eval <<-EOR
  rem = nearZeroVar(prop_matrix)
@@ -417,8 +471,18 @@ module OpenTox

  # model + support vectors
  LOGGER.debug "Creating R SVM model ..."
- @r.eval <<-EOR
- model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
+ train_success = @r.eval <<-EOR
+ # AM: TODO: evaluate class weight effect by altering:
+ # AM: comment in 'weights' above run and class.weights=weights vs. class.weights=1-weights
+ # AM: vs
+ # AM: comment out 'weights' above (status quo), thereby disabling weights
+ model = train(prop_matrix,y,
+ method="svmradial",
+ preProcess=c("center", "scale"),
+ class.weights=weights,
+ trControl=trainControl(method="LGOCV",number=10),
+ tuneLength=8
+ )
  perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
  EOR

@@ -431,6 +495,7 @@ module OpenTox

  # censoring
  prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
+ prediction = nil unless train_success
  LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
  rescue Exception => e
  LOGGER.debug "#{e.class}: #{e.message}"
@@ -456,30 +521,42 @@ module OpenTox
  @r.del_missing = params[:del_missing] == true ? 1 : 0
  r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
  @r.f_fds_r = r_result_file.to_s
-
+
  # need packs 'randomForest', 'RANN'
  @r.eval <<-EOR
- set.seed(1)
  suppressPackageStartupMessages(library('caret'))
  suppressPackageStartupMessages(library('randomForest'))
  suppressPackageStartupMessages(library('RANN'))
  suppressPackageStartupMessages(library('doMC'))
  registerDoMC()
-
+ set.seed(1)
+
  acts = read.csv(ds_csv_file, check.names=F)
  feats = read.csv(fds_csv_file, check.names=F)
  ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
-
+
  features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
  y = ds[,which(names(ds) == prediction_feature)]
-
+
  # assumes a data matrix 'features' and a vector 'y' of target values
  row.names(features)=NULL
-
+
+ # features with all values missing removed
+ na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
+ features = features[,!names(features) %in% na_col]
+
+ # features with infinite values removed
+ inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+ features = features[,!names(features) %in% inf_col]
+
+ # features with zero variance removed
+ zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
+ features = features[,!names(features) %in% zero_var]
+
  pp = NULL
  if (del_missing) {
  # needed if rows should be removed
- na_ids = apply(features,1,function(x)any(is.na(x)))
+ na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
  features = features[!na_ids,]
  y = y[!na_ids]
  pp = preProcess(features, method=c("scale", "center"))
@@ -488,17 +565,23 @@ module OpenTox
  pp = preProcess(features, method=c("scale", "center", "knnImpute"))
  }
  features = predict(pp, features)
-
+
+ # features with nan values removed (sometimes preProcess return NaN values)
+ nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+ features = features[,!names(features) %in% nan_col]
+
  # determine subsets
- subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
- subsets = c(2,3,4,5,7,10,subsets)
+ subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+ #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+ #subsets = c(2,3,4,5,7,10,subsets)
+ #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
  subsets = unique(sort(round(subsets)))
  subsets = subsets[subsets<=dim(features)[2]]
  subsets = subsets[subsets>1]
-
+
  # Recursive feature elimination
- rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
-
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
+
  # read existing dataset and select most useful features
  csv=feats[,c("SMILES", rfProfile$optVariables)]
  write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
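The new subset grid above scales the number of remaining descriptor columns by 0.3, 0.32, ..., 0.7 before rounding and deduplicating; with 50 columns, for instance, the candidate sizes handed to rfe become 15, 16, ..., 35. A tiny Ruby mirror of that arithmetic (illustration only; the library does this in R):

    n_features = 50                                   # hypothetical column count
    fractions  = (0..20).map { |i| 0.3 + 0.02 * i }   # 0.3, 0.32, ..., 0.7
    subsets    = fractions.map { |f| (n_features * f).round }.uniq.sort
    subsets    = subsets.select { |s| s > 1 && s <= n_features }
    # => [15, 16, 17, ..., 35]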
@@ -527,7 +610,7 @@ module OpenTox
  # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
  # @return [Hash] Hash with matching Smarts and number of hits
  def self.lookup(params)
- params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri], params[:pc_type], params[:lib], params[:subjectid])
  end
  end

@@ -539,3 +622,26 @@ module OpenTox
  end
  end
  end
+
+ class Array
+ # collect method extended for parallel processing.
+ # Note: assign return value as: ans = arr.pcollect(n) { |obj| ... }
+ # @param n the number of processes to spawn (default: unlimited)
+ def pcollect(n = nil)
+ nproc = 0
+ result = collect do |*a|
+ r, w = IO.pipe
+ fork do
+ r.close
+ w.write( Marshal.dump( yield(*a) ) )
+ end
+ if n and (nproc+=1) >= n
+ Process.wait ; nproc -= 1
+ end
+ [ w.close, r ].last
+ end
+ Process.waitall
+ result.collect{|r| Marshal.load [ r.read, r.close ].first}
+ end
+ end
+
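Array#pcollect runs the block for each element in a forked child and streams the Marshal-dumped result back through a pipe, so block results must be marshalable and the method only works where Kernel#fork is available. A short usage sketch with a toy workload:

    # Square each element in its own child process, at most 4 children at a time.
    squares = [1, 2, 3, 4, 5].pcollect(4) { |x| x * x }
    # => [1, 4, 9, 16, 25]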
@@ -3,6 +3,7 @@

  module OpenTox

+ require "rexml/document"
  # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
  class Compound

@@ -17,16 +18,20 @@ module OpenTox
  # @return [Compound] Compound
  def initialize(uri=nil)
  @uri = uri
- case @uri
- when /InChI/ # shortcut for IST services
- @inchi = @uri.sub(/^.*InChI/, 'InChI')
+ if (@uri =~ URI::regexp) || @uri.nil?
+ case @uri
+ when /InChI/ # shortcut for IST services
+ @inchi = @uri.sub(/^.*InChI/, 'InChI')
+ else
+ @inchi = RestClientWrapper.get(@uri, :accept => 'chemical/x-inchi').to_s.chomp if @uri
+ end
+
+ if @uri and @inchi.to_s.size==0
+ LOGGER.warn "REMOVE ABMIT HACK: no inchi for compound "+@uri.to_s+", load via smiles"
+ @inchi = Compound.smiles2inchi(Compound.smiles(@uri))
+ end
  else
- @inchi = RestClientWrapper.get(@uri, :accept => 'chemical/x-inchi').to_s.chomp if @uri
- end
-
- if @uri and @inchi.to_s.size==0
- LOGGER.warn "REMOVE ABMIT HACK: no inchi for compound "+@uri.to_s+", load via smiles"
- @inchi = Compound.smiles2inchi(Compound.smiles(@uri))
+ raise "Not able to create compound with uri: #{@uri}"
  end
  end

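With the URI guard added above, Compound.new accepts nil or a well-formed URI and raises for anything else. A hedged sketch (the compound URI below is a hypothetical IST-style URI, chosen so the InChI shortcut applies and no webservice call is needed):

    c = OpenTox::Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/CH4/h1H4")
    c.to_inchi                          # => "InChI=1S/CH4/h1H4"

    OpenTox::Compound.new("no uri at all")
    # raises "Not able to create compound with uri: no uri at all"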
@@ -130,6 +135,47 @@ module OpenTox
  "not available"
  end
  end
+
+
+ # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+ # @example
+ # names = compound.to_names_hash
+ # @return [Hash] Classification => Name Array
+ def to_names_hash
+ begin
+ xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml")
+ xmldoc = REXML::Document.new(xml)
+ data = {}
+
+ xmldoc.root.elements[1].elements.each{|e|
+ if data.has_key?(e.attribute("classification").value) == false
+ data[e.attribute("classification").value] = [e.text]
+ else
+ data[e.attribute("classification").value].push(e.text)
+ end
+ }
+ data
+ rescue
+ "not available"
+ end
+ end
+
+ # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+ # @example
+ # names = compound.to_names_hash
+ # @return [Hash] Classification => Name Array
+ def to_ambit_names_hash
+ begin
+ ds = OpenTox::Dataset.new
+ ds.save
+ ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}"))
+ ds.save
+ ds.uri
+ rescue
+ "not available"
+ end
+ end
+

  # Match a smarts string
  # @example
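The new to_names_hash above asks the Cactus resolver for all known names of the compound and groups them by the classification attribute of the XML response; the result is a plain Hash, roughly of the shape shown below (keys and names are only illustrative, and the call needs network access):

    names = compound.to_names_hash
    # e.g. { "pubchem_iupac_name" => ["methane"], "name" => ["methane", "marsh gas"] }
    unless names == "not available"   # the rescue branch returns this string on failure
      names.each { |classification, list| puts "#{classification}: #{list.join(', ')}" }
    end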
@@ -197,25 +243,28 @@ module OpenTox
  # Lookup numerical values, returns hash with feature name as key and value as value
  # @param [Array] Array of feature names
  # @param [String] Feature dataset uri
+ # @param [String] Comma separated pc types
+ # @param [String] Comma separated lib
  # @return [Hash] Hash with feature name as key and value as value
- def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
+ def lookup(feature_array,feature_dataset_uri,pc_type,lib,subjectid=nil)
  ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
  #entry = ds.data_entries[self.uri]
  entry = nil
- ds.data_entries.each { |c_uri, values|
- if c_uri.split('/compound/').last == self.to_inchi
- entry = ds.data_entries[self.uri]
+ ds.data_entries.each { |c_uri, values|
+ compound = OpenTox::Compound.new(c_uri)
+ if compound.to_inchi == self.to_inchi # Compare compounds by InChI
+ entry = ds.data_entries[c_uri]
  break
  end
  }
  LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
-
  if entry.nil?
- uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
- uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
- ds = OpenTox::Dataset.find(uri,subjectid)
+ temp_ds = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid); temp_ds.add_compound(self.uri); temp_uri = temp_ds.save(subjectid)
+ uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"], "/pc/AllDescriptors"), {:dataset_uri => temp_uri, :pc_type => pc_type, :lib => lib, :subjectid => subjectid})
+ ds = OpenTox::Dataset.find(uri, subjectid)
  entry = ds.data_entries[self.uri]
  ds.delete(subjectid)
+ temp_ds.delete(subjectid)
  end
  features = entry.keys
  features.each { |feature|
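Callers of the instance-level lookup now pass both the comma separated pc_type list and the new comma separated lib list, mirroring the extra params[:lib] argument in the class-level self.lookup wrapper earlier in this diff. A hedged usage sketch with placeholder feature names, dataset URI, and descriptor settings:

    values = compound.lookup(
      ["XLogP", "TPSA"],                       # feature names (placeholders)
      "http://example.org/dataset/1",          # feature dataset URI (placeholder)
      "constitutional,electronic",             # comma separated pc types (placeholder)
      "cdk,joelib",                            # comma separated libs (placeholder)
      nil                                      # subjectid (optional)
    )
    # => Hash with feature name as key and value as value (per the docstring above)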
@@ -224,7 +273,6 @@ module OpenTox
  entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
  }
  #res = feature_array.collect {|v| entry[v]}
- #LOGGER.debug "----- am #{entry.to_yaml}"
  entry
  end