opentox-ruby 3.1.0 → 4.0.0

data/ChangeLog CHANGED
@@ -1,13 +1,23 @@
+ v4.0.0 2012-07-12
+ * fminer addition of compounds fixed
+ * improved performance for CSV download
+ * switch to opentox-ruby version 4.0.0
+
+ 2012-04-20
+ * Support for joelib and openbabel descriptors in a completely unified interface with CDK (Ambit)
+ * Features can have multiple types (nominal and numeric), PC descriptors have detailed meta data
+ * Myriads of bugfixes to CSV download code (e.g. missing descriptors, handling of duplicates)
+
  v3.1.0 2012-02-24
- * utils.rb: added for special routines (e.g. descriptor calculation)
- * task.rb: Polling with increasing interval
- * parser.rb: CSV up and download fixed
- * transform.rb: routines to create machine learning data matrices
- * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
- gauss() removed
+ * utils.rb: added for special routines (e.g. descriptor calculation)
+ * task.rb: Polling with increasing interval
+ * parser.rb: CSV up and download fixed
+ * transform.rb: routines to create machine learning data matrices
+ * algorithm.rb: SVM parameter grid search, cos similarity as algorithm, gauss() removed

  v3.0.1 2011-10-19
- * feature: model registration to ontology service
- * ontology lib gets endpoints from ontology service
+ * feature: model registration to ontology service
+ * ontology lib gets endpoints from ontology service
+
  v3.0.0 2011-09-23
- * datasets stored as json (with Yajl) to improve performance
+ * datasets stored as json (with Yajl) to improve performance
@@ -38,4 +38,4 @@ This example shows how to create a lazar model and predict a compound, it assume
  Copyright
  ---------

- Copyright (c) 2009-2011 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
+ Copyright (c) 2009-2012 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
data/Rakefile CHANGED
@@ -42,9 +42,10 @@ begin
  gem.add_dependency "dm-migrations", "=1.1.0"
  gem.add_dependency "dm-validations", "=1.1.0"
  gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
- gem.add_dependency "ruby-plot", "=0.6.0"
+ gem.add_dependency "ruby-plot", "=0.6.1"
  gem.add_dependency "gsl", "=1.14.7"
  gem.add_dependency "statsample", "=1.1.0"
+ gem.add_dependency "redis", "=2.2.2"

  gem.add_development_dependency 'jeweler'
  gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
data/VERSION CHANGED
@@ -1 +1 @@
- 3.1.0
+ 4.0.0
@@ -56,25 +56,73 @@ module OpenTox

  def check_params(params,per_mil,subjectid=nil)
  raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
- raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
- @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
  @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
+
+ unless params[:prediction_feature] # try to read prediction_feature from dataset
+ raise OpenTox::NotFoundError.new "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1
+ prediction_feature = OpenTox::Feature.find(@training_dataset.features.keys.first,@subjectid)
+ params[:prediction_feature] = prediction_feature.uri
+ end
+ @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+
  raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])

  unless params[:min_frequency].nil?
- @minfreq=params[:min_frequency].to_i
- raise "Minimum frequency must be a number >0!" unless @minfreq>0
- else
- @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ # check for percentage
+ if params[:min_frequency].include? "pc"
+ per_mil=params[:min_frequency].gsub(/pc/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i * 10
+ else
+ bad_request=true
+ end
+ # check for per-mil
+ elsif params[:min_frequency].include? "pm"
+ per_mil=params[:min_frequency].gsub(/pm/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i
+ else
+ bad_request=true
+ end
+ # set minfreq directly
+ else
+ if OpenTox::Algorithm.numeric? params[:min_frequency]
+ @minfreq=params[:min_frequency].to_i
+ LOGGER.debug "min_frequency #{@minfreq}"
+ else
+ bad_request=true
+ end
+ end
+ raise OpenTox::BadRequestError.new "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm , with n greater 0" if bad_request
+ end
+ if @minfreq.nil?
+ @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil)
+ LOGGER.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
  end
  end

- def add_fminer_data(fminer_instance, params, value_map)
+ def add_fminer_data(fminer_instance, value_map)
+
+
+ # detect nr duplicates per compound
+ compound_sizes = {}
+ @training_dataset.compounds.each do |compound|
+ entries=@training_dataset.data_entries[compound]
+ entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = []
+ compound_sizes[compound] << values.size unless values.size == 0
+ end
+ compound_sizes[compound].uniq!
+ raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1
+ compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+ end

  id = 1 # fminer start id is not 0
- @training_dataset.data_entries.each do |compound,entry|
+
+ @training_dataset.compounds.each do |compound|
+ entry=@training_dataset.data_entries[compound]
  begin
- smiles = OpenTox::Compound.smiles(compound.to_s)
+ smiles = OpenTox::Compound.new(compound).to_smiles
  rescue
  LOGGER.warn "No resource for #{compound.to_s}"
  next
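The reworked check_params above accepts min_frequency in three forms: a plain integer n (used directly as @minfreq), a percentage "[n]pc" (converted to per-mil by multiplying by 10), or a per-mil value "[n]pm"; the latter two are passed on to OpenTox::Algorithm.min_frequency. A minimal standalone sketch of that parsing rule (parse_min_frequency is a hypothetical helper, not part of the gem):

    # Hypothetical helper mirroring the min_frequency handling in check_params.
    # Returns [minfreq, per_mil]; only one of the two is derived from the input.
    def parse_min_frequency(raw, default_per_mil = 8)
      per_mil = default_per_mil
      minfreq = nil
      case raw.to_s
      when /\A(\d+)pc\z/ then per_mil = $1.to_i * 10 # percentage -> per-mil
      when /\A(\d+)pm\z/ then per_mil = $1.to_i      # per-mil as given
      when /\A\d+\z/     then minfreq = raw.to_i     # absolute minimum frequency
      else raise ArgumentError, "min_frequency must be [n], [n]pc or [n]pm with n > 0"
      end
      [minfreq, per_mil]
    end

    parse_min_frequency("30")  # => [30, 8]
    parse_min_frequency("5pc") # => [nil, 50]
    parse_min_frequency("8pm") # => [nil, 8]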
@@ -84,32 +132,31 @@ module OpenTox
  next
  end

- value_map=params[:value_map] unless params[:value_map].nil?
  entry.each do |feature,values|
  if feature == @prediction_feature.uri
- values.each do |value|
- if value.nil?
+ (0...compound_sizes[compound]).each { |i|
+ if values[i].nil?
  LOGGER.warn "No #{feature} activity for #{compound.to_s}."
  else
  if @prediction_feature.feature_type == "classification"
- activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
+ activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n
  @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
  elsif @prediction_feature.feature_type == "regression"
- activity= value.to_f
+ activity= values[i].to_f
  end
  begin
- fminer_instance.AddCompound(smiles,id)
- fminer_instance.AddActivity(activity, id)
+ fminer_instance.AddCompound(smiles,id) if fminer_instance
+ fminer_instance.AddActivity(activity, id) if fminer_instance
  @all_activities[id]=activity # DV: insert global information
  @compounds[id] = compound
  @smi[id] = smiles
  id += 1
  rescue Exception => e
- LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer"
  LOGGER.warn e.backtrace
  end
  end
- end
+ }
  end
  end
  end
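The rewritten add_fminer_data above iterates over @training_dataset.compounds rather than data_entries, so a compound with several measurements is added to fminer once per measurement. The number of duplicates per compound is derived from the lengths of the value arrays, which must agree across features. A rough sketch of that counting step on a toy data_entries hash (the compound and feature URIs are placeholders; the nested structure is assumed to match opentox-ruby datasets):

    # Toy data_entries: compound URI => { feature URI => [measured values] }.
    data_entries = {
      "http://host/compound/c1" => { "http://host/feature/f1" => [1.0, 2.0] }, # duplicate measurement
      "http://host/compound/c2" => { "http://host/feature/f1" => [0.5] },
    }

    # Count measurements per compound; mixed counts across features are rejected.
    compound_sizes = {}
    data_entries.each do |compound, entry|
      sizes = entry.values.map { |values| values.size }.reject { |s| s == 0 }.uniq
      raise "Inappropriate data for fminer" if sizes.size > 1
      compound_sizes[compound] = sizes.first
    end

    compound_sizes # => {"http://host/compound/c1"=>2, "http://host/compound/c2"=>1}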
@@ -380,11 +427,11 @@ module OpenTox
  prediction = acts[0]
  else
  #LOGGER.debug gram_matrix.to_yaml
- @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
- @r.eval "set.seed(1)"
+ @r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
  @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
  @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore"
  @r.eval "registerDoMC()" # switch on parallel processing
+ @r.eval "set.seed(1)"
  begin

  # set data
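The model-building code drives R through a dedicated RinRuby session (a global instance caused socket errors, as the comment above notes), with set.seed(1) now issued after the packages are loaded. For readers unfamiliar with the pattern, a minimal RinRuby round trip (a sketch assuming R and the rinruby gem are installed; the variables x and m are illustrative):

    require 'rubygems'
    require 'rinruby'

    r = RinRuby.new(true, false)   # echo R output, non-interactive session
    r.eval "set.seed(1)"
    r.assign "x", [1.0, 2.0, 3.0]  # push a Ruby array into the R session
    r.eval "m <- mean(x)"
    puts r.pull("m")               # => 2.0
    r.quit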
@@ -400,7 +447,14 @@ module OpenTox

  # prepare data
  LOGGER.debug "Preparing R data ..."
- @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
+ @r.eval <<-EOR
+ weights=NULL
+ if (class(y) == 'character') {
+ y = factor(y)
+ suppressPackageStartupMessages(library('class'))
+ #weights=unlist(as.list(prop.table(table(y))))
+ }
+ EOR

  @r.eval <<-EOR
  rem = nearZeroVar(prop_matrix)
@@ -417,8 +471,18 @@ module OpenTox

  # model + support vectors
  LOGGER.debug "Creating R SVM model ..."
- @r.eval <<-EOR
- model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
+ train_success = @r.eval <<-EOR
+ # AM: TODO: evaluate class weight effect by altering:
+ # AM: comment in 'weights' above run and class.weights=weights vs. class.weights=1-weights
+ # AM: vs
+ # AM: comment out 'weights' above (status quo), thereby disabling weights
+ model = train(prop_matrix,y,
+ method="svmradial",
+ preProcess=c("center", "scale"),
+ class.weights=weights,
+ trControl=trainControl(method="LGOCV",number=10),
+ tuneLength=8
+ )
  perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
  EOR

@@ -431,6 +495,7 @@ module OpenTox

  # censoring
  prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
+ prediction = nil unless train_success
  LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
  rescue Exception => e
  LOGGER.debug "#{e.class}: #{e.message}"
@@ -456,30 +521,42 @@ module OpenTox
  @r.del_missing = params[:del_missing] == true ? 1 : 0
  r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
  @r.f_fds_r = r_result_file.to_s
-
+
  # need packs 'randomForest', 'RANN'
  @r.eval <<-EOR
- set.seed(1)
  suppressPackageStartupMessages(library('caret'))
  suppressPackageStartupMessages(library('randomForest'))
  suppressPackageStartupMessages(library('RANN'))
  suppressPackageStartupMessages(library('doMC'))
  registerDoMC()
-
+ set.seed(1)
+
  acts = read.csv(ds_csv_file, check.names=F)
  feats = read.csv(fds_csv_file, check.names=F)
  ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
-
+
  features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
  y = ds[,which(names(ds) == prediction_feature)]
-
+
  # assumes a data matrix 'features' and a vector 'y' of target values
  row.names(features)=NULL
-
+
+ # features with all values missing removed
+ na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
+ features = features[,!names(features) %in% na_col]
+
+ # features with infinite values removed
+ inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+ features = features[,!names(features) %in% inf_col]
+
+ # features with zero variance removed
+ zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
+ features = features[,!names(features) %in% zero_var]
+
  pp = NULL
  if (del_missing) {
  # needed if rows should be removed
- na_ids = apply(features,1,function(x)any(is.na(x)))
+ na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
  features = features[!na_ids,]
  y = y[!na_ids]
  pp = preProcess(features, method=c("scale", "center"))
@@ -488,17 +565,23 @@ module OpenTox
  pp = preProcess(features, method=c("scale", "center", "knnImpute"))
  }
  features = predict(pp, features)
-
+
+ # features with nan values removed (sometimes preProcess return NaN values)
+ nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+ features = features[,!names(features) %in% nan_col]
+
  # determine subsets
- subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
- subsets = c(2,3,4,5,7,10,subsets)
+ subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+ #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+ #subsets = c(2,3,4,5,7,10,subsets)
+ #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
  subsets = unique(sort(round(subsets)))
  subsets = subsets[subsets<=dim(features)[2]]
  subsets = subsets[subsets>1]
-
+
  # Recursive feature elimination
- rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
-
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
+
  # read existing dataset and select most useful features
  csv=feats[,c("SMILES", rfProfile$optVariables)]
  write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
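The R blocks above first drop descriptor columns that are entirely missing, contain infinite values or have zero variance, remove NaN columns after preprocessing, and then run caret's recursive feature elimination over a revised set of subset sizes with more resampling iterations (number=150). For orientation, a rough Ruby analogue of the three column filters on a feature => values hash (illustrative only; the gem performs this step in R):

    # features: descriptor name => array of numeric values (nil = missing).
    features = {
      "logP"  => [1.2, 0.8, 2.4],
      "empty" => [nil, nil, nil],              # all values missing -> dropped
      "inf"   => [1.0, Float::INFINITY, 2.0],  # infinite value -> dropped
      "const" => [3.0, 3.0, 3.0],              # zero variance -> dropped
    }

    def variance(values)
      mean = values.inject(0.0) { |s, v| s + v } / values.size
      values.inject(0.0) { |s, v| s + (v - mean)**2 } / values.size
    end

    cleaned = features.reject do |_name, values|
      present = values.compact
      present.empty? ||
        present.any? { |v| v.infinite? } ||
        variance(present).zero?
    end

    cleaned.keys # => ["logP"]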
@@ -527,7 +610,7 @@ module OpenTox
  # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
  # @return [Hash] Hash with matching Smarts and number of hits
  def self.lookup(params)
- params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri], params[:pc_type], params[:lib], params[:subjectid])
  end
  end

@@ -539,3 +622,26 @@ module OpenTox
  end
  end
  end
+
+ class Array
+ # collect method extended for parallel processing.
+ # Note: assign return value as: ans = arr.pcollect(n) { |obj| ... }
+ # @param n the number of processes to spawn (default: unlimited)
+ def pcollect(n = nil)
+ nproc = 0
+ result = collect do |*a|
+ r, w = IO.pipe
+ fork do
+ r.close
+ w.write( Marshal.dump( yield(*a) ) )
+ end
+ if n and (nproc+=1) >= n
+ Process.wait ; nproc -= 1
+ end
+ [ w.close, r ].last
+ end
+ Process.waitall
+ result.collect{|r| Marshal.load [ r.read, r.close ].first}
+ end
+ end
+
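A short usage sketch for the Array#pcollect extension added above: it forks one child process per element (optionally capped at n) and ships each block result back through a pipe with Marshal, so the block's return value must be marshal-able and a fork-capable (Unix-like) platform is required:

    # Square each number in its own process, at most 4 processes at a time.
    squares = [1, 2, 3, 4, 5].pcollect(4) { |i| i * i }
    puts squares.inspect # => [1, 4, 9, 16, 25]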
@@ -3,6 +3,7 @@

  module OpenTox

+ require "rexml/document"
  # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
  class Compound

@@ -17,16 +18,20 @@ module OpenTox
  # @return [Compound] Compound
  def initialize(uri=nil)
  @uri = uri
- case @uri
- when /InChI/ # shortcut for IST services
- @inchi = @uri.sub(/^.*InChI/, 'InChI')
+ if (@uri =~ URI::regexp) || @uri.nil?
+ case @uri
+ when /InChI/ # shortcut for IST services
+ @inchi = @uri.sub(/^.*InChI/, 'InChI')
+ else
+ @inchi = RestClientWrapper.get(@uri, :accept => 'chemical/x-inchi').to_s.chomp if @uri
+ end
+
+ if @uri and @inchi.to_s.size==0
+ LOGGER.warn "REMOVE ABMIT HACK: no inchi for compound "+@uri.to_s+", load via smiles"
+ @inchi = Compound.smiles2inchi(Compound.smiles(@uri))
+ end
  else
- @inchi = RestClientWrapper.get(@uri, :accept => 'chemical/x-inchi').to_s.chomp if @uri
- end
-
- if @uri and @inchi.to_s.size==0
- LOGGER.warn "REMOVE ABMIT HACK: no inchi for compound "+@uri.to_s+", load via smiles"
- @inchi = Compound.smiles2inchi(Compound.smiles(@uri))
+ raise "Not able to create compound with uri: #{@uri}"
  end
  end
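With the guard added above, Compound.new accepts nil or a string matching URI::regexp and raises for anything else. A brief example (the compound URI below is illustrative; resolving non-InChI URIs requires the corresponding webservice):

    require 'rubygems'
    require 'opentox-ruby'

    # InChI-style URI: the InChI is extracted directly from the URI.
    c = OpenTox::Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/CH4/h1H4")
    puts c.to_inchi

    # Anything that is not a URI now raises instead of yielding an empty compound.
    begin
      OpenTox::Compound.new("not a uri")
    rescue => e
      puts e.message # => "Not able to create compound with uri: not a uri"
    end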
 
@@ -130,6 +135,47 @@ module OpenTox
  "not available"
  end
  end
+
+
+ # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+ # @example
+ # names = compound.to_names_hash
+ # @return [Hash] Classification => Name Array
+ def to_names_hash
+ begin
+ xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml")
+ xmldoc = REXML::Document.new(xml)
+ data = {}
+
+ xmldoc.root.elements[1].elements.each{|e|
+ if data.has_key?(e.attribute("classification").value) == false
+ data[e.attribute("classification").value] = [e.text]
+ else
+ data[e.attribute("classification").value].push(e.text)
+ end
+ }
+ data
+ rescue
+ "not available"
+ end
+ end
+
+ # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+ # @example
+ # names = compound.to_names_hash
+ # @return [Hash] Classification => Name Array
+ def to_ambit_names_hash
+ begin
+ ds = OpenTox::Dataset.new
+ ds.save
+ ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}"))
+ ds.save
+ ds.uri
+ rescue
+ "not available"
+ end
+ end
+

  # Match a smarts string
  # @example
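The two helpers added above look up names for the compound's InChI: to_names_hash queries the Cactus resolver and groups the returned names by their classification attribute, while to_ambit_names_hash runs a name search against Ambit and returns the URI of a freshly saved dataset. A usage sketch (network access required; the hash keys shown are only illustrative of Cactus classifications):

    compound = OpenTox::Compound.from_smiles("c1ccccc1") # benzene
    names = compound.to_names_hash
    # e.g. {"name" => ["benzene", ...], "registry number" => ["71-43-2", ...], ...}
    names.each { |classification, list| puts "#{classification}: #{list.join(', ')}" } if names.is_a?(Hash)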
@@ -197,25 +243,28 @@ module OpenTox
  # Lookup numerical values, returns hash with feature name as key and value as value
  # @param [Array] Array of feature names
  # @param [String] Feature dataset uri
+ # @param [String] Comma separated pc types
+ # @param [String] Comma separated lib
  # @return [Hash] Hash with feature name as key and value as value
- def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
+ def lookup(feature_array,feature_dataset_uri,pc_type,lib,subjectid=nil)
  ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
  #entry = ds.data_entries[self.uri]
  entry = nil
- ds.data_entries.each { |c_uri, values|
- if c_uri.split('/compound/').last == self.to_inchi
- entry = ds.data_entries[self.uri]
+ ds.data_entries.each { |c_uri, values|
+ compound = OpenTox::Compound.new(c_uri)
+ if compound.to_inchi == self.to_inchi # Compare compounds by InChI
+ entry = ds.data_entries[c_uri]
  break
  end
  }
  LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
-
  if entry.nil?
- uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
- uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
- ds = OpenTox::Dataset.find(uri,subjectid)
+ temp_ds = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid); temp_ds.add_compound(self.uri); temp_uri = temp_ds.save(subjectid)
+ uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"], "/pc/AllDescriptors"), {:dataset_uri => temp_uri, :pc_type => pc_type, :lib => lib, :subjectid => subjectid})
+ ds = OpenTox::Dataset.find(uri, subjectid)
  entry = ds.data_entries[self.uri]
  ds.delete(subjectid)
+ temp_ds.delete(subjectid)
  end
  features = entry.keys
  features.each { |feature|
@@ -224,7 +273,6 @@ module OpenTox
  entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
  }
  #res = feature_array.collect {|v| entry[v]}
- #LOGGER.debug "----- am #{entry.to_yaml}"
  entry
  end
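Compound#lookup now takes comma-separated pc_type and lib selections; when the compound is not already in the feature dataset, it wraps the query compound in a temporary dataset, posts it to the pc/AllDescriptors algorithm service and deletes both temporary datasets afterwards. A hedged call sketch (all URIs and descriptor selections below are placeholders):

    compound = OpenTox::Compound.from_smiles("CCO")

    feature_names   = ["XLogP", "TPSA"]               # features of interest
    feature_dataset = "http://my-ot-host/dataset/123" # placeholder URI
    values = compound.lookup(feature_names, feature_dataset,
                             "constitutional,electronic", # pc_type selection
                             "cdk",                       # descriptor lib
                             nil)                         # subjectid
    # => hash of feature name => numeric value for this compound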