lazar 0.9.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 44e8fb9b8d65ca3f2fb8d02fb08c76e98ebc140c
4
- data.tar.gz: d62b490434324e405ad10a13b72fc51574e02404
3
+ metadata.gz: 2211d5cf1767b241583acff9a22379b56a5d8f1c
4
+ data.tar.gz: 923a3d00d5c78fd77a2153c973c5e3935c939eda
5
5
  SHA512:
6
- metadata.gz: 2ea37844e810a1410453e36b87e9d4473226bd78a57f692f8f46f8c56153fec13cb3a320c3f0df718242bca7aed13bebb510192812ee10ce41c3acd1a36d8c92
7
- data.tar.gz: 696378dea89f26a8a50c96e20de20a4fcbc8a717a22c3198ba352f06b92c4d597c8a5903352f249ab0e9af295aee803f60f37a0755253cee9bc7a7f5ce5556dd
6
+ metadata.gz: 2a366bae505c427a72211df4d59c7f296ead656bfe3f42db0fb6bb2dc3885028c70ba9df0aa7778c0bd78acdbd7b2939417caafd342a535c4954a34fef410c8d
7
+ data.tar.gz: 04fd93e7ab52517d338e6005223fe22b498d74be324f8dc6ef2e3a4d4a843202abc9224ff55e8ba053ce7a16a6a76301437f4fc061ac2719d65ff3afa392396a
data/.gitignore CHANGED
@@ -1,8 +1,5 @@
1
- last-utils
2
- libfminer
1
+ R
3
2
  openbabel
4
- fminer_debug.txt
5
- test/fminer_debug.txt
6
3
  Gemfile.lock
7
4
  *.gem
8
5
  .bundle
data/README.md CHANGED
@@ -6,31 +6,21 @@ Ruby libraries for the lazar framework
6
6
  Dependencies
7
7
  ------------
8
8
 
9
- lazar depends on a couple of external programs and libraries. On Debian 7 "Wheezy" systems you can install them with
10
-
11
- `sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev r-cran-rserve openjdk-7-jre libgsl0-dev libxml2-dev zlib1g-dev libcairo2-dev`
12
-
13
- You will also need at least mongodb version 3.0, but Debian "Wheezy" provides version 2.4. Please follow the instructions at http://docs.mongodb.org/manual/tutorial/install-mongodb-on-debian/:
14
-
15
- ```
16
- sudo apt-key adv --keyserver keyserver.ubuntu.com --recv 7F0CEB10
17
- echo "deb http://repo.mongodb.org/apt/debian wheezy/mongodb-org/3.0 main" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list
18
- sudo apt-get update
19
- sudo apt-get install -y mongodb-org
20
- ```
9
+ lazar depends on a couple of external programs and libraries. All required libraries will be installed with the `gem install lazar` command.
10
+ If any of the dependencies fails to install, please check that all required development packages are installed through your operating system's package manager (e.g. `apt`, `rpm`, `pacman`, ...).
11
+ You will need a working Java runtime to use descriptor calculation algorithms from CDK and JOELib libraries.
21
12
 
22
13
  Installation
23
14
  ------------
24
15
 
25
16
  `gem install lazar`
26
17
 
27
- Please be patient, the compilation of OpenBabel and Fminer libraries can be very time consuming. If installation fails you can try to install manually:
18
+ Please be patient, the compilation of external libraries can be very time consuming. If installation fails you can try to install manually:
28
19
 
29
20
  ```
30
21
  git clone https://github.com/opentox/lazar.git
31
22
  cd lazar
32
23
  ruby ext/lazar/extconf.rb
33
- sudo Rscript ext/lazar/rinstall.R
34
24
  bundle install
35
25
  ```
36
26
 
@@ -42,4 +32,4 @@ Documentation
42
32
 
43
33
  Copyright
44
34
  ---------
45
- Copyright (c) 2009-2015 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
35
+ Copyright (c) 2009-2016 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.9.3
1
+ 1.0.0
data/ext/lazar/extconf.rb CHANGED
@@ -15,7 +15,7 @@ abort "Please install Rserve on your system. Execute 'install.packages('Rserve')
15
15
  # install R packages
16
16
  r_dir = File.join main_dir, "R"
17
17
  FileUtils.mkdir_p r_dir
18
- FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
18
+ #FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
19
19
  rinstall = File.expand_path(File.join(File.dirname(__FILE__),"rinstall.R"))
20
20
  puts `Rscript --vanilla #{rinstall} #{r_dir}`
21
21
 
data/ext/lazar/rinstall.R CHANGED
@@ -1,10 +1,12 @@
1
1
  libdir = commandArgs(trailingOnly=TRUE)[1]
2
2
  repo = "https://stat.ethz.ch/CRAN/"
3
3
  #install.packages("Rserve",lib=libdir,repos=repo,dependencies=TRUE)
4
- install.packages("iterators",lib=libdir,repos=repo);
5
- install.packages("foreach",lib=libdir,repos=repo);
6
- install.packages("gridExtra",lib=libdir,repos=repo);
7
- install.packages("ggplot2",lib=libdir,repos=repo);
8
- install.packages("pls",lib=libdir,repos=repo);
9
- install.packages("caret",lib=libdir,repos=repo);
10
- install.packages("doMC",lib=libdir,repos=repo);
4
+ install.packages("stringi",lib=libdir,repos=repo,dependencies=TRUE);
5
+ install.packages("iterators",lib=libdir,repos=repo,dependencies=TRUE);
6
+ install.packages("foreach",lib=libdir,repos=repo,dependencies=TRUE);
7
+ install.packages("gridExtra",lib=libdir,repos=repo,dependencies=TRUE);
8
+ install.packages("ggplot2",lib=libdir,repos=repo,dependencies=TRUE);
9
+ install.packages("pls",lib=libdir,repos=repo,dependencies=TRUE);
10
+ install.packages("randomForest",lib=libdir,repos=repo,dependencies=TRUE);
11
+ install.packages("caret",lib=libdir,repos=repo,dependencies=TRUE);
12
+ install.packages("doMC",lib=libdir,repos=repo,dependencies=TRUE);
Binary file
@@ -1,11 +1,12 @@
1
1
  import java.util.*;
2
- import org.openscience.cdk.qsar.descriptors.molecular.*;
2
+ import org.openscience.cdk.DefaultChemObjectBuilder;
3
3
  import org.openscience.cdk.qsar.*;
4
+ //import org.openscience.cdk.qsar.descriptors.molecular.*;
4
5
 
5
6
  class CdkDescriptorInfo {
6
7
  public static void main(String[] args) {
7
8
 
8
- DescriptorEngine engine = new DescriptorEngine(DescriptorEngine.MOLECULAR);
9
+ DescriptorEngine engine = new DescriptorEngine(IMolecularDescriptor.class,null);
9
10
 
10
11
  for (Iterator<IDescriptor> it = engine.getDescriptorInstances().iterator(); it.hasNext(); ) {
11
12
  IDescriptor descriptor = it.next();
Binary file
@@ -1,10 +1,10 @@
1
1
  import java.util.*;
2
2
  import java.io.*;
3
3
  import org.openscience.cdk.DefaultChemObjectBuilder;
4
- import org.openscience.cdk.interfaces.IMolecule;
5
- import org.openscience.cdk.io.iterator.IteratingMDLReader;
4
+ import org.openscience.cdk.IImplementationSpecification;
5
+ import org.openscience.cdk.interfaces.IAtomContainer;
6
+ import org.openscience.cdk.io.iterator.IteratingSDFReader;
6
7
  import org.openscience.cdk.qsar.*;
7
- import org.openscience.cdk.qsar.DescriptorValue;
8
8
  import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector;
9
9
  import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
10
10
  import org.openscience.cdk.exception.NoSuchAtomTypeException;
@@ -17,8 +17,8 @@ class CdkDescriptors {
17
17
  System.exit(1);
18
18
  }
19
19
  if (! new File(args[0]).exists()){
20
- System.err.println("file not found "+args[0]);
21
- System.exit(1);
20
+ System.err.println("file not found "+args[0]);
21
+ System.exit(1);
22
22
  }
23
23
 
24
24
  // command line descriptor params can be either "descriptorName" or "descriptorValueName"
@@ -34,19 +34,19 @@ class CdkDescriptors {
34
34
  for (int i =1; i < args.length; i++) {
35
35
  String descriptorName;
36
36
  if (args[i].indexOf(".")!=-1) {
37
- descriptorValueNames.add(args[i]);
38
- descriptorName = args[i].substring(0,args[i].indexOf("."));
37
+ descriptorValueNames.add(args[i]);
38
+ descriptorName = args[i].substring(0,args[i].indexOf("."));
39
39
  }
40
40
  else {
41
- descriptorNames.add(args[i]);
42
- descriptorName = args[i];
41
+ descriptorNames.add(args[i]);
42
+ descriptorName = args[i];
43
43
  }
44
44
  classNames.add(getDescriptorClassName(descriptorName));
45
45
  }
46
46
 
47
- engine = new DescriptorEngine(new ArrayList<String>(classNames));
47
+ engine = new DescriptorEngine(new ArrayList<String>(classNames),null);
48
48
  List<IDescriptor> instances = engine.instantiateDescriptors(new ArrayList<String>(classNames));
49
- List<DescriptorSpecification> specs = engine.initializeSpecifications(instances);
49
+ List<IImplementationSpecification> specs = engine.initializeSpecifications(instances);
50
50
  engine.setDescriptorInstances(instances);
51
51
  engine.setDescriptorSpecifications(specs);
52
52
 
@@ -54,13 +54,13 @@ class CdkDescriptors {
54
54
  BufferedReader br = new BufferedReader(new FileReader(args[0]));
55
55
  PrintWriter yaml = new PrintWriter(new FileWriter(args[0]+"cdk.yaml"));
56
56
  // parse 3d sdf from file and calculate descriptors
57
- IteratingMDLReader reader = new IteratingMDLReader( br, DefaultChemObjectBuilder.getInstance());
57
+ IteratingSDFReader reader = new IteratingSDFReader( br, DefaultChemObjectBuilder.getInstance());
58
58
  int c = 0;
59
59
  while (reader.hasNext()) {
60
60
  try {
61
61
  System.out.println("computing "+(args.length-1)+" descriptors for compound "+(++c));
62
- IMolecule molecule = (IMolecule)reader.next();
63
- molecule = (IMolecule) AtomContainerManipulator.removeHydrogens(molecule);
62
+ IAtomContainer molecule = (IAtomContainer)reader.next();
63
+ molecule = (IAtomContainer) AtomContainerManipulator.removeHydrogens(molecule);
64
64
  try {
65
65
  AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule);
66
66
  }
@@ -110,21 +110,21 @@ class CdkDescriptors {
110
110
  * problem: Descriptor is not always at the end of the class (APolDescriptor), but may be in the middle (AutocorrelationDescriptorPolarizability)
111
111
  * this method makes a class-lookup using trial and error */
112
112
  static String getDescriptorClassName(String descriptorName) {
113
- String split = splitCamelCase(descriptorName)+" "; // space mark possible positions for 'Descriptor'
114
- for(int i = split.length()-1; i>0; i--) {
115
- if (split.charAt(i)==' ') { // iterate over all spaces, starting with the trailing one
116
- String test = split.substring(0,i)+"Descriptor"+split.substring(i+1,split.length()); // replace current space with 'Descriptor' ..
117
- test = test.replaceAll("\\s",""); // .. and remove other spaces
118
- String className = "org.openscience.cdk.qsar.descriptors.molecular." + test;
119
- try {
120
- Class.forName(className);
121
- return className;
122
- } catch (ClassNotFoundException e) {}
123
- }
113
+ String split = splitCamelCase(descriptorName)+" "; // space mark possible positions for 'Descriptor'
114
+ for(int i = split.length()-1; i>0; i--) {
115
+ if (split.charAt(i)==' ') { // iterate over all spaces, starting with the trailing one
116
+ String test = split.substring(0,i)+"Descriptor"+split.substring(i+1,split.length()); // replace current space with 'Descriptor' ..
117
+ test = test.replaceAll("\\s",""); // .. and remove other spaces
118
+ String className = "org.openscience.cdk.qsar.descriptors.molecular." + test;
119
+ try {
120
+ Class.forName(className);
121
+ return className;
122
+ } catch (ClassNotFoundException e) {}
124
123
  }
125
- System.err.println("Descriptor not found: "+descriptorName);
126
- System.exit(1);
127
- return null;
124
+ }
125
+ System.err.println("Descriptor not found: "+descriptorName);
126
+ System.exit(1);
127
+ return null;
128
128
  }
129
129
 
130
130
  /** inserts space in between camel words */
data/java/Rakefile CHANGED
@@ -1,7 +1,7 @@
1
1
  # Java class, classpath
2
2
  java_classes = [
3
- ["CdkDescriptors", "cdk-1.4.19.jar"],
4
- ["CdkDescriptorInfo", "cdk-1.4.19.jar"],
3
+ ["CdkDescriptors", "cdk-2.0-SNAPSHOT.jar"],
4
+ ["CdkDescriptorInfo", "cdk-2.0-SNAPSHOT.jar"],
5
5
  ["JoelibDescriptors", "joelib2.jar:."],
6
6
  ["JoelibDescriptorInfo", "joelib2.jar:."],
7
7
  ]
@@ -10,6 +10,6 @@ task :default => java_classes.collect{|c| "#{c.first}.class"}
10
10
 
11
11
  java_classes.each do |c|
12
12
  file "#{c.first}.class" => "#{c.first}.java" do
13
- puts `javac -classpath #{c.last} #{c.first}.java`
13
+ puts `javac -Xlint:deprecation -classpath #{c.last} #{c.first}.java`
14
14
  end
15
15
  end
data/lazar.gemspec CHANGED
@@ -18,11 +18,10 @@ Gem::Specification.new do |s|
18
18
  s.require_paths = ["lib"]
19
19
 
20
20
  # specify any dependencies here; for example:
21
- s.add_runtime_dependency 'bundler', '~> 1.11'
22
- s.add_runtime_dependency 'rest-client', '~> 1.8'
23
- s.add_runtime_dependency 'nokogiri', '~> 1.6'
24
- s.add_runtime_dependency 'rserve-client', '~> 0.3'
25
- s.add_runtime_dependency 'mongoid', '~> 5.0'
26
- s.add_runtime_dependency 'openbabel', '~> 2.3', '>= 2.3.2.2'
27
-
21
+ s.add_runtime_dependency 'bundler'
22
+ s.add_runtime_dependency 'rest-client'
23
+ s.add_runtime_dependency 'nokogiri'
24
+ s.add_runtime_dependency 'rserve-client'
25
+ s.add_runtime_dependency 'mongoid'
26
+ s.add_runtime_dependency 'openbabel'
28
27
  end
data/lib/algorithm.rb CHANGED
@@ -2,18 +2,9 @@ module OpenTox
2
2
 
3
3
  module Algorithm
4
4
 
5
- # Generic method to execute algorithms
6
- # Algorithms should:
7
- # - accept a Compound, an Array of Compounds or a Dataset as first argument
8
- # - optional parameters as second argument
9
- # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values
10
- # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object
11
- # @param [Hash] Algorithm parameters
12
- # @return Algorithm result
13
- def self.run algorithm, object, parameters=nil
14
- bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
5
+ def self.run algorithm, parameters=nil
15
6
  klass,method = algorithm.split('.')
16
- parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
7
+ Object.const_get(klass).send(method,parameters)
17
8
  end
18
9
 
19
10
  end
data/lib/caret.rb ADDED
@@ -0,0 +1,96 @@
module OpenTox
  module Algorithm

    # Local regression models built with the R caret package.
    # Model list: https://topepo.github.io/caret/modelList.html
    #
    # Every unknown class method is treated as the name of a caret model,
    # e.g. `Caret.pls(...)` or `Caret.rf(...)` (see {method_missing}).
    class Caret

      # z-value used to turn the training RMSE into a prediction interval
      PREDICTION_INTERVAL_FACTOR = 1.96

      # Train a caret model and predict the query. Whenever no model can be
      # built or used, the result of Algorithm::Regression.weighted_average is
      # returned instead, with a :warning entry explaining why.
      #
      # The input arrays are not modified; all filtering and conversion is
      # done on copies.
      #
      # @param dependent_variables [Array] one activity per training substance
      # @param independent_variables [Array<Array>] one array per descriptor,
      #   each holding that descriptor's value for every training substance
      # @param weights [Array] one weight per training substance
      # @param method [String] caret model name (e.g. "pls", "rf")
      # @param query_variables [Array] descriptor values of the query, in the
      #   same order as independent_variables
      # @return [Hash] :value plus either :rmse, :r_squared and
      #   :prediction_interval (caret model) or :warning (fallback)
      def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables:
        columns = independent_variables.map(&:dup)
        query = query_variables.dup

        # descriptors with a single distinct value carry no information;
        # delete from the back so the remaining indices stay valid
        constant = columns.each_index.select { |i| columns[i].uniq.size == 1 }
        constant.reverse_each do |i|
          columns.delete_at i
          query.delete_at i
        end

        remaining_values = columns.flatten.uniq
        if remaining_values == ["NA"] || remaining_values.empty?
          return weighted_average_fallback(dependent_variables, weights,
            "No variables for regression model. Using weighted average of similar substances.")
        end
        if dependent_variables.size < 3
          return weighted_average_fallback(dependent_variables, weights,
            "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances.")
        end

        # convert Ruby values into tokens that can be pasted into R source
        activities = dependent_variables.map { |v| to_r(v) }
        columns = columns.map { |column| column.map { |v| to_r(v) } }
        query = query.map { |v| to_r(v) }

        begin
          # NOTE: weights are assigned in R but not passed to train() below
          R.assign "weights", weights
          r_vectors = ([activities] + columns).map { |r| "c(#{r.join(',')})" }
          R.eval "data <- data.frame(#{r_vectors.join(', ')})"
          R.assign "features", (0...columns.size).to_a
          R.eval "names(data) <- append(c('activities'),features)"
          R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
        rescue => e
          $logger.debug "R caret model creation error for:"
          $logger.debug activities
          $logger.debug columns
          $logger.debug e.message
          return weighted_average_fallback(dependent_variables, weights,
            "R caret model creation error. Using weighted average of similar substances.")
        end

        begin
          R.eval "query <- data.frame(rbind(c(#{query.join ','})))"
          R.eval "names(query) <- features"
          R.eval "prediction <- predict(model,query)"
          value = R.eval("prediction").to_f
          rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
          r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
          prediction = {
            :value => value,
            :rmse => rmse,
            :r_squared => r_squared,
            :prediction_interval => [value - PREDICTION_INTERVAL_FACTOR*rmse, value + PREDICTION_INTERVAL_FACTOR*rmse]
          }
        rescue => e
          $logger.debug "R caret prediction error for:"
          $logger.debug self.inspect
          $logger.debug e.message
          return weighted_average_fallback(dependent_variables, weights,
            "R caret prediction error. Using weighted average of similar substances")
        end

        if prediction.nil? || prediction[:value].nil?
          return weighted_average_fallback(dependent_variables, weights,
            "Could not create local caret model. Using weighted average of similar substances.")
        end
        prediction
      end

      # Call caret methods dynamically, e.g. Caret.pls: the method name
      # becomes the :method parameter of create_model_and_predict.
      #
      # The parameter hash is splatted explicitly, because Ruby >= 3.0 no
      # longer converts a positional Hash into keyword arguments. The caller's
      # hash is left untouched. Calls without a parameter hash are passed on
      # to the default method_missing, so they raise a regular NoMethodError.
      # No respond_to_missing? is defined: valid model names are only known
      # to R and cannot be enumerated here.
      #
      # @param sym [Symbol] caret model name
      # @param args [Array] a single Hash with the remaining keyword
      #   arguments of create_model_and_predict (symbol keys)
      # @return [Hash] see create_model_and_predict
      def self.method_missing(sym, *args, &block)
        return super unless args.first.is_a?(Hash)
        create_model_and_predict(**args.first.merge(:method => sym.to_s))
      end

      # Convert a Ruby value into a token for generated R code.
      # Missing values (nil, NaN) become "NA"; an empty token would produce
      # invalid R such as `c(1,,2)`.
      #
      # @param v [Object] value to convert
      # @return [Object] "T", "F", "NA" or the unchanged value
      def self.to_r v
        return "F" if v == false
        return "T" if v == true
        return "NA" if v.nil? || (v.is_a?(Float) && v.nan?)
        v
      end

      # Weighted average prediction, annotated with the reason why no caret
      # model was used.
      def self.weighted_average_fallback dependent_variables, weights, warning
        prediction = Algorithm::Regression.weighted_average(dependent_variables: dependent_variables, weights: weights)
        prediction[:warning] = warning
        prediction
      end
      private_class_method :weighted_average_fallback

    end
  end
end
@@ -3,32 +3,24 @@ module OpenTox
3
3
 
4
4
  class Classification
5
5
 
6
- def self.weighted_majority_vote compound, params
7
- neighbors = params[:neighbors]
8
- weighted_sum = {}
9
- sim_sum = 0.0
10
- confidence = 0.0
11
- neighbors.each do |row|
12
- sim = row["tanimoto"]
13
- row["features"][params[:prediction_feature_id].to_s].each do |act|
14
- weighted_sum[act] ||= 0
15
- weighted_sum[act] += sim
16
- end
6
+ def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:
7
+ class_weights = {}
8
+ dependent_variables.each_with_index do |v,i|
9
+ class_weights[v] ||= []
10
+ class_weights[v] << weights[i] unless v.nil?
17
11
  end
18
- case weighted_sum.size
19
- when 1
20
- return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs}
21
- when 2
22
- sim_sum = weighted_sum[weighted_sum.keys[0]]
23
- sim_sum -= weighted_sum[weighted_sum.keys[1]]
24
- sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
25
- confidence = (sim_sum/neighbors.size).abs
26
- return {:value => prediction,:confidence => confidence}
27
- else
28
- bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
12
+ probabilities = {}
13
+ class_weights.each do |a,w|
14
+ probabilities[a] = w.sum/weights.sum
29
15
  end
16
+ probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
17
+ p_max = probabilities.collect{|a,p| p}.max
18
+ prediction = probabilities.key(p_max)
19
+ {:value => prediction,:probabilities => probabilities}
30
20
  end
21
+
31
22
  end
23
+
32
24
  end
33
25
  end
34
26
 
data/lib/compound.rb CHANGED
@@ -1,11 +1,9 @@
1
- CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
1
+ CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/"
2
2
 
3
3
  module OpenTox
4
4
 
5
- class Compound
5
+ class Compound < Substance
6
6
  require_relative "unique_descriptors.rb"
7
- include OpenTox
8
-
9
7
  DEFAULT_FINGERPRINT = "MP2D"
10
8
 
11
9
  field :inchi, type: String
@@ -19,9 +17,6 @@ module OpenTox
19
17
  field :sdf_id, type: BSON::ObjectId
20
18
  field :fingerprints, type: Hash, default: {}
21
19
  field :default_fingerprint_size, type: Integer
22
- field :physchem_descriptors, type: Hash, default: {}
23
- field :dataset_ids, type: Array, default: []
24
- field :features, type: Hash, default: {}
25
20
 
26
21
  index({smiles: 1}, {unique: true})
27
22
 
@@ -80,9 +75,8 @@ module OpenTox
80
75
  fingerprints[type]
81
76
  end
82
77
 
83
- def physchem descriptors=PhysChem.openbabel_descriptors
84
- # TODO: speedup java descriptors
85
- calculated_ids = physchem_descriptors.keys
78
+ def calculate_properties descriptors=PhysChem::OPENBABEL
79
+ calculated_ids = properties.keys
86
80
  # BSON::ObjectId instances are not allowed as keys in a BSON document.
87
81
  new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
88
82
  descs = {}
@@ -95,11 +89,11 @@ module OpenTox
95
89
  # avoid recalculating Cdk features with multiple values
96
90
  descs.keys.uniq.each do |k|
97
91
  descs[k].send(k[0].downcase,k[1],self).each do |n,v|
98
- physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
92
+ properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
99
93
  end
100
94
  end
101
95
  save
102
- physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
96
+ descriptors.collect{|d| properties[d.id.to_s]}
103
97
  end
104
98
 
105
99
  def smarts_match smarts, count=false
@@ -142,9 +136,6 @@ module OpenTox
142
136
  # @param inchi [String] InChI string
143
137
  # @return [OpenTox::Compound] Compound
144
138
  def self.from_inchi inchi
145
- # Temporary workaround for OpenBabels Inchi bug
146
- # http://sourceforge.net/p/openbabel/bugs/957/
147
- # bug has not been fixed in latest git/development version
148
139
  #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
149
140
  smiles = obconversion(inchi,"inchi","can")
150
141
  if smiles.empty?
@@ -246,7 +237,7 @@ module OpenTox
246
237
 
247
238
  # @return [String] PubChem Compound Identifier (CID), derived via REST call to PubChem
248
239
  def cid
249
- pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/"
240
+ pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
250
241
  update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
251
242
  self["cid"]
252
243
  end
@@ -254,70 +245,13 @@ module OpenTox
254
245
  # @return [String] ChEMBL database compound id, derived via REST call to ChEMBL
255
246
  def chemblid
256
247
  # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
257
- uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
248
+ uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
258
249
  update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"]
259
250
  self["chemblid"]
260
251
  end
261
252
 
262
- def fingerprint_count_neighbors params
263
- # TODO fix
264
- neighbors = []
265
- query_fingerprint = self.fingerprint params[:type]
266
- training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
267
- unless self == compound
268
- candidate_fingerprint = compound.fingerprint params[:type]
269
- features = (query_fingerprint + candidate_fingerprint).uniq
270
- min_sum = 0
271
- max_sum = 0
272
- features.each do |f|
273
- min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
274
- min_sum += min
275
- max_sum += max
276
- end
277
- max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
278
- neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
279
- end
280
- end
281
- neighbors.sort{|a,b| b.last <=> a.last}
282
- end
283
-
284
- def fingerprint_neighbors params
285
- bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
286
- neighbors = []
287
- if params[:type] == DEFAULT_FINGERPRINT
288
- neighbors = db_neighbors params
289
- else
290
- query_fingerprint = self.fingerprint params[:type]
291
- training_dataset = Dataset.find(params[:training_dataset_id])
292
- prediction_feature = training_dataset.features.first
293
- training_dataset.compounds.each do |compound|
294
- candidate_fingerprint = compound.fingerprint params[:type]
295
- sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
296
- feature_values = training_dataset.values(compound,prediction_feature)
297
- neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
298
- end
299
- neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
300
- end
301
- neighbors
302
- end
303
-
304
- def physchem_neighbors params
305
- feature_dataset = Dataset.find params[:feature_dataset_id]
306
- query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
307
- neighbors = []
308
- feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
309
- # TODO implement pearson and cosine similarity separatly
310
- R.assign "x", query_fingerprint
311
- R.assign "y", candidate_fingerprint
312
- sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
313
- if sim >= params[:min_sim]
314
- neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
315
- end
316
- end
317
- neighbors
318
- end
319
-
320
- def db_neighbors params
253
+ def db_neighbors min_sim: 0.1, dataset_id:
254
+ #p fingerprints[DEFAULT_FINGERPRINT]
321
255
  # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
322
256
 
323
257
  #qn = default_fingerprint_size
@@ -329,31 +263,31 @@ module OpenTox
329
263
  #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
330
264
  #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
331
265
  {'$project' => {
332
- 'tanimoto' => {'$let' => {
266
+ 'similarity' => {'$let' => {
333
267
  'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
334
- #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
335
268
  'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
336
269
  }},
337
270
  '_id' => 1,
338
- 'features' => 1,
271
+ #'measurements' => 1,
339
272
  'dataset_ids' => 1
340
273
  }},
341
- {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
342
- {'$sort' => {'tanimoto' => -1}}
274
+ {'$match' => {'similarity' => {'$gte' => min_sim}}},
275
+ {'$sort' => {'similarity' => -1}}
343
276
  ]
344
-
345
- $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
277
+
278
+ # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array
279
+ $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id}
346
280
 
347
281
  end
348
282
 
349
- # Convert mg to mmol
283
+ # Convert mmol to mg
350
284
  # @return [Float] value in mg
351
285
  def mmol_to_mg mmol
352
286
  mmol.to_f*molecular_weight
353
287
  end
354
288
 
355
- # Convert mmol to mg
356
- # @return [Float] value in mg
289
+ # Convert mg to mmol
290
+ # @return [Float] value in mmol
357
291
  def mg_to_mmol mg
358
292
  mg.to_f/molecular_weight
359
293
  end
@@ -362,7 +296,7 @@ module OpenTox
362
296
  # @return [Float] molecular weight
363
297
  def molecular_weight
364
298
  mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
365
- physchem([mw_feature])[mw_feature.id.to_s]
299
+ calculate_properties([mw_feature]).first
366
300
  end
367
301
 
368
302
  private