lazar 0.9.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 44e8fb9b8d65ca3f2fb8d02fb08c76e98ebc140c
-  data.tar.gz: d62b490434324e405ad10a13b72fc51574e02404
+  metadata.gz: 2211d5cf1767b241583acff9a22379b56a5d8f1c
+  data.tar.gz: 923a3d00d5c78fd77a2153c973c5e3935c939eda
 SHA512:
-  metadata.gz: 2ea37844e810a1410453e36b87e9d4473226bd78a57f692f8f46f8c56153fec13cb3a320c3f0df718242bca7aed13bebb510192812ee10ce41c3acd1a36d8c92
-  data.tar.gz: 696378dea89f26a8a50c96e20de20a4fcbc8a717a22c3198ba352f06b92c4d597c8a5903352f249ab0e9af295aee803f60f37a0755253cee9bc7a7f5ce5556dd
+  metadata.gz: 2a366bae505c427a72211df4d59c7f296ead656bfe3f42db0fb6bb2dc3885028c70ba9df0aa7778c0bd78acdbd7b2939417caafd342a535c4954a34fef410c8d
+  data.tar.gz: 04fd93e7ab52517d338e6005223fe22b498d74be324f8dc6ef2e3a4d4a843202abc9224ff55e8ba053ce7a16a6a76301437f4fc061ac2719d65ff3afa392396a
data/.gitignore CHANGED
@@ -1,8 +1,5 @@
-last-utils
-libfminer
+R
 openbabel
-fminer_debug.txt
-test/fminer_debug.txt
 Gemfile.lock
 *.gem
 .bundle
data/README.md CHANGED
@@ -6,31 +6,21 @@ Ruby libraries for the lazar framework
 Dependencies
 ------------
 
-lazar depends on a couple of external programs and libraries. On Debian 7 "Wheezy" systems you can install them with
-
-`sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev r-cran-rserve openjdk-7-jre libgsl0-dev libxml2-dev zlib1g-dev libcairo2-dev`
-
-You will also need at least mongodb version 3.0, but Debian "Wheezy" provides version 2.4. Please follow the instructions at http://docs.mongodb.org/manual/tutorial/install-mongodb-on-debian/:
-
-```
-sudo apt-key adv --keyserver keyserver.ubuntu.com --recv 7F0CEB10
-echo "deb http://repo.mongodb.org/apt/debian wheezy/mongodb-org/3.0 main" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list
-sudo apt-get update
-sudo apt-get install -y mongodb-org
-```
+lazar depends on a couple of external programs and libraries. All required libraries will be installed with the `gem install lazar` command.
+If any of the dependencies fails to install, please check if all required development packages are installed from your operating systems package manager (e.g. `apt`, `rpm`, `pacman`, ...).
+You will need a working Java runtime to use descriptor calculation algorithms from CDK and JOELib libraries.
 
 Installation
 ------------
 
 `gem install lazar`
 
-Please be patient, the compilation of OpenBabel and Fminer libraries can be very time consuming. If installation fails you can try to install manually:
+Please be patient, the compilation of external libraries can be very time consuming. If installation fails you can try to install manually:
 
 ```
 git clone https://github.com/opentox/lazar.git
 cd lazar
 ruby ext/lazar/extconf.rb
-sudo Rscript ext/lazar/rinstall.R
 bundle install
 ```
 
@@ -42,4 +32,4 @@
 
 Copyright
 ---------
-Copyright (c) 2009-2015 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
+Copyright (c) 2009-2016 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
data/VERSION CHANGED
@@ -1 +1 @@
-0.9.3
+1.0.0
data/ext/lazar/extconf.rb CHANGED
@@ -15,7 +15,7 @@ abort "Please install Rserve on your system. Execute 'install.packages('Rserve')
 # install R packages
 r_dir = File.join main_dir, "R"
 FileUtils.mkdir_p r_dir
-FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
+#FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
 rinstall = File.expand_path(File.join(File.dirname(__FILE__),"rinstall.R"))
 puts `Rscript --vanilla #{rinstall} #{r_dir}`
 
data/ext/lazar/rinstall.R CHANGED
@@ -1,10 +1,12 @@
 libdir = commandArgs(trailingOnly=TRUE)[1]
 repo = "https://stat.ethz.ch/CRAN/"
 #install.packages("Rserve",lib=libdir,repos=repo,dependencies=TRUE)
-install.packages("iterators",lib=libdir,repos=repo);
-install.packages("foreach",lib=libdir,repos=repo);
-install.packages("gridExtra",lib=libdir,repos=repo);
-install.packages("ggplot2",lib=libdir,repos=repo);
-install.packages("pls",lib=libdir,repos=repo);
-install.packages("caret",lib=libdir,repos=repo);
-install.packages("doMC",lib=libdir,repos=repo);
+install.packages("stringi",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("iterators",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("foreach",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("gridExtra",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("ggplot2",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("pls",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("randomForest",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("caret",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("doMC",lib=libdir,repos=repo,dependencies=TRUE);
data/java/CdkDescriptorInfo.class CHANGED
Binary file
data/java/CdkDescriptorInfo.java CHANGED
@@ -1,11 +1,12 @@
 import java.util.*;
-import org.openscience.cdk.qsar.descriptors.molecular.*;
+import org.openscience.cdk.DefaultChemObjectBuilder;
 import org.openscience.cdk.qsar.*;
+//import org.openscience.cdk.qsar.descriptors.molecular.*;
 
 class CdkDescriptorInfo {
   public static void main(String[] args) {
 
-    DescriptorEngine engine = new DescriptorEngine(DescriptorEngine.MOLECULAR);
+    DescriptorEngine engine = new DescriptorEngine(IMolecularDescriptor.class,null);
 
     for (Iterator<IDescriptor> it = engine.getDescriptorInstances().iterator(); it.hasNext(); ) {
       IDescriptor descriptor = it.next();
data/java/CdkDescriptors.class CHANGED
Binary file
data/java/CdkDescriptors.java CHANGED
@@ -1,10 +1,10 @@
 import java.util.*;
 import java.io.*;
 import org.openscience.cdk.DefaultChemObjectBuilder;
-import org.openscience.cdk.interfaces.IMolecule;
-import org.openscience.cdk.io.iterator.IteratingMDLReader;
+import org.openscience.cdk.IImplementationSpecification;
+import org.openscience.cdk.interfaces.IAtomContainer;
+import org.openscience.cdk.io.iterator.IteratingSDFReader;
 import org.openscience.cdk.qsar.*;
-import org.openscience.cdk.qsar.DescriptorValue;
 import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector;
 import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
 import org.openscience.cdk.exception.NoSuchAtomTypeException;
@@ -17,8 +17,8 @@ class CdkDescriptors {
       System.exit(1);
     }
     if (! new File(args[0]).exists()){
-      System.err.println("file not found "+args[0]);
-      System.exit(1);
+      System.err.println("file not found "+args[0]);
+      System.exit(1);
     }
 
     // command line descriptor params can be either "descriptorName" or "descriptorValueName"
@@ -34,19 +34,19 @@ class CdkDescriptors {
     for (int i =1; i < args.length; i++) {
       String descriptorName;
       if (args[i].indexOf(".")!=-1) {
-        descriptorValueNames.add(args[i]);
-        descriptorName = args[i].substring(0,args[i].indexOf("."));
+        descriptorValueNames.add(args[i]);
+        descriptorName = args[i].substring(0,args[i].indexOf("."));
       }
       else {
-        descriptorNames.add(args[i]);
-        descriptorName = args[i];
+        descriptorNames.add(args[i]);
+        descriptorName = args[i];
       }
       classNames.add(getDescriptorClassName(descriptorName));
     }
 
-    engine = new DescriptorEngine(new ArrayList<String>(classNames));
+    engine = new DescriptorEngine(new ArrayList<String>(classNames),null);
     List<IDescriptor> instances = engine.instantiateDescriptors(new ArrayList<String>(classNames));
-    List<DescriptorSpecification> specs = engine.initializeSpecifications(instances);
+    List<IImplementationSpecification> specs = engine.initializeSpecifications(instances);
     engine.setDescriptorInstances(instances);
     engine.setDescriptorSpecifications(specs);
 
@@ -54,13 +54,13 @@ class CdkDescriptors {
     BufferedReader br = new BufferedReader(new FileReader(args[0]));
     PrintWriter yaml = new PrintWriter(new FileWriter(args[0]+"cdk.yaml"));
     // parse 3d sdf from file and calculate descriptors
-    IteratingMDLReader reader = new IteratingMDLReader( br, DefaultChemObjectBuilder.getInstance());
+    IteratingSDFReader reader = new IteratingSDFReader( br, DefaultChemObjectBuilder.getInstance());
     int c = 0;
     while (reader.hasNext()) {
       try {
         System.out.println("computing "+(args.length-1)+" descriptors for compound "+(++c));
-        IMolecule molecule = (IMolecule)reader.next();
-        molecule = (IMolecule) AtomContainerManipulator.removeHydrogens(molecule);
+        IAtomContainer molecule = (IAtomContainer)reader.next();
+        molecule = (IAtomContainer) AtomContainerManipulator.removeHydrogens(molecule);
         try {
           AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule);
         }
@@ -110,21 +110,21 @@ class CdkDescriptors {
   * problem: Descriptor is not always at the end of the class (APolDescriptor), but may be in the middle (AutocorrelationDescriptorPolarizability)
   * this method makes a class-lookup using trial and error */
  static String getDescriptorClassName(String descriptorName) {
-    String split = splitCamelCase(descriptorName)+" "; // space mark possible positions for 'Descriptor'
-    for(int i = split.length()-1; i>0; i--) {
-      if (split.charAt(i)==' ') { // iterate over all spaces, starting with the trailing one
-        String test = split.substring(0,i)+"Descriptor"+split.substring(i+1,split.length()); // replace current space with 'Descriptor' ..
-        test = test.replaceAll("\\s",""); // .. and remove other spaces
-        String className = "org.openscience.cdk.qsar.descriptors.molecular." + test;
-        try {
-          Class.forName(className);
-          return className;
-        } catch (ClassNotFoundException e) {}
-      }
+    String split = splitCamelCase(descriptorName)+" "; // space mark possible positions for 'Descriptor'
+    for(int i = split.length()-1; i>0; i--) {
+      if (split.charAt(i)==' ') { // iterate over all spaces, starting with the trailing one
+        String test = split.substring(0,i)+"Descriptor"+split.substring(i+1,split.length()); // replace current space with 'Descriptor' ..
+        test = test.replaceAll("\\s",""); // .. and remove other spaces
+        String className = "org.openscience.cdk.qsar.descriptors.molecular." + test;
+        try {
+          Class.forName(className);
+          return className;
+        } catch (ClassNotFoundException e) {}
     }
-    System.err.println("Descriptor not found: "+descriptorName);
-    System.exit(1);
-    return null;
+      }
+    System.err.println("Descriptor not found: "+descriptorName);
+    System.exit(1);
+    return null;
   }
 
  /** inserts space in between camel words */
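The trial-and-error lookup described in the comment above can be traced with a short sketch (written in Ruby for brevity; the descriptor name and the simplified camel-case split are assumptions, and the real code does the equivalent via Class.forName):

    # Illustration only -- not part of the diff
    name  = "AutocorrelationPolarizability"
    split = name.gsub(/([a-z])([A-Z])/, '\1 \2') + " "   # => "Autocorrelation Polarizability "
    candidates = []
    (split.length - 1).downto(1) do |i|
      next unless split[i] == " "
      # replace the current space with 'Descriptor' and drop the remaining spaces
      candidates << (split[0...i] + "Descriptor" + split[(i + 1)..-1]).gsub(/\s/, "")
    end
    # candidates => ["AutocorrelationPolarizabilityDescriptor", "AutocorrelationDescriptorPolarizability"]
    # each candidate is prefixed with "org.openscience.cdk.qsar.descriptors.molecular." and the first
    # class that loads wins; the second candidate matches the CDK class named in the comment above.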
data/java/Rakefile CHANGED
@@ -1,7 +1,7 @@
 # Java class, classpath
 java_classes = [
-  ["CdkDescriptors", "cdk-1.4.19.jar"],
-  ["CdkDescriptorInfo", "cdk-1.4.19.jar"],
+  ["CdkDescriptors", "cdk-2.0-SNAPSHOT.jar"],
+  ["CdkDescriptorInfo", "cdk-2.0-SNAPSHOT.jar"],
   ["JoelibDescriptors", "joelib2.jar:."],
   ["JoelibDescriptorInfo", "joelib2.jar:."],
 ]
@@ -10,6 +10,6 @@ task :default => java_classes.collect{|c| "#{c.first}.class"}
 
 java_classes.each do |c|
   file "#{c.first}.class" => "#{c.first}.java" do
-    puts `javac -classpath #{c.last} #{c.first}.java`
+    puts `javac -Xlint:deprecation -classpath #{c.last} #{c.first}.java`
   end
 end
data/lazar.gemspec CHANGED
@@ -18,11 +18,10 @@ Gem::Specification.new do |s|
   s.require_paths = ["lib"]
 
   # specify any dependencies here; for example:
-  s.add_runtime_dependency 'bundler', '~> 1.11'
-  s.add_runtime_dependency 'rest-client', '~> 1.8'
-  s.add_runtime_dependency 'nokogiri', '~> 1.6'
-  s.add_runtime_dependency 'rserve-client', '~> 0.3'
-  s.add_runtime_dependency 'mongoid', '~> 5.0'
-  s.add_runtime_dependency 'openbabel', '~> 2.3', '>= 2.3.2.2'
-
+  s.add_runtime_dependency 'bundler'
+  s.add_runtime_dependency 'rest-client'
+  s.add_runtime_dependency 'nokogiri'
+  s.add_runtime_dependency 'rserve-client'
+  s.add_runtime_dependency 'mongoid'
+  s.add_runtime_dependency 'openbabel'
 end
data/lib/algorithm.rb CHANGED
@@ -2,18 +2,9 @@ module OpenTox
 
   module Algorithm
 
-    # Generic method to execute algorithms
-    # Algorithms should:
-    # - accept a Compound, an Array of Compounds or a Dataset as first argument
-    # - optional parameters as second argument
-    # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values
-    # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object
-    # @param [Hash] Algorithm parameters
-    # @return Algorithm result
-    def self.run algorithm, object, parameters=nil
-      bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
+    def self.run algorithm, parameters=nil
       klass,method = algorithm.split('.')
-      parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
+      Object.const_get(klass).send(method,parameters)
     end
 
   end
 end
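The slimmed-down Algorithm.run resolves the class with Object.const_get and sends the method name together with a single parameter object. A hypothetical call (parameter values invented, and assuming the Regression.weighted_average keyword interface referenced elsewhere in this diff):

    # Illustration only -- not part of the diff
    params = { dependent_variables: [1.2, 0.8, 1.5],   # invented neighbor activities
               weights: [0.9, 0.7, 0.4] }              # invented similarity weights
    # "OpenTox::Algorithm::Regression" is looked up via const_get, "weighted_average" via send
    OpenTox::Algorithm.run "OpenTox::Algorithm::Regression.weighted_average", params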
data/lib/caret.rb ADDED
@@ -0,0 +1,96 @@
+module OpenTox
+  module Algorithm
+
+    class Caret
+      # model list: https://topepo.github.io/caret/modelList.html
+
+      def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables:
+        remove = []
+        # remove independent_variables with single values
+        independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1}
+        remove.sort.reverse.each do |i|
+          independent_variables.delete_at i
+          query_variables.delete_at i
+        end
+        if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
+          prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+        elsif
+          dependent_variables.size < 3
+          prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+          prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
+
+        else
+          dependent_variables.each_with_index do |v,i|
+            dependent_variables[i] = to_r(v)
+          end
+          independent_variables.each_with_index do |c,i|
+            c.each_with_index do |v,j|
+              independent_variables[i][j] = to_r(v)
+            end
+          end
+          query_variables.each_with_index do |v,i|
+            query_variables[i] = to_r(v)
+          end
+          begin
+            R.assign "weights", weights
+            r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+            R.eval "data <- #{r_data_frame}"
+            R.assign "features", (0..independent_variables.size-1).to_a
+            R.eval "names(data) <- append(c('activities'),features)" #
+            R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
+          rescue => e
+            $logger.debug "R caret model creation error for:"
+            $logger.debug dependent_variables
+            $logger.debug independent_variables
+            prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+            prediction[:warning] = "R caret model creation error. Using weighted average of similar substances."
+            return prediction
+          end
+          begin
+            R.eval "query <- data.frame(rbind(c(#{query_variables.join ','})))"
+            R.eval "names(query) <- features"
+            R.eval "prediction <- predict(model,query)"
+            value = R.eval("prediction").to_f
+            rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
+            r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
+            prediction_interval = value-1.96*rmse, value+1.96*rmse
+            prediction = {
+              :value => value,
+              :rmse => rmse,
+              :r_squared => r_squared,
+              :prediction_interval => prediction_interval
+            }
+          rescue => e
+            $logger.debug "R caret prediction error for:"
+            $logger.debug self.inspect
+            prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+            prediction[:warning] = "R caret prediction error. Using weighted average of similar substances"
+            return prediction
+          end
+          if prediction.nil? or prediction[:value].nil?
+            prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+            prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
+          end
+        end
+        prediction
+
+      end
+
+      # call caret methods dynamically, e.g. Caret.pls
+      def self.method_missing(sym, *args, &block)
+        args.first[:method] = sym.to_s
+        self.create_model_and_predict args.first
+      end
+
+      def self.to_r v
+        return "F" if v == false
+        return "T" if v == true
+        return nil if v.is_a? Float and v.nan?
+        v
+      end
+
+    end
+  end
+end
+
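As the comment in the new file says, any unknown class-method call on Caret is turned into caret's `method` parameter via method_missing. A hypothetical call with invented data (pls is one of the models installed by rinstall.R above) could look like this:

    # Illustration only -- not part of the diff
    prediction = OpenTox::Algorithm::Caret.pls(
      dependent_variables: [1.4, 0.9, 2.1, 1.7],      # activities of the neighbors (invented)
      independent_variables: [[1.0, 1.2, 0.8, 1.1],   # one array per descriptor,
                              [0.3, 0.5, 0.2, 0.4]],  # one value per neighbor
      weights: [0.9, 0.8, 0.6, 0.5],                  # similarity weights of the neighbors
      query_variables: [1.0, 0.35]                    # descriptor values of the query substance
    )
    # => {:value => ..., :rmse => ..., :r_squared => ..., :prediction_interval => [...]}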
data/lib/classification.rb CHANGED
@@ -3,32 +3,24 @@ module OpenTox
 
     class Classification
 
-      def self.weighted_majority_vote compound, params
-        neighbors = params[:neighbors]
-        weighted_sum = {}
-        sim_sum = 0.0
-        confidence = 0.0
-        neighbors.each do |row|
-          sim = row["tanimoto"]
-          row["features"][params[:prediction_feature_id].to_s].each do |act|
-            weighted_sum[act] ||= 0
-            weighted_sum[act] += sim
-          end
+      def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:
+        class_weights = {}
+        dependent_variables.each_with_index do |v,i|
+          class_weights[v] ||= []
+          class_weights[v] << weights[i] unless v.nil?
         end
-        case weighted_sum.size
-        when 1
-          return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs}
-        when 2
-          sim_sum = weighted_sum[weighted_sum.keys[0]]
-          sim_sum -= weighted_sum[weighted_sum.keys[1]]
-          sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
-          confidence = (sim_sum/neighbors.size).abs
-          return {:value => prediction,:confidence => confidence}
-        else
-          bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
+        probabilities = {}
+        class_weights.each do |a,w|
+          probabilities[a] = w.sum/weights.sum
         end
+        probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
+        p_max = probabilities.collect{|a,p| p}.max
+        prediction = probabilities.key(p_max)
+        {:value => prediction,:probabilities => probabilities}
       end
+
     end
+
   end
 end
 
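The rewritten weighted_majority_vote returns per-class probabilities instead of a single confidence value. A worked example with invented neighbor data:

    # Illustration only -- not part of the diff
    # dependent_variables = ["active", "active", "inactive"], weights = [0.9, 0.5, 0.8]
    # class_weights  => {"active" => [0.9, 0.5], "inactive" => [0.8]}
    # raw fractions  => {"active" => 1.4/2.2, "inactive" => 0.8/2.2}
    # scaled by weights.max (0.9) => {"active" => ~0.57, "inactive" => ~0.33}
    # result => {:value => "active", :probabilities => {"active" => ~0.57, "inactive" => ~0.33}}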
data/lib/compound.rb CHANGED
@@ -1,11 +1,9 @@
-CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
+CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/"
 
 module OpenTox
 
-  class Compound
+  class Compound < Substance
     require_relative "unique_descriptors.rb"
-    include OpenTox
-
     DEFAULT_FINGERPRINT = "MP2D"
 
     field :inchi, type: String
@@ -19,9 +17,6 @@ module OpenTox
     field :sdf_id, type: BSON::ObjectId
     field :fingerprints, type: Hash, default: {}
     field :default_fingerprint_size, type: Integer
-    field :physchem_descriptors, type: Hash, default: {}
-    field :dataset_ids, type: Array, default: []
-    field :features, type: Hash, default: {}
 
     index({smiles: 1}, {unique: true})
 
@@ -80,9 +75,8 @@ module OpenTox
      fingerprints[type]
    end
 
-    def physchem descriptors=PhysChem.openbabel_descriptors
-      # TODO: speedup java descriptors
-      calculated_ids = physchem_descriptors.keys
+    def calculate_properties descriptors=PhysChem::OPENBABEL
+      calculated_ids = properties.keys
      # BSON::ObjectId instances are not allowed as keys in a BSON document.
      new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
      descs = {}
@@ -95,11 +89,11 @@ module OpenTox
      # avoid recalculating Cdk features with multiple values
      descs.keys.uniq.each do |k|
        descs[k].send(k[0].downcase,k[1],self).each do |n,v|
-          physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
+          properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
        end
      end
      save
-      physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
+      descriptors.collect{|d| properties[d.id.to_s]}
    end
 
    def smarts_match smarts, count=false
@@ -142,9 +136,6 @@ module OpenTox
    # @param inchi [String] smiles InChI string
    # @return [OpenTox::Compound] Compound
    def self.from_inchi inchi
-      # Temporary workaround for OpenBabels Inchi bug
-      # http://sourceforge.net/p/openbabel/bugs/957/
-      # bug has not been fixed in latest git/development version
      #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
      smiles = obconversion(inchi,"inchi","can")
      if smiles.empty?
@@ -246,7 +237,7 @@ module OpenTox
 
    # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem
    def cid
-      pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/"
+      pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
      update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
      self["cid"]
    end
@@ -254,70 +245,13 @@ module OpenTox
    # @return [String] ChEMBL database compound id, derieved via restcall to chembl
    def chemblid
      # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
-      uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
+      uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
      update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"]
      self["chemblid"]
    end
 
-    def fingerprint_count_neighbors params
-      # TODO fix
-      neighbors = []
-      query_fingerprint = self.fingerprint params[:type]
-      training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
-        unless self == compound
-          candidate_fingerprint = compound.fingerprint params[:type]
-          features = (query_fingerprint + candidate_fingerprint).uniq
-          min_sum = 0
-          max_sum = 0
-          features.each do |f|
-            min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
-            min_sum += min
-            max_sum += max
-          end
-          max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
-          neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
-        end
-      end
-      neighbors.sort{|a,b| b.last <=> a.last}
-    end
-
-    def fingerprint_neighbors params
-      bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
-      neighbors = []
-      if params[:type] == DEFAULT_FINGERPRINT
-        neighbors = db_neighbors params
-      else
-        query_fingerprint = self.fingerprint params[:type]
-        training_dataset = Dataset.find(params[:training_dataset_id])
-        prediction_feature = training_dataset.features.first
-        training_dataset.compounds.each do |compound|
-          candidate_fingerprint = compound.fingerprint params[:type]
-          sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
-          feature_values = training_dataset.values(compound,prediction_feature)
-          neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
-        end
-        neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
-      end
-      neighbors
-    end
-
-    def physchem_neighbors params
-      feature_dataset = Dataset.find params[:feature_dataset_id]
-      query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
-      neighbors = []
-      feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
-        # TODO implement pearson and cosine similarity separatly
-        R.assign "x", query_fingerprint
-        R.assign "y", candidate_fingerprint
-        sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
-        if sim >= params[:min_sim]
-          neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
-        end
-      end
-      neighbors
-    end
-
-    def db_neighbors params
+    def db_neighbors min_sim: 0.1, dataset_id:
+      #p fingerprints[DEFAULT_FINGERPRINT]
      # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
 
      #qn = default_fingerprint_size
@@ -329,31 +263,31 @@ module OpenTox
        #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
        #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
        {'$project' => {
-          'tanimoto' => {'$let' => {
+          'similarity' => {'$let' => {
            'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
-           #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
            'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
          }},
          '_id' => 1,
-          'features' => 1,
+          #'measurements' => 1,
          'dataset_ids' => 1
        }},
-        {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
-        {'$sort' => {'tanimoto' => -1}}
+        {'$match' => {'similarity' => {'$gte' => min_sim}}},
+        {'$sort' => {'similarity' => -1}}
      ]
-
-      $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
+
+      # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array
+      $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id}
 
    end
 
-    # Convert mg to mmol
+    # Convert mmol to mg
    # @return [Float] value in mg
    def mmol_to_mg mmol
      mmol.to_f*molecular_weight
    end
 
-    # Convert mmol to mg
-    # @return [Float] value in mg
+    # Convert mg to mmol
+    # @return [Float] value in mmol
    def mg_to_mmol mg
      mg.to_f/molecular_weight
    end
@@ -362,7 +296,7 @@ module OpenTox
    # @return [Float] molecular weight
    def molecular_weight
      mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
-      physchem([mw_feature])[mw_feature.id.to_s]
+      calculate_properties([mw_feature]).first
    end
 
    private
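The aggregation pipeline in db_neighbors computes a Tanimoto coefficient over the stored MP2D fingerprints: the number of shared fragments divided by the size of the union (the two fingerprint sizes minus the shared count). A plain-Ruby sketch of the same calculation with invented fingerprint fragments:

    # Illustration only -- mirrors the '$divide'/'$subtract' expression in the pipeline above
    query    = ["C", "CC", "CCO", "CO"]        # fingerprints[DEFAULT_FINGERPRINT] of the query (invented)
    neighbor = ["C", "CC", "CCN", "CN", "CO"]  # fingerprint of a candidate substance (invented)
    common   = (query & neighbor).size                              # => 3
    tanimoto = common.to_f / (query.size + neighbor.size - common)  # => 3.0 / (4 + 5 - 3) = 0.5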