lazar 0.9.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2211d5cf1767b241583acff9a22379b56a5d8f1c
|
4
|
+
data.tar.gz: 923a3d00d5c78fd77a2153c973c5e3935c939eda
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a366bae505c427a72211df4d59c7f296ead656bfe3f42db0fb6bb2dc3885028c70ba9df0aa7778c0bd78acdbd7b2939417caafd342a535c4954a34fef410c8d
|
7
|
+
data.tar.gz: 04fd93e7ab52517d338e6005223fe22b498d74be324f8dc6ef2e3a4d4a843202abc9224ff55e8ba053ce7a16a6a76301437f4fc061ac2719d65ff3afa392396a
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -6,31 +6,21 @@ Ruby libraries for the lazar framework
|
|
6
6
|
Dependencies
|
7
7
|
------------
|
8
8
|
|
9
|
-
lazar depends on a couple of external programs and libraries.
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
You will also need at least mongodb version 3.0, but Debian "Wheezy" provides version 2.4. Please follow the instructions at http://docs.mongodb.org/manual/tutorial/install-mongodb-on-debian/:
|
14
|
-
|
15
|
-
```
|
16
|
-
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv 7F0CEB10
|
17
|
-
echo "deb http://repo.mongodb.org/apt/debian wheezy/mongodb-org/3.0 main" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list
|
18
|
-
sudo apt-get update
|
19
|
-
sudo apt-get install -y mongodb-org
|
20
|
-
```
|
9
|
+
lazar depends on a couple of external programs and libraries. All required libraries will be installed with the `gem install lazar` command.
|
10
|
+
If any of the dependencies fails to install, please check if all required development packages are installed from your operating systems package manager (e.g. `apt`, `rpm`, `pacman`, ...).
|
11
|
+
You will need a working Java runtime to use descriptor calculation algorithms from CDK and JOELib libraries.
|
21
12
|
|
22
13
|
Installation
|
23
14
|
------------
|
24
15
|
|
25
16
|
`gem install lazar`
|
26
17
|
|
27
|
-
Please be patient, the compilation of
|
18
|
+
Please be patient, the compilation of external libraries can be very time consuming. If installation fails you can try to install manually:
|
28
19
|
|
29
20
|
```
|
30
21
|
git clone https://github.com/opentox/lazar.git
|
31
22
|
cd lazar
|
32
23
|
ruby ext/lazar/extconf.rb
|
33
|
-
sudo Rscript ext/lazar/rinstall.R
|
34
24
|
bundle install
|
35
25
|
```
|
36
26
|
|
@@ -42,4 +32,4 @@ Documentation
|
|
42
32
|
|
43
33
|
Copyright
|
44
34
|
---------
|
45
|
-
Copyright (c) 2009-
|
35
|
+
Copyright (c) 2009-2016 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
1.0.0
|
data/ext/lazar/extconf.rb
CHANGED
@@ -15,7 +15,7 @@ abort "Please install Rserve on your system. Execute 'install.packages('Rserve')
|
|
15
15
|
# install R packages
|
16
16
|
r_dir = File.join main_dir, "R"
|
17
17
|
FileUtils.mkdir_p r_dir
|
18
|
-
FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
|
18
|
+
#FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
|
19
19
|
rinstall = File.expand_path(File.join(File.dirname(__FILE__),"rinstall.R"))
|
20
20
|
puts `Rscript --vanilla #{rinstall} #{r_dir}`
|
21
21
|
|
data/ext/lazar/rinstall.R
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
libdir = commandArgs(trailingOnly=TRUE)[1]
|
2
2
|
repo = "https://stat.ethz.ch/CRAN/"
|
3
3
|
#install.packages("Rserve",lib=libdir,repos=repo,dependencies=TRUE)
|
4
|
-
install.packages("
|
5
|
-
install.packages("
|
6
|
-
install.packages("
|
7
|
-
install.packages("
|
8
|
-
install.packages("
|
9
|
-
install.packages("
|
10
|
-
install.packages("
|
4
|
+
install.packages("stringi",lib=libdir,repos=repo,dependencies=TRUE);
|
5
|
+
install.packages("iterators",lib=libdir,repos=repo,dependencies=TRUE);
|
6
|
+
install.packages("foreach",lib=libdir,repos=repo,dependencies=TRUE);
|
7
|
+
install.packages("gridExtra",lib=libdir,repos=repo,dependencies=TRUE);
|
8
|
+
install.packages("ggplot2",lib=libdir,repos=repo,dependencies=TRUE);
|
9
|
+
install.packages("pls",lib=libdir,repos=repo,dependencies=TRUE);
|
10
|
+
install.packages("randomForest",lib=libdir,repos=repo,dependencies=TRUE);
|
11
|
+
install.packages("caret",lib=libdir,repos=repo,dependencies=TRUE);
|
12
|
+
install.packages("doMC",lib=libdir,repos=repo,dependencies=TRUE);
|
Binary file
|
data/java/CdkDescriptorInfo.java
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
import java.util.*;
|
2
|
-
import org.openscience.cdk.
|
2
|
+
import org.openscience.cdk.DefaultChemObjectBuilder;
|
3
3
|
import org.openscience.cdk.qsar.*;
|
4
|
+
//import org.openscience.cdk.qsar.descriptors.molecular.*;
|
4
5
|
|
5
6
|
class CdkDescriptorInfo {
|
6
7
|
public static void main(String[] args) {
|
7
8
|
|
8
|
-
DescriptorEngine engine = new DescriptorEngine(
|
9
|
+
DescriptorEngine engine = new DescriptorEngine(IMolecularDescriptor.class,null);
|
9
10
|
|
10
11
|
for (Iterator<IDescriptor> it = engine.getDescriptorInstances().iterator(); it.hasNext(); ) {
|
11
12
|
IDescriptor descriptor = it.next();
|
data/java/CdkDescriptors.class
CHANGED
Binary file
|
data/java/CdkDescriptors.java
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
import java.util.*;
|
2
2
|
import java.io.*;
|
3
3
|
import org.openscience.cdk.DefaultChemObjectBuilder;
|
4
|
-
import org.openscience.cdk.
|
5
|
-
import org.openscience.cdk.
|
4
|
+
import org.openscience.cdk.IImplementationSpecification;
|
5
|
+
import org.openscience.cdk.interfaces.IAtomContainer;
|
6
|
+
import org.openscience.cdk.io.iterator.IteratingSDFReader;
|
6
7
|
import org.openscience.cdk.qsar.*;
|
7
|
-
import org.openscience.cdk.qsar.DescriptorValue;
|
8
8
|
import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector;
|
9
9
|
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
|
10
10
|
import org.openscience.cdk.exception.NoSuchAtomTypeException;
|
@@ -17,8 +17,8 @@ class CdkDescriptors {
|
|
17
17
|
System.exit(1);
|
18
18
|
}
|
19
19
|
if (! new File(args[0]).exists()){
|
20
|
-
|
21
|
-
|
20
|
+
System.err.println("file not found "+args[0]);
|
21
|
+
System.exit(1);
|
22
22
|
}
|
23
23
|
|
24
24
|
// command line descriptor params can be either "descriptorName" or "descriptorValueName"
|
@@ -34,19 +34,19 @@ class CdkDescriptors {
|
|
34
34
|
for (int i =1; i < args.length; i++) {
|
35
35
|
String descriptorName;
|
36
36
|
if (args[i].indexOf(".")!=-1) {
|
37
|
-
|
38
|
-
|
37
|
+
descriptorValueNames.add(args[i]);
|
38
|
+
descriptorName = args[i].substring(0,args[i].indexOf("."));
|
39
39
|
}
|
40
40
|
else {
|
41
|
-
|
42
|
-
|
41
|
+
descriptorNames.add(args[i]);
|
42
|
+
descriptorName = args[i];
|
43
43
|
}
|
44
44
|
classNames.add(getDescriptorClassName(descriptorName));
|
45
45
|
}
|
46
46
|
|
47
|
-
engine = new DescriptorEngine(new ArrayList<String>(classNames));
|
47
|
+
engine = new DescriptorEngine(new ArrayList<String>(classNames),null);
|
48
48
|
List<IDescriptor> instances = engine.instantiateDescriptors(new ArrayList<String>(classNames));
|
49
|
-
List<
|
49
|
+
List<IImplementationSpecification> specs = engine.initializeSpecifications(instances);
|
50
50
|
engine.setDescriptorInstances(instances);
|
51
51
|
engine.setDescriptorSpecifications(specs);
|
52
52
|
|
@@ -54,13 +54,13 @@ class CdkDescriptors {
|
|
54
54
|
BufferedReader br = new BufferedReader(new FileReader(args[0]));
|
55
55
|
PrintWriter yaml = new PrintWriter(new FileWriter(args[0]+"cdk.yaml"));
|
56
56
|
// parse 3d sdf from file and calculate descriptors
|
57
|
-
|
57
|
+
IteratingSDFReader reader = new IteratingSDFReader( br, DefaultChemObjectBuilder.getInstance());
|
58
58
|
int c = 0;
|
59
59
|
while (reader.hasNext()) {
|
60
60
|
try {
|
61
61
|
System.out.println("computing "+(args.length-1)+" descriptors for compound "+(++c));
|
62
|
-
|
63
|
-
molecule = (
|
62
|
+
IAtomContainer molecule = (IAtomContainer)reader.next();
|
63
|
+
molecule = (IAtomContainer) AtomContainerManipulator.removeHydrogens(molecule);
|
64
64
|
try {
|
65
65
|
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule);
|
66
66
|
}
|
@@ -110,21 +110,21 @@ class CdkDescriptors {
|
|
110
110
|
* problem: Descriptor is not always at the end of the class (APolDescriptor), but may be in the middle (AutocorrelationDescriptorPolarizability)
|
111
111
|
* this method makes a class-lookup using trial and error */
|
112
112
|
static String getDescriptorClassName(String descriptorName) {
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
}
|
113
|
+
String split = splitCamelCase(descriptorName)+" "; // space mark possible positions for 'Descriptor'
|
114
|
+
for(int i = split.length()-1; i>0; i--) {
|
115
|
+
if (split.charAt(i)==' ') { // iterate over all spaces, starting with the trailing one
|
116
|
+
String test = split.substring(0,i)+"Descriptor"+split.substring(i+1,split.length()); // replace current space with 'Descriptor' ..
|
117
|
+
test = test.replaceAll("\\s",""); // .. and remove other spaces
|
118
|
+
String className = "org.openscience.cdk.qsar.descriptors.molecular." + test;
|
119
|
+
try {
|
120
|
+
Class.forName(className);
|
121
|
+
return className;
|
122
|
+
} catch (ClassNotFoundException e) {}
|
124
123
|
}
|
125
|
-
|
126
|
-
|
127
|
-
|
124
|
+
}
|
125
|
+
System.err.println("Descriptor not found: "+descriptorName);
|
126
|
+
System.exit(1);
|
127
|
+
return null;
|
128
128
|
}
|
129
129
|
|
130
130
|
/** inserts space in between camel words */
|
data/java/Rakefile
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Java class, classpath
|
2
2
|
java_classes = [
|
3
|
-
["CdkDescriptors", "cdk-
|
4
|
-
["CdkDescriptorInfo", "cdk-
|
3
|
+
["CdkDescriptors", "cdk-2.0-SNAPSHOT.jar"],
|
4
|
+
["CdkDescriptorInfo", "cdk-2.0-SNAPSHOT.jar"],
|
5
5
|
["JoelibDescriptors", "joelib2.jar:."],
|
6
6
|
["JoelibDescriptorInfo", "joelib2.jar:."],
|
7
7
|
]
|
@@ -10,6 +10,6 @@ task :default => java_classes.collect{|c| "#{c.first}.class"}
|
|
10
10
|
|
11
11
|
java_classes.each do |c|
|
12
12
|
file "#{c.first}.class" => "#{c.first}.java" do
|
13
|
-
puts `javac -classpath #{c.last} #{c.first}.java`
|
13
|
+
puts `javac -Xlint:deprecation -classpath #{c.last} #{c.first}.java`
|
14
14
|
end
|
15
15
|
end
|
Binary file
|
data/lazar.gemspec
CHANGED
@@ -18,11 +18,10 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.require_paths = ["lib"]
|
19
19
|
|
20
20
|
# specify any dependencies here; for example:
|
21
|
-
s.add_runtime_dependency 'bundler'
|
22
|
-
s.add_runtime_dependency 'rest-client'
|
23
|
-
s.add_runtime_dependency 'nokogiri'
|
24
|
-
s.add_runtime_dependency 'rserve-client'
|
25
|
-
s.add_runtime_dependency 'mongoid'
|
26
|
-
s.add_runtime_dependency 'openbabel'
|
27
|
-
|
21
|
+
s.add_runtime_dependency 'bundler'
|
22
|
+
s.add_runtime_dependency 'rest-client'
|
23
|
+
s.add_runtime_dependency 'nokogiri'
|
24
|
+
s.add_runtime_dependency 'rserve-client'
|
25
|
+
s.add_runtime_dependency 'mongoid'
|
26
|
+
s.add_runtime_dependency 'openbabel'
|
28
27
|
end
|
data/lib/algorithm.rb
CHANGED
@@ -2,18 +2,9 @@ module OpenTox
|
|
2
2
|
|
3
3
|
module Algorithm
|
4
4
|
|
5
|
-
|
6
|
-
# Algorithms should:
|
7
|
-
# - accept a Compound, an Array of Compounds or a Dataset as first argument
|
8
|
-
# - optional parameters as second argument
|
9
|
-
# - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values
|
10
|
-
# @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object
|
11
|
-
# @param [Hash] Algorithm parameters
|
12
|
-
# @return Algorithm result
|
13
|
-
def self.run algorithm, object, parameters=nil
|
14
|
-
bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
|
5
|
+
def self.run algorithm, parameters=nil
|
15
6
|
klass,method = algorithm.split('.')
|
16
|
-
|
7
|
+
Object.const_get(klass).send(method,parameters)
|
17
8
|
end
|
18
9
|
|
19
10
|
end
|
data/lib/caret.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
module OpenTox
|
2
|
+
module Algorithm
|
3
|
+
|
4
|
+
class Caret
|
5
|
+
# model list: https://topepo.github.io/caret/modelList.html
|
6
|
+
|
7
|
+
def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables:
|
8
|
+
remove = []
|
9
|
+
# remove independent_variables with single values
|
10
|
+
independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1}
|
11
|
+
remove.sort.reverse.each do |i|
|
12
|
+
independent_variables.delete_at i
|
13
|
+
query_variables.delete_at i
|
14
|
+
end
|
15
|
+
if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
|
16
|
+
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
17
|
+
prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
|
18
|
+
elsif
|
19
|
+
dependent_variables.size < 3
|
20
|
+
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
21
|
+
prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
|
22
|
+
|
23
|
+
else
|
24
|
+
dependent_variables.each_with_index do |v,i|
|
25
|
+
dependent_variables[i] = to_r(v)
|
26
|
+
end
|
27
|
+
independent_variables.each_with_index do |c,i|
|
28
|
+
c.each_with_index do |v,j|
|
29
|
+
independent_variables[i][j] = to_r(v)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
query_variables.each_with_index do |v,i|
|
33
|
+
query_variables[i] = to_r(v)
|
34
|
+
end
|
35
|
+
begin
|
36
|
+
R.assign "weights", weights
|
37
|
+
r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})"
|
38
|
+
R.eval "data <- #{r_data_frame}"
|
39
|
+
R.assign "features", (0..independent_variables.size-1).to_a
|
40
|
+
R.eval "names(data) <- append(c('activities'),features)" #
|
41
|
+
R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
|
42
|
+
rescue => e
|
43
|
+
$logger.debug "R caret model creation error for:"
|
44
|
+
$logger.debug dependent_variables
|
45
|
+
$logger.debug independent_variables
|
46
|
+
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
47
|
+
prediction[:warning] = "R caret model creation error. Using weighted average of similar substances."
|
48
|
+
return prediction
|
49
|
+
end
|
50
|
+
begin
|
51
|
+
R.eval "query <- data.frame(rbind(c(#{query_variables.join ','})))"
|
52
|
+
R.eval "names(query) <- features"
|
53
|
+
R.eval "prediction <- predict(model,query)"
|
54
|
+
value = R.eval("prediction").to_f
|
55
|
+
rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
|
56
|
+
r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
|
57
|
+
prediction_interval = value-1.96*rmse, value+1.96*rmse
|
58
|
+
prediction = {
|
59
|
+
:value => value,
|
60
|
+
:rmse => rmse,
|
61
|
+
:r_squared => r_squared,
|
62
|
+
:prediction_interval => prediction_interval
|
63
|
+
}
|
64
|
+
rescue => e
|
65
|
+
$logger.debug "R caret prediction error for:"
|
66
|
+
$logger.debug self.inspect
|
67
|
+
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
68
|
+
prediction[:warning] = "R caret prediction error. Using weighted average of similar substances"
|
69
|
+
return prediction
|
70
|
+
end
|
71
|
+
if prediction.nil? or prediction[:value].nil?
|
72
|
+
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
73
|
+
prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
|
74
|
+
end
|
75
|
+
end
|
76
|
+
prediction
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
# call caret methods dynamically, e.g. Caret.pls
|
81
|
+
def self.method_missing(sym, *args, &block)
|
82
|
+
args.first[:method] = sym.to_s
|
83
|
+
self.create_model_and_predict args.first
|
84
|
+
end
|
85
|
+
|
86
|
+
def self.to_r v
|
87
|
+
return "F" if v == false
|
88
|
+
return "T" if v == true
|
89
|
+
return nil if v.is_a? Float and v.nan?
|
90
|
+
v
|
91
|
+
end
|
92
|
+
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
data/lib/classification.rb
CHANGED
@@ -3,32 +3,24 @@ module OpenTox
|
|
3
3
|
|
4
4
|
class Classification
|
5
5
|
|
6
|
-
def self.weighted_majority_vote
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
neighbors.each do |row|
|
12
|
-
sim = row["tanimoto"]
|
13
|
-
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
14
|
-
weighted_sum[act] ||= 0
|
15
|
-
weighted_sum[act] += sim
|
16
|
-
end
|
6
|
+
def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:
|
7
|
+
class_weights = {}
|
8
|
+
dependent_variables.each_with_index do |v,i|
|
9
|
+
class_weights[v] ||= []
|
10
|
+
class_weights[v] << weights[i] unless v.nil?
|
17
11
|
end
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
when 2
|
22
|
-
sim_sum = weighted_sum[weighted_sum.keys[0]]
|
23
|
-
sim_sum -= weighted_sum[weighted_sum.keys[1]]
|
24
|
-
sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
|
25
|
-
confidence = (sim_sum/neighbors.size).abs
|
26
|
-
return {:value => prediction,:confidence => confidence}
|
27
|
-
else
|
28
|
-
bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
|
12
|
+
probabilities = {}
|
13
|
+
class_weights.each do |a,w|
|
14
|
+
probabilities[a] = w.sum/weights.sum
|
29
15
|
end
|
16
|
+
probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
|
17
|
+
p_max = probabilities.collect{|a,p| p}.max
|
18
|
+
prediction = probabilities.key(p_max)
|
19
|
+
{:value => prediction,:probabilities => probabilities}
|
30
20
|
end
|
21
|
+
|
31
22
|
end
|
23
|
+
|
32
24
|
end
|
33
25
|
end
|
34
26
|
|
data/lib/compound.rb
CHANGED
@@ -1,11 +1,9 @@
|
|
1
|
-
CACTUS_URI="
|
1
|
+
CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/"
|
2
2
|
|
3
3
|
module OpenTox
|
4
4
|
|
5
|
-
class Compound
|
5
|
+
class Compound < Substance
|
6
6
|
require_relative "unique_descriptors.rb"
|
7
|
-
include OpenTox
|
8
|
-
|
9
7
|
DEFAULT_FINGERPRINT = "MP2D"
|
10
8
|
|
11
9
|
field :inchi, type: String
|
@@ -19,9 +17,6 @@ module OpenTox
|
|
19
17
|
field :sdf_id, type: BSON::ObjectId
|
20
18
|
field :fingerprints, type: Hash, default: {}
|
21
19
|
field :default_fingerprint_size, type: Integer
|
22
|
-
field :physchem_descriptors, type: Hash, default: {}
|
23
|
-
field :dataset_ids, type: Array, default: []
|
24
|
-
field :features, type: Hash, default: {}
|
25
20
|
|
26
21
|
index({smiles: 1}, {unique: true})
|
27
22
|
|
@@ -80,9 +75,8 @@ module OpenTox
|
|
80
75
|
fingerprints[type]
|
81
76
|
end
|
82
77
|
|
83
|
-
def
|
84
|
-
|
85
|
-
calculated_ids = physchem_descriptors.keys
|
78
|
+
def calculate_properties descriptors=PhysChem::OPENBABEL
|
79
|
+
calculated_ids = properties.keys
|
86
80
|
# BSON::ObjectId instances are not allowed as keys in a BSON document.
|
87
81
|
new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
|
88
82
|
descs = {}
|
@@ -95,11 +89,11 @@ module OpenTox
|
|
95
89
|
# avoid recalculating Cdk features with multiple values
|
96
90
|
descs.keys.uniq.each do |k|
|
97
91
|
descs[k].send(k[0].downcase,k[1],self).each do |n,v|
|
98
|
-
|
92
|
+
properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
|
99
93
|
end
|
100
94
|
end
|
101
95
|
save
|
102
|
-
|
96
|
+
descriptors.collect{|d| properties[d.id.to_s]}
|
103
97
|
end
|
104
98
|
|
105
99
|
def smarts_match smarts, count=false
|
@@ -142,9 +136,6 @@ module OpenTox
|
|
142
136
|
# @param inchi [String] smiles InChI string
|
143
137
|
# @return [OpenTox::Compound] Compound
|
144
138
|
def self.from_inchi inchi
|
145
|
-
# Temporary workaround for OpenBabels Inchi bug
|
146
|
-
# http://sourceforge.net/p/openbabel/bugs/957/
|
147
|
-
# bug has not been fixed in latest git/development version
|
148
139
|
#smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
|
149
140
|
smiles = obconversion(inchi,"inchi","can")
|
150
141
|
if smiles.empty?
|
@@ -246,7 +237,7 @@ module OpenTox
|
|
246
237
|
|
247
238
|
# @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem
|
248
239
|
def cid
|
249
|
-
pug_uri = "
|
240
|
+
pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
|
250
241
|
update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
|
251
242
|
self["cid"]
|
252
243
|
end
|
@@ -254,70 +245,13 @@ module OpenTox
|
|
254
245
|
# @return [String] ChEMBL database compound id, derieved via restcall to chembl
|
255
246
|
def chemblid
|
256
247
|
# https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
|
257
|
-
uri = "
|
248
|
+
uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
|
258
249
|
update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"]
|
259
250
|
self["chemblid"]
|
260
251
|
end
|
261
252
|
|
262
|
-
def
|
263
|
-
#
|
264
|
-
neighbors = []
|
265
|
-
query_fingerprint = self.fingerprint params[:type]
|
266
|
-
training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
|
267
|
-
unless self == compound
|
268
|
-
candidate_fingerprint = compound.fingerprint params[:type]
|
269
|
-
features = (query_fingerprint + candidate_fingerprint).uniq
|
270
|
-
min_sum = 0
|
271
|
-
max_sum = 0
|
272
|
-
features.each do |f|
|
273
|
-
min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
|
274
|
-
min_sum += min
|
275
|
-
max_sum += max
|
276
|
-
end
|
277
|
-
max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
|
278
|
-
neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
|
279
|
-
end
|
280
|
-
end
|
281
|
-
neighbors.sort{|a,b| b.last <=> a.last}
|
282
|
-
end
|
283
|
-
|
284
|
-
def fingerprint_neighbors params
|
285
|
-
bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
|
286
|
-
neighbors = []
|
287
|
-
if params[:type] == DEFAULT_FINGERPRINT
|
288
|
-
neighbors = db_neighbors params
|
289
|
-
else
|
290
|
-
query_fingerprint = self.fingerprint params[:type]
|
291
|
-
training_dataset = Dataset.find(params[:training_dataset_id])
|
292
|
-
prediction_feature = training_dataset.features.first
|
293
|
-
training_dataset.compounds.each do |compound|
|
294
|
-
candidate_fingerprint = compound.fingerprint params[:type]
|
295
|
-
sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
|
296
|
-
feature_values = training_dataset.values(compound,prediction_feature)
|
297
|
-
neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
|
298
|
-
end
|
299
|
-
neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
|
300
|
-
end
|
301
|
-
neighbors
|
302
|
-
end
|
303
|
-
|
304
|
-
def physchem_neighbors params
|
305
|
-
feature_dataset = Dataset.find params[:feature_dataset_id]
|
306
|
-
query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
|
307
|
-
neighbors = []
|
308
|
-
feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
|
309
|
-
# TODO implement pearson and cosine similarity separatly
|
310
|
-
R.assign "x", query_fingerprint
|
311
|
-
R.assign "y", candidate_fingerprint
|
312
|
-
sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
|
313
|
-
if sim >= params[:min_sim]
|
314
|
-
neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
|
315
|
-
end
|
316
|
-
end
|
317
|
-
neighbors
|
318
|
-
end
|
319
|
-
|
320
|
-
def db_neighbors params
|
253
|
+
def db_neighbors min_sim: 0.1, dataset_id:
|
254
|
+
#p fingerprints[DEFAULT_FINGERPRINT]
|
321
255
|
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
|
322
256
|
|
323
257
|
#qn = default_fingerprint_size
|
@@ -329,31 +263,31 @@ module OpenTox
|
|
329
263
|
#{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
|
330
264
|
#{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
|
331
265
|
{'$project' => {
|
332
|
-
'
|
266
|
+
'similarity' => {'$let' => {
|
333
267
|
'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
|
334
|
-
#'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
|
335
268
|
'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
|
336
269
|
}},
|
337
270
|
'_id' => 1,
|
338
|
-
'
|
271
|
+
#'measurements' => 1,
|
339
272
|
'dataset_ids' => 1
|
340
273
|
}},
|
341
|
-
{'$match' => {'
|
342
|
-
{'$sort' => {'
|
274
|
+
{'$match' => {'similarity' => {'$gte' => min_sim}}},
|
275
|
+
{'$sort' => {'similarity' => -1}}
|
343
276
|
]
|
344
|
-
|
345
|
-
|
277
|
+
|
278
|
+
# TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array
|
279
|
+
$mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id}
|
346
280
|
|
347
281
|
end
|
348
282
|
|
349
|
-
# Convert
|
283
|
+
# Convert mmol to mg
|
350
284
|
# @return [Float] value in mg
|
351
285
|
def mmol_to_mg mmol
|
352
286
|
mmol.to_f*molecular_weight
|
353
287
|
end
|
354
288
|
|
355
|
-
# Convert
|
356
|
-
# @return [Float] value in
|
289
|
+
# Convert mg to mmol
|
290
|
+
# @return [Float] value in mmol
|
357
291
|
def mg_to_mmol mg
|
358
292
|
mg.to_f/molecular_weight
|
359
293
|
end
|
@@ -362,7 +296,7 @@ module OpenTox
|
|
362
296
|
# @return [Float] molecular weight
|
363
297
|
def molecular_weight
|
364
298
|
mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
|
365
|
-
|
299
|
+
calculate_properties([mw_feature]).first
|
366
300
|
end
|
367
301
|
|
368
302
|
private
|