lazar 0.9.3 → 1.0.0
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-metadata.gz:
-data.tar.gz:
+metadata.gz: 2211d5cf1767b241583acff9a22379b56a5d8f1c
+data.tar.gz: 923a3d00d5c78fd77a2153c973c5e3935c939eda
 SHA512:
-metadata.gz:
-data.tar.gz:
+metadata.gz: 2a366bae505c427a72211df4d59c7f296ead656bfe3f42db0fb6bb2dc3885028c70ba9df0aa7778c0bd78acdbd7b2939417caafd342a535c4954a34fef410c8d
+data.tar.gz: 04fd93e7ab52517d338e6005223fe22b498d74be324f8dc6ef2e3a4d4a843202abc9224ff55e8ba053ce7a16a6a76301437f4fc061ac2719d65ff3afa392396a
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -6,31 +6,21 @@ Ruby libraries for the lazar framework
 Dependencies
 ------------
 
-lazar depends on a couple of external programs and libraries.
-
-
-
-You will also need at least mongodb version 3.0, but Debian "Wheezy" provides version 2.4. Please follow the instructions at http://docs.mongodb.org/manual/tutorial/install-mongodb-on-debian/:
-
-```
-sudo apt-key adv --keyserver keyserver.ubuntu.com --recv 7F0CEB10
-echo "deb http://repo.mongodb.org/apt/debian wheezy/mongodb-org/3.0 main" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list
-sudo apt-get update
-sudo apt-get install -y mongodb-org
-```
+lazar depends on a couple of external programs and libraries. All required libraries will be installed with the `gem install lazar` command.
+If any of the dependencies fails to install, please check if all required development packages are installed from your operating systems package manager (e.g. `apt`, `rpm`, `pacman`, ...).
+You will need a working Java runtime to use descriptor calculation algorithms from CDK and JOELib libraries.
 
 Installation
 ------------
 
 `gem install lazar`
 
-Please be patient, the compilation of
+Please be patient, the compilation of external libraries can be very time consuming. If installation fails you can try to install manually:
 
 ```
 git clone https://github.com/opentox/lazar.git
 cd lazar
 ruby ext/lazar/extconf.rb
-sudo Rscript ext/lazar/rinstall.R
 bundle install
 ```
 
@@ -42,4 +32,4 @@ Documentation
 
 Copyright
 ---------
-Copyright (c) 2009-
+Copyright (c) 2009-2016 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
data/VERSION
CHANGED
@@ -1 +1 @@
-0.9.3
+1.0.0
data/ext/lazar/extconf.rb
CHANGED
@@ -15,7 +15,7 @@ abort "Please install Rserve on your system. Execute 'install.packages('Rserve')
 # install R packages
 r_dir = File.join main_dir, "R"
 FileUtils.mkdir_p r_dir
-FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
+#FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
 rinstall = File.expand_path(File.join(File.dirname(__FILE__),"rinstall.R"))
 puts `Rscript --vanilla #{rinstall} #{r_dir}`
 
data/ext/lazar/rinstall.R
CHANGED
@@ -1,10 +1,12 @@
 libdir = commandArgs(trailingOnly=TRUE)[1]
 repo = "https://stat.ethz.ch/CRAN/"
 #install.packages("Rserve",lib=libdir,repos=repo,dependencies=TRUE)
-install.packages("
-install.packages("
-install.packages("
-install.packages("
-install.packages("
-install.packages("
-install.packages("
+install.packages("stringi",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("iterators",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("foreach",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("gridExtra",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("ggplot2",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("pls",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("randomForest",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("caret",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("doMC",lib=libdir,repos=repo,dependencies=TRUE);
data/java/CdkDescriptorInfo.class
CHANGED
Binary file
data/java/CdkDescriptorInfo.java
CHANGED
@@ -1,11 +1,12 @@
 import java.util.*;
-import org.openscience.cdk.
+import org.openscience.cdk.DefaultChemObjectBuilder;
 import org.openscience.cdk.qsar.*;
+//import org.openscience.cdk.qsar.descriptors.molecular.*;
 
 class CdkDescriptorInfo {
 public static void main(String[] args) {
 
-DescriptorEngine engine = new DescriptorEngine(
+DescriptorEngine engine = new DescriptorEngine(IMolecularDescriptor.class,null);
 
 for (Iterator<IDescriptor> it = engine.getDescriptorInstances().iterator(); it.hasNext(); ) {
 IDescriptor descriptor = it.next();
data/java/CdkDescriptors.class
CHANGED
Binary file
data/java/CdkDescriptors.java
CHANGED
@@ -1,10 +1,10 @@
 import java.util.*;
 import java.io.*;
 import org.openscience.cdk.DefaultChemObjectBuilder;
-import org.openscience.cdk.
-import org.openscience.cdk.
+import org.openscience.cdk.IImplementationSpecification;
+import org.openscience.cdk.interfaces.IAtomContainer;
+import org.openscience.cdk.io.iterator.IteratingSDFReader;
 import org.openscience.cdk.qsar.*;
-import org.openscience.cdk.qsar.DescriptorValue;
 import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector;
 import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
 import org.openscience.cdk.exception.NoSuchAtomTypeException;
@@ -17,8 +17,8 @@ class CdkDescriptors {
 System.exit(1);
 }
 if (! new File(args[0]).exists()){
-
-
+System.err.println("file not found "+args[0]);
+System.exit(1);
 }
 
 // command line descriptor params can be either "descriptorName" or "descriptorValueName"
@@ -34,19 +34,19 @@ class CdkDescriptors {
 for (int i =1; i < args.length; i++) {
 String descriptorName;
 if (args[i].indexOf(".")!=-1) {
-
-
+descriptorValueNames.add(args[i]);
+descriptorName = args[i].substring(0,args[i].indexOf("."));
 }
 else {
-
-
+descriptorNames.add(args[i]);
+descriptorName = args[i];
 }
 classNames.add(getDescriptorClassName(descriptorName));
 }
 
-engine = new DescriptorEngine(new ArrayList<String>(classNames));
+engine = new DescriptorEngine(new ArrayList<String>(classNames),null);
 List<IDescriptor> instances = engine.instantiateDescriptors(new ArrayList<String>(classNames));
-List<
+List<IImplementationSpecification> specs = engine.initializeSpecifications(instances);
 engine.setDescriptorInstances(instances);
 engine.setDescriptorSpecifications(specs);
 
@@ -54,13 +54,13 @@ class CdkDescriptors {
 BufferedReader br = new BufferedReader(new FileReader(args[0]));
 PrintWriter yaml = new PrintWriter(new FileWriter(args[0]+"cdk.yaml"));
 // parse 3d sdf from file and calculate descriptors
-
+IteratingSDFReader reader = new IteratingSDFReader( br, DefaultChemObjectBuilder.getInstance());
 int c = 0;
 while (reader.hasNext()) {
 try {
 System.out.println("computing "+(args.length-1)+" descriptors for compound "+(++c));
-
-molecule = (
+IAtomContainer molecule = (IAtomContainer)reader.next();
+molecule = (IAtomContainer) AtomContainerManipulator.removeHydrogens(molecule);
 try {
 AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule);
 }
@@ -110,21 +110,21 @@ class CdkDescriptors {
 * problem: Descriptor is not always at the end of the class (APolDescriptor), but may be in the middle (AutocorrelationDescriptorPolarizability)
 * this method makes a class-lookup using trial and error */
 static String getDescriptorClassName(String descriptorName) {
-
-
-
-
-
-
-
-
-
-
-}
+String split = splitCamelCase(descriptorName)+" "; // space mark possible positions for 'Descriptor'
+for(int i = split.length()-1; i>0; i--) {
+if (split.charAt(i)==' ') { // iterate over all spaces, starting with the trailing one
+String test = split.substring(0,i)+"Descriptor"+split.substring(i+1,split.length()); // replace current space with 'Descriptor' ..
+test = test.replaceAll("\\s",""); // .. and remove other spaces
+String className = "org.openscience.cdk.qsar.descriptors.molecular." + test;
+try {
+Class.forName(className);
+return className;
+} catch (ClassNotFoundException e) {}
 }
-
-
-
+}
+System.err.println("Descriptor not found: "+descriptorName);
+System.exit(1);
+return null;
 }
 
 /** inserts space in between camel words */
data/java/Rakefile
CHANGED
@@ -1,7 +1,7 @@
 # Java class, classpath
 java_classes = [
-["CdkDescriptors", "cdk-
-["CdkDescriptorInfo", "cdk-
+["CdkDescriptors", "cdk-2.0-SNAPSHOT.jar"],
+["CdkDescriptorInfo", "cdk-2.0-SNAPSHOT.jar"],
 ["JoelibDescriptors", "joelib2.jar:."],
 ["JoelibDescriptorInfo", "joelib2.jar:."],
 ]
@@ -10,6 +10,6 @@ task :default => java_classes.collect{|c| "#{c.first}.class"}
 
 java_classes.each do |c|
 file "#{c.first}.class" => "#{c.first}.java" do
-puts `javac -classpath #{c.last} #{c.first}.java`
+puts `javac -Xlint:deprecation -classpath #{c.last} #{c.first}.java`
 end
 end
data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar}
Binary file
data/lazar.gemspec
CHANGED
@@ -18,11 +18,10 @@ Gem::Specification.new do |s|
 s.require_paths = ["lib"]
 
 # specify any dependencies here; for example:
-s.add_runtime_dependency 'bundler'
-s.add_runtime_dependency 'rest-client'
-s.add_runtime_dependency 'nokogiri'
-s.add_runtime_dependency 'rserve-client'
-s.add_runtime_dependency 'mongoid'
-s.add_runtime_dependency 'openbabel'
-
+s.add_runtime_dependency 'bundler'
+s.add_runtime_dependency 'rest-client'
+s.add_runtime_dependency 'nokogiri'
+s.add_runtime_dependency 'rserve-client'
+s.add_runtime_dependency 'mongoid'
+s.add_runtime_dependency 'openbabel'
 end
data/lib/algorithm.rb
CHANGED
@@ -2,18 +2,9 @@ module OpenTox
 
 module Algorithm
 
-
-# Algorithms should:
-# - accept a Compound, an Array of Compounds or a Dataset as first argument
-# - optional parameters as second argument
-# - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values
-# @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object
-# @param [Hash] Algorithm parameters
-# @return Algorithm result
-def self.run algorithm, object, parameters=nil
-bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
+def self.run algorithm, parameters=nil
 klass,method = algorithm.split('.')
-
+Object.const_get(klass).send(method,parameters)
 end
 
 end
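The new `Algorithm.run` drops the input-type checks and reduces dispatch to a constant lookup plus `send` on a "Class.method" string. A minimal sketch of that dispatch in plain Ruby (the `Example` class and its `noop` method are hypothetical, used only to illustrate the mechanism):

```ruby
module OpenTox
  module Algorithm

    # Hypothetical algorithm class, only here to give the dispatcher something to call.
    class Example
      def self.noop parameters
        parameters
      end
    end

    # Same dispatch as in the rewritten algorithm.rb above.
    def self.run algorithm, parameters=nil
      klass,method = algorithm.split('.')            # e.g. "OpenTox::Algorithm::Example.noop"
      Object.const_get(klass).send(method,parameters)
    end

  end
end

p OpenTox::Algorithm.run("OpenTox::Algorithm::Example.noop", {:min_sim => 0.1})
# => {:min_sim=>0.1}
```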
data/lib/caret.rb
ADDED
@@ -0,0 +1,96 @@
+module OpenTox
+module Algorithm
+
+class Caret
+# model list: https://topepo.github.io/caret/modelList.html
+
+def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables:
+remove = []
+# remove independent_variables with single values
+independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1}
+remove.sort.reverse.each do |i|
+independent_variables.delete_at i
+query_variables.delete_at i
+end
+if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
+prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+elsif
+dependent_variables.size < 3
+prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
+
+else
+dependent_variables.each_with_index do |v,i|
+dependent_variables[i] = to_r(v)
+end
+independent_variables.each_with_index do |c,i|
+c.each_with_index do |v,j|
+independent_variables[i][j] = to_r(v)
+end
+end
+query_variables.each_with_index do |v,i|
+query_variables[i] = to_r(v)
+end
+begin
+R.assign "weights", weights
+r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+R.eval "data <- #{r_data_frame}"
+R.assign "features", (0..independent_variables.size-1).to_a
+R.eval "names(data) <- append(c('activities'),features)" #
+R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
+rescue => e
+$logger.debug "R caret model creation error for:"
+$logger.debug dependent_variables
+$logger.debug independent_variables
+prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+prediction[:warning] = "R caret model creation error. Using weighted average of similar substances."
+return prediction
+end
+begin
+R.eval "query <- data.frame(rbind(c(#{query_variables.join ','})))"
+R.eval "names(query) <- features"
+R.eval "prediction <- predict(model,query)"
+value = R.eval("prediction").to_f
+rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
+r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
+prediction_interval = value-1.96*rmse, value+1.96*rmse
+prediction = {
+:value => value,
+:rmse => rmse,
+:r_squared => r_squared,
+:prediction_interval => prediction_interval
+}
+rescue => e
+$logger.debug "R caret prediction error for:"
+$logger.debug self.inspect
+prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+prediction[:warning] = "R caret prediction error. Using weighted average of similar substances"
+return prediction
+end
+if prediction.nil? or prediction[:value].nil?
+prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
+end
+end
+prediction
+
+end
+
+# call caret methods dynamically, e.g. Caret.pls
+def self.method_missing(sym, *args, &block)
+args.first[:method] = sym.to_s
+self.create_model_and_predict args.first
+end
+
+def self.to_r v
+return "F" if v == false
+return "T" if v == true
+return nil if v.is_a? Float and v.nan?
+v
+end
+
+end
+end
+end
+
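`create_model_and_predict` builds a local caret model via Rserve and falls back to `Algorithm::Regression::weighted_average` when there are no usable variables, fewer than three neighbors, or an R error; `method_missing` maps the Ruby method name onto caret's method string. A sketch of how a caller might use it, with made-up neighbor data (the numbers and the choice of `rf` are illustrative only; an installed lazar gem with a working R/Rserve setup and the packages from rinstall.R is assumed):

```ruby
require "lazar"

# Made-up neighbor data: four neighbors, two descriptors.
dependent_variables   = [1.2, 0.8, 1.5, 1.1]           # measured activities of the neighbors
independent_variables = [[0.10, 0.30, 0.20, 0.40],      # descriptor 1, one value per neighbor
                         [10.0, 12.0, 9.5, 11.0]]       # descriptor 2
weights               = [0.9, 0.7, 0.8, 0.6]            # similarities to the query compound
query_variables       = [0.25, 10.5]                    # descriptor values of the query compound

# method_missing turns the Ruby method name into caret's method string,
# so Caret.rf trains a random forest, Caret.pls a PLS model, etc.
prediction = OpenTox::Algorithm::Caret.rf(
  dependent_variables: dependent_variables,
  independent_variables: independent_variables,
  weights: weights,
  query_variables: query_variables
)
p prediction[:value]
p prediction[:warning]  # set whenever the code fell back to the weighted average
```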
data/lib/classification.rb
CHANGED
@@ -3,32 +3,24 @@ module OpenTox
 
 class Classification
 
-def self.weighted_majority_vote
-
-
-
-
-neighbors.each do |row|
-sim = row["tanimoto"]
-row["features"][params[:prediction_feature_id].to_s].each do |act|
-weighted_sum[act] ||= 0
-weighted_sum[act] += sim
-end
+def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:
+class_weights = {}
+dependent_variables.each_with_index do |v,i|
+class_weights[v] ||= []
+class_weights[v] << weights[i] unless v.nil?
 end
-
-
-
-when 2
-sim_sum = weighted_sum[weighted_sum.keys[0]]
-sim_sum -= weighted_sum[weighted_sum.keys[1]]
-sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
-confidence = (sim_sum/neighbors.size).abs
-return {:value => prediction,:confidence => confidence}
-else
-bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
+probabilities = {}
+class_weights.each do |a,w|
+probabilities[a] = w.sum/weights.sum
 end
+probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
+p_max = probabilities.collect{|a,p| p}.max
+prediction = probabilities.key(p_max)
+{:value => prediction,:probabilities => probabilities}
 end
+
 end
+
 end
 end
 
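The rewritten `weighted_majority_vote` returns class probabilities instead of the old two-class confidence. A small worked example with made-up neighbor activities and similarity weights (assuming an installed lazar gem; `query_variables` belongs to the shared keyword interface but is not used inside this method):

```ruby
require "lazar"

activities = ["active", "inactive", "active", nil]  # nil measurements are skipped
weights    = [0.9, 0.5, 0.4, 0.8]                   # similarities of the neighbors

result = OpenTox::Algorithm::Classification.weighted_majority_vote(
  dependent_variables: activities,
  weights: weights,
  query_variables: []
)
# class_weights: {"active"=>[0.9, 0.4], "inactive"=>[0.5]}
# probabilities: "active"   => 0.9 * 1.3/2.6 = 0.45
#                "inactive" => 0.9 * 0.5/2.6 ≈ 0.173
p result[:value]          # => "active"
p result[:probabilities]
```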
data/lib/compound.rb
CHANGED
@@ -1,11 +1,9 @@
-CACTUS_URI="
+CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/"
 
 module OpenTox
 
-class Compound
+class Compound < Substance
 require_relative "unique_descriptors.rb"
-include OpenTox
-
 DEFAULT_FINGERPRINT = "MP2D"
 
 field :inchi, type: String
@@ -19,9 +17,6 @@ module OpenTox
 field :sdf_id, type: BSON::ObjectId
 field :fingerprints, type: Hash, default: {}
 field :default_fingerprint_size, type: Integer
-field :physchem_descriptors, type: Hash, default: {}
-field :dataset_ids, type: Array, default: []
-field :features, type: Hash, default: {}
 
 index({smiles: 1}, {unique: true})
 
@@ -80,9 +75,8 @@ module OpenTox
 fingerprints[type]
 end
 
-def
-
-calculated_ids = physchem_descriptors.keys
+def calculate_properties descriptors=PhysChem::OPENBABEL
+calculated_ids = properties.keys
 # BSON::ObjectId instances are not allowed as keys in a BSON document.
 new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
 descs = {}
@@ -95,11 +89,11 @@ module OpenTox
 # avoid recalculating Cdk features with multiple values
 descs.keys.uniq.each do |k|
 descs[k].send(k[0].downcase,k[1],self).each do |n,v|
-
+properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
 end
 end
 save
-
+descriptors.collect{|d| properties[d.id.to_s]}
 end
 
 def smarts_match smarts, count=false
@@ -142,9 +136,6 @@ module OpenTox
 # @param inchi [String] smiles InChI string
 # @return [OpenTox::Compound] Compound
 def self.from_inchi inchi
-# Temporary workaround for OpenBabels Inchi bug
-# http://sourceforge.net/p/openbabel/bugs/957/
-# bug has not been fixed in latest git/development version
 #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
 smiles = obconversion(inchi,"inchi","can")
 if smiles.empty?
@@ -246,7 +237,7 @@ module OpenTox
 
 # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem
 def cid
-pug_uri = "
+pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
 update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
 self["cid"]
 end
@@ -254,70 +245,13 @@ module OpenTox
 # @return [String] ChEMBL database compound id, derieved via restcall to chembl
 def chemblid
 # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
-uri = "
+uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
 update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"]
 self["chemblid"]
 end
 
-def
-#
-neighbors = []
-query_fingerprint = self.fingerprint params[:type]
-training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
-unless self == compound
-candidate_fingerprint = compound.fingerprint params[:type]
-features = (query_fingerprint + candidate_fingerprint).uniq
-min_sum = 0
-max_sum = 0
-features.each do |f|
-min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
-min_sum += min
-max_sum += max
-end
-max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
-neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
-end
-end
-neighbors.sort{|a,b| b.last <=> a.last}
-end
-
-def fingerprint_neighbors params
-bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
-neighbors = []
-if params[:type] == DEFAULT_FINGERPRINT
-neighbors = db_neighbors params
-else
-query_fingerprint = self.fingerprint params[:type]
-training_dataset = Dataset.find(params[:training_dataset_id])
-prediction_feature = training_dataset.features.first
-training_dataset.compounds.each do |compound|
-candidate_fingerprint = compound.fingerprint params[:type]
-sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
-feature_values = training_dataset.values(compound,prediction_feature)
-neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
-end
-neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
-end
-neighbors
-end
-
-def physchem_neighbors params
-feature_dataset = Dataset.find params[:feature_dataset_id]
-query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
-neighbors = []
-feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
-# TODO implement pearson and cosine similarity separatly
-R.assign "x", query_fingerprint
-R.assign "y", candidate_fingerprint
-sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
-if sim >= params[:min_sim]
-neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
-end
-end
-neighbors
-end
-
-def db_neighbors params
+def db_neighbors min_sim: 0.1, dataset_id:
+#p fingerprints[DEFAULT_FINGERPRINT]
 # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
 
 #qn = default_fingerprint_size
@@ -329,31 +263,31 @@ module OpenTox
 #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
 #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
 {'$project' => {
-'
+'similarity' => {'$let' => {
 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
-#'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
 }},
 '_id' => 1,
-'
+#'measurements' => 1,
 'dataset_ids' => 1
 }},
-{'$match' => {'
-{'$sort' => {'
+{'$match' => {'similarity' => {'$gte' => min_sim}}},
+{'$sort' => {'similarity' => -1}}
 ]
-
-
+
+# TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array
+$mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id}
 
 end
 
-# Convert
+# Convert mmol to mg
 # @return [Float] value in mg
 def mmol_to_mg mmol
 mmol.to_f*molecular_weight
 end
 
-# Convert
-# @return [Float] value in
+# Convert mg to mmol
+# @return [Float] value in mmol
 def mg_to_mmol mg
 mg.to_f/molecular_weight
 end
@@ -362,7 +296,7 @@ module OpenTox
 # @return [Float] molecular weight
 def molecular_weight
 mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
-
+calculate_properties([mw_feature]).first
 end
 
 private
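On the Compound side the `physchem_descriptors`, `dataset_ids` and `features` fields are gone (presumably inherited, as `properties` and friends, from the new Substance class), descriptor calculation is exposed as `calculate_properties`, and `db_neighbors` now takes keyword arguments and filters on a single dataset. A sketch of the renamed calls (benzene is an arbitrary example compound; a local MongoDB instance is assumed, and `training_dataset` stands in for a persisted OpenTox::Dataset):

```ruby
require "lazar"
include OpenTox

compound = Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"  # benzene

# calculate_properties defaults to the OpenBabel descriptor set (PhysChem::OPENBABEL);
# molecular_weight is now implemented on top of it
mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
p compound.calculate_properties([mw_feature]).first  # => ~78.1
p compound.molecular_weight

# db_neighbors: keyword arguments instead of a params hash; the MongoDB aggregation
# result is restricted to substances that occur in the given training dataset
# neighbors = compound.db_neighbors(min_sim: 0.1, dataset_id: training_dataset.id)
```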
|