lazar 0.0.7 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1b22cad0ba1ecef02ff4af283796fcb36cbe758f
|
4
|
+
data.tar.gz: 49bd9a98d7c24ff2b7d1442d58d0b775aaf62e74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 96bd32e2b21abfb827a5cfa10ee520a1c06158876d4fb6238da63b79a785137fcc587aa78f40c8ec03b708e83a520c0cd0192c0795f4df34dbf05ebc21677a3c
|
7
|
+
data.tar.gz: c54ea1804b359da06a32b6c4a8314cc329cd173aa8defbdd7adc63c46230c4c75cf7489beecdb5d3290b6892352778e996cf6828d4ac576ef4082e1ef6c93a46
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -8,7 +8,7 @@ Dependencies
|
|
8
8
|
|
9
9
|
lazar depends on a couple of external programs and libraries. On Debian 7 "Wheezy" systems you can install them with
|
10
10
|
|
11
|
-
`sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev
|
11
|
+
`sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev openjdk-7-jre libgsl0-dev libxml2-dev zlib1g-dev libcairo2-dev`
|
12
12
|
|
13
13
|
You will also need at least mongodb version 3.0, but Debian "Wheezy" provides version 2.4. Please follow the instructions at http://docs.mongodb.org/manual/tutorial/install-mongodb-on-debian/:
|
14
14
|
|
@@ -30,6 +30,7 @@ Installation
|
|
30
30
|
git clone https://github.com/opentox/lazar.git
|
31
31
|
cd lazar
|
32
32
|
ruby ext/lazar/extconf.rb
|
33
|
+
sudo Rscript ext/lazar/rinstall.R
|
33
34
|
bundle install
|
34
35
|
```
|
35
36
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.9
|
data/ext/lazar/extconf.rb
CHANGED
@@ -1,88 +1,27 @@
|
|
1
1
|
require 'fileutils'
|
2
2
|
require 'rbconfig'
|
3
|
+
require 'mkmf'
|
3
4
|
|
4
5
|
main_dir = File.expand_path(File.join(File.dirname(__FILE__),"..",".."))
|
5
6
|
|
6
|
-
#
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
openbabel_dir = File.join main_dir, "openbabel"
|
12
|
-
src_dir = openbabel_dir #File.join openbabel_dir, "openbabel-#{openbabel_version}"
|
13
|
-
build_dir = File.join src_dir, "build"
|
14
|
-
install_dir = openbabel_dir
|
15
|
-
install_lib_dir = File.join install_dir, "lib"
|
16
|
-
lib_dir = File.join openbabel_dir, "lib", "openbabel"
|
17
|
-
ruby_src_dir = File.join src_dir, "scripts", "ruby"
|
18
|
-
|
19
|
-
begin
|
20
|
-
nr_processors = `grep processor /proc/cpuinfo | wc -l` # speed up compilation, Linux only
|
21
|
-
rescue
|
22
|
-
nr_processors = 1
|
7
|
+
# check for required programs
|
8
|
+
programs = ["R","Rscript","mongod","java","getconf"]
|
9
|
+
programs.each do |program|
|
10
|
+
abort "Please install #{program} on your system." unless find_executable program
|
23
11
|
end
|
24
12
|
|
25
|
-
|
26
|
-
Dir.chdir main_dir do
|
27
|
-
FileUtils.rm_rf src_dir
|
28
|
-
puts "Downloading OpenBabel sources"
|
29
|
-
system "git clone https://github.com/openbabel/openbabel.git"
|
30
|
-
end
|
13
|
+
abort "Please install Rserve on your system. Execute 'install.packages('Rserve')' in a R console running as root ('sudo R')." unless `R CMD Rserve --version`.match(/^Rserve/)
|
31
14
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
# http://www.cmake.org/Wiki/CMake_RPATH_handling
|
39
|
-
# http://vtk.1045678.n5.nabble.com/How-to-force-cmake-not-to-remove-install-rpath-td5721193.html
|
40
|
-
cmake += " -DCMAKE_INSTALL_RPATH:STRING=\"#{install_lib_dir}\""
|
41
|
-
system cmake
|
42
|
-
end
|
15
|
+
# install R packages
|
16
|
+
r_dir = File.join main_dir, "R"
|
17
|
+
FileUtils.mkdir_p r_dir
|
18
|
+
FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
|
19
|
+
rinstall = File.expand_path(File.join(File.dirname(__FILE__),"rinstall.R"))
|
20
|
+
puts `Rscript --vanilla #{rinstall} #{r_dir}`
|
43
21
|
|
44
|
-
#
|
45
|
-
|
46
|
-
puts "
|
47
|
-
system "make -j#{nr_processors}"
|
48
|
-
system "make install"
|
49
|
-
ENV["PKG_CONFIG_PATH"] = File.dirname(File.expand_path(Dir["#{install_dir}/**/openbabel*pc"].first))
|
22
|
+
# create a fake Makefile
|
23
|
+
File.open(File.join(File.dirname(__FILE__),"Makefile"),"w+") do |makefile|
|
24
|
+
makefile.puts "all:\n\ttrue\n\ninstall:\n\ttrue\n"
|
50
25
|
end
|
51
26
|
|
52
|
-
ob_include= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/include/openbabel-2.0")
|
53
|
-
ob_lib= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/lib")
|
54
|
-
|
55
|
-
# compile ruby bindings
|
56
|
-
=begin
|
57
|
-
puts "Compiling and installing OpenBabel Ruby bindings."
|
58
|
-
Dir.chdir ruby_src_dir do
|
59
|
-
# fix rpath
|
60
|
-
system "sed -i 's|with_ldflags.*$|with_ldflags(\"#\$LDFLAGS -dynamic -Wl,-rpath,#{install_lib_dir}\") do|' #{File.join(ruby_src_dir,'extconf.rb')}"
|
61
|
-
system "#{RbConfig.ruby} extconf.rb --with-openbabel-include=#{ob_include} --with-openbabel-lib=#{ob_lib}"
|
62
|
-
system "make -j#{nr_processors}"
|
63
|
-
end
|
64
|
-
=end
|
65
|
-
|
66
|
-
# install fminer
|
67
|
-
fminer_dir = File.join main_dir, "libfminer"
|
68
|
-
system "git clone git://github.com/amaunz/fminer2.git #{fminer_dir}"
|
69
|
-
|
70
|
-
["libbbrc","liblast"].each do |lib|
|
71
|
-
FileUtils.cd File.join(fminer_dir,lib)
|
72
|
-
system "sed -i 's,^INCLUDE_OB.*,INCLUDE_OB\ =\ #{ob_include},g' Makefile"
|
73
|
-
system "sed -i 's,^LDFLAGS_OB.*,LDFLAGS_OB\ =\ #{ob_lib},g' Makefile"
|
74
|
-
system "sed -i 's,^INCLUDE_RB.*,INCLUDE_RB\ =\ #{RbConfig::CONFIG['rubyhdrdir']},g' Makefile"
|
75
|
-
# TODO fix in fminer Makefile
|
76
|
-
system "sed -i 's,-g, -g -I #{RbConfig::CONFIG['rubyhdrdir']} -I #{RbConfig::CONFIG['rubyarchhdrdir']} -I,' Makefile" # fix include path (CH)
|
77
|
-
system "sed -i '74s/$(CC)/$(CC) -Wl,-rpath,#{ob_lib.gsub('/','\/')} -L/' Makefile" # fix library path (CH)
|
78
|
-
system "make ruby"
|
79
|
-
end
|
80
|
-
|
81
|
-
# install last-utils
|
82
|
-
FileUtils.cd main_dir
|
83
|
-
system "git clone git://github.com/amaunz/last-utils.git"
|
84
|
-
FileUtils.cd File.join(main_dir,"last-utils")
|
85
|
-
`sed -i '8s/"openbabel", //' lu.rb`
|
86
|
-
|
87
|
-
# install R packagemain_dir
|
88
27
|
$makefile_created = true
|
@@ -0,0 +1,9 @@
|
|
1
|
+
libdir = commandArgs(trailingOnly=TRUE)[1]
|
2
|
+
# chooseCRANmirror(ind=19); does not have any impact on selected server
|
3
|
+
#args=paste0("--prefix=",libdir,"/..")
|
4
|
+
#install.packages("Rserve",lib=libdir,configure.args=args)
|
5
|
+
install.packages("gridExtra",lib=libdir);
|
6
|
+
install.packages("ggplot2",lib=libdir);
|
7
|
+
install.packages("pls",lib=libdir);
|
8
|
+
install.packages("caret",lib=libdir);
|
9
|
+
install.packages("doMC",lib=libdir);
|
data/lazar.gemspec
CHANGED
@@ -9,20 +9,20 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.homepage = "http://github.com/opentox/lazar"
|
10
10
|
s.summary = %q{Lazar framework}
|
11
11
|
s.description = %q{Libraries for lazy structure-activity relationships and read-across.}
|
12
|
-
s.license = 'GPL-3'
|
12
|
+
s.license = 'GPL-3.0'
|
13
13
|
|
14
14
|
s.rubyforge_project = "lazar"
|
15
|
-
|
16
15
|
s.files = `git ls-files`.split("\n")
|
17
16
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
17
|
s.extensions = %w[ext/lazar/extconf.rb]
|
19
18
|
s.require_paths = ["lib"]
|
20
19
|
|
21
20
|
# specify any dependencies here; for example:
|
22
|
-
s.add_runtime_dependency "bundler"
|
23
|
-
s.add_runtime_dependency "rest-client"
|
24
|
-
s.add_runtime_dependency 'nokogiri'
|
25
|
-
s.add_runtime_dependency 'rserve-client'
|
26
|
-
s.add_runtime_dependency "mongoid",
|
21
|
+
s.add_runtime_dependency "bundler", "~> 1.11"
|
22
|
+
s.add_runtime_dependency "rest-client", "~> 1.8"
|
23
|
+
s.add_runtime_dependency 'nokogiri', "~> 1.6"
|
24
|
+
s.add_runtime_dependency 'rserve-client', "~> 0.3"
|
25
|
+
s.add_runtime_dependency "mongoid", "~> 5.0"
|
26
|
+
s.add_runtime_dependency 'openbabel> 2.3.2.2', '~> 0'
|
27
27
|
|
28
28
|
end
|
data/lib/classification.rb
CHANGED
@@ -3,13 +3,14 @@ module OpenTox
|
|
3
3
|
|
4
4
|
class Classification
|
5
5
|
|
6
|
-
def self.weighted_majority_vote
|
7
|
-
|
6
|
+
def self.weighted_majority_vote compound, params
|
7
|
+
neighbors = params[:neighbors]
|
8
8
|
weighted_sum = {}
|
9
9
|
sim_sum = 0.0
|
10
|
+
confidence = 0.0
|
10
11
|
neighbors.each do |row|
|
11
|
-
|
12
|
-
|
12
|
+
sim = row["tanimoto"]
|
13
|
+
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
13
14
|
weighted_sum[act] ||= 0
|
14
15
|
weighted_sum[act] += sim
|
15
16
|
end
|
@@ -27,81 +28,7 @@ module OpenTox
|
|
27
28
|
bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
|
28
29
|
end
|
29
30
|
end
|
30
|
-
|
31
|
-
# Classification with majority vote from neighbors weighted by similarity
|
32
|
-
# @param [Hash] params Keys `:activities, :sims, :value_map` are required
|
33
|
-
# @return [Numeric] A prediction value.
|
34
|
-
def self.fminer_weighted_majority_vote neighbors, training_dataset
|
35
|
-
|
36
|
-
neighbor_contribution = 0.0
|
37
|
-
confidence_sum = 0.0
|
38
|
-
|
39
|
-
$logger.debug "Weighted Majority Vote Classification."
|
40
|
-
|
41
|
-
values = neighbors.collect{|n| n[2]}.uniq
|
42
|
-
neighbors.each do |neighbor|
|
43
|
-
i = training_dataset.compound_ids.index n.id
|
44
|
-
neighbor_weight = neighbor[1]
|
45
|
-
activity = values.index(neighbor[2]) + 1 # map values to integers > 1
|
46
|
-
neighbor_contribution += activity * neighbor_weight
|
47
|
-
if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
|
48
|
-
case activity
|
49
|
-
when 1
|
50
|
-
confidence_sum -= neighbor_weight
|
51
|
-
when 2
|
52
|
-
confidence_sum += neighbor_weight
|
53
|
-
end
|
54
|
-
else
|
55
|
-
confidence_sum += neighbor_weight
|
56
|
-
end
|
57
|
-
end
|
58
|
-
if values.size == 2
|
59
|
-
if confidence_sum >= 0.0
|
60
|
-
prediction = values[1]
|
61
|
-
elsif confidence_sum < 0.0
|
62
|
-
prediction = values[0]
|
63
|
-
end
|
64
|
-
elsif values.size == 1 # all neighbors have the same value
|
65
|
-
prediction = values[0]
|
66
|
-
else
|
67
|
-
prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
|
68
|
-
end
|
69
|
-
|
70
|
-
confidence = (confidence_sum/neighbors.size).abs
|
71
|
-
{:value => prediction, :confidence => confidence.abs}
|
72
|
-
end
|
73
|
-
|
74
|
-
# Local support vector regression from neighbors
|
75
|
-
# @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
|
76
|
-
# @return [Numeric] A prediction value.
|
77
|
-
def self.local_svm_classification(params)
|
78
|
-
|
79
|
-
confidence = 0.0
|
80
|
-
prediction = nil
|
81
|
-
|
82
|
-
$logger.debug "Local SVM."
|
83
|
-
if params[:activities].size>0
|
84
|
-
if params[:props]
|
85
|
-
n_prop = params[:props][0].collect.to_a
|
86
|
-
q_prop = params[:props][1].collect.to_a
|
87
|
-
props = [ n_prop, q_prop ]
|
88
|
-
end
|
89
|
-
activities = params[:activities].collect.to_a
|
90
|
-
activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
|
91
|
-
prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
|
92
|
-
prediction = prediction.sub(/Val/,"") if prediction # Convert back
|
93
|
-
confidence = 0.0 if prediction.nil?
|
94
|
-
#$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')."
|
95
|
-
confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
|
96
|
-
end
|
97
|
-
{:value => prediction, :confidence => confidence}
|
98
|
-
|
99
|
-
end
|
100
|
-
|
101
|
-
|
102
|
-
|
103
31
|
end
|
104
|
-
|
105
32
|
end
|
106
33
|
end
|
107
34
|
|
data/lib/compound.rb
CHANGED
@@ -1,43 +1,122 @@
|
|
1
|
-
# TODO: check
|
2
|
-
# *** Open Babel Error in ParseFile
|
3
|
-
# Could not find contribution data file.
|
4
|
-
|
5
1
|
CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
|
6
2
|
|
7
3
|
module OpenTox
|
8
4
|
|
9
5
|
class Compound
|
6
|
+
require_relative "unique_descriptors.rb"
|
10
7
|
include OpenTox
|
11
8
|
|
9
|
+
DEFAULT_FINGERPRINT = "MP2D"
|
10
|
+
|
12
11
|
field :inchi, type: String
|
13
12
|
field :smiles, type: String
|
14
13
|
field :inchikey, type: String
|
15
14
|
field :names, type: Array
|
16
|
-
field :warning, type: String
|
17
15
|
field :cid, type: String
|
18
16
|
field :chemblid, type: String
|
19
17
|
field :png_id, type: BSON::ObjectId
|
20
18
|
field :svg_id, type: BSON::ObjectId
|
21
19
|
field :sdf_id, type: BSON::ObjectId
|
22
|
-
field :
|
23
|
-
field :
|
20
|
+
field :fingerprints, type: Hash, default: {}
|
21
|
+
field :default_fingerprint_size, type: Integer
|
22
|
+
field :physchem_descriptors, type: Hash, default: {}
|
23
|
+
field :dataset_ids, type: Array, default: []
|
24
|
+
field :features, type: Hash, default: {}
|
25
|
+
|
26
|
+
index({smiles: 1}, {unique: true})
|
24
27
|
|
25
28
|
# Overwrites standard Mongoid method to create fingerprints before database insertion
|
26
29
|
def self.find_or_create_by params
|
27
30
|
compound = self.find_or_initialize_by params
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size
|
32
|
+
compound.save
|
33
|
+
compound
|
34
|
+
end
|
35
|
+
|
36
|
+
def fingerprint type=DEFAULT_FINGERPRINT
|
37
|
+
unless fingerprints[type]
|
38
|
+
return [] unless self.smiles
|
39
|
+
#http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
|
40
|
+
if type == "MP2D"
|
41
|
+
fp = obconversion(smiles,"smi","mpd").strip.split("\t")
|
42
|
+
name = fp.shift # remove Title
|
43
|
+
fingerprints[type] = fp.uniq # no fingerprint counts
|
44
|
+
#http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
|
45
|
+
elsif type== "MNA"
|
46
|
+
level = 2 # TODO: level as parameter, evaluate level 1, see paper
|
47
|
+
fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
|
48
|
+
fp.shift # remove Title
|
49
|
+
fingerprints[type] = fp
|
50
|
+
else # standard fingerprints
|
51
|
+
fp = OpenBabel::OBFingerprint.find_fingerprint(type)
|
52
|
+
obmol = OpenBabel::OBMol.new
|
53
|
+
obconversion = OpenBabel::OBConversion.new
|
54
|
+
obconversion.set_in_format "smi"
|
55
|
+
obconversion.read_string obmol, self.smiles
|
56
|
+
result = OpenBabel::VectorUnsignedInt.new
|
57
|
+
fp.get_fingerprint(obmol,result)
|
58
|
+
# TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
|
59
|
+
#p OpenBabel::OBFingerprint.describe_bits(result)
|
60
|
+
# convert result to a list of the bits that are set
|
61
|
+
# from openbabel/scripts/python/pybel.py line 830
|
62
|
+
# see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
|
63
|
+
result = result.to_a
|
64
|
+
bitsperint = OpenBabel::OBFingerprint.getbitsperint()
|
65
|
+
bits_set = []
|
66
|
+
start = 1
|
67
|
+
result.each do |x|
|
68
|
+
i = start
|
69
|
+
while x > 0 do
|
70
|
+
bits_set << i if (x % 2) == 1
|
71
|
+
x >>= 1
|
72
|
+
i += 1
|
73
|
+
end
|
74
|
+
start += bitsperint
|
36
75
|
end
|
76
|
+
fingerprints[type] = bits_set
|
37
77
|
end
|
78
|
+
save
|
79
|
+
end
|
80
|
+
fingerprints[type]
|
81
|
+
end
|
82
|
+
|
83
|
+
def physchem descriptors=PhysChem.openbabel_descriptors
|
84
|
+
# TODO: speedup java descriptors
|
85
|
+
calculated_ids = physchem_descriptors.keys
|
86
|
+
# BSON::ObjectId instances are not allowed as keys in a BSON document.
|
87
|
+
new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
|
88
|
+
descs = {}
|
89
|
+
algos = {}
|
90
|
+
new_ids.each do |id|
|
91
|
+
descriptor = PhysChem.find id
|
92
|
+
descs[[descriptor.library, descriptor.descriptor]] = descriptor
|
93
|
+
algos[descriptor.name] = descriptor
|
94
|
+
end
|
95
|
+
# avoid recalculating Cdk features with multiple values
|
96
|
+
descs.keys.uniq.each do |k|
|
97
|
+
descs[k].send(k[0].downcase,k[1],self).each do |n,v|
|
98
|
+
physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
|
99
|
+
end
|
100
|
+
end
|
101
|
+
save
|
102
|
+
physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
|
103
|
+
end
|
104
|
+
|
105
|
+
def smarts_match smarts, count=false
|
106
|
+
obconversion = OpenBabel::OBConversion.new
|
107
|
+
obmol = OpenBabel::OBMol.new
|
108
|
+
obconversion.set_in_format('smi')
|
109
|
+
obconversion.read_string(obmol,self.smiles)
|
110
|
+
smarts_pattern = OpenBabel::OBSmartsPattern.new
|
111
|
+
smarts.collect do |sma|
|
112
|
+
smarts_pattern.init(sma.smarts)
|
113
|
+
if smarts_pattern.match(obmol)
|
114
|
+
count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
|
115
|
+
else
|
116
|
+
value = 0
|
117
|
+
end
|
118
|
+
value
|
38
119
|
end
|
39
|
-
compound.save
|
40
|
-
compound
|
41
120
|
end
|
42
121
|
|
43
122
|
# Create a compound from smiles string
|
@@ -46,11 +125,16 @@ module OpenTox
|
|
46
125
|
# @param [String] smiles Smiles string
|
47
126
|
# @return [OpenTox::Compound] Compound
|
48
127
|
def self.from_smiles smiles
|
49
|
-
smiles
|
128
|
+
if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
|
129
|
+
$logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
|
130
|
+
return nil
|
131
|
+
end
|
132
|
+
smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
|
50
133
|
if smiles.empty?
|
51
|
-
|
134
|
+
$logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string."
|
135
|
+
return nil
|
52
136
|
else
|
53
|
-
Compound.find_or_create_by :smiles =>
|
137
|
+
Compound.find_or_create_by :smiles => smiles
|
54
138
|
end
|
55
139
|
end
|
56
140
|
|
@@ -64,7 +148,7 @@ module OpenTox
|
|
64
148
|
#smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
|
65
149
|
smiles = obconversion(inchi,"inchi","can")
|
66
150
|
if smiles.empty?
|
67
|
-
Compound.find_or_create_by(:
|
151
|
+
Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."])
|
68
152
|
else
|
69
153
|
Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
|
70
154
|
end
|
@@ -94,7 +178,7 @@ module OpenTox
|
|
94
178
|
|
95
179
|
result = obconversion(smiles,"smi","inchi")
|
96
180
|
#result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp
|
97
|
-
update(:inchi => result.chomp)
|
181
|
+
update(:inchi => result.chomp) if result and !result.empty?
|
98
182
|
end
|
99
183
|
self["inchi"]
|
100
184
|
end
|
@@ -131,7 +215,7 @@ module OpenTox
|
|
131
215
|
if self.svg_id.nil?
|
132
216
|
svg = obconversion(smiles,"smi","svg")
|
133
217
|
file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
|
134
|
-
update(:
|
218
|
+
update(:svg_id => $gridfs.insert_one(file))
|
135
219
|
end
|
136
220
|
$gridfs.find_one(_id: self.svg_id).data
|
137
221
|
|
@@ -175,32 +259,111 @@ module OpenTox
|
|
175
259
|
self["chemblid"]
|
176
260
|
end
|
177
261
|
|
178
|
-
def
|
262
|
+
def fingerprint_count_neighbors params
|
263
|
+
# TODO fix
|
264
|
+
neighbors = []
|
265
|
+
query_fingerprint = self.fingerprint params[:type]
|
266
|
+
training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
|
267
|
+
unless self == compound
|
268
|
+
candidate_fingerprint = compound.fingerprint params[:type]
|
269
|
+
features = (query_fingerprint + candidate_fingerprint).uniq
|
270
|
+
min_sum = 0
|
271
|
+
max_sum = 0
|
272
|
+
features.each do |f|
|
273
|
+
min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
|
274
|
+
min_sum += min
|
275
|
+
max_sum += max
|
276
|
+
end
|
277
|
+
max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
|
278
|
+
neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
|
279
|
+
end
|
280
|
+
end
|
281
|
+
neighbors.sort{|a,b| b.last <=> a.last}
|
282
|
+
end
|
283
|
+
|
284
|
+
def fingerprint_neighbors params
|
285
|
+
bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
|
286
|
+
neighbors = []
|
287
|
+
if params[:type] == DEFAULT_FINGERPRINT
|
288
|
+
neighbors = db_neighbors params
|
289
|
+
else
|
290
|
+
query_fingerprint = self.fingerprint params[:type]
|
291
|
+
training_dataset = Dataset.find(params[:training_dataset_id])
|
292
|
+
prediction_feature = training_dataset.features.first
|
293
|
+
training_dataset.compounds.each do |compound|
|
294
|
+
candidate_fingerprint = compound.fingerprint params[:type]
|
295
|
+
sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
|
296
|
+
feature_values = training_dataset.values(compound,prediction_feature)
|
297
|
+
neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
|
298
|
+
end
|
299
|
+
neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
|
300
|
+
end
|
301
|
+
neighbors
|
302
|
+
end
|
303
|
+
|
304
|
+
def physchem_neighbors params
|
305
|
+
feature_dataset = Dataset.find params[:feature_dataset_id]
|
306
|
+
query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
|
307
|
+
neighbors = []
|
308
|
+
feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
|
309
|
+
# TODO implement pearson and cosine similarity separately
|
310
|
+
R.assign "x", query_fingerprint
|
311
|
+
R.assign "y", candidate_fingerprint
|
312
|
+
sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
|
313
|
+
if sim >= params[:min_sim]
|
314
|
+
neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
|
315
|
+
end
|
316
|
+
end
|
317
|
+
neighbors
|
318
|
+
end
|
319
|
+
|
320
|
+
def db_neighbors params
|
179
321
|
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
|
180
|
-
|
322
|
+
|
323
|
+
#qn = default_fingerprint_size
|
181
324
|
#qmin = qn * threshold
|
182
325
|
#qmax = qn / threshold
|
183
326
|
#not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
|
184
327
|
#reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
|
185
328
|
aggregate = [
|
186
329
|
#{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
|
187
|
-
{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
|
330
|
+
#{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
|
188
331
|
{'$project' => {
|
189
332
|
'tanimoto' => {'$let' => {
|
190
|
-
'vars' => {'common' => {'$size' => {'$setIntersection' => [
|
191
|
-
'
|
333
|
+
'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
|
334
|
+
#'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
|
335
|
+
'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
|
192
336
|
}},
|
193
|
-
'_id' => 1
|
337
|
+
'_id' => 1,
|
338
|
+
'features' => 1,
|
339
|
+
'dataset_ids' => 1
|
194
340
|
}},
|
195
|
-
{'$match' => {'tanimoto' => {'$gte' =>
|
341
|
+
{'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
|
196
342
|
{'$sort' => {'tanimoto' => -1}}
|
197
343
|
]
|
198
344
|
|
199
|
-
$mongo["compounds"].aggregate(aggregate).
|
345
|
+
$mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
|
200
346
|
|
201
347
|
end
|
202
|
-
|
203
|
-
|
348
|
+
|
349
|
+
# Convert mmol to mg
|
350
|
+
# @return [Float] value in mg
|
351
|
+
def mmol_to_mg mmol
|
352
|
+
mmol.to_f*molecular_weight
|
353
|
+
end
|
354
|
+
|
355
|
+
# Convert mg to mmol
|
356
|
+
# @return [Float] value in mmol
|
357
|
+
def mg_to_mmol mg
|
358
|
+
mg.to_f/molecular_weight
|
359
|
+
end
|
360
|
+
|
361
|
+
# Calculate molecular weight of Compound with OB and store it in object
|
362
|
+
# @return [Float] molecular weight
|
363
|
+
def molecular_weight
|
364
|
+
mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
|
365
|
+
physchem([mw_feature])[mw_feature.id.to_s]
|
366
|
+
end
|
204
367
|
|
205
368
|
private
|
206
369
|
|
@@ -209,17 +372,12 @@ module OpenTox
|
|
209
372
|
obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
|
210
373
|
obmol = OpenBabel::OBMol.new
|
211
374
|
obconversion.set_in_and_out_formats input_format, output_format
|
375
|
+
return nil if identifier.nil?
|
212
376
|
obconversion.read_string obmol, identifier
|
213
377
|
case output_format
|
214
378
|
when /smi|can|inchi/
|
215
379
|
obconversion.write_string(obmol).gsub(/\s/,'').chomp
|
216
380
|
when /sdf/
|
217
|
-
p "SDF conversion"
|
218
|
-
# has no effect
|
219
|
-
#obconversion.add_option("gen3D", OpenBabel::OBConversion::GENOPTIONS)
|
220
|
-
# segfaults with openbabel git master
|
221
|
-
#OpenBabel::OBOp.find_type("Gen3D").do(obmol)
|
222
|
-
|
223
381
|
# TODO: find disconnected structures
|
224
382
|
# strip_salts
|
225
383
|
# separate
|
@@ -231,14 +389,13 @@ p "SDF conversion"
|
|
231
389
|
print sdf
|
232
390
|
if sdf.match(/.nan/)
|
233
391
|
|
234
|
-
# TODO: fix or eliminate 2d generation
|
235
392
|
$logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
|
236
393
|
obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
|
237
|
-
#OpenBabel::OBOp.find_type("Gen2D").do(obmol)
|
238
394
|
sdf = obconversion.write_string(obmol)
|
239
395
|
if sdf.match(/.nan/)
|
240
|
-
$logger.warn "2D generation failed for compound #{identifier}"
|
241
|
-
|
396
|
+
$logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates."
|
397
|
+
obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS)
|
398
|
+
sdf = obconversion.write_string(obmol)
|
242
399
|
end
|
243
400
|
end
|
244
401
|
sdf
|
@@ -248,7 +405,7 @@ print sdf
|
|
248
405
|
end
|
249
406
|
|
250
407
|
def obconversion(identifier,input_format,output_format,option=nil)
|
251
|
-
self.class.obconversion(identifier,input_format,output_format,option
|
408
|
+
self.class.obconversion(identifier,input_format,output_format,option)
|
252
409
|
end
|
253
410
|
end
|
254
411
|
end
|