lazar 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1b22cad0ba1ecef02ff4af283796fcb36cbe758f
|
4
|
+
data.tar.gz: 49bd9a98d7c24ff2b7d1442d58d0b775aaf62e74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 96bd32e2b21abfb827a5cfa10ee520a1c06158876d4fb6238da63b79a785137fcc587aa78f40c8ec03b708e83a520c0cd0192c0795f4df34dbf05ebc21677a3c
|
7
|
+
data.tar.gz: c54ea1804b359da06a32b6c4a8314cc329cd173aa8defbdd7adc63c46230c4c75cf7489beecdb5d3290b6892352778e996cf6828d4ac576ef4082e1ef6c93a46
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -8,7 +8,7 @@ Dependencies
|
|
8
8
|
|
9
9
|
lazar depends on a couple of external programs and libraries. On Debian 7 "Wheezy" systems you can install them with
|
10
10
|
|
11
|
-
`sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev
|
11
|
+
`sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev openjdk-7-jre libgsl0-dev libxml2-dev zlib1g-dev libcairo2-dev`
|
12
12
|
|
13
13
|
You will also need at least mongodb version 3.0, but Debian "Wheezy" provides version 2.4. Please follow the instructions at http://docs.mongodb.org/manual/tutorial/install-mongodb-on-debian/:
|
14
14
|
|
@@ -30,6 +30,7 @@ Installation
|
|
30
30
|
git clone https://github.com/opentox/lazar.git
|
31
31
|
cd lazar
|
32
32
|
ruby ext/lazar/extconf.rb
|
33
|
+
sudo Rscript ext/lazar/rinstall.R
|
33
34
|
bundle install
|
34
35
|
```
|
35
36
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.9
|
data/ext/lazar/extconf.rb
CHANGED
@@ -1,88 +1,27 @@
|
|
1
1
|
require 'fileutils'
|
2
2
|
require 'rbconfig'
|
3
|
+
require 'mkmf'
|
3
4
|
|
4
5
|
main_dir = File.expand_path(File.join(File.dirname(__FILE__),"..",".."))
|
5
6
|
|
6
|
-
#
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
openbabel_dir = File.join main_dir, "openbabel"
|
12
|
-
src_dir = openbabel_dir #File.join openbabel_dir, "openbabel-#{openbabel_version}"
|
13
|
-
build_dir = File.join src_dir, "build"
|
14
|
-
install_dir = openbabel_dir
|
15
|
-
install_lib_dir = File.join install_dir, "lib"
|
16
|
-
lib_dir = File.join openbabel_dir, "lib", "openbabel"
|
17
|
-
ruby_src_dir = File.join src_dir, "scripts", "ruby"
|
18
|
-
|
19
|
-
begin
|
20
|
-
nr_processors = `grep processor /proc/cpuinfo | wc -l` # speed up compilation, Linux only
|
21
|
-
rescue
|
22
|
-
nr_processors = 1
|
7
|
+
# check for required programs
|
8
|
+
programs = ["R","Rscript","mongod","java","getconf"]
|
9
|
+
programs.each do |program|
|
10
|
+
abort "Please install #{program} on your system." unless find_executable program
|
23
11
|
end
|
24
12
|
|
25
|
-
|
26
|
-
Dir.chdir main_dir do
|
27
|
-
FileUtils.rm_rf src_dir
|
28
|
-
puts "Downloading OpenBabel sources"
|
29
|
-
system "git clone https://github.com/openbabel/openbabel.git"
|
30
|
-
end
|
13
|
+
abort "Please install Rserve on your system. Execute 'install.packages('Rserve')' in a R console running as root ('sudo R')." unless `R CMD Rserve --version`.match(/^Rserve/)
|
31
14
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
# http://www.cmake.org/Wiki/CMake_RPATH_handling
|
39
|
-
# http://vtk.1045678.n5.nabble.com/How-to-force-cmake-not-to-remove-install-rpath-td5721193.html
|
40
|
-
cmake += " -DCMAKE_INSTALL_RPATH:STRING=\"#{install_lib_dir}\""
|
41
|
-
system cmake
|
42
|
-
end
|
15
|
+
# install R packages
|
16
|
+
r_dir = File.join main_dir, "R"
|
17
|
+
FileUtils.mkdir_p r_dir
|
18
|
+
FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
|
19
|
+
rinstall = File.expand_path(File.join(File.dirname(__FILE__),"rinstall.R"))
|
20
|
+
puts `Rscript --vanilla #{rinstall} #{r_dir}`
|
43
21
|
|
44
|
-
#
|
45
|
-
|
46
|
-
puts "
|
47
|
-
system "make -j#{nr_processors}"
|
48
|
-
system "make install"
|
49
|
-
ENV["PKG_CONFIG_PATH"] = File.dirname(File.expand_path(Dir["#{install_dir}/**/openbabel*pc"].first))
|
22
|
+
# create a fake Makefile
|
23
|
+
File.open(File.join(File.dirname(__FILE__),"Makefile"),"w+") do |makefile|
|
24
|
+
makefile.puts "all:\n\ttrue\n\ninstall:\n\ttrue\n"
|
50
25
|
end
|
51
26
|
|
52
|
-
ob_include= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/include/openbabel-2.0")
|
53
|
-
ob_lib= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/lib")
|
54
|
-
|
55
|
-
# compile ruby bindings
|
56
|
-
=begin
|
57
|
-
puts "Compiling and installing OpenBabel Ruby bindings."
|
58
|
-
Dir.chdir ruby_src_dir do
|
59
|
-
# fix rpath
|
60
|
-
system "sed -i 's|with_ldflags.*$|with_ldflags(\"#\$LDFLAGS -dynamic -Wl,-rpath,#{install_lib_dir}\") do|' #{File.join(ruby_src_dir,'extconf.rb')}"
|
61
|
-
system "#{RbConfig.ruby} extconf.rb --with-openbabel-include=#{ob_include} --with-openbabel-lib=#{ob_lib}"
|
62
|
-
system "make -j#{nr_processors}"
|
63
|
-
end
|
64
|
-
=end
|
65
|
-
|
66
|
-
# install fminer
|
67
|
-
fminer_dir = File.join main_dir, "libfminer"
|
68
|
-
system "git clone git://github.com/amaunz/fminer2.git #{fminer_dir}"
|
69
|
-
|
70
|
-
["libbbrc","liblast"].each do |lib|
|
71
|
-
FileUtils.cd File.join(fminer_dir,lib)
|
72
|
-
system "sed -i 's,^INCLUDE_OB.*,INCLUDE_OB\ =\ #{ob_include},g' Makefile"
|
73
|
-
system "sed -i 's,^LDFLAGS_OB.*,LDFLAGS_OB\ =\ #{ob_lib},g' Makefile"
|
74
|
-
system "sed -i 's,^INCLUDE_RB.*,INCLUDE_RB\ =\ #{RbConfig::CONFIG['rubyhdrdir']},g' Makefile"
|
75
|
-
# TODO fix in fminer Makefile
|
76
|
-
system "sed -i 's,-g, -g -I #{RbConfig::CONFIG['rubyhdrdir']} -I #{RbConfig::CONFIG['rubyarchhdrdir']} -I,' Makefile" # fix include path (CH)
|
77
|
-
system "sed -i '74s/$(CC)/$(CC) -Wl,-rpath,#{ob_lib.gsub('/','\/')} -L/' Makefile" # fix library path (CH)
|
78
|
-
system "make ruby"
|
79
|
-
end
|
80
|
-
|
81
|
-
# install last-utils
|
82
|
-
FileUtils.cd main_dir
|
83
|
-
system "git clone git://github.com/amaunz/last-utils.git"
|
84
|
-
FileUtils.cd File.join(main_dir,"last-utils")
|
85
|
-
`sed -i '8s/"openbabel", //' lu.rb`
|
86
|
-
|
87
|
-
# install R packagemain_dir
|
88
27
|
$makefile_created = true
|
@@ -0,0 +1,9 @@
|
|
1
|
+
libdir = commandArgs(trailingOnly=TRUE)[1]
|
2
|
+
# chooseCRANmirror(ind=19); does not have any impact on selected server
|
3
|
+
#args=paste0("--prefix=",libdir,"/..")
|
4
|
+
#install.packages("Rserve",lib=libdir,configure.args=args)
|
5
|
+
install.packages("gridExtra",lib=libdir);
|
6
|
+
install.packages("ggplot2",lib=libdir);
|
7
|
+
install.packages("pls",lib=libdir);
|
8
|
+
install.packages("caret",lib=libdir);
|
9
|
+
install.packages("doMC",lib=libdir);
|
data/lazar.gemspec
CHANGED
@@ -9,20 +9,20 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.homepage = "http://github.com/opentox/lazar"
|
10
10
|
s.summary = %q{Lazar framework}
|
11
11
|
s.description = %q{Libraries for lazy structure-activity relationships and read-across.}
|
12
|
-
s.license = 'GPL-3'
|
12
|
+
s.license = 'GPL-3.0'
|
13
13
|
|
14
14
|
s.rubyforge_project = "lazar"
|
15
|
-
|
16
15
|
s.files = `git ls-files`.split("\n")
|
17
16
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
17
|
s.extensions = %w[ext/lazar/extconf.rb]
|
19
18
|
s.require_paths = ["lib"]
|
20
19
|
|
21
20
|
# specify any dependencies here; for example:
|
22
|
-
s.add_runtime_dependency "bundler"
|
23
|
-
s.add_runtime_dependency "rest-client"
|
24
|
-
s.add_runtime_dependency 'nokogiri'
|
25
|
-
s.add_runtime_dependency 'rserve-client'
|
26
|
-
s.add_runtime_dependency "mongoid",
|
21
|
+
s.add_runtime_dependency "bundler", "~> 1.11"
|
22
|
+
s.add_runtime_dependency "rest-client", "~> 1.8"
|
23
|
+
s.add_runtime_dependency 'nokogiri', "~> 1.6"
|
24
|
+
s.add_runtime_dependency 'rserve-client', "~> 0.3"
|
25
|
+
s.add_runtime_dependency "mongoid", "~> 5.0"
|
26
|
+
s.add_runtime_dependency 'openbabel> 2.3.2.2', '~> 0'
|
27
27
|
|
28
28
|
end
|
data/lib/classification.rb
CHANGED
@@ -3,13 +3,14 @@ module OpenTox
|
|
3
3
|
|
4
4
|
class Classification
|
5
5
|
|
6
|
-
def self.weighted_majority_vote
|
7
|
-
|
6
|
+
def self.weighted_majority_vote compound, params
|
7
|
+
neighbors = params[:neighbors]
|
8
8
|
weighted_sum = {}
|
9
9
|
sim_sum = 0.0
|
10
|
+
confidence = 0.0
|
10
11
|
neighbors.each do |row|
|
11
|
-
|
12
|
-
|
12
|
+
sim = row["tanimoto"]
|
13
|
+
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
13
14
|
weighted_sum[act] ||= 0
|
14
15
|
weighted_sum[act] += sim
|
15
16
|
end
|
@@ -27,81 +28,7 @@ module OpenTox
|
|
27
28
|
bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
|
28
29
|
end
|
29
30
|
end
|
30
|
-
|
31
|
-
# Classification with majority vote from neighbors weighted by similarity
|
32
|
-
# @param [Hash] params Keys `:activities, :sims, :value_map` are required
|
33
|
-
# @return [Numeric] A prediction value.
|
34
|
-
def self.fminer_weighted_majority_vote neighbors, training_dataset
|
35
|
-
|
36
|
-
neighbor_contribution = 0.0
|
37
|
-
confidence_sum = 0.0
|
38
|
-
|
39
|
-
$logger.debug "Weighted Majority Vote Classification."
|
40
|
-
|
41
|
-
values = neighbors.collect{|n| n[2]}.uniq
|
42
|
-
neighbors.each do |neighbor|
|
43
|
-
i = training_dataset.compound_ids.index n.id
|
44
|
-
neighbor_weight = neighbor[1]
|
45
|
-
activity = values.index(neighbor[2]) + 1 # map values to integers > 1
|
46
|
-
neighbor_contribution += activity * neighbor_weight
|
47
|
-
if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
|
48
|
-
case activity
|
49
|
-
when 1
|
50
|
-
confidence_sum -= neighbor_weight
|
51
|
-
when 2
|
52
|
-
confidence_sum += neighbor_weight
|
53
|
-
end
|
54
|
-
else
|
55
|
-
confidence_sum += neighbor_weight
|
56
|
-
end
|
57
|
-
end
|
58
|
-
if values.size == 2
|
59
|
-
if confidence_sum >= 0.0
|
60
|
-
prediction = values[1]
|
61
|
-
elsif confidence_sum < 0.0
|
62
|
-
prediction = values[0]
|
63
|
-
end
|
64
|
-
elsif values.size == 1 # all neighbors have the same value
|
65
|
-
prediction = values[0]
|
66
|
-
else
|
67
|
-
prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
|
68
|
-
end
|
69
|
-
|
70
|
-
confidence = (confidence_sum/neighbors.size).abs
|
71
|
-
{:value => prediction, :confidence => confidence.abs}
|
72
|
-
end
|
73
|
-
|
74
|
-
# Local support vector regression from neighbors
|
75
|
-
# @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
|
76
|
-
# @return [Numeric] A prediction value.
|
77
|
-
def self.local_svm_classification(params)
|
78
|
-
|
79
|
-
confidence = 0.0
|
80
|
-
prediction = nil
|
81
|
-
|
82
|
-
$logger.debug "Local SVM."
|
83
|
-
if params[:activities].size>0
|
84
|
-
if params[:props]
|
85
|
-
n_prop = params[:props][0].collect.to_a
|
86
|
-
q_prop = params[:props][1].collect.to_a
|
87
|
-
props = [ n_prop, q_prop ]
|
88
|
-
end
|
89
|
-
activities = params[:activities].collect.to_a
|
90
|
-
activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
|
91
|
-
prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
|
92
|
-
prediction = prediction.sub(/Val/,"") if prediction # Convert back
|
93
|
-
confidence = 0.0 if prediction.nil?
|
94
|
-
#$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')."
|
95
|
-
confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
|
96
|
-
end
|
97
|
-
{:value => prediction, :confidence => confidence}
|
98
|
-
|
99
|
-
end
|
100
|
-
|
101
|
-
|
102
|
-
|
103
31
|
end
|
104
|
-
|
105
32
|
end
|
106
33
|
end
|
107
34
|
|
data/lib/compound.rb
CHANGED
@@ -1,43 +1,122 @@
|
|
1
|
-
# TODO: check
|
2
|
-
# *** Open Babel Error in ParseFile
|
3
|
-
# Could not find contribution data file.
|
4
|
-
|
5
1
|
CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
|
6
2
|
|
7
3
|
module OpenTox
|
8
4
|
|
9
5
|
class Compound
|
6
|
+
require_relative "unique_descriptors.rb"
|
10
7
|
include OpenTox
|
11
8
|
|
9
|
+
DEFAULT_FINGERPRINT = "MP2D"
|
10
|
+
|
12
11
|
field :inchi, type: String
|
13
12
|
field :smiles, type: String
|
14
13
|
field :inchikey, type: String
|
15
14
|
field :names, type: Array
|
16
|
-
field :warning, type: String
|
17
15
|
field :cid, type: String
|
18
16
|
field :chemblid, type: String
|
19
17
|
field :png_id, type: BSON::ObjectId
|
20
18
|
field :svg_id, type: BSON::ObjectId
|
21
19
|
field :sdf_id, type: BSON::ObjectId
|
22
|
-
field :
|
23
|
-
field :
|
20
|
+
field :fingerprints, type: Hash, default: {}
|
21
|
+
field :default_fingerprint_size, type: Integer
|
22
|
+
field :physchem_descriptors, type: Hash, default: {}
|
23
|
+
field :dataset_ids, type: Array, default: []
|
24
|
+
field :features, type: Hash, default: {}
|
25
|
+
|
26
|
+
index({smiles: 1}, {unique: true})
|
24
27
|
|
25
28
|
# Overwrites standard Mongoid method to create fingerprints before database insertion
|
26
29
|
def self.find_or_create_by params
|
27
30
|
compound = self.find_or_initialize_by params
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size
|
32
|
+
compound.save
|
33
|
+
compound
|
34
|
+
end
|
35
|
+
|
36
|
+
def fingerprint type=DEFAULT_FINGERPRINT
|
37
|
+
unless fingerprints[type]
|
38
|
+
return [] unless self.smiles
|
39
|
+
#http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
|
40
|
+
if type == "MP2D"
|
41
|
+
fp = obconversion(smiles,"smi","mpd").strip.split("\t")
|
42
|
+
name = fp.shift # remove Title
|
43
|
+
fingerprints[type] = fp.uniq # no fingerprint counts
|
44
|
+
#http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
|
45
|
+
elsif type== "MNA"
|
46
|
+
level = 2 # TODO: level as parameter, evaluate level 1, see paper
|
47
|
+
fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
|
48
|
+
fp.shift # remove Title
|
49
|
+
fingerprints[type] = fp
|
50
|
+
else # standard fingerprints
|
51
|
+
fp = OpenBabel::OBFingerprint.find_fingerprint(type)
|
52
|
+
obmol = OpenBabel::OBMol.new
|
53
|
+
obconversion = OpenBabel::OBConversion.new
|
54
|
+
obconversion.set_in_format "smi"
|
55
|
+
obconversion.read_string obmol, self.smiles
|
56
|
+
result = OpenBabel::VectorUnsignedInt.new
|
57
|
+
fp.get_fingerprint(obmol,result)
|
58
|
+
# TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
|
59
|
+
#p OpenBabel::OBFingerprint.describe_bits(result)
|
60
|
+
# convert result to a list of the bits that are set
|
61
|
+
# from openbabel/scripts/python/pybel.py line 830
|
62
|
+
# see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
|
63
|
+
result = result.to_a
|
64
|
+
bitsperint = OpenBabel::OBFingerprint.getbitsperint()
|
65
|
+
bits_set = []
|
66
|
+
start = 1
|
67
|
+
result.each do |x|
|
68
|
+
i = start
|
69
|
+
while x > 0 do
|
70
|
+
bits_set << i if (x % 2) == 1
|
71
|
+
x >>= 1
|
72
|
+
i += 1
|
73
|
+
end
|
74
|
+
start += bitsperint
|
36
75
|
end
|
76
|
+
fingerprints[type] = bits_set
|
37
77
|
end
|
78
|
+
save
|
79
|
+
end
|
80
|
+
fingerprints[type]
|
81
|
+
end
|
82
|
+
|
83
|
+
def physchem descriptors=PhysChem.openbabel_descriptors
|
84
|
+
# TODO: speedup java descriptors
|
85
|
+
calculated_ids = physchem_descriptors.keys
|
86
|
+
# BSON::ObjectId instances are not allowed as keys in a BSON document.
|
87
|
+
new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
|
88
|
+
descs = {}
|
89
|
+
algos = {}
|
90
|
+
new_ids.each do |id|
|
91
|
+
descriptor = PhysChem.find id
|
92
|
+
descs[[descriptor.library, descriptor.descriptor]] = descriptor
|
93
|
+
algos[descriptor.name] = descriptor
|
94
|
+
end
|
95
|
+
# avoid recalculating Cdk features with multiple values
|
96
|
+
descs.keys.uniq.each do |k|
|
97
|
+
descs[k].send(k[0].downcase,k[1],self).each do |n,v|
|
98
|
+
physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
|
99
|
+
end
|
100
|
+
end
|
101
|
+
save
|
102
|
+
physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
|
103
|
+
end
|
104
|
+
|
105
|
+
def smarts_match smarts, count=false
|
106
|
+
obconversion = OpenBabel::OBConversion.new
|
107
|
+
obmol = OpenBabel::OBMol.new
|
108
|
+
obconversion.set_in_format('smi')
|
109
|
+
obconversion.read_string(obmol,self.smiles)
|
110
|
+
smarts_pattern = OpenBabel::OBSmartsPattern.new
|
111
|
+
smarts.collect do |sma|
|
112
|
+
smarts_pattern.init(sma.smarts)
|
113
|
+
if smarts_pattern.match(obmol)
|
114
|
+
count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
|
115
|
+
else
|
116
|
+
value = 0
|
117
|
+
end
|
118
|
+
value
|
38
119
|
end
|
39
|
-
compound.save
|
40
|
-
compound
|
41
120
|
end
|
42
121
|
|
43
122
|
# Create a compound from smiles string
|
@@ -46,11 +125,16 @@ module OpenTox
|
|
46
125
|
# @param [String] smiles Smiles string
|
47
126
|
# @return [OpenTox::Compound] Compound
|
48
127
|
def self.from_smiles smiles
|
49
|
-
smiles
|
128
|
+
if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
|
129
|
+
$logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
|
130
|
+
return nil
|
131
|
+
end
|
132
|
+
smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
|
50
133
|
if smiles.empty?
|
51
|
-
|
134
|
+
$logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string."
|
135
|
+
return nil
|
52
136
|
else
|
53
|
-
Compound.find_or_create_by :smiles =>
|
137
|
+
Compound.find_or_create_by :smiles => smiles
|
54
138
|
end
|
55
139
|
end
|
56
140
|
|
@@ -64,7 +148,7 @@ module OpenTox
|
|
64
148
|
#smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
|
65
149
|
smiles = obconversion(inchi,"inchi","can")
|
66
150
|
if smiles.empty?
|
67
|
-
Compound.find_or_create_by(:
|
151
|
+
Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."])
|
68
152
|
else
|
69
153
|
Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
|
70
154
|
end
|
@@ -94,7 +178,7 @@ module OpenTox
|
|
94
178
|
|
95
179
|
result = obconversion(smiles,"smi","inchi")
|
96
180
|
#result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp
|
97
|
-
update(:inchi => result.chomp)
|
181
|
+
update(:inchi => result.chomp) if result and !result.empty?
|
98
182
|
end
|
99
183
|
self["inchi"]
|
100
184
|
end
|
@@ -131,7 +215,7 @@ module OpenTox
|
|
131
215
|
if self.svg_id.nil?
|
132
216
|
svg = obconversion(smiles,"smi","svg")
|
133
217
|
file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
|
134
|
-
update(:
|
218
|
+
update(:svg_id => $gridfs.insert_one(file))
|
135
219
|
end
|
136
220
|
$gridfs.find_one(_id: self.svg_id).data
|
137
221
|
|
@@ -175,32 +259,111 @@ module OpenTox
|
|
175
259
|
self["chemblid"]
|
176
260
|
end
|
177
261
|
|
178
|
-
def
|
262
|
+
def fingerprint_count_neighbors params
|
263
|
+
# TODO fix
|
264
|
+
neighbors = []
|
265
|
+
query_fingerprint = self.fingerprint params[:type]
|
266
|
+
training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
|
267
|
+
unless self == compound
|
268
|
+
candidate_fingerprint = compound.fingerprint params[:type]
|
269
|
+
features = (query_fingerprint + candidate_fingerprint).uniq
|
270
|
+
min_sum = 0
|
271
|
+
max_sum = 0
|
272
|
+
features.each do |f|
|
273
|
+
min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
|
274
|
+
min_sum += min
|
275
|
+
max_sum += max
|
276
|
+
end
|
277
|
+
max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
|
278
|
+
neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
|
279
|
+
end
|
280
|
+
end
|
281
|
+
neighbors.sort{|a,b| b.last <=> a.last}
|
282
|
+
end
|
283
|
+
|
284
|
+
def fingerprint_neighbors params
|
285
|
+
bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
|
286
|
+
neighbors = []
|
287
|
+
if params[:type] == DEFAULT_FINGERPRINT
|
288
|
+
neighbors = db_neighbors params
|
289
|
+
else
|
290
|
+
query_fingerprint = self.fingerprint params[:type]
|
291
|
+
training_dataset = Dataset.find(params[:training_dataset_id])
|
292
|
+
prediction_feature = training_dataset.features.first
|
293
|
+
training_dataset.compounds.each do |compound|
|
294
|
+
candidate_fingerprint = compound.fingerprint params[:type]
|
295
|
+
sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
|
296
|
+
feature_values = training_dataset.values(compound,prediction_feature)
|
297
|
+
neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
|
298
|
+
end
|
299
|
+
neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
|
300
|
+
end
|
301
|
+
neighbors
|
302
|
+
end
|
303
|
+
|
304
|
+
def physchem_neighbors params
|
305
|
+
feature_dataset = Dataset.find params[:feature_dataset_id]
|
306
|
+
query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
|
307
|
+
neighbors = []
|
308
|
+
feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
|
309
|
+
# TODO implement pearson and cosine similarity separately
|
310
|
+
R.assign "x", query_fingerprint
|
311
|
+
R.assign "y", candidate_fingerprint
|
312
|
+
sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
|
313
|
+
if sim >= params[:min_sim]
|
314
|
+
neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
|
315
|
+
end
|
316
|
+
end
|
317
|
+
neighbors
|
318
|
+
end
|
319
|
+
|
320
|
+
def db_neighbors params
|
179
321
|
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
|
180
|
-
|
322
|
+
|
323
|
+
#qn = default_fingerprint_size
|
181
324
|
#qmin = qn * threshold
|
182
325
|
#qmax = qn / threshold
|
183
326
|
#not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
|
184
327
|
#reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
|
185
328
|
aggregate = [
|
186
329
|
#{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
|
187
|
-
{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
|
330
|
+
#{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
|
188
331
|
{'$project' => {
|
189
332
|
'tanimoto' => {'$let' => {
|
190
|
-
'vars' => {'common' => {'$size' => {'$setIntersection' => [
|
191
|
-
'
|
333
|
+
'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
|
334
|
+
#'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
|
335
|
+
'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
|
192
336
|
}},
|
193
|
-
'_id' => 1
|
337
|
+
'_id' => 1,
|
338
|
+
'features' => 1,
|
339
|
+
'dataset_ids' => 1
|
194
340
|
}},
|
195
|
-
{'$match' => {'tanimoto' => {'$gte' =>
|
341
|
+
{'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
|
196
342
|
{'$sort' => {'tanimoto' => -1}}
|
197
343
|
]
|
198
344
|
|
199
|
-
$mongo["compounds"].aggregate(aggregate).
|
345
|
+
$mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
|
200
346
|
|
201
347
|
end
|
202
|
-
|
203
|
-
|
348
|
+
|
349
|
+
# Convert mmol to mg
|
350
|
+
# @return [Float] value in mg
|
351
|
+
def mmol_to_mg mmol
|
352
|
+
mmol.to_f*molecular_weight
|
353
|
+
end
|
354
|
+
|
355
|
+
# Convert mg to mmol
|
356
|
+
# @return [Float] value in mmol
|
357
|
+
def mg_to_mmol mg
|
358
|
+
mg.to_f/molecular_weight
|
359
|
+
end
|
360
|
+
|
361
|
+
# Calculate molecular weight of Compound with OB and store it in object
|
362
|
+
# @return [Float] molecular weight
|
363
|
+
def molecular_weight
|
364
|
+
mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
|
365
|
+
physchem([mw_feature])[mw_feature.id.to_s]
|
366
|
+
end
|
204
367
|
|
205
368
|
private
|
206
369
|
|
@@ -209,17 +372,12 @@ module OpenTox
|
|
209
372
|
obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
|
210
373
|
obmol = OpenBabel::OBMol.new
|
211
374
|
obconversion.set_in_and_out_formats input_format, output_format
|
375
|
+
return nil if identifier.nil?
|
212
376
|
obconversion.read_string obmol, identifier
|
213
377
|
case output_format
|
214
378
|
when /smi|can|inchi/
|
215
379
|
obconversion.write_string(obmol).gsub(/\s/,'').chomp
|
216
380
|
when /sdf/
|
217
|
-
p "SDF conversion"
|
218
|
-
# has no effect
|
219
|
-
#obconversion.add_option("gen3D", OpenBabel::OBConversion::GENOPTIONS)
|
220
|
-
# segfaults with openbabel git master
|
221
|
-
#OpenBabel::OBOp.find_type("Gen3D").do(obmol)
|
222
|
-
|
223
381
|
# TODO: find disconnected structures
|
224
382
|
# strip_salts
|
225
383
|
# separate
|
@@ -231,14 +389,13 @@ p "SDF conversion"
|
|
231
389
|
print sdf
|
232
390
|
if sdf.match(/.nan/)
|
233
391
|
|
234
|
-
# TODO: fix or eliminate 2d generation
|
235
392
|
$logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
|
236
393
|
obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
|
237
|
-
#OpenBabel::OBOp.find_type("Gen2D").do(obmol)
|
238
394
|
sdf = obconversion.write_string(obmol)
|
239
395
|
if sdf.match(/.nan/)
|
240
|
-
$logger.warn "2D generation failed for compound #{identifier}"
|
241
|
-
|
396
|
+
$logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates."
|
397
|
+
obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS)
|
398
|
+
sdf = obconversion.write_string(obmol)
|
242
399
|
end
|
243
400
|
end
|
244
401
|
sdf
|
@@ -248,7 +405,7 @@ print sdf
|
|
248
405
|
end
|
249
406
|
|
250
407
|
def obconversion(identifier,input_format,output_format,option=nil)
|
251
|
-
self.class.obconversion(identifier,input_format,output_format,option
|
408
|
+
self.class.obconversion(identifier,input_format,output_format,option)
|
252
409
|
end
|
253
410
|
end
|
254
411
|
end
|