lazar 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 56f51ab78b66037e55ff41d7515b0c4bc3876481
4
- data.tar.gz: 893b5f4827406df36ff6abc186767889e4b2cb6c
3
+ metadata.gz: 1b22cad0ba1ecef02ff4af283796fcb36cbe758f
4
+ data.tar.gz: 49bd9a98d7c24ff2b7d1442d58d0b775aaf62e74
5
5
  SHA512:
6
- metadata.gz: b0d402841c42990b7d2a3d8efcbb9c3c7e1839939ad61774a906d289d5a0c7a33277833827175eb006d922f13da24d7c489aaba5e9c25b967dc6ea18964d9333
7
- data.tar.gz: 2242413832ffe15e2ec4bcbb8bf33a0fe126e365d163fe55c804bcd6dc3741ae6f0058dd3c39b7a70121a82e81586b190787dcce96fc504bc1e5aae32af3ec10
6
+ metadata.gz: 96bd32e2b21abfb827a5cfa10ee520a1c06158876d4fb6238da63b79a785137fcc587aa78f40c8ec03b708e83a520c0cd0192c0795f4df34dbf05ebc21677a3c
7
+ data.tar.gz: c54ea1804b359da06a32b6c4a8314cc329cd173aa8defbdd7adc63c46230c4c75cf7489beecdb5d3290b6892352778e996cf6828d4ac576ef4082e1ef6c93a46
data/.gitignore CHANGED
@@ -1,5 +1,7 @@
1
1
  last-utils
2
2
  libfminer
3
+ openbabel
4
+ fminer_debug.txt
3
5
  test/fminer_debug.txt
4
6
  Gemfile.lock
5
7
  *.gem
@@ -8,3 +10,4 @@ pkg/*
8
10
  *~
9
11
  .yardoc/
10
12
  doc/
13
+ lazar.log
data/README.md CHANGED
@@ -8,7 +8,7 @@ Dependencies
8
8
 
9
9
  lazar depends on a couple of external programs and libraries. On Debian 7 "Wheezy" systems you can install them with
10
10
 
11
- `sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev r-cran-rserve openjdk-7-jre libgsl0-dev libxml2-dev zlib1g-dev libcairo2-dev`
11
+ `sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev openjdk-7-jre libgsl0-dev libxml2-dev zlib1g-dev libcairo2-dev`
12
12
 
13
13
  You will also need at least mongodb version 3.0, but Debian "Wheezy" provides version 2.4. Please follow the instructions at http://docs.mongodb.org/manual/tutorial/install-mongodb-on-debian/:
14
14
 
@@ -30,6 +30,7 @@ Installation
30
30
  git clone https://github.com/opentox/lazar.git
31
31
  cd lazar
32
32
  ruby ext/lazar/extconf.rb
33
+ sudo Rscript ext/lazar/rinstall.R
33
34
  bundle install
34
35
  ```
35
36
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.7
1
+ 0.0.9
data/ext/lazar/extconf.rb CHANGED
@@ -1,88 +1,27 @@
1
1
  require 'fileutils'
2
2
  require 'rbconfig'
3
+ require 'mkmf'
3
4
 
4
5
  main_dir = File.expand_path(File.join(File.dirname(__FILE__),"..",".."))
5
6
 
6
- # install OpenBabel
7
-
8
-
9
- openbabel_version = "2.3.2"
10
-
11
- openbabel_dir = File.join main_dir, "openbabel"
12
- src_dir = openbabel_dir #File.join openbabel_dir, "openbabel-#{openbabel_version}"
13
- build_dir = File.join src_dir, "build"
14
- install_dir = openbabel_dir
15
- install_lib_dir = File.join install_dir, "lib"
16
- lib_dir = File.join openbabel_dir, "lib", "openbabel"
17
- ruby_src_dir = File.join src_dir, "scripts", "ruby"
18
-
19
- begin
20
- nr_processors = `grep processor /proc/cpuinfo | wc -l` # speed up compilation, Linux only
21
- rescue
22
- nr_processors = 1
7
+ # check for required programs
8
+ programs = ["R","Rscript","mongod","java","getconf"]
9
+ programs.each do |program|
10
+ abort "Please install #{program} on your system." unless find_executable program
23
11
  end
24
12
 
25
- FileUtils.mkdir_p openbabel_dir
26
- Dir.chdir main_dir do
27
- FileUtils.rm_rf src_dir
28
- puts "Downloading OpenBabel sources"
29
- system "git clone https://github.com/openbabel/openbabel.git"
30
- end
13
+ abort "Please install Rserve on your system. Execute 'install.packages('Rserve')' in a R console running as root ('sudo R')." unless `R CMD Rserve --version`.match(/^Rserve/)
31
14
 
32
- FileUtils.mkdir_p build_dir
33
- FileUtils.mkdir_p install_dir
34
- Dir.chdir build_dir do
35
- puts "Configuring OpenBabel"
36
- cmake = "cmake #{src_dir} -DCMAKE_INSTALL_PREFIX=#{install_dir} -DBUILD_GUI=OFF -DENABLE_TESTS=OFF -DRUN_SWIG=ON -DRUBY_BINDINGS=ON"
37
- # set rpath for local installations
38
- # http://www.cmake.org/Wiki/CMake_RPATH_handling
39
- # http://vtk.1045678.n5.nabble.com/How-to-force-cmake-not-to-remove-install-rpath-td5721193.html
40
- cmake += " -DCMAKE_INSTALL_RPATH:STRING=\"#{install_lib_dir}\""
41
- system cmake
42
- end
15
+ # install R packages
16
+ r_dir = File.join main_dir, "R"
17
+ FileUtils.mkdir_p r_dir
18
+ FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
19
+ rinstall = File.expand_path(File.join(File.dirname(__FILE__),"rinstall.R"))
20
+ puts `Rscript --vanilla #{rinstall} #{r_dir}`
43
21
 
44
- # local installation in gem directory
45
- Dir.chdir build_dir do
46
- puts "Compiling OpenBabel sources."
47
- system "make -j#{nr_processors}"
48
- system "make install"
49
- ENV["PKG_CONFIG_PATH"] = File.dirname(File.expand_path(Dir["#{install_dir}/**/openbabel*pc"].first))
22
+ # create a fake Makefile
23
+ File.open(File.join(File.dirname(__FILE__),"Makefile"),"w+") do |makefile|
24
+ makefile.puts "all:\n\ttrue\n\ninstall:\n\ttrue\n"
50
25
  end
51
26
 
52
- ob_include= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/include/openbabel-2.0")
53
- ob_lib= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/lib")
54
-
55
- # compile ruby bindings
56
- =begin
57
- puts "Compiling and installing OpenBabel Ruby bindings."
58
- Dir.chdir ruby_src_dir do
59
- # fix rpath
60
- system "sed -i 's|with_ldflags.*$|with_ldflags(\"#\$LDFLAGS -dynamic -Wl,-rpath,#{install_lib_dir}\") do|' #{File.join(ruby_src_dir,'extconf.rb')}"
61
- system "#{RbConfig.ruby} extconf.rb --with-openbabel-include=#{ob_include} --with-openbabel-lib=#{ob_lib}"
62
- system "make -j#{nr_processors}"
63
- end
64
- =end
65
-
66
- # install fminer
67
- fminer_dir = File.join main_dir, "libfminer"
68
- system "git clone git://github.com/amaunz/fminer2.git #{fminer_dir}"
69
-
70
- ["libbbrc","liblast"].each do |lib|
71
- FileUtils.cd File.join(fminer_dir,lib)
72
- system "sed -i 's,^INCLUDE_OB.*,INCLUDE_OB\ =\ #{ob_include},g' Makefile"
73
- system "sed -i 's,^LDFLAGS_OB.*,LDFLAGS_OB\ =\ #{ob_lib},g' Makefile"
74
- system "sed -i 's,^INCLUDE_RB.*,INCLUDE_RB\ =\ #{RbConfig::CONFIG['rubyhdrdir']},g' Makefile"
75
- # TODO fix in fminer Makefile
76
- system "sed -i 's,-g, -g -I #{RbConfig::CONFIG['rubyhdrdir']} -I #{RbConfig::CONFIG['rubyarchhdrdir']} -I,' Makefile" # fix include path (CH)
77
- system "sed -i '74s/$(CC)/$(CC) -Wl,-rpath,#{ob_lib.gsub('/','\/')} -L/' Makefile" # fix library path (CH)
78
- system "make ruby"
79
- end
80
-
81
- # install last-utils
82
- FileUtils.cd main_dir
83
- system "git clone git://github.com/amaunz/last-utils.git"
84
- FileUtils.cd File.join(main_dir,"last-utils")
85
- `sed -i '8s/"openbabel", //' lu.rb`
86
-
87
- # install R packagemain_dir
88
27
  $makefile_created = true
@@ -0,0 +1,9 @@
1
+ libdir = commandArgs(trailingOnly=TRUE)[1]
2
+ # chooseCRANmirror(ind=19); does not have any impact on selected server
3
+ #args=paste0("--prefix=",libdir,"/..")
4
+ #install.packages("Rserve",lib=libdir,configure.args=args)
5
+ install.packages("gridExtra",lib=libdir);
6
+ install.packages("ggplot2",lib=libdir);
7
+ install.packages("pls",lib=libdir);
8
+ install.packages("caret",lib=libdir);
9
+ install.packages("doMC",lib=libdir);
data/lazar.gemspec CHANGED
@@ -9,20 +9,20 @@ Gem::Specification.new do |s|
9
9
  s.homepage = "http://github.com/opentox/lazar"
10
10
  s.summary = %q{Lazar framework}
11
11
  s.description = %q{Libraries for lazy structure-activity relationships and read-across.}
12
- s.license = 'GPL-3'
12
+ s.license = 'GPL-3.0'
13
13
 
14
14
  s.rubyforge_project = "lazar"
15
-
16
15
  s.files = `git ls-files`.split("\n")
17
16
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
17
  s.extensions = %w[ext/lazar/extconf.rb]
19
18
  s.require_paths = ["lib"]
20
19
 
21
20
  # specify any dependencies here; for example:
22
- s.add_runtime_dependency "bundler"
23
- s.add_runtime_dependency "rest-client"
24
- s.add_runtime_dependency 'nokogiri'
25
- s.add_runtime_dependency 'rserve-client'
26
- s.add_runtime_dependency "mongoid", '~> 5.0beta'
21
+ s.add_runtime_dependency "bundler", "~> 1.11"
22
+ s.add_runtime_dependency "rest-client", "~> 1.8"
23
+ s.add_runtime_dependency 'nokogiri', "~> 1.6"
24
+ s.add_runtime_dependency 'rserve-client', "~> 0.3"
25
+ s.add_runtime_dependency "mongoid", "~> 5.0"
26
+ s.add_runtime_dependency 'openbabel> 2.3.2.2', '~> 0'
27
27
 
28
28
  end
@@ -3,13 +3,14 @@ module OpenTox
3
3
 
4
4
  class Classification
5
5
 
6
- def self.weighted_majority_vote neighbors
7
- return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?
6
+ def self.weighted_majority_vote compound, params
7
+ neighbors = params[:neighbors]
8
8
  weighted_sum = {}
9
9
  sim_sum = 0.0
10
+ confidence = 0.0
10
11
  neighbors.each do |row|
11
- n,sim,acts = row
12
- acts.each do |act|
12
+ sim = row["tanimoto"]
13
+ row["features"][params[:prediction_feature_id].to_s].each do |act|
13
14
  weighted_sum[act] ||= 0
14
15
  weighted_sum[act] += sim
15
16
  end
@@ -27,81 +28,7 @@ module OpenTox
27
28
  bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
28
29
  end
29
30
  end
30
-
31
- # Classification with majority vote from neighbors weighted by similarity
32
- # @param [Hash] params Keys `:activities, :sims, :value_map` are required
33
- # @return [Numeric] A prediction value.
34
- def self.fminer_weighted_majority_vote neighbors, training_dataset
35
-
36
- neighbor_contribution = 0.0
37
- confidence_sum = 0.0
38
-
39
- $logger.debug "Weighted Majority Vote Classification."
40
-
41
- values = neighbors.collect{|n| n[2]}.uniq
42
- neighbors.each do |neighbor|
43
- i = training_dataset.compound_ids.index n.id
44
- neighbor_weight = neighbor[1]
45
- activity = values.index(neighbor[2]) + 1 # map values to integers > 1
46
- neighbor_contribution += activity * neighbor_weight
47
- if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
48
- case activity
49
- when 1
50
- confidence_sum -= neighbor_weight
51
- when 2
52
- confidence_sum += neighbor_weight
53
- end
54
- else
55
- confidence_sum += neighbor_weight
56
- end
57
- end
58
- if values.size == 2
59
- if confidence_sum >= 0.0
60
- prediction = values[1]
61
- elsif confidence_sum < 0.0
62
- prediction = values[0]
63
- end
64
- elsif values.size == 1 # all neighbors have the same value
65
- prediction = values[0]
66
- else
67
- prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
68
- end
69
-
70
- confidence = (confidence_sum/neighbors.size).abs
71
- {:value => prediction, :confidence => confidence.abs}
72
- end
73
-
74
- # Local support vector regression from neighbors
75
- # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
76
- # @return [Numeric] A prediction value.
77
- def self.local_svm_classification(params)
78
-
79
- confidence = 0.0
80
- prediction = nil
81
-
82
- $logger.debug "Local SVM."
83
- if params[:activities].size>0
84
- if params[:props]
85
- n_prop = params[:props][0].collect.to_a
86
- q_prop = params[:props][1].collect.to_a
87
- props = [ n_prop, q_prop ]
88
- end
89
- activities = params[:activities].collect.to_a
90
- activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
91
- prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
92
- prediction = prediction.sub(/Val/,"") if prediction # Convert back
93
- confidence = 0.0 if prediction.nil?
94
- #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')."
95
- confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
96
- end
97
- {:value => prediction, :confidence => confidence}
98
-
99
- end
100
-
101
-
102
-
103
31
  end
104
-
105
32
  end
106
33
  end
107
34
 
data/lib/compound.rb CHANGED
@@ -1,43 +1,122 @@
1
- # TODO: check
2
- # *** Open Babel Error in ParseFile
3
- # Could not find contribution data file.
4
-
5
1
  CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
6
2
 
7
3
  module OpenTox
8
4
 
9
5
  class Compound
6
+ require_relative "unique_descriptors.rb"
10
7
  include OpenTox
11
8
 
9
+ DEFAULT_FINGERPRINT = "MP2D"
10
+
12
11
  field :inchi, type: String
13
12
  field :smiles, type: String
14
13
  field :inchikey, type: String
15
14
  field :names, type: Array
16
- field :warning, type: String
17
15
  field :cid, type: String
18
16
  field :chemblid, type: String
19
17
  field :png_id, type: BSON::ObjectId
20
18
  field :svg_id, type: BSON::ObjectId
21
19
  field :sdf_id, type: BSON::ObjectId
22
- field :fp4, type: Array
23
- field :fp4_size, type: Integer
20
+ field :fingerprints, type: Hash, default: {}
21
+ field :default_fingerprint_size, type: Integer
22
+ field :physchem_descriptors, type: Hash, default: {}
23
+ field :dataset_ids, type: Array, default: []
24
+ field :features, type: Hash, default: {}
25
+
26
+ index({smiles: 1}, {unique: true})
24
27
 
25
28
  # Overwrites standard Mongoid method to create fingerprints before database insertion
26
29
  def self.find_or_create_by params
27
30
  compound = self.find_or_initialize_by params
28
- unless compound.fp4 and !compound.fp4.empty?
29
- compound.fp4_size = 0
30
- compound.fp4 = []
31
- fingerprint = FingerprintSmarts.fingerprint
32
- Algorithm::Descriptor.smarts_match(compound, fingerprint).each_with_index do |m,i|
33
- if m > 0
34
- compound.fp4 << fingerprint[i].id
35
- compound.fp4_size += 1
31
+ compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size
32
+ compound.save
33
+ compound
34
+ end
35
+
36
+ def fingerprint type=DEFAULT_FINGERPRINT
37
+ unless fingerprints[type]
38
+ return [] unless self.smiles
39
+ #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
40
+ if type == "MP2D"
41
+ fp = obconversion(smiles,"smi","mpd").strip.split("\t")
42
+ name = fp.shift # remove Title
43
+ fingerprints[type] = fp.uniq # no fingerprint counts
44
+ #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
45
+ elsif type== "MNA"
46
+ level = 2 # TODO: level as parameter, evaluate level 1, see paper
47
+ fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
48
+ fp.shift # remove Title
49
+ fingerprints[type] = fp
50
+ else # standard fingerprints
51
+ fp = OpenBabel::OBFingerprint.find_fingerprint(type)
52
+ obmol = OpenBabel::OBMol.new
53
+ obconversion = OpenBabel::OBConversion.new
54
+ obconversion.set_in_format "smi"
55
+ obconversion.read_string obmol, self.smiles
56
+ result = OpenBabel::VectorUnsignedInt.new
57
+ fp.get_fingerprint(obmol,result)
58
+ # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
59
+ #p OpenBabel::OBFingerprint.describe_bits(result)
60
+ # convert result to a list of the bits that are set
61
+ # from openbabel/scripts/python/pybel.py line 830
62
+ # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
63
+ result = result.to_a
64
+ bitsperint = OpenBabel::OBFingerprint.getbitsperint()
65
+ bits_set = []
66
+ start = 1
67
+ result.each do |x|
68
+ i = start
69
+ while x > 0 do
70
+ bits_set << i if (x % 2) == 1
71
+ x >>= 1
72
+ i += 1
73
+ end
74
+ start += bitsperint
36
75
  end
76
+ fingerprints[type] = bits_set
37
77
  end
78
+ save
79
+ end
80
+ fingerprints[type]
81
+ end
82
+
83
+ def physchem descriptors=PhysChem.openbabel_descriptors
84
+ # TODO: speedup java descriptors
85
+ calculated_ids = physchem_descriptors.keys
86
+ # BSON::ObjectId instances are not allowed as keys in a BSON document.
87
+ new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
88
+ descs = {}
89
+ algos = {}
90
+ new_ids.each do |id|
91
+ descriptor = PhysChem.find id
92
+ descs[[descriptor.library, descriptor.descriptor]] = descriptor
93
+ algos[descriptor.name] = descriptor
94
+ end
95
+ # avoid recalculating Cdk features with multiple values
96
+ descs.keys.uniq.each do |k|
97
+ descs[k].send(k[0].downcase,k[1],self).each do |n,v|
98
+ physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
99
+ end
100
+ end
101
+ save
102
+ physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
103
+ end
104
+
105
+ def smarts_match smarts, count=false
106
+ obconversion = OpenBabel::OBConversion.new
107
+ obmol = OpenBabel::OBMol.new
108
+ obconversion.set_in_format('smi')
109
+ obconversion.read_string(obmol,self.smiles)
110
+ smarts_pattern = OpenBabel::OBSmartsPattern.new
111
+ smarts.collect do |sma|
112
+ smarts_pattern.init(sma.smarts)
113
+ if smarts_pattern.match(obmol)
114
+ count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
115
+ else
116
+ value = 0
117
+ end
118
+ value
38
119
  end
39
- compound.save
40
- compound
41
120
  end
42
121
 
43
122
  # Create a compound from smiles string
@@ -46,11 +125,16 @@ module OpenTox
46
125
  # @param [String] smiles Smiles string
47
126
  # @return [OpenTox::Compound] Compound
48
127
  def self.from_smiles smiles
49
- smiles = obconversion(smiles,"smi","can")
128
+ if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
129
+ $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
130
+ return nil
131
+ end
132
+ smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
50
133
  if smiles.empty?
51
- Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
134
+ $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string."
135
+ return nil
52
136
  else
53
- Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can")
137
+ Compound.find_or_create_by :smiles => smiles
54
138
  end
55
139
  end
56
140
 
@@ -64,7 +148,7 @@ module OpenTox
64
148
  #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
65
149
  smiles = obconversion(inchi,"inchi","can")
66
150
  if smiles.empty?
67
- Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
151
+ Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."])
68
152
  else
69
153
  Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
70
154
  end
@@ -94,7 +178,7 @@ module OpenTox
94
178
 
95
179
  result = obconversion(smiles,"smi","inchi")
96
180
  #result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp
97
- update(:inchi => result.chomp) unless result.empty?
181
+ update(:inchi => result.chomp) if result and !result.empty?
98
182
  end
99
183
  self["inchi"]
100
184
  end
@@ -131,7 +215,7 @@ module OpenTox
131
215
  if self.svg_id.nil?
132
216
  svg = obconversion(smiles,"smi","svg")
133
217
  file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
134
- update(:image_id => $gridfs.insert_one(file))
218
+ update(:svg_id => $gridfs.insert_one(file))
135
219
  end
136
220
  $gridfs.find_one(_id: self.svg_id).data
137
221
 
@@ -175,32 +259,111 @@ module OpenTox
175
259
  self["chemblid"]
176
260
  end
177
261
 
178
- def neighbors threshold=0.7
262
+ def fingerprint_count_neighbors params
263
+ # TODO fix
264
+ neighbors = []
265
+ query_fingerprint = self.fingerprint params[:type]
266
+ training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
267
+ unless self == compound
268
+ candidate_fingerprint = compound.fingerprint params[:type]
269
+ features = (query_fingerprint + candidate_fingerprint).uniq
270
+ min_sum = 0
271
+ max_sum = 0
272
+ features.each do |f|
273
+ min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
274
+ min_sum += min
275
+ max_sum += max
276
+ end
277
+ max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
278
+ neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
279
+ end
280
+ end
281
+ neighbors.sort{|a,b| b.last <=> a.last}
282
+ end
283
+
284
+ def fingerprint_neighbors params
285
+ bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
286
+ neighbors = []
287
+ if params[:type] == DEFAULT_FINGERPRINT
288
+ neighbors = db_neighbors params
289
+ else
290
+ query_fingerprint = self.fingerprint params[:type]
291
+ training_dataset = Dataset.find(params[:training_dataset_id])
292
+ prediction_feature = training_dataset.features.first
293
+ training_dataset.compounds.each do |compound|
294
+ candidate_fingerprint = compound.fingerprint params[:type]
295
+ sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
296
+ feature_values = training_dataset.values(compound,prediction_feature)
297
+ neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
298
+ end
299
+ neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
300
+ end
301
+ neighbors
302
+ end
303
+
304
+ def physchem_neighbors params
305
+ feature_dataset = Dataset.find params[:feature_dataset_id]
306
+ query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
307
+ neighbors = []
308
+ feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
309
+ # TODO implement pearson and cosine similarity separately
310
+ R.assign "x", query_fingerprint
311
+ R.assign "y", candidate_fingerprint
312
+ sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
313
+ if sim >= params[:min_sim]
314
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
315
+ end
316
+ end
317
+ neighbors
318
+ end
319
+
320
+ def db_neighbors params
179
321
  # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
180
- qn = fp4.size
322
+
323
+ #qn = default_fingerprint_size
181
324
  #qmin = qn * threshold
182
325
  #qmax = qn / threshold
183
326
  #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
184
327
  #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
185
328
  aggregate = [
186
329
  #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
187
- {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
330
+ #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
188
331
  {'$project' => {
189
332
  'tanimoto' => {'$let' => {
190
- 'vars' => {'common' => {'$size' => {'$setIntersection' => ['$fp4', fp4]}}},
191
- 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$fp4_size']}, '$$common']}]}
333
+ 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
334
+ #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
335
+ 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
192
336
  }},
193
- '_id' => 1
337
+ '_id' => 1,
338
+ 'features' => 1,
339
+ 'dataset_ids' => 1
194
340
  }},
195
- {'$match' => {'tanimoto' => {'$gte' => threshold}}},
341
+ {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
196
342
  {'$sort' => {'tanimoto' => -1}}
197
343
  ]
198
344
 
199
- $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
345
+ $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
200
346
 
201
347
  end
202
- =begin
203
- =end
348
+
349
+ # Convert mmol to mg
350
+ # @return [Float] value in mg
351
+ def mmol_to_mg mmol
352
+ mmol.to_f*molecular_weight
353
+ end
354
+
355
+ # Convert mg to mmol
356
+ # @return [Float] value in mmol
357
+ def mg_to_mmol mg
358
+ mg.to_f/molecular_weight
359
+ end
360
+
361
+ # Calculate molecular weight of Compound with OB and store it in object
362
+ # @return [Float] molecular weight
363
+ def molecular_weight
364
+ mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
365
+ physchem([mw_feature])[mw_feature.id.to_s]
366
+ end
204
367
 
205
368
  private
206
369
 
@@ -209,17 +372,12 @@ module OpenTox
209
372
  obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
210
373
  obmol = OpenBabel::OBMol.new
211
374
  obconversion.set_in_and_out_formats input_format, output_format
375
+ return nil if identifier.nil?
212
376
  obconversion.read_string obmol, identifier
213
377
  case output_format
214
378
  when /smi|can|inchi/
215
379
  obconversion.write_string(obmol).gsub(/\s/,'').chomp
216
380
  when /sdf/
217
- p "SDF conversion"
218
- # has no effect
219
- #obconversion.add_option("gen3D", OpenBabel::OBConversion::GENOPTIONS)
220
- # segfaults with openbabel git master
221
- #OpenBabel::OBOp.find_type("Gen3D").do(obmol)
222
-
223
381
  # TODO: find disconnected structures
224
382
  # strip_salts
225
383
  # separate
@@ -231,14 +389,13 @@ p "SDF conversion"
231
389
  print sdf
232
390
  if sdf.match(/.nan/)
233
391
 
234
- # TODO: fix or eliminate 2d generation
235
392
  $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
236
393
  obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
237
- #OpenBabel::OBOp.find_type("Gen2D").do(obmol)
238
394
  sdf = obconversion.write_string(obmol)
239
395
  if sdf.match(/.nan/)
240
- $logger.warn "2D generation failed for compound #{identifier}"
241
- sdf = nil
396
+ $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates."
397
+ obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS)
398
+ sdf = obconversion.write_string(obmol)
242
399
  end
243
400
  end
244
401
  sdf
@@ -248,7 +405,7 @@ print sdf
248
405
  end
249
406
 
250
407
  def obconversion(identifier,input_format,output_format,option=nil)
251
- self.class.obconversion(identifier,input_format,output_format,option=nil)
408
+ self.class.obconversion(identifier,input_format,output_format,option)
252
409
  end
253
410
  end
254
411
  end