lazar 0.0.7 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 56f51ab78b66037e55ff41d7515b0c4bc3876481
4
- data.tar.gz: 893b5f4827406df36ff6abc186767889e4b2cb6c
3
+ metadata.gz: 1b22cad0ba1ecef02ff4af283796fcb36cbe758f
4
+ data.tar.gz: 49bd9a98d7c24ff2b7d1442d58d0b775aaf62e74
5
5
  SHA512:
6
- metadata.gz: b0d402841c42990b7d2a3d8efcbb9c3c7e1839939ad61774a906d289d5a0c7a33277833827175eb006d922f13da24d7c489aaba5e9c25b967dc6ea18964d9333
7
- data.tar.gz: 2242413832ffe15e2ec4bcbb8bf33a0fe126e365d163fe55c804bcd6dc3741ae6f0058dd3c39b7a70121a82e81586b190787dcce96fc504bc1e5aae32af3ec10
6
+ metadata.gz: 96bd32e2b21abfb827a5cfa10ee520a1c06158876d4fb6238da63b79a785137fcc587aa78f40c8ec03b708e83a520c0cd0192c0795f4df34dbf05ebc21677a3c
7
+ data.tar.gz: c54ea1804b359da06a32b6c4a8314cc329cd173aa8defbdd7adc63c46230c4c75cf7489beecdb5d3290b6892352778e996cf6828d4ac576ef4082e1ef6c93a46
data/.gitignore CHANGED
@@ -1,5 +1,7 @@
1
1
  last-utils
2
2
  libfminer
3
+ openbabel
4
+ fminer_debug.txt
3
5
  test/fminer_debug.txt
4
6
  Gemfile.lock
5
7
  *.gem
@@ -8,3 +10,4 @@ pkg/*
8
10
  *~
9
11
  .yardoc/
10
12
  doc/
13
+ lazar.log
data/README.md CHANGED
@@ -8,7 +8,7 @@ Dependencies
8
8
 
9
9
  lazar depends on a couple of external programs and libraries. On Debian 7 "Wheezy" systems you can install them with
10
10
 
11
- `sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev r-cran-rserve openjdk-7-jre libgsl0-dev libxml2-dev zlib1g-dev libcairo2-dev`
11
+ `sudo apt-get install build-essential ruby ruby-dev git cmake swig r-base r-base-dev openjdk-7-jre libgsl0-dev libxml2-dev zlib1g-dev libcairo2-dev`
12
12
 
13
13
  You will also need at least mongodb version 3.0, but Debian "Wheezy" provides version 2.4. Please follow the instructions at http://docs.mongodb.org/manual/tutorial/install-mongodb-on-debian/:
14
14
 
@@ -30,6 +30,7 @@ Installation
30
30
  git clone https://github.com/opentox/lazar.git
31
31
  cd lazar
32
32
  ruby ext/lazar/extconf.rb
33
+ sudo Rscript ext/lazar/rinstall.R
33
34
  bundle install
34
35
  ```
35
36
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.7
1
+ 0.0.9
data/ext/lazar/extconf.rb CHANGED
@@ -1,88 +1,27 @@
1
1
  require 'fileutils'
2
2
  require 'rbconfig'
3
+ require 'mkmf'
3
4
 
4
5
  main_dir = File.expand_path(File.join(File.dirname(__FILE__),"..",".."))
5
6
 
6
- # install OpenBabel
7
-
8
-
9
- openbabel_version = "2.3.2"
10
-
11
- openbabel_dir = File.join main_dir, "openbabel"
12
- src_dir = openbabel_dir #File.join openbabel_dir, "openbabel-#{openbabel_version}"
13
- build_dir = File.join src_dir, "build"
14
- install_dir = openbabel_dir
15
- install_lib_dir = File.join install_dir, "lib"
16
- lib_dir = File.join openbabel_dir, "lib", "openbabel"
17
- ruby_src_dir = File.join src_dir, "scripts", "ruby"
18
-
19
- begin
20
- nr_processors = `grep processor /proc/cpuinfo | wc -l` # speed up compilation, Linux only
21
- rescue
22
- nr_processors = 1
7
+ # check for required programs
8
+ programs = ["R","Rscript","mongod","java","getconf"]
9
+ programs.each do |program|
10
+ abort "Please install #{program} on your system." unless find_executable program
23
11
  end
24
12
 
25
- FileUtils.mkdir_p openbabel_dir
26
- Dir.chdir main_dir do
27
- FileUtils.rm_rf src_dir
28
- puts "Downloading OpenBabel sources"
29
- system "git clone https://github.com/openbabel/openbabel.git"
30
- end
13
+ abort "Please install Rserve on your system. Execute 'install.packages('Rserve')' in a R console running as root ('sudo R')." unless `R CMD Rserve --version`.match(/^Rserve/)
31
14
 
32
- FileUtils.mkdir_p build_dir
33
- FileUtils.mkdir_p install_dir
34
- Dir.chdir build_dir do
35
- puts "Configuring OpenBabel"
36
- cmake = "cmake #{src_dir} -DCMAKE_INSTALL_PREFIX=#{install_dir} -DBUILD_GUI=OFF -DENABLE_TESTS=OFF -DRUN_SWIG=ON -DRUBY_BINDINGS=ON"
37
- # set rpath for local installations
38
- # http://www.cmake.org/Wiki/CMake_RPATH_handling
39
- # http://vtk.1045678.n5.nabble.com/How-to-force-cmake-not-to-remove-install-rpath-td5721193.html
40
- cmake += " -DCMAKE_INSTALL_RPATH:STRING=\"#{install_lib_dir}\""
41
- system cmake
42
- end
15
+ # install R packages
16
+ r_dir = File.join main_dir, "R"
17
+ FileUtils.mkdir_p r_dir
18
+ FileUtils.mkdir_p File.join(main_dir,"bin") # for Rserve binary
19
+ rinstall = File.expand_path(File.join(File.dirname(__FILE__),"rinstall.R"))
20
+ puts `Rscript --vanilla #{rinstall} #{r_dir}`
43
21
 
44
- # local installation in gem directory
45
- Dir.chdir build_dir do
46
- puts "Compiling OpenBabel sources."
47
- system "make -j#{nr_processors}"
48
- system "make install"
49
- ENV["PKG_CONFIG_PATH"] = File.dirname(File.expand_path(Dir["#{install_dir}/**/openbabel*pc"].first))
22
+ # create a fake Makefile
23
+ File.open(File.join(File.dirname(__FILE__),"Makefile"),"w+") do |makefile|
24
+ makefile.puts "all:\n\ttrue\n\ninstall:\n\ttrue\n"
50
25
  end
51
26
 
52
- ob_include= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/include/openbabel-2.0")
53
- ob_lib= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/lib")
54
-
55
- # compile ruby bindings
56
- =begin
57
- puts "Compiling and installing OpenBabel Ruby bindings."
58
- Dir.chdir ruby_src_dir do
59
- # fix rpath
60
- system "sed -i 's|with_ldflags.*$|with_ldflags(\"#\$LDFLAGS -dynamic -Wl,-rpath,#{install_lib_dir}\") do|' #{File.join(ruby_src_dir,'extconf.rb')}"
61
- system "#{RbConfig.ruby} extconf.rb --with-openbabel-include=#{ob_include} --with-openbabel-lib=#{ob_lib}"
62
- system "make -j#{nr_processors}"
63
- end
64
- =end
65
-
66
- # install fminer
67
- fminer_dir = File.join main_dir, "libfminer"
68
- system "git clone git://github.com/amaunz/fminer2.git #{fminer_dir}"
69
-
70
- ["libbbrc","liblast"].each do |lib|
71
- FileUtils.cd File.join(fminer_dir,lib)
72
- system "sed -i 's,^INCLUDE_OB.*,INCLUDE_OB\ =\ #{ob_include},g' Makefile"
73
- system "sed -i 's,^LDFLAGS_OB.*,LDFLAGS_OB\ =\ #{ob_lib},g' Makefile"
74
- system "sed -i 's,^INCLUDE_RB.*,INCLUDE_RB\ =\ #{RbConfig::CONFIG['rubyhdrdir']},g' Makefile"
75
- # TODO fix in fminer Makefile
76
- system "sed -i 's,-g, -g -I #{RbConfig::CONFIG['rubyhdrdir']} -I #{RbConfig::CONFIG['rubyarchhdrdir']} -I,' Makefile" # fix include path (CH)
77
- system "sed -i '74s/$(CC)/$(CC) -Wl,-rpath,#{ob_lib.gsub('/','\/')} -L/' Makefile" # fix library path (CH)
78
- system "make ruby"
79
- end
80
-
81
- # install last-utils
82
- FileUtils.cd main_dir
83
- system "git clone git://github.com/amaunz/last-utils.git"
84
- FileUtils.cd File.join(main_dir,"last-utils")
85
- `sed -i '8s/"openbabel", //' lu.rb`
86
-
87
- # install R packagemain_dir
88
27
  $makefile_created = true
@@ -0,0 +1,9 @@
1
+ libdir = commandArgs(trailingOnly=TRUE)[1]
2
+ # chooseCRANmirror(ind=19); does not have any impact on selected server
3
+ #args=paste0("--prefix=",libdir,"/..")
4
+ #install.packages("Rserve",lib=libdir,configure.args=args)
5
+ install.packages("gridExtra",lib=libdir);
6
+ install.packages("ggplot2",lib=libdir);
7
+ install.packages("pls",lib=libdir);
8
+ install.packages("caret",lib=libdir);
9
+ install.packages("doMC",lib=libdir);
data/lazar.gemspec CHANGED
@@ -9,20 +9,20 @@ Gem::Specification.new do |s|
9
9
  s.homepage = "http://github.com/opentox/lazar"
10
10
  s.summary = %q{Lazar framework}
11
11
  s.description = %q{Libraries for lazy structure-activity relationships and read-across.}
12
- s.license = 'GPL-3'
12
+ s.license = 'GPL-3.0'
13
13
 
14
14
  s.rubyforge_project = "lazar"
15
-
16
15
  s.files = `git ls-files`.split("\n")
17
16
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
17
  s.extensions = %w[ext/lazar/extconf.rb]
19
18
  s.require_paths = ["lib"]
20
19
 
21
20
  # specify any dependencies here; for example:
22
- s.add_runtime_dependency "bundler"
23
- s.add_runtime_dependency "rest-client"
24
- s.add_runtime_dependency 'nokogiri'
25
- s.add_runtime_dependency 'rserve-client'
26
- s.add_runtime_dependency "mongoid", '~> 5.0beta'
21
+ s.add_runtime_dependency "bundler", "~> 1.11"
22
+ s.add_runtime_dependency "rest-client", "~> 1.8"
23
+ s.add_runtime_dependency 'nokogiri', "~> 1.6"
24
+ s.add_runtime_dependency 'rserve-client', "~> 0.3"
25
+ s.add_runtime_dependency "mongoid", "~> 5.0"
26
+ s.add_runtime_dependency 'openbabel> 2.3.2.2', '~> 0'
27
27
 
28
28
  end
@@ -3,13 +3,14 @@ module OpenTox
3
3
 
4
4
  class Classification
5
5
 
6
- def self.weighted_majority_vote neighbors
7
- return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?
6
+ def self.weighted_majority_vote compound, params
7
+ neighbors = params[:neighbors]
8
8
  weighted_sum = {}
9
9
  sim_sum = 0.0
10
+ confidence = 0.0
10
11
  neighbors.each do |row|
11
- n,sim,acts = row
12
- acts.each do |act|
12
+ sim = row["tanimoto"]
13
+ row["features"][params[:prediction_feature_id].to_s].each do |act|
13
14
  weighted_sum[act] ||= 0
14
15
  weighted_sum[act] += sim
15
16
  end
@@ -27,81 +28,7 @@ module OpenTox
27
28
  bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
28
29
  end
29
30
  end
30
-
31
- # Classification with majority vote from neighbors weighted by similarity
32
- # @param [Hash] params Keys `:activities, :sims, :value_map` are required
33
- # @return [Numeric] A prediction value.
34
- def self.fminer_weighted_majority_vote neighbors, training_dataset
35
-
36
- neighbor_contribution = 0.0
37
- confidence_sum = 0.0
38
-
39
- $logger.debug "Weighted Majority Vote Classification."
40
-
41
- values = neighbors.collect{|n| n[2]}.uniq
42
- neighbors.each do |neighbor|
43
- i = training_dataset.compound_ids.index n.id
44
- neighbor_weight = neighbor[1]
45
- activity = values.index(neighbor[2]) + 1 # map values to integers > 1
46
- neighbor_contribution += activity * neighbor_weight
47
- if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
48
- case activity
49
- when 1
50
- confidence_sum -= neighbor_weight
51
- when 2
52
- confidence_sum += neighbor_weight
53
- end
54
- else
55
- confidence_sum += neighbor_weight
56
- end
57
- end
58
- if values.size == 2
59
- if confidence_sum >= 0.0
60
- prediction = values[1]
61
- elsif confidence_sum < 0.0
62
- prediction = values[0]
63
- end
64
- elsif values.size == 1 # all neighbors have the same value
65
- prediction = values[0]
66
- else
67
- prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
68
- end
69
-
70
- confidence = (confidence_sum/neighbors.size).abs
71
- {:value => prediction, :confidence => confidence.abs}
72
- end
73
-
74
- # Local support vector regression from neighbors
75
- # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
76
- # @return [Numeric] A prediction value.
77
- def self.local_svm_classification(params)
78
-
79
- confidence = 0.0
80
- prediction = nil
81
-
82
- $logger.debug "Local SVM."
83
- if params[:activities].size>0
84
- if params[:props]
85
- n_prop = params[:props][0].collect.to_a
86
- q_prop = params[:props][1].collect.to_a
87
- props = [ n_prop, q_prop ]
88
- end
89
- activities = params[:activities].collect.to_a
90
- activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
91
- prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
92
- prediction = prediction.sub(/Val/,"") if prediction # Convert back
93
- confidence = 0.0 if prediction.nil?
94
- #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')."
95
- confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
96
- end
97
- {:value => prediction, :confidence => confidence}
98
-
99
- end
100
-
101
-
102
-
103
31
  end
104
-
105
32
  end
106
33
  end
107
34
 
data/lib/compound.rb CHANGED
@@ -1,43 +1,122 @@
1
- # TODO: check
2
- # *** Open Babel Error in ParseFile
3
- # Could not find contribution data file.
4
-
5
1
  CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
6
2
 
7
3
  module OpenTox
8
4
 
9
5
  class Compound
6
+ require_relative "unique_descriptors.rb"
10
7
  include OpenTox
11
8
 
9
+ DEFAULT_FINGERPRINT = "MP2D"
10
+
12
11
  field :inchi, type: String
13
12
  field :smiles, type: String
14
13
  field :inchikey, type: String
15
14
  field :names, type: Array
16
- field :warning, type: String
17
15
  field :cid, type: String
18
16
  field :chemblid, type: String
19
17
  field :png_id, type: BSON::ObjectId
20
18
  field :svg_id, type: BSON::ObjectId
21
19
  field :sdf_id, type: BSON::ObjectId
22
- field :fp4, type: Array
23
- field :fp4_size, type: Integer
20
+ field :fingerprints, type: Hash, default: {}
21
+ field :default_fingerprint_size, type: Integer
22
+ field :physchem_descriptors, type: Hash, default: {}
23
+ field :dataset_ids, type: Array, default: []
24
+ field :features, type: Hash, default: {}
25
+
26
+ index({smiles: 1}, {unique: true})
24
27
 
25
28
  # Overwrites standard Mongoid method to create fingerprints before database insertion
26
29
  def self.find_or_create_by params
27
30
  compound = self.find_or_initialize_by params
28
- unless compound.fp4 and !compound.fp4.empty?
29
- compound.fp4_size = 0
30
- compound.fp4 = []
31
- fingerprint = FingerprintSmarts.fingerprint
32
- Algorithm::Descriptor.smarts_match(compound, fingerprint).each_with_index do |m,i|
33
- if m > 0
34
- compound.fp4 << fingerprint[i].id
35
- compound.fp4_size += 1
31
+ compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size
32
+ compound.save
33
+ compound
34
+ end
35
+
36
+ def fingerprint type=DEFAULT_FINGERPRINT
37
+ unless fingerprints[type]
38
+ return [] unless self.smiles
39
+ #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
40
+ if type == "MP2D"
41
+ fp = obconversion(smiles,"smi","mpd").strip.split("\t")
42
+ name = fp.shift # remove Title
43
+ fingerprints[type] = fp.uniq # no fingerprint counts
44
+ #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
45
+ elsif type== "MNA"
46
+ level = 2 # TODO: level as parameter, evaluate level 1, see paper
47
+ fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
48
+ fp.shift # remove Title
49
+ fingerprints[type] = fp
50
+ else # standard fingerprints
51
+ fp = OpenBabel::OBFingerprint.find_fingerprint(type)
52
+ obmol = OpenBabel::OBMol.new
53
+ obconversion = OpenBabel::OBConversion.new
54
+ obconversion.set_in_format "smi"
55
+ obconversion.read_string obmol, self.smiles
56
+ result = OpenBabel::VectorUnsignedInt.new
57
+ fp.get_fingerprint(obmol,result)
58
+ # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
59
+ #p OpenBabel::OBFingerprint.describe_bits(result)
60
+ # convert result to a list of the bits that are set
61
+ # from openbabel/scripts/python/pybel.py line 830
62
+ # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
63
+ result = result.to_a
64
+ bitsperint = OpenBabel::OBFingerprint.getbitsperint()
65
+ bits_set = []
66
+ start = 1
67
+ result.each do |x|
68
+ i = start
69
+ while x > 0 do
70
+ bits_set << i if (x % 2) == 1
71
+ x >>= 1
72
+ i += 1
73
+ end
74
+ start += bitsperint
36
75
  end
76
+ fingerprints[type] = bits_set
37
77
  end
78
+ save
79
+ end
80
+ fingerprints[type]
81
+ end
82
+
83
+ def physchem descriptors=PhysChem.openbabel_descriptors
84
+ # TODO: speedup java descriptors
85
+ calculated_ids = physchem_descriptors.keys
86
+ # BSON::ObjectId instances are not allowed as keys in a BSON document.
87
+ new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
88
+ descs = {}
89
+ algos = {}
90
+ new_ids.each do |id|
91
+ descriptor = PhysChem.find id
92
+ descs[[descriptor.library, descriptor.descriptor]] = descriptor
93
+ algos[descriptor.name] = descriptor
94
+ end
95
+ # avoid recalculating Cdk features with multiple values
96
+ descs.keys.uniq.each do |k|
97
+ descs[k].send(k[0].downcase,k[1],self).each do |n,v|
98
+ physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
99
+ end
100
+ end
101
+ save
102
+ physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
103
+ end
104
+
105
+ def smarts_match smarts, count=false
106
+ obconversion = OpenBabel::OBConversion.new
107
+ obmol = OpenBabel::OBMol.new
108
+ obconversion.set_in_format('smi')
109
+ obconversion.read_string(obmol,self.smiles)
110
+ smarts_pattern = OpenBabel::OBSmartsPattern.new
111
+ smarts.collect do |sma|
112
+ smarts_pattern.init(sma.smarts)
113
+ if smarts_pattern.match(obmol)
114
+ count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
115
+ else
116
+ value = 0
117
+ end
118
+ value
38
119
  end
39
- compound.save
40
- compound
41
120
  end
42
121
 
43
122
  # Create a compound from smiles string
@@ -46,11 +125,16 @@ module OpenTox
46
125
  # @param [String] smiles Smiles string
47
126
  # @return [OpenTox::Compound] Compound
48
127
  def self.from_smiles smiles
49
- smiles = obconversion(smiles,"smi","can")
128
+ if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
129
+ $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
130
+ return nil
131
+ end
132
+ smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
50
133
  if smiles.empty?
51
- Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
134
+ $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string."
135
+ return nil
52
136
  else
53
- Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can")
137
+ Compound.find_or_create_by :smiles => smiles
54
138
  end
55
139
  end
56
140
 
@@ -64,7 +148,7 @@ module OpenTox
64
148
  #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
65
149
  smiles = obconversion(inchi,"inchi","can")
66
150
  if smiles.empty?
67
- Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
151
+ Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."])
68
152
  else
69
153
  Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
70
154
  end
@@ -94,7 +178,7 @@ module OpenTox
94
178
 
95
179
  result = obconversion(smiles,"smi","inchi")
96
180
  #result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp
97
- update(:inchi => result.chomp) unless result.empty?
181
+ update(:inchi => result.chomp) if result and !result.empty?
98
182
  end
99
183
  self["inchi"]
100
184
  end
@@ -131,7 +215,7 @@ module OpenTox
131
215
  if self.svg_id.nil?
132
216
  svg = obconversion(smiles,"smi","svg")
133
217
  file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
134
- update(:image_id => $gridfs.insert_one(file))
218
+ update(:svg_id => $gridfs.insert_one(file))
135
219
  end
136
220
  $gridfs.find_one(_id: self.svg_id).data
137
221
 
@@ -175,32 +259,111 @@ module OpenTox
175
259
  self["chemblid"]
176
260
  end
177
261
 
178
- def neighbors threshold=0.7
262
+ def fingerprint_count_neighbors params
263
+ # TODO fix
264
+ neighbors = []
265
+ query_fingerprint = self.fingerprint params[:type]
266
+ training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
267
+ unless self == compound
268
+ candidate_fingerprint = compound.fingerprint params[:type]
269
+ features = (query_fingerprint + candidate_fingerprint).uniq
270
+ min_sum = 0
271
+ max_sum = 0
272
+ features.each do |f|
273
+ min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
274
+ min_sum += min
275
+ max_sum += max
276
+ end
277
+ max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
278
+ neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
279
+ end
280
+ end
281
+ neighbors.sort{|a,b| b.last <=> a.last}
282
+ end
283
+
284
+ def fingerprint_neighbors params
285
+ bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
286
+ neighbors = []
287
+ if params[:type] == DEFAULT_FINGERPRINT
288
+ neighbors = db_neighbors params
289
+ else
290
+ query_fingerprint = self.fingerprint params[:type]
291
+ training_dataset = Dataset.find(params[:training_dataset_id])
292
+ prediction_feature = training_dataset.features.first
293
+ training_dataset.compounds.each do |compound|
294
+ candidate_fingerprint = compound.fingerprint params[:type]
295
+ sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
296
+ feature_values = training_dataset.values(compound,prediction_feature)
297
+ neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
298
+ end
299
+ neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
300
+ end
301
+ neighbors
302
+ end
303
+
304
+ def physchem_neighbors params
305
+ feature_dataset = Dataset.find params[:feature_dataset_id]
306
+ query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
307
+ neighbors = []
308
+ feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
309
+ # TODO implement pearson and cosine similarity separately
310
+ R.assign "x", query_fingerprint
311
+ R.assign "y", candidate_fingerprint
312
+ sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
313
+ if sim >= params[:min_sim]
314
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
315
+ end
316
+ end
317
+ neighbors
318
+ end
319
+
320
+ def db_neighbors params
179
321
  # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
180
- qn = fp4.size
322
+
323
+ #qn = default_fingerprint_size
181
324
  #qmin = qn * threshold
182
325
  #qmax = qn / threshold
183
326
  #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
184
327
  #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
185
328
  aggregate = [
186
329
  #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
187
- {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
330
+ #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
188
331
  {'$project' => {
189
332
  'tanimoto' => {'$let' => {
190
- 'vars' => {'common' => {'$size' => {'$setIntersection' => ['$fp4', fp4]}}},
191
- 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$fp4_size']}, '$$common']}]}
333
+ 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
334
+ #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
335
+ 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
192
336
  }},
193
- '_id' => 1
337
+ '_id' => 1,
338
+ 'features' => 1,
339
+ 'dataset_ids' => 1
194
340
  }},
195
- {'$match' => {'tanimoto' => {'$gte' => threshold}}},
341
+ {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
196
342
  {'$sort' => {'tanimoto' => -1}}
197
343
  ]
198
344
 
199
- $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
345
+ $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
200
346
 
201
347
  end
202
- =begin
203
- =end
348
+
349
+ # Convert mmol to mg
350
+ # @return [Float] value in mg
351
+ def mmol_to_mg mmol
352
+ mmol.to_f*molecular_weight
353
+ end
354
+
355
+ # Convert mg to mmol
356
+ # @return [Float] value in mmol
357
+ def mg_to_mmol mg
358
+ mg.to_f/molecular_weight
359
+ end
360
+
361
+ # Calculate molecular weight of Compound with OB and store it in object
362
+ # @return [Float] molecular weight
363
+ def molecular_weight
364
+ mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
365
+ physchem([mw_feature])[mw_feature.id.to_s]
366
+ end
204
367
 
205
368
  private
206
369
 
@@ -209,17 +372,12 @@ module OpenTox
209
372
  obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
210
373
  obmol = OpenBabel::OBMol.new
211
374
  obconversion.set_in_and_out_formats input_format, output_format
375
+ return nil if identifier.nil?
212
376
  obconversion.read_string obmol, identifier
213
377
  case output_format
214
378
  when /smi|can|inchi/
215
379
  obconversion.write_string(obmol).gsub(/\s/,'').chomp
216
380
  when /sdf/
217
- p "SDF conversion"
218
- # has no effect
219
- #obconversion.add_option("gen3D", OpenBabel::OBConversion::GENOPTIONS)
220
- # segfaults with openbabel git master
221
- #OpenBabel::OBOp.find_type("Gen3D").do(obmol)
222
-
223
381
  # TODO: find disconnected structures
224
382
  # strip_salts
225
383
  # separate
@@ -231,14 +389,13 @@ p "SDF conversion"
231
389
  print sdf
232
390
  if sdf.match(/.nan/)
233
391
 
234
- # TODO: fix or eliminate 2d generation
235
392
  $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
236
393
  obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
237
- #OpenBabel::OBOp.find_type("Gen2D").do(obmol)
238
394
  sdf = obconversion.write_string(obmol)
239
395
  if sdf.match(/.nan/)
240
- $logger.warn "2D generation failed for compound #{identifier}"
241
- sdf = nil
396
+ $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates."
397
+ obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS)
398
+ sdf = obconversion.write_string(obmol)
242
399
  end
243
400
  end
244
401
  sdf
@@ -248,7 +405,7 @@ print sdf
248
405
  end
249
406
 
250
407
  def obconversion(identifier,input_format,output_format,option=nil)
251
- self.class.obconversion(identifier,input_format,output_format,option=nil)
408
+ self.class.obconversion(identifier,input_format,output_format,option)
252
409
  end
253
410
  end
254
411
  end