lazar 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
data/lib/algorithm.rb ADDED
@@ -0,0 +1,21 @@
1
+ module OpenTox
2
+
3
+ module Algorithm
4
+
5
# Generic method to execute algorithms.
# Algorithms should:
#   - accept a Compound, an Array of Compounds or a Dataset as first argument
#   - accept optional parameters as second argument
#   - return an object corresponding to the input type as result
#     (e.g. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values)
# @param [String] algorithm Algorithm identifier of the form "OpenTox::Algorithm::Klass.method"
# @param [OpenTox::Compound,Array,OpenTox::Dataset] object Input object
# @param [Hash] parameters Algorithm parameters (not passed on when nil)
# @return Algorithm result
def self.run algorithm, object, parameters=nil
  name = algorithm.to_s
  # \A anchors at the start of the whole string; ^ would also accept a match on a later line
  bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless name =~ /\AOpenTox::Algorithm/
  klass, method = name.split('.')
  # without a method part the dispatch below would fail with an unrelated TypeError
  bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide a method name ('OpenTox::Algorithm::Class.method')." if method.nil? || method.empty?
  arguments = parameters.nil? ? [object] : [object, parameters]
  # public_send: a name taken from a string must not reach private methods
  Object.const_get(klass).public_send(method, *arguments)
end
18
+
19
+ end
20
+ end
21
+
data/lib/bbrc.rb ADDED
@@ -0,0 +1,165 @@
1
+ module OpenTox
2
+ module Algorithm
3
+ class Fminer
4
+ TABLE_OF_ELEMENTS = [
5
+ "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
6
+
7
#
# Run bbrc algorithm on dataset
#
# @param [OpenTox::Dataset] training_dataset training dataset, must contain exactly one feature
# @param [optional] params BBRC parameters, accepted parameters are
#   - min_frequency  Minimum frequency (default: 8 per mil of the labeled compounds, but at least 2)
#   - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
#   - backbone BBRC classes, pass 'false' (String) or false to switch off mining for BBRC representatives. (default "true")
#   - min_chisq_significance Significance threshold (between 0 and 1)
#   - nr_hits Set to "true" to get hit count instead of presence
#   - get_target Set to "true" to obtain target variable as feature (documented, but not read by this method)
# @return [OpenTox::Dataset] Fminer Dataset
def self.bbrc training_dataset, params={}

  time = Time.now
  bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1

  prediction_feature = training_dataset.features.first
  if params[:min_frequency]
    minfreq = params[:min_frequency]
  else
    per_mil = 8 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
    feature_idx = training_dataset.feature_ids.index prediction_feature.id
    nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[feature_idx].nil?}.size
    minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0
    minfreq = 2 unless minfreq > 2
    minfreq = minfreq.round
  end

  @bbrc ||= Bbrc::Bbrc.new
  @bbrc.Reset
  if prediction_feature.numeric
    @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
    # NOTE(review): value2act stays nil on this path, so the activity lookup
    # in the "add data" loop below fails for numeric features - confirm how
    # regression activities are supposed to be mapped.
  else
    bad_request_error "No accept values for "\
      "dataset '#{training_dataset.id}' and "\
      "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
    # accept value => index of that value
    value2act = Hash[[*prediction_feature.accept_values.map.with_index]]
  end

  # Evaluate the backbone switch once, so that the miner setting and the
  # stored training parameters cannot disagree (String and boolean accepted).
  backbone = !(params[:backbone] == false || params[:backbone] == "false")

  @bbrc.SetMinfreq(minfreq)
  @bbrc.SetType(1) if params[:feature_type] == "paths"
  @bbrc.SetBackbone(false) unless backbone
  @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
  @bbrc.SetConsoleOut(false)

  nr_hits = params[:nr_hits] || false
  feature_dataset = FminerDataset.new(
    :training_dataset_id => training_dataset.id,
    :training_algorithm => "#{self.to_s}.bbrc",
    :training_feature_id => prediction_feature.id ,
    :training_parameters => {
      :min_frequency => minfreq,
      :nr_hits => nr_hits,
      :backbone => backbone
    }
  )
  feature_dataset.compounds = training_dataset.compounds

  # add data (fminer ids are 1-based)
  training_dataset.compounds.each_with_index do |compound,idx|
    act = value2act[training_dataset.data_entries[idx].first]
    if act # TODO check if this works
      @bbrc.AddCompound(compound.smiles,idx+1)
      @bbrc.AddActivity(act,idx+1)
    end
  end

  $logger.debug "BBRC setup: #{Time.now-time}"
  time = Time.now
  ftime = 0
  itime = 0
  rtime = 0

  # run @bbrc
  (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
    results = @bbrc.MineRoot(j)
    results.each do |result|
      rt = Time.now
      # NOTE(review): YAML.load deserializes arbitrary objects; acceptable only
      # as long as the input is the miner's own output.
      f = YAML.load(result)[0]
      smarts = f.shift
      # convert fminer SMARTS representation into a more human readable format
      smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
        element = TABLE_OF_ELEMENTS[$1.to_i-1]
        $2 == "a" ? element.downcase : element
      end
      p_value = f.shift
      f.flatten!
      compound_idxs = f.collect{|e| e.first.first-1}
      # majority class
      # (the legacy effect calculation - class-size based for classification,
      # median based for regression - was already disabled in the original)
      effect = compound_idxs.collect{|ci| training_dataset.data_entries[ci].first}.mode
      rtime += Time.now - rt

      ft = Time.now
      feature = OpenTox::FminerSmarts.find_or_create_by({
        "smarts" => smarts,
        "p_value" => p_value.to_f.abs.round(5),
        "effect" => effect,
        "dataset_id" => feature_dataset.id
      })
      feature_dataset.feature_ids << feature.id
      ftime += Time.now - ft

      it = Time.now
      column = feature_dataset.feature_ids.size-1
      f.each do |id_count_hash|
        id_count_hash.each do |id,count|
          # hit count if requested, plain presence (1) otherwise
          count = nr_hits ? count.to_i : 1
          feature_dataset.data_entries[id-1] ||= []
          feature_dataset.data_entries[id-1][column] = count
        end
      end
      itime += Time.now - it

    end
  end

  $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
  time = Time.now

  feature_dataset.fill_nil_with 0

  $logger.debug "Prepare save: #{Time.now-time}"
  time = Time.now
  feature_dataset.save_all

  $logger.debug "Save: #{Time.now-time}"
  feature_dataset

end
163
+ end
164
+ end
165
+ end
data/lib/classification.rb ADDED
@@ -0,0 +1,107 @@
1
+ module OpenTox
2
+ module Algorithm
3
+
4
+ class Classification
5
+
6
# Majority vote over neighbor activities, weighted by similarity.
# Supports one or two distinct activity classes.
# @param [Array<Array>] neighbors Rows of [neighbor, similarity, activities]
# @return [Hash] `:value` (predicted class) and `:confidence`; for an empty
#   neighbor list both are nil and a `:warning` is added
def self.weighted_majority_vote neighbors
  return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?
  # activity class => sum of the similarities of all neighbors voting for it
  weighted_sum = {}
  neighbors.each do |_neighbor, sim, acts|
    acts.each do |act|
      weighted_sum[act] ||= 0
      weighted_sum[act] += sim
    end
  end
  case weighted_sum.size
  when 1
    {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size}
  when 2
    first, second = weighted_sum.keys
    difference = weighted_sum[first] - weighted_sum[second]
    # ties go to the second class, as before
    {:value => (difference > 0 ? first : second), :confidence => (difference/neighbors.size).abs}
  else
    bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted_sum.keys}'"
  end
end
30
+
31
# Classification with majority vote from neighbors weighted by similarity
# @param [Array<Array>] neighbors Rows whose second element is the neighbor
#   weight (similarity) and whose third element is the activity value
# @param training_dataset Not used by the calculation; kept for interface compatibility
# @return [Hash] `:value` (prediction) and `:confidence`; for an empty
#   neighbor list both are nil and a `:warning` is added
def self.fminer_weighted_majority_vote neighbors, training_dataset

  $logger.debug "Weighted Majority Vote Classification."

  # without neighbors the calculation below would end in NaN.round
  return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?

  neighbor_contribution = 0.0
  confidence_sum = 0.0
  values = neighbors.collect{|n| n[2]}.uniq

  neighbors.each do |neighbor|
    neighbor_weight = neighbor[1]
    activity = values.index(neighbor[2]) + 1 # map values to integers >= 1
    neighbor_contribution += activity * neighbor_weight
    # AM: provide compat to binary classification: 1=>false 2=>true
    if values.size == 2 && activity == 1
      confidence_sum -= neighbor_weight
    else
      confidence_sum += neighbor_weight
    end
  end

  prediction =
    case values.size
    when 1 then values[0] # all neighbors have the same value
    when 2 then confidence_sum >= 0.0 ? values[1] : values[0]
    else (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
    end

  {:value => prediction, :confidence => (confidence_sum/neighbors.size).abs}
end
73
+
74
# Local support vector classification from neighbors
# @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
#   (`:props` may be nil, which signals the non-prop setting to `local_svm_prop`)
# @return [Hash] `:value` (predicted class or nil) and `:confidence`
#   (0.0 when there are no activities or no prediction could be made)
def self.local_svm_classification(params)

  $logger.debug "Local SVM."
  no_prediction = {:value => nil, :confidence => 0.0}
  return no_prediction unless params[:activities].size>0

  props = params[:props] ? [ params[:props][0].to_a, params[:props][1].to_a ] : nil
  # Convert to string for R to recognize classification
  activities = params[:activities].collect{|v| "Val" + v.to_s}
  prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
  # the original reset the confidence to 0.0 here but overwrote it right away
  return no_prediction if prediction.nil?

  confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
  {:value => prediction.sub(/Val/,""), :confidence => confidence} # strip the "Val" prefix again

end
100
+
101
+
102
+
103
+ end
104
+
105
+ end
106
+ end
107
+
data/lib/compound.rb ADDED
@@ -0,0 +1,254 @@
1
+ # TODO: check
2
+ # *** Open Babel Error in ParseFile
3
+ # Could not find contribution data file.
4
+
5
+ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
6
+
7
+ module OpenTox
8
+
9
+ class Compound
10
+ include OpenTox
11
+
12
+ field :inchi, type: String
13
+ field :smiles, type: String
14
+ field :inchikey, type: String
15
+ field :names, type: Array
16
+ field :warning, type: String
17
+ field :cid, type: String
18
+ field :chemblid, type: String
19
+ field :png_id, type: BSON::ObjectId
20
+ field :svg_id, type: BSON::ObjectId
21
+ field :sdf_id, type: BSON::ObjectId
22
+ field :fp4, type: Array
23
+ field :fp4_size, type: Integer
24
+
25
# Overwrites standard Mongoid method to create fingerprints before database insertion.
# The fingerprint (ids of all matching FingerprintSmarts patterns) is only
# calculated when the compound has no non-empty `fp4` yet; the compound is
# saved in any case.
# @param [Hash] params Attributes passed to `find_or_initialize_by`
# @return [OpenTox::Compound] Saved compound
def self.find_or_create_by params
  compound = self.find_or_initialize_by params
  if !compound.fp4 || compound.fp4.empty?
    compound.fp4_size = 0
    compound.fp4 = []
    patterns = FingerprintSmarts.fingerprint
    matches = Algorithm::Descriptor.smarts_match(compound, patterns)
    matches.each_with_index do |match, index|
      next unless match > 0
      compound.fp4 << patterns[index].id
      compound.fp4_size += 1
    end
  end
  compound.save
  compound
end
42
+
43
# Create a compound from smiles string
# @example
#   compound = OpenTox::Compound.from_smiles("c1ccccc1")
# @param [String] smiles Smiles string
# @return [OpenTox::Compound] Compound; when the SMILES cannot be converted,
#   a compound that only carries a `warning`
def self.from_smiles smiles
  # keep the input untouched, the warning has to quote the original string
  canonical = obconversion(smiles,"smi","can")
  if canonical.empty?
    Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
  else
    Compound.find_or_create_by :smiles => canonical
  end
end
56
+
57
# Create a compound from inchi string
# @param inchi [String] InChI string
# @return [OpenTox::Compound] Compound; when the InChI cannot be converted,
#   a compound that only carries a `warning`
def self.from_inchi inchi
  # A shell based workaround for OpenBabels Inchi bug
  # (http://sourceforge.net/p/openbabel/bugs/957/, not fixed in the
  # git/development version at the time of writing) used to live here.
  canonical_smiles = obconversion(inchi,"inchi","can")
  unless canonical_smiles.empty?
    return Compound.find_or_create_by(:smiles => canonical_smiles, :inchi => inchi)
  end
  Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
end
72
+
73
# Create a compound from sdf string
# @param sdf [String] SDF string
# @return [OpenTox::Compound] Compound
def self.from_sdf sdf
  # the sdf itself is not stored because it might be 2D
  canonical_smiles = obconversion(sdf,"sdf","can")
  Compound.from_smiles canonical_smiles
end
80
+
81
# Create a compound from name. Relies on an external service (CACTUS_URI) for name lookups.
# @example
#   compound = OpenTox::Compound.from_name("Benzene")
# @param name [String] can be also an InChI/InChiKey, CAS number, etc
# @return [OpenTox::Compound] Compound
def self.from_name name
  # URI.escape was removed in Ruby 3.0; the default parser offers the same escaping
  escaped_name = URI::DEFAULT_PARSER.escape(name)
  Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,escaped_name,"smiles"))
end
89
+
90
# Get InChI. A missing InChI is calculated from the smiles and stored,
# unless the conversion returns an empty string.
# @return [String] InChI string
def inchi
  return self["inchi"] if self["inchi"]
  converted = obconversion(smiles,"smi","inchi")
  update(:inchi => converted.chomp) unless converted.empty?
  self["inchi"]
end
101
+
102
# Get InChIKey. A missing key is calculated from the smiles and stored.
# @return [String] InChIKey string
def inchikey
  unless self["inchikey"]
    key = obconversion(smiles,"smi","inchikey")
    update(:inchikey => key)
  end
  self["inchikey"]
end
108
+
109
# Get (canonical) smiles. When no smiles is stored but an InChI is, the
# smiles is derived from the InChI and stored.
# @return [String, nil] Smiles string, nil if neither smiles nor InChI are stored
def smiles
  # The original converted self["smiles"] (nil at this point).
  # self["inchi"] is read directly, because the inchi getter calls smiles.
  if !self["smiles"] && self["inchi"]
    update(:smiles => obconversion(self["inchi"],"inchi","can"))
  end
  self["smiles"]
end
115
+
116
# Get sdf. On first access the SDF is generated from the smiles, written to
# $gridfs and its id stored in `sdf_id`; afterwards it is read from $gridfs.
# @return [String] SDF string
def sdf
  if self.sdf_id.nil?
    molfile = obconversion(smiles,"smi","sdf")
    grid_file = Mongo::Grid::File.new(molfile, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile")
    stored_id = $gridfs.insert_one grid_file
    update :sdf_id => stored_id
  end
  $gridfs.find_one(_id: self.sdf_id).data
end
127
+
128
# Get SVG image. On first access the image is generated from the smiles,
# written to $gridfs and its id stored in `svg_id`; afterwards it is read
# from $gridfs.
# @return [image/svg] Image data
def svg
  if self.svg_id.nil?
    image = obconversion(smiles,"smi","svg")
    file = Mongo::Grid::File.new(image, :filename => "#{id}.svg", :content_type => "image/svg")
    # must be stored under svg_id (not image_id), the lookup below reads svg_id
    update(:svg_id => $gridfs.insert_one(file))
  end
  $gridfs.find_one(_id: self.svg_id).data
end
139
+
140
# Get png image. On first access the image is generated from the smiles,
# written Base64 encoded to $gridfs and its id stored in `png_id`;
# afterwards it is read from $gridfs and decoded.
# @example
#   image = compound.png
# @return [image/png] Image data
def png
  if self.png_id.nil?
    image = obconversion(smiles,"smi","_png2")
    encoded = Base64.encode64(image)
    grid_file = Mongo::Grid::File.new(encoded, :filename => "#{id}.png", :content_type => "image/png")
    update(:png_id => $gridfs.insert_one(grid_file))
  end
  stored = $gridfs.find_one(_id: self.png_id)
  Base64.decode64(stored.data)
end
153
+
154
# Get all known compound names. Relies on an external service (CACTUS_URI)
# for name lookups; the result is stored after the first lookup.
# @example
#   names = compound.names
# @return [Array<String>] Compound names
def names
  unless self["names"]
    response = RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names")
    update(:names => response.split("\n"))
  end
  self["names"]
end
162
+
163
# The CID is looked up with a POST of the InChI to the PubChem PUG REST
# service and stored after the first lookup.
# @return [String] PubChem Compound Identifier (CID), derived via restcall to pubchem
def cid
  unless self["cid"]
    pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/"
    lookup_uri = File.join(pug_uri, "compound", "inchi", "cids", "TXT")
    response = RestClientWrapper.post(lookup_uri,{:inchi => inchi})
    update(:cid => response.strip)
  end
  self["cid"]
end
169
+
170
# The id is taken from the first compound returned by the ChEMBL smiles
# lookup and stored after the first lookup.
# @return [String] ChEMBL database compound id, derived via restcall to chembl
def chemblid
  # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
  uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
  if !self["chemblid"]
    hits = JSON.parse(RestClientWrapper.get(uri))["compounds"]
    update(:chemblid => hits.first["chemblId"])
  end
  self["chemblid"]
end
177
+
178
# Find compounds with a similar fp4 fingerprint via a MongoDB aggregation
# on the "compounds" collection.
# Approach from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
# The prefilter on fingerprint size and required bits described there is not
# used (it would require keeping feature counts up to date).
# @param [Numeric] threshold Minimum tanimoto similarity
# @return [Array<Array>] [compound id, tanimoto] pairs, sorted by descending similarity
def neighbors threshold=0.7
  query_size = fp4.size
  # number of fingerprint bits shared with the query compound
  common_bits = {'$size' => {'$setIntersection' => ['$fp4', fp4]}}
  # |A| + |B| - |A and B|
  union_size = {'$subtract' => [{'$add' => [query_size, '$fp4_size']}, '$$common']}
  tanimoto = {'$let' => {
    'vars' => {'common' => common_bits},
    'in' => {'$divide' => ['$$common', union_size]}
  }}
  pipeline = [
    {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
    {'$project' => {'tanimoto' => tanimoto, '_id' => 1}},
    {'$match' => {'tanimoto' => {'$gte' => threshold}}},
    {'$sort' => {'tanimoto' => -1}}
  ]

  $mongo["compounds"].aggregate(pipeline).collect{ |row| [row["_id"], row["tanimoto"]] }
end
202
+ =begin
203
+ =end
204
+
205
+ private
206
+
207
# Convert a chemical identifier between formats with OpenBabel.
# - output formats matching smi/can/inchi: result with all whitespace removed
# - sdf: hydrogens are added and coordinates built with OBBuilder; if the
#   result contains "nan" coordinates a 2D fallback is tried, nil is returned
#   if that fails as well
# - all other formats: raw OpenBabel output
# @param [String] identifier Input structure in input_format
# @param [String] input_format OpenBabel input format
# @param [String] output_format OpenBabel output format
# @param [String] option Optional OpenBabel output option
# @return [String, nil] Converted structure
def self.obconversion(identifier,input_format,output_format,option=nil)
  obconversion = OpenBabel::OBConversion.new
  obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
  obmol = OpenBabel::OBMol.new
  obconversion.set_in_and_out_formats input_format, output_format
  obconversion.read_string obmol, identifier
  case output_format
  when /smi|can|inchi/
    obconversion.write_string(obmol).gsub(/\s/,'').chomp
  when /sdf/
    # Leftover debug output (p / print of the complete SDF) was removed here,
    # a library method must not write to stdout.
    # has no effect
    #obconversion.add_option("gen3D", OpenBabel::OBConversion::GENOPTIONS)
    # segfaults with openbabel git master
    #OpenBabel::OBOp.find_type("Gen3D").do(obmol)

    # TODO: find disconnected structures
    # strip_salts
    # separate
    obmol.add_hydrogens
    builder = OpenBabel::OBBuilder.new
    builder.build(obmol)

    sdf = obconversion.write_string(obmol)
    if sdf.match(/.nan/)

      # TODO: fix or eliminate 2d generation
      $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
      obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
      #OpenBabel::OBOp.find_type("Gen2D").do(obmol)
      sdf = obconversion.write_string(obmol)
      if sdf.match(/.nan/)
        $logger.warn "2D generation failed for compound #{identifier}"
        sdf = nil
      end
    end
    sdf
  else
    obconversion.write_string(obmol)
  end
end
249
+
250
# Instance level shortcut for the class method of the same name.
# @param [String] identifier Input structure in input_format
# @param [String] input_format OpenBabel input format
# @param [String] output_format OpenBabel output format
# @param [String] option Optional OpenBabel output option
# @return Result of the class level conversion
def obconversion(identifier,input_format,output_format,option=nil)
  # forward the option; the original passed `option=nil` and thereby dropped it
  self.class.obconversion(identifier,input_format,output_format,option)
end
253
+ end
254
+ end