lazar 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
data/lib/algorithm.rb ADDED
@@ -0,0 +1,21 @@
1
module OpenTox

  module Algorithm

    # Generic method to execute algorithms
    # Algorithms should:
    #   - accept a Compound, an Array of Compounds or a Dataset as first argument
    #   - optional parameters as second argument
    #   - return an object corresponding to the input type as result
    #     (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values)
    # @param [String] algorithm "Class.method" name, e.g. "OpenTox::Algorithm::Fminer.bbrc"
    # @param [OpenTox::Compound,Array,OpenTox::Dataset] object Input object
    # @param [Hash] parameters Algorithm parameters (omitted from the call when nil)
    # @return Algorithm result
    def self.run algorithm, object, parameters=nil
      name = algorithm.to_s
      # \A/\z anchor the whole string (^ would accept any line of a multi-line input) and the
      # pattern requires a method part, so the dispatch below never receives a nil method name.
      unless name =~ /\AOpenTox::Algorithm(::\w+)*\.\w+[?!]?\z/
        bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm."
      end
      klass, method = name.split('.')
      arguments = parameters.nil? ? [object] : [object, parameters]
      # public_send: only the public interface of an algorithm class may be invoked by name
      Object.const_get(klass).public_send(method, *arguments)
    end

  end
end
21
+
data/lib/bbrc.rb ADDED
@@ -0,0 +1,165 @@
1
+ module OpenTox
2
+ module Algorithm
3
+ class Fminer
4
# Element symbols ordered by atomic number: index 0 is hydrogen, so the symbol for
# atomic number z is TABLE_OF_ELEMENTS[z-1]. Frozen because it is a shared lookup table.
TABLE_OF_ELEMENTS = %w[
  H He Li Be B C N O F Ne Na Mg Al Si P S Cl Ar
  K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr
  Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe
  Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
  Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn
  Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No Lr
  Rf Db Sg Bh Hs Mt Ds Rg Cn Uut Fl Uup Lv Uus Uuo
].freeze
6
+
7
#
# Run bbrc algorithm on dataset
#
# @param [OpenTox::Dataset] training_dataset Training dataset; must contain exactly one (prediction) feature
# @param [optional] params BBRC parameters, accepted parameters are
#   - min_frequency  Minimum frequency (default: 8 per mil of the labeled compounds, but at least 2)
#   - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
#   - backbone BBRC classes, pass 'false' (or false) to switch off mining for BBRC representatives. (default "true")
#   - min_chisq_significance Significance threshold (between 0 and 1)
#   - nr_hits Set to "true" to get hit count instead of presence
#   - get_target Set to "true" to obtain target variable as feature (not evaluated by this method)
# @return [OpenTox::Dataset] Fminer Dataset
def self.bbrc training_dataset, params={}

  time = Time.now
  unless training_dataset.features.size == 1
    bad_request_error "Expected exactly one prediction feature in training_dataset #{training_dataset.id}, found #{training_dataset.features.size}"
  end

  prediction_feature = training_dataset.features.first
  if params[:min_frequency]
    minfreq = params[:min_frequency]
  else
    per_mil = 8 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
    i = training_dataset.feature_ids.index prediction_feature.id
    nr_labeled_cmpds = training_dataset.data_entries.count{|de| !de[i].nil?}
    minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0
    minfreq = 2 unless minfreq > 2
    minfreq = minfreq.round
  end

  # One interpretation of :backbone for both the miner and the stored training parameters
  mine_backbone = !(params[:backbone] == "false" || params[:backbone] == false)
  nr_hits = params[:nr_hits] || false

  @bbrc ||= Bbrc::Bbrc.new
  @bbrc.Reset
  value2act = nil # stays nil for regression: activities are then used as they are
  if prediction_feature.numeric
    @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
  else
    bad_request_error "No accept values for "\
                      "dataset '#{training_dataset.id}' and "\
                      "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
    value2act = Hash[[*prediction_feature.accept_values.map.with_index]]
  end
  @bbrc.SetMinfreq(minfreq)
  @bbrc.SetType(1) if params[:feature_type] == "paths"
  @bbrc.SetBackbone(false) unless mine_backbone
  @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
  @bbrc.SetConsoleOut(false)

  feature_dataset = FminerDataset.new(
    :training_dataset_id => training_dataset.id,
    :training_algorithm => "#{self.to_s}.bbrc",
    :training_feature_id => prediction_feature.id ,
    :training_parameters => {
      :min_frequency => minfreq,
      :nr_hits => nr_hits,
      :backbone => mine_backbone
    }
  )
  feature_dataset.compounds = training_dataset.compounds

  # add data (fminer compound ids are 1-based)
  training_dataset.compounds.each_with_index do |compound,idx|
    value = training_dataset.data_entries[idx].first
    # NOTE(review): regression activities are handed to fminer unchanged; confirm that
    # AddActivity accepts the stored numeric type.
    act = value2act ? value2act[value] : value
    if act # compounds without (known) activity are not mined
      @bbrc.AddCompound(compound.smiles,idx+1)
      @bbrc.AddActivity(act,idx+1)
    end
  end

  $logger.debug "BBRC setup: #{Time.now-time}"
  time = Time.now
  ftime = 0
  itime = 0
  rtime = 0

  # run @bbrc
  (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
    results = @bbrc.MineRoot(j)
    results.each do |result|
      rt = Time.now
      f = YAML.load(result)[0]
      smarts = f.shift
      # convert fminer SMARTS representation into a more human readable format
      smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
        element = TABLE_OF_ELEMENTS[$1.to_i-1]
        $2 == "a" ? element.downcase : element
      end
      p_value = f.shift
      f.flatten!
      compound_idxs = f.collect{|e| e.first.first-1}
      # majority class
      effect = compound_idxs.collect{|c| training_dataset.data_entries[c].first}.mode
      rtime += Time.now - rt

      ft = Time.now
      feature = OpenTox::FminerSmarts.find_or_create_by({
        "smarts" => smarts,
        "p_value" => p_value.to_f.abs.round(5),
        "effect" => effect,
        "dataset_id" => feature_dataset.id
      })
      feature_dataset.feature_ids << feature.id
      ftime += Time.now - ft

      it = Time.now
      column = feature_dataset.feature_ids.size-1 # column of the feature just added
      f.each do |id_count_hash|
        id_count_hash.each do |id,count|
          hits = nr_hits ? count.to_i : 1
          feature_dataset.data_entries[id-1] ||= []
          feature_dataset.data_entries[id-1][column] = hits
        end
      end
      itime += Time.now - it

    end
  end

  $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
  time = Time.now

  feature_dataset.fill_nil_with 0

  $logger.debug "Prepare save: #{Time.now-time}"
  time = Time.now
  feature_dataset.save_all

  $logger.debug "Save: #{Time.now-time}"
  feature_dataset

end
163
+ end
164
+ end
165
+ end
@@ -0,0 +1,107 @@
1
+ module OpenTox
2
+ module Algorithm
3
+
4
+ class Classification
5
+
6
# Similarity weighted majority vote over the activities of the neighbors
# @param [Array<Array>] neighbors Rows of [neighbor, similarity, activities]; only the
#   similarity and the Array of activities are used
# @return [Hash] :value (predicted class) and :confidence; for an empty neighbor list
#   both are nil and a :warning is added
def self.weighted_majority_vote neighbors
  return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?
  # sum of similarities per observed activity
  weighted_sum = {}
  neighbors.each do |row|
    _neighbor, sim, acts = row
    acts.each do |act|
      weighted_sum[act] ||= 0
      weighted_sum[act] += sim
    end
  end
  case weighted_sum.size
  when 1
    return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs}
  when 2
    first_class, second_class = weighted_sum.keys
    sim_diff = weighted_sum[first_class] - weighted_sum[second_class]
    prediction = sim_diff > 0 ? first_class : second_class
    return {:value => prediction,:confidence => (sim_diff/neighbors.size).abs}
  else
    bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted_sum.keys}'"
  end
end
30
+
31
# Classification with majority vote from neighbors weighted by similarity
# @param [Array<Array>] neighbors Rows whose second element is the similarity weight and
#   whose third element is the activity of the neighbor
# @param training_dataset Not used by the vote; kept for interface compatibility
# @return [Hash] :value (prediction) and :confidence; for an empty neighbor list both are
#   nil and a :warning is added
def self.fminer_weighted_majority_vote neighbors, training_dataset

  # without neighbors the vote below would divide 0.0 by 0.0
  return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?

  neighbor_contribution = 0.0
  confidence_sum = 0.0

  $logger.debug "Weighted Majority Vote Classification."

  values = neighbors.collect{|n| n[2]}.uniq
  binary = values.size == 2
  neighbors.each do |neighbor|
    neighbor_weight = neighbor[1]
    activity = values.index(neighbor[2]) + 1 # map values to integers >= 1
    neighbor_contribution += activity * neighbor_weight
    # AM: provide compat to binary classification: 1=>false 2=>true
    if binary && activity == 1
      confidence_sum -= neighbor_weight
    else
      confidence_sum += neighbor_weight
    end
  end

  prediction = case values.size
               when 1 # all neighbors have the same value
                 values[0]
               when 2
                 confidence_sum >= 0.0 ? values[1] : values[0]
               else
                 # AM: new multinomial prediction
                 # NOTE(review): this is the rounded 1-based position in `values`, not the
                 # value itself - confirm what callers expect.
                 (neighbor_contribution/confidence_sum).round
               end

  {:value => prediction, :confidence => (confidence_sum/neighbors.size).abs}
end
73
+
74
# Local support vector classification from neighbors
# @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are read;
#   `:props` may be nil (non-prop setting)
# @return [Hash] :value (predicted class with the "Val" prefix removed, nil if no prediction
#   was obtained) and :confidence (0.0 if no prediction was obtained)
def self.local_svm_classification(params)

  confidence = 0.0
  prediction = nil

  $logger.debug "Local SVM."
  if params[:activities].size>0
    props = nil # params[:props].nil? signals non-prop setting
    if params[:props]
      n_prop = params[:props][0].collect.to_a
      q_prop = params[:props][1].collect.to_a
      props = [ n_prop, q_prop ]
    end
    activities = params[:activities].collect.to_a
    activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
    prediction = local_svm_prop( props, activities, params[:min_train_performance])
    if prediction
      prediction = prediction.sub(/Val/,"") # Convert back
      # confidence is only meaningful together with a prediction
      confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
    end
  end
  {:value => prediction, :confidence => confidence}

end
100
+
101
+
102
+
103
+ end
104
+
105
+ end
106
+ end
107
+
data/lib/compound.rb ADDED
@@ -0,0 +1,254 @@
1
+ # TODO: check
2
+ # *** Open Babel Error in ParseFile
3
+ # Could not find contribution data file.
4
+
5
+ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
6
+
7
+ module OpenTox
8
+
9
+ class Compound
10
+ include OpenTox
11
+
12
+ field :inchi, type: String
13
+ field :smiles, type: String
14
+ field :inchikey, type: String
15
+ field :names, type: Array
16
+ field :warning, type: String
17
+ field :cid, type: String
18
+ field :chemblid, type: String
19
+ field :png_id, type: BSON::ObjectId
20
+ field :svg_id, type: BSON::ObjectId
21
+ field :sdf_id, type: BSON::ObjectId
22
+ field :fp4, type: Array
23
+ field :fp4_size, type: Integer
24
+
25
# Overwrites standard Mongoid method to create fingerprints before database insertion.
# A compound without (or with an empty) fp4 fingerprint gets the ids of all matching
# fingerprint SMARTS stored in fp4 and their number in fp4_size; the compound is saved
# in every case.
# @param params Attributes passed on to find_or_initialize_by
# @return The saved compound
def self.find_or_create_by params
  compound = self.find_or_initialize_by params
  if !compound.fp4 || compound.fp4.empty?
    compound.fp4_size = 0
    compound.fp4 = []
    fingerprint = FingerprintSmarts.fingerprint
    hits = Algorithm::Descriptor.smarts_match(compound, fingerprint)
    hits.each_with_index do |hit,position|
      next unless hit > 0
      compound.fp4 << fingerprint[position].id
      compound.fp4_size += 1
    end
  end
  compound.save
  compound
end
42
+
43
# Create a compound from smiles string
# @example
#   compound = OpenTox::Compound.from_smiles("c1ccccc1")
# @param [String] smiles Smiles string
# @return [OpenTox::Compound] Compound; if the SMILES cannot be converted, a compound
#   that only carries a warning quoting the rejected input
def self.from_smiles smiles
  # keep the caller's string untouched: it is needed for the warning text
  canonical = obconversion(smiles,"smi","can")
  if canonical.empty?
    Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
  else
    Compound.find_or_create_by :smiles => canonical
  end
end
56
+
57
# Create a compound from inchi string
# @param inchi [String] InChI string
# @return [OpenTox::Compound] Compound; if the conversion yields nothing, a compound that
#   only carries a warning
def self.from_inchi inchi
  # Conversion goes through the OpenBabel bindings; an earlier workaround for
  # http://sourceforge.net/p/openbabel/bugs/957/ shelled out to a bundled babel binary.
  canonical_smiles = obconversion(inchi,"inchi","can")
  return Compound.find_or_create_by(:smiles => canonical_smiles, :inchi => inchi) unless canonical_smiles.empty?
  Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
end
72
+
73
# Create a compound from sdf string
# @param sdf [String] SDF string
# @return [OpenTox::Compound] Compound
def self.from_sdf sdf
  # the sdf itself is not stored because it might be 2D; only its canonical smiles is used
  canonical_smiles = obconversion(sdf,"sdf","can")
  Compound.from_smiles canonical_smiles
end
80
+
81
# Create a compound from name. Relies on an external service for name lookups.
# @example
#   compound = OpenTox::Compound.from_name("Benzene")
# @param name [String] can be also an InChI/InChiKey, CAS number, etc
# @return [OpenTox::Compound] Compound
def self.from_name name
  # URI.escape no longer exists in Ruby 3; the parser's escape applies the same escaping
  escaped_name = URI::DEFAULT_PARSER.escape(name)
  Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,escaped_name,"smiles"))
end
89
+
90
# Get InChI
# Converts the smiles and stores the result the first time it is requested; an empty
# conversion result is not stored.
# @return [String] InChI string
def inchi
  stored = self["inchi"]
  return stored if stored
  converted = obconversion(smiles,"smi","inchi")
  update(:inchi => converted.chomp) unless converted.empty?
  self["inchi"]
end
101
+
102
# Get InChIKey
# Converts the smiles and stores the result the first time it is requested.
# @return [String] InChIKey string
def inchikey
  unless self["inchikey"]
    key = obconversion(smiles,"smi","inchikey")
    update(:inchikey => key)
  end
  self["inchikey"]
end
108
+
109
# Get (canonical) smiles
# If no smiles is stored but an InChI is, the smiles is derived from the InChI and stored.
# @return [String, nil] Smiles string, nil if neither smiles nor InChI are stored
def smiles
  unless self["smiles"]
    # a missing smiles cannot be converted from itself; the stored InChI is the only source
    source_inchi = self["inchi"]
    update(:smiles => obconversion(source_inchi,"inchi","can")) if source_inchi
  end
  self["smiles"]
end
115
+
116
# Get sdf
# Generated from the smiles and stored in GridFS the first time it is requested.
# @return [String] SDF string
def sdf
  if self.sdf_id.nil?
    content = obconversion(smiles,"smi","sdf")
    grid_file = Mongo::Grid::File.new(content, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile")
    update :sdf_id => $gridfs.insert_one(grid_file)
  end
  $gridfs.find_one(_id: self.sdf_id).data
end
127
+
128
# Get SVG image
# Generated from the smiles and stored in GridFS the first time it is requested.
# @return [image/svg] Image data
def svg
  if self.svg_id.nil?
    svg = obconversion(smiles,"smi","svg")
    file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
    # must be stored under svg_id: that is the field the lookup below reads
    update(:svg_id => $gridfs.insert_one(file))
  end
  $gridfs.find_one(_id: self.svg_id).data
end
139
+
140
# Get png image
# Generated from the smiles the first time it is requested; the image is kept
# Base64 encoded in GridFS and decoded on the way out.
# @example
#   image = compound.png
# @return [image/png] Image data
def png
  if self.png_id.nil?
    raw_image = obconversion(smiles,"smi","_png2")
    encoded_image = Base64.encode64(raw_image)
    grid_file = Mongo::Grid::File.new(encoded_image, :filename => "#{id}.png", :content_type => "image/png")
    update(:png_id => $gridfs.insert_one(grid_file))
  end
  stored_file = $gridfs.find_one(_id: self.png_id)
  Base64.decode64(stored_file.data)
end
153
+
154
# Get all known compound names. Relies on an external service for name lookups.
# The service response is split into lines and stored the first time names are requested.
# @example
#   names = compound.names
# @return [Array<String>] Compound names
def names
  unless self["names"]
    listing = RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names")
    update(:names => listing.split("\n"))
  end
  self["names"]
end
162
+
163
# Looked up by InChI and stored the first time it is requested.
# @return [String] PubChem Compound Identifier (CID), derived via REST call to PubChem
def cid
  unless self["cid"]
    lookup_uri = File.join("http://pubchem.ncbi.nlm.nih.gov/rest/pug/", "compound", "inchi", "cids", "TXT")
    response = RestClientWrapper.post(lookup_uri,{:inchi => inchi})
    update(:cid => response.strip)
  end
  self["cid"]
end
169
+
170
# Looked up by smiles and stored the first time it is requested.
# @return [String, nil] ChEMBL database compound id, derived via REST call to ChEMBL;
#   nil if the service returns no compound
def chemblid
  # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
  unless self["chemblid"]
    # SMILES contain characters with a special meaning in URLs (#, /, +, ...)
    uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{URI.encode_www_form_component(smiles)}.json"
    hit = JSON.parse(RestClientWrapper.get(uri))["compounds"].to_a.first
    update(:chemblid => hit["chemblId"]) if hit
  end
  self["chemblid"]
end
177
+
178
# Similarity search on the stored fp4 fingerprints, executed as an aggregation on the
# "compounds" collection
# (from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb).
# The score is common / (own size + other fp4_size - common); the pre-filtering by
# fingerprint counts described there is not applied.
# @param threshold Minimum score of a returned compound
# @return [Array<Array>] [_id, tanimoto] pairs of all other compounds reaching the
#   threshold, sorted by descending score
def neighbors threshold=0.7
  query_size = fp4.size
  exclude_self = {'$match' => {'_id' => {'$ne' => self.id}}}
  common_bits = {'$size' => {'$setIntersection' => ['$fp4', fp4]}}
  union_size = {'$subtract' => [{'$add' => [query_size, '$fp4_size']}, '$$common']}
  score = {'$project' => {
    'tanimoto' => {'$let' => {
      'vars' => {'common' => common_bits},
      'in' => {'$divide' => ['$$common', union_size]}
    }},
    '_id' => 1
  }}
  similar_enough = {'$match' => {'tanimoto' => {'$gte' => threshold}}}
  best_first = {'$sort' => {'tanimoto' => -1}}

  pipeline = [exclude_self, score, similar_enough, best_first]
  $mongo["compounds"].aggregate(pipeline).collect{ |r| [r["_id"], r["tanimoto"]] }
end
202
+ =begin
203
+ =end
204
+
205
+ private
206
+
207
# Convert a chemical identifier between formats with OpenBabel
# @param identifier [String] Input in input_format
# @param input_format [String] OpenBabel input format
# @param output_format [String] OpenBabel output format
# @param option Output option handed to OpenBabel (optional)
# @return [String, nil] Conversion result; whitespace is removed for formats matching
#   smi, can or inchi; nil for sdf if the output still contains nan values after the
#   2D fallback
def self.obconversion(identifier,input_format,output_format,option=nil)
  converter = OpenBabel::OBConversion.new
  converter.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
  molecule = OpenBabel::OBMol.new
  converter.set_in_and_out_formats input_format, output_format
  converter.read_string molecule, identifier
  case output_format
  when /smi|can|inchi/
    converter.write_string(molecule).gsub(/\s/,'').chomp
  when /sdf/
    # Earlier attempts, kept for reference:
    # has no effect
    #converter.add_option("gen3D", OpenBabel::OBConversion::GENOPTIONS)
    # segfaults with openbabel git master
    #OpenBabel::OBOp.find_type("Gen3D").do(molecule)

    # TODO: find disconnected structures
    # strip_salts
    # separate
    molecule.add_hydrogens
    OpenBabel::OBBuilder.new.build(molecule)

    # nothing is printed here: the result is returned to the caller
    sdf = converter.write_string(molecule)
    if sdf.match(/.nan/)
      # TODO: fix or eliminate 2d generation
      $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
      converter.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
      #OpenBabel::OBOp.find_type("Gen2D").do(molecule)
      sdf = converter.write_string(molecule)
      if sdf.match(/.nan/)
        $logger.warn "2D generation failed for compound #{identifier}"
        sdf = nil
      end
    end
    sdf
  else
    converter.write_string(molecule)
  end
end
249
+
250
# Instance level shortcut for the class level conversion
# @return Result of the class method called with the same arguments
def obconversion(identifier,input_format,output_format,option=nil)
  # pass the option through (an assignment in the argument list would reset it to nil)
  self.class.obconversion(identifier,input_format,output_format,option)
end
253
+ end
254
+ end