chemruby 0.9.3 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. data/README +2 -2
  2. data/Rakefile +67 -63
  3. data/ext/extconf.rb +2 -0
  4. data/ext/subcomp.c +461 -320
  5. data/ext/utils.c +56 -0
  6. data/ext/utils.h +13 -0
  7. data/lib/chem.rb +34 -8
  8. data/lib/chem/db.rb +8 -0
  9. data/lib/chem/db/cansmi.rb +1 -1
  10. data/lib/chem/db/cdx.rb +1 -1
  11. data/lib/chem/db/cml.rb +52 -0
  12. data/lib/chem/db/gd.rb +64 -0
  13. data/lib/chem/db/gspan.rb +2 -2
  14. data/lib/chem/db/kcf_rpair.rb +34 -0
  15. data/lib/chem/db/kegg.rb +35 -1
  16. data/lib/chem/db/mdl.rb +75 -34
  17. data/lib/chem/db/opsin.rb +24 -0
  18. data/lib/chem/db/pdb.rb +105 -0
  19. data/lib/chem/db/pdf.rb +2 -0
  20. data/lib/chem/db/pubchem.rb +1071 -88
  21. data/lib/chem/db/rmagick.rb +5 -3
  22. data/lib/chem/db/sdf.rb +28 -2
  23. data/lib/chem/db/smiles/smiles.ry +27 -25
  24. data/lib/chem/db/smiles/smiparser.rb +29 -27
  25. data/lib/chem/db/types/type_gd.rb +35 -0
  26. data/lib/chem/db/types/type_gspan.rb +2 -2
  27. data/lib/chem/db/types/type_kcf.rb +19 -0
  28. data/lib/chem/db/types/type_kegg.rb +2 -0
  29. data/lib/chem/db/types/type_mdl.rb +1 -1
  30. data/lib/chem/db/types/type_png.rb +5 -1
  31. data/lib/chem/db/types/type_rdf.rb +22 -0
  32. data/lib/chem/db/types/type_xyz.rb +1 -1
  33. data/lib/chem/db/vector.rb +19 -3
  34. data/lib/chem/model.rb +5 -2
  35. data/lib/chem/utils.rb +17 -1
  36. data/lib/chem/utils/bitdb.rb +49 -0
  37. data/lib/chem/utils/cas.rb +28 -0
  38. data/lib/chem/utils/cdk.rb +403 -0
  39. data/lib/chem/utils/fingerprint.rb +98 -0
  40. data/lib/chem/utils/geometry.rb +8 -0
  41. data/lib/chem/utils/net.rb +303 -0
  42. data/lib/chem/utils/once.rb +28 -0
  43. data/lib/chem/utils/openbabel.rb +204 -0
  44. data/lib/chem/utils/sssr.rb +33 -25
  45. data/lib/chem/utils/sub.rb +6 -0
  46. data/lib/chem/utils/transform.rb +9 -8
  47. data/lib/chem/utils/ullmann.rb +138 -95
  48. data/lib/graph.rb +5 -6
  49. data/lib/graph/utils.rb +8 -0
  50. data/sample/calc_maximum_common_subgraph.rb +27 -0
  51. data/sample/calc_properties.rb +9 -0
  52. data/sample/data/atp.mol +69 -0
  53. data/sample/data/pioglitazone.mol +58 -0
  54. data/sample/data/rosiglitazone.mol +55 -0
  55. data/sample/data/troglitazone.mol +70 -0
  56. data/sample/find_compound_by_keggapi.rb +19 -0
  57. data/sample/generate_inchi.rb +7 -0
  58. data/sample/generate_substructurekey.rb +11 -0
  59. data/sample/images/ex6.rb +17 -0
  60. data/sample/images/ex7.rb +18 -0
  61. data/sample/iupac2mol.rb +8 -0
  62. data/sample/kekule.rb +13 -0
  63. data/sample/logp.rb +4 -0
  64. data/sample/mcs.rb +13 -0
  65. data/sample/mol2pdf.rb +8 -0
  66. data/sample/pubchem_fetch.rb +8 -0
  67. data/sample/pubchem_search.rb +12 -0
  68. data/sample/rosiglitazone.mol +57 -0
  69. data/sample/smarts.rb +10 -0
  70. data/sample/structure_match.rb +8 -0
  71. data/sample/structure_match_color.rb +22 -0
  72. data/sample/thiazolidinedione.mol +19 -0
  73. data/sample/troglitazone.mol +232 -0
  74. data/sample/vicinity.rb +8 -0
  75. data/test/data/CID_704.sdf +236 -0
  76. data/test/data/CID_994.sdf +146 -0
  77. data/test/data/db_EXPT03276.txt +321 -0
  78. data/test/data/pioglitazone.mol +58 -0
  79. data/test/data/rosiglitazone.mol +55 -0
  80. data/test/data/thiazolidinedione.mol +19 -0
  81. data/test/data/troglitazone.mol +70 -0
  82. data/test/{test_adj.rb → tc_adj.rb} +0 -0
  83. data/test/{test_canonical_smiles.rb → tc_canonical_smiles.rb} +0 -0
  84. data/test/tc_casrn.rb +17 -0
  85. data/test/tc_cdk.rb +89 -0
  86. data/test/{test_cdx.rb → tc_cdx.rb} +0 -0
  87. data/test/{test_chem.rb → tc_chem.rb} +0 -0
  88. data/test/{test_cluster.rb → tc_cluster.rb} +0 -0
  89. data/test/{test_db.rb → tc_db.rb} +0 -0
  90. data/test/tc_develop.rb +38 -0
  91. data/test/tc_drugbank.rb +13 -0
  92. data/test/{test_eps.rb → tc_eps.rb} +0 -0
  93. data/test/tc_gd.rb +8 -0
  94. data/test/{test_geometry.rb → tc_geometry.rb} +0 -0
  95. data/test/tc_graph.rb +15 -0
  96. data/test/{test_gspan.rb → tc_gspan.rb} +0 -0
  97. data/test/{test_iupac.rb → tc_iupac.rb} +0 -0
  98. data/test/{test_kcf.rb → tc_kcf.rb} +0 -0
  99. data/test/{test_kcf_glycan.rb → tc_kcf_glycan.rb} +0 -0
  100. data/test/{test_kegg.rb → tc_kegg.rb} +13 -0
  101. data/test/{test_linucs.rb → tc_linucs.rb} +0 -0
  102. data/test/{test_mdl.rb → tc_mdl.rb} +20 -0
  103. data/test/{test_mol2.rb → tc_mol2.rb} +1 -1
  104. data/test/{test_morgan.rb → tc_morgan.rb} +0 -0
  105. data/test/tc_net.rb +5 -0
  106. data/test/tc_once.rb +29 -0
  107. data/test/tc_openbabel.rb +57 -0
  108. data/test/{test_pdf.rb → tc_pdf.rb} +0 -0
  109. data/test/{test_prop.rb → tc_prop.rb} +1 -1
  110. data/test/tc_pubchem.rb +32 -0
  111. data/test/{test_rmagick.rb → tc_rmagick.rb} +0 -0
  112. data/test/{test_sbdb.rb → tc_sbdb.rb} +0 -0
  113. data/test/{test_sdf.rb → tc_sdf.rb} +2 -0
  114. data/test/{test_smiles.rb → tc_smiles.rb} +46 -30
  115. data/test/tc_sssr.rb +1 -0
  116. data/test/{test_sub.rb → tc_sub.rb} +0 -0
  117. data/test/tc_subcomp.rb +59 -0
  118. data/test/{test_traverse.rb → tc_traverse.rb} +0 -0
  119. data/test/{test_writer.rb → tc_writer.rb} +0 -0
  120. data/test/{test_xyz.rb → tc_xyz.rb} +0 -0
  121. data/test/ts_current.rb +11 -0
  122. data/test/ts_image.rb +6 -0
  123. data/test/ts_main.rb +12 -0
  124. metadata +259 -194
  125. data/lib/chem/utils/graph_db.rb +0 -146
  126. data/test/test_sssr.rb +0 -18
  127. data/test/test_subcomp.rb +0 -37
@@ -0,0 +1,24 @@
1
+
2
+ module Chem
3
+
4
+ def self.opsin_parse(iupac_name)
5
+ OpsinMolecule.new(iupac_name)
6
+ end
7
+
8
+ class OpsinMolecule
9
+ include Molecule
10
+
11
+ def initialize(iupac_name)
12
+ require 'rcdk'
13
+ @iupac_name = iupac_name
14
+ name2struct = Rjb::import('uk.ac.cam.ch.wwmm.opsin.NameToStructure').new
15
+ @cml = name2struct.parseToCML(iupac_name).toXML.to_s
16
+ @mol = Chem::CMLMolecule.new(@cml)
17
+ end
18
+
19
+ def nodes ; @mol.nodes ; end
20
+
21
+ def edges ; @mol.edges ; end
22
+
23
+ end
24
+ end
@@ -0,0 +1,105 @@
1
+ $: << "/Users/tanaka/proj/chemruby/lib"
2
+ $: << "/Users/tanaka/proj/chemruby/ext"
3
+ $: << "/Users/tanaka/temp/bioruby/lib"
4
+
5
+ require 'bio'
6
+ require 'chem'
7
+
8
+ module Chem
9
+ module PDB
10
+
11
+ class PDBBond
12
+ include Bond
13
+ end
14
+
15
+ class PDBMolecule
16
+ include Chem::Molecule
17
+
18
+ def initialize name
19
+ @name = name
20
+ @nodes = []
21
+ @edges = []
22
+ end
23
+
24
+ # Set connection using het_dictionary
25
+ def set_connection het_dic
26
+ atom_hash = @nodes.inject({}){|ret, atom| ret[atom.name.strip] = atom ; ret}
27
+ con = het_dic.find{|entry| entry.entry_id == @name}
28
+ con.record["CONECT"].each do |b|
29
+ if from = atom_hash[b.name.strip]
30
+ b.other_atoms.each do |to_atom|
31
+ if to = atom_hash[to_atom.strip]
32
+ bond = PDBBond.new
33
+ @edges.push([bond, from, to])
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
39
+
40
+ end
41
+
42
+ end
43
+ end
44
+
45
+ module Bio
46
+
47
+ class PDB
48
+
49
+ def mols
50
+ mols = {}
51
+ @hash["HETATM"].each do |atom|
52
+ mol = (mols[[atom.resName, atom.chainID]] ||= Chem::PDB::PDBMolecule.new(atom.resName))
53
+ mol.nodes.push(atom)
54
+ end
55
+ mols
56
+ end
57
+
58
+ # Represents one entry of het_dictionary.txt
59
+ class ChemicalComponent
60
+ end
61
+
62
+ class Record::HETATM
63
+ include Chem::Atom
64
+ include Chem::Transform::ThreeDimension
65
+ def pos ; @pos ||= Vector[@x, @y, @z] ; end
66
+ end
67
+
68
+ end
69
+
70
+ end
71
+
72
+ if __FILE__ == $0
73
+ dir = "/Users/tanaka/data/"
74
+
75
+ enzyme = Bio::FlatFile.auto(dir + "/pdb/1j4r.ent")
76
+
77
+ mols = {}
78
+ enzyme.each do |entry|
79
+ entry.mols.each do |key, mol|
80
+ p mol.nodes.length
81
+ dic = Bio::FlatFile.auto(dir + "het_dictionary.txt")
82
+ mol.set_connection(dic)
83
+ mol.save("#{key.join('_')}.png")
84
+ end
85
+ exit
86
+ entry.record("HETATM").each do |atom|
87
+ (mols[atom.resName] ||= []).push atom
88
+ end
89
+ end
90
+
91
+ # p mols.keys
92
+ end
93
+
94
+ #c001 = dic.find{|entry| entry.entry_id == "001"}
95
+
96
+ #p c001.hello#.record["CONECT"]
97
+
98
+ #p mols["001"]
99
+
100
+ __END__
101
+
102
+
103
+ pdb.each do |entry|
104
+ p entry.entry_id
105
+ end
@@ -101,6 +101,8 @@ EOL
101
101
 
102
102
  def text(str, x, y, params = {})
103
103
  @vect << "BT"
104
+ color = params[:color].nil? ? "0 0 0" : params[:color].join(" ")
105
+ @vect << "#{color} rg"
104
106
  @vect << "/F1 #{@params[:font]} Tf"
105
107
  @vect << "1 0 0 1 #{x - @params[:font] * 0.4} #{y - @params[:font] * 0.4} Tm"
106
108
  @vect << "(#{str}) Tj"
@@ -1,113 +1,1096 @@
1
1
  #
2
2
  # chem/db/pubchem.rb - PubChem database class
3
3
  #
4
- # Copyright (C) 2005 KADOWAKI Tadashi <kado@kuicr.kyoto-u.ac.jp>
5
- # TANAKA Nobuya <tanaka@kuicr.kyoto-u.ac.jp>
6
- #
4
+ # Copyright (C) 2005-2007 TANAKA Nobuya <nobuya.tanaka@gmail.com>
7
5
  #
8
6
 
9
- require 'uri'
10
- require 'net/http'
7
+ require 'chem'
8
+ # ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
11
9
 
12
10
  module Chem
11
+ # Section 1 Hierarchic element counts
12
+ HierarchicElementCounts = {
13
+ :H => [
14
+ [4, 0],
15
+ [8, 1],
16
+ [16, 2],
17
+ [32, 3]],
18
+ :Li => [
19
+ [ 1, 4],
20
+ [ 2, 5]],
21
+ :B => [
22
+ [ 1, 6],
23
+ [ 2, 7],
24
+ [ 4, 8]],
25
+ :C => [
26
+ [ 2, 9],
27
+ [ 4, 10],
28
+ [ 8, 11],
29
+ [ 16, 12],
30
+ [ 32, 13]],
31
+ :N => [
32
+ [ 1, 14],
33
+ [ 2, 15],
34
+ [ 4, 16],
35
+ [ 8, 17]],
36
+ :O => [
37
+ [ 1, 18],
38
+ [ 2, 19],
39
+ [ 4, 20],
40
+ [ 8, 21],
41
+ [ 16, 22]],
42
+ :F => [
43
+ [ 1, 23],
44
+ [ 2, 24],
45
+ [ 4, 25]],
46
+ :Na => [
47
+ [ 1, 26],
48
+ [ 2, 27]],
49
+ :Si => [
50
+ [ 1, 28],
51
+ [ 2, 29]],
52
+ :P => [
53
+ [ 1, 30],
54
+ [ 2, 31],
55
+ [ 4, 32]],
56
+ :S => [
57
+ [ 1, 33],
58
+ [ 2, 34],
59
+ [ 4, 35],
60
+ [ 8, 36]],
61
+ :Cl => [
62
+ [ 1, 37],
63
+ [ 2, 38],
64
+ [ 4, 39],
65
+ [ 8, 40]],
66
+ :K => [
67
+ [ 1, 41],
68
+ [ 2, 42]],
69
+ :Br => [
70
+ [ 1, 43],
71
+ [ 2, 44],
72
+ [ 4, 45]],
73
+ :I => [
74
+ [ 1, 46],
75
+ [ 2, 47],
76
+ [ 4, 48]],
77
+ :Be => [[ 1, 49 ]],
78
+ :Mg => [[ 1, 50]],
79
+ :Al => [[ 1, 51]],
80
+ :Ca => [[ 1, 52]],
81
+ :Sc => [[ 1, 53]],
82
+ :Ti => [[ 1, 54]],
83
+ :V => [[ 1, 55]],
84
+ :Cr => [[ 1, 56]],
85
+ :Mn => [[ 1, 57]],
86
+ :Fe => [[ 1, 58]],
87
+ :Co => [[ 1, 59]],
88
+ :Ni => [[ 1, 60]],
89
+ :Cu => [[ 1, 61]],
90
+ :Zn => [[ 1, 62]],
91
+ :Ga => [[ 1, 63]],
92
+ :Ge => [[ 1, 64]],
93
+ :As => [[ 1, 65]],
94
+ :Se => [[ 1, 66]],
95
+ :Kr => [[ 1, 67]],
96
+ :Rb => [[ 1, 68]],
97
+ :Sr => [[ 1, 69]],
98
+ :Y => [[ 1, 70]],
99
+ :Zr => [[ 1, 71]],
100
+ :Nb => [[ 1, 72]],
101
+ :Mo => [[ 1, 73]],
102
+ :Ru => [[ 1, 74]],
103
+ :Rh => [[ 1, 75]],
104
+ :Pd => [[ 1, 76]],
105
+ :Ag => [[ 1, 77]],
106
+ :Cd => [[ 1, 78]],
107
+ :In => [[ 1, 79]],
108
+ :Sn => [[ 1, 80]],
109
+ :Sb => [[ 1, 81]],
110
+ :Te => [[ 1, 82]],
111
+ :Xe => [[ 1, 83]],
112
+ :Cs => [[ 1, 84]],
113
+ :Ba => [[ 1, 85]],
114
+ :Lu => [[ 1, 86]],
115
+ :Hf => [[ 1, 87]],
116
+ :Ta => [[ 1, 88]],
117
+ :W => [[ 1, 89]],
118
+ :Re => [[ 1, 90]],
119
+ :Os => [[ 1, 91]],
120
+ :Ir => [[ 1, 92]],
121
+ :Pt => [[ 1, 93]],
122
+ :Au => [[ 1, 94]],
123
+ :Hg => [[ 1, 95]],
124
+ :Tl => [[ 1, 96]],
125
+ :Pb => [[ 1, 97]],
126
+ :Bi => [[ 1, 98]],
127
+ :La => [[ 1, 99]],
128
+ :Ce => [[ 1, 100]],
129
+ :Pr => [[ 1, 101]],
130
+ :Nd => [[ 1, 102]],
131
+ :Pm => [[ 1, 103]],
132
+ :Sm => [[ 1, 104]],
133
+ :Eu => [[ 1, 105]],
134
+ :Gd => [[ 1, 106]],
135
+ :Tb => [[ 1, 107]],
136
+ :Dy => [[ 1, 108]],
137
+ :Ho => [[ 1, 109]],
138
+ :Er => [[ 1, 110]],
139
+ :Tm => [[ 1, 111]],
140
+ :Yb => [[ 1, 112]],
141
+ :Tc => [[ 1, 113]],
142
+ :U => [[ 1, 114]],
143
+ }
13
144
 
14
- module Molecule
15
- def search_pubchem
16
- end
17
- end
145
+ # Section 2
146
+
147
+ # Section 3 Simple atom pairs
148
+ Section3 = {
149
+ 'H-Li' => 263,
150
+ 'Li-Li' => 264,
151
+ 'B-Li' => 265,
152
+ 'C-Li' => 266,
153
+ 'Li-O' => 267,
154
+ 'F-Li' => 268,
155
+ 'Li-P' => 269,
156
+ 'Li-S' => 270,
157
+ 'Cl-Li' => 271,
158
+ 'B-H' => 272,
159
+ 'B-B' => 273,
160
+ 'B-C' => 274,
161
+ 'B-N' => 275,
162
+ 'B-O' => 276,
163
+ 'B-F' => 277,
164
+ 'B-Si' => 278,
165
+ 'B-P' => 279,
166
+ 'B-S' => 280,
167
+ 'B-Cl' => 281,
168
+ 'B-Br' => 282,
169
+ 'C-H' => 283,
170
+ 'C-C' => 284,
171
+ 'C-N' => 285,
172
+ 'C-O' => 286,
173
+ 'C-F' => 287,
174
+ 'C-Na' => 288,
175
+ 'C-Mg' => 289,
176
+ 'Al-C' => 290,
177
+ 'C-Si' => 291,
178
+ 'C-P' => 292,
179
+ 'C-S' => 293,
180
+ 'C-Cl' => 294,
181
+ 'As-C' => 295,
182
+ 'C-Se' => 296,
183
+ 'Br-C' => 297,
184
+ 'C-I' => 298,
185
+ 'H-N' => 299,
186
+ 'N-N' => 300,
187
+ 'N-O' => 301,
188
+ 'F-N' => 302,
189
+ 'N-Si' => 303,
190
+ 'N-P' => 304,
191
+ 'N-S' => 305,
192
+ 'Cl-N' => 306,
193
+ 'Br-N' => 307,
194
+ 'H-O' => 308,
195
+ 'O-O' => 309,
196
+ 'Mg-O' => 310,
197
+ 'Na-O' => 311,
198
+ 'Al-O' => 312,
199
+ 'O-Si' => 313,
200
+ 'O-P' => 314,
201
+ 'K-O' => 315,
202
+ 'F-P' => 316,
203
+ 'F-S' => 317,
204
+ 'Al-H' => 318,
205
+ 'Al-Cl' => 319,
206
+ 'H-Si' => 320,
207
+ 'Si-Si' => 321,
208
+ 'Cl-Si' => 322,
209
+ 'H-P' => 323,
210
+ 'P-P' => 324,
211
+ 'As-H' => 325,
212
+ 'As-As' => 326,
213
+ }
214
+
215
+ # Section 4 Simple atom nearest neighbors
216
+
217
+ Section4 = {
218
+ :C => [
219
+ ["-Br", "-C", 327],
220
+ ["-Br", "-C", "-C", 328],
221
+ ["-Br", "-H", 329],
222
+ ["-Br", "~C", 330],
223
+ ["-Br", "~N", 331],
224
+ ["-C", "-C", 332],
225
+ ["-C", "-C", "-C", 333],
226
+ ["-C", "-C", "-C", "-C", 334],
227
+ ["-C", "-C", "-C", "-H", 335],
228
+ ["-C", "-C", "-C", "-N", 336],
229
+ ["-C", "-C", "-C", "-O", 337],
230
+ ["-C", "-C", "-H", "-N", 338],
231
+ ["-C", "-C", "-H", "-O", 339],
232
+ ["-C", "-C", "-N", 340],
233
+ ["-C", "-C", "-O", 341],
234
+ ["-C", "-Cl", 342],
235
+ ["-C", "-Cl", "-H", 343],
236
+ ["-C", "-H", 344],
237
+ ["-C", "-H", "-N", 345],
238
+ ["-C", "-H", "-O", 346],
239
+ ["-C", "-H", "-O", "-O", 347],
240
+ ["-C", "-H", "-P", 348],
241
+ ["-C", "-H", "-S", 349],
242
+ ["-C", "-I", 350],
243
+ ["-C", "-N", 351],
244
+ ["-C", "-O", 352],
245
+ ["-C", "-S", 353],
246
+ ["-C", "-Si", 354],
247
+ ["-C", "~C", 355],
248
+ ["-C", "~C", "~C", 356],
249
+ ["-C", "~C", "~N", 357],
250
+ ["-C", "~N", 358],
251
+ ["-C", "~N", "~N", 359],
252
+ ["-Cl", "-Cl", 360],
253
+ ["-Cl", "-H", 361],
254
+ ["-Cl", "~C", 362],
255
+ ["-F", "-F", 363],
256
+ ["-F", "~C", 364],
257
+ ["-H", "-N", 365],
258
+ ["-H", "-O", 366],
259
+ ["-H", "-O", "-O", 367],
260
+ ["-H", "-S", 368],
261
+ ["-H", "-Si", 369],
262
+ ["-H", "~C", 370],
263
+ ["-H", "~C", "~C", 371],
264
+ ["-H", "~C", "~N", 372],
265
+ ["-H", "~N", 373],
266
+ ["-H", "-H", "-H", 374],
267
+ ["-N", "-N", 375],
268
+ ["-N", "~C", 376],
269
+ ["-N", "~C", "~C", 377],
270
+ ["-N", "~C", "~N", 378],
271
+ ["-N", "~N", 379],
272
+ ["-O", "-O", 380],
273
+ ["-O", "~C", 381],
274
+ ["-O", "~C", "~C", 382],
275
+ ["-S", "~C", 383],
276
+ ["~C", "~C", 384],
277
+ ["~C", "~C", "~C", 385],
278
+ ["~C", "~C", "~N", 386],
279
+ ["~C", "~N", 387],
280
+ ["~C", "~N", "~N", 388],
281
+ ["~N", "~N", 389]],
282
+ :N => [
283
+ ["-C", "-C", 390],
284
+ ["-C", "-C", "-C", 391],
285
+ ["-C", "-C", "-H", 392],
286
+ ["-C", "-H", 393],
287
+ ["-C", "-H", "-N", 394],
288
+ ["-C", "-O", 395],
289
+ ["-C", "~C", 396],
290
+ ["-C", "~C", "~C", 397],
291
+ ["-H", "-N", 398],
292
+ ["-H", "~C", 399],
293
+ ["-H", "~C", "~C", 400],
294
+ ["-O", "-O", 401],
295
+ ["-O", "~O", 402],
296
+ ["~C", "~C", 403],
297
+ ["~C", "~C", "~C", 404]],
298
+ :O => [
299
+ ["-C", "-C", 405],
300
+ ["-C", "-H", 406],
301
+ ["-C", "-P", 407],
302
+ ["-H", "-S", 408],
303
+ ["~C", "~C", 409]],
304
+ :P => [
305
+ ["-C", "-C", 410],
306
+ ["-O", "-O", 411]],
307
+ :S => [
308
+ ["-C", "-C", 412],
309
+ ["-C", "-H", 413],
310
+ ["-C", "-O", 414]],
311
+ :Si => [
312
+ ["-C", "-C", 415]
313
+ ]
314
+ }
315
+
316
+
317
+
318
+ # Section 5 Detailed atom neighborhoods
319
+
320
+ # Section 6 Simple SMARTS patterns
321
+
322
+ Section6 = {
323
+ "C-C-C#C" => 460,
324
+ "O-C-C=N" => 461,
325
+ "O-C-C=O" => 462,
326
+ "N:C-S-[#1]" => 463,
327
+ "N-C-C=C" => 464,
328
+ "O=S-C-C" => 465,
329
+ "N#C-C=C" => 466,
330
+ "C=N-N-C" => 467,
331
+ "O=S-C-N" => 468,
332
+ "S-S-C:C" => 469,
333
+ "C:C-C=C" => 470,
334
+ "S:C:C:C" => 471,
335
+ "C:N:C-C" => 472,
336
+ "S-C:N:C" => 473,
337
+ "S:C:C:N" => 474,
338
+ "S-C=N-C" => 475,
339
+ "C-O-C=C" => 476,
340
+ "N-N-C:C" => 477,
341
+ "S-C=N-[#1]" => 478,
342
+ "S-C-S-C" => 479,
343
+ "C:S:C-C" => 480,
344
+ "O-S-C:C" => 481,
345
+ "C:N-C:C" => 482,
346
+ "N-S-C:C" => 483,
347
+ "N-C:N:C" => 484,
348
+ "N:C:C:N" => 485,
349
+ "N-C:N:N" => 486,
350
+ "N-C=N-C" => 487,
351
+ "N-C=N-[#1]" => 488,
352
+ "N-C-S-C" => 489,
353
+ "C-C-C=C" => 490,
354
+ "C-N:C-[#1]" => 491,
355
+ "N-C:O:C" => 492,
356
+ "O=C-C:C" => 493,
357
+ "O=C-C:N" => 494,
358
+ "C-N-C:C" => 495,
359
+ "N:N-C-[#1]" => 496,
360
+ "O-C:C:N" => 497,
361
+ "O-C=C-C" => 498,
362
+ "N-C:C:N" => 499,
363
+ "C-S-C:C" => 500,
364
+ "Cl-C:C-C" => 501,
365
+ "N-C=C-[#1]" => 502,
366
+ "Cl-C:C-[#1]" => 503,
367
+ "N:C:N-C" => 504,
368
+ "Cl-C:C-O" => 505,
369
+ "C-C:N:C" => 506,
370
+ "C-C-S-C" => 507,
371
+ "S=C-N-C" => 508,
372
+ "Br-C:C-C" => 509,
373
+ "[#1]-N-N-[#1]" => 510,
374
+ "S=C-N-[#1]" => 511,
375
+ "C-[As]-O-[#1]" => 512,
376
+ "S:C:C-[#1]" => 513,
377
+ "O-N-C-C" => 514,
378
+ "N-N-C-C" => 515,
379
+ "[#1]-C=C-[#1]" => 516,
380
+ "N-N-C-N" => 517,
381
+ "O=C-N-N" => 518,
382
+ "N=C-N-C" => 519,
383
+ "C=C-C:C" => 520,
384
+ "C:N-C-[#1]" => 521,
385
+ "C-N-N-[#1]" => 522,
386
+ "N:C:C-C" => 523,
387
+ "C-C=C-C" => 524,
388
+ "[As]-C:C-[#1]" => 525,
389
+ "Cl-C:C-Cl" => 526,
390
+ "C:C:N-[#1]" => 527,
391
+ "[#1]-N-C-[#1]" => 528,
392
+ "Cl-C-C-Cl" => 529,
393
+ "N:C-C:C" => 530,
394
+ "S-C:C-C" => 531,
395
+ "S-C:C-[#1]" => 532,
396
+ "S-C:C-N" => 533,
397
+ "S-C:C-O" => 534,
398
+ "O=C-C-C" => 535,
399
+ "O=C-C-N" => 536,
400
+ "O=C-C-O" => 537,
401
+ "N=C-C-C" => 538,
402
+ "N=C-C-[#1]" => 539,
403
+ "C-N-C-[#1]" => 540,
404
+ "O-C:C-C" => 541,
405
+ "O-C:C-[#1]" => 542,
406
+ "O-C:C-N" => 543,
407
+ "O-C:C-O" => 544,
408
+ "N-C:C-C" => 545,
409
+ "N-C:C-[#1]" => 546,
410
+ "N-C:C-N" => 547,
411
+ "O-C-C:C" => 548,
412
+ "N-C-C:C" => 549,
413
+ "Cl-C-C-C" => 550,
414
+ "Cl-C-C-O" => 551,
415
+ "C:C-C:C" => 552,
416
+ "O=C-C=C" => 553,
417
+ "Br-C-C-C" => 554,
418
+ "N=C-C=C" => 555,
419
+ "C=C-C-C" => 556,
420
+ "N:C-O-[#1]" => 557,
421
+ "O=N-C:C" => 558,
422
+ "O-C-N-[#1]" => 559,
423
+ "N-C-N-C" => 560,
424
+ "Cl-C-C=O" => 561,
425
+ "Br-C-C=O" => 562,
426
+ "O-C-O-C" => 563,
427
+ "C=C-C=C" => 564,
428
+ "C:C-O-C" => 565,
429
+ "O-C-C-N" => 566,
430
+ "O-C-C-O" => 567,
431
+ "N#C-C-C" => 568,
432
+ "N-C-C-N" => 569,
433
+ "C:C-C-C" => 570,
434
+ "[#1]-C-O-[#1]" => 571,
435
+ "N:C:N:C" => 572,
436
+ "O-C-C=C" => 573,
437
+ "O-C-C:C-C" => 574,
438
+ "O-C-C:C-O" => 575,
439
+ "N=C-C:C-[#1]" => 576,
440
+ "C:C-N-C:C" => 577,
441
+ "C-C:C-C:C" => 578,
442
+ "O=C-C-C-C" => 579,
443
+ "O=C-C-C-N" => 580,
444
+ "O=C-C-C-O" => 581,
445
+ "C-C-C-C-C" => 582,
446
+ "Cl-C:C-O-C" => 583,
447
+ "C:C-C=C-C" => 584,
448
+ "C-C:C-N-C" => 585,
449
+ "C-S-C-C-C" => 586,
450
+ "N-C:C-O-[#1]" => 587,
451
+ "O=C-C-C=O" => 588,
452
+ "C-C:C-O-C" => 589,
453
+ "C-C:C-O-[#1]" => 590,
454
+ "Cl-C-C-C-C" => 591,
455
+ "N-C-C-C-C" => 592,
456
+ "N-C-C-C-N" => 593,
457
+ "C-O-C-C=C" => 594,
458
+ "C:C-C-C-C" => 595,
459
+ "N=C-N-C-C" => 596,
460
+ "O=C-C-C:C" => 597,
461
+ "Cl-C:C:C-C" => 598,
462
+ "[#1]-C-C=C-[#1]" => 599,
463
+ "N-C:C:C-C" => 600,
464
+ "N-C:C:C-N" => 601,
465
+ "O=C-C-N-C" => 602,
466
+ "C-C:C:C-C" => 603,
467
+ "C-O-C-C:C" => 604,
468
+ "O=C-C-O-C" => 605,
469
+ "O-C:C-C-C" => 606,
470
+ "N-C-C-C:C" => 607,
471
+ "C-C-C-C:C" => 608,
472
+ "Cl-C-C-N-C" => 609,
473
+ "C-O-C-O-C" => 610,
474
+ "N-C-C-N-C" => 611,
475
+ "N-C-O-C-C" => 612,
476
+ "C-N-C-C-C" => 613,
477
+ "C-C-O-C-C" => 614,
478
+ "N-C-C-O-C" => 615,
479
+ "C:C:N:N:C" => 616,
480
+ "C-C-C-O-[#1]" => 617,
481
+ "C:C-C-C:C" => 618,
482
+ "O-C-C=C-C" => 619,
483
+ "C:C-O-C-C" => 620,
484
+ "N-C:C:C:N" => 621,
485
+ "O=C-O-C:C" => 622,
486
+ "O=C-C:C-C" => 623,
487
+ "O=C-C:C-N" => 624,
488
+ "O=C-C:C-O" => 625,
489
+ "C-O-C:C-C" => 626,
490
+ "O=[As]-C:C:C" => 627,
491
+ "C-N-C-C:C" => 628,
492
+ "S-C:C:C-N" => 629,
493
+ "O-C:C-O-C" => 630,
494
+ "O-C:C-O-[#1]" => 631,
495
+ "C-C-O-C:C" => 632,
496
+ "N-C-C:C-C" => 633,
497
+ "C-C-C:C-C" => 634,
498
+ "N-N-C-N-[#1]" => 635,
499
+ "C-N-C-N-C" => 636,
500
+ "O-C-C-C-C" => 637,
501
+ "O-C-C-C-N" => 638,
502
+ "O-C-C-C-O" => 639,
503
+ "C=C-C-C-C" => 640,
504
+ "O-C-C-C=C" => 641,
505
+ "O-C-C-C=O" => 642,
506
+ "[#1]-C-C-N-[#1]" => 643,
507
+ "C-C=N-N-C" => 644,
508
+ "O=C-N-C-C" => 645,
509
+ "O=C-N-C-[#1]" => 646,
510
+ "O=C-N-C-N" => 647,
511
+ "O=N-C:C-N" => 648,
512
+ "O=N-C:C-O" => 649,
513
+ "O=C-N-C=O" => 650,
514
+ "O-C:C:C-C" => 651,
515
+ "O-C:C:C-N" => 652,
516
+ "O-C:C:C-O" => 653,
517
+ "N-C-N-C-C" => 654,
518
+ "O-C-C-C:C" => 655,
519
+ "C-C-N-C-C" => 656,
520
+ "C-N-C:C-C" => 657,
521
+ "C-C-S-C-C" => 658,
522
+ "O-C-C-N-C" => 659,
523
+ "C-C=C-C-C" => 660,
524
+ "O-C-O-C-C" => 661,
525
+ "O-C-C-O-C" => 662,
526
+ "O-C-C-O-[#1]" => 663,
527
+ "C-C=C-C=C" => 664,
528
+ "N-C:C-C-C" => 665,
529
+ "C=C-C-O-C" => 666,
530
+ "C=C-C-O-[#1]" => 667,
531
+ "C-C:C-C-C" => 668,
532
+ "Cl-C:C-C=O" => 669,
533
+ "Br-C:C:C-C" => 670,
534
+ "O=C-C=C-C" => 671,
535
+ "O=C-C=C-[#1]" => 672,
536
+ "O=C-C=C-N" => 673,
537
+ "N-C-N-C:C" => 674,
538
+ "Br-C-C-C:C" => 675,
539
+ "N#C-C-C-C" => 676,
540
+ "C-C=C-C:C" => 677,
541
+ "C-C-C=C-C" => 678,
542
+ "C-C-C-C-C-C" => 679,
543
+ "O-C-C-C-C-C" => 680,
544
+ "O-C-C-C-C-O" => 681,
545
+ "O-C-C-C-C-N" => 682,
546
+ "N-C-C-C-C-C" => 683,
547
+ "O=C-C-C-C-C" => 684,
548
+ "O=C-C-C-C-N" => 685,
549
+ "O=C-C-C-C-O" => 686,
550
+ "O=C-C-C-C=O" => 687,
551
+ "C-C-C-C-C-C-C" => 688,
552
+ "O-C-C-C-C-C-C" => 689,
553
+ "O-C-C-C-C-C-O" => 690,
554
+ "O-C-C-C-C-C-N" => 691,
555
+ "O=C-C-C-C-C-C" => 692,
556
+ "O=C-C-C-C-C-O" => 693,
557
+ "O=C-C-C-C-C=O" => 694,
558
+ "O=C-C-C-C-C-N" => 695,
559
+ "C-C-C-C-C-C-C-C" => 696,
560
+ "C-C-C-C-C-C(C)-C" => 697,
561
+ "O-C-C-C-C-C-C-C" => 698,
562
+ "O-C-C-C-C-C(C)-C" => 699,
563
+ "O-C-C-C-C-C-O-C" => 700,
564
+ "O-C-C-C-C-C(O)-C" => 701,
565
+ "O-C-C-C-C-C-N-C" => 702,
566
+ "O-C-C-C-C-C(N)-C" => 703,
567
+ "O=C-C-C-C-C-C-C" => 704,
568
+ "O=C-C-C-C-C(O)-C" => 705,
569
+ "O=C-C-C-C-C(=O)-C" => 706,
570
+ "O=C-C-C-C-C(N)-C" => 707,
571
+ "C-C(C)-C-C" => 708,
572
+ "C-C(C)-C-C-C" => 709,
573
+ "C-C-C(C)-C-C" => 710,
574
+ "C-C(C)(C)-C-C" => 711,
575
+ "C-C(C)-C(C)-C" => 712,
576
+ }
577
+
578
+ # Section 7: Complex SMARTS patterns
18
579
 
19
- module PubChem
20
-
21
- Host="pubchem.ncbi.nlm.nih.gov"
22
- Summary="/summary/summary.cgi"
23
-
24
- class PubChem
25
- Searchpath="/search/"
26
- Query="PreQSrv.cgi"
27
- Boundary="-----boundary-----"
28
-
29
- Data = [
30
- Boundary, "Content-Disposition: form-data; name=\"mode\"", "", "simplequery",
31
- Boundary, "Content-Disposition: form-data; name=\"check\"", "", "remote",
32
- Boundary, "Content-Disposition: form-data; name=\"execution\"", "", "remote",
33
- Boundary, "Content-Disposition: form-data; name=\"queue\"", "", "ssquery",
34
- Boundary, "Content-Disposition: form-data; name=\"simple_searchdata\"", "", '%s',
35
- Boundary, "Content-Disposition: form-data; name=\"simple_cid\"", "", "",
36
- Boundary, "Content-Disposition: form-data; name=\"simple_sid\"", "", "",
37
- Boundary, "Content-Disposition: form-data; name=\"file\"; filename=\"\"",
38
- "Content-Type: application/octet-stream", "", "",
39
- Boundary, "Content-Disposition: form-data; name=\"simple_searchtype\"", "", "fs",
40
- Boundary, "Content-Disposition: form-data; name=\"maxhits\"", "", '%s',
41
- Boundary].join("\x0d\x0a")
42
-
43
- def self.smiles_search(smiles, maxhits=100)
44
- cid = []
45
- url = ""
46
- body = ""
47
- Net::HTTP.version_1_2
48
- Net::HTTP.start(Host, 80) do |http|
49
- body = http.post(Searchpath + Query, Data % [smiles, maxhits],
50
- {'Content-Type' => "multipart/form-data; boundary=#{Boundary}",
51
- 'Referer' => "http://pubchem.ncbi.nlm.nih.gov/search/"}).body
52
- if m = /url="([^"]+)"/.match(body)
53
- body = http.get(Searchpath + m[1]).body
54
- end
55
- while /setTimeout\('document.location.replace\("([^"]+)"\);', (\d+)\)/ =~ body do
56
- sleep($2.to_f/100)
57
- response = http.get(URI.parse($1))
58
- body = response.body
59
- url = response['location']
60
- end
61
- if /PubChem structure search report:(\s|\S)+No hits/ !~ body
62
- # text format
63
- url.sub!(/cmd=Select\+from\+History/, 'cmd=Text&dopt=Brief')
64
- body = http.get(url).body
65
- body.scan(/\d+: CID: (\d+)/).each do |id|
66
- cid.push(PubChemEntry.new(id[0].to_i))
67
- end
68
- # # html format
69
- # body = http.get(url).body
70
- # while /CID: <a href=\"([^"]+)\">(\d+)<\/a>/ =~ body do
71
- # cid.push($2)
72
- # body = $'
73
- # end
74
- end
580
+ Section7 = {
581
+ "Cc1ccc(C)cc1" => 713,
582
+ "Cc1ccc(O)cc1" => 714,
583
+ "Cc1ccc(S)cc1" => 715,
584
+ "Cc1ccc(N)cc1" => 716,
585
+ "Cc1ccc(Cl)cc1" => 717,
586
+ "Cc1ccc(Br)cc1" => 718,
587
+ "Oc1ccc(O)cc1" => 719,
588
+ "Oc1ccc(S)cc1" => 720,
589
+ "Oc1ccc(N)cc1" => 721,
590
+ "Oc1ccc(Cl)cc1" => 722,
591
+ "Oc1ccc(Br)cc1" => 723,
592
+ "Sc1ccc(S)cc1" => 724,
593
+ "Sc1ccc(N)cc1" => 725,
594
+ "Sc1ccc(Cl)cc1" => 726,
595
+ "Sc1ccc(Br)cc1" => 727,
596
+ "Nc1ccc(N)cc1" => 728,
597
+ "Nc1ccc(Cl)cc1" => 729,
598
+ "Nc1ccc(Br)cc1" => 730,
599
+ "Clc1ccc(Cl)cc1" => 731,
600
+ "Clc1ccc(Br)cc1" => 732,
601
+ "Brc1ccc(Br)cc1" => 733,
602
+ "Cc1cc(C)ccc1" => 734,
603
+ "Cc1cc(O)ccc1" => 735,
604
+ "Cc1cc(S)ccc1" => 736,
605
+ "Cc1cc(N)ccc1" => 737,
606
+ "Cc1cc(Cl)ccc1" => 738,
607
+ "Cc1cc(Br)ccc1" => 739,
608
+ "Oc1cc(O)ccc1" => 740,
609
+ "Oc1cc(S)ccc1" => 741,
610
+ "Oc1cc(N)ccc1" => 742,
611
+ "Oc1cc(Cl)ccc1" => 743,
612
+ "Oc1cc(Br)ccc1" => 744,
613
+ "Sc1cc(S)ccc1" => 745,
614
+ "Sc1cc(N)ccc1" => 746,
615
+ "Sc1cc(Cl)ccc1" => 747,
616
+ "Sc1cc(Br)ccc1" => 748,
617
+ "Nc1cc(N)ccc1" => 749,
618
+ "Nc1cc(Cl)ccc1" => 750,
619
+ "Nc1cc(Br)ccc1" => 751,
620
+ "Clc1cc(Cl)ccc1" => 752,
621
+ "Clc1cc(Br)ccc1" => 753,
622
+ "Brc1cc(Br)ccc1" => 754,
623
+ "Cc1c(C)cccc1" => 755,
624
+ "Cc1c(O)cccc1" => 756,
625
+ "Cc1c(S)cccc1" => 757,
626
+ "Cc1c(N)cccc1" => 758,
627
+ "Cc1c(Cl)cccc1" => 759,
628
+ "Cc1c(Br)cccc1" => 760,
629
+ "Oc1c(O)cccc1" => 761,
630
+ "Oc1c(S)cccc1" => 762,
631
+ "Oc1c(N)cccc1" => 763,
632
+ "Oc1c(Cl)cccc1" => 764,
633
+ "Oc1c(Br)cccc1" => 765,
634
+ "Sc1c(S)cccc1" => 766,
635
+ "Sc1c(N)cccc1" => 767,
636
+ "Sc1c(Cl)cccc1" => 768,
637
+ "Sc1c(Br)cccc1" => 769,
638
+ "Nc1c(N)cccc1" => 770,
639
+ "Nc1c(Cl)cccc1" => 771,
640
+ "Nc1c(Br)cccc1" => 772,
641
+ "Clc1c(Cl)cccc1" => 773,
642
+ "Clc1c(Br)cccc1" => 774,
643
+ "Brc1c(Br)cccc1" => 775,
644
+ "CC1CCC(C)CC1" => 776,
645
+ "CC1CCC(O)CC1" => 777,
646
+ "CC1CCC(S)CC1" => 778,
647
+ "CC1CCC(N)CC1" => 779,
648
+ "CC1CCC(Cl)CC1" => 780,
649
+ "CC1CCC(Br)CC1" => 781,
650
+ "OC1CCC(O)CC1" => 782,
651
+ "OC1CCC(S)CC1" => 783,
652
+ "OC1CCC(N)CC1" => 784,
653
+ "OC1CCC(Cl)CC1" => 785,
654
+ "OC1CCC(Br)CC1" => 786,
655
+ "SC1CCC(S)CC1" => 787,
656
+ "SC1CCC(N)CC1" => 788,
657
+ "SC1CCC(Cl)CC1" => 789,
658
+ "SC1CCC(Br)CC1" => 790,
659
+ "NC1CCC(N)CC1" => 791,
660
+ "NC1CCC(Cl)CC1" => 792,
661
+ "NC1CCC(Br)CC1" => 793,
662
+ "ClC1CCC(Cl)CC1" => 794,
663
+ "ClC1CCC(Br)CC1" => 795,
664
+ "BrC1CCC(Br)CC1" => 796,
665
+ "CC1CC(C)CCC1" => 797,
666
+ "CC1CC(O)CCC1" => 798,
667
+ "CC1CC(S)CCC1" => 799,
668
+ "CC1CC(N)CCC1" => 800,
669
+ "CC1CC(Cl)CCC1" => 801,
670
+ "CC1CC(Br)CCC1" => 802,
671
+ "OC1CC(O)CCC1" => 803,
672
+ "OC1CC(S)CCC1" => 804,
673
+ "OC1CC(N)CCC1" => 805,
674
+ "OC1CC(Cl)CCC1" => 806,
675
+ "OC1CC(Br)CCC1" => 807,
676
+ "SC1CC(S)CCC1" => 808,
677
+ "SC1CC(N)CCC1" => 809,
678
+ "SC1CC(Cl)CCC1" => 810,
679
+ "SC1CC(Br)CCC1" => 811,
680
+ "NC1CC(N)CCC1" => 812,
681
+ "NC1CC(Cl)CCC1" => 813,
682
+ "NC1CC(Br)CCC1" => 814,
683
+ "ClC1CC(Cl)CCC1" => 815,
684
+ "ClC1CC(Br)CCC1" => 816,
685
+ "BrC1CC(Br)CCC1" => 817,
686
+ "CC1C(C)CCCC1" => 818,
687
+ "CC1C(O)CCCC1" => 819,
688
+ "CC1C(S)CCCC1" => 820,
689
+ "CC1C(N)CCCC1" => 821,
690
+ "CC1C(Cl)CCCC1" => 822,
691
+ "CC1C(Br)CCCC1" => 823,
692
+ "OC1C(O)CCCC1" => 824,
693
+ "OC1C(S)CCCC1" => 825,
694
+ "OC1C(N)CCCC1" => 826,
695
+ "OC1C(Cl)CCCC1" => 827,
696
+ "OC1C(Br)CCCC1" => 828,
697
+ "SC1C(S)CCCC1" => 829,
698
+ "SC1C(N)CCCC1" => 830,
699
+ "SC1C(Cl)CCCC1" => 831,
700
+ "SC1C(Br)CCCC1" => 832,
701
+ "NC1C(N)CCCC1" => 833,
702
+ "NC1C(Cl)CCCC1" => 834,
703
+ "NC1C(Br)CCCC1" => 835,
704
+ "ClC1C(Cl)CCCC1" => 836,
705
+ "ClC1C(Br)CCCC1" => 837,
706
+ "BrC1C(Br)CCCC1" => 838,
707
+ "CC1CC(C)CC1" => 839,
708
+ "CC1CC(O)CC1" => 840,
709
+ "CC1CC(S)CC1" => 841,
710
+ "CC1CC(N)CC1" => 842,
711
+ "CC1CC(Cl)CC1" => 843,
712
+ "CC1CC(Br)CC1" => 844,
713
+ "OC1CC(O)CC1" => 845,
714
+ "OC1CC(S)CC1" => 846,
715
+ "OC1CC(N)CC1" => 847,
716
+ "OC1CC(Cl)CC1" => 848,
717
+ "OC1CC(Br)CC1" => 849,
718
+ "SC1CC(S)CC1" => 850,
719
+ "SC1CC(N)CC1" => 851,
720
+ "SC1CC(Cl)CC1" => 852,
721
+ "SC1CC(Br)CC1" => 853,
722
+ "NC1CC(N)CC1" => 854,
723
+ "NC1CC(Cl)CC1" => 855,
724
+ "NC1CC(Br)CC1" => 856,
725
+ "ClC1CC(Cl)CC1" => 857,
726
+ "ClC1CC(Br)CC1" => 858,
727
+ "BrC1CC(Br)CC1" => 859,
728
+ "CC1C(C)CCC1" => 860,
729
+ "CC1C(O)CCC1" => 861,
730
+ "CC1C(S)CCC1" => 862,
731
+ "CC1C(N)CCC1" => 863,
732
+ "CC1C(Cl)CCC1" => 864,
733
+ "CC1C(Br)CCC1" => 865,
734
+ "OC1C(O)CCC1" => 866,
735
+ "OC1C(S)CCC1" => 867,
736
+ "OC1C(N)CCC1" => 868,
737
+ "OC1C(Cl)CCC1" => 869,
738
+ "OC1C(Br)CCC1" => 870,
739
+ "SC1C(S)CCC1" => 871,
740
+ "SC1C(N)CCC1" => 872,
741
+ "SC1C(Cl)CCC1" => 873,
742
+ "SC1C(Br)CCC1" => 874,
743
+ "NC1C(N)CCC1" => 875,
744
+ "NC1C(Cl)CC1" => 876,
745
+ "NC1C(Br)CCC1" => 877,
746
+ "ClC1C(Cl)CCC1" => 878,
747
+ "ClC1C(Br)CCC1" => 879,
748
+ "BrC1C(Br)CCC1" => 880,
749
+ }
750
+
751
# Human-readable descriptions of the leading PubChem substructure keys, in
# table order:
#
#   0..114    element-count keys (">= n <element>")
#   115..254  ring-count keys per ring size (3..10)
#   255..262  aromatic / hetero-aromatic ring counts
#
# The ring-size entries start at the offsets listed in RingSizeBaseNum, so the
# array index presumably equals the fingerprint bit number (see
# ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt).
#
# The table is generated from compact descriptions instead of being spelled
# out entry by entry; both the array and its strings are frozen so the shared
# constant cannot be modified by accident.
PubChemSubsKey = lambda {
  # Elements that are tested against several minimum atom counts.
  counted_elements = [
    ["H",  [4, 8, 16, 32]],
    ["Li", [1, 2]],
    ["B",  [1, 2, 4]],
    ["C",  [2, 4, 8, 16, 32]],
    ["N",  [1, 2, 4, 8]],
    ["O",  [1, 2, 4, 8, 16]],
    ["F",  [1, 2, 4]],
    ["Na", [1, 2]],
    ["Si", [1, 2]],
    ["P",  [1, 2, 4]],
    ["S",  [1, 2, 4, 8]],
    ["Cl", [1, 2, 4, 8]],
    ["K",  [1, 2]],
    ["Br", [1, 2, 4]],
    ["I",  [1, 2, 4]],
  ]

  # Elements that only get a single ">= 1" key. The order is significant.
  present_elements = %w(
    Be Mg Al Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Kr Rb Sr Y Zr Nb Mo
    Ru Rh Pd Ag Cd In Sn Sb Te Xe Cs Ba Lu Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi
    La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Tc U
  )

  # The seven ring categories that are repeated for every ring size / count.
  ring_kinds = [
    "any",
    "saturated carbon-only",
    "saturated nitrogen-containing",
    "saturated heteroatom-containing",
    "unsaturated or aromatic carbon-only",
    "unsaturated or aromatic nitrogen-containing",
    "unsaturated or aromatic heteroatom-containing",
  ]

  # [ring size, highest ring count that has its own block of keys]
  ring_sizes = [[3, 2], [4, 2], [5, 5], [6, 5], [7, 2], [8, 2], [9, 1], [10, 1]]

  keys = []

  counted_elements.each do |symbol, thresholds|
    thresholds.each { |count| keys << ">= #{count} #{symbol}" }
  end

  present_elements.each { |symbol| keys << ">= 1 #{symbol}" }

  ring_sizes.each do |size, max_count|
    1.upto(max_count) do |count|
      ring_kinds.each { |kind| keys << ">= #{count} #{kind} ring size #{size}" }
    end
  end

  1.upto(4) do |count|
    # The original table uses the singular "ring" only for a count of one.
    suffix = count == 1 ? "ring" : "rings"
    keys << ">= #{count} aromatic #{suffix}"
    keys << ">= #{count} hetero-aromatic #{suffix}"
  end

  keys.each { |key| key.freeze }
  keys.freeze
}.call
1016
+
1017
# Maps a ring size (number of ring atoms, 3..10) to the index of the first
# ring-count key for that size, i.e. the position of ">= 1 any ring size N"
# in PubChemSubsKey.
#
# Frozen because it is a shared lookup table that is not meant to change at
# runtime.
RingSizeBaseNum = {
  3  => 115,
  4  => 129,
  5  => 143,
  6  => 178,
  7  => 213,
  8  => 227,
  9  => 241,
  10 => 248,
}.freeze
1027
module Molecule

  # Computes a (partial) PubChem-style substructure fingerprint for this
  # molecule and returns it as an Integer in which bit number +n+ is set when
  # key +n+ matches.
  #
  # Implemented:
  #   Section 1    - element counts, via HierarchicElementCounts
  #                  (element => { minimum atom count => bit })
  #   Section 3    - bonded element pairs, via Section3
  #                  ("A-B" with the two element names sorted => bit)
  #   Section 6, 7 - SMARTS patterns, via Section6 / Section7
  #                  (SMARTS string => bit), matched with Chem::OpenBabel
  #
  # Not implemented yet (these bits are always left at 0):
  #   Section 2 - ring counts per ring size. TODO: group +find_sssr+ by ring
  #               size and use RingSizeBaseNum as the bit offset per size.
  #   Section 4 - per-atom neighbourhood keys (Section4 table).
  #
  # The receiver must provide +composition+ (element => atom count) and
  # +edges+ (yielding bond, atom1, atom2).
  def generate_pubchem_subskey
    fp = 0

    # Section 1
    composition.each do |elem, num|
      thresholds = HierarchicElementCounts[elem]
      # Elements without an entry in the table simply contribute no bits.
      next unless thresholds
      thresholds.each do |n_atoms, bit|
        fp |= (1 << bit) if num >= n_atoms
      end
    end

    # Section 3
    edges.collect { |bond, atom1, atom2|
      [atom1.element.to_s, atom2.element.to_s].sort.join("-")
    }.uniq.each do |pair|
      bit = Section3[pair]
      fp |= (1 << bit) if bit
    end

    # Sections 6 and 7
    [Section6, Section7].each do |section|
      section.each do |smarts, bit|
        pat = Chem::OpenBabel::parse_smarts(smarts)
        fp |= (1 << bit) if pat.match(self)
      end
    end

    fp
  end

  # Extract PubChem substructural keys
  # see ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
  #
  # Decodes the Base64 value of the "PUBCHEM_CACTVS_SUBSKEYS" SDF tag and
  # returns the fingerprint as an Integer: the first data bit after the
  # 32-bit prefix becomes bit 0, the next one bit 1, and so on (bits are read
  # most-significant-first within each byte). The 32-bit prefix is skipped;
  # presumably it holds the bit length of the fingerprint - confirm against
  # the specification.
  #
  # Raises ArgumentError when the tag is absent from +sdf_data+.
  def pubchem_subskeys
    b64 = sdf_data["PUBCHEM_CACTVS_SUBSKEYS"]
    if b64.nil?
      raise ArgumentError, "PUBCHEM_CACTVS_SUBSKEYS is missing from sdf_data"
    end

    # "m" decodes Base64 (same as Base64.decode64, without needing the
    # library); "B*" renders every byte as bits, most significant bit first.
    # Working on bytes rather than 32-bit words keeps the trailing bytes of a
    # payload whose length is not a multiple of four.
    bits = b64.unpack("m").first.unpack("B*").first

    # Drop the 32-bit prefix; data shorter than the prefix yields no bits.
    payload = bits[32..-1] || ""

    # Character k of the payload must become bit k of the result, so the
    # first character has to end up as the least significant digit.
    payload.reverse.to_i(2)
  end

end
99
1094
 
100
1095
  end
101
1096
 
102
- if $0 == __FILE__
103
- smiles="CC23(CCC1c4ccc(O)cc4(CCC1C3(CC(O)C2(O))))"
104
- puts "===== CID(s) for SMILES, #{smiles} ====="
105
- cid = Chem::PubChem.smiles_search(smiles)
106
- p cid
107
- puts "===== MOL format data ===="
108
- cid.each do |c|
109
- puts c.get_sdf
110
- end
111
- # p Chem::PubChem.get_xml(cid[0])
112
- # puts Chem::PubChem.get_xml(cid[0]).sdf2mol.data
113
- end