chemruby 0.9.3 → 1.1.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127)
  1. data/README +2 -2
  2. data/Rakefile +67 -63
  3. data/ext/extconf.rb +2 -0
  4. data/ext/subcomp.c +461 -320
  5. data/ext/utils.c +56 -0
  6. data/ext/utils.h +13 -0
  7. data/lib/chem.rb +34 -8
  8. data/lib/chem/db.rb +8 -0
  9. data/lib/chem/db/cansmi.rb +1 -1
  10. data/lib/chem/db/cdx.rb +1 -1
  11. data/lib/chem/db/cml.rb +52 -0
  12. data/lib/chem/db/gd.rb +64 -0
  13. data/lib/chem/db/gspan.rb +2 -2
  14. data/lib/chem/db/kcf_rpair.rb +34 -0
  15. data/lib/chem/db/kegg.rb +35 -1
  16. data/lib/chem/db/mdl.rb +75 -34
  17. data/lib/chem/db/opsin.rb +24 -0
  18. data/lib/chem/db/pdb.rb +105 -0
  19. data/lib/chem/db/pdf.rb +2 -0
  20. data/lib/chem/db/pubchem.rb +1071 -88
  21. data/lib/chem/db/rmagick.rb +5 -3
  22. data/lib/chem/db/sdf.rb +28 -2
  23. data/lib/chem/db/smiles/smiles.ry +27 -25
  24. data/lib/chem/db/smiles/smiparser.rb +29 -27
  25. data/lib/chem/db/types/type_gd.rb +35 -0
  26. data/lib/chem/db/types/type_gspan.rb +2 -2
  27. data/lib/chem/db/types/type_kcf.rb +19 -0
  28. data/lib/chem/db/types/type_kegg.rb +2 -0
  29. data/lib/chem/db/types/type_mdl.rb +1 -1
  30. data/lib/chem/db/types/type_png.rb +5 -1
  31. data/lib/chem/db/types/type_rdf.rb +22 -0
  32. data/lib/chem/db/types/type_xyz.rb +1 -1
  33. data/lib/chem/db/vector.rb +19 -3
  34. data/lib/chem/model.rb +5 -2
  35. data/lib/chem/utils.rb +17 -1
  36. data/lib/chem/utils/bitdb.rb +49 -0
  37. data/lib/chem/utils/cas.rb +28 -0
  38. data/lib/chem/utils/cdk.rb +403 -0
  39. data/lib/chem/utils/fingerprint.rb +98 -0
  40. data/lib/chem/utils/geometry.rb +8 -0
  41. data/lib/chem/utils/net.rb +303 -0
  42. data/lib/chem/utils/once.rb +28 -0
  43. data/lib/chem/utils/openbabel.rb +204 -0
  44. data/lib/chem/utils/sssr.rb +33 -25
  45. data/lib/chem/utils/sub.rb +6 -0
  46. data/lib/chem/utils/transform.rb +9 -8
  47. data/lib/chem/utils/ullmann.rb +138 -95
  48. data/lib/graph.rb +5 -6
  49. data/lib/graph/utils.rb +8 -0
  50. data/sample/calc_maximum_common_subgraph.rb +27 -0
  51. data/sample/calc_properties.rb +9 -0
  52. data/sample/data/atp.mol +69 -0
  53. data/sample/data/pioglitazone.mol +58 -0
  54. data/sample/data/rosiglitazone.mol +55 -0
  55. data/sample/data/troglitazone.mol +70 -0
  56. data/sample/find_compound_by_keggapi.rb +19 -0
  57. data/sample/generate_inchi.rb +7 -0
  58. data/sample/generate_substructurekey.rb +11 -0
  59. data/sample/images/ex6.rb +17 -0
  60. data/sample/images/ex7.rb +18 -0
  61. data/sample/iupac2mol.rb +8 -0
  62. data/sample/kekule.rb +13 -0
  63. data/sample/logp.rb +4 -0
  64. data/sample/mcs.rb +13 -0
  65. data/sample/mol2pdf.rb +8 -0
  66. data/sample/pubchem_fetch.rb +8 -0
  67. data/sample/pubchem_search.rb +12 -0
  68. data/sample/rosiglitazone.mol +57 -0
  69. data/sample/smarts.rb +10 -0
  70. data/sample/structure_match.rb +8 -0
  71. data/sample/structure_match_color.rb +22 -0
  72. data/sample/thiazolidinedione.mol +19 -0
  73. data/sample/troglitazone.mol +232 -0
  74. data/sample/vicinity.rb +8 -0
  75. data/test/data/CID_704.sdf +236 -0
  76. data/test/data/CID_994.sdf +146 -0
  77. data/test/data/db_EXPT03276.txt +321 -0
  78. data/test/data/pioglitazone.mol +58 -0
  79. data/test/data/rosiglitazone.mol +55 -0
  80. data/test/data/thiazolidinedione.mol +19 -0
  81. data/test/data/troglitazone.mol +70 -0
  82. data/test/{test_adj.rb → tc_adj.rb} +0 -0
  83. data/test/{test_canonical_smiles.rb → tc_canonical_smiles.rb} +0 -0
  84. data/test/tc_casrn.rb +17 -0
  85. data/test/tc_cdk.rb +89 -0
  86. data/test/{test_cdx.rb → tc_cdx.rb} +0 -0
  87. data/test/{test_chem.rb → tc_chem.rb} +0 -0
  88. data/test/{test_cluster.rb → tc_cluster.rb} +0 -0
  89. data/test/{test_db.rb → tc_db.rb} +0 -0
  90. data/test/tc_develop.rb +38 -0
  91. data/test/tc_drugbank.rb +13 -0
  92. data/test/{test_eps.rb → tc_eps.rb} +0 -0
  93. data/test/tc_gd.rb +8 -0
  94. data/test/{test_geometry.rb → tc_geometry.rb} +0 -0
  95. data/test/tc_graph.rb +15 -0
  96. data/test/{test_gspan.rb → tc_gspan.rb} +0 -0
  97. data/test/{test_iupac.rb → tc_iupac.rb} +0 -0
  98. data/test/{test_kcf.rb → tc_kcf.rb} +0 -0
  99. data/test/{test_kcf_glycan.rb → tc_kcf_glycan.rb} +0 -0
  100. data/test/{test_kegg.rb → tc_kegg.rb} +13 -0
  101. data/test/{test_linucs.rb → tc_linucs.rb} +0 -0
  102. data/test/{test_mdl.rb → tc_mdl.rb} +20 -0
  103. data/test/{test_mol2.rb → tc_mol2.rb} +1 -1
  104. data/test/{test_morgan.rb → tc_morgan.rb} +0 -0
  105. data/test/tc_net.rb +5 -0
  106. data/test/tc_once.rb +29 -0
  107. data/test/tc_openbabel.rb +57 -0
  108. data/test/{test_pdf.rb → tc_pdf.rb} +0 -0
  109. data/test/{test_prop.rb → tc_prop.rb} +1 -1
  110. data/test/tc_pubchem.rb +32 -0
  111. data/test/{test_rmagick.rb → tc_rmagick.rb} +0 -0
  112. data/test/{test_sbdb.rb → tc_sbdb.rb} +0 -0
  113. data/test/{test_sdf.rb → tc_sdf.rb} +2 -0
  114. data/test/{test_smiles.rb → tc_smiles.rb} +46 -30
  115. data/test/tc_sssr.rb +1 -0
  116. data/test/{test_sub.rb → tc_sub.rb} +0 -0
  117. data/test/tc_subcomp.rb +59 -0
  118. data/test/{test_traverse.rb → tc_traverse.rb} +0 -0
  119. data/test/{test_writer.rb → tc_writer.rb} +0 -0
  120. data/test/{test_xyz.rb → tc_xyz.rb} +0 -0
  121. data/test/ts_current.rb +11 -0
  122. data/test/ts_image.rb +6 -0
  123. data/test/ts_main.rb +12 -0
  124. metadata +259 -194
  125. data/lib/chem/utils/graph_db.rb +0 -146
  126. data/test/test_sssr.rb +0 -18
  127. data/test/test_subcomp.rb +0 -37
@@ -0,0 +1,24 @@
1
+
2
+ module Chem
3
+
4
+ def self.opsin_parse(iupac_name)
5
+ OpsinMolecule.new(iupac_name)
6
+ end
7
+
8
+ class OpsinMolecule
9
+ include Molecule
10
+
11
+ def initialize(iupac_name)
12
+ require 'rcdk'
13
+ @iupac_name = iupac_name
14
+ name2struct = Rjb::import('uk.ac.cam.ch.wwmm.opsin.NameToStructure').new
15
+ @cml = name2struct.parseToCML(iupac_name).toXML.to_s
16
+ @mol = Chem::CMLMolecule.new(@cml)
17
+ end
18
+
19
+ def nodes ; @mol.nodes ; end
20
+
21
+ def edges ; @mol.edges ; end
22
+
23
+ end
24
+ end
@@ -0,0 +1,105 @@
1
+ $: << "/Users/tanaka/proj/chemruby/lib"
2
+ $: << "/Users/tanaka/proj/chemruby/ext"
3
+ $: << "/Users/tanaka/temp/bioruby/lib"
4
+
5
+ require 'bio'
6
+ require 'chem'
7
+
8
+ module Chem
9
+ module PDB
10
+
11
+ class PDBBond
12
+ include Bond
13
+ end
14
+
15
+ class PDBMolecule
16
+ include Chem::Molecule
17
+
18
+ def initialize name
19
+ @name = name
20
+ @nodes = []
21
+ @edges = []
22
+ end
23
+
24
+ # Build this molecule's edges from the CONECT records of its het_dictionary entry
25
+ def set_connection het_dic
26
+ atom_hash = @nodes.inject({}){|ret, atom| ret[atom.name.strip] = atom ; ret}
27
+ con = het_dic.find{|entry| entry.entry_id == @name}
28
+ con.record["CONECT"].each do |b|
29
+ if from = atom_hash[b.name.strip]
30
+ b.other_atoms.each do |to_atom|
31
+ if to = atom_hash[to_atom.strip]
32
+ bond = PDBBond.new
33
+ @edges.push([bond, from, to])
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
39
+
40
+ end
41
+
42
+ end
43
+ end
44
+
45
+ module Bio
46
+
47
+ class PDB
48
+
49
+ def mols
50
+ mols = {}
51
+ @hash["HETATM"].each do |atom|
52
+ mol = (mols[[atom.resName, atom.chainID]] ||= Chem::PDB::PDBMolecule.new(atom.resName))
53
+ mol.nodes.push(atom)
54
+ end
55
+ mols
56
+ end
57
+
58
+ # Represents one entry of het_dictionary.txt
59
+ class ChemicalComponent
60
+ end
61
+
62
+ class Record::HETATM
63
+ include Chem::Atom
64
+ include Chem::Transform::ThreeDimension
65
+ def pos ; @pos ||= Vector[@x, @y, @z] ; end
66
+ end
67
+
68
+ end
69
+
70
+ end
71
+
72
+ if __FILE__ == $0
73
+ dir = "/Users/tanaka/data/"
74
+
75
+ enzyme = Bio::FlatFile.auto(dir + "/pdb/1j4r.ent")
76
+
77
+ mols = {}
78
+ enzyme.each do |entry|
79
+ entry.mols.each do |key, mol|
80
+ p mol.nodes.length
81
+ dic = Bio::FlatFile.auto(dir + "het_dictionary.txt")
82
+ mol.set_connection(dic)
83
+ mol.save("#{key.join('_')}.png")
84
+ end
85
+ exit
86
+ entry.record("HETATM").each do |atom|
87
+ (mols[atom.resName] ||= []).push atom
88
+ end
89
+ end
90
+
91
+ # p mols.keys
92
+ end
93
+
94
+ #c001 = dic.find{|entry| entry.entry_id == "001"}
95
+
96
+ #p c001.hello#.record["CONECT"]
97
+
98
+ #p mols["001"]
99
+
100
+ __END__
101
+
102
+
103
+ pdb.each do |entry|
104
+ p entry.entry_id
105
+ end
@@ -101,6 +101,8 @@ EOL
101
101
 
102
102
  def text(str, x, y, params = {})
103
103
  @vect << "BT"
104
+ color = params[:color].nil? ? "0 0 0" : params[:color].join(" ")
105
+ @vect << "#{color} rg"
104
106
  @vect << "/F1 #{@params[:font]} Tf"
105
107
  @vect << "1 0 0 1 #{x - @params[:font] * 0.4} #{y - @params[:font] * 0.4} Tm"
106
108
  @vect << "(#{str}) Tj"
@@ -1,113 +1,1096 @@
1
1
  #
2
2
  # chem/db/pubchem.rb - PubChem database class
3
3
  #
4
- # Copyright (C) 2005 KADOWAKI Tadashi <kado@kuicr.kyoto-u.ac.jp>
5
- # TANAKA Nobuya <tanaka@kuicr.kyoto-u.ac.jp>
6
- #
4
+ # Copyright (C) 2005-2007 TANAKA Nobuya <nobuya.tanaka@gmail.com>
7
5
  #
8
6
 
9
- require 'uri'
10
- require 'net/http'
7
+ require 'chem'
8
+ # ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
11
9
 
12
10
  module Chem
11
+ # Section 1.
12
+ HierarchicElementCounts = {
13
+ :H => [
14
+ [4, 0],
15
+ [8, 1],
16
+ [16, 2],
17
+ [32, 3]],
18
+ :Li => [
19
+ [ 1, 4],
20
+ [ 2, 5]],
21
+ :B => [
22
+ [ 1, 6],
23
+ [ 2, 7],
24
+ [ 4, 8]],
25
+ :C => [
26
+ [ 2, 9],
27
+ [ 4, 10],
28
+ [ 8, 11],
29
+ [ 16, 12],
30
+ [ 32, 13]],
31
+ :N => [
32
+ [ 1, 14],
33
+ [ 2, 15],
34
+ [ 4, 16],
35
+ [ 8, 17]],
36
+ :O => [
37
+ [ 1, 18],
38
+ [ 2, 19],
39
+ [ 4, 20],
40
+ [ 8, 21],
41
+ [ 16, 22]],
42
+ :F => [
43
+ [ 1, 23],
44
+ [ 2, 24],
45
+ [ 4, 25]],
46
+ :Na => [
47
+ [ 1, 26],
48
+ [ 2, 27]],
49
+ :Si => [
50
+ [ 1, 28],
51
+ [ 2, 29]],
52
+ :P => [
53
+ [ 1, 30],
54
+ [ 2, 31],
55
+ [ 4, 32]],
56
+ :S => [
57
+ [ 1, 33],
58
+ [ 2, 34],
59
+ [ 4, 35],
60
+ [ 8, 36]],
61
+ :Cl => [
62
+ [ 1, 37],
63
+ [ 2, 38],
64
+ [ 4, 39],
65
+ [ 8, 40]],
66
+ :K => [
67
+ [ 1, 41],
68
+ [ 2, 42]],
69
+ :Br => [
70
+ [ 1, 43],
71
+ [ 2, 44],
72
+ [ 4, 45]],
73
+ :I => [
74
+ [ 1, 46],
75
+ [ 2, 47],
76
+ [ 4, 48]],
77
+ :Be => [[ 1, 49 ]],
78
+ :Mg => [[ 1, 50]],
79
+ :Al => [[ 1, 51]],
80
+ :Ca => [[ 1, 52]],
81
+ :Sc => [[ 1, 53]],
82
+ :Ti => [[ 1, 54]],
83
+ :V => [[ 1, 55]],
84
+ :Cr => [[ 1, 56]],
85
+ :Mn => [[ 1, 57]],
86
+ :Fe => [[ 1, 58]],
87
+ :Co => [[ 1, 59]],
88
+ :Ni => [[ 1, 60]],
89
+ :Cu => [[ 1, 61]],
90
+ :Zn => [[ 1, 62]],
91
+ :Ga => [[ 1, 63]],
92
+ :Ge => [[ 1, 64]],
93
+ :As => [[ 1, 65]],
94
+ :Se => [[ 1, 66]],
95
+ :Kr => [[ 1, 67]],
96
+ :Rb => [[ 1, 68]],
97
+ :Sr => [[ 1, 69]],
98
+ :Y => [[ 1, 70]],
99
+ :Zr => [[ 1, 71]],
100
+ :Nb => [[ 1, 72]],
101
+ :Mo => [[ 1, 73]],
102
+ :Ru => [[ 1, 74]],
103
+ :Rh => [[ 1, 75]],
104
+ :Pd => [[ 1, 76]],
105
+ :Ag => [[ 1, 77]],
106
+ :Cd => [[ 1, 78]],
107
+ :In => [[ 1, 79]],
108
+ :Sn => [[ 1, 80]],
109
+ :Sb => [[ 1, 81]],
110
+ :Te => [[ 1, 82]],
111
+ :Xe => [[ 1, 83]],
112
+ :Cs => [[ 1, 84]],
113
+ :Ba => [[ 1, 85]],
114
+ :Lu => [[ 1, 86]],
115
+ :Hf => [[ 1, 87]],
116
+ :Ta => [[ 1, 88]],
117
+ :W => [[ 1, 89]],
118
+ :Re => [[ 1, 90]],
119
+ :Os => [[ 1, 91]],
120
+ :Ir => [[ 1, 92]],
121
+ :Pt => [[ 1, 93]],
122
+ :Au => [[ 1, 94]],
123
+ :Hg => [[ 1, 95]],
124
+ :Tl => [[ 1, 96]],
125
+ :Pb => [[ 1, 97]],
126
+ :Bi => [[ 1, 98]],
127
+ :La => [[ 1, 99]],
128
+ :Ce => [[ 1, 100]],
129
+ :Pr => [[ 1, 101]],
130
+ :Nd => [[ 1, 102]],
131
+ :Pm => [[ 1, 103]],
132
+ :Sm => [[ 1, 104]],
133
+ :Eu => [[ 1, 105]],
134
+ :Gd => [[ 1, 106]],
135
+ :Tb => [[ 1, 107]],
136
+ :Dy => [[ 1, 108]],
137
+ :Ho => [[ 1, 109]],
138
+ :Er => [[ 1, 110]],
139
+ :Tm => [[ 1, 111]],
140
+ :Yb => [[ 1, 112]],
141
+ :Tc => [[ 1, 113]],
142
+ :U => [[ 1, 114]],
143
+ }
13
144
 
14
- module Molecule
15
- def search_pubchem
16
- end
17
- end
145
+ # Section 2
146
+
147
+ # Section 3
148
+ Section3 = {
149
+ 'H-Li' => 263,
150
+ 'Li-Li' => 264,
151
+ 'B-Li' => 265,
152
+ 'C-Li' => 266,
153
+ 'Li-O' => 267,
154
+ 'F-Li' => 268,
155
+ 'Li-P' => 269,
156
+ 'Li-S' => 270,
157
+ 'Cl-Li' => 271,
158
+ 'B-H' => 272,
159
+ 'B-B' => 273,
160
+ 'B-C' => 274,
161
+ 'B-N' => 275,
162
+ 'B-O' => 276,
163
+ 'B-F' => 277,
164
+ 'B-Si' => 278,
165
+ 'B-P' => 279,
166
+ 'B-S' => 280,
167
+ 'B-Cl' => 281,
168
+ 'B-Br' => 282,
169
+ 'C-H' => 283,
170
+ 'C-C' => 284,
171
+ 'C-N' => 285,
172
+ 'C-O' => 286,
173
+ 'C-F' => 287,
174
+ 'C-Na' => 288,
175
+ 'C-Mg' => 289,
176
+ 'Al-C' => 290,
177
+ 'C-Si' => 291,
178
+ 'C-P' => 292,
179
+ 'C-S' => 293,
180
+ 'C-Cl' => 294,
181
+ 'As-C' => 295,
182
+ 'C-Se' => 296,
183
+ 'Br-C' => 297,
184
+ 'C-I' => 298,
185
+ 'H-N' => 299,
186
+ 'N-N' => 300,
187
+ 'N-O' => 301,
188
+ 'F-N' => 302,
189
+ 'N-Si' => 303,
190
+ 'N-P' => 304,
191
+ 'N-S' => 305,
192
+ 'Cl-N' => 306,
193
+ 'Br-N' => 307,
194
+ 'H-O' => 308,
195
+ 'O-O' => 309,
196
+ 'Mg-O' => 310,
197
+ 'Na-O' => 311,
198
+ 'Al-O' => 312,
199
+ 'O-Si' => 313,
200
+ 'O-P' => 314,
201
+ 'K-O' => 315,
202
+ 'F-P' => 316,
203
+ 'F-S' => 317,
204
+ 'Al-H' => 318,
205
+ 'Al-Cl' => 319,
206
+ 'H-Si' => 320,
207
+ 'Si-Si' => 321,
208
+ 'Cl-Si' => 322,
209
+ 'H-P' => 323,
210
+ 'P-P' => 324,
211
+ 'As-H' => 325,
212
+ 'As-As' => 326,
213
+ }
214
+
215
+ # Section 4 Simple atom nearest neighbors
216
+
217
+ Section4 = {
218
+ :C => [
219
+ ["-Br", "-C", 327],
220
+ ["-Br", "-C", "-C", 328],
221
+ ["-Br", "-H", 329],
222
+ ["-Br", "~C", 330],
223
+ ["-Br", "~N", 331],
224
+ ["-C", "-C", 332],
225
+ ["-C", "-C", "-C", 333],
226
+ ["-C", "-C", "-C", "-C", 334],
227
+ ["-C", "-C", "-C", "-H", 335],
228
+ ["-C", "-C", "-C", "-N", 336],
229
+ ["-C", "-C", "-C", "-O", 337],
230
+ ["-C", "-C", "-H", "-N", 338],
231
+ ["-C", "-C", "-H", "-O", 339],
232
+ ["-C", "-C", "-N", 340],
233
+ ["-C", "-C", "-O", 341],
234
+ ["-C", "-Cl", 342],
235
+ ["-C", "-Cl", "-H", 343],
236
+ ["-C", "-H", 344],
237
+ ["-C", "-H", "-N", 345],
238
+ ["-C", "-H", "-O", 346],
239
+ ["-C", "-H", "-O", "-O", 347],
240
+ ["-C", "-H", "-P", 348],
241
+ ["-C", "-H", "-S", 349],
242
+ ["-C", "-I", 350],
243
+ ["-C", "-N", 351],
244
+ ["-C", "-O", 352],
245
+ ["-C", "-S", 353],
246
+ ["-C", "-Si", 354],
247
+ ["-C", "~C", 355],
248
+ ["-C", "~C", "~C", 356],
249
+ ["-C", "~C", "~N", 357],
250
+ ["-C", "~N", 358],
251
+ ["-C", "~N", "~N", 359],
252
+ ["-Cl", "-Cl", 360],
253
+ ["-Cl", "-H", 361],
254
+ ["-Cl", "~C", 362],
255
+ ["-F", "-F", 363],
256
+ ["-F", "~C", 364],
257
+ ["-H", "-N", 365],
258
+ ["-H", "-O", 366],
259
+ ["-H", "-O", "-O", 367],
260
+ ["-H", "-S", 368],
261
+ ["-H", "-Si", 369],
262
+ ["-H", "~C", 370],
263
+ ["-H", "~C", "~C", 371],
264
+ ["-H", "~C", "~N", 372],
265
+ ["-H", "~N", 373],
266
+ ["-H", "-H", "-H", 374],
267
+ ["-N", "-N", 375],
268
+ ["-N", "~C", 376],
269
+ ["-N", "~C", "~C", 377],
270
+ ["-N", "~C", "~N", 378],
271
+ ["-N", "~N", 379],
272
+ ["-O", "-O", 380],
273
+ ["-O", "~C", 381],
274
+ ["-O", "~C", "~C", 382],
275
+ ["-S", "~C", 383],
276
+ ["~C", "~C", 384],
277
+ ["~C", "~C", "~C", 385],
278
+ ["~C", "~C", "~N", 386],
279
+ ["~C", "~N", 387],
280
+ ["~C", "~N", "~N", 388],
281
+ ["~N", "~N", 389]],
282
+ :N => [
283
+ ["-C", "-C", 390],
284
+ ["-C", "-C", "-C", 391],
285
+ ["-C", "-C", "-H", 392],
286
+ ["-C", "-H", 393],
287
+ ["-C", "-H", "-N", 394],
288
+ ["-C", "-O", 395],
289
+ ["-C", "~C", 396],
290
+ ["-C", "~C", "~C", 397],
291
+ ["-H", "-N", 398],
292
+ ["-H", "~C", 399],
293
+ ["-H", "~C", "~C", 400],
294
+ ["-O", "-O", 401],
295
+ ["-O", "~O", 402],
296
+ ["~C", "~C", 403],
297
+ ["~C", "~C", "~C", 404]],
298
+ :O => [
299
+ ["-C", "-C", 405],
300
+ ["-C", "-H", 406],
301
+ ["-C", "-P", 407],
302
+ ["-H", "-S", 408],
303
+ ["~C", "~C", 409]],
304
+ :P => [
305
+ ["-C", "-C", 410],
306
+ ["-O", "-O", 411]],
307
+ :S => [
308
+ ["-C", "-C", 412],
309
+ ["-C", "-H", 413],
310
+ ["-C", "-O", 414]],
311
+ :Si => [
312
+ ["-C", "-C", 415]
313
+ ]
314
+ }
315
+
316
+
317
+
318
+ # Section 5 Detailed atom neighborhoods
319
+
320
+ # Section 6 Simple SMARTS patterns
321
+
322
+ Section6 = {
323
+ "C-C-C#C" => 460,
324
+ "O-C-C=N" => 461,
325
+ "O-C-C=O" => 462,
326
+ "N:C-S-[#1]" => 463,
327
+ "N-C-C=C" => 464,
328
+ "O=S-C-C" => 465,
329
+ "N#C-C=C" => 466,
330
+ "C=N-N-C" => 467,
331
+ "O=S-C-N" => 468,
332
+ "S-S-C:C" => 469,
333
+ "C:C-C=C" => 470,
334
+ "S:C:C:C" => 471,
335
+ "C:N:C-C" => 472,
336
+ "S-C:N:C" => 473,
337
+ "S:C:C:N" => 474,
338
+ "S-C=N-C" => 475,
339
+ "C-O-C=C" => 476,
340
+ "N-N-C:C" => 477,
341
+ "S-C=N-[#1]" => 478,
342
+ "S-C-S-C" => 479,
343
+ "C:S:C-C" => 480,
344
+ "O-S-C:C" => 481,
345
+ "C:N-C:C" => 482,
346
+ "N-S-C:C" => 483,
347
+ "N-C:N:C" => 484,
348
+ "N:C:C:N" => 485,
349
+ "N-C:N:N" => 486,
350
+ "N-C=N-C" => 487,
351
+ "N-C=N-[#1]" => 488,
352
+ "N-C-S-C" => 489,
353
+ "C-C-C=C" => 490,
354
+ "C-N:C-[#1]" => 491,
355
+ "N-C:O:C" => 492,
356
+ "O=C-C:C" => 493,
357
+ "O=C-C:N" => 494,
358
+ "C-N-C:C" => 495,
359
+ "N:N-C-[#1]" => 496,
360
+ "O-C:C:N" => 497,
361
+ "O-C=C-C" => 498,
362
+ "N-C:C:N" => 499,
363
+ "C-S-C:C" => 500,
364
+ "Cl-C:C-C" => 501,
365
+ "N-C=C-[#1]" => 502,
366
+ "Cl-C:C-[#1]" => 503,
367
+ "N:C:N-C" => 504,
368
+ "Cl-C:C-O" => 505,
369
+ "C-C:N:C" => 506,
370
+ "C-C-S-C" => 507,
371
+ "S=C-N-C" => 508,
372
+ "Br-C:C-C" => 509,
373
+ "[#1]-N-N-[#1]" => 510,
374
+ "S=C-N-[#1]" => 511,
375
+ "C-[As]-O-[#1]" => 512,
376
+ "S:C:C-[#1]" => 513,
377
+ "O-N-C-C" => 514,
378
+ "N-N-C-C" => 515,
379
+ "[#1]-C=C-[#1]" => 516,
380
+ "N-N-C-N" => 517,
381
+ "O=C-N-N" => 518,
382
+ "N=C-N-C" => 519,
383
+ "C=C-C:C" => 520,
384
+ "C:N-C-[#1]" => 521,
385
+ "C-N-N-[#1]" => 522,
386
+ "N:C:C-C" => 523,
387
+ "C-C=C-C" => 524,
388
+ "[As]-C:C-[#1]" => 525,
389
+ "Cl-C:C-Cl" => 526,
390
+ "C:C:N-[#1]" => 527,
391
+ "[#1]-N-C-[#1]" => 528,
392
+ "Cl-C-C-Cl" => 529,
393
+ "N:C-C:C" => 530,
394
+ "S-C:C-C" => 531,
395
+ "S-C:C-[#1]" => 532,
396
+ "S-C:C-N" => 533,
397
+ "S-C:C-O" => 534,
398
+ "O=C-C-C" => 535,
399
+ "O=C-C-N" => 536,
400
+ "O=C-C-O" => 537,
401
+ "N=C-C-C" => 538,
402
+ "N=C-C-[#1]" => 539,
403
+ "C-N-C-[#1]" => 540,
404
+ "O-C:C-C" => 541,
405
+ "O-C:C-[#1]" => 542,
406
+ "O-C:C-N" => 543,
407
+ "O-C:C-O" => 544,
408
+ "N-C:C-C" => 545,
409
+ "N-C:C-[#1]" => 546,
410
+ "N-C:C-N" => 547,
411
+ "O-C-C:C" => 548,
412
+ "N-C-C:C" => 549,
413
+ "Cl-C-C-C" => 550,
414
+ "Cl-C-C-O" => 551,
415
+ "C:C-C:C" => 552,
416
+ "O=C-C=C" => 553,
417
+ "Br-C-C-C" => 554,
418
+ "N=C-C=C" => 555,
419
+ "C=C-C-C" => 556,
420
+ "N:C-O-[#1]" => 557,
421
+ "O=N-C:C" => 558,
422
+ "O-C-N-[#1]" => 559,
423
+ "N-C-N-C" => 560,
424
+ "Cl-C-C=O" => 561,
425
+ "Br-C-C=O" => 562,
426
+ "O-C-O-C" => 563,
427
+ "C=C-C=C" => 564,
428
+ "C:C-O-C" => 565,
429
+ "O-C-C-N" => 566,
430
+ "O-C-C-O" => 567,
431
+ "N#C-C-C" => 568,
432
+ "N-C-C-N" => 569,
433
+ "C:C-C-C" => 570,
434
+ "[#1]-C-O-[#1]" => 571,
435
+ "N:C:N:C" => 572,
436
+ "O-C-C=C" => 573,
437
+ "O-C-C:C-C" => 574,
438
+ "O-C-C:C-O" => 575,
439
+ "N=C-C:C-[#1]" => 576,
440
+ "C:C-N-C:C" => 577,
441
+ "C-C:C-C:C" => 578,
442
+ "O=C-C-C-C" => 579,
443
+ "O=C-C-C-N" => 580,
444
+ "O=C-C-C-O" => 581,
445
+ "C-C-C-C-C" => 582,
446
+ "Cl-C:C-O-C" => 583,
447
+ "C:C-C=C-C" => 584,
448
+ "C-C:C-N-C" => 585,
449
+ "C-S-C-C-C" => 586,
450
+ "N-C:C-O-[#1]" => 587,
451
+ "O=C-C-C=O" => 588,
452
+ "C-C:C-O-C" => 589,
453
+ "C-C:C-O-[#1]" => 590,
454
+ "Cl-C-C-C-C" => 591,
455
+ "N-C-C-C-C" => 592,
456
+ "N-C-C-C-N" => 593,
457
+ "C-O-C-C=C" => 594,
458
+ "C:C-C-C-C" => 595,
459
+ "N=C-N-C-C" => 596,
460
+ "O=C-C-C:C" => 597,
461
+ "Cl-C:C:C-C" => 598,
462
+ "[#1]-C-C=C-[#1]" => 599,
463
+ "N-C:C:C-C" => 600,
464
+ "N-C:C:C-N" => 601,
465
+ "O=C-C-N-C" => 602,
466
+ "C-C:C:C-C" => 603,
467
+ "C-O-C-C:C" => 604,
468
+ "O=C-C-O-C" => 605,
469
+ "O-C:C-C-C" => 606,
470
+ "N-C-C-C:C" => 607,
471
+ "C-C-C-C:C" => 608,
472
+ "Cl-C-C-N-C" => 609,
473
+ "C-O-C-O-C" => 610,
474
+ "N-C-C-N-C" => 611,
475
+ "N-C-O-C-C" => 612,
476
+ "C-N-C-C-C" => 613,
477
+ "C-C-O-C-C" => 614,
478
+ "N-C-C-O-C" => 615,
479
+ "C:C:N:N:C" => 616,
480
+ "C-C-C-O-[#1]" => 617,
481
+ "C:C-C-C:C" => 618,
482
+ "O-C-C=C-C" => 619,
483
+ "C:C-O-C-C" => 620,
484
+ "N-C:C:C:N" => 621,
485
+ "O=C-O-C:C" => 622,
486
+ "O=C-C:C-C" => 623,
487
+ "O=C-C:C-N" => 624,
488
+ "O=C-C:C-O" => 625,
489
+ "C-O-C:C-C" => 626,
490
+ "O=[As]-C:C:C" => 627,
491
+ "C-N-C-C:C" => 628,
492
+ "S-C:C:C-N" => 629,
493
+ "O-C:C-O-C" => 630,
494
+ "O-C:C-O-[#1]" => 631,
495
+ "C-C-O-C:C" => 632,
496
+ "N-C-C:C-C" => 633,
497
+ "C-C-C:C-C" => 634,
498
+ "N-N-C-N-[#1]" => 635,
499
+ "C-N-C-N-C" => 636,
500
+ "O-C-C-C-C" => 637,
501
+ "O-C-C-C-N" => 638,
502
+ "O-C-C-C-O" => 639,
503
+ "C=C-C-C-C" => 640,
504
+ "O-C-C-C=C" => 641,
505
+ "O-C-C-C=O" => 642,
506
+ "[#1]-C-C-N-[#1]" => 643,
507
+ "C-C=N-N-C" => 644,
508
+ "O=C-N-C-C" => 645,
509
+ "O=C-N-C-[#1]" => 646,
510
+ "O=C-N-C-N" => 647,
511
+ "O=N-C:C-N" => 648,
512
+ "O=N-C:C-O" => 649,
513
+ "O=C-N-C=O" => 650,
514
+ "O-C:C:C-C" => 651,
515
+ "O-C:C:C-N" => 652,
516
+ "O-C:C:C-O" => 653,
517
+ "N-C-N-C-C" => 654,
518
+ "O-C-C-C:C" => 655,
519
+ "C-C-N-C-C" => 656,
520
+ "C-N-C:C-C" => 657,
521
+ "C-C-S-C-C" => 658,
522
+ "O-C-C-N-C" => 659,
523
+ "C-C=C-C-C" => 660,
524
+ "O-C-O-C-C" => 661,
525
+ "O-C-C-O-C" => 662,
526
+ "O-C-C-O-[#1]" => 663,
527
+ "C-C=C-C=C" => 664,
528
+ "N-C:C-C-C" => 665,
529
+ "C=C-C-O-C" => 666,
530
+ "C=C-C-O-[#1]" => 667,
531
+ "C-C:C-C-C" => 668,
532
+ "Cl-C:C-C=O" => 669,
533
+ "Br-C:C:C-C" => 670,
534
+ "O=C-C=C-C" => 671,
535
+ "O=C-C=C-[#1]" => 672,
536
+ "O=C-C=C-N" => 673,
537
+ "N-C-N-C:C" => 674,
538
+ "Br-C-C-C:C" => 675,
539
+ "N#C-C-C-C" => 676,
540
+ "C-C=C-C:C" => 677,
541
+ "C-C-C=C-C" => 678,
542
+ "C-C-C-C-C-C" => 679,
543
+ "O-C-C-C-C-C" => 680,
544
+ "O-C-C-C-C-O" => 681,
545
+ "O-C-C-C-C-N" => 682,
546
+ "N-C-C-C-C-C" => 683,
547
+ "O=C-C-C-C-C" => 684,
548
+ "O=C-C-C-C-N" => 685,
549
+ "O=C-C-C-C-O" => 686,
550
+ "O=C-C-C-C=O" => 687,
551
+ "C-C-C-C-C-C-C" => 688,
552
+ "O-C-C-C-C-C-C" => 689,
553
+ "O-C-C-C-C-C-O" => 690,
554
+ "O-C-C-C-C-C-N" => 691,
555
+ "O=C-C-C-C-C-C" => 692,
556
+ "O=C-C-C-C-C-O" => 693,
557
+ "O=C-C-C-C-C=O" => 694,
558
+ "O=C-C-C-C-C-N" => 695,
559
+ "C-C-C-C-C-C-C-C" => 696,
560
+ "C-C-C-C-C-C(C)-C" => 697,
561
+ "O-C-C-C-C-C-C-C" => 698,
562
+ "O-C-C-C-C-C(C)-C" => 699,
563
+ "O-C-C-C-C-C-O-C" => 700,
564
+ "O-C-C-C-C-C(O)-C" => 701,
565
+ "O-C-C-C-C-C-N-C" => 702,
566
+ "O-C-C-C-C-C(N)-C" => 703,
567
+ "O=C-C-C-C-C-C-C" => 704,
568
+ "O=C-C-C-C-C(O)-C" => 705,
569
+ "O=C-C-C-C-C(=O)-C" => 706,
570
+ "O=C-C-C-C-C(N)-C" => 707,
571
+ "C-C(C)-C-C" => 708,
572
+ "C-C(C)-C-C-C" => 709,
573
+ "C-C-C(C)-C-C" => 710,
574
+ "C-C(C)(C)-C-C" => 711,
575
+ "C-C(C)-C(C)-C" => 712,
576
+ }
577
+
578
+ # Section 7: Complex SMARTS patterns
18
579
 
19
- module PubChem
20
-
21
- Host="pubchem.ncbi.nlm.nih.gov"
22
- Summary="/summary/summary.cgi"
23
-
24
- class PubChem
25
- Searchpath="/search/"
26
- Query="PreQSrv.cgi"
27
- Boundary="-----boundary-----"
28
-
29
- Data = [
30
- Boundary, "Content-Disposition: form-data; name=\"mode\"", "", "simplequery",
31
- Boundary, "Content-Disposition: form-data; name=\"check\"", "", "remote",
32
- Boundary, "Content-Disposition: form-data; name=\"execution\"", "", "remote",
33
- Boundary, "Content-Disposition: form-data; name=\"queue\"", "", "ssquery",
34
- Boundary, "Content-Disposition: form-data; name=\"simple_searchdata\"", "", '%s',
35
- Boundary, "Content-Disposition: form-data; name=\"simple_cid\"", "", "",
36
- Boundary, "Content-Disposition: form-data; name=\"simple_sid\"", "", "",
37
- Boundary, "Content-Disposition: form-data; name=\"file\"; filename=\"\"",
38
- "Content-Type: application/octet-stream", "", "",
39
- Boundary, "Content-Disposition: form-data; name=\"simple_searchtype\"", "", "fs",
40
- Boundary, "Content-Disposition: form-data; name=\"maxhits\"", "", '%s',
41
- Boundary].join("\x0d\x0a")
42
-
43
- def self.smiles_search(smiles, maxhits=100)
44
- cid = []
45
- url = ""
46
- body = ""
47
- Net::HTTP.version_1_2
48
- Net::HTTP.start(Host, 80) do |http|
49
- body = http.post(Searchpath + Query, Data % [smiles, maxhits],
50
- {'Content-Type' => "multipart/form-data; boundary=#{Boundary}",
51
- 'Referer' => "http://pubchem.ncbi.nlm.nih.gov/search/"}).body
52
- if m = /url="([^"]+)"/.match(body)
53
- body = http.get(Searchpath + m[1]).body
54
- end
55
- while /setTimeout\('document.location.replace\("([^"]+)"\);', (\d+)\)/ =~ body do
56
- sleep($2.to_f/100)
57
- response = http.get(URI.parse($1))
58
- body = response.body
59
- url = response['location']
60
- end
61
- if /PubChem structure search report:(\s|\S)+No hits/ !~ body
62
- # text format
63
- url.sub!(/cmd=Select\+from\+History/, 'cmd=Text&dopt=Brief')
64
- body = http.get(url).body
65
- body.scan(/\d+: CID: (\d+)/).each do |id|
66
- cid.push(PubChemEntry.new(id[0].to_i))
67
- end
68
- # # html format
69
- # body = http.get(url).body
70
- # while /CID: <a href=\"([^"]+)\">(\d+)<\/a>/ =~ body do
71
- # cid.push($2)
72
- # body = $'
73
- # end
74
- end
580
+ Section7 = {
581
+ "Cc1ccc(C)cc1" => 713,
582
+ "Cc1ccc(O)cc1" => 714,
583
+ "Cc1ccc(S)cc1" => 715,
584
+ "Cc1ccc(N)cc1" => 716,
585
+ "Cc1ccc(Cl)cc1" => 717,
586
+ "Cc1ccc(Br)cc1" => 718,
587
+ "Oc1ccc(O)cc1" => 719,
588
+ "Oc1ccc(S)cc1" => 720,
589
+ "Oc1ccc(N)cc1" => 721,
590
+ "Oc1ccc(Cl)cc1" => 722,
591
+ "Oc1ccc(Br)cc1" => 723,
592
+ "Sc1ccc(S)cc1" => 724,
593
+ "Sc1ccc(N)cc1" => 725,
594
+ "Sc1ccc(Cl)cc1" => 726,
595
+ "Sc1ccc(Br)cc1" => 727,
596
+ "Nc1ccc(N)cc1" => 728,
597
+ "Nc1ccc(Cl)cc1" => 729,
598
+ "Nc1ccc(Br)cc1" => 730,
599
+ "Clc1ccc(Cl)cc1" => 731,
600
+ "Clc1ccc(Br)cc1" => 732,
601
+ "Brc1ccc(Br)cc1" => 733,
602
+ "Cc1cc(C)ccc1" => 734,
603
+ "Cc1cc(O)ccc1" => 735,
604
+ "Cc1cc(S)ccc1" => 736,
605
+ "Cc1cc(N)ccc1" => 737,
606
+ "Cc1cc(Cl)ccc1" => 738,
607
+ "Cc1cc(Br)ccc1" => 739,
608
+ "Oc1cc(O)ccc1" => 740,
609
+ "Oc1cc(S)ccc1" => 741,
610
+ "Oc1cc(N)ccc1" => 742,
611
+ "Oc1cc(Cl)ccc1" => 743,
612
+ "Oc1cc(Br)ccc1" => 744,
613
+ "Sc1cc(S)ccc1" => 745,
614
+ "Sc1cc(N)ccc1" => 746,
615
+ "Sc1cc(Cl)ccc1" => 747,
616
+ "Sc1cc(Br)ccc1" => 748,
617
+ "Nc1cc(N)ccc1" => 749,
618
+ "Nc1cc(Cl)ccc1" => 750,
619
+ "Nc1cc(Br)ccc1" => 751,
620
+ "Clc1cc(Cl)ccc1" => 752,
621
+ "Clc1cc(Br)ccc1" => 753,
622
+ "Brc1cc(Br)ccc1" => 754,
623
+ "Cc1c(C)cccc1" => 755,
624
+ "Cc1c(O)cccc1" => 756,
625
+ "Cc1c(S)cccc1" => 757,
626
+ "Cc1c(N)cccc1" => 758,
627
+ "Cc1c(Cl)cccc1" => 759,
628
+ "Cc1c(Br)cccc1" => 760,
629
+ "Oc1c(O)cccc1" => 761,
630
+ "Oc1c(S)cccc1" => 762,
631
+ "Oc1c(N)cccc1" => 763,
632
+ "Oc1c(Cl)cccc1" => 764,
633
+ "Oc1c(Br)cccc1" => 765,
634
+ "Sc1c(S)cccc1" => 766,
635
+ "Sc1c(N)cccc1" => 767,
636
+ "Sc1c(Cl)cccc1" => 768,
637
+ "Sc1c(Br)cccc1" => 769,
638
+ "Nc1c(N)cccc1" => 770,
639
+ "Nc1c(Cl)cccc1" => 771,
640
+ "Nc1c(Br)cccc1" => 772,
641
+ "Clc1c(Cl)cccc1" => 773,
642
+ "Clc1c(Br)cccc1" => 774,
643
+ "Brc1c(Br)cccc1" => 775,
644
+ "CC1CCC(C)CC1" => 776,
645
+ "CC1CCC(O)CC1" => 777,
646
+ "CC1CCC(S)CC1" => 778,
647
+ "CC1CCC(N)CC1" => 779,
648
+ "CC1CCC(Cl)CC1" => 780,
649
+ "CC1CCC(Br)CC1" => 781,
650
+ "OC1CCC(O)CC1" => 782,
651
+ "OC1CCC(S)CC1" => 783,
652
+ "OC1CCC(N)CC1" => 784,
653
+ "OC1CCC(Cl)CC1" => 785,
654
+ "OC1CCC(Br)CC1" => 786,
655
+ "SC1CCC(S)CC1" => 787,
656
+ "SC1CCC(N)CC1" => 788,
657
+ "SC1CCC(Cl)CC1" => 789,
658
+ "SC1CCC(Br)CC1" => 790,
659
+ "NC1CCC(N)CC1" => 791,
660
+ "NC1CCC(Cl)CC1" => 792,
661
+ "NC1CCC(Br)CC1" => 793,
662
+ "ClC1CCC(Cl)CC1" => 794,
663
+ "ClC1CCC(Br)CC1" => 795,
664
+ "BrC1CCC(Br)CC1" => 796,
665
+ "CC1CC(C)CCC1" => 797,
666
+ "CC1CC(O)CCC1" => 798,
667
+ "CC1CC(S)CCC1" => 799,
668
+ "CC1CC(N)CCC1" => 800,
669
+ "CC1CC(Cl)CCC1" => 801,
670
+ "CC1CC(Br)CCC1" => 802,
671
+ "OC1CC(O)CCC1" => 803,
672
+ "OC1CC(S)CCC1" => 804,
673
+ "OC1CC(N)CCC1" => 805,
674
+ "OC1CC(Cl)CCC1" => 806,
675
+ "OC1CC(Br)CCC1" => 807,
676
+ "SC1CC(S)CCC1" => 808,
677
+ "SC1CC(N)CCC1" => 809,
678
+ "SC1CC(Cl)CCC1" => 810,
679
+ "SC1CC(Br)CCC1" => 811,
680
+ "NC1CC(N)CCC1" => 812,
681
+ "NC1CC(Cl)CCC1" => 813,
682
+ "NC1CC(Br)CCC1" => 814,
683
+ "ClC1CC(Cl)CCC1" => 815,
684
+ "ClC1CC(Br)CCC1" => 816,
685
+ "BrC1CC(Br)CCC1" => 817,
686
+ "CC1C(C)CCCC1" => 818,
687
+ "CC1C(O)CCCC1" => 819,
688
+ "CC1C(S)CCCC1" => 820,
689
+ "CC1C(N)CCCC1" => 821,
690
+ "CC1C(Cl)CCCC1" => 822,
691
+ "CC1C(Br)CCCC1" => 823,
692
+ "OC1C(O)CCCC1" => 824,
693
+ "OC1C(S)CCCC1" => 825,
694
+ "OC1C(N)CCCC1" => 826,
695
+ "OC1C(Cl)CCCC1" => 827,
696
+ "OC1C(Br)CCCC1" => 828,
697
+ "SC1C(S)CCCC1" => 829,
698
+ "SC1C(N)CCCC1" => 830,
699
+ "SC1C(Cl)CCCC1" => 831,
700
+ "SC1C(Br)CCCC1" => 832,
701
+ "NC1C(N)CCCC1" => 833,
702
+ "NC1C(Cl)CCCC1" => 834,
703
+ "NC1C(Br)CCCC1" => 835,
704
+ "ClC1C(Cl)CCCC1" => 836,
705
+ "ClC1C(Br)CCCC1" => 837,
706
+ "BrC1C(Br)CCCC1" => 838,
707
+ "CC1CC(C)CC1" => 839,
708
+ "CC1CC(O)CC1" => 840,
709
+ "CC1CC(S)CC1" => 841,
710
+ "CC1CC(N)CC1" => 842,
711
+ "CC1CC(Cl)CC1" => 843,
712
+ "CC1CC(Br)CC1" => 844,
713
+ "OC1CC(O)CC1" => 845,
714
+ "OC1CC(S)CC1" => 846,
715
+ "OC1CC(N)CC1" => 847,
716
+ "OC1CC(Cl)CC1" => 848,
717
+ "OC1CC(Br)CC1" => 849,
718
+ "SC1CC(S)CC1" => 850,
719
+ "SC1CC(N)CC1" => 851,
720
+ "SC1CC(Cl)CC1" => 852,
721
+ "SC1CC(Br)CC1" => 853,
722
+ "NC1CC(N)CC1" => 854,
723
+ "NC1CC(Cl)CC1" => 855,
724
+ "NC1CC(Br)CC1" => 856,
725
+ "ClC1CC(Cl)CC1" => 857,
726
+ "ClC1CC(Br)CC1" => 858,
727
+ "BrC1CC(Br)CC1" => 859,
728
+ "CC1C(C)CCC1" => 860,
729
+ "CC1C(O)CCC1" => 861,
730
+ "CC1C(S)CCC1" => 862,
731
+ "CC1C(N)CCC1" => 863,
732
+ "CC1C(Cl)CCC1" => 864,
733
+ "CC1C(Br)CCC1" => 865,
734
+ "OC1C(O)CCC1" => 866,
735
+ "OC1C(S)CCC1" => 867,
736
+ "OC1C(N)CCC1" => 868,
737
+ "OC1C(Cl)CCC1" => 869,
738
+ "OC1C(Br)CCC1" => 870,
739
+ "SC1C(S)CCC1" => 871,
740
+ "SC1C(N)CCC1" => 872,
741
+ "SC1C(Cl)CCC1" => 873,
742
+ "SC1C(Br)CCC1" => 874,
743
+ "NC1C(N)CCC1" => 875,
744
+ "NC1C(Cl)CC1" => 876,
745
+ "NC1C(Br)CCC1" => 877,
746
+ "ClC1C(Cl)CCC1" => 878,
747
+ "ClC1C(Br)CCC1" => 879,
748
+ "BrC1C(Br)CCC1" => 880,
749
+ }
750
+
751
# Human-readable descriptions of PubChem substructure keys.
#
# The array index lines up with the key's bit position: the element-count
# keys occupy indices 0-114 and the ring keys start at index 115, which is
# exactly the offset RingSizeBaseNum gives for ring size 3.  Only the
# element-count and ring sections are listed here (263 entries).
#
# Frozen because it is a lookup table and must not be mutated at runtime.
PubChemSubsKey = [
  # Hierarchic element counts (indices 0-114)
  ">= 4 H",
  ">= 8 H",
  ">= 16 H",
  ">= 32 H",
  ">= 1 Li",
  ">= 2 Li",
  ">= 1 B",
  ">= 2 B",
  ">= 4 B",
  ">= 2 C",
  ">= 4 C",
  ">= 8 C",
  ">= 16 C",
  ">= 32 C",
  ">= 1 N",
  ">= 2 N",
  ">= 4 N",
  ">= 8 N",
  ">= 1 O",
  ">= 2 O",
  ">= 4 O",
  ">= 8 O",
  ">= 16 O",
  ">= 1 F",
  ">= 2 F",
  ">= 4 F",
  ">= 1 Na",
  ">= 2 Na",
  ">= 1 Si",
  ">= 2 Si",
  ">= 1 P",
  ">= 2 P",
  ">= 4 P",
  ">= 1 S",
  ">= 2 S",
  ">= 4 S",
  ">= 8 S",
  ">= 1 Cl",
  ">= 2 Cl",
  ">= 4 Cl",
  ">= 8 Cl",
  ">= 1 K",
  ">= 2 K",
  ">= 1 Br",
  ">= 2 Br",
  ">= 4 Br",
  ">= 1 I",
  ">= 2 I",
  ">= 4 I",
  ">= 1 Be",
  ">= 1 Mg",
  ">= 1 Al",
  ">= 1 Ca",
  ">= 1 Sc",
  ">= 1 Ti",
  ">= 1 V",
  ">= 1 Cr",
  ">= 1 Mn",
  ">= 1 Fe",
  ">= 1 Co",
  ">= 1 Ni",
  ">= 1 Cu",
  ">= 1 Zn",
  ">= 1 Ga",
  ">= 1 Ge",
  ">= 1 As",
  ">= 1 Se",
  ">= 1 Kr",
  ">= 1 Rb",
  ">= 1 Sr",
  ">= 1 Y",
  ">= 1 Zr",
  ">= 1 Nb",
  ">= 1 Mo",
  ">= 1 Ru",
  ">= 1 Rh",
  ">= 1 Pd",
  ">= 1 Ag",
  ">= 1 Cd",
  ">= 1 In",
  ">= 1 Sn",
  ">= 1 Sb",
  ">= 1 Te",
  ">= 1 Xe",
  ">= 1 Cs",
  ">= 1 Ba",
  ">= 1 Lu",
  ">= 1 Hf",
  ">= 1 Ta",
  ">= 1 W",
  ">= 1 Re",
  ">= 1 Os",
  ">= 1 Ir",
  ">= 1 Pt",
  ">= 1 Au",
  ">= 1 Hg",
  ">= 1 Tl",
  ">= 1 Pb",
  ">= 1 Bi",
  ">= 1 La",
  ">= 1 Ce",
  ">= 1 Pr",
  ">= 1 Nd",
  ">= 1 Pm",
  ">= 1 Sm",
  ">= 1 Eu",
  ">= 1 Gd",
  ">= 1 Tb",
  ">= 1 Dy",
  ">= 1 Ho",
  ">= 1 Er",
  ">= 1 Tm",
  ">= 1 Yb",
  ">= 1 Tc",
  ">= 1 U",

  # Ring size 3 (indices 115-128)
  ">= 1 any ring size 3",
  ">= 1 saturated carbon-only ring size 3",
  ">= 1 saturated nitrogen-containing ring size 3",
  ">= 1 saturated heteroatom-containing ring size 3",
  ">= 1 unsaturated or aromatic carbon-only ring size 3",
  ">= 1 unsaturated or aromatic nitrogen-containing ring size 3",
  ">= 1 unsaturated or aromatic heteroatom-containing ring size 3",
  ">= 2 any ring size 3",
  ">= 2 saturated carbon-only ring size 3",
  ">= 2 saturated nitrogen-containing ring size 3",
  ">= 2 saturated heteroatom-containing ring size 3",
  ">= 2 unsaturated or aromatic carbon-only ring size 3",
  ">= 2 unsaturated or aromatic nitrogen-containing ring size 3",
  ">= 2 unsaturated or aromatic heteroatom-containing ring size 3",

  # Ring size 4 (indices 129-142)
  ">= 1 any ring size 4",
  ">= 1 saturated carbon-only ring size 4",
  ">= 1 saturated nitrogen-containing ring size 4",
  ">= 1 saturated heteroatom-containing ring size 4",
  ">= 1 unsaturated or aromatic carbon-only ring size 4",
  ">= 1 unsaturated or aromatic nitrogen-containing ring size 4",
  ">= 1 unsaturated or aromatic heteroatom-containing ring size 4",
  ">= 2 any ring size 4",
  ">= 2 saturated carbon-only ring size 4",
  ">= 2 saturated nitrogen-containing ring size 4",
  ">= 2 saturated heteroatom-containing ring size 4",
  ">= 2 unsaturated or aromatic carbon-only ring size 4",
  ">= 2 unsaturated or aromatic nitrogen-containing ring size 4",
  ">= 2 unsaturated or aromatic heteroatom-containing ring size 4",

  # Ring size 5 (indices 143-177)
  ">= 1 any ring size 5",
  ">= 1 saturated carbon-only ring size 5",
  ">= 1 saturated nitrogen-containing ring size 5",
  ">= 1 saturated heteroatom-containing ring size 5",
  ">= 1 unsaturated or aromatic carbon-only ring size 5",
  ">= 1 unsaturated or aromatic nitrogen-containing ring size 5",
  ">= 1 unsaturated or aromatic heteroatom-containing ring size 5",
  ">= 2 any ring size 5",
  ">= 2 saturated carbon-only ring size 5",
  ">= 2 saturated nitrogen-containing ring size 5",
  ">= 2 saturated heteroatom-containing ring size 5",
  ">= 2 unsaturated or aromatic carbon-only ring size 5",
  ">= 2 unsaturated or aromatic nitrogen-containing ring size 5",
  ">= 2 unsaturated or aromatic heteroatom-containing ring size 5",
  ">= 3 any ring size 5",
  ">= 3 saturated carbon-only ring size 5",
  ">= 3 saturated nitrogen-containing ring size 5",
  ">= 3 saturated heteroatom-containing ring size 5",
  ">= 3 unsaturated or aromatic carbon-only ring size 5",
  ">= 3 unsaturated or aromatic nitrogen-containing ring size 5",
  ">= 3 unsaturated or aromatic heteroatom-containing ring size 5",
  ">= 4 any ring size 5",
  ">= 4 saturated carbon-only ring size 5",
  ">= 4 saturated nitrogen-containing ring size 5",
  ">= 4 saturated heteroatom-containing ring size 5",
  ">= 4 unsaturated or aromatic carbon-only ring size 5",
  ">= 4 unsaturated or aromatic nitrogen-containing ring size 5",
  ">= 4 unsaturated or aromatic heteroatom-containing ring size 5",
  ">= 5 any ring size 5",
  ">= 5 saturated carbon-only ring size 5",
  ">= 5 saturated nitrogen-containing ring size 5",
  ">= 5 saturated heteroatom-containing ring size 5",
  ">= 5 unsaturated or aromatic carbon-only ring size 5",
  ">= 5 unsaturated or aromatic nitrogen-containing ring size 5",
  ">= 5 unsaturated or aromatic heteroatom-containing ring size 5",

  # Ring size 6 (indices 178-212)
  ">= 1 any ring size 6",
  ">= 1 saturated carbon-only ring size 6",
  ">= 1 saturated nitrogen-containing ring size 6",
  ">= 1 saturated heteroatom-containing ring size 6",
  ">= 1 unsaturated or aromatic carbon-only ring size 6",
  ">= 1 unsaturated or aromatic nitrogen-containing ring size 6",
  ">= 1 unsaturated or aromatic heteroatom-containing ring size 6",
  ">= 2 any ring size 6",
  ">= 2 saturated carbon-only ring size 6",
  ">= 2 saturated nitrogen-containing ring size 6",
  ">= 2 saturated heteroatom-containing ring size 6",
  ">= 2 unsaturated or aromatic carbon-only ring size 6",
  ">= 2 unsaturated or aromatic nitrogen-containing ring size 6",
  ">= 2 unsaturated or aromatic heteroatom-containing ring size 6",
  ">= 3 any ring size 6",
  ">= 3 saturated carbon-only ring size 6",
  ">= 3 saturated nitrogen-containing ring size 6",
  ">= 3 saturated heteroatom-containing ring size 6",
  ">= 3 unsaturated or aromatic carbon-only ring size 6",
  ">= 3 unsaturated or aromatic nitrogen-containing ring size 6",
  ">= 3 unsaturated or aromatic heteroatom-containing ring size 6",
  ">= 4 any ring size 6",
  ">= 4 saturated carbon-only ring size 6",
  ">= 4 saturated nitrogen-containing ring size 6",
  ">= 4 saturated heteroatom-containing ring size 6",
  ">= 4 unsaturated or aromatic carbon-only ring size 6",
  ">= 4 unsaturated or aromatic nitrogen-containing ring size 6",
  ">= 4 unsaturated or aromatic heteroatom-containing ring size 6",
  ">= 5 any ring size 6",
  ">= 5 saturated carbon-only ring size 6",
  ">= 5 saturated nitrogen-containing ring size 6",
  ">= 5 saturated heteroatom-containing ring size 6",
  ">= 5 unsaturated or aromatic carbon-only ring size 6",
  ">= 5 unsaturated or aromatic nitrogen-containing ring size 6",
  ">= 5 unsaturated or aromatic heteroatom-containing ring size 6",

  # Ring size 7 (indices 213-226)
  ">= 1 any ring size 7",
  ">= 1 saturated carbon-only ring size 7",
  ">= 1 saturated nitrogen-containing ring size 7",
  ">= 1 saturated heteroatom-containing ring size 7",
  ">= 1 unsaturated or aromatic carbon-only ring size 7",
  ">= 1 unsaturated or aromatic nitrogen-containing ring size 7",
  ">= 1 unsaturated or aromatic heteroatom-containing ring size 7",
  ">= 2 any ring size 7",
  ">= 2 saturated carbon-only ring size 7",
  ">= 2 saturated nitrogen-containing ring size 7",
  ">= 2 saturated heteroatom-containing ring size 7",
  ">= 2 unsaturated or aromatic carbon-only ring size 7",
  ">= 2 unsaturated or aromatic nitrogen-containing ring size 7",
  ">= 2 unsaturated or aromatic heteroatom-containing ring size 7",

  # Ring size 8 (indices 227-240)
  ">= 1 any ring size 8",
  ">= 1 saturated carbon-only ring size 8",
  ">= 1 saturated nitrogen-containing ring size 8",
  ">= 1 saturated heteroatom-containing ring size 8",
  ">= 1 unsaturated or aromatic carbon-only ring size 8",
  ">= 1 unsaturated or aromatic nitrogen-containing ring size 8",
  ">= 1 unsaturated or aromatic heteroatom-containing ring size 8",
  ">= 2 any ring size 8",
  ">= 2 saturated carbon-only ring size 8",
  ">= 2 saturated nitrogen-containing ring size 8",
  ">= 2 saturated heteroatom-containing ring size 8",
  ">= 2 unsaturated or aromatic carbon-only ring size 8",
  ">= 2 unsaturated or aromatic nitrogen-containing ring size 8",
  ">= 2 unsaturated or aromatic heteroatom-containing ring size 8",

  # Ring size 9 (indices 241-247)
  ">= 1 any ring size 9",
  ">= 1 saturated carbon-only ring size 9",
  ">= 1 saturated nitrogen-containing ring size 9",
  ">= 1 saturated heteroatom-containing ring size 9",
  ">= 1 unsaturated or aromatic carbon-only ring size 9",
  ">= 1 unsaturated or aromatic nitrogen-containing ring size 9",
  ">= 1 unsaturated or aromatic heteroatom-containing ring size 9",

  # Ring size 10 (indices 248-254)
  ">= 1 any ring size 10",
  ">= 1 saturated carbon-only ring size 10",
  ">= 1 saturated nitrogen-containing ring size 10",
  ">= 1 saturated heteroatom-containing ring size 10",
  ">= 1 unsaturated or aromatic carbon-only ring size 10",
  ">= 1 unsaturated or aromatic nitrogen-containing ring size 10",
  ">= 1 unsaturated or aromatic heteroatom-containing ring size 10",

  # Aromatic ring counts (indices 255-262)
  ">= 1 aromatic ring",
  ">= 1 hetero-aromatic ring",
  ">= 2 aromatic rings",
  ">= 2 hetero-aromatic rings",
  ">= 3 aromatic rings",
  ">= 3 hetero-aromatic rings",
  ">= 4 aromatic rings",
  ">= 4 hetero-aromatic rings",
].freeze
1016
+
1017
# Maps a ring size (number of ring atoms, 3..10) to the first key index of
# that ring size's group of ring keys, i.e. the index of the
# ">= 1 any ring size N" entry.
#
# Frozen because it is a lookup table and must not be mutated at runtime.
RingSizeBaseNum = {
  3  => 115,
  4  => 129,
  5  => 143,
  6  => 178,
  7  => 213,
  8  => 227,
  9  => 241,
  10 => 248,
}.freeze
1027
module Molecule

  # Computes a PubChem-style substructure fingerprint for this molecule.
  #
  # Returns an Integer used as a bit set: bit i is set when key i matches.
  #
  # Implemented: Section 1 (element counts), Section 3 (bonded element
  # pairs) and the SMARTS tables Section6 / Section7.
  # Not implemented: Section 2 (ring keys) and Section 4; their bits are
  # always left at 0.
  #
  # Relies on composition and edges being provided by the including class,
  # and on HierarchicElementCounts, Section3, Section6, Section7 and
  # Chem::OpenBabel being defined elsewhere.
  def generate_pubchem_subskey
    fp = 0

    # Section 1: set every element-count bit whose threshold is reached.
    self.composition.each do |elem, num|
      thresholds = HierarchicElementCounts[elem]
      # Elements without an entry in the table contribute no bits.
      next unless thresholds
      thresholds.each do |n_atoms, bit|
        fp |= (1 << bit) if num >= n_atoms
      end
    end

    # Section 2 (ring keys): TODO not implemented.  The earlier draft grouped
    # the SSSR rings by size here but never set a bit from them.

    # Section 3: one bit per distinct pair of bonded elements.  The two
    # element names are sorted so that "C-O" and "O-C" map to the same key.
    bonded_pairs = self.edges.collect do |bond, atom1, atom2|
      [atom1.element.to_s, atom2.element.to_s].sort.join("-")
    end
    bonded_pairs.uniq.each do |pair|
      bit = Section3[pair]
      fp |= (1 << bit) if bit
    end

    # Section 4: TODO not implemented.  The earlier draft only printed
    # debugging output for every atom and never set a bit.

    # SMARTS-driven sections: each table maps a SMARTS pattern to its bit.
    [Section6, Section7].each do |table|
      table.each do |smarts, bit|
        pat = Chem::OpenBabel::parse_smarts(smarts)
        fp |= (1 << bit) if pat.match(self)
      end
    end

    fp
  end

  # Extract PubChem substructural keys
  # see ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
  #
  # Decodes the base64 "PUBCHEM_CACTVS_SUBSKEYS" entry of sdf_data and
  # returns an Integer whose bit i is set when key i is set.
  #
  # The first four decoded bytes are skipped as a prefix.  Within every
  # following byte the most significant bit is the lowest-numbered key.
  #
  # The payload is read byte by byte rather than in 32-bit words, so a
  # payload whose length is not a multiple of four bytes keeps its
  # trailing keys.
  def pubchem_subskeys
    encoded = self.sdf_data["PUBCHEM_CACTVS_SUBSKEYS"]
    # "m" is the base64 directive of String#unpack.
    bytes = encoded.unpack("m").first.unpack("C*")
    payload = bytes[4..-1] || []

    fp = 0
    payload.each_with_index do |byte, byte_idx|
      8.times do |k|
        # Key (byte_idx * 8 + k) is stored at bit (7 - k) of this byte.
        fp |= (1 << (byte_idx * 8 + k)) if byte[7 - k] == 1
      end
    end
    fp
  end

end
99
1094
 
100
1095
  end
101
1096
 
102
- if $0 == __FILE__
103
- smiles="CC23(CCC1c4ccc(O)cc4(CCC1C3(CC(O)C2(O))))"
104
- puts "===== CID(s) for SMILES, #{smiles} ====="
105
- cid = Chem::PubChem.smiles_search(smiles)
106
- p cid
107
- puts "===== MOL format data ===="
108
- cid.each do |c|
109
- puts c.get_sdf
110
- end
111
- # p Chem::PubChem.get_xml(cid[0])
112
- # puts Chem::PubChem.get_xml(cid[0]).sdf2mol.data
113
- end