lazar 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
@@ -1,983 +0,0 @@
1
- #
2
- # SMARTS Patterns for Functional Group Classification
3
- #
4
- # written by Christian Laggner
5
- # Copyright 2005 Inte:Ligand Software-Entwicklungs und Consulting GmbH
6
- #
7
- # Released under the Lesser General Public License (LGPL license)
8
- # see http://www.gnu.org/copyleft/lesser.html
9
- # Modified from Version 221105
10
- #####################################################################################################
11
-
12
- # General Stuff:
13
- # These patterns were written in an attempt to represent the classification of organic compounds
14
- # from the viewpoint of an organic chemist.
15
- # They are often very restrictive. This may be generally a good thing, but it also takes some time
16
- # for filtering/indexing large compound sets.
17
- # For filtering undesired groups (in druglike compounds) one will want to have more general patterns
18
- # (e.g. you don't want *any* halide of *any* acid, *neither* aldehyde *nor* formyl esters and amides, ...).
19
- #
20
-
21
- # Part I: Carbon
22
- # ==============
23
-
24
-
25
- # I.1: Carbon-Carbon Bonds
26
- # ------------------------
27
-
28
- # I.1.1 Alkanes:
29
-
30
- Primary_carbon: [CX4H3][#6]
31
-
32
- Secondary_carbon: [CX4H2]([#6])[#6]
33
-
34
- Tertiary_carbon: [CX4H1]([#6])([#6])[#6]
35
-
36
- Quaternary_carbon: [CX4]([#6])([#6])([#6])[#6]
37
-
38
-
39
- # I.1.2 C-C double and Triple Bonds
40
-
41
- Alkene: [CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])]=[CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])]
42
- # sp2 C may be substituted only by C or H -
43
- # does not hit ketenes and allenes, nor enamines, enols and the like
44
-
45
- Alkyne: [CX2]#[CX2]
46
- # non-carbon substituents (e.g. alkynol ethers) are rather rare, thus no further discrimination
47
-
48
- Allene: [CX3]=[CX2]=[CX3]
49
-
50
-
51
- # I.2: One Carbon-Hetero Bond
52
- # ---------------------------
53
-
54
-
55
- # I.2.1 Alkyl Halogenides
56
-
57
- Alkylchloride: [ClX1][CX4]
58
- # will also hit chloromethylethers and the like, but no chloroalkenes, -alkynes or -aromats
59
- # a more restrictive version can be obtained by modifying the Alcohol string.
60
-
61
- Alkylfluoride: [FX1][CX4]
62
-
63
- Alkylbromide: [BrX1][CX4]
64
-
65
- Alkyliodide: [IX1][CX4]
66
-
67
-
68
- # I.2.2 Alcohols and Ethers
69
-
70
- Alcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])]
71
- # nonspecific definition, no acetals, aminals, and the like
72
-
73
- Primary_alcohol: [OX2H][CX4H2;!$(C([OX2H])[O,S,#7,#15])]
74
-
75
- Secondary_alcohol: [OX2H][CX4H;!$(C([OX2H])[O,S,#7,#15])]
76
-
77
- Tertiary_alcohol: [OX2H][CX4D4;!$(C([OX2H])[O,S,#7,#15])]
78
-
79
- Dialkylether: [OX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])]
80
- # no acetals and the like; no enolethers
81
-
82
- Dialkylthioether: [SX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])]
83
- # no acetals and the like; no enolethers
84
-
85
- Alkylarylether: [OX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])]
86
- # no acetals and the like; no enolethers
87
-
88
- Diarylether: [c][OX2][c]
89
-
90
- Alkylarylthioether: [SX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])]
91
-
92
- Diarylthioether: [c][SX2][c]
93
-
94
- Oxonium: [O+;!$([O]~[!#6]);!$([S]*~[#7,#8,#15,#16])]
95
- # can't be aromatic, thus O and not #8
96
-
97
- # I.2.3 Amines
98
-
99
- Amine: [NX3+0,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])]
100
- # hits all amines (prim/sec/tert/quart), including ammonium salts, also enamines, but not amides, imides, aminals, ...
101
-
102
- # the following amines include also the protonated forms
103
-
104
- Primary_aliph_amine: [NX3H2+0,NX4H3+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
105
-
106
- Secondary_aliph_amine: [NX3H1+0,NX4H2+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
107
-
108
- Tertiary_aliph_amine: [NX3H0+0,NX4H1+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
109
-
110
- Quaternary_aliph_ammonium: [NX4H0+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
111
-
112
- Primary_arom_amine: [NX3H2+0,NX4H3+]c
113
-
114
- Secondary_arom_amine: [NX3H1+0,NX4H2+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])]
115
-
116
- Tertiary_arom_amine: [NX3H0+0,NX4H1+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])]
117
-
118
- Quaternary_arom_ammonium: [NX4H0+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])]
119
-
120
- Secondary_mixed_amine: [NX3H1+0,NX4H2+;$([N]([c])[C]);!$([N]*~[#7,#8,#15,#16])]
121
-
122
- Tertiary_mixed_amine: [NX3H0+0,NX4H1+;$([N]([c])([C])[#6]);!$([N]*~[#7,#8,#15,#16])]
123
-
124
- Quaternary_mixed_ammonium: [NX4H0+;$([N]([c])([C])[#6][#6]);!$([N]*~[#7,#8,#15,#16])]
125
-
126
- Ammonium: [N+;!$([N]~[!#6]);!$(N=*);!$([N]*~[#7,#8,#15,#16])]
127
- # only C and H substituents allowed. Quaternary or protonated amines
128
- # NX4+ or Nv4+ is not recognized by Daylight's depictmatch if less than four C are present
129
-
130
-
131
- # I.2.4 Others
132
-
133
- Alkylthiol: [SX2H][CX4;!$(C([SX2H])~[O,S,#7,#15])]
134
-
135
- Dialkylthioether: [SX2]([CX4;!$(C([SX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([SX2])[O,S,#7,#15])]
136
-
137
- Alkylarylthioether: [SX2](c)[CX4;!$(C([SX2])[O,S,#7,#15])]
138
-
139
- Disulfide: [SX2D2][SX2D2]
140
-
141
- 1,2-Aminoalcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15,F,Cl,Br,I])][CX4;!$(C([N])[O,S,#7,#15])][NX3;!$(NC=[O,S,N])]
142
- # does not hit alpha-amino acids, enaminoalcohols, 1,2-aminoacetals, o-aminophenols, etc.
143
-
144
- 1,2-Diol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])][CX4;!$(C([OX2H])[O,S,#7,#15])][OX2H]
145
- # does not hit alpha-hydroxy acids, enolalcohols, 1,2-hydroxyacetals, 1,2-diphenols, etc.
146
-
147
- 1,1-Diol: [OX2H][CX4;!$(C([OX2H])([OX2H])[O,S,#7,#15])][OX2H]
148
-
149
- Hydroperoxide: [OX2H][OX2]
150
- # does not necessarily have to be connected to a carbon atom, includes also hydrotrioxides
151
-
152
- Peroxo: [OX2D2][OX2D2]
153
-
154
- Organolithium_compounds: [LiX1][#6,#14]
155
-
156
- Organomagnesium_compounds: [MgX2][#6,#14]
157
- # not restricted to Grignard compounds, also dialkyl Mg
158
-
159
- Organometallic_compounds: [!#1;!#5;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#33;!#34;!#35;!#52;!#53;!#85]~[#6;!-]
160
- # very general, includes all metals covalently bound to carbon
161
-
162
-
163
- # I.3: Two Carbon-Hetero Bonds (Carbonyl and Derivatives)
164
- # ----------------------------
165
-
166
- # I.3.1 Double Bond to Hetero
167
-
168
- Aldehyde: [$([CX3H][#6]),$([CX3H2])]=[OX1]
169
- # hits aldehydes including formaldehyde
170
-
171
- Ketone: [#6][CX3](=[OX1])[#6]
172
- # does not include oxo-groups connected to a (hetero-) aromatic ring
173
-
174
- Thioaldehyde: [$([CX3H][#6]),$([CX3H2])]=[SX1]
175
-
176
- Thioketone: [#6][CX3](=[SX1])[#6]
177
- # does not include thioxo-groups connected to a (hetero-) aromatic ring
178
-
179
- Imine: [NX2;$([N][#6]),$([NH]);!$([N][CX3]=[#7,#8,#15,#16])]=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])]
180
- # nitrogen is not part of an amide-like structure, nor of an aromatic ring, but can be part of an aminal or similar
181
-
182
- Immonium: [NX3+;!$([N][!#6]);!$([N][CX3]=[#7,#8,#15,#16])]
183
-
184
- Oxime: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2H]
185
-
186
- Oximether: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2][#6;!$(C=[#7,#8])]
187
- # ether, not ester or amide; does not hit isoxazole
188
-
189
-
190
- # I.3.2. Two Single Bonds to Hetero
191
-
192
- Acetal: [OX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])]
193
- # does not hit hydroxy-methylesters, ketenacetals, hemiacetals, orthoesters, etc.
194
-
195
- Hemiacetal: [OX2H][CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])]
196
-
197
- Aminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][NX3v3;!$(NC=[#7,#8,#15,#16])][#6]
198
- # Ns are not part of an amide or similar. v3 is to exclude nitro and similar groups
199
-
200
- Hemiaminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][OX2H]
201
-
202
- Thioacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][SX2][#6;!$(C=[O,S,N])]
203
-
204
- Thiohemiacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][OX2H]
205
-
206
- Halogen_acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1]
207
- # hits chloromethylenethers and other reactive alkylating agents
208
-
209
- Acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])]
210
- # includes all of the above and other combinations (S-C-N, hydrates, ...), but still no aminomethylenesters and similar
211
-
212
- Halogenmethylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1]
213
- # also reactive alkylating agents. Acid does not have to be carboxylic acid, also S- and P-based acids allowed
214
-
215
- NOS_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])]
216
- # Same as above, but N,O or S instead of halogen. Ester/amide allowed only on one side
217
-
218
- Hetero_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])]
219
- # Combination of the last two patterns
220
-
221
- Cyanhydrine: [NX1]#[CX2][CX4;$([CH2]),$([CH]([CX2])[#6]),$(C([CX2])([#6])[#6])][OX2H]
222
-
223
-
224
- # I.3.3 Single Bond to Hetero, C=C Double Bond (Enols and Similar)
225
-
226
- Chloroalkene: [ClX1][CX3]=[CX3]
227
-
228
- Fluoroalkene: [FX1][CX3]=[CX3]
229
-
230
- Bromoalkene: [BrX1][CX3]=[CX3]
231
-
232
- Iodoalkene: [IX1][CX3]=[CX3]
233
-
234
- Enol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3]
235
- # no phenols
236
-
237
- Endiol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3;$([H1]),$(C[#6])][OX2H]
238
- # no 1,2-diphenols, ketenacetals, ...
239
-
240
- Enolether: [OX2]([#6;!$(C=[N,O,S])])[CX3;$([H0][#6]),$([H1])]=[CX3]
241
- # finds also endiodiethers, but not enolesters, no aromats
242
-
243
- Enolester: [OX2]([CX3]=[OX1])[#6X3;$([#6][#6]),$([H1])]=[#6X3;!$(C[OX2H])]
244
-
245
-
246
- Enamine: [NX3;$([NH2][CX3]),$([NH1]([CX3])[#6]),$([N]([CX3])([#6])[#6]);!$([N]*=[#7,#8,#15,#16])][CX3;$([CH]),$([C][#6])]=[CX3]
247
- # does not hit amines attached to aromatic rings, nor may the nitrogen be aromatic
248
-
249
- Thioenol: [SX2H][CX3;$([H1]),$(C[#6])]=[CX3]
250
-
251
- Thioenolether: [SX2]([#6;!$(C=[N,O,S])])[CX3;$(C[#6]),$([CH])]=[CX3]
252
-
253
-
254
- # I.4: Three Carbon-Hetero Bonds (Carboxyl and Derivatives)
255
- # ------------------------------
256
-
257
- Acylchloride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[ClX1]
258
-
259
- Acylfluoride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1]
260
-
261
- Acylbromide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[BrX1]
262
-
263
- Acyliodide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[IX1]
264
-
265
- Acylhalide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1,ClX1,BrX1,IX1]
266
- # all of the above
267
-
268
-
269
- # The following contains all simple carboxylic combinations of O, N, S, & Hal -
270
- # - acids, esters, amides, ... as well as a few extra cases (anhydride, hydrazide...)
271
- # Cyclic structures (including aromats) like lactones, lactames, ... got their own
272
- # definitions. Structures where both heteroatoms are part of an aromatic ring
273
- # (oxazoles, imidazoles, ...) were excluded.
274
-
275
- Carboxylic_acid: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[$([OX2H]),$([OX1-])]
276
- # includes carboxylate anions
277
-
278
- Carboxylic_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][#6;!$(C=[O,N,S])]
279
- # does not hit anhydrides or lactones
280
-
281
- Lactone: [#6][#6X3R](=[OX1])[#8X2][#6;!$(C=[O,N,S])]
282
- # may also be aromatic
283
-
284
- Carboxylic_anhydride: [CX3;$([H0][#6]),$([H1])](=[OX1])[#8X2][CX3;$([H0][#6]),$([H1])](=[OX1])
285
- # anhydride formed by two carboxylic acids, no mixed anhydrides (e.g. between carboxylic acid and sulfuric acid); may be part of a ring, even aromatic
286
-
287
- Carboxylic_acid_derivative: [$([#6X3H0][#6]),$([#6X3H])](=[!#6])[!#6]
288
- # includes most of the structures of I.4 and many more, also 1,3-heteroaromatics such as isoxazole
289
-
290
- Carbothioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[OX1])[$([SX2H]),$([SX1-])]),$([C](=[SX1])[$([OX2H]),$([OX1-])])]
291
- # hits both tautomeric forms, as well as anions
292
-
293
- Carbothioic_S_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[SX2][#6;!$(C=[O,N,S])]
294
-
295
- Carbothioic_S_lactone: [#6][#6X3R](=[OX1])[#16X2][#6;!$(C=[O,N,S])]
296
- # may also be aromatic
297
-
298
- Carbothioic_O_ester: [CX3;$([H0][#6]),$([H1])](=[SX1])[OX2][#6;!$(C=[O,N,S])]
299
-
300
- Carbothioic_O_lactone: [#6][#6X3R](=[SX1])[#8X2][#6;!$(C=[O,N,S])]
301
-
302
- Carbothioic_halide: [CX3;$([H0][#6]),$([H1])](=[SX1])[FX1,ClX1,BrX1,IX1]
303
-
304
- Carbodithioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2H])]
305
-
306
- Carbodithioic_ester: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2][#6;!$(C=[O,N,S])])]
307
-
308
- Carbodithiolactone: [#6][#6X3R](=[SX1])[#16X2][#6;!$(C=[O,N,S])]
309
-
310
-
311
- Amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
312
- # does not hit lactames
313
-
314
- Primary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[NX3H2]
315
-
316
- Secondary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]
317
-
318
- Tertiary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])]
319
-
320
- Lactam: [#6R][#6X3R](=[OX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
321
- # cyclic amides, may also be aromatic
322
-
323
- Alkyl_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([#6])[#6X3;$([H0][#6]),$([H1])](=[OX1])
324
- # may be part of a ring, even aromatic. only C allowed at central N. May also be triacyl amide
325
-
326
- N_hetero_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([!#6])[#6X3;$([H0][#6]),$([H1])](=[OX1])
327
- # everything else than H or C at central N
328
-
329
- Imide_acidic: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H1][#6X3;$([H0][#6]),$([H1])](=[OX1])
330
- # can be deprotonated
331
-
332
- Thioamide: [$([CX3;!R][#6]),$([CX3H;!R])](=[SX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
333
- # does not hit thiolactames
334
-
335
- Thiolactam: [#6R][#6X3R](=[SX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
336
- # cyclic thioamides, may also be aromatic
337
-
338
-
339
- Oximester: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#8X2][#7X2]=,:[#6X3;$([H0]([#6])[#6]),$([H1][#6]),$([H2])]
340
- # may also be part of a ring / aromatic
341
-
342
- Amidine: [NX3;!$(NC=[O,S])][CX3;$([CH]),$([C][#6])]=[NX2;!$(NC=[O,S])]
343
- # only basic amidines, not as part of aromatic ring (e.g. imidazole)
344
-
345
- Hydroxamic_acid: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][$([OX2H]),$([OX1-])]
346
-
347
- Hydroxamic_acid_ester: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][OX2][#6;!$(C=[O,N,S])]
348
- # does not hit anhydrides of carboxylic acids with hydroxamic acids
349
-
350
-
351
- Imidoacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])]
352
- # not cyclic
353
-
354
- Imidoacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])]
355
- # the enamide-form of lactames. may be aromatic like 2-hydroxypyridine
356
-
357
- Imidoester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])]
358
- # esters of the above structures. no anhydrides.
359
-
360
- Imidolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])]
361
- # no oxazoles and similar
362
-
363
- Imidothioacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])]
364
- # not cyclic
365
-
366
- Imidothioacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])]
367
- # the enamide-form of thiolactames. may be aromatic like 2-thiopyridine
368
-
369
- Imidothioester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])]
370
- # thioesters of the above structures. no anhydrides.
371
-
372
- Imidothiolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])]
373
- # no thioxazoles and similar
374
-
375
- Amidine: [#7X3v3;!$(N([#6X3]=[#7X2])C=[O,S])][CX3R0;$([H1]),$([H0][#6])]=[NX2v3;!$(N(=[#6X3][#7X3])C=[O,S])]
376
- # only basic amidines, not substituted by carbonyl or thiocarbonyl, not as part of a ring
377
-
378
- Imidolactam: [#6][#6X3R;$([H0](=[NX2;!$(N(=[#6X3][#7X3])C=[O,S])])[#7X3;!$(N([#6X3]=[#7X2])C=[O,S])]),$([H0](-[NX3;!$(N([#6X3]=[#7X2])C=[O,S])])=,:[#7X2;!$(N(=[#6X3][#7X3])C=[O,S])])]
379
- # one of the two C~N bonds is part of a ring (may be aromatic), but not both - thus no imidazole
380
-
381
- Imidoylhalide: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1]
382
- # not cyclic
383
-
384
- Imidoylhalide_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1]
385
- # may also be aromatic
386
-
387
- # may be ring, aromatic, substituted with carbonyls, hetero, ...
388
- # (everything else would get too complicated)
389
-
390
- Amidrazone: [$([$([#6X3][#6]),$([#6X3H])](=[#7X2v3])[#7X3v3][#7X3v3]),$([$([#6X3][#6]),$([#6X3H])]([#7X3v3])=[#7X2v3][#7X3v3])]
391
- # hits both tautomers. as above, it may be ring, aromatic, substituted with carbonyls, hetero, ...
392
-
393
-
394
- Alpha_aminoacid: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[OX2H,OX1-]
395
- # N may be alkylated, but not part of an amide (as in peptides), ionic forms are included
396
- # includes also non-natural aminoacids with double-bonded or two aliph./arom. substituents at alpha-C
397
- # N may not be aromatic as in 1H-pyrrole-2-carboxylic acid
398
-
399
- Alpha_hydroxyacid: [OX2H][C][CX3](=[OX1])[OX2H,OX1-]
400
-
401
- Peptide_middle: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])]
402
- # finds peptidic structures which are neither C- nor N-terminal. Both neighbours must be amino-acids/peptides
403
-
404
- Peptide_C_term: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[OX2H,OX1-]
405
- # finds C-terminal amino acids
406
-
407
- Peptide_N_term: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])]
408
- # finds N-terminal amino acids. As above, N may be substituted, but not part of an amide-bond.
409
-
410
-
411
- Carboxylic_orthoester: [#6][OX2][CX4;$(C[#6]),$([CH])]([OX2][#6])[OX2][#6]
412
- # hits also anhydride-like structures (e.g. HC(OMe)2-OC=O residues)
413
-
414
- Ketene: [CX3]=[CX2]=[OX1]
415
-
416
- Ketenacetal: [#7X2,#8X3,#16X2;$(*[#6,#14])][#6X3]([#7X2,#8X3,#16X2;$(*[#6,#14])])=[#6X3]
417
- # includes aminals, silylacetals, ketenesters, etc. C=C DB is not aromatic, everything else may be
418
-
419
- Nitrile: [NX1]#[CX2]
420
- # includes cyanhydrines
421
-
422
- Isonitrile: [CX1-]#[NX2+]
423
-
424
-
425
- Vinylogous_carbonyl_or_carboxyl_derivative: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7,#8,#16,F,Cl,Br,I]
426
- # may be part of a ring, even aromatic
427
-
428
- Vinylogous_acid: [#6X3](=[OX1])[#6X3]=,:[#6X3][$([OX2H]),$([OX1-])]
429
-
430
- Vinylogous_ester: [#6X3](=[OX1])[#6X3]=,:[#6X3][#6;!$(C=[O,N,S])]
431
-
432
- Vinylogous_amide: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
433
-
434
- Vinylogous_halide: [#6X3](=[OX1])[#6X3]=,:[#6X3][FX1,ClX1,BrX1,IX1]
435
-
436
-
437
-
438
- # I.5: Four Carbon-Hetero Bonds (Carbonic Acid and Derivatives)
439
- # -----------------------------
440
-
441
- Carbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[OX1])[#8X2][#6;!$(C=[O,N,S])]
442
- # may be part of a ring, even aromatic
443
-
444
- Carbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[OX2][FX1,ClX1,BrX1,IX1]
445
-
446
- Carbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[$([OX2H]),$([OX1-])]
447
- # unstable
448
-
449
- Carbonic_acid_derivatives: [!#6][#6X3](=[!#6])[!#6]
450
-
451
-
452
- Thiocarbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[SX1])[#8X2][#6;!$(C=[O,N,S])]
453
- # may be part of a ring, even aromatic
454
-
455
- Thiocarbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[OX2][FX1,ClX1,BrX1,IX1]
456
-
457
- Thiocarbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[$([OX2H]),$([OX1-])]
458
-
459
-
460
- Urea:[#7X3;!$([#7][!#6])][#6X3](=[OX1])[#7X3;!$([#7][!#6])]
461
- # no check whether part of imide, biuret, etc. Aromatic structures are only hit if
462
- # both N share no double bonds, like in the dioxo-form of uracil
463
-
464
- Thiourea: [#7X3;!$([#7][!#6])][#6X3](=[SX1])[#7X3;!$([#7][!#6])]
465
-
466
- Isourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#8X2&!$([#8][!#6]),OX1-])[#7X3;!$([#7][!#6])]
467
- # O may be substituted. no check whether further amide-like bonds are present. Aromatic
468
- # structures are only hit if single bonded N shares no additional double bond, like in
469
- # the 1-hydroxy-3-oxo form of uracil
470
-
471
- Isothiourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#16X2&!$([#16][!#6]),SX1-])[#7X3;!$([#7][!#6])]
472
-
473
- Guanidine: [N;v3X3,v4X4+][CX3](=[N;v3X2,v4X3+])[N;v3X3,v4X4+]
474
- # also hits guanidinium salts. v3 and v4 to avoid nitroamidines
475
-
476
- Carbaminic_acid: [NX3]C(=[OX1])[O;X2H,X1-]
477
- # quite unstable, unlikely to be found. Also hits salts
478
-
479
- Urethan: [#7X3][#6](=[OX1])[#8X2][#6]
480
- # also hits when part of a ring, no check whether the last C is part of carbonyl
481
-
482
- Biuret: [#7X3][#6](=[OX1])[#7X3][#6](=[OX1])[#7X3]
483
-
484
- Semicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1]
485
-
486
- Carbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[OX1]
487
-
488
- Semicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1]
489
-
490
- Carbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[OX1]
491
-
492
- Thiosemicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1]
493
-
494
- Thiocarbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[SX1]
495
-
496
- Thiosemicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1]
497
-
498
- Thiocarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[SX1]
499
-
500
-
501
- Isocyanate: [NX2]=[CX2]=[OX1]
502
-
503
- Cyanate: [OX2][CX2]#[NX1]
504
-
505
- Isothiocyanate: [NX2]=[CX2]=[SX1]
506
-
507
- Thiocyanate: [SX2][CX2]#[NX1]
508
-
509
- Carbodiimide: [NX2]=[CX2]=[NX2]
510
-
511
- Orthocarbonic_derivatives: [CX4H0]([O,S,#7])([O,S,#7])([O,S,#7])[O,S,#7,F,Cl,Br,I]
512
- # halogen allowed just once, to avoid mapping to -OCF3 and similar groups (much more
513
- # stable as for example C(OCH3)4)
514
-
515
-
516
- # I.6 Aromatics
517
- # -------------
518
-
519
- # I know that this classification is not very logical, arylamines are found under I.2 ...
520
-
521
- Phenol: [OX2H][c]
522
-
523
- 1,2-Diphenol: [OX2H][c][c][OX2H]
524
-
525
- Arylchloride: [Cl][c]
526
-
527
- Arylfluoride: [F][c]
528
-
529
- Arylbromide: [Br][c]
530
-
531
- Aryliodide: [I][c]
532
-
533
- Arylthiol: [SX2H][c]
534
-
535
- Iminoarene: [c]=[NX2;$([H1]),$([H0][#6;!$([C]=[N,S,O])])]
536
- # N may be substituted with H or C, but not carbonyl or similar
537
- # aromatic atom is always C, not S or P (these are not planar when substituted)
538
-
539
- Oxoarene: [c]=[OX1]
540
-
541
- Thioarene: [c]=[SX1]
542
-
543
- Hetero_N_basic_H: [nX3H1+0]
544
- # as in pyrrole. uncharged to exclude pyridinium ions
545
-
546
- Hetero_N_basic_no_H: [nX3H0+0]
547
- # as in N-methylpyrrole. uncharged to exclude pyridinium ions
548
-
549
- Hetero_N_nonbasic: [nX2,nX3+]
550
- # as in pyridine, pyridinium
551
-
552
- Hetero_O: [o]
553
-
554
- Hetero_S: [sX2]
555
- # X2 because Daylight's depictmatch falsely describes C1=CS(=O)C=C1 as aromatic
556
- # (is not planar because of lonepair at S)
557
-
558
- Heteroaromatic: [a;!c]
559
-
560
-
561
- # Part II: N, S, P, Si, B
562
- # =======================
563
-
564
-
565
- # II.1 Nitrogen
566
- # -------------
567
-
568
- Nitrite: [NX2](=[OX1])[O;$([X2]),$([X1-])]
569
- # hits nitrous acid, its anion, esters, and other O-substituted derivatives
570
-
571
- Thionitrite: [SX2][NX2]=[OX1]
572
-
573
- Nitrate: [$([NX3](=[OX1])(=[OX1])[O;$([X2]),$([X1-])]),$([NX3+]([OX1-])(=[OX1])[O;$([X2]),$([X1-])])]
574
- # hits nitric acid, its anion, esters, and other O-substituted derivatives
575
-
576
- Nitro: [$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]
577
- # hits nitro groups attached to C,N, ... but not nitrates
578
-
579
- Nitroso: [NX2](=[OX1])[!#7;!#8]
580
- # no nitrites, no nitrosamines
581
-
582
- Azide: [NX1]~[NX2]~[NX2,NX1]
583
- # hits both mesomeric forms, also anion
584
-
585
- Acylazide: [CX3](=[OX1])[NX2]~[NX2]~[NX1]
586
-
587
- Diazo: [$([#6]=[NX2+]=[NX1-]),$([#6-]-[NX2+]#[NX1])]
588
-
589
- Diazonium: [#6][NX2+]#[NX1]
590
-
591
- Nitrosamine: [#7;!$(N*=O)][NX2]=[OX1]
592
-
593
- Nitrosamide: [NX2](=[OX1])N-*=O
594
- # includes nitrososulfonamides
595
-
596
- N-Oxide: [$([#7+][OX1-]),$([#7v5]=[OX1]);!$([#7](~[O])~[O]);!$([#7]=[#7])]
597
- # Hits both forms. Won't hit azoxy, nitro, nitroso, or nitrate.
598
-
599
-
600
- Hydrazine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])]
601
- # no hydrazides
602
-
603
- Hydrazone: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX2]=[#6]
604
-
605
- Hydroxylamine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][OX2;$([H1]),$(O[#6;!$(C=[N,O,S])])]
606
- # no discrimination between O-, N-, and O,N-substitution
607
-
608
-
609
- # II.2 Sulfur
610
- # -----------
611
-
612
- Sulfon: [$([SX4](=[OX1])(=[OX1])([#6])[#6]),$([SX4+2]([OX1-])([OX1-])([#6])[#6])]
613
- # can't be aromatic, thus S and not #16
614
-
615
- Sulfoxide: [$([SX3](=[OX1])([#6])[#6]),$([SX3+]([OX1-])([#6])[#6])]
616
-
617
- Sulfonium: [S+;!$([S]~[!#6]);!$([S]*~[#7,#8,#15,#16])]
618
- # can't be aromatic, thus S and not #16
619
-
620
- Sulfuric_acid: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
621
- # includes anions
622
-
623
- Sulfuric_monoester: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
624
-
625
- Sulfuric_diester: [SX4](=[OX1])(=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
626
-
627
- Sulfuric_monoamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])]
628
-
629
- Sulfuric_diamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
630
-
631
- Sulfuric_esteramide: [SX4](=[OX1])(=[OX1])([#7X3][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
632
-
633
- Sulfuric_derivative: [SX4D4](=[!#6])(=[!#6])([!#6])[!#6]
634
- # everything else (would not be a "true" derivative of sulfuric acid, if one of the substituents were less electronegative
635
- # than sulfur, but this should be very very rare, anyway)
636
-
637
-
638
-
639
- #### sulfurous acid and derivatives missing!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
640
-
641
-
642
-
643
-
644
- Sulfonic_acid: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[$([OX2H]),$([OX1-])]
645
-
646
- Sulfonamide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
647
-
648
- Sulfonic_ester: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[OX2][#6;!$(C=[O,N,S])]
649
-
650
- Sulfonic_halide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[FX1,ClX1,BrX1,IX1]
651
-
652
- Sulfonic_derivative: [SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6])[!#6]
653
- # includes all of the above and many more
654
- # for comparison: this is what "all sulfonic derivatives but not the ones above" would look like:
655
- # [$([SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6;!O])[!#6]),$([SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[!$([FX1,ClX1,BrX1,IX1]);!$([#6]);!$([OX2H]);!$([OX1-]);!$([OX2][#6;!$(C=[O,N,S])]);!$([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])])]
656
-
657
-
658
- Sulfinic_acid: [SX3;$([H1]),$([H0][#6])](=[OX1])[$([OX2H]),$([OX1-])]
659
-
660
- Sulfinic_amide: [SX3;$([H1]),$([H0][#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
661
-
662
- Sulfinic_ester: [SX3;$([H1]),$([H0][#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])]
663
-
664
- Sulfinic_halide: [SX3;$([H1]),$([H0][#6])](=[OX1])[FX1,ClX1,BrX1,IX1]
665
-
666
- Sulfinic_derivative: [SX3;$([H1]),$([H0][#6])](=[!#6])[!#6]
667
-
668
- Sulfenic_acid: [SX2;$([H1]),$([H0][#6])][$([OX2H]),$([OX1-])]
669
-
670
- Sulfenic_amide: [SX2;$([H1]),$([H0][#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
671
-
672
- Sulfenic_ester: [SX2;$([H1]),$([H0][#6])][OX2][#6;!$(C=[O,N,S])]
673
-
674
- Sulfenic_halide: [SX2;$([H1]),$([H0][#6])][FX1,ClX1,BrX1,IX1]
675
-
676
- Sulfenic_derivative: [SX2;$([H1]),$([H0][#6])][!#6]
677
-
678
-
679
- # II.3 Phosphorous
680
- # ----------------
681
-
682
- Phosphine: [PX3;$([H3]),$([H2][#6]),$([H1]([#6])[#6]),$([H0]([#6])([#6])[#6])]
683
- # similar to amine, but less restrictive: includes also amide- and aminal-analogues
684
-
685
- Phosphine_oxide: [PX4;$([H3]=[OX1]),$([H2](=[OX1])[#6]),$([H1](=[OX1])([#6])[#6]),$([H0](=[OX1])([#6])([#6])[#6])]
686
-
687
- Phosphonium: [P+;!$([P]~[!#6]);!$([P]*~[#7,#8,#15,#16])]
688
- # similar to Ammonium
689
-
690
- Phosphorylen: [PX4;$([H3]=[CX3]),$([H2](=[CX3])[#6]),$([H1](=[CX3])([#6])[#6]),$([H0](=[CX3])([#6])([#6])[#6])]
691
-
692
-
693
- # conventions for the following acids and derivatives:
694
- # acids find protonated and deprotonated acids
695
- # esters do not find mixed anhydrides ( ...P-O-C(=O))
696
- # derivatives: substituents which go in place of the OH and =O are not H or C (may also be O,
697
- # thus including acids and esters)
698
-
699
- Phosphonic_acid: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
700
- # includes anions
701
-
702
- Phosphonic_monoester: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
703
-
704
- Phosphonic_diester: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
705
-
706
- Phosphonic_monoamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
707
-
708
- Phosphonic_diamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
709
-
710
- Phosphonic_esteramide: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
711
-
712
- Phosphonic_acid_derivative: [PX4;$([H1]),$([H0][#6])](=[!#6])([!#6])[!#6]
713
- # all of the above and much more
714
-
715
-
716
- Phosphoric_acid: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
717
- # includes anions
718
-
719
- Phosphoric_monoester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
720
-
721
- Phosphoric_diester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
722
-
723
- Phosphoric_triester: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
724
-
725
- Phosphoric_monoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
726
-
727
- Phosphoric_diamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
728
-
729
- Phosphoric_triamide: [PX4D4](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
730
-
731
- Phosphoric_monoestermonoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
732
-
733
- Phosphoric_diestermonoamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
734
-
735
- Phosphoric_monoesterdiamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
736
-
737
- Phosphoric_acid_derivative: [PX4D4](=[!#6])([!#6])([!#6])[!#6]
738
-
739
-
740
- Phosphinic_acid: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[$([OX2H]),$([OX1-])]
741
-
742
- Phosphinic_ester: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])]
743
-
744
- Phosphinic_amide: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
745
-
746
- Phosphinic_acid_derivative: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[!#6])[!#6]
747
-
748
-
749
- Phosphonous_acid: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
750
-
751
- Phosphonous_monoester: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
752
-
753
- Phosphonous_diester: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
754
-
755
- Phosphonous_monoamide: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
756
-
757
- Phosphonous_diamide: [PX3;$([H1]),$([H0][#6])]([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
758
-
759
- Phosphonous_esteramide: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
760
-
761
- Phosphonous_derivatives: [PX3;$([D2]),$([D3][#6])]([!#6])[!#6]
762
-
763
-
764
- Phosphinous_acid: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][$([OX2H]),$([OX1-])]
765
-
766
- Phosphinous_ester: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][OX2][#6;!$(C=[O,N,S])]
767
-
768
- Phosphinous_amide: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
769
-
770
- Phosphinous_derivatives: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][!#6]
771
-
772
-
773
- # II.4 Silicon
774
- # ------------
775
-
776
- Quart_silane: [SiX4]([#6])([#6])([#6])[#6]
777
- # four C-substituents. non-reactive, non-toxic, in experimental phase for drug development
778
-
779
- Non-quart_silane: [SiX4;$([H1]([#6])([#6])[#6]),$([H2]([#6])[#6]),$([H3][#6]),$([H4])]
780
- # has 1-4 hydride(s), reactive. Daylight's depictmatch does not add hydrogens automatically to
781
- # the free positions at Si, thus Hs had to be added implicitly
782
-
783
- Silylmonohalide: [SiX4]([FX1,ClX1,BrX1,IX1])([#6])([#6])[#6]
784
- # reagents for inserting protection groups
785
-
786
- Het_trialkylsilane: [SiX4]([!#6])([#6])([#6])[#6]
787
- # mostly acid-labile protection groups such as trimethylsilyl-ethers
788
-
789
- Dihet_dialkylsilane: [SiX4]([!#6])([!#6])([#6])[#6]
790
-
791
- Trihet_alkylsilane: [SiX4]([!#6])([!#6])([!#6])[#6]
792
-
793
- Silicic_acid_derivative: [SiX4]([!#6])([!#6])([!#6])[!#6]
794
- # four substituents which are neither C nor H
795
-
796
-
797
- # II.5 Boron
798
- # ----------
799
-
800
- Trialkylborane: [BX3]([#6])([#6])[#6]
801
- # also carbonyls allowed
802
-
803
- Boric_acid_derivatives: [BX3]([!#6])([!#6])[!#6]
804
- # includes acids, esters, amides, ... H-substituent at B is very rare.
805
-
806
- Boronic_acid_derivative: [BX3]([!#6])([!#6])[!#6]
807
- # includes acids, esters, amides, ...
808
-
809
- Borohydride: [BH1,BH2,BH3,BH4]
810
- # at least one H attached to B
811
-
812
- Quaternary_boron: [BX4]
813
- # mostly borates (negative charge), in complex with Lewis-base
814
-
815
-
816
-
817
- # Part III: Some Special Patterns
818
- # ===============================
819
-
820
-
821
- # III.1 Chains
822
- # ------------
823
-
824
- # some simple chains
825
-
826
-
827
-
828
- # III.2 Rings
829
- # -----------
830
-
831
- Aromatic: a
832
-
833
- Heterocyclic: [!#6;!R0]
834
- # may be aromatic or not
835
-
836
- Epoxide: [OX2r3]1[#6r3][#6r3]1
837
- # toxic/reactive. may be annelated to an aromatic ring, but must not be aromatic itself (oxirane-2,3-dione)
838
-
839
- NH_aziridine: [NX3H1r3]1[#6r3][#6r3]1
840
- # toxic/reactive according to Maybridge's garbage filter
841
-
842
- Spiro: [D4R;$(*(@*)(@*)(@*)@*)]
843
- # at least two different rings can be found which are sharing just one atom.
844
- # these two rings can be connected by a third ring, so it matches also some
845
- # bridged systems, like morphine
846
-
847
- Annelated_rings: [R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]
848
- # two different rings sharing exactly two atoms
849
-
850
- Bridged_rings: [R;$(*(@*)(@*)@*);!$([D4R;$(*(@*)(@*)(@*)@*)]);!$([R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])])]
851
- # part of two or more rings, not spiro, not annelated -> finds bridgehead atoms,
852
- # but only if they are not annelated at the same time - otherwise impossible (?)
853
- # to distinguish from non-bridgehead annelated atoms
854
-
855
- # some basic ring-patterns (just size, no other information):
856
-
857
-
858
-
859
-
860
-
861
- # III.3 Sugars and Nucleosides/Nucleotides, Steroids
862
- # --------------------------------------------------
863
-
864
- # because of the large variety of sugar derivatives, different patterns can be applied.
865
- # The choice of patterns and their combinations will depend on the contents of the database
866
- # e.g. natural products, nucleoside analogues with modified sugars, ... as well as on the
867
- # desired restriction
868
-
869
-
870
- Sugar_pattern_1: [OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)]
871
- # 5 or 6-membered ring containing one O and at least one (r5) or two (r6) oxygen-substituents.
872
-
873
- Sugar_pattern_2: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]
874
- # 5 or 6-membered ring containing one O and an acetal-like bond at position 2.
875
-
876
- Sugar_pattern_combi: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C(O)@C1)]
877
- # combination of the two above
878
-
879
- Sugar_pattern_2_reducing: [OX2;$([r5]1@C(!@[OX2H1])@C@C@C1),$([r6]1@C(!@[OX2H1])@C@C@C@C1)]
880
- # 5 or 6-membered cyclic hemi-acetal
881
-
882
- Sugar_pattern_2_alpha: [OX2;$([r5]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]
883
- # 5 or 6-membered cyclic hemi-acetal
884
-
885
- Sugar_pattern_2_beta: [OX2;$([r5]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]
886
- # 5 or 6-membered cyclic hemi-acetal
887
-
888
- ##Poly_sugar_1: ([OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)].[OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)])
889
- # pattern1 occurs more than once (in same molecule, but moieties don't have to be adjacent!)
890
-
891
- ##Poly_sugar_2: ([OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)].[OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)])
892
- # pattern2 occurs more than once (in same molecule, but moieties don't have to be adjacent!)
893
-
894
-
895
- # III.4 Everything else...
896
- # ------------------------
897
-
898
- Conjugated_double_bond: *=*[*]=,#,:[*]
899
-
900
- Conjugated_tripple_bond: *#*[*]=,#,:[*]
901
-
902
- Cis_double_bond: */[D2]=[D2]\*
903
- # only one single-bonded substituent on each DB-atom. no aromats.
904
- # only found when character of DB is explicitly stated.
905
-
906
- Trans_double_bond: */[D2]=[D2]/*
907
- # analog
908
-
909
- Mixed_anhydrides: [$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))][#8X2][$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))]
910
- # should hit all combinations of two acids
911
-
912
- Halogen_on_hetero: [FX1,ClX1,BrX1,IX1][!#6]
913
-
914
- Halogen_multi_subst: [F,Cl,Br,I;!$([X1]);!$([X0-])]
915
- # Halogen which is not mono-substituted nor an anion, e.g. chlorate.
916
- # Most of these cases should be also filtered by Halogen_on_hetero.
917
-
918
- Trifluoromethyl: [FX1][CX4;!$([H0][Cl,Br,I]);!$([F][C]([F])([F])[F])]([FX1])([FX1])
919
- # C with three F attached, connected to anything which is not another halogen
920
-
921
- C_ONS_bond: [#6]~[#7,#8,#16]
922
- # probably all drug-like molecules have at least one O, N, or S connected to a C -> nice filter
923
-
924
- ## Mixture: (*).(*)
925
- # two or more separate parts, may also be salt
926
- # component-level grouping is not yet supported in Open Babel Version 2.0
927
-
928
-
929
- Charged: [!+0]
930
-
931
- Anion: [-1,-2,-3,-4,-5,-6,-7]
932
-
933
- Kation: [+1,+2,+3,+4,+5,+6,+7]
934
-
935
- Salt: ([-1,-2,-3,-4,-5,-6,-7]).([+1,+2,+3,+4,+5,+6,+7])
936
- # two or more separate components with opposite charges
937
-
938
- ##Zwitterion: ([-1,-2,-3,-4,-5,-6,-7].[+1,+2,+3,+4,+5,+6,+7])
939
- # both negative and positive charges somewhere within the same molecule.
940
-
941
- 1,3-Tautomerizable: [$([#7X2,OX1,SX1]=*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=*),$([#7X3,OX2,SX2;!H0]*:n)]
942
- # 1,3 migration of H allowed. Includes keto/enol and amide/enamide.
943
- # Aromatic rings must stay aromatic - no keto form of phenol
944
-
945
- 1,5-Tautomerizable: [$([#7X2,OX1,SX1]=,:**=,:*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=**=*),$([#7X3,OX2,SX2;!H0]*=,:**:n)]
946
-
947
- Rotatable_bond: [!$(*#*)&!D1]-!@[!$(*#*)&!D1]
948
- # taken from http://www.daylight.com/support/contrib/smarts/content.html
949
-
950
- Michael_acceptor: [CX3]=[CX3][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-])]
951
- # the classical case: C=C near carbonyl, nitrile, nitro, or similar
952
- # Oxo-heteroaromats and similar are not included.
953
-
954
- Dicarbodiazene: [CX3](=[OX1])[NX2]=[NX2][CX3](=[OX1])
955
- # Michael-like acceptor, see Mitsunobu reaction
956
-
957
- # H-Bond_donor:
958
-
959
- # H-Bond_acceptor:
960
-
961
- # Pos_ionizable:
962
-
963
- # Neg_ionizable:
964
-
965
- # Unlikely_ions:
966
- # O+,N-,C+,C-, ...
967
-
968
- CH-acidic: [$([CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])]),$([CX4;!$([H0])]1[CX3]=[CX3][CX3]=[CX3]1)]
969
- # C-H alpha to carbonyl, nitro or similar, C is not double-bonded, only C, H, S,P=O and nitro substituents allowed.
970
- # pentadiene is included. acids, their salts, prim./sec. amides, and imides are excluded.
971
- # hits also CH-acidic_strong
972
-
973
- CH-acidic_strong: [CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])]([$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])])[$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])]
974
- # same as above (without pentadiene), but carbonyl or similar on two or three sides
975
-
976
- Chiral_center_specified: [$([*@](~*)(~*)(*)*),$([*@H](*)(*)*),$([*@](~*)(*)*),$([*@H](~*)~*)]
977
- # Hits atoms with tetrahedral chirality, if chiral center is specified in the SMILES string
978
- # depictmatch does not find oxonium, sulfonium, or sulfoxides!
979
-
980
- # Chiral_center_unspecified: [$([*@?](~*)(~*)(*)*),$([*@?H](*)(*)*),$([*@?](~*)(*)*),$([*@?H](~*)~*)]
981
- # Hits atoms with tetrahedral chirality, if chiral center is not specified in the SMILES string
982
- # "@?" (unspecified chirality) is not yet supported in Open Babel Version 2.0
983
-