lazar 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
@@ -0,0 +1,983 @@
|
|
1
|
+
#
|
2
|
+
# SMARTS Patterns for Functional Group Classification
|
3
|
+
#
|
4
|
+
# written by Christian Laggner
|
5
|
+
# Copyright 2005 Inte:Ligand Software-Entwicklungs und Consulting GmbH
|
6
|
+
#
|
7
|
+
# Released under the Lesser General Public License (LGPL license)
|
8
|
+
# see http://www.gnu.org/copyleft/lesser.html
|
9
|
+
# Modified from Version 221105
|
10
|
+
#####################################################################################################
|
11
|
+
|
12
|
+
# General Stuff:
|
13
|
+
# These patterns were written in an attempt to represent the classification of organic compounds
|
14
|
+
# from the viewpoint of an organic chemist.
|
15
|
+
# They are often very restrictive. This may be generally a good thing, but it also takes some time
|
16
|
+
# for filtering/indexing large compound sets.
|
17
|
+
# For filtering undesired groups (in druglike compounds) one will want to have more general patterns
|
18
|
+
# (e.g. you don't want *any* halide of *any* acid, *neither* aldehyde *nor* formyl esters and amides, ...).
|
19
|
+
#
|
20
|
+
|
21
|
+
# Part I: Carbon
|
22
|
+
# ==============
|
23
|
+
|
24
|
+
|
25
|
+
# I.1: Carbon-Carbon Bonds
|
26
|
+
# ------------------------
|
27
|
+
|
28
|
+
# I.1.1 Alkanes:
|
29
|
+
|
30
|
+
Primary_carbon: [CX4H3][#6]
|
31
|
+
|
32
|
+
Secondary_carbon: [CX4H2]([#6])[#6]
|
33
|
+
|
34
|
+
Tertiary_carbon: [CX4H1]([#6])([#6])[#6]
|
35
|
+
|
36
|
+
Quaternary_carbon: [CX4]([#6])([#6])([#6])[#6]
|
37
|
+
|
38
|
+
|
39
|
+
# I.1.2 C-C double and Triple Bonds
|
40
|
+
|
41
|
+
Alkene: [CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])]=[CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])]
|
42
|
+
# sp2 C may be substituted only by C or H -
|
43
|
+
# does not hit ketenes and allenes, nor enamines, enols and the like
|
44
|
+
|
45
|
+
Alkyne: [CX2]#[CX2]
|
46
|
+
# non-carbon substituents (e.g. alkynol ethers) are rather rare, thus no further discrimination
|
47
|
+
|
48
|
+
Allene: [CX3]=[CX2]=[CX3]
|
49
|
+
|
50
|
+
|
51
|
+
# I.2: One Carbon-Hetero Bond
|
52
|
+
# ---------------------------
|
53
|
+
|
54
|
+
|
55
|
+
# I.2.1 Alkyl Halogenides
|
56
|
+
|
57
|
+
Alkylchloride: [ClX1][CX4]
|
58
|
+
# will also hit chloromethylethers and the like, but no chloroalkenes, -alkynes or -aromats
|
59
|
+
# a more restrictive version can be obtained by modifying the Alcohol string.
|
60
|
+
|
61
|
+
Alkylfluoride: [FX1][CX4]
|
62
|
+
|
63
|
+
Alkylbromide: [BrX1][CX4]
|
64
|
+
|
65
|
+
Alkyliodide: [IX1][CX4]
|
66
|
+
|
67
|
+
|
68
|
+
# I.2.2 Alcohols and Ethers
|
69
|
+
|
70
|
+
Alcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])]
|
71
|
+
# nonspecific definition, no acetals, aminals, and the like
|
72
|
+
|
73
|
+
Primary_alcohol: [OX2H][CX4H2;!$(C([OX2H])[O,S,#7,#15])]
|
74
|
+
|
75
|
+
Secondary_alcohol: [OX2H][CX4H;!$(C([OX2H])[O,S,#7,#15])]
|
76
|
+
|
77
|
+
Tertiary_alcohol: [OX2H][CX4D4;!$(C([OX2H])[O,S,#7,#15])]
|
78
|
+
|
79
|
+
Dialkylether: [OX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])]
|
80
|
+
# no acetals and the like; no enolethers
|
81
|
+
|
82
|
+
Dialkylthioether: [SX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])]
|
83
|
+
# no acetals and the like; no enolethers
|
84
|
+
|
85
|
+
Alkylarylether: [OX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])]
|
86
|
+
# no acetals and the like; no enolethers
|
87
|
+
|
88
|
+
Diarylether: [c][OX2][c]
|
89
|
+
|
90
|
+
Alkylarylthioether: [SX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])]
|
91
|
+
|
92
|
+
Diarylthioether: [c][SX2][c]
|
93
|
+
|
94
|
+
Oxonium: [O+;!$([O]~[!#6]);!$([S]*~[#7,#8,#15,#16])]
|
95
|
+
# can't be aromatic, thus O and not #8
|
96
|
+
|
97
|
+
# I.2.3 Amines
|
98
|
+
|
99
|
+
Amine: [NX3+0,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])]
|
100
|
+
# hits all amines (prim/sec/tert/quart), including ammonium salts, also enamines, but not amides, imides, aminals, ...
|
101
|
+
|
102
|
+
# the following amines include also the protonated forms
|
103
|
+
|
104
|
+
Primary_aliph_amine: [NX3H2+0,NX4H3+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
|
105
|
+
|
106
|
+
Secondary_aliph_amine: [NX3H1+0,NX4H2+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
|
107
|
+
|
108
|
+
Tertiary_aliph_amine: [NX3H0+0,NX4H1+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
|
109
|
+
|
110
|
+
Quaternary_aliph_ammonium: [NX4H0+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
|
111
|
+
|
112
|
+
Primary_arom_amine: [NX3H2+0,NX4H3+]c
|
113
|
+
|
114
|
+
Secondary_arom_amine: [NX3H1+0,NX4H2+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])]
|
115
|
+
|
116
|
+
Tertiary_arom_amine: [NX3H0+0,NX4H1+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])]
|
117
|
+
|
118
|
+
Quaternary_arom_ammonium: [NX4H0+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])]
|
119
|
+
|
120
|
+
Secondary_mixed_amine: [NX3H1+0,NX4H2+;$([N]([c])[C]);!$([N]*~[#7,#8,#15,#16])]
|
121
|
+
|
122
|
+
Tertiary_mixed_amine: [NX3H0+0,NX4H1+;$([N]([c])([C])[#6]);!$([N]*~[#7,#8,#15,#16])]
|
123
|
+
|
124
|
+
Quaternary_mixed_ammonium: [NX4H0+;$([N]([c])([C])[#6][#6]);!$([N]*~[#7,#8,#15,#16])]
|
125
|
+
|
126
|
+
Ammonium: [N+;!$([N]~[!#6]);!$(N=*);!$([N]*~[#7,#8,#15,#16])]
|
127
|
+
# only C and H substituents allowed. Quaternary or protonated amines
|
128
|
+
# NX4+ or Nv4+ is not recognized by Daylight's depictmatch if less than four C are present
|
129
|
+
|
130
|
+
|
131
|
+
# I.2.4 Others
|
132
|
+
|
133
|
+
Alkylthiol: [SX2H][CX4;!$(C([SX2H])~[O,S,#7,#15])]
|
134
|
+
|
135
|
+
Dialkylthioether: [SX2]([CX4;!$(C([SX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([SX2])[O,S,#7,#15])]
|
136
|
+
|
137
|
+
Alkylarylthioether: [SX2](c)[CX4;!$(C([SX2])[O,S,#7,#15])]
|
138
|
+
|
139
|
+
Disulfide: [SX2D2][SX2D2]
|
140
|
+
|
141
|
+
1,2-Aminoalcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15,F,Cl,Br,I])][CX4;!$(C([N])[O,S,#7,#15])][NX3;!$(NC=[O,S,N])]
|
142
|
+
# does not hit alpha-amino acids, enaminoalcohols, 1,2-aminoacetals, o-aminophenols, etc.
|
143
|
+
|
144
|
+
1,2-Diol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])][CX4;!$(C([OX2H])[O,S,#7,#15])][OX2H]
|
145
|
+
# does not hit alpha-hydroxy acids, enolalcohols, 1,2-hydroxyacetals, 1,2-diphenols, etc.
|
146
|
+
|
147
|
+
1,1-Diol: [OX2H][CX4;!$(C([OX2H])([OX2H])[O,S,#7,#15])][OX2H]
|
148
|
+
|
149
|
+
Hydroperoxide: [OX2H][OX2]
|
150
|
+
# does not necessarily have to be connected to a carbon atom; also includes hydrotrioxides
|
151
|
+
|
152
|
+
Peroxo: [OX2D2][OX2D2]
|
153
|
+
|
154
|
+
Organolithium_compounds: [LiX1][#6,#14]
|
155
|
+
|
156
|
+
Organomagnesium_compounds: [MgX2][#6,#14]
|
157
|
+
# not restricted to Grignard compounds, also dialkyl Mg
|
158
|
+
|
159
|
+
Organometallic_compounds: [!#1;!#5;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#33;!#34;!#35;!#52;!#53;!#85]~[#6;!-]
|
160
|
+
# very general, includes all metals covalently bound to carbon
|
161
|
+
|
162
|
+
|
163
|
+
# I.3: Two Carbon-Hetero Bonds (Carbonyl and Derivatives)
|
164
|
+
# ----------------------------
|
165
|
+
|
166
|
+
# I.3.1 Double Bond to Hetero
|
167
|
+
|
168
|
+
Aldehyde: [$([CX3H][#6]),$([CX3H2])]=[OX1]
|
169
|
+
# hits aldehydes including formaldehyde
|
170
|
+
|
171
|
+
Ketone: [#6][CX3](=[OX1])[#6]
|
172
|
+
# does not include oxo-groups connected to a (hetero-) aromatic ring
|
173
|
+
|
174
|
+
Thioaldehyde: [$([CX3H][#6]),$([CX3H2])]=[SX1]
|
175
|
+
|
176
|
+
Thioketone: [#6][CX3](=[SX1])[#6]
|
177
|
+
# does not include thioxo-groups connected to a (hetero-) aromatic ring
|
178
|
+
|
179
|
+
Imine: [NX2;$([N][#6]),$([NH]);!$([N][CX3]=[#7,#8,#15,#16])]=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])]
|
180
|
+
# nitrogen is not part of an amide-like structure, nor of an aromatic ring, but can be part of an aminal or similar
|
181
|
+
|
182
|
+
Immonium: [NX3+;!$([N][!#6]);!$([N][CX3]=[#7,#8,#15,#16])]
|
183
|
+
|
184
|
+
Oxime: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2H]
|
185
|
+
|
186
|
+
Oximether: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2][#6;!$(C=[#7,#8])]
|
187
|
+
# ether, not ester or amide; does not hit isoxazole
|
188
|
+
|
189
|
+
|
190
|
+
# I.3.2. Two Single Bonds to Hetero
|
191
|
+
|
192
|
+
Acetal: [OX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])]
|
193
|
+
# does not hit hydroxy-methylesters, ketenacetals, hemiacetals, orthoesters, etc.
|
194
|
+
|
195
|
+
Hemiacetal: [OX2H][CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])]
|
196
|
+
|
197
|
+
Aminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][NX3v3;!$(NC=[#7,#8,#15,#16])][#6]
|
198
|
+
# Ns are not part of an amide or similar. v3 is to exclude nitro and similar groups
|
199
|
+
|
200
|
+
Hemiaminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][OX2H]
|
201
|
+
|
202
|
+
Thioacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][SX2][#6;!$(C=[O,S,N])]
|
203
|
+
|
204
|
+
Thiohemiacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][OX2H]
|
205
|
+
|
206
|
+
Halogen_acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1]
|
207
|
+
# hits chloromethylenethers and other reactive alkylating agents
|
208
|
+
|
209
|
+
Acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])]
|
210
|
+
# includes all of the above and other combinations (S-C-N, hydrates, ...), but still no aminomethylenesters and similar
|
211
|
+
|
212
|
+
Halogenmethylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1]
|
213
|
+
# also reactive alkylating agents. Acid does not have to be carboxylic acid, also S- and P-based acids allowed
|
214
|
+
|
215
|
+
NOS_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])]
|
216
|
+
# Same as above, but N,O or S instead of halogen. Ester/amide allowed only on one side
|
217
|
+
|
218
|
+
Hetero_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])]
|
219
|
+
# Combination of the last two patterns
|
220
|
+
|
221
|
+
Cyanhydrine: [NX1]#[CX2][CX4;$([CH2]),$([CH]([CX2])[#6]),$(C([CX2])([#6])[#6])][OX2H]
|
222
|
+
|
223
|
+
|
224
|
+
# I.3.3 Single Bond to Hetero, C=C Double Bond (Enols and Similar)
|
225
|
+
|
226
|
+
Chloroalkene: [ClX1][CX3]=[CX3]
|
227
|
+
|
228
|
+
Fluoroalkene: [FX1][CX3]=[CX3]
|
229
|
+
|
230
|
+
Bromoalkene: [BrX1][CX3]=[CX3]
|
231
|
+
|
232
|
+
Iodoalkene: [IX1][CX3]=[CX3]
|
233
|
+
|
234
|
+
Enol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3]
|
235
|
+
# no phenols
|
236
|
+
|
237
|
+
Endiol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3;$([H1]),$(C[#6])][OX2H]
|
238
|
+
# no 1,2-diphenols, ketenacetals, ...
|
239
|
+
|
240
|
+
Enolether: [OX2]([#6;!$(C=[N,O,S])])[CX3;$([H0][#6]),$([H1])]=[CX3]
|
241
|
+
# also finds enediol diethers, but not enol esters; no aromatics
|
242
|
+
|
243
|
+
Enolester: [OX2]([CX3]=[OX1])[#6X3;$([#6][#6]),$([H1])]=[#6X3;!$(C[OX2H])]
|
244
|
+
|
245
|
+
|
246
|
+
Enamine: [NX3;$([NH2][CX3]),$([NH1]([CX3])[#6]),$([N]([CX3])([#6])[#6]);!$([N]*=[#7,#8,#15,#16])][CX3;$([CH]),$([C][#6])]=[CX3]
|
247
|
+
# does not hit amines attached to aromatic rings, nor may the nitrogen be aromatic
|
248
|
+
|
249
|
+
Thioenol: [SX2H][CX3;$([H1]),$(C[#6])]=[CX3]
|
250
|
+
|
251
|
+
Thioenolether: [SX2]([#6;!$(C=[N,O,S])])[CX3;$(C[#6]),$([CH])]=[CX3]
|
252
|
+
|
253
|
+
|
254
|
+
# I.4: Three Carbon-Hetero Bonds (Carboxyl and Derivatives)
|
255
|
+
# ------------------------------
|
256
|
+
|
257
|
+
Acylchloride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[ClX1]
|
258
|
+
|
259
|
+
Acylfluoride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1]
|
260
|
+
|
261
|
+
Acylbromide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[BrX1]
|
262
|
+
|
263
|
+
Acyliodide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[IX1]
|
264
|
+
|
265
|
+
Acylhalide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1,ClX1,BrX1,IX1]
|
266
|
+
# all of the above
|
267
|
+
|
268
|
+
|
269
|
+
# The following contains all simple carboxylic combinations of O, N, S, & Hal -
|
270
|
+
# - acids, esters, amides, ... as well as a few extra cases (anhydride, hydrazide...)
|
271
|
+
# Cyclic structures (including aromats) like lactones, lactames, ... got their own
|
272
|
+
# definitions. Structures where both heteroatoms are part of an aromatic ring
|
273
|
+
# (oxazoles, imidazoles, ...) were excluded.
|
274
|
+
|
275
|
+
Carboxylic_acid: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[$([OX2H]),$([OX1-])]
|
276
|
+
# includes carboxylate anions
|
277
|
+
|
278
|
+
Carboxylic_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][#6;!$(C=[O,N,S])]
|
279
|
+
# does not hit anhydrides or lactones
|
280
|
+
|
281
|
+
Lactone: [#6][#6X3R](=[OX1])[#8X2][#6;!$(C=[O,N,S])]
|
282
|
+
# may also be aromatic
|
283
|
+
|
284
|
+
Carboxylic_anhydride: [CX3;$([H0][#6]),$([H1])](=[OX1])[#8X2][CX3;$([H0][#6]),$([H1])](=[OX1])
|
285
|
+
# anhydride formed by two carboxylic acids, no mixed anhydrides (e.g. between carboxylic acid and sulfuric acid); may be part of a ring, even aromatic
|
286
|
+
|
287
|
+
Carboxylic_acid_derivative: [$([#6X3H0][#6]),$([#6X3H])](=[!#6])[!#6]
|
288
|
+
# includes most of the structures of I.4 and many more, also 1,3-heteroaromatics such as isoxazole
|
289
|
+
|
290
|
+
Carbothioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[OX1])[$([SX2H]),$([SX1-])]),$([C](=[SX1])[$([OX2H]),$([OX1-])])]
|
291
|
+
# hits both tautomeric forms, as well as anions
|
292
|
+
|
293
|
+
Carbothioic_S_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[SX2][#6;!$(C=[O,N,S])]
|
294
|
+
|
295
|
+
Carbothioic_S_lactone: [#6][#6X3R](=[OX1])[#16X2][#6;!$(C=[O,N,S])]
|
296
|
+
# may also be aromatic
|
297
|
+
|
298
|
+
Carbothioic_O_ester: [CX3;$([H0][#6]),$([H1])](=[SX1])[OX2][#6;!$(C=[O,N,S])]
|
299
|
+
|
300
|
+
Carbothioic_O_lactone: [#6][#6X3R](=[SX1])[#8X2][#6;!$(C=[O,N,S])]
|
301
|
+
|
302
|
+
Carbothioic_halide: [CX3;$([H0][#6]),$([H1])](=[SX1])[FX1,ClX1,BrX1,IX1]
|
303
|
+
|
304
|
+
Carbodithioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2H])]
|
305
|
+
|
306
|
+
Carbodithioic_ester: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2][#6;!$(C=[O,N,S])])]
|
307
|
+
|
308
|
+
Carbodithiolactone: [#6][#6X3R](=[SX1])[#16X2][#6;!$(C=[O,N,S])]
|
309
|
+
|
310
|
+
|
311
|
+
Amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
312
|
+
# does not hit lactames
|
313
|
+
|
314
|
+
Primary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[NX3H2]
|
315
|
+
|
316
|
+
Secondary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]
|
317
|
+
|
318
|
+
Tertiary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])]
|
319
|
+
|
320
|
+
Lactam: [#6R][#6X3R](=[OX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
321
|
+
# cyclic amides, may also be aromatic
|
322
|
+
|
323
|
+
Alkyl_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([#6])[#6X3;$([H0][#6]),$([H1])](=[OX1])
|
324
|
+
# may be part of a ring, even aromatic. only C allowed at central N. May also be triacyl amide
|
325
|
+
|
326
|
+
N_hetero_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([!#6])[#6X3;$([H0][#6]),$([H1])](=[OX1])
|
327
|
+
# everything else than H or C at central N
|
328
|
+
|
329
|
+
Imide_acidic: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H1][#6X3;$([H0][#6]),$([H1])](=[OX1])
|
330
|
+
# can be deprotonated
|
331
|
+
|
332
|
+
Thioamide: [$([CX3;!R][#6]),$([CX3H;!R])](=[SX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
333
|
+
# does not hit thiolactames
|
334
|
+
|
335
|
+
Thiolactam: [#6R][#6X3R](=[SX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
336
|
+
# cyclic thioamides, may also be aromatic
|
337
|
+
|
338
|
+
|
339
|
+
Oximester: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#8X2][#7X2]=,:[#6X3;$([H0]([#6])[#6]),$([H1][#6]),$([H2])]
|
340
|
+
# may also be part of a ring / aromatic
|
341
|
+
|
342
|
+
Amidine: [NX3;!$(NC=[O,S])][CX3;$([CH]),$([C][#6])]=[NX2;!$(NC=[O,S])]
|
343
|
+
# only basic amidines, not as part of aromatic ring (e.g. imidazole)
|
344
|
+
|
345
|
+
Hydroxamic_acid: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][$([OX2H]),$([OX1-])]
|
346
|
+
|
347
|
+
Hydroxamic_acid_ester: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][OX2][#6;!$(C=[O,N,S])]
|
348
|
+
# does not hit anhydrides of carboxylic acids with hydroxamic acids
|
349
|
+
|
350
|
+
|
351
|
+
Imidoacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])]
|
352
|
+
# not cyclic
|
353
|
+
|
354
|
+
Imidoacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])]
|
355
|
+
# the enamide-form of lactames. may be aromatic like 2-hydroxypyridine
|
356
|
+
|
357
|
+
Imidoester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])]
|
358
|
+
# esters of the above structures. no anhydrides.
|
359
|
+
|
360
|
+
Imidolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])]
|
361
|
+
# no oxazoles and similar
|
362
|
+
|
363
|
+
Imidothioacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])]
|
364
|
+
# not cyclic
|
365
|
+
|
366
|
+
Imidothioacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])]
|
367
|
+
# the enamide-form of thiolactames. may be aromatic like 2-thiopyridine
|
368
|
+
|
369
|
+
Imidothioester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])]
|
370
|
+
# thioesters of the above structures. no anhydrides.
|
371
|
+
|
372
|
+
Imidothiolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])]
|
373
|
+
# no thioxazoles and similar
|
374
|
+
|
375
|
+
Amidine: [#7X3v3;!$(N([#6X3]=[#7X2])C=[O,S])][CX3R0;$([H1]),$([H0][#6])]=[NX2v3;!$(N(=[#6X3][#7X3])C=[O,S])]
|
376
|
+
# only basic amidines, not substituted by carbonyl or thiocarbonyl, not as part of a ring
|
377
|
+
|
378
|
+
Imidolactam: [#6][#6X3R;$([H0](=[NX2;!$(N(=[#6X3][#7X3])C=[O,S])])[#7X3;!$(N([#6X3]=[#7X2])C=[O,S])]),$([H0](-[NX3;!$(N([#6X3]=[#7X2])C=[O,S])])=,:[#7X2;!$(N(=[#6X3][#7X3])C=[O,S])])]
|
379
|
+
# one of the two C~N bonds is part of a ring (may be aromatic), but not both - thus no imidazole
|
380
|
+
|
381
|
+
Imidoylhalide: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1]
|
382
|
+
# not cyclic
|
383
|
+
|
384
|
+
Imidoylhalide_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1]
|
385
|
+
# may also be aromatic
|
386
|
+
|
387
|
+
# may be ring, aromatic, substituted with carbonyls, hetero, ...
|
388
|
+
# (everything else would get too complicated)
|
389
|
+
|
390
|
+
Amidrazone: [$([$([#6X3][#6]),$([#6X3H])](=[#7X2v3])[#7X3v3][#7X3v3]),$([$([#6X3][#6]),$([#6X3H])]([#7X3v3])=[#7X2v3][#7X3v3])]
|
391
|
+
# hits both tautomers. as above, it may be ring, aromatic, substituted with carbonyls, hetero, ...
|
392
|
+
|
393
|
+
|
394
|
+
Alpha_aminoacid: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[OX2H,OX1-]
|
395
|
+
# N may be alkylated, but not part of an amide (as in peptides), ionic forms are included
|
396
|
+
# includes also non-natural aminoacids with double-bonded or two aliph./arom. substituents at alpha-C
|
397
|
+
# N may not be aromatic as in 1H-pyrrole-2-carboxylic acid
|
398
|
+
|
399
|
+
Alpha_hydroxyacid: [OX2H][C][CX3](=[OX1])[OX2H,OX1-]
|
400
|
+
|
401
|
+
Peptide_middle: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])]
|
402
|
+
# finds peptidic structures which are neither C- nor N-terminal. Both neighbours must be amino-acids/peptides
|
403
|
+
|
404
|
+
Peptide_C_term: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[OX2H,OX1-]
|
405
|
+
# finds C-terminal amino acids
|
406
|
+
|
407
|
+
Peptide_N_term: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])]
|
408
|
+
# finds N-terminal amino acids. As above, N may be substituted, but not part of an amide-bond.
|
409
|
+
|
410
|
+
|
411
|
+
Carboxylic_orthoester: [#6][OX2][CX4;$(C[#6]),$([CH])]([OX2][#6])[OX2][#6]
|
412
|
+
# also hits anhydride-like structures (e.g. HC(OMe)2-OC=O residues)
|
413
|
+
|
414
|
+
Ketene: [CX3]=[CX2]=[OX1]
|
415
|
+
|
416
|
+
Ketenacetal: [#7X2,#8X3,#16X2;$(*[#6,#14])][#6X3]([#7X2,#8X3,#16X2;$(*[#6,#14])])=[#6X3]
|
417
|
+
# includes aminals, silylacetals, ketenesters, etc. C=C DB is not aromatic, everything else may be
|
418
|
+
|
419
|
+
Nitrile: [NX1]#[CX2]
|
420
|
+
# includes cyanhydrines
|
421
|
+
|
422
|
+
Isonitrile: [CX1-]#[NX2+]
|
423
|
+
|
424
|
+
|
425
|
+
Vinylogous_carbonyl_or_carboxyl_derivative: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7,#8,#16,F,Cl,Br,I]
|
426
|
+
# may be part of a ring, even aromatic
|
427
|
+
|
428
|
+
Vinylogous_acid: [#6X3](=[OX1])[#6X3]=,:[#6X3][$([OX2H]),$([OX1-])]
|
429
|
+
|
430
|
+
Vinylogous_ester: [#6X3](=[OX1])[#6X3]=,:[#6X3][#6;!$(C=[O,N,S])]
|
431
|
+
|
432
|
+
Vinylogous_amide: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
433
|
+
|
434
|
+
Vinylogous_halide: [#6X3](=[OX1])[#6X3]=,:[#6X3][FX1,ClX1,BrX1,IX1]
|
435
|
+
|
436
|
+
|
437
|
+
|
438
|
+
# I.5: Four Carbon-Hetero Bonds (Carbonic Acid and Derivatives)
|
439
|
+
# -----------------------------
|
440
|
+
|
441
|
+
Carbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[OX1])[#8X2][#6;!$(C=[O,N,S])]
|
442
|
+
# may be part of a ring, even aromatic
|
443
|
+
|
444
|
+
Carbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[OX2][FX1,ClX1,BrX1,IX1]
|
445
|
+
|
446
|
+
Carbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[$([OX2H]),$([OX1-])]
|
447
|
+
# unstable
|
448
|
+
|
449
|
+
Carbonic_acid_derivatives: [!#6][#6X3](=[!#6])[!#6]
|
450
|
+
|
451
|
+
|
452
|
+
Thiocarbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[SX1])[#8X2][#6;!$(C=[O,N,S])]
|
453
|
+
# may be part of a ring, even aromatic
|
454
|
+
|
455
|
+
Thiocarbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[OX2][FX1,ClX1,BrX1,IX1]
|
456
|
+
|
457
|
+
Thiocarbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[$([OX2H]),$([OX1-])]
|
458
|
+
|
459
|
+
|
460
|
+
Urea:[#7X3;!$([#7][!#6])][#6X3](=[OX1])[#7X3;!$([#7][!#6])]
|
461
|
+
# no check whether part of imide, biuret, etc. Aromatic structures are only hit if
|
462
|
+
# both N share no double bonds, like in the dioxo-form of uracil
|
463
|
+
|
464
|
+
Thiourea: [#7X3;!$([#7][!#6])][#6X3](=[SX1])[#7X3;!$([#7][!#6])]
|
465
|
+
|
466
|
+
Isourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#8X2&!$([#8][!#6]),OX1-])[#7X3;!$([#7][!#6])]
|
467
|
+
# O may be substituted. no check whether further amide-like bonds are present. Aromatic
|
468
|
+
# structures are only hit if single bonded N shares no additional double bond, like in
|
469
|
+
# the 1-hydroxy-3-oxo form of uracil
|
470
|
+
|
471
|
+
Isothiourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#16X2&!$([#16][!#6]),SX1-])[#7X3;!$([#7][!#6])]
|
472
|
+
|
473
|
+
Guanidine: [N;v3X3,v4X4+][CX3](=[N;v3X2,v4X3+])[N;v3X3,v4X4+]
|
474
|
+
# also hits guanidinium salts. v3 and v4 to avoid nitroamidines
|
475
|
+
|
476
|
+
Carbaminic_acid: [NX3]C(=[OX1])[O;X2H,X1-]
|
477
|
+
# quite unstable, unlikely to be found. Also hits salts
|
478
|
+
|
479
|
+
Urethan: [#7X3][#6](=[OX1])[#8X2][#6]
|
480
|
+
# also hits when part of a ring, no check whether the last C is part of carbonyl
|
481
|
+
|
482
|
+
Biuret: [#7X3][#6](=[OX1])[#7X3][#6](=[OX1])[#7X3]
|
483
|
+
|
484
|
+
Semicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1]
|
485
|
+
|
486
|
+
Carbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[OX1]
|
487
|
+
|
488
|
+
Semicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1]
|
489
|
+
|
490
|
+
Carbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[OX1]
|
491
|
+
|
492
|
+
Thiosemicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1]
|
493
|
+
|
494
|
+
Thiocarbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[SX1]
|
495
|
+
|
496
|
+
Thiosemicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1]
|
497
|
+
|
498
|
+
Thiocarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[SX1]
|
499
|
+
|
500
|
+
|
501
|
+
Isocyanate: [NX2]=[CX2]=[OX1]
|
502
|
+
|
503
|
+
Cyanate: [OX2][CX2]#[NX1]
|
504
|
+
|
505
|
+
Isothiocyanate: [NX2]=[CX2]=[SX1]
|
506
|
+
|
507
|
+
Thiocyanate: [SX2][CX2]#[NX1]
|
508
|
+
|
509
|
+
Carbodiimide: [NX2]=[CX2]=[NX2]
|
510
|
+
|
511
|
+
Orthocarbonic_derivatives: [CX4H0]([O,S,#7])([O,S,#7])([O,S,#7])[O,S,#7,F,Cl,Br,I]
|
512
|
+
# halogen allowed just once, to avoid mapping to -OCF3 and similar groups (much more
|
513
|
+
# stable than, for example, C(OCH3)4)
|
514
|
+
|
515
|
+
|
516
|
+
# I.6 Aromatics
|
517
|
+
# -------------
|
518
|
+
|
519
|
+
# I know that this classification is not very logical, arylamines are found under I.2 ...
|
520
|
+
|
521
|
+
Phenol: [OX2H][c]
|
522
|
+
|
523
|
+
1,2-Diphenol: [OX2H][c][c][OX2H]
|
524
|
+
|
525
|
+
Arylchloride: [Cl][c]
|
526
|
+
|
527
|
+
Arylfluoride: [F][c]
|
528
|
+
|
529
|
+
Arylbromide: [Br][c]
|
530
|
+
|
531
|
+
Aryliodide: [I][c]
|
532
|
+
|
533
|
+
Arylthiol: [SX2H][c]
|
534
|
+
|
535
|
+
Iminoarene: [c]=[NX2;$([H1]),$([H0][#6;!$([C]=[N,S,O])])]
|
536
|
+
# N may be substituted with H or C, but not carbonyl or similar
|
537
|
+
# aromatic atom is always C, not S or P (these are not planar when substituted)
|
538
|
+
|
539
|
+
Oxoarene: [c]=[OX1]
|
540
|
+
|
541
|
+
Thioarene: [c]=[SX1]
|
542
|
+
|
543
|
+
Hetero_N_basic_H: [nX3H1+0]
|
544
|
+
# as in pyrrole. uncharged to exclude pyridinium ions
|
545
|
+
|
546
|
+
Hetero_N_basic_no_H: [nX3H0+0]
|
547
|
+
# as in N-methylpyrrole. uncharged to exclude pyridinium ions
|
548
|
+
|
549
|
+
Hetero_N_nonbasic: [nX2,nX3+]
|
550
|
+
# as in pyridine, pyridinium
|
551
|
+
|
552
|
+
Hetero_O: [o]
|
553
|
+
|
554
|
+
Hetero_S: [sX2]
|
555
|
+
# X2 because Daylight's depictmatch falsely describes C1=CS(=O)C=C1 as aromatic
|
556
|
+
# (is not planar because of lonepair at S)
|
557
|
+
|
558
|
+
Heteroaromatic: [a;!c]
|
559
|
+
|
560
|
+
|
561
|
+
# Part II: N, S, P, Si, B
|
562
|
+
# =======================
|
563
|
+
|
564
|
+
|
565
|
+
# II.1 Nitrogen
|
566
|
+
# -------------
|
567
|
+
|
568
|
+
Nitrite: [NX2](=[OX1])[O;$([X2]),$([X1-])]
|
569
|
+
# hits nitrous acid, its anion, esters, and other O-substituted derivatives
|
570
|
+
|
571
|
+
Thionitrite: [SX2][NX2]=[OX1]
|
572
|
+
|
573
|
+
Nitrate: [$([NX3](=[OX1])(=[OX1])[O;$([X2]),$([X1-])]),$([NX3+]([OX1-])(=[OX1])[O;$([X2]),$([X1-])])]
|
574
|
+
# hits nitric acid, its anion, esters, and other O-substituted derivatives
|
575
|
+
|
576
|
+
Nitro: [$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]
|
577
|
+
# hits nitro groups attached to C,N, ... but not nitrates
|
578
|
+
|
579
|
+
Nitroso: [NX2](=[OX1])[!#7;!#8]
|
580
|
+
# no nitrites, no nitrosamines
|
581
|
+
|
582
|
+
Azide: [NX1]~[NX2]~[NX2,NX1]
|
583
|
+
# hits both mesomeric forms, also anion
|
584
|
+
|
585
|
+
Acylazide: [CX3](=[OX1])[NX2]~[NX2]~[NX1]
|
586
|
+
|
587
|
+
Diazo: [$([#6]=[NX2+]=[NX1-]),$([#6-]-[NX2+]#[NX1])]
|
588
|
+
|
589
|
+
Diazonium: [#6][NX2+]#[NX1]
|
590
|
+
|
591
|
+
Nitrosamine: [#7;!$(N*=O)][NX2]=[OX1]
|
592
|
+
|
593
|
+
Nitrosamide: [NX2](=[OX1])N-*=O
|
594
|
+
# includes nitrososulfonamides
|
595
|
+
|
596
|
+
N-Oxide: [$([#7+][OX1-]),$([#7v5]=[OX1]);!$([#7](~[O])~[O]);!$([#7]=[#7])]
|
597
|
+
# Hits both forms. Won't hit azoxy, nitro, nitroso, or nitrate.
|
598
|
+
|
599
|
+
|
600
|
+
Hydrazine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])]
|
601
|
+
# no hydrazides
|
602
|
+
|
603
|
+
Hydrazone: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX2]=[#6]
|
604
|
+
|
605
|
+
Hydroxylamine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][OX2;$([H1]),$(O[#6;!$(C=[N,O,S])])]
|
606
|
+
# no discrimination between O-, N-, and O,N-substitution
|
607
|
+
|
608
|
+
|
609
|
+
# II.2 Sulfur
|
610
|
+
# -----------
|
611
|
+
|
612
|
+
Sulfon: [$([SX4](=[OX1])(=[OX1])([#6])[#6]),$([SX4+2]([OX1-])([OX1-])([#6])[#6])]
|
613
|
+
# can't be aromatic, thus S and not #16
|
614
|
+
|
615
|
+
Sulfoxide: [$([SX3](=[OX1])([#6])[#6]),$([SX3+]([OX1-])([#6])[#6])]
|
616
|
+
|
617
|
+
Sulfonium: [S+;!$([S]~[!#6]);!$([S]*~[#7,#8,#15,#16])]
|
618
|
+
# can't be aromatic, thus S and not #16
|
619
|
+
|
620
|
+
Sulfuric_acid: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
|
621
|
+
# includes anions
|
622
|
+
|
623
|
+
Sulfuric_monoester: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
|
624
|
+
|
625
|
+
Sulfuric_diester: [SX4](=[OX1])(=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
|
626
|
+
|
627
|
+
Sulfuric_monoamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])]
|
628
|
+
|
629
|
+
Sulfuric_diamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
630
|
+
|
631
|
+
Sulfuric_esteramide: [SX4](=[OX1])(=[OX1])([#7X3][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
|
632
|
+
|
633
|
+
Sulfuric_derivative: [SX4D4](=[!#6])(=[!#6])([!#6])[!#6]
|
634
|
+
# everything else (would not be a "true" derivative of sulfuric acid, if one of the substituents were less electronegative
|
635
|
+
# than sulfur, but this should be very very rare, anyway)
|
636
|
+
|
637
|
+
|
638
|
+
|
639
|
+
#### sulfurous acid and derivatives missing!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
640
|
+
|
641
|
+
|
642
|
+
|
643
|
+
|
644
|
+
Sulfonic_acid: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[$([OX2H]),$([OX1-])]
|
645
|
+
|
646
|
+
Sulfonamide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
647
|
+
|
648
|
+
Sulfonic_ester: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[OX2][#6;!$(C=[O,N,S])]
|
649
|
+
|
650
|
+
Sulfonic_halide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[FX1,ClX1,BrX1,IX1]
|
651
|
+
|
652
|
+
Sulfonic_derivative: [SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6])[!#6]
|
653
|
+
# includes all of the above and many more
|
654
|
+
# for comparison: this is what "all sulfonic derivatives but not the ones above" would look like:
|
655
|
+
# [$([SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6;!O])[!#6]),$([SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[!$([FX1,ClX1,BrX1,IX1]);!$([#6]);!$([OX2H]);!$([OX1-]);!$([OX2][#6;!$(C=[O,N,S])]);!$([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])])]
|
656
|
+
|
657
|
+
|
658
|
+
Sulfinic_acid: [SX3;$([H1]),$([H0][#6])](=[OX1])[$([OX2H]),$([OX1-])]
|
659
|
+
|
660
|
+
Sulfinic_amide: [SX3;$([H1]),$([H0][#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
661
|
+
|
662
|
+
Sulfinic_ester: [SX3;$([H1]),$([H0][#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])]
|
663
|
+
|
664
|
+
Sulfinic_halide: [SX3;$([H1]),$([H0][#6])](=[OX1])[FX1,ClX1,BrX1,IX1]
|
665
|
+
|
666
|
+
Sulfinic_derivative: [SX3;$([H1]),$([H0][#6])](=[!#6])[!#6]
|
667
|
+
|
668
|
+
Sulfenic_acid: [SX2;$([H1]),$([H0][#6])][$([OX2H]),$([OX1-])]
|
669
|
+
|
670
|
+
Sulfenic_amide: [SX2;$([H1]),$([H0][#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
671
|
+
|
672
|
+
Sulfenic_ester: [SX2;$([H1]),$([H0][#6])][OX2][#6;!$(C=[O,N,S])]
|
673
|
+
|
674
|
+
Sulfenic_halide: [SX2;$([H1]),$([H0][#6])][FX1,ClX1,BrX1,IX1]
|
675
|
+
|
676
|
+
Sulfenic_derivative: [SX2;$([H1]),$([H0][#6])][!#6]
|
677
|
+
|
678
|
+
|
679
|
+
# II.3 Phosphorus
|
680
|
+
# ---------------
|
681
|
+
|
682
|
+
Phosphine: [PX3;$([H3]),$([H2][#6]),$([H1]([#6])[#6]),$([H0]([#6])([#6])[#6])]
|
683
|
+
# similar to amine, but less restrictive: includes also amide- and aminal-analogues
|
684
|
+
|
685
|
+
Phosphine_oxide: [PX4;$([H3]=[OX1]),$([H2](=[OX1])[#6]),$([H1](=[OX1])([#6])[#6]),$([H0](=[OX1])([#6])([#6])[#6])]
|
686
|
+
|
687
|
+
Phosphonium: [P+;!$([P]~[!#6]);!$([P]*~[#7,#8,#15,#16])]
|
688
|
+
# similar to Ammonium
|
689
|
+
|
690
|
+
Phosphorylen: [PX4;$([H3]=[CX3]),$([H2](=[CX3])[#6]),$([H1](=[CX3])([#6])[#6]),$([H0](=[CX3])([#6])([#6])[#6])]
|
691
|
+
|
692
|
+
|
693
|
+
# conventions for the following acids and derivatives:
|
694
|
+
# acids find protonated and deprotonated acids
|
695
|
+
# esters do not find mixed anhydrides ( ...P-O-C(=O))
|
696
|
+
# derivatives: substituents which go in place of the OH and =O are not H or C (may also be O,
|
697
|
+
# thus including acids and esters)
|
698
|
+
|
699
|
+
Phosphonic_acid: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
|
700
|
+
# includes anions
|
701
|
+
|
702
|
+
Phosphonic_monoester: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
|
703
|
+
|
704
|
+
Phosphonic_diester: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
|
705
|
+
|
706
|
+
Phosphonic_monoamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
707
|
+
|
708
|
+
Phosphonic_diamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
709
|
+
|
710
|
+
Phosphonic_esteramide: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
711
|
+
|
712
|
+
Phosphonic_acid_derivative: [PX4;$([H1]),$([H0][#6])](=[!#6])([!#6])[!#6]
|
713
|
+
# all of the above and much more
|
714
|
+
|
715
|
+
|
716
|
+
Phosphoric_acid: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
|
717
|
+
# includes anions
|
718
|
+
|
719
|
+
Phosphoric_monoester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
|
720
|
+
|
721
|
+
Phosphoric_diester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
|
722
|
+
|
723
|
+
Phosphoric_triester: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
|
724
|
+
|
725
|
+
Phosphoric_monoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
726
|
+
|
727
|
+
Phosphoric_diamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
728
|
+
|
729
|
+
Phosphoric_triamide: [PX4D4](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
730
|
+
|
731
|
+
Phosphoric_monoestermonoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
732
|
+
|
733
|
+
Phosphoric_diestermonoamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
734
|
+
|
735
|
+
Phosphoric_monoesterdiamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
736
|
+
|
737
|
+
Phosphoric_acid_derivative: [PX4D4](=[!#6])([!#6])([!#6])[!#6]
|
738
|
+
|
739
|
+
|
740
|
+
Phosphinic_acid: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[$([OX2H]),$([OX1-])]
|
741
|
+
|
742
|
+
Phosphinic_ester: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])]
|
743
|
+
|
744
|
+
Phosphinic_amide: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
745
|
+
|
746
|
+
Phosphinic_acid_derivative: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[!#6])[!#6]
|
747
|
+
|
748
|
+
|
749
|
+
Phosphonous_acid: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
|
750
|
+
|
751
|
+
Phosphonous_monoester: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
|
752
|
+
|
753
|
+
Phosphonous_diester: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
|
754
|
+
|
755
|
+
Phosphonous_monoamide: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
756
|
+
|
757
|
+
Phosphonous_diamide: [PX3;$([H1]),$([H0][#6])]([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
758
|
+
|
759
|
+
Phosphonous_esteramide: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
760
|
+
|
761
|
+
Phosphonous_derivatives: [PX3;$([D2]),$([D3][#6])]([!#6])[!#6]
|
762
|
+
|
763
|
+
|
764
|
+
Phosphinous_acid: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][$([OX2H]),$([OX1-])]
|
765
|
+
|
766
|
+
Phosphinous_ester: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][OX2][#6;!$(C=[O,N,S])]
|
767
|
+
|
768
|
+
Phosphinous_amide: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
|
769
|
+
|
770
|
+
Phosphinous_derivatives: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][!#6]
|
771
|
+
|
772
|
+
|
773
|
+
# II.4 Silicon
|
774
|
+
# ------------
|
775
|
+
|
776
|
+
Quart_silane: [SiX4]([#6])([#6])([#6])[#6]
|
777
|
+
# four C-substituents. non-reactive, non-toxic, in experimental phase for drug development
|
778
|
+
|
779
|
+
Non-quart_silane: [SiX4;$([H1]([#6])([#6])[#6]),$([H2]([#6])[#6]),$([H3][#6]),$([H4])]
|
780
|
+
# has 1-4 hydride(s), reactive. Daylight's depictmatch does not add hydrogens automatically to
|
781
|
+
# the free positions at Si, thus Hs had to be added implicitly
|
782
|
+
|
783
|
+
Silylmonohalide: [SiX4]([FX1,ClX1,BrX1,IX1])([#6])([#6])[#6]
|
784
|
+
# reagents for inserting protection groups
|
785
|
+
|
786
|
+
Het_trialkylsilane: [SiX4]([!#6])([#6])([#6])[#6]
|
787
|
+
# mostly acid-labile protection groups such as trimethylsilyl-ethers
|
788
|
+
|
789
|
+
Dihet_dialkylsilane: [SiX4]([!#6])([!#6])([#6])[#6]
|
790
|
+
|
791
|
+
Trihet_alkylsilane: [SiX4]([!#6])([!#6])([!#6])[#6]
|
792
|
+
|
793
|
+
Silicic_acid_derivative: [SiX4]([!#6])([!#6])([!#6])[!#6]
|
794
|
+
# four substituents which are neither C nor H
|
795
|
+
|
796
|
+
|
797
|
+
# II.5 Boron
|
798
|
+
# ----------
|
799
|
+
|
800
|
+
Trialkylborane: [BX3]([#6])([#6])[#6]
|
801
|
+
# also carbonyls allowed
|
802
|
+
|
803
|
+
Boric_acid_derivatives: [BX3]([!#6])([!#6])[!#6]
|
804
|
+
# includes acids, esters, amides, ... H-substituent at B is very rare.
|
805
|
+
|
806
|
+
Boronic_acid_derivative: [BX3]([!#6])([!#6])[!#6]
|
807
|
+
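# includes acids, esters, amides, ... NOTE(review): this SMARTS is identical to Boric_acid_derivatives; a boronic derivative presumably needs one C substituent, i.e. [BX3]([#6])([!#6])[!#6] - confirm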
|
808
|
+
|
809
|
+
Borohydride: [BH1,BH2,BH3,BH4]
|
810
|
+
# at least one H attached to B
|
811
|
+
|
812
|
+
Quaternary_boron: [BX4]
|
813
|
+
# mostly borates (negative charge), in complex with Lewis-base
|
814
|
+
|
815
|
+
|
816
|
+
|
817
|
+
# Part III: Some Special Patterns
|
818
|
+
# ===============================
|
819
|
+
|
820
|
+
|
821
|
+
# III.1 Chains
|
822
|
+
# ------------
|
823
|
+
|
824
|
+
# some simple chains
|
825
|
+
|
826
|
+
|
827
|
+
|
828
|
+
# III.2 Rings
|
829
|
+
# -----------
|
830
|
+
|
831
|
+
Aromatic: a
|
832
|
+
|
833
|
+
Heterocyclic: [!#6;!R0]
|
834
|
+
# may be aromatic or not
|
835
|
+
|
836
|
+
Epoxide: [OX2r3]1[#6r3][#6r3]1
|
837
|
+
# toxic/reactive. may be annelated to aromat, but must not be aromatic itself (oxirane-2,3-dione)
|
838
|
+
|
839
|
+
NH_aziridine: [NX3H1r3]1[#6r3][#6r3]1
|
840
|
+
# toxic/reactive according to Maybridge's garbage filter
|
841
|
+
|
842
|
+
Spiro: [D4R;$(*(@*)(@*)(@*)@*)]
|
843
|
+
# at least two different rings can be found which are sharing just one atom.
|
844
|
+
# these two rings can be connected by a third ring, so it matches also some
|
845
|
+
# bridged systems, like morphine
|
846
|
+
|
847
|
+
Annelated_rings: [R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]
|
848
|
+
# two different rings sharing exactly two atoms
|
849
|
+
|
850
|
+
Bridged_rings: [R;$(*(@*)(@*)@*);!$([D4R;$(*(@*)(@*)(@*)@*)]);!$([R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])])]
|
851
|
+
# part of two or more rings, not spiro, not annelated -> finds bridgehead atoms,
|
852
|
+
# but only if they are not annelated at the same time - otherwise impossible (?)
|
853
|
+
# to distinguish from non-bridgehead annelated atoms
|
854
|
+
|
855
|
+
# some basic ring-patterns (just size, no other information):
|
856
|
+
|
857
|
+
|
858
|
+
|
859
|
+
|
860
|
+
|
861
|
+
# III.3 Sugars and Nucleosides/Nucleotides, Steroids
|
862
|
+
# --------------------------------------------------
|
863
|
+
|
864
|
+
# because of the large variety of sugar derivatives, different patterns can be applied.
|
865
|
+
# The choice of patterns and their combinations will depend on the contents of the database
|
866
|
+
# e.g. natural products, nucleoside analogues with modified sugars, ... as well as on the
|
867
|
+
# desired restriction
|
868
|
+
|
869
|
+
|
870
|
+
Sugar_pattern_1: [OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)]
|
871
|
+
# 5 or 6-membered ring containing one O and at least one (r5) or two (r6) oxygen-substituents.
|
872
|
+
|
873
|
+
Sugar_pattern_2: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]
|
874
|
+
# 5 or 6-membered ring containing one O and an acetal-like bond at position 2.
|
875
|
+
|
876
|
+
Sugar_pattern_combi: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C(O)@C1)]
|
877
|
+
# combination of the two above
|
878
|
+
|
879
|
+
Sugar_pattern_2_reducing: [OX2;$([r5]1@C(!@[OX2H1])@C@C@C1),$([r6]1@C(!@[OX2H1])@C@C@C@C1)]
|
880
|
+
# 5 or 6-membered cyclic hemi-acetal
|
881
|
+
|
882
|
+
Sugar_pattern_2_alpha: [OX2;$([r5]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]
|
883
|
+
# 5 or 6-membered cyclic hemi-acetal
|
884
|
+
|
885
|
+
Sugar_pattern_2_beta: [OX2;$([r5]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]
|
886
|
+
# 5 or 6-membered cyclic hemi-acetal
|
887
|
+
|
888
|
+
##Poly_sugar_1: ([OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)].[OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)])
|
889
|
+
# pattern1 occurs more than once (in same molecule, but moieties don't have to be adjacent!)
|
890
|
+
|
891
|
+
##Poly_sugar_2: ([OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)].[OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)])
|
892
|
+
# pattern2 occurs more than once (in same molecule, but moieties don't have to be adjacent!)
|
893
|
+
|
894
|
+
|
895
|
+
# III.4 Everything else...
|
896
|
+
# ------------------------
|
897
|
+
|
898
|
+
Conjugated_double_bond: *=*[*]=,#,:[*]
|
899
|
+
|
900
|
+
Conjugated_tripple_bond: *#*[*]=,#,:[*]
|
901
|
+
|
902
|
+
Cis_double_bond: */[D2]=[D2]\*
|
903
|
+
# only one single-bonded substituent on each DB-atom. no aromats.
|
904
|
+
# only found when character of DB is explicitly stated.
|
905
|
+
|
906
|
+
Trans_double_bond: */[D2]=[D2]/*
|
907
|
+
# analog
|
908
|
+
|
909
|
+
Mixed_anhydrides: [$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))][#8X2][$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))]
|
910
|
+
# should hit all combinations of two acids
|
911
|
+
|
912
|
+
Halogen_on_hetero: [FX1,ClX1,BrX1,IX1][!#6]
|
913
|
+
|
914
|
+
Halogen_multi_subst: [F,Cl,Br,I;!$([X1]);!$([X0-])]
|
915
|
+
# Halogen which is not mono-substituted nor an anion, e.g. chlorate.
|
916
|
+
# Most of these cases should be also filtered by Halogen_on_hetero.
|
917
|
+
|
918
|
+
Trifluoromethyl: [FX1][CX4;!$([H0][Cl,Br,I]);!$([F][C]([F])([F])[F])]([FX1])([FX1])
|
919
|
+
# C with three F attached, connected to anything which is not another halogen
|
920
|
+
|
921
|
+
C_ONS_bond: [#6]~[#7,#8,#16]
|
922
|
+
# probably all drug-like molecules have at least one O, N, or S connected to a C -> nice filter
|
923
|
+
|
924
|
+
## Mixture: (*).(*)
|
925
|
+
# two or more separate parts, may also be salt
|
926
|
+
# component-level grouping is not yet supported in Open Babel Version 2.0
|
927
|
+
|
928
|
+
|
929
|
+
Charged: [!+0]
|
930
|
+
|
931
|
+
Anion: [-1,-2,-3,-4,-5,-6,-7]
|
932
|
+
|
933
|
+
Kation: [+1,+2,+3,+4,+5,+6,+7]
|
934
|
+
|
935
|
+
Salt: ([-1,-2,-3,-4,-5,-6,-7]).([+1,+2,+3,+4,+5,+6,+7])
|
936
|
+
# two or more separate components with opposite charges
|
937
|
+
|
938
|
+
##Zwitterion: ([-1,-2,-3,-4,-5,-6,-7].[+1,+2,+3,+4,+5,+6,+7])
|
939
|
+
# both negative and positive charges somewhere within the same molecule.
|
940
|
+
|
941
|
+
1,3-Tautomerizable: [$([#7X2,OX1,SX1]=*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=*),$([#7X3,OX2,SX2;!H0]*:n)]
|
942
|
+
# 1,3 migration of H allowed. Includes keto/enol and amide/enamide.
|
943
|
+
# Aromatic rings must stay aromatic - no keto form of phenol
|
944
|
+
|
945
|
+
1,5-Tautomerizable: [$([#7X2,OX1,SX1]=,:**=,:*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=**=*),$([#7X3,OX2,SX2;!H0]*=,:**:n)]
|
946
|
+
|
947
|
+
Rotatable_bond: [!$(*#*)&!D1]-!@[!$(*#*)&!D1]
|
948
|
+
# taken from http://www.daylight.com/support/contrib/smarts/content.html
|
949
|
+
|
950
|
+
Michael_acceptor: [CX3]=[CX3][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-])]
|
951
|
+
# the classical case: C=C near carbonyl, nitrile, nitro, or similar
|
952
|
+
# Oxo-heteroaromats and similar are not included.
|
953
|
+
|
954
|
+
Dicarbodiazene: [CX3](=[OX1])[NX2]=[NX2][CX3](=[OX1])
|
955
|
+
# Michael-like acceptor, see Mitsunobu reaction
|
956
|
+
|
957
|
+
# H-Bond_donor:
|
958
|
+
|
959
|
+
# H-Bond_acceptor:
|
960
|
+
|
961
|
+
# Pos_ionizable:
|
962
|
+
|
963
|
+
# Neg_ionizable:
|
964
|
+
|
965
|
+
# Unlikely_ions:
|
966
|
+
# O+,N-,C+,C-, ...
|
967
|
+
|
968
|
+
CH-acidic: [$([CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])]),$([CX4;!$([H0])]1[CX3]=[CX3][CX3]=[CX3]1)]
|
969
|
+
# C-H alpha to carbonyl, nitro or similar, C is not double-bonded, only C, H, S,P=O and nitro substituents allowed.
|
970
|
+
# pentadiene is included. acids, their salts, prim./sec. amides, and imides are excluded.
|
971
|
+
# hits also CH-acidic_strong
|
972
|
+
|
973
|
+
CH-acidic_strong: [CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])]([$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])])[$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])]
|
974
|
+
# same as above (without pentadiene), but carbonyl or similar on two or three sides
|
975
|
+
|
976
|
+
Chiral_center_specified: [$([*@](~*)(~*)(*)*),$([*@H](*)(*)*),$([*@](~*)(*)*),$([*@H](~*)~*)]
|
977
|
+
# Hits atoms with tetrahedral chirality, if chiral center is specified in the SMILES string
|
978
|
+
# depictmatch does not find oxonium, sulfonium, or sulfoxides!
|
979
|
+
|
980
|
+
# Chiral_center_unspecified: [$([*@?](~*)(~*)(*)*),$([*@?H](*)(*)*),$([*@?](~*)(*)*),$([*@?H](~*)~*)]
|
981
|
+
# Hits atoms with tetrahedral chirality, if chiral center is not specified in the SMILES string
|
982
|
+
# "@?" (unspecified chirality) is not yet supported in Open Babel Version 2.0
|
983
|
+
|