coho 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 78fcbe27ff1817e332322d5d1110037c6747aa969377857b26d5e049068988f9
4
+ data.tar.gz: 8d8a2215c7ff67b371c3d4028e8e28d3b1748be9a070d2bba0d0531739ca3af1
5
+ SHA512:
6
+ metadata.gz: 61ce89d6eb31c7260c8d40541c23f37085f658eb770e7e9571a8609a8d381dbb44400a786b5256ae02f01945779633cb18c9dd35295822518a8c24241b4e118d
7
+ data.tar.gz: a0e8d021ea0c1f9d3b93361beb5424fa151d9c80d604b148da42c942e2b001d906ce6c8cf666786c8abc5c859d0c6434892cb27c6c8eed3ff4239df66010b16d
@@ -0,0 +1,75 @@
1
+ #include <ruby.h>
2
+ #include <stdio.h>
3
+ #include "coho.h"
4
+
5
+ #include "extconf.h"
6
+
7
+ static VALUE coho_rb_mod;
8
+ static VALUE coho_rb_smiles_mod;
9
+ static VALUE coho_rb_smiles_atom_class;
10
+
11
+
12
+ static VALUE
13
+ get_int(int n)
14
+ {
15
+ return n == -1 ? Qnil : INT2NUM(n);
16
+ }
17
+
18
+
19
+ VALUE
20
+ smiles_parser_parse(VALUE self, VALUE smiles_str)
21
+ {
22
+ struct coho_smiles smiles;
23
+ struct coho_smiles_atom *a;
24
+ int i;
25
+ VALUE atom_class, atoms, atom;
26
+
27
+ coho_smiles_init(&smiles);
28
+ coho_smiles_parse(&smiles,
29
+ RSTRING_PTR(smiles_str),
30
+ RSTRING_LEN(smiles_str));
31
+
32
+ atom_class = rb_path2class("Coho::Smiles::Atom");
33
+ atoms = rb_ary_new();
34
+
35
+ for (i = 0; i < smiles.atom_count; i++) {
36
+ a = &smiles.atoms[i];
37
+ atom = rb_funcall(atom_class, rb_intern("new"), 0);
38
+ rb_iv_set(atom, "@atomic_number", get_int(a->atomic_number));
39
+ rb_iv_set(atom, "@symbol", rb_str_new2(a->symbol));
40
+ rb_iv_set(atom, "@isotope", get_int(a->isotope));
41
+ rb_iv_set(atom, "@charge", INT2NUM(a->charge));
42
+ rb_iv_set(atom, "@hydrogen_count", get_int(a->hydrogen_count));
43
+ rb_iv_set(atom,
44
+ "@implicit_hydrogen_count",
45
+ get_int(a->implicit_hydrogen_count));
46
+
47
+ rb_ary_push(atoms, atom);
48
+ }
49
+
50
+ coho_smiles_free(&smiles);
51
+
52
+ return atoms;
53
+ }
54
+
55
+ static void
56
+ init_smiles(VALUE coho_mod)
57
+ {
58
+ VALUE mod;
59
+ VALUE parser_class;
60
+
61
+ mod = rb_define_module_under(coho_mod, "Smiles");
62
+ parser_class = rb_define_class_under(mod, "Parser", rb_cObject);
63
+
64
+ rb_define_method(parser_class, "parse", smiles_parser_parse, 1);
65
+ }
66
+
67
+
68
+ void
69
+ Init_coho()
70
+ {
71
+ VALUE mod;
72
+
73
+ mod = rb_define_module("Coho");
74
+ init_smiles(mod);
75
+ }
@@ -0,0 +1,13 @@
1
# Build configuration for the coho C extension.
require "mkmf"

# Let the compiler find headers that live under src/.
$INCFLAGS += " -Isrc"

# Object files that make up the extension.
$objs = %w[
  coho.o
  src/compat.o
  src/smiles.o
]

create_header
create_makefile "coho/coho"
@@ -0,0 +1,111 @@
1
+ /*
2
+ * Copyright (c) 2017-2019 Ben Cornett <ben@lantern.is>
3
+ *
4
+ * Permission to use, copy, modify, and distribute this software for any
5
+ * purpose with or without fee is hereby granted, provided that the above
6
+ * copyright notice and this permission notice appear in all copies.
7
+ *
8
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
+ */
16
+
17
#ifndef COHO_H
#define COHO_H

#include <stddef.h>	/* size_t */

/* Status codes returned by coho_smiles_parse(). */
enum coho_status {
	COHO_OK,
	COHO_ERROR,
	COHO_NOMEM,
};

/* Compatibility functions {{{
 */

#undef strlcpy
size_t strlcpy(char *, const char *, size_t);

#undef reallocarray
void *reallocarray(void *, size_t, size_t);

/* }}} */

/* SMILES parsing {{{
 */

/* Bond orders stored in coho_smiles_bond.order. */
enum {
	COHO_SMILES_BOND_UNSPECIFIED	= 0,
	COHO_SMILES_BOND_SINGLE		= 1,
	COHO_SMILES_BOND_DOUBLE		= 2,
	COHO_SMILES_BOND_TRIPLE		= 3,
	COHO_SMILES_BOND_QUAD		= 4,
	COHO_SMILES_BOND_AROMATIC	= 5,
};

/* Bond stereo markers stored in coho_smiles_bond.stereo. */
enum {
	COHO_SMILES_BOND_STEREO_UNSPECIFIED,
	COHO_SMILES_BOND_STEREO_UP,
	COHO_SMILES_BOND_STEREO_DOWN,
};

/*
 * One parsed atom.
 * position/length locate the atom's text within the SMILES string.
 */
struct coho_smiles_atom {
	int	atomic_number;
	char	symbol[4];		/* NUL-terminated element symbol */
	int	isotope;
	int	charge;
	int	hydrogen_count;
	int	implicit_hydrogen_count;
	int	is_bracket;		/* written in [...] form */
	int	is_organic;		/* written in organic-subset shorthand */
	int	is_aromatic;
	char	chirality[8];		/* chirality token text, if any */
	int	atom_class;
	int	position;
	int	length;
};

/*
 * One bond between atoms[atom0] and atoms[atom1].
 * An atom0 of -1 marks a bond that is not (yet) attached to an atom.
 */
struct coho_smiles_bond {
	int	atom0;
	int	atom1;
	int	order;			/* COHO_SMILES_BOND_* */
	int	stereo;			/* COHO_SMILES_BOND_STEREO_* */
	int	is_implicit;		/* no bond symbol was written */
	int	is_ring;		/* created by a ring-closure digit */
	int	position;
	int	length;
};

/* Saved state for one open parenthesis (branch). */
struct coho_smiles_paren {
	int	position;
	struct	coho_smiles_bond bond;
};

/*
 * Parser state and results.
 * Initialize with coho_smiles_init(), fill with coho_smiles_parse(),
 * release with coho_smiles_free().
 */
struct coho_smiles {
	const char	*smiles;	/* text being parsed (not owned) */
	int		 position;	/* current read offset */
	int		 end;		/* offset one past the last byte */
	char		 error[32];	/* message for the last error */
	int		 error_position;	/* -1 when unset */

	int		 atom_count;
	int		 bond_count;

	struct coho_smiles_atom	*atoms;
	size_t		 atoms_capacity;

	struct coho_smiles_bond	*bonds;
	size_t		 bonds_capacity;

	/* Open ring closures, indexed by ring number. */
	struct coho_smiles_bond	 ring_bonds[100];
	size_t		 open_ring_closures;

	struct coho_smiles_paren *paren_stack;
	int		 paren_stack_count;
	size_t		 paren_stack_capacity;
};

void	coho_smiles_free(struct coho_smiles *);
void	coho_smiles_init(struct coho_smiles *);
int	coho_smiles_parse(struct coho_smiles *, const char *, size_t);

/* }}} */

#endif /* COHO_H */
@@ -0,0 +1,91 @@
1
+ /* $OpenBSD: reallocarray.c,v 1.3 2015/09/13 08:31:47 guenther Exp $ */
2
+ /*
3
+ * Copyright (c) 2008 Otto Moerbeek <otto@drijf.net>
4
+ *
5
+ * Permission to use, copy, modify, and distribute this software for any
6
+ * purpose with or without fee is hereby granted, provided that the above
7
+ * copyright notice and this permission notice appear in all copies.
8
+ *
9
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
+ */
17
+
18
+ #include <sys/types.h>
19
+ #include <errno.h>
20
+ #include <stdint.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+
24
+ #include "coho.h"
25
+
26
/*
 * sqrt(SIZE_MAX+1).  When both factors are below this bound their
 * product cannot exceed SIZE_MAX, so the division test can be skipped.
 */
#define MUL_NO_OVERFLOW	((size_t)1 << (sizeof(size_t) * 4))

/*
 * realloc() for an array of nmemb elements of size bytes each, with
 * multiplication overflow detection.
 * On overflow, sets errno to ENOMEM and returns NULL without touching optr.
 * Otherwise returns whatever realloc(optr, nmemb * size) returns.
 */
void *
reallocarray(void *optr, size_t nmemb, size_t size)
{
	int may_overflow;

	may_overflow = nmemb >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW;

	if (may_overflow && nmemb > 0 && size > SIZE_MAX / nmemb) {
		errno = ENOMEM;
		return NULL;
	}

	return realloc(optr, nmemb * size);
}
42
+
43
+
44
+
45
+ /* $OpenBSD: strlcpy.c,v 1.15 2016/10/16 17:37:39 dtucker Exp $ */
46
+
47
+ /*
48
+ * Copyright (c) 1998, 2015 Todd C. Miller <Todd.Miller@courtesan.com>
49
+ *
50
+ * Permission to use, copy, modify, and distribute this software for any
51
+ * purpose with or without fee is hereby granted, provided that the above
52
+ * copyright notice and this permission notice appear in all copies.
53
+ *
54
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
55
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
56
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
57
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
58
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
59
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
60
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
61
+ */
62
+
63
/*
 * Copies the string src into dst, a buffer of dsize bytes.
 * Writes at most dsize-1 characters followed by a NUL; when dsize is 0
 * nothing is written at all.
 * Returns strlen(src); a result >= dsize means the copy was truncated.
 */
size_t
strlcpy(char *dst, const char *src, size_t dsize)
{
	size_t n = 0;

	if (dsize != 0) {
		/* Copy while there is still room for the terminator. */
		while (n + 1 < dsize) {
			char c = src[n];

			dst[n] = c;
			if (c == '\0')
				return n;	/* whole string fit */
			n++;
		}
		dst[n] = '\0';			/* truncated: terminate dst */
	}

	/* Walk the remainder of src to compute its full length. */
	while (src[n] != '\0')
		n++;

	return n;
}
@@ -0,0 +1,2205 @@
1
+ /*
2
+ * Copyright (c) 2017-2019 Ben Cornett <ben@lantern.is>
3
+ *
4
+ * Permission to use, copy, modify, and distribute this software for any
5
+ * purpose with or without fee is hereby granted, provided that the above
6
+ * copyright notice and this permission notice appear in all copies.
7
+ *
8
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
+ */
16
+
17
+ /*
18
+ * Parses SMILES as specified by the OpenSMILES standard.
19
+ */
20
+
21
+ #include <assert.h>
22
+ #include <limits.h>
23
+ #include <stdlib.h>
24
+ #include <string.h>
25
+
26
+ #include "coho.h"
27
+
28
/*
 * Token type flags.
 * Each type owns a distinct bit so that several types can be OR'ed
 * together into a mask (for example PLUS | MINUS).
 */
#define ALIPHATIC_ORGANIC	(1 << 0)
#define AROMATIC		(1 << 1)
#define AROMATIC_ORGANIC	(1 << 2)
#define BOND			(1 << 3)
#define BRACKET_CLOSE		(1 << 4)
#define BRACKET_OPEN		(1 << 5)
#define CHIRALITY		(1 << 6)
#define COLON			(1 << 7)
#define DIGIT			(1 << 8)
#define DOT			(1 << 9)
#define ELEMENT			(1 << 10)
#define HYDROGEN		(1 << 11)
#define MINUS			(1 << 12)
#define PAREN_CLOSE		(1 << 13)
#define PAREN_OPEN		(1 << 14)
#define PERCENT			(1 << 15)
#define PLUS			(1 << 16)
#define WILDCARD		(1 << 17)

/*
 * A single lexical token.
 */
struct token {
	int		 type;		/* one of the flags above */
	int		 position;	/* offset of the token in the input */
	const char	*s;		/* token text (presumably points into the input) */
	size_t		 n;		/* length of the token in bytes */
	int		 intval;	/* type-specific value: atomic number,
					 * bond order, charge sign, ... */
	int		 flags;		/* type-specific extra, e.g. bond stereo */
};
55
+
56
+ static int atom_class(struct coho_smiles *, struct coho_smiles_atom *);
57
+ static int add_atom(struct coho_smiles *, struct coho_smiles_atom *);
58
+ static int add_bond(struct coho_smiles *, struct coho_smiles_bond *);
59
+ static int add_ringbond(struct coho_smiles *, int, struct coho_smiles_bond *);
60
+ static int aliphatic_organic(struct coho_smiles *, struct coho_smiles_atom *);
61
+ static int aromatic_organic(struct coho_smiles *, struct coho_smiles_atom *);
62
+ static int assign_implicit_hydrogen_count(struct coho_smiles *);
63
+ static int atom(struct coho_smiles *, int *);
64
+ static int atom_ringbond(struct coho_smiles *, int *);
65
+ static int atom_valence(struct coho_smiles *, size_t);
66
+ static int bond(struct coho_smiles *, struct coho_smiles_bond *b);
67
+ static int bracket_atom(struct coho_smiles *, struct coho_smiles_atom *);
68
+ static int charge(struct coho_smiles *, struct coho_smiles_atom *);
69
+ static int check_ring_closures(struct coho_smiles *);
70
+ static int chirality(struct coho_smiles *, struct coho_smiles_atom *);
71
+ static int close_paren(struct coho_smiles *, struct coho_smiles_bond *);
72
+ static int dot(struct coho_smiles *);
73
+ static int ensure_array_capacities(struct coho_smiles *, size_t);
74
+ static void finalize_implicit_bond_order(struct coho_smiles *,
75
+ struct coho_smiles_bond *);
76
+ static int hydrogen_count(struct coho_smiles *, struct coho_smiles_atom *);
77
+ static int integer(struct coho_smiles *, size_t, int *);
78
+ static int isotope(struct coho_smiles *, struct coho_smiles_atom *);
79
+ static unsigned int lex(struct coho_smiles *, struct token *, int);
80
+ static int match(struct coho_smiles *, struct token *, int, unsigned int);
81
+ static size_t next_array_capacity(size_t);
82
+ static int open_paren(struct coho_smiles *, struct coho_smiles_bond *);
83
+ static int pop_paren_stack(struct coho_smiles *, int, struct coho_smiles_bond *);
84
+ static void push_paren_stack(struct coho_smiles *, int, struct coho_smiles_bond *);
85
+ static int ringbond(struct coho_smiles *, int);
86
+ static int round_valence(int, int, int);
87
+ static void coho_smiles_atom_init(struct coho_smiles_atom *);
88
+ static void coho_smiles_bond_init(struct coho_smiles_bond *);
89
+ static void coho_smiles_reinit(struct coho_smiles *, const char *, size_t);
90
+ static int symbol(struct coho_smiles *, struct coho_smiles_atom *);
91
+ static void tokcpy(char *, struct token *, size_t);
92
+ static int wildcard(struct coho_smiles *, struct coho_smiles_atom *);
93
+
94
/*
 * Standard valences of the organic-subset elements.
 * Each row is: <atomic number>, then up to three valences.
 * Unused valence slots hold -1, and a row of all -1 ends the table.
 */
static int standard_valences[][4] = {
	/*  Z   valences    */
	{   5,  3, -1, -1 },	/* B  */
	{   6,  4, -1, -1 },	/* C  */
	{   7,  3,  5, -1 },	/* N  */
	{   8,  2, -1, -1 },	/* O  */
	{   9,  1, -1, -1 },	/* F  */
	{  15,  3,  5, -1 },	/* P  */
	{  16,  2,  4,  6 },	/* S  */
	{  17,  1, -1, -1 },	/* Cl */
	{  35,  1, -1, -1 },	/* Br */
	{  53,  1, -1, -1 },	/* I  */
	{  -1, -1, -1, -1 },	/* end of table */
};
111
+
112
+ void
113
+ coho_smiles_free(struct coho_smiles *x)
114
+ {
115
+ free(x->atoms);
116
+ free(x->bonds);
117
+ free(x->paren_stack);
118
+ }
119
+
120
+ void
121
+ coho_smiles_init(struct coho_smiles *x)
122
+ {
123
+ size_t i;
124
+
125
+ x->smiles = NULL;
126
+ x->position = 0;
127
+ x->end = 0;
128
+ x->error[0] = '\0';
129
+ x->error_position = -1;
130
+
131
+ x->atom_count = 0;
132
+ x->bond_count = 0;
133
+
134
+ x->atoms = NULL;
135
+ x->atoms_capacity = 0;
136
+ x->bonds = NULL;
137
+ x->bonds_capacity = 0;
138
+ x->paren_stack = NULL;
139
+ x->paren_stack_capacity = 0;
140
+
141
+ for (i = 0; i < 100; i++)
142
+ coho_smiles_bond_init(&x->ring_bonds[i]);
143
+ x->open_ring_closures = 0;
144
+ }
145
+
146
+ int
147
+ coho_smiles_parse(struct coho_smiles *x, const char *smiles, size_t sz)
148
+ {
149
+ struct coho_smiles_bond b;
150
+ int anum; /* index of last atom read */
151
+ int eos; /* end-of-string flag */
152
+ int rc;
153
+ size_t end;
154
+
155
+ enum {
156
+ INIT,
157
+ ATOM_READ,
158
+ BOND_READ,
159
+ DOT_READ,
160
+ OPEN_PAREN_READ,
161
+ CLOSE_PAREN_READ,
162
+ } state;
163
+
164
+ end = sz ? sz : strlen(smiles);
165
+ if (sz > INT_MAX) {
166
+ strlcpy(x->error, "SMILES too long", sizeof(x->error));
167
+ return COHO_NOMEM;
168
+ }
169
+ coho_smiles_reinit(x, smiles, end);
170
+
171
+ if (ensure_array_capacities(x, end)) {
172
+ return COHO_NOMEM;
173
+ }
174
+
175
+ b.atom0 = -1; /* no previous atom to bond to */
176
+ anum = -1;
177
+ state = INIT;
178
+
179
+ for (;;) {
180
+ eos = x->position == x->end;
181
+
182
+ switch (state) {
183
+
184
+ /*
185
+ * Parsing has just begun.
186
+ */
187
+ case INIT:
188
+ if (eos) {
189
+ strlcpy(x->error,
190
+ "empty SMILES",
191
+ sizeof(x->error));
192
+ goto err;
193
+ }
194
+
195
+ else if ((rc = atom_ringbond(x, &anum))) {
196
+ if (rc == -1)
197
+ goto err;
198
+ }
199
+
200
+ else {
201
+ strlcpy(x->error,
202
+ "atom expected",
203
+ sizeof(x->error));
204
+ goto err;
205
+ }
206
+ state = ATOM_READ;
207
+ break;
208
+
209
+ /*
210
+ * An atom has just been read.
211
+ */
212
+ case ATOM_READ:
213
+ /*
214
+ * If there is an open bond to the previous
215
+ * atom, complete it.
216
+ */
217
+ if (b.atom0 != -1) {
218
+ b.atom1 = anum;
219
+
220
+ finalize_implicit_bond_order(x, &b);
221
+
222
+ if (add_bond(x, &b) == -1)
223
+ goto err;
224
+ }
225
+
226
+ /*
227
+ * The atom just read may be bonded to
228
+ * subsequent atoms.
229
+ * Store this state in an incomplete bond.
230
+ */
231
+ coho_smiles_bond_init(&b);
232
+ b.atom0 = anum;
233
+ b.is_implicit = 1;
234
+
235
+ if (eos) {
236
+ goto done;
237
+ }
238
+
239
+ else if ((rc = atom_ringbond(x, &anum))) {
240
+ if (rc == -1)
241
+ goto err;
242
+ }
243
+
244
+ else if ((rc = bond(x, &b))) {
245
+ if (rc == -1)
246
+ goto err;
247
+ state = BOND_READ;
248
+ }
249
+
250
+ else if (dot(x)) {
251
+ state = DOT_READ;
252
+ }
253
+
254
+ else if (open_paren(x, &b)) {
255
+ state = OPEN_PAREN_READ;
256
+ }
257
+
258
+ else if ((rc = close_paren(x, &b))) {
259
+ if (rc == -1)
260
+ goto err;
261
+ state = CLOSE_PAREN_READ;
262
+ }
263
+
264
+ else {
265
+ goto unexpected;
266
+ }
267
+
268
+ break;
269
+
270
+ /*
271
+ * A dot (.) has just been read.
272
+ * An atom is expected.
273
+ * If there is a bond to a previous atom awaiting
274
+ * completion, it must be cancelled.
275
+ */
276
+ case DOT_READ:
277
+ /* Invalidate open bond to previous atom. */
278
+ b.atom0 = -1;
279
+
280
+ if ((rc = atom_ringbond(x, &anum))) {
281
+ if (rc == -1)
282
+ goto err;
283
+ } else {
284
+ strlcpy(x->error,
285
+ "atom must follow dot",
286
+ sizeof(x->error));
287
+ goto err;
288
+ }
289
+ state = ATOM_READ;
290
+ break;
291
+
292
+ /*
293
+ * A bond (-, =, #, etc) has just been read.
294
+ * An atom is expected.
295
+ */
296
+ case BOND_READ:
297
+
298
+ if ((rc = atom_ringbond(x, &anum))) {
299
+ if (rc == -1)
300
+ goto err;
301
+ } else {
302
+ strlcpy(x->error,
303
+ "atom must follow bond",
304
+ sizeof(x->error));
305
+ goto err;
306
+ }
307
+ state = ATOM_READ;
308
+ break;
309
+
310
+ /*
311
+ * An opening parenthesis has just been read
312
+ * and the parenthesis stack pushed.
313
+ */
314
+ case OPEN_PAREN_READ:
315
+
316
+ if (eos) {
317
+ strlcpy(x->error,
318
+ "unbalanced parenthesis",
319
+ sizeof(x->error));
320
+ x->error_position = x->position - 1;
321
+ goto err;
322
+ }
323
+
324
+ else if ((rc = atom_ringbond(x, &anum))) {
325
+ if (rc == -1)
326
+ goto err;
327
+ state = ATOM_READ;
328
+ }
329
+
330
+ else if ((rc = bond(x, &b))) {
331
+ if (rc == -1)
332
+ goto err;
333
+ state = BOND_READ;
334
+ }
335
+
336
+ else if (dot(x)) {
337
+ state = DOT_READ;
338
+ }
339
+
340
+ else {
341
+ strlcpy(x->error,
342
+ "atom, bond, or dot expected",
343
+ sizeof(x->error));
344
+ goto err;
345
+ }
346
+ break;
347
+
348
+ /*
349
+ * A closing parenthesis has just been read
350
+ * and the parenthesis stack popped.
351
+ */
352
+ case CLOSE_PAREN_READ:
353
+
354
+ if (eos) {
355
+ goto done;
356
+ }
357
+
358
+ else if ((rc = atom_ringbond(x, &anum))) {
359
+ if (rc == -1)
360
+ goto err;
361
+ state = ATOM_READ;
362
+ }
363
+
364
+ else if ((rc = bond(x, &b))) {
365
+ if (rc == -1)
366
+ goto err;
367
+ state = BOND_READ;
368
+ }
369
+
370
+ else if (dot(x)) {
371
+ state = DOT_READ;
372
+ }
373
+
374
+ else if (open_paren(x, &b)) {
375
+ state = OPEN_PAREN_READ;
376
+ }
377
+
378
+ else if ((rc = close_paren(x, &b))) {
379
+ if (rc == -1)
380
+ goto err;
381
+ state = CLOSE_PAREN_READ;
382
+ }
383
+
384
+ else {
385
+ goto unexpected;
386
+ }
387
+ break;
388
+ }
389
+ }
390
+
391
+ done:
392
+ assert(x->position == x->end);
393
+
394
+ if (check_ring_closures(x))
395
+ goto err;
396
+
397
+ if (x->paren_stack_count > 0) {
398
+ strlcpy(x->error, "unbalanced parenthesis", sizeof(x->error));
399
+ x->error_position = x->paren_stack[0].position;
400
+ goto err;
401
+ }
402
+
403
+ if (assign_implicit_hydrogen_count(x))
404
+ goto err;
405
+
406
+ return COHO_OK;
407
+
408
+ unexpected:
409
+ strlcpy(x->error, "unexpected character", sizeof(x->error));
410
+ err:
411
+ if (x->error_position == -1)
412
+ x->error_position = x->position;
413
+ return COHO_ERROR;
414
+ }
415
+
416
+ /*
417
+ * Parses optional atom class inside a bracket atom (ex: [C:23]).
418
+ * If successful, sets a->atom_class and increments a->length.
419
+ * Returns 1 if atom class was read, else 0.
420
+ * On error, sets x->error and returns -1.
421
+ */
422
+ static int
423
+ atom_class(struct coho_smiles *x, struct coho_smiles_atom *a)
424
+ {
425
+ struct token t;
426
+ int n;
427
+
428
+ if (!match(x, &t, 1, COLON))
429
+ return 0;
430
+
431
+ a->length += t.n;
432
+
433
+ if ((n = integer(x, 8, &a->atom_class)) == -1) {
434
+ strlcpy(x->error, "atom class too large", sizeof(x->error));
435
+ return -1;
436
+ } else if (n == 0) {
437
+ strlcpy(x->error, "atom class expected", sizeof(x->error));
438
+ return -1;
439
+ }
440
+
441
+ a->length += n;
442
+ return 1;
443
+ }
444
+
445
+ /*
446
+ * Saves a completed atom and returns its index.
447
+ */
448
+ static int
449
+ add_atom(struct coho_smiles *x, struct coho_smiles_atom *a)
450
+ {
451
+ x->atoms[x->atom_count] = *a;
452
+ return x->atom_count++;
453
+ }
454
+
455
+ /*
456
+ * Saves a new bond to the bond list and returns its index.
457
+ * Returns new length of bond list on success.
458
+ * If the bond is already in the list, sets x->error and returns -1.
459
+ * Bonds are added so that bond->atom0 < bond->atom1 and the entire bond list
460
+ * remains sorted.
461
+ */
462
+ static int
463
+ add_bond(struct coho_smiles *x, struct coho_smiles_bond *bond)
464
+ {
465
+ size_t i, move;
466
+ struct coho_smiles_bond nb, *b;
467
+
468
+ nb = *bond;
469
+
470
+ /* Flip so atom0 < atom1
471
+ */
472
+ if (bond->atom0 > bond->atom1) {
473
+ nb.atom0 = bond->atom1;
474
+ nb.atom1 = bond->atom0;
475
+
476
+ if (bond->stereo == COHO_SMILES_BOND_STEREO_UP)
477
+ nb.stereo = COHO_SMILES_BOND_STEREO_DOWN;
478
+ else if (bond->stereo == COHO_SMILES_BOND_STEREO_DOWN)
479
+ nb.stereo = COHO_SMILES_BOND_STEREO_UP;
480
+ }
481
+
482
+ /* Find position to insert and check for duplicates.
483
+ * Start search from end, since bonds are
484
+ * mostly generated in the correct order.
485
+ */
486
+ for (i = x->bond_count; i > 0; i--) {
487
+ b = &x->bonds[i-1];
488
+
489
+ if (nb.atom0 > b->atom0)
490
+ break;
491
+ else if (nb.atom0 < b->atom0)
492
+ continue;
493
+ else if (nb.atom1 > b->atom1)
494
+ break;
495
+ else if (nb.atom1 < b->atom1)
496
+ continue;
497
+ else {
498
+ strlcpy(x->error, "duplicate bond", sizeof(x->error));
499
+ x->error_position = nb.position;
500
+ return -1;
501
+ }
502
+ }
503
+
504
+ move = x->bond_count - i; /* # elements to shift */
505
+ if (move) {
506
+ memmove(x->bonds + i + 1,
507
+ x->bonds + i,
508
+ move * sizeof(x->bonds[0]));
509
+ }
510
+
511
+ x->bonds[i] = nb;
512
+ return x->bond_count++;
513
+ }
514
+
515
+ /*
516
+ * Adds a ring bond closure.
517
+ * If there is already an open ring bond using rnum,
518
+ * it is closed and the new bond is added to the bond list.
519
+ * Otherwise, a new bond is opened.
520
+ * Returns 0 on success.
521
+ * On failure, sets x->error and returns -1.
522
+ */
523
+ static int
524
+ add_ringbond(struct coho_smiles *x, int rnum, struct coho_smiles_bond *b)
525
+ {
526
+ struct coho_smiles_bond *rb;
527
+
528
+ assert(rnum < 100);
529
+
530
+ if (b->order == COHO_SMILES_BOND_UNSPECIFIED)
531
+ assert(b->stereo == COHO_SMILES_BOND_STEREO_UNSPECIFIED);
532
+
533
+ rb = &x->ring_bonds[rnum];
534
+
535
+ if (rb->atom0 == -1) {
536
+ rb->atom0 = b->atom0;
537
+ rb->order = b->order;
538
+ rb->stereo = b->stereo;
539
+ rb->is_implicit = 0;
540
+ rb->is_ring = 1;
541
+ rb->position = b->position;
542
+ rb->length = b->length;
543
+ x->open_ring_closures++;
544
+ return 0;
545
+ }
546
+
547
+ /* Close the open bond */
548
+
549
+ if (rb->atom0 == b->atom0) {
550
+ strlcpy(x->error,
551
+ "atom ring-bonded to itself",
552
+ sizeof(x->error));
553
+ x->error_position = x->atoms[b->atom0].position;
554
+ return -1;
555
+ }
556
+
557
+ if (rb->order == COHO_SMILES_BOND_UNSPECIFIED)
558
+ rb->order = b->order;
559
+ else if (b->order == COHO_SMILES_BOND_UNSPECIFIED)
560
+ ; /* pass */
561
+ else if (rb->order != b->order) {
562
+ strlcpy(x->error,
563
+ "conflicting ring bond orders",
564
+ sizeof(x->error));
565
+ x->error_position = x->atoms[b->atom0].position;
566
+ return -1;
567
+ }
568
+ if (rb->order == COHO_SMILES_BOND_UNSPECIFIED)
569
+ rb->order = COHO_SMILES_BOND_SINGLE;
570
+
571
+ rb->atom1 = b->atom0;
572
+
573
+ if (add_bond(x, rb) == -1)
574
+ return -1;
575
+
576
+ coho_smiles_bond_init(rb);
577
+ rb->atom0 = -1; ; /* mark slot open again */
578
+ x->open_ring_closures--;
579
+
580
+ return 0;
581
+ }
582
+
583
+ /*
584
+ * Matches an aliphatic organic atom (C, N, O, etc.).
585
+ * Returns 1 on match, 0 if no match, or -1 on error.
586
+ */
587
+ static int
588
+ aliphatic_organic(struct coho_smiles *x, struct coho_smiles_atom *a)
589
+ {
590
+ struct token t;
591
+
592
+ if (!match(x, &t, 0, ALIPHATIC_ORGANIC))
593
+ return 0;
594
+ coho_smiles_atom_init(a);
595
+ a->position = t.position;
596
+ a->atomic_number = t.intval;
597
+ a->is_organic = 1;
598
+ a->length = t.n;
599
+ tokcpy(a->symbol, &t, sizeof(a->symbol));
600
+ return 1;
601
+ }
602
+
603
+ /*
604
+ * Matches an aromatic organic atom (c, n, o, etc.).
605
+ * Returns 1 on match, 0 if no match, or -1 on error.
606
+ */
607
+ static int
608
+ aromatic_organic(struct coho_smiles *x, struct coho_smiles_atom *a)
609
+ {
610
+ struct token t;
611
+
612
+ if (!match(x, &t, 0, AROMATIC_ORGANIC))
613
+ return 0;
614
+ coho_smiles_atom_init(a);
615
+ a->position = t.position;
616
+ a->atomic_number = t.intval;
617
+ a->is_organic = 1;
618
+ a->is_aromatic = 1;
619
+ a->length = t.n;
620
+ tokcpy(a->symbol, &t, sizeof(a->symbol));
621
+ return 1;
622
+ }
623
+
624
+ /*
625
+ * Assigns implicit hydrogen counts for all atoms that were
626
+ * specified using the organic-subset shorthand.
627
+ */
628
+ static int
629
+ assign_implicit_hydrogen_count(struct coho_smiles *x)
630
+ {
631
+ int i, valence, std;
632
+ struct coho_smiles_atom *a;
633
+
634
+ for (i = 0; i < x->atom_count; i++) {
635
+ a = &x->atoms[i];
636
+
637
+ if (!a->is_organic)
638
+ continue;
639
+
640
+ valence = atom_valence(x, i);
641
+ std = round_valence(a->atomic_number, valence, a->is_aromatic);
642
+
643
+ if (std == -1)
644
+ a->implicit_hydrogen_count = 0;
645
+ else
646
+ a->implicit_hydrogen_count = std - valence;
647
+ }
648
+
649
+ return 0;
650
+ }
651
+
652
+ /*
653
+ * Matches an atom or returns 0 if not found.
654
+ * If successful, stores the index of the new atom in *anum and returns 1.
655
+ * On error, sets x->error and returns -1.
656
+ */
657
+ static int
658
+ atom(struct coho_smiles *x, int *atom_index)
659
+ {
660
+ struct coho_smiles_atom a;
661
+ int rc;
662
+
663
+ if ((rc = bracket_atom(x, &a)) ||
664
+ (rc = aliphatic_organic(x, &a)) ||
665
+ (rc = aromatic_organic(x, &a)) ||
666
+ (rc = wildcard(x, &a))) {
667
+ if (rc == -1)
668
+ return -1;
669
+ } else {
670
+ return 0;
671
+ }
672
+
673
+ *atom_index = add_atom(x, &a);
674
+ return 1;
675
+ }
676
+
677
/*
 * Reads one atom and then every ring bond that directly follows it.
 * On success, *anum holds the index of the new atom and 1 is returned.
 * Returns 0 if no atom is present.
 * On error, x->error has been set and -1 is returned.
 */
static int
atom_ringbond(struct coho_smiles *x, int *anum)
{
	int rc;

	rc = atom(x, anum);
	if (rc == 0)
		return 0;
	if (rc == -1)
		return -1;

	/* Consume ring bonds until none is left. */
	for (;;) {
		rc = ringbond(x, *anum);
		if (rc == 0)
			break;
		if (rc == -1)
			return -1;
	}

	return 1;
}
701
+
702
+ /*
703
+ * Computes the valence of an atom by summing the orders
704
+ * of its bonds.
705
+ * Treats aromatic atoms as a special case in an attempt to
706
+ * properly derive implicit hydrogen count.
707
+ */
708
+ static int
709
+ atom_valence(struct coho_smiles *x, size_t idx)
710
+ {
711
+ int i;
712
+ int valence, neighbors;
713
+ struct coho_smiles_bond *b;
714
+
715
+ valence = 0;
716
+ neighbors = 0;
717
+
718
+ for (i = 0; i < x->bond_count; i++) {
719
+ b = &x->bonds[i];
720
+ if (b->atom0 > (int)idx)
721
+ break;
722
+ else if (b->atom0 != (int)idx && b->atom1 != (int)idx)
723
+ continue;
724
+
725
+ if (b->order == COHO_SMILES_BOND_SINGLE)
726
+ valence += 1;
727
+ else if (b->order == COHO_SMILES_BOND_AROMATIC)
728
+ valence += 1;
729
+ else if (b->order == COHO_SMILES_BOND_DOUBLE)
730
+ valence += 2;
731
+ else if (b->order == COHO_SMILES_BOND_TRIPLE)
732
+ valence += 3;
733
+ else if (b->order == COHO_SMILES_BOND_QUAD)
734
+ valence += 4;
735
+
736
+ neighbors += 1;
737
+ }
738
+
739
+ if (x->atoms[idx].is_aromatic && valence == neighbors) {
740
+ valence += 1;
741
+ }
742
+
743
+ return valence;
744
+ }
745
+
746
+ /*
747
+ * Matches a bond or returns 0 if not found.
748
+ * If found, sets fields of *b and returns 1.
749
+ * Only sets fields that can be determined by the matching bond
750
+ * token (order, stereo, position, and length).
751
+ * Clears implicit flag.
752
+ * Doesn't set bond atoms.
753
+ */
754
+ static int
755
+ bond(struct coho_smiles *x, struct coho_smiles_bond *b)
756
+ {
757
+ struct token t;
758
+
759
+ if (!match(x, &t, 0, BOND))
760
+ return 0;
761
+
762
+ b->order = t.intval;
763
+ b->stereo = t.flags;
764
+ b->is_implicit = 0;
765
+ b->position = t.position;
766
+ b->length = t.n;
767
+ return 1;
768
+ }
769
+
770
+ /*
771
+ * Matches a bracket atom or returns 0 if not found.
772
+ * If found, initializes the atom, sets its fields, and returns 1.
773
+ * On error, sets x->error and returns -1.
774
+ */
775
+ static int
776
+ bracket_atom(struct coho_smiles *x, struct coho_smiles_atom *a)
777
+ {
778
+ struct token t;
779
+
780
+ if (!match(x, &t, 0, BRACKET_OPEN))
781
+ return 0;
782
+
783
+ coho_smiles_atom_init(a);
784
+ a->is_bracket = 1;
785
+ a->position = t.position;
786
+ a->length = t.n;
787
+
788
+ if (isotope(x, a) == -1)
789
+ return -1;
790
+
791
+ if (symbol(x, a) == 0) {
792
+ strlcpy(x->error, "atom symbol expected", sizeof(x->error));
793
+ return -1;
794
+ }
795
+
796
+ if (chirality(x, a) == -1)
797
+ return -1;
798
+
799
+ if (hydrogen_count(x, a) == -1)
800
+ return -1;
801
+
802
+ if (charge(x, a) == -1)
803
+ return -1;
804
+
805
+ if (atom_class(x, a) == -1)
806
+ return -1;
807
+
808
+ if (!match(x, &t, 0, BRACKET_CLOSE)) {
809
+ strlcpy(x->error,
810
+ "bracket atom syntax error",
811
+ sizeof(x->error));
812
+ return -1;
813
+ }
814
+ a->length += t.n;
815
+ return 1;
816
+ }
817
+
818
+ /*
819
+ * Returns 0 if all rings have been closed.
820
+ * Otherwise, sets x->error and returns -1.
821
+ */
822
+ static int
823
+ check_ring_closures(struct coho_smiles *x)
824
+ {
825
+ size_t i;
826
+
827
+ if (x->open_ring_closures == 0)
828
+ return 0;
829
+
830
+ strlcpy(x->error, "unclosed ring bond", sizeof(x->error));
831
+
832
+ for (i = 0; i < 100; i++) {
833
+ if (x->ring_bonds[i].atom0 != -1) {
834
+ x->error_position = x->ring_bonds[i].position;
835
+ break;
836
+ }
837
+ }
838
+
839
+ return -1;
840
+ }
841
+
842
+ /*
843
+ * Parses optional charge inside a bracket atom.
844
+ * If successful, sets a->charge and increments a->length.
845
+ * Returns 1 if charge was read, else 0.
846
+ * On error, sets x->error and returns -1.
847
+ */
848
+ static int
849
+ charge(struct coho_smiles *x, struct coho_smiles_atom *a)
850
+ {
851
+ struct token t;
852
+ int sign;
853
+ int n;
854
+ int length;
855
+
856
+ if (!match(x, &t, 1, PLUS | MINUS))
857
+ return 0;
858
+ sign = t.intval;
859
+ length = t.n;
860
+
861
+ if ((n = integer(x, 2, &a->charge)) == -1) {
862
+ strlcpy(x->error, "charge too large", sizeof(x->error));
863
+ return -1;
864
+ } else if (n) {
865
+ a->charge *= sign;
866
+ length += n;
867
+ } else {
868
+ a->charge = sign;
869
+
870
+ if (lex(x, &t, 1) & (PLUS | MINUS)) {
871
+ if (t.intval == sign) {
872
+ x->position += t.n;
873
+ a->charge *= 2;
874
+ length += t.n;
875
+ }
876
+ }
877
+ }
878
+
879
+ a->length += length;
880
+ return 1;
881
+ }
882
+
883
+ /*
884
+ * Parses chirality inside a bracket atom.
885
+ * If successful, sets a->chirality and increments a->length.
886
+ * Returns 1 if chirality was read, else 0.
887
+ * TODO: Currently, this only understands @ and @@.
888
+ */
889
+ static int
890
+ chirality(struct coho_smiles *x, struct coho_smiles_atom *a)
891
+ {
892
+ struct token t;
893
+
894
+ if (!match(x, &t, 1, CHIRALITY))
895
+ return 0;
896
+ tokcpy(a->chirality, &t, sizeof(a->chirality));
897
+ a->length += t.n;
898
+ return 1;
899
+ }
900
+
901
+ /*
902
+ * Matches a closing parenthesis that ends a branch.
903
+ * On success, pops the parenthesis stack and returns 1.
904
+ * Returns 0 if there was no match.
905
+ * On error, sets x->error and returns -1.
906
+ */
907
+ static int
908
+ close_paren(struct coho_smiles *x, struct coho_smiles_bond *b)
909
+ {
910
+ struct token t;
911
+
912
+ if (!match(x, &t, 0, PAREN_CLOSE))
913
+ return 0;
914
+
915
+ if (pop_paren_stack(x, t.position, b))
916
+ return -1;
917
+ return 1;
918
+ }
919
+
920
+ /*
921
+ * Matches dot, the no-bond specifier.
922
+ * Returns 1 on success, 0 if there was no match.
923
+ */
924
+ static int
925
+ dot(struct coho_smiles *x)
926
+ {
927
+ struct token t;
928
+
929
+ return match(x, &t, 0, DOT);
930
+ }
931
+
932
+ static int
933
+ ensure_array_capacities(struct coho_smiles *x, size_t smiles_length)
934
+ {
935
+ size_t new_capacity;
936
+ void *p;
937
+
938
+ /*
939
+ * Maximum required storage is bounded by length of SMILES string.
940
+ */
941
+ if (x->atoms_capacity >= smiles_length)
942
+ return 0;
943
+
944
+ new_capacity = next_array_capacity(smiles_length);
945
+
946
+ #define GROW(name) \
947
+ do { \
948
+ p = reallocarray(x->name, \
949
+ new_capacity, \
950
+ sizeof(x->name[0])); \
951
+ if (p == NULL) \
952
+ return -1; \
953
+ x->name = p; \
954
+ x->name##_capacity = new_capacity; \
955
+ } while (0)
956
+
957
+ GROW(atoms);
958
+ GROW(bonds);
959
+ GROW(paren_stack);
960
+
961
+ #undef GROW
962
+ return 0;
963
+ }
964
+
965
+ /*
966
+ * Sets the order of an implicit bond according to
967
+ * the aromaticity of the two atoms.
968
+ */
969
+ static void
970
+ finalize_implicit_bond_order(struct coho_smiles *x, struct coho_smiles_bond *b)
971
+ {
972
+ if (!b->is_implicit)
973
+ return;
974
+
975
+ if (x->atoms[b->atom0].is_aromatic && x->atoms[b->atom1].is_aromatic)
976
+ b->order = COHO_SMILES_BOND_AROMATIC;
977
+ else
978
+ b->order = COHO_SMILES_BOND_SINGLE;
979
+ }
980
+
981
+ /*
982
+ * Parses hydrogen count inside a bracket atom.
983
+ * If successful, sets a->hydrogen_count and increments a->length.
984
+ * Returns 1 if hydrogen_count was read, else 0.
985
+ */
986
+ static int
987
+ hydrogen_count(struct coho_smiles *x, struct coho_smiles_atom *a)
988
+ {
989
+ struct token t;
990
+
991
+ if (!match(x, &t, 1, HYDROGEN))
992
+ return 0;
993
+
994
+ a->length += t.n;
995
+
996
+ if (match(x, &t, 1, DIGIT)) {
997
+ a->hydrogen_count = t.intval;
998
+ a->length += t.n;
999
+ } else {
1000
+ a->hydrogen_count = 1;
1001
+ }
1002
+
1003
+ return 1;
1004
+ }
1005
+
1006
+ /*
1007
+ * Matches an integer up to maxdigit long.
1008
+ * On success, stores the integer in *dst and returns number of digits.
1009
+ * Returns 0 if no digits are available.
1010
+ * Returns -1 if maxdigit is exceeded.
1011
+ */
1012
+ static int
1013
+ integer(struct coho_smiles *x, size_t maxdigit, int *dst)
1014
+ {
1015
+ size_t i;
1016
+ int n = 0;
1017
+ int saved = x->position;
1018
+ struct token t;
1019
+
1020
+ for (i = 0; lex(x, &t, 0) & DIGIT; i++) {
1021
+ if (maxdigit && i == maxdigit) {
1022
+ x->position = saved;
1023
+ return -1;
1024
+ }
1025
+ x->position += t.n;
1026
+ n = n * 10 + t.intval;
1027
+ }
1028
+ if (i == 0)
1029
+ return 0;
1030
+ *dst = n;
1031
+ return i;
1032
+ }
1033
+
1034
+ /*
1035
+ * Parses isotope inside a bracket atom.
1036
+ * If successful, sets a->isotope and increments a->length.
1037
+ * Returns 1 if isotope was read, else 0.
1038
+ * On error, returns -1 and sets x->error.
1039
+ */
1040
+ static int
1041
+ isotope(struct coho_smiles *x, struct coho_smiles_atom *a)
1042
+ {
1043
+ int n;
1044
+
1045
+ if ((n = integer(x, 5, &a->isotope)) == -1) {
1046
+ strlcpy(x->error, "isotope too large", sizeof(x->error));
1047
+ return -1;
1048
+ }
1049
+ a->length += n;
1050
+ return 0;
1051
+ }
1052
+
1053
+ /*
1054
+ * Reads next token and checks if its type is among those requested.
1055
+ * If so, consumes the token and returns 1.
1056
+ * If not, returns 0 and the parsing position remains unchanged.
1057
+ */
1058
+ static int
1059
+ match(struct coho_smiles *x, struct token *t, int inbracket, unsigned int ttype)
1060
+ {
1061
+ if (lex(x, t, inbracket) & ttype) {
1062
+ x->position += t->n;
1063
+ return 1;
1064
+ }
1065
+ return 0;
1066
+ }
1067
+
1068
/*
 * Returns the capacity to allocate for an array that must hold at
 * least previous_capacity elements: the smallest power of two that is
 * greater than or equal to previous_capacity.
 * A request of 0 yields 1.
 * A request larger than the biggest power of two that fits in size_t
 * is returned unchanged rather than wrapping to a smaller value.
 */
static size_t
next_array_capacity(size_t previous_capacity)
{
	/* Largest power of two representable in size_t. */
	const size_t top_bit = ~(size_t)0 ^ (~(size_t)0 >> 1);
	size_t cap = 1;

	if (previous_capacity > top_bit)
		return previous_capacity;

	/* cap never exceeds top_bit here, so the shift cannot overflow. */
	while (cap < previous_capacity)
		cap <<= 1;
	return cap;
}
1081
+
1082
+ /*
1083
+ * Matches an opening parenthesis that begins a branch.
1084
+ * On success, pushes the parenthesis stack and returns 1.
1085
+ * Returns 0 if there was no match.
1086
+ */
1087
+ static int
1088
+ open_paren(struct coho_smiles *x, struct coho_smiles_bond *b)
1089
+ {
1090
+ struct token t;
1091
+
1092
+ if (!match(x, &t, 0, PAREN_OPEN))
1093
+ return 0;
1094
+
1095
+ push_paren_stack(x, t.position, b);
1096
+ return 1;
1097
+ }
1098
+
1099
+ /*
1100
+ * Pops the parenthesis stack that holds the open bonds to
1101
+ * "previous" atoms.
1102
+ * Ex: In C(N)=O the closing parenthesis will trigger the popping of
1103
+ * the stack, ensuring that the oxygen is bonded to the carbon instead
1104
+ * of the nitrogen.
1105
+ * The position of the parenthesis triggering the pop is used for
1106
+ * error messages.
1107
+ * Returns 0 on success.
1108
+ * On failure, sets x->error and returns -1.
1109
+ */
1110
+ static int
1111
+ pop_paren_stack(struct coho_smiles *x, int position, struct coho_smiles_bond *b)
1112
+ {
1113
+ if (!x->paren_stack_count) {
1114
+ strlcpy(x->error, "unbalanced parenthesis", sizeof(x->error));
1115
+ x->error_position = position;
1116
+ return -1;
1117
+ }
1118
+
1119
+ *b = x->paren_stack[--x->paren_stack_count].bond;
1120
+ return 0;
1121
+ }
1122
+
1123
+ /*
1124
+ * Pushes the parenthesis stack that holds open bonds to
1125
+ * "previous" atoms.
1126
+ * Ex: In C(N)=O the first parenthesis will trigger the pushing of
1127
+ * an open bond to the carbon onto the stack.
1128
+ * The closing parenthesis will pop the stack, ensuring that the carbon
1129
+ * is correctly bonded to the oxygen.
1130
+ * The position of the parenthesis triggering the push is stored
1131
+ * to support error messages.
1132
+ */
1133
+ static void
1134
+ push_paren_stack(struct coho_smiles *x,
1135
+ int position,
1136
+ struct coho_smiles_bond *b)
1137
+ {
1138
+ struct coho_smiles_paren *p;
1139
+
1140
+ assert(b->atom0 != -1);
1141
+
1142
+ p = &x->paren_stack[x->paren_stack_count++];
1143
+ p->position = position;
1144
+ p->bond = *b;
1145
+ }
1146
+
1147
+ /*
1148
+ * Matches a ring bond or returns 0 if not found.
1149
+ * On error, sets x->error and returns -1.
1150
+ * On success, uses atom anum to open or close a ring
1151
+ * bond and then returns 1.
1152
+ * If the parsed ring bond ID is in use, closes it and adds a new bond
1153
+ * to the bond list.
1154
+ * Otherwise, marks the ring ID as open.
1155
+ */
1156
+ static int
1157
+ ringbond(struct coho_smiles *x, int anum)
1158
+ {
1159
+ struct token t;
1160
+ struct coho_smiles_bond b;
1161
+ int rc;
1162
+ int rnum;
1163
+ int saved = x->position;
1164
+
1165
+ coho_smiles_bond_init(&b);
1166
+ b.atom0 = anum;
1167
+
1168
+ if ((rc = bond(x, &b))) {
1169
+ if (rc == -1)
1170
+ return -1;
1171
+ } else {
1172
+ b.order = COHO_SMILES_BOND_UNSPECIFIED;
1173
+ b.position = x->position;
1174
+ }
1175
+
1176
+ if (!match(x, &t, 0, PERCENT | DIGIT)) {
1177
+ x->position = saved;
1178
+ return 0;
1179
+ }
1180
+
1181
+ if (t.type == PERCENT) {
1182
+ if (!match(x, &t, 0, DIGIT)) {
1183
+ strlcpy(x->error,
1184
+ "ring bond expected",
1185
+ sizeof(x->error));
1186
+ return -1;
1187
+ }
1188
+ rnum = t.intval * 10;
1189
+
1190
+ if (!match(x, &t, 0, DIGIT)) {
1191
+ strlcpy(x->error,
1192
+ "2 digit ring bond expected",
1193
+ sizeof(x->error));
1194
+ return -1;
1195
+ }
1196
+ rnum += t.intval;
1197
+ } else {
1198
+ rnum = t.intval;
1199
+ }
1200
+
1201
+ if (add_ringbond(x, rnum, &b))
1202
+ return -1;
1203
+ return 1;
1204
+ }
1205
+
1206
+ /*
1207
+ * Rounds an atom's current valence to its next standard one.
1208
+ * Returns its current valence if it among the standard ones.
1209
+ * Otherwise, returns the next higher standard one or -1 if
1210
+ * none are found.
1211
+ * Setting lowest_only to true causes the search to stop after
1212
+ * the first standard valence, disregarding higher valences.
1213
+ */
1214
+ static int
1215
+ round_valence(int atomic_number, int valence, int lowest_only)
1216
+ {
1217
+ int i, j, anum;
1218
+
1219
+ for (i = 0; (anum = standard_valences[i][0]) != -1; i++) {
1220
+ if (anum > atomic_number)
1221
+ break;
1222
+ else if (anum == atomic_number) {
1223
+ for (j = 1; j < 4; j++) {
1224
+ if (valence <= standard_valences[i][j])
1225
+ return standard_valences[i][j];
1226
+ if (lowest_only)
1227
+ break;
1228
+ }
1229
+ }
1230
+ }
1231
+ return -1;
1232
+ }
1233
+
1234
+ /*
1235
+ * Initializes struct coho_smiles_atom.
1236
+ */
1237
+ static void
1238
+ coho_smiles_atom_init(struct coho_smiles_atom *x)
1239
+ {
1240
+ x->atomic_number = 0;
1241
+ x->symbol[0] = '\0';
1242
+ x->isotope = -1;
1243
+ x->charge = 0;
1244
+ x->hydrogen_count = -1;
1245
+ x->implicit_hydrogen_count = -1;
1246
+ x->is_bracket = 0;
1247
+ x->is_organic = 0;
1248
+ x->is_aromatic = 0;
1249
+ x->chirality[0] = '\0';
1250
+ x->atom_class = -1;
1251
+ x->position = -1;
1252
+ x->length = 0;
1253
+ }
1254
+
1255
+ /*
1256
+ * Initializes struct coho_smiles_bond.
1257
+ */
1258
+ static void
1259
+ coho_smiles_bond_init(struct coho_smiles_bond *x)
1260
+ {
1261
+ x->atom0 = -1;
1262
+ x->atom1 = -1;
1263
+ x->order = -1;
1264
+ x->stereo = COHO_SMILES_BOND_STEREO_UNSPECIFIED;
1265
+ x->is_implicit = 0;
1266
+ x->is_ring = 0;
1267
+ x->position = -1;
1268
+ x->length = 0;
1269
+ }
1270
+
1271
+ /*
1272
+ * Reinitializes struct coho_smiles prior to parsing a new SMILES.
1273
+ * The given number of bytes of smiles will be parsed.
1274
+ */
1275
+ static void
1276
+ coho_smiles_reinit(struct coho_smiles *x, const char *smiles, size_t end)
1277
+ {
1278
+ size_t i;
1279
+
1280
+ x->smiles = smiles;
1281
+ x->position = 0;
1282
+ x->end = end;
1283
+ x->error[0] = '\0';
1284
+ x->error_position = -1;
1285
+ x->atom_count = 0;
1286
+ x->bond_count = 0;
1287
+ x->paren_stack_count = 0;
1288
+
1289
+ for (i = 0; i < 100; i++)
1290
+ coho_smiles_bond_init(&x->ring_bonds[i]);
1291
+ x->open_ring_closures = 0;
1292
+ }
1293
+
1294
+ /*
1295
+ * Parses atom symbol inside a bracket atom.
1296
+ * If successful, sets a->symbol, a->is_aromatic, and increments a->length.
1297
+ * Returns 1 if symbol was read, else 0.
1298
+ */
1299
+ static int
1300
+ symbol(struct coho_smiles *x, struct coho_smiles_atom *a)
1301
+ {
1302
+ struct token t;
1303
+
1304
+ if (!match(x, &t, 1, ELEMENT | AROMATIC | WILDCARD))
1305
+ return 0;
1306
+ a->atomic_number = t.intval;
1307
+ a->is_aromatic = t.type & AROMATIC ? 1 : 0;
1308
+ a->length += t.n;
1309
+ tokcpy(a->symbol, &t, sizeof(a->symbol));
1310
+ return 1;
1311
+ }
1312
+
1313
+ /*
1314
+ * Copies up to dstsz - 1 bytes from the token to dst, NUL-terminating
1315
+ * dst if dstsz is not 0.
1316
+ */
1317
+ static void
1318
+ tokcpy(char *dst, struct token *t, size_t dstsz)
1319
+ {
1320
+ size_t i;
1321
+
1322
+ if (dstsz == 0)
1323
+ return;
1324
+
1325
+ for (i = 0; i < t->n; i++) {
1326
+ if (i == dstsz - 1)
1327
+ break;
1328
+ dst[i] = t->s[i];
1329
+ }
1330
+
1331
+ dst[i] = 0;
1332
+ }
1333
+
1334
+ /*
1335
+ * Matches a wildcard atom (*) or returns 0 if not found.
1336
+ * If found, initializes the atom, sets its fields, and returns 1.
1337
+ * On error, sets x->error and returns -1.
1338
+ */
1339
+ static int
1340
+ wildcard(struct coho_smiles *x, struct coho_smiles_atom *a)
1341
+ {
1342
+ struct token t;
1343
+
1344
+ if (!match(x, &t, 0, WILDCARD))
1345
+ return 0;
1346
+ coho_smiles_atom_init(a);
1347
+ a->position = t.position;
1348
+ a->atomic_number = 0;
1349
+ a->length = t.n;
1350
+ tokcpy(a->symbol, &t, sizeof(a->symbol));
1351
+ return 1;
1352
+ }
1353
+
1354
+ /*
1355
+ * Reads next token from SMILES string.
1356
+ * The inbracket parameter should be set to true when parsing is
1357
+ * inside a bracket atom.
1358
+ * Returns the token type or zero if no token could be read.
1359
+ * The token type is a bitmask since a particular token can belong
1360
+ * to multiple categories. For example, the symbol for
1361
+ * hydrogen will have type ELEMENT | HYDROGEN.
1362
+ */
1363
+ static unsigned int
1364
+ lex(struct coho_smiles *x, struct token *t, int inbracket)
1365
+ {
1366
+ int c0, c1;
1367
+ const char *s;
1368
+
1369
+ if (x->position == x->end)
1370
+ return 0;
1371
+
1372
+ s = x->smiles + x->position;
1373
+ c0 = s[0];
1374
+ c1 = 0;
1375
+
1376
+ if (x->position < x->end)
1377
+ c1 = s[1];
1378
+
1379
+ t->s = s;
1380
+ t->position = x->position;
1381
+ t->n = 1;
1382
+ t->type = 0;
1383
+ t->intval = -1;
1384
+ t->flags = 0;
1385
+
1386
+ switch (c0) {
1387
+ case 'a':
1388
+ if (inbracket && c1 == 's') {
1389
+ t->n = 2;
1390
+ t->type = AROMATIC;
1391
+ t->intval = 33;
1392
+ goto out;
1393
+ }
1394
+ return 0;
1395
+ case 'b':
1396
+ t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
1397
+ t->intval = 5;
1398
+ goto out;
1399
+ case 'c':
1400
+ t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
1401
+ t->intval = 6;
1402
+ goto out;
1403
+ case 'n':
1404
+ t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
1405
+ t->intval = 7;
1406
+ goto out;
1407
+ case 'o':
1408
+ t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
1409
+ t->intval = 8;
1410
+ goto out;
1411
+ case 'p':
1412
+ t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
1413
+ t->intval = 15;
1414
+ goto out;
1415
+ case 's':
1416
+ if (!inbracket) {
1417
+ t->type = AROMATIC_ORGANIC;
1418
+ t->intval = 16;
1419
+ goto out;
1420
+ }
1421
+ switch (c1) {
1422
+ case 'e':
1423
+ t->type = AROMATIC;
1424
+ t->n = 2;
1425
+ t->intval = 34;
1426
+ goto out;
1427
+ default:
1428
+ t->type = AROMATIC;
1429
+ t->intval = 16;
1430
+ goto out;
1431
+ }
1432
+ case 'A':
1433
+ switch (c1) {
1434
+ case 'c':
1435
+ t->type = ELEMENT;
1436
+ t->intval = 89;
1437
+ t->n = 2;
1438
+ goto out;
1439
+ case 'g':
1440
+ t->type = ELEMENT;
1441
+ t->intval = 47;
1442
+ t->n = 2;
1443
+ goto out;
1444
+ case 'l':
1445
+ t->type = ELEMENT;
1446
+ t->intval = 13;
1447
+ t->n = 2;
1448
+ goto out;
1449
+ case 'm':
1450
+ t->type = ELEMENT;
1451
+ t->intval = 95;
1452
+ t->n = 2;
1453
+ goto out;
1454
+ case 'r':
1455
+ t->type = ELEMENT;
1456
+ t->intval = 18;
1457
+ t->n = 2;
1458
+ goto out;
1459
+ case 's':
1460
+ t->type = ELEMENT;
1461
+ t->intval = 33;
1462
+ t->n = 2;
1463
+ goto out;
1464
+ case 't':
1465
+ t->type = ELEMENT;
1466
+ t->intval = 85;
1467
+ t->n = 2;
1468
+ goto out;
1469
+ case 'u':
1470
+ t->type = ELEMENT;
1471
+ t->intval = 79;
1472
+ t->n = 2;
1473
+ goto out;
1474
+ default:
1475
+ return 0;
1476
+ }
1477
+ case 'B':
1478
+ if (!inbracket) {
1479
+ if (c1 == 'r') {
1480
+ t->intval = 35;
1481
+ t->n = 2;
1482
+ } else {
1483
+ t->intval = 5;
1484
+ }
1485
+ t->type = ALIPHATIC_ORGANIC;
1486
+ goto out;
1487
+ }
1488
+ switch (c1) {
1489
+ case 'a':
1490
+ t->type = ELEMENT;
1491
+ t->intval = 56;
1492
+ t->n = 2;
1493
+ goto out;
1494
+ case 'e':
1495
+ t->type = ELEMENT;
1496
+ t->intval = 4;
1497
+ t->n = 2;
1498
+ goto out;
1499
+ case 'h':
1500
+ t->type = ELEMENT;
1501
+ t->intval = 107;
1502
+ t->n = 2;
1503
+ goto out;
1504
+ case 'i':
1505
+ t->type = ELEMENT;
1506
+ t->intval = 83;
1507
+ t->n = 2;
1508
+ goto out;
1509
+ case 'k':
1510
+ t->type = ELEMENT;
1511
+ t->intval = 97;
1512
+ t->n = 2;
1513
+ goto out;
1514
+ case 'r':
1515
+ t->type = ELEMENT;
1516
+ t->intval = 35;
1517
+ t->n = 2;
1518
+ goto out;
1519
+ default:
1520
+ t->type = ELEMENT;
1521
+ t->intval = 5;
1522
+ goto out;
1523
+ }
1524
+ case 'C':
1525
+ if (!inbracket) {
1526
+ if (c1 == 'l') {
1527
+ t->intval = 17;
1528
+ t->n = 2;
1529
+ } else {
1530
+ t->intval = 6;
1531
+ }
1532
+ t->type = ALIPHATIC_ORGANIC;
1533
+ goto out;
1534
+ }
1535
+ switch (c1) {
1536
+ case 'a':
1537
+ t->type = ELEMENT;
1538
+ t->n = 2;
1539
+ t->intval = 20;
1540
+ goto out;
1541
+ case 'd':
1542
+ t->type = ELEMENT;
1543
+ t->n = 2;
1544
+ t->intval = 20;
1545
+ goto out;
1546
+ case 'e':
1547
+ t->type = ELEMENT;
1548
+ t->n = 2;
1549
+ t->intval = 58;
1550
+ goto out;
1551
+ case 'f':
1552
+ t->type = ELEMENT;
1553
+ t->n = 2;
1554
+ t->intval = 98;
1555
+ goto out;
1556
+ case 'l':
1557
+ t->type = ELEMENT;
1558
+ t->n = 2;
1559
+ t->intval = 17;
1560
+ goto out;
1561
+ case 'm':
1562
+ t->type = ELEMENT;
1563
+ t->n = 2;
1564
+ t->intval = 96;
1565
+ goto out;
1566
+ case 'n':
1567
+ t->type = ELEMENT;
1568
+ t->n = 2;
1569
+ t->intval = 112;
1570
+ goto out;
1571
+ case 'o':
1572
+ t->type = ELEMENT;
1573
+ t->n = 2;
1574
+ t->intval = 27;
1575
+ goto out;
1576
+ case 'r':
1577
+ t->type = ELEMENT;
1578
+ t->n = 2;
1579
+ t->intval = 24;
1580
+ goto out;
1581
+ case 's':
1582
+ t->type = ELEMENT;
1583
+ t->n = 2;
1584
+ t->intval = 55;
1585
+ goto out;
1586
+ case 'u':
1587
+ t->type = ELEMENT;
1588
+ t->n = 2;
1589
+ t->intval = 29;
1590
+ goto out;
1591
+ default:
1592
+ t->type = ELEMENT;
1593
+ t->intval = 6;
1594
+ goto out;
1595
+ }
1596
+ case 'D':
1597
+ switch (c1) {
1598
+ case 'b':
1599
+ t->type = ELEMENT;
1600
+ t->n = 2;
1601
+ t->intval = 105;
1602
+ goto out;
1603
+ case 's':
1604
+ t->type = ELEMENT;
1605
+ t->n = 2;
1606
+ t->intval = 110;
1607
+ goto out;
1608
+ case 'y':
1609
+ t->type = ELEMENT;
1610
+ t->n = 2;
1611
+ t->intval = 66;
1612
+ goto out;
1613
+ default:
1614
+ return 0;
1615
+ }
1616
+ case 'E':
1617
+ switch (c1) {
1618
+ case 'r':
1619
+ t->type = ELEMENT;
1620
+ t->n = 2;
1621
+ t->intval = 68;
1622
+ goto out;
1623
+ case 's':
1624
+ t->type = ELEMENT;
1625
+ t->n = 2;
1626
+ t->intval = 99;
1627
+ goto out;
1628
+ case 'u':
1629
+ t->type = ELEMENT;
1630
+ t->n = 2;
1631
+ t->intval = 63;
1632
+ goto out;
1633
+ default:
1634
+ return 0;
1635
+ }
1636
+ case 'F':
1637
+ if (!inbracket) {
1638
+ t->intval = 9;
1639
+ t->type = ALIPHATIC_ORGANIC;
1640
+ goto out;
1641
+ }
1642
+ switch (c1) {
1643
+ case 'e':
1644
+ t->type = ELEMENT;
1645
+ t->n = 2;
1646
+ t->intval = 26;
1647
+ goto out;
1648
+ case 'l':
1649
+ t->type = ELEMENT;
1650
+ t->n = 2;
1651
+ t->intval = 114;
1652
+ goto out;
1653
+ case 'm':
1654
+ t->type = ELEMENT;
1655
+ t->n = 2;
1656
+ t->intval = 100;
1657
+ goto out;
1658
+ case 'r':
1659
+ t->type = ELEMENT;
1660
+ t->n = 2;
1661
+ t->intval = 87;
1662
+ goto out;
1663
+ default:
1664
+ t->type = ELEMENT;
1665
+ t->intval = 9;
1666
+ goto out;
1667
+ }
1668
+ case 'G':
1669
+ switch (c1) {
1670
+ case 'a':
1671
+ t->type = ELEMENT;
1672
+ t->n = 2;
1673
+ t->intval = 31;
1674
+ goto out;
1675
+ case 'd':
1676
+ t->type = ELEMENT;
1677
+ t->n = 2;
1678
+ t->intval = 64;
1679
+ goto out;
1680
+ case 'e':
1681
+ t->type = ELEMENT;
1682
+ t->n = 2;
1683
+ t->intval = 32;
1684
+ goto out;
1685
+ default:
1686
+ return 0;
1687
+ }
1688
+ case 'H':
1689
+ switch (c1) {
1690
+ case 'e':
1691
+ t->type = ELEMENT;
1692
+ t->n = 2;
1693
+ t->intval = 2;
1694
+ goto out;
1695
+ case 'f':
1696
+ t->type = ELEMENT;
1697
+ t->n = 2;
1698
+ t->intval = 72;
1699
+ goto out;
1700
+ case 'g':
1701
+ t->type = ELEMENT;
1702
+ t->n = 2;
1703
+ t->intval = 80;
1704
+ goto out;
1705
+ case 'o':
1706
+ t->type = ELEMENT;
1707
+ t->n = 2;
1708
+ t->intval = 67;
1709
+ goto out;
1710
+ case 's':
1711
+ t->type = ELEMENT;
1712
+ t->n = 2;
1713
+ t->intval = 108;
1714
+ goto out;
1715
+ default:
1716
+ t->type = ELEMENT | HYDROGEN;
1717
+ t->intval = 1;
1718
+ goto out;
1719
+ }
1720
+ case 'I':
1721
+ if (!inbracket) {
1722
+ t->intval = 53;
1723
+ t->type = ALIPHATIC_ORGANIC;
1724
+ goto out;
1725
+ }
1726
+ switch (c1) {
1727
+ case 'n':
1728
+ t->type = ELEMENT;
1729
+ t->n = 2;
1730
+ t->intval = 49;
1731
+ goto out;
1732
+ case 'r':
1733
+ t->type = ELEMENT;
1734
+ t->n = 2;
1735
+ t->intval = 77;
1736
+ goto out;
1737
+ default:
1738
+ t->type = ELEMENT;
1739
+ t->intval = 53;
1740
+ goto out;
1741
+ }
1742
+ case 'K':
1743
+ switch (c1) {
1744
+ case 'r':
1745
+ t->type = ELEMENT;
1746
+ t->n = 2;
1747
+ t->intval = 36;
1748
+ goto out;
1749
+ default:
1750
+ t->type = ELEMENT;
1751
+ t->intval = 19;
1752
+ goto out;
1753
+ }
1754
+ case 'L':
1755
+ switch (c1) {
1756
+ case 'a':
1757
+ t->type = ELEMENT;
1758
+ t->n = 2;
1759
+ t->intval = 57;
1760
+ goto out;
1761
+ case 'i':
1762
+ t->type = ELEMENT;
1763
+ t->n = 2;
1764
+ t->intval = 3;
1765
+ goto out;
1766
+ case 'r':
1767
+ t->type = ELEMENT;
1768
+ t->n = 2;
1769
+ t->intval = 103;
1770
+ goto out;
1771
+ case 'u':
1772
+ t->type = ELEMENT;
1773
+ t->n = 2;
1774
+ t->intval = 71;
1775
+ goto out;
1776
+ case 'v':
1777
+ t->type = ELEMENT;
1778
+ t->n = 2;
1779
+ t->intval = 116;
1780
+ goto out;
1781
+ default:
1782
+ return 0;
1783
+ }
1784
+ case 'M':
1785
+ switch (c1) {
1786
+ case 'd':
1787
+ t->type = ELEMENT;
1788
+ t->n = 2;
1789
+ t->intval = 101;
1790
+ goto out;
1791
+ case 'g':
1792
+ t->type = ELEMENT;
1793
+ t->n = 2;
1794
+ t->intval = 12;
1795
+ goto out;
1796
+ case 'n':
1797
+ t->type = ELEMENT;
1798
+ t->n = 2;
1799
+ t->intval = 25;
1800
+ goto out;
1801
+ case 'o':
1802
+ t->type = ELEMENT;
1803
+ t->n = 2;
1804
+ t->intval = 42;
1805
+ goto out;
1806
+ case 't':
1807
+ t->type = ELEMENT;
1808
+ t->n = 2;
1809
+ t->intval = 109;
1810
+ goto out;
1811
+ default:
1812
+ return 0;
1813
+ }
1814
+ case 'N':
1815
+ if (!inbracket) {
1816
+ t->intval = 7;
1817
+ t->type = ALIPHATIC_ORGANIC;
1818
+ goto out;
1819
+ }
1820
+ switch (c1) {
1821
+ case 'a':
1822
+ t->type = ELEMENT;
1823
+ t->n = 2;
1824
+ t->intval = 11;
1825
+ goto out;
1826
+ case 'b':
1827
+ t->type = ELEMENT;
1828
+ t->n = 2;
1829
+ t->intval = 41;
1830
+ goto out;
1831
+ case 'd':
1832
+ t->type = ELEMENT;
1833
+ t->n = 2;
1834
+ t->intval = 101;
1835
+ goto out;
1836
+ case 'e':
1837
+ t->type = ELEMENT;
1838
+ t->n = 2;
1839
+ t->intval = 10;
1840
+ goto out;
1841
+ case 'i':
1842
+ t->type = ELEMENT;
1843
+ t->n = 2;
1844
+ t->intval = 28;
1845
+ goto out;
1846
+ case 'o':
1847
+ t->type = ELEMENT;
1848
+ t->n = 2;
1849
+ t->intval = 102;
1850
+ goto out;
1851
+ case 'p':
1852
+ t->type = ELEMENT;
1853
+ t->n = 2;
1854
+ t->intval = 93;
1855
+ goto out;
1856
+ default:
1857
+ t->type = ELEMENT;
1858
+ t->intval = 7;
1859
+ goto out;
1860
+ }
1861
+ case 'O':
1862
+ if (!inbracket) {
1863
+ t->intval = 8;
1864
+ t->type = ALIPHATIC_ORGANIC;
1865
+ goto out;
1866
+ }
1867
+ switch (c1) {
1868
+ case 's':
1869
+ t->type = ELEMENT;
1870
+ t->n = 2;
1871
+ t->intval = 76;
1872
+ goto out;
1873
+ default:
1874
+ t->type = ELEMENT;
1875
+ t->intval = 8;
1876
+ goto out;
1877
+ }
1878
+ case 'P':
1879
+ if (!inbracket) {
1880
+ t->intval = 15;
1881
+ t->type = ALIPHATIC_ORGANIC;
1882
+ goto out;
1883
+ }
1884
+ switch (c1) {
1885
+ case 'a':
1886
+ t->type = ELEMENT;
1887
+ t->n = 2;
1888
+ t->intval = 91;
1889
+ goto out;
1890
+ case 'b':
1891
+ t->type = ELEMENT;
1892
+ t->n = 2;
1893
+ t->intval = 82;
1894
+ goto out;
1895
+ case 'd':
1896
+ t->type = ELEMENT;
1897
+ t->n = 2;
1898
+ t->intval = 46;
1899
+ goto out;
1900
+ case 'm':
1901
+ t->type = ELEMENT;
1902
+ t->n = 2;
1903
+ t->intval = 61;
1904
+ goto out;
1905
+ case 'o':
1906
+ t->type = ELEMENT;
1907
+ t->n = 2;
1908
+ t->intval = 84;
1909
+ goto out;
1910
+ case 'r':
1911
+ t->type = ELEMENT;
1912
+ t->n = 2;
1913
+ t->intval = 59;
1914
+ goto out;
1915
+ case 't':
1916
+ t->type = ELEMENT;
1917
+ t->n = 2;
1918
+ t->intval = 78;
1919
+ goto out;
1920
+ case 'u':
1921
+ t->type = ELEMENT;
1922
+ t->n = 2;
1923
+ t->intval = 94;
1924
+ goto out;
1925
+ default:
1926
+ t->type = ELEMENT;
1927
+ t->intval = 15;
1928
+ goto out;
1929
+ }
1930
+ case 'R':
1931
+ switch (c1) {
1932
+ case 'a':
1933
+ t->type = ELEMENT;
1934
+ t->n = 2;
1935
+ t->intval = 88;
1936
+ goto out;
1937
+ case 'b':
1938
+ t->type = ELEMENT;
1939
+ t->n = 2;
1940
+ t->intval = 37;
1941
+ goto out;
1942
+ case 'e':
1943
+ t->type = ELEMENT;
1944
+ t->n = 2;
1945
+ t->intval = 75;
1946
+ goto out;
1947
+ case 'f':
1948
+ t->type = ELEMENT;
1949
+ t->n = 2;
1950
+ t->intval = 104;
1951
+ goto out;
1952
+ case 'g':
1953
+ t->type = ELEMENT;
1954
+ t->n = 2;
1955
+ t->intval = 111;
1956
+ goto out;
1957
+ case 'h':
1958
+ t->type = ELEMENT;
1959
+ t->n = 2;
1960
+ t->intval = 45;
1961
+ goto out;
1962
+ case 'n':
1963
+ t->type = ELEMENT;
1964
+ t->n = 2;
1965
+ t->intval = 86;
1966
+ goto out;
1967
+ case 'u':
1968
+ t->type = ELEMENT;
1969
+ t->n = 2;
1970
+ t->intval = 44;
1971
+ goto out;
1972
+ default:
1973
+ return 0;
1974
+ }
1975
+ case 'S':
1976
+ if (!inbracket) {
1977
+ t->intval = 16;
1978
+ t->type = ALIPHATIC_ORGANIC;
1979
+ goto out;
1980
+ }
1981
+ switch (c1) {
1982
+ case 'b':
1983
+ t->type = ELEMENT;
1984
+ t->n = 2;
1985
+ t->intval = 51;
1986
+ goto out;
1987
+ case 'c':
1988
+ t->type = ELEMENT;
1989
+ t->n = 2;
1990
+ t->intval = 21;
1991
+ goto out;
1992
+ case 'e':
1993
+ t->type = ELEMENT;
1994
+ t->n = 2;
1995
+ t->intval = 34;
1996
+ goto out;
1997
+ case 'g':
1998
+ t->type = ELEMENT;
1999
+ t->n = 2;
2000
+ t->intval = 106;
2001
+ goto out;
2002
+ case 'i':
2003
+ t->type = ELEMENT;
2004
+ t->n = 2;
2005
+ t->intval = 14;
2006
+ goto out;
2007
+ case 'm':
2008
+ t->type = ELEMENT;
2009
+ t->n = 2;
2010
+ t->intval = 62;
2011
+ goto out;
2012
+ case 'n':
2013
+ t->type = ELEMENT;
2014
+ t->n = 2;
2015
+ t->intval = 50;
2016
+ goto out;
2017
+ case 'r':
2018
+ t->type = ELEMENT;
2019
+ t->n = 2;
2020
+ t->intval = 38;
2021
+ goto out;
2022
+ default:
2023
+ t->type = ELEMENT;
2024
+ t->intval = 16;
2025
+ goto out;
2026
+ }
2027
+ case 'T':
2028
+ switch (c1) {
2029
+ case 'a':
2030
+ t->type = ELEMENT;
2031
+ t->n = 2;
2032
+ t->intval = 73;
2033
+ goto out;
2034
+ case 'b':
2035
+ t->type = ELEMENT;
2036
+ t->n = 2;
2037
+ t->intval = 65;
2038
+ goto out;
2039
+ case 'c':
2040
+ t->type = ELEMENT;
2041
+ t->n = 2;
2042
+ t->intval = 43;
2043
+ goto out;
2044
+ case 'e':
2045
+ t->type = ELEMENT;
2046
+ t->n = 2;
2047
+ t->intval = 52;
2048
+ goto out;
2049
+ case 'h':
2050
+ t->type = ELEMENT;
2051
+ t->n = 2;
2052
+ t->intval = 90;
2053
+ goto out;
2054
+ case 'i':
2055
+ t->type = ELEMENT;
2056
+ t->n = 2;
2057
+ t->intval = 22;
2058
+ goto out;
2059
+ case 'l':
2060
+ t->type = ELEMENT;
2061
+ t->n = 2;
2062
+ t->intval = 81;
2063
+ goto out;
2064
+ case 'm':
2065
+ t->type = ELEMENT;
2066
+ t->n = 2;
2067
+ t->intval = 69;
2068
+ goto out;
2069
+ default:
2070
+ return 0;
2071
+ }
2072
+ case 'U':
2073
+ t->type = ELEMENT;
2074
+ t->intval = 92;
2075
+ goto out;
2076
+ case 'V':
2077
+ t->type = ELEMENT;
2078
+ t->intval = 23;
2079
+ goto out;
2080
+ case 'W':
2081
+ t->type = ELEMENT;
2082
+ t->intval = 74;
2083
+ goto out;
2084
+ case 'X':
2085
+ switch (c1) {
2086
+ case 'e':
2087
+ t->type = ELEMENT;
2088
+ t->n = 2;
2089
+ t->intval = 54;
2090
+ goto out;
2091
+ default:
2092
+ return 0;
2093
+ }
2094
+ case 'Y':
2095
+ switch (c1) {
2096
+ case 'b':
2097
+ t->type = ELEMENT;
2098
+ t->n = 2;
2099
+ t->intval = 70;
2100
+ goto out;
2101
+ default:
2102
+ t->type = ELEMENT;
2103
+ t->intval = 39;
2104
+ goto out;
2105
+ }
2106
+ case 'Z':
2107
+ switch (c1) {
2108
+ case 'n':
2109
+ t->type = ELEMENT;
2110
+ t->n = 2;
2111
+ t->intval = 30;
2112
+ goto out;
2113
+ case 'r':
2114
+ t->type = ELEMENT;
2115
+ t->n = 2;
2116
+ t->intval = 40;
2117
+ goto out;
2118
+ default:
2119
+ return 0;
2120
+ }
2121
+ case '0':
2122
+ case '1':
2123
+ case '2':
2124
+ case '3':
2125
+ case '4':
2126
+ case '5':
2127
+ case '6':
2128
+ case '7':
2129
+ case '8':
2130
+ case '9':
2131
+ t->type = DIGIT;
2132
+ t->intval = c0 - '0';
2133
+ goto out;
2134
+ case '*':
2135
+ t->type = WILDCARD;
2136
+ t->intval = 0;
2137
+ goto out;
2138
+ case '[':
2139
+ t->type = BRACKET_OPEN;
2140
+ goto out;
2141
+ case ']':
2142
+ t->type = BRACKET_CLOSE;
2143
+ goto out;
2144
+ case '(':
2145
+ t->type = PAREN_OPEN;
2146
+ goto out;
2147
+ case ')':
2148
+ t->type = PAREN_CLOSE;
2149
+ goto out;
2150
+ case '+':
2151
+ t->type = PLUS;
2152
+ t->intval = 1;
2153
+ goto out;
2154
+ case '-':
2155
+ t->type = inbracket ? MINUS : BOND;
2156
+ t->intval = inbracket ? -1 : COHO_SMILES_BOND_SINGLE;
2157
+ goto out;
2158
+ case '%':
2159
+ t->type = PERCENT;
2160
+ goto out;
2161
+ case '=':
2162
+ t->type = BOND;
2163
+ t->intval = COHO_SMILES_BOND_DOUBLE;
2164
+ goto out;
2165
+ case '#':
2166
+ t->type = BOND;
2167
+ t->intval = COHO_SMILES_BOND_TRIPLE;
2168
+ goto out;
2169
+ case '$':
2170
+ t->type = BOND;
2171
+ t->intval = COHO_SMILES_BOND_QUAD;
2172
+ goto out;
2173
+ case ':':
2174
+ if (inbracket) {
2175
+ t->type = COLON;
2176
+ } else {
2177
+ t->type = BOND;
2178
+ t->intval = COHO_SMILES_BOND_AROMATIC;
2179
+ }
2180
+ goto out;
2181
+ case '/':
2182
+ t->type = BOND;
2183
+ t->intval = COHO_SMILES_BOND_SINGLE;
2184
+ t->flags = COHO_SMILES_BOND_STEREO_UP;
2185
+ goto out;
2186
+ case '\\':
2187
+ t->type = BOND;
2188
+ t->intval = COHO_SMILES_BOND_SINGLE;
2189
+ t->flags = COHO_SMILES_BOND_STEREO_DOWN;
2190
+ goto out;
2191
+ case '.':
2192
+ t->type = DOT;
2193
+ goto out;
2194
+ case '@':
2195
+ t->type = CHIRALITY;
2196
+ if (c1 == '@')
2197
+ t->n = 2;
2198
+ goto out;
2199
+ default:
2200
+ return 0;
2201
+ }
2202
+
2203
+ out:
2204
+ return t->type;
2205
+ }