coho 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/coho/coho.c +75 -0
- data/ext/coho/extconf.rb +13 -0
- data/ext/coho/src/coho.h +111 -0
- data/ext/coho/src/compat.c +91 -0
- data/ext/coho/src/smiles.c +2205 -0
- data/lib/coho.rb +11 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 78fcbe27ff1817e332322d5d1110037c6747aa969377857b26d5e049068988f9
|
4
|
+
data.tar.gz: 8d8a2215c7ff67b371c3d4028e8e28d3b1748be9a070d2bba0d0531739ca3af1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 61ce89d6eb31c7260c8d40541c23f37085f658eb770e7e9571a8609a8d381dbb44400a786b5256ae02f01945779633cb18c9dd35295822518a8c24241b4e118d
|
7
|
+
data.tar.gz: a0e8d021ea0c1f9d3b93361beb5424fa151d9c80d604b148da42c942e2b001d906ce6c8cf666786c8abc5c859d0c6434892cb27c6c8eed3ff4239df66010b16d
|
data/ext/coho/coho.c
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include "coho.h"
|
4
|
+
|
5
|
+
#include "extconf.h"
|
6
|
+
|
7
|
+
static VALUE coho_rb_mod;
|
8
|
+
static VALUE coho_rb_smiles_mod;
|
9
|
+
static VALUE coho_rb_smiles_atom_class;
|
10
|
+
|
11
|
+
|
12
|
+
static VALUE
|
13
|
+
get_int(int n)
|
14
|
+
{
|
15
|
+
return n == -1 ? Qnil : INT2NUM(n);
|
16
|
+
}
|
17
|
+
|
18
|
+
|
19
|
+
VALUE
|
20
|
+
smiles_parser_parse(VALUE self, VALUE smiles_str)
|
21
|
+
{
|
22
|
+
struct coho_smiles smiles;
|
23
|
+
struct coho_smiles_atom *a;
|
24
|
+
int i;
|
25
|
+
VALUE atom_class, atoms, atom;
|
26
|
+
|
27
|
+
coho_smiles_init(&smiles);
|
28
|
+
coho_smiles_parse(&smiles,
|
29
|
+
RSTRING_PTR(smiles_str),
|
30
|
+
RSTRING_LEN(smiles_str));
|
31
|
+
|
32
|
+
atom_class = rb_path2class("Coho::Smiles::Atom");
|
33
|
+
atoms = rb_ary_new();
|
34
|
+
|
35
|
+
for (i = 0; i < smiles.atom_count; i++) {
|
36
|
+
a = &smiles.atoms[i];
|
37
|
+
atom = rb_funcall(atom_class, rb_intern("new"), 0);
|
38
|
+
rb_iv_set(atom, "@atomic_number", get_int(a->atomic_number));
|
39
|
+
rb_iv_set(atom, "@symbol", rb_str_new2(a->symbol));
|
40
|
+
rb_iv_set(atom, "@isotope", get_int(a->isotope));
|
41
|
+
rb_iv_set(atom, "@charge", INT2NUM(a->charge));
|
42
|
+
rb_iv_set(atom, "@hydrogen_count", get_int(a->hydrogen_count));
|
43
|
+
rb_iv_set(atom,
|
44
|
+
"@implicit_hydrogen_count",
|
45
|
+
get_int(a->implicit_hydrogen_count));
|
46
|
+
|
47
|
+
rb_ary_push(atoms, atom);
|
48
|
+
}
|
49
|
+
|
50
|
+
coho_smiles_free(&smiles);
|
51
|
+
|
52
|
+
return atoms;
|
53
|
+
}
|
54
|
+
|
55
|
+
static void
|
56
|
+
init_smiles(VALUE coho_mod)
|
57
|
+
{
|
58
|
+
VALUE mod;
|
59
|
+
VALUE parser_class;
|
60
|
+
|
61
|
+
mod = rb_define_module_under(coho_mod, "Smiles");
|
62
|
+
parser_class = rb_define_class_under(mod, "Parser", rb_cObject);
|
63
|
+
|
64
|
+
rb_define_method(parser_class, "parse", smiles_parser_parse, 1);
|
65
|
+
}
|
66
|
+
|
67
|
+
|
68
|
+
void
|
69
|
+
Init_coho()
|
70
|
+
{
|
71
|
+
VALUE mod;
|
72
|
+
|
73
|
+
mod = rb_define_module("Coho");
|
74
|
+
init_smiles(mod);
|
75
|
+
}
|
data/ext/coho/extconf.rb
ADDED
data/ext/coho/src/coho.h
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2017-2019 Ben Cornett <ben@lantern.is>
|
3
|
+
*
|
4
|
+
* Permission to use, copy, modify, and distribute this software for any
|
5
|
+
* purpose with or without fee is hereby granted, provided that the above
|
6
|
+
* copyright notice and this permission notice appear in all copies.
|
7
|
+
*
|
8
|
+
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
9
|
+
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
10
|
+
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
11
|
+
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
12
|
+
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
13
|
+
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
14
|
+
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
15
|
+
*/
|
16
|
+
|
17
|
+
/*
 * Status codes returned by coho_smiles_parse().
 */
enum coho_status {
	COHO_OK,	/* parse completed without error */
	COHO_ERROR,	/* parse failed; error position is recorded in the parser */
	COHO_NOMEM,	/* input longer than INT_MAX, or arrays could not be sized */
};
|
22
|
+
|
23
|
+
/* Compatibility functions {{{
 *
 * Declarations for the bundled OpenBSD-derived strlcpy() and
 * reallocarray() implementations.
 */

#include <stddef.h>	/* size_t, so this header is self-contained */

/*
 * Drop any macro of the same name so the prototype below declares a
 * real function.
 * Copies src into dst (capacity dsize), always NUL-terminating when
 * dsize != 0.  Returns strlen(src).
 */
#undef strlcpy
size_t strlcpy(char *dst, const char *src, size_t dsize);

/*
 * realloc() for an array of nmemb elements of the given size.
 * Returns NULL with errno set to ENOMEM if nmemb * size would overflow.
 */
#undef reallocarray
void *reallocarray(void *optr, size_t nmemb, size_t size);

/* }}} */
|
33
|
+
|
34
|
+
/* SMILES parsing {{{
|
35
|
+
*/
|
36
|
+
/*
 * Bond orders stored in coho_smiles_bond.order.
 * UNSPECIFIED is the value before an order has been determined.
 */
enum {
	COHO_SMILES_BOND_UNSPECIFIED = 0,
	COHO_SMILES_BOND_SINGLE = 1,
	COHO_SMILES_BOND_DOUBLE = 2,
	COHO_SMILES_BOND_TRIPLE = 3,
	COHO_SMILES_BOND_QUAD = 4,
	COHO_SMILES_BOND_AROMATIC = 5,
};
|
44
|
+
|
45
|
+
/*
 * Bond stereo markers stored in coho_smiles_bond.stereo.
 * UP and DOWN are exchanged when a bond's atoms are swapped.
 */
enum {
	COHO_SMILES_BOND_STEREO_UNSPECIFIED,
	COHO_SMILES_BOND_STEREO_UP,
	COHO_SMILES_BOND_STEREO_DOWN,
};
|
50
|
+
|
51
|
+
/*
 * One atom read from a SMILES string.
 * NOTE(review): -1 appears to mean "not specified" for atomic_number,
 * isotope, hydrogen_count and implicit_hydrogen_count (the Ruby
 * binding reports -1 as nil) -- confirm against the atom initializer.
 */
struct coho_smiles_atom {
	int atomic_number;		/* taken from the matched token's intval */
	char symbol[4];			/* symbol text copied from the token */
	int isotope;
	int charge;
	int hydrogen_count;
	int implicit_hydrogen_count;	/* assigned after parsing for organic atoms */
	int is_bracket;			/* presumably 1 for bracket atoms -- confirm */
	int is_organic;			/* 1 for organic-subset shorthand atoms */
	int is_aromatic;		/* 1 for aromatic organic atoms */
	char chirality[8];
	int atom_class;			/* integer after ':' in a bracket atom, e.g. [C:23] */
	int position;			/* position of the atom's first token */
	int length;			/* accumulated length of the atom's tokens */
};
|
66
|
+
|
67
|
+
/*
 * A bond between two atoms, identified by their indices in the
 * parser's atom array.  Bonds stored in the parser's bond list have
 * atom0 < atom1.
 */
struct coho_smiles_bond {
	int atom0;		/* first atom index; -1 is used to mean "no atom" */
	int atom1;		/* second atom index */
	int order;		/* one of COHO_SMILES_BOND_UNSPECIFIED..AROMATIC */
	int stereo;		/* one of COHO_SMILES_BOND_STEREO_* */
	int is_implicit;	/* 1 until an explicit bond token is matched */
	int is_ring;		/* 1 for bonds created by ring closures */
	int position;		/* position of the bond token */
	int length;		/* length of the bond token */
};
|
77
|
+
|
78
|
+
/*
 * Entry of the parser's parenthesis stack.
 */
struct coho_smiles_paren {
	int position;			/* reported as the error position for an unbalanced parenthesis */
	struct coho_smiles_bond bond;	/* presumably the open bond saved at '(' -- confirm */
};
|
82
|
+
|
83
|
+
/*
 * SMILES parser state and results.
 * Set up with coho_smiles_init(), filled by coho_smiles_parse(),
 * released with coho_smiles_free().
 */
struct coho_smiles {
	const char *smiles;	/* input being parsed */
	int position;		/* current parse position */
	int end;		/* parsing is complete when position == end */
	char error[32];		/* NUL-terminated error message; empty if none */
	int error_position;	/* position of the error; -1 when unset */

	int atom_count;		/* number of entries used in atoms */
	int bond_count;		/* number of entries used in bonds */

	struct coho_smiles_atom *atoms;		/* heap array, freed by coho_smiles_free() */
	size_t atoms_capacity;

	struct coho_smiles_bond *bonds;		/* heap array kept sorted by (atom0, atom1) */
	size_t bonds_capacity;

	struct coho_smiles_bond ring_bonds[100];	/* indexed by ring number; atom0 == -1 marks a free slot */
	size_t open_ring_closures;			/* count of ring bonds opened but not yet closed */

	struct coho_smiles_paren *paren_stack;	/* heap array, freed by coho_smiles_free() */
	int paren_stack_count;			/* > 0 at end of input means unbalanced parentheses */
	size_t paren_stack_capacity;
};
|
106
|
+
|
107
|
+
/* Frees the atom, bond and parenthesis-stack arrays owned by the parser. */
void coho_smiles_free(struct coho_smiles *);
/* Puts a parser into its initial empty state; call before first use. */
void coho_smiles_init(struct coho_smiles *);
/*
 * Parses a SMILES string of the given length (0 means use strlen()).
 * Returns COHO_OK, COHO_ERROR or COHO_NOMEM.
 */
int coho_smiles_parse(struct coho_smiles *, const char *, size_t);
|
110
|
+
|
111
|
+
/* }}} */
|
@@ -0,0 +1,91 @@
|
|
1
|
+
/* $OpenBSD: reallocarray.c,v 1.3 2015/09/13 08:31:47 guenther Exp $ */
|
2
|
+
/*
|
3
|
+
* Copyright (c) 2008 Otto Moerbeek <otto@drijf.net>
|
4
|
+
*
|
5
|
+
* Permission to use, copy, modify, and distribute this software for any
|
6
|
+
* purpose with or without fee is hereby granted, provided that the above
|
7
|
+
* copyright notice and this permission notice appear in all copies.
|
8
|
+
*
|
9
|
+
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
10
|
+
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
11
|
+
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
12
|
+
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
13
|
+
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
14
|
+
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
15
|
+
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
16
|
+
*/
|
17
|
+
|
18
|
+
#include <sys/types.h>
|
19
|
+
#include <errno.h>
|
20
|
+
#include <stdint.h>
|
21
|
+
#include <stdlib.h>
|
22
|
+
#include <string.h>
|
23
|
+
|
24
|
+
#include "coho.h"
|
25
|
+
|
26
|
+
/*
 * sqrt(SIZE_MAX+1): when both factors are below this bound their
 * product cannot exceed SIZE_MAX, so the division test can be skipped.
 */
#define MUL_NO_OVERFLOW	((size_t)1 << (sizeof(size_t) * 4))

/*
 * Resizes optr to hold nmemb elements of the given size.
 * Returns NULL and sets errno to ENOMEM if nmemb * size would overflow;
 * otherwise returns whatever realloc() returns.
 */
void *
reallocarray(void *optr, size_t nmemb, size_t size)
{
	int large_factor;

	large_factor = nmemb >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW;

	if (large_factor && nmemb > 0 && size > SIZE_MAX / nmemb) {
		errno = ENOMEM;
		return NULL;
	}

	return realloc(optr, nmemb * size);
}
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
/* $OpenBSD: strlcpy.c,v 1.15 2016/10/16 17:37:39 dtucker Exp $ */
|
46
|
+
|
47
|
+
/*
|
48
|
+
* Copyright (c) 1998, 2015 Todd C. Miller <Todd.Miller@courtesan.com>
|
49
|
+
*
|
50
|
+
* Permission to use, copy, modify, and distribute this software for any
|
51
|
+
* purpose with or without fee is hereby granted, provided that the above
|
52
|
+
* copyright notice and this permission notice appear in all copies.
|
53
|
+
*
|
54
|
+
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
55
|
+
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
56
|
+
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
57
|
+
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
58
|
+
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
59
|
+
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
60
|
+
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
61
|
+
*/
|
62
|
+
|
63
|
+
/*
 * Copies the string src into dst, whose capacity is dsize bytes.
 * No more than dsize-1 characters are copied and dst is always
 * NUL-terminated unless dsize is 0.
 * Returns strlen(src); a result >= dsize means the copy was truncated.
 */
size_t
strlcpy(char *dst, const char *src, size_t dsize)
{
	const char *cursor = src;
	size_t room;

	/* Copy while more than the terminator's byte remains in dst. */
	for (room = dsize; room > 1; room--) {
		if ((*dst++ = *cursor++) == '\0')
			return(cursor - src - 1);	/* whole string fit */
	}

	/* dst is full (or has no space at all): terminate if possible. */
	if (dsize != 0)
		*dst = '\0';

	/* Walk the remainder of src so its full length can be returned. */
	while (*cursor++ != '\0')
		;

	return(cursor - src - 1);	/* length excludes the NUL */
}
|
@@ -0,0 +1,2205 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2017-2019 Ben Cornett <ben@lantern.is>
|
3
|
+
*
|
4
|
+
* Permission to use, copy, modify, and distribute this software for any
|
5
|
+
* purpose with or without fee is hereby granted, provided that the above
|
6
|
+
* copyright notice and this permission notice appear in all copies.
|
7
|
+
*
|
8
|
+
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
9
|
+
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
10
|
+
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
11
|
+
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
12
|
+
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
13
|
+
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
14
|
+
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
15
|
+
*/
|
16
|
+
|
17
|
+
/*
|
18
|
+
* Parses SMILES as specified by the OpenSMILES standard.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <assert.h>
|
22
|
+
#include <limits.h>
|
23
|
+
#include <stdlib.h>
|
24
|
+
#include <string.h>
|
25
|
+
|
26
|
+
#include "coho.h"
|
27
|
+
|
28
|
+
#define ALIPHATIC_ORGANIC 0x00001
|
29
|
+
#define AROMATIC 0x00002
|
30
|
+
#define AROMATIC_ORGANIC 0x00004
|
31
|
+
#define BOND 0x00008
|
32
|
+
#define BRACKET_CLOSE 0x00010
|
33
|
+
#define BRACKET_OPEN 0x00020
|
34
|
+
#define CHIRALITY 0x00040
|
35
|
+
#define COLON 0x00080
|
36
|
+
#define DIGIT 0x00100
|
37
|
+
#define DOT 0x00200
|
38
|
+
#define ELEMENT 0x00400
|
39
|
+
#define HYDROGEN 0x00800
|
40
|
+
#define MINUS 0x01000
|
41
|
+
#define PAREN_CLOSE 0x02000
|
42
|
+
#define PAREN_OPEN 0x04000
|
43
|
+
#define PERCENT 0x08000
|
44
|
+
#define PLUS 0x10000
|
45
|
+
#define WILDCARD 0x20000
|
46
|
+
|
47
|
+
/*
 * A lexical token produced while scanning the SMILES string.
 */
struct token {
	int type;		/* presumably one of the token-type bit flags above -- confirm */
	int position;		/* position of the token; copied into atom/bond position */
	const char *s;		/* presumably points at the token text -- confirm */
	size_t n;		/* token length; added to atom/bond length */
	int intval;		/* token value: atomic number for atoms, order for bonds */
	int flags;		/* for bond tokens, the stereo marker */
};
|
55
|
+
|
56
|
+
static int atom_class(struct coho_smiles *, struct coho_smiles_atom *);
|
57
|
+
static int add_atom(struct coho_smiles *, struct coho_smiles_atom *);
|
58
|
+
static int add_bond(struct coho_smiles *, struct coho_smiles_bond *);
|
59
|
+
static int add_ringbond(struct coho_smiles *, int, struct coho_smiles_bond *);
|
60
|
+
static int aliphatic_organic(struct coho_smiles *, struct coho_smiles_atom *);
|
61
|
+
static int aromatic_organic(struct coho_smiles *, struct coho_smiles_atom *);
|
62
|
+
static int assign_implicit_hydrogen_count(struct coho_smiles *);
|
63
|
+
static int atom(struct coho_smiles *, int *);
|
64
|
+
static int atom_ringbond(struct coho_smiles *, int *);
|
65
|
+
static int atom_valence(struct coho_smiles *, size_t);
|
66
|
+
static int bond(struct coho_smiles *, struct coho_smiles_bond *b);
|
67
|
+
static int bracket_atom(struct coho_smiles *, struct coho_smiles_atom *);
|
68
|
+
static int charge(struct coho_smiles *, struct coho_smiles_atom *);
|
69
|
+
static int check_ring_closures(struct coho_smiles *);
|
70
|
+
static int chirality(struct coho_smiles *, struct coho_smiles_atom *);
|
71
|
+
static int close_paren(struct coho_smiles *, struct coho_smiles_bond *);
|
72
|
+
static int dot(struct coho_smiles *);
|
73
|
+
static int ensure_array_capacities(struct coho_smiles *, size_t);
|
74
|
+
static void finalize_implicit_bond_order(struct coho_smiles *,
|
75
|
+
struct coho_smiles_bond *);
|
76
|
+
static int hydrogen_count(struct coho_smiles *, struct coho_smiles_atom *);
|
77
|
+
static int integer(struct coho_smiles *, size_t, int *);
|
78
|
+
static int isotope(struct coho_smiles *, struct coho_smiles_atom *);
|
79
|
+
static unsigned int lex(struct coho_smiles *, struct token *, int);
|
80
|
+
static int match(struct coho_smiles *, struct token *, int, unsigned int);
|
81
|
+
static size_t next_array_capacity(size_t);
|
82
|
+
static int open_paren(struct coho_smiles *, struct coho_smiles_bond *);
|
83
|
+
static int pop_paren_stack(struct coho_smiles *, int, struct coho_smiles_bond *);
|
84
|
+
static void push_paren_stack(struct coho_smiles *, int, struct coho_smiles_bond *);
|
85
|
+
static int ringbond(struct coho_smiles *, int);
|
86
|
+
static int round_valence(int, int, int);
|
87
|
+
static void coho_smiles_atom_init(struct coho_smiles_atom *);
|
88
|
+
static void coho_smiles_bond_init(struct coho_smiles_bond *);
|
89
|
+
static void coho_smiles_reinit(struct coho_smiles *, const char *, size_t);
|
90
|
+
static int symbol(struct coho_smiles *, struct coho_smiles_atom *);
|
91
|
+
static void tokcpy(char *, struct token *, size_t);
|
92
|
+
static int wildcard(struct coho_smiles *, struct coho_smiles_atom *);
|
93
|
+
|
94
|
+
/*
 * Table of standard atom valences.
 * <atomic number> <valence>...
 * Unused valence slots hold -1.
 * NOTE(review): the final all -1 row looks like an end-of-table
 * sentinel -- confirm against the lookup code.
 */
static int standard_valences[][4] = {
	{5, 3, -1, -1},		/* B */
	{6, 4, -1, -1},		/* C */
	{7, 3, 5, -1},		/* N */
	{8, 2, -1, -1},		/* O */
	{9, 1, -1, -1},		/* F */
	{15, 3, 5, -1},		/* P */
	{16, 2, 4, 6},		/* S */
	{17, 1, -1, -1},	/* Cl */
	{35, 1, -1, -1},	/* Br */
	{53, 1, -1, -1},	/* I */
	{-1, -1, -1, -1},
};
|
111
|
+
|
112
|
+
void
|
113
|
+
coho_smiles_free(struct coho_smiles *x)
|
114
|
+
{
|
115
|
+
free(x->atoms);
|
116
|
+
free(x->bonds);
|
117
|
+
free(x->paren_stack);
|
118
|
+
}
|
119
|
+
|
120
|
+
void
|
121
|
+
coho_smiles_init(struct coho_smiles *x)
|
122
|
+
{
|
123
|
+
size_t i;
|
124
|
+
|
125
|
+
x->smiles = NULL;
|
126
|
+
x->position = 0;
|
127
|
+
x->end = 0;
|
128
|
+
x->error[0] = '\0';
|
129
|
+
x->error_position = -1;
|
130
|
+
|
131
|
+
x->atom_count = 0;
|
132
|
+
x->bond_count = 0;
|
133
|
+
|
134
|
+
x->atoms = NULL;
|
135
|
+
x->atoms_capacity = 0;
|
136
|
+
x->bonds = NULL;
|
137
|
+
x->bonds_capacity = 0;
|
138
|
+
x->paren_stack = NULL;
|
139
|
+
x->paren_stack_capacity = 0;
|
140
|
+
|
141
|
+
for (i = 0; i < 100; i++)
|
142
|
+
coho_smiles_bond_init(&x->ring_bonds[i]);
|
143
|
+
x->open_ring_closures = 0;
|
144
|
+
}
|
145
|
+
|
146
|
+
int
|
147
|
+
coho_smiles_parse(struct coho_smiles *x, const char *smiles, size_t sz)
|
148
|
+
{
|
149
|
+
struct coho_smiles_bond b;
|
150
|
+
int anum; /* index of last atom read */
|
151
|
+
int eos; /* end-of-string flag */
|
152
|
+
int rc;
|
153
|
+
size_t end;
|
154
|
+
|
155
|
+
enum {
|
156
|
+
INIT,
|
157
|
+
ATOM_READ,
|
158
|
+
BOND_READ,
|
159
|
+
DOT_READ,
|
160
|
+
OPEN_PAREN_READ,
|
161
|
+
CLOSE_PAREN_READ,
|
162
|
+
} state;
|
163
|
+
|
164
|
+
end = sz ? sz : strlen(smiles);
|
165
|
+
if (sz > INT_MAX) {
|
166
|
+
strlcpy(x->error, "SMILES too long", sizeof(x->error));
|
167
|
+
return COHO_NOMEM;
|
168
|
+
}
|
169
|
+
coho_smiles_reinit(x, smiles, end);
|
170
|
+
|
171
|
+
if (ensure_array_capacities(x, end)) {
|
172
|
+
return COHO_NOMEM;
|
173
|
+
}
|
174
|
+
|
175
|
+
b.atom0 = -1; /* no previous atom to bond to */
|
176
|
+
anum = -1;
|
177
|
+
state = INIT;
|
178
|
+
|
179
|
+
for (;;) {
|
180
|
+
eos = x->position == x->end;
|
181
|
+
|
182
|
+
switch (state) {
|
183
|
+
|
184
|
+
/*
|
185
|
+
* Parsing has just begun.
|
186
|
+
*/
|
187
|
+
case INIT:
|
188
|
+
if (eos) {
|
189
|
+
strlcpy(x->error,
|
190
|
+
"empty SMILES",
|
191
|
+
sizeof(x->error));
|
192
|
+
goto err;
|
193
|
+
}
|
194
|
+
|
195
|
+
else if ((rc = atom_ringbond(x, &anum))) {
|
196
|
+
if (rc == -1)
|
197
|
+
goto err;
|
198
|
+
}
|
199
|
+
|
200
|
+
else {
|
201
|
+
strlcpy(x->error,
|
202
|
+
"atom expected",
|
203
|
+
sizeof(x->error));
|
204
|
+
goto err;
|
205
|
+
}
|
206
|
+
state = ATOM_READ;
|
207
|
+
break;
|
208
|
+
|
209
|
+
/*
|
210
|
+
* An atom has just been read.
|
211
|
+
*/
|
212
|
+
case ATOM_READ:
|
213
|
+
/*
|
214
|
+
* If there is an open bond to the previous
|
215
|
+
* atom, complete it.
|
216
|
+
*/
|
217
|
+
if (b.atom0 != -1) {
|
218
|
+
b.atom1 = anum;
|
219
|
+
|
220
|
+
finalize_implicit_bond_order(x, &b);
|
221
|
+
|
222
|
+
if (add_bond(x, &b) == -1)
|
223
|
+
goto err;
|
224
|
+
}
|
225
|
+
|
226
|
+
/*
|
227
|
+
* The atom just read may be bonded to
|
228
|
+
* subsequent atoms.
|
229
|
+
* Store this state in an incomplete bond.
|
230
|
+
*/
|
231
|
+
coho_smiles_bond_init(&b);
|
232
|
+
b.atom0 = anum;
|
233
|
+
b.is_implicit = 1;
|
234
|
+
|
235
|
+
if (eos) {
|
236
|
+
goto done;
|
237
|
+
}
|
238
|
+
|
239
|
+
else if ((rc = atom_ringbond(x, &anum))) {
|
240
|
+
if (rc == -1)
|
241
|
+
goto err;
|
242
|
+
}
|
243
|
+
|
244
|
+
else if ((rc = bond(x, &b))) {
|
245
|
+
if (rc == -1)
|
246
|
+
goto err;
|
247
|
+
state = BOND_READ;
|
248
|
+
}
|
249
|
+
|
250
|
+
else if (dot(x)) {
|
251
|
+
state = DOT_READ;
|
252
|
+
}
|
253
|
+
|
254
|
+
else if (open_paren(x, &b)) {
|
255
|
+
state = OPEN_PAREN_READ;
|
256
|
+
}
|
257
|
+
|
258
|
+
else if ((rc = close_paren(x, &b))) {
|
259
|
+
if (rc == -1)
|
260
|
+
goto err;
|
261
|
+
state = CLOSE_PAREN_READ;
|
262
|
+
}
|
263
|
+
|
264
|
+
else {
|
265
|
+
goto unexpected;
|
266
|
+
}
|
267
|
+
|
268
|
+
break;
|
269
|
+
|
270
|
+
/*
|
271
|
+
* A dot (.) has just been read.
|
272
|
+
* An atom is expected.
|
273
|
+
* If there is a bond to a previous atom awaiting
|
274
|
+
* completion, it must be cancelled.
|
275
|
+
*/
|
276
|
+
case DOT_READ:
|
277
|
+
/* Invalidate open bond to previous atom. */
|
278
|
+
b.atom0 = -1;
|
279
|
+
|
280
|
+
if ((rc = atom_ringbond(x, &anum))) {
|
281
|
+
if (rc == -1)
|
282
|
+
goto err;
|
283
|
+
} else {
|
284
|
+
strlcpy(x->error,
|
285
|
+
"atom must follow dot",
|
286
|
+
sizeof(x->error));
|
287
|
+
goto err;
|
288
|
+
}
|
289
|
+
state = ATOM_READ;
|
290
|
+
break;
|
291
|
+
|
292
|
+
/*
|
293
|
+
* A bond (-, =, #, etc) has just been read.
|
294
|
+
* An atom is expected.
|
295
|
+
*/
|
296
|
+
case BOND_READ:
|
297
|
+
|
298
|
+
if ((rc = atom_ringbond(x, &anum))) {
|
299
|
+
if (rc == -1)
|
300
|
+
goto err;
|
301
|
+
} else {
|
302
|
+
strlcpy(x->error,
|
303
|
+
"atom must follow bond",
|
304
|
+
sizeof(x->error));
|
305
|
+
goto err;
|
306
|
+
}
|
307
|
+
state = ATOM_READ;
|
308
|
+
break;
|
309
|
+
|
310
|
+
/*
|
311
|
+
* An opening parenthesis has just been read
|
312
|
+
* and the parenthesis stack pushed.
|
313
|
+
*/
|
314
|
+
case OPEN_PAREN_READ:
|
315
|
+
|
316
|
+
if (eos) {
|
317
|
+
strlcpy(x->error,
|
318
|
+
"unbalanced parenthesis",
|
319
|
+
sizeof(x->error));
|
320
|
+
x->error_position = x->position - 1;
|
321
|
+
goto err;
|
322
|
+
}
|
323
|
+
|
324
|
+
else if ((rc = atom_ringbond(x, &anum))) {
|
325
|
+
if (rc == -1)
|
326
|
+
goto err;
|
327
|
+
state = ATOM_READ;
|
328
|
+
}
|
329
|
+
|
330
|
+
else if ((rc = bond(x, &b))) {
|
331
|
+
if (rc == -1)
|
332
|
+
goto err;
|
333
|
+
state = BOND_READ;
|
334
|
+
}
|
335
|
+
|
336
|
+
else if (dot(x)) {
|
337
|
+
state = DOT_READ;
|
338
|
+
}
|
339
|
+
|
340
|
+
else {
|
341
|
+
strlcpy(x->error,
|
342
|
+
"atom, bond, or dot expected",
|
343
|
+
sizeof(x->error));
|
344
|
+
goto err;
|
345
|
+
}
|
346
|
+
break;
|
347
|
+
|
348
|
+
/*
|
349
|
+
* A closing parenthesis has just been read
|
350
|
+
* and the parenthesis stack popped.
|
351
|
+
*/
|
352
|
+
case CLOSE_PAREN_READ:
|
353
|
+
|
354
|
+
if (eos) {
|
355
|
+
goto done;
|
356
|
+
}
|
357
|
+
|
358
|
+
else if ((rc = atom_ringbond(x, &anum))) {
|
359
|
+
if (rc == -1)
|
360
|
+
goto err;
|
361
|
+
state = ATOM_READ;
|
362
|
+
}
|
363
|
+
|
364
|
+
else if ((rc = bond(x, &b))) {
|
365
|
+
if (rc == -1)
|
366
|
+
goto err;
|
367
|
+
state = BOND_READ;
|
368
|
+
}
|
369
|
+
|
370
|
+
else if (dot(x)) {
|
371
|
+
state = DOT_READ;
|
372
|
+
}
|
373
|
+
|
374
|
+
else if (open_paren(x, &b)) {
|
375
|
+
state = OPEN_PAREN_READ;
|
376
|
+
}
|
377
|
+
|
378
|
+
else if ((rc = close_paren(x, &b))) {
|
379
|
+
if (rc == -1)
|
380
|
+
goto err;
|
381
|
+
state = CLOSE_PAREN_READ;
|
382
|
+
}
|
383
|
+
|
384
|
+
else {
|
385
|
+
goto unexpected;
|
386
|
+
}
|
387
|
+
break;
|
388
|
+
}
|
389
|
+
}
|
390
|
+
|
391
|
+
done:
|
392
|
+
assert(x->position == x->end);
|
393
|
+
|
394
|
+
if (check_ring_closures(x))
|
395
|
+
goto err;
|
396
|
+
|
397
|
+
if (x->paren_stack_count > 0) {
|
398
|
+
strlcpy(x->error, "unbalanced parenthesis", sizeof(x->error));
|
399
|
+
x->error_position = x->paren_stack[0].position;
|
400
|
+
goto err;
|
401
|
+
}
|
402
|
+
|
403
|
+
if (assign_implicit_hydrogen_count(x))
|
404
|
+
goto err;
|
405
|
+
|
406
|
+
return COHO_OK;
|
407
|
+
|
408
|
+
unexpected:
|
409
|
+
strlcpy(x->error, "unexpected character", sizeof(x->error));
|
410
|
+
err:
|
411
|
+
if (x->error_position == -1)
|
412
|
+
x->error_position = x->position;
|
413
|
+
return COHO_ERROR;
|
414
|
+
}
|
415
|
+
|
416
|
+
/*
|
417
|
+
* Parses optional atom class inside a bracket atom (ex: [C:23]).
|
418
|
+
* If successful, sets a->atom_class and increments a->length.
|
419
|
+
* Returns 1 if atom class was read, else 0.
|
420
|
+
* On error, sets x->error and returns -1.
|
421
|
+
*/
|
422
|
+
static int
|
423
|
+
atom_class(struct coho_smiles *x, struct coho_smiles_atom *a)
|
424
|
+
{
|
425
|
+
struct token t;
|
426
|
+
int n;
|
427
|
+
|
428
|
+
if (!match(x, &t, 1, COLON))
|
429
|
+
return 0;
|
430
|
+
|
431
|
+
a->length += t.n;
|
432
|
+
|
433
|
+
if ((n = integer(x, 8, &a->atom_class)) == -1) {
|
434
|
+
strlcpy(x->error, "atom class too large", sizeof(x->error));
|
435
|
+
return -1;
|
436
|
+
} else if (n == 0) {
|
437
|
+
strlcpy(x->error, "atom class expected", sizeof(x->error));
|
438
|
+
return -1;
|
439
|
+
}
|
440
|
+
|
441
|
+
a->length += n;
|
442
|
+
return 1;
|
443
|
+
}
|
444
|
+
|
445
|
+
/*
|
446
|
+
* Saves a completed atom and returns its index.
|
447
|
+
*/
|
448
|
+
static int
|
449
|
+
add_atom(struct coho_smiles *x, struct coho_smiles_atom *a)
|
450
|
+
{
|
451
|
+
x->atoms[x->atom_count] = *a;
|
452
|
+
return x->atom_count++;
|
453
|
+
}
|
454
|
+
|
455
|
+
/*
|
456
|
+
* Saves a new bond to the bond list and returns its index.
|
457
|
+
* Returns new length of bond list on success.
|
458
|
+
* If the bond is already in the list, sets x->error and returns -1.
|
459
|
+
* Bonds are added so that bond->atom0 < bond->atom1 and the entire bond list
|
460
|
+
* remains sorted.
|
461
|
+
*/
|
462
|
+
static int
|
463
|
+
add_bond(struct coho_smiles *x, struct coho_smiles_bond *bond)
|
464
|
+
{
|
465
|
+
size_t i, move;
|
466
|
+
struct coho_smiles_bond nb, *b;
|
467
|
+
|
468
|
+
nb = *bond;
|
469
|
+
|
470
|
+
/* Flip so atom0 < atom1
|
471
|
+
*/
|
472
|
+
if (bond->atom0 > bond->atom1) {
|
473
|
+
nb.atom0 = bond->atom1;
|
474
|
+
nb.atom1 = bond->atom0;
|
475
|
+
|
476
|
+
if (bond->stereo == COHO_SMILES_BOND_STEREO_UP)
|
477
|
+
nb.stereo = COHO_SMILES_BOND_STEREO_DOWN;
|
478
|
+
else if (bond->stereo == COHO_SMILES_BOND_STEREO_DOWN)
|
479
|
+
nb.stereo = COHO_SMILES_BOND_STEREO_UP;
|
480
|
+
}
|
481
|
+
|
482
|
+
/* Find position to insert and check for duplicates.
|
483
|
+
* Start search from end, since bonds are
|
484
|
+
* mostly generated in the correct order.
|
485
|
+
*/
|
486
|
+
for (i = x->bond_count; i > 0; i--) {
|
487
|
+
b = &x->bonds[i-1];
|
488
|
+
|
489
|
+
if (nb.atom0 > b->atom0)
|
490
|
+
break;
|
491
|
+
else if (nb.atom0 < b->atom0)
|
492
|
+
continue;
|
493
|
+
else if (nb.atom1 > b->atom1)
|
494
|
+
break;
|
495
|
+
else if (nb.atom1 < b->atom1)
|
496
|
+
continue;
|
497
|
+
else {
|
498
|
+
strlcpy(x->error, "duplicate bond", sizeof(x->error));
|
499
|
+
x->error_position = nb.position;
|
500
|
+
return -1;
|
501
|
+
}
|
502
|
+
}
|
503
|
+
|
504
|
+
move = x->bond_count - i; /* # elements to shift */
|
505
|
+
if (move) {
|
506
|
+
memmove(x->bonds + i + 1,
|
507
|
+
x->bonds + i,
|
508
|
+
move * sizeof(x->bonds[0]));
|
509
|
+
}
|
510
|
+
|
511
|
+
x->bonds[i] = nb;
|
512
|
+
return x->bond_count++;
|
513
|
+
}
|
514
|
+
|
515
|
+
/*
|
516
|
+
* Adds a ring bond closure.
|
517
|
+
* If there is already an open ring bond using rnum,
|
518
|
+
* it is closed and the new bond is added to the bond list.
|
519
|
+
* Otherwise, a new bond is opened.
|
520
|
+
* Returns 0 on success.
|
521
|
+
* On failure, sets x->error and returns -1.
|
522
|
+
*/
|
523
|
+
static int
|
524
|
+
add_ringbond(struct coho_smiles *x, int rnum, struct coho_smiles_bond *b)
|
525
|
+
{
|
526
|
+
struct coho_smiles_bond *rb;
|
527
|
+
|
528
|
+
assert(rnum < 100);
|
529
|
+
|
530
|
+
if (b->order == COHO_SMILES_BOND_UNSPECIFIED)
|
531
|
+
assert(b->stereo == COHO_SMILES_BOND_STEREO_UNSPECIFIED);
|
532
|
+
|
533
|
+
rb = &x->ring_bonds[rnum];
|
534
|
+
|
535
|
+
if (rb->atom0 == -1) {
|
536
|
+
rb->atom0 = b->atom0;
|
537
|
+
rb->order = b->order;
|
538
|
+
rb->stereo = b->stereo;
|
539
|
+
rb->is_implicit = 0;
|
540
|
+
rb->is_ring = 1;
|
541
|
+
rb->position = b->position;
|
542
|
+
rb->length = b->length;
|
543
|
+
x->open_ring_closures++;
|
544
|
+
return 0;
|
545
|
+
}
|
546
|
+
|
547
|
+
/* Close the open bond */
|
548
|
+
|
549
|
+
if (rb->atom0 == b->atom0) {
|
550
|
+
strlcpy(x->error,
|
551
|
+
"atom ring-bonded to itself",
|
552
|
+
sizeof(x->error));
|
553
|
+
x->error_position = x->atoms[b->atom0].position;
|
554
|
+
return -1;
|
555
|
+
}
|
556
|
+
|
557
|
+
if (rb->order == COHO_SMILES_BOND_UNSPECIFIED)
|
558
|
+
rb->order = b->order;
|
559
|
+
else if (b->order == COHO_SMILES_BOND_UNSPECIFIED)
|
560
|
+
; /* pass */
|
561
|
+
else if (rb->order != b->order) {
|
562
|
+
strlcpy(x->error,
|
563
|
+
"conflicting ring bond orders",
|
564
|
+
sizeof(x->error));
|
565
|
+
x->error_position = x->atoms[b->atom0].position;
|
566
|
+
return -1;
|
567
|
+
}
|
568
|
+
if (rb->order == COHO_SMILES_BOND_UNSPECIFIED)
|
569
|
+
rb->order = COHO_SMILES_BOND_SINGLE;
|
570
|
+
|
571
|
+
rb->atom1 = b->atom0;
|
572
|
+
|
573
|
+
if (add_bond(x, rb) == -1)
|
574
|
+
return -1;
|
575
|
+
|
576
|
+
coho_smiles_bond_init(rb);
|
577
|
+
rb->atom0 = -1; ; /* mark slot open again */
|
578
|
+
x->open_ring_closures--;
|
579
|
+
|
580
|
+
return 0;
|
581
|
+
}
|
582
|
+
|
583
|
+
/*
|
584
|
+
* Matches an aliphatic organic atom (C, N, O, etc.).
|
585
|
+
* Returns 1 on match, 0 if no match, or -1 on error.
|
586
|
+
*/
|
587
|
+
static int
|
588
|
+
aliphatic_organic(struct coho_smiles *x, struct coho_smiles_atom *a)
|
589
|
+
{
|
590
|
+
struct token t;
|
591
|
+
|
592
|
+
if (!match(x, &t, 0, ALIPHATIC_ORGANIC))
|
593
|
+
return 0;
|
594
|
+
coho_smiles_atom_init(a);
|
595
|
+
a->position = t.position;
|
596
|
+
a->atomic_number = t.intval;
|
597
|
+
a->is_organic = 1;
|
598
|
+
a->length = t.n;
|
599
|
+
tokcpy(a->symbol, &t, sizeof(a->symbol));
|
600
|
+
return 1;
|
601
|
+
}
|
602
|
+
|
603
|
+
/*
|
604
|
+
* Matches an aromatic organic atom (c, n, o, etc.).
|
605
|
+
* Returns 1 on match, 0 if no match, or -1 on error.
|
606
|
+
*/
|
607
|
+
static int
|
608
|
+
aromatic_organic(struct coho_smiles *x, struct coho_smiles_atom *a)
|
609
|
+
{
|
610
|
+
struct token t;
|
611
|
+
|
612
|
+
if (!match(x, &t, 0, AROMATIC_ORGANIC))
|
613
|
+
return 0;
|
614
|
+
coho_smiles_atom_init(a);
|
615
|
+
a->position = t.position;
|
616
|
+
a->atomic_number = t.intval;
|
617
|
+
a->is_organic = 1;
|
618
|
+
a->is_aromatic = 1;
|
619
|
+
a->length = t.n;
|
620
|
+
tokcpy(a->symbol, &t, sizeof(a->symbol));
|
621
|
+
return 1;
|
622
|
+
}
|
623
|
+
|
624
|
+
/*
|
625
|
+
* Assigns implicit hydrogen counts for all atoms that were
|
626
|
+
* specified using the organic-subset shorthand.
|
627
|
+
*/
|
628
|
+
static int
|
629
|
+
assign_implicit_hydrogen_count(struct coho_smiles *x)
|
630
|
+
{
|
631
|
+
int i, valence, std;
|
632
|
+
struct coho_smiles_atom *a;
|
633
|
+
|
634
|
+
for (i = 0; i < x->atom_count; i++) {
|
635
|
+
a = &x->atoms[i];
|
636
|
+
|
637
|
+
if (!a->is_organic)
|
638
|
+
continue;
|
639
|
+
|
640
|
+
valence = atom_valence(x, i);
|
641
|
+
std = round_valence(a->atomic_number, valence, a->is_aromatic);
|
642
|
+
|
643
|
+
if (std == -1)
|
644
|
+
a->implicit_hydrogen_count = 0;
|
645
|
+
else
|
646
|
+
a->implicit_hydrogen_count = std - valence;
|
647
|
+
}
|
648
|
+
|
649
|
+
return 0;
|
650
|
+
}
|
651
|
+
|
652
|
+
/*
|
653
|
+
* Matches an atom or returns 0 if not found.
|
654
|
+
* If successful, stores the index of the new atom in *anum and returns 1.
|
655
|
+
* On error, sets x->error and returns -1.
|
656
|
+
*/
|
657
|
+
static int
|
658
|
+
atom(struct coho_smiles *x, int *atom_index)
|
659
|
+
{
|
660
|
+
struct coho_smiles_atom a;
|
661
|
+
int rc;
|
662
|
+
|
663
|
+
if ((rc = bracket_atom(x, &a)) ||
|
664
|
+
(rc = aliphatic_organic(x, &a)) ||
|
665
|
+
(rc = aromatic_organic(x, &a)) ||
|
666
|
+
(rc = wildcard(x, &a))) {
|
667
|
+
if (rc == -1)
|
668
|
+
return -1;
|
669
|
+
} else {
|
670
|
+
return 0;
|
671
|
+
}
|
672
|
+
|
673
|
+
*atom_index = add_atom(x, &a);
|
674
|
+
return 1;
|
675
|
+
}
|
676
|
+
|
677
|
+
/*
 * Reads one atom and then every ring bond that directly follows it.
 * Returns 0 if no atom is present.
 * Returns -1 if the atom or any of its ring bonds reports an error.
 * Otherwise stores the new atom's index in *anum and returns 1.
 */
static int
atom_ringbond(struct coho_smiles *x, int *anum)
{
	int rc;

	rc = atom(x, anum);
	if (rc == 0)
		return 0;
	if (rc == -1)
		return -1;

	/* Consume ring bonds until none is left. */
	for (;;) {
		rc = ringbond(x, *anum);
		if (rc == 0)
			break;
		if (rc == -1)
			return -1;
	}

	return 1;
}
|
701
|
+
|
702
|
+
/*
|
703
|
+
* Computes the valence of an atom by summing the orders
|
704
|
+
* of its bonds.
|
705
|
+
* Treats aromatic atoms as a special case in an attempt to
|
706
|
+
* properly derive implicit hydrogen count.
|
707
|
+
*/
|
708
|
+
static int
|
709
|
+
atom_valence(struct coho_smiles *x, size_t idx)
|
710
|
+
{
|
711
|
+
int i;
|
712
|
+
int valence, neighbors;
|
713
|
+
struct coho_smiles_bond *b;
|
714
|
+
|
715
|
+
valence = 0;
|
716
|
+
neighbors = 0;
|
717
|
+
|
718
|
+
for (i = 0; i < x->bond_count; i++) {
|
719
|
+
b = &x->bonds[i];
|
720
|
+
if (b->atom0 > (int)idx)
|
721
|
+
break;
|
722
|
+
else if (b->atom0 != (int)idx && b->atom1 != (int)idx)
|
723
|
+
continue;
|
724
|
+
|
725
|
+
if (b->order == COHO_SMILES_BOND_SINGLE)
|
726
|
+
valence += 1;
|
727
|
+
else if (b->order == COHO_SMILES_BOND_AROMATIC)
|
728
|
+
valence += 1;
|
729
|
+
else if (b->order == COHO_SMILES_BOND_DOUBLE)
|
730
|
+
valence += 2;
|
731
|
+
else if (b->order == COHO_SMILES_BOND_TRIPLE)
|
732
|
+
valence += 3;
|
733
|
+
else if (b->order == COHO_SMILES_BOND_QUAD)
|
734
|
+
valence += 4;
|
735
|
+
|
736
|
+
neighbors += 1;
|
737
|
+
}
|
738
|
+
|
739
|
+
if (x->atoms[idx].is_aromatic && valence == neighbors) {
|
740
|
+
valence += 1;
|
741
|
+
}
|
742
|
+
|
743
|
+
return valence;
|
744
|
+
}
|
745
|
+
|
746
|
+
/*
|
747
|
+
* Matches a bond or returns 0 if not found.
|
748
|
+
* If found, sets fields of *b and returns 1.
|
749
|
+
* Only sets fields that can be determined by the matching bond
|
750
|
+
* token (order, stereo, position, and length).
|
751
|
+
* Clears implicit flag.
|
752
|
+
* Doesn't set bond atoms.
|
753
|
+
*/
|
754
|
+
static int
|
755
|
+
bond(struct coho_smiles *x, struct coho_smiles_bond *b)
|
756
|
+
{
|
757
|
+
struct token t;
|
758
|
+
|
759
|
+
if (!match(x, &t, 0, BOND))
|
760
|
+
return 0;
|
761
|
+
|
762
|
+
b->order = t.intval;
|
763
|
+
b->stereo = t.flags;
|
764
|
+
b->is_implicit = 0;
|
765
|
+
b->position = t.position;
|
766
|
+
b->length = t.n;
|
767
|
+
return 1;
|
768
|
+
}
|
769
|
+
|
770
|
+
/*
|
771
|
+
* Matches a bracket atom or returns 0 if not found.
|
772
|
+
* If found, initializes the atom, sets its fields, and returns 1.
|
773
|
+
* On error, sets x->error and returns -1.
|
774
|
+
*/
|
775
|
+
static int
|
776
|
+
bracket_atom(struct coho_smiles *x, struct coho_smiles_atom *a)
|
777
|
+
{
|
778
|
+
struct token t;
|
779
|
+
|
780
|
+
if (!match(x, &t, 0, BRACKET_OPEN))
|
781
|
+
return 0;
|
782
|
+
|
783
|
+
coho_smiles_atom_init(a);
|
784
|
+
a->is_bracket = 1;
|
785
|
+
a->position = t.position;
|
786
|
+
a->length = t.n;
|
787
|
+
|
788
|
+
if (isotope(x, a) == -1)
|
789
|
+
return -1;
|
790
|
+
|
791
|
+
if (symbol(x, a) == 0) {
|
792
|
+
strlcpy(x->error, "atom symbol expected", sizeof(x->error));
|
793
|
+
return -1;
|
794
|
+
}
|
795
|
+
|
796
|
+
if (chirality(x, a) == -1)
|
797
|
+
return -1;
|
798
|
+
|
799
|
+
if (hydrogen_count(x, a) == -1)
|
800
|
+
return -1;
|
801
|
+
|
802
|
+
if (charge(x, a) == -1)
|
803
|
+
return -1;
|
804
|
+
|
805
|
+
if (atom_class(x, a) == -1)
|
806
|
+
return -1;
|
807
|
+
|
808
|
+
if (!match(x, &t, 0, BRACKET_CLOSE)) {
|
809
|
+
strlcpy(x->error,
|
810
|
+
"bracket atom syntax error",
|
811
|
+
sizeof(x->error));
|
812
|
+
return -1;
|
813
|
+
}
|
814
|
+
a->length += t.n;
|
815
|
+
return 1;
|
816
|
+
}
|
817
|
+
|
818
|
+
/*
|
819
|
+
* Returns 0 if all rings have been closed.
|
820
|
+
* Otherwise, sets x->error and returns -1.
|
821
|
+
*/
|
822
|
+
static int
|
823
|
+
check_ring_closures(struct coho_smiles *x)
|
824
|
+
{
|
825
|
+
size_t i;
|
826
|
+
|
827
|
+
if (x->open_ring_closures == 0)
|
828
|
+
return 0;
|
829
|
+
|
830
|
+
strlcpy(x->error, "unclosed ring bond", sizeof(x->error));
|
831
|
+
|
832
|
+
for (i = 0; i < 100; i++) {
|
833
|
+
if (x->ring_bonds[i].atom0 != -1) {
|
834
|
+
x->error_position = x->ring_bonds[i].position;
|
835
|
+
break;
|
836
|
+
}
|
837
|
+
}
|
838
|
+
|
839
|
+
return -1;
|
840
|
+
}
|
841
|
+
|
842
|
+
/*
|
843
|
+
* Parses optional charge inside a bracket atom.
|
844
|
+
* If successful, sets a->charge and increments a->length.
|
845
|
+
* Returns 1 if charge was read, else 0.
|
846
|
+
* On error, sets x->error and returns -1.
|
847
|
+
*/
|
848
|
+
static int
|
849
|
+
charge(struct coho_smiles *x, struct coho_smiles_atom *a)
|
850
|
+
{
|
851
|
+
struct token t;
|
852
|
+
int sign;
|
853
|
+
int n;
|
854
|
+
int length;
|
855
|
+
|
856
|
+
if (!match(x, &t, 1, PLUS | MINUS))
|
857
|
+
return 0;
|
858
|
+
sign = t.intval;
|
859
|
+
length = t.n;
|
860
|
+
|
861
|
+
if ((n = integer(x, 2, &a->charge)) == -1) {
|
862
|
+
strlcpy(x->error, "charge too large", sizeof(x->error));
|
863
|
+
return -1;
|
864
|
+
} else if (n) {
|
865
|
+
a->charge *= sign;
|
866
|
+
length += n;
|
867
|
+
} else {
|
868
|
+
a->charge = sign;
|
869
|
+
|
870
|
+
if (lex(x, &t, 1) & (PLUS | MINUS)) {
|
871
|
+
if (t.intval == sign) {
|
872
|
+
x->position += t.n;
|
873
|
+
a->charge *= 2;
|
874
|
+
length += t.n;
|
875
|
+
}
|
876
|
+
}
|
877
|
+
}
|
878
|
+
|
879
|
+
a->length += length;
|
880
|
+
return 1;
|
881
|
+
}
|
882
|
+
|
883
|
+
/*
|
884
|
+
* Parses chirality inside a bracket atom.
|
885
|
+
* If successful, sets a->chirality and increments a->length.
|
886
|
+
* Returns 1 if chirality was read, else 0.
|
887
|
+
* TODO: Currently, this only understands @ and @@.
|
888
|
+
*/
|
889
|
+
static int
|
890
|
+
chirality(struct coho_smiles *x, struct coho_smiles_atom *a)
|
891
|
+
{
|
892
|
+
struct token t;
|
893
|
+
|
894
|
+
if (!match(x, &t, 1, CHIRALITY))
|
895
|
+
return 0;
|
896
|
+
tokcpy(a->chirality, &t, sizeof(a->chirality));
|
897
|
+
a->length += t.n;
|
898
|
+
return 1;
|
899
|
+
}
|
900
|
+
|
901
|
+
/*
|
902
|
+
* Matches a closing parenthesis that ends a branch.
|
903
|
+
* On success, pops the parenthesis stack and returns 1.
|
904
|
+
* Returns 0 if there was no match.
|
905
|
+
* On error, sets x->error and returns -1.
|
906
|
+
*/
|
907
|
+
static int
|
908
|
+
close_paren(struct coho_smiles *x, struct coho_smiles_bond *b)
|
909
|
+
{
|
910
|
+
struct token t;
|
911
|
+
|
912
|
+
if (!match(x, &t, 0, PAREN_CLOSE))
|
913
|
+
return 0;
|
914
|
+
|
915
|
+
if (pop_paren_stack(x, t.position, b))
|
916
|
+
return -1;
|
917
|
+
return 1;
|
918
|
+
}
|
919
|
+
|
920
|
+
/*
|
921
|
+
* Matches dot, the no-bond specifier.
|
922
|
+
* Returns 1 on success, 0 if there was no match.
|
923
|
+
*/
|
924
|
+
static int
|
925
|
+
dot(struct coho_smiles *x)
|
926
|
+
{
|
927
|
+
struct token t;
|
928
|
+
|
929
|
+
return match(x, &t, 0, DOT);
|
930
|
+
}
|
931
|
+
|
932
|
+
static int
|
933
|
+
ensure_array_capacities(struct coho_smiles *x, size_t smiles_length)
|
934
|
+
{
|
935
|
+
size_t new_capacity;
|
936
|
+
void *p;
|
937
|
+
|
938
|
+
/*
|
939
|
+
* Maximum required storage is bounded by length of SMILES string.
|
940
|
+
*/
|
941
|
+
if (x->atoms_capacity >= smiles_length)
|
942
|
+
return 0;
|
943
|
+
|
944
|
+
new_capacity = next_array_capacity(smiles_length);
|
945
|
+
|
946
|
+
#define GROW(name) \
|
947
|
+
do { \
|
948
|
+
p = reallocarray(x->name, \
|
949
|
+
new_capacity, \
|
950
|
+
sizeof(x->name[0])); \
|
951
|
+
if (p == NULL) \
|
952
|
+
return -1; \
|
953
|
+
x->name = p; \
|
954
|
+
x->name##_capacity = new_capacity; \
|
955
|
+
} while (0)
|
956
|
+
|
957
|
+
GROW(atoms);
|
958
|
+
GROW(bonds);
|
959
|
+
GROW(paren_stack);
|
960
|
+
|
961
|
+
#undef GROW
|
962
|
+
return 0;
|
963
|
+
}
|
964
|
+
|
965
|
+
/*
|
966
|
+
* Sets the order of an implicit bond according to
|
967
|
+
* the aromaticity of the two atoms.
|
968
|
+
*/
|
969
|
+
static void
|
970
|
+
finalize_implicit_bond_order(struct coho_smiles *x, struct coho_smiles_bond *b)
|
971
|
+
{
|
972
|
+
if (!b->is_implicit)
|
973
|
+
return;
|
974
|
+
|
975
|
+
if (x->atoms[b->atom0].is_aromatic && x->atoms[b->atom1].is_aromatic)
|
976
|
+
b->order = COHO_SMILES_BOND_AROMATIC;
|
977
|
+
else
|
978
|
+
b->order = COHO_SMILES_BOND_SINGLE;
|
979
|
+
}
|
980
|
+
|
981
|
+
/*
|
982
|
+
* Parses hydrogen count inside a bracket atom.
|
983
|
+
* If successful, sets a->hydrogen_count and increments a->length.
|
984
|
+
* Returns 1 if hydrogen_count was read, else 0.
|
985
|
+
*/
|
986
|
+
static int
|
987
|
+
hydrogen_count(struct coho_smiles *x, struct coho_smiles_atom *a)
|
988
|
+
{
|
989
|
+
struct token t;
|
990
|
+
|
991
|
+
if (!match(x, &t, 1, HYDROGEN))
|
992
|
+
return 0;
|
993
|
+
|
994
|
+
a->length += t.n;
|
995
|
+
|
996
|
+
if (match(x, &t, 1, DIGIT)) {
|
997
|
+
a->hydrogen_count = t.intval;
|
998
|
+
a->length += t.n;
|
999
|
+
} else {
|
1000
|
+
a->hydrogen_count = 1;
|
1001
|
+
}
|
1002
|
+
|
1003
|
+
return 1;
|
1004
|
+
}
|
1005
|
+
|
1006
|
+
/*
|
1007
|
+
* Matches an integer up to maxdigit long.
|
1008
|
+
* On success, stores the integer in *dst and returns number of digits.
|
1009
|
+
* Returns 0 if no digits are available.
|
1010
|
+
* Returns -1 if maxdigit is exceeded.
|
1011
|
+
*/
|
1012
|
+
static int
|
1013
|
+
integer(struct coho_smiles *x, size_t maxdigit, int *dst)
|
1014
|
+
{
|
1015
|
+
size_t i;
|
1016
|
+
int n = 0;
|
1017
|
+
int saved = x->position;
|
1018
|
+
struct token t;
|
1019
|
+
|
1020
|
+
for (i = 0; lex(x, &t, 0) & DIGIT; i++) {
|
1021
|
+
if (maxdigit && i == maxdigit) {
|
1022
|
+
x->position = saved;
|
1023
|
+
return -1;
|
1024
|
+
}
|
1025
|
+
x->position += t.n;
|
1026
|
+
n = n * 10 + t.intval;
|
1027
|
+
}
|
1028
|
+
if (i == 0)
|
1029
|
+
return 0;
|
1030
|
+
*dst = n;
|
1031
|
+
return i;
|
1032
|
+
}
|
1033
|
+
|
1034
|
+
/*
|
1035
|
+
* Parses isotope inside a bracket atom.
|
1036
|
+
* If successful, sets a->isotope and increments a->length.
|
1037
|
+
* Returns 1 if isotope was read, else 0.
|
1038
|
+
* On error, returns -1 and sets x->error.
|
1039
|
+
*/
|
1040
|
+
static int
|
1041
|
+
isotope(struct coho_smiles *x, struct coho_smiles_atom *a)
|
1042
|
+
{
|
1043
|
+
int n;
|
1044
|
+
|
1045
|
+
if ((n = integer(x, 5, &a->isotope)) == -1) {
|
1046
|
+
strlcpy(x->error, "isotope too large", sizeof(x->error));
|
1047
|
+
return -1;
|
1048
|
+
}
|
1049
|
+
a->length += n;
|
1050
|
+
return 0;
|
1051
|
+
}
|
1052
|
+
|
1053
|
+
/*
|
1054
|
+
* Reads next token and checks if its type is among those requested.
|
1055
|
+
* If so, consumes the token and returns 1.
|
1056
|
+
* If not, returns 0 and the parsing position remains unchanged.
|
1057
|
+
*/
|
1058
|
+
static int
|
1059
|
+
match(struct coho_smiles *x, struct token *t, int inbracket, unsigned int ttype)
|
1060
|
+
{
|
1061
|
+
if (lex(x, t, inbracket) & ttype) {
|
1062
|
+
x->position += t->n;
|
1063
|
+
return 1;
|
1064
|
+
}
|
1065
|
+
return 0;
|
1066
|
+
}
|
1067
|
+
|
1068
|
+
/*
 * Rounds a required element count up to a power of two.
 * Returns the smallest power of two that is greater than or equal to
 * previous_capacity (so a value that is already a power of two is
 * returned unchanged).
 * A request of 0 yields 1.
 * If no power of two that large fits in a size_t, returns
 * previous_capacity itself rather than wrapping around to a value
 * smaller than what was asked for.
 */
static size_t
next_array_capacity(size_t previous_capacity)
{
	const size_t half_max = (size_t)-1 / 2;
	size_t cap = 1;

	while (cap < previous_capacity) {
		/* Doubling again would overflow size_t. */
		if (cap > half_max)
			return previous_capacity;
		cap *= 2;
	}

	return cap;
}
|
1081
|
+
|
1082
|
+
/*
|
1083
|
+
* Matches an opening parenthesis that begins a branch.
|
1084
|
+
* On success, pushes the parenthesis stack and returns 1.
|
1085
|
+
* Returns 0 if there was no match.
|
1086
|
+
*/
|
1087
|
+
static int
|
1088
|
+
open_paren(struct coho_smiles *x, struct coho_smiles_bond *b)
|
1089
|
+
{
|
1090
|
+
struct token t;
|
1091
|
+
|
1092
|
+
if (!match(x, &t, 0, PAREN_OPEN))
|
1093
|
+
return 0;
|
1094
|
+
|
1095
|
+
push_paren_stack(x, t.position, b);
|
1096
|
+
return 1;
|
1097
|
+
}
|
1098
|
+
|
1099
|
+
/*
|
1100
|
+
* Pops the parenthesis stack that holds the open bonds to
|
1101
|
+
* "previous" atoms.
|
1102
|
+
* Ex: In C(N)=O the closing parenthesis will trigger the popping of
|
1103
|
+
* the stack, ensuring that the oxygen is bonded to the carbon instead
|
1104
|
+
* of the nitrogen.
|
1105
|
+
* The position of the parenthesis triggering the pop is used for
|
1106
|
+
* error messages.
|
1107
|
+
* Returns 0 on success.
|
1108
|
+
* On failure, sets x->error and returns -1.
|
1109
|
+
*/
|
1110
|
+
static int
|
1111
|
+
pop_paren_stack(struct coho_smiles *x, int position, struct coho_smiles_bond *b)
|
1112
|
+
{
|
1113
|
+
if (!x->paren_stack_count) {
|
1114
|
+
strlcpy(x->error, "unbalanced parenthesis", sizeof(x->error));
|
1115
|
+
x->error_position = position;
|
1116
|
+
return -1;
|
1117
|
+
}
|
1118
|
+
|
1119
|
+
*b = x->paren_stack[--x->paren_stack_count].bond;
|
1120
|
+
return 0;
|
1121
|
+
}
|
1122
|
+
|
1123
|
+
/*
|
1124
|
+
* Pushes the parenthesis stack that holds open bonds to
|
1125
|
+
* "previous" atoms.
|
1126
|
+
* Ex: In C(N)=O the first parenthesis will trigger the pushing of
|
1127
|
+
* an open bond to the carbon onto the stack.
|
1128
|
+
* The closing parenthesis will pop the stack, ensuring that the carbon
|
1129
|
+
* is correctly bonded to the oxygen.
|
1130
|
+
* The position of the parenthesis triggering the push is stored
|
1131
|
+
* to support error messages.
|
1132
|
+
*/
|
1133
|
+
static void
|
1134
|
+
push_paren_stack(struct coho_smiles *x,
|
1135
|
+
int position,
|
1136
|
+
struct coho_smiles_bond *b)
|
1137
|
+
{
|
1138
|
+
struct coho_smiles_paren *p;
|
1139
|
+
|
1140
|
+
assert(b->atom0 != -1);
|
1141
|
+
|
1142
|
+
p = &x->paren_stack[x->paren_stack_count++];
|
1143
|
+
p->position = position;
|
1144
|
+
p->bond = *b;
|
1145
|
+
}
|
1146
|
+
|
1147
|
+
/*
|
1148
|
+
* Matches a ring bond or returns 0 if not found.
|
1149
|
+
* On error, sets x->error and returns -1.
|
1150
|
+
* On success, uses atom anum to open or close a ring
|
1151
|
+
* bond and then returns 1.
|
1152
|
+
* If the parsed ring bond ID is in use, closes it and adds a new bond
|
1153
|
+
* to the bond list.
|
1154
|
+
* Otherwise, marks the ring ID as open.
|
1155
|
+
*/
|
1156
|
+
static int
|
1157
|
+
ringbond(struct coho_smiles *x, int anum)
|
1158
|
+
{
|
1159
|
+
struct token t;
|
1160
|
+
struct coho_smiles_bond b;
|
1161
|
+
int rc;
|
1162
|
+
int rnum;
|
1163
|
+
int saved = x->position;
|
1164
|
+
|
1165
|
+
coho_smiles_bond_init(&b);
|
1166
|
+
b.atom0 = anum;
|
1167
|
+
|
1168
|
+
if ((rc = bond(x, &b))) {
|
1169
|
+
if (rc == -1)
|
1170
|
+
return -1;
|
1171
|
+
} else {
|
1172
|
+
b.order = COHO_SMILES_BOND_UNSPECIFIED;
|
1173
|
+
b.position = x->position;
|
1174
|
+
}
|
1175
|
+
|
1176
|
+
if (!match(x, &t, 0, PERCENT | DIGIT)) {
|
1177
|
+
x->position = saved;
|
1178
|
+
return 0;
|
1179
|
+
}
|
1180
|
+
|
1181
|
+
if (t.type == PERCENT) {
|
1182
|
+
if (!match(x, &t, 0, DIGIT)) {
|
1183
|
+
strlcpy(x->error,
|
1184
|
+
"ring bond expected",
|
1185
|
+
sizeof(x->error));
|
1186
|
+
return -1;
|
1187
|
+
}
|
1188
|
+
rnum = t.intval * 10;
|
1189
|
+
|
1190
|
+
if (!match(x, &t, 0, DIGIT)) {
|
1191
|
+
strlcpy(x->error,
|
1192
|
+
"2 digit ring bond expected",
|
1193
|
+
sizeof(x->error));
|
1194
|
+
return -1;
|
1195
|
+
}
|
1196
|
+
rnum += t.intval;
|
1197
|
+
} else {
|
1198
|
+
rnum = t.intval;
|
1199
|
+
}
|
1200
|
+
|
1201
|
+
if (add_ringbond(x, rnum, &b))
|
1202
|
+
return -1;
|
1203
|
+
return 1;
|
1204
|
+
}
|
1205
|
+
|
1206
|
+
/*
|
1207
|
+
* Rounds an atom's current valence to its next standard one.
|
1208
|
+
* Returns its current valence if it among the standard ones.
|
1209
|
+
* Otherwise, returns the next higher standard one or -1 if
|
1210
|
+
* none are found.
|
1211
|
+
* Setting lowest_only to true causes the search to stop after
|
1212
|
+
* the first standard valence, disregarding higher valences.
|
1213
|
+
*/
|
1214
|
+
static int
|
1215
|
+
round_valence(int atomic_number, int valence, int lowest_only)
|
1216
|
+
{
|
1217
|
+
int i, j, anum;
|
1218
|
+
|
1219
|
+
for (i = 0; (anum = standard_valences[i][0]) != -1; i++) {
|
1220
|
+
if (anum > atomic_number)
|
1221
|
+
break;
|
1222
|
+
else if (anum == atomic_number) {
|
1223
|
+
for (j = 1; j < 4; j++) {
|
1224
|
+
if (valence <= standard_valences[i][j])
|
1225
|
+
return standard_valences[i][j];
|
1226
|
+
if (lowest_only)
|
1227
|
+
break;
|
1228
|
+
}
|
1229
|
+
}
|
1230
|
+
}
|
1231
|
+
return -1;
|
1232
|
+
}
|
1233
|
+
|
1234
|
+
/*
|
1235
|
+
* Initializes struct coho_smiles_atom.
|
1236
|
+
*/
|
1237
|
+
static void
|
1238
|
+
coho_smiles_atom_init(struct coho_smiles_atom *x)
|
1239
|
+
{
|
1240
|
+
x->atomic_number = 0;
|
1241
|
+
x->symbol[0] = '\0';
|
1242
|
+
x->isotope = -1;
|
1243
|
+
x->charge = 0;
|
1244
|
+
x->hydrogen_count = -1;
|
1245
|
+
x->implicit_hydrogen_count = -1;
|
1246
|
+
x->is_bracket = 0;
|
1247
|
+
x->is_organic = 0;
|
1248
|
+
x->is_aromatic = 0;
|
1249
|
+
x->chirality[0] = '\0';
|
1250
|
+
x->atom_class = -1;
|
1251
|
+
x->position = -1;
|
1252
|
+
x->length = 0;
|
1253
|
+
}
|
1254
|
+
|
1255
|
+
/*
|
1256
|
+
* Initializes struct coho_smiles_bond.
|
1257
|
+
*/
|
1258
|
+
static void
|
1259
|
+
coho_smiles_bond_init(struct coho_smiles_bond *x)
|
1260
|
+
{
|
1261
|
+
x->atom0 = -1;
|
1262
|
+
x->atom1 = -1;
|
1263
|
+
x->order = -1;
|
1264
|
+
x->stereo = COHO_SMILES_BOND_STEREO_UNSPECIFIED;
|
1265
|
+
x->is_implicit = 0;
|
1266
|
+
x->is_ring = 0;
|
1267
|
+
x->position = -1;
|
1268
|
+
x->length = 0;
|
1269
|
+
}
|
1270
|
+
|
1271
|
+
/*
|
1272
|
+
* Reinitializes struct coho_smiles prior to parsing a new SMILES.
|
1273
|
+
* The given number of bytes of smiles will be parsed.
|
1274
|
+
*/
|
1275
|
+
static void
|
1276
|
+
coho_smiles_reinit(struct coho_smiles *x, const char *smiles, size_t end)
|
1277
|
+
{
|
1278
|
+
size_t i;
|
1279
|
+
|
1280
|
+
x->smiles = smiles;
|
1281
|
+
x->position = 0;
|
1282
|
+
x->end = end;
|
1283
|
+
x->error[0] = '\0';
|
1284
|
+
x->error_position = -1;
|
1285
|
+
x->atom_count = 0;
|
1286
|
+
x->bond_count = 0;
|
1287
|
+
x->paren_stack_count = 0;
|
1288
|
+
|
1289
|
+
for (i = 0; i < 100; i++)
|
1290
|
+
coho_smiles_bond_init(&x->ring_bonds[i]);
|
1291
|
+
x->open_ring_closures = 0;
|
1292
|
+
}
|
1293
|
+
|
1294
|
+
/*
|
1295
|
+
* Parses atom symbol inside a bracket atom.
|
1296
|
+
* If successful, sets a->symbol, a->is_aromatic, and increments a->length.
|
1297
|
+
* Returns 1 if symbol was read, else 0.
|
1298
|
+
*/
|
1299
|
+
static int
|
1300
|
+
symbol(struct coho_smiles *x, struct coho_smiles_atom *a)
|
1301
|
+
{
|
1302
|
+
struct token t;
|
1303
|
+
|
1304
|
+
if (!match(x, &t, 1, ELEMENT | AROMATIC | WILDCARD))
|
1305
|
+
return 0;
|
1306
|
+
a->atomic_number = t.intval;
|
1307
|
+
a->is_aromatic = t.type & AROMATIC ? 1 : 0;
|
1308
|
+
a->length += t.n;
|
1309
|
+
tokcpy(a->symbol, &t, sizeof(a->symbol));
|
1310
|
+
return 1;
|
1311
|
+
}
|
1312
|
+
|
1313
|
+
/*
|
1314
|
+
* Copies up to dstsz - 1 bytes from the token to dst, NUL-terminating
|
1315
|
+
* dst if dstsz is not 0.
|
1316
|
+
*/
|
1317
|
+
static void
|
1318
|
+
tokcpy(char *dst, struct token *t, size_t dstsz)
|
1319
|
+
{
|
1320
|
+
size_t i;
|
1321
|
+
|
1322
|
+
if (dstsz == 0)
|
1323
|
+
return;
|
1324
|
+
|
1325
|
+
for (i = 0; i < t->n; i++) {
|
1326
|
+
if (i == dstsz - 1)
|
1327
|
+
break;
|
1328
|
+
dst[i] = t->s[i];
|
1329
|
+
}
|
1330
|
+
|
1331
|
+
dst[i] = 0;
|
1332
|
+
}
|
1333
|
+
|
1334
|
+
/*
|
1335
|
+
* Matches a wildcard atom (*) or returns 0 if not found.
|
1336
|
+
* If found, initializes the atom, sets its fields, and returns 1.
|
1337
|
+
* On error, sets x->error and returns -1.
|
1338
|
+
*/
|
1339
|
+
static int
|
1340
|
+
wildcard(struct coho_smiles *x, struct coho_smiles_atom *a)
|
1341
|
+
{
|
1342
|
+
struct token t;
|
1343
|
+
|
1344
|
+
if (!match(x, &t, 0, WILDCARD))
|
1345
|
+
return 0;
|
1346
|
+
coho_smiles_atom_init(a);
|
1347
|
+
a->position = t.position;
|
1348
|
+
a->atomic_number = 0;
|
1349
|
+
a->length = t.n;
|
1350
|
+
tokcpy(a->symbol, &t, sizeof(a->symbol));
|
1351
|
+
return 1;
|
1352
|
+
}
|
1353
|
+
|
1354
|
+
/*
|
1355
|
+
* Reads next token from SMILES string.
|
1356
|
+
* The inbracket parameter should be set to true when parsing is
|
1357
|
+
* inside a bracket atom.
|
1358
|
+
* Returns the token type or zero if no token could be read.
|
1359
|
+
* The token type is a bitmask since a particular token can belong
|
1360
|
+
* to multiple categories. For example, the symbol for
|
1361
|
+
* hydrogen will have type ELEMENT | HYDROGEN.
|
1362
|
+
*/
|
1363
|
+
static unsigned int
|
1364
|
+
lex(struct coho_smiles *x, struct token *t, int inbracket)
|
1365
|
+
{
|
1366
|
+
int c0, c1;
|
1367
|
+
const char *s;
|
1368
|
+
|
1369
|
+
if (x->position == x->end)
|
1370
|
+
return 0;
|
1371
|
+
|
1372
|
+
s = x->smiles + x->position;
|
1373
|
+
c0 = s[0];
|
1374
|
+
c1 = 0;
|
1375
|
+
|
1376
|
+
if (x->position < x->end)
|
1377
|
+
c1 = s[1];
|
1378
|
+
|
1379
|
+
t->s = s;
|
1380
|
+
t->position = x->position;
|
1381
|
+
t->n = 1;
|
1382
|
+
t->type = 0;
|
1383
|
+
t->intval = -1;
|
1384
|
+
t->flags = 0;
|
1385
|
+
|
1386
|
+
switch (c0) {
|
1387
|
+
case 'a':
|
1388
|
+
if (inbracket && c1 == 's') {
|
1389
|
+
t->n = 2;
|
1390
|
+
t->type = AROMATIC;
|
1391
|
+
t->intval = 33;
|
1392
|
+
goto out;
|
1393
|
+
}
|
1394
|
+
return 0;
|
1395
|
+
case 'b':
|
1396
|
+
t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
|
1397
|
+
t->intval = 5;
|
1398
|
+
goto out;
|
1399
|
+
case 'c':
|
1400
|
+
t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
|
1401
|
+
t->intval = 6;
|
1402
|
+
goto out;
|
1403
|
+
case 'n':
|
1404
|
+
t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
|
1405
|
+
t->intval = 7;
|
1406
|
+
goto out;
|
1407
|
+
case 'o':
|
1408
|
+
t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
|
1409
|
+
t->intval = 8;
|
1410
|
+
goto out;
|
1411
|
+
case 'p':
|
1412
|
+
t->type = inbracket ? AROMATIC : AROMATIC_ORGANIC;
|
1413
|
+
t->intval = 15;
|
1414
|
+
goto out;
|
1415
|
+
case 's':
|
1416
|
+
if (!inbracket) {
|
1417
|
+
t->type = AROMATIC_ORGANIC;
|
1418
|
+
t->intval = 16;
|
1419
|
+
goto out;
|
1420
|
+
}
|
1421
|
+
switch (c1) {
|
1422
|
+
case 'e':
|
1423
|
+
t->type = AROMATIC;
|
1424
|
+
t->n = 2;
|
1425
|
+
t->intval = 34;
|
1426
|
+
goto out;
|
1427
|
+
default:
|
1428
|
+
t->type = AROMATIC;
|
1429
|
+
t->intval = 16;
|
1430
|
+
goto out;
|
1431
|
+
}
|
1432
|
+
case 'A':
|
1433
|
+
switch (c1) {
|
1434
|
+
case 'c':
|
1435
|
+
t->type = ELEMENT;
|
1436
|
+
t->intval = 89;
|
1437
|
+
t->n = 2;
|
1438
|
+
goto out;
|
1439
|
+
case 'g':
|
1440
|
+
t->type = ELEMENT;
|
1441
|
+
t->intval = 47;
|
1442
|
+
t->n = 2;
|
1443
|
+
goto out;
|
1444
|
+
case 'l':
|
1445
|
+
t->type = ELEMENT;
|
1446
|
+
t->intval = 13;
|
1447
|
+
t->n = 2;
|
1448
|
+
goto out;
|
1449
|
+
case 'm':
|
1450
|
+
t->type = ELEMENT;
|
1451
|
+
t->intval = 95;
|
1452
|
+
t->n = 2;
|
1453
|
+
goto out;
|
1454
|
+
case 'r':
|
1455
|
+
t->type = ELEMENT;
|
1456
|
+
t->intval = 18;
|
1457
|
+
t->n = 2;
|
1458
|
+
goto out;
|
1459
|
+
case 's':
|
1460
|
+
t->type = ELEMENT;
|
1461
|
+
t->intval = 33;
|
1462
|
+
t->n = 2;
|
1463
|
+
goto out;
|
1464
|
+
case 't':
|
1465
|
+
t->type = ELEMENT;
|
1466
|
+
t->intval = 85;
|
1467
|
+
t->n = 2;
|
1468
|
+
goto out;
|
1469
|
+
case 'u':
|
1470
|
+
t->type = ELEMENT;
|
1471
|
+
t->intval = 79;
|
1472
|
+
t->n = 2;
|
1473
|
+
goto out;
|
1474
|
+
default:
|
1475
|
+
return 0;
|
1476
|
+
}
|
1477
|
+
case 'B':
|
1478
|
+
if (!inbracket) {
|
1479
|
+
if (c1 == 'r') {
|
1480
|
+
t->intval = 35;
|
1481
|
+
t->n = 2;
|
1482
|
+
} else {
|
1483
|
+
t->intval = 5;
|
1484
|
+
}
|
1485
|
+
t->type = ALIPHATIC_ORGANIC;
|
1486
|
+
goto out;
|
1487
|
+
}
|
1488
|
+
switch (c1) {
|
1489
|
+
case 'a':
|
1490
|
+
t->type = ELEMENT;
|
1491
|
+
t->intval = 56;
|
1492
|
+
t->n = 2;
|
1493
|
+
goto out;
|
1494
|
+
case 'e':
|
1495
|
+
t->type = ELEMENT;
|
1496
|
+
t->intval = 4;
|
1497
|
+
t->n = 2;
|
1498
|
+
goto out;
|
1499
|
+
case 'h':
|
1500
|
+
t->type = ELEMENT;
|
1501
|
+
t->intval = 107;
|
1502
|
+
t->n = 2;
|
1503
|
+
goto out;
|
1504
|
+
case 'i':
|
1505
|
+
t->type = ELEMENT;
|
1506
|
+
t->intval = 83;
|
1507
|
+
t->n = 2;
|
1508
|
+
goto out;
|
1509
|
+
case 'k':
|
1510
|
+
t->type = ELEMENT;
|
1511
|
+
t->intval = 97;
|
1512
|
+
t->n = 2;
|
1513
|
+
goto out;
|
1514
|
+
case 'r':
|
1515
|
+
t->type = ELEMENT;
|
1516
|
+
t->intval = 35;
|
1517
|
+
t->n = 2;
|
1518
|
+
goto out;
|
1519
|
+
default:
|
1520
|
+
t->type = ELEMENT;
|
1521
|
+
t->intval = 5;
|
1522
|
+
goto out;
|
1523
|
+
}
|
1524
|
+
case 'C':
|
1525
|
+
if (!inbracket) {
|
1526
|
+
if (c1 == 'l') {
|
1527
|
+
t->intval = 17;
|
1528
|
+
t->n = 2;
|
1529
|
+
} else {
|
1530
|
+
t->intval = 6;
|
1531
|
+
}
|
1532
|
+
t->type = ALIPHATIC_ORGANIC;
|
1533
|
+
goto out;
|
1534
|
+
}
|
1535
|
+
switch (c1) {
|
1536
|
+
case 'a':
|
1537
|
+
t->type = ELEMENT;
|
1538
|
+
t->n = 2;
|
1539
|
+
t->intval = 20;
|
1540
|
+
goto out;
|
1541
|
+
case 'd':
|
1542
|
+
t->type = ELEMENT;
|
1543
|
+
t->n = 2;
|
1544
|
+
t->intval = 20;
|
1545
|
+
goto out;
|
1546
|
+
case 'e':
|
1547
|
+
t->type = ELEMENT;
|
1548
|
+
t->n = 2;
|
1549
|
+
t->intval = 58;
|
1550
|
+
goto out;
|
1551
|
+
case 'f':
|
1552
|
+
t->type = ELEMENT;
|
1553
|
+
t->n = 2;
|
1554
|
+
t->intval = 98;
|
1555
|
+
goto out;
|
1556
|
+
case 'l':
|
1557
|
+
t->type = ELEMENT;
|
1558
|
+
t->n = 2;
|
1559
|
+
t->intval = 17;
|
1560
|
+
goto out;
|
1561
|
+
case 'm':
|
1562
|
+
t->type = ELEMENT;
|
1563
|
+
t->n = 2;
|
1564
|
+
t->intval = 96;
|
1565
|
+
goto out;
|
1566
|
+
case 'n':
|
1567
|
+
t->type = ELEMENT;
|
1568
|
+
t->n = 2;
|
1569
|
+
t->intval = 112;
|
1570
|
+
goto out;
|
1571
|
+
case 'o':
|
1572
|
+
t->type = ELEMENT;
|
1573
|
+
t->n = 2;
|
1574
|
+
t->intval = 27;
|
1575
|
+
goto out;
|
1576
|
+
case 'r':
|
1577
|
+
t->type = ELEMENT;
|
1578
|
+
t->n = 2;
|
1579
|
+
t->intval = 24;
|
1580
|
+
goto out;
|
1581
|
+
case 's':
|
1582
|
+
t->type = ELEMENT;
|
1583
|
+
t->n = 2;
|
1584
|
+
t->intval = 55;
|
1585
|
+
goto out;
|
1586
|
+
case 'u':
|
1587
|
+
t->type = ELEMENT;
|
1588
|
+
t->n = 2;
|
1589
|
+
t->intval = 29;
|
1590
|
+
goto out;
|
1591
|
+
default:
|
1592
|
+
t->type = ELEMENT;
|
1593
|
+
t->intval = 6;
|
1594
|
+
goto out;
|
1595
|
+
}
|
1596
|
+
case 'D':
|
1597
|
+
switch (c1) {
|
1598
|
+
case 'b':
|
1599
|
+
t->type = ELEMENT;
|
1600
|
+
t->n = 2;
|
1601
|
+
t->intval = 105;
|
1602
|
+
goto out;
|
1603
|
+
case 's':
|
1604
|
+
t->type = ELEMENT;
|
1605
|
+
t->n = 2;
|
1606
|
+
t->intval = 110;
|
1607
|
+
goto out;
|
1608
|
+
case 'y':
|
1609
|
+
t->type = ELEMENT;
|
1610
|
+
t->n = 2;
|
1611
|
+
t->intval = 66;
|
1612
|
+
goto out;
|
1613
|
+
default:
|
1614
|
+
return 0;
|
1615
|
+
}
|
1616
|
+
case 'E':
|
1617
|
+
switch (c1) {
|
1618
|
+
case 'r':
|
1619
|
+
t->type = ELEMENT;
|
1620
|
+
t->n = 2;
|
1621
|
+
t->intval = 68;
|
1622
|
+
goto out;
|
1623
|
+
case 's':
|
1624
|
+
t->type = ELEMENT;
|
1625
|
+
t->n = 2;
|
1626
|
+
t->intval = 99;
|
1627
|
+
goto out;
|
1628
|
+
case 'u':
|
1629
|
+
t->type = ELEMENT;
|
1630
|
+
t->n = 2;
|
1631
|
+
t->intval = 63;
|
1632
|
+
goto out;
|
1633
|
+
default:
|
1634
|
+
return 0;
|
1635
|
+
}
|
1636
|
+
case 'F':
|
1637
|
+
if (!inbracket) {
|
1638
|
+
t->intval = 9;
|
1639
|
+
t->type = ALIPHATIC_ORGANIC;
|
1640
|
+
goto out;
|
1641
|
+
}
|
1642
|
+
switch (c1) {
|
1643
|
+
case 'e':
|
1644
|
+
t->type = ELEMENT;
|
1645
|
+
t->n = 2;
|
1646
|
+
t->intval = 26;
|
1647
|
+
goto out;
|
1648
|
+
case 'l':
|
1649
|
+
t->type = ELEMENT;
|
1650
|
+
t->n = 2;
|
1651
|
+
t->intval = 114;
|
1652
|
+
goto out;
|
1653
|
+
case 'm':
|
1654
|
+
t->type = ELEMENT;
|
1655
|
+
t->n = 2;
|
1656
|
+
t->intval = 100;
|
1657
|
+
goto out;
|
1658
|
+
case 'r':
|
1659
|
+
t->type = ELEMENT;
|
1660
|
+
t->n = 2;
|
1661
|
+
t->intval = 87;
|
1662
|
+
goto out;
|
1663
|
+
default:
|
1664
|
+
t->type = ELEMENT;
|
1665
|
+
t->intval = 9;
|
1666
|
+
goto out;
|
1667
|
+
}
|
1668
|
+
case 'G':
|
1669
|
+
switch (c1) {
|
1670
|
+
case 'a':
|
1671
|
+
t->type = ELEMENT;
|
1672
|
+
t->n = 2;
|
1673
|
+
t->intval = 31;
|
1674
|
+
goto out;
|
1675
|
+
case 'd':
|
1676
|
+
t->type = ELEMENT;
|
1677
|
+
t->n = 2;
|
1678
|
+
t->intval = 64;
|
1679
|
+
goto out;
|
1680
|
+
case 'e':
|
1681
|
+
t->type = ELEMENT;
|
1682
|
+
t->n = 2;
|
1683
|
+
t->intval = 32;
|
1684
|
+
goto out;
|
1685
|
+
default:
|
1686
|
+
return 0;
|
1687
|
+
}
|
1688
|
+
case 'H':
|
1689
|
+
switch (c1) {
|
1690
|
+
case 'e':
|
1691
|
+
t->type = ELEMENT;
|
1692
|
+
t->n = 2;
|
1693
|
+
t->intval = 2;
|
1694
|
+
goto out;
|
1695
|
+
case 'f':
|
1696
|
+
t->type = ELEMENT;
|
1697
|
+
t->n = 2;
|
1698
|
+
t->intval = 72;
|
1699
|
+
goto out;
|
1700
|
+
case 'g':
|
1701
|
+
t->type = ELEMENT;
|
1702
|
+
t->n = 2;
|
1703
|
+
t->intval = 80;
|
1704
|
+
goto out;
|
1705
|
+
case 'o':
|
1706
|
+
t->type = ELEMENT;
|
1707
|
+
t->n = 2;
|
1708
|
+
t->intval = 67;
|
1709
|
+
goto out;
|
1710
|
+
case 's':
|
1711
|
+
t->type = ELEMENT;
|
1712
|
+
t->n = 2;
|
1713
|
+
t->intval = 108;
|
1714
|
+
goto out;
|
1715
|
+
default:
|
1716
|
+
t->type = ELEMENT | HYDROGEN;
|
1717
|
+
t->intval = 1;
|
1718
|
+
goto out;
|
1719
|
+
}
|
1720
|
+
case 'I':
|
1721
|
+
if (!inbracket) {
|
1722
|
+
t->intval = 53;
|
1723
|
+
t->type = ALIPHATIC_ORGANIC;
|
1724
|
+
goto out;
|
1725
|
+
}
|
1726
|
+
switch (c1) {
|
1727
|
+
case 'n':
|
1728
|
+
t->type = ELEMENT;
|
1729
|
+
t->n = 2;
|
1730
|
+
t->intval = 49;
|
1731
|
+
goto out;
|
1732
|
+
case 'r':
|
1733
|
+
t->type = ELEMENT;
|
1734
|
+
t->n = 2;
|
1735
|
+
t->intval = 77;
|
1736
|
+
goto out;
|
1737
|
+
default:
|
1738
|
+
t->type = ELEMENT;
|
1739
|
+
t->intval = 53;
|
1740
|
+
goto out;
|
1741
|
+
}
|
1742
|
+
case 'K':
|
1743
|
+
switch (c1) {
|
1744
|
+
case 'r':
|
1745
|
+
t->type = ELEMENT;
|
1746
|
+
t->n = 2;
|
1747
|
+
t->intval = 36;
|
1748
|
+
goto out;
|
1749
|
+
default:
|
1750
|
+
t->type = ELEMENT;
|
1751
|
+
t->intval = 19;
|
1752
|
+
goto out;
|
1753
|
+
}
|
1754
|
+
case 'L':
|
1755
|
+
switch (c1) {
|
1756
|
+
case 'a':
|
1757
|
+
t->type = ELEMENT;
|
1758
|
+
t->n = 2;
|
1759
|
+
t->intval = 57;
|
1760
|
+
goto out;
|
1761
|
+
case 'i':
|
1762
|
+
t->type = ELEMENT;
|
1763
|
+
t->n = 2;
|
1764
|
+
t->intval = 3;
|
1765
|
+
goto out;
|
1766
|
+
case 'r':
|
1767
|
+
t->type = ELEMENT;
|
1768
|
+
t->n = 2;
|
1769
|
+
t->intval = 103;
|
1770
|
+
goto out;
|
1771
|
+
case 'u':
|
1772
|
+
t->type = ELEMENT;
|
1773
|
+
t->n = 2;
|
1774
|
+
t->intval = 71;
|
1775
|
+
goto out;
|
1776
|
+
case 'v':
|
1777
|
+
t->type = ELEMENT;
|
1778
|
+
t->n = 2;
|
1779
|
+
t->intval = 116;
|
1780
|
+
goto out;
|
1781
|
+
default:
|
1782
|
+
return 0;
|
1783
|
+
}
|
1784
|
+
case 'M':
|
1785
|
+
switch (c1) {
|
1786
|
+
case 'd':
|
1787
|
+
t->type = ELEMENT;
|
1788
|
+
t->n = 2;
|
1789
|
+
t->intval = 101;
|
1790
|
+
goto out;
|
1791
|
+
case 'g':
|
1792
|
+
t->type = ELEMENT;
|
1793
|
+
t->n = 2;
|
1794
|
+
t->intval = 12;
|
1795
|
+
goto out;
|
1796
|
+
case 'n':
|
1797
|
+
t->type = ELEMENT;
|
1798
|
+
t->n = 2;
|
1799
|
+
t->intval = 25;
|
1800
|
+
goto out;
|
1801
|
+
case 'o':
|
1802
|
+
t->type = ELEMENT;
|
1803
|
+
t->n = 2;
|
1804
|
+
t->intval = 42;
|
1805
|
+
goto out;
|
1806
|
+
case 't':
|
1807
|
+
t->type = ELEMENT;
|
1808
|
+
t->n = 2;
|
1809
|
+
t->intval = 109;
|
1810
|
+
goto out;
|
1811
|
+
default:
|
1812
|
+
return 0;
|
1813
|
+
}
|
1814
|
+
case 'N':
|
1815
|
+
if (!inbracket) {
|
1816
|
+
t->intval = 7;
|
1817
|
+
t->type = ALIPHATIC_ORGANIC;
|
1818
|
+
goto out;
|
1819
|
+
}
|
1820
|
+
switch (c1) {
|
1821
|
+
case 'a':
|
1822
|
+
t->type = ELEMENT;
|
1823
|
+
t->n = 2;
|
1824
|
+
t->intval = 11;
|
1825
|
+
goto out;
|
1826
|
+
case 'b':
|
1827
|
+
t->type = ELEMENT;
|
1828
|
+
t->n = 2;
|
1829
|
+
t->intval = 41;
|
1830
|
+
goto out;
|
1831
|
+
case 'd':
|
1832
|
+
t->type = ELEMENT;
|
1833
|
+
t->n = 2;
|
1834
|
+
t->intval = 101;
|
1835
|
+
goto out;
|
1836
|
+
case 'e':
|
1837
|
+
t->type = ELEMENT;
|
1838
|
+
t->n = 2;
|
1839
|
+
t->intval = 10;
|
1840
|
+
goto out;
|
1841
|
+
case 'i':
|
1842
|
+
t->type = ELEMENT;
|
1843
|
+
t->n = 2;
|
1844
|
+
t->intval = 28;
|
1845
|
+
goto out;
|
1846
|
+
case 'o':
|
1847
|
+
t->type = ELEMENT;
|
1848
|
+
t->n = 2;
|
1849
|
+
t->intval = 102;
|
1850
|
+
goto out;
|
1851
|
+
case 'p':
|
1852
|
+
t->type = ELEMENT;
|
1853
|
+
t->n = 2;
|
1854
|
+
t->intval = 93;
|
1855
|
+
goto out;
|
1856
|
+
default:
|
1857
|
+
t->type = ELEMENT;
|
1858
|
+
t->intval = 7;
|
1859
|
+
goto out;
|
1860
|
+
}
|
1861
|
+
case 'O':
|
1862
|
+
if (!inbracket) {
|
1863
|
+
t->intval = 8;
|
1864
|
+
t->type = ALIPHATIC_ORGANIC;
|
1865
|
+
goto out;
|
1866
|
+
}
|
1867
|
+
switch (c1) {
|
1868
|
+
case 's':
|
1869
|
+
t->type = ELEMENT;
|
1870
|
+
t->n = 2;
|
1871
|
+
t->intval = 76;
|
1872
|
+
goto out;
|
1873
|
+
default:
|
1874
|
+
t->type = ELEMENT;
|
1875
|
+
t->intval = 8;
|
1876
|
+
goto out;
|
1877
|
+
}
|
1878
|
+
case 'P':
|
1879
|
+
if (!inbracket) {
|
1880
|
+
t->intval = 15;
|
1881
|
+
t->type = ALIPHATIC_ORGANIC;
|
1882
|
+
goto out;
|
1883
|
+
}
|
1884
|
+
switch (c1) {
|
1885
|
+
case 'a':
|
1886
|
+
t->type = ELEMENT;
|
1887
|
+
t->n = 2;
|
1888
|
+
t->intval = 91;
|
1889
|
+
goto out;
|
1890
|
+
case 'b':
|
1891
|
+
t->type = ELEMENT;
|
1892
|
+
t->n = 2;
|
1893
|
+
t->intval = 82;
|
1894
|
+
goto out;
|
1895
|
+
case 'd':
|
1896
|
+
t->type = ELEMENT;
|
1897
|
+
t->n = 2;
|
1898
|
+
t->intval = 46;
|
1899
|
+
goto out;
|
1900
|
+
case 'm':
|
1901
|
+
t->type = ELEMENT;
|
1902
|
+
t->n = 2;
|
1903
|
+
t->intval = 61;
|
1904
|
+
goto out;
|
1905
|
+
case 'o':
|
1906
|
+
t->type = ELEMENT;
|
1907
|
+
t->n = 2;
|
1908
|
+
t->intval = 84;
|
1909
|
+
goto out;
|
1910
|
+
case 'r':
|
1911
|
+
t->type = ELEMENT;
|
1912
|
+
t->n = 2;
|
1913
|
+
t->intval = 59;
|
1914
|
+
goto out;
|
1915
|
+
case 't':
|
1916
|
+
t->type = ELEMENT;
|
1917
|
+
t->n = 2;
|
1918
|
+
t->intval = 78;
|
1919
|
+
goto out;
|
1920
|
+
case 'u':
|
1921
|
+
t->type = ELEMENT;
|
1922
|
+
t->n = 2;
|
1923
|
+
t->intval = 94;
|
1924
|
+
goto out;
|
1925
|
+
default:
|
1926
|
+
t->type = ELEMENT;
|
1927
|
+
t->intval = 15;
|
1928
|
+
goto out;
|
1929
|
+
}
|
1930
|
+
case 'R':
|
1931
|
+
switch (c1) {
|
1932
|
+
case 'a':
|
1933
|
+
t->type = ELEMENT;
|
1934
|
+
t->n = 2;
|
1935
|
+
t->intval = 88;
|
1936
|
+
goto out;
|
1937
|
+
case 'b':
|
1938
|
+
t->type = ELEMENT;
|
1939
|
+
t->n = 2;
|
1940
|
+
t->intval = 37;
|
1941
|
+
goto out;
|
1942
|
+
case 'e':
|
1943
|
+
t->type = ELEMENT;
|
1944
|
+
t->n = 2;
|
1945
|
+
t->intval = 75;
|
1946
|
+
goto out;
|
1947
|
+
case 'f':
|
1948
|
+
t->type = ELEMENT;
|
1949
|
+
t->n = 2;
|
1950
|
+
t->intval = 104;
|
1951
|
+
goto out;
|
1952
|
+
case 'g':
|
1953
|
+
t->type = ELEMENT;
|
1954
|
+
t->n = 2;
|
1955
|
+
t->intval = 111;
|
1956
|
+
goto out;
|
1957
|
+
case 'h':
|
1958
|
+
t->type = ELEMENT;
|
1959
|
+
t->n = 2;
|
1960
|
+
t->intval = 45;
|
1961
|
+
goto out;
|
1962
|
+
case 'n':
|
1963
|
+
t->type = ELEMENT;
|
1964
|
+
t->n = 2;
|
1965
|
+
t->intval = 86;
|
1966
|
+
goto out;
|
1967
|
+
case 'u':
|
1968
|
+
t->type = ELEMENT;
|
1969
|
+
t->n = 2;
|
1970
|
+
t->intval = 44;
|
1971
|
+
goto out;
|
1972
|
+
default:
|
1973
|
+
return 0;
|
1974
|
+
}
|
1975
|
+
case 'S':
|
1976
|
+
if (!inbracket) {
|
1977
|
+
t->intval = 16;
|
1978
|
+
t->type = ALIPHATIC_ORGANIC;
|
1979
|
+
goto out;
|
1980
|
+
}
|
1981
|
+
switch (c1) {
|
1982
|
+
case 'b':
|
1983
|
+
t->type = ELEMENT;
|
1984
|
+
t->n = 2;
|
1985
|
+
t->intval = 51;
|
1986
|
+
goto out;
|
1987
|
+
case 'c':
|
1988
|
+
t->type = ELEMENT;
|
1989
|
+
t->n = 2;
|
1990
|
+
t->intval = 21;
|
1991
|
+
goto out;
|
1992
|
+
case 'e':
|
1993
|
+
t->type = ELEMENT;
|
1994
|
+
t->n = 2;
|
1995
|
+
t->intval = 34;
|
1996
|
+
goto out;
|
1997
|
+
case 'g':
|
1998
|
+
t->type = ELEMENT;
|
1999
|
+
t->n = 2;
|
2000
|
+
t->intval = 106;
|
2001
|
+
goto out;
|
2002
|
+
case 'i':
|
2003
|
+
t->type = ELEMENT;
|
2004
|
+
t->n = 2;
|
2005
|
+
t->intval = 14;
|
2006
|
+
goto out;
|
2007
|
+
case 'm':
|
2008
|
+
t->type = ELEMENT;
|
2009
|
+
t->n = 2;
|
2010
|
+
t->intval = 62;
|
2011
|
+
goto out;
|
2012
|
+
case 'n':
|
2013
|
+
t->type = ELEMENT;
|
2014
|
+
t->n = 2;
|
2015
|
+
t->intval = 50;
|
2016
|
+
goto out;
|
2017
|
+
case 'r':
|
2018
|
+
t->type = ELEMENT;
|
2019
|
+
t->n = 2;
|
2020
|
+
t->intval = 38;
|
2021
|
+
goto out;
|
2022
|
+
default:
|
2023
|
+
t->type = ELEMENT;
|
2024
|
+
t->intval = 16;
|
2025
|
+
goto out;
|
2026
|
+
}
|
2027
|
+
case 'T':
|
2028
|
+
switch (c1) {
|
2029
|
+
case 'a':
|
2030
|
+
t->type = ELEMENT;
|
2031
|
+
t->n = 2;
|
2032
|
+
t->intval = 73;
|
2033
|
+
goto out;
|
2034
|
+
case 'b':
|
2035
|
+
t->type = ELEMENT;
|
2036
|
+
t->n = 2;
|
2037
|
+
t->intval = 65;
|
2038
|
+
goto out;
|
2039
|
+
case 'c':
|
2040
|
+
t->type = ELEMENT;
|
2041
|
+
t->n = 2;
|
2042
|
+
t->intval = 43;
|
2043
|
+
goto out;
|
2044
|
+
case 'e':
|
2045
|
+
t->type = ELEMENT;
|
2046
|
+
t->n = 2;
|
2047
|
+
t->intval = 52;
|
2048
|
+
goto out;
|
2049
|
+
case 'h':
|
2050
|
+
t->type = ELEMENT;
|
2051
|
+
t->n = 2;
|
2052
|
+
t->intval = 90;
|
2053
|
+
goto out;
|
2054
|
+
case 'i':
|
2055
|
+
t->type = ELEMENT;
|
2056
|
+
t->n = 2;
|
2057
|
+
t->intval = 22;
|
2058
|
+
goto out;
|
2059
|
+
case 'l':
|
2060
|
+
t->type = ELEMENT;
|
2061
|
+
t->n = 2;
|
2062
|
+
t->intval = 81;
|
2063
|
+
goto out;
|
2064
|
+
case 'm':
|
2065
|
+
t->type = ELEMENT;
|
2066
|
+
t->n = 2;
|
2067
|
+
t->intval = 69;
|
2068
|
+
goto out;
|
2069
|
+
default:
|
2070
|
+
return 0;
|
2071
|
+
}
|
2072
|
+
case 'U':
|
2073
|
+
t->type = ELEMENT;
|
2074
|
+
t->intval = 92;
|
2075
|
+
goto out;
|
2076
|
+
case 'V':
|
2077
|
+
t->type = ELEMENT;
|
2078
|
+
t->intval = 23;
|
2079
|
+
goto out;
|
2080
|
+
case 'W':
|
2081
|
+
t->type = ELEMENT;
|
2082
|
+
t->intval = 74;
|
2083
|
+
goto out;
|
2084
|
+
case 'X':
|
2085
|
+
switch (c1) {
|
2086
|
+
case 'e':
|
2087
|
+
t->type = ELEMENT;
|
2088
|
+
t->n = 2;
|
2089
|
+
t->intval = 54;
|
2090
|
+
goto out;
|
2091
|
+
default:
|
2092
|
+
return 0;
|
2093
|
+
}
|
2094
|
+
case 'Y':
|
2095
|
+
switch (c1) {
|
2096
|
+
case 'b':
|
2097
|
+
t->type = ELEMENT;
|
2098
|
+
t->n = 2;
|
2099
|
+
t->intval = 70;
|
2100
|
+
goto out;
|
2101
|
+
default:
|
2102
|
+
t->type = ELEMENT;
|
2103
|
+
t->intval = 39;
|
2104
|
+
goto out;
|
2105
|
+
}
|
2106
|
+
case 'Z':
|
2107
|
+
switch (c1) {
|
2108
|
+
case 'n':
|
2109
|
+
t->type = ELEMENT;
|
2110
|
+
t->n = 2;
|
2111
|
+
t->intval = 30;
|
2112
|
+
goto out;
|
2113
|
+
case 'r':
|
2114
|
+
t->type = ELEMENT;
|
2115
|
+
t->n = 2;
|
2116
|
+
t->intval = 40;
|
2117
|
+
goto out;
|
2118
|
+
default:
|
2119
|
+
return 0;
|
2120
|
+
}
|
2121
|
+
case '0':
|
2122
|
+
case '1':
|
2123
|
+
case '2':
|
2124
|
+
case '3':
|
2125
|
+
case '4':
|
2126
|
+
case '5':
|
2127
|
+
case '6':
|
2128
|
+
case '7':
|
2129
|
+
case '8':
|
2130
|
+
case '9':
|
2131
|
+
t->type = DIGIT;
|
2132
|
+
t->intval = c0 - '0';
|
2133
|
+
goto out;
|
2134
|
+
case '*':
|
2135
|
+
t->type = WILDCARD;
|
2136
|
+
t->intval = 0;
|
2137
|
+
goto out;
|
2138
|
+
case '[':
|
2139
|
+
t->type = BRACKET_OPEN;
|
2140
|
+
goto out;
|
2141
|
+
case ']':
|
2142
|
+
t->type = BRACKET_CLOSE;
|
2143
|
+
goto out;
|
2144
|
+
case '(':
|
2145
|
+
t->type = PAREN_OPEN;
|
2146
|
+
goto out;
|
2147
|
+
case ')':
|
2148
|
+
t->type = PAREN_CLOSE;
|
2149
|
+
goto out;
|
2150
|
+
case '+':
|
2151
|
+
t->type = PLUS;
|
2152
|
+
t->intval = 1;
|
2153
|
+
goto out;
|
2154
|
+
case '-':
|
2155
|
+
t->type = inbracket ? MINUS : BOND;
|
2156
|
+
t->intval = inbracket ? -1 : COHO_SMILES_BOND_SINGLE;
|
2157
|
+
goto out;
|
2158
|
+
case '%':
|
2159
|
+
t->type = PERCENT;
|
2160
|
+
goto out;
|
2161
|
+
case '=':
|
2162
|
+
t->type = BOND;
|
2163
|
+
t->intval = COHO_SMILES_BOND_DOUBLE;
|
2164
|
+
goto out;
|
2165
|
+
case '#':
|
2166
|
+
t->type = BOND;
|
2167
|
+
t->intval = COHO_SMILES_BOND_TRIPLE;
|
2168
|
+
goto out;
|
2169
|
+
case '$':
|
2170
|
+
t->type = BOND;
|
2171
|
+
t->intval = COHO_SMILES_BOND_QUAD;
|
2172
|
+
goto out;
|
2173
|
+
case ':':
|
2174
|
+
if (inbracket) {
|
2175
|
+
t->type = COLON;
|
2176
|
+
} else {
|
2177
|
+
t->type = BOND;
|
2178
|
+
t->intval = COHO_SMILES_BOND_AROMATIC;
|
2179
|
+
}
|
2180
|
+
goto out;
|
2181
|
+
case '/':
|
2182
|
+
t->type = BOND;
|
2183
|
+
t->intval = COHO_SMILES_BOND_SINGLE;
|
2184
|
+
t->flags = COHO_SMILES_BOND_STEREO_UP;
|
2185
|
+
goto out;
|
2186
|
+
case '\\':
|
2187
|
+
t->type = BOND;
|
2188
|
+
t->intval = COHO_SMILES_BOND_SINGLE;
|
2189
|
+
t->flags = COHO_SMILES_BOND_STEREO_DOWN;
|
2190
|
+
goto out;
|
2191
|
+
case '.':
|
2192
|
+
t->type = DOT;
|
2193
|
+
goto out;
|
2194
|
+
case '@':
|
2195
|
+
t->type = CHIRALITY;
|
2196
|
+
if (c1 == '@')
|
2197
|
+
t->n = 2;
|
2198
|
+
goto out;
|
2199
|
+
default:
|
2200
|
+
return 0;
|
2201
|
+
}
|
2202
|
+
|
2203
|
+
out:
|
2204
|
+
return t->type;
|
2205
|
+
}
|