selfies-js 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +274 -0
- package/package.json +65 -0
- package/src/alphabet.js +150 -0
- package/src/alphabet.test.js +82 -0
- package/src/chemistryValidator.js +236 -0
- package/src/cli.js +206 -0
- package/src/constraints.js +186 -0
- package/src/constraints.test.js +126 -0
- package/src/decoder.js +636 -0
- package/src/decoder.test.js +560 -0
- package/src/dsl/analyzer.js +170 -0
- package/src/dsl/analyzer.test.js +139 -0
- package/src/dsl/dsl.test.js +146 -0
- package/src/dsl/importer.js +238 -0
- package/src/dsl/index.js +32 -0
- package/src/dsl/lexer.js +264 -0
- package/src/dsl/lexer.test.js +115 -0
- package/src/dsl/parser.js +201 -0
- package/src/dsl/parser.test.js +148 -0
- package/src/dsl/resolver.js +136 -0
- package/src/dsl/resolver.test.js +99 -0
- package/src/dsl/symbolTable.js +56 -0
- package/src/dsl/symbolTable.test.js +68 -0
- package/src/dsl/valenceValidator.js +147 -0
- package/src/encoder.js +467 -0
- package/src/encoder.test.js +61 -0
- package/src/errors.js +79 -0
- package/src/errors.test.js +91 -0
- package/src/grammar_rules.js +146 -0
- package/src/index.js +70 -0
- package/src/parser.js +96 -0
- package/src/parser.test.js +96 -0
- package/src/properties/atoms.js +69 -0
- package/src/properties/atoms.test.js +116 -0
- package/src/properties/formula.js +111 -0
- package/src/properties/formula.test.js +95 -0
- package/src/properties/molecularWeight.js +80 -0
- package/src/properties/molecularWeight.test.js +84 -0
- package/src/properties/properties.test.js +77 -0
- package/src/renderers/README.md +127 -0
- package/src/renderers/svg.js +113 -0
- package/src/renderers/svg.test.js +42 -0
- package/src/syntax.js +641 -0
- package/src/syntax.test.js +363 -0
- package/src/tokenizer.js +99 -0
- package/src/tokenizer.test.js +55 -0
- package/src/validator.js +70 -0
- package/src/validator.test.js +44 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="toluene-logo.svg" alt="Toluene molecule" width="200"/>
|
|
3
|
+
<h1>selfies-js</h1>
|
|
4
|
+
<p>A pure JavaScript implementation of the SELFIES molecular string representation</p>
|
|
5
|
+
</div>
|
|
6
|
+
|
|
7
|
+
## What is SELFIES?
|
|
8
|
+
|
|
9
|
+
**SELFIES** (SELF-referencIng Embedded Strings) is a 100% robust molecular string representation. Unlike SMILES, every SELFIES string corresponds to a valid molecule, making it ideal for machine learning and generative models in chemistry.
|
|
10
|
+
|
|
11
|
+
This library is a JavaScript port inspired by the original Python implementation: **[aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)**
|
|
12
|
+
|
|
13
|
+
> Krenn, M., Häse, F., Nigam, A., Friederich, P., & Aspuru-Guzik, A. (2020). Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation. *Machine Learning: Science and Technology*, 1(4), 045024.
|
|
14
|
+
|
|
15
|
+
## Overview
|
|
16
|
+
|
|
17
|
+
```javascript
|
|
18
|
+
import {
|
|
19
|
+
decode, encode, isValid,
|
|
20
|
+
getMolecularWeight, getFormula,
|
|
21
|
+
lenSelfies, getSemanticConstraints,
|
|
22
|
+
isChemicallyValid, getCanonicalSmiles, validateRoundtrip
|
|
23
|
+
} from 'selfies-js'
|
|
24
|
+
|
|
25
|
+
// SELFIES → SMILES
|
|
26
|
+
decode('[C][C][O]') // 'CCO'
|
|
27
|
+
|
|
28
|
+
// SMILES → SELFIES
|
|
29
|
+
encode('CCO') // '[C][C][O]'
|
|
30
|
+
|
|
31
|
+
// Syntax validation
|
|
32
|
+
isValid('[C][C][O]') // true
|
|
33
|
+
|
|
34
|
+
// Chemistry validation (requires RDKit)
|
|
35
|
+
import { initRDKit } from 'selfies-js'
|
|
36
|
+
await initRDKit()
|
|
37
|
+
await isChemicallyValid('[C][C][O]') // true - molecule is chemically valid
|
|
38
|
+
await getCanonicalSmiles('[C][C][O]') // 'CCO' - canonical SMILES representation
|
|
39
|
+
await validateRoundtrip('CCO', '[C][C][O]') // true - structure preserved
|
|
40
|
+
|
|
41
|
+
// Properties
|
|
42
|
+
getMolecularWeight('[C][C][O]') // 46.07
|
|
43
|
+
getFormula('[C][C][O]') // 'C2H6O'
|
|
44
|
+
|
|
45
|
+
// Utilities
|
|
46
|
+
lenSelfies('[C][C][O]') // 3 (symbol count, not string length)
|
|
47
|
+
|
|
48
|
+
// Semantic constraints
|
|
49
|
+
const constraints = getSemanticConstraints()
|
|
50
|
+
console.log(constraints['C']) // 4 (max bonds for carbon)
|
|
51
|
+
|
|
52
|
+
// SVG Rendering (using RDKit.js)
|
|
53
|
+
import { renderSelfies } from 'selfies-js'
|
|
54
|
+
|
|
55
|
+
const svg = await renderSelfies('[C][C][O]', {
|
|
56
|
+
width: 300,
|
|
57
|
+
height: 300
|
|
58
|
+
})
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
npm install selfies-js
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Browser Usage (CDN)
|
|
68
|
+
|
|
69
|
+
For direct browser usage without a bundler:
|
|
70
|
+
|
|
71
|
+
```html
|
|
72
|
+
<!-- Complete bundle: DSL + SELFIES encode/decode + utilities -->
|
|
73
|
+
<script src="https://github.com/Ghost---Shadow/selfies-js/releases/latest/download/selfies.umd.min.js"></script>
|
|
74
|
+
<script>
|
|
75
|
+
// SELFIES encoding/decoding
|
|
76
|
+
const encoded = SELFIES.encode('CC');
|
|
77
|
+
console.log(encoded); // '[C][C]'
|
|
78
|
+
|
|
79
|
+
const decoded = SELFIES.decode('[C][C][O]');
|
|
80
|
+
console.log(decoded); // 'CCO'
|
|
81
|
+
|
|
82
|
+
// DSL parsing
|
|
83
|
+
const parsed = SELFIES.parse('[methyl] = [C]');
|
|
84
|
+
console.log(parsed);
|
|
85
|
+
|
|
86
|
+
// Molecular properties
|
|
87
|
+
const mw = SELFIES.getMolecularWeight('[C][C][O]');
|
|
88
|
+
console.log(mw); // 46.07
|
|
89
|
+
</script>
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Download from the [latest release](https://github.com/Ghost---Shadow/selfies-js/releases/latest).
|
|
93
|
+
|
|
94
|
+
## Features
|
|
95
|
+
|
|
96
|
+
- **Core:** Decode SELFIES to SMILES
|
|
97
|
+
- **Core:** Encode SMILES to SELFIES
|
|
98
|
+
- **Validation:** Syntax and semantic validation
|
|
99
|
+
- **Chemistry Validation:** RDKit-based molecular validity checking
|
|
100
|
+
- **Canonical SMILES:** Structure comparison and roundtrip validation
|
|
101
|
+
- **Properties:** Molecular weight and formula calculation
|
|
102
|
+
- **Constraints:** Customizable semantic constraints (bonding rules)
|
|
103
|
+
- **Utilities:** Symbol counting, alphabet extraction
|
|
104
|
+
- **DSL:** Define and resolve molecule libraries with named definitions
|
|
105
|
+
- **Imports:** Modular .selfies files with import support
|
|
106
|
+
- **CLI:** Command-line interface for executing .selfies files
|
|
107
|
+
- **Rendering:** SVG visualization of molecular structures
|
|
108
|
+
|
|
109
|
+
## CLI Usage
|
|
110
|
+
|
|
111
|
+
The `selfies-js` CLI allows you to work with `.selfies` DSL files from the command line.
|
|
112
|
+
|
|
113
|
+
### Commands
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Execute a .selfies file and output resolved definitions
|
|
117
|
+
bun src/cli.js run molecules.selfies
|
|
118
|
+
|
|
119
|
+
# Output as SMILES instead of SELFIES
|
|
120
|
+
bun src/cli.js run molecules.selfies --format=smiles
|
|
121
|
+
|
|
122
|
+
# Validate a .selfies file for errors
|
|
123
|
+
bun src/cli.js validate molecules.selfies
|
|
124
|
+
|
|
125
|
+
# List all definitions in a file
|
|
126
|
+
bun src/cli.js list molecules.selfies
|
|
127
|
+
|
|
128
|
+
# Show help
|
|
129
|
+
bun src/cli.js help
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### DSL Syntax
|
|
133
|
+
|
|
134
|
+
The `.selfies` DSL allows you to define named molecular fragments and compose them hierarchically:
|
|
135
|
+
|
|
136
|
+
```selfies
|
|
137
|
+
# Comments start with #
|
|
138
|
+
|
|
139
|
+
# Basic definitions
|
|
140
|
+
[methyl] = [C]
|
|
141
|
+
[ethyl] = [C][C]
|
|
142
|
+
[hydroxyl] = [O]
|
|
143
|
+
|
|
144
|
+
# Composition - reference other definitions
|
|
145
|
+
[ethanol] = [ethyl][hydroxyl]
|
|
146
|
+
|
|
147
|
+
# Complex structures with branches
|
|
148
|
+
[isopropyl] = [C][Branch1][C][C][C]
|
|
149
|
+
[isopropanol] = [isopropyl][hydroxyl]
|
|
150
|
+
|
|
151
|
+
# Aromatic rings
|
|
152
|
+
[phenyl] = [C][=C][C][=C][C][=C][Ring1][=Branch1]
|
|
153
|
+
[toluene] = [methyl][phenyl]
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### Import Syntax
|
|
157
|
+
|
|
158
|
+
Import definitions from other `.selfies` files:
|
|
159
|
+
|
|
160
|
+
```selfies
|
|
161
|
+
# Import all definitions from another file
|
|
162
|
+
import "./fragments.selfies"
|
|
163
|
+
|
|
164
|
+
# Alternative syntax for importing all
|
|
165
|
+
import * from "./common.selfies"
|
|
166
|
+
|
|
167
|
+
# Import specific definitions only
|
|
168
|
+
import [methyl, ethyl, hydroxyl] from "./base.selfies"
|
|
169
|
+
|
|
170
|
+
# Use imported definitions
|
|
171
|
+
[my_molecule] = [methyl][hydroxyl]
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Imports support:
|
|
175
|
+
- **Relative paths** resolved from the importing file's location
|
|
176
|
+
- **Chained imports** (file A imports B, B imports C)
|
|
177
|
+
- **Circular import detection** with clear error messages
|
|
178
|
+
- **Selective imports** to only include what you need
|
|
179
|
+
|
|
180
|
+
### Example Output
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
$ bun src/cli.js run molecules.selfies --format=smiles
|
|
184
|
+
methyl: C
|
|
185
|
+
ethyl: CC
|
|
186
|
+
hydroxyl: O
|
|
187
|
+
ethanol: CCO
|
|
188
|
+
isopropyl: C(C)C
|
|
189
|
+
isopropanol: C(C)CO
|
|
190
|
+
phenyl: C1=CC=CC=C1
|
|
191
|
+
toluene: CC1=CC=CC=C1
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## DSL API
|
|
195
|
+
|
|
196
|
+
```javascript
|
|
197
|
+
import { parse, resolve, resolveAll } from 'selfies-js/dsl'
|
|
198
|
+
import { loadFile } from 'selfies-js/dsl'
|
|
199
|
+
|
|
200
|
+
// Load a file with imports
|
|
201
|
+
const loadedProgram = loadFile('molecules.selfies')
|
|
202
|
+
|
|
203
|
+
// Or parse source directly
|
|
204
|
+
const source = `
|
|
205
|
+
[methyl] = [C]
|
|
206
|
+
[ethanol] = [methyl][C][O]
|
|
207
|
+
`
|
|
208
|
+
const program = parse(source)
|
|
209
|
+
|
|
210
|
+
// Resolve a single definition
|
|
211
|
+
resolve(program, 'ethanol') // '[C][C][O]'
|
|
212
|
+
|
|
213
|
+
// Resolve with SMILES output
|
|
214
|
+
resolve(program, 'ethanol', { decode: true }) // 'CCO'
|
|
215
|
+
|
|
216
|
+
// Resolve all definitions
|
|
217
|
+
const all = resolveAll(program)
|
|
218
|
+
// Map { 'methyl' => '[C]', 'ethanol' => '[C][C][O]' }
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Visualization
|
|
222
|
+
|
|
223
|
+
The library uses **RDKit.js** for professional molecule rendering:
|
|
224
|
+
|
|
225
|
+
```javascript
|
|
226
|
+
import { renderSelfies, initRDKit } from 'selfies-js'
|
|
227
|
+
|
|
228
|
+
// Initialize RDKit (async, only needed once)
|
|
229
|
+
await initRDKit()
|
|
230
|
+
|
|
231
|
+
// Render toluene
|
|
232
|
+
const svg = await renderSelfies('[C][C][=C][C][=C][C][=C][Ring1][=Branch1]', {
|
|
233
|
+
width: 300,
|
|
234
|
+
height: 300
|
|
235
|
+
})
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
Features:
|
|
239
|
+
- Professional 2D coordinate generation via RDKit
|
|
240
|
+
- Proper skeletal formulas (carbons hidden)
|
|
241
|
+
- Correct benzene ring geometry
|
|
242
|
+
- Support for all bond types
|
|
243
|
+
- Stereochemistry notation
|
|
244
|
+
- Industry-standard rendering
|
|
245
|
+
|
|
246
|
+
## Examples
|
|
247
|
+
|
|
248
|
+
See the `examples/` directory for sample `.selfies` files:
|
|
249
|
+
|
|
250
|
+
- `base-fragments.selfies` - Reusable building blocks (alkyl groups, functional groups, halogens)
|
|
251
|
+
- `molecules-with-imports.selfies` - Demonstrates importing and composing molecules
|
|
252
|
+
- `organic-chemistry.selfies` - Alcohols, aldehydes, acids, amines, ethers
|
|
253
|
+
- `drug-fragments.selfies` - Pharmacophore fragments, drug-like building blocks
|
|
254
|
+
- `polymers.selfies` - Monomers, repeat units, oligomers
|
|
255
|
+
|
|
256
|
+
## Known Limitations
|
|
257
|
+
|
|
258
|
+
The encoder/decoder handles most common organic molecules correctly. Some complex cases have known limitations:
|
|
259
|
+
|
|
260
|
+
- **Bracket atoms** in SMILES (`[nH]`, `[C@@]`, `[13C]`) - limited support
|
|
261
|
+
- **Fused aromatic ring systems** - some complex cases may not roundtrip correctly
|
|
262
|
+
- **Polycyclic structures** with multiple ring closures - partial support
|
|
263
|
+
|
|
264
|
+
For complete SELFIES support, use the original Python library: [aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
|
|
265
|
+
|
|
266
|
+
## References
|
|
267
|
+
|
|
268
|
+
- **Original SELFIES Paper:** Krenn, M., Häse, F., Nigam, A., Friederich, P., & Aspuru-Guzik, A. (2020). Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation. *Machine Learning: Science and Technology*, 1(4), 045024. [DOI: 10.1088/2632-2153/aba947](https://doi.org/10.1088/2632-2153/aba947)
|
|
269
|
+
|
|
270
|
+
- **Python Implementation:** [github.com/aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
|
|
271
|
+
|
|
272
|
+
## License
|
|
273
|
+
|
|
274
|
+
MIT
|
package/package.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "selfies-js",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Pure JavaScript SELFIES encoder/decoder with DSL for molecular composition",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "src/index.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"selfies-js": "./src/cli.js"
|
|
9
|
+
},
|
|
10
|
+
"exports": {
|
|
11
|
+
".": "./src/index.js",
|
|
12
|
+
"./dsl": "./src/dsl/index.js",
|
|
13
|
+
"./syntax": "./src/syntax.js"
|
|
14
|
+
},
|
|
15
|
+
"files": [
|
|
16
|
+
"src/**/*.js",
|
|
17
|
+
"README.md",
|
|
18
|
+
"LICENSE"
|
|
19
|
+
],
|
|
20
|
+
"scripts": {
|
|
21
|
+
"test": "bun test",
|
|
22
|
+
"bench": "bun run bench/decode.bench.js",
|
|
23
|
+
"lint": "echo 'Linting...'; exit 0",
|
|
24
|
+
"prepublishOnly": "bun test",
|
|
25
|
+
"pack": "bun pm pack",
|
|
26
|
+
"publish:dry": "bun pm pack --dry-run"
|
|
27
|
+
},
|
|
28
|
+
"keywords": [
|
|
29
|
+
"selfies",
|
|
30
|
+
"smiles",
|
|
31
|
+
"chemistry",
|
|
32
|
+
"cheminformatics",
|
|
33
|
+
"molecules",
|
|
34
|
+
"molecular-design",
|
|
35
|
+
"drug-discovery",
|
|
36
|
+
"encoder",
|
|
37
|
+
"decoder",
|
|
38
|
+
"dsl",
|
|
39
|
+
"parser"
|
|
40
|
+
],
|
|
41
|
+
"author": "SELFIES Contributors",
|
|
42
|
+
"license": "MIT",
|
|
43
|
+
"repository": {
|
|
44
|
+
"type": "git",
|
|
45
|
+
"url": "https://github.com/Ghost---Shadow/selfies-js.git"
|
|
46
|
+
},
|
|
47
|
+
"bugs": {
|
|
48
|
+
"url": "https://github.com/Ghost---Shadow/selfies-js/issues"
|
|
49
|
+
},
|
|
50
|
+
"homepage": "https://github.com/Ghost---Shadow/selfies-js#readme",
|
|
51
|
+
"engines": {
|
|
52
|
+
"node": ">=16.0.0"
|
|
53
|
+
},
|
|
54
|
+
"peerDependencies": {
|
|
55
|
+
"@rdkit/rdkit": "^2025.3.4-1.0.0"
|
|
56
|
+
},
|
|
57
|
+
"peerDependenciesMeta": {
|
|
58
|
+
"@rdkit/rdkit": {
|
|
59
|
+
"optional": true
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
"devDependencies": {
|
|
63
|
+
"@rdkit/rdkit": "^2025.3.4-1.0.0"
|
|
64
|
+
}
|
|
65
|
+
}
|
package/src/alphabet.js
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Alphabet - Valid SELFIES token sets
|
|
3
|
+
*
|
|
4
|
+
* Defines the full SELFIES alphabet and semantic-robust subset.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Lazily built cache of the full SELFIES alphabet; stays null until getAlphabet() first populates it
|
|
9
|
+
* @type {Set<string> | null}
|
|
10
|
+
*/
|
|
11
|
+
let _alphabet = null
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Lazily built cache of the semantic-robust alphabet (atom tokens only); stays null until getSemanticAlphabet() first populates it
|
|
15
|
+
* @type {Set<string> | null}
|
|
16
|
+
*/
|
|
17
|
+
let _semanticAlphabet = null
|
|
18
|
+
|
|
19
|
+
/**
 * Returns the complete set of SELFIES tokens known to this module.
 *
 * The set is assembled once, on the first call, from the atom tokens followed
 * by the structural (branch/ring) tokens, and the same Set instance is handed
 * back on every later call.
 *
 * @returns {Set<string>} Set of all valid SELFIES tokens
 */
export function getAlphabet() {
  if (_alphabet !== null) {
    return _alphabet
  }

  // Atom tokens first, structural tokens second: this fixes the iteration order of the Set
  _alphabet = new Set([...buildAtomTokens(), ...buildStructuralTokens()])

  return _alphabet
}
|
|
42
|
+
|
|
43
|
+
/**
 * Returns the semantic-robust subset of the SELFIES alphabet.
 *
 * Only atom tokens (plain, double-bonded, triple-bonded) are included.
 * Branch and ring tokens are left out because their meaning depends on the
 * symbols around them. The set is built on the first call and reused afterwards.
 *
 * @returns {Set<string>} Set of semantically robust tokens
 */
export function getSemanticAlphabet() {
  if (_semanticAlphabet !== null) {
    return _semanticAlphabet
  }

  _semanticAlphabet = new Set(buildAtomTokens())

  return _semanticAlphabet
}
|
|
64
|
+
|
|
65
|
+
/**
 * Tells whether a token belongs to the full SELFIES alphabet.
 *
 * @param {string} token - Token to look up, brackets included (e.g. '[C]')
 * @returns {boolean} True when the token is part of the full alphabet
 */
export function isValidToken(token) {
  const fullAlphabet = getAlphabet()
  return fullAlphabet.has(token)
}
|
|
73
|
+
|
|
74
|
+
/**
 * Tells whether a token belongs to the semantic-robust alphabet.
 *
 * @param {string} token - Token to look up, brackets included (e.g. '[=C]')
 * @returns {boolean} True when the token is part of the semantic-robust subset
 */
export function isSemanticRobust(token) {
  const robustAlphabet = getSemanticAlphabet()
  return robustAlphabet.has(token)
}
|
|
82
|
+
|
|
83
|
+
/**
 * Produces every atom token the alphabet supports.
 *
 * Each supported element yields three tokens, in this order: the plain atom
 * ('[C]'), the double-bonded form ('[=C]') and the triple-bonded form ('[#C]').
 *
 * @returns {string[]} Array of atom tokens, grouped per element
 */
function buildAtomTokens() {
  // '' = no bond prefix, '=' = double bond, '#' = triple bond
  const bondPrefixes = ['', '=', '#']
  const supportedElements = ['C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I', 'B']

  return supportedElements.flatMap((symbol) =>
    bondPrefixes.map((prefix) => `[${prefix}${symbol}]`)
  )
}
|
|
99
|
+
|
|
100
|
+
/**
 * Builds the list of structural tokens (branch, ring)
 *
 * Covers the plain branch and ring tokens, the bond-order branch variants,
 * and the stereochemistry ring tokens, each for sizes 1 to 3.
 *
 * @returns {string[]} Array of structural tokens
 */
function buildStructuralTokens() {
  const tokens = []

  // Branch and Ring tokens
  tokens.push('[Branch1]', '[Branch2]', '[Branch3]')
  tokens.push('[Ring1]', '[Ring2]', '[Ring3]')

  // Branch tokens prefixed with a bond order ('=' double, '#' triple)
  tokens.push('[=Branch1]', '[=Branch2]', '[=Branch3]')
  tokens.push('[#Branch1]', '[#Branch2]', '[#Branch3]')

  // Stereochemistry ring tokens.
  // The second group is the token `[\/RingN]`, which contains exactly ONE
  // backslash. In a JavaScript string literal that is written as '\\'; the
  // previous '\\\\' produced two backslashes and a token no SELFIES string uses.
  tokens.push('[-/Ring1]', '[-/Ring2]', '[-/Ring3]')
  tokens.push('[\\/Ring1]', '[\\/Ring2]', '[\\/Ring3]')

  return tokens
}
|
|
122
|
+
|
|
123
|
+
/**
 * Extracts the unique SELFIES alphabet from a collection of SELFIES strings
 *
 * Symbols are found by matching every bracketed run `[...]` that has at least
 * one character between the brackets. Text outside brackets is ignored, and
 * symbols are not checked against the known alphabet.
 *
 * @param {Iterable<string> | string} selfiesIterable - Collection of SELFIES
 *   strings. A single SELFIES string is also accepted and is treated as a
 *   collection containing just that string.
 * @returns {Set<string>} Set of unique SELFIES symbols found, in first-seen order
 *
 * Based on selfies-py's get_alphabet_from_selfies() function.
 *
 * Example:
 *   const alphabet = getAlphabetFromSelfies(['[C][C][O]', '[N][C][=O]'])
 *   // Set { '[C]', '[O]', '[N]', '[=O]' }
 *
 * Reference: selfies-py/selfies/utils/selfies_utils.py::get_alphabet_from_selfies()
 */
export function getAlphabetFromSelfies(selfiesIterable) {
  // A bare string is iterable too, but one character at a time; no single
  // character can match the symbol pattern, so the result would silently be
  // an empty set. Wrap it so it is processed as one SELFIES string.
  const selfiesStrings =
    typeof selfiesIterable === 'string' ? [selfiesIterable] : selfiesIterable

  const alphabet = new Set()

  for (const selfies of selfiesStrings) {
    // match() returns null when nothing matches, hence the fallback to []
    const symbols = selfies.match(/\[[^\]]+\]/g) || []
    for (const symbol of symbols) {
      alphabet.add(symbol)
    }
  }

  return alphabet
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for SELFIES alphabet
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, test, expect } from 'bun:test'
|
|
6
|
+
import { getAlphabet, getSemanticAlphabet, isValidToken, isSemanticRobust } from './alphabet.js'
|
|
7
|
+
|
|
8
|
+
// Membership checks against the full alphabet returned by getAlphabet()
describe('getAlphabet', () => {
  // Asserts that each token's presence in the full alphabet equals `expected`
  const expectMembership = (tokens, expected) => {
    const alphabet = getAlphabet()
    for (const token of tokens) {
      expect(alphabet.has(token)).toBe(expected)
    }
  }

  test('includes basic atoms', () => {
    expectMembership(['[C]', '[N]', '[O]'], true)
  })

  test('includes bond modifiers', () => {
    expectMembership(['[=C]', '[#N]'], true)
  })

  test('includes multi-char elements', () => {
    expectMembership(['[Cl]', '[Br]'], true)
  })

  test('includes structural tokens', () => {
    expectMembership(['[Branch1]', '[Ring1]'], true)
  })

  test('excludes invalid tokens', () => {
    expectMembership(['[Xyz]', '[123]'], false)
  })
})
|
|
40
|
+
|
|
41
|
+
// The semantic-robust alphabet holds atom tokens only and is contained in the full alphabet
describe('getSemanticAlphabet', () => {
  test('includes atoms and bonds only', () => {
    const semantic = getSemanticAlphabet()
    for (const token of ['[C]', '[=C]']) {
      expect(semantic.has(token)).toBe(true)
    }
  })

  test('excludes structural tokens', () => {
    const semantic = getSemanticAlphabet()
    for (const token of ['[Branch1]', '[Ring1]']) {
      expect(semantic.has(token)).toBe(false)
    }
  })

  test('is subset of full alphabet', () => {
    const fullAlphabet = getAlphabet()
    const semanticTokens = [...getSemanticAlphabet()]
    semanticTokens.forEach((token) => {
      expect(fullAlphabet.has(token)).toBe(true)
    })
  })
})
|
|
62
|
+
|
|
63
|
+
// isValidToken accepts known atom tokens and rejects unknown ones
describe('isValidToken', () => {
  test('validates basic atoms', () => {
    for (const token of ['[C]', '[O]']) {
      expect(isValidToken(token)).toBe(true)
    }
  })

  test('rejects invalid tokens', () => {
    const unknownToken = '[Xyz]'
    expect(isValidToken(unknownToken)).toBe(false)
  })
})
|
|
73
|
+
|
|
74
|
+
// isSemanticRobust is true for atom tokens and false for structural tokens
describe('isSemanticRobust', () => {
  test('atoms are semantic robust', () => {
    const atomToken = '[C]'
    expect(isSemanticRobust(atomToken)).toBe(true)
  })

  test('structural tokens are not semantic robust', () => {
    const branchToken = '[Branch1]'
    expect(isSemanticRobust(branchToken)).toBe(false)
  })
})
|