selfies-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +274 -0
  3. package/package.json +65 -0
  4. package/src/alphabet.js +150 -0
  5. package/src/alphabet.test.js +82 -0
  6. package/src/chemistryValidator.js +236 -0
  7. package/src/cli.js +206 -0
  8. package/src/constraints.js +186 -0
  9. package/src/constraints.test.js +126 -0
  10. package/src/decoder.js +636 -0
  11. package/src/decoder.test.js +560 -0
  12. package/src/dsl/analyzer.js +170 -0
  13. package/src/dsl/analyzer.test.js +139 -0
  14. package/src/dsl/dsl.test.js +146 -0
  15. package/src/dsl/importer.js +238 -0
  16. package/src/dsl/index.js +32 -0
  17. package/src/dsl/lexer.js +264 -0
  18. package/src/dsl/lexer.test.js +115 -0
  19. package/src/dsl/parser.js +201 -0
  20. package/src/dsl/parser.test.js +148 -0
  21. package/src/dsl/resolver.js +136 -0
  22. package/src/dsl/resolver.test.js +99 -0
  23. package/src/dsl/symbolTable.js +56 -0
  24. package/src/dsl/symbolTable.test.js +68 -0
  25. package/src/dsl/valenceValidator.js +147 -0
  26. package/src/encoder.js +467 -0
  27. package/src/encoder.test.js +61 -0
  28. package/src/errors.js +79 -0
  29. package/src/errors.test.js +91 -0
  30. package/src/grammar_rules.js +146 -0
  31. package/src/index.js +70 -0
  32. package/src/parser.js +96 -0
  33. package/src/parser.test.js +96 -0
  34. package/src/properties/atoms.js +69 -0
  35. package/src/properties/atoms.test.js +116 -0
  36. package/src/properties/formula.js +111 -0
  37. package/src/properties/formula.test.js +95 -0
  38. package/src/properties/molecularWeight.js +80 -0
  39. package/src/properties/molecularWeight.test.js +84 -0
  40. package/src/properties/properties.test.js +77 -0
  41. package/src/renderers/README.md +127 -0
  42. package/src/renderers/svg.js +113 -0
  43. package/src/renderers/svg.test.js +42 -0
  44. package/src/syntax.js +641 -0
  45. package/src/syntax.test.js +363 -0
  46. package/src/tokenizer.js +99 -0
  47. package/src/tokenizer.test.js +55 -0
  48. package/src/validator.js +70 -0
  49. package/src/validator.test.js +44 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,274 @@
1
+ <div align="center">
2
+ <img src="toluene-logo.svg" alt="Toluene molecule" width="200"/>
3
+ <h1>selfies-js</h1>
4
+ <p>A pure JavaScript implementation of the SELFIES molecular string representation</p>
5
+ </div>
6
+
7
+ ## What is SELFIES?
8
+
9
+ **SELFIES** (SELF-referencIng Embedded Strings) is a 100% robust molecular string representation. Unlike SMILES, every SELFIES string corresponds to a valid molecule, making it ideal for machine learning and generative models in chemistry.
10
+
11
+ This library is a JavaScript port inspired by the original Python implementation: **[aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)**
12
+
13
+ > Krenn, M., Häse, F., Nigam, A., Friederich, P., & Aspuru-Guzik, A. (2020). Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation. *Machine Learning: Science and Technology*, 1(4), 045024.
14
+
15
+ ## Overview
16
+
17
+ ```javascript
18
+ import {
19
+ decode, encode, isValid,
20
+ getMolecularWeight, getFormula,
21
+ lenSelfies, getSemanticConstraints,
22
+ isChemicallyValid, getCanonicalSmiles, validateRoundtrip
23
+ } from 'selfies-js'
24
+
25
+ // SELFIES → SMILES
26
+ decode('[C][C][O]') // 'CCO'
27
+
28
+ // SMILES → SELFIES
29
+ encode('CCO') // '[C][C][O]'
30
+
31
+ // Syntax validation
32
+ isValid('[C][C][O]') // true
33
+
34
+ // Chemistry validation (requires RDKit)
35
+ import { initRDKit } from 'selfies-js'
36
+ await initRDKit()
37
+ await isChemicallyValid('[C][C][O]') // true - molecule is chemically valid
38
+ await getCanonicalSmiles('[C][C][O]') // 'CCO' - canonical SMILES representation
39
+ await validateRoundtrip('CCO', '[C][C][O]') // true - structure preserved
40
+
41
+ // Properties
42
+ getMolecularWeight('[C][C][O]') // 46.07
43
+ getFormula('[C][C][O]') // 'C2H6O'
44
+
45
+ // Utilities
46
+ lenSelfies('[C][C][O]') // 3 (symbol count, not string length)
47
+
48
+ // Semantic constraints
49
+ const constraints = getSemanticConstraints()
50
+ console.log(constraints['C']) // 4 (max bonds for carbon)
51
+
52
+ // SVG Rendering (using RDKit.js)
53
+ import { renderSelfies } from 'selfies-js'
54
+
55
+ const svg = await renderSelfies('[C][C][O]', {
56
+ width: 300,
57
+ height: 300
58
+ })
59
+ ```
60
+
61
+ ## Installation
62
+
63
+ ```bash
64
+ npm install selfies-js
65
+ ```
66
+
67
+ ### Browser Usage (CDN)
68
+
69
+ For direct browser usage without a bundler:
70
+
71
+ ```html
72
+ <!-- Complete bundle: DSL + SELFIES encode/decode + utilities -->
73
+ <script src="https://github.com/Ghost---Shadow/selfies-js/releases/latest/download/selfies.umd.min.js"></script>
74
+ <script>
75
+ // SELFIES encoding/decoding
76
+ const encoded = SELFIES.encode('CC');
77
+ console.log(encoded); // '[C][C]'
78
+
79
+ const decoded = SELFIES.decode('[C][C][O]');
80
+ console.log(decoded); // 'CCO'
81
+
82
+ // DSL parsing
83
+ const parsed = SELFIES.parse('[methyl] = [C]');
84
+ console.log(parsed);
85
+
86
+ // Molecular properties
87
+ const mw = SELFIES.getMolecularWeight('[C][C][O]');
88
+ console.log(mw); // 46.07
89
+ </script>
90
+ ```
91
+
92
+ Download from the [latest release](https://github.com/Ghost---Shadow/selfies-js/releases/latest).
93
+
94
+ ## Features
95
+
96
+ - **Core:** Decode SELFIES to SMILES
97
+ - **Core:** Encode SMILES to SELFIES
98
+ - **Validation:** Syntax and semantic validation
99
+ - **Chemistry Validation:** RDKit-based molecular validity checking
100
+ - **Canonical SMILES:** Structure comparison and roundtrip validation
101
+ - **Properties:** Molecular weight and formula calculation
102
+ - **Constraints:** Customizable semantic constraints (bonding rules)
103
+ - **Utilities:** Symbol counting, alphabet extraction
104
+ - **DSL:** Define and resolve molecule libraries with named definitions
105
+ - **Imports:** Modular .selfies files with import support
106
+ - **CLI:** Command-line interface for executing .selfies files
107
+ - **Rendering:** SVG visualization of molecular structures
108
+
109
+ ## CLI Usage
110
+
111
+ The `selfies-js` CLI allows you to work with `.selfies` DSL files from the command line.
112
+
113
+ ### Commands
114
+
115
+ ```bash
116
+ # Execute a .selfies file and output resolved definitions
117
+ bun src/cli.js run molecules.selfies
118
+
119
+ # Output as SMILES instead of SELFIES
120
+ bun src/cli.js run molecules.selfies --format=smiles
121
+
122
+ # Validate a .selfies file for errors
123
+ bun src/cli.js validate molecules.selfies
124
+
125
+ # List all definitions in a file
126
+ bun src/cli.js list molecules.selfies
127
+
128
+ # Show help
129
+ bun src/cli.js help
130
+ ```
131
+
132
+ ### DSL Syntax
133
+
134
+ The `.selfies` DSL allows you to define named molecular fragments and compose them hierarchically:
135
+
136
+ ```selfies
137
+ # Comments start with #
138
+
139
+ # Basic definitions
140
+ [methyl] = [C]
141
+ [ethyl] = [C][C]
142
+ [hydroxyl] = [O]
143
+
144
+ # Composition - reference other definitions
145
+ [ethanol] = [ethyl][hydroxyl]
146
+
147
+ # Complex structures with branches
148
+ [isopropyl] = [C][Branch1][C][C][C]
149
+ [isopropanol] = [isopropyl][hydroxyl]
150
+
151
+ # Aromatic rings
152
+ [phenyl] = [C][=C][C][=C][C][=C][Ring1][=Branch1]
153
+ [toluene] = [methyl][phenyl]
154
+ ```
155
+
156
+ ### Import Syntax
157
+
158
+ Import definitions from other `.selfies` files:
159
+
160
+ ```selfies
161
+ # Import all definitions from another file
162
+ import "./fragments.selfies"
163
+
164
+ # Alternative syntax for importing all
165
+ import * from "./common.selfies"
166
+
167
+ # Import specific definitions only
168
+ import [methyl, ethyl, hydroxyl] from "./base.selfies"
169
+
170
+ # Use imported definitions
171
+ [my_molecule] = [methyl][hydroxyl]
172
+ ```
173
+
174
+ Imports support:
175
+ - **Relative paths** resolved from the importing file's location
176
+ - **Chained imports** (file A imports B, B imports C)
177
+ - **Circular import detection** with clear error messages
178
+ - **Selective imports** to only include what you need
179
+
180
+ ### Example Output
181
+
182
+ ```bash
183
+ $ bun src/cli.js run molecules.selfies --format=smiles
184
+ methyl: C
185
+ ethyl: CC
186
+ hydroxyl: O
187
+ ethanol: CCO
188
+ isopropyl: C(C)C
189
+ isopropanol: C(C)CO
190
+ phenyl: C1=CC=CC=C1
191
+ toluene: CC1=CC=CC=C1
192
+ ```
193
+
194
+ ## DSL API
195
+
196
+ ```javascript
197
+ import { parse, resolve, resolveAll } from 'selfies-js/dsl'
198
+ import { loadFile } from 'selfies-js/dsl'
199
+
200
+ // Load a file with imports
201
+ const loadedProgram = loadFile('molecules.selfies')
202
+
203
+ // Or parse source directly
204
+ const source = `
205
+ [methyl] = [C]
206
+ [ethanol] = [methyl][C][O]
207
+ `
208
+ const program = parse(source)
209
+
210
+ // Resolve a single definition
211
+ resolve(program, 'ethanol') // '[C][C][O]'
212
+
213
+ // Resolve with SMILES output
214
+ resolve(program, 'ethanol', { decode: true }) // 'CCO'
215
+
216
+ // Resolve all definitions
217
+ const all = resolveAll(program)
218
+ // Map { 'methyl' => '[C]', 'ethanol' => '[C][C][O]' }
219
+ ```
220
+
221
+ ## Visualization
222
+
223
+ The library uses **RDKit.js** for professional molecule rendering:
224
+
225
+ ```javascript
226
+ import { renderSelfies, initRDKit } from 'selfies-js'
227
+
228
+ // Initialize RDKit (async, only needed once)
229
+ await initRDKit()
230
+
231
+ // Render toluene
232
+ const svg = await renderSelfies('[C][C][=C][C][=C][C][=C][Ring1][=Branch1]', {
233
+ width: 300,
234
+ height: 300
235
+ })
236
+ ```
237
+
238
+ Features:
239
+ - Professional 2D coordinate generation via RDKit
240
+ - Proper skeletal formulas (carbons hidden)
241
+ - Correct benzene ring geometry
242
+ - Support for all bond types
243
+ - Stereochemistry notation
244
+ - Industry-standard rendering
245
+
246
+ ## Examples
247
+
248
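+ See the `examples/` directory of the [GitHub repository](https://github.com/Ghost---Shadow/selfies-js) for sample `.selfies` files (the directory is not part of the published npm package):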
249
+
250
+ - `base-fragments.selfies` - Reusable building blocks (alkyl groups, functional groups, halogens)
251
+ - `molecules-with-imports.selfies` - Demonstrates importing and composing molecules
252
+ - `organic-chemistry.selfies` - Alcohols, aldehydes, acids, amines, ethers
253
+ - `drug-fragments.selfies` - Pharmacophore fragments, drug-like building blocks
254
+ - `polymers.selfies` - Monomers, repeat units, oligomers
255
+
256
+ ## Known Limitations
257
+
258
+ The encoder/decoder handles most common organic molecules correctly. Some complex cases have known limitations:
259
+
260
+ - **Bracket atoms** in SMILES (`[nH]`, `[C@@]`, `[13C]`) - limited support
261
+ - **Fused aromatic ring systems** - some complex cases may not roundtrip correctly
262
+ - **Polycyclic structures** with multiple ring closures - partial support
263
+
264
+ For complete SELFIES support, use the original Python library: [aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
265
+
266
+ ## References
267
+
268
+ - **Original SELFIES Paper:** Krenn, M., Häse, F., Nigam, A., Friederich, P., & Aspuru-Guzik, A. (2020). Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation. *Machine Learning: Science and Technology*, 1(4), 045024. [DOI: 10.1088/2632-2153/aba947](https://doi.org/10.1088/2632-2153/aba947)
269
+
270
+ - **Python Implementation:** [github.com/aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
271
+
272
+ ## License
273
+
274
+ MIT
package/package.json ADDED
@@ -0,0 +1,65 @@
1
+ {
2
+ "name": "selfies-js",
3
+ "version": "0.1.0",
4
+ "description": "Pure JavaScript SELFIES encoder/decoder with DSL for molecular composition",
5
+ "type": "module",
6
+ "main": "src/index.js",
7
+ "bin": {
8
+ "selfies-js": "./src/cli.js"
9
+ },
10
+ "exports": {
11
+ ".": "./src/index.js",
12
+ "./dsl": "./src/dsl/index.js",
13
+ "./syntax": "./src/syntax.js"
14
+ },
15
+ "files": [
16
+ "src/**/*.js",
17
+ "README.md",
18
+ "LICENSE"
19
+ ],
20
+ "scripts": {
21
+ "test": "bun test",
22
+ "bench": "bun run bench/decode.bench.js",
23
+ "lint": "echo 'Linting...'; exit 0",
24
+ "prepublishOnly": "bun test",
25
+ "pack": "bun pm pack",
26
+ "publish:dry": "bun pm pack --dry-run"
27
+ },
28
+ "keywords": [
29
+ "selfies",
30
+ "smiles",
31
+ "chemistry",
32
+ "cheminformatics",
33
+ "molecules",
34
+ "molecular-design",
35
+ "drug-discovery",
36
+ "encoder",
37
+ "decoder",
38
+ "dsl",
39
+ "parser"
40
+ ],
41
+ "author": "SELFIES Contributors",
42
+ "license": "MIT",
43
+ "repository": {
44
+ "type": "git",
45
+ "url": "https://github.com/Ghost---Shadow/selfies-js.git"
46
+ },
47
+ "bugs": {
48
+ "url": "https://github.com/Ghost---Shadow/selfies-js/issues"
49
+ },
50
+ "homepage": "https://github.com/Ghost---Shadow/selfies-js#readme",
51
+ "engines": {
52
+ "node": ">=16.0.0"
53
+ },
54
+ "peerDependencies": {
55
+ "@rdkit/rdkit": "^2025.3.4-1.0.0"
56
+ },
57
+ "peerDependenciesMeta": {
58
+ "@rdkit/rdkit": {
59
+ "optional": true
60
+ }
61
+ },
62
+ "devDependencies": {
63
+ "@rdkit/rdkit": "^2025.3.4-1.0.0"
64
+ }
65
+ }
@@ -0,0 +1,150 @@
1
+ /**
2
+ * Alphabet - Valid SELFIES token sets
3
+ *
4
+ * Defines the full SELFIES alphabet and semantic-robust subset.
5
+ */
6
+
7
+ /**
8
+ * Full SELFIES alphabet - all valid tokens
9
+ * @type {Set<string> | null} Cache filled on the first getAlphabet() call; null until then
10
+ */
11
+ let _alphabet = null
12
+
13
+ /**
14
+ * Semantic-robust alphabet - tokens that are always chemically meaningful
15
+ * @type {Set<string> | null} Cache filled on the first getSemanticAlphabet() call; null until then
16
+ */
17
+ let _semanticAlphabet = null
18
+
19
/**
 * Returns the full SELFIES alphabet, building it on first use.
 *
 * The set is the union of the atom tokens and the structural tokens
 * (branch/ring). It is cached in the module-level `_alphabet`, so every call
 * returns the same Set instance.
 *
 * @returns {Set<string>} Set of all valid SELFIES tokens
 */
export function getAlphabet() {
  // Fast path: the cache was already populated by an earlier call.
  if (_alphabet !== null) {
    return _alphabet
  }

  _alphabet = new Set()

  // Atom tokens first (plain, double-bond and triple-bond variants) ...
  buildAtomTokens().forEach((symbol) => {
    _alphabet.add(symbol)
  })

  // ... then the structural tokens (branches and rings).
  buildStructuralTokens().forEach((symbol) => {
    _alphabet.add(symbol)
  })

  return _alphabet
}
42
+
43
/**
 * Returns the semantic-robust SELFIES alphabet, building it on first use.
 *
 * Only atom tokens (plain, double-bond and triple-bond variants) are included.
 * Structural tokens (Branch, Ring) are left out because their meaning depends
 * on context. The set is cached in the module-level `_semanticAlphabet`, so
 * every call returns the same Set instance.
 *
 * @returns {Set<string>} Set of semantically robust tokens
 */
export function getSemanticAlphabet() {
  // Fast path: the cache was already populated by an earlier call.
  if (_semanticAlphabet !== null) {
    return _semanticAlphabet
  }

  _semanticAlphabet = new Set()

  buildAtomTokens().forEach((symbol) => {
    _semanticAlphabet.add(symbol)
  })

  return _semanticAlphabet
}
64
+
65
/**
 * Tells whether a token belongs to the full SELFIES alphabet.
 *
 * @param {string} token - Token to look up, brackets included (e.g. '[C]')
 * @returns {boolean} True when the token is in the set returned by getAlphabet()
 */
export function isValidToken(token) {
  const knownTokens = getAlphabet()
  return knownTokens.has(token)
}
73
+
74
/**
 * Tells whether a token belongs to the semantic-robust alphabet.
 *
 * @param {string} token - Token to look up, brackets included (e.g. '[C]')
 * @returns {boolean} True when the token is in the set returned by
 *   getSemanticAlphabet()
 */
export function isSemanticRobust(token) {
  const robustTokens = getSemanticAlphabet()
  return robustTokens.has(token)
}
82
+
83
/**
 * Builds every atom token: each supported element combined with each bond
 * prefix.
 *
 * Tokens are grouped by element, and within an element they are ordered
 * plain, double bond (`=`), triple bond (`#`) — e.g. '[C]', '[=C]', '[#C]'.
 *
 * @returns {string[]} Array of atom tokens
 */
function buildAtomTokens() {
  // '' = no bond prefix, '=' = double bond, '#' = triple bond
  const bondPrefixes = ['', '=', '#']
  const supportedElements = ['C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I', 'B']

  return supportedElements.flatMap((symbol) =>
    bondPrefixes.map((prefix) => `[${prefix}${symbol}]`)
  )
}
99
+
100
/**
 * Builds the list of structural tokens (branch and ring symbols).
 *
 * The order of the returned array is: plain branches, plain rings,
 * double-bond branches, triple-bond branches, `-/` stereo rings and
 * `\/` stereo rings, each for sizes 1 to 3.
 *
 * @returns {string[]} Array of structural tokens
 */
function buildStructuralTokens() {
  return [
    // Plain branch and ring tokens
    '[Branch1]', '[Branch2]', '[Branch3]',
    '[Ring1]', '[Ring2]', '[Ring3]',

    // Bond-prefixed branch tokens. They also show up as the length/index
    // symbol that follows a Branch/Ring token, e.g. `[Ring1][=Branch1]`.
    '[=Branch1]', '[=Branch2]', '[=Branch3]',
    '[#Branch1]', '[#Branch2]', '[#Branch3]',

    // Stereochemistry ring tokens
    '[-/Ring1]', '[-/Ring2]', '[-/Ring3]',
    // In a JS string literal '\\' is ONE backslash, so these are the SELFIES
    // symbols [\/Ring1], [\/Ring2], [\/Ring3]. Writing four backslashes here
    // would register a malformed two-backslash token and reject the real one.
    '[\\/Ring1]', '[\\/Ring2]', '[\\/Ring3]',
  ]
}
122
+
123
/**
 * Collects the distinct SELFIES symbols used across a collection of SELFIES
 * strings.
 *
 * Symbols are found with a plain regex that extracts every `[...]` group; the
 * function does not go through tokenizer.js and does not check the symbols
 * against the known alphabet. Symbols appear in the result in order of first
 * occurrence.
 *
 * Based on selfies-py's get_alphabet_from_selfies() function.
 * Reference: selfies-py/selfies/utils/selfies_utils.py::get_alphabet_from_selfies()
 *
 * Example:
 *   const alphabet = getAlphabetFromSelfies(['[C][C][O]', '[N][C][=O]'])
 *   // Set { '[C]', '[O]', '[N]', '[=O]' }
 *
 * @param {Iterable<string>} selfiesIterable - Collection of SELFIES strings
 * @returns {Set<string>} Set of unique SELFIES symbols found
 */
export function getAlphabetFromSelfies(selfiesIterable) {
  // TODO: switch to the shared tokenizer instead of this regex extraction.
  const seenSymbols = new Set()

  for (const entry of selfiesIterable) {
    // A bracketed symbol: '[' + one or more non-']' characters + ']'
    const found = entry.match(/\[[^\]]+\]/g)
    if (!found) {
      // No bracketed symbol in this string; nothing to record.
      continue
    }
    found.forEach((symbol) => {
      seenSymbols.add(symbol)
    })
  }

  return seenSymbols
}
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Tests for SELFIES alphabet
3
+ */
4
+
5
+ import { describe, test, expect } from 'bun:test'
6
+ import { getAlphabet, getSemanticAlphabet, isValidToken, isSemanticRobust } from './alphabet.js'
7
+
8
describe('getAlphabet', () => {
  // Asserts that every listed token is (or is not) a member of the full alphabet.
  const expectMembership = (tokens, expected) => {
    const alphabet = getAlphabet()
    for (const token of tokens) {
      expect(alphabet.has(token)).toBe(expected)
    }
  }

  test('includes basic atoms', () => {
    expectMembership(['[C]', '[N]', '[O]'], true)
  })

  test('includes bond modifiers', () => {
    expectMembership(['[=C]', '[#N]'], true)
  })

  test('includes multi-char elements', () => {
    expectMembership(['[Cl]', '[Br]'], true)
  })

  test('includes structural tokens', () => {
    expectMembership(['[Branch1]', '[Ring1]'], true)
  })

  test('excludes invalid tokens', () => {
    expectMembership(['[Xyz]', '[123]'], false)
  })
})
40
+
41
describe('getSemanticAlphabet', () => {
  // Asserts that every listed token is (or is not) in the semantic-robust alphabet.
  const expectMembership = (tokens, expected) => {
    const alphabet = getSemanticAlphabet()
    for (const token of tokens) {
      expect(alphabet.has(token)).toBe(expected)
    }
  }

  test('includes atoms and bonds only', () => {
    expectMembership(['[C]', '[=C]'], true)
  })

  test('excludes structural tokens', () => {
    expectMembership(['[Branch1]', '[Ring1]'], false)
  })

  test('is subset of full alphabet', () => {
    const full = getAlphabet()
    // Every semantic-robust token must also be a valid token of the full set.
    getSemanticAlphabet().forEach((token) => {
      expect(full.has(token)).toBe(true)
    })
  })
})
62
+
63
describe('isValidToken', () => {
  test('validates basic atoms', () => {
    for (const atom of ['[C]', '[O]']) {
      expect(isValidToken(atom)).toBe(true)
    }
  })

  test('rejects invalid tokens', () => {
    const unknownToken = '[Xyz]'
    expect(isValidToken(unknownToken)).toBe(false)
  })
})
73
+
74
describe('isSemanticRobust', () => {
  test('atoms are semantic robust', () => {
    const atomToken = '[C]'
    expect(isSemanticRobust(atomToken)).toBe(true)
  })

  test('structural tokens are not semantic robust', () => {
    const branchToken = '[Branch1]'
    expect(isSemanticRobust(branchToken)).toBe(false)
  })
})