selfies-js 0.2.0 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +121 -197
- package/package.json +1 -1
- package/src/dsl/lexer.js +72 -7
- package/src/dsl/parser.js +120 -1
- package/src/dsl/resolver.js +87 -1
- package/src/dsl/resolver.test.js +83 -0
package/README.md
CHANGED
|
@@ -1,74 +1,54 @@
|
|
|
1
1
|
<div align="center">
|
|
2
|
-
<img src="
|
|
2
|
+
<img src="icon.svg" alt="Toluene molecule" width="200"/>
|
|
3
3
|
<h1>selfies-js</h1>
|
|
4
|
-
<p>
|
|
4
|
+
<p><strong>Molecular fragments as reusable code.</strong></p>
|
|
5
5
|
</div>
|
|
6
6
|
|
|
7
|
-
##
|
|
8
|
-
|
|
9
|
-
**SELFIES** (SELF-referencIng Embedded Strings) is a 100% robust molecular string representation. Unlike SMILES, every SELFIES string corresponds to a valid molecule, making it ideal for machine learning and generative models in chemistry.
|
|
10
|
-
|
|
11
|
-
This library is a JavaScript port inspired by the original Python implementation: **[aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)**
|
|
7
|
+
## Why SELFIES?
|
|
12
8
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
## Overview
|
|
9
|
+
SELFIES was designed for machine learning — every string is a valid molecule, eliminating syntax errors in generative models. This DSL extends that principle: named fragments are easier for LLMs to compose correctly than raw atom strings, and undefined references fail loudly instead of producing silent errors.
|
|
16
10
|
|
|
17
|
-
|
|
18
|
-
import {
|
|
19
|
-
decode, encode, isValid,
|
|
20
|
-
getMolecularWeight, getFormula,
|
|
21
|
-
lenSelfies, getSemanticConstraints,
|
|
22
|
-
isChemicallyValid, getCanonicalSmiles, validateRoundtrip
|
|
23
|
-
} from 'selfies-js'
|
|
11
|
+
## The Problem
|
|
24
12
|
|
|
25
|
-
|
|
26
|
-
decode('[C][C][O]') // 'CCO'
|
|
13
|
+
Pharmaceutical SMILES strings are unreadable:
|
|
27
14
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
// Syntax validation
|
|
32
|
-
isValid('[C][C][O]') // true
|
|
33
|
-
|
|
34
|
-
// Chemistry validation (requires RDKit)
|
|
35
|
-
import { initRDKit } from 'selfies-js'
|
|
36
|
-
await initRDKit()
|
|
37
|
-
await isChemicallyValid('[C][C][O]') // true - molecule is chemically valid
|
|
38
|
-
await getCanonicalSmiles('[C][C][O]') // 'CCO' - canonical SMILES representation
|
|
39
|
-
await validateRoundtrip('CCO', '[C][C][O]') // true - structure preserved
|
|
15
|
+
```
|
|
16
|
+
CC(=O)Nc1ccc(O)cc1
|
|
17
|
+
```
|
|
40
18
|
|
|
41
|
-
|
|
42
|
-
getMolecularWeight('[C][C][O]') // 46.07
|
|
43
|
-
getFormula('[C][C][O]') // 'C2H6O'
|
|
19
|
+
What is that? Acetaminophen. But you'd never know by looking at it.
|
|
44
20
|
|
|
45
|
-
|
|
46
|
-
lenSelfies('[C][C][O]') // 3 (symbol count, not string length)
|
|
21
|
+
And when LLMs generate SMILES, they hallucinate invalid structures. SELFIES fixes the validity problem. The DSL fixes the readability problem.
|
|
47
22
|
|
|
48
|
-
|
|
49
|
-
const constraints = getSemanticConstraints()
|
|
50
|
-
console.log(constraints['C']) // 4 (max bonds for carbon)
|
|
23
|
+
## The Solution
|
|
51
24
|
|
|
52
|
-
|
|
53
|
-
import { renderSelfies } from 'selfies-js'
|
|
25
|
+
Define named fragments. Compose molecules like code. Import and reuse across projects.
|
|
54
26
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
27
|
+
```selfies
|
|
28
|
+
# base-fragments.selfies - your team's shared library
|
|
29
|
+
[methyl] = [C]
|
|
30
|
+
[amino] = [N]
|
|
31
|
+
[hydroxyl] = [O]
|
|
32
|
+
[phenyl] = [C][=C][C][=C][C][=C][Ring1][=Branch1]
|
|
59
33
|
```
|
|
60
34
|
|
|
61
|
-
|
|
35
|
+
```selfies
|
|
36
|
+
# molecules.selfies - today's work
|
|
37
|
+
import [methyl, amino, hydroxyl, phenyl] from "./base-fragments.selfies"
|
|
62
38
|
|
|
63
|
-
|
|
39
|
+
[methanol] = [methyl][hydroxyl]
|
|
40
|
+
[aniline] = [phenyl][amino]
|
|
41
|
+
[toluene] = [methyl][phenyl]
|
|
42
|
+
```
|
|
64
43
|
|
|
65
|
-
|
|
44
|
+
```bash
|
|
45
|
+
$ selfies run molecules.selfies --format=smiles
|
|
46
|
+
methanol: CO
|
|
47
|
+
aniline: NC1=CC=CC=C1
|
|
48
|
+
toluene: CC1=CC=CC=C1
|
|
49
|
+
```
|
|
66
50
|
|
|
67
|
-
|
|
68
|
-
- Live SELFIES ↔ SMILES conversion
|
|
69
|
-
- Real-time molecular properties
|
|
70
|
-
- Built-in test suite
|
|
71
|
-
- Syntax highlighting
|
|
51
|
+
Your molecule definitions are now version-controlled, diffable, and shareable.
|
|
72
52
|
|
|
73
53
|
## Installation
|
|
74
54
|
|
|
@@ -76,210 +56,154 @@ Features:
|
|
|
76
56
|
npm install selfies-js
|
|
77
57
|
```
|
|
78
58
|
|
|
79
|
-
|
|
59
|
+
## Quick Start
|
|
80
60
|
|
|
81
|
-
|
|
61
|
+
**1. Create a fragment library** (`fragments.selfies`):
|
|
82
62
|
|
|
83
|
-
```
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
console.log(encoded); // '[C][C]'
|
|
90
|
-
|
|
91
|
-
const decoded = SELFIES.decode('[C][C][O]');
|
|
92
|
-
console.log(decoded); // 'CCO'
|
|
93
|
-
|
|
94
|
-
// DSL parsing
|
|
95
|
-
const parsed = SELFIES.parse('[methyl] = [C]');
|
|
96
|
-
console.log(parsed);
|
|
97
|
-
|
|
98
|
-
// Molecular properties
|
|
99
|
-
const mw = SELFIES.getMolecularWeight('[C][C][O]');
|
|
100
|
-
console.log(mw); // 46.07
|
|
101
|
-
</script>
|
|
63
|
+
```selfies
|
|
64
|
+
[methyl] = [C]
|
|
65
|
+
[ethyl] = [C][C]
|
|
66
|
+
[hydroxyl] = [O]
|
|
67
|
+
[carbonyl] = [C][=O]
|
|
68
|
+
[carboxyl] = [C][=O][O]
|
|
102
69
|
```
|
|
103
70
|
|
|
104
|
-
|
|
71
|
+
**2. Compose molecules** (`molecules.selfies`):
|
|
105
72
|
|
|
106
|
-
|
|
73
|
+
```selfies
|
|
74
|
+
import "./fragments.selfies"
|
|
107
75
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
- **Chemistry Validation:** RDKit-based molecular validity checking
|
|
112
|
-
- **Canonical SMILES:** Structure comparison and roundtrip validation
|
|
113
|
-
- **Properties:** Molecular weight and formula calculation
|
|
114
|
-
- **Constraints:** Customizable semantic constraints (bonding rules)
|
|
115
|
-
- **Utilities:** Symbol counting, alphabet extraction
|
|
116
|
-
- **DSL:** Define and resolve molecule libraries with named definitions
|
|
117
|
-
- **Imports:** Modular .selfies files with import support
|
|
118
|
-
- **CLI:** Command-line interface for executing .selfies files
|
|
119
|
-
- **Rendering:** SVG visualization of molecular structures
|
|
76
|
+
[ethanol] = [ethyl][hydroxyl]
|
|
77
|
+
[acetic_acid] = [methyl][carboxyl]
|
|
78
|
+
```
|
|
120
79
|
|
|
121
|
-
|
|
80
|
+
**3. Compile to SMILES**:
|
|
122
81
|
|
|
123
|
-
|
|
82
|
+
```bash
|
|
83
|
+
$ selfies run molecules.selfies --format=smiles
|
|
84
|
+
ethanol: CCO
|
|
85
|
+
acetic_acid: CC(=O)O
|
|
86
|
+
```
|
|
124
87
|
|
|
125
|
-
|
|
88
|
+
## CLI Commands
|
|
126
89
|
|
|
127
90
|
```bash
|
|
128
|
-
#
|
|
129
|
-
|
|
91
|
+
# Compile a .selfies file to SMILES
|
|
92
|
+
selfies run molecules.selfies --format=smiles
|
|
130
93
|
|
|
131
|
-
# Output as
|
|
132
|
-
|
|
94
|
+
# Output as SELFIES (default)
|
|
95
|
+
selfies run molecules.selfies
|
|
133
96
|
|
|
134
|
-
# Validate
|
|
135
|
-
|
|
97
|
+
# Validate syntax without running
|
|
98
|
+
selfies validate molecules.selfies
|
|
136
99
|
|
|
137
100
|
# List all definitions in a file
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
# Show help
|
|
141
|
-
bun src/cli.js help
|
|
101
|
+
selfies list molecules.selfies
|
|
142
102
|
```
|
|
143
103
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
The `.selfies` DSL allows you to define named molecular fragments and compose them hierarchically:
|
|
104
|
+
## DSL Syntax
|
|
147
105
|
|
|
148
106
|
```selfies
|
|
149
107
|
# Comments start with #
|
|
150
108
|
|
|
151
|
-
#
|
|
152
|
-
[
|
|
153
|
-
[ethyl] = [C][C]
|
|
154
|
-
[hydroxyl] = [O]
|
|
109
|
+
# Define a fragment
|
|
110
|
+
[name] = [SELFIES][tokens][here]
|
|
155
111
|
|
|
156
|
-
#
|
|
112
|
+
# Reference other fragments
|
|
157
113
|
[ethanol] = [ethyl][hydroxyl]
|
|
158
114
|
|
|
159
|
-
#
|
|
160
|
-
|
|
161
|
-
[
|
|
162
|
-
|
|
163
|
-
# Aromatic rings
|
|
164
|
-
[phenyl] = [C][=C][C][=C][C][=C][Ring1][=Branch1]
|
|
165
|
-
[toluene] = [methyl][phenyl]
|
|
115
|
+
# Import from other files
|
|
116
|
+
import "./other-file.selfies" # import all
|
|
117
|
+
import [methyl, ethyl] from "./fragments.selfies" # import specific
|
|
166
118
|
```
|
|
167
119
|
|
|
168
|
-
|
|
120
|
+
## JavaScript API
|
|
169
121
|
|
|
170
|
-
|
|
122
|
+
```javascript
|
|
123
|
+
import { parse, resolve, resolveAll, loadFile } from 'selfies-js/dsl'
|
|
171
124
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
import "./fragments.selfies"
|
|
125
|
+
// Load a file with imports
|
|
126
|
+
const program = loadFile('molecules.selfies')
|
|
175
127
|
|
|
176
|
-
|
|
177
|
-
|
|
128
|
+
// Resolve a single definition to SELFIES
|
|
129
|
+
resolve(program, 'ethanol') // '[C][C][O]'
|
|
178
130
|
|
|
179
|
-
|
|
180
|
-
|
|
131
|
+
// Resolve to SMILES
|
|
132
|
+
resolve(program, 'ethanol', { decode: true }) // 'CCO'
|
|
181
133
|
|
|
182
|
-
|
|
183
|
-
|
|
134
|
+
// Resolve all definitions
|
|
135
|
+
resolveAll(program) // Map { 'ethanol' => '[C][C][O]', ... }
|
|
184
136
|
```
|
|
185
137
|
|
|
186
|
-
|
|
187
|
-
- **Relative paths** resolved from the importing file's location
|
|
188
|
-
- **Chained imports** (file A imports B, B imports C)
|
|
189
|
-
- **Circular import detection** with clear error messages
|
|
190
|
-
- **Selective imports** to only include what you need
|
|
138
|
+
### Core SELFIES functions
|
|
191
139
|
|
|
192
|
-
|
|
140
|
+
```javascript
|
|
141
|
+
import { encode, decode, isValid, getMolecularWeight, getFormula } from 'selfies-js'
|
|
193
142
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
ethanol: CCO
|
|
200
|
-
isopropyl: C(C)C
|
|
201
|
-
isopropanol: C(C)CO
|
|
202
|
-
phenyl: C1=CC=CC=C1
|
|
203
|
-
toluene: CC1=CC=CC=C1
|
|
143
|
+
decode('[C][C][O]') // 'CCO'
|
|
144
|
+
encode('CCO') // '[C][C][O]'
|
|
145
|
+
isValid('[C][C][O]') // true
|
|
146
|
+
getMolecularWeight('[C][C][O]') // 46.07
|
|
147
|
+
getFormula('[C][C][O]') // 'C2H6O'
|
|
204
148
|
```
|
|
205
149
|
|
|
206
|
-
|
|
150
|
+
### SVG Rendering
|
|
207
151
|
|
|
208
152
|
```javascript
|
|
209
|
-
import {
|
|
210
|
-
import { loadFile } from 'selfies-js/dsl'
|
|
153
|
+
import { renderSelfies, initRDKit } from 'selfies-js'
|
|
211
154
|
|
|
212
|
-
|
|
213
|
-
const
|
|
155
|
+
await initRDKit()
|
|
156
|
+
const svg = await renderSelfies('[C][C][O]', { width: 300, height: 300 })
|
|
157
|
+
```
|
|
214
158
|
|
|
215
|
-
|
|
216
|
-
const source = `
|
|
217
|
-
[methyl] = [C]
|
|
218
|
-
[ethanol] = [methyl][C][O]
|
|
219
|
-
`
|
|
220
|
-
const program = parse(source)
|
|
159
|
+
## VS Code Extension
|
|
221
160
|
|
|
222
|
-
|
|
223
|
-
resolve(program, 'ethanol') // '[C][C][O]'
|
|
161
|
+
Get live visualization as you author `.selfies` files. See the molecular structure update line-by-line as you navigate your code.
|
|
224
162
|
|
|
225
|
-
|
|
226
|
-
resolve(program, 'ethanol', { decode: true }) // 'CCO'
|
|
163
|
+
**[Install from VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=ghost---shadow.selfies-lang)**
|
|
227
164
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
165
|
+
## Browser Usage
|
|
166
|
+
|
|
167
|
+
```html
|
|
168
|
+
<script src="https://github.com/Ghost---Shadow/selfies-js/releases/latest/download/selfies.umd.min.js"></script>
|
|
169
|
+
<script>
|
|
170
|
+
SELFIES.encode('CCO') // '[C][C][O]'
|
|
171
|
+
SELFIES.decode('[C][C][O]') // 'CCO'
|
|
172
|
+
SELFIES.getMolecularWeight('[C][C][O]') // 46.07
|
|
173
|
+
</script>
|
|
231
174
|
```
|
|
232
175
|
|
|
233
|
-
##
|
|
176
|
+
## Examples
|
|
234
177
|
|
|
235
|
-
|
|
178
|
+
See the [`examples/`](./examples) directory:
|
|
236
179
|
|
|
237
|
-
|
|
238
|
-
|
|
180
|
+
- `base-fragments.selfies` — Reusable building blocks (alkyl groups, functional groups, halogens)
|
|
181
|
+
- `molecules-with-imports.selfies` — Composing molecules from imported fragments
|
|
182
|
+
- `selective-import.selfies` — Importing only what you need
|
|
239
183
|
|
|
240
|
-
|
|
241
|
-
await initRDKit()
|
|
184
|
+
## What is SELFIES?
|
|
242
185
|
|
|
243
|
-
|
|
244
|
-
const svg = await renderSelfies('[C][C][=C][C][=C][C][=C][Ring1][=Branch1]', {
|
|
245
|
-
width: 300,
|
|
246
|
-
height: 300
|
|
247
|
-
})
|
|
248
|
-
```
|
|
186
|
+
SELFIES (SELF-referencIng Embedded Strings) is a molecular string representation where **every string is a valid molecule**. Unlike SMILES, you can't write an invalid SELFIES string. This makes it ideal for machine learning and generative chemistry.
|
|
249
187
|
|
|
250
|
-
|
|
251
|
-
- Professional 2D coordinate generation via RDKit
|
|
252
|
-
- Proper skeletal formulas (carbons hidden)
|
|
253
|
-
- Correct benzene ring geometry
|
|
254
|
-
- Support for all bond types
|
|
255
|
-
- Stereochemistry notation
|
|
256
|
-
- Industry-standard rendering
|
|
188
|
+
This library is a JavaScript port of the Python implementation: [aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
|
|
257
189
|
|
|
258
|
-
|
|
190
|
+
> Krenn, M., Häse, F., Nigam, A., Friederich, P., & Aspuru-Guzik, A. (2020). Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation. *Machine Learning: Science and Technology*, 1(4), 045024.
|
|
259
191
|
|
|
260
|
-
|
|
192
|
+
## Interactive Playground
|
|
261
193
|
|
|
262
|
-
|
|
263
|
-
- `molecules-with-imports.selfies` - Demonstrates importing and composing molecules
|
|
264
|
-
- `organic-chemistry.selfies` - Alcohols, aldehydes, acids, amines, ethers
|
|
265
|
-
- `drug-fragments.selfies` - Pharmacophore fragments, drug-like building blocks
|
|
266
|
-
- `polymers.selfies` - Monomers, repeat units, oligomers
|
|
194
|
+
Try it live: **[https://ghost---shadow.github.io/selfies-js/](https://ghost---shadow.github.io/selfies-js/)**
|
|
267
195
|
|
|
268
196
|
## Known Limitations
|
|
269
197
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
- **
|
|
273
|
-
- **Fused aromatic ring systems** - some complex cases may not roundtrip correctly
|
|
274
|
-
- **Polycyclic structures** with multiple ring closures - partial support
|
|
275
|
-
|
|
276
|
-
For complete SELFIES support, use the original Python library: [aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
|
|
198
|
+
- **Bracket atoms** in SMILES (`[nH]`, `[C@@]`, `[13C]`) — limited support
|
|
199
|
+
- **Fused aromatic ring systems** — some complex cases may not roundtrip
|
|
200
|
+
- **Polycyclic structures** — partial support
|
|
277
201
|
|
|
278
|
-
|
|
202
|
+
For complete SELFIES support, use the Python library: [aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
|
|
279
203
|
|
|
280
|
-
|
|
204
|
+
## Testimonies
|
|
281
205
|
|
|
282
|
-
|
|
206
|
+

|
|
283
207
|
|
|
284
208
|
## License
|
|
285
209
|
|
package/package.json
CHANGED
package/src/dsl/lexer.js
CHANGED
|
@@ -20,11 +20,17 @@ export const TokenType = {
|
|
|
20
20
|
// Import-related tokens
|
|
21
21
|
IMPORT: 'IMPORT', // import keyword
|
|
22
22
|
FROM: 'FROM', // from keyword
|
|
23
|
-
STRING: 'STRING', // "path/to/file.selfies"
|
|
23
|
+
STRING: 'STRING', // "path/to/file.selfies" or 'pattern'
|
|
24
24
|
STAR: 'STAR', // * (wildcard import)
|
|
25
25
|
COMMA: 'COMMA', // , (separator in selective imports)
|
|
26
26
|
LBRACKET: 'LBRACKET', // [ (for selective import list)
|
|
27
27
|
RBRACKET: 'RBRACKET', // ] (for selective import list)
|
|
28
|
+
|
|
29
|
+
// Repeat macro tokens
|
|
30
|
+
REPEAT: 'REPEAT', // repeat keyword
|
|
31
|
+
LPAREN: 'LPAREN', // (
|
|
32
|
+
RPAREN: 'RPAREN', // )
|
|
33
|
+
NUMBER: 'NUMBER', // numeric literal
|
|
28
34
|
}
|
|
29
35
|
|
|
30
36
|
/**
|
|
@@ -135,14 +141,43 @@ export function lex(source) {
|
|
|
135
141
|
continue
|
|
136
142
|
}
|
|
137
143
|
|
|
138
|
-
//
|
|
139
|
-
if (char === '
|
|
144
|
+
// Left parenthesis
|
|
145
|
+
if (char === '(') {
|
|
146
|
+
tokens.push({
|
|
147
|
+
type: TokenType.LPAREN,
|
|
148
|
+
value: '(',
|
|
149
|
+
line,
|
|
150
|
+
column,
|
|
151
|
+
range: [i, i + 1]
|
|
152
|
+
})
|
|
153
|
+
i++
|
|
154
|
+
column++
|
|
155
|
+
continue
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// Right parenthesis
|
|
159
|
+
if (char === ')') {
|
|
160
|
+
tokens.push({
|
|
161
|
+
type: TokenType.RPAREN,
|
|
162
|
+
value: ')',
|
|
163
|
+
line,
|
|
164
|
+
column,
|
|
165
|
+
range: [i, i + 1]
|
|
166
|
+
})
|
|
167
|
+
i++
|
|
168
|
+
column++
|
|
169
|
+
continue
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
// String literal (for import paths and repeat patterns)
|
|
173
|
+
if (char === '"' || char === "'") {
|
|
140
174
|
const stringStart = i
|
|
141
|
-
|
|
175
|
+
const quote = char
|
|
176
|
+
let stringValue = quote
|
|
142
177
|
i++
|
|
143
178
|
column++
|
|
144
179
|
|
|
145
|
-
while (i < source.length && source[i] !==
|
|
180
|
+
while (i < source.length && source[i] !== quote && source[i] !== '\n') {
|
|
146
181
|
stringValue += source[i]
|
|
147
182
|
i++
|
|
148
183
|
column++
|
|
@@ -152,7 +187,7 @@ export function lex(source) {
|
|
|
152
187
|
throw new Error(`Unclosed string at line ${line}, column ${startColumn}`)
|
|
153
188
|
}
|
|
154
189
|
|
|
155
|
-
stringValue +=
|
|
190
|
+
stringValue += quote
|
|
156
191
|
i++
|
|
157
192
|
column++
|
|
158
193
|
|
|
@@ -166,7 +201,7 @@ export function lex(source) {
|
|
|
166
201
|
continue
|
|
167
202
|
}
|
|
168
203
|
|
|
169
|
-
// Keywords and identifiers (import, from)
|
|
204
|
+
// Keywords and identifiers (import, from, repeat)
|
|
170
205
|
if (isAlpha(char)) {
|
|
171
206
|
const wordStart = i
|
|
172
207
|
let wordValue = ''
|
|
@@ -182,6 +217,8 @@ export function lex(source) {
|
|
|
182
217
|
type = TokenType.IMPORT
|
|
183
218
|
} else if (wordValue === 'from') {
|
|
184
219
|
type = TokenType.FROM
|
|
220
|
+
} else if (wordValue === 'repeat') {
|
|
221
|
+
type = TokenType.REPEAT
|
|
185
222
|
}
|
|
186
223
|
|
|
187
224
|
tokens.push({
|
|
@@ -194,6 +231,34 @@ export function lex(source) {
|
|
|
194
231
|
continue
|
|
195
232
|
}
|
|
196
233
|
|
|
234
|
+
// Numbers (including negative)
|
|
235
|
+
if (char >= '0' && char <= '9' || (char === '-' && i + 1 < source.length && source[i + 1] >= '0' && source[i + 1] <= '9')) {
|
|
236
|
+
const numberStart = i
|
|
237
|
+
let numberValue = ''
|
|
238
|
+
|
|
239
|
+
// Handle negative sign
|
|
240
|
+
if (char === '-') {
|
|
241
|
+
numberValue += char
|
|
242
|
+
i++
|
|
243
|
+
column++
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
while (i < source.length && source[i] >= '0' && source[i] <= '9') {
|
|
247
|
+
numberValue += source[i]
|
|
248
|
+
i++
|
|
249
|
+
column++
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
tokens.push({
|
|
253
|
+
type: TokenType.NUMBER,
|
|
254
|
+
value: numberValue,
|
|
255
|
+
line,
|
|
256
|
+
column: startColumn,
|
|
257
|
+
range: [numberStart, i]
|
|
258
|
+
})
|
|
259
|
+
continue
|
|
260
|
+
}
|
|
261
|
+
|
|
197
262
|
// Bracketed token (could be NAME or SELFIES_TOKEN)
|
|
198
263
|
if (char === '[') {
|
|
199
264
|
const tokenStart = i
|
package/src/dsl/parser.js
CHANGED
|
@@ -131,7 +131,7 @@ function parseDefinition(tokens, startIndex) {
|
|
|
131
131
|
}
|
|
132
132
|
i++
|
|
133
133
|
|
|
134
|
-
// 3. Collect SELFIES_TOKENs until NEWLINE or EOF
|
|
134
|
+
// 3. Collect SELFIES_TOKENs and repeat calls until NEWLINE or EOF
|
|
135
135
|
const definitionTokens = []
|
|
136
136
|
const tokenStart = nameToken.range[0]
|
|
137
137
|
let tokenEnd = tokens[i - 1].range[1]
|
|
@@ -144,6 +144,17 @@ function parseDefinition(tokens, startIndex) {
|
|
|
144
144
|
definitionTokens.push(tokens[i].value)
|
|
145
145
|
tokenEnd = tokens[i].range[1]
|
|
146
146
|
i++
|
|
147
|
+
} else if (tokens[i].type === TokenType.REPEAT) {
|
|
148
|
+
// Parse repeat call: repeat(pattern, count)
|
|
149
|
+
const repeatResult = parseRepeatCall(tokens, i)
|
|
150
|
+
if (repeatResult.error) {
|
|
151
|
+
errors.push(repeatResult.error)
|
|
152
|
+
i = repeatResult.nextIndex
|
|
153
|
+
} else {
|
|
154
|
+
definitionTokens.push(repeatResult.repeatToken)
|
|
155
|
+
tokenEnd = repeatResult.range[1]
|
|
156
|
+
i = repeatResult.nextIndex
|
|
157
|
+
}
|
|
147
158
|
} else {
|
|
148
159
|
errors.push(createDiagnostic(
|
|
149
160
|
`Unexpected token in definition body: ${tokens[i].type}`,
|
|
@@ -183,6 +194,114 @@ function parseDefinition(tokens, startIndex) {
|
|
|
183
194
|
return { definition, errors, nextIndex: i }
|
|
184
195
|
}
|
|
185
196
|
|
|
197
|
+
/**
|
|
198
|
+
* Parses a repeat call: repeat(pattern, count)
|
|
199
|
+
* @param {Object[]} tokens - Token array
|
|
200
|
+
* @param {number} startIndex - Index of REPEAT token
|
|
201
|
+
* @returns {Object} Result with repeatToken or error
|
|
202
|
+
*/
|
|
203
|
+
function parseRepeatCall(tokens, startIndex) {
|
|
204
|
+
let i = startIndex
|
|
205
|
+
const repeatToken = tokens[i]
|
|
206
|
+
|
|
207
|
+
// Expect REPEAT
|
|
208
|
+
if (tokens[i].type !== TokenType.REPEAT) {
|
|
209
|
+
return {
|
|
210
|
+
error: createDiagnostic('Expected repeat keyword', 'error', tokens[i]),
|
|
211
|
+
nextIndex: i + 1
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
i++
|
|
215
|
+
|
|
216
|
+
// Expect LPAREN
|
|
217
|
+
if (i >= tokens.length || tokens[i].type !== TokenType.LPAREN) {
|
|
218
|
+
return {
|
|
219
|
+
error: createDiagnostic('Expected \'(\' after repeat', 'error', tokens[i] || repeatToken),
|
|
220
|
+
nextIndex: i
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
i++
|
|
224
|
+
|
|
225
|
+
// Expect STRING (pattern)
|
|
226
|
+
if (i >= tokens.length || tokens[i].type !== TokenType.STRING) {
|
|
227
|
+
// Skip to closing paren or end of line on error
|
|
228
|
+
const skipToEnd = skipToRParenOrEOL(tokens, i)
|
|
229
|
+
return {
|
|
230
|
+
error: createDiagnostic('Expected string pattern as first argument', 'error', tokens[i] || repeatToken),
|
|
231
|
+
nextIndex: skipToEnd
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
const patternToken = tokens[i]
|
|
235
|
+
const pattern = patternToken.value.slice(1, -1) // Remove quotes
|
|
236
|
+
i++
|
|
237
|
+
|
|
238
|
+
// Expect COMMA
|
|
239
|
+
if (i >= tokens.length || tokens[i].type !== TokenType.COMMA) {
|
|
240
|
+
const skipToEnd = skipToRParenOrEOL(tokens, i)
|
|
241
|
+
return {
|
|
242
|
+
error: createDiagnostic('Expected \',\' after pattern', 'error', tokens[i] || patternToken),
|
|
243
|
+
nextIndex: skipToEnd
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
i++
|
|
247
|
+
|
|
248
|
+
// Expect NUMBER (count)
|
|
249
|
+
if (i >= tokens.length || tokens[i].type !== TokenType.NUMBER) {
|
|
250
|
+
const skipToEnd = skipToRParenOrEOL(tokens, i)
|
|
251
|
+
return {
|
|
252
|
+
error: createDiagnostic('Expected number as second argument', 'error', tokens[i] || patternToken),
|
|
253
|
+
nextIndex: skipToEnd
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
const countToken = tokens[i]
|
|
257
|
+
const count = parseInt(countToken.value, 10)
|
|
258
|
+
i++
|
|
259
|
+
|
|
260
|
+
// Expect RPAREN
|
|
261
|
+
if (i >= tokens.length || tokens[i].type !== TokenType.RPAREN) {
|
|
262
|
+
const skipToEnd = skipToRParenOrEOL(tokens, i)
|
|
263
|
+
return {
|
|
264
|
+
error: createDiagnostic('Expected \')\' after count', 'error', tokens[i] || countToken),
|
|
265
|
+
nextIndex: skipToEnd
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
const rparenToken = tokens[i]
|
|
269
|
+
i++
|
|
270
|
+
|
|
271
|
+
// Create a special repeat token
|
|
272
|
+
return {
|
|
273
|
+
repeatToken: {
|
|
274
|
+
type: 'REPEAT_CALL',
|
|
275
|
+
pattern,
|
|
276
|
+
count,
|
|
277
|
+
range: [repeatToken.range[0], rparenToken.range[1]]
|
|
278
|
+
},
|
|
279
|
+
range: [repeatToken.range[0], rparenToken.range[1]],
|
|
280
|
+
nextIndex: i
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
/**
|
|
285
|
+
* Skips tokens until we find RPAREN or reach end of line
|
|
286
|
+
* @param {Object[]} tokens - Token array
|
|
287
|
+
* @param {number} startIndex - Index to start skipping from
|
|
288
|
+
* @returns {number} Index after RPAREN or at NEWLINE/EOF
|
|
289
|
+
*/
|
|
290
|
+
function skipToRParenOrEOL(tokens, startIndex) {
|
|
291
|
+
let i = startIndex
|
|
292
|
+
while (i < tokens.length &&
|
|
293
|
+
tokens[i].type !== TokenType.RPAREN &&
|
|
294
|
+
tokens[i].type !== TokenType.NEWLINE &&
|
|
295
|
+
tokens[i].type !== TokenType.EOF) {
|
|
296
|
+
i++
|
|
297
|
+
}
|
|
298
|
+
// If we found RPAREN, move past it
|
|
299
|
+
if (i < tokens.length && tokens[i].type === TokenType.RPAREN) {
|
|
300
|
+
i++
|
|
301
|
+
}
|
|
302
|
+
return i
|
|
303
|
+
}
|
|
304
|
+
|
|
186
305
|
/**
|
|
187
306
|
* Creates a diagnostic object
|
|
188
307
|
* @param {string} message - Error/warning message
|
package/src/dsl/resolver.js
CHANGED
|
@@ -102,11 +102,21 @@ function resolveRecursive(program, name, visiting = new Set()) {
|
|
|
102
102
|
|
|
103
103
|
// Get definition
|
|
104
104
|
const definition = program.definitions.get(name)
|
|
105
|
+
|
|
106
|
+
// Check if definition has tokens (parse errors can result in empty definitions)
|
|
107
|
+
if (!definition.tokens || definition.tokens.length === 0) {
|
|
108
|
+
throw new ResolveError(`Definition '${name}' has no tokens (possibly due to parse errors)`, name)
|
|
109
|
+
}
|
|
110
|
+
|
|
105
111
|
const resolvedTokens = []
|
|
106
112
|
|
|
107
113
|
// Resolve each token
|
|
108
114
|
for (const token of definition.tokens) {
|
|
109
|
-
if (
|
|
115
|
+
if (typeof token === 'object' && token.type === 'REPEAT_CALL') {
|
|
116
|
+
// It's a repeat call - expand it
|
|
117
|
+
const expandedTokens = expandRepeat(token, program, visiting)
|
|
118
|
+
resolvedTokens.push(...expandedTokens)
|
|
119
|
+
} else if (isReference(token, program)) {
|
|
110
120
|
// It's a reference to another definition - resolve it recursively
|
|
111
121
|
const refName = token.slice(1, -1) // Remove brackets
|
|
112
122
|
const refResolved = resolveRecursive(program, refName, visiting)
|
|
@@ -123,6 +133,82 @@ function resolveRecursive(program, name, visiting = new Set()) {
|
|
|
123
133
|
return resolvedTokens
|
|
124
134
|
}
|
|
125
135
|
|
|
136
|
+
/**
|
|
137
|
+
* Expands a repeat call by repeating the pattern
|
|
138
|
+
* @param {Object} repeatToken - Repeat token object with pattern and count
|
|
139
|
+
* @param {Object} program - Program object
|
|
140
|
+
* @param {Set<string>} visiting - Set of currently visiting definitions
|
|
141
|
+
* @returns {string[]} Expanded tokens
|
|
142
|
+
*/
|
|
143
|
+
function expandRepeat(repeatToken, program, visiting) {
|
|
144
|
+
const { pattern, count } = repeatToken
|
|
145
|
+
|
|
146
|
+
// Validate count
|
|
147
|
+
if (count < 0) {
|
|
148
|
+
throw new ResolveError(`Repeat count must be non-negative, got ${count}`)
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
if (!Number.isInteger(count)) {
|
|
152
|
+
throw new ResolveError(`Repeat count must be an integer, got ${count}`)
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// Tokenize the pattern string to extract individual SELFIES tokens
|
|
156
|
+
const patternTokens = tokenizePattern(pattern)
|
|
157
|
+
|
|
158
|
+
// Resolve each token in the pattern (they might be references)
|
|
159
|
+
const resolvedPatternTokens = []
|
|
160
|
+
for (const token of patternTokens) {
|
|
161
|
+
if (isReference(token, program)) {
|
|
162
|
+
// Recursively resolve the reference
|
|
163
|
+
const refName = token.slice(1, -1)
|
|
164
|
+
const refResolved = resolveRecursive(program, refName, visiting)
|
|
165
|
+
resolvedPatternTokens.push(...refResolved)
|
|
166
|
+
} else {
|
|
167
|
+
resolvedPatternTokens.push(token)
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
// Repeat the resolved pattern
|
|
172
|
+
const result = []
|
|
173
|
+
for (let i = 0; i < count; i++) {
|
|
174
|
+
result.push(...resolvedPatternTokens)
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
return result
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Tokenizes a pattern string into SELFIES tokens
|
|
182
|
+
* @param {string} pattern - Pattern string like '[C][=C]'
|
|
183
|
+
* @returns {string[]} Array of tokens
|
|
184
|
+
*/
|
|
185
|
+
function tokenizePattern(pattern) {
|
|
186
|
+
const tokens = []
|
|
187
|
+
let i = 0
|
|
188
|
+
|
|
189
|
+
while (i < pattern.length) {
|
|
190
|
+
if (pattern[i] === '[') {
|
|
191
|
+
// Find the closing bracket
|
|
192
|
+
let j = i + 1
|
|
193
|
+
while (j < pattern.length && pattern[j] !== ']') {
|
|
194
|
+
j++
|
|
195
|
+
}
|
|
196
|
+
if (j >= pattern.length) {
|
|
197
|
+
throw new ResolveError(`Unclosed bracket in pattern: ${pattern}`)
|
|
198
|
+
}
|
|
199
|
+
tokens.push(pattern.slice(i, j + 1))
|
|
200
|
+
i = j + 1
|
|
201
|
+
} else if (pattern[i] === ' ' || pattern[i] === '\t') {
|
|
202
|
+
// Skip whitespace
|
|
203
|
+
i++
|
|
204
|
+
} else {
|
|
205
|
+
throw new ResolveError(`Invalid character in pattern: ${pattern[i]}`)
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
return tokens
|
|
210
|
+
}
|
|
211
|
+
|
|
126
212
|
/**
|
|
127
213
|
* Checks if a token is a reference to another definition
|
|
128
214
|
* @param {string} token - Token to check
|
package/src/dsl/resolver.test.js
CHANGED
|
@@ -97,3 +97,86 @@ describe('resolveAll', () => {
|
|
|
97
97
|
expect(resolved.size).toBe(0)
|
|
98
98
|
})
|
|
99
99
|
})
|
|
100
|
+
|
|
101
|
+
describe('repeat macro', () => {
|
|
102
|
+
test('repeats a simple token sequence', () => {
|
|
103
|
+
const program = parse('[triple_carbon] = repeat(\'[C]\', 3)')
|
|
104
|
+
expect(resolve(program, 'triple_carbon')).toBe('[C][C][C]')
|
|
105
|
+
})
|
|
106
|
+
|
|
107
|
+
test('repeats a complex token sequence', () => {
|
|
108
|
+
const program = parse('[benzene] = repeat(\'[C][=C]\', 3)[Ring1][=Branch1]')
|
|
109
|
+
expect(resolve(program, 'benzene')).toBe('[C][=C][C][=C][C][=C][Ring1][=Branch1]')
|
|
110
|
+
})
|
|
111
|
+
|
|
112
|
+
test('repeats with count of 1', () => {
|
|
113
|
+
const program = parse('[single] = repeat(\'[C][O]\', 1)')
|
|
114
|
+
expect(resolve(program, 'single')).toBe('[C][O]')
|
|
115
|
+
})
|
|
116
|
+
|
|
117
|
+
test('repeats with count of 0 produces empty sequence', () => {
|
|
118
|
+
const program = parse('[empty] = [C]repeat(\'[O]\', 0)[C]')
|
|
119
|
+
expect(resolve(program, 'empty')).toBe('[C][C]')
|
|
120
|
+
})
|
|
121
|
+
|
|
122
|
+
test('repeat with reference to other definition', () => {
|
|
123
|
+
const source = '[unit] = [C][=C]\n[triple] = repeat(\'[unit]\', 3)'
|
|
124
|
+
const program = parse(source)
|
|
125
|
+
expect(resolve(program, 'triple')).toBe('[C][=C][C][=C][C][=C]')
|
|
126
|
+
})
|
|
127
|
+
|
|
128
|
+
test('multiple repeat calls in one definition', () => {
|
|
129
|
+
const program = parse('[chain] = repeat(\'[C]\', 2)repeat(\'[O]\', 2)')
|
|
130
|
+
expect(resolve(program, 'chain')).toBe('[C][C][O][O]')
|
|
131
|
+
})
|
|
132
|
+
|
|
133
|
+
test('repeat combined with regular tokens', () => {
|
|
134
|
+
const program = parse('[molecule] = [N]repeat(\'[C]\', 3)[O]')
|
|
135
|
+
expect(resolve(program, 'molecule')).toBe('[N][C][C][C][O]')
|
|
136
|
+
})
|
|
137
|
+
|
|
138
|
+
test('repeat with nested brackets in pattern', () => {
|
|
139
|
+
const program = parse('[branched] = repeat(\'[C][Branch1][C][O]\', 2)')
|
|
140
|
+
expect(resolve(program, 'branched')).toBe('[C][Branch1][C][O][C][Branch1][C][O]')
|
|
141
|
+
})
|
|
142
|
+
|
|
143
|
+
test('throws error on invalid repeat count', () => {
|
|
144
|
+
const program = parse('[bad] = repeat(\'[C]\', -1)')
|
|
145
|
+
expect(() => resolve(program, 'bad')).toThrow(/count must be/)
|
|
146
|
+
})
|
|
147
|
+
|
|
148
|
+
test('throws error on non-numeric count', () => {
|
|
149
|
+
const program = parse('[bad] = repeat(\'[C]\', abc)')
|
|
150
|
+
expect(() => resolve(program, 'bad')).toThrow()
|
|
151
|
+
})
|
|
152
|
+
|
|
153
|
+
test('throws error on missing arguments', () => {
|
|
154
|
+
const program = parse('[bad] = repeat(\'[C]\')')
|
|
155
|
+
expect(() => resolve(program, 'bad')).toThrow()
|
|
156
|
+
})
|
|
157
|
+
|
|
158
|
+
test('throws error on malformed repeat syntax', () => {
|
|
159
|
+
const program = parse('[bad] = repeat([C], 3)')
|
|
160
|
+
expect(() => resolve(program, 'bad')).toThrow()
|
|
161
|
+
})
|
|
162
|
+
|
|
163
|
+
test('simple polymer-like chain', () => {
|
|
164
|
+
const source = '[ch2] = [C]\n[polymer_chain] = repeat(\'[ch2]\', 5)'
|
|
165
|
+
const program = parse(source)
|
|
166
|
+
expect(resolve(program, 'polymer_chain')).toBe('[C][C][C][C][C]')
|
|
167
|
+
})
|
|
168
|
+
|
|
169
|
+
test('polymer chain with decode', () => {
|
|
170
|
+
const source = '[ch2] = [C]\n[polymer_chain] = repeat(\'[ch2]\', 5)'
|
|
171
|
+
const program = parse(source)
|
|
172
|
+
expect(resolve(program, 'polymer_chain', { decode: true })).toBe('CCCCC')
|
|
173
|
+
})
|
|
174
|
+
|
|
175
|
+
test('vinyl chloride monomer units', () => {
|
|
176
|
+
// Each monomer as a branch structure for proper chemistry
|
|
177
|
+
const source = '[monomer] = [C][Branch1][C][Cl][C]\n[polymer] = repeat(\'[monomer]\', 3)'
|
|
178
|
+
const program = parse(source)
|
|
179
|
+
// This creates a branched structure: C(Cl)CC(Cl)CC(Cl)C
|
|
180
|
+
expect(resolve(program, 'polymer')).toBe('[C][Branch1][C][Cl][C][C][Branch1][C][Cl][C][C][Branch1][C][Cl][C]')
|
|
181
|
+
})
|
|
182
|
+
})
|