selfies-js 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -0
- package/package.json +1 -1
- package/src/decoder.js +74 -3
- package/src/dsl/analyzer.js +47 -4
- package/src/dsl/parser.js +26 -7
- package/src/dsl/resolver.test.js +18 -17
- package/src/dsl/valenceValidator.js +3 -19
package/README.md
CHANGED
|
@@ -112,6 +112,11 @@ selfies list molecules.selfies
|
|
|
112
112
|
# Reference other fragments
|
|
113
113
|
[ethanol] = [ethyl][hydroxyl]
|
|
114
114
|
|
|
115
|
+
# Use repeat() macro for patterns
|
|
116
|
+
[benzene] = repeat([C][=C], 3)[Ring1][=Branch1]
|
|
117
|
+
[carbon_chain] = repeat([C], 10)
|
|
118
|
+
[polymer] = repeat([monomer], 5)
|
|
119
|
+
|
|
115
120
|
# Import from other files
|
|
116
121
|
import "./other-file.selfies" # import all
|
|
117
122
|
import [methyl, ethyl] from "./fragments.selfies" # import specific
|
|
@@ -156,6 +161,33 @@ await initRDKit()
|
|
|
156
161
|
const svg = await renderSelfies('[C][C][O]', { width: 300, height: 300 })
|
|
157
162
|
```
|
|
158
163
|
|
|
164
|
+
## Repeat Macro
|
|
165
|
+
|
|
166
|
+
The `repeat()` macro repeats a molecular pattern a given number of times, which is useful for polymers and long chains:
|
|
167
|
+
|
|
168
|
+
```selfies
|
|
169
|
+
# Benzene ring
|
|
170
|
+
[benzene] = repeat([C][=C], 3)[Ring1][=Branch1]
|
|
171
|
+
|
|
172
|
+
# Carbon chains
|
|
173
|
+
[decane] = repeat([C], 10)
|
|
174
|
+
[pentadecane] = repeat([C], 15)
|
|
175
|
+
|
|
176
|
+
# Polymer repeat units
|
|
177
|
+
[PE_unit] = [C][C]
|
|
178
|
+
[polyethylene_trimer] = repeat([PE_unit], 3)
|
|
179
|
+
|
|
180
|
+
# References work too
|
|
181
|
+
[monomer] = [C][Branch1][C][Cl][C]
|
|
182
|
+
[pvc_hexamer] = repeat([monomer], 6)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
The pattern can be:
|
|
186
|
+
- **Primitive tokens**: `repeat([C], 10)` for a 10-carbon chain
|
|
187
|
+
- **Complex patterns**: `repeat([C][=C], 3)` for alternating double bonds
|
|
188
|
+
- **Named references**: `repeat([unit], 5)` to repeat a defined fragment
|
|
189
|
+
- **Combined**: `[molecule] = [N]repeat([C], 3)[O]` with tokens before/after
|
|
190
|
+
|
|
159
191
|
## VS Code Extension
|
|
160
192
|
|
|
161
193
|
Get live visualization as you author `.selfies` files. See the molecular structure update line-by-line as you navigate your code.
|
package/package.json
CHANGED
package/src/decoder.js
CHANGED
|
@@ -415,10 +415,81 @@ export function deriveBranch(tokens, startIndex, maxDerive, initState, rootAtom,
|
|
|
415
415
|
const token = tokens[startIndex + consumed]
|
|
416
416
|
const content = token.slice(1, -1)
|
|
417
417
|
|
|
418
|
-
//
|
|
419
|
-
if (content.includes('
|
|
420
|
-
|
|
418
|
+
// Handle Ring tokens inside branch
|
|
419
|
+
if (content.includes('Ring') || content.includes('ng')) {
|
|
420
|
+
const ringInfo = processRingSymbol(token)
|
|
421
|
+
if (!ringInfo) {
|
|
422
|
+
throw new Error(`Invalid ring token in branch: ${token}`)
|
|
423
|
+
}
|
|
424
|
+
if (state === 0) {
|
|
425
|
+
throw new Error(`Ring ${token} at invalid state 0 inside branch`)
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
const { order: requestedOrder, L } = ringInfo
|
|
429
|
+
const [bondOrder, nextState] = nextRingState(requestedOrder, state)
|
|
430
|
+
|
|
431
|
+
consumed++
|
|
432
|
+
derived++ // Ring counts toward derive limit
|
|
433
|
+
|
|
434
|
+
if (consumed >= tokens.length - startIndex) {
|
|
435
|
+
state = nextState
|
|
436
|
+
break
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
const Q = readIndexFromTokens(tokens, startIndex + consumed, L)
|
|
440
|
+
consumed += Q.consumed
|
|
441
|
+
derived += Q.consumed // Q index tokens also count
|
|
442
|
+
|
|
443
|
+
const targetIndex = Math.max(0, prevAtomIndex - (Q.value + 1))
|
|
444
|
+
if (targetIndex !== prevAtomIndex) {
|
|
445
|
+
handleRingClosure(targetIndex, prevAtomIndex, bondOrder, bonds, rings)
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
state = nextState
|
|
449
|
+
continue
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
// Handle nested Branch tokens - recursively process them
|
|
453
|
+
if (content.includes('Branch') || content.includes('ch')) {
|
|
454
|
+
const branchInfo = processBranchSymbol(token)
|
|
455
|
+
if (!branchInfo) {
|
|
456
|
+
throw new Error(`Invalid branch token in branch: ${token}`)
|
|
457
|
+
}
|
|
458
|
+
if (state <= 1) {
|
|
459
|
+
throw new Error(`Branch ${token} at invalid state ${state} inside branch`)
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
const { order: branchOrder, L } = branchInfo
|
|
463
|
+
const [branchInitState, nextState] = nextBranchState(branchOrder, state)
|
|
464
|
+
|
|
421
465
|
consumed++
|
|
466
|
+
derived++ // Branch counts toward derive limit
|
|
467
|
+
|
|
468
|
+
if (consumed >= tokens.length - startIndex) {
|
|
469
|
+
state = nextState
|
|
470
|
+
break
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
// Read Q index
|
|
474
|
+
const Q = readIndexFromTokens(tokens, startIndex + consumed, L)
|
|
475
|
+
consumed += Q.consumed
|
|
476
|
+
derived += Q.consumed // Q index tokens count
|
|
477
|
+
|
|
478
|
+
// Recursively derive nested branch
|
|
479
|
+
const nestedResult = deriveBranch(
|
|
480
|
+
tokens,
|
|
481
|
+
startIndex + consumed,
|
|
482
|
+
Q.value + 1,
|
|
483
|
+
branchInitState,
|
|
484
|
+
prevAtomIndex,
|
|
485
|
+
atoms,
|
|
486
|
+
bonds,
|
|
487
|
+
rings
|
|
488
|
+
)
|
|
489
|
+
consumed += nestedResult.consumed
|
|
490
|
+
derived += nestedResult.derived
|
|
491
|
+
|
|
492
|
+
state = nextState
|
|
422
493
|
continue
|
|
423
494
|
}
|
|
424
495
|
|
package/src/dsl/analyzer.js
CHANGED
|
@@ -25,10 +25,23 @@ export function getDependencies(program, name) {
|
|
|
25
25
|
|
|
26
26
|
const dependencies = []
|
|
27
27
|
for (const token of definition.tokens) {
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
28
|
+
if (typeof token === 'object' && token.type === 'REPEAT_CALL') {
|
|
29
|
+
// Extract dependencies from repeat pattern
|
|
30
|
+
const patternTokens = tokenizePattern(token.pattern)
|
|
31
|
+
for (const patternToken of patternTokens) {
|
|
32
|
+
const tokenName = patternToken.slice(1, -1) // Remove brackets
|
|
33
|
+
if (program.definitions.has(tokenName)) {
|
|
34
|
+
if (!dependencies.includes(tokenName)) {
|
|
35
|
+
dependencies.push(tokenName)
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
} else if (typeof token === 'string') {
|
|
40
|
+
const tokenName = token.slice(1, -1) // Remove brackets
|
|
41
|
+
if (program.definitions.has(tokenName)) {
|
|
42
|
+
if (!dependencies.includes(tokenName)) {
|
|
43
|
+
dependencies.push(tokenName)
|
|
44
|
+
}
|
|
32
45
|
}
|
|
33
46
|
}
|
|
34
47
|
}
|
|
@@ -36,6 +49,36 @@ export function getDependencies(program, name) {
|
|
|
36
49
|
return dependencies
|
|
37
50
|
}
|
|
38
51
|
|
|
52
|
+
/**
 * Tokenizes a pattern string into SELFIES tokens.
 *
 * A token runs from a '[' to the first ']' that follows it (both included).
 * Characters outside of brackets, and a trailing '[' that is never closed,
 * are ignored.
 *
 * @param {string} pattern - Pattern string like '[C][=C]'
 * @returns {string[]} Array of tokens in order of appearance; empty when the
 *   pattern is not a string or contains no complete bracketed token
 */
function tokenizePattern(pattern) {
  // Guard against a missing pattern so callers get "no tokens" instead of a
  // TypeError from reading a property of null/undefined.
  if (typeof pattern !== 'string') {
    return []
  }

  // '[' followed by any run of non-']' characters and the closing ']'.
  // String.prototype.match with the g flag returns null when nothing matches.
  return pattern.match(/\[[^\]]*\]/g) || []
}
|
|
81
|
+
|
|
39
82
|
/**
|
|
40
83
|
* Gets the names that depend on a definition
|
|
41
84
|
* @param {Object} program - Program object
|
package/src/dsl/parser.js
CHANGED
|
@@ -222,18 +222,37 @@ function parseRepeatCall(tokens, startIndex) {
|
|
|
222
222
|
}
|
|
223
223
|
i++
|
|
224
224
|
|
|
225
|
-
//
|
|
226
|
-
|
|
227
|
-
|
|
225
|
+
// Collect SELFIES_TOKENs as pattern until we hit COMMA
|
|
226
|
+
const patternTokens = []
|
|
227
|
+
const patternStart = i
|
|
228
|
+
|
|
229
|
+
while (i < tokens.length &&
|
|
230
|
+
tokens[i].type !== TokenType.COMMA &&
|
|
231
|
+
tokens[i].type !== TokenType.RPAREN &&
|
|
232
|
+
tokens[i].type !== TokenType.NEWLINE &&
|
|
233
|
+
tokens[i].type !== TokenType.EOF) {
|
|
234
|
+
if (tokens[i].type === TokenType.SELFIES_TOKEN) {
|
|
235
|
+
patternTokens.push(tokens[i].value)
|
|
236
|
+
i++
|
|
237
|
+
} else {
|
|
238
|
+
const skipToEnd = skipToRParenOrEOL(tokens, i)
|
|
239
|
+
return {
|
|
240
|
+
error: createDiagnostic('Expected SELFIES tokens or name references in pattern', 'error', tokens[i]),
|
|
241
|
+
nextIndex: skipToEnd
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
if (patternTokens.length === 0) {
|
|
228
247
|
const skipToEnd = skipToRParenOrEOL(tokens, i)
|
|
229
248
|
return {
|
|
230
|
-
error: createDiagnostic('
|
|
249
|
+
error: createDiagnostic('Pattern cannot be empty', 'error', tokens[patternStart] || repeatToken),
|
|
231
250
|
nextIndex: skipToEnd
|
|
232
251
|
}
|
|
233
252
|
}
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
253
|
+
|
|
254
|
+
// Join pattern tokens into a single string
|
|
255
|
+
const pattern = patternTokens.join('')
|
|
237
256
|
|
|
238
257
|
// Expect COMMA
|
|
239
258
|
if (i >= tokens.length || tokens[i].type !== TokenType.COMMA) {
|
package/src/dsl/resolver.test.js
CHANGED
|
@@ -100,81 +100,82 @@ describe('resolveAll', () => {
|
|
|
100
100
|
|
|
101
101
|
describe('repeat macro', () => {
|
|
102
102
|
test('repeats a simple token sequence', () => {
|
|
103
|
-
const program = parse('[triple_carbon] = repeat(
|
|
103
|
+
const program = parse('[triple_carbon] = repeat([C], 3)')
|
|
104
104
|
expect(resolve(program, 'triple_carbon')).toBe('[C][C][C]')
|
|
105
105
|
})
|
|
106
106
|
|
|
107
107
|
test('repeats a complex token sequence', () => {
|
|
108
|
-
const program = parse('[benzene] = repeat(
|
|
108
|
+
const program = parse('[benzene] = repeat([C][=C], 3)[Ring1][=Branch1]')
|
|
109
109
|
expect(resolve(program, 'benzene')).toBe('[C][=C][C][=C][C][=C][Ring1][=Branch1]')
|
|
110
110
|
})
|
|
111
111
|
|
|
112
112
|
test('repeats with count of 1', () => {
|
|
113
|
-
const program = parse('[single] = repeat(
|
|
113
|
+
const program = parse('[single] = repeat([C][O], 1)')
|
|
114
114
|
expect(resolve(program, 'single')).toBe('[C][O]')
|
|
115
115
|
})
|
|
116
116
|
|
|
117
117
|
test('repeats with count of 0 produces empty sequence', () => {
|
|
118
|
-
const program = parse('[empty] = [C]repeat(
|
|
118
|
+
const program = parse('[empty] = [C]repeat([O], 0)[C]')
|
|
119
119
|
expect(resolve(program, 'empty')).toBe('[C][C]')
|
|
120
120
|
})
|
|
121
121
|
|
|
122
122
|
test('repeat with reference to other definition', () => {
|
|
123
|
-
const source = '[unit] = [C][=C]\n[triple] = repeat(
|
|
123
|
+
const source = '[unit] = [C][=C]\n[triple] = repeat([unit], 3)'
|
|
124
124
|
const program = parse(source)
|
|
125
125
|
expect(resolve(program, 'triple')).toBe('[C][=C][C][=C][C][=C]')
|
|
126
126
|
})
|
|
127
127
|
|
|
128
128
|
test('multiple repeat calls in one definition', () => {
|
|
129
|
-
const program = parse('[chain] = repeat(
|
|
129
|
+
const program = parse('[chain] = repeat([C], 2)repeat([O], 2)')
|
|
130
130
|
expect(resolve(program, 'chain')).toBe('[C][C][O][O]')
|
|
131
131
|
})
|
|
132
132
|
|
|
133
133
|
test('repeat combined with regular tokens', () => {
|
|
134
|
-
const program = parse('[molecule] = [N]repeat(
|
|
134
|
+
const program = parse('[molecule] = [N]repeat([C], 3)[O]')
|
|
135
135
|
expect(resolve(program, 'molecule')).toBe('[N][C][C][C][O]')
|
|
136
136
|
})
|
|
137
137
|
|
|
138
138
|
test('repeat with nested brackets in pattern', () => {
|
|
139
|
-
const program = parse('[branched] = repeat(
|
|
139
|
+
const program = parse('[branched] = repeat([C][Branch1][C][O], 2)')
|
|
140
140
|
expect(resolve(program, 'branched')).toBe('[C][Branch1][C][O][C][Branch1][C][O]')
|
|
141
141
|
})
|
|
142
142
|
|
|
143
143
|
test('throws error on invalid repeat count', () => {
|
|
144
|
-
const program = parse('[bad] = repeat(
|
|
144
|
+
const program = parse('[bad] = repeat([C], -1)')
|
|
145
145
|
expect(() => resolve(program, 'bad')).toThrow(/count must be/)
|
|
146
146
|
})
|
|
147
147
|
|
|
148
148
|
test('throws error on non-numeric count', () => {
|
|
149
|
-
const program = parse('[bad] = repeat(
|
|
149
|
+
const program = parse('[bad] = repeat([C], abc)')
|
|
150
150
|
expect(() => resolve(program, 'bad')).toThrow()
|
|
151
151
|
})
|
|
152
152
|
|
|
153
153
|
test('throws error on missing arguments', () => {
|
|
154
|
-
const program = parse('[bad] = repeat(
|
|
154
|
+
const program = parse('[bad] = repeat([C])')
|
|
155
155
|
expect(() => resolve(program, 'bad')).toThrow()
|
|
156
156
|
})
|
|
157
157
|
|
|
158
|
-
test('throws error on
|
|
159
|
-
const program = parse('[bad] = repeat(
|
|
160
|
-
|
|
158
|
+
test('throws error on empty pattern', () => {
|
|
159
|
+
const program = parse('[bad] = repeat(, 3)')
|
|
160
|
+
// Parse error results in empty definition
|
|
161
|
+
expect(() => resolve(program, 'bad')).toThrow(/no tokens/)
|
|
161
162
|
})
|
|
162
163
|
|
|
163
164
|
test('simple polymer-like chain', () => {
|
|
164
|
-
const source = '[ch2] = [C]\n[polymer_chain] = repeat(
|
|
165
|
+
const source = '[ch2] = [C]\n[polymer_chain] = repeat([ch2], 5)'
|
|
165
166
|
const program = parse(source)
|
|
166
167
|
expect(resolve(program, 'polymer_chain')).toBe('[C][C][C][C][C]')
|
|
167
168
|
})
|
|
168
169
|
|
|
169
170
|
test('polymer chain with decode', () => {
|
|
170
|
-
const source = '[ch2] = [C]\n[polymer_chain] = repeat(
|
|
171
|
+
const source = '[ch2] = [C]\n[polymer_chain] = repeat([ch2], 5)'
|
|
171
172
|
const program = parse(source)
|
|
172
173
|
expect(resolve(program, 'polymer_chain', { decode: true })).toBe('CCCCC')
|
|
173
174
|
})
|
|
174
175
|
|
|
175
176
|
test('vinyl chloride monomer units', () => {
|
|
176
177
|
// Each monomer as a branch structure for proper chemistry
|
|
177
|
-
const source = '[monomer] = [C][Branch1][C][Cl][C]\n[polymer] = repeat(
|
|
178
|
+
const source = '[monomer] = [C][Branch1][C][Cl][C]\n[polymer] = repeat([monomer], 3)'
|
|
178
179
|
const program = parse(source)
|
|
179
180
|
// This creates a branched structure: C(Cl)CC(Cl)CC(Cl)C
|
|
180
181
|
expect(resolve(program, 'polymer')).toBe('[C][Branch1][C][Cl][C][C][Branch1][C][Cl][C][C][Branch1][C][Cl][C]')
|
package/src/dsl/valenceValidator.js
CHANGED
|
@@ -25,26 +25,10 @@ export function validateValence(selfies, defName) {
|
|
|
25
25
|
const errors = []
|
|
26
26
|
|
|
27
27
|
try {
|
|
28
|
-
// Try to decode to SMILES - this
|
|
28
|
+
// Try to decode to SMILES - this validates the structure
|
|
29
|
+
// The decoder's state machine properly enforces valence rules
|
|
29
30
|
const smiles = decode(selfies)
|
|
30
|
-
|
|
31
|
-
// Parse the SELFIES to extract atoms and bonds
|
|
32
|
-
const tokens = tokenizeSelfies(selfies)
|
|
33
|
-
const atomBonds = calculateBonds(tokens)
|
|
34
|
-
|
|
35
|
-
// Check each atom's valence
|
|
36
|
-
for (const [atom, bondCount] of Object.entries(atomBonds)) {
|
|
37
|
-
const { element, charge } = parseAtom(atom)
|
|
38
|
-
const maxBonds = getBondingCapacity(element, charge)
|
|
39
|
-
|
|
40
|
-
if (bondCount > maxBonds) {
|
|
41
|
-
errors.push({
|
|
42
|
-
message: `Valence error in '${defName}': ${element} has ${bondCount} bonds but max is ${maxBonds}`,
|
|
43
|
-
severity: 'error',
|
|
44
|
-
definitionName: defName
|
|
45
|
-
})
|
|
46
|
-
}
|
|
47
|
-
}
|
|
31
|
+
// If decoding succeeds, the SELFIES is valid
|
|
48
32
|
} catch (error) {
|
|
49
33
|
// If decoding fails, it's a structural error
|
|
50
34
|
errors.push({
|