selfies-js 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -112,6 +112,11 @@ selfies list molecules.selfies
112
112
  # Reference other fragments
113
113
  [ethanol] = [ethyl][hydroxyl]
114
114
 
115
+ # Use repeat() macro for patterns
116
+ [benzene] = repeat([C][=C], 3)[Ring1][=Branch1]
117
+ [carbon_chain] = repeat([C], 10)
118
+ [polymer] = repeat([monomer], 5)
119
+
115
120
  # Import from other files
116
121
  import "./other-file.selfies" # import all
117
122
  import [methyl, ethyl] from "./fragments.selfies" # import specific
@@ -156,6 +161,33 @@ await initRDKit()
156
161
  const svg = await renderSelfies('[C][C][O]', { width: 300, height: 300 })
157
162
  ```
158
163
 
164
+ ## Repeat Macro
165
+
166
+ The `repeat()` macro repeats a molecular pattern a fixed number of times, which is useful for polymers and long chains:
167
+
168
+ ```selfies
169
+ # Benzene ring
170
+ [benzene] = repeat([C][=C], 3)[Ring1][=Branch1]
171
+
172
+ # Carbon chains
173
+ [decane] = repeat([C], 10)
174
+ [pentadecane] = repeat([C], 15)
175
+
176
+ # Polymer repeat units
177
+ [PE_unit] = [C][C]
178
+ [polyethylene_trimer] = repeat([PE_unit], 3)
179
+
180
+ # References work too
181
+ [monomer] = [C][Branch1][C][Cl][C]
182
+ [pvc_hexamer] = repeat([monomer], 6)
183
+ ```
184
+
185
+ The pattern can be:
186
+ - **Primitive tokens**: `repeat([C], 10)` for a 10-carbon chain
187
+ - **Complex patterns**: `repeat([C][=C], 3)` for alternating double bonds
188
+ - **Named references**: `repeat([unit], 5)` to repeat a defined fragment
189
+ - **Combined**: `[molecule] = [N]repeat([C], 3)[O]`, with ordinary tokens before and after the `repeat()` call
190
+
159
191
  ## VS Code Extension
160
192
 
161
193
  Get live visualization as you author `.selfies` files. See the molecular structure update line-by-line as you navigate your code.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "selfies-js",
3
- "version": "0.3.2",
3
+ "version": "0.3.4",
4
4
  "description": "Pure JavaScript SELFIES encoder/decoder with DSL for molecular composition",
5
5
  "type": "module",
6
6
  "main": "src/index.js",
@@ -25,10 +25,23 @@ export function getDependencies(program, name) {
25
25
 
26
26
  const dependencies = []
27
27
  for (const token of definition.tokens) {
28
- const tokenName = token.slice(1, -1) // Remove brackets
29
- if (program.definitions.has(tokenName)) {
30
- if (!dependencies.includes(tokenName)) {
31
- dependencies.push(tokenName)
28
+ if (typeof token === 'object' && token.type === 'REPEAT_CALL') {
29
+ // Extract dependencies from repeat pattern
30
+ const patternTokens = tokenizePattern(token.pattern)
31
+ for (const patternToken of patternTokens) {
32
+ const tokenName = patternToken.slice(1, -1) // Remove brackets
33
+ if (program.definitions.has(tokenName)) {
34
+ if (!dependencies.includes(tokenName)) {
35
+ dependencies.push(tokenName)
36
+ }
37
+ }
38
+ }
39
+ } else if (typeof token === 'string') {
40
+ const tokenName = token.slice(1, -1) // Remove brackets
41
+ if (program.definitions.has(tokenName)) {
42
+ if (!dependencies.includes(tokenName)) {
43
+ dependencies.push(tokenName)
44
+ }
32
45
  }
33
46
  }
34
47
  }
@@ -36,6 +49,36 @@ export function getDependencies(program, name) {
36
49
  return dependencies
37
50
  }
38
51
 
52
+ /**
53
+ * Tokenizes a pattern string into bracketed SELFIES tokens; characters outside brackets and an unclosed '[' are skipped silently
54
+ * @param {string} pattern - Pattern string like '[C][=C]'
55
+ * @returns {string[]} Array of tokens
56
+ */
57
+ function tokenizePattern(pattern) {
58
+ const tokens = []
59
+ let i = 0
60
+
61
+ while (i < pattern.length) {
62
+ if (pattern[i] === '[') {
63
+ // Find the closing bracket
64
+ let j = i + 1
65
+ while (j < pattern.length && pattern[j] !== ']') {
66
+ j++
67
+ }
68
+ if (j < pattern.length) {
69
+ tokens.push(pattern.slice(i, j + 1))
70
+ i = j + 1
71
+ } else {
72
+ i++
73
+ }
74
+ } else {
75
+ i++
76
+ }
77
+ }
78
+
79
+ return tokens
80
+ }
81
+
39
82
  /**
40
83
  * Gets the names that depend on a definition
41
84
  * @param {Object} program - Program object
package/src/dsl/lexer.js CHANGED
@@ -20,11 +20,17 @@ export const TokenType = {
20
20
  // Import-related tokens
21
21
  IMPORT: 'IMPORT', // import keyword
22
22
  FROM: 'FROM', // from keyword
23
- STRING: 'STRING', // "path/to/file.selfies"
23
+ STRING: 'STRING', // "path/to/file.selfies" or 'pattern'
24
24
  STAR: 'STAR', // * (wildcard import)
25
25
  COMMA: 'COMMA', // , (separator in selective imports)
26
26
  LBRACKET: 'LBRACKET', // [ (for selective import list)
27
27
  RBRACKET: 'RBRACKET', // ] (for selective import list)
28
+
29
+ // Repeat macro tokens
30
+ REPEAT: 'REPEAT', // repeat keyword
31
+ LPAREN: 'LPAREN', // (
32
+ RPAREN: 'RPAREN', // )
33
+ NUMBER: 'NUMBER', // numeric literal
28
34
  }
29
35
 
30
36
  /**
@@ -135,14 +141,43 @@ export function lex(source) {
135
141
  continue
136
142
  }
137
143
 
138
- // String literal (for import paths)
139
- if (char === '"') {
144
+ // Left parenthesis
145
+ if (char === '(') {
146
+ tokens.push({
147
+ type: TokenType.LPAREN,
148
+ value: '(',
149
+ line,
150
+ column,
151
+ range: [i, i + 1]
152
+ })
153
+ i++
154
+ column++
155
+ continue
156
+ }
157
+
158
+ // Right parenthesis
159
+ if (char === ')') {
160
+ tokens.push({
161
+ type: TokenType.RPAREN,
162
+ value: ')',
163
+ line,
164
+ column,
165
+ range: [i, i + 1]
166
+ })
167
+ i++
168
+ column++
169
+ continue
170
+ }
171
+
172
+ // String literal (for import paths and repeat patterns)
173
+ if (char === '"' || char === "'") {
140
174
  const stringStart = i
141
- let stringValue = '"'
175
+ const quote = char
176
+ let stringValue = quote
142
177
  i++
143
178
  column++
144
179
 
145
- while (i < source.length && source[i] !== '"' && source[i] !== '\n') {
180
+ while (i < source.length && source[i] !== quote && source[i] !== '\n') {
146
181
  stringValue += source[i]
147
182
  i++
148
183
  column++
@@ -152,7 +187,7 @@ export function lex(source) {
152
187
  throw new Error(`Unclosed string at line ${line}, column ${startColumn}`)
153
188
  }
154
189
 
155
- stringValue += '"'
190
+ stringValue += quote
156
191
  i++
157
192
  column++
158
193
 
@@ -166,7 +201,7 @@ export function lex(source) {
166
201
  continue
167
202
  }
168
203
 
169
- // Keywords and identifiers (import, from)
204
+ // Keywords and identifiers (import, from, repeat)
170
205
  if (isAlpha(char)) {
171
206
  const wordStart = i
172
207
  let wordValue = ''
@@ -182,6 +217,8 @@ export function lex(source) {
182
217
  type = TokenType.IMPORT
183
218
  } else if (wordValue === 'from') {
184
219
  type = TokenType.FROM
220
+ } else if (wordValue === 'repeat') {
221
+ type = TokenType.REPEAT
185
222
  }
186
223
 
187
224
  tokens.push({
@@ -194,6 +231,34 @@ export function lex(source) {
194
231
  continue
195
232
  }
196
233
 
234
+ // Numbers (including negative)
235
+ if (char >= '0' && char <= '9' || (char === '-' && i + 1 < source.length && source[i + 1] >= '0' && source[i + 1] <= '9')) {
236
+ const numberStart = i
237
+ let numberValue = ''
238
+
239
+ // Handle negative sign
240
+ if (char === '-') {
241
+ numberValue += char
242
+ i++
243
+ column++
244
+ }
245
+
246
+ while (i < source.length && source[i] >= '0' && source[i] <= '9') {
247
+ numberValue += source[i]
248
+ i++
249
+ column++
250
+ }
251
+
252
+ tokens.push({
253
+ type: TokenType.NUMBER,
254
+ value: numberValue,
255
+ line,
256
+ column: startColumn,
257
+ range: [numberStart, i]
258
+ })
259
+ continue
260
+ }
261
+
197
262
  // Bracketed token (could be NAME or SELFIES_TOKEN)
198
263
  if (char === '[') {
199
264
  const tokenStart = i
package/src/dsl/parser.js CHANGED
@@ -131,7 +131,7 @@ function parseDefinition(tokens, startIndex) {
131
131
  }
132
132
  i++
133
133
 
134
- // 3. Collect SELFIES_TOKENs until NEWLINE or EOF
134
+ // 3. Collect SELFIES_TOKENs and repeat calls until NEWLINE or EOF
135
135
  const definitionTokens = []
136
136
  const tokenStart = nameToken.range[0]
137
137
  let tokenEnd = tokens[i - 1].range[1]
@@ -144,6 +144,17 @@ function parseDefinition(tokens, startIndex) {
144
144
  definitionTokens.push(tokens[i].value)
145
145
  tokenEnd = tokens[i].range[1]
146
146
  i++
147
+ } else if (tokens[i].type === TokenType.REPEAT) {
148
+ // Parse repeat call: repeat(pattern, count)
149
+ const repeatResult = parseRepeatCall(tokens, i)
150
+ if (repeatResult.error) {
151
+ errors.push(repeatResult.error)
152
+ i = repeatResult.nextIndex
153
+ } else {
154
+ definitionTokens.push(repeatResult.repeatToken)
155
+ tokenEnd = repeatResult.range[1]
156
+ i = repeatResult.nextIndex
157
+ }
147
158
  } else {
148
159
  errors.push(createDiagnostic(
149
160
  `Unexpected token in definition body: ${tokens[i].type}`,
@@ -183,6 +194,133 @@ function parseDefinition(tokens, startIndex) {
183
194
  return { definition, errors, nextIndex: i }
184
195
  }
185
196
 
197
+ /**
198
+ * Parses a repeat call: repeat(pattern, count)
199
+ * @param {Object[]} tokens - Token array
200
+ * @param {number} startIndex - Index of REPEAT token
201
+ * @returns {Object} Result with repeatToken or error
202
+ */
203
+ function parseRepeatCall(tokens, startIndex) {
204
+ let i = startIndex
205
+ const repeatToken = tokens[i]
206
+
207
+ // Expect REPEAT
208
+ if (tokens[i].type !== TokenType.REPEAT) {
209
+ return {
210
+ error: createDiagnostic('Expected repeat keyword', 'error', tokens[i]),
211
+ nextIndex: i + 1
212
+ }
213
+ }
214
+ i++
215
+
216
+ // Expect LPAREN
217
+ if (i >= tokens.length || tokens[i].type !== TokenType.LPAREN) {
218
+ return {
219
+ error: createDiagnostic('Expected \'(\' after repeat', 'error', tokens[i] || repeatToken),
220
+ nextIndex: i
221
+ }
222
+ }
223
+ i++
224
+
225
+ // Collect SELFIES_TOKENs as the pattern until COMMA (the loop also stops at RPAREN, NEWLINE, or EOF)
226
+ const patternTokens = []
227
+ const patternStart = i
228
+
229
+ while (i < tokens.length &&
230
+ tokens[i].type !== TokenType.COMMA &&
231
+ tokens[i].type !== TokenType.RPAREN &&
232
+ tokens[i].type !== TokenType.NEWLINE &&
233
+ tokens[i].type !== TokenType.EOF) {
234
+ if (tokens[i].type === TokenType.SELFIES_TOKEN) {
235
+ patternTokens.push(tokens[i].value)
236
+ i++
237
+ } else {
238
+ const skipToEnd = skipToRParenOrEOL(tokens, i)
239
+ return {
240
+ error: createDiagnostic('Expected SELFIES tokens or name references in pattern', 'error', tokens[i]),
241
+ nextIndex: skipToEnd
242
+ }
243
+ }
244
+ }
245
+
246
+ if (patternTokens.length === 0) {
247
+ const skipToEnd = skipToRParenOrEOL(tokens, i)
248
+ return {
249
+ error: createDiagnostic('Pattern cannot be empty', 'error', tokens[patternStart] || repeatToken),
250
+ nextIndex: skipToEnd
251
+ }
252
+ }
253
+
254
+ // Join pattern tokens into a single string
255
+ const pattern = patternTokens.join('')
256
+
257
+ // Expect COMMA
258
+ if (i >= tokens.length || tokens[i].type !== TokenType.COMMA) {
259
+ const skipToEnd = skipToRParenOrEOL(tokens, i)
260
+ return {
261
+ error: createDiagnostic('Expected \',\' after pattern', 'error', tokens[i] || patternToken),
262
+ nextIndex: skipToEnd
263
+ }
264
+ }
265
+ i++
266
+
267
+ // Expect NUMBER (count)
268
+ if (i >= tokens.length || tokens[i].type !== TokenType.NUMBER) {
269
+ const skipToEnd = skipToRParenOrEOL(tokens, i)
270
+ return {
271
+ error: createDiagnostic('Expected number as second argument', 'error', tokens[i] || patternToken),
272
+ nextIndex: skipToEnd
273
+ }
274
+ }
275
+ const countToken = tokens[i]
276
+ const count = parseInt(countToken.value, 10)
277
+ i++
278
+
279
+ // Expect RPAREN
280
+ if (i >= tokens.length || tokens[i].type !== TokenType.RPAREN) {
281
+ const skipToEnd = skipToRParenOrEOL(tokens, i)
282
+ return {
283
+ error: createDiagnostic('Expected \')\' after count', 'error', tokens[i] || countToken),
284
+ nextIndex: skipToEnd
285
+ }
286
+ }
287
+ const rparenToken = tokens[i]
288
+ i++
289
+
290
+ // Create a special repeat token
291
+ return {
292
+ repeatToken: {
293
+ type: 'REPEAT_CALL',
294
+ pattern,
295
+ count,
296
+ range: [repeatToken.range[0], rparenToken.range[1]]
297
+ },
298
+ range: [repeatToken.range[0], rparenToken.range[1]],
299
+ nextIndex: i
300
+ }
301
+ }
302
+
303
+ /**
304
+ * Skips tokens until we find RPAREN or reach end of line
305
+ * @param {Object[]} tokens - Token array
306
+ * @param {number} startIndex - Index to start skipping from
307
+ * @returns {number} Index after RPAREN or at NEWLINE/EOF
308
+ */
309
+ function skipToRParenOrEOL(tokens, startIndex) {
310
+ let i = startIndex
311
+ while (i < tokens.length &&
312
+ tokens[i].type !== TokenType.RPAREN &&
313
+ tokens[i].type !== TokenType.NEWLINE &&
314
+ tokens[i].type !== TokenType.EOF) {
315
+ i++
316
+ }
317
+ // If we found RPAREN, move past it
318
+ if (i < tokens.length && tokens[i].type === TokenType.RPAREN) {
319
+ i++
320
+ }
321
+ return i
322
+ }
323
+
186
324
  /**
187
325
  * Creates a diagnostic object
188
326
  * @param {string} message - Error/warning message
@@ -102,11 +102,21 @@ function resolveRecursive(program, name, visiting = new Set()) {
102
102
 
103
103
  // Get definition
104
104
  const definition = program.definitions.get(name)
105
+
106
+ // Check if definition has tokens (parse errors can result in empty definitions)
107
+ if (!definition.tokens || definition.tokens.length === 0) {
108
+ throw new ResolveError(`Definition '${name}' has no tokens (possibly due to parse errors)`, name)
109
+ }
110
+
105
111
  const resolvedTokens = []
106
112
 
107
113
  // Resolve each token
108
114
  for (const token of definition.tokens) {
109
- if (isReference(token, program)) {
115
+ if (typeof token === 'object' && token.type === 'REPEAT_CALL') {
116
+ // It's a repeat call - expand it
117
+ const expandedTokens = expandRepeat(token, program, visiting)
118
+ resolvedTokens.push(...expandedTokens)
119
+ } else if (isReference(token, program)) {
110
120
  // It's a reference to another definition - resolve it recursively
111
121
  const refName = token.slice(1, -1) // Remove brackets
112
122
  const refResolved = resolveRecursive(program, refName, visiting)
@@ -123,6 +133,82 @@ function resolveRecursive(program, name, visiting = new Set()) {
123
133
  return resolvedTokens
124
134
  }
125
135
 
136
+ /**
137
+ * Expands a repeat call by repeating the pattern
138
+ * @param {Object} repeatToken - Repeat token object with pattern and count
139
+ * @param {Object} program - Program object
140
+ * @param {Set<string>} visiting - Set of currently visiting definitions
141
+ * @returns {string[]} Expanded tokens
142
+ */
143
+ function expandRepeat(repeatToken, program, visiting) {
144
+ const { pattern, count } = repeatToken
145
+
146
+ // Validate count
147
+ if (count < 0) {
148
+ throw new ResolveError(`Repeat count must be non-negative, got ${count}`)
149
+ }
150
+
151
+ if (!Number.isInteger(count)) {
152
+ throw new ResolveError(`Repeat count must be an integer, got ${count}`)
153
+ }
154
+
155
+ // Tokenize the pattern string to extract individual SELFIES tokens
156
+ const patternTokens = tokenizePattern(pattern)
157
+
158
+ // Resolve each token in the pattern (they might be references)
159
+ const resolvedPatternTokens = []
160
+ for (const token of patternTokens) {
161
+ if (isReference(token, program)) {
162
+ // Recursively resolve the reference
163
+ const refName = token.slice(1, -1)
164
+ const refResolved = resolveRecursive(program, refName, visiting)
165
+ resolvedPatternTokens.push(...refResolved)
166
+ } else {
167
+ resolvedPatternTokens.push(token)
168
+ }
169
+ }
170
+
171
+ // Repeat the resolved pattern
172
+ const result = []
173
+ for (let i = 0; i < count; i++) {
174
+ result.push(...resolvedPatternTokens)
175
+ }
176
+
177
+ return result
178
+ }
179
+
180
+ /**
181
+ * Tokenizes a pattern string into bracketed SELFIES tokens, skipping spaces and tabs; throws ResolveError on an unclosed '[' or any other character
182
+ * @param {string} pattern - Pattern string like '[C][=C]'
183
+ * @returns {string[]} Array of tokens
184
+ */
185
+ function tokenizePattern(pattern) {
186
+ const tokens = []
187
+ let i = 0
188
+
189
+ while (i < pattern.length) {
190
+ if (pattern[i] === '[') {
191
+ // Find the closing bracket
192
+ let j = i + 1
193
+ while (j < pattern.length && pattern[j] !== ']') {
194
+ j++
195
+ }
196
+ if (j >= pattern.length) {
197
+ throw new ResolveError(`Unclosed bracket in pattern: ${pattern}`)
198
+ }
199
+ tokens.push(pattern.slice(i, j + 1))
200
+ i = j + 1
201
+ } else if (pattern[i] === ' ' || pattern[i] === '\t') {
202
+ // Skip whitespace
203
+ i++
204
+ } else {
205
+ throw new ResolveError(`Invalid character in pattern: ${pattern[i]}`)
206
+ }
207
+ }
208
+
209
+ return tokens
210
+ }
211
+
126
212
  /**
127
213
  * Checks if a token is a reference to another definition
128
214
  * @param {string} token - Token to check
@@ -97,3 +97,87 @@ describe('resolveAll', () => {
97
97
  expect(resolved.size).toBe(0)
98
98
  })
99
99
  })
100
+
101
+ describe('repeat macro', () => {
102
+ test('repeats a simple token sequence', () => {
103
+ const program = parse('[triple_carbon] = repeat([C], 3)')
104
+ expect(resolve(program, 'triple_carbon')).toBe('[C][C][C]')
105
+ })
106
+
107
+ test('repeats a complex token sequence', () => {
108
+ const program = parse('[benzene] = repeat([C][=C], 3)[Ring1][=Branch1]')
109
+ expect(resolve(program, 'benzene')).toBe('[C][=C][C][=C][C][=C][Ring1][=Branch1]')
110
+ })
111
+
112
+ test('repeats with count of 1', () => {
113
+ const program = parse('[single] = repeat([C][O], 1)')
114
+ expect(resolve(program, 'single')).toBe('[C][O]')
115
+ })
116
+
117
+ test('repeats with count of 0 produces empty sequence', () => {
118
+ const program = parse('[empty] = [C]repeat([O], 0)[C]')
119
+ expect(resolve(program, 'empty')).toBe('[C][C]')
120
+ })
121
+
122
+ test('repeat with reference to other definition', () => {
123
+ const source = '[unit] = [C][=C]\n[triple] = repeat([unit], 3)'
124
+ const program = parse(source)
125
+ expect(resolve(program, 'triple')).toBe('[C][=C][C][=C][C][=C]')
126
+ })
127
+
128
+ test('multiple repeat calls in one definition', () => {
129
+ const program = parse('[chain] = repeat([C], 2)repeat([O], 2)')
130
+ expect(resolve(program, 'chain')).toBe('[C][C][O][O]')
131
+ })
132
+
133
+ test('repeat combined with regular tokens', () => {
134
+ const program = parse('[molecule] = [N]repeat([C], 3)[O]')
135
+ expect(resolve(program, 'molecule')).toBe('[N][C][C][C][O]')
136
+ })
137
+
138
+ test('repeat with nested brackets in pattern', () => {
139
+ const program = parse('[branched] = repeat([C][Branch1][C][O], 2)')
140
+ expect(resolve(program, 'branched')).toBe('[C][Branch1][C][O][C][Branch1][C][O]')
141
+ })
142
+
143
+ test('throws error on invalid repeat count', () => {
144
+ const program = parse('[bad] = repeat([C], -1)')
145
+ expect(() => resolve(program, 'bad')).toThrow(/count must be/)
146
+ })
147
+
148
+ test('throws error on non-numeric count', () => {
149
+ const program = parse('[bad] = repeat([C], abc)')
150
+ expect(() => resolve(program, 'bad')).toThrow()
151
+ })
152
+
153
+ test('throws error on missing arguments', () => {
154
+ const program = parse('[bad] = repeat([C])')
155
+ expect(() => resolve(program, 'bad')).toThrow()
156
+ })
157
+
158
+ test('throws error on empty pattern', () => {
159
+ const program = parse('[bad] = repeat(, 3)')
160
+ // Parse error results in empty definition
161
+ expect(() => resolve(program, 'bad')).toThrow(/no tokens/)
162
+ })
163
+
164
+ test('simple polymer-like chain', () => {
165
+ const source = '[ch2] = [C]\n[polymer_chain] = repeat([ch2], 5)'
166
+ const program = parse(source)
167
+ expect(resolve(program, 'polymer_chain')).toBe('[C][C][C][C][C]')
168
+ })
169
+
170
+ test('polymer chain with decode', () => {
171
+ const source = '[ch2] = [C]\n[polymer_chain] = repeat([ch2], 5)'
172
+ const program = parse(source)
173
+ expect(resolve(program, 'polymer_chain', { decode: true })).toBe('CCCCC')
174
+ })
175
+
176
+ test('vinyl chloride monomer units', () => {
177
+ // Each monomer as a branch structure for proper chemistry
178
+ const source = '[monomer] = [C][Branch1][C][Cl][C]\n[polymer] = repeat([monomer], 3)'
179
+ const program = parse(source)
180
+ // This creates a branched structure: C(Cl)CC(Cl)CC(Cl)C
181
+ expect(resolve(program, 'polymer')).toBe('[C][Branch1][C][Cl][C][C][Branch1][C][Cl][C][C][Branch1][C][Cl][C]')
182
+ })
183
+ })