selfies-js 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "selfies-js",
3
- "version": "0.3.2",
3
+ "version": "0.3.3",
4
4
  "description": "Pure JavaScript SELFIES encoder/decoder with DSL for molecular composition",
5
5
  "type": "module",
6
6
  "main": "src/index.js",
package/src/dsl/lexer.js CHANGED
@@ -20,11 +20,17 @@ export const TokenType = {
20
20
  // Import-related tokens
21
21
  IMPORT: 'IMPORT', // import keyword
22
22
  FROM: 'FROM', // from keyword
23
- STRING: 'STRING', // "path/to/file.selfies"
23
+ STRING: 'STRING', // "path/to/file.selfies" or 'pattern'
24
24
  STAR: 'STAR', // * (wildcard import)
25
25
  COMMA: 'COMMA', // , (separator in selective imports)
26
26
  LBRACKET: 'LBRACKET', // [ (for selective import list)
27
27
  RBRACKET: 'RBRACKET', // ] (for selective import list)
28
+
29
+ // Repeat macro tokens
30
+ REPEAT: 'REPEAT', // repeat keyword
31
+ LPAREN: 'LPAREN', // (
32
+ RPAREN: 'RPAREN', // )
33
+ NUMBER: 'NUMBER', // numeric literal
28
34
  }
29
35
 
30
36
  /**
@@ -135,14 +141,43 @@ export function lex(source) {
135
141
  continue
136
142
  }
137
143
 
138
- // String literal (for import paths)
139
- if (char === '"') {
144
+ // Left parenthesis
145
+ if (char === '(') {
146
+ tokens.push({
147
+ type: TokenType.LPAREN,
148
+ value: '(',
149
+ line,
150
+ column,
151
+ range: [i, i + 1]
152
+ })
153
+ i++
154
+ column++
155
+ continue
156
+ }
157
+
158
+ // Right parenthesis
159
+ if (char === ')') {
160
+ tokens.push({
161
+ type: TokenType.RPAREN,
162
+ value: ')',
163
+ line,
164
+ column,
165
+ range: [i, i + 1]
166
+ })
167
+ i++
168
+ column++
169
+ continue
170
+ }
171
+
172
+ // String literal (for import paths and repeat patterns)
173
+ if (char === '"' || char === "'") {
140
174
  const stringStart = i
141
- let stringValue = '"'
175
+ const quote = char
176
+ let stringValue = quote
142
177
  i++
143
178
  column++
144
179
 
145
- while (i < source.length && source[i] !== '"' && source[i] !== '\n') {
180
+ while (i < source.length && source[i] !== quote && source[i] !== '\n') {
146
181
  stringValue += source[i]
147
182
  i++
148
183
  column++
@@ -152,7 +187,7 @@ export function lex(source) {
152
187
  throw new Error(`Unclosed string at line ${line}, column ${startColumn}`)
153
188
  }
154
189
 
155
- stringValue += '"'
190
+ stringValue += quote
156
191
  i++
157
192
  column++
158
193
 
@@ -166,7 +201,7 @@ export function lex(source) {
166
201
  continue
167
202
  }
168
203
 
169
- // Keywords and identifiers (import, from)
204
+ // Keywords and identifiers (import, from, repeat)
170
205
  if (isAlpha(char)) {
171
206
  const wordStart = i
172
207
  let wordValue = ''
@@ -182,6 +217,8 @@ export function lex(source) {
182
217
  type = TokenType.IMPORT
183
218
  } else if (wordValue === 'from') {
184
219
  type = TokenType.FROM
220
+ } else if (wordValue === 'repeat') {
221
+ type = TokenType.REPEAT
185
222
  }
186
223
 
187
224
  tokens.push({
@@ -194,6 +231,34 @@ export function lex(source) {
194
231
  continue
195
232
  }
196
233
 
234
+ // Numbers (including negative)
235
+ if (char >= '0' && char <= '9' || (char === '-' && i + 1 < source.length && source[i + 1] >= '0' && source[i + 1] <= '9')) {
236
+ const numberStart = i
237
+ let numberValue = ''
238
+
239
+ // Handle negative sign
240
+ if (char === '-') {
241
+ numberValue += char
242
+ i++
243
+ column++
244
+ }
245
+
246
+ while (i < source.length && source[i] >= '0' && source[i] <= '9') {
247
+ numberValue += source[i]
248
+ i++
249
+ column++
250
+ }
251
+
252
+ tokens.push({
253
+ type: TokenType.NUMBER,
254
+ value: numberValue,
255
+ line,
256
+ column: startColumn,
257
+ range: [numberStart, i]
258
+ })
259
+ continue
260
+ }
261
+
197
262
  // Bracketed token (could be NAME or SELFIES_TOKEN)
198
263
  if (char === '[') {
199
264
  const tokenStart = i
package/src/dsl/parser.js CHANGED
@@ -131,7 +131,7 @@ function parseDefinition(tokens, startIndex) {
131
131
  }
132
132
  i++
133
133
 
134
- // 3. Collect SELFIES_TOKENs until NEWLINE or EOF
134
+ // 3. Collect SELFIES_TOKENs and repeat calls until NEWLINE or EOF
135
135
  const definitionTokens = []
136
136
  const tokenStart = nameToken.range[0]
137
137
  let tokenEnd = tokens[i - 1].range[1]
@@ -144,6 +144,17 @@ function parseDefinition(tokens, startIndex) {
144
144
  definitionTokens.push(tokens[i].value)
145
145
  tokenEnd = tokens[i].range[1]
146
146
  i++
147
+ } else if (tokens[i].type === TokenType.REPEAT) {
148
+ // Parse repeat call: repeat(pattern, count)
149
+ const repeatResult = parseRepeatCall(tokens, i)
150
+ if (repeatResult.error) {
151
+ errors.push(repeatResult.error)
152
+ i = repeatResult.nextIndex
153
+ } else {
154
+ definitionTokens.push(repeatResult.repeatToken)
155
+ tokenEnd = repeatResult.range[1]
156
+ i = repeatResult.nextIndex
157
+ }
147
158
  } else {
148
159
  errors.push(createDiagnostic(
149
160
  `Unexpected token in definition body: ${tokens[i].type}`,
@@ -183,6 +194,114 @@ function parseDefinition(tokens, startIndex) {
183
194
  return { definition, errors, nextIndex: i }
184
195
  }
185
196
 
197
/**
 * Parses one `repeat(pattern, count)` call out of the token stream.
 *
 * On success the result holds `repeatToken` (a `REPEAT_CALL` object with the
 * unquoted pattern, the count parsed as a base-10 integer, and the source
 * range from the `repeat` keyword through the closing parenthesis), the same
 * `range`, and `nextIndex` pointing just past the closing parenthesis.
 * On failure the result holds `error` (a diagnostic) and the `nextIndex` at
 * which the caller should resume.
 *
 * @param {Object[]} tokens - Token array
 * @param {number} startIndex - Index of the REPEAT token
 * @returns {Object} `{ repeatToken, range, nextIndex }` or `{ error, nextIndex }`
 */
function parseRepeatCall(tokens, startIndex) {
  const keyword = tokens[startIndex]

  if (keyword.type !== TokenType.REPEAT) {
    return {
      error: createDiagnostic('Expected repeat keyword', 'error', keyword),
      nextIndex: startIndex + 1
    }
  }

  let cursor = startIndex + 1

  // True when the token under the cursor exists and has the given type.
  const at = (type) => cursor < tokens.length && tokens[cursor].type === type

  // Builds the failure result; the diagnostic is attached to the offending
  // token, or to `fallback` when the stream ended early.
  const fail = (message, fallback, nextIndex) => ({
    error: createDiagnostic(message, 'error', tokens[cursor] || fallback),
    nextIndex
  })

  // Opening parenthesis: nothing has been consumed yet, so resume right here.
  if (!at(TokenType.LPAREN)) {
    return fail("Expected '(' after repeat", keyword, cursor)
  }
  cursor++

  // First argument: the quoted pattern.
  if (!at(TokenType.STRING)) {
    return fail(
      'Expected string pattern as first argument',
      keyword,
      skipToRParenOrEOL(tokens, cursor)
    )
  }
  const patternToken = tokens[cursor]
  const pattern = patternToken.value.slice(1, -1) // strip the surrounding quotes
  cursor++

  // Argument separator.
  if (!at(TokenType.COMMA)) {
    return fail("Expected ',' after pattern", patternToken, skipToRParenOrEOL(tokens, cursor))
  }
  cursor++

  // Second argument: the repeat count.
  if (!at(TokenType.NUMBER)) {
    return fail(
      'Expected number as second argument',
      patternToken,
      skipToRParenOrEOL(tokens, cursor)
    )
  }
  const countToken = tokens[cursor]
  const count = parseInt(countToken.value, 10)
  cursor++

  // Closing parenthesis.
  if (!at(TokenType.RPAREN)) {
    return fail("Expected ')' after count", countToken, skipToRParenOrEOL(tokens, cursor))
  }
  const closeToken = tokens[cursor]
  cursor++

  const from = keyword.range[0]
  const to = closeToken.range[1]

  // The two range arrays are deliberately separate objects.
  return {
    repeatToken: {
      type: 'REPEAT_CALL',
      pattern,
      count,
      range: [from, to]
    },
    range: [from, to],
    nextIndex: cursor
  }
}
283
+
284
/**
 * Advances through the token stream until a closing parenthesis or the end
 * of the current line is reached.
 *
 * @param {Object[]} tokens - Token array
 * @param {number} startIndex - Index to start skipping from
 * @returns {number} Index just past the RPAREN when one is found first;
 *   otherwise the index of the NEWLINE/EOF token, or the index where
 *   scanning stopped if the array ran out
 */
function skipToRParenOrEOL(tokens, startIndex) {
  let pos = startIndex

  for (; pos < tokens.length; pos++) {
    const { type } = tokens[pos]

    if (type === TokenType.RPAREN) {
      // Consume the parenthesis itself.
      return pos + 1
    }

    if (type === TokenType.NEWLINE || type === TokenType.EOF) {
      // Leave the line terminator for the caller.
      return pos
    }
  }

  return pos
}
304
+
186
305
  /**
187
306
  * Creates a diagnostic object
188
307
  * @param {string} message - Error/warning message
@@ -102,11 +102,21 @@ function resolveRecursive(program, name, visiting = new Set()) {
102
102
 
103
103
  // Get definition
104
104
  const definition = program.definitions.get(name)
105
+
106
+ // Check if definition has tokens (parse errors can result in empty definitions)
107
+ if (!definition.tokens || definition.tokens.length === 0) {
108
+ throw new ResolveError(`Definition '${name}' has no tokens (possibly due to parse errors)`, name)
109
+ }
110
+
105
111
  const resolvedTokens = []
106
112
 
107
113
  // Resolve each token
108
114
  for (const token of definition.tokens) {
109
- if (isReference(token, program)) {
115
+ if (typeof token === 'object' && token.type === 'REPEAT_CALL') {
116
+ // It's a repeat call - expand it
117
+ const expandedTokens = expandRepeat(token, program, visiting)
118
+ resolvedTokens.push(...expandedTokens)
119
+ } else if (isReference(token, program)) {
110
120
  // It's a reference to another definition - resolve it recursively
111
121
  const refName = token.slice(1, -1) // Remove brackets
112
122
  const refResolved = resolveRecursive(program, refName, visiting)
@@ -123,6 +133,82 @@ function resolveRecursive(program, name, visiting = new Set()) {
123
133
  return resolvedTokens
124
134
  }
125
135
 
136
/**
 * Expands a repeat call by resolving its pattern once and emitting it
 * `count` times.
 *
 * The pattern string is split into bracketed tokens; every token that names
 * another definition is replaced by that definition's resolved tokens, and
 * the resulting sequence is then concatenated `count` times.
 *
 * @param {Object} repeatToken - Repeat token object with `pattern` and `count`
 * @param {Object} program - Program object
 * @param {Set<string>} visiting - Set of currently visiting definitions
 * @returns {string[]} Expanded tokens (empty when `count` is 0)
 * @throws {ResolveError} If `count` is negative or not an integer; errors
 *   from pattern tokenization and reference resolution propagate unchanged
 */
function expandRepeat(repeatToken, program, visiting) {
  const { pattern, count } = repeatToken

  // Validate count
  if (count < 0) {
    throw new ResolveError(`Repeat count must be non-negative, got ${count}`)
  }

  if (!Number.isInteger(count)) {
    throw new ResolveError(`Repeat count must be an integer, got ${count}`)
  }

  // Tokenize the pattern string to extract individual SELFIES tokens
  const patternTokens = tokenizePattern(pattern)

  // Resolve each token in the pattern (they might be references).
  // Elements are appended one at a time: `push(...array)` passes every
  // element as a call argument and throws a RangeError once a resolved
  // reference grows past the engine's argument limit (e.g. nested repeats).
  const resolvedPatternTokens = []
  for (const token of patternTokens) {
    if (isReference(token, program)) {
      const refName = token.slice(1, -1) // Remove brackets
      for (const resolved of resolveRecursive(program, refName, visiting)) {
        resolvedPatternTokens.push(resolved)
      }
    } else {
      resolvedPatternTokens.push(token)
    }
  }

  // Repeat the resolved pattern
  const result = []
  for (let i = 0; i < count; i++) {
    for (const token of resolvedPatternTokens) {
      result.push(token)
    }
  }

  return result
}
179
+
180
/**
 * Tokenizes a pattern string into bracketed SELFIES tokens.
 *
 * Spaces and tabs between tokens are ignored. Each token runs from a `[` to
 * the next `]`; a second `[` appearing before that `]` means the first
 * bracket was never closed and is rejected instead of being absorbed into
 * the token text.
 *
 * @param {string} pattern - Pattern string like '[C][=C]'
 * @returns {string[]} Array of tokens, each including its brackets
 * @throws {ResolveError} On an unclosed bracket or on any character outside
 *   a bracketed token other than a space or tab
 */
function tokenizePattern(pattern) {
  const tokens = []
  let i = 0

  while (i < pattern.length) {
    const char = pattern[i]

    if (char === '[') {
      const close = pattern.indexOf(']', i + 1)
      const nestedOpen = pattern.indexOf('[', i + 1)

      // No closing bracket at all, or another token starts before this one ends.
      if (close === -1 || (nestedOpen !== -1 && nestedOpen < close)) {
        throw new ResolveError(`Unclosed bracket in pattern: ${pattern}`)
      }

      tokens.push(pattern.slice(i, close + 1))
      i = close + 1
    } else if (char === ' ' || char === '\t') {
      // Skip whitespace
      i++
    } else {
      throw new ResolveError(`Invalid character in pattern: ${char}`)
    }
  }

  return tokens
}
211
+
126
212
  /**
127
213
  * Checks if a token is a reference to another definition
128
214
  * @param {string} token - Token to check
@@ -97,3 +97,86 @@ describe('resolveAll', () => {
97
97
  expect(resolved.size).toBe(0)
98
98
  })
99
99
  })
100
+
101
describe('repeat macro', () => {
  // Registers one test per row: parse `source`, resolve `name`, compare to `expected`.
  const expectResolved = (rows) => {
    for (const [title, source, name, expected] of rows) {
      test(title, () => {
        const program = parse(source)
        expect(resolve(program, name)).toBe(expected)
      })
    }
  }

  expectResolved([
    [
      'repeats a simple token sequence',
      "[triple_carbon] = repeat('[C]', 3)",
      'triple_carbon',
      '[C][C][C]'
    ],
    [
      'repeats a complex token sequence',
      "[benzene] = repeat('[C][=C]', 3)[Ring1][=Branch1]",
      'benzene',
      '[C][=C][C][=C][C][=C][Ring1][=Branch1]'
    ],
    [
      'repeats with count of 1',
      "[single] = repeat('[C][O]', 1)",
      'single',
      '[C][O]'
    ],
    [
      'repeats with count of 0 produces empty sequence',
      "[empty] = [C]repeat('[O]', 0)[C]",
      'empty',
      '[C][C]'
    ],
    [
      'repeat with reference to other definition',
      "[unit] = [C][=C]\n[triple] = repeat('[unit]', 3)",
      'triple',
      '[C][=C][C][=C][C][=C]'
    ],
    [
      'multiple repeat calls in one definition',
      "[chain] = repeat('[C]', 2)repeat('[O]', 2)",
      'chain',
      '[C][C][O][O]'
    ],
    [
      'repeat combined with regular tokens',
      "[molecule] = [N]repeat('[C]', 3)[O]",
      'molecule',
      '[N][C][C][C][O]'
    ],
    [
      'repeat with nested brackets in pattern',
      "[branched] = repeat('[C][Branch1][C][O]', 2)",
      'branched',
      '[C][Branch1][C][O][C][Branch1][C][O]'
    ]
  ])

  // Failure cases: parsing happens outside the throwing closure, so only
  // `resolve` is expected to throw. A row's third entry, when present, is the
  // message pattern the error must match.
  const failureRows = [
    ['throws error on invalid repeat count', "[bad] = repeat('[C]', -1)", /count must be/],
    ['throws error on non-numeric count', "[bad] = repeat('[C]', abc)"],
    ['throws error on missing arguments', "[bad] = repeat('[C]')"],
    ['throws error on malformed repeat syntax', '[bad] = repeat([C], 3)']
  ]

  for (const [title, source, messagePattern] of failureRows) {
    test(title, () => {
      const program = parse(source)
      const attempt = () => resolve(program, 'bad')
      if (messagePattern === undefined) {
        expect(attempt).toThrow()
      } else {
        expect(attempt).toThrow(messagePattern)
      }
    })
  }

  expectResolved([
    [
      'simple polymer-like chain',
      "[ch2] = [C]\n[polymer_chain] = repeat('[ch2]', 5)",
      'polymer_chain',
      '[C][C][C][C][C]'
    ]
  ])

  test('polymer chain with decode', () => {
    const program = parse("[ch2] = [C]\n[polymer_chain] = repeat('[ch2]', 5)")
    const decoded = resolve(program, 'polymer_chain', { decode: true })
    expect(decoded).toBe('CCCCC')
  })

  // Each monomer is written as a branch structure; three units are expected
  // to give the branched chain C(Cl)CC(Cl)CC(Cl)C.
  expectResolved([
    [
      'vinyl chloride monomer units',
      "[monomer] = [C][Branch1][C][Cl][C]\n[polymer] = repeat('[monomer]', 3)",
      'polymer',
      '[C][Branch1][C][Cl][C][C][Branch1][C][Cl][C][C][Branch1][C][Cl][C]'
    ]
  ])
})