selfies-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +274 -0
  3. package/package.json +65 -0
  4. package/src/alphabet.js +150 -0
  5. package/src/alphabet.test.js +82 -0
  6. package/src/chemistryValidator.js +236 -0
  7. package/src/cli.js +206 -0
  8. package/src/constraints.js +186 -0
  9. package/src/constraints.test.js +126 -0
  10. package/src/decoder.js +636 -0
  11. package/src/decoder.test.js +560 -0
  12. package/src/dsl/analyzer.js +170 -0
  13. package/src/dsl/analyzer.test.js +139 -0
  14. package/src/dsl/dsl.test.js +146 -0
  15. package/src/dsl/importer.js +238 -0
  16. package/src/dsl/index.js +32 -0
  17. package/src/dsl/lexer.js +264 -0
  18. package/src/dsl/lexer.test.js +115 -0
  19. package/src/dsl/parser.js +201 -0
  20. package/src/dsl/parser.test.js +148 -0
  21. package/src/dsl/resolver.js +136 -0
  22. package/src/dsl/resolver.test.js +99 -0
  23. package/src/dsl/symbolTable.js +56 -0
  24. package/src/dsl/symbolTable.test.js +68 -0
  25. package/src/dsl/valenceValidator.js +147 -0
  26. package/src/encoder.js +467 -0
  27. package/src/encoder.test.js +61 -0
  28. package/src/errors.js +79 -0
  29. package/src/errors.test.js +91 -0
  30. package/src/grammar_rules.js +146 -0
  31. package/src/index.js +70 -0
  32. package/src/parser.js +96 -0
  33. package/src/parser.test.js +96 -0
  34. package/src/properties/atoms.js +69 -0
  35. package/src/properties/atoms.test.js +116 -0
  36. package/src/properties/formula.js +111 -0
  37. package/src/properties/formula.test.js +95 -0
  38. package/src/properties/molecularWeight.js +80 -0
  39. package/src/properties/molecularWeight.test.js +84 -0
  40. package/src/properties/properties.test.js +77 -0
  41. package/src/renderers/README.md +127 -0
  42. package/src/renderers/svg.js +113 -0
  43. package/src/renderers/svg.test.js +42 -0
  44. package/src/syntax.js +641 -0
  45. package/src/syntax.test.js +363 -0
  46. package/src/tokenizer.js +99 -0
  47. package/src/tokenizer.test.js +55 -0
  48. package/src/validator.js +70 -0
  49. package/src/validator.test.js +44 -0
package/src/syntax.js ADDED
@@ -0,0 +1,641 @@
1
+ /**
2
+ * Syntax Highlighting API - Provides token information for syntax highlighting
3
+ *
4
+ * This module exports functions to tokenize SELFIES and DSL code for
5
+ * syntax highlighting in editors and other downstream tools.
6
+ */
7
+
8
+ import { lex as dslLex, TokenType } from './dsl/lexer.js'
9
+ import { getAlphabet, isValidToken } from './alphabet.js'
10
+
11
/**
 * Token categories reported in the `type` field of syntax tokens.
 *
 * The values are plain strings; they are also used as the keys of the maps
 * returned by the color-scheme / scope helper functions in this module.
 */
export const SyntaxTokenType = {
  // ---- SELFIES tokens ----

  /** Valid atom: [C], [N], [O] */
  ATOM: 'atom',
  /** Bond modifier: [=C], [#N] */
  BOND: 'bond',
  /** Branch token: [Branch1], [=Branch2] */
  BRANCH: 'branch',
  /** Ring token: [Ring1], [Ring2] */
  RING: 'ring',
  /** Invalid SELFIES token */
  INVALID_TOKEN: 'invalid_token',

  // ---- DSL tokens ----

  /** import, from */
  KEYWORD: 'keyword',
  /** User-defined names: [methyl], [ethanol] */
  IDENTIFIER: 'identifier',
  /** Reference to a defined name in an expression */
  REFERENCE: 'reference',
  /** = */
  OPERATOR: 'operator',
  /** # comment */
  COMMENT: 'comment',
  /** "path/to/file.selfies" */
  STRING: 'string',
  /** *, , */
  PUNCTUATION: 'punctuation',

  // ---- Common ----

  WHITESPACE: 'whitespace',
  NEWLINE: 'newline',
  /** Syntax errors */
  ERROR: 'error'
}
36
+
37
/**
 * Semantic modifiers that may accompany a syntax token (in its `modifiers`
 * array) to allow finer-grained highlighting.
 */
export const TokenModifier = {
  /** Where a name is defined */
  DEFINITION: 'definition',
  /** Where a name is used */
  REFERENCE: 'reference',
  /** Chemically valid */
  VALID: 'valid',
  /** Chemically invalid */
  INVALID: 'invalid',
  /** Organic subset atom (C, N, O, S, P, F, Cl, Br, I, B) */
  ORGANIC: 'organic',
  /** Metal atom (Li, Na, K, Mg, Ca, Fe, Cu, Zn, Al) */
  METAL: 'metal',
  /** Halogen atom (F, Cl, Br, I) */
  HALOGEN: 'halogen'
}
49
+
50
+ /**
51
+ * Result from tokenization containing tokens and metadata
52
+ * @typedef {Object} TokenizationResult
53
+ * @property {SyntaxToken[]} tokens - Array of syntax tokens
54
+ * @property {Object[]} errors - Array of tokenization errors
55
+ * @property {Object} metadata - Additional metadata (defined names, etc.)
56
+ */
57
+
58
+ /**
59
+ * Single syntax token
60
+ * @typedef {Object} SyntaxToken
61
+ * @property {string} type - Token type from SyntaxTokenType
62
+ * @property {string} value - Token text
63
+ * @property {number} start - Start offset in source
64
+ * @property {number} end - End offset in source
65
+ * @property {number} line - Line number (1-indexed)
66
+ * @property {number} column - Column number (1-indexed)
67
+ * @property {string[]} [modifiers] - Optional semantic modifiers
68
+ * @property {Object} [data] - Optional additional data
69
+ */
70
+
71
/**
 * Tokenizes a SELFIES string for syntax highlighting.
 *
 * Every bracketed run `[...]` (with at least one character inside) becomes a
 * classified token. Any text found between two bracketed runs, or after the
 * last one, is emitted as an ERROR token and also recorded in `errors`.
 * All tokens are reported on line 1 with `column = start + 1`.
 *
 * @param {string} selfies - SELFIES string
 * @param {Object} [options] - Options
 * @param {boolean} [options.validateAgainstAlphabet=true] - Validate tokens against SELFIES alphabet
 * @param {Set<string>} [options.knownNames] - Set of known DSL names to highlight as references
 * @returns {TokenizationResult} Tokenization result (`metadata` is always an empty object)
 *
 * Example:
 *   tokenizeSelfies('[C][=C][Branch1][C][O]')
 *   // {
 *   //   tokens: [
 *   //     { type: 'atom', value: '[C]', start: 0, end: 3, ... },
 *   //     { type: 'bond', value: '[=C]', start: 3, end: 7, ... },
 *   //     { type: 'branch', value: '[Branch1]', start: 7, end: 16, ... },
 *   //     { type: 'atom', value: '[C]', start: 16, end: 19, ... },
 *   //     { type: 'atom', value: '[O]', start: 19, end: 22, ... }
 *   //   ],
 *   //   errors: [],
 *   //   metadata: {}
 *   // }
 */
export function tokenizeSelfies(selfies, options = {}) {
  const { validateAgainstAlphabet = true, knownNames = new Set() } = options

  // The alphabet is only looked up when validation was requested.
  const alphabet = validateAgainstAlphabet ? getAlphabet() : null

  const tokens = []
  const errors = []

  // Records text that is not part of any bracketed token, both as an ERROR
  // token (so the stream still covers the input) and as an error entry.
  const reportStray = (from, to, describe) => {
    const text = selfies.slice(from, to)
    tokens.push({
      type: SyntaxTokenType.ERROR,
      value: text,
      start: from,
      end: to,
      line: 1,
      column: from + 1
    })
    errors.push({ message: describe(text), start: from, end: to })
  }

  const bracketed = /\[[^\]]+\]/g
  let cursor = 0

  for (let found = bracketed.exec(selfies); found !== null; found = bracketed.exec(selfies)) {
    const begin = found.index

    // Anything skipped over since the previous token is invalid content.
    if (begin > cursor) {
      reportStray(cursor, begin, (gap) => `Invalid content between tokens: "${gap}"`)
    }

    const value = found[0]
    const stop = begin + value.length
    const inner = value.slice(1, -1) // text without the surrounding brackets
    const { type, modifiers } = classifySelfiesToken(inner, alphabet, knownNames)

    tokens.push({
      type,
      value,
      start: begin,
      end: stop,
      line: 1,
      column: begin + 1,
      modifiers
    })

    cursor = stop
  }

  // Leftover text after the final bracketed token.
  if (cursor < selfies.length) {
    reportStray(cursor, selfies.length, (rest) => `Invalid trailing content: "${rest}"`)
  }

  return { tokens, errors, metadata: {} }
}
164
+
165
/**
 * Classifies the content of one bracketed token (brackets already removed).
 *
 * Checks are applied in this order: known DSL name, branch (content contains
 * "Branch"), ring (content contains "Ring"), bond-prefixed atom (leading `=`
 * or `#`), plain atom. When an alphabet is supplied and the bracketed token
 * is not in it, the token is reported as INVALID_TOKEN with the `invalid`
 * modifier appended to whatever modifiers were already collected.
 *
 * @param {string} content - Token content without brackets
 * @param {Set<string>|null} alphabet - SELFIES alphabet for validation, or null to skip validation
 * @param {Set<string>} knownNames - Known DSL names (stored with brackets)
 * @returns {{ type: string, modifiers: string[] }}
 */
function classifySelfiesToken(content, alphabet, knownNames) {
  const bracketed = `[${content}]`

  // User-defined names take precedence over every built-in category.
  if (knownNames.has(bracketed)) {
    return { type: SyntaxTokenType.REFERENCE, modifiers: [TokenModifier.REFERENCE] }
  }

  const modifiers = []
  const hasDoublePrefix = content.startsWith('=')
  const hasBondPrefix = hasDoublePrefix || content.startsWith('#')
  const bondModifier = hasDoublePrefix ? 'double' : 'triple'

  // Without an alphabet nothing is ever rejected.
  const rejected = Boolean(alphabet) && !alphabet.has(bracketed)

  // Produces the final result: the given type when accepted, otherwise
  // INVALID_TOKEN with the `invalid` modifier appended.
  const settle = (acceptedType) => {
    if (!rejected) {
      return { type: acceptedType, modifiers }
    }
    modifiers.push(TokenModifier.INVALID)
    return { type: SyntaxTokenType.INVALID_TOKEN, modifiers }
  }

  if (content.includes('Branch')) {
    // Bond-modified branch: [=Branch1], [#Branch1]
    if (hasBondPrefix) {
      modifiers.push(bondModifier)
    }
    return settle(SyntaxTokenType.BRANCH)
  }

  if (content.includes('Ring')) {
    return settle(SyntaxTokenType.RING)
  }

  if (hasBondPrefix) {
    modifiers.push(bondModifier)
    addElementModifiers(content.slice(1), modifiers)
    return settle(SyntaxTokenType.BOND)
  }

  // Plain atom: only this category receives the `valid` modifier.
  addElementModifiers(content, modifiers)
  if (!rejected) {
    modifiers.push(TokenModifier.VALID)
  }
  return settle(SyntaxTokenType.ATOM)
}
227
+
228
/**
 * Appends element-class modifiers for an element symbol.
 *
 * The symbol is compared exactly against three fixed lists; a symbol may
 * match more than one (e.g. halogens are also in the organic list), in which
 * case modifiers are appended in the order organic, halogen, metal.
 *
 * @param {string} element - Element symbol
 * @param {string[]} modifiers - Modifiers array, mutated in place
 */
function addElementModifiers(element, modifiers) {
  const isOneOf = (symbols) => symbols.includes(element)

  if (isOneOf(['C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I', 'B'])) {
    modifiers.push(TokenModifier.ORGANIC)
  }

  if (isOneOf(['F', 'Cl', 'Br', 'I'])) {
    modifiers.push(TokenModifier.HALOGEN)
  }

  if (isOneOf(['Li', 'Na', 'K', 'Mg', 'Ca', 'Fe', 'Cu', 'Zn', 'Al'])) {
    modifiers.push(TokenModifier.METAL)
  }
}
248
+
249
/**
 * Tokenizes DSL source code for syntax highlighting.
 *
 * The source is run through the DSL lexer and every lexer token except EOF
 * is mapped to a syntax token. A bracketed token immediately followed by `=`
 * is reported as an identifier with the `definition` modifier. Any other
 * bracketed token is a reference when its text matches a tracked definition
 * (from anywhere in the source), and is otherwise classified as a SELFIES
 * token.
 *
 * If the lexer throws, no tokens are returned and a single error spanning
 * the whole source is reported.
 *
 * @param {string} source - DSL source code
 * @param {Object} [options] - Options
 * @param {boolean} [options.trackDefinitions=true] - Collect defined names so that
 *   their uses are reported as references. When false, `metadata.definedNames`
 *   stays empty and no token is classified as a reference.
 * @param {boolean} [options.validateSelfies=true] - Validate SELFIES tokens against alphabet
 * @returns {TokenizationResult} Tokenization result; `metadata.definedNames` is a Set of defined names
 *
 * Example:
 *   tokenizeDSL('[methyl] = [C] # comment')
 *   // {
 *   //   tokens: [
 *   //     { type: 'identifier', value: '[methyl]', start: 0, end: 8, modifiers: ['definition'], ... },
 *   //     { type: 'operator', value: '=', start: 9, end: 10, ... },
 *   //     { type: 'atom', value: '[C]', start: 11, end: 14, modifiers: ['organic', 'valid'], ... },
 *   //     { type: 'comment', value: '# comment', ... }
 *   //   ],
 *   //   errors: [],
 *   //   metadata: { definedNames: Set(['[methyl]']) }
 *   // }
 */
export function tokenizeDSL(source, options = {}) {
  const {
    trackDefinitions = true,
    validateSelfies = true
  } = options

  const tokens = []
  const errors = []
  const definedNames = new Set()
  const alphabet = validateSelfies ? getAlphabet() : null

  let lexTokens
  try {
    lexTokens = dslLex(source)
  } catch (e) {
    // A lexer failure gives no position information here, so the error
    // is attributed to the whole source.
    errors.push({
      message: e.message,
      start: 0,
      end: source.length
    })
    return { tokens: [], errors, metadata: { definedNames } }
  }

  // A definition is a bracketed token directly followed by `=`.
  const isDefinitionAt = (index) =>
    lexTokens[index].type === TokenType.SELFIES_TOKEN &&
    lexTokens[index + 1]?.type === TokenType.EQUALS

  // First pass: collect every defined name up front so that a reference is
  // recognized even when it appears before its definition.
  if (trackDefinitions) {
    for (let i = 0; i < lexTokens.length; i++) {
      if (isDefinitionAt(i)) {
        definedNames.add(lexTokens[i].value)
      }
    }
  }

  // Second pass: generate syntax tokens with context.
  for (let i = 0; i < lexTokens.length; i++) {
    const token = lexTokens[i]

    // EOF carries no source text to highlight.
    if (token.type === TokenType.EOF) continue

    const syntaxToken = {
      value: token.value,
      start: token.range[0],
      end: token.range[1],
      line: token.line,
      column: token.column
    }

    switch (token.type) {
      case TokenType.IMPORT:
      case TokenType.FROM:
        syntaxToken.type = SyntaxTokenType.KEYWORD
        break

      case TokenType.STRING:
        syntaxToken.type = SyntaxTokenType.STRING
        break

      case TokenType.STAR:
      case TokenType.COMMA:
        syntaxToken.type = SyntaxTokenType.PUNCTUATION
        break

      case TokenType.SELFIES_TOKEN:
        if (isDefinitionAt(i)) {
          // Name on the left-hand side of `=`
          syntaxToken.type = SyntaxTokenType.IDENTIFIER
          syntaxToken.modifiers = [TokenModifier.DEFINITION]
        } else if (definedNames.has(token.value)) {
          // Use of a name defined somewhere in this source
          syntaxToken.type = SyntaxTokenType.REFERENCE
          syntaxToken.modifiers = [TokenModifier.REFERENCE]
          syntaxToken.data = { referenceTo: token.value }
        } else {
          // Otherwise, treat as a SELFIES token
          const content = token.value.slice(1, -1)
          const { type, modifiers } = classifySelfiesToken(content, alphabet, definedNames)
          syntaxToken.type = type
          syntaxToken.modifiers = modifiers
        }
        break

      case TokenType.EQUALS:
        syntaxToken.type = SyntaxTokenType.OPERATOR
        break

      case TokenType.COMMENT:
        syntaxToken.type = SyntaxTokenType.COMMENT
        break

      case TokenType.NEWLINE:
        syntaxToken.type = SyntaxTokenType.NEWLINE
        break

      case TokenType.NAME:
        // Bare word (shouldn't normally appear in valid DSL)
        syntaxToken.type = SyntaxTokenType.ERROR
        errors.push({
          message: `Unexpected bare word: "${token.value}"`,
          start: token.range[0],
          end: token.range[1]
        })
        break

      default:
        syntaxToken.type = SyntaxTokenType.ERROR
        errors.push({
          message: `Unknown token type: ${token.type}`,
          start: token.range[0],
          end: token.range[1]
        })
    }

    tokens.push(syntaxToken)
  }

  return {
    tokens,
    errors,
    metadata: { definedNames }
  }
}
407
+
408
/**
 * Gets color recommendations for token types.
 *
 * Only the exact string 'light' selects the light palette; every other
 * value (including the default) yields the dark palette. A new object is
 * returned on each call.
 *
 * @param {string} [theme='dark'] - Theme: 'dark' or 'light'
 * @returns {Object} Map of token type to CSS color value
 *
 * Example:
 *   const colors = getColorScheme('dark')
 *   // { atom: '#61afef', bond: '#c678dd', ... }
 */
export function getColorScheme(theme = 'dark') {
  const useLight = theme === 'light'

  // Each row: [token type, light-theme color, dark-theme color]
  const palette = [
    [SyntaxTokenType.ATOM, '#0184bc', '#61afef'], // Blue
    [SyntaxTokenType.BOND, '#a626a4', '#c678dd'], // Purple
    [SyntaxTokenType.BRANCH, '#c18401', '#e5c07b'], // Orange / Yellow-orange
    [SyntaxTokenType.RING, '#c18401', '#e5c07b'], // Orange / Yellow-orange
    [SyntaxTokenType.INVALID_TOKEN, '#e45649', '#e06c75'], // Red
    [SyntaxTokenType.KEYWORD, '#a626a4', '#c678dd'], // Purple
    [SyntaxTokenType.IDENTIFIER, '#0997b3', '#56b6c2'], // Cyan
    [SyntaxTokenType.REFERENCE, '#4078f2', '#61afef'], // Bright blue / Blue
    [SyntaxTokenType.OPERATOR, '#383a42', '#abb2bf'], // Gray
    [SyntaxTokenType.COMMENT, '#a0a1a7', '#5c6370'], // Light gray / Dark gray
    [SyntaxTokenType.STRING, '#50a14f', '#98c379'], // Green
    [SyntaxTokenType.PUNCTUATION, '#383a42', '#abb2bf'], // Gray
    [SyntaxTokenType.WHITESPACE, 'transparent', 'transparent'],
    [SyntaxTokenType.NEWLINE, 'transparent', 'transparent'],
    [SyntaxTokenType.ERROR, '#e45649', '#e06c75'] // Red
  ]

  return Object.fromEntries(
    palette.map(([tokenType, light, dark]) => [tokenType, useLight ? light : dark])
  )
}
457
+
458
/**
 * Gets TextMate-compatible scope names for token types.
 *
 * Useful for integrating with VS Code, Sublime Text, etc. A new object is
 * returned on each call.
 *
 * @returns {Object} Map of token type to TextMate scope
 */
export function getTextMateScopes() {
  const T = SyntaxTokenType

  return Object.fromEntries([
    [T.ATOM, 'entity.name.tag.atom.selfies'],
    [T.BOND, 'keyword.operator.bond.selfies'],
    [T.BRANCH, 'keyword.control.branch.selfies'],
    [T.RING, 'keyword.control.ring.selfies'],
    [T.INVALID_TOKEN, 'invalid.illegal.selfies'],
    [T.KEYWORD, 'keyword.control.import.selfies'],
    [T.IDENTIFIER, 'entity.name.function.selfies'],
    [T.REFERENCE, 'variable.other.reference.selfies'],
    [T.OPERATOR, 'keyword.operator.assignment.selfies'],
    [T.COMMENT, 'comment.line.number-sign.selfies'],
    [T.STRING, 'string.quoted.double.selfies'],
    [T.PUNCTUATION, 'punctuation.separator.selfies'],
    [T.WHITESPACE, 'text.whitespace.selfies'],
    [T.NEWLINE, 'text.whitespace.selfies'],
    [T.ERROR, 'invalid.illegal.selfies']
  ])
}
483
+
484
/**
 * Gets Monaco Editor compatible token types.
 *
 * A new object is returned on each call.
 *
 * @returns {Object} Map of token type to Monaco token class
 */
export function getMonacoTokenTypes() {
  const {
    ATOM, BOND, BRANCH, RING, INVALID_TOKEN,
    KEYWORD, IDENTIFIER, REFERENCE, OPERATOR, COMMENT, STRING, PUNCTUATION,
    WHITESPACE, NEWLINE, ERROR
  } = SyntaxTokenType

  const monacoTypes = {}

  // SELFIES tokens
  monacoTypes[ATOM] = 'type.identifier'
  monacoTypes[BOND] = 'keyword'
  monacoTypes[BRANCH] = 'keyword.control'
  monacoTypes[RING] = 'keyword.control'
  monacoTypes[INVALID_TOKEN] = 'invalid'

  // DSL tokens
  monacoTypes[KEYWORD] = 'keyword'
  monacoTypes[IDENTIFIER] = 'type.identifier'
  monacoTypes[REFERENCE] = 'variable'
  monacoTypes[OPERATOR] = 'operator'
  monacoTypes[COMMENT] = 'comment'
  monacoTypes[STRING] = 'string'
  monacoTypes[PUNCTUATION] = 'delimiter'

  // Common
  monacoTypes[WHITESPACE] = 'white'
  monacoTypes[NEWLINE] = 'white'
  monacoTypes[ERROR] = 'invalid'

  return monacoTypes
}
507
+
508
/**
 * Creates a Monaco Editor language definition.
 *
 * The returned object combines language registration fields (`id`,
 * `extensions`, `aliases`, `mimetypes`) with a `tokenizer` whose single
 * `root` state is an ordered list of `[regex, tokenClass]` rules.
 *
 * @returns {Object} Monaco language definition object
 */
export function createMonacoLanguage() {
  // DSL-level rules
  const comment = [/#.*$/, 'comment']
  const keywords = [/\b(import|from)\b/, 'keyword']
  const strings = [/"[^"]*"/, 'string']
  const assignment = [/=/, 'operator']
  const separators = [/[*,]/, 'delimiter']

  // Bracketed SELFIES tokens. Branch and ring rules are listed before the
  // atom rules.
  const branchToken = [/\[=?#?Branch[123]\]/, 'keyword.control']
  const ringToken = [/\[-?\/?\\?Ring[123]\]/, 'keyword.control']
  const bondedAtom = [/\[[=#][A-Z][a-z]?\]/, 'keyword']
  const plainAtom = [/\[[A-Z][a-z]?\]/, 'type.identifier']

  // User-defined names (identifiers) start with a lowercase letter.
  const userName = [/\[[a-z][a-zA-Z0-9_]*\]/, 'variable']

  const whitespace = [/\s+/, 'white']

  const root = [
    comment,
    keywords,
    strings,
    assignment,
    separators,
    branchToken,
    ringToken,
    bondedAtom,
    plainAtom,
    userName,
    whitespace
  ]

  return {
    id: 'selfies',
    extensions: ['.selfies'],
    aliases: ['SELFIES', 'selfies'],
    mimetypes: ['text/x-selfies'],
    tokenizer: { root }
  }
}
555
+
556
/**
 * Checks whether a token stream covers the whole source without holes.
 *
 * Tokens are walked in the order given. A gap is recorded whenever a token
 * starts past the furthest offset covered so far, and once more if the last
 * covered offset falls short of the source length. Overlapping tokens are
 * not reported.
 *
 * @param {SyntaxToken[]} tokens - Array of syntax tokens
 * @param {string} originalSource - Original source string
 * @returns {{ valid: boolean, gaps: Array<{start: number, end: number}> }}
 */
export function validateTokenization(tokens, originalSource) {
  const gaps = []
  const noteGap = (start, end) => {
    gaps.push({ start, end })
  }

  let covered = 0
  for (const current of tokens) {
    if (current.start > covered) {
      noteGap(covered, current.start)
    }
    covered = Math.max(covered, current.end)
  }

  const sourceLength = originalSource.length
  if (covered < sourceLength) {
    noteGap(covered, sourceLength)
  }

  return { valid: gaps.length === 0, gaps }
}
582
+
583
/**
 * Highlights source code by wrapping tokens in HTML spans.
 *
 * Each token becomes `<span class="..." style="color: ...">` containing the
 * HTML-escaped token text. Source text not covered by any token (between
 * tokens or after the last one) is emitted escaped but unwrapped, so the
 * text content of the result matches the source.
 *
 * @param {string} source - Source code
 * @param {Object} [options] - Options
 * @param {string} [options.language='dsl'] - 'selfies' or 'dsl' (any value other than 'selfies' is treated as DSL)
 * @param {string} [options.theme='dark'] - Color theme
 * @param {string} [options.classPrefix='selfies-'] - CSS class prefix (HTML-escaped before use)
 * @returns {string} HTML string with highlighted tokens
 */
export function highlightToHtml(source, options = {}) {
  const {
    language = 'dsl',
    theme = 'dark',
    classPrefix = 'selfies-'
  } = options

  const { tokens } = language === 'selfies'
    ? tokenizeSelfies(source)
    : tokenizeDSL(source)

  const colors = getColorScheme(theme)

  // The prefix is caller-supplied and lands inside a quoted attribute, so it
  // must be escaped like the token text; otherwise a prefix containing `"`
  // could break out of the class attribute.
  const safePrefix = escapeHtml(String(classPrefix))

  let html = ''
  let lastEnd = 0

  for (const token of tokens) {
    // Add any gap content
    if (token.start > lastEnd) {
      html += escapeHtml(source.slice(lastEnd, token.start))
    }

    const className = `${safePrefix}${token.type}`
    const color = colors[token.type] || 'inherit'
    const modifierClasses = token.modifiers?.map(m => `${safePrefix}${m}`).join(' ') || ''

    html += `<span class="${className} ${modifierClasses}" style="color: ${color}">${escapeHtml(token.value)}</span>`
    lastEnd = token.end
  }

  // Add any remaining content
  if (lastEnd < source.length) {
    html += escapeHtml(source.slice(lastEnd))
  }

  return html
}
628
+
629
/**
 * Escapes the HTML special characters `&`, `<`, `>`, `"` and `'`.
 *
 * @param {string} text - Text to escape
 * @returns {string} Escaped text
 */
function escapeHtml(text) {
  const entityFor = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;'
  }

  // A single pass over the input; each special character is replaced once,
  // so the ampersands introduced by the entities are never re-escaped.
  return text.replace(/[&<>"']/g, (ch) => entityFor[ch])
}