selfies-js 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +274 -0
- package/package.json +65 -0
- package/src/alphabet.js +150 -0
- package/src/alphabet.test.js +82 -0
- package/src/chemistryValidator.js +236 -0
- package/src/cli.js +206 -0
- package/src/constraints.js +186 -0
- package/src/constraints.test.js +126 -0
- package/src/decoder.js +636 -0
- package/src/decoder.test.js +560 -0
- package/src/dsl/analyzer.js +170 -0
- package/src/dsl/analyzer.test.js +139 -0
- package/src/dsl/dsl.test.js +146 -0
- package/src/dsl/importer.js +238 -0
- package/src/dsl/index.js +32 -0
- package/src/dsl/lexer.js +264 -0
- package/src/dsl/lexer.test.js +115 -0
- package/src/dsl/parser.js +201 -0
- package/src/dsl/parser.test.js +148 -0
- package/src/dsl/resolver.js +136 -0
- package/src/dsl/resolver.test.js +99 -0
- package/src/dsl/symbolTable.js +56 -0
- package/src/dsl/symbolTable.test.js +68 -0
- package/src/dsl/valenceValidator.js +147 -0
- package/src/encoder.js +467 -0
- package/src/encoder.test.js +61 -0
- package/src/errors.js +79 -0
- package/src/errors.test.js +91 -0
- package/src/grammar_rules.js +146 -0
- package/src/index.js +70 -0
- package/src/parser.js +96 -0
- package/src/parser.test.js +96 -0
- package/src/properties/atoms.js +69 -0
- package/src/properties/atoms.test.js +116 -0
- package/src/properties/formula.js +111 -0
- package/src/properties/formula.test.js +95 -0
- package/src/properties/molecularWeight.js +80 -0
- package/src/properties/molecularWeight.test.js +84 -0
- package/src/properties/properties.test.js +77 -0
- package/src/renderers/README.md +127 -0
- package/src/renderers/svg.js +113 -0
- package/src/renderers/svg.test.js +42 -0
- package/src/syntax.js +641 -0
- package/src/syntax.test.js +363 -0
- package/src/tokenizer.js +99 -0
- package/src/tokenizer.test.js +55 -0
- package/src/validator.js +70 -0
- package/src/validator.test.js +44 -0
package/src/syntax.js
ADDED
@@ -0,0 +1,641 @@
/**
 * Syntax Highlighting API - Provides token information for syntax highlighting
 *
 * This module exports functions to tokenize SELFIES and DSL code for
 * syntax highlighting in editors and other downstream tools.
 */

import { lex as dslLex, TokenType } from './dsl/lexer.js'
import { getAlphabet, isValidToken } from './alphabet.js'

/**
 * Token types for syntax highlighting
 */
export const SyntaxTokenType = {
  // SELFIES tokens
  ATOM: 'atom', // Valid atom: [C], [N], [O]
  BOND: 'bond', // Bond modifier: [=C], [#N]
  BRANCH: 'branch', // Branch token: [Branch1], [=Branch2]
  RING: 'ring', // Ring token: [Ring1], [Ring2]
  INVALID_TOKEN: 'invalid_token', // Invalid SELFIES token

  // DSL tokens
  KEYWORD: 'keyword', // import, from
  IDENTIFIER: 'identifier', // User-defined names: [methyl], [ethanol]
  REFERENCE: 'reference', // Reference to defined name in expression
  OPERATOR: 'operator', // =
  COMMENT: 'comment', // # comment
  STRING: 'string', // "path/to/file.selfies"
  PUNCTUATION: 'punctuation', // *, ,

  // Common
  WHITESPACE: 'whitespace',
  NEWLINE: 'newline',
  ERROR: 'error' // Syntax errors
}

/**
 * Semantic token modifiers for enhanced highlighting
 */
export const TokenModifier = {
  DEFINITION: 'definition', // Where a name is defined
  REFERENCE: 'reference', // Where a name is used
  VALID: 'valid', // Chemically valid
  INVALID: 'invalid', // Chemically invalid
  ORGANIC: 'organic', // Organic subset atom (C, N, O, S, P, F, Cl, Br, I)
  METAL: 'metal', // Metal atom
  HALOGEN: 'halogen' // Halogen atom
}

/**
 * Result from tokenization containing tokens and metadata
 * @typedef {Object} TokenizationResult
 * @property {SyntaxToken[]} tokens - Array of syntax tokens
 * @property {Object[]} errors - Array of tokenization errors
 * @property {Object} metadata - Additional metadata (defined names, etc.)
 */

/**
 * Single syntax token
 * @typedef {Object} SyntaxToken
 * @property {string} type - Token type from SyntaxTokenType
 * @property {string} value - Token text
 * @property {number} start - Start offset in source
 * @property {number} end - End offset in source
 * @property {number} line - Line number (1-indexed)
 * @property {number} column - Column number (1-indexed)
 * @property {string[]} [modifiers] - Optional semantic modifiers
 * @property {Object} [data] - Optional additional data
 */

/**
 * Tokenizes a SELFIES string for syntax highlighting
 * @param {string} selfies - SELFIES string
 * @param {Object} [options] - Options
 * @param {boolean} [options.validateAgainstAlphabet=true] - Validate tokens against SELFIES alphabet
 * @param {Set<string>} [options.knownNames] - Set of known DSL names to highlight as references
 * @returns {TokenizationResult} Tokenization result
 *
 * Example:
 *   tokenizeSelfies('[C][=C][Branch1][C][O]')
 *   // {
 *   //   tokens: [
 *   //     { type: 'atom', value: '[C]', start: 0, end: 3, ... },
 *   //     { type: 'bond', value: '[=C]', start: 3, end: 7, ... },
 *   //     { type: 'branch', value: '[Branch1]', start: 7, end: 16, ... },
 *   //     { type: 'atom', value: '[C]', start: 16, end: 19, ... },
 *   //     { type: 'atom', value: '[O]', start: 19, end: 22, ... }
 *   //   ],
 *   //   errors: [],
 *   //   metadata: {}
 *   // }
 */
export function tokenizeSelfies(selfies, options = {}) {
  const {
    validateAgainstAlphabet = true,
    knownNames = new Set()
  } = options

  const tokens = []
  const errors = []
  const regex = /\[[^\]]+\]/g
  let match
  let lastEnd = 0

  // Get the alphabet for validation
  const alphabet = validateAgainstAlphabet ? getAlphabet() : null

  while ((match = regex.exec(selfies)) !== null) {
    // Check for gap (invalid content between tokens)
    if (match.index > lastEnd) {
      const gapContent = selfies.slice(lastEnd, match.index)
      tokens.push({
        type: SyntaxTokenType.ERROR,
        value: gapContent,
        start: lastEnd,
        end: match.index,
        line: 1,
        column: lastEnd + 1
      })
      errors.push({
        message: `Invalid content between tokens: "${gapContent}"`,
        start: lastEnd,
        end: match.index
      })
    }

    const value = match[0]
    const content = value.slice(1, -1) // Remove brackets
    const { type, modifiers } = classifySelfiesToken(content, alphabet, knownNames)

    tokens.push({
      type,
      value,
      start: match.index,
      end: match.index + value.length,
      line: 1,
      column: match.index + 1,
      modifiers
    })

    lastEnd = match.index + value.length
  }

  // Check for trailing content
  if (lastEnd < selfies.length) {
    const trailingContent = selfies.slice(lastEnd)
    tokens.push({
      type: SyntaxTokenType.ERROR,
      value: trailingContent,
      start: lastEnd,
      end: selfies.length,
      line: 1,
      column: lastEnd + 1
    })
    errors.push({
      message: `Invalid trailing content: "${trailingContent}"`,
      start: lastEnd,
      end: selfies.length
    })
  }

  return { tokens, errors, metadata: {} }
}

/**
 * Classifies a SELFIES token content (without brackets)
 * @param {string} content - Token content without brackets
 * @param {Set<string>|null} alphabet - SELFIES alphabet for validation
 * @param {Set<string>} knownNames - Known DSL names
 * @returns {{ type: string, modifiers: string[] }}
 */
function classifySelfiesToken(content, alphabet, knownNames) {
  const fullToken = `[${content}]`
  const modifiers = []

  // Check if it's a known DSL name reference
  if (knownNames.has(fullToken)) {
    return { type: SyntaxTokenType.REFERENCE, modifiers: [TokenModifier.REFERENCE] }
  }

  // Check branches
  if (content.includes('Branch')) {
    // Check for bond-modified branch: [=Branch1], [#Branch1]
    if (content.startsWith('=') || content.startsWith('#')) {
      modifiers.push(content.startsWith('=') ? 'double' : 'triple')
    }
    if (alphabet && !alphabet.has(fullToken)) {
      modifiers.push(TokenModifier.INVALID)
      return { type: SyntaxTokenType.INVALID_TOKEN, modifiers }
    }
    return { type: SyntaxTokenType.BRANCH, modifiers }
  }

  // Check rings
  if (content.includes('Ring')) {
    if (alphabet && !alphabet.has(fullToken)) {
      modifiers.push(TokenModifier.INVALID)
      return { type: SyntaxTokenType.INVALID_TOKEN, modifiers }
    }
    return { type: SyntaxTokenType.RING, modifiers }
  }

  // Check for bond-modified atoms
  if (content.startsWith('=') || content.startsWith('#')) {
    const element = content.slice(1)
    modifiers.push(content.startsWith('=') ? 'double' : 'triple')
    addElementModifiers(element, modifiers)

    if (alphabet && !alphabet.has(fullToken)) {
      modifiers.push(TokenModifier.INVALID)
      return { type: SyntaxTokenType.INVALID_TOKEN, modifiers }
    }
    return { type: SyntaxTokenType.BOND, modifiers }
  }

  // Regular atom
  addElementModifiers(content, modifiers)

  if (alphabet && !alphabet.has(fullToken)) {
    modifiers.push(TokenModifier.INVALID)
    return { type: SyntaxTokenType.INVALID_TOKEN, modifiers }
  }

  modifiers.push(TokenModifier.VALID)
  return { type: SyntaxTokenType.ATOM, modifiers }
}

/**
 * Adds element-specific modifiers
 * @param {string} element - Element symbol
 * @param {string[]} modifiers - Modifiers array to add to
 */
function addElementModifiers(element, modifiers) {
  const organicSubset = ['C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I', 'B']
  const halogens = ['F', 'Cl', 'Br', 'I']
  const metals = ['Li', 'Na', 'K', 'Mg', 'Ca', 'Fe', 'Cu', 'Zn', 'Al']

  if (organicSubset.includes(element)) {
    modifiers.push(TokenModifier.ORGANIC)
  }
  if (halogens.includes(element)) {
    modifiers.push(TokenModifier.HALOGEN)
  }
  if (metals.includes(element)) {
    modifiers.push(TokenModifier.METAL)
  }
}

/**
 * Tokenizes DSL source code for syntax highlighting
 * @param {string} source - DSL source code
 * @param {Object} [options] - Options
 * @param {boolean} [options.trackDefinitions=true] - Track name definitions
 * @param {boolean} [options.validateSelfies=true] - Validate SELFIES tokens against alphabet
 * @returns {TokenizationResult} Tokenization result
 *
 * Example:
 *   tokenizeDSL('[methyl] = [C] # comment')
 *   // {
 *   //   tokens: [
 *   //     { type: 'identifier', value: '[methyl]', start: 0, end: 8, modifiers: ['definition'] },
 *   //     { type: 'operator', value: '=', start: 9, end: 10 },
 *   //     { type: 'atom', value: '[C]', start: 11, end: 14, modifiers: ['valid', 'organic'] },
 *   //     { type: 'comment', value: '# comment', start: 16, end: 25 }
 *   //   ],
 *   //   errors: [],
 *   //   metadata: { definedNames: Set(['[methyl]']) }
 *   // }
 */
export function tokenizeDSL(source, options = {}) {
  const {
    trackDefinitions = true,
    validateSelfies = true
  } = options

  const tokens = []
  const errors = []
  const definedNames = new Set()
  const alphabet = validateSelfies ? getAlphabet() : null

  let lexTokens
  try {
    lexTokens = dslLex(source)
  } catch (e) {
    errors.push({
      message: e.message,
      start: 0,
      end: source.length
    })
    return { tokens: [], errors, metadata: { definedNames } }
  }

  // First pass: identify definitions
  let inDefinition = false
  let currentDefinitionName = null

  for (let i = 0; i < lexTokens.length; i++) {
    const token = lexTokens[i]
    const nextToken = lexTokens[i + 1]

    // Track definition context
    if (token.type === TokenType.SELFIES_TOKEN && nextToken?.type === TokenType.EQUALS) {
      definedNames.add(token.value)
    }
  }

  // Second pass: generate syntax tokens with context
  inDefinition = false

  for (let i = 0; i < lexTokens.length; i++) {
    const token = lexTokens[i]
    const nextToken = lexTokens[i + 1]

    // Skip EOF
    if (token.type === TokenType.EOF) continue

    const syntaxToken = {
      value: token.value,
      start: token.range[0],
      end: token.range[1],
      line: token.line,
      column: token.column
    }

    switch (token.type) {
      case TokenType.IMPORT:
        syntaxToken.type = SyntaxTokenType.KEYWORD
        break

      case TokenType.FROM:
        syntaxToken.type = SyntaxTokenType.KEYWORD
        break

      case TokenType.STRING:
        syntaxToken.type = SyntaxTokenType.STRING
        break

      case TokenType.STAR:
      case TokenType.COMMA:
        syntaxToken.type = SyntaxTokenType.PUNCTUATION
        break

      case TokenType.SELFIES_TOKEN:
        // Check if this is a definition (name before =)
        if (nextToken?.type === TokenType.EQUALS) {
          syntaxToken.type = SyntaxTokenType.IDENTIFIER
          syntaxToken.modifiers = [TokenModifier.DEFINITION]
          inDefinition = true
          currentDefinitionName = token.value
        }
        // Check if this is a reference to a defined name
        else if (definedNames.has(token.value)) {
          syntaxToken.type = SyntaxTokenType.REFERENCE
          syntaxToken.modifiers = [TokenModifier.REFERENCE]
          syntaxToken.data = { referenceTo: token.value }
        }
        // Otherwise, treat as SELFIES token
        else {
          const content = token.value.slice(1, -1)
          const { type, modifiers } = classifySelfiesToken(content, alphabet, definedNames)
          syntaxToken.type = type
          syntaxToken.modifiers = modifiers
        }
        break

      case TokenType.EQUALS:
        syntaxToken.type = SyntaxTokenType.OPERATOR
        break

      case TokenType.COMMENT:
        syntaxToken.type = SyntaxTokenType.COMMENT
        break

      case TokenType.NEWLINE:
        syntaxToken.type = SyntaxTokenType.NEWLINE
        inDefinition = false
        break

      case TokenType.NAME:
        // Bare word (shouldn't normally appear in valid DSL)
        syntaxToken.type = SyntaxTokenType.ERROR
        errors.push({
          message: `Unexpected bare word: "${token.value}"`,
          start: token.range[0],
          end: token.range[1]
        })
        break

      default:
        syntaxToken.type = SyntaxTokenType.ERROR
        errors.push({
          message: `Unknown token type: ${token.type}`,
          start: token.range[0],
          end: token.range[1]
        })
    }

    tokens.push(syntaxToken)
  }

  return {
    tokens,
    errors,
    metadata: { definedNames }
  }
}

/**
 * Gets color recommendations for token types
 * @param {string} [theme='dark'] - Theme: 'dark' or 'light'
 * @returns {Object} Map of token type to CSS color value
 *
 * Example:
 *   const colors = getColorScheme('dark')
 *   // { atom: '#61afef', bond: '#c678dd', ... }
 */
export function getColorScheme(theme = 'dark') {
  if (theme === 'light') {
    return {
      [SyntaxTokenType.ATOM]: '#0184bc', // Blue
      [SyntaxTokenType.BOND]: '#a626a4', // Purple
      [SyntaxTokenType.BRANCH]: '#c18401', // Orange
      [SyntaxTokenType.RING]: '#c18401', // Orange
      [SyntaxTokenType.INVALID_TOKEN]: '#e45649', // Red
      [SyntaxTokenType.KEYWORD]: '#a626a4', // Purple
      [SyntaxTokenType.IDENTIFIER]: '#0997b3', // Cyan
      [SyntaxTokenType.REFERENCE]: '#4078f2', // Bright blue
      [SyntaxTokenType.OPERATOR]: '#383a42', // Gray
      [SyntaxTokenType.COMMENT]: '#a0a1a7', // Light gray
      [SyntaxTokenType.STRING]: '#50a14f', // Green
      [SyntaxTokenType.PUNCTUATION]: '#383a42', // Gray
      [SyntaxTokenType.WHITESPACE]: 'transparent',
      [SyntaxTokenType.NEWLINE]: 'transparent',
      [SyntaxTokenType.ERROR]: '#e45649' // Red
    }
  }

  // Dark theme (default)
  return {
    [SyntaxTokenType.ATOM]: '#61afef', // Blue
    [SyntaxTokenType.BOND]: '#c678dd', // Purple
    [SyntaxTokenType.BRANCH]: '#e5c07b', // Yellow/Orange
    [SyntaxTokenType.RING]: '#e5c07b', // Yellow/Orange
    [SyntaxTokenType.INVALID_TOKEN]: '#e06c75', // Red
    [SyntaxTokenType.KEYWORD]: '#c678dd', // Purple
    [SyntaxTokenType.IDENTIFIER]: '#56b6c2', // Cyan
    [SyntaxTokenType.REFERENCE]: '#61afef', // Blue
    [SyntaxTokenType.OPERATOR]: '#abb2bf', // Gray
    [SyntaxTokenType.COMMENT]: '#5c6370', // Dark gray
    [SyntaxTokenType.STRING]: '#98c379', // Green
    [SyntaxTokenType.PUNCTUATION]: '#abb2bf', // Gray
    [SyntaxTokenType.WHITESPACE]: 'transparent',
    [SyntaxTokenType.NEWLINE]: 'transparent',
    [SyntaxTokenType.ERROR]: '#e06c75' // Red
  }
}

/**
 * Gets TextMate-compatible scope names for token types
 * @returns {Object} Map of token type to TextMate scope
 *
 * Useful for integrating with VS Code, Sublime Text, etc.
 */
export function getTextMateScopes() {
  return {
    [SyntaxTokenType.ATOM]: 'entity.name.tag.atom.selfies',
    [SyntaxTokenType.BOND]: 'keyword.operator.bond.selfies',
    [SyntaxTokenType.BRANCH]: 'keyword.control.branch.selfies',
    [SyntaxTokenType.RING]: 'keyword.control.ring.selfies',
    [SyntaxTokenType.INVALID_TOKEN]: 'invalid.illegal.selfies',
    [SyntaxTokenType.KEYWORD]: 'keyword.control.import.selfies',
    [SyntaxTokenType.IDENTIFIER]: 'entity.name.function.selfies',
    [SyntaxTokenType.REFERENCE]: 'variable.other.reference.selfies',
    [SyntaxTokenType.OPERATOR]: 'keyword.operator.assignment.selfies',
    [SyntaxTokenType.COMMENT]: 'comment.line.number-sign.selfies',
    [SyntaxTokenType.STRING]: 'string.quoted.double.selfies',
    [SyntaxTokenType.PUNCTUATION]: 'punctuation.separator.selfies',
    [SyntaxTokenType.WHITESPACE]: 'text.whitespace.selfies',
    [SyntaxTokenType.NEWLINE]: 'text.whitespace.selfies',
    [SyntaxTokenType.ERROR]: 'invalid.illegal.selfies'
  }
}

/**
 * Gets Monaco Editor compatible token types
 * @returns {Object} Map of token type to Monaco token class
 */
export function getMonacoTokenTypes() {
  return {
    [SyntaxTokenType.ATOM]: 'type.identifier',
    [SyntaxTokenType.BOND]: 'keyword',
    [SyntaxTokenType.BRANCH]: 'keyword.control',
    [SyntaxTokenType.RING]: 'keyword.control',
    [SyntaxTokenType.INVALID_TOKEN]: 'invalid',
    [SyntaxTokenType.KEYWORD]: 'keyword',
    [SyntaxTokenType.IDENTIFIER]: 'type.identifier',
    [SyntaxTokenType.REFERENCE]: 'variable',
    [SyntaxTokenType.OPERATOR]: 'operator',
    [SyntaxTokenType.COMMENT]: 'comment',
    [SyntaxTokenType.STRING]: 'string',
    [SyntaxTokenType.PUNCTUATION]: 'delimiter',
    [SyntaxTokenType.WHITESPACE]: 'white',
    [SyntaxTokenType.NEWLINE]: 'white',
    [SyntaxTokenType.ERROR]: 'invalid'
  }
}

/**
 * Creates a Monaco Editor language definition
 * @returns {Object} Monaco language definition object
 */
export function createMonacoLanguage() {
  return {
    id: 'selfies',
    extensions: ['.selfies'],
    aliases: ['SELFIES', 'selfies'],
    mimetypes: ['text/x-selfies'],

    tokenizer: {
      root: [
        // Comments
        [/#.*$/, 'comment'],

        // Keywords
        [/\b(import|from)\b/, 'keyword'],

        // Strings
        [/"[^"]*"/, 'string'],

        // Operators and punctuation
        [/=/, 'operator'],
        [/[*,]/, 'delimiter'],

        // Branch tokens
        [/\[=?#?Branch[123]\]/, 'keyword.control'],

        // Ring tokens
        [/\[-?\/?\\?Ring[123]\]/, 'keyword.control'],

        // Bond + atom tokens
        [/\[[=#][A-Z][a-z]?\]/, 'keyword'],

        // Regular atom tokens
        [/\[[A-Z][a-z]?\]/, 'type.identifier'],

        // User-defined names (identifiers)
        [/\[[a-z][a-zA-Z0-9_]*\]/, 'variable'],

        // Whitespace
        [/\s+/, 'white']
      ]
    }
  }
}

/**
 * Validates that a token stream completely covers the source
 * @param {SyntaxToken[]} tokens - Array of syntax tokens
 * @param {string} originalSource - Original source string
 * @returns {{ valid: boolean, gaps: Array<{start: number, end: number}> }}
 */
export function validateTokenization(tokens, originalSource) {
  const gaps = []
  let expectedPos = 0

  for (const token of tokens) {
    if (token.start > expectedPos) {
      gaps.push({ start: expectedPos, end: token.start })
    }
    expectedPos = Math.max(expectedPos, token.end)
  }

  if (expectedPos < originalSource.length) {
    gaps.push({ start: expectedPos, end: originalSource.length })
  }

  return {
    valid: gaps.length === 0,
    gaps
  }
}

/**
 * Highlights source code by wrapping tokens in HTML spans
 * @param {string} source - Source code
 * @param {Object} [options] - Options
 * @param {string} [options.language='dsl'] - 'selfies' or 'dsl'
 * @param {string} [options.theme='dark'] - Color theme
 * @param {string} [options.classPrefix='selfies-'] - CSS class prefix
 * @returns {string} HTML string with highlighted tokens
 */
export function highlightToHtml(source, options = {}) {
  const {
    language = 'dsl',
    theme = 'dark',
    classPrefix = 'selfies-'
  } = options

  const result = language === 'selfies'
    ? tokenizeSelfies(source)
    : tokenizeDSL(source)

  const colors = getColorScheme(theme)
  let html = ''
  let lastEnd = 0

  for (const token of result.tokens) {
    // Add any gap content
    if (token.start > lastEnd) {
      html += escapeHtml(source.slice(lastEnd, token.start))
    }

    const className = `${classPrefix}${token.type}`
    const color = colors[token.type] || 'inherit'
    const modifierClasses = token.modifiers?.map(m => `${classPrefix}${m}`).join(' ') || ''

    html += `<span class="${className} ${modifierClasses}" style="color: ${color}">${escapeHtml(token.value)}</span>`
    lastEnd = token.end
  }

  // Add any remaining content
  if (lastEnd < source.length) {
    html += escapeHtml(source.slice(lastEnd))
  }

  return html
}

/**
 * Escapes HTML special characters
 * @param {string} text - Text to escape
 * @returns {string} Escaped text
 */
function escapeHtml(text) {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}
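
For orientation, a minimal usage sketch of the exports above follows. It is not part of the package diff, and the import path is an assumption (the published entry point in package.json is not shown here); the expected token classification is taken from the JSDoc example in tokenizeSelfies.

// Hypothetical consumer script, assuming the module is importable by its source path.
import { tokenizeSelfies, highlightToHtml } from './src/syntax.js'

// Classify each bracketed token in a SELFIES string.
const { tokens, errors } = tokenizeSelfies('[C][=C][Branch1][C][O]')
console.log(tokens.map(t => `${t.type}:${t.value}`).join(' '))
// e.g. atom:[C] bond:[=C] branch:[Branch1] atom:[C] atom:[O] (per the example above)
console.log(errors.length) // 0 for a well-formed string

// Render the same string as colored <span> markup using the default dark palette.
const html = highlightToHtml('[C][=C][Branch1][C][O]', { language: 'selfies' })
console.log(html)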