selfies-js 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +274 -0
- package/package.json +65 -0
- package/src/alphabet.js +150 -0
- package/src/alphabet.test.js +82 -0
- package/src/chemistryValidator.js +236 -0
- package/src/cli.js +206 -0
- package/src/constraints.js +186 -0
- package/src/constraints.test.js +126 -0
- package/src/decoder.js +636 -0
- package/src/decoder.test.js +560 -0
- package/src/dsl/analyzer.js +170 -0
- package/src/dsl/analyzer.test.js +139 -0
- package/src/dsl/dsl.test.js +146 -0
- package/src/dsl/importer.js +238 -0
- package/src/dsl/index.js +32 -0
- package/src/dsl/lexer.js +264 -0
- package/src/dsl/lexer.test.js +115 -0
- package/src/dsl/parser.js +201 -0
- package/src/dsl/parser.test.js +148 -0
- package/src/dsl/resolver.js +136 -0
- package/src/dsl/resolver.test.js +99 -0
- package/src/dsl/symbolTable.js +56 -0
- package/src/dsl/symbolTable.test.js +68 -0
- package/src/dsl/valenceValidator.js +147 -0
- package/src/encoder.js +467 -0
- package/src/encoder.test.js +61 -0
- package/src/errors.js +79 -0
- package/src/errors.test.js +91 -0
- package/src/grammar_rules.js +146 -0
- package/src/index.js +70 -0
- package/src/parser.js +96 -0
- package/src/parser.test.js +96 -0
- package/src/properties/atoms.js +69 -0
- package/src/properties/atoms.test.js +116 -0
- package/src/properties/formula.js +111 -0
- package/src/properties/formula.test.js +95 -0
- package/src/properties/molecularWeight.js +80 -0
- package/src/properties/molecularWeight.test.js +84 -0
- package/src/properties/properties.test.js +77 -0
- package/src/renderers/README.md +127 -0
- package/src/renderers/svg.js +113 -0
- package/src/renderers/svg.test.js +42 -0
- package/src/syntax.js +641 -0
- package/src/syntax.test.js +363 -0
- package/src/tokenizer.js +99 -0
- package/src/tokenizer.test.js +55 -0
- package/src/validator.js +70 -0
- package/src/validator.test.js +44 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
import { describe, it, expect } from 'bun:test'
import {
  tokenizeSelfies,
  tokenizeDSL,
  SyntaxTokenType,
  TokenModifier,
  getColorScheme,
  getTextMateScopes,
  getMonacoTokenTypes,
  createMonacoLanguage,
  validateTokenization,
  highlightToHtml
} from './syntax.js'

// Tests for SELFIES-string tokenization: token classification,
// modifier flags, error spans, and tokenizer options.
describe('tokenizeSelfies', () => {
  it('should tokenize basic atoms', () => {
    const result = tokenizeSelfies('[C][N][O]')
    expect(result.tokens).toHaveLength(3)
    expect(result.tokens[0].type).toBe(SyntaxTokenType.ATOM)
    expect(result.tokens[0].value).toBe('[C]')
    expect(result.tokens[0].start).toBe(0)
    expect(result.tokens[0].end).toBe(3)
    expect(result.tokens[1].type).toBe(SyntaxTokenType.ATOM)
    expect(result.tokens[2].type).toBe(SyntaxTokenType.ATOM)
  })

  it('should tokenize bond-modified atoms', () => {
    const result = tokenizeSelfies('[C][=C][#N]')
    expect(result.tokens).toHaveLength(3)
    expect(result.tokens[0].type).toBe(SyntaxTokenType.ATOM)
    expect(result.tokens[1].type).toBe(SyntaxTokenType.BOND)
    expect(result.tokens[1].value).toBe('[=C]')
    expect(result.tokens[2].type).toBe(SyntaxTokenType.BOND)
    expect(result.tokens[2].value).toBe('[#N]')
  })

  it('should tokenize branch tokens', () => {
    const result = tokenizeSelfies('[C][Branch1][C][O]')
    expect(result.tokens).toHaveLength(4)
    expect(result.tokens[1].type).toBe(SyntaxTokenType.BRANCH)
    expect(result.tokens[1].value).toBe('[Branch1]')
  })

  it('should tokenize ring tokens', () => {
    const result = tokenizeSelfies('[C][Ring1][C]')
    expect(result.tokens).toHaveLength(3)
    expect(result.tokens[1].type).toBe(SyntaxTokenType.RING)
    expect(result.tokens[1].value).toBe('[Ring1]')
  })

  it('should tokenize bond-modified branches', () => {
    const result = tokenizeSelfies('[C][=Branch1][C][=O]')
    expect(result.tokens).toHaveLength(4)
    expect(result.tokens[1].type).toBe(SyntaxTokenType.BRANCH)
    expect(result.tokens[1].modifiers).toContain('double')
  })

  it('should add organic modifier to organic atoms', () => {
    const result = tokenizeSelfies('[C]')
    expect(result.tokens[0].modifiers).toContain(TokenModifier.ORGANIC)
  })

  it('should add halogen modifier to halogens', () => {
    const result = tokenizeSelfies('[F][Cl][Br][I]')
    for (const token of result.tokens) {
      expect(token.modifiers).toContain(TokenModifier.HALOGEN)
    }
  })

  it('should mark invalid tokens', () => {
    const result = tokenizeSelfies('[InvalidElement]')
    expect(result.tokens[0].type).toBe(SyntaxTokenType.INVALID_TOKEN)
    expect(result.tokens[0].modifiers).toContain(TokenModifier.INVALID)
  })

  it('should handle known DSL names as references', () => {
    const knownNames = new Set(['[methyl]'])
    const result = tokenizeSelfies('[methyl][C]', { knownNames })
    expect(result.tokens[0].type).toBe(SyntaxTokenType.REFERENCE)
    expect(result.tokens[1].type).toBe(SyntaxTokenType.ATOM)
  })

  it('should detect gaps between tokens as errors', () => {
    const result = tokenizeSelfies('[C]invalid[N]')
    expect(result.tokens).toHaveLength(3)
    expect(result.tokens[1].type).toBe(SyntaxTokenType.ERROR)
    expect(result.tokens[1].value).toBe('invalid')
    expect(result.errors).toHaveLength(1)
  })

  it('should detect trailing content as error', () => {
    const result = tokenizeSelfies('[C]trailing')
    expect(result.tokens).toHaveLength(2)
    expect(result.tokens[1].type).toBe(SyntaxTokenType.ERROR)
    expect(result.errors).toHaveLength(1)
  })

  it('should return empty tokens for empty input', () => {
    const result = tokenizeSelfies('')
    expect(result.tokens).toHaveLength(0)
    expect(result.errors).toHaveLength(0)
  })

  it('should skip alphabet validation when disabled', () => {
    const result = tokenizeSelfies('[XYZ]', { validateAgainstAlphabet: false })
    expect(result.tokens[0].type).toBe(SyntaxTokenType.ATOM)
    expect(result.tokens[0].modifiers).not.toContain(TokenModifier.INVALID)
  })
})

// Tests for DSL tokenization: definitions, references, comments,
// import statements, and metadata collection.
describe('tokenizeDSL', () => {
  it('should tokenize definition', () => {
    const result = tokenizeDSL('[methyl] = [C]')
    expect(result.tokens).toHaveLength(3)
    expect(result.tokens[0].type).toBe(SyntaxTokenType.IDENTIFIER)
    expect(result.tokens[0].modifiers).toContain(TokenModifier.DEFINITION)
    expect(result.tokens[1].type).toBe(SyntaxTokenType.OPERATOR)
    expect(result.tokens[2].type).toBe(SyntaxTokenType.ATOM)
  })

  it('should tokenize reference', () => {
    const result = tokenizeDSL('[methyl] = [C]\n[ethyl] = [methyl][C]')
    const ethylDef = result.tokens.find(t => t.value === '[ethyl]')
    const methylRef = result.tokens.filter(t => t.value === '[methyl]')[1]
    expect(ethylDef.type).toBe(SyntaxTokenType.IDENTIFIER)
    expect(methylRef.type).toBe(SyntaxTokenType.REFERENCE)
    expect(methylRef.modifiers).toContain(TokenModifier.REFERENCE)
  })

  it('should tokenize comments', () => {
    const result = tokenizeDSL('# this is a comment\n[methyl] = [C]')
    expect(result.tokens[0].type).toBe(SyntaxTokenType.COMMENT)
    expect(result.tokens[0].value).toBe('# this is a comment')
  })

  it('should tokenize import keyword', () => {
    const result = tokenizeDSL('import "./base.selfies"')
    expect(result.tokens[0].type).toBe(SyntaxTokenType.KEYWORD)
    expect(result.tokens[0].value).toBe('import')
    expect(result.tokens[1].type).toBe(SyntaxTokenType.STRING)
    expect(result.tokens[1].value).toBe('"./base.selfies"')
  })

  it('should tokenize import * from syntax', () => {
    const result = tokenizeDSL('import * from "./base.selfies"')
    expect(result.tokens[0].type).toBe(SyntaxTokenType.KEYWORD) // import
    expect(result.tokens[1].type).toBe(SyntaxTokenType.PUNCTUATION) // *
    expect(result.tokens[2].type).toBe(SyntaxTokenType.KEYWORD) // from
    expect(result.tokens[3].type).toBe(SyntaxTokenType.STRING) // path
  })

  it('should tokenize selective import syntax', () => {
    const result = tokenizeDSL('import [methyl, ethyl] from "./base.selfies"')
    expect(result.tokens[0].type).toBe(SyntaxTokenType.KEYWORD) // import
    // The bracketed names get tokenized as SELFIES_TOKEN
    // which is then classified
  })

  it('should track defined names in metadata', () => {
    const result = tokenizeDSL('[a] = [C]\n[b] = [N]')
    expect(result.metadata.definedNames.has('[a]')).toBe(true)
    expect(result.metadata.definedNames.has('[b]')).toBe(true)
  })

  it('should handle lexer errors gracefully', () => {
    const result = tokenizeDSL('[unclosed')
    expect(result.errors.length).toBeGreaterThan(0)
  })

  it('should classify SELFIES tokens in definition body', () => {
    const result = tokenizeDSL('[ketone] = [C][=O]')
    const oxygenToken = result.tokens.find(t => t.value === '[=O]')
    expect(oxygenToken.type).toBe(SyntaxTokenType.BOND)
  })

  it('should mark newlines', () => {
    const result = tokenizeDSL('[a] = [C]\n[b] = [N]')
    const newlines = result.tokens.filter(t => t.type === SyntaxTokenType.NEWLINE)
    expect(newlines.length).toBeGreaterThan(0)
  })
})

describe('getColorScheme', () => {
  it('should return dark theme colors by default', () => {
    const colors = getColorScheme()
    expect(colors[SyntaxTokenType.ATOM]).toBe('#61afef')
    expect(colors[SyntaxTokenType.BOND]).toBe('#c678dd')
    expect(colors[SyntaxTokenType.COMMENT]).toBe('#5c6370')
  })

  it('should return light theme colors', () => {
    const colors = getColorScheme('light')
    expect(colors[SyntaxTokenType.ATOM]).toBe('#0184bc')
    expect(colors[SyntaxTokenType.STRING]).toBe('#50a14f')
  })

  it('should have all token types', () => {
    const colors = getColorScheme()
    for (const type of Object.values(SyntaxTokenType)) {
      expect(colors[type]).toBeDefined()
    }
  })
})

describe('getTextMateScopes', () => {
  it('should return TextMate scopes for all types', () => {
    const scopes = getTextMateScopes()
    expect(scopes[SyntaxTokenType.ATOM]).toBe('entity.name.tag.atom.selfies')
    expect(scopes[SyntaxTokenType.COMMENT]).toBe('comment.line.number-sign.selfies')
    expect(scopes[SyntaxTokenType.KEYWORD]).toBe('keyword.control.import.selfies')
  })

  it('should have all token types', () => {
    const scopes = getTextMateScopes()
    for (const type of Object.values(SyntaxTokenType)) {
      expect(scopes[type]).toBeDefined()
    }
  })
})

describe('getMonacoTokenTypes', () => {
  it('should return Monaco token types', () => {
    const types = getMonacoTokenTypes()
    expect(types[SyntaxTokenType.ATOM]).toBe('type.identifier')
    expect(types[SyntaxTokenType.COMMENT]).toBe('comment')
    expect(types[SyntaxTokenType.STRING]).toBe('string')
  })
})

describe('createMonacoLanguage', () => {
  it('should create valid Monaco language definition', () => {
    const lang = createMonacoLanguage()
    expect(lang.id).toBe('selfies')
    expect(lang.extensions).toContain('.selfies')
    expect(lang.tokenizer).toBeDefined()
    expect(lang.tokenizer.root).toBeDefined()
    expect(Array.isArray(lang.tokenizer.root)).toBe(true)
  })
})

describe('validateTokenization', () => {
  it('should validate complete tokenization', () => {
    const result = tokenizeSelfies('[C][N][O]')
    const validation = validateTokenization(result.tokens, '[C][N][O]')
    expect(validation.valid).toBe(true)
    expect(validation.gaps).toHaveLength(0)
  })

  it('should detect gaps in tokenization', () => {
    const tokens = [
      { type: 'atom', value: '[C]', start: 0, end: 3 },
      { type: 'atom', value: '[N]', start: 6, end: 9 }
    ]
    const validation = validateTokenization(tokens, '[C]...[N]')
    expect(validation.valid).toBe(false)
    expect(validation.gaps).toHaveLength(1)
    expect(validation.gaps[0]).toEqual({ start: 3, end: 6 })
  })

  it('should detect trailing content', () => {
    const tokens = [
      { type: 'atom', value: '[C]', start: 0, end: 3 }
    ]
    const validation = validateTokenization(tokens, '[C]extra')
    expect(validation.valid).toBe(false)
    expect(validation.gaps).toHaveLength(1)
    expect(validation.gaps[0]).toEqual({ start: 3, end: 8 })
  })
})

describe('highlightToHtml', () => {
  it('should generate HTML with spans', () => {
    const html = highlightToHtml('[C][N]', { language: 'selfies' })
    expect(html).toContain('<span')
    expect(html).toContain('selfies-atom')
    expect(html).toContain('[C]')
    expect(html).toContain('[N]')
  })

  it('should use provided theme colors', () => {
    const htmlDark = highlightToHtml('[C]', { language: 'selfies', theme: 'dark' })
    const htmlLight = highlightToHtml('[C]', { language: 'selfies', theme: 'light' })
    expect(htmlDark).toContain('#61afef')
    expect(htmlLight).toContain('#0184bc')
  })

  it('should use custom class prefix', () => {
    const html = highlightToHtml('[C]', {
      language: 'selfies',
      classPrefix: 'my-'
    })
    expect(html).toContain('my-atom')
  })

  it('should escape HTML characters', () => {
    // Note: This tests the escapeHtml helper via highlightToHtml
    // DSL with a comment that has special chars
    const html = highlightToHtml('# <script>alert("xss")</script>', { language: 'dsl' })
    // FIX: the positive assertion must check for the *escaped* form.
    // Asserting toContain('<script>') and not.toContain('<script>')
    // simultaneously is self-contradictory and can never pass.
    expect(html).toContain('&lt;script&gt;')
    expect(html).not.toContain('<script>')
  })

  it('should handle DSL language', () => {
    const html = highlightToHtml('[methyl] = [C]', { language: 'dsl' })
    expect(html).toContain('selfies-identifier')
    expect(html).toContain('selfies-operator')
    expect(html).toContain('selfies-atom')
  })

  it('should include modifier classes', () => {
    const html = highlightToHtml('[C]', { language: 'selfies' })
    expect(html).toContain('selfies-organic')
  })
})

describe('integration', () => {
  it('should handle complex SELFIES string', () => {
    // Ethanol: C-C-O
    const selfies = '[C][C][O]'
    const result = tokenizeSelfies(selfies)
    expect(result.errors).toHaveLength(0)
    expect(result.tokens).toHaveLength(3)
    result.tokens.forEach(t => {
      expect(t.type).toBe(SyntaxTokenType.ATOM)
    })
  })

  it('should handle complex DSL program', () => {
    const dsl = `
# Common functional groups
[hydroxyl] = [O]
[carboxyl] = [C][=O][hydroxyl]
[amino] = [N]

# Glycine (simplest amino acid)
[glycine] = [amino][C][carboxyl]
`
    const result = tokenizeDSL(dsl)
    expect(result.errors).toHaveLength(0)
    expect(result.metadata.definedNames.has('[hydroxyl]')).toBe(true)
    expect(result.metadata.definedNames.has('[carboxyl]')).toBe(true)
    expect(result.metadata.definedNames.has('[glycine]')).toBe(true)

    // Check that carboxyl definition has a reference to hydroxyl
    const hydroxylRef = result.tokens.find(
      t => t.value === '[hydroxyl]' && t.type === SyntaxTokenType.REFERENCE
    )
    expect(hydroxylRef).toBeDefined()
  })

  it('should handle import statements in DSL', () => {
    const dsl = `import * from "./fragments.selfies"
import "./base.selfies"
[molecule] = [C][N]`

    const result = tokenizeDSL(dsl)
    const keywords = result.tokens.filter(t => t.type === SyntaxTokenType.KEYWORD)
    expect(keywords.length).toBe(3) // import, from, import

    const strings = result.tokens.filter(t => t.type === SyntaxTokenType.STRING)
    expect(strings.length).toBe(2)
  })
})
|
package/src/tokenizer.js
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tokenizer - Splits SELFIES strings into individual tokens
|
|
3
|
+
*
|
|
4
|
+
* SELFIES tokens are bracketed expressions like [C], [=C], [Branch1], etc.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Splits a SELFIES string into its bracketed symbols.
 *
 * @param {string} selfies - The SELFIES string to tokenize
 * @returns {string[]} Array of tokens
 * @throws {Error} On unclosed brackets, nested/stray brackets,
 *   empty tokens ('[]'), or characters appearing outside brackets.
 *
 * Example:
 *   tokenize('[C][=C][Branch1][C][O]')
 *   // => ['[C]', '[=C]', '[Branch1]', '[C]', '[O]']
 */
export function tokenize(selfies) {
  if (selfies === '') {
    return []
  }

  const symbols = []
  let buffer = ''
  let open = false // true while scanning between '[' and ']'

  for (let pos = 0; pos < selfies.length; pos++) {
    const ch = selfies[pos]

    if (ch === '[') {
      // A second '[' before the previous one closed is malformed.
      if (open) {
        throw new Error(`Unclosed bracket at position ${pos}`)
      }
      open = true
      buffer = '['
      continue
    }

    if (ch === ']') {
      if (!open) {
        throw new Error(`Unexpected closing bracket at position ${pos}`)
      }
      buffer += ']'

      // '[]' carries no symbol and is rejected outright.
      if (buffer === '[]') {
        throw new Error(`Empty token at position ${pos - 1}`)
      }

      symbols.push(buffer)
      buffer = ''
      open = false
      continue
    }

    // Ordinary characters are only legal inside a bracket pair.
    if (!open) {
      throw new Error(`Character '${ch}' outside of token at position ${pos}`)
    }
    buffer += ch
  }

  // A dangling '[' means the final token never closed.
  if (open) {
    throw new Error(`Unclosed bracket at end of string`)
  }

  return symbols
}
|
|
62
|
+
|
|
63
|
+
/**
 * Concatenates SELFIES tokens back into a single string.
 *
 * Each token already carries its own brackets, so joining with no
 * separator is the exact inverse of tokenize().
 *
 * @param {string[]} tokens - Array of SELFIES tokens
 * @returns {string} Joined SELFIES string
 *
 * Example:
 *   join(['[C]', '[C]', '[O]']) // => '[C][C][O]'
 */
export function join(tokens) {
  return tokens.join('')
}
|
|
74
|
+
|
|
75
|
+
/**
 * Counts the number of SELFIES symbols (not character length).
 *
 * Every symbol opens with exactly one '[', so counting opening
 * brackets gives the symbol count without a full tokenize() pass.
 * Based on selfies-py's len_selfies() function.
 *
 * @param {string} selfies - The SELFIES string
 * @returns {number} Number of symbols
 *
 * Example:
 *   lenSelfies('[C][C][O]') // => 3 (not 9!)
 *   lenSelfies('[Cl][Br]')  // => 2
 *
 * Reference: selfies-py/selfies/utils/selfies_utils.py::len_selfies()
 */
export function lenSelfies(selfies) {
  let symbols = 0
  for (const ch of selfies) {
    if (ch === '[') {
      symbols += 1
    }
  }
  return symbols
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
 * Tests for SELFIES tokenization
 *
 * Covers tokenize() (happy paths plus structural error cases) and
 * join() (including the tokenize/join round-trip property).
 */

import { describe, test, expect } from 'bun:test'
import { tokenize, join } from './tokenizer.js'

describe('tokenize', () => {
  test('tokenizes simple molecule', () => {
    expect(tokenize('[C][C][O]')).toEqual(['[C]', '[C]', '[O]'])
  })

  test('tokenizes with bond modifiers', () => {
    expect(tokenize('[C][=C][Branch1][C][O]')).toEqual(['[C]', '[=C]', '[Branch1]', '[C]', '[O]'])
  })

  test('tokenizes multi-character elements', () => {
    // Two-letter element symbols must stay in one token.
    expect(tokenize('[Cl][Br]')).toEqual(['[Cl]', '[Br]'])
  })

  test('tokenizes empty string', () => {
    // Empty input is valid for tokenize() and yields no tokens.
    expect(tokenize('')).toEqual([])
  })

  test('tokenizes single token', () => {
    expect(tokenize('[C]')).toEqual(['[C]'])
  })

  test('throws on unclosed bracket', () => {
    expect(() => tokenize('[C][C')).toThrow()
  })

  test('throws on empty token', () => {
    // '[]' carries no symbol and must be rejected.
    expect(() => tokenize('[]')).toThrow()
  })
})

describe('join', () => {
  test('joins tokens', () => {
    expect(join(['[C]', '[C]', '[O]'])).toBe('[C][C][O]')
  })

  test('joins empty array', () => {
    expect(join([])).toBe('')
  })

  test('joins single token', () => {
    expect(join(['[C]'])).toBe('[C]')
  })

  test('round-trips tokenize and join', () => {
    // join() must be the exact inverse of tokenize().
    const selfies = '[C][=C][C][=C][C][=C][Ring1][=Branch1]'
    expect(join(tokenize(selfies))).toBe(selfies)
  })
})
|
package/src/validator.js
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validator - Validates SELFIES syntax without full decoding
|
|
3
|
+
*
|
|
4
|
+
* This provides a fast path for checking if a SELFIES string is
|
|
5
|
+
* syntactically valid without building the full IR.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { tokenize } from './tokenizer.js'
|
|
9
|
+
import { getAlphabet } from './alphabet.js'
|
|
10
|
+
|
|
11
|
+
/**
 * Checks whether a SELFIES string is syntactically valid.
 *
 * Validation checks:
 * - all tokens are properly bracketed
 * - all tokens belong to the valid alphabet
 * - branch/ring tokens have proper length specifiers
 *
 * @param {string} selfies - The SELFIES string to validate
 * @returns {boolean} True if valid, false otherwise
 *
 * Example:
 *   isValid('[C][C][O]') // => true
 *   isValid('[C][C][O')  // => false (unclosed bracket)
 *   isValid('[Xyz]')     // => false (invalid token)
 */
export function isValid(selfies) {
  // The empty string is never a valid SELFIES.
  if (selfies === '') {
    return false
  }

  try {
    // tokenize() throws on structural faults (unclosed brackets,
    // empty tokens, characters outside brackets) — caught below.
    const symbols = tokenize(selfies)
    const alphabet = getAlphabet()
    // Valid iff every symbol is a member of the SELFIES alphabet.
    return symbols.every(symbol => alphabet.has(symbol))
  } catch (error) {
    // Structural error: report invalid rather than propagating.
    return false
  }
}
|
|
50
|
+
|
|
51
|
+
/**
 * Validates a single token against the SELFIES alphabet.
 *
 * @param {string} token - SELFIES token to validate
 * @returns {boolean} True if token is valid
 */
function isValidToken(token) {
  return getAlphabet().has(token)
}
|
|
60
|
+
|
|
61
|
+
/**
 * Validates branch/ring token sequences.
 *
 * Currently a placeholder: the parser performs the full structural
 * validation, so this always reports success.
 *
 * @param {string[]} tokens - Array of tokens
 * @param {number} index - Index of branch/ring token
 * @returns {boolean} True if sequence is valid
 */
function validateSpecialToken(tokens, index) {
  return true
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
 * Tests for SELFIES validation
 *
 * Exercises isValid() on accepted inputs (atoms, bonds, branches,
 * multi-letter elements) and on every rejection path (structural
 * errors, unknown tokens, empty input).
 */

import { describe, test, expect } from 'bun:test'
import { isValid } from './validator.js'

describe('isValid', () => {
  test('validates simple molecule', () => {
    expect(isValid('[C][C][O]')).toBe(true)
  })

  test('validates with bond modifiers', () => {
    // Benzene written with ring closure and bond-modified tokens.
    expect(isValid('[C][=C][C][=C][C][=C][Ring1][=Branch1]')).toBe(true)
  })

  test('validates with branches', () => {
    expect(isValid('[C][C][Branch1][C][C][C]')).toBe(true)
  })

  test('validates multi-character elements', () => {
    expect(isValid('[Cl][C][Br]')).toBe(true)
  })

  test('rejects unclosed bracket', () => {
    expect(isValid('[C][C][O')).toBe(false)
  })

  test('rejects empty string', () => {
    // Unlike tokenize(), isValid() treats '' as invalid.
    expect(isValid('')).toBe(false)
  })

  test('rejects invalid token', () => {
    // Well-bracketed but not in the alphabet.
    expect(isValid('[Xyz]')).toBe(false)
  })

  test('rejects empty token', () => {
    expect(isValid('[]')).toBe(false)
  })

  test('rejects missing brackets', () => {
    // Raw SMILES-style text is not SELFIES.
    expect(isValid('CCO')).toBe(false)
  })
})
|