selfies-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/LICENSE +21 -0
  2. package/README.md +274 -0
  3. package/package.json +65 -0
  4. package/src/alphabet.js +150 -0
  5. package/src/alphabet.test.js +82 -0
  6. package/src/chemistryValidator.js +236 -0
  7. package/src/cli.js +206 -0
  8. package/src/constraints.js +186 -0
  9. package/src/constraints.test.js +126 -0
  10. package/src/decoder.js +636 -0
  11. package/src/decoder.test.js +560 -0
  12. package/src/dsl/analyzer.js +170 -0
  13. package/src/dsl/analyzer.test.js +139 -0
  14. package/src/dsl/dsl.test.js +146 -0
  15. package/src/dsl/importer.js +238 -0
  16. package/src/dsl/index.js +32 -0
  17. package/src/dsl/lexer.js +264 -0
  18. package/src/dsl/lexer.test.js +115 -0
  19. package/src/dsl/parser.js +201 -0
  20. package/src/dsl/parser.test.js +148 -0
  21. package/src/dsl/resolver.js +136 -0
  22. package/src/dsl/resolver.test.js +99 -0
  23. package/src/dsl/symbolTable.js +56 -0
  24. package/src/dsl/symbolTable.test.js +68 -0
  25. package/src/dsl/valenceValidator.js +147 -0
  26. package/src/encoder.js +467 -0
  27. package/src/encoder.test.js +61 -0
  28. package/src/errors.js +79 -0
  29. package/src/errors.test.js +91 -0
  30. package/src/grammar_rules.js +146 -0
  31. package/src/index.js +70 -0
  32. package/src/parser.js +96 -0
  33. package/src/parser.test.js +96 -0
  34. package/src/properties/atoms.js +69 -0
  35. package/src/properties/atoms.test.js +116 -0
  36. package/src/properties/formula.js +111 -0
  37. package/src/properties/formula.test.js +95 -0
  38. package/src/properties/molecularWeight.js +80 -0
  39. package/src/properties/molecularWeight.test.js +84 -0
  40. package/src/properties/properties.test.js +77 -0
  41. package/src/renderers/README.md +127 -0
  42. package/src/renderers/svg.js +113 -0
  43. package/src/renderers/svg.test.js +42 -0
  44. package/src/syntax.js +641 -0
  45. package/src/syntax.test.js +363 -0
  46. package/src/tokenizer.js +99 -0
  47. package/src/tokenizer.test.js +55 -0
  48. package/src/validator.js +70 -0
  49. package/src/validator.test.js +44 -0
@@ -0,0 +1,363 @@
1
import { describe, it, expect } from 'bun:test'
import {
  tokenizeSelfies,
  tokenizeDSL,
  SyntaxTokenType,
  TokenModifier,
  getColorScheme,
  getTextMateScopes,
  getMonacoTokenTypes,
  createMonacoLanguage,
  validateTokenization,
  highlightToHtml
} from './syntax.js'

describe('tokenizeSelfies', () => {
  it('should tokenize basic atoms', () => {
    const { tokens } = tokenizeSelfies('[C][N][O]')
    expect(tokens).toHaveLength(3)
    expect(tokens[0].type).toBe(SyntaxTokenType.ATOM)
    expect(tokens[0].value).toBe('[C]')
    expect(tokens[0].start).toBe(0)
    expect(tokens[0].end).toBe(3)
    expect(tokens[1].type).toBe(SyntaxTokenType.ATOM)
    expect(tokens[2].type).toBe(SyntaxTokenType.ATOM)
  })

  it('should tokenize bond-modified atoms', () => {
    const { tokens } = tokenizeSelfies('[C][=C][#N]')
    expect(tokens).toHaveLength(3)
    expect(tokens[0].type).toBe(SyntaxTokenType.ATOM)
    expect(tokens[1].type).toBe(SyntaxTokenType.BOND)
    expect(tokens[1].value).toBe('[=C]')
    expect(tokens[2].type).toBe(SyntaxTokenType.BOND)
    expect(tokens[2].value).toBe('[#N]')
  })

  it('should tokenize branch tokens', () => {
    const { tokens } = tokenizeSelfies('[C][Branch1][C][O]')
    expect(tokens).toHaveLength(4)
    expect(tokens[1].type).toBe(SyntaxTokenType.BRANCH)
    expect(tokens[1].value).toBe('[Branch1]')
  })

  it('should tokenize ring tokens', () => {
    const { tokens } = tokenizeSelfies('[C][Ring1][C]')
    expect(tokens).toHaveLength(3)
    expect(tokens[1].type).toBe(SyntaxTokenType.RING)
    expect(tokens[1].value).toBe('[Ring1]')
  })

  it('should tokenize bond-modified branches', () => {
    const { tokens } = tokenizeSelfies('[C][=Branch1][C][=O]')
    expect(tokens).toHaveLength(4)
    expect(tokens[1].type).toBe(SyntaxTokenType.BRANCH)
    expect(tokens[1].modifiers).toContain('double')
  })

  it('should add organic modifier to organic atoms', () => {
    const { tokens } = tokenizeSelfies('[C]')
    expect(tokens[0].modifiers).toContain(TokenModifier.ORGANIC)
  })

  it('should add halogen modifier to halogens', () => {
    const { tokens } = tokenizeSelfies('[F][Cl][Br][I]')
    for (const token of tokens) {
      expect(token.modifiers).toContain(TokenModifier.HALOGEN)
    }
  })

  it('should mark invalid tokens', () => {
    const { tokens } = tokenizeSelfies('[InvalidElement]')
    expect(tokens[0].type).toBe(SyntaxTokenType.INVALID_TOKEN)
    expect(tokens[0].modifiers).toContain(TokenModifier.INVALID)
  })

  it('should handle known DSL names as references', () => {
    const knownNames = new Set(['[methyl]'])
    const { tokens } = tokenizeSelfies('[methyl][C]', { knownNames })
    expect(tokens[0].type).toBe(SyntaxTokenType.REFERENCE)
    expect(tokens[1].type).toBe(SyntaxTokenType.ATOM)
  })

  it('should detect gaps between tokens as errors', () => {
    const { tokens, errors } = tokenizeSelfies('[C]invalid[N]')
    expect(tokens).toHaveLength(3)
    expect(tokens[1].type).toBe(SyntaxTokenType.ERROR)
    expect(tokens[1].value).toBe('invalid')
    expect(errors).toHaveLength(1)
  })

  it('should detect trailing content as error', () => {
    const { tokens, errors } = tokenizeSelfies('[C]trailing')
    expect(tokens).toHaveLength(2)
    expect(tokens[1].type).toBe(SyntaxTokenType.ERROR)
    expect(errors).toHaveLength(1)
  })

  it('should return empty tokens for empty input', () => {
    const { tokens, errors } = tokenizeSelfies('')
    expect(tokens).toHaveLength(0)
    expect(errors).toHaveLength(0)
  })

  it('should skip alphabet validation when disabled', () => {
    const { tokens } = tokenizeSelfies('[XYZ]', { validateAgainstAlphabet: false })
    expect(tokens[0].type).toBe(SyntaxTokenType.ATOM)
    expect(tokens[0].modifiers).not.toContain(TokenModifier.INVALID)
  })
})

describe('tokenizeDSL', () => {
  it('should tokenize definition', () => {
    const { tokens } = tokenizeDSL('[methyl] = [C]')
    expect(tokens).toHaveLength(3)
    expect(tokens[0].type).toBe(SyntaxTokenType.IDENTIFIER)
    expect(tokens[0].modifiers).toContain(TokenModifier.DEFINITION)
    expect(tokens[1].type).toBe(SyntaxTokenType.OPERATOR)
    expect(tokens[2].type).toBe(SyntaxTokenType.ATOM)
  })

  it('should tokenize reference', () => {
    const { tokens } = tokenizeDSL('[methyl] = [C]\n[ethyl] = [methyl][C]')
    const ethylDef = tokens.find((t) => t.value === '[ethyl]')
    const methylRef = tokens.filter((t) => t.value === '[methyl]')[1]
    expect(ethylDef.type).toBe(SyntaxTokenType.IDENTIFIER)
    expect(methylRef.type).toBe(SyntaxTokenType.REFERENCE)
    expect(methylRef.modifiers).toContain(TokenModifier.REFERENCE)
  })

  it('should tokenize comments', () => {
    const { tokens } = tokenizeDSL('# this is a comment\n[methyl] = [C]')
    expect(tokens[0].type).toBe(SyntaxTokenType.COMMENT)
    expect(tokens[0].value).toBe('# this is a comment')
  })

  it('should tokenize import keyword', () => {
    const { tokens } = tokenizeDSL('import "./base.selfies"')
    expect(tokens[0].type).toBe(SyntaxTokenType.KEYWORD)
    expect(tokens[0].value).toBe('import')
    expect(tokens[1].type).toBe(SyntaxTokenType.STRING)
    expect(tokens[1].value).toBe('"./base.selfies"')
  })

  it('should tokenize import * from syntax', () => {
    const { tokens } = tokenizeDSL('import * from "./base.selfies"')
    expect(tokens[0].type).toBe(SyntaxTokenType.KEYWORD) // import
    expect(tokens[1].type).toBe(SyntaxTokenType.PUNCTUATION) // *
    expect(tokens[2].type).toBe(SyntaxTokenType.KEYWORD) // from
    expect(tokens[3].type).toBe(SyntaxTokenType.STRING) // path
  })

  it('should tokenize selective import syntax', () => {
    const { tokens } = tokenizeDSL('import [methyl, ethyl] from "./base.selfies"')
    expect(tokens[0].type).toBe(SyntaxTokenType.KEYWORD) // import
    // The bracketed names get tokenized as SELFIES_TOKEN
    // which is then classified
  })

  it('should track defined names in metadata', () => {
    const { metadata } = tokenizeDSL('[a] = [C]\n[b] = [N]')
    expect(metadata.definedNames.has('[a]')).toBe(true)
    expect(metadata.definedNames.has('[b]')).toBe(true)
  })

  it('should handle lexer errors gracefully', () => {
    const { errors } = tokenizeDSL('[unclosed')
    expect(errors.length).toBeGreaterThan(0)
  })

  it('should classify SELFIES tokens in definition body', () => {
    const { tokens } = tokenizeDSL('[ketone] = [C][=O]')
    const oxygenToken = tokens.find((t) => t.value === '[=O]')
    expect(oxygenToken.type).toBe(SyntaxTokenType.BOND)
  })

  it('should mark newlines', () => {
    const { tokens } = tokenizeDSL('[a] = [C]\n[b] = [N]')
    const newlines = tokens.filter((t) => t.type === SyntaxTokenType.NEWLINE)
    expect(newlines.length).toBeGreaterThan(0)
  })
})

describe('getColorScheme', () => {
  it('should return dark theme colors by default', () => {
    const colors = getColorScheme()
    expect(colors[SyntaxTokenType.ATOM]).toBe('#61afef')
    expect(colors[SyntaxTokenType.BOND]).toBe('#c678dd')
    expect(colors[SyntaxTokenType.COMMENT]).toBe('#5c6370')
  })

  it('should return light theme colors', () => {
    const colors = getColorScheme('light')
    expect(colors[SyntaxTokenType.ATOM]).toBe('#0184bc')
    expect(colors[SyntaxTokenType.STRING]).toBe('#50a14f')
  })

  it('should have all token types', () => {
    const colors = getColorScheme()
    for (const type of Object.values(SyntaxTokenType)) {
      expect(colors[type]).toBeDefined()
    }
  })
})

describe('getTextMateScopes', () => {
  it('should return TextMate scopes for all types', () => {
    const scopes = getTextMateScopes()
    expect(scopes[SyntaxTokenType.ATOM]).toBe('entity.name.tag.atom.selfies')
    expect(scopes[SyntaxTokenType.COMMENT]).toBe('comment.line.number-sign.selfies')
    expect(scopes[SyntaxTokenType.KEYWORD]).toBe('keyword.control.import.selfies')
  })

  it('should have all token types', () => {
    const scopes = getTextMateScopes()
    for (const type of Object.values(SyntaxTokenType)) {
      expect(scopes[type]).toBeDefined()
    }
  })
})

describe('getMonacoTokenTypes', () => {
  it('should return Monaco token types', () => {
    const types = getMonacoTokenTypes()
    expect(types[SyntaxTokenType.ATOM]).toBe('type.identifier')
    expect(types[SyntaxTokenType.COMMENT]).toBe('comment')
    expect(types[SyntaxTokenType.STRING]).toBe('string')
  })
})

describe('createMonacoLanguage', () => {
  it('should create valid Monaco language definition', () => {
    const lang = createMonacoLanguage()
    expect(lang.id).toBe('selfies')
    expect(lang.extensions).toContain('.selfies')
    expect(lang.tokenizer).toBeDefined()
    expect(lang.tokenizer.root).toBeDefined()
    expect(Array.isArray(lang.tokenizer.root)).toBe(true)
  })
})

describe('validateTokenization', () => {
  it('should validate complete tokenization', () => {
    const { tokens } = tokenizeSelfies('[C][N][O]')
    const validation = validateTokenization(tokens, '[C][N][O]')
    expect(validation.valid).toBe(true)
    expect(validation.gaps).toHaveLength(0)
  })

  it('should detect gaps in tokenization', () => {
    const tokens = [
      { type: 'atom', value: '[C]', start: 0, end: 3 },
      { type: 'atom', value: '[N]', start: 6, end: 9 }
    ]
    const validation = validateTokenization(tokens, '[C]...[N]')
    expect(validation.valid).toBe(false)
    expect(validation.gaps).toHaveLength(1)
    expect(validation.gaps[0]).toEqual({ start: 3, end: 6 })
  })

  it('should detect trailing content', () => {
    const tokens = [{ type: 'atom', value: '[C]', start: 0, end: 3 }]
    const validation = validateTokenization(tokens, '[C]extra')
    expect(validation.valid).toBe(false)
    expect(validation.gaps).toHaveLength(1)
    expect(validation.gaps[0]).toEqual({ start: 3, end: 8 })
  })
})

describe('highlightToHtml', () => {
  it('should generate HTML with spans', () => {
    const html = highlightToHtml('[C][N]', { language: 'selfies' })
    expect(html).toContain('<span')
    expect(html).toContain('selfies-atom')
    expect(html).toContain('[C]')
    expect(html).toContain('[N]')
  })

  it('should use provided theme colors', () => {
    const htmlDark = highlightToHtml('[C]', { language: 'selfies', theme: 'dark' })
    const htmlLight = highlightToHtml('[C]', { language: 'selfies', theme: 'light' })
    expect(htmlDark).toContain('#61afef')
    expect(htmlLight).toContain('#0184bc')
  })

  it('should use custom class prefix', () => {
    const html = highlightToHtml('[C]', { language: 'selfies', classPrefix: 'my-' })
    expect(html).toContain('my-atom')
  })

  it('should escape HTML characters', () => {
    // Exercises the escapeHtml helper via highlightToHtml:
    // a DSL comment containing markup must be entity-escaped.
    const html = highlightToHtml('# <script>alert("xss")</script>', { language: 'dsl' })
    expect(html).toContain('&lt;script&gt;')
    expect(html).not.toContain('<script>')
  })

  it('should handle DSL language', () => {
    const html = highlightToHtml('[methyl] = [C]', { language: 'dsl' })
    expect(html).toContain('selfies-identifier')
    expect(html).toContain('selfies-operator')
    expect(html).toContain('selfies-atom')
  })

  it('should include modifier classes', () => {
    const html = highlightToHtml('[C]', { language: 'selfies' })
    expect(html).toContain('selfies-organic')
  })
})

describe('integration', () => {
  it('should handle complex SELFIES string', () => {
    // Ethanol: C-C-O
    const { tokens, errors } = tokenizeSelfies('[C][C][O]')
    expect(errors).toHaveLength(0)
    expect(tokens).toHaveLength(3)
    for (const token of tokens) {
      expect(token.type).toBe(SyntaxTokenType.ATOM)
    }
  })

  it('should handle complex DSL program', () => {
    const dsl = `
# Common functional groups
[hydroxyl] = [O]
[carboxyl] = [C][=O][hydroxyl]
[amino] = [N]

# Glycine (simplest amino acid)
[glycine] = [amino][C][carboxyl]
`
    const { tokens, errors, metadata } = tokenizeDSL(dsl)
    expect(errors).toHaveLength(0)
    expect(metadata.definedNames.has('[hydroxyl]')).toBe(true)
    expect(metadata.definedNames.has('[carboxyl]')).toBe(true)
    expect(metadata.definedNames.has('[glycine]')).toBe(true)

    // Check that carboxyl definition has a reference to hydroxyl
    const hydroxylRef = tokens.find(
      (t) => t.value === '[hydroxyl]' && t.type === SyntaxTokenType.REFERENCE
    )
    expect(hydroxylRef).toBeDefined()
  })

  it('should handle import statements in DSL', () => {
    const dsl = `import * from "./fragments.selfies"
import "./base.selfies"
[molecule] = [C][N]`

    const { tokens } = tokenizeDSL(dsl)
    const keywords = tokens.filter((t) => t.type === SyntaxTokenType.KEYWORD)
    expect(keywords.length).toBe(3) // import, from, import

    const strings = tokens.filter((t) => t.type === SyntaxTokenType.STRING)
    expect(strings.length).toBe(2)
  })
})
@@ -0,0 +1,99 @@
1
+ /**
2
+ * Tokenizer - Splits SELFIES strings into individual tokens
3
+ *
4
+ * SELFIES tokens are bracketed expressions like [C], [=C], [Branch1], etc.
5
+ */
6
+
7
/**
 * Splits a SELFIES string into individual bracketed tokens.
 *
 * @param {string} selfies - The SELFIES string to tokenize
 * @returns {string[]} Array of tokens
 * @throws {Error} On unclosed, unexpected, or empty brackets, or on
 *   characters that appear outside of any token.
 *
 * Example:
 *   tokenize('[C][=C][Branch1][C][O]')
 *   // => ['[C]', '[=C]', '[Branch1]', '[C]', '[O]']
 */
export function tokenize(selfies) {
  const tokens = []
  let pos = 0

  while (pos < selfies.length) {
    const opener = selfies[pos]

    if (opener === ']') {
      throw new Error(`Unexpected closing bracket at position ${pos}`)
    }
    if (opener !== '[') {
      throw new Error(`Character '${opener}' outside of token at position ${pos}`)
    }

    // Scan forward to the ']' that closes this token.
    let end = pos + 1
    while (end < selfies.length && selfies[end] !== ']') {
      if (selfies[end] === '[') {
        // A second '[' means the current token was never closed.
        throw new Error(`Unclosed bracket at position ${end}`)
      }
      end++
    }

    if (end >= selfies.length) {
      throw new Error(`Unclosed bracket at end of string`)
    }
    if (end === pos + 1) {
      // '[]' carries no symbol content.
      throw new Error(`Empty token at position ${end - 1}`)
    }

    tokens.push(selfies.slice(pos, end + 1))
    pos = end + 1
  }

  return tokens
}
62
+
63
/**
 * Concatenates an array of SELFIES tokens back into a single string.
 *
 * @param {string[]} tokens - Array of SELFIES tokens
 * @returns {string} Joined SELFIES string
 *
 * Example:
 *   join(['[C]', '[C]', '[O]']) // => '[C][C][O]'
 */
export function join(tokens) {
  let result = ''
  for (const token of tokens) {
    result += token
  }
  return result
}
74
+
75
/**
 * Counts the number of SELFIES symbols (not character length).
 *
 * Every symbol begins with exactly one '[', so counting openers equals
 * tokenize(selfies).length without building the token array.
 * Based on selfies-py's len_selfies() function.
 *
 * Example:
 *   lenSelfies('[C][C][O]') // => 3 (not 9!)
 *   lenSelfies('[Cl][Br]')  // => 2
 *
 * Reference: selfies-py/selfies/utils/selfies_utils.py::len_selfies()
 *
 * @param {string} selfies - The SELFIES string
 * @returns {number} Number of symbols
 */
export function lenSelfies(selfies) {
  let total = 0
  // '[' is a BMP character, so per-code-point iteration counts it
  // exactly the same as per-code-unit indexing would.
  for (const char of selfies) {
    if (char === '[') {
      total++
    }
  }
  return total
}
@@ -0,0 +1,55 @@
1
/**
 * Tests for SELFIES tokenization (tokenize / join).
 */

import { describe, test, expect } from 'bun:test'
import { tokenize, join } from './tokenizer.js'

describe('tokenize', () => {
  test('tokenizes simple molecule', () => {
    expect(tokenize('[C][C][O]')).toEqual(['[C]', '[C]', '[O]'])
  })

  test('tokenizes with bond modifiers', () => {
    const expected = ['[C]', '[=C]', '[Branch1]', '[C]', '[O]']
    expect(tokenize('[C][=C][Branch1][C][O]')).toEqual(expected)
  })

  test('tokenizes multi-character elements', () => {
    expect(tokenize('[Cl][Br]')).toEqual(['[Cl]', '[Br]'])
  })

  test('tokenizes empty string', () => {
    expect(tokenize('')).toEqual([])
  })

  test('tokenizes single token', () => {
    expect(tokenize('[C]')).toEqual(['[C]'])
  })

  test('throws on unclosed bracket', () => {
    expect(() => tokenize('[C][C')).toThrow()
  })

  test('throws on empty token', () => {
    expect(() => tokenize('[]')).toThrow()
  })
})

describe('join', () => {
  test('joins tokens', () => {
    expect(join(['[C]', '[C]', '[O]'])).toBe('[C][C][O]')
  })

  test('joins empty array', () => {
    expect(join([])).toBe('')
  })

  test('joins single token', () => {
    expect(join(['[C]'])).toBe('[C]')
  })

  test('round-trips tokenize and join', () => {
    const selfies = '[C][=C][C][=C][C][=C][Ring1][=Branch1]'
    expect(join(tokenize(selfies))).toBe(selfies)
  })
})
@@ -0,0 +1,70 @@
1
+ /**
2
+ * Validator - Validates SELFIES syntax without full decoding
3
+ *
4
+ * This provides a fast path for checking if a SELFIES string is
5
+ * syntactically valid without building the full IR.
6
+ */
7
+
8
+ import { tokenize } from './tokenizer.js'
9
+ import { getAlphabet } from './alphabet.js'
10
+
11
/**
 * Checks whether a SELFIES string is syntactically valid.
 *
 * Validation checks:
 * - All tokens are properly bracketed
 * - All tokens are in the valid alphabet
 *
 * Example:
 *   isValid('[C][C][O]') // => true
 *   isValid('[C][C][O')  // => false (unclosed bracket)
 *   isValid('[Xyz]')     // => false (invalid token)
 *
 * @param {string} selfies - The SELFIES string to validate
 * @returns {boolean} True if valid, false otherwise
 */
export function isValid(selfies) {
  // An empty string is not a valid SELFIES.
  if (selfies === '') {
    return false
  }

  try {
    // tokenize() throws on structural problems (unclosed/empty brackets,
    // characters outside tokens), which we translate into `false`.
    const tokens = tokenize(selfies)
    const alphabet = getAlphabet()
    return tokens.every((token) => alphabet.has(token))
  } catch {
    return false
  }
}
50
+
51
/**
 * Checks a single token against the SELFIES alphabet.
 *
 * @param {string} token - SELFIES token to validate
 * @returns {boolean} True if the token is in the alphabet
 */
function isValidToken(token) {
  return getAlphabet().has(token)
}
60
+
61
/**
 * Validates branch/ring token sequences.
 *
 * Placeholder: every sequence is currently accepted; the full
 * structural validation happens in the parser.
 *
 * @param {string[]} tokens - Array of tokens
 * @param {number} index - Index of branch/ring token
 * @returns {boolean} True if sequence is valid
 */
function validateSpecialToken(tokens, index) {
  return true
}
@@ -0,0 +1,44 @@
1
/**
 * Tests for SELFIES validation (isValid).
 */

import { describe, test, expect } from 'bun:test'
import { isValid } from './validator.js'

describe('isValid', () => {
  test('validates simple molecule', () => {
    expect(isValid('[C][C][O]')).toBe(true)
  })

  test('validates with bond modifiers', () => {
    expect(isValid('[C][=C][C][=C][C][=C][Ring1][=Branch1]')).toBe(true)
  })

  test('validates with branches', () => {
    expect(isValid('[C][C][Branch1][C][C][C]')).toBe(true)
  })

  test('validates multi-character elements', () => {
    expect(isValid('[Cl][C][Br]')).toBe(true)
  })

  test('rejects unclosed bracket', () => {
    expect(isValid('[C][C][O')).toBe(false)
  })

  test('rejects empty string', () => {
    expect(isValid('')).toBe(false)
  })

  test('rejects invalid token', () => {
    expect(isValid('[Xyz]')).toBe(false)
  })

  test('rejects empty token', () => {
    expect(isValid('[]')).toBe(false)
  })

  test('rejects missing brackets', () => {
    expect(isValid('CCO')).toBe(false)
  })
})