selfies-js 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +274 -0
- package/package.json +65 -0
- package/src/alphabet.js +150 -0
- package/src/alphabet.test.js +82 -0
- package/src/chemistryValidator.js +236 -0
- package/src/cli.js +206 -0
- package/src/constraints.js +186 -0
- package/src/constraints.test.js +126 -0
- package/src/decoder.js +636 -0
- package/src/decoder.test.js +560 -0
- package/src/dsl/analyzer.js +170 -0
- package/src/dsl/analyzer.test.js +139 -0
- package/src/dsl/dsl.test.js +146 -0
- package/src/dsl/importer.js +238 -0
- package/src/dsl/index.js +32 -0
- package/src/dsl/lexer.js +264 -0
- package/src/dsl/lexer.test.js +115 -0
- package/src/dsl/parser.js +201 -0
- package/src/dsl/parser.test.js +148 -0
- package/src/dsl/resolver.js +136 -0
- package/src/dsl/resolver.test.js +99 -0
- package/src/dsl/symbolTable.js +56 -0
- package/src/dsl/symbolTable.test.js +68 -0
- package/src/dsl/valenceValidator.js +147 -0
- package/src/encoder.js +467 -0
- package/src/encoder.test.js +61 -0
- package/src/errors.js +79 -0
- package/src/errors.test.js +91 -0
- package/src/grammar_rules.js +146 -0
- package/src/index.js +70 -0
- package/src/parser.js +96 -0
- package/src/parser.test.js +96 -0
- package/src/properties/atoms.js +69 -0
- package/src/properties/atoms.test.js +116 -0
- package/src/properties/formula.js +111 -0
- package/src/properties/formula.test.js +95 -0
- package/src/properties/molecularWeight.js +80 -0
- package/src/properties/molecularWeight.test.js +84 -0
- package/src/properties/properties.test.js +77 -0
- package/src/renderers/README.md +127 -0
- package/src/renderers/svg.js +113 -0
- package/src/renderers/svg.test.js +42 -0
- package/src/syntax.js +641 -0
- package/src/syntax.test.js +363 -0
- package/src/tokenizer.js +99 -0
- package/src/tokenizer.test.js +55 -0
- package/src/validator.js +70 -0
- package/src/validator.test.js +44 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Grammar rules for SELFIES derivation
|
|
3
|
+
* Based on selfies-py/selfies/grammar_rules.py
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
// Symbols that act as "digits" when a Q-value is spelled out with SELFIES
// symbols. A symbol's position in this list is its digit value, and the
// list length is the numeric base used by the index helpers below.
export const INDEX_ALPHABET = [
  '[C]',
  '[Ring1]',
  '[Ring2]',
  '[Branch1]',
  '[=Branch1]',
  '[#Branch1]',
  '[Branch2]',
  '[=Branch2]',
  '[#Branch2]',
  '[O]',
  '[N]',
  '[=N]',
  '[=C]',
  '[#C]',
  '[S]',
  '[P]'
]

// Reverse lookup table: symbol -> digit value (its position in INDEX_ALPHABET)
export const INDEX_CODE = {}
INDEX_ALPHABET.forEach((symbol, digit) => {
  INDEX_CODE[symbol] = digit
})
|
|
19
|
+
|
|
20
|
+
/**
 * Parse a branch symbol and extract its bond order and Q-value length.
 *
 * Accepted forms are `[Branch1]`..`[Branch3]`, optionally prefixed with
 * `=` (double bond) or `#` (triple bond), e.g. `[=Branch2]`.
 *
 * @param {string} symbol - Full SELFIES symbol, including the brackets
 * @returns {{order: number, L: number}|null} `order` is the bond order (1-3)
 *   and `L` is the number of tokens to read for the Q-value; `null` when
 *   `symbol` is not a branch symbol (including non-string input)
 */
export function processBranchSymbol(symbol) {
  // Anything that is not a string cannot be a branch symbol; report it the
  // same way as any other non-matching token instead of throwing a TypeError.
  if (typeof symbol !== 'string') return null

  const match = /^\[(=|#)?Branch([1-3])\]$/.exec(symbol)
  if (match === null) return null

  const [, bondChar, lengthDigit] = match
  const order = bondChar === '=' ? 2 : bondChar === '#' ? 3 : 1

  // Explicit radix: the digit is always meant as decimal.
  return { order, L: Number.parseInt(lengthDigit, 10) }
}
|
|
34
|
+
|
|
35
|
+
/**
 * Parse a ring symbol and extract its bond order, Q-value length and stereo
 * markers.
 *
 * Two families are recognised:
 *  - plain rings `[Ring1]`..`[Ring3]`, optionally prefixed with `=` or `#`;
 *  - stereo rings with two marker characters, each one of `-`, `\` or `/`,
 *    e.g. `[-/Ring1]` or `[\/Ring2]`. These always have bond order 1.
 *
 * @param {string} symbol - Full SELFIES symbol, including the brackets
 * @returns {{order: number, L: number, stereo: (string[]|null)}|null}
 *   `order` is the bond order, `L` the number of tokens to read for the
 *   Q-value, and `stereo` the two marker characters (or `null` for plain
 *   rings); `null` when `symbol` is not a ring symbol (including non-string
 *   input)
 */
export function processRingSymbol(symbol) {
  // Non-strings are reported as "not a ring" rather than crashing on .match
  if (typeof symbol !== 'string') return null

  // Plain rings: [Ring1], [=Ring1], [#Ring1], etc.
  const plain = /^\[(=|#)?Ring([1-3])\]$/.exec(symbol)
  if (plain !== null) {
    const [, bondChar, lengthDigit] = plain
    const order = bondChar === '=' ? 2 : bondChar === '#' ? 3 : 1
    return { order, L: Number.parseInt(lengthDigit, 10), stereo: null }
  }

  // Stereo rings: [-/Ring1], [\/Ring1], etc.
  const withStereo = /^\[([-\\/])([-\\/])Ring([1-3])\]$/.exec(symbol)
  if (withStereo !== null) {
    const [, firstMark, secondMark, lengthDigit] = withStereo
    return {
      order: 1,
      L: Number.parseInt(lengthDigit, 10),
      stereo: [firstMark, secondMark]
    }
  }

  return null
}
|
|
58
|
+
|
|
59
|
+
/**
 * Work out the bond actually formed to a new atom and the state that follows.
 *
 * When `state` is 0 no bond is formed (order 0). Otherwise the bond order is
 * the smallest of the requested order, the current state and the atom's
 * bonding capacity.
 *
 * @param {number} requestedBondOrder - Bond order asked for by the symbol
 * @param {number} bondingCapacity - Bonding capacity of the new atom
 * @param {number} state - Current derivation state
 * @returns {Array} `[actualBondOrder, nextState]`, where `nextState` is the
 *   capacity left on the new atom, or `null` when none is left
 */
export function nextAtomState(requestedBondOrder, bondingCapacity, state) {
  const actualBondOrder = state === 0
    ? 0
    : Math.min(requestedBondOrder, state, bondingCapacity)

  const remaining = bondingCapacity - actualBondOrder

  return [actualBondOrder, remaining === 0 ? null : remaining]
}
|
|
77
|
+
|
|
78
|
+
/**
 * Split the current state between a new branch and the main chain.
 *
 * The branch receives at most `branchType` and always leaves at least 1 for
 * the chain that continues after the branch.
 *
 * @param {number} branchType - Bond order carried by the branch symbol
 * @param {number} state - Current derivation state; must be greater than 1
 * @returns {number[]} `[branchInitState, nextState]`
 * @throws {Error} If `state` is 1 or less
 */
export function nextBranchState(branchType, state) {
  if (state <= 1) throw new Error('Branch requires state > 1')

  const forBranch = Math.min(state - 1, branchType)

  return [forBranch, state - forBranch]
}
|
|
92
|
+
|
|
93
|
+
/**
 * Work out the bond order of a ring closure and the state that follows.
 *
 * The ring bond uses the smaller of `ringType` and the current state.
 *
 * @param {number} ringType - Bond order carried by the ring symbol
 * @param {number} state - Current derivation state; must not be 0
 * @returns {Array} `[bondOrder, nextState]`, where `nextState` is what is
 *   left of `state` after the ring bond, or `null` when nothing is left
 * @throws {Error} If `state` is 0
 */
export function nextRingState(ringType, state) {
  if (state === 0) throw new Error('Ring requires state > 0')

  const bondOrder = Math.min(ringType, state)
  const remaining = state - bondOrder

  return [bondOrder, remaining === 0 ? null : remaining]
}
|
|
108
|
+
|
|
109
|
+
/**
 * Get index value from SELFIES symbols
 *
 * The symbols are read as the digits of a number whose base is
 * `INDEX_ALPHABET.length`, most significant digit first. A symbol's digit
 * value comes from `INDEX_CODE`; symbols that are not in the table count as 0.
 *
 * @param {string[]} symbols - Symbols spelled exactly like the INDEX_CODE
 *   keys, i.e. including the brackets (e.g. `'[Ring1]'`)
 * @returns {number} The decoded index (0 for an empty array)
 */
export function getIndexFromSelfies(symbols) {
  const base = INDEX_ALPHABET.length
  let index = 0

  for (let i = 0; i < symbols.length; i++) {
    // i counts digit positions from the least significant (last) symbol
    const code = INDEX_CODE[symbols[symbols.length - 1 - i]]

    // Only real table entries contribute. A bare `|| 0` would let inherited
    // members such as INDEX_CODE['constructor'] through and yield NaN.
    if (typeof code === 'number') {
      index += code * Math.pow(base, i)
    }
  }

  return index
}
|
|
125
|
+
|
|
126
|
+
/**
 * Get SELFIES symbols from index value
 *
 * Writes `index` as a number whose base is `INDEX_ALPHABET.length` and
 * returns the corresponding symbols, most significant digit first.
 * Index 0 is represented by the single symbol `INDEX_ALPHABET[0]`.
 *
 * @param {number} index - Non-negative integer to encode
 * @returns {string[]} Symbols taken from INDEX_ALPHABET
 * @throws {Error} If `index` is negative or is not an integer
 */
export function getSelfiesFromIndex(index) {
  if (index < 0) {
    throw new Error('Index must be non-negative')
  }
  // Fractions would index INDEX_ALPHABET with a non-integer (undefined
  // entries), NaN would return an empty list, and Infinity would never
  // shrink to 0, so none of them can be encoded.
  if (!Number.isInteger(index)) {
    throw new Error('Index must be an integer')
  }

  const base = INDEX_ALPHABET.length
  const symbols = []
  let remaining = index

  // do/while emits one digit even for 0, which covers the index === 0 case
  do {
    symbols.push(INDEX_ALPHABET[remaining % base])
    remaining = Math.floor(remaining / base)
  } while (remaining > 0)

  // Digits were produced least significant first
  return symbols.reverse()
}
|
package/src/index.js
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* selfies-js - Pure JavaScript SELFIES encoder/decoder
|
|
3
|
+
*
|
|
4
|
+
* Public API exports
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// Core functionality
|
|
8
|
+
export { decode, decodeToAST, dumpAST } from './decoder.js'
|
|
9
|
+
export { encode } from './encoder.js'
|
|
10
|
+
export { isValid } from './validator.js'
|
|
11
|
+
|
|
12
|
+
// Tokenization
|
|
13
|
+
export { tokenize, join, lenSelfies } from './tokenizer.js'
|
|
14
|
+
|
|
15
|
+
// Properties
|
|
16
|
+
export { getMolecularWeight } from './properties/molecularWeight.js'
|
|
17
|
+
export { getFormula } from './properties/formula.js'
|
|
18
|
+
|
|
19
|
+
// Alphabet
|
|
20
|
+
export { getAlphabet, getSemanticAlphabet, getAlphabetFromSelfies } from './alphabet.js'
|
|
21
|
+
|
|
22
|
+
// Constraints
|
|
23
|
+
export {
|
|
24
|
+
getPresetConstraints,
|
|
25
|
+
getSemanticConstraints,
|
|
26
|
+
setSemanticConstraints,
|
|
27
|
+
getBondingCapacity,
|
|
28
|
+
resetConstraints
|
|
29
|
+
} from './constraints.js'
|
|
30
|
+
|
|
31
|
+
// DSL
|
|
32
|
+
export { parse } from './dsl/parser.js'
|
|
33
|
+
export { resolve, resolveAll } from './dsl/resolver.js'
|
|
34
|
+
export { getDependencies, getDependents } from './dsl/analyzer.js'
|
|
35
|
+
|
|
36
|
+
// Errors
|
|
37
|
+
export {
|
|
38
|
+
SelfiesError,
|
|
39
|
+
DecodeError,
|
|
40
|
+
EncodeError,
|
|
41
|
+
ResolveError,
|
|
42
|
+
ValidationError,
|
|
43
|
+
ParseError,
|
|
44
|
+
} from './errors.js'
|
|
45
|
+
|
|
46
|
+
// Chemistry Validation (RDKit-based)
|
|
47
|
+
export {
|
|
48
|
+
isChemicallyValid,
|
|
49
|
+
getCanonicalSmiles,
|
|
50
|
+
validateRoundtrip,
|
|
51
|
+
getValidationDetails,
|
|
52
|
+
batchValidate
|
|
53
|
+
} from './chemistryValidator.js'
|
|
54
|
+
|
|
55
|
+
// Renderers (RDKit-based)
|
|
56
|
+
export { renderSelfies, initRDKit } from './renderers/svg.js'
|
|
57
|
+
|
|
58
|
+
// Syntax Highlighting API
|
|
59
|
+
export {
|
|
60
|
+
tokenizeSelfies,
|
|
61
|
+
tokenizeDSL,
|
|
62
|
+
SyntaxTokenType,
|
|
63
|
+
TokenModifier,
|
|
64
|
+
getColorScheme,
|
|
65
|
+
getTextMateScopes,
|
|
66
|
+
getMonacoTokenTypes,
|
|
67
|
+
createMonacoLanguage,
|
|
68
|
+
validateTokenization,
|
|
69
|
+
highlightToHtml
|
|
70
|
+
} from './syntax.js'
|
package/src/parser.js
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Parser - Converts SELFIES tokens into an internal molecule representation (IR)
|
|
3
|
+
*
|
|
4
|
+
* The IR is a graph structure with atoms and bonds that can be used for
|
|
5
|
+
* decoding to SMILES, validation, and property calculation.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Parses SELFIES tokens into a molecule IR
|
|
10
|
+
* @param {string[]} tokens - Array of SELFIES tokens
|
|
11
|
+
* @returns {Object} Molecule IR with atoms and bonds
|
|
12
|
+
*
|
|
13
|
+
* IR Structure:
|
|
14
|
+
* {
|
|
15
|
+
* atoms: [
|
|
16
|
+
* { element: 'C', index: 0, valence: 4, usedValence: 0 },
|
|
17
|
+
* { element: 'O', index: 1, valence: 2, usedValence: 0 },
|
|
18
|
+
* ...
|
|
19
|
+
* ],
|
|
20
|
+
* bonds: [
|
|
21
|
+
* { from: 0, to: 1, order: 1 },
|
|
22
|
+
* ...
|
|
23
|
+
* ]
|
|
24
|
+
* }
|
|
25
|
+
*/
|
|
26
|
+
import { getValence } from './properties/atoms.js'
|
|
27
|
+
|
|
28
|
+
export function parse(tokens) {
  const atoms = []
  const bonds = []

  for (const token of tokens) {
    const { element, bondOrder } = parseToken(token)

    // Simplified handling: branch and ring symbols are dropped entirely
    if (isBranchToken(token) || isRingToken(token)) continue

    // Nothing left between the brackets: no atom to add
    if (!element) continue

    const index = atoms.length
    atoms.push({
      element,
      index,
      valence: getValence(element),
      usedValence: 0
    })

    // Every atom after the first is bonded to the atom added just before it
    if (index > 0) {
      bonds.push({ from: index - 1, to: index, order: bondOrder })
    }
  }

  return { atoms, bonds }
}
|
|
62
|
+
|
|
63
|
+
/**
 * Splits a bracketed token into its bond order and the remaining content.
 *
 * The first and last characters (the brackets) are dropped. A leading `=`
 * means bond order 2, a leading `#` means bond order 3, anything else is
 * bond order 1; the prefix character is not part of the returned element.
 *
 * @param {string} token - SELFIES token like '[C]', '[=O]', '[#N]'
 * @returns {{element: string, bondOrder: number}} Parsed token info
 */
function parseToken(token) {
  const inner = token.slice(1, -1)

  switch (inner.charAt(0)) {
    case '=':
      return { element: inner.slice(1), bondOrder: 2 }
    case '#':
      return { element: inner.slice(1), bondOrder: 3 }
    default:
      return { element: inner, bondOrder: 1 }
  }
}
|
|
79
|
+
|
|
80
|
+
/**
 * Determines if a token is a branch token
 *
 * A branch token is `[Branch1]`, `[Branch2]` or `[Branch3]`, optionally with
 * a single bond prefix: `=` (e.g. `[=Branch1]`) or `#` (e.g. `[#Branch2]`).
 *
 * @param {string} token - SELFIES token
 * @returns {boolean} True if token is a branch token
 */
function isBranchToken(token) {
  // At most ONE bond prefix is allowed; the previous `=?#?` pattern also
  // accepted the malformed combination `[=#Branch1]`.
  return /^\[[=#]?Branch[123]\]$/.test(token)
}
|
|
88
|
+
|
|
89
|
+
/**
 * Determines if a token is a ring token
 *
 * A ring token is `[Ring1]`, `[Ring2]` or `[Ring3]` with one of these
 * optional prefixes:
 *  - a single bond prefix, `=` or `#` (e.g. `[=Ring1]`);
 *  - two stereo markers, each one of `-`, `\` or `/` (e.g. `[-/Ring1]`).
 *
 * @param {string} token - SELFIES token
 * @returns {boolean} True if token is a ring token
 */
function isRingToken(token) {
  // The previous `=?#?` pattern accepted the malformed `[=#Ring1]` and missed
  // the stereo ring spellings, which then fell through to element handling.
  return /^\[(?:[=#]?|[-\\/]{2})Ring[123]\]$/.test(token)
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for SELFIES parser (tokens → IR)
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, test, expect } from 'bun:test'
|
|
6
|
+
import { parse } from './parser.js'
|
|
7
|
+
|
|
8
|
+
describe('parse', () => {
  // Linear chains
  test('parses simple molecule', () => {
    const ir = parse(['[C]', '[C]', '[O]'])
    expect(ir.atoms).toHaveLength(3)
    expect(ir.bonds).toHaveLength(2)
  })

  test('parses with bond modifiers', () => {
    const ir = parse(['[C]', '[=C]'])
    expect(ir.bonds[0].order).toBe(2) // double bond
  })

  test('parses triple bonds', () => {
    const ir = parse(['[C]', '[#N]'])
    expect(ir.bonds[0].order).toBe(3) // triple bond
  })

  // Branch parsing
  // parse() currently skips branch symbols without interpreting them, so the
  // expectations below stay as todos until branches are implemented.

  // Intended: ['[C]', '[C]', '[Branch1]', '[C]', '[C]', '[C]'] -> 4 atoms, 3 bonds
  test.todo('parses simple branch')

  // Intended: ['[C]', '[C]', '[Branch1]', '[C]', '[C]', '[Branch1]', '[C]', '[C]', '[C]']
  // -> verify branch structure
  test.todo('parses nested branches')

  // Ring parsing
  // parse() currently skips ring symbols as well.

  // Intended: ['[C]', '[C]', '[C]', '[Ring1]', '[C]'] -> ring closure bond exists
  test.todo('parses simple ring')

  // Intended: ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]']
  // -> 6 atoms with alternating double bonds
  test.todo('parses benzene ring')

  // IR structure validation
  test('returns valid IR structure', () => {
    const ir = parse(['[C]', '[C]'])
    expect(ir).toHaveProperty('atoms')
    expect(ir).toHaveProperty('bonds')
    expect(Array.isArray(ir.atoms)).toBe(true)
    expect(Array.isArray(ir.bonds)).toBe(true)
  })

  test('atoms have required properties', () => {
    const [atom] = parse(['[C]']).atoms
    expect(atom).toHaveProperty('element')
    expect(atom).toHaveProperty('index')
    expect(atom).toHaveProperty('valence')
    expect(atom).toHaveProperty('usedValence')
  })

  test('bonds have required properties', () => {
    const [bond] = parse(['[C]', '[C]']).bonds
    expect(bond).toHaveProperty('from')
    expect(bond).toHaveProperty('to')
    expect(bond).toHaveProperty('order')
  })

  // Error cases
  test('throws on invalid tokens', () => {
    expect(() => parse(['[Xyz]'])).toThrow()
  })

  // Intended: parse(['[Branch1]']) throws (branch at start).
  // Not enforced yet: a leading branch symbol is silently skipped.
  test.todo('throws on malformed branch')

  // Intended: parse(['[Ring1]']) throws (ring at start).
  // Not enforced yet: a leading ring symbol is silently skipped.
  test.todo('throws on malformed ring')
})
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Atoms - Atomic data for SELFIES elements
|
|
3
|
+
*
|
|
4
|
+
* Contains atomic masses, valences, and other properties for
|
|
5
|
+
* supported elements.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Reference data for the supported elements, keyed by element symbol.
 * Every entry has the shape `{ mass, valence, name }`.
 */
export const ATOMIC_DATA = Object.fromEntries(
  [
    // [symbol, mass, valence, name]
    ['C', 12.011, 4, 'Carbon'],
    ['N', 14.007, 3, 'Nitrogen'],
    ['O', 15.999, 2, 'Oxygen'],
    ['S', 32.06, 2, 'Sulfur'],
    ['P', 30.974, 3, 'Phosphorus'],
    ['F', 18.998, 1, 'Fluorine'],
    ['Cl', 35.45, 1, 'Chlorine'],
    ['Br', 79.904, 1, 'Bromine'],
    ['I', 126.904, 1, 'Iodine'],
    ['B', 10.81, 3, 'Boron'],
    ['H', 1.008, 1, 'Hydrogen']
  ].map(([symbol, mass, valence, name]) => [symbol, { mass, valence, name }])
)
|
|
25
|
+
|
|
26
|
+
/**
 * Gets atomic mass for an element
 * @param {string} element - Element symbol (e.g., 'C', 'N', 'O')
 * @returns {number} Atomic mass in g/mol
 * @throws {Error} If element is not supported
 */
export function getAtomicMass(element) {
  // Own-property check: a bare ATOMIC_DATA[element] lookup also finds
  // inherited members such as 'constructor' or 'toString', which would slip
  // past the guard and return undefined instead of throwing.
  if (!Object.prototype.hasOwnProperty.call(ATOMIC_DATA, element)) {
    throw new Error(`Unsupported element: ${element}`)
  }
  return ATOMIC_DATA[element].mass
}
|
|
39
|
+
|
|
40
|
+
/**
 * Gets standard valence for an element
 * @param {string} element - Element symbol
 * @returns {number} Standard valence
 * @throws {Error} If element is not supported
 */
export function getValence(element) {
  // Own-property check: a bare ATOMIC_DATA[element] lookup also finds
  // inherited members such as 'constructor' or 'toString', which would slip
  // past the guard and return undefined instead of throwing.
  if (!Object.prototype.hasOwnProperty.call(ATOMIC_DATA, element)) {
    throw new Error(`Unsupported element: ${element}`)
  }
  return ATOMIC_DATA[element].valence
}
|
|
53
|
+
|
|
54
|
+
/**
 * Checks if an element is supported
 * @param {string} element - Element symbol to check
 * @returns {boolean} True if element is supported
 */
export function isSupported(element) {
  // `in` also reports inherited keys ('constructor', 'toString', ...) as
  // present, so only the table's own entries are considered.
  return Object.prototype.hasOwnProperty.call(ATOMIC_DATA, element)
}
|
|
62
|
+
|
|
63
|
+
/**
 * Lists the symbols of every element present in ATOMIC_DATA
 * @returns {string[]} Element symbols, in the table's key order
 */
export function getSupportedElements() {
  return Object.entries(ATOMIC_DATA).map(([symbol]) => symbol)
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for atomic data
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, test, expect } from 'bun:test'
|
|
6
|
+
import {
|
|
7
|
+
ATOMIC_DATA,
|
|
8
|
+
getAtomicMass,
|
|
9
|
+
getValence,
|
|
10
|
+
isSupported,
|
|
11
|
+
getSupportedElements
|
|
12
|
+
} from './atoms.js'
|
|
13
|
+
|
|
14
|
+
describe('ATOMIC_DATA', () => {
  test('contains required elements', () => {
    for (const symbol of ['C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I', 'B']) {
      expect(ATOMIC_DATA).toHaveProperty(symbol)
    }
  })

  test('each element has mass and valence', () => {
    for (const data of Object.values(ATOMIC_DATA)) {
      for (const field of ['mass', 'valence', 'name']) {
        expect(data).toHaveProperty(field)
      }
    }
  })
})

describe('getAtomicMass', () => {
  test('returns correct mass for carbon', () => {
    const mass = getAtomicMass('C')
    expect(mass).toBeCloseTo(12.011, 2)
  })

  test('returns correct mass for oxygen', () => {
    const mass = getAtomicMass('O')
    expect(mass).toBeCloseTo(15.999, 2)
  })

  test('returns correct mass for nitrogen', () => {
    const mass = getAtomicMass('N')
    expect(mass).toBeCloseTo(14.007, 2)
  })

  test('throws on unsupported element', () => {
    const lookup = () => getAtomicMass('Xx')
    expect(lookup).toThrow()
  })
})

describe('getValence', () => {
  test('returns correct valence for carbon', () => {
    const valence = getValence('C')
    expect(valence).toBe(4)
  })

  test('returns correct valence for nitrogen', () => {
    const valence = getValence('N')
    expect(valence).toBe(3)
  })

  test('returns correct valence for oxygen', () => {
    const valence = getValence('O')
    expect(valence).toBe(2)
  })

  test('returns correct valence for halogens', () => {
    for (const halogen of ['F', 'Cl', 'Br', 'I']) {
      expect(getValence(halogen)).toBe(1)
    }
  })

  test('throws on unsupported element', () => {
    const lookup = () => getValence('Xx')
    expect(lookup).toThrow()
  })
})

describe('isSupported', () => {
  test('returns true for supported elements', () => {
    for (const symbol of ['C', 'N', 'O']) {
      expect(isSupported(symbol)).toBe(true)
    }
  })

  test('returns false for unsupported elements', () => {
    for (const symbol of ['Xx', 'He']) {
      expect(isSupported(symbol)).toBe(false)
    }
  })
})

describe('getSupportedElements', () => {
  test('returns array of elements', () => {
    const supported = getSupportedElements()
    expect(Array.isArray(supported)).toBe(true)
    expect(supported.length).toBeGreaterThan(0)
  })

  test('includes all required elements', () => {
    const supported = getSupportedElements()
    for (const symbol of ['C', 'N', 'O', 'S', 'P']) {
      expect(supported).toContain(symbol)
    }
  })

  test('includes halogens', () => {
    const supported = getSupportedElements()
    for (const symbol of ['F', 'Cl', 'Br', 'I']) {
      expect(supported).toContain(symbol)
    }
  })
})
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Formula - Generates molecular formula from SELFIES
|
|
3
|
+
*
|
|
4
|
+
* Computes the molecular formula in Hill notation:
|
|
5
|
+
* C first, then H, then other elements alphabetically.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { tokenize } from '../tokenizer.js'
|
|
9
|
+
import { parse } from '../parser.js'
|
|
10
|
+
|
|
11
|
+
/**
 * Builds the molecular formula for a SELFIES string.
 *
 * The string is tokenized, parsed into the molecule IR, its atoms (with
 * implicit hydrogens) are counted, and the counts are formatted.
 *
 * Ordering of the result:
 *   - Carbon first (if present)
 *   - Hydrogen second (if present)
 *   - Other elements alphabetically
 *   - A count of 1 is omitted
 *
 * Examples:
 *   getFormula('[C][C][O]')      // => 'C2H6O'
 *   getFormula('[C]')            // => 'CH4'
 *   getFormula('[N][C][C][=O]')  // => 'C2H5NO'
 *
 * @param {string} selfies - SELFIES string
 * @returns {string} Molecular formula
 */
export function getFormula(selfies) {
  return formatHill(countAtoms(parse(tokenize(selfies))))
}
|
|
33
|
+
|
|
34
|
+
/**
 * Counts atoms in molecule IR (including implicit hydrogens)
 *
 * Each atom in `ir.atoms` is counted under its element symbol. An atom then
 * contributes `valence - (sum of its bond orders)` implicit hydrogens, never
 * less than 0. Implicit hydrogens are added on top of any explicit 'H' atoms.
 *
 * @param {Object} ir - Molecule internal representation with `atoms`
 *   (each having `element` and `valence`) and `bonds` (each having `from`,
 *   `to` and `order`)
 * @returns {Object} Map of element to count
 */
function countAtoms(ir) {
  const counts = {}

  // Count explicit atoms
  for (const { element } of ir.atoms) {
    counts[element] = (counts[element] || 0) + 1
  }

  // Valence consumed per atom: every bond uses `order` on both of its ends
  const usedValence = new Array(ir.atoms.length).fill(0)
  for (const { from, to, order } of ir.bonds) {
    usedValence[from] += order
    usedValence[to] += order
  }

  // Unused valence is filled with implicit hydrogens
  let implicitH = 0
  ir.atoms.forEach((atom, i) => {
    implicitH += Math.max(0, atom.valence - usedValence[i])
  })

  if (implicitH > 0) {
    // Add to, rather than overwrite, hydrogens that were explicit atoms
    counts['H'] = (counts['H'] || 0) + implicitH
  }

  return counts
}
|
|
68
|
+
|
|
69
|
+
/**
 * Turns an element -> count map into a formula string.
 *
 * Order of the output: 'C' first, 'H' second (each only when its count is
 * truthy), then every remaining key of `counts` in sorted order.
 *
 * NOTE(review): strict Hill notation lists hydrogen alphabetically with the
 * other elements when no carbon is present (e.g. 'ClH'); this function always
 * puts H ahead of the other elements. Confirm whether that is intended.
 *
 * Example:
 *   formatHill({ C: 2, H: 6, O: 1 }) // => 'C2H6O'
 *   formatHill({ H: 2, O: 1 }) // => 'H2O'
 *
 * @param {Object} counts - Map of element to count
 * @returns {string} Formatted formula
 */
function formatHill(counts) {
  // C and H lead the formula, but only when they are actually present
  const leading = ['C', 'H'].filter(el => counts[el])

  // Everything else follows alphabetically
  const trailing = Object.keys(counts)
    .filter(el => el !== 'C' && el !== 'H')
    .sort()

  return [...leading, ...trailing]
    .map(el => formatElement(el, counts[el]))
    .join('')
}
|
|
102
|
+
|
|
103
|
+
/**
 * Renders one element of a formula.
 * @param {string} element - Element symbol
 * @param {number} count - Number of atoms
 * @returns {string} The bare symbol when count is exactly 1, otherwise the
 *   symbol followed by the count (e.g., 'C2', 'O', 'H3')
 */
function formatElement(element, count) {
  if (count === 1) {
    return element
  }
  return element + count
}
|