selfies-js 0.2.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +121 -197
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,74 +1,54 @@
|
|
|
1
1
|
<div align="center">
|
|
2
|
-
<img src="
|
|
2
|
+
<img src="icon.svg" alt="Toluene molecule" width="200"/>
|
|
3
3
|
<h1>selfies-js</h1>
|
|
4
|
-
<p>
|
|
4
|
+
<p><strong>Molecular fragments as reusable code.</strong></p>
|
|
5
5
|
</div>
|
|
6
6
|
|
|
7
|
-
##
|
|
8
|
-
|
|
9
|
-
**SELFIES** (SELF-referencIng Embedded Strings) is a 100% robust molecular string representation. Unlike SMILES, every SELFIES string corresponds to a valid molecule, making it ideal for machine learning and generative models in chemistry.
|
|
10
|
-
|
|
11
|
-
This library is a JavaScript port inspired by the original Python implementation: **[aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)**
|
|
7
|
+
## Why SELFIES?
|
|
12
8
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
## Overview
|
|
9
|
+
SELFIES was designed for machine learning — every string is a valid molecule, eliminating syntax errors in generative models. The `.selfies` DSL provided by this library extends that principle: named fragments are easier for LLMs to compose correctly than raw atom strings, and undefined references fail loudly instead of producing silent errors.
|
|
16
10
|
|
|
17
|
-
|
|
18
|
-
import {
|
|
19
|
-
decode, encode, isValid,
|
|
20
|
-
getMolecularWeight, getFormula,
|
|
21
|
-
lenSelfies, getSemanticConstraints,
|
|
22
|
-
isChemicallyValid, getCanonicalSmiles, validateRoundtrip
|
|
23
|
-
} from 'selfies-js'
|
|
11
|
+
## The Problem
|
|
24
12
|
|
|
25
|
-
|
|
26
|
-
decode('[C][C][O]') // 'CCO'
|
|
13
|
+
Pharmaceutical SMILES strings are unreadable:
|
|
27
14
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
// Syntax validation
|
|
32
|
-
isValid('[C][C][O]') // true
|
|
33
|
-
|
|
34
|
-
// Chemistry validation (requires RDKit)
|
|
35
|
-
import { initRDKit } from 'selfies-js'
|
|
36
|
-
await initRDKit()
|
|
37
|
-
await isChemicallyValid('[C][C][O]') // true - molecule is chemically valid
|
|
38
|
-
await getCanonicalSmiles('[C][C][O]') // 'CCO' - canonical SMILES representation
|
|
39
|
-
await validateRoundtrip('CCO', '[C][C][O]') // true - structure preserved
|
|
15
|
+
```
|
|
16
|
+
CC(=O)Nc1ccc(O)cc1
|
|
17
|
+
```
|
|
40
18
|
|
|
41
|
-
|
|
42
|
-
getMolecularWeight('[C][C][O]') // 46.07
|
|
43
|
-
getFormula('[C][C][O]') // 'C2H6O'
|
|
19
|
+
What is that? Acetaminophen. But you'd never know by looking at it.
|
|
44
20
|
|
|
45
|
-
|
|
46
|
-
lenSelfies('[C][C][O]') // 3 (symbol count, not string length)
|
|
21
|
+
And when LLMs generate SMILES, they hallucinate invalid structures. SELFIES fixes the validity problem. The DSL fixes the readability problem.
|
|
47
22
|
|
|
48
|
-
|
|
49
|
-
const constraints = getSemanticConstraints()
|
|
50
|
-
console.log(constraints['C']) // 4 (max bonds for carbon)
|
|
23
|
+
## The Solution
|
|
51
24
|
|
|
52
|
-
|
|
53
|
-
import { renderSelfies } from 'selfies-js'
|
|
25
|
+
Define named fragments. Compose molecules like code. Import and reuse across projects.
|
|
54
26
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
27
|
+
```selfies
|
|
28
|
+
# base-fragments.selfies - your team's shared library
|
|
29
|
+
[methyl] = [C]
|
|
30
|
+
[amino] = [N]
|
|
31
|
+
[hydroxyl] = [O]
|
|
32
|
+
[phenyl] = [C][=C][C][=C][C][=C][Ring1][=Branch1]
|
|
59
33
|
```
|
|
60
34
|
|
|
61
|
-
|
|
35
|
+
```selfies
|
|
36
|
+
# molecules.selfies - today's work
|
|
37
|
+
import [methyl, amino, hydroxyl, phenyl] from "./base-fragments.selfies"
|
|
62
38
|
|
|
63
|
-
|
|
39
|
+
[methanol] = [methyl][hydroxyl]
|
|
40
|
+
[aniline] = [phenyl][amino]
|
|
41
|
+
[toluene] = [methyl][phenyl]
|
|
42
|
+
```
|
|
64
43
|
|
|
65
|
-
|
|
44
|
+
```bash
|
|
45
|
+
$ selfies run molecules.selfies --format=smiles
|
|
46
|
+
methanol: CO
|
|
47
|
+
aniline: NC1=CC=CC=C1
|
|
48
|
+
toluene: CC1=CC=CC=C1
|
|
49
|
+
```
|
|
66
50
|
|
|
67
|
-
|
|
68
|
-
- Live SELFIES ↔ SMILES conversion
|
|
69
|
-
- Real-time molecular properties
|
|
70
|
-
- Built-in test suite
|
|
71
|
-
- Syntax highlighting
|
|
51
|
+
Your molecule definitions are now version-controlled, diffable, and shareable.
|
|
72
52
|
|
|
73
53
|
## Installation
|
|
74
54
|
|
|
@@ -76,210 +56,154 @@ Features:
|
|
|
76
56
|
npm install selfies-js
|
|
77
57
|
```
|
|
78
58
|
|
|
79
|
-
|
|
59
|
+
## Quick Start
|
|
80
60
|
|
|
81
|
-
|
|
61
|
+
**1. Create a fragment library** (`fragments.selfies`):
|
|
82
62
|
|
|
83
|
-
```
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
console.log(encoded); // '[C][C]'
|
|
90
|
-
|
|
91
|
-
const decoded = SELFIES.decode('[C][C][O]');
|
|
92
|
-
console.log(decoded); // 'CCO'
|
|
93
|
-
|
|
94
|
-
// DSL parsing
|
|
95
|
-
const parsed = SELFIES.parse('[methyl] = [C]');
|
|
96
|
-
console.log(parsed);
|
|
97
|
-
|
|
98
|
-
// Molecular properties
|
|
99
|
-
const mw = SELFIES.getMolecularWeight('[C][C][O]');
|
|
100
|
-
console.log(mw); // 46.07
|
|
101
|
-
</script>
|
|
63
|
+
```selfies
|
|
64
|
+
[methyl] = [C]
|
|
65
|
+
[ethyl] = [C][C]
|
|
66
|
+
[hydroxyl] = [O]
|
|
67
|
+
[carbonyl] = [C][=O]
|
|
68
|
+
[carboxyl] = [C][=O][O]
|
|
102
69
|
```
|
|
103
70
|
|
|
104
|
-
|
|
71
|
+
**2. Compose molecules** (`molecules.selfies`):
|
|
105
72
|
|
|
106
|
-
|
|
73
|
+
```selfies
|
|
74
|
+
import "./fragments.selfies"
|
|
107
75
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
- **Chemistry Validation:** RDKit-based molecular validity checking
|
|
112
|
-
- **Canonical SMILES:** Structure comparison and roundtrip validation
|
|
113
|
-
- **Properties:** Molecular weight and formula calculation
|
|
114
|
-
- **Constraints:** Customizable semantic constraints (bonding rules)
|
|
115
|
-
- **Utilities:** Symbol counting, alphabet extraction
|
|
116
|
-
- **DSL:** Define and resolve molecule libraries with named definitions
|
|
117
|
-
- **Imports:** Modular .selfies files with import support
|
|
118
|
-
- **CLI:** Command-line interface for executing .selfies files
|
|
119
|
-
- **Rendering:** SVG visualization of molecular structures
|
|
76
|
+
[ethanol] = [ethyl][hydroxyl]
|
|
77
|
+
[acetic_acid] = [methyl][carboxyl]
|
|
78
|
+
```
|
|
120
79
|
|
|
121
|
-
|
|
80
|
+
**3. Compile to SMILES**:
|
|
122
81
|
|
|
123
|
-
|
|
82
|
+
```bash
|
|
83
|
+
$ selfies run molecules.selfies --format=smiles
|
|
84
|
+
ethanol: CCO
|
|
85
|
+
acetic_acid: CC(=O)O
|
|
86
|
+
```
|
|
124
87
|
|
|
125
|
-
|
|
88
|
+
## CLI Commands
|
|
126
89
|
|
|
127
90
|
```bash
|
|
128
|
-
#
|
|
129
|
-
|
|
91
|
+
# Compile a .selfies file to SMILES
|
|
92
|
+
selfies run molecules.selfies --format=smiles
|
|
130
93
|
|
|
131
|
-
# Output as
|
|
132
|
-
|
|
94
|
+
# Output as SELFIES (default)
|
|
95
|
+
selfies run molecules.selfies
|
|
133
96
|
|
|
134
|
-
# Validate
|
|
135
|
-
|
|
97
|
+
# Validate syntax without running
|
|
98
|
+
selfies validate molecules.selfies
|
|
136
99
|
|
|
137
100
|
# List all definitions in a file
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
# Show help
|
|
141
|
-
bun src/cli.js help
|
|
101
|
+
selfies list molecules.selfies
|
|
142
102
|
```
|
|
143
103
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
The `.selfies` DSL allows you to define named molecular fragments and compose them hierarchically:
|
|
104
|
+
## DSL Syntax
|
|
147
105
|
|
|
148
106
|
```selfies
|
|
149
107
|
# Comments start with #
|
|
150
108
|
|
|
151
|
-
#
|
|
152
|
-
[
|
|
153
|
-
[ethyl] = [C][C]
|
|
154
|
-
[hydroxyl] = [O]
|
|
109
|
+
# Define a fragment
|
|
110
|
+
[name] = [SELFIES][tokens][here]
|
|
155
111
|
|
|
156
|
-
#
|
|
112
|
+
# Reference other fragments
|
|
157
113
|
[ethanol] = [ethyl][hydroxyl]
|
|
158
114
|
|
|
159
|
-
#
|
|
160
|
-
|
|
161
|
-
[
|
|
162
|
-
|
|
163
|
-
# Aromatic rings
|
|
164
|
-
[phenyl] = [C][=C][C][=C][C][=C][Ring1][=Branch1]
|
|
165
|
-
[toluene] = [methyl][phenyl]
|
|
115
|
+
# Import from other files
|
|
116
|
+
import "./other-file.selfies" # import all
|
|
117
|
+
import [methyl, ethyl] from "./fragments.selfies" # import specific
|
|
166
118
|
```
|
|
167
119
|
|
|
168
|
-
|
|
120
|
+
## JavaScript API
|
|
169
121
|
|
|
170
|
-
|
|
122
|
+
```javascript
|
|
123
|
+
import { parse, resolve, resolveAll, loadFile } from 'selfies-js/dsl'
|
|
171
124
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
import "./fragments.selfies"
|
|
125
|
+
// Load a file with imports
|
|
126
|
+
const program = loadFile('molecules.selfies')
|
|
175
127
|
|
|
176
|
-
|
|
177
|
-
|
|
128
|
+
// Resolve a single definition to SELFIES
|
|
129
|
+
resolve(program, 'ethanol') // '[C][C][O]'
|
|
178
130
|
|
|
179
|
-
|
|
180
|
-
|
|
131
|
+
// Resolve to SMILES
|
|
132
|
+
resolve(program, 'ethanol', { decode: true }) // 'CCO'
|
|
181
133
|
|
|
182
|
-
|
|
183
|
-
|
|
134
|
+
// Resolve all definitions
|
|
135
|
+
resolveAll(program) // Map { 'ethanol' => '[C][C][O]', ... }
|
|
184
136
|
```
|
|
185
137
|
|
|
186
|
-
|
|
187
|
-
- **Relative paths** resolved from the importing file's location
|
|
188
|
-
- **Chained imports** (file A imports B, B imports C)
|
|
189
|
-
- **Circular import detection** with clear error messages
|
|
190
|
-
- **Selective imports** to only include what you need
|
|
138
|
+
### Core SELFIES functions
|
|
191
139
|
|
|
192
|
-
|
|
140
|
+
```javascript
|
|
141
|
+
import { encode, decode, isValid, getMolecularWeight, getFormula } from 'selfies-js'
|
|
193
142
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
ethanol: CCO
|
|
200
|
-
isopropyl: C(C)C
|
|
201
|
-
isopropanol: C(C)CO
|
|
202
|
-
phenyl: C1=CC=CC=C1
|
|
203
|
-
toluene: CC1=CC=CC=C1
|
|
143
|
+
decode('[C][C][O]') // 'CCO'
|
|
144
|
+
encode('CCO') // '[C][C][O]'
|
|
145
|
+
isValid('[C][C][O]') // true
|
|
146
|
+
getMolecularWeight('[C][C][O]') // 46.07
|
|
147
|
+
getFormula('[C][C][O]') // 'C2H6O'
|
|
204
148
|
```
|
|
205
149
|
|
|
206
|
-
|
|
150
|
+
### SVG Rendering
|
|
207
151
|
|
|
208
152
|
```javascript
|
|
209
|
-
import {
|
|
210
|
-
import { loadFile } from 'selfies-js/dsl'
|
|
153
|
+
import { renderSelfies, initRDKit } from 'selfies-js'
|
|
211
154
|
|
|
212
|
-
|
|
213
|
-
const
|
|
155
|
+
await initRDKit()
|
|
156
|
+
const svg = await renderSelfies('[C][C][O]', { width: 300, height: 300 })
|
|
157
|
+
```
|
|
214
158
|
|
|
215
|
-
|
|
216
|
-
const source = `
|
|
217
|
-
[methyl] = [C]
|
|
218
|
-
[ethanol] = [methyl][C][O]
|
|
219
|
-
`
|
|
220
|
-
const program = parse(source)
|
|
159
|
+
## VS Code Extension
|
|
221
160
|
|
|
222
|
-
|
|
223
|
-
resolve(program, 'ethanol') // '[C][C][O]'
|
|
161
|
+
Get live visualization as you author `.selfies` files. See the molecular structure update line by line as you navigate your code.
|
|
224
162
|
|
|
225
|
-
|
|
226
|
-
resolve(program, 'ethanol', { decode: true }) // 'CCO'
|
|
163
|
+
**[Install from VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=ghost---shadow.selfies-lang)**
|
|
227
164
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
165
|
+
## Browser Usage
|
|
166
|
+
|
|
167
|
+
```html
|
|
168
|
+
<script src="https://github.com/Ghost---Shadow/selfies-js/releases/latest/download/selfies.umd.min.js"></script>
|
|
169
|
+
<script>
|
|
170
|
+
SELFIES.encode('CCO') // '[C][C][O]'
|
|
171
|
+
SELFIES.decode('[C][C][O]') // 'CCO'
|
|
172
|
+
SELFIES.getMolecularWeight('[C][C][O]') // 46.07
|
|
173
|
+
</script>
|
|
231
174
|
```
|
|
232
175
|
|
|
233
|
-
##
|
|
176
|
+
## Examples
|
|
234
177
|
|
|
235
|
-
|
|
178
|
+
See the [`examples/`](./examples) directory:
|
|
236
179
|
|
|
237
|
-
|
|
238
|
-
|
|
180
|
+
- `base-fragments.selfies` — Reusable building blocks (alkyl groups, functional groups, halogens)
|
|
181
|
+
- `molecules-with-imports.selfies` — Composing molecules from imported fragments
|
|
182
|
+
- `selective-import.selfies` — Importing only what you need
|
|
239
183
|
|
|
240
|
-
|
|
241
|
-
await initRDKit()
|
|
184
|
+
## What is SELFIES?
|
|
242
185
|
|
|
243
|
-
|
|
244
|
-
const svg = await renderSelfies('[C][C][=C][C][=C][C][=C][Ring1][=Branch1]', {
|
|
245
|
-
width: 300,
|
|
246
|
-
height: 300
|
|
247
|
-
})
|
|
248
|
-
```
|
|
186
|
+
SELFIES (SELF-referencIng Embedded Strings) is a molecular string representation where **every string is a valid molecule**. Unlike with SMILES, you can't write an invalid SELFIES string. This makes it ideal for machine learning and generative chemistry.
|
|
249
187
|
|
|
250
|
-
|
|
251
|
-
- Professional 2D coordinate generation via RDKit
|
|
252
|
-
- Proper skeletal formulas (carbons hidden)
|
|
253
|
-
- Correct benzene ring geometry
|
|
254
|
-
- Support for all bond types
|
|
255
|
-
- Stereochemistry notation
|
|
256
|
-
- Industry-standard rendering
|
|
188
|
+
This library is a JavaScript port of the Python implementation: [aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
|
|
257
189
|
|
|
258
|
-
|
|
190
|
+
> Krenn, M., Häse, F., Nigam, A., Friederich, P., & Aspuru-Guzik, A. (2020). Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation. *Machine Learning: Science and Technology*, 1(4), 045024.
|
|
259
191
|
|
|
260
|
-
|
|
192
|
+
## Interactive Playground
|
|
261
193
|
|
|
262
|
-
|
|
263
|
-
- `molecules-with-imports.selfies` - Demonstrates importing and composing molecules
|
|
264
|
-
- `organic-chemistry.selfies` - Alcohols, aldehydes, acids, amines, ethers
|
|
265
|
-
- `drug-fragments.selfies` - Pharmacophore fragments, drug-like building blocks
|
|
266
|
-
- `polymers.selfies` - Monomers, repeat units, oligomers
|
|
194
|
+
Try it live: **[https://ghost---shadow.github.io/selfies-js/](https://ghost---shadow.github.io/selfies-js/)**
|
|
267
195
|
|
|
268
196
|
## Known Limitations
|
|
269
197
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
- **
|
|
273
|
-
- **Fused aromatic ring systems** - some complex cases may not roundtrip correctly
|
|
274
|
-
- **Polycyclic structures** with multiple ring closures - partial support
|
|
275
|
-
|
|
276
|
-
For complete SELFIES support, use the original Python library: [aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
|
|
198
|
+
- **Bracket atoms** in SMILES (`[nH]`, `[C@@]`, `[13C]`) — limited support
|
|
199
|
+
- **Fused aromatic ring systems** — some complex cases may not round-trip correctly
|
|
200
|
+
- **Polycyclic structures** — partial support
|
|
277
201
|
|
|
278
|
-
|
|
202
|
+
For complete SELFIES support, use the Python library: [aspuru-guzik-group/selfies](https://github.com/aspuru-guzik-group/selfies)
|
|
279
203
|
|
|
280
|
-
|
|
204
|
+
## Testimonials
|
|
281
205
|
|
|
282
|
-
|
|
206
|
+

|
|
283
207
|
|
|
284
208
|
## License
|
|
285
209
|
|