@datagrok/sequence-translator 1.10.13 → 1.10.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/files/tests/chem_enum_cores.csv +5 -0
- package/files/tests/chem_enum_rgroups.csv +5 -0
- package/package.json +1 -1
- package/src/apps/structure/view/ui.ts +1 -1
- package/src/apps/translator/view/ui.ts +1 -1
- package/src/package-api.ts +8 -1
- package/src/package-test.ts +1 -0
- package/src/package.g.ts +11 -3
- package/src/package.ts +16 -5
- package/src/polytool/const.ts +1 -1
- package/src/polytool/pt-chem-enum-dialog.ts +940 -0
- package/src/polytool/pt-chem-enum.ts +553 -0
- package/src/polytool/pt-dialog.ts +2 -124
- package/src/polytool/pt-enumerate-seq-dialog.ts +3 -3
- package/src/tests/polytool-enumerate-chem-tests.ts +408 -0
- package/test-console-output-1.log +202 -89
- package/test-record-1.mp4 +0 -0
- package/src/polytool/pt-enumeration-chem.ts +0 -100
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
/* eslint-disable max-len */
|
|
2
|
+
import {RDModule, RDMol} from '@datagrok-libraries/chem-meta/src/rdkit-api';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* PolyTool Chemical Enumeration — pure logic module.
|
|
6
|
+
*
|
|
7
|
+
* Responsibilities:
|
|
8
|
+
* • Recognize and normalize the many spellings of R-group labels used in SMILES
|
|
9
|
+
* (`[1*]`, `[*:1]`, `[R1]`, `[R:1]`, `[*1]`, including multi-digit R numbers).
|
|
10
|
+
* • Validate cores (≥1 R-label) and R-groups (exactly one R-label).
|
|
11
|
+
* • Join a core SMILES with one R-group per R-number via SMILES-concatenation
|
|
12
|
+
* using shared ring-closure digits, then canonicalize through RDKit.
|
|
13
|
+
* • Enumerate across multiple cores and R-group lists in Zip or Cartesian mode.
|
|
14
|
+
* • Enforce a hard result cap.
|
|
15
|
+
*
|
|
16
|
+
* No Datagrok or UI dependencies — safe to unit-test with just an RDKit module.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
// ─── Constants ──────────────────────────────────────────────────────────────
|
|
20
|
+
|
|
21
|
+
/** Default hard cap on the number of results; used whenever `maxResults` is not supplied. */
export const CHEM_ENUM_MAX_RESULTS = 1_000_000;

/** Supported enumeration strategies. Each key maps to an identical string value. */
export const ChemEnumModes = {
  Zip: 'Zip',
  Cartesian: 'Cartesian',
} as const;

/** Union of the values of {@link ChemEnumModes}, i.e. `'Zip' | 'Cartesian'`. */
export type ChemEnumMode = (typeof ChemEnumModes)[keyof typeof ChemEnumModes];
|
|
28
|
+
|
|
29
|
+
// ─── Types ──────────────────────────────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
/** A core (scaffold) prepared for enumeration. */
export interface ChemEnumCore {
  /** Core SMILES after normalization: every R-label is spelled `[*:N]`. */
  smiles: string;
  /** The SMILES exactly as supplied, before any normalization. */
  originalSmiles: string;
  /** Distinct R numbers occurring in the core, in ascending order. */
  rNumbers: number[];
  /** Human-readable origin label, e.g. `"Drawn #1"` or `"Cores[3]"`. */
  id: string;
  /** Present when the core is invalid; such cores are left out of enumeration. */
  error?: string;
}
|
|
43
|
+
|
|
44
|
+
/** A substituent (R-group) prepared for a specific R slot. */
export interface ChemEnumRGroup {
  /** Normalized SMILES whose single R-label has been rewritten to the target R number. */
  smiles: string;
  /** The SMILES exactly as supplied (before normalization and before remapping). */
  originalSmiles: string;
  /** The R number (slot) this group is meant to fill. */
  rNumber: number;
  /** The R number that was written in `originalSmiles`, i.e. before remapping. */
  sourceRNumber?: number;
  /** Human-readable origin label. */
  id: string;
  /** Present when the group is invalid; such groups are ignored during enumeration. */
  error?: string;
}
|
|
56
|
+
|
|
57
|
+
/** Everything needed to run (or validate) one enumeration. */
export interface ChemEnumParams {
  /** Candidate cores; entries carrying an `error` are skipped. */
  cores: ChemEnumCore[];
  /** R number (1-based) → the R-groups offered for that slot. */
  rGroups: Map<number, ChemEnumRGroup[]>;
  /** How R-group lists are combined for a core: index-wise (Zip) or all combinations (Cartesian). */
  mode: ChemEnumMode;
  /** Result cap; falls back to `CHEM_ENUM_MAX_RESULTS` when omitted (handy for tests). */
  maxResults?: number;
}
|
|
65
|
+
|
|
66
|
+
/** One enumerated molecule together with the inputs it was built from. */
export interface ChemEnumResult {
  /** SMILES of the assembled molecule (canonical or raw, depending on the producing function). */
  smiles: string;
  /** `originalSmiles` of the core that was used. */
  coreSmiles: string;
  /** R number → `originalSmiles` of the R-group placed at that position. */
  rGroupSmilesByNum: Map<number, string>;
}
|
|
74
|
+
|
|
75
|
+
/** Outcome of a pre-flight check of enumeration parameters. */
export interface ChemEnumValidation {
  /** True when no error message was recorded. */
  ok: boolean;
  /** Free-form top-level messages (e.g. zip-length mismatch, cap exceeded). */
  errors: string[];
  /**
   * Predicted total result count. Per-core counts are summed in order and the
   * summation stops as soon as the running total exceeds the cap, so when
   * `overCap` is true this value is only a lower bound on the real total.
   */
  predictedCount: number;
  /** True when `predictedCount` is greater than the result cap. */
  overCap: boolean;
}
|
|
85
|
+
|
|
86
|
+
// ─── R-label recognition ────────────────────────────────────────────────────
|
|
87
|
+
|
|
88
|
+
/**
 * Regex source recognising every supported R-label spelling as one bracketed atom:
 *   `[N*]`  `[*:N]`  `[*N]`  `[RN]`  `[R:N]`
 * Each spelling has its own capture group (1..5, in the order listed above); for any
 * given match exactly one of them holds the digits and the rest are `undefined`.
 */
const R_LABEL_SOURCE = String.raw`\[(?:(\d+)\*|\*:(\d+)|\*(\d+)|R(\d+)|R:(\d+))\]`;

/**
 * Creates a global (`g`) R-label matcher.
 * A new `RegExp` instance is built on every call.
 */
export function rLabelRegex(): RegExp {
  const matcher = new RegExp(R_LABEL_SOURCE, 'g');
  return matcher;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Returns the number held by the first capture group that participated in a match.
 *
 * @param groups Capture groups of one R-label match; groups that did not
 *   participate are `undefined`, which is why the element type allows it.
 * @returns The parsed base-10 integer, or `null` when every group is `undefined`.
 */
function pickNum(groups: readonly (string | undefined)[]): number | null {
  const digits = groups.find((g) => g !== undefined);
  return digits === undefined ? null : parseInt(digits, 10);
}
|
|
103
|
+
|
|
104
|
+
/**
 * Rewrites each recognised R-label, whatever its spelling, into the canonical
 * `[*:N]` spelling. Text that is not an R-label is left untouched.
 */
export function normalizeRLabels(smi: string): string {
  // `captures` receives the five capture groups followed by the match offset and the input string.
  const toCanonical = (whole: string, ...captures: string[]): string => {
    const rNumber = pickNum(captures.slice(0, 5));
    if (rNumber === null) return whole;
    return `[*:${rNumber}]`;
  };
  return smi.replace(rLabelRegex(), toCanonical);
}
|
|
111
|
+
|
|
112
|
+
/** Collects the distinct R numbers that occur in `smi` and returns them in ascending order. */
export function extractRNumbers(smi: string): number[] {
  const unique = new Set<number>();
  const matcher = rLabelRegex();
  let hit: RegExpExecArray | null;
  while ((hit = matcher.exec(smi)) !== null) {
    // Groups 1..5 are the per-spelling digit captures.
    const rNumber = pickNum(hit.slice(1, 6));
    if (rNumber !== null) unique.add(rNumber);
  }
  return Array.from(unique).sort((lo, hi) => lo - hi);
}
|
|
121
|
+
|
|
122
|
+
/**
 * Lists every R-label occurrence in `smi`, in the order they appear.
 * Each entry carries the label's number (`source`), the matched text and its offset.
 */
export function findRLabels(smi: string): { source: number, match: string, index: number }[] {
  return Array.from(smi.matchAll(rLabelRegex())).flatMap((hit) => {
    const source = pickNum(hit.slice(1, 6));
    return source === null ? [] : [{source, match: hit[0], index: hit.index!}];
  });
}
|
|
131
|
+
|
|
132
|
+
/**
 * Replaces every R-label found in `smi` with `[*:newN]`, whatever number or
 * spelling it had. Meant for single-label R-groups being moved into their slot.
 */
export function remapSingleRLabel(smi: string, newN: number): string {
  const replacement = `[*:${newN}]`;
  return smi.replace(rLabelRegex(), () => replacement);
}
|
|
139
|
+
|
|
140
|
+
// ─── Core / R-group construction ────────────────────────────────────────────
|
|
141
|
+
|
|
142
|
+
/**
 * Builds a {@link ChemEnumCore} from user-supplied SMILES.
 *
 * The input is trimmed and its R-labels normalized. The result carries an
 * `error` when the input is empty, contains no R-label, or (only when `rdkit`
 * is given) fails to parse.
 * NOTE(review): R numbers are de-duplicated here, so a core that repeats the
 * same R number is not rejected by this function.
 *
 * @param originalSmiles SMILES as supplied; returned unchanged in `originalSmiles`.
 * @param id Human-readable origin label, stored as-is.
 * @param rdkit Optional RDKit module; enables the parse check.
 */
export function makeCore(originalSmiles: string, id: string, rdkit?: RDModule): ChemEnumCore {
  const input = (originalSmiles ?? '').trim();
  if (input.length === 0)
    return {smiles: '', originalSmiles, rNumbers: [], id, error: 'Empty SMILES'};

  const smiles = normalizeRLabels(input);
  const rNumbers = extractRNumbers(smiles);
  const core: ChemEnumCore = {smiles, originalSmiles, rNumbers, id};

  if (rNumbers.length === 0)
    return {...core, error: 'Core must contain at least one R group'};

  const parseError = rdkit ? tryParse(smiles, rdkit) : null;
  if (parseError)
    return {...core, error: `Invalid SMILES: ${parseError}`};

  return core;
}
|
|
160
|
+
|
|
161
|
+
/**
 * Builds a {@link ChemEnumRGroup} from user-supplied SMILES and assigns it to
 * slot `targetRNumber`.
 *
 * The input is trimmed, its R-labels are normalized, and the single R-label is
 * rewritten to `[*:targetRNumber]`. The result carries an `error` when the
 * input is empty, does not contain exactly one R-label, or (only when `rdkit`
 * is given) fails to parse.
 *
 * Label *occurrences* are counted, not distinct numbers: a fragment such as
 * `[*:1]C[*:1]` has two attachment points and is rejected, even though both
 * labels carry the same number.
 *
 * @param originalSmiles SMILES as supplied; returned unchanged in `originalSmiles`.
 * @param targetRNumber The R slot this group should fill.
 * @param id Human-readable origin label, stored as-is.
 * @param rdkit Optional RDKit module; enables the parse check.
 */
export function makeRGroup(
  originalSmiles: string, targetRNumber: number, id: string, rdkit?: RDModule,
): ChemEnumRGroup {
  const trimmed = (originalSmiles ?? '').trim();
  if (trimmed === '')
    return {smiles: '', originalSmiles, rNumber: targetRNumber, id, error: 'Empty SMILES'};

  const normalized = normalizeRLabels(trimmed);
  // One entry per label occurrence, sorted ascending so messages list labels in a stable order.
  const labelNumbers = findRLabels(normalized).map((label) => label.source).sort((a, b) => a - b);

  if (labelNumbers.length === 0) {
    return {
      smiles: normalized, originalSmiles, rNumber: targetRNumber, id,
      error: 'R-group must contain exactly one R label (found none)'};
  }
  if (labelNumbers.length > 1) {
    return {
      smiles: normalized, originalSmiles, rNumber: targetRNumber, id,
      sourceRNumber: labelNumbers[0],
      error: `R-group must contain exactly one R label (found ${labelNumbers.length}: ${labelNumbers.map((n) => 'R' + n).join(', ')})`};
  }

  const sourceRNumber = labelNumbers[0];
  const remapped = remapSingleRLabel(normalized, targetRNumber);

  if (rdkit) {
    const err = tryParse(remapped, rdkit);
    if (err) {
      return {
        smiles: remapped, originalSmiles, rNumber: targetRNumber, id, sourceRNumber,
        error: `Invalid SMILES: ${err}`};
    }
  }

  return {smiles: remapped, originalSmiles, rNumber: targetRNumber, id, sourceRNumber};
}
|
|
197
|
+
|
|
198
|
+
/**
 * Checks whether `smi` can be parsed by RDKit.
 *
 * @returns `null` when the molecule parses and is valid; otherwise a short
 *   message: `'failed to parse'`, or the thrown error's text cut to 120 characters.
 *   The temporary molecule is always released.
 */
function tryParse(smi: string, rdkit: RDModule): string | null {
  let mol: RDMol | null = null;
  try {
    mol = rdkit.get_mol(smi);
    const parsed = mol ? mol.is_valid() : false;
    return parsed ? null : 'failed to parse';
  } catch (err: any) {
    const text = err?.message ?? String(err);
    return text.toString().slice(0, 120);
  } finally {
    // Release the temporary molecule on every path.
    mol?.delete();
  }
}
|
|
210
|
+
|
|
211
|
+
// ─── Count + validation ─────────────────────────────────────────────────────
|
|
212
|
+
|
|
213
|
+
/**
 * Predicts how many results a single core will produce.
 *
 * @returns `uncovered` lists the core's R numbers that have no R-group list (or an
 *   empty one). `count` is 0 when anything is uncovered or some list has no valid
 *   entry; in Zip mode it is the shared list length, or -1 when the valid-entry
 *   counts differ; in Cartesian mode it is the product of the valid-entry counts.
 */
export function countForCore(
  core: ChemEnumCore, rGroups: Map<number, ChemEnumRGroup[]>, mode: ChemEnumMode,
): { count: number, uncovered: number[] } {
  const uncovered: number[] = [];
  const validPerSlot: number[] = [];

  core.rNumbers.forEach((rNumber) => {
    const candidates = rGroups.get(rNumber);
    if (candidates && candidates.length > 0)
      validPerSlot.push(candidates.filter((g) => !g.error).length);
    else
      uncovered.push(rNumber);
  });

  const unusable = uncovered.length > 0 || validPerSlot.includes(0);
  if (unusable) return {count: 0, uncovered};

  if (mode !== ChemEnumModes.Zip) {
    // Cartesian: every combination of one valid entry per slot.
    let product = 1;
    for (const slotCount of validPerSlot) product *= slotCount;
    return {count: product, uncovered};
  }

  if (validPerSlot.length === 0) return {count: 0, uncovered};
  const [expected] = validPerSlot;
  const uniform = validPerSlot.every((slotCount) => slotCount === expected);
  return {count: uniform ? expected : -1, uncovered};
}
|
|
235
|
+
|
|
236
|
+
/**
 * Pre-flight check of enumeration parameters.
 *
 * Records an error message for: no usable core, an R number without any valid
 * R-group, differing list lengths in Zip mode, a core referencing an uncovered
 * R number, and a predicted total above the cap. Also returns the predicted
 * result count; summation stops once the running total exceeds the cap.
 */
export function validateParams(params: ChemEnumParams): ChemEnumValidation {
  const cap = params.maxResults ?? CHEM_ENUM_MAX_RESULTS;
  const errors: string[] = [];

  const usableCores = params.cores.filter((core) => !core.error);
  if (usableCores.length === 0) errors.push('No valid cores provided.');

  // R numbers referenced by at least one usable core, in first-seen order.
  const referenced = new Set<number>(usableCores.flatMap((core) => core.rNumbers));

  // Number of valid (error-free) R-groups available for each referenced R number.
  const validCountByR = new Map<number, number>();
  referenced.forEach((rNumber) => {
    const validCount = (params.rGroups.get(rNumber) ?? []).filter((g) => !g.error).length;
    validCountByR.set(rNumber, validCount);
    if (validCount === 0) errors.push(`No valid R-group provided for R${rNumber}.`);
  });

  // Zip mode: all non-empty lists must have the same number of valid entries.
  if (params.mode === ChemEnumModes.Zip) {
    const nonEmpty = Array.from(validCountByR.values()).filter((v) => v > 0);
    const mismatch = nonEmpty.length > 1 && nonEmpty.some((v) => v !== nonEmpty[0]);
    if (mismatch) {
      const summary = Array.from(validCountByR.entries()).map(([n, v]) => `R${n}=${v}`).join(', ');
      errors.push(`Zip mode requires every R-group list to have the same number of entries. Got ${summary}.`);
    }
  }

  let total = 0;
  for (const core of usableCores) {
    const {count, uncovered} = countForCore(core, params.rGroups, params.mode);
    if (uncovered.length > 0) {
      const plural = uncovered.length > 1 ? 's' : '';
      const names = uncovered.map((n) => 'R' + n).join(', ');
      errors.push(`Core "${core.id}" references uncovered R-number${plural}: ${names}.`);
    } else if (count >= 0) {
      // A negative count means a zip mismatch, which the message above already reports.
      total += count;
      if (total > cap) break;
    }
  }

  const overCap = total > cap;
  if (overCap)
    errors.push(`Too many combinations (> ${cap.toLocaleString()}). Reduce the number of cores or R-groups, or switch to Zip mode.`);

  return {ok: errors.length === 0, errors, predictedCount: total, overCap};
}
|
|
286
|
+
|
|
287
|
+
// ─── SMILES assembly ────────────────────────────────────────────────────────
|
|
288
|
+
|
|
289
|
+
/**
 * Pure-string join of a core with one R-group per R number.
 *
 * Each R-label pair (one in the core, one in the R-group) is replaced by a shared,
 * otherwise unused ring-closure token, and the pieces are concatenated with `.`.
 * The result is **not canonicalized** and no RDKit call is made here.
 *
 *   core: `C[*:1]N[*:2]`, R1=`O[*:1]`, R2=`S[*:2]`
 *     →  `C1N2.O1.S2`
 *
 * @returns The joined SMILES, or `null` when `rgSmilesByNum` is empty or not
 *   enough free ring-closure numbers are available.
 */
export function buildJoinedSmiles(
  coreSmiles: string,
  rgSmilesByNum: Map<number, string>,
): string | null {
  const slotCount = rgSmilesByNum.size;
  if (slotCount === 0) return null;

  // Make sure no piece starts with an R-label before substituting ring tokens.
  const core = moveStartRLabelToBranch(coreSmiles);
  const groups: [number, string][] = [];
  rgSmilesByNum.forEach((smi, rNumber) => groups.push([rNumber, moveStartRLabelToBranch(smi)]));

  const free = pickFreeRingDigits([core, ...groups.map(([, smi]) => smi)], slotCount);
  if (free.length < slotCount) return null;

  let joinedCore = core;
  const joinedGroups: string[] = [];
  groups.forEach(([rNumber, smi], slot) => {
    const token = formatRingDigit(free[slot]);
    joinedCore = substituteRLabelWithRingDigit(joinedCore, rNumber, token);
    joinedGroups.push(substituteRLabelWithRingDigit(smi, rNumber, token));
  });

  return [joinedCore, ...joinedGroups].join('.');
}
|
|
327
|
+
|
|
328
|
+
/**
 * Joins a core with one R-group per R number and returns the SMILES RDKit
 * produces for the joined molecule.
 *
 * Makes one synchronous RDKit call per molecule, so it is not suited to bulk use;
 * for large runs build raw strings with `buildJoinedSmiles` and canonicalize them
 * in one batch.
 *
 * @returns The SMILES from RDKit, or `null` when joining fails, the molecule is
 *   invalid, or RDKit throws.
 */
export function assembleMolecule(
  coreSmiles: string,
  rgSmilesByNum: Map<number, string>,
  rdkit: RDModule,
): string | null {
  const raw = buildJoinedSmiles(coreSmiles, rgSmilesByNum);
  if (raw === null || raw === '') return null;

  let mol: RDMol | null = null;
  try {
    mol = rdkit.get_mol(raw);
    return mol && mol.is_valid() ? mol.get_smiles() : null;
  } catch {
    return null;
  } finally {
    // Release the temporary molecule on every path.
    mol?.delete();
  }
}
|
|
351
|
+
|
|
352
|
+
/**
 * Moves a leading `[*:N]` label behind the first real atom, as a branch:
 * `[*:N]X…` → `X([*:N])…` and `[*:N]=X…` → `X(=[*:N])…`.
 * This guarantees the label is preceded by an atom, which ring-token
 * substitution relies on. Inputs that do not start with `[*:N]` followed by an
 * optional bond and an atom are returned unchanged.
 */
export function moveStartRLabelToBranch(smi: string): string {
  const parts = /^(\[\*:\d+\])([-=#:/\\])?(\[[^\]]+\]|Br|Cl|[BCNOPSFIbcnops])(.*)$/s.exec(smi);
  if (parts === null) return smi;

  const rLabel = parts[1];
  const bond = parts[2] ?? '';
  const firstAtom = parts[3];
  const tail = parts[4];
  return firstAtom + '(' + bond + rLabel + ')' + tail;
}
|
|
362
|
+
|
|
363
|
+
/**
 * Replaces every `[*:n]` in `smi` with the ring-closure token `digitToken`.
 *
 * A branch that holds nothing but the label is collapsed first, because a branch
 * containing only a ring closure is not valid SMILES:
 *   `X([*:n])`  → `X<token>`
 *   `X(=[*:n])` → `X=<token>`   (an explicit bond symbol is kept in front of the token)
 * The bonded form is what `moveStartRLabelToBranch` produces for inputs such as `[*:n]=X…`.
 *
 * @param smi SMILES with R-labels in the normalized `[*:N]` spelling.
 * @param n R number whose label is replaced; other labels are left alone.
 * @param digitToken Ring-closure token to insert, e.g. `1` or `%12`.
 */
export function substituteRLabelWithRingDigit(smi: string, n: number, digitToken: string): string {
  const label = `[*:${n}]`;
  // `(` [bond] `[*:n]` `)` with optional whitespace; the bond symbol is captured so it can be kept.
  const loneBranch = new RegExp(`\\(\\s*([-=#:/\\\\]?)\\s*\\[\\*:${n}\\]\\s*\\)`, 'g');
  return smi
    .replace(loneBranch, (_whole: string, bond: string) => `${bond}${digitToken}`)
    .split(label)
    .join(digitToken);
}
|
|
371
|
+
|
|
372
|
+
/**
 * Picks up to `count` ring-closure numbers (1..99, ascending) that none of the
 * pieces already uses. Returns plain numbers; use `formatRingDigit` to turn
 * them into SMILES tokens.
 *
 * Digits inside bracket atoms are ignored. A two-digit `%NN` closure marks only
 * `NN` as used — its individual digits are not additionally treated as
 * single-digit closures.
 *
 * @param pieces SMILES fragments that will end up in the same joined string.
 * @param count How many free numbers are wanted.
 * @returns At most `count` free numbers; fewer when 1..99 is exhausted.
 */
export function pickFreeRingDigits(pieces: string[], count: number): number[] {
  const used = new Set<number>();
  for (const piece of pieces) {
    // Digits inside `[...]` belong to the atom, not to ring closures.
    const bare = piece.replace(/\[[^\]]*\]/g, '');
    // Record `%NN` closures and drop them so only single-digit closures remain.
    const singles = bare.replace(/%(\d{2})/g, (_token: string, nn: string) => {
      used.add(parseInt(nn, 10));
      return '';
    });
    for (const hit of singles.matchAll(/\d/g)) used.add(parseInt(hit[0], 10));
  }

  const free: number[] = [];
  for (let d = 1; d < 100 && free.length < count; d++)
    if (!used.has(d)) free.push(d);
  return free;
}
|
|
392
|
+
|
|
393
|
+
/**
 * Formats a ring-closure number as a SMILES token: a bare digit for 0..9,
 * `%NN` for 10..99.
 *
 * @throws Error when `n` is not an integer in 0..99 (this includes `NaN`).
 */
export function formatRingDigit(n: number): string {
  if (!Number.isInteger(n) || n < 0 || n > 99) throw new Error(`Ring digit out of range: ${n}`);
  return n <= 9 ? `${n}` : `%${n.toString().padStart(2, '0')}`;
}
|
|
397
|
+
|
|
398
|
+
// ─── Enumeration ────────────────────────────────────────────────────────────
|
|
399
|
+
|
|
400
|
+
/**
 * Lazily enumerates R-group assignments core by core, yielding at most
 * `params.maxResults` items (default `CHEM_ENUM_MAX_RESULTS`) in total.
 *
 * Cores with an `error` are skipped, as are cores for which some R number has no
 * valid R-group. In Zip mode a core is also skipped when its lists of valid
 * R-groups differ in length; otherwise the i-th entries are combined. In
 * Cartesian mode every combination is produced, with the last R number varying
 * fastest.
 *
 * A cap of zero or less yields nothing.
 */
export function* iterateAssignments(params: ChemEnumParams): Generator<{core: ChemEnumCore, assignment: Map<number, ChemEnumRGroup>}> {
  const max = params.maxResults ?? CHEM_ENUM_MAX_RESULTS;
  // The cap is otherwise only checked after a yield, which would let one item through.
  if (max <= 0) return;
  let produced = 0;

  for (const core of params.cores) {
    if (core.error) continue;

    const rNums = core.rNumbers;
    // Valid R-groups per R number, parallel to `rNums`.
    const lists: ChemEnumRGroup[][] = [];
    for (const n of rNums) {
      const list = (params.rGroups.get(n) ?? []).filter((g) => !g.error);
      if (list.length === 0) break;
      lists.push(list);
    }
    // A shorter `lists` means some R number had no valid R-group.
    if (lists.length < rNums.length) continue;

    if (params.mode === ChemEnumModes.Zip) {
      if (lists.length === 0) continue;
      const rowCount = lists[0].length;
      if (lists.some((l) => l.length !== rowCount)) continue;
      for (let i = 0; i < rowCount; i++) {
        const assignment = new Map<number, ChemEnumRGroup>();
        for (let j = 0; j < rNums.length; j++) assignment.set(rNums[j], lists[j][i]);
        yield {core, assignment};
        if (++produced >= max) return;
      }
    } else {
      // Cartesian — odometer iteration: idx[j] is the current position in lists[j].
      const idx = new Array<number>(lists.length).fill(0);
      while (true) {
        const assignment = new Map<number, ChemEnumRGroup>();
        for (let j = 0; j < rNums.length; j++) assignment.set(rNums[j], lists[j][idx[j]]);
        yield {core, assignment};
        if (++produced >= max) return;

        // Advance the right-most wheel; on overflow reset it and carry to the left.
        let k = idx.length - 1;
        while (k >= 0) {
          idx[k]++;
          if (idx[k] < lists[k].length) break;
          idx[k] = 0;
          k--;
        }
        // Carry ran off the left edge: all combinations for this core are done.
        if (k < 0) break;
      }
    }
  }
}
|
|
449
|
+
|
|
450
|
+
/**
 * Runs the full enumeration through RDKit.
 *
 * @returns `null` when `validateParams` reports the parameters as not ok (call it
 *   directly to obtain the messages); otherwise one result per assignment that
 *   assembled successfully. Assignments that fail to assemble are dropped silently.
 */
export function enumerate(params: ChemEnumParams, rdkit: RDModule): ChemEnumResult[] | null {
  if (!validateParams(params).ok) return null;

  const results: ChemEnumResult[] = [];
  for (const {core, assignment} of iterateAssignments(params)) {
    const normalizedByNum = new Map<number, string>();
    const originalByNum = new Map<number, string>();
    assignment.forEach((rg, rNumber) => {
      normalizedByNum.set(rNumber, rg.smiles);
      originalByNum.set(rNumber, rg.originalSmiles);
    });

    const assembled = assembleMolecule(core.smiles, normalizedByNum, rdkit);
    if (assembled)
      results.push({smiles: assembled, coreSmiles: core.originalSmiles, rGroupSmilesByNum: originalByNum});
  }
  return results;
}
|
|
473
|
+
|
|
474
|
+
/**
 * Reservoir-samples up to `sampleSize` assembled results, e.g. for a live preview.
 * The number of assignments visited is bounded by the cap applied in
 * `iterateAssignments`. Parameters are not validated here.
 *
 * @param rand Source of random numbers in [0, 1); consulted only once the reservoir is full.
 */
export function enumerateSample(
  params: ChemEnumParams, rdkit: RDModule, sampleSize: number, rand: () => number = Math.random,
): ChemEnumResult[] {
  const picked: ChemEnumResult[] = [];
  let accepted = 0; // successfully assembled items seen before the current one

  for (const {core, assignment} of iterateAssignments(params)) {
    const normalizedByNum = new Map<number, string>();
    assignment.forEach((rg, rNumber) => normalizedByNum.set(rNumber, rg.smiles));
    const assembled = assembleMolecule(core.smiles, normalizedByNum, rdkit);
    if (!assembled) continue;

    const originalByNum = new Map<number, string>();
    assignment.forEach((rg, rNumber) => originalByNum.set(rNumber, rg.originalSmiles));
    const candidate: ChemEnumResult = {
      smiles: assembled, coreSmiles: core.originalSmiles, rGroupSmilesByNum: originalByNum,
    };

    if (picked.length >= sampleSize) {
      // Reservoir full: replace a random slot with probability sampleSize / (accepted + 1).
      const slot = Math.floor(rand() * (accepted + 1));
      if (slot < sampleSize) picked[slot] = candidate;
    } else {
      picked.push(candidate);
    }
    accepted += 1;
  }
  return picked;
}
|
|
503
|
+
|
|
504
|
+
/**
 * Enumeration without RDKit: each result carries the *uncanonical* joined SMILES
 * from `buildJoinedSmiles`. Intended as the first stage of a bulk pipeline, with
 * canonicalization done afterwards in a single batch.
 *
 * @returns `null` when `validateParams` reports the parameters as not ok;
 *   otherwise one result per assignment that could be joined.
 */
export function enumerateRaw(params: ChemEnumParams): ChemEnumResult[] | null {
  if (!validateParams(params).ok) return null;

  const results: ChemEnumResult[] = [];
  for (const {core, assignment} of iterateAssignments(params)) {
    const normalizedByNum = new Map<number, string>();
    const originalByNum = new Map<number, string>();
    assignment.forEach((rg, rNumber) => {
      normalizedByNum.set(rNumber, rg.smiles);
      originalByNum.set(rNumber, rg.originalSmiles);
    });

    const joined = buildJoinedSmiles(core.smiles, normalizedByNum);
    if (joined)
      results.push({smiles: joined, coreSmiles: core.originalSmiles, rGroupSmilesByNum: originalByNum});
  }
  return results;
}
|
|
527
|
+
|
|
528
|
+
/**
 * Reservoir-samples up to `sampleSize` results using the no-RDKit join, so the
 * returned SMILES are uncanonical. Parameters are not validated here.
 *
 * @param rand Source of random numbers in [0, 1); consulted only once the reservoir is full.
 */
export function enumerateSampleRaw(
  params: ChemEnumParams, sampleSize: number, rand: () => number = Math.random,
): ChemEnumResult[] {
  const picked: ChemEnumResult[] = [];
  let accepted = 0; // successfully joined items seen before the current one

  for (const {core, assignment} of iterateAssignments(params)) {
    const normalizedByNum = new Map<number, string>();
    assignment.forEach((rg, rNumber) => normalizedByNum.set(rNumber, rg.smiles));
    const joined = buildJoinedSmiles(core.smiles, normalizedByNum);
    if (!joined) continue;

    const originalByNum = new Map<number, string>();
    assignment.forEach((rg, rNumber) => originalByNum.set(rNumber, rg.originalSmiles));
    const candidate: ChemEnumResult = {
      smiles: joined, coreSmiles: core.originalSmiles, rGroupSmilesByNum: originalByNum,
    };

    if (picked.length >= sampleSize) {
      // Reservoir full: replace a random slot with probability sampleSize / (accepted + 1).
      const slot = Math.floor(rand() * (accepted + 1));
      if (slot < sampleSize) picked[slot] = candidate;
    } else {
      picked.push(candidate);
    }
    accepted += 1;
  }
  return picked;
}
|
|
@@ -19,12 +19,10 @@ import {doPolyToolConvert} from './conversion/pt-conversion';
|
|
|
19
19
|
import {getOverriddenLibrary} from './conversion/pt-synthetic';
|
|
20
20
|
import {defaultErrorHandler} from '../utils/err-info';
|
|
21
21
|
import {getLibrariesList} from './utils';
|
|
22
|
-
import {getEnumerationChem, PT_CHEM_EXAMPLE} from './pt-enumeration-chem';
|
|
23
|
-
|
|
24
22
|
import {
|
|
25
|
-
PT_ERROR_DATAFRAME, PT_UI_ADD_HELM, PT_UI_DIALOG_CONVERSION,
|
|
23
|
+
PT_ERROR_DATAFRAME, PT_UI_ADD_HELM, PT_UI_DIALOG_CONVERSION,
|
|
26
24
|
PT_UI_GET_HELM, PT_UI_LINEARIZE, PT_UI_LINEARIZE_TT,
|
|
27
|
-
PT_UI_HIGHLIGHT_MONOMERS, PT_UI_RULES_USED, PT_UI_USE_CHIRALITY
|
|
25
|
+
PT_UI_HIGHLIGHT_MONOMERS, PT_UI_RULES_USED, PT_UI_USE_CHIRALITY,
|
|
28
26
|
} from './const';
|
|
29
27
|
|
|
30
28
|
import {_package} from '../package';
|
|
@@ -50,21 +48,6 @@ type PolyToolConvertSerialized = {
|
|
|
50
48
|
rules: string[];
|
|
51
49
|
};
|
|
52
50
|
|
|
53
|
-
type PolyToolEnumerateChemSerialized = {
|
|
54
|
-
mol: string;
|
|
55
|
-
screenLibrary: string | null;
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
export async function polyToolEnumerateChemUI(cell?: DG.Cell): Promise<void> {
|
|
59
|
-
await _package.initPromise;
|
|
60
|
-
try {
|
|
61
|
-
const dialog = await getPolyToolEnumerationChemDialog(cell);
|
|
62
|
-
dialog.show({resizable: true});
|
|
63
|
-
} catch (_err: any) {
|
|
64
|
-
grok.shell.warning('To run PolyTool Enumeration, sketch the molecule and specify the R group to vary');
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
|
|
68
51
|
export async function polyToolConvertUI(): Promise<void> {
|
|
69
52
|
await _package.initPromise;
|
|
70
53
|
let dialog: DG.Dialog | null = null;
|
|
@@ -178,111 +161,6 @@ export async function getPolyToolConvertDialog(srcCol?: DG.Column): Promise<DG.D
|
|
|
178
161
|
}
|
|
179
162
|
}
|
|
180
163
|
|
|
181
|
-
async function getPolyToolEnumerationChemDialog(cell?: DG.Cell): Promise<DG.Dialog> {
|
|
182
|
-
const subs: Unsubscribable[] = [];
|
|
183
|
-
const destroy = () => {
|
|
184
|
-
for (const sub of subs) sub.unsubscribe();
|
|
185
|
-
};
|
|
186
|
-
try {
|
|
187
|
-
const [libList, helmHelper] = await Promise.all([
|
|
188
|
-
getLibrariesList(), getHelmHelper()]);
|
|
189
|
-
|
|
190
|
-
const molStr = (cell && cell.rowIndex >= 0) ? cell.value : PT_CHEM_EXAMPLE;//cell ? cell.value : PT_CHEM_EXAMPLE;
|
|
191
|
-
let molfileValue: string = await (async (): Promise<string> => {
|
|
192
|
-
if (DG.chem.isMolBlock(molStr)) return molStr;
|
|
193
|
-
return (await grok.functions.call('Chem:convertMolNotation', {
|
|
194
|
-
molecule: molStr,
|
|
195
|
-
sourceNotation: cell?.column.getTag(DG.TAGS.UNITS) ?? DG.chem.Notation.Unknown,
|
|
196
|
-
targetNotation: DG.chem.Notation.MolBlock,
|
|
197
|
-
}));
|
|
198
|
-
})();
|
|
199
|
-
|
|
200
|
-
const molInput = new DG.chem.Sketcher(DG.chem.SKETCHER_MODE.EXTERNAL);
|
|
201
|
-
molInput.syncCurrentObject = false;
|
|
202
|
-
// sketcher.setMolFile(col.tags[ALIGN_BY_SCAFFOLD_TAG]);
|
|
203
|
-
molInput.onChanged.subscribe((_: any) => {
|
|
204
|
-
molfileValue = molInput.getMolFile();
|
|
205
|
-
});
|
|
206
|
-
molInput.root.classList.add('ui-input-editor');
|
|
207
|
-
molInput.root.style.marginTop = '3px';
|
|
208
|
-
molInput.setMolFile(molfileValue);
|
|
209
|
-
|
|
210
|
-
//const helmInput = helmHelper.createHelmInput('Macromolecule', {value: helmValue});
|
|
211
|
-
const screenLibraryInput = ui.input.choice('Library to use', {
|
|
212
|
-
value: libList.length ? libList[0] : null,
|
|
213
|
-
items: libList,
|
|
214
|
-
nullable: false,
|
|
215
|
-
onValueChanged: () => {
|
|
216
|
-
dialog.getButton('OK').disabled = screenLibraryInput.value === null;
|
|
217
|
-
}
|
|
218
|
-
});
|
|
219
|
-
|
|
220
|
-
molInput.root.setAttribute('style', `min-width:250px!important;`);
|
|
221
|
-
molInput.root.setAttribute('style', `max-width:250px!important;`);
|
|
222
|
-
screenLibraryInput.input.setAttribute('style', `min-width:250px!important;`);
|
|
223
|
-
|
|
224
|
-
const div = ui.div([
|
|
225
|
-
molInput.root,
|
|
226
|
-
screenLibraryInput.root
|
|
227
|
-
]);
|
|
228
|
-
|
|
229
|
-
subs.push(grok.events.onCurrentCellChanged.subscribe(() => {
|
|
230
|
-
const cell = grok.shell.tv.dataFrame.currentCell;
|
|
231
|
-
|
|
232
|
-
if (cell.column.semType === DG.SEMTYPE.MOLECULE)
|
|
233
|
-
molInput.setValue(cell.value);
|
|
234
|
-
}));
|
|
235
|
-
|
|
236
|
-
const exec = async (): Promise<void> => {
|
|
237
|
-
try {
|
|
238
|
-
const molString = molInput.getMolFile();
|
|
239
|
-
|
|
240
|
-
if (molString === undefined || molString === '') {
|
|
241
|
-
grok.shell.warning('PolyTool: no molecule was provided');
|
|
242
|
-
} else if (!molString.includes('R#')) {
|
|
243
|
-
grok.shell.warning('PolyTool: no R group was provided');
|
|
244
|
-
} else {
|
|
245
|
-
const molecules = await getEnumerationChem(molString, screenLibraryInput.value!);
|
|
246
|
-
const molCol = DG.Column.fromStrings('Enumerated', molecules);
|
|
247
|
-
const df = DG.DataFrame.fromColumns([molCol]);
|
|
248
|
-
grok.shell.addTableView(df);
|
|
249
|
-
}
|
|
250
|
-
} catch (err: any) {
|
|
251
|
-
defaultErrorHandler(err);
|
|
252
|
-
}
|
|
253
|
-
};
|
|
254
|
-
|
|
255
|
-
// Displays the molecule from a current cell (monitors changes)
|
|
256
|
-
const dialog = ui.dialog(PT_UI_DIALOG_ENUMERATION)
|
|
257
|
-
.add(div)
|
|
258
|
-
.onOK(() => {
|
|
259
|
-
exec().finally(() => { destroy(); });
|
|
260
|
-
})
|
|
261
|
-
.onCancel(() => {
|
|
262
|
-
destroy();
|
|
263
|
-
});
|
|
264
|
-
subs.push(dialog.onClose.subscribe(() => {
|
|
265
|
-
destroy();
|
|
266
|
-
}));
|
|
267
|
-
dialog.history(
|
|
268
|
-
/* getInput */ (): PolyToolEnumerateChemSerialized => {
|
|
269
|
-
return {
|
|
270
|
-
mol: molInput.getMolFile(),
|
|
271
|
-
screenLibrary: screenLibraryInput.value,
|
|
272
|
-
};
|
|
273
|
-
},
|
|
274
|
-
/* applyInput */ (x: PolyToolEnumerateChemSerialized): void => {
|
|
275
|
-
molInput.setMolFile(x.mol);
|
|
276
|
-
screenLibraryInput.value = x.screenLibrary;
|
|
277
|
-
});
|
|
278
|
-
dialog.getButton('OK').disabled = screenLibraryInput.value === null;
|
|
279
|
-
return dialog;
|
|
280
|
-
} catch (err: any) {
|
|
281
|
-
destroy();
|
|
282
|
-
throw err;
|
|
283
|
-
}
|
|
284
|
-
}
|
|
285
|
-
|
|
286
164
|
/** Returns Helm and molfile columns. */
|
|
287
165
|
export async function polyToolConvert(seqCol: DG.Column<string>,
|
|
288
166
|
generateHelm: boolean, linearize: boolean, chiralityEngine: boolean, highlight: boolean, ruleFiles: string[]
|