@datagrok/bio 2.25.12 → 2.25.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package-test.js +3 -3
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +3 -3
- package/dist/package.js.map +1 -1
- package/package.json +1 -1
- package/src/demo/bio03-atomic-level.ts +1 -2
- package/src/substructure-search/substructure-search.ts +1 -2
- package/src/utils/detect-macromolecule-probe.ts +1 -2
- package/src/utils/monomer-lib/lib-manager.ts +1 -2
- package/src/utils/monomer-lib/monomer-manager/match-molecules.ts +347 -0
- package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts +4 -82
- package/src/utils/multiple-sequence-alignment-ui.ts +1 -2
- package/src/utils/seq-helper/seq-handler.ts +14 -4
- package/src/utils/split-to-monomers.ts +1 -2
- package/src/viewers/vd-regions-viewer.ts +1 -1
- package/src/viewers/web-logo-viewer.ts +1 -1
- package/src/widgets/bio-substructure-filter-helm.ts +2 -3
- package/src/widgets/bio-substructure-filter.ts +2 -2
- package/test-console-output-1.log +493 -520
- package/test-record-1.mp4 +0 -0
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Davit Rizhinashvili",
|
|
6
6
|
"email": "drizhinashvili@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.25.12",
|
|
8
|
+
"version": "2.25.14",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -4,7 +4,6 @@ import * as DG from 'datagrok-api/dg';
|
|
|
4
4
|
import {_package, PackageFunctions} from '../package';
|
|
5
5
|
import {handleError} from './utils';
|
|
6
6
|
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
7
|
-
import {delay} from '@datagrok-libraries/test/src/test';
|
|
8
7
|
import {adjustGridcolAfterRender} from '../utils/ui-utils';
|
|
9
8
|
|
|
10
9
|
export async function demoToAtomicLevel(): Promise<void> {
|
|
@@ -63,7 +62,7 @@ export async function demoBio03UI(): Promise<void> {
|
|
|
63
62
|
dlg = ui.dialog()
|
|
64
63
|
.add(sketcher)
|
|
65
64
|
.show();
|
|
66
|
-
await delay(3000);
|
|
65
|
+
await DG.delay(3000);
|
|
67
66
|
dlg.close();
|
|
68
67
|
}, {
|
|
69
68
|
description: 'Display atomic level structure within a sketcher.',
|
|
@@ -2,7 +2,6 @@ import * as grok from 'datagrok-api/grok';
|
|
|
2
2
|
import * as ui from 'datagrok-api/ui';
|
|
3
3
|
import * as DG from 'datagrok-api/dg';
|
|
4
4
|
|
|
5
|
-
import {delay} from '@datagrok-libraries/test/src/test';
|
|
6
5
|
import {TAGS as bioTAGS, NOTATION} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
7
6
|
import {ISeqHelper} from '@datagrok-libraries/bio/src/utils/seq-helper';
|
|
8
7
|
|
|
@@ -154,7 +153,7 @@ export async function helmSubstructureSearch(
|
|
|
154
153
|
export async function invalidateMols(col: DG.Column<string>, seqHelper: ISeqHelper, pattern: boolean): Promise<void> {
|
|
155
154
|
const progressBar = DG.TaskBarProgressIndicator.create(`Invalidating molfiles for ${col.name}`);
|
|
156
155
|
try {
|
|
157
|
-
await delay(10);
|
|
156
|
+
await DG.delay(10);
|
|
158
157
|
const monomersDict = new Map();
|
|
159
158
|
const monomericMolsCol = await getMonomericMols(col, seqHelper, pattern, monomersDict);
|
|
160
159
|
col.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS] = monomericMolsCol;
|
|
@@ -4,7 +4,6 @@ import * as DG from 'datagrok-api/dg';
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
import {_package} from '../package';
|
|
7
|
-
import {delay} from '@datagrok-libraries/test/src/test';
|
|
8
7
|
|
|
9
8
|
type IDetectorReport = { categoriesSample: any[], rejectReason: string };
|
|
10
9
|
|
|
@@ -31,7 +30,7 @@ export async function detectMacromoleculeProbeDo(
|
|
|
31
30
|
if ((progress - progressLast) >= 0.1) {
|
|
32
31
|
progressLast = progress;
|
|
33
32
|
pi.update(100 * progress, `detectMacromolecule probe ${failCount}/${i}/${probeCount} ...`);
|
|
34
|
-
await delay(0);
|
|
33
|
+
await DG.delay(0);
|
|
35
34
|
}
|
|
36
35
|
}
|
|
37
36
|
if (failCount > 0)
|
|
@@ -4,7 +4,6 @@ import * as grok from 'datagrok-api/grok';
|
|
|
4
4
|
import * as ui from 'datagrok-api/ui';
|
|
5
5
|
import * as DG from 'datagrok-api/dg';
|
|
6
6
|
|
|
7
|
-
import {delay} from '@datagrok-libraries/test/src/test';
|
|
8
7
|
import {ILogger} from '@datagrok-libraries/bio/src/utils/logger';
|
|
9
8
|
import {DEFAULT_FILES_LIB_PROVIDER_NAME, findProviderWithLibraryName, IMonomerLib, IMonomerSet} from '@datagrok-libraries/bio/src/types/monomer-library';
|
|
10
9
|
import {
|
|
@@ -91,7 +90,7 @@ export class MonomerLibManager implements IMonomerLibHelper {
|
|
|
91
90
|
return true;
|
|
92
91
|
})(),
|
|
93
92
|
(async () => {
|
|
94
|
-
await delay(timeout);
|
|
93
|
+
await DG.delay(timeout);
|
|
95
94
|
return false;
|
|
96
95
|
})(),
|
|
97
96
|
]).then((res) => {
|
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
/* eslint-disable max-lines-per-function */
|
|
2
|
+
/* eslint-disable max-len */
|
|
3
|
+
import * as grok from 'datagrok-api/grok';
|
|
4
|
+
import * as DG from 'datagrok-api/dg';
|
|
5
|
+
|
|
6
|
+
import {IMonomerLib, Monomer} from '@datagrok-libraries/bio/src/types/monomer-library';
|
|
7
|
+
import {PolymerType} from '@datagrok-libraries/bio/src/helm/types';
|
|
8
|
+
|
|
9
|
+
import {STANDRARD_R_GROUPS} from './const';
|
|
10
|
+
import {standardiseMonomers, capSmiles, getCorrectedSmiles} from './monomer-manager';
|
|
11
|
+
|
|
12
|
+
/** One library monomer that was identified as matching an input molecule. */
interface MonomerMatch {
  /** Symbol of the monomer in the library. */
  symbol: string;
  /** SMILES the match was keyed on (canonical, possibly capped). */
  smiles: string;
  /** SMILES exactly as taken from the monomer definition. */
  original: string;
  /** Name of the library source the monomer came from. */
  source: string;
}

/** Lookup table from canonical SMILES to every monomer that shares that SMILES. */
type MonomerSmilesMap = Record<string, MonomerMatch[]>;

/** Delimiter placed between symbols when one molecule matches several monomers. */
const MATCH_SEPARATOR = ' | ';
|
|
28
|
+
|
|
29
|
+
/**
 * Produces the two SMILES lookup tables used for exact matching.
 *
 * - `uncappedMap` is keyed by each monomer's own SMILES, taken as-is.
 * - `cappedMap` is keyed by the SMILES obtained after `capSmiles` substitutes the
 *   monomer's R-groups and `converterFunc` re-canonicalizes the result.
 *
 * Every key holds an array, so monomers that share a structure are all kept.
 *
 * @param fixedMonomers standardized monomers whose SMILES / R-groups are read
 * @param originalMonomers monomers at the same indices as `fixedMonomers`; only their
 *   library source is read (for the capped entries)
 * @param converterFunc function called with `{molecule, targetNotation}` to canonicalize a column
 * @returns both lookup tables
 * @throws Error when the converter returns nothing or a column of a different length
 */
async function buildMonomerSmilesMaps(
  fixedMonomers: Monomer[], originalMonomers: Monomer[], converterFunc: DG.Func,
): Promise<{cappedMap: MonomerSmilesMap; uncappedMap: MonomerSmilesMap}> {
  // table keyed by the monomer SMILES exactly as provided
  const uncappedMap: MonomerSmilesMap = {};
  fixedMonomers.forEach((monomer) => {
    const rawSmiles = monomer.smiles;
    if (!rawSmiles) return;
    const bucket = uncappedMap[rawSmiles] ?? (uncappedMap[rawSmiles] = []);
    bucket.push({symbol: monomer.symbol, smiles: rawSmiles, original: rawSmiles, source: monomer.lib?.source ?? ''});
  });

  // cap every monomer; drop the ones that end up empty or still carry an R-group placeholder
  const cappedEntries: MonomerMatch[] = [];
  for (let idx = 0; idx < fixedMonomers.length; idx++) {
    const monomer = fixedMonomers[idx];
    const capped = capSmiles(monomer.smiles ?? '', monomer.rgroups ?? []);
    if (!capped || capped.includes('[*:')) continue;
    cappedEntries.push({
      symbol: monomer.symbol,
      smiles: capped,
      original: monomer.smiles,
      source: originalMonomers[idx]?.lib?.source ?? '',
    });
  }

  // one bulk conversion call canonicalizes all capped SMILES
  const cappedSmilesCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'CappedSmiles', cappedEntries.map((entry) => entry.smiles));
  cappedSmilesCol.semType = DG.SEMTYPE.MOLECULE;
  const canonicalCappedCol: DG.Column = await converterFunc.apply({molecule: cappedSmilesCol, targetNotation: DG.chem.Notation.Smiles});
  const sizesAgree = canonicalCappedCol && canonicalCappedCol.length === cappedSmilesCol.length;
  if (!sizesAgree)
    throw new Error('Error canonicalizing capped monomer SMILES');

  // key the capped table by the canonical form; rows the converter left empty are skipped
  const cappedMap: MonomerSmilesMap = {};
  const canonicalCappedList = canonicalCappedCol.toList();
  for (let idx = 0; idx < canonicalCappedList.length; idx++) {
    const canonical = canonicalCappedList[idx];
    if (!canonical) continue;
    const entry = cappedEntries[idx];
    entry.smiles = canonical;
    const bucket = cappedMap[canonical] ?? (cappedMap[canonical] = []);
    bucket.push(entry);
  }

  return {cappedMap, uncappedMap};
}
|
|
78
|
+
|
|
79
|
+
/**
 * Corrects and canonicalizes the input molecule column.
 *
 * Each non-empty value is passed through `getCorrectedSmiles`; a value containing a
 * newline is handed over as a molblock, anything else as SMILES. When correction
 * throws, the raw value is kept so the converter still gets a chance to process it.
 * The corrected values are then canonicalized in one call to `converterFunc`.
 *
 * @param molDf dataframe holding the molecules (read only)
 * @param molColName name of the molecule column inside `molDf`
 * @param converterFunc function called with `{molecule, targetNotation}` to canonicalize a column
 * @returns the converter output as a list, one entry per input row
 * @throws Error when the column is missing, or when the converter returns nothing or a
 *   column of a different length
 */
async function canonicalizeMolecules(
  molDf: DG.DataFrame, molColName: string, converterFunc: DG.Func,
): Promise<(string | null)[]> {
  // fail with a descriptive message instead of a TypeError on a missing column
  const moleculesOriginalCol = molDf.col(molColName);
  if (!moleculesOriginalCol)
    throw new Error(`Column '${molColName}' not found in the molecules dataframe`);

  const correctedList = moleculesOriginalCol.toList().map((s: string | null) => {
    if (!s) return s;
    try {
      // a newline is what distinguishes a molblock from a SMILES string here
      const isMolBlock = s.includes('\n');
      return getCorrectedSmiles([], isMolBlock ? undefined : s, isMolBlock ? s : undefined);
    } catch (_e) {
      // best effort: keep the raw value when correction is not possible
      return s;
    }
  });

  const correctedCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'MoleculesCorrected', correctedList);
  correctedCol.semType = DG.SEMTYPE.MOLECULE;
  // dummy df needed for semtype detection by converterFunc
  const _ddf = DG.DataFrame.fromColumns([correctedCol]);

  const canonicalCol: DG.Column = await converterFunc.apply({molecule: correctedCol, targetNotation: DG.chem.Notation.Smiles});
  if (!canonicalCol || canonicalCol.length !== correctedCol.length)
    throw new Error('Error canonicalizing molecules');

  return canonicalCol.toList();
}
|
|
109
|
+
|
|
110
|
+
/**
 * Looks up one canonical molecule SMILES in the monomer tables.
 *
 * The capped table is consulted first and the uncapped one only when the capped
 * table has no entry for the key. If that yields nothing, the molecule is capped with
 * the standard R-groups, converted back to SMILES and looked up once more.
 *
 * @param canonicalMol canonical SMILES of the molecule
 * @param cappedMap table keyed by capped canonical monomer SMILES
 * @param uncappedMap table keyed by raw monomer SMILES
 * @returns every monomer stored under the matching key, or an empty array
 */
function matchBySmiles(
  canonicalMol: string, cappedMap: MonomerSmilesMap, uncappedMap: MonomerSmilesMap,
): MonomerMatch[] {
  const lookup = (key: string): MonomerMatch[] | undefined => cappedMap[key] ?? uncappedMap[key];

  // first attempt: the molecule exactly as given
  const direct = lookup(canonicalMol);
  if (direct?.length) return direct;

  // second attempt: only worthwhile when capping actually changed the molecule
  const cappedMol = capSmiles(canonicalMol, STANDRARD_R_GROUPS);
  if (cappedMol === canonicalMol) return [];

  const correctedMol = grok.chem.convert(cappedMol, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles);
  const viaCapping = lookup(correctedMol);
  return viaCapping?.length ? viaCapping : [];
}
|
|
132
|
+
|
|
133
|
+
/**
 * Indexes the capped monomers by fingerprint.
 *
 * Calls `Chem:getMorganFingerprints` once for all keys of `cappedMap` and groups the
 * monomers stored under each key by the binary-string form of the returned fingerprint.
 * Rows for which no fingerprint is returned are skipped.
 *
 * @param cappedMap table keyed by capped canonical monomer SMILES
 * @returns `fpMap` (fingerprint string -> monomers) and the list of SMILES keys that
 *   was fingerprinted (empty when `cappedMap` is empty)
 */
async function buildMonomerFingerprintMap(
  cappedMap: MonomerSmilesMap,
): Promise<{fpMap: {[fpString: string]: MonomerMatch[]}; cappedSmilesList: string[]}> {
  const fpMap: {[fpString: string]: MonomerMatch[]} = {};
  const cappedSmilesList = Object.keys(cappedMap);
  // nothing to fingerprint: skip the Chem call altogether
  if (cappedSmilesList.length === 0)
    return {fpMap, cappedSmilesList};

  const monomerCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'MonomerSmiles', cappedSmilesList);
  monomerCol.semType = DG.SEMTYPE.MOLECULE;
  const fpCol: DG.Column = await grok.functions.call('Chem:getMorganFingerprints', {molColumn: monomerCol});

  for (let row = 0; row < fpCol.length; row++) {
    const fingerprint: DG.BitSet | null = fpCol.get(row);
    if (fingerprint) {
      const key = fingerprint.toBinaryString();
      // row `row` of the fingerprint column corresponds to cappedSmilesList[row]
      const monomersForRow = cappedMap[cappedSmilesList[row]] ?? [];
      const bucket = fpMap[key] ?? (fpMap[key] = []);
      bucket.push(...monomersForRow);
    }
  }

  return {fpMap, cappedSmilesList};
}
|
|
164
|
+
|
|
165
|
+
/**
 * Fingerprint-based lookup for molecules that had no exact SMILES match.
 *
 * For each index in `unmatchedIndices` two candidate SMILES are prepared: the canonical
 * molecule itself and a version capped with the standard R-groups (re-converted to
 * SMILES when capping changed it). Fingerprints for both lists are requested from
 * `Chem:getMorganFingerprints`; the uncapped fingerprint is tried first and the capped
 * one only when the uncapped lookup finds nothing.
 *
 * @param unmatchedIndices row indices (into `canonicalizedMolecules`) still without a match
 * @param canonicalizedMolecules canonical SMILES per row; empty entries are treated as ''
 * @param monomerFpMap fingerprint binary string -> monomers
 * @returns map from row index to the monomers found for it; rows without a hit are absent
 */
async function matchByFingerprint(
  unmatchedIndices: number[],
  canonicalizedMolecules: (string | null)[],
  monomerFpMap: {[fpString: string]: MonomerMatch[]},
): Promise<Map<number, MonomerMatch[]>> {
  const results = new Map<number, MonomerMatch[]>();
  const nothingToDo = unmatchedIndices.length === 0 || Object.keys(monomerFpMap).length === 0;
  if (nothingToDo)
    return results;

  // SMILES of the unmatched molecules, as given and with standard caps applied
  const uncappedSmiles: string[] = [];
  const cappedSmiles: string[] = [];
  for (const molIdx of unmatchedIndices) {
    const plain = canonicalizedMolecules[molIdx] ?? '';
    uncappedSmiles.push(plain);
    if (!plain) {
      cappedSmiles.push('');
      continue;
    }
    const capped = capSmiles(plain, STANDRARD_R_GROUPS);
    if (capped !== plain)
      cappedSmiles.push(grok.chem.convert(capped, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles));
    else
      cappedSmiles.push(plain);
  }

  const uncappedCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'UnmatchedMols', uncappedSmiles);
  uncappedCol.semType = DG.SEMTYPE.MOLECULE;
  const cappedCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'UnmatchedMolsCapped', cappedSmiles);
  cappedCol.semType = DG.SEMTYPE.MOLECULE;

  // both fingerprint requests are issued together
  const [uncappedFpCol, cappedFpCol]: [DG.Column, DG.Column] = await Promise.all([
    grok.functions.call('Chem:getMorganFingerprints', {molColumn: uncappedCol}),
    grok.functions.call('Chem:getMorganFingerprints', {molColumn: cappedCol}),
  ]);

  // resolves a fingerprint to its monomers; undefined when there is no usable hit
  const hitsFor = (fp: DG.BitSet | null): MonomerMatch[] | undefined => {
    if (!fp) return undefined;
    const hits = monomerFpMap[fp.toBinaryString()];
    return hits && hits.length > 0 ? hits : undefined;
  };

  unmatchedIndices.forEach((molIdx, pos) => {
    // the capped fingerprint is only read when the uncapped one gave nothing
    const hits = hitsFor(uncappedFpCol.get(pos)) ?? hitsFor(cappedFpCol.get(pos));
    if (hits) results.set(molIdx, hits);
  });

  return results;
}
|
|
226
|
+
|
|
227
|
+
/**
 * Keeps the first match seen for every monomer symbol and drops later repeats.
 * The relative order of the surviving matches is the order of first appearance.
 *
 * @param matches matches that may contain the same symbol more than once
 * @returns a new array with one entry per distinct symbol
 */
function deduplicateMatches(matches: MonomerMatch[]): MonomerMatch[] {
  // Map iteration follows insertion order, which preserves first-appearance order
  const firstBySymbol = new Map<string, MonomerMatch>();
  for (const candidate of matches) {
    if (!firstBySymbol.has(candidate.symbol))
      firstBySymbol.set(candidate.symbol, candidate);
  }
  return Array.from(firstBySymbol.values());
}
|
|
236
|
+
|
|
237
|
+
/**
 * Builds the comma-separated list of library sources for a set of matches.
 *
 * When `duplicates` has a non-empty entry for a match's symbol, the sources of those
 * duplicate monomers are used; otherwise the match's own source is used. Empty
 * sources are ignored and each source name appears once, in first-seen order.
 *
 * @param matches matched monomers
 * @param duplicates monomers known under the same symbol, keyed by symbol
 * @returns distinct source names joined with ', ' ('' when there are none)
 */
function collectSources(
  matches: MonomerMatch[], duplicates: {[symbol: string]: Monomer[]},
): string {
  const candidateNames = matches.flatMap((match) => {
    const known = duplicates[match.symbol];
    // the duplicates registry takes precedence over the match's own source
    if (known && known.length > 0)
      return known.map((dup) => dup?.lib?.source);
    return [match.source];
  });
  // Set drops repeats while keeping first-seen order
  const distinctNames = new Set(candidateNames.filter((name) => !!name));
  return [...distinctNames].join(', ');
}
|
|
255
|
+
|
|
256
|
+
/**
 * Matches molecules in a dataframe with monomers from a monomer library.
 *
 * Matching pipeline:
 * 1. Standardize monomers and build SMILES lookup maps (capped & uncapped)
 * 2. Canonicalize input molecules
 * 3. Phase 1: exact canonical SMILES matching (capped, uncapped, and fallback-capped molecule)
 * 4. Phase 2: Morgan fingerprint fallback for molecules that didn't match by SMILES;
 *    a failure in this phase is logged and the SMILES matches are kept
 * 5. Populate result columns (multiple matched symbols are joined with MATCH_SEPARATOR;
 *    the SMILES column holds the SMILES of the first unique match)
 *
 * @param molDf dataframe with the molecules; it is cloned, the input is not modified
 * @param molColName name of the molecule column in `molDf`
 * @param monomerLib library that supplies the candidate monomers
 * @param polymerType polymer type whose monomers are considered
 * @returns cloned input DataFrame with added match columns
 * @throws Error when the Chem `convertMoleculeNotation` function cannot be found
 */
export async function matchMoleculesWithMonomers(
  molDf: DG.DataFrame, molColName: string, monomerLib: IMonomerLib, polymerType: PolymerType = 'PEPTIDE',
): Promise<DG.DataFrame> {
  const duplicates = monomerLib.duplicateMonomers?.[polymerType] ?? {};
  const converterFunc = DG.Func.find({package: 'Chem', name: 'convertMoleculeNotation'})[0];
  if (!converterFunc)
    throw new Error('Function convertMoleculeNotation not found, please install Chem package');

  // === Step 1: Standardize monomers and build SMILES lookup maps ===
  const monomers = monomerLib.getMonomerSymbolsByType(polymerType)
    .map((s) => monomerLib.getMonomer(polymerType, s)!)
    .filter((m) => m && (m.smiles || m.molfile));

  const fixedMonomers = await standardiseMonomers(monomers);
  // preserve library reference from original monomers (lost during standardization)
  fixedMonomers.forEach((m, i) => { m.lib = monomers[i].lib; });

  const {cappedMap, uncappedMap} = await buildMonomerSmilesMaps(fixedMonomers, monomers, converterFunc);

  // === Step 2: Canonicalize input molecules ===
  const canonicalizedMolecules = await canonicalizeMolecules(molDf, molColName, converterFunc);

  // === Step 3: Phase 1 — Exact canonical SMILES matching ===
  // matchResults[i] holds all MonomerMatch entries for molecule i (empty array if unmatched)
  const matchResults: MonomerMatch[][] = Array.from({length: canonicalizedMolecules.length}, () => []);
  const unmatchedIndices: number[] = [];

  for (let i = 0; i < canonicalizedMolecules.length; i++) {
    const mol = canonicalizedMolecules[i];
    // rows without a canonical SMILES take no part in either matching phase
    if (!mol) continue;
    const smilesMatches = matchBySmiles(mol, cappedMap, uncappedMap);
    if (smilesMatches.length > 0)
      matchResults[i] = smilesMatches;
    else
      unmatchedIndices.push(i);
  }

  // === Step 4: Phase 2 — Morgan fingerprint fallback for unmatched molecules ===
  if (unmatchedIndices.length > 0) {
    try {
      const {fpMap} = await buildMonomerFingerprintMap(cappedMap);
      const fpMatches = await matchByFingerprint(unmatchedIndices, canonicalizedMolecules, fpMap);
      for (const [idx, matches] of fpMatches)
        matchResults[idx] = matches;
    } catch (e) {
      // deliberate best effort: the exact matches from phase 1 are still returned
      console.warn('Fingerprint fallback matching failed, continuing with SMILES matches only:', e);
    }
  }

  // === Step 5: Populate result columns ===
  const resultDf = molDf.clone();
  const symbolCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer symbol'));
  const smilesCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer smiles'));
  smilesCol.semType = DG.SEMTYPE.MOLECULE;
  const sourceCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer source'));
  const matchCountCol = resultDf.columns.addNewInt(resultDf.columns.getUnusedName('Match count'));
  const matchMethodCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Match method'));
  resultDf.columns.setOrder([molColName, symbolCol.name, smilesCol.name, sourceCol.name, matchCountCol.name, matchMethodCol.name]);

  // rows that went through phase 2; a Set keeps the per-row membership check O(1)
  // instead of scanning the index array for every row
  const fingerprintPhaseRows = new Set(unmatchedIndices);

  for (let i = 0; i < matchResults.length; i++) {
    const matches = matchResults[i];
    if (matches.length === 0) continue;

    // deduplicate matches by symbol (same monomer can appear from multiple lookup paths)
    const uniqueMatches = deduplicateMatches(matches);

    // collect all sources, including duplicates from the monomer library
    const allSources = collectSources(uniqueMatches, duplicates);

    symbolCol.set(i, uniqueMatches.map((m) => m.symbol).join(MATCH_SEPARATOR), false);
    smilesCol.set(i, uniqueMatches[0].original ?? uniqueMatches[0].smiles, false);
    sourceCol.set(i, allSources, false);
    matchCountCol.set(i, uniqueMatches.length, false);
    // a matched row that was unmatched after phase 1 can only have been matched by fingerprint
    const method = fingerprintPhaseRows.has(i) ? 'fingerprint' : 'exact';
    matchMethodCol.set(i, method, false);
  }

  return resultDf;
}
|
|
@@ -15,11 +15,10 @@ import {PolymerType, MonomerType} from '@datagrok-libraries/bio/src/helm/types';
|
|
|
15
15
|
import {MonomerLibManager} from '../lib-manager';
|
|
16
16
|
|
|
17
17
|
import {MONOMER_RENDERER_TAGS} from '@datagrok-libraries/bio/src/utils/cell-renderer';
|
|
18
|
-
import {BioTags} from '@datagrok-libraries/bio/src/utils/macromolecule/consts';
|
|
18
|
+
import {BioTags, MONOMER_MOTIF_SPLITTER} from '@datagrok-libraries/bio/src/utils/macromolecule/consts';
|
|
19
19
|
//@ts-ignore
|
|
20
20
|
import '../../../../css/monomer-manager.css';
|
|
21
21
|
import {Subscription} from 'rxjs';
|
|
22
|
-
import {STANDRARD_R_GROUPS} from './const';
|
|
23
22
|
|
|
24
23
|
// columns of monomers dataframe, note that rgroups is hidden and will be displayed as separate columns
|
|
25
24
|
export enum MONOMER_DF_COLUMN_NAMES {
|
|
@@ -60,84 +59,7 @@ export async function standardiseMonomers(monomers: Monomer[]) {
|
|
|
60
59
|
return fixedMonomers;
|
|
61
60
|
}
|
|
62
61
|
|
|
63
|
-
|
|
64
|
-
export async function matchMoleculesWithMonomers(molDf: DG.DataFrame, molColName: string, monomerLib: IMonomerLib, polymerType: PolymerType = 'PEPTIDE'): Promise<DG.DataFrame> {
|
|
65
|
-
const duplicates = monomerLib.duplicateMonomers?.[polymerType] ?? {};
|
|
66
|
-
const converterFunc = DG.Func.find({package: 'Chem', name: 'convertMoleculeNotation'})[0];
|
|
67
|
-
if (!converterFunc)
|
|
68
|
-
throw new Error('Function convertMoleculeNotation not found, please install Chem package');
|
|
69
|
-
// first: stamdardize monomers
|
|
70
|
-
const monomers = monomerLib.getMonomerSymbolsByType(polymerType).map((s) => monomerLib.getMonomer(polymerType, s)!).filter((m) => m && (m.smiles || m.molfile));
|
|
71
|
-
const fixedMonomers = await standardiseMonomers(monomers);
|
|
72
|
-
fixedMonomers.forEach((m, i) => {
|
|
73
|
-
m.lib = monomers[i].lib;
|
|
74
|
-
});
|
|
75
|
-
const unCappedMonomerSmilesMap = fixedMonomers.filter((m) => !!m.smiles).reduce((acc, m) => {
|
|
76
|
-
acc[m.smiles] = {symbol: m.symbol, smiles: m.smiles, original: m.smiles, source: m.lib?.source}; return acc;
|
|
77
|
-
}, {} as {[smiles: string]: {symbol: string, smiles: string, original: string | undefined, source: string | undefined}});
|
|
78
|
-
const cappedMonomerSmiles = fixedMonomers.map((m, i) => ({symbol: m.symbol, smiles: capSmiles(m.smiles ?? '', m.rgroups ?? []), original: m.smiles, source: monomers[i]?.lib?.source}))
|
|
79
|
-
.filter((s) => !!s?.smiles && !s.smiles.includes('[*:'));
|
|
80
|
-
|
|
81
|
-
// canonicalize all monomer smiles
|
|
82
|
-
const monomerSmilesCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'MonomerSmiles', cappedMonomerSmiles.map((m) => m.smiles!));
|
|
83
|
-
monomerSmilesCol.semType = DG.SEMTYPE.MOLECULE;
|
|
84
|
-
const canonicalizedMonomersSmilesCol: DG.Column = await converterFunc.apply({molecule: monomerSmilesCol, targetNotation: DG.chem.Notation.Smiles});
|
|
85
|
-
if (!canonicalizedMonomersSmilesCol || canonicalizedMonomersSmilesCol.length !== monomerSmilesCol.length)
|
|
86
|
-
throw new Error('Error canonicalizing monomer smiles');
|
|
87
|
-
canonicalizedMonomersSmilesCol.toList().forEach((s, i) => cappedMonomerSmiles[i].smiles = s);
|
|
88
|
-
const cappedMonomerSmilesMap = cappedMonomerSmiles.reduce((acc, m) => { acc[m.smiles] = m; return acc; }, {} as {[smiles: string]: {symbol: string, smiles: string, original: string | undefined, source: string | undefined}});
|
|
89
|
-
|
|
90
|
-
const moleculesOriginalCol = molDf.col(molColName)!;
|
|
91
|
-
const correctedOriginalList = moleculesOriginalCol.toList().map((s) => {
|
|
92
|
-
if (!s) return s;
|
|
93
|
-
try {
|
|
94
|
-
const isMolBlock = s.includes('\n');
|
|
95
|
-
return getCorrectedSmiles([], isMolBlock ? undefined : s, isMolBlock ? s : undefined);
|
|
96
|
-
} catch (_e) {
|
|
97
|
-
return s;
|
|
98
|
-
}
|
|
99
|
-
});
|
|
100
|
-
const moleculesOriginalColCorrected = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'MoleculesOriginalCorrected', correctedOriginalList);
|
|
101
|
-
// create dummy df
|
|
102
|
-
moleculesOriginalColCorrected.semType = DG.SEMTYPE.MOLECULE;
|
|
103
|
-
const _ddf = DG.DataFrame.fromColumns([moleculesOriginalColCorrected]);
|
|
104
|
-
const canonicalizedMoleculesCol: DG.Column = await converterFunc.apply({molecule: moleculesOriginalColCorrected, targetNotation: DG.chem.Notation.Smiles});
|
|
105
|
-
if (!canonicalizedMoleculesCol || canonicalizedMoleculesCol.length !== moleculesOriginalColCorrected.length)
|
|
106
|
-
throw new Error('Error canonicalizing molecules');
|
|
107
|
-
|
|
108
|
-
const canonicalizedMolecules = canonicalizedMoleculesCol.toList();
|
|
109
|
-
|
|
110
|
-
const resultDf = molDf.clone();
|
|
111
|
-
const matchingMonomerSmilesCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer smiles'));
|
|
112
|
-
matchingMonomerSmilesCol.semType = DG.SEMTYPE.MOLECULE;
|
|
113
|
-
const matchingMonomerSymbolCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer symbol'));
|
|
114
|
-
matchingMonomerSymbolCol.semType = 'Monomer';
|
|
115
|
-
const sourceLibCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer source'));
|
|
116
|
-
resultDf.columns.setOrder([molColName, matchingMonomerSymbolCol.name, matchingMonomerSmilesCol.name, sourceLibCol.name]);
|
|
117
|
-
|
|
118
|
-
for (let i = 0; i < canonicalizedMolecules.length; i++) {
|
|
119
|
-
const mol = canonicalizedMolecules[i];
|
|
120
|
-
if (!mol) continue;
|
|
121
|
-
let match = cappedMonomerSmilesMap[mol] ?? unCappedMonomerSmilesMap[mol];
|
|
122
|
-
if (!match) {
|
|
123
|
-
// try capping the molecule and matching again
|
|
124
|
-
const cappedMol = capSmiles(mol, STANDRARD_R_GROUPS);
|
|
125
|
-
if (cappedMol !== mol) {
|
|
126
|
-
const correctedMol = grok.chem.convert(cappedMol, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles);
|
|
127
|
-
match = cappedMonomerSmilesMap[correctedMol] ?? unCappedMonomerSmilesMap[correctedMol];
|
|
128
|
-
}
|
|
129
|
-
}
|
|
130
|
-
if (match) {
|
|
131
|
-
const matchSymbol = match.symbol;
|
|
132
|
-
const sources = (duplicates[matchSymbol]?.length ?? 0) > 0 ? duplicates[matchSymbol].map((m) => m?.lib?.source).filter((s) => !!s).join(', ') : (match.source ?? '');
|
|
133
|
-
const originalSmiles = match.original ?? match.smiles;
|
|
134
|
-
matchingMonomerSmilesCol.set(i, originalSmiles, false);
|
|
135
|
-
matchingMonomerSymbolCol.set(i, matchSymbol, false);
|
|
136
|
-
sourceLibCol.set(i, sources, false);
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
return resultDf;
|
|
140
|
-
}
|
|
62
|
+
export {matchMoleculesWithMonomers} from './match-molecules';
|
|
141
63
|
|
|
142
64
|
/** Standardizes the monomer library
|
|
143
65
|
* warning: throws error if the library is not valid or has invalid monomers
|
|
@@ -1217,7 +1139,7 @@ function replaceAllylsInSmiles(smiles: string): string {
|
|
|
1217
1139
|
}
|
|
1218
1140
|
|
|
1219
1141
|
/**NB! Can throw error */
|
|
1220
|
-
function getCorrectedSmiles(rgroups: RGroup[], smiles?: string, molBlock?: string): string {
|
|
1142
|
+
export function getCorrectedSmiles(rgroups: RGroup[], smiles?: string, molBlock?: string): string {
|
|
1221
1143
|
if (smiles)
|
|
1222
1144
|
smiles = replaceAllylsInSmiles(smiles);
|
|
1223
1145
|
const isSmilesMalformed = !smiles || !grok.chem.checkSmiles(smiles);
|
|
@@ -1310,7 +1232,7 @@ export function getCorrectedMolBlock(molBlock: string) {
|
|
|
1310
1232
|
}
|
|
1311
1233
|
|
|
1312
1234
|
// reverse of r-group substitution, will substitute rgroups with cap groups
|
|
1313
|
-
function capSmiles(smiles: string, rgroups: RGroup[]) {
|
|
1235
|
+
export function capSmiles(smiles: string, rgroups: RGroup[]) {
|
|
1314
1236
|
let newSmiles = smiles;
|
|
1315
1237
|
rgroups.forEach((rg, i) => {
|
|
1316
1238
|
const rgroupNum = rg.label[1] ?? `${i + 1}`; // if label is not in format R#, use index as number
|
|
@@ -6,7 +6,6 @@ import * as DG from 'datagrok-api/dg';
|
|
|
6
6
|
import * as ui from 'datagrok-api/ui';
|
|
7
7
|
|
|
8
8
|
import {ColumnInputOptions} from '@datagrok-libraries/utils/src/type-declarations';
|
|
9
|
-
import {delay} from '@datagrok-libraries/test/src/test';
|
|
10
9
|
import {ALPHABET, NOTATION} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
11
10
|
import {ISeqHelper} from '@datagrok-libraries/bio/src/utils/seq-helper';
|
|
12
11
|
|
|
@@ -82,7 +81,7 @@ export async function multipleSequenceAlignmentUI(
|
|
|
82
81
|
table: table, value: seqCol, onValueChanged: async (value: DG.Column<any>) => {
|
|
83
82
|
if (!value || value.semType !== DG.SEMTYPE.MACROMOLECULE) {
|
|
84
83
|
okBtn.disabled = true;
|
|
85
|
-
await delay(0); // to
|
|
84
|
+
await DG.delay(0); // to
|
|
86
85
|
colInput.value = prevSeqCol as DG.Column<string>;
|
|
87
86
|
return;
|
|
88
87
|
}
|
|
@@ -50,9 +50,19 @@ export class SeqHandler implements ISeqHandler {
|
|
|
50
50
|
if (col.type !== DG.TYPE.STRING)
|
|
51
51
|
throw new Error(`Unexpected column type '${col.type}', must be '${DG.TYPE.STRING}'.`);
|
|
52
52
|
this._column = col;
|
|
53
|
-
|
|
54
|
-
if (!units)
|
|
55
|
-
throw new Error('Units are not specified in column');
|
|
53
|
+
let units: string | null = this._column.meta.units;
|
|
54
|
+
if (!units) {
|
|
55
|
+
// it may be from layout that the macromolecule semtype is set but every other tag is missing, so we manually run detectors
|
|
56
|
+
if (!this._column.temp['seqHandlerDetectorRun']) {
|
|
57
|
+
this._column.temp['seqHandlerDetectorRun'] = true;
|
|
58
|
+
const detectorFunc = DG.Func.find({name: 'detectMacromolecule', meta: {role: 'semTypeDetector'}})[0];
|
|
59
|
+
if (detectorFunc)
|
|
60
|
+
detectorFunc.applySync({col: this._column});
|
|
61
|
+
units = this._column.meta.units;
|
|
62
|
+
}
|
|
63
|
+
if (!units)
|
|
64
|
+
throw new Error('Units are not specified in column');
|
|
65
|
+
}
|
|
56
66
|
this._units = units!;
|
|
57
67
|
|
|
58
68
|
this._notation = this.getNotation();
|
|
@@ -182,7 +192,7 @@ export class SeqHandler implements ISeqHandler {
|
|
|
182
192
|
|
|
183
193
|
let aligned = uh.column.getTag(TAGS.aligned);
|
|
184
194
|
if (aligned == null) {
|
|
185
|
-
aligned = uh.stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
|
|
195
|
+
aligned = uh.stats.sameLength || uh.column.categories.slice(0, 5).filter((a) => !!a).every((a) => a.length > 100) ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
|
|
186
196
|
uh.column.setTag(TAGS.aligned, aligned);
|
|
187
197
|
}
|
|
188
198
|
|
|
@@ -2,7 +2,6 @@ import * as grok from 'datagrok-api/grok';
|
|
|
2
2
|
import * as ui from 'datagrok-api/ui';
|
|
3
3
|
import * as DG from 'datagrok-api/dg';
|
|
4
4
|
|
|
5
|
-
import {delay} from '@datagrok-libraries/test/src/test';
|
|
6
5
|
import {checkInputColumnUI} from './check-input-column';
|
|
7
6
|
import {splitAlignedSequences} from '@datagrok-libraries/bio/src/utils/splitter';
|
|
8
7
|
import * as C from './constants';
|
|
@@ -17,7 +16,7 @@ export async function splitToMonomersUI(
|
|
|
17
16
|
): Promise<DG.DataFrame> {
|
|
18
17
|
// Delay is required for initial function dialog to close before starting invalidating of molfiles.
|
|
19
18
|
// Otherwise, dialog is freezing
|
|
20
|
-
await delay(10);
|
|
19
|
+
await DG.delay(10);
|
|
21
20
|
if (!checkInputColumnUI(seqCol, 'Sequence space')) return table;
|
|
22
21
|
|
|
23
22
|
const seqHelper = _package.seqHelper;
|
|
@@ -4,7 +4,7 @@ import * as DG from 'datagrok-api/dg';
|
|
|
4
4
|
|
|
5
5
|
import {fromEvent, Observable, Subject, Unsubscribable} from 'rxjs';
|
|
6
6
|
|
|
7
|
-
import {testEvent} from '@datagrok-libraries/test/src/test';
|
|
7
|
+
import {testEvent} from '@datagrok-libraries/utils/src/test';
|
|
8
8
|
import {
|
|
9
9
|
IVdRegionsViewer,
|
|
10
10
|
VdRegion, VdRegionType,
|
|
@@ -22,7 +22,7 @@ import {
|
|
|
22
22
|
import {errorToConsole} from '@datagrok-libraries/utils/src/to-console';
|
|
23
23
|
import {intToHtmlA} from '@datagrok-libraries/utils/src/color';
|
|
24
24
|
import {ISeqSplitted} from '@datagrok-libraries/bio/src/utils/macromolecule/types';
|
|
25
|
-
import {testEvent} from '@datagrok-libraries/test/src/test';
|
|
25
|
+
import {testEvent} from '@datagrok-libraries/utils/src/test';
|
|
26
26
|
import {PromiseSyncer} from '@datagrok-libraries/bio/src/utils/syncer';
|
|
27
27
|
import {GAP_SYMBOL} from '@datagrok-libraries/bio/src/utils/macromolecule/consts';
|
|
28
28
|
import {IMonomerLibBase} from '@datagrok-libraries/bio/src/types/monomer-library';
|
|
@@ -9,7 +9,6 @@ import {App, IHelmWebEditor} from '@datagrok-libraries/bio/src/helm/types';
|
|
|
9
9
|
import {getHelmHelper} from '@datagrok-libraries/bio/src/helm/helm-helper';
|
|
10
10
|
import {ILogger} from '@datagrok-libraries/bio/src/utils/logger';
|
|
11
11
|
import {errInfo} from '@datagrok-libraries/bio/src/utils/err-info';
|
|
12
|
-
import {delay} from '@datagrok-libraries/test/src/test';
|
|
13
12
|
import {ISeqHelper} from '@datagrok-libraries/bio/src/utils/seq-helper';
|
|
14
13
|
|
|
15
14
|
import {updateDivInnerHTML} from '../utils/ui-utils';
|
|
@@ -141,7 +140,7 @@ export class HelmBioFilter extends BioFilterBase<BioFilterProps> /* implements I
|
|
|
141
140
|
const logPrefix = `${this.viewerToLog()}.substructureSearch( column = <${column.name}> )`;
|
|
142
141
|
_package.logger.debug(`${logPrefix}, start`);
|
|
143
142
|
try {
|
|
144
|
-
await delay(10);
|
|
143
|
+
await DG.delay(10);
|
|
145
144
|
const res = await helmSubstructureSearch(this.props.substructure, column, this.seqHelper);
|
|
146
145
|
return res;
|
|
147
146
|
} finally {
|
|
@@ -163,7 +162,7 @@ export class HelmBioFilter extends BioFilterBase<BioFilterProps> /* implements I
|
|
|
163
162
|
// async awaitRendered(timeout: number = 10000): Promise<void> {
|
|
164
163
|
// const callLog = `awaitRendered( ${timeout} )`;
|
|
165
164
|
// const logPrefix = `${this.viewerToLog()}.${callLog}`;
|
|
166
|
-
// await delay(0);
|
|
165
|
+
// await DG.delay(0);
|
|
167
166
|
// await testEvent(this.onRendered, () => {
|
|
168
167
|
// this.logger.debug(`${logPrefix}, ` + '_onRendered event caught');
|
|
169
168
|
// }, () => {
|