@datagrok/bio 2.25.12 → 2.25.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Davit Rizhinashvili",
6
6
  "email": "drizhinashvili@datagrok.ai"
7
7
  },
8
- "version": "2.25.12",
8
+ "version": "2.25.13",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -0,0 +1,347 @@
1
+ /* eslint-disable max-lines-per-function */
2
+ /* eslint-disable max-len */
3
+ import * as grok from 'datagrok-api/grok';
4
+ import * as DG from 'datagrok-api/dg';
5
+
6
+ import {IMonomerLib, Monomer} from '@datagrok-libraries/bio/src/types/monomer-library';
7
+ import {PolymerType} from '@datagrok-libraries/bio/src/helm/types';
8
+
9
+ import {STANDRARD_R_GROUPS} from './const';
10
+ import {standardiseMonomers, capSmiles, getCorrectedSmiles} from './monomer-manager';
11
+
12
/** One library monomer that was found to correspond to an input molecule. */
interface MonomerMatch {
  /** Symbol of the monomer as registered in the library. */
  symbol: string;
  /** Canonical (possibly capped) SMILES the lookup was keyed on. */
  smiles: string;
  /** SMILES taken from the monomer definition itself. */
  original: string;
  /** Name of the library the monomer came from. */
  source: string;
}

/** Lookup table from canonical SMILES to every monomer that shares that structure. */
type MonomerSmilesMap = Record<string, MonomerMatch[]>;

/** Delimiter used when several matched monomer symbols are joined into one cell value. */
const MATCH_SEPARATOR = ' | ';
28
+
29
/**
 * Prepares the two SMILES-keyed lookup tables used for exact matching.
 *
 * - `uncappedMap` is keyed by each standardized monomer's own SMILES (monomers without SMILES are skipped).
 * - `cappedMap` is keyed by the canonicalized SMILES obtained after `capSmiles` has been applied with the
 *   monomer's R-groups; entries that are empty or still contain `[*:` after capping are left out.
 *
 * Every key maps to an array, so monomers that share a structure are all preserved.
 *
 * @param fixedMonomers standardized monomers to index
 * @param originalMonomers monomers read by the same index as `fixedMonomers`; supply the library source of capped entries
 * @param converterFunc function used to canonicalize the column of capped SMILES
 * @returns both lookup tables
 * @throws Error when the converter returns nothing or a column of a different length
 */
async function buildMonomerSmilesMaps(
  fixedMonomers: Monomer[], originalMonomers: Monomer[], converterFunc: DG.Func,
): Promise<{cappedMap: MonomerSmilesMap; uncappedMap: MonomerSmilesMap}> {
  // index monomers by their own (uncapped) SMILES
  const uncappedMap: MonomerSmilesMap = {};
  fixedMonomers.forEach((monomer) => {
    const rawSmiles = monomer.smiles;
    if (!rawSmiles) return;
    const bucket = uncappedMap[rawSmiles] ?? (uncappedMap[rawSmiles] = []);
    bucket.push({symbol: monomer.symbol, smiles: rawSmiles, original: rawSmiles, source: monomer.lib?.source ?? ''});
  });

  // cap every monomer and keep only those whose R-groups were all substituted
  const cappedEntries: MonomerMatch[] = [];
  for (let idx = 0; idx < fixedMonomers.length; idx++) {
    const monomer = fixedMonomers[idx];
    const capped = capSmiles(monomer.smiles ?? '', monomer.rgroups ?? []);
    if (!capped || capped.includes('[*:')) continue;
    cappedEntries.push({
      symbol: monomer.symbol,
      smiles: capped,
      original: monomer.smiles,
      source: originalMonomers[idx]?.lib?.source ?? '',
    });
  }

  // one bulk call canonicalizes all capped SMILES
  const cappedSmilesCol = DG.Column.fromList(
    DG.COLUMN_TYPE.STRING, 'CappedSmiles', cappedEntries.map((entry) => entry.smiles));
  cappedSmilesCol.semType = DG.SEMTYPE.MOLECULE;
  const canonicalCappedCol: DG.Column = await converterFunc.apply(
    {molecule: cappedSmilesCol, targetNotation: DG.chem.Notation.Smiles});
  const lengthsAgree = !!canonicalCappedCol && canonicalCappedCol.length === cappedSmilesCol.length;
  if (!lengthsAgree)
    throw new Error('Error canonicalizing capped monomer SMILES');

  // re-key the capped entries by their canonical form; rows without a result are dropped
  const cappedMap: MonomerSmilesMap = {};
  canonicalCappedCol.toList().forEach((canonical, idx) => {
    if (!canonical) return;
    const entry = cappedEntries[idx];
    entry.smiles = canonical;
    const bucket = cappedMap[canonical] ?? (cappedMap[canonical] = []);
    bucket.push(entry);
  });

  return {cappedMap, uncappedMap};
}
78
+
79
/**
 * Corrects and canonicalizes the molecules stored in one column of a dataframe.
 *
 * Every non-empty value is passed through `getCorrectedSmiles`; a value containing a newline is
 * treated as a molblock, anything else as SMILES. If correction throws, the value is kept unchanged.
 * The corrected values are then converted to SMILES notation in one call to `converterFunc`.
 *
 * @param molDf dataframe holding the molecules
 * @param molColName name of the molecule column in `molDf`
 * @param converterFunc notation converter applied to the corrected column
 * @returns canonical SMILES per row, index-aligned with the input column; entries may be null
 * @throws Error when the column does not exist, or when the converter returns nothing
 *   or a column of a different length
 */
async function canonicalizeMolecules(
  molDf: DG.DataFrame, molColName: string, converterFunc: DG.Func,
): Promise<(string | null)[]> {
  const moleculesOriginalCol = molDf.col(molColName);
  // fail with a clear message instead of dereferencing a missing column
  if (!moleculesOriginalCol)
    throw new Error(`Column '${molColName}' not found in the molecules dataframe`);

  const correctedList = moleculesOriginalCol.toList().map((s) => {
    if (!s) return s;
    try {
      const isMolBlock = s.includes('\n');
      return getCorrectedSmiles([], isMolBlock ? undefined : s, isMolBlock ? s : undefined);
    } catch (_e) {
      // best effort: keep the value as-is and let the converter deal with it
      return s;
    }
  });

  const correctedCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'MoleculesCorrected', correctedList);
  correctedCol.semType = DG.SEMTYPE.MOLECULE;
  // dummy df needed for semtype detection by converterFunc; the dataframe itself is not used afterwards
  DG.DataFrame.fromColumns([correctedCol]);

  const canonicalCol: DG.Column = await converterFunc.apply({molecule: correctedCol, targetNotation: DG.chem.Notation.Smiles});
  if (!canonicalCol || canonicalCol.length !== correctedCol.length)
    throw new Error('Error canonicalizing molecules');

  return canonicalCol.toList();
}
109
+
110
/**
 * Looks up one canonical molecule SMILES in the monomer lookup tables.
 *
 * The capped table is consulted before the uncapped one. When neither has the molecule, the molecule is
 * capped with the standard R-groups, converted to SMILES again and looked up a second time.
 *
 * @param canonicalMol canonical SMILES of the molecule
 * @param cappedMap lookup keyed by capped canonical monomer SMILES
 * @param uncappedMap lookup keyed by uncapped monomer SMILES
 * @returns every monomer stored under the matching key, or an empty array when nothing matched
 */
function matchBySmiles(
  canonicalMol: string, cappedMap: MonomerSmilesMap, uncappedMap: MonomerSmilesMap,
): MonomerMatch[] {
  // capped table wins over the uncapped one for the same key
  const lookup = (key: string): MonomerMatch[] | undefined => cappedMap[key] ?? uncappedMap[key];

  const direct = lookup(canonicalMol);
  if (direct && direct.length > 0) return direct;

  // second chance: cap the molecule itself; pointless if capping changed nothing
  const withCaps = capSmiles(canonicalMol, STANDRARD_R_GROUPS);
  if (withCaps === canonicalMol) return [];

  const recanonicalized = grok.chem.convert(withCaps, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles);
  const viaCapping = lookup(recanonicalized);
  return viaCapping && viaCapping.length > 0 ? viaCapping : [];
}
132
+
133
/**
 * Indexes the capped monomers by fingerprint.
 *
 * The keys of `cappedMap` are sent to `Chem:getMorganFingerprints`; each returned fingerprint is turned into
 * a string with `toBinaryString()` and used as the key under which the monomers of that SMILES are stored.
 * SMILES that yield the same fingerprint string end up in the same bucket.
 *
 * @param cappedMap lookup keyed by capped canonical monomer SMILES
 * @returns the fingerprint lookup together with the list of SMILES it was built from
 *   (both empty when `cappedMap` has no keys)
 */
async function buildMonomerFingerprintMap(
  cappedMap: MonomerSmilesMap,
): Promise<{fpMap: {[fpString: string]: MonomerMatch[]}; cappedSmilesList: string[]}> {
  const fpMap: {[fpString: string]: MonomerMatch[]} = {};
  const cappedSmilesList = Object.keys(cappedMap);
  // nothing to fingerprint, so skip the call into the Chem package
  if (cappedSmilesList.length === 0)
    return {fpMap, cappedSmilesList: []};

  const monomerCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'MonomerSmiles', cappedSmilesList);
  monomerCol.semType = DG.SEMTYPE.MOLECULE;
  const fingerprints: DG.Column = await grok.functions.call('Chem:getMorganFingerprints', {molColumn: monomerCol});

  for (let row = 0; row < fingerprints.length; row++) {
    const bits: DG.BitSet | null = fingerprints.get(row);
    if (!bits) continue; // no fingerprint for this row
    const key = bits.toBinaryString();
    const monomersForSmiles = cappedMap[cappedSmilesList[row]] ?? [];
    // append to whatever is already stored under the same fingerprint
    fpMap[key] = (fpMap[key] ?? []).concat(monomersForSmiles);
  }

  return {fpMap, cappedSmilesList};
}
164
+
165
/**
 * Second-chance matching for molecules that had no exact SMILES match.
 *
 * For every listed molecule two fingerprints are requested from `Chem:getMorganFingerprints`: one for the
 * molecule as-is and one for the molecule capped with the standard R-groups. The as-is fingerprint is looked
 * up first; the capped one is only consulted when the first lookup yields nothing.
 *
 * @param unmatchedIndices positions (within `canonicalizedMolecules`) of the molecules to process
 * @param canonicalizedMolecules canonical SMILES of all molecules; null entries are treated as empty strings
 * @param monomerFpMap lookup from fingerprint string to monomers
 * @returns map from molecule position to its matched monomers; positions without a match are absent
 */
async function matchByFingerprint(
  unmatchedIndices: number[],
  canonicalizedMolecules: (string | null)[],
  monomerFpMap: {[fpString: string]: MonomerMatch[]},
): Promise<Map<number, MonomerMatch[]>> {
  const found = new Map<number, MonomerMatch[]>();
  const nothingToDo = unmatchedIndices.length === 0 || Object.keys(monomerFpMap).length === 0;
  if (nothingToDo) return found;

  // SMILES of the molecules as they are
  const plainSmiles: string[] = unmatchedIndices.map((molIdx) => canonicalizedMolecules[molIdx] ?? '');

  // SMILES of the same molecules after capping (unchanged when capping has no effect)
  const cappedSmiles: string[] = plainSmiles.map((smi) => {
    if (!smi) return '';
    const withCaps = capSmiles(smi, STANDRARD_R_GROUPS);
    if (withCaps === smi) return smi;
    return grok.chem.convert(withCaps, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles);
  });

  const toMoleculeColumn = (name: string, values: string[]): DG.Column => {
    const col = DG.Column.fromList(DG.COLUMN_TYPE.STRING, name, values);
    col.semType = DG.SEMTYPE.MOLECULE;
    return col;
  };
  const uncappedCol = toMoleculeColumn('UnmatchedMols', plainSmiles);
  const cappedCol = toMoleculeColumn('UnmatchedMolsCapped', cappedSmiles);

  // both fingerprint computations are independent, so run them concurrently
  const [uncappedFpCol, cappedFpCol]: [DG.Column, DG.Column] = await Promise.all([
    grok.functions.call('Chem:getMorganFingerprints', {molColumn: uncappedCol}),
    grok.functions.call('Chem:getMorganFingerprints', {molColumn: cappedCol}),
  ]);

  // resolves a fingerprint to a non-empty list of monomers, or undefined
  const lookup = (fp: DG.BitSet | null): MonomerMatch[] | undefined => {
    if (!fp) return undefined;
    const hits = monomerFpMap[fp.toBinaryString()];
    return hits && hits.length > 0 ? hits : undefined;
  };

  unmatchedIndices.forEach((molIdx, pos) => {
    // the capped fingerprint is only read when the uncapped one gave nothing
    const hits = lookup(uncappedFpCol.get(pos)) ?? lookup(cappedFpCol.get(pos));
    if (hits) found.set(molIdx, hits);
  });

  return found;
}
226
+
227
/**
 * Drops repeated monomer symbols from a list of matches.
 * The first match seen for a symbol is the one kept, and the relative order of kept matches is unchanged.
 */
function deduplicateMatches(matches: MonomerMatch[]): MonomerMatch[] {
  // a Map iterates in insertion order, so first occurrences come out in their original order
  const firstBySymbol = new Map<string, MonomerMatch>();
  for (const candidate of matches) {
    if (!firstBySymbol.has(candidate.symbol))
      firstBySymbol.set(candidate.symbol, candidate);
  }
  return Array.from(firstBySymbol.values());
}
236
+
237
/**
 * Produces the comma-separated list of library names for a set of matched monomers.
 *
 * When `duplicates` has a non-empty entry for a match's symbol, the sources of those duplicate monomers are
 * used; otherwise the match's own source is used. Empty names are ignored and each name appears once,
 * in the order it was first encountered.
 */
function collectSources(
  matches: MonomerMatch[], duplicates: {[symbol: string]: Monomer[]},
): string {
  const names = new Set<string>();
  const remember = (name?: string): void => {
    if (name) names.add(name);
  };

  matches.forEach((match) => {
    const known = duplicates[match.symbol];
    const hasKnownDuplicates = !!known && known.length > 0;
    if (hasKnownDuplicates)
      known.forEach((dup) => remember(dup?.lib?.source));
    else
      remember(match.source);
  });

  return Array.from(names).join(', ');
}
255
+
256
/**
 * Matches molecules in a dataframe with monomers from a monomer library.
 *
 * Matching pipeline:
 * 1. Standardize monomers and build SMILES lookup maps (capped & uncapped)
 * 2. Canonicalize input molecules
 * 3. Phase 1: exact canonical SMILES matching (capped, uncapped, and fallback-capped molecule)
 * 4. Phase 2: Morgan fingerprint fallback for molecules that didn't match by SMILES;
 *    a failure in this phase is logged as a warning and the exact matches are still returned
 * 5. Populate result columns
 *
 * Added columns (names are made unique within the dataframe):
 * - 'Matched monomer symbol': symbols of all matched monomers joined with ' | '
 * - 'Matched monomer smiles': SMILES of the first matched monomer
 * - 'Matched monomer source': comma-separated library names
 * - 'Match count': number of distinct matched symbols
 * - 'Match method': 'exact' or 'fingerprint'
 * Rows without a match are left unset in all of these columns.
 *
 * @param molDf dataframe containing the molecules; it is cloned, not modified
 * @param molColName name of the molecule column
 * @param monomerLib monomer library to match against
 * @param polymerType polymer type whose monomers are considered
 * @returns cloned input DataFrame with added match columns
 * @throws Error when the Chem package function `convertMoleculeNotation` is not available
 */
export async function matchMoleculesWithMonomers(
  molDf: DG.DataFrame, molColName: string, monomerLib: IMonomerLib, polymerType: PolymerType = 'PEPTIDE',
): Promise<DG.DataFrame> {
  const duplicates = monomerLib.duplicateMonomers?.[polymerType] ?? {};
  const converterFunc = DG.Func.find({package: 'Chem', name: 'convertMoleculeNotation'})[0];
  if (!converterFunc)
    throw new Error('Function convertMoleculeNotation not found, please install Chem package');

  // === Step 1: Standardize monomers and build SMILES lookup maps ===
  const monomers = monomerLib.getMonomerSymbolsByType(polymerType)
    .map((s) => monomerLib.getMonomer(polymerType, s)!)
    .filter((m) => m && (m.smiles || m.molfile));

  const fixedMonomers = await standardiseMonomers(monomers);
  // preserve library reference from original monomers (lost during standardization)
  fixedMonomers.forEach((m, i) => { m.lib = monomers[i].lib; });

  const {cappedMap, uncappedMap} = await buildMonomerSmilesMaps(fixedMonomers, monomers, converterFunc);

  // === Step 2: Canonicalize input molecules ===
  const canonicalizedMolecules = await canonicalizeMolecules(molDf, molColName, converterFunc);

  // === Step 3: Phase 1 — Exact canonical SMILES matching ===
  // matchResults[i] holds all MonomerMatch entries for molecule i (empty array if unmatched)
  const matchResults: MonomerMatch[][] = Array.from({length: canonicalizedMolecules.length}, () => []);
  const unmatchedIndices: number[] = [];

  for (let i = 0; i < canonicalizedMolecules.length; i++) {
    const mol = canonicalizedMolecules[i];
    if (!mol) continue;
    const smilesMatches = matchBySmiles(mol, cappedMap, uncappedMap);
    if (smilesMatches.length > 0)
      matchResults[i] = smilesMatches;
    else
      unmatchedIndices.push(i);
  }

  // === Step 4: Phase 2 — Morgan fingerprint fallback for unmatched molecules ===
  if (unmatchedIndices.length > 0) {
    try {
      const {fpMap} = await buildMonomerFingerprintMap(cappedMap);
      const fpMatches = await matchByFingerprint(unmatchedIndices, canonicalizedMolecules, fpMap);
      for (const [idx, matches] of fpMatches)
        matchResults[idx] = matches;
    } catch (e) {
      // deliberate best effort: exact matches are still useful without the fallback
      console.warn('Fingerprint fallback matching failed, continuing with SMILES matches only:', e);
    }
  }

  // === Step 5: Populate result columns ===
  const resultDf = molDf.clone();
  const symbolCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer symbol'));
  const smilesCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer smiles'));
  smilesCol.semType = DG.SEMTYPE.MOLECULE;
  const sourceCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer source'));
  const matchCountCol = resultDf.columns.addNewInt(resultDf.columns.getUnusedName('Match count'));
  const matchMethodCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Match method'));
  resultDf.columns.setOrder([molColName, symbolCol.name, smilesCol.name, sourceCol.name, matchCountCol.name, matchMethodCol.name]);

  // rows that went through phase 2; a Set keeps the per-row check O(1) instead of scanning the array each time
  const fingerprintPhaseRows = new Set<number>(unmatchedIndices);

  for (let i = 0; i < matchResults.length; i++) {
    const matches = matchResults[i];
    if (matches.length === 0) continue;

    // deduplicate matches by symbol (same monomer can appear from multiple lookup paths)
    const uniqueMatches = deduplicateMatches(matches);

    // collect all sources, including duplicates from the monomer library
    const allSources = collectSources(uniqueMatches, duplicates);

    symbolCol.set(i, uniqueMatches.map((m) => m.symbol).join(MATCH_SEPARATOR), false);
    smilesCol.set(i, uniqueMatches[0].original ?? uniqueMatches[0].smiles, false);
    sourceCol.set(i, allSources, false);
    matchCountCol.set(i, uniqueMatches.length, false);
    // a matched row that was unmatched after phase 1 can only have been filled in by phase 2
    const method = fingerprintPhaseRows.has(i) ? 'fingerprint' : 'exact';
    matchMethodCol.set(i, method, false);
  }

  return resultDf;
}
@@ -15,11 +15,10 @@ import {PolymerType, MonomerType} from '@datagrok-libraries/bio/src/helm/types';
15
15
  import {MonomerLibManager} from '../lib-manager';
16
16
 
17
17
  import {MONOMER_RENDERER_TAGS} from '@datagrok-libraries/bio/src/utils/cell-renderer';
18
- import {BioTags} from '@datagrok-libraries/bio/src/utils/macromolecule/consts';
18
+ import {BioTags, MONOMER_MOTIF_SPLITTER} from '@datagrok-libraries/bio/src/utils/macromolecule/consts';
19
19
  //@ts-ignore
20
20
  import '../../../../css/monomer-manager.css';
21
21
  import {Subscription} from 'rxjs';
22
- import {STANDRARD_R_GROUPS} from './const';
23
22
 
24
23
  // columns of monomers dataframe, note that rgroups is hidden and will be displayed as separate columns
25
24
  export enum MONOMER_DF_COLUMN_NAMES {
@@ -60,84 +59,7 @@ export async function standardiseMonomers(monomers: Monomer[]) {
60
59
  return fixedMonomers;
61
60
  }
62
61
 
63
- /// matches molecules in the dataframe with monomers in the library by canonical smiles
64
- export async function matchMoleculesWithMonomers(molDf: DG.DataFrame, molColName: string, monomerLib: IMonomerLib, polymerType: PolymerType = 'PEPTIDE'): Promise<DG.DataFrame> {
65
- const duplicates = monomerLib.duplicateMonomers?.[polymerType] ?? {};
66
- const converterFunc = DG.Func.find({package: 'Chem', name: 'convertMoleculeNotation'})[0];
67
- if (!converterFunc)
68
- throw new Error('Function convertMoleculeNotation not found, please install Chem package');
69
- // first: stamdardize monomers
70
- const monomers = monomerLib.getMonomerSymbolsByType(polymerType).map((s) => monomerLib.getMonomer(polymerType, s)!).filter((m) => m && (m.smiles || m.molfile));
71
- const fixedMonomers = await standardiseMonomers(monomers);
72
- fixedMonomers.forEach((m, i) => {
73
- m.lib = monomers[i].lib;
74
- });
75
- const unCappedMonomerSmilesMap = fixedMonomers.filter((m) => !!m.smiles).reduce((acc, m) => {
76
- acc[m.smiles] = {symbol: m.symbol, smiles: m.smiles, original: m.smiles, source: m.lib?.source}; return acc;
77
- }, {} as {[smiles: string]: {symbol: string, smiles: string, original: string | undefined, source: string | undefined}});
78
- const cappedMonomerSmiles = fixedMonomers.map((m, i) => ({symbol: m.symbol, smiles: capSmiles(m.smiles ?? '', m.rgroups ?? []), original: m.smiles, source: monomers[i]?.lib?.source}))
79
- .filter((s) => !!s?.smiles && !s.smiles.includes('[*:'));
80
-
81
- // canonicalize all monomer smiles
82
- const monomerSmilesCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'MonomerSmiles', cappedMonomerSmiles.map((m) => m.smiles!));
83
- monomerSmilesCol.semType = DG.SEMTYPE.MOLECULE;
84
- const canonicalizedMonomersSmilesCol: DG.Column = await converterFunc.apply({molecule: monomerSmilesCol, targetNotation: DG.chem.Notation.Smiles});
85
- if (!canonicalizedMonomersSmilesCol || canonicalizedMonomersSmilesCol.length !== monomerSmilesCol.length)
86
- throw new Error('Error canonicalizing monomer smiles');
87
- canonicalizedMonomersSmilesCol.toList().forEach((s, i) => cappedMonomerSmiles[i].smiles = s);
88
- const cappedMonomerSmilesMap = cappedMonomerSmiles.reduce((acc, m) => { acc[m.smiles] = m; return acc; }, {} as {[smiles: string]: {symbol: string, smiles: string, original: string | undefined, source: string | undefined}});
89
-
90
- const moleculesOriginalCol = molDf.col(molColName)!;
91
- const correctedOriginalList = moleculesOriginalCol.toList().map((s) => {
92
- if (!s) return s;
93
- try {
94
- const isMolBlock = s.includes('\n');
95
- return getCorrectedSmiles([], isMolBlock ? undefined : s, isMolBlock ? s : undefined);
96
- } catch (_e) {
97
- return s;
98
- }
99
- });
100
- const moleculesOriginalColCorrected = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'MoleculesOriginalCorrected', correctedOriginalList);
101
- // create dummy df
102
- moleculesOriginalColCorrected.semType = DG.SEMTYPE.MOLECULE;
103
- const _ddf = DG.DataFrame.fromColumns([moleculesOriginalColCorrected]);
104
- const canonicalizedMoleculesCol: DG.Column = await converterFunc.apply({molecule: moleculesOriginalColCorrected, targetNotation: DG.chem.Notation.Smiles});
105
- if (!canonicalizedMoleculesCol || canonicalizedMoleculesCol.length !== moleculesOriginalColCorrected.length)
106
- throw new Error('Error canonicalizing molecules');
107
-
108
- const canonicalizedMolecules = canonicalizedMoleculesCol.toList();
109
-
110
- const resultDf = molDf.clone();
111
- const matchingMonomerSmilesCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer smiles'));
112
- matchingMonomerSmilesCol.semType = DG.SEMTYPE.MOLECULE;
113
- const matchingMonomerSymbolCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer symbol'));
114
- matchingMonomerSymbolCol.semType = 'Monomer';
115
- const sourceLibCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer source'));
116
- resultDf.columns.setOrder([molColName, matchingMonomerSymbolCol.name, matchingMonomerSmilesCol.name, sourceLibCol.name]);
117
-
118
- for (let i = 0; i < canonicalizedMolecules.length; i++) {
119
- const mol = canonicalizedMolecules[i];
120
- if (!mol) continue;
121
- let match = cappedMonomerSmilesMap[mol] ?? unCappedMonomerSmilesMap[mol];
122
- if (!match) {
123
- // try capping the molecule and matching again
124
- const cappedMol = capSmiles(mol, STANDRARD_R_GROUPS);
125
- if (cappedMol !== mol) {
126
- const correctedMol = grok.chem.convert(cappedMol, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles);
127
- match = cappedMonomerSmilesMap[correctedMol] ?? unCappedMonomerSmilesMap[correctedMol];
128
- }
129
- }
130
- if (match) {
131
- const matchSymbol = match.symbol;
132
- const sources = (duplicates[matchSymbol]?.length ?? 0) > 0 ? duplicates[matchSymbol].map((m) => m?.lib?.source).filter((s) => !!s).join(', ') : (match.source ?? '');
133
- const originalSmiles = match.original ?? match.smiles;
134
- matchingMonomerSmilesCol.set(i, originalSmiles, false);
135
- matchingMonomerSymbolCol.set(i, matchSymbol, false);
136
- sourceLibCol.set(i, sources, false);
137
- }
138
- }
139
- return resultDf;
140
- }
62
+ export {matchMoleculesWithMonomers} from './match-molecules';
141
63
 
142
64
  /** Standardizes the monomer library
143
65
  * warning: throws error if the library is not valid or has invalid monomers
@@ -1217,7 +1139,7 @@ function replaceAllylsInSmiles(smiles: string): string {
1217
1139
  }
1218
1140
 
1219
1141
  /**NB! Can throw error */
1220
- function getCorrectedSmiles(rgroups: RGroup[], smiles?: string, molBlock?: string): string {
1142
+ export function getCorrectedSmiles(rgroups: RGroup[], smiles?: string, molBlock?: string): string {
1221
1143
  if (smiles)
1222
1144
  smiles = replaceAllylsInSmiles(smiles);
1223
1145
  const isSmilesMalformed = !smiles || !grok.chem.checkSmiles(smiles);
@@ -1310,7 +1232,7 @@ export function getCorrectedMolBlock(molBlock: string) {
1310
1232
  }
1311
1233
 
1312
1234
  // reverse of r-group substitution, will substitute rgroups with cap groups
1313
- function capSmiles(smiles: string, rgroups: RGroup[]) {
1235
+ export function capSmiles(smiles: string, rgroups: RGroup[]) {
1314
1236
  let newSmiles = smiles;
1315
1237
  rgroups.forEach((rg, i) => {
1316
1238
  const rgroupNum = rg.label[1] ?? `${i + 1}`; // if label is not in format R#, use index as number