@datagrok/bio 2.25.10 → 2.25.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/detectors.js +1 -0
- package/dist/242.js +1 -1
- package/dist/242.js.map +1 -1
- package/dist/284.js.map +1 -1
- package/dist/455.js +1 -1
- package/dist/455.js.map +1 -1
- package/dist/705.js +1 -1
- package/dist/705.js.map +1 -1
- package/dist/980.js.map +1 -1
- package/dist/package-test.js +5 -5
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +3 -3
- package/dist/package.js.map +1 -1
- package/package.json +3 -4
- package/src/analysis/sequence-similarity-viewer.ts +4 -4
- package/src/tests/to-atomic-level-tests.ts +121 -1
- package/src/utils/biln.ts +10 -2
- package/src/utils/convert.ts +62 -16
- package/src/utils/helm-to-molfile/converter/mol-atoms.ts +5 -0
- package/src/utils/helm-to-molfile/converter/mol-bonds.ts +5 -0
- package/src/utils/helm-to-molfile/converter/mol-wrapper.ts +4 -2
- package/src/utils/helm-to-molfile/converter/monomer-wrapper.ts +24 -10
- package/src/utils/helm-to-molfile/converter/r-group-handler.ts +100 -7
- package/src/utils/helm-to-molfile/converter/types.ts +12 -0
- package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts +108 -11
- package/test-console-output-1.log +456 -417
- package/test-record-1.mp4 +0 -0
- package/webpack.config.js +22 -1
|
@@ -634,14 +634,15 @@ export class MonomerManager implements IMonomerManager {
|
|
|
634
634
|
function substituteCapsWithRGroupsSmiles(smiles: string, rGroups: RGroup[]) {
|
|
635
635
|
let newSmiles = smiles;
|
|
636
636
|
// first substitute all caps with R-groups with corresponding numbers
|
|
637
|
+
// like
|
|
637
638
|
rGroups.forEach((rGroup) => {
|
|
638
639
|
const RNum = rGroup.label[1] ?? '1';
|
|
639
|
-
|
|
640
|
-
newSmiles = newSmiles.replace(capRegex, `[*:${RNum}]`);
|
|
640
|
+
newSmiles = newSmiles.replace(`[${rGroup.capGroupName}:${RNum}]`, `[*:${RNum}]`);
|
|
641
641
|
});
|
|
642
642
|
// during some conversions, atoms can end up written as isotopes in the SMILES string, like this: [2O]
|
|
643
643
|
|
|
644
644
|
// replace all [2O] with [*:2], there can be also two atoms like [2OH] -> [*:2]
|
|
645
|
+
// for searching purposes: 'ISO'
|
|
645
646
|
const isotopeRegex = /\[\d[A-Z]{1,2}\]/g;
|
|
646
647
|
newSmiles = newSmiles.replaceAll(isotopeRegex, (match) => {
|
|
647
648
|
const rGroupNum = match[1];
|
|
@@ -697,6 +698,14 @@ function getCaseInvariantValue<T>(obj: { [key: string]: T }, key: string): T | u
|
|
|
697
698
|
return obj[caseInvariantKey];
|
|
698
699
|
}
|
|
699
700
|
|
|
701
|
+
/**
 * Stores `value` in `obj` under `key`, matching existing keys without regard to letter case.
 * If an existing key equals `key` case-insensitively, the first such key is overwritten;
 * otherwise a new entry is created under `key` exactly as given.
 */
function setCaseInvariantValue<T>(obj: { [key: string]: T }, key: string, value: T): void {
  const wanted = key.toLowerCase();
  let target = key;
  for (const existing of Object.keys(obj)) {
    if (existing.toLowerCase() === wanted) {
      // reuse the spelling that is already present on the object
      target = existing;
      break;
    }
  }
  obj[target] = value;
}
|
|
708
|
+
|
|
700
709
|
// some r groups for some monomers can lack smiles, or something else :D this function will try to fix that
|
|
701
710
|
function resolveRGroupInfo(rgps: RGroup[]): RGroup[] {
|
|
702
711
|
return (rgps.map((rg) => {
|
|
@@ -1197,12 +1206,24 @@ export function findLastIndex<T>(ar: ArrayLike<T>, pred: (el: T) => boolean): nu
|
|
|
1197
1206
|
return foundIdx;
|
|
1198
1207
|
}
|
|
1199
1208
|
|
|
1209
|
+
/**
 * Rewrites allyl placeholders in a SMILES string as R-group placeholders.
 * Source SMILES can contain tokens such as `[Allyl:3]`, which is not valid SMILES;
 * every such token becomes `[*:3]`. Only single-digit numbers are recognized.
 */
function replaceAllylsInSmiles(smiles: string): string {
  const allylToken = /\[Allyl\:\d\]/g;
  // the digit is always the character right before the closing bracket
  return smiles.replace(allylToken, (token) => `[*:${token.charAt(token.length - 2)}]`);
}
|
|
1218
|
+
|
|
1200
1219
|
/**NB! Can throw error */
|
|
1201
1220
|
function getCorrectedSmiles(rgroups: RGroup[], smiles?: string, molBlock?: string): string {
|
|
1221
|
+
if (smiles)
|
|
1222
|
+
smiles = replaceAllylsInSmiles(smiles);
|
|
1202
1223
|
const isSmilesMalformed = !smiles || !grok.chem.checkSmiles(smiles);
|
|
1203
1224
|
if ((isSmilesMalformed) && !molBlock) throw new Error('Both SMILES and MOL block are empty or malformed');
|
|
1204
1225
|
|
|
1205
|
-
let canonical = isSmilesMalformed ? grok.chem.convert(molBlock!, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles) : smiles
|
|
1226
|
+
let canonical = isSmilesMalformed ? grok.chem.convert(molBlock!, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles) : smiles!;
|
|
1206
1227
|
|
|
1207
1228
|
canonical = substituteCapsWithRGroupsSmiles(canonical, rgroups);
|
|
1208
1229
|
canonical = fixRGroupsAsElementsSmiles(canonical);
|
|
@@ -1215,15 +1236,23 @@ export function getCorrectedMolBlock(molBlock: string) {
|
|
|
1215
1236
|
// 1. RGP field is present at the end, before the M END line
|
|
1216
1237
|
// 2. RGP field is present in the correct format
|
|
1217
1238
|
// 3. R group labels are written as R# and not just R
|
|
1218
|
-
// 4. there is no ISO field in the molblock.
|
|
1239
|
+
// 4. there is no incorrect ISO field in the molblock.
|
|
1240
|
+
// If there is, it needs to be replaced with an RGP field, and that's it: sometimes R groups from SMILES get written as isotopes in the molblock.
|
|
1241
|
+
// If the number in ISO is more than 8, chances are that it is actually an ISO field, otherwise, it is likely an R group
|
|
1219
1242
|
// 5. make sure that R groups have no metadata in the atomblocks
|
|
1220
1243
|
|
|
1221
1244
|
const lines = molBlock.split('\n');
|
|
1222
1245
|
|
|
1223
1246
|
const isoLineIdx = lines.findIndex((line) => line.startsWith('M') && line.includes('ISO'));
|
|
1224
1247
|
if (isoLineIdx !== -1) {
|
|
1225
|
-
|
|
1226
|
-
|
|
1248
|
+
// check the number in ISO field, the line could look like this:
|
|
1249
|
+
//M ISO 1 25 208
|
|
1250
|
+
// in this case, we have 208, which can not be an R group, rather it is an isotope of lead
|
|
1251
|
+
const isoLineParts = lines[isoLineIdx].trim().split(' ').filter(Boolean);
|
|
1252
|
+
if (isoLineParts.length >= 5 && Number.parseInt(isoLineParts[4]) < 9) {
|
|
1253
|
+
const isoIndex = lines[isoLineIdx].indexOf('ISO');
|
|
1254
|
+
lines[isoLineIdx] = lines[isoLineIdx].substring(0, isoIndex) + 'RGP' + lines[isoLineIdx].substring(isoIndex + 3);
|
|
1255
|
+
}
|
|
1227
1256
|
}
|
|
1228
1257
|
|
|
1229
1258
|
const molStartIdx = lines.findIndex((line) => line.includes('V2000'));
|
|
@@ -1283,14 +1312,73 @@ export function getCorrectedMolBlock(molBlock: string) {
|
|
|
1283
1312
|
/**
 * Reverse of R-group substitution: replaces R-group placeholders (`[*:N]`) in a SMILES string
 * with their cap groups.
 * N is the second character of the R-group label, or the 1-based position of the R-group in
 * `rgroups` when the label has no second character.
 * An allyl cap (cap group name 'allyl', or cap group SMILES that reduces to `C=C` / `[C=C]` once the
 * placeholder is removed) is written as `C=C`; any other cap is written as `[<capGroupName>]`.
 * Only the first occurrence of each placeholder is replaced.
 */
function capSmiles(smiles: string, rgroups: RGroup[]) {
  return rgroups.reduce((capped, rGroup, idx) => {
    // if label is not in format R#, use index as number
    const num = rGroup.label[1] ?? `${idx + 1}`;
    const placeholder = `[*:${num}]`;
    const capName = getCaseInvariantValue(rGroup, HELM_RGROUP_FIELDS.CAP_GROUP_NAME);
    const capSmi = getCaseInvariantValue(rGroup, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE);
    // cap SMILES with the attachment point stripped, evaluated lazily
    const bareCap = () => capSmi?.replace(placeholder, '')?.toLowerCase();
    const isAllyl = capName?.toLowerCase() === 'allyl' || bareCap() === 'c=c' || bareCap() === '[c=c]';
    return capped.replace(placeholder, isAllyl ? 'C=C' : `[${capName}]`);
  }, smiles);
}
|
|
1293
1326
|
|
|
1327
|
+
/**
 * Normalizes the cap-group SMILES of every R-group so that the attachment point is written as `[*:N]`.
 * The following malformed notations are repaired:
 *   - `[*1]`  -> `[*:1]`
 *   - `O[R1]` -> `O[*:1]`
 *   - `[O:2]` -> `O[*:2]` (hydrogen becomes `[H][*:2]`, since `H[*:2]` is not valid SMILES)
 *   - `O:2`   -> `O[*:2]` (no brackets at all)
 *   - a bare `[*:3]` with no cap atom -> the cap group name is prepended, e.g. `Br[*:3]`
 * N is the second character of the R-group label (`R3` -> `3`), falling back to the 1-based
 * position of the R-group in the array when the label has no second character.
 *
 * Modifies the R-groups in place. R-groups without cap-group SMILES are left untouched, and the
 * bare-placeholder repair is skipped when there is no cap group name to prepend.
 */
function correctRGroupsSmiles(rgroups: RGroup[]) {
  // [O:2] -> O[*:2]; only atoms mapped with the given number are rewritten
  function replaceAtomMapping(smi: string, num: number) {
    const regex = /\[([A-Z][a-z]?):(\d+)\]/g;
    return smi.replace(regex, (match, atom, n) => {
      if (parseInt(n) === num) {
        // special case for hydrogen, as H[*:1] is not valid, but [H][*:1] is valid
        const correctedAtom = atom?.toLowerCase && atom?.toLowerCase() === 'h' ? '[H]' : atom;
        return `${correctedAtom}[*:${n}]`;
      }

      return match;
    });
  }

  rgroups.forEach((rg, i) => {
    const capGroupSmiles = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE);
    const rGroupNum = rg.label[1] ?? `${i + 1}`; // if label is not in format R#, use index as number
    // nothing to fix when SMILES are absent or already contain the correct placeholder
    if (!capGroupSmiles || capGroupSmiles.includes(`[*:${rGroupNum}]`))
      return;
    const atomLabeledSmilesRegex = new RegExp(`\\[[A-Za-z]{1,2}:${rGroupNum}\\]`);
    if (capGroupSmiles.includes(`[*${rGroupNum}]`)) // case 2: [*1] -> [*:1]
      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, capGroupSmiles.replace(`[*${rGroupNum}]`, `[*:${rGroupNum}]`));
    else if (capGroupSmiles.includes(`[R${rGroupNum}]`)) // case 3: O[R1] -> O[*:1]
      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, capGroupSmiles.replace(`[R${rGroupNum}]`, `[*:${rGroupNum}]`));
    else if (atomLabeledSmilesRegex.test(capGroupSmiles)) // case 4: [O:2] -> O[*:2]
      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, replaceAtomMapping(capGroupSmiles, Number.parseInt(rGroupNum)));
    else if (`[${capGroupSmiles}]`.match(atomLabeledSmilesRegex)?.[0].length === capGroupSmiles.length + 2) // case 5: O:2 -> O[*:2], no brackets at all
      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, replaceAtomMapping(`[${capGroupSmiles}]`, Number.parseInt(rGroupNum)));
  });

  // finally, there is a case of incorrectly written r group smiles, where the cap atom is missing:
  //   { "alternateId": "R3-Br", "capGroupName": "Br", "capGroupSMILES": "[*:3]", "label": "R3" }
  rgroups.forEach((rg) => {
    const capGroupSmiles = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE);
    // R-groups can lack cap-group SMILES entirely; there is nothing to repair in that case
    if (!capGroupSmiles)
      return;
    const match = capGroupSmiles.match(/\[\*\:?(\d+)\]/);
    // only a SMILES that consists of the placeholder alone is considered incomplete
    if (!match || match[0] !== capGroupSmiles)
      return;
    const capGroupName = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_NAME);
    // without a cap group name there is nothing meaningful to prepend (would yield 'undefined[*:3]')
    if (!capGroupName)
      return;
    setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, `${capGroupName}${capGroupSmiles}`);
  });
}
|
|
1381
|
+
|
|
1294
1382
|
async function monomerFromDfRow(dfRow: DG.Row): Promise<Monomer> {
|
|
1295
1383
|
// hacky way for now, but meta object for now only supports key value pairs and not nested objects
|
|
1296
1384
|
let metaJSON: any;
|
|
@@ -1323,16 +1411,25 @@ async function monomerFromDfRow(dfRow: DG.Row): Promise<Monomer> {
|
|
|
1323
1411
|
|
|
1324
1412
|
let naturalAnalog = dfRow.get(MONOMER_DF_COLUMN_NAMES.NATURAL_ANALOG);
|
|
1325
1413
|
const polymerType = dfRow.get(MONOMER_DF_COLUMN_NAMES.POLYMER_TYPE);
|
|
1414
|
+
// parse r-groups
|
|
1326
1415
|
let rGroups: RGroup[] = [];
|
|
1327
1416
|
try {
|
|
1328
1417
|
rGroups = JSON.parse(dfRow.get(MONOMER_DF_COLUMN_NAMES.R_GROUPS) ?? '[]');
|
|
1418
|
+
} catch (_) {
|
|
1419
|
+
rGroups ??= [];
|
|
1420
|
+
}
|
|
1421
|
+
|
|
1422
|
+
// correct r group smiles if needed
|
|
1423
|
+
correctRGroupsSmiles(rGroups);
|
|
1424
|
+
|
|
1425
|
+
try {
|
|
1329
1426
|
if (!naturalAnalog && polymerType) {
|
|
1330
1427
|
const mostSimilar = await mostSimilarNaturalAnalog(capSmiles(smiles, rGroups), polymerType);
|
|
1331
1428
|
if (mostSimilar)
|
|
1332
1429
|
naturalAnalog = mostSimilar;
|
|
1333
1430
|
}
|
|
1334
|
-
} catch (
|
|
1335
|
-
|
|
1431
|
+
} catch (e) {
|
|
1432
|
+
console.error(e);
|
|
1336
1433
|
}
|
|
1337
1434
|
|
|
1338
1435
|
return {
|