@datagrok/bio 2.25.10 → 2.25.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -634,14 +634,15 @@ export class MonomerManager implements IMonomerManager {
634
634
  function substituteCapsWithRGroupsSmiles(smiles: string, rGroups: RGroup[]) {
635
635
  let newSmiles = smiles;
636
636
  // first substitute all caps with R-groups with corresponding numbers
637
+ // like
637
638
  rGroups.forEach((rGroup) => {
638
639
  const RNum = rGroup.label[1] ?? '1';
639
- const capRegex = new RegExp(`\\[\\${rGroup.capGroupName}:${RNum}\\]`, 'g');
640
- newSmiles = newSmiles.replace(capRegex, `[*:${RNum}]`);
640
+ newSmiles = newSmiles.replace(`[${rGroup.capGroupName}:${RNum}]`, `[*:${RNum}]`);
641
641
  });
642
642
  // during some conversions atoms can end up as isotopes in the SMILES string, like this: [2O]
643
643
 
644
644
  // replace all [2O] with [*:2], there can be also two atoms like [2OH] -> [*:2]
645
+ // for searching purposes: 'ISO'
645
646
  const isotopeRegex = /\[\d[A-Z]{1,2}\]/g;
646
647
  newSmiles = newSmiles.replaceAll(isotopeRegex, (match) => {
647
648
  const rGroupNum = match[1];
@@ -697,6 +698,14 @@ function getCaseInvariantValue<T>(obj: { [key: string]: T }, key: string): T | u
697
698
  return obj[caseInvariantKey];
698
699
  }
699
700
 
701
/**
 * Writes `value` into `obj` under `key`, matching existing keys case-insensitively.
 *
 * If `obj` already has a key that equals `key` ignoring case, that existing key is
 * overwritten (its original spelling is kept). Otherwise a new property named exactly
 * `key` is created. Mutates `obj` in place.
 *
 * @param obj   Dictionary to update.
 * @param key   Property name to look up, compared case-insensitively.
 * @param value Value to store.
 */
function setCaseInvariantValue<T>(obj: { [key: string]: T }, key: string, value: T): void {
  // lower-case the requested key once instead of on every comparison
  const loweredKey = key.toLowerCase();
  const existingKey = Object.keys(obj).find((k) => k.toLowerCase() === loweredKey);
  obj[existingKey ?? key] = value;
}
708
+
700
709
  // some r groups for some monomers can lack smiles, or something else :D this function will try to fix that
701
710
  function resolveRGroupInfo(rgps: RGroup[]): RGroup[] {
702
711
  return (rgps.map((rg) => {
@@ -1197,12 +1206,24 @@ export function findLastIndex<T>(ar: ArrayLike<T>, pred: (el: T) => boolean): nu
1197
1206
  return foundIdx;
1198
1207
  }
1199
1208
 
1209
/**
 * Replaces every `[Allyl:N]` token (N is a single digit) with the R-group placeholder `[*:N]`.
 *
 * Original SMILES can include something like `[Allyl:3]`, which is not valid SMILES.
 * Tokens with a multi-digit number (e.g. `[Allyl:12]`) are left untouched.
 *
 * @param smiles SMILES string that may contain `[Allyl:N]` tokens.
 * @returns The string with all such tokens rewritten; unchanged if there are none.
 */
function replaceAllylsInSmiles(smiles: string): string {
  // capture the digit directly instead of indexing into the matched text;
  // `replace` with a global regex substitutes every occurrence
  return smiles.replace(/\[Allyl:(\d)\]/g, '[*:$1]');
}
1218
+
1200
1219
  /**NB! Can throw error */
1201
1220
  function getCorrectedSmiles(rgroups: RGroup[], smiles?: string, molBlock?: string): string {
1221
+ if (smiles)
1222
+ smiles = replaceAllylsInSmiles(smiles);
1202
1223
  const isSmilesMalformed = !smiles || !grok.chem.checkSmiles(smiles);
1203
1224
  if ((isSmilesMalformed) && !molBlock) throw new Error('Both SMILES and MOL block are empty or malformed');
1204
1225
 
1205
- let canonical = isSmilesMalformed ? grok.chem.convert(molBlock!, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles) : smiles;
1226
+ let canonical = isSmilesMalformed ? grok.chem.convert(molBlock!, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles) : smiles!;
1206
1227
 
1207
1228
  canonical = substituteCapsWithRGroupsSmiles(canonical, rgroups);
1208
1229
  canonical = fixRGroupsAsElementsSmiles(canonical);
@@ -1215,15 +1236,23 @@ export function getCorrectedMolBlock(molBlock: string) {
1215
1236
  // 1. RGP field is present at the end, before the M END line
1216
1237
  // 2. RGP field is present in the correct format
1217
1238
  // 3. R group labels are written as R# and not just R
1218
- // 4. there is no ISO field in the molblock. if there is, it needs to be substituted with RGP field and thats it.
1239
+ // 4. there is no incorrect ISO field in the molblock.
1240
+ // If there is, it needs to be substituted with an RGP field, and that's it. Sometimes R groups from SMILES get written as isotopes in the molblock.
1241
+ // If the number in ISO is more than 8, chances are that it is actually an ISO field, otherwise, it is likely an R group
1219
1242
  // 5. make sure that R groups have no metadata in the atomblocks
1220
1243
 
1221
1244
  const lines = molBlock.split('\n');
1222
1245
 
1223
1246
  const isoLineIdx = lines.findIndex((line) => line.startsWith('M') && line.includes('ISO'));
1224
1247
  if (isoLineIdx !== -1) {
1225
- const isoIndex = lines[isoLineIdx].indexOf('ISO');
1226
- lines[isoLineIdx] = lines[isoLineIdx].substring(0, isoIndex) + 'RGP' + lines[isoLineIdx].substring(isoIndex + 3);
1248
+ // check the number in ISO field, the line could look like this:
1249
+ //M ISO 1 25 208
1250
+ // in this case, we have 208, which can not be an R group, rather it is an isotope of lead
1251
+ const isoLineParts = lines[isoLineIdx].trim().split(' ').filter(Boolean);
1252
+ if (isoLineParts.length >= 5 && Number.parseInt(isoLineParts[4]) < 9) {
1253
+ const isoIndex = lines[isoLineIdx].indexOf('ISO');
1254
+ lines[isoLineIdx] = lines[isoLineIdx].substring(0, isoIndex) + 'RGP' + lines[isoLineIdx].substring(isoIndex + 3);
1255
+ }
1227
1256
  }
1228
1257
 
1229
1258
  const molStartIdx = lines.findIndex((line) => line.includes('V2000'));
@@ -1283,14 +1312,73 @@ export function getCorrectedMolBlock(molBlock: string) {
1283
1312
  // reverse of r-group substitution, will substitute rgroups with cap groups
1284
1313
  function capSmiles(smiles: string, rgroups: RGroup[]) {
1285
1314
  let newSmiles = smiles;
1286
- rgroups.forEach((rg) => {
1287
- const rgroupNum = rg.label[1] ?? '1';
1315
+ rgroups.forEach((rg, i) => {
1316
+ const rgroupNum = rg.label[1] ?? `${i + 1}`; // if label is not in format R#, use index as number
1288
1317
  const capGroupName = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_NAME);
1289
- newSmiles = newSmiles.replace(`[*:${rgroupNum}]`, `[${capGroupName}]`);
1318
+ const capGroupSmiles = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE);
1319
+ if (capGroupName?.toLowerCase() === 'allyl' || capGroupSmiles?.replace(`[*:${rgroupNum}]`, '')?.toLowerCase() === 'c=c' || capGroupSmiles?.replace(`[*:${rgroupNum}]`, '')?.toLowerCase() === '[c=c]')
1320
+ newSmiles = newSmiles.replace(`[*:${rgroupNum}]`, 'C=C');
1321
+ else
1322
+ newSmiles = newSmiles.replace(`[*:${rgroupNum}]`, `[${capGroupName}]`);
1290
1323
  });
1291
1324
  return newSmiles;
1292
1325
  }
1293
1326
 
1327
/**
 * Fixes the R-group notation inside the cap group SMILES of every R-group.
 * Modifies the entries of `rgroups` in place.
 *
 * In some cases, instead of having O[*:1], we get something like [O:2], or O[R1], or [*1].
 * The R-group number N is the second character of the label (e.g. `R3` -> `3`) or, when
 * the label has no such character, the 1-based position of the entry in `rgroups`.
 *
 * First pass, applied only when the cap SMILES does not already contain `[*:N]`:
 *   - `[*N]`  -> `[*:N]`
 *   - `[RN]`  -> `[*:N]`
 *   - `[O:N]` -> `O[*:N]` (hydrogen becomes `[H][*:N]`)
 *   - `O:N`   -> `O[*:N]` (no brackets at all)
 *
 * Second pass: a cap SMILES that consists of nothing but the placeholder (`[*:N]` or
 * `[*N]`) is prefixed with the cap group name, e.g. name `Br` + `[*:3]` -> `Br[*:3]`.
 *
 * Entries without cap group SMILES are skipped in both passes, and the second pass also
 * skips entries without a cap group name.
 *
 * @param rgroups R-group descriptors to fix; fields are looked up case-insensitively.
 */
function correctRGroupsSmiles(rgroups: RGroup[]) {
  // [O:2] -> O[*:2], only for atoms mapped with the requested number
  function replaceAtomMapping(smi: string, num: number) {
    const regex = /\[([A-Z][a-z]?):(\d+)\]/g;
    return smi.replace(regex, (match, atom, n) => {
      if (parseInt(n, 10) !== num)
        return match;
      // special case for hydrogen, as H[*:1] is not valid, but [H][*:1] is valid
      const correctedAtom = atom.toLowerCase() === 'h' ? '[H]' : atom;
      return `${correctedAtom}[*:${n}]`;
    });
  }

  rgroups.forEach((rg, i) => {
    const capGroupSmiles = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE);
    // if label is missing or not in format R#, use index as number
    const rGroupNum = rg.label?.[1] ?? `${i + 1}`;
    const placeholder = `[*:${rGroupNum}]`;
    if (!capGroupSmiles || capGroupSmiles.includes(placeholder))
      return;
    const atomLabeledSmilesRegex = new RegExp(`\\[[A-Za-z]{1,2}:${rGroupNum}\\]`);
    if (capGroupSmiles.includes(`[*${rGroupNum}]`)) // [*1] -> [*:1]
      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, capGroupSmiles.replace(`[*${rGroupNum}]`, placeholder));
    else if (capGroupSmiles.includes(`[R${rGroupNum}]`)) // O[R1] -> O[*:1]
      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, capGroupSmiles.replace(`[R${rGroupNum}]`, placeholder));
    else if (atomLabeledSmilesRegex.test(capGroupSmiles)) // [O:2] -> O[*:2]
      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, replaceAtomMapping(capGroupSmiles, Number.parseInt(rGroupNum, 10)));
    else if (`[${capGroupSmiles}]`.match(atomLabeledSmilesRegex)?.[0].length === capGroupSmiles.length + 2) // O:2 -> O[*:2], no brackets at all
      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, replaceAtomMapping(`[${capGroupSmiles}]`, Number.parseInt(rGroupNum, 10)));
  });

  // finally, there is a case of incorrectly written r group smiles, we can see cases like this:
  // {
  //   "alternateId": "R3-Br",
  //   "capGroupName": "Br",
  //   "capGroupSMILES": "[*:3]",
  //   "label": "R3"
  // }
  rgroups.forEach((rg) => {
    const capGroupSmiles = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE);
    const capGroupName = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_NAME);
    // nothing to repair without SMILES, and nothing to prepend without a name
    if (!capGroupSmiles || !capGroupName)
      return;
    // the cap SMILES is exactly the placeholder, so the cap atom itself is missing
    if (/^\[\*:?\d+\]$/.test(capGroupSmiles)) {
      // same hydrogen rule as above: H[*:1] is not valid, but [H][*:1] is
      const capAtom = capGroupName.toLowerCase() === 'h' ? '[H]' : capGroupName;
      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, `${capAtom}${capGroupSmiles}`);
    }
  });
}
1381
+
1294
1382
  async function monomerFromDfRow(dfRow: DG.Row): Promise<Monomer> {
1295
1383
  // hacky way for now, but meta object for now only supports key value pairs and not nested objects
1296
1384
  let metaJSON: any;
@@ -1323,16 +1411,25 @@ async function monomerFromDfRow(dfRow: DG.Row): Promise<Monomer> {
1323
1411
 
1324
1412
  let naturalAnalog = dfRow.get(MONOMER_DF_COLUMN_NAMES.NATURAL_ANALOG);
1325
1413
  const polymerType = dfRow.get(MONOMER_DF_COLUMN_NAMES.POLYMER_TYPE);
1414
+ // parse r-groups
1326
1415
  let rGroups: RGroup[] = [];
1327
1416
  try {
1328
1417
  rGroups = JSON.parse(dfRow.get(MONOMER_DF_COLUMN_NAMES.R_GROUPS) ?? '[]');
1418
+ } catch (_) {
1419
+ rGroups ??= [];
1420
+ }
1421
+
1422
+ // correct r group smiles if needed
1423
+ correctRGroupsSmiles(rGroups);
1424
+
1425
+ try {
1329
1426
  if (!naturalAnalog && polymerType) {
1330
1427
  const mostSimilar = await mostSimilarNaturalAnalog(capSmiles(smiles, rGroups), polymerType);
1331
1428
  if (mostSimilar)
1332
1429
  naturalAnalog = mostSimilar;
1333
1430
  }
1334
- } catch (_) {
1335
- rGroups ??= [];
1431
+ } catch (e) {
1432
+ console.error(e);
1336
1433
  }
1337
1434
 
1338
1435
  return {