npm - @datagrok/bio - Versions diffs - 2.25.10 → 2.25.12 - Mend

@datagrok/bio 2.25.10 → 2.25.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

package/CHANGELOG.md +10 -0
package/detectors.js +1 -0
package/dist/242.js +1 -1
package/dist/242.js.map +1 -1
package/dist/284.js.map +1 -1
package/dist/455.js +1 -1
package/dist/455.js.map +1 -1
package/dist/705.js +1 -1
package/dist/705.js.map +1 -1
package/dist/980.js.map +1 -1
package/dist/package-test.js +5 -5
package/dist/package-test.js.map +1 -1
package/dist/package.js +3 -3
package/dist/package.js.map +1 -1
package/package.json +3 -4
package/src/analysis/sequence-similarity-viewer.ts +4 -4
package/src/tests/to-atomic-level-tests.ts +121 -1
package/src/utils/biln.ts +10 -2
package/src/utils/convert.ts +62 -16
package/src/utils/helm-to-molfile/converter/mol-atoms.ts +5 -0
package/src/utils/helm-to-molfile/converter/mol-bonds.ts +5 -0
package/src/utils/helm-to-molfile/converter/mol-wrapper.ts +4 -2
package/src/utils/helm-to-molfile/converter/monomer-wrapper.ts +24 -10
package/src/utils/helm-to-molfile/converter/r-group-handler.ts +100 -7
package/src/utils/helm-to-molfile/converter/types.ts +12 -0
package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts +108 -11
package/test-console-output-1.log +456 -417
package/test-record-1.mp4 +0 -0
package/webpack.config.js +22 -1

package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts CHANGED

@@ -634,14 +634,15 @@ export class MonomerManager implements IMonomerManager {
 function substituteCapsWithRGroupsSmiles(smiles: string, rGroups: RGroup[]) {
   let newSmiles = smiles;
   // first substitute all caps with R-groups with corresponding numbers
+  // like [Allyl:3] -> [*:3]
   rGroups.forEach((rGroup) => {
     const RNum = rGroup.label[1] ?? '1';
-    const capRegex = new RegExp(`\\[\\${rGroup.capGroupName}:${RNum}\\]`, 'g');
-    newSmiles = newSmiles.replace(capRegex, `[*:${RNum}]`);
+    newSmiles = newSmiles.replace(`[${rGroup.capGroupName}:${RNum}]`, `[*:${RNum}]`);
   });
   // during some conversions, atoms can end up written as isotopes in the SMILES string, like this: [2O]
   // replace all [2O] with [*:2]; there can also be two atoms, like [2OH] -> [*:2]
+  // for searching purposes: 'ISO'
   const isotopeRegex = /\[\d[A-Z]{1,2}\]/g;
   newSmiles = newSmiles.replaceAll(isotopeRegex, (match) => {
     const rGroupNum = match[1];
@@ -697,6 +698,14 @@ function getCaseInvariantValue<T>(obj: { [key: string]: T }, key: string): T | u
   return obj[caseInvariantKey];
 }
+function setCaseInvariantValue<T>(obj: { [key: string]: T }, key: string, value: T): void {
+  const caseInvariantKey = Object.keys(obj).find((k) => k.toLowerCase() === key.toLowerCase());
+  if (caseInvariantKey)
+    obj[caseInvariantKey] = value;
+  else
+    obj[key] = value;
+}
 // some r groups for some monomers can lack smiles, or something else :D this function will try to fix that
 function resolveRGroupInfo(rgps: RGroup[]): RGroup[] {
   return (rgps.map((rg) => {
@@ -1197,12 +1206,24 @@ export function findLastIndex<T>(ar: ArrayLike<T>, pred: (el: T) => boolean): nu
   return foundIdx;
 }
+function replaceAllylsInSmiles(smiles: string): string {
+  // The original SMILES can include something like [Allyl:3], which is invalid SMILES and needs to be replaced with [*:3].
+  // The R-group number is a single digit.
+  const allylRegex = /\[Allyl\:\d\]/g;
+  return smiles.replaceAll(allylRegex, (match) => {
+    const rGroupNum = match[match.length - 2];
+    return `[*:${rGroupNum}]`;
+  });
+}
 /**NB! Can throw error */
 function getCorrectedSmiles(rgroups: RGroup[], smiles?: string, molBlock?: string): string {
+  if (smiles)
+    smiles = replaceAllylsInSmiles(smiles);
   const isSmilesMalformed = !smiles || !grok.chem.checkSmiles(smiles);
   if ((isSmilesMalformed) && !molBlock) throw new Error('Both SMILES and MOL block are empty or malformed');
-  let canonical = isSmilesMalformed ? grok.chem.convert(molBlock!, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles) : smiles;
+  let canonical = isSmilesMalformed ? grok.chem.convert(molBlock!, DG.chem.Notation.Unknown, DG.chem.Notation.Smiles) : smiles!;
   canonical = substituteCapsWithRGroupsSmiles(canonical, rgroups);
   canonical = fixRGroupsAsElementsSmiles(canonical);
@@ -1215,15 +1236,23 @@ export function getCorrectedMolBlock(molBlock: string) {
   // 1. RGP field is present at the end, before the M END line
   // 2. RGP field is present in the correct format
   // 3. R group labels are written as R# and not just R
-  // 4. there is no ISO field in the molblock. if there is, it needs to be substituted with RGP field and thats it.
+  // 4. there is no incorrect ISO field in the molblock.
+  // If there is, it needs to be substituted with an RGP field, and that's it. Sometimes R groups from SMILES get written as isotopes in the molblock.
+  // If the number in ISO is more than 8, chances are that it is actually an ISO field; otherwise, it is likely an R group.
   // 5. make sure that R groups have no metadata in the atomblocks
   const lines = molBlock.split('\n');
   const isoLineIdx = lines.findIndex((line) => line.startsWith('M') && line.includes('ISO'));
   if (isoLineIdx !== -1) {
-    const isoIndex = lines[isoLineIdx].indexOf('ISO');
-    lines[isoLineIdx] = lines[isoLineIdx].substring(0, isoIndex) + 'RGP' + lines[isoLineIdx].substring(isoIndex + 3);
+    // check the number in the ISO field; the line could look like this:
+    //M  ISO  1  25 208
+    // in this case, we have 208, which cannot be an R group; rather, it is an isotope of lead
+    const isoLineParts = lines[isoLineIdx].trim().split(' ').filter(Boolean);
+    if (isoLineParts.length >= 5 && Number.parseInt(isoLineParts[4]) < 9) {
+      const isoIndex = lines[isoLineIdx].indexOf('ISO');
+      lines[isoLineIdx] = lines[isoLineIdx].substring(0, isoIndex) + 'RGP' + lines[isoLineIdx].substring(isoIndex + 3);
+    }
   }
   const molStartIdx = lines.findIndex((line) => line.includes('V2000'));
@@ -1283,14 +1312,73 @@ export function getCorrectedMolBlock(molBlock: string) {
 // reverse of r-group substitution, will substitute rgroups with cap groups
 function capSmiles(smiles: string, rgroups: RGroup[]) {
   let newSmiles = smiles;
-  rgroups.forEach((rg) => {
-    const rgroupNum = rg.label[1] ?? '1';
+  rgroups.forEach((rg, i) => {
+    const rgroupNum = rg.label[1] ?? `${i + 1}`; // if label is not in format R#, use index as number
     const capGroupName = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_NAME);
-    newSmiles = newSmiles.replace(`[*:${rgroupNum}]`, `[${capGroupName}]`);
+    const capGroupSmiles = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE);
+    if (capGroupName?.toLowerCase() === 'allyl' || capGroupSmiles?.replace(`[*:${rgroupNum}]`, '')?.toLowerCase() === 'c=c' || capGroupSmiles?.replace(`[*:${rgroupNum}]`, '')?.toLowerCase() === '[c=c]')
+      newSmiles = newSmiles.replace(`[*:${rgroupNum}]`, 'C=C');
+    else
+      newSmiles = newSmiles.replace(`[*:${rgroupNum}]`, `[${capGroupName}]`);
   });
   return newSmiles;
 }
+/** Fixes the SMILES notation of R-groups.
+ * In some cases, instead of having O[*:1], we get something like [O:2], or O[R1], or [*1].
+ * Modifies the R-groups in place.
+ */
+function correctRGroupsSmiles(rgroups: RGroup[]) {
+  // case 1: [O:2] -> O[*:2]
+  function replaceAtomMapping(smi: string, num: number) {
+    const regex = /\[([A-Z][a-z]?):(\d+)\]/g;
+    return smi.replace(regex, (match, atom, n) => {
+      if (parseInt(n) === num) {
+        const correctedAtom = atom?.toLowerCase && atom?.toLowerCase() === 'h' ? '[H]' : atom; // special case for hydrogen, as H[*:1] is not valid, but [H][*:1] is valid
+        return `${correctedAtom}[*:${n}]`;
+      }
+      return match;
+    });
+  }
+  rgroups.forEach((rg, i) => {
+    const capGroupSmiles = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE);
+    const rGroupNum = rg.label[1] ?? `${i + 1}`; // if label is not in format R#, use index as number
+    if (!capGroupSmiles || capGroupSmiles.includes(`[*:${rGroupNum}]`))
+      return;
+    const atomLabeledSmilesRegex = new RegExp(`\\[[A-Za-z]{1,2}:${rGroupNum}\\]`);
+    if (capGroupSmiles.includes(`[*${rGroupNum}]`)) // case 2: [*1] -> [*:1]
+      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, capGroupSmiles.replace(`[*${rGroupNum}]`, `[*:${rGroupNum}]`));
+    else if (capGroupSmiles.includes(`[R${rGroupNum}]`)) // case 3: O[R1] -> O[*:1]
+      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, capGroupSmiles.replace(`[R${rGroupNum}]`, `[*:${rGroupNum}]`));
+    else if (atomLabeledSmilesRegex.test(capGroupSmiles)) // case 4: [O:2] -> O[*:2]
+      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, replaceAtomMapping(capGroupSmiles, Number.parseInt(rGroupNum)));
+    else if (`[${capGroupSmiles}]`.match(atomLabeledSmilesRegex)?.[0].length === capGroupSmiles.length + 2) // case 5: O:2 -> O[*:2], no brackets at all
+      setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, replaceAtomMapping(`[${capGroupSmiles}]`, Number.parseInt(rGroupNum)));
+  });
+  // finally, there is a case of incorrectly written r group smiles, we can see cases like this:
+  /**
+   * {
+        "alternateId": "R3-Br",
+        "capGroupName": "Br",
+        "capGroupSMILES": "[*:3]",
+        "label": "R3"
+      }
+   */
+  rgroups.forEach((rg) => {
+    const capGroupSmiles = getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE)!;
+    const match = capGroupSmiles.match(/\[\*\:?(\d+)\]/);
+    if (match) {
+      const rGroupSmiles = match[0];
+      if (capGroupSmiles === rGroupSmiles) {
+        // if cap group smiles is exactly the same as r group smiles, it is likely that r group smiles were written in wrong way
+        setCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_SMILES_UPPERCASE, `${getCaseInvariantValue(rg, HELM_RGROUP_FIELDS.CAP_GROUP_NAME)}${capGroupSmiles}`);
+      }
+    }
+  });
+}
 async function monomerFromDfRow(dfRow: DG.Row): Promise<Monomer> {
   // hacky way for now, but meta object for now only supports key value pairs and not nested objects
   let metaJSON: any;
@@ -1323,16 +1411,25 @@ async function monomerFromDfRow(dfRow: DG.Row): Promise<Monomer> {
   let naturalAnalog = dfRow.get(MONOMER_DF_COLUMN_NAMES.NATURAL_ANALOG);
   const polymerType = dfRow.get(MONOMER_DF_COLUMN_NAMES.POLYMER_TYPE);
+  // parse r-groups
   let rGroups: RGroup[] = [];
   try {
     rGroups = JSON.parse(dfRow.get(MONOMER_DF_COLUMN_NAMES.R_GROUPS) ?? '[]');
+  } catch (_) {
+    rGroups ??= [];
+  }
+  // correct r group smiles if needed
+  correctRGroupsSmiles(rGroups);
+  try {
     if (!naturalAnalog && polymerType) {
       const mostSimilar = await mostSimilarNaturalAnalog(capSmiles(smiles, rGroups), polymerType);
       if (mostSimilar)
         naturalAnalog = mostSimilar;
     }
-  } catch (_) {
-    rGroups ??= [];
+  } catch (e) {
+    console.error(e);
   }
   return {