@datagrok/bio 2.27.3 → 2.27.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Davit Rizhinashvili",
6
6
  "email": "drizhinashvili@datagrok.ai"
7
7
  },
8
- "version": "2.27.3",
8
+ "version": "2.27.4",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -44,7 +44,7 @@
44
44
  ],
45
45
  "dependencies": {
46
46
  "@biowasm/aioli": "^3.1.0",
47
- "@datagrok-libraries/bio": "^5.63.7",
47
+ "@datagrok-libraries/bio": "^5.64.0",
48
48
  "@datagrok-libraries/chem-meta": "^1.2.9",
49
49
  "@datagrok-libraries/math": "^1.2.6",
50
50
  "@datagrok-libraries/ml": "^6.10.11",
@@ -13,6 +13,7 @@ import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
13
13
  import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
14
14
  import {getClusterMatrixWorker} from '@datagrok-libraries/math';
15
15
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/multi-column-dimensionality-reduction/types';
16
+ import {awaitCheck} from '@datagrok-libraries/test/src/test';
16
17
 
17
18
  const dataFn: string = 'samples/FASTA_PT_activity.csv';
18
19
 
@@ -117,25 +118,37 @@ export async function demoActivityCliffsCyclic() {
117
118
  ui.setUpdateIndicator(tv.root, true);
118
119
  try {
119
120
  const seqEncodingFunc = DG.Func.find({name: 'macromoleculePreprocessingFunction', package: 'Bio'})[0];
120
- const activityCliffsViewer = (await PackageFunctions.activityCliffs(
121
+ await PackageFunctions.activityCliffs(
121
122
  df, df.getCol('Sequence'), df.getCol('Activity'),
122
123
  96, DimReductionMethods.UMAP, MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE,
123
- seqEncodingFunc, {}, true)) as DG.ScatterPlotViewer;
124
- tv.dockManager.dock(activityCliffsViewer, DG.DOCK_TYPE.RIGHT, null, 'Activity Cliffs', 0.65);
125
- await DG.delay(100);
126
- const cliffsLink: HTMLButtonElement = $(activityCliffsViewer.root)
127
- .find('button.scatter_plot_link,cliffs_grid').get()[0] as HTMLButtonElement;
128
- cliffsLink.click();
129
- await DG.delay(100);
124
+ seqEncodingFunc, {}, true);
125
+
126
+ let scatterPlot: DG.Viewer | undefined;
127
+ await awaitCheck(() => {
128
+ for (const v of tv.viewers) {
129
+ if (v.type === DG.VIEWER.SCATTER_PLOT) {
130
+ scatterPlot = v;
131
+ return true;
132
+ }
133
+ }
134
+ return false;
135
+ }, '', 10000);
136
+
137
+ let link: HTMLCollectionOf<Element> | undefined;
138
+ await awaitCheck(() => {
139
+ link = scatterPlot!.root.getElementsByClassName('scatter_plot_link');
140
+ return link.length > 0;
141
+ }, '', 5000);
142
+ (link![0] as HTMLElement).click();
143
+ await DG.delay(500);
144
+
130
145
  tv.grid.props.rowHeight = 180;
131
146
  tv.grid.col('sequence') && (tv.grid.col('sequence')!.width = 300);
132
147
  tv.grid.col('structure') && (tv.grid.col('structure')!.width = 300);
133
148
  const cliffsGrid = Array.from(tv.viewers).find((v) => v !== tv.grid && v.type === DG.VIEWER.GRID) as DG.Grid;
134
149
  if (cliffsGrid) {
135
150
  cliffsGrid.props.rowHeight = 40;
136
- cliffsGrid.col('seq_diff')!.width = 600;
137
- tv.dockManager.dock(cliffsGrid, DG.DOCK_TYPE.DOWN, null, 'Cliffs', 0.35);
138
- tv.dockManager.dock(activityCliffsViewer, DG.DOCK_TYPE.RIGHT, null, 'Activity Cliffs', 0.55);
151
+ cliffsGrid.col('seq_diff') && (cliffsGrid.col('seq_diff')!.width = 600);
139
152
  }
140
153
  } catch (err: any) {
141
154
  handleError(err);
@@ -462,8 +462,12 @@ MWRSWY-CKHPMWRSWY-CKHP`;
462
462
  // sample_testHelm.csv
463
463
  // columns: ID,Test type,HELM string,Valid?,Mol Weight,Mol Formula,SMILES
464
464
  test('samplesTestHelmCsv', async () => {
465
+ // Alphabet size of 8 reflects splitterAsHelm's triplet-splitting of HELM
466
+ // RNA monomers (each `sugar(base)phosphate` becomes 3 tokens), which can
467
+ // collapse what used to be N distinct triple-tokens into a smaller union
468
+ // of {sugar(s), bases, phosphate(s)} symbols.
465
469
  await _testDf(readSamples(Samples.testHelmCsv), {
466
- 'HELM string': new PosCol(NOTATION.HELM, null, null, 9, true),
470
+ 'HELM string': new PosCol(NOTATION.HELM, null, null, 8, true),
467
471
  }, seqHelper);
468
472
  });
469
473
 
@@ -59,18 +59,22 @@ category('splitters', async () => {
59
59
  'D-Tyr_Et', 'D-Dap', 'dV', 'E', 'N', 'pnG', 'Phe_4Me'],
60
60
  ],
61
61
 
62
+ // splitterAsHelm triplet-splits HELM RNA monomers `sugar(base)phosphate`
63
+ // into 3 tokens, and `sugar(base)` (terminal-only) into 2. Standalone
64
+ // tokens that don't match either form (e.g. lone `P`, or non-terminal
65
+ // `R(U)` without a phosphate) are kept verbatim.
62
66
  testHelm1: [
63
67
  'RNA1{R(U)P.R(T)P.R(G)P.R(C)P.R(A)}$$$$',
64
- ['R(U)P', 'R(T)P', 'R(G)P', 'R(C)P', 'R(A)'],
68
+ ['R', 'U', 'P', 'R', 'T', 'P', 'R', 'G', 'P', 'R', 'C', 'P', 'R', 'A'],
65
69
  ],
66
70
 
67
71
  testHelm2: [
68
72
  'RNA1{P.R(U)P.R(T)}$$$$',
69
- ['P', 'R(U)P', 'R(T)'],
73
+ ['P', 'R', 'U', 'P', 'R', 'T'],
70
74
  ],
71
75
  testHelm3: [
72
- 'RNA1{P.R(U).P.R(T)}$$$$',
73
- ['P', 'R(U)', 'P', 'R(T)'],
76
+ 'RNA1{P.R(U).P.R(T)}$$$$', // invalid HELM: non-terminal R(U) has no attached phosphate, so it is kept verbatim
77
+ ['P', 'R(U)', 'P', 'R', 'T'],
74
78
  ],
75
79
  };
76
80
 
@@ -356,6 +356,150 @@ PEPTIDE1{Lys_Boc.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2.Thr_PO3H2.Aca.Tyr
356
356
  }
357
357
  });
358
358
 
359
+ /** Tests for the linear HELM-RNA path: must preserve modified sugars,
360
+ * phosphates, and bases per nucleotide. The non-linear (HELM via POM)
361
+ * path is the reference; the linear path is expected to match it on
362
+ * canonical SMILES for these inputs. */
363
+ category('toAtomicLevelHelmRna', async () => {
364
+ let monomerLibHelper: IMonomerLibHelper;
365
+ let userLibSettings: UserLibSettings;
366
+ let seqHelper: ISeqHelper;
367
+ let monomerLib: IMonomerLib;
368
+ let rdKitModule: RDModule;
369
+
370
+ before(async () => {
371
+ rdKitModule = await getRdKitModule();
372
+ seqHelper = await getSeqHelper();
373
+ monomerLibHelper = await getMonomerLibHelper();
374
+ userLibSettings = await getUserLibSettings();
375
+ await monomerLibHelper.loadMonomerLibForTests();
376
+ monomerLib = monomerLibHelper.getMonomerLib();
377
+ });
378
+
379
+ after(async () => {
380
+ await setUserLibSettings(userLibSettings);
381
+ await monomerLibHelper.loadMonomerLib(true);
382
+ });
383
+
384
+ /** Build a single-row HELM RNA dataframe and run the linear converter,
385
+ * returning the canonical SMILES of the resulting molfile. */
386
+ async function helmRnaLinearToSmiles(srcHelm: string): Promise<string> {
387
+ const srcCsv = `seq\n${srcHelm}`;
388
+ const df = DG.DataFrame.fromCsv(srcCsv);
389
+ await grok.data.detectSemanticTypes(df);
390
+ const seqCol = df.getCol('seq');
391
+ expect(seqCol.semType, DG.SEMTYPE.MACROMOLECULE);
392
+
393
+ const res = await _toAtomicLevel(df, seqCol, monomerLib, seqHelper, rdKitModule);
394
+ if (!res.molCol)
395
+ throw new Error(`_toAtomicLevel returned no molCol for HELM '${srcHelm}'. ` +
396
+ `Warnings: ${(res.warnings ?? []).join(' / ')}`);
397
+
398
+ const molfile: string | null = res.molCol.get(0);
399
+ if (!molfile)
400
+ throw new Error(`_toAtomicLevel produced an empty molfile for HELM '${srcHelm}'`);
401
+ let smiles: string;
402
+ try {
403
+ smiles = grok.chem.convert(molfile, grok.chem.Notation.Unknown, grok.chem.Notation.Smiles);
404
+ } catch (err: any) {
405
+ throw new Error(`SMILES conversion threw for HELM '${srcHelm}': ${err?.message ?? err}\n` +
406
+ `--- MOLFILE START ---\n${molfile}\n--- MOLFILE END ---`);
407
+ }
408
+ // RDKit signals a parse failure by returning the literal string
409
+ // "MALFORMED_INPUT_VALUE" — surface it together with the offending molfile.
410
+ if (smiles === 'MALFORMED_INPUT_VALUE' || /^MALFORMED/.test(smiles)) {
411
+ throw new Error(`RDKit could not parse molfile produced for HELM '${srcHelm}'.\n` +
412
+ `--- MOLFILE START ---\n${molfile}\n--- MOLFILE END ---`);
413
+ }
414
+ return smiles;
415
+ }
416
+
417
+ // Unmodified RNA HELM — regression baseline. The linear path must produce
418
+ // a real RNA backbone (sugar + phosphate + base per nucleotide), not just
419
+ // a chain of bases.
420
+ test('rna-canonical', async () => {
421
+ const smiles = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p.r(G)p}$$$$`);
422
+ // Should at minimum contain a phosphate (P) and a purine ring; ribose oxygens are expected too but are not asserted here.
423
+ expect(/P/.test(smiles), true, `expected phosphate in SMILES: ${smiles}`);
424
+ // Purine fragment (any ring-closure digit): `n<d>cnc<d>`, or `n<d>cnc` matched case-insensitively.
425
+ expect(/n\dcnc\d/.test(smiles) || /n\dcnc/i.test(smiles), true,
426
+ `expected purine ring fragment in SMILES: ${smiles}`);
427
+ });
428
+
429
+ // Modified base — 5-methylcytosine. Linear path should preserve the
430
+ // methyl branch on the cytidine of position 0.
431
+ test('rna-modified-base', async () => {
432
+ const smilesPlain = await helmRnaLinearToSmiles(`RNA1{r(C)p.r(A)p}$$$$`);
433
+ const smilesMod = await helmRnaLinearToSmiles(`RNA1{r([m5C])p.r(A)p}$$$$`);
434
+ expect(smilesPlain !== smilesMod, true,
435
+ `m5C must change the SMILES vs. plain C. plain=${smilesPlain} mod=${smilesMod}`);
436
+ });
437
+
438
+ // Modified phosphate — phosphorothioate. The linker between positions 0
439
+ // and 1 must change (S replaces a non-bridging O).
440
+ test('rna-modified-phosphate', async () => {
441
+ const smilesPlain = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p}$$$$`);
442
+ const smilesMod = await helmRnaLinearToSmiles(`RNA1{r(A)[Rsp].r(C)p}$$$$`);
443
+ expect(smilesPlain !== smilesMod, true,
444
+ `Rsp phosphorothioate must change the SMILES vs. plain p. plain=${smilesPlain} mod=${smilesMod}`);
445
+ expect(/S/.test(smilesMod), true,
446
+ `expected sulfur in phosphorothioate SMILES: ${smilesMod}`);
447
+ // HELM explicitly wrote 2 phosphates (one Rsp at position 0, one p at
448
+ // position 1); both must appear in the molecule, so two P atoms total.
449
+ const pCountPlain = (smilesPlain.match(/P/g) || []).length;
450
+ const pCountMod = (smilesMod.match(/P/g) || []).length;
451
+ expect(pCountPlain, 2, `expected 2 phosphates in plain: ${smilesPlain}`);
452
+ expect(pCountMod, 2, `expected 2 phosphates in modified: ${smilesMod}`);
453
+ });
454
+
455
+ // Modified sugar — 2'-fluoro ribose. Position 0 sugar gets a fluorine.
456
+ test('rna-modified-sugar', async () => {
457
+ const smilesPlain = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p}$$$$`);
458
+ const smilesMod = await helmRnaLinearToSmiles(`RNA1{[fl2r](A)p.r(C)p}$$$$`);
459
+ expect(smilesPlain !== smilesMod, true,
460
+ `fl2r (2'-F ribose) must change the SMILES vs. plain r. plain=${smilesPlain} mod=${smilesMod}`);
461
+ expect(/F/.test(smilesMod), true,
462
+ `expected fluorine in 2'-F ribose SMILES: ${smilesMod}`);
463
+ });
464
+
465
+ // HELM omits the trailing phosphate (3'-OH terminus on the sugar). The
466
+ // splitter must split the partial `r(C)` into [r, C], assembly must skip
467
+ // the trailing P emit, and counts must agree.
468
+ test('rna-no-trailing-phosphate', async () => {
469
+ const smilesWith = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p}$$$$`);
470
+ const smilesNoTail = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)}$$$$`);
471
+ // Both should produce valid molecules with at least one P (the linker
472
+ // between the two nucleotides is always present).
473
+ expect(/P/.test(smilesNoTail), true,
474
+ `expected the inter-nucleotide phosphate to remain: ${smilesNoTail}`);
475
+ // The version WITH trailing phosphate should have exactly one more P
476
+ // atom than the version without.
477
+ const pCountWith = (smilesWith.match(/P/g) || []).length;
478
+ const pCountNoTail = (smilesNoTail.match(/P/g) || []).length;
479
+ expect(pCountWith, pCountNoTail + 1,
480
+ `expected pCountWith - pCountNoTail === 1, got with=${pCountWith}, noTail=${pCountNoTail}. ` +
481
+ `with=${smilesWith}, noTail=${smilesNoTail}`);
482
+ });
483
+
484
+ // Missing trailing phosphate combined with modifications.
485
+ test('rna-no-trailing-phosphate-with-modifications', async () => {
486
+ const smiles = await helmRnaLinearToSmiles(`RNA1{[fl2r]([m5C])[Rsp].r(A)}$$$$`);
487
+ expect(/F/.test(smiles), true, `expected fluorine: ${smiles}`);
488
+ expect(/S/.test(smiles), true, `expected sulfur: ${smiles}`);
489
+ // Exactly one phosphate (the Rsp linker), no trailing P.
490
+ const pCount = (smiles.match(/P/g) || []).length;
491
+ expect(pCount, 1, `expected exactly 1 phosphate: ${smiles}`);
492
+ });
493
+
494
+ // All three modifications combined. End-to-end smoke test.
495
+ test('rna-all-modifications', async () => {
496
+ const smiles = await helmRnaLinearToSmiles(`RNA1{[fl2r]([m5C])[Rsp].r(A)p}$$$$`);
497
+ expect(/F/.test(smiles), true, `expected fluorine: ${smiles}`);
498
+ expect(/S/.test(smiles), true, `expected sulfur: ${smiles}`);
499
+ expect(/P/.test(smiles), true, `expected phosphorus: ${smiles}`);
500
+ });
501
+ });
502
+
359
503
 
360
504
  function polishMolfile(mol: string): string {
361
505
  return mol.replaceAll('\r\n', '\n')
@@ -11,7 +11,7 @@ import {detectAlphabet, detectHelmAlphabet, splitterAsFastaSimple, StringListSeq
11
11
  import {mmDistanceFunctions, MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
12
12
  import {mmDistanceFunctionType} from '@datagrok-libraries/ml/src/macromolecule-distance-functions/types';
13
13
  import {getMonomerLibHelper, IMonomerLibHelper} from '@datagrok-libraries/bio/src/types/monomer-library';
14
- import {HELM_POLYMER_TYPE, HELM_WRAPPERS_REGEXP, PHOSPHATE_SYMBOL} from '@datagrok-libraries/bio/src/utils/const';
14
+ import {DEOXYRIBOSE_SYMBOL, HELM_POLYMER_TYPE, HELM_WRAPPERS_REGEXP, PHOSPHATE_SYMBOL, RIBOSE_SYMBOL} from '@datagrok-libraries/bio/src/utils/const';
15
15
  import {GAP_SYMBOL, GapOriginals} from '@datagrok-libraries/bio/src/utils/macromolecule/consts';
16
16
  import {CellRendererBackBase, GridCellRendererTemp} from '@datagrok-libraries/bio/src/utils/cell-renderer-back-base';
17
17
  import {HelmTypes} from '@datagrok-libraries/bio/src/helm/consts';
@@ -939,6 +939,11 @@ export class SeqHandler implements ISeqHandler {
939
939
 
940
940
  if (cm === GAP_SYMBOL)
941
941
  om = GapOriginals[NOTATION.FASTA];
942
+ // For HELM RNA, the splitter triplet-splits each nucleotide into
943
+ // [sugar, base, phosphate]; FASTA conversion keeps only the base, so
944
+ // drop standalone sugar/phosphate tokens.
945
+ else if (isHelm && (cm === PHOSPHATE_SYMBOL || cm === RIBOSE_SYMBOL || cm === DEOXYRIBOSE_SYMBOL))
946
+ om = '';
942
947
  else if (cm === PHOSPHATE_SYMBOL)
943
948
  om = '';
944
949
  else if (om.length > 1)
@@ -978,7 +983,9 @@ export class SeqHandler implements ISeqHandler {
978
983
  return joinToBiln(srcSS);
979
984
  }
980
985
 
981
- /** Splits Helm sequence adjusting nucleotides to single char symbols. (!) Removes lone phosphorus. */
986
+ /** Splits Helm sequence adjusting nucleotides to single char symbols. (!) Removes lone phosphate,
987
+ * ribose, and deoxyribose tokens (which the underlying splitter emits when triplet-splitting
988
+ * each nucleotide of an RNA chain). */
982
989
  private splitterAsHelmNucl(src: string): ISeqSplitted {
983
990
  const srcMList: ISeqSplitted = this.splitter(src);
984
991
  const tgtMList: (string | null)[] = new Array<string>(srcMList.length);
@@ -988,7 +995,8 @@ export class SeqHandler implements ISeqHandler {
988
995
  let om: string | null = srcMList.getOriginal(posIdx);
989
996
  if (isDna || isRna) {
990
997
  om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
991
- om = om === PHOSPHATE_SYMBOL ? null : om;
998
+ if (om === PHOSPHATE_SYMBOL || om === RIBOSE_SYMBOL || om === DEOXYRIBOSE_SYMBOL)
999
+ om = null;
992
1000
  }
993
1001
  tgtMList[posIdx] = om ? om : null;
994
1002
  }
@@ -1009,18 +1017,26 @@ export class SeqHandler implements ISeqHandler {
1009
1017
  // -- joiners --
1010
1018
 
1011
1019
  function joinToSeparator(seqS: ISeqSplitted, tgtSeparator: string, isHelm: boolean): string {
1012
- const resMList: string[] = new Array<string>(seqS.length);
1020
+ const resMList: string[] = [];
1013
1021
  for (let posIdx: number = 0; posIdx < seqS.length; ++posIdx) {
1014
1022
  const cm = seqS.getCanonical(posIdx);
1015
1023
  let om = seqS.getOriginal(posIdx);
1016
1024
  if (isHelm)
1017
1025
  om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
1018
1026
 
1019
- if (cm === GAP_SYMBOL)
1020
- om = GapOriginals[NOTATION.SEPARATOR];
1021
- else if (cm === PHOSPHATE_SYMBOL)
1022
- om = '';
1023
- resMList[posIdx] = om;
1027
+ if (cm === GAP_SYMBOL) {
1028
+ resMList.push(GapOriginals[NOTATION.SEPARATOR]);
1029
+ continue;
1030
+ }
1031
+ // For HELM RNA, the splitter triplet-splits each nucleotide into
1032
+ // [sugar, base, phosphate]; separator conversion keeps only the base, so
1033
+ // skip standalone sugar/phosphate tokens entirely (rather than emitting
1034
+ // an empty cell that would show up as an extra separator in the output).
1035
+ if (isHelm && (cm === PHOSPHATE_SYMBOL || cm === RIBOSE_SYMBOL || cm === DEOXYRIBOSE_SYMBOL))
1036
+ continue;
1037
+ if (cm === PHOSPHATE_SYMBOL)
1038
+ continue;
1039
+ resMList.push(om);
1024
1040
  }
1025
1041
  return resMList.join(tgtSeparator);
1026
1042
  }