@datagrok/bio 2.27.8 → 2.27.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -381,9 +381,13 @@ category('toAtomicLevelHelmRna', async () => {
381
381
  await monomerLibHelper.loadMonomerLib(true);
382
382
  });
383
383
 
384
- /** Build a single-row HELM RNA dataframe and run the linear converter,
385
- * returning the canonical SMILES of the resulting molfile. */
386
- async function helmRnaLinearToSmiles(srcHelm: string): Promise<string> {
384
+ // ---------- helpers --------------------------------------------------------
385
+
386
+ /** Run the linear converter on a single HELM, returning both the molfile
387
+ * and canonical SMILES. The molfile is the source of truth for structural
388
+ * checks (atom indices, coordinates); the SMILES is kept for legacy /
389
+ * presence-style assertions. */
390
+ async function helmRnaLinear(srcHelm: string): Promise<{molfile: string; smiles: string}> {
387
391
  const srcCsv = `seq\n${srcHelm}`;
388
392
  const df = DG.DataFrame.fromCsv(srcCsv);
389
393
  await grok.data.detectSemanticTypes(df);
@@ -405,206 +409,481 @@ category('toAtomicLevelHelmRna', async () => {
405
409
  throw new Error(`SMILES conversion threw for HELM '${srcHelm}': ${err?.message ?? err}\n` +
406
410
  `--- MOLFILE START ---\n${molfile}\n--- MOLFILE END ---`);
407
411
  }
408
- // RDKit signals a parse failure by returning the literal string
409
- // "MALFORMED_INPUT_VALUE" — surface it together with the offending molfile.
410
412
  if (smiles === 'MALFORMED_INPUT_VALUE' || /^MALFORMED/.test(smiles)) {
411
413
  throw new Error(`RDKit could not parse molfile produced for HELM '${srcHelm}'.\n` +
412
414
  `--- MOLFILE START ---\n${molfile}\n--- MOLFILE END ---`);
413
415
  }
414
- return smiles;
416
+ return {molfile, smiles};
417
+ }
418
+
419
+ /** Build an RDKit `RDMol` from the molfile, run `fn`, and free the mol.
420
+ * Always pass the produced molfile (not its SMILES round-trip) — atom
421
+ * indices and coordinates here are the same ones we want to assert on. */
422
+ function withMol<T>(molfile: string, fn: (mol: any) => T): T {
423
+ const mol = rdKitModule.get_mol(molfile);
424
+ if (!mol || !mol.is_valid())
425
+ throw new Error(`RDKit refused the produced molfile:\n${molfile}`);
426
+ try {
427
+ return fn(mol);
428
+ } finally {
429
+ mol.delete();
430
+ }
431
+ }
432
+
433
+ /** True iff the molecule contains at least one match of the SMARTS query. */
434
+ function hasSmarts(mol: any, smarts: string): boolean {
435
+ const qmol = rdKitModule.get_qmol(smarts);
436
+ try {
437
+ const raw = mol.get_substruct_match(qmol);
438
+ // RDKit JS returns the literal '{}' when there is no match.
439
+ return !!raw && raw !== '{}';
440
+ } finally {
441
+ qmol.delete();
442
+ }
443
+ }
444
+
445
+ /** Number of distinct matches of the SMARTS query in the molecule.
446
+ * `get_substruct_matches` returns either '{}' (no match), a JSON array
447
+ * of `{atoms,bonds}` objects, or — depending on the build — a single
448
+ * match object. Normalise all three. */
449
+ function countSmarts(mol: any, smarts: string): number {
450
+ const qmol = rdKitModule.get_qmol(smarts);
451
+ try {
452
+ const raw = mol.get_substruct_matches(qmol);
453
+ if (!raw || raw === '{}') return 0;
454
+ const parsed = JSON.parse(raw);
455
+ if (Array.isArray(parsed)) return parsed.length;
456
+ // Single-match object
457
+ if (parsed && typeof parsed === 'object' && Array.isArray(parsed.atoms))
458
+ return parsed.atoms.length > 0 ? 1 : 0;
459
+ return 0;
460
+ } finally {
461
+ qmol.delete();
462
+ }
463
+ }
464
+
465
+ /** Atoms-by-element via a single-atom SMARTS — strictly counts the heavy
466
+ * element (no false positives from `[Pa]`, `Si`, etc. that plain regex
467
+ * on SMILES would produce). */
468
+ function countAtoms(mol: any, atomicNumber: number): number {
469
+ return countSmarts(mol, `[#${atomicNumber}]`);
470
+ }
471
+
472
+ /** SMARTS shortcuts used by several tests below. Bracketed atom specs are
473
+ * deliberately permissive — the produced SMILES may render an atom
474
+ * aromatic or kekulised depending on context. */
475
+ const SMARTS = {
476
+ // Generic phosphodiester backbone: C-O-P(=O)(X)-O-C with both bridging
477
+ // oxygens present. X covers OH / O- (canonical p), SH / S- (sp), etc.
478
+ PHOSPHODIESTER:
479
+ '[#6][OX2][PX4](=[OX1])([OX2,SX2,OX1H,SX1H,OX1-,SX1-])[OX2][#6]',
480
+ // Same but the non-bridging substituent is sulfur — phosphorothioate.
481
+ PHOSPHOROTHIOATE_DIESTER:
482
+ '[#6][OX2][PX4](=[OX1])([SX2,SX1H,SX1-])[OX2][#6]',
483
+ // Direct sp3 C-P bond — appears ONLY when a bridging O on the linker
484
+ // R-side has been (incorrectly) removed.
485
+ DIRECT_C_P: '[CX4][PX4]',
486
+ // Five-membered ring with exactly one ring oxygen — furanose.
487
+ FURANOSE: '[#6;R]1[#6;R][#6;R][#6;R][O;R]1',
488
+ // Adenine bicyclic core (aromatic Kekule-tolerant).
489
+ ADENINE_RING: 'n1cnc2c1ncnc2N',
490
+ // Cytosine 4-amino-pyrimidone.
491
+ CYTOSINE_RING: 'Nc1ccn[cH0](=O)n1',
492
+ // m5C: cytosine with a methyl at position 5.
493
+ METHYL_CYTOSINE: '[CH3]c1cn([!#1])c(=O)nc1N',
494
+ // 2'-fluoro on a sugar ring carbon (fl2r marker). Just `F` on a ring
495
+ // sp3 C — no other monomer in our tests has fluorine, so this is
496
+ // unambiguous; ring-position-specific patterns are too brittle to ring
497
+ // traversal direction.
498
+ FLUORO_ON_FURANOSE: '[F][CX4;R]',
499
+ // Acetamide N-C(=O)-CH3 — GalNAc / N-acetyl marker.
500
+ N_ACETYL: '[NX3]C(=O)[CH3]',
501
+ // LNA-only marker: an sp3 carbon shared between two rings (R2). Plain
502
+ // riboses have no such atom; LNA's bicyclic core puts C2', C3', C4'
503
+ // each in two rings.
504
+ LNA_BRIDGEHEAD: '[#6;R2]',
505
+ // Methyl ether on a ring carbon (2'-OMe, the `m` ribose marker).
506
+ TWO_PRIME_OME: '[CH3][OX2][#6;R]',
507
+ // Biotin's cyclic urea (ureido) — 5-mem ring with N-C(=O)-N-C-C
508
+ // pattern. The two C ring atoms are also bridgeheads to biotin's
509
+ // thiolane ring (containing S), but we check that with a separate
510
+ // ring-S query so this SMARTS stays robust to atom-order variations.
511
+ BIOTIN_UREIDO: '[#7;R]1[#6;R](=[OX1])[#7;R][#6;R][#6;R]1',
512
+ // Cholesterol gonane: four fused rings (3 cyclohexane + 1 cyclopentane).
513
+ // Tested via two ring-counting heuristics rather than one rigid pattern,
514
+ // see `looksLikeSteroid` below.
515
+ } as const;
516
+
517
+ /** Cholesterol detection: gonane has 4 fused rings; the D ring is a
518
+ * cyclopentane (5-mem all-carbon) and the rest are cyclohexanes. None
519
+ * of the other monomers we test against — sugars (always have a ring O),
520
+ * nucleobases (always have N), biotin (5-mem rings have N or S), LNA
521
+ * (5-mem rings have O) — produce an all-carbon 5-mem ring, so this
522
+ * SMARTS is unique to steroids. We additionally require ≥ 4 ring
523
+ * carbons in two rings (R2) to confirm a fused polycyclic system, not
524
+ * an isolated cyclopentane. */
525
+ function looksLikeSteroid(mol: any): boolean {
526
+ const cyclopentane = hasSmarts(mol, '[#6]1[#6][#6][#6][#6]1');
527
+ const fusedRingAtoms = countSmarts(mol, '[#6;R2]');
528
+ return cyclopentane && fusedRingAtoms >= 4;
529
+ }
530
+
531
+ /** Parse a V3K molblock atom block into 0-indexed coordinate records.
532
+ * The element symbol and x/y are sufficient for layout assertions; we
533
+ * deliberately ignore z, charges, isotopes, etc. */
534
+ function parseV3KAtoms(molfile: string): { element: string; x: number; y: number }[] {
535
+ const atoms: { element: string; x: number; y: number }[] = [];
536
+ const begin = molfile.indexOf('M V30 BEGIN ATOM');
537
+ if (begin < 0) return atoms;
538
+ const end = molfile.indexOf('M V30 END ATOM', begin);
539
+ const block = molfile.substring(begin, end >= 0 ? end : molfile.length);
540
+ const lineRe = /^M\s+V30\s+(\d+)\s+(\S+)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)/gm;
541
+ let m: RegExpExecArray | null;
542
+ while ((m = lineRe.exec(block))) {
543
+ const idx = parseInt(m[1]) - 1;
544
+ // Atoms are emitted in order; sanity check.
545
+ if (idx !== atoms.length) continue;
546
+ atoms.push({element: m[2], x: parseFloat(m[3]), y: parseFloat(m[4])});
547
+ }
548
+ return atoms;
549
+ }
550
+
551
+ /** Run a SMARTS against the molecule and collect every atom index that
552
+ * appears in any match. Used to bin atoms by role (sugar / base / etc.). */
553
+ function collectMatchedAtoms(mol: any, smarts: string): Set<number> {
554
+ const set = new Set<number>();
555
+ const qmol = rdKitModule.get_qmol(smarts);
556
+ try {
557
+ const raw = mol.get_substruct_matches(qmol);
558
+ if (!raw || raw === '{}') return set;
559
+ const parsed = JSON.parse(raw);
560
+ const list = Array.isArray(parsed) ? parsed : [parsed];
561
+ for (const m of list)
562
+ for (const a of (m?.atoms ?? [])) set.add(a as number);
563
+ } finally {
564
+ qmol.delete();
565
+ }
566
+ return set;
567
+ }
568
+
569
+ /** Layout assertion: every atom in any nucleobase ring sits at a higher
570
+ * Y than every sugar (furanose) ring atom. With the abnormal-sugar
571
+ * override, the base is placed above the topmost atom of the sugar
572
+ * cluster — including LNA's 2',4'-bridge oxygen / CH2. Without the
573
+ * override the LNA bridge sits ABOVE the base attachment point and
574
+ * this assertion fails. */
575
+ function expectBaseAboveSugar(molfile: string): void {
576
+ const atoms = parseV3KAtoms(molfile);
577
+ if (atoms.length === 0) throw new Error(`failed to parse molblock atoms`);
578
+ withMol(molfile, (mol) => {
579
+ const sugarIdx = collectMatchedAtoms(mol, SMARTS.FURANOSE);
580
+ // Base atoms = aromatic ring atoms (purines and pyrimidines aromatize
581
+ // in RDKit's perception). Sugars are sp3, won't match `[a]`.
582
+ const baseIdx = collectMatchedAtoms(mol, '[a]');
583
+ if (sugarIdx.size === 0)
584
+ throw new Error('no furanose ring atoms found — cannot verify layout');
585
+ if (baseIdx.size === 0)
586
+ throw new Error('no aromatic base atoms found — cannot verify layout');
587
+ let maxSugarY = -Infinity;
588
+ for (const i of sugarIdx) maxSugarY = Math.max(maxSugarY, atoms[i].y);
589
+ let minBaseY = Infinity;
590
+ for (const i of baseIdx) minBaseY = Math.min(minBaseY, atoms[i].y);
591
+ expect(minBaseY > maxSugarY, true,
592
+ `expected base atoms above sugar (minBaseY=${minBaseY.toFixed(3)}, ` +
593
+ `maxSugarY=${maxSugarY.toFixed(3)})`);
594
+ });
415
595
  }
416
596
 
417
597
  // Unmodified RNA HELM — regression baseline. The linear path must produce
418
- // a real RNA backbone (sugar + phosphate + base per nucleotide), not just
419
- // a chain of bases.
598
+ // a real RNA backbone: a furanose ring per nucleotide, two inter-nucleotide
599
+ // phosphodiester linkers (C-O-P(=O)(O)-O-C) for three nucleotides, and
600
+ // recognisable purine / pyrimidine bases attached to the sugars.
420
601
  test('rna-canonical', async () => {
421
- const smiles = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p.r(G)p}$$$$`);
422
- // Should at minimum contain phosphate (P), ribose oxygens, and a purine ring.
423
- expect(/P/.test(smiles), true, `expected phosphate in SMILES: ${smiles}`);
424
- // Purine fragment (any ring closure digit): n<d>cnc<d> or N<d>C=N (case insensitive).
425
- expect(/n\dcnc\d/.test(smiles) || /n\dcnc/i.test(smiles), true,
426
- `expected purine ring fragment in SMILES: ${smiles}`);
602
+ const {molfile} = await helmRnaLinear(`RNA1{r(A)p.r(C)p.r(G)p}$$$$`);
603
+ withMol(molfile, (mol) => {
604
+ // 3 ribose furanose rings (one per nucleotide).
605
+ const furanoses = countSmarts(mol, SMARTS.FURANOSE);
606
+ expect(furanoses >= 3, true, `expected ≥ 3 furanose rings, got ${furanoses}`);
607
+ // Inter-nucleotide phosphodiesters: r-r and r-r joints, so ≥ 2.
608
+ // (The 3'-trailing P is a monoester and won't match the diester SMARTS.)
609
+ const diesters = countSmarts(mol, SMARTS.PHOSPHODIESTER);
610
+ expect(diesters >= 2, true,
611
+ `expected ≥ 2 inter-nucleotide phosphodiester linkers, got ${diesters}`);
612
+ // No direct sp3 C–P bond (would mean a bridging O was lost).
613
+ const directCP = countSmarts(mol, SMARTS.DIRECT_C_P);
614
+ expect(directCP, 0,
615
+ `expected 0 direct C-P bonds (chain must use C-O-P-O-C), got ${directCP}`);
616
+ // Purine ring (A and G are purines).
617
+ const purines = countSmarts(mol, SMARTS.ADENINE_RING);
618
+ expect(purines >= 1, true, `expected ≥ 1 purine ring, got ${purines}`);
619
+ // Total phosphorus count: 3 (one per nucleotide as written).
620
+ expect(countAtoms(mol, 15), 3, 'expected 3 phosphorus atoms');
621
+ });
427
622
  });
428
623
 
429
- // Modified base — 5-methylcytosine. Linear path should preserve the
430
- // methyl branch on the cytidine of position 0.
624
+ // Modified base — 5-methylcytosine. The methyl must end up at C5 of a
625
+ // cytosine ring (not just any methyl on any ring), and only one m5C
626
+ // appears in this row.
431
627
  test('rna-modified-base', async () => {
432
- const smilesPlain = await helmRnaLinearToSmiles(`RNA1{r(C)p.r(A)p}$$$$`);
433
- const smilesMod = await helmRnaLinearToSmiles(`RNA1{r([m5C])p.r(A)p}$$$$`);
434
- expect(smilesPlain !== smilesMod, true,
435
- `m5C must change the SMILES vs. plain C. plain=${smilesPlain} mod=${smilesMod}`);
628
+ const {molfile: plain} = await helmRnaLinear(`RNA1{r(C)p.r(A)p}$$$$`);
629
+ const {molfile: mod} = await helmRnaLinear(`RNA1{r([m5C])p.r(A)p}$$$$`);
630
+ withMol(plain, (mol) => {
631
+ // No 5-methyl-cytosine in the plain version.
632
+ expect(countSmarts(mol, SMARTS.METHYL_CYTOSINE), 0,
633
+ 'plain r(C) must not contain 5-methylcytosine');
634
+ });
635
+ withMol(mod, (mol) => {
636
+ // Exactly one m5C ring; cytosine ring still present.
637
+ expect(countSmarts(mol, SMARTS.METHYL_CYTOSINE), 1,
638
+ 'r([m5C]) must contain exactly one 5-methylcytosine ring');
639
+ });
436
640
  });
437
641
 
438
- // Modified phosphate — phosphorothioate. The linker between positions 0
439
- // and 1 must change (S replaces a non-bridging O).
642
+ // Modified phosphate — phosphorothioate (Rsp). The S MUST be on the
643
+ // phosphorus of the linker between positions 0 and 1 (not just somewhere
644
+ // in the molecule), the linker must remain a diester (both bridging O
645
+ // preserved), and the unmodified `p` at position 1 must stay unchanged.
440
646
  test('rna-modified-phosphate', async () => {
441
- const smilesPlain = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p}$$$$`);
442
- const smilesMod = await helmRnaLinearToSmiles(`RNA1{r(A)[Rsp].r(C)p}$$$$`);
443
- expect(smilesPlain !== smilesMod, true,
444
- `Rsp phosphorothioate must change the SMILES vs. plain p. plain=${smilesPlain} mod=${smilesMod}`);
445
- expect(/S/.test(smilesMod), true,
446
- `expected sulfur in phosphorothioate SMILES: ${smilesMod}`);
447
- // HELM explicitly wrote 2 phosphates (one Rsp at position 0, one p at
448
- // position 1); both must appear in the molecule, so two P atoms total.
449
- const pCountPlain = (smilesPlain.match(/P/g) || []).length;
450
- const pCountMod = (smilesMod.match(/P/g) || []).length;
451
- expect(pCountPlain, 2, `expected 2 phosphates in plain: ${smilesPlain}`);
452
- expect(pCountMod, 2, `expected 2 phosphates in modified: ${smilesMod}`);
453
- });
454
-
455
- // Modified sugar — 2'-fluoro ribose. Position 0 sugar gets a fluorine.
647
+ const {molfile: plain} = await helmRnaLinear(`RNA1{r(A)p.r(C)p}$$$$`);
648
+ const {molfile: mod} = await helmRnaLinear(`RNA1{r(A)[Rsp].r(C)p}$$$$`);
649
+ withMol(plain, (mol) => {
650
+ expect(countAtoms(mol, 16), 0, 'plain RNA must contain no sulfur');
651
+ expect(countAtoms(mol, 15), 2, 'expected 2 phosphates in plain');
652
+ expect(countSmarts(mol, SMARTS.PHOSPHODIESTER) >= 1, true,
653
+ 'plain inter-nucleotide diester must be present');
654
+ });
655
+ withMol(mod, (mol) => {
656
+ // Sulfur is on phosphorus, not somewhere else.
657
+ expect(hasSmarts(mol, '[PX4]=S') || hasSmarts(mol, '[PX4][SX2,SX1H,SX1-]'),
658
+ true, 'sulfur must be bonded to a phosphorus atom');
659
+ // Phosphorothioate diester has both bridging oxygens around the P.
660
+ expect(countSmarts(mol, SMARTS.PHOSPHOROTHIOATE_DIESTER), 1,
661
+ 'expected exactly one phosphorothioate diester linker');
662
+ // 2 phosphates total (Rsp + p).
663
+ expect(countAtoms(mol, 15), 2, 'expected 2 phosphates in modified');
664
+ // No direct C-P bond (regression check from sp/Rsp fix).
665
+ expect(countSmarts(mol, SMARTS.DIRECT_C_P), 0,
666
+ 'expected zero direct C-P bonds');
667
+ });
668
+ });
669
+
670
+ // Modified sugar — 2'-fluoro ribose. F must end up on a ring carbon of
671
+ // a furanose (i.e., a sugar atom), not on an arbitrary aliphatic carbon.
456
672
  test('rna-modified-sugar', async () => {
457
- const smilesPlain = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p}$$$$`);
458
- const smilesMod = await helmRnaLinearToSmiles(`RNA1{[fl2r](A)p.r(C)p}$$$$`);
459
- expect(smilesPlain !== smilesMod, true,
460
- `fl2r (2'-F ribose) must change the SMILES vs. plain r. plain=${smilesPlain} mod=${smilesMod}`);
461
- expect(/F/.test(smilesMod), true,
462
- `expected fluorine in 2'-F ribose SMILES: ${smilesMod}`);
673
+ const {molfile: plain} = await helmRnaLinear(`RNA1{r(A)p.r(C)p}$$$$`);
674
+ const {molfile: mod} = await helmRnaLinear(`RNA1{[fl2r](A)p.r(C)p}$$$$`);
675
+ withMol(plain, (mol) => {
676
+ expect(countAtoms(mol, 9), 0, 'plain RNA must contain no fluorine');
677
+ });
678
+ withMol(mod, (mol) => {
679
+ expect(countAtoms(mol, 9), 1, 'fl2r contributes exactly one fluorine');
680
+ // F is on a ring carbon of a furanose.
681
+ expect(countSmarts(mol, SMARTS.FLUORO_ON_FURANOSE) >= 1, true,
682
+ 'fluorine must be on a furanose ring carbon (2\'-F)');
683
+ // Furanose count unchanged (one ribose replaced by 2'-F ribose).
684
+ expect(countSmarts(mol, SMARTS.FURANOSE) >= 2, true,
685
+ 'expected ≥ 2 furanose rings');
686
+ });
463
687
  });
464
688
 
465
689
  // HELM omits the trailing phosphate (3'-OH terminus on the sugar). The
466
690
  // splitter must split the partial `r(C)` into [r, C], assembly must skip
467
691
  // the trailing P emit, and counts must agree.
468
692
  test('rna-no-trailing-phosphate', async () => {
469
- const smilesWith = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p}$$$$`);
470
- const smilesNoTail = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)}$$$$`);
471
- // Both should produce valid molecules with at least one P (the linker
472
- // between the two nucleotides is always present).
473
- expect(/P/.test(smilesNoTail), true,
474
- `expected the inter-nucleotide phosphate to remain: ${smilesNoTail}`);
475
- // The version WITH trailing phosphate should have exactly one more P
476
- // atom than the version without.
477
- const pCountWith = (smilesWith.match(/P/g) || []).length;
478
- const pCountNoTail = (smilesNoTail.match(/P/g) || []).length;
479
- expect(pCountWith, pCountNoTail + 1,
480
- `expected pCountWith - pCountNoTail === 1, got with=${pCountWith}, noTail=${pCountNoTail}. ` +
481
- `with=${smilesWith}, noTail=${smilesNoTail}`);
693
+ const {molfile: withTail} = await helmRnaLinear(`RNA1{r(A)p.r(C)p}$$$$`);
694
+ const {molfile: noTail} = await helmRnaLinear(`RNA1{r(A)p.r(C)}$$$$`);
695
+ const pCountWith = withMol(withTail, (mol) => countAtoms(mol, 15));
696
+ const pCountNoTail = withMol(noTail, (mol) => countAtoms(mol, 15));
697
+ expect(pCountWith, 2, 'with trailing P: 2 phosphates (1 linker + 1 trail)');
698
+ expect(pCountNoTail, 1, 'no trailing P: 1 phosphate (the linker only)');
699
+ withMol(noTail, (mol) => {
700
+ // The remaining phosphate is still a proper diester (both bridging O
701
+ // present, no direct C-P bond).
702
+ expect(countSmarts(mol, SMARTS.PHOSPHODIESTER), 1,
703
+ 'inter-nucleotide diester must still be present');
704
+ expect(countSmarts(mol, SMARTS.DIRECT_C_P), 0,
705
+ 'no direct C-P bond');
706
+ // Both furanose rings still present.
707
+ expect(countSmarts(mol, SMARTS.FURANOSE), 2, 'both furanose rings present');
708
+ });
482
709
  });
483
710
 
484
711
  // Missing trailing phosphate combined with modifications.
485
712
  test('rna-no-trailing-phosphate-with-modifications', async () => {
486
- const smiles = await helmRnaLinearToSmiles(`RNA1{[fl2r]([m5C])[Rsp].r(A)}$$$$`);
487
- expect(/F/.test(smiles), true, `expected fluorine: ${smiles}`);
488
- expect(/S/.test(smiles), true, `expected sulfur: ${smiles}`);
489
- // Exactly one phosphate (the Rsp linker), no trailing P.
490
- const pCount = (smiles.match(/P/g) || []).length;
491
- expect(pCount, 1, `expected exactly 1 phosphate: ${smiles}`);
713
+ const {molfile} = await helmRnaLinear(`RNA1{[fl2r]([m5C])[Rsp].r(A)}$$$$`);
714
+ withMol(molfile, (mol) => {
715
+ // 1 F (2'-F on the fl2r sugar), on a furanose carbon.
716
+ expect(countAtoms(mol, 9), 1, 'expected exactly 1 fluorine');
717
+ expect(countSmarts(mol, SMARTS.FLUORO_ON_FURANOSE), 1,
718
+ '2\'-F must be on a furanose ring carbon');
719
+ // 1 P, 1 S — single Rsp linker, no trailing P.
720
+ expect(countAtoms(mol, 15), 1, 'expected exactly 1 phosphorus (Rsp)');
721
+ expect(countAtoms(mol, 16), 1, 'expected exactly 1 sulfur (Rsp)');
722
+ // Linker is a phosphorothioate diester (both bridging O present).
723
+ expect(countSmarts(mol, SMARTS.PHOSPHOROTHIOATE_DIESTER), 1,
724
+ 'Rsp linker must remain a phosphorothioate diester');
725
+ // m5C base present.
726
+ expect(countSmarts(mol, SMARTS.METHYL_CYTOSINE), 1,
727
+ 'expected one 5-methylcytosine base');
728
+ });
492
729
  });
493
730
 
494
- // All three modifications combined. End-to-end smoke test.
731
+ // All three modifications combined. End-to-end smoke test — every
732
+ // modification's structural fingerprint must be detectable.
495
733
  test('rna-all-modifications', async () => {
496
- const smiles = await helmRnaLinearToSmiles(`RNA1{[fl2r]([m5C])[Rsp].r(A)p}$$$$`);
497
- expect(/F/.test(smiles), true, `expected fluorine: ${smiles}`);
498
- expect(/S/.test(smiles), true, `expected sulfur: ${smiles}`);
499
- expect(/P/.test(smiles), true, `expected phosphorus: ${smiles}`);
734
+ const {molfile} = await helmRnaLinear(`RNA1{[fl2r]([m5C])[Rsp].r(A)p}$$$$`);
735
+ withMol(molfile, (mol) => {
736
+ expect(countSmarts(mol, SMARTS.FLUORO_ON_FURANOSE), 1,
737
+ 'fl2r: 2\'-F on furanose');
738
+ expect(countSmarts(mol, SMARTS.METHYL_CYTOSINE), 1,
739
+ 'm5C: 5-methylcytosine');
740
+ expect(countSmarts(mol, SMARTS.PHOSPHOROTHIOATE_DIESTER), 1,
741
+ 'Rsp: phosphorothioate diester');
742
+ expect(countAtoms(mol, 15), 2, 'two phosphates (Rsp + trailing p)');
743
+ expect(countAtoms(mol, 16), 1, 'exactly one sulfur (from Rsp)');
744
+ });
500
745
  });
501
746
 
502
747
  // 3'-end terminal modifier (GalNAc, R1 only). HELM puts it in the
503
748
  // "phosphate" slot of the last triple, but it's actually a chain end.
504
- // Expectations: chain ends at GalNAc (no extra OH cap), no phosphate
505
- // at all, GalNAc structural features (acetamide N) are present.
749
+ // GalNAc carries an N-acetyl group that's the structural fingerprint
750
+ // the test should pin to (not "any nitrogen", which thymine satisfies).
506
751
  test('rna-helm-3p-terminal-galnac', async () => {
507
- const smiles = await helmRnaLinearToSmiles(`RNA1{r(T)[GalNAc]}$$$$V2.0`);
508
- const pCount = (smiles.match(/P/g) || []).length;
509
- expect(pCount, 0, `expected 0 phosphates (GalNAc replaces P): ${smiles}`);
510
- expect(/N/.test(smiles), true, `expected nitrogen from GalNAc acetamide: ${smiles}`);
511
- // Sanity: SMILES should not be RDKit's parse-failure sentinel.
512
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
513
- `valid SMILES expected: ${smiles}`);
752
+ const {molfile} = await helmRnaLinear(`RNA1{r(T)[GalNAc]}$$$$V2.0`);
753
+ withMol(molfile, (mol) => {
754
+ // No phosphate at all (GalNAc replaces the trailing P slot).
755
+ expect(countAtoms(mol, 15), 0, 'GalNAc terminus: no P expected');
756
+ // Acetamide group from GalNAc must be present.
757
+ expect(countSmarts(mol, SMARTS.N_ACETYL) >= 1, true,
758
+ 'expected N-acetyl group from GalNAc');
759
+ // GalNAc is a hexopyranose (6-mem ring with one O). Plus thymine ring
760
+ // and the ribose furanose, the molecule has more than one ring.
761
+ // Pyranose: C-C-C-C-C-O 6-membered.
762
+ expect(hasSmarts(mol, '[#6]1[#6][#6][#6][#6][O]1'), true,
763
+ 'expected a pyranose (6-membered) ring from GalNAc');
764
+ });
514
765
  });
515
766
 
516
767
  // 5'-end terminal modifier (Chol, R2 only) at the start of the chain.
517
- // HELM puts Chol where the first sugar would be. With no trailing P,
518
- // the chain is Chol r(T)-3'-OH.
768
+ // Cholesterol's structural fingerprint is the gonane: four fused rings
769
+ // including a cyclopentane fused to a cyclohexane (D-C ring junction).
519
770
  test('rna-helm-5p-terminal-chol', async () => {
520
- const smiles = await helmRnaLinearToSmiles(`RNA1{[Chol].r(T)}$$$$V2.0`);
521
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
522
- `valid SMILES expected: ${smiles}`);
523
- // Cholesterol has 4 fused rings (3 six-membered + 1 five-membered) — sanity-check
524
- // by requiring at least 4 non-aromatic ring closures (digits 1-4) in the SMILES
525
- // (cholesterol fragment alone uses ring closures 1-4).
526
- expect(/1/.test(smiles) && /2/.test(smiles) && /3/.test(smiles) && /4/.test(smiles), true,
527
- `expected cholesterol ring fragments: ${smiles}`);
771
+ const {molfile} = await helmRnaLinear(`RNA1{[Chol].r(T)}$$$$V2.0`);
772
+ withMol(molfile, (mol) => {
773
+ expect(looksLikeSteroid(mol), true,
774
+ 'Chol terminus must produce the steroid (gonane) ring system');
775
+ // Chol replaces the first sugar, so only one furanose is left (from r(T)).
776
+ expect(countSmarts(mol, SMARTS.FURANOSE), 1,
777
+ 'expected exactly 1 furanose ring (from r(T))');
778
+ });
528
779
  });
529
780
 
530
781
  // Chol at 5' with explicit trailing phosphate (the original failing case).
531
- // Chain: Chol → r(T) → P-OH. Should produce exactly 1 phosphate.
782
+ // Chain: Chol → r(T) → P-OH. Steroid rings + ribose + 1 phosphate.
532
783
  test('rna-helm-5p-terminal-chol-with-trailing-phosphate', async () => {
533
- const smiles = await helmRnaLinearToSmiles(`RNA1{[Chol].r(T)p}$$$$V2.0`);
534
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
535
- `valid SMILES expected: ${smiles}`);
536
- const pCount = (smiles.match(/P/g) || []).length;
537
- expect(pCount, 1, `expected exactly 1 phosphate: ${smiles}`);
784
+ const {molfile} = await helmRnaLinear(`RNA1{[Chol].r(T)p}$$$$V2.0`);
785
+ withMol(molfile, (mol) => {
786
+ expect(looksLikeSteroid(mol), true,
787
+ 'expected steroid ring system from Chol');
788
+ expect(countAtoms(mol, 15), 1, 'expected exactly 1 phosphorus');
789
+ expect(countSmarts(mol, SMARTS.FURANOSE), 1, 'expected 1 furanose');
790
+ });
538
791
  });
539
792
 
540
793
  // Both terminals at once: Chol at 5', GalNAc at 3', single nucleotide
541
- // in between. No phosphates anywhere.
794
+ // between. Both terminus markers must be present, no phosphate.
542
795
  test('rna-helm-both-terminals', async () => {
543
- const smiles = await helmRnaLinearToSmiles(`RNA1{[Chol].r(T)[GalNAc]}$$$$V2.0`);
544
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
545
- `valid SMILES expected: ${smiles}`);
546
- const pCount = (smiles.match(/P/g) || []).length;
547
- expect(pCount, 0, `expected 0 phosphates with both terminals: ${smiles}`);
548
- });
549
-
550
- // LNA (2,4-BNA) regression. The 2,4-O-CH2 bridge sits ABOVE C1' once the
551
- // sugar is oriented with R1/R2 atoms horizontal, so the natural R3 vector
552
- // points sideways instead of up. Without the abnormal-sugar override the
553
- // base ends up sideways from the sugar (or worse, overlapping it). The
554
- // assertion here is structural: must produce a valid single-fragment
555
- // SMILES with the LNA-specific bridge oxygen plus the normal nucleoside
556
- // features. Coordinates aren't checked only connectivity.
796
+ const {molfile} = await helmRnaLinear(`RNA1{[Chol].r(T)[GalNAc]}$$$$V2.0`);
797
+ withMol(molfile, (mol) => {
798
+ expect(countAtoms(mol, 15), 0, 'expected zero phosphates');
799
+ expect(looksLikeSteroid(mol), true,
800
+ 'expected steroid (Chol) ring system');
801
+ expect(countSmarts(mol, SMARTS.N_ACETYL) >= 1, true,
802
+ 'expected N-acetyl group from GalNAc');
803
+ // r(T) brings exactly one furanose, GalNAc brings the pyranose.
804
+ expect(hasSmarts(mol, '[#6]1[#6][#6][#6][#6][O]1'), true,
805
+ 'expected pyranose ring from GalNAc');
806
+ });
807
+ });
808
+
809
+ // LNA (2',4'-BNA) regression. The structural marker is the bicyclic
810
+ // sugar: three carbons of the LNA furanose (C2', C3', C4') are shared with a
811
+ // second ring (the C2'-O-CH2-C4' bridge). Standard riboses produce zero such
812
+ // R2-shared atoms — so this test is exclusive to LNA.
813
+ //
814
+ // Additionally, the depiction-level claim ("base above sugar") is
815
+ // verified by reading molblock coordinates and confirming the base
816
+ // atoms sit higher in Y than every sugar atom.
557
817
  test('rna-helm-lna-base-above-sugar', async () => {
558
- const smiles = await helmRnaLinearToSmiles(`RNA1{[lna](A)p.[lna](T)}$$$$V2.0`);
559
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
560
- `valid SMILES expected: ${smiles}`);
561
- expect(smiles.indexOf('.') === -1, true,
562
- `expected single fragment: ${smiles}`);
563
- // Sanity check for nitrogens adenine brings 5 (4 ring + 1 NH2) and
564
- // thymine brings 2 (both ring), so at least 7 total. Match both
565
- // uppercase (N, [nH]) and lowercase aromatic (n) — N atoms in heterocyclic
566
- // SMILES are written lowercase when aromatic.
567
- const nCount = (smiles.match(/[Nn]/g) || []).length;
568
- expect(nCount >= 7, true, `expected at least 7 nitrogens: ${smiles}`);
569
- // One inter-nucleotide phosphate.
570
- const pCount = (smiles.match(/P/g) || []).length;
571
- expect(pCount, 1, `expected exactly 1 phosphate: ${smiles}`);
818
+ const {molfile} = await helmRnaLinear(`RNA1{[lna](A)p.[lna](T)}$$$$V2.0`);
819
+ withMol(molfile, (mol) => {
820
+ // Non-empty molecule only — this check does NOT verify single-fragment connectivity.
821
+ expect(hasSmarts(mol, '[*]'), true, 'molecule must be non-empty');
822
+ // LNA-specific bicyclic sugar: ring atoms shared between two rings.
823
+ // Two LNA sugars × 3 bridgehead-class carbons each = 6; the assertion requires at least 4.
824
+ const r2 = countSmarts(mol, SMARTS.LNA_BRIDGEHEAD);
825
+ expect(r2 >= 4, true,
826
+ `expected 4 ring carbons in 2 rings (LNA bicyclic), got ${r2}`);
827
+ // Inter-nucleotide phosphodiester present, no direct C-P.
828
+ expect(countSmarts(mol, SMARTS.PHOSPHODIESTER) >= 1, true,
829
+ 'expected 1 phosphodiester linker');
830
+ expect(countSmarts(mol, SMARTS.DIRECT_C_P), 0,
831
+ 'expected zero direct C-P bonds');
832
+ // Adenine + thymine present.
833
+ expect(countSmarts(mol, SMARTS.ADENINE_RING) >= 1, true,
834
+ 'expected adenine ring (purine)');
835
+ expect(hasSmarts(mol, '[CH3][#6]1=[#6][#7]([!#1])[#6](=O)[#7][#6]1=O') ||
836
+ hasSmarts(mol, 'Cc1cn([!#1])c(=O)[nH]c1=O'),
837
+ true, 'expected thymine ring (5-methyluracil)');
838
+ });
839
+ // Depiction: base atoms above sugar atoms in Y.
840
+ expectBaseAboveSugar(molfile);
572
841
  });
573
842
 
574
843
  // GalNAc oxygen-count regression. Previously the R1 placeholder atom
575
844
  // (substituted to 'O' from the "OH" cap) was left in the assembly,
576
- // adding a stray OH on the chain-attach carbon. lna(T)GalNAc has known
577
- // expected SMILES with exactly 10 oxygens.
845
+ // adding a stray OH on the chain-attach carbon. lna(T)GalNAc has a known
846
+ // expected oxygen count; an extra OH would push it to 11.
578
847
  test('rna-helm-3p-terminal-galnac-no-extra-oh', async () => {
579
- const smiles = await helmRnaLinearToSmiles(`RNA1{[lna](T)[GalNAc]}$$$$V2.0`);
580
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
581
- `valid SMILES expected: ${smiles}`);
582
- // Count OXYGEN ATOMS only — uppercase O outside of brackets in standard
583
- // SMILES denotes a non-aromatic oxygen. Ring-closure digits and atoms
584
- // inside [] don't match this regex.
585
- const oCount = (smiles.match(/O/g) || []).length;
586
- expect(oCount, 10, `expected exactly 10 oxygen atoms in lna-T-GalNAc: ${smiles}`);
848
+ const {molfile} = await helmRnaLinear(`RNA1{[lna](T)[GalNAc]}$$$$V2.0`);
849
+ withMol(molfile, (mol) => {
850
+ // Heavy oxygen atom count — RDKit doesn't double-count ring closures
851
+ // or atoms inside brackets.
852
+ expect(countAtoms(mol, 8), 10,
853
+ 'expected exactly 10 oxygen atoms in lna(T)GalNAc');
854
+ // No phosphate (GalNAc replaces the trailing P slot).
855
+ expect(countAtoms(mol, 15), 0, 'expected no phosphate');
856
+ // GalNAc N-acetyl preserved.
857
+ expect(hasSmarts(mol, SMARTS.N_ACETYL), true,
858
+ 'expected GalNAc N-acetyl group');
859
+ // LNA still bicyclic.
860
+ expect(countSmarts(mol, SMARTS.LNA_BRIDGEHEAD) >= 2, true,
861
+ 'expected LNA bicyclic bridgeheads');
862
+ });
587
863
  });
588
864
 
589
865
  // sp (and similar phosphates with R-cap = H) used to disconnect the chain
590
866
  // because the H placeholder was removed by removeHydrogen, leaving
591
867
  // terminalNodes[0] pointing at the now-deleted atom. The result was a
592
- // SMILES with two disconnected fragments separated by '.'. The fix:
593
- // when the cap is H, leave terminalNodes[0] at its original
594
- // setTerminalNodes value (the atom previously bonded to R1, e.g. P) so
595
- // the chain bond goes there directly.
868
+ // molecule with two disconnected fragments. The fix promotes the H cap
869
+ // to an O so the chain bond attaches at a real atom; the linker becomes
870
+ // a true phosphorothioate diester.
596
871
  test('rna-helm-h-cap-phosphate-sp-connects', async () => {
597
- const smiles = await helmRnaLinearToSmiles(`RNA1{r(T)[sp].r(A)}$$$$V2.0`);
598
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
599
- `valid SMILES expected: ${smiles}`);
600
- // No '.' → single connected fragment.
872
+ const {molfile, smiles} = await helmRnaLinear(`RNA1{r(T)[sp].r(A)}$$$$V2.0`);
873
+ // SMILES dot count is the canonical fragment-count test — keep it.
601
874
  expect(smiles.indexOf('.') === -1, true,
602
- `expected single fragment (no '.' separator): ${smiles}`);
603
- // Exactly one phosphorus from the sp linker.
604
- const pCount = (smiles.match(/P/g) || []).length;
605
- expect(pCount, 1, `expected exactly 1 phosphate: ${smiles}`);
606
- // sp carries a sulfur on the phosphate.
607
- expect(/S/.test(smiles), true, `expected sulfur from sp: ${smiles}`);
875
+ `expected single connected fragment, got: ${smiles}`);
876
+ withMol(molfile, (mol) => {
877
+ // Sulfur is bonded to the phosphorus, not floating somewhere else.
878
+ expect(countSmarts(mol, '[PX4][SX2,SX1H,SX1-]'), 1,
879
+ 'sp\'s sulfur must be on its phosphorus');
880
+ expect(countAtoms(mol, 15), 1, 'one phosphorus from the sp linker');
881
+ expect(countAtoms(mol, 16), 1, 'one sulfur from the sp linker');
882
+ expect(countSmarts(mol, SMARTS.PHOSPHOROTHIOATE_DIESTER), 1,
883
+ 'sp linker must be a phosphorothioate diester (C-O-P-O-C)');
884
+ expect(countSmarts(mol, SMARTS.DIRECT_C_P), 0,
885
+ 'no direct C-P bond');
886
+ });
608
887
  });
609
888
 
610
889
  // Regression: H-cap phosphates (sp et al.) used to drop the bridging O
@@ -617,79 +896,94 @@ category('toAtomicLevelHelmRna', async () => {
617
896
  // Use m(2'-OMe ribose) so we can also verify the methoxy group survives
618
897
  // the sp chain assembly.
619
898
  test('rna-helm-sp-bridging-o-preserved', async () => {
620
- const smiles = await helmRnaLinearToSmiles(`RNA1{m(A)[sp].r(A)[sp]}$$$$V2.0`);
621
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
622
- `valid SMILES expected: ${smiles}`);
623
- // Single connected fragment.
624
- expect(smiles.indexOf('.') === -1, true,
625
- `expected single fragment: ${smiles}`);
626
- // 2 sp linkers → 2 phosphorus atoms.
627
- const pCount = (smiles.match(/P/g) || []).length;
628
- expect(pCount, 2, `expected exactly 2 phosphates: ${smiles}`);
629
- // 2 sulfurs (one per sp).
630
- const sCount = (smiles.match(/S/g) || []).length;
631
- expect(sCount, 2, `expected exactly 2 sulfurs (one per sp): ${smiles}`);
632
- // No C-P bond every P should be bordered by O on both chain sides.
633
- // A P preceded directly by an aliphatic carbon (lowercase 'c' is
634
- // aromatic; capital 'C' is sp3) means the bridging O was lost.
635
- expect(/C\d*P|CP|cP/.test(smiles), false,
636
- `expected no direct C-P bond (bridging O missing): ${smiles}`);
637
- // 2' methoxy on m must survive — methyl ether oxygen plus three oxygens
638
- // per nucleotide gives plenty of O atoms; the structural assertion
639
- // above is the strict one. Sanity-check that a methyl ether (OC) is
640
- // present somewhere.
641
- expect(/OC|CO/.test(smiles), true, `expected methoxy fragment: ${smiles}`);
899
+ const {molfile} = await helmRnaLinear(`RNA1{m(A)[sp].r(A)[sp]}$$$$V2.0`);
900
+ withMol(molfile, (mol) => {
901
+ // Element counts via RDKit (no SMILES regex).
902
+ expect(countAtoms(mol, 15), 2, 'expected exactly 2 phosphorus atoms');
903
+ expect(countAtoms(mol, 16), 2, 'expected exactly 2 sulfur atoms');
904
+ // Each P carries its own sulfur (not floating somewhere else).
905
+ expect(countSmarts(mol, '[PX4][SX2,SX1H,SX1-]'), 2,
906
+ 'both sulfurs must be bonded to a phosphorus atom');
907
+ // Inter-nucleotide sp is a phosphorothioate diester (bridging O on
908
+ // both sides). The trailing sp is a monoester (P-O-cap on the 3'
909
+ // side), so we expect exactly ONE diester match.
910
+ expect(countSmarts(mol, SMARTS.PHOSPHOROTHIOATE_DIESTER), 1,
911
+ 'inter-nucleotide sp must remain a phosphorothioate diester');
912
+ // Bridging-O presence on the 5' side of every phosphorothioate.
913
+ // The diester P has two C-O-P matches (5' and 3' bridges) and the
914
+ // terminal monoester P has one — total 3 matches across both linkers.
915
+ // The bug we guard against (lost 3'-O) would drop this to 1 or 2.
916
+ expect(countSmarts(mol, '[CX4][OX2][PX4](=[OX1])[SX2,SX1H,SX1-]'), 3,
917
+ 'every C-O-P-P=O-S match must be present (3: 2 from diester, 1 from monoester)');
918
+ // No direct C-P bond anywhere (the bug we're guarding against).
919
+ expect(countSmarts(mol, SMARTS.DIRECT_C_P), 0,
920
+ 'expected zero direct C-P bonds — bridging O must be present');
921
+ // Methoxy group on the m sugar must survive — exactly one (m only at
922
+ // position 0). 2'-OMe = OCH3 on a ring carbon. The 2nd nucleotide is
923
+ // r(A), no methoxy.
924
+ expect(countSmarts(mol, SMARTS.TWO_PRIME_OME), 1,
925
+ 'expected exactly one 2\'-OMe group on the m sugar');
926
+ });
642
927
  });
643
928
 
644
929
  // R-group swap heuristic: a single-R-group terminal monomer can be placed
645
930
  // at either end of a HELM chain, even if its R-group label "should" only
646
931
  // belong at one end. The conversion swaps rNodes so the existing
647
- // TERMINAL_5P/3P role logic still works.
932
+ // TERMINAL_5P/3P role logic still works. Each test asserts the terminal
933
+ // monomer's STRUCTURAL fingerprint as well as topology.
648
934
  //
649
935
  // Bio (R1 only) — naturally a 3'-terminal, but we accept it at 5' too.
936
+ // Biotin's fingerprint is its bicyclic head: a thiolane (C-C-C-C-S 5-mem
937
+ // ring) fused to an imidazolidone (N-C(=O)-N 5-mem ring with two NH).
650
938
  test('rna-helm-bio-terminal-at-end', async () => {
651
- const smiles = await helmRnaLinearToSmiles(`RNA1{r(T)[Bio]}$$$$V2.0`);
652
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
653
- `valid SMILES expected: ${smiles}`);
654
- // Bio replaces the trailing P → no phosphate at all.
655
- const pCount = (smiles.match(/P/g) || []).length;
656
- expect(pCount, 0, `expected 0 phosphates with Bio terminal: ${smiles}`);
657
- // Single connected fragment.
658
- expect(smiles.indexOf('.') === -1, true,
659
- `expected single fragment: ${smiles}`);
939
+ const {molfile} = await helmRnaLinear(`RNA1{r(T)[Bio]}$$$$V2.0`);
940
+ withMol(molfile, (mol) => {
941
+ expect(countAtoms(mol, 15), 0, 'Bio terminus: no phosphate');
942
+ // Biotin's cyclic urea (ureido) ring.
943
+ expect(hasSmarts(mol, SMARTS.BIOTIN_UREIDO), true,
944
+ 'expected biotin ureido (cyclic urea) ring system');
945
+ // Biotin's thiolane: a sulfur in a ring.
946
+ expect(hasSmarts(mol, '[#16;R]'), true,
947
+ 'expected ring sulfur (biotin\'s thiolane)');
948
+ // r(T) sugar still present.
949
+ expect(countSmarts(mol, SMARTS.FURANOSE), 1, 'expected the r(T) furanose');
950
+ });
660
951
  });
661
952
 
662
953
  test('rna-helm-bio-terminal-at-start', async () => {
663
- const smiles = await helmRnaLinearToSmiles(`RNA1{[Bio].r(T)}$$$$V2.0`);
664
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
665
- `valid SMILES expected: ${smiles}`);
666
- // Single connected fragment (Bio at start must connect to following sugar).
667
- expect(smiles.indexOf('.') === -1, true,
668
- `expected single fragment: ${smiles}`);
669
- // No phosphates (Bio doesn't carry P, no trailing p in HELM).
670
- const pCount = (smiles.match(/P/g) || []).length;
671
- expect(pCount, 0, `expected 0 phosphates: ${smiles}`);
954
+ const {molfile} = await helmRnaLinear(`RNA1{[Bio].r(T)}$$$$V2.0`);
955
+ withMol(molfile, (mol) => {
956
+ expect(countAtoms(mol, 15), 0, 'no phosphates');
957
+ // Biotin's ureido + ring-S marker (the thiolane).
958
+ expect(hasSmarts(mol, SMARTS.BIOTIN_UREIDO), true,
959
+ 'expected biotin ureido ring system at the 5\' end');
960
+ expect(hasSmarts(mol, '[#16;R]'), true,
961
+ 'expected biotin\'s thiolane ring sulfur');
962
+ // r(T) sugar still present and connected (single fragment via R-swap).
963
+ expect(countSmarts(mol, SMARTS.FURANOSE), 1, 'expected the r(T) furanose');
964
+ });
672
965
  });
673
966
 
674
967
  // Chol (R2 only) — naturally a 5'-terminal, but we accept it at 3' too.
968
+ // Chol's structural fingerprint is the steroid 4-ring core plus a
969
+ // ring-fused junction, see `looksLikeSteroid()`.
675
970
  test('rna-helm-chol-terminal-at-start', async () => {
676
- // Already covered by rna-helm-5p-terminal-chol; this is the canonical case.
677
- const smiles = await helmRnaLinearToSmiles(`RNA1{[Chol].r(T)}$$$$V2.0`);
678
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
679
- `valid SMILES expected: ${smiles}`);
680
- expect(smiles.indexOf('.') === -1, true, `expected single fragment: ${smiles}`);
971
+ const {molfile} = await helmRnaLinear(`RNA1{[Chol].r(T)}$$$$V2.0`);
972
+ withMol(molfile, (mol) => {
973
+ expect(looksLikeSteroid(mol), true,
974
+ 'expected steroid (gonane) ring system from Chol at 5\'');
975
+ expect(countSmarts(mol, SMARTS.FURANOSE), 1, 'expected one r(T) furanose');
976
+ });
681
977
  });
682
978
 
683
979
  test('rna-helm-chol-terminal-at-end', async () => {
684
- const smiles = await helmRnaLinearToSmiles(`RNA1{r(T)[Chol]}$$$$V2.0`);
685
- expect(smiles !== 'MALFORMED_INPUT_VALUE' && smiles.length > 10, true,
686
- `valid SMILES expected: ${smiles}`);
687
- // Single connected fragment.
688
- expect(smiles.indexOf('.') === -1, true,
689
- `expected single fragment: ${smiles}`);
690
- // Chol replaces the trailing P → no phosphate.
691
- const pCount = (smiles.match(/P/g) || []).length;
692
- expect(pCount, 0, `expected 0 phosphates with Chol terminal: ${smiles}`);
980
+ const {molfile} = await helmRnaLinear(`RNA1{r(T)[Chol]}$$$$V2.0`);
981
+ withMol(molfile, (mol) => {
982
+ expect(countAtoms(mol, 15), 0, 'no phosphate when Chol replaces trailing P');
983
+ expect(looksLikeSteroid(mol), true,
984
+ 'expected steroid (gonane) ring system from Chol at 3\'');
985
+ expect(countSmarts(mol, SMARTS.FURANOSE), 1, 'expected one r(T) furanose');
986
+ });
693
987
  });
694
988
  });
695
989