@datagrok/bio 2.27.2 → 2.27.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +50 -0
- package/agents/package-knowledge.yaml +53 -0
- package/dist/455.js +1 -1
- package/dist/455.js.map +1 -1
- package/dist/682.js +1 -1
- package/dist/682.js.map +1 -1
- package/dist/705.js +1 -1
- package/dist/705.js.map +1 -1
- package/dist/909.js +2 -0
- package/dist/909.js.map +1 -0
- package/dist/immunum_bg.wasm +0 -0
- package/dist/package-test.js +3 -3
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +3 -3
- package/dist/package.js.map +1 -1
- package/package.json +4 -2
- package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +24 -11
- package/src/package-api.ts +15 -1
- package/src/package-test.ts +1 -0
- package/src/package.g.ts +12 -1
- package/src/package.ts +22 -4
- package/src/tests/antibody-numbering-tests.ts +190 -0
- package/src/tests/detectors-tests.ts +5 -1
- package/src/tests/splitters-test.ts +8 -4
- package/src/tests/to-atomic-level-tests.ts +144 -0
- package/src/utils/annotations/numbering-ui.ts +34 -90
- package/src/utils/antibody-numbering/immunum-client.ts +45 -0
- package/src/utils/antibody-numbering/immunum-glue.js +275 -0
- package/src/utils/antibody-numbering/immunum.worker.ts +159 -0
- package/src/utils/antibody-numbering/number-antibody.ts +105 -0
- package/src/utils/antibody-numbering/types.ts +48 -0
- package/src/utils/seq-helper/seq-handler.ts +25 -9
- package/test-console-output-1.log +582 -485
- package/test-record-1.mp4 +0 -0
- package/webpack.config.js +13 -0
- package/dist/282.js +0 -2
- package/dist/282.js.map +0 -1
- package/dist/287.js +0 -2
- package/dist/287.js.map +0 -1
- package/dist/422.js +0 -2
- package/dist/422.js.map +0 -1
- package/dist/767.js +0 -2
- package/dist/767.js.map +0 -1
- package/src/utils/antibody-numbering (WIP)/alignment.ts +0 -578
- package/src/utils/antibody-numbering (WIP)/annotator.ts +0 -120
- package/src/utils/antibody-numbering (WIP)/data/blosum62.ts +0 -55
- package/src/utils/antibody-numbering (WIP)/data/consensus-aho.ts +0 -155
- package/src/utils/antibody-numbering (WIP)/data/consensus-imgt.ts +0 -162
- package/src/utils/antibody-numbering (WIP)/data/consensus-kabat.ts +0 -157
- package/src/utils/antibody-numbering (WIP)/data/consensus-martin.ts +0 -152
- package/src/utils/antibody-numbering (WIP)/data/consensus.ts +0 -36
- package/src/utils/antibody-numbering (WIP)/data/regions.ts +0 -63
- package/src/utils/antibody-numbering (WIP)/index.ts +0 -31
- package/src/utils/antibody-numbering (WIP)/testdata.ts +0 -5356
- package/src/utils/antibody-numbering (WIP)/types.ts +0 -69
- /package/dist/{8473fcbfb6e85ca6c852.wasm → wasmCluster.wasm} +0 -0
- /package/dist/{9a8fbf37666e32487835.wasm → wasmDbscan.wasm} +0 -0
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Davit Rizhinashvili",
|
|
6
6
|
"email": "drizhinashvili@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.27.2",
|
|
8
|
+
"version": "2.27.4",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -44,7 +44,7 @@
|
|
|
44
44
|
],
|
|
45
45
|
"dependencies": {
|
|
46
46
|
"@biowasm/aioli": "^3.1.0",
|
|
47
|
-
"@datagrok-libraries/bio": "^5.
|
|
47
|
+
"@datagrok-libraries/bio": "^5.64.0",
|
|
48
48
|
"@datagrok-libraries/chem-meta": "^1.2.9",
|
|
49
49
|
"@datagrok-libraries/math": "^1.2.6",
|
|
50
50
|
"@datagrok-libraries/ml": "^6.10.11",
|
|
@@ -59,6 +59,8 @@
|
|
|
59
59
|
"datagrok-api": "^1.26.3",
|
|
60
60
|
"dayjs": "^1.11.4",
|
|
61
61
|
"fastest-levenshtein": "^1.0.16",
|
|
62
|
+
"file-loader": "^6.2.0",
|
|
63
|
+
"immunum": "^1.1.0",
|
|
62
64
|
"openchemlib": "^7.2.3",
|
|
63
65
|
"rxjs": "^6.5.5",
|
|
64
66
|
"style-loader": "^3.3.1",
|
|
@@ -13,6 +13,7 @@ import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
|
13
13
|
import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
14
14
|
import {getClusterMatrixWorker} from '@datagrok-libraries/math';
|
|
15
15
|
import {DimReductionMethods} from '@datagrok-libraries/ml/src/multi-column-dimensionality-reduction/types';
|
|
16
|
+
import {awaitCheck} from '@datagrok-libraries/test/src/test';
|
|
16
17
|
|
|
17
18
|
const dataFn: string = 'samples/FASTA_PT_activity.csv';
|
|
18
19
|
|
|
@@ -117,25 +118,37 @@ export async function demoActivityCliffsCyclic() {
|
|
|
117
118
|
ui.setUpdateIndicator(tv.root, true);
|
|
118
119
|
try {
|
|
119
120
|
const seqEncodingFunc = DG.Func.find({name: 'macromoleculePreprocessingFunction', package: 'Bio'})[0];
|
|
120
|
-
|
|
121
|
+
await PackageFunctions.activityCliffs(
|
|
121
122
|
df, df.getCol('Sequence'), df.getCol('Activity'),
|
|
122
123
|
96, DimReductionMethods.UMAP, MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE,
|
|
123
|
-
seqEncodingFunc, {}, true)
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
124
|
+
seqEncodingFunc, {}, true);
|
|
125
|
+
|
|
126
|
+
let scatterPlot: DG.Viewer | undefined;
|
|
127
|
+
await awaitCheck(() => {
|
|
128
|
+
for (const v of tv.viewers) {
|
|
129
|
+
if (v.type === DG.VIEWER.SCATTER_PLOT) {
|
|
130
|
+
scatterPlot = v;
|
|
131
|
+
return true;
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
return false;
|
|
135
|
+
}, '', 10000);
|
|
136
|
+
|
|
137
|
+
let link: HTMLCollectionOf<Element> | undefined;
|
|
138
|
+
await awaitCheck(() => {
|
|
139
|
+
link = scatterPlot!.root.getElementsByClassName('scatter_plot_link');
|
|
140
|
+
return link.length > 0;
|
|
141
|
+
}, '', 5000);
|
|
142
|
+
(link![0] as HTMLElement).click();
|
|
143
|
+
await DG.delay(500);
|
|
144
|
+
|
|
130
145
|
tv.grid.props.rowHeight = 180;
|
|
131
146
|
tv.grid.col('sequence') && (tv.grid.col('sequence')!.width = 300);
|
|
132
147
|
tv.grid.col('structure') && (tv.grid.col('structure')!.width = 300);
|
|
133
148
|
const cliffsGrid = Array.from(tv.viewers).find((v) => v !== tv.grid && v.type === DG.VIEWER.GRID) as DG.Grid;
|
|
134
149
|
if (cliffsGrid) {
|
|
135
150
|
cliffsGrid.props.rowHeight = 40;
|
|
136
|
-
cliffsGrid.col('seq_diff')!.width = 600;
|
|
137
|
-
tv.dockManager.dock(cliffsGrid, DG.DOCK_TYPE.DOWN, null, 'Cliffs', 0.35);
|
|
138
|
-
tv.dockManager.dock(activityCliffsViewer, DG.DOCK_TYPE.RIGHT, null, 'Activity Cliffs', 0.55);
|
|
151
|
+
cliffsGrid.col('seq_diff') && (cliffsGrid.col('seq_diff')!.width = 600);
|
|
139
152
|
}
|
|
140
153
|
} catch (err: any) {
|
|
141
154
|
handleError(err);
|
package/src/package-api.ts
CHANGED
|
@@ -162,7 +162,7 @@ export namespace funcs {
|
|
|
162
162
|
}
|
|
163
163
|
|
|
164
164
|
/**
|
|
165
|
-
Assigns antibody numbering (IMGT/Kabat/Chothia/AHo)
|
|
165
|
+
Assigns antibody numbering (IMGT/Kabat/Chothia/AHo)
|
|
166
166
|
*/
|
|
167
167
|
export async function applyNumberingScheme(): Promise<void> {
|
|
168
168
|
return await grok.functions.call('Bio:ApplyNumberingScheme', {});
|
|
@@ -284,6 +284,20 @@ export namespace funcs {
|
|
|
284
284
|
return await grok.functions.call('Bio:PepseaMsa', { sequenceCol, method, gapOpen, gapExtend });
|
|
285
285
|
}
|
|
286
286
|
|
|
287
|
+
/**
|
|
288
|
+
Assigns antibody numbering (IMGT/Kabat) using the immunum WASM library
|
|
289
|
+
*/
|
|
290
|
+
export async function immunumAntibodyNumbering(df: DG.DataFrame , seqCol: DG.Column , scheme: string ): Promise<DG.DataFrame> {
|
|
291
|
+
return await grok.functions.call('Bio:ImmunumAntibodyNumbering', { df, seqCol, scheme });
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
/**
|
|
295
|
+
Builds a MacromoleculeDifference column from two sequence columns (seq1#seq2)
|
|
296
|
+
*/
|
|
297
|
+
export async function compareSequences(): Promise<void> {
|
|
298
|
+
return await grok.functions.call('Bio:CompareSequences', {});
|
|
299
|
+
}
|
|
300
|
+
|
|
287
301
|
/**
|
|
288
302
|
Visualizes sequence composition on a WebLogo plot
|
|
289
303
|
*/
|
package/src/package-test.ts
CHANGED
package/src/package.g.ts
CHANGED
|
@@ -234,7 +234,7 @@ export async function getRegionTopMenu(table: DG.DataFrame, sequence: DG.Column,
|
|
|
234
234
|
}
|
|
235
235
|
|
|
236
236
|
//name: Apply Numbering Scheme
|
|
237
|
-
//description: Assigns antibody numbering (IMGT/Kabat/Chothia/AHo)
|
|
237
|
+
//description: Assigns antibody numbering (IMGT/Kabat/Chothia/AHo)
|
|
238
238
|
//top-menu: Bio | Annotate | Apply Numbering Scheme...
|
|
239
239
|
export function applyNumberingScheme() : void {
|
|
240
240
|
PackageFunctions.applyNumberingScheme();
|
|
@@ -456,6 +456,17 @@ export async function pepseaMsa(sequenceCol: DG.Column<any>, method: string, gap
|
|
|
456
456
|
return await PackageFunctions.pepseaMsa(sequenceCol, method, gapOpen, gapExtend);
|
|
457
457
|
}
|
|
458
458
|
|
|
459
|
+
//name: Immunum
|
|
460
|
+
//description: Assigns antibody numbering (IMGT/Kabat) using the immunum WASM library
|
|
461
|
+
//input: dataframe df
|
|
462
|
+
//input: column seqCol { semType: Macromolecule }
|
|
463
|
+
//input: string scheme = 'imgt' { choices: ["imgt","kabat"] }
|
|
464
|
+
//output: dataframe result
|
|
465
|
+
//meta.role: antibodyNumbering
|
|
466
|
+
export async function immunumAntibodyNumbering(df: DG.DataFrame, seqCol: DG.Column<any>, scheme: string) : Promise<any> {
|
|
467
|
+
return await PackageFunctions.immunumAntibodyNumbering(df, seqCol, scheme);
|
|
468
|
+
}
|
|
469
|
+
|
|
459
470
|
//name: Compare Sequences
|
|
460
471
|
//description: Builds a MacromoleculeDifference column from two sequence columns (seq1#seq2)
|
|
461
472
|
//top-menu: Bio | Analyze | Compare sequences...
|
package/src/package.ts
CHANGED
|
@@ -86,6 +86,10 @@ import {BilnNotationProvider} from './utils/biln';
|
|
|
86
86
|
import {showMonomerCollectionsView} from './utils/monomer-lib/monomer-collections-view';
|
|
87
87
|
import {ISequenceColumnInput} from '@datagrok-libraries/bio/src/utils/sequence-column-input';
|
|
88
88
|
import {SequenceColumnInput} from './utils/sequence-column-input';
|
|
89
|
+
import {showNumberingSchemeDialog} from './utils/annotations/numbering-ui';
|
|
90
|
+
import {showLiabilityScannerDialog} from './utils/annotations/liability-scanner-ui';
|
|
91
|
+
import {showAnnotationManagerDialog} from './utils/annotations/annotation-manager-ui';
|
|
92
|
+
import {numberAntibodyColumn} from './utils/antibody-numbering/number-antibody';
|
|
89
93
|
|
|
90
94
|
import * as api from './package-api';
|
|
91
95
|
export const _package = new BioPackage(/*{debug: true}/**/);
|
|
@@ -479,11 +483,11 @@ export class PackageFunctions {
|
|
|
479
483
|
|
|
480
484
|
@grok.decorators.func({
|
|
481
485
|
name: 'Apply Numbering Scheme',
|
|
482
|
-
description: 'Assigns antibody numbering (IMGT/Kabat/Chothia/AHo)
|
|
486
|
+
description: 'Assigns antibody numbering (IMGT/Kabat/Chothia/AHo)',
|
|
483
487
|
'top-menu': 'Bio | Annotate | Apply Numbering Scheme...',
|
|
484
488
|
})
|
|
485
489
|
static applyNumberingScheme(): void {
|
|
486
|
-
|
|
490
|
+
showNumberingSchemeDialog();
|
|
487
491
|
}
|
|
488
492
|
|
|
489
493
|
@grok.decorators.func({
|
|
@@ -492,7 +496,7 @@ export class PackageFunctions {
|
|
|
492
496
|
'top-menu': 'Bio | Annotate | Scan Liabilities...',
|
|
493
497
|
})
|
|
494
498
|
static scanLiabilities(): void {
|
|
495
|
-
|
|
499
|
+
showLiabilityScannerDialog();
|
|
496
500
|
}
|
|
497
501
|
|
|
498
502
|
@grok.decorators.func({
|
|
@@ -501,7 +505,7 @@ export class PackageFunctions {
|
|
|
501
505
|
'top-menu': 'Bio | Annotate | Manage Annotations...',
|
|
502
506
|
})
|
|
503
507
|
static manageAnnotations(): void {
|
|
504
|
-
|
|
508
|
+
showAnnotationManagerDialog();
|
|
505
509
|
}
|
|
506
510
|
|
|
507
511
|
@grok.decorators.func({
|
|
@@ -993,6 +997,20 @@ export class PackageFunctions {
|
|
|
993
997
|
return alignWithPepsea(sequenceCol, method, gapOpen, gapExtend);
|
|
994
998
|
}
|
|
995
999
|
|
|
1000
|
+
@grok.decorators.func({
|
|
1001
|
+
name: 'Immunum',
|
|
1002
|
+
description: 'Assigns antibody numbering (IMGT/Kabat) using the immunum WASM library',
|
|
1003
|
+
meta: {role: 'antibodyNumbering'},
|
|
1004
|
+
})
|
|
1005
|
+
static async immunumAntibodyNumbering(
|
|
1006
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
1007
|
+
@grok.decorators.param({type: 'dataframe'}) df: DG.DataFrame,
|
|
1008
|
+
@grok.decorators.param({type: 'column', options: {semType: 'Macromolecule'}}) seqCol: DG.Column<string>,
|
|
1009
|
+
@grok.decorators.param({type: 'string', options: {choices: ['imgt', 'kabat'], initialValue: 'imgt'}}) scheme: string,
|
|
1010
|
+
): Promise<DG.DataFrame> {
|
|
1011
|
+
return numberAntibodyColumn(seqCol, scheme);
|
|
1012
|
+
}
|
|
1013
|
+
|
|
996
1014
|
@grok.decorators.func({
|
|
997
1015
|
name: 'Compare Sequences',
|
|
998
1016
|
description: 'Builds a MacromoleculeDifference column from two sequence columns (seq1#seq2)',
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import * as grok from 'datagrok-api/grok';
|
|
2
|
+
import * as DG from 'datagrok-api/dg';
|
|
3
|
+
|
|
4
|
+
import {category, expect, test} from '@datagrok-libraries/test/src/test';
|
|
5
|
+
|
|
6
|
+
import {_package} from '../package';
|
|
7
|
+
import {numberSequencesWithImmunum} from '../utils/antibody-numbering/immunum-client';
|
|
8
|
+
import {numberAntibodyColumn} from '../utils/antibody-numbering/number-antibody';
|
|
9
|
+
|
|
10
|
+
/** Canonical test sequences picked from samples/antibodies.csv.
|
|
11
|
+
* - heavyChain1/2 are IGH variable regions starting with the classic EVQL/QVQL motifs
|
|
12
|
+
* - lightChain1/2 are IGK/IGL variable regions (DIQM/DIVM/DIVL...)
|
|
13
|
+
* These are stable inputs for immunum so unit tests can assert exact chain type
|
|
14
|
+
* and region coverage without fetching the CSV from the server. */
|
|
15
|
+
const HEAVY_1 = 'QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCARVAPGALDYWGQGTLVTVSS';
|
|
16
|
+
const HEAVY_2 = 'EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKDHYSGSGSYYYYFDYWGQGTLVTVSS';
|
|
17
|
+
const LIGHT_KAPPA = 'DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPRTFGQGTKVEIK';
|
|
18
|
+
const LIGHT_LAMBDA = 'QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTLVFGGGTKLTVL';
|
|
19
|
+
|
|
20
|
+
/** FR/CDR counts we expect in the immunum annotation JSON for IMGT/Kabat.
|
|
21
|
+
* The engine only accepts IMGT and Kabat — those are the choices declared in
|
|
22
|
+
* package.ts and surfaced in the dialog's scheme dropdown. */
|
|
23
|
+
const EXPECTED_REGION_COUNT = 7; // FR1, CDR1, FR2, CDR2, FR3, CDR3, FR4
|
|
24
|
+
const EXPECTED_REGION_NAMES = ['FR1', 'CDR1', 'FR2', 'CDR2', 'FR3', 'CDR3', 'FR4'];
|
|
25
|
+
|
|
26
|
+
/** Sanity range for alignment confidence on canonical antibody sequences. */
|
|
27
|
+
const MIN_CONFIDENCE = 0.5;
|
|
28
|
+
|
|
29
|
+
category('antibody numbering (immunum)', () => {
|
|
30
|
+
// Each numberSequencesWithImmunum call spawns a fresh worker and terminates
|
|
31
|
+
// it before returning — no shared setup / teardown needed.
|
|
32
|
+
|
|
33
|
+
test('worker: heavy chain (IMGT)', async () => {
|
|
34
|
+
const [row] = await numberSequencesWithImmunum([HEAVY_1], 'imgt');
|
|
35
|
+
expect(row.chainType, 'Heavy');
|
|
36
|
+
expect(row.chainCode, 'H');
|
|
37
|
+
expect(row.confidence >= MIN_CONFIDENCE, true);
|
|
38
|
+
expect(row.positionNames.length > 0, true);
|
|
39
|
+
expect(row.numberingDetail.length > 0, true);
|
|
40
|
+
// numbering_map indices must fall inside the input sequence
|
|
41
|
+
for (const idx of Object.values(row.numberingMap))
|
|
42
|
+
expect(idx >= 0 && idx < HEAVY_1.length, true);
|
|
43
|
+
expect(row.numberingDetail.length, Object.keys(row.numberingMap).length);
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
test('worker: light kappa chain (IMGT)', async () => {
|
|
47
|
+
const [row] = await numberSequencesWithImmunum([LIGHT_KAPPA], 'imgt');
|
|
48
|
+
expect(row.chainType, 'Light');
|
|
49
|
+
expect(row.chainCode === 'K' || row.chainCode === 'L', true);
|
|
50
|
+
expect(row.confidence >= MIN_CONFIDENCE, true);
|
|
51
|
+
expect(row.numberingDetail.length > 0, true);
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
test('worker: light lambda chain (IMGT)', async () => {
|
|
55
|
+
const [row] = await numberSequencesWithImmunum([LIGHT_LAMBDA], 'imgt');
|
|
56
|
+
expect(row.chainType, 'Light');
|
|
57
|
+
expect(row.confidence >= MIN_CONFIDENCE, true);
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
test('worker: batch numbering', async () => {
|
|
61
|
+
const rows = await numberSequencesWithImmunum(
|
|
62
|
+
[HEAVY_1, LIGHT_KAPPA, HEAVY_2, LIGHT_LAMBDA], 'imgt');
|
|
63
|
+
expect(rows.length, 4);
|
|
64
|
+
expect(rows[0].chainType, 'Heavy');
|
|
65
|
+
expect(rows[1].chainType, 'Light');
|
|
66
|
+
expect(rows[2].chainType, 'Heavy');
|
|
67
|
+
expect(rows[3].chainType, 'Light');
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
test('worker: empty / short sequences fail gracefully', async () => {
|
|
71
|
+
const rows = await numberSequencesWithImmunum(['', 'AAAA', ' '], 'imgt');
|
|
72
|
+
expect(rows.length, 3);
|
|
73
|
+
for (const r of rows) {
|
|
74
|
+
expect(r.positionNames, '');
|
|
75
|
+
expect(r.numberingDetail.length, 0);
|
|
76
|
+
expect(r.error.length > 0, true);
|
|
77
|
+
}
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
test('worker: kabat scheme returns kabat-style position codes', async () => {
|
|
81
|
+
const [imgt] = await numberSequencesWithImmunum([HEAVY_1], 'imgt');
|
|
82
|
+
const [kabat] = await numberSequencesWithImmunum([HEAVY_1], 'kabat');
|
|
83
|
+
expect(imgt.chainType, 'Heavy');
|
|
84
|
+
expect(kabat.chainType, 'Heavy');
|
|
85
|
+
// Kabat numbering keys should not match IMGT one-for-one — the schemes
|
|
86
|
+
// number the same residues differently. A weak but robust check: the set
|
|
87
|
+
// of keys differs.
|
|
88
|
+
const imgtKeys = new Set(Object.keys(imgt.numberingMap));
|
|
89
|
+
const kabatKeys = new Set(Object.keys(kabat.numberingMap));
|
|
90
|
+
let differ = false;
|
|
91
|
+
for (const k of kabatKeys) if (!imgtKeys.has(k)) {differ = true; break;}
|
|
92
|
+
expect(differ, true);
|
|
93
|
+
});
|
|
94
|
+
|
|
95
|
+
test('numberAntibodyColumn: DataFrame shape matches antpack script', async () => {
|
|
96
|
+
const col = DG.Column.fromStrings('seq', [HEAVY_1, LIGHT_KAPPA, '']);
|
|
97
|
+
col.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
98
|
+
const result = await numberAntibodyColumn(col, 'imgt');
|
|
99
|
+
|
|
100
|
+
// Required columns, same names as the Python script
|
|
101
|
+
for (const name of ['position_names', 'chain_type', 'annotations_json',
|
|
102
|
+
'numbering_detail', 'numbering_map']) {
|
|
103
|
+
expect(result.col(name) !== null, true);
|
|
104
|
+
}
|
|
105
|
+
expect(result.rowCount, 3);
|
|
106
|
+
|
|
107
|
+
// Row 0 — heavy chain: all 5 fields populated
|
|
108
|
+
expect(result.get('position_names', 0).length > 0, true);
|
|
109
|
+
expect(result.get('chain_type', 0), 'Heavy');
|
|
110
|
+
const annot0 = JSON.parse(result.get('annotations_json', 0));
|
|
111
|
+
expect(annot0.length, EXPECTED_REGION_COUNT);
|
|
112
|
+
expect(annot0.map((a: any) => a.name).join(','), EXPECTED_REGION_NAMES.join(','));
|
|
113
|
+
for (const a of annot0) {
|
|
114
|
+
expect(a.visualType, 'region');
|
|
115
|
+
expect(a.category, 'structure');
|
|
116
|
+
expect(a.sourceScheme, 'IMGT');
|
|
117
|
+
expect(a.autoGenerated, true);
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// Row 1 — light chain: region JSON has same structure
|
|
121
|
+
expect(result.get('chain_type', 1), 'Light');
|
|
122
|
+
const annot1 = JSON.parse(result.get('annotations_json', 1));
|
|
123
|
+
expect(annot1.length, EXPECTED_REGION_COUNT);
|
|
124
|
+
|
|
125
|
+
// Row 2 — empty input: all fields blank / '[]'
|
|
126
|
+
expect(result.get('position_names', 2), '');
|
|
127
|
+
expect(result.get('chain_type', 2), '');
|
|
128
|
+
expect(result.get('annotations_json', 2), '[]');
|
|
129
|
+
expect(result.get('numbering_detail', 2), '');
|
|
130
|
+
expect(result.get('numbering_map', 2), '');
|
|
131
|
+
});
|
|
132
|
+
|
|
133
|
+
test('numberAntibodyColumn: numbering_map indices line up with sequence', async () => {
|
|
134
|
+
const col = DG.Column.fromStrings('seq', [HEAVY_1]);
|
|
135
|
+
col.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
136
|
+
const result = await numberAntibodyColumn(col, 'imgt');
|
|
137
|
+
|
|
138
|
+
const detail = JSON.parse(result.get('numbering_detail', 0));
|
|
139
|
+
const map = JSON.parse(result.get('numbering_map', 0));
|
|
140
|
+
|
|
141
|
+
// For each numbered position: sequence[charIdx] must equal the recorded aa
|
|
142
|
+
for (const entry of detail) {
|
|
143
|
+
const idx = map[entry.position];
|
|
144
|
+
expect(typeof idx === 'number', true);
|
|
145
|
+
expect(HEAVY_1[idx], entry.aa);
|
|
146
|
+
}
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
test('numberAntibodyColumn: annotations_json start/end resolve via numbering_map', async () => {
|
|
150
|
+
const col = DG.Column.fromStrings('seq', [HEAVY_1]);
|
|
151
|
+
col.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
152
|
+
const result = await numberAntibodyColumn(col, 'imgt');
|
|
153
|
+
|
|
154
|
+
const annotations = JSON.parse(result.get('annotations_json', 0));
|
|
155
|
+
const map = JSON.parse(result.get('numbering_map', 0));
|
|
156
|
+
// FR1 start (position "1") must be present and resolve to a valid char index.
|
|
157
|
+
const fr1 = annotations.find((a: any) => a.name === 'FR1');
|
|
158
|
+
expect(fr1 !== undefined, true);
|
|
159
|
+
const startIdx = map[fr1.start];
|
|
160
|
+
expect(typeof startIdx === 'number', true);
|
|
161
|
+
expect(startIdx >= 0 && startIdx < HEAVY_1.length, true);
|
|
162
|
+
});
|
|
163
|
+
|
|
164
|
+
test('numberAntibodyColumn: loads antibodies.csv sample subset', async () => {
|
|
165
|
+
let df: DG.DataFrame;
|
|
166
|
+
try {
|
|
167
|
+
df = await _package.files.readCsv('samples/antibodies.csv');
|
|
168
|
+
} catch (err) {
|
|
169
|
+
// Sample may not be deployed on every server; skip instead of failing.
|
|
170
|
+
console.warn('antibodies.csv not available — skipping', err);
|
|
171
|
+
return;
|
|
172
|
+
}
|
|
173
|
+
const hcCol = df.col('AntibodyHC') ?? df.col('HeavyChain') ?? df.columns.byName('AntibodyHC');
|
|
174
|
+
if (!hcCol) return;
|
|
175
|
+
|
|
176
|
+
// Subset to the first 10 rows so the test finishes in seconds.
|
|
177
|
+
const subset = DG.Column.fromStrings('seq',
|
|
178
|
+
Array.from({length: Math.min(10, hcCol.length)}, (_, i) => hcCol.get(i) ?? ''));
|
|
179
|
+
subset.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
180
|
+
|
|
181
|
+
const result = await numberAntibodyColumn(subset, 'imgt');
|
|
182
|
+
expect(result.rowCount, subset.length);
|
|
183
|
+
|
|
184
|
+
let heavyCount = 0;
|
|
185
|
+
for (let i = 0; i < result.rowCount; i++)
|
|
186
|
+
if (result.get('chain_type', i) === 'Heavy') heavyCount++;
|
|
187
|
+
// Expect the majority of the HC column to be classified as heavy.
|
|
188
|
+
expect(heavyCount >= Math.ceil(subset.length * 0.6), true);
|
|
189
|
+
});
|
|
190
|
+
});
|
|
@@ -462,8 +462,12 @@ MWRSWY-CKHPMWRSWY-CKHP`;
|
|
|
462
462
|
// sample_testHelm.csv
|
|
463
463
|
// columns: ID,Test type,HELM string,Valid?,Mol Weight,Mol Formula,SMILES
|
|
464
464
|
test('samplesTestHelmCsv', async () => {
|
|
465
|
+
// Alphabet size of 8 reflects splitterAsHelm's triplet-splitting of HELM
|
|
466
|
+
// RNA monomers (each `sugar(base)phosphate` becomes 3 tokens), which can
|
|
467
|
+
// collapse what used to be N distinct triple-tokens into a smaller union
|
|
468
|
+
// of {sugar(s), bases, phosphate(s)} symbols.
|
|
465
469
|
await _testDf(readSamples(Samples.testHelmCsv), {
|
|
466
|
-
'HELM string': new PosCol(NOTATION.HELM, null, null,
|
|
470
|
+
'HELM string': new PosCol(NOTATION.HELM, null, null, 8, true),
|
|
467
471
|
}, seqHelper);
|
|
468
472
|
});
|
|
469
473
|
|
|
@@ -59,18 +59,22 @@ category('splitters', async () => {
|
|
|
59
59
|
'D-Tyr_Et', 'D-Dap', 'dV', 'E', 'N', 'pnG', 'Phe_4Me'],
|
|
60
60
|
],
|
|
61
61
|
|
|
62
|
+
// splitterAsHelm triplet-splits HELM RNA monomers `sugar(base)phosphate`
|
|
63
|
+
// into 3 tokens, and `sugar(base)` (terminal-only) into 2. Standalone
|
|
64
|
+
// tokens that don't match either form (e.g. lone `P`, or non-terminal
|
|
65
|
+
// `R(U)` without a phosphate) are kept verbatim.
|
|
62
66
|
testHelm1: [
|
|
63
67
|
'RNA1{R(U)P.R(T)P.R(G)P.R(C)P.R(A)}$$$$',
|
|
64
|
-
['R(U)P', 'R(T)P', 'R(G)P', 'R(C)P', 'R(A)'],
|
|
68
|
+
['R', 'U', 'P', 'R', 'T', 'P', 'R', 'G', 'P', 'R', 'C', 'P', 'R', 'A'],
|
|
65
69
|
],
|
|
66
70
|
|
|
67
71
|
testHelm2: [
|
|
68
72
|
'RNA1{P.R(U)P.R(T)}$$$$',
|
|
69
|
-
['P', 'R(U)P', 'R(T)'],
|
|
73
|
+
['P', 'R', 'U', 'P', 'R', 'T'],
|
|
70
74
|
],
|
|
71
75
|
testHelm3: [
|
|
72
|
-
'RNA1{P.R(U).P.R(T)}$$$$',
|
|
73
|
-
['P', 'R(U)', 'P', 'R(T)'],
|
|
76
|
+
'RNA1{P.R(U).P.R(T)}$$$$', // invalid helm, but oh well,
|
|
77
|
+
['P', 'R(U)', 'P', 'R', 'T'],
|
|
74
78
|
],
|
|
75
79
|
};
|
|
76
80
|
|
|
@@ -356,6 +356,150 @@ PEPTIDE1{Lys_Boc.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2.Thr_PO3H2.Aca.Tyr
|
|
|
356
356
|
}
|
|
357
357
|
});
|
|
358
358
|
|
|
359
|
+
/** Tests for the linear HELM-RNA path: must preserve modified sugars,
|
|
360
|
+
* phosphates, and bases per nucleotide. The non-linear (HELM via POM)
|
|
361
|
+
* path is the reference; the linear path is expected to match it on
|
|
362
|
+
* canonical SMILES for these inputs. */
|
|
363
|
+
category('toAtomicLevelHelmRna', async () => {
|
|
364
|
+
let monomerLibHelper: IMonomerLibHelper;
|
|
365
|
+
let userLibSettings: UserLibSettings;
|
|
366
|
+
let seqHelper: ISeqHelper;
|
|
367
|
+
let monomerLib: IMonomerLib;
|
|
368
|
+
let rdKitModule: RDModule;
|
|
369
|
+
|
|
370
|
+
before(async () => {
|
|
371
|
+
rdKitModule = await getRdKitModule();
|
|
372
|
+
seqHelper = await getSeqHelper();
|
|
373
|
+
monomerLibHelper = await getMonomerLibHelper();
|
|
374
|
+
userLibSettings = await getUserLibSettings();
|
|
375
|
+
await monomerLibHelper.loadMonomerLibForTests();
|
|
376
|
+
monomerLib = monomerLibHelper.getMonomerLib();
|
|
377
|
+
});
|
|
378
|
+
|
|
379
|
+
after(async () => {
|
|
380
|
+
await setUserLibSettings(userLibSettings);
|
|
381
|
+
await monomerLibHelper.loadMonomerLib(true);
|
|
382
|
+
});
|
|
383
|
+
|
|
384
|
+
/** Build a single-row HELM RNA dataframe and run the linear converter,
|
|
385
|
+
* returning the canonical SMILES of the resulting molfile. */
|
|
386
|
+
async function helmRnaLinearToSmiles(srcHelm: string): Promise<string> {
|
|
387
|
+
const srcCsv = `seq\n${srcHelm}`;
|
|
388
|
+
const df = DG.DataFrame.fromCsv(srcCsv);
|
|
389
|
+
await grok.data.detectSemanticTypes(df);
|
|
390
|
+
const seqCol = df.getCol('seq');
|
|
391
|
+
expect(seqCol.semType, DG.SEMTYPE.MACROMOLECULE);
|
|
392
|
+
|
|
393
|
+
const res = await _toAtomicLevel(df, seqCol, monomerLib, seqHelper, rdKitModule);
|
|
394
|
+
if (!res.molCol)
|
|
395
|
+
throw new Error(`_toAtomicLevel returned no molCol for HELM '${srcHelm}'. ` +
|
|
396
|
+
`Warnings: ${(res.warnings ?? []).join(' / ')}`);
|
|
397
|
+
|
|
398
|
+
const molfile: string | null = res.molCol.get(0);
|
|
399
|
+
if (!molfile)
|
|
400
|
+
throw new Error(`_toAtomicLevel produced an empty molfile for HELM '${srcHelm}'`);
|
|
401
|
+
let smiles: string;
|
|
402
|
+
try {
|
|
403
|
+
smiles = grok.chem.convert(molfile, grok.chem.Notation.Unknown, grok.chem.Notation.Smiles);
|
|
404
|
+
} catch (err: any) {
|
|
405
|
+
throw new Error(`SMILES conversion threw for HELM '${srcHelm}': ${err?.message ?? err}\n` +
|
|
406
|
+
`--- MOLFILE START ---\n${molfile}\n--- MOLFILE END ---`);
|
|
407
|
+
}
|
|
408
|
+
// RDKit signals a parse failure by returning the literal string
|
|
409
|
+
// "MALFORMED_INPUT_VALUE" — surface it together with the offending molfile.
|
|
410
|
+
if (smiles === 'MALFORMED_INPUT_VALUE' || /^MALFORMED/.test(smiles)) {
|
|
411
|
+
throw new Error(`RDKit could not parse molfile produced for HELM '${srcHelm}'.\n` +
|
|
412
|
+
`--- MOLFILE START ---\n${molfile}\n--- MOLFILE END ---`);
|
|
413
|
+
}
|
|
414
|
+
return smiles;
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
// Unmodified RNA HELM — regression baseline. The linear path must produce
|
|
418
|
+
// a real RNA backbone (sugar + phosphate + base per nucleotide), not just
|
|
419
|
+
// a chain of bases.
|
|
420
|
+
test('rna-canonical', async () => {
|
|
421
|
+
const smiles = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p.r(G)p}$$$$`);
|
|
422
|
+
// Should at minimum contain phosphate (P), ribose oxygens, and a purine ring.
|
|
423
|
+
expect(/P/.test(smiles), true, `expected phosphate in SMILES: ${smiles}`);
|
|
424
|
+
// Purine fragment (any ring closure digit): n<d>cnc<d> or N<d>C=N (case insensitive).
|
|
425
|
+
expect(/n\dcnc\d/.test(smiles) || /n\dcnc/i.test(smiles), true,
|
|
426
|
+
`expected purine ring fragment in SMILES: ${smiles}`);
|
|
427
|
+
});
|
|
428
|
+
|
|
429
|
+
// Modified base — 5-methylcytosine. Linear path should preserve the
|
|
430
|
+
// methyl branch on the cytidine of position 0.
|
|
431
|
+
test('rna-modified-base', async () => {
|
|
432
|
+
const smilesPlain = await helmRnaLinearToSmiles(`RNA1{r(C)p.r(A)p}$$$$`);
|
|
433
|
+
const smilesMod = await helmRnaLinearToSmiles(`RNA1{r([m5C])p.r(A)p}$$$$`);
|
|
434
|
+
expect(smilesPlain !== smilesMod, true,
|
|
435
|
+
`m5C must change the SMILES vs. plain C. plain=${smilesPlain} mod=${smilesMod}`);
|
|
436
|
+
});
|
|
437
|
+
|
|
438
|
+
// Modified phosphate — phosphorothioate. The linker between positions 0
|
|
439
|
+
// and 1 must change (S replaces a non-bridging O).
|
|
440
|
+
test('rna-modified-phosphate', async () => {
|
|
441
|
+
const smilesPlain = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p}$$$$`);
|
|
442
|
+
const smilesMod = await helmRnaLinearToSmiles(`RNA1{r(A)[Rsp].r(C)p}$$$$`);
|
|
443
|
+
expect(smilesPlain !== smilesMod, true,
|
|
444
|
+
`Rsp phosphorothioate must change the SMILES vs. plain p. plain=${smilesPlain} mod=${smilesMod}`);
|
|
445
|
+
expect(/S/.test(smilesMod), true,
|
|
446
|
+
`expected sulfur in phosphorothioate SMILES: ${smilesMod}`);
|
|
447
|
+
// HELM explicitly wrote 2 phosphates (one Rsp at position 0, one p at
|
|
448
|
+
// position 1); both must appear in the molecule, so two P atoms total.
|
|
449
|
+
const pCountPlain = (smilesPlain.match(/P/g) || []).length;
|
|
450
|
+
const pCountMod = (smilesMod.match(/P/g) || []).length;
|
|
451
|
+
expect(pCountPlain, 2, `expected 2 phosphates in plain: ${smilesPlain}`);
|
|
452
|
+
expect(pCountMod, 2, `expected 2 phosphates in modified: ${smilesMod}`);
|
|
453
|
+
});
|
|
454
|
+
|
|
455
|
+
// Modified sugar — 2'-fluoro ribose. Position 0 sugar gets a fluorine.
|
|
456
|
+
test('rna-modified-sugar', async () => {
|
|
457
|
+
const smilesPlain = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p}$$$$`);
|
|
458
|
+
const smilesMod = await helmRnaLinearToSmiles(`RNA1{[fl2r](A)p.r(C)p}$$$$`);
|
|
459
|
+
expect(smilesPlain !== smilesMod, true,
|
|
460
|
+
`fl2r (2'-F ribose) must change the SMILES vs. plain r. plain=${smilesPlain} mod=${smilesMod}`);
|
|
461
|
+
expect(/F/.test(smilesMod), true,
|
|
462
|
+
`expected fluorine in 2'-F ribose SMILES: ${smilesMod}`);
|
|
463
|
+
});
|
|
464
|
+
|
|
465
|
+
// HELM omits the trailing phosphate (3'-OH terminus on the sugar). The
|
|
466
|
+
// splitter must split the partial `r(C)` into [r, C], assembly must skip
|
|
467
|
+
// the trailing P emit, and counts must agree.
|
|
468
|
+
test('rna-no-trailing-phosphate', async () => {
|
|
469
|
+
const smilesWith = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)p}$$$$`);
|
|
470
|
+
const smilesNoTail = await helmRnaLinearToSmiles(`RNA1{r(A)p.r(C)}$$$$`);
|
|
471
|
+
// Both should produce valid molecules with at least one P (the linker
|
|
472
|
+
// between the two nucleotides is always present).
|
|
473
|
+
expect(/P/.test(smilesNoTail), true,
|
|
474
|
+
`expected the inter-nucleotide phosphate to remain: ${smilesNoTail}`);
|
|
475
|
+
// The version WITH trailing phosphate should have exactly one more P
|
|
476
|
+
// atom than the version without.
|
|
477
|
+
const pCountWith = (smilesWith.match(/P/g) || []).length;
|
|
478
|
+
const pCountNoTail = (smilesNoTail.match(/P/g) || []).length;
|
|
479
|
+
expect(pCountWith, pCountNoTail + 1,
|
|
480
|
+
`expected pCountWith - pCountNoTail === 1, got with=${pCountWith}, noTail=${pCountNoTail}. ` +
|
|
481
|
+
`with=${smilesWith}, noTail=${smilesNoTail}`);
|
|
482
|
+
});
|
|
483
|
+
|
|
484
|
+
// Missing trailing phosphate combined with modifications.
|
|
485
|
+
test('rna-no-trailing-phosphate-with-modifications', async () => {
|
|
486
|
+
const smiles = await helmRnaLinearToSmiles(`RNA1{[fl2r]([m5C])[Rsp].r(A)}$$$$`);
|
|
487
|
+
expect(/F/.test(smiles), true, `expected fluorine: ${smiles}`);
|
|
488
|
+
expect(/S/.test(smiles), true, `expected sulfur: ${smiles}`);
|
|
489
|
+
// Exactly one phosphate (the Rsp linker), no trailing P.
|
|
490
|
+
const pCount = (smiles.match(/P/g) || []).length;
|
|
491
|
+
expect(pCount, 1, `expected exactly 1 phosphate: ${smiles}`);
|
|
492
|
+
});
|
|
493
|
+
|
|
494
|
+
// All three modifications combined. End-to-end smoke test.
|
|
495
|
+
test('rna-all-modifications', async () => {
|
|
496
|
+
const smiles = await helmRnaLinearToSmiles(`RNA1{[fl2r]([m5C])[Rsp].r(A)p}$$$$`);
|
|
497
|
+
expect(/F/.test(smiles), true, `expected fluorine: ${smiles}`);
|
|
498
|
+
expect(/S/.test(smiles), true, `expected sulfur: ${smiles}`);
|
|
499
|
+
expect(/P/.test(smiles), true, `expected phosphorus: ${smiles}`);
|
|
500
|
+
});
|
|
501
|
+
});
|
|
502
|
+
|
|
359
503
|
|
|
360
504
|
function polishMolfile(mol: string): string {
|
|
361
505
|
return mol.replaceAll('\r\n', '\n')
|