@datagrok/bio 2.25.17 → 2.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/dist/282.js +2 -0
  2. package/dist/282.js.map +1 -0
  3. package/dist/287.js +2 -0
  4. package/dist/287.js.map +1 -0
  5. package/dist/288.js +2 -0
  6. package/dist/288.js.map +1 -0
  7. package/dist/422.js +2 -0
  8. package/dist/422.js.map +1 -0
  9. package/dist/455.js +1 -1
  10. package/dist/455.js.map +1 -1
  11. package/dist/767.js +2 -0
  12. package/dist/767.js.map +1 -0
  13. package/dist/package-test.js +5 -5
  14. package/dist/package-test.js.map +1 -1
  15. package/dist/package.js +3 -3
  16. package/dist/package.js.map +1 -1
  17. package/files/samples/antibodies.csv +494 -0
  18. package/package.json +2 -2
  19. package/src/package-api.ts +21 -0
  20. package/src/package.g.ts +22 -1
  21. package/src/package.ts +30 -1
  22. package/src/utils/annotations/annotation-actions.ts +130 -0
  23. package/src/utils/annotations/annotation-manager-ui.ts +118 -0
  24. package/src/utils/annotations/annotation-manager.ts +163 -0
  25. package/src/utils/annotations/liability-scanner-ui.ts +88 -0
  26. package/src/utils/annotations/liability-scanner.ts +147 -0
  27. package/src/utils/annotations/numbering-ui.ts +450 -0
  28. package/src/utils/antibody-numbering (WIP)/alignment.ts +578 -0
  29. package/src/utils/antibody-numbering (WIP)/annotator.ts +120 -0
  30. package/src/utils/antibody-numbering (WIP)/data/blosum62.ts +55 -0
  31. package/src/utils/antibody-numbering (WIP)/data/consensus-aho.ts +155 -0
  32. package/src/utils/antibody-numbering (WIP)/data/consensus-imgt.ts +162 -0
  33. package/src/utils/antibody-numbering (WIP)/data/consensus-kabat.ts +157 -0
  34. package/src/utils/antibody-numbering (WIP)/data/consensus-martin.ts +152 -0
  35. package/src/utils/antibody-numbering (WIP)/data/consensus.ts +36 -0
  36. package/src/utils/antibody-numbering (WIP)/data/regions.ts +63 -0
  37. package/src/utils/antibody-numbering (WIP)/index.ts +31 -0
  38. package/src/utils/antibody-numbering (WIP)/testdata.ts +5356 -0
  39. package/src/utils/antibody-numbering (WIP)/types.ts +69 -0
  40. package/src/utils/context-menu.ts +42 -2
  41. package/src/utils/get-region-func-editor.ts +18 -2
  42. package/src/utils/get-region.ts +65 -1
  43. package/src/widgets/representations.ts +53 -2
  44. package/src/widgets/sequence-scrolling-widget.ts +28 -18
  45. package/test-console-output-1.log +546 -560
  46. package/test-record-1.mp4 +0 -0
@@ -0,0 +1,147 @@
1
+ /* eslint-disable max-len */
2
+ import * as DG from 'datagrok-api/dg';
3
+
4
+ import {ISeqHandler} from '@datagrok-libraries/bio/src/utils/macromolecule/seq-handler';
5
+ import {
6
+ SeqAnnotation, SeqAnnotationHit, RowAnnotationData,
7
+ AnnotationVisualType, AnnotationCategory, LiabilitySeverity,
8
+ ANNOTATION_COLORS,
9
+ } from '@datagrok-libraries/bio/src/utils/macromolecule/annotations';
10
+ import {
11
+ getOrCreateAnnotationColumn, setColumnAnnotations, setRowAnnotations,
12
+ getColumnAnnotations, getRowAnnotations, mergeRowHits,
13
+ } from './annotation-manager';
14
+
15
/**
 * Describes one sequence-liability motif that the scanner searches for.
 */
export interface LiabilityRule {
  /** Rule identifier; emitted as `annotationId` on every hit and as `id` of the column-level annotation. */
  id: string;
  /** Display name, copied to the column-level annotation. */
  name: string;
  /** Regular expression executed against the canonical single-letter string of each row. */
  pattern: RegExp;
  /** Motif length; a length of 1 yields a point annotation, anything else a motif annotation. */
  length: number;
  /** Severity copied to the column-level annotation. */
  severity: LiabilitySeverity;
  /** Sub-category used for grouping (e.g. "deamidation", "oxidation"). */
  ruleCategory: string;
  /** Color copied to the column-level annotation. */
  color: string;
  /** Only rules with `enabled === true` take part in a scan. */
  enabled: boolean;
}
27
+
28
/**
 * Built-in liability rules for antibody engineering.
 *
 * Every pattern carries the `g` flag, so these RegExp objects are stateful
 * (`lastIndex`) and are shared by everything that imports this array.
 */
export const BUILTIN_LIABILITY_RULES: LiabilityRule[] = [
  // --- Deamidation ---
  {
    id: 'deamid-ng', name: 'Deamidation (NG)', pattern: /NG/g, length: 2,
    severity: LiabilitySeverity.High, ruleCategory: 'deamidation',
    color: ANNOTATION_COLORS.liability.deamidation, enabled: true,
  },
  {
    id: 'deamid-ns', name: 'Deamidation (NS)', pattern: /NS/g, length: 2,
    severity: LiabilitySeverity.Medium, ruleCategory: 'deamidation',
    color: ANNOTATION_COLORS.liability.deamidation, enabled: true,
  },
  {
    id: 'deamid-na', name: 'Deamidation (NA)', pattern: /NA/g, length: 2,
    severity: LiabilitySeverity.Low, ruleCategory: 'deamidation',
    color: ANNOTATION_COLORS.liability.deamidation, enabled: true,
  },
  {
    id: 'deamid-nd', name: 'Deamidation (ND)', pattern: /ND/g, length: 2,
    severity: LiabilitySeverity.Low, ruleCategory: 'deamidation',
    color: ANNOTATION_COLORS.liability.deamidation, enabled: true,
  },
  {
    id: 'deamid-nt', name: 'Deamidation (NT)', pattern: /NT/g, length: 2,
    severity: LiabilitySeverity.Low, ruleCategory: 'deamidation',
    color: ANNOTATION_COLORS.liability.deamidation, enabled: true,
  },
  // --- Isomerization ---
  {
    id: 'isom-dg', name: 'Isomerization (DG)', pattern: /DG/g, length: 2,
    severity: LiabilitySeverity.High, ruleCategory: 'isomerization',
    color: ANNOTATION_COLORS.liability.isomerization, enabled: true,
  },
  {
    id: 'isom-ds', name: 'Isomerization (DS)', pattern: /DS/g, length: 2,
    severity: LiabilitySeverity.Medium, ruleCategory: 'isomerization',
    color: ANNOTATION_COLORS.liability.isomerization, enabled: true,
  },
  // --- Oxidation ---
  {
    id: 'oxid-m', name: 'Oxidation (Met)', pattern: /M/g, length: 1,
    severity: LiabilitySeverity.Medium, ruleCategory: 'oxidation',
    color: ANNOTATION_COLORS.liability.oxidation, enabled: true,
  },
  {
    id: 'oxid-w', name: 'Oxidation (Trp)', pattern: /W/g, length: 1,
    severity: LiabilitySeverity.Low, ruleCategory: 'oxidation',
    color: ANNOTATION_COLORS.liability.oxidation, enabled: true,
  },
  // --- N-glycosylation sequon: N, any residue except P, then S or T ---
  {
    id: 'glyco-nxst', name: 'N-glycosylation', pattern: /N[^P][ST]/g, length: 3,
    severity: LiabilitySeverity.High, ruleCategory: 'glycosylation',
    color: ANNOTATION_COLORS.liability.glycosylation, enabled: true,
  },
  // --- Cysteine (the only rule that is disabled by default) ---
  {
    id: 'free-cys', name: 'Free Cysteine', pattern: /C/g, length: 1,
    severity: LiabilitySeverity.Info, ruleCategory: 'freeCysteine',
    color: ANNOTATION_COLORS.liability.freeCysteine, enabled: false,
  },
];
42
+
43
/**
 * Concatenates the canonical form of every monomer of one row into a single string.
 *
 * @param sh Sequence handler that provides the split sequence for a row.
 * @param rowIdx Row whose sequence is converted.
 * @returns The canonical monomers of the row joined without a separator.
 */
function getCanonicalString(sh: ISeqHandler, rowIdx: number): string {
  const monomers = sh.getSplitted(rowIdx);
  return Array.from({length: monomers.length}, (_unused, pos) => monomers.getCanonical(pos)).join('');
}
51
+
52
/** Outcome of a liability scan over one sequence column. */
export interface ScanLiabilitiesResult {
  /** Column-level annotation definitions, one per enabled rule that produced at least one hit. */
  annotations: SeqAnnotation[];
  /** Per-row hit data, indexed by row. */
  rowData: RowAnnotationData[];
  /** Number of hits summed over all rows and rules. */
  totalHits: number;
}
57
+
58
/**
 * Scans all rows of a macromolecule column for liability motifs.
 *
 * Each enabled rule is matched against the canonical single-letter string of every row.
 * Matching restarts one character after the start of the previous match, so overlapping
 * motifs are all reported (e.g. both `NNS` and `NST` in `NNST`) and a pattern that can
 * match the empty string cannot stall the scan.
 *
 * The rules' own RegExp objects are never executed: each pattern is copied into a private
 * RegExp that is guaranteed to be global. This keeps the shared `lastIndex` of the rule
 * objects untouched and prevents an endless loop for a rule supplied without the `g` flag.
 *
 * @param col Sequence column; only its `length` is read here.
 * @param sh Sequence handler used to obtain canonical monomers and position names.
 * @param rules Candidate rules; rules with `enabled === false` are ignored.
 * @returns Column-level annotations for the rules that had hits, per-row hit arrays
 *   (one entry per row, possibly empty) and the total hit count.
 */
export function scanLiabilities(
  col: DG.Column<string>,
  sh: ISeqHandler,
  rules: LiabilityRule[],
): ScanLiabilitiesResult {
  const enabledRules = rules.filter((r) => r.enabled);
  // Private, always-global copies of the patterns (see the function documentation).
  const matchers = enabledRules.map((r) => {
    const flags = r.pattern.flags.includes('g') ? r.pattern.flags : `${r.pattern.flags}g`;
    return new RegExp(r.pattern.source, flags);
  });
  const posList = sh.posList;

  // Number of hits per rule id; a rule without an entry never matched.
  const ruleHitCounts = new Map<string, number>();

  const rowData: RowAnnotationData[] = new Array(col.length);
  let totalHits = 0;

  for (let rowIdx = 0; rowIdx < col.length; rowIdx++) {
    const seq = getCanonicalString(sh, rowIdx);
    const hits: SeqAnnotationHit[] = [];

    for (let ruleIdx = 0; ruleIdx < enabledRules.length; ruleIdx++) {
      const rule = enabledRules[ruleIdx];
      const matcher = matchers[ruleIdx];
      matcher.lastIndex = 0;
      let match: RegExpExecArray | null;
      while ((match = matcher.exec(seq)) !== null) {
        hits.push({
          annotationId: rule.id,
          positionIndex: match.index,
          positionName: match.index < posList.length ? posList[match.index] : undefined,
          matchedMonomers: match[0],
        });
        ruleHitCounts.set(rule.id, (ruleHitCounts.get(rule.id) ?? 0) + 1);
        totalHits++;
        // Always advance by exactly one character from the match start: this finds
        // overlapping motifs and guarantees progress for zero-length matches.
        matcher.lastIndex = match.index + 1;
      }
    }
    rowData[rowIdx] = hits;
  }

  // Build column-level annotations only for rules that had hits
  const annotations: SeqAnnotation[] = enabledRules
    .filter((r) => ruleHitCounts.has(r.id))
    .map((r) => ({
      id: r.id,
      name: r.name,
      description: `${r.ruleCategory} liability pattern (${ruleHitCounts.get(r.id)} hits)`,
      start: null,
      end: null,
      visualType: r.length === 1 ? AnnotationVisualType.Point : AnnotationVisualType.Motif,
      category: AnnotationCategory.Liability,
      color: r.color,
      severity: r.severity,
      motifPattern: r.pattern.source,
      autoGenerated: true,
    }));

  return {annotations, rowData, totalHits};
}
115
+
116
/**
 * Stores the outcome of a liability scan on the sequence column and its companion
 * annotation column.
 *
 * @param df Data frame that owns (or will own) the companion annotation column.
 * @param seqCol Sequence column that was scanned.
 * @param result Scan outcome produced for `seqCol`.
 */
export function applyLiabilityScanResults(
  df: DG.DataFrame,
  seqCol: DG.Column<string>,
  result: ScanLiabilitiesResult,
): void {
  // Column level: drop earlier liability definitions, keep everything else, append the new ones.
  const retained = getColumnAnnotations(seqCol)
    .filter((annot) => annot.category !== AnnotationCategory.Liability);
  setColumnAnnotations(seqCol, retained.concat(result.annotations));

  // Row level: merge the fresh hits into whatever the companion column already holds.
  // NOTE(review): the two boolean flags are passed to mergeRowHits as-is; presumably they
  // keep region hits from numbering and replace old liability hits — confirm against mergeRowHits.
  const companionCol = getOrCreateAnnotationColumn(df, seqCol);
  for (let rowIdx = 0; rowIdx < result.rowData.length; rowIdx++) {
    const previousHits = getRowAnnotations(companionCol, rowIdx) ?? [];
    const mergedHits = mergeRowHits(previousHits, result.rowData[rowIdx], false, true);
    setRowAnnotations(companionCol, rowIdx, mergedHits);
  }
}
134
+
135
/**
 * Adds an integer column named `<sequence column name>_liability_count` that holds the
 * number of liability hits found in each row.
 *
 * @param df Data frame that receives the new column.
 * @param seqCol Scanned sequence column; only its name is used.
 * @param result Scan outcome whose per-row hit arrays are counted.
 * @returns The newly added count column.
 */
export function createLiabilitySummaryColumn(
  df: DG.DataFrame,
  seqCol: DG.Column<string>,
  result: ScanLiabilitiesResult,
): DG.Column<number> {
  const hitsPerRow = result.rowData.map((rowHits) => rowHits.length);
  const summaryCol = df.columns.addNewInt(`${seqCol.name}_liability_count`);
  for (let rowIdx = 0; rowIdx < hitsPerRow.length; rowIdx++)
    summaryCol.set(rowIdx, hitsPerRow[rowIdx]);
  return summaryCol;
}
@@ -0,0 +1,450 @@
1
+ /* eslint-disable max-len */
2
+ import * as grok from 'datagrok-api/grok';
3
+ import * as ui from 'datagrok-api/ui';
4
+ import * as DG from 'datagrok-api/dg';
5
+
6
+ import {TAGS as bioTAGS, ALIGNMENT, ALPHABET, NOTATION} from '@datagrok-libraries/bio/src/utils/macromolecule';
7
+ import {
8
+ SeqAnnotation, SeqAnnotationHit, AnnotationCategory,
9
+ } from '@datagrok-libraries/bio/src/utils/macromolecule/annotations';
10
+ import {NumberingScheme} from '@datagrok-libraries/bio/src/utils/macromolecule/numbering-schemes';
11
+ import {
12
+ setColumnAnnotations, getColumnAnnotations,
13
+ getOrCreateAnnotationColumn, getRowAnnotations, setRowAnnotations, mergeRowHits,
14
+ } from './annotation-manager';
15
+ import {_package} from '../../package';
16
+ import type {NumberingResult, Scheme} from '../antibody-numbering (WIP)';
17
+
18
/** Key that identifies the bundled TypeScript engine in the engine list. */
const BUILTIN_ENGINE_KEY = '__builtin__';

/** Caption under which the bundled TypeScript engine is listed in the engine dropdown. */
const BUILTIN_ENGINE_LABEL = 'Built-in (TypeScript)';
20
+
21
/**
 * One selectable numbering engine: either a function discovered at run time
 * or the engine bundled with this package.
 */
interface NumberingEngine {
  /** Caption shown in the engine dropdown; also used to look the engine up after selection. */
  label: string;
  /**
   * Unique key: `<package name>:<function name>` (or just the function name when the
   * function has no package) for discovered engines, BUILTIN_ENGINE_KEY for the bundled one.
   */
  key: string;
  /** Function to invoke, or `null` when the bundled TypeScript engine should run. */
  func: DG.Func | null;
}
30
+
31
/**
 * Lists the available antibody numbering engines.
 *
 * Functions registered with `meta.role = 'antibodyNumbering'` come first, in the order
 * returned by the lookup; the bundled TypeScript engine is appended as the last entry.
 *
 * @returns The discovered engines followed by the built-in one.
 * @throws Error when no function with the `antibodyNumbering` role is registered, so the
 *   built-in engine is only ever offered alongside at least one external engine.
 */
function discoverEngines(): NumberingEngine[] {
  const external = DG.Func.find({meta: {role: 'antibodyNumbering'}});
  if (external.length === 0)
    throw new Error('No external antibody numbering engines found. Make sure that Proteomics plugin is installed and up to date.');

  const engines: NumberingEngine[] = external.map((fn) => {
    const pkgName = fn.package?.name ?? '';
    return {
      label: fn.friendlyName || fn.name,
      key: pkgName ? `${pkgName}:${fn.name}` : fn.name,
      func: fn,
    };
  });

  // The bundled engine always closes the list.
  engines.push({label: BUILTIN_ENGINE_LABEL, key: BUILTIN_ENGINE_KEY, func: null});
  return engines;
}
53
+
54
/**
 * Packs the results of the TypeScript numbering engine into a data frame with one row per
 * result and the string columns `position_names`, `chain_type`, `annotations_json`,
 * `numbering_detail` and `numbering_map`.
 *
 * A result that carries an error and has `percentIdentity` below 0.3 is written as a blank
 * row (`'[]'` for `annotations_json`, empty strings elsewhere); every other result is
 * written out, with the structured fields serialized as JSON.
 *
 * @param results Per-sequence numbering results, in row order.
 * @returns A data frame with `results.length` rows.
 */
export function numberingResultsToDataFrame(results: NumberingResult[]): DG.DataFrame {
  const rowCount = results.length;
  const newStringColumn = (name: string) => DG.Column.fromType(DG.COLUMN_TYPE.STRING, name, rowCount);

  const positionNamesCol = newStringColumn('position_names');
  const chainTypeCol = newStringColumn('chain_type');
  const annotationsCol = newStringColumn('annotations_json');
  const detailCol = newStringColumn('numbering_detail');
  const mapCol = newStringColumn('numbering_map');

  for (let row = 0; row < rowCount; row++) {
    const res = results[row];
    const rejected = !!res.error && res.percentIdentity < 0.3;
    if (rejected) {
      positionNamesCol.set(row, '');
      chainTypeCol.set(row, '');
      annotationsCol.set(row, '[]');
      detailCol.set(row, '');
      mapCol.set(row, '');
      continue;
    }
    positionNamesCol.set(row, res.positionNames);
    chainTypeCol.set(row, res.chainType);
    annotationsCol.set(row, JSON.stringify(res.annotations));
    detailCol.set(row, JSON.stringify(res.numberingDetail));
    mapCol.set(row, JSON.stringify(res.numberingMap));
  }

  return DG.DataFrame.fromColumns([positionNamesCol, chainTypeCol, annotationsCol, detailCol, mapCol]);
}
83
+
84
/**
 * Numbers every row of a sequence column with the bundled TypeScript engine.
 *
 * The engine module is loaded lazily through a dynamic import, so it is only fetched
 * when the built-in engine is actually used.
 *
 * @param seqCol Column holding the raw sequences; null cells are treated as empty strings.
 * @param schemeName Scheme name; it is lower-cased before being handed to the engine.
 * @returns The numbering results as a data frame (see numberingResultsToDataFrame).
 */
async function runBuiltinNumbering(
  seqCol: DG.Column<string>, schemeName: string,
): Promise<DG.DataFrame> {
  const engine = await import('../antibody-numbering (WIP)');
  const scheme = schemeName.toLowerCase() as Scheme;

  const sequences: string[] = Array.from(
    {length: seqCol.length},
    (_unused, row) => engine.extractSequence(seqCol.get(row) ?? ''),
  );

  return numberingResultsToDataFrame(engine.numberSequences(sequences, scheme));
}
100
+
101
/**
 * Opens the "Apply Antibody Numbering" dialog for the table of the current table view.
 *
 * Shows a warning and returns when no table is open or the table has no macromolecule
 * column. On OK the selected engine (a discovered function or the built-in TypeScript
 * engine) is run and its output is applied to the selected sequence column; failures are
 * reported through the shell instead of being thrown.
 *
 * An error thrown by discoverEngines (no external engine registered) is not handled
 * here and propagates to the caller.
 */
export function showNumberingSchemeDialog(): void {
  const df = grok.shell.tv?.dataFrame;
  if (!df) {
    grok.shell.warning('No table open');
    return;
  }

  const seqCols = df.columns.bySemTypeAll(DG.SEMTYPE.MACROMOLECULE);
  if (seqCols.length === 0) {
    grok.shell.warning('No macromolecule columns found');
    return;
  }

  const engines = discoverEngines();
  const engineLabels = engines.map((e) => e.label);
  const schemeChoices = Object.values(NumberingScheme);

  const tableInput = ui.input.table('Table', {value: df});
  const seqInput = ui.input.column('Sequence', {
    table: df, value: seqCols[0],
    filter: (col: DG.Column) => col.semType === DG.SEMTYPE.MACROMOLECULE,
  });
  const schemeInput = ui.input.choice('Scheme', {value: NumberingScheme.IMGT, items: schemeChoices});
  const engineInput = ui.input.choice('Engine', {
    value: engineLabels[0], items: engineLabels,
  });
  // const populateRegions = ui.input.bool('Populate FR/CDR regions', {value: true});
  // const openVdRegions = ui.input.bool('Open VD Regions viewer', {value: true});

  const dialog = ui.dialog({title: 'Apply Antibody Numbering'})
    .add(ui.inputs([tableInput, seqInput, schemeInput, engineInput]))
    .onOK(async () => {
      // Both inputs can be cleared by the user; validate before any work starts so the
      // user gets a clear message rather than a null-dereference error.
      const seqCol = seqInput.value;
      const schemeName = schemeInput.value;
      if (!seqCol || !schemeName) {
        grok.shell.warning('Select a sequence column and a numbering scheme');
        return;
      }

      // Fall back to the last entry (the built-in engine) when the label cannot be resolved.
      const selectedLabel = engineInput.value;
      const engine = engines.find((e) => e.label === selectedLabel) ?? engines[engines.length - 1];
      const pi = DG.TaskBarProgressIndicator.create(`Applying ${schemeName} numbering...`);
      try {
        let result: DG.DataFrame;
        if (engine.func)
          result = await engine.func.apply({df: df, seqCol: seqCol, scheme: schemeName.toLowerCase()});
        else
          result = await runBuiltinNumbering(seqCol, schemeName);

        applyNumberingResults(df, seqCol, result, schemeName, true, engine.label);

        // // Open VD Regions viewer
        // if (openVdRegions.value && grok.shell.tv) {
        //   try {
        //     await grok.shell.tv.dataFrame.plot.fromType('VdRegions', {});
        //   } catch (err) {
        //     console.warn('Could not open VD Regions viewer:', err);
        //   }
        // }
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        grok.shell.error(`Numbering failed: ${reason}`);
        console.error(err);
      } finally {
        // Close the progress indicator on success and on failure alike.
        pi.close();
      }
    });

  dialog.show();
}
165
+
166
/**
 * Maps character indices of the gap-free sequence to character indices of the gapped one.
 *
 * Element `k` of the returned array is the index in `gappedSeq` of the `k`-th character
 * that is neither `-` nor `.`. Needed for already aligned (MSA) source columns: numbering
 * engines strip gaps, so the indices they report refer to the gap-free sequence.
 *
 * @param gappedSeq Sequence string that may contain `-` and `.` gap characters.
 * @returns Gapped indices of all non-gap characters, in ascending order.
 */
function buildUngappedToGappedMap(gappedSeq: string): number[] {
  const residueIndices: number[] = [];
  for (let idx = 0; idx < gappedSeq.length; idx++) {
    const ch = gappedSeq[idx];
    if (ch === '-' || ch === '.')
      continue;
    residueIndices.push(idx);
  }
  return residueIndices;
}
177
+
178
/**
 * Splits a position code into a sortable pair of number and insertion letter.
 *
 * Examples: "27" → [27, ""], "111A" → [111, "A"], "27B" → [27, "B"].
 * A code that is not "digits followed by at most one upper-case letter" is returned as
 * [Infinity, code].
 *
 * @param code Position code to parse.
 * @returns Tuple of numeric part and insertion letter (empty string when absent).
 */
function parsePositionCode(code: string): [number, string] {
  const parts = /^(\d+)([A-Z]?)$/.exec(code);
  return parts === null ? [Infinity, code] : [Number.parseInt(parts[1], 10), parts[2]];
}
185
+
186
/**
 * Returns a sorted copy of the given position codes: ascending by numeric part, ties
 * broken by comparing the insertion letters with `localeCompare`. The input array is
 * left untouched.
 *
 * @param codes Position codes in arbitrary order.
 * @returns A new array with the codes in scheme order.
 */
function sortPositionCodes(codes: string[]): string[] {
  const bySchemeOrder = (left: string, right: string): number => {
    const [leftNum, leftIns] = parsePositionCode(left);
    const [rightNum, rightIns] = parsePositionCode(right);
    return leftNum === rightNum ? leftIns.localeCompare(rightIns) : leftNum - rightNum;
  };
  return [...codes].sort(bySchemeOrder);
}
195
+
196
/**
 * Parses one `numbering_map` cell (JSON object: position code → character index).
 *
 * Only entries whose value is an integer inside `[0, seqLength)` are kept, because
 * anything else cannot be placed into the aligned string.
 *
 * @returns The cleaned map, or `null` when the cell is empty, is not a JSON object, or
 *   contains no usable entry.
 */
function parseRowNumberingMap(
  mapStr: string | null | undefined, seqLength: number,
): Record<string, number> | null {
  if (!mapStr) return null;
  let parsed: unknown;
  try {
    parsed = JSON.parse(mapStr);
  } catch {
    return null;
  }
  if (parsed == null || typeof parsed !== 'object' || Array.isArray(parsed)) return null;

  const cleaned: Record<string, number> = {};
  let kept = 0;
  for (const [code, charIdx] of Object.entries(parsed as Record<string, unknown>)) {
    if (typeof charIdx === 'number' && Number.isInteger(charIdx) && charIdx >= 0 && charIdx < seqLength) {
      cleaned[code] = charIdx;
      kept++;
    }
  }
  return kept > 0 ? cleaned : null;
}

/**
 * Builds the unified position list over all rows and creates an aligned sequence column.
 *
 * Residues outside the numbered region are kept as flanks and padded with gaps.
 * Layout of every aligned string:
 *   [pre-region gaps + residues] [scheme-aligned region] [post-region residues + gaps]
 * Flanking positions are named `N-<k>` (before the region) and `C+<k>` (after it).
 *
 * Rows without a usable numbering map (empty cell, malformed JSON, empty object, or no
 * index inside the sequence) become all-gap rows. Treating the empty object as "no map"
 * matters: the minimum of an empty index list is `Infinity`, which would otherwise make
 * the flank-name loop run forever.
 *
 * The returned column is not added to `df`; `df` is only used for the row count and for
 * picking an unused column name.
 *
 * @param df Data frame the aligned column is intended for.
 * @param seqCol Source sequence column; the map indices refer to its raw cell strings.
 * @param result Engine output; its `numbering_map` column is read.
 * @returns The aligned column, the full position list (including flanks) and the index at
 *   which the scheme region starts, or `null` when there is no `numbering_map` column or
 *   no row yields a position code.
 */
function createAlignedColumn(
  df: DG.DataFrame, seqCol: DG.Column<string>, result: DG.DataFrame,
): {alignedCol: DG.Column<string>; unifiedPositions: string[]; preOffset: number} | null {
  const numberingMapCol = result.col('numbering_map');
  if (!numberingMapCol) return null;

  // Pass 1: collect all scheme position codes and per-row flanking lengths
  const allCodes = new Set<string>();
  const rowMaps: (Record<string, number> | null)[] = [];
  const rowPreLens: number[] = []; // chars before first numbered position
  const rowPostLens: number[] = []; // chars after last numbered position
  const rowMaxChars: number[] = []; // index of last numbered char (-1 when the row has no map)
  // Maxima are tracked while iterating; spreading one argument per row into Math.max
  // can exceed the engine's argument limit on large tables.
  let maxPreLen = 0;
  let maxPostLen = 0;

  for (let i = 0; i < result.rowCount; i++) {
    const rawSeq = seqCol.get(i) ?? '';
    const map = parseRowNumberingMap(numberingMapCol.get(i), rawSeq.length);
    rowMaps.push(map);
    if (!map) {
      rowPreLens.push(0);
      rowPostLens.push(0);
      rowMaxChars.push(-1);
      continue;
    }

    let minChar = Infinity;
    let maxChar = -Infinity;
    for (const [code, charIdx] of Object.entries(map)) {
      allCodes.add(code);
      if (charIdx < minChar) minChar = charIdx;
      if (charIdx > maxChar) maxChar = charIdx;
    }

    // The map is non-empty and in range, so 0 <= minChar <= maxChar < rawSeq.length.
    const preLen = minChar;
    const postLen = rawSeq.length - maxChar - 1;
    rowPreLens.push(preLen);
    rowPostLens.push(postLen);
    rowMaxChars.push(maxChar);
    if (preLen > maxPreLen) maxPreLen = preLen;
    if (postLen > maxPostLen) maxPostLen = postLen;
  }

  if (allCodes.size === 0) return null;

  // Build position names: [pre-flanking] + [scheme positions] + [post-flanking]
  const schemePositions = sortPositionCodes(Array.from(allCodes));
  const preNames: string[] = [];
  for (let p = maxPreLen; p > 0; p--)
    preNames.push(`N-${p}`);
  const postNames: string[] = [];
  for (let p = 1; p <= maxPostLen; p++)
    postNames.push(`C+${p}`);

  const unifiedPositions = [...preNames, ...schemePositions, ...postNames];
  const totalLen = unifiedPositions.length;
  const preOffset = maxPreLen; // scheme region starts at this index in the aligned string
  const postStart = preOffset + schemePositions.length;

  // Map scheme position codes → index in the full unified list
  const posToUnifiedIdx = new Map<string, number>();
  for (let s = 0; s < schemePositions.length; s++)
    posToUnifiedIdx.set(schemePositions[s], preOffset + s);

  // Pass 2: build aligned sequences
  const colName = df.columns.getUnusedName(`${seqCol.name} (aligned)`);
  const alignedCol = DG.Column.fromType(DG.COLUMN_TYPE.STRING, colName, df.rowCount);

  for (let i = 0; i < df.rowCount; i++) {
    const map = i < rowMaps.length ? rowMaps[i] : null;
    if (!map) {
      alignedCol.set(i, '-'.repeat(totalLen));
      continue;
    }

    const rawSeq = seqCol.get(i) ?? '';
    const aligned = new Array<string>(totalLen).fill('-');

    // Place scheme-numbered residues
    for (const [posCode, charIdx] of Object.entries(map)) {
      const uIdx = posToUnifiedIdx.get(posCode);
      if (uIdx != null && charIdx < rawSeq.length)
        aligned[uIdx] = rawSeq[charIdx];
    }

    // Place pre-region flanking residues (right-aligned within the pre-region block).
    // The flank is exactly the first `preLen` characters of the raw sequence.
    const preLen = rowPreLens[i];
    for (let p = 0; p < preLen; p++)
      aligned[preOffset - preLen + p] = rawSeq[p];

    // Place post-region flanking residues (left-aligned within the post-region block)
    const postLen = rowPostLens[i];
    const firstPostChar = rowMaxChars[i] + 1;
    for (let p = 0; p < postLen; p++)
      aligned[postStart + p] = rawSeq[firstPostChar + p];

    alignedCol.set(i, aligned.join(''));
  }

  // Set macromolecule tags on the aligned column
  alignedCol.semType = DG.SEMTYPE.MACROMOLECULE;
  alignedCol.setTag(bioTAGS.aligned, ALIGNMENT.SEQ_MSA);
  alignedCol.setTag(bioTAGS.alphabet, ALPHABET.PT);
  alignedCol.meta.units = NOTATION.FASTA;
  alignedCol.setTag(DG.Tags.CellRenderer, 'sequence');
  alignedCol.setTag(bioTAGS.positionNames, unifiedPositions.join(', '));

  return {alignedCol, unifiedPositions, preOffset};
}
312
+
313
/**
 * Applies numbering results (from either engine) to the sequence column and dataframe.
 *
 * Annotation strategy:
 * - Original column: column-level annotation definitions plus row-level region spans
 *   (char indices from numbering_map). No column-level position names (they differ per row).
 * - Aligned column: column-level annotations only (all rows share unified positions).
 *   Position names tag set to the unified list.
 *
 * Side effects: rewrites the `numbering_map` cells of `result` in place (indices are
 * remapped from gap-free to gapped), tags `seqCol` with the scheme, inserts the aligned
 * column right after `seqCol`, fires a values-changed event on `df` and shows a shell
 * message. A row whose JSON cannot be processed is skipped; it does not stop the
 * remaining rows from being stored.
 *
 * @param df Data frame that owns `seqCol`.
 * @param seqCol Sequence column that was numbered.
 * @param result Engine output with `position_names`, `chain_type`, `annotations_json`
 *   and, optionally, `numbering_map` columns.
 * @param schemeName Scheme name written to the numbering-scheme tag.
 * @param populateRegions Whether region annotations are written.
 * @param engineLabel Engine caption used in the "could not number" warning.
 */
function applyNumberingResults(
  df: DG.DataFrame, seqCol: DG.Column<string>, result: DG.DataFrame,
  schemeName: string, populateRegions: boolean, engineLabel: string,
): void {
  if (!result || result.rowCount === 0) {
    grok.shell.warning('No numbering results returned');
    return;
  }

  const posNamesCol = result.getCol('position_names');
  const chainTypeCol = result.getCol('chain_type');
  const annotJsonCol = result.getCol('annotations_json');
  const numberingMapCol = result.col('numbering_map');

  // Numbering engines strip gaps before processing, so numbering_map indices refer to the
  // gap-free sequence. The remapping is applied unconditionally: for a sequence without
  // gap characters it is the identity.
  if (numberingMapCol) {
    for (let i = 0; i < result.rowCount; i++) {
      const mapStr = numberingMapCol.get(i);
      if (!mapStr) continue;
      try {
        const posToCharIdx: Record<string, number> = JSON.parse(mapStr);
        const rawSeq = seqCol.get(i) ?? '';
        const ungapToGap = buildUngappedToGappedMap(rawSeq);
        const remapped: Record<string, number> = {};
        for (const [posCode, ungappedIdx] of Object.entries(posToCharIdx)) {
          if (ungappedIdx < ungapToGap.length)
            remapped[posCode] = ungapToGap[ungappedIdx];
        }
        numberingMapCol.set(i, JSON.stringify(remapped));
      } catch { /* skip */ }
    }
  }

  // Pick the row with the most annotations for column-level representative data
  let bestRowIdx = -1;
  let bestAnnotCount = -1;

  for (let i = 0; i < result.rowCount; i++) {
    const pn = posNamesCol.get(i);
    if (!pn || pn.length === 0) continue;
    const aj = annotJsonCol.get(i);
    let count = 0;
    if (aj)
      try { count = JSON.parse(aj).length; } catch { /* skip */ }
    if (count > bestAnnotCount) {
      bestAnnotCount = count;
      bestRowIdx = i;
    }
  }

  if (bestRowIdx < 0) {
    grok.shell.warning(`${engineLabel} could not number the sequences. Check that they are valid antibody variable region sequences.`);
    return;
  }

  const chainType = chainTypeCol.get(bestRowIdx) ?? '';
  const annotationsJson = annotJsonCol.get(bestRowIdx) ?? '[]';

  // Replaces the structure-category definitions of a column with those of the
  // representative row; any failure is logged with the supplied message.
  const setRegionDefinitions = (col: DG.Column<string>, failureMessage: string): void => {
    try {
      const regionAnnotations: SeqAnnotation[] = JSON.parse(annotationsJson);
      const existing = getColumnAnnotations(col).filter((a) => a.category !== AnnotationCategory.Structure);
      setColumnAnnotations(col, [...existing, ...regionAnnotations]);
    } catch (err) {
      console.warn(failureMessage, err);
    }
  };

  // Mark scheme on original column (no position names — they differ per row)
  seqCol.setTag(bioTAGS.numberingScheme, schemeName);

  // --- Original column: column-level annotation definitions (needed for renderer
  // to resolve annotation IDs → colors/names) + row-level region spans ---
  if (populateRegions)
    setRegionDefinitions(seqCol, 'Failed to set annotation definitions on original column:');

  if (populateRegions && numberingMapCol) {
    try {
      const annotCol = getOrCreateAnnotationColumn(df, seqCol);
      let failedRows = 0;
      let firstFailure: unknown = null;

      for (let i = 0; i < result.rowCount; i++) {
        const mapStr = numberingMapCol.get(i);
        const rowAnnotJson = annotJsonCol.get(i);
        if (!mapStr || !rowAnnotJson) continue;

        // Each row is isolated so that one malformed cell cannot prevent the region
        // data of all following rows from being stored.
        try {
          const posToCharIdx: Record<string, number> = JSON.parse(mapStr);
          const rowRegions: SeqAnnotation[] = JSON.parse(rowAnnotJson);

          const regionHits: SeqAnnotationHit[] = [];
          for (const region of rowRegions) {
            if (region.start == null || region.end == null) continue;
            const startCharIdx = posToCharIdx[region.start];
            const endCharIdx = posToCharIdx[region.end];
            // A region whose boundary position is absent from this row is left out.
            if (startCharIdx == null || endCharIdx == null) continue;
            regionHits.push({
              annotationId: region.id,
              positionIndex: startCharIdx,
              endPositionIndex: endCharIdx,
              positionName: region.start,
              matchedMonomers: '',
            });
          }

          const existingHits = getRowAnnotations(annotCol, i) ?? [];
          setRowAnnotations(annotCol, i, mergeRowHits(existingHits, regionHits, true, false));
        } catch (rowErr) {
          if (failedRows === 0) firstFailure = rowErr;
          failedRows++;
        }
      }

      if (failedRows > 0)
        console.warn(`Failed to store per-row region data on original column for ${failedRows} row(s); first error:`, firstFailure);
    } catch (err) {
      console.warn('Failed to store per-row region data on original column:', err);
    }
  }

  // --- Aligned column: column-level annotations only ---
  const alignment = createAlignedColumn(df, seqCol, result);
  if (alignment) {
    df.columns.insert(alignment.alignedCol, df.columns.toList().indexOf(seqCol) + 1);
    if (grok.shell.tv?.dataFrame === df)
      grok.shell.tv.grid.scrollToCell(seqCol, 0);

    alignment.alignedCol.setTag(bioTAGS.numberingScheme, schemeName);

    if (populateRegions)
      setRegionDefinitions(alignment.alignedCol, 'Failed to set column-level annotations on aligned column:');
  }

  df.fireValuesChanged();
  grok.shell.info(`Numbering applied: ${schemeName}, chain type: ${chainType}`);
}