@datagrok/bio 2.25.17 → 2.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/dist/282.js +2 -0
  2. package/dist/282.js.map +1 -0
  3. package/dist/287.js +2 -0
  4. package/dist/287.js.map +1 -0
  5. package/dist/288.js +2 -0
  6. package/dist/288.js.map +1 -0
  7. package/dist/422.js +2 -0
  8. package/dist/422.js.map +1 -0
  9. package/dist/455.js +1 -1
  10. package/dist/455.js.map +1 -1
  11. package/dist/767.js +2 -0
  12. package/dist/767.js.map +1 -0
  13. package/dist/package-test.js +5 -5
  14. package/dist/package-test.js.map +1 -1
  15. package/dist/package.js +3 -3
  16. package/dist/package.js.map +1 -1
  17. package/files/samples/antibodies.csv +494 -0
  18. package/package.json +2 -2
  19. package/src/package-api.ts +21 -0
  20. package/src/package.g.ts +22 -1
  21. package/src/package.ts +30 -1
  22. package/src/utils/annotations/annotation-actions.ts +130 -0
  23. package/src/utils/annotations/annotation-manager-ui.ts +118 -0
  24. package/src/utils/annotations/annotation-manager.ts +163 -0
  25. package/src/utils/annotations/liability-scanner-ui.ts +88 -0
  26. package/src/utils/annotations/liability-scanner.ts +147 -0
  27. package/src/utils/annotations/numbering-ui.ts +450 -0
  28. package/src/utils/antibody-numbering (WIP)/alignment.ts +578 -0
  29. package/src/utils/antibody-numbering (WIP)/annotator.ts +120 -0
  30. package/src/utils/antibody-numbering (WIP)/data/blosum62.ts +55 -0
  31. package/src/utils/antibody-numbering (WIP)/data/consensus-aho.ts +155 -0
  32. package/src/utils/antibody-numbering (WIP)/data/consensus-imgt.ts +162 -0
  33. package/src/utils/antibody-numbering (WIP)/data/consensus-kabat.ts +157 -0
  34. package/src/utils/antibody-numbering (WIP)/data/consensus-martin.ts +152 -0
  35. package/src/utils/antibody-numbering (WIP)/data/consensus.ts +36 -0
  36. package/src/utils/antibody-numbering (WIP)/data/regions.ts +63 -0
  37. package/src/utils/antibody-numbering (WIP)/index.ts +31 -0
  38. package/src/utils/antibody-numbering (WIP)/testdata.ts +5356 -0
  39. package/src/utils/antibody-numbering (WIP)/types.ts +69 -0
  40. package/src/utils/context-menu.ts +42 -2
  41. package/src/utils/get-region-func-editor.ts +18 -2
  42. package/src/utils/get-region.ts +65 -1
  43. package/src/widgets/representations.ts +53 -2
  44. package/src/widgets/sequence-scrolling-widget.ts +28 -18
  45. package/test-console-output-1.log +546 -560
  46. package/test-record-1.mp4 +0 -0
@@ -0,0 +1,578 @@
1
+ /**
2
+ * Profile-based global alignment for antibody numbering.
3
+ *
4
+ * Aligns an input antibody sequence against a consensus profile
5
+ * using a modified Needleman-Wunsch algorithm. The profile defines
6
+ * which positions exist in the numbering scheme and what amino acids
7
+ * are expected at each position.
8
+ *
9
+ * Gap positions in the profile (empty amino acid lists) represent
10
+ * CDR/variable regions where the input sequence can have insertions.
11
+ */
12
+
13
+ import type { ChainType, AlignmentResult, Scheme } from './types';
14
+ import type { ProfileData } from './data/consensus';
15
+ import { getConsensusProfile, getCtermProfile } from './data/consensus';
16
+ import { blosum62Score } from './data/blosum62';
17
+
18
+ // Scoring parameters for the profile alignment (additive score units; negative values are penalties)
19
+ const MATCH_CONSENSUS = 6; // score when the residue is one of the position's consensus amino acids
20
+ const MATCH_BLOSUM_SCALE = 1; // multiplier applied to the BLOSUM62 score when the residue is not in the consensus set
21
+ const GAP_OPEN = -10; // score added when a gap is opened
22
+ const GAP_EXTEND = -2; // score added for each extension of an already open gap
23
+ const GAP_POSITION_BONUS = 4; // added in full to GAP_OPEN and in half to GAP_EXTEND for insertions at gap positions of the profile (CDRs)
24
+
25
/**
 * Scores a single sequence residue against a single profile column.
 *
 * - A column with an empty consensus list is a gap/insertion column and
 *   always yields a flat score of 1.
 * - A residue contained in the consensus list yields MATCH_CONSENSUS.
 * - Otherwise the result is the highest BLOSUM62 score (scaled by
 *   MATCH_BLOSUM_SCALE) between the residue and any consensus residue,
 *   never lower than -10.
 *
 * @param aa - Residue from the input sequence (single-letter code).
 * @param consensusAAs - Amino acids expected at the profile column.
 * @returns The substitution score for this residue/column pair.
 */
function scorePosition(aa: string, consensusAAs: string[]): number {
  if (consensusAAs.length === 0) return 1;
  if (consensusAAs.indexOf(aa) !== -1) return MATCH_CONSENSUS;

  // Explicit comparison (not Math.max) so a non-comparable score never
  // replaces the running maximum.
  return consensusAAs.reduce((top, candidate) => {
    const scaled = blosum62Score(aa, candidate) * MATCH_BLOSUM_SCALE;
    return scaled > top ? scaled : top;
  }, -10);
}
46
+
47
+ /** Traceback marker stored per DP cell; it records which state (M, X or Y) the cell's best score came from. */
48
+ const enum Dir {
49
+ DIAG = 0, // predecessor is the match state M
50
+ UP = 1, // predecessor is state X: gap in profile (insertion in sequence)
51
+ LEFT = 2, // predecessor is state Y: gap in sequence (deletion from profile)
52
+ }
53
+
54
+ interface AlignmentCell { // NOTE(review): not exported and not referenced elsewhere in this file — confirm whether it can be removed
55
+ score: number;
56
+ dir: Dir;
57
+ }
58
+
59
/**
 * Builds the insertion-code suffix for the `count`-th residue inserted after
 * a base position: 1..26 map to 'A'..'Z'; further insertions repeat the
 * letter ('AA', 'BB', ...) so the suffix always consists of capital letters.
 */
function insertionSuffix(count: number): string {
  const letter = String.fromCharCode(65 + ((count - 1) % 26));
  return letter.repeat(Math.floor((count - 1) / 26) + 1);
}

/**
 * Perform profile-based alignment of a sequence against a consensus profile
 * using three-state (affine gap) dynamic programming.
 *
 * States:
 * - M[i][j]: seq[0..i-1] vs profile[0..j-1], ending in a residue/column match
 * - X[i][j]: ... ending in a gap in the profile (insertion in the sequence)
 * - Y[i][j]: ... ending in a gap in the sequence (deletion from the profile)
 *
 * Leading gaps on either side are charged at 0.3 of the regular gap scores
 * (free for leading gap columns of the profile). The end point is either any
 * cell in the last profile column, or a cell in the last sequence row minus
 * 3 points per skipped non-gap profile column.
 *
 * @param seq - Input sequence (single-letter amino acid codes).
 * @param profile - Ordered list of [position number, consensus amino acids];
 *   an empty consensus list marks a gap (CDR/variable) column.
 * @returns `positionCodes` holds one entry per residue of `seq`: the position
 *   number of the matched column, the preceding position number plus an
 *   insertion suffix for unmatched residues between the first and last
 *   matched residue, or '-' for residues outside that span.
 *   `matchedPositions` counts residues matched to non-gap columns and
 *   `totalProfilePositions` counts the non-gap columns of the profile.
 */
function profileAlign(
  seq: string,
  profile: ProfileData,
): { positionCodes: string[]; matchedPositions: number; totalProfilePositions: number } {
  const n = seq.length; // sequence length
  const m = profile.length; // profile length
  const INF = -1e9; // "impossible" score

  // (n+1) x (m+1) matrices backed by typed arrays.
  const scoreMatrix = (): number[][] =>
    Array.from({ length: n + 1 }, () => new Float64Array(m + 1) as unknown as number[]);
  const traceMatrix = (): Dir[][] =>
    Array.from({ length: n + 1 }, () => new Uint8Array(m + 1) as unknown as Dir[]);

  const M = scoreMatrix();
  const X = scoreMatrix();
  const Y = scoreMatrix();
  const trM = traceMatrix();
  const trX = traceMatrix();
  const trY = traceMatrix();

  // Initialize
  M[0][0] = 0;
  X[0][0] = INF;
  Y[0][0] = INF;

  // First row: profile columns skipped before the sequence starts.
  // Gap columns are free, non-gap columns cost 0.3 of the regular gap score.
  for (let j = 1; j <= m; j++) {
    const isGap = profile[j - 1][1].length === 0;
    M[0][j] = INF;
    X[0][j] = INF;
    if (j === 1) Y[0][j] = isGap ? 0 : GAP_OPEN * 0.3;
    else Y[0][j] = isGap ? Y[0][j - 1] : Y[0][j - 1] + GAP_EXTEND * 0.3;
    trY[0][j] = Dir.LEFT;
  }

  // First column: sequence residues that precede the profile.
  for (let i = 1; i <= n; i++) {
    M[i][0] = INF;
    Y[i][0] = INF;
    X[i][0] = i === 1 ? GAP_OPEN * 0.3 : X[i - 1][0] + GAP_EXTEND * 0.3;
    trX[i][0] = Dir.UP;
  }

  // Fill DP matrices
  for (let i = 1; i <= n; i++) {
    const aa = seq[i - 1];
    for (let j = 1; j <= m; j++) {
      const [, consensusAAs] = profile[j - 1];
      const isGapPos = consensusAAs.length === 0;
      const matchScore = scorePosition(aa, consensusAAs);

      // M: reached diagonally from any state; ties prefer M, then X, then Y.
      const mFromM = M[i - 1][j - 1] + matchScore;
      const mFromX = X[i - 1][j - 1] + matchScore;
      const mFromY = Y[i - 1][j - 1] + matchScore;
      if (mFromM >= mFromX && mFromM >= mFromY) {
        M[i][j] = mFromM; trM[i][j] = Dir.DIAG;
      } else if (mFromX >= mFromY) {
        M[i][j] = mFromX; trM[i][j] = Dir.UP;
      } else {
        M[i][j] = mFromY; trM[i][j] = Dir.LEFT;
      }

      // X: insertion in the sequence, cheaper at gap columns of the profile.
      const gapOpenAdj = isGapPos ? GAP_OPEN + GAP_POSITION_BONUS : GAP_OPEN;
      const gapExtAdj = isGapPos ? GAP_EXTEND + GAP_POSITION_BONUS * 0.5 : GAP_EXTEND;
      const xFromM = M[i - 1][j] + gapOpenAdj;
      const xFromX = X[i - 1][j] + gapExtAdj;
      if (xFromM >= xFromX) {
        X[i][j] = xFromM; trX[i][j] = Dir.DIAG;
      } else {
        X[i][j] = xFromX; trX[i][j] = Dir.UP;
      }

      // Y: deletion from the profile, free at gap columns.
      const delOpen = isGapPos ? 0 : GAP_OPEN;
      const delExt = isGapPos ? 0 : GAP_EXTEND;
      const yFromM = M[i][j - 1] + delOpen;
      const yFromY = Y[i][j - 1] + delExt;
      if (yFromM >= yFromY) {
        Y[i][j] = yFromM; trY[i][j] = Dir.DIAG;
      } else {
        Y[i][j] = yFromY; trY[i][j] = Dir.LEFT;
      }
    }
  }

  // Choose the end point of the alignment.
  const states = [['M', M], ['X', X], ['Y', Y]] as const;
  let bestScore = INF;
  let bestI = n;
  let bestJ = m;
  let bestState: 'M' | 'X' | 'Y' = 'M';

  // Primary: whole profile consumed, any number of trailing residues left over.
  for (let i = 0; i <= n; i++) {
    for (const [state, mat] of states) {
      if (mat[i][m] > bestScore) {
        bestScore = mat[i][m]; bestState = state; bestI = i; bestJ = m;
      }
    }
  }

  // Secondary: whole sequence consumed, profile cut short (truncated input).
  // nonGapFrom[j] = number of non-gap columns in profile[j..m-1], computed
  // once instead of being recounted for every j.
  const nonGapFrom = new Array<number>(m + 1).fill(0);
  for (let j = m - 1; j >= 0; j--) {
    nonGapFrom[j] = nonGapFrom[j + 1] + (profile[j][1].length > 0 ? 1 : 0);
  }
  for (let j = 0; j < m; j++) {
    const penalty = nonGapFrom[j] * 3; // mild penalty per skipped position
    for (const [state, mat] of states) {
      if (mat[n][j] - penalty > bestScore) {
        bestScore = mat[n][j] - penalty; bestState = state; bestI = n; bestJ = j;
      }
    }
  }

  // Traceback. Entries are [sequence index, profile index or -1]; profile
  // columns skipped in state Y produce no entry.
  const alignment: Array<[seqIdx: number, profIdx: number]> = [];
  let ci = bestI;
  let cj = bestJ;
  let curState: 'M' | 'X' | 'Y' = bestState;

  // Trailing residues beyond the chosen end point stay unaligned.
  for (let i = n; i > bestI; i--) {
    alignment.push([i - 1, -1]);
  }

  while (ci > 0 || cj > 0) {
    if (curState === 'M') {
      if (ci === 0 || cj === 0) {
        // On a border the match state cannot continue; fall back to a gap state.
        if (ci > 0) { alignment.push([ci - 1, -1]); ci--; curState = 'X'; }
        else { cj--; curState = 'Y'; }
        continue;
      }
      alignment.push([ci - 1, cj - 1]);
      const tr = trM[ci][cj];
      ci--; cj--;
      if (tr === Dir.DIAG) curState = 'M';
      else if (tr === Dir.UP) curState = 'X';
      else curState = 'Y';
    } else if (curState === 'X') {
      if (ci === 0) break;
      alignment.push([ci - 1, -1]); // residue not aligned to any column
      const tr = trX[ci][cj];
      ci--;
      curState = tr === Dir.DIAG ? 'M' : 'X';
    } else {
      if (cj === 0) break;
      const tr = trY[ci][cj];
      cj--; // skip profile column (deletion)
      curState = tr === Dir.DIAG ? 'M' : 'Y';
    }
  }

  // Leading unaligned residues
  while (ci > 0) {
    alignment.push([ci - 1, -1]);
    ci--;
  }

  alignment.reverse();

  // First pass: direct matches, plus the span they cover in the sequence.
  const positionCodes: string[] = new Array(n).fill('-');
  let matchedPositions = 0;
  const totalProfilePositions = profile.filter(p => p[1].length > 0).length;
  let firstMatchedSeqIdx = -1;
  let lastMatchedSeqIdx = -1;
  for (const [seqIdx, profIdx] of alignment) {
    if (profIdx < 0 || seqIdx < 0) continue;
    const [posNum, consensusAAs] = profile[profIdx];
    positionCodes[seqIdx] = String(posNum);
    if (consensusAAs.length > 0) matchedPositions++;
    if (firstMatchedSeqIdx === -1) firstMatchedSeqIdx = seqIdx;
    lastMatchedSeqIdx = seqIdx;
  }

  // Second pass: unmatched residues inside the matched span become insertions
  // after the closest preceding matched position. Residues outside the span
  // (leader peptide, constant region, etc.) remain '-'.
  const insertionCounters = new Map<string, number>();
  let basePos = ''; // code of the most recent directly matched residue
  for (let si = 0; si < n; si++) {
    if (positionCodes[si] !== '-') {
      basePos = positionCodes[si];
      continue;
    }
    if (si < firstMatchedSeqIdx || si > lastMatchedSeqIdx) continue;
    if (!basePos) continue;
    const count = (insertionCounters.get(basePos) ?? 0) + 1;
    insertionCounters.set(basePos, count);
    positionCodes[si] = basePos + insertionSuffix(count);
  }

  return { positionCodes, matchedPositions, totalProfilePositions };
}
306
+
307
/**
 * Computes the fraction of numbered residues that agree with the consensus.
 *
 * Only residues whose position code parses (via `parseInt`) to a profile
 * position with a non-empty consensus are counted. Because `parseInt` stops
 * at the first non-digit, an insertion code such as '35A' is scored against
 * the consensus of its base position 35.
 *
 * @param seq - The aligned sequence.
 * @param positionCodes - Position code per residue of `seq` ('-' = unnumbered).
 * @param profile - Ordered list of [position number, consensus amino acids].
 * @returns matches / counted residues, or 0 when nothing was counted.
 */
function computeIdentity(
  seq: string,
  positionCodes: string[],
  profile: ProfileData,
): number {
  // Non-gap profile columns keyed by position number.
  const consensusAt = new Map<number, string[]>();
  for (const [position, residues] of profile) {
    if (residues.length > 0) consensusAt.set(position, residues);
  }

  let counted = 0;
  let identical = 0;
  seq.split('').forEach((residue, idx) => {
    const code = positionCodes[idx];
    if (code === '-') return;
    const position = parseInt(code, 10);
    if (isNaN(position)) return;
    const expected = consensusAt.get(position);
    if (!expected || expected.length === 0) return;
    counted += 1;
    if (expected.includes(residue)) identical += 1;
  });

  if (counted === 0) return 0;
  return identical / counted;
}
338
+
339
/**
 * Validate the conserved cysteines of a numbered antibody sequence.
 *
 * A profile column counts as a conserved cysteine when its consensus list is
 * exactly ['C']. For each such column, the residue numbered with that exact
 * position must be 'C'; columns with no numbered residue are not reported.
 *
 * Only plain numeric position codes are considered. Insertion codes such as
 * '23A' are ignored, so an inserted residue can never overwrite the residue
 * recorded for its base position and trigger a false error.
 *
 * @param seq - The aligned sequence.
 * @param positionCodes - Position code per residue of `seq` ('-' = unnumbered).
 * @param profile - Ordered list of [position number, consensus amino acids].
 * @returns A message describing the first violation in profile order, or ''
 *   when every checked position holds a cysteine.
 */
function validateConserved(
  seq: string,
  positionCodes: string[],
  profile: ProfileData,
): string {
  const plainPosition = /^\d+$/;

  // Residue found at each directly numbered (non-insertion) position.
  const posToAA = new Map<number, string>();
  for (let i = 0; i < seq.length; i++) {
    const code = positionCodes[i];
    if (!plainPosition.test(code)) continue; // skips '-', insertions, missing codes
    posToAA.set(Number(code), seq[i]);
  }

  for (const [pos, aas] of profile) {
    if (aas.length !== 1 || aas[0] !== 'C') continue;
    const aa = posToAA.get(pos);
    if (aa && aa !== 'C') {
      return `Expected conserved Cys at position ${pos}, found ${aa}`;
    }
  }

  return '';
}
378
+
379
/**
 * Locates the C-terminal motif of the variable region within a sequence.
 *
 * Every window of motif length starting at or after 30% of the sequence
 * length is scored: +3 when the residue is among the consensus residues of
 * the corresponding motif column, -1 otherwise. The earliest window with the
 * highest score wins, provided its score reaches 1.5 per motif column.
 *
 * @param seq - Sequence to scan.
 * @param chain - Chain type whose C-terminal profile is used.
 * @returns Start index of the best window, or -1 when the sequence is shorter
 *   than the motif or no window scores high enough.
 */
function findCTerminal(seq: string, chain: ChainType): number {
  const motif = getCtermProfile(chain);
  const width = motif.length;
  if (width > seq.length) return -1;

  // The variable-region end can be far from the sequence end when a constant
  // region follows, so the scan covers everything past the first 30%.
  const firstStart = Math.max(0, Math.floor(seq.length * 0.3));
  const lastStart = seq.length - width;

  let topScore = -Infinity;
  let topStart = -1;
  for (let start = firstStart; start <= lastStart; start++) {
    let windowScore = 0;
    for (let col = 0; col < width; col++) {
      windowScore += motif[col][1].includes(seq[start + col]) ? 3 : -1;
    }
    if (windowScore > topScore) {
      topScore = windowScore;
      topStart = start;
    }
  }

  // Require a reasonable match
  if (topScore >= width * 1.5) return topStart;
  return -1;
}
416
+
417
/**
 * KABAT CDR-H1 insertion placement: when more than five residues are numbered
 * with base positions 31-35, the first five become 31..35 and the remainder
 * become 35A, 35B, ... (instead of insertions left-aligned after 31).
 * Mutates `numbering` in place.
 */
function redistributeKabatH1Insertions(numbering: string[]): void {
  const cdr1Indices: number[] = [];
  for (let i = 0; i < numbering.length; i++) {
    if (numbering[i] === '-') continue;
    const baseNum = parseInt(numbering[i], 10);
    if (!isNaN(baseNum) && baseNum >= 31 && baseNum <= 35) cdr1Indices.push(i);
  }
  if (cdr1Indices.length <= 5) return;
  cdr1Indices.forEach((seqIdx, j) => {
    numbering[seqIdx] = j < 5 ? String(31 + j) : '35' + String.fromCharCode(64 + j - 4);
  });
}

/**
 * Align a sequence to the consensus profile of every requested chain type and
 * return the numbering of the chain with the highest percent identity.
 *
 * Per chain, the sequence is first trimmed: at the C-terminus to 5 residues
 * past the detected C-terminal motif, and at the N-terminus either past the
 * light domain of a VL-linker-VH scFv (heavy chain only) or, for sequences
 * longer than 180 residues, to the better of start 0 and the first residue in
 * the first 30 that matches one of the first five profile columns.
 *
 * For scFv inputs (heavy and light C-terminal motifs more than 80 residues
 * apart) the heavy result is preferred when its identity exceeds 0.9 and is
 * within 0.05 of the best identity.
 *
 * @param seq - Cleaned amino acid sequence (single-letter codes).
 * @param scheme - Numbering scheme of the profiles.
 * @param chains - Chain types to try; defaults to ['H', 'K', 'L'].
 * @returns The best alignment. `numbering` has one code per residue of `seq`.
 *   For sequences shorter than 10 residues or an empty `chains` list, an
 *   error result with an empty `numbering` is returned.
 */
export function alignSequence(
  seq: string,
  scheme: Scheme,
  chains: ChainType[] = ['H', 'K', 'L'],
): AlignmentResult {
  const failure = (error: string): AlignmentResult =>
    ({ numbering: [], percentIdentity: 0, chainType: 'H', error });

  if (!seq || seq.length < 10) return failure('Sequence too short');
  // Without this guard no chain is aligned and the function would return null.
  if (chains.length === 0) return failure('No chain types to align against');

  // Pre-compute C-terminal motif positions for all requested chain types.
  const ctermPositions = new Map<ChainType, number>();
  for (const chain of chains) {
    const idx = findCTerminal(seq, chain);
    if (idx >= 0) ctermPositions.set(chain, idx);
  }

  // scFv detection: heavy and light C-terminal motifs both found, far apart.
  const ctermH = ctermPositions.get('H') ?? -1;
  const ctermK = ctermPositions.get('K') ?? -1;
  const ctermL = ctermPositions.get('L') ?? -1;
  const ctermLight = Math.max(ctermK, ctermL);
  const lightChain: ChainType = ctermK >= ctermL ? 'K' : 'L';
  const isScfv = ctermH >= 0 && ctermLight >= 0 && Math.abs(ctermH - ctermLight) > 80;

  const scfvDomainStart = new Map<ChainType, number>();
  if (isScfv && ctermLight < ctermH) {
    // VL-linker-VH: start the heavy alignment after the light domain.
    // For VH-VL nothing is trimmed: heavy already starts at 0, and leaving
    // light untrimmed keeps heavy as the best match.
    scfvDomainStart.set('H', ctermLight + getCtermProfile(lightChain).length);
  }

  let bestResult: AlignmentResult | null = null;
  let bestIdentity = -1;
  let heavyResult: AlignmentResult | null = null;
  let heavyIdentity = -1;

  for (const chain of chains) {
    const profile = getConsensusProfile(scheme, chain);

    let trimmedSeq = seq;
    let seqOffset = 0;
    const ctermIdx = ctermPositions.get(chain) ?? -1;
    const ctermLen = getCtermProfile(chain).length;

    // C-terminal trimming
    if (ctermIdx >= 0) {
      trimmedSeq = seq.substring(0, Math.min(seq.length, ctermIdx + ctermLen + 5));
    }

    // Alignment already computed for the final trimmedSeq, if any.
    let cachedResult: ReturnType<typeof profileAlign> | null = null;
    let cachedIdentity = -1;

    // N-terminal trimming
    const domainStart = scfvDomainStart.get(chain);
    if (domainStart !== undefined && domainStart > 0 && domainStart < trimmedSeq.length - 80) {
      // scFv: skip past the other domain
      seqOffset = domainStart;
      trimmedSeq = trimmedSeq.substring(domainStart);
    } else if (trimmedSeq.length > 180) {
      // Long sequence (with constant region): compare start 0 with the first
      // plausible alternative start.
      const starts = [0];
      const firstAAs = profile.slice(0, 5).map(p => p[1]).flat();
      for (let i = 1; i < Math.min(30, trimmedSeq.length - 80); i++) {
        if (firstAAs.includes(trimmedSeq[i])) {
          starts.push(i);
          break;
        }
      }

      let bestOffset = 0;
      for (const start of starts) {
        const subSeq = trimmedSeq.substring(start);
        const candidate = profileAlign(subSeq, profile);
        const candidateIdentity = computeIdentity(subSeq, candidate.positionCodes, profile);
        if (candidateIdentity > cachedIdentity) {
          cachedIdentity = candidateIdentity;
          cachedResult = candidate;
          bestOffset = start;
        }
      }

      if (bestOffset > 0) {
        seqOffset = bestOffset;
        trimmedSeq = trimmedSeq.substring(seqOffset);
      }
    }

    // Reuse the alignment of the winning start instead of recomputing it.
    const result = cachedResult !== null ? cachedResult : profileAlign(trimmedSeq, profile);
    const identity = cachedResult !== null ?
      cachedIdentity : computeIdentity(trimmedSeq, result.positionCodes, profile);

    // Map the codes of the trimmed sequence back onto the full sequence.
    const fullNumbering: string[] = new Array(seq.length).fill('-');
    for (let i = 0; i < trimmedSeq.length; i++) {
      fullNumbering[seqOffset + i] = result.positionCodes[i];
    }

    if (scheme === 'kabat' && chain === 'H') {
      redistributeKabatH1Insertions(fullNumbering);
    }

    const validationError = validateConserved(trimmedSeq, result.positionCodes, profile);

    const chainResult: AlignmentResult = {
      numbering: fullNumbering,
      percentIdentity: identity,
      chainType: chain,
      error: identity < 0.3 ? 'Low sequence identity; may not be an antibody variable region' :
        validationError || '',
    };

    // Save Heavy result for scFv preference
    if (chain === 'H') {
      heavyResult = chainResult;
      heavyIdentity = identity;
    }

    if (identity > bestIdentity) {
      bestIdentity = identity;
      bestResult = chainResult;
    }
  }

  // In scFv constructs both domains match their profiles well; prefer Heavy
  // when its identity is within 0.05 of the best.
  if (isScfv && bestResult && bestResult.chainType !== 'H' &&
    heavyResult && heavyIdentity > 0.9 && bestIdentity - heavyIdentity < 0.05) {
    bestResult = heavyResult;
  }

  return bestResult ?? failure('No chain types to align against');
}
@@ -0,0 +1,120 @@
1
+ /**
2
+ * Main annotator: numbers an antibody sequence and returns structured results.
3
+ */
4
+
5
+ import type {
6
+ Scheme, ChainType, ChainGroup,
7
+ NumberingResult, NumberingEntry, RegionAnnotation,
8
+ } from './types';
9
+ import { alignSequence } from './alignment';
10
+ import { getRegions } from './data/regions';
11
+
12
/** The 20 standard amino acids as single-letter codes. */
const VALID_AAS = new Set('ACDEFGHIKLMNPQRSTVWY');

/**
 * Reduces raw input to an upper-case string of standard amino acid letters.
 *
 * Surrounding whitespace, gap characters ('-', '.') and inner whitespace are
 * removed, the rest is upper-cased, and every character that is not one of
 * the 20 standard amino acids is dropped.
 *
 * @param raw - Sequence text in any supported input format.
 * @returns The cleaned sequence; '' for empty input.
 */
export function extractSequence(raw: string): string {
  if (!raw) return '';
  const compact = raw.trim().replace(/[-.\s]/g, '');
  let cleaned = '';
  for (const ch of compact.toUpperCase()) {
    if (VALID_AAS.has(ch)) cleaned += ch;
  }
  return cleaned;
}
22
+
23
/**
 * Numbers one antibody sequence and packages the outcome.
 *
 * The input is cleaned with `extractSequence`, aligned with `alignSequence`,
 * and the resulting per-residue codes are turned into position names, a
 * position-to-index map and region annotations for the detected chain group.
 * An empty result carrying only the error message is returned when the
 * cleaned sequence is empty or shorter than 10 residues, or when the
 * alignment reports an error together with an identity below 0.3.
 *
 * @param sequence - Raw amino acid sequence (single-letter codes)
 * @param scheme - Numbering scheme; defaults to 'imgt'
 * @param chains - Chain types to try; defaults to ['H', 'K', 'L']
 * @returns Structured numbering result; indices in `numberingMap` and
 *   `numbering` refer to the cleaned sequence, not to the raw input
 */
export function numberSequence(
  sequence: string,
  scheme: Scheme = 'imgt',
  chains: ChainType[] = ['H', 'K', 'L'],
): NumberingResult {
  const seq = extractSequence(sequence);
  if (!seq) return emptyResult('Empty or invalid sequence');
  if (seq.length < 10) return emptyResult('Sequence too short (minimum 10 residues)');

  const alignment = alignSequence(seq, scheme, chains);
  if (alignment.error && alignment.percentIdentity < 0.3) {
    return emptyResult(alignment.error);
  }

  const chainGroup: ChainGroup = alignment.chainType === 'H' ? 'Heavy' : 'Light';

  // Collect every numbered residue; '-' marks residues outside the numbering.
  const numberingDetail: NumberingEntry[] = [];
  const numberingMap: Record<string, number> = {};
  for (let idx = 0; idx < seq.length; idx++) {
    const code = alignment.numbering[idx];
    if (code === '-') continue;
    numberingDetail.push({ position: code, aa: seq[idx] });
    numberingMap[code] = idx;
  }
  const positionNames = numberingDetail.map(entry => entry.position).join(', ');

  // One annotation per region definition of the scheme and chain group.
  const schemeLabel = scheme.toUpperCase();
  const annotations: RegionAnnotation[] = [];
  for (const [name, start, end] of getRegions(scheme, chainGroup)) {
    annotations.push({
      id: `${scheme}-${chainGroup}-${name}`.toLowerCase(),
      name,
      description: `${name} (${schemeLabel} ${start}-${end})`,
      start: String(start),
      end: String(end),
      visualType: 'region' as const,
      category: 'structure' as const,
      sourceScheme: schemeLabel,
      autoGenerated: true,
    });
  }

  return {
    positionNames,
    chainType: chainGroup,
    chainTypeCode: alignment.chainType,
    annotations,
    numberingDetail,
    numberingMap,
    percentIdentity: alignment.percentIdentity,
    error: alignment.error,
    numbering: alignment.numbering,
  };
}
96
+
97
/**
 * Numbers a batch of sequences with the same scheme and chain types.
 *
 * @param sequences - Raw sequences, each handled independently
 * @param scheme - Numbering scheme; defaults to 'imgt'
 * @param chains - Chain types to try; defaults to ['H', 'K', 'L']
 * @returns One result per input sequence, in input order
 */
export function numberSequences(
  sequences: string[],
  scheme: Scheme = 'imgt',
  chains: ChainType[] = ['H', 'K', 'L'],
): NumberingResult[] {
  const results: NumberingResult[] = [];
  for (const raw of sequences) {
    results.push(numberSequence(raw, scheme, chains));
  }
  return results;
}
107
+
108
/**
 * Creates the result returned when a sequence could not be numbered: every
 * collection is empty, identity is 0, the chain group is 'Heavy' with an
 * empty chain code, and `error` carries the given message.
 */
function emptyResult(error: string): NumberingResult {
  // A fresh object (and fresh collections) on every call, so callers may mutate it.
  const failed: NumberingResult = {
    positionNames: '',
    chainType: 'Heavy',
    chainTypeCode: '',
    annotations: [],
    numberingDetail: [],
    numberingMap: {},
    percentIdentity: 0,
    error: error,
    numbering: [],
  };
  return failed;
}