@lloyal-labs/lloyal.node 1.0.3-alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/index.d.ts ADDED
@@ -0,0 +1,1388 @@
1
+ /**
2
+ * liblloyal-node TypeScript Definitions
3
+ *
4
+ * N-API bindings for liblloyal - Node.js native addon for llama.cpp inference
5
+ */
6
+
7
+ /**
8
+ * Pooling type for embedding extraction
9
+ */
10
+ export enum PoolingType {
11
+ /** No pooling - raw per-token embeddings */
12
+ NONE = 0,
13
+ /** Mean pooling - average of all token embeddings */
14
+ MEAN = 1,
15
+ /** CLS pooling - use first token embedding */
16
+ CLS = 2,
17
+ /** Last token pooling - use last token embedding */
18
+ LAST = 3,
19
+ }
20
+
21
+ /**
22
+ * Options for creating an inference context
23
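+ *
+ * A minimal sketch of two typical option sets (model paths are placeholders,
+ * values are illustrative, not recommendations):
+ * ```typescript
+ * // Text generation context with two independent sequences
+ * const genOptions: ContextOptions = {
+ *   modelPath: './model.gguf',
+ *   nCtx: 4096,
+ *   nThreads: 8,
+ *   nSeqMax: 2
+ * };
+ *
+ * // Embedding extraction context
+ * const embedOptions: ContextOptions = {
+ *   modelPath: './embed-model.gguf',
+ *   embeddings: true,
+ *   poolingType: PoolingType.MEAN
+ * };
+ * ```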
+ */
24
+ export interface ContextOptions {
25
+ /** Path to .gguf model file */
26
+ modelPath: string;
27
+
28
+ /** Context size (default: 2048) */
29
+ nCtx?: number;
30
+
31
+ /** Number of threads (default: 4) */
32
+ nThreads?: number;
33
+
34
+ /**
35
+ * Enable embedding extraction mode
36
+ *
37
+ * When true, context is optimized for embedding extraction.
38
+ * Use with encode() and getEmbeddings() methods.
39
+ * Default: false (text generation mode)
40
+ */
41
+ embeddings?: boolean;
42
+
43
+ /**
44
+ * Pooling type for embedding extraction
45
+ *
46
+ * Only relevant when embeddings=true.
47
+ * Default: MEAN for embedding contexts, NONE otherwise
48
+ */
49
+ poolingType?: PoolingType;
50
+
51
+ /**
52
+ * Maximum number of sequences for multi-sequence support
53
+ *
54
+ * Set > 1 to enable multiple independent KV cache sequences.
55
+ * Useful for parallel decoding or conversation branching.
56
+ * Default: 1 (single sequence)
57
+ */
58
+ nSeqMax?: number;
59
+ }
60
+
61
+ /**
62
+ * Result from chat template formatting
63
+ */
64
+ export interface FormattedChatResult {
65
+ prompt: string;
66
+ stopTokens: string[];
67
+ }
68
+
69
+ /**
70
+ * Penalty parameters for repetition control
71
+ */
72
+ export interface PenaltyParams {
73
+ /** Repetition penalty (1.0 = disabled, >1.0 = penalize repeats) */
74
+ repeat?: number;
75
+
76
+ /** Frequency penalty (0.0 = disabled) */
77
+ frequency?: number;
78
+
79
+ /** Presence penalty (0.0 = disabled) */
80
+ presence?: number;
81
+
82
+ /** Tokens to consider for penalties (-1 = context size) */
83
+ lastN?: number;
84
+ }
85
+
86
+ /**
87
+ * Mirostat sampling configuration
88
+ *
89
+ * Mirostat dynamically adjusts sampling to maintain target perplexity,
90
+ * preventing both repetition and incoherence. Useful for long-form generation
91
+ * where temperature alone produces inconsistent quality.
92
+ *
93
+ * Use Mirostat v2 (mode: 2) for most cases - it's more stable than v1.
94
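+ *
+ * Illustrative usage (tau and eta shown at their documented defaults):
+ * ```typescript
+ * const token = ctx.sample({
+ *   temperature: 0.8,
+ *   advanced: {
+ *     mirostat: { mode: 2, tau: 5.0, eta: 0.1 }
+ *   }
+ * });
+ * ```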
+ */
95
+ export interface MirostatParams {
96
+ /** Mirostat mode (0 = disabled, 1 = v1, 2 = v2). Recommended: 2 */
97
+ mode?: number;
98
+
99
+ /** Target entropy (perplexity = exp(tau)). Default: 5.0. Lower = more focused */
100
+ tau?: number;
101
+
102
+ /** Learning rate for entropy adjustment. Default: 0.1. Higher = faster adaptation */
103
+ eta?: number;
104
+ }
105
+
106
+ /**
107
+ * DRY (Don't Repeat Yourself) sampling parameters
108
+ *
109
+ * Penalizes repetition of token sequences, more sophisticated than
110
+ * simple repetition penalty. Useful for reducing loops and redundancy
111
+ * in generated text.
112
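+ *
+ * Illustrative usage (base and allowedLength follow the typical values noted
+ * on the fields below; multiplier and penaltyLastN are arbitrary examples):
+ * ```typescript
+ * const token = ctx.sample({
+ *   temperature: 0.7,
+ *   advanced: {
+ *     dry: { multiplier: 0.8, base: 1.75, allowedLength: 2, penaltyLastN: 256 }
+ *   }
+ * });
+ * ```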
+ */
113
+ export interface DryParams {
114
+ /** Penalty strength (0.0 = disabled, higher = stronger penalty) */
115
+ multiplier?: number;
116
+
117
+ /** Base penalty value (typically 1.75) */
118
+ base?: number;
119
+
120
+ /** Minimum sequence length to trigger penalty (typically 2) */
121
+ allowedLength?: number;
122
+
123
+ /** Number of recent tokens to scan for repetitions */
124
+ penaltyLastN?: number;
125
+ }
126
+
127
+ /**
128
+ * XTC (eXclude Top Choices) sampler parameters
129
+ *
130
+ * Excludes very high probability tokens to increase output diversity.
131
+ * Useful when model is overly confident and produces repetitive text.
132
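+ *
+ * Illustrative usage (both values follow the typical settings noted on the
+ * fields below):
+ * ```typescript
+ * const token = ctx.sample({
+ *   temperature: 1.0,
+ *   advanced: {
+ *     xtc: { probability: 0.1, threshold: 0.1 }
+ *   }
+ * });
+ * ```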
+ */
133
+ export interface XtcParams {
134
+ /** Probability of applying XTC (0.0 = disabled, 1.0 = always). Typical: 0.1 */
135
+ probability?: number;
136
+
137
+ /** Confidence threshold above which tokens are excluded. Typical: 0.1 */
138
+ threshold?: number;
139
+ }
140
+
141
+ /**
142
+ * Advanced sampling parameters
143
+ */
144
+ export interface AdvancedSamplingParams {
145
+ /** Locally typical sampling (1.0 = disabled) */
146
+ typicalP?: number;
147
+
148
+ /** Mirostat sampling configuration */
149
+ mirostat?: MirostatParams;
150
+
151
+ /** DRY (Don't Repeat Yourself) sampling */
152
+ dry?: DryParams;
153
+
154
+ /** XTC sampler */
155
+ xtc?: XtcParams;
156
+ }
157
+
158
+ /**
159
+ * Sampling parameters for token generation
160
+ *
161
+ * Common presets:
162
+ * - Factual/Precise: { temperature: 0.1 }
163
+ * - Balanced: { temperature: 0.7 }
164
+ * - Creative: { temperature: 1.0 }
165
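+ *
+ * A sketch combining the common and grouped controls (values are arbitrary
+ * examples, not tuned recommendations):
+ * ```typescript
+ * const token = ctx.sample({
+ *   temperature: 0.7,
+ *   topK: 40,
+ *   topP: 0.95,
+ *   seed: 42,
+ *   penalties: { repeat: 1.1, lastN: 64 }
+ * });
+ * ```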
+ */
166
+ export interface SamplingParams {
167
+ // ===== COMMON CONTROLS =====
168
+
169
+ /** Randomness (0.0 = always most likely, 2.0 = very random) */
170
+ temperature?: number;
171
+
172
+ /** Only consider top K most likely tokens (0 = disabled) */
173
+ topK?: number;
174
+
175
+ /** Nucleus sampling threshold (1.0 = disabled) */
176
+ topP?: number;
177
+
178
+ /** Minimum probability threshold */
179
+ minP?: number;
180
+
181
+ /** Random seed for reproducible generation (-1 = random) */
182
+ seed?: number;
183
+
184
+ /** GBNF grammar string for constrained generation */
185
+ grammar?: string;
186
+
187
+ // ===== GROUPED CONTROLS =====
188
+
189
+ /** Penalty parameters for repetition control */
190
+ penalties?: PenaltyParams;
191
+
192
+ /** Advanced sampling parameters */
193
+ advanced?: AdvancedSamplingParams;
194
+ }
195
+
196
+ /**
197
+ * A llama.cpp context for text generation
198
+ *
199
+ * Represents a loaded model with KV cache for maintaining conversation state.
200
+ * Use createContext() to initialize, and dispose() when done to free memory.
201
+ */
202
+ export interface SessionContext {
203
+ // ===== THE GENERATION LOOP =====
204
+
205
+ /**
206
+ * STEP 1: Process tokens through the model (forward pass)
207
+ *
208
+ * This feeds tokens through the transformer and updates the KV cache.
209
+ * After decoding, the model has "read" this text and is ready to predict.
210
+ *
211
+ * Think of this as: "the model reads your prompt"
212
+ *
213
+ * Why async? Model inference takes time (~45ms per token)
214
+ * Why position? Model needs to know where in conversation this text appears
215
+ *
216
+ * Cost: ~45ms per token (generation), ~120ms for 50 tokens (prompt)
217
+ *
218
+ * @param tokens Token IDs from tokenize()
219
+ * @param position Where these tokens start in the sequence
220
+ * @param seqId Sequence ID (default: 0)
221
+ * @example
222
+ * ```typescript
223
+ * const tokens = await ctx.tokenize("Hello world");
224
+ * await ctx.decode(tokens, 0);
225
+ * let position = tokens.length;
226
+ *
227
+ * // Generate next token
228
+ * await ctx.decode([nextToken], position++);
229
+ *
230
+ * // Multi-sequence: decode to different sequences
231
+ * await ctx.decode(tokens, 0, 0); // Sequence 0
232
+ * await ctx.decode(tokens, 0, 1); // Sequence 1
233
+ * ```
234
+ */
235
+ decode(tokens: number[], position: number, seqId?: number): Promise<void>;
236
+
237
+ /**
238
+ * STEP 2a: Get token scores for custom sampling (zero-copy, mutable)
239
+ *
240
+ * Returns unnormalized scores for every possible next token.
241
+ * Higher score = model thinks this token is more likely.
242
+ *
243
+ * Use this for custom sampling logic or grammar-constrained generation.
244
+ * For reading scores (entropy computation), use getLogits() instead.
245
+ *
246
+ * ⚠️ CRITICAL LIFETIME CONSTRAINTS:
247
+ * - This is a zero-copy buffer (points directly to model memory)
248
+ * - Valid ONLY until next decode() call
249
+ * - NOT thread-safe - use only on JS thread
250
+ * - DO NOT retain reference across async boundaries
251
+ * - Buffer is invalidated by: decode(), sample() with grammar
252
+ *
253
+ * Cost: ~0.5ms (zero-copy pointer)
254
+ *
255
+ * @returns Buffer containing vocabSize floats (Float32Array compatible)
256
+ * @example Safe usage
257
+ * ```typescript
258
+ * const buffer = ctx.getTokenScores();
259
+ * const scores = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.length / 4);
260
+ *
261
+ * // Modify immediately (safe - still on JS thread)
262
+ * scores[BANNED_TOKEN] = -Infinity;
263
+ *
264
+ * // Use immediately
265
+ * const token = customSample(scores);
266
+ *
267
+ * // Now decode invalidates the buffer
268
+ * await ctx.decode([token], position++);
269
+ * // Buffer is now INVALID - do not access!
270
+ * ```
271
+ */
272
+ getTokenScores(): Buffer;
273
+
274
+ /**
275
+ * STEP 2b: Get logits for reading (zero-copy, readonly usage pattern)
276
+ *
277
+ * Returns Float32Array for computational tasks like entropy calculation.
278
+ * For custom sampling or grammar, use getTokenScores() instead.
279
+ *
280
+ * WARNING: Buffer is only valid until next decode() call!
281
+ *
282
+ * @returns Float32Array of unnormalized logits (vocabSize elements)
283
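+ * @example Read-only scan (a minimal sketch - use the values before the next decode)
+ * ```typescript
+ * const logits = ctx.getLogits();
+ *
+ * // Find the most likely token without modifying the buffer
+ * let best = 0;
+ * for (let i = 1; i < logits.length; i++) {
+ *   if (logits[i] > logits[best]) best = i;
+ * }
+ *
+ * // Any decode() after this point invalidates `logits`
+ * await ctx.decode([best], position++);
+ * ```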
+ */
284
+ getLogits(): Float32Array;
285
+
286
+ /**
287
+ * STEP 3: Sample a token from scores
288
+ *
289
+ * Converts raw scores into a token decision using:
290
+ * - Temperature: controls randomness
291
+ * - Top-K/Top-P: filters unlikely tokens
292
+ * - Grammar: enforces format constraints (if grammar initialized)
293
+ *
294
+ * This is where generation strategy happens.
295
+ *
296
+ * Cost: ~0.1ms (native sampling)
297
+ *
298
+ * @param params Sampling strategy (greedy if omitted)
299
+ * @returns Selected token ID
300
+ * @example
301
+ * ```typescript
302
+ * // Greedy (always pick most likely)
303
+ * const token = ctx.sample();
304
+ *
305
+ * // Creative generation
306
+ * const token = ctx.sample({ temperature: 0.9 });
307
+ *
308
+ * // Constrained to valid JSON
309
+ * ctx.initGrammar(grammar);
310
+ * const token = ctx.sample({ temperature: 0.7 });
311
+ * ```
312
+ */
313
+ sample(params?: SamplingParams): number;
314
+
315
+ /**
316
+ * Convert token ID to text piece
317
+ *
318
+ * Fast synchronous lookup in vocabulary table.
319
+ * Call this on each generated token for streaming display.
320
+ *
321
+ * Optimized for per-token conversion during generation.
322
+ * For batch conversion of many tokens, use detokenize() instead.
323
+ *
324
+ * Cost: ~0.05ms
325
+ *
326
+ * @param token Token ID from sample()
327
+ * @returns Text string for this token
328
+ * @example
329
+ * ```typescript
330
+ * while (true) {
331
+ * const token = ctx.sample({ temperature: 0.8 });
332
+ * if (ctx.isStopToken(token)) break;
333
+ *
334
+ * const text = ctx.tokenToText(token);
335
+ * process.stdout.write(text); // Stream to output
336
+ *
337
+ * await ctx.decode([token], position++);
338
+ * }
339
+ * ```
340
+ */
341
+ tokenToText(token: number): string;
342
+
343
+ /**
344
+ * Check if token is a model stop token
345
+ *
346
+ * Returns true for built-in end-of-generation tokens:
347
+ * - </s> (Llama 2)
348
+ * - <|endoftext|> (GPT)
349
+ * - <|eot_id|> (Llama 3)
350
+ * - Model-specific EOS tokens
351
+ *
352
+ * Note: This checks vocabulary stop tokens, not custom stop sequences.
353
+ * For custom stops (e.g., "\n\n", "###"), compare generated text
354
+ * against your stop strings in application code.
355
+ *
356
+ * Cost: <0.01ms (fast vocabulary lookup)
357
+ *
358
+ * @param token Token ID to check
359
+ * @example
360
+ * ```typescript
361
+ * const token = ctx.sample();
362
+ * if (ctx.isStopToken(token)) {
363
+ * console.log('Generation complete');
364
+ * break;
365
+ * }
366
+ * ```
367
+ */
368
+ isStopToken(token: number): boolean;
369
+
370
+ // ===== PROMPT PREPARATION =====
371
+
372
+ /**
373
+ * Tokenize text into model's vocabulary
374
+ *
375
+ * Converts human text → token IDs for decode().
376
+ * Same text always produces same tokens for a given model.
377
+ *
378
+ * Cost: ~1ms per 100 characters
379
+ *
380
+ * @param text Text to tokenize
381
+ * @returns Array of token IDs
382
+ * @example
383
+ * ```typescript
384
+ * const tokens = await ctx.tokenize("Hello world");
385
+ * console.log(tokens); // e.g. [15496, 1917] - exact IDs are model-specific
386
+ *
387
+ * await ctx.decode(tokens, 0);
388
+ * ```
389
+ */
390
+ tokenize(text: string): Promise<number[]>;
391
+
392
+ /**
393
+ * Detokenize array of tokens back to text
394
+ *
395
+ * Inverse of tokenize(). Use for reconstructing complete text
396
+ * from token sequences (e.g., after KV cache operations).
397
+ *
398
+ * Optimized for batch conversion of many tokens.
399
+ * For single-token conversion during generation, use tokenToText().
400
+ *
401
+ * Cost: ~1ms per 100 tokens
402
+ *
403
+ * @param tokens Array of token IDs
404
+ * @returns Complete text representation
405
+ * @example
406
+ * ```typescript
407
+ * const tokens = [15496, 1917]; // "Hello world"
408
+ * const text = await ctx.detokenize(tokens);
409
+ * console.log(text); // "Hello world"
410
+ * ```
411
+ */
412
+ detokenize(tokens: number[]): Promise<string>;
413
+
414
+ // ===== KV CACHE MANAGEMENT =====
415
+
416
+ /**
417
+ * Get current sequence length (number of decoded tokens)
418
+ *
419
+ * The KV cache stores model state for all decoded tokens.
420
+ * This tells you how many tokens are currently in memory.
421
+ *
422
+ * Think of this as: "How much has the model read so far?"
423
+ *
424
+ * Cost: <0.01ms (fast sync operation - safe to call frequently)
425
+ *
426
+ * @param sequenceId Sequence ID (defaults to 0 for single conversation)
427
+ * @returns Number of tokens in cache, or -1 if empty
428
+ * @example
429
+ * ```typescript
430
+ * const tokens = await ctx.tokenize("Hello world");
431
+ * await ctx.decode(tokens, 0);
432
+ *
433
+ * const length = ctx.kvCacheSize(0);
434
+ * console.log(length); // 2 (number of tokens)
435
+ * ```
436
+ */
437
+ kvCacheSize(sequenceId?: number): number;
438
+
439
+ /**
440
+ * Remove token range from KV cache
441
+ *
442
+ * Deletes tokens from model's memory. Use cases:
443
+ * - Removing old context when hitting limit (sliding window)
444
+ * - Implementing conversation pruning
445
+ * - Forgetting specific messages
446
+ * - Preparing for injection of new context
447
+ *
448
+ * ⚠️ CRITICAL: Call BEFORE next decode(), not after!
449
+ * The model needs to know about the removal before processing new tokens.
450
+ *
451
+ * Cost: ~1-5ms depending on range
452
+ *
453
+ * @param sequenceId Sequence ID (use 0 for single sequence)
454
+ * @param start Start position (inclusive)
455
+ * @param end End position (exclusive), -1 = to end
456
+ * @example
457
+ * ```typescript
458
+ * // Remove old tokens to stay under context limit
459
+ * const currentLength = ctx.kvCacheSize(0);
460
+ * if (currentLength > 2000) {
461
+ * // Remove oldest 500 tokens
462
+ * await ctx.kvCacheRemove(0, 0, 500);
463
+ *
464
+ * // THEN decode new tokens
465
+ * await ctx.decode(newTokens, currentLength - 500);
466
+ * }
467
+ * ```
468
+ */
469
+ kvCacheRemove(sequenceId: number, start: number, end: number): Promise<void>;
470
+
471
+ /**
472
+ * Snapshot KV cache state for branching/undo
473
+ *
474
+ * Serializes entire model state to Buffer.
475
+ * Restore later with kvCacheLoad() for:
476
+ * - Conversation branching ("what if I said X instead?")
477
+ * - Undo/redo functionality
478
+ * - Checkpointing long conversations
479
+ *
480
+ * Size: ~500MB-2GB depending on context length and model
481
+ *
482
+ * Cost: ~100-500ms depending on cache size
483
+ *
484
+ * @param sequenceId Sequence ID (use 0 for single sequence)
485
+ * @returns Serialized state buffer
486
+ * @example
487
+ * ```typescript
488
+ * // Save state before risky operation
489
+ * const snapshot = await ctx.kvCacheSave(0);
490
+ *
491
+ * // Try something
492
+ * await ctx.decode(riskyTokens, position);
493
+ *
494
+ * // Didn't work - restore previous state
495
+ * await ctx.kvCacheLoad(0, snapshot);
496
+ * ```
497
+ */
498
+ kvCacheSave(sequenceId?: number): Promise<Buffer>;
499
+
500
+ /**
501
+ * Restore KV cache from previous snapshot
502
+ *
503
+ * Loads saved model state. Context returns to exact state
504
+ * when snapshot was taken.
505
+ *
506
+ * Cost: ~100-500ms depending on snapshot size
507
+ *
508
+ * @param sequenceId Sequence ID (use 0 for single sequence)
509
+ * @param state Buffer from kvCacheSave()
510
+ * @example
511
+ * ```typescript
512
+ * const snapshot = await ctx.kvCacheSave(0);
513
+ *
514
+ * // ... many operations later ...
515
+ *
516
+ * // Restore to saved state
517
+ * await ctx.kvCacheLoad(0, snapshot);
518
+ * ```
519
+ */
520
+ kvCacheLoad(sequenceId: number, state: Buffer): Promise<void>;
521
+
522
+ /**
523
+ * Clear all KV cache (fresh start)
524
+ *
525
+ * Removes all cached tokens. Model returns to initial state
526
+ * as if no text has been processed.
527
+ *
528
+ * Use when starting a completely new conversation.
529
+ *
530
+ * Cost: ~1ms
531
+ *
532
+ * @example
533
+ * ```typescript
534
+ * // Start fresh conversation
535
+ * await ctx.kvCacheClear();
536
+ *
537
+ * const tokens = await ctx.tokenize("New conversation");
538
+ * await ctx.decode(tokens, 0);
539
+ * ```
540
+ */
541
+ kvCacheClear(): Promise<void>;
542
+
543
+ /**
544
+ * Atomic clear+reseed operation
545
+ *
546
+ * Implements a KV cache compression strategy:
547
+ * 1. Clear entire KV cache
548
+ * 2. Re-decode original sinks (first N tokens from conversation start)
549
+ * 3. Re-decode tail (last M recent tokens)
550
+ *
552
+ * @param sinks - ORIGINAL first N tokens from conversation start (typically 4)
553
+ * @param tail - Recent M tokens to preserve (typically 508-1020)
554
+ * @returns Promise that resolves when reseed completes
555
+ *
556
+ * @example
557
+ * ```typescript
558
+ * const ORIGINAL_SINKS = allTokens.slice(0, 4);
559
+ *
560
+ * const tail = allTokens.slice(-508); // Last 508 tokens
561
+ * await ctx.clearAndReseed(ORIGINAL_SINKS, tail);
562
+ *
563
+ * const nextToken = ctx.greedySample();
564
+ * await ctx.decode([nextToken], 512);
565
+ * ```
566
+ */
567
+ clearAndReseed(sinks: number[], tail: number[]): Promise<void>;
568
+
569
+ // ===== GRAMMAR-CONSTRAINED GENERATION =====
570
+
571
+ /**
572
+ * Initialize grammar parser (once per generation session)
573
+ *
574
+ * Grammars constrain generation to valid formats (JSON, XML, etc.).
575
+ * Parser tracks state across tokens to enforce rules.
576
+ *
577
+ * Call once before starting constrained generation.
578
+ * Use resetGrammar() to reuse same grammar for new generation.
579
+ *
580
+ * Cost: ~0.1-1ms depending on grammar complexity
581
+ *
582
+ * @param grammarStr GBNF grammar string (EBNF-like syntax)
583
+ * @example
584
+ * ```typescript
585
+ * // Force valid JSON
586
+ * const grammar = ctx.jsonSchemaToGrammar(JSON.stringify({
587
+ * type: "object",
588
+ * properties: {
589
+ * name: { type: "string" },
590
+ * age: { type: "number" }
591
+ * }
592
+ * }));
593
+ *
594
+ * ctx.initGrammar(grammar);
595
+ *
596
+ * // Now sample() will only generate valid JSON
597
+ * const token = ctx.sample({ temperature: 0.7 });
598
+ * ```
599
+ */
600
+ initGrammar(grammarStr: string): void;
601
+
602
+ /**
603
+ * Apply grammar constraints to token scores (modifies in-place)
604
+ *
605
+ * Masks invalid tokens with -Infinity based on parser state.
606
+ * Call after getTokenScores(), before custom sampling.
607
+ *
608
+ * Flow: getTokenScores() → applyGrammar() → sample() → acceptToken()
609
+ *
610
+ * Thread safety: This method is synchronous and modifies the buffer
611
+ * in-place on the JS thread. Safe because it's called sequentially
612
+ * in the generation loop before any async operations.
613
+ *
614
+ * Cost: ~0.1-1ms depending on grammar complexity
615
+ *
616
+ * @param scoresBuffer Buffer from getTokenScores() (modified in-place)
617
+ * @throws Error if grammar not initialized (call initGrammar first)
618
+ * @example
619
+ * ```typescript
620
+ * // Custom sampling with grammar
621
+ * const buffer = ctx.getTokenScores();
622
+ * const scores = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.length / 4);
623
+ *
624
+ * // Apply grammar constraints
625
+ * ctx.applyGrammar(buffer);
626
+ *
627
+ * // Now sample from constrained distribution
628
+ * const token = customSample(scores);
629
+ * ctx.acceptToken(token);
630
+ * ```
631
+ */
632
+ applyGrammar(scoresBuffer: Buffer): void;
633
+
634
+ /**
635
+ * Advance grammar parser with chosen token
636
+ *
637
+ * Updates parser state after sampling.
638
+ * MUST be called AFTER sampling, BEFORE next applyGrammar().
639
+ *
640
+ * This advances the stateful grammar parser through its rules.
641
+ * Without this, grammar constraints will be incorrect.
642
+ *
643
+ * Cost: <0.01ms
644
+ *
645
+ * @param tokenId Token that was sampled
646
+ * @example
647
+ * ```typescript
648
+ * const buffer = ctx.getTokenScores();
649
+ * ctx.applyGrammar(buffer);
650
+ *
651
+ * const scores = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.length / 4);
652
+ * const token = customSample(scores);
653
+ *
654
+ * // MUST call acceptToken to advance parser
655
+ * ctx.acceptToken(token);
656
+ *
657
+ * // Now parser is ready for next token
658
+ * ```
659
+ */
660
+ acceptToken(tokenId: number): void;
661
+
662
+ /**
663
+ * Reset grammar parser to initial state
664
+ *
665
+ * Call at start of each new generation with same grammar.
666
+ * Parser returns to root state, ready to validate from beginning.
667
+ *
668
+ * Cost: <0.01ms
669
+ *
670
+ * @example
671
+ * ```typescript
672
+ * ctx.initGrammar(jsonGrammar);
673
+ *
674
+ * // First generation
675
+ * while (!done) {
676
+ * const token = ctx.sample();
677
+ * // ... generate ...
678
+ * }
679
+ *
680
+ * // Second generation - reuse same grammar
681
+ * ctx.resetGrammar();
682
+ * while (!done) {
683
+ * const token = ctx.sample();
684
+ * // ... generate ...
685
+ * }
686
+ * ```
687
+ */
688
+ resetGrammar(): void;
689
+
690
+ /**
691
+ * Free grammar resources
692
+ *
693
+ * Call when done with constrained generation.
694
+ * Releases parser memory.
695
+ *
696
+ * Cost: <0.01ms
697
+ *
698
+ * @example
699
+ * ```typescript
700
+ * ctx.initGrammar(grammar);
701
+ * // ... do constrained generation ...
702
+ * ctx.freeGrammar();
703
+ * ```
704
+ */
705
+ freeGrammar(): void;
706
+
707
+ // ===== KV SEQUENCE OPERATIONS =====
708
+
709
+ /**
710
+ * Copy KV cache from one sequence to another
711
+ *
712
+ * Duplicates the KV cache state from source to destination sequence.
713
+ * After copying, both sequences can continue independently.
714
+ *
715
+ * NOTE: Only full sequence copies are currently supported.
716
+ * The p0/p1 parameters must use default values (0 and -1).
717
+ *
718
+ * Cost: ~1-5ms depending on sequence length
719
+ *
720
+ * @param srcSeqId Source sequence to copy from
721
+ * @param dstSeqId Destination sequence to copy to
722
+ * @param p0 Start position (must be 0, default: 0)
723
+ * @param p1 End position (must be -1 for full copy, default: -1)
724
+ * @example
725
+ * ```typescript
726
+ * // Decode initial prompt to seq 0
727
+ * await ctx.decode(promptTokens, 0);
728
+ *
729
+ * // Copy seq 0 -> seq 1
730
+ * ctx.kvSeqCopy(0, 1);
731
+ *
732
+ * // Now both sequences can continue independently
733
+ * await ctx.decode([tokenA], position, 0);
734
+ * await ctx.decode([tokenB], position, 1);
735
+ * ```
736
+ */
737
+ kvSeqCopy(srcSeqId: number, dstSeqId: number, p0?: number, p1?: number): void;
738
+
739
+ /**
740
+ * Keep only specified sequence, remove all others
741
+ *
742
+ * Removes all sequences except the one specified.
743
+ * For complete cleanup of unwanted sequences, consider using
744
+ * kvCacheRemove(seqId, 0, -1) on each sequence instead.
745
+ *
746
+ * @param seqId Sequence ID to keep
747
+ */
748
+ kvSeqKeep(seqId: number): void;
749
+
750
+ /**
751
+ * Get max position in sequence
752
+ *
753
+ * Returns the highest position index in the specified sequence,
754
+ * or -1 if the sequence is empty.
755
+ *
756
+ * Cost: <0.01ms (fast sync operation)
757
+ *
758
+ * @param seqId Sequence ID to query
759
+ * @returns Max position index, or -1 if empty
760
+ * @example
761
+ * ```typescript
762
+ * const pos = ctx.kvSeqPosMax(0);
763
+ * if (pos === -1) {
764
+ * console.log('Sequence is empty');
765
+ * } else {
766
+ * console.log(`Sequence has ${pos + 1} tokens`);
767
+ * }
768
+ * ```
769
+ */
770
+ kvSeqPosMax(seqId: number): number;
771
+
772
+ // ===== HANDLE-BASED GRAMMAR =====
773
+
774
+ /**
775
+ * Create a new grammar sampler (returns handle)
776
+ *
777
+ * Creates an independent grammar sampler instance with its own state.
778
+ *
779
+ * Unlike initGrammar() which uses a single internal sampler, this returns
780
+ * a handle that can be used with applySampler/acceptSamplerToken.
781
+ * Multiple handles can coexist with independent parser states.
782
+ *
783
+ * Cost: ~0.1-1ms depending on grammar complexity
784
+ *
785
+ * @param grammarStr GBNF grammar string
786
+ * @returns Handle to the created sampler
787
+ * @example
788
+ * ```typescript
789
+ * const grammarHandle = ctx.createSampler(jsonGrammar);
790
+ *
791
+ * // Apply grammar constraints to logits
792
+ * ctx.applySampler(grammarHandle, logitsBuffer);
793
+ * ctx.acceptSamplerToken(grammarHandle, token);
794
+ *
795
+ * // Create independent copy with same grammar
796
+ * const clonedHandle = ctx.cloneSampler(grammarHandle);
797
+ *
798
+ * // Cleanup when done
799
+ * ctx.freeSamplerHandle(grammarHandle);
800
+ * ctx.freeSamplerHandle(clonedHandle);
801
+ * ```
802
+ */
803
+ createSampler(grammarStr: string): number;
804
+
805
+ /**
806
+ * Apply grammar constraints using handle-based sampler
807
+ *
808
+ * Masks invalid tokens with -Infinity based on parser state.
809
+ * Modifies the logits buffer in-place.
810
+ *
811
+ * @param handle Sampler handle from createSampler()
812
+ * @param logitsBuffer ArrayBuffer or TypedArray containing logits
813
+ */
814
+ applySampler(handle: number, logitsBuffer: ArrayBuffer | Float32Array): void;
815
+
816
+ /**
817
+ * Accept token to advance grammar parser state (handle-based)
818
+ *
819
+ * Must be called after sampling to advance the grammar parser.
820
+ * This is the handle-based equivalent of acceptToken().
821
+ *
822
+ * @param handle Sampler handle from createSampler()
823
+ * @param tokenId Token that was sampled
824
+ */
825
+ acceptSamplerToken(handle: number, tokenId: number): void;
826
+
827
+ /**
828
+ * Clone a grammar sampler
829
+ *
830
+ * Creates a copy of the sampler with identical parser state.
831
+ * Both handles can then be used independently with their own state.
832
+ *
833
+ * @param handle Sampler handle to clone
834
+ * @returns New handle to cloned sampler
835
+ * @example
836
+ * ```typescript
837
+ * const original = ctx.createSampler(jsonGrammar);
838
+ * ctx.acceptSamplerToken(original, openBrace);
839
+ *
840
+ * // Clone preserves parser state (already accepted openBrace)
841
+ * const copy = ctx.cloneSampler(original);
842
+ *
843
+ * // Both can now continue independently
844
+ * ctx.acceptSamplerToken(original, tokenA);
845
+ * ctx.acceptSamplerToken(copy, tokenB);
846
+ * ```
847
+ */
848
+ cloneSampler(handle: number): number;
849
+
850
+ /**
851
+ * Free a grammar sampler handle
852
+ *
853
+ * Releases memory for the specified sampler.
854
+ * Handle becomes invalid after this call.
855
+ *
856
+ * @param handle Sampler handle to free
857
+ */
858
+ freeSamplerHandle(handle: number): void;
859
+
860
+ // ===== METRICS API =====
861
+
862
+ /**
863
+ * Compute surprisal (negative log-likelihood) for a specific token.
864
+ *
865
+ * Measures how "surprising" the model finds the given token:
866
+ * - Low surprisal: Model expected this token (high probability)
867
+ * - High surprisal: Model didn't expect this token (low probability)
868
+ *
869
+ * Call after decode() to compute surprisal for any token based on
870
+ * the current logits distribution.
871
+ *
872
+ * @param pickedTokenId - Token ID to compute surprisal for
873
+ * @param base - Logarithm base: "nats" (default) or "bits"
874
+ * @returns Surprisal value in specified base
875
+ *
876
+ * @example
877
+ * ```typescript
878
+ * await ctx.decode(tokens, position);
879
+ * const token = ctx.sample();
880
+ * const surprisal = ctx.modelSurprisal(token, "bits");
881
+ * console.log(`Model surprise: ${surprisal.toFixed(2)} bits`);
882
+ * ```
883
+ *
884
+ * COST: O(1) - direct probability lookup from logits
885
+ */
886
+ modelSurprisal(pickedTokenId: number, base?: 'nats' | 'bits'): number;
887
+
888
+ /**
889
+ * Compute entropy of the entire logits distribution.
890
+ *
891
+ * Measures model uncertainty:
892
+ * - Low entropy: Model is confident (peaked distribution)
893
+ * - High entropy: Model is uncertain (flat distribution)
894
+ *
895
+ * Call after decode() to analyze the current prediction distribution.
896
+ *
897
+ * @param base - Logarithm base: "nats" (default) or "bits"
898
+ * @returns Entropy value in specified base
899
+ *
900
+ * @example
901
+ * ```typescript
902
+ * await ctx.decode(tokens, position);
903
+ * const entropy = ctx.modelEntropy("bits");
904
+ * if (entropy > 5.0) {
905
+ * console.log("Model is very uncertain - consider adjusting parameters");
906
+ * }
907
+ * ```
908
+ *
909
+ * COST: O(n_vocab) - must sum over all token probabilities
910
+ */
911
+ modelEntropy(base?: 'nats' | 'bits'): number;
912
+
913
+ /**
914
+ * Create a new perplexity tracker.
915
+ *
916
+ * @returns Integer handle to the tracker
917
+ *
918
+ * @example
919
+ * ```typescript
920
+ * const tracker = ctx.createPerplexityTracker();
921
+ *
922
+ * // Add surprisals during generation
923
+ * for (let i = 0; i < tokens.length; i++) {
924
+ * const surprisal = ctx.modelSurprisal(tokens[i]);
925
+ * ctx.addSurprisal(tracker, surprisal);
926
+ * }
927
+ *
928
+ * const ppl = ctx.getPerplexity(tracker);
929
+ * console.log(`Sequence perplexity: ${ppl.toFixed(2)}`);
930
+ *
931
+ * ctx.freePerplexityTracker(tracker);
932
+ * ```
933
+ */
934
+ createPerplexityTracker(): number;
935
+
936
+ /**
937
+ * Add a surprisal value to the rolling tracker.
938
+ *
939
+ * @param handle - Tracker handle from createPerplexityTracker()
940
+ * @param surprisal - Surprisal value (from modelSurprisal or computed)
941
+ *
942
+ * @example
943
+ * ```typescript
944
+ * const surprisal = ctx.modelSurprisal(tokenId, "nats");
945
+ * ctx.addSurprisal(tracker, surprisal);
946
+ * ```
947
+ *
948
+ * COST: O(1) - numerically stable accumulation
949
+ * THREAD-SAFETY: Not thread-safe (handle is session-local)
950
+ */
951
+ addSurprisal(handle: number, surprisal: number): void;
952
+
953
+ /**
954
+ * Get current perplexity value.
955
+ *
956
+ * @param handle - Tracker handle
957
+ * @returns Perplexity = exp(average_surprisal_in_nats)
958
+ *
959
+ * @example
960
+ * ```typescript
961
+ * const ppl = ctx.getPerplexity(tracker);
962
+ * console.log(`Current PPL: ${ppl.toFixed(2)}`);
963
+ * ```
964
+ *
965
+ * FORMULA: PPL = exp(sum_surprisals / count)
966
+ * RANGE: [1, ∞) where 1 = perfect prediction
967
+ */
968
+ getPerplexity(handle: number): number;
969
+
970
+ /**
971
+ * Clone a perplexity tracker (for fork/branch scenarios).
972
+ *
973
+ * @param sourceHandle - Handle to clone from
974
+ * @returns New handle with same accumulated state
975
+ *
976
+ * @example
977
+ * ```typescript
978
+ * // Branch A and B start from same base perplexity
979
+ * const baseTracker = ctx.createPerplexityTracker();
980
+ * // ... accumulate base surprisals ...
981
+ *
982
+ * const branchA = ctx.clonePerplexityTracker(baseTracker);
983
+ * const branchB = ctx.clonePerplexityTracker(baseTracker);
984
+ *
985
+ * // Branch A and B now track independently
986
+ * ctx.addSurprisal(branchA, surprisalA);
987
+ * ctx.addSurprisal(branchB, surprisalB);
988
+ * ```
989
+ */
990
+ clonePerplexityTracker(sourceHandle: number): number;
991
+
992
+ /**
993
+ * Reset tracker to initial state (count=0, sum=0).
994
+ *
995
+ * @param handle - Tracker handle to reset
996
+ *
997
+ * @example
998
+ * ```typescript
999
+ * // Reuse tracker for multiple sequences
1000
+ * const tracker = ctx.createPerplexityTracker();
1001
+ *
1002
+ * for (const sequence of sequences) {
1003
+ * ctx.resetPerplexityTracker(tracker);
1004
+ * // ... process sequence ...
1005
+ * const ppl = ctx.getPerplexity(tracker);
1006
+ * }
1007
+ * ```
1008
+ */
1009
+ resetPerplexityTracker(handle: number): void;
1010
+
1011
+ /**
1012
+ * Get number of tokens tracked.
1013
+ *
1014
+ * @param handle - Tracker handle
1015
+ * @returns Number of surprisal values added
1016
+ */
1017
+ getPerplexityCount(handle: number): number;
1018
+
1019
+ /**
1020
+ * Free perplexity tracker resources.
1021
+ *
1022
+ * @param handle - Tracker handle to free
1023
+ *
1024
+ * NOTE: Auto-freed in dispose() if not manually freed
1025
+ */
1026
+ freePerplexityTracker(handle: number): void;
1027
+
1028
+ // ===== ATOMIC DECODE+CAPTURE =====
1029
+
1030
+ /**
1031
+ * Decode tokens and capture logits atomically
1032
+ *
1033
+ * Performs decode and logits capture as a single atomic operation,
1034
+ * ensuring the captured logits correspond exactly to the decoded tokens.
1035
+ *
1036
+ * Use this instead of separate decode() + getLogits() calls when
1037
+ * you need guaranteed consistency between decode and logits capture.
1038
+ *
1039
+ * @param tokens Token IDs to decode
1040
+ * @param position Start position in sequence
1041
+ * @param seqId Sequence ID
1042
+ * @param destBuffer Pre-allocated buffer to receive logits (vocabSize floats)
1043
+ * @example
1044
+ * ```typescript
1045
+ * // Pre-allocate buffer (reuse across calls)
1046
+ * const logitsBuffer = new Float32Array(ctx.vocabSize);
1047
+ *
1048
+ * // Atomic decode + capture
1049
+ * ctx.decodeAndCapture([token], position, seqId, logitsBuffer);
1050
+ *
1051
+ * // Safe to process logitsBuffer - it's an independent copy
1052
+ * const nextToken = sampleFromLogits(logitsBuffer);
1053
+ * ```
1054
+ */
1055
+ decodeAndCapture(
1056
+ tokens: number[],
1057
+ position: number,
1058
+ seqId: number,
1059
+ destBuffer: ArrayBuffer | Float32Array
1060
+ ): void;
1061
+
1062
+ // ===== KV CACHE FILE PERSISTENCE =====
1063
+
1064
+ /**
1065
+ * Write KV cache state + tokens to file
1066
+ *
1067
+ * Persists KV cache state for later restoration.
1068
+ * Useful for checkpointing long conversations.
1069
+ *
1070
+ * @param sequenceId Sequence ID to save
1071
+ * @param filepath Path to save file
1072
+ * @param tokens Tokens that were decoded into this sequence
1073
+ * @returns Promise resolving to bytes written
1074
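+ *
+ * Round-trip sketch (file path and token bookkeeping are illustrative):
+ * ```typescript
+ * // Checkpoint sequence 0 together with the tokens decoded into it
+ * const bytesWritten = await ctx.kvCacheWriteFile(0, './session.kv', allTokens);
+ *
+ * // Later (e.g. after kvCacheClear), restore the checkpoint
+ * const { tokens, bytesRead } = await ctx.kvCacheReadFile(0, './session.kv');
+ * let position = tokens.length; // continue decoding from here
+ * ```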
+ */
1075
+ kvCacheWriteFile(
1076
+ sequenceId: number,
1077
+ filepath: string,
1078
+ tokens: number[]
1079
+ ): Promise<number>;
1080
+
1081
+ /**
1082
+ * Read KV cache state + tokens from file
1083
+ *
1084
+ * Restores KV cache state from a previous kvCacheWriteFile call.
1085
+ *
1086
+ * @param sequenceId Sequence ID to restore to
1087
+ * @param filepath Path to saved file
1088
+ * @returns Promise resolving to tokens and bytes read
1089
+ */
1090
+ kvCacheReadFile(
1091
+ sequenceId: number,
1092
+ filepath: string
1093
+ ): Promise<{ tokens: number[]; bytesRead: number }>;
1094
+
1095
+ // ===== HELPERS =====
1096
+
1097
+ /**
1098
+ * Format messages using model's chat template
1099
+ *
1100
+ * Converts [{role, content}] → formatted prompt string.
1101
+ * Uses model's built-in template (ChatML, Llama, Mistral, etc.).
1102
+ *
1103
+ * Cost: ~1-5ms depending on message count
1104
+ *
1105
+ * @param messagesJson JSON string containing array of messages
1106
+ * @param templateOverride Optional custom template string
1107
+ * @returns Formatted prompt and stop tokens from template
1108
+ * @example
1109
+ * ```typescript
1110
+ * const result = await ctx.formatChat(JSON.stringify([
1111
+ * { role: "system", content: "You are a helpful assistant" },
1112
+ * { role: "user", content: "Hello!" }
1113
+ * ]));
1114
+ *
1115
+ * const tokens = await ctx.tokenize(result.prompt);
1116
+ * await ctx.decode(tokens, 0);
1117
+ * ```
1118
+ */
1119
+ formatChat(
1120
+ messagesJson: string,
1121
+ templateOverride?: string
1122
+ ): Promise<FormattedChatResult>;
1123
+
1124
+ /**
1125
+ * Convert JSON schema to GBNF grammar
1126
+ *
1127
+ * Generates grammar string for constrained JSON generation.
1128
+ * Use with initGrammar() or sample({ grammar }).
1129
+ *
1130
+ * Cost: ~1-10ms depending on schema complexity
1131
+ *
1132
+ * @param schemaJson JSON schema string
1133
+ * @returns GBNF grammar string
1134
+ * @example
1135
+ * ```typescript
1136
+ * const schema = {
1137
+ * type: "object",
1138
+ * properties: {
1139
+ * name: { type: "string" },
1140
+ * age: { type: "number" }
1141
+ * },
1142
+ * required: ["name"]
1143
+ * };
1144
+ *
1145
+ * const grammar = ctx.jsonSchemaToGrammar(JSON.stringify(schema));
1146
+ * ctx.initGrammar(grammar);
1147
+ * ```
1148
+ */
1149
+ jsonSchemaToGrammar(schemaJson: string): string;
1150
+
1151
+ /**
1152
+ * Validate chat template syntax
1153
+ *
1154
+ * Checks if template string is valid before using.
1155
+ *
1156
+ * Cost: ~0.1-1ms
1157
+ *
1158
+ * @param templateString Template string to validate
1159
+ * @returns True if template syntax is valid
1160
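+ * @example Minimal sketch (customTemplate and messagesJson are assumed inputs)
+ * ```typescript
+ * if (await ctx.validateChatTemplate(customTemplate)) {
+ *   const result = await ctx.formatChat(messagesJson, customTemplate);
+ * } else {
+ *   console.warn('Invalid template - falling back to the model default');
+ * }
+ * ```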
+ */
1161
+ validateChatTemplate(templateString: string): Promise<boolean>;
1162
+
1163
+ // ===== EMBEDDING EXTRACTION =====
1164
+
1165
+ /**
1166
+ * Encode tokens for embedding extraction
1167
+ *
1168
+ * Unlike decode(), this marks ALL tokens with logits=true, which is
+ * required for embedding extraction. Use with an embeddings=true context.
1170
+ *
1171
+ * Workflow:
1172
+ * 1. Create context with { embeddings: true, poolingType: PoolingType.MEAN }
1173
+ * 2. Tokenize your text
1174
+ * 3. Clear KV cache (important between different texts!)
1175
+ * 4. Call encode() with tokens
1176
+ * 5. Call getEmbeddings() to get the vector
1177
+ *
1178
+ * Cost: ~5-50ms depending on text length and model
1179
+ *
1180
+ * @param tokens Token IDs from tokenize()
1181
+ * @example
1182
+ * ```typescript
1183
+ * // Create embedding context
1184
+ * const ctx = await createContext({
1185
+ * modelPath: './nomic-embed.gguf',
1186
+ * embeddings: true,
1187
+ * poolingType: PoolingType.MEAN
1188
+ * });
1189
+ *
1190
+ * // Get embedding for text
1191
+ * const tokens = await ctx.tokenize("Hello world");
1192
+ * await ctx.kvCacheClear(); // Important between texts!
1193
+ * await ctx.encode(tokens);
1194
+ * const embedding = ctx.getEmbeddings();
1195
+ * ```
1196
+ */
1197
+ encode(tokens: number[]): Promise<void>;
1198
+
1199
+ /**
1200
+ * Get embedding vector from context (after encode)
1201
+ *
1202
+ * Returns the embedding vector for the encoded text.
1203
+ * Call after encode() to extract embeddings.
1204
+ *
1205
+ * The vector dimension depends on the model (e.g., 768 for nomic-embed).
1206
+ * Use getEmbeddingDimension() to get the size.
1207
+ *
1208
+ * Cost: ~0.5ms (extraction from model state)
1209
+ *
1210
+ * @param normalize Apply L2 normalization (default: true for cosine similarity)
1211
+ * @returns Float32Array of embedding values
1212
+ * @example
1213
+ * ```typescript
1214
+ * await ctx.encode(tokens);
1215
+ *
1216
+ * // Get L2-normalized embedding (for cosine similarity)
1217
+ * const embedding = ctx.getEmbeddings();
1218
+ *
1219
+ * // Or raw embedding without normalization
1220
+ * const rawEmbedding = ctx.getEmbeddings(false);
1221
+ * ```
1222
+ */
1223
+ getEmbeddings(normalize?: boolean): Float32Array;
1224
+
1225
+ /**
1226
+ * Get embedding dimension for model
1227
+ *
1228
+ * Returns the size of embedding vectors this model produces.
1229
+ * Common values: 768 (BERT-like), 1024, 2048, 4096.
1230
+ *
1231
+ * Cost: <0.01ms (fast model property lookup)
1232
+ *
1233
+ * @returns Embedding dimension
1234
+ * @example
1235
+ * ```typescript
1236
+ * const dim = ctx.getEmbeddingDimension();
1237
+ * console.log(`Model produces ${dim}-dimensional embeddings`);
1238
+ * ```
1239
+ */
1240
+ getEmbeddingDimension(): number;
1241
+
1242
+ /**
1243
+ * Check if context has pooling enabled
1244
+ *
1245
+ * Returns true if context was created with embeddings=true and
1246
+ * a pooling type other than NONE.
1247
+ *
1248
+ * Cost: <0.01ms
1249
+ *
1250
+ * @returns True if pooling is enabled
1251
+ */
1252
+ hasPooling(): boolean;
1253
+
1254
+ // ===== NATIVE REFERENCE IMPLEMENTATIONS =====
1255
+
1256
+ /**
1257
+ * Compute entropy of current logits distribution
1258
+ *
1259
+ * Alternative entropy computation using native implementation.
1260
+ * Equivalent to modelEntropy("nats") but may be faster.
1261
+ *
1262
+ * @returns Entropy in nats
1263
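+ *
+ * A small sketch pairing the native helpers (the threshold is illustrative):
+ * ```typescript
+ * await ctx.decode(tokens, position);
+ *
+ * if (ctx.computeEntropy() < 2.0) {
+ *   // Distribution is peaked - a greedy choice is usually safe here
+ *   const token = ctx.greedySample();
+ * }
+ * ```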
+ */
1264
+ computeEntropy(): number;
1265
+
1266
+ /**
1267
+ * Sample greedily from current logits
1268
+ *
1269
+ * Selects token with highest logit value (deterministic).
1270
+ * Equivalent to sample() with temperature=0.
1271
+ *
1272
+ * @returns Token ID with highest probability
1273
+ */
1274
+ greedySample(): number;
1275
+
1276
+ // ===== PROPERTIES =====
1277
+
1278
+ /**
1279
+ * Model vocabulary size (number of possible tokens)
1280
+ *
1281
+ * This is the length of the scores buffer from getTokenScores().
1282
+ */
1283
+ readonly vocabSize: number;
1284
+
1285
+ /**
1286
+ * Memory used by this context (bytes)
1287
+ *
1288
+ * Reports native memory for monitoring.
1289
+ * Includes model weights, KV cache, and context state.
1290
+ */
1291
+ readonly memorySize: number;
1292
+
1293
+ // ===== LIFECYCLE =====
1294
+
1295
+ /**
1296
+ * Free native resources
1297
+ *
1298
+ * Call when done with context to release model and KV cache memory.
1299
+ * Context becomes unusable after disposal.
1300
+ */
1301
+ dispose(): void;
1302
+ }
1303
+
1304
+ /**
1305
+ * Create a new inference context
1306
+ *
1307
+ * @param options Context creation options
1308
+ * @returns Promise resolving to SessionContext instance
1309
+ * @example
1310
+ * ```typescript
1311
+ * const ctx = await createContext({
1312
+ * modelPath: './model.gguf',
1313
+ * nCtx: 2048,
1314
+ * nThreads: 4
1315
+ * });
1316
+ *
1317
+ * try {
1318
+ * const tokens = await ctx.tokenize("Hello");
1319
+ * await ctx.decode(tokens, 0);
1320
+ * const token = ctx.sample({ temperature: 0.7 });
1321
+ * } finally {
1322
+ * ctx.dispose();
1323
+ * }
1324
+ * ```
1325
+ */
1326
+ export function createContext(options: ContextOptions): Promise<SessionContext>;
1327
+
1328
+ /**
1329
+ * Safe logits access with automatic lifetime management
1330
+ *
1331
+ * Ensures logits are only accessed synchronously within the callback.
1332
+ * The callback MUST NOT:
1333
+ * - Store the logits reference
1334
+ * - Return a Promise (will throw)
1335
+ * - Call decode() (would invalidate logits)
1336
+ *
1337
+ * This prevents common bugs where logits become invalid due to
1338
+ * async operations between access and usage.
1339
+ *
1340
+ * How it works:
1341
+ * - Memoization: Multiple getLogits() calls in same step return same buffer
1342
+ * - Revocation: Next decode() invalidates previous buffer
1343
+ *
1344
+ * @template T Return type of the callback
1345
+ * @param ctx The session context
1346
+ * @param fn Synchronous callback that uses logits - must not return a Promise
1347
+ * @returns The result from the callback
1348
+ * @throws Error if callback returns a Promise (async usage not allowed)
1349
+ *
1350
+ * @example Safe synchronous usage
1351
+ * ```typescript
1352
+ * // Compute entropy synchronously
1353
+ * const entropy = withLogits(ctx, (logits) => {
1354
+ * let maxLogit = logits[0];
1355
+ * for (let i = 1; i < logits.length; i++) {
1356
+ * if (logits[i] > maxLogit) maxLogit = logits[i];
1357
+ * }
1358
+ *
1359
+ * let sumExp = 0;
1360
+ * for (let i = 0; i < logits.length; i++) {
1361
+ * sumExp += Math.exp(logits[i] - maxLogit);
1362
+ * }
1363
+ *
1364
+ * let entropy = 0;
1365
+ * for (let i = 0; i < logits.length; i++) {
1366
+ * const p = Math.exp(logits[i] - maxLogit) / sumExp;
1367
+ * if (p > 0) entropy -= p * Math.log(p);
1368
+ * }
1369
+ * return entropy;
1370
+ * });
1371
+ *
1372
+ * // Now safe to decode (previous logits buffer is revoked)
1373
+ * await ctx.decode([nextToken], position++);
1374
+ * ```
1375
+ *
1376
+ * @example Error: async callback
1377
+ * ```typescript
1378
+ * // This will throw!
1379
+ * withLogits(ctx, async (logits) => {
1380
+ * await something(); // NOT ALLOWED
1381
+ * return logits[0];
1382
+ * });
1383
+ * ```
1384
+ */
1385
+ export function withLogits<T>(
1386
+ ctx: SessionContext,
1387
+ fn: (logits: Float32Array) => T
1388
+ ): T;