@lloyal-labs/sdk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1365 @@
1
+ /**
2
+ * lloyal SDK TypeScript Definitions
3
+ *
4
+ * Backend-agnostic type definitions for lloyal inference primitives
5
+ *
6
+ * @categoryDescription Core
7
+ * Entry points, context lifecycle, and the main inference interface.
8
+ *
9
+ * @categoryDescription Sampling
10
+ * Sampler chain configuration — temperature, penalties, nucleus sampling, and advanced filters.
11
+ *
12
+ * @categoryDescription Chat
13
+ * Chat template formatting, output parsing, tool calls, and reasoning extraction.
14
+ *
15
+ * @categoryDescription Branching
16
+ * Parallel and tree-structured generation with batched GPU dispatch.
17
+ */
18
+ /**
19
+ * Native binary variant (CPU or GPU-accelerated) selected at load time
20
+ *
21
+ * Specifies which native binary to load; only 'cuda' and 'vulkan' are GPU-accelerated:
22
+ * - 'default': CPU-only (works everywhere)
23
+ * - 'cuda': NVIDIA CUDA (requires libcudart.so/cudart64.dll)
24
+ * - 'vulkan': Vulkan (AMD/Intel/NVIDIA, requires Vulkan runtime)
25
+ *
26
+ * If the requested variant is unavailable (package not installed or
27
+ * runtime libraries missing), loading automatically falls back to CPU.
28
+ *
29
+ * @category Core
30
+ */
31
+ export type GpuVariant = 'default' | 'cuda' | 'vulkan';
32
+ /**
33
+ * Supported KV cache data types (floating-point and quantized)
34
+ *
35
+ * Matches llama.cpp CLI `-ctk` / `-ctv` flags.
36
+ * Lower precision = less KV cache memory, at a slight quality tradeoff.
37
+ * Used by {@link ContextOptions.typeK} and {@link ContextOptions.typeV}.
38
+ * @category Core
39
+ */
40
+ export type KvCacheType = 'f32' | 'f16' | 'bf16' | 'q8_0' | 'q4_0' | 'q4_1' | 'iq4_nl' | 'q5_0' | 'q5_1';
41
+ /**
42
+ * Options for native binary loading
43
+ *
44
+ * Controls which native binary variant is loaded when creating a context.
45
+ * Pass as the second argument to `createContext` (see the example below) to
46
+ * request a GPU variant explicitly; loading falls back to CPU automatically.
47
+ * @category Core
48
+ */
49
+ export interface LoadOptions {
50
+ /**
51
+ * GPU variant to use
52
+ *
53
+ * - 'cuda': NVIDIA CUDA (requires libcudart.so/cudart64.dll)
54
+ * - 'vulkan': Vulkan (AMD/Intel/NVIDIA, requires Vulkan runtime)
55
+ * - 'default' or undefined: CPU only
56
+ *
57
+ * If the requested variant is unavailable (missing runtime libraries),
58
+ * loading automatically falls back to CPU and emits a console warning.
59
+ *
60
+ * @example
61
+ * ```typescript
62
+ * // Request CUDA with automatic fallback to CPU
63
+ * const ctx = await createContext(
64
+ * { modelPath: './model.gguf' },
65
+ * { gpuVariant: 'cuda' }
66
+ * );
67
+ * ```
68
+ */
69
+ gpuVariant?: GpuVariant;
70
+ }
71
+ /**
72
+ * Pooling strategy for embedding extraction.
73
+ * Selected via {@link ContextOptions.poolingType}; only relevant when `embeddings` is true.
74
+ * @category Core
75
+ */
76
+ export declare enum PoolingType {
77
+ /** No pooling — raw per-token embeddings */
78
+ NONE = 0,
79
+ /** Mean pooling — average of all token embeddings */
80
+ MEAN = 1,
81
+ /** CLS pooling — use the first token's embedding */
82
+ CLS = 2,
83
+ /** Last-token pooling — use the last token's embedding */
84
+ LAST = 3,
85
+ /** Rank pooling — classification head output for reranking models */
86
+ RANK = 4
87
+ }
88
+ /**
89
+ * Chat format detected by the template engine
90
+ *
91
+ * Identifies how the model formats tool calls, reasoning blocks, and content.
92
+ * It is an opaque identifier returned by
93
+ * {@link SessionContext.formatChat | formatChat()} and consumed by
94
+ * {@link SessionContext.parseChatOutput | parseChatOutput()}.
95
+ *
96
+ * Maps 1:1 to llama.cpp's `common_chat_format` enum (30+ values).
97
+ * Treat it as an opaque number — pass it through rather than switching on it;
98
+ * the exported `CHAT_FORMAT_*` constants identify two special cases.
99
+ * @category Chat
100
+ */
101
+ export type ChatFormat = number;
102
+ /** {@link ChatFormat} value meaning the model template has no tool/structured-output support. */
103
+ export declare const CHAT_FORMAT_CONTENT_ONLY: ChatFormat;
104
+ /** {@link ChatFormat} value for llama.cpp's generic JSON fallback — imposes a format the model wasn't trained on. */
105
+ export declare const CHAT_FORMAT_GENERIC: ChatFormat;
106
+ /**
107
+ * Reasoning/thinking block format
108
+ *
109
+ * Controls how `<think>` blocks are handled during formatting and parsing.
110
+ *
111
+ * @see {@link FormatChatOptions.reasoningFormat} for input-side usage (takes lowercase string names, not this enum)
112
+ * @see {@link ParseChatOutputOptions.reasoningFormat} for output-side usage (takes this enum)
113
+ *
114
+ * @category Chat
115
+ */
116
+ export declare enum ReasoningFormat {
117
+ /** No reasoning extraction (default) */
118
+ NONE = 0,
119
+ /** Auto-detect the reasoning format from the model template */
120
+ AUTO = 1,
121
+ /** DeepSeek legacy format (`<think>...</think>` in content) */
122
+ DEEPSEEK_LEGACY = 2,
123
+ /** DeepSeek format (structured reasoning extraction) */
124
+ DEEPSEEK = 3
125
+ }
126
+ /**
127
+ * Grammar trigger type
128
+ *
129
+ * Determines how a {@link GrammarTrigger} is matched to activate a lazy grammar during generation.
130
+ *
131
+ * @see {@link GrammarTrigger}
132
+ * @see {@link FormattedChatResult.grammarTriggers}
133
+ *
134
+ * @category Chat
135
+ */
136
+ export declare enum GrammarTriggerType {
137
+ /** Trigger on a specific token ID (see {@link GrammarTrigger.token}) */
138
+ TOKEN = 0,
139
+ /** Trigger on a word boundary match (word in {@link GrammarTrigger.value}) */
140
+ WORD = 1,
141
+ /** Trigger on a regex pattern match (pattern in {@link GrammarTrigger.value}) */
142
+ PATTERN = 2,
143
+ /** Trigger on a full-string regex pattern match (pattern in {@link GrammarTrigger.value}) */
144
+ PATTERN_FULL = 3
145
+ }
146
+ /**
147
+ * Configuration for context creation
148
+ *
149
+ * Controls the resource envelope for inference: context window size (`nCtx`),
150
+ * batch throughput (`nBatch`), compute parallelism (`nThreads`), and
151
+ * multi-sequence capacity (`nSeqMax`). These map directly to
152
+ * `llama_context_params` and are fixed for the context's lifetime.
153
+ *
154
+ * Key tradeoffs:
155
+ * - **nCtx**: Larger = longer conversations, but linear KV memory growth.
156
+ * - **nBatch**: Larger = faster prompt prefill (more tokens per GPU dispatch),
157
+ * but higher peak memory. Also sets the bin-packing capacity for
158
+ * {@link BranchStore.prefill}.
159
+ * - **nSeqMax**: Set ≥ your max concurrent branch count + 1 (root sequence).
160
+ * Each sequence shares the same KV cache memory pool — cost is metadata only
161
+ * under unified KV, not a per-sequence memory multiplier.
162
+ *
163
+ * @category Core
164
+ */
165
+ export interface ContextOptions {
166
+ /** Path to the .gguf model file */
167
+ modelPath: string;
168
+ /** Context window size (default: 2048) */
169
+ nCtx?: number;
170
+ /** Number of compute threads (default: 4) */
171
+ nThreads?: number;
172
+ /**
173
+ * Batch size for token processing
174
+ *
175
+ * Controls how many tokens are processed per llama_decode call.
176
+ * Higher values improve throughput for prompt prefill at the cost of memory.
177
+ * Also sets llama_context_params.n_batch and n_ubatch at context creation.
178
+ * Default: 512
179
+ */
180
+ nBatch?: number;
181
+ /**
182
+ * Enable embedding extraction mode
183
+ *
184
+ * When true, the context is optimized for embedding extraction.
185
+ * Use with encode() and getEmbeddings() methods.
186
+ * Default: false (text generation mode)
187
+ */
188
+ embeddings?: boolean;
189
+ /**
190
+ * Pooling type for embedding extraction
191
+ *
192
+ * Only relevant when embeddings=true.
193
+ * Default: MEAN for embedding contexts, NONE otherwise
194
+ */
195
+ poolingType?: PoolingType;
196
+ /**
197
+ * Maximum number of sequences for multi-sequence support
198
+ *
199
+ * Set > 1 to enable multiple independent KV cache sequences.
200
+ * Useful for parallel decoding or conversation branching; size it to your max concurrent branch count + 1 (root sequence).
201
+ * Default: 1 (single sequence)
202
+ */
203
+ nSeqMax?: number;
204
+ /**
205
+ * KV cache data type for keys
206
+ *
207
+ * Quantize the key cache to reduce GPU memory. For a Q4_K_M model,
208
+ * F16 cache wastes precision — Q8_0 halves memory with minimal quality loss.
209
+ *
210
+ * Combined K + V cache memory at nCtx=8192 (Qwen3-4B, 36 layers, 8 KV heads, 128 dim), with typeK and typeV set alike:
211
+ * f16: 1152 MB q8_0: ~576 MB q4_0: ~288 MB
212
+ *
213
+ * Default: 'f16'
214
+ */
215
+ typeK?: KvCacheType;
216
+ /**
217
+ * KV cache data type for values
218
+ *
219
+ * Same options as `typeK`. V cache is slightly more quality-sensitive than K.
220
+ * Default: 'f16'
221
+ */
222
+ typeV?: KvCacheType;
223
+ }
224
+ /**
225
+ * Options for chat template formatting
226
+ *
227
+ * Controls format-awareness fields passed to the chat template engine.
228
+ * All fields are optional — sensible defaults are used when omitted.
229
+ *
230
+ * @example With tools and reasoning
231
+ * ```typescript
232
+ * const result = await ctx.formatChat(messagesJson, {
233
+ * tools: JSON.stringify(tools),
234
+ * toolChoice: 'auto',
235
+ * reasoningFormat: 'auto',
236
+ * });
237
+ * ```
238
+ *
239
+ * @category Chat
240
+ */
241
+ export interface FormatChatOptions {
242
+ /** Custom Jinja2 template override (bypasses the model's built-in template) */
243
+ templateOverride?: string;
244
+ /**
245
+ * JSON-serialized array (a string) of OpenAI-format tool definitions
246
+ *
247
+ * @example
248
+ * ```typescript
249
+ * const tools = [{ type: 'function', function: {
250
+ * name: 'get_weather',
251
+ * description: 'Get current weather',
252
+ * parameters: { type: 'object', properties: { location: { type: 'string' } } }
253
+ * }}];
254
+ * options.tools = JSON.stringify(tools);
255
+ * ```
256
+ */
257
+ tools?: string;
258
+ /** Tool choice strategy (default: "auto") */
259
+ toolChoice?: 'auto' | 'required' | 'none';
260
+ /** Allow parallel tool calls (default: false) */
261
+ parallelToolCalls?: boolean;
262
+ /**
263
+ * Reasoning format, given as a string name (default: "none")
264
+ *
265
+ * Controls `<think>` block handling in the template.
266
+ * Use "auto" to let the model's template decide.
267
+ */
268
+ reasoningFormat?: 'none' | 'auto' | 'deepseek' | 'deepseek_legacy';
269
+ /** Enable `<think>` blocks (default: true). Pairs with `reasoningFormat`. */
270
+ enableThinking?: boolean;
271
+ /**
272
+ * JSON schema (as a string) for constrained output. Converted to a GBNF grammar internally.
273
+ * Mutually exclusive with `grammar`.
274
+ *
275
+ * @see {@link SessionContext.jsonSchemaToGrammar}
276
+ */
277
+ jsonSchema?: string;
278
+ /**
279
+ * Explicit GBNF grammar string for constrained generation.
280
+ * Mutually exclusive with `jsonSchema`.
281
+ */
282
+ grammar?: string;
283
+ /**
284
+ * Append the assistant prompt prefix (default: true).
285
+ * Set to false when formatting partial conversations or for
286
+ * non-generation use cases like template validation.
287
+ */
288
+ addGenerationPrompt?: boolean;
289
+ }
290
+ /**
291
+ * Grammar trigger from format-aware chat template
292
+ *
293
+ * Defines a condition for lazy grammar activation. When `grammarLazy` is true
294
+ * in {@link FormattedChatResult}, generation runs unconstrained until one of
295
+ * its `grammarTriggers` fires, at which point the grammar is activated.
296
+ *
297
+ * @category Chat
298
+ */
299
+ export interface GrammarTrigger {
300
+ /** How this trigger is matched (see {@link GrammarTriggerType}) */
301
+ type: GrammarTriggerType;
302
+ /** Trigger value (token text, word, or regex pattern depending on `type`) */
303
+ value: string;
304
+ /** Token ID (for TOKEN-type triggers; -1 when not applicable) */
305
+ token: number;
306
+ }
307
+ /**
308
+ * Result from chat template formatting
309
+ *
310
+ * Includes format-awareness fields for proper output parsing.
311
+ * Pass `format`, plus `reasoningFormat`, `thinkingForcedOpen`, and `parser` as options, to
312
+ * {@link SessionContext.parseChatOutput | parseChatOutput()} to decode
313
+ * the model's response.
314
+ *
315
+ * @example Roundtrip: format -> generate -> parse
316
+ * ```typescript
317
+ * const fmt = await ctx.formatChat(messagesJson, { tools: toolsJson });
318
+ * // ... generate tokens using fmt.prompt and fmt.grammar ...
319
+ * const parsed = ctx.parseChatOutput(output, fmt.format, {
320
+ * reasoningFormat: fmt.reasoningFormat,
321
+ * thinkingForcedOpen: fmt.thinkingForcedOpen,
322
+ * parser: fmt.parser,
323
+ * });
324
+ * ```
325
+ *
326
+ * @see {@link SessionContext.parseChatOutput}
327
+ *
328
+ * @category Chat
329
+ */
330
+ export interface FormattedChatResult {
331
+ /** Formatted prompt string, ready for tokenization */
332
+ prompt: string;
333
+ /** Additional stop strings from the template (strings, despite the field name) */
334
+ stopTokens: string[];
335
+ /**
336
+ * Detected chat format (pass to parseChatOutput as its format argument)
337
+ * @see {@link SessionContext.parseChatOutput}
338
+ */
339
+ format: ChatFormat;
340
+ /** Grammar string for constrained generation (empty if no tools/schema) */
341
+ grammar: string;
342
+ /** Whether the grammar should be applied lazily (only after one of `grammarTriggers` fires) */
343
+ grammarLazy: boolean;
344
+ /** Whether the thinking tag was forced open by the template (pass to parseChatOutput options) */
345
+ thinkingForcedOpen: boolean;
346
+ /**
347
+ * Reasoning format (pass to parseChatOutput options)
348
+ * @see {@link ParseChatOutputOptions.reasoningFormat}
349
+ */
350
+ reasoningFormat: ReasoningFormat;
351
+ /** PEG parser definition for PEG format models (pass to parseChatOutput options) */
352
+ parser: string;
353
+ /** Grammar trigger conditions for lazy grammar activation (see `grammarLazy`) */
354
+ grammarTriggers: GrammarTrigger[];
355
+ /** Token strings preserved from grammar masking */
356
+ preservedTokens: string[];
357
+ }
358
+ /**
359
+ * Options for {@link SessionContext.parseChatOutput | parseChatOutput()}
360
+ *
361
+ * All fields are optional. For correct parsing, pass through the corresponding
362
+ * fields from {@link FormattedChatResult}.
363
+ *
364
+ * @see {@link FormattedChatResult}
365
+ *
366
+ * @category Chat
367
+ */
368
+ export interface ParseChatOutputOptions {
369
+ /**
370
+ * Reasoning format (from {@link FormattedChatResult.reasoningFormat})
371
+ */
372
+ reasoningFormat?: ReasoningFormat;
373
+ /**
374
+ * True if the output is incomplete (e.g. mid-stream).
375
+ * When true, the parser tolerates unterminated tool calls and open
376
+ * thinking blocks, returning partial content as-is rather than
377
+ * treating them as parse errors.
378
+ */
379
+ isPartial?: boolean;
380
+ /** Whether the thinking tag was forced open (from {@link FormattedChatResult.thinkingForcedOpen}) */
381
+ thinkingForcedOpen?: boolean;
382
+ /** PEG parser definition for PEG format models (from {@link FormattedChatResult.parser}) */
383
+ parser?: string;
384
+ }
385
+ /**
386
+ * A tool call extracted from model output (an element of {@link ParseChatOutputResult.toolCalls})
387
+ *
388
+ * @example
389
+ * ```typescript
390
+ * for (const tc of result.toolCalls) {
391
+ * const args = JSON.parse(tc.arguments);
392
+ * await executeTool(tc.name, args);
393
+ * }
394
+ * ```
395
+ *
396
+ * @category Chat
397
+ */
398
+ export interface ParsedToolCall {
399
+ /** Tool/function name */
400
+ name: string;
401
+ /** Arguments as a JSON string (decode with `JSON.parse`) */
402
+ arguments: string;
403
+ /** Tool call ID (may be empty depending on model format) */
404
+ id: string;
405
+ }
406
+ /**
407
+ * Result of {@link SessionContext.parseChatOutput | parseChatOutput()}
408
+ *
409
+ * @example
410
+ * ```typescript
411
+ * const result = ctx.parseChatOutput(output, fmt.format);
412
+ * if (result.toolCalls.length > 0) {
413
+ * for (const tc of result.toolCalls) {
414
+ * const args = JSON.parse(tc.arguments);
415
+ * await executeTool(tc.name, args);
416
+ * }
417
+ * } else {
418
+ * console.log(result.content);
419
+ * }
420
+ * ```
421
+ *
422
+ * @category Chat
423
+ */
424
+ export interface ParseChatOutputResult {
425
+ /** Main response text */
426
+ content: string;
427
+ /**
428
+ * Extracted thinking/reasoning content (empty string if none).
429
+ * For thinking models (e.g. Qwen3), this contains the text inside
430
+ * `<think>...</think>` blocks. Store it as `reasoning_content` in your
431
+ * messages array so formatChat() can reconstruct the template correctly
432
+ * on subsequent turns.
433
+ */
434
+ reasoningContent: string;
435
+ /** Extracted tool calls (empty array if none) */
436
+ toolCalls: ParsedToolCall[];
437
+ }
438
+ /**
439
+ * Penalty parameters for repetition control (see {@link SamplingParams.penalties})
440
+ *
441
+ * @category Sampling
442
+ */
443
+ export interface PenaltyParams {
444
+ /** Repetition penalty (1.0 = disabled, >1.0 = penalize repeats) */
445
+ repeat?: number;
446
+ /** Frequency penalty (0.0 = disabled) */
447
+ frequency?: number;
448
+ /** Presence penalty (0.0 = disabled) */
449
+ presence?: number;
450
+ /** Number of last tokens to consider for penalties (-1 = context size) */
451
+ lastN?: number;
452
+ }
453
+ /**
454
+ * Mirostat sampling configuration
455
+ *
456
+ * Mirostat dynamically adjusts sampling to maintain a target perplexity,
457
+ * preventing both repetition and incoherence. Useful for long-form generation
458
+ * where temperature alone produces inconsistent quality.
459
+ *
460
+ * Use Mirostat v2 (mode: 2) for most cases — it's more stable than v1.
461
+ *
462
+ * @category Sampling
463
+ */
464
+ export interface MirostatParams {
465
+ /** Mirostat mode (0 = disabled, 1 = v1, 2 = v2). Recommended: 2 */
466
+ mode?: number;
467
+ /** Target entropy (perplexity = exp(tau)). Default: 5.0. Lower = more focused */
468
+ tau?: number;
469
+ /** Learning rate for entropy adjustment. Default: 0.1. Higher = faster adaptation */
470
+ eta?: number;
471
+ }
472
+ /**
473
+ * DRY (Don't Repeat Yourself) sampling parameters
474
+ *
475
+ * Penalizes repeated token sequences — a more sophisticated alternative to the
476
+ * simple repetition penalty ({@link PenaltyParams.repeat}). Useful for reducing
477
+ * loops and redundancy in generated text.
478
+ *
479
+ * @category Sampling
480
+ */
481
+ export interface DryParams {
482
+ /** Penalty strength (0.0 = disabled, higher = stronger penalty) */
483
+ multiplier?: number;
484
+ /** Base penalty value (typically 1.75) */
485
+ base?: number;
486
+ /** Minimum sequence length to trigger penalty (typically 2) */
487
+ allowedLength?: number;
488
+ /** Number of recent tokens to scan for repetitions */
489
+ penaltyLastN?: number;
490
+ }
491
+ /**
492
+ * XTC (eXclude Top Choices) sampler parameters
493
+ *
494
+ * Excludes very high-probability tokens to increase output diversity.
495
+ * Useful when the model is overly confident and produces repetitive text.
496
+ *
497
+ * @category Sampling
498
+ */
499
+ export interface XtcParams {
500
+ /** Probability of applying XTC (0.0 = disabled, 1.0 = always). Typical: 0.1 */
501
+ probability?: number;
502
+ /** Confidence threshold above which tokens are excluded. Typical: 0.1 */
503
+ threshold?: number;
504
+ }
505
+ /**
506
+ * Advanced sampling parameters, supplied via {@link SamplingParams.advanced}
507
+ *
508
+ * @category Sampling
509
+ */
510
+ export interface AdvancedSamplingParams {
511
+ /** Locally typical sampling (1.0 = disabled) */
512
+ typicalP?: number;
513
+ /** Mirostat sampling configuration */
514
+ mirostat?: MirostatParams;
515
+ /** DRY (Don't Repeat Yourself) sampling configuration */
516
+ dry?: DryParams;
517
+ /** XTC (eXclude Top Choices) sampler configuration */
518
+ xtc?: XtcParams;
519
+ }
520
+ /**
521
+ * Sampling parameters for token generation
522
+ *
523
+ * Configures the sampler chain — a pipeline of composable filters and
524
+ * transforms applied to raw logits before token selection. The chain is
525
+ * built once at branch creation and persists across decode steps
526
+ * (penalty state accumulates, PRNG advances).
527
+ *
528
+ * **Chain order**: penalties → top_k → typical_p → top_p → min_p →
529
+ * temperature → dist (stochastic) or greedy (temperature ≤ 0).
530
+ *
531
+ * For tree search, each {@link Branch} owns an independent clone of the
532
+ * chain. `reseedSampler()` replaces the terminal dist sampler's PRNG seed
533
+ * so forked branches diverge. Greedy chains (temperature ≤ 0) are
534
+ * deterministic and unaffected by reseeding.
535
+ *
536
+ * Common presets:
537
+ * - Factual/Precise: `{ temperature: 0.1 }`
538
+ * - Balanced: `{ temperature: 0.7 }`
539
+ * - Creative: `{ temperature: 1.0 }`
540
+ * - Deterministic greedy: `{ temperature: 0, topK: 0, topP: 1.0, minP: 0 }`
541
+ *
542
+ * @category Sampling
543
+ */
544
+ export interface SamplingParams {
545
+ /** Randomness (≤ 0 = greedy, always the most likely token; 2.0 = very random) */
546
+ temperature?: number;
547
+ /** Only consider the top K most likely tokens (0 = disabled) */
548
+ topK?: number;
549
+ /** Nucleus sampling threshold (1.0 = disabled) */
550
+ topP?: number;
551
+ /** Minimum probability threshold (0 in the greedy preset, alongside the other disabled filters) */
552
+ minP?: number;
553
+ /** Random seed for reproducible generation (-1 = random) */
554
+ seed?: number;
555
+ /** GBNF grammar string for constrained generation */
556
+ grammar?: string;
557
+ /** Penalty parameters for repetition control (first stage of the chain) */
558
+ penalties?: PenaltyParams;
559
+ /** Advanced sampling parameters (typical-p, Mirostat, DRY, XTC) */
560
+ advanced?: AdvancedSamplingParams;
561
+ }
562
+ /**
563
+ * Inference context — the runtime surface for a loaded model
564
+ *
565
+ * A SessionContext owns a llama_context (KV cache + compute graph) bound to a
566
+ * shared model. It provides tokenization, logit access, KV cache management,
567
+ * chat template formatting, and embedding extraction.
568
+ *
569
+ * **All generation flows through {@link Branch}.** Create a branch at position 0,
570
+ * prefill prompt tokens, then use the produce/commit loop or async iterator:
571
+ *
572
+ * ```typescript
573
+ * const branch = Branch.create(ctx, 0, { temperature: 0.7 });
574
+ * await branch.prefill(promptTokens);
575
+ * for await (const { token, text } of branch) {
576
+ * process.stdout.write(text);
577
+ * }
578
+ * ```
579
+ *
580
+ * For tree-structured generation (best-of-N, beam search, speculative
581
+ * decoding), use {@link Branch.fork} and {@link BranchStore} — they manage
582
+ * per-branch KV sequences, sampler chains, and logits snapshots with O(1)
583
+ * GPU dispatches via batched decode.
584
+ *
585
+ * **Logits**: For branch-level logits, use {@link Branch.getLogits} which
586
+ * returns an independent copy of the branch's snapshot. For metrics, use
587
+ * {@link Branch.modelEntropy} and {@link Branch.modelSurprisal} which
588
+ * operate directly on the branch's logits without JS round-trips.
589
+ *
590
+ * **KV cache**: Supports multi-sequence operation (`nSeqMax > 1`), per-sequence
591
+ * copy/clear/eviction, file-based persistence, and context compression via
592
+ * `clearAndReseed()`.
593
+ *
594
+ * **Chat templates**: `formatChat()` and `parseChatOutput()` handle the full
595
+ * round-trip of chat formatting, including tool calls, reasoning blocks, and
596
+ * grammar-constrained generation — using the model's native Jinja template.
597
+ *
598
+ * Use {@link createContext} to initialize, and `dispose()` when done to free
599
+ * GPU/CPU memory.
600
+ *
601
+ * @category Core
602
+ */
603
+ export interface SessionContext {
604
+ /**
605
+ * Convert token ID to text piece
606
+ *
607
+ * Fast synchronous lookup in vocabulary table.
608
+ * Call this on each generated token for streaming display.
609
+ *
610
+ * Optimized for per-token conversion during generation.
611
+ * For batch conversion of many tokens, use detokenize() instead.
612
+ *
613
+ * Cost: ~0.05ms
614
+ *
615
+ * @param token Token ID
616
+ * @returns Text string for this token
617
+ */
618
+ tokenToText(token: number): string;
619
+ /**
620
+ * Check if token is a model stop token
621
+ *
622
+ * Returns true for built-in end-of-generation tokens:
623
+ * - </s> (Llama 2)
624
+ * - <|endoftext|> (GPT)
625
+ * - <|eot_id|> (Llama 3)
626
+ * - Model-specific EOS tokens
627
+ *
628
+ * Note: This checks vocabulary stop tokens, not custom stop sequences.
629
+ * For custom stops (e.g., "\n\n", "###"), compare generated text
630
+ * against your stop strings in application code.
631
+ *
632
+ * Cost: <0.01ms (fast vocabulary lookup)
633
+ *
634
+ * @param token Token ID to check
635
+ */
636
+ isStopToken(token: number): boolean;
637
+ /**
638
+ * Get the model's end-of-generation token ID
639
+ *
640
+ * Returns the EOT token (e.g. <|im_end|> for ChatML), falling back
641
+ * to EOS (e.g. </s>) for Zephyr-style models. This is the inverse
642
+ * of isStopToken() — "what IS the stop token?" vs "is this a stop token?"
643
+ *
644
+ * Use case: warm multi-turn continuation prepends this token to close
645
+ * the previous assistant turn before injecting new user content.
646
+ *
647
+ * @returns Token ID (integer)
648
+ * @throws If model has neither EOT nor EOS token
649
+ */
650
+ getEogToken(): number;
651
+ /**
652
+ * Get the model's turn separator token IDs
653
+ *
654
+ * Returns the tokens that close an assistant turn and transition to the
655
+ * next message, as determined by the model's chat template. Computed once
656
+ * per model, cached.
657
+ *
658
+ * For ChatML templates: [im_end_id, newline_id] (e.g., [2, 198])
659
+ * For Llama 3 templates: [eot_id] (e.g., [128009])
660
+ *
661
+ * Use case: warm multi-turn prefill to achieve exact parity with cold path.
662
+ *
663
+ * @returns Array of token IDs (cached after first call)
664
+ *
665
+ * @example
666
+ * ```typescript
667
+ * const separator = ctx.getTurnSeparator();
668
+ * console.log(separator.map(t => ctx.tokenToText(t)).join('')); // "<|im_end|>\n"
669
+ *
670
+ * // Warm prefill with exact cold/warm parity
671
+ * const deltaTokens = await ctx.tokenize(deltaPrompt, false);
672
+ * await branch.prefill([...separator, ...deltaTokens]);
673
+ * ```
674
+ */
675
+ getTurnSeparator(): number[];
676
+ /**
677
+ * Tokenize text into model's vocabulary
678
+ *
679
+ * Converts human text → token IDs for decode().
680
+ * Same text always produces same tokens for a given model.
681
+ *
682
+ * Cost: ~1ms per 100 characters
683
+ *
684
+ * @param text Text to tokenize
685
+ * @param addSpecial Whether to add special tokens (BOS/EOS). Defaults to
686
+ * model metadata setting (typically true). Pass false for mid-sequence
687
+ * tokenization (e.g., warm multi-turn continuation deltas).
688
+ * @returns Array of token IDs
689
+ * @example
690
+ * ```typescript
691
+ * // Full sequence (default — includes BOS)
692
+ * const tokens = await ctx.tokenize("Hello world");
693
+ *
694
+ * // Mid-sequence delta (no BOS)
695
+ * const delta = await ctx.tokenize("continuation text", false);
696
+ * ```
697
+ */
698
+ tokenize(text: string, addSpecial?: boolean): Promise<number[]>;
699
+ /**
700
+ * Tokenize text into model's vocabulary (sync — inline on main thread)
701
+ *
702
+ * Same as {@link tokenize} but synchronous. Use from Effection generators
703
+ * to avoid `yield* call()` overhead for CPU-only work.
704
+ *
705
+ * @param text Text to tokenize
706
+ * @param addSpecial Whether to add special tokens (BOS/EOS). Defaults to
707
+ * model metadata setting (typically true). Pass false for mid-sequence
708
+ * tokenization.
709
+ * @returns Array of token IDs
710
+ */
711
+ tokenizeSync(text: string, addSpecial?: boolean): number[];
712
+ /**
713
+ * Detokenize array of tokens back to text
714
+ *
715
+ * Inverse of tokenize(). Use for reconstructing complete text
716
+ * from token sequences (e.g., after KV cache operations).
717
+ *
718
+ * Optimized for batch conversion of many tokens.
719
+ * For single-token conversion during generation, use tokenToText().
720
+ *
721
+ * Cost: ~1ms per 100 tokens
722
+ *
723
+ * @param tokens Array of token IDs
724
+ * @returns Complete text representation
725
+ * @example
726
+ * ```typescript
727
+ * const tokens = [15496, 1917]; // "Hello world"
728
+ * const text = await ctx.detokenize(tokens);
729
+ * console.log(text); // "Hello world"
730
+ * ```
731
+ */
732
+ detokenize(tokens: number[]): Promise<string>;
733
+ /**
734
+ * Get max position in the KV cache for a sequence
735
+ *
736
+ * Returns the highest position index in the specified sequence,
737
+ * or -1 if the sequence is empty. This is the same value as
738
+ * {@link kvSeqPosMax}. To get the token count, add 1.
739
+ *
740
+ * Think of this as: "How much has the model read so far?"
741
+ *
742
+ * Cost: <0.01ms (fast sync operation - safe to call frequently)
743
+ *
744
+ * @param sequenceId Sequence ID (defaults to 0 for single conversation)
745
+ * @returns Highest position index, or -1 if empty
746
+ */
747
+ kvCacheSize(sequenceId?: number): number;
748
+ /**
749
+ * Remove token range from KV cache
750
+ *
751
+ * Deletes tokens from model's memory. Use cases:
752
+ * - Removing old context when hitting limit (sliding window)
753
+ * - Implementing conversation pruning
754
+ * - Forgetting specific messages
755
+ * - Preparing for injection of new context
756
+ *
757
+ * CRITICAL: Call BEFORE next decode(), not after!
758
+ * The model needs to know about the removal before processing new tokens.
759
+ *
760
+ * Cost: ~1-5ms depending on range
761
+ *
762
+ * @param sequenceId Sequence ID (use 0 for single sequence)
763
+ * @param start Start position (inclusive)
764
+ * @param end End position (exclusive), -1 = to end
765
+ */
766
+ kvCacheRemove(sequenceId: number, start: number, end: number): Promise<void>;
767
+ /**
768
+ * Snapshot KV cache state for branching/undo
769
+ *
770
+ * Serializes entire model state to Buffer.
771
+ * Restore later with kvCacheLoad() for:
772
+ * - Conversation branching ("what if I said X instead?")
773
+ * - Undo/redo functionality
774
+ * - Checkpointing long conversations
775
+ *
776
+ * Size: ~500MB-2GB depending on context length and model
777
+ *
778
+ * Cost: ~100-500ms depending on cache size
779
+ *
780
+ * @param sequenceId Sequence ID (use 0 for single sequence)
781
+ * @returns Serialized state buffer
782
+ */
783
+ kvCacheSave(sequenceId?: number): Promise<Buffer>;
784
+ /**
785
+ * Restore KV cache from previous snapshot
786
+ *
787
+ * Loads saved model state. Context returns to exact state
788
+ * when snapshot was taken.
789
+ *
790
+ * Cost: ~100-500ms depending on snapshot size
791
+ *
792
+ * @param sequenceId Sequence ID (use 0 for single sequence)
793
+ * @param state Buffer from kvCacheSave()
794
+ * @example
795
+ * ```typescript
796
+ * const snapshot = await ctx.kvCacheSave(0);
797
+ *
798
+ * // ... many operations later ...
799
+ *
800
+ * // Restore to saved state
801
+ * await ctx.kvCacheLoad(0, snapshot);
802
+ * ```
803
+ */
804
+ kvCacheLoad(sequenceId: number, state: Buffer): Promise<void>;
805
+ /**
806
+ * Clear all KV cache (fresh start)
807
+ *
808
+ * Removes all cached tokens. Model returns to initial state
809
+ * as if no text has been processed.
810
+ *
811
+ * Use when starting a completely new conversation.
812
+ *
813
+ * Cost: ~1ms
814
+ */
815
+ kvCacheClear(): Promise<void>;
816
+ /**
817
+ * Blink KV — cache-local reconstruction for bounded-memory streaming
818
+ *
819
+ * Implements the [Blink KV](https://github.com/lloyal-ai/blink-kv/blob/main/blink_kv.pdf)
820
+ * protocol (Naqvi, 2026): when the KV cache fills, clear it entirely and
821
+ * re-decode retained tokens at contiguous positions `[0, 1, ..., N-1]`.
822
+ * This achieves cache-local position IDs — the operative requirement for
823
+ * stable bounded-memory streaming — without backend-specific knowledge of
824
+ * key storage format. Works on post-RoPE engines (where StreamingLLM's
825
+ * pos-shift is unavailable) and any backend exposing `clear()` + `decode()`.
826
+ *
827
+ * **Why not naive eviction?** Selective eviction (`kvCacheRemove`) preserves
828
+ * original position IDs, which grow without bound. Across 5 architectures,
829
+ * naive eviction produces perplexity (PPL) spanning 3 orders of magnitude — ranging from
830
+ * 1.15x baseline (Llama, lucky config) to 198x (Phi, sinks present).
831
+ * Under Blink KV reconstruction, all 5 converge to within 3-16% of baseline.
832
+ *
833
+ * **Sinks are optional.** Under reconstruction, the 0+N (sinkless) config
834
+ * matches 4+N (with sinks) within <2% across all tested architectures.
835
+ * Pass an empty sinks array if you don't need them.
836
+ *
837
+ * **Algorithm:**
838
+ * 1. Clear entire KV cache (zero fragmentation)
839
+ * 2. Re-decode `sinks` at position 0 (optional attention anchors)
840
+ * 3. Re-decode `tail` at position `sinks.length` (recent context)
841
+ *
842
+ * **Cost:** Re-decodes `sinks.length + tail.length` tokens. With a per-boundary
843
+ * trigger (reconstruct when cache reaches `nCtx`), amortized cost is
844
+ * O(cacheSize / interval) decode ops per token — ~0.14 at typical settings.
845
+ *
846
+ * @param sinks First N tokens from conversation start (typically 4, or empty).
847
+ * Must be the same tokens on every reseed — passing different tokens degrades
848
+ * any attention-sink patterns the model may have learned for early positions.
849
+ * @param tail Recent M tokens to preserve (typically 252-1020)
850
+ * @returns Promise that resolves when reconstruction completes.
851
+ * Next decode continues at position `sinks.length + tail.length`.
852
+ *
853
+ * @example Per-boundary reconstruction
854
+ * ```typescript
855
+ * // Capture sinks once at conversation start
856
+ * const SINKS = allTokens.slice(0, 4);
857
+ *
858
+ * // On cache fill: compress to 512 tokens (4 sinks + 508 tail)
859
+ * if (position >= ctx.nCtx) {
860
+ * const tail = allTokens.slice(-508);
861
+ * await ctx.clearAndReseed(SINKS, tail);
862
+ * position = 512; // sinks.length + tail.length
863
+ * }
864
+ * ```
865
+ *
866
+ * @example Sinkless reconstruction (equally effective)
867
+ * ```typescript
868
+ * const tail = allTokens.slice(-256);
869
+ * await ctx.clearAndReseed([], tail); // No sinks needed
870
+ * position = 256;
871
+ * ```
872
+ *
873
+ * @see [Blink KV paper](https://github.com/lloyal-ai/blink-kv/blob/main/blink_kv.pdf)
874
+ */
875
+ clearAndReseed(sinks: number[], tail: number[]): Promise<void>;
876
+ /**
877
+ * Fork a KV cache sequence — the primitive behind {@link Branch.fork}
878
+ *
879
+ * Copies all KV cache entries from `srcSeqId` to `dstSeqId`. Under
880
+ * llama.cpp's unified KV cache, this is a **metadata-only operation** —
881
+ * no key/value tensors are copied. Both sequences reference the same
882
+ * physical KV entries for the shared prefix; only tokens decoded after
883
+ * the fork point allocate new storage. This is what makes tree-structured
884
+ * generation (best-of-N, beam search, speculative decoding) memory-efficient:
885
+ * N branches sharing a 1000-token prefix cost ~1000 KV entries, not N*1000.
886
+ *
887
+ * The higher-level {@link Branch.fork} wraps this and additionally clones
888
+ * the sampler chain, grammar state, logits snapshot, and perplexity tracker.
889
+ * Use `kvSeqCopy` directly when you need raw sequence management without
890
+ * the Branch abstraction.
891
+ *
892
+ * NOTE: Only full-sequence copies are supported. The p0/p1 parameters
893
+ * must use default values (0 and -1).
894
+ *
895
+ * Cost: O(1) metadata — no tensor copy under unified KV
896
+ *
897
+ * @param srcSeqId Source sequence to copy from
898
+ * @param dstSeqId Destination sequence to copy to
899
+ * @param p0 Start position (must be 0, default: 0)
900
+ * @param p1 End position (must be -1 for full copy, default: -1)
901
+ */
902
+ kvSeqCopy(srcSeqId: number, dstSeqId: number, p0?: number, p1?: number): void;
903
+ /**
904
+ * Keep only specified sequence, remove all others
905
+ *
906
+ * Removes all sequences except the one specified.
907
+ * For complete cleanup of unwanted sequences, consider using
908
+ * kvCacheRemove(seqId, 0, -1) on each sequence instead.
909
+ *
910
+ * @param seqId Sequence ID to keep
911
+ */
912
+ kvSeqKeep(seqId: number): void;
913
+ /**
914
+ * Get max position in sequence
915
+ *
916
+ * Returns the highest position index in the specified sequence,
917
+ * or -1 if the sequence is empty.
918
+ *
919
+ * Cost: <0.01ms (fast sync operation)
920
+ *
921
+ * @param seqId Sequence ID to query
922
+ * @returns Max position index, or -1 if empty
923
+ * @example
924
+ * ```typescript
925
+ * const pos = ctx.kvSeqPosMax(0);
926
+ * if (pos === -1) {
927
+ * console.log('Sequence is empty');
928
+ * } else {
929
+ * console.log(`Sequence has ${pos + 1} tokens`);
930
+ * }
931
+ * ```
932
+ */
933
+ kvSeqPosMax(seqId: number): number;
934
+ /**
935
+ * Write KV cache state + tokens to file
936
+ *
937
+ * Persists KV cache state for later restoration.
938
+ * Useful for checkpointing long conversations.
939
+ *
940
+ * @param sequenceId Sequence ID to save
941
+ * @param filepath Path to save file
942
+ * @param tokens Tokens that were decoded into this sequence
943
+ * @returns Promise resolving to bytes written
944
+ */
945
+ kvCacheWriteFile(sequenceId: number, filepath: string, tokens: number[]): Promise<number>;
946
+ /**
947
+ * Read KV cache state + tokens from file
948
+ *
949
+ * Restores KV cache state from a previous kvCacheWriteFile call.
950
+ *
951
+ * @param sequenceId Sequence ID to restore to
952
+ * @param filepath Path to saved file
953
+ * @returns Promise resolving to tokens and bytes read
954
+ */
955
+ kvCacheReadFile(sequenceId: number, filepath: string): Promise<{
956
+ tokens: number[];
957
+ bytesRead: number;
958
+ }>;
959
+ /**
959
+ * Format messages using model's chat template
960
+ *
961
+ * Converts a JSON-encoded `[{role, content}]` array into a formatted prompt string plus format metadata.
962
+ * Uses model's built-in template (ChatML, Llama, Mistral, etc.).
963
+ *
964
+ * The returned `format` and `reasoningFormat` fields should be passed to
965
+ * `parseChatOutput()` after generation to correctly decode the response.
966
+ *
967
+ * Cost: ~1-5ms depending on message count
968
+ *
969
+ * @param messagesJson JSON string containing the array of messages (e.g. from `JSON.stringify`)
970
+ * @param options Formatting options (tools, reasoning, grammar, etc.). The signature also accepts a plain string — NOTE(review): its meaning is not documented here; confirm.
971
+ * @returns Formatted prompt with format-awareness metadata
972
+ *
973
+ * @see {@link parseChatOutput}
974
+ *
975
+ * @example Basic usage
976
+ * ```typescript
977
+ * const result = await ctx.formatChat(JSON.stringify([
978
+ * { role: "system", content: "You are a helpful assistant" },
979
+ * { role: "user", content: "Hello!" }
980
+ * ]));
981
+ *
982
+ * const tokens = await ctx.tokenize(result.prompt);
983
+ * const branch = Branch.create(ctx, 0, { temperature: 0.7 });
984
+ * await branch.prefill(tokens);
985
+ * ```
986
+ */
987
+ formatChat(messagesJson: string, options?: FormatChatOptions | string): Promise<FormattedChatResult>;
989
+ /**
989
+ * Format messages using model's chat template (sync — inline on main thread)
990
+ *
991
+ * Same as {@link formatChat} but synchronous. Use from Effection generators
992
+ * to avoid `yield* call()` overhead for CPU-only work.
993
+ *
994
+ * @param messagesJson JSON string containing array of messages
995
+ * @param options Formatting options (tools, reasoning, grammar, etc.). The signature also accepts a plain string — NOTE(review): its meaning is not documented here; confirm.
996
+ * @returns Formatted prompt with format-awareness metadata
997
+ */
998
+ formatChatSync(messagesJson: string, options?: FormatChatOptions | string): FormattedChatResult;
1000
+ /**
1001
+ * Parse model output into structured content
1002
+ *
1003
+ * Extracts plain text, reasoning/thinking blocks, and tool calls from
1004
+ * raw model output. Uses the format detected by {@link formatChat} to apply
1005
+ * the correct parser for the model's output format.
1006
+ *
1007
+ * Cost: <0.1ms (synchronous string parsing, no I/O)
1008
+ *
1009
+ * @param output Raw model output text
1010
+ * @param format Chat format enum (from {@link FormattedChatResult.format})
1011
+ * @param options Optional parsing parameters
1012
+ * @returns Parsed content with tool calls and reasoning
1013
+ *
1014
+ * @see {@link formatChat}
1015
+ *
1016
+ * @example Basic parsing
1017
+ * ```typescript
1018
+ * const fmt = await ctx.formatChat(JSON.stringify(messages), { tools: toolsJson });
1019
+ * // ... generate tokens ...
1020
+ * const parsed = ctx.parseChatOutput(generatedText, fmt.format, {
1021
+ * reasoningFormat: fmt.reasoningFormat,
1022
+ * thinkingForcedOpen: fmt.thinkingForcedOpen,
1023
+ * parser: fmt.parser
1024
+ * });
1025
+ * if (parsed.toolCalls.length > 0) {
1026
+ * // Handle tool calls
1027
+ * }
1028
+ * ```
1029
+ *
1030
+ * @example Multi-turn warm continuation with reasoning models
1031
+ * ```typescript
1032
+ * // parseChatOutput separates <think>...</think> blocks into reasoningContent.
1033
+ * // This is REQUIRED for correct warm continuation on thinking models (e.g. Qwen3):
1034
+ * // if raw output containing <think> tags is stored as content, re-formatting
1035
+ * // the conversation produces different tokens, breaking cold/warm parity.
1036
+ *
1037
+ * const messages: Array<{role: string; content: string; reasoning_content?: string}> = [];
1038
+ * const sep = ctx.getTurnSeparator();
1039
+ * let branch: Branch | null = null;
1040
+ * let fmt: FormattedChatResult;
1041
+ *
1042
+ * async function handleTurn(userContent: string) {
1043
+ * messages.push({ role: 'user', content: userContent });
1044
+ *
1045
+ * if (!branch) {
1046
+ * // Cold path: format full conversation, tokenize with BOS, prefill
1047
+ * fmt = await ctx.formatChat(JSON.stringify(messages));
1048
+ * const tokens = await ctx.tokenize(fmt.prompt);
1049
+ * branch = Branch.create(ctx, 0, { temperature: 0.7 });
1050
+ * await branch.prefill(tokens);
1051
+ * } else {
1052
+ * // Warm path: string-diff for delta tokens
1053
+ * const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
1054
+ * const { prompt: prefix } = await ctx.formatChat(
1055
+ * JSON.stringify(messages.slice(0, -1)),
1056
+ * { addGenerationPrompt: false }
1057
+ * );
1058
+ * const delta = await ctx.tokenize(full.substring(prefix.length), false);
1059
+ * await branch.prefill([...sep, ...delta]);
1060
+ * }
1061
+ *
1062
+ * // Generate
1063
+ * let rawOutput = '';
1064
+ * while (true) {
1065
+ * const { token, text, isStop } = await branch.produce();
1066
+ * if (isStop) break;
1067
+ * rawOutput += text;
1068
+ * await branch.commit(token);
1069
+ * }
1070
+ *
1071
+ * // Parse output: separates reasoning from content
1072
+ * const parsed = ctx.parseChatOutput(rawOutput, fmt.format, {
1073
+ * reasoningFormat: fmt.reasoningFormat,
1074
+ * thinkingForcedOpen: fmt.thinkingForcedOpen,
1075
+ * parser: fmt.parser
1076
+ * });
1077
+ *
1078
+ * // Store parsed fields — formatChat reconstructs thinking blocks correctly
1079
+ * messages.push({
1080
+ * role: 'assistant',
1081
+ * content: parsed.content,
1082
+ * reasoning_content: parsed.reasoningContent || undefined
1083
+ * });
1084
+ * }
1085
+ * ```
1086
+ */
1087
+ parseChatOutput(output: string, format: ChatFormat, options?: ParseChatOutputOptions): ParseChatOutputResult;
1088
+ /**
1088
+ * Convert JSON schema to GBNF grammar
1089
+ *
1090
+ * Generates a grammar string for constrained JSON generation.
1091
+ * Pass the result as the grammar argument of {@link Branch.create}.
1092
+ *
1093
+ * Cost: ~1-10ms depending on schema complexity
1094
+ *
1095
+ * @param schemaJson JSON schema serialized as a string (e.g. via `JSON.stringify`)
1096
+ * @returns GBNF grammar string
1097
+ * @example
1098
+ * ```typescript
1099
+ * const schema = {
1100
+ * type: "object",
1101
+ * properties: {
1102
+ * name: { type: "string" },
1103
+ * age: { type: "number" }
1104
+ * },
1105
+ * required: ["name"]
1106
+ * };
1107
+ *
1108
+ * const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
1109
+ * const branch = Branch.create(ctx, 0, params, undefined, grammar);
1110
+ * ```
1111
+ */
1112
+ jsonSchemaToGrammar(schemaJson: string): Promise<string>;
1114
+ /**
1115
+ * Convert JSON schema to GBNF grammar (sync — inline on main thread)
1116
+ *
1117
+ * Same as {@link jsonSchemaToGrammar} but synchronous. Use from Effection
1118
+ * generators to avoid `yield* call()` overhead for CPU-only work.
1119
+ *
1120
+ * @param schemaJson JSON schema string
1121
+ * @returns GBNF grammar string
1122
+ */
1123
+ jsonSchemaToGrammarSync(schemaJson: string): string;
1124
+ /**
1125
+ * Validate chat template syntax
1126
+ *
1127
+ * Checks if template string is valid before using.
1128
+ *
1129
+ * Cost: ~0.1-1ms
1130
+ *
1131
+ * @param templateString Template string to validate
1132
+ * @returns True if template syntax is valid
1133
+ */
1134
+ validateChatTemplate(templateString: string): Promise<boolean>;
1135
+ /**
1136
+ * Encode tokens for embedding extraction
1137
+ *
1138
+ * Unlike decode(), this marks ALL tokens with logits=true which is
1139
+ * required for embedding extraction. Use with embeddings=true context.
1140
+ *
1141
+ * Workflow:
1142
+ * 1. Create context with { embeddings: true, poolingType: PoolingType.MEAN }
1143
+ * 2. Tokenize your text
1144
+ * 3. Clear KV cache (important between different texts!)
1145
+ * 4. Call encode() with tokens
1146
+ * 5. Call getEmbeddings() to get the vector
1147
+ *
1148
+ * Cost: ~5-50ms depending on text length and model
1149
+ *
1150
+ * @param tokens Token IDs from tokenize()
1151
+ * @example
1152
+ * ```typescript
1153
+ * // Create embedding context
1154
+ * const ctx = await createContext({
1155
+ * modelPath: './nomic-embed.gguf',
1156
+ * embeddings: true,
1157
+ * poolingType: PoolingType.MEAN
1158
+ * });
1159
+ *
1160
+ * // Get embedding for text
1161
+ * const tokens = await ctx.tokenize("Hello world");
1162
+ * await ctx.kvCacheClear(); // Important between texts!
1163
+ * await ctx.encode(tokens);
1164
+ * const embedding = ctx.getEmbeddings();
1165
+ * ```
1166
+ */
1167
+ encode(tokens: number[]): Promise<void>;
1168
+ /**
1169
+ * Get embedding vector from context (after encode)
1170
+ *
1171
+ * Returns the embedding vector for the encoded text.
1172
+ * Call after encode() to extract embeddings.
1173
+ *
1174
+ * The vector dimension depends on the model (e.g., 768 for nomic-embed).
1175
+ * Use getEmbeddingDimension() to get the size.
1176
+ *
1177
+ * Cost: ~0.5ms (extraction from model state)
1178
+ *
1179
+ * @param normalize Apply L2 normalization (default: true for cosine similarity)
1180
+ * @returns Float32Array of embedding values
1181
+ * @example
1182
+ * ```typescript
1183
+ * await ctx.encode(tokens);
1184
+ *
1185
+ * // Get L2-normalized embedding (for cosine similarity)
1186
+ * const embedding = ctx.getEmbeddings();
1187
+ *
1188
+ * // Or raw embedding without normalization
1189
+ * const rawEmbedding = ctx.getEmbeddings(false);
1190
+ * ```
1191
+ */
1192
+ getEmbeddings(normalize?: boolean): Float32Array;
1193
+ /**
1194
+ * Get embedding dimension for model
1195
+ *
1196
+ * Returns the size of embedding vectors this model produces.
1197
+ * Common values: 768 (BERT-like), 1024, 2048, 4096.
1198
+ *
1199
+ * Cost: <0.01ms (fast model property lookup)
1200
+ *
1201
+ * @returns Embedding dimension
1202
+ * @example
1203
+ * ```typescript
1204
+ * const dim = ctx.getEmbeddingDimension();
1205
+ * console.log(`Model produces ${dim}-dimensional embeddings`);
1206
+ * ```
1207
+ */
1208
+ getEmbeddingDimension(): number;
1209
+ /**
1210
+ * Check if context has pooling enabled
1211
+ *
1212
+ * Returns true if context was created with embeddings=true and
1213
+ * a pooling type other than NONE.
1214
+ *
1215
+ * Cost: <0.01ms
1216
+ *
1217
+ * @returns True if pooling is enabled
1218
+ */
1219
+ hasPooling(): boolean;
1220
+ /**
1221
+ * Model vocabulary size (number of possible tokens)
1222
+ *
1223
+ * This is the length of the logits array from Branch.getLogits().
1224
+ */
1225
+ readonly vocabSize: number;
1226
+ /**
1227
+ * Memory used by this context (bytes)
1228
+ *
1229
+ * Reports native memory for monitoring.
1230
+ * Includes model weights, KV cache, and context state.
1231
+ */
1232
+ readonly memorySize: number;
1233
+ /**
1234
+ * Free native resources
1235
+ *
1236
+ * Call when done with context to release model and KV cache memory.
1237
+ * Context becomes unusable after disposal.
1238
+ */
1239
+ dispose(): void;
1240
+ /** @internal */
1241
+ _branchCreate(position: number, params?: SamplingParams, nBatch?: number, grammar?: string): number;
1242
+ /** @internal */
1243
+ _branchFork(handle: number): number;
1244
+ /** @internal */
1245
+ _branchPrefill(handle: number, tokens: number[]): Promise<void>;
1246
+ /** @internal */
1247
+ _branchSample(handle: number): number;
1248
+ /** @internal */
1249
+ _branchAccept(handle: number, token: number): void;
1250
+ /** @internal */
1251
+ _branchGetPosition(handle: number): number;
1252
+ /** @internal */
1253
+ _branchGetPerplexity(handle: number): number;
1254
+ /** @internal */
1255
+ _branchGetLogits(handle: number): Float32Array;
1256
+ /** @internal */
1257
+ _branchPrune(handle: number): void;
1258
+ /** @internal */
1259
+ _branchPruneSubtree(handle: number): void;
1260
+ /** @internal */
1261
+ _branchParent(handle: number): number;
1262
+ /** @internal */
1263
+ _branchChildren(handle: number): number[];
1264
+ /** @internal */
1265
+ _branchIsLeaf(handle: number): boolean;
1266
+ /** @internal */
1267
+ _branchIsActive(handle: number): boolean;
1268
+ /** @internal */
1269
+ _branchSamplerChainReseed(handle: number, seed: number): void;
1270
+ /** @internal */
1271
+ _branchSteer(handle: number, biases: Array<{
1272
+ token: number;
1273
+ bias: number;
1274
+ }>): void;
1275
+ /** @internal */
1276
+ _branchClearSteer(handle: number): void;
1277
+ /** @internal */
1278
+ _branchSetSamplerParams(handle: number, params: SamplingParams): void;
1279
+ /** @internal */
1280
+ _branchSetGrammar(handle: number, grammarStr: string): void;
1281
+ /** @internal */
1282
+ _branchSetGrammarLazy(handle: number, grammar: string, patterns: string[], tokens: number[]): void;
1283
+ /** @internal */
1284
+ _branchModelEntropy(handle: number, base?: string): number;
1285
+ /** @internal */
1286
+ _branchModelSurprisal(handle: number, token: number, base?: string): number;
1287
+ /** @internal */
1288
+ _branchGetSamplingPerplexity(handle: number): number;
1289
+ /** @internal */
1290
+ _branchSetLogitBias(handle: number, biases: Array<{
1291
+ token: number;
1292
+ bias: number;
1293
+ }>): void;
1294
+ /** @internal */
1295
+ _branchClearLogitBias(handle: number): void;
1296
+ /** @internal */
1297
+ _storeCommit(handles: number[], tokens: number[]): Promise<void>;
1298
+ /** @internal */
1299
+ _storePrefill(handles: number[], tokenArrays: number[][]): Promise<void>;
1300
+ /** @internal */
1301
+ _storeRetainOnly(handle: number): void;
1302
+ /** @internal */
1303
+ _storeAvailable(): number;
1304
+ /** KV cache pressure snapshot from native BranchStore.
1305
+ * `cellsUsed` is a monotonic counter reset on drain/retainOnly. */
1306
+ _storeKvPressure(): {
1307
+ nCtx: number;
1308
+ cellsUsed: number;
1309
+ remaining: number;
1310
+ };
1311
+ /** @internal — processes ≤ n_seq_max prompts in a single group */
1312
+ _scoreGroup(tokenArrays: number[][]): Promise<Float32Array[]>;
1313
+ }
1314
+ /**
1315
+ * Result from Branch.produce()
1316
+ *
1317
+ * @category Branching
1318
+ */
1319
+ export interface Produced {
1320
+ /** Sampled token ID */
1321
+ token: number;
1322
+ /** Text representation of the token */
1323
+ text: string;
1324
+ /** Whether this is a stop token (EOS) */
1325
+ isStop: boolean;
1326
+ }
1327
+ /**
1328
+ * Options for Rerank context creation
1329
+ * @category Core
1330
+ */
1331
+ export interface RerankOptions {
1332
+ /** Path to reranker .gguf model */
1333
+ modelPath: string;
1334
+ /** Max prompts per GPU dispatch (default: 8) */
1335
+ nSeqMax?: number;
1336
+ /** Context window size (default: 4096) */
1337
+ nCtx?: number;
1338
+ /** KV cache key quantization (default: 'q4_0') */
1339
+ typeK?: KvCacheType;
1340
+ /** KV cache value quantization (default: 'q4_0') */
1341
+ typeV?: KvCacheType;
1342
+ }
1343
+ /**
1344
+ * A single rerank result — score for one document
1345
+ * @category Core
1346
+ */
1347
+ export interface RerankResult {
1348
+ /** Relevance probability (0–1) */
1349
+ score: number;
1350
+ /** Original index in the input array */
1351
+ index: number;
1352
+ }
1353
+ /**
1354
+ * Progress yielded by Rerank.score() after each scoring group completes
1355
+ * @category Core
1356
+ */
1357
+ export interface RerankProgress {
1358
+ /** Number of documents scored so far */
1359
+ filled: number;
1360
+ /** Total documents to score */
1361
+ total: number;
1362
+ /** Sorted results — partial until filled === total */
1363
+ results: RerankResult[];
1364
+ }
1365
+ //# sourceMappingURL=types.d.ts.map