@lloyal-labs/lloyal.node 1.0.5-alpha → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/index.d.ts CHANGED
@@ -4,6 +4,48 @@
  * N-API bindings for liblloyal - Node.js native addon for llama.cpp inference
  */

+ /**
+ * GPU variant for binary loading
+ *
+ * Specifies which GPU-accelerated binary to load:
+ * - 'default': CPU-only (works everywhere)
+ * - 'cuda': NVIDIA CUDA (requires libcudart.so/cudart64.dll)
+ * - 'vulkan': Vulkan (AMD/Intel/NVIDIA, requires Vulkan runtime)
+ *
+ * If the requested variant is unavailable (package not installed or
+ * runtime libraries missing), loading automatically falls back to CPU.
+ */
+ export type GpuVariant = 'default' | 'cuda' | 'vulkan';
+
+ /**
+ * Options for binary loading
+ *
+ * Controls which native binary variant is loaded when creating a context.
+ * Use this for explicit GPU variant selection with automatic fallback.
+ */
+ export interface LoadOptions {
+ /**
+ * GPU variant to use
+ *
+ * - 'cuda': NVIDIA CUDA (requires libcudart.so)
+ * - 'vulkan': Vulkan (AMD/Intel/NVIDIA)
+ * - 'default' or undefined: CPU only
+ *
+ * If the requested variant is unavailable (missing runtime libraries),
+ * automatically falls back to CPU with a console warning.
+ *
+ * @example
+ * ```typescript
+ * // Request CUDA with automatic fallback to CPU
+ * const ctx = await createContext(
+ *   { modelPath: './model.gguf' },
+ *   { gpuVariant: 'cuda' }
+ * );
+ * ```
+ */
+ gpuVariant?: GpuVariant;
+ }
+
  /**
  * Pooling type for embedding extraction
  */
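A minimal usage sketch for these loading options, assuming only the exports declared in this diff (`createContext`, `GpuVariant`, `LoadOptions`); reading the variant from `LLOYAL_GPU` follows the environment-variable example shown later in this file:

```typescript
import { createContext, type GpuVariant, type LoadOptions } from '@lloyal-labs/lloyal.node';

// Pick a variant from the environment; 'default' means the CPU-only binary.
const requested = (process.env.LLOYAL_GPU ?? 'default') as GpuVariant;
const loadOptions: LoadOptions = { gpuVariant: requested };

// Per the docs above, missing CUDA/Vulkan runtimes trigger a fallback to CPU
// (with a console warning) rather than an error.
const ctx = await createContext({ modelPath: './model.gguf' }, loadOptions);
```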
@@ -305,9 +347,11 @@ export interface SessionContext {
  * // Creative generation
  * const token = ctx.sample({ temperature: 0.9 });
  *
- * // Constrained to valid JSON
- * ctx.initGrammar(grammar);
+ * // Constrained to valid JSON (handle-based API)
+ * const grammarHandle = ctx.createSampler(grammar);
+ * ctx.applySampler(grammarHandle, ctx.getLogits());
  * const token = ctx.sample({ temperature: 0.7 });
+ * ctx.acceptSamplerToken(grammarHandle, token);
  * ```
  */
  sample(params?: SamplingParams): number;
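Pieced together from the handle-based methods shown above, a hedged sketch of a complete constrained-decode loop; the inline `schema`, `maxTokens`, `promptTokens`, and the single-token `decode()` call are assumptions modeled on the surrounding examples:

```typescript
// Given a SessionContext `ctx` whose prompt tokens are already decoded.
const schema = { type: 'object', properties: { name: { type: 'string' } } };
const grammar = ctx.jsonSchemaToGrammar(JSON.stringify(schema)); // GBNF string
const handle = ctx.createSampler(grammar);

const maxTokens = 64;
let position = promptTokens.length; // next KV position after the prompt (assumed)
for (let i = 0; i < maxTokens; i++) {
  ctx.applySampler(handle, ctx.getLogits());      // mask grammar-invalid tokens
  const token = ctx.sample({ temperature: 0.7 }); // sample from masked logits
  ctx.acceptSamplerToken(handle, token);          // advance the parser state
  await ctx.decode([token], position++);          // write token, refresh logits
}
```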
@@ -566,144 +610,6 @@ export interface SessionContext {
  */
  clearAndReseed(sinks: number[], tail: number[]): Promise<void>;

- // ===== GRAMMAR-CONSTRAINED GENERATION =====
-
- /**
- * Initialize grammar parser (once per generation session)
- *
- * Grammars constrain generation to valid formats (JSON, XML, etc.).
- * Parser tracks state across tokens to enforce rules.
- *
- * Call once before starting constrained generation.
- * Use resetGrammar() to reuse same grammar for new generation.
- *
- * Cost: ~0.1-1ms depending on grammar complexity
- *
- * @param grammarStr GBNF grammar string (EBNF-like syntax)
- * @example
- * ```typescript
- * // Force valid JSON
- * const grammar = ctx.jsonSchemaToGrammar(JSON.stringify({
- *   type: "object",
- *   properties: {
- *     name: { type: "string" },
- *     age: { type: "number" }
- *   }
- * }));
- *
- * ctx.initGrammar(grammar);
- *
- * // Now sample() will only generate valid JSON
- * const token = ctx.sample({ temperature: 0.7 });
- * ```
- */
- initGrammar(grammarStr: string): void;
-
- /**
- * Apply grammar constraints to token scores (modifies in-place)
- *
- * Masks invalid tokens with -Infinity based on parser state.
- * Call after getTokenScores(), before custom sampling.
- *
- * Flow: getTokenScores() → applyGrammar() → sample() → acceptToken()
- *
- * Thread safety: This method is synchronous and modifies the buffer
- * in-place on the JS thread. Safe because it's called sequentially
- * in the generation loop before any async operations.
- *
- * Cost: ~0.1-1ms depending on grammar complexity
- *
- * @param scoresBuffer Buffer from getTokenScores() (modified in-place)
- * @throws Error if grammar not initialized (call initGrammar first)
- * @example
- * ```typescript
- * // Custom sampling with grammar
- * const buffer = ctx.getTokenScores();
- * const scores = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.length / 4);
- *
- * // Apply grammar constraints
- * ctx.applyGrammar(buffer);
- *
- * // Now sample from constrained distribution
- * const token = customSample(scores);
- * ctx.acceptToken(token);
- * ```
- */
- applyGrammar(scoresBuffer: Buffer): void;
-
- /**
- * Advance grammar parser with chosen token
- *
- * Updates parser state after sampling.
- * MUST be called AFTER sampling, BEFORE next applyGrammar().
- *
- * This advances the stateful grammar parser through its rules.
- * Without this, grammar constraints will be incorrect.
- *
- * Cost: <0.01ms
- *
- * @param tokenId Token that was sampled
- * @example
- * ```typescript
- * const buffer = ctx.getTokenScores();
- * ctx.applyGrammar(buffer);
- *
- * const scores = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.length / 4);
- * const token = customSample(scores);
- *
- * // MUST call acceptToken to advance parser
- * ctx.acceptToken(token);
- *
- * // Now parser is ready for next token
- * ```
- */
- acceptToken(tokenId: number): void;
-
- /**
- * Reset grammar parser to initial state
- *
- * Call at start of each new generation with same grammar.
- * Parser returns to root state, ready to validate from beginning.
- *
- * Cost: <0.01ms
- *
- * @example
- * ```typescript
- * ctx.initGrammar(jsonGrammar);
- *
- * // First generation
- * while (!done) {
- *   const token = ctx.sample();
- *   // ... generate ...
- * }
- *
- * // Second generation - reuse same grammar
- * ctx.resetGrammar();
- * while (!done) {
- *   const token = ctx.sample();
- *   // ... generate ...
- * }
- * ```
- */
- resetGrammar(): void;
-
- /**
- * Free grammar resources
- *
- * Call when done with constrained generation.
- * Releases parser memory.
- *
- * Cost: <0.01ms
- *
- * @example
- * ```typescript
- * ctx.initGrammar(grammar);
- * // ... do constrained generation ...
- * ctx.freeGrammar();
- * ```
- */
- freeGrammar(): void;
-
  // ===== KV SEQUENCE OPERATIONS =====

  /**
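For code that used the grammar API removed above, a hedged migration mapping onto the handle-based methods that remain in 1.0.7; this diff shows no handle reset or free method, so creating a fresh handle per generation is an assumption:

```typescript
// Removed single-sampler API     →  handle-based replacement
// ctx.initGrammar(grammar)       →  const h = ctx.createSampler(grammar)
// ctx.applyGrammar(scoresBuffer) →  ctx.applySampler(h, ctx.getLogits())
// ctx.acceptToken(token)         →  ctx.acceptSamplerToken(h, token)
// ctx.resetGrammar()             →  assumption: create a fresh handle

const h = ctx.createSampler(grammar);
ctx.applySampler(h, ctx.getLogits());
const token = ctx.sample({ temperature: 0.7 });
ctx.acceptSamplerToken(h, token);
```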
@@ -775,9 +681,7 @@ export interface SessionContext {
  * Create a new grammar sampler (returns handle)
  *
  * Creates an independent grammar sampler instance with its own state.
- *
- * Unlike initGrammar() which uses a single internal sampler, this returns
- * a handle that can be used with applySampler/acceptSamplerToken.
+ * Returns a handle that can be used with applySampler/acceptSamplerToken.
  * Multiple handles can coexist with independent parser states.
  *
  * Cost: ~0.1-1ms depending on grammar complexity
@@ -817,7 +721,6 @@ export interface SessionContext {
  * Accept token to advance grammar parser state (handle-based)
  *
  * Must be called after sampling to advance the grammar parser.
- * This is the handle-based equivalent of acceptToken().
  *
  * @param handle Sampler handle from createSampler()
  * @param tokenId Token that was sampled
@@ -867,13 +770,15 @@ export interface SessionContext {
  * - High surprisal: Model didn't expect this token (low probability)
  *
  * Call after decode() to compute surprisal for any token based on
- * the current logits distribution.
+ * the current logits distribution, or pass captured logits for
+ * offline computation (e.g., best-of-n scoring from prefill logits).
  *
  * @param pickedTokenId - Token ID to compute surprisal for
  * @param base - Logarithm base: "nats" (default) or "bits"
+ * @param logits - Optional Float32Array of logits (uses current context logits if omitted)
  * @returns Surprisal value in specified base
  *
- * @example
+ * @example Current context logits (default)
  * ```typescript
  * await ctx.decode(tokens, position);
  * const token = ctx.sample();
@@ -881,9 +786,18 @@ export interface SessionContext {
  * console.log(`Model surprise: ${surprisal.toFixed(2)} bits`);
  * ```
  *
- * COST: O(1) - direct probability lookup from logits
+ * @example Captured/arbitrary logits (for best-of-n, verification, etc.)
+ * ```typescript
+ * // Capture logits after prefill
+ * const capturedLogits = new Float32Array(ctx.getLogits());
+ *
+ * // Later: compute surprisal from captured logits
+ * const surprisal = ctx.modelSurprisal(token, "nats", capturedLogits);
+ * ```
+ *
+ * COST: O(n_vocab) - softmax normalization required
  */
- modelSurprisal(pickedTokenId: number, base?: 'nats' | 'bits'): number;
+ modelSurprisal(pickedTokenId: number, base?: 'nats' | 'bits', logits?: Float32Array): number;

  /**
  * Compute entropy of the entire logits distribution.
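A hedged sketch of the best-of-n scoring pattern the new `logits` parameter of modelSurprisal() is documented for; `promptTokens` and the candidate token IDs are illustrative inputs:

```typescript
// Freeze prefill logits, then score candidate first tokens against them.
await ctx.decode(promptTokens, 0);
const prefillLogits = new Float32Array(ctx.getLogits()); // copy before decoding further

const candidateTokens = [1024, 2048, 4096]; // illustrative token IDs

// Lower surprisal = the model considered that continuation more plausible.
const scored = candidateTokens.map((token) => ({
  token,
  surprisal: ctx.modelSurprisal(token, 'nats', prefillLogits),
}));
scored.sort((a, b) => a.surprisal - b.surprisal);
const best = scored[0]; // most plausible candidate under the prefill distribution
```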
@@ -892,12 +806,14 @@ export interface SessionContext {
  * - Low entropy: Model is confident (peaked distribution)
  * - High entropy: Model is uncertain (flat distribution)
  *
- * Call after decode() to analyze the current prediction distribution.
+ * Call after decode() to analyze the current prediction distribution,
+ * or pass captured logits for offline analysis.
  *
  * @param base - Logarithm base: "nats" (default), "bits", or "base10"
+ * @param logits - Optional Float32Array of logits (uses current context logits if omitted)
  * @returns Entropy value in specified base
  *
- * @example
+ * @example Current context logits (default)
  * ```typescript
  * await ctx.decode(tokens, position);
  * const entropy = ctx.modelEntropy("bits");
@@ -906,9 +822,15 @@ export interface SessionContext {
  * }
  * ```
  *
+ * @example Captured/arbitrary logits
+ * ```typescript
+ * const capturedLogits = new Float32Array(ctx.getLogits());
+ * const entropy = ctx.modelEntropy("nats", capturedLogits);
+ * ```
+ *
  * COST: O(n_vocab) - must sum over all token probabilities
  */
- modelEntropy(base?: 'nats' | 'bits'): number;
+ modelEntropy(base?: 'nats' | 'bits', logits?: Float32Array): number;

  /**
  * Create a new perplexity tracker.
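One use of the captured-logits overload of modelEntropy(), sketched under the same assumptions as above: snapshot logits at each step during generation, then locate the model's least confident steps afterwards:

```typescript
// Record a logits snapshot per generated token for offline analysis.
const snapshots: Float32Array[] = [];
let position = promptTokens.length; // prompt already decoded (assumed)
for (let i = 0; i < 50; i++) {
  snapshots.push(new Float32Array(ctx.getLogits())); // freeze this step's distribution
  const token = ctx.sample();
  await ctx.decode([token], position++);
}

// Afterwards: flag steps where the distribution was flat (high uncertainty).
const uncertainSteps = snapshots
  .map((logits, step) => ({ step, bits: ctx.modelEntropy('bits', logits) }))
  .filter((s) => s.bits > 3);
```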
@@ -1125,7 +1047,7 @@ export interface SessionContext {
  * Convert JSON schema to GBNF grammar
  *
  * Generates grammar string for constrained JSON generation.
- * Use with initGrammar() or sample({ grammar }).
+ * Use with createSampler() for grammar-constrained generation.
  *
  * Cost: ~1-10ms depending on schema complexity
  *
@@ -1143,7 +1065,7 @@ export interface SessionContext {
  * };
  *
  * const grammar = ctx.jsonSchemaToGrammar(JSON.stringify(schema));
- * ctx.initGrammar(grammar);
+ * const handle = ctx.createSampler(grammar);
  * ```
  */
  jsonSchemaToGrammar(schemaJson: string): string;
@@ -1253,16 +1175,6 @@ export interface SessionContext {

  // ===== NATIVE REFERENCE IMPLEMENTATIONS =====

- /**
- * Compute entropy of current logits distribution
- *
- * Alternative entropy computation using native implementation.
- * Equivalent to modelEntropy("nats") but may be faster.
- *
- * @returns Entropy in nats
- */
- computeEntropy(): number;
-
  /**
  * Sample greedily from current logits
  *
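Since the removed computeEntropy() was documented as equivalent to modelEntropy("nats"), migrating is a one-line substitution:

```typescript
// const entropy = ctx.computeEntropy(); // removed in 1.0.7
const entropy = ctx.modelEntropy('nats'); // documented equivalent
```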
@@ -1299,14 +1211,57 @@ export interface SessionContext {
  * Context becomes unusable after disposal.
  */
  dispose(): void;
+
+ // ===== BRANCH API (internal, wrapped by Branch class) =====
+
+ /** @internal Create a new branch for parallel generation */
+ _branchCreate(seqId: number, position: number, params?: SamplingParams): number;
+
+ /** @internal Fork a branch to a new sequence */
+ _branchFork(handle: number, newSeqId: number): number;
+
+ /** @internal Capture logits into branch's snapshot */
+ _branchCaptureLogits(handle: number): void;
+
+ /** @internal Decode a single token and capture logits */
+ _branchDecodeAndCaptureOne(handle: number, token: number): void;
+
+ /** @internal Sample next token from branch's logits snapshot */
+ _branchSample(handle: number): number;
+
+ /** @internal Accept token (update sampler state for penalties) */
+ _branchAccept(handle: number, token: number): void;
+
+ /** @internal Get branch's sequence ID */
+ _branchGetSeqId(handle: number): number;
+
+ /** @internal Get branch's current position */
+ _branchGetPosition(handle: number): number;
+
+ /** @internal Get branch's perplexity */
+ _branchGetPerplexity(handle: number): number;
+
+ /** @internal Prune branch (remove KV cache entries and free handle) */
+ _branchPrune(handle: number): void;
+
+ /** @internal Destroy branch (free handle without removing KV cache) */
+ _branchDestroy(handle: number): void;
+
+ /** @internal Reseed branch sampler PRNG for diversity after fork */
+ _branchSamplerChainReseed(handle: number, seed: number): void;
  }

  /**
  * Create a new inference context
  *
+ * Loads the appropriate native binary (with automatic GPU fallback) and
+ * creates an inference context for the specified model.
+ *
  * @param options Context creation options
+ * @param loadOptions Optional binary loading options (GPU variant selection)
  * @returns Promise resolving to SessionContext instance
- * @example
+ *
+ * @example Basic usage
  * ```typescript
  * const ctx = await createContext({
  *   modelPath: './model.gguf',
@@ -1322,8 +1277,58 @@ export interface SessionContext {
  * ctx.dispose();
  * }
  * ```
+ *
+ * @example With GPU variant selection
+ * ```typescript
+ * // Request CUDA - falls back to CPU if unavailable
+ * const ctx = await createContext(
+ *   { modelPath: './model.gguf', nCtx: 4096 },
+ *   { gpuVariant: 'cuda' }
+ * );
+ * ```
+ *
+ * @example Using environment variable
+ * ```typescript
+ * // Set LLOYAL_GPU=cuda before running
+ * // createContext will automatically use CUDA if available
+ * const ctx = await createContext({ modelPath: './model.gguf' });
+ * ```
  */
- export function createContext(options: ContextOptions): Promise<SessionContext>;
+ export function createContext(
+   options: ContextOptions,
+   loadOptions?: LoadOptions
+ ): Promise<SessionContext>;
+
+ /**
+ * Load native binary for a specific GPU variant
+ *
+ * Loads the appropriate platform-specific binary with automatic fallback:
+ * 1. Try requested GPU variant (if specified)
+ * 2. Fall back to default (CPU) platform package
+ * 3. Fall back to local build (development: build/Release/lloyal.node)
+ *
+ * Use this for advanced scenarios where you need direct binary access
+ * or want to check variant availability before creating a context.
+ *
+ * @param variant GPU variant: 'cuda', 'vulkan', or undefined for CPU
+ * @returns Native binary module with createContext method
+ * @throws Error if no binary available for the current platform
+ *
+ * @example
+ * ```typescript
+ * // Load default (CPU) binary
+ * const binary = loadBinary();
+ *
+ * // Load CUDA binary (falls back to CPU if unavailable)
+ * const binary = loadBinary('cuda');
+ *
+ * // Create context from loaded binary
+ * const ctx = await binary.createContext({ modelPath: './model.gguf' });
+ * ```
+ */
+ export function loadBinary(variant?: GpuVariant): {
+   createContext(options: ContextOptions): Promise<SessionContext>;
+ };

  /**
  * Safe logits access with automatic lifetime management
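A hedged sketch of the direct-binary-access scenario loadBinary() is documented for; note that per the fallback order above, an unavailable variant falls back with a console warning rather than throwing, so the try/catch only guards the case where no binary exists for the platform at all:

```typescript
import { loadBinary } from '@lloyal-labs/lloyal.node';

let binary: ReturnType<typeof loadBinary>;
try {
  // Resolution order per the docs: requested variant → CPU package → local build.
  binary = loadBinary('cuda');
} catch (err) {
  throw new Error(`no lloyal.node binary available for this platform: ${err}`);
}

const ctx = await binary.createContext({ modelPath: './model.gguf' });
```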
@@ -1386,3 +1391,140 @@ export function withLogits<T>(
  ctx: SessionContext,
  fn: (logits: Float32Array) => T
  ): T;
+
+ /**
+ * Result from Branch.produce()
+ */
+ export interface Produced {
+ /** Sampled token ID */
+ token: number;
+ /** Text representation of the token */
+ text: string;
+ /** Whether this is a stop token (EOS) */
+ isStop: boolean;
+ }
+
+ /**
+ * Forkable inference handle for covalent generation
+ *
+ * A Branch owns everything needed for independent generation: a KV cache
+ * sequence, sampler chain, logits snapshot, and perplexity tracker.
+ *
+ * Forking is cheap: the KV prefix is shared in memory (a metadata-only
+ * operation under unified KV; no KV tensor buffers are copied), so sibling
+ * branches read from the same physical KV entries. Only tokens decoded
+ * after the fork point are exclusive to each branch.
+ *
+ * Branches form trees, not just flat lists. Fork from root for best-of-N,
+ * fork from children for MCTS/beam search, fork from a draft for speculative
+ * decoding.
+ *
+ * The produce/commit protocol separates sampling from state advancement:
+ * produce() samples without writing to KV, letting you inspect the result
+ * before deciding to commit().
+ *
+ * @example Best-of-N with perplexity selection
+ * ```typescript
+ * const root = Branch.create(ctx, 0, tokens.length, { temperature: 0.8 });
+ * root.captureLogits();
+ *
+ * const candidates = [1, 2, 3, 4, 5].map((seqId, i) => {
+ *   const branch = root.fork(seqId);
+ *   branch.reseedSampler(1000 + i);
+ *   return branch;
+ * });
+ *
+ * for (let t = 0; t < 50; t++) {
+ *   for (const branch of candidates) {
+ *     const { token, isStop } = branch.produce();
+ *     if (isStop) continue;
+ *     branch.commit(token);
+ *   }
+ * }
+ *
+ * const best = candidates.reduce((a, b) => a.perplexity < b.perplexity ? a : b);
+ * for (const c of candidates) { if (c !== best) c.prune(); }
+ * ```
+ */
+ export class Branch {
+ /**
+ * Create a root branch at the given position
+ *
+ * The branch takes ownership of the sequence and creates its own sampler
+ * chain from the provided params. Call captureLogits() after prefill to
+ * freeze the logit distribution before forking.
+ *
+ * @param ctx SessionContext to create branch on
+ * @param seqId Sequence ID for this branch
+ * @param position Starting position (typically prompt token count)
+ * @param params Sampling parameters (temperature, topP, etc.)
+ */
+ static create(
+   ctx: SessionContext,
+   seqId: number,
+   position: number,
+   params?: SamplingParams
+ ): Branch;
+
+ /**
+ * Fork this branch to a new sequence
+ *
+ * The child shares the parent's KV prefix in memory (metadata-only under
+ * unified KV; no KV buffer copy). Logits, sampler state, and perplexity
+ * tracker are cloned so the child can diverge independently. Fork from any
+ * branch, root or intermediate, to build arbitrarily deep trees.
+ *
+ * @param newSeqId Sequence ID for the forked branch
+ */
+ fork(newSeqId: number): Branch;
+
+ /** Freeze the current logit distribution into this branch. Essential before fork(). */
+ captureLogits(): void;
+
+ /** Decode a single token, write to KV, and capture resulting logits */
+ decodeAndCaptureOne(token: number): void;
+
+ /** Sample next token from branch's frozen logits snapshot */
+ sample(): number;
+
+ /** Accept token for repeat-penalty tracking */
+ accept(token: number): void;
+
+ /** Discard branch: remove its divergent KV entries and free the handle (use for losers) */
+ prune(): void;
+
+ /** Release handle but keep KV entries intact (use for winners, continue with raw ops) */
+ destroy(): void;
+
+ /**
+ * Reseed the sampler's PRNG for diversity after fork()
+ *
+ * CRITICAL for parallel generation: Without reseeding, all forked branches
+ * produce identical outputs because they share the same PRNG state.
+ *
+ * Only affects stochastic samplers (temperature > 0). Greedy samplers are unchanged.
+ *
+ * @param seed - New seed for the PRNG
+ */
+ reseedSampler(seed: number): void;
+
+ /** Sample next token without advancing state. Inspect before committing. */
+ produce(): Produced;
+
+ /** Accept and advance: write token to KV and update branch state. */
+ commit(token: number): void;
+
+ /** Branch's sequence ID */
+ readonly seqId: number;
+
+ /** Branch's current position */
+ readonly position: number;
+
+ /** Branch's perplexity */
+ readonly perplexity: number;
+
+ /** Internal handle (for debugging) */
+ readonly handle: number;
+
+ /** Whether this branch has been disposed */
+ readonly disposed: boolean;
+ }
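To complement the best-of-N example above, a hedged sketch of the produce/commit protocol on a single branch, inspecting each sampled token before it becomes permanent; `promptTokens`, the stop-string check, and the 200-token cap are illustrative:

```typescript
// Inspect-before-advance: produce() samples from the frozen logits without
// writing to KV; only commit() decodes the token and advances the branch.
const branch = Branch.create(ctx, 0, promptTokens.length, { temperature: 0.7 });
branch.captureLogits(); // freeze prefill logits before sampling

let out = '';
for (let i = 0; i < 200; i++) {
  const { token, text, isStop } = branch.produce(); // no state advanced yet
  if (isStop || (out + text).includes('###')) break; // reject without committing
  branch.commit(token); // write to KV, update sampler state, advance position
  out += text;
}
branch.destroy(); // keep KV entries for reuse; free only the handle
```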