npm - @lloyal-labs/lloyal.node - Versions diffs - 1.0.8 → 1.1.0 - Mend

@lloyal-labs/lloyal.node 1.0.8 → 1.1.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/lib/Branch.js CHANGED Viewed

@@ -65,10 +65,15 @@ class Branch {
    * @param {number} seqId - Sequence ID for this branch
    * @param {number} position - Starting position (typically prompt token count)
    * @param {SamplingParams} [params] - Sampling parameters (temperature, topP, etc.)
+   * @param {number} [nBatch] - Per-branch batch size override (defaults to context nBatch).
+   *   Controls chunk size for prefill() (decode_and_capture_batch). Has no effect on
+   *   single-token commit() which uses a zero-allocation fast path. Useful for tuning
+   *   memory/throughput tradeoff on bulk token decode — e.g. smaller nBatch for cheap
+   *   exploration branches, larger for the trunk.
    * @returns {Branch} New Branch instance
    */
-  static create(ctx, seqId, position, params) {
-    const handle = ctx._branchCreate(seqId, position, params);
+  static create(ctx, seqId, position, params, nBatch) {
+    const handle = ctx._branchCreate(seqId, position, params, nBatch);
     return new Branch(ctx, handle);
   }
@@ -116,6 +121,32 @@ class Branch {
     this._ctx._branchDecodeAndCaptureOne(this._handle, token);
   }
+  /**
+   * Bulk-decode tokens into the branch's KV cache and capture logits
+   *
+   * Feeds an array of tokens through the model. tokens.length is the total
+   * count to process; the branch's nBatch (set at Branch.create) controls
+   * how many are sent per llama_decode call. For example, 500 tokens with
+   * nBatch=64 makes 8 llama_decode calls (7×64 + 1×52). With nBatch=512
+   * it makes 1.
+   *
+   * Advances position by tokens.length and stores the final logits into
+   * the branch's internal snapshot. The next produce()/sample() call reads
+   * from that snapshot — logits never cross the JS boundary.
+   *
+   * Does NOT accept tokens into the sampler's repeat-penalty window — use
+   * this for external tokens (user input between turns), not model-generated
+   * tokens. For model output, use commit() which does accept + decode.
+   *
+   * This is the branch-level equivalent of ctx.decode().
+   *
+   * @param {number[]} tokens - Token IDs to decode
+   */
+  prefill(tokens) {
+    this._ensureNotDisposed();
+    this._ctx._branchDecodeAndCaptureBatch(this._handle, tokens);
+  }
   /**
    * Sample next token from branch's logits snapshot
    *

package/lib/index.d.ts CHANGED Viewed

@@ -73,6 +73,16 @@ export interface ContextOptions {
   /** Number of threads (default: 4) */
   nThreads?: number;
+  /**
+   * Batch size for token processing
+   *
+   * Controls how many tokens are processed per llama_decode call.
+   * Higher values improve throughput for prompt prefill at the cost of memory.
+   * Also sets llama_context_params.n_batch and n_ubatch at context creation.
+   * Default: 512
+   */
+  nBatch?: number;
   /**
    * Enable embedding extraction mode
    *
@@ -1215,7 +1225,7 @@ export interface SessionContext {
   // ===== BRANCH API (internal, wrapped by Branch class) =====
   /** @internal Create a new branch for parallel generation */
-  _branchCreate(seqId: number, position: number, params?: SamplingParams): number;
+  _branchCreate(seqId: number, position: number, params?: SamplingParams, nBatch?: number): number;
   /** @internal Fork a branch to a new sequence */
   _branchFork(handle: number, newSeqId: number): number;
@@ -1226,6 +1236,9 @@ export interface SessionContext {
   /** @internal Decode a single token and capture logits */
   _branchDecodeAndCaptureOne(handle: number, token: number): void;
+  /** @internal Decode multiple tokens in n_batch-sized chunks and capture logits */
+  _branchDecodeAndCaptureBatch(handle: number, tokens: number[]): void;
   /** @internal Sample next token from branch's logits snapshot */
   _branchSample(handle: number): number;
@@ -1457,12 +1470,14 @@ export class Branch {
    * @param seqId Sequence ID for this branch
    * @param position Starting position (typically prompt token count)
    * @param params Sampling parameters (temperature, topP, etc.)
+   * @param nBatch Per-branch batch size override (defaults to context nBatch)
    */
   static create(
     ctx: SessionContext,
     seqId: number,
     position: number,
-    params?: SamplingParams
+    params?: SamplingParams,
+    nBatch?: number
   ): Branch;
   /**
@@ -1483,6 +1498,27 @@ export class Branch {
   /** Decode a single token, write to KV, and capture resulting logits */
   decodeAndCaptureOne(token: number): void;
+  /**
+   * Bulk-decode tokens into the branch's KV cache and capture logits.
+   *
+   * `tokens.length` is the total count to process; the branch's `nBatch`
+   * (set at `Branch.create`) controls how many are sent per `llama_decode`
+   * call. E.g. 500 tokens with `nBatch=64` → 8 calls (7×64 + 1×52).
+   *
+   * Advances `position` by `tokens.length`. Stores final logits into the
+   * branch's internal snapshot — the next `produce()`/`sample()` reads
+   * from it.
+   *
+   * Does NOT accept tokens into the repeat-penalty window — use this for
+   * external tokens (user input between turns), not model-generated tokens.
+   * For model output, use `commit()` which does accept + decode.
+   *
+   * Branch-level equivalent of `ctx.decode()`.
+   *
+   * @param tokens - Token IDs to decode
+   */
+  prefill(tokens: number[]): void;
   /** Sample next token from branch's frozen logits snapshot */
   sample(): number;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lloyal-labs/lloyal.node",
-  "version": "1.0.8",
+  "version": "1.1.0",
   "description": "Node.js client for liblloyal+llama.cpp",
   "main": "lib/index.js",
   "types": "lib/index.d.ts",
@@ -54,19 +54,19 @@
     "typedoc-rhineai-theme": "^1.2.0"
   },
   "optionalDependencies": {
-    "@lloyal-labs/lloyal.node-darwin-arm64": "1.0.8",
-    "@lloyal-labs/lloyal.node-darwin-x64": "1.0.8",
-    "@lloyal-labs/lloyal.node-linux-arm64": "1.0.8",
-    "@lloyal-labs/lloyal.node-linux-arm64-cuda": "1.0.8",
-    "@lloyal-labs/lloyal.node-linux-arm64-vulkan": "1.0.8",
-    "@lloyal-labs/lloyal.node-linux-x64": "1.0.8",
-    "@lloyal-labs/lloyal.node-linux-x64-cuda": "1.0.8",
-    "@lloyal-labs/lloyal.node-linux-x64-vulkan": "1.0.8",
-    "@lloyal-labs/lloyal.node-win32-arm64": "1.0.8",
-    "@lloyal-labs/lloyal.node-win32-arm64-vulkan": "1.0.8",
-    "@lloyal-labs/lloyal.node-win32-x64": "1.0.8",
-    "@lloyal-labs/lloyal.node-win32-x64-cuda": "1.0.8",
-    "@lloyal-labs/lloyal.node-win32-x64-vulkan": "1.0.8"
+    "@lloyal-labs/lloyal.node-darwin-arm64": "1.1.0",
+    "@lloyal-labs/lloyal.node-darwin-x64": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-arm64": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-arm64-cuda": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-arm64-vulkan": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-x64": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-x64-cuda": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-x64-vulkan": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-arm64": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-arm64-vulkan": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-x64": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-x64-cuda": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-x64-vulkan": "1.1.0"
   },
   "engines": {
     "node": ">=22.0.0"