@lloyal-labs/lloyal.node 1.0.9 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/Branch.js CHANGED
@@ -65,10 +65,15 @@ class Branch {
    * @param {number} seqId - Sequence ID for this branch
    * @param {number} position - Starting position (typically prompt token count)
    * @param {SamplingParams} [params] - Sampling parameters (temperature, topP, etc.)
+   * @param {number} [nBatch] - Per-branch batch size override (defaults to context nBatch).
+   * Controls chunk size for prefill() (decode_and_capture_batch). Has no effect on
+   * single-token commit(), which uses a zero-allocation fast path. Useful for tuning
+   * the memory/throughput tradeoff on bulk token decode — e.g. smaller nBatch for cheap
+   * exploration branches, larger for the trunk.
    * @returns {Branch} New Branch instance
    */
-  static create(ctx, seqId, position, params) {
-    const handle = ctx._branchCreate(seqId, position, params);
+  static create(ctx, seqId, position, params, nBatch) {
+    const handle = ctx._branchCreate(seqId, position, params, nBatch);
     return new Branch(ctx, handle);
   }
 
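A minimal usage sketch of the new per-branch override, assuming an already-constructed SessionContext `ctx` and a prompt token count `promptLen` (neither shown in this diff); the sampling fields follow the temperature/topP naming mentioned in the @param docs:

import { Branch } from "@lloyal-labs/lloyal.node";

// Trunk branch: large nBatch for fast bulk prefill of the main continuation.
const trunk = Branch.create(ctx, /* seqId */ 0, promptLen, { temperature: 0.7 }, 512);

// Cheap exploration branch: small nBatch trades prefill throughput for
// lower peak batch memory, per the tradeoff described in the @param docs.
const probe = Branch.create(ctx, /* seqId */ 1, promptLen, { temperature: 1.2 }, 64);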
@@ -116,6 +121,32 @@ class Branch {
     this._ctx._branchDecodeAndCaptureOne(this._handle, token);
   }
 
+  /**
+   * Bulk-decode tokens into the branch's KV cache and capture logits
+   *
+   * Feeds an array of tokens through the model. tokens.length is the total
+   * count to process; the branch's nBatch (set at Branch.create) controls
+   * how many are sent per llama_decode call. For example, 500 tokens with
+   * nBatch=64 makes 8 llama_decode calls (7×64 + 1×52). With nBatch=512
+   * it makes 1.
+   *
+   * Advances position by tokens.length and stores the final logits into
+   * the branch's internal snapshot. The next produce()/sample() call reads
+   * from that snapshot — logits never cross the JS boundary.
+   *
+   * Does NOT accept tokens into the sampler's repeat-penalty window — use
+   * this for external tokens (user input between turns), not model-generated
+   * tokens. For model output, use commit(), which does accept + decode.
+   *
+   * This is the branch-level equivalent of ctx.decode().
+   *
+   * @param {number[]} tokens - Token IDs to decode
+   */
+  prefill(tokens) {
+    this._ensureNotDisposed();
+    this._ctx._branchDecodeAndCaptureBatch(this._handle, tokens);
+  }
+
   /**
    * Sample next token from branch's logits snapshot
    *
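A hedged sketch of the prefill()/commit() split the doc comment above prescribes. `branch`, `tokenize`, `isEos`, and `maxNewTokens` are stand-ins for host-application code, and `commit(token)` taking the sampled token is assumed from the single-token accept + decode description:

// External tokens (a new user turn): prefill() advances position and
// refreshes the logits snapshot without entering the sampler's
// repeat-penalty window. Chunked internally by the branch's nBatch.
branch.prefill(tokenize("User: hello again")); // stand-in tokenizer

// Model-generated tokens: sample from the snapshot, then commit() each
// token, which both accepts it into sampler state and decodes it.
for (let i = 0; i < maxNewTokens; i++) {
  const token = branch.sample();
  if (isEos(token)) break; // stand-in end-of-sequence check
  branch.commit(token);    // assumed signature
}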
package/lib/index.d.ts CHANGED
@@ -1225,7 +1225,7 @@ export interface SessionContext {
   // ===== BRANCH API (internal, wrapped by Branch class) =====
 
   /** @internal Create a new branch for parallel generation */
-  _branchCreate(seqId: number, position: number, params?: SamplingParams): number;
+  _branchCreate(seqId: number, position: number, params?: SamplingParams, nBatch?: number): number;
 
   /** @internal Fork a branch to a new sequence */
   _branchFork(handle: number, newSeqId: number): number;
@@ -1236,6 +1236,9 @@ export interface SessionContext {
   /** @internal Decode a single token and capture logits */
   _branchDecodeAndCaptureOne(handle: number, token: number): void;
 
+  /** @internal Decode multiple tokens in n_batch-sized chunks and capture logits */
+  _branchDecodeAndCaptureBatch(handle: number, tokens: number[]): void;
+
   /** @internal Sample next token from branch's logits snapshot */
   _branchSample(handle: number): number;
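For intuition about the n_batch-sized chunking named in the comment above, a standalone sketch of just the slicing arithmetic (illustrative only, not the native implementation):

// How many tokens each llama_decode call would receive for a given nBatch.
function chunkSizes(totalTokens: number, nBatch: number): number[] {
  const sizes: number[] = [];
  for (let off = 0; off < totalTokens; off += nBatch) {
    sizes.push(Math.min(nBatch, totalTokens - off));
  }
  return sizes;
}

chunkSizes(500, 64);  // [64, 64, 64, 64, 64, 64, 64, 52] -> 8 calls
chunkSizes(500, 512); // [500]                            -> 1 call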
 
@@ -1467,12 +1470,14 @@ export class Branch {
    * @param seqId Sequence ID for this branch
    * @param position Starting position (typically prompt token count)
    * @param params Sampling parameters (temperature, topP, etc.)
+   * @param nBatch Per-branch batch size override (defaults to context nBatch)
    */
  static create(
    ctx: SessionContext,
    seqId: number,
    position: number,
-   params?: SamplingParams
+   params?: SamplingParams,
+   nBatch?: number
  ): Branch;
 
  /**
@@ -1493,6 +1498,27 @@ export class Branch {
   /** Decode a single token, write to KV, and capture resulting logits */
   decodeAndCaptureOne(token: number): void;
 
+  /**
+   * Bulk-decode tokens into the branch's KV cache and capture logits.
+   *
+   * `tokens.length` is the total count to process; the branch's `nBatch`
+   * (set at `Branch.create`) controls how many are sent per `llama_decode`
+   * call. E.g. 500 tokens with `nBatch=64` → 8 calls (7×64 + 1×52).
+   *
+   * Advances `position` by `tokens.length`. Stores the final logits into
+   * the branch's internal snapshot — the next `produce()`/`sample()` reads
+   * from it.
+   *
+   * Does NOT accept tokens into the repeat-penalty window — use for
+   * external tokens (user input between turns), not model-generated tokens.
+   * For model output, use `commit()`, which does accept + decode.
+   *
+   * Branch-level equivalent of `ctx.decode()`.
+   *
+   * @param tokens - Token IDs to decode
+   */
+  prefill(tokens: number[]): void;
+
   /** Sample next token from branch's frozen logits snapshot */
   sample(): number;
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@lloyal-labs/lloyal.node",
-  "version": "1.0.9",
+  "version": "1.1.0",
   "description": "Node.js client for liblloyal+llama.cpp",
   "main": "lib/index.js",
   "types": "lib/index.d.ts",
@@ -54,19 +54,19 @@
     "typedoc-rhineai-theme": "^1.2.0"
   },
   "optionalDependencies": {
-    "@lloyal-labs/lloyal.node-darwin-arm64": "1.0.9",
-    "@lloyal-labs/lloyal.node-darwin-x64": "1.0.9",
-    "@lloyal-labs/lloyal.node-linux-arm64": "1.0.9",
-    "@lloyal-labs/lloyal.node-linux-arm64-cuda": "1.0.9",
-    "@lloyal-labs/lloyal.node-linux-arm64-vulkan": "1.0.9",
-    "@lloyal-labs/lloyal.node-linux-x64": "1.0.9",
-    "@lloyal-labs/lloyal.node-linux-x64-cuda": "1.0.9",
-    "@lloyal-labs/lloyal.node-linux-x64-vulkan": "1.0.9",
-    "@lloyal-labs/lloyal.node-win32-arm64": "1.0.9",
-    "@lloyal-labs/lloyal.node-win32-arm64-vulkan": "1.0.9",
-    "@lloyal-labs/lloyal.node-win32-x64": "1.0.9",
-    "@lloyal-labs/lloyal.node-win32-x64-cuda": "1.0.9",
-    "@lloyal-labs/lloyal.node-win32-x64-vulkan": "1.0.9"
+    "@lloyal-labs/lloyal.node-darwin-arm64": "1.1.0",
+    "@lloyal-labs/lloyal.node-darwin-x64": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-arm64": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-arm64-cuda": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-arm64-vulkan": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-x64": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-x64-cuda": "1.1.0",
+    "@lloyal-labs/lloyal.node-linux-x64-vulkan": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-arm64": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-arm64-vulkan": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-x64": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-x64-cuda": "1.1.0",
+    "@lloyal-labs/lloyal.node-win32-x64-vulkan": "1.1.0"
   },
   "engines": {
     "node": ">=22.0.0"