@lloyal-labs/lloyal.node 1.0.8 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/Branch.js +33 -2
- package/lib/index.d.ts +38 -2
- package/package.json +14 -14
package/lib/Branch.js
CHANGED
|
@@ -65,10 +65,15 @@ class Branch {
|
|
|
65
65
|
* @param {number} seqId - Sequence ID for this branch
|
|
66
66
|
* @param {number} position - Starting position (typically prompt token count)
|
|
67
67
|
* @param {SamplingParams} [params] - Sampling parameters (temperature, topP, etc.)
|
|
68
|
+
* @param {number} [nBatch] - Per-branch batch size override (defaults to context nBatch).
|
|
69
|
+
* Controls chunk size for prefill() (decode_and_capture_batch). Has no effect on
|
|
70
|
+
* single-token commit() which uses a zero-allocation fast path. Useful for tuning
|
|
71
|
+
* memory/throughput tradeoff on bulk token decode — e.g. smaller nBatch for cheap
|
|
72
|
+
* exploration branches, larger for the trunk.
|
|
68
73
|
* @returns {Branch} New Branch instance
|
|
69
74
|
*/
|
|
70
|
-
static create(ctx, seqId, position, params) {
|
|
71
|
-
const handle = ctx._branchCreate(seqId, position, params);
|
|
75
|
+
static create(ctx, seqId, position, params, nBatch) {
|
|
76
|
+
const handle = ctx._branchCreate(seqId, position, params, nBatch);
|
|
72
77
|
return new Branch(ctx, handle);
|
|
73
78
|
}
|
|
74
79
|
|
|
@@ -116,6 +121,32 @@ class Branch {
|
|
|
116
121
|
this._ctx._branchDecodeAndCaptureOne(this._handle, token);
|
|
117
122
|
}
|
|
118
123
|
|
|
124
|
+
/**
|
|
125
|
+
* Bulk-decode tokens into the branch's KV cache and capture logits
|
|
126
|
+
*
|
|
127
|
+
* Feeds an array of tokens through the model. tokens.length is the total
|
|
128
|
+
* count to process; the branch's nBatch (set at Branch.create) controls
|
|
129
|
+
* how many are sent per llama_decode call. For example, 500 tokens with
|
|
130
|
+
* nBatch=64 makes 8 llama_decode calls (7×64 + 1×52). With nBatch=512
|
|
131
|
+
* it makes 1.
|
|
132
|
+
*
|
|
133
|
+
* Advances position by tokens.length and stores the final logits into
|
|
134
|
+
* the branch's internal snapshot. The next produce()/sample() call reads
|
|
135
|
+
* from that snapshot — logits never cross the JS boundary.
|
|
136
|
+
*
|
|
137
|
+
* Does NOT accept tokens into the sampler's repeat-penalty window — use
|
|
138
|
+
* this for external tokens (user input between turns), not model-generated
|
|
139
|
+
* tokens. For model output, use commit() which does accept + decode.
|
|
140
|
+
*
|
|
141
|
+
* This is the branch-level equivalent of ctx.decode().
|
|
142
|
+
*
|
|
143
|
+
* @param {number[]} tokens - Token IDs to decode
|
|
144
|
+
*/
|
|
145
|
+
prefill(tokens) {
|
|
146
|
+
this._ensureNotDisposed();
|
|
147
|
+
this._ctx._branchDecodeAndCaptureBatch(this._handle, tokens);
|
|
148
|
+
}
|
|
149
|
+
|
|
119
150
|
/**
|
|
120
151
|
* Sample next token from branch's logits snapshot
|
|
121
152
|
*
|
package/lib/index.d.ts
CHANGED
|
@@ -73,6 +73,16 @@ export interface ContextOptions {
|
|
|
73
73
|
/** Number of threads (default: 4) */
|
|
74
74
|
nThreads?: number;
|
|
75
75
|
|
|
76
|
+
/**
|
|
77
|
+
* Batch size for token processing
|
|
78
|
+
*
|
|
79
|
+
* Controls how many tokens are processed per llama_decode call.
|
|
80
|
+
* Higher values improve throughput for prompt prefill at the cost of memory.
|
|
81
|
+
* Also sets llama_context_params.n_batch and n_ubatch at context creation.
|
|
82
|
+
* Default: 512
|
|
83
|
+
*/
|
|
84
|
+
nBatch?: number;
|
|
85
|
+
|
|
76
86
|
/**
|
|
77
87
|
* Enable embedding extraction mode
|
|
78
88
|
*
|
|
@@ -1215,7 +1225,7 @@ export interface SessionContext {
|
|
|
1215
1225
|
// ===== BRANCH API (internal, wrapped by Branch class) =====
|
|
1216
1226
|
|
|
1217
1227
|
/** @internal Create a new branch for parallel generation */
|
|
1218
|
-
_branchCreate(seqId: number, position: number, params?: SamplingParams): number;
|
|
1228
|
+
_branchCreate(seqId: number, position: number, params?: SamplingParams, nBatch?: number): number;
|
|
1219
1229
|
|
|
1220
1230
|
/** @internal Fork a branch to a new sequence */
|
|
1221
1231
|
_branchFork(handle: number, newSeqId: number): number;
|
|
@@ -1226,6 +1236,9 @@ export interface SessionContext {
|
|
|
1226
1236
|
/** @internal Decode a single token and capture logits */
|
|
1227
1237
|
_branchDecodeAndCaptureOne(handle: number, token: number): void;
|
|
1228
1238
|
|
|
1239
|
+
/** @internal Decode multiple tokens in n_batch-sized chunks and capture logits */
|
|
1240
|
+
_branchDecodeAndCaptureBatch(handle: number, tokens: number[]): void;
|
|
1241
|
+
|
|
1229
1242
|
/** @internal Sample next token from branch's logits snapshot */
|
|
1230
1243
|
_branchSample(handle: number): number;
|
|
1231
1244
|
|
|
@@ -1457,12 +1470,14 @@ export class Branch {
|
|
|
1457
1470
|
* @param seqId Sequence ID for this branch
|
|
1458
1471
|
* @param position Starting position (typically prompt token count)
|
|
1459
1472
|
* @param params Sampling parameters (temperature, topP, etc.)
|
|
1473
|
+
* @param nBatch Per-branch batch size override (defaults to context nBatch)
|
|
1460
1474
|
*/
|
|
1461
1475
|
static create(
|
|
1462
1476
|
ctx: SessionContext,
|
|
1463
1477
|
seqId: number,
|
|
1464
1478
|
position: number,
|
|
1465
|
-
params?: SamplingParams
|
|
1479
|
+
params?: SamplingParams,
|
|
1480
|
+
nBatch?: number
|
|
1466
1481
|
): Branch;
|
|
1467
1482
|
|
|
1468
1483
|
/**
|
|
@@ -1483,6 +1498,27 @@ export class Branch {
|
|
|
1483
1498
|
/** Decode a single token, write to KV, and capture resulting logits */
|
|
1484
1499
|
decodeAndCaptureOne(token: number): void;
|
|
1485
1500
|
|
|
1501
|
+
/**
|
|
1502
|
+
* Bulk-decode tokens into the branch's KV cache and capture logits.
|
|
1503
|
+
*
|
|
1504
|
+
* `tokens.length` is the total count to process; the branch's `nBatch`
|
|
1505
|
+
* (set at `Branch.create`) controls how many are sent per `llama_decode`
|
|
1506
|
+
* call. E.g. 500 tokens with `nBatch=64` → 8 calls (7×64 + 1×52).
|
|
1507
|
+
*
|
|
1508
|
+
* Advances `position` by `tokens.length`. Stores final logits into the
|
|
1509
|
+
* branch's internal snapshot — the next `produce()`/`sample()` reads
|
|
1510
|
+
* from it.
|
|
1511
|
+
*
|
|
1512
|
+
* Does NOT accept tokens into the repeat-penalty window — for external
|
|
1513
|
+
* tokens (user input between turns), not model-generated tokens.
|
|
1514
|
+
* For model output, use `commit()` which does accept + decode.
|
|
1515
|
+
*
|
|
1516
|
+
* Branch-level equivalent of `ctx.decode()`.
|
|
1517
|
+
*
|
|
1518
|
+
* @param tokens - Token IDs to decode
|
|
1519
|
+
*/
|
|
1520
|
+
prefill(tokens: number[]): void;
|
|
1521
|
+
|
|
1486
1522
|
/** Sample next token from branch's frozen logits snapshot */
|
|
1487
1523
|
sample(): number;
|
|
1488
1524
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lloyal-labs/lloyal.node",
|
|
3
|
-
"version": "1.0.8",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Node.js client for liblloyal+llama.cpp",
|
|
5
5
|
"main": "lib/index.js",
|
|
6
6
|
"types": "lib/index.d.ts",
|
|
@@ -54,19 +54,19 @@
|
|
|
54
54
|
"typedoc-rhineai-theme": "^1.2.0"
|
|
55
55
|
},
|
|
56
56
|
"optionalDependencies": {
|
|
57
|
-
"@lloyal-labs/lloyal.node-darwin-arm64": "1.0.8",
|
|
58
|
-
"@lloyal-labs/lloyal.node-darwin-x64": "1.0.8",
|
|
59
|
-
"@lloyal-labs/lloyal.node-linux-arm64": "1.0.8",
|
|
60
|
-
"@lloyal-labs/lloyal.node-linux-arm64-cuda": "1.0.8",
|
|
61
|
-
"@lloyal-labs/lloyal.node-linux-arm64-vulkan": "1.0.8",
|
|
62
|
-
"@lloyal-labs/lloyal.node-linux-x64": "1.0.8",
|
|
63
|
-
"@lloyal-labs/lloyal.node-linux-x64-cuda": "1.0.8",
|
|
64
|
-
"@lloyal-labs/lloyal.node-linux-x64-vulkan": "1.0.8",
|
|
65
|
-
"@lloyal-labs/lloyal.node-win32-arm64": "1.0.8",
|
|
66
|
-
"@lloyal-labs/lloyal.node-win32-arm64-vulkan": "1.0.8",
|
|
67
|
-
"@lloyal-labs/lloyal.node-win32-x64": "1.0.8",
|
|
68
|
-
"@lloyal-labs/lloyal.node-win32-x64-cuda": "1.0.8",
|
|
69
|
-
"@lloyal-labs/lloyal.node-win32-x64-vulkan": "1.0.8"
|
|
57
|
+
"@lloyal-labs/lloyal.node-darwin-arm64": "1.1.0",
|
|
58
|
+
"@lloyal-labs/lloyal.node-darwin-x64": "1.1.0",
|
|
59
|
+
"@lloyal-labs/lloyal.node-linux-arm64": "1.1.0",
|
|
60
|
+
"@lloyal-labs/lloyal.node-linux-arm64-cuda": "1.1.0",
|
|
61
|
+
"@lloyal-labs/lloyal.node-linux-arm64-vulkan": "1.1.0",
|
|
62
|
+
"@lloyal-labs/lloyal.node-linux-x64": "1.1.0",
|
|
63
|
+
"@lloyal-labs/lloyal.node-linux-x64-cuda": "1.1.0",
|
|
64
|
+
"@lloyal-labs/lloyal.node-linux-x64-vulkan": "1.1.0",
|
|
65
|
+
"@lloyal-labs/lloyal.node-win32-arm64": "1.1.0",
|
|
66
|
+
"@lloyal-labs/lloyal.node-win32-arm64-vulkan": "1.1.0",
|
|
67
|
+
"@lloyal-labs/lloyal.node-win32-x64": "1.1.0",
|
|
68
|
+
"@lloyal-labs/lloyal.node-win32-x64-cuda": "1.1.0",
|
|
69
|
+
"@lloyal-labs/lloyal.node-win32-x64-vulkan": "1.1.0"
|
|
70
70
|
},
|
|
71
71
|
"engines": {
|
|
72
72
|
"node": ">=22.0.0"
|