@lloyal-labs/lloyal.node 1.0.6-alpha → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -153
- package/lib/Branch.js +268 -0
- package/lib/index.d.ts +182 -156
- package/lib/index.js +9 -2
- package/package.json +18 -15
- package/scripts/create-platform-package.js +3 -2
- package/scripts/download-test-models.sh +10 -0
- package/scripts/sync-llama-cpp.js +117 -0
package/README.md
CHANGED

@@ -1,14 +1,68 @@
 # lloyal.node
 
-**
+**Covalent inference for Node.js**
 
-
+Forkable inference state for llama.cpp — Branch a generation into a tree — prefix sharing is the bond across branches while each owns its own machinery (sampler chain, seed, grammar, logits snapshot, perplexity tracker) enabling controlled divergence at decode time.
+
+## Branch API
+
+Fork from root for best-of-N, fork from children for MCTS/beam search, fork from a draft for speculative decoding. The produce/commit protocol separates sampling from state advancement — sample without writing to KV, inspect the result, then decide whether to commit.
+
+```javascript
+import { createContext, Branch } from '@lloyal-labs/lloyal.node';
+
+const ctx = await createContext({ modelPath: './model.gguf', nSeqMax: 8 });
+const tokens = await ctx.tokenize('Once upon a time');
+await ctx.decode(tokens, 0, 0);
+
+// Create root branch, freeze logits from prefill
+const root = Branch.create(ctx, 0, tokens.length, { temperature: 0.8 });
+root.captureLogits();
+
+// Fork N candidates — KV prefix shared, sampler/grammar/logits/perplexity cloned
+const candidates = [1, 2, 3, 4, 5].map((seqId, i) => {
+  const branch = root.fork(seqId);
+  branch.reseedSampler(1000 + i);
+  return branch;
+});
+
+// Generate (interleaved round-robin)
+for (let t = 0; t < 50; t++) {
+  for (const branch of candidates) {
+    const { token, isStop } = branch.produce(); // Sample, no KV write
+    if (isStop) continue;
+    branch.commit(token); // Accept + forward pass + capture
+  }
+}
+
+// Select by perplexity, prune losers
+const best = candidates.reduce((a, b) => (a.perplexity < b.perplexity ? a : b));
+for (const c of candidates) {
+  if (c !== best) c.prune();
+}
+```
+
+**What `fork()` shares:** KV cache prefix (metadata-only under unified KV — no tensor buffers copied).
+
+**What `fork()` clones:** Logits snapshot, sampler chain (penalties + PRNG), grammar state, logit bias, perplexity tracker.
+
+**Key methods:**
+
+- `produce()` / `commit()` — two-phase: sample without KV write, then advance
+- `prune()` — discard loser and its divergent KV entries
+- `destroy()` — release handle, keep KV (for winners continuing with raw ops)
+- `reseedSampler()` — unique PRNG per fork for stochastic diversity
+- `perplexity` — rolling PPL per branch for quality-based selection
+
+---
+
 ## Install
 
 ```bash
 npm install @lloyal-labs/lloyal.node
 ```
 
-Prebuilt binaries for 13
+Prebuilt binaries for 13 platform/GPU combinations. GPU selection at runtime, not install time.
 
 | Platform | Arch  | Acceleration        |
 | -------- | ----- | ------------------- |
@@ -19,80 +73,32 @@ Prebuilt binaries for 13 platforms:
 | Windows  | x64   | CPU / CUDA / Vulkan |
 | Windows  | arm64 | CPU / Vulkan        |
 
-
+See [distribution.md](docs/distribution.md) for details.
 
 ---
 
 ## Examples
 
-
-
-
-|
-| [`
-| [`
-| [`
-| [`
-| [`
-| [`chat/`](./examples/chat/) | Interactive streaming chat |
-| [`embed/`](./examples/embed/) | Text embeddings extraction |
+| Example                                   | Pattern                                                                    |
+| ----------------------------------------- | -------------------------------------------------------------------------- |
+| [`best-of-n/`](./examples/best-of-n/)     | Branch API: fork, produce/commit, perplexity selection                     |
+| [`speculative/`](./examples/speculative/) | Branch API: draft/verify, fork/prune, bonus token sampling                 |
+| [`streaming/`](./examples/streaming/)     | Infinite context via BlinkKV reseeding with sidecar summarization          |
+| [`entropy/`](./examples/entropy/)         | `modelEntropy()` mid-generation as control signal                          |
+| [`grammar/`](./examples/grammar/)         | Pull loop with generators, JSON schema constraints, KV + grammar branching |
+| [`chat/`](./examples/chat/)               | Interactive streaming chat                                                 |
+| [`embed/`](./examples/embed/)             | Text embeddings extraction                                                 |
 
 ```bash
 node examples/best-of-n/best-of-n.mjs
 node examples/speculative/speculative.mjs
-node examples/entropy/entropy.mjs
-node examples/grammar/grammar.mjs
 ```
 
-Each example has a README explaining the pattern
+Each example has a README explaining the pattern.
 
 ---
 
-##
-
-### Forkable State
-
-KV cache, grammar parser, and perplexity trackers all live behind handles. Handles clone atomically.
-
-**Two forking strategies:**
-
-| Approach             | Method                            | Use Case                                     |
-| -------------------- | --------------------------------- | -------------------------------------------- |
-| **Tag copy**         | `kvSeqCopy(src, dst)`             | Parallel branches with different seqIds      |
-| **Snapshot/restore** | `kvCacheSave()` / `kvCacheLoad()` | Sequential exploration, return to checkpoint |
-
-[`examples/best-of-n/`](./examples/best-of-n/) uses tag copy — each candidate gets its own seqId, branches run in parallel:
-
-```javascript
-ctx.kvSeqCopy(0, seqId); // O(1) tag copy, branch diverges on seqId
-```
-
-[`examples/grammar/`](./examples/grammar/) uses snapshot/restore — save state, explore branches sequentially, restore between each:
-
-```javascript
-const snapshot = await ctx.kvCacheSave(0); // Save checkpoint
-// ... explore branch ...
-await ctx.kvCacheLoad(0, snapshot); // Return to checkpoint
-```
-
-Both approaches also fork grammar state with `cloneSampler()` when grammar constraints are involved.
-
-### Captured Logits
-
-After decode, logits represent P(next_token | context). When forking to multiple sequences, capture logits for fair comparison:
-
-```javascript
-// Capture after prefill
-const capturedLogits = new Float32Array(ctx.getLogits());
-
-// All candidates sample first token from same distribution
-const token = sampleWithStrategy(capturedLogits, { params, workspace, prng });
-
-// Compute surprisal from captured logits (native C++)
-const surprisal = ctx.modelSurprisal(token, 'nats', capturedLogits);
-```
-
-See [`examples/best-of-n/`](./examples/best-of-n/) for the full pattern.
+## Other Patterns
 
 ### Entropy as Control Signal
 
@@ -109,26 +115,21 @@ if (entropy > 4.0) {
 
 See [`examples/entropy/`](./examples/entropy/) for entropy-triggered sampling strategies.
 
-###
+### Low-Level KV Operations
 
-For
+For fine-grained control without Branch:
 
-
-
-
-
-ctx.applySampler(grammarHandle, logits);
-const token = ctx.sample({ temperature: 0.7 });
-if (ctx.isStopToken(token)) return;
-ctx.acceptSamplerToken(grammarHandle, token);
-yield { token, text: ctx.tokenToText(token) };
-}
-}
+| Approach             | Method                            | Use Case                                     |
+| -------------------- | --------------------------------- | -------------------------------------------- |
+| **Sequence copy**    | `kvSeqCopy(src, dst)`             | Share prefix across sequences                |
+| **Snapshot/restore** | `kvCacheSave()` / `kvCacheLoad()` | Sequential exploration, return to checkpoint |
 
-
-
-
-
+### Grammar-Constrained Generation
+
+```javascript
+const grammar = ctx.jsonSchemaToGrammar(schema);
+const handle = ctx.createSampler(grammar);
+// Pull loop — consumer controls pace, can branch at any point
 ```
 
 See [`examples/grammar/`](./examples/grammar/) for the full pull loop pattern.
@@ -137,80 +138,9 @@ See [`examples/grammar/`](./examples/grammar/) for the full pull loop pattern.
 
 ## API Reference
 
-
-
-```typescript
-const ctx = await createContext({
-  modelPath: string,      // Path to .gguf file (required)
-  nCtx?: number,          // Context size (default: 2048)
-  nThreads?: number,      // CPU threads (default: 4)
-  embeddings?: boolean,   // Enable embedding mode (default: false)
-  poolingType?: number,   // 0=NONE, 1=MEAN, 2=CLS, 3=LAST
-  nSeqMax?: number,       // Max parallel sequences (default: 1)
-});
-```
+Full API documentation: **[lloyal-ai.github.io/lloyal.node](https://lloyal-ai.github.io/lloyal.node/)**
 
-
-
-| Method                        | Returns             | Description                     |
-| ----------------------------- | ------------------- | ------------------------------- |
-| `tokenize(text)`              | `Promise<number[]>` | Text → token IDs                |
-| `detokenize(tokens)`          | `Promise<string>`   | Token IDs → text                |
-| `tokenToText(token)`          | `string`            | Single token → text (streaming) |
-| `decode(tokens, pos, seqId?)` | `Promise<void>`     | Forward pass, updates KV cache  |
-| `sample(params?)`             | `number`            | Sample next token               |
-| `isStopToken(token)`          | `boolean`           | Check for EOS token             |
-| `getLogits()`                 | `Float32Array`      | Raw logits (zero-copy view)     |
-
-### KV Cache
-
-| Method                             | Returns           | Description                    |
-| ---------------------------------- | ----------------- | ------------------------------ |
-| `kvCacheSize(seqId?)`              | `number`          | Tokens in cache                |
-| `kvCacheClear()`                   | `Promise<void>`   | Clear all sequences            |
-| `kvCacheRemove(seqId, start, end)` | `Promise<void>`   | Remove token range             |
-| `kvCacheSave(seqId?)`              | `Promise<Buffer>` | Snapshot state                 |
-| `kvCacheLoad(seqId, state)`        | `Promise<void>`   | Restore state                  |
-| `kvSeqCopy(src, dst)`              | `void`            | Copy sequence (tag copy, O(1)) |
-| `kvSeqKeep(seqId)`                 | `void`            | Keep only one sequence         |
-| `clearAndReseed(sinks, tail)`      | `Promise<void>`   | BlinkKV pattern                |
-
-### Grammar (Handle-Based)
-
-| Method                           | Returns  | Description                 |
-| -------------------------------- | -------- | --------------------------- |
-| `jsonSchemaToGrammar(schema)`    | `string` | Schema → GBNF               |
-| `createSampler(grammarStr)`      | `number` | Create grammar handle       |
-| `cloneSampler(handle)`           | `number` | Clone grammar state         |
-| `applySampler(handle, logits)`   | `void`   | Apply constraints to logits |
-| `acceptSamplerToken(handle, id)` | `void`   | Advance parser state        |
-| `freeSamplerHandle(handle)`      | `void`   | Release grammar handle      |
-
-### Metrics
-
-| Method                                  | Returns         | Description                                |
-| --------------------------------------- | --------------- | ------------------------------------------ |
-| `modelEntropy(base?, logits?)`          | `number`        | Distribution entropy (bits/nats)           |
-| `modelSurprisal(token, base?, logits?)` | `number`        | Token surprisal (supports captured logits) |
-| `createPerplexityTracker()`             | `TrackerHandle` | Create tracker (forkable)                  |
-| `clonePerplexityTracker(handle)`        | `TrackerHandle` | Clone tracker state                        |
-| `addSurprisal(handle, value)`           | `void`          | Add to tracker                             |
-| `getPerplexity(handle)`                 | `number`        | Get current PPL                            |
-| `freePerplexityTracker(handle)`         | `void`          | Release tracker                            |
-
-### Embeddings
-
-| Method                      | Returns         | Description                 |
-| --------------------------- | --------------- | --------------------------- |
-| `encode(tokens)`            | `Promise<void>` | Forward pass for embeddings |
-| `getEmbeddings(normalize?)` | `Float32Array`  | Extract embedding vector    |
-| `getEmbeddingDimension()`   | `number`        | Vector dimension            |
-
-### Lifecycle
-
-| Method      | Description                          |
-| ----------- | ------------------------------------ |
-| `dispose()` | Free native resources (**required**) |
+Generated from [`lib/index.d.ts`](./lib/index.d.ts) with TypeDoc.
 
 ---
 
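Two of the patterns the new README documents only in prose are easy to miss in the diff: the `destroy()` path for a winning branch, and the snapshot/restore row that replaced the old `kvCacheSave()` example. Rough sketches of both follow, assuming the `ctx`/`best` variables and sequence layout from the README's best-of-N snippet; neither is code shipped in the package.

```javascript
// Winner continuation after pruning — assumes `ctx` and `best` from the README's
// best-of-N snippet and an ES module (top-level await).
const seqId = best.seqId;
let pos = best.position;

let { token, isStop } = best.produce(); // last sample from the branch's own snapshot
best.destroy();                         // release the handle, keep this sequence's KV

while (!isStop) {
  process.stdout.write(ctx.tokenToText(token));
  await ctx.decode([token], pos, seqId); // forward pass refreshes the context logits
  pos += 1;
  token = ctx.sample({ temperature: 0.8 });
  isStop = ctx.isStopToken(token);
}
```

```javascript
// Snapshot/restore checkpointing — assumes a prompt already decoded on sequence 0.
const checkpoint = await ctx.kvCacheSave(0); // snapshot sequence 0 as a Buffer

// ...explore one continuation with decode()/sample() on sequence 0...

await ctx.kvCacheLoad(0, checkpoint);        // rewind to the checkpoint
// ...explore an alternative continuation from the same state...
```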
package/lib/Branch.js
ADDED

@@ -0,0 +1,268 @@
+/**
+ * Branch - Forkable inference handle for covalent generation
+ *
+ * A Branch owns everything needed for independent generation: a KV cache
+ * sequence, sampler chain, logits snapshot, and perplexity tracker.
+ *
+ * Forking is cheap — the KV prefix is shared in memory (metadata-only operation under unified KV —
+ * no KV tensor buffers are copied), so sibling branches read from the same physical KV entries.
+ * Only tokens decoded after the fork point are exclusive to each branch.
+ * This is the covalent property: branches share a bond (common prefix)
+ * while diverging independently.
+ *
+ * Branches form trees, not just flat lists. Fork from root for best-of-N,
+ * fork from children for MCTS/beam search, fork from a draft for speculative
+ * decoding.
+ *
+ * The produce/commit protocol separates sampling from state advancement:
+ * produce() samples without writing to KV, letting you inspect the result
+ * before deciding to commit(). This two-phase split is what makes speculative
+ * verification and tree search natural.
+ *
+ * @example Best-of-N with perplexity selection
+ * ```js
+ * const root = Branch.create(ctx, 0, tokens.length, { temperature: 0.8 });
+ * root.captureLogits();
+ *
+ * const candidates = [1, 2, 3, 4, 5].map((seqId, i) => {
+ *   const branch = root.fork(seqId);
+ *   branch.reseedSampler(1000 + i);
+ *   return branch;
+ * });
+ *
+ * for (let t = 0; t < 50; t++) {
+ *   for (const branch of candidates) {
+ *     const { token, isStop } = branch.produce();
+ *     if (isStop) continue;
+ *     branch.commit(token);
+ *   }
+ * }
+ *
+ * const best = candidates.reduce((a, b) => a.perplexity < b.perplexity ? a : b);
+ * for (const c of candidates) { if (c !== best) c.prune(); }
+ * ```
+ */
+
+class Branch {
+  /**
+   * @param {SessionContext} ctx
+   * @param {number} handle
+   */
+  constructor(ctx, handle) {
+    this._ctx = ctx;
+    this._handle = handle;
+    this._disposed = false;
+  }
+
+  /**
+   * Create a root branch at the given position
+   *
+   * The branch takes ownership of the sequence and creates its own sampler
+   * chain from the provided params. Call captureLogits() after prefill to
+   * freeze the logit distribution before forking.
+   *
+   * @param {SessionContext} ctx - SessionContext to create branch on
+   * @param {number} seqId - Sequence ID for this branch
+   * @param {number} position - Starting position (typically prompt token count)
+   * @param {SamplingParams} [params] - Sampling parameters (temperature, topP, etc.)
+   * @returns {Branch} New Branch instance
+   */
+  static create(ctx, seqId, position, params) {
+    const handle = ctx._branchCreate(seqId, position, params);
+    return new Branch(ctx, handle);
+  }
+
+  /**
+   * Fork this branch to a new sequence
+   *
+   * The child shares the parent's KV prefix in memory (metadata-only under unified KV, no KV buffer copy).
+   * Logits, sampler state, and perplexity tracker are cloned so the child
+   * can diverge independently. Fork from any branch — root or intermediate —
+   * to build arbitrarily deep trees.
+   *
+   * Call reseedSampler() on each child for stochastic diversity.
+   *
+   * @param {number} newSeqId - Sequence ID for the forked branch
+   * @returns {Branch} New forked Branch
+   */
+  fork(newSeqId) {
+    this._ensureNotDisposed();
+    const newHandle = this._ctx._branchFork(this._handle, newSeqId);
+    return new Branch(this._ctx, newHandle);
+  }
+
+  /**
+   * Freeze the current logit distribution into this branch
+   *
+   * Logits are ephemeral — they're overwritten on the next decode() call.
+   * Capturing preserves them so this branch (and any forks from it) can
+   * sample from the same distribution. Essential before fork().
+   */
+  captureLogits() {
+    this._ensureNotDisposed();
+    this._ctx._branchCaptureLogits(this._handle);
+  }
+
+  /**
+   * Single-token forward pass with logit snapshot
+   *
+   * Runs one decode step (writing the token's KV entries), advances position,
+   * and captures the resulting logits for the next sample() call.
+   *
+   * @param {number} token - Token to decode
+   */
+  decodeAndCaptureOne(token) {
+    this._ensureNotDisposed();
+    this._ctx._branchDecodeAndCaptureOne(this._handle, token);
+  }
+
+  /**
+   * Sample next token from branch's logits snapshot
+   *
+   * Applies the branch's full sampler chain (top-k, top-p, temperature,
+   * repeat/presence penalties) to the captured logits.
+   *
+   * @returns {number} Sampled token ID
+   */
+  sample() {
+    this._ensureNotDisposed();
+    return this._ctx._branchSample(this._handle);
+  }
+
+  /**
+   * Record token in the sampler's repeat/presence penalty window
+   *
+   * @param {number} token - Token to accept
+   */
+  accept(token) {
+    this._ensureNotDisposed();
+    this._ctx._branchAccept(this._handle, token);
+  }
+
+  /**
+   * Discard this branch entirely — remove its KV entries and free the handle
+   *
+   * Use for losers: branches whose generation you want to erase completely.
+   * Only removes KV entries divergent from the shared prefix; sibling
+   * branches are unaffected.
+   */
+  prune() {
+    if (this._disposed) return;
+    this._ctx._branchPrune(this._handle);
+    this._disposed = true;
+  }
+
+  /**
+   * Release the handle but keep KV cache entries intact
+   *
+   * Use for winners: you're done branching but want to continue generation
+   * on this sequence using raw ctx.decode()/ctx.sample() calls. The KV
+   * cache entries remain at their current positions.
+   */
+  destroy() {
+    if (this._disposed) return;
+    this._ctx._branchDestroy(this._handle);
+    this._disposed = true;
+  }
+
+  /**
+   * Reseed the sampler's PRNG for diversity after fork()
+   *
+   * CRITICAL for parallel generation: Without reseeding, all forked branches
+   * produce identical outputs because they share the same PRNG state.
+   *
+   * Only affects stochastic samplers (temperature > 0). Greedy samplers are unchanged.
+   *
+   * @param {number} seed - New seed for the PRNG
+   *
+   * @example
+   * ```js
+   * const root = Branch.create(ctx, 0, pos, { temperature: 0.9 });
+   * root.captureLogits();
+   *
+   * // Fork and reseed for diversity
+   * const branches = [1, 2, 3, 4, 5].map((seqId, i) => {
+   *   const branch = root.fork(seqId);
+   *   branch.reseedSampler(1000 + i); // Each branch gets unique seed
+   *   return branch;
+   * });
+   * ```
+   */
+  reseedSampler(seed) {
+    this._ensureNotDisposed();
+    this._ctx._branchSamplerChainReseed(this._handle, seed);
+  }
+
+  /**
+   * Sample the next token without advancing state
+   *
+   * No KV write, no position update. Inspect the result before deciding
+   * to commit() — this separation is what enables speculative verification
+   * and conditional branching.
+   *
+   * @returns {{ token: number, text: string, isStop: boolean }}
+   */
+  produce() {
+    this._ensureNotDisposed();
+    const token = this.sample();
+    return {
+      token,
+      text: this._ctx.tokenToText(token),
+      isStop: this._ctx.isStopToken(token),
+    };
+  }
+
+  /**
+   * Accept and advance — write token to KV and update branch state
+   *
+   * Accepts the token for repeat-penalty tracking, decodes it (writing to
+   * KV cache), and captures the resulting logits for the next produce() call.
+   *
+   * @param {number} token - Token to commit (from produce())
+   */
+  commit(token) {
+    this._ensureNotDisposed();
+    this.accept(token);
+    this.decodeAndCaptureOne(token);
+  }
+
+  // ===== ACCESSORS =====
+
+  /** @returns {number} Branch's sequence ID */
+  get seqId() {
+    this._ensureNotDisposed();
+    return this._ctx._branchGetSeqId(this._handle);
+  }
+
+  /** @returns {number} Branch's current position (number of tokens decoded) */
+  get position() {
+    this._ensureNotDisposed();
+    return this._ctx._branchGetPosition(this._handle);
+  }
+
+  /** @returns {number} Branch's perplexity (exp of mean surprisal) */
+  get perplexity() {
+    this._ensureNotDisposed();
+    return this._ctx._branchGetPerplexity(this._handle);
+  }
+
+  /** @returns {number} Internal handle (for debugging) */
+  get handle() {
+    return this._handle;
+  }
+
+  /** @returns {boolean} Whether this branch has been disposed */
+  get disposed() {
+    return this._disposed;
+  }
+
+  // ===== INTERNAL =====
+
+  _ensureNotDisposed() {
+    if (this._disposed) {
+      throw new Error('Branch has been disposed');
+    }
+  }
+}
+
+module.exports = { Branch };
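Branch.js shows best-of-N in its JSDoc but only mentions the "fork from children" case. Below is a rough depth-limited tree-search sketch built from the methods above; the model path, branching width, seed values, and `nSeqMax` are illustrative, and sequence ids are not recycled here.

```javascript
const { createContext, Branch } = require('@lloyal-labs/lloyal.node');

async function shallowTreeSearch(modelPath, prompt) {
  // nSeqMax must cover every sequence id handed out below; ids are not recycled.
  const ctx = await createContext({ modelPath, nSeqMax: 16 });
  const tokens = await ctx.tokenize(prompt);
  await ctx.decode(tokens, 0, 0);

  const root = Branch.create(ctx, 0, tokens.length, { temperature: 0.8 });
  root.captureLogits();

  let frontier = [root];
  let nextSeqId = 1;

  for (let depth = 0; depth < 3; depth++) {
    const children = [];
    for (const parent of frontier) {
      for (let k = 0; k < 2; k++) {
        const child = parent.fork(nextSeqId);      // shares the parent's KV prefix
        child.reseedSampler(1000 + nextSeqId);     // unique PRNG per fork
        nextSeqId += 1;
        const { token, isStop } = child.produce(); // sample, no KV write yet
        if (!isStop) child.commit(token);          // accept + decode + capture
        children.push(child);
      }
      if (parent !== root) parent.destroy();       // done branching here; keep its KV
    }
    // Keep the two lowest-perplexity children as the next frontier, prune the rest
    children.sort((a, b) => a.perplexity - b.perplexity);
    for (const c of children.slice(2)) c.prune();
    frontier = children.slice(0, 2);
  }

  const best = frontier.reduce((a, b) => (a.perplexity < b.perplexity ? a : b));
  console.log('best sequence:', best.seqId, 'perplexity:', best.perplexity);
  ctx.dispose();
}
```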
package/lib/index.d.ts
CHANGED

@@ -347,9 +347,11 @@ export interface SessionContext {
   * // Creative generation
   * const token = ctx.sample({ temperature: 0.9 });
   *
-  * // Constrained to valid JSON
-  * ctx.
+  * // Constrained to valid JSON (handle-based API)
+  * const grammarHandle = ctx.createSampler(grammar);
+  * ctx.applySampler(grammarHandle, ctx.getLogits());
   * const token = ctx.sample({ temperature: 0.7 });
+  * ctx.acceptSamplerToken(grammarHandle, token);
   * ```
   */
  sample(params?: SamplingParams): number;
@@ -608,144 +610,6 @@ export interface SessionContext {
   */
  clearAndReseed(sinks: number[], tail: number[]): Promise<void>;
 
-  // ===== GRAMMAR-CONSTRAINED GENERATION =====
-
-  /**
-   * Initialize grammar parser (once per generation session)
-   *
-   * Grammars constrain generation to valid formats (JSON, XML, etc.).
-   * Parser tracks state across tokens to enforce rules.
-   *
-   * Call once before starting constrained generation.
-   * Use resetGrammar() to reuse same grammar for new generation.
-   *
-   * Cost: ~0.1-1ms depending on grammar complexity
-   *
-   * @param grammarStr GBNF grammar string (EBNF-like syntax)
-   * @example
-   * ```typescript
-   * // Force valid JSON
-   * const grammar = ctx.jsonSchemaToGrammar(JSON.stringify({
-   *   type: "object",
-   *   properties: {
-   *     name: { type: "string" },
-   *     age: { type: "number" }
-   *   }
-   * }));
-   *
-   * ctx.initGrammar(grammar);
-   *
-   * // Now sample() will only generate valid JSON
-   * const token = ctx.sample({ temperature: 0.7 });
-   * ```
-   */
-  initGrammar(grammarStr: string): void;
-
-  /**
-   * Apply grammar constraints to token scores (modifies in-place)
-   *
-   * Masks invalid tokens with -Infinity based on parser state.
-   * Call after getTokenScores(), before custom sampling.
-   *
-   * Flow: getTokenScores() → applyGrammar() → sample() → acceptToken()
-   *
-   * Thread safety: This method is synchronous and modifies the buffer
-   * in-place on the JS thread. Safe because it's called sequentially
-   * in the generation loop before any async operations.
-   *
-   * Cost: ~0.1-1ms depending on grammar complexity
-   *
-   * @param scoresBuffer Buffer from getTokenScores() (modified in-place)
-   * @throws Error if grammar not initialized (call initGrammar first)
-   * @example
-   * ```typescript
-   * // Custom sampling with grammar
-   * const buffer = ctx.getTokenScores();
-   * const scores = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.length / 4);
-   *
-   * // Apply grammar constraints
-   * ctx.applyGrammar(buffer);
-   *
-   * // Now sample from constrained distribution
-   * const token = customSample(scores);
-   * ctx.acceptToken(token);
-   * ```
-   */
-  applyGrammar(scoresBuffer: Buffer): void;
-
-  /**
-   * Advance grammar parser with chosen token
-   *
-   * Updates parser state after sampling.
-   * MUST be called AFTER sampling, BEFORE next applyGrammar().
-   *
-   * This advances the stateful grammar parser through its rules.
-   * Without this, grammar constraints will be incorrect.
-   *
-   * Cost: <0.01ms
-   *
-   * @param tokenId Token that was sampled
-   * @example
-   * ```typescript
-   * const buffer = ctx.getTokenScores();
-   * ctx.applyGrammar(buffer);
-   *
-   * const scores = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.length / 4);
-   * const token = customSample(scores);
-   *
-   * // MUST call acceptToken to advance parser
-   * ctx.acceptToken(token);
-   *
-   * // Now parser is ready for next token
-   * ```
-   */
-  acceptToken(tokenId: number): void;
-
-  /**
-   * Reset grammar parser to initial state
-   *
-   * Call at start of each new generation with same grammar.
-   * Parser returns to root state, ready to validate from beginning.
-   *
-   * Cost: <0.01ms
-   *
-   * @example
-   * ```typescript
-   * ctx.initGrammar(jsonGrammar);
-   *
-   * // First generation
-   * while (!done) {
-   *   const token = ctx.sample();
-   *   // ... generate ...
-   * }
-   *
-   * // Second generation - reuse same grammar
-   * ctx.resetGrammar();
-   * while (!done) {
-   *   const token = ctx.sample();
-   *   // ... generate ...
-   * }
-   * ```
-   */
-  resetGrammar(): void;
-
-  /**
-   * Free grammar resources
-   *
-   * Call when done with constrained generation.
-   * Releases parser memory.
-   *
-   * Cost: <0.01ms
-   *
-   * @example
-   * ```typescript
-   * ctx.initGrammar(grammar);
-   * // ... do constrained generation ...
-   * ctx.freeGrammar();
-   * ```
-   */
-  freeGrammar(): void;
-
  // ===== KV SEQUENCE OPERATIONS =====
 
  /**
@@ -817,9 +681,7 @@ export interface SessionContext {
   * Create a new grammar sampler (returns handle)
   *
   * Creates an independent grammar sampler instance with its own state.
-   *
-   * Unlike initGrammar() which uses a single internal sampler, this returns
-   * a handle that can be used with applySampler/acceptSamplerToken.
+   * Returns a handle that can be used with applySampler/acceptSamplerToken.
   * Multiple handles can coexist with independent parser states.
   *
   * Cost: ~0.1-1ms depending on grammar complexity
@@ -859,7 +721,6 @@ export interface SessionContext {
   * Accept token to advance grammar parser state (handle-based)
   *
   * Must be called after sampling to advance the grammar parser.
-   * This is the handle-based equivalent of acceptToken().
   *
   * @param handle Sampler handle from createSampler()
   * @param tokenId Token that was sampled
@@ -1186,7 +1047,7 @@ export interface SessionContext {
   * Convert JSON schema to GBNF grammar
   *
   * Generates grammar string for constrained JSON generation.
-   * Use with
+   * Use with createSampler() for grammar-constrained generation.
   *
   * Cost: ~1-10ms depending on schema complexity
   *
@@ -1204,7 +1065,7 @@ export interface SessionContext {
   * };
   *
   * const grammar = ctx.jsonSchemaToGrammar(JSON.stringify(schema));
-   * ctx.
+   * const handle = ctx.createSampler(grammar);
   * ```
   */
  jsonSchemaToGrammar(schemaJson: string): string;
@@ -1314,16 +1175,6 @@ export interface SessionContext {
 
  // ===== NATIVE REFERENCE IMPLEMENTATIONS =====
 
-  /**
-   * Compute entropy of current logits distribution
-   *
-   * Alternative entropy computation using native implementation.
-   * Equivalent to modelEntropy("nats") but may be faster.
-   *
-   * @returns Entropy in nats
-   */
-  computeEntropy(): number;
-
  /**
   * Sample greedily from current logits
   *
@@ -1360,6 +1211,44 @@ export interface SessionContext {
   * Context becomes unusable after disposal.
   */
  dispose(): void;
+
+  // ===== BRANCH API (internal, wrapped by Branch class) =====
+
+  /** @internal Create a new branch for parallel generation */
+  _branchCreate(seqId: number, position: number, params?: SamplingParams): number;
+
+  /** @internal Fork a branch to a new sequence */
+  _branchFork(handle: number, newSeqId: number): number;
+
+  /** @internal Capture logits into branch's snapshot */
+  _branchCaptureLogits(handle: number): void;
+
+  /** @internal Decode a single token and capture logits */
+  _branchDecodeAndCaptureOne(handle: number, token: number): void;
+
+  /** @internal Sample next token from branch's logits snapshot */
+  _branchSample(handle: number): number;
+
+  /** @internal Accept token (update sampler state for penalties) */
+  _branchAccept(handle: number, token: number): void;
+
+  /** @internal Get branch's sequence ID */
+  _branchGetSeqId(handle: number): number;
+
+  /** @internal Get branch's current position */
+  _branchGetPosition(handle: number): number;
+
+  /** @internal Get branch's perplexity */
+  _branchGetPerplexity(handle: number): number;
+
+  /** @internal Prune branch (remove KV cache entries and free handle) */
+  _branchPrune(handle: number): void;
+
+  /** @internal Destroy branch (free handle without removing KV cache) */
+  _branchDestroy(handle: number): void;
+
+  /** @internal Reseed branch sampler PRNG for diversity after fork */
+  _branchSamplerChainReseed(handle: number, seed: number): void;
 }
 
 /**
@@ -1502,3 +1391,140 @@ export function withLogits<T>(
  ctx: SessionContext,
  fn: (logits: Float32Array) => T
): T;
+
+/**
+ * Result from Branch.produce()
+ */
+export interface Produced {
+  /** Sampled token ID */
+  token: number;
+  /** Text representation of the token */
+  text: string;
+  /** Whether this is a stop token (EOS) */
+  isStop: boolean;
+}
+
+/**
+ * Forkable inference handle for covalent generation
+ *
+ * A Branch owns everything needed for independent generation: a KV cache
+ * sequence, sampler chain, logits snapshot, and perplexity tracker.
+ *
+ * Forking is cheap — the KV prefix is shared in memory (metadata-only operation under unified KV —
+ * no KV tensor buffers are copied), so sibling branches read from the same physical KV entries.
+ * Only tokens decoded after the fork point are exclusive to each branch.
+ *
+ * Branches form trees, not just flat lists. Fork from root for best-of-N,
+ * fork from children for MCTS/beam search, fork from a draft for speculative
+ * decoding.
+ *
+ * The produce/commit protocol separates sampling from state advancement:
+ * produce() samples without writing to KV, letting you inspect the result
+ * before deciding to commit().
+ *
+ * @example Best-of-N with perplexity selection
+ * ```typescript
+ * const root = Branch.create(ctx, 0, tokens.length, { temperature: 0.8 });
+ * root.captureLogits();
+ *
+ * const candidates = [1, 2, 3, 4, 5].map((seqId, i) => {
+ *   const branch = root.fork(seqId);
+ *   branch.reseedSampler(1000 + i);
+ *   return branch;
+ * });
+ *
+ * for (let t = 0; t < 50; t++) {
+ *   for (const branch of candidates) {
+ *     const { token, isStop } = branch.produce();
+ *     if (isStop) continue;
+ *     branch.commit(token);
+ *   }
+ * }
+ *
+ * const best = candidates.reduce((a, b) => a.perplexity < b.perplexity ? a : b);
+ * for (const c of candidates) { if (c !== best) c.prune(); }
+ * ```
+ */
+export class Branch {
+  /**
+   * Create a root branch at the given position
+   *
+   * The branch takes ownership of the sequence and creates its own sampler
+   * chain from the provided params. Call captureLogits() after prefill to
+   * freeze the logit distribution before forking.
+   *
+   * @param ctx SessionContext to create branch on
+   * @param seqId Sequence ID for this branch
+   * @param position Starting position (typically prompt token count)
+   * @param params Sampling parameters (temperature, topP, etc.)
+   */
+  static create(
+    ctx: SessionContext,
+    seqId: number,
+    position: number,
+    params?: SamplingParams
+  ): Branch;
+
+  /**
+   * Fork this branch to a new sequence
+   *
+   * The child shares the parent's KV prefix in memory (metadata-only under unified KV, no KV buffer copy).
+   * Logits, sampler state, and perplexity tracker are cloned so the child
+   * can diverge independently. Fork from any branch — root or intermediate —
+   * to build arbitrarily deep trees.
+   *
+   * @param newSeqId Sequence ID for the forked branch
+   */
+  fork(newSeqId: number): Branch;
+
+  /** Freeze the current logit distribution into this branch. Essential before fork(). */
+  captureLogits(): void;
+
+  /** Decode a single token, write to KV, and capture resulting logits */
+  decodeAndCaptureOne(token: number): void;
+
+  /** Sample next token from branch's frozen logits snapshot */
+  sample(): number;
+
+  /** Accept token for repeat-penalty tracking */
+  accept(token: number): void;
+
+  /** Discard branch — remove its divergent KV entries and free the handle (use for losers) */
+  prune(): void;
+
+  /** Release handle but keep KV entries intact (use for winners, continue with raw ops) */
+  destroy(): void;
+
+  /**
+   * Reseed the sampler's PRNG for diversity after fork()
+   *
+   * CRITICAL for parallel generation: Without reseeding, all forked branches
+   * produce identical outputs because they share the same PRNG state.
+   *
+   * Only affects stochastic samplers (temperature > 0). Greedy samplers are unchanged.
+   *
+   * @param seed - New seed for the PRNG
+   */
+  reseedSampler(seed: number): void;
+
+  /** Sample next token without advancing state. Inspect before committing. */
+  produce(): Produced;
+
+  /** Accept and advance — write token to KV and update branch state. */
+  commit(token: number): void;
+
+  /** Branch's sequence ID */
+  readonly seqId: number;
+
+  /** Branch's current position */
+  readonly position: number;
+
+  /** Branch's perplexity */
+  readonly perplexity: number;
+
+  /** Internal handle (for debugging) */
+  readonly handle: number;
+
+  /** Whether this branch has been disposed */
+  readonly disposed: boolean;
+}
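The grammar methods removed from this declaration file (`initGrammar()`, `applyGrammar()`, `acceptToken()`, `resetGrammar()`, `freeGrammar()`) are superseded by the handle-based sampler API that remains. A sketch of the constrained pull loop using only the retained methods; the schema, prompt handling, and token limit are illustrative, not packaged example code.

```javascript
// Handle-based grammar-constrained generation on sequence 0.
async function* constrainedJson(ctx, schema, promptTokens, maxTokens = 256) {
  await ctx.decode(promptTokens, 0, 0);            // prefill on sequence 0
  const handle = ctx.createSampler(ctx.jsonSchemaToGrammar(JSON.stringify(schema)));
  let pos = promptTokens.length;

  try {
    for (let i = 0; i < maxTokens; i++) {
      ctx.applySampler(handle, ctx.getLogits());   // mask tokens the grammar forbids
      const token = ctx.sample({ temperature: 0.7 });
      if (ctx.isStopToken(token)) break;
      ctx.acceptSamplerToken(handle, token);       // advance the parser state
      yield ctx.tokenToText(token);
      await ctx.decode([token], pos, 0);           // forward pass for the next step
      pos += 1;
    }
  } finally {
    ctx.freeSamplerHandle(handle);                 // release the grammar handle
  }
}

// Usage (inside an async context):
//   for await (const text of constrainedJson(ctx, schema, tokens)) process.stdout.write(text);
```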
package/lib/index.js
CHANGED

@@ -23,11 +23,11 @@
  * // Safe logits access (Runtime Borrow Checker pattern)
  * const entropy = withLogits(ctx, (logits) => {
  *   // logits is valid here - use synchronously only!
- *   return
+ *   return myComputeEntropy(logits);
  * });
  *
  * // Or with native reference implementations (for testing)
- * const
+ * const entropy = ctx.modelEntropy();
  * const token = ctx.greedySample();
  *
  * // Cleanup
@@ -203,7 +203,14 @@ function withLogits(ctx, fn) {
   return result;
 }
 
+const { Branch } = require('./Branch');
+
 module.exports = {
+  /**
+   * Branch class for parallel generation
+   * @see Branch.create()
+   */
+  Branch,
   /**
    * Create a new inference context
    *
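The updated header comment leaves the `withLogits()` callback as a placeholder (`myComputeEntropy`). A small self-contained sketch of the same borrow pattern; `computeArgmax` is illustrative and not part of the package.

```javascript
const { withLogits } = require('@lloyal-labs/lloyal.node');

// Illustrative callback — any synchronous computation over the logits view works.
function computeArgmax(logits) {
  let best = 0;
  for (let i = 1; i < logits.length; i++) {
    if (logits[i] > logits[best]) best = i;
  }
  return best;
}

// After createContext() and a decode():
//   const tokenId = withLogits(ctx, (logits) => computeArgmax(logits));
// The Float32Array view must not escape the callback or be used after an await.
```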
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@lloyal-labs/lloyal.node",
-  "version": "1.0.
+  "version": "1.0.8",
   "description": "Node.js client for liblloyal+llama.cpp",
   "main": "lib/index.js",
   "types": "lib/index.d.ts",
@@ -19,6 +19,8 @@
     "test": "npm run test:api && npm run test:e2e",
     "test:api": "node test/api.js",
     "test:e2e": "node test/e2e.js",
+    "test:examples": "node test/examples.js",
+    "sync:llama-cpp": "node scripts/sync-llama-cpp.js",
     "example": "node examples/chat/chat.mjs"
   },
   "repository": {
@@ -48,22 +50,23 @@
   "devDependencies": {
     "cmake-js": "^7.4.0",
     "glob": "^11.0.0",
-    "typedoc": "^0.
+    "typedoc": "^0.28.16",
+    "typedoc-rhineai-theme": "^1.2.0"
   },
   "optionalDependencies": {
-    "@lloyal-labs/lloyal.node-darwin-arm64": "1.0.
-    "@lloyal-labs/lloyal.node-darwin-x64": "1.0.
-    "@lloyal-labs/lloyal.node-linux-arm64": "1.0.
-    "@lloyal-labs/lloyal.node-linux-arm64-cuda": "1.0.
-    "@lloyal-labs/lloyal.node-linux-arm64-vulkan": "1.0.
-    "@lloyal-labs/lloyal.node-linux-x64": "1.0.
-    "@lloyal-labs/lloyal.node-linux-x64-cuda": "1.0.
-    "@lloyal-labs/lloyal.node-linux-x64-vulkan": "1.0.
-    "@lloyal-labs/lloyal.node-win32-arm64": "1.0.
-    "@lloyal-labs/lloyal.node-win32-arm64-vulkan": "1.0.
-    "@lloyal-labs/lloyal.node-win32-x64": "1.0.
-    "@lloyal-labs/lloyal.node-win32-x64-cuda": "1.0.
-    "@lloyal-labs/lloyal.node-win32-x64-vulkan": "1.0.
+    "@lloyal-labs/lloyal.node-darwin-arm64": "1.0.8",
+    "@lloyal-labs/lloyal.node-darwin-x64": "1.0.8",
+    "@lloyal-labs/lloyal.node-linux-arm64": "1.0.8",
+    "@lloyal-labs/lloyal.node-linux-arm64-cuda": "1.0.8",
+    "@lloyal-labs/lloyal.node-linux-arm64-vulkan": "1.0.8",
+    "@lloyal-labs/lloyal.node-linux-x64": "1.0.8",
+    "@lloyal-labs/lloyal.node-linux-x64-cuda": "1.0.8",
+    "@lloyal-labs/lloyal.node-linux-x64-vulkan": "1.0.8",
+    "@lloyal-labs/lloyal.node-win32-arm64": "1.0.8",
+    "@lloyal-labs/lloyal.node-win32-arm64-vulkan": "1.0.8",
+    "@lloyal-labs/lloyal.node-win32-x64": "1.0.8",
+    "@lloyal-labs/lloyal.node-win32-x64-cuda": "1.0.8",
+    "@lloyal-labs/lloyal.node-win32-x64-vulkan": "1.0.8"
   },
   "engines": {
     "node": ">=22.0.0"
package/scripts/create-platform-package.js
CHANGED

@@ -79,8 +79,9 @@ if (osName === 'darwin') {
   });
 
 } else if (osName === 'linux') {
-  // Copy all .so files
-
+  // Copy all .so files including versioned variants (e.g., libllama.so.0, libllama.so.0.0.X)
+  // llama.cpp sets SOVERSION, producing versioned names that the binary references at runtime
+  const sos = fs.readdirSync(BUILD_DIR).filter(f => /\.so(\.\d+)*$/.test(f));
   if (sos.length > 0) {
     sos.forEach(so => {
       fs.copyFileSync(path.join(BUILD_DIR, so), path.join(BIN_DIR, so));
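The new filter is meant to pick up SOVERSION-suffixed shared objects as well as plain ones. A quick illustration of which filenames the regex accepts; the names below are examples only.

```javascript
// The same pattern used above; filenames are examples only.
const versionedSo = /\.so(\.\d+)*$/;

const names = ['libllama.so', 'libllama.so.0', 'libllama.so.0.0.4', 'libggml.so.1', 'README.md', 'libllama.so.bak'];
console.log(names.filter(f => versionedSo.test(f)));
// → [ 'libllama.so', 'libllama.so.0', 'libllama.so.0.0.4', 'libggml.so.1' ]
```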
package/scripts/download-test-models.sh
CHANGED

@@ -26,5 +26,15 @@ else
   echo "  ✓ nomic-embed-text already exists"
 fi
 
+# slim-summary-tool (1.7GB) - Summary sidecar for dynamic sinks
+if [ ! -f "slim-summarize.gguf" ]; then
+  echo "  → Downloading slim-summarize.gguf..."
+  curl -L -o "slim-summarize.gguf" \
+    "https://huggingface.co/llmware/slim-summary-tool/resolve/main/slim-summary-tool.gguf"
+  echo "  ✓ Downloaded slim-summarize"
+else
+  echo "  ✓ slim-summarize already exists"
+fi
+
 echo ""
 echo "✅ All test models ready"
package/scripts/sync-llama-cpp.js
ADDED

@@ -0,0 +1,117 @@
+#!/usr/bin/env node
+/**
+ * Sync llama.cpp submodule to match liblloyal's .llama-cpp-version
+ *
+ * Single source of truth: liblloyal/.llama-cpp-version contains the tag
+ * that the llama.cpp submodule should be checked out at.
+ *
+ * Usage:
+ *   node scripts/sync-llama-cpp.js           # Sync submodule to target tag
+ *   node scripts/sync-llama-cpp.js --check   # Validate match (CI mode)
+ */
+
+const { execSync } = require('child_process');
+const fs = require('fs');
+const path = require('path');
+
+const ROOT = path.join(__dirname, '..');
+const VERSION_FILE = path.join(ROOT, 'liblloyal', '.llama-cpp-version');
+const LLAMA_CPP_DIR = path.join(ROOT, 'llama.cpp');
+
+const CHECK_ONLY = process.argv.includes('--check');
+
+// --- Read target version ---
+
+if (!fs.existsSync(VERSION_FILE)) {
+  console.error('[sync-llama-cpp] Error: liblloyal/.llama-cpp-version not found.');
+  console.error('[sync-llama-cpp] Make sure liblloyal submodule is initialized:');
+  console.error('[sync-llama-cpp]   git submodule update --init --recursive');
+  process.exit(1);
+}
+
+const versionFileContent = fs.readFileSync(VERSION_FILE, 'utf8');
+const targetVersion = versionFileContent
+  .split('\n')
+  .filter(line => !line.startsWith('#') && line.trim().length > 0)
+  [0]
+  ?.trim();
+
+if (!targetVersion) {
+  console.error('[sync-llama-cpp] Error: Could not parse version from liblloyal/.llama-cpp-version');
+  process.exit(1);
+}
+
+console.log(`[sync-llama-cpp] Target llama.cpp version: ${targetVersion}`);
+
+// --- Check llama.cpp submodule exists ---
+
+if (!fs.existsSync(path.join(LLAMA_CPP_DIR, '.git'))) {
+  console.error('[sync-llama-cpp] Error: llama.cpp submodule not initialized.');
+  console.error('[sync-llama-cpp] Run: git submodule update --init --recursive');
+  process.exit(1);
+}
+
+// --- Helper ---
+
+function exec(cmd, opts = {}) {
+  return execSync(cmd, { cwd: LLAMA_CPP_DIR, encoding: 'utf8', stdio: 'pipe', ...opts }).trim();
+}
+
+// --- Get current llama.cpp state ---
+
+const currentSha = exec('git rev-parse HEAD');
+
+// Resolve target tag to SHA (may need to fetch in shallow clones)
+let targetSha;
+try {
+  targetSha = exec(`git rev-parse ${targetVersion}`);
+} catch {
+  // Tag not available locally — fetch it
+  console.log(`[sync-llama-cpp] Tag ${targetVersion} not found locally, fetching...`);
+  try {
+    exec(`git fetch origin tag ${targetVersion} --no-tags --depth 1`);
+    targetSha = exec(`git rev-parse ${targetVersion}`);
+  } catch (e) {
+    console.error(`[sync-llama-cpp] Error: Tag ${targetVersion} not found in remote.`);
+    console.error(`[sync-llama-cpp] Verify tag exists: https://github.com/ggml-org/llama.cpp/releases/tag/${targetVersion}`);
+    process.exit(1);
+  }
+}
+
+const currentShort = currentSha.slice(0, 7);
+const targetShort = targetSha.slice(0, 7);
+
+console.log(`[sync-llama-cpp] Current: ${currentShort} (${currentSha})`);
+console.log(`[sync-llama-cpp] Target:  ${targetShort} (${targetVersion})`);
+
+if (currentSha === targetSha) {
+  console.log(`[sync-llama-cpp] llama.cpp submodule matches ${targetVersion}.`);
+  process.exit(0);
+}
+
+// --- Mismatch ---
+
+if (CHECK_ONLY) {
+  console.error(`\n[sync-llama-cpp] MISMATCH: llama.cpp submodule is at ${currentShort}, expected ${targetVersion} (${targetShort})`);
+  console.error(`[sync-llama-cpp] Fix: npm run sync:llama-cpp`);
+  process.exit(1);
+}
+
+// --- Sync ---
+
+console.log(`[sync-llama-cpp] Checking out ${targetVersion}...`);
+
+try {
+  exec(`git checkout ${targetVersion}`);
+} catch {
+  exec(`git fetch origin tag ${targetVersion} --no-tags --depth 1`);
+  exec(`git checkout ${targetVersion}`);
+}
+
+const newShort = exec('git rev-parse --short HEAD');
+console.log(`[sync-llama-cpp] llama.cpp now at: ${newShort} (${targetVersion})`);
+console.log('');
+console.log('[sync-llama-cpp] Next steps:');
+console.log('  1. Build and test: npm run build && npm test');
+console.log('  2. Stage changes: git add llama.cpp');
+console.log('  3. Commit: git commit -m "chore(deps): sync llama.cpp to ' + targetVersion + '"');