@lloyal-labs/sdk 2.0.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/Rerank.js CHANGED
@@ -1,16 +1,58 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.Rerank = void 0;
3
+ exports.Rerank = exports.RerankInternalError = exports.RerankCalibrationError = void 0;
4
+ const Branch_1 = require("./Branch");
5
+ const BranchStore_1 = require("./BranchStore");
4
6
  const SYSTEM_PROMPT = 'Judge whether the Document meets the requirements based on the Query ' +
5
7
  'and the Instruct provided. Note that the answer can only be "yes" or "no".';
6
8
  const USER_PREFIX = '<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n\n' +
7
9
  '<Query>: ';
8
- /** Simple async channel — _drain pushes, consumer pulls via for-await */
9
- function channel() {
10
+ // Boot canary fixtures — hardcoded to Qwen3-reranker semantics. If you swap
11
+ // reranker models, re-run the calibration probe and update these fixtures.
12
+ const CANARY_QUERY = 'What is the capital of France?';
13
+ const CANARY_RELEVANT_DOC = 'Paris is the capital and most populous city of France.';
14
+ const CANARY_IRRELEVANT_DOC = 'Photosynthesis converts carbon dioxide and water into glucose.';
15
+ // Sentinel strings used to discover segment boundaries inside the rendered
16
+ // chat probe. Longer ASCII (not NUL bytes) survives tokenizer normalization;
17
+ // the BPE-boundary invariance check at boot still verifies that the sentinels
18
+ // did not cause merges across segment seams.
19
+ const SENTINEL_Q = '__RERANK_QUERY_PROBE_a3f7__';
20
+ const SENTINEL_D = '__RERANK_DOC_PROBE_a3f7__';
21
+ /**
22
+ * Thrown by {@link Rerank.create} when calibration gates fail (single-token
23
+ * yes/no, BPE-boundary invariance, boot canary signs). Each instance includes
24
+ * the specific gate and the empirical evidence so callers can fix at the
25
+ * right layer (model swap, sentinel change, template drift).
26
+ */
27
+ class RerankCalibrationError extends Error {
28
+ name = 'RerankCalibrationError';
29
+ }
30
+ exports.RerankCalibrationError = RerankCalibrationError;
31
+ /**
32
+ * Thrown when Rerank's internal invariants are violated (lease exhaustion
33
+ * mid-query, fork returning a disposed handle, etc.). These represent state
34
+ * the consumer cannot fix; the diagnostic exists for framework-side triage.
35
+ */
36
+ class RerankInternalError extends Error {
37
+ name = 'RerankInternalError';
38
+ }
39
+ exports.RerankInternalError = RerankInternalError;
40
+ /**
41
+ * Async channel — internal driver pushes; consumer pulls via for-await.
42
+ *
43
+ * The returned iterator supports `return()` so `for-await break` and explicit
44
+ * `iterator.return()` both invoke `onCancel`. Without this hook the upstream
45
+ * driver has no way to know the consumer has stopped reading and would keep
46
+ * issuing GPU dispatches for documents whose scores will be discarded.
47
+ *
48
+ * @param onCancel - Invoked at most once when the consumer cancels the iterator.
49
+ */
50
+ function channel(onCancel) {
10
51
  const buffer = [];
11
52
  let done = false;
12
53
  let err = null;
13
54
  let notify = null;
55
+ let cancelFired = false;
14
56
  const wait = () => new Promise((r) => { notify = r; });
15
57
  return {
16
58
  push(value) {
@@ -40,211 +82,501 @@ function channel() {
40
82
  return { value: buffer.shift(), done: false };
41
83
  return { value: undefined, done: true };
42
84
  },
85
+ async return() {
86
+ done = true;
87
+ if (!cancelFired) {
88
+ cancelFired = true;
89
+ onCancel?.();
90
+ }
91
+ notify?.();
92
+ notify = null;
93
+ return { value: undefined, done: true };
94
+ },
43
95
  };
44
96
  },
45
97
  },
46
98
  };
47
99
  }
100
+ /**
101
+ * Cross-encoder reranker composed over the SDK's Branch / BranchStore primitives.
102
+ *
103
+ * # Lifetime + concurrency contract
104
+ *
105
+ * Rerank takes **exclusive ownership** of its SessionContext. Routing any other
106
+ * decode through the same context concurrently is undefined behavior — the
107
+ * kernel's `llama_context::decode` carries no internal mutex (verified at
108
+ * llama.cpp b9581). Enforcement is at construction: `Rerank.create()` marks
109
+ * the context with a `__decodeOwner` flag and refuses a second instance.
110
+ * The flag is cleared by `dispose()`, so test/REPL re-creation works.
111
+ *
112
+ * Concurrent `score()` / `scoreBatch()` calls **on the same Rerank instance**
113
+ * are serialized by a per-instance Promise chain (~10 LOC). The kernel sees
114
+ * them in arrival order; consumers still get a concurrent-looking API.
115
+ *
116
+ * # Architecture
117
+ *
118
+ * [SYSTEM][USER_PREFIX][QUERY][MID][DOC_i][SUFFIX][GEN_PROMPT]
119
+ * └── permanent trunk ─┘ └── per-query branch ──┘ └─── per-chunk leaves ─┘
120
+ *
121
+ * - **trunk**: prefilled with the static [SYSTEM][USER_PREFIX] segment ONCE
122
+ * at `Rerank.create()`; lives for the instance lifetime. Warm KV is
123
+ * amortized across every score() via multi-tag KV survival.
124
+ * - **queryBranch**: forked from trunk per score() call, prefilled with
125
+ * `[query, ...midTokens]`. Forked with `cloneLogits: false` because we
126
+ * immediately overwrite the logits with the prefill.
127
+ * - **leaves**: forked from queryBranch in groups of `BranchStore.available`,
128
+ * scatter-prefilled with `[doc_i, ...suffixTokens]` via `BranchStore.prefill`
129
+ * (one `llama_decode` per group), scored via `_branchLogitsAt` reading
130
+ * exactly two floats per leaf, pruned.
131
+ *
132
+ * # Calibration gates (fail-loud at create() time)
133
+ *
134
+ * 1. `yes` and `no` must tokenize as single tokens (the score formula
135
+ * `logit("yes") − logit("no")` assumes this; broader support requires
136
+ * log-sum-exp over label sequences, a 3.x ticket).
137
+ * 2. BPE-boundary invariance — tokenizing a canary full prompt must equal
138
+ * the concat of (prefix, query, mid, doc, suffix) tokenized separately,
139
+ * so segment seams don't silently shift the leaf prompts.
140
+ * 3. Boot canary — score a known relevant + irrelevant pair; relevant
141
+ * must outscore irrelevant by > 1.0 logit unit. Asserts the *gap*,
142
+ * NOT absolute signs — quantization shifts calibration enough that
143
+ * sign assertions are brittle, while the ordering gap still catches
144
+ * yes/no token swap, model swap, and template drift.
145
+ *
146
+ * # Score formula
147
+ *
148
+ * score = `logit("yes") − logit("no")` (unbounded).
149
+ *
150
+ * **This is the log-odds of an absolute yes/no relevance judgment.** The
151
+ * model is a pointwise binary cross-encoder; the official Qwen3-Reranker
152
+ * score is the two-token softmax over {yes,no} — i.e. `sigmoid(score)` =
153
+ * P(yes) ∈ [0,1] — and our log-odds is its monotone equivalent (identical
154
+ * rankings, full dynamic range). Scores ARE thresholdable (0 ≡ P 0.5) and
155
+ * comparable across queries to the extent of the model's calibration;
156
+ * quantization adds noise at the extremes. Top-1 routinely goes negative
157
+ * on real corpora when no document is strongly relevant — an honest
158
+ * "probably not", with the ranking still useful. Production traces show
159
+ * top-1 ranging from +10 (P≈.9999) to -3 (P≈.05, weak best match).
160
+ *
161
+ * The previous softmax form compressed small logit gaps into extreme
162
+ * probabilities (gap of 5 → 0.993; gap of 10 → 0.99995), saturating top-K
163
+ * ordering. Logit-diff preserves the full dynamic range. See
164
+ * `reasoning.run/scripts/inspect-rerank.mjs` for empirical evidence.
165
+ *
166
+ * Consumers that want a confidence threshold should calibrate against their
167
+ * own corpus rather than assuming `> 0` means "relevant" — see SearchTool's
168
+ * threshold envelope.
169
+ */
48
170
  class Rerank {
49
171
  _ctx;
172
+ _store;
173
+ _trunk;
50
174
  _nSeqMax;
51
175
  _nCtx;
52
176
  _yesId;
53
177
  _noId;
54
- _prefixTokens;
55
178
  _midTokens;
56
179
  _suffixTokens;
57
- _pending = [];
58
- _draining = false;
180
+ _staticPrefix;
181
+ _onTruncate;
182
+ _inflight = Promise.resolve();
59
183
  _disposed = false;
60
- constructor(ctx, nSeqMax, nCtx, yesId, noId, prefixTokens, midTokens, suffixTokens) {
184
+ constructor(ctx, store, trunk, nSeqMax, nCtx, yesId, noId, staticPrefix, midTokens, suffixTokens, onTruncate) {
61
185
  this._ctx = ctx;
186
+ this._store = store;
187
+ this._trunk = trunk;
62
188
  this._nSeqMax = nSeqMax;
63
189
  this._nCtx = nCtx;
64
190
  this._yesId = yesId;
65
191
  this._noId = noId;
66
- this._prefixTokens = prefixTokens;
192
+ this._staticPrefix = staticPrefix;
67
193
  this._midTokens = midTokens;
68
194
  this._suffixTokens = suffixTokens;
195
+ this._onTruncate = onTruncate;
69
196
  }
70
197
  /**
71
- * Create a Rerank instance from a pre-created SessionContext
198
+ * Create a Rerank instance bound to a pre-created SessionContext.
72
199
  *
73
- * The caller is responsible for creating the context with appropriate
74
- * settings (nSeqMax, nCtx, typeK, typeV). Rerank takes ownership of
75
- * the context and will dispose it on `dispose()`.
200
+ * Rerank takes exclusive ownership of `ctx` (see class docstring). The
201
+ * caller must construct `ctx` with `nSeqMax` ≥ 3 (one slot each for trunk
202
+ * + queryBranch + at least one leaf).
76
203
  *
77
- * @param ctx - SessionContext configured for reranking
78
- * @param opts - Capacity hints (nSeqMax, nCtx) must match context creation params
204
+ * Fires three calibration gates at boot. If any gate fails, throws
205
+ * {@link RerankCalibrationError} with a diagnostic naming the failure and
206
+ * cleans up partial state (no ctx leak).
79
207
  */
80
208
  static async create(ctx, opts) {
81
- const nSeqMax = opts?.nSeqMax ?? 8;
209
+ const owner = ctx.__decodeOwner;
210
+ if (owner) {
211
+ throw new RerankInternalError(`SessionContext already has a decode owner (${owner}); Rerank ` +
212
+ `requires exclusive ownership. Construct a dedicated SessionContext.`);
213
+ }
214
+ const nSeqMax = opts?.nSeqMax ?? 10;
82
215
  const nCtx = opts?.nCtx ?? ctx._storeKvPressure().nCtx;
83
- const [yesId] = await ctx.tokenize('yes', false);
84
- const [noId] = await ctx.tokenize('no', false);
85
- const SENTINEL_Q = '\x00QUERY\x00';
86
- const SENTINEL_D = '\x00DOC\x00';
216
+ // Calibration gate 1: single-token yes / no
217
+ const yesTokens = await ctx.tokenize('yes', false);
218
+ const noTokens = await ctx.tokenize('no', false);
219
+ if (yesTokens.length !== 1) {
220
+ throw new RerankCalibrationError(`Reranker model tokenizes 'yes' as ${yesTokens.length} tokens ` +
221
+ `(expected 1). The score formula logit("yes") − logit("no") ` +
222
+ `requires single-token labels. Broader support requires ` +
223
+ `generalizing the formula to log-sum-exp over label sequences (3.x).`);
224
+ }
225
+ if (noTokens.length !== 1) {
226
+ throw new RerankCalibrationError(`Reranker model tokenizes 'no' as ${noTokens.length} tokens (expected 1).`);
227
+ }
228
+ const yesId = yesTokens[0];
229
+ const noId = noTokens[0];
230
+ // Render sentinel probe to discover segment boundaries.
87
231
  const probe = await ctx.formatChat(JSON.stringify([
88
232
  { role: 'system', content: SYSTEM_PROMPT },
89
- { role: 'user', content: `${USER_PREFIX}${SENTINEL_Q}\n\n<Document>: ${SENTINEL_D}` },
233
+ {
234
+ role: 'user',
235
+ content: `${USER_PREFIX}${SENTINEL_Q}\n\n<Document>: ${SENTINEL_D}`,
236
+ },
90
237
  ]), { addGenerationPrompt: true, enableThinking: false });
91
238
  const p = probe.prompt;
92
239
  const qi = p.indexOf(SENTINEL_Q);
93
240
  const di = p.indexOf(SENTINEL_D);
94
- const prefixTokens = await ctx.tokenize(p.slice(0, qi), true);
95
- const midTokens = await ctx.tokenize(p.slice(qi + SENTINEL_Q.length, di), false);
96
- const suffixTokens = await ctx.tokenize(p.slice(di + SENTINEL_D.length), false);
97
- return new Rerank(ctx, nSeqMax, nCtx, yesId, noId, prefixTokens, midTokens, suffixTokens);
241
+ if (qi < 0 || di < 0 || qi >= di) {
242
+ throw new RerankCalibrationError(`Sentinel probe failed to locate segment boundaries: ` +
243
+ `SENTINEL_Q ${qi < 0 ? 'missing' : `@${qi}`}, ` +
244
+ `SENTINEL_D ${di < 0 ? 'missing' : `@${di}`}. ` +
245
+ `The chat template may have stripped or reordered the sentinels.`);
246
+ }
247
+ const prefixText = p.slice(0, qi);
248
+ const midText = p.slice(qi + SENTINEL_Q.length, di);
249
+ const suffixText = p.slice(di + SENTINEL_D.length);
250
+ const prefixTokens = await ctx.tokenize(prefixText, true);
251
+ const midTokens = await ctx.tokenize(midText, false);
252
+ const suffixTokens = await ctx.tokenize(suffixText, false);
253
+ // Calibration gate 2: BPE-boundary drift bound.
254
+ // Re-tokenize a CANARY full prompt and compare to segment-concat. Most
255
+ // chat templates produce small drift (1-5 tokens) from BOS/EOS handling,
256
+ // assistant-start tokens, or whitespace normalization across segment
257
+ // seams. The boot canary (gate 3) is the load-bearing behavioral test;
258
+ // this gate exists to catch CATASTROPHIC drift (a sentinel that triggers
259
+ // a multi-token BPE merge), so we threshold at 5% of the whole-prompt
260
+ // length. Exact-equality was too strict for the Qwen3-reranker template
261
+ // (it drifts by ~3 tokens on the canary prompt, but the boot canary
262
+ // still scores cleanly).
263
+ const canaryQueryTokens = await ctx.tokenize(CANARY_QUERY, false);
264
+ const canaryDocTokens = await ctx.tokenize(CANARY_RELEVANT_DOC, false);
265
+ const canaryWhole = await ctx.formatChat(JSON.stringify([
266
+ { role: 'system', content: SYSTEM_PROMPT },
267
+ {
268
+ role: 'user',
269
+ content: `${USER_PREFIX}${CANARY_QUERY}\n\n<Document>: ${CANARY_RELEVANT_DOC}`,
270
+ },
271
+ ]), { addGenerationPrompt: true, enableThinking: false });
272
+ const canaryWholeTokens = await ctx.tokenize(canaryWhole.prompt, true);
273
+ const canaryConcatLen = prefixTokens.length +
274
+ canaryQueryTokens.length +
275
+ midTokens.length +
276
+ canaryDocTokens.length +
277
+ suffixTokens.length;
278
+ const bpeDrift = Math.abs(canaryWholeTokens.length - canaryConcatLen);
279
+ const bpeDriftRatio = bpeDrift / canaryWholeTokens.length;
280
+ if (bpeDriftRatio > 0.05) {
281
+ throw new RerankCalibrationError(`BPE-boundary drift exceeds 5%: ` +
282
+ `tokenize(full canary prompt) = ${canaryWholeTokens.length} tokens, ` +
283
+ `concat(prefix+query+mid+doc+suffix) = ${canaryConcatLen} tokens, ` +
284
+ `drift = ${bpeDrift} (${(bpeDriftRatio * 100).toFixed(1)}%). ` +
285
+ `Sentinel choice is causing multi-token BPE merges across segment ` +
286
+ `boundaries; leaf prompts would silently differ from the form the ` +
287
+ `model was trained against. Try a fresh sentinel pair or check the ` +
288
+ `reranker model's tokenizer version.`);
289
+ }
290
+ // Claim the context, build the trunk + store. From this point on we
291
+ // must clean up __decodeOwner + trunk on any failure path.
292
+ ctx.__decodeOwner = 'rerank';
293
+ const store = new BranchStore_1.BranchStore(ctx);
294
+ const trunk = Branch_1.Branch.create(ctx, 0);
295
+ try {
296
+ await trunk.prefill(prefixTokens);
297
+ const r = new Rerank(ctx, store, trunk, nSeqMax, nCtx, yesId, noId, prefixTokens, midTokens, suffixTokens, opts?.onTruncate);
298
+ // Calibration gate 3: boot canary RELATIVE ordering.
299
+ //
300
+ // The reranker is a CLM with logit-diff scoring — a *relative* ranker,
301
+ // not an absolute calibrator. Production traces show top-1 scores
302
+ // routinely going negative on real corpora (e.g. -2.8 for the best
303
+ // match when no doc is strongly relevant); rankings remain correct
304
+ // because the model picks the least-irrelevant doc. The canary
305
+ // therefore asserts ordering, not signs: a clearly-relevant pair must
306
+ // outscore a clearly-irrelevant one by a meaningful margin.
307
+ //
308
+ // This still catches the failure modes a sign-threshold would have
309
+ // caught — yes/no token swap (rankings invert), model swap (random
310
+ // scores → no consistent ordering), template drift (random scores) —
311
+ // without false-positiving on aggressively-quantized models that
312
+ // produce shifted-but-monotone score distributions.
313
+ const canaryScores = await r.scoreBatch(CANARY_QUERY, [
314
+ CANARY_RELEVANT_DOC,
315
+ CANARY_IRRELEVANT_DOC,
316
+ ]);
317
+ const gap = canaryScores[0] - canaryScores[1];
318
+ if (!(gap > 1.0)) {
319
+ throw new RerankCalibrationError(`Boot canary failed: relevant pair scored ` +
320
+ `${canaryScores[0].toFixed(3)}, irrelevant pair scored ` +
321
+ `${canaryScores[1].toFixed(3)} (gap=${gap.toFixed(3)}, ` +
322
+ `expected > 1.0). Possible causes: yes/no token id swap, ` +
323
+ `reranker model swap, or chat template drift. ` +
324
+ `Canary pair: query=${JSON.stringify(CANARY_QUERY)}, ` +
325
+ `relevant=${JSON.stringify(CANARY_RELEVANT_DOC)}, ` +
326
+ `irrelevant=${JSON.stringify(CANARY_IRRELEVANT_DOC)}.`);
327
+ }
328
+ return r;
329
+ }
330
+ catch (err) {
331
+ // Boot failure: scrub partial state before re-raising so the ctx is
332
+ // re-usable by the next Rerank.create() attempt.
333
+ try {
334
+ trunk.pruneSubtreeSync();
335
+ }
336
+ catch { /* trunk may already be gone */ }
337
+ delete ctx.__decodeOwner;
338
+ throw err;
339
+ }
98
340
  }
341
+ /**
342
+ * Stream progressive ranking results for `documents` against `query`.
343
+ *
344
+ * Pre-tokenized documents must come from {@link tokenize} or a reranker-
345
+ * compatible tokenizer; mismatched tokenizers silently produce wrong scores.
346
+ *
347
+ * Consumers may cancel by calling `iterator.return()` directly or by
348
+ * `for-await break`. Cancellation bounds the post-cancel cost at the one
349
+ * leaf group already in flight; subsequent groups are skipped.
350
+ */
99
351
  score(query, documents, topK) {
100
352
  if (this._disposed)
101
353
  throw new Error('Rerank disposed');
102
354
  const self = this;
103
- const ch = channel();
104
- (async () => {
355
+ let cancelled = false;
356
+ const ch = channel(() => {
357
+ cancelled = true;
358
+ });
359
+ void (async () => {
360
+ // Per-instance serializer: capture the previous tail, register ours,
361
+ // then wait. New score() / scoreBatch() calls chain behind us.
362
+ const prev = self._inflight;
363
+ let release;
364
+ self._inflight = new Promise((r) => {
365
+ release = r;
366
+ });
105
367
  try {
106
- const queryTokens = await self._ctx.tokenize(query, false);
107
- const shared = [...self._prefixTokens, ...queryTokens, ...self._midTokens];
108
- const maxDoc = Math.floor(self._nCtx / self._nSeqMax) - shared.length - self._suffixTokens.length;
109
- const tokenArrays = documents.map((doc) => {
110
- const trimmed = doc.length > maxDoc ? doc.slice(0, maxDoc) : doc;
111
- return [...shared, ...trimmed, ...self._suffixTokens];
112
- });
113
- self._enqueue(tokenArrays, topK, ch.push, ch.finish, ch.error);
368
+ await prev;
369
+ await self._scoreInternal(query, documents, topK, ch, () => cancelled);
370
+ ch.finish();
114
371
  }
115
372
  catch (err) {
116
373
  ch.error(err instanceof Error ? err : new Error(String(err)));
117
374
  }
375
+ finally {
376
+ release();
377
+ }
118
378
  })();
119
379
  return ch.iterable;
120
380
  }
121
381
  /**
122
- * Score raw text strings against a query in one batch.
123
- *
124
- * Tokenizes texts synchronously, builds reranker prompt arrays, and
125
- * dispatches via `_scoreGroup` for parallel cross-encoder scoring.
126
- * Up to `nSeqMax` texts are scored per batch call.
127
- *
128
- * @param query - Reference query to score against
129
- * @param texts - Raw text strings to score
130
- * @returns Scores (0–1) in input order
382
+ * Batch-score raw text strings against a query. Returns logit-diff scores
383
+ * (unbounded; positive = "yes", negative = "no", magnitude = confidence) in
384
+ * input order.
131
385
  */
132
386
  async scoreBatch(query, texts) {
133
387
  if (this._disposed)
134
388
  throw new Error('Rerank disposed');
135
389
  if (texts.length === 0)
136
390
  return [];
137
- const queryTokens = this._ctx.tokenizeSync(query, false);
138
- const shared = [...this._prefixTokens, ...queryTokens, ...this._midTokens];
139
- const maxDoc = Math.floor(this._nCtx / this._nSeqMax) - shared.length - this._suffixTokens.length;
140
- const tokenArrays = texts.map((text) => {
141
- const doc = this._ctx.tokenizeSync(text, false);
142
- return [...shared, ...(doc.length > maxDoc ? doc.slice(0, maxDoc) : doc), ...this._suffixTokens];
391
+ // Acquire the serializer chain (same chain score() uses).
392
+ const prev = this._inflight;
393
+ let release;
394
+ this._inflight = new Promise((r) => {
395
+ release = r;
143
396
  });
144
- const scores = [];
145
- for (let i = 0; i < tokenArrays.length; i += this._nSeqMax) {
146
- const logits = await this._ctx._scoreGroup(tokenArrays.slice(i, i + this._nSeqMax));
147
- scores.push(...logits.map((l) => this._rerankScore(l)));
397
+ try {
398
+ await prev;
399
+ // Tokenize in parallel — the old code used tokenizeSync, blocking the
400
+ // event loop for the whole batch.
401
+ const docTokens = await Promise.all(texts.map((t) => this._ctx.tokenize(t, false)));
402
+ const scores = new Array(texts.length);
403
+ // Sink captures the cumulative scores from each emission. The final
404
+ // emission contains all positions, but every intermediate emission
405
+ // already has the scores in place — so the by-index write below is
406
+ // safe whether we observe one or many emissions.
407
+ const sink = {
408
+ push: (p) => {
409
+ for (const r of p.results) {
410
+ scores[r.index] = r.score;
411
+ }
412
+ },
413
+ finish: () => { },
414
+ error: (e) => {
415
+ throw e;
416
+ },
417
+ };
418
+ await this._scoreInternal(query, docTokens, undefined, sink, () => false);
419
+ return scores;
420
+ }
421
+ finally {
422
+ release();
148
423
  }
149
- return scores;
150
424
  }
425
+ /** Tokenize text using the reranker's underlying tokenizer. */
151
426
  async tokenize(text) {
152
427
  return this._ctx.tokenize(text, false);
153
428
  }
429
+ /** Release Rerank state, clear ctx ownership, dispose ctx. Idempotent. */
154
430
  dispose() {
431
+ if (this._disposed)
432
+ return;
155
433
  this._disposed = true;
156
- const err = new Error('Rerank disposed');
157
- for (const req of this._pending)
158
- req.error(err);
159
- this._pending.length = 0;
434
+ // pruneSubtree (CASCADE) instead of prune (RESTRICT) — if queryBranch or
435
+ // leaves leaked from a swallowed abandonment, RESTRICT prune would throw
436
+ // "branch has children" and mask the original error. pruneSubtree is
437
+ // safe on a childless trunk and correct on a partially-pruned tree.
438
+ try {
439
+ this._trunk.pruneSubtreeSync();
440
+ }
441
+ catch { /* already pruned */ }
442
+ delete this._ctx.__decodeOwner;
160
443
  this._ctx.dispose();
161
444
  }
162
- // ── Queue internals ──────────────────────────────────────────
163
- _sortResults(scores, topK) {
164
- const sorted = scores
165
- .map((score, index) => ({ score: Math.round(score * 1000) / 1000, index }))
166
- .sort((a, b) => b.score - a.score);
167
- return topK != null ? sorted.slice(0, topK) : sorted;
168
- }
169
- _enqueue(tokenArrays, topK, push, finish, error) {
170
- this._pending.push({
171
- tokenArrays, cursor: 0,
172
- scores: new Array(tokenArrays.length),
173
- filled: 0,
174
- topK,
175
- total: tokenArrays.length,
176
- push, finish, error,
177
- });
178
- this._drain();
179
- }
180
- _fillGroup() {
181
- const group = [];
182
- let added = true;
183
- while (group.length < this._nSeqMax && added) {
184
- added = false;
185
- for (let r = 0; r < this._pending.length && group.length < this._nSeqMax; r++) {
186
- const req = this._pending[r];
187
- if (req.cursor < req.tokenArrays.length) {
188
- group.push({ reqIdx: r, promptIdx: req.cursor, tokens: req.tokenArrays[req.cursor] });
189
- req.cursor++;
190
- added = true;
445
+ // ── Internals ────────────────────────────────────────────────
446
+ /**
447
+ * The shared scoring driver. Both `score()` (async-iterable) and
448
+ * `scoreBatch()` (Promise) call into this once the serializer is held.
449
+ */
450
+ async _scoreInternal(query, documents, topK, sink, isCancelled) {
451
+ const queryTokens = await this._ctx.tokenize(query, false);
452
+ const sharedLen = this._staticPrefix.length + queryTokens.length + this._midTokens.length;
453
+ const maxDoc = Math.floor(this._nCtx / this._nSeqMax) -
454
+ sharedLen -
455
+ this._suffixTokens.length;
456
+ if (maxDoc <= 0) {
457
+ throw new RerankInternalError(`Per-leaf doc budget is ${maxDoc} (nCtx=${this._nCtx}, ` +
458
+ `nSeqMax=${this._nSeqMax}, shared=${sharedLen}, ` +
459
+ `suffix=${this._suffixTokens.length}). ` +
460
+ `Query/template too long for context capacity.`);
461
+ }
462
+ // Truncation observability — fire callback once per truncated doc, even
463
+ // before any decode happens. Consumers can map this to a trace event.
464
+ if (this._onTruncate) {
465
+ for (let i = 0; i < documents.length; i++) {
466
+ if (documents[i].length > maxDoc) {
467
+ this._onTruncate({
468
+ docIndex: i,
469
+ origLen: documents[i].length,
470
+ maxLen: maxDoc,
471
+ });
191
472
  }
192
473
  }
193
474
  }
194
- return group;
195
- }
196
- async _drain() {
197
- if (this._draining)
475
+ if (documents.length === 0) {
476
+ sink.push({ filled: 0, total: 0, results: [] });
198
477
  return;
199
- this._draining = true;
478
+ }
479
+ // Fork the per-query branch from the warm trunk. cloneLogits: false
480
+ // because the next thing we do is overwrite via prefill.
481
+ const queryBranch = await this._trunk.fork({ cloneLogits: false });
482
+ if (queryBranch.disposed) {
483
+ throw new RerankInternalError('queryBranch fork returned a disposed handle (BranchStore lease exhaustion?)');
484
+ }
200
485
  try {
201
- while (this._pending.length > 0) {
202
- const group = this._fillGroup();
203
- if (group.length === 0)
486
+ // Branch.prefill mirrors the spine.prefill / root.prefill convention
487
+ // for one-off setup decodes. Routes through the same _storePrefill
488
+ // primitive that BranchStore.prefill uses for batched leaf dispatches.
489
+ await queryBranch.prefill([...queryTokens, ...this._midTokens]);
490
+ const scores = new Array(documents.length);
491
+ let i = 0;
492
+ const yesNoIndices = new Int32Array([this._yesId, this._noId]);
493
+ while (i < documents.length) {
494
+ if (isCancelled())
204
495
  break;
205
- let logits;
206
- try {
207
- logits = await this._ctx._scoreGroup(group.map((g) => g.tokens));
496
+ const available = this._store.available;
497
+ const budget = Math.min(available, documents.length - i);
498
+ if (budget === 0) {
499
+ throw new RerankInternalError(`BranchStore.available returned 0 with ${documents.length - i} ` +
500
+ `docs remaining (expected ≥1 free slot after trunk + ` +
501
+ `queryBranch leases). Check for leaked branches or under-sized ` +
502
+ `nSeqMax (currently ${this._nSeqMax}).`);
208
503
  }
209
- catch (err) {
210
- const error = err instanceof Error ? err : new Error(String(err));
211
- for (const req of this._pending)
212
- req.error(error);
213
- this._pending.length = 0;
214
- return;
215
- }
216
- // Track which requests got new scores this group
217
- const touched = new Set();
218
- for (let i = 0; i < group.length; i++) {
219
- const req = this._pending[group[i].reqIdx];
220
- req.scores[group[i].promptIdx] = this._rerankScore(logits[i]);
221
- req.filled++;
222
- touched.add(group[i].reqIdx);
504
+ const tails = new Array(budget);
505
+ for (let k = 0; k < budget; k++) {
506
+ const doc = documents[i + k];
507
+ const trimmed = doc.length > maxDoc ? doc.slice(0, maxDoc) : doc;
508
+ tails[k] = [...trimmed, ...this._suffixTokens];
223
509
  }
224
- // Push progress for each request that advanced, finish completed ones
225
- for (let r = this._pending.length - 1; r >= 0; r--) {
226
- const req = this._pending[r];
227
- if (!touched.has(r))
228
- continue;
229
- const results = this._sortResults(req.scores, req.topK);
230
- req.push({ filled: req.filled, total: req.total, results });
231
- if (req.filled === req.total) {
232
- req.finish();
233
- this._pending.splice(r, 1);
510
+ // Leaf-group try/finally — leaves prune even if scatter-prefill or
511
+ // logits read throws mid-group. Without this, the outer
512
+ // queryBranch.pruneSubtree() in the score()-level finally is the
513
+ // only path that reclaims leaves, but it can't run until the
514
+ // exception unwinds past `i += budget`.
515
+ const leaves = [];
516
+ try {
517
+ for (let k = 0; k < budget; k++) {
518
+ const leaf = await queryBranch.fork({ cloneLogits: false });
519
+ if (leaf.disposed) {
520
+ throw new RerankInternalError(`Leaf fork returned a disposed handle at k=${k}/${budget}`);
521
+ }
522
+ leaves.push(leaf);
523
+ }
524
+ // Batched scatter-prefill — N leaves in ONE llama_decode dispatch.
525
+ await this._store.prefill(leaves.map((leaf, k) => [leaf, tails[k]]));
526
+ // Read 2 floats per leaf via _branchLogitsAt — NOT n_vocab via
527
+ // _branchGetLogits. The native primitive added in R1.
528
+ for (let k = 0; k < budget; k++) {
529
+ const pair = this._ctx._branchLogitsAt(leaves[k].handle, yesNoIndices);
530
+ scores[i + k] = pair[0] - pair[1];
234
531
  }
235
532
  }
533
+ finally {
534
+ // pruneSubtree (CASCADE) is safe on a childless leaf, so use it
535
+ // uniformly. The catch-and-swallow keeps the cleanup from masking
536
+ // the original error.
537
+ await Promise.all(leaves.map((leaf) => leaf.pruneSubtree().catch(() => {
538
+ /* leaf may already be disposed by an outer cleanup */
539
+ })));
540
+ }
541
+ i += budget;
542
+ // Cumulative emission — sort on RAW scores; rounding is the
543
+ // consumer's choice. Sorting on rounded scores (the prior code's
544
+ // behavior) made tie-broken-by-insertion-order the rank decider, the
545
+ // B1 mechanism for testRerankLargeCorpus.
546
+ sink.push({
547
+ filled: i,
548
+ total: documents.length,
549
+ results: this._sortRaw(scores.slice(0, i), topK),
550
+ });
236
551
  }
237
552
  }
238
553
  finally {
239
- this._draining = false;
554
+ // CASCADE prune queryBranch + any leaked descendants. Safer than
555
+ // RESTRICT prune() if leaves leaked from a swallowed catch above.
556
+ await queryBranch.pruneSubtree().catch(() => {
557
+ /* already pruned */
558
+ });
240
559
  }
241
560
  }
242
- _rerankScore(logits) {
243
- const max = Math.max(logits[this._yesId], logits[this._noId]);
244
- const yesExp = Math.exp(logits[this._yesId] - max);
245
- const noExp = Math.exp(logits[this._noId] - max);
246
- return yesExp / (yesExp + noExp);
561
+ /**
562
+ * Sort scores descending, raw (unrounded). Consumers that want display
563
+ * rounding apply `Math.round(score * 1000) / 1000` themselves.
564
+ */
565
+ _sortRaw(scores, topK) {
566
+ const sorted = scores
567
+ .map((score, index) => ({ score, index }))
568
+ .sort((a, b) => b.score - a.score);
569
+ return topK != null ? sorted.slice(0, topK) : sorted;
247
570
  }
248
571
  }
249
572
  exports.Rerank = Rerank;
573
+ function tokenArraysEqual(a, b) {
574
+ if (a.length !== b.length)
575
+ return false;
576
+ for (let i = 0; i < a.length; i++) {
577
+ if (a[i] !== b[i])
578
+ return false;
579
+ }
580
+ return true;
581
+ }
250
582
  //# sourceMappingURL=Rerank.js.map