albex 0.3.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +466 -0
  2. package/README.md +32 -19
  3. package/dist/albex-worker.d.ts +65 -2
  4. package/dist/albex-worker.d.ts.map +1 -1
  5. package/dist/albex-worker.js +97 -20
  6. package/dist/albex-worker.js.map +1 -1
  7. package/dist/albex.d.ts +359 -55
  8. package/dist/albex.d.ts.map +1 -1
  9. package/dist/albex.js +766 -312
  10. package/dist/albex.js.map +1 -1
  11. package/dist/errors.d.ts +47 -2
  12. package/dist/errors.d.ts.map +1 -1
  13. package/dist/errors.js +41 -3
  14. package/dist/errors.js.map +1 -1
  15. package/dist/persistence.js +1 -1
  16. package/dist/pool/coordinator.d.ts +14 -6
  17. package/dist/pool/coordinator.d.ts.map +1 -1
  18. package/dist/pool/coordinator.js +65 -28
  19. package/dist/pool/coordinator.js.map +1 -1
  20. package/dist/profile.d.ts +11 -6
  21. package/dist/profile.d.ts.map +1 -1
  22. package/dist/profile.js +6 -13
  23. package/dist/profile.js.map +1 -1
  24. package/dist/resource-manager.js +1 -1
  25. package/dist/tiered-store.js +1 -1
  26. package/dist/wasm-bindings.d.ts +96 -6
  27. package/dist/wasm-bindings.d.ts.map +1 -1
  28. package/dist/wasm-bindings.js +110 -7
  29. package/dist/wasm-bindings.js.map +1 -1
  30. package/dist/worker-protocol.d.ts +23 -2
  31. package/dist/worker-protocol.d.ts.map +1 -1
  32. package/dist/worker-protocol.js +1 -1
  33. package/dist/worker-runtime.js +27 -3
  34. package/dist/worker-runtime.js.map +1 -1
  35. package/package.json +13 -9
  36. package/src/albex-worker.ts +103 -18
  37. package/src/albex.ts +2937 -2292
  38. package/src/errors.ts +63 -2
  39. package/src/pool/coordinator.ts +61 -34
  40. package/src/profile.ts +11 -10
  41. package/src/wasm-bindings.ts +225 -10
  42. package/src/worker-protocol.ts +12 -2
  43. package/src/worker-runtime.ts +28 -3
  44. package/wasm/pkg/albex_pdf.wasm +0 -0
  45. package/wasm/pkg/albex_wasm.wasm +0 -0
  46. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  47. package/wasm/pkg/albex_wasm_simd.wasm +0 -0
  48. package/wasm/pkg/albex_wasm_mini.wasm +0 -0
  49. package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
  50. package/wasm/pkg/albex_wasm_pro.wasm +0 -0
  51. package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
  52. package/wasm/pkg/albex_wasm_std.wasm +0 -0
  53. package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/dist/albex.js CHANGED
@@ -1,5 +1,5 @@
1
1
  /*!
2
- * albex v0.3.0
2
+ * albex v0.6.1
3
3
  * Zero-config local full-text search for documents — runs entirely in the browser, no server, no upload.
4
4
  * (c) 2026 RafaCalRob
5
5
  * @license MIT
@@ -21,9 +21,9 @@
21
21
  * ```
22
22
  */
23
23
  import { asAlbexExports, asAlbexPdfExports, } from './wasm-bindings.js';
24
- import { AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
24
+ import { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, assertFileSizeWithinLimit, } from './errors.js';
25
25
  import { savePersisted, loadPersisted, deletePersisted, listPersisted, } from './persistence.js';
26
- import { detectProfile, pickTier, shouldUseGpu } from './profile.js';
26
+ import { detectProfile, shouldUseGpu } from './profile.js';
27
27
  import { getResourceManager } from './resource-manager.js';
28
28
  import { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
29
29
  export { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
@@ -48,41 +48,69 @@ function warnSearchStreamDeprecated() {
48
48
  'scheduler between slices and returns a batch. The alias will be ' +
49
49
  'removed in 0.4.0.');
50
50
  }
51
- function tokenize(q) {
52
- return q.trim().split(/\s+/).filter(t => t.length > 0);
53
- }
54
- function parseQuery(q) {
55
- const trimmed = q.trim();
56
- // OR: "term1 | term2" or "phrase one | phrase two"
57
- if (trimmed.includes('|')) {
58
- const branches = trimmed.split('|')
59
- .map(p => tokenize(p.replace(/"/g, '')))
60
- .filter(b => b.length > 0);
61
- return { kind: 'or', branches };
62
- }
63
- // Phrase: "exact phrase here"
64
- const phraseMatch = /^"(.+)"$/.exec(trimmed);
65
- if (phraseMatch) {
66
- const inner = phraseMatch[1] ?? '';
67
- const tokens = tokenize(inner);
68
- return { kind: 'phrase', tokens, raw: inner };
69
- }
70
- return { kind: 'simple', tokens: tokenize(trimmed) };
71
- }
51
+ /** The std preset = the historical compile-time defaults. */
52
+ const CAPACITY_STD = {
53
+ maxDocs: 128,
54
+ maxChunks: 100_000,
55
+ textPoolBytes: 16 * 1024 * 1024,
56
+ namePoolBytes: 32 * 1024,
57
+ };
58
+ /** The large preset = the old compile-time "pro" tier. */
59
+ const CAPACITY_LARGE = {
60
+ maxDocs: 1024,
61
+ maxChunks: 800_000,
62
+ textPoolBytes: 128 * 1024 * 1024,
63
+ namePoolBytes: 256 * 1024,
64
+ };
72
65
  /**
73
- * Reconstruct a WASM-compatible query string from parsed tokens.
74
- * The WASM engine accepts up to 4 space-separated tokens (AND semantics).
66
+ * Resolve a user-facing capacity option into full numbers. Partial custom
67
+ * configs are completed from the std defaults scaled to keep std's ratios:
68
+ * `maxChunks` follows `maxDocs` (×782), `textPoolBytes` follows `maxChunks`
69
+ * (×168 B), `namePoolBytes` follows `maxDocs` (×256 B) — each with a floor
70
+ * so tiny configs stay usable. `maxChunks` is clamped to at least `maxDocs`
71
+ * (every document needs at least one chunk).
75
72
  */
76
- function tokensToWasmQuery(tokens) {
77
- return tokens.slice(0, 4).join(' ');
73
+ function resolveCapacity(capacity) {
74
+ if (capacity === undefined || capacity === 'std')
75
+ return { ...CAPACITY_STD };
76
+ if (capacity === 'large')
77
+ return { ...CAPACITY_LARGE };
78
+ const maxDocs = Math.floor(capacity.maxDocs ?? CAPACITY_STD.maxDocs);
79
+ const maxChunks = Math.max(Math.floor(capacity.maxChunks ?? Math.max(maxDocs * 782, 1024)), maxDocs);
80
+ const textPoolBytes = Math.floor(capacity.textPoolBytes ?? Math.max(maxChunks * 168, 64 * 1024));
81
+ const namePoolBytes = Math.floor(capacity.namePoolBytes ?? Math.max(maxDocs * 256, 4 * 1024));
82
+ return { maxDocs, maxChunks, textPoolBytes, namePoolBytes };
78
83
  }
79
84
  // ─────────────────────────────────────────────────────────────────────────────
80
- // Phrase post-filter
85
+ // Query parsing (WASM-side as of 0.5.0)
81
86
  // ─────────────────────────────────────────────────────────────────────────────
87
+ //
88
+ // Pre-0.5.0 this file owned parseQuery + tokenize. That created two
89
+ // truths about what a "token" was: one in TS for the query, one in Rust
90
+ // for the indexed text. The audit flagged this as the biggest divergence
91
+ // in the wrapper.
92
+ //
93
+ // 0.5.0 moves parseQuery/tokenize/tokensToWasmQuery to Rust. The TS
94
+ // dispatcher reduces to:
95
+ //
96
+ // 1. Write the raw UTF-8 query bytes to the scratchpad.
97
+ // 2. Call prepareQuery(len). Get back the kind (simple/phrase/or).
98
+ // 3. For OR: iterate getQueryBranchCount() branches, calling
99
+ // selectQueryBranch(i) + search() for each, then merge in TS.
100
+ // For simple/phrase: selectQueryBranch(0) + search().
101
+ // 4. For phrase: post-filter the snippets with containsPhrase().
102
+ //
103
+ // containsPhrase stays in TS because it operates on snippet text already
104
+ // produced by the WASM, not on the query. It is not a tokenizer.
82
105
  /**
83
- * Returns true if `snippet` contains the phrase formed by `tokens` in order,
84
- * with at most `maxGap` characters between consecutive tokens.
85
- * Comparison is case- and accent-insensitive.
106
+ * Phrase post-filter. Returns true if `snippet` contains the phrase
107
+ * formed by `tokens` in order, with at most `maxGap` characters between
108
+ * consecutive tokens. Comparison is case- and accent-insensitive.
109
+ *
110
+ * The tokens come from the WASM-compiled pattern of a phrase branch,
111
+ * not from a TS re-tokenization of the query, so there is no
112
+ * tokenization divergence: WASM said "these are the tokens", we just
113
+ * check adjacency in the snippet.
86
114
  */
87
115
  function containsPhrase(snippet, tokens, maxGap = 30) {
88
116
  const norm = (s) => s.toLowerCase().normalize('NFKD').replace(/[̀-ͯ]/g, '');
@@ -201,62 +229,57 @@ const FEED_SIZE = 32_768; // 32 KB — fits in 64 KB scratchpad
201
229
  * The result is stable across runs and engines, so it can be persisted in
202
230
  * snapshots without versioning concerns.
203
231
  */
232
+ // NOTE: the TS `computePatternBloom` that used to live here (the THIRD copy
233
+ // of the accent fold, after the Rust index side and the Rust query side) was
234
+ // removed in 0.8.0. The GPU pre-filter now reads the pattern Bloom straight
235
+ // from WASM via `getPatternBloomLo/Hi` (ABI 6) — `setPattern` computes it
236
+ // through the exact pipeline `searchBegin` uses, including Spanish stemming,
237
+ // which the TS copy never applied (audit 2.4).
204
238
  /**
205
- * Compute the same 64-bit Bloom value the Rust side computes for a query.
239
+ * Convert a UTF-8 byte offset into `bytes` to the equivalent UTF-16
240
+ * code-unit index of the decoded string. Walks lead bytes only — O(offset)
241
+ * with no allocation — counting 1 unit per BMP code point and 2 per 4-byte
242
+ * (astral, e.g. emoji) sequence. Stray continuation bytes (malformed input)
243
+ * count 1 unit each, matching TextDecoder's per-byte U+FFFD replacement.
206
244
  *
207
- * Must stay in sync with `BloomFilter::from_text` and `fold_utf8_char` in
208
- * `core/src/bloom.rs`. The hashing is `c & 0x3F` over each accent-folded
209
- * lowercase ASCII byte; non-letters are skipped. The aggregate of all token
210
- * blooms is what the GPU pre-filter checks against.
245
+ * Offsets that land mid-sequence are attributed to the code point they fall
246
+ * inside (the engine only emits code-point-aligned offsets, so this is a
247
+ * defensive clamp, not an expected path).
211
248
  */
212
- function computePatternBloom(query) {
213
- // Quick-and-faithful fold: lowercase, NFKD, strip combining marks. This
214
- // matches the Rust Latin-1/Latin-A fold for the characters we care about
215
- // (the rest fall through as non-letters which contribute nothing).
216
- const norm = query.toLowerCase().normalize('NFKD').replace(/[̀-ͯ]/g, '');
217
- let bits = 0n;
218
- for (let i = 0; i < norm.length; i++) {
219
- const code = norm.charCodeAt(i);
220
- if ((code >= 0x61 && code <= 0x7a) || (code >= 0x30 && code <= 0x39)) {
221
- bits |= 1n << BigInt(code & 0x3f);
222
- }
223
- else if (code === 0x20) {
224
- // skip token separator
225
- }
226
- else if (code < 0x80) {
227
- // other ASCII punctuation — they bias the filter; mirror Rust which
228
- // also includes them via the 6-bit mask.
229
- bits |= 1n << BigInt(code & 0x3f);
230
- }
231
- }
232
- return bits;
233
- }
234
- function contentHash(bytes) {
235
- // 64-bit arithmetic via two 32-bit halves (no BigInt to keep it fast in
236
- // engines without optimised BigInt support).
237
- let hi = 0xcbf29ce4 | 0;
238
- let lo = 0x84222325 | 0;
239
- // FNV prime: 0x100000001b3 = (0x100 << 32) | 0x000001b3
240
- for (let i = 0; i < bytes.length; i++) {
241
- lo ^= bytes[i];
242
- // multiply by FNV prime
243
- // (hi:lo) *= 0x100000001b3
244
- // low * prime
245
- const lo_lo = (lo & 0xffff) * 0x1b3;
246
- const lo_hi = (lo >>> 16) * 0x1b3;
247
- let new_lo = (lo_lo + ((lo_hi & 0xffff) << 16)) | 0;
248
- let carry = (lo_hi >>> 16) + ((lo_lo + ((lo_hi & 0xffff) << 16)) > 0xffffffff ? 1 : 0);
249
- // hi*prime + carry
250
- const hi_lo = (hi & 0xffff) * 0x1b3;
251
- const hi_hi = (hi >>> 16) * 0x1b3;
252
- const new_hi = ((hi_lo + ((hi_hi & 0xffff) << 16)) | 0) + carry + lo; // + lo because high 33rd bit
253
- lo = new_lo;
254
- hi = new_hi | 0;
255
- }
256
- const hexHi = (hi >>> 0).toString(16).padStart(8, '0');
257
- const hexLo = (lo >>> 0).toString(16).padStart(8, '0');
258
- return hexHi + hexLo;
249
+ function utf16IndexAtByte(bytes, byteOffset) {
250
+ const end = Math.min(byteOffset, bytes.length);
251
+ let units = 0;
252
+ let i = 0;
253
+ while (i < end) {
254
+ const b = bytes[i];
255
+ if (b < 0x80) {
256
+ i += 1;
257
+ units += 1;
258
+ } // ASCII
259
+ else if (b < 0xc0) {
260
+ i += 1;
261
+ units += 1;
262
+ } // stray continuation → U+FFFD
263
+ else if (b < 0xe0) {
264
+ i += 2;
265
+ units += 1;
266
+ } // 2-byte (e.g. é, ñ)
267
+ else if (b < 0xf0) {
268
+ i += 3;
269
+ units += 1;
270
+ } // 3-byte (…, €, CJK)
271
+ else {
272
+ i += 4;
273
+ units += 2;
274
+ } // 4-byte surrogate pair
275
+ }
276
+ return units;
259
277
  }
278
+ // Note: `contentHash` is implemented as a method on AlbexEngine below
279
+ // (it needs access to the WASM scratchpad). The standalone TS reference
280
+ // implementation that used to live here was removed in 0.4.0 — the
281
+ // canonical hash now lives in wasm/src/lib.rs::hashBytes so there is
282
+ // exactly one definition of "the content hash of these bytes".
260
283
  /**
261
284
  * 16-hex-char content hash → 8 raw bytes for setDocumentContentHash. The
262
285
  * byte order matches the snapshot format: the high 32 bits sit at offsets
@@ -450,11 +473,18 @@ function makePdfWasmImports(module, getPdfMem) {
450
473
  case '__wbindgen_externref_table_set_null':
451
474
  return (idx) => { heap[idx] = undefined; };
452
475
  }
453
- // Unknown import — return a stub that warns when called. Loading still
454
- // succeeds; only an actually-invoked unknown import will surface.
455
- return (...args) => {
456
- console.warn(`[albex] unhandled PDF WASM import ${modName}.${name}`, args);
457
- };
476
+ // Unknown import — fail fast. An import we don't recognise means the
477
+ // wasm-bindgen / lopdf / getrandom dependency graph has drifted from
478
+ // the prefixes this loader is written to satisfy. Accepting the
479
+ // module would defer the failure to an arbitrary execution path,
480
+ // typically deep inside extractPdf(), where the user gets either a
481
+ // hang or a misleading "PDF parse error". Refusing instantiation
482
+ // surfaces the version skew at boot, where the maintainer can act
483
+ // on it.
484
+ throw new AlbexInitError(`Unknown PDF WASM import "${modName}.${name}". ` +
485
+ `The albex_pdf.wasm binary was probably built with a newer Rust ` +
486
+ `toolchain or dependency graph than this loader was written for. ` +
487
+ `Rebuild with 'npm run build:pdf-wasm' or open an issue.`);
458
488
  };
459
489
  const imports = {};
460
490
  for (const { module: modName, name } of required) {
@@ -474,39 +504,105 @@ export class AlbexEngine {
474
504
  * runtime dependency on OCR — this is a structural slot that the optional
475
505
  * companion package fills.
476
506
  */
477
- ocrImage;
478
507
  /**
479
- * Optional OCR-side configuration set by `@albex/ocr::enableOcr`. Read
480
- * by the engine to decide whether to invoke OCR on top of the text it
481
- * already extracted from a PDF (hybrid PDFs: native text + images that
482
- * also contain text, like stamps, scanned annexes, or diagrams with
483
- * labels).
484
- *
485
- * When `alwaysExtractEmbeddedImages` is true, every page of every PDF
486
- * passes through `extractPageImages` after the normal text extraction;
487
- * any image that meets the size filter (200×200 in Rust) is fed to
488
- * `ocrImage`. Performance cost: 1–3 s per qualifying image.
489
- *
490
- * Off by default — set this opt-in via the OCR module's options.
508
+ * Public OCR entry point. Forwards to the attached OCR adapter installed
509
+ * via `attachOcr()`. Reading this property is a feature-detect for
510
+ * integrators: `if (engine.ocrImage) { ... OCR available ... }`. Writing
511
+ * to it directly is no longer supported in 0.5.0+ — use `attachOcr`.
491
512
  */
492
- ocrConfig;
513
+ get ocrImage() {
514
+ return this._ocrAdapter?.recognize;
515
+ }
516
+ /** Private adapter slot. Holds the OCR plugin contract installed by
517
+ * `attachOcr()`. The engine reads `recognize` and `options` here; the
518
+ * caller never gets a reference to this object directly. */
519
+ _ocrAdapter = null;
493
520
  // ── PDF WASM (lazy) ──
494
521
  _pdfWasm = null;
495
522
  _pdfMem = null;
496
523
  _docs = [];
497
524
  _lastSearch = null;
498
- _tier = null;
525
+ /** Raw truncation bitflags from the most recent prepareQuery (ABI 5):
526
+ * 1 = branches dropped, 2 = tokens dropped/clipped, 4 = query bytes cut.
527
+ * Captured right after prepareQuery so every _lastSearch built for that
528
+ * query (including per-branch OR runs) reports the same flags. */
529
+ _lastTruncFlags = 0;
530
+ /** Structured diagnostics collected during the most recent operation.
531
+ * Drained by `takeDiagnostics()`. Capped at 256 entries to avoid
532
+ * unbounded memory growth in pathological cases (very corrupted
533
+ * corpora producing thousands of recovery warnings). */
534
+ _diagnostics = [];
535
+ /** Resolved runtime capacity (set in init(); reused by reset()). */
536
+ _capacity = { ...CAPACITY_STD };
499
537
  _simd = false;
500
538
  _profile = null;
501
539
  _resources = null;
502
540
  _gpu = null;
503
- _gpuChunkCountUploaded = 0;
541
+ /** True when the GPU-resident Bloom array no longer mirrors the WASM
542
+ * chunk array. Set by EVERY index mutation (indexFile, removeDocument,
543
+ * compact, reset, load) and cleared after a successful upload. A plain
544
+ * chunk-count comparison is NOT enough: compact() can reorder blooms
545
+ * while keeping the count identical, which would silently filter the
546
+ * wrong chunks (audit 1.5). */
547
+ _gpuUploadDirty = true;
504
548
  _unsubscribeResources = null;
505
549
  _opts;
506
- constructor(opts) {
550
+ // ── Concurrency guard ──────────────────────────────────────────────────────
551
+ // One WASM instance, global mutable state, async ops that yield to the
552
+ // scheduler between slices. Two overlapping operations corrupt each other
553
+ // (e.g. a fresh searchBegin resets the cursor of an in-flight cooperative
554
+ // search). Async ops serialize through `_opChain`; sync mutators/searches
555
+ // assert the engine is idle (audit 0.6.0, finding #2).
556
+ _opChain = Promise.resolve();
557
+ _busy = false;
558
+ constructor(opts = {}) {
507
559
  this._opts = opts;
508
560
  }
509
- /** Load and initialise the main WASM module. Must be called before any other method. */
561
+ /** Serialize an async engine operation behind any in-flight one. */
562
+ _exclusive(fn) {
563
+ const run = this._opChain.then(async () => {
564
+ this._busy = true;
565
+ try {
566
+ return await fn();
567
+ }
568
+ finally {
569
+ this._busy = false;
570
+ }
571
+ });
572
+ // Swallow result/error on the chain so one failure can't wedge the queue.
573
+ this._opChain = run.then(() => undefined, () => undefined);
574
+ return run;
575
+ }
576
+ /** Guard a synchronous mutator/search: refuse to run mid-async-operation
577
+ * rather than silently corrupt the shared WASM state. */
578
+ _assertIdle(method) {
579
+ if (this._busy) {
580
+ throw new AlbexError('busy', `${method}() was called while an async engine operation is still ` +
581
+ `running. Await the previous indexFile/save/load/replaceDocument/` +
582
+ `searchCooperative call, or use searchCooperative instead of search().`);
583
+ }
584
+ }
585
+ /** Compact opportunistically when tombstones pile up under text pressure,
586
+ * so repeated removeDocument/replaceDocument don't exhaust the pool. */
587
+ _autoCompactIfNeeded() {
588
+ const w = this._wasm;
589
+ const cap = w.getTextCapacity();
590
+ const hasTombstones = w.getDocCount() > this._docs.length;
591
+ if (hasTombstones && cap > 0 && w.getTextUsed() / cap > 0.85) {
592
+ w.compact();
593
+ this._gpuUploadDirty = true;
594
+ }
595
+ }
596
+ /**
597
+ * Load and initialise the main WASM module. Must be called before any
598
+ * other method.
599
+ *
600
+ * Resolves `opts.capacity` ('std' default · 'large' · explicit object)
601
+ * and sizes the WASM pools accordingly via `initWithCapacity` (ABI 7).
602
+ * Memory cost ≈ `maxChunks × 64 B + textPoolBytes + namePoolBytes` —
603
+ * ~22 MB for 'std', ~180 MB for 'large'. Throws `AlbexInitError` if the
604
+ * requested capacity is out of range or the allocation fails.
605
+ */
510
606
  async init() {
511
607
  const url = await this._resolveWasmUrl();
512
608
  const res = await fetch(url);
@@ -515,7 +611,15 @@ export class AlbexEngine {
515
611
  const { instance } = await WebAssembly.instantiateStreaming(res, {});
516
612
  this._wasm = asAlbexExports(instance.exports);
517
613
  this._mem = this._wasm.memory;
518
- this._wasm.init();
614
+ this._capacity = resolveCapacity(this._opts.capacity);
615
+ const c = this._capacity;
616
+ if (this._wasm.initWithCapacity(c.maxDocs, c.maxChunks, c.textPoolBytes, c.namePoolBytes) !== 1) {
617
+ throw new AlbexInitError(`initWithCapacity(${c.maxDocs} docs, ${c.maxChunks} chunks, ` +
618
+ `${c.textPoolBytes} text bytes, ${c.namePoolBytes} name bytes) failed — ` +
619
+ `parameters out of range (docs 1-65536, chunks ≥ docs and ≤ 4194304, ` +
620
+ `text 4 KiB-1 GiB, names 256 B-16 MiB) or the WASM memory allocation ` +
621
+ `was refused by the host.`);
622
+ }
519
623
  // Subscribe to environmental signals. Cheap and benign in node tests
520
624
  // (the manager tolerates missing globals).
521
625
  const rm = getResourceManager();
@@ -531,22 +635,17 @@ export class AlbexEngine {
531
635
  }
532
636
  /**
533
637
  * Decide which `.wasm` binary to fetch. Order of precedence:
534
- * 1. `opts.wasmUrl` if provided — used verbatim.
535
- * 2. `opts.tier` if explicit — joined with `wasmBaseUrl`.
536
- * 3. `opts.wasmBaseUrl` + tier picked from the device profile.
537
- *
538
- * Order of precedence:
539
638
  * 1. `opts.wasmUrl` literal → use verbatim
540
- * 2. `opts.wasmBaseUrl` + tier suffix → fetched from that directory
639
+ * 2. `opts.wasmBaseUrl` + simd suffix → fetched from that directory
541
640
  * 3. zero-config default → `albex_wasm_bg.wasm` packaged
542
641
  * next to this file, resolved
543
642
  * via `import.meta.url`
544
643
  *
545
- * The zero-config default loads the std-baseline binary. Tier auto-detection
546
- * is only active when `wasmBaseUrl` is given, because picking a tier in
547
- * runtime would defeat any bundler's static asset rewriting. Users who want
548
- * tier optimisation must serve the six variants themselves and pass the
549
- * directory through `wasmBaseUrl`.
644
+ * There are exactly two main binaries (baseline + SIMD); capacity is a
645
+ * RUNTIME parameter since ABI 7, so it never affects which file is
646
+ * fetched. SIMD auto-detection is only active when `wasmBaseUrl` is
647
+ * given, because picking a URL at runtime would defeat any bundler's
648
+ * static asset rewriting.
550
649
  */
551
650
  async _resolveWasmUrl() {
552
651
  const o = this._opts;
@@ -562,31 +661,26 @@ export class AlbexEngine {
562
661
  // as an asset reference. They copy the .wasm to the output directory and
563
662
  // rewrite the URL automatically. Consumers who use one of those bundlers
564
663
  // get a working `new AlbexEngine()` with no manual setup.
565
- if (!o.wasmBaseUrl) {
566
- // We can't tier-select with one URL, so fall back to std baseline.
567
- // The integrator who wants tier optimisation must opt in via wasmBaseUrl.
568
- this._tier = 'std';
569
- this._simd = false;
570
- return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
571
- }
572
- let tier;
573
- if (o.tier && o.tier !== 'auto')
574
- tier = o.tier;
575
- else
576
- tier = pickTier(profile);
577
- this._tier = tier;
664
+ // 0.5.0+: two main binaries only — baseline and SIMD (the tier system
665
+ // is gone; capacity became a runtime parameter in ABI 7). Selection
666
+ // collapses to a single boolean: SIMD on or off, decided either by the
667
+ // explicit `simd` option or by a runtime probe.
578
668
  const simd = o.simd === 'on'
579
669
  ? true
580
670
  : o.simd === 'off'
581
671
  ? false
582
672
  : !!profile?.wasm.simd;
583
673
  this._simd = simd;
584
- const suffix = simd ? `${tier}_simd` : tier;
674
+ if (!o.wasmBaseUrl) {
675
+ // Zero-config: bundler resolves the .wasm next to dist/. We only
676
+ // ship the baseline alias (albex_wasm_bg.wasm) inside the npm
677
+ // package; integrators who want SIMD must serve both binaries
678
+ // themselves via `wasmBaseUrl`.
679
+ return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
680
+ }
585
681
  const base = o.wasmBaseUrl.replace(/\/+$/, '');
586
- return `${base}/albex_wasm_${suffix}.wasm`;
682
+ return simd ? `${base}/albex_wasm_simd.wasm` : `${base}/albex_wasm.wasm`;
587
683
  }
588
- /** The tier that was actually loaded. `null` until `init()` resolves. */
589
- get tier() { return this._tier; }
590
684
  /** True if the SIMD-accelerated binary was loaded. */
591
685
  get simdEnabled() { return this._simd; }
592
686
  /** True if a WebGPU device is acquired and the next search will use it. */
@@ -622,8 +716,14 @@ export class AlbexEngine {
622
716
  * No-op if the GPU device hasn't been acquired yet — first call attempts
623
717
  * `init()` lazily; if that fails, the candidate path is permanently
624
718
  * disabled for this engine instance.
719
+ *
720
+ * IMPORTANT: this method CLOBBERS the scratchpad (the candidate bitset
721
+ * is pushed through it via `setCandidateMask`). Any pattern previously
722
+ * staged by `selectQueryBranch` is destroyed — the caller MUST re-select
723
+ * the active branch before calling `searchBegin`, which snapshots the
724
+ * pattern from the scratchpad (audit 1.2).
625
725
  */
626
- async _gpuPreFilter(wasmQuery) {
726
+ async _gpuPreFilter() {
627
727
  const gpu = this._gpu;
628
728
  if (!gpu)
629
729
  return;
@@ -637,20 +737,26 @@ export class AlbexEngine {
637
737
  const chunkCount = this._wasm.getChunkCount();
638
738
  if (chunkCount === 0)
639
739
  return;
640
- // Upload blooms if the corpus changed. We re-upload everything on any
641
- // delta; incremental delta-upload is a future optimisation.
642
- if (chunkCount !== this._gpuChunkCountUploaded) {
740
+ // Upload blooms if the corpus changed since the last upload. The
741
+ // signal is a dirty flag set by every index mutation — not a chunk
742
+ // count comparison, because compact() can reorder blooms while
743
+ // keeping the count identical (audit 1.5). We re-upload everything
744
+ // on any delta; incremental delta-upload is a future optimisation.
745
+ if (this._gpuUploadDirty) {
643
746
  const ptr = this._wasm.getChunksPtr();
644
747
  const stride = this._wasm.getChunkStructSize();
645
748
  const bytes = new Uint8Array(this._mem.buffer, ptr, chunkCount * stride);
646
749
  const blooms = packBloomsFromChunks(bytes, chunkCount);
647
750
  gpu.uploadChunkBlooms(blooms, chunkCount);
648
- this._gpuChunkCountUploaded = chunkCount;
649
- }
650
- // Build the pattern Bloom on the JS side: same hash as Rust
651
- // (`c & 0x3F` after accent-folding), aggregated across all tokens.
652
- const patternBloom = computePatternBloom(wasmQuery);
653
- const passes = await gpu.scan(Number(patternBloom & 0xffffffffn), Number((patternBloom >> 32n) & 0xffffffffn));
751
+ this._gpuUploadDirty = false;
752
+ }
753
+ // Pattern Bloom comes straight from WASM (ABI 6): `selectQueryBranch` /
754
+ // `setPattern` computed it through the same pipeline `searchBegin`
755
+ // uses: split, optional Spanish stemming, accent fold, `c & 0x3F`.
756
+ // The retired TS copy of the fold never stemmed, so with `setLanguage
757
+ // ('es')` it could set bits for suffixes the CPU pattern no longer
758
+ // had → over-restrictive mask → silent false negatives (audit 2.4).
759
+ const passes = await gpu.scan(this._wasm.getPatternBloomLo(), this._wasm.getPatternBloomHi());
654
760
  // Push the bitset back into WASM via the scratchpad.
655
761
  const passBytes = new Uint8Array(passes.buffer, passes.byteOffset, passes.byteLength);
656
762
  this._writePad(passBytes);
@@ -676,6 +782,16 @@ export class AlbexEngine {
676
782
  const ptr = this._wasm.getBuffer(0);
677
783
  return _dec.decode(this._u8(ptr, n));
678
784
  }
785
+ /** Copy `n` scratchpad bytes out of WASM memory. The copy is private to
786
+ * JS, so it survives later WASM calls (and memory growth) — used when the
787
+ * caller needs both the raw bytes (UTF-16 span mapping) and the decoded
788
+ * string of the same payload. */
789
+ _readPadBytes(n) {
790
+ const ptr = this._wasm.getBuffer(0);
791
+ const out = new Uint8Array(n);
792
+ out.set(this._u8(ptr, n));
793
+ return out;
794
+ }
679
795
  _feedText(text) {
680
796
  const b = _enc.encode(text);
681
797
  for (let i = 0; i < b.length; i += FEED_SIZE) {
@@ -684,6 +800,34 @@ export class AlbexEngine {
684
800
  this._wasm.feedText(c.length);
685
801
  }
686
802
  }
803
+ /**
804
+ * Compute the FNV-1a 64-bit content hash of `bytes` via the WASM
805
+ * streaming API. Returns a 16-character hex string identical in shape
806
+ * to what the TS implementation in 0.3.x returned, so all callers
807
+ * stay unchanged. Single source of truth — same hash whether we use
808
+ * it for indexFile dedup, for snapshot v2 persistence, or anywhere
809
+ * else. Large inputs are chunked at FEED_SIZE just like _feedText.
810
+ */
811
+ _contentHash(bytes) {
812
+ const w = this._wasm;
813
+ w.hashBegin();
814
+ for (let i = 0; i < bytes.length; i += FEED_SIZE) {
815
+ const c = bytes.subarray(i, i + FEED_SIZE);
816
+ this._writePad(c);
817
+ w.hashFeed(c.length);
818
+ }
819
+ w.hashFinish();
820
+ // Read 8 result bytes back from scratchpad[0..8].
821
+ const ptr = w.getBuffer(8);
822
+ const out = this._u8(ptr, 8);
823
+ // Big-endian to hex. Same layout as the old hexHi + hexLo output:
824
+ // high u32 first (4 bytes), low u32 second (4 bytes).
825
+ let s = '';
826
+ for (let i = 0; i < 8; i++) {
827
+ s += out[i].toString(16).padStart(2, '0');
828
+ }
829
+ return s;
830
+ }
687
831
  _feedXmlBytes(xml, fn) {
688
832
  const feeder = this._wasm[fn];
689
833
  for (let i = 0; i < xml.length; i += FEED_SIZE) {
@@ -706,7 +850,10 @@ export class AlbexEngine {
706
850
  // called when the user actually drops a PDF — but we issue a console
707
851
  // hint so embedders can surface a "this will download ~1 MB" prompt.
708
852
  if (this._resources?.constrainedNetwork) {
709
- console.info('[albex] downloading PDF WASM (~1 MB) on a constrained network connection');
853
+ this._diag({
854
+ kind: 'info', stage: 'network',
855
+ message: 'Downloading PDF WASM (~1 MB) on a constrained network connection',
856
+ });
710
857
  }
711
858
  const res = await fetch(pdfUrl);
712
859
  if (!res.ok)
@@ -831,20 +978,14 @@ export class AlbexEngine {
831
978
  this._feedText(text);
832
979
  this._wasm.flushParagraph();
833
980
  }
834
- // Hybrid OCR pass: when the OCR module is wired with
835
- // `alwaysExtractEmbeddedImages: true`, also walk every page for
836
- // embedded images and OCR them on top of the vector text.
837
- //
838
- // We always log the decision so users debugging "why isn't OCR
839
- // firing on my hybrid PDF" can see which precondition failed.
840
- const hybridOn = !!this.ocrConfig?.alwaysExtractEmbeddedImages;
841
- const hasOcr = !!this.ocrImage;
842
- const binSupportsImages = typeof pw.extractPageImages === 'function'
843
- && typeof pw.getPageCount === 'function';
844
- console.log(`[albex] hybrid OCR decision: ocrImage=${hasOcr} ocrConfig.alwaysExtractEmbeddedImages=${hybridOn} binarySupportsImages=${binSupportsImages}`);
845
- if (hasOcr && hybridOn && binSupportsImages) {
981
+ // Hybrid OCR pass: when the OCR adapter is wired with
982
+ // `options.alwaysExtractEmbeddedImages: true`, also walk every page
983
+ // for embedded images and OCR them on top of the vector text.
984
+ if (this._ocrAdapter
985
+ && this._ocrAdapter.options?.alwaysExtractEmbeddedImages
986
+ && typeof pw.extractPageImages === 'function'
987
+ && typeof pw.getPageCount === 'function') {
846
988
  const totalPages = pw.getPageCount();
847
- console.log(`[albex] hybrid OCR pass starting over ${totalPages} page(s)`);
848
989
  for (let p = 0; p < totalPages; p++) {
849
990
  const ocrText = await this._ocrPageEmbeddedImages(pw, p);
850
991
  if (ocrText === null)
@@ -930,7 +1071,10 @@ export class AlbexEngine {
930
1071
  // so `_ensurePdfWasm` re-instantiates on the next call.
931
1072
  this._pdfWasm = null;
932
1073
  this._pdfMem = null;
933
- console.warn(`[albex] PDF image extractor trapped on page ${page + 1}: ${e instanceof Error ? e.message : String(e)}. Stopping OCR.`);
1074
+ this._diag({
1075
+ kind: 'skipped', stage: 'pdf', page: page + 1,
1076
+ message: `PDF image extractor trapped: ${e instanceof Error ? e.message : String(e)}. Remaining pages skipped.`,
1077
+ });
934
1078
  return null;
935
1079
  }
936
1080
  if (imageCount <= 0)
@@ -954,15 +1098,6 @@ export class AlbexEngine {
954
1098
  const copy = new Uint8Array(len);
955
1099
  copy.set(new Uint8Array(liveMem.buffer, ptr, len));
956
1100
  const blob = new Blob([copy.buffer], { type: mime });
957
- // Defensive diagnostics: when an OCR call goes wrong (Tesseract
958
- // worker abort, malformed JPEG, etc.) the first thing we want to
959
- // see is whether we even handed it valid image bytes. A real JPEG
960
- // starts with FF D8 FF (E0 for JFIF, E1 for EXIF). A JPEG2000
961
- // starts with 00 00 00 0C 6A 50 20 20.
962
- const magic = Array.from(copy.subarray(0, 4))
963
- .map(b => b.toString(16).padStart(2, '0'))
964
- .join(' ');
965
- console.log(`[albex] OCR page ${page + 1} image ${i + 1}/${imageCount}: kind=${kind} (${mime}) len=${len} bytes magic=${magic}`);
966
1101
  try {
967
1102
  const { text } = await ocr(blob);
968
1103
  const trimmed = text?.trim();
@@ -977,7 +1112,10 @@ export class AlbexEngine {
977
1112
  // "Aborted(-1)") are also caught here; if they bypass the
978
1113
  // promise rejection and surface as `uncaught` instead, the
979
1114
  // demo's window.onerror handler will keep the app alive.
980
- console.warn(`[albex] OCR failed on page ${page + 1} image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`);
1115
+ this._diag({
1116
+ kind: 'skipped', stage: 'ocr', page: page + 1,
1117
+ message: `OCR failed on image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`,
1118
+ });
981
1119
  }
982
1120
  }
983
1121
  return pageText;
@@ -1018,7 +1156,10 @@ export class AlbexEngine {
1018
1156
  new Uint8Array(pw.memory.buffer, inPtr, bytes.length).set(bytes);
1019
1157
  }
1020
1158
  catch (e) {
1021
- console.warn(`[albex] PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`);
1159
+ this._diag({
1160
+ kind: 'skipped', stage: 'pdf',
1161
+ message: `PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`,
1162
+ });
1022
1163
  return null;
1023
1164
  }
1024
1165
  // Set up the doc and let _indexPdfScanned do the page-by-page walk.
@@ -1027,7 +1168,10 @@ export class AlbexEngine {
1027
1168
  // first page, no paragraphs are emitted and we end up with 0 chunks.
1028
1169
  this._wasm.setDocumentName(this._writeStr(file.name));
1029
1170
  this._wasm.beginDocument();
1030
- console.info(`[albex] pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf for ${file.name}`);
1171
+ this._diag({
1172
+ kind: 'fallback', stage: 'pdf', file: file.name,
1173
+ message: `pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf`,
1174
+ });
1031
1175
  await this._indexPdfScanned(pw);
1032
1176
  return this._wasm.endDocument();
1033
1177
  }
@@ -1483,20 +1627,29 @@ export class AlbexEngine {
1483
1627
  };
1484
1628
  // ── Public API ────────────────────────────────────────────────────────────
1485
1629
  /**
1486
- * Index a file. Supported formats: DOCX, XLSX, PDF, TXT, XML.
1630
+ * Index a file. Supported formats (11, with varying depth): DOCX, XLSX, PDF,
1631
+ * HTML, MD, JSON, CSV, EML, RTF, TXT, XML. Several are deliberately "lite"
1632
+ * (CSV is RFC-4180-lite, EML is MIME-lite, RTF is regex-stripped).
1487
1633
  * Throws for unsupported formats or parse errors.
1488
1634
  */
1489
1635
  async indexFile(file) {
1636
+ return this._exclusive(() => this._indexFileInner(file));
1637
+ }
1638
+ async _indexFileInner(file) {
1490
1639
  const ext = file.name.split('.').pop()?.toLowerCase() ?? '';
1491
1640
  const indexer = AlbexEngine._INDEXERS[ext];
1492
1641
  if (!indexer)
1493
1642
  throw new AlbexUnsupportedFormatError(ext);
1643
+ // Size guard BEFORE reading: `file.size` is available without buffering,
1644
+ // so a pathological input (a 2 GB .txt) is refused with a typed error
1645
+ // instead of being fully loaded and hashed first (audit 3.5).
1646
+ assertFileSizeWithinLimit(file, this._opts.maxFileBytes);
1494
1647
  // Hash the source bytes for idempotency. We always read the bytes once
1495
1648
  // here so the indexer can reuse them — avoids a double File.arrayBuffer().
1496
1649
  const bytes = new Uint8Array(await file.arrayBuffer());
1497
- const hash = contentHash(bytes);
1650
+ const hash = this._contentHash(bytes);
1498
1651
  // Idempotency: if a non-deleted doc already has this hash, return it
1499
- // unchanged. Cheap O(N) scan since MAX_DOCS = 128.
1652
+ // unchanged. O(doc_count) scan cheap at any supported capacity.
1500
1653
  const existing = this._docs.find(d => d.contentHash === hash);
1501
1654
  if (existing)
1502
1655
  return existing;
@@ -1516,6 +1669,31 @@ export class AlbexEngine {
1516
1669
  w.setDocumentContentHash(hashBytes.length);
1517
1670
  }
1518
1671
  const chunks = await indexer(this, file, bytes);
1672
+ // Capacity check (0.6.0). The WASM pools fill silently and break out of
1673
+ // their ingest loops; getLastIndexOverflow reports which one filled.
1674
+ // Surface a typed error instead of returning a half-indexed document the
1675
+ // caller cannot tell apart from a complete one (audit finding #3).
1676
+ const overflow = w.getLastIndexOverflow();
1677
+ if (overflow !== 0) {
1678
+ const which = (overflow & 1) ? 'chunks' : (overflow & 2) ? 'text'
1679
+ : (overflow & 4) ? 'docs' : 'names';
1680
+ // The RUNTIME limit of the pool that overflowed, as configured via
1681
+ // `capacity` (std defaults · 'large' · custom object).
1682
+ const max = which === 'chunks' ? w.getMaxChunks()
1683
+ : which === 'text' ? w.getTextCapacity()
1684
+ : which === 'docs' ? w.getMaxDocs()
1685
+ : w.getNameCapacity();
1686
+ const pools = [
1687
+ overflow & 1 ? 'chunk pool' : '',
1688
+ overflow & 2 ? 'text pool' : '',
1689
+ overflow & 4 ? 'document table' : '',
1690
+ overflow & 8 ? 'name pool' : '',
1691
+ ].filter(Boolean).join(', ');
1692
+ throw new AlbexCapacityError(`Index capacity exceeded while indexing "${file.name}" (${pools} full, ` +
1693
+ `${which} limit = ${max}). The document was rolled back (not indexed); ` +
1694
+ `treat the index as full (compact(), shard across an AlbexPool, ` +
1695
+ `reset(), or re-create the engine with a bigger \`capacity\`).`, which, max);
1696
+ }
1519
1697
  // The new doc occupies slot `docCountBefore`.
1520
1698
  const docId = w.getDocId(docCountBefore);
1521
1699
  const doc = {
@@ -1528,6 +1706,7 @@ export class AlbexEngine {
1528
1706
  contentHash: hash,
1529
1707
  };
1530
1708
  this._docs.push(doc);
1709
+ this._gpuUploadDirty = true;
1531
1710
  return doc;
1532
1711
  }
1533
1712
  /**
@@ -1538,12 +1717,17 @@ export class AlbexEngine {
1538
1717
  * Returns `true` if a matching document was found and tombstoned.
1539
1718
  */
1540
1719
  removeDocument(id) {
1720
+ this._assertIdle('removeDocument');
1721
+ return this._removeDocumentInner(id);
1722
+ }
1723
+ _removeDocumentInner(id) {
1541
1724
  const doc = this._docs.find(d => d.name === id || d.contentHash === id);
1542
1725
  if (!doc)
1543
1726
  return false;
1544
1727
  const ok = this._wasm.removeDocument(doc.docId) === 1;
1545
1728
  if (ok) {
1546
1729
  this._docs = this._docs.filter(d => d !== doc);
1730
+ this._gpuUploadDirty = true;
1547
1731
  }
1548
1732
  return ok;
1549
1733
  }
@@ -1553,12 +1737,15 @@ export class AlbexEngine {
1553
1737
  * idempotency check (so re-indexing the *same* bytes after a remove works).
1554
1738
  */
1555
1739
  async replaceDocument(name, newFile) {
1556
- this.removeDocument(name);
1557
- // Force a unique-hash path by indexing directly; if the new file happens
1558
- // to hash identically to a still-tracked document, the dedupe in
1559
- // indexFile will return that one. The remove above prevents the
1560
- // common case.
1561
- return this.indexFile(newFile);
1740
+ return this._exclusive(async () => {
1741
+ this._removeDocumentInner(name);
1742
+ // Index directly via the inner path (we already hold the lock).
1743
+ const doc = await this._indexFileInner(newFile);
1744
+ // Repeated replaces leave tombstones in the text pool; reclaim under
1745
+ // pressure so the pool isn't silently exhausted (audit finding #7).
1746
+ this._autoCompactIfNeeded();
1747
+ return doc;
1748
+ });
1562
1749
  }
1563
1750
  /**
1564
1751
  * Reclaim storage from previously removed documents. Compacts CHUNKS,
@@ -1568,7 +1755,78 @@ export class AlbexEngine {
1568
1755
  * references (e.g. in a UI) remain valid.
1569
1756
  */
1570
1757
  compact() {
1758
+ this._assertIdle('compact');
1571
1759
  this._wasm.compact();
1760
+ // compact() reorders the chunk array (and therefore the per-chunk
1761
+ // blooms) even when the chunk count stays the same — the GPU copy is
1762
+ // stale no matter what (audit 1.5).
1763
+ this._gpuUploadDirty = true;
1764
+ }
1765
+ /**
1766
+ * Enumerate the authoritative chunks Albex indexed for a document, in order.
1767
+ * Lets a host mirror Albex's exact chunking — e.g. embed the same units for a
1768
+ * parallel semantic index keyed on the same {@link AuthoritativeChunk.id}
1769
+ * (`"<docId>::<ord>"`, identical to {@link SearchResult.chunkId}). `docId` is
1770
+ * `IndexedDocument.docId` from {@link indexFile}; returns `[]` if no live
1771
+ * document has that id.
1772
+ *
1773
+ * The returned `id`/`ord`/`sub` are stable across {@link compact} and
1774
+ * snapshot save/load. Never key persistent structures on a search result's
1775
+ * absolute `chunkIdx`, which {@link compact} renumbers.
1776
+ */
1777
+ listChunks(docId) {
1778
+ this._assertIdle('listChunks');
1779
+ const w = this._wasm;
1780
+ const slot = this._docSlotOf(docId);
1781
+ if (slot < 0)
1782
+ return [];
1783
+ const count = w.getDocChunkCount(slot);
1784
+ const out = [];
1785
+ let prevLocation = -1;
1786
+ let sub = 0;
1787
+ // Batched enumeration (ABI 6): one `listChunksBatch` frontier call per
1788
+ // scratchpad-full of chunks instead of 2-3 calls per chunk (audit 2.6 —
1789
+ // an embeddings pipeline over 100k chunks used to make ~300k calls).
1790
+ // Each batch packs records as [u32 text_len][u32 location][text bytes],
1791
+ // tightly, in ordinal order; layout documented in wasm/src/lib.rs.
1792
+ let ord = 0;
1793
+ while (ord < count) {
1794
+ const n = w.listChunksBatch(slot, ord, count - ord);
1795
+ if (n === 0)
1796
+ break; // defensive — should not happen for a live slot
1797
+ const ptr = w.getBuffer(0);
1798
+ // The view is only valid until the next frontier call; everything is
1799
+ // decoded out of it inside this loop body before the next batch.
1800
+ const view = new DataView(this._mem.buffer);
1801
+ let off = ptr;
1802
+ for (let k = 0; k < n; k++) {
1803
+ const byteLen = view.getUint32(off, true);
1804
+ const location = view.getUint32(off + 4, true);
1805
+ const text = byteLen > 0
1806
+ ? _dec.decode(new Uint8Array(this._mem.buffer, off + 8, byteLen))
1807
+ : '';
1808
+ if (location === prevLocation)
1809
+ sub++;
1810
+ else {
1811
+ sub = 0;
1812
+ prevLocation = location;
1813
+ }
1814
+ out.push({ docId, location, ord, sub, text, byteLen, id: `${docId}::${ord}` });
1815
+ ord++;
1816
+ off += 8 + byteLen;
1817
+ }
1818
+ }
1819
+ return out;
1820
+ }
1821
+ /** Doc-table slot (0..getDocCount) whose stable id is `docId`, or -1. */
1822
+ _docSlotOf(docId) {
1823
+ const w = this._wasm;
1824
+ const n = w.getDocCount();
1825
+ for (let i = 0; i < n; i++) {
1826
+ if (w.getDocId(i) === docId)
1827
+ return i;
1828
+ }
1829
+ return -1;
1572
1830
  }
1573
1831
  /**
1574
1832
  * Search the index. Supports:
@@ -1578,17 +1836,42 @@ export class AlbexEngine {
1578
1836
  *
1579
1837
  * Pass `{ windowed: true }` to receive cropped snippets with ASCII ellipsis
1580
1838
  * markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
1839
+ *
1840
+ * Note: this synchronous path never uses the GPU pre-filter — the WebGPU
1841
+ * scan is asynchronous by nature. Only `searchCooperative` (the budgeted
1842
+ * path) engages the GPU; `search()` always runs the CPU Bloom pre-filter,
1843
+ * regardless of the `gpu` option.
1581
1844
  */
1582
1845
  search(query, opts = {}) {
1583
- const parsed = parseQuery(query);
1584
- if (parsed.kind === 'or') {
1585
- return this._searchOr(parsed.branches, query, opts);
1586
- }
1587
- const results = this._runSearch(tokensToWasmQuery(parsed.tokens), query, opts);
1588
- if (parsed.kind === 'phrase') {
1589
- return results.filter(r => containsPhrase(r.snippet, parsed.tokens));
1590
- }
1591
- return results;
1846
+ this._assertIdle('search');
1847
+ const w = this._wasm;
1848
+ const ql = this._writeStr(query);
1849
+ const kind = w.prepareQuery(ql);
1850
+ this._lastTruncFlags = w.getQueryTruncationFlags();
1851
+ if (kind < 0)
1852
+ return [];
1853
+ if (kind === 2) {
1854
+ // OR: iterate branches and merge in TS. WASM stores compiled
1855
+ // branches internally so we never re-tokenize on the host.
1856
+ return this._searchOr(query, opts);
1857
+ }
1858
+ w.selectQueryBranch(0);
1859
+ // Phrase queries (kind 1) post-filter on adjacency. Pass the tokens down
1860
+ // so the check runs against the FULL chunk text, not a cropped windowed
1861
+ // snippet — otherwise `{ windowed: true }` could drop a valid phrase hit
1862
+ // whose second term fell outside the window (audit finding #7).
1863
+ const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
1864
+ return this._runSearch(query, opts, phraseTokens);
1865
+ }
1866
+ /** Read the WASM-compiled tokens of branch `i` for phrase post-filter.
1867
+ * The bytes returned are exactly what the WASM tokenizer produced —
1868
+ * no TS re-tokenization. */
1869
+ _branchTokens(i) {
1870
+ const n = this._wasm.getQueryBranchPattern(i);
1871
+ if (n === 0)
1872
+ return [];
1873
+ const pattern = this._readPad(n);
1874
+ return pattern.split(' ').filter(t => t.length > 0);
1592
1875
  }
1593
1876
  /**
1594
1877
  * Cooperative search. Processes the corpus in slices, yielding to the
@@ -1605,20 +1888,37 @@ export class AlbexEngine {
1605
1888
  * Pass `opts.frameBudgetMs` to control the slice size (default 8 ms).
1606
1889
  */
1607
1890
  async *searchCooperative(query, opts = {}) {
1608
- const parsed = parseQuery(query);
1891
+ // Collect under the exclusivity lock so no other engine op interleaves at
1892
+ // a slice boundary; the per-slice scheduler yields still happen inside.
1893
+ const results = await this._exclusive(() => this._searchCooperativeCollect(query, opts));
1894
+ for (const r of results)
1895
+ yield r;
1896
+ }
1897
+ /** Materialise a cooperative search to a sorted result array. Runs inside
1898
+ * the exclusivity lock. Frame-budget yielding lives in _runSearchBudgeted. */
1899
+ async _searchCooperativeCollect(query, opts) {
1609
1900
  const budget = opts.frameBudgetMs ?? 8;
1610
1901
  const w = this._wasm;
1611
- // OR queries: run each branch as its own resumable search, dedup, sort.
1612
- if (parsed.kind === 'or') {
1902
+ const ql = this._writeStr(query);
1903
+ const kind = w.prepareQuery(ql);
1904
+ this._lastTruncFlags = w.getQueryTruncationFlags();
1905
+ if (kind < 0)
1906
+ return [];
1907
+ if (kind === 2) {
1908
+ // OR branches — run each as its own resumable search and merge.
1613
1909
  const seen = new Set();
1614
1910
  const all = [];
1615
- for (const tokens of parsed.branches) {
1616
- const q = tokensToWasmQuery(tokens);
1617
- if (!q)
1618
- continue;
1619
- const r = await this._runSearchBudgeted(q, query, opts, budget);
1911
+ const n = w.getQueryBranchCount();
1912
+ for (let i = 0; i < n; i++) {
1913
+ w.selectQueryBranch(i);
1914
+ const r = await this._runSearchBudgeted(query, opts, budget, undefined, i);
1620
1915
  for (const x of r) {
1621
- const key = `${x.documentName}:${x.location}:${x.matchStart}`;
1916
+ // chunkId ("<docId>::<ord>") distinguishes two sub-chunks of the
1917
+ // same location — a (doc, location, matchStart) key would collide
1918
+ // when both sub-chunks hit at the same relative offset and drop a
1919
+ // legitimate result (audit 3.4). matchStart keeps distinct hits
1920
+ // within one chunk across branches.
1921
+ const key = `${x.chunkId}:${x.matchStart}`;
1622
1922
  if (!seen.has(key)) {
1623
1923
  seen.add(key);
1624
1924
  all.push(x);
@@ -1626,17 +1926,11 @@ export class AlbexEngine {
1626
1926
  }
1627
1927
  }
1628
1928
  all.sort((a, b) => b.score - a.score);
1629
- for (const r of all)
1630
- yield r;
1631
- return;
1929
+ return all;
1632
1930
  }
1633
- const results = await this._runSearchBudgeted(tokensToWasmQuery(parsed.tokens), query, opts, budget);
1634
- const filtered = parsed.kind === 'phrase'
1635
- ? results.filter(r => containsPhrase(r.snippet, parsed.tokens))
1636
- : results;
1637
- for (const r of filtered)
1638
- yield r;
1639
- void w;
1931
+ w.selectQueryBranch(0);
1932
+ const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
1933
+ return this._runSearchBudgeted(query, opts, budget, phraseTokens, 0);
1640
1934
  }
1641
1935
  /**
1642
1936
  * @deprecated Renamed to `searchCooperative` in 0.3.0. The original name
@@ -1657,29 +1951,42 @@ export class AlbexEngine {
1657
1951
  * JS<->WASM overhead on fast machines; on slow machines a single batch
1658
1952
  * may eat the entire budget, which is also fine.
1659
1953
  */
1660
- async _runSearchBudgeted(wasmQuery, displayQuery, opts, budgetMs) {
1954
+ async _runSearchBudgeted(displayQuery, opts, budgetMs, phraseTokens, branchIdx = 0) {
1661
1955
  const w = this._wasm;
1662
- const ql = this._writeStr(wasmQuery);
1663
- w.setPattern(ql);
1956
+ // Pattern is already set by the caller via selectQueryBranch(branchIdx),
1957
+ // which also computed THAT branch's pattern Bloom inside WASM — so the
1958
+ // GPU pre-filter below builds the right candidate mask per OR branch
1959
+ // (audit finding #6) without re-reading the pattern across the frontier.
1664
1960
  // GPU pre-filter (CD1). If enabled AND the corpus is large enough,
1665
1961
  // the GPU computes the candidate bitset and we install it into WASM
1666
1962
  // before searchBegin so the slice loop only inspects candidates.
1667
1963
  // Failure here is silent: we fall back to CPU-only Bloom transparently.
1668
1964
  if (this._shouldEngageGpu()) {
1669
1965
  try {
1670
- await this._gpuPreFilter(wasmQuery);
1966
+ await this._gpuPreFilter();
1671
1967
  }
1672
1968
  catch (e) {
1673
1969
  // Don't let a GPU hiccup kill the search — drop to CPU path.
1674
- console.warn('[albex] GPU pre-filter failed; falling back to CPU:', e);
1970
+ this._diag({
1971
+ kind: 'fallback', stage: 'gpu',
1972
+ message: `GPU pre-filter failed; falling back to CPU: ${e instanceof Error ? e.message : String(e)}`,
1973
+ });
1675
1974
  w.clearCandidateMask();
1676
1975
  }
1976
+ // The GPU pre-filter pushes the candidate bitset through the
1977
+ // scratchpad, overwriting the pattern staged by selectQueryBranch.
1978
+ // searchBegin() snapshots the pattern FROM the scratchpad, so it
1979
+ // would compile garbage tokens out of the mask bytes (audit 1.2 —
1980
+ // every GPU-assisted search silently returned wrong results).
1981
+ // Re-select the active branch to restore the pattern.
1982
+ w.selectQueryBranch(branchIdx);
1677
1983
  }
1678
1984
  const t0 = performance.now();
1679
1985
  if (w.searchBegin() === 0) {
1680
1986
  this._lastSearch = {
1681
1987
  query: displayQuery, timeMs: 0, results: 0,
1682
1988
  bloomTested: 0, bloomPassed: 0, bitapMatched: 0,
1989
+ ...this._truncStats(),
1683
1990
  };
1684
1991
  return [];
1685
1992
  }
@@ -1718,41 +2025,116 @@ export class AlbexEngine {
1718
2025
  bloomTested: w.getStatBloomTested(),
1719
2026
  bloomPassed: w.getStatBloomPassed(),
1720
2027
  bitapMatched: w.getStatBitapMatched(),
2028
+ ...this._truncStats(),
1721
2029
  };
1722
- return this._collectResults(count, opts);
2030
+ return this._collectResults(count, opts, phraseTokens);
1723
2031
  }
1724
- /** Materialise results [0..count) into the public SearchResult shape. */
1725
- _collectResults(count, opts) {
2032
+ /** Truncation booleans for SearchStats, decoded from the flags the WASM
2033
+ * reported for the most recent prepareQuery (audit 1.6 — the engine used
2034
+ * to drop OR branches past 8 and tokens past 4 in silence). */
2035
+ _truncStats() {
2036
+ const f = this._lastTruncFlags;
2037
+ return {
2038
+ truncatedBranches: (f & 1) !== 0,
2039
+ truncatedTokens: (f & 2) !== 0,
2040
+ truncatedQuery: (f & 4) !== 0,
2041
+ };
2042
+ }
2043
+ /** Materialise results [0..count) into the public SearchResult shape.
2044
+ * When `phraseTokens` is given, each result is kept only if those tokens
2045
+ * appear adjacently in the FULL chunk text — independent of any display
2046
+ * windowing — so phrase queries stay correct under `{ windowed: true }`.
2047
+ *
2048
+ * Frontier discipline (audit 2.1): all numeric fields of every result are
2049
+ * read in ONE DataView pass over the `#[repr(C)]` RESULTS array
2050
+ * (`getResultsPtr`/`getResultStride`, ABI 6) — the old path made 12-15
2051
+ * frontier calls per result. Strings still need calls, minimised to one
2052
+ * snippet read per result plus one doc-name read per DISTINCT document
2053
+ * (the old `getResultDocName` was additionally O(doc_count) inside WASM
2054
+ * for every single result). */
2055
+ _collectResults(count, opts, phraseTokens) {
1726
2056
  const w = this._wasm;
1727
2057
  const windowed = opts.windowed === true;
1728
2058
  const before = opts.before ?? 60;
1729
2059
  const after = opts.after ?? 120;
2060
+ const phraseFilter = phraseTokens && phraseTokens.length > 0 ? phraseTokens : null;
2061
+ // Map each live doc_id to its CHUNKS[] base (to turn a result's absolute
2062
+ // chunk index into a compact()-stable doc-relative ordinal) and to its
2063
+ // doc-table slot (for O(1) name resolution via getDocName).
2064
+ const chunkBaseByDocId = new Map();
2065
+ const slotByDocId = new Map();
2066
+ {
2067
+ const docCount = w.getDocCount();
2068
+ for (let d = 0; d < docCount; d++) {
2069
+ const id = w.getDocId(d);
2070
+ chunkBaseByDocId.set(id, w.getDocChunkBase(d));
2071
+ slotByDocId.set(id, d);
2072
+ }
2073
+ }
2074
+ const raw = new Array(count);
2075
+ {
2076
+ const ptr = w.getResultsPtr();
2077
+ const stride = w.getResultStride();
2078
+ const view = new DataView(this._mem.buffer, ptr, count * stride);
2079
+ for (let i = 0; i < count; i++) {
2080
+ const base = i * stride;
2081
+ const matchCount = view.getUint32(base + 56, true);
2082
+ const matches = [];
2083
+ for (let k = 0; k < matchCount && k < 4; k++) {
2084
+ matches.push({
2085
+ start: view.getUint32(base + 24 + k * 8, true),
2086
+ end: view.getUint32(base + 28 + k * 8, true),
2087
+ });
2088
+ }
2089
+ const matchStart = view.getUint32(base + 16, true);
2090
+ const matchEnd = view.getUint32(base + 20, true);
2091
+ if (matches.length === 0)
2092
+ matches.push({ start: matchStart, end: matchEnd });
2093
+ raw[i] = {
2094
+ docId: view.getUint32(base, true),
2095
+ chunkIdx: view.getUint32(base + 4, true),
2096
+ location: view.getUint32(base + 8, true),
2097
+ score: view.getUint16(base + 12, true),
2098
+ matchStart, matchEnd, matches,
2099
+ };
2100
+ }
2101
+ }
2102
+ // Resolve each distinct doc name ONCE per search (one getDocName call
2103
+ // per document that actually appears in the results).
2104
+ const nameByDocId = new Map();
2105
+ const docName = (docId) => {
2106
+ let name = nameByDocId.get(docId);
2107
+ if (name === undefined) {
2108
+ const slot = slotByDocId.get(docId);
2109
+ const nl = slot !== undefined ? w.getDocName(slot) : 0;
2110
+ name = nl > 0 ? this._readPad(nl) : '?';
2111
+ nameByDocId.set(docId, name);
2112
+ }
2113
+ return name;
2114
+ };
1730
2115
  const results = [];
1731
2116
  for (let i = 0; i < count; i++) {
1732
- const score = w.getResultScore(i);
1733
- const location = w.getResultLocation(i);
1734
- const matchStart = w.getResultStart(i);
1735
- const matchEnd = w.getResultEnd(i);
1736
- const nl = w.getResultDocName(i);
1737
- const name = nl > 0 ? this._readPad(nl) : '?';
1738
- const matchCount = w.getResultMatchCount(i);
1739
- const matches = [];
1740
- for (let k = 0; k < matchCount; k++) {
1741
- matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
2117
+ const r = raw[i];
2118
+ // Phrase adjacency check against the full chunk text (getSnippet), not
2119
+ // the possibly-cropped display window.
2120
+ if (phraseFilter) {
2121
+ const fl = w.getSnippet(i);
2122
+ const full = fl > 0 ? this._readPad(fl) : '';
2123
+ if (!containsPhrase(full, phraseFilter))
2124
+ continue;
1742
2125
  }
1743
- if (matches.length === 0)
1744
- matches.push({ start: matchStart, end: matchEnd });
1745
- let snippet;
1746
- let primaryStart = matchStart;
1747
- let primaryEnd = matchEnd;
1748
- let adjustedMatches = matches;
2126
+ const chunkOrd = r.chunkIdx - (chunkBaseByDocId.get(r.docId) ?? 0);
2127
+ let snippetBytes;
2128
+ let primaryStart = r.matchStart;
2129
+ let primaryEnd = r.matchEnd;
2130
+ let adjustedMatches = r.matches;
1749
2131
  if (windowed) {
1750
2132
  const sl = w.getSnippetWindow(i, before, after);
1751
- snippet = sl > 0 ? this._readPad(sl) : '';
2133
+ snippetBytes = sl > 0 ? this._readPadBytes(sl) : new Uint8Array(0);
1752
2134
  const offset = w.getSnippetWindowOffset();
1753
2135
  const leadingPrefix = offset > 0 ? 4 : 0;
1754
2136
  const shift = leadingPrefix - offset;
1755
- adjustedMatches = matches.map(m => ({
2137
+ adjustedMatches = r.matches.map(m => ({
1756
2138
  start: Math.max(0, m.start + shift),
1757
2139
  end: Math.max(0, m.end + shift),
1758
2140
  }));
@@ -1761,44 +2143,61 @@ export class AlbexEngine {
1761
2143
  }
1762
2144
  else {
1763
2145
  const sl = w.getSnippet(i);
1764
- snippet = sl > 0 ? this._readPad(sl) : '';
2146
+ snippetBytes = sl > 0 ? this._readPadBytes(sl) : new Uint8Array(0);
1765
2147
  }
2148
+ const snippet = snippetBytes.length > 0 ? _dec.decode(snippetBytes) : '';
2149
+ // UTF-16 view of the primary span, ready for `snippet.slice()` —
2150
+ // byte offsets and JS string indices diverge on the first accent
2151
+ // (audit 3.1, the consumer footgun in the main Spanish use case).
2152
+ const snippetStart = utf16IndexAtByte(snippetBytes, primaryStart);
2153
+ const snippetEnd = utf16IndexAtByte(snippetBytes, primaryEnd);
1766
2154
  results.push({
1767
- documentName: name,
1768
- location,
1769
- score,
2155
+ documentName: docName(r.docId),
2156
+ docId: r.docId,
2157
+ location: r.location,
2158
+ chunkId: `${r.docId}::${chunkOrd}`,
2159
+ score: r.score,
1770
2160
  snippet,
1771
2161
  matchStart: primaryStart,
1772
2162
  matchEnd: primaryEnd,
1773
2163
  matches: adjustedMatches,
2164
+ snippetStart,
2165
+ snippetEnd,
1774
2166
  });
1775
2167
  }
1776
2168
  return results;
1777
2169
  }
1778
- _searchOr(branches, rawQuery, opts) {
2170
+ /** Run all OR branches and merge dedup-by-(chunkId, matchStart). The
2171
+ * branches are already compiled inside the WASM (by prepareQuery); we
2172
+ * iterate them with selectQueryBranch. The "rawQuery" param is kept
2173
+ * only for the lastSearch.query field. */
2174
+ _searchOr(rawQuery, opts) {
2175
+ const w = this._wasm;
1779
2176
  const seen = new Set();
1780
2177
  const all = [];
1781
- for (const tokens of branches) {
1782
- const q = tokensToWasmQuery(tokens);
1783
- if (!q)
1784
- continue;
1785
- const results = this._runSearch(q, rawQuery, opts);
2178
+ const n = w.getQueryBranchCount();
2179
+ for (let i = 0; i < n; i++) {
2180
+ w.selectQueryBranch(i);
2181
+ const results = this._runSearch(rawQuery, opts);
1786
2182
  for (const r of results) {
1787
- const key = `${r.documentName}:${r.location}:${r.matchStart}`;
2183
+ // Keyed on chunkId, not (doc, location, matchStart): two sub-chunks
2184
+ // of the same location can hit at the same relative offset, and the
2185
+ // old key silently dropped one of them (audit 3.4).
2186
+ const key = `${r.chunkId}:${r.matchStart}`;
1788
2187
  if (!seen.has(key)) {
1789
2188
  seen.add(key);
1790
2189
  all.push(r);
1791
2190
  }
1792
2191
  }
1793
2192
  }
1794
- // Re-rank the merged list by score descending.
1795
2193
  all.sort((a, b) => b.score - a.score);
1796
2194
  return all;
1797
2195
  }
1798
- _runSearch(wasmQuery, displayQuery, opts) {
2196
+ /** Execute a single search using whichever query branch is currently
2197
+ * active (set via selectQueryBranch). Returns the materialised
2198
+ * SearchResult[]. Caller is responsible for activating a branch first. */
2199
+ _runSearch(displayQuery, opts, phraseTokens) {
1799
2200
  const w = this._wasm;
1800
- const ql = this._writeStr(wasmQuery);
1801
- w.setPattern(ql);
1802
2201
  const t0 = performance.now();
1803
2202
  const count = w.search();
1804
2203
  const ms = performance.now() - t0;
@@ -1809,62 +2208,12 @@ export class AlbexEngine {
1809
2208
  bloomTested: w.getStatBloomTested(),
1810
2209
  bloomPassed: w.getStatBloomPassed(),
1811
2210
  bitapMatched: w.getStatBitapMatched(),
2211
+ ...this._truncStats(),
1812
2212
  };
1813
- const windowed = opts.windowed === true;
1814
- const before = opts.before ?? 60;
1815
- const after = opts.after ?? 120;
1816
- const results = [];
1817
- for (let i = 0; i < count; i++) {
1818
- const score = w.getResultScore(i);
1819
- const location = w.getResultLocation(i);
1820
- const matchStart = w.getResultStart(i);
1821
- const matchEnd = w.getResultEnd(i);
1822
- const nl = w.getResultDocName(i);
1823
- const name = nl > 0 ? this._readPad(nl) : '?';
1824
- const matchCount = w.getResultMatchCount(i);
1825
- const matches = [];
1826
- for (let k = 0; k < matchCount; k++) {
1827
- matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
1828
- }
1829
- if (matches.length === 0) {
1830
- matches.push({ start: matchStart, end: matchEnd });
1831
- }
1832
- let snippet;
1833
- let primaryStart = matchStart;
1834
- let primaryEnd = matchEnd;
1835
- let adjustedMatches = matches;
1836
- if (windowed) {
1837
- const sl = w.getSnippetWindow(i, before, after);
1838
- snippet = sl > 0 ? this._readPad(sl) : '';
1839
- const offset = w.getSnippetWindowOffset();
1840
- // Spans came back chunk-relative; shift them into window-relative.
1841
- // Account for leading "... " prefix when present.
1842
- const leadingPrefix = offset > 0 ? 4 : 0;
1843
- const shift = leadingPrefix - offset;
1844
- adjustedMatches = matches.map(m => ({
1845
- start: Math.max(0, m.start + shift),
1846
- end: Math.max(0, m.end + shift),
1847
- }));
1848
- primaryStart = adjustedMatches[0]?.start ?? 0;
1849
- primaryEnd = adjustedMatches[0]?.end ?? 0;
1850
- }
1851
- else {
1852
- const sl = w.getSnippet(i);
1853
- snippet = sl > 0 ? this._readPad(sl) : '';
1854
- }
1855
- results.push({
1856
- documentName: name,
1857
- location,
1858
- score,
1859
- snippet,
1860
- matchStart: primaryStart,
1861
- matchEnd: primaryEnd,
1862
- matches: adjustedMatches,
1863
- });
1864
- }
1865
- return results;
2213
+ return this._collectResults(count, opts, phraseTokens);
1866
2214
  }
1867
- /** Returns current engine statistics. */
2215
+ /** Returns current engine statistics (capacities are the RUNTIME values
2216
+ * the engine was initialised with via the `capacity` option). */
1868
2217
  getStats() {
1869
2218
  return {
1870
2219
  documents: this._docs.length,
@@ -1872,9 +2221,9 @@ export class AlbexEngine {
1872
2221
  textUsed: this._wasm.getTextUsed(),
1873
2222
  textCapacity: this._wasm.getTextCapacity(),
1874
2223
  wasmMemoryBytes: this._mem.buffer.byteLength,
1875
- tier: this._tier,
1876
2224
  maxChunks: this._wasm.getMaxChunks(),
1877
2225
  maxDocs: this._wasm.getMaxDocs(),
2226
+ namePoolBytes: this._wasm.getNameCapacity(),
1878
2227
  };
1879
2228
  }
1880
2229
  /** Returns stats from the most recent search, or null. */
@@ -1914,9 +2263,92 @@ export class AlbexEngine {
1914
2263
  }
1915
2264
  /** Full reset — clears all indexed documents and chunks. */
1916
2265
  reset() {
1917
- this._wasm.init();
2266
+ this._assertIdle('reset');
2267
+ this._resetInner();
2268
+ }
2269
+ _resetInner() {
2270
+ // Re-init with the engine's CONFIGURED capacity, not the std defaults
2271
+ // (`wasm.init()` would silently shrink a 'large'/custom engine). Same
2272
+ // capacities → the WASM side does a plain counter reset, no realloc.
2273
+ const c = this._capacity;
2274
+ this._wasm.initWithCapacity(c.maxDocs, c.maxChunks, c.textPoolBytes, c.namePoolBytes);
1918
2275
  this._docs = [];
1919
2276
  this._lastSearch = null;
2277
+ this._diagnostics = [];
2278
+ this._gpuUploadDirty = true;
2279
+ }
2280
+ /**
2281
+ * Drain and return the diagnostics collected since the last call (or
2282
+ * since the engine was created). Use this to surface recoverable
2283
+ * issues to the caller after `indexFile`, `load`, or any other
2284
+ * operation that may run into a "best-effort" path.
2285
+ *
2286
+ * Example diagnostics:
2287
+ * - `{kind:'fallback', stage:'pdf', message:'pdf-extract crashed,
2288
+ * attempting OCR-only fallback', file:'invoice.pdf'}`
2289
+ * - `{kind:'skipped', stage:'ocr', message:'Tesseract abort on page
2290
+ * 3 image 1; remaining images on this page skipped', file:'...',
2291
+ * page:3}`
2292
+ * - `{kind:'fallback', stage:'gpu', message:'GPU pre-filter failed,
2293
+ * using CPU'}`
2294
+ *
2295
+ * The buffer is cleared on each call; callers should consume the
2296
+ * returned array immediately (e.g. log to their telemetry, surface
2297
+ * a UI banner). After `reset()` the buffer is also cleared.
2298
+ */
2299
+ takeDiagnostics() {
2300
+ const out = this._diagnostics;
2301
+ this._diagnostics = [];
2302
+ return out;
2303
+ }
2304
+ /** Internal: record a diagnostic. The buffer is capped at 256 entries to bound memory; once full, new entries are dropped until it is drained. */
2305
+ _diag(entry) {
2306
+ if (this._diagnostics.length >= 256)
2307
+ return;
2308
+ this._diagnostics.push(entry);
2309
+ }
2310
+ /**
2311
+ * Install an OCR adapter. Returns a handle whose `dispose()` removes the
2312
+ * adapter from the engine.
2313
+ *
2314
+ * The contract: the adapter must provide `recognize(image, opts)` that
2315
+ * returns `Promise<OcrAttachedResult>`. The engine validates the
2316
+ * contract at attach time and refuses adapters that don't expose a
2317
+ * recognise function. Only one adapter can be attached at a time; a
2318
+ * second call to `attachOcr` while one is active throws — the caller
2319
+ * must dispose the previous one first.
2320
+ *
2321
+ * @example
2322
+ * ```ts
2323
+ * import { enableOcr } from '@albex/ocr';
2324
+ * const handle = enableOcr(engine); // internally calls attachOcr
2325
+ * // ... later ...
2326
+ * await handle.dispose();
2327
+ * ```
2328
+ *
2329
+ * Direct use without the companion package:
2330
+ * ```ts
2331
+ * const handle = engine.attachOcr({
2332
+ * recognize: async (blob) => myCustomOcr(blob),
2333
+ * options: { alwaysExtractEmbeddedImages: false },
2334
+ * });
2335
+ * ```
2336
+ */
2337
+ attachOcr(adapter) {
2338
+ if (this._ocrAdapter) {
2339
+ throw new AlbexInitError('OCR adapter already attached. Call dispose() on the previous handle before attaching a new one.');
2340
+ }
2341
+ if (typeof adapter?.recognize !== 'function') {
2342
+ throw new AlbexInitError('attachOcr requires an adapter with a recognize(image, opts) function.');
2343
+ }
2344
+ this._ocrAdapter = adapter;
2345
+ return {
2346
+ dispose: async () => {
2347
+ // Idempotent: a double dispose is a no-op rather than a throw.
2348
+ if (this._ocrAdapter === adapter)
2349
+ this._ocrAdapter = null;
2350
+ },
2351
+ };
1920
2352
  }
1921
2353
  // ── Persistence ───────────────────────────────────────────────────────────
1922
2354
  /**
@@ -1927,6 +2359,9 @@ export class AlbexEngine {
1927
2359
  * state in roughly O(total bytes), bypassing re-parsing.
1928
2360
  */
1929
2361
  async save(name) {
2362
+ return this._exclusive(() => this._saveInner(name));
2363
+ }
2364
+ async _saveInner(name) {
1930
2365
  const w = this._wasm;
1931
2366
  const total = w.snapshotSize();
1932
2367
  if (total === 0) {
@@ -1953,6 +2388,9 @@ export class AlbexEngine {
1953
2388
  * header (wrong magic, version, or struct sizes).
1954
2389
  */
1955
2390
  async load(name) {
2391
+ return this._exclusive(() => this._loadInner(name));
2392
+ }
2393
+ async _loadInner(name) {
1956
2394
  const bytes = await loadPersisted(name);
1957
2395
  if (!bytes || bytes.length === 0)
1958
2396
  return false;
@@ -1975,6 +2413,19 @@ export class AlbexEngine {
1975
2413
  return false;
1976
2414
  off += n;
1977
2415
  }
2416
+ // Commit. For v3 this is the atomic apply step (state is untouched
2417
+ // until now); a failure here leaves the previous index intact so the
2418
+ // caller can keep using the engine. For v1/v2 snapshots `restoreCommit`
2419
+ // is a no-op that returns 1 (those formats applied in-place during
2420
+ // restoreFeed and have no rollback to offer). Older binaries that
2421
+ // predate v3 do not export `restoreCommit` — we feature-detect that
2422
+ // case below and treat the load as already committed.
2423
+ if (typeof w.restoreCommit === 'function') {
2424
+ if (w.restoreCommit() !== 1)
2425
+ return false;
2426
+ }
2427
+ // The restored chunk array replaces whatever the GPU last saw.
2428
+ this._gpuUploadDirty = true;
1978
2429
  // Rebuild _docs metadata from the restored WASM tables.
1979
2430
  //
1980
2431
  // What's available after a restore:
@@ -2035,10 +2486,12 @@ export class AlbexEngine {
2035
2486
  * empty. Returns whether a load actually happened.
2036
2487
  */
2037
2488
  async loadOrInit(name) {
2038
- const loaded = await this.load(name);
2039
- if (!loaded)
2040
- this.reset();
2041
- return loaded;
2489
+ return this._exclusive(async () => {
2490
+ const loaded = await this._loadInner(name);
2491
+ if (!loaded)
2492
+ this._resetInner();
2493
+ return loaded;
2494
+ });
2042
2495
  }
2043
2496
  /** Delete a previously persisted snapshot. */
2044
2497
  async deleteSnapshot(name) {
@@ -2060,7 +2513,8 @@ export class AlbexEngine {
2060
2513
  * WASM instance and its (typically 20 MB) backing memory.
2061
2514
  */
2062
2515
  [Symbol.dispose]() {
2063
- this.reset();
2516
+ // Terminal: bypass the idle guard — disposing mid-operation is allowed.
2517
+ this._resetInner();
2064
2518
  this._unsubscribeResources?.();
2065
2519
  this._unsubscribeResources = null;
2066
2520
  this._gpu?.destroy();