albex 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +191 -0
  2. package/README.md +30 -19
  3. package/dist/albex-worker.d.ts +65 -2
  4. package/dist/albex-worker.d.ts.map +1 -1
  5. package/dist/albex-worker.js +97 -20
  6. package/dist/albex-worker.js.map +1 -1
  7. package/dist/albex.d.ts +206 -42
  8. package/dist/albex.d.ts.map +1 -1
  9. package/dist/albex.js +384 -103
  10. package/dist/albex.js.map +1 -1
  11. package/dist/errors.d.ts +35 -4
  12. package/dist/errors.d.ts.map +1 -1
  13. package/dist/errors.js +37 -2
  14. package/dist/errors.js.map +1 -1
  15. package/dist/persistence.js +1 -1
  16. package/dist/pool/coordinator.d.ts +14 -6
  17. package/dist/pool/coordinator.d.ts.map +1 -1
  18. package/dist/pool/coordinator.js +65 -28
  19. package/dist/pool/coordinator.js.map +1 -1
  20. package/dist/profile.js +1 -1
  21. package/dist/resource-manager.js +1 -1
  22. package/dist/tiered-store.js +1 -1
  23. package/dist/wasm-bindings.d.ts +50 -1
  24. package/dist/wasm-bindings.d.ts.map +1 -1
  25. package/dist/wasm-bindings.js +19 -11
  26. package/dist/wasm-bindings.js.map +1 -1
  27. package/dist/worker-protocol.d.ts +23 -2
  28. package/dist/worker-protocol.d.ts.map +1 -1
  29. package/dist/worker-protocol.js +1 -1
  30. package/dist/worker-runtime.js +16 -1
  31. package/dist/worker-runtime.js.map +1 -1
  32. package/package.json +1 -1
  33. package/src/albex-worker.ts +103 -18
  34. package/src/albex.ts +2937 -2524
  35. package/src/errors.ts +49 -4
  36. package/src/pool/coordinator.ts +61 -34
  37. package/src/wasm-bindings.ts +78 -12
  38. package/src/worker-protocol.ts +12 -2
  39. package/src/worker-runtime.ts +16 -1
  40. package/wasm/pkg/albex_pdf.wasm +0 -0
  41. package/wasm/pkg/albex_wasm.wasm +0 -0
  42. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  43. package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/dist/albex.js CHANGED
@@ -1,5 +1,5 @@
1
1
  /*!
2
- * albex v0.6.0
2
+ * albex v0.6.1
3
3
  * Zero-config local full-text search for documents — runs entirely in the browser, no server, no upload.
4
4
  * (c) 2026 RafaCalRob
5
5
  * @license MIT
@@ -21,7 +21,7 @@
21
21
  * ```
22
22
  */
23
23
  import { asAlbexExports, asAlbexPdfExports, } from './wasm-bindings.js';
24
- import { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
24
+ import { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, assertFileSizeWithinLimit, } from './errors.js';
25
25
  import { savePersisted, loadPersisted, deletePersisted, listPersisted, } from './persistence.js';
26
26
  import { detectProfile, shouldUseGpu } from './profile.js';
27
27
  import { getResourceManager } from './resource-manager.js';
@@ -48,6 +48,39 @@ function warnSearchStreamDeprecated() {
48
48
  'scheduler between slices and returns a batch. The alias will be ' +
49
49
  'removed in 0.4.0.');
50
50
  }
51
+ /** The std preset = the historical compile-time defaults. */
52
+ const CAPACITY_STD = {
53
+ maxDocs: 128,
54
+ maxChunks: 100_000,
55
+ textPoolBytes: 16 * 1024 * 1024,
56
+ namePoolBytes: 32 * 1024,
57
+ };
58
+ /** The large preset = the old compile-time "pro" tier. */
59
+ const CAPACITY_LARGE = {
60
+ maxDocs: 1024,
61
+ maxChunks: 800_000,
62
+ textPoolBytes: 128 * 1024 * 1024,
63
+ namePoolBytes: 256 * 1024,
64
+ };
65
+ /**
66
+ * Resolve a user-facing capacity option into full numbers. Partial custom
67
+ * configs are completed from the std defaults scaled to keep std's ratios:
68
+ * `maxChunks` follows `maxDocs` (×782), `textPoolBytes` follows `maxChunks`
69
+ * (×168 B), `namePoolBytes` follows `maxDocs` (×256 B) — each with a floor
70
+ * so tiny configs stay usable. `maxChunks` is clamped to at least `maxDocs`
71
+ * (every document needs at least one chunk).
72
+ */
73
+ function resolveCapacity(capacity) {
74
+ if (capacity === undefined || capacity === 'std')
75
+ return { ...CAPACITY_STD };
76
+ if (capacity === 'large')
77
+ return { ...CAPACITY_LARGE };
78
+ const maxDocs = Math.floor(capacity.maxDocs ?? CAPACITY_STD.maxDocs);
79
+ const maxChunks = Math.max(Math.floor(capacity.maxChunks ?? Math.max(maxDocs * 782, 1024)), maxDocs);
80
+ const textPoolBytes = Math.floor(capacity.textPoolBytes ?? Math.max(maxChunks * 168, 64 * 1024));
81
+ const namePoolBytes = Math.floor(capacity.namePoolBytes ?? Math.max(maxDocs * 256, 4 * 1024));
82
+ return { maxDocs, maxChunks, textPoolBytes, namePoolBytes };
83
+ }
51
84
  // ─────────────────────────────────────────────────────────────────────────────
52
85
  // Query parsing (WASM-side as of 0.5.0)
53
86
  // ─────────────────────────────────────────────────────────────────────────────
@@ -196,35 +229,51 @@ const FEED_SIZE = 32_768; // 32 KB — fits in 64 KB scratchpad
196
229
  * The result is stable across runs and engines, so it can be persisted in
197
230
  * snapshots without versioning concerns.
198
231
  */
232
+ // NOTE: the TS `computePatternBloom` that used to live here (the THIRD copy
233
+ // of the accent fold, after the Rust index side and the Rust query side) was
234
+ // removed in 0.8.0. The GPU pre-filter now reads the pattern Bloom straight
235
+ // from WASM via `getPatternBloomLo/Hi` (ABI 6) — `setPattern` computes it
236
+ // through the exact pipeline `searchBegin` uses, including Spanish stemming,
237
+ // which the TS copy never applied (audit 2.4).
199
238
  /**
200
- * Compute the same 64-bit Bloom value the Rust side computes for a query.
239
+ * Convert a UTF-8 byte offset into `bytes` to the equivalent UTF-16
240
+ * code-unit index of the decoded string. Walks lead bytes only — O(offset)
241
+ * with no allocation — counting 1 unit per BMP code point and 2 per 4-byte
242
+ * (astral, e.g. emoji) sequence. Stray continuation bytes (malformed input)
243
+ * count 1 unit each, matching TextDecoder's per-byte U+FFFD replacement.
201
244
  *
202
- * Must stay in sync with `BloomFilter::from_text` and `fold_utf8_char` in
203
- * `core/src/bloom.rs`. The hashing is `c & 0x3F` over each accent-folded
204
- * lowercase ASCII byte; non-letters are skipped. The aggregate of all token
205
- * blooms is what the GPU pre-filter checks against.
245
+ * Offsets that land mid-sequence are attributed to the code point they fall
246
+ * inside (the engine only emits code-point-aligned offsets, so this is a
247
+ * defensive clamp, not an expected path).
206
248
  */
207
- function computePatternBloom(query) {
208
- // Quick-and-faithful fold: lowercase, NFKD, strip combining marks. This
209
- // matches the Rust Latin-1/Latin-A fold for the characters we care about
210
- // (the rest fall through as non-letters which contribute nothing).
211
- const norm = query.toLowerCase().normalize('NFKD').replace(/[̀-ͯ]/g, '');
212
- let bits = 0n;
213
- for (let i = 0; i < norm.length; i++) {
214
- const code = norm.charCodeAt(i);
215
- if ((code >= 0x61 && code <= 0x7a) || (code >= 0x30 && code <= 0x39)) {
216
- bits |= 1n << BigInt(code & 0x3f);
217
- }
218
- else if (code === 0x20) {
219
- // skip token separator
220
- }
221
- else if (code < 0x80) {
222
- // other ASCII punctuation — they bias the filter; mirror Rust which
223
- // also includes them via the 6-bit mask.
224
- bits |= 1n << BigInt(code & 0x3f);
225
- }
226
- }
227
- return bits;
249
+ function utf16IndexAtByte(bytes, byteOffset) {
250
+ const end = Math.min(byteOffset, bytes.length);
251
+ let units = 0;
252
+ let i = 0;
253
+ while (i < end) {
254
+ const b = bytes[i];
255
+ if (b < 0x80) {
256
+ i += 1;
257
+ units += 1;
258
+ } // ASCII
259
+ else if (b < 0xc0) {
260
+ i += 1;
261
+ units += 1;
262
+ } // stray continuation → U+FFFD
263
+ else if (b < 0xe0) {
264
+ i += 2;
265
+ units += 1;
266
+ } // 2-byte (é, ñ)
267
+ else if (b < 0xf0) {
268
+ i += 3;
269
+ units += 1;
270
+ } // 3-byte (…, €, CJK)
271
+ else {
272
+ i += 4;
273
+ units += 2;
274
+ } // 4-byte → surrogate pair
275
+ }
276
+ return units;
228
277
  }
229
278
  // Note: `contentHash` is implemented as a method on AlbexEngine below
230
279
  // (it needs access to the WASM scratchpad). The standalone TS reference
@@ -473,17 +522,29 @@ export class AlbexEngine {
473
522
  _pdfMem = null;
474
523
  _docs = [];
475
524
  _lastSearch = null;
525
+ /** Raw truncation bitflags from the most recent prepareQuery (ABI 5):
526
+ * 1 = branches dropped, 2 = tokens dropped/clipped, 4 = query bytes cut.
527
+ * Captured right after prepareQuery so every _lastSearch built for that
528
+ * query (including per-branch OR runs) reports the same flags. */
529
+ _lastTruncFlags = 0;
476
530
  /** Structured diagnostics collected during the most recent operation.
477
531
  * Drained by `takeDiagnostics()`. Capped at 256 entries to avoid
478
532
  * unbounded memory growth in pathological cases (very corrupted
479
533
  * corpora producing thousands of recovery warnings). */
480
534
  _diagnostics = [];
481
- _tier = null;
535
+ /** Resolved runtime capacity (set in init(); reused by reset()). */
536
+ _capacity = { ...CAPACITY_STD };
482
537
  _simd = false;
483
538
  _profile = null;
484
539
  _resources = null;
485
540
  _gpu = null;
486
- _gpuChunkCountUploaded = 0;
541
+ /** True when the GPU-resident Bloom array no longer mirrors the WASM
542
+ * chunk array. Set by EVERY index mutation (indexFile, removeDocument,
543
+ * compact, reset, load) and cleared after a successful upload. A plain
544
+ * chunk-count comparison is NOT enough: compact() can reorder blooms
545
+ * while keeping the count identical, which would silently filter the
546
+ * wrong chunks (audit 1.5). */
547
+ _gpuUploadDirty = true;
487
548
  _unsubscribeResources = null;
488
549
  _opts;
489
550
  // ── Concurrency guard ──────────────────────────────────────────────────────
@@ -494,7 +555,7 @@ export class AlbexEngine {
494
555
  // assert the engine is idle (audit 0.6.0, finding #2).
495
556
  _opChain = Promise.resolve();
496
557
  _busy = false;
497
- constructor(opts) {
558
+ constructor(opts = {}) {
498
559
  this._opts = opts;
499
560
  }
500
561
  /** Serialize an async engine operation behind any in-flight one. */
@@ -529,9 +590,19 @@ export class AlbexEngine {
529
590
  const hasTombstones = w.getDocCount() > this._docs.length;
530
591
  if (hasTombstones && cap > 0 && w.getTextUsed() / cap > 0.85) {
531
592
  w.compact();
593
+ this._gpuUploadDirty = true;
532
594
  }
533
595
  }
534
- /** Load and initialise the main WASM module. Must be called before any other method. */
596
+ /**
597
+ * Load and initialise the main WASM module. Must be called before any
598
+ * other method.
599
+ *
600
+ * Resolves `opts.capacity` ('std' default · 'large' · explicit object)
601
+ * and sizes the WASM pools accordingly via `initWithCapacity` (ABI 7).
602
+ * Memory cost ≈ `maxChunks × 64 B + textPoolBytes + namePoolBytes` —
603
+ * ~22 MB for 'std', ~180 MB for 'large'. Throws `AlbexInitError` if the
604
+ * requested capacity is out of range or the allocation fails.
605
+ */
535
606
  async init() {
536
607
  const url = await this._resolveWasmUrl();
537
608
  const res = await fetch(url);
@@ -540,7 +611,15 @@ export class AlbexEngine {
540
611
  const { instance } = await WebAssembly.instantiateStreaming(res, {});
541
612
  this._wasm = asAlbexExports(instance.exports);
542
613
  this._mem = this._wasm.memory;
543
- this._wasm.init();
614
+ this._capacity = resolveCapacity(this._opts.capacity);
615
+ const c = this._capacity;
616
+ if (this._wasm.initWithCapacity(c.maxDocs, c.maxChunks, c.textPoolBytes, c.namePoolBytes) !== 1) {
617
+ throw new AlbexInitError(`initWithCapacity(${c.maxDocs} docs, ${c.maxChunks} chunks, ` +
618
+ `${c.textPoolBytes} text bytes, ${c.namePoolBytes} name bytes) failed — ` +
619
+ `parameters out of range (docs 1-65536, chunks ≥ docs and ≤ 4194304, ` +
620
+ `text 4 KiB-1 GiB, names 256 B-16 MiB) or the WASM memory allocation ` +
621
+ `was refused by the host.`);
622
+ }
544
623
  // Subscribe to environmental signals. Cheap and benign in node tests
545
624
  // (the manager tolerates missing globals).
546
625
  const rm = getResourceManager();
@@ -556,22 +635,17 @@ export class AlbexEngine {
556
635
  }
557
636
  /**
558
637
  * Decide which `.wasm` binary to fetch. Order of precedence:
559
- * 1. `opts.wasmUrl` if provided — used verbatim.
560
- * 2. `opts.tier` if explicit — joined with `wasmBaseUrl`.
561
- * 3. `opts.wasmBaseUrl` + tier picked from the device profile.
562
- *
563
- * Order of precedence:
564
638
  * 1. `opts.wasmUrl` literal → use verbatim
565
- * 2. `opts.wasmBaseUrl` + tier suffix → fetched from that directory
639
+ * 2. `opts.wasmBaseUrl` + simd suffix → fetched from that directory
566
640
  * 3. zero-config default → `albex_wasm_bg.wasm` packaged
567
641
  * next to this file, resolved
568
642
  * via `import.meta.url`
569
643
  *
570
- * The zero-config default loads the std-baseline binary. Tier auto-detection
571
- * is only active when `wasmBaseUrl` is given, because picking a tier in
572
- * runtime would defeat any bundler's static asset rewriting. Users who want
573
- * tier optimisation must serve the six variants themselves and pass the
574
- * directory through `wasmBaseUrl`.
644
+ * There are exactly two main binaries (baseline + SIMD); capacity is a
645
+ * RUNTIME parameter since ABI 7, so it never affects which file is
646
+ * fetched. SIMD auto-detection is only active when `wasmBaseUrl` is
647
+ * given, because picking a URL at runtime would defeat any bundler's
648
+ * static asset rewriting.
575
649
  */
576
650
  async _resolveWasmUrl() {
577
651
  const o = this._opts;
@@ -587,17 +661,16 @@ export class AlbexEngine {
587
661
  // as an asset reference. They copy the .wasm to the output directory and
588
662
  // rewrite the URL automatically. Consumers who use one of those bundlers
589
663
  // get a working `new AlbexEngine()` with no manual setup.
590
- // 0.5.0+: two main binaries only — baseline and SIMD. The tier
591
- // system is gone (audit 4.1). Selection collapses to a single
592
- // boolean: SIMD on or off, decided either by the explicit `simd`
593
- // option or by a runtime probe.
664
+ // 0.5.0+: two main binaries only — baseline and SIMD (the tier system
665
+ // is gone; capacity became a runtime parameter in ABI 7). Selection
666
+ // collapses to a single boolean: SIMD on or off, decided either by the
667
+ // explicit `simd` option or by a runtime probe.
594
668
  const simd = o.simd === 'on'
595
669
  ? true
596
670
  : o.simd === 'off'
597
671
  ? false
598
672
  : !!profile?.wasm.simd;
599
673
  this._simd = simd;
600
- this._tier = 'std';
601
674
  if (!o.wasmBaseUrl) {
602
675
  // Zero-config: bundler resolves the .wasm next to dist/. We only
603
676
  // ship the baseline alias (albex_wasm_bg.wasm) inside the npm
@@ -608,8 +681,6 @@ export class AlbexEngine {
608
681
  const base = o.wasmBaseUrl.replace(/\/+$/, '');
609
682
  return simd ? `${base}/albex_wasm_simd.wasm` : `${base}/albex_wasm.wasm`;
610
683
  }
611
- /** The tier that was actually loaded. `null` until `init()` resolves. */
612
- get tier() { return this._tier; }
613
684
  /** True if the SIMD-accelerated binary was loaded. */
614
685
  get simdEnabled() { return this._simd; }
615
686
  /** True if a WebGPU device is acquired and the next search will use it. */
@@ -645,8 +716,14 @@ export class AlbexEngine {
645
716
  * No-op if the GPU device hasn't been acquired yet — first call attempts
646
717
  * `init()` lazily; if that fails, the candidate path is permanently
647
718
  * disabled for this engine instance.
719
+ *
720
+ * IMPORTANT: this method CLOBBERS the scratchpad (the candidate bitset
721
+ * is pushed through it via `setCandidateMask`). Any pattern previously
722
+ * staged by `selectQueryBranch` is destroyed — the caller MUST re-select
723
+ * the active branch before calling `searchBegin`, which snapshots the
724
+ * pattern from the scratchpad (audit 1.2).
648
725
  */
649
- async _gpuPreFilter(wasmQuery) {
726
+ async _gpuPreFilter() {
650
727
  const gpu = this._gpu;
651
728
  if (!gpu)
652
729
  return;
@@ -660,20 +737,26 @@ export class AlbexEngine {
660
737
  const chunkCount = this._wasm.getChunkCount();
661
738
  if (chunkCount === 0)
662
739
  return;
663
- // Upload blooms if the corpus changed. We re-upload everything on any
664
- // delta; incremental delta-upload is a future optimisation.
665
- if (chunkCount !== this._gpuChunkCountUploaded) {
740
+ // Upload blooms if the corpus changed since the last upload. The
741
+ // signal is a dirty flag set by every index mutation — not a chunk
742
+ // count comparison, because compact() can reorder blooms while
743
+ // keeping the count identical (audit 1.5). We re-upload everything
744
+ // on any delta; incremental delta-upload is a future optimisation.
745
+ if (this._gpuUploadDirty) {
666
746
  const ptr = this._wasm.getChunksPtr();
667
747
  const stride = this._wasm.getChunkStructSize();
668
748
  const bytes = new Uint8Array(this._mem.buffer, ptr, chunkCount * stride);
669
749
  const blooms = packBloomsFromChunks(bytes, chunkCount);
670
750
  gpu.uploadChunkBlooms(blooms, chunkCount);
671
- this._gpuChunkCountUploaded = chunkCount;
672
- }
673
- // Build the pattern Bloom on the JS side: same hash as Rust
674
- // (`c & 0x3F` after accent-folding), aggregated across all tokens.
675
- const patternBloom = computePatternBloom(wasmQuery);
676
- const passes = await gpu.scan(Number(patternBloom & 0xffffffffn), Number((patternBloom >> 32n) & 0xffffffffn));
751
+ this._gpuUploadDirty = false;
752
+ }
753
+ // Pattern Bloom comes straight from WASM (ABI 6): `selectQueryBranch` →
754
+ // `setPattern` computed it through the same pipeline `searchBegin`
755
+ // uses — split, optional Spanish stemming, accent fold, `c & 0x3F`.
756
+ // The retired TS copy of the fold never stemmed, so with `setLanguage
757
+ // ('es')` it could set bits for suffixes the CPU pattern no longer
758
+ // had → over-restrictive mask → silent false negatives (audit 2.4).
759
+ const passes = await gpu.scan(this._wasm.getPatternBloomLo(), this._wasm.getPatternBloomHi());
677
760
  // Push the bitset back into WASM via the scratchpad.
678
761
  const passBytes = new Uint8Array(passes.buffer, passes.byteOffset, passes.byteLength);
679
762
  this._writePad(passBytes);
@@ -699,6 +782,16 @@ export class AlbexEngine {
699
782
  const ptr = this._wasm.getBuffer(0);
700
783
  return _dec.decode(this._u8(ptr, n));
701
784
  }
785
+ /** Copy `n` scratchpad bytes out of WASM memory. The copy is private to
786
+ * JS, so it survives later WASM calls (and memory growth) — used when the
787
+ * caller needs both the raw bytes (UTF-16 span mapping) and the decoded
788
+ * string of the same payload. */
789
+ _readPadBytes(n) {
790
+ const ptr = this._wasm.getBuffer(0);
791
+ const out = new Uint8Array(n);
792
+ out.set(this._u8(ptr, n));
793
+ return out;
794
+ }
702
795
  _feedText(text) {
703
796
  const b = _enc.encode(text);
704
797
  for (let i = 0; i < b.length; i += FEED_SIZE) {
@@ -1534,7 +1627,9 @@ export class AlbexEngine {
1534
1627
  };
1535
1628
  // ── Public API ────────────────────────────────────────────────────────────
1536
1629
  /**
1537
- * Index a file. Supported formats: DOCX, XLSX, PDF, TXT, XML.
1630
+ * Index a file. Supported formats (11, with varying depth): DOCX, XLSX, PDF,
1631
+ * HTML, MD, JSON, CSV, EML, RTF, TXT, XML. Several are deliberately "lite"
1632
+ * (CSV is RFC-4180-lite, EML is MIME-lite, RTF is regex-stripped).
1538
1633
  * Throws for unsupported formats or parse errors.
1539
1634
  */
1540
1635
  async indexFile(file) {
@@ -1545,12 +1640,16 @@ export class AlbexEngine {
1545
1640
  const indexer = AlbexEngine._INDEXERS[ext];
1546
1641
  if (!indexer)
1547
1642
  throw new AlbexUnsupportedFormatError(ext);
1643
+ // Size guard BEFORE reading: `file.size` is available without buffering,
1644
+ // so a pathological input (a 2 GB .txt) is refused with a typed error
1645
+ // instead of being fully loaded and hashed first (audit 3.5).
1646
+ assertFileSizeWithinLimit(file, this._opts.maxFileBytes);
1548
1647
  // Hash the source bytes for idempotency. We always read the bytes once
1549
1648
  // here so the indexer can reuse them — avoids a double File.arrayBuffer().
1550
1649
  const bytes = new Uint8Array(await file.arrayBuffer());
1551
1650
  const hash = this._contentHash(bytes);
1552
1651
  // Idempotency: if a non-deleted doc already has this hash, return it
1553
- // unchanged. Cheap O(N) scan since MAX_DOCS = 128.
1652
+ // unchanged. O(doc_count) scan — cheap at any supported capacity.
1554
1653
  const existing = this._docs.find(d => d.contentHash === hash);
1555
1654
  if (existing)
1556
1655
  return existing;
@@ -1578,15 +1677,22 @@ export class AlbexEngine {
1578
1677
  if (overflow !== 0) {
1579
1678
  const which = (overflow & 1) ? 'chunks' : (overflow & 2) ? 'text'
1580
1679
  : (overflow & 4) ? 'docs' : 'names';
1680
+ // The RUNTIME limit of the pool that overflowed, as configured via
1681
+ // `capacity` (std defaults · 'large' · custom object).
1682
+ const max = which === 'chunks' ? w.getMaxChunks()
1683
+ : which === 'text' ? w.getTextCapacity()
1684
+ : which === 'docs' ? w.getMaxDocs()
1685
+ : w.getNameCapacity();
1581
1686
  const pools = [
1582
1687
  overflow & 1 ? 'chunk pool' : '',
1583
1688
  overflow & 2 ? 'text pool' : '',
1584
1689
  overflow & 4 ? 'document table' : '',
1585
1690
  overflow & 8 ? 'name pool' : '',
1586
1691
  ].filter(Boolean).join(', ');
1587
- throw new AlbexCapacityError(`Index capacity exceeded while indexing "${file.name}" (${pools} full). ` +
1588
- `The document was rolled back (not indexed); treat the index as full ` +
1589
- `(compact(), shard across an AlbexPool, or reset()).`, which);
1692
+ throw new AlbexCapacityError(`Index capacity exceeded while indexing "${file.name}" (${pools} full, ` +
1693
+ `${which} limit = ${max}). The document was rolled back (not indexed); ` +
1694
+ `treat the index as full (compact(), shard across an AlbexPool, ` +
1695
+ `reset(), or re-create the engine with a bigger \`capacity\`).`, which, max);
1590
1696
  }
1591
1697
  // The new doc occupies slot `docCountBefore`.
1592
1698
  const docId = w.getDocId(docCountBefore);
@@ -1600,6 +1706,7 @@ export class AlbexEngine {
1600
1706
  contentHash: hash,
1601
1707
  };
1602
1708
  this._docs.push(doc);
1709
+ this._gpuUploadDirty = true;
1603
1710
  return doc;
1604
1711
  }
1605
1712
  /**
@@ -1620,6 +1727,7 @@ export class AlbexEngine {
1620
1727
  const ok = this._wasm.removeDocument(doc.docId) === 1;
1621
1728
  if (ok) {
1622
1729
  this._docs = this._docs.filter(d => d !== doc);
1730
+ this._gpuUploadDirty = true;
1623
1731
  }
1624
1732
  return ok;
1625
1733
  }
@@ -1649,6 +1757,76 @@ export class AlbexEngine {
1649
1757
  compact() {
1650
1758
  this._assertIdle('compact');
1651
1759
  this._wasm.compact();
1760
+ // compact() reorders the chunk array (and therefore the per-chunk
1761
+ // blooms) even when the chunk count stays the same — the GPU copy is
1762
+ // stale no matter what (audit 1.5).
1763
+ this._gpuUploadDirty = true;
1764
+ }
1765
+ /**
1766
+ * Enumerate the authoritative chunks Albex indexed for a document, in order.
1767
+ * Lets a host mirror Albex's exact chunking — e.g. embed the same units for a
1768
+ * parallel semantic index keyed on the same {@link AuthoritativeChunk.id}
1769
+ * (`"<docId>::<ord>"`, identical to {@link SearchResult.chunkId}). `docId` is
1770
+ * `IndexedDocument.docId` from {@link indexFile}; returns `[]` if no live
1771
+ * document has that id.
1772
+ *
1773
+ * The returned `id`/`ord`/`sub` are stable across {@link compact} and
1774
+ * snapshot save/load. Never key persistent structures on a search result's
1775
+ * absolute `chunkIdx`, which {@link compact} renumbers.
1776
+ */
1777
+ listChunks(docId) {
1778
+ this._assertIdle('listChunks');
1779
+ const w = this._wasm;
1780
+ const slot = this._docSlotOf(docId);
1781
+ if (slot < 0)
1782
+ return [];
1783
+ const count = w.getDocChunkCount(slot);
1784
+ const out = [];
1785
+ let prevLocation = -1;
1786
+ let sub = 0;
1787
+ // Batched enumeration (ABI 6): one `listChunksBatch` frontier call per
1788
+ // scratchpad-full of chunks instead of 2-3 calls per chunk (audit 2.6 —
1789
+ // an embeddings pipeline over 100k chunks used to make ~300k calls).
1790
+ // Each batch packs records as [u32 text_len][u32 location][text bytes],
1791
+ // tightly, in ordinal order; layout documented in wasm/src/lib.rs.
1792
+ let ord = 0;
1793
+ while (ord < count) {
1794
+ const n = w.listChunksBatch(slot, ord, count - ord);
1795
+ if (n === 0)
1796
+ break; // defensive — should not happen for a live slot
1797
+ const ptr = w.getBuffer(0);
1798
+ // The view is only valid until the next frontier call; everything is
1799
+ // decoded out of it inside this loop body before the next batch.
1800
+ const view = new DataView(this._mem.buffer);
1801
+ let off = ptr;
1802
+ for (let k = 0; k < n; k++) {
1803
+ const byteLen = view.getUint32(off, true);
1804
+ const location = view.getUint32(off + 4, true);
1805
+ const text = byteLen > 0
1806
+ ? _dec.decode(new Uint8Array(this._mem.buffer, off + 8, byteLen))
1807
+ : '';
1808
+ if (location === prevLocation)
1809
+ sub++;
1810
+ else {
1811
+ sub = 0;
1812
+ prevLocation = location;
1813
+ }
1814
+ out.push({ docId, location, ord, sub, text, byteLen, id: `${docId}::${ord}` });
1815
+ ord++;
1816
+ off += 8 + byteLen;
1817
+ }
1818
+ }
1819
+ return out;
1820
+ }
1821
+ /** Doc-table slot (0..getDocCount) whose stable id is `docId`, or -1. */
1822
+ _docSlotOf(docId) {
1823
+ const w = this._wasm;
1824
+ const n = w.getDocCount();
1825
+ for (let i = 0; i < n; i++) {
1826
+ if (w.getDocId(i) === docId)
1827
+ return i;
1828
+ }
1829
+ return -1;
1652
1830
  }
1653
1831
  /**
1654
1832
  * Search the index. Supports:
@@ -1658,12 +1836,18 @@ export class AlbexEngine {
1658
1836
  *
1659
1837
  * Pass `{ windowed: true }` to receive cropped snippets with ASCII ellipsis
1660
1838
  * markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
1839
+ *
1840
+ * Note: this synchronous path never uses the GPU pre-filter — the WebGPU
1841
+ * scan is asynchronous by nature. Only `searchCooperative` (the budgeted
1842
+ * path) engages the GPU; `search()` always runs the CPU Bloom pre-filter,
1843
+ * regardless of the `gpu` option.
1661
1844
  */
1662
1845
  search(query, opts = {}) {
1663
1846
  this._assertIdle('search');
1664
1847
  const w = this._wasm;
1665
1848
  const ql = this._writeStr(query);
1666
1849
  const kind = w.prepareQuery(ql);
1850
+ this._lastTruncFlags = w.getQueryTruncationFlags();
1667
1851
  if (kind < 0)
1668
1852
  return [];
1669
1853
  if (kind === 2) {
@@ -1717,6 +1901,7 @@ export class AlbexEngine {
1717
1901
  const w = this._wasm;
1718
1902
  const ql = this._writeStr(query);
1719
1903
  const kind = w.prepareQuery(ql);
1904
+ this._lastTruncFlags = w.getQueryTruncationFlags();
1720
1905
  if (kind < 0)
1721
1906
  return [];
1722
1907
  if (kind === 2) {
@@ -1728,7 +1913,12 @@ export class AlbexEngine {
1728
1913
  w.selectQueryBranch(i);
1729
1914
  const r = await this._runSearchBudgeted(query, opts, budget, undefined, i);
1730
1915
  for (const x of r) {
1731
- const key = `${x.documentName}:${x.location}:${x.matchStart}`;
1916
+ // chunkId ("<docId>::<ord>") distinguishes two sub-chunks of the
1917
+ // same location — a (doc, location, matchStart) key would collide
1918
+ // when both sub-chunks hit at the same relative offset and drop a
1919
+ // legitimate result (audit 3.4). matchStart keeps distinct hits
1920
+ // within one chunk across branches.
1921
+ const key = `${x.chunkId}:${x.matchStart}`;
1732
1922
  if (!seen.has(key)) {
1733
1923
  seen.add(key);
1734
1924
  all.push(x);
@@ -1763,19 +1953,17 @@ export class AlbexEngine {
1763
1953
  */
1764
1954
  async _runSearchBudgeted(displayQuery, opts, budgetMs, phraseTokens, branchIdx = 0) {
1765
1955
  const w = this._wasm;
1766
- // Pattern is already set by the caller via selectQueryBranch(branchIdx).
1767
- // Snapshot THAT branch's compiled pattern for the GPU pre-filter hash
1768
- // not branch 0, which would build the wrong candidate mask for OR
1769
- // branches and silently drop their hits (audit finding #6).
1770
- const activePatternLen = w.getQueryBranchPattern(branchIdx);
1771
- const activePattern = activePatternLen > 0 ? this._readPad(activePatternLen) : '';
1956
+ // Pattern is already set by the caller via selectQueryBranch(branchIdx),
1957
+ // which also computed THAT branch's pattern Bloom inside WASM so the
1958
+ // GPU pre-filter below builds the right candidate mask per OR branch
1959
+ // (audit finding #6) without re-reading the pattern across the frontier.
1772
1960
  // GPU pre-filter (CD1). If enabled AND the corpus is large enough,
1773
1961
  // the GPU computes the candidate bitset and we install it into WASM
1774
1962
  // before searchBegin so the slice loop only inspects candidates.
1775
1963
  // Failure here is silent: we fall back to CPU-only Bloom transparently.
1776
1964
  if (this._shouldEngageGpu()) {
1777
1965
  try {
1778
- await this._gpuPreFilter(activePattern);
1966
+ await this._gpuPreFilter();
1779
1967
  }
1780
1968
  catch (e) {
1781
1969
  // Don't let a GPU hiccup kill the search — drop to CPU path.
@@ -1785,12 +1973,20 @@ export class AlbexEngine {
1785
1973
  });
1786
1974
  w.clearCandidateMask();
1787
1975
  }
1976
+ // The GPU pre-filter pushes the candidate bitset through the
1977
+ // scratchpad, overwriting the pattern staged by selectQueryBranch.
1978
+ // searchBegin() snapshots the pattern FROM the scratchpad, so it
1979
+ // would compile garbage tokens out of the mask bytes (audit 1.2 —
1980
+ // every GPU-assisted search silently returned wrong results).
1981
+ // Re-select the active branch to restore the pattern.
1982
+ w.selectQueryBranch(branchIdx);
1788
1983
  }
1789
1984
  const t0 = performance.now();
1790
1985
  if (w.searchBegin() === 0) {
1791
1986
  this._lastSearch = {
1792
1987
  query: displayQuery, timeMs: 0, results: 0,
1793
1988
  bloomTested: 0, bloomPassed: 0, bitapMatched: 0,
1989
+ ...this._truncStats(),
1794
1990
  };
1795
1991
  return [];
1796
1992
  }
@@ -1829,21 +2025,96 @@ export class AlbexEngine {
1829
2025
  bloomTested: w.getStatBloomTested(),
1830
2026
  bloomPassed: w.getStatBloomPassed(),
1831
2027
  bitapMatched: w.getStatBitapMatched(),
2028
+ ...this._truncStats(),
1832
2029
  };
1833
2030
  return this._collectResults(count, opts, phraseTokens);
1834
2031
  }
2032
+ /** Truncation booleans for SearchStats, decoded from the flags the WASM
2033
+ * reported for the most recent prepareQuery (audit 1.6 — the engine used
2034
+ * to drop OR branches past 8 and tokens past 4 in silence). */
2035
+ _truncStats() {
2036
+ const f = this._lastTruncFlags;
2037
+ return {
2038
+ truncatedBranches: (f & 1) !== 0,
2039
+ truncatedTokens: (f & 2) !== 0,
2040
+ truncatedQuery: (f & 4) !== 0,
2041
+ };
2042
+ }
1835
2043
  /** Materialise results [0..count) into the public SearchResult shape.
1836
2044
  * When `phraseTokens` is given, each result is kept only if those tokens
1837
2045
  * appear adjacently in the FULL chunk text — independent of any display
1838
- * windowing — so phrase queries stay correct under `{ windowed: true }`. */
2046
+ * windowing — so phrase queries stay correct under `{ windowed: true }`.
2047
+ *
2048
+ * Frontier discipline (audit 2.1): all numeric fields of every result are
2049
+ * read in ONE DataView pass over the `#[repr(C)]` RESULTS array
2050
+ * (`getResultsPtr`/`getResultStride`, ABI 6) — the old path made 12-15
2051
+ * frontier calls per result. Strings still need calls, minimised to one
2052
+ * snippet read per result plus one doc-name read per DISTINCT document
2053
+ * (the old `getResultDocName` was additionally O(doc_count) inside WASM
2054
+ * for every single result). */
1839
2055
  _collectResults(count, opts, phraseTokens) {
1840
2056
  const w = this._wasm;
1841
2057
  const windowed = opts.windowed === true;
1842
2058
  const before = opts.before ?? 60;
1843
2059
  const after = opts.after ?? 120;
1844
2060
  const phraseFilter = phraseTokens && phraseTokens.length > 0 ? phraseTokens : null;
2061
+ // Map each live doc_id to its CHUNKS[] base (to turn a result's absolute
2062
+ // chunk index into a compact()-stable doc-relative ordinal) and to its
2063
+ // doc-table slot (for O(1) name resolution via getDocName).
2064
+ const chunkBaseByDocId = new Map();
2065
+ const slotByDocId = new Map();
2066
+ {
2067
+ const docCount = w.getDocCount();
2068
+ for (let d = 0; d < docCount; d++) {
2069
+ const id = w.getDocId(d);
2070
+ chunkBaseByDocId.set(id, w.getDocChunkBase(d));
2071
+ slotByDocId.set(id, d);
2072
+ }
2073
+ }
2074
+ const raw = new Array(count);
2075
+ {
2076
+ const ptr = w.getResultsPtr();
2077
+ const stride = w.getResultStride();
2078
+ const view = new DataView(this._mem.buffer, ptr, count * stride);
2079
+ for (let i = 0; i < count; i++) {
2080
+ const base = i * stride;
2081
+ const matchCount = view.getUint32(base + 56, true);
2082
+ const matches = [];
2083
+ for (let k = 0; k < matchCount && k < 4; k++) {
2084
+ matches.push({
2085
+ start: view.getUint32(base + 24 + k * 8, true),
2086
+ end: view.getUint32(base + 28 + k * 8, true),
2087
+ });
2088
+ }
2089
+ const matchStart = view.getUint32(base + 16, true);
2090
+ const matchEnd = view.getUint32(base + 20, true);
2091
+ if (matches.length === 0)
2092
+ matches.push({ start: matchStart, end: matchEnd });
2093
+ raw[i] = {
2094
+ docId: view.getUint32(base, true),
2095
+ chunkIdx: view.getUint32(base + 4, true),
2096
+ location: view.getUint32(base + 8, true),
2097
+ score: view.getUint16(base + 12, true),
2098
+ matchStart, matchEnd, matches,
2099
+ };
2100
+ }
2101
+ }
2102
+ // Resolve each distinct doc name ONCE per search (one getDocName call
2103
+ // per document that actually appears in the results).
2104
+ const nameByDocId = new Map();
2105
+ const docName = (docId) => {
2106
+ let name = nameByDocId.get(docId);
2107
+ if (name === undefined) {
2108
+ const slot = slotByDocId.get(docId);
2109
+ const nl = slot !== undefined ? w.getDocName(slot) : 0;
2110
+ name = nl > 0 ? this._readPad(nl) : '?';
2111
+ nameByDocId.set(docId, name);
2112
+ }
2113
+ return name;
2114
+ };
1845
2115
  const results = [];
1846
2116
  for (let i = 0; i < count; i++) {
2117
+ const r = raw[i];
1847
2118
  // Phrase adjacency check against the full chunk text (getSnippet), not
1848
2119
  // the possibly-cropped display window.
1849
2120
  if (phraseFilter) {
@@ -1852,30 +2123,18 @@ export class AlbexEngine {
1852
2123
  if (!containsPhrase(full, phraseFilter))
1853
2124
  continue;
1854
2125
  }
1855
- const score = w.getResultScore(i);
1856
- const location = w.getResultLocation(i);
1857
- const matchStart = w.getResultStart(i);
1858
- const matchEnd = w.getResultEnd(i);
1859
- const nl = w.getResultDocName(i);
1860
- const name = nl > 0 ? this._readPad(nl) : '?';
1861
- const matchCount = w.getResultMatchCount(i);
1862
- const matches = [];
1863
- for (let k = 0; k < matchCount; k++) {
1864
- matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
1865
- }
1866
- if (matches.length === 0)
1867
- matches.push({ start: matchStart, end: matchEnd });
1868
- let snippet;
1869
- let primaryStart = matchStart;
1870
- let primaryEnd = matchEnd;
1871
- let adjustedMatches = matches;
2126
+ const chunkOrd = r.chunkIdx - (chunkBaseByDocId.get(r.docId) ?? 0);
2127
+ let snippetBytes;
2128
+ let primaryStart = r.matchStart;
2129
+ let primaryEnd = r.matchEnd;
2130
+ let adjustedMatches = r.matches;
1872
2131
  if (windowed) {
1873
2132
  const sl = w.getSnippetWindow(i, before, after);
1874
- snippet = sl > 0 ? this._readPad(sl) : '';
2133
+ snippetBytes = sl > 0 ? this._readPadBytes(sl) : new Uint8Array(0);
1875
2134
  const offset = w.getSnippetWindowOffset();
1876
2135
  const leadingPrefix = offset > 0 ? 4 : 0;
1877
2136
  const shift = leadingPrefix - offset;
1878
- adjustedMatches = matches.map(m => ({
2137
+ adjustedMatches = r.matches.map(m => ({
1879
2138
  start: Math.max(0, m.start + shift),
1880
2139
  end: Math.max(0, m.end + shift),
1881
2140
  }));
@@ -1884,21 +2143,31 @@ export class AlbexEngine {
1884
2143
  }
1885
2144
  else {
1886
2145
  const sl = w.getSnippet(i);
1887
- snippet = sl > 0 ? this._readPad(sl) : '';
2146
+ snippetBytes = sl > 0 ? this._readPadBytes(sl) : new Uint8Array(0);
1888
2147
  }
2148
+ const snippet = snippetBytes.length > 0 ? _dec.decode(snippetBytes) : '';
2149
+ // UTF-16 view of the primary span, ready for `snippet.slice()` —
2150
+ // byte offsets and JS string indices diverge on the first accent
2151
+ // (audit 3.1, the consumer footgun in the main Spanish use case).
2152
+ const snippetStart = utf16IndexAtByte(snippetBytes, primaryStart);
2153
+ const snippetEnd = utf16IndexAtByte(snippetBytes, primaryEnd);
1889
2154
  results.push({
1890
- documentName: name,
1891
- location,
1892
- score,
2155
+ documentName: docName(r.docId),
2156
+ docId: r.docId,
2157
+ location: r.location,
2158
+ chunkId: `${r.docId}::${chunkOrd}`,
2159
+ score: r.score,
1893
2160
  snippet,
1894
2161
  matchStart: primaryStart,
1895
2162
  matchEnd: primaryEnd,
1896
2163
  matches: adjustedMatches,
2164
+ snippetStart,
2165
+ snippetEnd,
1897
2166
  });
1898
2167
  }
1899
2168
  return results;
1900
2169
  }
1901
- /** Run all OR branches and merge dedup-by-(doc, location, match). The
2170
+ /** Run all OR branches and merge dedup-by-(chunkId, matchStart). The
1902
2171
  * branches are already compiled inside the WASM (by prepareQuery); we
1903
2172
  * iterate them with selectQueryBranch. The "rawQuery" param is kept
1904
2173
  * only for the lastSearch.query field. */
@@ -1911,7 +2180,10 @@ export class AlbexEngine {
1911
2180
  w.selectQueryBranch(i);
1912
2181
  const results = this._runSearch(rawQuery, opts);
1913
2182
  for (const r of results) {
1914
- const key = `${r.documentName}:${r.location}:${r.matchStart}`;
2183
+ // Keyed on chunkId, not (doc, location, matchStart): two sub-chunks
2184
+ // of the same location can hit at the same relative offset, and the
2185
+ // old key silently dropped one of them (audit 3.4).
2186
+ const key = `${r.chunkId}:${r.matchStart}`;
1915
2187
  if (!seen.has(key)) {
1916
2188
  seen.add(key);
1917
2189
  all.push(r);
@@ -1936,10 +2208,12 @@ export class AlbexEngine {
1936
2208
  bloomTested: w.getStatBloomTested(),
1937
2209
  bloomPassed: w.getStatBloomPassed(),
1938
2210
  bitapMatched: w.getStatBitapMatched(),
2211
+ ...this._truncStats(),
1939
2212
  };
1940
2213
  return this._collectResults(count, opts, phraseTokens);
1941
2214
  }
1942
- /** Returns current engine statistics. */
2215
+ /** Returns current engine statistics (capacities are the RUNTIME values
2216
+ * the engine was initialised with via the `capacity` option). */
1943
2217
  getStats() {
1944
2218
  return {
1945
2219
  documents: this._docs.length,
@@ -1947,9 +2221,9 @@ export class AlbexEngine {
1947
2221
  textUsed: this._wasm.getTextUsed(),
1948
2222
  textCapacity: this._wasm.getTextCapacity(),
1949
2223
  wasmMemoryBytes: this._mem.buffer.byteLength,
1950
- tier: this._tier,
1951
2224
  maxChunks: this._wasm.getMaxChunks(),
1952
2225
  maxDocs: this._wasm.getMaxDocs(),
2226
+ namePoolBytes: this._wasm.getNameCapacity(),
1953
2227
  };
1954
2228
  }
1955
2229
  /** Returns stats from the most recent search, or null. */
@@ -1993,10 +2267,15 @@ export class AlbexEngine {
1993
2267
  this._resetInner();
1994
2268
  }
1995
2269
  _resetInner() {
1996
- this._wasm.init();
2270
+ // Re-init with the engine's CONFIGURED capacity, not the std defaults
2271
+ // (`wasm.init()` would silently shrink a 'large'/custom engine). Same
2272
+ // capacities → the WASM side does a plain counter reset, no realloc.
2273
+ const c = this._capacity;
2274
+ this._wasm.initWithCapacity(c.maxDocs, c.maxChunks, c.textPoolBytes, c.namePoolBytes);
1997
2275
  this._docs = [];
1998
2276
  this._lastSearch = null;
1999
2277
  this._diagnostics = [];
2278
+ this._gpuUploadDirty = true;
2000
2279
  }
2001
2280
  /**
2002
2281
  * Drain and return the diagnostics collected since the last call (or
@@ -2145,6 +2424,8 @@ export class AlbexEngine {
2145
2424
  if (w.restoreCommit() !== 1)
2146
2425
  return false;
2147
2426
  }
2427
+ // The restored chunk array replaces whatever the GPU last saw.
2428
+ this._gpuUploadDirty = true;
2148
2429
  // Rebuild _docs metadata from the restored WASM tables.
2149
2430
  //
2150
2431
  // What's available after a restore: