npm - albex - Versions diffs - 0.3.0 → 0.6.1 - Mend

albex 0.3.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)

package/CHANGELOG.md +466 -0
package/README.md +32 -19
package/dist/albex-worker.d.ts +65 -2
package/dist/albex-worker.d.ts.map +1 -1
package/dist/albex-worker.js +97 -20
package/dist/albex-worker.js.map +1 -1
package/dist/albex.d.ts +359 -55
package/dist/albex.d.ts.map +1 -1
package/dist/albex.js +766 -312
package/dist/albex.js.map +1 -1
package/dist/errors.d.ts +47 -2
package/dist/errors.d.ts.map +1 -1
package/dist/errors.js +41 -3
package/dist/errors.js.map +1 -1
package/dist/persistence.js +1 -1
package/dist/pool/coordinator.d.ts +14 -6
package/dist/pool/coordinator.d.ts.map +1 -1
package/dist/pool/coordinator.js +65 -28
package/dist/pool/coordinator.js.map +1 -1
package/dist/profile.d.ts +11 -6
package/dist/profile.d.ts.map +1 -1
package/dist/profile.js +6 -13
package/dist/profile.js.map +1 -1
package/dist/resource-manager.js +1 -1
package/dist/tiered-store.js +1 -1
package/dist/wasm-bindings.d.ts +96 -6
package/dist/wasm-bindings.d.ts.map +1 -1
package/dist/wasm-bindings.js +110 -7
package/dist/wasm-bindings.js.map +1 -1
package/dist/worker-protocol.d.ts +23 -2
package/dist/worker-protocol.d.ts.map +1 -1
package/dist/worker-protocol.js +1 -1
package/dist/worker-runtime.js +27 -3
package/dist/worker-runtime.js.map +1 -1
package/package.json +13 -9
package/src/albex-worker.ts +103 -18
package/src/albex.ts +2937 -2292
package/src/errors.ts +63 -2
package/src/pool/coordinator.ts +61 -34
package/src/profile.ts +11 -10
package/src/wasm-bindings.ts +225 -10
package/src/worker-protocol.ts +12 -2
package/src/worker-runtime.ts +28 -3
package/wasm/pkg/albex_pdf.wasm +0 -0
package/wasm/pkg/albex_wasm.wasm +0 -0
package/wasm/pkg/albex_wasm_bg.wasm +0 -0
package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/wasm/pkg/albex_wasm_mini.wasm +0 -0
package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
package/wasm/pkg/albex_wasm_pro.wasm +0 -0
package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
package/wasm/pkg/albex_wasm_std.wasm +0 -0
package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0

package/src/errors.ts CHANGED

@@ -51,10 +51,71 @@ export class AlbexParseError extends AlbexError {
   }
 }
-/** Thrown when the scratchpad is too small for a single chunk write. */
+/**
+ * Thrown when an indexing operation does not fit: either the scratchpad was
+ * too small for a single write, or one of the engine's pools (chunks, text,
+ * documents, names) ran out of room mid-document. Before 0.6.0 the latter was
+ * silent — the corpus was truncated with no signal.
+ *
+ * `limit` names which pool overflowed (or `'scratchpad'`, or `'file'` when an
+ * input file exceeds `maxFileBytes` before any byte is read), so callers can
+ * branch — e.g. start a fresh shard, `compact()`, or surface "library full".
+ * When a capacity error is raised during `indexFile`, the engine may hold a
+ * partially-indexed copy of the offending document; treat the index as full
+ * and stop adding. A `'file'` capacity error is raised BEFORE the file is
+ * read, so the index is untouched and fully usable.
+ */
+export type AlbexCapacityLimit = 'chunks' | 'text' | 'docs' | 'names' | 'scratchpad' | 'file';
 export class AlbexCapacityError extends AlbexError {
-  constructor(message: string) {
+  /** Which pool overflowed. Undefined for older call sites that didn't set it. */
+  readonly limit?: AlbexCapacityLimit;
+  /**
+   * The RUNTIME numeric capacity of the pool named by `limit`, as the
+   * engine is actually configured (e.g. `4` when `capacity: { maxDocs: 4 }`
+   * overflows its document table, `128` for the std default). Units: docs
+   * for `'docs'`, chunks for `'chunks'`, bytes for `'text'`/`'names'`/
+   * `'scratchpad'`/`'file'`. Undefined when the limit is not known at the
+   * throw site.
+   */
+  readonly max?: number;
+  constructor(message: string, limit?: AlbexCapacityLimit, max?: number) {
     super('capacity', message);
     this.name = 'AlbexCapacityError';
+    if (limit) this.limit = limit;
+    if (max !== undefined) this.max = max;
+  }
+}
+/**
+ * Default `maxFileBytes` for `indexFile`: 256 MiB. Far above anything the
+ * ~16–21 MB text pool could ever absorb, so legitimate documents are never
+ * rejected — the guard only exists to refuse pathological inputs (a 2 GB
+ * file would otherwise be fully buffered AND hashed before the first
+ * capacity check could fire).
+ */
+export const DEFAULT_MAX_FILE_BYTES = 256 * 1024 * 1024;
+/**
+ * Pre-read size guard for `indexFile`. Throws a typed
+ * {@link AlbexCapacityError} (`limit: 'file'`) when `file.size` exceeds the
+ * configured cap — BEFORE any byte of the file is read into memory
+ * (`File`/`Blob` expose `size` without reading). Shared by the engine, the
+ * worker wrapper and the pool coordinator so the guard fires on whichever
+ * thread would otherwise buffer the bytes.
+ */
+export function assertFileSizeWithinLimit(
+  file: { name: string; size: number },
+  maxFileBytes?: number,
+): void {
+  const cap = maxFileBytes ?? DEFAULT_MAX_FILE_BYTES;
+  if (file.size > cap) {
+    throw new AlbexCapacityError(
+      `"${file.name}" is ${file.size} bytes, above the maxFileBytes limit of ` +
+      `${cap}. The file was not read or indexed. Raise \`maxFileBytes\` in ` +
+      `AlbexOptions if this is intentional.`,
+      'file',
+      cap,
+    );
   }
 }

package/src/pool/coordinator.ts CHANGED

@@ -30,9 +30,8 @@ import type {
   EngineStats,
   SearchStats,
 } from '../albex.js';
-import type { Tier } from '../profile.js';
 import { detectProfile, pickWorkerCount } from '../profile.js';
-import { AlbexInitError, AlbexError } from '../errors.js';
+import { AlbexInitError, AlbexError, assertFileSizeWithinLimit } from '../errors.js';
 import type {
   WorkerRequest,
   WorkerResponse,
@@ -77,7 +76,9 @@ export class AlbexPool {
   private _docsCache: IndexedDocument[] = [];
   private _rrCursor = 0;
   private _lastSearch: SearchStats | null = null;
-  private _tier: Tier | null = null;
+  /** Global result cap applied AFTER the cross-shard merge. Mirrors the
+   * last `setMaxResults` call (the WASM engine default is 50). */
+  private _maxResults = 50;
   constructor(opts: AlbexPoolOptions) {
     this._opts = opts;
@@ -98,23 +99,21 @@ export class AlbexPool {
       console.warn('[albex] pool mode=shared requested but cross-origin isolation is not active; falling back to replicated');
     }
-    const shardOpts: AlbexOptions = {
-      wasmUrl:     this._opts.wasmUrl,
-      wasmBaseUrl: this._opts.wasmBaseUrl,
-      pdfWasmUrl:  this._opts.pdfWasmUrl,
-      tier:        this._opts.tier,
-      simd:        this._opts.simd,
-    };
+    // Forward every serializable engine option to the shards; strip the
+    // pool-only fields (workerUrl/workers/mode) and anything non-clonable.
+    // Same policy as AlbexEngineWorker.init (audit 1.4).
+    const shardOpts: AlbexOptions = {};
+    for (const [k, v] of Object.entries(this._opts)) {
+      if (k === 'workerUrl' || k === 'workers' || k === 'mode') continue;
+      if (v === undefined || typeof v === 'function') continue;
+      (shardOpts as Record<string, unknown>)[k] = v;
+    }
     for (let i = 0; i < n; i++) {
       const shard = this._spawnShard();
       await this._send(shard, { kind: 'init', opts: shardOpts });
       this._shards.push(shard);
     }
-    // Tier is the same across shards — capture it from shard 0 stats.
-    const stats0 = await this._send<EngineStats>(this._shards[0]!, { kind: 'getStats' });
-    this._tier = stats0.tier;
   }
   // ── Shard plumbing ─────────────────────────────────────────────────────
@@ -168,6 +167,9 @@ export class AlbexPool {
   async indexFile(file: File): Promise<IndexedDocument> {
     if (this._shards.length === 0) throw new AlbexInitError('Pool not initialised');
+    // Size guard BEFORE reading — same limit the shard engine enforces, but
+    // checked here so an oversized file is never buffered on the main thread.
+    assertFileSizeWithinLimit(file, this._opts.maxFileBytes);
     const idx = this._rrCursor++ % this._shards.length;
     const shard = this._shards[idx]!;
     const buffer = await file.arrayBuffer();
@@ -185,30 +187,50 @@ export class AlbexPool {
    * Map-reduce search:
    *   1. broadcast the query to every shard,
    *   2. each shard runs its local Bloom→Bitap→top-K,
-   *   3. coordinator merges the K-best from each shard,
+   *   3. coordinator merges the K-best from each shard, deduplicating
+   *      identical hits (same document name, chunk id and match offset —
+   *      the case where the same file was indexed into two shards),
    *   4. global top-K returned in descending score order.
    *
    * Capped to `setMaxResults` results (default 50) AFTER merge.
+   *
+   * Per-shard search stats are requested in the same posting batch as the
+   * search itself: each worker processes its queue in arrival order, so the
+   * stats reply is guaranteed to belong to THIS query even when several
+   * `search()` calls overlap.
    */
   async search(query: string, opts: SearchOptions = {}): Promise<SearchResult[]> {
     if (this._shards.length === 0) return [];
     const t0 = performance.now();
-    const buckets = await Promise.all(
-      this._shards.map(s => this._send<SearchResult[]>(s, { kind: 'search', query, options: opts })),
+    // Post `search` and `getLastSearchStats` back-to-back (synchronously,
+    // per shard) so no other request can slot between them in the worker's
+    // FIFO queue — the stats round can't race a subsequent search.
+    const perShard = await Promise.all(
+      this._shards.map(s => {
+        const results = this._send<SearchResult[]>(s, { kind: 'search', query, options: opts });
+        const stats   = this._send<SearchStats | null>(s, { kind: 'getLastSearchStats' });
+        return Promise.all([results, stats] as const);
+      }),
     );
-    // Merge: simple flatten + sort (descending). Each shard already returned
-    // up to its local cap; the global cap could be different but in practice
-    // matches the per-shard cap.
-    const merged = buckets.flat();
+    // Merge: flatten + dedup + sort (descending) + global cap. Shard-local
+    // docIds collide across shards, so the dedup identity also includes the
+    // document name: it only collapses true duplicates (the same document
+    // indexed into more than one shard yields identical name/chunkId/offset).
+    const seen = new Set<string>();
+    const merged: SearchResult[] = [];
+    for (const [bucket] of perShard) {
+      for (const r of bucket) {
+        const key = `${r.documentName}:${r.chunkId}:${r.matchStart}`;
+        if (!seen.has(key)) { seen.add(key); merged.push(r); }
+      }
+    }
     merged.sort((a, b) => b.score - a.score);
+    const capped = merged.slice(0, this._maxResults);
     // Aggregate search stats across shards.
-    const stats = await Promise.all(
-      this._shards.map(s => this._send<SearchStats | null>(s, { kind: 'getLastSearchStats' })),
-    );
     let bloomTested = 0, bloomPassed = 0, bitapMatched = 0;
-    for (const s of stats) {
+    for (const [, s] of perShard) {
       if (!s) continue;
       bloomTested  += s.bloomTested;
       bloomPassed  += s.bloomPassed;
@@ -217,11 +239,11 @@ export class AlbexPool {
     this._lastSearch = {
       query,
       timeMs: performance.now() - t0,
-      results: merged.length,
+      results: capped.length,
       bloomTested, bloomPassed, bitapMatched,
     };
-    return merged;
+    return capped;
   }
   /**
@@ -273,11 +295,13 @@ export class AlbexPool {
     for (const s of this._shards) s.docCount = 0;
   }
-  /** Aggregate engine stats across all shards. */
+  /** Aggregate engine stats across all shards. Capacities are the RUNTIME
+   * per-shard capacities summed (each shard was initialised with the same
+   * `capacity` option, forwarded through the wire protocol). */
   async getStats(): Promise<EngineStats> {
     const all = await this._broadcast<EngineStats>({ kind: 'getStats' });
     let documents = 0, chunks = 0, textUsed = 0, textCapacity = 0, wasmMemoryBytes = 0;
-    let maxChunks = 0, maxDocs = 0;
+    let maxChunks = 0, maxDocs = 0, namePoolBytes = 0;
     for (const s of all) {
       documents       += s.documents;
       chunks          += s.chunks;
@@ -286,10 +310,11 @@ export class AlbexPool {
       wasmMemoryBytes += s.wasmMemoryBytes;
       maxChunks       += s.maxChunks;
       maxDocs         += s.maxDocs;
+      namePoolBytes   += s.namePoolBytes;
     }
     return {
       documents, chunks, textUsed, textCapacity, wasmMemoryBytes,
-      tier: this._tier, maxChunks, maxDocs,
+      maxChunks, maxDocs, namePoolBytes,
     };
   }
@@ -303,15 +328,17 @@ export class AlbexPool {
   async setMaxErrors(n: 0 | 1 | 2 | 3):       Promise<void> { await this._broadcast({ kind: 'setMaxErrors',  n }); }
   async setThreshold(n: number):              Promise<void> { await this._broadcast({ kind: 'setThreshold', n }); }
-  async setMaxResults(n: number):             Promise<void> { await this._broadcast({ kind: 'setMaxResults', n }); }
+  async setMaxResults(n: number):             Promise<void> {
+    // Track the effective cap (same clamp as the WASM engine) so search()
+    // can enforce it globally after the cross-shard merge.
+    this._maxResults = Math.max(1, Math.min(200, Math.floor(n)));
+    await this._broadcast({ kind: 'setMaxResults', n });
+  }
   async setLanguage(lang: 'off' | 'es'):      Promise<void> { await this._broadcast({ kind: 'setLanguage', lang }); }
   /** Number of shards currently running. */
   get workerCount(): number { return this._shards.length; }
-  /** Tier loaded by the shards (same value across all of them). */
-  get tier(): Tier | null { return this._tier; }
   [Symbol.dispose](): void {
     for (const s of this._shards) {
       for (const [, p] of s.pending) p.reject(new AlbexError('disposed', 'Pool disposed'));

package/src/profile.ts CHANGED

@@ -233,19 +233,20 @@ export async function detectProfile(opts: { fresh?: boolean } = {}): Promise<Dev
 // ── Tier selection ───────────────────────────────────────────────────────────
-export type Tier = 'mini' | 'std' | 'pro';
+/**
+ * @deprecated The tier system was removed in 0.5.0 (audit 4.1: "6
+ * binaries × no proven benefit"). The type remains exported as `'std'`
+ * for backwards compatibility with code that read `engine.getStats().tier`.
+ */
+export type Tier = 'std';
 /**
- * Choose the optimal binary tier from a profile.
- *
- * The thresholds are conservative: a device with `deviceMemory === null`
- * (Safari) defaults to `std` to avoid both over- and under-provisioning.
+ * @deprecated Always returns `'std'` as of 0.5.0. Albex ships exactly
+ * two main binaries (baseline + SIMD); the only runtime variant is the
+ * SIMD probe, not a capacity tier. Kept callable so existing integrators
+ * don't break, but the value has no operational meaning anymore.
  */
-export function pickTier(profile: DeviceProfile): Tier {
-  const m = profile.memoryGB;
-  if (m === null) return 'std';
-  if (m <= 1) return 'mini';
-  if (m >= 8) return 'pro';
+export function pickTier(_profile: DeviceProfile): Tier {
   return 'std';
 }

package/src/wasm-bindings.ts CHANGED

@@ -17,9 +17,36 @@
 export interface AlbexWasmExports {
   readonly memory: WebAssembly.Memory;
-  // Scratchpad / lifecycle
+  // ABI / lifecycle
+  abiVersion(): number;
   getBuffer(size: number): number;
+  /** Reset with the std default capacities (128 docs · 100k chunks · 16 MB
+   * text · 32 KB names) — identical behaviour to every pre-ABI-7 release. */
   init(): void;
+  /** (Re-)initialise the engine with runtime capacities (ABI 7, decision
+   * A16). Allocates the capacity-dependent pools on the WASM heap. Returns
+   * 1 on success; 0 on invalid parameters (floors/ceilings documented in
+   * wasm/src/lib.rs: ≥1 doc, docs ≤ 65 536, chunks ≥ docs and ≤ 4 M, text
+   * pool 4 KiB–1 GiB, name pool 256 B–16 MiB) or on allocation failure —
+   * never traps. Re-init with the same capacities is a plain reset; with
+   * different capacities the pools are freed and re-allocated (no leak,
+   * but the linear-memory high-water mark never shrinks). */
+  initWithCapacity(
+    maxDocs: number,
+    maxChunks: number,
+    textPoolBytes: number,
+    namePoolBytes: number,
+  ): number;
+  /** Reset the streaming FNV-1a 64-bit hash state. Optional on the first
+   * hash of a session because the static initialiser is also FNV_OFFSET. */
+  hashBegin(): void;
+  /** Fold `len` bytes of scratchpad into the streaming hash. May be
+   * called repeatedly for files larger than SCRATCHPAD_SIZE. */
+  hashFeed(len: number): void;
+  /** Write the final 8 raw big-endian bytes at scratchpad[0..8] and
+   * reset the state so the next hash can start without an explicit Begin. */
+  hashFinish(): void;
   // Document ingestion
   setDocumentName(len: number): void;
@@ -40,6 +67,18 @@ export interface AlbexWasmExports {
   setThreshold(threshold: number): void;
   setMaxResults(max: number): void;
+  // Query parsing (since ABI v2). Single source of truth for tokenization.
+  prepareQuery(len: number): number;
+  getQueryKind(): number;
+  getQueryBranchCount(): number;
+  getQueryBranchPattern(i: number): number;
+  selectQueryBranch(i: number): number;
+  /** Bitflags of what the most recent prepareQuery dropped or clipped
+   * (ABI 5): 1 = OR branches beyond 8 discarded, 2 = tokens dropped
+   * (> 4 per branch) or clipped (> 64 bytes), 4 = raw query cut at
+   * 1024 bytes. 0 = compiled in full. */
+  getQueryTruncationFlags(): number;
   // Search execution
   setPattern(len: number): number;
   search(): number;
@@ -50,6 +89,15 @@ export interface AlbexWasmExports {
   getSearchTotal(): number;
   // Result accessors
+  /** Base pointer of the `#[repr(C)]` RESULTS array (ABI 6). Read
+   * `getResultCount()` records of `getResultStride()` bytes each with one
+   * DataView pass instead of ~12 frontier calls per result. Field offsets
+   * are documented (and compile-time asserted) in wasm/src/lib.rs. Copy
+   * everything out before any call that could grow WASM memory. */
+  getResultsPtr(): number;
+  /** Byte stride between consecutive RESULTS records (= sizeof(DocMatch),
+   * 60 today). Exported so the host never hardcodes the struct size. */
+  getResultStride(): number;
   getResultCount(): number;
   getResultDocId(i: number): number;
   getResultLocation(i: number): number;
@@ -73,12 +121,27 @@ export interface AlbexWasmExports {
   getDocCount(): number;
   getTextUsed(): number;
   getTextCapacity(): number;
+  /** Bitflags of capacity limits hit during the most recent
+   * begin..endDocument cycle: 1 = chunks, 2 = text, 4 = docs, 8 = names.
+   * 0 = everything fit. Read by the host right after endDocument to raise a
+   * typed AlbexCapacityError instead of silently truncating the corpus. */
+  getLastIndexOverflow(): number;
-  // Snapshot / restore
+  // Snapshot / restore (v3 protocol; v1 and v2 still load)
   snapshotSize(): number;
   snapshotChunk(offset: number, maxLen: number): number;
+  /** Validate header. For v3 also reserves the staging buffer; state is
+   * NOT touched until restoreCommit succeeds. For v1/v2 (legacy) state is
+   * reset and counters are written immediately. */
   restoreBegin(): number;
+  /** Feed payload bytes. For v3 they accumulate into staging; for v1/v2
+   * they are written straight to the state arrays as before. */
   restoreFeed(len: number): number;
+  /** Atomic commit for v3 snapshots. Returns 1 if the staged payload was
+   * complete and decoded successfully; 0 otherwise — and in the 0 case
+   * the previous engine state is preserved. For v1/v2 this is a no-op
+   * that always returns 1. */
+  restoreCommit(): number;
   // Incremental / per-doc
   getDocId(index: number): number;
@@ -88,6 +151,27 @@ export interface AlbexWasmExports {
   removeDocument(docId: number): number;
   compact(): void;
+  // Authoritative chunk enumeration (ABI 4). Address a document's chunks by
+  // (doc slot, ordinal). The compact()-stable key is (doc_id, ordinal).
+  /** First CHUNKS[] index for the document at slot `index` (= chunk_start).
+   * `ord = resultChunkIdx - getDocChunkBase(slot)` gives the doc-relative
+   * ordinal of a search hit. */
+  getDocChunkBase(index: number): number;
+  /** `location` (paragraph/page) of the `ord`-th chunk; u32::MAX if OOB. */
+  getChunkLocationAt(index: number, ord: number): number;
+  /** Byte length of the `ord`-th chunk's text; 0 if OOB. */
+  getChunkByteLenAt(index: number, ord: number): number;
+  /** Copy the `ord`-th chunk's UTF-8 text into the scratchpad; returns byte
+   * length (0 if OOB). Lets a host enumerate a doc's authoritative chunks
+   * right after indexing, with no query. */
+  getChunkTextAt(index: number, ord: number): number;
+  /** Batch chunk enumeration (ABI 6). Packs up to `maxChunks` records
+   * `[u32 text_len][u32 location][text bytes]` (LE, tightly packed) into
+   * the scratchpad starting at ordinal `startOrd`; returns how many were
+   * written. One frontier call per scratchpad-full instead of 2-3 per
+   * chunk. */
+  listChunksBatch(index: number, startOrd: number, maxChunks: number): number;
   /**
    * Per-document content hash (snapshot v2). Returns a pointer to 8 bytes
    * holding the FNV-1a 64-bit hash of the original file bytes, or 0 if the
@@ -106,8 +190,8 @@ export interface AlbexWasmExports {
   // Stemming
   setLanguage(lang: number): void;
-  // Tier identification
-  getTier(): number;        // 1=mini, 2=std, 3=pro
+  // Runtime capacity identification (ABI 7). Report the capacities the
+  // engine was last initialised with — `init()` = the std defaults.
   getMaxChunks(): number;
   getMaxDocs(): number;
   getNameCapacity(): number;
@@ -117,6 +201,13 @@ export interface AlbexWasmExports {
   getChunkStructSize(): number;
   setCandidateMask(byteLen: number): void;
   clearCandidateMask(): void;
+  /** Low 32 bits of the active pattern's aggregate character Bloom
+   * (ABI 6). Computed WASM-side in setPattern through the same pipeline
+   * searchBegin uses (split → optional stemming → fold), so the GPU
+   * pre-filter tests exactly the bits the CPU path would. */
+  getPatternBloomLo(): number;
+  /** High 32 bits of the active pattern's aggregate character Bloom. */
+  getPatternBloomHi(): number;
 }
 // ─────────────────────────────────────────────────────────────────────────────
@@ -126,6 +217,10 @@ export interface AlbexWasmExports {
 export interface AlbexPdfExports {
   readonly memory: WebAssembly.Memory;
+  /** ABI version of the PDF module. The host loader refuses any binary
+   * whose abiVersion is outside the supported range. */
+  abiVersion(): number;
   /** Reserve `len` bytes inside the PDF module and return a pointer. */
   allocInput(len: number): number;
@@ -183,18 +278,138 @@ export interface AlbexPdfExports {
 }
 // ─────────────────────────────────────────────────────────────────────────────
-// Narrowing helpers for instantiation results
+// Runtime validators
 // ─────────────────────────────────────────────────────────────────────────────
+//
+// These replace the pre-0.5.0 `as unknown as` casts. They check three
+// things at instantiation time:
+//   1. memory is a WebAssembly.Memory instance.
+//   2. abiVersion() returns a number inside the supported range.
+//   3. every required export exists and is a function.
+//
+// If any of these fails, the loader throws a typed error before the
+// engine returns from init(). This eliminates the audit 3.2 issue:
+// previously a missing export only surfaced when its call site ran.
-/**
- * Cast `WebAssembly.Exports` to the typed Albex main interface.
- * Runtime check is intentionally minimal — if the .wasm doesn't match,
- * the first call site that touches a missing function throws naturally.
- */
+/** Range of ABI versions this host code understands for the main module.
+ * Update both ends together with the Rust `abiVersion()` constant when
+ * the export surface changes. */
+// ABI 7 adds runtime capacity (initWithCapacity, decision A16) and removes
+// the compile-time tier system (`getTier` is gone), on top of ABI 6's batch
+// frontier reads, ABI 5's truncation signalling and ABI 4's authoritative
+// chunk enumeration. The required-exports list below already makes any
+// older binary fail the missing-exports check, so a tolerant lower bound
+// was dead code — the range is pinned to the one ABI this host actually
+// speaks (audit 0.6.0, finding #7). The .wasm ships inside this package
+// (files: wasm/pkg/*.wasm), so host TS and binary are always
+// version-matched.
+const MAIN_ABI_MIN = 7;
+const MAIN_ABI_MAX = 7;
+/** Range of ABI versions for the PDF module. */
+const PDF_ABI_MIN = 1;
+const PDF_ABI_MAX = 3;
+/** Required function names on the main WASM. Adding a new one here forces
+ * the validator to check it; removing one is a breaking ABI bump. */
+const MAIN_REQUIRED = [
+  'abiVersion', 'getBuffer', 'init', 'initWithCapacity',
+  'setDocumentName', 'beginDocument', 'feedXmlBytes', 'endDocument',
+  'beginXlsx', 'feedXlsxBytes',
+  'feedText', 'flushParagraph',
+  'setMaxErrors', 'setThreshold', 'setMaxResults',
+  'prepareQuery', 'getQueryKind', 'getQueryBranchCount',
+  'getQueryBranchPattern', 'selectQueryBranch', 'getQueryTruncationFlags',
+  'setPattern', 'search',
+  'searchBegin', 'searchSlice', 'getSearchCursor', 'getSearchTotal',
+  'getResultCount', 'getResultsPtr', 'getResultStride',
+  'getResultDocId', 'getResultLocation', 'getResultScore',
+  'getResultStart', 'getResultEnd', 'getResultChunkIdx',
+  'getResultDocName', 'getResultMatchCount',
+  'getResultMatchStartAt', 'getResultMatchEndAt',
+  'getSnippet', 'getSnippetWindow', 'getSnippetWindowOffset',
+  'getStatBloomTested', 'getStatBloomPassed', 'getStatBitapMatched',
+  'getChunkCount', 'getDocCount', 'getTextUsed', 'getTextCapacity',
+  'getLastIndexOverflow',
+  'snapshotSize', 'snapshotChunk',
+  'restoreBegin', 'restoreFeed', 'restoreCommit',
+  'getDocId', 'getDocChunkCount', 'getDocName', 'isDocDeleted',
+  'removeDocument', 'compact',
+  'getDocChunkBase', 'getChunkLocationAt', 'getChunkByteLenAt', 'getChunkTextAt',
+  'listChunksBatch',
+  'setLanguage',
+  'getMaxChunks', 'getMaxDocs', 'getNameCapacity',
+  'getChunksPtr', 'getChunkStructSize',
+  'setCandidateMask', 'clearCandidateMask',
+  'getPatternBloomLo', 'getPatternBloomHi',
+  'getDocContentHashPtr', 'getDocContentHashLen', 'setDocumentContentHash',
+  'hashBegin', 'hashFeed', 'hashFinish',
+] as const;
+const PDF_REQUIRED = [
+  'abiVersion', 'allocInput', 'extractPdf',
+  'getPageLen', 'getPagePtr', 'getErrorLen', 'getErrorPtr',
+  'getPageCount', 'extractPageImages',
+  'getPageImageLen', 'getPageImagePtr', 'getPageImageKind',
+] as const;
+/** Thrown when an instantiated WASM module fails the ABI contract. */
+export class AlbexAbiMismatchError extends Error {
+  readonly module: 'main' | 'pdf';
+  readonly missing?: readonly string[];
+  readonly version?: number;
+  constructor(module: 'main' | 'pdf', message: string, opts?: { missing?: readonly string[]; version?: number }) {
+    super(message);
+    this.name = 'AlbexAbiMismatchError';
+    this.module = module;
+    if (opts?.missing) this.missing = opts.missing;
+    if (opts?.version !== undefined) this.version = opts.version;
+  }
+}
+function validateExports(
+  exports: WebAssembly.Exports,
+  required: readonly string[],
+  module: 'main' | 'pdf',
+  abiMin: number,
+  abiMax: number,
+): void {
+  const mem = (exports as Record<string, unknown>)['memory'];
+  if (!(mem instanceof WebAssembly.Memory)) {
+    throw new AlbexAbiMismatchError(module, `${module}: \`memory\` is missing or not a WebAssembly.Memory instance.`);
+  }
+  const missing: string[] = [];
+  for (const name of required) {
+    if (typeof (exports as Record<string, unknown>)[name] !== 'function') missing.push(name);
+  }
+  if (missing.length) {
+    throw new AlbexAbiMismatchError(
+      module,
+      `${module}: WASM binary missing required exports: ${missing.join(', ')}. ` +
+      `The .wasm was built with an incompatible source — rebuild with the current toolchain.`,
+      { missing },
+    );
+  }
+  const version = ((exports as Record<string, unknown>)['abiVersion'] as () => number)();
+  if (version < abiMin || version > abiMax) {
+    throw new AlbexAbiMismatchError(
+      module,
+      `${module}: abiVersion ${version} outside supported range [${abiMin}..${abiMax}]. ` +
+      `The host TypeScript expects a different binary — upgrade albex or rebuild the WASM.`,
+      { version },
+    );
+  }
+}
+/** Validate and narrow `WebAssembly.Exports` to the typed Albex main
+ * interface. Throws `AlbexAbiMismatchError` if the contract is broken. */
 export function asAlbexExports(exports: WebAssembly.Exports): AlbexWasmExports {
+  validateExports(exports, MAIN_REQUIRED, 'main', MAIN_ABI_MIN, MAIN_ABI_MAX);
   return exports as unknown as AlbexWasmExports;
 }
+/** Validate and narrow `WebAssembly.Exports` to the typed PDF interface. */
 export function asAlbexPdfExports(exports: WebAssembly.Exports): AlbexPdfExports {
+  validateExports(exports, PDF_REQUIRED, 'pdf', PDF_ABI_MIN, PDF_ABI_MAX);
   return exports as unknown as AlbexPdfExports;
 }

package/src/worker-protocol.ts CHANGED

@@ -10,13 +10,19 @@
  * copying the file bytes into the worker.
  */
-import type { AlbexOptions, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
+import type { AlbexDiagnostic, AlbexOptions, AuthoritativeChunk, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
 export type WorkerOp =
   | { kind: 'init';            opts: AlbexOptions }
   | { kind: 'indexFile';        name: string; buffer: ArrayBuffer }
   | { kind: 'search';           query: string; options: SearchOptions }
+  | { kind: 'listChunks';       docId: number }
   | { kind: 'removeDocument';   id: string }
+  /** Replace doc `name` with new content. `fileName` is the replacement
+   * file's own name (may differ from `name`); the bytes travel as a
+   * transferred ArrayBuffer like `indexFile`. */
+  | { kind: 'replaceDocument';  name: string; fileName: string; buffer: ArrayBuffer }
+  | { kind: 'takeDiagnostics' }
   | { kind: 'compact' }
   | { kind: 'reset' }
   | { kind: 'getStats' }
@@ -39,10 +45,14 @@ export interface WorkerRequest {
 export type WorkerResponse =
   | { id: number; ok: true;  result: unknown }
-  | { id: number; ok: false; error: { name: string; kind?: string; message: string } };
+  /** `limit`/`max` are populated for capacity errors so the rehydrated
+   * AlbexCapacityError keeps reporting the runtime limit that overflowed. */
+  | { id: number; ok: false; error: { name: string; kind?: string; message: string; limit?: string; max?: number } };
 export type IndexFileResult = IndexedDocument;
 export type SearchResultArr = SearchResult[];
+export type ChunksResult    = AuthoritativeChunk[];
 export type StatsResult     = EngineStats;
 export type SearchStatsRes  = SearchStats | null;
 export type DocsResult      = readonly IndexedDocument[];
+export type DiagnosticsRes  = AlbexDiagnostic[];