albex 0.3.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/CHANGELOG.md +275 -0
  2. package/README.md +4 -2
  3. package/dist/albex-worker.js +1 -1
  4. package/dist/albex.d.ts +157 -17
  5. package/dist/albex.d.ts.map +1 -1
  6. package/dist/albex.js +405 -232
  7. package/dist/albex.js.map +1 -1
  8. package/dist/errors.d.ts +16 -2
  9. package/dist/errors.d.ts.map +1 -1
  10. package/dist/errors.js +6 -3
  11. package/dist/errors.js.map +1 -1
  12. package/dist/persistence.js +1 -1
  13. package/dist/profile.d.ts +11 -6
  14. package/dist/profile.d.ts.map +1 -1
  15. package/dist/profile.js +6 -13
  16. package/dist/profile.js.map +1 -1
  17. package/dist/resource-manager.js +1 -1
  18. package/dist/tiered-store.js +1 -1
  19. package/dist/wasm-bindings.d.ts +46 -5
  20. package/dist/wasm-bindings.d.ts.map +1 -1
  21. package/dist/wasm-bindings.js +102 -7
  22. package/dist/wasm-bindings.js.map +1 -1
  23. package/dist/worker-protocol.js +1 -1
  24. package/dist/worker-runtime.js +12 -3
  25. package/dist/worker-runtime.js.map +1 -1
  26. package/package.json +13 -9
  27. package/src/albex.ts +478 -246
  28. package/src/errors.ts +18 -2
  29. package/src/profile.ts +11 -10
  30. package/src/wasm-bindings.ts +157 -8
  31. package/src/worker-runtime.ts +12 -2
  32. package/wasm/pkg/albex_pdf.wasm +0 -0
  33. package/wasm/pkg/albex_wasm.wasm +0 -0
  34. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  35. package/wasm/pkg/albex_wasm_simd.wasm +0 -0
  36. package/wasm/pkg/albex_wasm_mini.wasm +0 -0
  37. package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
  38. package/wasm/pkg/albex_wasm_pro.wasm +0 -0
  39. package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
  40. package/wasm/pkg/albex_wasm_std.wasm +0 -0
  41. package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/src/albex.ts CHANGED
@@ -21,10 +21,12 @@ import {
21
21
  asAlbexPdfExports,
22
22
  } from './wasm-bindings.js';
23
23
  import {
24
+ AlbexError,
24
25
  AlbexInitError,
25
26
  AlbexUnsupportedFormatError,
26
27
  AlbexParseError,
27
28
  AlbexCapacityError,
29
+ type AlbexCapacityLimit,
28
30
  } from './errors.js';
29
31
  import {
30
32
  savePersisted,
@@ -32,7 +34,7 @@ import {
32
34
  deletePersisted,
33
35
  listPersisted,
34
36
  } from './persistence.js';
35
- import { detectProfile, pickTier, shouldUseGpu, type Tier, type DeviceProfile } from './profile.js';
37
+ import { detectProfile, shouldUseGpu, type Tier, type DeviceProfile } from './profile.js';
36
38
  import { getResourceManager, type ResourceState } from './resource-manager.js';
37
39
  import { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
38
40
 
@@ -96,6 +98,8 @@ export interface AlbexOptions {
96
98
  * Override the tier auto-detection. Pass `'auto'` (default), or an
97
99
  * explicit tier when you know the constraints of your target environment.
98
100
  */
101
+ /** @deprecated No effect since 0.5.0. Albex no longer has capacity tiers;
102
+ * pass `'auto'` or omit. Other values are accepted and ignored. */
99
103
  tier?: 'auto' | 'mini' | 'std' | 'pro';
100
104
  /**
101
105
  * SIMD selection. When `'auto'` (default), Albex probes for v128 support
@@ -201,57 +205,57 @@ export interface SearchStats {
201
205
  bitapMatched: number;
202
206
  }
203
207
 
204
- // ─────────────────────────────────────────────────────────────────────────────
205
- // Query parsing
206
- // ─────────────────────────────────────────────────────────────────────────────
207
-
208
- type SimpleQuery = { kind: 'simple'; tokens: string[] };
209
- type PhraseQuery = { kind: 'phrase'; tokens: string[]; raw: string };
210
- type OrQuery = { kind: 'or'; branches: string[][] };
211
- type ParsedQuery = SimpleQuery | PhraseQuery | OrQuery;
212
-
213
- function tokenize(q: string): string[] {
214
- return q.trim().split(/\s+/).filter(t => t.length > 0);
215
- }
216
-
217
- function parseQuery(q: string): ParsedQuery {
218
- const trimmed = q.trim();
219
-
220
- // OR: "term1 | term2" or "phrase one | phrase two"
221
- if (trimmed.includes('|')) {
222
- const branches = trimmed.split('|')
223
- .map(p => tokenize(p.replace(/"/g, '')))
224
- .filter(b => b.length > 0);
225
- return { kind: 'or', branches };
226
- }
227
-
228
- // Phrase: "exact phrase here"
229
- const phraseMatch = /^"(.+)"$/.exec(trimmed);
230
- if (phraseMatch) {
231
- const inner = phraseMatch[1] ?? '';
232
- const tokens = tokenize(inner);
233
- return { kind: 'phrase', tokens, raw: inner };
234
- }
235
-
236
- return { kind: 'simple', tokens: tokenize(trimmed) };
237
- }
238
-
239
208
  /**
240
- * Reconstruct a WASM-compatible query string from parsed tokens.
241
- * The WASM engine accepts up to 4 space-separated tokens (AND semantics).
209
+ * One structured warning recorded by the engine during indexFile or
210
+ * load. Replaces the pre-0.5.0 pattern of scattered `console.warn`
211
+ * calls. Inspect via `engine.takeDiagnostics()` after the operation.
242
212
  */
243
- function tokensToWasmQuery(tokens: string[]): string {
244
- return tokens.slice(0, 4).join(' ');
213
+ export interface AlbexDiagnostic {
214
+ /** Coarse kind. `'recovered'` means the engine handled the issue and
215
+ * kept going; `'skipped'` means content was dropped; `'fallback'` means
216
+ * an alternate code path was used (e.g. lopdf after pdf-extract trap); `'info'` is a purely informational note. */
217
+ kind: 'recovered' | 'skipped' | 'fallback' | 'info';
218
+ /** Where in the pipeline this happened. One of a fixed set of short tags. */
219
+ stage: 'pdf' | 'ocr' | 'gpu' | 'persistence' | 'network';
220
+ /** Human-readable message safe to surface in a UI. */
221
+ message: string;
222
+ /** Optional file the issue belongs to. */
223
+ file?: string;
224
+ /** Optional page number (1-based for PDFs). */
225
+ page?: number;
245
226
  }
246
227
 
247
228
  // ─────────────────────────────────────────────────────────────────────────────
248
- // Phrase post-filter
229
+ // Query parsing (WASM-side as of 0.5.0)
249
230
  // ─────────────────────────────────────────────────────────────────────────────
231
+ //
232
+ // Pre-0.5.0 this file owned parseQuery + tokenize. That created two
233
+ // truths about what a "token" was: one in TS for the query, one in Rust
234
+ // for the indexed text. The audit flagged this as the biggest divergence
235
+ // in the wrapper.
236
+ //
237
+ // 0.5.0 moves parseQuery/tokenize/tokensToWasmQuery to Rust. The TS
238
+ // dispatcher reduces to:
239
+ //
240
+ // 1. Write the raw UTF-8 query bytes to the scratchpad.
241
+ // 2. Call prepareQuery(len). Get back the kind (simple/phrase/or).
242
+ // 3. For OR: iterate getQueryBranchCount() branches, calling
243
+ // selectQueryBranch(i) + search() for each, then merge in TS.
244
+ // For simple/phrase: selectQueryBranch(0) + search().
245
+ // 4. For phrase: post-filter the snippets with containsPhrase().
246
+ //
247
+ // containsPhrase stays in TS because it operates on snippet text already
248
+ // produced by the WASM, not on the query. It is not a tokenizer.
250
249
 
251
250
  /**
252
- * Returns true if `snippet` contains the phrase formed by `tokens` in order,
253
- * with at most `maxGap` characters between consecutive tokens.
254
- * Comparison is case- and accent-insensitive.
251
+ * Phrase post-filter. Returns true if `snippet` contains the phrase
252
+ * formed by `tokens` in order, with at most `maxGap` characters between
253
+ * consecutive tokens. Comparison is case- and accent-insensitive.
254
+ *
255
+ * The tokens come from the WASM-compiled pattern of a phrase branch,
256
+ * not from a TS re-tokenization of the query, so there is no
257
+ * tokenization divergence: WASM said "these are the tokens", we just
258
+ * check adjacency in the snippet.
255
259
  */
256
260
  function containsPhrase(snippet: string, tokens: string[], maxGap = 30): boolean {
257
261
  const norm = (s: string): string =>
@@ -392,32 +396,11 @@ function computePatternBloom(query: string): bigint {
392
396
  return bits;
393
397
  }
394
398
 
395
- function contentHash(bytes: Uint8Array): string {
396
- // 64-bit arithmetic via two 32-bit halves (no BigInt to keep it fast in
397
- // engines without optimised BigInt support).
398
- let hi = 0xcbf29ce4 | 0;
399
- let lo = 0x84222325 | 0;
400
- // FNV prime: 0x100000001b3 = (0x100 << 32) | 0x000001b3
401
- for (let i = 0; i < bytes.length; i++) {
402
- lo ^= bytes[i]!;
403
- // multiply by FNV prime
404
- // (hi:lo) *= 0x100000001b3
405
- // low * prime
406
- const lo_lo = (lo & 0xffff) * 0x1b3;
407
- const lo_hi = (lo >>> 16) * 0x1b3;
408
- let new_lo = (lo_lo + ((lo_hi & 0xffff) << 16)) | 0;
409
- let carry = (lo_hi >>> 16) + ((lo_lo + ((lo_hi & 0xffff) << 16)) > 0xffffffff ? 1 : 0);
410
- // hi*prime + carry
411
- const hi_lo = (hi & 0xffff) * 0x1b3;
412
- const hi_hi = (hi >>> 16) * 0x1b3;
413
- const new_hi = ((hi_lo + ((hi_hi & 0xffff) << 16)) | 0) + carry + lo; // + lo because high 33rd bit
414
- lo = new_lo;
415
- hi = new_hi | 0;
416
- }
417
- const hexHi = (hi >>> 0).toString(16).padStart(8, '0');
418
- const hexLo = (lo >>> 0).toString(16).padStart(8, '0');
419
- return hexHi + hexLo;
420
- }
399
+ // Note: the content hash is implemented as `AlbexEngine._contentHash` below
400
+ // (it needs access to the WASM scratchpad). The standalone TS reference
401
+ // implementation that used to live here was removed in 0.4.0 — the
402
+ // canonical hash now lives in wasm/src/lib.rs::hashBytes so there is
403
+ // exactly one definition of "the content hash of these bytes".
421
404
 
422
405
  /**
423
406
  * 16-hex-char content hash → 8 raw bytes for setDocumentContentHash. The
@@ -612,11 +595,20 @@ function makePdfWasmImports(
612
595
  case '__wbindgen_externref_table_set_null':
613
596
  return (idx: number) => { heap[idx] = undefined; };
614
597
  }
615
- // Unknown import — return a stub that warns when called. Loading still
616
- // succeeds; only an actually-invoked unknown import will surface.
617
- return (...args: unknown[]) => {
618
- console.warn(`[albex] unhandled PDF WASM import ${modName}.${name}`, args);
619
- };
598
+ // Unknown import — fail fast. An import we don't recognise means the
599
+ // wasm-bindgen / lopdf / getrandom dependency graph has drifted from
600
+ // the prefixes this loader is written to satisfy. Accepting the
601
+ // module would defer the failure to an arbitrary execution path,
602
+ // typically deep inside extractPdf(), where the user gets either a
603
+ // hang or a misleading "PDF parse error". Refusing instantiation
604
+ // surfaces the version skew at boot, where the maintainer can act
605
+ // on it.
606
+ throw new AlbexInitError(
607
+ `Unknown PDF WASM import "${modName}.${name}". ` +
608
+ `The albex_pdf.wasm binary was probably built with a newer Rust ` +
609
+ `toolchain or dependency graph than this loader was written for. ` +
610
+ `Rebuild with 'npm run build:pdf-wasm' or open an issue.`,
611
+ );
620
612
  };
621
613
 
622
614
  const imports: Record<string, Record<string, unknown>> = {};
@@ -647,6 +639,39 @@ export interface OcrAttachedOptions {
647
639
  hint?: string;
648
640
  }
649
641
 
642
+ /**
643
+ * Contract the engine accepts from an OCR plugin. `@albex/ocr` is the
644
+ * canonical implementation, but any module that satisfies this interface
645
+ * can be attached via `engine.attachOcr(adapter)`.
646
+ */
647
+ export interface OcrAdapter {
648
+ /** Invoked by the engine to OCR a single image. Receives whatever the
649
+ * caller passes (Blob, ArrayBuffer, etc.); the adapter is responsible
650
+ * for accepting that input. Must return text + confidence. */
651
+ recognize(image: unknown, opts?: OcrAttachedOptions): Promise<OcrAttachedResult>;
652
+
653
+ /** Engine-side switches the adapter wants honoured. The only one
654
+ * defined today is `alwaysExtractEmbeddedImages`, which turns on the
655
+ * hybrid PDF OCR pass. New flags can be added without breaking the
656
+ * adapter interface. */
657
+ options?: {
658
+ /** When true, every PDF (native or scanned) is walked for embedded
659
+ * images and each qualifying image is sent to `recognize`. Off by
660
+ * default to keep performance predictable on native PDFs. */
661
+ alwaysExtractEmbeddedImages?: boolean;
662
+ };
663
+ }
664
+
665
+ /** Returned by `attachOcr`. Holds the lifecycle handles for the plugin.
666
+ * Calling `dispose()` removes the adapter from the engine; subsequent
667
+ * `engine.ocrImage` access returns `undefined` again. */
668
+ export interface OcrHandle {
669
+ /** Detach the plugin and tear down any resources it holds. After this,
670
+ * the engine reverts to "no OCR" — scanned PDFs go back to registering
671
+ * with zero chunks. */
672
+ dispose(): Promise<void>;
673
+ }
674
+
650
675
  export class AlbexEngine {
651
676
  // ── main WASM ──
652
677
  private _wasm!: AlbexWasmExports;
@@ -658,23 +683,20 @@ export class AlbexEngine {
658
683
  * runtime dependency on OCR — this is a structural slot that the optional
659
684
  * companion package fills.
660
685
  */
661
- ocrImage?: (image: unknown, opts?: OcrAttachedOptions) => Promise<OcrAttachedResult>;
662
-
663
686
  /**
664
- * Optional OCR-side configuration set by `@albex/ocr::enableOcr`. Read
665
- * by the engine to decide whether to invoke OCR on top of the text it
666
- * already extracted from a PDF (hybrid PDFs: native text + images that
667
- * also contain text, like stamps, scanned annexes, or diagrams with
668
- * labels).
669
- *
670
- * When `alwaysExtractEmbeddedImages` is true, every page of every PDF
671
- * passes through `extractPageImages` after the normal text extraction;
672
- * any image that meets the size filter (200×200 in Rust) is fed to
673
- * `ocrImage`. Performance cost: 1–3 s per qualifying image.
674
- *
675
- * Off by default — set this opt-in via the OCR module's options.
687
+ * Public OCR entry point. Forwards to the attached OCR adapter installed
688
+ * via `attachOcr()`. Reading this property is a feature-detect for
689
+ * integrators: `if (engine.ocrImage) { ... OCR available ... }`. Writing
690
+ * to it directly is no longer supported in 0.5.0+ — use `attachOcr`.
676
691
  */
677
- ocrConfig?: { alwaysExtractEmbeddedImages?: boolean };
692
+ get ocrImage(): ((image: unknown, opts?: OcrAttachedOptions) => Promise<OcrAttachedResult>) | undefined {
693
+ return this._ocrAdapter?.recognize;
694
+ }
695
+
696
+ /** Private adapter slot. Holds the OCR plugin contract installed by
697
+ * `attachOcr()`. The engine reads `recognize` and `options` here; the
698
+ * caller never gets a reference to this object directly. */
699
+ private _ocrAdapter: OcrAdapter | null = null;
678
700
 
679
701
  // ── PDF WASM (lazy) ──
680
702
  private _pdfWasm: AlbexPdfExports | null = null;
@@ -682,6 +704,11 @@ export class AlbexEngine {
682
704
 
683
705
  private _docs: IndexedDocument[] = [];
684
706
  private _lastSearch: SearchStats | null = null;
707
+ /** Structured diagnostics collected during the most recent operation.
708
+ * Drained by `takeDiagnostics()`. Capped at 256 entries to avoid
709
+ * unbounded memory growth in pathological cases (very corrupted
710
+ * corpora producing thousands of recovery warnings). */
711
+ private _diagnostics: AlbexDiagnostic[] = [];
685
712
  private _tier: Tier | null = null;
686
713
  private _simd: boolean = false;
687
714
  private _profile: DeviceProfile | null = null;
@@ -691,10 +718,55 @@ export class AlbexEngine {
691
718
  private _unsubscribeResources: (() => void) | null = null;
692
719
  private readonly _opts: AlbexOptions;
693
720
 
721
+ // ── Concurrency guard ──────────────────────────────────────────────────────
722
+ // One WASM instance, global mutable state, async ops that yield to the
723
+ // scheduler between slices. Two overlapping operations corrupt each other
724
+ // (e.g. a fresh searchBegin resets the cursor of an in-flight cooperative
725
+ // search). Async ops serialize through `_opChain`; sync mutators/searches
726
+ // assert the engine is idle (audit 0.6.0, finding #2).
727
+ private _opChain: Promise<unknown> = Promise.resolve();
728
+ private _busy = false;
729
+
694
730
  constructor(opts: AlbexOptions) {
695
731
  this._opts = opts;
696
732
  }
697
733
 
734
+ /** Serialize an async engine operation behind any in-flight one. */
735
+ private _exclusive<T>(fn: () => Promise<T>): Promise<T> {
736
+ const run = this._opChain.then(async () => {
737
+ this._busy = true;
738
+ try { return await fn(); }
739
+ finally { this._busy = false; }
740
+ });
741
+ // Swallow result/error on the chain so one failure can't wedge the queue.
742
+ this._opChain = run.then(() => undefined, () => undefined);
743
+ return run as Promise<T>;
744
+ }
745
+
746
+ /** Guard a synchronous mutator/search: refuse to run mid-async-operation
747
+ * rather than silently corrupt the shared WASM state. */
748
+ private _assertIdle(method: string): void {
749
+ if (this._busy) {
750
+ throw new AlbexError(
751
+ 'busy',
752
+ `${method}() was called while an async engine operation is still ` +
753
+ `running. Await the previous indexFile/save/load/replaceDocument/` +
754
+ `searchCooperative call, or use searchCooperative instead of search().`,
755
+ );
756
+ }
757
+ }
758
+
759
+ /** Compact opportunistically when tombstones pile up under text pressure,
760
+ * so repeated removeDocument/replaceDocument don't exhaust the pool. */
761
+ private _autoCompactIfNeeded(): void {
762
+ const w = this._wasm;
763
+ const cap = w.getTextCapacity();
764
+ const hasTombstones = w.getDocCount() > this._docs.length;
765
+ if (hasTombstones && cap > 0 && w.getTextUsed() / cap > 0.85) {
766
+ w.compact();
767
+ }
768
+ }
769
+
698
770
  /** Load and initialise the main WASM module. Must be called before any other method. */
699
771
  async init(): Promise<void> {
700
772
  const url = await this._resolveWasmUrl();
@@ -754,29 +826,28 @@ export class AlbexEngine {
754
826
  // as an asset reference. They copy the .wasm to the output directory and
755
827
  // rewrite the URL automatically. Consumers who use one of those bundlers
756
828
  // get a working `new AlbexEngine()` with no manual setup.
757
- if (!o.wasmBaseUrl) {
758
- // We can't tier-select with one URL, so fall back to std baseline.
759
- // The integrator who wants tier optimisation must opt in via wasmBaseUrl.
760
- this._tier = 'std';
761
- this._simd = false;
762
- return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
763
- }
764
-
765
- let tier: Tier;
766
- if (o.tier && o.tier !== 'auto') tier = o.tier;
767
- else tier = pickTier(profile);
768
- this._tier = tier;
769
-
829
+ // 0.5.0+: two main binaries only — baseline and SIMD. The tier
830
+ // system is gone (audit 4.1). Selection collapses to a single
831
+ // boolean: SIMD on or off, decided either by the explicit `simd`
832
+ // option or by a runtime probe.
770
833
  const simd = o.simd === 'on'
771
834
  ? true
772
835
  : o.simd === 'off'
773
836
  ? false
774
837
  : !!profile?.wasm.simd;
775
838
  this._simd = simd;
839
+ this._tier = 'std';
840
+
841
+ if (!o.wasmBaseUrl) {
842
+ // Zero-config: bundler resolves the .wasm next to dist/. We only
843
+ // ship the baseline alias (albex_wasm_bg.wasm) inside the npm
844
+ // package; integrators who want SIMD must serve both binaries
845
+ // themselves via `wasmBaseUrl`.
846
+ return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
847
+ }
776
848
 
777
- const suffix = simd ? `${tier}_simd` : tier;
778
849
  const base = o.wasmBaseUrl.replace(/\/+$/, '');
779
- return `${base}/albex_wasm_${suffix}.wasm`;
850
+ return simd ? `${base}/albex_wasm_simd.wasm` : `${base}/albex_wasm.wasm`;
780
851
  }
781
852
 
782
853
  /** The tier that was actually loaded. `null` until `init()` resolves. */
@@ -887,6 +958,35 @@ export class AlbexEngine {
887
958
  }
888
959
  }
889
960
 
961
+ /**
962
+ * Compute the FNV-1a 64-bit content hash of `bytes` via the WASM
963
+ * streaming API. Returns a 16-character hex string identical in shape
964
+ * to what the TS implementation in 0.3.x returned, so all callers
965
+ * stay unchanged. Single source of truth — same hash whether we use
966
+ * it for indexFile dedup, for snapshot v2 persistence, or anywhere
967
+ * else. Large inputs are chunked at FEED_SIZE just like _feedText.
968
+ */
969
+ private _contentHash(bytes: Uint8Array): string {
970
+ const w = this._wasm;
971
+ w.hashBegin();
972
+ for (let i = 0; i < bytes.length; i += FEED_SIZE) {
973
+ const c = bytes.subarray(i, i + FEED_SIZE);
974
+ this._writePad(c);
975
+ w.hashFeed(c.length);
976
+ }
977
+ w.hashFinish();
978
+ // Read 8 result bytes back from scratchpad[0..8].
979
+ const ptr = w.getBuffer(8);
980
+ const out = this._u8(ptr, 8);
981
+ // Big-endian to hex. Same layout as the old hexHi + hexLo output:
982
+ // high u32 first (4 bytes), low u32 second (4 bytes).
983
+ let s = '';
984
+ for (let i = 0; i < 8; i++) {
985
+ s += out[i]!.toString(16).padStart(2, '0');
986
+ }
987
+ return s;
988
+ }
989
+
890
990
  private _feedXmlBytes(xml: Uint8Array, fn: 'feedXmlBytes' | 'feedXlsxBytes'): void {
891
991
  const feeder = this._wasm[fn];
892
992
  for (let i = 0; i < xml.length; i += FEED_SIZE) {
@@ -910,7 +1010,10 @@ export class AlbexEngine {
910
1010
  // called when the user actually drops a PDF — but we record a diagnostic
911
1011
  // so embedders can surface a "this will download ~1 MB" prompt.
912
1012
  if (this._resources?.constrainedNetwork) {
913
- console.info('[albex] downloading PDF WASM (~1 MB) on a constrained network connection');
1013
+ this._diag({
1014
+ kind: 'info', stage: 'network',
1015
+ message: 'Downloading PDF WASM (~1 MB) on a constrained network connection',
1016
+ });
914
1017
  }
915
1018
  const res = await fetch(pdfUrl);
916
1019
  if (!res.ok) throw new AlbexInitError(`Failed to fetch PDF WASM: ${res.status}`);
@@ -1046,21 +1149,14 @@ export class AlbexEngine {
1046
1149
  this._wasm.flushParagraph();
1047
1150
  }
1048
1151
 
1049
- // Hybrid OCR pass: when the OCR module is wired with
1050
- // `alwaysExtractEmbeddedImages: true`, also walk every page for
1051
- // embedded images and OCR them on top of the vector text.
1052
- //
1053
- // We always log the decision so users debugging "why isn't OCR
1054
- // firing on my hybrid PDF" can see which precondition failed.
1055
- const hybridOn = !!this.ocrConfig?.alwaysExtractEmbeddedImages;
1056
- const hasOcr = !!this.ocrImage;
1057
- const binSupportsImages = typeof pw.extractPageImages === 'function'
1058
- && typeof pw.getPageCount === 'function';
1059
- console.log(`[albex] hybrid OCR decision: ocrImage=${hasOcr} ocrConfig.alwaysExtractEmbeddedImages=${hybridOn} binarySupportsImages=${binSupportsImages}`);
1060
-
1061
- if (hasOcr && hybridOn && binSupportsImages) {
1152
+ // Hybrid OCR pass: when the OCR adapter is wired with
1153
+ // `options.alwaysExtractEmbeddedImages: true`, also walk every page
1154
+ // for embedded images and OCR them on top of the vector text.
1155
+ if (this._ocrAdapter
1156
+ && this._ocrAdapter.options?.alwaysExtractEmbeddedImages
1157
+ && typeof pw.extractPageImages === 'function'
1158
+ && typeof pw.getPageCount === 'function') {
1062
1159
  const totalPages = pw.getPageCount();
1063
- console.log(`[albex] hybrid OCR pass starting over ${totalPages} page(s)`);
1064
1160
  for (let p = 0; p < totalPages; p++) {
1065
1161
  const ocrText = await this._ocrPageEmbeddedImages(pw, p);
1066
1162
  if (ocrText === null) break; // WASM trapped, stop hybrid pass.
@@ -1148,7 +1244,10 @@ export class AlbexEngine {
1148
1244
  // so `_ensurePdfWasm` re-instantiates on the next call.
1149
1245
  this._pdfWasm = null;
1150
1246
  this._pdfMem = null;
1151
- console.warn(`[albex] PDF image extractor trapped on page ${page + 1}: ${e instanceof Error ? e.message : String(e)}. Stopping OCR.`);
1247
+ this._diag({
1248
+ kind: 'skipped', stage: 'pdf', page: page + 1,
1249
+ message: `PDF image extractor trapped: ${e instanceof Error ? e.message : String(e)}. Remaining pages skipped.`,
1250
+ });
1152
1251
  return null;
1153
1252
  }
1154
1253
  if (imageCount <= 0) return '';
@@ -1174,16 +1273,6 @@ export class AlbexEngine {
1174
1273
  copy.set(new Uint8Array(liveMem.buffer, ptr, len));
1175
1274
  const blob = new Blob([copy.buffer as ArrayBuffer], { type: mime });
1176
1275
 
1177
- // Defensive diagnostics: when an OCR call goes wrong (Tesseract
1178
- // worker abort, malformed JPEG, etc.) the first thing we want to
1179
- // see is whether we even handed it valid image bytes. A real JPEG
1180
- // starts with FF D8 FF (E0 for JFIF, E1 for EXIF). A JPEG2000
1181
- // starts with 00 00 00 0C 6A 50 20 20.
1182
- const magic = Array.from(copy.subarray(0, 4))
1183
- .map(b => b.toString(16).padStart(2, '0'))
1184
- .join(' ');
1185
- console.log(`[albex] OCR page ${page + 1} image ${i + 1}/${imageCount}: kind=${kind} (${mime}) len=${len} bytes magic=${magic}`);
1186
-
1187
1276
  try {
1188
1277
  const { text } = await ocr(blob);
1189
1278
  const trimmed = text?.trim();
@@ -1197,7 +1286,10 @@ export class AlbexEngine {
1197
1286
  // "Aborted(-1)") are also caught here; if they bypass the
1198
1287
  // promise rejection and surface as `uncaught` instead, the
1199
1288
  // demo's window.onerror handler will keep the app alive.
1200
- console.warn(`[albex] OCR failed on page ${page + 1} image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`);
1289
+ this._diag({
1290
+ kind: 'skipped', stage: 'ocr', page: page + 1,
1291
+ message: `OCR failed on image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`,
1292
+ });
1201
1293
  }
1202
1294
  }
1203
1295
 
@@ -1242,7 +1334,10 @@ export class AlbexEngine {
1242
1334
  inPtr = pw.allocInput(bytes.length);
1243
1335
  new Uint8Array(pw.memory.buffer, inPtr, bytes.length).set(bytes);
1244
1336
  } catch (e) {
1245
- console.warn(`[albex] PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`);
1337
+ this._diag({
1338
+ kind: 'skipped', stage: 'pdf',
1339
+ message: `PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`,
1340
+ });
1246
1341
  return null;
1247
1342
  }
1248
1343
 
@@ -1252,7 +1347,10 @@ export class AlbexEngine {
1252
1347
  // first page, no paragraphs are emitted and we end up with 0 chunks.
1253
1348
  this._wasm.setDocumentName(this._writeStr(file.name));
1254
1349
  this._wasm.beginDocument();
1255
- console.info(`[albex] pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf for ${file.name}`);
1350
+ this._diag({
1351
+ kind: 'fallback', stage: 'pdf', file: file.name,
1352
+ message: `pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf`,
1353
+ });
1256
1354
  await this._indexPdfScanned(pw);
1257
1355
  return this._wasm.endDocument();
1258
1356
  }
@@ -1657,6 +1755,10 @@ export class AlbexEngine {
1657
1755
  * Throws for unsupported formats or parse errors.
1658
1756
  */
1659
1757
  async indexFile(file: File): Promise<IndexedDocument> {
1758
+ return this._exclusive(() => this._indexFileInner(file));
1759
+ }
1760
+
1761
+ private async _indexFileInner(file: File): Promise<IndexedDocument> {
1660
1762
  const ext = file.name.split('.').pop()?.toLowerCase() ?? '';
1661
1763
  const indexer = AlbexEngine._INDEXERS[ext];
1662
1764
  if (!indexer) throw new AlbexUnsupportedFormatError(ext);
@@ -1664,7 +1766,7 @@ export class AlbexEngine {
1664
1766
  // Hash the source bytes for idempotency. We always read the bytes once
1665
1767
  // here so the indexer can reuse them — avoids a double File.arrayBuffer().
1666
1768
  const bytes = new Uint8Array(await file.arrayBuffer());
1667
- const hash = contentHash(bytes);
1769
+ const hash = this._contentHash(bytes);
1668
1770
 
1669
1771
  // Idempotency: if a non-deleted doc already has this hash, return it
1670
1772
  // unchanged. Cheap O(N) scan since MAX_DOCS = 128.
@@ -1689,6 +1791,30 @@ export class AlbexEngine {
1689
1791
  }
1690
1792
 
1691
1793
  const chunks = await indexer(this, file, bytes);
1794
+
1795
+ // Capacity check (0.6.0). The WASM pools fill silently and break out of
1796
+ // their ingest loops; getLastIndexOverflow reports which one filled.
1797
+ // Surface a typed error instead of returning a half-indexed document the
1798
+ // caller cannot tell apart from a complete one (audit finding #3).
1799
+ const overflow = w.getLastIndexOverflow();
1800
+ if (overflow !== 0) {
1801
+ const which: AlbexCapacityLimit =
1802
+ (overflow & 1) ? 'chunks' : (overflow & 2) ? 'text'
1803
+ : (overflow & 4) ? 'docs' : 'names';
1804
+ const pools = [
1805
+ overflow & 1 ? 'chunk pool' : '',
1806
+ overflow & 2 ? 'text pool' : '',
1807
+ overflow & 4 ? 'document table' : '',
1808
+ overflow & 8 ? 'name pool' : '',
1809
+ ].filter(Boolean).join(', ');
1810
+ throw new AlbexCapacityError(
1811
+ `Index capacity exceeded while indexing "${file.name}" (${pools} full). ` +
1812
+ `The document was rolled back (not indexed); treat the index as full ` +
1813
+ `(compact(), shard across an AlbexPool, or reset()).`,
1814
+ which,
1815
+ );
1816
+ }
1817
+
1692
1818
  // The new doc occupies slot `docCountBefore`.
1693
1819
  const docId = w.getDocId(docCountBefore);
1694
1820
 
@@ -1713,6 +1839,11 @@ export class AlbexEngine {
1713
1839
  * Returns `true` if a matching document was found and tombstoned.
1714
1840
  */
1715
1841
  removeDocument(id: string): boolean {
1842
+ this._assertIdle('removeDocument');
1843
+ return this._removeDocumentInner(id);
1844
+ }
1845
+
1846
+ private _removeDocumentInner(id: string): boolean {
1716
1847
  const doc = this._docs.find(d => d.name === id || d.contentHash === id);
1717
1848
  if (!doc) return false;
1718
1849
  const ok = this._wasm.removeDocument(doc.docId) === 1;
@@ -1728,12 +1859,15 @@ export class AlbexEngine {
1728
1859
  * idempotency check (so re-indexing the *same* bytes after a remove works).
1729
1860
  */
1730
1861
  async replaceDocument(name: string, newFile: File): Promise<IndexedDocument> {
1731
- this.removeDocument(name);
1732
- // Force a unique-hash path by indexing directly; if the new file happens
1733
- // to hash identically to a still-tracked document, the dedupe in
1734
- // indexFile will return that one. The remove above prevents the
1735
- // common case.
1736
- return this.indexFile(newFile);
1862
+ return this._exclusive(async () => {
1863
+ this._removeDocumentInner(name);
1864
+ // Index directly via the inner path (we already hold the lock).
1865
+ const doc = await this._indexFileInner(newFile);
1866
+ // Repeated replaces leave tombstones in the text pool; reclaim under
1867
+ // pressure so the pool isn't silently exhausted (audit finding #7).
1868
+ this._autoCompactIfNeeded();
1869
+ return doc;
1870
+ });
1737
1871
  }
1738
1872
 
1739
1873
  /**
@@ -1744,6 +1878,7 @@ export class AlbexEngine {
1744
1878
  * references (e.g. in a UI) remain valid.
1745
1879
  */
1746
1880
  compact(): void {
1881
+ this._assertIdle('compact');
1747
1882
  this._wasm.compact();
1748
1883
  }
1749
1884
 
@@ -1757,19 +1892,35 @@ export class AlbexEngine {
1757
1892
  * markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
1758
1893
  */
1759
1894
  search(query: string, opts: SearchOptions = {}): SearchResult[] {
1760
- const parsed = parseQuery(query);
1761
-
1762
- if (parsed.kind === 'or') {
1763
- return this._searchOr(parsed.branches, query, opts);
1895
+ this._assertIdle('search');
1896
+ const w = this._wasm;
1897
+ const ql = this._writeStr(query);
1898
+ const kind = w.prepareQuery(ql);
1899
+ if (kind < 0) return [];
1900
+
1901
+ if (kind === 2) {
1902
+ // OR: iterate branches and merge in TS. WASM stores compiled
1903
+ // branches internally so we never re-tokenize on the host.
1904
+ return this._searchOr(query, opts);
1764
1905
  }
1765
1906
 
1766
- const results = this._runSearch(tokensToWasmQuery(parsed.tokens), query, opts);
1767
-
1768
- if (parsed.kind === 'phrase') {
1769
- return results.filter(r => containsPhrase(r.snippet, parsed.tokens));
1770
- }
1907
+ w.selectQueryBranch(0);
1908
+ // Phrase queries (kind 1) post-filter on adjacency. Pass the tokens down
1909
+ // so the check runs against the FULL chunk text, not a cropped windowed
1910
+ // snippet; otherwise `{ windowed: true }` could drop a valid phrase hit
1911
+ // whose second term fell outside the window (audit finding #7).
1912
+ const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
1913
+ return this._runSearch(query, opts, phraseTokens);
1914
+ }
1771
1915
 
1772
- return results;
1916
+ /** Read the WASM-compiled tokens of branch `i` for phrase post-filter.
1917
+ * The bytes returned are exactly what the WASM tokenizer produced —
1918
+ * no TS re-tokenization. */
1919
+ private _branchTokens(i: number): string[] {
1920
+ const n = this._wasm.getQueryBranchPattern(i);
1921
+ if (n === 0) return [];
1922
+ const pattern = this._readPad(n);
1923
+ return pattern.split(' ').filter(t => t.length > 0);
1773
1924
  }
1774
1925
 
1775
1926
  /**
@@ -1787,34 +1938,42 @@ export class AlbexEngine {
1787
1938
  * Pass `opts.frameBudgetMs` to control the slice size (default 8 ms).
1788
1939
  */
1789
1940
  async *searchCooperative(query: string, opts: SearchOptions = {}): AsyncIterable<SearchResult> {
1790
- const parsed = parseQuery(query);
1941
+ // Collect under the exclusivity lock so no other engine op interleaves at
1942
+ // a slice boundary; the per-slice scheduler yields still happen inside.
1943
+ const results = await this._exclusive(() => this._searchCooperativeCollect(query, opts));
1944
+ for (const r of results) yield r;
1945
+ }
1946
+
1947
+ /** Materialise a cooperative search to a sorted result array. Runs inside
1948
+ * the exclusivity lock. Frame-budget yielding lives in _runSearchBudgeted. */
1949
+ private async _searchCooperativeCollect(query: string, opts: SearchOptions): Promise<SearchResult[]> {
1791
1950
  const budget = opts.frameBudgetMs ?? 8;
1792
1951
  const w = this._wasm;
1793
1952
 
1794
- // OR queries: run each branch as its own resumable search, dedup, sort.
1795
- if (parsed.kind === 'or') {
1953
+ const ql = this._writeStr(query);
1954
+ const kind = w.prepareQuery(ql);
1955
+ if (kind < 0) return [];
1956
+
1957
+ if (kind === 2) {
1958
+ // OR branches — run each as its own resumable search and merge.
1796
1959
  const seen = new Set<string>();
1797
1960
  const all: SearchResult[] = [];
1798
- for (const tokens of parsed.branches) {
1799
- const q = tokensToWasmQuery(tokens);
1800
- if (!q) continue;
1801
- const r = await this._runSearchBudgeted(q, query, opts, budget);
1961
+ const n = w.getQueryBranchCount();
1962
+ for (let i = 0; i < n; i++) {
1963
+ w.selectQueryBranch(i);
1964
+ const r = await this._runSearchBudgeted(query, opts, budget, undefined, i);
1802
1965
  for (const x of r) {
1803
1966
  const key = `${x.documentName}:${x.location}:${x.matchStart}`;
1804
1967
  if (!seen.has(key)) { seen.add(key); all.push(x); }
1805
1968
  }
1806
1969
  }
1807
1970
  all.sort((a, b) => b.score - a.score);
1808
- for (const r of all) yield r;
1809
- return;
1971
+ return all;
1810
1972
  }
1811
1973
 
1812
- const results = await this._runSearchBudgeted(tokensToWasmQuery(parsed.tokens), query, opts, budget);
1813
- const filtered = parsed.kind === 'phrase'
1814
- ? results.filter(r => containsPhrase(r.snippet, parsed.tokens))
1815
- : results;
1816
- for (const r of filtered) yield r;
1817
- void w;
1974
+ w.selectQueryBranch(0);
1975
+ const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
1976
+ return this._runSearchBudgeted(query, opts, budget, phraseTokens, 0);
1818
1977
  }
1819
1978
 
1820
1979
  /**
@@ -1838,14 +1997,19 @@ export class AlbexEngine {
1838
1997
  * may eat the entire budget, which is also fine.
1839
1998
  */
1840
1999
  private async _runSearchBudgeted(
1841
- wasmQuery: string,
1842
2000
  displayQuery: string,
1843
2001
  opts: SearchOptions,
1844
2002
  budgetMs: number,
2003
+ phraseTokens?: string[],
2004
+ branchIdx = 0,
1845
2005
  ): Promise<SearchResult[]> {
1846
2006
  const w = this._wasm;
1847
- const ql = this._writeStr(wasmQuery);
1848
- w.setPattern(ql);
2007
+ // Pattern is already set by the caller via selectQueryBranch(branchIdx).
2008
+ // Snapshot THAT branch's compiled pattern for the GPU pre-filter hash —
2009
+ // not branch 0, which would build the wrong candidate mask for OR
2010
+ // branches and silently drop their hits (audit finding #6).
2011
+ const activePatternLen = w.getQueryBranchPattern(branchIdx);
2012
+ const activePattern = activePatternLen > 0 ? this._readPad(activePatternLen) : '';
1849
2013
 
1850
2014
  // GPU pre-filter (CD1). If enabled AND the corpus is large enough,
1851
2015
  // the GPU computes the candidate bitset and we install it into WASM
@@ -1853,10 +2017,13 @@ export class AlbexEngine {
1853
2017
  // Failure here is silent: we fall back to CPU-only Bloom transparently.
1854
2018
  if (this._shouldEngageGpu()) {
1855
2019
  try {
1856
- await this._gpuPreFilter(wasmQuery);
2020
+ await this._gpuPreFilter(activePattern);
1857
2021
  } catch (e) {
1858
2022
  // Don't let a GPU hiccup kill the search — drop to CPU path.
1859
- console.warn('[albex] GPU pre-filter failed; falling back to CPU:', e);
2023
+ this._diag({
2024
+ kind: 'fallback', stage: 'gpu',
2025
+ message: `GPU pre-filter failed; falling back to CPU: ${e instanceof Error ? e.message : String(e)}`,
2026
+ });
1860
2027
  w.clearCandidateMask();
1861
2028
  }
1862
2029
  }
@@ -1912,18 +2079,30 @@ export class AlbexEngine {
1912
2079
  bitapMatched: w.getStatBitapMatched(),
1913
2080
  };
1914
2081
 
1915
- return this._collectResults(count, opts);
2082
+ return this._collectResults(count, opts, phraseTokens);
1916
2083
  }
1917
2084
 
1918
- /** Materialise results [0..count) into the public SearchResult shape. */
1919
- private _collectResults(count: number, opts: SearchOptions): SearchResult[] {
2085
+ /** Materialise results [0..count) into the public SearchResult shape.
2086
+ * When `phraseTokens` is given, each result is kept only if those tokens
2087
+ * appear adjacently in the FULL chunk text — independent of any display
2088
+ * windowing — so phrase queries stay correct under `{ windowed: true }`. */
2089
+ private _collectResults(count: number, opts: SearchOptions, phraseTokens?: string[]): SearchResult[] {
1920
2090
  const w = this._wasm;
1921
2091
  const windowed = opts.windowed === true;
1922
2092
  const before = opts.before ?? 60;
1923
2093
  const after = opts.after ?? 120;
2094
+ const phraseFilter = phraseTokens && phraseTokens.length > 0 ? phraseTokens : null;
1924
2095
 
1925
2096
  const results: SearchResult[] = [];
1926
2097
  for (let i = 0; i < count; i++) {
2098
+ // Phrase adjacency check against the full chunk text (getSnippet), not
2099
+ // the possibly-cropped display window.
2100
+ if (phraseFilter) {
2101
+ const fl = w.getSnippet(i);
2102
+ const full = fl > 0 ? this._readPad(fl) : '';
2103
+ if (!containsPhrase(full, phraseFilter)) continue;
2104
+ }
2105
+
1927
2106
  const score = w.getResultScore(i);
1928
2107
  const location = w.getResultLocation(i);
1929
2108
  const matchStart = w.getResultStart(i);
@@ -1973,29 +2152,32 @@ export class AlbexEngine {
1973
2152
  return results;
1974
2153
  }
1975
2154
 
1976
- private _searchOr(branches: string[][], rawQuery: string, opts: SearchOptions): SearchResult[] {
2155
+ /** Run all OR branches and merge them, deduplicating by (doc, location, match start). The
2156
+ * branches are already compiled inside the WASM (by prepareQuery); we
2157
+ * iterate them with selectQueryBranch. The "rawQuery" param is kept
2158
+ * only for the lastSearch.query field. */
2159
+ private _searchOr(rawQuery: string, opts: SearchOptions): SearchResult[] {
2160
+ const w = this._wasm;
1977
2161
  const seen = new Set<string>();
1978
2162
  const all: SearchResult[] = [];
1979
-
1980
- for (const tokens of branches) {
1981
- const q = tokensToWasmQuery(tokens);
1982
- if (!q) continue;
1983
- const results = this._runSearch(q, rawQuery, opts);
2163
+ const n = w.getQueryBranchCount();
2164
+ for (let i = 0; i < n; i++) {
2165
+ w.selectQueryBranch(i);
2166
+ const results = this._runSearch(rawQuery, opts);
1984
2167
  for (const r of results) {
1985
2168
  const key = `${r.documentName}:${r.location}:${r.matchStart}`;
1986
2169
  if (!seen.has(key)) { seen.add(key); all.push(r); }
1987
2170
  }
1988
2171
  }
1989
-
1990
- // Re-rank the merged list by score descending.
1991
2172
  all.sort((a, b) => b.score - a.score);
1992
2173
  return all;
1993
2174
  }
1994
2175
 
1995
- private _runSearch(wasmQuery: string, displayQuery: string, opts: SearchOptions): SearchResult[] {
2176
+ /** Execute a single search using whichever query branch is currently
2177
+ * active (set via selectQueryBranch). Returns the materialised
2178
+ * SearchResult[]. Caller is responsible for activating a branch first. */
2179
+ private _runSearch(displayQuery: string, opts: SearchOptions, phraseTokens?: string[]): SearchResult[] {
1996
2180
  const w = this._wasm;
1997
- const ql = this._writeStr(wasmQuery);
1998
- w.setPattern(ql);
1999
2181
 
2000
2182
  const t0 = performance.now();
2001
2183
  const count = w.search();
@@ -2010,63 +2192,7 @@ export class AlbexEngine {
2010
2192
  bitapMatched: w.getStatBitapMatched(),
2011
2193
  };
2012
2194
 
2013
- const windowed = opts.windowed === true;
2014
- const before = opts.before ?? 60;
2015
- const after = opts.after ?? 120;
2016
-
2017
- const results: SearchResult[] = [];
2018
- for (let i = 0; i < count; i++) {
2019
- const score = w.getResultScore(i);
2020
- const location = w.getResultLocation(i);
2021
- const matchStart = w.getResultStart(i);
2022
- const matchEnd = w.getResultEnd(i);
2023
- const nl = w.getResultDocName(i);
2024
- const name = nl > 0 ? this._readPad(nl) : '?';
2025
-
2026
- const matchCount = w.getResultMatchCount(i);
2027
- const matches: MatchSpan[] = [];
2028
- for (let k = 0; k < matchCount; k++) {
2029
- matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
2030
- }
2031
- if (matches.length === 0) {
2032
- matches.push({ start: matchStart, end: matchEnd });
2033
- }
2034
-
2035
- let snippet: string;
2036
- let primaryStart = matchStart;
2037
- let primaryEnd = matchEnd;
2038
- let adjustedMatches: MatchSpan[] = matches;
2039
-
2040
- if (windowed) {
2041
- const sl = w.getSnippetWindow(i, before, after);
2042
- snippet = sl > 0 ? this._readPad(sl) : '';
2043
- const offset = w.getSnippetWindowOffset();
2044
- // Spans came back chunk-relative; shift them into window-relative.
2045
- // Account for leading "... " prefix when present.
2046
- const leadingPrefix = offset > 0 ? 4 : 0;
2047
- const shift = leadingPrefix - offset;
2048
- adjustedMatches = matches.map(m => ({
2049
- start: Math.max(0, m.start + shift),
2050
- end: Math.max(0, m.end + shift),
2051
- }));
2052
- primaryStart = adjustedMatches[0]?.start ?? 0;
2053
- primaryEnd = adjustedMatches[0]?.end ?? 0;
2054
- } else {
2055
- const sl = w.getSnippet(i);
2056
- snippet = sl > 0 ? this._readPad(sl) : '';
2057
- }
2058
-
2059
- results.push({
2060
- documentName: name,
2061
- location,
2062
- score,
2063
- snippet,
2064
- matchStart: primaryStart,
2065
- matchEnd: primaryEnd,
2066
- matches: adjustedMatches,
2067
- });
2068
- }
2069
- return results;
2195
+ return this._collectResults(count, opts, phraseTokens);
2070
2196
  }
2071
2197
 
2072
2198
  /** Returns current engine statistics. */
@@ -2127,9 +2253,93 @@ export class AlbexEngine {
2127
2253
 
2128
2254
  /** Full reset — clears all indexed documents and chunks. */
2129
2255
  reset(): void {
2256
+ this._assertIdle('reset');
2257
+ this._resetInner();
2258
+ }
2259
+
2260
+ private _resetInner(): void {
2130
2261
  this._wasm.init();
2131
2262
  this._docs = [];
2132
2263
  this._lastSearch = null;
2264
+ this._diagnostics = [];
2265
+ }
2266
+
2267
+ /**
2268
+ * Drain and return the diagnostics collected since the last call (or
2269
+ * since the engine was created). Use this to surface recoverable
2270
+ * issues to the caller after `indexFile`, `load`, or any other
2271
+ * operation that may run into a "best-effort" path.
2272
+ *
2273
+ * Example diagnostics:
2274
+ * - `{kind:'fallback', stage:'pdf', message:'pdf-extract crashed,
2275
+ * attempting OCR-only fallback', file:'invoice.pdf'}`
2276
+ * - `{kind:'skipped', stage:'ocr', message:'Tesseract abort on page
2277
+ * 3 image 1; remaining images on this page skipped', file:'...',
2278
+ * page:3}`
2279
+ * - `{kind:'fallback', stage:'gpu', message:'GPU pre-filter failed,
2280
+ * using CPU'}`
2281
+ *
2282
+ * The buffer is cleared on each call; callers should consume the
2283
+ * returned array immediately (e.g. log to their telemetry, surface
2284
+ * a UI banner). After `reset()` the buffer is also cleared.
2285
+ */
2286
+ takeDiagnostics(): AlbexDiagnostic[] {
2287
+ const out = this._diagnostics;
2288
+ this._diagnostics = [];
2289
+ return out;
2290
+ }
2291
+
2292
+ /** Internal: record a diagnostic. Capped at 256 to bound memory. */
2293
+ private _diag(entry: AlbexDiagnostic): void {
2294
+ if (this._diagnostics.length >= 256) return;
2295
+ this._diagnostics.push(entry);
2296
+ }
2297
+
2298
+ /**
2299
+ * Install an OCR adapter. Returns a handle whose `dispose()` removes the
2300
+ * adapter from the engine.
2301
+ *
2302
+ * The contract: the adapter must provide `recognize(image, opts)` that
2303
+ * returns `Promise<OcrAttachedResult>`. The engine validates the
2304
+ * contract at attach time and refuses adapters that don't expose a
2305
+ * `recognize` function. Only one adapter can be attached at a time; a
2306
+ * second call to `attachOcr` while one is active throws — the caller
2307
+ * must dispose the previous one first.
2308
+ *
2309
+ * @example
2310
+ * ```ts
2311
+ * import { enableOcr } from '@albex/ocr';
2312
+ * const handle = enableOcr(engine); // internally calls attachOcr
2313
+ * // ... later ...
2314
+ * await handle.dispose();
2315
+ * ```
2316
+ *
2317
+ * Direct use without the companion package:
2318
+ * ```ts
2319
+ * const handle = engine.attachOcr({
2320
+ * recognize: async (blob) => myCustomOcr(blob),
2321
+ * options: { alwaysExtractEmbeddedImages: false },
2322
+ * });
2323
+ * ```
2324
+ */
2325
+ attachOcr(adapter: OcrAdapter): OcrHandle {
2326
+ if (this._ocrAdapter) {
2327
+ throw new AlbexInitError(
2328
+ 'OCR adapter already attached. Call dispose() on the previous handle before attaching a new one.',
2329
+ );
2330
+ }
2331
+ if (typeof adapter?.recognize !== 'function') {
2332
+ throw new AlbexInitError(
2333
+ 'attachOcr requires an adapter with a recognize(image, opts) function.',
2334
+ );
2335
+ }
2336
+ this._ocrAdapter = adapter;
2337
+ return {
2338
+ dispose: async () => {
2339
+ // Idempotent: a double dispose is a no-op rather than a throw.
2340
+ if (this._ocrAdapter === adapter) this._ocrAdapter = null;
2341
+ },
2342
+ };
2133
2343
  }
2134
2344
 
2135
2345
  // ── Persistence ───────────────────────────────────────────────────────────
@@ -2142,6 +2352,10 @@ export class AlbexEngine {
2142
2352
  * state in roughly O(total bytes), bypassing re-parsing.
2143
2353
  */
2144
2354
  async save(name: string): Promise<void> {
2355
+ return this._exclusive(() => this._saveInner(name));
2356
+ }
2357
+
2358
+ private async _saveInner(name: string): Promise<void> {
2145
2359
  const w = this._wasm;
2146
2360
  const total = w.snapshotSize();
2147
2361
  if (total === 0) {
@@ -2168,6 +2382,10 @@ export class AlbexEngine {
2168
2382
  * header (wrong magic, version, or struct sizes).
2169
2383
  */
2170
2384
  async load(name: string): Promise<boolean> {
2385
+ return this._exclusive(() => this._loadInner(name));
2386
+ }
2387
+
2388
+ private async _loadInner(name: string): Promise<boolean> {
2171
2389
  const bytes = await loadPersisted(name);
2172
2390
  if (!bytes || bytes.length === 0) return false;
2173
2391
 
@@ -2188,6 +2406,17 @@ export class AlbexEngine {
2188
2406
  off += n;
2189
2407
  }
2190
2408
 
2409
+ // Commit. For v3 this is the atomic apply step (state is untouched
2410
+ // until now); a failure here leaves the previous index intact so the
2411
+ // caller can keep using the engine. For v1/v2 snapshots `restoreCommit`
2412
+ // is a no-op that returns 1 (those formats applied in-place during
2413
+ // restoreFeed and have no rollback to offer). Older binaries that
2414
+ // predate v3 do not export `restoreCommit` — in that case we treat
2415
+ // the load as already committed (hence the `typeof` feature-detect below).
2416
+ if (typeof w.restoreCommit === 'function') {
2417
+ if (w.restoreCommit() !== 1) return false;
2418
+ }
2419
+
2191
2420
  // Rebuild _docs metadata from the restored WASM tables.
2192
2421
  //
2193
2422
  // What's available after a restore:
@@ -2250,9 +2479,11 @@ export class AlbexEngine {
2250
2479
  * empty. Returns whether a load actually happened.
2251
2480
  */
2252
2481
  async loadOrInit(name: string): Promise<boolean> {
2253
- const loaded = await this.load(name);
2254
- if (!loaded) this.reset();
2255
- return loaded;
2482
+ return this._exclusive(async () => {
2483
+ const loaded = await this._loadInner(name);
2484
+ if (!loaded) this._resetInner();
2485
+ return loaded;
2486
+ });
2256
2487
  }
2257
2488
 
2258
2489
  /** Delete a previously persisted snapshot. */
@@ -2277,7 +2508,8 @@ export class AlbexEngine {
2277
2508
  * WASM instance and its (typically 20 MB) backing memory.
2278
2509
  */
2279
2510
  [Symbol.dispose](): void {
2280
- this.reset();
2511
+ // Terminal: bypass the idle guard — disposing mid-operation is allowed.
2512
+ this._resetInner();
2281
2513
  this._unsubscribeResources?.();
2282
2514
  this._unsubscribeResources = null;
2283
2515
  this._gpu?.destroy();