albex 0.1.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/CHANGELOG.md +416 -0
  2. package/README.md +244 -112
  3. package/dist/albex-worker.d.ts +70 -0
  4. package/dist/albex-worker.d.ts.map +1 -0
  5. package/dist/albex-worker.js +153 -0
  6. package/dist/albex-worker.js.map +1 -0
  7. package/dist/albex.d.ts +508 -6
  8. package/dist/albex.d.ts.map +1 -1
  9. package/dist/albex.js +1911 -141
  10. package/dist/albex.js.map +1 -1
  11. package/dist/errors.d.ts +52 -0
  12. package/dist/errors.d.ts.map +1 -0
  13. package/dist/errors.js +66 -0
  14. package/dist/errors.js.map +1 -0
  15. package/dist/gpu/bloom-runtime.d.ts +60 -0
  16. package/dist/gpu/bloom-runtime.d.ts.map +1 -0
  17. package/dist/gpu/bloom-runtime.js +176 -0
  18. package/dist/gpu/bloom-runtime.js.map +1 -0
  19. package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
  20. package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
  21. package/dist/gpu/bloom-shader.wgsl.js +49 -0
  22. package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
  23. package/dist/persistence.d.ts +21 -0
  24. package/dist/persistence.d.ts.map +1 -0
  25. package/dist/persistence.js +174 -0
  26. package/dist/persistence.js.map +1 -0
  27. package/dist/pool/coordinator.d.ts +98 -0
  28. package/dist/pool/coordinator.d.ts.map +1 -0
  29. package/dist/pool/coordinator.js +247 -0
  30. package/dist/pool/coordinator.js.map +1 -0
  31. package/dist/profile.d.ts +100 -0
  32. package/dist/profile.d.ts.map +1 -0
  33. package/dist/profile.js +200 -0
  34. package/dist/profile.js.map +1 -0
  35. package/dist/resource-manager.d.ts +56 -0
  36. package/dist/resource-manager.d.ts.map +1 -0
  37. package/dist/resource-manager.js +138 -0
  38. package/dist/resource-manager.js.map +1 -0
  39. package/dist/tiered-store.d.ts +98 -0
  40. package/dist/tiered-store.d.ts.map +1 -0
  41. package/dist/tiered-store.js +238 -0
  42. package/dist/tiered-store.js.map +1 -0
  43. package/dist/wasm-bindings.d.ts +180 -0
  44. package/dist/wasm-bindings.d.ts.map +1 -0
  45. package/dist/wasm-bindings.js +128 -0
  46. package/dist/wasm-bindings.js.map +1 -0
  47. package/dist/worker-protocol.d.ts +86 -0
  48. package/dist/worker-protocol.d.ts.map +1 -0
  49. package/dist/worker-protocol.js +20 -0
  50. package/dist/worker-protocol.js.map +1 -0
  51. package/dist/worker-runtime.d.ts +14 -0
  52. package/dist/worker-runtime.d.ts.map +1 -0
  53. package/dist/worker-runtime.js +109 -0
  54. package/dist/worker-runtime.js.map +1 -0
  55. package/package.json +60 -13
  56. package/src/albex-worker.ts +187 -0
  57. package/src/albex.ts +2136 -189
  58. package/src/errors.ts +76 -0
  59. package/src/gpu/bloom-runtime.ts +229 -0
  60. package/src/gpu/bloom-shader.wgsl.ts +48 -0
  61. package/src/persistence.ts +175 -0
  62. package/src/pool/coordinator.ts +324 -0
  63. package/src/profile.ts +280 -0
  64. package/src/resource-manager.ts +167 -0
  65. package/src/tiered-store.ts +259 -0
  66. package/src/wasm-bindings.ts +349 -0
  67. package/src/worker-protocol.ts +48 -0
  68. package/src/worker-runtime.ts +106 -0
  69. package/wasm/pkg/albex_pdf.wasm +0 -0
  70. package/wasm/pkg/albex_wasm.wasm +0 -0
  71. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  72. package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/src/albex.ts CHANGED
@@ -14,15 +14,113 @@
14
14
  * ```
15
15
  */
16
16
 
17
+ import {
18
+ AlbexWasmExports,
19
+ AlbexPdfExports,
20
+ asAlbexExports,
21
+ asAlbexPdfExports,
22
+ } from './wasm-bindings.js';
23
+ import {
24
+ AlbexError,
25
+ AlbexInitError,
26
+ AlbexUnsupportedFormatError,
27
+ AlbexParseError,
28
+ AlbexCapacityError,
29
+ type AlbexCapacityLimit,
30
+ } from './errors.js';
31
+ import {
32
+ savePersisted,
33
+ loadPersisted,
34
+ deletePersisted,
35
+ listPersisted,
36
+ } from './persistence.js';
37
+ import { detectProfile, shouldUseGpu, type Tier, type DeviceProfile } from './profile.js';
38
+ import { getResourceManager, type ResourceState } from './resource-manager.js';
39
+ import { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
40
+
41
+ export {
42
+ AlbexError,
43
+ AlbexInitError,
44
+ AlbexUnsupportedFormatError,
45
+ AlbexParseError,
46
+ AlbexCapacityError,
47
+ } from './errors.js';
48
+ export { listPersisted, deletePersisted } from './persistence.js';
49
+ export { detectProfile, pickTier, pickWorkerCount, shouldUseGpu } from './profile.js';
50
+ export type { DeviceProfile, Tier } from './profile.js';
51
+ export { getResourceManager } from './resource-manager.js';
52
+ export type { ResourceState, ResourceMode } from './resource-manager.js';
53
+ export { AlbexPool } from './pool/coordinator.js';
54
+ export type { AlbexPoolOptions } from './pool/coordinator.js';
55
+ export { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
56
+ export { TieredStore } from './tiered-store.js';
57
+ export type { TieredStoreOptions } from './tiered-store.js';
58
+
59
+ // ─────────────────────────────────────────────────────────────────────────────
60
+ // Deprecation warnings — one-shot, fire-and-forget
61
+ // ─────────────────────────────────────────────────────────────────────────────
62
+
63
/** One-shot latch: becomes true once the deprecation notice has been printed. */
let _searchStreamWarned = false;

/**
 * Emit the `searchStream` deprecation notice via `console.warn`, at most once
 * for the lifetime of this module instance. Later calls are no-ops.
 *
 * The old name suggested incremental streaming, which the implementation never
 * provided; the notice points callers at `searchCooperative` and states that
 * the alias is scheduled for removal in 0.4.0.
 */
function warnSearchStreamDeprecated(): void {
  if (!_searchStreamWarned) {
    // Flip the latch before logging so the notice can never be printed twice.
    _searchStreamWarned = true;
    const notice = [
      '[albex] `searchStream` is deprecated; rename to `searchCooperative`. ',
      'The method does not stream incremental results — it yields to the ',
      'scheduler between slices and returns a batch. The alias will be ',
      'removed in 0.4.0.',
    ].join('');
    console.warn(notice);
  }
}
76
+
17
77
  // ─────────────────────────────────────────────────────────────────────────────
18
78
  // Public types
19
79
  // ─────────────────────────────────────────────────────────────────────────────
20
80
 
21
81
  export interface AlbexOptions {
22
- /** URL to albex_wasm_bg.wasm (required). */
23
- wasmUrl: string;
82
+ /**
83
+ * Explicit URL to the main WASM binary.
84
+ *
85
+ * If you want automatic tier selection (mini/std/pro chosen from
86
+ * `deviceMemory`), pass `wasmBaseUrl` instead — the engine will fetch
87
+ * `albex_wasm_<tier>.wasm` from that directory.
88
+ */
89
+ wasmUrl?: string;
90
+ /**
91
+ * Base directory containing tiered binaries (`albex_wasm_mini.wasm`,
92
+ * `_std.wasm`, `_pro.wasm`). Used when `wasmUrl` is omitted.
93
+ */
94
+ wasmBaseUrl?: string;
24
95
  /** URL to albex_pdf.wasm. Required only if you call indexFile() with PDFs. */
25
96
  pdfWasmUrl?: string;
97
+ /**
98
+ * Formerly overrode tier auto-detection (`'auto'`, or an explicit tier when
99
+ * the constraints of the target environment were known). No longer honoured.
100
+ *
101
+ * @deprecated Removed in 0.5.0. Albex no longer has capacity tiers;
102
+ * pass `'auto'` or omit. Other values are accepted and ignored. */
103
+ tier?: 'auto' | 'mini' | 'std' | 'pro';
104
+ /**
105
+ * SIMD selection. When `'auto'` (default), Albex probes for v128 support
106
+ * and fetches the `_simd.wasm` variant when available. Pass `'off'` to
107
+ * stay on the baseline binary even on capable hosts (useful for
108
+ * regression testing or to align all clients in a corporate deployment).
109
+ */
110
+ simd?: 'auto' | 'on' | 'off';
111
+ /**
112
+ * GPU acceleration policy for the Bloom pre-filter.
113
+ * `'auto'` — enable when WebGPU is available AND chunk count is large
114
+ * `'on'` — force enable (fall back to CPU silently if GPU fails)
115
+ * `'off'` — never use GPU
116
+ * Default: `'auto'`.
117
+ */
118
+ gpu?: 'auto' | 'on' | 'off';
119
+ /**
120
+ * Minimum chunk count before `gpu: 'auto'` engages. Below this threshold
121
+ * the upload + dispatch overhead is bigger than the speedup. Default: 20_000.
122
+ */
123
+ gpuThreshold?: number;
26
124
  }
27
125
 
28
126
  export interface IndexedDocument {
@@ -31,6 +129,17 @@ export interface IndexedDocument {
31
129
  chunks: number;
32
130
  indexTimeMs: number;
33
131
  textBytes: number;
132
+ /** WASM-side stable identifier (also acts as a slot index after compact). */
133
+ docId: number;
134
+ /** 64-bit FNV-1a hex of the source file bytes. Stable across runs. */
135
+ contentHash: string;
136
+ }
137
+
138
+ export interface MatchSpan {
139
+ /** Byte offset within `snippet` where this matched token begins. */
140
+ start: number;
141
+ /** Byte offset within `snippet` where this matched token ends (exclusive). */
142
+ end: number;
34
143
  }
35
144
 
36
145
  export interface SearchResult {
@@ -39,12 +148,38 @@ export interface SearchResult {
39
148
  location: number;
40
149
  /** Relevance score 0–1000. */
41
150
  score: number;
42
- /** Raw snippet text (original, with accents). */
151
+ /** Snippet text. With `windowed` search options this is a substring with
152
+ * ASCII ellipsis sentinels (`"... "` / `" ..."`) the UI should render
153
+ * as `…`. Without windowing, the full chunk text. */
43
154
  snippet: string;
44
- /** Match start byte offset within snippet. */
155
+ /** Start offset of the primary token match (kept for backwards compatibility); equals `matches[0].start`. `matchEnd` likewise equals `matches[0].end`. */
45
156
  matchStart: number;
46
- /** Match end byte offset within snippet (exclusive). */
47
157
  matchEnd: number;
158
+ /** All matched token spans within `snippet`, in query order. Length 1–4. */
159
+ matches: MatchSpan[];
160
+ }
161
+
162
+ /**
163
+ * Options that change how snippets are produced. Both fields are optional.
164
+ *
165
+ * `windowed` — when true, return a cropped window around the primary
166
+ * match instead of the full chunk text.
167
+ * `before/after` — bytes of context to include on each side of the primary
168
+ * match. Defaults: 60 before, 120 after.
169
+ */
170
+ export interface SearchOptions {
171
+ windowed?: boolean;
172
+ before?: number;
173
+ after?: number;
174
+ /**
175
+ * Frame budget in milliseconds for `searchCooperative`. The engine
176
+ * processes chunks until the budget is exhausted, then yields to the
177
+ * event loop via `scheduler.yield()` (or `requestAnimationFrame`
178
+ * fallback) before resuming. Lower = smoother UI; higher = lower latency.
179
+ *
180
+ * Default: 8 ms (half a 60 fps frame). Ignored by synchronous `search()`.
181
+ */
182
+ frameBudgetMs?: number;
48
183
  }
49
184
 
50
185
  export interface EngineStats {
@@ -53,6 +188,12 @@ export interface EngineStats {
53
188
  textUsed: number;
54
189
  textCapacity: number;
55
190
  wasmMemoryBytes: number;
191
+ /** Tier loaded at init time (mini/std/pro). */
192
+ tier: Tier | null;
193
+ /** Compile-time chunk capacity for the loaded tier. */
194
+ maxChunks: number;
195
+ /** Compile-time document capacity for the loaded tier. */
196
+ maxDocs: number;
56
197
  }
57
198
 
58
199
  export interface SearchStats {
@@ -64,57 +205,57 @@ export interface SearchStats {
64
205
  bitapMatched: number;
65
206
  }
66
207
 
67
- // ─────────────────────────────────────────────────────────────────────────────
68
- // Query parsing
69
- // ─────────────────────────────────────────────────────────────────────────────
70
-
71
- type SimpleQuery = { kind: 'simple'; tokens: string[] };
72
- type PhraseQuery = { kind: 'phrase'; tokens: string[]; raw: string };
73
- type OrQuery = { kind: 'or'; branches: string[][] };
74
- type ParsedQuery = SimpleQuery | PhraseQuery | OrQuery;
75
-
76
- function tokenize(q: string): string[] {
77
- return q.trim().split(/\s+/).filter(t => t.length > 0);
78
- }
79
-
80
- function parseQuery(q: string): ParsedQuery {
81
- const trimmed = q.trim();
82
-
83
- // OR: "term1 | term2" or "phrase one | phrase two"
84
- if (trimmed.includes('|')) {
85
- const branches = trimmed.split('|')
86
- .map(p => tokenize(p.replace(/"/g, '')))
87
- .filter(b => b.length > 0);
88
- return { kind: 'or', branches };
89
- }
90
-
91
- // Phrase: "exact phrase here"
92
- const phraseMatch = /^"(.+)"$/.exec(trimmed);
93
- if (phraseMatch) {
94
- const inner = phraseMatch[1] ?? '';
95
- const tokens = tokenize(inner);
96
- return { kind: 'phrase', tokens, raw: inner };
97
- }
98
-
99
- return { kind: 'simple', tokens: tokenize(trimmed) };
100
- }
101
-
102
208
  /**
103
- * Reconstruct a WASM-compatible query string from parsed tokens.
104
- * The WASM engine accepts up to 4 space-separated tokens (AND semantics).
209
+ * One structured warning recorded by the engine during indexFile or
210
+ * load. Replaces the pre-0.5.0 pattern of scattered `console.warn`
211
+ * calls. Inspect via `engine.takeDiagnostics()` after the operation.
105
212
  */
106
- function tokensToWasmQuery(tokens: string[]): string {
107
- return tokens.slice(0, 4).join(' ');
213
+ export interface AlbexDiagnostic {
214
+ /** Coarse kind. `'recovered'` means the engine handled the issue and
215
+ * kept going; `'skipped'` means content was dropped; `'fallback'` means
216
+ * an alternate code path was used (e.g. lopdf after pdf-extract trap). */
217
+ kind: 'recovered' | 'skipped' | 'fallback' | 'info';
218
+ /** Where in the pipeline this happened. One of the fixed stage tags below. */
219
+ stage: 'pdf' | 'ocr' | 'gpu' | 'persistence' | 'network';
220
+ /** Human-readable message safe to surface in a UI. */
221
+ message: string;
222
+ /** Optional file the issue belongs to. */
223
+ file?: string;
224
+ /** Optional page number (1-based for PDFs). */
225
+ page?: number;
108
226
  }
109
227
 
110
228
  // ─────────────────────────────────────────────────────────────────────────────
111
- // Phrase post-filter
229
+ // Query parsing (WASM-side as of 0.5.0)
112
230
  // ─────────────────────────────────────────────────────────────────────────────
231
+ //
232
+ // Pre-0.5.0 this file owned parseQuery + tokenize. That created two
233
+ // truths about what a "token" was: one in TS for the query, one in Rust
234
+ // for the indexed text. The audit flagged this as the biggest divergence
235
+ // in the wrapper.
236
+ //
237
+ // 0.5.0 moves parseQuery/tokenize/tokensToWasmQuery to Rust. The TS
238
+ // dispatcher reduces to:
239
+ //
240
+ // 1. Write the raw UTF-8 query bytes to the scratchpad.
241
+ // 2. Call prepareQuery(len). Get back the kind (simple/phrase/or).
242
+ // 3. For OR: iterate getQueryBranchCount() branches, calling
243
+ // selectQueryBranch(i) + search() for each, then merge in TS.
244
+ // For simple/phrase: selectQueryBranch(0) + search().
245
+ // 4. For phrase: post-filter the snippets with containsPhrase().
246
+ //
247
+ // containsPhrase stays in TS because it operates on snippet text already
248
+ // produced by the WASM, not on the query. It is not a tokenizer.
113
249
 
114
250
  /**
115
- * Returns true if `snippet` contains the phrase formed by `tokens` in order,
116
- * with at most `maxGap` characters between consecutive tokens.
117
- * Comparison is case- and accent-insensitive.
251
+ * Phrase post-filter. Returns true if `snippet` contains the phrase
252
+ * formed by `tokens` in order, with at most `maxGap` characters between
253
+ * consecutive tokens. Comparison is case- and accent-insensitive.
254
+ *
255
+ * The tokens come from the WASM-compiled pattern of a phrase branch,
256
+ * not from a TS re-tokenization of the query, so there is no
257
+ * tokenization divergence: WASM said "these are the tokens", we just
258
+ * check adjacency in the snippet.
118
259
  */
119
260
  function containsPhrase(snippet: string, tokens: string[], maxGap = 30): boolean {
120
261
  const norm = (s: string): string =>
@@ -147,7 +288,7 @@ function zipCentralDir(bytes: Uint8Array): { v: DataView; cdOff: number; cdN: nu
147
288
  const v = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
148
289
  let p = bytes.length - 22;
149
290
  while (p >= 0 && v.getUint32(p, true) !== 0x06054b50) p--;
150
- if (p < 0) throw new Error('Not a ZIP file');
291
+ if (p < 0) throw new AlbexParseError('zip', 'Not a ZIP file (no EOCD record)');
151
292
  return { v, cdOff: v.getUint32(p + 16, true), cdN: v.getUint16(p + 10, true) };
152
293
  }
153
294
 
@@ -176,7 +317,7 @@ async function findZipEntry(bytes: Uint8Array, name: string): Promise<Uint8Array
176
317
  }
177
318
  cp += 46 + nl + xl + cl;
178
319
  }
179
- throw new Error(`Entry "${name}" not found in ZIP`);
320
+ throw new AlbexParseError('zip', `Entry "${name}" not found in ZIP`);
180
321
  }
181
322
 
182
323
  async function decompEntry(bytes: Uint8Array, v: DataView, off: number, compSize: number): Promise<Uint8Array> {
@@ -202,7 +343,7 @@ async function decompEntry(bytes: Uint8Array, v: DataView, off: number, compSize
202
343
  for (const c of chunks) { out.set(c, o); o += c.length; }
203
344
  return out;
204
345
  }
205
- throw new Error(`Unsupported ZIP compression method ${meth}`);
346
+ throw new AlbexParseError('zip', `Unsupported ZIP compression method ${meth}`);
206
347
  }
207
348
 
208
349
  // ─────────────────────────────────────────────────────────────────────────────
@@ -211,64 +352,577 @@ async function decompEntry(bytes: Uint8Array, v: DataView, off: number, compSize
211
352
 
212
353
  const FEED_SIZE = 32_768; // 32 KB — fits in 64 KB scratchpad
213
354
 
355
+ // ─────────────────────────────────────────────────────────────────────────────
356
+ // Content hash — FNV-1a 64-bit
357
+ // ─────────────────────────────────────────────────────────────────────────────
358
+
359
+ /*
360
+ * Contract of the content hash (now the `contentHash` method on AlbexEngine): a
361
+ * 64-bit FNV-1a of `bytes` as a 16-char hex string. Non-cryptographic; chosen because:
362
+ * - it needs zero dependencies,
363
+ * - it is fast on small/medium blobs (~100 MB/s in modern JS),
364
+ * - 64 bits is enough to deduplicate documents in a 128-doc library with
365
+ * vanishing collision probability.
366
+ *
367
+ * The result is stable across runs and engines, so it can be persisted in
368
+ * snapshots without versioning concerns.
369
+ */
370
/**
 * Compute the 64-bit Bloom signature of a query string for the GPU pre-filter.
 *
 * Must stay in sync with `BloomFilter::from_text` and `fold_utf8_char` in
 * `core/src/bloom.rs`. NOTE(review): the Rust side is not visible from this
 * file — confirm the two agree whenever either changes.
 *
 * Folding: the query is lowercased, NFKD-decomposed, and stripped of combining
 * marks (U+0300–U+036F), so `é` contributes the same bit as `e`.
 *
 * Bit selection: every remaining ASCII code unit (`< 0x80`) other than the
 * space separator sets bit `code & 0x3F`. That includes digits and ASCII
 * punctuation, not only letters. Code units `>= 0x80` that survive folding
 * (e.g. `ß`) contribute nothing.
 *
 * @param query Raw query text; may be empty.
 * @returns The OR of all contributed bits (a value below 2^64); `0n` when no
 *          character contributes.
 */
function computePatternBloom(query: string): bigint {
  // Escaped range rather than raw combining characters: a literal U+0300/U+036F
  // inside a character class is invisible in most editors and is silently
  // altered by tools that normalize source text.
  const folded = query
    .toLowerCase()
    .normalize('NFKD')
    .replace(/[\u0300-\u036f]/g, '');

  let bits = 0n;
  for (let i = 0; i < folded.length; i++) {
    const code = folded.charCodeAt(i);
    // Space is the token separator; non-ASCII leftovers carry no bit.
    if (code === 0x20 || code >= 0x80) continue;
    bits |= 1n << BigInt(code & 0x3f);
  }
  return bits;
}
398
+
399
+ // Note: `contentHash` is implemented as a method on AlbexEngine below
400
+ // (it needs access to the WASM scratchpad). The standalone TS reference
401
+ // implementation that used to live here was removed in 0.4.0 — the
402
+ // canonical hash now lives in wasm/src/lib.rs::hashBytes so there is
403
+ // exactly one definition of "the content hash of these bytes".
404
+
405
/**
 * Convert a 16-hex-char content hash into the 8 raw bytes handed to
 * `setDocumentContentHash`.
 *
 * Byte `k` of the result is parsed from hex characters `2k..2k+1`, so the
 * leftmost hex pair (the most significant digits) lands at offset 0 and a hex
 * dump of the bytes reads the same as the input string. Per the original
 * note, this is also the order used by the snapshot format; the exact order
 * is immaterial as long as encode and decode agree.
 *
 * Pairs that are missing or not valid hex parse to `NaN`, which a
 * `Uint8Array` stores as 0 — so `''` produces eight zero bytes.
 *
 * @param hex Hex string; only the first 16 characters are read.
 * @returns A fresh 8-byte array.
 */
function hashHexToBytes(hex: string): Uint8Array {
  const raw = new Uint8Array(8);
  let cursor = 0;
  for (let slot = 0; slot < 8; slot += 1) {
    const pair = hex.slice(cursor, cursor + 2);
    raw[slot] = parseInt(pair, 16);
    cursor += 2;
  }
  return raw;
}
420
+
421
/**
 * Windows-1252 → Unicode for the 0x80–0x9F block, used by the RTF parser for
 * `\'XX` escapes (RTF defaults to cp1252 for high-ANSI characters).
 *
 * This block is the only place cp1252 differs from Latin-1: Microsoft filled
 * the otherwise control-only range with curly quotes, dashes, the Euro sign
 * and so on. Bytes with no entry here (0x81, 0x8D, 0x8F, 0x90, 0x9D) have no
 * mapping in this table.
 */
const _CP1252_HIGH: Record<number, string> = {
  0x80: '€',
  0x82: '‚',
  0x83: 'ƒ',
  0x84: '„',
  0x85: '…',
  0x86: '†',
  0x87: '‡',
  0x88: 'ˆ',
  0x89: '‰',
  0x8A: 'Š',
  0x8B: '‹',
  0x8C: 'Œ',
  0x8E: 'Ž',
  0x91: '‘',
  0x92: '’',
  0x93: '“',
  0x94: '”',
  0x95: '•',
  0x96: '–',
  0x97: '—',
  0x98: '˜',
  0x99: '™',
  0x9A: 'š',
  0x9B: '›',
  0x9C: 'œ',
  0x9E: 'ž',
  0x9F: 'Ÿ',
};

/**
 * Map one Windows-1252 byte to its Unicode character.
 *
 * Outside 0x80–0x9F the byte value is used directly as the code unit (cp1252
 * equals Latin-1 there, which equals Unicode below U+0100). Inside that block
 * the table above is consulted; unmapped bytes yield the empty string.
 *
 * @param byte Byte value taken from an RTF `\'XX` escape.
 * @returns A one-character string, or `''` for an unmapped byte.
 */
function rtfCp1252ToChar(byte: number): string {
  const inRemappedBlock = !(byte < 0x80) && !(byte >= 0xA0);
  if (!inRemappedBlock) return String.fromCharCode(byte);
  const mapped = _CP1252_HIGH[byte];
  return mapped ?? '';
}
444
+
445
/**
 * Apply an entity's Content-Transfer-Encoding to its body.
 *
 * Recognises `base64` and `quoted-printable`. Every other value — `7bit`,
 * `8bit`, a missing header, or anything unrecognised — is returned unchanged:
 * indexing something marginally useful beats dropping the body entirely.
 *
 * The header value is trimmed and lowercased before comparison. The mechanism
 * token is case-insensitive, and a value extracted from a CRLF-terminated
 * header line can carry a trailing `\r` or padding; without the trim such a
 * body would fall through to pass-through and be indexed as raw encoded text.
 *
 * @param headersBlock Raw header block of the entity.
 * @param body         Raw (still encoded) entity body.
 * @param header       Lookup helper returning the value of the named header
 *                     within `block` (presumably `''` when absent — confirm
 *                     against the caller).
 * @returns The decoded body, or `body` itself for pass-through encodings.
 */
function decodeEmlBody(
  headersBlock: string,
  body: string,
  header: (block: string, name: string) => string,
): string {
  const encoding = header(headersBlock, 'Content-Transfer-Encoding').trim().toLowerCase();
  switch (encoding) {
    case 'base64':
      return decodeBase64Utf8(body);
    case 'quoted-printable':
      return decodeQuotedPrintable(body);
    default:
      return body;
  }
}
461
+
462
/**
 * Decode a base64 body and interpret the decoded bytes as text via the
 * module-level `_dec` decoder (defined elsewhere in this file; presumably a
 * UTF-8 `TextDecoder` — confirm). Used by the EML parser for
 * `Content-Transfer-Encoding: base64`.
 *
 * All whitespace (including the line breaks base64 bodies are wrapped with) is
 * removed before decoding. An empty or whitespace-only body yields `''`. If
 * anything throws — typically `atob` rejecting malformed input — the original
 * `body` is returned so the caller can still index *something*.
 *
 * @param body Base64 text, possibly wrapped across lines.
 * @returns The decoded text, `''` for blank input, or `body` on failure.
 */
function decodeBase64Utf8(body: string): string {
  try {
    const compact = body.replace(/\s+/g, '');
    if (compact.length === 0) return '';
    // `atob` yields a "binary string": one char per decoded byte, each with a
    // code unit in 0..255. Lift those into real bytes so multi-byte sequences
    // are decoded as a unit rather than char by char.
    const binary = atob(compact);
    const raw = Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
    return _dec.decode(raw);
  } catch {
    return body;
  }
}
484
+
485
/**
 * Decode a quoted-printable body (RFC 2045 §6.7).
 *
 * Handles:
 *  - `=XX` hex escapes, collected as raw bytes so that several escapes in a
 *    row can form one multi-byte character when decoded;
 *  - soft line breaks, i.e. `=` immediately followed by a line ending, which
 *    produce nothing. Both the canonical `=\r\n` (mail uses CRLF) and a bare
 *    `=\n` (already-normalised text) are accepted;
 *  - a malformed `=` (not followed by two hex digits or a line ending), which
 *    is kept as a literal `=`.
 *
 * All other characters pass through as their low byte. A non-ASCII character
 * in the source is not strictly valid QP and is passed through best-effort.
 *
 * The collected bytes are decoded with the module-level `_dec` decoder
 * (defined elsewhere in this file; presumably a UTF-8 `TextDecoder` —
 * confirm). If decoding throws, the original `body` is returned.
 *
 * @param body Quoted-printable encoded text.
 * @returns The decoded text, or `body` unchanged if decoding fails.
 */
function decodeQuotedPrintable(body: string): string {
  const hexPair = /^[0-9A-Fa-f]{2}$/;
  const bytes: number[] = [];

  for (let i = 0; i < body.length; i++) {
    const code = body.charCodeAt(i);
    if (code !== 0x3d) {
      // Plain character: keep the low byte (exact for ASCII).
      bytes.push(code & 0xff);
      continue;
    }

    // Soft line break: `=` followed by LF or by CRLF emits nothing.
    const next = body[i + 1];
    if (next === '\n') { i += 1; continue; }
    if (next === '\r' && body[i + 2] === '\n') { i += 2; continue; }

    // `=XX` hex escape.
    const pair = body.slice(i + 1, i + 3);
    if (hexPair.test(pair)) {
      bytes.push(parseInt(pair, 16));
      i += 2;
      continue;
    }

    // Malformed escape: keep the literal `=` and carry on with what follows.
    bytes.push(0x3d);
  }

  try {
    return _dec.decode(new Uint8Array(bytes));
  } catch {
    return body;
  }
}
521
+
522
/**
 * Inverse of `hashHexToBytes`: render the first 8 bytes of `bytes` as a
 * 16-char lowercase hex string, most significant pair first.
 *
 * Eight zero bytes mean "no hash known" and are rendered as `''` rather than
 * sixteen zeros.
 *
 * @param bytes Array holding the hash; only indices 0..7 are read.
 * @returns 16 hex characters, or `''` when all eight bytes are zero.
 */
function hashBytesToHex(bytes: Uint8Array): string {
  let hex = '';
  let sawNonZero = false;
  for (let k = 0; k < 8; k += 1) {
    const value = bytes[k]!;
    if (value !== 0) sawNonZero = true;
    hex += value.toString(16).padStart(2, '0');
  }
  return sawNonZero ? hex : '';
}
535
+
214
536
  // ─────────────────────────────────────────────────────────────────────────────
215
537
  // PDF WASM imports shim
216
538
  // ─────────────────────────────────────────────────────────────────────────────
217
539
 
218
- function makePdfWasmImports(getPdfMem: () => WebAssembly.Memory): WebAssembly.Imports {
540
/**
 * Build the import object for `albex_pdf.wasm` from the imports the compiled
 * module actually declares.
 *
 * The PDF wasm pulls `wasm-bindgen` transitively through `getrandom`. Its
 * import names embed a build-time hash, e.g.
 *   __wbg_getRandomValues_3f44b700395062e5
 * Hardcoding that hash bound the loader to one exact build of the .wasm — any
 * version bump of getrandom / lopdf / wasm-bindgen broke instantiation.
 *
 * Imports are therefore resolved by field-name *prefix*; the module name is
 * used only to group the resulting entries:
 *   - `__wbg_getRandomValues*` / `__wbg_crypto*`        → fill wasm memory with
 *                                                        `crypto.getRandomValues`
 *   - `__wbindgen_describe*` / `__wbindgen_throw*`      → no-op
 *   - `__wbindgen_object_drop_ref`                      → heap-slot recycler
 *   - `__wbindgen_externref_table_grow`                 → heap grower (returns old length)
 *   - `__wbindgen_externref_table_set_null`             → heap nuller
 *
 * Any other import is rejected: this function throws, so dependency drift
 * surfaces when the import object is built instead of on some arbitrary
 * execution path inside PDF extraction.
 *
 * NOTE(review): `__wbindgen_throw*` is stubbed as a no-op on the stated
 * assumption that it is never invoked in the paths this package exercises.
 * If it ever is, the error message the wasm side tried to raise is lost —
 * confirm that assumption still holds when upgrading wasm-bindgen.
 *
 * @param module    Compiled `albex_pdf.wasm` module whose imports are inspected.
 * @param getPdfMem Accessor for the PDF instance's memory. It is called lazily,
 *                  each time the random-bytes import runs, so it may return
 *                  `null` while this function executes.
 * @returns An import object with exactly one entry per import the module declares.
 * @throws AlbexInitError if the module declares an import not covered above.
 *         The random-bytes import itself throws a plain `Error` if it is
 *         invoked while `getPdfMem()` still returns `null`.
 */
function makePdfWasmImports(
  module: WebAssembly.Module,
  getPdfMem: () => WebAssembly.Memory | null,
): WebAssembly.Imports {
  // Minimal stand-in for the wasm-bindgen externref heap.
  const heap: unknown[] = [];
  let freeIdx = -1;

  const fillRandom = (ptr: number, len: number): void => {
    const mem = getPdfMem();
    if (!mem) throw new Error('PDF WASM memory not initialised');
    // `>>> 0` reinterprets wasm's signed i32 arguments as unsigned.
    crypto.getRandomValues(new Uint8Array(mem.buffer, ptr >>> 0, len >>> 0));
  };

  const resolveByName = (modName: string, name: string): unknown => {
    // Random-byte providers (any hashed variant).
    if (name.startsWith('__wbg_getRandomValues') || name.startsWith('__wbg_crypto')) {
      return fillRandom;
    }
    // Diagnostic / introspection hooks — see NOTE(review) in the header.
    if (name.startsWith('__wbindgen_describe') || name.startsWith('__wbindgen_throw')) {
      return () => {};
    }
    // Externref-heap management used by the wasm-bindgen runtime.
    switch (name) {
      case '__wbindgen_object_drop_ref':
        return (idx: number) => { heap[idx] = freeIdx; freeIdx = idx; };
      case '__wbindgen_externref_table_grow':
        return (delta: number) => {
          const old = heap.length;
          for (let i = 0; i < delta; i++) heap.push(undefined);
          return old;
        };
      case '__wbindgen_externref_table_set_null':
        return (idx: number) => { heap[idx] = undefined; };
    }
    // Unknown import — fail fast. An import we don't recognise means the
    // wasm-bindgen / lopdf / getrandom dependency graph has drifted from the
    // prefixes this loader is written to satisfy. Accepting the module would
    // defer the failure to an arbitrary execution path, typically deep inside
    // extractPdf(); refusing here surfaces the version skew at boot.
    throw new AlbexInitError(
      `Unknown PDF WASM import "${modName}.${name}". ` +
      `The albex_pdf.wasm binary was probably built with a newer Rust ` +
      `toolchain or dependency graph than this loader was written for. ` +
      `Rebuild with 'npm run build:pdf-wasm' or open an issue.`,
    );
  };

  const imports: Record<string, Record<string, unknown>> = {};
  for (const { module: modName, name } of WebAssembly.Module.imports(module)) {
    // Create the per-module bucket on first sight of that module name.
    const bucket = imports[modName] ?? (imports[modName] = {});
    bucket[name] = resolveByName(modName, name);
  }
  return imports as WebAssembly.Imports;
}
242
621
 
243
622
  // ─────────────────────────────────────────────────────────────────────────────
244
623
  // AlbexEngine
245
624
  // ─────────────────────────────────────────────────────────────────────────────
246
625
 
626
+ /**
627
+ * Result shape returned by an attached OCR module. Kept structural here so
628
+ * the main package has no runtime dependency on `@albex/ocr` — the optional
629
+ * shape is just a contract.
630
+ */
631
+ export interface OcrAttachedResult {
632
+ text: string;
633
+ confidence: number;
634
+ timeMs: number;
635
+ }
636
+
637
+ export interface OcrAttachedOptions {
638
+ lang?: string;
639
+ hint?: string;
640
+ }
641
+
642
+ /**
643
+ * Contract the engine accepts from an OCR plugin. `@albex/ocr` is the
644
+ * canonical implementation, but any module that satisfies this interface
645
+ * can be attached via `engine.attachOcr(adapter)`.
646
+ */
647
+ export interface OcrAdapter {
648
+ /** Invoked by the engine to OCR a single image. Receives whatever the
649
+ * caller passes (Blob, ArrayBuffer, etc.); the adapter is responsible
650
+ * for accepting that input. Must return text + confidence. */
651
+ recognize(image: unknown, opts?: OcrAttachedOptions): Promise<OcrAttachedResult>;
652
+
653
+ /** Engine-side switches the adapter wants honoured. The only one
654
+ * defined today is `alwaysExtractEmbeddedImages`, which turns on the
655
+ * hybrid PDF OCR pass. New flags can be added without breaking the
656
+ * adapter interface. */
657
+ options?: {
658
+ /** When true, every PDF (native or scanned) is walked for embedded
659
+ * images and each qualifying image is sent to `recognize`. Off by
660
+ * default to keep performance predictable on native PDFs. */
661
+ alwaysExtractEmbeddedImages?: boolean;
662
+ };
663
+ }
664
+
665
+ /** Returned by `attachOcr`. Holds the lifecycle handles for the plugin.
666
+ * Calling `dispose()` removes the adapter from the engine; subsequent
667
+ * `engine.ocrImage` access returns `undefined` again. */
668
+ export interface OcrHandle {
669
+ /** Detach the plugin and tear down any resources it holds. After this,
670
+ * the engine reverts to "no OCR" — scanned PDFs go back to registering
671
+ * with zero chunks. */
672
+ dispose(): Promise<void>;
673
+ }
674
+
247
675
  export class AlbexEngine {
248
676
  // ── main WASM ──
249
- private _wasm!: WebAssembly.Exports;
677
+ private _wasm!: AlbexWasmExports;
250
678
  private _mem!: WebAssembly.Memory;
251
679
 
680
+ /**
681
+ * OCR entry point installed by `@albex/ocr::enableOcr(engine)`. Undefined
682
+ * when the OCR module has not been wired. The main `albex` package has no
683
+ * runtime dependency on OCR — this is a structural slot that the optional
684
+ * companion package fills.
685
+ */
686
+ /**
687
+ * Public OCR entry point. Forwards to the attached OCR adapter installed
688
+ * via `attachOcr()`. Reading this property is a feature-detect for
689
+ * integrators: `if (engine.ocrImage) { ... OCR available ... }`. Writing
690
+ * to it directly is no longer supported in 0.5.0+ — use `attachOcr`.
691
+ */
692
+ get ocrImage(): ((image: unknown, opts?: OcrAttachedOptions) => Promise<OcrAttachedResult>) | undefined {
693
+ return this._ocrAdapter?.recognize.bind(this._ocrAdapter); // bound so class-based adapters keep their `this` when the function is called detached; still `undefined` when no adapter is attached
694
+ }
695
+
696
+ /** Private adapter slot. Holds the OCR plugin contract installed by
697
+ * `attachOcr()`. The engine reads `recognize` and `options` here; the
698
+ * caller never gets a reference to this object directly. */
699
+ private _ocrAdapter: OcrAdapter | null = null;
700
+
252
701
  // ── PDF WASM (lazy) ──
253
- private _pdfWasm: WebAssembly.Exports | null = null;
702
+ private _pdfWasm: AlbexPdfExports | null = null;
254
703
  private _pdfMem: WebAssembly.Memory | null = null;
255
704
 
256
705
  private _docs: IndexedDocument[] = [];
257
706
  private _lastSearch: SearchStats | null = null;
707
+ /** Structured diagnostics collected during the most recent operation.
708
+ * Drained by `takeDiagnostics()`. Capped at 256 entries to avoid
709
+ * unbounded memory growth in pathological cases (very corrupted
710
+ * corpora producing thousands of recovery warnings). */
711
+ private _diagnostics: AlbexDiagnostic[] = [];
712
+ private _tier: Tier | null = null;
713
+ private _simd: boolean = false;
714
+ private _profile: DeviceProfile | null = null;
715
+ private _resources: ResourceState | null = null;
716
+ private _gpu: BloomGpu | null = null;
717
+ private _gpuChunkCountUploaded = 0;
718
+ private _unsubscribeResources: (() => void) | null = null;
258
719
  private readonly _opts: AlbexOptions;
259
720
 
721
+ // ── Concurrency guard ──────────────────────────────────────────────────────
722
+ // One WASM instance, global mutable state, async ops that yield to the
723
+ // scheduler between slices. Two overlapping operations corrupt each other
724
+ // (e.g. a fresh searchBegin resets the cursor of an in-flight cooperative
725
+ // search). Async ops serialize through `_opChain`; sync mutators/searches
726
+ // assert the engine is idle (audit 0.6.0, finding #2).
727
+ private _opChain: Promise<unknown> = Promise.resolve();
728
+ private _busy = false;
729
+
260
730
  constructor(opts: AlbexOptions) {
261
731
  this._opts = opts;
262
732
  }
263
733
 
734
+ /** Serialize an async engine operation behind any in-flight one. */
735
+ private _exclusive<T>(fn: () => Promise<T>): Promise<T> {
736
+ const run = this._opChain.then(async () => {
737
+ this._busy = true;
738
+ try { return await fn(); }
739
+ finally { this._busy = false; }
740
+ });
741
+ // Swallow result/error on the chain so one failure can't wedge the queue.
742
+ this._opChain = run.then(() => undefined, () => undefined);
743
+ return run as Promise<T>;
744
+ }
745
+
746
+ /** Guard a synchronous mutator/search: refuse to run mid-async-operation
747
+ * rather than silently corrupt the shared WASM state. */
748
+ private _assertIdle(method: string): void {
749
+ if (this._busy) {
750
+ throw new AlbexError(
751
+ 'busy',
752
+ `${method}() was called while an async engine operation is still ` +
753
+ `running. Await the previous indexFile/save/load/replaceDocument/` +
754
+ `searchCooperative call, or use searchCooperative instead of search().`,
755
+ );
756
+ }
757
+ }
758
+
759
+ /** Compact opportunistically when tombstones pile up under text pressure,
760
+ * so repeated removeDocument/replaceDocument don't exhaust the pool. */
761
+ private _autoCompactIfNeeded(): void {
762
+ const w = this._wasm;
763
+ const cap = w.getTextCapacity();
764
+ const hasTombstones = w.getDocCount() > this._docs.length; // WASM reports more documents than the JS-side list tracks
765
+ if (hasTombstones && cap > 0 && w.getTextUsed() / cap > 0.85) { // `cap > 0` guards the division; compact only above 85% text usage
766
+ w.compact();
767
+ }
768
+ }
769
+
264
770
  /** Load and initialise the main WASM module. Must be called before any other method. */
265
771
  async init(): Promise<void> {
266
- const res = await fetch(this._opts.wasmUrl);
267
- if (!res.ok) throw new Error(`Failed to fetch WASM: ${res.status}`);
772
+ const url = await this._resolveWasmUrl();
773
+ const res = await fetch(url);
774
+ if (!res.ok) throw new AlbexInitError(`Failed to fetch WASM: ${res.status} (${url})`);
268
775
  const { instance } = await WebAssembly.instantiateStreaming(res, {});
269
- this._wasm = instance.exports;
270
- this._mem = instance.exports.memory as WebAssembly.Memory;
271
- (this._wasm.init as Function)();
776
+ this._wasm = asAlbexExports(instance.exports);
777
+ this._mem = this._wasm.memory;
778
+ this._wasm.init();
779
+
780
+ // Subscribe to environmental signals. Cheap and benign in node tests
781
+ // (the manager tolerates missing globals).
782
+ const rm = getResourceManager();
783
+ await rm.start();
784
+ this._resources = rm.state;
785
+ this._unsubscribeResources = rm.on(s => { this._resources = s; });
786
+
787
+ // Lazily initialise the GPU Bloom accelerator. We don't acquire a device
788
+ // here yet — that happens on the first search that crosses the threshold.
789
+ // This keeps cold-start cost the same on GPU and CPU paths.
790
+ if (this._opts.gpu !== 'off') {
791
+ this._gpu = new BloomGpu();
792
+ }
793
+ }
794
+
795
+ /**
796
+ * Decide which `.wasm` binary to fetch. Order of precedence:
797
+ * 1. `opts.wasmUrl` literal → used verbatim.
798
+ * 2. `opts.wasmBaseUrl` → baseline or SIMD binary fetched
799
+ * from that directory.
800
+ * 3. zero-config default → `albex_wasm_bg.wasm` packaged
801
+ * next to this file, resolved
802
+ * via `import.meta.url`.
803
+ *
804
+ * Since 0.5.0 there are only two main binaries (baseline and SIMD), so
805
+ * selection under `wasmBaseUrl` is a single boolean: `opts.simd` set to
806
+ * `'on'` / `'off'` forces it, otherwise the device profile's SIMD probe
807
+ * decides (`albex_wasm_simd.wasm` vs `albex_wasm.wasm`).
808
+ *
809
+ * The zero-config default always loads the baseline binary, because picking
810
+ * a binary at runtime would defeat any bundler's static asset rewriting.
811
+ * Users who want the SIMD build must serve both binaries themselves and
812
+ * pass the directory through `wasmBaseUrl`.
813
+ */
814
+ private async _resolveWasmUrl(): Promise<string> {
815
+ const o = this._opts;
816
+ if (o.wasmUrl) {
817
+ this._profile = await detectProfile();
818
+ return o.wasmUrl;
819
+ }
820
+ // Always cache the profile so GPU/worker decisions later don't re-probe.
821
+ const profile = await detectProfile();
822
+ this._profile = profile;
823
+
824
+ // Path 3: zero-config — bundler-friendly default. `new URL(..., import.meta.url)`
825
+ // is recognised by Vite, Webpack 5+, esbuild, Rollup, Parcel 2 and Next.js
826
+ // as an asset reference. They copy the .wasm to the output directory and
827
+ // rewrite the URL automatically. Consumers who use one of those bundlers
828
+ // get a working `new AlbexEngine()` with no manual setup.
829
+ // 0.5.0+: two main binaries only — baseline and SIMD. The tier
830
+ // system is gone (audit 4.1). Selection collapses to a single
831
+ // boolean: SIMD on or off, decided either by the explicit `simd`
832
+ // option or by a runtime probe.
833
+ const simd = o.simd === 'on'
834
+ ? true
835
+ : o.simd === 'off'
836
+ ? false
837
+ : !!profile?.wasm.simd;
838
+ this._simd = simd;
839
+ this._tier = 'std';
840
+
841
+ if (!o.wasmBaseUrl) {
842
+ // Zero-config: bundler resolves the .wasm next to dist/. We only
843
+ // ship the baseline alias (albex_wasm_bg.wasm) inside the npm
844
+ // package; integrators who want SIMD must serve both binaries
845
+ // themselves via `wasmBaseUrl`.
846
+ return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
847
+ }
848
+
849
+ const base = o.wasmBaseUrl.replace(/\/+$/, '');
850
+ return simd ? `${base}/albex_wasm_simd.wasm` : `${base}/albex_wasm.wasm`;
851
+ }
852
+
853
+ /** The tier that was actually loaded. `null` until `init()` resolves. */
854
+ get tier(): Tier | null { return this._tier; }
855
+
856
+ /** True if the SIMD-accelerated binary was loaded. */
857
+ get simdEnabled(): boolean { return this._simd; }
858
+
859
+ /** True if a WebGPU device is acquired and the next search will use it. */
860
+ get gpuEngaged(): boolean { return !!this._gpu?.available; }
861
+
862
+ // ── GPU acceleration (CD1) ───────────────────────────────────────────────
863
+
864
+ /**
865
+ * Decide whether to use the GPU pre-filter for the upcoming search.
866
+ *
867
+ * Policy:
868
+ * - `gpu: 'off'` → never.
869
+ * - `gpu: 'on'` → always try (still fails over to CPU).
870
+ * - `gpu: 'auto'` (default) → only when WebGPU is available AND
871
+ * chunk count crosses `gpuThreshold`.
872
+ */
873
+ private _shouldEngageGpu(): boolean {
874
+ const o = this._opts;
875
+ if (!this._gpu) return false;
876
+ if (o.gpu === 'off') return false;
877
+ if (o.gpu === 'on') return true;
878
+ if (!this._profile) return false;
879
+ const threshold = o.gpuThreshold ?? 20_000;
880
+ return shouldUseGpu(this._profile, this._wasm.getChunkCount(), threshold);
881
+ }
882
+
883
+ /**
884
+ * Run the GPU Bloom scan and install the resulting candidate bitset into
885
+ * WASM. The next `searchBegin` will see the mask and `searchSlice` will
886
+ * restrict its Bitap pass to those candidates.
887
+ *
888
+ * If the GPU device hasn't been acquired yet, the first call attempts
889
+ * `init()` lazily; if that fails, the call is a no-op and the candidate
890
+ * path is permanently disabled for this engine instance.
891
+ */
892
+ private async _gpuPreFilter(wasmQuery: string): Promise<void> {
893
+ const gpu = this._gpu;
894
+ if (!gpu) return;
895
+ if (!gpu.available) {
896
+ const ok = await gpu.init();
897
+ if (!ok) { this._gpu = null; return; }
898
+ }
899
+
900
+ const chunkCount = this._wasm.getChunkCount();
901
+ if (chunkCount === 0) return;
902
+
903
+ // Upload blooms if the corpus changed. We re-upload everything on any
904
+ // delta; incremental delta-upload is a future optimisation.
905
+ if (chunkCount !== this._gpuChunkCountUploaded) {
906
+ const ptr = this._wasm.getChunksPtr();
907
+ const stride = this._wasm.getChunkStructSize();
908
+ const bytes = new Uint8Array(this._mem.buffer, ptr, chunkCount * stride);
909
+ const blooms = packBloomsFromChunks(bytes, chunkCount);
910
+ gpu.uploadChunkBlooms(blooms, chunkCount);
911
+ this._gpuChunkCountUploaded = chunkCount;
912
+ }
913
+
914
+ // Build the pattern Bloom on the JS side: same hash as Rust
915
+ // (`c & 0x3F` after accent-folding), aggregated across all tokens.
916
+ const patternBloom = computePatternBloom(wasmQuery);
917
+ const passes = await gpu.scan(
918
+ Number(patternBloom & 0xffffffffn),
919
+ Number((patternBloom >> 32n) & 0xffffffffn),
920
+ );
921
+
922
+ // Push the bitset back into WASM via the scratchpad.
923
+ const passBytes = new Uint8Array(passes.buffer, passes.byteOffset, passes.byteLength);
924
+ this._writePad(passBytes);
925
+ this._wasm.setCandidateMask(passBytes.byteLength);
272
926
  }
273
927
 
274
928
  // ── Internal helpers ──────────────────────────────────────────────────────
@@ -278,8 +932,8 @@ export class AlbexEngine {
278
932
  }
279
933
 
280
934
  private _writePad(b: Uint8Array): number {
281
- const ptr = (this._wasm.getBuffer as Function)(b.length) as number;
282
- if (!ptr) throw new Error('Scratchpad too small for this chunk');
935
+ const ptr = this._wasm.getBuffer(b.length);
936
+ if (!ptr) throw new AlbexCapacityError(`Scratchpad too small for ${b.length} bytes`);
283
937
  this._u8(ptr, b.length).set(b);
284
938
  return ptr;
285
939
  }
@@ -291,7 +945,7 @@ export class AlbexEngine {
291
945
  }
292
946
 
293
947
  private _readPad(n: number): string {
294
- const ptr = (this._wasm.getBuffer as Function)(0) as number;
948
+ const ptr = this._wasm.getBuffer(0);
295
949
  return _dec.decode(this._u8(ptr, n));
296
950
  }
297
951
 
@@ -300,15 +954,45 @@ export class AlbexEngine {
300
954
  for (let i = 0; i < b.length; i += FEED_SIZE) {
301
955
  const c = b.subarray(i, i + FEED_SIZE);
302
956
  this._writePad(c);
303
- (this._wasm.feedText as Function)(c.length);
957
+ this._wasm.feedText(c.length);
304
958
  }
305
959
  }
306
960
 
961
+ /**
962
+ * Compute the FNV-1a 64-bit content hash of `bytes` via the WASM
963
+ * streaming API. Returns a 16-character hex string identical in shape
964
+ * to what the TS implementation in 0.3.x returned, so all callers
965
+ * stay unchanged. Single source of truth — same hash whether we use
966
+ * it for indexFile dedup, for snapshot v2 persistence, or anywhere
967
+ * else. Large inputs are chunked at FEED_SIZE just like _feedText.
968
+ */
969
+ private _contentHash(bytes: Uint8Array): string {
970
+ const w = this._wasm;
971
+ w.hashBegin();
972
+ for (let i = 0; i < bytes.length; i += FEED_SIZE) {
973
+ const c = bytes.subarray(i, i + FEED_SIZE);
974
+ this._writePad(c);
975
+ w.hashFeed(c.length);
976
+ }
977
+ w.hashFinish();
978
+ // Read 8 result bytes back from scratchpad[0..8].
979
+ const ptr = w.getBuffer(8);
980
+ const out = this._u8(ptr, 8);
981
+ // Big-endian to hex. Same layout as the old hexHi + hexLo output:
982
+ // high u32 first (4 bytes), low u32 second (4 bytes).
983
+ let s = '';
984
+ for (let i = 0; i < 8; i++) {
985
+ s += out[i]!.toString(16).padStart(2, '0');
986
+ }
987
+ return s;
988
+ }
989
+
307
990
  private _feedXmlBytes(xml: Uint8Array, fn: 'feedXmlBytes' | 'feedXlsxBytes'): void {
991
+ const feeder = this._wasm[fn];
308
992
  for (let i = 0; i < xml.length; i += FEED_SIZE) {
309
993
  const c = xml.subarray(i, i + FEED_SIZE);
310
994
  this._writePad(c);
311
- (this._wasm[fn] as Function)(c.length);
995
+ feeder(c.length);
312
996
  }
313
997
  }
314
998
 
@@ -316,30 +1000,45 @@ export class AlbexEngine {
316
1000
 
317
1001
  private async _ensurePdfWasm(): Promise<void> {
318
1002
  if (this._pdfWasm) return;
319
- if (!this._opts.pdfWasmUrl) throw new Error('pdfWasmUrl not set in AlbexOptions');
320
- const res = await fetch(this._opts.pdfWasmUrl);
321
- if (!res.ok) throw new Error(`Failed to fetch PDF WASM: ${res.status}`);
322
- const imports = makePdfWasmImports(() => this._pdfMem!);
323
- const { instance } = await WebAssembly.instantiateStreaming(res, imports);
324
- this._pdfWasm = instance.exports;
325
- this._pdfMem = instance.exports.memory as WebAssembly.Memory;
1003
+ // Zero-config default: resolve relative to this module so bundlers copy
1004
+ // the .wasm to the output automatically. Override with `opts.pdfWasmUrl`
1005
+ // when serving from a separate CDN.
1006
+ const pdfUrl = this._opts.pdfWasmUrl
1007
+ ?? new URL('../wasm/pkg/albex_pdf.wasm', import.meta.url).href;
1008
+ // Network politeness: on constrained connections (slow-2g/2g/saveData)
1009
+ // we still fetch on explicit user request — `_ensurePdfWasm` is only
1010
+ // called when the user actually drops a PDF — but we emit an `info`
1011
+ // diagnostic so embedders can surface a "this will download ~1 MB" prompt.
1012
+ if (this._resources?.constrainedNetwork) {
1013
+ this._diag({
1014
+ kind: 'info', stage: 'network',
1015
+ message: 'Downloading PDF WASM (~1 MB) on a constrained network connection',
1016
+ });
1017
+ }
1018
+ const res = await fetch(pdfUrl);
1019
+ if (!res.ok) throw new AlbexInitError(`Failed to fetch PDF WASM: ${res.status}`);
1020
+ // Compile first so we can inspect the module's required imports and
1021
+ // resolve mangled wasm-bindgen names by prefix rather than by hash.
1022
+ const module = await WebAssembly.compileStreaming(res);
1023
+ const imports = makePdfWasmImports(module, () => this._pdfMem);
1024
+ const instance = await WebAssembly.instantiate(module, imports);
1025
+ this._pdfWasm = asAlbexPdfExports(instance.exports);
1026
+ this._pdfMem = this._pdfWasm.memory;
326
1027
  }
327
1028
 
328
1029
  // ── Indexers ──────────────────────────────────────────────────────────────
329
1030
 
330
- private async _indexDocx(file: File): Promise<number> {
331
- const bytes = new Uint8Array(await file.arrayBuffer());
332
- const xml = await findZipEntry(bytes, 'word/document.xml');
333
- (this._wasm.setDocumentName as Function)(this._writeStr(file.name));
334
- (this._wasm.beginDocument as Function)();
1031
+ private async _indexDocx(file: File, bytes: Uint8Array): Promise<number> {
1032
+ const xml = await findZipEntry(bytes, 'word/document.xml');
1033
+ this._wasm.setDocumentName(this._writeStr(file.name));
1034
+ this._wasm.beginDocument();
335
1035
  this._feedXmlBytes(xml, 'feedXmlBytes');
336
- return (this._wasm.endDocument as Function)() as number;
1036
+ return this._wasm.endDocument();
337
1037
  }
338
1038
 
339
- private async _indexXlsx(file: File): Promise<number> {
340
- const bytes = new Uint8Array(await file.arrayBuffer());
341
- (this._wasm.setDocumentName as Function)(this._writeStr(file.name));
342
- (this._wasm.beginXlsx as Function)();
1039
+ private async _indexXlsx(file: File, bytes: Uint8Array): Promise<number> {
1040
+ this._wasm.setDocumentName(this._writeStr(file.name));
1041
+ this._wasm.beginXlsx();
343
1042
 
344
1043
  try {
345
1044
  const xml = await findZipEntry(bytes, 'xl/sharedStrings.xml');
@@ -354,80 +1053,699 @@ export class AlbexEngine {
354
1053
  } catch { /* skip corrupt/missing sheet */ }
355
1054
  }
356
1055
 
357
- return (this._wasm.endDocument as Function)() as number;
1056
+ return this._wasm.endDocument();
358
1057
  }
359
1058
 
360
- private async _indexPdf(file: File): Promise<number> {
1059
+ private async _indexPdf(file: File, bytes: Uint8Array): Promise<number> {
361
1060
  await this._ensurePdfWasm();
362
- const pw = this._pdfWasm!;
363
- const pm = this._pdfMem!;
364
- const bytes = new Uint8Array(await file.arrayBuffer());
365
-
366
- const inPtr = (pw.allocInput as Function)(bytes.length) as number;
1061
+ let pw = this._pdfWasm;
1062
+ let pm = this._pdfMem;
1063
+ if (!pw || !pm) throw new AlbexInitError('PDF WASM not initialised');
1064
+
1065
+ // Reserve input buffer and copy bytes. allocInput may trigger a
1066
+ // memory.grow inside the PDF module; the previous pm.buffer would
1067
+ // become detached. Refresh the memory reference before constructing
1068
+ // the view to be safe.
1069
+ const inPtr = pw.allocInput(bytes.length);
1070
+ pm = pw.memory;
367
1071
  new Uint8Array(pm.buffer, inPtr, bytes.length).set(bytes);
368
- const pageCount = (pw.extractPdf as Function)(bytes.length) as number;
369
1072
 
370
- (this._wasm.setDocumentName as Function)(this._writeStr(file.name));
371
- (this._wasm.beginDocument as Function)();
1073
+ // extractPdf can panic inside pdf-extract/lopdf for PDFs that other
1074
+ // tools accept (encrypted streams without password, exotic font
1075
+ // dictionaries, malformed cross-reference tables, etc.). The crate
1076
+ // is built with panic="abort" (required on wasm32-unknown-unknown
1077
+ // — no unwinding), so the panic surfaces as a WASM `unreachable`
1078
+ // trap and the module instance becomes unusable.
1079
+ //
1080
+ // Recovery strategy when this happens:
1081
+ // 1. Discard the poisoned instance.
1082
+ // 2. If OCR is wired AND the rebuilt binary supports image
1083
+ // extraction, re-instantiate, reload the input bytes, and try
1084
+ // the lopdf-only image-extraction path. lopdf is a separate
1085
+ // parser from pdf-extract's text codec — there are real PDFs
1086
+ // that pdf-extract trips on but lopdf walks fine, and we can
1087
+ // recover the page images even when we cannot recover the
1088
+ // vector text.
1089
+ // 3. If OCR isn't wired (or the recovery also fails), surface a
1090
+ // helpful AlbexParseError that points the user at the fix.
1091
+ let pageCount: number;
1092
+ try {
1093
+ pageCount = pw.extractPdf(bytes.length);
1094
+ } catch (e) {
1095
+ this._pdfWasm = null;
1096
+ this._pdfMem = null;
1097
+ const msg = e instanceof Error ? e.message : String(e);
1098
+
1099
+ // Try the OCR fallback before giving up.
1100
+ if (this.ocrImage) {
1101
+ const recovered = await this._indexPdfViaImagesOnly(file, bytes, msg);
1102
+ if (recovered !== null) return recovered;
1103
+ }
1104
+
1105
+ throw new AlbexParseError(
1106
+ 'pdf',
1107
+ `PDF text extractor crashed (${msg}). ` +
1108
+ (this.ocrImage
1109
+ ? 'OCR fallback also could not recover any content from this file.'
1110
+ : 'Enable OCR via @albex/ocr to attempt image-based extraction as a fallback.'),
1111
+ );
1112
+ }
1113
+ // Refresh memory once more — extractPdf can grow it too.
1114
+ pm = pw.memory;
1115
+
1116
+ this._wasm.setDocumentName(this._writeStr(file.name));
1117
+ this._wasm.beginDocument();
372
1118
 
373
1119
  if (pageCount === -2) {
374
- // Image-only PDF register doc with zero chunks.
375
- return (this._wasm.endDocument as Function)() as number;
1120
+ // Image-only (scanned) PDF. If OCR is wired AND the PDF binary
1121
+ // supports image extraction, fall through to the scanned-PDF path.
1122
+ // Otherwise keep today's behaviour: register the doc with 0 chunks
1123
+ // so the user sees the file in the index but searches won't hit it.
1124
+ const supportsImages = typeof pw.extractPageImages === 'function'
1125
+ && typeof pw.getPageCount === 'function';
1126
+ if (this.ocrImage && supportsImages) {
1127
+ await this._indexPdfScanned(pw);
1128
+ }
1129
+ return this._wasm.endDocument();
376
1130
  }
377
1131
  if (pageCount < 0) {
378
- const errLen = (pw.getErrorLen as Function)() as number;
379
- const errPtr = (pw.getErrorPtr as Function)() as number;
1132
+ const errLen = pw.getErrorLen();
1133
+ const errPtr = pw.getErrorPtr();
380
1134
  const msg = errLen > 0
381
- ? new TextDecoder().decode(new Uint8Array(pm.buffer, errPtr, errLen))
1135
+ ? _dec.decode(new Uint8Array(pm.buffer, errPtr, errLen))
382
1136
  : 'PDF parse error';
383
- throw new Error(msg);
1137
+ throw new AlbexParseError('pdf', msg);
384
1138
  }
385
1139
 
386
1140
  for (let p = 0; p < pageCount; p++) {
387
- const len = (pw.getPageLen as Function)(p) as number;
1141
+ const len = pw.getPageLen(p);
388
1142
  if (!len) continue;
389
- const text = new TextDecoder('utf-8').decode(
390
- new Uint8Array(pm.buffer, (pw.getPagePtr as Function)(p) as number, len)
391
- );
1143
+ // Re-read memory each iteration — feedText writes into the main
1144
+ // WASM, but reading the PDF page pointers requires the live PDF
1145
+ // memory which may have been grown by intermediate calls.
1146
+ const liveMem = pw.memory;
1147
+ const text = _dec.decode(new Uint8Array(liveMem.buffer, pw.getPagePtr(p), len));
392
1148
  this._feedText(text);
393
- (this._wasm.flushParagraph as Function)();
1149
+ this._wasm.flushParagraph();
1150
+ }
1151
+
1152
+ // Hybrid OCR pass: when the OCR adapter is wired with
1153
+ // `options.alwaysExtractEmbeddedImages: true`, also walk every page
1154
+ // for embedded images and OCR them on top of the vector text.
1155
+ if (this._ocrAdapter
1156
+ && this._ocrAdapter.options?.alwaysExtractEmbeddedImages
1157
+ && typeof pw.extractPageImages === 'function'
1158
+ && typeof pw.getPageCount === 'function') {
1159
+ const totalPages = pw.getPageCount();
1160
+ for (let p = 0; p < totalPages; p++) {
1161
+ const ocrText = await this._ocrPageEmbeddedImages(pw, p);
1162
+ if (ocrText === null) break; // WASM trapped, stop hybrid pass.
1163
+ if (ocrText) {
1164
+ this._feedText(ocrText);
1165
+ this._wasm.flushParagraph();
1166
+ }
1167
+ }
394
1168
  }
395
1169
 
396
- return (this._wasm.endDocument as Function)() as number;
1170
+ return this._wasm.endDocument();
1171
+ }
1172
+
1173
+ /**
1174
+ * Scanned-PDF OCR fallback. Called from `_indexPdf` when `extractPdf`
1175
+ * returns `-2` (image-only PDF) AND `@albex/ocr` has been wired via
1176
+ * `enableOcr(engine)`.
1177
+ *
1178
+ * Walks every page of the PDF, extracts embedded JPEG / JPEG2000 image
1179
+ * XObjects, runs each through `engine.ocrImage`, and feeds the recognised
1180
+ * text into the index — one paragraph per page so search snippets stay
1181
+ * tied to the page they came from.
1182
+ *
1183
+ * Failure modes handled here (none re-thrown — the goal is best-effort
1184
+ * indexing, not all-or-nothing):
1185
+ *
1186
+ * * A page's `extractPageImages` traps the WASM instance: the instance
1187
+ * is discarded so the next PDF starts fresh, and we stop iterating
1188
+ * (no more pages can be read from a poisoned instance). The doc is
1189
+ * still committed with whatever text we got from earlier pages.
1190
+ * * An individual image fails to OCR (Tesseract decode error, JP2 not
1191
+ * supported in this browser, etc.): we skip that image and keep
1192
+ * going. Partial coverage beats nothing.
1193
+ * * A page yields no extractable images (e.g. uses Flate/CCITT/JBIG2):
1194
+ * no paragraph is emitted; the page contributes 0 chunks.
1195
+ */
1196
+ private async _indexPdfScanned(pw: AlbexPdfExports): Promise<void> {
1197
+ if (!this.ocrImage) return;
1198
+ const totalPages = pw.getPageCount();
1199
+ if (!totalPages) return;
1200
+
1201
+ for (let p = 0; p < totalPages; p++) {
1202
+ const pageText = await this._ocrPageEmbeddedImages(pw, p);
1203
+ if (pageText === null) return; // WASM poisoned mid-iteration.
1204
+ if (pageText) {
1205
+ this._feedText(pageText);
1206
+ this._wasm.flushParagraph();
1207
+ }
1208
+ }
397
1209
  }
398
1210
 
399
- private async _indexTxt(file: File): Promise<number> {
400
- const text = await file.text();
401
- (this._wasm.setDocumentName as Function)(this._writeStr(file.name));
402
- (this._wasm.beginDocument as Function)();
1211
+ /**
1212
+ * Walk one page's embedded image XObjects, OCR each image, and return
1213
+ * the joined recognised text for that page.
1214
+ *
1215
+ * Used by:
1216
+ * - `_indexPdfScanned`: image-only PDFs (extractPdf returned -2).
1217
+ * - `_indexPdf` hybrid path: when the adapter's `options.alwaysExtractEmbeddedImages`
1218
+ * is set, every page goes through here on top of the normal text
1219
+ * extraction.
1220
+ *
1221
+ * Returns:
1222
+ * - The recognised text (possibly empty if the page has no qualifying
1223
+ * images or every OCR call failed).
1224
+ * - `null` if the PDF WASM trapped during extractPageImages — the
1225
+ * caller should abort the remaining pages because the instance is
1226
+ * now poisoned.
1227
+ *
1228
+ * Failure-handling philosophy: best-effort. An OCR failure on one image
1229
+ * does not stop the page; a page with no images does not stop the doc;
1230
+ * only a WASM trap stops the doc.
1231
+ */
1232
+ private async _ocrPageEmbeddedImages(
1233
+ pw: AlbexPdfExports,
1234
+ page: number,
1235
+ ): Promise<string | null> {
1236
+ const ocr = this.ocrImage;
1237
+ if (!ocr) return '';
1238
+
1239
+ let imageCount: number;
1240
+ try {
1241
+ imageCount = pw.extractPageImages(page);
1242
+ } catch (e) {
1243
+ // The PDF module just trapped — it is now poisoned. Drop our refs
1244
+ // so `_ensurePdfWasm` re-instantiates on the next call.
1245
+ this._pdfWasm = null;
1246
+ this._pdfMem = null;
1247
+ this._diag({
1248
+ kind: 'skipped', stage: 'pdf', page: page + 1,
1249
+ message: `PDF image extractor trapped: ${e instanceof Error ? e.message : String(e)}. Remaining pages skipped.`,
1250
+ });
1251
+ return null;
1252
+ }
1253
+ if (imageCount <= 0) return '';
1254
+
1255
+ // The buffer view must be re-acquired AFTER extractPageImages —
1256
+ // it may have grown the linear memory and detached old views.
1257
+ const liveMem = pw.memory;
1258
+ let pageText = '';
1259
+
1260
+ for (let i = 0; i < imageCount; i++) {
1261
+ const len = pw.getPageImageLen(i);
1262
+ if (!len) continue;
1263
+ const ptr = pw.getPageImagePtr(i);
1264
+ const kind = pw.getPageImageKind(i);
1265
+ const mime = kind === 1 ? 'image/jpeg'
1266
+ : kind === 2 ? 'image/jp2'
1267
+ : 'application/octet-stream';
1268
+
1269
+ // Snapshot the image bytes into a fresh ArrayBuffer. The pointer
1270
+ // returned by getPageImagePtr is only valid until the next
1271
+ // extractPageImages / extractPdf call, so we cannot hold the view.
1272
+ const copy = new Uint8Array(len);
1273
+ copy.set(new Uint8Array(liveMem.buffer, ptr, len));
1274
+ const blob = new Blob([copy.buffer as ArrayBuffer], { type: mime });
1275
+
1276
+ try {
1277
+ const { text } = await ocr(blob);
1278
+ const trimmed = text?.trim();
1279
+ if (trimmed) {
1280
+ pageText = pageText ? `${pageText} ${trimmed}` : trimmed;
1281
+ }
1282
+ } catch (e) {
1283
+ // Image-level OCR failure — skip and continue. JP2 in browsers
1284
+ // without native support lands here; so do truncated or
1285
+ // unsupported JPEG variants. Worker aborts (Tesseract.js
1286
+ // "Aborted(-1)") are also caught here; if they bypass the
1287
+ // promise rejection and surface as `uncaught` instead, the
1288
+ // demo's window.onerror handler will keep the app alive.
1289
+ this._diag({
1290
+ kind: 'skipped', stage: 'ocr', page: page + 1,
1291
+ message: `OCR failed on image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`,
1292
+ });
1293
+ }
1294
+ }
1295
+
1296
+ return pageText;
1297
+ }
1298
+
1299
+ /**
1300
+ * Last-chance OCR path used when `extractPdf` itself trapped (pdf-extract
1301
+ * crashed but lopdf may still be able to walk the file). Re-instantiates
1302
+ * the PDF WASM, reloads the input bytes, and tries the image-extraction
1303
+ * route directly — bypassing the text codec entirely.
1304
+ *
1305
+ * Returns:
1306
+ * * the doc's chunk count on success (even 0 — that means lopdf could
1307
+ * parse but no qualifying images existed, which still beats a hard
1308
+ * parse error),
1309
+ * * null if the recovery itself failed (binary lacks the image exports,
1310
+ * re-instantiation failed, or lopdf also trapped). In the null case
1311
+ * the caller throws AlbexParseError so the user sees a clear message.
1312
+ */
1313
+ private async _indexPdfViaImagesOnly(
1314
+ file: File,
1315
+ bytes: Uint8Array,
1316
+ originalError: string,
1317
+ ): Promise<number | null> {
1318
+ try {
1319
+ await this._ensurePdfWasm();
1320
+ } catch {
1321
+ return null;
1322
+ }
1323
+ const pw = this._pdfWasm;
1324
+ if (!pw) return null;
1325
+
1326
+ const supportsImages = typeof pw.extractPageImages === 'function'
1327
+ && typeof pw.getPageCount === 'function';
1328
+ if (!supportsImages) return null;
1329
+
1330
+ // Reload input bytes into the fresh instance. allocInput may grow the
1331
+ // memory, so re-acquire the buffer view immediately after.
1332
+ let inPtr: number;
1333
+ try {
1334
+ inPtr = pw.allocInput(bytes.length);
1335
+ new Uint8Array(pw.memory.buffer, inPtr, bytes.length).set(bytes);
1336
+ } catch (e) {
1337
+ this._diag({
1338
+ kind: 'skipped', stage: 'pdf',
1339
+ message: `PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`,
1340
+ });
1341
+ return null;
1342
+ }
1343
+
1344
+ // Set up the doc and let _indexPdfScanned do the page-by-page walk.
1345
+ // _indexPdfScanned tolerates lopdf failing mid-stream — it discards the
1346
+ // poisoned instance and returns early. If lopdf trips on the very
1347
+ // first page, no paragraphs are emitted and we end up with 0 chunks.
1348
+ this._wasm.setDocumentName(this._writeStr(file.name));
1349
+ this._wasm.beginDocument();
1350
+ this._diag({
1351
+ kind: 'fallback', stage: 'pdf', file: file.name,
1352
+ message: `pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf`,
1353
+ });
1354
+ await this._indexPdfScanned(pw);
1355
+ return this._wasm.endDocument();
1356
+ }
1357
+
1358
/**
 * Index a plain-text file: one paragraph per blank-line-separated block.
 *
 * @param file  Source file; only its name is used (as the document name).
 * @param bytes Raw file content, decoded with the module-level `_dec`
 *              decoder (presumably UTF-8 — confirm at its definition).
 * @returns The chunk count reported by `endDocument()`.
 */
private async _indexTxt(file: File, bytes: Uint8Array): Promise<number> {
  // Normalise CRLF and lone CR to LF before splitting. Without this a
  // Windows file ("a\r\n\r\nb") never contains two consecutive "\n", so the
  // blank-line split below would treat the whole file as one paragraph.
  const text = _dec.decode(bytes).replace(/\r\n?/g, '\n');
  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();
  for (const para of text.split(/\n{2,}/)) {
    // Soft line breaks inside a paragraph become spaces.
    const line = para.replace(/\n/g, ' ').trim();
    if (line) { this._feedText(line); this._wasm.flushParagraph(); }
  }
  return this._wasm.endDocument();
}
409
1368
 
410
/**
 * Index an XML file as text: every tag is replaced by a line break, the
 * five predefined XML entities are decoded, and the remainder is chunked
 * on blank lines.
 *
 * @param file  Source file; only its name is used (as the document name).
 * @param bytes Raw file content, decoded with the module-level `_dec`.
 * @returns The chunk count reported by `endDocument()`.
 */
private async _indexXml(file: File, bytes: Uint8Array): Promise<number> {
  const XML_ENTITIES: Record<string, string> = {
    amp: '&', lt: '<', gt: '>', quot: '"', apos: "'",
  };
  const plain = _dec.decode(bytes)
    .replace(/<[^]*?>/g, '\n')
    // Decode in ONE pass. Chained replaces that handle `&amp;` first turn
    // the escaped text `&amp;lt;` into `&lt;` and then into `<`, i.e. they
    // decode twice and corrupt content that talks about markup.
    .replace(/&(amp|lt|gt|quot|apos);/g,
      (match: string, name: string) => XML_ENTITIES[name] ?? match)
    .replace(/[ \t]+/g, ' ').trim();
  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();
  for (const seg of plain.split(/\n{2,}/)) {
    const line = seg.replace(/\n/g, ' ').trim();
    if (line) { this._feedText(line); this._wasm.flushParagraph(); }
  }
  return this._wasm.endDocument();
}
1383
+
1384
// ── Markdown ─────────────────────────────────────────────────────────────
// Strip CommonMark inline marks but keep word content. Paragraphs split on
// blank lines, same convention as TXT/XML.
//
// Returns the chunk count reported by endDocument().
private async _indexMd(file: File, bytes: Uint8Array): Promise<number> {
  const text = _dec.decode(bytes)
    // Normalise CRLF / lone CR first; otherwise "\r\n\r\n" never matches the
    // blank-line split below and a Windows file collapses into one chunk.
    .replace(/\r\n?/g, '\n')
    // Remove fenced code blocks entirely (often noisy for search relevance).
    .replace(/```[\s\S]*?```/g, '\n')
    .replace(/~~~[\s\S]*?~~~/g, '\n')
    // Strip ATX heading markers but keep heading text.
    .replace(/^#{1,6}\s+/gm, '')
    // Replace inline links/images with their visible text.
    .replace(/!\[([^\]]*)\]\([^)]*\)/g, '$1')
    .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1')
    // Strip emphasis markers (preserve content). Asterisks always go.
    .replace(/\*+/g, '')
    // Underscores go only when the run touches a non-word boundary on at
    // least one side. An underscore run with letters/digits on BOTH sides
    // (snake_case, FOO_BAR) is word content, not emphasis, and is kept so
    // identifiers stay searchable.
    .replace(/(?<![\p{L}\p{N}_])_+|_+(?![\p{L}\p{N}_])/gu, '')
    // Inline code.
    .replace(/`([^`]+)`/g, '$1')
    // Blockquote marks.
    .replace(/^>\s?/gm, '')
    // List markers.
    .replace(/^\s*[-*+]\s+/gm, '')
    .replace(/^\s*\d+\.\s+/gm, '');
  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();
  for (const para of text.split(/\n{2,}/)) {
    const line = para.replace(/\n/g, ' ').trim();
    if (line) { this._feedText(line); this._wasm.flushParagraph(); }
  }
  return this._wasm.endDocument();
}
1414
+
1415
// ── HTML ─────────────────────────────────────────────────────────────────
// Strip <script>/<style> entire blocks, then drop tag markup. The output is
// chunked at <p>, <br>, <h*>, <li>, <tr> boundaries (mapped to paragraph
// breaks) so search location numbers map naturally to the document outline.
//
// Returns the chunk count reported by endDocument().
private async _indexHtml(file: File, bytes: Uint8Array): Promise<number> {
  const NAMED_ENTITIES: Record<string, string> = {
    amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ',
  };
  // Decode one character reference. Unknown names and numeric values that
  // are not valid code points are left as written instead of throwing:
  // String.fromCodePoint raises RangeError above 0x10FFFF, and a single bad
  // reference such as `&#99999999;` must not abort indexing of the file.
  const decodeEntity = (match: string, dec?: string, hex?: string, name?: string): string => {
    if (name !== undefined) return NAMED_ENTITIES[name] ?? match;
    const code = dec !== undefined ? Number(dec) : parseInt(hex ?? '', 16);
    return Number.isInteger(code) && code >= 0 && code <= 0x10FFFF
      ? String.fromCodePoint(code)
      : match;
  };

  const html = _dec.decode(bytes)
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, ' ')
    // Treat block-level closers as paragraph separators.
    .replace(/<\/(p|h[1-6]|li|tr|div|section|article|header|footer)\s*>/gi, '\n\n')
    .replace(/<br\s*\/?\s*>/gi, '\n')
    // Drop remaining tags.
    .replace(/<[^>]+>/g, ' ')
    // Decode common entities (full set would need a table; this covers >95%).
    // Done in a single pass so already-decoded output is never decoded again
    // (`&amp;lt;` must yield the text `&lt;`, not `<`).
    .replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g, decodeEntity)
    .replace(/[ \t]+/g, ' ');
  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();
  for (const para of html.split(/\n{2,}/)) {
    const line = para.replace(/\n/g, ' ').trim();
    if (line) { this._feedText(line); this._wasm.flushParagraph(); }
  }
  return this._wasm.endDocument();
}
1442
+
1443
// ── JSON ─────────────────────────────────────────────────────────────────
// Every string in the parsed tree — object keys as well as string leaves —
// is fed as its own paragraph, visited depth-first with each key emitted
// just before its value. Numbers, booleans and null contribute nothing.
// A document that fails JSON.parse is reported as AlbexParseError('json').
private async _indexJson(file: File, bytes: Uint8Array): Promise<number> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(_dec.decode(bytes));
  } catch (e) {
    throw new AlbexParseError('json', (e as Error).message);
  }

  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();

  // Feed one string as a paragraph unless it is blank.
  const emit = (s: string): void => {
    if (!s.trim()) return;
    this._feedText(s);
    this._wasm.flushParagraph();
  };

  const walk = (node: unknown): void => {
    if (typeof node === 'string') {
      emit(node);
      return;
    }
    if (Array.isArray(node)) {
      for (const item of node) walk(item);
      return;
    }
    if (node && typeof node === 'object') {
      for (const [key, value] of Object.entries(node as Record<string, unknown>)) {
        emit(key);
        walk(value);
      }
    }
  };

  walk(parsed);
  return this._wasm.endDocument();
}
1470
+
1471
// ── CSV ──────────────────────────────────────────────────────────────────
// RFC 4180 lite: comma-separated, optional double quotes, "" as an escaped
// quote inside a quoted field. Each non-blank row is fed as one paragraph,
// its fields joined by single spaces.
private async _indexCsv(file: File, bytes: Uint8Array): Promise<number> {
  // Drop a leading BOM (U+FEFF) so it does not become part of the first
  // field of the first row.
  let source = _dec.decode(bytes);
  if (source.charCodeAt(0) === 0xFEFF) source = source.slice(1);

  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();

  let cells: string[] = [];
  let cell = '';
  let quoted = false;

  // Emit the accumulated row (if it has any visible text) and start a new one.
  const emitRow = (): void => {
    const line = cells.join(' ').trim();
    if (line) {
      this._feedText(line);
      this._wasm.flushParagraph();
    }
    cells = [];
  };

  for (let pos = 0; pos < source.length; pos++) {
    const ch = source[pos];

    if (quoted) {
      if (ch !== '"') {
        cell += ch;
      } else if (source[pos + 1] === '"') {
        // Doubled quote → one literal quote; consume both characters.
        cell += '"';
        pos++;
      } else {
        quoted = false;
      }
      continue;
    }

    switch (ch) {
      case ',':
        cells.push(cell);
        cell = '';
        break;
      case '\n':
        cells.push(cell);
        cell = '';
        emitRow();
        break;
      case '\r':
        // Carriage returns outside quotes are ignored.
        break;
      case '"':
        // A quote opens a quoted field only at the very start of the field.
        if (cell.length === 0) quoted = true;
        else cell += ch;
        break;
      default:
        cell += ch;
    }
  }

  // Final row when the input does not end with a newline.
  if (cell.length > 0 || cells.length > 0) {
    cells.push(cell);
    emitRow();
  }
  return this._wasm.endDocument();
}
1512
+
1513
// ── EML ──────────────────────────────────────────────────────────────────
// Minimal MIME: parse the first text/plain body. Headers Subject/From/To
// are indexed as separate paragraphs so they're individually searchable.
//
// What's decoded:
//   * Content-Transfer-Encoding: base64 → decoded.
//   * Content-Transfer-Encoding: quoted-printable → decoded.
//   * Content-Transfer-Encoding: 7bit / 8bit → pass-through.
//   * Nested multipart (multipart/alternative inside multipart/mixed) by
//     recursively walking boundaries until a text/plain section is found.
//
// What's not decoded (out of scope for this "lite" parser):
//   * Encoded-word headers (=?utf-8?Q?...?=) — only the raw bytes go in.
//   * Charset conversions other than UTF-8 — assumes the body decodes as UTF-8.
//   * HTML-only emails — they're dropped if no text/plain part is present.
//   * MBOX format (multiple emails concatenated). Each email needs to be
//     fed separately.
//
// Returns the chunk count reported by endDocument().
private async _indexEml(file: File, bytes: Uint8Array): Promise<number> {
  const raw = _dec.decode(bytes).replace(/\r\n/g, '\n');
  // The first blank line separates headers from body. With no separator
  // (or one at offset 0) the whole input is treated as headers.
  const headerEnd = raw.indexOf('\n\n');
  const headersBlock = headerEnd > 0 ? raw.slice(0, headerEnd) : raw;
  const body = headerEnd > 0 ? raw.slice(headerEnd + 2) : '';

  // Look up one header (case-insensitive) and unfold its continuation lines.
  // Only spaces/tabs are skipped after the colon: `\s*` would also swallow
  // the line break, so an EMPTY header ("Subject:") would capture the NEXT
  // header line ("From: …") as its value. `.*` lets the first line be empty
  // while still picking up folded continuation lines.
  const header = (block: string, name: string): string => {
    const m = new RegExp(`^${name}:[ \\t]*(.*(?:\\n[ \\t].+)*)`, 'mi').exec(block);
    return m ? (m[1] ?? '').replace(/\n[ \t]+/g, ' ').trim() : '';
  };

  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();

  const subj = header(headersBlock, 'Subject');
  const from = header(headersBlock, 'From');
  const to = header(headersBlock, 'To');
  for (const h of [subj, from, to]) {
    if (h) { this._feedText(h); this._wasm.flushParagraph(); }
  }

  // Fall back to the raw body when no text/plain part could be extracted.
  const plain = this._extractEmlTextPlain(headersBlock, body, header) ?? body;

  for (const para of plain.split(/\n{2,}/)) {
    const line = para.replace(/\n/g, ' ').trim();
    if (line) { this._feedText(line); this._wasm.flushParagraph(); }
  }
  return this._wasm.endDocument();
}
1559
+
1560
/**
 * Locate the text/plain content of a MIME entity, descending through
 * nested multipart containers.
 *
 * @param headersBlock Headers of the entity being inspected (the top-level
 *                     message on the first call, a child part on recursion).
 * @param body         Body of that same entity.
 * @param header       Header lookup helper supplied by the caller.
 * @returns For a multipart entity: the decoded first text/plain part, or
 *          null when none exists. For a single-part entity: the decoded
 *          body when the type is text/plain or unspecified, otherwise the
 *          body unchanged.
 */
private _extractEmlTextPlain(
  headersBlock: string,
  body: string,
  header: (block: string, name: string) => string,
): string | null {
  const contentType = header(headersBlock, 'Content-Type');
  const boundary = /boundary="?([^";]+)"?/i.exec(contentType)?.[1];

  if (!boundary) {
    // Single-part entity. A missing Content-Type counts as text/plain and
    // gets transfer-decoding; any other type is passed through untouched
    // and the caller indexes it as-is.
    const isPlainText = contentType === '' || /text\/plain/i.test(contentType);
    return isPlainText ? decodeEmlBody(headersBlock, body, header) : body;
  }

  for (const section of body.split(`--${boundary}`)) {
    const entity = section.replace(/^\n+/, '');
    const split = entity.indexOf('\n\n');
    // Sections without a header/body separator (preamble, closing "--")
    // carry nothing to inspect.
    if (split < 0) continue;

    const partHeaders = entity.slice(0, split);
    const partBody = entity.slice(split + 2);
    const partCtype = header(partHeaders, 'Content-Type');

    if (/^multipart\//i.test(partCtype)) {
      // Nested container: use its text/plain if it yields a non-empty one,
      // otherwise keep scanning the sibling parts.
      const nested = this._extractEmlTextPlain(partHeaders, partBody, header);
      if (nested) return nested;
    } else if (/text\/plain/i.test(partCtype)) {
      return decodeEmlBody(partHeaders, partBody, header);
    }
  }
  return null;
}
1610
+
1611
// ── RTF ──────────────────────────────────────────────────────────────────
//
// Strip the {\rtf1...} group structure. Control words (\xxx and \xxxN),
// hex escapes (\'XX), unicode escapes (\uN ?) and groups are processed;
// plain runs are kept.
//
// Character decoding:
//   * \'XX   → byte XX mapped through rtfCp1252ToChar (Windows-1252).
//   * \uN ?  → Unicode codepoint N (signed 16-bit, negative means N+65536),
//              followed by ONE fallback "character" that is skipped. The
//              fallback may be a plain character or a \'XX hex escape; both
//              forms are skipped so the glyph is not emitted twice.
//   * \-     → soft hyphen (drop).
//   * \~     → non-breaking space.
//   * \emdash, \endash, \bullet, \lquote, \rquote, \ldblquote, \rdblquote
//            → their Unicode equivalents.
//
// Skipped content:
//   * Groups opened by \fonttbl, \colortbl, \stylesheet, \info, \pict,
//     \object, \header, \footer.
//   * Groups marked with \* (ignorable destinations): readers that do not
//     implement the destination are expected to skip the whole group, so
//     neither the "*" nor the group payload reaches the index.
//
// What's not handled (assumes Word/Pages/LibreOffice output, where
// these aren't load-bearing):
//   * \ansicpg, \fcharset — we always assume cp1252 for \' escapes.
//   * \ucN other than the default of 1 fallback character.
//   * \bin — binary data with explicit length; rare in document RTF.
//   * Field codes — rendered as the visible text (good enough for search).
//
// Returns the chunk count reported by endDocument().
private async _indexRtf(file: File, bytes: Uint8Array): Promise<number> {
  const src = _dec.decode(bytes);
  // Sticky (`y`) patterns match exactly at `lastIndex`, so each probe runs at
  // the cursor without slicing the remaining input for every backslash.
  const SKIP_DESTINATIONS = /\\(fonttbl|colortbl|stylesheet|info|pict|object|header|footer)\b/y;
  const UNICODE_ESCAPE = /\\u(-?\d+) ?/y;
  const CONTROL_WORD = /\\([A-Za-z]+)(-?\d+)?\s?/y;
  const HEX_PAIR = /^[0-9A-Fa-f]{2}$/;

  let out = '';
  let i = 0;
  let depth = 0;
  // Group depth at which a skipped destination started; 0 = not skipping.
  let skipDepth = 0;

  while (i < src.length) {
    const c = src[i];
    if (c === '{') { depth++; i++; continue; }
    if (c === '}') {
      depth--;
      // Leaving the group that started the skip ends the skip.
      if (skipDepth > 0 && depth < skipDepth) skipDepth = 0;
      i++; continue;
    }
    if (c === '\\') {
      // Hex byte escape: \'XX
      if (src[i + 1] === '\'' && i + 3 < src.length) {
        const hex = src.slice(i + 2, i + 4);
        if (HEX_PAIR.test(hex)) {
          if (skipDepth === 0) out += rtfCp1252ToChar(parseInt(hex, 16));
          i += 4;
          continue;
        }
        // Malformed — drop and advance.
        i += 2;
        continue;
      }
      // Ignorable destination marker: skip the rest of the enclosing group.
      if (src[i + 1] === '*') {
        if (skipDepth === 0) skipDepth = depth;
        i += 2;
        continue;
      }
      // Unicode escape: \uN followed by optional fallback character.
      // N is signed 16-bit per the spec; negative values mean N + 65536.
      UNICODE_ESCAPE.lastIndex = i;
      const um = UNICODE_ESCAPE.exec(src);
      if (um) {
        let code = parseInt(um[1] ?? '0', 10);
        if (code < 0) code += 0x10000;
        if (skipDepth === 0 && code > 0 && code < 0x110000) {
          out += String.fromCodePoint(code);
        }
        i += um[0].length;
        // Skip the single fallback character (the default \uc1 count). A
        // \'XX escape counts as one character and is skipped whole;
        // otherwise skip one plain character. Control words and group
        // delimiters are never treated as the fallback.
        if (src[i] === '\\' && src[i + 1] === '\'' && HEX_PAIR.test(src.slice(i + 2, i + 4))) {
          i += 4;
        } else if (i < src.length && src[i] !== '\\' && src[i] !== '{' && src[i] !== '}') {
          i++;
        }
        continue;
      }
      // Control word / symbol.
      CONTROL_WORD.lastIndex = i;
      const m = CONTROL_WORD.exec(src);
      if (m) {
        const word = m[1] ?? '';
        if (skipDepth === 0) {
          SKIP_DESTINATIONS.lastIndex = i;
          if (SKIP_DESTINATIONS.test(src)) skipDepth = depth;
        }
        if (skipDepth === 0) {
          switch (word) {
            case 'par':
            case 'line':
            case 'sect':
              out += '\n\n';
              break;
            case 'tab':
              out += '\t';
              break;
            case 'emdash': out += '—'; break;
            case 'endash': out += '–'; break;
            case 'bullet': out += '•'; break;
            case 'lquote': out += '‘'; break;
            case 'rquote': out += '’'; break;
            case 'ldblquote': out += '“'; break;
            case 'rdblquote': out += '”'; break;
            default: /* drop other control words silently */ break;
          }
        }
        i += m[0].length;
        continue;
      }
      // Escaped single character: \\, \{, \}, \-, \~ etc.
      if (skipDepth === 0) {
        const escaped = src[i + 1];
        if (escaped === '~') out += ' '; // non-breaking space
        else if (escaped === '-') { /* soft hyphen — drop */ }
        else if (escaped !== undefined) out += escaped;
      }
      i += 2; continue;
    }
    if (skipDepth === 0) out += c;
    i++;
  }

  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();
  for (const para of out.split(/\n{2,}/)) {
    const line = para.replace(/\n/g, ' ').trim();
    if (line) { this._feedText(line); this._wasm.flushParagraph(); }
  }
  return this._wasm.endDocument();
}
424
1734
 
425
/**
 * Dispatch table from lower-cased file extension to the indexer that
 * handles it. Each entry receives the engine, the source file and the
 * file's bytes, and resolves to the number returned by that indexer.
 * Aliases (`md`/`markdown`, `html`/`htm`) share one implementation.
 */
private static readonly _INDEXERS: Record<string, (engine: AlbexEngine, file: File, bytes: Uint8Array) => Promise<number>> = {
  docx:     (engine, file, bytes) => engine._indexDocx(file, bytes),
  xlsx:     (engine, file, bytes) => engine._indexXlsx(file, bytes),
  pdf:      (engine, file, bytes) => engine._indexPdf(file, bytes),
  txt:      (engine, file, bytes) => engine._indexTxt(file, bytes),
  xml:      (engine, file, bytes) => engine._indexXml(file, bytes),
  md:       (engine, file, bytes) => engine._indexMd(file, bytes),
  markdown: (engine, file, bytes) => engine._indexMd(file, bytes),
  html:     (engine, file, bytes) => engine._indexHtml(file, bytes),
  htm:      (engine, file, bytes) => engine._indexHtml(file, bytes),
  json:     (engine, file, bytes) => engine._indexJson(file, bytes),
  csv:      (engine, file, bytes) => engine._indexCsv(file, bytes),
  eml:      (engine, file, bytes) => engine._indexEml(file, bytes),
  rtf:      (engine, file, bytes) => engine._indexRtf(file, bytes),
};
432
1750
 
433
1751
  // ── Public API ────────────────────────────────────────────────────────────
@@ -437,106 +1755,457 @@ export class AlbexEngine {
437
1755
  * Throws for unsupported formats or parse errors.
438
1756
  */
439
1757
  async indexFile(file: File): Promise<IndexedDocument> {
1758
+ return this._exclusive(() => this._indexFileInner(file));
1759
+ }
1760
+
1761
/**
 * Index one file; called by `indexFile` and `replaceDocument` from inside
 * `_exclusive`.
 *
 * Steps: pick the indexer from the file extension, read and hash the bytes,
 * return the already-registered document when the hash is known, otherwise
 * run the indexer, check for pool overflow and register the new document.
 *
 * @throws AlbexUnsupportedFormatError when no indexer exists for the extension.
 * @throws AlbexCapacityError when the WASM side reports an overflow flag
 *         after indexing.
 */
private async _indexFileInner(file: File): Promise<IndexedDocument> {
  const ext = file.name.split('.').pop()?.toLowerCase() ?? '';
  const indexer = AlbexEngine._INDEXERS[ext];
  if (!indexer) throw new AlbexUnsupportedFormatError(ext);

  // Read the bytes once: they feed both the content hash and the indexer.
  const bytes = new Uint8Array(await file.arrayBuffer());
  const hash = this._contentHash(bytes);

  // Idempotency: identical content that is already registered is returned
  // as-is instead of being indexed a second time.
  const known = this._docs.find(d => d.contentHash === hash);
  if (known) return known;

  const wasm = this._wasm;
  const startedAt = performance.now();
  const textUsedBefore = wasm.getTextUsed();
  const slot = wasm.getDocCount();

  // Hand the hash to the WASM side when the binary exports the setter;
  // binaries without the export are simply skipped. The hash travels
  // through the scratchpad, which the indexer overwrites afterwards.
  if (typeof wasm.setDocumentContentHash === 'function') {
    const hashBytes = hashHexToBytes(hash);
    this._writePad(hashBytes);
    wasm.setDocumentContentHash(hashBytes.length);
  }

  const chunks = await indexer(this, file, bytes);

  // Capacity check: a non-zero flag word means at least one pool filled
  // during ingest. Raise a typed error rather than returning a document
  // that looks complete but is not.
  const overflow = wasm.getLastIndexOverflow();
  if (overflow !== 0) {
    // bit → (limit reported to the caller, label used in the message)
    const POOLS = [
      [1, 'chunks', 'chunk pool'],
      [2, 'text', 'text pool'],
      [4, 'docs', 'document table'],
      [8, 'names', 'name pool'],
    ] as const;
    const full = POOLS.filter(([bit]) => (overflow & bit) !== 0);
    // The lowest set bit decides the reported limit; 'names' is the default.
    const which: AlbexCapacityLimit = full.find(([bit]) => bit !== 8)?.[1] ?? 'names';
    const pools = full.map(([, , label]) => label).join(', ');
    throw new AlbexCapacityError(
      `Index capacity exceeded while indexing "${file.name}" (${pools} full). ` +
      `The document was rolled back (not indexed); treat the index as full ` +
      `(compact(), shard across an AlbexPool, or reset()).`,
      which,
    );
  }

  // The new document sits in the slot that was free before indexing.
  const docId = wasm.getDocId(slot);

  const doc: IndexedDocument = {
    name: file.name,
    ext,
    chunks,
    indexTimeMs: performance.now() - startedAt,
    textBytes: wasm.getTextUsed() - textUsedBefore,
    docId,
    contentHash: hash,
  };
  this._docs.push(doc);
  return doc;
}
457
1833
 
1834
/**
 * Tombstone a previously indexed document so searches stop returning its
 * chunks. The storage itself is only reclaimed by `compact()`.
 *
 * @param id File name or `contentHash` of the document (as returned by
 *           `indexFile`).
 * @returns `true` when a matching document was found and removed.
 */
removeDocument(id: string): boolean {
  this._assertIdle('removeDocument');
  const removed = this._removeDocumentInner(id);
  return removed;
}
1845
+
1846
/**
 * Removal logic shared by `removeDocument` and `replaceDocument`; performs
 * no idle/lock check of its own.
 *
 * @param id File name or content hash to look up in the host-side list.
 * @returns `true` only when the document was found AND the WASM call
 *          reported success (return value 1).
 */
private _removeDocumentInner(id: string): boolean {
  const target = this._docs.find(d => d.name === id || d.contentHash === id);
  if (!target) return false;

  const removed = this._wasm.removeDocument(target.docId) === 1;
  if (!removed) return false;

  // Drop the host-side record only once the WASM side has confirmed.
  this._docs = this._docs.filter(d => d !== target);
  return true;
}
1855
+
1856
/**
 * Replace a previously indexed document with new content: remove whatever
 * matches `name`, then index `newFile`, all inside one `_exclusive` section.
 *
 * The removal result is not checked, so when nothing matches `name` the
 * new file is simply indexed. Because the old entry is removed first,
 * re-indexing the *same* bytes under that name is not short-circuited by
 * the content-hash lookup in the indexing path.
 *
 * @param name    File name (or content hash) of the document to replace.
 * @param newFile File whose content takes its place.
 * @returns The document record produced by indexing `newFile`.
 */
async replaceDocument(name: string, newFile: File): Promise<IndexedDocument> {
  const swap = async (): Promise<IndexedDocument> => {
    this._removeDocumentInner(name);
    // Use the inner path directly — the lock is already held here.
    const replacement = await this._indexFileInner(newFile);
    // Repeated replaces leave tombstones behind; give the engine a chance
    // to reclaim them before the pool runs out.
    this._autoCompactIfNeeded();
    return replacement;
  };
  return this._exclusive(swap);
}
1872
+
1873
+ /**
1874
+ * Reclaim storage from previously removed documents. Compacts CHUNKS,
1875
+ * TEXT_POOL, DOC_NAMES and NAME_POOL in place. Idempotent.
1876
+ *
1877
+ * Note: doc_ids of surviving documents are preserved, so any stored
1878
+ * references (e.g. in a UI) remain valid.
1879
+ */
1880
+ compact(): void {
1881
+ this._assertIdle('compact');
1882
+ this._wasm.compact();
1883
+ }
1884
+
458
1885
  /**
459
1886
  * Search the index. Supports:
460
1887
  * - Simple queries: `contrato` (AND of tokens, accent-insensitive)
461
1888
  * - Phrase queries: `"contrato marco"` (must appear as phrase)
462
1889
  * - OR queries: `contrato | acuerdo` (union of two searches)
1890
+ *
1891
+ * Pass `{ windowed: true }` to receive cropped snippets with ASCII ellipsis
1892
+ * markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
1893
+ */
1894
+ search(query: string, opts: SearchOptions = {}): SearchResult[] {
1895
+ this._assertIdle('search');
1896
+ const w = this._wasm;
1897
+ const ql = this._writeStr(query);
1898
+ const kind = w.prepareQuery(ql);
1899
+ if (kind < 0) return [];
1900
+
1901
+ if (kind === 2) {
1902
+ // OR: iterate branches and merge in TS. WASM stores compiled
1903
+ // branches internally so we never re-tokenize on the host.
1904
+ return this._searchOr(query, opts);
1905
+ }
1906
+
1907
+ w.selectQueryBranch(0);
1908
+ // Phrase queries (kind 1) post-filter on adjacency. Pass the tokens down
1909
+ // so the check runs against the FULL chunk text, not a cropped windowed
1910
+ // snippet — otherwise `{ windowed: true }` could drop a valid phrase hit
1911
+ // whose second term fell outside the window (audit finding #7).
1912
+ const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
1913
+ return this._runSearch(query, opts, phraseTokens);
1914
+ }
1915
+
1916
/**
 * Fetch the compiled pattern of query branch `i` and split it into tokens
 * for the phrase post-filter. The text comes straight from the WASM side
 * via the scratchpad; nothing is re-tokenized on the host.
 *
 * @returns The space-separated tokens with empty entries removed, or an
 *          empty array when the branch pattern has length 0.
 */
private _branchTokens(i: number): string[] {
  const length = this._wasm.getQueryBranchPattern(i);
  if (length === 0) return [];

  const tokens: string[] = [];
  for (const piece of this._readPad(length).split(' ')) {
    if (piece.length > 0) tokens.push(piece);
  }
  return tokens;
}
1925
+
1926
/**
 * Cooperative search: the scan runs in slices and yields to the event loop
 * between them, so the host thread stays responsive during a long scan.
 *
 * NOTE: results are NOT streamed incrementally. The complete result array
 * is built first (inside the exclusivity lock) and only then iterated out,
 * in the order the collector returned it. The async-iterator shape exists
 * because the work behind it is asynchronous.
 *
 * Pass `opts.frameBudgetMs` to control the slice budget (default 8 ms).
 */
async *searchCooperative(query: string, opts: SearchOptions = {}): AsyncIterable<SearchResult> {
  // Hold the lock for the whole collection so no other engine operation
  // can interleave at a slice boundary.
  const collect = (): Promise<SearchResult[]> => this._searchCooperativeCollect(query, opts);
  const collected = await this._exclusive(collect);
  for (const result of collected) {
    yield result;
  }
}
1946
+
1947
/**
 * Run a cooperative search to completion and return its results as an
 * array. Called from inside the exclusivity lock; the per-slice yielding
 * itself happens in `_runSearchBudgeted`.
 *
 * For OR queries every branch is searched separately; hits are
 * de-duplicated on (documentName, location, matchStart) and the merged list
 * is sorted by descending score.
 */
private async _searchCooperativeCollect(query: string, opts: SearchOptions): Promise<SearchResult[]> {
  const budget = opts.frameBudgetMs ?? 8;
  const wasm = this._wasm;

  const kind = wasm.prepareQuery(this._writeStr(query));
  if (kind < 0) return [];

  if (kind !== 2) {
    // Single branch; phrase queries (kind 1) carry their tokens along for
    // the adjacency post-filter.
    wasm.selectQueryBranch(0);
    const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
    return this._runSearchBudgeted(query, opts, budget, phraseTokens, 0);
  }

  // OR branches — each one is its own resumable search; merge afterwards.
  const seenKeys = new Set<string>();
  const merged: SearchResult[] = [];
  const branchCount = wasm.getQueryBranchCount();
  for (let branch = 0; branch < branchCount; branch++) {
    wasm.selectQueryBranch(branch);
    const hits = await this._runSearchBudgeted(query, opts, budget, undefined, branch);
    for (const hit of hits) {
      const key = `${hit.documentName}:${hit.location}:${hit.matchStart}`;
      if (seenKeys.has(key)) continue;
      seenKeys.add(key);
      merged.push(hit);
    }
  }
  merged.sort((a, b) => b.score - a.score);
  return merged;
}
1978
+
1979
/**
 * @deprecated Renamed to `searchCooperative` in 0.3.0. The old name was
 * misleading: nothing is streamed incrementally — the search yields to the
 * scheduler between slices and then hands back a complete batch. This
 * alias emits a deprecation warning and forwards to `searchCooperative`.
 *
 * NOTE(review): removal was originally announced for 0.4.0, yet the alias
 * is still shipped — confirm the current removal target.
 */
async *searchStream(query: string, opts: SearchOptions = {}): AsyncIterable<SearchResult> {
  // Warn first, then delegate everything to the renamed method.
  warnSearchStreamDeprecated();
  yield* this.searchCooperative(query, opts);
}
466
1989
 
467
- if (parsed.kind === 'or') {
468
- return this._searchOr(parsed.branches, query);
1990
/**
 * Drive a resumable search to completion, yielding to the scheduler after
 * every unfinished slice, and return the materialised result array.
 *
 * Heuristic: each `searchSlice` call processes one batch. The batch size is
 * doubled (up to 32 768) while a slice takes under half the budget and
 * halved (down to 512) when a slice takes more than 1.5× the budget.
 *
 * @param displayQuery Query text recorded in `_lastSearch`.
 * @param opts         Search options forwarded to result collection.
 * @param budgetMs     Per-slice time target used to adapt the batch size.
 * @param phraseTokens Tokens for the phrase post-filter, if any.
 * @param branchIdx    Query branch whose compiled pattern is active; the
 *                     caller must already have selected it.
 */
private async _runSearchBudgeted(
  displayQuery: string,
  opts: SearchOptions,
  budgetMs: number,
  phraseTokens?: string[],
  branchIdx = 0,
): Promise<SearchResult[]> {
  const w = this._wasm;
  // Pattern is already set by the caller via selectQueryBranch(branchIdx).
  // Snapshot THAT branch's compiled pattern for the GPU pre-filter hash —
  // not branch 0, which would build the wrong candidate mask for OR
  // branches and silently drop their hits.
  const activePatternLen = w.getQueryBranchPattern(branchIdx);
  const activePattern = activePatternLen > 0 ? this._readPad(activePatternLen) : '';

  // GPU pre-filter. When engaged, the candidate mask is installed before
  // searchBegin so the slice loop only inspects candidates. A failure here
  // is reported as a diagnostic and the search continues on the CPU path.
  if (this._shouldEngageGpu()) {
    try {
      await this._gpuPreFilter(activePattern);
    } catch (e) {
      // Don't let a GPU hiccup kill the search — drop to CPU path.
      this._diag({
        kind: 'fallback', stage: 'gpu',
        message: `GPU pre-filter failed; falling back to CPU: ${e instanceof Error ? e.message : String(e)}`,
      });
      w.clearCandidateMask();
    }
  }

  const t0 = performance.now();
  if (w.searchBegin() === 0) {
    this._lastSearch = {
      query: displayQuery, timeMs: 0, results: 0,
      bloomTested: 0, bloomPassed: 0, bitapMatched: 0,
    };
    return [];
  }

  // In background / low-power modes we halve the initial batch so the
  // engine yields more often to the scheduler, leaving more headroom for
  // whatever the host is doing.
  const conservative = this._resources?.mode === 'background'
    || this._resources?.mode === 'low-power';
  let batch = conservative ? 1024 : 2048;

  // Prefer `scheduler.yield()` where available. Otherwise fall back to
  // `requestAnimationFrame`, RACED against a timer: animation-frame
  // callbacks are paused while the page is hidden, so relying on rAF alone
  // would stall a search started in (or moved to) a background tab until
  // the tab becomes visible again. Environments without rAF use a plain
  // zero-delay timer.
  const RAF_TIMEOUT_MS = 50;
  type Sched = { yield: () => Promise<void> };
  const sched = (globalThis as unknown as { scheduler?: Sched }).scheduler;
  const yieldFn: () => Promise<void> = sched && typeof sched.yield === 'function'
    ? () => sched.yield()
    : (typeof requestAnimationFrame === 'function'
      ? () => new Promise<void>(resolve => {
          const timer = setTimeout(resolve, RAF_TIMEOUT_MS);
          requestAnimationFrame(() => { clearTimeout(timer); resolve(); });
        })
      : () => new Promise<void>(resolve => setTimeout(resolve, 0)));

  for (;;) {
    const sliceStart = performance.now();
    const done = w.searchSlice(batch);
    const sliceMs = performance.now() - sliceStart;
    if (done === 1) break;

    // Adapt batch size: if we have headroom in budget, grow; if we're
    // already over the per-slice target, shrink.
    if (sliceMs < budgetMs * 0.5 && batch < 32_768) batch *= 2;
    else if (sliceMs > budgetMs * 1.5 && batch > 512) batch = Math.max(512, Math.floor(batch / 2));

    await yieldFn();
  }

  const ms = performance.now() - t0;
  const count = w.getResultCount();
  this._lastSearch = {
    query: displayQuery,
    timeMs: ms,
    results: count,
    bloomTested: w.getStatBloomTested(),
    bloomPassed: w.getStatBloomPassed(),
    bitapMatched: w.getStatBitapMatched(),
  };

  return this._collectResults(count, opts, phraseTokens);
}
2084
+
2085
+ /** Materialise results [0..count) into the public SearchResult shape.
2086
+ * When `phraseTokens` is given, each result is kept only if those tokens
2087
+ * appear adjacently in the FULL chunk text — independent of any display
2088
+ * windowing — so phrase queries stay correct under `{ windowed: true }`. */
2089
+ private _collectResults(count: number, opts: SearchOptions, phraseTokens?: string[]): SearchResult[] {
2090
+ const w = this._wasm;
2091
+ const windowed = opts.windowed === true;
2092
+ const before = opts.before ?? 60;
2093
+ const after = opts.after ?? 120;
2094
+ const phraseFilter = phraseTokens && phraseTokens.length > 0 ? phraseTokens : null;
2095
+
2096
+ const results: SearchResult[] = [];
2097
+ for (let i = 0; i < count; i++) {
2098
+ // Phrase adjacency check against the full chunk text (getSnippet), not
2099
+ // the possibly-cropped display window.
2100
+ if (phraseFilter) {
2101
+ const fl = w.getSnippet(i);
2102
+ const full = fl > 0 ? this._readPad(fl) : '';
2103
+ if (!containsPhrase(full, phraseFilter)) continue;
2104
+ }
2105
+
2106
+ const score = w.getResultScore(i);
2107
+ const location = w.getResultLocation(i);
2108
+ const matchStart = w.getResultStart(i);
2109
+ const matchEnd = w.getResultEnd(i);
2110
+ const nl = w.getResultDocName(i);
2111
+ const name = nl > 0 ? this._readPad(nl) : '?';
2112
+
2113
+ const matchCount = w.getResultMatchCount(i);
2114
+ const matches: MatchSpan[] = [];
2115
+ for (let k = 0; k < matchCount; k++) {
2116
+ matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
2117
+ }
2118
+ if (matches.length === 0) matches.push({ start: matchStart, end: matchEnd });
2119
+
2120
+ let snippet: string;
2121
+ let primaryStart = matchStart;
2122
+ let primaryEnd = matchEnd;
2123
+ let adjustedMatches: MatchSpan[] = matches;
2124
+
2125
+ if (windowed) {
2126
+ const sl = w.getSnippetWindow(i, before, after);
2127
+ snippet = sl > 0 ? this._readPad(sl) : '';
2128
+ const offset = w.getSnippetWindowOffset();
2129
+ const leadingPrefix = offset > 0 ? 4 : 0;
2130
+ const shift = leadingPrefix - offset;
2131
+ adjustedMatches = matches.map(m => ({
2132
+ start: Math.max(0, m.start + shift),
2133
+ end: Math.max(0, m.end + shift),
2134
+ }));
2135
+ primaryStart = adjustedMatches[0]?.start ?? 0;
2136
+ primaryEnd = adjustedMatches[0]?.end ?? 0;
2137
+ } else {
2138
+ const sl = w.getSnippet(i);
2139
+ snippet = sl > 0 ? this._readPad(sl) : '';
2140
+ }
2141
+
2142
+ results.push({
2143
+ documentName: name,
2144
+ location,
2145
+ score,
2146
+ snippet,
2147
+ matchStart: primaryStart,
2148
+ matchEnd: primaryEnd,
2149
+ matches: adjustedMatches,
2150
+ });
2151
+ }
477
2152
  return results;
478
2153
  }
479
2154
 
480
- private _searchOr(branches: string[][], rawQuery: string): SearchResult[] {
2155
+ /** Run all OR branches and merge dedup-by-(doc, location, match). The
2156
+ * branches are already compiled inside the WASM (by prepareQuery); we
2157
+ * iterate them with selectQueryBranch. The "rawQuery" param is kept
2158
+ * only for the lastSearch.query field. */
2159
+ private _searchOr(rawQuery: string, opts: SearchOptions): SearchResult[] {
2160
+ const w = this._wasm;
481
2161
  const seen = new Set<string>();
482
2162
  const all: SearchResult[] = [];
483
-
484
- for (const tokens of branches) {
485
- const q = tokensToWasmQuery(tokens);
486
- if (!q) continue;
487
- const results = this._runSearch(q, rawQuery);
2163
+ const n = w.getQueryBranchCount();
2164
+ for (let i = 0; i < n; i++) {
2165
+ w.selectQueryBranch(i);
2166
+ const results = this._runSearch(rawQuery, opts);
488
2167
  for (const r of results) {
489
2168
  const key = `${r.documentName}:${r.location}:${r.matchStart}`;
490
2169
  if (!seen.has(key)) { seen.add(key); all.push(r); }
491
2170
  }
492
2171
  }
493
-
494
- // Re-rank the merged list by score descending.
495
2172
  all.sort((a, b) => b.score - a.score);
496
2173
  return all;
497
2174
  }
498
2175
 
499
- private _runSearch(wasmQuery: string, displayQuery: string): SearchResult[] {
500
- const ql = this._writeStr(wasmQuery);
501
- (this._wasm.setPattern as Function)(ql);
2176
+ /** Execute a single search using whichever query branch is currently
2177
+ * active (set via selectQueryBranch). Returns the materialised
2178
+ * SearchResult[]. Caller is responsible for activating a branch first. */
2179
+ private _runSearch(displayQuery: string, opts: SearchOptions, phraseTokens?: string[]): SearchResult[] {
2180
+ const w = this._wasm;
502
2181
 
503
2182
  const t0 = performance.now();
504
- const count = (this._wasm.search as Function)() as number;
2183
+ const count = w.search();
505
2184
  const ms = performance.now() - t0;
506
2185
 
507
2186
  this._lastSearch = {
508
2187
  query: displayQuery,
509
2188
  timeMs: ms,
510
2189
  results: count,
511
- bloomTested: (this._wasm.getStatBloomTested as Function)() as number,
512
- bloomPassed: (this._wasm.getStatBloomPassed as Function)() as number,
513
- bitapMatched: (this._wasm.getStatBitapMatched as Function)() as number,
2190
+ bloomTested: w.getStatBloomTested(),
2191
+ bloomPassed: w.getStatBloomPassed(),
2192
+ bitapMatched: w.getStatBitapMatched(),
514
2193
  };
515
2194
 
516
- const results: SearchResult[] = [];
517
- for (let i = 0; i < count; i++) {
518
- const score = (this._wasm.getResultScore as Function)(i) as number;
519
- const location = (this._wasm.getResultLocation as Function)(i) as number;
520
- const matchStart = (this._wasm.getResultStart as Function)(i) as number;
521
- const matchEnd = (this._wasm.getResultEnd as Function)(i) as number;
522
- const nl = (this._wasm.getResultDocName as Function)(i) as number;
523
- const name = nl > 0 ? this._readPad(nl) : '?';
524
- const sl = (this._wasm.getSnippet as Function)(i) as number;
525
- const snippet = sl > 0 ? this._readPad(sl) : '';
526
-
527
- results.push({ documentName: name, location, score, snippet, matchStart, matchEnd });
528
- }
529
- return results;
2195
+ return this._collectResults(count, opts, phraseTokens);
530
2196
  }
531
2197
 
532
2198
  /** Returns current engine statistics. */
533
2199
  getStats(): EngineStats {
534
2200
  return {
535
- documents: this._docs.length,
536
- chunks: (this._wasm.getChunkCount as Function)() as number,
537
- textUsed: (this._wasm.getTextUsed as Function)() as number,
538
- textCapacity: (this._wasm.getTextCapacity as Function)() as number,
2201
+ documents: this._docs.length,
2202
+ chunks: this._wasm.getChunkCount(),
2203
+ textUsed: this._wasm.getTextUsed(),
2204
+ textCapacity: this._wasm.getTextCapacity(),
539
2205
  wasmMemoryBytes: this._mem.buffer.byteLength,
2206
+ tier: this._tier,
2207
+ maxChunks: this._wasm.getMaxChunks(),
2208
+ maxDocs: this._wasm.getMaxDocs(),
540
2209
  };
541
2210
  }
542
2211
 
@@ -557,21 +2226,299 @@ export class AlbexEngine {
557
2226
 
558
2227
  /** Configure search sensitivity. */
559
2228
  setMaxErrors(errors: 0 | 1 | 2 | 3): void {
560
- (this._wasm.setMaxErrors as Function)(errors);
2229
+ this._wasm.setMaxErrors(errors);
561
2230
  }
562
2231
 
563
2232
  setThreshold(threshold: number): void {
564
- (this._wasm.setThreshold as Function)(Math.max(0, Math.min(1000, threshold)));
2233
+ this._wasm.setThreshold(Math.max(0, Math.min(1000, threshold)));
565
2234
  }
566
2235
 
567
2236
  setMaxResults(max: number): void {
568
- (this._wasm.setMaxResults as Function)(Math.max(1, Math.min(200, max)));
2237
+ this._wasm.setMaxResults(Math.max(1, Math.min(200, max)));
2238
+ }
2239
+
2240
+ /**
2241
+ * Enable or disable query stemming.
2242
+ *
2243
+ * - `'off'` (default): tokens are used as-is. Strict matching.
2244
+ * - `'es'`: Spanish stemmer applied to query tokens before search. A query
2245
+ * for `"contratos"` matches `"contrato"` and vice versa.
2246
+ *
2247
+ * Indexed text is never stemmed, so snippets remain faithful to the
2248
+ * source. Recall improvement comes from queries reducing to shared prefixes.
2249
+ */
2250
+ setLanguage(lang: 'off' | 'es'): void {
2251
+ this._wasm.setLanguage(lang === 'es' ? 1 : 0);
569
2252
  }
570
2253
 
571
2254
  /** Full reset — clears all indexed documents and chunks. */
572
2255
  reset(): void {
573
- (this._wasm.init as Function)();
2256
+ this._assertIdle('reset');
2257
+ this._resetInner();
2258
+ }
2259
+
2260
+ private _resetInner(): void {
2261
+ this._wasm.init();
2262
+ this._docs = [];
2263
+ this._lastSearch = null;
2264
+ this._diagnostics = [];
2265
+ }
2266
+
2267
+ /**
2268
+ * Drain and return the diagnostics collected since the last call (or
2269
+ * since the engine was created). Use this to surface recoverable
2270
+ * issues to the caller after `indexFile`, `load`, or any other
2271
+ * operation that may run into a "best-effort" path.
2272
+ *
2273
+ * Example diagnostics:
2274
+ * - `{kind:'fallback', stage:'pdf', message:'pdf-extract crashed,
2275
+ * attempting OCR-only fallback', file:'invoice.pdf'}`
2276
+ * - `{kind:'skipped', stage:'ocr', message:'Tesseract abort on page
2277
+ * 3 image 1; remaining images on this page skipped', file:'...',
2278
+ * page:3}`
2279
+ * - `{kind:'fallback', stage:'gpu', message:'GPU pre-filter failed,
2280
+ * using CPU'}`
2281
+ *
2282
+ * The buffer is cleared on each call; callers should consume the
2283
+ * returned array immediately (e.g. log to their telemetry, surface
2284
+ * a UI banner). After `reset()` the buffer is also cleared.
2285
+ */
2286
+ takeDiagnostics(): AlbexDiagnostic[] {
2287
+ const out = this._diagnostics;
2288
+ this._diagnostics = [];
2289
+ return out;
2290
+ }
2291
+
2292
+ /** Internal: record a diagnostic. Capped at 256 to bound memory. */
2293
+ private _diag(entry: AlbexDiagnostic): void {
2294
+ if (this._diagnostics.length >= 256) return;
2295
+ this._diagnostics.push(entry);
2296
+ }
2297
+
2298
+ /**
2299
+ * Install an OCR adapter. Returns a handle whose `dispose()` removes the
2300
+ * adapter from the engine.
2301
+ *
2302
+ * The contract: the adapter must provide `recognize(image, opts)` that
2303
+ * returns `Promise<OcrAttachedResult>`. The engine validates the
2304
+ * contract at attach time and refuses adapters that don't expose a
2305
+ * recognise function. Only one adapter can be attached at a time; a
2306
+ * second call to `attachOcr` while one is active throws — the caller
2307
+ * must dispose the previous one first.
2308
+ *
2309
+ * @example
2310
+ * ```ts
2311
+ * import { enableOcr } from '@albex/ocr';
2312
+ * const handle = enableOcr(engine); // internally calls attachOcr
2313
+ * // ... later ...
2314
+ * await handle.dispose();
2315
+ * ```
2316
+ *
2317
+ * Direct use without the companion package:
2318
+ * ```ts
2319
+ * const handle = engine.attachOcr({
2320
+ * recognize: async (blob) => myCustomOcr(blob),
2321
+ * options: { alwaysExtractEmbeddedImages: false },
2322
+ * });
2323
+ * ```
2324
+ */
2325
+ attachOcr(adapter: OcrAdapter): OcrHandle {
2326
+ if (this._ocrAdapter) {
2327
+ throw new AlbexInitError(
2328
+ 'OCR adapter already attached. Call dispose() on the previous handle before attaching a new one.',
2329
+ );
2330
+ }
2331
+ if (typeof adapter?.recognize !== 'function') {
2332
+ throw new AlbexInitError(
2333
+ 'attachOcr requires an adapter with a recognize(image, opts) function.',
2334
+ );
2335
+ }
2336
+ this._ocrAdapter = adapter;
2337
+ return {
2338
+ dispose: async () => {
2339
+ // Idempotent: a double dispose is a no-op rather than a throw.
2340
+ if (this._ocrAdapter === adapter) this._ocrAdapter = null;
2341
+ },
2342
+ };
2343
+ }
2344
+
2345
+ // ── Persistence ───────────────────────────────────────────────────────────
2346
+
2347
+ /**
2348
+ * Persist the current index to OPFS (or IndexedDB as fallback) under `name`.
2349
+ *
2350
+ * The snapshot includes every chunk, document name and text byte currently
2351
+ * indexed. Subsequent `load(name)` calls restore the engine to this exact
2352
+ * state in roughly O(total bytes), bypassing re-parsing.
2353
+ */
2354
+ async save(name: string): Promise<void> {
2355
+ return this._exclusive(() => this._saveInner(name));
2356
+ }
2357
+
2358
+ private async _saveInner(name: string): Promise<void> {
2359
+ const w = this._wasm;
2360
+ const total = w.snapshotSize();
2361
+ if (total === 0) {
2362
+ await savePersisted(name, new Uint8Array(0));
2363
+ return;
2364
+ }
2365
+ const out = new Uint8Array(total);
2366
+ let off = 0;
2367
+ while (off < total) {
2368
+ const n = w.snapshotChunk(off, FEED_SIZE);
2369
+ if (n === 0) break;
2370
+ const ptr = w.getBuffer(0);
2371
+ out.set(this._u8(ptr, n), off);
2372
+ off += n;
2373
+ }
2374
+ await savePersisted(name, out);
2375
+ // Reconstruct _docs from the doc table so getStats().documents stays
2376
+ // honest after save (no change here — but symmetric with load()).
2377
+ }
2378
+
2379
+ /**
2380
+ * Restore an index previously saved with `save(name)`. Returns `true` on
2381
+ * success, `false` if the snapshot is missing or has an incompatible
2382
+ * header (wrong magic, version, or struct sizes).
2383
+ */
2384
+ async load(name: string): Promise<boolean> {
2385
+ return this._exclusive(() => this._loadInner(name));
2386
+ }
2387
+
2388
+ private async _loadInner(name: string): Promise<boolean> {
2389
+ const bytes = await loadPersisted(name);
2390
+ if (!bytes || bytes.length === 0) return false;
2391
+
2392
+ const w = this._wasm;
2393
+ // Write the 64-byte header into the scratchpad and validate.
2394
+ if (bytes.length < 64) return false;
2395
+ const ptr = w.getBuffer(64);
2396
+ if (!ptr) return false;
2397
+ this._u8(ptr, 64).set(bytes.subarray(0, 64));
2398
+ if (w.restoreBegin() !== 1) return false;
2399
+
2400
+ // Stream payload bytes.
2401
+ let off = 64;
2402
+ while (off < bytes.length) {
2403
+ const n = Math.min(FEED_SIZE, bytes.length - off);
2404
+ this._writePad(bytes.subarray(off, off + n));
2405
+ if (w.restoreFeed(n) !== 1) return false;
2406
+ off += n;
2407
+ }
2408
+
2409
+ // Commit. For v3 this is the atomic apply step (state is untouched
2410
+ // until now); a failure here leaves the previous index intact so the
2411
+ // caller can keep using the engine. For v1/v2 snapshots `restoreCommit`
2412
+ // is a no-op that returns 1 (those formats applied in-place during
2413
+ // restoreFeed and have no rollback to offer). Older binaries that
2414
+ // predate v3 do not export `restoreCommit` — in that case we treat
2415
+ // the load as already committed by feature-detect.
2416
+ if (typeof w.restoreCommit === 'function') {
2417
+ if (w.restoreCommit() !== 1) return false;
2418
+ }
2419
+
2420
+ // Rebuild _docs metadata from the restored WASM tables.
2421
+ //
2422
+ // What's available after a restore:
2423
+ // * `name` — recovered from getDocName(i).
2424
+ // * `ext` — derived from the name.
2425
+ // * `chunks` — getDocChunkCount(i).
2426
+ // * `docId` — getDocId(i).
2427
+ // * `contentHash` — getDocContentHashPtr(i) when the binary supports
2428
+ // snapshot v2 (the export exists) AND the snapshot
2429
+ // itself was v2 (the bytes aren't all zero). v1
2430
+ // snapshots restore with all-zero hashes → '' here,
2431
+ // same as before.
2432
+ //
2433
+ // What's not persisted and therefore zeroed:
2434
+ // * `indexTimeMs` — no indexing happened in this session.
2435
+ // * `textBytes` — engine-wide totals are still available via
2436
+ // getStats().textUsed; per-doc breakdown is not
2437
+ // stored.
2438
+ const docCount = w.getDocCount();
2439
+ const hasHashExport = typeof w.getDocContentHashPtr === 'function'
2440
+ && typeof w.getDocContentHashLen === 'function';
574
2441
  this._docs = [];
2442
+ for (let i = 0; i < docCount; i++) {
2443
+ if (w.isDocDeleted(i)) continue;
2444
+ const nameLen = w.getDocName(i);
2445
+ const name = nameLen > 0 ? this._readPad(nameLen) : `restored-${i}`;
2446
+ const dotIdx = name.lastIndexOf('.');
2447
+ const ext = dotIdx > 0 ? name.slice(dotIdx + 1).toLowerCase() : '';
2448
+
2449
+ let contentHash = '';
2450
+ if (hasHashExport) {
2451
+ const hashLen = w.getDocContentHashLen(); // always 8 today
2452
+ const hashPtr = w.getDocContentHashPtr(i);
2453
+ if (hashPtr !== 0 && hashLen === 8) {
2454
+ const view = this._u8(hashPtr, 8);
2455
+ // Copy into a private buffer so subsequent WASM calls cannot
2456
+ // mutate it under us.
2457
+ const buf = new Uint8Array(8);
2458
+ buf.set(view);
2459
+ contentHash = hashBytesToHex(buf);
2460
+ }
2461
+ }
2462
+
2463
+ this._docs.push({
2464
+ name,
2465
+ ext,
2466
+ chunks: w.getDocChunkCount(i),
2467
+ indexTimeMs: 0,
2468
+ textBytes: 0,
2469
+ docId: w.getDocId(i),
2470
+ contentHash,
2471
+ });
2472
+ }
575
2473
  this._lastSearch = null;
2474
+ return true;
2475
+ }
2476
+
2477
+ /**
2478
+ * Convenience: load if the snapshot exists, otherwise leave the engine
2479
+ * empty. Returns whether a load actually happened.
2480
+ */
2481
+ async loadOrInit(name: string): Promise<boolean> {
2482
+ return this._exclusive(async () => {
2483
+ const loaded = await this._loadInner(name);
2484
+ if (!loaded) this._resetInner();
2485
+ return loaded;
2486
+ });
2487
+ }
2488
+
2489
+ /** Delete a previously persisted snapshot. */
2490
+ async deleteSnapshot(name: string): Promise<void> {
2491
+ await deletePersisted(name);
2492
+ }
2493
+
2494
+ /** List names of persisted snapshots in the current origin. */
2495
+ async listSnapshots(): Promise<string[]> {
2496
+ return listPersisted();
2497
+ }
2498
+
2499
+ /**
2500
+ * TC39 explicit-resource-management hook (Stage 3 in 2026). Lets the engine
2501
+ * be used with `using` so the references are released deterministically:
2502
+ *
2503
+ * using engine = new AlbexEngine(opts); await engine.init();
2504
+ *
2505
+ * WebAssembly does not actually expose a way to release linear memory pages
2506
+ * inside a Module instance, so we drop our references to the exports and
2507
+ * the doc list. GC can then reclaim the engine, which in turn releases the
2508
+ * WASM instance and its (typically 20 MB) backing memory.
2509
+ */
2510
+ [Symbol.dispose](): void {
2511
+ // Terminal: bypass the idle guard — disposing mid-operation is allowed.
2512
+ this._resetInner();
2513
+ this._unsubscribeResources?.();
2514
+ this._unsubscribeResources = null;
2515
+ this._gpu?.destroy();
2516
+ this._gpu = null;
2517
+ // Null out the references so the engine cannot be reused after disposal
2518
+ // and the WASM instance becomes unreachable.
2519
+ this._wasm = null as unknown as AlbexWasmExports;
2520
+ this._mem = null as unknown as WebAssembly.Memory;
2521
+ this._pdfWasm = null;
2522
+ this._pdfMem = null;
576
2523
  }
577
2524
  }