albex 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CHANGELOG.md +141 -0
  2. package/README.md +242 -112
  3. package/dist/albex-worker.d.ts +70 -0
  4. package/dist/albex-worker.d.ts.map +1 -0
  5. package/dist/albex-worker.js +153 -0
  6. package/dist/albex-worker.js.map +1 -0
  7. package/dist/albex.d.ts +368 -6
  8. package/dist/albex.d.ts.map +1 -1
  9. package/dist/albex.js +1692 -95
  10. package/dist/albex.js.map +1 -1
  11. package/dist/errors.d.ts +38 -0
  12. package/dist/errors.d.ts.map +1 -0
  13. package/dist/errors.js +63 -0
  14. package/dist/errors.js.map +1 -0
  15. package/dist/gpu/bloom-runtime.d.ts +60 -0
  16. package/dist/gpu/bloom-runtime.d.ts.map +1 -0
  17. package/dist/gpu/bloom-runtime.js +176 -0
  18. package/dist/gpu/bloom-runtime.js.map +1 -0
  19. package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
  20. package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
  21. package/dist/gpu/bloom-shader.wgsl.js +49 -0
  22. package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
  23. package/dist/persistence.d.ts +21 -0
  24. package/dist/persistence.d.ts.map +1 -0
  25. package/dist/persistence.js +174 -0
  26. package/dist/persistence.js.map +1 -0
  27. package/dist/pool/coordinator.d.ts +98 -0
  28. package/dist/pool/coordinator.d.ts.map +1 -0
  29. package/dist/pool/coordinator.js +247 -0
  30. package/dist/pool/coordinator.js.map +1 -0
  31. package/dist/profile.d.ts +95 -0
  32. package/dist/profile.d.ts.map +1 -0
  33. package/dist/profile.js +207 -0
  34. package/dist/profile.js.map +1 -0
  35. package/dist/resource-manager.d.ts +56 -0
  36. package/dist/resource-manager.d.ts.map +1 -0
  37. package/dist/resource-manager.js +138 -0
  38. package/dist/resource-manager.js.map +1 -0
  39. package/dist/tiered-store.d.ts +98 -0
  40. package/dist/tiered-store.d.ts.map +1 -0
  41. package/dist/tiered-store.js +238 -0
  42. package/dist/tiered-store.js.map +1 -0
  43. package/dist/wasm-bindings.d.ts +139 -0
  44. package/dist/wasm-bindings.d.ts.map +1 -0
  45. package/dist/wasm-bindings.js +33 -0
  46. package/dist/wasm-bindings.js.map +1 -0
  47. package/dist/worker-protocol.d.ts +86 -0
  48. package/dist/worker-protocol.d.ts.map +1 -0
  49. package/dist/worker-protocol.js +20 -0
  50. package/dist/worker-protocol.js.map +1 -0
  51. package/dist/worker-runtime.d.ts +14 -0
  52. package/dist/worker-runtime.d.ts.map +1 -0
  53. package/dist/worker-runtime.js +100 -0
  54. package/dist/worker-runtime.js.map +1 -0
  55. package/package.json +56 -13
  56. package/src/albex-worker.ts +187 -0
  57. package/src/albex.ts +1845 -130
  58. package/src/errors.ts +60 -0
  59. package/src/gpu/bloom-runtime.ts +229 -0
  60. package/src/gpu/bloom-shader.wgsl.ts +48 -0
  61. package/src/persistence.ts +175 -0
  62. package/src/pool/coordinator.ts +324 -0
  63. package/src/profile.ts +279 -0
  64. package/src/resource-manager.ts +167 -0
  65. package/src/tiered-store.ts +259 -0
  66. package/src/wasm-bindings.ts +200 -0
  67. package/src/worker-protocol.ts +48 -0
  68. package/src/worker-runtime.ts +96 -0
  69. package/wasm/pkg/albex_pdf.wasm +0 -0
  70. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  71. package/wasm/pkg/albex_wasm_mini.wasm +0 -0
  72. package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
  73. package/wasm/pkg/albex_wasm_pro.wasm +0 -0
  74. package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
  75. package/wasm/pkg/albex_wasm_std.wasm +0 -0
  76. package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/src/albex.ts CHANGED
@@ -14,15 +14,109 @@
14
14
  * ```
15
15
  */
16
16
 
17
+ import {
18
+ AlbexWasmExports,
19
+ AlbexPdfExports,
20
+ asAlbexExports,
21
+ asAlbexPdfExports,
22
+ } from './wasm-bindings.js';
23
+ import {
24
+ AlbexInitError,
25
+ AlbexUnsupportedFormatError,
26
+ AlbexParseError,
27
+ AlbexCapacityError,
28
+ } from './errors.js';
29
+ import {
30
+ savePersisted,
31
+ loadPersisted,
32
+ deletePersisted,
33
+ listPersisted,
34
+ } from './persistence.js';
35
+ import { detectProfile, pickTier, shouldUseGpu, type Tier, type DeviceProfile } from './profile.js';
36
+ import { getResourceManager, type ResourceState } from './resource-manager.js';
37
+ import { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
38
+
39
+ export {
40
+ AlbexError,
41
+ AlbexInitError,
42
+ AlbexUnsupportedFormatError,
43
+ AlbexParseError,
44
+ AlbexCapacityError,
45
+ } from './errors.js';
46
+ export { listPersisted, deletePersisted } from './persistence.js';
47
+ export { detectProfile, pickTier, pickWorkerCount, shouldUseGpu } from './profile.js';
48
+ export type { DeviceProfile, Tier } from './profile.js';
49
+ export { getResourceManager } from './resource-manager.js';
50
+ export type { ResourceState, ResourceMode } from './resource-manager.js';
51
+ export { AlbexPool } from './pool/coordinator.js';
52
+ export type { AlbexPoolOptions } from './pool/coordinator.js';
53
+ export { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
54
+ export { TieredStore } from './tiered-store.js';
55
+ export type { TieredStoreOptions } from './tiered-store.js';
56
+
57
+ // ─────────────────────────────────────────────────────────────────────────────
58
+ // Deprecation warnings — one-shot, fire-and-forget
59
+ // ─────────────────────────────────────────────────────────────────────────────
60
+
61
/** Latched to `true` once the `searchStream` deprecation notice has been printed. */
let _searchStreamWarned = false;

/**
 * Emit the `searchStream` deprecation notice on the console, at most once
 * for the lifetime of this module instance. Later calls return silently.
 *
 * The old name suggested incremental streaming, which the implementation
 * never did; it was renamed in 0.3.0 and the alias is scheduled for removal
 * in 0.4.0 (as stated in the message itself).
 */
function warnSearchStreamDeprecated(): void {
  if (!_searchStreamWarned) {
    // Flip the latch before logging so a throwing console cannot re-arm it.
    _searchStreamWarned = true;
    const notice = [
      '[albex] `searchStream` is deprecated; rename to `searchCooperative`. ',
      'The method does not stream incremental results — it yields to the ',
      'scheduler between slices and returns a batch. The alias will be ',
      'removed in 0.4.0.',
    ].join('');
    console.warn(notice);
  }
}
74
+
17
75
  // ─────────────────────────────────────────────────────────────────────────────
18
76
  // Public types
19
77
  // ─────────────────────────────────────────────────────────────────────────────
20
78
 
21
79
/** Construction options for the engine. Every field is optional. */
export interface AlbexOptions {
  /**
   * Explicit URL of the main WASM binary.
   *
   * For automatic tier selection (mini/std/pro chosen from `deviceMemory`),
   * leave this unset and pass `wasmBaseUrl`; the engine then fetches
   * `albex_wasm_<tier>.wasm` from that directory.
   */
  wasmUrl?: string;
  /**
   * Base directory that holds the tiered binaries (`albex_wasm_mini.wasm`,
   * `_std.wasm`, `_pro.wasm`). Only consulted when `wasmUrl` is omitted.
   */
  wasmBaseUrl?: string;
  /** URL to albex_pdf.wasm. Required only if you call indexFile() with PDFs. */
  pdfWasmUrl?: string;
  /**
   * Tier selection. `'auto'` (the default) keeps auto-detection; pass an
   * explicit tier when the constraints of the target environment are known.
   */
  tier?: 'auto' | 'mini' | 'std' | 'pro';
  /**
   * SIMD selection. With `'auto'` (the default) Albex probes for v128
   * support and fetches the `_simd.wasm` variant when it is available.
   * `'off'` stays on the baseline binary even on capable hosts, which is
   * useful for regression testing or for aligning all clients in a
   * corporate deployment.
   */
  simd?: 'auto' | 'on' | 'off';
  /**
   * GPU acceleration policy for the Bloom pre-filter.
   *   `'auto'` — enable when WebGPU is available AND the chunk count is large
   *   `'on'`   — force enable (falls back to CPU silently if the GPU fails)
   *   `'off'`  — never use the GPU
   * Default: `'auto'`.
   */
  gpu?: 'auto' | 'on' | 'off';
  /**
   * Minimum chunk count before `gpu: 'auto'` engages. Below this threshold
   * the upload + dispatch overhead outweighs the speedup. Default: 20_000.
   */
  gpuThreshold?: number;
}
27
121
 
28
122
  export interface IndexedDocument {
@@ -31,6 +125,17 @@ export interface IndexedDocument {
31
125
  chunks: number;
32
126
  indexTimeMs: number;
33
127
  textBytes: number;
128
+ /** WASM-side stable identifier (also acts as a slot index after compact). */
129
+ docId: number;
130
+ /** 64-bit FNV-1a hex of the source file bytes. Stable across runs. */
131
+ contentHash: string;
132
+ }
133
+
134
/** Half-open byte range `[start, end)` of one matched token inside a snippet. */
export interface MatchSpan {
  /** Byte offset within `snippet` at which the matched token starts. */
  start: number;
  /** Byte offset within `snippet` just past the matched token (exclusive). */
  end: number;
}
35
140
 
36
141
  export interface SearchResult {
@@ -39,12 +144,38 @@ export interface SearchResult {
39
144
  location: number;
40
145
  /** Relevance score 0–1000. */
41
146
  score: number;
42
- /** Raw snippet text (original, with accents). */
147
+ /** Snippet text. With `windowed` search options this is a substring with
148
+ * ASCII ellipsis sentinels (`"... "` / `" ..."`) the UI should render
149
+ * as `…`. Without windowing, the full chunk text. */
43
150
  snippet: string;
44
- /** Match start byte offset within snippet. */
151
+ /** Primary token match (kept for backwards compatibility). Equal to `matches[0]`. */
45
152
  matchStart: number;
46
- /** Match end byte offset within snippet (exclusive). */
47
153
  matchEnd: number;
154
+ /** All matched token spans within `snippet`, in query order. Length 1–4. */
155
+ matches: MatchSpan[];
156
+ }
157
+
158
/**
 * Per-search options. All fields are optional.
 *
 *  `windowed`      — when true, return a cropped window around the primary
 *                    match instead of the full chunk text.
 *  `before/after`  — bytes of context to keep on each side of the primary
 *                    match. Defaults: 60 before, 120 after.
 *  `frameBudgetMs` — time slice used by `searchCooperative` (see below).
 */
export interface SearchOptions {
  windowed?: boolean;
  before?: number;
  after?: number;
  /**
   * Frame budget in milliseconds for `searchCooperative`. The engine
   * processes chunks until the budget is used up, then yields to the event
   * loop via `scheduler.yield()` (or a `requestAnimationFrame` fallback)
   * before resuming. Lower = smoother UI; higher = lower latency.
   *
   * Default: 8 ms (half a 60 fps frame). Ignored by synchronous `search()`.
   */
  frameBudgetMs?: number;
}
49
180
 
50
181
  export interface EngineStats {
@@ -53,6 +184,12 @@ export interface EngineStats {
53
184
  textUsed: number;
54
185
  textCapacity: number;
55
186
  wasmMemoryBytes: number;
187
+ /** Tier loaded at init time (mini/std/pro). */
188
+ tier: Tier | null;
189
+ /** Compile-time chunk capacity for the loaded tier. */
190
+ maxChunks: number;
191
+ /** Compile-time document capacity for the loaded tier. */
192
+ maxDocs: number;
56
193
  }
57
194
 
58
195
  export interface SearchStats {
@@ -147,7 +284,7 @@ function zipCentralDir(bytes: Uint8Array): { v: DataView; cdOff: number; cdN: nu
147
284
  const v = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
148
285
  let p = bytes.length - 22;
149
286
  while (p >= 0 && v.getUint32(p, true) !== 0x06054b50) p--;
150
- if (p < 0) throw new Error('Not a ZIP file');
287
+ if (p < 0) throw new AlbexParseError('zip', 'Not a ZIP file (no EOCD record)');
151
288
  return { v, cdOff: v.getUint32(p + 16, true), cdN: v.getUint16(p + 10, true) };
152
289
  }
153
290
 
@@ -176,7 +313,7 @@ async function findZipEntry(bytes: Uint8Array, name: string): Promise<Uint8Array
176
313
  }
177
314
  cp += 46 + nl + xl + cl;
178
315
  }
179
- throw new Error(`Entry "${name}" not found in ZIP`);
316
+ throw new AlbexParseError('zip', `Entry "${name}" not found in ZIP`);
180
317
  }
181
318
 
182
319
  async function decompEntry(bytes: Uint8Array, v: DataView, off: number, compSize: number): Promise<Uint8Array> {
@@ -202,7 +339,7 @@ async function decompEntry(bytes: Uint8Array, v: DataView, off: number, compSize
202
339
  for (const c of chunks) { out.set(c, o); o += c.length; }
203
340
  return out;
204
341
  }
205
- throw new Error(`Unsupported ZIP compression method ${meth}`);
342
+ throw new AlbexParseError('zip', `Unsupported ZIP compression method ${meth}`);
206
343
  }
207
344
 
208
345
  // ─────────────────────────────────────────────────────────────────────────────
@@ -211,50 +348,347 @@ async function decompEntry(bytes: Uint8Array, v: DataView, off: number, compSize
211
348
 
212
349
  const FEED_SIZE = 32_768; // 32 KB — fits in 64 KB scratchpad
213
350
 
351
+ // ─────────────────────────────────────────────────────────────────────────────
352
+ // Content hash — FNV-1a 64-bit
353
+ // ─────────────────────────────────────────────────────────────────────────────
354
+
355
+ /**
356
+ * Compute a 64-bit FNV-1a hash of `bytes` and return it as a 16-char hex
357
+ * string. FNV-1a is a non-cryptographic hash; chosen here because:
358
+ * - it needs zero dependencies,
359
+ * - it is fast on small/medium blobs (~100 MB/s in modern JS),
360
+ * - 64 bits is enough to deduplicate documents in a 128-doc library with
361
+ * vanishing collision probability.
362
+ *
363
+ * The result is stable across runs and engines, so it can be persisted in
364
+ * snapshots without versioning concerns.
365
+ */
366
/**
 * Compute the 64-bit Bloom value for a query string.
 *
 * This is meant to stay in sync with `BloomFilter::from_text` and
 * `fold_utf8_char` in `core/src/bloom.rs` (not visible from this file —
 * confirm against the Rust side when changing either).
 *
 * Folding: the query is lowercased, NFKD-normalised, and stripped of
 * combining marks (U+0300–U+036F), so accented Latin letters collapse to
 * their base letter. Every remaining ASCII code unit except the space
 * (0x20, the token separator) then sets bit `code & 0x3f`. That includes
 * ASCII punctuation and control characters, not only letters and digits.
 * Code units >= 0x80 contribute nothing.
 *
 * @param query Raw user query.
 * @returns The OR of all per-character bits, as a 64-bit-wide `bigint`.
 */
function computePatternBloom(query: string): bigint {
  // The combining-mark range is written with escapes so the pattern survives
  // any re-encoding or normalisation of this source file.
  const norm = query.toLowerCase().normalize('NFKD').replace(/[\u0300-\u036f]/g, '');
  let bits = 0n;
  for (let i = 0; i < norm.length; i++) {
    const code = norm.charCodeAt(i);
    // Skip the token separator and everything outside ASCII.
    if (code === 0x20 || code >= 0x80) continue;
    bits |= 1n << BigInt(code & 0x3f);
  }
  return bits;
}
394
+
395
/**
 * Compute the 64-bit FNV-1a hash of `bytes` and return it as a 16-character
 * lowercase hex string (high 32 bits first).
 *
 * FNV-1a is non-cryptographic; it is used here only as a cheap,
 * dependency-free content fingerprint. The state is kept as two unsigned
 * 32-bit halves so no BigInt arithmetic is needed in the per-byte loop.
 *
 * NOTE(review): this implements standard FNV-1a 64 (offset basis
 * 0xcbf29ce484222325, prime 0x100000001b3). Values differ from those
 * produced by the earlier arithmetic in this function, which added `lo`
 * instead of `lo << 8` to the high half and dropped a carry. Hashes
 * persisted by that version will not compare equal to these.
 *
 * @param bytes Raw bytes to fingerprint. An empty array hashes to the offset basis.
 * @returns 16 hex characters, zero-padded.
 */
function contentHash(bytes: Uint8Array): string {
  // Offset basis 0xcbf29ce484222325 split into unsigned halves.
  let hi = 0xcbf29ce4;
  let lo = 0x84222325;
  for (let i = 0; i < bytes.length; i++) {
    lo = (lo ^ bytes[i]!) >>> 0;
    // (hi:lo) *= 0x100000001b3, i.e. * (2^40 + 0x1b3), modulo 2^64.
    //
    // lo * 0x1b3 via 16-bit limbs so every partial product is an exact double.
    const lowProd = (lo & 0xffff) * 0x1b3;
    const highProd = (lo >>> 16) * 0x1b3 + (lowProd >>> 16);
    const carry = highProd >>> 16; // bits 32+ of lo * 0x1b3
    const nextLo = (((highProd & 0xffff) << 16) | (lowProd & 0xffff)) >>> 0;
    // High half: hi * 0x1b3 + carry + (lo * 2^40 >> 32 = lo << 8), modulo 2^32.
    hi = (Math.imul(hi, 0x1b3) + carry + (lo << 8)) >>> 0;
    lo = nextLo;
  }
  const hexHi = hi.toString(16).padStart(8, '0');
  const hexLo = lo.toString(16).padStart(8, '0');
  return hexHi + hexLo;
}
421
+
422
/**
 * Convert a 16-hex-character content hash into 8 raw bytes (per the original
 * note, for `setDocumentContentHash`).
 *
 * Byte `i` of the result is hex pair `i`, so the bytes appear in the same
 * order as the hex string reads. The inverse is `hashBytesToHex`.
 *
 * A missing or non-hex pair parses to NaN, which a `Uint8Array` stores as 0.
 * An empty string therefore yields eight zero bytes.
 *
 * @param hex Hex string; only the first 16 characters are read.
 * @returns A fresh 8-byte array.
 */
function hashHexToBytes(hex: string): Uint8Array {
  return Uint8Array.from({ length: 8 }, (_unused, pair) =>
    parseInt(hex.slice(pair * 2, pair * 2 + 2), 16),
  );
}
437
+
438
/**
 * Windows-1252 code points in the 0x80–0x9F block, keyed by byte value.
 *
 * This block is where cp1252 departs from Latin-1: it carries curly quotes,
 * dashes, the Euro sign and so on instead of C1 control codes. The five
 * bytes cp1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) have no entry.
 */
const _CP1252_HIGH: Record<number, string> = {
  0x80: '\u20AC', // €
  0x82: '\u201A', // ‚
  0x83: '\u0192', // ƒ
  0x84: '\u201E', // „
  0x85: '\u2026', // …
  0x86: '\u2020', // †
  0x87: '\u2021', // ‡
  0x88: '\u02C6', // ˆ
  0x89: '\u2030', // ‰
  0x8A: '\u0160', // Š
  0x8B: '\u2039', // ‹
  0x8C: '\u0152', // Œ
  0x8E: '\u017D', // Ž
  0x91: '\u2018', // ‘
  0x92: '\u2019', // ’
  0x93: '\u201C', // “
  0x94: '\u201D', // ”
  0x95: '\u2022', // •
  0x96: '\u2013', // –
  0x97: '\u2014', // —
  0x98: '\u02DC', // ˜
  0x99: '\u2122', // ™
  0x9A: '\u0161', // š
  0x9B: '\u203A', // ›
  0x9C: '\u0153', // œ
  0x9E: '\u017E', // ž
  0x9F: '\u0178', // Ÿ
};

/**
 * Map one Windows-1252 byte to its Unicode character. Per the original note
 * this serves the RTF parser's `\'XX` escapes, since RTF defaults to cp1252
 * for high-ANSI characters.
 *
 * @param byte Byte value to translate.
 * @returns The character. Bytes below 0x80 and from 0xA0 up map to the code
 *          point of the same value; 0x80–0x9F go through `_CP1252_HIGH`, and
 *          bytes undefined in cp1252 yield the empty string.
 */
function rtfCp1252ToChar(byte: number): string {
  const sameAsLatin1 = byte < 0x80 || byte >= 0xA0;
  if (sameAsLatin1) return String.fromCharCode(byte);
  return _CP1252_HIGH[byte] ?? '';
}
461
+
462
/**
 * Undo an entity's Content-Transfer-Encoding.
 *
 * `base64` and `quoted-printable` are decoded; every other value (7bit,
 * 8bit, absent, unrecognised) returns the body untouched, on the principle
 * that indexing slightly noisy text beats dropping the body.
 *
 * @param headersBlock Raw header text of the entity.
 * @param body         Entity body as found in the message.
 * @param header       Lookup callback: given a header block and a header
 *                     name, returns that header's value as a string. The
 *                     result is lowercased here before comparison.
 * @returns The decoded (or passed-through) body text.
 */
function decodeEmlBody(
  headersBlock: string,
  body: string,
  header: (block: string, name: string) => string,
): string {
  const encoding = header(headersBlock, 'Content-Transfer-Encoding').toLowerCase();
  switch (encoding) {
    case 'base64':
      return decodeBase64Utf8(body);
    case 'quoted-printable':
      return decodeQuotedPrintable(body);
    default:
      return body;
  }
}
478
+
479
/**
 * Decode a base64 body and read the resulting bytes as text through the
 * module's shared decoder `_dec`. Per the original note this serves the EML
 * parser when Content-Transfer-Encoding is base64.
 *
 * All whitespace (including the line breaks that wrap encoded mail bodies)
 * is removed before decoding. An all-whitespace body decodes to `''`.
 * If decoding throws (for example on malformed base64), the original
 * string is returned so the caller still has something to index.
 *
 * @param body Base64 text, possibly wrapped across lines.
 * @returns Decoded text, or `body` unchanged on failure.
 */
function decodeBase64Utf8(body: string): string {
  try {
    const compact = body.replace(/\s+/g, '');
    if (compact.length === 0) return '';
    // `atob` yields a "binary string" (one char per byte), so it has to be
    // turned back into real bytes before multi-byte sequences can decode.
    const octets = Uint8Array.from(atob(compact), ch => ch.charCodeAt(0));
    return _dec.decode(octets);
  } catch {
    return body;
  }
}
501
+
502
/**
 * Decode a quoted-printable body.
 *
 * Handles `=XX` hex escapes and soft line breaks: a `=` immediately
 * followed by a line ending produces nothing. Both `=\n` and the RFC 2045
 * wire form `=\r\n` are recognised. The collected bytes are then decoded
 * through the module's shared decoder `_dec`, so consecutive escapes such
 * as `=C3=A9` can form one multi-byte character.
 *
 * Best-effort rules:
 *  - a `=` not followed by a line ending or two hex digits is kept literally;
 *  - non-ASCII characters in the input are truncated to their low byte;
 *  - if decoding throws, the original `body` is returned.
 *
 * @param body Quoted-printable text.
 * @returns Decoded text, or `body` unchanged if decoding fails.
 */
function decodeQuotedPrintable(body: string): string {
  const hexPair = /^[0-9A-Fa-f]{2}$/; // hoisted: one RegExp for the whole scan
  const bytes: number[] = [];
  for (let i = 0; i < body.length; i++) {
    const c = body[i]!;
    if (c !== '=') {
      // ASCII pass-through; anything wider is clipped to its low byte.
      bytes.push(c.charCodeAt(0) & 0xff);
      continue;
    }
    const next = body[i + 1];
    // Soft line break, LF form.
    if (next === '\n') { i += 1; continue; }
    // Soft line break, CRLF form (what mail actually uses on the wire).
    if (next === '\r' && body[i + 2] === '\n') { i += 2; continue; }
    // `=XX` hex pair.
    const h = body.slice(i + 1, i + 3);
    if (hexPair.test(h)) {
      bytes.push(parseInt(h, 16));
      i += 2;
      continue;
    }
    // Malformed escape: keep the literal `=`.
    bytes.push(0x3d);
  }
  try {
    return _dec.decode(new Uint8Array(bytes));
  } catch {
    return body;
  }
}
538
+
539
/**
 * Inverse of `hashHexToBytes`: render the first 8 bytes as 16 lowercase hex
 * characters. Eight zero bytes mean "no hash known" and return `''`.
 *
 * @param bytes Source bytes; only indices 0..7 are read.
 * @returns 16 hex characters, or `''` when the first 8 bytes are all zero.
 */
function hashBytesToHex(bytes: Uint8Array): string {
  const octets = Array.from({ length: 8 }, (_unused, i) => bytes[i]);
  if (octets.every(b => b === 0)) return '';
  return octets.map(b => b!.toString(16).padStart(2, '0')).join('');
}
552
+
214
553
  // ─────────────────────────────────────────────────────────────────────────────
215
554
  // PDF WASM imports shim
216
555
  // ─────────────────────────────────────────────────────────────────────────────
217
556
 
218
- function makePdfWasmImports(getPdfMem: () => WebAssembly.Memory): WebAssembly.Imports {
557
/**
 * Build the import object for `albex_pdf.wasm` from the imports the compiled
 * module itself declares.
 *
 * Background (from the original note): the PDF module pulls in wasm-bindgen
 * via `getrandom`, and wasm-bindgen import names carry a build-time hash
 * (e.g. `__wbg_getRandomValues_3f44b700395062e5`). Hard-coding that hash
 * tied the loader to one exact build. Imports are therefore matched by
 * name prefix, so mangling changes do not break instantiation:
 *
 *   - `__wbg_getRandomValues*`, `__wbg_crypto*`      → fill memory via crypto.getRandomValues
 *   - `__wbindgen_describe*`, `__wbindgen_throw*`    → no-op
 *   - `__wbindgen_object_drop_ref`                   → heap-slot recycler
 *   - `__wbindgen_externref_table_grow`              → heap grower (returns old length)
 *   - `__wbindgen_externref_table_set_null`          → heap nuller
 *   - anything else                                  → stub that logs a console warning when called
 *
 * Every declared import receives a function, whatever kind the module
 * declared it as.
 *
 * @param module    Compiled PDF module whose import list is inspected.
 * @param getPdfMem Returns the PDF instance's memory, or null before it
 *                  exists. Consulted lazily, on each random-fill call.
 * @returns An import object with one entry per declared import.
 */
function makePdfWasmImports(
  module: WebAssembly.Module,
  getPdfMem: () => WebAssembly.Memory | null,
): WebAssembly.Imports {
  // Shared state for the externref-heap shims below.
  const heap: unknown[] = [];
  let freeIdx = -1;

  const randomPrefixes = ['__wbg_getRandomValues', '__wbg_crypto'];
  // Diagnostic / introspection imports — never invoked at runtime in our paths.
  const inertPrefixes = ['__wbindgen_describe', '__wbindgen_throw'];

  const fillRandom = (ptr: number, len: number): void => {
    const mem = getPdfMem();
    if (!mem) throw new Error('PDF WASM memory not initialised');
    crypto.getRandomValues(new Uint8Array(mem.buffer, ptr >>> 0, len >>> 0));
  };

  const bind = (modName: string, name: string): unknown => {
    if (randomPrefixes.some(prefix => name.startsWith(prefix))) return fillRandom;
    if (inertPrefixes.some(prefix => name.startsWith(prefix))) return () => {};

    if (name === '__wbindgen_object_drop_ref') {
      // Thread the freed slot onto the free list.
      return (idx: number) => { heap[idx] = freeIdx; freeIdx = idx; };
    }
    if (name === '__wbindgen_externref_table_grow') {
      return (delta: number) => {
        const previousLength = heap.length;
        for (let added = 0; added < delta; added++) heap.push(undefined);
        return previousLength;
      };
    }
    if (name === '__wbindgen_externref_table_set_null') {
      return (idx: number) => { heap[idx] = undefined; };
    }

    // Unknown import: loading still succeeds; a warning appears only if the
    // module actually calls it.
    return (...args: unknown[]) => {
      console.warn(`[albex] unhandled PDF WASM import ${modName}.${name}`, args);
    };
  };

  const imports: Record<string, Record<string, unknown>> = {};
  for (const entry of WebAssembly.Module.imports(module)) {
    const bucket = imports[entry.module] ?? (imports[entry.module] = {});
    bucket[entry.name] = bind(entry.module, entry.name);
  }
  return imports as WebAssembly.Imports;
}
242
629
 
243
630
  // ─────────────────────────────────────────────────────────────────────────────
244
631
  // AlbexEngine
245
632
  // ─────────────────────────────────────────────────────────────────────────────
246
633
 
634
/**
 * Shape of the value an attached OCR module resolves with. It is declared
 * structurally so the main package needs no runtime dependency on
 * `@albex/ocr`; the type is only a contract between the two.
 */
export interface OcrAttachedResult {
  text: string;
  confidence: number;
  timeMs: number;
}
+
645
/** Optional per-call settings forwarded to an attached OCR module. */
export interface OcrAttachedOptions {
  lang?: string;
  hint?: string;
}
+
247
650
  export class AlbexEngine {
248
651
  // ── main WASM ──
249
- private _wasm!: WebAssembly.Exports;
652
+ private _wasm!: AlbexWasmExports;
250
653
  private _mem!: WebAssembly.Memory;
251
654
 
655
+ /**
656
+ * OCR entry point installed by `@albex/ocr::enableOcr(engine)`. Undefined
657
+ * when the OCR module has not been wired. The main `albex` package has no
658
+ * runtime dependency on OCR — this is a structural slot that the optional
659
+ * companion package fills.
660
+ */
661
+ ocrImage?: (image: unknown, opts?: OcrAttachedOptions) => Promise<OcrAttachedResult>;
662
+
663
+ /**
664
+ * Optional OCR-side configuration set by `@albex/ocr::enableOcr`. Read
665
+ * by the engine to decide whether to invoke OCR on top of the text it
666
+ * already extracted from a PDF (hybrid PDFs: native text + images that
667
+ * also contain text, like stamps, scanned annexes, or diagrams with
668
+ * labels).
669
+ *
670
+ * When `alwaysExtractEmbeddedImages` is true, every page of every PDF
671
+ * passes through `extractPageImages` after the normal text extraction;
672
+ * any image that meets the size filter (200×200 in Rust) is fed to
673
+ * `ocrImage`. Performance cost: 1–3 s per qualifying image.
674
+ *
675
+ * Off by default — set this opt-in via the OCR module's options.
676
+ */
677
+ ocrConfig?: { alwaysExtractEmbeddedImages?: boolean };
678
+
252
679
  // ── PDF WASM (lazy) ──
253
- private _pdfWasm: WebAssembly.Exports | null = null;
680
+ private _pdfWasm: AlbexPdfExports | null = null;
254
681
  private _pdfMem: WebAssembly.Memory | null = null;
255
682
 
256
683
  private _docs: IndexedDocument[] = [];
257
684
  private _lastSearch: SearchStats | null = null;
685
+ private _tier: Tier | null = null;
686
+ private _simd: boolean = false;
687
+ private _profile: DeviceProfile | null = null;
688
+ private _resources: ResourceState | null = null;
689
+ private _gpu: BloomGpu | null = null;
690
+ private _gpuChunkCountUploaded = 0;
691
+ private _unsubscribeResources: (() => void) | null = null;
258
692
  private readonly _opts: AlbexOptions;
259
693
 
260
694
  constructor(opts: AlbexOptions) {
@@ -263,12 +697,161 @@ export class AlbexEngine {
263
697
 
264
698
  /** Load and initialise the main WASM module. Must be called before any other method. */
265
699
  async init(): Promise<void> {
266
- const res = await fetch(this._opts.wasmUrl);
267
- if (!res.ok) throw new Error(`Failed to fetch WASM: ${res.status}`);
700
+ const url = await this._resolveWasmUrl();
701
+ const res = await fetch(url);
702
+ if (!res.ok) throw new AlbexInitError(`Failed to fetch WASM: ${res.status} (${url})`);
268
703
  const { instance } = await WebAssembly.instantiateStreaming(res, {});
269
- this._wasm = instance.exports;
270
- this._mem = instance.exports.memory as WebAssembly.Memory;
271
- (this._wasm.init as Function)();
704
+ this._wasm = asAlbexExports(instance.exports);
705
+ this._mem = this._wasm.memory;
706
+ this._wasm.init();
707
+
708
+ // Subscribe to environmental signals. Cheap and benign in node tests
709
+ // (the manager tolerates missing globals).
710
+ const rm = getResourceManager();
711
+ await rm.start();
712
+ this._resources = rm.state;
713
+ this._unsubscribeResources = rm.on(s => { this._resources = s; });
714
+
715
+ // Lazily initialise the GPU Bloom accelerator. We don't acquire a device
716
+ // here yet — that happens on the first search that crosses the threshold.
717
+ // This keeps cold-start cost the same on GPU and CPU paths.
718
+ if (this._opts.gpu !== 'off') {
719
+ this._gpu = new BloomGpu();
720
+ }
721
+ }
722
+
723
+ /**
724
+ * Decide which `.wasm` binary to fetch. Order of precedence:
725
+ * 1. `opts.wasmUrl` if provided — used verbatim.
726
+ * 2. `opts.tier` if explicit — joined with `wasmBaseUrl`.
727
+ * 3. `opts.wasmBaseUrl` + tier picked from the device profile.
728
+ *
729
+ * Order of precedence:
730
+ * 1. `opts.wasmUrl` literal → use verbatim
731
+ * 2. `opts.wasmBaseUrl` + tier suffix → fetched from that directory
732
+ * 3. zero-config default → `albex_wasm_bg.wasm` packaged
733
+ * next to this file, resolved
734
+ * via `import.meta.url`
735
+ *
736
+ * The zero-config default loads the std-baseline binary. Tier auto-detection
737
+ * is only active when `wasmBaseUrl` is given, because picking a tier in
738
+ * runtime would defeat any bundler's static asset rewriting. Users who want
739
+ * tier optimisation must serve the six variants themselves and pass the
740
+ * directory through `wasmBaseUrl`.
741
+ */
742
+ private async _resolveWasmUrl(): Promise<string> {
743
+ const o = this._opts;
744
+ if (o.wasmUrl) {
745
+ this._profile = await detectProfile();
746
+ return o.wasmUrl;
747
+ }
748
+ // Always cache the profile so GPU/worker decisions later don't re-probe.
749
+ const profile = await detectProfile();
750
+ this._profile = profile;
751
+
752
+ // Path 3: zero-config — bundler-friendly default. `new URL(..., import.meta.url)`
753
+ // is recognised by Vite, Webpack 5+, esbuild, Rollup, Parcel 2 and Next.js
754
+ // as an asset reference. They copy the .wasm to the output directory and
755
+ // rewrite the URL automatically. Consumers who use one of those bundlers
756
+ // get a working `new AlbexEngine()` with no manual setup.
757
+ if (!o.wasmBaseUrl) {
758
+ // We can't tier-select with one URL, so fall back to std baseline.
759
+ // The integrator who wants tier optimisation must opt in via wasmBaseUrl.
760
+ this._tier = 'std';
761
+ this._simd = false;
762
+ return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
763
+ }
764
+
765
+ let tier: Tier;
766
+ if (o.tier && o.tier !== 'auto') tier = o.tier;
767
+ else tier = pickTier(profile);
768
+ this._tier = tier;
769
+
770
+ const simd = o.simd === 'on'
771
+ ? true
772
+ : o.simd === 'off'
773
+ ? false
774
+ : !!profile?.wasm.simd;
775
+ this._simd = simd;
776
+
777
+ const suffix = simd ? `${tier}_simd` : tier;
778
+ const base = o.wasmBaseUrl.replace(/\/+$/, '');
779
+ return `${base}/albex_wasm_${suffix}.wasm`;
780
+ }
781
+
782
+ /** The tier that was actually loaded. `null` until `init()` resolves. */
783
+ get tier(): Tier | null { return this._tier; }
784
+
785
+ /** True if the SIMD-accelerated binary was loaded. */
786
+ get simdEnabled(): boolean { return this._simd; }
787
+
788
+ /** True if a WebGPU device is acquired and the next search will use it. */
789
+ get gpuEngaged(): boolean { return !!this._gpu?.available; }
790
+
791
+ // ── GPU acceleration (CD1) ───────────────────────────────────────────────
792
+
793
+ /**
794
+ * Decide whether to use the GPU pre-filter for the upcoming search.
795
+ *
796
+ * Policy:
797
+ * - `gpu: 'off'` → never.
798
+ * - `gpu: 'on'` → always try (still fails over to CPU).
799
+ * - `gpu: 'auto'` (default) → only when WebGPU is available AND
800
+ * chunk count crosses `gpuThreshold`.
801
+ */
802
+ private _shouldEngageGpu(): boolean {
803
+ const o = this._opts;
804
+ if (!this._gpu) return false;
805
+ if (o.gpu === 'off') return false;
806
+ if (o.gpu === 'on') return true;
807
+ if (!this._profile) return false;
808
+ const threshold = o.gpuThreshold ?? 20_000;
809
+ return shouldUseGpu(this._profile, this._wasm.getChunkCount(), threshold);
810
+ }
811
+
812
+ /**
813
+ * Run the GPU Bloom scan and install the resulting candidate bitset into
814
+ * WASM. The next `searchBegin` will see the mask and `searchSlice` will
815
+ * restrict its Bitap pass to those candidates.
816
+ *
817
+ * No-op if the GPU device hasn't been acquired yet — first call attempts
818
+ * `init()` lazily; if that fails, the candidate path is permanently
819
+ * disabled for this engine instance.
820
+ */
821
+ private async _gpuPreFilter(wasmQuery: string): Promise<void> {
822
+ const gpu = this._gpu;
823
+ if (!gpu) return;
824
+ if (!gpu.available) {
825
+ const ok = await gpu.init();
826
+ if (!ok) { this._gpu = null; return; }
827
+ }
828
+
829
+ const chunkCount = this._wasm.getChunkCount();
830
+ if (chunkCount === 0) return;
831
+
832
+ // Upload blooms if the corpus changed. We re-upload everything on any
833
+ // delta; incremental delta-upload is a future optimisation.
834
+ if (chunkCount !== this._gpuChunkCountUploaded) {
835
+ const ptr = this._wasm.getChunksPtr();
836
+ const stride = this._wasm.getChunkStructSize();
837
+ const bytes = new Uint8Array(this._mem.buffer, ptr, chunkCount * stride);
838
+ const blooms = packBloomsFromChunks(bytes, chunkCount);
839
+ gpu.uploadChunkBlooms(blooms, chunkCount);
840
+ this._gpuChunkCountUploaded = chunkCount;
841
+ }
842
+
843
+ // Build the pattern Bloom on the JS side: same hash as Rust
844
+ // (`c & 0x3F` after accent-folding), aggregated across all tokens.
845
+ const patternBloom = computePatternBloom(wasmQuery);
846
+ const passes = await gpu.scan(
847
+ Number(patternBloom & 0xffffffffn),
848
+ Number((patternBloom >> 32n) & 0xffffffffn),
849
+ );
850
+
851
+ // Push the bitset back into WASM via the scratchpad.
852
+ const passBytes = new Uint8Array(passes.buffer, passes.byteOffset, passes.byteLength);
853
+ this._writePad(passBytes);
854
+ this._wasm.setCandidateMask(passBytes.byteLength);
272
855
  }
273
856
 
274
857
  // ── Internal helpers ──────────────────────────────────────────────────────
@@ -278,8 +861,8 @@ export class AlbexEngine {
278
861
  }
279
862
 
280
863
  private _writePad(b: Uint8Array): number {
281
- const ptr = (this._wasm.getBuffer as Function)(b.length) as number;
282
- if (!ptr) throw new Error('Scratchpad too small for this chunk');
864
+ const ptr = this._wasm.getBuffer(b.length);
865
+ if (!ptr) throw new AlbexCapacityError(`Scratchpad too small for ${b.length} bytes`);
283
866
  this._u8(ptr, b.length).set(b);
284
867
  return ptr;
285
868
  }
@@ -291,7 +874,7 @@ export class AlbexEngine {
291
874
  }
292
875
 
293
876
  private _readPad(n: number): string {
294
- const ptr = (this._wasm.getBuffer as Function)(0) as number;
877
+ const ptr = this._wasm.getBuffer(0);
295
878
  return _dec.decode(this._u8(ptr, n));
296
879
  }
297
880
 
@@ -300,15 +883,16 @@ export class AlbexEngine {
300
883
  for (let i = 0; i < b.length; i += FEED_SIZE) {
301
884
  const c = b.subarray(i, i + FEED_SIZE);
302
885
  this._writePad(c);
303
- (this._wasm.feedText as Function)(c.length);
886
+ this._wasm.feedText(c.length);
304
887
  }
305
888
  }
306
889
 
307
890
  private _feedXmlBytes(xml: Uint8Array, fn: 'feedXmlBytes' | 'feedXlsxBytes'): void {
891
+ const feeder = this._wasm[fn];
308
892
  for (let i = 0; i < xml.length; i += FEED_SIZE) {
309
893
  const c = xml.subarray(i, i + FEED_SIZE);
310
894
  this._writePad(c);
311
- (this._wasm[fn] as Function)(c.length);
895
+ feeder(c.length);
312
896
  }
313
897
  }
314
898
 
@@ -316,30 +900,42 @@ export class AlbexEngine {
316
900
 
317
901
  private async _ensurePdfWasm(): Promise<void> {
318
902
  if (this._pdfWasm) return;
319
- if (!this._opts.pdfWasmUrl) throw new Error('pdfWasmUrl not set in AlbexOptions');
320
- const res = await fetch(this._opts.pdfWasmUrl);
321
- if (!res.ok) throw new Error(`Failed to fetch PDF WASM: ${res.status}`);
322
- const imports = makePdfWasmImports(() => this._pdfMem!);
323
- const { instance } = await WebAssembly.instantiateStreaming(res, imports);
324
- this._pdfWasm = instance.exports;
325
- this._pdfMem = instance.exports.memory as WebAssembly.Memory;
903
+ // Zero-config default: resolve relative to this module so bundlers copy
904
+ // the .wasm to the output automatically. Override with `opts.pdfWasmUrl`
905
+ // when serving from a separate CDN.
906
+ const pdfUrl = this._opts.pdfWasmUrl
907
+ ?? new URL('../wasm/pkg/albex_pdf.wasm', import.meta.url).href;
908
+ // Network politeness: on constrained connections (slow-2g/2g/saveData)
909
+ // we still fetch on explicit user request — `_ensurePdfWasm` is only
910
+ // called when the user actually drops a PDF — but we issue a console
911
+ // hint so embedders can surface a "this will download ~1 MB" prompt.
912
+ if (this._resources?.constrainedNetwork) {
913
+ console.info('[albex] downloading PDF WASM (~1 MB) on a constrained network connection');
914
+ }
915
+ const res = await fetch(pdfUrl);
916
+ if (!res.ok) throw new AlbexInitError(`Failed to fetch PDF WASM: ${res.status}`);
917
+ // Compile first so we can inspect the module's required imports and
918
+ // resolve mangled wasm-bindgen names by prefix rather than by hash.
919
+ const module = await WebAssembly.compileStreaming(res);
920
+ const imports = makePdfWasmImports(module, () => this._pdfMem);
921
+ const instance = await WebAssembly.instantiate(module, imports);
922
+ this._pdfWasm = asAlbexPdfExports(instance.exports);
923
+ this._pdfMem = this._pdfWasm.memory;
326
924
  }
327
925
 
328
926
  // ── Indexers ──────────────────────────────────────────────────────────────
329
927
 
330
- private async _indexDocx(file: File): Promise<number> {
331
- const bytes = new Uint8Array(await file.arrayBuffer());
332
- const xml = await findZipEntry(bytes, 'word/document.xml');
333
- (this._wasm.setDocumentName as Function)(this._writeStr(file.name));
334
- (this._wasm.beginDocument as Function)();
928
+ private async _indexDocx(file: File, bytes: Uint8Array): Promise<number> {
929
+ const xml = await findZipEntry(bytes, 'word/document.xml');
930
+ this._wasm.setDocumentName(this._writeStr(file.name));
931
+ this._wasm.beginDocument();
335
932
  this._feedXmlBytes(xml, 'feedXmlBytes');
336
- return (this._wasm.endDocument as Function)() as number;
933
+ return this._wasm.endDocument();
337
934
  }
338
935
 
339
- private async _indexXlsx(file: File): Promise<number> {
340
- const bytes = new Uint8Array(await file.arrayBuffer());
341
- (this._wasm.setDocumentName as Function)(this._writeStr(file.name));
342
- (this._wasm.beginXlsx as Function)();
936
+ private async _indexXlsx(file: File, bytes: Uint8Array): Promise<number> {
937
+ this._wasm.setDocumentName(this._writeStr(file.name));
938
+ this._wasm.beginXlsx();
343
939
 
344
940
  try {
345
941
  const xml = await findZipEntry(bytes, 'xl/sharedStrings.xml');
@@ -354,80 +950,704 @@ export class AlbexEngine {
354
950
  } catch { /* skip corrupt/missing sheet */ }
355
951
  }
356
952
 
357
- return (this._wasm.endDocument as Function)() as number;
953
+ return this._wasm.endDocument();
358
954
  }
359
955
 
360
- private async _indexPdf(file: File): Promise<number> {
956
+ private async _indexPdf(file: File, bytes: Uint8Array): Promise<number> {
361
957
  await this._ensurePdfWasm();
362
- const pw = this._pdfWasm!;
363
- const pm = this._pdfMem!;
364
- const bytes = new Uint8Array(await file.arrayBuffer());
365
-
366
- const inPtr = (pw.allocInput as Function)(bytes.length) as number;
958
+ let pw = this._pdfWasm;
959
+ let pm = this._pdfMem;
960
+ if (!pw || !pm) throw new AlbexInitError('PDF WASM not initialised');
961
+
962
+ // Reserve input buffer and copy bytes. allocInput may trigger a
963
+ // memory.grow inside the PDF module; the previous pm.buffer would
964
+ // become detached. Refresh the memory reference before constructing
965
+ // the view to be safe.
966
+ const inPtr = pw.allocInput(bytes.length);
967
+ pm = pw.memory;
367
968
  new Uint8Array(pm.buffer, inPtr, bytes.length).set(bytes);
368
- const pageCount = (pw.extractPdf as Function)(bytes.length) as number;
369
969
 
370
- (this._wasm.setDocumentName as Function)(this._writeStr(file.name));
371
- (this._wasm.beginDocument as Function)();
970
+ // extractPdf can panic inside pdf-extract/lopdf for PDFs that other
971
+ // tools accept (encrypted streams without password, exotic font
972
+ // dictionaries, malformed cross-reference tables, etc.). The crate
973
+ // is built with panic="abort" (required on wasm32-unknown-unknown
974
+ // — no unwinding), so the panic surfaces as a WASM `unreachable`
975
+ // trap and the module instance becomes unusable.
976
+ //
977
+ // Recovery strategy when this happens:
978
+ // 1. Discard the poisoned instance.
979
+ // 2. If OCR is wired AND the rebuilt binary supports image
980
+ // extraction, re-instantiate, reload the input bytes, and try
981
+ // the lopdf-only image-extraction path. lopdf is a separate
982
+ // parser from pdf-extract's text codec — there are real PDFs
983
+ // that pdf-extract trips on but lopdf walks fine, and we can
984
+ // recover the page images even when we cannot recover the
985
+ // vector text.
986
+ // 3. If OCR isn't wired (or the recovery also fails), surface a
987
+ // helpful AlbexParseError that points the user at the fix.
988
+ let pageCount: number;
989
+ try {
990
+ pageCount = pw.extractPdf(bytes.length);
991
+ } catch (e) {
992
+ this._pdfWasm = null;
993
+ this._pdfMem = null;
994
+ const msg = e instanceof Error ? e.message : String(e);
995
+
996
+ // Try the OCR fallback before giving up.
997
+ if (this.ocrImage) {
998
+ const recovered = await this._indexPdfViaImagesOnly(file, bytes, msg);
999
+ if (recovered !== null) return recovered;
1000
+ }
1001
+
1002
+ throw new AlbexParseError(
1003
+ 'pdf',
1004
+ `PDF text extractor crashed (${msg}). ` +
1005
+ (this.ocrImage
1006
+ ? 'OCR fallback also could not recover any content from this file.'
1007
+ : 'Enable OCR via @albex/ocr to attempt image-based extraction as a fallback.'),
1008
+ );
1009
+ }
1010
+ // Refresh memory once more — extractPdf can grow it too.
1011
+ pm = pw.memory;
1012
+
1013
+ this._wasm.setDocumentName(this._writeStr(file.name));
1014
+ this._wasm.beginDocument();
372
1015
 
373
1016
  if (pageCount === -2) {
374
- // Image-only PDF register doc with zero chunks.
375
- return (this._wasm.endDocument as Function)() as number;
1017
+ // Image-only (scanned) PDF. If OCR is wired AND the PDF binary
1018
+ // supports image extraction, fall through to the scanned-PDF path.
1019
+ // Otherwise keep today's behaviour: register the doc with 0 chunks
1020
+ // so the user sees the file in the index but searches won't hit it.
1021
+ const supportsImages = typeof pw.extractPageImages === 'function'
1022
+ && typeof pw.getPageCount === 'function';
1023
+ if (this.ocrImage && supportsImages) {
1024
+ await this._indexPdfScanned(pw);
1025
+ }
1026
+ return this._wasm.endDocument();
376
1027
  }
377
1028
  if (pageCount < 0) {
378
- const errLen = (pw.getErrorLen as Function)() as number;
379
- const errPtr = (pw.getErrorPtr as Function)() as number;
1029
+ const errLen = pw.getErrorLen();
1030
+ const errPtr = pw.getErrorPtr();
380
1031
  const msg = errLen > 0
381
- ? new TextDecoder().decode(new Uint8Array(pm.buffer, errPtr, errLen))
1032
+ ? _dec.decode(new Uint8Array(pm.buffer, errPtr, errLen))
382
1033
  : 'PDF parse error';
383
- throw new Error(msg);
1034
+ throw new AlbexParseError('pdf', msg);
384
1035
  }
385
1036
 
386
1037
  for (let p = 0; p < pageCount; p++) {
387
- const len = (pw.getPageLen as Function)(p) as number;
1038
+ const len = pw.getPageLen(p);
388
1039
  if (!len) continue;
389
- const text = new TextDecoder('utf-8').decode(
390
- new Uint8Array(pm.buffer, (pw.getPagePtr as Function)(p) as number, len)
391
- );
1040
+ // Re-read memory each iteration — feedText writes into the main
1041
+ // WASM, but reading the PDF page pointers requires the live PDF
1042
+ // memory which may have been grown by intermediate calls.
1043
+ const liveMem = pw.memory;
1044
+ const text = _dec.decode(new Uint8Array(liveMem.buffer, pw.getPagePtr(p), len));
392
1045
  this._feedText(text);
393
- (this._wasm.flushParagraph as Function)();
1046
+ this._wasm.flushParagraph();
1047
+ }
1048
+
1049
+ // Hybrid OCR pass: when the OCR module is wired with
1050
+ // `alwaysExtractEmbeddedImages: true`, also walk every page for
1051
+ // embedded images and OCR them on top of the vector text.
1052
+ //
1053
+ // We always log the decision so users debugging "why isn't OCR
1054
+ // firing on my hybrid PDF" can see which precondition failed.
1055
+ const hybridOn = !!this.ocrConfig?.alwaysExtractEmbeddedImages;
1056
+ const hasOcr = !!this.ocrImage;
1057
+ const binSupportsImages = typeof pw.extractPageImages === 'function'
1058
+ && typeof pw.getPageCount === 'function';
1059
+ console.log(`[albex] hybrid OCR decision: ocrImage=${hasOcr} ocrConfig.alwaysExtractEmbeddedImages=${hybridOn} binarySupportsImages=${binSupportsImages}`);
1060
+
1061
+ if (hasOcr && hybridOn && binSupportsImages) {
1062
+ const totalPages = pw.getPageCount();
1063
+ console.log(`[albex] hybrid OCR pass starting over ${totalPages} page(s)`);
1064
+ for (let p = 0; p < totalPages; p++) {
1065
+ const ocrText = await this._ocrPageEmbeddedImages(pw, p);
1066
+ if (ocrText === null) break; // WASM trapped, stop hybrid pass.
1067
+ if (ocrText) {
1068
+ this._feedText(ocrText);
1069
+ this._wasm.flushParagraph();
1070
+ }
1071
+ }
1072
+ }
1073
+
1074
+ return this._wasm.endDocument();
1075
+ }
1076
+
1077
+ /**
1078
+ * Scanned-PDF OCR fallback. Called from `_indexPdf` when `extractPdf`
1079
+ * returns `-2` (image-only PDF) AND `@albex/ocr` has been wired via
1080
+ * `enableOcr(engine)`.
1081
+ *
1082
+ * Walks every page of the PDF, extracts embedded JPEG / JPEG2000 image
1083
+ * XObjects, runs each through `engine.ocrImage`, and feeds the recognised
1084
+ * text into the index — one paragraph per page so search snippets stay
1085
+ * tied to the page they came from.
1086
+ *
1087
+ * Failure modes handled here (none re-thrown — the goal is best-effort
1088
+ * indexing, not all-or-nothing):
1089
+ *
1090
+ * * A page's `extractPageImages` traps the WASM instance: the instance
1091
+ * is discarded so the next PDF starts fresh, and we stop iterating
1092
+ * (no more pages can be read from a poisoned instance). The doc is
1093
+ * still committed with whatever text we got from earlier pages.
1094
+ * * An individual image fails to OCR (Tesseract decode error, JP2 not
1095
+ * supported in this browser, etc.): we skip that image and keep
1096
+ * going. Partial coverage beats nothing.
1097
+ * * A page yields no extractable images (e.g. uses Flate/CCITT/JBIG2):
1098
+ * no paragraph is emitted; the page contributes 0 chunks.
1099
+ */
1100
+ private async _indexPdfScanned(pw: AlbexPdfExports): Promise<void> {
1101
+ if (!this.ocrImage) return;
1102
+ const totalPages = pw.getPageCount();
1103
+ if (!totalPages) return;
1104
+
1105
+ for (let p = 0; p < totalPages; p++) {
1106
+ const pageText = await this._ocrPageEmbeddedImages(pw, p);
1107
+ if (pageText === null) return; // WASM poisoned mid-iteration.
1108
+ if (pageText) {
1109
+ this._feedText(pageText);
1110
+ this._wasm.flushParagraph();
1111
+ }
1112
+ }
1113
+ }
1114
+
1115
+ /**
1116
+ * Walk one page's embedded image XObjects, OCR each image, and return
1117
+ * the joined recognised text for that page.
1118
+ *
1119
+ * Used by:
1120
+ * - `_indexPdfScanned`: image-only PDFs (extractPdf returned -2).
1121
+ * - `_indexPdf` hybrid path: when `ocrConfig.alwaysExtractEmbeddedImages`
1122
+ * is set, every page goes through here on top of the normal text
1123
+ * extraction.
1124
+ *
1125
+ * Returns:
1126
+ * - The recognised text (possibly empty if the page has no qualifying
1127
+ * images or every OCR call failed).
1128
+ * - `null` if the PDF WASM trapped during extractPageImages — the
1129
+ * caller should abort the remaining pages because the instance is
1130
+ * now poisoned.
1131
+ *
1132
+ * Failure-handling philosophy: best-effort. An OCR failure on one image
1133
+ * does not stop the page; a page with no images does not stop the doc;
1134
+ * only a WASM trap stops the doc.
1135
+ */
1136
+ private async _ocrPageEmbeddedImages(
1137
+ pw: AlbexPdfExports,
1138
+ page: number,
1139
+ ): Promise<string | null> {
1140
+ const ocr = this.ocrImage;
1141
+ if (!ocr) return '';
1142
+
1143
+ let imageCount: number;
1144
+ try {
1145
+ imageCount = pw.extractPageImages(page);
1146
+ } catch (e) {
1147
+ // The PDF module just trapped — it is now poisoned. Drop our refs
1148
+ // so `_ensurePdfWasm` re-instantiates on the next call.
1149
+ this._pdfWasm = null;
1150
+ this._pdfMem = null;
1151
+ console.warn(`[albex] PDF image extractor trapped on page ${page + 1}: ${e instanceof Error ? e.message : String(e)}. Stopping OCR.`);
1152
+ return null;
1153
+ }
1154
+ if (imageCount <= 0) return '';
1155
+
1156
+ // The buffer view must be re-acquired AFTER extractPageImages —
1157
+ // it may have grown the linear memory and detached old views.
1158
+ const liveMem = pw.memory;
1159
+ let pageText = '';
1160
+
1161
+ for (let i = 0; i < imageCount; i++) {
1162
+ const len = pw.getPageImageLen(i);
1163
+ if (!len) continue;
1164
+ const ptr = pw.getPageImagePtr(i);
1165
+ const kind = pw.getPageImageKind(i);
1166
+ const mime = kind === 1 ? 'image/jpeg'
1167
+ : kind === 2 ? 'image/jp2'
1168
+ : 'application/octet-stream';
1169
+
1170
+ // Snapshot the image bytes into a fresh ArrayBuffer. The pointer
1171
+ // returned by getPageImagePtr is only valid until the next
1172
+ // extractPageImages / extractPdf call, so we cannot hold the view.
1173
+ const copy = new Uint8Array(len);
1174
+ copy.set(new Uint8Array(liveMem.buffer, ptr, len));
1175
+ const blob = new Blob([copy.buffer as ArrayBuffer], { type: mime });
1176
+
1177
+ // Defensive diagnostics: when an OCR call goes wrong (Tesseract
1178
+ // worker abort, malformed JPEG, etc.) the first thing we want to
1179
+ // see is whether we even handed it valid image bytes. A real JPEG
1180
+ // starts with FF D8 FF (E0 for JFIF, E1 for EXIF). A JPEG2000
1181
+ // starts with 00 00 00 0C 6A 50 20 20.
1182
+ const magic = Array.from(copy.subarray(0, 4))
1183
+ .map(b => b.toString(16).padStart(2, '0'))
1184
+ .join(' ');
1185
+ console.log(`[albex] OCR page ${page + 1} image ${i + 1}/${imageCount}: kind=${kind} (${mime}) len=${len} bytes magic=${magic}`);
1186
+
1187
+ try {
1188
+ const { text } = await ocr(blob);
1189
+ const trimmed = text?.trim();
1190
+ if (trimmed) {
1191
+ pageText = pageText ? `${pageText} ${trimmed}` : trimmed;
1192
+ }
1193
+ } catch (e) {
1194
+ // Image-level OCR failure — skip and continue. JP2 in browsers
1195
+ // without native support lands here; so do truncated or
1196
+ // unsupported JPEG variants. Worker aborts (Tesseract.js
1197
+ // "Aborted(-1)") are also caught here; if they bypass the
1198
+ // promise rejection and surface as `uncaught` instead, the
1199
+ // demo's window.onerror handler will keep the app alive.
1200
+ console.warn(`[albex] OCR failed on page ${page + 1} image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`);
1201
+ }
394
1202
  }
395
1203
 
396
- return (this._wasm.endDocument as Function)() as number;
1204
+ return pageText;
397
1205
  }
398
1206
 
399
- private async _indexTxt(file: File): Promise<number> {
400
- const text = await file.text();
401
- (this._wasm.setDocumentName as Function)(this._writeStr(file.name));
402
- (this._wasm.beginDocument as Function)();
1207
+ /**
1208
+ * Last-chance OCR path used when `extractPdf` itself trapped (pdf-extract
1209
+ * crashed but lopdf may still be able to walk the file). Re-instantiates
1210
+ * the PDF WASM, reloads the input bytes, and tries the image-extraction
1211
+ * route directly — bypassing the text codec entirely.
1212
+ *
1213
+ * Returns:
1214
+ * * the doc's chunk count on success (even 0 — that means lopdf could
1215
+ * parse but no qualifying images existed, which still beats a hard
1216
+ * parse error),
1217
+ * * null if the recovery itself failed (binary lacks the image exports,
1218
+ * re-instantiation failed, or lopdf also trapped). In the null case
1219
+ * the caller throws AlbexParseError so the user sees a clear message.
1220
+ */
1221
+ private async _indexPdfViaImagesOnly(
1222
+ file: File,
1223
+ bytes: Uint8Array,
1224
+ originalError: string,
1225
+ ): Promise<number | null> {
1226
+ try {
1227
+ await this._ensurePdfWasm();
1228
+ } catch {
1229
+ return null;
1230
+ }
1231
+ const pw = this._pdfWasm;
1232
+ if (!pw) return null;
1233
+
1234
+ const supportsImages = typeof pw.extractPageImages === 'function'
1235
+ && typeof pw.getPageCount === 'function';
1236
+ if (!supportsImages) return null;
1237
+
1238
+ // Reload input bytes into the fresh instance. allocInput may grow the
1239
+ // memory, so re-acquire the buffer view immediately after.
1240
+ let inPtr: number;
1241
+ try {
1242
+ inPtr = pw.allocInput(bytes.length);
1243
+ new Uint8Array(pw.memory.buffer, inPtr, bytes.length).set(bytes);
1244
+ } catch (e) {
1245
+ console.warn(`[albex] PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`);
1246
+ return null;
1247
+ }
1248
+
1249
+ // Set up the doc and let _indexPdfScanned do the page-by-page walk.
1250
+ // _indexPdfScanned tolerates lopdf failing mid-stream — it caches the
1251
+ // poisoned instance and returns early. If lopdf trips on the very
1252
+ // first page, no paragraphs are emitted and we end up with 0 chunks.
1253
+ this._wasm.setDocumentName(this._writeStr(file.name));
1254
+ this._wasm.beginDocument();
1255
+ console.info(`[albex] pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf for ${file.name}`);
1256
+ await this._indexPdfScanned(pw);
1257
+ return this._wasm.endDocument();
1258
+ }
1259
+
1260
+ private async _indexTxt(file: File, bytes: Uint8Array): Promise<number> {
1261
+ const text = _dec.decode(bytes);
1262
+ this._wasm.setDocumentName(this._writeStr(file.name));
1263
+ this._wasm.beginDocument();
403
1264
  for (const para of text.split(/\n{2,}/)) {
404
1265
  const l = para.replace(/\n/g, ' ').trim();
405
- if (l) { this._feedText(l); (this._wasm.flushParagraph as Function)(); }
1266
+ if (l) { this._feedText(l); this._wasm.flushParagraph(); }
406
1267
  }
407
- return (this._wasm.endDocument as Function)() as number;
1268
+ return this._wasm.endDocument();
408
1269
  }
409
1270
 
410
- private async _indexXml(file: File): Promise<number> {
411
- const plain = (await file.text())
1271
+ private async _indexXml(file: File, bytes: Uint8Array): Promise<number> {
1272
+ const plain = _dec.decode(bytes)
412
1273
  .replace(/<[^]*?>/g, '\n')
413
1274
  .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>')
414
1275
  .replace(/&quot;/g, '"').replace(/&apos;/g, "'")
415
1276
  .replace(/[ \t]+/g, ' ').trim();
416
- (this._wasm.setDocumentName as Function)(this._writeStr(file.name));
417
- (this._wasm.beginDocument as Function)();
1277
+ this._wasm.setDocumentName(this._writeStr(file.name));
1278
+ this._wasm.beginDocument();
418
1279
  for (const seg of plain.split(/\n{2,}/)) {
419
1280
  const l = seg.replace(/\n/g, ' ').trim();
420
- if (l) { this._feedText(l); (this._wasm.flushParagraph as Function)(); }
1281
+ if (l) { this._feedText(l); this._wasm.flushParagraph(); }
1282
+ }
1283
+ return this._wasm.endDocument();
1284
+ }
1285
+
1286
+ // ── Markdown ─────────────────────────────────────────────────────────────
1287
+ // Strip CommonMark inline marks but keep word content. Paragraphs split on
1288
+ // blank lines, same convention as TXT/XML.
1289
+ private async _indexMd(file: File, bytes: Uint8Array): Promise<number> {
1290
+ const text = _dec.decode(bytes)
1291
+ // Remove fenced code blocks entirely (often noisy for search relevance).
1292
+ .replace(/```[\s\S]*?```/g, '\n')
1293
+ .replace(/~~~[\s\S]*?~~~/g, '\n')
1294
+ // Strip ATX heading markers but keep heading text.
1295
+ .replace(/^#{1,6}\s+/gm, '')
1296
+ // Replace inline links/images with their visible text.
1297
+ .replace(/!\[([^\]]*)\]\([^)]*\)/g, '$1')
1298
+ .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1')
1299
+ // Strip emphasis markers (preserve content).
1300
+ .replace(/(\*\*|__|\*|_)/g, '')
1301
+ // Inline code.
1302
+ .replace(/`([^`]+)`/g, '$1')
1303
+ // Blockquote marks.
1304
+ .replace(/^>\s?/gm, '')
1305
+ // List markers.
1306
+ .replace(/^\s*[-*+]\s+/gm, '')
1307
+ .replace(/^\s*\d+\.\s+/gm, '');
1308
+ this._wasm.setDocumentName(this._writeStr(file.name));
1309
+ this._wasm.beginDocument();
1310
+ for (const para of text.split(/\n{2,}/)) {
1311
+ const l = para.replace(/\n/g, ' ').trim();
1312
+ if (l) { this._feedText(l); this._wasm.flushParagraph(); }
1313
+ }
1314
+ return this._wasm.endDocument();
1315
+ }
1316
+
1317
+ // ── HTML ─────────────────────────────────────────────────────────────────
1318
+ // Strip <script>/<style> entire blocks, then drop tag markup. The output is
1319
+ // chunked at <p>, <br>, <h*>, <li>, <tr> boundaries (mapped to paragraph
1320
+ // breaks) so search location numbers map naturally to the document outline.
1321
+ private async _indexHtml(file: File, bytes: Uint8Array): Promise<number> {
1322
+ const html = _dec.decode(bytes)
1323
+ .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, ' ')
1324
+ .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, ' ')
1325
+ // Treat block-level closers as paragraph separators.
1326
+ .replace(/<\/(p|h[1-6]|li|tr|div|section|article|header|footer)\s*>/gi, '\n\n')
1327
+ .replace(/<br\s*\/?\s*>/gi, '\n')
1328
+ // Drop remaining tags.
1329
+ .replace(/<[^>]+>/g, ' ')
1330
+ // Decode common entities (full set would need a table; this covers >95%).
1331
+ .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>')
1332
+ .replace(/&quot;/g, '"').replace(/&apos;/g, "'").replace(/&nbsp;/g, ' ')
1333
+ .replace(/&#(\d+);/g, (_, n) => String.fromCodePoint(Number(n)))
1334
+ .replace(/&#x([0-9a-f]+);/gi, (_, n) => String.fromCodePoint(parseInt(n, 16)))
1335
+ .replace(/[ \t]+/g, ' ');
1336
+ this._wasm.setDocumentName(this._writeStr(file.name));
1337
+ this._wasm.beginDocument();
1338
+ for (const para of html.split(/\n{2,}/)) {
1339
+ const l = para.replace(/\n/g, ' ').trim();
1340
+ if (l) { this._feedText(l); this._wasm.flushParagraph(); }
1341
+ }
1342
+ return this._wasm.endDocument();
1343
+ }
1344
+
1345
+ // ── JSON ─────────────────────────────────────────────────────────────────
1346
+ // Extract every string value (keys + leaf strings) recursively. Each leaf
1347
+ // becomes its own searchable chunk via paragraph flush. Numbers/booleans
1348
+ // are skipped (cannot match a textual query usefully).
1349
/**
 * Index a JSON document: every non-blank object key and string leaf becomes
 * its own paragraph, visited depth-first in document order. Numbers,
 * booleans and null contribute nothing.
 *
 * @throws AlbexParseError when the bytes are not valid JSON.
 */
private async _indexJson(file: File, bytes: Uint8Array): Promise<number> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(_dec.decode(bytes));
  } catch (err) {
    throw new AlbexParseError('json', (err as Error).message);
  }

  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();

  // Feed one string as a standalone paragraph unless it is blank.
  const emit = (text: string): void => {
    if (!text.trim()) return;
    this._feedText(text);
    this._wasm.flushParagraph();
  };

  const walk = (node: unknown): void => {
    if (typeof node === 'string') {
      emit(node);
      return;
    }
    if (Array.isArray(node)) {
      for (const item of node) walk(item);
      return;
    }
    if (!node || typeof node !== 'object') return;
    for (const [key, child] of Object.entries(node as Record<string, unknown>)) {
      emit(key);
      walk(child);
    }
  };

  walk(parsed);
  return this._wasm.endDocument();
}
1372
+
1373
+ // ── CSV ──────────────────────────────────────────────────────────────────
1374
+ // RFC 4180 lite: comma-separated, optional double quotes, escaped "" inside
1375
+ // quoted fields. Each row becomes one paragraph (location = row index, with
1376
+ // header row at location 0).
1377
/**
 * Index a CSV file, one paragraph per non-empty row.
 *
 * A small state machine handles comma separation, double-quoted fields and
 * the `""` escape inside quoted fields. Carriage returns outside quotes are
 * ignored, and a leading U+FEFF byte-order mark is stripped so the first
 * header cell is searchable as written.
 */
private async _indexCsv(file: File, bytes: Uint8Array): Promise<number> {
  const decoded = _dec.decode(bytes);
  const text = decoded.charCodeAt(0) === 0xFEFF ? decoded.slice(1) : decoded;

  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();

  let cells: string[] = [];
  let cell = '';
  let quoted = false;

  // Close the field being accumulated and start a fresh one.
  const endCell = (): void => {
    cells.push(cell);
    cell = '';
  };
  // Emit the accumulated row (cells joined by spaces) and start a new row.
  const endRow = (): void => {
    const line = cells.join(' ').trim();
    if (line) { this._feedText(line); this._wasm.flushParagraph(); }
    cells = [];
  };

  let pos = 0;
  while (pos < text.length) {
    const ch = text[pos];
    pos++;

    if (quoted) {
      if (ch !== '"') {
        cell += ch;
      } else if (text[pos] === '"') {
        // Doubled quote inside a quoted field is a literal quote.
        cell += '"';
        pos++;
      } else {
        quoted = false;
      }
      continue;
    }

    switch (ch) {
      case ',':
        endCell();
        break;
      case '\n':
        endCell();
        endRow();
        break;
      case '\r':
        /* skip */
        break;
      case '"':
        // A quote only opens a quoted field at the very start of the field.
        if (cell.length === 0) quoted = true;
        else cell += ch;
        break;
      default:
        cell += ch;
    }
  }

  // Final row without a trailing newline.
  if (cell.length > 0 || cells.length > 0) {
    endCell();
    endRow();
  }
  return this._wasm.endDocument();
}
1414
+
1415
+ // ── EML / MBOX ───────────────────────────────────────────────────────────
1416
+ // Minimal MIME: parse the first text/plain body. Headers From/To/Subject
1417
+ // are indexed as separate paragraphs so they're individually searchable.
1418
+ //
1419
+ // What's decoded:
1420
+ // * Content-Transfer-Encoding: base64 → decoded.
1421
+ // * Content-Transfer-Encoding: quoted-printable → decoded.
1422
+ // * Content-Transfer-Encoding: 7bit / 8bit → pass-through.
1423
+ // * Nested multipart (multipart/alternative inside multipart/mixed) by
1424
+ // recursively walking boundaries until a text/plain section is found.
1425
+ //
1426
+ // What's not decoded (out of scope for this "lite" parser):
1427
+ // * Encoded-word headers (=?utf-8?Q?...?=) — only the raw bytes go in.
1428
+ // * Charset conversions other than UTF-8 — assumes the body decodes as UTF-8.
1429
+ // * HTML-only emails — they're dropped if no text/plain part is present.
1430
+ // * MBOX format (multiple emails concatenated). Each email needs to be
1431
+ // fed separately.
1432
/**
 * Index a single e-mail message.
 *
 * Subject, From and To are fed as individual paragraphs (in that order),
 * followed by the paragraphs of the first text/plain body found by
 * `_extractEmlTextPlain`. When that helper returns null the raw body is
 * indexed instead.
 */
private async _indexEml(file: File, bytes: Uint8Array): Promise<number> {
  const message = _dec.decode(bytes).replace(/\r\n/g, '\n');

  // Headers and body are separated by the first blank line.
  const split = message.indexOf('\n\n');
  const hasBody = split > 0;
  const headersBlock = hasBody ? message.slice(0, split) : message;
  const body = hasBody ? message.slice(split + 2) : '';

  // Look up one header (case-insensitive) and unfold continuation lines.
  const header = (block: string, name: string): string => {
    const hit = new RegExp(`^${name}:\\s*(.+(?:\\n[ \\t].+)*)`, 'mi').exec(block);
    if (!hit) return '';
    return (hit[1] ?? '').replace(/\n[ \t]+/g, ' ').trim();
  };

  // Feed one non-empty string as its own paragraph.
  const emit = (text: string): void => {
    if (!text) return;
    this._feedText(text);
    this._wasm.flushParagraph();
  };

  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();

  const subj = header(headersBlock, 'Subject');
  const from = header(headersBlock, 'From');
  const to = header(headersBlock, 'To');
  [subj, from, to].forEach(value => emit(value));

  const plain = this._extractEmlTextPlain(headersBlock, body, header) ?? body;
  for (const para of plain.split(/\n{2,}/)) {
    emit(para.replace(/\n/g, ' ').trim());
  }
  return this._wasm.endDocument();
}
424
1461
 
425
- private static readonly _INDEXERS: Record<string, (engine: AlbexEngine, file: File) => Promise<number>> = {
426
- docx: (e, f) => e._indexDocx(f),
427
- xlsx: (e, f) => e._indexXlsx(f),
428
- pdf: (e, f) => e._indexPdf(f),
429
- txt: (e, f) => e._indexTxt(f),
430
- xml: (e, f) => e._indexXml(f),
1462
+ /**
1463
+ * Walk the multipart tree until a text/plain section is found. Returns
1464
+ * the decoded body as a string, or null if no text/plain part exists.
1465
+ *
1466
+ * The function is called with the headers and body of the *current*
1467
+ * MIME entity (the top-level message at first, then each multipart child
1468
+ * on recursion). For single-part entities it inspects the entity's own
1469
+ * Content-Transfer-Encoding and decodes accordingly.
1470
+ */
1471
/**
 * Locate and decode the first text/plain section of a MIME entity.
 *
 * @param headersBlock Headers of the current entity.
 * @param body         Body of the current entity.
 * @param header       Header lookup helper supplied by the caller.
 * @returns The decoded text (possibly an empty string), or null when a
 *          multipart entity contains no text/plain part at any depth.
 */
private _extractEmlTextPlain(
  headersBlock: string,
  body: string,
  header: (block: string, name: string) => string,
): string | null {
  const contentType = header(headersBlock, 'Content-Type');
  const boundary = /boundary="?([^";]+)"?/i.exec(contentType)?.[1];

  if (!boundary) {
    // Single-part entity. text/plain (also the default when Content-Type is
    // absent) goes through transfer-encoding decoding; any other type is
    // handed back untouched.
    if (contentType === '' || /text\/plain/i.test(contentType)) {
      return decodeEmlBody(headersBlock, body, header);
    }
    return body;
  }

  for (const part of body.split(`--${boundary}`)) {
    const trimmed = part.replace(/^\n+/, '');
    const ph = trimmed.indexOf('\n\n');
    // Fragments without a header/body separator (such as the closing "--"
    // marker) carry no usable entity.
    if (ph < 0) continue;
    const partHeaders = trimmed.slice(0, ph);
    const partBody = trimmed.slice(ph + 2);
    const partCtype = header(partHeaders, 'Content-Type');

    if (/^multipart\//i.test(partCtype)) {
      const inner = this._extractEmlTextPlain(partHeaders, partBody, header);
      // Compare against null, not truthiness: an empty text/plain part is
      // still "found". Treating '' as a miss lets the search fall through
      // to null, and the caller then indexes the raw multipart body.
      if (inner !== null) return inner;
      continue;
    }

    if (/text\/plain/i.test(partCtype)) {
      return decodeEmlBody(partHeaders, partBody, header);
    }
  }
  return null;
}
1512
+
1513
+ // ── RTF ──────────────────────────────────────────────────────────────────
1514
+ //
1515
+ // Strip the {\rtf1...} group structure. Control words (\xxx and \xxxN),
1516
+ // hex escapes (\'XX), unicode escapes (\uN ?) and groups are processed;
1517
+ // plain runs are kept.
1518
+ //
1519
+ // Character decoding:
1520
+ // * \'XX → Windows-1252 byte XX. RTF defaults to cp1252 for high-ANSI;
1521
+ // we map the relevant rows (0x80–0x9F differs from Latin-1)
1522
+ // to their Unicode equivalents. Outside that block, the byte
1523
+ // is taken as Latin-1 (which equals Unicode below 0x100).
1524
+ // Result: accents in es/fr/de/it/pt RTF dumps survive.
1525
+ // * \uN ? → Unicode codepoint N (signed 16-bit, negative means N+65536).
1526
+ // Followed by a fallback character which we then skip — Word
1527
+ // writes the ASCII transliteration of the unicode glyph as a
1528
+ // fallback for non-Unicode readers; we ignore it because we
1529
+ // have the real codepoint.
1530
+ // * \- → soft hyphen (drop).
1531
+ // * \~ → non-breaking space.
1532
+ // * \emdash, \endash, \bullet, \lquote, \rquote, \ldblquote, \rdblquote
1533
+ // → their Unicode equivalents.
1534
+ //
1535
+ // What's not handled (assumes Word/Pages/LibreOffice output, where
1536
+ // these aren't load-bearing):
1537
+ // * \ansicpg, \fcharset — we always assume cp1252 for \' escapes.
1538
+ // * \bin — binary data with explicit length; rare in document RTF.
1539
+ // * Field codes — rendered as the visible text (good enough for search).
1540
/**
 * Index an RTF document by stripping its group/control-word structure and
 * feeding the remaining text paragraph by paragraph.
 *
 * Decoding rules:
 *  - `\'XX` hex escapes go through `rtfCp1252ToChar`.
 *  - `\uN` is a signed 16-bit code unit (negative means N + 65536). The one
 *    fallback "character" after it is skipped; a single fallback per escape
 *    is assumed, the same assumption the original code made.
 *  - `\par`, `\line`, `\sect` become paragraph breaks; `\tab` a tab; the
 *    dash/bullet/quote control words their Unicode characters.
 *  - Destination groups matched by SKIP_DESTINATION are dropped wholesale.
 *
 * @returns Whatever `endDocument()` reports for the finished document.
 */
private async _indexRtf(file: File, bytes: Uint8Array): Promise<number> {
  const src = _dec.decode(bytes);

  // Regexes are built once and anchored with the sticky flag, so each probe
  // runs against `src` at `lastIndex` instead of slicing the remainder of the
  // document for every control word.
  const HEX_PAIR = /^[0-9A-Fa-f]{2}$/;
  const UNICODE_ESCAPE = /\\u(-?\d+) ?/y;
  const CONTROL_WORD = /\\([A-Za-z]+)(-?\d+)?\s?/y;
  const SKIP_DESTINATION = /\\(fonttbl|colortbl|stylesheet|info|pict|object|header|footer)\b/y;
  const execAt = (re: RegExp, at: number): RegExpExecArray | null => {
    re.lastIndex = at;
    return re.exec(src);
  };

  let out = '';
  let i = 0;
  let depth = 0;
  // Group depth at which a skipped destination started; 0 means not skipping.
  let skipDepth = 0;

  while (i < src.length) {
    const c = src[i];
    if (c === '{') { depth++; i++; continue; }
    if (c === '}') {
      depth--;
      // Leaving the group that opened the skipped destination.
      if (skipDepth > 0 && depth < skipDepth) skipDepth = 0;
      i++;
      continue;
    }
    if (c !== '\\') {
      if (skipDepth === 0) out += c;
      i++;
      continue;
    }

    // Hex byte escape: \'XX
    if (src[i + 1] === '\'' && i + 3 < src.length) {
      const hex = src.slice(i + 2, i + 4);
      if (HEX_PAIR.test(hex)) {
        if (skipDepth === 0) out += rtfCp1252ToChar(parseInt(hex, 16));
        i += 4;
      } else {
        // Malformed: drop the \' and advance.
        i += 2;
      }
      continue;
    }

    // Unicode escape: \uN followed by its fallback.
    const um = execAt(UNICODE_ESCAPE, i);
    if (um) {
      let code = parseInt(um[1] ?? '0', 10);
      if (code < 0) code += 0x10000;
      if (skipDepth === 0 && code > 0 && code < 0x110000) {
        out += String.fromCodePoint(code);
      }
      i += um[0].length;
      // Skip the fallback. It is either one literal character or, as Word
      // commonly writes, a \'XX hex escape (e.g. `\u8217\'92`). Without the
      // hex case the same glyph would be emitted twice.
      if (src[i] === '\\' && src[i + 1] === '\'' && HEX_PAIR.test(src.slice(i + 2, i + 4))) {
        i += 4;
      } else if (i < src.length && src[i] !== '\\' && src[i] !== '{' && src[i] !== '}') {
        i++;
      }
      continue;
    }

    // Control word, optionally with a numeric parameter.
    const m = execAt(CONTROL_WORD, i);
    if (m) {
      const word = m[1] ?? '';
      if (skipDepth === 0 && execAt(SKIP_DESTINATION, i) !== null) skipDepth = depth;
      if (skipDepth === 0) {
        switch (word) {
          case 'par':
          case 'line':
          case 'sect':
            out += '\n\n';
            break;
          case 'tab':
            out += '\t';
            break;
          case 'emdash': out += '—'; break;
          case 'endash': out += '–'; break;
          case 'bullet': out += '•'; break;
          case 'lquote': out += '‘'; break;
          case 'rquote': out += '’'; break;
          case 'ldblquote': out += '“'; break;
          case 'rdblquote': out += '”'; break;
          default: /* drop other control words silently */ break;
        }
      }
      i += m[0].length;
      continue;
    }

    // Escaped single character: \\, \{, \}, \-, \~ etc.
    if (skipDepth === 0) {
      const escaped = src[i + 1];
      if (escaped === '~') out += ' '; // non-breaking space
      else if (escaped === '-') { /* soft hyphen — drop */ }
      else if (escaped !== undefined) out += escaped;
    }
    i += 2;
  }

  this._wasm.setDocumentName(this._writeStr(file.name));
  this._wasm.beginDocument();
  for (const para of out.split(/\n{2,}/)) {
    const l = para.replace(/\n/g, ' ').trim();
    if (l) { this._feedText(l); this._wasm.flushParagraph(); }
  }
  return this._wasm.endDocument();
}
1636
+
1637
// Lower-cased file extension → format-specific indexer. Each entry receives
// the engine instance, the original File and its already-read bytes.
private static readonly _INDEXERS: Record<string, (engine: AlbexEngine, file: File, bytes: Uint8Array) => Promise<number>> = {
  // Office / PDF
  docx: (engine, file, bytes) => engine._indexDocx(file, bytes),
  xlsx: (engine, file, bytes) => engine._indexXlsx(file, bytes),
  pdf: (engine, file, bytes) => engine._indexPdf(file, bytes),
  // Plain text and markup
  txt: (engine, file, bytes) => engine._indexTxt(file, bytes),
  xml: (engine, file, bytes) => engine._indexXml(file, bytes),
  md: (engine, file, bytes) => engine._indexMd(file, bytes),
  markdown: (engine, file, bytes) => engine._indexMd(file, bytes),
  html: (engine, file, bytes) => engine._indexHtml(file, bytes),
  htm: (engine, file, bytes) => engine._indexHtml(file, bytes),
  // Structured data, mail, rich text
  json: (engine, file, bytes) => engine._indexJson(file, bytes),
  csv: (engine, file, bytes) => engine._indexCsv(file, bytes),
  eml: (engine, file, bytes) => engine._indexEml(file, bytes),
  rtf: (engine, file, bytes) => engine._indexRtf(file, bytes),
};
432
1652
 
433
1653
  // ── Public API ────────────────────────────────────────────────────────────
@@ -439,36 +1659,111 @@ export class AlbexEngine {
439
1659
  async indexFile(file: File): Promise<IndexedDocument> {
440
1660
  const ext = file.name.split('.').pop()?.toLowerCase() ?? '';
441
1661
  const indexer = AlbexEngine._INDEXERS[ext];
442
- if (!indexer) throw new Error(`Unsupported format: .${ext}`);
1662
+ if (!indexer) throw new AlbexUnsupportedFormatError(ext);
1663
+
1664
+ // Hash the source bytes for idempotency. We always read the bytes once
1665
+ // here so the indexer can reuse them — avoids a double File.arrayBuffer().
1666
+ const bytes = new Uint8Array(await file.arrayBuffer());
1667
+ const hash = contentHash(bytes);
1668
+
1669
+ // Idempotency: if a non-deleted doc already has this hash, return it
1670
+ // unchanged. Cheap O(N) scan since MAX_DOCS = 128.
1671
+ const existing = this._docs.find(d => d.contentHash === hash);
1672
+ if (existing) return existing;
1673
+
1674
+ const w = this._wasm;
1675
+ const t0 = performance.now();
1676
+ const textPre = w.getTextUsed();
1677
+ const docCountBefore = w.getDocCount();
1678
+
1679
+ // Snapshot v2: hand the content hash to the WASM so it persists with
1680
+ // the doc. Older binaries (pre-v2) lack this export — we silently skip
1681
+ // and behave like before. The indexer will overwrite the scratchpad
1682
+ // immediately after (with the doc name), which is fine because
1683
+ // setDocumentContentHash copies into pending_content_hash before
1684
+ // returning.
1685
+ if (typeof w.setDocumentContentHash === 'function') {
1686
+ const hashBytes = hashHexToBytes(hash);
1687
+ this._writePad(hashBytes);
1688
+ w.setDocumentContentHash(hashBytes.length);
1689
+ }
1690
+
1691
+ const chunks = await indexer(this, file, bytes);
1692
+ // The new doc occupies slot `docCountBefore`.
1693
+ const docId = w.getDocId(docCountBefore);
443
1694
 
444
- const t0 = performance.now();
445
- const textPre = (this._wasm.getTextUsed as Function)() as number;
446
- const chunks = await indexer(this, file);
447
1695
  const doc: IndexedDocument = {
448
1696
  name: file.name,
449
1697
  ext,
450
1698
  chunks,
451
1699
  indexTimeMs: performance.now() - t0,
452
- textBytes: ((this._wasm.getTextUsed as Function)() as number) - textPre,
1700
+ textBytes: w.getTextUsed() - textPre,
1701
+ docId,
1702
+ contentHash: hash,
453
1703
  };
454
1704
  this._docs.push(doc);
455
1705
  return doc;
456
1706
  }
457
1707
 
1708
+ /**
1709
+ * Mark a previously indexed document as removed. Searches no longer return
1710
+ * its chunks. Storage is reclaimed only after `compact()`.
1711
+ *
1712
+ * `id` can be the file name or the contentHash returned by `indexFile`.
1713
+ * Returns `true` if a matching document was found and tombstoned.
1714
+ */
1715
removeDocument(id: string): boolean {
  // `id` may be either the document name or its content hash.
  const target = this._docs.find(d => d.name === id || d.contentHash === id);
  if (!target) return false;

  // Only forget the JS-side record once the WASM side confirms (returns 1).
  if (this._wasm.removeDocument(target.docId) !== 1) return false;
  this._docs = this._docs.filter(d => d !== target);
  return true;
}
1724
+
1725
+ /**
1726
+ * Replace a previously indexed document with new content. Equivalent to
1727
+ * `removeDocument(name)` followed by `indexFile(newFile)`. Note that
1728
+ * `indexFile` still applies its content-hash idempotency check (see below).
1729
+ */
1730
+ async replaceDocument(name: string, newFile: File): Promise<IndexedDocument> {
1731
+ this.removeDocument(name);
1732
+ // indexFile dedupes by content hash: if the new file hashes identically
1733
+ // to a document that is still tracked, that existing record is returned
1734
+ // instead of a new one. The remove above drops the old record first, which
1735
+ // covers the common case of re-indexing the same bytes under `name`.
1736
+ return this.indexFile(newFile);
1737
+ }
1738
+
1739
+ /**
1740
+ * Reclaim storage from previously removed documents. Compacts CHUNKS,
1741
+ * TEXT_POOL, DOC_NAMES and NAME_POOL in place. Idempotent.
1742
+ *
1743
+ * Note: doc_ids of surviving documents are preserved, so any stored
1744
+ * references (e.g. in a UI) remain valid.
1745
+ */
1746
+ compact(): void {
1747
+ this._wasm.compact(); // thin wrapper: all work happens inside the WASM export
1748
+ }
1749
+
458
1750
  /**
459
1751
  * Search the index. Supports:
460
1752
  * - Simple queries: `contrato` (AND of tokens, accent-insensitive)
461
1753
  * - Phrase queries: `"contrato marco"` (must appear as phrase)
462
1754
  * - OR queries: `contrato | acuerdo` (union of two searches)
1755
+ *
1756
+ * Pass `{ windowed: true }` to receive cropped snippets with ASCII ellipsis
1757
+ * markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
463
1758
  */
464
- search(query: string): SearchResult[] {
1759
+ search(query: string, opts: SearchOptions = {}): SearchResult[] {
465
1760
  const parsed = parseQuery(query);
466
1761
 
467
1762
  if (parsed.kind === 'or') {
468
- return this._searchOr(parsed.branches, query);
1763
+ return this._searchOr(parsed.branches, query, opts);
469
1764
  }
470
1765
 
471
- const results = this._runSearch(tokensToWasmQuery(parsed.tokens), query);
1766
+ const results = this._runSearch(tokensToWasmQuery(parsed.tokens), query, opts);
472
1767
 
473
1768
  if (parsed.kind === 'phrase') {
474
1769
  return results.filter(r => containsPhrase(r.snippet, parsed.tokens));
@@ -477,14 +1772,215 @@ export class AlbexEngine {
477
1772
  return results;
478
1773
  }
479
1774
 
480
- private _searchOr(branches: string[][], rawQuery: string): SearchResult[] {
1775
+ /**
1776
+ * Cooperative search. Processes the corpus in slices, yielding to the
1777
+ * event loop between them so the host UI thread keeps a chance to paint
1778
+ * even while a long scan is in flight.
1779
+ *
1780
+ * NOTE: this is NOT incremental streaming. Results are materialised
1781
+ * once the search completes and then iterated out in score-descending
1782
+ * order. The async iterator shape is preserved because the work that
1783
+ * produces those results genuinely yields to the scheduler between
1784
+ * slices — a future iteration may stream individual results before the
1785
+ * heap sorts, but doing so today would deliver them in arbitrary order.
1786
+ *
1787
+ * Pass `opts.frameBudgetMs` to control the slice size (default 8 ms).
1788
+ */
1789
/**
 * Cooperative search: same query syntax as `search`, but the scan is driven
 * through `_runSearchBudgeted`, which yields to the scheduler between
 * slices. Results are yielded only after the full result set has been
 * materialised.
 *
 * @param query Raw user query (simple, phrase or OR form).
 * @param opts  Search options; `frameBudgetMs` defaults to 8.
 */
async *searchCooperative(query: string, opts: SearchOptions = {}): AsyncIterable<SearchResult> {
  const parsed = parseQuery(query);
  const budget = opts.frameBudgetMs ?? 8;

  // OR queries: run each branch as its own budgeted search, de-duplicate on
  // (document, location, match start), then sort by descending score.
  if (parsed.kind === 'or') {
    const seen = new Set<string>();
    const all: SearchResult[] = [];
    for (const tokens of parsed.branches) {
      const q = tokensToWasmQuery(tokens);
      if (!q) continue;
      const r = await this._runSearchBudgeted(q, query, opts, budget);
      for (const x of r) {
        const key = `${x.documentName}:${x.location}:${x.matchStart}`;
        if (!seen.has(key)) { seen.add(key); all.push(x); }
      }
    }
    all.sort((a, b) => b.score - a.score);
    for (const r of all) yield r;
    return;
  }

  const results = await this._runSearchBudgeted(tokensToWasmQuery(parsed.tokens), query, opts, budget);
  // Phrase queries are post-filtered on the returned snippet text.
  const filtered = parsed.kind === 'phrase'
    ? results.filter(r => containsPhrase(r.snippet, parsed.tokens))
    : results;
  for (const r of filtered) yield r;
}
1819
+
1820
+ /**
1821
+ * @deprecated Renamed to `searchCooperative` in 0.3.0. The original name
1822
+ * was misleading — this method does not stream incremental results, it
1823
+ * yields to the scheduler between slices and returns a batch. The alias
1824
+ * keeps existing integrations working; it will be removed in 0.4.0.
1825
+ */
1826
+ async *searchStream(query: string, opts: SearchOptions = {}): AsyncIterable<SearchResult> {
1827
+ warnSearchStreamDeprecated(); // generator body is lazy: this runs on first iteration, not at call time
1828
+ yield* this.searchCooperative(query, opts); // pure delegation; results are passed through unchanged
1829
+ }
1830
+
1831
+ /**
1832
+ * Drive a resumable search until done, yielding to the scheduler when the
1833
+ * frame budget is exceeded. Returns the materialised result array.
1834
+ *
1835
+ * Heuristic: each call to `searchSlice` processes a chunk batch, then we
1836
+ * check elapsed time. The batch size doubles up to a cap to amortise the
1837
+ * JS<->WASM overhead on fast machines; on slow machines a single batch
1838
+ * may eat the entire budget, which is also fine.
1839
+ */
1840
/**
 * Run a resumable WASM search to completion, handing control back to the
 * scheduler after every unfinished slice.
 *
 * @param wasmQuery    Query string in the form the WASM matcher expects.
 * @param displayQuery Original user query, recorded in `_lastSearch`.
 * @param opts         Snippet options forwarded to `_collectResults`.
 * @param budgetMs     Target time per slice; drives batch-size adaptation.
 * @returns The materialised results (empty when `searchBegin` reports 0).
 */
private async _runSearchBudgeted(
  wasmQuery: string,
  displayQuery: string,
  opts: SearchOptions,
  budgetMs: number,
): Promise<SearchResult[]> {
  const w = this._wasm;
  w.setPattern(this._writeStr(wasmQuery));

  // Optional GPU pre-filter. Any failure is logged and the candidate mask
  // cleared so the search continues on the CPU-only path.
  if (this._shouldEngageGpu()) {
    try {
      await this._gpuPreFilter(wasmQuery);
    } catch (e) {
      console.warn('[albex] GPU pre-filter failed; falling back to CPU:', e);
      w.clearCandidateMask();
    }
  }

  const startedAt = performance.now();
  if (w.searchBegin() === 0) {
    this._lastSearch = {
      query: displayQuery, timeMs: 0, results: 0,
      bloomTested: 0, bloomPassed: 0, bitapMatched: 0,
    };
    return [];
  }

  // Background / low-power modes start with a smaller batch so the engine
  // yields more often.
  const conservative = this._resources?.mode === 'background'
    || this._resources?.mode === 'low-power';
  let batch = conservative ? 1024 : 2048;

  // Pick the yield primitive: scheduler.yield() when available, otherwise
  // requestAnimationFrame, otherwise a zero-delay timer.
  type Sched = { yield: () => Promise<void> };
  const sched = (globalThis as unknown as { scheduler?: Sched }).scheduler;
  let yieldFn: () => Promise<void>;
  if (sched && typeof sched.yield === 'function') {
    yieldFn = () => sched.yield();
  } else if (typeof requestAnimationFrame === 'function') {
    yieldFn = () => new Promise<void>(resolve => requestAnimationFrame(() => resolve()));
  } else {
    yieldFn = () => new Promise<void>(resolve => setTimeout(resolve, 0));
  }

  let finished = false;
  while (!finished) {
    const sliceStart = performance.now();
    finished = w.searchSlice(batch) === 1;
    const sliceMs = performance.now() - sliceStart;
    if (finished) break;

    // Grow the batch when the slice used under half the budget; shrink it
    // (never below 512) when the slice overshot by more than 50%.
    const hasHeadroom = sliceMs < budgetMs * 0.5 && batch < 32_768;
    const overBudget = sliceMs > budgetMs * 1.5 && batch > 512;
    if (hasHeadroom) {
      batch *= 2;
    } else if (overBudget) {
      batch = Math.max(512, Math.floor(batch / 2));
    }

    await yieldFn();
  }

  const elapsedMs = performance.now() - startedAt;
  const count = w.getResultCount();
  this._lastSearch = {
    query: displayQuery,
    timeMs: elapsedMs,
    results: count,
    bloomTested: w.getStatBloomTested(),
    bloomPassed: w.getStatBloomPassed(),
    bitapMatched: w.getStatBitapMatched(),
  };

  return this._collectResults(count, opts);
}
1917
+
1918
+ /** Materialise results [0..count) into the public SearchResult shape. */
1919
/**
 * Read results [0..count) out of the WASM module and convert them into
 * public SearchResult objects.
 *
 * In windowed mode the snippet is a cropped window and every match span is
 * shifted from chunk-relative to window-relative offsets (allowing for the
 * 4-character leading marker when the window does not start at offset 0).
 */
private _collectResults(count: number, opts: SearchOptions): SearchResult[] {
  const w = this._wasm;
  const useWindow = opts.windowed === true;
  const before = opts.before ?? 60;
  const after = opts.after ?? 120;

  const collected: SearchResult[] = [];
  for (let idx = 0; idx < count; idx++) {
    const score = w.getResultScore(idx);
    const location = w.getResultLocation(idx);
    const rawStart = w.getResultStart(idx);
    const rawEnd = w.getResultEnd(idx);
    // The name is read from the scratchpad straight after the call that
    // fills it, before any later call can reuse the pad.
    const nameLen = w.getResultDocName(idx);
    const documentName = nameLen > 0 ? this._readPad(nameLen) : '?';

    const spanCount = w.getResultMatchCount(idx);
    const spans: MatchSpan[] = [];
    for (let s = 0; s < spanCount; s++) {
      spans.push({ start: w.getResultMatchStartAt(idx, s), end: w.getResultMatchEndAt(idx, s) });
    }
    // Fall back to the primary span when no per-match spans are reported.
    if (spans.length === 0) spans.push({ start: rawStart, end: rawEnd });

    if (!useWindow) {
      const len = w.getSnippet(idx);
      collected.push({
        documentName,
        location,
        score,
        snippet: len > 0 ? this._readPad(len) : '',
        matchStart: rawStart,
        matchEnd: rawEnd,
        matches: spans,
      });
      continue;
    }

    const len = w.getSnippetWindow(idx, before, after);
    const snippet = len > 0 ? this._readPad(len) : '';
    const offset = w.getSnippetWindowOffset();
    const shift = (offset > 0 ? 4 : 0) - offset;
    const shifted: MatchSpan[] = spans.map(span => ({
      start: Math.max(0, span.start + shift),
      end: Math.max(0, span.end + shift),
    }));

    collected.push({
      documentName,
      location,
      score,
      snippet,
      matchStart: shifted[0]?.start ?? 0,
      matchEnd: shifted[0]?.end ?? 0,
      matches: shifted,
    });
  }
  return collected;
}
1975
+
1976
+ private _searchOr(branches: string[][], rawQuery: string, opts: SearchOptions): SearchResult[] {
481
1977
  const seen = new Set<string>();
482
1978
  const all: SearchResult[] = [];
483
1979
 
484
1980
  for (const tokens of branches) {
485
1981
  const q = tokensToWasmQuery(tokens);
486
1982
  if (!q) continue;
487
- const results = this._runSearch(q, rawQuery);
1983
+ const results = this._runSearch(q, rawQuery, opts);
488
1984
  for (const r of results) {
489
1985
  const key = `${r.documentName}:${r.location}:${r.matchStart}`;
490
1986
  if (!seen.has(key)) { seen.add(key); all.push(r); }
@@ -496,35 +1992,79 @@ export class AlbexEngine {
496
1992
  return all;
497
1993
  }
498
1994
 
499
- private _runSearch(wasmQuery: string, displayQuery: string): SearchResult[] {
500
- const ql = this._writeStr(wasmQuery);
501
- (this._wasm.setPattern as Function)(ql);
1995
/**
 * Run one blocking WASM search and return its results.
 *
 * Records timing and the Bloom / bitap counters in `_lastSearch`, then
 * delegates result materialisation to `_collectResults`. That helper holds
 * the same extraction logic this method used to duplicate inline, so the
 * windowed-snippet handling now lives in one place.
 *
 * @param wasmQuery    Query string in the form the WASM matcher expects.
 * @param displayQuery Original user query, recorded in `_lastSearch`.
 * @param opts         Snippet options (`windowed`, `before`, `after`).
 */
private _runSearch(wasmQuery: string, displayQuery: string, opts: SearchOptions): SearchResult[] {
  const w = this._wasm;
  const ql = this._writeStr(wasmQuery);
  w.setPattern(ql);

  const t0 = performance.now();
  const count = w.search();
  const ms = performance.now() - t0;

  this._lastSearch = {
    query: displayQuery,
    timeMs: ms,
    results: count,
    bloomTested: w.getStatBloomTested(),
    bloomPassed: w.getStatBloomPassed(),
    bitapMatched: w.getStatBitapMatched(),
  };

  return this._collectResults(count, opts);
}
@@ -532,11 +2072,14 @@ export class AlbexEngine {
532
2072
  /** Returns current engine statistics. */
533
2073
  getStats(): EngineStats {
534
2074
  return {
535
- documents: this._docs.length,
536
- chunks: (this._wasm.getChunkCount as Function)() as number,
537
- textUsed: (this._wasm.getTextUsed as Function)() as number,
538
- textCapacity: (this._wasm.getTextCapacity as Function)() as number,
2075
+ documents: this._docs.length,
2076
+ chunks: this._wasm.getChunkCount(),
2077
+ textUsed: this._wasm.getTextUsed(),
2078
+ textCapacity: this._wasm.getTextCapacity(),
539
2079
  wasmMemoryBytes: this._mem.buffer.byteLength,
2080
+ tier: this._tier,
2081
+ maxChunks: this._wasm.getMaxChunks(),
2082
+ maxDocs: this._wasm.getMaxDocs(),
540
2083
  };
541
2084
  }
542
2085
 
@@ -557,21 +2100,193 @@ export class AlbexEngine {
557
2100
 
558
2101
  /**
   * Configure search sensitivity.
   *
   * Forwards `errors` (one of 0, 1, 2 or 3) unchanged to the WASM
   * `setMaxErrors` export. Presumably this is the number of errors tolerated
   * per fuzzy match — confirm against the WASM engine.
   */
559
2102
  setMaxErrors(errors: 0 | 1 | 2 | 3): void {
560
- (this._wasm.setMaxErrors as Function)(errors);
2103
+ this._wasm.setMaxErrors(errors);
561
2104
  }
562
2105
 
563
2106
  setThreshold(threshold: number): void {
564
- (this._wasm.setThreshold as Function)(Math.max(0, Math.min(1000, threshold)));
2107
+ this._wasm.setThreshold(Math.max(0, Math.min(1000, threshold)));
565
2108
  }
566
2109
 
567
2110
  setMaxResults(max: number): void {
568
- (this._wasm.setMaxResults as Function)(Math.max(1, Math.min(200, max)));
2111
+ this._wasm.setMaxResults(Math.max(1, Math.min(200, max)));
2112
+ }
2113
+
2114
+ /**
2115
+ * Enable or disable query stemming.
2116
+ *
2117
+ * - `'off'` (default): tokens are used as-is. Strict matching.
2118
+ * - `'es'`: Spanish stemmer applied to query tokens before search. A query
2119
+ * for `"contratos"` matches `"contrato"` and vice versa.
2120
+ *
2121
+ * Indexed text is never stemmed, so snippets remain faithful to the
2122
+ * source. Recall improvement comes from queries reducing to shared prefixes.
2123
+ */
2124
+ setLanguage(lang: 'off' | 'es'): void {
2125
+ this._wasm.setLanguage(lang === 'es' ? 1 : 0);
569
2126
  }
570
2127
 
571
2128
  /**
   * Full reset — clears all indexed documents and chunks.
   *
   * Calls the WASM `init()` export first, then empties the JS-side document
   * list and drops the cached last search. Because of that order, if `init()`
   * throws, the JS-side state is left as it was.
   */
572
2129
  reset(): void {
573
- (this._wasm.init as Function)();
2130
+ this._wasm.init();
2131
+ this._docs = [];
2132
+ this._lastSearch = null;
2133
+ }
2134
+
2135
+ // ── Persistence ───────────────────────────────────────────────────────────
2136
+
2137
+ /**
2138
+ * Persist the current index to OPFS (or IndexedDB as fallback) under `name`.
2139
+ *
2140
+ * The snapshot includes every chunk, document name and text byte currently
2141
+ * indexed. Subsequent `load(name)` calls restore the engine to this exact
2142
+ * state in roughly O(total bytes), bypassing re-parsing.
2143
+ */
2144
+ async save(name: string): Promise<void> {
2145
+ const w = this._wasm;
2146
+ const total = w.snapshotSize();
2147
+ if (total === 0) {
2148
+ await savePersisted(name, new Uint8Array(0));
2149
+ return;
2150
+ }
2151
+ const out = new Uint8Array(total);
2152
+ let off = 0;
2153
+ while (off < total) {
2154
+ const n = w.snapshotChunk(off, FEED_SIZE);
2155
+ if (n === 0) break;
2156
+ const ptr = w.getBuffer(0);
2157
+ out.set(this._u8(ptr, n), off);
2158
+ off += n;
2159
+ }
2160
+ await savePersisted(name, out);
2161
+ // Reconstruct _docs from the doc table so getStats().documents stays
2162
+ // honest after save (no change here — but symmetric with load()).
2163
+ }
2164
+
2165
+ /**
2166
+ * Restore an index previously saved with `save(name)`. Returns `true` on
2167
+ * success, `false` if the snapshot is missing or has an incompatible
2168
+ * header (wrong magic, version, or struct sizes).
2169
+ */
2170
+ async load(name: string): Promise<boolean> {
2171
+ const bytes = await loadPersisted(name);
2172
+ if (!bytes || bytes.length === 0) return false;
2173
+
2174
+ const w = this._wasm;
2175
+ // Write the 64-byte header into the scratchpad and validate.
2176
+ if (bytes.length < 64) return false;
2177
+ const ptr = w.getBuffer(64);
2178
+ if (!ptr) return false;
2179
+ this._u8(ptr, 64).set(bytes.subarray(0, 64));
2180
+ if (w.restoreBegin() !== 1) return false;
2181
+
2182
+ // Stream payload bytes.
2183
+ let off = 64;
2184
+ while (off < bytes.length) {
2185
+ const n = Math.min(FEED_SIZE, bytes.length - off);
2186
+ this._writePad(bytes.subarray(off, off + n));
2187
+ if (w.restoreFeed(n) !== 1) return false;
2188
+ off += n;
2189
+ }
2190
+
2191
+ // Rebuild _docs metadata from the restored WASM tables.
2192
+ //
2193
+ // What's available after a restore:
2194
+ // * `name` — recovered from getDocName(i).
2195
+ // * `ext` — derived from the name.
2196
+ // * `chunks` — getDocChunkCount(i).
2197
+ // * `docId` — getDocId(i).
2198
+ // * `contentHash` — getDocContentHashPtr(i) when the binary supports
2199
+ // snapshot v2 (the export exists) AND the snapshot
2200
+ // itself was v2 (the bytes aren't all zero). v1
2201
+ // snapshots restore with all-zero hashes → '' here,
2202
+ // same as before.
2203
+ //
2204
+ // What's not persisted and therefore zeroed:
2205
+ // * `indexTimeMs` — no indexing happened in this session.
2206
+ // * `textBytes` — engine-wide totals are still available via
2207
+ // getStats().textUsed; per-doc breakdown is not
2208
+ // stored.
2209
+ const docCount = w.getDocCount();
2210
+ const hasHashExport = typeof w.getDocContentHashPtr === 'function'
2211
+ && typeof w.getDocContentHashLen === 'function';
574
2212
  this._docs = [];
2213
+ for (let i = 0; i < docCount; i++) {
2214
+ if (w.isDocDeleted(i)) continue;
2215
+ const nameLen = w.getDocName(i);
2216
+ const name = nameLen > 0 ? this._readPad(nameLen) : `restored-${i}`;
2217
+ const dotIdx = name.lastIndexOf('.');
2218
+ const ext = dotIdx > 0 ? name.slice(dotIdx + 1).toLowerCase() : '';
2219
+
2220
+ let contentHash = '';
2221
+ if (hasHashExport) {
2222
+ const hashLen = w.getDocContentHashLen(); // always 8 today
2223
+ const hashPtr = w.getDocContentHashPtr(i);
2224
+ if (hashPtr !== 0 && hashLen === 8) {
2225
+ const view = this._u8(hashPtr, 8);
2226
+ // Copy into a private buffer so subsequent WASM calls cannot
2227
+ // mutate it under us.
2228
+ const buf = new Uint8Array(8);
2229
+ buf.set(view);
2230
+ contentHash = hashBytesToHex(buf);
2231
+ }
2232
+ }
2233
+
2234
+ this._docs.push({
2235
+ name,
2236
+ ext,
2237
+ chunks: w.getDocChunkCount(i),
2238
+ indexTimeMs: 0,
2239
+ textBytes: 0,
2240
+ docId: w.getDocId(i),
2241
+ contentHash,
2242
+ });
2243
+ }
575
2244
  this._lastSearch = null;
2245
+ return true;
2246
+ }
2247
+
2248
+ /**
2249
+ * Convenience: load if the snapshot exists, otherwise leave the engine
2250
+ * empty. Returns whether a load actually happened.
2251
+ */
2252
+ async loadOrInit(name: string): Promise<boolean> {
2253
+ const loaded = await this.load(name);
2254
+ if (!loaded) this.reset();
2255
+ return loaded;
2256
+ }
2257
+
2258
+ /**
+ * Delete a previously persisted snapshot.
+ *
+ * Awaits `deletePersisted(name)`; any rejection from it propagates to the
+ * caller. The in-memory index of this engine is not touched.
+ */
2259
+ async deleteSnapshot(name: string): Promise<void> {
2260
+ await deletePersisted(name);
2261
+ }
2262
+
2263
+ /**
+ * List names of persisted snapshots in the current origin.
+ *
+ * Returns whatever `listPersisted()` resolves to; no engine state is read
+ * or modified.
+ */
2264
+ async listSnapshots(): Promise<string[]> {
2265
+ return listPersisted();
2266
+ }
2267
+
2268
+ /**
2269
+ * TC39 explicit-resource-management hook (Stage 3 in 2026). Lets the engine
2270
+ * be used with `using` so the references are released deterministically:
2271
+ *
2272
+ * using engine = new AlbexEngine(opts); await engine.init();
2273
+ *
2274
+ * WebAssembly does not actually expose a way to release linear memory pages
2275
+ * inside a Module instance, so we drop our references to the exports and
2276
+ * the doc list. GC can then reclaim the engine, which in turn releases the
2277
+ * WASM instance and its (typically 20 MB) backing memory.
2278
+ */
2279
+ [Symbol.dispose](): void {
2280
+ this.reset();
2281
+ this._unsubscribeResources?.();
2282
+ this._unsubscribeResources = null;
2283
+ this._gpu?.destroy();
2284
+ this._gpu = null;
2285
+ // Null out the references so the engine cannot be reused after disposal
2286
+ // and the WASM instance becomes unreachable.
2287
+ this._wasm = null as unknown as AlbexWasmExports;
2288
+ this._mem = null as unknown as WebAssembly.Memory;
2289
+ this._pdfWasm = null;
2290
+ this._pdfMem = null;
576
2291
  }
577
2292
  }