albex 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CHANGELOG.md +141 -0
  2. package/README.md +242 -112
  3. package/dist/albex-worker.d.ts +70 -0
  4. package/dist/albex-worker.d.ts.map +1 -0
  5. package/dist/albex-worker.js +153 -0
  6. package/dist/albex-worker.js.map +1 -0
  7. package/dist/albex.d.ts +368 -6
  8. package/dist/albex.d.ts.map +1 -1
  9. package/dist/albex.js +1692 -95
  10. package/dist/albex.js.map +1 -1
  11. package/dist/errors.d.ts +38 -0
  12. package/dist/errors.d.ts.map +1 -0
  13. package/dist/errors.js +63 -0
  14. package/dist/errors.js.map +1 -0
  15. package/dist/gpu/bloom-runtime.d.ts +60 -0
  16. package/dist/gpu/bloom-runtime.d.ts.map +1 -0
  17. package/dist/gpu/bloom-runtime.js +176 -0
  18. package/dist/gpu/bloom-runtime.js.map +1 -0
  19. package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
  20. package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
  21. package/dist/gpu/bloom-shader.wgsl.js +49 -0
  22. package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
  23. package/dist/persistence.d.ts +21 -0
  24. package/dist/persistence.d.ts.map +1 -0
  25. package/dist/persistence.js +174 -0
  26. package/dist/persistence.js.map +1 -0
  27. package/dist/pool/coordinator.d.ts +98 -0
  28. package/dist/pool/coordinator.d.ts.map +1 -0
  29. package/dist/pool/coordinator.js +247 -0
  30. package/dist/pool/coordinator.js.map +1 -0
  31. package/dist/profile.d.ts +95 -0
  32. package/dist/profile.d.ts.map +1 -0
  33. package/dist/profile.js +207 -0
  34. package/dist/profile.js.map +1 -0
  35. package/dist/resource-manager.d.ts +56 -0
  36. package/dist/resource-manager.d.ts.map +1 -0
  37. package/dist/resource-manager.js +138 -0
  38. package/dist/resource-manager.js.map +1 -0
  39. package/dist/tiered-store.d.ts +98 -0
  40. package/dist/tiered-store.d.ts.map +1 -0
  41. package/dist/tiered-store.js +238 -0
  42. package/dist/tiered-store.js.map +1 -0
  43. package/dist/wasm-bindings.d.ts +139 -0
  44. package/dist/wasm-bindings.d.ts.map +1 -0
  45. package/dist/wasm-bindings.js +33 -0
  46. package/dist/wasm-bindings.js.map +1 -0
  47. package/dist/worker-protocol.d.ts +86 -0
  48. package/dist/worker-protocol.d.ts.map +1 -0
  49. package/dist/worker-protocol.js +20 -0
  50. package/dist/worker-protocol.js.map +1 -0
  51. package/dist/worker-runtime.d.ts +14 -0
  52. package/dist/worker-runtime.d.ts.map +1 -0
  53. package/dist/worker-runtime.js +100 -0
  54. package/dist/worker-runtime.js.map +1 -0
  55. package/package.json +56 -13
  56. package/src/albex-worker.ts +187 -0
  57. package/src/albex.ts +1845 -130
  58. package/src/errors.ts +60 -0
  59. package/src/gpu/bloom-runtime.ts +229 -0
  60. package/src/gpu/bloom-shader.wgsl.ts +48 -0
  61. package/src/persistence.ts +175 -0
  62. package/src/pool/coordinator.ts +324 -0
  63. package/src/profile.ts +279 -0
  64. package/src/resource-manager.ts +167 -0
  65. package/src/tiered-store.ts +259 -0
  66. package/src/wasm-bindings.ts +200 -0
  67. package/src/worker-protocol.ts +48 -0
  68. package/src/worker-runtime.ts +96 -0
  69. package/wasm/pkg/albex_pdf.wasm +0 -0
  70. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  71. package/wasm/pkg/albex_wasm_mini.wasm +0 -0
  72. package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
  73. package/wasm/pkg/albex_wasm_pro.wasm +0 -0
  74. package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
  75. package/wasm/pkg/albex_wasm_std.wasm +0 -0
  76. package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/dist/albex.js CHANGED
@@ -1,3 +1,10 @@
1
+ /*!
2
+ * albex v0.3.0
3
+ * Zero-config local full-text search for documents — runs entirely in the browser, no server, no upload.
4
+ * (c) 2026 RafaCalRob
5
+ * @license MIT
6
+ * https://github.com/RafaCalRob/Albex#readme
7
+ */
1
8
  /**
2
9
  * Albex — local full-text search engine.
3
10
  *
@@ -13,6 +20,34 @@
13
20
  * const results = engine.search('contrato marco');
14
21
  * ```
15
22
  */
23
+ import { asAlbexExports, asAlbexPdfExports, } from './wasm-bindings.js';
24
+ import { AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
25
+ import { savePersisted, loadPersisted, deletePersisted, listPersisted, } from './persistence.js';
26
+ import { detectProfile, pickTier, shouldUseGpu } from './profile.js';
27
+ import { getResourceManager } from './resource-manager.js';
28
+ import { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
29
+ export { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
30
+ export { listPersisted, deletePersisted } from './persistence.js';
31
+ export { detectProfile, pickTier, pickWorkerCount, shouldUseGpu } from './profile.js';
32
+ export { getResourceManager } from './resource-manager.js';
33
+ export { AlbexPool } from './pool/coordinator.js';
34
+ export { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
35
+ export { TieredStore } from './tiered-store.js';
36
+ // ─────────────────────────────────────────────────────────────────────────────
37
+ // Deprecation warnings — one-shot, fire-and-forget
38
+ // ─────────────────────────────────────────────────────────────────────────────
39
let _searchStreamWarned = false;
/**
 * Emit the `searchStream` deprecation notice through `console.warn`.
 *
 * The module-level `_searchStreamWarned` flag makes this a one-shot: the
 * first call logs, every later call returns without doing anything.
 */
function warnSearchStreamDeprecated() {
    if (!_searchStreamWarned) {
        // Flip the latch before logging so the notice can never repeat.
        _searchStreamWarned = true;
        const notice = [
            '[albex] `searchStream` is deprecated; rename to `searchCooperative`. ',
            'The method does not stream incremental results — it yields to the ',
            'scheduler between slices and returns a batch. The alias will be ',
            'removed in 0.4.0.',
        ].join('');
        console.warn(notice);
    }
}
16
51
  function tokenize(q) {
17
52
  return q.trim().split(/\s+/).filter(t => t.length > 0);
18
53
  }
@@ -80,7 +115,7 @@ function zipCentralDir(bytes) {
80
115
  while (p >= 0 && v.getUint32(p, true) !== 0x06054b50)
81
116
  p--;
82
117
  if (p < 0)
83
- throw new Error('Not a ZIP file');
118
+ throw new AlbexParseError('zip', 'Not a ZIP file (no EOCD record)');
84
119
  return { v, cdOff: v.getUint32(p + 16, true), cdN: v.getUint16(p + 10, true) };
85
120
  }
86
121
  function listZipEntries(bytes) {
@@ -109,7 +144,7 @@ async function findZipEntry(bytes, name) {
109
144
  }
110
145
  cp += 46 + nl + xl + cl;
111
146
  }
112
- throw new Error(`Entry "${name}" not found in ZIP`);
147
+ throw new AlbexParseError('zip', `Entry "${name}" not found in ZIP`);
113
148
  }
114
149
  async function decompEntry(bytes, v, off, compSize) {
115
150
  const meth = v.getUint16(off + 8, true);
@@ -146,62 +181,480 @@ async function decompEntry(bytes, v, off, compSize) {
146
181
  }
147
182
  return out;
148
183
  }
149
- throw new Error(`Unsupported ZIP compression method ${meth}`);
184
+ throw new AlbexParseError('zip', `Unsupported ZIP compression method ${meth}`);
150
185
  }
151
186
  // ─────────────────────────────────────────────────────────────────────────────
152
187
  // WASM memory helpers (internal)
153
188
  // ─────────────────────────────────────────────────────────────────────────────
154
- const FEED_SIZE = 32768; // 32 KB — fits in 64 KB scratchpad
189
+ const FEED_SIZE = 32_768; // 32 KB — fits in 64 KB scratchpad
190
+ // ─────────────────────────────────────────────────────────────────────────────
191
+ // Content hash — FNV-1a 64-bit
192
+ // ─────────────────────────────────────────────────────────────────────────────
193
/**
 * Compute the 64-bit pattern Bloom value for a query, as a BigInt.
 *
 * The query is lowercased, NFKD-normalised and stripped of combining marks
 * (U+0300–U+036F). Every remaining ASCII code unit except the space
 * character sets bit `code & 0x3F`; code units >= 0x80 contribute nothing.
 * Note that ASCII punctuation and ASCII whitespace other than U+0020 DO set
 * a bit.
 *
 * NOTE(review): this is meant to stay in sync with `BloomFilter::from_text`
 * and `fold_utf8_char` in `core/src/bloom.rs`; that file is not visible from
 * here, so confirm the Rust side before changing the character classes.
 *
 * @param {string} query - Query text (all tokens; spaces are separators).
 * @returns {bigint} Aggregate 64-bit Bloom mask.
 */
function computePatternBloom(query) {
    const folded = query
        .toLowerCase()
        .normalize('NFKD')
        .replace(/[\u0300-\u036f]/g, '');
    let bits = 0n;
    for (let i = 0; i < folded.length; i++) {
        const code = folded.charCodeAt(i);
        // 0x20 separates tokens; anything outside ASCII is not hashed.
        if (code === 0x20 || code >= 0x80) {
            continue;
        }
        bits |= 1n << BigInt(code & 0x3f);
    }
    return bits;
}
/**
 * Compute the 64-bit FNV-1a hash of `bytes` and return it as a 16-char
 * lowercase hex string (high 32 bits first).
 *
 * FNV-1a is a non-cryptographic hash; it needs no dependencies and is cheap
 * on small/medium blobs, which is sufficient for document de-duplication.
 *
 * The 64-bit state is kept as two unsigned 32-bit halves (no BigInt in the
 * per-byte loop). With prime P = 2^40 + 0x1b3 and state h = hi·2^32 + lo:
 *
 *   h·P mod 2^64 = lo·0x1b3 + (hi·0x1b3 + (lo << 8))·2^32
 *
 * The earlier arithmetic added `lo` instead of `lo << 8` to the high word
 * and never propagated the carry out of the low word, so its output was not
 * FNV-1a. Hashes produced by that arithmetic will not equal the values
 * returned here.
 *
 * @param {Uint8Array} bytes - Content to hash.
 * @returns {string} 16 hex characters; `'cbf29ce484222325'` for empty input.
 */
function contentHash(bytes) {
    // FNV-1a 64-bit offset basis, split into halves.
    let hi = 0xcbf29ce4;
    let lo = 0x84222325;
    for (let i = 0; i < bytes.length; i++) {
        lo = (lo ^ bytes[i]) >>> 0;
        // lo < 2^32 and 0x1b3 < 2^9, so the product is < 2^41 and exact as a double.
        const lowProduct = lo * 0x1b3;
        const carry = Math.floor(lowProduct / 0x100000000);
        // `lo << 8` is the low word's contribution from the prime's 2^40 term.
        hi = (Math.imul(hi, 0x1b3) + carry + (lo << 8)) >>> 0;
        lo = lowProduct >>> 0;
    }
    return hi.toString(16).padStart(8, '0') + lo.toString(16).padStart(8, '0');
}
260
/**
 * Convert a 16-hex-char content hash into 8 raw bytes.
 *
 * Byte `i` is parsed from hex characters `2i` and `2i + 1`, so the bytes
 * appear in the same order as the hex digits (the high half of the hash at
 * offsets 0..3, the low half at 4..7). A missing or non-hex pair parses to
 * NaN, which a Uint8Array stores as 0.
 *
 * @param {string} hex - Hex string, normally 16 characters long.
 * @returns {Uint8Array} Exactly 8 bytes.
 */
function hashHexToBytes(hex) {
    return Uint8Array.from({ length: 8 }, (_, index) => {
        const start = index * 2;
        return parseInt(hex.slice(start, start + 2), 16);
    });
}
275
/**
 * Windows-1252 → Unicode table for the 0x80–0x9F block, the only range in
 * which cp1252 differs from Latin-1. Code points that cp1252 leaves
 * undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) have no entry.
 */
const _CP1252_HIGH = {
    0x80: '€', 0x82: '‚', 0x83: 'ƒ', 0x84: '„', 0x85: '…', 0x86: '†',
    0x87: '‡', 0x88: 'ˆ', 0x89: '‰', 0x8A: 'Š', 0x8B: '‹', 0x8C: 'Œ',
    0x8E: 'Ž',
    0x91: '‘', 0x92: '’', 0x93: '“', 0x94: '”',
    0x95: '•', 0x96: '–', 0x97: '—', 0x98: '˜', 0x99: '™', 0x9A: 'š',
    0x9B: '›', 0x9C: 'œ', 0x9E: 'ž', 0x9F: 'Ÿ',
};
/**
 * Map one Windows-1252 byte to the character it represents.
 *
 * Bytes below 0x80 and bytes from 0xA0 upward map straight to the code
 * point of the same value. Bytes in 0x80–0x9F go through `_CP1252_HIGH`;
 * a byte with no table entry yields the empty string.
 *
 * @param {number} byte - Byte value.
 * @returns {string} One character, or '' for an unmapped byte.
 */
function rtfCp1252ToChar(byte) {
    if (byte < 0x80 || byte >= 0xA0) {
        return String.fromCharCode(byte);
    }
    const mapped = _CP1252_HIGH[byte];
    return mapped ?? '';
}
299
/**
 * Decode an EML entity body according to its Content-Transfer-Encoding.
 *
 * The encoding is read by calling `header(headersBlock,
 * 'Content-Transfer-Encoding')` and lowercasing the result. `base64` and
 * `quoted-printable` are decoded; every other value (7bit, 8bit, empty,
 * unknown) returns `body` unchanged.
 *
 * @param {string} headersBlock - Raw header block handed to `header`.
 * @param {string} body - Encoded entity body.
 * @param {Function} header - Lookup `(headersBlock, name) => string`; the
 *   result is lowercased directly, so it must be a string.
 * @returns {string} Decoded body text.
 */
function decodeEmlBody(headersBlock, body, header) {
    const encoding = header(headersBlock, 'Content-Transfer-Encoding').toLowerCase();
    switch (encoding) {
        case 'base64':
            return decodeBase64Utf8(body);
        case 'quoted-printable':
            return decodeQuotedPrintable(body);
        default:
            return body;
    }
}
313
/**
 * Decode a base64 body and interpret the bytes as text via the module's
 * `_dec` decoder.
 *
 * All whitespace is removed first (encoded bodies are line-wrapped). An
 * input that is empty after stripping returns ''. If anything throws —
 * typically `atob` rejecting malformed input — the original `body` is
 * returned untouched so the caller still has something to index.
 *
 * @param {string} body - Base64 text, possibly containing line breaks.
 * @returns {string} Decoded text, or `body` itself on failure.
 */
function decodeBase64Utf8(body) {
    try {
        const compact = body.replace(/\s+/g, '');
        if (compact === '') {
            return '';
        }
        // atob yields a "binary string": one char per byte, each <= 0xFF.
        // Rebuild the byte array so multi-byte sequences decode correctly.
        const binary = atob(compact);
        const raw = Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
        return _dec.decode(raw);
    }
    catch {
        return body;
    }
}
338
/**
 * Decode a quoted-printable body into text.
 *
 * Handles:
 *  - `=XX` hex escapes, collected as raw bytes so that consecutive escapes
 *    forming one multi-byte sequence decode as a single character;
 *  - soft line breaks — `=` immediately followed by CRLF (the form RFC 2045
 *    specifies) or by a bare LF — which produce no output;
 *  - a malformed `=` (not followed by two hex digits or a line break),
 *    which is kept as a literal `=`.
 *
 * All other characters pass through as their low byte; non-ASCII source
 * characters are not valid quoted-printable and are handled best-effort.
 * The collected bytes are decoded with the module's `_dec` decoder; if that
 * throws, the original `body` is returned.
 *
 * @param {string} body - Quoted-printable encoded text.
 * @returns {string} Decoded text, or `body` itself if decoding fails.
 */
function decodeQuotedPrintable(body) {
    const HEX_PAIR = /^[0-9A-Fa-f]{2}$/;
    // First pass: collect raw bytes so multi-byte sequences survive.
    const bytes = [];
    for (let i = 0; i < body.length; i++) {
        const c = body[i];
        if (c !== '=') {
            bytes.push(c.charCodeAt(0) & 0xff);
            continue;
        }
        // Soft line break, bare-LF form.
        if (body[i + 1] === '\n') {
            i += 1;
            continue;
        }
        // Soft line break, CRLF form — without this the `=` and the CR LF
        // would leak into the decoded text and split words mid-line.
        if (body[i + 1] === '\r' && body[i + 2] === '\n') {
            i += 2;
            continue;
        }
        // `=XX` hex pair.
        const pair = body.slice(i + 1, i + 3);
        if (HEX_PAIR.test(pair)) {
            bytes.push(parseInt(pair, 16));
            i += 2;
            continue;
        }
        // Malformed escape: keep the literal `=`.
        bytes.push(0x3D);
    }
    try {
        return _dec.decode(new Uint8Array(bytes));
    }
    catch {
        return body;
    }
}
378
/**
 * Inverse of hashHexToBytes: render the first 8 bytes as 16 lowercase hex
 * characters. Eight zero bytes mean "no hash known" and return ''.
 *
 * @param {Uint8Array} bytes - At least 8 bytes.
 * @returns {string} 16-char hex string, or '' when all 8 bytes are zero.
 */
function hashBytesToHex(bytes) {
    let hex = '';
    let sawNonZero = false;
    for (let i = 0; i < 8; i++) {
        const value = bytes[i];
        if (value !== 0) {
            sawNonZero = true;
        }
        hex += value.toString(16).padStart(2, '0');
    }
    return sawNonZero ? hex : '';
}
155
395
  // ─────────────────────────────────────────────────────────────────────────────
156
396
  // PDF WASM imports shim
157
397
  // ─────────────────────────────────────────────────────────────────────────────
158
- function makePdfWasmImports(getPdfMem) {
398
/**
 * Build the import object for the PDF WASM module from the imports the
 * compiled module actually declares.
 *
 * wasm-bindgen import names embed a build-time hash (for example
 * `__wbg_getRandomValues_3f44b700395062e5`), so each import is matched by
 * name prefix or exact un-hashed name instead of being hard-coded:
 *
 *   - `__wbg_getRandomValues*` / `__wbg_crypto*`  → fill `len` bytes at
 *     `ptr` in the PDF memory via `crypto.getRandomValues`
 *   - `__wbindgen_describe*` / `__wbindgen_throw*` → no-op
 *   - `__wbindgen_object_drop_ref`                 → push slot on free list
 *   - `__wbindgen_externref_table_grow`            → extend heap, return old length
 *   - `__wbindgen_externref_table_set_null`        → clear a heap slot
 *   - anything else                                → stub that logs a
 *     `console.warn` when (and only when) it is called
 *
 * NOTE(review): every declared import receives a function, whatever its
 * declared kind; presumably the PDF module only imports functions — confirm.
 *
 * @param {WebAssembly.Module} module - Compiled PDF module to inspect.
 * @param {Function} getPdfMem - Returns the PDF instance's memory, or a
 *   falsy value while the instance is not ready.
 * @returns {object} Import object keyed by import module name, then by
 *   import name.
 */
function makePdfWasmImports(module, getPdfMem) {
    // Externref heap shared by all heap-management imports of this instance.
    const heap = [];
    let freeIdx = -1;
    const fillRandom = (ptr, len) => {
        const mem = getPdfMem();
        if (!mem)
            throw new Error('PDF WASM memory not initialised');
        crypto.getRandomValues(new Uint8Array(mem.buffer, ptr >>> 0, len >>> 0));
    };
    // Exact-name imports, stored as factories so each import gets its own closure.
    const heapHandlerFactories = new Map([
        ['__wbindgen_object_drop_ref', () => (idx) => {
            heap[idx] = freeIdx;
            freeIdx = idx;
        }],
        ['__wbindgen_externref_table_grow', () => (delta) => {
            const previousLength = heap.length;
            for (let i = 0; i < delta; i++) {
                heap.push(undefined);
            }
            return previousLength;
        }],
        ['__wbindgen_externref_table_set_null', () => (idx) => {
            heap[idx] = undefined;
        }],
    ]);
    const resolveByName = (modName, name) => {
        const isRandomProvider = name.startsWith('__wbg_getRandomValues') || name.startsWith('__wbg_crypto');
        if (isRandomProvider) {
            return fillRandom;
        }
        const isIntrospection = name.startsWith('__wbindgen_describe') || name.startsWith('__wbindgen_throw');
        if (isIntrospection) {
            return () => { };
        }
        const makeHeapHandler = heapHandlerFactories.get(name);
        if (makeHeapHandler) {
            return makeHeapHandler();
        }
        // Unknown import: instantiation still succeeds; only an actual call warns.
        return (...args) => {
            console.warn(`[albex] unhandled PDF WASM import ${modName}.${name}`, args);
        };
    };
    const imports = {};
    for (const descriptor of WebAssembly.Module.imports(module)) {
        const { module: modName, name } = descriptor;
        if (!imports[modName]) {
            imports[modName] = {};
        }
        imports[modName][name] = resolveByName(modName, name);
    }
    return imports;
}
184
- // ─────────────────────────────────────────────────────────────────────────────
185
- // AlbexEngine
186
- // ─────────────────────────────────────────────────────────────────────────────
187
467
  export class AlbexEngine {
468
+ // ── main WASM ──
469
+ _wasm;
470
+ _mem;
471
+ /**
472
+ * OCR entry point installed by `@albex/ocr::enableOcr(engine)`. Undefined
473
+ * when the OCR module has not been wired. The main `albex` package has no
474
+ * runtime dependency on OCR — this is a structural slot that the optional
475
+ * companion package fills.
476
+ */
477
+ ocrImage;
478
+ /**
479
+ * Optional OCR-side configuration set by `@albex/ocr::enableOcr`. Read
480
+ * by the engine to decide whether to invoke OCR on top of the text it
481
+ * already extracted from a PDF (hybrid PDFs: native text + images that
482
+ * also contain text, like stamps, scanned annexes, or diagrams with
483
+ * labels).
484
+ *
485
+ * When `alwaysExtractEmbeddedImages` is true, every page of every PDF
486
+ * passes through `extractPageImages` after the normal text extraction;
487
+ * any image that meets the size filter (200×200 in Rust) is fed to
488
+ * `ocrImage`. Performance cost: 1–3 s per qualifying image.
489
+ *
490
+ * Off by default — set this opt-in via the OCR module's options.
491
+ */
492
+ ocrConfig;
493
+ // ── PDF WASM (lazy) ──
494
+ _pdfWasm = null;
495
+ _pdfMem = null;
496
+ _docs = [];
497
+ _lastSearch = null;
498
+ _tier = null;
499
+ _simd = false;
500
+ _profile = null;
501
+ _resources = null;
502
+ _gpu = null;
503
+ _gpuChunkCountUploaded = 0;
504
+ _unsubscribeResources = null;
505
+ _opts;
188
506
  constructor(opts) {
189
- // ── PDF WASM (lazy) ──
190
- this._pdfWasm = null;
191
- this._pdfMem = null;
192
- this._docs = [];
193
- this._lastSearch = null;
194
507
  this._opts = opts;
195
508
  }
196
509
  /** Load and initialise the main WASM module. Must be called before any other method. */
197
510
  async init() {
198
- const res = await fetch(this._opts.wasmUrl);
511
+ const url = await this._resolveWasmUrl();
512
+ const res = await fetch(url);
199
513
  if (!res.ok)
200
- throw new Error(`Failed to fetch WASM: ${res.status}`);
514
+ throw new AlbexInitError(`Failed to fetch WASM: ${res.status} (${url})`);
201
515
  const { instance } = await WebAssembly.instantiateStreaming(res, {});
202
- this._wasm = instance.exports;
203
- this._mem = instance.exports.memory;
516
+ this._wasm = asAlbexExports(instance.exports);
517
+ this._mem = this._wasm.memory;
204
518
  this._wasm.init();
519
+ // Subscribe to environmental signals. Cheap and benign in node tests
520
+ // (the manager tolerates missing globals).
521
+ const rm = getResourceManager();
522
+ await rm.start();
523
+ this._resources = rm.state;
524
+ this._unsubscribeResources = rm.on(s => { this._resources = s; });
525
+ // Lazily initialise the GPU Bloom accelerator. We don't acquire a device
526
+ // here yet — that happens on the first search that crosses the threshold.
527
+ // This keeps cold-start cost the same on GPU and CPU paths.
528
+ if (this._opts.gpu !== 'off') {
529
+ this._gpu = new BloomGpu();
530
+ }
531
+ }
532
+ /**
533
+ * Decide which `.wasm` binary to fetch. Order of precedence:
534
+ * 1. `opts.wasmUrl` if provided — used verbatim.
535
+ * 2. `opts.tier` if explicit — joined with `wasmBaseUrl`.
536
+ * 3. `opts.wasmBaseUrl` + tier picked from the device profile.
537
+ *
538
+ * Order of precedence:
539
+ * 1. `opts.wasmUrl` literal → use verbatim
540
+ * 2. `opts.wasmBaseUrl` + tier suffix → fetched from that directory
541
+ * 3. zero-config default → `albex_wasm_bg.wasm` packaged
542
+ * next to this file, resolved
543
+ * via `import.meta.url`
544
+ *
545
+ * The zero-config default loads the std-baseline binary. Tier auto-detection
546
+ * is only active when `wasmBaseUrl` is given, because picking a tier in
547
+ * runtime would defeat any bundler's static asset rewriting. Users who want
548
+ * tier optimisation must serve the six variants themselves and pass the
549
+ * directory through `wasmBaseUrl`.
550
+ */
551
+ async _resolveWasmUrl() {
552
+ const o = this._opts;
553
+ if (o.wasmUrl) {
554
+ this._profile = await detectProfile();
555
+ return o.wasmUrl;
556
+ }
557
+ // Always cache the profile so GPU/worker decisions later don't re-probe.
558
+ const profile = await detectProfile();
559
+ this._profile = profile;
560
+ // Path 3: zero-config — bundler-friendly default. `new URL(..., import.meta.url)`
561
+ // is recognised by Vite, Webpack 5+, esbuild, Rollup, Parcel 2 and Next.js
562
+ // as an asset reference. They copy the .wasm to the output directory and
563
+ // rewrite the URL automatically. Consumers who use one of those bundlers
564
+ // get a working `new AlbexEngine()` with no manual setup.
565
+ if (!o.wasmBaseUrl) {
566
+ // We can't tier-select with one URL, so fall back to std baseline.
567
+ // The integrator who wants tier optimisation must opt in via wasmBaseUrl.
568
+ this._tier = 'std';
569
+ this._simd = false;
570
+ return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
571
+ }
572
+ let tier;
573
+ if (o.tier && o.tier !== 'auto')
574
+ tier = o.tier;
575
+ else
576
+ tier = pickTier(profile);
577
+ this._tier = tier;
578
+ const simd = o.simd === 'on'
579
+ ? true
580
+ : o.simd === 'off'
581
+ ? false
582
+ : !!profile?.wasm.simd;
583
+ this._simd = simd;
584
+ const suffix = simd ? `${tier}_simd` : tier;
585
+ const base = o.wasmBaseUrl.replace(/\/+$/, '');
586
+ return `${base}/albex_wasm_${suffix}.wasm`;
587
+ }
588
+ /** The tier that was actually loaded. `null` until `init()` resolves. */
589
+ get tier() { return this._tier; }
590
+ /** True if the SIMD-accelerated binary was loaded. */
591
+ get simdEnabled() { return this._simd; }
592
+ /** True if a WebGPU device is acquired and the next search will use it. */
593
+ get gpuEngaged() { return !!this._gpu?.available; }
594
+ // ── GPU acceleration (CD1) ───────────────────────────────────────────────
595
+ /**
596
+ * Decide whether to use the GPU pre-filter for the upcoming search.
597
+ *
598
+ * Policy:
599
+ * - `gpu: 'off'` → never.
600
+ * - `gpu: 'on'` → always try (still fails over to CPU).
601
+ * - `gpu: 'auto'` (default) → only when WebGPU is available AND
602
+ * chunk count crosses `gpuThreshold`.
603
+ */
604
+ _shouldEngageGpu() {
605
+ const o = this._opts;
606
+ if (!this._gpu)
607
+ return false;
608
+ if (o.gpu === 'off')
609
+ return false;
610
+ if (o.gpu === 'on')
611
+ return true;
612
+ if (!this._profile)
613
+ return false;
614
+ const threshold = o.gpuThreshold ?? 20_000;
615
+ return shouldUseGpu(this._profile, this._wasm.getChunkCount(), threshold);
616
+ }
617
+ /**
618
+ * Run the GPU Bloom scan and install the resulting candidate bitset into
619
+ * WASM. The next `searchBegin` will see the mask and `searchSlice` will
620
+ * restrict its Bitap pass to those candidates.
621
+ *
622
+ * No-op if the GPU device hasn't been acquired yet — first call attempts
623
+ * `init()` lazily; if that fails, the candidate path is permanently
624
+ * disabled for this engine instance.
625
+ */
626
+ async _gpuPreFilter(wasmQuery) {
627
+ const gpu = this._gpu;
628
+ if (!gpu)
629
+ return;
630
+ if (!gpu.available) {
631
+ const ok = await gpu.init();
632
+ if (!ok) {
633
+ this._gpu = null;
634
+ return;
635
+ }
636
+ }
637
+ const chunkCount = this._wasm.getChunkCount();
638
+ if (chunkCount === 0)
639
+ return;
640
+ // Upload blooms if the corpus changed. We re-upload everything on any
641
+ // delta; incremental delta-upload is a future optimisation.
642
+ if (chunkCount !== this._gpuChunkCountUploaded) {
643
+ const ptr = this._wasm.getChunksPtr();
644
+ const stride = this._wasm.getChunkStructSize();
645
+ const bytes = new Uint8Array(this._mem.buffer, ptr, chunkCount * stride);
646
+ const blooms = packBloomsFromChunks(bytes, chunkCount);
647
+ gpu.uploadChunkBlooms(blooms, chunkCount);
648
+ this._gpuChunkCountUploaded = chunkCount;
649
+ }
650
+ // Build the pattern Bloom on the JS side: same hash as Rust
651
+ // (`c & 0x3F` after accent-folding), aggregated across all tokens.
652
+ const patternBloom = computePatternBloom(wasmQuery);
653
+ const passes = await gpu.scan(Number(patternBloom & 0xffffffffn), Number((patternBloom >> 32n) & 0xffffffffn));
654
+ // Push the bitset back into WASM via the scratchpad.
655
+ const passBytes = new Uint8Array(passes.buffer, passes.byteOffset, passes.byteLength);
656
+ this._writePad(passBytes);
657
+ this._wasm.setCandidateMask(passBytes.byteLength);
205
658
  }
206
659
  // ── Internal helpers ──────────────────────────────────────────────────────
207
660
  _u8(off, n) {
@@ -210,7 +663,7 @@ export class AlbexEngine {
210
663
  _writePad(b) {
211
664
  const ptr = this._wasm.getBuffer(b.length);
212
665
  if (!ptr)
213
- throw new Error('Scratchpad too small for this chunk');
666
+ throw new AlbexCapacityError(`Scratchpad too small for ${b.length} bytes`);
214
667
  this._u8(ptr, b.length).set(b);
215
668
  return ptr;
216
669
  }
@@ -232,37 +685,49 @@ export class AlbexEngine {
232
685
  }
233
686
  }
234
687
  _feedXmlBytes(xml, fn) {
688
+ const feeder = this._wasm[fn];
235
689
  for (let i = 0; i < xml.length; i += FEED_SIZE) {
236
690
  const c = xml.subarray(i, i + FEED_SIZE);
237
691
  this._writePad(c);
238
- this._wasm[fn](c.length);
692
+ feeder(c.length);
239
693
  }
240
694
  }
241
695
  // ── PDF WASM (lazy load) ─────────────────────────────────────────────────
242
696
  async _ensurePdfWasm() {
243
697
  if (this._pdfWasm)
244
698
  return;
245
- if (!this._opts.pdfWasmUrl)
246
- throw new Error('pdfWasmUrl not set in AlbexOptions');
247
- const res = await fetch(this._opts.pdfWasmUrl);
699
+ // Zero-config default: resolve relative to this module so bundlers copy
700
+ // the .wasm to the output automatically. Override with `opts.pdfWasmUrl`
701
+ // when serving from a separate CDN.
702
+ const pdfUrl = this._opts.pdfWasmUrl
703
+ ?? new URL('../wasm/pkg/albex_pdf.wasm', import.meta.url).href;
704
+ // Network politeness: on constrained connections (slow-2g/2g/saveData)
705
+ // we still fetch on explicit user request — `_ensurePdfWasm` is only
706
+ // called when the user actually drops a PDF — but we issue a console
707
+ // hint so embedders can surface a "this will download ~1 MB" prompt.
708
+ if (this._resources?.constrainedNetwork) {
709
+ console.info('[albex] downloading PDF WASM (~1 MB) on a constrained network connection');
710
+ }
711
+ const res = await fetch(pdfUrl);
248
712
  if (!res.ok)
249
- throw new Error(`Failed to fetch PDF WASM: ${res.status}`);
250
- const imports = makePdfWasmImports(() => this._pdfMem);
251
- const { instance } = await WebAssembly.instantiateStreaming(res, imports);
252
- this._pdfWasm = instance.exports;
253
- this._pdfMem = instance.exports.memory;
713
+ throw new AlbexInitError(`Failed to fetch PDF WASM: ${res.status}`);
714
+ // Compile first so we can inspect the module's required imports and
715
+ // resolve mangled wasm-bindgen names by prefix rather than by hash.
716
+ const module = await WebAssembly.compileStreaming(res);
717
+ const imports = makePdfWasmImports(module, () => this._pdfMem);
718
+ const instance = await WebAssembly.instantiate(module, imports);
719
+ this._pdfWasm = asAlbexPdfExports(instance.exports);
720
+ this._pdfMem = this._pdfWasm.memory;
254
721
  }
255
722
  // ── Indexers ──────────────────────────────────────────────────────────────
256
- async _indexDocx(file) {
257
- const bytes = new Uint8Array(await file.arrayBuffer());
723
+ async _indexDocx(file, bytes) {
258
724
  const xml = await findZipEntry(bytes, 'word/document.xml');
259
725
  this._wasm.setDocumentName(this._writeStr(file.name));
260
726
  this._wasm.beginDocument();
261
727
  this._feedXmlBytes(xml, 'feedXmlBytes');
262
728
  return this._wasm.endDocument();
263
729
  }
264
- async _indexXlsx(file) {
265
- const bytes = new Uint8Array(await file.arrayBuffer());
730
+ async _indexXlsx(file, bytes) {
266
731
  this._wasm.setDocumentName(this._writeStr(file.name));
267
732
  this._wasm.beginXlsx();
268
733
  try {
@@ -280,40 +745,294 @@ export class AlbexEngine {
280
745
  }
281
746
  return this._wasm.endDocument();
282
747
  }
283
- async _indexPdf(file) {
748
+ async _indexPdf(file, bytes) {
284
749
  await this._ensurePdfWasm();
285
- const pw = this._pdfWasm;
286
- const pm = this._pdfMem;
287
- const bytes = new Uint8Array(await file.arrayBuffer());
750
+ let pw = this._pdfWasm;
751
+ let pm = this._pdfMem;
752
+ if (!pw || !pm)
753
+ throw new AlbexInitError('PDF WASM not initialised');
754
+ // Reserve input buffer and copy bytes. allocInput may trigger a
755
+ // memory.grow inside the PDF module; the previous pm.buffer would
756
+ // become detached. Refresh the memory reference before constructing
757
+ // the view to be safe.
288
758
  const inPtr = pw.allocInput(bytes.length);
759
+ pm = pw.memory;
289
760
  new Uint8Array(pm.buffer, inPtr, bytes.length).set(bytes);
290
- const pageCount = pw.extractPdf(bytes.length);
761
+ // extractPdf can panic inside pdf-extract/lopdf for PDFs that other
762
+ // tools accept (encrypted streams without password, exotic font
763
+ // dictionaries, malformed cross-reference tables, etc.). The crate
764
+ // is built with panic="abort" (required on wasm32-unknown-unknown
765
+ // — no unwinding), so the panic surfaces as a WASM `unreachable`
766
+ // trap and the module instance becomes unusable.
767
+ //
768
+ // Recovery strategy when this happens:
769
+ // 1. Discard the poisoned instance.
770
+ // 2. If OCR is wired AND the rebuilt binary supports image
771
+ // extraction, re-instantiate, reload the input bytes, and try
772
+ // the lopdf-only image-extraction path. lopdf is a separate
773
+ // parser from pdf-extract's text codec — there are real PDFs
774
+ // that pdf-extract trips on but lopdf walks fine, and we can
775
+ // recover the page images even when we cannot recover the
776
+ // vector text.
777
+ // 3. If OCR isn't wired (or the recovery also fails), surface a
778
+ // helpful AlbexParseError that points the user at the fix.
779
+ let pageCount;
780
+ try {
781
+ pageCount = pw.extractPdf(bytes.length);
782
+ }
783
+ catch (e) {
784
+ this._pdfWasm = null;
785
+ this._pdfMem = null;
786
+ const msg = e instanceof Error ? e.message : String(e);
787
+ // Try the OCR fallback before giving up.
788
+ if (this.ocrImage) {
789
+ const recovered = await this._indexPdfViaImagesOnly(file, bytes, msg);
790
+ if (recovered !== null)
791
+ return recovered;
792
+ }
793
+ throw new AlbexParseError('pdf', `PDF text extractor crashed (${msg}). ` +
794
+ (this.ocrImage
795
+ ? 'OCR fallback also could not recover any content from this file.'
796
+ : 'Enable OCR via @albex/ocr to attempt image-based extraction as a fallback.'));
797
+ }
798
+ // Refresh memory once more — extractPdf can grow it too.
799
+ pm = pw.memory;
291
800
  this._wasm.setDocumentName(this._writeStr(file.name));
292
801
  this._wasm.beginDocument();
293
802
  if (pageCount === -2) {
294
- // Image-only PDF register doc with zero chunks.
803
+ // Image-only (scanned) PDF. If OCR is wired AND the PDF binary
804
+ // supports image extraction, fall through to the scanned-PDF path.
805
+ // Otherwise keep today's behaviour: register the doc with 0 chunks
806
+ // so the user sees the file in the index but searches won't hit it.
807
+ const supportsImages = typeof pw.extractPageImages === 'function'
808
+ && typeof pw.getPageCount === 'function';
809
+ if (this.ocrImage && supportsImages) {
810
+ await this._indexPdfScanned(pw);
811
+ }
295
812
  return this._wasm.endDocument();
296
813
  }
297
814
  if (pageCount < 0) {
298
815
  const errLen = pw.getErrorLen();
299
816
  const errPtr = pw.getErrorPtr();
300
817
  const msg = errLen > 0
301
- ? new TextDecoder().decode(new Uint8Array(pm.buffer, errPtr, errLen))
818
+ ? _dec.decode(new Uint8Array(pm.buffer, errPtr, errLen))
302
819
  : 'PDF parse error';
303
- throw new Error(msg);
820
+ throw new AlbexParseError('pdf', msg);
304
821
  }
305
822
  for (let p = 0; p < pageCount; p++) {
306
823
  const len = pw.getPageLen(p);
307
824
  if (!len)
308
825
  continue;
309
- const text = new TextDecoder('utf-8').decode(new Uint8Array(pm.buffer, pw.getPagePtr(p), len));
826
+ // Re-read memory each iteration feedText writes into the main
827
+ // WASM, but reading the PDF page pointers requires the live PDF
828
+ // memory which may have been grown by intermediate calls.
829
+ const liveMem = pw.memory;
830
+ const text = _dec.decode(new Uint8Array(liveMem.buffer, pw.getPagePtr(p), len));
310
831
  this._feedText(text);
311
832
  this._wasm.flushParagraph();
312
833
  }
834
+ // Hybrid OCR pass: when the OCR module is wired with
835
+ // `alwaysExtractEmbeddedImages: true`, also walk every page for
836
+ // embedded images and OCR them on top of the vector text.
837
+ //
838
+ // We always log the decision so users debugging "why isn't OCR
839
+ // firing on my hybrid PDF" can see which precondition failed.
840
+ const hybridOn = !!this.ocrConfig?.alwaysExtractEmbeddedImages;
841
+ const hasOcr = !!this.ocrImage;
842
+ const binSupportsImages = typeof pw.extractPageImages === 'function'
843
+ && typeof pw.getPageCount === 'function';
844
+ console.log(`[albex] hybrid OCR decision: ocrImage=${hasOcr} ocrConfig.alwaysExtractEmbeddedImages=${hybridOn} binarySupportsImages=${binSupportsImages}`);
845
+ if (hasOcr && hybridOn && binSupportsImages) {
846
+ const totalPages = pw.getPageCount();
847
+ console.log(`[albex] hybrid OCR pass starting over ${totalPages} page(s)`);
848
+ for (let p = 0; p < totalPages; p++) {
849
+ const ocrText = await this._ocrPageEmbeddedImages(pw, p);
850
+ if (ocrText === null)
851
+ break; // WASM trapped, stop hybrid pass.
852
+ if (ocrText) {
853
+ this._feedText(ocrText);
854
+ this._wasm.flushParagraph();
855
+ }
856
+ }
857
+ }
858
+ return this._wasm.endDocument();
859
+ }
860
+ /**
861
+ * Scanned-PDF OCR fallback. Called from `_indexPdf` when `extractPdf`
862
+ * returns `-2` (image-only PDF) AND `@albex/ocr` has been wired via
863
+ * `enableOcr(engine)`.
864
+ *
865
+ * Walks every page of the PDF, extracts embedded JPEG / JPEG2000 image
866
+ * XObjects, runs each through `engine.ocrImage`, and feeds the recognised
867
+ * text into the index — one paragraph per page so search snippets stay
868
+ * tied to the page they came from.
869
+ *
870
+ * Failure modes handled here (none re-thrown — the goal is best-effort
871
+ * indexing, not all-or-nothing):
872
+ *
873
+ * * A page's `extractPageImages` traps the WASM instance: the instance
874
+ * is discarded so the next PDF starts fresh, and we stop iterating
875
+ * (no more pages can be read from a poisoned instance). The doc is
876
+ * still committed with whatever text we got from earlier pages.
877
+ * * An individual image fails to OCR (Tesseract decode error, JP2 not
878
+ * supported in this browser, etc.): we skip that image and keep
879
+ * going. Partial coverage beats nothing.
880
+ * * A page yields no extractable images (e.g. uses Flate/CCITT/JBIG2):
881
+ * no paragraph is emitted; the page contributes 0 chunks.
882
+ */
883
+ async _indexPdfScanned(pw) {
884
+ if (!this.ocrImage)
885
+ return;
886
+ const totalPages = pw.getPageCount();
887
+ if (!totalPages)
888
+ return;
889
+ for (let p = 0; p < totalPages; p++) {
890
+ const pageText = await this._ocrPageEmbeddedImages(pw, p);
891
+ if (pageText === null)
892
+ return; // WASM poisoned mid-iteration.
893
+ if (pageText) {
894
+ this._feedText(pageText);
895
+ this._wasm.flushParagraph();
896
+ }
897
+ }
898
+ }
899
+ /**
900
+ * Walk one page's embedded image XObjects, OCR each image, and return
901
+ * the joined recognised text for that page.
902
+ *
903
+ * Used by:
904
+ * - `_indexPdfScanned`: image-only PDFs (extractPdf returned -2).
905
+ * - `_indexPdf` hybrid path: when `ocrConfig.alwaysExtractEmbeddedImages`
906
+ * is set, every page goes through here on top of the normal text
907
+ * extraction.
908
+ *
909
+ * Returns:
910
+ * - The recognised text (possibly empty if the page has no qualifying
911
+ * images or every OCR call failed).
912
+ * - `null` if the PDF WASM trapped during extractPageImages — the
913
+ * caller should abort the remaining pages because the instance is
914
+ * now poisoned.
915
+ *
916
+ * Failure-handling philosophy: best-effort. An OCR failure on one image
917
+ * does not stop the page; a page with no images does not stop the doc;
918
+ * only a WASM trap stops the doc.
919
+ */
920
+ async _ocrPageEmbeddedImages(pw, page) {
921
+ const ocr = this.ocrImage;
922
+ if (!ocr)
923
+ return '';
924
+ let imageCount;
925
+ try {
926
+ imageCount = pw.extractPageImages(page);
927
+ }
928
+ catch (e) {
929
+ // The PDF module just trapped — it is now poisoned. Drop our refs
930
+ // so `_ensurePdfWasm` re-instantiates on the next call.
931
+ this._pdfWasm = null;
932
+ this._pdfMem = null;
933
+ console.warn(`[albex] PDF image extractor trapped on page ${page + 1}: ${e instanceof Error ? e.message : String(e)}. Stopping OCR.`);
934
+ return null;
935
+ }
936
+ if (imageCount <= 0)
937
+ return '';
938
+ // The buffer view must be re-acquired AFTER extractPageImages —
939
+ // it may have grown the linear memory and detached old views.
940
+ const liveMem = pw.memory;
941
+ let pageText = '';
942
+ for (let i = 0; i < imageCount; i++) {
943
+ const len = pw.getPageImageLen(i);
944
+ if (!len)
945
+ continue;
946
+ const ptr = pw.getPageImagePtr(i);
947
+ const kind = pw.getPageImageKind(i);
948
+ const mime = kind === 1 ? 'image/jpeg'
949
+ : kind === 2 ? 'image/jp2'
950
+ : 'application/octet-stream';
951
+ // Snapshot the image bytes into a fresh ArrayBuffer. The pointer
952
+ // returned by getPageImagePtr is only valid until the next
953
+ // extractPageImages / extractPdf call, so we cannot hold the view.
954
+ const copy = new Uint8Array(len);
955
+ copy.set(new Uint8Array(liveMem.buffer, ptr, len));
956
+ const blob = new Blob([copy.buffer], { type: mime });
957
+ // Defensive diagnostics: when an OCR call goes wrong (Tesseract
958
+ // worker abort, malformed JPEG, etc.) the first thing we want to
959
+ // see is whether we even handed it valid image bytes. A real JPEG
960
+ // starts with FF D8 FF (E0 for JFIF, E1 for EXIF). A JPEG2000
961
+ // starts with 00 00 00 0C 6A 50 20 20.
962
+ const magic = Array.from(copy.subarray(0, 4))
963
+ .map(b => b.toString(16).padStart(2, '0'))
964
+ .join(' ');
965
+ console.log(`[albex] OCR page ${page + 1} image ${i + 1}/${imageCount}: kind=${kind} (${mime}) len=${len} bytes magic=${magic}`);
966
+ try {
967
+ const { text } = await ocr(blob);
968
+ const trimmed = text?.trim();
969
+ if (trimmed) {
970
+ pageText = pageText ? `${pageText} ${trimmed}` : trimmed;
971
+ }
972
+ }
973
+ catch (e) {
974
+ // Image-level OCR failure — skip and continue. JP2 in browsers
975
+ // without native support lands here; so do truncated or
976
+ // unsupported JPEG variants. Worker aborts (Tesseract.js
977
+ // "Aborted(-1)") are also caught here; if they bypass the
978
+ // promise rejection and surface as `uncaught` instead, the
979
+ // demo's window.onerror handler will keep the app alive.
980
+ console.warn(`[albex] OCR failed on page ${page + 1} image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`);
981
+ }
982
+ }
983
+ return pageText;
984
+ }
985
+ /**
986
+ * Last-chance OCR path used when `extractPdf` itself trapped (pdf-extract
987
+ * crashed but lopdf may still be able to walk the file). Re-instantiates
988
+ * the PDF WASM, reloads the input bytes, and tries the image-extraction
989
+ * route directly — bypassing the text codec entirely.
990
+ *
991
+ * Returns:
992
+ * * the doc's chunk count on success (even 0 — that means lopdf could
993
+ * parse but no qualifying images existed, which still beats a hard
994
+ * parse error),
995
+ * * null if the recovery itself failed (binary lacks the image exports,
996
+ * re-instantiation failed, or lopdf also trapped). In the null case
997
+ * the caller throws AlbexParseError so the user sees a clear message.
998
+ */
999
+ async _indexPdfViaImagesOnly(file, bytes, originalError) {
1000
+ try {
1001
+ await this._ensurePdfWasm();
1002
+ }
1003
+ catch {
1004
+ return null;
1005
+ }
1006
+ const pw = this._pdfWasm;
1007
+ if (!pw)
1008
+ return null;
1009
+ const supportsImages = typeof pw.extractPageImages === 'function'
1010
+ && typeof pw.getPageCount === 'function';
1011
+ if (!supportsImages)
1012
+ return null;
1013
+ // Reload input bytes into the fresh instance. allocInput may grow the
1014
+ // memory, so re-acquire the buffer view immediately after.
1015
+ let inPtr;
1016
+ try {
1017
+ inPtr = pw.allocInput(bytes.length);
1018
+ new Uint8Array(pw.memory.buffer, inPtr, bytes.length).set(bytes);
1019
+ }
1020
+ catch (e) {
1021
+ console.warn(`[albex] PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`);
1022
+ return null;
1023
+ }
1024
+ // Set up the doc and let _indexPdfScanned do the page-by-page walk.
1025
+ // _indexPdfScanned tolerates lopdf failing mid-stream — it caches the
1026
+ // poisoned instance and returns early. If lopdf trips on the very
1027
+ // first page, no paragraphs are emitted and we end up with 0 chunks.
1028
+ this._wasm.setDocumentName(this._writeStr(file.name));
1029
+ this._wasm.beginDocument();
1030
+ console.info(`[albex] pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf for ${file.name}`);
1031
+ await this._indexPdfScanned(pw);
313
1032
  return this._wasm.endDocument();
314
1033
  }
315
- async _indexTxt(file) {
316
- const text = await file.text();
1034
+ async _indexTxt(file, bytes) {
1035
+ const text = _dec.decode(bytes);
317
1036
  this._wasm.setDocumentName(this._writeStr(file.name));
318
1037
  this._wasm.beginDocument();
319
1038
  for (const para of text.split(/\n{2,}/)) {
@@ -325,8 +1044,8 @@ export class AlbexEngine {
325
1044
  }
326
1045
  return this._wasm.endDocument();
327
1046
  }
328
- async _indexXml(file) {
329
- const plain = (await file.text())
1047
+ async _indexXml(file, bytes) {
1048
+ const plain = _dec.decode(bytes)
330
1049
  .replace(/<[^]*?>/g, '\n')
331
1050
  .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>')
332
1051
  .replace(/&quot;/g, '"').replace(/&apos;/g, "'")
@@ -342,6 +1061,426 @@ export class AlbexEngine {
342
1061
  }
343
1062
  return this._wasm.endDocument();
344
1063
  }
1064
+ // ── Markdown ─────────────────────────────────────────────────────────────
1065
+ // Strip CommonMark inline marks but keep word content. Paragraphs split on
1066
+ // blank lines, same convention as TXT/XML.
1067
+ async _indexMd(file, bytes) {
1068
+ const text = _dec.decode(bytes)
1069
+ // Remove fenced code blocks entirely (often noisy for search relevance).
1070
+ .replace(/```[\s\S]*?```/g, '\n')
1071
+ .replace(/~~~[\s\S]*?~~~/g, '\n')
1072
+ // Strip ATX heading markers but keep heading text.
1073
+ .replace(/^#{1,6}\s+/gm, '')
1074
+ // Replace inline links/images with their visible text.
1075
+ .replace(/!\[([^\]]*)\]\([^)]*\)/g, '$1')
1076
+ .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1')
1077
+ // Strip emphasis markers (preserve content).
1078
+ .replace(/(\*\*|__|\*|_)/g, '')
1079
+ // Inline code.
1080
+ .replace(/`([^`]+)`/g, '$1')
1081
+ // Blockquote marks.
1082
+ .replace(/^>\s?/gm, '')
1083
+ // List markers.
1084
+ .replace(/^\s*[-*+]\s+/gm, '')
1085
+ .replace(/^\s*\d+\.\s+/gm, '');
1086
+ this._wasm.setDocumentName(this._writeStr(file.name));
1087
+ this._wasm.beginDocument();
1088
+ for (const para of text.split(/\n{2,}/)) {
1089
+ const l = para.replace(/\n/g, ' ').trim();
1090
+ if (l) {
1091
+ this._feedText(l);
1092
+ this._wasm.flushParagraph();
1093
+ }
1094
+ }
1095
+ return this._wasm.endDocument();
1096
+ }
1097
+ // ── HTML ─────────────────────────────────────────────────────────────────
1098
+ // Strip <script>/<style> entire blocks, then drop tag markup. The output is
1099
+ // chunked at <p>, <br>, <h*>, <li>, <tr> boundaries (mapped to paragraph
1100
+ // breaks) so search location numbers map naturally to the document outline.
1101
+ async _indexHtml(file, bytes) {
1102
+ const html = _dec.decode(bytes)
1103
+ .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, ' ')
1104
+ .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, ' ')
1105
+ // Treat block-level closers as paragraph separators.
1106
+ .replace(/<\/(p|h[1-6]|li|tr|div|section|article|header|footer)\s*>/gi, '\n\n')
1107
+ .replace(/<br\s*\/?\s*>/gi, '\n')
1108
+ // Drop remaining tags.
1109
+ .replace(/<[^>]+>/g, ' ')
1110
+ // Decode common entities (full set would need a table; this covers >95%).
1111
+ .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>')
1112
+ .replace(/&quot;/g, '"').replace(/&apos;/g, "'").replace(/&nbsp;/g, ' ')
1113
+ .replace(/&#(\d+);/g, (_, n) => String.fromCodePoint(Number(n)))
1114
+ .replace(/&#x([0-9a-f]+);/gi, (_, n) => String.fromCodePoint(parseInt(n, 16)))
1115
+ .replace(/[ \t]+/g, ' ');
1116
+ this._wasm.setDocumentName(this._writeStr(file.name));
1117
+ this._wasm.beginDocument();
1118
+ for (const para of html.split(/\n{2,}/)) {
1119
+ const l = para.replace(/\n/g, ' ').trim();
1120
+ if (l) {
1121
+ this._feedText(l);
1122
+ this._wasm.flushParagraph();
1123
+ }
1124
+ }
1125
+ return this._wasm.endDocument();
1126
+ }
1127
+ // ── JSON ─────────────────────────────────────────────────────────────────
1128
+ // Extract every string value (keys + leaf strings) recursively. Each leaf
1129
+ // becomes its own searchable chunk via paragraph flush. Numbers/booleans
1130
+ // are skipped (cannot match a textual query usefully).
1131
+ async _indexJson(file, bytes) {
1132
+ let root;
1133
+ try {
1134
+ root = JSON.parse(_dec.decode(bytes));
1135
+ }
1136
+ catch (e) {
1137
+ throw new AlbexParseError('json', e.message);
1138
+ }
1139
+ this._wasm.setDocumentName(this._writeStr(file.name));
1140
+ this._wasm.beginDocument();
1141
+ const visit = (v) => {
1142
+ if (typeof v === 'string') {
1143
+ if (v.trim()) {
1144
+ this._feedText(v);
1145
+ this._wasm.flushParagraph();
1146
+ }
1147
+ }
1148
+ else if (Array.isArray(v)) {
1149
+ for (const x of v)
1150
+ visit(x);
1151
+ }
1152
+ else if (v && typeof v === 'object') {
1153
+ for (const [k, x] of Object.entries(v)) {
1154
+ if (k.trim()) {
1155
+ this._feedText(k);
1156
+ this._wasm.flushParagraph();
1157
+ }
1158
+ visit(x);
1159
+ }
1160
+ }
1161
+ };
1162
+ visit(root);
1163
+ return this._wasm.endDocument();
1164
+ }
1165
+ // ── CSV ──────────────────────────────────────────────────────────────────
1166
+ // RFC 4180 lite: comma-separated, optional double quotes, escaped "" inside
1167
+ // quoted fields. Each row becomes one paragraph (location = row index, with
1168
+ // header row at location 0).
1169
+ async _indexCsv(file, bytes) {
1170
+ // Strip an optional UTF-8 BOM. Excel writes it by default for "CSV UTF-8";
1171
+ // without this fix the first field of the first row would start with
1172
+ // U+FEFF, which both shifts column alignment when consumers split on a
1173
+ // field name and breaks search hits on "Subject" / "Asunto" etc.
1174
+ let text = _dec.decode(bytes);
1175
+ if (text.charCodeAt(0) === 0xFEFF)
1176
+ text = text.slice(1);
1177
+ this._wasm.setDocumentName(this._writeStr(file.name));
1178
+ this._wasm.beginDocument();
1179
+ let row = [];
1180
+ let field = '';
1181
+ let inQuoted = false;
1182
+ const flushRow = () => {
1183
+ const line = row.join(' ').trim();
1184
+ if (line) {
1185
+ this._feedText(line);
1186
+ this._wasm.flushParagraph();
1187
+ }
1188
+ row = [];
1189
+ };
1190
+ for (let i = 0; i < text.length; i++) {
1191
+ const c = text[i];
1192
+ if (inQuoted) {
1193
+ if (c === '"') {
1194
+ if (text[i + 1] === '"') {
1195
+ field += '"';
1196
+ i++;
1197
+ }
1198
+ else
1199
+ inQuoted = false;
1200
+ }
1201
+ else
1202
+ field += c;
1203
+ }
1204
+ else {
1205
+ if (c === ',') {
1206
+ row.push(field);
1207
+ field = '';
1208
+ }
1209
+ else if (c === '\n') {
1210
+ row.push(field);
1211
+ field = '';
1212
+ flushRow();
1213
+ }
1214
+ else if (c === '\r') { /* skip */ }
1215
+ else if (c === '"' && field.length === 0)
1216
+ inQuoted = true;
1217
+ else
1218
+ field += c;
1219
+ }
1220
+ }
1221
+ if (field.length > 0 || row.length > 0) {
1222
+ row.push(field);
1223
+ flushRow();
1224
+ }
1225
+ return this._wasm.endDocument();
1226
+ }
1227
+ // ── EML / MBOX ───────────────────────────────────────────────────────────
1228
+ // Minimal MIME: parse the first text/plain body. Headers From/To/Subject
1229
+ // are indexed as separate paragraphs so they're individually searchable.
1230
+ //
1231
+ // What's decoded:
1232
+ // * Content-Transfer-Encoding: base64 → decoded.
1233
+ // * Content-Transfer-Encoding: quoted-printable → decoded.
1234
+ // * Content-Transfer-Encoding: 7bit / 8bit → pass-through.
1235
+ // * Nested multipart (multipart/alternative inside multipart/mixed) by
1236
+ // recursively walking boundaries until a text/plain section is found.
1237
+ //
1238
+ // What's not decoded (out of scope for this "lite" parser):
1239
+ // * Encoded-word headers (=?utf-8?Q?...?=) — only the raw bytes go in.
1240
+ // * Charset conversions other than UTF-8 — assumes the body decodes as UTF-8.
1241
+ // * HTML-only emails — they're dropped if no text/plain part is present.
1242
+ // * MBOX format (multiple emails concatenated). Each email needs to be
1243
+ // fed separately.
1244
+ async _indexEml(file, bytes) {
1245
+ const raw = _dec.decode(bytes).replace(/\r\n/g, '\n');
1246
+ const headerEnd = raw.indexOf('\n\n');
1247
+ const headersBlock = headerEnd > 0 ? raw.slice(0, headerEnd) : raw;
1248
+ const body = headerEnd > 0 ? raw.slice(headerEnd + 2) : '';
1249
+ const header = (block, name) => {
1250
+ const m = new RegExp(`^${name}:\\s*(.+(?:\\n[ \\t].+)*)`, 'mi').exec(block);
1251
+ return m ? (m[1] ?? '').replace(/\n[ \t]+/g, ' ').trim() : '';
1252
+ };
1253
+ this._wasm.setDocumentName(this._writeStr(file.name));
1254
+ this._wasm.beginDocument();
1255
+ const subj = header(headersBlock, 'Subject');
1256
+ const from = header(headersBlock, 'From');
1257
+ const to = header(headersBlock, 'To');
1258
+ for (const h of [subj, from, to]) {
1259
+ if (h) {
1260
+ this._feedText(h);
1261
+ this._wasm.flushParagraph();
1262
+ }
1263
+ }
1264
+ const plain = this._extractEmlTextPlain(headersBlock, body, header) ?? body;
1265
+ for (const para of plain.split(/\n{2,}/)) {
1266
+ const l = para.replace(/\n/g, ' ').trim();
1267
+ if (l) {
1268
+ this._feedText(l);
1269
+ this._wasm.flushParagraph();
1270
+ }
1271
+ }
1272
+ return this._wasm.endDocument();
1273
+ }
1274
+ /**
1275
+ * Walk the multipart tree until a text/plain section is found. Returns
1276
+ * the decoded body as a string, or null if no text/plain part exists.
1277
+ *
1278
+ * The function is called with the headers and body of the *current*
1279
+ * MIME entity (the top-level message at first, then each multipart child
1280
+ * on recursion). For single-part entities it inspects the entity's own
1281
+ * Content-Transfer-Encoding and decodes accordingly.
1282
+ */
1283
+ _extractEmlTextPlain(headersBlock, body, header) {
1284
+ const contentType = header(headersBlock, 'Content-Type');
1285
+ const boundary = /boundary="?([^";]+)"?/i.exec(contentType)?.[1];
1286
+ if (!boundary) {
1287
+ // Single-part body. If it claims to be text/plain (the default when
1288
+ // Content-Type is absent), apply Transfer-Encoding decoding here.
1289
+ // Anything else (text/html, application/*) gets returned raw — the
1290
+ // top-level caller still feeds it as text, but searches against
1291
+ // genuinely binary payloads will not hit anything useful.
1292
+ if (contentType === '' || /text\/plain/i.test(contentType)) {
1293
+ return decodeEmlBody(headersBlock, body, header);
1294
+ }
1295
+ return body;
1296
+ }
1297
+ const parts = body.split(`--${boundary}`);
1298
+ for (const part of parts) {
1299
+ const trimmed = part.replace(/^\n+/, '');
1300
+ const ph = trimmed.indexOf('\n\n');
1301
+ if (ph < 0)
1302
+ continue;
1303
+ const partHeaders = trimmed.slice(0, ph);
1304
+ const partBody = trimmed.slice(ph + 2);
1305
+ const partCtype = header(partHeaders, 'Content-Type');
1306
+ if (/^multipart\//i.test(partCtype)) {
1307
+ const inner = this._extractEmlTextPlain(partHeaders, partBody, header);
1308
+ if (inner)
1309
+ return inner;
1310
+ continue;
1311
+ }
1312
+ if (/text\/plain/i.test(partCtype)) {
1313
+ return decodeEmlBody(partHeaders, partBody, header);
1314
+ }
1315
+ }
1316
+ return null;
1317
+ }
1318
+ // ── RTF ──────────────────────────────────────────────────────────────────
1319
+ //
1320
+ // Strip the {\rtf1...} group structure. Control words (\xxx and \xxxN),
1321
+ // hex escapes (\'XX), unicode escapes (\uN ?) and groups are processed;
1322
+ // plain runs are kept.
1323
+ //
1324
+ // Character decoding:
1325
+ // * \'XX → Windows-1252 byte XX. RTF defaults to cp1252 for high-ANSI;
1326
+ // we map the relevant rows (0x80–0x9F differs from Latin-1)
1327
+ // to their Unicode equivalents. Outside that block, the byte
1328
+ // is taken as Latin-1 (which equals Unicode below 0x100).
1329
+ // Result: accents in es/fr/de/it/pt RTF dumps survive.
1330
+ // * \uN ? → Unicode codepoint N (signed 16-bit, negative means N+65536).
1331
+ // Followed by a fallback character which we then skip — Word
1332
+ // writes the ASCII transliteration of the unicode glyph as a
1333
+ // fallback for non-Unicode readers; we ignore it because we
1334
+ // have the real codepoint.
1335
+ // * \- → soft hyphen (drop).
1336
+ // * \~ → non-breaking space.
1337
+ // * \emdash, \endash, \bullet, \lquote, \rquote, \ldblquote, \rdblquote
1338
+ // → their Unicode equivalents.
1339
+ //
1340
+ // What's not handled (assumes Word/Pages/LibreOffice output, where
1341
+ // these aren't load-bearing):
1342
+ // * \ansicpg, \fcharset — we always assume cp1252 for \' escapes.
1343
+ // * \bin — binary data with explicit length; rare in document RTF.
1344
+ // * Field codes — rendered as the visible text (good enough for search).
1345
+ async _indexRtf(file, bytes) {
1346
+ const src = _dec.decode(bytes);
1347
+ let out = '';
1348
+ let i = 0;
1349
+ let depth = 0;
1350
+ // Track if we're inside a destination group we should skip (e.g. \fonttbl).
1351
+ let skipDepth = 0;
1352
+ const SKIP_DESTINATIONS = /^\\(fonttbl|colortbl|stylesheet|info|pict|object|header|footer)\b/;
1353
+ while (i < src.length) {
1354
+ const c = src[i];
1355
+ if (c === '{') {
1356
+ depth++;
1357
+ i++;
1358
+ continue;
1359
+ }
1360
+ if (c === '}') {
1361
+ depth--;
1362
+ if (skipDepth > 0 && depth < skipDepth)
1363
+ skipDepth = 0;
1364
+ i++;
1365
+ continue;
1366
+ }
1367
+ if (c === '\\') {
1368
+ // Hex byte escape: \'XX
1369
+ if (src[i + 1] === '\'' && i + 3 < src.length) {
1370
+ const hex = src.slice(i + 2, i + 4);
1371
+ if (/^[0-9A-Fa-f]{2}$/.test(hex)) {
1372
+ if (skipDepth === 0)
1373
+ out += rtfCp1252ToChar(parseInt(hex, 16));
1374
+ i += 4;
1375
+ continue;
1376
+ }
1377
+ // Malformed — drop and advance.
1378
+ i += 2;
1379
+ continue;
1380
+ }
1381
+ // Unicode escape: \uN followed by optional fallback character.
1382
+ // N is signed 16-bit per the spec; negative values mean N + 65536.
1383
+ const um = /^\\u(-?\d+) ?/.exec(src.slice(i));
1384
+ if (um) {
1385
+ let code = parseInt(um[1] ?? '0', 10);
1386
+ if (code < 0)
1387
+ code += 0x10000;
1388
+ if (skipDepth === 0 && code > 0 && code < 0x110000) {
1389
+ out += String.fromCodePoint(code);
1390
+ }
1391
+ i += um[0].length;
1392
+ // Skip the fallback char. Word writes one ASCII char after \uN
1393
+ // (the "uc1" count). We assume uc1, which is the Word default.
1394
+ if (i < src.length && src[i] !== '\\' && src[i] !== '{' && src[i] !== '}') {
1395
+ i++;
1396
+ }
1397
+ continue;
1398
+ }
1399
+ // Control word / symbol.
1400
+ const m = /^\\([A-Za-z]+)(-?\d+)?\s?/.exec(src.slice(i));
1401
+ if (m) {
1402
+ const word = m[1] ?? '';
1403
+ if (skipDepth === 0 && SKIP_DESTINATIONS.test(src.slice(i)))
1404
+ skipDepth = depth;
1405
+ if (skipDepth === 0) {
1406
+ switch (word) {
1407
+ case 'par':
1408
+ case 'line':
1409
+ case 'sect':
1410
+ out += '\n\n';
1411
+ break;
1412
+ case 'tab':
1413
+ out += '\t';
1414
+ break;
1415
+ case 'emdash':
1416
+ out += '—';
1417
+ break;
1418
+ case 'endash':
1419
+ out += '–';
1420
+ break;
1421
+ case 'bullet':
1422
+ out += '•';
1423
+ break;
1424
+ case 'lquote':
1425
+ out += '‘';
1426
+ break;
1427
+ case 'rquote':
1428
+ out += '’';
1429
+ break;
1430
+ case 'ldblquote':
1431
+ out += '“';
1432
+ break;
1433
+ case 'rdblquote':
1434
+ out += '”';
1435
+ break;
1436
+ default: /* drop other control words silently */ break;
1437
+ }
1438
+ }
1439
+ i += m[0].length;
1440
+ continue;
1441
+ }
1442
+ // Escaped single character: \\, \{, \}, \-, \~ etc.
1443
+ if (skipDepth === 0) {
1444
+ const escaped = src[i + 1];
1445
+ if (escaped === '~')
1446
+ out += ' '; // non-breaking space
1447
+ else if (escaped === '-') { /* soft hyphen — drop */ }
1448
+ else if (escaped !== undefined)
1449
+ out += escaped;
1450
+ }
1451
+ i += 2;
1452
+ continue;
1453
+ }
1454
+ if (skipDepth === 0)
1455
+ out += c;
1456
+ i++;
1457
+ }
1458
+ this._wasm.setDocumentName(this._writeStr(file.name));
1459
+ this._wasm.beginDocument();
1460
+ for (const para of out.split(/\n{2,}/)) {
1461
+ const l = para.replace(/\n/g, ' ').trim();
1462
+ if (l) {
1463
+ this._feedText(l);
1464
+ this._wasm.flushParagraph();
1465
+ }
1466
+ }
1467
+ return this._wasm.endDocument();
1468
+ }
1469
+ static _INDEXERS = {
1470
+ docx: (e, f, b) => e._indexDocx(f, b),
1471
+ xlsx: (e, f, b) => e._indexXlsx(f, b),
1472
+ pdf: (e, f, b) => e._indexPdf(f, b),
1473
+ txt: (e, f, b) => e._indexTxt(f, b),
1474
+ xml: (e, f, b) => e._indexXml(f, b),
1475
+ md: (e, f, b) => e._indexMd(f, b),
1476
+ markdown: (e, f, b) => e._indexMd(f, b),
1477
+ html: (e, f, b) => e._indexHtml(f, b),
1478
+ htm: (e, f, b) => e._indexHtml(f, b),
1479
+ json: (e, f, b) => e._indexJson(f, b),
1480
+ csv: (e, f, b) => e._indexCsv(f, b),
1481
+ eml: (e, f, b) => e._indexEml(f, b),
1482
+ rtf: (e, f, b) => e._indexRtf(f, b),
1483
+ };
345
1484
  // ── Public API ────────────────────────────────────────────────────────────
346
1485
  /**
347
1486
  * Index a file. Supported formats: DOCX, XLSX, PDF, TXT, XML, Markdown, HTML, JSON, CSV, EML, RTF.
@@ -351,45 +1490,299 @@ export class AlbexEngine {
351
1490
  const ext = file.name.split('.').pop()?.toLowerCase() ?? '';
352
1491
  const indexer = AlbexEngine._INDEXERS[ext];
353
1492
  if (!indexer)
354
- throw new Error(`Unsupported format: .${ext}`);
1493
+ throw new AlbexUnsupportedFormatError(ext);
1494
+ // Hash the source bytes for idempotency. We always read the bytes once
1495
+ // here so the indexer can reuse them — avoids a double File.arrayBuffer().
1496
+ const bytes = new Uint8Array(await file.arrayBuffer());
1497
+ const hash = contentHash(bytes);
1498
+ // Idempotency: if a non-deleted doc already has this hash, return it
1499
+ // unchanged. Cheap O(N) scan since MAX_DOCS = 128.
1500
+ const existing = this._docs.find(d => d.contentHash === hash);
1501
+ if (existing)
1502
+ return existing;
1503
+ const w = this._wasm;
355
1504
  const t0 = performance.now();
356
- const textPre = this._wasm.getTextUsed();
357
- const chunks = await indexer(this, file);
1505
+ const textPre = w.getTextUsed();
1506
+ const docCountBefore = w.getDocCount();
1507
+ // Snapshot v2: hand the content hash to the WASM so it persists with
1508
+ // the doc. Older binaries (pre-v2) lack this export — we silently skip
1509
+ // and behave like before. The indexer will overwrite the scratchpad
1510
+ // immediately after (with the doc name), which is fine because
1511
+ // setDocumentContentHash copies into pending_content_hash before
1512
+ // returning.
1513
+ if (typeof w.setDocumentContentHash === 'function') {
1514
+ const hashBytes = hashHexToBytes(hash);
1515
+ this._writePad(hashBytes);
1516
+ w.setDocumentContentHash(hashBytes.length);
1517
+ }
1518
+ const chunks = await indexer(this, file, bytes);
1519
+ // The new doc occupies slot `docCountBefore`.
1520
+ const docId = w.getDocId(docCountBefore);
358
1521
  const doc = {
359
1522
  name: file.name,
360
1523
  ext,
361
1524
  chunks,
362
1525
  indexTimeMs: performance.now() - t0,
363
- textBytes: this._wasm.getTextUsed() - textPre,
1526
+ textBytes: w.getTextUsed() - textPre,
1527
+ docId,
1528
+ contentHash: hash,
364
1529
  };
365
1530
  this._docs.push(doc);
366
1531
  return doc;
367
1532
  }
1533
+ /**
1534
+ * Mark a previously indexed document as removed. Searches no longer return
1535
+ * its chunks. Storage is reclaimed only after `compact()`.
1536
+ *
1537
+ * `id` can be the file name or the contentHash returned by `indexFile`.
1538
+ * Returns `true` if a matching document was found and tombstoned.
1539
+ */
1540
+ removeDocument(id) {
1541
+ const doc = this._docs.find(d => d.name === id || d.contentHash === id);
1542
+ if (!doc)
1543
+ return false;
1544
+ const ok = this._wasm.removeDocument(doc.docId) === 1;
1545
+ if (ok) {
1546
+ this._docs = this._docs.filter(d => d !== doc);
1547
+ }
1548
+ return ok;
1549
+ }
1550
+ /**
1551
+ * Replace a previously indexed document with new content. Equivalent to
1552
+ * `removeDocument(name)` + `indexFile(newFile)` but does not trigger the
1553
+ * idempotency check (so re-indexing the *same* bytes after a remove works).
1554
+ */
1555
+ async replaceDocument(name, newFile) {
1556
+ this.removeDocument(name);
1557
+ // Force a unique-hash path by indexing directly; if the new file happens
1558
+ // to hash identically to a still-tracked document, the dedupe in
1559
+ // indexFile will return that one. The remove above prevents the
1560
+ // common case.
1561
+ return this.indexFile(newFile);
1562
+ }
1563
+ /**
1564
+ * Reclaim storage from previously removed documents. Compacts CHUNKS,
1565
+ * TEXT_POOL, DOC_NAMES and NAME_POOL in place. Idempotent.
1566
+ *
1567
+ * Note: doc_ids of surviving documents are preserved, so any stored
1568
+ * references (e.g. in a UI) remain valid.
1569
+ */
1570
+ compact() {
1571
+ this._wasm.compact();
1572
+ }
368
1573
  /**
369
1574
  * Search the index. Supports:
370
1575
  * - Simple queries: `contrato` (AND of tokens, accent-insensitive)
371
1576
  * - Phrase queries: `"contrato marco"` (must appear as phrase)
372
1577
  * - OR queries: `contrato | acuerdo` (union of two searches)
1578
+ *
1579
+ * Pass `{ windowed: true }` to receive cropped snippets with ASCII ellipsis
1580
+ * markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
373
1581
  */
374
- search(query) {
1582
+ search(query, opts = {}) {
375
1583
  const parsed = parseQuery(query);
376
1584
  if (parsed.kind === 'or') {
377
- return this._searchOr(parsed.branches, query);
1585
+ return this._searchOr(parsed.branches, query, opts);
378
1586
  }
379
- const results = this._runSearch(tokensToWasmQuery(parsed.tokens), query);
1587
+ const results = this._runSearch(tokensToWasmQuery(parsed.tokens), query, opts);
380
1588
  if (parsed.kind === 'phrase') {
381
1589
  return results.filter(r => containsPhrase(r.snippet, parsed.tokens));
382
1590
  }
383
1591
  return results;
384
1592
  }
385
- _searchOr(branches, rawQuery) {
1593
+ /**
1594
+ * Cooperative search. Processes the corpus in slices, yielding to the
1595
+ * event loop between them so the host UI thread keeps a chance to paint
1596
+ * even while a long scan is in flight.
1597
+ *
1598
+ * NOTE: this is NOT incremental streaming. Results are materialised
1599
+ * once the search completes and then iterated out in score-descending
1600
+ * order. The async iterator shape is preserved because the work that
1601
+ * produces those results genuinely yields to the scheduler between
1602
+ * slices — a future iteration may stream individual results before the
1603
+ * heap sorts, but doing so today would deliver them in arbitrary order.
1604
+ *
1605
+ * Pass `opts.frameBudgetMs` to control the slice size (default 8 ms).
1606
+ */
1607
+ async *searchCooperative(query, opts = {}) {
1608
+ const parsed = parseQuery(query);
1609
+ const budget = opts.frameBudgetMs ?? 8;
1610
+ const w = this._wasm;
1611
+ // OR queries: run each branch as its own resumable search, dedup, sort.
1612
+ if (parsed.kind === 'or') {
1613
+ const seen = new Set();
1614
+ const all = [];
1615
+ for (const tokens of parsed.branches) {
1616
+ const q = tokensToWasmQuery(tokens);
1617
+ if (!q)
1618
+ continue;
1619
+ const r = await this._runSearchBudgeted(q, query, opts, budget);
1620
+ for (const x of r) {
1621
+ const key = `${x.documentName}:${x.location}:${x.matchStart}`;
1622
+ if (!seen.has(key)) {
1623
+ seen.add(key);
1624
+ all.push(x);
1625
+ }
1626
+ }
1627
+ }
1628
+ all.sort((a, b) => b.score - a.score);
1629
+ for (const r of all)
1630
+ yield r;
1631
+ return;
1632
+ }
1633
+ const results = await this._runSearchBudgeted(tokensToWasmQuery(parsed.tokens), query, opts, budget);
1634
+ const filtered = parsed.kind === 'phrase'
1635
+ ? results.filter(r => containsPhrase(r.snippet, parsed.tokens))
1636
+ : results;
1637
+ for (const r of filtered)
1638
+ yield r;
1639
+ void w;
1640
+ }
1641
+ /**
1642
+ * @deprecated Renamed to `searchCooperative` in 0.3.0. The original name
1643
+ * was misleading — this method does not stream incremental results, it
1644
+ * yields to the scheduler between slices and returns a batch. The alias
1645
+ * keeps existing integrations working; it will be removed in 0.4.0.
1646
+ */
1647
+ async *searchStream(query, opts = {}) {
1648
+ warnSearchStreamDeprecated();
1649
+ yield* this.searchCooperative(query, opts);
1650
+ }
1651
+ /**
1652
+ * Drive a resumable search until done, yielding to the scheduler when the
1653
+ * frame budget is exceeded. Returns the materialised result array.
1654
+ *
1655
+ * Heuristic: each call to `searchSlice` processes a chunk batch, then we
1656
+ * check elapsed time. The batch size doubles up to a cap to amortise the
1657
+ * JS<->WASM overhead on fast machines; on slow machines a single batch
1658
+ * may eat the entire budget, which is also fine.
1659
+ */
1660
+ async _runSearchBudgeted(wasmQuery, displayQuery, opts, budgetMs) {
1661
+ const w = this._wasm;
1662
+ const ql = this._writeStr(wasmQuery);
1663
+ w.setPattern(ql);
1664
+ // GPU pre-filter (CD1). If enabled AND the corpus is large enough,
1665
+ // the GPU computes the candidate bitset and we install it into WASM
1666
+ // before searchBegin so the slice loop only inspects candidates.
1667
+ // Failure here is silent: we fall back to CPU-only Bloom transparently.
1668
+ if (this._shouldEngageGpu()) {
1669
+ try {
1670
+ await this._gpuPreFilter(wasmQuery);
1671
+ }
1672
+ catch (e) {
1673
+ // Don't let a GPU hiccup kill the search — drop to CPU path.
1674
+ console.warn('[albex] GPU pre-filter failed; falling back to CPU:', e);
1675
+ w.clearCandidateMask();
1676
+ }
1677
+ }
1678
+ const t0 = performance.now();
1679
+ if (w.searchBegin() === 0) {
1680
+ this._lastSearch = {
1681
+ query: displayQuery, timeMs: 0, results: 0,
1682
+ bloomTested: 0, bloomPassed: 0, bitapMatched: 0,
1683
+ };
1684
+ return [];
1685
+ }
1686
+ // In background / low-power modes we halve the initial batch so the
1687
+ // engine yields more often to the scheduler, leaving more headroom for
1688
+ // whatever the host is doing.
1689
+ const conservative = this._resources?.mode === 'background'
1690
+ || this._resources?.mode === 'low-power';
1691
+ let batch = conservative ? 1024 : 2048;
1692
+ const sched = globalThis.scheduler;
1693
+ const yieldFn = sched && typeof sched.yield === 'function'
1694
+ ? () => sched.yield()
1695
+ : (typeof requestAnimationFrame === 'function'
1696
+ ? () => new Promise(resolve => requestAnimationFrame(() => resolve()))
1697
+ : () => new Promise(resolve => setTimeout(resolve, 0)));
1698
+ for (;;) {
1699
+ const sliceStart = performance.now();
1700
+ const done = w.searchSlice(batch);
1701
+ const sliceMs = performance.now() - sliceStart;
1702
+ if (done === 1)
1703
+ break;
1704
+ // Adapt batch size: if we have headroom in budget, grow; if we're
1705
+ // already over the per-slice target, shrink.
1706
+ if (sliceMs < budgetMs * 0.5 && batch < 32_768)
1707
+ batch *= 2;
1708
+ else if (sliceMs > budgetMs * 1.5 && batch > 512)
1709
+ batch = Math.max(512, Math.floor(batch / 2));
1710
+ await yieldFn();
1711
+ }
1712
+ const ms = performance.now() - t0;
1713
+ const count = w.getResultCount();
1714
+ this._lastSearch = {
1715
+ query: displayQuery,
1716
+ timeMs: ms,
1717
+ results: count,
1718
+ bloomTested: w.getStatBloomTested(),
1719
+ bloomPassed: w.getStatBloomPassed(),
1720
+ bitapMatched: w.getStatBitapMatched(),
1721
+ };
1722
+ return this._collectResults(count, opts);
1723
+ }
1724
+ /** Materialise results [0..count) into the public SearchResult shape. */
1725
+ _collectResults(count, opts) {
1726
+ const w = this._wasm;
1727
+ const windowed = opts.windowed === true;
1728
+ const before = opts.before ?? 60;
1729
+ const after = opts.after ?? 120;
1730
+ const results = [];
1731
+ for (let i = 0; i < count; i++) {
1732
+ const score = w.getResultScore(i);
1733
+ const location = w.getResultLocation(i);
1734
+ const matchStart = w.getResultStart(i);
1735
+ const matchEnd = w.getResultEnd(i);
1736
+ const nl = w.getResultDocName(i);
1737
+ const name = nl > 0 ? this._readPad(nl) : '?';
1738
+ const matchCount = w.getResultMatchCount(i);
1739
+ const matches = [];
1740
+ for (let k = 0; k < matchCount; k++) {
1741
+ matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
1742
+ }
1743
+ if (matches.length === 0)
1744
+ matches.push({ start: matchStart, end: matchEnd });
1745
+ let snippet;
1746
+ let primaryStart = matchStart;
1747
+ let primaryEnd = matchEnd;
1748
+ let adjustedMatches = matches;
1749
+ if (windowed) {
1750
+ const sl = w.getSnippetWindow(i, before, after);
1751
+ snippet = sl > 0 ? this._readPad(sl) : '';
1752
+ const offset = w.getSnippetWindowOffset();
1753
+ const leadingPrefix = offset > 0 ? 4 : 0;
1754
+ const shift = leadingPrefix - offset;
1755
+ adjustedMatches = matches.map(m => ({
1756
+ start: Math.max(0, m.start + shift),
1757
+ end: Math.max(0, m.end + shift),
1758
+ }));
1759
+ primaryStart = adjustedMatches[0]?.start ?? 0;
1760
+ primaryEnd = adjustedMatches[0]?.end ?? 0;
1761
+ }
1762
+ else {
1763
+ const sl = w.getSnippet(i);
1764
+ snippet = sl > 0 ? this._readPad(sl) : '';
1765
+ }
1766
+ results.push({
1767
+ documentName: name,
1768
+ location,
1769
+ score,
1770
+ snippet,
1771
+ matchStart: primaryStart,
1772
+ matchEnd: primaryEnd,
1773
+ matches: adjustedMatches,
1774
+ });
1775
+ }
1776
+ return results;
1777
+ }
1778
+ _searchOr(branches, rawQuery, opts) {
386
1779
  const seen = new Set();
387
1780
  const all = [];
388
1781
  for (const tokens of branches) {
389
1782
  const q = tokensToWasmQuery(tokens);
390
1783
  if (!q)
391
1784
  continue;
392
- const results = this._runSearch(q, rawQuery);
1785
+ const results = this._runSearch(q, rawQuery, opts);
393
1786
  for (const r of results) {
394
1787
  const key = `${r.documentName}:${r.location}:${r.matchStart}`;
395
1788
  if (!seen.has(key)) {
@@ -402,31 +1795,72 @@ export class AlbexEngine {
402
1795
  all.sort((a, b) => b.score - a.score);
403
1796
  return all;
404
1797
  }
405
- _runSearch(wasmQuery, displayQuery) {
1798
+ _runSearch(wasmQuery, displayQuery, opts) {
1799
+ const w = this._wasm;
406
1800
  const ql = this._writeStr(wasmQuery);
407
- this._wasm.setPattern(ql);
1801
+ w.setPattern(ql);
408
1802
  const t0 = performance.now();
409
- const count = this._wasm.search();
1803
+ const count = w.search();
410
1804
  const ms = performance.now() - t0;
411
1805
  this._lastSearch = {
412
1806
  query: displayQuery,
413
1807
  timeMs: ms,
414
1808
  results: count,
415
- bloomTested: this._wasm.getStatBloomTested(),
416
- bloomPassed: this._wasm.getStatBloomPassed(),
417
- bitapMatched: this._wasm.getStatBitapMatched(),
1809
+ bloomTested: w.getStatBloomTested(),
1810
+ bloomPassed: w.getStatBloomPassed(),
1811
+ bitapMatched: w.getStatBitapMatched(),
418
1812
  };
1813
+ const windowed = opts.windowed === true;
1814
+ const before = opts.before ?? 60;
1815
+ const after = opts.after ?? 120;
419
1816
  const results = [];
420
1817
  for (let i = 0; i < count; i++) {
421
- const score = this._wasm.getResultScore(i);
422
- const location = this._wasm.getResultLocation(i);
423
- const matchStart = this._wasm.getResultStart(i);
424
- const matchEnd = this._wasm.getResultEnd(i);
425
- const nl = this._wasm.getResultDocName(i);
1818
+ const score = w.getResultScore(i);
1819
+ const location = w.getResultLocation(i);
1820
+ const matchStart = w.getResultStart(i);
1821
+ const matchEnd = w.getResultEnd(i);
1822
+ const nl = w.getResultDocName(i);
426
1823
  const name = nl > 0 ? this._readPad(nl) : '?';
427
- const sl = this._wasm.getSnippet(i);
428
- const snippet = sl > 0 ? this._readPad(sl) : '';
429
- results.push({ documentName: name, location, score, snippet, matchStart, matchEnd });
1824
+ const matchCount = w.getResultMatchCount(i);
1825
+ const matches = [];
1826
+ for (let k = 0; k < matchCount; k++) {
1827
+ matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
1828
+ }
1829
+ if (matches.length === 0) {
1830
+ matches.push({ start: matchStart, end: matchEnd });
1831
+ }
1832
+ let snippet;
1833
+ let primaryStart = matchStart;
1834
+ let primaryEnd = matchEnd;
1835
+ let adjustedMatches = matches;
1836
+ if (windowed) {
1837
+ const sl = w.getSnippetWindow(i, before, after);
1838
+ snippet = sl > 0 ? this._readPad(sl) : '';
1839
+ const offset = w.getSnippetWindowOffset();
1840
+ // Spans came back chunk-relative; shift them into window-relative.
1841
+ // Account for leading "... " prefix when present.
1842
+ const leadingPrefix = offset > 0 ? 4 : 0;
1843
+ const shift = leadingPrefix - offset;
1844
+ adjustedMatches = matches.map(m => ({
1845
+ start: Math.max(0, m.start + shift),
1846
+ end: Math.max(0, m.end + shift),
1847
+ }));
1848
+ primaryStart = adjustedMatches[0]?.start ?? 0;
1849
+ primaryEnd = adjustedMatches[0]?.end ?? 0;
1850
+ }
1851
+ else {
1852
+ const sl = w.getSnippet(i);
1853
+ snippet = sl > 0 ? this._readPad(sl) : '';
1854
+ }
1855
+ results.push({
1856
+ documentName: name,
1857
+ location,
1858
+ score,
1859
+ snippet,
1860
+ matchStart: primaryStart,
1861
+ matchEnd: primaryEnd,
1862
+ matches: adjustedMatches,
1863
+ });
430
1864
  }
431
1865
  return results;
432
1866
  }
@@ -438,6 +1872,9 @@ export class AlbexEngine {
438
1872
  textUsed: this._wasm.getTextUsed(),
439
1873
  textCapacity: this._wasm.getTextCapacity(),
440
1874
  wasmMemoryBytes: this._mem.buffer.byteLength,
1875
+ tier: this._tier,
1876
+ maxChunks: this._wasm.getMaxChunks(),
1877
+ maxDocs: this._wasm.getMaxDocs(),
441
1878
  };
442
1879
  }
443
1880
  /** Returns stats from the most recent search, or null. */
@@ -462,18 +1899,178 @@ export class AlbexEngine {
462
1899
  setMaxResults(max) {
463
1900
  this._wasm.setMaxResults(Math.max(1, Math.min(200, max)));
464
1901
  }
1902
+ /**
1903
+ * Enable or disable query stemming.
1904
+ *
1905
+ * - `'off'` (default): tokens are used as-is. Strict matching.
1906
+ * - `'es'`: Spanish stemmer applied to query tokens before search. A query
1907
+ * for `"contratos"` matches `"contrato"` and vice versa.
1908
+ *
1909
+ * Indexed text is never stemmed, so snippets remain faithful to the
1910
+ * source. Recall improvement comes from queries reducing to shared prefixes.
1911
+ */
1912
+ setLanguage(lang) {
1913
+ this._wasm.setLanguage(lang === 'es' ? 1 : 0);
1914
+ }
465
1915
  /** Full reset — clears all indexed documents and chunks. */
466
1916
  reset() {
467
1917
  this._wasm.init();
468
1918
  this._docs = [];
469
1919
  this._lastSearch = null;
470
1920
  }
1921
+ // ── Persistence ───────────────────────────────────────────────────────────
1922
+ /**
1923
+ * Persist the current index to OPFS (or IndexedDB as fallback) under `name`.
1924
+ *
1925
+ * The snapshot includes every chunk, document name and text byte currently
1926
+ * indexed. Subsequent `load(name)` calls restore the engine to this exact
1927
+ * state in roughly O(total bytes), bypassing re-parsing.
1928
+ */
1929
+ async save(name) {
1930
+ const w = this._wasm;
1931
+ const total = w.snapshotSize();
1932
+ if (total === 0) {
1933
+ await savePersisted(name, new Uint8Array(0));
1934
+ return;
1935
+ }
1936
+ const out = new Uint8Array(total);
1937
+ let off = 0;
1938
+ while (off < total) {
1939
+ const n = w.snapshotChunk(off, FEED_SIZE);
1940
+ if (n === 0)
1941
+ break;
1942
+ const ptr = w.getBuffer(0);
1943
+ out.set(this._u8(ptr, n), off);
1944
+ off += n;
1945
+ }
1946
+ await savePersisted(name, out);
1947
+ // Reconstruct _docs from the doc table so getStats().documents stays
1948
+ // honest after save (no change here — but symmetric with load()).
1949
+ }
1950
+ /**
1951
+ * Restore an index previously saved with `save(name)`. Returns `true` on
1952
+ * success, `false` if the snapshot is missing or has an incompatible
1953
+ * header (wrong magic, version, or struct sizes).
1954
+ */
1955
+ async load(name) {
1956
+ const bytes = await loadPersisted(name);
1957
+ if (!bytes || bytes.length === 0)
1958
+ return false;
1959
+ const w = this._wasm;
1960
+ // Write the 64-byte header into the scratchpad and validate.
1961
+ if (bytes.length < 64)
1962
+ return false;
1963
+ const ptr = w.getBuffer(64);
1964
+ if (!ptr)
1965
+ return false;
1966
+ this._u8(ptr, 64).set(bytes.subarray(0, 64));
1967
+ if (w.restoreBegin() !== 1)
1968
+ return false;
1969
+ // Stream payload bytes.
1970
+ let off = 64;
1971
+ while (off < bytes.length) {
1972
+ const n = Math.min(FEED_SIZE, bytes.length - off);
1973
+ this._writePad(bytes.subarray(off, off + n));
1974
+ if (w.restoreFeed(n) !== 1)
1975
+ return false;
1976
+ off += n;
1977
+ }
1978
+ // Rebuild _docs metadata from the restored WASM tables.
1979
+ //
1980
+ // What's available after a restore:
1981
+ // * `name` — recovered from getDocName(i).
1982
+ // * `ext` — derived from the name.
1983
+ // * `chunks` — getDocChunkCount(i).
1984
+ // * `docId` — getDocId(i).
1985
+ // * `contentHash` — getDocContentHashPtr(i) when the binary supports
1986
+ // snapshot v2 (the export exists) AND the snapshot
1987
+ // itself was v2 (the bytes aren't all zero). v1
1988
+ // snapshots restore with all-zero hashes → '' here,
1989
+ // same as before.
1990
+ //
1991
+ // What's not persisted and therefore zeroed:
1992
+ // * `indexTimeMs` — no indexing happened in this session.
1993
+ // * `textBytes` — engine-wide totals are still available via
1994
+ // getStats().textUsed; per-doc breakdown is not
1995
+ // stored.
1996
+ const docCount = w.getDocCount();
1997
+ const hasHashExport = typeof w.getDocContentHashPtr === 'function'
1998
+ && typeof w.getDocContentHashLen === 'function';
1999
+ this._docs = [];
2000
+ for (let i = 0; i < docCount; i++) {
2001
+ if (w.isDocDeleted(i))
2002
+ continue;
2003
+ const nameLen = w.getDocName(i);
2004
+ const name = nameLen > 0 ? this._readPad(nameLen) : `restored-${i}`;
2005
+ const dotIdx = name.lastIndexOf('.');
2006
+ const ext = dotIdx > 0 ? name.slice(dotIdx + 1).toLowerCase() : '';
2007
+ let contentHash = '';
2008
+ if (hasHashExport) {
2009
+ const hashLen = w.getDocContentHashLen(); // always 8 today
2010
+ const hashPtr = w.getDocContentHashPtr(i);
2011
+ if (hashPtr !== 0 && hashLen === 8) {
2012
+ const view = this._u8(hashPtr, 8);
2013
+ // Copy into a private buffer so subsequent WASM calls cannot
2014
+ // mutate it under us.
2015
+ const buf = new Uint8Array(8);
2016
+ buf.set(view);
2017
+ contentHash = hashBytesToHex(buf);
2018
+ }
2019
+ }
2020
+ this._docs.push({
2021
+ name,
2022
+ ext,
2023
+ chunks: w.getDocChunkCount(i),
2024
+ indexTimeMs: 0,
2025
+ textBytes: 0,
2026
+ docId: w.getDocId(i),
2027
+ contentHash,
2028
+ });
2029
+ }
2030
+ this._lastSearch = null;
2031
+ return true;
2032
+ }
2033
+ /**
2034
+ * Convenience: load if the snapshot exists, otherwise leave the engine
2035
+ * empty. Returns whether a load actually happened.
2036
+ */
2037
+ async loadOrInit(name) {
2038
+ const loaded = await this.load(name);
2039
+ if (!loaded)
2040
+ this.reset();
2041
+ return loaded;
2042
+ }
2043
+ /** Delete a previously persisted snapshot. */
2044
+ async deleteSnapshot(name) {
2045
+ await deletePersisted(name);
2046
+ }
2047
+ /** List names of persisted snapshots in the current origin. */
2048
+ async listSnapshots() {
2049
+ return listPersisted();
2050
+ }
2051
+ /**
2052
+ * TC39 explicit-resource-management hook (Stage 3 in 2026). Lets the engine
2053
+ * be used with `using` so the references are released deterministically:
2054
+ *
2055
+ * using engine = new AlbexEngine(opts); await engine.init();
2056
+ *
2057
+ * WebAssembly does not actually expose a way to release linear memory pages
2058
+ * inside a Module instance, so we drop our references to the exports and
2059
+ * the doc list. GC can then reclaim the engine, which in turn releases the
2060
+ * WASM instance and its (typically 20 MB) backing memory.
2061
+ */
2062
+ [Symbol.dispose]() {
2063
+ this.reset();
2064
+ this._unsubscribeResources?.();
2065
+ this._unsubscribeResources = null;
2066
+ this._gpu?.destroy();
2067
+ this._gpu = null;
2068
+ // Null out the references so the engine cannot be reused after disposal
2069
+ // and the WASM instance becomes unreachable.
2070
+ this._wasm = null;
2071
+ this._mem = null;
2072
+ this._pdfWasm = null;
2073
+ this._pdfMem = null;
2074
+ }
471
2075
  }
472
- AlbexEngine._INDEXERS = {
473
- docx: (e, f) => e._indexDocx(f),
474
- xlsx: (e, f) => e._indexXlsx(f),
475
- pdf: (e, f) => e._indexPdf(f),
476
- txt: (e, f) => e._indexTxt(f),
477
- xml: (e, f) => e._indexXml(f),
478
- };
479
2076
  //# sourceMappingURL=albex.js.map