npm - albex - Versions diffs - 0.1.0 → 0.3.0 - Mend

albex 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

package/CHANGELOG.md +141 -0
package/README.md +242 -112
package/dist/albex-worker.d.ts +70 -0
package/dist/albex-worker.d.ts.map +1 -0
package/dist/albex-worker.js +153 -0
package/dist/albex-worker.js.map +1 -0
package/dist/albex.d.ts +368 -6
package/dist/albex.d.ts.map +1 -1
package/dist/albex.js +1692 -95
package/dist/albex.js.map +1 -1
package/dist/errors.d.ts +38 -0
package/dist/errors.d.ts.map +1 -0
package/dist/errors.js +63 -0
package/dist/errors.js.map +1 -0
package/dist/gpu/bloom-runtime.d.ts +60 -0
package/dist/gpu/bloom-runtime.d.ts.map +1 -0
package/dist/gpu/bloom-runtime.js +176 -0
package/dist/gpu/bloom-runtime.js.map +1 -0
package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
package/dist/gpu/bloom-shader.wgsl.js +49 -0
package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
package/dist/persistence.d.ts +21 -0
package/dist/persistence.d.ts.map +1 -0
package/dist/persistence.js +174 -0
package/dist/persistence.js.map +1 -0
package/dist/pool/coordinator.d.ts +98 -0
package/dist/pool/coordinator.d.ts.map +1 -0
package/dist/pool/coordinator.js +247 -0
package/dist/pool/coordinator.js.map +1 -0
package/dist/profile.d.ts +95 -0
package/dist/profile.d.ts.map +1 -0
package/dist/profile.js +207 -0
package/dist/profile.js.map +1 -0
package/dist/resource-manager.d.ts +56 -0
package/dist/resource-manager.d.ts.map +1 -0
package/dist/resource-manager.js +138 -0
package/dist/resource-manager.js.map +1 -0
package/dist/tiered-store.d.ts +98 -0
package/dist/tiered-store.d.ts.map +1 -0
package/dist/tiered-store.js +238 -0
package/dist/tiered-store.js.map +1 -0
package/dist/wasm-bindings.d.ts +139 -0
package/dist/wasm-bindings.d.ts.map +1 -0
package/dist/wasm-bindings.js +33 -0
package/dist/wasm-bindings.js.map +1 -0
package/dist/worker-protocol.d.ts +86 -0
package/dist/worker-protocol.d.ts.map +1 -0
package/dist/worker-protocol.js +20 -0
package/dist/worker-protocol.js.map +1 -0
package/dist/worker-runtime.d.ts +14 -0
package/dist/worker-runtime.d.ts.map +1 -0
package/dist/worker-runtime.js +100 -0
package/dist/worker-runtime.js.map +1 -0
package/package.json +56 -13
package/src/albex-worker.ts +187 -0
package/src/albex.ts +1845 -130
package/src/errors.ts +60 -0
package/src/gpu/bloom-runtime.ts +229 -0
package/src/gpu/bloom-shader.wgsl.ts +48 -0
package/src/persistence.ts +175 -0
package/src/pool/coordinator.ts +324 -0
package/src/profile.ts +279 -0
package/src/resource-manager.ts +167 -0
package/src/tiered-store.ts +259 -0
package/src/wasm-bindings.ts +200 -0
package/src/worker-protocol.ts +48 -0
package/src/worker-runtime.ts +96 -0
package/wasm/pkg/albex_pdf.wasm +0 -0
package/wasm/pkg/albex_wasm_bg.wasm +0 -0
package/wasm/pkg/albex_wasm_mini.wasm +0 -0
package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
package/wasm/pkg/albex_wasm_pro.wasm +0 -0
package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
package/wasm/pkg/albex_wasm_std.wasm +0 -0
package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0

package/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,141 @@
+# Changelog
+All notable changes to Albex are documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and Albex follows [Semantic Versioning](https://semver.org/).
+## [0.3.0] — 2026-05-30
+### Hybrid PDF OCR (opt-in)
+- New `@albex/ocr` option `alwaysExtractEmbeddedImages: boolean` (default
+  `false`). When enabled, the engine OCRs the embedded images of EVERY
+  PDF on top of the normal text extraction — catching text that lives
+  only inside scanned annexes, stamps, signatures, or screenshots inside
+  otherwise-native PDFs.
+- Demo exposes the flag as a checkbox in the OCR panel; status shows
+  `ready (spa, hybrid)` when active.
+### PDF parse-crash → OCR fallback
+- When `extractPdf` traps (pdf-extract crashes on a PDF that other tools
+  read fine), the engine now re-instantiates the WASM and tries the
+  lopdf-only image-extraction path before throwing. With OCR wired, many
+  formerly "unsupported encoding" PDFs become searchable.
+- Error message updated: instead of the misleading "the file may be
+  malformed", users now see clear guidance pointing at OCR as the recovery
+  path.
+### Demo sandbox
+- Importmap to jsDelivr for `tesseract.js` (only loaded when the user
+  enables OCR).
+- Full OCR panel: language select, hybrid-mode checkbox, lifecycle
+  status.
+- Two fixture PDFs in `demo/fixtures/` for end-to-end testing:
+  `hybrid-test.pdf` (vector text + embedded image with text) and
+  `scanned-only-test.pdf` (100% image, no vector text).
+- Global `window.onerror` + `unhandledrejection` handlers so Tesseract
+  worker aborts surface as Log entries instead of crashing the page.
+- New `npm run serve` script wraps `npx serve -p 5173` for reproducible
+  local testing.
+### Breaking changes
+- **`searchStream` renamed to `searchCooperative`.** The original name
+  implied incremental streaming, which the method never provided — it
+  yields to the scheduler between slices and then returns a batch.
+  The new name is honest. `searchStream` is kept as a deprecated alias
+  on `AlbexEngine`, `AlbexEngineWorker` and `AlbexPool`; it logs a
+  one-time `console.warn` on first call and will be removed in 0.4.0.
+- **Snapshot format bumped to v2.** Existing v1 snapshots still load —
+  their documents come back with empty `contentHash` strings, same as
+  before. On the next `save()` they are rewritten as v2. No data loss;
+  no migration step required.
+### Added
+- **Scanned-PDF OCR fallback.** When `extractPdf` returns `-2` (image-
+  only PDF) AND `@albex/ocr` has been wired via `enableOcr(engine)`,
+  the engine now extracts embedded JPEG / JPEG2000 image XObjects from
+  the PDF and runs them through Tesseract.js to recover text. Covers the
+  great majority of real-world scanned PDFs. Other compression filters
+  (FlateDecode, CCITTFaxDecode, JBIG2Decode) are not yet supported; pages
+  using them register with zero chunks (same behaviour as before).
+- **`getPageCount`, `extractPageImages`, `getPageImage{Len,Ptr,Kind}`**
+  added to `albex_pdf.wasm` to support the scanned-PDF path. The PDF
+  binary grew from ~1.04 MB to ~1.19 MB.
+- **Snapshot v2 persists per-document content hashes.** `load()` now
+  repopulates the in-memory `_docs` list correctly: `getStats().documents`
+  is right after a restore, and content-hash de-duplication survives the
+  round-trip (re-indexing the same file does not create a fresh slot).
+- **New WASM exports**: `setDocumentContentHash`, `getDocContentHashPtr`,
+  `getDocContentHashLen`. Used by the host to round-trip the FNV-1a 64-bit
+  hash through the snapshot format.
+- **OCR sandbox in the demo.** `demo/index.html` now ships an "Enable
+  OCR" panel that lazy-loads Tesseract.js through an importmap and
+  exposes per-document OCR status. Drop a scanned PDF and the demo OCRs
+  it automatically.
+### Fixed
+- **`load()` repopulates `_docs` from the WASM tables.** Previously it
+  left `_docs = []` after a successful restore, which made
+  `engine.getStats().documents` return `0` even though searches against
+  the restored corpus worked. The README advertised "snapshot the index
+  and restore it" without that caveat.
+- **CSV parser strips the UTF-8 BOM.** Files exported as "CSV UTF-8"
+  by Excel kept the BOM glued to the first field of the first row,
+  breaking column alignment and search hits on the first header
+  ("Subject", "Asunto", etc.).
+- **EML parser decodes `base64` and `quoted-printable` bodies.** Real
+  emails almost always use one of these transfer encodings; before the
+  fix the body surfaced as opaque encoded blobs that searches could
+  never hit. Nested multipart (`multipart/alternative` inside
+  `multipart/mixed`) is now also unwrapped recursively.
+- **RTF parser decodes `\'XX` hex bytes (via Windows-1252) and `\uN ?`
+  Unicode escapes.** Spanish/French/German content stored as cp1252
+  used to lose every accent; Word's modern `\u` escapes used to eat
+  the fallback ASCII character. Also added `\emdash`, `\endash`,
+  `\bullet`, `\lquote`, `\rquote`, `\ldblquote`, `\rdblquote`, `\tab`,
+  and soft-hyphen/non-breaking-space handling.
+### Documentation
+- **README claims grounded.** Removed "every modern bundler",
+  "60 fps even on huge corpora", "5–10× speedup", "works for 99 % of
+  users", "11 formats" (without the `lite` qualifier). The matrix of
+  what is tested vs what is expected to work is now explicit. Bench
+  results are flagged as synthetic.
+- **Persistence caveats documented.** The `Persistence` feature bullet
+  now describes the v2 / v1 difference and what survives the round trip.
+### Tests
+- 71 → 83 vitest tests, all green. New suites:
+  - `tests/scanned-pdf.test.ts` (4 tests) — scanned-PDF OCR fallback
+    with a hand-rolled `FakePdfWasm`.
+  - `tests/load-restores-docs.test.ts` (4 tests) — verifies `load()`
+    repopulates `_docs` and that content-hash dedup survives v2.
+  - `tests/lite-parsers.test.ts` (11 tests) — adversarial fixtures for
+    CSV BOM, EML base64 / QP / nested multipart, RTF cp1252 / Unicode.
+## [0.2.0] — earlier
+Initial public release. See git history for details: the surface was
+the `AlbexEngine` class, the `albex_wasm_bg.wasm` and `albex_pdf.wasm`
+binaries, lite parsers for the 11 formats, OPFS/IndexedDB persistence,
+worker pool, tiered storage and optional WebGPU pre-filter.
+[0.3.0]: https://github.com/RafaCalRob/Albex/releases/tag/v0.3.0
+[0.2.0]: https://github.com/RafaCalRob/Albex/releases/tag/v0.2.0

package/README.md CHANGED Viewed

@@ -1,31 +1,27 @@
 # Albex
-Local full-text search for documents. Runs entirely in the browser — no server, no upload, no network request after the initial load.
+Local full-text search for documents. Runs entirely in the browser — no server,
+no upload, no network request after the initial load.
-Drop a DOCX, PDF, XLSX, TXT or XML file, start typing, get results in milliseconds.
+Drop a DOCX, PDF, XLSX, HTML, Markdown, JSON, CSV, EML, RTF, TXT, or XML file,
+start typing, get results in milliseconds.
 ---
-## Features
-- **Zero server** — all text stays on the user's machine.
-- **Fuzzy matching** — finds "contrato" even if you type "conttrato" (adaptive edit distance).
-- **Accent-insensitive** — "accion" matches "acción", "espana" matches "España".
-- **Multi-format** — DOCX, XLSX, PDF (text-based), TXT, XML.
-- **Phrase search** — `"contrato marco"` requires the words to appear together.
-- **OR search** — `contrato | acuerdo` unions two independent searches.
-- **No dependencies** — one TypeScript file, two WASM binaries, nothing else.
-- **Tiny footprint** — main WASM is ~14 KB on disk; PDF module (~1 MB) loads on demand.
----
-## Installation
+## Install
 ```bash
 npm install albex
 ```
-Or copy `dist/albex.js`, `wasm/pkg/albex_wasm_bg.wasm` (and optionally `albex_pdf.wasm`) to your project.
+The WASM binary ships inside the package. Bundlers that recognise the
+`new URL('…', import.meta.url)` pattern (Vite, Webpack 5+, esbuild, Rollup,
+Parcel 2) copy it to the output and rewrite the URL automatically.
+Matrix-tested in CI today: **Vite** and **Node** (via the test suite).
+Other bundlers and runtimes (Next SSR, Bun, Deno) should work through the
+same pattern but are not currently exercised by the test matrix — if you
+hit a problem, open an issue.
 ---
@@ -34,184 +30,318 @@ Or copy `dist/albex.js`, `wasm/pkg/albex_wasm_bg.wasm` (and optionally `albex_pd
 ```ts
 import { AlbexEngine } from 'albex';
-const engine = new AlbexEngine({
-  wasmUrl:    '/assets/albex_wasm_bg.wasm',
-  pdfWasmUrl: '/assets/albex_pdf.wasm',   // only needed for PDFs
-});
+const engine = new AlbexEngine();
 await engine.init();
-// Index a file from a <input type="file"> or drag-and-drop
+// Index a file from <input type="file"> or drag-and-drop.
 const file = inputElement.files[0];
 const doc  = await engine.indexFile(file);
 console.log(`Indexed ${doc.chunks} chunks in ${doc.indexTimeMs.toFixed(0)} ms`);
-// Search
+// Search.
 const results = engine.search('contrato marco');
 for (const r of results) {
   console.log(`[${r.score}] ${r.documentName} — ${r.snippet}`);
 }
 ```
+Cooperative search — yields to the scheduler between slices so the UI thread
+still gets a chance to paint while a long search runs:
+```ts
+for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
+  renderResult(r);
+}
+```
+`searchCooperative` yields results of the same shape as the items `search`
+returns. The "stream" is not incremental yet — results arrive in one batch
+after the search completes, but the work is split into frame-budget slices
+that yield to the scheduler. Real incremental streaming is on the backlog.
+That's the entire onboarding. Read on for what else the engine can do.
+---
+## Features
+- **Zero server** — all text stays on the user's machine.
+- **Bundler-friendly default** — `new AlbexEngine()` works without extra
+  configuration in bundlers that recognise the `new URL(..., import.meta.url)`
+  asset pattern (see the "Install" section for the tested matrix).
+- **Fuzzy matching** — finds `"contrato"` even if you type `"conttrato"` (Bitap with adaptive edit distance).
+- **Accent-insensitive** — `"accion"` matches `"acción"`, `"espana"` matches `"España"`, plus Latin Extended (Polish, Czech, Slovak, Turkish…).
+- **11 formats with varying depth** — DOCX · XLSX · PDF · HTML · MD · JSON · CSV · EML · RTF · TXT · XML. See the support table below; several formats are deliberately "lite" (CSV is RFC-4180-lite, EML is MIME-lite, RTF is regex-stripped, etc.).
+- **Phrase + OR queries** — `"contrato marco"` and `contrato | acuerdo` work out of the box.
+- **Cooperative search** — `searchCooperative(query, { frameBudgetMs })` yields to the scheduler between slices. Results land in one batch (real incremental streaming is on the backlog).
+- **Persistence** — snapshot the index to OPFS / IndexedDB and restore it. After `load()`, `engine.getStats().documents` is correct, `engine.search()` works against the restored corpus, and content-hash de-duplication survives the round-trip (snapshot v2). Older v1 snapshots still load — their docs come back with empty content hashes, so re-indexing the same files will create fresh slots until the next `save()` rewrites the snapshot as v2.
+- **Incremental updates** — `removeDocument`, `replaceDocument`, `compact`. Content-hash dedup is automatic.
+- **Resource aware** — pauses speculative work in background tabs, shrinks workers on low battery, defers PDF download on slow networks.
+- **Off-main-thread** — `AlbexEngineWorker` mirror or `AlbexPool` shard across N workers (map-reduce search).
+- **WebGPU pre-filter** — experimental, opt-in (`gpu: 'auto'`). Engages only when the corpus exceeds 20 k chunks; no reproducible speedup number yet — the bench in this repo runs on a 200-document synthetic corpus only.
+- **SIMD opportunistic** — picks a SIMD-accelerated variant when the host supports v128.
+- **Tiered storage** — `TieredStore` keeps recent docs hot, evicts cold ones to OPFS, promotes on demand.
+- **Typed errors** — `AlbexParseError`, `AlbexUnsupportedFormatError`, `AlbexCapacityError`, `AlbexInitError`. All extend `AlbexError`.
+- **Tiny core** — main WASM 24 KB (27 KB SIMD). PDF module (~1.2 MB) loads on demand. The OCR companion (`@albex/ocr`) is a separate package and pulls Tesseract.js (~3.5 MB) only when you call `enableOcr()`.
 ---
 ## Supported formats
-| Extension | How text is extracted |
-|-----------|----------------------|
-| `.docx`   | Native Rust/WASM XML parser — reads `word/document.xml` directly |
-| `.xlsx`   | Native Rust/WASM XML parser — reads shared strings + inline strings |
-| `.pdf`    | Separate `albex_pdf.wasm` (pure Rust, loaded on demand) |
-| `.txt`    | Plain text split on double newlines |
-| `.xml`    | Tag-stripped, entity-decoded |
+| Extension          | How text is extracted |
+|--------------------|-----------------------|
+| `.docx`            | Native Rust/WASM XML parser — streams `word/document.xml` |
+| `.xlsx`            | Native Rust/WASM XML parser — shared strings + inline strings |
+| `.pdf`             | Separate `albex_pdf.wasm` (pure Rust, loaded on demand) |
+| `.md` / `.markdown`| TS parser — strips CommonMark marks |
+| `.html` / `.htm`   | TS parser — strips `<script>` / `<style>`, paragraphs at block boundaries |
+| `.json`            | TS parser — recursive walk over keys + string leaves |
+| `.csv`             | TS parser — RFC 4180 lite; one row per chunk |
+| `.eml`             | TS parser — MIME-lite: From/To/Subject + text/plain body |
+| `.rtf`             | TS parser — strips control words / groups |
+| `.txt`             | Plain text split on double newlines |
+| `.xml`             | Tag-stripped, entity-decoded |
 ---
 ## Query syntax
-| Input | Behaviour |
-|-------|-----------|
-| `contrato` | Fuzzy match, accent-insensitive |
-| `contrato marco` | Both words must appear in the same chunk |
-| `"contrato marco"` | Both words AND they must be adjacent (phrase) |
-| `contrato \| acuerdo` | OR: returns results matching either term |
+| Input                | Behaviour |
+|----------------------|-----------|
+| `contrato`           | Fuzzy match, accent-insensitive |
+| `contrato marco`     | Both words must appear in the same chunk |
+| `"contrato marco"`   | Both words AND they must be adjacent (phrase) |
+| `contrato \| acuerdo` | OR: union of results matching either branch |
 Up to 4 space-separated tokens per simple/phrase query. OR branches are unlimited.
 ---
-## API reference
-### `new AlbexEngine(opts)`
+## API at a glance
 ```ts
-interface AlbexOptions {
-  wasmUrl:     string;   // required
-  pdfWasmUrl?: string;   // required only for PDF indexing
+// Construct
+const engine = new AlbexEngine();
+await engine.init();
+// Indexing
+const doc = await engine.indexFile(file);
+// Search (synchronous fast path)
+const results = engine.search('contrato', { windowed: true });
+// Cooperative search (yields to the scheduler between slices)
+for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
+  /* … */
 }
+// Incremental updates
+engine.removeDocument('contract.pdf');
+await engine.replaceDocument('contract.pdf', newFile);
+engine.compact();
+// Persistence (OPFS or IndexedDB)
+await engine.save('my-corpus');
+await engine.loadOrInit('my-corpus');
+// Tuning
+engine.setMaxErrors(2);
+engine.setThreshold(400);
+engine.setMaxResults(50);
+engine.setLanguage('es');
+// Introspection
+const stats   = engine.getStats();
+const lastRun = engine.getLastSearchStats();
 ```
-### `engine.init(): Promise<void>`
+Full API reference and types: [bdovenbird.com/albex/docs](https://bdovenbird.com/albex/docs).
-Fetches and initialises the main WASM module. Must be called before anything else.
+---
-### `engine.indexFile(file: File): Promise<IndexedDocument>`
+## Off the main thread
-Detects the file format by extension, extracts text, and adds it to the search index. Throws for unsupported extensions or parse errors.
+For interactive search UIs, run the engine inside a Web Worker:
 ```ts
-interface IndexedDocument {
-  name:        string;
-  ext:         string;
-  chunks:      number;   // number of indexed text chunks
-  indexTimeMs: number;
-  textBytes:   number;   // raw UTF-8 text indexed
-}
+import { AlbexEngineWorker } from 'albex/worker';
+const engine = new AlbexEngineWorker({
+  workerUrl: new URL('albex/worker-runtime', import.meta.url),
+});
+await engine.init();
 ```
-### `engine.search(query: string): SearchResult[]`
+Same surface as `AlbexEngine`; everything returns a `Promise`.
-Returns results sorted by score (0–1000, descending).
+---
-```ts
-interface SearchResult {
-  documentName: string;
-  location:     number;   // paragraph (DOCX/TXT) or page (PDF, 1-based)
-  score:        number;   // 0–1000
-  snippet:      string;   // full chunk text (original, with accents)
-  matchStart:   number;   // byte offset of match in snippet
-  matchEnd:     number;   // exclusive
-}
-```
+## Sharding across cores
-### `engine.getStats(): EngineStats`
+For large corpora, an `AlbexPool` shards documents across N workers:
 ```ts
-interface EngineStats {
-  documents:       number;
-  chunks:          number;
-  textUsed:        number;   // bytes
-  textCapacity:    number;   // 16 MB hard cap
-  wasmMemoryBytes: number;
-}
+import { AlbexPool } from 'albex/pool';
+const pool = new AlbexPool({
+  workerUrl: new URL('albex/worker-runtime', import.meta.url),
+  workers:   'auto',   // = cores / 2, clamped [1, 8]
+});
+await pool.init();
+await pool.indexFile(fileA);    // sharded round-robin
+const results = await pool.search('contrato');  // map-reduce
 ```
-### `engine.getLastSearchStats(): SearchStats | null`
+---
+## Big corpora — tiered storage
-Bloom/Bitap pipeline counters from the most recent search — useful for debugging and UI dashboards.
+For workloads that exceed the tier's RAM capacity:
 ```ts
-interface SearchStats {
-  query:        string;
-  timeMs:       number;
-  results:      number;
-  bloomTested:  number;   // chunks tested
-  bloomPassed:  number;   // passed bloom pre-filter
-  bitapMatched: number;   // confirmed by Bitap
-}
+import { AlbexEngine, TieredStore } from 'albex';
+const engine = new AlbexEngine();
+await engine.init();
+const store = new TieredStore(engine, { evictThreshold: 0.85 });
+await store.init();
+await store.indexFile(file);              // persists original blob in OPFS
+await store.promote('older-doc.pdf');     // brings warm doc back
 ```
-### Tuning
+Hot tier = engine. Warm tier = original files in OPFS. LRU eviction is automatic.
+---
+## Advanced configuration
+`new AlbexEngine()` covers the default case. The options below address
+specific deployment needs:
+### Tier auto-selection (`mini` / `std` / `pro` based on `deviceMemory`)
+Albex ships **six** WASM variants of the main engine (3 tiers × baseline/SIMD).
+By default it loads the std-baseline binary that comes with the npm package.
+If you want runtime tier auto-selection, serve the variants yourself and
+pass `wasmBaseUrl`:
 ```ts
-engine.setMaxErrors(n);     // 0–3  (default 2, auto-scaled by query length)
-engine.setThreshold(n);     // 0–1000 minimum score (default 250)
-engine.setMaxResults(n);    // 1–200 (default 50)
+const engine = new AlbexEngine({
+  wasmBaseUrl: '/assets',          // directory containing the 6 .wasm files
+  tier: 'auto',                    // picks mini/std/pro by deviceMemory
+  simd: 'auto',                    // picks baseline/simd by WASM probe
+  gpu:  'auto',                    // engages WebGPU when corpus > 20k chunks
+});
 ```
-### `engine.reset()`
+Tier capacities:
-Clears all indexed documents. The engine is ready to index new files immediately after.
+| Tier  | Max docs | Max chunks | Max text | Working set |
+|-------|---------:|-----------:|---------:|------------:|
+| mini  | 32       | 25 000     | 4 MB     | ~5 MB       |
+| std   | 128      | 100 000    | 16 MB    | ~20 MB      |
+| pro   | 1 024    | 800 000    | 128 MB   | ~160 MB     |
+### Custom CDN
+```ts
+const engine = new AlbexEngine({
+  wasmUrl: 'https://my-cdn.example.com/albex_wasm.wasm',
+});
+```
 ---
-## Capacity
+## Errors
+All errors thrown by Albex extend `AlbexError`:
-| Resource | Limit |
-|----------|-------|
-| Documents | 128 |
-| Chunks | 100 000 |
-| Total text | 16 MB |
-| Query length | 64 characters (longer queries are truncated) |
-| Results | 200 (configurable, default 50) |
+```ts
+import {
+  AlbexError, AlbexInitError, AlbexParseError,
+  AlbexUnsupportedFormatError, AlbexCapacityError,
+} from 'albex';
+try {
+  await engine.indexFile(file);
+} catch (e) {
+  if (e instanceof AlbexUnsupportedFormatError) {
+    console.warn(`Skipped .${e.ext} (unsupported)`);
+  } else if (e instanceof AlbexParseError) {
+    console.warn(`Parse failed for ${e.format}:`, e.message);
+  } else throw e;
+}
+```
-These are hard-coded BSS limits in the WASM module. Exceeding them is silent — the engine stops indexing additional content without error.
+Each error carries a `kind` field that survives `structuredClone` across worker boundaries.
 ---
 ## Browser requirements
-- WebAssembly (all modern browsers since 2017)
+- WebAssembly (every major browser since 2017)
 - `DecompressionStream` for DOCX/XLSX (Chrome 80+, Firefox 113+, Safari 16.4+)
-- `String.prototype.normalize` for phrase search (all modern browsers)
+- OPFS for fastest persistence (Chrome 102+, Safari 15.2+, Firefox 111+); IndexedDB fallback works everywhere
+- WebGPU is **optional**; without it the CPU path is the default
-PDF support additionally requires the `albex_pdf.wasm` module to be served with the correct MIME type (`application/wasm`).
+PDF support requires `albex_pdf.wasm` to be served with MIME type `application/wasm`.
 ---
 ## Building from source
 ```bash
-# Install Rust + wasm-pack
 rustup target add wasm32-unknown-unknown
-# Build main WASM
-cd wasm && cargo build --target wasm32-unknown-unknown --release
-cp ../target/wasm32-unknown-unknown/release/albex_wasm.wasm pkg/albex_wasm_bg.wasm
+npm install
+npm run build:all         # 6 main variants + PDF + TypeScript
+```
+Partial builds:
-# Build PDF WASM
-cd ../pdf-wasm && cargo build --target wasm32-unknown-unknown --release
-cp ../target/wasm32-unknown-unknown/release/albex_pdf.wasm ../wasm/pkg/albex_pdf.wasm
+```bash
+npm run build:wasm        # std baseline only
+npm run build:wasm:tiers  # all 6 variants
+npm run build:pdf-wasm    # PDF module
+npm run build             # TypeScript only
+```
+---
+## Tests
-# Build TypeScript
-cd .. && npm install && npm run build
+```bash
+# Rust unit tests
+cargo test --manifest-path core/Cargo.toml
+cargo test --manifest-path ingest/Cargo.toml
+# TypeScript + WASM integration tests
+npm test
+# Micro-benchmarks
+npm run bench
 ```
+**About the benchmark.** The included bench probes per-operation overhead
+on a 200-document synthetic corpus. It is **not** a corpus-level
+performance claim — there is no representative real-world dataset checked
+into the repo yet. Numbers from `npm run bench` should be read as
+"this implementation does not regress against itself", not as comparisons
+against other libraries.
+CI runs every check on every push to `main`.
 ---
 ## Privacy
-Albex does not transmit any document content. Text extraction, indexing, and search all happen inside the browser's WASM sandbox. The only network requests are the initial fetch of the `.wasm` binary files.
+Albex never transmits document content. Text extraction, indexing, search and
+persistence all happen inside the browser. The only network requests are the
+initial fetches for the `.wasm` binaries (and the lazy PDF module on first
+PDF). Persisted snapshots live in OPFS / IndexedDB, scoped to your origin.
 ---

package/dist/albex-worker.d.ts ADDED Viewed

@@ -0,0 +1,70 @@
+/**
+ * `AlbexEngineWorker` — a main-thread wrapper that runs the engine inside a
+ * Web Worker. Mirrors the surface of `AlbexEngine`, but calls cross the
+ * worker boundary, so methods such as `search` return a `Promise` to await.
+ *
+ * Usage:
+ *
+ *     const engine = new AlbexEngineWorker({
+ *       wasmUrl:    '/assets/albex_wasm_bg.wasm',
+ *       pdfWasmUrl: '/assets/albex_pdf.wasm',
+ *       // Provide the URL to the bundled worker runtime.
+ *       workerUrl:  new URL('./worker-runtime.js', import.meta.url),
+ *     });
+ *     await engine.init();
+ *
+ * Why: a `search()` over 100k chunks can take 10–50 ms. On main thread that
+ * is visible jank for every keystroke. Off-main-thread keeps the UI at 60 fps.
+ *
+ * The runtime is single-threaded WASM, so requests are serialised: only one
+ * call is in flight at a time. This matches the actual `static mut` model
+ * inside the .wasm and is fine for an interactive search UI (each keystroke
+ * replaces the previous query).
+ */
+import type { AlbexOptions, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
+export interface AlbexWorkerOptions extends AlbexOptions {
+    /** URL to the bundled worker runtime script (worker-runtime.js). */
+    workerUrl: string | URL;
+}
+export declare class AlbexEngineWorker {
+    private readonly _opts;
+    private _worker;
+    private _nextId;
+    private _pending;
+    private _docsCache;
+    constructor(opts: AlbexWorkerOptions);
+    init(): Promise<void>;
+    private _send;
+    indexFile(file: File): Promise<IndexedDocument>;
+    search(query: string, opts?: SearchOptions): Promise<SearchResult[]>;
+    /**
+     * Cooperative variant of `search`. Today the wire still sends a single
+     * batch — the result array is fetched in one round-trip from the worker
+     * and then exposed as an async iterator so callers can `break` early.
+     * A future iteration may use a `MessagePort` to stream individual results
+     * from the worker side; the iterator shape is preserved across that
+     * transition.
+     */
+    searchCooperative(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
+    /**
+     * @deprecated Renamed to `searchCooperative` in 0.3.0. The alias will be removed in 0.4.0.
+     */
+    searchStream(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
+    removeDocument(id: string): Promise<boolean>;
+    compact(): Promise<void>;
+    reset(): Promise<void>;
+    getStats(): Promise<EngineStats>;
+    getLastSearchStats(): Promise<SearchStats | null>;
+    getDocuments(): Promise<readonly IndexedDocument[]>;
+    setMaxErrors(n: 0 | 1 | 2 | 3): Promise<void>;
+    setThreshold(n: number): Promise<void>;
+    setMaxResults(n: number): Promise<void>;
+    setLanguage(lang: 'off' | 'es'): Promise<void>;
+    save(name: string): Promise<void>;
+    load(name: string): Promise<boolean>;
+    loadOrInit(name: string): Promise<boolean>;
+    deleteSnapshot(name: string): Promise<void>;
+    listSnapshots(): Promise<string[]>;
+    [Symbol.dispose](): void;
+}
+//# sourceMappingURL=albex-worker.d.ts.map