albex 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CHANGELOG.md +141 -0
  2. package/README.md +242 -112
  3. package/dist/albex-worker.d.ts +70 -0
  4. package/dist/albex-worker.d.ts.map +1 -0
  5. package/dist/albex-worker.js +153 -0
  6. package/dist/albex-worker.js.map +1 -0
  7. package/dist/albex.d.ts +368 -6
  8. package/dist/albex.d.ts.map +1 -1
  9. package/dist/albex.js +1692 -95
  10. package/dist/albex.js.map +1 -1
  11. package/dist/errors.d.ts +38 -0
  12. package/dist/errors.d.ts.map +1 -0
  13. package/dist/errors.js +63 -0
  14. package/dist/errors.js.map +1 -0
  15. package/dist/gpu/bloom-runtime.d.ts +60 -0
  16. package/dist/gpu/bloom-runtime.d.ts.map +1 -0
  17. package/dist/gpu/bloom-runtime.js +176 -0
  18. package/dist/gpu/bloom-runtime.js.map +1 -0
  19. package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
  20. package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
  21. package/dist/gpu/bloom-shader.wgsl.js +49 -0
  22. package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
  23. package/dist/persistence.d.ts +21 -0
  24. package/dist/persistence.d.ts.map +1 -0
  25. package/dist/persistence.js +174 -0
  26. package/dist/persistence.js.map +1 -0
  27. package/dist/pool/coordinator.d.ts +98 -0
  28. package/dist/pool/coordinator.d.ts.map +1 -0
  29. package/dist/pool/coordinator.js +247 -0
  30. package/dist/pool/coordinator.js.map +1 -0
  31. package/dist/profile.d.ts +95 -0
  32. package/dist/profile.d.ts.map +1 -0
  33. package/dist/profile.js +207 -0
  34. package/dist/profile.js.map +1 -0
  35. package/dist/resource-manager.d.ts +56 -0
  36. package/dist/resource-manager.d.ts.map +1 -0
  37. package/dist/resource-manager.js +138 -0
  38. package/dist/resource-manager.js.map +1 -0
  39. package/dist/tiered-store.d.ts +98 -0
  40. package/dist/tiered-store.d.ts.map +1 -0
  41. package/dist/tiered-store.js +238 -0
  42. package/dist/tiered-store.js.map +1 -0
  43. package/dist/wasm-bindings.d.ts +139 -0
  44. package/dist/wasm-bindings.d.ts.map +1 -0
  45. package/dist/wasm-bindings.js +33 -0
  46. package/dist/wasm-bindings.js.map +1 -0
  47. package/dist/worker-protocol.d.ts +86 -0
  48. package/dist/worker-protocol.d.ts.map +1 -0
  49. package/dist/worker-protocol.js +20 -0
  50. package/dist/worker-protocol.js.map +1 -0
  51. package/dist/worker-runtime.d.ts +14 -0
  52. package/dist/worker-runtime.d.ts.map +1 -0
  53. package/dist/worker-runtime.js +100 -0
  54. package/dist/worker-runtime.js.map +1 -0
  55. package/package.json +56 -13
  56. package/src/albex-worker.ts +187 -0
  57. package/src/albex.ts +1845 -130
  58. package/src/errors.ts +60 -0
  59. package/src/gpu/bloom-runtime.ts +229 -0
  60. package/src/gpu/bloom-shader.wgsl.ts +48 -0
  61. package/src/persistence.ts +175 -0
  62. package/src/pool/coordinator.ts +324 -0
  63. package/src/profile.ts +279 -0
  64. package/src/resource-manager.ts +167 -0
  65. package/src/tiered-store.ts +259 -0
  66. package/src/wasm-bindings.ts +200 -0
  67. package/src/worker-protocol.ts +48 -0
  68. package/src/worker-runtime.ts +96 -0
  69. package/wasm/pkg/albex_pdf.wasm +0 -0
  70. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  71. package/wasm/pkg/albex_wasm_mini.wasm +0 -0
  72. package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
  73. package/wasm/pkg/albex_wasm_pro.wasm +0 -0
  74. package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
  75. package/wasm/pkg/albex_wasm_std.wasm +0 -0
  76. package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/CHANGELOG.md ADDED
@@ -0,0 +1,141 @@
1
+ # Changelog
2
+
3
+ All notable changes to Albex are documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and Albex follows [Semantic Versioning](https://semver.org/).
7
+
8
+ ## [0.3.0] — 2026-05-30
9
+
10
+ ### Hybrid PDF OCR (opt-in)
11
+
12
+ - New `@albex/ocr` option `alwaysExtractEmbeddedImages: boolean` (default
13
+ `false`). When enabled, the engine OCRs the embedded images of EVERY
14
+ PDF on top of the normal text extraction — catching text that lives
15
+ only inside scanned annexes, stamps, signatures, or screenshots inside
16
+ otherwise-native PDFs.
17
+ - Demo exposes the flag as a checkbox in the OCR panel; status shows
18
+ `ready (spa, hybrid)` when active.
19
+
20
+ ### PDF parse-crash → OCR fallback
21
+
22
+ - When `extractPdf` traps (pdf-extract crashes on a PDF that other tools
23
+ read fine), the engine now re-instantiates the WASM and tries the
24
+ lopdf-only image-extraction path before throwing. With OCR wired, many
25
+ formerly "unsupported encoding" PDFs become searchable.
26
+ - Error message updated: instead of the misleading "the file may be
27
+ malformed", users see clear guidance pointing at OCR as the recovery
28
+ path.
29
+
30
+ ### Demo sandbox
31
+
32
+ - Importmap to jsDelivr for `tesseract.js` (only loaded when the user
33
+ enables OCR).
34
+ - Full OCR panel: language select, hybrid-mode checkbox, lifecycle
35
+ status.
36
+ - Two fixture PDFs in `demo/fixtures/` for end-to-end testing:
37
+ `hybrid-test.pdf` (vector text + embedded image with text) and
38
+ `scanned-only-test.pdf` (100% image, no vector text).
39
+ - Global `window.onerror` + `unhandledrejection` handlers so Tesseract
40
+ worker aborts surface as Log entries instead of crashing the page.
41
+ - New `npm run serve` script wraps `npx serve -p 5173` for reproducible
42
+ local testing.
43
+
44
+ ### Breaking changes
45
+
46
+ - **`searchStream` renamed to `searchCooperative`.** The original name
47
+ implied incremental streaming, which the method never provided — it
48
+ yields to the scheduler between slices and then returns a batch.
49
+ The new name is honest. `searchStream` is kept as a deprecated alias
50
+ on `AlbexEngine`, `AlbexEngineWorker` and `AlbexPool`; it logs a
51
+ one-time `console.warn` on first call and will be removed in 0.4.0.
52
+
53
+ - **Snapshot format bumped to v2.** Existing v1 snapshots still load —
54
+ their documents come back with empty `contentHash` strings, same as
55
+ before. On the next `save()` they are rewritten as v2. No data loss;
56
+ no migration step required.
57
+
58
+ ### Added
59
+
60
+ - **Scanned-PDF OCR fallback.** When `extractPdf` returns `-2` (image-
61
+ only PDF) AND `@albex/ocr` has been wired via `enableOcr(engine)`,
62
+ the engine now extracts embedded JPEG / JPEG2000 image XObjects from
63
+ the PDF and runs them through Tesseract.js to recover text. Covers the
64
+ great majority of real-world scanned PDFs. Other compression filters
65
+ (FlateDecode, CCITTFaxDecode, JBIG2Decode) are not yet supported; pages
66
+ using them register with zero chunks (same behaviour as before).
67
+
68
+ - **`getPageCount`, `extractPageImages`, `getPageImage{Len,Ptr,Kind}`**
69
+ added to `albex_pdf.wasm` to support the scanned-PDF path. The PDF
70
+ binary grew from ~1.04 MB to ~1.19 MB.
71
+
72
+ - **Snapshot v2 persists per-document content hashes.** `load()` now
73
+ repopulates the in-memory `_docs` list correctly: `getStats().documents`
74
+ is right after a restore, and content-hash de-duplication survives the
75
+ round-trip (re-indexing the same file does not create a fresh slot).
76
+
77
+ - **New WASM exports**: `setDocumentContentHash`, `getDocContentHashPtr`,
78
+ `getDocContentHashLen`. Used by the host to round-trip the FNV-1a 64-bit
79
+ hash through the snapshot format.
80
+
81
+ - **OCR sandbox in the demo.** `demo/index.html` now ships an "Enable
82
+ OCR" panel that lazy-loads Tesseract.js through an importmap and
83
+ exposes per-document OCR status. Drop a scanned PDF and the demo OCRs
84
+ it automatically.
85
+
86
+ ### Fixed
87
+
88
+ - **`load()` repopulates `_docs` from the WASM tables.** Previously it
89
+ left `_docs = []` after a successful restore, which made
90
+ `engine.getStats().documents` return `0` even though searches against
91
+ the restored corpus worked. The README advertised "snapshot the index
92
+ and restore it" without that caveat.
93
+
94
+ - **CSV parser strips the UTF-8 BOM.** Files exported as "CSV UTF-8"
95
+ by Excel kept the BOM glued to the first field of the first row,
96
+ breaking column alignment and search hits on the first header
97
+ ("Subject", "Asunto", etc.).
98
+
99
+ - **EML parser decodes `base64` and `quoted-printable` bodies.** Real
100
+ emails almost always use one of these transfer encodings; before the
101
+ fix the body surfaced as opaque encoded blobs that searches could
102
+ never hit. Nested multipart (`multipart/alternative` inside
103
+ `multipart/mixed`) is now also unwrapped recursively.
104
+
105
+ - **RTF parser decodes `\'XX` hex bytes (via Windows-1252) and `\uN ?`
106
+ Unicode escapes.** Spanish/French/German content stored as cp1252
107
+ used to lose every accent; Word's modern `\u` escapes used to eat
108
+ the fallback ASCII character. Also added `\emdash`, `\endash`,
109
+ `\bullet`, `\lquote`, `\rquote`, `\ldblquote`, `\rdblquote`, `\tab`,
110
+ and soft-hyphen/non-breaking-space handling.
111
+
112
+ ### Documentation
113
+
114
+ - **README claims grounded.** Removed "every modern bundler",
115
+ "60 fps even on huge corpora", "5–10× speedup", "works for 99 % of
116
+ users", "11 formats" (without the `lite` qualifier). The matrix of
117
+ what is tested vs what is expected to work is now explicit. Bench
118
+ results are flagged as synthetic.
119
+
120
+ - **Persistence caveats documented.** The `Persistence` feature bullet
121
+ now describes the v2 / v1 difference and what survives the round trip.
122
+
123
+ ### Tests
124
+
125
+ - 71 → 83 vitest tests, all green. New suites:
126
+ - `tests/scanned-pdf.test.ts` (4 tests) — scanned-PDF OCR fallback
127
+ with a hand-rolled `FakePdfWasm`.
128
+ - `tests/load-restores-docs.test.ts` (4 tests) — verifies `load()`
129
+ repopulates `_docs` and that content-hash dedup survives v2.
130
+ - `tests/lite-parsers.test.ts` (11 tests) — adversarial fixtures for
131
+ CSV BOM, EML base64 / QP / nested multipart, RTF cp1252 / Unicode.
132
+
133
+ ## [0.2.0] — earlier
134
+
135
+ Initial public release. See git history for details: the surface was
136
+ the `AlbexEngine` class, the `albex_wasm_bg.wasm` and `albex_pdf.wasm`
137
+ binaries, lite parsers for the 11 formats, OPFS/IndexedDB persistence,
138
+ worker pool, tiered storage and optional WebGPU pre-filter.
139
+
140
+ [0.3.0]: https://github.com/RafaCalRob/Albex/releases/tag/v0.3.0
141
+ [0.2.0]: https://github.com/RafaCalRob/Albex/releases/tag/v0.2.0
package/README.md CHANGED
@@ -1,31 +1,27 @@
1
1
  # Albex
2
2
 
3
- Local full-text search for documents. Runs entirely in the browser — no server, no upload, no network request after the initial load.
3
+ Local full-text search for documents. Runs entirely in the browser — no server,
4
+ no upload, no network request after the initial load.
4
5
 
5
- Drop a DOCX, PDF, XLSX, TXT or XML file, start typing, get results in milliseconds.
6
+ Drop a DOCX, PDF, XLSX, HTML, Markdown, JSON, CSV, EML, RTF, TXT, or XML file,
7
+ start typing, get results in milliseconds.
6
8
 
7
9
  ---
8
10
 
9
- ## Features
10
-
11
- - **Zero server** — all text stays on the user's machine.
12
- - **Fuzzy matching** — finds "contrato" even if you type "conttrato" (adaptive edit distance).
13
- - **Accent-insensitive** — "accion" matches "acción", "espana" matches "España".
14
- - **Multi-format** — DOCX, XLSX, PDF (text-based), TXT, XML.
15
- - **Phrase search** — `"contrato marco"` requires the words to appear together.
16
- - **OR search** — `contrato | acuerdo` unions two independent searches.
17
- - **No dependencies** — one TypeScript file, two WASM binaries, nothing else.
18
- - **Tiny footprint** — main WASM is ~14 KB on disk; PDF module (~1 MB) loads on demand.
19
-
20
- ---
21
-
22
- ## Installation
11
+ ## Install
23
12
 
24
13
  ```bash
25
14
  npm install albex
26
15
  ```
27
16
 
28
- Or copy `dist/albex.js`, `wasm/pkg/albex_wasm_bg.wasm` (and optionally `albex_pdf.wasm`) to your project.
17
+ The WASM binary ships inside the package. Bundlers that recognise the
18
+ `new URL('…', import.meta.url)` pattern (Vite, Webpack 5+, esbuild, Rollup,
19
+ Parcel 2) copy it to the output and rewrite the URL automatically.
20
+
21
+ Matrix-tested in CI today: **Vite** and **Node** (via the test suite).
22
+ Other bundlers and runtimes (Next SSR, Bun, Deno) should work through the
23
+ same pattern but are not currently exercised by the test matrix — if you
24
+ hit a problem, open an issue.
29
25
 
30
26
  ---
31
27
 
@@ -34,184 +30,318 @@ Or copy `dist/albex.js`, `wasm/pkg/albex_wasm_bg.wasm` (and optionally `albex_pd
34
30
  ```ts
35
31
  import { AlbexEngine } from 'albex';
36
32
 
37
- const engine = new AlbexEngine({
38
- wasmUrl: '/assets/albex_wasm_bg.wasm',
39
- pdfWasmUrl: '/assets/albex_pdf.wasm', // only needed for PDFs
40
- });
41
-
33
+ const engine = new AlbexEngine();
42
34
  await engine.init();
43
35
 
44
- // Index a file from a <input type="file"> or drag-and-drop
36
+ // Index a file from <input type="file"> or drag-and-drop.
45
37
  const file = inputElement.files[0];
46
38
  const doc = await engine.indexFile(file);
47
39
  console.log(`Indexed ${doc.chunks} chunks in ${doc.indexTimeMs.toFixed(0)} ms`);
48
40
 
49
- // Search
41
+ // Search.
50
42
  const results = engine.search('contrato marco');
51
43
  for (const r of results) {
52
44
  console.log(`[${r.score}] ${r.documentName} — ${r.snippet}`);
53
45
  }
54
46
  ```
55
47
 
48
+ Cooperative search — yields to the scheduler between slices so the UI thread
49
+ keeps a chance to paint while a long search runs:
50
+
51
+ ```ts
52
+ for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
53
+ renderResult(r);
54
+ }
55
+ ```
56
+
57
+ `searchCooperative` yields the same `SearchResult` objects that `search` returns. The "stream" is not
58
+ incremental yet — results arrive in one batch after the search completes,
59
+ but the work is split into frame-budget slices that yield to the scheduler.
60
+ Real incremental streaming is on the backlog.
61
+
62
+ That's the entire onboarding. Read on for what else the engine can do.
63
+
64
+ ---
65
+
66
+ ## Features
67
+
68
+ - **Zero server** — all text stays on the user's machine.
69
+ - **Bundler-friendly default** — `new AlbexEngine()` works without extra
70
+ configuration in bundlers that recognise the `new URL(..., import.meta.url)`
71
+ asset pattern (see the "Install" section for the tested matrix).
72
+ - **Fuzzy matching** — finds `"contrato"` even if you type `"conttrato"` (Bitap with adaptive edit distance).
73
+ - **Accent-insensitive** — `"accion"` matches `"acción"`, `"espana"` matches `"España"`, plus Latin Extended (Polish, Czech, Slovak, Turkish…).
74
+ - **11 formats with varying depth** — DOCX · XLSX · PDF · HTML · MD · JSON · CSV · EML · RTF · TXT · XML. See the support table below; several formats are deliberately "lite" (CSV is RFC-4180-lite, EML is MIME-lite, RTF is regex-stripped, etc.).
75
+ - **Phrase + OR queries** — `"contrato marco"` and `contrato | acuerdo` work out of the box.
76
+ - **Cooperative search** — `searchCooperative(query, { frameBudgetMs })` yields to the scheduler between slices. Results land in one batch (real incremental streaming is on the backlog).
77
+ - **Persistence** — snapshot the index to OPFS / IndexedDB and restore it. After `load()`, `engine.getStats().documents` is correct, `engine.search()` works against the restored corpus, and content-hash de-duplication survives the round-trip (snapshot v2). Older v1 snapshots still load — their docs come back with empty content hashes, so re-indexing the same files will create fresh slots until the next `save()` rewrites the snapshot as v2.
78
+ - **Incremental updates** — `removeDocument`, `replaceDocument`, `compact`. Content-hash dedup is automatic.
79
+ - **Resource aware** — pauses speculative work in background tabs, shrinks workers on low battery, defers PDF download on slow networks.
80
+ - **Off-main-thread** — run the engine in a Web Worker through the `AlbexEngineWorker` mirror, or shard documents across N workers with `AlbexPool` (map-reduce search).
81
+ - **WebGPU pre-filter** — experimental, opt-in (`gpu: 'auto'`). Implemented for corpora over 20 k chunks; no reproducible speedup number yet — the bench in this repo runs on a 200-document synthetic corpus only.
82
+ - **SIMD opportunistic** — picks a SIMD-accelerated variant when the host supports v128.
83
+ - **Tiered storage** — `TieredStore` keeps recent docs hot, evicts cold ones to OPFS, promotes on demand.
84
+ - **Typed errors** — `AlbexParseError`, `AlbexUnsupportedFormatError`, `AlbexCapacityError`, `AlbexInitError`. All extend `AlbexError`.
85
+ - **Tiny core** — main WASM 24 KB (27 KB SIMD). PDF module (~1.2 MB) loads on demand. The OCR companion (`@albex/ocr`) is a separate package and pulls Tesseract.js (~3.5 MB) only when you call `enableOcr()`.
86
+
56
87
  ---
57
88
 
58
89
  ## Supported formats
59
90
 
60
- | Extension | How text is extracted |
61
- |-----------|----------------------|
62
- | `.docx` | Native Rust/WASM XML parser — reads `word/document.xml` directly |
63
- | `.xlsx` | Native Rust/WASM XML parser — reads shared strings + inline strings |
64
- | `.pdf` | Separate `albex_pdf.wasm` (pure Rust, loaded on demand) |
65
- | `.txt` | Plain text split on double newlines |
66
- | `.xml` | Tag-stripped, entity-decoded |
91
+ | Extension | How text is extracted |
92
+ |--------------------|-----------------------|
93
+ | `.docx` | Native Rust/WASM XML parser — streams `word/document.xml` |
94
+ | `.xlsx` | Native Rust/WASM XML parser — shared strings + inline strings |
95
+ | `.pdf` | Separate `albex_pdf.wasm` (pure Rust, loaded on demand) |
96
+ | `.md` / `.markdown` | TS parser — strips CommonMark markup |
97
+ | `.html` / `.htm` | TS parser — strips `<script>` / `<style>`, paragraphs at block boundaries |
98
+ | `.json` | TS parser — recursive walk over keys + string leaves |
99
+ | `.csv` | TS parser — RFC 4180 lite; one row per chunk |
100
+ | `.eml` | TS parser — MIME-lite: From/To/Subject + text/plain body |
101
+ | `.rtf` | TS parser — strips control words / groups |
102
+ | `.txt` | Plain text split on double newlines |
103
+ | `.xml` | Tag-stripped, entity-decoded |
67
104
 
68
105
  ---
69
106
 
70
107
  ## Query syntax
71
108
 
72
- | Input | Behaviour |
73
- |-------|-----------|
74
- | `contrato` | Fuzzy match, accent-insensitive |
75
- | `contrato marco` | Both words must appear in the same chunk |
76
- | `"contrato marco"` | Both words AND they must be adjacent (phrase) |
77
- | `contrato \| acuerdo` | OR: returns results matching either term |
109
+ | Input | Behaviour |
110
+ |----------------------|-----------|
111
+ | `contrato` | Fuzzy match, accent-insensitive |
112
+ | `contrato marco` | Both words must appear in the same chunk |
113
+ | `"contrato marco"` | Both words AND they must be adjacent (phrase) |
114
+ | `contrato \| acuerdo` | OR: union of results matching either branch |
78
115
 
79
116
  Up to 4 space-separated tokens per simple/phrase query. OR branches are unlimited.
80
117
 
81
118
  ---
82
119
 
83
- ## API reference
84
-
85
- ### `new AlbexEngine(opts)`
120
+ ## API at a glance
86
121
 
87
122
  ```ts
88
- interface AlbexOptions {
89
- wasmUrl: string; // required
90
- pdfWasmUrl?: string; // required only for PDF indexing
123
+ // Construct
124
+ const engine = new AlbexEngine();
125
+ await engine.init();
126
+
127
+ // Indexing
128
+ const doc = await engine.indexFile(file);
129
+
130
+ // Search (synchronous fast path)
131
+ const results = engine.search('contrato', { windowed: true });
132
+
133
+ // Cooperative search (yields to the scheduler between slices)
134
+ for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
135
+ /* … */
91
136
  }
137
+
138
+ // Incremental updates
139
+ engine.removeDocument('contract.pdf');
140
+ await engine.replaceDocument('contract.pdf', newFile);
141
+ engine.compact();
142
+
143
+ // Persistence (OPFS or IndexedDB)
144
+ await engine.save('my-corpus');
145
+ await engine.loadOrInit('my-corpus');
146
+
147
+ // Tuning
148
+ engine.setMaxErrors(2);
149
+ engine.setThreshold(400);
150
+ engine.setMaxResults(50);
151
+ engine.setLanguage('es');
152
+
153
+ // Introspection
154
+ const stats = engine.getStats();
155
+ const lastRun = engine.getLastSearchStats();
92
156
  ```
93
157
 
94
- ### `engine.init(): Promise<void>`
158
+ Full API reference and types: [bdovenbird.com/albex/docs](https://bdovenbird.com/albex/docs).
95
159
 
96
- Fetches and initialises the main WASM module. Must be called before anything else.
160
+ ---
97
161
 
98
- ### `engine.indexFile(file: File): Promise<IndexedDocument>`
162
+ ## Off the main thread
99
163
 
100
- Detects the file format by extension, extracts text, and adds it to the search index. Throws for unsupported extensions or parse errors.
164
+ For interactive search UIs, run the engine inside a Web Worker:
101
165
 
102
166
  ```ts
103
- interface IndexedDocument {
104
- name: string;
105
- ext: string;
106
- chunks: number; // number of indexed text chunks
107
- indexTimeMs: number;
108
- textBytes: number; // raw UTF-8 text indexed
109
- }
167
+ import { AlbexEngineWorker } from 'albex/worker';
168
+
169
+ const engine = new AlbexEngineWorker({
170
+ workerUrl: new URL('albex/worker-runtime', import.meta.url),
171
+ });
172
+ await engine.init();
110
173
  ```
111
174
 
112
- ### `engine.search(query: string): SearchResult[]`
175
+ Same surface as `AlbexEngine`; everything returns a `Promise`.
113
176
 
114
- Returns results sorted by score (0–1000, descending).
177
+ ---
115
178
 
116
- ```ts
117
- interface SearchResult {
118
- documentName: string;
119
- location: number; // paragraph (DOCX/TXT) or page (PDF, 1-based)
120
- score: number; // 0–1000
121
- snippet: string; // full chunk text (original, with accents)
122
- matchStart: number; // byte offset of match in snippet
123
- matchEnd: number; // exclusive
124
- }
125
- ```
179
+ ## Sharding across cores
126
180
 
127
- ### `engine.getStats(): EngineStats`
181
+ For large corpora, an `AlbexPool` shards documents across N workers:
128
182
 
129
183
  ```ts
130
- interface EngineStats {
131
- documents: number;
132
- chunks: number;
133
- textUsed: number; // bytes
134
- textCapacity: number; // 16 MB hard cap
135
- wasmMemoryBytes: number;
136
- }
184
+ import { AlbexPool } from 'albex/pool';
185
+
186
+ const pool = new AlbexPool({
187
+ workerUrl: new URL('albex/worker-runtime', import.meta.url),
188
+ workers: 'auto', // = cores / 2, clamped [1, 8]
189
+ });
190
+ await pool.init();
191
+
192
+ await pool.indexFile(fileA); // sharded round-robin
193
+ const results = await pool.search('contrato'); // map-reduce
137
194
  ```
138
195
 
139
- ### `engine.getLastSearchStats(): SearchStats | null`
196
+ ---
197
+
198
+ ## Big corpora — tiered storage
140
199
 
141
- Bloom/Bitap pipeline counters from the most recent search — useful for debugging and UI dashboards.
200
+ For workloads that exceed the tier's RAM capacity:
142
201
 
143
202
  ```ts
144
- interface SearchStats {
145
- query: string;
146
- timeMs: number;
147
- results: number;
148
- bloomTested: number; // chunks tested
149
- bloomPassed: number; // passed bloom pre-filter
150
- bitapMatched: number; // confirmed by Bitap
151
- }
203
+ import { AlbexEngine, TieredStore } from 'albex';
204
+
205
+ const engine = new AlbexEngine();
206
+ await engine.init();
207
+
208
+ const store = new TieredStore(engine, { evictThreshold: 0.85 });
209
+ await store.init();
210
+
211
+ await store.indexFile(file); // persists original blob in OPFS
212
+ await store.promote('older-doc.pdf'); // brings warm doc back
152
213
  ```
153
214
 
154
- ### Tuning
215
+ Hot tier = engine. Warm tier = original files in OPFS. LRU eviction is automatic.
216
+
217
+ ---
218
+
219
+ ## Advanced configuration
220
+
221
+ `new AlbexEngine()` covers the default case. The options below address
222
+ specific deployment needs:
223
+
224
+ ### Tier auto-selection (`mini` / `std` / `pro` based on `deviceMemory`)
225
+
226
+ Albex ships **six** WASM variants of the main engine (3 tiers × baseline/SIMD).
227
+ By default it loads the std-baseline binary that comes with the npm package.
228
+ If you want runtime tier auto-selection, serve the variants yourself and
229
+ pass `wasmBaseUrl`:
155
230
 
156
231
  ```ts
157
- engine.setMaxErrors(n); // 0–3 (default 2, auto-scaled by query length)
158
- engine.setThreshold(n); // 0–1000 minimum score (default 250)
159
- engine.setMaxResults(n); // 1–200 (default 50)
232
+ const engine = new AlbexEngine({
233
+ wasmBaseUrl: '/assets', // directory containing the 6 .wasm files
234
+ tier: 'auto', // picks mini/std/pro by deviceMemory
235
+ simd: 'auto', // picks baseline/simd by WASM probe
236
+ gpu: 'auto', // engages WebGPU when corpus > 20k chunks
237
+ });
160
238
  ```
161
239
 
162
- ### `engine.reset()`
240
+ Tier capacities:
163
241
 
164
- Clears all indexed documents. The engine is ready to index new files immediately after.
242
+ | Tier | Max docs | Max chunks | Max text | Working set |
243
+ |-------|---------:|-----------:|---------:|------------:|
244
+ | mini | 32 | 25 000 | 4 MB | ~5 MB |
245
+ | std | 128 | 100 000 | 16 MB | ~20 MB |
246
+ | pro | 1 024 | 800 000 | 128 MB | ~160 MB |
247
+
248
+ ### Custom CDN
249
+
250
+ ```ts
251
+ const engine = new AlbexEngine({
252
+ wasmUrl: 'https://my-cdn.example.com/albex_wasm.wasm',
253
+ });
254
+ ```
165
255
 
166
256
  ---
167
257
 
168
- ## Capacity
258
+ ## Errors
259
+
260
+ All errors thrown by Albex extend `AlbexError`:
169
261
 
170
- | Resource | Limit |
171
- |----------|-------|
172
- | Documents | 128 |
173
- | Chunks | 100 000 |
174
- | Total text | 16 MB |
175
- | Query length | 64 characters (longer queries are truncated) |
176
- | Results | 200 (configurable, default 50) |
262
+ ```ts
263
+ import {
264
+ AlbexError, AlbexInitError, AlbexParseError,
265
+ AlbexUnsupportedFormatError, AlbexCapacityError,
266
+ } from 'albex';
267
+
268
+ try {
269
+ await engine.indexFile(file);
270
+ } catch (e) {
271
+ if (e instanceof AlbexUnsupportedFormatError) {
272
+ console.warn(`Skipped .${e.ext} (unsupported)`);
273
+ } else if (e instanceof AlbexParseError) {
274
+ console.warn(`Parse failed for ${e.format}:`, e.message);
275
+ } else throw e;
276
+ }
277
+ ```
177
278
 
178
- These are hard-coded BSS limits in the WASM module. Exceeding them is silent — the engine stops indexing additional content without error.
279
+ Each error carries a `kind` field that survives `structuredClone` across worker boundaries.
179
280
 
180
281
  ---
181
282
 
182
283
  ## Browser requirements
183
284
 
184
- - WebAssembly (all modern browsers since 2017)
285
+ - WebAssembly (every major browser since 2017)
185
286
  - `DecompressionStream` for DOCX/XLSX (Chrome 80+, Firefox 113+, Safari 16.4+)
186
- - `String.prototype.normalize` for phrase search (all modern browsers)
287
+ - OPFS for fastest persistence (Chrome 102+, Safari 15.2+, Firefox 111+); IndexedDB fallback works everywhere
288
+ - WebGPU is **optional**; without it the CPU path is the default
187
289
 
188
- PDF support additionally requires the `albex_pdf.wasm` module to be served with the correct MIME type (`application/wasm`).
290
+ PDF support requires `albex_pdf.wasm` to be served with MIME type `application/wasm`.
189
291
 
190
292
  ---
191
293
 
192
294
  ## Building from source
193
295
 
194
296
  ```bash
195
- # Install Rust + wasm-pack
196
297
  rustup target add wasm32-unknown-unknown
197
298
 
198
- # Build main WASM
199
- cd wasm && cargo build --target wasm32-unknown-unknown --release
200
- cp ../target/wasm32-unknown-unknown/release/albex_wasm.wasm pkg/albex_wasm_bg.wasm
299
+ npm install
300
+ npm run build:all # 6 main variants + PDF + TypeScript
301
+ ```
302
+
303
+ Partial builds:
201
304
 
202
- # Build PDF WASM
203
- cd ../pdf-wasm && cargo build --target wasm32-unknown-unknown --release
204
- cp ../target/wasm32-unknown-unknown/release/albex_pdf.wasm ../wasm/pkg/albex_pdf.wasm
305
+ ```bash
306
+ npm run build:wasm # std baseline only
307
+ npm run build:wasm:tiers # all 6 variants
308
+ npm run build:pdf-wasm # PDF module
309
+ npm run build # TypeScript only
310
+ ```
311
+
312
+ ---
313
+
314
+ ## Tests
205
315
 
206
- # Build TypeScript
207
- cd .. && npm install && npm run build
316
+ ```bash
317
+ # Rust unit tests
318
+ cargo test --manifest-path core/Cargo.toml
319
+ cargo test --manifest-path ingest/Cargo.toml
320
+
321
+ # TypeScript + WASM integration tests
322
+ npm test
323
+
324
+ # Micro-benchmarks
325
+ npm run bench
208
326
  ```
209
327
 
328
+ **About the benchmark.** The included bench probes per-operation overhead
329
+ on a 200-document synthetic corpus. It is **not** a corpus-level
330
+ performance claim — there is no representative real-world dataset checked
331
+ into the repo yet. Numbers from `npm run bench` should be read as
332
+ "this implementation does not regress against itself", not as comparisons
333
+ against other libraries.
334
+
335
+ CI runs every check on every push to `main`.
336
+
210
337
  ---
211
338
 
212
339
  ## Privacy
213
340
 
214
- Albex does not transmit any document content. Text extraction, indexing, and search all happen inside the browser's WASM sandbox. The only network requests are the initial fetch of the `.wasm` binary files.
341
+ Albex never transmits document content. Text extraction, indexing, search and
342
+ persistence all happen inside the browser. The only network requests are the
343
+ initial fetches for the `.wasm` binaries (and the lazy PDF module on first
344
+ PDF). Persisted snapshots live in OPFS / IndexedDB, scoped to your origin.
215
345
 
216
346
  ---
217
347
 
@@ -0,0 +1,70 @@
1
+ /**
2
+ * `AlbexEngineWorker` — a main-thread wrapper that runs the engine inside a
3
+ * Web Worker. Mirrors the surface of `AlbexEngine`, except that calls are
4
+ * asynchronous: results come back as a `Promise` (or an async iterable).
5
+ *
6
+ * Usage:
7
+ *
8
+ * const engine = new AlbexEngineWorker({
9
+ * wasmUrl: '/assets/albex_wasm_bg.wasm',
10
+ * pdfWasmUrl: '/assets/albex_pdf.wasm',
11
+ * // Provide the URL to the bundled worker runtime.
12
+ * workerUrl: new URL('./worker-runtime.js', import.meta.url),
13
+ * });
14
+ * await engine.init();
15
+ *
16
+ * Why: a `search()` over 100k chunks can take 10–50 ms. On the main thread that
17
+ * is visible jank for every keystroke. Off-main-thread keeps the UI at 60 fps.
18
+ *
19
+ * The runtime is single-threaded WASM, so requests are serialised: only one
20
+ * call is in flight at a time. This matches the actual `static mut` model
21
+ * inside the .wasm and is fine for an interactive search UI (each keystroke
22
+ * replaces the previous query).
23
+ */
24
+ import type { AlbexOptions, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
25
+ export interface AlbexWorkerOptions extends AlbexOptions {
26
+ /** URL to the bundled worker runtime script (worker-runtime.js). */
27
+ workerUrl: string | URL;
28
+ }
29
+ export declare class AlbexEngineWorker {
30
+ private readonly _opts;
31
+ private _worker;
32
+ private _nextId;
33
+ private _pending;
34
+ private _docsCache;
35
+ constructor(opts: AlbexWorkerOptions);
36
+ init(): Promise<void>;
37
+ private _send;
38
+ indexFile(file: File): Promise<IndexedDocument>;
39
+ search(query: string, opts?: SearchOptions): Promise<SearchResult[]>;
40
+ /**
41
+ * Cooperative variant of `search`. Today the wire still sends a single
42
+ * batch — the result array is fetched in one round-trip from the worker
43
+ * and then exposed as an async iterator so callers can `break` early.
44
+ * A future iteration may use a `MessagePort` to stream individual results
45
+ * from the worker side; the iterator shape is preserved across that
46
+ * transition.
47
+ */
48
+ searchCooperative(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
49
+ /**
50
+ * @deprecated Renamed to `searchCooperative` in 0.3.0. Alias removed in 0.4.0.
51
+ */
52
+ searchStream(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
53
+ removeDocument(id: string): Promise<boolean>;
54
+ compact(): Promise<void>;
55
+ reset(): Promise<void>;
56
+ getStats(): Promise<EngineStats>;
57
+ getLastSearchStats(): Promise<SearchStats | null>;
58
+ getDocuments(): Promise<readonly IndexedDocument[]>;
59
+ setMaxErrors(n: 0 | 1 | 2 | 3): Promise<void>;
60
+ setThreshold(n: number): Promise<void>;
61
+ setMaxResults(n: number): Promise<void>;
62
+ setLanguage(lang: 'off' | 'es'): Promise<void>;
63
+ save(name: string): Promise<void>;
64
+ load(name: string): Promise<boolean>;
65
+ loadOrInit(name: string): Promise<boolean>;
66
+ deleteSnapshot(name: string): Promise<void>;
67
+ listSnapshots(): Promise<string[]>;
68
+ [Symbol.dispose](): void;
69
+ }
70
+ //# sourceMappingURL=albex-worker.d.ts.map