albex 0.1.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/CHANGELOG.md +416 -0
  2. package/README.md +244 -112
  3. package/dist/albex-worker.d.ts +70 -0
  4. package/dist/albex-worker.d.ts.map +1 -0
  5. package/dist/albex-worker.js +153 -0
  6. package/dist/albex-worker.js.map +1 -0
  7. package/dist/albex.d.ts +508 -6
  8. package/dist/albex.d.ts.map +1 -1
  9. package/dist/albex.js +1911 -141
  10. package/dist/albex.js.map +1 -1
  11. package/dist/errors.d.ts +52 -0
  12. package/dist/errors.d.ts.map +1 -0
  13. package/dist/errors.js +66 -0
  14. package/dist/errors.js.map +1 -0
  15. package/dist/gpu/bloom-runtime.d.ts +60 -0
  16. package/dist/gpu/bloom-runtime.d.ts.map +1 -0
  17. package/dist/gpu/bloom-runtime.js +176 -0
  18. package/dist/gpu/bloom-runtime.js.map +1 -0
  19. package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
  20. package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
  21. package/dist/gpu/bloom-shader.wgsl.js +49 -0
  22. package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
  23. package/dist/persistence.d.ts +21 -0
  24. package/dist/persistence.d.ts.map +1 -0
  25. package/dist/persistence.js +174 -0
  26. package/dist/persistence.js.map +1 -0
  27. package/dist/pool/coordinator.d.ts +98 -0
  28. package/dist/pool/coordinator.d.ts.map +1 -0
  29. package/dist/pool/coordinator.js +247 -0
  30. package/dist/pool/coordinator.js.map +1 -0
  31. package/dist/profile.d.ts +100 -0
  32. package/dist/profile.d.ts.map +1 -0
  33. package/dist/profile.js +200 -0
  34. package/dist/profile.js.map +1 -0
  35. package/dist/resource-manager.d.ts +56 -0
  36. package/dist/resource-manager.d.ts.map +1 -0
  37. package/dist/resource-manager.js +138 -0
  38. package/dist/resource-manager.js.map +1 -0
  39. package/dist/tiered-store.d.ts +98 -0
  40. package/dist/tiered-store.d.ts.map +1 -0
  41. package/dist/tiered-store.js +238 -0
  42. package/dist/tiered-store.js.map +1 -0
  43. package/dist/wasm-bindings.d.ts +180 -0
  44. package/dist/wasm-bindings.d.ts.map +1 -0
  45. package/dist/wasm-bindings.js +128 -0
  46. package/dist/wasm-bindings.js.map +1 -0
  47. package/dist/worker-protocol.d.ts +86 -0
  48. package/dist/worker-protocol.d.ts.map +1 -0
  49. package/dist/worker-protocol.js +20 -0
  50. package/dist/worker-protocol.js.map +1 -0
  51. package/dist/worker-runtime.d.ts +14 -0
  52. package/dist/worker-runtime.d.ts.map +1 -0
  53. package/dist/worker-runtime.js +109 -0
  54. package/dist/worker-runtime.js.map +1 -0
  55. package/package.json +60 -13
  56. package/src/albex-worker.ts +187 -0
  57. package/src/albex.ts +2136 -189
  58. package/src/errors.ts +76 -0
  59. package/src/gpu/bloom-runtime.ts +229 -0
  60. package/src/gpu/bloom-shader.wgsl.ts +48 -0
  61. package/src/persistence.ts +175 -0
  62. package/src/pool/coordinator.ts +324 -0
  63. package/src/profile.ts +280 -0
  64. package/src/resource-manager.ts +167 -0
  65. package/src/tiered-store.ts +259 -0
  66. package/src/wasm-bindings.ts +349 -0
  67. package/src/worker-protocol.ts +48 -0
  68. package/src/worker-runtime.ts +106 -0
  69. package/wasm/pkg/albex_pdf.wasm +0 -0
  70. package/wasm/pkg/albex_wasm.wasm +0 -0
  71. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  72. package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/README.md CHANGED
@@ -1,31 +1,27 @@
1
1
  # Albex
2
2
 
3
- Local full-text search for documents. Runs entirely in the browser — no server, no upload, no network request after the initial load.
3
+ Local full-text search for documents. Runs entirely in the browser — no server,
4
+ no upload, no network request after the initial load.
4
5
 
5
- Drop a DOCX, PDF, XLSX, TXT or XML file, start typing, get results in milliseconds.
6
+ Drop a DOCX, PDF, XLSX, HTML, Markdown, JSON, CSV, EML, RTF, TXT, or XML file,
7
+ start typing, get results in milliseconds.
6
8
 
7
9
  ---
8
10
 
9
- ## Features
10
-
11
- - **Zero server** — all text stays on the user's machine.
12
- - **Fuzzy matching** — finds "contrato" even if you type "conttrato" (adaptive edit distance).
13
- - **Accent-insensitive** — "accion" matches "acción", "espana" matches "España".
14
- - **Multi-format** — DOCX, XLSX, PDF (text-based), TXT, XML.
15
- - **Phrase search** — `"contrato marco"` requires the words to appear together.
16
- - **OR search** — `contrato | acuerdo` unions two independent searches.
17
- - **No dependencies** — one TypeScript file, two WASM binaries, nothing else.
18
- - **Tiny footprint** — main WASM is ~14 KB on disk; PDF module (~1 MB) loads on demand.
19
-
20
- ---
21
-
22
- ## Installation
11
+ ## Install
23
12
 
24
13
  ```bash
25
14
  npm install albex
26
15
  ```
27
16
 
28
- Or copy `dist/albex.js`, `wasm/pkg/albex_wasm_bg.wasm` (and optionally `albex_pdf.wasm`) to your project.
17
+ The WASM binary ships inside the package. Bundlers that recognise the
18
+ `new URL('…', import.meta.url)` pattern (Vite, Webpack 5+, esbuild, Rollup,
19
+ Parcel 2) copy it to the output and rewrite the URL automatically.
20
+
21
+ Matrix-tested in CI today: **Vite** and **Node** (via the test suite).
22
+ Other bundlers and runtimes (Next SSR, Bun, Deno) should work through the
23
+ same pattern but are not currently exercised by the test matrix — if you
24
+ hit a problem, open an issue.
29
25
 
30
26
  ---
31
27
 
@@ -34,184 +30,320 @@ Or copy `dist/albex.js`, `wasm/pkg/albex_wasm_bg.wasm` (and optionally `albex_pd
34
30
  ```ts
35
31
  import { AlbexEngine } from 'albex';
36
32
 
37
- const engine = new AlbexEngine({
38
- wasmUrl: '/assets/albex_wasm_bg.wasm',
39
- pdfWasmUrl: '/assets/albex_pdf.wasm', // only needed for PDFs
40
- });
41
-
33
+ const engine = new AlbexEngine();
42
34
  await engine.init();
43
35
 
44
- // Index a file from a <input type="file"> or drag-and-drop
36
+ // Index a file from <input type="file"> or drag-and-drop.
45
37
  const file = inputElement.files[0];
46
38
  const doc = await engine.indexFile(file);
47
39
  console.log(`Indexed ${doc.chunks} chunks in ${doc.indexTimeMs.toFixed(0)} ms`);
48
40
 
49
- // Search
41
+ // Search.
50
42
  const results = engine.search('contrato marco');
51
43
  for (const r of results) {
52
44
  console.log(`[${r.score}] ${r.documentName} — ${r.snippet}`);
53
45
  }
54
46
  ```
55
47
 
48
+ Cooperative search — yields to the scheduler between slices so the UI thread
49
+ keeps a chance to paint while a long search runs:
50
+
51
+ ```ts
52
+ for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
53
+ renderResult(r);
54
+ }
55
+ ```
56
+
57
+ `searchCooperative` returns the same shape as `search`. The "stream" is not
58
+ incremental yet — results arrive in one batch after the search completes,
59
+ but the work is split into frame-budget slices that yield to the scheduler.
60
+ Real incremental streaming is on the backlog.
61
+
62
+ That's the entire onboarding. Read on for what else the engine can do.
63
+
64
+ ---
65
+
66
+ ## Features
67
+
68
+ - **Zero server** — all text stays on the user's machine.
69
+ - **Bundler-friendly default** — `new AlbexEngine()` works without extra
70
+ configuration in bundlers that recognise the `new URL(..., import.meta.url)`
71
+ asset pattern (see the "Install" section for the tested matrix).
72
+ - **Fuzzy matching** — finds `"contrato"` even if you type `"conttrato"` (Bitap with adaptive edit distance). Matching stays sound under a two-stage pre-filter (a character Bloom filter for exact tokens, a 256-bit **trigram (q-gram) signature** for everything): the pre-filter prunes the candidate set ~10× on prose without ever dropping a real approximate match.
73
+ - **Accent-insensitive** — `"accion"` matches `"acción"`, `"espana"` matches `"España"`, plus Latin Extended (Polish, Czech, Slovak, Turkish…).
74
+ - **11 formats with varying depth** — DOCX · XLSX · PDF · HTML · MD · JSON · CSV · EML · RTF · TXT · XML. See the support table below; several formats are deliberately "lite" (CSV is RFC-4180-lite, EML is MIME-lite, RTF is regex-stripped, etc.).
75
+ - **Phrase + OR queries** — `"contrato marco"` and `contrato | acuerdo` work out of the box.
76
+ - **Cooperative search** — `searchCooperative(query, { frameBudgetMs })` yields to the scheduler between slices. Results land in one batch (real incremental streaming is on the backlog).
77
+ - **Persistence** — snapshot the index to OPFS / IndexedDB and restore it. After `load()`, `engine.getStats().documents` is correct, `engine.search()` works against the restored corpus, and content-hash de-duplication survives the round-trip (snapshot v2). Older v1 snapshots still load — their docs come back with empty content hashes, so re-indexing the same files will create fresh slots until the next `save()` rewrites the snapshot as v2.
78
+ - **Incremental updates** — `removeDocument`, `replaceDocument`, `compact`. Content-hash dedup is automatic.
79
+ - **Resource aware** — pauses speculative work in background tabs, shrinks workers on low battery, defers PDF download on slow networks.
80
+ - **Off-main-thread** — `AlbexEngineWorker` mirror or `AlbexPool` shard across N workers (map-reduce search).
81
+ - **WebGPU pre-filter** — experimental, opt-in (`gpu: 'auto'`). Implemented for corpora over 20 k chunks; no reproducible speedup number yet — the bench in this repo runs on a 200-document synthetic corpus only.
82
+ - **SIMD opportunistic** — picks a SIMD-accelerated variant when the host supports v128.
83
+ - **Tiered storage** — `TieredStore` keeps recent docs hot, evicts cold ones to OPFS, promotes on demand.
84
+ - **Capacity-safe** — when a pool fills (`docs`/`chunks`/`text`/`names`), `indexFile` throws `AlbexCapacityError` with a `limit` field instead of silently truncating the corpus.
85
+ - **Re-entrancy-safe** — async operations on one engine serialize; sync `search`/`compact`/`reset` refuse to run mid-operation (`AlbexError` kind `busy`) rather than corrupting the shared WASM state. Use `searchCooperative` for overlapping search-as-you-type.
86
+ - **Typed errors** — `AlbexParseError`, `AlbexUnsupportedFormatError`, `AlbexCapacityError`, `AlbexInitError`. All extend `AlbexError`.
87
+ - **Tiny core** — main WASM 33 KB (37 KB SIMD). PDF module (~1.2 MB) loads on demand. The OCR companion (`@albex/ocr`) is a separate package and pulls Tesseract.js (~3.5 MB) only when you call `enableOcr()`.
88
+
56
89
  ---
57
90
 
58
91
  ## Supported formats
59
92
 
60
- | Extension | How text is extracted |
61
- |-----------|----------------------|
62
- | `.docx` | Native Rust/WASM XML parser — reads `word/document.xml` directly |
63
- | `.xlsx` | Native Rust/WASM XML parser — reads shared strings + inline strings |
64
- | `.pdf` | Separate `albex_pdf.wasm` (pure Rust, loaded on demand) |
65
- | `.txt` | Plain text split on double newlines |
66
- | `.xml` | Tag-stripped, entity-decoded |
93
+ | Extension | How text is extracted |
94
+ |--------------------|-----------------------|
95
+ | `.docx` | Native Rust/WASM XML parser — streams `word/document.xml` |
96
+ | `.xlsx` | Native Rust/WASM XML parser — shared strings + inline strings |
97
+ | `.pdf` | Separate `albex_pdf.wasm` (pure Rust, loaded on demand) |
98
+ | `.md` / `.markdown`| TS parser strips CommonMark marks |
99
+ | `.html` / `.htm` | TS parser — strips `<script>` / `<style>`, paragraphs at block boundaries |
100
+ | `.json` | TS parser — recursive walk over keys + string leaves |
101
+ | `.csv` | TS parser — RFC 4180 lite; one row per chunk |
102
+ | `.eml` | TS parser — MIME-lite: From/To/Subject + text/plain body |
103
+ | `.rtf` | TS parser — strips control words / groups |
104
+ | `.txt` | Plain text split on double newlines |
105
+ | `.xml` | Tag-stripped, entity-decoded |
67
106
 
68
107
  ---
69
108
 
70
109
  ## Query syntax
71
110
 
72
- | Input | Behaviour |
73
- |-------|-----------|
74
- | `contrato` | Fuzzy match, accent-insensitive |
75
- | `contrato marco` | Both words must appear in the same chunk |
76
- | `"contrato marco"` | Both words AND they must be adjacent (phrase) |
77
- | `contrato \| acuerdo` | OR: returns results matching either term |
111
+ | Input | Behaviour |
112
+ |----------------------|-----------|
113
+ | `contrato` | Fuzzy match, accent-insensitive |
114
+ | `contrato marco` | Both words must appear in the same chunk |
115
+ | `"contrato marco"` | Both words AND they must be adjacent (phrase) |
116
+ | `contrato \| acuerdo` | OR: union of results matching either branch |
78
117
 
79
118
  Up to 4 space-separated tokens per simple/phrase query. OR branches are unlimited.
80
119
 
81
120
  ---
82
121
 
83
- ## API reference
84
-
85
- ### `new AlbexEngine(opts)`
122
+ ## API at a glance
86
123
 
87
124
  ```ts
88
- interface AlbexOptions {
89
- wasmUrl: string; // required
90
- pdfWasmUrl?: string; // required only for PDF indexing
125
+ // Construct
126
+ const engine = new AlbexEngine();
127
+ await engine.init();
128
+
129
+ // Indexing
130
+ const doc = await engine.indexFile(file);
131
+
132
+ // Search (synchronous fast path)
133
+ const results = engine.search('contrato', { windowed: true });
134
+
135
+ // Cooperative search (yields to the scheduler between slices)
136
+ for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
137
+ /* … */
91
138
  }
139
+
140
+ // Incremental updates
141
+ engine.removeDocument('contract.pdf');
142
+ await engine.replaceDocument('contract.pdf', newFile);
143
+ engine.compact();
144
+
145
+ // Persistence (OPFS or IndexedDB)
146
+ await engine.save('my-corpus');
147
+ await engine.loadOrInit('my-corpus');
148
+
149
+ // Tuning
150
+ engine.setMaxErrors(2);
151
+ engine.setThreshold(400);
152
+ engine.setMaxResults(50);
153
+ engine.setLanguage('es');
154
+
155
+ // Introspection
156
+ const stats = engine.getStats();
157
+ const lastRun = engine.getLastSearchStats();
92
158
  ```
93
159
 
94
- ### `engine.init(): Promise<void>`
160
+ Full API reference and types: [bdovenbird.com/albex/docs](https://bdovenbird.com/albex/docs).
95
161
 
96
- Fetches and initialises the main WASM module. Must be called before anything else.
162
+ ---
97
163
 
98
- ### `engine.indexFile(file: File): Promise<IndexedDocument>`
164
+ ## Off the main thread
99
165
 
100
- Detects the file format by extension, extracts text, and adds it to the search index. Throws for unsupported extensions or parse errors.
166
+ For interactive search UIs, run the engine inside a Web Worker:
101
167
 
102
168
  ```ts
103
- interface IndexedDocument {
104
- name: string;
105
- ext: string;
106
- chunks: number; // number of indexed text chunks
107
- indexTimeMs: number;
108
- textBytes: number; // raw UTF-8 text indexed
109
- }
169
+ import { AlbexEngineWorker } from 'albex/worker';
170
+
171
+ const engine = new AlbexEngineWorker({
172
+ workerUrl: new URL('albex/worker-runtime', import.meta.url),
173
+ });
174
+ await engine.init();
110
175
  ```
111
176
 
112
- ### `engine.search(query: string): SearchResult[]`
177
+ Same surface as `AlbexEngine`; everything returns a `Promise`.
113
178
 
114
- Returns results sorted by score (0–1000, descending).
179
+ ---
115
180
 
116
- ```ts
117
- interface SearchResult {
118
- documentName: string;
119
- location: number; // paragraph (DOCX/TXT) or page (PDF, 1-based)
120
- score: number; // 0–1000
121
- snippet: string; // full chunk text (original, with accents)
122
- matchStart: number; // byte offset of match in snippet
123
- matchEnd: number; // exclusive
124
- }
125
- ```
181
+ ## Sharding across cores
126
182
 
127
- ### `engine.getStats(): EngineStats`
183
+ For large corpora, an `AlbexPool` shards documents across N workers:
128
184
 
129
185
  ```ts
130
- interface EngineStats {
131
- documents: number;
132
- chunks: number;
133
- textUsed: number; // bytes
134
- textCapacity: number; // 16 MB hard cap
135
- wasmMemoryBytes: number;
136
- }
186
+ import { AlbexPool } from 'albex/pool';
187
+
188
+ const pool = new AlbexPool({
189
+ workerUrl: new URL('albex/worker-runtime', import.meta.url),
190
+ workers: 'auto', // = cores / 2, clamped [1, 8]
191
+ });
192
+ await pool.init();
193
+
194
+ await pool.indexFile(fileA); // sharded round-robin
195
+ const results = await pool.search('contrato'); // map-reduce
137
196
  ```
138
197
 
139
- ### `engine.getLastSearchStats(): SearchStats | null`
198
+ ---
199
+
200
+ ## Big corpora — tiered storage
140
201
 
141
- Bloom/Bitap pipeline counters from the most recent search — useful for debugging and UI dashboards.
202
+ For workloads that exceed the tier's RAM capacity:
142
203
 
143
204
  ```ts
144
- interface SearchStats {
145
- query: string;
146
- timeMs: number;
147
- results: number;
148
- bloomTested: number; // chunks tested
149
- bloomPassed: number; // passed bloom pre-filter
150
- bitapMatched: number; // confirmed by Bitap
151
- }
205
+ import { AlbexEngine, TieredStore } from 'albex';
206
+
207
+ const engine = new AlbexEngine();
208
+ await engine.init();
209
+
210
+ const store = new TieredStore(engine, { evictThreshold: 0.85 });
211
+ await store.init();
212
+
213
+ await store.indexFile(file); // persists original blob in OPFS
214
+ await store.promote('older-doc.pdf'); // brings warm doc back
152
215
  ```
153
216
 
154
- ### Tuning
217
+ Hot tier = engine. Warm tier = original files in OPFS. LRU eviction is automatic.
218
+
219
+ ---
220
+
221
+ ## Advanced configuration
222
+
223
+ `new AlbexEngine()` covers the default case. The options below address
224
+ specific deployment needs:
225
+
226
+ ### Tier auto-selection (`mini` / `std` / `pro` based on `deviceMemory`)
227
+
228
+ Albex ships **six** WASM variants of the main engine (3 tiers × baseline/SIMD).
229
+ By default it loads the std-baseline binary that comes with the npm package.
230
+ If you want runtime tier auto-selection, serve the variants yourself and
231
+ pass `wasmBaseUrl`:
155
232
 
156
233
  ```ts
157
- engine.setMaxErrors(n); // 0–3 (default 2, auto-scaled by query length)
158
- engine.setThreshold(n); // 0–1000 minimum score (default 250)
159
- engine.setMaxResults(n); // 1–200 (default 50)
234
+ const engine = new AlbexEngine({
235
+ wasmBaseUrl: '/assets', // directory containing the 6 .wasm files
236
+ tier: 'auto', // picks mini/std/pro by deviceMemory
237
+ simd: 'auto', // picks baseline/simd by WASM probe
238
+ gpu: 'auto', // engages WebGPU when corpus > 20k chunks
239
+ });
160
240
  ```
161
241
 
162
- ### `engine.reset()`
242
+ Tier capacities:
163
243
 
164
- Clears all indexed documents. The engine is ready to index new files immediately after.
244
+ | Tier | Max docs | Max chunks | Max text | Working set |
245
+ |-------|---------:|-----------:|---------:|------------:|
246
+ | mini | 32 | 25 000 | 4 MB | ~5 MB |
247
+ | std | 128 | 100 000 | 16 MB | ~20 MB |
248
+ | pro | 1 024 | 800 000 | 128 MB | ~160 MB |
249
+
250
+ ### Custom CDN
251
+
252
+ ```ts
253
+ const engine = new AlbexEngine({
254
+ wasmUrl: 'https://my-cdn.example.com/albex_wasm.wasm',
255
+ });
256
+ ```
165
257
 
166
258
  ---
167
259
 
168
- ## Capacity
260
+ ## Errors
261
+
262
+ All errors thrown by Albex extend `AlbexError`:
169
263
 
170
- | Resource | Limit |
171
- |----------|-------|
172
- | Documents | 128 |
173
- | Chunks | 100 000 |
174
- | Total text | 16 MB |
175
- | Query length | 64 characters (longer queries are truncated) |
176
- | Results | 200 (configurable, default 50) |
264
+ ```ts
265
+ import {
266
+ AlbexError, AlbexInitError, AlbexParseError,
267
+ AlbexUnsupportedFormatError, AlbexCapacityError,
268
+ } from 'albex';
269
+
270
+ try {
271
+ await engine.indexFile(file);
272
+ } catch (e) {
273
+ if (e instanceof AlbexUnsupportedFormatError) {
274
+ console.warn(`Skipped .${e.ext} (unsupported)`);
275
+ } else if (e instanceof AlbexParseError) {
276
+ console.warn(`Parse failed for ${e.format}:`, e.message);
277
+ } else throw e;
278
+ }
279
+ ```
177
280
 
178
- These are hard-coded BSS limits in the WASM module. Exceeding them is silent — the engine stops indexing additional content without error.
281
+ Each error carries a `kind` field that survives `structuredClone` across worker boundaries.
179
282
 
180
283
  ---
181
284
 
182
285
  ## Browser requirements
183
286
 
184
- - WebAssembly (all modern browsers since 2017)
287
+ - WebAssembly (every major browser since 2017)
185
288
  - `DecompressionStream` for DOCX/XLSX (Chrome 80+, Firefox 113+, Safari 16.4+)
186
- - `String.prototype.normalize` for phrase search (all modern browsers)
289
+ - OPFS for fastest persistence (Chrome 102+, Safari 15.2+, Firefox 111+); IndexedDB fallback works everywhere
290
+ - WebGPU is **optional**; without it the CPU path is the default
187
291
 
188
- PDF support additionally requires the `albex_pdf.wasm` module to be served with the correct MIME type (`application/wasm`).
292
+ PDF support requires `albex_pdf.wasm` to be served with MIME type `application/wasm`.
189
293
 
190
294
  ---
191
295
 
192
296
  ## Building from source
193
297
 
194
298
  ```bash
195
- # Install Rust + wasm-pack
196
299
  rustup target add wasm32-unknown-unknown
197
300
 
198
- # Build main WASM
199
- cd wasm && cargo build --target wasm32-unknown-unknown --release
200
- cp ../target/wasm32-unknown-unknown/release/albex_wasm.wasm pkg/albex_wasm_bg.wasm
301
+ npm install
302
+ npm run build:all # 6 main variants + PDF + TypeScript
303
+ ```
304
+
305
+ Partial builds:
201
306
 
202
- # Build PDF WASM
203
- cd ../pdf-wasm && cargo build --target wasm32-unknown-unknown --release
204
- cp ../target/wasm32-unknown-unknown/release/albex_pdf.wasm ../wasm/pkg/albex_pdf.wasm
307
+ ```bash
308
+ npm run build:wasm # std baseline only
309
+ npm run build:wasm:tiers # all 6 variants
310
+ npm run build:pdf-wasm # PDF module
311
+ npm run build # TypeScript only
312
+ ```
313
+
314
+ ---
315
+
316
+ ## Tests
205
317
 
206
- # Build TypeScript
207
- cd .. && npm install && npm run build
318
+ ```bash
319
+ # Rust unit tests
320
+ cargo test --manifest-path core/Cargo.toml
321
+ cargo test --manifest-path ingest/Cargo.toml
322
+
323
+ # TypeScript + WASM integration tests
324
+ npm test
325
+
326
+ # Micro-benchmarks
327
+ npm run bench
208
328
  ```
209
329
 
330
+ **About the benchmark.** The included bench probes per-operation overhead
331
+ on a 200-document synthetic corpus. It is **not** a corpus-level
332
+ performance claim — there is no representative real-world dataset checked
333
+ into the repo yet. Numbers from `npm run bench` should be read as
334
+ "this implementation does not regress against itself", not as comparisons
335
+ against other libraries.
336
+
337
+ CI runs every check on every push to `main`.
338
+
210
339
  ---
211
340
 
212
341
  ## Privacy
213
342
 
214
- Albex does not transmit any document content. Text extraction, indexing, and search all happen inside the browser's WASM sandbox. The only network requests are the initial fetch of the `.wasm` binary files.
343
+ Albex never transmits document content. Text extraction, indexing, search and
344
+ persistence all happen inside the browser. The only network requests are the
345
+ initial fetches for the `.wasm` binaries (and the lazy PDF module on first
346
+ PDF). Persisted snapshots live in OPFS / IndexedDB, scoped to your origin.
215
347
 
216
348
  ---
217
349
 
@@ -0,0 +1,70 @@
1
+ /**
2
+ * `AlbexEngineWorker` — a main-thread wrapper that runs the engine inside a
3
+ * Web Worker. Mirrors the surface of `AlbexEngine`, except that every method
4
+ * returns a `Promise` (calls such as `search` must be awaited).
5
+ *
6
+ * Usage:
7
+ *
8
+ * const engine = new AlbexEngineWorker({
9
+ * wasmUrl: '/assets/albex_wasm_bg.wasm',
10
+ * pdfWasmUrl: '/assets/albex_pdf.wasm',
11
+ * // Provide the URL to the bundled worker runtime.
12
+ * workerUrl: new URL('./worker-runtime.js', import.meta.url),
13
+ * });
14
+ * await engine.init();
15
+ *
16
+ * Why: a `search()` over 100k chunks can take 10–50 ms. On the main thread that
17
+ * is visible jank for every keystroke. Off-main-thread keeps the UI at 60 fps.
18
+ *
19
+ * The runtime is single-threaded WASM, so requests are serialised: only one
20
+ * call is in flight at a time. This matches the actual `static mut` model
21
+ * inside the .wasm and is fine for an interactive search UI (each keystroke
22
+ * replaces the previous query).
23
+ */
24
+ import type { AlbexOptions, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
25
+ export interface AlbexWorkerOptions extends AlbexOptions {
26
+ /** URL to the bundled worker runtime script (worker-runtime.js). */
27
+ workerUrl: string | URL;
28
+ }
29
+ export declare class AlbexEngineWorker {
30
+ private readonly _opts;
31
+ private _worker;
32
+ private _nextId;
33
+ private _pending;
34
+ private _docsCache;
35
+ constructor(opts: AlbexWorkerOptions);
36
+ init(): Promise<void>;
37
+ private _send;
38
+ indexFile(file: File): Promise<IndexedDocument>;
39
+ search(query: string, opts?: SearchOptions): Promise<SearchResult[]>;
40
+ /**
41
+ * Cooperative variant of `search`. Today the wire still sends a single
42
+ * batch — the result array is fetched in one round-trip from the worker
43
+ * and then exposed as an async iterator so callers can `break` early.
44
+ * A future iteration may use a `MessagePort` to stream individual results
45
+ * from the worker side; the iterator shape is preserved across that
46
+ * transition.
47
+ */
48
+ searchCooperative(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
49
+ /**
50
+ * @deprecated Renamed to `searchCooperative` in 0.3.0. This alias is still shipped for backward compatibility; use `searchCooperative` instead.
51
+ */
52
+ searchStream(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
53
+ removeDocument(id: string): Promise<boolean>;
54
+ compact(): Promise<void>;
55
+ reset(): Promise<void>;
56
+ getStats(): Promise<EngineStats>;
57
+ getLastSearchStats(): Promise<SearchStats | null>;
58
+ getDocuments(): Promise<readonly IndexedDocument[]>;
59
+ setMaxErrors(n: 0 | 1 | 2 | 3): Promise<void>;
60
+ setThreshold(n: number): Promise<void>;
61
+ setMaxResults(n: number): Promise<void>;
62
+ setLanguage(lang: 'off' | 'es'): Promise<void>;
63
+ save(name: string): Promise<void>;
64
+ load(name: string): Promise<boolean>;
65
+ loadOrInit(name: string): Promise<boolean>;
66
+ deleteSnapshot(name: string): Promise<void>;
67
+ listSnapshots(): Promise<string[]>;
68
+ [Symbol.dispose](): void;
69
+ }
70
+ //# sourceMappingURL=albex-worker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"albex-worker.d.ts","sourceRoot":"","sources":["../src/albex-worker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EACV,YAAY,EACZ,eAAe,EACf,aAAa,EACb,YAAY,EACZ,WAAW,EACX,WAAW,EACZ,MAAM,YAAY,CAAC;AAcpB,MAAM,WAAW,kBAAmB,SAAQ,YAAY;IACtD,oEAAoE;IACpE,SAAS,EAAE,MAAM,GAAG,GAAG,CAAC;CACzB;AASD,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAqB;IAC3C,OAAO,CAAC,OAAO,CAAU;IACzB,OAAO,CAAC,OAAO,CAAK;IACpB,OAAO,CAAC,QAAQ,CAA8B;IAC9C,OAAO,CAAC,UAAU,CAAyB;gBAE/B,IAAI,EAAE,kBAAkB;IAI9B,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAsB3B,OAAO,CAAC,KAAK;IASP,SAAS,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO,CAAC,eAAe,CAAC;IAWrD,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAIxE;;;;;;;OAOG;IACI,iBAAiB,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,aAAkB,GAAG,aAAa,CAAC,YAAY,CAAC;IAK9F;;OAEG;IACI,YAAY,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,aAAkB,GAAG,aAAa,CAAC,YAAY,CAAC;IAQnF,cAAc,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAM5C,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IACxB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAK5B,QAAQ,IAAa,OAAO,CAAC,WAAW,CAAC;IACzC,kBAAkB,IAAI,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC;IACjD,YAAY,IAAU,OAAO,CAAC,SAAS,eAAe,EAAE,CAAC;IAEnD,YAAY,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IAC7C,YAAY,CAAC,CAAC,EAAE,MAAM,GAAS,OAAO,CAAC,IAAI,CAAC;IAC5C,aAAa,CAAC,CAAC,EAAE,MAAM,GAAQ,OAAO,CAAC,IAAI,CAAC;IAC5C,WAAW,CAAC,IAAI,EAAE,KAAK,GAAG,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAG9C,IAAI,CAAC,IAAI,EAAE,MAAM,GAAa,OAAO,CAAC,IAAI,CAAC;IAC3C,IAAI,CAAC,IAAI,EAAE,MAAM,GAAa,OAAO,CAAC,OAAO,CAAC;IAC9C,UAAU,CAAC,IAAI,EAAE,MAAM,GAAO,OAAO,CAAC,OAAO,CAAC;IAC9C,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAC3C,aAAa,IAAiB,OAAO,CAAC,MAAM,EAAE,CAAC;IAErD,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,IAAI;CAMzB"}