albex 0.1.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +416 -0
- package/README.md +244 -112
- package/dist/albex-worker.d.ts +70 -0
- package/dist/albex-worker.d.ts.map +1 -0
- package/dist/albex-worker.js +153 -0
- package/dist/albex-worker.js.map +1 -0
- package/dist/albex.d.ts +508 -6
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +1911 -141
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +52 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +66 -0
- package/dist/errors.js.map +1 -0
- package/dist/gpu/bloom-runtime.d.ts +60 -0
- package/dist/gpu/bloom-runtime.d.ts.map +1 -0
- package/dist/gpu/bloom-runtime.js +176 -0
- package/dist/gpu/bloom-runtime.js.map +1 -0
- package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
- package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
- package/dist/gpu/bloom-shader.wgsl.js +49 -0
- package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
- package/dist/persistence.d.ts +21 -0
- package/dist/persistence.d.ts.map +1 -0
- package/dist/persistence.js +174 -0
- package/dist/persistence.js.map +1 -0
- package/dist/pool/coordinator.d.ts +98 -0
- package/dist/pool/coordinator.d.ts.map +1 -0
- package/dist/pool/coordinator.js +247 -0
- package/dist/pool/coordinator.js.map +1 -0
- package/dist/profile.d.ts +100 -0
- package/dist/profile.d.ts.map +1 -0
- package/dist/profile.js +200 -0
- package/dist/profile.js.map +1 -0
- package/dist/resource-manager.d.ts +56 -0
- package/dist/resource-manager.d.ts.map +1 -0
- package/dist/resource-manager.js +138 -0
- package/dist/resource-manager.js.map +1 -0
- package/dist/tiered-store.d.ts +98 -0
- package/dist/tiered-store.d.ts.map +1 -0
- package/dist/tiered-store.js +238 -0
- package/dist/tiered-store.js.map +1 -0
- package/dist/wasm-bindings.d.ts +180 -0
- package/dist/wasm-bindings.d.ts.map +1 -0
- package/dist/wasm-bindings.js +128 -0
- package/dist/wasm-bindings.js.map +1 -0
- package/dist/worker-protocol.d.ts +86 -0
- package/dist/worker-protocol.d.ts.map +1 -0
- package/dist/worker-protocol.js +20 -0
- package/dist/worker-protocol.js.map +1 -0
- package/dist/worker-runtime.d.ts +14 -0
- package/dist/worker-runtime.d.ts.map +1 -0
- package/dist/worker-runtime.js +109 -0
- package/dist/worker-runtime.js.map +1 -0
- package/package.json +60 -13
- package/src/albex-worker.ts +187 -0
- package/src/albex.ts +2136 -189
- package/src/errors.ts +76 -0
- package/src/gpu/bloom-runtime.ts +229 -0
- package/src/gpu/bloom-shader.wgsl.ts +48 -0
- package/src/persistence.ts +175 -0
- package/src/pool/coordinator.ts +324 -0
- package/src/profile.ts +280 -0
- package/src/resource-manager.ts +167 -0
- package/src/tiered-store.ts +259 -0
- package/src/wasm-bindings.ts +349 -0
- package/src/worker-protocol.ts +48 -0
- package/src/worker-runtime.ts +106 -0
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/README.md
CHANGED
|
@@ -1,31 +1,27 @@
|
|
|
1
1
|
# Albex
|
|
2
2
|
|
|
3
|
-
Local full-text search for documents. Runs entirely in the browser — no server,
|
|
3
|
+
Local full-text search for documents. Runs entirely in the browser — no server,
|
|
4
|
+
no upload, and no network requests other than fetching Albex's own `.wasm` binaries (the PDF module is fetched lazily, on the first PDF).
|
|
4
5
|
|
|
5
|
-
Drop a DOCX, PDF, XLSX,
|
|
6
|
+
Drop a DOCX, PDF, XLSX, HTML, Markdown, JSON, CSV, EML, RTF, TXT, or XML file,
|
|
7
|
+
start typing, get results in milliseconds.
|
|
6
8
|
|
|
7
9
|
---
|
|
8
10
|
|
|
9
|
-
##
|
|
10
|
-
|
|
11
|
-
- **Zero server** — all text stays on the user's machine.
|
|
12
|
-
- **Fuzzy matching** — finds "contrato" even if you type "conttrato" (adaptive edit distance).
|
|
13
|
-
- **Accent-insensitive** — "accion" matches "acción", "espana" matches "España".
|
|
14
|
-
- **Multi-format** — DOCX, XLSX, PDF (text-based), TXT, XML.
|
|
15
|
-
- **Phrase search** — `"contrato marco"` requires the words to appear together.
|
|
16
|
-
- **OR search** — `contrato | acuerdo` unions two independent searches.
|
|
17
|
-
- **No dependencies** — one TypeScript file, two WASM binaries, nothing else.
|
|
18
|
-
- **Tiny footprint** — main WASM is ~14 KB on disk; PDF module (~1 MB) loads on demand.
|
|
19
|
-
|
|
20
|
-
---
|
|
21
|
-
|
|
22
|
-
## Installation
|
|
11
|
+
## Install
|
|
23
12
|
|
|
24
13
|
```bash
|
|
25
14
|
npm install albex
|
|
26
15
|
```
|
|
27
16
|
|
|
28
|
-
|
|
17
|
+
The WASM binary ships inside the package. Bundlers that recognise the
|
|
18
|
+
`new URL('…', import.meta.url)` pattern (Vite, Webpack 5+, esbuild, Rollup,
|
|
19
|
+
Parcel 2) copy it to the output and rewrite the URL automatically.
|
|
20
|
+
|
|
21
|
+
Matrix-tested in CI today: **Vite** and **Node** (via the test suite).
|
|
22
|
+
Other bundlers and runtimes (Next SSR, Bun, Deno) should work through the
|
|
23
|
+
same pattern but are not currently exercised by the test matrix — if you
|
|
24
|
+
hit a problem, open an issue.
|
|
29
25
|
|
|
30
26
|
---
|
|
31
27
|
|
|
@@ -34,184 +30,320 @@ Or copy `dist/albex.js`, `wasm/pkg/albex_wasm_bg.wasm` (and optionally `albex_pd
|
|
|
34
30
|
```ts
|
|
35
31
|
import { AlbexEngine } from 'albex';
|
|
36
32
|
|
|
37
|
-
const engine = new AlbexEngine(
|
|
38
|
-
wasmUrl: '/assets/albex_wasm_bg.wasm',
|
|
39
|
-
pdfWasmUrl: '/assets/albex_pdf.wasm', // only needed for PDFs
|
|
40
|
-
});
|
|
41
|
-
|
|
33
|
+
const engine = new AlbexEngine();
|
|
42
34
|
await engine.init();
|
|
43
35
|
|
|
44
|
-
// Index a file from
|
|
36
|
+
// Index a file from <input type="file"> or drag-and-drop.
|
|
45
37
|
const file = inputElement.files[0];
|
|
46
38
|
const doc = await engine.indexFile(file);
|
|
47
39
|
console.log(`Indexed ${doc.chunks} chunks in ${doc.indexTimeMs.toFixed(0)} ms`);
|
|
48
40
|
|
|
49
|
-
// Search
|
|
41
|
+
// Search.
|
|
50
42
|
const results = engine.search('contrato marco');
|
|
51
43
|
for (const r of results) {
|
|
52
44
|
console.log(`[${r.score}] ${r.documentName} — ${r.snippet}`);
|
|
53
45
|
}
|
|
54
46
|
```
|
|
55
47
|
|
|
48
|
+
Cooperative search — yields to the scheduler between slices so the UI thread
|
|
49
|
+
keeps a chance to paint while a long search runs:
|
|
50
|
+
|
|
51
|
+
```ts
|
|
52
|
+
for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
|
|
53
|
+
renderResult(r);
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
`searchCooperative` yields the same `SearchResult` objects that `search` returns, as an async iterable. The "stream" is not
|
|
58
|
+
incremental yet — results arrive in one batch after the search completes,
|
|
59
|
+
but the work is split into frame-budget slices that yield to the scheduler.
|
|
60
|
+
Real incremental streaming is on the backlog.
|
|
61
|
+
|
|
62
|
+
That's the entire onboarding. Read on for what else the engine can do.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Features
|
|
67
|
+
|
|
68
|
+
- **Zero server** — all text stays on the user's machine.
|
|
69
|
+
- **Bundler-friendly default** — `new AlbexEngine()` works without extra
|
|
70
|
+
configuration in bundlers that recognise the `new URL(..., import.meta.url)`
|
|
71
|
+
asset pattern (see the "Install" section for the tested matrix).
|
|
72
|
+
- **Fuzzy matching** — finds `"contrato"` even if you type `"conttrato"` (Bitap with adaptive edit distance). A two-stage pre-filter (character Bloom for exact tokens, a 256-bit **trigram signature** for everything) prunes the candidate set ~10× on prose and is sound: it never drops a real approximate match.
|
|
73
|
+
- **Accent-insensitive** — `"accion"` matches `"acción"`, `"espana"` matches `"España"`, plus Latin Extended (Polish, Czech, Slovak, Turkish…).
|
|
74
|
+
- **11 formats with varying depth** — DOCX · XLSX · PDF · HTML · MD · JSON · CSV · EML · RTF · TXT · XML. See the support table below; several formats are deliberately "lite" (CSV is RFC-4180-lite, EML is MIME-lite, RTF is regex-stripped, etc.).
|
|
75
|
+
- **Phrase + OR queries** — `"contrato marco"` and `contrato | acuerdo` work out of the box.
|
|
76
|
+
- **Cooperative search** — `searchCooperative(query, { frameBudgetMs })` yields to the scheduler between slices. Results land in one batch (real incremental streaming is on the backlog).
|
|
77
|
+
- **Persistence** — snapshot the index to OPFS / IndexedDB and restore it. After `load()`, `engine.getStats().documents` is correct, `engine.search()` works against the restored corpus, and content-hash de-duplication survives the round-trip (snapshot v2). Older v1 snapshots still load — their docs come back with empty content hashes, so re-indexing the same files will create fresh slots until the next `save()` rewrites the snapshot as v2.
|
|
78
|
+
- **Incremental updates** — `removeDocument`, `replaceDocument`, `compact`. Content-hash dedup is automatic.
|
|
79
|
+
- **Resource aware** — pauses speculative work in background tabs, shrinks workers on low battery, defers PDF download on slow networks.
|
|
80
|
+
- **Off-main-thread** — `AlbexEngineWorker` mirror or `AlbexPool` shard across N workers (map-reduce search).
|
|
81
|
+
- **WebGPU pre-filter** — experimental, opt-in (`gpu: 'auto'`). Implemented for corpora over 20 k chunks; no reproducible speedup number yet — the bench in this repo runs on a 200-document synthetic corpus only.
|
|
82
|
+
- **SIMD opportunistic** — picks a SIMD-accelerated variant when the host supports v128.
|
|
83
|
+
- **Tiered storage** — `TieredStore` keeps recent docs hot, evicts cold ones to OPFS, promotes on demand.
|
|
84
|
+
- **Capacity-safe** — when a pool fills (`docs`/`chunks`/`text`/`names`), `indexFile` throws `AlbexCapacityError` with a `limit` field instead of silently truncating the corpus.
|
|
85
|
+
- **Re-entrancy-safe** — async operations on one engine serialize; sync `search`/`compact`/`reset` refuse to run mid-operation (`AlbexError` kind `busy`) rather than corrupting the shared WASM state. Use `searchCooperative` for overlapping search-as-you-type.
|
|
86
|
+
- **Typed errors** — `AlbexParseError`, `AlbexUnsupportedFormatError`, `AlbexCapacityError`, `AlbexInitError`. All extend `AlbexError`.
|
|
87
|
+
- **Tiny core** — main WASM 33 KB (37 KB SIMD). PDF module (~1.2 MB) loads on demand. The OCR companion (`@albex/ocr`) is a separate package and pulls Tesseract.js (~3.5 MB) only when you call `enableOcr()`.
|
|
88
|
+
|
|
56
89
|
---
|
|
57
90
|
|
|
58
91
|
## Supported formats
|
|
59
92
|
|
|
60
|
-
| Extension
|
|
61
|
-
|
|
62
|
-
| `.docx`
|
|
63
|
-
| `.xlsx`
|
|
64
|
-
| `.pdf`
|
|
65
|
-
| `.
|
|
66
|
-
| `.
|
|
93
|
+
| Extension | How text is extracted |
|
|
94
|
+
|--------------------|-----------------------|
|
|
95
|
+
| `.docx` | Native Rust/WASM XML parser — streams `word/document.xml` |
|
|
96
|
+
| `.xlsx` | Native Rust/WASM XML parser — shared strings + inline strings |
|
|
97
|
+
| `.pdf` | Separate `albex_pdf.wasm` (pure Rust, loaded on demand) |
|
|
98
|
+
| `.md` / `.markdown`| TS parser — strips CommonMark marks |
|
|
99
|
+
| `.html` / `.htm` | TS parser — strips `<script>` / `<style>`, paragraphs at block boundaries |
|
|
100
|
+
| `.json` | TS parser — recursive walk over keys + string leaves |
|
|
101
|
+
| `.csv` | TS parser — RFC 4180 lite; one row per chunk |
|
|
102
|
+
| `.eml` | TS parser — MIME-lite: From/To/Subject + text/plain body |
|
|
103
|
+
| `.rtf` | TS parser — strips control words / groups |
|
|
104
|
+
| `.txt` | Plain text split on double newlines |
|
|
105
|
+
| `.xml` | Tag-stripped, entity-decoded |
|
|
67
106
|
|
|
68
107
|
---
|
|
69
108
|
|
|
70
109
|
## Query syntax
|
|
71
110
|
|
|
72
|
-
| Input
|
|
73
|
-
|
|
74
|
-
| `contrato`
|
|
75
|
-
| `contrato marco`
|
|
76
|
-
| `"contrato marco"`
|
|
77
|
-
| `contrato \| acuerdo` | OR:
|
|
111
|
+
| Input | Behaviour |
|
|
112
|
+
|----------------------|-----------|
|
|
113
|
+
| `contrato` | Fuzzy match, accent-insensitive |
|
|
114
|
+
| `contrato marco` | Both words must appear in the same chunk |
|
|
115
|
+
| `"contrato marco"` | Both words AND they must be adjacent (phrase) |
|
|
116
|
+
| `contrato \| acuerdo` | OR: union of results matching either branch |
|
|
78
117
|
|
|
79
118
|
Up to 4 space-separated tokens per simple/phrase query. OR branches are unlimited.
|
|
80
119
|
|
|
81
120
|
---
|
|
82
121
|
|
|
83
|
-
## API
|
|
84
|
-
|
|
85
|
-
### `new AlbexEngine(opts)`
|
|
122
|
+
## API at a glance
|
|
86
123
|
|
|
87
124
|
```ts
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
125
|
+
// Construct
|
|
126
|
+
const engine = new AlbexEngine();
|
|
127
|
+
await engine.init();
|
|
128
|
+
|
|
129
|
+
// Indexing
|
|
130
|
+
const doc = await engine.indexFile(file);
|
|
131
|
+
|
|
132
|
+
// Search (synchronous fast path)
|
|
133
|
+
const results = engine.search('contrato', { windowed: true });
|
|
134
|
+
|
|
135
|
+
// Cooperative search (yields to the scheduler between slices)
|
|
136
|
+
for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
|
|
137
|
+
/* … */
|
|
91
138
|
}
|
|
139
|
+
|
|
140
|
+
// Incremental updates
|
|
141
|
+
engine.removeDocument('contract.pdf');
|
|
142
|
+
await engine.replaceDocument('contract.pdf', newFile);
|
|
143
|
+
engine.compact();
|
|
144
|
+
|
|
145
|
+
// Persistence (OPFS or IndexedDB)
|
|
146
|
+
await engine.save('my-corpus');
|
|
147
|
+
await engine.loadOrInit('my-corpus');
|
|
148
|
+
|
|
149
|
+
// Tuning
|
|
150
|
+
engine.setMaxErrors(2);
|
|
151
|
+
engine.setThreshold(400);
|
|
152
|
+
engine.setMaxResults(50);
|
|
153
|
+
engine.setLanguage('es');
|
|
154
|
+
|
|
155
|
+
// Introspection
|
|
156
|
+
const stats = engine.getStats();
|
|
157
|
+
const lastRun = engine.getLastSearchStats();
|
|
92
158
|
```
|
|
93
159
|
|
|
94
|
-
|
|
160
|
+
Full API reference and types: [bdovenbird.com/albex/docs](https://bdovenbird.com/albex/docs).
|
|
95
161
|
|
|
96
|
-
|
|
162
|
+
---
|
|
97
163
|
|
|
98
|
-
|
|
164
|
+
## Off the main thread
|
|
99
165
|
|
|
100
|
-
|
|
166
|
+
For interactive search UIs, run the engine inside a Web Worker:
|
|
101
167
|
|
|
102
168
|
```ts
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
}
|
|
169
|
+
import { AlbexEngineWorker } from 'albex/worker';
|
|
170
|
+
|
|
171
|
+
const engine = new AlbexEngineWorker({
|
|
172
|
+
workerUrl: new URL('albex/worker-runtime', import.meta.url),
|
|
173
|
+
});
|
|
174
|
+
await engine.init();
|
|
110
175
|
```
|
|
111
176
|
|
|
112
|
-
|
|
177
|
+
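Mirrors the `AlbexEngine` surface, but asynchronously: methods that are synchronous on `AlbexEngine` (such as `search`) return a `Promise` here, and `searchCooperative` returns an async iterable. `replaceDocument` is not exposed on the worker wrapper.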
|
|
113
178
|
|
|
114
|
-
|
|
179
|
+
---
|
|
115
180
|
|
|
116
|
-
|
|
117
|
-
interface SearchResult {
|
|
118
|
-
documentName: string;
|
|
119
|
-
location: number; // paragraph (DOCX/TXT) or page (PDF, 1-based)
|
|
120
|
-
score: number; // 0–1000
|
|
121
|
-
snippet: string; // full chunk text (original, with accents)
|
|
122
|
-
matchStart: number; // byte offset of match in snippet
|
|
123
|
-
matchEnd: number; // exclusive
|
|
124
|
-
}
|
|
125
|
-
```
|
|
181
|
+
## Sharding across cores
|
|
126
182
|
|
|
127
|
-
|
|
183
|
+
For large corpora, an `AlbexPool` shards documents across N workers:
|
|
128
184
|
|
|
129
185
|
```ts
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
186
|
+
import { AlbexPool } from 'albex/pool';
|
|
187
|
+
|
|
188
|
+
const pool = new AlbexPool({
|
|
189
|
+
workerUrl: new URL('albex/worker-runtime', import.meta.url),
|
|
190
|
+
workers: 'auto', // = cores / 2, clamped [1, 8]
|
|
191
|
+
});
|
|
192
|
+
await pool.init();
|
|
193
|
+
|
|
194
|
+
await pool.indexFile(fileA); // sharded round-robin
|
|
195
|
+
const results = await pool.search('contrato'); // map-reduce
|
|
137
196
|
```
|
|
138
197
|
|
|
139
|
-
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## Big corpora — tiered storage
|
|
140
201
|
|
|
141
|
-
|
|
202
|
+
For corpora that exceed the in-memory capacity of the engine's WASM tier (see "Tier capacities" under "Advanced configuration"):
|
|
142
203
|
|
|
143
204
|
```ts
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
205
|
+
import { AlbexEngine, TieredStore } from 'albex';
|
|
206
|
+
|
|
207
|
+
const engine = new AlbexEngine();
|
|
208
|
+
await engine.init();
|
|
209
|
+
|
|
210
|
+
const store = new TieredStore(engine, { evictThreshold: 0.85 });
|
|
211
|
+
await store.init();
|
|
212
|
+
|
|
213
|
+
await store.indexFile(file); // persists original blob in OPFS
|
|
214
|
+
await store.promote('older-doc.pdf'); // brings warm doc back
|
|
152
215
|
```
|
|
153
216
|
|
|
154
|
-
|
|
217
|
+
Hot tier = engine. Warm tier = original files in OPFS. LRU eviction is automatic.
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## Advanced configuration
|
|
222
|
+
|
|
223
|
+
`new AlbexEngine()` covers the default case. The options below address
|
|
224
|
+
specific deployment needs:
|
|
225
|
+
|
|
226
|
+
### Tier auto-selection (`mini` / `std` / `pro` based on `deviceMemory`)
|
|
227
|
+
|
|
228
|
+
Albex ships **six** WASM variants of the main engine (3 tiers × baseline/SIMD).
|
|
229
|
+
By default it loads the std-baseline binary that comes with the npm package.
|
|
230
|
+
If you want runtime tier auto-selection, serve the variants yourself and
|
|
231
|
+
pass `wasmBaseUrl`:
|
|
155
232
|
|
|
156
233
|
```ts
|
|
157
|
-
engine
|
|
158
|
-
|
|
159
|
-
|
|
234
|
+
const engine = new AlbexEngine({
|
|
235
|
+
wasmBaseUrl: '/assets', // directory containing the 6 .wasm files
|
|
236
|
+
tier: 'auto', // picks mini/std/pro by deviceMemory
|
|
237
|
+
simd: 'auto', // picks baseline/simd by WASM probe
|
|
238
|
+
gpu: 'auto', // engages WebGPU when corpus > 20k chunks
|
|
239
|
+
});
|
|
160
240
|
```
|
|
161
241
|
|
|
162
|
-
|
|
242
|
+
Tier capacities:
|
|
163
243
|
|
|
164
|
-
|
|
244
|
+
| Tier | Max docs | Max chunks | Max text | Working set |
|
|
245
|
+
|-------|---------:|-----------:|---------:|------------:|
|
|
246
|
+
| mini | 32 | 25 000 | 4 MB | ~5 MB |
|
|
247
|
+
| std | 128 | 100 000 | 16 MB | ~20 MB |
|
|
248
|
+
| pro | 1 024 | 800 000 | 128 MB | ~160 MB |
|
|
249
|
+
|
|
250
|
+
### Custom CDN
|
|
251
|
+
|
|
252
|
+
```ts
|
|
253
|
+
const engine = new AlbexEngine({
|
|
254
|
+
wasmUrl: 'https://my-cdn.example.com/albex_wasm.wasm',
|
|
255
|
+
});
|
|
256
|
+
```
|
|
165
257
|
|
|
166
258
|
---
|
|
167
259
|
|
|
168
|
-
##
|
|
260
|
+
## Errors
|
|
261
|
+
|
|
262
|
+
All errors thrown by Albex extend `AlbexError`:
|
|
169
263
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
264
|
+
```ts
|
|
265
|
+
import {
|
|
266
|
+
AlbexError, AlbexInitError, AlbexParseError,
|
|
267
|
+
AlbexUnsupportedFormatError, AlbexCapacityError,
|
|
268
|
+
} from 'albex';
|
|
269
|
+
|
|
270
|
+
try {
|
|
271
|
+
await engine.indexFile(file);
|
|
272
|
+
} catch (e) {
|
|
273
|
+
if (e instanceof AlbexUnsupportedFormatError) {
|
|
274
|
+
console.warn(`Skipped .${e.ext} (unsupported)`);
|
|
275
|
+
} else if (e instanceof AlbexParseError) {
|
|
276
|
+
console.warn(`Parse failed for ${e.format}:`, e.message);
|
|
277
|
+
} else throw e;
|
|
278
|
+
}
|
|
279
|
+
```
|
|
177
280
|
|
|
178
|
-
|
|
281
|
+
Each error carries a `kind` field that survives `structuredClone` across worker boundaries.
|
|
179
282
|
|
|
180
283
|
---
|
|
181
284
|
|
|
182
285
|
## Browser requirements
|
|
183
286
|
|
|
184
|
-
- WebAssembly (
|
|
287
|
+
- WebAssembly (every browser since 2017)
|
|
185
288
|
- `DecompressionStream` for DOCX/XLSX (Chrome 80+, Firefox 113+, Safari 16.4+)
|
|
186
|
-
-
|
|
289
|
+
- OPFS for fastest persistence (Chrome 102+, Safari 15.2+, Firefox 111+); IndexedDB fallback works everywhere
|
|
290
|
+
- WebGPU is **optional**; without it the CPU path is the default
|
|
187
291
|
|
|
188
|
-
PDF support
|
|
292
|
+
PDF support requires `albex_pdf.wasm` to be served with MIME type `application/wasm`.
|
|
189
293
|
|
|
190
294
|
---
|
|
191
295
|
|
|
192
296
|
## Building from source
|
|
193
297
|
|
|
194
298
|
```bash
|
|
195
|
-
# Install Rust + wasm-pack
|
|
196
299
|
rustup target add wasm32-unknown-unknown
|
|
197
300
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
301
|
+
npm install
|
|
302
|
+
npm run build:all # 6 main variants + PDF + TypeScript
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
Partial builds:
|
|
201
306
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
307
|
+
```bash
|
|
308
|
+
npm run build:wasm # std baseline only
|
|
309
|
+
npm run build:wasm:tiers # all 6 variants
|
|
310
|
+
npm run build:pdf-wasm # PDF module
|
|
311
|
+
npm run build # TypeScript only
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
---
|
|
315
|
+
|
|
316
|
+
## Tests
|
|
205
317
|
|
|
206
|
-
|
|
207
|
-
|
|
318
|
+
```bash
|
|
319
|
+
# Rust unit tests
|
|
320
|
+
cargo test --manifest-path core/Cargo.toml
|
|
321
|
+
cargo test --manifest-path ingest/Cargo.toml
|
|
322
|
+
|
|
323
|
+
# TypeScript + WASM integration tests
|
|
324
|
+
npm test
|
|
325
|
+
|
|
326
|
+
# Micro-benchmarks
|
|
327
|
+
npm run bench
|
|
208
328
|
```
|
|
209
329
|
|
|
330
|
+
**About the benchmark.** The included bench probes per-operation overhead
|
|
331
|
+
on a 200-document synthetic corpus. It is **not** a corpus-level
|
|
332
|
+
performance claim — there is no representative real-world dataset checked
|
|
333
|
+
into the repo yet. Numbers from `npm run bench` should be read as
|
|
334
|
+
"this implementation does not regress against itself", not as comparisons
|
|
335
|
+
against other libraries.
|
|
336
|
+
|
|
337
|
+
CI runs every check on every push to `main`.
|
|
338
|
+
|
|
210
339
|
---
|
|
211
340
|
|
|
212
341
|
## Privacy
|
|
213
342
|
|
|
214
|
-
Albex
|
|
343
|
+
Albex never transmits document content. Text extraction, indexing, search and
|
|
344
|
+
persistence all happen inside the browser. The only network requests are the
|
|
345
|
+
initial fetches for the `.wasm` binaries (and the lazy PDF module on first
|
|
346
|
+
PDF). Persisted snapshots live in OPFS / IndexedDB, scoped to your origin.
|
|
215
347
|
|
|
216
348
|
---
|
|
217
349
|
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `AlbexEngineWorker` — a main-thread wrapper that runs the engine inside a
|
|
3
|
+
* Web Worker. Mirrors the surface of `AlbexEngine` so it can be swapped in
|
|
4
|
+
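* with minimal code changes: calls that are synchronous on `AlbexEngine` (such as `search`) return a `Promise` here and must be awaited.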
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
*
|
|
8
|
+
* const engine = new AlbexEngineWorker({
|
|
9
|
+
* wasmUrl: '/assets/albex_wasm_bg.wasm',
|
|
10
|
+
* pdfWasmUrl: '/assets/albex_pdf.wasm',
|
|
11
|
+
* // Provide the URL to the bundled worker runtime.
|
|
12
|
+
* workerUrl: new URL('./worker-runtime.js', import.meta.url),
|
|
13
|
+
* });
|
|
14
|
+
* await engine.init();
|
|
15
|
+
*
|
|
16
|
+
* Why: a `search()` over 100k chunks can take 10–50 ms. On the main thread that
|
|
17
|
+
* is visible jank for every keystroke. Off-main-thread keeps the UI at 60 fps.
|
|
18
|
+
*
|
|
19
|
+
* The runtime is single-threaded WASM, so requests are serialised: only one
|
|
20
|
+
* call is in flight at a time. This matches the actual `static mut` model
|
|
21
|
+
* inside the .wasm and is fine for an interactive search UI (each keystroke
|
|
22
|
+
* replaces the previous query).
|
|
23
|
+
*/
|
|
24
|
+
import type { AlbexOptions, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
|
|
25
|
+
export interface AlbexWorkerOptions extends AlbexOptions {
|
|
26
|
+
/** URL to the bundled worker runtime script (worker-runtime.js). */
|
|
27
|
+
workerUrl: string | URL;
|
|
28
|
+
}
|
|
29
|
+
export declare class AlbexEngineWorker {
|
|
30
|
+
private readonly _opts;
|
|
31
|
+
private _worker;
|
|
32
|
+
private _nextId;
|
|
33
|
+
private _pending;
|
|
34
|
+
private _docsCache;
|
|
35
|
+
constructor(opts: AlbexWorkerOptions);
|
|
36
|
+
init(): Promise<void>;
|
|
37
|
+
private _send;
|
|
38
|
+
indexFile(file: File): Promise<IndexedDocument>;
|
|
39
|
+
search(query: string, opts?: SearchOptions): Promise<SearchResult[]>;
|
|
40
|
+
/**
|
|
41
|
+
* Cooperative variant of `search`. Today the wire still sends a single
|
|
42
|
+
* batch — the result array is fetched in one round-trip from the worker
|
|
43
|
+
* and then exposed as an async iterator so callers can `break` early.
|
|
44
|
+
* A future iteration may use a `MessagePort` to stream individual results
|
|
45
|
+
* from the worker side; the iterator shape is preserved across that
|
|
46
|
+
* transition.
|
|
47
|
+
*/
|
|
48
|
+
searchCooperative(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
|
|
49
|
+
/**
|
|
50
|
+
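* @deprecated Use `searchCooperative` instead. Renamed in 0.3.0; removal was announced for 0.4.0, but the alias is still declared in this release and may be removed in a future one.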
|
|
51
|
+
*/
|
|
52
|
+
searchStream(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
|
|
53
|
+
removeDocument(id: string): Promise<boolean>;
|
|
54
|
+
compact(): Promise<void>;
|
|
55
|
+
reset(): Promise<void>;
|
|
56
|
+
getStats(): Promise<EngineStats>;
|
|
57
|
+
getLastSearchStats(): Promise<SearchStats | null>;
|
|
58
|
+
getDocuments(): Promise<readonly IndexedDocument[]>;
|
|
59
|
+
setMaxErrors(n: 0 | 1 | 2 | 3): Promise<void>;
|
|
60
|
+
setThreshold(n: number): Promise<void>;
|
|
61
|
+
setMaxResults(n: number): Promise<void>;
|
|
62
|
+
setLanguage(lang: 'off' | 'es'): Promise<void>;
|
|
63
|
+
save(name: string): Promise<void>;
|
|
64
|
+
load(name: string): Promise<boolean>;
|
|
65
|
+
loadOrInit(name: string): Promise<boolean>;
|
|
66
|
+
deleteSnapshot(name: string): Promise<void>;
|
|
67
|
+
listSnapshots(): Promise<string[]>;
|
|
68
|
+
[Symbol.dispose](): void;
|
|
69
|
+
}
|
|
70
|
+
//# sourceMappingURL=albex-worker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"albex-worker.d.ts","sourceRoot":"","sources":["../src/albex-worker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EACV,YAAY,EACZ,eAAe,EACf,aAAa,EACb,YAAY,EACZ,WAAW,EACX,WAAW,EACZ,MAAM,YAAY,CAAC;AAcpB,MAAM,WAAW,kBAAmB,SAAQ,YAAY;IACtD,oEAAoE;IACpE,SAAS,EAAE,MAAM,GAAG,GAAG,CAAC;CACzB;AASD,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAqB;IAC3C,OAAO,CAAC,OAAO,CAAU;IACzB,OAAO,CAAC,OAAO,CAAK;IACpB,OAAO,CAAC,QAAQ,CAA8B;IAC9C,OAAO,CAAC,UAAU,CAAyB;gBAE/B,IAAI,EAAE,kBAAkB;IAI9B,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAsB3B,OAAO,CAAC,KAAK;IASP,SAAS,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO,CAAC,eAAe,CAAC;IAWrD,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAIxE;;;;;;;OAOG;IACI,iBAAiB,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,aAAkB,GAAG,aAAa,CAAC,YAAY,CAAC;IAK9F;;OAEG;IACI,YAAY,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,aAAkB,GAAG,aAAa,CAAC,YAAY,CAAC;IAQnF,cAAc,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAM5C,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IACxB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAK5B,QAAQ,IAAa,OAAO,CAAC,WAAW,CAAC;IACzC,kBAAkB,IAAI,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC;IACjD,YAAY,IAAU,OAAO,CAAC,SAAS,eAAe,EAAE,CAAC;IAEnD,YAAY,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IAC7C,YAAY,CAAC,CAAC,EAAE,MAAM,GAAS,OAAO,CAAC,IAAI,CAAC;IAC5C,aAAa,CAAC,CAAC,EAAE,MAAM,GAAQ,OAAO,CAAC,IAAI,CAAC;IAC5C,WAAW,CAAC,IAAI,EAAE,KAAK,GAAG,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAG9C,IAAI,CAAC,IAAI,EAAE,MAAM,GAAa,OAAO,CAAC,IAAI,CAAC;IAC3C,IAAI,CAAC,IAAI,EAAE,MAAM,GAAa,OAAO,CAAC,OAAO,CAAC;IAC9C,UAAU,CAAC,IAAI,EAAE,MAAM,GAAO,OAAO,CAAC,OAAO,CAAC;IAC9C,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAC3C,aAAa,IAAiB,OAAO,CAAC,MAAM,EAAE,CAAC;IAErD,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,IAAI;CAMzB"}
|