albex 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +141 -0
- package/README.md +242 -112
- package/dist/albex-worker.d.ts +70 -0
- package/dist/albex-worker.d.ts.map +1 -0
- package/dist/albex-worker.js +153 -0
- package/dist/albex-worker.js.map +1 -0
- package/dist/albex.d.ts +368 -6
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +1692 -95
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +38 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +63 -0
- package/dist/errors.js.map +1 -0
- package/dist/gpu/bloom-runtime.d.ts +60 -0
- package/dist/gpu/bloom-runtime.d.ts.map +1 -0
- package/dist/gpu/bloom-runtime.js +176 -0
- package/dist/gpu/bloom-runtime.js.map +1 -0
- package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
- package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
- package/dist/gpu/bloom-shader.wgsl.js +49 -0
- package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
- package/dist/persistence.d.ts +21 -0
- package/dist/persistence.d.ts.map +1 -0
- package/dist/persistence.js +174 -0
- package/dist/persistence.js.map +1 -0
- package/dist/pool/coordinator.d.ts +98 -0
- package/dist/pool/coordinator.d.ts.map +1 -0
- package/dist/pool/coordinator.js +247 -0
- package/dist/pool/coordinator.js.map +1 -0
- package/dist/profile.d.ts +95 -0
- package/dist/profile.d.ts.map +1 -0
- package/dist/profile.js +207 -0
- package/dist/profile.js.map +1 -0
- package/dist/resource-manager.d.ts +56 -0
- package/dist/resource-manager.d.ts.map +1 -0
- package/dist/resource-manager.js +138 -0
- package/dist/resource-manager.js.map +1 -0
- package/dist/tiered-store.d.ts +98 -0
- package/dist/tiered-store.d.ts.map +1 -0
- package/dist/tiered-store.js +238 -0
- package/dist/tiered-store.js.map +1 -0
- package/dist/wasm-bindings.d.ts +139 -0
- package/dist/wasm-bindings.d.ts.map +1 -0
- package/dist/wasm-bindings.js +33 -0
- package/dist/wasm-bindings.js.map +1 -0
- package/dist/worker-protocol.d.ts +86 -0
- package/dist/worker-protocol.d.ts.map +1 -0
- package/dist/worker-protocol.js +20 -0
- package/dist/worker-protocol.js.map +1 -0
- package/dist/worker-runtime.d.ts +14 -0
- package/dist/worker-runtime.d.ts.map +1 -0
- package/dist/worker-runtime.js +100 -0
- package/dist/worker-runtime.js.map +1 -0
- package/package.json +56 -13
- package/src/albex-worker.ts +187 -0
- package/src/albex.ts +1845 -130
- package/src/errors.ts +60 -0
- package/src/gpu/bloom-runtime.ts +229 -0
- package/src/gpu/bloom-shader.wgsl.ts +48 -0
- package/src/persistence.ts +175 -0
- package/src/pool/coordinator.ts +324 -0
- package/src/profile.ts +279 -0
- package/src/resource-manager.ts +167 -0
- package/src/tiered-store.ts +259 -0
- package/src/wasm-bindings.ts +200 -0
- package/src/worker-protocol.ts +48 -0
- package/src/worker-runtime.ts +96 -0
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_std.wasm +0 -0
- package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to Albex are documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and Albex follows [Semantic Versioning](https://semver.org/).
|
|
7
|
+
|
|
8
|
+
## [0.3.0] — 2026-05-30
|
|
9
|
+
|
|
10
|
+
### Hybrid PDF OCR (opt-in)
|
|
11
|
+
|
|
12
|
+
- New `@albex/ocr` option `alwaysExtractEmbeddedImages: boolean` (default
|
|
13
|
+
`false`). When enabled, the engine OCRs the embedded images of EVERY
|
|
14
|
+
PDF on top of the normal text extraction — catching text that lives
|
|
15
|
+
only inside scanned annexes, stamps, signatures, or screenshots inside
|
|
16
|
+
otherwise-native PDFs.
|
|
17
|
+
- Demo exposes the flag as a checkbox in the OCR panel; status shows
|
|
18
|
+
`ready (spa, hybrid)` when active.
|
|
19
|
+
|
|
20
|
+
### PDF parse-crash → OCR fallback
|
|
21
|
+
|
|
22
|
+
- When `extractPdf` traps (pdf-extract crashes on a PDF that other tools
|
|
23
|
+
read fine), the engine now re-instantiates the WASM and tries the
|
|
24
|
+
lopdf-only image-extraction path before throwing. With OCR wired, many
|
|
25
|
+
formerly "unsupported encoding" PDFs become searchable.
|
|
26
|
+
- Error message updated: instead of misleading "the file may be
|
|
27
|
+
malformed", users see clear guidance pointing at OCR as the recovery
|
|
28
|
+
path.
|
|
29
|
+
|
|
30
|
+
### Demo sandbox
|
|
31
|
+
|
|
32
|
+
- Importmap to jsDelivr for `tesseract.js` (only loaded when the user
|
|
33
|
+
enables OCR).
|
|
34
|
+
- Full OCR panel: language select, hybrid-mode checkbox, lifecycle
|
|
35
|
+
status.
|
|
36
|
+
- Two fixture PDFs in `demo/fixtures/` for end-to-end testing:
|
|
37
|
+
`hybrid-test.pdf` (vector text + embedded image with text) and
|
|
38
|
+
`scanned-only-test.pdf` (100% image, no vector text).
|
|
39
|
+
- Global `window.onerror` + `unhandledrejection` handlers so Tesseract
|
|
40
|
+
worker aborts surface as Log entries instead of crashing the page.
|
|
41
|
+
- New `npm run serve` script wraps `npx serve -p 5173` for reproducible
|
|
42
|
+
local testing.
|
|
43
|
+
|
|
44
|
+
### Breaking changes
|
|
45
|
+
|
|
46
|
+
- **`searchStream` renamed to `searchCooperative`.** The original name
|
|
47
|
+
implied incremental streaming, which the method never provided — it
|
|
48
|
+
yields to the scheduler between slices and then returns a batch.
|
|
49
|
+
The new name is honest. `searchStream` is kept as a deprecated alias
|
|
50
|
+
on `AlbexEngine`, `AlbexEngineWorker` and `AlbexPool`; it logs a
|
|
51
|
+
one-time `console.warn` on first call and will be removed in 0.4.0.
|
|
52
|
+
|
|
53
|
+
- **Snapshot format bumped to v2.** Existing v1 snapshots still load —
|
|
54
|
+
their documents come back with empty `contentHash` strings, same as
|
|
55
|
+
before. On the next `save()` they are rewritten as v2. No data loss;
|
|
56
|
+
no migration step required.
|
|
57
|
+
|
|
58
|
+
### Added
|
|
59
|
+
|
|
60
|
+
- **Scanned-PDF OCR fallback.** When `extractPdf` returns `-2` (image-
|
|
61
|
+
only PDF) AND `@albex/ocr` has been wired via `enableOcr(engine)`,
|
|
62
|
+
the engine now extracts embedded JPEG / JPEG2000 image XObjects from
|
|
63
|
+
the PDF and runs them through Tesseract.js to recover text. Covers the
|
|
64
|
+
great majority of real-world scanned PDFs. Other compression filters
|
|
65
|
+
(FlateDecode, CCITTFaxDecode, JBIG2Decode) are not yet supported; pages
|
|
66
|
+
using them register with zero chunks (same behaviour as before).
|
|
67
|
+
|
|
68
|
+
- **`getPageCount`, `extractPageImages`, `getPageImage{Len,Ptr,Kind}`**
|
|
69
|
+
added to `albex_pdf.wasm` to support the scanned-PDF path. The PDF
|
|
70
|
+
binary grew from ~1.04 MB to ~1.19 MB.
|
|
71
|
+
|
|
72
|
+
- **Snapshot v2 persists per-document content hashes.** `load()` now
|
|
73
|
+
repopulates the in-memory `_docs` list correctly: `getStats().documents`
|
|
74
|
+
is right after a restore, and content-hash de-duplication survives the
|
|
75
|
+
round-trip (re-indexing the same file does not create a fresh slot).
|
|
76
|
+
|
|
77
|
+
- **New WASM exports**: `setDocumentContentHash`, `getDocContentHashPtr`,
|
|
78
|
+
`getDocContentHashLen`. Used by the host to round-trip the FNV-1a 64-bit
|
|
79
|
+
hash through the snapshot format.
|
|
80
|
+
|
|
81
|
+
- **OCR sandbox in the demo.** `demo/index.html` now ships an "Enable
|
|
82
|
+
OCR" panel that lazy-loads Tesseract.js through an importmap and
|
|
83
|
+
exposes per-document OCR status. Drop a scanned PDF and the demo OCRs
|
|
84
|
+
it automatically.
|
|
85
|
+
|
|
86
|
+
### Fixed
|
|
87
|
+
|
|
88
|
+
- **`load()` repopulates `_docs` from the WASM tables.** Previously it
|
|
89
|
+
left `_docs = []` after a successful restore, which made
|
|
90
|
+
`engine.getStats().documents` return `0` even though searches against
|
|
91
|
+
the restored corpus worked. The README advertised "snapshot the index
|
|
92
|
+
and restore it" without that caveat.
|
|
93
|
+
|
|
94
|
+
- **CSV parser strips the UTF-8 BOM.** Files exported as "CSV UTF-8"
|
|
95
|
+
by Excel kept the BOM glued to the first field of the first row,
|
|
96
|
+
breaking column alignment and search hits on the first header
|
|
97
|
+
("Subject", "Asunto", etc.).
|
|
98
|
+
|
|
99
|
+
- **EML parser decodes `base64` and `quoted-printable` bodies.** Real
|
|
100
|
+
emails almost always use one of these transfer encodings; before the
|
|
101
|
+
fix the body surfaced as opaque encoded blobs that searches could
|
|
102
|
+
never hit. Nested multipart (`multipart/alternative` inside
|
|
103
|
+
`multipart/mixed`) is now also unwrapped recursively.
|
|
104
|
+
|
|
105
|
+
- **RTF parser decodes `\'XX` hex bytes (via Windows-1252) and `\uN ?`
|
|
106
|
+
Unicode escapes.** Spanish/French/German content stored as cp1252
|
|
107
|
+
used to lose every accent; Word's modern `\u` escapes used to eat
|
|
108
|
+
the fallback ASCII character. Also added `\emdash`, `\endash`,
|
|
109
|
+
`\bullet`, `\lquote`, `\rquote`, `\ldblquote`, `\rdblquote`, `\tab`,
|
|
110
|
+
and soft-hyphen/non-breaking-space handling.
|
|
111
|
+
|
|
112
|
+
### Documentation
|
|
113
|
+
|
|
114
|
+
- **README claims grounded.** Removed "every modern bundler",
|
|
115
|
+
"60 fps even on huge corpora", "5–10× speedup", "works for 99 % of
|
|
116
|
+
users", "11 formats" (without the `lite` qualifier). The matrix of
|
|
117
|
+
what is tested vs what is expected to work is now explicit. Bench
|
|
118
|
+
results are flagged as synthetic.
|
|
119
|
+
|
|
120
|
+
- **Persistence caveats documented.** The `Persistence` feature bullet
|
|
121
|
+
now describes the v2 / v1 difference and what survives the round trip.
|
|
122
|
+
|
|
123
|
+
### Tests
|
|
124
|
+
|
|
125
|
+
- 71 → 83 vitest tests, all green. New suites:
|
|
126
|
+
- `tests/scanned-pdf.test.ts` (4 tests) — scanned-PDF OCR fallback
|
|
127
|
+
with a hand-rolled `FakePdfWasm`.
|
|
128
|
+
- `tests/load-restores-docs.test.ts` (4 tests) — verifies `load()`
|
|
129
|
+
repopulates `_docs` and that content-hash dedup survives v2.
|
|
130
|
+
- `tests/lite-parsers.test.ts` (11 tests) — adversarial fixtures for
|
|
131
|
+
CSV BOM, EML base64 / QP / nested multipart, RTF cp1252 / Unicode.
|
|
132
|
+
|
|
133
|
+
## [0.2.0] — earlier
|
|
134
|
+
|
|
135
|
+
Initial public release. See git history for details: the surface was
|
|
136
|
+
the `AlbexEngine` class, the `albex_wasm_bg.wasm` and `albex_pdf.wasm`
|
|
137
|
+
binaries, lite parsers for the 11 formats, OPFS/IndexedDB persistence,
|
|
138
|
+
worker pool, tiered storage and optional WebGPU pre-filter.
|
|
139
|
+
|
|
140
|
+
[0.3.0]: https://github.com/RafaCalRob/Albex/releases/tag/v0.3.0
|
|
141
|
+
[0.2.0]: https://github.com/RafaCalRob/Albex/releases/tag/v0.2.0
|
package/README.md
CHANGED
|
@@ -1,31 +1,27 @@
|
|
|
1
1
|
# Albex
|
|
2
2
|
|
|
3
|
-
Local full-text search for documents. Runs entirely in the browser — no server,
|
|
3
|
+
Local full-text search for documents. Runs entirely in the browser — no server,
|
|
4
|
+
no upload, no network request after the initial load.
|
|
4
5
|
|
|
5
|
-
Drop a DOCX, PDF, XLSX,
|
|
6
|
+
Drop a DOCX, PDF, XLSX, HTML, Markdown, JSON, CSV, EML, RTF, TXT, or XML file,
|
|
7
|
+
start typing, get results in milliseconds.
|
|
6
8
|
|
|
7
9
|
---
|
|
8
10
|
|
|
9
|
-
##
|
|
10
|
-
|
|
11
|
-
- **Zero server** — all text stays on the user's machine.
|
|
12
|
-
- **Fuzzy matching** — finds "contrato" even if you type "conttrato" (adaptive edit distance).
|
|
13
|
-
- **Accent-insensitive** — "accion" matches "acción", "espana" matches "España".
|
|
14
|
-
- **Multi-format** — DOCX, XLSX, PDF (text-based), TXT, XML.
|
|
15
|
-
- **Phrase search** — `"contrato marco"` requires the words to appear together.
|
|
16
|
-
- **OR search** — `contrato | acuerdo` unions two independent searches.
|
|
17
|
-
- **No dependencies** — one TypeScript file, two WASM binaries, nothing else.
|
|
18
|
-
- **Tiny footprint** — main WASM is ~14 KB on disk; PDF module (~1 MB) loads on demand.
|
|
19
|
-
|
|
20
|
-
---
|
|
21
|
-
|
|
22
|
-
## Installation
|
|
11
|
+
## Install
|
|
23
12
|
|
|
24
13
|
```bash
|
|
25
14
|
npm install albex
|
|
26
15
|
```
|
|
27
16
|
|
|
28
|
-
|
|
17
|
+
The WASM binary ships inside the package. Bundlers that recognise the
|
|
18
|
+
`new URL('…', import.meta.url)` pattern (Vite, Webpack 5+, esbuild, Rollup,
|
|
19
|
+
Parcel 2) copy it to the output and rewrite the URL automatically.
|
|
20
|
+
|
|
21
|
+
Matrix-tested in CI today: **Vite** and **Node** (via the test suite).
|
|
22
|
+
Other bundlers and runtimes (Next SSR, Bun, Deno) should work through the
|
|
23
|
+
same pattern but are not currently exercised by the test matrix — if you
|
|
24
|
+
hit a problem, open an issue.
|
|
29
25
|
|
|
30
26
|
---
|
|
31
27
|
|
|
@@ -34,184 +30,318 @@ Or copy `dist/albex.js`, `wasm/pkg/albex_wasm_bg.wasm` (and optionally `albex_pd
|
|
|
34
30
|
```ts
|
|
35
31
|
import { AlbexEngine } from 'albex';
|
|
36
32
|
|
|
37
|
-
const engine = new AlbexEngine(
|
|
38
|
-
wasmUrl: '/assets/albex_wasm_bg.wasm',
|
|
39
|
-
pdfWasmUrl: '/assets/albex_pdf.wasm', // only needed for PDFs
|
|
40
|
-
});
|
|
41
|
-
|
|
33
|
+
const engine = new AlbexEngine();
|
|
42
34
|
await engine.init();
|
|
43
35
|
|
|
44
|
-
// Index a file from
|
|
36
|
+
// Index a file from <input type="file"> or drag-and-drop.
|
|
45
37
|
const file = inputElement.files[0];
|
|
46
38
|
const doc = await engine.indexFile(file);
|
|
47
39
|
console.log(`Indexed ${doc.chunks} chunks in ${doc.indexTimeMs.toFixed(0)} ms`);
|
|
48
40
|
|
|
49
|
-
// Search
|
|
41
|
+
// Search.
|
|
50
42
|
const results = engine.search('contrato marco');
|
|
51
43
|
for (const r of results) {
|
|
52
44
|
console.log(`[${r.score}] ${r.documentName} — ${r.snippet}`);
|
|
53
45
|
}
|
|
54
46
|
```
|
|
55
47
|
|
|
48
|
+
Cooperative search — yields to the scheduler between slices so the UI thread
|
|
49
|
+
still gets a chance to paint while a long search runs:
|
|
50
|
+
|
|
51
|
+
```ts
|
|
52
|
+
for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
|
|
53
|
+
renderResult(r);
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
`searchCooperative` yields results of the same shape as `search` returns. The "stream" is not
|
|
58
|
+
incremental yet — results arrive in one batch after the search completes,
|
|
59
|
+
but the work is split into frame-budget slices that yield to the scheduler.
|
|
60
|
+
Real incremental streaming is on the backlog.
|
|
61
|
+
|
|
62
|
+
That's the entire onboarding. Read on for what else the engine can do.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Features
|
|
67
|
+
|
|
68
|
+
- **Zero server** — all text stays on the user's machine.
|
|
69
|
+
- **Bundler-friendly default** — `new AlbexEngine()` works without extra
|
|
70
|
+
configuration in bundlers that recognise the `new URL(..., import.meta.url)`
|
|
71
|
+
asset pattern (see the "Install" section for the tested matrix).
|
|
72
|
+
- **Fuzzy matching** — finds `"contrato"` even if you type `"conttrato"` (Bitap with adaptive edit distance).
|
|
73
|
+
- **Accent-insensitive** — `"accion"` matches `"acción"`, `"espana"` matches `"España"`, plus Latin Extended (Polish, Czech, Slovak, Turkish…).
|
|
74
|
+
- **11 formats with varying depth** — DOCX · XLSX · PDF · HTML · MD · JSON · CSV · EML · RTF · TXT · XML. See the support table below; several formats are deliberately "lite" (CSV is RFC-4180-lite, EML is MIME-lite, RTF is regex-stripped, etc.).
|
|
75
|
+
- **Phrase + OR queries** — `"contrato marco"` and `contrato | acuerdo` work out of the box.
|
|
76
|
+
- **Cooperative search** — `searchCooperative(query, { frameBudgetMs })` yields to the scheduler between slices. Results land in one batch (real incremental streaming is on the backlog).
|
|
77
|
+
- **Persistence** — snapshot the index to OPFS / IndexedDB and restore it. After `load()`, `engine.getStats().documents` is correct, `engine.search()` works against the restored corpus, and content-hash de-duplication survives the round-trip (snapshot v2). Older v1 snapshots still load — their docs come back with empty content hashes, so re-indexing the same files will create fresh slots until the next `save()` rewrites the snapshot as v2.
|
|
78
|
+
- **Incremental updates** — `removeDocument`, `replaceDocument`, `compact`. Content-hash dedup is automatic.
|
|
79
|
+
- **Resource aware** — pauses speculative work in background tabs, shrinks workers on low battery, defers PDF download on slow networks.
|
|
80
|
+
- **Off-main-thread** — `AlbexEngineWorker` mirror or `AlbexPool` shard across N workers (map-reduce search).
|
|
81
|
+
- **WebGPU pre-filter** — experimental, opt-in (`gpu: 'auto'`). Implemented for corpora over 20 k chunks; no reproducible speedup number yet — the bench in this repo runs on a 200-document synthetic corpus only.
|
|
82
|
+
- **SIMD opportunistic** — picks a SIMD-accelerated variant when the host supports v128.
|
|
83
|
+
- **Tiered storage** — `TieredStore` keeps recent docs hot, evicts cold ones to OPFS, promotes on demand.
|
|
84
|
+
- **Typed errors** — `AlbexParseError`, `AlbexUnsupportedFormatError`, `AlbexCapacityError`, `AlbexInitError`. All extend `AlbexError`.
|
|
85
|
+
- **Tiny core** — main WASM 24 KB (27 KB SIMD). PDF module (~1.2 MB) loads on demand. The OCR companion (`@albex/ocr`) is a separate package and pulls Tesseract.js (~3.5 MB) only when you call `enableOcr()`.
|
|
86
|
+
|
|
56
87
|
---
|
|
57
88
|
|
|
58
89
|
## Supported formats
|
|
59
90
|
|
|
60
|
-
| Extension
|
|
61
|
-
|
|
62
|
-
| `.docx`
|
|
63
|
-
| `.xlsx`
|
|
64
|
-
| `.pdf`
|
|
65
|
-
| `.
|
|
66
|
-
| `.
|
|
91
|
+
| Extension | How text is extracted |
|
|
92
|
+
|--------------------|-----------------------|
|
|
93
|
+
| `.docx` | Native Rust/WASM XML parser — streams `word/document.xml` |
|
|
94
|
+
| `.xlsx` | Native Rust/WASM XML parser — shared strings + inline strings |
|
|
95
|
+
| `.pdf` | Separate `albex_pdf.wasm` (pure Rust, loaded on demand) |
|
|
96
|
+
| `.md` / `.markdown`| TS parser — strips CommonMark marks |
|
|
97
|
+
| `.html` / `.htm` | TS parser — strips `<script>` / `<style>`, paragraphs at block boundaries |
|
|
98
|
+
| `.json` | TS parser — recursive walk over keys + string leaves |
|
|
99
|
+
| `.csv` | TS parser — RFC 4180 lite; one row per chunk |
|
|
100
|
+
| `.eml` | TS parser — MIME-lite: From/To/Subject + text/plain body |
|
|
101
|
+
| `.rtf` | TS parser — strips control words / groups |
|
|
102
|
+
| `.txt` | Plain text split on double newlines |
|
|
103
|
+
| `.xml` | Tag-stripped, entity-decoded |
|
|
67
104
|
|
|
68
105
|
---
|
|
69
106
|
|
|
70
107
|
## Query syntax
|
|
71
108
|
|
|
72
|
-
| Input
|
|
73
|
-
|
|
74
|
-
| `contrato`
|
|
75
|
-
| `contrato marco`
|
|
76
|
-
| `"contrato marco"`
|
|
77
|
-
| `contrato \| acuerdo` | OR:
|
|
109
|
+
| Input | Behaviour |
|
|
110
|
+
|----------------------|-----------|
|
|
111
|
+
| `contrato` | Fuzzy match, accent-insensitive |
|
|
112
|
+
| `contrato marco` | Both words must appear in the same chunk |
|
|
113
|
+
| `"contrato marco"` | Both words AND they must be adjacent (phrase) |
|
|
114
|
+
| `contrato \| acuerdo` | OR: union of results matching either branch |
|
|
78
115
|
|
|
79
116
|
Up to 4 space-separated tokens per simple/phrase query. OR branches are unlimited.
|
|
80
117
|
|
|
81
118
|
---
|
|
82
119
|
|
|
83
|
-
## API
|
|
84
|
-
|
|
85
|
-
### `new AlbexEngine(opts)`
|
|
120
|
+
## API at a glance
|
|
86
121
|
|
|
87
122
|
```ts
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
123
|
+
// Construct
|
|
124
|
+
const engine = new AlbexEngine();
|
|
125
|
+
await engine.init();
|
|
126
|
+
|
|
127
|
+
// Indexing
|
|
128
|
+
const doc = await engine.indexFile(file);
|
|
129
|
+
|
|
130
|
+
// Search (synchronous fast path)
|
|
131
|
+
const results = engine.search('contrato', { windowed: true });
|
|
132
|
+
|
|
133
|
+
// Cooperative search (yields to the scheduler between slices)
|
|
134
|
+
for await (const r of engine.searchCooperative('contrato', { frameBudgetMs: 8 })) {
|
|
135
|
+
/* … */
|
|
91
136
|
}
|
|
137
|
+
|
|
138
|
+
// Incremental updates
|
|
139
|
+
engine.removeDocument('contract.pdf');
|
|
140
|
+
await engine.replaceDocument('contract.pdf', newFile);
|
|
141
|
+
engine.compact();
|
|
142
|
+
|
|
143
|
+
// Persistence (OPFS or IndexedDB)
|
|
144
|
+
await engine.save('my-corpus');
|
|
145
|
+
await engine.loadOrInit('my-corpus');
|
|
146
|
+
|
|
147
|
+
// Tuning
|
|
148
|
+
engine.setMaxErrors(2);
|
|
149
|
+
engine.setThreshold(400);
|
|
150
|
+
engine.setMaxResults(50);
|
|
151
|
+
engine.setLanguage('es');
|
|
152
|
+
|
|
153
|
+
// Introspection
|
|
154
|
+
const stats = engine.getStats();
|
|
155
|
+
const lastRun = engine.getLastSearchStats();
|
|
92
156
|
```
|
|
93
157
|
|
|
94
|
-
|
|
158
|
+
Full API reference and types: [bdovenbird.com/albex/docs](https://bdovenbird.com/albex/docs).
|
|
95
159
|
|
|
96
|
-
|
|
160
|
+
---
|
|
97
161
|
|
|
98
|
-
|
|
162
|
+
## Off the main thread
|
|
99
163
|
|
|
100
|
-
|
|
164
|
+
For interactive search UIs, run the engine inside a Web Worker:
|
|
101
165
|
|
|
102
166
|
```ts
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
}
|
|
167
|
+
import { AlbexEngineWorker } from 'albex/worker';
|
|
168
|
+
|
|
169
|
+
const engine = new AlbexEngineWorker({
|
|
170
|
+
workerUrl: new URL('albex/worker-runtime', import.meta.url),
|
|
171
|
+
});
|
|
172
|
+
await engine.init();
|
|
110
173
|
```
|
|
111
174
|
|
|
112
|
-
|
|
175
|
+
Same surface as `AlbexEngine`, but asynchronous: methods return a `Promise`, and `searchCooperative` returns an `AsyncIterable`.
|
|
113
176
|
|
|
114
|
-
|
|
177
|
+
---
|
|
115
178
|
|
|
116
|
-
|
|
117
|
-
interface SearchResult {
|
|
118
|
-
documentName: string;
|
|
119
|
-
location: number; // paragraph (DOCX/TXT) or page (PDF, 1-based)
|
|
120
|
-
score: number; // 0–1000
|
|
121
|
-
snippet: string; // full chunk text (original, with accents)
|
|
122
|
-
matchStart: number; // byte offset of match in snippet
|
|
123
|
-
matchEnd: number; // exclusive
|
|
124
|
-
}
|
|
125
|
-
```
|
|
179
|
+
## Sharding across cores
|
|
126
180
|
|
|
127
|
-
|
|
181
|
+
For large corpora, an `AlbexPool` shards documents across N workers:
|
|
128
182
|
|
|
129
183
|
```ts
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
184
|
+
import { AlbexPool } from 'albex/pool';
|
|
185
|
+
|
|
186
|
+
const pool = new AlbexPool({
|
|
187
|
+
workerUrl: new URL('albex/worker-runtime', import.meta.url),
|
|
188
|
+
workers: 'auto', // = cores / 2, clamped [1, 8]
|
|
189
|
+
});
|
|
190
|
+
await pool.init();
|
|
191
|
+
|
|
192
|
+
await pool.indexFile(fileA); // sharded round-robin
|
|
193
|
+
const results = await pool.search('contrato'); // map-reduce
|
|
137
194
|
```
|
|
138
195
|
|
|
139
|
-
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## Big corpora — tiered storage
|
|
140
199
|
|
|
141
|
-
|
|
200
|
+
For workloads that exceed the tier's RAM capacity:
|
|
142
201
|
|
|
143
202
|
```ts
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
203
|
+
import { AlbexEngine, TieredStore } from 'albex';
|
|
204
|
+
|
|
205
|
+
const engine = new AlbexEngine();
|
|
206
|
+
await engine.init();
|
|
207
|
+
|
|
208
|
+
const store = new TieredStore(engine, { evictThreshold: 0.85 });
|
|
209
|
+
await store.init();
|
|
210
|
+
|
|
211
|
+
await store.indexFile(file); // persists original blob in OPFS
|
|
212
|
+
await store.promote('older-doc.pdf'); // brings warm doc back
|
|
152
213
|
```
|
|
153
214
|
|
|
154
|
-
|
|
215
|
+
Hot tier = engine. Warm tier = original files in OPFS. LRU eviction is automatic.
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Advanced configuration
|
|
220
|
+
|
|
221
|
+
`new AlbexEngine()` covers the default case. The options below address
|
|
222
|
+
specific deployment needs:
|
|
223
|
+
|
|
224
|
+
### Tier auto-selection (`mini` / `std` / `pro` based on `deviceMemory`)
|
|
225
|
+
|
|
226
|
+
Albex ships **six** WASM variants of the main engine (3 tiers × baseline/SIMD).
|
|
227
|
+
By default it loads the std-baseline binary that comes with the npm package.
|
|
228
|
+
If you want runtime tier auto-selection, serve the variants yourself and
|
|
229
|
+
pass `wasmBaseUrl`:
|
|
155
230
|
|
|
156
231
|
```ts
|
|
157
|
-
engine
|
|
158
|
-
|
|
159
|
-
|
|
232
|
+
const engine = new AlbexEngine({
|
|
233
|
+
wasmBaseUrl: '/assets', // directory containing the 6 .wasm files
|
|
234
|
+
tier: 'auto', // picks mini/std/pro by deviceMemory
|
|
235
|
+
simd: 'auto', // picks baseline/simd by WASM probe
|
|
236
|
+
gpu: 'auto', // engages WebGPU when corpus > 20k chunks
|
|
237
|
+
});
|
|
160
238
|
```
|
|
161
239
|
|
|
162
|
-
|
|
240
|
+
Tier capacities:
|
|
163
241
|
|
|
164
|
-
|
|
242
|
+
| Tier | Max docs | Max chunks | Max text | Working set |
|
|
243
|
+
|-------|---------:|-----------:|---------:|------------:|
|
|
244
|
+
| mini | 32 | 25 000 | 4 MB | ~5 MB |
|
|
245
|
+
| std | 128 | 100 000 | 16 MB | ~20 MB |
|
|
246
|
+
| pro | 1 024 | 800 000 | 128 MB | ~160 MB |
|
|
247
|
+
|
|
248
|
+
### Custom CDN
|
|
249
|
+
|
|
250
|
+
```ts
|
|
251
|
+
const engine = new AlbexEngine({
|
|
252
|
+
wasmUrl: 'https://my-cdn.example.com/albex_wasm.wasm',
|
|
253
|
+
});
|
|
254
|
+
```
|
|
165
255
|
|
|
166
256
|
---
|
|
167
257
|
|
|
168
|
-
##
|
|
258
|
+
## Errors
|
|
259
|
+
|
|
260
|
+
All errors thrown by Albex extend `AlbexError`:
|
|
169
261
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
262
|
+
```ts
|
|
263
|
+
import {
|
|
264
|
+
AlbexError, AlbexInitError, AlbexParseError,
|
|
265
|
+
AlbexUnsupportedFormatError, AlbexCapacityError,
|
|
266
|
+
} from 'albex';
|
|
267
|
+
|
|
268
|
+
try {
|
|
269
|
+
await engine.indexFile(file);
|
|
270
|
+
} catch (e) {
|
|
271
|
+
if (e instanceof AlbexUnsupportedFormatError) {
|
|
272
|
+
console.warn(`Skipped .${e.ext} (unsupported)`);
|
|
273
|
+
} else if (e instanceof AlbexParseError) {
|
|
274
|
+
console.warn(`Parse failed for ${e.format}:`, e.message);
|
|
275
|
+
} else throw e;
|
|
276
|
+
}
|
|
277
|
+
```
|
|
177
278
|
|
|
178
|
-
|
|
279
|
+
Each error carries a `kind` field that survives `structuredClone` across worker boundaries.
|
|
179
280
|
|
|
180
281
|
---
|
|
181
282
|
|
|
182
283
|
## Browser requirements
|
|
183
284
|
|
|
184
|
-
- WebAssembly (
|
|
285
|
+
- WebAssembly (every browser since 2017)
|
|
185
286
|
- `DecompressionStream` for DOCX/XLSX (Chrome 80+, Firefox 113+, Safari 16.4+)
|
|
186
|
-
-
|
|
287
|
+
- OPFS for fastest persistence (Chrome 102+, Safari 15.2+, Firefox 111+); IndexedDB fallback works everywhere
|
|
288
|
+
- WebGPU is **optional**; without it the CPU path is the default
|
|
187
289
|
|
|
188
|
-
PDF support
|
|
290
|
+
PDF support requires `albex_pdf.wasm` to be served with MIME type `application/wasm`.
|
|
189
291
|
|
|
190
292
|
---
|
|
191
293
|
|
|
192
294
|
## Building from source
|
|
193
295
|
|
|
194
296
|
```bash
|
|
195
|
-
# Install Rust + wasm-pack
|
|
196
297
|
rustup target add wasm32-unknown-unknown
|
|
197
298
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
299
|
+
npm install
|
|
300
|
+
npm run build:all # 6 main variants + PDF + TypeScript
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
Partial builds:
|
|
201
304
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
305
|
+
```bash
|
|
306
|
+
npm run build:wasm # std baseline only
|
|
307
|
+
npm run build:wasm:tiers # all 6 variants
|
|
308
|
+
npm run build:pdf-wasm # PDF module
|
|
309
|
+
npm run build # TypeScript only
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
## Tests
|
|
205
315
|
|
|
206
|
-
|
|
207
|
-
|
|
316
|
+
```bash
|
|
317
|
+
# Rust unit tests
|
|
318
|
+
cargo test --manifest-path core/Cargo.toml
|
|
319
|
+
cargo test --manifest-path ingest/Cargo.toml
|
|
320
|
+
|
|
321
|
+
# TypeScript + WASM integration tests
|
|
322
|
+
npm test
|
|
323
|
+
|
|
324
|
+
# Micro-benchmarks
|
|
325
|
+
npm run bench
|
|
208
326
|
```
|
|
209
327
|
|
|
328
|
+
**About the benchmark.** The included bench probes per-operation overhead
|
|
329
|
+
on a 200-document synthetic corpus. It is **not** a corpus-level
|
|
330
|
+
performance claim — there is no representative real-world dataset checked
|
|
331
|
+
into the repo yet. Numbers from `npm run bench` should be read as
|
|
332
|
+
"this implementation does not regress against itself", not as comparisons
|
|
333
|
+
against other libraries.
|
|
334
|
+
|
|
335
|
+
CI runs every check on every push to `main`.
|
|
336
|
+
|
|
210
337
|
---
|
|
211
338
|
|
|
212
339
|
## Privacy
|
|
213
340
|
|
|
214
|
-
Albex
|
|
341
|
+
Albex never transmits document content. Text extraction, indexing, search and
|
|
342
|
+
persistence all happen inside the browser. The only network requests are the
|
|
343
|
+
initial fetches for the `.wasm` binaries (and the lazy PDF module on first
|
|
344
|
+
PDF). Persisted snapshots live in OPFS / IndexedDB, scoped to your origin.
|
|
215
345
|
|
|
216
346
|
---
|
|
217
347
|
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `AlbexEngineWorker` — a main-thread wrapper that runs the engine inside a
|
|
3
|
+
* Web Worker. Mirrors the surface of `AlbexEngine` so it can be swapped in
|
|
4
|
+
* with the same method names; results arrive as Promises, so call sites must `await`.
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
*
|
|
8
|
+
* const engine = new AlbexEngineWorker({
|
|
9
|
+
* wasmUrl: '/assets/albex_wasm_bg.wasm',
|
|
10
|
+
* pdfWasmUrl: '/assets/albex_pdf.wasm',
|
|
11
|
+
* // Provide the URL to the bundled worker runtime.
|
|
12
|
+
* workerUrl: new URL('./worker-runtime.js', import.meta.url),
|
|
13
|
+
* });
|
|
14
|
+
* await engine.init();
|
|
15
|
+
*
|
|
16
|
+
* Why: a `search()` over 100k chunks can take 10–50 ms. On main thread that
|
|
17
|
+
* is visible jank for every keystroke. Off-main-thread keeps the UI at 60 fps.
|
|
18
|
+
*
|
|
19
|
+
* The runtime is single-threaded WASM, so requests are serialised: only one
|
|
20
|
+
* call is in flight at a time. This matches the actual `static mut` model
|
|
21
|
+
* inside the .wasm and is fine for an interactive search UI (each keystroke
|
|
22
|
+
* replaces the previous query).
|
|
23
|
+
*/
|
|
24
|
+
import type { AlbexOptions, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
|
|
25
|
+
export interface AlbexWorkerOptions extends AlbexOptions {
|
|
26
|
+
/** URL to the bundled worker runtime script (worker-runtime.js). */
|
|
27
|
+
workerUrl: string | URL;
|
|
28
|
+
}
|
|
29
|
+
export declare class AlbexEngineWorker {
|
|
30
|
+
private readonly _opts;
|
|
31
|
+
private _worker;
|
|
32
|
+
private _nextId;
|
|
33
|
+
private _pending;
|
|
34
|
+
private _docsCache;
|
|
35
|
+
constructor(opts: AlbexWorkerOptions);
|
|
36
|
+
init(): Promise<void>;
|
|
37
|
+
private _send;
|
|
38
|
+
indexFile(file: File): Promise<IndexedDocument>;
|
|
39
|
+
search(query: string, opts?: SearchOptions): Promise<SearchResult[]>;
|
|
40
|
+
/**
|
|
41
|
+
* Cooperative variant of `search`. Today the wire still sends a single
|
|
42
|
+
* batch — the result array is fetched in one round-trip from the worker
|
|
43
|
+
* and then exposed as an async iterator so callers can `break` early.
|
|
44
|
+
* A future iteration may use a `MessagePort` to stream individual results
|
|
45
|
+
* from the worker side; the iterator shape is preserved across that
|
|
46
|
+
* transition.
|
|
47
|
+
*/
|
|
48
|
+
searchCooperative(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
|
|
49
|
+
/**
|
|
50
|
+
* @deprecated Renamed to `searchCooperative` in 0.3.0. Alias removed in 0.4.0.
|
|
51
|
+
*/
|
|
52
|
+
searchStream(query: string, opts?: SearchOptions): AsyncIterable<SearchResult>;
|
|
53
|
+
removeDocument(id: string): Promise<boolean>;
|
|
54
|
+
compact(): Promise<void>;
|
|
55
|
+
reset(): Promise<void>;
|
|
56
|
+
getStats(): Promise<EngineStats>;
|
|
57
|
+
getLastSearchStats(): Promise<SearchStats | null>;
|
|
58
|
+
getDocuments(): Promise<readonly IndexedDocument[]>;
|
|
59
|
+
setMaxErrors(n: 0 | 1 | 2 | 3): Promise<void>;
|
|
60
|
+
setThreshold(n: number): Promise<void>;
|
|
61
|
+
setMaxResults(n: number): Promise<void>;
|
|
62
|
+
setLanguage(lang: 'off' | 'es'): Promise<void>;
|
|
63
|
+
save(name: string): Promise<void>;
|
|
64
|
+
load(name: string): Promise<boolean>;
|
|
65
|
+
loadOrInit(name: string): Promise<boolean>;
|
|
66
|
+
deleteSnapshot(name: string): Promise<void>;
|
|
67
|
+
listSnapshots(): Promise<string[]>;
|
|
68
|
+
[Symbol.dispose](): void;
|
|
69
|
+
}
|
|
70
|
+
//# sourceMappingURL=albex-worker.d.ts.map
|