albex 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +191 -0
- package/README.md +30 -19
- package/dist/albex-worker.d.ts +65 -2
- package/dist/albex-worker.d.ts.map +1 -1
- package/dist/albex-worker.js +97 -20
- package/dist/albex-worker.js.map +1 -1
- package/dist/albex.d.ts +206 -42
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +384 -103
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +35 -4
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +37 -2
- package/dist/errors.js.map +1 -1
- package/dist/persistence.js +1 -1
- package/dist/pool/coordinator.d.ts +14 -6
- package/dist/pool/coordinator.d.ts.map +1 -1
- package/dist/pool/coordinator.js +65 -28
- package/dist/pool/coordinator.js.map +1 -1
- package/dist/profile.js +1 -1
- package/dist/resource-manager.js +1 -1
- package/dist/tiered-store.js +1 -1
- package/dist/wasm-bindings.d.ts +50 -1
- package/dist/wasm-bindings.d.ts.map +1 -1
- package/dist/wasm-bindings.js +19 -11
- package/dist/wasm-bindings.js.map +1 -1
- package/dist/worker-protocol.d.ts +23 -2
- package/dist/worker-protocol.d.ts.map +1 -1
- package/dist/worker-protocol.js +1 -1
- package/dist/worker-runtime.js +16 -1
- package/dist/worker-runtime.js.map +1 -1
- package/package.json +1 -1
- package/src/albex-worker.ts +103 -18
- package/src/albex.ts +2937 -2524
- package/src/errors.ts +49 -4
- package/src/pool/coordinator.ts +61 -34
- package/src/wasm-bindings.ts +78 -12
- package/src/worker-protocol.ts +12 -2
- package/src/worker-runtime.ts +16 -1
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/src/albex-worker.ts
CHANGED
|
@@ -20,10 +20,28 @@
|
|
|
20
20
|
* call is in flight at a time. This matches the actual `static mut` model
|
|
21
21
|
* inside the .wasm and is fine for an interactive search UI (each keystroke
|
|
22
22
|
* replaces the previous query).
|
|
23
|
+
*
|
|
24
|
+
* ## OCR is NOT available in the worker
|
|
25
|
+
*
|
|
26
|
+
* `AlbexEngineWorker` has no `attachOcr`: an OCR adapter is an object with
|
|
27
|
+
* functions, and functions cannot cross the `postMessage` boundary (the
|
|
28
|
+
* structured-clone algorithm rejects them). Consequences:
|
|
29
|
+
*
|
|
30
|
+
* - **Scanned (image-only) PDFs index with 0 chunks, silently.** The
|
|
31
|
+
* engine records a diagnostic explaining why — read it with
|
|
32
|
+
* {@link takeDiagnostics} after `indexFile`.
|
|
33
|
+
* - If your corpus contains scanned PDFs and you need their text, index
|
|
34
|
+
* them with the synchronous main-thread `AlbexEngine` plus the OCR
|
|
35
|
+
* adapter (`engine.attachOcr(...)` / `@albex/ocr`'s `enableOcr`), then
|
|
36
|
+
* `save()` the snapshot and `load()` it from the worker engine.
|
|
37
|
+
* - A future protocol iteration could proxy OCR over a dedicated
|
|
38
|
+
* `MessagePort`; until then the main-thread engine is the OCR path.
|
|
23
39
|
*/
|
|
24
40
|
|
|
25
41
|
import type {
|
|
42
|
+
AlbexDiagnostic,
|
|
26
43
|
AlbexOptions,
|
|
44
|
+
AuthoritativeChunk,
|
|
27
45
|
IndexedDocument,
|
|
28
46
|
SearchOptions,
|
|
29
47
|
SearchResult,
|
|
@@ -41,6 +59,7 @@ import {
|
|
|
41
59
|
AlbexUnsupportedFormatError,
|
|
42
60
|
AlbexParseError,
|
|
43
61
|
AlbexCapacityError,
|
|
62
|
+
assertFileSizeWithinLimit,
|
|
44
63
|
} from './errors.js';
|
|
45
64
|
|
|
46
65
|
export interface AlbexWorkerOptions extends AlbexOptions {
|
|
@@ -60,12 +79,32 @@ export class AlbexEngineWorker {
|
|
|
60
79
|
private _worker!: Worker;
|
|
61
80
|
private _nextId = 1;
|
|
62
81
|
private _pending = new Map<number, Pending>();
|
|
63
|
-
private _docsCache: IndexedDocument[] = [];
|
|
64
82
|
|
|
65
83
|
constructor(opts: AlbexWorkerOptions) {
|
|
66
84
|
this._opts = opts;
|
|
67
85
|
}
|
|
68
86
|
|
|
87
|
+
/**
|
|
88
|
+
* Spawn the worker and initialise the engine inside it.
|
|
89
|
+
*
|
|
90
|
+
* Every serializable engine option is forwarded across the worker
|
|
91
|
+
* boundary (`wasmUrl`, `wasmBaseUrl`, `pdfWasmUrl`, `capacity`, `simd`,
|
|
92
|
+
* `gpu`, `gpuThreshold`, `maxFileBytes`) — only `workerUrl`, which is
|
|
93
|
+
* consumed on this side, is stripped. Notes on what applies in a worker:
|
|
94
|
+
*
|
|
95
|
+
* - `capacity`: fully honoured — both the `'std'`/`'large'` presets
|
|
96
|
+
* (plain strings) and a custom object are structured-clone-safe, so
|
|
97
|
+
* the worker-side engine sizes its pools exactly like a main-thread
|
|
98
|
+
* engine would. Note that the memory cost (`'large'` ≈ 180 MB) lives in
|
|
99
|
+
* the worker's heap.
|
|
100
|
+
* - `wasmBaseUrl` + `simd`: fully honoured — the worker-side engine can
|
|
101
|
+
* load the `_simd.wasm` variant.
|
|
102
|
+
* - `gpu` / `gpuThreshold`: honoured where the worker runtime exposes
|
|
103
|
+
* WebGPU. `navigator.gpu` is available in dedicated workers in
|
|
104
|
+
* Chromium-based browsers (compute needs no canvas); elsewhere the
|
|
105
|
+
* engine's GPU probe fails gracefully and searches use the CPU
|
|
106
|
+
* pre-filter, exactly as on the main thread.
|
|
107
|
+
*/
|
|
69
108
|
async init(): Promise<void> {
|
|
70
109
|
this._worker = new Worker(this._opts.workerUrl, { type: 'module' });
|
|
71
110
|
this._worker.onmessage = (ev: MessageEvent<WorkerResponse>) => {
|
|
@@ -82,10 +121,16 @@ export class AlbexEngineWorker {
|
|
|
82
121
|
for (const [, p] of this._pending) p.reject(err);
|
|
83
122
|
this._pending.clear();
|
|
84
123
|
};
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
124
|
+
// Forward every serializable engine option. AlbexOptions is data-only
|
|
125
|
+
// (strings/numbers/booleans, plus a plain `capacity` object), but filter defensively so a future
|
|
126
|
+
// function-valued option cannot break postMessage (other non-clonable values, e.g. DOM handles, are not filtered).
|
|
127
|
+
const opts: AlbexOptions = {};
|
|
128
|
+
for (const [k, v] of Object.entries(this._opts)) {
|
|
129
|
+
if (k === 'workerUrl') continue; // consumed on this side
|
|
130
|
+
if (v === undefined || typeof v === 'function') continue;
|
|
131
|
+
(opts as Record<string, unknown>)[k] = v;
|
|
132
|
+
}
|
|
133
|
+
await this._send({ kind: 'init', opts });
|
|
89
134
|
}
|
|
90
135
|
|
|
91
136
|
private _send<T = unknown>(op: WorkerOp, transfer: Transferable[] = []): Promise<T> {
|
|
@@ -98,20 +143,31 @@ export class AlbexEngineWorker {
|
|
|
98
143
|
}
|
|
99
144
|
|
|
100
145
|
async indexFile(file: File): Promise<IndexedDocument> {
|
|
146
|
+
// Size guard BEFORE reading: the worker-side engine enforces the same
|
|
147
|
+
// limit, but checking here avoids buffering an oversized file on the
|
|
148
|
+
// main thread just to have the worker reject it.
|
|
149
|
+
assertFileSizeWithinLimit(file, this._opts.maxFileBytes);
|
|
101
150
|
const buffer = await file.arrayBuffer();
|
|
102
151
|
// Transfer the buffer to avoid a copy.
|
|
103
|
-
|
|
152
|
+
return this._send<IndexedDocument>(
|
|
104
153
|
{ kind: 'indexFile', name: file.name, buffer },
|
|
105
154
|
[buffer],
|
|
106
155
|
);
|
|
107
|
-
this._docsCache.push(doc);
|
|
108
|
-
return doc;
|
|
109
156
|
}
|
|
110
157
|
|
|
111
158
|
search(query: string, opts: SearchOptions = {}): Promise<SearchResult[]> {
|
|
112
159
|
return this._send<SearchResult[]>({ kind: 'search', query, options: opts });
|
|
113
160
|
}
|
|
114
161
|
|
|
162
|
+
/**
|
|
163
|
+
* Enumerate the authoritative chunks Albex indexed for `docId`
|
|
164
|
+
* (`IndexedDocument.docId` from {@link indexFile}). Mirrors
|
|
165
|
+
* `AlbexEngine.listChunks` across the worker boundary.
|
|
166
|
+
*/
|
|
167
|
+
listChunks(docId: number): Promise<AuthoritativeChunk[]> {
|
|
168
|
+
return this._send<AuthoritativeChunk[]>({ kind: 'listChunks', docId });
|
|
169
|
+
}
|
|
170
|
+
|
|
115
171
|
/**
|
|
116
172
|
* Cooperative variant of `search`. Today the wire still sends a single
|
|
117
173
|
* batch — the result array is fetched in one round-trip from the worker
|
|
@@ -137,17 +193,44 @@ export class AlbexEngineWorker {
|
|
|
137
193
|
}
|
|
138
194
|
|
|
139
195
|
async removeDocument(id: string): Promise<boolean> {
|
|
140
|
-
|
|
141
|
-
if (ok) this._docsCache = this._docsCache.filter(d => d.name !== id && d.contentHash !== id);
|
|
142
|
-
return ok;
|
|
196
|
+
return this._send<boolean>({ kind: 'removeDocument', id });
|
|
143
197
|
}
|
|
144
198
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
199
|
+
/**
|
|
200
|
+
* Replace a previously indexed document with new content. Mirrors
|
|
201
|
+
* `AlbexEngine.replaceDocument`: equivalent to `removeDocument(name)` +
|
|
202
|
+
* `indexFile(newFile)` without tripping the idempotency check, plus an
|
|
203
|
+
* opportunistic compact under text-pool pressure — all inside the
|
|
204
|
+
* worker-side engine's lock. The file bytes are transferred (zero-copy),
|
|
205
|
+
* like `indexFile`.
|
|
206
|
+
*/
|
|
207
|
+
async replaceDocument(name: string, newFile: File): Promise<IndexedDocument> {
|
|
208
|
+
assertFileSizeWithinLimit(newFile, this._opts.maxFileBytes);
|
|
209
|
+
const buffer = await newFile.arrayBuffer();
|
|
210
|
+
return this._send<IndexedDocument>(
|
|
211
|
+
{ kind: 'replaceDocument', name, fileName: newFile.name, buffer },
|
|
212
|
+
[buffer],
|
|
213
|
+
);
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/**
|
|
217
|
+
* Drain and return the diagnostics collected by the worker-side engine
|
|
218
|
+
* since the last call. Mirrors `AlbexEngine.takeDiagnostics` — consult it
|
|
219
|
+
* after `indexFile`/`load` to surface recoverable issues (PDF fallbacks,
|
|
220
|
+
* skipped content, persistence warnings). The worker-side buffer is
|
|
221
|
+
* cleared on each call.
|
|
222
|
+
*
|
|
223
|
+
* Particularly important in a worker: scanned PDFs index with **0 chunks**
|
|
224
|
+
* (no OCR available — see the OCR note in this file's header comment), and the diagnostic
|
|
225
|
+
* explaining why is only visible through this method.
|
|
226
|
+
*/
|
|
227
|
+
takeDiagnostics(): Promise<AlbexDiagnostic[]> {
|
|
228
|
+
return this._send<AlbexDiagnostic[]>({ kind: 'takeDiagnostics' });
|
|
149
229
|
}
|
|
150
230
|
|
|
231
|
+
async compact(): Promise<void> { await this._send({ kind: 'compact' }); }
|
|
232
|
+
async reset(): Promise<void> { await this._send({ kind: 'reset' }); }
|
|
233
|
+
|
|
151
234
|
getStats(): Promise<EngineStats> { return this._send({ kind: 'getStats' }); }
|
|
152
235
|
getLastSearchStats(): Promise<SearchStats | null> { return this._send({ kind: 'getLastSearchStats' }); }
|
|
153
236
|
getDocuments(): Promise<readonly IndexedDocument[]> { return this._send({ kind: 'getDocuments' }); }
|
|
@@ -168,16 +251,18 @@ export class AlbexEngineWorker {
|
|
|
168
251
|
for (const [, p] of this._pending) p.reject(new AlbexError('disposed', 'Engine disposed'));
|
|
169
252
|
this._pending.clear();
|
|
170
253
|
this._worker?.terminate();
|
|
171
|
-
this._docsCache = [];
|
|
172
254
|
}
|
|
173
255
|
}
|
|
174
256
|
|
|
175
|
-
function rehydrateError(e: { name: string; kind?: string; message: string }): Error {
|
|
257
|
+
function rehydrateError(e: { name: string; kind?: string; message: string; limit?: string; max?: number }): Error {
|
|
176
258
|
switch (e.kind) {
|
|
177
259
|
case 'init': return new AlbexInitError(e.message);
|
|
178
260
|
case 'unsupported_format': return new AlbexUnsupportedFormatError(e.message.replace(/^Unsupported format: \./, ''));
|
|
179
261
|
case 'parse': return new AlbexParseError('unknown', e.message);
|
|
180
|
-
|
|
262
|
+
// `limit`/`max` survive the wire (worker-runtime serialises them) so
|
|
263
|
+
// the rehydrated error still reports the runtime capacity that
|
|
264
|
+
// overflowed inside the worker-side engine.
|
|
265
|
+
case 'capacity': return new AlbexCapacityError(e.message, e.limit as never, e.max);
|
|
181
266
|
default: {
|
|
182
267
|
const err = new Error(e.message);
|
|
183
268
|
err.name = e.name;
|