albex 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +191 -0
  2. package/README.md +30 -19
  3. package/dist/albex-worker.d.ts +65 -2
  4. package/dist/albex-worker.d.ts.map +1 -1
  5. package/dist/albex-worker.js +97 -20
  6. package/dist/albex-worker.js.map +1 -1
  7. package/dist/albex.d.ts +206 -42
  8. package/dist/albex.d.ts.map +1 -1
  9. package/dist/albex.js +384 -103
  10. package/dist/albex.js.map +1 -1
  11. package/dist/errors.d.ts +35 -4
  12. package/dist/errors.d.ts.map +1 -1
  13. package/dist/errors.js +37 -2
  14. package/dist/errors.js.map +1 -1
  15. package/dist/persistence.js +1 -1
  16. package/dist/pool/coordinator.d.ts +14 -6
  17. package/dist/pool/coordinator.d.ts.map +1 -1
  18. package/dist/pool/coordinator.js +65 -28
  19. package/dist/pool/coordinator.js.map +1 -1
  20. package/dist/profile.js +1 -1
  21. package/dist/resource-manager.js +1 -1
  22. package/dist/tiered-store.js +1 -1
  23. package/dist/wasm-bindings.d.ts +50 -1
  24. package/dist/wasm-bindings.d.ts.map +1 -1
  25. package/dist/wasm-bindings.js +19 -11
  26. package/dist/wasm-bindings.js.map +1 -1
  27. package/dist/worker-protocol.d.ts +23 -2
  28. package/dist/worker-protocol.d.ts.map +1 -1
  29. package/dist/worker-protocol.js +1 -1
  30. package/dist/worker-runtime.js +16 -1
  31. package/dist/worker-runtime.js.map +1 -1
  32. package/package.json +1 -1
  33. package/src/albex-worker.ts +103 -18
  34. package/src/albex.ts +2937 -2524
  35. package/src/errors.ts +49 -4
  36. package/src/pool/coordinator.ts +61 -34
  37. package/src/wasm-bindings.ts +78 -12
  38. package/src/worker-protocol.ts +12 -2
  39. package/src/worker-runtime.ts +16 -1
  40. package/wasm/pkg/albex_pdf.wasm +0 -0
  41. package/wasm/pkg/albex_wasm.wasm +0 -0
  42. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  43. package/wasm/pkg/albex_wasm_simd.wasm +0 -0
@@ -20,10 +20,28 @@
20
20
  * call is in flight at a time. This matches the actual `static mut` model
21
21
  * inside the .wasm and is fine for an interactive search UI (each keystroke
22
22
  * replaces the previous query).
23
+ *
24
+ * ## OCR is NOT available in the worker
25
+ *
26
+ * `AlbexEngineWorker` has no `attachOcr`: an OCR adapter is an object with
27
+ * functions, and functions cannot cross the `postMessage` boundary (the
28
+ * structured-clone algorithm rejects them). Consequences:
29
+ *
30
+ * - **Scanned (image-only) PDFs index with 0 chunks, silently.** The
31
+ * engine records a diagnostic explaining why — read it with
32
+ * {@link takeDiagnostics} after `indexFile`.
33
+ * - If your corpus contains scanned PDFs and you need their text, index
34
+ * them with the synchronous main-thread `AlbexEngine` plus the OCR
35
+ * adapter (`engine.attachOcr(...)` / `@albex/ocr`'s `enableOcr`), then
36
+ * `save()` the snapshot and `load()` it from the worker engine.
37
+ * - A future protocol iteration could proxy OCR over a dedicated
38
+ * `MessagePort`; until then the main-thread engine is the OCR path.
23
39
  */
24
40
 
25
41
  import type {
42
+ AlbexDiagnostic,
26
43
  AlbexOptions,
44
+ AuthoritativeChunk,
27
45
  IndexedDocument,
28
46
  SearchOptions,
29
47
  SearchResult,
@@ -41,6 +59,7 @@ import {
41
59
  AlbexUnsupportedFormatError,
42
60
  AlbexParseError,
43
61
  AlbexCapacityError,
62
+ assertFileSizeWithinLimit,
44
63
  } from './errors.js';
45
64
 
46
65
  export interface AlbexWorkerOptions extends AlbexOptions {
@@ -60,12 +79,32 @@ export class AlbexEngineWorker {
60
79
  private _worker!: Worker;
61
80
  private _nextId = 1;
62
81
  private _pending = new Map<number, Pending>();
63
- private _docsCache: IndexedDocument[] = [];
64
82
 
65
83
  constructor(opts: AlbexWorkerOptions) {
66
84
  this._opts = opts;
67
85
  }
68
86
 
87
+ /**
88
+ * Spawn the worker and initialise the engine inside it.
89
+ *
90
+ * Every serializable engine option is forwarded across the worker
91
+ * boundary (`wasmUrl`, `wasmBaseUrl`, `pdfWasmUrl`, `capacity`, `simd`,
92
+ * `gpu`, `gpuThreshold`, `maxFileBytes`) — only `workerUrl`, which is
93
+ * consumed on this side, is stripped. Notes on what applies in a worker:
94
+ *
95
+ * - `capacity`: fully honoured — both the `'std'`/`'large'` presets
96
+ * (plain strings) and a custom object are structured-clone-safe, so
97
+ * the worker-side engine sizes its pools exactly like a main-thread
98
+ * engine would. Mind that the memory cost (`'large'` ≈ 180 MB) lives in
99
+ * the worker's heap.
100
+ * - `wasmBaseUrl` + `simd`: fully honoured — the worker-side engine can
101
+ * load the `_simd.wasm` variant.
102
+ * - `gpu` / `gpuThreshold`: honoured where the worker runtime exposes
103
+ * WebGPU. `navigator.gpu` is available in dedicated workers in
104
+ * Chromium-based browsers (compute needs no canvas); elsewhere the
105
+ * engine's GPU probe fails gracefully and searches use the CPU
106
+ * pre-filter, exactly as on the main thread.
107
+ */
69
108
  async init(): Promise<void> {
70
109
  this._worker = new Worker(this._opts.workerUrl, { type: 'module' });
71
110
  this._worker.onmessage = (ev: MessageEvent<WorkerResponse>) => {
@@ -82,10 +121,16 @@ export class AlbexEngineWorker {
82
121
  for (const [, p] of this._pending) p.reject(err);
83
122
  this._pending.clear();
84
123
  };
85
- await this._send({ kind: 'init', opts: {
86
- wasmUrl: this._opts.wasmUrl,
87
- pdfWasmUrl: this._opts.pdfWasmUrl,
88
- } });
124
+ // Forward every serializable engine option. AlbexOptions is data-only
125
+ // (strings/numbers/booleans, plus a plain `capacity` object), but filter defensively so a future
126
+ // function-valued option cannot break postMessage (other non-clonable values are not filtered).
127
+ const opts: AlbexOptions = {};
128
+ for (const [k, v] of Object.entries(this._opts)) {
129
+ if (k === 'workerUrl') continue; // consumed on this side
130
+ if (v === undefined || typeof v === 'function') continue;
131
+ (opts as Record<string, unknown>)[k] = v;
132
+ }
133
+ await this._send({ kind: 'init', opts });
89
134
  }
90
135
 
91
136
  private _send<T = unknown>(op: WorkerOp, transfer: Transferable[] = []): Promise<T> {
@@ -98,20 +143,31 @@ export class AlbexEngineWorker {
98
143
  }
99
144
 
100
145
  async indexFile(file: File): Promise<IndexedDocument> {
146
+ // Size guard BEFORE reading: the worker-side engine enforces the same
147
+ // limit, but checking here avoids buffering an oversized file on the
148
+ // main thread just to have the worker reject it.
149
+ assertFileSizeWithinLimit(file, this._opts.maxFileBytes);
101
150
  const buffer = await file.arrayBuffer();
102
151
  // Transfer the buffer to avoid a copy.
103
- const doc = await this._send<IndexedDocument>(
152
+ return this._send<IndexedDocument>(
104
153
  { kind: 'indexFile', name: file.name, buffer },
105
154
  [buffer],
106
155
  );
107
- this._docsCache.push(doc);
108
- return doc;
109
156
  }
110
157
 
111
158
  search(query: string, opts: SearchOptions = {}): Promise<SearchResult[]> {
112
159
  return this._send<SearchResult[]>({ kind: 'search', query, options: opts });
113
160
  }
114
161
 
162
+ /**
163
+ * Enumerate the authoritative chunks Albex indexed for `docId`
164
+ * (`IndexedDocument.docId` from {@link indexFile}). Mirrors
165
+ * `AlbexEngine.listChunks` across the worker boundary.
166
+ */
167
+ listChunks(docId: number): Promise<AuthoritativeChunk[]> {
168
+ return this._send<AuthoritativeChunk[]>({ kind: 'listChunks', docId });
169
+ }
170
+
115
171
  /**
116
172
  * Cooperative variant of `search`. Today the wire still sends a single
117
173
  * batch — the result array is fetched in one round-trip from the worker
@@ -137,17 +193,44 @@ export class AlbexEngineWorker {
137
193
  }
138
194
 
139
195
  async removeDocument(id: string): Promise<boolean> {
140
- const ok = await this._send<boolean>({ kind: 'removeDocument', id });
141
- if (ok) this._docsCache = this._docsCache.filter(d => d.name !== id && d.contentHash !== id);
142
- return ok;
196
+ return this._send<boolean>({ kind: 'removeDocument', id });
143
197
  }
144
198
 
145
- async compact(): Promise<void> { await this._send({ kind: 'compact' }); }
146
- async reset(): Promise<void> {
147
- await this._send({ kind: 'reset' });
148
- this._docsCache = [];
199
+ /**
200
+ * Replace a previously indexed document with new content. Mirrors
201
+ * `AlbexEngine.replaceDocument`: equivalent to `removeDocument(name)` +
202
+ * `indexFile(newFile)` without tripping the idempotency check, plus an
203
+ * opportunistic compact under text-pool pressure — all inside the
204
+ * worker-side engine's lock. The file bytes are transferred (zero-copy),
205
+ * like `indexFile`.
206
+ */
207
+ async replaceDocument(name: string, newFile: File): Promise<IndexedDocument> {
208
+ assertFileSizeWithinLimit(newFile, this._opts.maxFileBytes);
209
+ const buffer = await newFile.arrayBuffer();
210
+ return this._send<IndexedDocument>(
211
+ { kind: 'replaceDocument', name, fileName: newFile.name, buffer },
212
+ [buffer],
213
+ );
214
+ }
215
+
216
+ /**
217
+ * Drain and return the diagnostics collected by the worker-side engine
218
+ * since the last call. Mirrors `AlbexEngine.takeDiagnostics` — consult it
219
+ * after `indexFile`/`load` to surface recoverable issues (PDF fallbacks,
220
+ * skipped content, persistence warnings). The worker-side buffer is
221
+ * cleared on each call.
222
+ *
223
+ * Particularly important in a worker: scanned PDFs index with **0 chunks**
224
+ * (no OCR available — see "OCR is NOT available in the worker" in the file header), and the diagnostic
225
+ * explaining why is only visible through this method.
226
+ */
227
+ takeDiagnostics(): Promise<AlbexDiagnostic[]> {
228
+ return this._send<AlbexDiagnostic[]>({ kind: 'takeDiagnostics' });
149
229
  }
150
230
 
231
+ async compact(): Promise<void> { await this._send({ kind: 'compact' }); }
232
+ async reset(): Promise<void> { await this._send({ kind: 'reset' }); }
233
+
151
234
  getStats(): Promise<EngineStats> { return this._send({ kind: 'getStats' }); }
152
235
  getLastSearchStats(): Promise<SearchStats | null> { return this._send({ kind: 'getLastSearchStats' }); }
153
236
  getDocuments(): Promise<readonly IndexedDocument[]> { return this._send({ kind: 'getDocuments' }); }
@@ -168,16 +251,18 @@ export class AlbexEngineWorker {
168
251
  for (const [, p] of this._pending) p.reject(new AlbexError('disposed', 'Engine disposed'));
169
252
  this._pending.clear();
170
253
  this._worker?.terminate();
171
- this._docsCache = [];
172
254
  }
173
255
  }
174
256
 
175
- function rehydrateError(e: { name: string; kind?: string; message: string }): Error {
257
+ function rehydrateError(e: { name: string; kind?: string; message: string; limit?: string; max?: number }): Error {
176
258
  switch (e.kind) {
177
259
  case 'init': return new AlbexInitError(e.message);
178
260
  case 'unsupported_format': return new AlbexUnsupportedFormatError(e.message.replace(/^Unsupported format: \./, ''));
179
261
  case 'parse': return new AlbexParseError('unknown', e.message);
180
- case 'capacity': return new AlbexCapacityError(e.message);
262
+ // `limit`/`max` survive the wire (worker-runtime serialises them) so
263
+ // the rehydrated error still reports the runtime capacity that
264
+ // overflowed inside the worker-side engine.
265
+ case 'capacity': return new AlbexCapacityError(e.message, e.limit as never, e.max);
181
266
  default: {
182
267
  const err = new Error(e.message);
183
268
  err.name = e.name;