albex 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +191 -0
  2. package/README.md +30 -19
  3. package/dist/albex-worker.d.ts +65 -2
  4. package/dist/albex-worker.d.ts.map +1 -1
  5. package/dist/albex-worker.js +97 -20
  6. package/dist/albex-worker.js.map +1 -1
  7. package/dist/albex.d.ts +206 -42
  8. package/dist/albex.d.ts.map +1 -1
  9. package/dist/albex.js +384 -103
  10. package/dist/albex.js.map +1 -1
  11. package/dist/errors.d.ts +35 -4
  12. package/dist/errors.d.ts.map +1 -1
  13. package/dist/errors.js +37 -2
  14. package/dist/errors.js.map +1 -1
  15. package/dist/persistence.js +1 -1
  16. package/dist/pool/coordinator.d.ts +14 -6
  17. package/dist/pool/coordinator.d.ts.map +1 -1
  18. package/dist/pool/coordinator.js +65 -28
  19. package/dist/pool/coordinator.js.map +1 -1
  20. package/dist/profile.js +1 -1
  21. package/dist/resource-manager.js +1 -1
  22. package/dist/tiered-store.js +1 -1
  23. package/dist/wasm-bindings.d.ts +50 -1
  24. package/dist/wasm-bindings.d.ts.map +1 -1
  25. package/dist/wasm-bindings.js +19 -11
  26. package/dist/wasm-bindings.js.map +1 -1
  27. package/dist/worker-protocol.d.ts +23 -2
  28. package/dist/worker-protocol.d.ts.map +1 -1
  29. package/dist/worker-protocol.js +1 -1
  30. package/dist/worker-runtime.js +16 -1
  31. package/dist/worker-runtime.js.map +1 -1
  32. package/package.json +1 -1
  33. package/src/albex-worker.ts +103 -18
  34. package/src/albex.ts +2937 -2524
  35. package/src/errors.ts +49 -4
  36. package/src/pool/coordinator.ts +61 -34
  37. package/src/wasm-bindings.ts +78 -12
  38. package/src/worker-protocol.ts +12 -2
  39. package/src/worker-runtime.ts +16 -1
  40. package/wasm/pkg/albex_pdf.wasm +0 -0
  41. package/wasm/pkg/albex_wasm.wasm +0 -0
  42. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  43. package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/src/errors.ts CHANGED
@@ -57,20 +57,65 @@ export class AlbexParseError extends AlbexError {
57
57
  * documents, names) ran out of room mid-document. Before 0.6.0 the latter was
58
58
  * silent — the corpus was truncated with no signal.
59
59
  *
60
- * `limit` names which pool overflowed (or `'scratchpad'`), so callers can
60
+ * `limit` names which pool overflowed (or `'scratchpad'`, or `'file'` when an
61
+ * input file exceeds `maxFileBytes` before any byte is read), so callers can
61
62
  * branch — e.g. start a fresh shard, `compact()`, or surface "library full".
62
63
  * When a capacity error is raised during `indexFile`, the engine may hold a
63
64
  * partially-indexed copy of the offending document; treat the index as full
64
- * and stop adding.
65
+ * and stop adding. A `'file'` capacity error is raised BEFORE the file is
66
+ * read, so the index is untouched and fully usable.
65
67
  */
66
- export type AlbexCapacityLimit = 'chunks' | 'text' | 'docs' | 'names' | 'scratchpad';
68
+ export type AlbexCapacityLimit = 'chunks' | 'text' | 'docs' | 'names' | 'scratchpad' | 'file';
67
69
 
68
70
  export class AlbexCapacityError extends AlbexError {
69
71
  /** Which pool overflowed. Undefined for older call sites that didn't set it. */
70
72
  readonly limit?: AlbexCapacityLimit;
71
- constructor(message: string, limit?: AlbexCapacityLimit) {
73
+ /**
74
+ * The RUNTIME numeric capacity of the pool named by `limit`, as the
75
+ * engine is actually configured (e.g. `4` when `capacity: { maxDocs: 4 }`
76
+ * overflows its document table, `128` for the std default). Units: docs
77
+ * for `'docs'`, chunks for `'chunks'`, bytes for `'text'`/`'names'`/
78
+ * `'scratchpad'`/`'file'`. Undefined when the limit is not known at the
79
+ * throw site.
80
+ */
81
+ readonly max?: number;
82
+ constructor(message: string, limit?: AlbexCapacityLimit, max?: number) {
72
83
  super('capacity', message);
73
84
  this.name = 'AlbexCapacityError';
74
85
  if (limit) this.limit = limit;
86
+ if (max !== undefined) this.max = max;
87
+ }
88
+ }
89
+
90
+ /**
91
+ * Default `maxFileBytes` for `indexFile`: 256 MiB. Far above anything the
92
+ * ~16–21 MB text pool could ever absorb, so legitimate documents are never
93
+ * rejected — the guard only exists to refuse pathological inputs (a 2 GB
94
+ * file would otherwise be fully buffered AND hashed before the first
95
+ * capacity check could fire).
96
+ */
97
+ export const DEFAULT_MAX_FILE_BYTES = 256 * 1024 * 1024;
98
+
99
+ /**
100
+ * Pre-read size guard for `indexFile`. Throws a typed
101
+ * {@link AlbexCapacityError} (`limit: 'file'`) when `file.size` exceeds the
102
+ * configured cap — BEFORE any byte of the file is read into memory
103
+ * (`File`/`Blob` expose `size` without reading). Shared by the engine, the
104
+ * worker wrapper and the pool coordinator so the guard fires on whichever
105
+ * thread would otherwise buffer the bytes.
106
+ */
107
+ export function assertFileSizeWithinLimit(
108
+ file: { name: string; size: number },
109
+ maxFileBytes?: number,
110
+ ): void {
111
+ const cap = maxFileBytes ?? DEFAULT_MAX_FILE_BYTES;
112
+ if (file.size > cap) {
113
+ throw new AlbexCapacityError(
114
+ `"${file.name}" is ${file.size} bytes, above the maxFileBytes limit of ` +
115
+ `${cap}. The file was not read or indexed. Raise \`maxFileBytes\` in ` +
116
+ `AlbexOptions if this is intentional.`,
117
+ 'file',
118
+ cap,
119
+ );
75
120
  }
76
121
  }
@@ -30,9 +30,8 @@ import type {
30
30
  EngineStats,
31
31
  SearchStats,
32
32
  } from '../albex.js';
33
- import type { Tier } from '../profile.js';
34
33
  import { detectProfile, pickWorkerCount } from '../profile.js';
35
- import { AlbexInitError, AlbexError } from '../errors.js';
34
+ import { AlbexInitError, AlbexError, assertFileSizeWithinLimit } from '../errors.js';
36
35
  import type {
37
36
  WorkerRequest,
38
37
  WorkerResponse,
@@ -77,7 +76,9 @@ export class AlbexPool {
77
76
  private _docsCache: IndexedDocument[] = [];
78
77
  private _rrCursor = 0;
79
78
  private _lastSearch: SearchStats | null = null;
80
- private _tier: Tier | null = null;
79
+ /** Global result cap applied AFTER the cross-shard merge. Mirrors the
80
+ * last `setMaxResults` call (the WASM engine default is 50). */
81
+ private _maxResults = 50;
81
82
 
82
83
  constructor(opts: AlbexPoolOptions) {
83
84
  this._opts = opts;
@@ -98,23 +99,21 @@ export class AlbexPool {
98
99
  console.warn('[albex] pool mode=shared requested but cross-origin isolation is not active; falling back to replicated');
99
100
  }
100
101
 
101
- const shardOpts: AlbexOptions = {
102
- wasmUrl: this._opts.wasmUrl,
103
- wasmBaseUrl: this._opts.wasmBaseUrl,
104
- pdfWasmUrl: this._opts.pdfWasmUrl,
105
- tier: this._opts.tier,
106
- simd: this._opts.simd,
107
- };
102
+ // Forward every serializable engine option to the shards; strip the
103
+ // pool-only fields (workerUrl/workers/mode) and anything non-clonable.
104
+ // Same policy as AlbexEngineWorker.init (audit 1.4).
105
+ const shardOpts: AlbexOptions = {};
106
+ for (const [k, v] of Object.entries(this._opts)) {
107
+ if (k === 'workerUrl' || k === 'workers' || k === 'mode') continue;
108
+ if (v === undefined || typeof v === 'function') continue;
109
+ (shardOpts as Record<string, unknown>)[k] = v;
110
+ }
108
111
 
109
112
  for (let i = 0; i < n; i++) {
110
113
  const shard = this._spawnShard();
111
114
  await this._send(shard, { kind: 'init', opts: shardOpts });
112
115
  this._shards.push(shard);
113
116
  }
114
-
115
- // Tier is the same across shards — capture it from shard 0 stats.
116
- const stats0 = await this._send<EngineStats>(this._shards[0]!, { kind: 'getStats' });
117
- this._tier = stats0.tier;
118
117
  }
119
118
 
120
119
  // ── Shard plumbing ─────────────────────────────────────────────────────
@@ -168,6 +167,9 @@ export class AlbexPool {
168
167
 
169
168
  async indexFile(file: File): Promise<IndexedDocument> {
170
169
  if (this._shards.length === 0) throw new AlbexInitError('Pool not initialised');
170
+ // Size guard BEFORE reading — same limit the shard engine enforces, but
171
+ // checked here so an oversized file is never buffered on the main thread.
172
+ assertFileSizeWithinLimit(file, this._opts.maxFileBytes);
171
173
  const idx = this._rrCursor++ % this._shards.length;
172
174
  const shard = this._shards[idx]!;
173
175
  const buffer = await file.arrayBuffer();
@@ -185,30 +187,50 @@ export class AlbexPool {
185
187
  * Map-reduce search:
186
188
  * 1. broadcast the query to every shard,
187
189
  * 2. each shard runs its local Bloom→Bitap→top-K,
188
- * 3. coordinator merges the K-best from each shard,
190
+ * 3. coordinator merges the K-best from each shard, deduplicating
191
+ * identical hits (same document name, chunk id and match offset —
192
+ * the case where the same file was indexed into two shards),
189
193
  * 4. global top-K returned in descending score order.
190
194
  *
191
195
  * Capped to `setMaxResults` results (default 50) AFTER merge.
196
+ *
197
+ * Per-shard search stats are requested in the same posting batch as the
198
+ * search itself: each worker processes its queue in arrival order, so the
199
+ * stats reply is guaranteed to belong to THIS query even when several
200
+ * `search()` calls overlap.
192
201
  */
193
202
  async search(query: string, opts: SearchOptions = {}): Promise<SearchResult[]> {
194
203
  if (this._shards.length === 0) return [];
195
204
  const t0 = performance.now();
196
- const buckets = await Promise.all(
197
- this._shards.map(s => this._send<SearchResult[]>(s, { kind: 'search', query, options: opts })),
205
+ // Post `search` and `getLastSearchStats` back-to-back (synchronously,
206
+ // per shard) so no other request can slot between them in the worker's
207
+ // FIFO queue — the stats round can't race a subsequent search.
208
+ const perShard = await Promise.all(
209
+ this._shards.map(s => {
210
+ const results = this._send<SearchResult[]>(s, { kind: 'search', query, options: opts });
211
+ const stats = this._send<SearchStats | null>(s, { kind: 'getLastSearchStats' });
212
+ return Promise.all([results, stats] as const);
213
+ }),
198
214
  );
199
215
 
200
- // Merge: simple flatten + sort (descending). Each shard already returned
201
- // up to its local cap; the global cap could be different but in practice
202
- // matches the per-shard cap.
203
- const merged = buckets.flat();
216
+ // Merge: flatten + dedup + sort (descending) + global cap. Shard-local
217
+ // docIds collide across shards, so the dedup identity also includes the
218
+ // document name: it only collapses true duplicates (the same document
219
+ // indexed into more than one shard yields identical name/chunkId/offset).
220
+ const seen = new Set<string>();
221
+ const merged: SearchResult[] = [];
222
+ for (const [bucket] of perShard) {
223
+ for (const r of bucket) {
224
+ const key = `${r.documentName}:${r.chunkId}:${r.matchStart}`;
225
+ if (!seen.has(key)) { seen.add(key); merged.push(r); }
226
+ }
227
+ }
204
228
  merged.sort((a, b) => b.score - a.score);
229
+ const capped = merged.slice(0, this._maxResults);
205
230
 
206
231
  // Aggregate search stats across shards.
207
- const stats = await Promise.all(
208
- this._shards.map(s => this._send<SearchStats | null>(s, { kind: 'getLastSearchStats' })),
209
- );
210
232
  let bloomTested = 0, bloomPassed = 0, bitapMatched = 0;
211
- for (const s of stats) {
233
+ for (const [, s] of perShard) {
212
234
  if (!s) continue;
213
235
  bloomTested += s.bloomTested;
214
236
  bloomPassed += s.bloomPassed;
@@ -217,11 +239,11 @@ export class AlbexPool {
217
239
  this._lastSearch = {
218
240
  query,
219
241
  timeMs: performance.now() - t0,
220
- results: merged.length,
242
+ results: capped.length,
221
243
  bloomTested, bloomPassed, bitapMatched,
222
244
  };
223
245
 
224
- return merged;
246
+ return capped;
225
247
  }
226
248
 
227
249
  /**
@@ -273,11 +295,13 @@ export class AlbexPool {
273
295
  for (const s of this._shards) s.docCount = 0;
274
296
  }
275
297
 
276
- /** Aggregate engine stats across all shards. */
298
+ /** Aggregate engine stats across all shards. Capacities are the RUNTIME
299
+ * per-shard capacities summed (each shard was initialised with the same
300
+ * `capacity` option, forwarded through the wire protocol). */
277
301
  async getStats(): Promise<EngineStats> {
278
302
  const all = await this._broadcast<EngineStats>({ kind: 'getStats' });
279
303
  let documents = 0, chunks = 0, textUsed = 0, textCapacity = 0, wasmMemoryBytes = 0;
280
- let maxChunks = 0, maxDocs = 0;
304
+ let maxChunks = 0, maxDocs = 0, namePoolBytes = 0;
281
305
  for (const s of all) {
282
306
  documents += s.documents;
283
307
  chunks += s.chunks;
@@ -286,10 +310,11 @@ export class AlbexPool {
286
310
  wasmMemoryBytes += s.wasmMemoryBytes;
287
311
  maxChunks += s.maxChunks;
288
312
  maxDocs += s.maxDocs;
313
+ namePoolBytes += s.namePoolBytes;
289
314
  }
290
315
  return {
291
316
  documents, chunks, textUsed, textCapacity, wasmMemoryBytes,
292
- tier: this._tier, maxChunks, maxDocs,
317
+ maxChunks, maxDocs, namePoolBytes,
293
318
  };
294
319
  }
295
320
 
@@ -303,15 +328,17 @@ export class AlbexPool {
303
328
 
304
329
  async setMaxErrors(n: 0 | 1 | 2 | 3): Promise<void> { await this._broadcast({ kind: 'setMaxErrors', n }); }
305
330
  async setThreshold(n: number): Promise<void> { await this._broadcast({ kind: 'setThreshold', n }); }
306
- async setMaxResults(n: number): Promise<void> { await this._broadcast({ kind: 'setMaxResults', n }); }
331
+ async setMaxResults(n: number): Promise<void> {
332
+ // Track the effective cap (same clamp as the WASM engine) so search()
333
+ // can enforce it globally after the cross-shard merge.
334
+ this._maxResults = Math.max(1, Math.min(200, Math.floor(n)));
335
+ await this._broadcast({ kind: 'setMaxResults', n });
336
+ }
307
337
  async setLanguage(lang: 'off' | 'es'): Promise<void> { await this._broadcast({ kind: 'setLanguage', lang }); }
308
338
 
309
339
  /** Number of shards currently running. */
310
340
  get workerCount(): number { return this._shards.length; }
311
341
 
312
- /** Tier loaded by the shards (same value across all of them). */
313
- get tier(): Tier | null { return this._tier; }
314
-
315
342
  [Symbol.dispose](): void {
316
343
  for (const s of this._shards) {
317
344
  for (const [, p] of s.pending) p.reject(new AlbexError('disposed', 'Pool disposed'));
@@ -20,7 +20,23 @@ export interface AlbexWasmExports {
20
20
  // ABI / lifecycle
21
21
  abiVersion(): number;
22
22
  getBuffer(size: number): number;
23
+ /** Reset with the std default capacities (128 docs · 100k chunks · 16 MB
24
+ * text · 32 KB names) — identical behaviour to every pre-ABI-7 release. */
23
25
  init(): void;
26
+ /** (Re-)initialise the engine with runtime capacities (ABI 7, decision
27
+ * A16). Allocates the capacity-dependent pools on the WASM heap. Returns
28
+ * 1 on success; 0 on invalid parameters (floors/ceilings documented in
29
+ * wasm/src/lib.rs: ≥1 doc, docs ≤ 65 536, chunks ≥ docs and ≤ 4 M, text
30
+ * pool 4 KiB–1 GiB, name pool 256 B–16 MiB) or on allocation failure —
31
+ * never traps. Re-init with the same capacities is a plain reset; with
32
+ * different capacities the pools are freed and re-allocated (no leak,
33
+ * but the linear-memory high-water mark never shrinks). */
34
+ initWithCapacity(
35
+ maxDocs: number,
36
+ maxChunks: number,
37
+ textPoolBytes: number,
38
+ namePoolBytes: number,
39
+ ): number;
24
40
 
25
41
  /** Reset the streaming FNV-1a 64-bit hash state. Optional on the first
26
42
  * hash of a session because the static initialiser is also FNV_OFFSET. */
@@ -57,6 +73,11 @@ export interface AlbexWasmExports {
57
73
  getQueryBranchCount(): number;
58
74
  getQueryBranchPattern(i: number): number;
59
75
  selectQueryBranch(i: number): number;
76
+ /** Bitflags of what the most recent prepareQuery dropped or clipped
77
+ * (ABI 5): 1 = OR branches beyond 8 discarded, 2 = tokens dropped
78
+ * (> 4 per branch) or clipped (> 64 bytes), 4 = raw query cut at
79
+ * 1024 bytes. 0 = compiled in full. */
80
+ getQueryTruncationFlags(): number;
60
81
 
61
82
  // Search execution
62
83
  setPattern(len: number): number;
@@ -68,6 +89,15 @@ export interface AlbexWasmExports {
68
89
  getSearchTotal(): number;
69
90
 
70
91
  // Result accessors
92
+ /** Base pointer of the `#[repr(C)]` RESULTS array (ABI 6). Read
93
+ * `getResultCount()` records of `getResultStride()` bytes each with one
94
+ * DataView pass instead of ~12 frontier calls per result. Field offsets
95
+ * are documented (and compile-time asserted) in wasm/src/lib.rs. Copy
96
+ * everything out before any call that could grow WASM memory. */
97
+ getResultsPtr(): number;
98
+ /** Byte stride between consecutive RESULTS records (= sizeof(DocMatch),
99
+ * 60 today). Exported so the host never hardcodes the struct size. */
100
+ getResultStride(): number;
71
101
  getResultCount(): number;
72
102
  getResultDocId(i: number): number;
73
103
  getResultLocation(i: number): number;
@@ -121,6 +151,27 @@ export interface AlbexWasmExports {
121
151
  removeDocument(docId: number): number;
122
152
  compact(): void;
123
153
 
154
+ // Authoritative chunk enumeration (ABI 4). Address a document's chunks by
155
+ // (doc slot, ordinal). The compact()-stable key is (doc_id, ordinal).
156
+ /** First CHUNKS[] index for the document at slot `index` (= chunk_start).
157
+ * `ord = resultChunkIdx - getDocChunkBase(slot)` gives the doc-relative
158
+ * ordinal of a search hit. */
159
+ getDocChunkBase(index: number): number;
160
+ /** `location` (paragraph/page) of the `ord`-th chunk; u32::MAX if OOB. */
161
+ getChunkLocationAt(index: number, ord: number): number;
162
+ /** Byte length of the `ord`-th chunk's text; 0 if OOB. */
163
+ getChunkByteLenAt(index: number, ord: number): number;
164
+ /** Copy the `ord`-th chunk's UTF-8 text into the scratchpad; returns byte
165
+ * length (0 if OOB). Lets a host enumerate a doc's authoritative chunks
166
+ * right after indexing, with no query. */
167
+ getChunkTextAt(index: number, ord: number): number;
168
+ /** Batch chunk enumeration (ABI 6). Packs up to `maxChunks` records
169
+ * `[u32 text_len][u32 location][text bytes]` (LE, tightly packed) into
170
+ * the scratchpad starting at ordinal `startOrd`; returns how many were
171
+ * written. One frontier call per scratchpad-full instead of 2-3 per
172
+ * chunk. */
173
+ listChunksBatch(index: number, startOrd: number, maxChunks: number): number;
174
+
124
175
  /**
125
176
  * Per-document content hash (snapshot v2). Returns a pointer to 8 bytes
126
177
  * holding the FNV-1a 64-bit hash of the original file bytes, or 0 if the
@@ -139,8 +190,8 @@ export interface AlbexWasmExports {
139
190
  // Stemming
140
191
  setLanguage(lang: number): void;
141
192
 
142
- // Tier identification
143
- getTier(): number; // 1=mini, 2=std, 3=pro
193
+ // Runtime capacity identification (ABI 7). Report the capacities the
194
+ // engine was last initialised with — `init()` = the std defaults.
144
195
  getMaxChunks(): number;
145
196
  getMaxDocs(): number;
146
197
  getNameCapacity(): number;
@@ -150,6 +201,13 @@ export interface AlbexWasmExports {
150
201
  getChunkStructSize(): number;
151
202
  setCandidateMask(byteLen: number): void;
152
203
  clearCandidateMask(): void;
204
+ /** Low 32 bits of the active pattern's aggregate character Bloom
205
+ * (ABI 6). Computed WASM-side in setPattern through the same pipeline
206
+ * searchBegin uses (split → optional stemming → fold), so the GPU
207
+ * pre-filter tests exactly the bits the CPU path would. */
208
+ getPatternBloomLo(): number;
209
+ /** High 32 bits of the active pattern's aggregate character Bloom. */
210
+ getPatternBloomHi(): number;
153
211
  }
154
212
 
155
213
  // ─────────────────────────────────────────────────────────────────────────────
@@ -236,12 +294,17 @@ export interface AlbexPdfExports {
236
294
  /** Range of ABI versions this host code understands for the main module.
237
295
  * Update both ends together with the Rust `abiVersion()` constant when
238
296
  * the export surface changes. */
239
- // 0.6.0 requires ABI 3 (trigram pre-filter + getLastIndexOverflow). The
240
- // required-exports list below already makes any older binary fail the
241
- // missing-exports check, so a tolerant lower bound was dead code — the range
242
- // is pinned to the one ABI this host actually speaks (audit 0.6.0, finding #7).
243
- const MAIN_ABI_MIN = 3;
244
- const MAIN_ABI_MAX = 3;
297
+ // ABI 7 adds runtime capacity (initWithCapacity, decision A16) and removes
298
+ // the compile-time tier system (`getTier` is gone), on top of ABI 6's batch
299
+ // frontier reads, ABI 5's truncation signalling and ABI 4's authoritative
300
+ // chunk enumeration. The required-exports list below already makes any
301
+ // older binary fail the missing-exports check, so a tolerant lower bound
302
+ // was dead code — the range is pinned to the one ABI this host actually
303
+ // speaks (audit 0.6.0, finding #7). The .wasm ships inside this package
304
+ // (files: wasm/pkg/*.wasm), so host TS and binary are always
305
+ // version-matched.
306
+ const MAIN_ABI_MIN = 7;
307
+ const MAIN_ABI_MAX = 7;
245
308
 
246
309
  /** Range of ABI versions for the PDF module. */
247
310
  const PDF_ABI_MIN = 1;
@@ -250,16 +313,16 @@ const PDF_ABI_MAX = 3;
250
313
  /** Required function names on the main WASM. Adding a new one here forces
251
314
  * the validator to check it; removing one is a breaking ABI bump. */
252
315
  const MAIN_REQUIRED = [
253
- 'abiVersion', 'getBuffer', 'init',
316
+ 'abiVersion', 'getBuffer', 'init', 'initWithCapacity',
254
317
  'setDocumentName', 'beginDocument', 'feedXmlBytes', 'endDocument',
255
318
  'beginXlsx', 'feedXlsxBytes',
256
319
  'feedText', 'flushParagraph',
257
320
  'setMaxErrors', 'setThreshold', 'setMaxResults',
258
321
  'prepareQuery', 'getQueryKind', 'getQueryBranchCount',
259
- 'getQueryBranchPattern', 'selectQueryBranch',
322
+ 'getQueryBranchPattern', 'selectQueryBranch', 'getQueryTruncationFlags',
260
323
  'setPattern', 'search',
261
324
  'searchBegin', 'searchSlice', 'getSearchCursor', 'getSearchTotal',
262
- 'getResultCount',
325
+ 'getResultCount', 'getResultsPtr', 'getResultStride',
263
326
  'getResultDocId', 'getResultLocation', 'getResultScore',
264
327
  'getResultStart', 'getResultEnd', 'getResultChunkIdx',
265
328
  'getResultDocName', 'getResultMatchCount',
@@ -272,10 +335,13 @@ const MAIN_REQUIRED = [
272
335
  'restoreBegin', 'restoreFeed', 'restoreCommit',
273
336
  'getDocId', 'getDocChunkCount', 'getDocName', 'isDocDeleted',
274
337
  'removeDocument', 'compact',
338
+ 'getDocChunkBase', 'getChunkLocationAt', 'getChunkByteLenAt', 'getChunkTextAt',
339
+ 'listChunksBatch',
275
340
  'setLanguage',
276
- 'getTier', 'getMaxChunks', 'getMaxDocs', 'getNameCapacity',
341
+ 'getMaxChunks', 'getMaxDocs', 'getNameCapacity',
277
342
  'getChunksPtr', 'getChunkStructSize',
278
343
  'setCandidateMask', 'clearCandidateMask',
344
+ 'getPatternBloomLo', 'getPatternBloomHi',
279
345
  'getDocContentHashPtr', 'getDocContentHashLen', 'setDocumentContentHash',
280
346
  'hashBegin', 'hashFeed', 'hashFinish',
281
347
  ] as const;
@@ -10,13 +10,19 @@
10
10
  * copying the file bytes into the worker.
11
11
  */
12
12
 
13
- import type { AlbexOptions, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
13
+ import type { AlbexDiagnostic, AlbexOptions, AuthoritativeChunk, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
14
14
 
15
15
  export type WorkerOp =
16
16
  | { kind: 'init'; opts: AlbexOptions }
17
17
  | { kind: 'indexFile'; name: string; buffer: ArrayBuffer }
18
18
  | { kind: 'search'; query: string; options: SearchOptions }
19
+ | { kind: 'listChunks'; docId: number }
19
20
  | { kind: 'removeDocument'; id: string }
21
+ /** Replace doc `name` with new content. `fileName` is the replacement
22
+ * file's own name (may differ from `name`); the bytes travel as a
23
+ * transferred ArrayBuffer like `indexFile`. */
24
+ | { kind: 'replaceDocument'; name: string; fileName: string; buffer: ArrayBuffer }
25
+ | { kind: 'takeDiagnostics' }
20
26
  | { kind: 'compact' }
21
27
  | { kind: 'reset' }
22
28
  | { kind: 'getStats' }
@@ -39,10 +45,14 @@ export interface WorkerRequest {
39
45
 
40
46
  export type WorkerResponse =
41
47
  | { id: number; ok: true; result: unknown }
42
- | { id: number; ok: false; error: { name: string; kind?: string; message: string } };
48
+ /** `limit`/`max` are populated for capacity errors so the rehydrated
49
+ * AlbexCapacityError keeps reporting the runtime limit that overflowed. */
50
+ | { id: number; ok: false; error: { name: string; kind?: string; message: string; limit?: string; max?: number } };
43
51
 
44
52
  export type IndexFileResult = IndexedDocument;
45
53
  export type SearchResultArr = SearchResult[];
54
+ export type ChunksResult = AuthoritativeChunk[];
46
55
  export type StatsResult = EngineStats;
47
56
  export type SearchStatsRes = SearchStats | null;
48
57
  export type DocsResult = readonly IndexedDocument[];
58
+ export type DiagnosticsRes = AlbexDiagnostic[];
@@ -36,8 +36,18 @@ async function dispatch(op: WorkerOp): Promise<unknown> {
36
36
  }
37
37
  case 'search':
38
38
  return ensureEngine().search(op.query, op.options);
39
+ case 'listChunks':
40
+ return ensureEngine().listChunks(op.docId);
39
41
  case 'removeDocument':
40
42
  return ensureEngine().removeDocument(op.id);
43
+ case 'replaceDocument': {
44
+ // Same File-like wrapping as indexFile; the engine's replaceDocument
45
+ // handles remove + re-index + auto-compact under its own lock.
46
+ const file = new File([op.buffer], op.fileName);
47
+ return ensureEngine().replaceDocument(op.name, file);
48
+ }
49
+ case 'takeDiagnostics':
50
+ return ensureEngine().takeDiagnostics();
41
51
  case 'compact':
42
52
  ensureEngine().compact();
43
53
  return undefined;
@@ -82,13 +92,18 @@ async function handle(req: WorkerRequest): Promise<void> {
82
92
  const res: WorkerResponse = { id, ok: true, result };
83
93
  (self as unknown as Worker).postMessage(res);
84
94
  } catch (err) {
85
- const e = err as Error & { kind?: string };
95
+ const e = err as Error & { kind?: string; limit?: string; max?: number };
86
96
  const res: WorkerResponse = {
87
97
  id, ok: false,
88
98
  error: {
89
99
  name: e.name ?? 'Error',
90
100
  kind: err instanceof AlbexError ? err.kind : undefined,
91
101
  message: e.message ?? String(err),
102
+ // Capacity metadata (which pool + its runtime limit) — plain data,
103
+ // survives structuredClone, lets the main side rehydrate a full
104
+ // AlbexCapacityError.
105
+ limit: typeof e.limit === 'string' ? e.limit : undefined,
106
+ max: typeof e.max === 'number' ? e.max : undefined,
92
107
  },
93
108
  };
94
109
  (self as unknown as Worker).postMessage(res);
Binary file
Binary file
Binary file
Binary file