albex 0.3.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CHANGELOG.md +275 -0
  2. package/README.md +4 -2
  3. package/dist/albex-worker.js +1 -1
  4. package/dist/albex.d.ts +157 -17
  5. package/dist/albex.d.ts.map +1 -1
  6. package/dist/albex.js +405 -232
  7. package/dist/albex.js.map +1 -1
  8. package/dist/errors.d.ts +16 -2
  9. package/dist/errors.d.ts.map +1 -1
  10. package/dist/errors.js +6 -3
  11. package/dist/errors.js.map +1 -1
  12. package/dist/persistence.js +1 -1
  13. package/dist/profile.d.ts +11 -6
  14. package/dist/profile.d.ts.map +1 -1
  15. package/dist/profile.js +6 -13
  16. package/dist/profile.js.map +1 -1
  17. package/dist/resource-manager.js +1 -1
  18. package/dist/tiered-store.js +1 -1
  19. package/dist/wasm-bindings.d.ts +46 -5
  20. package/dist/wasm-bindings.d.ts.map +1 -1
  21. package/dist/wasm-bindings.js +102 -7
  22. package/dist/wasm-bindings.js.map +1 -1
  23. package/dist/worker-protocol.js +1 -1
  24. package/dist/worker-runtime.js +12 -3
  25. package/dist/worker-runtime.js.map +1 -1
  26. package/package.json +13 -9
  27. package/src/albex.ts +478 -246
  28. package/src/errors.ts +18 -2
  29. package/src/profile.ts +11 -10
  30. package/src/wasm-bindings.ts +157 -8
  31. package/src/worker-runtime.ts +12 -2
  32. package/wasm/pkg/albex_pdf.wasm +0 -0
  33. package/wasm/pkg/albex_wasm.wasm +0 -0
  34. package/wasm/pkg/albex_wasm_bg.wasm +0 -0
  35. package/wasm/pkg/albex_wasm_simd.wasm +0 -0
  36. package/wasm/pkg/albex_wasm_mini.wasm +0 -0
  37. package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
  38. package/wasm/pkg/albex_wasm_pro.wasm +0 -0
  39. package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
  40. package/wasm/pkg/albex_wasm_std.wasm +0 -0
  41. package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/src/errors.ts CHANGED
@@ -51,10 +51,26 @@ export class AlbexParseError extends AlbexError {
51
51
  }
52
52
  }
53
53
 
54
- /** Thrown when the scratchpad is too small for a single chunk write. */
54
+ /**
55
+ * Thrown when an indexing operation does not fit: either the scratchpad was
56
+ * too small for a single write, or one of the engine's pools (chunks, text,
57
+ * documents, names) ran out of room mid-document. Before 0.6.0 the latter was
58
+ * silent — the corpus was truncated with no signal.
59
+ *
60
+ * `limit` names which pool overflowed (or `'scratchpad'`), so callers can
61
+ * branch — e.g. start a fresh shard, `compact()`, or surface "library full".
62
+ * When a capacity error is raised during `indexFile`, the engine may hold a
63
+ * partially-indexed copy of the offending document; treat the index as full
64
+ * and stop adding.
65
+ */
66
+ export type AlbexCapacityLimit = 'chunks' | 'text' | 'docs' | 'names' | 'scratchpad';
67
+
55
68
  export class AlbexCapacityError extends AlbexError {
56
- constructor(message: string) {
69
+ /** Which pool overflowed. Undefined for older call sites that didn't set it. */
70
+ readonly limit?: AlbexCapacityLimit;
71
+ constructor(message: string, limit?: AlbexCapacityLimit) {
57
72
  super('capacity', message);
58
73
  this.name = 'AlbexCapacityError';
74
+ if (limit) this.limit = limit;
59
75
  }
60
76
  }
package/src/profile.ts CHANGED
@@ -233,19 +233,20 @@ export async function detectProfile(opts: { fresh?: boolean } = {}): Promise<Dev
233
233
 
234
234
  // ── Tier selection ───────────────────────────────────────────────────────────
235
235
 
236
- export type Tier = 'mini' | 'std' | 'pro';
236
+ /**
237
+ * @deprecated The tier system was removed in 0.5.0 (audit 4.1: "6
238
+ * binaries × no proven benefit"). The type remains exported as `'std'`
239
+ * for backwards compatibility with code that read `engine.getStats().tier`.
240
+ */
241
+ export type Tier = 'std';
237
242
 
238
243
  /**
239
- * Choose the optimal binary tier from a profile.
240
- *
241
- * The thresholds are conservative: a device with `deviceMemory === null`
242
- * (Safari) defaults to `std` to avoid both over- and under-provisioning.
244
+ * @deprecated Always returns `'std'` as of 0.5.0. Albex ships exactly
245
+ * two main binaries (baseline + SIMD); the only runtime variant is the
246
+ * SIMD probe, not a capacity tier. Kept callable so existing integrators
247
+ * don't break, but the value has no operational meaning anymore.
243
248
  */
244
- export function pickTier(profile: DeviceProfile): Tier {
245
- const m = profile.memoryGB;
246
- if (m === null) return 'std';
247
- if (m <= 1) return 'mini';
248
- if (m >= 8) return 'pro';
249
+ export function pickTier(_profile: DeviceProfile): Tier {
249
250
  return 'std';
250
251
  }
251
252
 
package/src/wasm-bindings.ts CHANGED
@@ -17,10 +17,21 @@
17
17
  export interface AlbexWasmExports {
18
18
  readonly memory: WebAssembly.Memory;
19
19
 
20
- // Scratchpad / lifecycle
20
+ // ABI / lifecycle
21
+ abiVersion(): number;
21
22
  getBuffer(size: number): number;
22
23
  init(): void;
23
24
 
25
+ /** Reset the streaming FNV-1a 64-bit hash state. Optional on the first
26
+ * hash of a session because the static initialiser is also FNV_OFFSET. */
27
+ hashBegin(): void;
28
+ /** Fold `len` bytes of scratchpad into the streaming hash. May be
29
+ * called repeatedly for files larger than SCRATCHPAD_SIZE. */
30
+ hashFeed(len: number): void;
31
+ /** Write the final 8 raw big-endian bytes at scratchpad[0..8] and
32
+ * reset the state so the next hash can start without an explicit Begin. */
33
+ hashFinish(): void;
34
+
24
35
  // Document ingestion
25
36
  setDocumentName(len: number): void;
26
37
  beginDocument(): number;
@@ -40,6 +51,13 @@ export interface AlbexWasmExports {
40
51
  setThreshold(threshold: number): void;
41
52
  setMaxResults(max: number): void;
42
53
 
54
+ // Query parsing (since ABI v2). Single source of truth for tokenization.
55
+ prepareQuery(len: number): number;
56
+ getQueryKind(): number;
57
+ getQueryBranchCount(): number;
58
+ getQueryBranchPattern(i: number): number;
59
+ selectQueryBranch(i: number): number;
60
+
43
61
  // Search execution
44
62
  setPattern(len: number): number;
45
63
  search(): number;
@@ -73,12 +91,27 @@ export interface AlbexWasmExports {
73
91
  getDocCount(): number;
74
92
  getTextUsed(): number;
75
93
  getTextCapacity(): number;
94
+ /** Bitflags of capacity limits hit during the most recent
95
+ * begin..endDocument cycle: 1 = chunks, 2 = text, 4 = docs, 8 = names.
96
+ * 0 = everything fit. Read by the host right after endDocument to raise a
97
+ * typed AlbexCapacityError instead of silently truncating the corpus. */
98
+ getLastIndexOverflow(): number;
76
99
 
77
- // Snapshot / restore
100
+ // Snapshot / restore (v3 protocol; v1 and v2 still load)
78
101
  snapshotSize(): number;
79
102
  snapshotChunk(offset: number, maxLen: number): number;
103
+ /** Validate header. For v3 also reserves the staging buffer; state is
104
+ * NOT touched until restoreCommit succeeds. For v1/v2 (legacy) state is
105
+ * reset and counters are written immediately. */
80
106
  restoreBegin(): number;
107
+ /** Feed payload bytes. For v3 they accumulate into staging; for v1/v2
108
+ * they are written straight to the state arrays as before. */
81
109
  restoreFeed(len: number): number;
110
+ /** Atomic commit for v3 snapshots. Returns 1 if the staged payload was
111
+ * complete and decoded successfully; 0 otherwise — and in the 0 case
112
+ * the previous engine state is preserved. For v1/v2 this is a no-op
113
+ * that always returns 1. */
114
+ restoreCommit(): number;
82
115
 
83
116
  // Incremental / per-doc
84
117
  getDocId(index: number): number;
@@ -126,6 +159,10 @@ export interface AlbexWasmExports {
126
159
  export interface AlbexPdfExports {
127
160
  readonly memory: WebAssembly.Memory;
128
161
 
162
+ /** ABI version of the PDF module. The host loader refuses any binary
163
+ * whose abiVersion is outside the supported range. */
164
+ abiVersion(): number;
165
+
129
166
  /** Reserve `len` bytes inside the PDF module and return a pointer. */
130
167
  allocInput(len: number): number;
131
168
 
@@ -183,18 +220,130 @@ export interface AlbexPdfExports {
183
220
  }
184
221
 
185
222
  // ─────────────────────────────────────────────────────────────────────────────
186
- // Narrowing helpers for instantiation results
223
+ // Runtime validators
187
224
  // ─────────────────────────────────────────────────────────────────────────────
225
+ //
226
+ // These replace the pre-0.5.0 `as unknown as` casts. They check three
227
+ // things at instantiation time:
228
+ // 1. memory is a WebAssembly.Memory instance.
229
+ // 2. abiVersion() returns a number inside the supported range.
230
+ // 3. every required export exists and is a function.
231
+ //
232
+ // If any of these fails, the loader throws a typed error before the
233
+ // engine returns from init(). This eliminates the audit 3.2 issue:
234
+ // previously a missing export only surfaced when its call site ran.
188
235
 
189
- /**
190
- * Cast `WebAssembly.Exports` to the typed Albex main interface.
191
- * Runtime check is intentionally minimal — if the .wasm doesn't match,
192
- * the first call site that touches a missing function throws naturally.
193
- */
236
+ /** Range of ABI versions this host code understands for the main module.
237
+ * Update both ends together with the Rust `abiVersion()` constant when
238
+ * the export surface changes. */
239
+ // 0.6.0 requires ABI 3 (trigram pre-filter + getLastIndexOverflow). The
240
+ // required-exports list below already makes any older binary fail the
241
+ // missing-exports check, so a tolerant lower bound was dead code — the range
242
+ // is pinned to the one ABI this host actually speaks (audit 0.6.0, finding #7).
243
+ const MAIN_ABI_MIN = 3;
244
+ const MAIN_ABI_MAX = 3;
245
+
246
+ /** Range of ABI versions for the PDF module. */
247
+ const PDF_ABI_MIN = 1;
248
+ const PDF_ABI_MAX = 3;
249
+
250
+ /** Required function names on the main WASM. Adding a new one here forces
251
+ * the validator to check it; removing one is a breaking ABI bump. */
252
+ const MAIN_REQUIRED = [
253
+ 'abiVersion', 'getBuffer', 'init',
254
+ 'setDocumentName', 'beginDocument', 'feedXmlBytes', 'endDocument',
255
+ 'beginXlsx', 'feedXlsxBytes',
256
+ 'feedText', 'flushParagraph',
257
+ 'setMaxErrors', 'setThreshold', 'setMaxResults',
258
+ 'prepareQuery', 'getQueryKind', 'getQueryBranchCount',
259
+ 'getQueryBranchPattern', 'selectQueryBranch',
260
+ 'setPattern', 'search',
261
+ 'searchBegin', 'searchSlice', 'getSearchCursor', 'getSearchTotal',
262
+ 'getResultCount',
263
+ 'getResultDocId', 'getResultLocation', 'getResultScore',
264
+ 'getResultStart', 'getResultEnd', 'getResultChunkIdx',
265
+ 'getResultDocName', 'getResultMatchCount',
266
+ 'getResultMatchStartAt', 'getResultMatchEndAt',
267
+ 'getSnippet', 'getSnippetWindow', 'getSnippetWindowOffset',
268
+ 'getStatBloomTested', 'getStatBloomPassed', 'getStatBitapMatched',
269
+ 'getChunkCount', 'getDocCount', 'getTextUsed', 'getTextCapacity',
270
+ 'getLastIndexOverflow',
271
+ 'snapshotSize', 'snapshotChunk',
272
+ 'restoreBegin', 'restoreFeed', 'restoreCommit',
273
+ 'getDocId', 'getDocChunkCount', 'getDocName', 'isDocDeleted',
274
+ 'removeDocument', 'compact',
275
+ 'setLanguage',
276
+ 'getTier', 'getMaxChunks', 'getMaxDocs', 'getNameCapacity',
277
+ 'getChunksPtr', 'getChunkStructSize',
278
+ 'setCandidateMask', 'clearCandidateMask',
279
+ 'getDocContentHashPtr', 'getDocContentHashLen', 'setDocumentContentHash',
280
+ 'hashBegin', 'hashFeed', 'hashFinish',
281
+ ] as const;
282
+
283
+ const PDF_REQUIRED = [
284
+ 'abiVersion', 'allocInput', 'extractPdf',
285
+ 'getPageLen', 'getPagePtr', 'getErrorLen', 'getErrorPtr',
286
+ 'getPageCount', 'extractPageImages',
287
+ 'getPageImageLen', 'getPageImagePtr', 'getPageImageKind',
288
+ ] as const;
289
+
290
+ /** Thrown when an instantiated WASM module fails the ABI contract. */
291
+ export class AlbexAbiMismatchError extends Error {
292
+ readonly module: 'main' | 'pdf';
293
+ readonly missing?: readonly string[];
294
+ readonly version?: number;
295
+ constructor(module: 'main' | 'pdf', message: string, opts?: { missing?: readonly string[]; version?: number }) {
296
+ super(message);
297
+ this.name = 'AlbexAbiMismatchError';
298
+ this.module = module;
299
+ if (opts?.missing) this.missing = opts.missing;
300
+ if (opts?.version !== undefined) this.version = opts.version;
301
+ }
302
+ }
303
+
304
+ function validateExports(
305
+ exports: WebAssembly.Exports,
306
+ required: readonly string[],
307
+ module: 'main' | 'pdf',
308
+ abiMin: number,
309
+ abiMax: number,
310
+ ): void {
311
+ const mem = (exports as Record<string, unknown>)['memory'];
312
+ if (!(mem instanceof WebAssembly.Memory)) {
313
+ throw new AlbexAbiMismatchError(module, `${module}: \`memory\` is missing or not a WebAssembly.Memory instance.`);
314
+ }
315
+ const missing: string[] = [];
316
+ for (const name of required) {
317
+ if (typeof (exports as Record<string, unknown>)[name] !== 'function') missing.push(name);
318
+ }
319
+ if (missing.length) {
320
+ throw new AlbexAbiMismatchError(
321
+ module,
322
+ `${module}: WASM binary missing required exports: ${missing.join(', ')}. ` +
323
+ `The .wasm was built with an incompatible source — rebuild with the current toolchain.`,
324
+ { missing },
325
+ );
326
+ }
327
+ const version = ((exports as Record<string, unknown>)['abiVersion'] as () => number)();
328
+ if (version < abiMin || version > abiMax) {
329
+ throw new AlbexAbiMismatchError(
330
+ module,
331
+ `${module}: abiVersion ${version} outside supported range [${abiMin}..${abiMax}]. ` +
332
+ `The host TypeScript expects a different binary — upgrade albex or rebuild the WASM.`,
333
+ { version },
334
+ );
335
+ }
336
+ }
337
+
338
+ /** Validate and narrow `WebAssembly.Exports` to the typed Albex main
339
+ * interface. Throws `AlbexAbiMismatchError` if the contract is broken. */
194
340
  export function asAlbexExports(exports: WebAssembly.Exports): AlbexWasmExports {
341
+ validateExports(exports, MAIN_REQUIRED, 'main', MAIN_ABI_MIN, MAIN_ABI_MAX);
195
342
  return exports as unknown as AlbexWasmExports;
196
343
  }
197
344
 
345
+ /** Validate and narrow `WebAssembly.Exports` to the typed PDF interface. */
198
346
  export function asAlbexPdfExports(exports: WebAssembly.Exports): AlbexPdfExports {
347
+ validateExports(exports, PDF_REQUIRED, 'pdf', PDF_ABI_MIN, PDF_ABI_MAX);
199
348
  return exports as unknown as AlbexPdfExports;
200
349
  }
package/src/worker-runtime.ts CHANGED
@@ -75,8 +75,8 @@ async function dispatch(op: WorkerOp): Promise<unknown> {
75
75
  }
76
76
  }
77
77
 
78
- self.onmessage = async (ev: MessageEvent<WorkerRequest>) => {
79
- const { id, op } = ev.data;
78
+ async function handle(req: WorkerRequest): Promise<void> {
79
+ const { id, op } = req;
80
80
  try {
81
81
  const result = await dispatch(op);
82
82
  const res: WorkerResponse = { id, ok: true, result };
@@ -93,4 +93,14 @@ self.onmessage = async (ev: MessageEvent<WorkerRequest>) => {
93
93
  };
94
94
  (self as unknown as Worker).postMessage(res);
95
95
  }
96
+ }
97
+
98
+ // Process messages strictly in arrival order. The engine guards its own
99
+ // state, but a sync `search` arriving mid-`indexFile` await would otherwise
100
+ // be rejected as "busy"; queueing keeps the worker's externally-observable
101
+ // behaviour serial and matches the main-thread engine's serialization.
102
+ let _queue: Promise<void> = Promise.resolve();
103
+ self.onmessage = (ev: MessageEvent<WorkerRequest>) => {
104
+ const req = ev.data;
105
+ _queue = _queue.then(() => handle(req));
96
106
  };
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file