albex 0.3.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +275 -0
- package/README.md +4 -2
- package/dist/albex-worker.js +1 -1
- package/dist/albex.d.ts +157 -17
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +405 -232
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +16 -2
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +6 -3
- package/dist/errors.js.map +1 -1
- package/dist/persistence.js +1 -1
- package/dist/profile.d.ts +11 -6
- package/dist/profile.d.ts.map +1 -1
- package/dist/profile.js +6 -13
- package/dist/profile.js.map +1 -1
- package/dist/resource-manager.js +1 -1
- package/dist/tiered-store.js +1 -1
- package/dist/wasm-bindings.d.ts +46 -5
- package/dist/wasm-bindings.d.ts.map +1 -1
- package/dist/wasm-bindings.js +102 -7
- package/dist/wasm-bindings.js.map +1 -1
- package/dist/worker-protocol.js +1 -1
- package/dist/worker-runtime.js +12 -3
- package/dist/worker-runtime.js.map +1 -1
- package/package.json +13 -9
- package/src/albex.ts +478 -246
- package/src/errors.ts +18 -2
- package/src/profile.ts +11 -10
- package/src/wasm-bindings.ts +157 -8
- package/src/worker-runtime.ts +12 -2
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_std.wasm +0 -0
- package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/src/errors.ts
CHANGED
|
@@ -51,10 +51,26 @@ export class AlbexParseError extends AlbexError {
|
|
|
51
51
|
}
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
-
/**
|
|
54
|
+
/**
|
|
55
|
+
* Thrown when an indexing operation does not fit: either the scratchpad was
|
|
56
|
+
* too small for a single write, or one of the engine's pools (chunks, text,
|
|
57
|
+
* documents, names) ran out of room mid-document. Before 0.6.0 the latter was
|
|
58
|
+
* silent — the corpus was truncated with no signal.
|
|
59
|
+
*
|
|
60
|
+
* `limit` names which pool overflowed (or `'scratchpad'`), so callers can
|
|
61
|
+
* branch — e.g. start a fresh shard, `compact()`, or surface "library full".
|
|
62
|
+
* When a capacity error is raised during `indexFile`, the engine may hold a
|
|
63
|
+
* partially-indexed copy of the offending document; treat the index as full
|
|
64
|
+
* and stop adding.
|
|
65
|
+
*/
|
|
66
|
+
export type AlbexCapacityLimit = 'chunks' | 'text' | 'docs' | 'names' | 'scratchpad';
|
|
67
|
+
|
|
55
68
|
export class AlbexCapacityError extends AlbexError {
|
|
56
|
-
|
|
69
|
+
/** Which pool overflowed. Undefined for older call sites that didn't set it. */
|
|
70
|
+
readonly limit?: AlbexCapacityLimit;
|
|
71
|
+
constructor(message: string, limit?: AlbexCapacityLimit) {
|
|
57
72
|
super('capacity', message);
|
|
58
73
|
this.name = 'AlbexCapacityError';
|
|
74
|
+
if (limit) this.limit = limit;
|
|
59
75
|
}
|
|
60
76
|
}
|
package/src/profile.ts
CHANGED
|
@@ -233,19 +233,20 @@ export async function detectProfile(opts: { fresh?: boolean } = {}): Promise<Dev
|
|
|
233
233
|
|
|
234
234
|
// ── Tier selection ───────────────────────────────────────────────────────────
|
|
235
235
|
|
|
236
|
-
|
|
236
|
+
/**
|
|
237
|
+
* @deprecated The tier system was removed in 0.5.0 (audit 4.1: "6
|
|
238
|
+
* binaries × no proven benefit"). The type remains exported as `'std'`
|
|
239
|
+
* for backwards compatibility with code that read `engine.getStats().tier`.
|
|
240
|
+
*/
|
|
241
|
+
export type Tier = 'std';
|
|
237
242
|
|
|
238
243
|
/**
|
|
239
|
-
*
|
|
240
|
-
*
|
|
241
|
-
*
|
|
242
|
-
*
|
|
244
|
+
* @deprecated Always returns `'std'` as of 0.5.0. Albex ships exactly
|
|
245
|
+
* two main binaries (baseline + SIMD); the only runtime variant is the
|
|
246
|
+
* SIMD probe, not a capacity tier. Kept callable so existing integrators
|
|
247
|
+
* don't break, but the value has no operational meaning anymore.
|
|
243
248
|
*/
|
|
244
|
-
export function pickTier(
|
|
245
|
-
const m = profile.memoryGB;
|
|
246
|
-
if (m === null) return 'std';
|
|
247
|
-
if (m <= 1) return 'mini';
|
|
248
|
-
if (m >= 8) return 'pro';
|
|
249
|
+
export function pickTier(_profile: DeviceProfile): Tier {
|
|
249
250
|
return 'std';
|
|
250
251
|
}
|
|
251
252
|
|
package/src/wasm-bindings.ts
CHANGED
|
@@ -17,10 +17,21 @@
|
|
|
17
17
|
export interface AlbexWasmExports {
|
|
18
18
|
readonly memory: WebAssembly.Memory;
|
|
19
19
|
|
|
20
|
-
//
|
|
20
|
+
// ABI / lifecycle
|
|
21
|
+
abiVersion(): number;
|
|
21
22
|
getBuffer(size: number): number;
|
|
22
23
|
init(): void;
|
|
23
24
|
|
|
25
|
+
/** Reset the streaming FNV-1a 64-bit hash state. Optional on the first
|
|
26
|
+
* hash of a session because the static initialiser is also FNV_OFFSET. */
|
|
27
|
+
hashBegin(): void;
|
|
28
|
+
/** Fold `len` bytes of scratchpad into the streaming hash. May be
|
|
29
|
+
* called repeatedly for files larger than SCRATCHPAD_SIZE. */
|
|
30
|
+
hashFeed(len: number): void;
|
|
31
|
+
/** Write the final 8 raw big-endian bytes at scratchpad[0..8] and
|
|
32
|
+
* reset the state so the next hash can start without an explicit Begin. */
|
|
33
|
+
hashFinish(): void;
|
|
34
|
+
|
|
24
35
|
// Document ingestion
|
|
25
36
|
setDocumentName(len: number): void;
|
|
26
37
|
beginDocument(): number;
|
|
@@ -40,6 +51,13 @@ export interface AlbexWasmExports {
|
|
|
40
51
|
setThreshold(threshold: number): void;
|
|
41
52
|
setMaxResults(max: number): void;
|
|
42
53
|
|
|
54
|
+
// Query parsing (since ABI v2). Single source of truth for tokenization.
|
|
55
|
+
prepareQuery(len: number): number;
|
|
56
|
+
getQueryKind(): number;
|
|
57
|
+
getQueryBranchCount(): number;
|
|
58
|
+
getQueryBranchPattern(i: number): number;
|
|
59
|
+
selectQueryBranch(i: number): number;
|
|
60
|
+
|
|
43
61
|
// Search execution
|
|
44
62
|
setPattern(len: number): number;
|
|
45
63
|
search(): number;
|
|
@@ -73,12 +91,27 @@ export interface AlbexWasmExports {
|
|
|
73
91
|
getDocCount(): number;
|
|
74
92
|
getTextUsed(): number;
|
|
75
93
|
getTextCapacity(): number;
|
|
94
|
+
/** Bitflags of capacity limits hit during the most recent
|
|
95
|
+
* begin..endDocument cycle: 1 = chunks, 2 = text, 4 = docs, 8 = names.
|
|
96
|
+
* 0 = everything fit. Read by the host right after endDocument to raise a
|
|
97
|
+
* typed AlbexCapacityError instead of silently truncating the corpus. */
|
|
98
|
+
getLastIndexOverflow(): number;
|
|
76
99
|
|
|
77
|
-
// Snapshot / restore
|
|
100
|
+
// Snapshot / restore (v3 protocol; v1 and v2 still load)
|
|
78
101
|
snapshotSize(): number;
|
|
79
102
|
snapshotChunk(offset: number, maxLen: number): number;
|
|
103
|
+
/** Validate header. For v3 also reserves the staging buffer; state is
|
|
104
|
+
* NOT touched until restoreCommit succeeds. For v1/v2 (legacy) state is
|
|
105
|
+
* reset and counters are written immediately. */
|
|
80
106
|
restoreBegin(): number;
|
|
107
|
+
/** Feed payload bytes. For v3 they accumulate into staging; for v1/v2
|
|
108
|
+
* they are written straight to the state arrays as before. */
|
|
81
109
|
restoreFeed(len: number): number;
|
|
110
|
+
/** Atomic commit for v3 snapshots. Returns 1 if the staged payload was
|
|
111
|
+
* complete and decoded successfully; 0 otherwise — and in the 0 case
|
|
112
|
+
* the previous engine state is preserved. For v1/v2 this is a no-op
|
|
113
|
+
* that always returns 1. */
|
|
114
|
+
restoreCommit(): number;
|
|
82
115
|
|
|
83
116
|
// Incremental / per-doc
|
|
84
117
|
getDocId(index: number): number;
|
|
@@ -126,6 +159,10 @@ export interface AlbexWasmExports {
|
|
|
126
159
|
export interface AlbexPdfExports {
|
|
127
160
|
readonly memory: WebAssembly.Memory;
|
|
128
161
|
|
|
162
|
+
/** ABI version of the PDF module. The host loader refuses any binary
|
|
163
|
+
* whose abiVersion is outside the supported range. */
|
|
164
|
+
abiVersion(): number;
|
|
165
|
+
|
|
129
166
|
/** Reserve `len` bytes inside the PDF module and return a pointer. */
|
|
130
167
|
allocInput(len: number): number;
|
|
131
168
|
|
|
@@ -183,18 +220,130 @@ export interface AlbexPdfExports {
|
|
|
183
220
|
}
|
|
184
221
|
|
|
185
222
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
186
|
-
//
|
|
223
|
+
// Runtime validators
|
|
187
224
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
225
|
+
//
|
|
226
|
+
// These replace the pre-0.5.0 `as unknown as` casts. They check three
|
|
227
|
+
// things at instantiation time:
|
|
228
|
+
// 1. memory is a WebAssembly.Memory instance.
|
|
229
|
+
// 2. abiVersion() returns a number inside the supported range.
|
|
230
|
+
// 3. every required export exists and is a function.
|
|
231
|
+
//
|
|
232
|
+
// If any of these fails, the loader throws a typed error before the
|
|
233
|
+
// engine returns from init(). This eliminates the audit 3.2 issue:
|
|
234
|
+
// previously a missing export only surfaced when its call site ran.
|
|
188
235
|
|
|
189
|
-
/**
|
|
190
|
-
*
|
|
191
|
-
*
|
|
192
|
-
|
|
193
|
-
|
|
236
|
+
/** Range of ABI versions this host code understands for the main module.
|
|
237
|
+
* Update both ends together with the Rust `abiVersion()` constant when
|
|
238
|
+
* the export surface changes. */
|
|
239
|
+
// 0.6.0 requires ABI 3 (trigram pre-filter + getLastIndexOverflow). The
|
|
240
|
+
// required-exports list below already makes any older binary fail the
|
|
241
|
+
// missing-exports check, so a tolerant lower bound was dead code — the range
|
|
242
|
+
// is pinned to the one ABI this host actually speaks (audit 0.6.0, finding #7).
|
|
243
|
+
const MAIN_ABI_MIN = 3;
|
|
244
|
+
const MAIN_ABI_MAX = 3;
|
|
245
|
+
|
|
246
|
+
/** Range of ABI versions for the PDF module. */
|
|
247
|
+
const PDF_ABI_MIN = 1;
|
|
248
|
+
const PDF_ABI_MAX = 3;
|
|
249
|
+
|
|
250
|
+
/** Required function names on the main WASM. Adding a new one here forces
|
|
251
|
+
* the validator to check it; removing one is a breaking ABI bump. */
|
|
252
|
+
const MAIN_REQUIRED = [
|
|
253
|
+
'abiVersion', 'getBuffer', 'init',
|
|
254
|
+
'setDocumentName', 'beginDocument', 'feedXmlBytes', 'endDocument',
|
|
255
|
+
'beginXlsx', 'feedXlsxBytes',
|
|
256
|
+
'feedText', 'flushParagraph',
|
|
257
|
+
'setMaxErrors', 'setThreshold', 'setMaxResults',
|
|
258
|
+
'prepareQuery', 'getQueryKind', 'getQueryBranchCount',
|
|
259
|
+
'getQueryBranchPattern', 'selectQueryBranch',
|
|
260
|
+
'setPattern', 'search',
|
|
261
|
+
'searchBegin', 'searchSlice', 'getSearchCursor', 'getSearchTotal',
|
|
262
|
+
'getResultCount',
|
|
263
|
+
'getResultDocId', 'getResultLocation', 'getResultScore',
|
|
264
|
+
'getResultStart', 'getResultEnd', 'getResultChunkIdx',
|
|
265
|
+
'getResultDocName', 'getResultMatchCount',
|
|
266
|
+
'getResultMatchStartAt', 'getResultMatchEndAt',
|
|
267
|
+
'getSnippet', 'getSnippetWindow', 'getSnippetWindowOffset',
|
|
268
|
+
'getStatBloomTested', 'getStatBloomPassed', 'getStatBitapMatched',
|
|
269
|
+
'getChunkCount', 'getDocCount', 'getTextUsed', 'getTextCapacity',
|
|
270
|
+
'getLastIndexOverflow',
|
|
271
|
+
'snapshotSize', 'snapshotChunk',
|
|
272
|
+
'restoreBegin', 'restoreFeed', 'restoreCommit',
|
|
273
|
+
'getDocId', 'getDocChunkCount', 'getDocName', 'isDocDeleted',
|
|
274
|
+
'removeDocument', 'compact',
|
|
275
|
+
'setLanguage',
|
|
276
|
+
'getTier', 'getMaxChunks', 'getMaxDocs', 'getNameCapacity',
|
|
277
|
+
'getChunksPtr', 'getChunkStructSize',
|
|
278
|
+
'setCandidateMask', 'clearCandidateMask',
|
|
279
|
+
'getDocContentHashPtr', 'getDocContentHashLen', 'setDocumentContentHash',
|
|
280
|
+
'hashBegin', 'hashFeed', 'hashFinish',
|
|
281
|
+
] as const;
|
|
282
|
+
|
|
283
|
+
const PDF_REQUIRED = [
|
|
284
|
+
'abiVersion', 'allocInput', 'extractPdf',
|
|
285
|
+
'getPageLen', 'getPagePtr', 'getErrorLen', 'getErrorPtr',
|
|
286
|
+
'getPageCount', 'extractPageImages',
|
|
287
|
+
'getPageImageLen', 'getPageImagePtr', 'getPageImageKind',
|
|
288
|
+
] as const;
|
|
289
|
+
|
|
290
|
+
/** Thrown when an instantiated WASM module fails the ABI contract. */
|
|
291
|
+
export class AlbexAbiMismatchError extends Error {
|
|
292
|
+
readonly module: 'main' | 'pdf';
|
|
293
|
+
readonly missing?: readonly string[];
|
|
294
|
+
readonly version?: number;
|
|
295
|
+
constructor(module: 'main' | 'pdf', message: string, opts?: { missing?: readonly string[]; version?: number }) {
|
|
296
|
+
super(message);
|
|
297
|
+
this.name = 'AlbexAbiMismatchError';
|
|
298
|
+
this.module = module;
|
|
299
|
+
if (opts?.missing) this.missing = opts.missing;
|
|
300
|
+
if (opts?.version !== undefined) this.version = opts.version;
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
function validateExports(
|
|
305
|
+
exports: WebAssembly.Exports,
|
|
306
|
+
required: readonly string[],
|
|
307
|
+
module: 'main' | 'pdf',
|
|
308
|
+
abiMin: number,
|
|
309
|
+
abiMax: number,
|
|
310
|
+
): void {
|
|
311
|
+
const mem = (exports as Record<string, unknown>)['memory'];
|
|
312
|
+
if (!(mem instanceof WebAssembly.Memory)) {
|
|
313
|
+
throw new AlbexAbiMismatchError(module, `${module}: \`memory\` is missing or not a WebAssembly.Memory instance.`);
|
|
314
|
+
}
|
|
315
|
+
const missing: string[] = [];
|
|
316
|
+
for (const name of required) {
|
|
317
|
+
if (typeof (exports as Record<string, unknown>)[name] !== 'function') missing.push(name);
|
|
318
|
+
}
|
|
319
|
+
if (missing.length) {
|
|
320
|
+
throw new AlbexAbiMismatchError(
|
|
321
|
+
module,
|
|
322
|
+
`${module}: WASM binary missing required exports: ${missing.join(', ')}. ` +
|
|
323
|
+
`The .wasm was built with an incompatible source — rebuild with the current toolchain.`,
|
|
324
|
+
{ missing },
|
|
325
|
+
);
|
|
326
|
+
}
|
|
327
|
+
const version = ((exports as Record<string, unknown>)['abiVersion'] as () => number)();
|
|
328
|
+
if (version < abiMin || version > abiMax) {
|
|
329
|
+
throw new AlbexAbiMismatchError(
|
|
330
|
+
module,
|
|
331
|
+
`${module}: abiVersion ${version} outside supported range [${abiMin}..${abiMax}]. ` +
|
|
332
|
+
`The host TypeScript expects a different binary — upgrade albex or rebuild the WASM.`,
|
|
333
|
+
{ version },
|
|
334
|
+
);
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
/** Validate and narrow `WebAssembly.Exports` to the typed Albex main
|
|
339
|
+
* interface. Throws `AlbexAbiMismatchError` if the contract is broken. */
|
|
194
340
|
export function asAlbexExports(exports: WebAssembly.Exports): AlbexWasmExports {
|
|
341
|
+
validateExports(exports, MAIN_REQUIRED, 'main', MAIN_ABI_MIN, MAIN_ABI_MAX);
|
|
195
342
|
return exports as unknown as AlbexWasmExports;
|
|
196
343
|
}
|
|
197
344
|
|
|
345
|
+
/** Validate and narrow `WebAssembly.Exports` to the typed PDF interface. */
|
|
198
346
|
export function asAlbexPdfExports(exports: WebAssembly.Exports): AlbexPdfExports {
|
|
347
|
+
validateExports(exports, PDF_REQUIRED, 'pdf', PDF_ABI_MIN, PDF_ABI_MAX);
|
|
199
348
|
return exports as unknown as AlbexPdfExports;
|
|
200
349
|
}
|
package/src/worker-runtime.ts
CHANGED
|
@@ -75,8 +75,8 @@ async function dispatch(op: WorkerOp): Promise<unknown> {
|
|
|
75
75
|
}
|
|
76
76
|
}
|
|
77
77
|
|
|
78
|
-
|
|
79
|
-
const { id, op } =
|
|
78
|
+
async function handle(req: WorkerRequest): Promise<void> {
|
|
79
|
+
const { id, op } = req;
|
|
80
80
|
try {
|
|
81
81
|
const result = await dispatch(op);
|
|
82
82
|
const res: WorkerResponse = { id, ok: true, result };
|
|
@@ -93,4 +93,14 @@ self.onmessage = async (ev: MessageEvent<WorkerRequest>) => {
|
|
|
93
93
|
};
|
|
94
94
|
(self as unknown as Worker).postMessage(res);
|
|
95
95
|
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// Process messages strictly in arrival order. The engine guards its own
|
|
99
|
+
// state, but a sync `search` arriving mid-`indexFile` await would otherwise
|
|
100
|
+
// be rejected as "busy"; queueing keeps the worker's externally-observable
|
|
101
|
+
// behaviour serial and matches the main-thread engine's serialization.
|
|
102
|
+
let _queue: Promise<void> = Promise.resolve();
|
|
103
|
+
self.onmessage = (ev: MessageEvent<WorkerRequest>) => {
|
|
104
|
+
const req = ev.data;
|
|
105
|
+
_queue = _queue.then(() => handle(req));
|
|
96
106
|
};
|
package/wasm/pkg/albex_pdf.wasm
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|