albex 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +223 -0
- package/README.md +84 -30
- package/dist/_generated/inline-wasm.d.ts +2 -0
- package/dist/_generated/inline-wasm.d.ts.map +1 -0
- package/dist/_generated/inline-wasm.js +9 -0
- package/dist/_generated/inline-wasm.js.map +1 -0
- package/dist/albex-worker.d.ts +65 -2
- package/dist/albex-worker.d.ts.map +1 -1
- package/dist/albex-worker.js +98 -21
- package/dist/albex-worker.js.map +1 -1
- package/dist/albex.d.ts +250 -42
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +492 -120
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +35 -4
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +38 -3
- package/dist/errors.js.map +1 -1
- package/dist/index.d.ts +47 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +82 -0
- package/dist/index.js.map +1 -0
- package/dist/inline.d.ts +10 -0
- package/dist/inline.d.ts.map +1 -0
- package/dist/inline.js +17 -0
- package/dist/inline.js.map +1 -0
- package/dist/persistence.js +2 -2
- package/dist/pool/coordinator.d.ts +14 -6
- package/dist/pool/coordinator.d.ts.map +1 -1
- package/dist/pool/coordinator.js +65 -28
- package/dist/pool/coordinator.js.map +1 -1
- package/dist/profile.js +2 -2
- package/dist/resource-manager.js +2 -2
- package/dist/tiered-store.js +2 -2
- package/dist/wasm-bindings.d.ts +50 -1
- package/dist/wasm-bindings.d.ts.map +1 -1
- package/dist/wasm-bindings.js +20 -12
- package/dist/wasm-bindings.js.map +1 -1
- package/dist/worker-protocol.d.ts +23 -2
- package/dist/worker-protocol.d.ts.map +1 -1
- package/dist/worker-protocol.js +2 -2
- package/dist/worker-runtime.js +17 -2
- package/dist/worker-runtime.js.map +1 -1
- package/package.json +14 -9
- package/src/_generated/inline-wasm.ts +9 -0
- package/src/albex-worker.ts +103 -18
- package/src/albex.ts +3053 -2524
- package/src/errors.ts +49 -4
- package/src/index.ts +81 -0
- package/src/inline.ts +9 -0
- package/src/pool/coordinator.ts +61 -34
- package/src/wasm-bindings.ts +78 -12
- package/src/worker-protocol.ts +12 -2
- package/src/worker-runtime.ts +16 -1
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/src/errors.ts
CHANGED
|
@@ -57,20 +57,65 @@ export class AlbexParseError extends AlbexError {
|
|
|
57
57
|
* documents, names) ran out of room mid-document. Before 0.6.0 the latter was
|
|
58
58
|
* silent — the corpus was truncated with no signal.
|
|
59
59
|
*
|
|
60
|
-
* `limit` names which pool overflowed (or `'scratchpad'`
|
|
60
|
+
* `limit` names which pool overflowed (or `'scratchpad'`, or `'file'` when an
|
|
61
|
+
* input file exceeds `maxFileBytes` before any byte is read), so callers can
|
|
61
62
|
* branch — e.g. start a fresh shard, `compact()`, or surface "library full".
|
|
62
63
|
* When a capacity error is raised during `indexFile`, the engine may hold a
|
|
63
64
|
* partially-indexed copy of the offending document; treat the index as full
|
|
64
|
-
* and stop adding.
|
|
65
|
+
* and stop adding. A `'file'` capacity error is raised BEFORE the file is
|
|
66
|
+
* read, so the index is untouched and fully usable.
|
|
65
67
|
*/
|
|
66
|
-
export type AlbexCapacityLimit = 'chunks' | 'text' | 'docs' | 'names' | 'scratchpad';
|
|
68
|
+
export type AlbexCapacityLimit = 'chunks' | 'text' | 'docs' | 'names' | 'scratchpad' | 'file';
|
|
67
69
|
|
|
68
70
|
/**
 * Typed error for "ran out of room" conditions.
 *
 * Carries, when the throw site knows them, the name of the limit that was
 * hit and the numeric capacity that was configured at that moment, so callers
 * can branch on `limit` instead of parsing the message.
 */
export class AlbexCapacityError extends AlbexError {
  /** Name of the limit that overflowed. Left unset by call sites that do not pass one. */
  readonly limit?: AlbexCapacityLimit;

  /**
   * Runtime numeric capacity of the limit named by `limit`, i.e. the value
   * the engine is actually configured with (for instance `4` when
   * `capacity: { maxDocs: 4 }` overflows the document table, `128` for the
   * std default). Units follow the limit: documents for `'docs'`, chunks for
   * `'chunks'`, bytes for `'text'` / `'names'` / `'scratchpad'` / `'file'`.
   * Left unset when the throw site does not know the limit.
   */
  readonly max?: number;

  /**
   * @param message Human-readable description of the overflow.
   * @param limit   Which limit was hit; omitted by older call sites.
   * @param max     Configured capacity of that limit, when known.
   */
  constructor(message: string, limit?: AlbexCapacityLimit, max?: number) {
    // 'capacity' is forwarded as the first AlbexError argument
    // (presumably the error kind — confirm against AlbexError).
    super('capacity', message);
    this.name = 'AlbexCapacityError';

    // Assign only what was actually supplied so absent metadata stays unset.
    if (limit) {
      this.limit = limit;
    }
    if (max !== undefined) {
      this.max = max;
    }
  }
}
|
|
89
|
+
|
|
90
|
+
/**
|
|
91
|
+
* Default `maxFileBytes` for `indexFile`: 256 MiB. Far above anything the
|
|
92
|
+
* ~16–21 MB text pool could ever absorb, so legitimate documents are never
|
|
93
|
+
* rejected — the guard only exists to refuse pathological inputs (a 2 GB
|
|
94
|
+
* file would otherwise be fully buffered AND hashed before the first
|
|
95
|
+
* capacity check could fire).
|
|
96
|
+
*/
|
|
97
|
+
export const DEFAULT_MAX_FILE_BYTES = 256 * 1024 * 1024;
|
|
98
|
+
|
|
99
|
+
/**
 * Pre-read size guard for `indexFile`.
 *
 * Compares `file.size` against the configured cap and throws a typed
 * {@link AlbexCapacityError} (`limit: 'file'`, `max` = the effective cap)
 * when the file is larger. Only the `name` and `size` properties are
 * consulted, so the check runs before any byte of the file is read into
 * memory. Intended to be shared by the engine, the worker wrapper and the
 * pool coordinator so the guard fires on whichever thread would otherwise
 * buffer the bytes.
 *
 * @param file         Anything exposing `name` and `size` (e.g. a `File`).
 * @param maxFileBytes Cap in bytes. `undefined`, `null` or `NaN` selects
 *                     {@link DEFAULT_MAX_FILE_BYTES}; `Infinity` disables
 *                     the guard explicitly.
 * @throws {AlbexCapacityError} When `file.size` exceeds the effective cap.
 */
export function assertFileSizeWithinLimit(
  file: { name: string; size: number },
  maxFileBytes?: number,
): void {
  // A NaN cap would make `file.size > cap` false for every file and silently
  // switch the guard off, so treat it like "not configured".
  const cap =
    maxFileBytes == null || Number.isNaN(maxFileBytes)
      ? DEFAULT_MAX_FILE_BYTES
      : maxFileBytes;

  if (file.size > cap) {
    throw new AlbexCapacityError(
      `"${file.name}" is ${file.size} bytes, above the maxFileBytes limit of ` +
        `${cap}. The file was not read or indexed. Raise \`maxFileBytes\` in ` +
        `AlbexOptions if this is intentional.`,
      'file',
      cap,
    );
  }
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Albex — public entry point. Install, import, use:
|
|
3
|
+
*
|
|
4
|
+
* ```ts
|
|
5
|
+
* import { AlbexEngine } from 'albex';
|
|
6
|
+
*
|
|
7
|
+
* const engine = new AlbexEngine(); // nothing to serve, nothing to configure
|
|
8
|
+
* await engine.init();
|
|
9
|
+
* ```
|
|
10
|
+
*
|
|
11
|
+
* The ~47 KB baseline core is base64-embedded in this module (decoded once,
|
|
12
|
+
* lazily; ~19 KB gzipped over the wire). `new AlbexEngine()` instantiates
|
|
13
|
+
* those bytes directly — NO network fetch, NO `new URL('…wasm',
|
|
14
|
+
* import.meta.url)` asset resolution — so it works in **every** bundler,
|
|
15
|
+
* esbuild / Angular / Webpack included, with zero setup.
|
|
16
|
+
*
|
|
17
|
+
* Advanced options stay available and are never required:
|
|
18
|
+
* - `wasmBaseUrl` — serve the baseline + SIMD binaries yourself (gets you
|
|
19
|
+
* the SIMD core on capable hosts; the embedded default is baseline-only).
|
|
20
|
+
* - `wasmUrl` — a single explicit core URL (e.g. a CDN).
|
|
21
|
+
* - `wasmBytes` — hand the engine bytes you loaded yourself.
|
|
22
|
+
* When any of these is set, the embedded core is bypassed.
|
|
23
|
+
*
|
|
24
|
+
* The PDF module (~1.2 MB) is too large to embed; it still loads on demand
|
|
25
|
+
* from `pdfWasmUrl` (default: resolved next to the package) or `pdfWasmBytes`.
|
|
26
|
+
*
|
|
27
|
+
* Everything else (`AlbexPool`, `TieredStore`, errors, types, …) is
|
|
28
|
+
* re-exported unchanged.
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import { AlbexEngine as CoreAlbexEngine, type AlbexOptions } from './albex.js';
|
|
32
|
+
import { ALBEX_WASM_BASE64 } from './_generated/inline-wasm.js';
|
|
33
|
+
|
|
34
|
+
// Re-export the whole public surface. The local `AlbexEngine` declared below
|
|
35
|
+
// takes precedence over the one this star-export would otherwise bring in.
|
|
36
|
+
export * from './albex.js';
|
|
37
|
+
|
|
38
|
+
/**
 * Decode a base64 string into its raw bytes.
 *
 * Relies on the global `atob`, which is available in the DOM and in
 * Node ≥ 18 (this package's floor), so no Buffer / environment branch is
 * needed.
 *
 * @param b64 Base64-encoded text.
 * @returns A new `Uint8Array` holding one element per decoded byte.
 */
function decodeBase64(b64: string): Uint8Array {
  // `atob` returns a binary string: one character per byte, char code 0–255.
  const binary = atob(b64);
  return Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
}
|
|
46
|
+
|
|
47
|
+
/** Decoded core bytes; `null` until the first `albexWasmBytes()` call. */
let _cached: Uint8Array | null = null;

/**
 * The embedded baseline core as raw bytes.
 *
 * The base64 payload is decoded on the first call and the result is kept in
 * a module-level cache, so every later call (and every engine built from
 * this entry point) reuses the same decode. Exposed for callers that want
 * the bytes directly (e.g. to seed a worker with the same core).
 *
 * Note: the returned array IS the cached instance, not a copy — mutating it,
 * or transferring its underlying buffer, affects every subsequent caller.
 *
 * @returns The shared `Uint8Array` containing the core module bytes.
 */
export function albexWasmBytes(): Uint8Array {
  const bytes = _cached ?? decodeBase64(ALBEX_WASM_BASE64);
  _cached = bytes;
  return bytes;
}
|
|
58
|
+
|
|
59
|
+
/**
 * The Albex search engine with the embedded baseline core wired in by
 * default.
 *
 * Accepts any normal {@link AlbexOptions}. If the caller supplies
 * `wasmBytes`, `wasmUrl` or `wasmBaseUrl`, the options are handed to the
 * core engine untouched, so advanced (SIMD / CDN) setups keep working
 * unchanged. Otherwise the embedded core bytes are injected as `wasmBytes`.
 */
export class AlbexEngine extends CoreAlbexEngine {
  /**
   * @param opts Engine options; defaults to an empty object.
   */
  constructor(opts: AlbexOptions = {}) {
    // True only when the caller picked none of the three core sources.
    const useEmbeddedCore =
      opts.wasmBytes == null && opts.wasmUrl == null && opts.wasmBaseUrl == null;

    super(
      useEmbeddedCore
        ? // The cast bridges TS 5.7's `BufferSource`, which rejects
          // `Uint8Array<ArrayBufferLike>` because of SharedArrayBuffer
          // variance; a plain Uint8Array is a valid BufferSource at runtime.
          { ...opts, wasmBytes: albexWasmBytes() as BufferSource }
        : // Caller chose a source: pass the options through as-is.
          opts,
    );
  }
}
|
package/src/inline.ts
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `albex/inline` — compatibility alias for the default entry.
|
|
3
|
+
*
|
|
4
|
+
* The embedded core is now the DEFAULT (`import { AlbexEngine } from 'albex'`
|
|
5
|
+
* already needs no setup), so this subpath exists only so existing
|
|
6
|
+
* `from 'albex/inline'` imports keep resolving. It re-exports the default
|
|
7
|
+
* entry verbatim — there is no behavioural difference.
|
|
8
|
+
*/
|
|
9
|
+
export * from './index.js';
|
package/src/pool/coordinator.ts
CHANGED
|
@@ -30,9 +30,8 @@ import type {
|
|
|
30
30
|
EngineStats,
|
|
31
31
|
SearchStats,
|
|
32
32
|
} from '../albex.js';
|
|
33
|
-
import type { Tier } from '../profile.js';
|
|
34
33
|
import { detectProfile, pickWorkerCount } from '../profile.js';
|
|
35
|
-
import { AlbexInitError, AlbexError } from '../errors.js';
|
|
34
|
+
import { AlbexInitError, AlbexError, assertFileSizeWithinLimit } from '../errors.js';
|
|
36
35
|
import type {
|
|
37
36
|
WorkerRequest,
|
|
38
37
|
WorkerResponse,
|
|
@@ -77,7 +76,9 @@ export class AlbexPool {
|
|
|
77
76
|
private _docsCache: IndexedDocument[] = [];
|
|
78
77
|
private _rrCursor = 0;
|
|
79
78
|
private _lastSearch: SearchStats | null = null;
|
|
80
|
-
|
|
79
|
+
/** Global result cap applied AFTER the cross-shard merge. Mirrors the
|
|
80
|
+
* last `setMaxResults` call (the WASM engine default is 50). */
|
|
81
|
+
private _maxResults = 50;
|
|
81
82
|
|
|
82
83
|
constructor(opts: AlbexPoolOptions) {
|
|
83
84
|
this._opts = opts;
|
|
@@ -98,23 +99,21 @@ export class AlbexPool {
|
|
|
98
99
|
console.warn('[albex] pool mode=shared requested but cross-origin isolation is not active; falling back to replicated');
|
|
99
100
|
}
|
|
100
101
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
102
|
+
// Forward every serializable engine option to the shards; strip the
|
|
103
|
+
// pool-only fields (workerUrl/workers/mode) and anything non-clonable.
|
|
104
|
+
// Same policy as AlbexEngineWorker.init (audit 1.4).
|
|
105
|
+
const shardOpts: AlbexOptions = {};
|
|
106
|
+
for (const [k, v] of Object.entries(this._opts)) {
|
|
107
|
+
if (k === 'workerUrl' || k === 'workers' || k === 'mode') continue;
|
|
108
|
+
if (v === undefined || typeof v === 'function') continue;
|
|
109
|
+
(shardOpts as Record<string, unknown>)[k] = v;
|
|
110
|
+
}
|
|
108
111
|
|
|
109
112
|
for (let i = 0; i < n; i++) {
|
|
110
113
|
const shard = this._spawnShard();
|
|
111
114
|
await this._send(shard, { kind: 'init', opts: shardOpts });
|
|
112
115
|
this._shards.push(shard);
|
|
113
116
|
}
|
|
114
|
-
|
|
115
|
-
// Tier is the same across shards — capture it from shard 0 stats.
|
|
116
|
-
const stats0 = await this._send<EngineStats>(this._shards[0]!, { kind: 'getStats' });
|
|
117
|
-
this._tier = stats0.tier;
|
|
118
117
|
}
|
|
119
118
|
|
|
120
119
|
// ── Shard plumbing ─────────────────────────────────────────────────────
|
|
@@ -168,6 +167,9 @@ export class AlbexPool {
|
|
|
168
167
|
|
|
169
168
|
async indexFile(file: File): Promise<IndexedDocument> {
|
|
170
169
|
if (this._shards.length === 0) throw new AlbexInitError('Pool not initialised');
|
|
170
|
+
// Size guard BEFORE reading — same limit the shard engine enforces, but
|
|
171
|
+
// checked here so an oversized file is never buffered on the main thread.
|
|
172
|
+
assertFileSizeWithinLimit(file, this._opts.maxFileBytes);
|
|
171
173
|
const idx = this._rrCursor++ % this._shards.length;
|
|
172
174
|
const shard = this._shards[idx]!;
|
|
173
175
|
const buffer = await file.arrayBuffer();
|
|
@@ -185,30 +187,50 @@ export class AlbexPool {
|
|
|
185
187
|
* Map-reduce search:
|
|
186
188
|
* 1. broadcast the query to every shard,
|
|
187
189
|
* 2. each shard runs its local Bloom→Bitap→top-K,
|
|
188
|
-
* 3. coordinator merges the K-best from each shard,
|
|
190
|
+
* 3. coordinator merges the K-best from each shard, deduplicating
|
|
191
|
+
* identical hits (same document name, chunk id and match offset —
|
|
192
|
+
* the case where the same file was indexed into two shards),
|
|
189
193
|
* 4. global top-K returned in descending score order.
|
|
190
194
|
*
|
|
191
195
|
* Capped to `setMaxResults` results (default 50) AFTER merge.
|
|
196
|
+
*
|
|
197
|
+
* Per-shard search stats are requested in the same posting batch as the
|
|
198
|
+
* search itself: each worker processes its queue in arrival order, so the
|
|
199
|
+
* stats reply is guaranteed to belong to THIS query even when several
|
|
200
|
+
* `search()` calls overlap.
|
|
192
201
|
*/
|
|
193
202
|
async search(query: string, opts: SearchOptions = {}): Promise<SearchResult[]> {
|
|
194
203
|
if (this._shards.length === 0) return [];
|
|
195
204
|
const t0 = performance.now();
|
|
196
|
-
|
|
197
|
-
|
|
205
|
+
// Post `search` and `getLastSearchStats` back-to-back (synchronously,
|
|
206
|
+
// per shard) so no other request can slot between them in the worker's
|
|
207
|
+
// FIFO queue — the stats round can't race a subsequent search.
|
|
208
|
+
const perShard = await Promise.all(
|
|
209
|
+
this._shards.map(s => {
|
|
210
|
+
const results = this._send<SearchResult[]>(s, { kind: 'search', query, options: opts });
|
|
211
|
+
const stats = this._send<SearchStats | null>(s, { kind: 'getLastSearchStats' });
|
|
212
|
+
return Promise.all([results, stats] as const);
|
|
213
|
+
}),
|
|
198
214
|
);
|
|
199
215
|
|
|
200
|
-
// Merge:
|
|
201
|
-
//
|
|
202
|
-
//
|
|
203
|
-
|
|
216
|
+
// Merge: flatten + dedup + sort (descending) + global cap. Shard-local
|
|
217
|
+
// docIds collide across shards, so the dedup identity also includes the
|
|
218
|
+
// document name: it only collapses true duplicates (the same document
|
|
219
|
+
// indexed into more than one shard yields identical name/chunkId/offset).
|
|
220
|
+
const seen = new Set<string>();
|
|
221
|
+
const merged: SearchResult[] = [];
|
|
222
|
+
for (const [bucket] of perShard) {
|
|
223
|
+
for (const r of bucket) {
|
|
224
|
+
const key = `${r.documentName}:${r.chunkId}:${r.matchStart}`;
|
|
225
|
+
if (!seen.has(key)) { seen.add(key); merged.push(r); }
|
|
226
|
+
}
|
|
227
|
+
}
|
|
204
228
|
merged.sort((a, b) => b.score - a.score);
|
|
229
|
+
const capped = merged.slice(0, this._maxResults);
|
|
205
230
|
|
|
206
231
|
// Aggregate search stats across shards.
|
|
207
|
-
const stats = await Promise.all(
|
|
208
|
-
this._shards.map(s => this._send<SearchStats | null>(s, { kind: 'getLastSearchStats' })),
|
|
209
|
-
);
|
|
210
232
|
let bloomTested = 0, bloomPassed = 0, bitapMatched = 0;
|
|
211
|
-
for (const s of
|
|
233
|
+
for (const [, s] of perShard) {
|
|
212
234
|
if (!s) continue;
|
|
213
235
|
bloomTested += s.bloomTested;
|
|
214
236
|
bloomPassed += s.bloomPassed;
|
|
@@ -217,11 +239,11 @@ export class AlbexPool {
|
|
|
217
239
|
this._lastSearch = {
|
|
218
240
|
query,
|
|
219
241
|
timeMs: performance.now() - t0,
|
|
220
|
-
results:
|
|
242
|
+
results: capped.length,
|
|
221
243
|
bloomTested, bloomPassed, bitapMatched,
|
|
222
244
|
};
|
|
223
245
|
|
|
224
|
-
return
|
|
246
|
+
return capped;
|
|
225
247
|
}
|
|
226
248
|
|
|
227
249
|
/**
|
|
@@ -273,11 +295,13 @@ export class AlbexPool {
|
|
|
273
295
|
for (const s of this._shards) s.docCount = 0;
|
|
274
296
|
}
|
|
275
297
|
|
|
276
|
-
/** Aggregate engine stats across all shards.
|
|
298
|
+
/** Aggregate engine stats across all shards. Capacities are the RUNTIME
|
|
299
|
+
* per-shard capacities summed (each shard was initialised with the same
|
|
300
|
+
* `capacity` option, forwarded through the wire protocol). */
|
|
277
301
|
async getStats(): Promise<EngineStats> {
|
|
278
302
|
const all = await this._broadcast<EngineStats>({ kind: 'getStats' });
|
|
279
303
|
let documents = 0, chunks = 0, textUsed = 0, textCapacity = 0, wasmMemoryBytes = 0;
|
|
280
|
-
let maxChunks = 0, maxDocs = 0;
|
|
304
|
+
let maxChunks = 0, maxDocs = 0, namePoolBytes = 0;
|
|
281
305
|
for (const s of all) {
|
|
282
306
|
documents += s.documents;
|
|
283
307
|
chunks += s.chunks;
|
|
@@ -286,10 +310,11 @@ export class AlbexPool {
|
|
|
286
310
|
wasmMemoryBytes += s.wasmMemoryBytes;
|
|
287
311
|
maxChunks += s.maxChunks;
|
|
288
312
|
maxDocs += s.maxDocs;
|
|
313
|
+
namePoolBytes += s.namePoolBytes;
|
|
289
314
|
}
|
|
290
315
|
return {
|
|
291
316
|
documents, chunks, textUsed, textCapacity, wasmMemoryBytes,
|
|
292
|
-
|
|
317
|
+
maxChunks, maxDocs, namePoolBytes,
|
|
293
318
|
};
|
|
294
319
|
}
|
|
295
320
|
|
|
@@ -303,15 +328,17 @@ export class AlbexPool {
|
|
|
303
328
|
|
|
304
329
|
async setMaxErrors(n: 0 | 1 | 2 | 3): Promise<void> { await this._broadcast({ kind: 'setMaxErrors', n }); }
|
|
305
330
|
async setThreshold(n: number): Promise<void> { await this._broadcast({ kind: 'setThreshold', n }); }
|
|
306
|
-
async setMaxResults(n: number): Promise<void> {
|
|
331
|
+
/**
 * Set the maximum number of results returned by `search()`.
 *
 * The value is recorded locally (floored, then clamped to 1–200) so that
 * `search()` can apply the cap globally after the cross-shard merge, and the
 * original `n` is broadcast to every shard.
 *
 * A `NaN` argument is clamped to the minimum cap of 1. Without that,
 * `Math.floor(NaN)` would propagate through `Math.min`/`Math.max`, leave
 * `_maxResults` as `NaN`, and make `search()`'s `slice(0, _maxResults)`
 * return an empty array for every query.
 *
 * @param n Requested result cap.
 */
async setMaxResults(n: number): Promise<void> {
  // NOTE(review): the 1–200 clamp is meant to mirror the WASM engine's own
  // clamp — confirm the two stay in sync.
  const requested = Number.isNaN(n) ? 1 : Math.floor(n);
  this._maxResults = Math.max(1, Math.min(200, requested));
  await this._broadcast({ kind: 'setMaxResults', n });
}
|
|
307
337
|
async setLanguage(lang: 'off' | 'es'): Promise<void> { await this._broadcast({ kind: 'setLanguage', lang }); }
|
|
308
338
|
|
|
309
339
|
/** Number of shards currently running. */
|
|
310
340
|
get workerCount(): number { return this._shards.length; }
|
|
311
341
|
|
|
312
|
-
/** Tier loaded by the shards (same value across all of them). */
|
|
313
|
-
get tier(): Tier | null { return this._tier; }
|
|
314
|
-
|
|
315
342
|
[Symbol.dispose](): void {
|
|
316
343
|
for (const s of this._shards) {
|
|
317
344
|
for (const [, p] of s.pending) p.reject(new AlbexError('disposed', 'Pool disposed'));
|
package/src/wasm-bindings.ts
CHANGED
|
@@ -20,7 +20,23 @@ export interface AlbexWasmExports {
|
|
|
20
20
|
// ABI / lifecycle
|
|
21
21
|
abiVersion(): number;
|
|
22
22
|
getBuffer(size: number): number;
|
|
23
|
+
/** Reset with the std default capacities (128 docs · 100k chunks · 16 MB
|
|
24
|
+
* text · 32 KB names) — identical behaviour to every pre-ABI-7 release. */
|
|
23
25
|
init(): void;
|
|
26
|
+
/** (Re-)initialise the engine with runtime capacities (ABI 7, decision
|
|
27
|
+
* A16). Allocates the capacity-dependent pools on the WASM heap. Returns
|
|
28
|
+
* 1 on success; 0 on invalid parameters (floors/ceilings documented in
|
|
29
|
+
* wasm/src/lib.rs: ≥1 doc, docs ≤ 65 536, chunks ≥ docs and ≤ 4 M, text
|
|
30
|
+
* pool 4 KiB–1 GiB, name pool 256 B–16 MiB) or on allocation failure —
|
|
31
|
+
* never traps. Re-init with the same capacities is a plain reset; with
|
|
32
|
+
* different capacities the pools are freed and re-allocated (no leak,
|
|
33
|
+
* but the linear-memory high-water mark never shrinks). */
|
|
34
|
+
initWithCapacity(
|
|
35
|
+
maxDocs: number,
|
|
36
|
+
maxChunks: number,
|
|
37
|
+
textPoolBytes: number,
|
|
38
|
+
namePoolBytes: number,
|
|
39
|
+
): number;
|
|
24
40
|
|
|
25
41
|
/** Reset the streaming FNV-1a 64-bit hash state. Optional on the first
|
|
26
42
|
* hash of a session because the static initialiser is also FNV_OFFSET. */
|
|
@@ -57,6 +73,11 @@ export interface AlbexWasmExports {
|
|
|
57
73
|
getQueryBranchCount(): number;
|
|
58
74
|
getQueryBranchPattern(i: number): number;
|
|
59
75
|
selectQueryBranch(i: number): number;
|
|
76
|
+
/** Bitflags of what the most recent prepareQuery dropped or clipped
|
|
77
|
+
* (ABI 5): 1 = OR branches beyond 8 discarded, 2 = tokens dropped
|
|
78
|
+
* (> 4 per branch) or clipped (> 64 bytes), 4 = raw query cut at
|
|
79
|
+
* 1024 bytes. 0 = compiled in full. */
|
|
80
|
+
getQueryTruncationFlags(): number;
|
|
60
81
|
|
|
61
82
|
// Search execution
|
|
62
83
|
setPattern(len: number): number;
|
|
@@ -68,6 +89,15 @@ export interface AlbexWasmExports {
|
|
|
68
89
|
getSearchTotal(): number;
|
|
69
90
|
|
|
70
91
|
// Result accessors
|
|
92
|
+
/** Base pointer of the `#[repr(C)]` RESULTS array (ABI 6). Read
|
|
93
|
+
* `getResultCount()` records of `getResultStride()` bytes each with one
|
|
94
|
+
* DataView pass instead of ~12 frontier calls per result. Field offsets
|
|
95
|
+
* are documented (and compile-time asserted) in wasm/src/lib.rs. Copy
|
|
96
|
+
* everything out before any call that could grow WASM memory. */
|
|
97
|
+
getResultsPtr(): number;
|
|
98
|
+
/** Byte stride between consecutive RESULTS records (= sizeof(DocMatch),
|
|
99
|
+
* 60 today). Exported so the host never hardcodes the struct size. */
|
|
100
|
+
getResultStride(): number;
|
|
71
101
|
getResultCount(): number;
|
|
72
102
|
getResultDocId(i: number): number;
|
|
73
103
|
getResultLocation(i: number): number;
|
|
@@ -121,6 +151,27 @@ export interface AlbexWasmExports {
|
|
|
121
151
|
removeDocument(docId: number): number;
|
|
122
152
|
compact(): void;
|
|
123
153
|
|
|
154
|
+
// Authoritative chunk enumeration (ABI 4). Address a document's chunks by
|
|
155
|
+
// (doc slot, ordinal). The compact()-stable key is (doc_id, ordinal).
|
|
156
|
+
/** First CHUNKS[] index for the document at slot `index` (= chunk_start).
|
|
157
|
+
* `ord = resultChunkIdx - getDocChunkBase(slot)` gives the doc-relative
|
|
158
|
+
* ordinal of a search hit. */
|
|
159
|
+
getDocChunkBase(index: number): number;
|
|
160
|
+
/** `location` (paragraph/page) of the `ord`-th chunk; u32::MAX if OOB. */
|
|
161
|
+
getChunkLocationAt(index: number, ord: number): number;
|
|
162
|
+
/** Byte length of the `ord`-th chunk's text; 0 if OOB. */
|
|
163
|
+
getChunkByteLenAt(index: number, ord: number): number;
|
|
164
|
+
/** Copy the `ord`-th chunk's UTF-8 text into the scratchpad; returns byte
|
|
165
|
+
* length (0 if OOB). Lets a host enumerate a doc's authoritative chunks
|
|
166
|
+
* right after indexing, with no query. */
|
|
167
|
+
getChunkTextAt(index: number, ord: number): number;
|
|
168
|
+
/** Batch chunk enumeration (ABI 6). Packs up to `maxChunks` records
|
|
169
|
+
* `[u32 text_len][u32 location][text bytes]` (LE, tightly packed) into
|
|
170
|
+
* the scratchpad starting at ordinal `startOrd`; returns how many were
|
|
171
|
+
* written. One frontier call per scratchpad-full instead of 2-3 per
|
|
172
|
+
* chunk. */
|
|
173
|
+
listChunksBatch(index: number, startOrd: number, maxChunks: number): number;
|
|
174
|
+
|
|
124
175
|
/**
|
|
125
176
|
* Per-document content hash (snapshot v2). Returns a pointer to 8 bytes
|
|
126
177
|
* holding the FNV-1a 64-bit hash of the original file bytes, or 0 if the
|
|
@@ -139,8 +190,8 @@ export interface AlbexWasmExports {
|
|
|
139
190
|
// Stemming
|
|
140
191
|
setLanguage(lang: number): void;
|
|
141
192
|
|
|
142
|
-
//
|
|
143
|
-
|
|
193
|
+
// Runtime capacity identification (ABI 7). Report the capacities the
|
|
194
|
+
// engine was last initialised with — `init()` = the std defaults.
|
|
144
195
|
getMaxChunks(): number;
|
|
145
196
|
getMaxDocs(): number;
|
|
146
197
|
getNameCapacity(): number;
|
|
@@ -150,6 +201,13 @@ export interface AlbexWasmExports {
|
|
|
150
201
|
getChunkStructSize(): number;
|
|
151
202
|
setCandidateMask(byteLen: number): void;
|
|
152
203
|
clearCandidateMask(): void;
|
|
204
|
+
/** Low 32 bits of the active pattern's aggregate character Bloom
|
|
205
|
+
* (ABI 6). Computed WASM-side in setPattern through the same pipeline
|
|
206
|
+
* searchBegin uses (split → optional stemming → fold), so the GPU
|
|
207
|
+
* pre-filter tests exactly the bits the CPU path would. */
|
|
208
|
+
getPatternBloomLo(): number;
|
|
209
|
+
/** High 32 bits of the active pattern's aggregate character Bloom. */
|
|
210
|
+
getPatternBloomHi(): number;
|
|
153
211
|
}
|
|
154
212
|
|
|
155
213
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -236,12 +294,17 @@ export interface AlbexPdfExports {
|
|
|
236
294
|
/** Range of ABI versions this host code understands for the main module.
|
|
237
295
|
* Update both ends together with the Rust `abiVersion()` constant when
|
|
238
296
|
* the export surface changes. */
|
|
239
|
-
//
|
|
240
|
-
//
|
|
241
|
-
//
|
|
242
|
-
//
|
|
243
|
-
|
|
244
|
-
|
|
297
|
+
// ABI 7 adds runtime capacity (initWithCapacity, decision A16) and removes
|
|
298
|
+
// the compile-time tier system (`getTier` is gone), on top of ABI 6's batch
|
|
299
|
+
// frontier reads, ABI 5's truncation signalling and ABI 4's authoritative
|
|
300
|
+
// chunk enumeration. The required-exports list below already makes any
|
|
301
|
+
// older binary fail the missing-exports check, so a tolerant lower bound
|
|
302
|
+
// was dead code — the range is pinned to the one ABI this host actually
|
|
303
|
+
// speaks (audit 0.6.0, finding #7). The .wasm ships inside this package
|
|
304
|
+
// (files: wasm/pkg/*.wasm), so host TS and binary are always
|
|
305
|
+
// version-matched.
|
|
306
|
+
const MAIN_ABI_MIN = 7;
|
|
307
|
+
const MAIN_ABI_MAX = 7;
|
|
245
308
|
|
|
246
309
|
/** Range of ABI versions for the PDF module. */
|
|
247
310
|
const PDF_ABI_MIN = 1;
|
|
@@ -250,16 +313,16 @@ const PDF_ABI_MAX = 3;
|
|
|
250
313
|
/** Required function names on the main WASM. Adding a new one here forces
|
|
251
314
|
* the validator to check it; removing one is a breaking ABI bump. */
|
|
252
315
|
const MAIN_REQUIRED = [
|
|
253
|
-
'abiVersion', 'getBuffer', 'init',
|
|
316
|
+
'abiVersion', 'getBuffer', 'init', 'initWithCapacity',
|
|
254
317
|
'setDocumentName', 'beginDocument', 'feedXmlBytes', 'endDocument',
|
|
255
318
|
'beginXlsx', 'feedXlsxBytes',
|
|
256
319
|
'feedText', 'flushParagraph',
|
|
257
320
|
'setMaxErrors', 'setThreshold', 'setMaxResults',
|
|
258
321
|
'prepareQuery', 'getQueryKind', 'getQueryBranchCount',
|
|
259
|
-
'getQueryBranchPattern', 'selectQueryBranch',
|
|
322
|
+
'getQueryBranchPattern', 'selectQueryBranch', 'getQueryTruncationFlags',
|
|
260
323
|
'setPattern', 'search',
|
|
261
324
|
'searchBegin', 'searchSlice', 'getSearchCursor', 'getSearchTotal',
|
|
262
|
-
'getResultCount',
|
|
325
|
+
'getResultCount', 'getResultsPtr', 'getResultStride',
|
|
263
326
|
'getResultDocId', 'getResultLocation', 'getResultScore',
|
|
264
327
|
'getResultStart', 'getResultEnd', 'getResultChunkIdx',
|
|
265
328
|
'getResultDocName', 'getResultMatchCount',
|
|
@@ -272,10 +335,13 @@ const MAIN_REQUIRED = [
|
|
|
272
335
|
'restoreBegin', 'restoreFeed', 'restoreCommit',
|
|
273
336
|
'getDocId', 'getDocChunkCount', 'getDocName', 'isDocDeleted',
|
|
274
337
|
'removeDocument', 'compact',
|
|
338
|
+
'getDocChunkBase', 'getChunkLocationAt', 'getChunkByteLenAt', 'getChunkTextAt',
|
|
339
|
+
'listChunksBatch',
|
|
275
340
|
'setLanguage',
|
|
276
|
-
'
|
|
341
|
+
'getMaxChunks', 'getMaxDocs', 'getNameCapacity',
|
|
277
342
|
'getChunksPtr', 'getChunkStructSize',
|
|
278
343
|
'setCandidateMask', 'clearCandidateMask',
|
|
344
|
+
'getPatternBloomLo', 'getPatternBloomHi',
|
|
279
345
|
'getDocContentHashPtr', 'getDocContentHashLen', 'setDocumentContentHash',
|
|
280
346
|
'hashBegin', 'hashFeed', 'hashFinish',
|
|
281
347
|
] as const;
|
package/src/worker-protocol.ts
CHANGED
|
@@ -10,13 +10,19 @@
|
|
|
10
10
|
* copying the file bytes into the worker.
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
|
-
import type { AlbexOptions, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
|
|
13
|
+
import type { AlbexDiagnostic, AlbexOptions, AuthoritativeChunk, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
|
|
14
14
|
|
|
15
15
|
export type WorkerOp =
|
|
16
16
|
| { kind: 'init'; opts: AlbexOptions }
|
|
17
17
|
| { kind: 'indexFile'; name: string; buffer: ArrayBuffer }
|
|
18
18
|
| { kind: 'search'; query: string; options: SearchOptions }
|
|
19
|
+
| { kind: 'listChunks'; docId: number }
|
|
19
20
|
| { kind: 'removeDocument'; id: string }
|
|
21
|
+
/** Replace doc `name` with new content. `fileName` is the replacement
|
|
22
|
+
* file's own name (may differ from `name`); the bytes travel as a
|
|
23
|
+
* transferred ArrayBuffer like `indexFile`. */
|
|
24
|
+
| { kind: 'replaceDocument'; name: string; fileName: string; buffer: ArrayBuffer }
|
|
25
|
+
| { kind: 'takeDiagnostics' }
|
|
20
26
|
| { kind: 'compact' }
|
|
21
27
|
| { kind: 'reset' }
|
|
22
28
|
| { kind: 'getStats' }
|
|
@@ -39,10 +45,14 @@ export interface WorkerRequest {
|
|
|
39
45
|
|
|
40
46
|
export type WorkerResponse =
|
|
41
47
|
| { id: number; ok: true; result: unknown }
|
|
42
|
-
|
|
48
|
+
/** `limit`/`max` are populated for capacity errors so the rehydrated
|
|
49
|
+
* AlbexCapacityError keeps reporting the runtime limit that overflowed. */
|
|
50
|
+
| { id: number; ok: false; error: { name: string; kind?: string; message: string; limit?: string; max?: number } };
|
|
43
51
|
|
|
44
52
|
export type IndexFileResult = IndexedDocument;
|
|
45
53
|
export type SearchResultArr = SearchResult[];
|
|
54
|
+
export type ChunksResult = AuthoritativeChunk[];
|
|
46
55
|
export type StatsResult = EngineStats;
|
|
47
56
|
export type SearchStatsRes = SearchStats | null;
|
|
48
57
|
export type DocsResult = readonly IndexedDocument[];
|
|
58
|
+
export type DiagnosticsRes = AlbexDiagnostic[];
|
package/src/worker-runtime.ts
CHANGED
|
@@ -36,8 +36,18 @@ async function dispatch(op: WorkerOp): Promise<unknown> {
|
|
|
36
36
|
}
|
|
37
37
|
case 'search':
|
|
38
38
|
return ensureEngine().search(op.query, op.options);
|
|
39
|
+
case 'listChunks':
|
|
40
|
+
return ensureEngine().listChunks(op.docId);
|
|
39
41
|
case 'removeDocument':
|
|
40
42
|
return ensureEngine().removeDocument(op.id);
|
|
43
|
+
case 'replaceDocument': {
|
|
44
|
+
// Same File-like wrapping as indexFile; the engine's replaceDocument
|
|
45
|
+
// handles remove + re-index + auto-compact under its own lock.
|
|
46
|
+
const file = new File([op.buffer], op.fileName);
|
|
47
|
+
return ensureEngine().replaceDocument(op.name, file);
|
|
48
|
+
}
|
|
49
|
+
case 'takeDiagnostics':
|
|
50
|
+
return ensureEngine().takeDiagnostics();
|
|
41
51
|
case 'compact':
|
|
42
52
|
ensureEngine().compact();
|
|
43
53
|
return undefined;
|
|
@@ -82,13 +92,18 @@ async function handle(req: WorkerRequest): Promise<void> {
|
|
|
82
92
|
const res: WorkerResponse = { id, ok: true, result };
|
|
83
93
|
(self as unknown as Worker).postMessage(res);
|
|
84
94
|
} catch (err) {
|
|
85
|
-
const e = err as Error & { kind?: string };
|
|
95
|
+
const e = err as Error & { kind?: string; limit?: string; max?: number };
|
|
86
96
|
const res: WorkerResponse = {
|
|
87
97
|
id, ok: false,
|
|
88
98
|
error: {
|
|
89
99
|
name: e.name ?? 'Error',
|
|
90
100
|
kind: err instanceof AlbexError ? err.kind : undefined,
|
|
91
101
|
message: e.message ?? String(err),
|
|
102
|
+
// Capacity metadata (which pool + its runtime limit) — plain data,
|
|
103
|
+
// survives structuredClone, lets the main side rehydrate a full
|
|
104
|
+
// AlbexCapacityError.
|
|
105
|
+
limit: typeof e.limit === 'string' ? e.limit : undefined,
|
|
106
|
+
max: typeof e.max === 'number' ? e.max : undefined,
|
|
92
107
|
},
|
|
93
108
|
};
|
|
94
109
|
(self as unknown as Worker).postMessage(res);
|
package/wasm/pkg/albex_pdf.wasm
CHANGED
|
Binary file
|
package/wasm/pkg/albex_wasm.wasm
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|