albex 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +223 -0
- package/README.md +84 -30
- package/dist/_generated/inline-wasm.d.ts +2 -0
- package/dist/_generated/inline-wasm.d.ts.map +1 -0
- package/dist/_generated/inline-wasm.js +9 -0
- package/dist/_generated/inline-wasm.js.map +1 -0
- package/dist/albex-worker.d.ts +65 -2
- package/dist/albex-worker.d.ts.map +1 -1
- package/dist/albex-worker.js +98 -21
- package/dist/albex-worker.js.map +1 -1
- package/dist/albex.d.ts +250 -42
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +492 -120
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +35 -4
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +38 -3
- package/dist/errors.js.map +1 -1
- package/dist/index.d.ts +47 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +82 -0
- package/dist/index.js.map +1 -0
- package/dist/inline.d.ts +10 -0
- package/dist/inline.d.ts.map +1 -0
- package/dist/inline.js +17 -0
- package/dist/inline.js.map +1 -0
- package/dist/persistence.js +2 -2
- package/dist/pool/coordinator.d.ts +14 -6
- package/dist/pool/coordinator.d.ts.map +1 -1
- package/dist/pool/coordinator.js +65 -28
- package/dist/pool/coordinator.js.map +1 -1
- package/dist/profile.js +2 -2
- package/dist/resource-manager.js +2 -2
- package/dist/tiered-store.js +2 -2
- package/dist/wasm-bindings.d.ts +50 -1
- package/dist/wasm-bindings.d.ts.map +1 -1
- package/dist/wasm-bindings.js +20 -12
- package/dist/wasm-bindings.js.map +1 -1
- package/dist/worker-protocol.d.ts +23 -2
- package/dist/worker-protocol.d.ts.map +1 -1
- package/dist/worker-protocol.js +2 -2
- package/dist/worker-runtime.js +17 -2
- package/dist/worker-runtime.js.map +1 -1
- package/package.json +14 -9
- package/src/_generated/inline-wasm.ts +9 -0
- package/src/albex-worker.ts +103 -18
- package/src/albex.ts +3053 -2524
- package/src/errors.ts +49 -4
- package/src/index.ts +81 -0
- package/src/inline.ts +9 -0
- package/src/pool/coordinator.ts +61 -34
- package/src/wasm-bindings.ts +78 -12
- package/src/worker-protocol.ts +12 -2
- package/src/worker-runtime.ts +16 -1
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/dist/albex.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/*!
|
|
2
|
-
* albex v0.
|
|
3
|
-
*
|
|
2
|
+
* albex v0.7.0
|
|
3
|
+
* Local full-text search for documents — runs entirely in the browser, no server, no upload. Zero-config: the WASM core is embedded (~19 KB gzipped), so `npm install albex` then `new AlbexEngine()` works in any bundler, esbuild/Angular included.
|
|
4
4
|
* (c) 2026 RafaCalRob
|
|
5
5
|
* @license MIT
|
|
6
6
|
* https://github.com/RafaCalRob/Albex#readme
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
* ```
|
|
22
22
|
*/
|
|
23
23
|
import { asAlbexExports, asAlbexPdfExports, } from './wasm-bindings.js';
|
|
24
|
-
import { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
|
|
24
|
+
import { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, assertFileSizeWithinLimit, } from './errors.js';
|
|
25
25
|
import { savePersisted, loadPersisted, deletePersisted, listPersisted, } from './persistence.js';
|
|
26
26
|
import { detectProfile, shouldUseGpu } from './profile.js';
|
|
27
27
|
import { getResourceManager } from './resource-manager.js';
|
|
@@ -48,6 +48,39 @@ function warnSearchStreamDeprecated() {
|
|
|
48
48
|
'scheduler between slices and returns a batch. The alias will be ' +
|
|
49
49
|
'removed in 0.4.0.');
|
|
50
50
|
}
|
|
51
|
+
/** The std preset = the historical compile-time defaults. */
const CAPACITY_STD = {
    maxDocs: 128,
    maxChunks: 100_000,
    textPoolBytes: 16 * 1024 * 1024,
    namePoolBytes: 32 * 1024,
};
/** The large preset = the old compile-time "pro" tier. */
const CAPACITY_LARGE = {
    maxDocs: 1024,
    maxChunks: 800_000,
    textPoolBytes: 128 * 1024 * 1024,
    namePoolBytes: 256 * 1024,
};
/**
 * Resolve a user-facing capacity option into full numbers.
 *
 * Accepted inputs:
 *   - `undefined` / `null` / `'std'` → a fresh copy of the std preset
 *   - `'large'`                      → a fresh copy of the large preset
 *   - an object                      → a custom config; missing fields are
 *     completed from the std ratios: `maxChunks` follows `maxDocs` (×782),
 *     `textPoolBytes` follows `maxChunks` (×168 B), `namePoolBytes` follows
 *     `maxDocs` (×256 B) — each with a floor so tiny configs stay usable.
 *     `maxChunks` is clamped to at least `maxDocs` (every document needs at
 *     least one chunk). All values are floored to integers.
 *
 * Range validation of the resulting numbers is not done here; this function
 * only shapes the option into four numbers.
 *
 * @param {undefined|null|'std'|'large'|object} capacity user option
 * @returns {{maxDocs: number, maxChunks: number, textPoolBytes: number, namePoolBytes: number}}
 *          a new object on every call (presets are never aliased)
 * @throws {TypeError} when `capacity` is neither a known preset name nor an
 *         object. Without this check a mistyped preset (e.g. `'Large'`)
 *         would silently resolve to a near-std custom config.
 */
function resolveCapacity(capacity) {
    // `== null` covers both undefined and null (e.g. JSON-sourced options);
    // the property reads below would otherwise throw an opaque TypeError.
    if (capacity == null || capacity === 'std')
        return { ...CAPACITY_STD };
    if (capacity === 'large')
        return { ...CAPACITY_LARGE };
    if (typeof capacity !== 'object') {
        throw new TypeError(`Invalid capacity option ${JSON.stringify(capacity) ?? String(capacity)}: ` +
            `expected 'std', 'large' or an object with maxDocs / maxChunks / ` +
            `textPoolBytes / namePoolBytes.`);
    }
    const maxDocs = Math.floor(capacity.maxDocs ?? CAPACITY_STD.maxDocs);
    // Derived default first (with its 1024 floor), then the "≥ maxDocs" clamp
    // which also applies to an explicitly supplied maxChunks.
    const maxChunks = Math.max(Math.floor(capacity.maxChunks ?? Math.max(maxDocs * 782, 1024)), maxDocs);
    const textPoolBytes = Math.floor(capacity.textPoolBytes ?? Math.max(maxChunks * 168, 64 * 1024));
    const namePoolBytes = Math.floor(capacity.namePoolBytes ?? Math.max(maxDocs * 256, 4 * 1024));
    return { maxDocs, maxChunks, textPoolBytes, namePoolBytes };
}
|
|
51
84
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
52
85
|
// Query parsing (WASM-side as of 0.5.0)
|
|
53
86
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -196,35 +229,51 @@ const FEED_SIZE = 32_768; // 32 KB — fits in 64 KB scratchpad
|
|
|
196
229
|
* The result is stable across runs and engines, so it can be persisted in
|
|
197
230
|
* snapshots without versioning concerns.
|
|
198
231
|
*/
|
|
232
|
+
// NOTE: the TS `computePatternBloom` that used to live here (the THIRD copy
|
|
233
|
+
// of the accent fold, after the Rust index side and the Rust query side) was
|
|
234
|
+
// removed in 0.8.0. The GPU pre-filter now reads the pattern Bloom straight
|
|
235
|
+
// from WASM via `getPatternBloomLo/Hi` (ABI 6) — `setPattern` computes it
|
|
236
|
+
// through the exact pipeline `searchBegin` uses, including Spanish stemming,
|
|
237
|
+
// which the TS copy never applied (audit 2.4).
|
|
199
238
|
/**
 * Convert a UTF-8 byte offset into `bytes` to the equivalent UTF-16
 * code-unit index of the decoded string. Single forward pass — O(offset)
 * with no allocation — counting 1 unit per BMP code point and 2 per 4-byte
 * (astral, e.g. emoji) sequence.
 *
 * Malformed input is counted the way the WHATWG UTF-8 decoder (and hence a
 * non-fatal `TextDecoder`) replaces it: every maximal invalid subpart becomes
 * exactly one U+FFFD, i.e. 1 unit. That covers stray continuation bytes,
 * invalid lead bytes (0xC0, 0xC1, 0xF5-0xFF), overlong / surrogate /
 * out-of-range prefixes, and sequences cut short by a non-continuation byte
 * or by the end of the buffer. The byte that breaks a sequence is not
 * consumed; it is re-examined as the start of the next unit.
 *
 * Offsets that land mid-sequence are attributed to the code point they fall
 * inside (per the original note, the engine only emits code-point-aligned
 * offsets, so this is a defensive clamp, not an expected path).
 *
 * NOTE(review): a leading BOM (EF BB BF) counts as 1 unit here, while a
 * `TextDecoder` created without `ignoreBOM: true` strips it — confirm which
 * behaviour the decoder used by the callers has.
 *
 * @param {Uint8Array} bytes UTF-8 encoded payload
 * @param {number} byteOffset byte position inside `bytes` (clamped to its length)
 * @returns {number} number of UTF-16 code units that precede `byteOffset`
 */
function utf16IndexAtByte(bytes, byteOffset) {
    const end = Math.min(byteOffset, bytes.length);
    let units = 0;
    let i = 0;
    while (i < end) {
        const lead = bytes[i];
        if (lead < 0x80) { // ASCII
            i += 1;
            units += 1;
            continue;
        }
        // Number of continuation bytes the lead announces, plus the allowed
        // range of the FIRST continuation byte (tightened for the leads that
        // could otherwise encode overlongs, surrogates or > U+10FFFF).
        let need = 0;
        let lo = 0x80;
        let hi = 0xbf;
        if (lead >= 0xc2 && lead <= 0xdf) { // 2-byte (é, ñ, …)
            need = 1;
        }
        else if (lead >= 0xe0 && lead <= 0xef) { // 3-byte (…, €, CJK)
            need = 2;
            if (lead === 0xe0)
                lo = 0xa0; // reject overlong encodings
            else if (lead === 0xed)
                hi = 0x9f; // reject UTF-16 surrogates
        }
        else if (lead >= 0xf0 && lead <= 0xf4) { // 4-byte → surrogate pair
            need = 3;
            if (lead === 0xf0)
                lo = 0x90; // reject overlong encodings
            else if (lead === 0xf4)
                hi = 0x8f; // reject code points above U+10FFFF
        }
        else { // stray continuation or invalid lead → U+FFFD
            i += 1;
            units += 1;
            continue;
        }
        // Validate continuation bytes against the whole buffer (not `end`),
        // so an offset that lands mid-sequence still sees the full code point.
        let got = 0;
        while (got < need && i + 1 + got < bytes.length) {
            const c = bytes[i + 1 + got];
            if (c < lo || c > hi)
                break;
            lo = 0x80; // only the first continuation byte has a tightened range
            hi = 0xbf;
            got += 1;
        }
        // Complete sequence: 2 units for astral, 1 otherwise.
        // Incomplete sequence: the consumed prefix becomes one U+FFFD.
        units += (got === need && need === 3) ? 2 : 1;
        i += 1 + got;
    }
    return units;
}
|
|
229
278
|
// Note: `contentHash` is implemented as a method on AlbexEngine below
|
|
230
279
|
// (it needs access to the WASM scratchpad). The standalone TS reference
|
|
@@ -473,17 +522,29 @@ export class AlbexEngine {
|
|
|
473
522
|
_pdfMem = null;
|
|
474
523
|
_docs = [];
|
|
475
524
|
_lastSearch = null;
|
|
525
|
+
/** Raw truncation bitflags from the most recent prepareQuery (ABI 5):
|
|
526
|
+
* 1 = branches dropped, 2 = tokens dropped/clipped, 4 = query bytes cut.
|
|
527
|
+
* Captured right after prepareQuery so every _lastSearch built for that
|
|
528
|
+
* query (including per-branch OR runs) reports the same flags. */
|
|
529
|
+
_lastTruncFlags = 0;
|
|
476
530
|
/** Structured diagnostics collected during the most recent operation.
|
|
477
531
|
* Drained by `takeDiagnostics()`. Capped at 256 entries to avoid
|
|
478
532
|
* unbounded memory growth in pathological cases (very corrupted
|
|
479
533
|
* corpora producing thousands of recovery warnings). */
|
|
480
534
|
_diagnostics = [];
|
|
481
|
-
|
|
535
|
+
/** Resolved runtime capacity (set in init(); reused by reset()). */
|
|
536
|
+
_capacity = { ...CAPACITY_STD };
|
|
482
537
|
_simd = false;
|
|
483
538
|
_profile = null;
|
|
484
539
|
_resources = null;
|
|
485
540
|
_gpu = null;
|
|
486
|
-
|
|
541
|
+
/** True when the GPU-resident Bloom array no longer mirrors the WASM
|
|
542
|
+
* chunk array. Set by EVERY index mutation (indexFile, removeDocument,
|
|
543
|
+
* compact, reset, load) and cleared after a successful upload. A plain
|
|
544
|
+
* chunk-count comparison is NOT enough: compact() can reorder blooms
|
|
545
|
+
* while keeping the count identical, which would silently filter the
|
|
546
|
+
* wrong chunks (audit 1.5). */
|
|
547
|
+
_gpuUploadDirty = true;
|
|
487
548
|
_unsubscribeResources = null;
|
|
488
549
|
_opts;
|
|
489
550
|
// ── Concurrency guard ──────────────────────────────────────────────────────
|
|
@@ -494,7 +555,7 @@ export class AlbexEngine {
|
|
|
494
555
|
// assert the engine is idle (audit 0.6.0, finding #2).
|
|
495
556
|
_opChain = Promise.resolve();
|
|
496
557
|
_busy = false;
|
|
497
|
-
constructor(opts) {
|
|
558
|
+
constructor(opts = {}) {
|
|
498
559
|
this._opts = opts;
|
|
499
560
|
}
|
|
500
561
|
/** Serialize an async engine operation behind any in-flight one. */
|
|
@@ -529,18 +590,32 @@ export class AlbexEngine {
|
|
|
529
590
|
const hasTombstones = w.getDocCount() > this._docs.length;
|
|
530
591
|
if (hasTombstones && cap > 0 && w.getTextUsed() / cap > 0.85) {
|
|
531
592
|
w.compact();
|
|
593
|
+
this._gpuUploadDirty = true;
|
|
532
594
|
}
|
|
533
595
|
}
|
|
534
|
-
/**
|
|
596
|
+
/**
|
|
597
|
+
* Load and initialise the main WASM module. Must be called before any
|
|
598
|
+
* other method.
|
|
599
|
+
*
|
|
600
|
+
* Resolves `opts.capacity` ('std' default · 'large' · explicit object)
|
|
601
|
+
* and sizes the WASM pools accordingly via `initWithCapacity` (ABI 7).
|
|
602
|
+
* Memory cost ≈ `maxChunks × 64 B + textPoolBytes + namePoolBytes` —
|
|
603
|
+
* ~22 MB for 'std', ~180 MB for 'large'. Throws `AlbexInitError` if the
|
|
604
|
+
* requested capacity is out of range or the allocation fails.
|
|
605
|
+
*/
|
|
535
606
|
async init() {
|
|
536
|
-
const
|
|
537
|
-
const res = await fetch(url);
|
|
538
|
-
if (!res.ok)
|
|
539
|
-
throw new AlbexInitError(`Failed to fetch WASM: ${res.status} (${url})`);
|
|
540
|
-
const { instance } = await WebAssembly.instantiateStreaming(res, {});
|
|
607
|
+
const instance = await this._instantiateMainWasm();
|
|
541
608
|
this._wasm = asAlbexExports(instance.exports);
|
|
542
609
|
this._mem = this._wasm.memory;
|
|
543
|
-
this.
|
|
610
|
+
this._capacity = resolveCapacity(this._opts.capacity);
|
|
611
|
+
const c = this._capacity;
|
|
612
|
+
if (this._wasm.initWithCapacity(c.maxDocs, c.maxChunks, c.textPoolBytes, c.namePoolBytes) !== 1) {
|
|
613
|
+
throw new AlbexInitError(`initWithCapacity(${c.maxDocs} docs, ${c.maxChunks} chunks, ` +
|
|
614
|
+
`${c.textPoolBytes} text bytes, ${c.namePoolBytes} name bytes) failed — ` +
|
|
615
|
+
`parameters out of range (docs 1-65536, chunks ≥ docs and ≤ 4194304, ` +
|
|
616
|
+
`text 4 KiB-1 GiB, names 256 B-16 MiB) or the WASM memory allocation ` +
|
|
617
|
+
`was refused by the host.`);
|
|
618
|
+
}
|
|
544
619
|
// Subscribe to environmental signals. Cheap and benign in node tests
|
|
545
620
|
// (the manager tolerates missing globals).
|
|
546
621
|
const rm = getResourceManager();
|
|
@@ -555,23 +630,87 @@ export class AlbexEngine {
|
|
|
555
630
|
}
|
|
556
631
|
}
|
|
557
632
|
/**
|
|
558
|
-
*
|
|
559
|
-
* 1. `opts.
|
|
560
|
-
*
|
|
561
|
-
*
|
|
633
|
+
* Instantiate the main core WASM. Two sources, in order of precedence:
|
|
634
|
+
* 1. `opts.wasmBytes` — caller-provided bytes; NO network access. The
|
|
635
|
+
* `albex/inline` entry uses this with the embedded baseline core, and
|
|
636
|
+
* integrators on bundlers that don't rewrite `new URL(…, import.meta.
|
|
637
|
+
* url)` (esbuild / Angular / some Webpack) can import the `.wasm` as an
|
|
638
|
+
* asset and pass the bytes here.
|
|
639
|
+
* 2. a URL from `_resolveWasmUrl` (`wasmUrl` / `wasmBaseUrl` / the
|
|
640
|
+
* bundler-friendly default).
|
|
562
641
|
*
|
|
563
|
-
*
|
|
642
|
+
* The URL path prefers `instantiateStreaming` and falls back to
|
|
643
|
+
* `instantiate(arrayBuffer)` when the host serves the `.wasm` with the
|
|
644
|
+
* wrong MIME type — a common esbuild / static-server pitfall that
|
|
645
|
+
* otherwise rejects with an opaque "Incorrect response MIME type". A 404
|
|
646
|
+
* or a network error is rethrown as an `AlbexInitError` whose message
|
|
647
|
+
* points at the concrete fixes (inline entry / `wasmBytes` / `wasmUrl`).
|
|
648
|
+
*/
|
|
649
|
+
async _instantiateMainWasm() {
|
|
650
|
+
const bytes = this._opts.wasmBytes;
|
|
651
|
+
if (bytes) {
|
|
652
|
+
// No fetch, no SIMD probe: the caller chose the binary. `simdEnabled`
|
|
653
|
+
// reflects only an explicit `simd: 'on'` assertion about those bytes.
|
|
654
|
+
this._profile = await detectProfile();
|
|
655
|
+
this._simd = this._opts.simd === 'on';
|
|
656
|
+
const { instance } = await WebAssembly.instantiate(bytes, {});
|
|
657
|
+
return instance;
|
|
658
|
+
}
|
|
659
|
+
const url = await this._resolveWasmUrl();
|
|
660
|
+
let res;
|
|
661
|
+
try {
|
|
662
|
+
res = await fetch(url);
|
|
663
|
+
}
|
|
664
|
+
catch (cause) {
|
|
665
|
+
throw new AlbexInitError(this._wasmLoadHelp(url, String(cause)));
|
|
666
|
+
}
|
|
667
|
+
if (!res.ok) {
|
|
668
|
+
throw new AlbexInitError(this._wasmLoadHelp(url, `HTTP ${res.status}`));
|
|
669
|
+
}
|
|
670
|
+
try {
|
|
671
|
+
const { instance } = await WebAssembly.instantiateStreaming(res, {});
|
|
672
|
+
return instance;
|
|
673
|
+
}
|
|
674
|
+
catch (streamErr) {
|
|
675
|
+
// Streaming rejects when the response Content-Type isn't
|
|
676
|
+
// `application/wasm`. The bytes are usually fine — re-fetch (the first
|
|
677
|
+
// body was consumed by the streaming attempt) and compile from a buffer.
|
|
678
|
+
try {
|
|
679
|
+
const buf = await (await fetch(url)).arrayBuffer();
|
|
680
|
+
const { instance } = await WebAssembly.instantiate(buf, {});
|
|
681
|
+
return instance;
|
|
682
|
+
}
|
|
683
|
+
catch {
|
|
684
|
+
throw new AlbexInitError(this._wasmLoadHelp(url, `instantiate failed (${String(streamErr)})`));
|
|
685
|
+
}
|
|
686
|
+
}
|
|
687
|
+
}
|
|
688
|
+
/** Build the actionable "couldn't load the core" message shared by every
|
|
689
|
+
* main-WASM load failure. The default `albex` entry embeds the core, so a
|
|
690
|
+
* fetch only runs when the caller explicitly set `wasmUrl`/`wasmBaseUrl` —
|
|
691
|
+
* the message leads with the one-line exit (drop the option). */
|
|
692
|
+
_wasmLoadHelp(url, reason) {
|
|
693
|
+
return (`Albex couldn't fetch its core WASM (${reason}) from ${url}. ` +
|
|
694
|
+
`You're on the network path because \`wasmUrl\` or \`wasmBaseUrl\` is ` +
|
|
695
|
+
`set. Easiest fix: remove that option — the default ` +
|
|
696
|
+
`\`import { AlbexEngine } from 'albex'\` embeds the core and serves ` +
|
|
697
|
+
`nothing. Keep the option only for a CDN or the SIMD build, and make ` +
|
|
698
|
+
`sure it points at a reachable \`albex_wasm*.wasm\` (check the path, ` +
|
|
699
|
+
`the dev server, and CORS).`);
|
|
700
|
+
}
|
|
701
|
+
/**
|
|
702
|
+
* Decide which `.wasm` binary to fetch. Order of precedence:
|
|
564
703
|
* 1. `opts.wasmUrl` literal → use verbatim
|
|
565
|
-
* 2. `opts.wasmBaseUrl` +
|
|
704
|
+
* 2. `opts.wasmBaseUrl` + simd suffix → fetched from that directory
|
|
566
705
|
* 3. zero-config default → `albex_wasm_bg.wasm` packaged
|
|
567
706
|
* next to this file, resolved
|
|
568
707
|
* via `import.meta.url`
|
|
569
708
|
*
|
|
570
|
-
*
|
|
571
|
-
*
|
|
572
|
-
*
|
|
573
|
-
*
|
|
574
|
-
*
|
|
709
|
+
* There are exactly two main binaries (baseline + SIMD); capacity is a
|
|
710
|
+
* RUNTIME parameter since ABI 7, so it never affects which file is
|
|
711
|
+
* fetched. SIMD auto-detection is only active when `wasmBaseUrl` is
|
|
712
|
+
* given, because picking a URL at runtime would defeat any bundler's
|
|
713
|
+
* static asset rewriting.
|
|
575
714
|
*/
|
|
576
715
|
async _resolveWasmUrl() {
|
|
577
716
|
const o = this._opts;
|
|
@@ -587,17 +726,16 @@ export class AlbexEngine {
|
|
|
587
726
|
// as an asset reference. They copy the .wasm to the output directory and
|
|
588
727
|
// rewrite the URL automatically. Consumers who use one of those bundlers
|
|
589
728
|
// get a working `new AlbexEngine()` with no manual setup.
|
|
590
|
-
// 0.5.0+: two main binaries only — baseline and SIMD
|
|
591
|
-
//
|
|
592
|
-
// boolean: SIMD on or off, decided either by the
|
|
593
|
-
// option or by a runtime probe.
|
|
729
|
+
// 0.5.0+: two main binaries only — baseline and SIMD (the tier system
|
|
730
|
+
// is gone; capacity became a runtime parameter in ABI 7). Selection
|
|
731
|
+
// collapses to a single boolean: SIMD on or off, decided either by the
|
|
732
|
+
// explicit `simd` option or by a runtime probe.
|
|
594
733
|
const simd = o.simd === 'on'
|
|
595
734
|
? true
|
|
596
735
|
: o.simd === 'off'
|
|
597
736
|
? false
|
|
598
737
|
: !!profile?.wasm.simd;
|
|
599
738
|
this._simd = simd;
|
|
600
|
-
this._tier = 'std';
|
|
601
739
|
if (!o.wasmBaseUrl) {
|
|
602
740
|
// Zero-config: bundler resolves the .wasm next to dist/. We only
|
|
603
741
|
// ship the baseline alias (albex_wasm_bg.wasm) inside the npm
|
|
@@ -608,8 +746,6 @@ export class AlbexEngine {
|
|
|
608
746
|
const base = o.wasmBaseUrl.replace(/\/+$/, '');
|
|
609
747
|
return simd ? `${base}/albex_wasm_simd.wasm` : `${base}/albex_wasm.wasm`;
|
|
610
748
|
}
|
|
611
|
-
/** The tier that was actually loaded. `null` until `init()` resolves. */
|
|
612
|
-
get tier() { return this._tier; }
|
|
613
749
|
/** True if the SIMD-accelerated binary was loaded. */
|
|
614
750
|
get simdEnabled() { return this._simd; }
|
|
615
751
|
/** True if a WebGPU device is acquired and the next search will use it. */
|
|
@@ -645,8 +781,14 @@ export class AlbexEngine {
|
|
|
645
781
|
* No-op if the GPU device hasn't been acquired yet — first call attempts
|
|
646
782
|
* `init()` lazily; if that fails, the candidate path is permanently
|
|
647
783
|
* disabled for this engine instance.
|
|
784
|
+
*
|
|
785
|
+
* IMPORTANT: this method CLOBBERS the scratchpad (the candidate bitset
|
|
786
|
+
* is pushed through it via `setCandidateMask`). Any pattern previously
|
|
787
|
+
* staged by `selectQueryBranch` is destroyed — the caller MUST re-select
|
|
788
|
+
* the active branch before calling `searchBegin`, which snapshots the
|
|
789
|
+
* pattern from the scratchpad (audit 1.2).
|
|
648
790
|
*/
|
|
649
|
-
async _gpuPreFilter(
|
|
791
|
+
async _gpuPreFilter() {
|
|
650
792
|
const gpu = this._gpu;
|
|
651
793
|
if (!gpu)
|
|
652
794
|
return;
|
|
@@ -660,20 +802,26 @@ export class AlbexEngine {
|
|
|
660
802
|
const chunkCount = this._wasm.getChunkCount();
|
|
661
803
|
if (chunkCount === 0)
|
|
662
804
|
return;
|
|
663
|
-
// Upload blooms if the corpus changed
|
|
664
|
-
//
|
|
665
|
-
|
|
805
|
+
// Upload blooms if the corpus changed since the last upload. The
|
|
806
|
+
// signal is a dirty flag set by every index mutation — not a chunk
|
|
807
|
+
// count comparison, because compact() can reorder blooms while
|
|
808
|
+
// keeping the count identical (audit 1.5). We re-upload everything
|
|
809
|
+
// on any delta; incremental delta-upload is a future optimisation.
|
|
810
|
+
if (this._gpuUploadDirty) {
|
|
666
811
|
const ptr = this._wasm.getChunksPtr();
|
|
667
812
|
const stride = this._wasm.getChunkStructSize();
|
|
668
813
|
const bytes = new Uint8Array(this._mem.buffer, ptr, chunkCount * stride);
|
|
669
814
|
const blooms = packBloomsFromChunks(bytes, chunkCount);
|
|
670
815
|
gpu.uploadChunkBlooms(blooms, chunkCount);
|
|
671
|
-
this.
|
|
672
|
-
}
|
|
673
|
-
//
|
|
674
|
-
//
|
|
675
|
-
|
|
676
|
-
|
|
816
|
+
this._gpuUploadDirty = false;
|
|
817
|
+
}
|
|
818
|
+
// Pattern Bloom comes straight from WASM (ABI 6): `selectQueryBranch`
|
|
819
|
+
// → `setPattern` computed it through the same pipeline `searchBegin`
|
|
820
|
+
// uses — split, optional Spanish stemming, accent fold, `c & 0x3F`.
|
|
821
|
+
// The retired TS copy of the fold never stemmed, so with `setLanguage
|
|
822
|
+
// ('es')` it could set bits for suffixes the CPU pattern no longer
|
|
823
|
+
// had → over-restrictive mask → silent false negatives (audit 2.4).
|
|
824
|
+
const passes = await gpu.scan(this._wasm.getPatternBloomLo(), this._wasm.getPatternBloomHi());
|
|
677
825
|
// Push the bitset back into WASM via the scratchpad.
|
|
678
826
|
const passBytes = new Uint8Array(passes.buffer, passes.byteOffset, passes.byteLength);
|
|
679
827
|
this._writePad(passBytes);
|
|
@@ -699,6 +847,16 @@ export class AlbexEngine {
|
|
|
699
847
|
const ptr = this._wasm.getBuffer(0);
|
|
700
848
|
return _dec.decode(this._u8(ptr, n));
|
|
701
849
|
}
|
|
850
|
+
/** Copy `n` scratchpad bytes out of WASM memory. The copy is private to
|
|
851
|
+
* JS, so it survives later WASM calls (and memory growth) — used when the
|
|
852
|
+
* caller needs both the raw bytes (UTF-16 span mapping) and the decoded
|
|
853
|
+
* string of the same payload. */
|
|
854
|
+
_readPadBytes(n) {
|
|
855
|
+
const ptr = this._wasm.getBuffer(0);
|
|
856
|
+
const out = new Uint8Array(n);
|
|
857
|
+
out.set(this._u8(ptr, n));
|
|
858
|
+
return out;
|
|
859
|
+
}
|
|
702
860
|
_feedText(text) {
|
|
703
861
|
const b = _enc.encode(text);
|
|
704
862
|
for (let i = 0; i < b.length; i += FEED_SIZE) {
|
|
@@ -747,9 +905,26 @@ export class AlbexEngine {
|
|
|
747
905
|
async _ensurePdfWasm() {
|
|
748
906
|
if (this._pdfWasm)
|
|
749
907
|
return;
|
|
908
|
+
// Compile first (regardless of source) so we can inspect the module's
|
|
909
|
+
// required imports and resolve mangled wasm-bindgen names by prefix
|
|
910
|
+
// rather than by hash.
|
|
911
|
+
const module = this._opts.pdfWasmBytes
|
|
912
|
+
? await WebAssembly.compile(this._opts.pdfWasmBytes)
|
|
913
|
+
: await this._fetchPdfModule();
|
|
914
|
+
const imports = makePdfWasmImports(module, () => this._pdfMem);
|
|
915
|
+
const instance = await WebAssembly.instantiate(module, imports);
|
|
916
|
+
this._pdfWasm = asAlbexPdfExports(instance.exports);
|
|
917
|
+
this._pdfMem = this._pdfWasm.memory;
|
|
918
|
+
}
|
|
919
|
+
/** Fetch + compile the PDF module from a URL. Split out of
|
|
920
|
+
* `_ensurePdfWasm` so the `pdfWasmBytes` (no-network) path stays trivial.
|
|
921
|
+
* Falls back to a buffered compile when the host serves the binary with
|
|
922
|
+
* the wrong MIME type (same pitfall as the core loader). */
|
|
923
|
+
async _fetchPdfModule() {
|
|
750
924
|
// Zero-config default: resolve relative to this module so bundlers copy
|
|
751
925
|
// the .wasm to the output automatically. Override with `opts.pdfWasmUrl`
|
|
752
|
-
// when serving from a separate CDN
|
|
926
|
+
// when serving from a separate CDN, or pass `pdfWasmBytes` to skip the
|
|
927
|
+
// network entirely (e.g. esbuild/Angular).
|
|
753
928
|
const pdfUrl = this._opts.pdfWasmUrl
|
|
754
929
|
?? new URL('../wasm/pkg/albex_pdf.wasm', import.meta.url).href;
|
|
755
930
|
// Network politeness: on constrained connections (slow-2g/2g/saveData)
|
|
@@ -762,16 +937,25 @@ export class AlbexEngine {
|
|
|
762
937
|
message: 'Downloading PDF WASM (~1 MB) on a constrained network connection',
|
|
763
938
|
});
|
|
764
939
|
}
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
940
|
+
let res;
|
|
941
|
+
try {
|
|
942
|
+
res = await fetch(pdfUrl);
|
|
943
|
+
}
|
|
944
|
+
catch (cause) {
|
|
945
|
+
throw new AlbexInitError(`Failed to fetch PDF WASM from ${pdfUrl} (${String(cause)}). ` +
|
|
946
|
+
`Pass \`pdfWasmBytes\` (bundler asset import) or set \`pdfWasmUrl\`.`);
|
|
947
|
+
}
|
|
948
|
+
if (!res.ok) {
|
|
949
|
+
throw new AlbexInitError(`Failed to fetch PDF WASM: ${res.status} (${pdfUrl}). ` +
|
|
950
|
+
`Pass \`pdfWasmBytes\` (bundler asset import) or set \`pdfWasmUrl\`.`);
|
|
951
|
+
}
|
|
952
|
+
try {
|
|
953
|
+
return await WebAssembly.compileStreaming(res);
|
|
954
|
+
}
|
|
955
|
+
catch {
|
|
956
|
+
const buf = await (await fetch(pdfUrl)).arrayBuffer();
|
|
957
|
+
return WebAssembly.compile(buf);
|
|
958
|
+
}
|
|
775
959
|
}
|
|
776
960
|
// ── Indexers ──────────────────────────────────────────────────────────────
|
|
777
961
|
async _indexDocx(file, bytes) {
|
|
@@ -1534,7 +1718,9 @@ export class AlbexEngine {
|
|
|
1534
1718
|
};
|
|
1535
1719
|
// ── Public API ────────────────────────────────────────────────────────────
|
|
1536
1720
|
/**
|
|
1537
|
-
* Index a file. Supported formats: DOCX, XLSX, PDF,
|
|
1721
|
+
* Index a file. Supported formats (11, with varying depth): DOCX, XLSX, PDF,
|
|
1722
|
+
* HTML, MD, JSON, CSV, EML, RTF, TXT, XML. Several are deliberately "lite"
|
|
1723
|
+
* (CSV is RFC-4180-lite, EML is MIME-lite, RTF is regex-stripped).
|
|
1538
1724
|
* Throws for unsupported formats or parse errors.
|
|
1539
1725
|
*/
|
|
1540
1726
|
async indexFile(file) {
|
|
@@ -1545,12 +1731,16 @@ export class AlbexEngine {
|
|
|
1545
1731
|
const indexer = AlbexEngine._INDEXERS[ext];
|
|
1546
1732
|
if (!indexer)
|
|
1547
1733
|
throw new AlbexUnsupportedFormatError(ext);
|
|
1734
|
+
// Size guard BEFORE reading: `file.size` is available without buffering,
|
|
1735
|
+
// so a pathological input (a 2 GB .txt) is refused with a typed error
|
|
1736
|
+
// instead of being fully loaded and hashed first (audit 3.5).
|
|
1737
|
+
assertFileSizeWithinLimit(file, this._opts.maxFileBytes);
|
|
1548
1738
|
// Hash the source bytes for idempotency. We always read the bytes once
|
|
1549
1739
|
// here so the indexer can reuse them — avoids a double File.arrayBuffer().
|
|
1550
1740
|
const bytes = new Uint8Array(await file.arrayBuffer());
|
|
1551
1741
|
const hash = this._contentHash(bytes);
|
|
1552
1742
|
// Idempotency: if a non-deleted doc already has this hash, return it
|
|
1553
|
-
// unchanged.
|
|
1743
|
+
// unchanged. O(doc_count) scan — cheap at any supported capacity.
|
|
1554
1744
|
const existing = this._docs.find(d => d.contentHash === hash);
|
|
1555
1745
|
if (existing)
|
|
1556
1746
|
return existing;
|
|
@@ -1578,15 +1768,22 @@ export class AlbexEngine {
|
|
|
1578
1768
|
if (overflow !== 0) {
|
|
1579
1769
|
const which = (overflow & 1) ? 'chunks' : (overflow & 2) ? 'text'
|
|
1580
1770
|
: (overflow & 4) ? 'docs' : 'names';
|
|
1771
|
+
// The RUNTIME limit of the pool that overflowed, as configured via
|
|
1772
|
+
// `capacity` (std defaults · 'large' · custom object).
|
|
1773
|
+
const max = which === 'chunks' ? w.getMaxChunks()
|
|
1774
|
+
: which === 'text' ? w.getTextCapacity()
|
|
1775
|
+
: which === 'docs' ? w.getMaxDocs()
|
|
1776
|
+
: w.getNameCapacity();
|
|
1581
1777
|
const pools = [
|
|
1582
1778
|
overflow & 1 ? 'chunk pool' : '',
|
|
1583
1779
|
overflow & 2 ? 'text pool' : '',
|
|
1584
1780
|
overflow & 4 ? 'document table' : '',
|
|
1585
1781
|
overflow & 8 ? 'name pool' : '',
|
|
1586
1782
|
].filter(Boolean).join(', ');
|
|
1587
|
-
throw new AlbexCapacityError(`Index capacity exceeded while indexing "${file.name}" (${pools} full
|
|
1588
|
-
|
|
1589
|
-
`(compact(), shard across an AlbexPool,
|
|
1783
|
+
throw new AlbexCapacityError(`Index capacity exceeded while indexing "${file.name}" (${pools} full, ` +
|
|
1784
|
+
`${which} limit = ${max}). The document was rolled back (not indexed); ` +
|
|
1785
|
+
`treat the index as full (compact(), shard across an AlbexPool, ` +
|
|
1786
|
+
`reset(), or re-create the engine with a bigger \`capacity\`).`, which, max);
|
|
1590
1787
|
}
|
|
1591
1788
|
// The new doc occupies slot `docCountBefore`.
|
|
1592
1789
|
const docId = w.getDocId(docCountBefore);
|
|
@@ -1600,6 +1797,7 @@ export class AlbexEngine {
|
|
|
1600
1797
|
contentHash: hash,
|
|
1601
1798
|
};
|
|
1602
1799
|
this._docs.push(doc);
|
|
1800
|
+
this._gpuUploadDirty = true;
|
|
1603
1801
|
return doc;
|
|
1604
1802
|
}
|
|
1605
1803
|
/**
|
|
@@ -1620,6 +1818,7 @@ export class AlbexEngine {
|
|
|
1620
1818
|
const ok = this._wasm.removeDocument(doc.docId) === 1;
|
|
1621
1819
|
if (ok) {
|
|
1622
1820
|
this._docs = this._docs.filter(d => d !== doc);
|
|
1821
|
+
this._gpuUploadDirty = true;
|
|
1623
1822
|
}
|
|
1624
1823
|
return ok;
|
|
1625
1824
|
}
|
|
@@ -1649,6 +1848,76 @@ export class AlbexEngine {
|
|
|
1649
1848
|
compact() {
|
|
1650
1849
|
this._assertIdle('compact');
|
|
1651
1850
|
this._wasm.compact();
|
|
1851
|
+
// compact() reorders the chunk array (and therefore the per-chunk
|
|
1852
|
+
// blooms) even when the chunk count stays the same — the GPU copy is
|
|
1853
|
+
// stale no matter what (audit 1.5).
|
|
1854
|
+
this._gpuUploadDirty = true;
|
|
1855
|
+
}
|
|
1856
|
+
/**
|
|
1857
|
+
* Enumerate the authoritative chunks Albex indexed for a document, in order.
|
|
1858
|
+
* Lets a host mirror Albex's exact chunking — e.g. embed the same units for a
|
|
1859
|
+
* parallel semantic index keyed on the same {@link AuthoritativeChunk.id}
|
|
1860
|
+
* (`"<docId>::<ord>"`, identical to {@link SearchResult.chunkId}). `docId` is
|
|
1861
|
+
* `IndexedDocument.docId` from {@link indexFile}; returns `[]` if no live
|
|
1862
|
+
* document has that id.
|
|
1863
|
+
*
|
|
1864
|
+
* The returned `id`/`ord`/`sub` are stable across {@link compact} and
|
|
1865
|
+
* snapshot save/load. Never key persistent structures on a search result's
|
|
1866
|
+
* absolute `chunkIdx`, which {@link compact} renumbers.
|
|
1867
|
+
*/
|
|
1868
|
+
listChunks(docId) {
|
|
1869
|
+
this._assertIdle('listChunks');
|
|
1870
|
+
const w = this._wasm;
|
|
1871
|
+
const slot = this._docSlotOf(docId);
|
|
1872
|
+
if (slot < 0)
|
|
1873
|
+
return [];
|
|
1874
|
+
const count = w.getDocChunkCount(slot);
|
|
1875
|
+
const out = [];
|
|
1876
|
+
let prevLocation = -1;
|
|
1877
|
+
let sub = 0;
|
|
1878
|
+
// Batched enumeration (ABI 6): one `listChunksBatch` frontier call per
|
|
1879
|
+
// scratchpad-full of chunks instead of 2-3 calls per chunk (audit 2.6 —
|
|
1880
|
+
// an embeddings pipeline over 100k chunks used to make ~300k calls).
|
|
1881
|
+
// Each batch packs records as [u32 text_len][u32 location][text bytes],
|
|
1882
|
+
// tightly, in ordinal order; layout documented in wasm/src/lib.rs.
|
|
1883
|
+
let ord = 0;
|
|
1884
|
+
while (ord < count) {
|
|
1885
|
+
const n = w.listChunksBatch(slot, ord, count - ord);
|
|
1886
|
+
if (n === 0)
|
|
1887
|
+
break; // defensive — should not happen for a live slot
|
|
1888
|
+
const ptr = w.getBuffer(0);
|
|
1889
|
+
// The view is only valid until the next frontier call; everything is
|
|
1890
|
+
// decoded out of it inside this loop body before the next batch.
|
|
1891
|
+
const view = new DataView(this._mem.buffer);
|
|
1892
|
+
let off = ptr;
|
|
1893
|
+
for (let k = 0; k < n; k++) {
|
|
1894
|
+
const byteLen = view.getUint32(off, true);
|
|
1895
|
+
const location = view.getUint32(off + 4, true);
|
|
1896
|
+
const text = byteLen > 0
|
|
1897
|
+
? _dec.decode(new Uint8Array(this._mem.buffer, off + 8, byteLen))
|
|
1898
|
+
: '';
|
|
1899
|
+
if (location === prevLocation)
|
|
1900
|
+
sub++;
|
|
1901
|
+
else {
|
|
1902
|
+
sub = 0;
|
|
1903
|
+
prevLocation = location;
|
|
1904
|
+
}
|
|
1905
|
+
out.push({ docId, location, ord, sub, text, byteLen, id: `${docId}::${ord}` });
|
|
1906
|
+
ord++;
|
|
1907
|
+
off += 8 + byteLen;
|
|
1908
|
+
}
|
|
1909
|
+
}
|
|
1910
|
+
return out;
|
|
1911
|
+
}
|
|
1912
|
+
/** Doc-table slot (0..getDocCount) whose stable id is `docId`, or -1. */
|
|
1913
|
+
_docSlotOf(docId) {
|
|
1914
|
+
const w = this._wasm;
|
|
1915
|
+
const n = w.getDocCount();
|
|
1916
|
+
for (let i = 0; i < n; i++) {
|
|
1917
|
+
if (w.getDocId(i) === docId)
|
|
1918
|
+
return i;
|
|
1919
|
+
}
|
|
1920
|
+
return -1;
|
|
1652
1921
|
}
|
|
1653
1922
|
/**
|
|
1654
1923
|
* Search the index. Supports:
|
|
@@ -1658,12 +1927,18 @@ export class AlbexEngine {
|
|
|
1658
1927
|
*
|
|
1659
1928
|
* Pass `{ windowed: true }` to receive cropped snippets with ASCII ellipsis
|
|
1660
1929
|
* markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
|
|
1930
|
+
*
|
|
1931
|
+
* Note: this synchronous path never uses the GPU pre-filter — the WebGPU
|
|
1932
|
+
* scan is asynchronous by nature. Only `searchCooperative` (the budgeted
|
|
1933
|
+
* path) engages the GPU; `search()` always runs the CPU Bloom pre-filter,
|
|
1934
|
+
* regardless of the `gpu` option.
|
|
1661
1935
|
*/
|
|
1662
1936
|
search(query, opts = {}) {
|
|
1663
1937
|
this._assertIdle('search');
|
|
1664
1938
|
const w = this._wasm;
|
|
1665
1939
|
const ql = this._writeStr(query);
|
|
1666
1940
|
const kind = w.prepareQuery(ql);
|
|
1941
|
+
this._lastTruncFlags = w.getQueryTruncationFlags();
|
|
1667
1942
|
if (kind < 0)
|
|
1668
1943
|
return [];
|
|
1669
1944
|
if (kind === 2) {
|
|
@@ -1717,6 +1992,7 @@ export class AlbexEngine {
|
|
|
1717
1992
|
const w = this._wasm;
|
|
1718
1993
|
const ql = this._writeStr(query);
|
|
1719
1994
|
const kind = w.prepareQuery(ql);
|
|
1995
|
+
this._lastTruncFlags = w.getQueryTruncationFlags();
|
|
1720
1996
|
if (kind < 0)
|
|
1721
1997
|
return [];
|
|
1722
1998
|
if (kind === 2) {
|
|
@@ -1728,7 +2004,12 @@ export class AlbexEngine {
|
|
|
1728
2004
|
w.selectQueryBranch(i);
|
|
1729
2005
|
const r = await this._runSearchBudgeted(query, opts, budget, undefined, i);
|
|
1730
2006
|
for (const x of r) {
|
|
1731
|
-
|
|
2007
|
+
// chunkId ("<docId>::<ord>") distinguishes two sub-chunks of the
|
|
2008
|
+
// same location — a (doc, location, matchStart) key would collide
|
|
2009
|
+
// when both sub-chunks hit at the same relative offset and drop a
|
|
2010
|
+
// legitimate result (audit 3.4). matchStart keeps distinct hits
|
|
2011
|
+
// within one chunk across branches.
|
|
2012
|
+
const key = `${x.chunkId}:${x.matchStart}`;
|
|
1732
2013
|
if (!seen.has(key)) {
|
|
1733
2014
|
seen.add(key);
|
|
1734
2015
|
all.push(x);
|
|
@@ -1763,19 +2044,17 @@ export class AlbexEngine {
|
|
|
1763
2044
|
*/
|
|
1764
2045
|
async _runSearchBudgeted(displayQuery, opts, budgetMs, phraseTokens, branchIdx = 0) {
|
|
1765
2046
|
const w = this._wasm;
|
|
1766
|
-
// Pattern is already set by the caller via selectQueryBranch(branchIdx)
|
|
1767
|
-
//
|
|
1768
|
-
//
|
|
1769
|
-
//
|
|
1770
|
-
const activePatternLen = w.getQueryBranchPattern(branchIdx);
|
|
1771
|
-
const activePattern = activePatternLen > 0 ? this._readPad(activePatternLen) : '';
|
|
2047
|
+
// Pattern is already set by the caller via selectQueryBranch(branchIdx),
|
|
2048
|
+
// which also computed THAT branch's pattern Bloom inside WASM — so the
|
|
2049
|
+
// GPU pre-filter below builds the right candidate mask per OR branch
|
|
2050
|
+
// (audit finding #6) without re-reading the pattern across the frontier.
|
|
1772
2051
|
// GPU pre-filter (CD1). If enabled AND the corpus is large enough,
|
|
1773
2052
|
// the GPU computes the candidate bitset and we install it into WASM
|
|
1774
2053
|
// before searchBegin so the slice loop only inspects candidates.
|
|
1775
2054
|
// Failure here is silent: we fall back to CPU-only Bloom transparently.
|
|
1776
2055
|
if (this._shouldEngageGpu()) {
|
|
1777
2056
|
try {
|
|
1778
|
-
await this._gpuPreFilter(
|
|
2057
|
+
await this._gpuPreFilter();
|
|
1779
2058
|
}
|
|
1780
2059
|
catch (e) {
|
|
1781
2060
|
// Don't let a GPU hiccup kill the search — drop to CPU path.
|
|
@@ -1785,12 +2064,20 @@ export class AlbexEngine {
|
|
|
1785
2064
|
});
|
|
1786
2065
|
w.clearCandidateMask();
|
|
1787
2066
|
}
|
|
2067
|
+
// The GPU pre-filter pushes the candidate bitset through the
|
|
2068
|
+
// scratchpad, overwriting the pattern staged by selectQueryBranch.
|
|
2069
|
+
// searchBegin() snapshots the pattern FROM the scratchpad, so it
|
|
2070
|
+
// would compile garbage tokens out of the mask bytes (audit 1.2 —
|
|
2071
|
+
// every GPU-assisted search silently returned wrong results).
|
|
2072
|
+
// Re-select the active branch to restore the pattern.
|
|
2073
|
+
w.selectQueryBranch(branchIdx);
|
|
1788
2074
|
}
|
|
1789
2075
|
const t0 = performance.now();
|
|
1790
2076
|
if (w.searchBegin() === 0) {
|
|
1791
2077
|
this._lastSearch = {
|
|
1792
2078
|
query: displayQuery, timeMs: 0, results: 0,
|
|
1793
2079
|
bloomTested: 0, bloomPassed: 0, bitapMatched: 0,
|
|
2080
|
+
...this._truncStats(),
|
|
1794
2081
|
};
|
|
1795
2082
|
return [];
|
|
1796
2083
|
}
|
|
@@ -1829,21 +2116,96 @@ export class AlbexEngine {
|
|
|
1829
2116
|
bloomTested: w.getStatBloomTested(),
|
|
1830
2117
|
bloomPassed: w.getStatBloomPassed(),
|
|
1831
2118
|
bitapMatched: w.getStatBitapMatched(),
|
|
2119
|
+
...this._truncStats(),
|
|
1832
2120
|
};
|
|
1833
2121
|
return this._collectResults(count, opts, phraseTokens);
|
|
1834
2122
|
}
|
|
2123
|
+
/** Truncation booleans for SearchStats, decoded from the flags the WASM
|
|
2124
|
+
* reported for the most recent prepareQuery (audit 1.6 — the engine used
|
|
2125
|
+
* to drop OR branches past 8 and tokens past 4 in silence). */
|
|
2126
|
+
_truncStats() {
|
|
2127
|
+
const f = this._lastTruncFlags;
|
|
2128
|
+
return {
|
|
2129
|
+
truncatedBranches: (f & 1) !== 0,
|
|
2130
|
+
truncatedTokens: (f & 2) !== 0,
|
|
2131
|
+
truncatedQuery: (f & 4) !== 0,
|
|
2132
|
+
};
|
|
2133
|
+
}
|
|
1835
2134
|
/** Materialise results [0..count) into the public SearchResult shape.
|
|
1836
2135
|
* When `phraseTokens` is given, each result is kept only if those tokens
|
|
1837
2136
|
* appear adjacently in the FULL chunk text — independent of any display
|
|
1838
|
-
* windowing — so phrase queries stay correct under `{ windowed: true }`.
|
|
2137
|
+
* windowing — so phrase queries stay correct under `{ windowed: true }`.
|
|
2138
|
+
*
|
|
2139
|
+
* Frontier discipline (audit 2.1): all numeric fields of every result are
|
|
2140
|
+
* read in ONE DataView pass over the `#[repr(C)]` RESULTS array
|
|
2141
|
+
* (`getResultsPtr`/`getResultStride`, ABI 6) — the old path made 12-15
|
|
2142
|
+
* frontier calls per result. Strings still need calls, minimised to one
|
|
2143
|
+
* snippet read per result plus one doc-name read per DISTINCT document
|
|
2144
|
+
* (the old `getResultDocName` was additionally O(doc_count) inside WASM
|
|
2145
|
+
* for every single result). */
|
|
1839
2146
|
_collectResults(count, opts, phraseTokens) {
|
|
1840
2147
|
const w = this._wasm;
|
|
1841
2148
|
const windowed = opts.windowed === true;
|
|
1842
2149
|
const before = opts.before ?? 60;
|
|
1843
2150
|
const after = opts.after ?? 120;
|
|
1844
2151
|
const phraseFilter = phraseTokens && phraseTokens.length > 0 ? phraseTokens : null;
|
|
2152
|
+
// Map each live doc_id to its CHUNKS[] base (to turn a result's absolute
|
|
2153
|
+
// chunk index into a compact()-stable doc-relative ordinal) and to its
|
|
2154
|
+
// doc-table slot (for O(1) name resolution via getDocName).
|
|
2155
|
+
const chunkBaseByDocId = new Map();
|
|
2156
|
+
const slotByDocId = new Map();
|
|
2157
|
+
{
|
|
2158
|
+
const docCount = w.getDocCount();
|
|
2159
|
+
for (let d = 0; d < docCount; d++) {
|
|
2160
|
+
const id = w.getDocId(d);
|
|
2161
|
+
chunkBaseByDocId.set(id, w.getDocChunkBase(d));
|
|
2162
|
+
slotByDocId.set(id, d);
|
|
2163
|
+
}
|
|
2164
|
+
}
|
|
2165
|
+
const raw = new Array(count);
|
|
2166
|
+
{
|
|
2167
|
+
const ptr = w.getResultsPtr();
|
|
2168
|
+
const stride = w.getResultStride();
|
|
2169
|
+
const view = new DataView(this._mem.buffer, ptr, count * stride);
|
|
2170
|
+
for (let i = 0; i < count; i++) {
|
|
2171
|
+
const base = i * stride;
|
|
2172
|
+
const matchCount = view.getUint32(base + 56, true);
|
|
2173
|
+
const matches = [];
|
|
2174
|
+
for (let k = 0; k < matchCount && k < 4; k++) {
|
|
2175
|
+
matches.push({
|
|
2176
|
+
start: view.getUint32(base + 24 + k * 8, true),
|
|
2177
|
+
end: view.getUint32(base + 28 + k * 8, true),
|
|
2178
|
+
});
|
|
2179
|
+
}
|
|
2180
|
+
const matchStart = view.getUint32(base + 16, true);
|
|
2181
|
+
const matchEnd = view.getUint32(base + 20, true);
|
|
2182
|
+
if (matches.length === 0)
|
|
2183
|
+
matches.push({ start: matchStart, end: matchEnd });
|
|
2184
|
+
raw[i] = {
|
|
2185
|
+
docId: view.getUint32(base, true),
|
|
2186
|
+
chunkIdx: view.getUint32(base + 4, true),
|
|
2187
|
+
location: view.getUint32(base + 8, true),
|
|
2188
|
+
score: view.getUint16(base + 12, true),
|
|
2189
|
+
matchStart, matchEnd, matches,
|
|
2190
|
+
};
|
|
2191
|
+
}
|
|
2192
|
+
}
|
|
2193
|
+
// Resolve each distinct doc name ONCE per search (one getDocName call
|
|
2194
|
+
// per document that actually appears in the results).
|
|
2195
|
+
const nameByDocId = new Map();
|
|
2196
|
+
const docName = (docId) => {
|
|
2197
|
+
let name = nameByDocId.get(docId);
|
|
2198
|
+
if (name === undefined) {
|
|
2199
|
+
const slot = slotByDocId.get(docId);
|
|
2200
|
+
const nl = slot !== undefined ? w.getDocName(slot) : 0;
|
|
2201
|
+
name = nl > 0 ? this._readPad(nl) : '?';
|
|
2202
|
+
nameByDocId.set(docId, name);
|
|
2203
|
+
}
|
|
2204
|
+
return name;
|
|
2205
|
+
};
|
|
1845
2206
|
const results = [];
|
|
1846
2207
|
for (let i = 0; i < count; i++) {
|
|
2208
|
+
const r = raw[i];
|
|
1847
2209
|
// Phrase adjacency check against the full chunk text (getSnippet), not
|
|
1848
2210
|
// the possibly-cropped display window.
|
|
1849
2211
|
if (phraseFilter) {
|
|
@@ -1852,30 +2214,18 @@ export class AlbexEngine {
|
|
|
1852
2214
|
if (!containsPhrase(full, phraseFilter))
|
|
1853
2215
|
continue;
|
|
1854
2216
|
}
|
|
1855
|
-
const
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
const name = nl > 0 ? this._readPad(nl) : '?';
|
|
1861
|
-
const matchCount = w.getResultMatchCount(i);
|
|
1862
|
-
const matches = [];
|
|
1863
|
-
for (let k = 0; k < matchCount; k++) {
|
|
1864
|
-
matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
|
|
1865
|
-
}
|
|
1866
|
-
if (matches.length === 0)
|
|
1867
|
-
matches.push({ start: matchStart, end: matchEnd });
|
|
1868
|
-
let snippet;
|
|
1869
|
-
let primaryStart = matchStart;
|
|
1870
|
-
let primaryEnd = matchEnd;
|
|
1871
|
-
let adjustedMatches = matches;
|
|
2217
|
+
const chunkOrd = r.chunkIdx - (chunkBaseByDocId.get(r.docId) ?? 0);
|
|
2218
|
+
let snippetBytes;
|
|
2219
|
+
let primaryStart = r.matchStart;
|
|
2220
|
+
let primaryEnd = r.matchEnd;
|
|
2221
|
+
let adjustedMatches = r.matches;
|
|
1872
2222
|
if (windowed) {
|
|
1873
2223
|
const sl = w.getSnippetWindow(i, before, after);
|
|
1874
|
-
|
|
2224
|
+
snippetBytes = sl > 0 ? this._readPadBytes(sl) : new Uint8Array(0);
|
|
1875
2225
|
const offset = w.getSnippetWindowOffset();
|
|
1876
2226
|
const leadingPrefix = offset > 0 ? 4 : 0;
|
|
1877
2227
|
const shift = leadingPrefix - offset;
|
|
1878
|
-
adjustedMatches = matches.map(m => ({
|
|
2228
|
+
adjustedMatches = r.matches.map(m => ({
|
|
1879
2229
|
start: Math.max(0, m.start + shift),
|
|
1880
2230
|
end: Math.max(0, m.end + shift),
|
|
1881
2231
|
}));
|
|
@@ -1884,21 +2234,31 @@ export class AlbexEngine {
|
|
|
1884
2234
|
}
|
|
1885
2235
|
else {
|
|
1886
2236
|
const sl = w.getSnippet(i);
|
|
1887
|
-
|
|
2237
|
+
snippetBytes = sl > 0 ? this._readPadBytes(sl) : new Uint8Array(0);
|
|
1888
2238
|
}
|
|
2239
|
+
const snippet = snippetBytes.length > 0 ? _dec.decode(snippetBytes) : '';
|
|
2240
|
+
// UTF-16 view of the primary span, ready for `snippet.slice()` —
|
|
2241
|
+
// byte offsets and JS string indices diverge on the first accent
|
|
2242
|
+
// (audit 3.1, the consumer footgun in the main Spanish use case).
|
|
2243
|
+
const snippetStart = utf16IndexAtByte(snippetBytes, primaryStart);
|
|
2244
|
+
const snippetEnd = utf16IndexAtByte(snippetBytes, primaryEnd);
|
|
1889
2245
|
results.push({
|
|
1890
|
-
documentName:
|
|
1891
|
-
|
|
1892
|
-
|
|
2246
|
+
documentName: docName(r.docId),
|
|
2247
|
+
docId: r.docId,
|
|
2248
|
+
location: r.location,
|
|
2249
|
+
chunkId: `${r.docId}::${chunkOrd}`,
|
|
2250
|
+
score: r.score,
|
|
1893
2251
|
snippet,
|
|
1894
2252
|
matchStart: primaryStart,
|
|
1895
2253
|
matchEnd: primaryEnd,
|
|
1896
2254
|
matches: adjustedMatches,
|
|
2255
|
+
snippetStart,
|
|
2256
|
+
snippetEnd,
|
|
1897
2257
|
});
|
|
1898
2258
|
}
|
|
1899
2259
|
return results;
|
|
1900
2260
|
}
|
|
1901
|
-
/** Run all OR branches and merge dedup-by-(
|
|
2261
|
+
/** Run all OR branches and merge dedup-by-(chunkId, matchStart). The
|
|
1902
2262
|
* branches are already compiled inside the WASM (by prepareQuery); we
|
|
1903
2263
|
* iterate them with selectQueryBranch. The "rawQuery" param is kept
|
|
1904
2264
|
* only for the lastSearch.query field. */
|
|
@@ -1911,7 +2271,10 @@ export class AlbexEngine {
|
|
|
1911
2271
|
w.selectQueryBranch(i);
|
|
1912
2272
|
const results = this._runSearch(rawQuery, opts);
|
|
1913
2273
|
for (const r of results) {
|
|
1914
|
-
|
|
2274
|
+
// Keyed on chunkId, not (doc, location, matchStart): two sub-chunks
|
|
2275
|
+
// of the same location can hit at the same relative offset, and the
|
|
2276
|
+
// old key silently dropped one of them (audit 3.4).
|
|
2277
|
+
const key = `${r.chunkId}:${r.matchStart}`;
|
|
1915
2278
|
if (!seen.has(key)) {
|
|
1916
2279
|
seen.add(key);
|
|
1917
2280
|
all.push(r);
|
|
@@ -1936,10 +2299,12 @@ export class AlbexEngine {
|
|
|
1936
2299
|
bloomTested: w.getStatBloomTested(),
|
|
1937
2300
|
bloomPassed: w.getStatBloomPassed(),
|
|
1938
2301
|
bitapMatched: w.getStatBitapMatched(),
|
|
2302
|
+
...this._truncStats(),
|
|
1939
2303
|
};
|
|
1940
2304
|
return this._collectResults(count, opts, phraseTokens);
|
|
1941
2305
|
}
|
|
1942
|
-
/** Returns current engine statistics
|
|
2306
|
+
/** Returns current engine statistics (capacities are the RUNTIME values
|
|
2307
|
+
* the engine was initialised with via the `capacity` option). */
|
|
1943
2308
|
getStats() {
|
|
1944
2309
|
return {
|
|
1945
2310
|
documents: this._docs.length,
|
|
@@ -1947,9 +2312,9 @@ export class AlbexEngine {
|
|
|
1947
2312
|
textUsed: this._wasm.getTextUsed(),
|
|
1948
2313
|
textCapacity: this._wasm.getTextCapacity(),
|
|
1949
2314
|
wasmMemoryBytes: this._mem.buffer.byteLength,
|
|
1950
|
-
tier: this._tier,
|
|
1951
2315
|
maxChunks: this._wasm.getMaxChunks(),
|
|
1952
2316
|
maxDocs: this._wasm.getMaxDocs(),
|
|
2317
|
+
namePoolBytes: this._wasm.getNameCapacity(),
|
|
1953
2318
|
};
|
|
1954
2319
|
}
|
|
1955
2320
|
/** Returns stats from the most recent search, or null. */
|
|
@@ -1993,10 +2358,15 @@ export class AlbexEngine {
|
|
|
1993
2358
|
this._resetInner();
|
|
1994
2359
|
}
|
|
1995
2360
|
_resetInner() {
|
|
1996
|
-
|
|
2361
|
+
// Re-init with the engine's CONFIGURED capacity, not the std defaults
|
|
2362
|
+
// (`wasm.init()` would silently shrink a 'large'/custom engine). Same
|
|
2363
|
+
// capacities → the WASM side does a plain counter reset, no realloc.
|
|
2364
|
+
const c = this._capacity;
|
|
2365
|
+
this._wasm.initWithCapacity(c.maxDocs, c.maxChunks, c.textPoolBytes, c.namePoolBytes);
|
|
1997
2366
|
this._docs = [];
|
|
1998
2367
|
this._lastSearch = null;
|
|
1999
2368
|
this._diagnostics = [];
|
|
2369
|
+
this._gpuUploadDirty = true;
|
|
2000
2370
|
}
|
|
2001
2371
|
/**
|
|
2002
2372
|
* Drain and return the diagnostics collected since the last call (or
|
|
@@ -2145,6 +2515,8 @@ export class AlbexEngine {
|
|
|
2145
2515
|
if (w.restoreCommit() !== 1)
|
|
2146
2516
|
return false;
|
|
2147
2517
|
}
|
|
2518
|
+
// The restored chunk array replaces whatever the GPU last saw.
|
|
2519
|
+
this._gpuUploadDirty = true;
|
|
2148
2520
|
// Rebuild _docs metadata from the restored WASM tables.
|
|
2149
2521
|
//
|
|
2150
2522
|
// What's available after a restore:
|