albex 0.3.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +275 -0
- package/README.md +4 -2
- package/dist/albex-worker.js +1 -1
- package/dist/albex.d.ts +157 -17
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +405 -232
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +16 -2
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +6 -3
- package/dist/errors.js.map +1 -1
- package/dist/persistence.js +1 -1
- package/dist/profile.d.ts +11 -6
- package/dist/profile.d.ts.map +1 -1
- package/dist/profile.js +6 -13
- package/dist/profile.js.map +1 -1
- package/dist/resource-manager.js +1 -1
- package/dist/tiered-store.js +1 -1
- package/dist/wasm-bindings.d.ts +46 -5
- package/dist/wasm-bindings.d.ts.map +1 -1
- package/dist/wasm-bindings.js +102 -7
- package/dist/wasm-bindings.js.map +1 -1
- package/dist/worker-protocol.js +1 -1
- package/dist/worker-runtime.js +12 -3
- package/dist/worker-runtime.js.map +1 -1
- package/package.json +13 -9
- package/src/albex.ts +478 -246
- package/src/errors.ts +18 -2
- package/src/profile.ts +11 -10
- package/src/wasm-bindings.ts +157 -8
- package/src/worker-runtime.ts +12 -2
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_std.wasm +0 -0
- package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/src/albex.ts
CHANGED
|
@@ -21,10 +21,12 @@ import {
|
|
|
21
21
|
asAlbexPdfExports,
|
|
22
22
|
} from './wasm-bindings.js';
|
|
23
23
|
import {
|
|
24
|
+
AlbexError,
|
|
24
25
|
AlbexInitError,
|
|
25
26
|
AlbexUnsupportedFormatError,
|
|
26
27
|
AlbexParseError,
|
|
27
28
|
AlbexCapacityError,
|
|
29
|
+
type AlbexCapacityLimit,
|
|
28
30
|
} from './errors.js';
|
|
29
31
|
import {
|
|
30
32
|
savePersisted,
|
|
@@ -32,7 +34,7 @@ import {
|
|
|
32
34
|
deletePersisted,
|
|
33
35
|
listPersisted,
|
|
34
36
|
} from './persistence.js';
|
|
35
|
-
import { detectProfile,
|
|
37
|
+
import { detectProfile, shouldUseGpu, type Tier, type DeviceProfile } from './profile.js';
|
|
36
38
|
import { getResourceManager, type ResourceState } from './resource-manager.js';
|
|
37
39
|
import { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
|
|
38
40
|
|
|
@@ -96,6 +98,8 @@ export interface AlbexOptions {
|
|
|
96
98
|
* Override the tier auto-detection. Pass `'auto'` (default), or an
|
|
97
99
|
* explicit tier when you know the constraints of your target environment.
|
|
98
100
|
*/
|
|
101
|
+
/** @deprecated Since 0.5.0 this option has no effect: Albex no longer has capacity tiers;
|
|
102
|
+
* pass `'auto'` or omit. Other values are accepted and ignored. */
|
|
99
103
|
tier?: 'auto' | 'mini' | 'std' | 'pro';
|
|
100
104
|
/**
|
|
101
105
|
* SIMD selection. When `'auto'` (default), Albex probes for v128 support
|
|
@@ -201,57 +205,57 @@ export interface SearchStats {
|
|
|
201
205
|
bitapMatched: number;
|
|
202
206
|
}
|
|
203
207
|
|
|
204
|
-
// ─────────────────────────────────────────────────────────────────────────────
|
|
205
|
-
// Query parsing
|
|
206
|
-
// ─────────────────────────────────────────────────────────────────────────────
|
|
207
|
-
|
|
208
|
-
type SimpleQuery = { kind: 'simple'; tokens: string[] };
|
|
209
|
-
type PhraseQuery = { kind: 'phrase'; tokens: string[]; raw: string };
|
|
210
|
-
type OrQuery = { kind: 'or'; branches: string[][] };
|
|
211
|
-
type ParsedQuery = SimpleQuery | PhraseQuery | OrQuery;
|
|
212
|
-
|
|
213
|
-
function tokenize(q: string): string[] {
|
|
214
|
-
return q.trim().split(/\s+/).filter(t => t.length > 0);
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
function parseQuery(q: string): ParsedQuery {
|
|
218
|
-
const trimmed = q.trim();
|
|
219
|
-
|
|
220
|
-
// OR: "term1 | term2" or "phrase one | phrase two"
|
|
221
|
-
if (trimmed.includes('|')) {
|
|
222
|
-
const branches = trimmed.split('|')
|
|
223
|
-
.map(p => tokenize(p.replace(/"/g, '')))
|
|
224
|
-
.filter(b => b.length > 0);
|
|
225
|
-
return { kind: 'or', branches };
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
// Phrase: "exact phrase here"
|
|
229
|
-
const phraseMatch = /^"(.+)"$/.exec(trimmed);
|
|
230
|
-
if (phraseMatch) {
|
|
231
|
-
const inner = phraseMatch[1] ?? '';
|
|
232
|
-
const tokens = tokenize(inner);
|
|
233
|
-
return { kind: 'phrase', tokens, raw: inner };
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
return { kind: 'simple', tokens: tokenize(trimmed) };
|
|
237
|
-
}
|
|
238
|
-
|
|
239
208
|
/**
|
|
240
|
-
*
|
|
241
|
-
*
|
|
209
|
+
* One structured warning recorded by the engine during indexFile or
|
|
210
|
+
* load. Replaces the pre-0.5.0 pattern of scattered `console.warn`
|
|
211
|
+
* calls. Inspect via `engine.takeDiagnostics()` after the operation.
|
|
242
212
|
*/
|
|
243
|
-
|
|
244
|
-
|
|
213
|
+
export interface AlbexDiagnostic {
|
|
214
|
+
/** Coarse kind. `'recovered'` means the engine handled the issue and
|
|
215
|
+
* kept going; `'skipped'` means content was dropped; `'fallback'` means
|
|
216
|
+
* an alternate code path was used (e.g. lopdf after pdf-extract trap). */
|
|
217
|
+
kind: 'recovered' | 'skipped' | 'fallback' | 'info';
|
|
218
|
+
/** Where in the pipeline this happened. One of a fixed set of short stage tags. */
|
|
219
|
+
stage: 'pdf' | 'ocr' | 'gpu' | 'persistence' | 'network';
|
|
220
|
+
/** Human-readable message safe to surface in a UI. */
|
|
221
|
+
message: string;
|
|
222
|
+
/** Optional file the issue belongs to. */
|
|
223
|
+
file?: string;
|
|
224
|
+
/** Optional page number (1-based for PDFs). */
|
|
225
|
+
page?: number;
|
|
245
226
|
}
|
|
246
227
|
|
|
247
228
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
248
|
-
//
|
|
229
|
+
// Query parsing (WASM-side as of 0.5.0)
|
|
249
230
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
231
|
+
//
|
|
232
|
+
// Pre-0.5.0 this file owned parseQuery + tokenize. That created two
|
|
233
|
+
// truths about what a "token" was: one in TS for the query, one in Rust
|
|
234
|
+
// for the indexed text. The audit flagged this as the biggest divergence
|
|
235
|
+
// in the wrapper.
|
|
236
|
+
//
|
|
237
|
+
// 0.5.0 moves parseQuery/tokenize/tokensToWasmQuery to Rust. The TS
|
|
238
|
+
// dispatcher reduces to:
|
|
239
|
+
//
|
|
240
|
+
// 1. Write the raw UTF-8 query bytes to the scratchpad.
|
|
241
|
+
// 2. Call prepareQuery(len). Get back the kind (simple/phrase/or).
|
|
242
|
+
// 3. For OR: iterate getQueryBranchCount() branches, calling
|
|
243
|
+
// selectQueryBranch(i) + search() for each, then merge in TS.
|
|
244
|
+
// For simple/phrase: selectQueryBranch(0) + search().
|
|
245
|
+
// 4. For phrase: post-filter the snippets with containsPhrase().
|
|
246
|
+
//
|
|
247
|
+
// containsPhrase stays in TS because it operates on snippet text already
|
|
248
|
+
// produced by the WASM, not on the query. It is not a tokenizer.
|
|
250
249
|
|
|
251
250
|
/**
|
|
252
|
-
* Returns true if `snippet` contains the phrase
|
|
253
|
-
* with at most `maxGap` characters between
|
|
254
|
-
* Comparison is case- and accent-insensitive.
|
|
251
|
+
* Phrase post-filter. Returns true if `snippet` contains the phrase
|
|
252
|
+
* formed by `tokens` in order, with at most `maxGap` characters between
|
|
253
|
+
* consecutive tokens. Comparison is case- and accent-insensitive.
|
|
254
|
+
*
|
|
255
|
+
* The tokens come from the WASM-compiled pattern of a phrase branch,
|
|
256
|
+
* not from a TS re-tokenization of the query, so there is no
|
|
257
|
+
* tokenization divergence: WASM said "these are the tokens", we just
|
|
258
|
+
* check adjacency in the snippet.
|
|
255
259
|
*/
|
|
256
260
|
function containsPhrase(snippet: string, tokens: string[], maxGap = 30): boolean {
|
|
257
261
|
const norm = (s: string): string =>
|
|
@@ -392,32 +396,11 @@ function computePatternBloom(query: string): bigint {
|
|
|
392
396
|
return bits;
|
|
393
397
|
}
|
|
394
398
|
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
// FNV prime: 0x100000001b3 = (0x100 << 32) | 0x000001b3
|
|
401
|
-
for (let i = 0; i < bytes.length; i++) {
|
|
402
|
-
lo ^= bytes[i]!;
|
|
403
|
-
// multiply by FNV prime
|
|
404
|
-
// (hi:lo) *= 0x100000001b3
|
|
405
|
-
// low * prime
|
|
406
|
-
const lo_lo = (lo & 0xffff) * 0x1b3;
|
|
407
|
-
const lo_hi = (lo >>> 16) * 0x1b3;
|
|
408
|
-
let new_lo = (lo_lo + ((lo_hi & 0xffff) << 16)) | 0;
|
|
409
|
-
let carry = (lo_hi >>> 16) + ((lo_lo + ((lo_hi & 0xffff) << 16)) > 0xffffffff ? 1 : 0);
|
|
410
|
-
// hi*prime + carry
|
|
411
|
-
const hi_lo = (hi & 0xffff) * 0x1b3;
|
|
412
|
-
const hi_hi = (hi >>> 16) * 0x1b3;
|
|
413
|
-
const new_hi = ((hi_lo + ((hi_hi & 0xffff) << 16)) | 0) + carry + lo; // + lo because high 33rd bit
|
|
414
|
-
lo = new_lo;
|
|
415
|
-
hi = new_hi | 0;
|
|
416
|
-
}
|
|
417
|
-
const hexHi = (hi >>> 0).toString(16).padStart(8, '0');
|
|
418
|
-
const hexLo = (lo >>> 0).toString(16).padStart(8, '0');
|
|
419
|
-
return hexHi + hexLo;
|
|
420
|
-
}
|
|
399
|
+
// Note: `_contentHash` is implemented as a private method on AlbexEngine below
|
|
400
|
+
// (it needs access to the WASM scratchpad). The standalone TS reference
|
|
401
|
+
// implementation that used to live here was removed in 0.4.0 — the
|
|
402
|
+
// canonical hash now lives in wasm/src/lib.rs::hashBytes so there is
|
|
403
|
+
// exactly one definition of "the content hash of these bytes".
|
|
421
404
|
|
|
422
405
|
/**
|
|
423
406
|
* 16-hex-char content hash → 8 raw bytes for setDocumentContentHash. The
|
|
@@ -612,11 +595,20 @@ function makePdfWasmImports(
|
|
|
612
595
|
case '__wbindgen_externref_table_set_null':
|
|
613
596
|
return (idx: number) => { heap[idx] = undefined; };
|
|
614
597
|
}
|
|
615
|
-
// Unknown import —
|
|
616
|
-
//
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
598
|
+
// Unknown import — fail fast. An import we don't recognise means the
|
|
599
|
+
// wasm-bindgen / lopdf / getrandom dependency graph has drifted from
|
|
600
|
+
// the prefixes this loader is written to satisfy. Accepting the
|
|
601
|
+
// module would defer the failure to an arbitrary execution path,
|
|
602
|
+
// typically deep inside extractPdf(), where the user gets either a
|
|
603
|
+
// hang or a misleading "PDF parse error". Refusing instantiation
|
|
604
|
+
// surfaces the version skew at boot, where the maintainer can act
|
|
605
|
+
// on it.
|
|
606
|
+
throw new AlbexInitError(
|
|
607
|
+
`Unknown PDF WASM import "${modName}.${name}". ` +
|
|
608
|
+
`The albex_pdf.wasm binary was probably built with a newer Rust ` +
|
|
609
|
+
`toolchain or dependency graph than this loader was written for. ` +
|
|
610
|
+
`Rebuild with 'npm run build:pdf-wasm' or open an issue.`,
|
|
611
|
+
);
|
|
620
612
|
};
|
|
621
613
|
|
|
622
614
|
const imports: Record<string, Record<string, unknown>> = {};
|
|
@@ -647,6 +639,39 @@ export interface OcrAttachedOptions {
|
|
|
647
639
|
hint?: string;
|
|
648
640
|
}
|
|
649
641
|
|
|
642
|
+
/**
|
|
643
|
+
* Contract the engine accepts from an OCR plugin. `@albex/ocr` is the
|
|
644
|
+
* canonical implementation, but any module that satisfies this interface
|
|
645
|
+
* can be attached via `engine.attachOcr(adapter)`.
|
|
646
|
+
*/
|
|
647
|
+
export interface OcrAdapter {
|
|
648
|
+
/** Invoked by the engine to OCR a single image. Receives whatever the
|
|
649
|
+
* caller passes (Blob, ArrayBuffer, etc.); the adapter is responsible
|
|
650
|
+
* for accepting that input. Must return text + confidence. */
|
|
651
|
+
recognize(image: unknown, opts?: OcrAttachedOptions): Promise<OcrAttachedResult>;
|
|
652
|
+
|
|
653
|
+
/** Engine-side switches the adapter wants honoured. The only one
|
|
654
|
+
* defined today is `alwaysExtractEmbeddedImages`, which turns on the
|
|
655
|
+
* hybrid PDF OCR pass. New flags can be added without breaking the
|
|
656
|
+
* adapter interface. */
|
|
657
|
+
options?: {
|
|
658
|
+
/** When true, every PDF (native or scanned) is walked for embedded
|
|
659
|
+
* images and each qualifying image is sent to `recognize`. Off by
|
|
660
|
+
* default to keep performance predictable on native PDFs. */
|
|
661
|
+
alwaysExtractEmbeddedImages?: boolean;
|
|
662
|
+
};
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
/** Returned by `attachOcr`. Holds the lifecycle handles for the plugin.
|
|
666
|
+
* Calling `dispose()` removes the adapter from the engine; subsequent
|
|
667
|
+
* `engine.ocrImage` access returns `undefined` again. */
|
|
668
|
+
export interface OcrHandle {
|
|
669
|
+
/** Detach the plugin and tear down any resources it holds. After this,
|
|
670
|
+
* the engine reverts to "no OCR" — scanned PDFs go back to registering
|
|
671
|
+
* with zero chunks. */
|
|
672
|
+
dispose(): Promise<void>;
|
|
673
|
+
}
|
|
674
|
+
|
|
650
675
|
export class AlbexEngine {
|
|
651
676
|
// ── main WASM ──
|
|
652
677
|
private _wasm!: AlbexWasmExports;
|
|
@@ -658,23 +683,20 @@ export class AlbexEngine {
|
|
|
658
683
|
* runtime dependency on OCR — this is a structural slot that the optional
|
|
659
684
|
* companion package fills.
|
|
660
685
|
*/
|
|
661
|
-
ocrImage?: (image: unknown, opts?: OcrAttachedOptions) => Promise<OcrAttachedResult>;
|
|
662
|
-
|
|
663
686
|
/**
|
|
664
|
-
*
|
|
665
|
-
*
|
|
666
|
-
*
|
|
667
|
-
*
|
|
668
|
-
* labels).
|
|
669
|
-
*
|
|
670
|
-
* When `alwaysExtractEmbeddedImages` is true, every page of every PDF
|
|
671
|
-
* passes through `extractPageImages` after the normal text extraction;
|
|
672
|
-
* any image that meets the size filter (200×200 in Rust) is fed to
|
|
673
|
-
* `ocrImage`. Performance cost: 1–3 s per qualifying image.
|
|
674
|
-
*
|
|
675
|
-
* Off by default — set this opt-in via the OCR module's options.
|
|
687
|
+
* Public OCR entry point. Forwards to the attached OCR adapter installed
|
|
688
|
+
* via `attachOcr()`. Reading this property is a feature-detect for
|
|
689
|
+
* integrators: `if (engine.ocrImage) { ... OCR available ... }`. Writing
|
|
690
|
+
* to it directly is no longer supported in 0.5.0+ — use `attachOcr`.
|
|
676
691
|
*/
|
|
677
|
-
|
|
692
|
+
get ocrImage(): ((image: unknown, opts?: OcrAttachedOptions) => Promise<OcrAttachedResult>) | undefined {
|
|
693
|
+
return this._ocrAdapter?.recognize;
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
/** Private adapter slot. Holds the OCR plugin contract installed by
|
|
697
|
+
* `attachOcr()`. The engine reads `recognize` and `options` here; the
|
|
698
|
+
* caller never gets a reference to this object directly. */
|
|
699
|
+
private _ocrAdapter: OcrAdapter | null = null;
|
|
678
700
|
|
|
679
701
|
// ── PDF WASM (lazy) ──
|
|
680
702
|
private _pdfWasm: AlbexPdfExports | null = null;
|
|
@@ -682,6 +704,11 @@ export class AlbexEngine {
|
|
|
682
704
|
|
|
683
705
|
private _docs: IndexedDocument[] = [];
|
|
684
706
|
private _lastSearch: SearchStats | null = null;
|
|
707
|
+
/** Structured diagnostics collected during the most recent operation.
|
|
708
|
+
* Drained by `takeDiagnostics()`. Capped at 256 entries to avoid
|
|
709
|
+
* unbounded memory growth in pathological cases (very corrupted
|
|
710
|
+
* corpora producing thousands of recovery warnings). */
|
|
711
|
+
private _diagnostics: AlbexDiagnostic[] = [];
|
|
685
712
|
private _tier: Tier | null = null;
|
|
686
713
|
private _simd: boolean = false;
|
|
687
714
|
private _profile: DeviceProfile | null = null;
|
|
@@ -691,10 +718,55 @@ export class AlbexEngine {
|
|
|
691
718
|
private _unsubscribeResources: (() => void) | null = null;
|
|
692
719
|
private readonly _opts: AlbexOptions;
|
|
693
720
|
|
|
721
|
+
// ── Concurrency guard ──────────────────────────────────────────────────────
|
|
722
|
+
// One WASM instance, global mutable state, async ops that yield to the
|
|
723
|
+
// scheduler between slices. Two overlapping operations corrupt each other
|
|
724
|
+
// (e.g. a fresh searchBegin resets the cursor of an in-flight cooperative
|
|
725
|
+
// search). Async ops serialize through `_opChain`; sync mutators/searches
|
|
726
|
+
// assert the engine is idle (audit 0.6.0, finding #2).
|
|
727
|
+
private _opChain: Promise<unknown> = Promise.resolve();
|
|
728
|
+
private _busy = false;
|
|
729
|
+
|
|
694
730
|
constructor(opts: AlbexOptions) {
|
|
695
731
|
this._opts = opts;
|
|
696
732
|
}
|
|
697
733
|
|
|
734
|
+
/** Serialize an async engine operation behind any in-flight one. */
|
|
735
|
+
private _exclusive<T>(fn: () => Promise<T>): Promise<T> {
|
|
736
|
+
const run = this._opChain.then(async () => {
|
|
737
|
+
this._busy = true;
|
|
738
|
+
try { return await fn(); }
|
|
739
|
+
finally { this._busy = false; }
|
|
740
|
+
});
|
|
741
|
+
// Swallow result/error on the chain so one failure can't wedge the queue.
|
|
742
|
+
this._opChain = run.then(() => undefined, () => undefined);
|
|
743
|
+
return run as Promise<T>;
|
|
744
|
+
}
|
|
745
|
+
|
|
746
|
+
/** Guard a synchronous mutator/search: refuse to run mid-async-operation
|
|
747
|
+
* rather than silently corrupt the shared WASM state. */
|
|
748
|
+
private _assertIdle(method: string): void {
|
|
749
|
+
if (this._busy) {
|
|
750
|
+
throw new AlbexError(
|
|
751
|
+
'busy',
|
|
752
|
+
`${method}() was called while an async engine operation is still ` +
|
|
753
|
+
`running. Await the previous indexFile/save/load/replaceDocument/` +
|
|
754
|
+
`searchCooperative call, or use searchCooperative instead of search().`,
|
|
755
|
+
);
|
|
756
|
+
}
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
/** Compact opportunistically when tombstones pile up under text pressure,
|
|
760
|
+
* so repeated removeDocument/replaceDocument don't exhaust the pool. */
|
|
761
|
+
private _autoCompactIfNeeded(): void {
|
|
762
|
+
const w = this._wasm;
|
|
763
|
+
const cap = w.getTextCapacity();
|
|
764
|
+
const hasTombstones = w.getDocCount() > this._docs.length;
|
|
765
|
+
if (hasTombstones && cap > 0 && w.getTextUsed() / cap > 0.85) {
|
|
766
|
+
w.compact();
|
|
767
|
+
}
|
|
768
|
+
}
|
|
769
|
+
|
|
698
770
|
/** Load and initialise the main WASM module. Must be called before any other method. */
|
|
699
771
|
async init(): Promise<void> {
|
|
700
772
|
const url = await this._resolveWasmUrl();
|
|
@@ -754,29 +826,28 @@ export class AlbexEngine {
|
|
|
754
826
|
// as an asset reference. They copy the .wasm to the output directory and
|
|
755
827
|
// rewrite the URL automatically. Consumers who use one of those bundlers
|
|
756
828
|
// get a working `new AlbexEngine()` with no manual setup.
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
this._simd = false;
|
|
762
|
-
return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
|
|
763
|
-
}
|
|
764
|
-
|
|
765
|
-
let tier: Tier;
|
|
766
|
-
if (o.tier && o.tier !== 'auto') tier = o.tier;
|
|
767
|
-
else tier = pickTier(profile);
|
|
768
|
-
this._tier = tier;
|
|
769
|
-
|
|
829
|
+
// 0.5.0+: two main binaries only — baseline and SIMD. The tier
|
|
830
|
+
// system is gone (audit 4.1). Selection collapses to a single
|
|
831
|
+
// boolean: SIMD on or off, decided either by the explicit `simd`
|
|
832
|
+
// option or by a runtime probe.
|
|
770
833
|
const simd = o.simd === 'on'
|
|
771
834
|
? true
|
|
772
835
|
: o.simd === 'off'
|
|
773
836
|
? false
|
|
774
837
|
: !!profile?.wasm.simd;
|
|
775
838
|
this._simd = simd;
|
|
839
|
+
this._tier = 'std';
|
|
840
|
+
|
|
841
|
+
if (!o.wasmBaseUrl) {
|
|
842
|
+
// Zero-config: bundler resolves the .wasm next to dist/. We only
|
|
843
|
+
// ship the baseline alias (albex_wasm_bg.wasm) inside the npm
|
|
844
|
+
// package; integrators who want SIMD must serve both binaries
|
|
845
|
+
// themselves via `wasmBaseUrl`.
|
|
846
|
+
return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
|
|
847
|
+
}
|
|
776
848
|
|
|
777
|
-
const suffix = simd ? `${tier}_simd` : tier;
|
|
778
849
|
const base = o.wasmBaseUrl.replace(/\/+$/, '');
|
|
779
|
-
return `${base}/
|
|
850
|
+
return simd ? `${base}/albex_wasm_simd.wasm` : `${base}/albex_wasm.wasm`;
|
|
780
851
|
}
|
|
781
852
|
|
|
782
853
|
/** The tier that was actually loaded. `null` until `init()` resolves. */
|
|
@@ -887,6 +958,35 @@ export class AlbexEngine {
|
|
|
887
958
|
}
|
|
888
959
|
}
|
|
889
960
|
|
|
961
|
+
/**
|
|
962
|
+
* Compute the FNV-1a 64-bit content hash of `bytes` via the WASM
|
|
963
|
+
* streaming API. Returns a 16-character hex string identical in shape
|
|
964
|
+
* to what the TS implementation in 0.3.x returned, so all callers
|
|
965
|
+
* stay unchanged. Single source of truth — same hash whether we use
|
|
966
|
+
* it for indexFile dedup, for snapshot v2 persistence, or anywhere
|
|
967
|
+
* else. Large inputs are chunked at FEED_SIZE just like _feedText.
|
|
968
|
+
*/
|
|
969
|
+
private _contentHash(bytes: Uint8Array): string {
|
|
970
|
+
const w = this._wasm;
|
|
971
|
+
w.hashBegin();
|
|
972
|
+
for (let i = 0; i < bytes.length; i += FEED_SIZE) {
|
|
973
|
+
const c = bytes.subarray(i, i + FEED_SIZE);
|
|
974
|
+
this._writePad(c);
|
|
975
|
+
w.hashFeed(c.length);
|
|
976
|
+
}
|
|
977
|
+
w.hashFinish();
|
|
978
|
+
// Read 8 result bytes back from scratchpad[0..8].
|
|
979
|
+
const ptr = w.getBuffer(8);
|
|
980
|
+
const out = this._u8(ptr, 8);
|
|
981
|
+
// Big-endian to hex. Same layout as the old hexHi + hexLo output:
|
|
982
|
+
// high u32 first (4 bytes), low u32 second (4 bytes).
|
|
983
|
+
let s = '';
|
|
984
|
+
for (let i = 0; i < 8; i++) {
|
|
985
|
+
s += out[i]!.toString(16).padStart(2, '0');
|
|
986
|
+
}
|
|
987
|
+
return s;
|
|
988
|
+
}
|
|
989
|
+
|
|
890
990
|
private _feedXmlBytes(xml: Uint8Array, fn: 'feedXmlBytes' | 'feedXlsxBytes'): void {
|
|
891
991
|
const feeder = this._wasm[fn];
|
|
892
992
|
for (let i = 0; i < xml.length; i += FEED_SIZE) {
|
|
@@ -910,7 +1010,10 @@ export class AlbexEngine {
|
|
|
910
1010
|
// called when the user actually drops a PDF — but we record a diagnostic
|
|
911
1011
|
// hint so embedders can surface a "this will download ~1 MB" prompt.
|
|
912
1012
|
if (this._resources?.constrainedNetwork) {
|
|
913
|
-
|
|
1013
|
+
this._diag({
|
|
1014
|
+
kind: 'info', stage: 'network',
|
|
1015
|
+
message: 'Downloading PDF WASM (~1 MB) on a constrained network connection',
|
|
1016
|
+
});
|
|
914
1017
|
}
|
|
915
1018
|
const res = await fetch(pdfUrl);
|
|
916
1019
|
if (!res.ok) throw new AlbexInitError(`Failed to fetch PDF WASM: ${res.status}`);
|
|
@@ -1046,21 +1149,14 @@ export class AlbexEngine {
|
|
|
1046
1149
|
this._wasm.flushParagraph();
|
|
1047
1150
|
}
|
|
1048
1151
|
|
|
1049
|
-
// Hybrid OCR pass: when the OCR
|
|
1050
|
-
// `alwaysExtractEmbeddedImages: true`, also walk every page
|
|
1051
|
-
// embedded images and OCR them on top of the vector text.
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
const hasOcr = !!this.ocrImage;
|
|
1057
|
-
const binSupportsImages = typeof pw.extractPageImages === 'function'
|
|
1058
|
-
&& typeof pw.getPageCount === 'function';
|
|
1059
|
-
console.log(`[albex] hybrid OCR decision: ocrImage=${hasOcr} ocrConfig.alwaysExtractEmbeddedImages=${hybridOn} binarySupportsImages=${binSupportsImages}`);
|
|
1060
|
-
|
|
1061
|
-
if (hasOcr && hybridOn && binSupportsImages) {
|
|
1152
|
+
// Hybrid OCR pass: when the OCR adapter is wired with
|
|
1153
|
+
// `options.alwaysExtractEmbeddedImages: true`, also walk every page
|
|
1154
|
+
// for embedded images and OCR them on top of the vector text.
|
|
1155
|
+
if (this._ocrAdapter
|
|
1156
|
+
&& this._ocrAdapter.options?.alwaysExtractEmbeddedImages
|
|
1157
|
+
&& typeof pw.extractPageImages === 'function'
|
|
1158
|
+
&& typeof pw.getPageCount === 'function') {
|
|
1062
1159
|
const totalPages = pw.getPageCount();
|
|
1063
|
-
console.log(`[albex] hybrid OCR pass starting over ${totalPages} page(s)`);
|
|
1064
1160
|
for (let p = 0; p < totalPages; p++) {
|
|
1065
1161
|
const ocrText = await this._ocrPageEmbeddedImages(pw, p);
|
|
1066
1162
|
if (ocrText === null) break; // WASM trapped, stop hybrid pass.
|
|
@@ -1148,7 +1244,10 @@ export class AlbexEngine {
|
|
|
1148
1244
|
// so `_ensurePdfWasm` re-instantiates on the next call.
|
|
1149
1245
|
this._pdfWasm = null;
|
|
1150
1246
|
this._pdfMem = null;
|
|
1151
|
-
|
|
1247
|
+
this._diag({
|
|
1248
|
+
kind: 'skipped', stage: 'pdf', page: page + 1,
|
|
1249
|
+
message: `PDF image extractor trapped: ${e instanceof Error ? e.message : String(e)}. Remaining pages skipped.`,
|
|
1250
|
+
});
|
|
1152
1251
|
return null;
|
|
1153
1252
|
}
|
|
1154
1253
|
if (imageCount <= 0) return '';
|
|
@@ -1174,16 +1273,6 @@ export class AlbexEngine {
|
|
|
1174
1273
|
copy.set(new Uint8Array(liveMem.buffer, ptr, len));
|
|
1175
1274
|
const blob = new Blob([copy.buffer as ArrayBuffer], { type: mime });
|
|
1176
1275
|
|
|
1177
|
-
// Defensive diagnostics: when an OCR call goes wrong (Tesseract
|
|
1178
|
-
// worker abort, malformed JPEG, etc.) the first thing we want to
|
|
1179
|
-
// see is whether we even handed it valid image bytes. A real JPEG
|
|
1180
|
-
// starts with FF D8 FF (E0 for JFIF, E1 for EXIF). A JPEG2000
|
|
1181
|
-
// starts with 00 00 00 0C 6A 50 20 20.
|
|
1182
|
-
const magic = Array.from(copy.subarray(0, 4))
|
|
1183
|
-
.map(b => b.toString(16).padStart(2, '0'))
|
|
1184
|
-
.join(' ');
|
|
1185
|
-
console.log(`[albex] OCR page ${page + 1} image ${i + 1}/${imageCount}: kind=${kind} (${mime}) len=${len} bytes magic=${magic}`);
|
|
1186
|
-
|
|
1187
1276
|
try {
|
|
1188
1277
|
const { text } = await ocr(blob);
|
|
1189
1278
|
const trimmed = text?.trim();
|
|
@@ -1197,7 +1286,10 @@ export class AlbexEngine {
|
|
|
1197
1286
|
// "Aborted(-1)") are also caught here; if they bypass the
|
|
1198
1287
|
// promise rejection and surface as `uncaught` instead, the
|
|
1199
1288
|
// demo's window.onerror handler will keep the app alive.
|
|
1200
|
-
|
|
1289
|
+
this._diag({
|
|
1290
|
+
kind: 'skipped', stage: 'ocr', page: page + 1,
|
|
1291
|
+
message: `OCR failed on image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`,
|
|
1292
|
+
});
|
|
1201
1293
|
}
|
|
1202
1294
|
}
|
|
1203
1295
|
|
|
@@ -1242,7 +1334,10 @@ export class AlbexEngine {
|
|
|
1242
1334
|
inPtr = pw.allocInput(bytes.length);
|
|
1243
1335
|
new Uint8Array(pw.memory.buffer, inPtr, bytes.length).set(bytes);
|
|
1244
1336
|
} catch (e) {
|
|
1245
|
-
|
|
1337
|
+
this._diag({
|
|
1338
|
+
kind: 'skipped', stage: 'pdf',
|
|
1339
|
+
message: `PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`,
|
|
1340
|
+
});
|
|
1246
1341
|
return null;
|
|
1247
1342
|
}
|
|
1248
1343
|
|
|
@@ -1252,7 +1347,10 @@ export class AlbexEngine {
|
|
|
1252
1347
|
// first page, no paragraphs are emitted and we end up with 0 chunks.
|
|
1253
1348
|
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
1254
1349
|
this._wasm.beginDocument();
|
|
1255
|
-
|
|
1350
|
+
this._diag({
|
|
1351
|
+
kind: 'fallback', stage: 'pdf', file: file.name,
|
|
1352
|
+
message: `pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf`,
|
|
1353
|
+
});
|
|
1256
1354
|
await this._indexPdfScanned(pw);
|
|
1257
1355
|
return this._wasm.endDocument();
|
|
1258
1356
|
}
|
|
@@ -1657,6 +1755,10 @@ export class AlbexEngine {
|
|
|
1657
1755
|
* Throws for unsupported formats or parse errors.
|
|
1658
1756
|
*/
|
|
1659
1757
|
async indexFile(file: File): Promise<IndexedDocument> {
|
|
1758
|
+
return this._exclusive(() => this._indexFileInner(file));
|
|
1759
|
+
}
|
|
1760
|
+
|
|
1761
|
+
private async _indexFileInner(file: File): Promise<IndexedDocument> {
|
|
1660
1762
|
const ext = file.name.split('.').pop()?.toLowerCase() ?? '';
|
|
1661
1763
|
const indexer = AlbexEngine._INDEXERS[ext];
|
|
1662
1764
|
if (!indexer) throw new AlbexUnsupportedFormatError(ext);
|
|
@@ -1664,7 +1766,7 @@ export class AlbexEngine {
|
|
|
1664
1766
|
// Hash the source bytes for idempotency. We always read the bytes once
|
|
1665
1767
|
// here so the indexer can reuse them — avoids a double File.arrayBuffer().
|
|
1666
1768
|
const bytes = new Uint8Array(await file.arrayBuffer());
|
|
1667
|
-
const hash =
|
|
1769
|
+
const hash = this._contentHash(bytes);
|
|
1668
1770
|
|
|
1669
1771
|
// Idempotency: if a non-deleted doc already has this hash, return it
|
|
1670
1772
|
// unchanged. Cheap O(N) scan since MAX_DOCS = 128.
|
|
@@ -1689,6 +1791,30 @@ export class AlbexEngine {
|
|
|
1689
1791
|
}
|
|
1690
1792
|
|
|
1691
1793
|
const chunks = await indexer(this, file, bytes);
|
|
1794
|
+
|
|
1795
|
+
// Capacity check (0.6.0). The WASM pools fill silently and break out of
|
|
1796
|
+
// their ingest loops; getLastIndexOverflow reports which one filled.
|
|
1797
|
+
// Surface a typed error instead of returning a half-indexed document the
|
|
1798
|
+
// caller cannot tell apart from a complete one (audit finding #3).
|
|
1799
|
+
const overflow = w.getLastIndexOverflow();
|
|
1800
|
+
if (overflow !== 0) {
|
|
1801
|
+
const which: AlbexCapacityLimit =
|
|
1802
|
+
(overflow & 1) ? 'chunks' : (overflow & 2) ? 'text'
|
|
1803
|
+
: (overflow & 4) ? 'docs' : 'names';
|
|
1804
|
+
const pools = [
|
|
1805
|
+
overflow & 1 ? 'chunk pool' : '',
|
|
1806
|
+
overflow & 2 ? 'text pool' : '',
|
|
1807
|
+
overflow & 4 ? 'document table' : '',
|
|
1808
|
+
overflow & 8 ? 'name pool' : '',
|
|
1809
|
+
].filter(Boolean).join(', ');
|
|
1810
|
+
throw new AlbexCapacityError(
|
|
1811
|
+
`Index capacity exceeded while indexing "${file.name}" (${pools} full). ` +
|
|
1812
|
+
`The document was rolled back (not indexed); treat the index as full ` +
|
|
1813
|
+
`(compact(), shard across an AlbexPool, or reset()).`,
|
|
1814
|
+
which,
|
|
1815
|
+
);
|
|
1816
|
+
}
|
|
1817
|
+
|
|
1692
1818
|
// The new doc occupies slot `docCountBefore`.
|
|
1693
1819
|
const docId = w.getDocId(docCountBefore);
|
|
1694
1820
|
|
|
@@ -1713,6 +1839,11 @@ export class AlbexEngine {
|
|
|
1713
1839
|
* Returns `true` if a matching document was found and tombstoned.
|
|
1714
1840
|
*/
|
|
1715
1841
|
removeDocument(id: string): boolean {
|
|
1842
|
+
this._assertIdle('removeDocument');
|
|
1843
|
+
return this._removeDocumentInner(id);
|
|
1844
|
+
}
|
|
1845
|
+
|
|
1846
|
+
private _removeDocumentInner(id: string): boolean {
|
|
1716
1847
|
const doc = this._docs.find(d => d.name === id || d.contentHash === id);
|
|
1717
1848
|
if (!doc) return false;
|
|
1718
1849
|
const ok = this._wasm.removeDocument(doc.docId) === 1;
|
|
@@ -1728,12 +1859,15 @@ export class AlbexEngine {
|
|
|
1728
1859
|
* idempotency check (so re-indexing the *same* bytes after a remove works).
|
|
1729
1860
|
*/
|
|
1730
1861
|
async replaceDocument(name: string, newFile: File): Promise<IndexedDocument> {
|
|
1731
|
-
this.
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1862
|
+
return this._exclusive(async () => {
|
|
1863
|
+
this._removeDocumentInner(name);
|
|
1864
|
+
// Index directly via the inner path (we already hold the lock).
|
|
1865
|
+
const doc = await this._indexFileInner(newFile);
|
|
1866
|
+
// Repeated replaces leave tombstones in the text pool; reclaim under
|
|
1867
|
+
// pressure so the pool isn't silently exhausted (audit finding #7).
|
|
1868
|
+
this._autoCompactIfNeeded();
|
|
1869
|
+
return doc;
|
|
1870
|
+
});
|
|
1737
1871
|
}
|
|
1738
1872
|
|
|
1739
1873
|
/**
|
|
@@ -1744,6 +1878,7 @@ export class AlbexEngine {
|
|
|
1744
1878
|
* references (e.g. in a UI) remain valid.
|
|
1745
1879
|
*/
|
|
1746
1880
|
compact(): void {
|
|
1881
|
+
this._assertIdle('compact');
|
|
1747
1882
|
this._wasm.compact();
|
|
1748
1883
|
}
|
|
1749
1884
|
|
|
@@ -1757,19 +1892,35 @@ export class AlbexEngine {
|
|
|
1757
1892
|
* markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
|
|
1758
1893
|
*/
|
|
1759
1894
|
search(query: string, opts: SearchOptions = {}): SearchResult[] {
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1895
|
+
this._assertIdle('search');
|
|
1896
|
+
const w = this._wasm;
|
|
1897
|
+
const ql = this._writeStr(query);
|
|
1898
|
+
const kind = w.prepareQuery(ql);
|
|
1899
|
+
if (kind < 0) return [];
|
|
1900
|
+
|
|
1901
|
+
if (kind === 2) {
|
|
1902
|
+
// OR: iterate branches and merge in TS. WASM stores compiled
|
|
1903
|
+
// branches internally so we never re-tokenize on the host.
|
|
1904
|
+
return this._searchOr(query, opts);
|
|
1764
1905
|
}
|
|
1765
1906
|
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1907
|
+
w.selectQueryBranch(0);
|
|
1908
|
+
// Phrase queries (kind 1) post-filter on adjacency. Pass the tokens down
|
|
1909
|
+
// so the check runs against the FULL chunk text, not a cropped windowed
|
|
1910
|
+
// snippet — otherwise `{ windowed: true }` could drop a valid phrase hit
|
|
1911
|
+
// whose second term fell outside the window (audit finding #7).
|
|
1912
|
+
const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
|
|
1913
|
+
return this._runSearch(query, opts, phraseTokens);
|
|
1914
|
+
}
|
|
1771
1915
|
|
|
1772
|
-
|
|
1916
|
+
/** Read the WASM-compiled tokens of branch `i` for phrase post-filter.
|
|
1917
|
+
* The bytes returned are exactly what the WASM tokenizer produced —
|
|
1918
|
+
* no TS re-tokenization. */
|
|
1919
|
+
private _branchTokens(i: number): string[] {
|
|
1920
|
+
const n = this._wasm.getQueryBranchPattern(i);
|
|
1921
|
+
if (n === 0) return [];
|
|
1922
|
+
const pattern = this._readPad(n);
|
|
1923
|
+
return pattern.split(' ').filter(t => t.length > 0);
|
|
1773
1924
|
}
|
|
1774
1925
|
|
|
1775
1926
|
/**
|
|
@@ -1787,34 +1938,42 @@ export class AlbexEngine {
|
|
|
1787
1938
|
* Pass `opts.frameBudgetMs` to control the slice size (default 8 ms).
|
|
1788
1939
|
*/
|
|
1789
1940
|
async *searchCooperative(query: string, opts: SearchOptions = {}): AsyncIterable<SearchResult> {
|
|
1790
|
-
|
|
1941
|
+
// Collect under the exclusivity lock so no other engine op interleaves at
|
|
1942
|
+
// a slice boundary; the per-slice scheduler yields still happen inside.
|
|
1943
|
+
const results = await this._exclusive(() => this._searchCooperativeCollect(query, opts));
|
|
1944
|
+
for (const r of results) yield r;
|
|
1945
|
+
}
|
|
1946
|
+
|
|
1947
|
+
/** Materialise a cooperative search to a sorted result array. Runs inside
|
|
1948
|
+
* the exclusivity lock. Frame-budget yielding lives in _runSearchBudgeted. */
|
|
1949
|
+
private async _searchCooperativeCollect(query: string, opts: SearchOptions): Promise<SearchResult[]> {
|
|
1791
1950
|
const budget = opts.frameBudgetMs ?? 8;
|
|
1792
1951
|
const w = this._wasm;
|
|
1793
1952
|
|
|
1794
|
-
|
|
1795
|
-
|
|
1953
|
+
const ql = this._writeStr(query);
|
|
1954
|
+
const kind = w.prepareQuery(ql);
|
|
1955
|
+
if (kind < 0) return [];
|
|
1956
|
+
|
|
1957
|
+
if (kind === 2) {
|
|
1958
|
+
// OR branches — run each as its own resumable search and merge.
|
|
1796
1959
|
const seen = new Set<string>();
|
|
1797
1960
|
const all: SearchResult[] = [];
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
const r = await this._runSearchBudgeted(
|
|
1961
|
+
const n = w.getQueryBranchCount();
|
|
1962
|
+
for (let i = 0; i < n; i++) {
|
|
1963
|
+
w.selectQueryBranch(i);
|
|
1964
|
+
const r = await this._runSearchBudgeted(query, opts, budget, undefined, i);
|
|
1802
1965
|
for (const x of r) {
|
|
1803
1966
|
const key = `${x.documentName}:${x.location}:${x.matchStart}`;
|
|
1804
1967
|
if (!seen.has(key)) { seen.add(key); all.push(x); }
|
|
1805
1968
|
}
|
|
1806
1969
|
}
|
|
1807
1970
|
all.sort((a, b) => b.score - a.score);
|
|
1808
|
-
|
|
1809
|
-
return;
|
|
1971
|
+
return all;
|
|
1810
1972
|
}
|
|
1811
1973
|
|
|
1812
|
-
|
|
1813
|
-
const
|
|
1814
|
-
|
|
1815
|
-
: results;
|
|
1816
|
-
for (const r of filtered) yield r;
|
|
1817
|
-
void w;
|
|
1974
|
+
w.selectQueryBranch(0);
|
|
1975
|
+
const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
|
|
1976
|
+
return this._runSearchBudgeted(query, opts, budget, phraseTokens, 0);
|
|
1818
1977
|
}
|
|
1819
1978
|
|
|
1820
1979
|
/**
|
|
@@ -1838,14 +1997,19 @@ export class AlbexEngine {
|
|
|
1838
1997
|
* may eat the entire budget, which is also fine.
|
|
1839
1998
|
*/
|
|
1840
1999
|
private async _runSearchBudgeted(
|
|
1841
|
-
wasmQuery: string,
|
|
1842
2000
|
displayQuery: string,
|
|
1843
2001
|
opts: SearchOptions,
|
|
1844
2002
|
budgetMs: number,
|
|
2003
|
+
phraseTokens?: string[],
|
|
2004
|
+
branchIdx = 0,
|
|
1845
2005
|
): Promise<SearchResult[]> {
|
|
1846
2006
|
const w = this._wasm;
|
|
1847
|
-
|
|
1848
|
-
|
|
2007
|
+
// Pattern is already set by the caller via selectQueryBranch(branchIdx).
|
|
2008
|
+
// Snapshot THAT branch's compiled pattern for the GPU pre-filter hash —
|
|
2009
|
+
// not branch 0, which would build the wrong candidate mask for OR
|
|
2010
|
+
// branches and silently drop their hits (audit finding #6).
|
|
2011
|
+
const activePatternLen = w.getQueryBranchPattern(branchIdx);
|
|
2012
|
+
const activePattern = activePatternLen > 0 ? this._readPad(activePatternLen) : '';
|
|
1849
2013
|
|
|
1850
2014
|
// GPU pre-filter (CD1). If enabled AND the corpus is large enough,
|
|
1851
2015
|
// the GPU computes the candidate bitset and we install it into WASM
|
|
@@ -1853,10 +2017,13 @@ export class AlbexEngine {
|
|
|
1853
2017
|
// Failure here is silent: we fall back to CPU-only Bloom transparently.
|
|
1854
2018
|
if (this._shouldEngageGpu()) {
|
|
1855
2019
|
try {
|
|
1856
|
-
await this._gpuPreFilter(
|
|
2020
|
+
await this._gpuPreFilter(activePattern);
|
|
1857
2021
|
} catch (e) {
|
|
1858
2022
|
// Don't let a GPU hiccup kill the search — drop to CPU path.
|
|
1859
|
-
|
|
2023
|
+
this._diag({
|
|
2024
|
+
kind: 'fallback', stage: 'gpu',
|
|
2025
|
+
message: `GPU pre-filter failed; falling back to CPU: ${e instanceof Error ? e.message : String(e)}`,
|
|
2026
|
+
});
|
|
1860
2027
|
w.clearCandidateMask();
|
|
1861
2028
|
}
|
|
1862
2029
|
}
|
|
@@ -1912,18 +2079,30 @@ export class AlbexEngine {
|
|
|
1912
2079
|
bitapMatched: w.getStatBitapMatched(),
|
|
1913
2080
|
};
|
|
1914
2081
|
|
|
1915
|
-
return this._collectResults(count, opts);
|
|
2082
|
+
return this._collectResults(count, opts, phraseTokens);
|
|
1916
2083
|
}
|
|
1917
2084
|
|
|
1918
|
-
/** Materialise results [0..count) into the public SearchResult shape.
|
|
1919
|
-
|
|
2085
|
+
/** Materialise results [0..count) into the public SearchResult shape.
|
|
2086
|
+
* When `phraseTokens` is given, each result is kept only if those tokens
|
|
2087
|
+
* appear adjacently in the FULL chunk text — independent of any display
|
|
2088
|
+
* windowing — so phrase queries stay correct under `{ windowed: true }`. */
|
|
2089
|
+
private _collectResults(count: number, opts: SearchOptions, phraseTokens?: string[]): SearchResult[] {
|
|
1920
2090
|
const w = this._wasm;
|
|
1921
2091
|
const windowed = opts.windowed === true;
|
|
1922
2092
|
const before = opts.before ?? 60;
|
|
1923
2093
|
const after = opts.after ?? 120;
|
|
2094
|
+
const phraseFilter = phraseTokens && phraseTokens.length > 0 ? phraseTokens : null;
|
|
1924
2095
|
|
|
1925
2096
|
const results: SearchResult[] = [];
|
|
1926
2097
|
for (let i = 0; i < count; i++) {
|
|
2098
|
+
// Phrase adjacency check against the full chunk text (getSnippet), not
|
|
2099
|
+
// the possibly-cropped display window.
|
|
2100
|
+
if (phraseFilter) {
|
|
2101
|
+
const fl = w.getSnippet(i);
|
|
2102
|
+
const full = fl > 0 ? this._readPad(fl) : '';
|
|
2103
|
+
if (!containsPhrase(full, phraseFilter)) continue;
|
|
2104
|
+
}
|
|
2105
|
+
|
|
1927
2106
|
const score = w.getResultScore(i);
|
|
1928
2107
|
const location = w.getResultLocation(i);
|
|
1929
2108
|
const matchStart = w.getResultStart(i);
|
|
@@ -1973,29 +2152,32 @@ export class AlbexEngine {
|
|
|
1973
2152
|
return results;
|
|
1974
2153
|
}
|
|
1975
2154
|
|
|
1976
|
-
|
|
2155
|
+
/** Run all OR branches and merge dedup-by-(doc, location, match). The
|
|
2156
|
+
* branches are already compiled inside the WASM (by prepareQuery); we
|
|
2157
|
+
* iterate them with selectQueryBranch. The "rawQuery" param is kept
|
|
2158
|
+
* only for the lastSearch.query field. */
|
|
2159
|
+
private _searchOr(rawQuery: string, opts: SearchOptions): SearchResult[] {
|
|
2160
|
+
const w = this._wasm;
|
|
1977
2161
|
const seen = new Set<string>();
|
|
1978
2162
|
const all: SearchResult[] = [];
|
|
1979
|
-
|
|
1980
|
-
for (
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
const results = this._runSearch(q, rawQuery, opts);
|
|
2163
|
+
const n = w.getQueryBranchCount();
|
|
2164
|
+
for (let i = 0; i < n; i++) {
|
|
2165
|
+
w.selectQueryBranch(i);
|
|
2166
|
+
const results = this._runSearch(rawQuery, opts);
|
|
1984
2167
|
for (const r of results) {
|
|
1985
2168
|
const key = `${r.documentName}:${r.location}:${r.matchStart}`;
|
|
1986
2169
|
if (!seen.has(key)) { seen.add(key); all.push(r); }
|
|
1987
2170
|
}
|
|
1988
2171
|
}
|
|
1989
|
-
|
|
1990
|
-
// Re-rank the merged list by score descending.
|
|
1991
2172
|
all.sort((a, b) => b.score - a.score);
|
|
1992
2173
|
return all;
|
|
1993
2174
|
}
|
|
1994
2175
|
|
|
1995
|
-
|
|
2176
|
+
/** Execute a single search using whichever query branch is currently
|
|
2177
|
+
* active (set via selectQueryBranch). Returns the materialised
|
|
2178
|
+
* SearchResult[]. Caller is responsible for activating a branch first. */
|
|
2179
|
+
private _runSearch(displayQuery: string, opts: SearchOptions, phraseTokens?: string[]): SearchResult[] {
|
|
1996
2180
|
const w = this._wasm;
|
|
1997
|
-
const ql = this._writeStr(wasmQuery);
|
|
1998
|
-
w.setPattern(ql);
|
|
1999
2181
|
|
|
2000
2182
|
const t0 = performance.now();
|
|
2001
2183
|
const count = w.search();
|
|
@@ -2010,63 +2192,7 @@ export class AlbexEngine {
|
|
|
2010
2192
|
bitapMatched: w.getStatBitapMatched(),
|
|
2011
2193
|
};
|
|
2012
2194
|
|
|
2013
|
-
|
|
2014
|
-
const before = opts.before ?? 60;
|
|
2015
|
-
const after = opts.after ?? 120;
|
|
2016
|
-
|
|
2017
|
-
const results: SearchResult[] = [];
|
|
2018
|
-
for (let i = 0; i < count; i++) {
|
|
2019
|
-
const score = w.getResultScore(i);
|
|
2020
|
-
const location = w.getResultLocation(i);
|
|
2021
|
-
const matchStart = w.getResultStart(i);
|
|
2022
|
-
const matchEnd = w.getResultEnd(i);
|
|
2023
|
-
const nl = w.getResultDocName(i);
|
|
2024
|
-
const name = nl > 0 ? this._readPad(nl) : '?';
|
|
2025
|
-
|
|
2026
|
-
const matchCount = w.getResultMatchCount(i);
|
|
2027
|
-
const matches: MatchSpan[] = [];
|
|
2028
|
-
for (let k = 0; k < matchCount; k++) {
|
|
2029
|
-
matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
|
|
2030
|
-
}
|
|
2031
|
-
if (matches.length === 0) {
|
|
2032
|
-
matches.push({ start: matchStart, end: matchEnd });
|
|
2033
|
-
}
|
|
2034
|
-
|
|
2035
|
-
let snippet: string;
|
|
2036
|
-
let primaryStart = matchStart;
|
|
2037
|
-
let primaryEnd = matchEnd;
|
|
2038
|
-
let adjustedMatches: MatchSpan[] = matches;
|
|
2039
|
-
|
|
2040
|
-
if (windowed) {
|
|
2041
|
-
const sl = w.getSnippetWindow(i, before, after);
|
|
2042
|
-
snippet = sl > 0 ? this._readPad(sl) : '';
|
|
2043
|
-
const offset = w.getSnippetWindowOffset();
|
|
2044
|
-
// Spans came back chunk-relative; shift them into window-relative.
|
|
2045
|
-
// Account for leading "... " prefix when present.
|
|
2046
|
-
const leadingPrefix = offset > 0 ? 4 : 0;
|
|
2047
|
-
const shift = leadingPrefix - offset;
|
|
2048
|
-
adjustedMatches = matches.map(m => ({
|
|
2049
|
-
start: Math.max(0, m.start + shift),
|
|
2050
|
-
end: Math.max(0, m.end + shift),
|
|
2051
|
-
}));
|
|
2052
|
-
primaryStart = adjustedMatches[0]?.start ?? 0;
|
|
2053
|
-
primaryEnd = adjustedMatches[0]?.end ?? 0;
|
|
2054
|
-
} else {
|
|
2055
|
-
const sl = w.getSnippet(i);
|
|
2056
|
-
snippet = sl > 0 ? this._readPad(sl) : '';
|
|
2057
|
-
}
|
|
2058
|
-
|
|
2059
|
-
results.push({
|
|
2060
|
-
documentName: name,
|
|
2061
|
-
location,
|
|
2062
|
-
score,
|
|
2063
|
-
snippet,
|
|
2064
|
-
matchStart: primaryStart,
|
|
2065
|
-
matchEnd: primaryEnd,
|
|
2066
|
-
matches: adjustedMatches,
|
|
2067
|
-
});
|
|
2068
|
-
}
|
|
2069
|
-
return results;
|
|
2195
|
+
return this._collectResults(count, opts, phraseTokens);
|
|
2070
2196
|
}
|
|
2071
2197
|
|
|
2072
2198
|
/** Returns current engine statistics. */
|
|
@@ -2127,9 +2253,93 @@ export class AlbexEngine {
|
|
|
2127
2253
|
|
|
2128
2254
|
/** Full reset — clears all indexed documents and chunks. */
|
|
2129
2255
|
reset(): void {
|
|
2256
|
+
this._assertIdle('reset');
|
|
2257
|
+
this._resetInner();
|
|
2258
|
+
}
|
|
2259
|
+
|
|
2260
|
+
private _resetInner(): void {
|
|
2130
2261
|
this._wasm.init();
|
|
2131
2262
|
this._docs = [];
|
|
2132
2263
|
this._lastSearch = null;
|
|
2264
|
+
this._diagnostics = [];
|
|
2265
|
+
}
|
|
2266
|
+
|
|
2267
|
+
/**
|
|
2268
|
+
* Drain and return the diagnostics collected since the last call (or
|
|
2269
|
+
* since the engine was created). Use this to surface recoverable
|
|
2270
|
+
* issues to the caller after `indexFile`, `load`, or any other
|
|
2271
|
+
* operation that may run into a "best-effort" path.
|
|
2272
|
+
*
|
|
2273
|
+
* Example diagnostics:
|
|
2274
|
+
* - `{kind:'fallback', stage:'pdf', message:'pdf-extract crashed,
|
|
2275
|
+
* attempting OCR-only fallback', file:'invoice.pdf'}`
|
|
2276
|
+
* - `{kind:'skipped', stage:'ocr', message:'Tesseract abort on page
|
|
2277
|
+
* 3 image 1; remaining images on this page skipped', file:'...',
|
|
2278
|
+
* page:3}`
|
|
2279
|
+
* - `{kind:'fallback', stage:'gpu', message:'GPU pre-filter failed,
|
|
2280
|
+
* using CPU'}`
|
|
2281
|
+
*
|
|
2282
|
+
* The buffer is cleared on each call; callers should consume the
|
|
2283
|
+
* returned array immediately (e.g. log to their telemetry, surface
|
|
2284
|
+
* a UI banner). After `reset()` the buffer is also cleared.
|
|
2285
|
+
*/
|
|
2286
|
+
takeDiagnostics(): AlbexDiagnostic[] {
|
|
2287
|
+
const out = this._diagnostics;
|
|
2288
|
+
this._diagnostics = [];
|
|
2289
|
+
return out;
|
|
2290
|
+
}
|
|
2291
|
+
|
|
2292
|
+
/** Internal: record a diagnostic. Capped at 256 to bound memory. */
|
|
2293
|
+
private _diag(entry: AlbexDiagnostic): void {
|
|
2294
|
+
if (this._diagnostics.length >= 256) return;
|
|
2295
|
+
this._diagnostics.push(entry);
|
|
2296
|
+
}
|
|
2297
|
+
|
|
2298
|
+
/**
|
|
2299
|
+
* Install an OCR adapter. Returns a handle whose `dispose()` removes the
|
|
2300
|
+
* adapter from the engine.
|
|
2301
|
+
*
|
|
2302
|
+
* The contract: the adapter must provide `recognize(image, opts)` that
|
|
2303
|
+
* returns `Promise<OcrAttachedResult>`. The engine validates the
|
|
2304
|
+
* contract at attach time and refuses adapters that don't expose a
|
|
2305
|
+
* recognise function. Only one adapter can be attached at a time; a
|
|
2306
|
+
* second call to `attachOcr` while one is active throws — the caller
|
|
2307
|
+
* must dispose the previous one first.
|
|
2308
|
+
*
|
|
2309
|
+
* @example
|
|
2310
|
+
* ```ts
|
|
2311
|
+
* import { enableOcr } from '@albex/ocr';
|
|
2312
|
+
* const handle = enableOcr(engine); // internally calls attachOcr
|
|
2313
|
+
* // ... later ...
|
|
2314
|
+
* await handle.dispose();
|
|
2315
|
+
* ```
|
|
2316
|
+
*
|
|
2317
|
+
* Direct use without the companion package:
|
|
2318
|
+
* ```ts
|
|
2319
|
+
* const handle = engine.attachOcr({
|
|
2320
|
+
* recognize: async (blob) => myCustomOcr(blob),
|
|
2321
|
+
* options: { alwaysExtractEmbeddedImages: false },
|
|
2322
|
+
* });
|
|
2323
|
+
* ```
|
|
2324
|
+
*/
|
|
2325
|
+
attachOcr(adapter: OcrAdapter): OcrHandle {
|
|
2326
|
+
if (this._ocrAdapter) {
|
|
2327
|
+
throw new AlbexInitError(
|
|
2328
|
+
'OCR adapter already attached. Call dispose() on the previous handle before attaching a new one.',
|
|
2329
|
+
);
|
|
2330
|
+
}
|
|
2331
|
+
if (typeof adapter?.recognize !== 'function') {
|
|
2332
|
+
throw new AlbexInitError(
|
|
2333
|
+
'attachOcr requires an adapter with a recognize(image, opts) function.',
|
|
2334
|
+
);
|
|
2335
|
+
}
|
|
2336
|
+
this._ocrAdapter = adapter;
|
|
2337
|
+
return {
|
|
2338
|
+
dispose: async () => {
|
|
2339
|
+
// Idempotent: a double dispose is a no-op rather than a throw.
|
|
2340
|
+
if (this._ocrAdapter === adapter) this._ocrAdapter = null;
|
|
2341
|
+
},
|
|
2342
|
+
};
|
|
2133
2343
|
}
|
|
2134
2344
|
|
|
2135
2345
|
// ── Persistence ───────────────────────────────────────────────────────────
|
|
@@ -2142,6 +2352,10 @@ export class AlbexEngine {
|
|
|
2142
2352
|
* state in roughly O(total bytes), bypassing re-parsing.
|
|
2143
2353
|
*/
|
|
2144
2354
|
async save(name: string): Promise<void> {
|
|
2355
|
+
return this._exclusive(() => this._saveInner(name));
|
|
2356
|
+
}
|
|
2357
|
+
|
|
2358
|
+
private async _saveInner(name: string): Promise<void> {
|
|
2145
2359
|
const w = this._wasm;
|
|
2146
2360
|
const total = w.snapshotSize();
|
|
2147
2361
|
if (total === 0) {
|
|
@@ -2168,6 +2382,10 @@ export class AlbexEngine {
|
|
|
2168
2382
|
* header (wrong magic, version, or struct sizes).
|
|
2169
2383
|
*/
|
|
2170
2384
|
async load(name: string): Promise<boolean> {
|
|
2385
|
+
return this._exclusive(() => this._loadInner(name));
|
|
2386
|
+
}
|
|
2387
|
+
|
|
2388
|
+
private async _loadInner(name: string): Promise<boolean> {
|
|
2171
2389
|
const bytes = await loadPersisted(name);
|
|
2172
2390
|
if (!bytes || bytes.length === 0) return false;
|
|
2173
2391
|
|
|
@@ -2188,6 +2406,17 @@ export class AlbexEngine {
|
|
|
2188
2406
|
off += n;
|
|
2189
2407
|
}
|
|
2190
2408
|
|
|
2409
|
+
// Commit. For v3 this is the atomic apply step (state is untouched
|
|
2410
|
+
// until now); a failure here leaves the previous index intact so the
|
|
2411
|
+
// caller can keep using the engine. For v1/v2 snapshots `restoreCommit`
|
|
2412
|
+
// is a no-op that returns 1 (those formats applied in-place during
|
|
2413
|
+
// restoreFeed and have no rollback to offer). Older binaries that
|
|
2414
|
+
// predate v3 do not export `restoreCommit` — in that case we treat
|
|
2415
|
+
// the load as already committed by feature-detect.
|
|
2416
|
+
if (typeof w.restoreCommit === 'function') {
|
|
2417
|
+
if (w.restoreCommit() !== 1) return false;
|
|
2418
|
+
}
|
|
2419
|
+
|
|
2191
2420
|
// Rebuild _docs metadata from the restored WASM tables.
|
|
2192
2421
|
//
|
|
2193
2422
|
// What's available after a restore:
|
|
@@ -2250,9 +2479,11 @@ export class AlbexEngine {
|
|
|
2250
2479
|
* empty. Returns whether a load actually happened.
|
|
2251
2480
|
*/
|
|
2252
2481
|
async loadOrInit(name: string): Promise<boolean> {
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2482
|
+
return this._exclusive(async () => {
|
|
2483
|
+
const loaded = await this._loadInner(name);
|
|
2484
|
+
if (!loaded) this._resetInner();
|
|
2485
|
+
return loaded;
|
|
2486
|
+
});
|
|
2256
2487
|
}
|
|
2257
2488
|
|
|
2258
2489
|
/** Delete a previously persisted snapshot. */
|
|
@@ -2277,7 +2508,8 @@ export class AlbexEngine {
|
|
|
2277
2508
|
* WASM instance and its (typically 20 MB) backing memory.
|
|
2278
2509
|
*/
|
|
2279
2510
|
[Symbol.dispose](): void {
|
|
2280
|
-
|
|
2511
|
+
// Terminal: bypass the idle guard — disposing mid-operation is allowed.
|
|
2512
|
+
this._resetInner();
|
|
2281
2513
|
this._unsubscribeResources?.();
|
|
2282
2514
|
this._unsubscribeResources = null;
|
|
2283
2515
|
this._gpu?.destroy();
|