albex 0.3.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +275 -0
- package/README.md +4 -2
- package/dist/albex-worker.js +1 -1
- package/dist/albex.d.ts +157 -17
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +405 -232
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +16 -2
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +6 -3
- package/dist/errors.js.map +1 -1
- package/dist/persistence.js +1 -1
- package/dist/profile.d.ts +11 -6
- package/dist/profile.d.ts.map +1 -1
- package/dist/profile.js +6 -13
- package/dist/profile.js.map +1 -1
- package/dist/resource-manager.js +1 -1
- package/dist/tiered-store.js +1 -1
- package/dist/wasm-bindings.d.ts +46 -5
- package/dist/wasm-bindings.d.ts.map +1 -1
- package/dist/wasm-bindings.js +102 -7
- package/dist/wasm-bindings.js.map +1 -1
- package/dist/worker-protocol.js +1 -1
- package/dist/worker-runtime.js +12 -3
- package/dist/worker-runtime.js.map +1 -1
- package/package.json +13 -9
- package/src/albex.ts +478 -246
- package/src/errors.ts +18 -2
- package/src/profile.ts +11 -10
- package/src/wasm-bindings.ts +157 -8
- package/src/worker-runtime.ts +12 -2
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_std.wasm +0 -0
- package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
package/dist/albex.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*!
|
|
2
|
-
* albex v0.3.0
|
|
2
|
+
* albex v0.6.0
|
|
3
3
|
* Zero-config local full-text search for documents — runs entirely in the browser, no server, no upload.
|
|
4
4
|
* (c) 2026 RafaCalRob
|
|
5
5
|
* @license MIT
|
|
@@ -21,9 +21,9 @@
|
|
|
21
21
|
* ```
|
|
22
22
|
*/
|
|
23
23
|
import { asAlbexExports, asAlbexPdfExports, } from './wasm-bindings.js';
|
|
24
|
-
import { AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
|
|
24
|
+
import { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
|
|
25
25
|
import { savePersisted, loadPersisted, deletePersisted, listPersisted, } from './persistence.js';
|
|
26
|
-
import { detectProfile,
|
|
26
|
+
import { detectProfile, shouldUseGpu } from './profile.js';
|
|
27
27
|
import { getResourceManager } from './resource-manager.js';
|
|
28
28
|
import { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
|
|
29
29
|
export { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
|
|
@@ -48,41 +48,36 @@ function warnSearchStreamDeprecated() {
|
|
|
48
48
|
'scheduler between slices and returns a batch. The alias will be ' +
|
|
49
49
|
'removed in 0.4.0.');
|
|
50
50
|
}
|
|
51
|
-
/**
 * Split a raw query string into whitespace-separated tokens.
 *
 * Leading/trailing whitespace is ignored and runs of whitespace count as a
 * single separator, so the result never contains empty strings.
 *
 * @param {string} q - Raw query text.
 * @returns {string[]} Tokens in their original order (empty for a blank query).
 */
function tokenize(q) {
    const words = [];
    for (const piece of q.trim().split(/\s+/)) {
        // Splitting an empty string yields one empty piece; drop it.
        if (piece.length > 0) {
            words.push(piece);
        }
    }
    return words;
}
|
|
54
|
-
/**
 * Classify a raw query string and split it into tokens.
 *
 * Three shapes are recognised, checked in this order:
 *   - OR:     contains `|`; each side becomes a branch of tokens, double
 *             quotes are stripped and empty branches are discarded.
 *   - phrase: the whole (trimmed) query is wrapped in double quotes.
 *   - simple: anything else; whitespace-separated tokens.
 *
 * @param {string} q - Raw query text.
 * @returns {{kind: 'or', branches: string[][]}
 *          | {kind: 'phrase', tokens: string[], raw: string}
 *          | {kind: 'simple', tokens: string[]}}
 */
function parseQuery(q) {
    // Same whitespace tokenization as the module-level tokenize() helper.
    const splitWords = (text) => text.trim().split(/\s+/).filter((word) => word.length > 0);
    const trimmed = q.trim();
    // OR: "term1 | term2" or "phrase one | phrase two"
    if (trimmed.includes('|')) {
        const branches = [];
        for (const part of trimmed.split('|')) {
            const words = splitWords(part.replace(/"/g, ''));
            if (words.length > 0) {
                branches.push(words);
            }
        }
        return { kind: 'or', branches };
    }
    // Phrase: "exact phrase here"
    const phraseMatch = /^"(.+)"$/.exec(trimmed);
    if (phraseMatch !== null) {
        const inner = phraseMatch[1] ?? '';
        return { kind: 'phrase', tokens: splitWords(inner), raw: inner };
    }
    return { kind: 'simple', tokens: splitWords(trimmed) };
}
|
|
72
|
-
/**
 * Build the query string handed to the WASM engine from parsed tokens.
 *
 * The WASM engine accepts up to 4 space-separated tokens (AND semantics),
 * so any tokens past the fourth are dropped.
 *
 * @param {string[]} tokens - Parsed query tokens.
 * @returns {string} At most four tokens joined by single spaces.
 */
function tokensToWasmQuery(tokens) {
    const maxTokens = 4;
    const accepted = tokens.slice(0, maxTokens);
    return accepted.join(' ');
}
|
|
79
51
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
80
|
-
//
|
|
52
|
+
// Query parsing (WASM-side as of 0.5.0)
|
|
81
53
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
54
|
+
//
|
|
55
|
+
// Pre-0.5.0 this file owned parseQuery + tokenize. That created two
|
|
56
|
+
// truths about what a "token" was: one in TS for the query, one in Rust
|
|
57
|
+
// for the indexed text. The audit flagged this as the biggest divergence
|
|
58
|
+
// in the wrapper.
|
|
59
|
+
//
|
|
60
|
+
// 0.5.0 moves parseQuery/tokenize/tokensToWasmQuery to Rust. The TS
|
|
61
|
+
// dispatcher reduces to:
|
|
62
|
+
//
|
|
63
|
+
// 1. Write the raw UTF-8 query bytes to the scratchpad.
|
|
64
|
+
// 2. Call prepareQuery(len). Get back the kind (simple/phrase/or).
|
|
65
|
+
// 3. For OR: iterate getQueryBranchCount() branches, calling
|
|
66
|
+
// selectQueryBranch(i) + search() for each, then merge in TS.
|
|
67
|
+
// For simple/phrase: selectQueryBranch(0) + search().
|
|
68
|
+
// 4. For phrase: post-filter the snippets with containsPhrase().
|
|
69
|
+
//
|
|
70
|
+
// containsPhrase stays in TS because it operates on snippet text already
|
|
71
|
+
// produced by the WASM, not on the query. It is not a tokenizer.
|
|
82
72
|
/**
|
|
83
|
-
* Returns true if `snippet` contains the phrase
|
|
84
|
-
* with at most `maxGap` characters between
|
|
85
|
-
* Comparison is case- and accent-insensitive.
|
|
73
|
+
* Phrase post-filter. Returns true if `snippet` contains the phrase
|
|
74
|
+
* formed by `tokens` in order, with at most `maxGap` characters between
|
|
75
|
+
* consecutive tokens. Comparison is case- and accent-insensitive.
|
|
76
|
+
*
|
|
77
|
+
* The tokens come from the WASM-compiled pattern of a phrase branch,
|
|
78
|
+
* not from a TS re-tokenization of the query, so there is no
|
|
79
|
+
* tokenization divergence: WASM said "these are the tokens", we just
|
|
80
|
+
* check adjacency in the snippet.
|
|
86
81
|
*/
|
|
87
82
|
function containsPhrase(snippet, tokens, maxGap = 30) {
|
|
88
83
|
const norm = (s) => s.toLowerCase().normalize('NFKD').replace(/[̀-ͯ]/g, '');
|
|
@@ -231,32 +226,11 @@ function computePatternBloom(query) {
|
|
|
231
226
|
}
|
|
232
227
|
return bits;
|
|
233
228
|
}
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
// FNV prime: 0x100000001b3 = (0x100 << 32) | 0x000001b3
|
|
240
|
-
for (let i = 0; i < bytes.length; i++) {
|
|
241
|
-
lo ^= bytes[i];
|
|
242
|
-
// multiply by FNV prime
|
|
243
|
-
// (hi:lo) *= 0x100000001b3
|
|
244
|
-
// low * prime
|
|
245
|
-
const lo_lo = (lo & 0xffff) * 0x1b3;
|
|
246
|
-
const lo_hi = (lo >>> 16) * 0x1b3;
|
|
247
|
-
let new_lo = (lo_lo + ((lo_hi & 0xffff) << 16)) | 0;
|
|
248
|
-
let carry = (lo_hi >>> 16) + ((lo_lo + ((lo_hi & 0xffff) << 16)) > 0xffffffff ? 1 : 0);
|
|
249
|
-
// hi*prime + carry
|
|
250
|
-
const hi_lo = (hi & 0xffff) * 0x1b3;
|
|
251
|
-
const hi_hi = (hi >>> 16) * 0x1b3;
|
|
252
|
-
const new_hi = ((hi_lo + ((hi_hi & 0xffff) << 16)) | 0) + carry + lo; // + lo because high 33rd bit
|
|
253
|
-
lo = new_lo;
|
|
254
|
-
hi = new_hi | 0;
|
|
255
|
-
}
|
|
256
|
-
const hexHi = (hi >>> 0).toString(16).padStart(8, '0');
|
|
257
|
-
const hexLo = (lo >>> 0).toString(16).padStart(8, '0');
|
|
258
|
-
return hexHi + hexLo;
|
|
259
|
-
}
|
|
229
|
+
// Note: `contentHash` is implemented as a method on AlbexEngine below
|
|
230
|
+
// (it needs access to the WASM scratchpad). The standalone TS reference
|
|
231
|
+
// implementation that used to live here was removed in 0.4.0 — the
|
|
232
|
+
// canonical hash now lives in wasm/src/lib.rs::hashBytes so there is
|
|
233
|
+
// exactly one definition of "the content hash of these bytes".
|
|
260
234
|
/**
|
|
261
235
|
* 16-hex-char content hash → 8 raw bytes for setDocumentContentHash. The
|
|
262
236
|
* byte order matches the snapshot format: the high 32 bits sit at offsets
|
|
@@ -450,11 +424,18 @@ function makePdfWasmImports(module, getPdfMem) {
|
|
|
450
424
|
case '__wbindgen_externref_table_set_null':
|
|
451
425
|
return (idx) => { heap[idx] = undefined; };
|
|
452
426
|
}
|
|
453
|
-
// Unknown import —
|
|
454
|
-
//
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
427
|
+
// Unknown import — fail fast. An import we don't recognise means the
|
|
428
|
+
// wasm-bindgen / lopdf / getrandom dependency graph has drifted from
|
|
429
|
+
// the prefixes this loader is written to satisfy. Accepting the
|
|
430
|
+
// module would defer the failure to an arbitrary execution path,
|
|
431
|
+
// typically deep inside extractPdf(), where the user gets either a
|
|
432
|
+
// hang or a misleading "PDF parse error". Refusing instantiation
|
|
433
|
+
// surfaces the version skew at boot, where the maintainer can act
|
|
434
|
+
// on it.
|
|
435
|
+
throw new AlbexInitError(`Unknown PDF WASM import "${modName}.${name}". ` +
|
|
436
|
+
`The albex_pdf.wasm binary was probably built with a newer Rust ` +
|
|
437
|
+
`toolchain or dependency graph than this loader was written for. ` +
|
|
438
|
+
`Rebuild with 'npm run build:pdf-wasm' or open an issue.`);
|
|
458
439
|
};
|
|
459
440
|
const imports = {};
|
|
460
441
|
for (const { module: modName, name } of required) {
|
|
@@ -474,27 +455,29 @@ export class AlbexEngine {
|
|
|
474
455
|
* runtime dependency on OCR — this is a structural slot that the optional
|
|
475
456
|
* companion package fills.
|
|
476
457
|
*/
|
|
477
|
-
ocrImage;
|
|
478
458
|
/**
|
|
479
|
-
*
|
|
480
|
-
*
|
|
481
|
-
*
|
|
482
|
-
*
|
|
483
|
-
* labels).
|
|
484
|
-
*
|
|
485
|
-
* When `alwaysExtractEmbeddedImages` is true, every page of every PDF
|
|
486
|
-
* passes through `extractPageImages` after the normal text extraction;
|
|
487
|
-
* any image that meets the size filter (200×200 in Rust) is fed to
|
|
488
|
-
* `ocrImage`. Performance cost: 1–3 s per qualifying image.
|
|
489
|
-
*
|
|
490
|
-
* Off by default — set this opt-in via the OCR module's options.
|
|
459
|
+
* Public OCR entry point. Forwards to the attached OCR adapter installed
|
|
460
|
+
* via `attachOcr()`. Reading this property is a feature-detect for
|
|
461
|
+
* integrators: `if (engine.ocrImage) { ... OCR available ... }`. Writing
|
|
462
|
+
* to it directly is no longer supported in 0.5.0+ — use `attachOcr`.
|
|
491
463
|
*/
|
|
492
|
-
|
|
464
|
+
get ocrImage() {
|
|
465
|
+
return this._ocrAdapter?.recognize;
|
|
466
|
+
}
|
|
467
|
+
/** Private adapter slot. Holds the OCR plugin contract installed by
|
|
468
|
+
* `attachOcr()`. The engine reads `recognize` and `options` here; the
|
|
469
|
+
* caller never gets a reference to this object directly. */
|
|
470
|
+
_ocrAdapter = null;
|
|
493
471
|
// ── PDF WASM (lazy) ──
|
|
494
472
|
_pdfWasm = null;
|
|
495
473
|
_pdfMem = null;
|
|
496
474
|
_docs = [];
|
|
497
475
|
_lastSearch = null;
|
|
476
|
+
/** Structured diagnostics collected during the most recent operation.
|
|
477
|
+
* Drained by `takeDiagnostics()`. Capped at 256 entries to avoid
|
|
478
|
+
* unbounded memory growth in pathological cases (very corrupted
|
|
479
|
+
* corpora producing thousands of recovery warnings). */
|
|
480
|
+
_diagnostics = [];
|
|
498
481
|
_tier = null;
|
|
499
482
|
_simd = false;
|
|
500
483
|
_profile = null;
|
|
@@ -503,9 +486,51 @@ export class AlbexEngine {
|
|
|
503
486
|
_gpuChunkCountUploaded = 0;
|
|
504
487
|
_unsubscribeResources = null;
|
|
505
488
|
_opts;
|
|
489
|
+
// ── Concurrency guard ──────────────────────────────────────────────────────
|
|
490
|
+
// One WASM instance, global mutable state, async ops that yield to the
|
|
491
|
+
// scheduler between slices. Two overlapping operations corrupt each other
|
|
492
|
+
// (e.g. a fresh searchBegin resets the cursor of an in-flight cooperative
|
|
493
|
+
// search). Async ops serialize through `_opChain`; sync mutators/searches
|
|
494
|
+
// assert the engine is idle (audit 0.6.0, finding #2).
|
|
495
|
+
_opChain = Promise.resolve();
|
|
496
|
+
_busy = false;
|
|
506
497
|
constructor(opts) {
|
|
507
498
|
this._opts = opts;
|
|
508
499
|
}
|
|
500
|
+
/** Serialize an async engine operation behind any in-flight one. */
|
|
501
|
+
_exclusive(fn) {
|
|
502
|
+
const run = this._opChain.then(async () => {
|
|
503
|
+
this._busy = true;
|
|
504
|
+
try {
|
|
505
|
+
return await fn();
|
|
506
|
+
}
|
|
507
|
+
finally {
|
|
508
|
+
this._busy = false;
|
|
509
|
+
}
|
|
510
|
+
});
|
|
511
|
+
// Swallow result/error on the chain so one failure can't wedge the queue.
|
|
512
|
+
this._opChain = run.then(() => undefined, () => undefined);
|
|
513
|
+
return run;
|
|
514
|
+
}
|
|
515
|
+
/** Guard a synchronous mutator/search: refuse to run mid-async-operation
|
|
516
|
+
* rather than silently corrupt the shared WASM state. */
|
|
517
|
+
_assertIdle(method) {
|
|
518
|
+
if (this._busy) {
|
|
519
|
+
throw new AlbexError('busy', `${method}() was called while an async engine operation is still ` +
|
|
520
|
+
`running. Await the previous indexFile/save/load/replaceDocument/` +
|
|
521
|
+
`searchCooperative call, or use searchCooperative instead of search().`);
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
/** Compact opportunistically when tombstones pile up under text pressure,
|
|
525
|
+
* so repeated removeDocument/replaceDocument don't exhaust the pool. */
|
|
526
|
+
_autoCompactIfNeeded() {
|
|
527
|
+
const w = this._wasm;
|
|
528
|
+
const cap = w.getTextCapacity();
|
|
529
|
+
const hasTombstones = w.getDocCount() > this._docs.length;
|
|
530
|
+
if (hasTombstones && cap > 0 && w.getTextUsed() / cap > 0.85) {
|
|
531
|
+
w.compact();
|
|
532
|
+
}
|
|
533
|
+
}
|
|
509
534
|
/** Load and initialise the main WASM module. Must be called before any other method. */
|
|
510
535
|
async init() {
|
|
511
536
|
const url = await this._resolveWasmUrl();
|
|
@@ -562,28 +587,26 @@ export class AlbexEngine {
|
|
|
562
587
|
// as an asset reference. They copy the .wasm to the output directory and
|
|
563
588
|
// rewrite the URL automatically. Consumers who use one of those bundlers
|
|
564
589
|
// get a working `new AlbexEngine()` with no manual setup.
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
this._simd = false;
|
|
570
|
-
return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
|
|
571
|
-
}
|
|
572
|
-
let tier;
|
|
573
|
-
if (o.tier && o.tier !== 'auto')
|
|
574
|
-
tier = o.tier;
|
|
575
|
-
else
|
|
576
|
-
tier = pickTier(profile);
|
|
577
|
-
this._tier = tier;
|
|
590
|
+
// 0.5.0+: two main binaries only — baseline and SIMD. The tier
|
|
591
|
+
// system is gone (audit 4.1). Selection collapses to a single
|
|
592
|
+
// boolean: SIMD on or off, decided either by the explicit `simd`
|
|
593
|
+
// option or by a runtime probe.
|
|
578
594
|
const simd = o.simd === 'on'
|
|
579
595
|
? true
|
|
580
596
|
: o.simd === 'off'
|
|
581
597
|
? false
|
|
582
598
|
: !!profile?.wasm.simd;
|
|
583
599
|
this._simd = simd;
|
|
584
|
-
|
|
600
|
+
this._tier = 'std';
|
|
601
|
+
if (!o.wasmBaseUrl) {
|
|
602
|
+
// Zero-config: bundler resolves the .wasm next to dist/. We only
|
|
603
|
+
// ship the baseline alias (albex_wasm_bg.wasm) inside the npm
|
|
604
|
+
// package; integrators who want SIMD must serve both binaries
|
|
605
|
+
// themselves via `wasmBaseUrl`.
|
|
606
|
+
return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
|
|
607
|
+
}
|
|
585
608
|
const base = o.wasmBaseUrl.replace(/\/+$/, '');
|
|
586
|
-
return `${base}/
|
|
609
|
+
return simd ? `${base}/albex_wasm_simd.wasm` : `${base}/albex_wasm.wasm`;
|
|
587
610
|
}
|
|
588
611
|
/** The tier that was actually loaded. `null` until `init()` resolves. */
|
|
589
612
|
get tier() { return this._tier; }
|
|
@@ -684,6 +707,34 @@ export class AlbexEngine {
|
|
|
684
707
|
this._wasm.feedText(c.length);
|
|
685
708
|
}
|
|
686
709
|
}
|
|
710
|
+
/**
|
|
711
|
+
* Compute the FNV-1a 64-bit content hash of `bytes` via the WASM
|
|
712
|
+
* streaming API. Returns a 16-character hex string identical in shape
|
|
713
|
+
* to what the TS implementation in 0.3.x returned, so all callers
|
|
714
|
+
* stay unchanged. Single source of truth — same hash whether we use
|
|
715
|
+
* it for indexFile dedup, for snapshot v2 persistence, or anywhere
|
|
716
|
+
* else. Large inputs are chunked at FEED_SIZE just like _feedText.
|
|
717
|
+
*/
|
|
718
|
+
_contentHash(bytes) {
|
|
719
|
+
const w = this._wasm;
|
|
720
|
+
w.hashBegin();
|
|
721
|
+
for (let i = 0; i < bytes.length; i += FEED_SIZE) {
|
|
722
|
+
const c = bytes.subarray(i, i + FEED_SIZE);
|
|
723
|
+
this._writePad(c);
|
|
724
|
+
w.hashFeed(c.length);
|
|
725
|
+
}
|
|
726
|
+
w.hashFinish();
|
|
727
|
+
// Read 8 result bytes back from scratchpad[0..8].
|
|
728
|
+
const ptr = w.getBuffer(8);
|
|
729
|
+
const out = this._u8(ptr, 8);
|
|
730
|
+
// Big-endian to hex. Same layout as the old hexHi + hexLo output:
|
|
731
|
+
// high u32 first (4 bytes), low u32 second (4 bytes).
|
|
732
|
+
let s = '';
|
|
733
|
+
for (let i = 0; i < 8; i++) {
|
|
734
|
+
s += out[i].toString(16).padStart(2, '0');
|
|
735
|
+
}
|
|
736
|
+
return s;
|
|
737
|
+
}
|
|
687
738
|
_feedXmlBytes(xml, fn) {
|
|
688
739
|
const feeder = this._wasm[fn];
|
|
689
740
|
for (let i = 0; i < xml.length; i += FEED_SIZE) {
|
|
@@ -706,7 +757,10 @@ export class AlbexEngine {
|
|
|
706
757
|
// called when the user actually drops a PDF — but we issue a console
|
|
707
758
|
// hint so embedders can surface a "this will download ~1 MB" prompt.
|
|
708
759
|
if (this._resources?.constrainedNetwork) {
|
|
709
|
-
|
|
760
|
+
this._diag({
|
|
761
|
+
kind: 'info', stage: 'network',
|
|
762
|
+
message: 'Downloading PDF WASM (~1 MB) on a constrained network connection',
|
|
763
|
+
});
|
|
710
764
|
}
|
|
711
765
|
const res = await fetch(pdfUrl);
|
|
712
766
|
if (!res.ok)
|
|
@@ -831,20 +885,14 @@ export class AlbexEngine {
|
|
|
831
885
|
this._feedText(text);
|
|
832
886
|
this._wasm.flushParagraph();
|
|
833
887
|
}
|
|
834
|
-
// Hybrid OCR pass: when the OCR
|
|
835
|
-
// `alwaysExtractEmbeddedImages: true`, also walk every page
|
|
836
|
-
// embedded images and OCR them on top of the vector text.
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
const hasOcr = !!this.ocrImage;
|
|
842
|
-
const binSupportsImages = typeof pw.extractPageImages === 'function'
|
|
843
|
-
&& typeof pw.getPageCount === 'function';
|
|
844
|
-
console.log(`[albex] hybrid OCR decision: ocrImage=${hasOcr} ocrConfig.alwaysExtractEmbeddedImages=${hybridOn} binarySupportsImages=${binSupportsImages}`);
|
|
845
|
-
if (hasOcr && hybridOn && binSupportsImages) {
|
|
888
|
+
// Hybrid OCR pass: when the OCR adapter is wired with
|
|
889
|
+
// `options.alwaysExtractEmbeddedImages: true`, also walk every page
|
|
890
|
+
// for embedded images and OCR them on top of the vector text.
|
|
891
|
+
if (this._ocrAdapter
|
|
892
|
+
&& this._ocrAdapter.options?.alwaysExtractEmbeddedImages
|
|
893
|
+
&& typeof pw.extractPageImages === 'function'
|
|
894
|
+
&& typeof pw.getPageCount === 'function') {
|
|
846
895
|
const totalPages = pw.getPageCount();
|
|
847
|
-
console.log(`[albex] hybrid OCR pass starting over ${totalPages} page(s)`);
|
|
848
896
|
for (let p = 0; p < totalPages; p++) {
|
|
849
897
|
const ocrText = await this._ocrPageEmbeddedImages(pw, p);
|
|
850
898
|
if (ocrText === null)
|
|
@@ -930,7 +978,10 @@ export class AlbexEngine {
|
|
|
930
978
|
// so `_ensurePdfWasm` re-instantiates on the next call.
|
|
931
979
|
this._pdfWasm = null;
|
|
932
980
|
this._pdfMem = null;
|
|
933
|
-
|
|
981
|
+
this._diag({
|
|
982
|
+
kind: 'skipped', stage: 'pdf', page: page + 1,
|
|
983
|
+
message: `PDF image extractor trapped: ${e instanceof Error ? e.message : String(e)}. Remaining pages skipped.`,
|
|
984
|
+
});
|
|
934
985
|
return null;
|
|
935
986
|
}
|
|
936
987
|
if (imageCount <= 0)
|
|
@@ -954,15 +1005,6 @@ export class AlbexEngine {
|
|
|
954
1005
|
const copy = new Uint8Array(len);
|
|
955
1006
|
copy.set(new Uint8Array(liveMem.buffer, ptr, len));
|
|
956
1007
|
const blob = new Blob([copy.buffer], { type: mime });
|
|
957
|
-
// Defensive diagnostics: when an OCR call goes wrong (Tesseract
|
|
958
|
-
// worker abort, malformed JPEG, etc.) the first thing we want to
|
|
959
|
-
// see is whether we even handed it valid image bytes. A real JPEG
|
|
960
|
-
// starts with FF D8 FF (E0 for JFIF, E1 for EXIF). A JPEG2000
|
|
961
|
-
// starts with 00 00 00 0C 6A 50 20 20.
|
|
962
|
-
const magic = Array.from(copy.subarray(0, 4))
|
|
963
|
-
.map(b => b.toString(16).padStart(2, '0'))
|
|
964
|
-
.join(' ');
|
|
965
|
-
console.log(`[albex] OCR page ${page + 1} image ${i + 1}/${imageCount}: kind=${kind} (${mime}) len=${len} bytes magic=${magic}`);
|
|
966
1008
|
try {
|
|
967
1009
|
const { text } = await ocr(blob);
|
|
968
1010
|
const trimmed = text?.trim();
|
|
@@ -977,7 +1019,10 @@ export class AlbexEngine {
|
|
|
977
1019
|
// "Aborted(-1)") are also caught here; if they bypass the
|
|
978
1020
|
// promise rejection and surface as `uncaught` instead, the
|
|
979
1021
|
// demo's window.onerror handler will keep the app alive.
|
|
980
|
-
|
|
1022
|
+
this._diag({
|
|
1023
|
+
kind: 'skipped', stage: 'ocr', page: page + 1,
|
|
1024
|
+
message: `OCR failed on image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`,
|
|
1025
|
+
});
|
|
981
1026
|
}
|
|
982
1027
|
}
|
|
983
1028
|
return pageText;
|
|
@@ -1018,7 +1063,10 @@ export class AlbexEngine {
|
|
|
1018
1063
|
new Uint8Array(pw.memory.buffer, inPtr, bytes.length).set(bytes);
|
|
1019
1064
|
}
|
|
1020
1065
|
catch (e) {
|
|
1021
|
-
|
|
1066
|
+
this._diag({
|
|
1067
|
+
kind: 'skipped', stage: 'pdf',
|
|
1068
|
+
message: `PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`,
|
|
1069
|
+
});
|
|
1022
1070
|
return null;
|
|
1023
1071
|
}
|
|
1024
1072
|
// Set up the doc and let _indexPdfScanned do the page-by-page walk.
|
|
@@ -1027,7 +1075,10 @@ export class AlbexEngine {
|
|
|
1027
1075
|
// first page, no paragraphs are emitted and we end up with 0 chunks.
|
|
1028
1076
|
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
1029
1077
|
this._wasm.beginDocument();
|
|
1030
|
-
|
|
1078
|
+
this._diag({
|
|
1079
|
+
kind: 'fallback', stage: 'pdf', file: file.name,
|
|
1080
|
+
message: `pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf`,
|
|
1081
|
+
});
|
|
1031
1082
|
await this._indexPdfScanned(pw);
|
|
1032
1083
|
return this._wasm.endDocument();
|
|
1033
1084
|
}
|
|
@@ -1487,6 +1538,9 @@ export class AlbexEngine {
|
|
|
1487
1538
|
* Throws for unsupported formats or parse errors.
|
|
1488
1539
|
*/
|
|
1489
1540
|
async indexFile(file) {
|
|
1541
|
+
return this._exclusive(() => this._indexFileInner(file));
|
|
1542
|
+
}
|
|
1543
|
+
async _indexFileInner(file) {
|
|
1490
1544
|
const ext = file.name.split('.').pop()?.toLowerCase() ?? '';
|
|
1491
1545
|
const indexer = AlbexEngine._INDEXERS[ext];
|
|
1492
1546
|
if (!indexer)
|
|
@@ -1494,7 +1548,7 @@ export class AlbexEngine {
|
|
|
1494
1548
|
// Hash the source bytes for idempotency. We always read the bytes once
|
|
1495
1549
|
// here so the indexer can reuse them — avoids a double File.arrayBuffer().
|
|
1496
1550
|
const bytes = new Uint8Array(await file.arrayBuffer());
|
|
1497
|
-
const hash =
|
|
1551
|
+
const hash = this._contentHash(bytes);
|
|
1498
1552
|
// Idempotency: if a non-deleted doc already has this hash, return it
|
|
1499
1553
|
// unchanged. Cheap O(N) scan since MAX_DOCS = 128.
|
|
1500
1554
|
const existing = this._docs.find(d => d.contentHash === hash);
|
|
@@ -1516,6 +1570,24 @@ export class AlbexEngine {
|
|
|
1516
1570
|
w.setDocumentContentHash(hashBytes.length);
|
|
1517
1571
|
}
|
|
1518
1572
|
const chunks = await indexer(this, file, bytes);
|
|
1573
|
+
// Capacity check (0.6.0). The WASM pools fill silently and break out of
|
|
1574
|
+
// their ingest loops; getLastIndexOverflow reports which one filled.
|
|
1575
|
+
// Surface a typed error instead of returning a half-indexed document the
|
|
1576
|
+
// caller cannot tell apart from a complete one (audit finding #3).
|
|
1577
|
+
const overflow = w.getLastIndexOverflow();
|
|
1578
|
+
if (overflow !== 0) {
|
|
1579
|
+
const which = (overflow & 1) ? 'chunks' : (overflow & 2) ? 'text'
|
|
1580
|
+
: (overflow & 4) ? 'docs' : 'names';
|
|
1581
|
+
const pools = [
|
|
1582
|
+
overflow & 1 ? 'chunk pool' : '',
|
|
1583
|
+
overflow & 2 ? 'text pool' : '',
|
|
1584
|
+
overflow & 4 ? 'document table' : '',
|
|
1585
|
+
overflow & 8 ? 'name pool' : '',
|
|
1586
|
+
].filter(Boolean).join(', ');
|
|
1587
|
+
throw new AlbexCapacityError(`Index capacity exceeded while indexing "${file.name}" (${pools} full). ` +
|
|
1588
|
+
`The document was rolled back (not indexed); treat the index as full ` +
|
|
1589
|
+
`(compact(), shard across an AlbexPool, or reset()).`, which);
|
|
1590
|
+
}
|
|
1519
1591
|
// The new doc occupies slot `docCountBefore`.
|
|
1520
1592
|
const docId = w.getDocId(docCountBefore);
|
|
1521
1593
|
const doc = {
|
|
@@ -1538,6 +1610,10 @@ export class AlbexEngine {
|
|
|
1538
1610
|
* Returns `true` if a matching document was found and tombstoned.
|
|
1539
1611
|
*/
|
|
1540
1612
|
removeDocument(id) {
|
|
1613
|
+
this._assertIdle('removeDocument');
|
|
1614
|
+
return this._removeDocumentInner(id);
|
|
1615
|
+
}
|
|
1616
|
+
_removeDocumentInner(id) {
|
|
1541
1617
|
const doc = this._docs.find(d => d.name === id || d.contentHash === id);
|
|
1542
1618
|
if (!doc)
|
|
1543
1619
|
return false;
|
|
@@ -1553,12 +1629,15 @@ export class AlbexEngine {
|
|
|
1553
1629
|
* idempotency check (so re-indexing the *same* bytes after a remove works).
|
|
1554
1630
|
*/
|
|
1555
1631
|
async replaceDocument(name, newFile) {
|
|
1556
|
-
this.
|
|
1557
|
-
|
|
1558
|
-
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1632
|
+
return this._exclusive(async () => {
|
|
1633
|
+
this._removeDocumentInner(name);
|
|
1634
|
+
// Index directly via the inner path (we already hold the lock).
|
|
1635
|
+
const doc = await this._indexFileInner(newFile);
|
|
1636
|
+
// Repeated replaces leave tombstones in the text pool; reclaim under
|
|
1637
|
+
// pressure so the pool isn't silently exhausted (audit finding #7).
|
|
1638
|
+
this._autoCompactIfNeeded();
|
|
1639
|
+
return doc;
|
|
1640
|
+
});
|
|
1562
1641
|
}
|
|
1563
1642
|
/**
|
|
1564
1643
|
* Reclaim storage from previously removed documents. Compacts CHUNKS,
|
|
@@ -1568,6 +1647,7 @@ export class AlbexEngine {
|
|
|
1568
1647
|
* references (e.g. in a UI) remain valid.
|
|
1569
1648
|
*/
|
|
1570
1649
|
compact() {
|
|
1650
|
+
this._assertIdle('compact');
|
|
1571
1651
|
this._wasm.compact();
|
|
1572
1652
|
}
|
|
1573
1653
|
/**
|
|
@@ -1580,15 +1660,34 @@ export class AlbexEngine {
|
|
|
1580
1660
|
* markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
|
|
1581
1661
|
*/
|
|
1582
1662
|
search(query, opts = {}) {
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
|
|
1663
|
+
this._assertIdle('search');
|
|
1664
|
+
const w = this._wasm;
|
|
1665
|
+
const ql = this._writeStr(query);
|
|
1666
|
+
const kind = w.prepareQuery(ql);
|
|
1667
|
+
if (kind < 0)
|
|
1668
|
+
return [];
|
|
1669
|
+
if (kind === 2) {
|
|
1670
|
+
// OR: iterate branches and merge in TS. WASM stores compiled
|
|
1671
|
+
// branches internally so we never re-tokenize on the host.
|
|
1672
|
+
return this._searchOr(query, opts);
|
|
1673
|
+
}
|
|
1674
|
+
w.selectQueryBranch(0);
|
|
1675
|
+
// Phrase queries (kind 1) post-filter on adjacency. Pass the tokens down
|
|
1676
|
+
// so the check runs against the FULL chunk text, not a cropped windowed
|
|
1677
|
+
// snippet — otherwise `{ windowed: true }` could drop a valid phrase hit
|
|
1678
|
+
// whose second term fell outside the window (audit finding #7).
|
|
1679
|
+
const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
|
|
1680
|
+
return this._runSearch(query, opts, phraseTokens);
|
|
1681
|
+
}
|
|
1682
|
+
/** Read the WASM-compiled tokens of branch `i` for phrase post-filter.
|
|
1683
|
+
* The bytes returned are exactly what the WASM tokenizer produced —
|
|
1684
|
+
* no TS re-tokenization. */
|
|
1685
|
+
_branchTokens(i) {
|
|
1686
|
+
const n = this._wasm.getQueryBranchPattern(i);
|
|
1687
|
+
if (n === 0)
|
|
1688
|
+
return [];
|
|
1689
|
+
const pattern = this._readPad(n);
|
|
1690
|
+
return pattern.split(' ').filter(t => t.length > 0);
|
|
1592
1691
|
}
|
|
1593
1692
|
/**
|
|
1594
1693
|
* Cooperative search. Processes the corpus in slices, yielding to the
|
|
@@ -1605,18 +1704,29 @@ export class AlbexEngine {
|
|
|
1605
1704
|
* Pass `opts.frameBudgetMs` to control the slice size (default 8 ms).
|
|
1606
1705
|
*/
|
|
1607
1706
|
async *searchCooperative(query, opts = {}) {
|
|
1608
|
-
|
|
1707
|
+
// Collect under the exclusivity lock so no other engine op interleaves at
|
|
1708
|
+
// a slice boundary; the per-slice scheduler yields still happen inside.
|
|
1709
|
+
const results = await this._exclusive(() => this._searchCooperativeCollect(query, opts));
|
|
1710
|
+
for (const r of results)
|
|
1711
|
+
yield r;
|
|
1712
|
+
}
|
|
1713
|
+
/** Materialise a cooperative search to a sorted result array. Runs inside
|
|
1714
|
+
* the exclusivity lock. Frame-budget yielding lives in _runSearchBudgeted. */
|
|
1715
|
+
async _searchCooperativeCollect(query, opts) {
|
|
1609
1716
|
const budget = opts.frameBudgetMs ?? 8;
|
|
1610
1717
|
const w = this._wasm;
|
|
1611
|
-
|
|
1612
|
-
|
|
1718
|
+
const ql = this._writeStr(query);
|
|
1719
|
+
const kind = w.prepareQuery(ql);
|
|
1720
|
+
if (kind < 0)
|
|
1721
|
+
return [];
|
|
1722
|
+
if (kind === 2) {
|
|
1723
|
+
// OR branches — run each as its own resumable search and merge.
|
|
1613
1724
|
const seen = new Set();
|
|
1614
1725
|
const all = [];
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
const r = await this._runSearchBudgeted(q, query, opts, budget);
|
|
1726
|
+
const n = w.getQueryBranchCount();
|
|
1727
|
+
for (let i = 0; i < n; i++) {
|
|
1728
|
+
w.selectQueryBranch(i);
|
|
1729
|
+
const r = await this._runSearchBudgeted(query, opts, budget, undefined, i);
|
|
1620
1730
|
for (const x of r) {
|
|
1621
1731
|
const key = `${x.documentName}:${x.location}:${x.matchStart}`;
|
|
1622
1732
|
if (!seen.has(key)) {
|
|
@@ -1626,17 +1736,11 @@ export class AlbexEngine {
|
|
|
1626
1736
|
}
|
|
1627
1737
|
}
|
|
1628
1738
|
all.sort((a, b) => b.score - a.score);
|
|
1629
|
-
|
|
1630
|
-
yield r;
|
|
1631
|
-
return;
|
|
1739
|
+
return all;
|
|
1632
1740
|
}
|
|
1633
|
-
|
|
1634
|
-
const
|
|
1635
|
-
|
|
1636
|
-
: results;
|
|
1637
|
-
for (const r of filtered)
|
|
1638
|
-
yield r;
|
|
1639
|
-
void w;
|
|
1741
|
+
w.selectQueryBranch(0);
|
|
1742
|
+
const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
|
|
1743
|
+
return this._runSearchBudgeted(query, opts, budget, phraseTokens, 0);
|
|
1640
1744
|
}
|
|
1641
1745
|
/**
|
|
1642
1746
|
* @deprecated Renamed to `searchCooperative` in 0.3.0. The original name
|
|
@@ -1657,21 +1761,28 @@ export class AlbexEngine {
|
|
|
1657
1761
|
* JS<->WASM overhead on fast machines; on slow machines a single batch
|
|
1658
1762
|
* may eat the entire budget, which is also fine.
|
|
1659
1763
|
*/
|
|
1660
|
-
async _runSearchBudgeted(
|
|
1764
|
+
async _runSearchBudgeted(displayQuery, opts, budgetMs, phraseTokens, branchIdx = 0) {
|
|
1661
1765
|
const w = this._wasm;
|
|
1662
|
-
|
|
1663
|
-
|
|
1766
|
+
// Pattern is already set by the caller via selectQueryBranch(branchIdx).
|
|
1767
|
+
// Snapshot THAT branch's compiled pattern for the GPU pre-filter hash —
|
|
1768
|
+
// not branch 0, which would build the wrong candidate mask for OR
|
|
1769
|
+
// branches and silently drop their hits (audit finding #6).
|
|
1770
|
+
const activePatternLen = w.getQueryBranchPattern(branchIdx);
|
|
1771
|
+
const activePattern = activePatternLen > 0 ? this._readPad(activePatternLen) : '';
|
|
1664
1772
|
// GPU pre-filter (CD1). If enabled AND the corpus is large enough,
|
|
1665
1773
|
// the GPU computes the candidate bitset and we install it into WASM
|
|
1666
1774
|
// before searchBegin so the slice loop only inspects candidates.
|
|
1667
1775
|
// Failure here is silent: we fall back to CPU-only Bloom transparently.
|
|
1668
1776
|
if (this._shouldEngageGpu()) {
|
|
1669
1777
|
try {
|
|
1670
|
-
await this._gpuPreFilter(
|
|
1778
|
+
await this._gpuPreFilter(activePattern);
|
|
1671
1779
|
}
|
|
1672
1780
|
catch (e) {
|
|
1673
1781
|
// Don't let a GPU hiccup kill the search — drop to CPU path.
|
|
1674
|
-
|
|
1782
|
+
this._diag({
|
|
1783
|
+
kind: 'fallback', stage: 'gpu',
|
|
1784
|
+
message: `GPU pre-filter failed; falling back to CPU: ${e instanceof Error ? e.message : String(e)}`,
|
|
1785
|
+
});
|
|
1675
1786
|
w.clearCandidateMask();
|
|
1676
1787
|
}
|
|
1677
1788
|
}
|
|
@@ -1719,16 +1830,28 @@ export class AlbexEngine {
|
|
|
1719
1830
|
bloomPassed: w.getStatBloomPassed(),
|
|
1720
1831
|
bitapMatched: w.getStatBitapMatched(),
|
|
1721
1832
|
};
|
|
1722
|
-
return this._collectResults(count, opts);
|
|
1833
|
+
return this._collectResults(count, opts, phraseTokens);
|
|
1723
1834
|
}
|
|
1724
|
-
/** Materialise results [0..count) into the public SearchResult shape.
|
|
1725
|
-
|
|
1835
|
+
/** Materialise results [0..count) into the public SearchResult shape.
|
|
1836
|
+
* When `phraseTokens` is given, each result is kept only if those tokens
|
|
1837
|
+
* appear adjacently in the FULL chunk text — independent of any display
|
|
1838
|
+
* windowing — so phrase queries stay correct under `{ windowed: true }`. */
|
|
1839
|
+
_collectResults(count, opts, phraseTokens) {
|
|
1726
1840
|
const w = this._wasm;
|
|
1727
1841
|
const windowed = opts.windowed === true;
|
|
1728
1842
|
const before = opts.before ?? 60;
|
|
1729
1843
|
const after = opts.after ?? 120;
|
|
1844
|
+
const phraseFilter = phraseTokens && phraseTokens.length > 0 ? phraseTokens : null;
|
|
1730
1845
|
const results = [];
|
|
1731
1846
|
for (let i = 0; i < count; i++) {
|
|
1847
|
+
// Phrase adjacency check against the full chunk text (getSnippet), not
|
|
1848
|
+
// the possibly-cropped display window.
|
|
1849
|
+
if (phraseFilter) {
|
|
1850
|
+
const fl = w.getSnippet(i);
|
|
1851
|
+
const full = fl > 0 ? this._readPad(fl) : '';
|
|
1852
|
+
if (!containsPhrase(full, phraseFilter))
|
|
1853
|
+
continue;
|
|
1854
|
+
}
|
|
1732
1855
|
const score = w.getResultScore(i);
|
|
1733
1856
|
const location = w.getResultLocation(i);
|
|
1734
1857
|
const matchStart = w.getResultStart(i);
|
|
@@ -1775,14 +1898,18 @@ export class AlbexEngine {
|
|
|
1775
1898
|
}
|
|
1776
1899
|
return results;
|
|
1777
1900
|
}
|
|
1778
|
-
|
|
1901
|
+
/** Run all OR branches and merge dedup-by-(doc, location, match). The
|
|
1902
|
+
* branches are already compiled inside the WASM (by prepareQuery); we
|
|
1903
|
+
* iterate them with selectQueryBranch. The "rawQuery" param is kept
|
|
1904
|
+
* only for the lastSearch.query field. */
|
|
1905
|
+
_searchOr(rawQuery, opts) {
|
|
1906
|
+
const w = this._wasm;
|
|
1779
1907
|
const seen = new Set();
|
|
1780
1908
|
const all = [];
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
const results = this._runSearch(q, rawQuery, opts);
|
|
1909
|
+
const n = w.getQueryBranchCount();
|
|
1910
|
+
for (let i = 0; i < n; i++) {
|
|
1911
|
+
w.selectQueryBranch(i);
|
|
1912
|
+
const results = this._runSearch(rawQuery, opts);
|
|
1786
1913
|
for (const r of results) {
|
|
1787
1914
|
const key = `${r.documentName}:${r.location}:${r.matchStart}`;
|
|
1788
1915
|
if (!seen.has(key)) {
|
|
@@ -1791,14 +1918,14 @@ export class AlbexEngine {
|
|
|
1791
1918
|
}
|
|
1792
1919
|
}
|
|
1793
1920
|
}
|
|
1794
|
-
// Re-rank the merged list by score descending.
|
|
1795
1921
|
all.sort((a, b) => b.score - a.score);
|
|
1796
1922
|
return all;
|
|
1797
1923
|
}
|
|
1798
|
-
|
|
1924
|
+
/** Execute a single search using whichever query branch is currently
|
|
1925
|
+
* active (set via selectQueryBranch). Returns the materialised
|
|
1926
|
+
* SearchResult[]. Caller is responsible for activating a branch first. */
|
|
1927
|
+
_runSearch(displayQuery, opts, phraseTokens) {
|
|
1799
1928
|
const w = this._wasm;
|
|
1800
|
-
const ql = this._writeStr(wasmQuery);
|
|
1801
|
-
w.setPattern(ql);
|
|
1802
1929
|
const t0 = performance.now();
|
|
1803
1930
|
const count = w.search();
|
|
1804
1931
|
const ms = performance.now() - t0;
|
|
@@ -1810,59 +1937,7 @@ export class AlbexEngine {
|
|
|
1810
1937
|
bloomPassed: w.getStatBloomPassed(),
|
|
1811
1938
|
bitapMatched: w.getStatBitapMatched(),
|
|
1812
1939
|
};
|
|
1813
|
-
|
|
1814
|
-
const before = opts.before ?? 60;
|
|
1815
|
-
const after = opts.after ?? 120;
|
|
1816
|
-
const results = [];
|
|
1817
|
-
for (let i = 0; i < count; i++) {
|
|
1818
|
-
const score = w.getResultScore(i);
|
|
1819
|
-
const location = w.getResultLocation(i);
|
|
1820
|
-
const matchStart = w.getResultStart(i);
|
|
1821
|
-
const matchEnd = w.getResultEnd(i);
|
|
1822
|
-
const nl = w.getResultDocName(i);
|
|
1823
|
-
const name = nl > 0 ? this._readPad(nl) : '?';
|
|
1824
|
-
const matchCount = w.getResultMatchCount(i);
|
|
1825
|
-
const matches = [];
|
|
1826
|
-
for (let k = 0; k < matchCount; k++) {
|
|
1827
|
-
matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
|
|
1828
|
-
}
|
|
1829
|
-
if (matches.length === 0) {
|
|
1830
|
-
matches.push({ start: matchStart, end: matchEnd });
|
|
1831
|
-
}
|
|
1832
|
-
let snippet;
|
|
1833
|
-
let primaryStart = matchStart;
|
|
1834
|
-
let primaryEnd = matchEnd;
|
|
1835
|
-
let adjustedMatches = matches;
|
|
1836
|
-
if (windowed) {
|
|
1837
|
-
const sl = w.getSnippetWindow(i, before, after);
|
|
1838
|
-
snippet = sl > 0 ? this._readPad(sl) : '';
|
|
1839
|
-
const offset = w.getSnippetWindowOffset();
|
|
1840
|
-
// Spans came back chunk-relative; shift them into window-relative.
|
|
1841
|
-
// Account for leading "... " prefix when present.
|
|
1842
|
-
const leadingPrefix = offset > 0 ? 4 : 0;
|
|
1843
|
-
const shift = leadingPrefix - offset;
|
|
1844
|
-
adjustedMatches = matches.map(m => ({
|
|
1845
|
-
start: Math.max(0, m.start + shift),
|
|
1846
|
-
end: Math.max(0, m.end + shift),
|
|
1847
|
-
}));
|
|
1848
|
-
primaryStart = adjustedMatches[0]?.start ?? 0;
|
|
1849
|
-
primaryEnd = adjustedMatches[0]?.end ?? 0;
|
|
1850
|
-
}
|
|
1851
|
-
else {
|
|
1852
|
-
const sl = w.getSnippet(i);
|
|
1853
|
-
snippet = sl > 0 ? this._readPad(sl) : '';
|
|
1854
|
-
}
|
|
1855
|
-
results.push({
|
|
1856
|
-
documentName: name,
|
|
1857
|
-
location,
|
|
1858
|
-
score,
|
|
1859
|
-
snippet,
|
|
1860
|
-
matchStart: primaryStart,
|
|
1861
|
-
matchEnd: primaryEnd,
|
|
1862
|
-
matches: adjustedMatches,
|
|
1863
|
-
});
|
|
1864
|
-
}
|
|
1865
|
-
return results;
|
|
1940
|
+
return this._collectResults(count, opts, phraseTokens);
|
|
1866
1941
|
}
|
|
1867
1942
|
/** Returns current engine statistics. */
|
|
1868
1943
|
getStats() {
|
|
@@ -1914,9 +1989,87 @@ export class AlbexEngine {
|
|
|
1914
1989
|
}
|
|
1915
1990
|
/** Full reset — clears all indexed documents and chunks. */
|
|
1916
1991
|
reset() {
|
|
1992
|
+
this._assertIdle('reset');
|
|
1993
|
+
this._resetInner();
|
|
1994
|
+
}
|
|
1995
|
+
_resetInner() {
|
|
1917
1996
|
this._wasm.init();
|
|
1918
1997
|
this._docs = [];
|
|
1919
1998
|
this._lastSearch = null;
|
|
1999
|
+
this._diagnostics = [];
|
|
2000
|
+
}
|
|
2001
|
+
/**
|
|
2002
|
+
* Drain and return the diagnostics collected since the last call (or
|
|
2003
|
+
* since the engine was created). Use this to surface recoverable
|
|
2004
|
+
* issues to the caller after `indexFile`, `load`, or any other
|
|
2005
|
+
* operation that may run into a "best-effort" path.
|
|
2006
|
+
*
|
|
2007
|
+
* Example diagnostics:
|
|
2008
|
+
* - `{kind:'fallback', stage:'pdf', message:'pdf-extract crashed,
|
|
2009
|
+
* attempting OCR-only fallback', file:'invoice.pdf'}`
|
|
2010
|
+
* - `{kind:'skipped', stage:'ocr', message:'Tesseract abort on page
|
|
2011
|
+
* 3 image 1; remaining images on this page skipped', file:'...',
|
|
2012
|
+
* page:3}`
|
|
2013
|
+
* - `{kind:'fallback', stage:'gpu', message:'GPU pre-filter failed,
|
|
2014
|
+
* using CPU'}`
|
|
2015
|
+
*
|
|
2016
|
+
* The buffer is cleared on each call; callers should consume the
|
|
2017
|
+
* returned array immediately (e.g. log to their telemetry, surface
|
|
2018
|
+
* a UI banner). After `reset()` the buffer is also cleared.
|
|
2019
|
+
*/
|
|
2020
|
+
takeDiagnostics() {
|
|
2021
|
+
const out = this._diagnostics;
|
|
2022
|
+
this._diagnostics = [];
|
|
2023
|
+
return out;
|
|
2024
|
+
}
|
|
2025
|
+
/** Internal: record a diagnostic. Capped at 256 to bound memory. */
|
|
2026
|
+
_diag(entry) {
|
|
2027
|
+
if (this._diagnostics.length >= 256)
|
|
2028
|
+
return;
|
|
2029
|
+
this._diagnostics.push(entry);
|
|
2030
|
+
}
|
|
2031
|
+
/**
|
|
2032
|
+
* Install an OCR adapter. Returns a handle whose `dispose()` removes the
|
|
2033
|
+
* adapter from the engine.
|
|
2034
|
+
*
|
|
2035
|
+
* The contract: the adapter must provide `recognize(image, opts)` that
|
|
2036
|
+
* returns `Promise<OcrAttachedResult>`. The engine validates the
|
|
2037
|
+
* contract at attach time and refuses adapters that don't expose a
|
|
2038
|
+
* `recognize` function. Only one adapter can be attached at a time; a
|
|
2039
|
+
* second call to `attachOcr` while one is active throws — the caller
|
|
2040
|
+
* must dispose the previous one first.
|
|
2041
|
+
*
|
|
2042
|
+
* @example
|
|
2043
|
+
* ```ts
|
|
2044
|
+
* import { enableOcr } from '@albex/ocr';
|
|
2045
|
+
* const handle = enableOcr(engine); // internally calls attachOcr
|
|
2046
|
+
* // ... later ...
|
|
2047
|
+
* await handle.dispose();
|
|
2048
|
+
* ```
|
|
2049
|
+
*
|
|
2050
|
+
* Direct use without the companion package:
|
|
2051
|
+
* ```ts
|
|
2052
|
+
* const handle = engine.attachOcr({
|
|
2053
|
+
* recognize: async (blob) => myCustomOcr(blob),
|
|
2054
|
+
* options: { alwaysExtractEmbeddedImages: false },
|
|
2055
|
+
* });
|
|
2056
|
+
* ```
|
|
2057
|
+
*/
|
|
2058
|
+
attachOcr(adapter) {
|
|
2059
|
+
if (this._ocrAdapter) {
|
|
2060
|
+
throw new AlbexInitError('OCR adapter already attached. Call dispose() on the previous handle before attaching a new one.');
|
|
2061
|
+
}
|
|
2062
|
+
if (typeof adapter?.recognize !== 'function') {
|
|
2063
|
+
throw new AlbexInitError('attachOcr requires an adapter with a recognize(image, opts) function.');
|
|
2064
|
+
}
|
|
2065
|
+
this._ocrAdapter = adapter;
|
|
2066
|
+
return {
|
|
2067
|
+
dispose: async () => {
|
|
2068
|
+
// Idempotent: a double dispose is a no-op rather than a throw.
|
|
2069
|
+
if (this._ocrAdapter === adapter)
|
|
2070
|
+
this._ocrAdapter = null;
|
|
2071
|
+
},
|
|
2072
|
+
};
|
|
1920
2073
|
}
|
|
1921
2074
|
// ── Persistence ───────────────────────────────────────────────────────────
|
|
1922
2075
|
/**
|
|
@@ -1927,6 +2080,9 @@ export class AlbexEngine {
|
|
|
1927
2080
|
* state in roughly O(total bytes), bypassing re-parsing.
|
|
1928
2081
|
*/
|
|
1929
2082
|
async save(name) {
|
|
2083
|
+
return this._exclusive(() => this._saveInner(name));
|
|
2084
|
+
}
|
|
2085
|
+
async _saveInner(name) {
|
|
1930
2086
|
const w = this._wasm;
|
|
1931
2087
|
const total = w.snapshotSize();
|
|
1932
2088
|
if (total === 0) {
|
|
@@ -1953,6 +2109,9 @@ export class AlbexEngine {
|
|
|
1953
2109
|
* header (wrong magic, version, or struct sizes).
|
|
1954
2110
|
*/
|
|
1955
2111
|
async load(name) {
|
|
2112
|
+
return this._exclusive(() => this._loadInner(name));
|
|
2113
|
+
}
|
|
2114
|
+
async _loadInner(name) {
|
|
1956
2115
|
const bytes = await loadPersisted(name);
|
|
1957
2116
|
if (!bytes || bytes.length === 0)
|
|
1958
2117
|
return false;
|
|
@@ -1975,6 +2134,17 @@ export class AlbexEngine {
|
|
|
1975
2134
|
return false;
|
|
1976
2135
|
off += n;
|
|
1977
2136
|
}
|
|
2137
|
+
// Commit. For v3 this is the atomic apply step (state is untouched
|
|
2138
|
+
// until now); a failure here leaves the previous index intact so the
|
|
2139
|
+
// caller can keep using the engine. For v1/v2 snapshots `restoreCommit`
|
|
2140
|
+
// is a no-op that returns 1 (those formats applied in-place during
|
|
2141
|
+
// restoreFeed and have no rollback to offer). Older binaries that
|
|
2142
|
+
// predate v3 do not export `restoreCommit` — in that case we treat
|
|
2143
|
+
// the load as already committed (feature-detected via the typeof check below).
|
|
2144
|
+
if (typeof w.restoreCommit === 'function') {
|
|
2145
|
+
if (w.restoreCommit() !== 1)
|
|
2146
|
+
return false;
|
|
2147
|
+
}
|
|
1978
2148
|
// Rebuild _docs metadata from the restored WASM tables.
|
|
1979
2149
|
//
|
|
1980
2150
|
// What's available after a restore:
|
|
@@ -2035,10 +2205,12 @@ export class AlbexEngine {
|
|
|
2035
2205
|
* empty. Returns whether a load actually happened.
|
|
2036
2206
|
*/
|
|
2037
2207
|
async loadOrInit(name) {
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
|
|
2208
|
+
return this._exclusive(async () => {
|
|
2209
|
+
const loaded = await this._loadInner(name);
|
|
2210
|
+
if (!loaded)
|
|
2211
|
+
this._resetInner();
|
|
2212
|
+
return loaded;
|
|
2213
|
+
});
|
|
2042
2214
|
}
|
|
2043
2215
|
/** Delete a previously persisted snapshot. */
|
|
2044
2216
|
async deleteSnapshot(name) {
|
|
@@ -2060,7 +2232,8 @@ export class AlbexEngine {
|
|
|
2060
2232
|
* WASM instance and its (typically 20 MB) backing memory.
|
|
2061
2233
|
*/
|
|
2062
2234
|
[Symbol.dispose]() {
|
|
2063
|
-
|
|
2235
|
+
// Terminal: bypass the idle guard — disposing mid-operation is allowed.
|
|
2236
|
+
this._resetInner();
|
|
2064
2237
|
this._unsubscribeResources?.();
|
|
2065
2238
|
this._unsubscribeResources = null;
|
|
2066
2239
|
this._gpu?.destroy();
|