albex 0.1.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +416 -0
- package/README.md +244 -112
- package/dist/albex-worker.d.ts +70 -0
- package/dist/albex-worker.d.ts.map +1 -0
- package/dist/albex-worker.js +153 -0
- package/dist/albex-worker.js.map +1 -0
- package/dist/albex.d.ts +508 -6
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +1911 -141
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +52 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +66 -0
- package/dist/errors.js.map +1 -0
- package/dist/gpu/bloom-runtime.d.ts +60 -0
- package/dist/gpu/bloom-runtime.d.ts.map +1 -0
- package/dist/gpu/bloom-runtime.js +176 -0
- package/dist/gpu/bloom-runtime.js.map +1 -0
- package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
- package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
- package/dist/gpu/bloom-shader.wgsl.js +49 -0
- package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
- package/dist/persistence.d.ts +21 -0
- package/dist/persistence.d.ts.map +1 -0
- package/dist/persistence.js +174 -0
- package/dist/persistence.js.map +1 -0
- package/dist/pool/coordinator.d.ts +98 -0
- package/dist/pool/coordinator.d.ts.map +1 -0
- package/dist/pool/coordinator.js +247 -0
- package/dist/pool/coordinator.js.map +1 -0
- package/dist/profile.d.ts +100 -0
- package/dist/profile.d.ts.map +1 -0
- package/dist/profile.js +200 -0
- package/dist/profile.js.map +1 -0
- package/dist/resource-manager.d.ts +56 -0
- package/dist/resource-manager.d.ts.map +1 -0
- package/dist/resource-manager.js +138 -0
- package/dist/resource-manager.js.map +1 -0
- package/dist/tiered-store.d.ts +98 -0
- package/dist/tiered-store.d.ts.map +1 -0
- package/dist/tiered-store.js +238 -0
- package/dist/tiered-store.js.map +1 -0
- package/dist/wasm-bindings.d.ts +180 -0
- package/dist/wasm-bindings.d.ts.map +1 -0
- package/dist/wasm-bindings.js +128 -0
- package/dist/wasm-bindings.js.map +1 -0
- package/dist/worker-protocol.d.ts +86 -0
- package/dist/worker-protocol.d.ts.map +1 -0
- package/dist/worker-protocol.js +20 -0
- package/dist/worker-protocol.js.map +1 -0
- package/dist/worker-runtime.d.ts +14 -0
- package/dist/worker-runtime.d.ts.map +1 -0
- package/dist/worker-runtime.js +109 -0
- package/dist/worker-runtime.js.map +1 -0
- package/package.json +60 -13
- package/src/albex-worker.ts +187 -0
- package/src/albex.ts +2136 -189
- package/src/errors.ts +76 -0
- package/src/gpu/bloom-runtime.ts +229 -0
- package/src/gpu/bloom-shader.wgsl.ts +48 -0
- package/src/persistence.ts +175 -0
- package/src/pool/coordinator.ts +324 -0
- package/src/profile.ts +280 -0
- package/src/resource-manager.ts +167 -0
- package/src/tiered-store.ts +259 -0
- package/src/wasm-bindings.ts +349 -0
- package/src/worker-protocol.ts +48 -0
- package/src/worker-runtime.ts +106 -0
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_simd.wasm +0 -0
package/dist/albex.js
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
/*!
|
|
2
|
+
* albex v0.6.0
|
|
3
|
+
* Zero-config local full-text search for documents — runs entirely in the browser, no server, no upload.
|
|
4
|
+
* (c) 2026 RafaCalRob
|
|
5
|
+
* @license MIT
|
|
6
|
+
* https://github.com/RafaCalRob/Albex#readme
|
|
7
|
+
*/
|
|
1
8
|
/**
|
|
2
9
|
* Albex — local full-text search engine.
|
|
3
10
|
*
|
|
@@ -13,41 +20,64 @@
|
|
|
13
20
|
* const results = engine.search('contrato marco');
|
|
14
21
|
* ```
|
|
15
22
|
*/
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
23
|
+
import { asAlbexExports, asAlbexPdfExports, } from './wasm-bindings.js';
|
|
24
|
+
import { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
|
|
25
|
+
import { savePersisted, loadPersisted, deletePersisted, listPersisted, } from './persistence.js';
|
|
26
|
+
import { detectProfile, shouldUseGpu } from './profile.js';
|
|
27
|
+
import { getResourceManager } from './resource-manager.js';
|
|
28
|
+
import { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
|
|
29
|
+
export { AlbexError, AlbexInitError, AlbexUnsupportedFormatError, AlbexParseError, AlbexCapacityError, } from './errors.js';
|
|
30
|
+
export { listPersisted, deletePersisted } from './persistence.js';
|
|
31
|
+
export { detectProfile, pickTier, pickWorkerCount, shouldUseGpu } from './profile.js';
|
|
32
|
+
export { getResourceManager } from './resource-manager.js';
|
|
33
|
+
export { AlbexPool } from './pool/coordinator.js';
|
|
34
|
+
export { BloomGpu, packBloomsFromChunks } from './gpu/bloom-runtime.js';
|
|
35
|
+
export { TieredStore } from './tiered-store.js';
|
|
36
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
37
|
+
// Deprecation warnings — one-shot, fire-and-forget
|
|
38
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
39
|
+
let _searchStreamWarned = false;
/**
 * Print the `searchStream` deprecation notice through `console.warn`.
 *
 * The notice is emitted at most once per module instance: the first call
 * flips `_searchStreamWarned`, every later call returns without logging.
 */
function warnSearchStreamDeprecated() {
    if (!_searchStreamWarned) {
        _searchStreamWarned = true;
        // The original name implied incremental streaming, which the
        // implementation never provided; the message explains the rename.
        const notice = [
            '[albex] `searchStream` is deprecated; rename to `searchCooperative`. ',
            'The method does not stream incremental results — it yields to the ',
            'scheduler between slices and returns a batch. The alias will be ',
            'removed in 0.4.0.',
        ].join('');
        console.warn(notice);
    }
}
|
|
44
51
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
45
|
-
//
|
|
52
|
+
// Query parsing (WASM-side as of 0.5.0)
|
|
46
53
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
54
|
+
//
|
|
55
|
+
// Pre-0.5.0 this file owned parseQuery + tokenize. That created two
|
|
56
|
+
// truths about what a "token" was: one in TS for the query, one in Rust
|
|
57
|
+
// for the indexed text. The audit flagged this as the biggest divergence
|
|
58
|
+
// in the wrapper.
|
|
59
|
+
//
|
|
60
|
+
// 0.5.0 moves parseQuery/tokenize/tokensToWasmQuery to Rust. The TS
|
|
61
|
+
// dispatcher reduces to:
|
|
62
|
+
//
|
|
63
|
+
// 1. Write the raw UTF-8 query bytes to the scratchpad.
|
|
64
|
+
// 2. Call prepareQuery(len). Get back the kind (simple/phrase/or).
|
|
65
|
+
// 3. For OR: iterate getQueryBranchCount() branches, calling
|
|
66
|
+
// selectQueryBranch(i) + search() for each, then merge in TS.
|
|
67
|
+
// For simple/phrase: selectQueryBranch(0) + search().
|
|
68
|
+
// 4. For phrase: post-filter the snippets with containsPhrase().
|
|
69
|
+
//
|
|
70
|
+
// containsPhrase stays in TS because it operates on snippet text already
|
|
71
|
+
// produced by the WASM, not on the query. It is not a tokenizer.
|
|
47
72
|
/**
|
|
48
|
-
* Returns true if `snippet` contains the phrase
|
|
49
|
-
* with at most `maxGap` characters between
|
|
50
|
-
* Comparison is case- and accent-insensitive.
|
|
73
|
+
* Phrase post-filter. Returns true if `snippet` contains the phrase
|
|
74
|
+
* formed by `tokens` in order, with at most `maxGap` characters between
|
|
75
|
+
* consecutive tokens. Comparison is case- and accent-insensitive.
|
|
76
|
+
*
|
|
77
|
+
* The tokens come from the WASM-compiled pattern of a phrase branch,
|
|
78
|
+
* not from a TS re-tokenization of the query, so there is no
|
|
79
|
+
* tokenization divergence: WASM said "these are the tokens", we just
|
|
80
|
+
* check adjacency in the snippet.
|
|
51
81
|
*/
|
|
52
82
|
function containsPhrase(snippet, tokens, maxGap = 30) {
|
|
53
83
|
const norm = (s) => s.toLowerCase().normalize('NFKD').replace(/[̀-ͯ]/g, '');
|
|
@@ -80,7 +110,7 @@ function zipCentralDir(bytes) {
|
|
|
80
110
|
while (p >= 0 && v.getUint32(p, true) !== 0x06054b50)
|
|
81
111
|
p--;
|
|
82
112
|
if (p < 0)
|
|
83
|
-
throw new
|
|
113
|
+
throw new AlbexParseError('zip', 'Not a ZIP file (no EOCD record)');
|
|
84
114
|
return { v, cdOff: v.getUint32(p + 16, true), cdN: v.getUint16(p + 10, true) };
|
|
85
115
|
}
|
|
86
116
|
function listZipEntries(bytes) {
|
|
@@ -109,7 +139,7 @@ async function findZipEntry(bytes, name) {
|
|
|
109
139
|
}
|
|
110
140
|
cp += 46 + nl + xl + cl;
|
|
111
141
|
}
|
|
112
|
-
throw new
|
|
142
|
+
throw new AlbexParseError('zip', `Entry "${name}" not found in ZIP`);
|
|
113
143
|
}
|
|
114
144
|
async function decompEntry(bytes, v, off, compSize) {
|
|
115
145
|
const meth = v.getUint16(off + 8, true);
|
|
@@ -146,62 +176,508 @@ async function decompEntry(bytes, v, off, compSize) {
|
|
|
146
176
|
}
|
|
147
177
|
return out;
|
|
148
178
|
}
|
|
149
|
-
throw new
|
|
179
|
+
throw new AlbexParseError('zip', `Unsupported ZIP compression method ${meth}`);
|
|
150
180
|
}
|
|
151
181
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
152
182
|
// WASM memory helpers (internal)
|
|
153
183
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
154
|
-
const FEED_SIZE =
|
|
184
|
+
const FEED_SIZE = 32_768; // 32 KB — fits in 64 KB scratchpad
|
|
185
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
186
|
+
// Content hash — FNV-1a 64-bit
|
|
187
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
188
|
+
/**
|
|
189
|
+
* Compute a 64-bit FNV-1a hash of `bytes` and return it as a 16-char hex
|
|
190
|
+
* string. FNV-1a is a non-cryptographic hash; chosen here because:
|
|
191
|
+
* - it needs zero dependencies,
|
|
192
|
+
* - it is fast on small/medium blobs (~100 MB/s in modern JS),
|
|
193
|
+
* - 64 bits is enough to deduplicate documents in a 128-doc library with
|
|
194
|
+
* vanishing collision probability.
|
|
195
|
+
*
|
|
196
|
+
* The result is stable across runs and engines, so it can be persisted in
|
|
197
|
+
* snapshots without versioning concerns.
|
|
198
|
+
*/
|
|
199
|
+
/**
 * Compute the 64-bit Bloom mask of a query string, returned as a BigInt.
 *
 * The query is lowercased, NFKD-normalised and stripped of combining marks.
 * Every remaining ASCII code unit except the space (0x20) then sets bit
 * `code & 0x3F` of the mask; code units >= 0x80 contribute nothing.
 *
 * NOTE(review): per the original notes this must stay in sync with
 * `BloomFilter::from_text` / `fold_utf8_char` in `core/src/bloom.rs`, and the
 * resulting mask is what the GPU pre-filter checks against. The Rust side is
 * not visible from this file — confirm before changing the bit mapping.
 *
 * @param {string} query Raw query text.
 * @returns {bigint} Mask with one bit set per distinct folded ASCII code.
 */
function computePatternBloom(query) {
    const folded = query.toLowerCase().normalize('NFKD').replace(/[̀-ͯ]/g, '');
    const SPACE = 0x20;
    let mask = 0n;
    for (const ch of folded) {
        const cp = ch.codePointAt(0);
        // Non-ASCII input and the token separator leave the mask untouched.
        if (cp >= 0x80 || cp === SPACE) {
            continue;
        }
        mask |= 1n << BigInt(cp & 0x3f);
    }
    return mask;
}
|
|
229
|
+
// Note: `contentHash` is implemented as a method on AlbexEngine below
|
|
230
|
+
// (it needs access to the WASM scratchpad). The standalone TS reference
|
|
231
|
+
// implementation that used to live here was removed in 0.4.0 — the
|
|
232
|
+
// canonical hash now lives in wasm/src/lib.rs::hashBytes so there is
|
|
233
|
+
// exactly one definition of "the content hash of these bytes".
|
|
234
|
+
/**
 * Turn a 16-hex-char content hash into its 8 raw bytes.
 *
 * Byte `k` of the result is parsed from hex characters `2k` and `2k + 1`, so
 * the bytes appear in the same order as the hex digits read left to right.
 * A pair that is missing or not valid hex parses to NaN, which the typed
 * array stores as 0.
 *
 * NOTE(review): per the original notes the output feeds
 * `setDocumentContentHash` and matches the snapshot byte order; neither is
 * visible from this function.
 *
 * @param {string} hex Hex string, normally 16 characters.
 * @returns {Uint8Array} Exactly 8 bytes.
 */
function hashHexToBytes(hex) {
    return Uint8Array.from(
        { length: 8 },
        (_, k) => parseInt(hex.slice(2 * k, 2 * k + 2), 16),
    );
}
|
|
249
|
+
/**
 * Windows-1252 characters for the 0x80-0x9F block, keyed by byte value.
 * Bytes with no entry here (0x81, 0x8D, 0x8F, 0x90, 0x9D) are absent on purpose
 * of the lookup below returning an empty string for them.
 */
const _CP1252_HIGH = {
    0x80: '€',
    0x82: '‚', 0x83: 'ƒ', 0x84: '„', 0x85: '…',
    0x86: '†', 0x87: '‡', 0x88: 'ˆ', 0x89: '‰',
    0x8A: 'Š', 0x8B: '‹', 0x8C: 'Œ',
    0x8E: 'Ž',
    0x91: '‘', 0x92: '’', 0x93: '“', 0x94: '”',
    0x95: '•', 0x96: '–', 0x97: '—', 0x98: '˜',
    0x99: '™', 0x9A: 'š', 0x9B: '›', 0x9C: 'œ',
    0x9E: 'ž', 0x9F: 'Ÿ',
};
/**
 * Map one Windows-1252 byte to the character it represents.
 *
 * Bytes below 0x80 and bytes from 0xA0 upward are returned through
 * `String.fromCharCode` unchanged. Bytes in 0x80-0x9F are looked up in
 * `_CP1252_HIGH`; a byte without an entry yields the empty string.
 *
 * NOTE(review): the original notes say this serves the RTF parser's `\'XX`
 * escapes; the caller is not visible here.
 *
 * @param {number} byte Byte value to convert.
 * @returns {string} One character, or '' for an unmapped 0x80-0x9F byte.
 */
function rtfCp1252ToChar(byte) {
    if (byte < 0x80 || byte >= 0xA0) {
        return String.fromCharCode(byte);
    }
    return _CP1252_HIGH[byte] ?? '';
}
|
|
273
|
+
/**
 * Apply an entity's Content-Transfer-Encoding to its body.
 *
 * `base64` and `quoted-printable` are decoded; every other value (7bit, 8bit,
 * binary, empty, unrecognised) is passed through untouched — better to index
 * something marginally useful than to drop the body entirely.
 *
 * The header value is trimmed and lowercased before comparison so that
 * surrounding whitespace or a stray CR left by the header reader does not
 * turn an encoded body into pass-through. A missing header (null/undefined
 * from `header`) is treated as "no encoding".
 *
 * @param {string} headersBlock Raw header block of the entity.
 * @param {string} body Entity body as text.
 * @param {(block: string, name: string) => (string|null|undefined)} header
 *        Lookup helper returning the value of the named header.
 * @returns {string} Decoded body, or `body` unchanged for pass-through cases.
 */
function decodeEmlBody(headersBlock, body, header) {
    const raw = header(headersBlock, 'Content-Transfer-Encoding');
    const enc = String(raw ?? '').trim().toLowerCase();
    switch (enc) {
        case 'base64':
            return decodeBase64Utf8(body);
        case 'quoted-printable':
            return decodeQuotedPrintable(body);
        default:
            return body;
    }
}
|
|
287
|
+
/**
 * Decode a base64 body and read the resulting bytes through `_dec`.
 *
 * All whitespace (including the line breaks inside a wrapped body) is removed
 * before decoding. An input that is empty after stripping returns ''. Any
 * failure — `atob` rejecting malformed input, or the decoder throwing — falls
 * back to returning the original string so the caller can still index
 * *something*.
 *
 * NOTE(review): `_dec` is defined outside this block; the original notes
 * describe it as a UTF-8 decoder — confirm.
 *
 * @param {string} body Base64 text, possibly line-wrapped.
 * @returns {string} Decoded text, '' for blank input, or `body` on failure.
 */
function decodeBase64Utf8(body) {
    try {
        const compact = body.replace(/\s+/g, '');
        if (compact.length === 0) {
            return '';
        }
        // atob yields a "binary string": one char per byte, value in the low
        // 8 bits. Copy those into a byte array so multi-byte sequences are
        // decoded as a unit.
        const raw = Uint8Array.from(atob(compact), (ch) => ch.charCodeAt(0));
        return _dec.decode(raw);
    }
    catch {
        return body;
    }
}
|
|
312
|
+
/**
 * Decode a quoted-printable body.
 *
 * Handles `=XX` hex escapes and soft line breaks (`=` immediately followed by
 * a line ending, which produces nothing). Both LF and CRLF line endings are
 * recognised for soft breaks: CRLF is the canonical form in RFC 2045, and
 * treating only `=\n` as a soft break left a literal `=` plus the line break
 * in the output for CRLF input. The collected bytes are then decoded through
 * `_dec`, so several hex pairs in a row may form one multi-byte character.
 *
 * A malformed escape keeps its literal `=`. If decoding throws, the original
 * string is returned.
 *
 * NOTE(review): `_dec` is defined outside this block; the original notes
 * describe it as a UTF-8 decoder — confirm.
 *
 * @param {string} body Quoted-printable text.
 * @returns {string} Decoded text, or `body` if the byte decoder throws.
 */
function decodeQuotedPrintable(body) {
    const HEX_PAIR = /^[0-9A-Fa-f]{2}$/;
    // First pass: collect raw bytes so multi-byte sequences decode together.
    const bytes = [];
    for (let i = 0; i < body.length; i++) {
        const c = body[i];
        if (c !== '=') {
            // ASCII pass-through. A non-ASCII char in the source isn't
            // strictly valid QP; keep its low byte as a best effort.
            bytes.push(c.charCodeAt(0) & 0xff);
            continue;
        }
        // Soft line break: `=` at end of line, LF or CRLF.
        if (body[i + 1] === '\n') {
            i += 1;
            continue;
        }
        if (body[i + 1] === '\r' && body[i + 2] === '\n') {
            i += 2;
            continue;
        }
        // `=XX` hex pair.
        const h = body.slice(i + 1, i + 3);
        if (HEX_PAIR.test(h)) {
            bytes.push(parseInt(h, 16));
            i += 2;
            continue;
        }
        // Malformed: keep the literal `=`.
        bytes.push(0x3D);
    }
    try {
        return _dec.decode(new Uint8Array(bytes));
    }
    catch {
        return body;
    }
}
|
|
352
|
+
/**
 * Render the first 8 bytes of `bytes` as a 16-char lowercase hex string.
 * Inverse of hashHexToBytes. When all 8 bytes are zero the result is ''
 * (no hash known).
 *
 * @param {ArrayLike<number>} bytes At least 8 byte values.
 * @returns {string} 16 hex characters, or '' for an all-zero hash.
 */
function hashBytesToHex(bytes) {
    const head = Array.from({ length: 8 }, (_, k) => bytes[k]);
    if (head.every((b) => b === 0)) {
        return '';
    }
    return head.map((b) => b.toString(16).padStart(2, '0')).join('');
}
|
|
155
369
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
156
370
|
// PDF WASM imports shim
|
|
157
371
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
158
|
-
|
|
372
|
+
/**
 * Build the import object for `albex_pdf.wasm` from the imports the compiled
 * module itself declares (`WebAssembly.Module.imports`).
 *
 * wasm-bindgen import names embed a build-time hash, e.g.
 *   __wbg_getRandomValues_3f44b700395062e5
 * so imports are matched by *prefix* rather than by exact hashed name, which
 * lets the binding survive cosmetic mangling changes. Resolution rules:
 *   - __wbg_getRandomValues* / __wbg_crypto*        → fill `len` bytes at `ptr`
 *                                                     of the PDF memory via
 *                                                     crypto.getRandomValues
 *   - __wbindgen_describe* / __wbindgen_throw*      → no-op
 *   - __wbindgen_object_drop_ref                    → heap-slot recycler
 *   - __wbindgen_externref_table_grow               → heap grower
 *   - __wbindgen_externref_table_set_null           → heap nuller
 *
 * Any other import is rejected: an `AlbexInitError` is thrown while the import
 * object is being built, so dependency drift surfaces before instantiation
 * instead of deep inside a later PDF operation.
 *
 * @param {WebAssembly.Module} module Compiled PDF module to inspect.
 * @param {() => (WebAssembly.Memory|null|undefined)} getPdfMem Returns the PDF
 *        instance's memory; consulted each time random bytes are requested.
 * @returns {Object<string, Object<string, Function>>} Import object keyed by
 *          import module name, then import name.
 * @throws {AlbexInitError} If the module declares an unrecognised import.
 */
function makePdfWasmImports(module, getPdfMem) {
    const heap = [];
    let freeIdx = -1;

    // Random-byte provider shared by every hashed getRandomValues/crypto name.
    const fillRandom = (ptr, len) => {
        const mem = getPdfMem();
        if (!mem)
            throw new Error('PDF WASM memory not initialised');
        crypto.getRandomValues(new Uint8Array(mem.buffer, ptr >>> 0, len >>> 0));
    };
    const noop = () => { };

    // Externref-heap management used by the wasm-bindgen runtime.
    const heapHandlers = new Map([
        ['__wbindgen_object_drop_ref', (idx) => {
            heap[idx] = freeIdx;
            freeIdx = idx;
        }],
        ['__wbindgen_externref_table_grow', (delta) => {
            const old = heap.length;
            for (let i = 0; i < delta; i++)
                heap.push(undefined);
            return old;
        }],
        ['__wbindgen_externref_table_set_null', (idx) => {
            heap[idx] = undefined;
        }],
    ]);

    const randomPrefixes = ['__wbg_getRandomValues', '__wbg_crypto'];
    // Diagnostic / introspection hooks, stubbed out.
    const inertPrefixes = ['__wbindgen_describe', '__wbindgen_throw'];
    const hasPrefix = (name, prefixes) => prefixes.some((p) => name.startsWith(p));

    const resolveByName = (modName, name) => {
        if (hasPrefix(name, randomPrefixes)) {
            return fillRandom;
        }
        if (hasPrefix(name, inertPrefixes)) {
            return noop;
        }
        const handler = heapHandlers.get(name);
        if (handler) {
            return handler;
        }
        // Unknown import — fail fast rather than defer the failure to an
        // arbitrary execution path inside the PDF code.
        throw new AlbexInitError(`Unknown PDF WASM import "${modName}.${name}". ` +
            `The albex_pdf.wasm binary was probably built with a newer Rust ` +
            `toolchain or dependency graph than this loader was written for. ` +
            `Rebuild with 'npm run build:pdf-wasm' or open an issue.`);
    };

    const imports = {};
    for (const { module: modName, name } of WebAssembly.Module.imports(module)) {
        const namespace = (imports[modName] ||= {});
        namespace[name] = resolveByName(modName, name);
    }
    return imports;
}
|
|
184
|
-
// ─────────────────────────────────────────────────────────────────────────────
|
|
185
|
-
// AlbexEngine
|
|
186
|
-
// ─────────────────────────────────────────────────────────────────────────────
|
|
187
448
|
export class AlbexEngine {
|
|
449
|
+
// ── main WASM ──
|
|
450
|
+
_wasm;
|
|
451
|
+
_mem;
|
|
452
|
+
/**
|
|
453
|
+
* OCR entry point installed by `@albex/ocr::enableOcr(engine)`. Undefined
|
|
454
|
+
* when the OCR module has not been wired. The main `albex` package has no
|
|
455
|
+
* runtime dependency on OCR — this is a structural slot that the optional
|
|
456
|
+
* companion package fills.
|
|
457
|
+
*/
|
|
458
|
+
/**
|
|
459
|
+
* Public OCR entry point. Forwards to the attached OCR adapter installed
|
|
460
|
+
* via `attachOcr()`. Reading this property is a feature-detect for
|
|
461
|
+
* integrators: `if (engine.ocrImage) { ... OCR available ... }`. Writing
|
|
462
|
+
* to it directly is no longer supported in 0.5.0+ — use `attachOcr`.
|
|
463
|
+
*/
|
|
464
|
+
get ocrImage() {
|
|
465
|
+
return this._ocrAdapter?.recognize;
|
|
466
|
+
}
|
|
467
|
+
/** Private adapter slot. Holds the OCR plugin contract installed by
|
|
468
|
+
* `attachOcr()`. The engine reads `recognize` and `options` here; the
|
|
469
|
+
* caller never gets a reference to this object directly. */
|
|
470
|
+
_ocrAdapter = null;
|
|
471
|
+
// ── PDF WASM (lazy) ──
|
|
472
|
+
_pdfWasm = null;
|
|
473
|
+
_pdfMem = null;
|
|
474
|
+
_docs = [];
|
|
475
|
+
_lastSearch = null;
|
|
476
|
+
/** Structured diagnostics collected during the most recent operation.
|
|
477
|
+
* Drained by `takeDiagnostics()`. Capped at 256 entries to avoid
|
|
478
|
+
* unbounded memory growth in pathological cases (very corrupted
|
|
479
|
+
* corpora producing thousands of recovery warnings). */
|
|
480
|
+
_diagnostics = [];
|
|
481
|
+
_tier = null;
|
|
482
|
+
_simd = false;
|
|
483
|
+
_profile = null;
|
|
484
|
+
_resources = null;
|
|
485
|
+
_gpu = null;
|
|
486
|
+
_gpuChunkCountUploaded = 0;
|
|
487
|
+
_unsubscribeResources = null;
|
|
488
|
+
_opts;
|
|
489
|
+
// ── Concurrency guard ──────────────────────────────────────────────────────
|
|
490
|
+
// One WASM instance, global mutable state, async ops that yield to the
|
|
491
|
+
// scheduler between slices. Two overlapping operations corrupt each other
|
|
492
|
+
// (e.g. a fresh searchBegin resets the cursor of an in-flight cooperative
|
|
493
|
+
// search). Async ops serialize through `_opChain`; sync mutators/searches
|
|
494
|
+
// assert the engine is idle (audit 0.6.0, finding #2).
|
|
495
|
+
_opChain = Promise.resolve();
|
|
496
|
+
_busy = false;
|
|
188
497
|
constructor(opts) {
|
|
189
|
-
// ── PDF WASM (lazy) ──
|
|
190
|
-
this._pdfWasm = null;
|
|
191
|
-
this._pdfMem = null;
|
|
192
|
-
this._docs = [];
|
|
193
|
-
this._lastSearch = null;
|
|
194
498
|
this._opts = opts;
|
|
195
499
|
}
|
|
500
|
+
/** Serialize an async engine operation behind any in-flight one. */
|
|
501
|
+
_exclusive(fn) {
|
|
502
|
+
const run = this._opChain.then(async () => {
|
|
503
|
+
this._busy = true;
|
|
504
|
+
try {
|
|
505
|
+
return await fn();
|
|
506
|
+
}
|
|
507
|
+
finally {
|
|
508
|
+
this._busy = false;
|
|
509
|
+
}
|
|
510
|
+
});
|
|
511
|
+
// Swallow result/error on the chain so one failure can't wedge the queue.
|
|
512
|
+
this._opChain = run.then(() => undefined, () => undefined);
|
|
513
|
+
return run;
|
|
514
|
+
}
|
|
515
|
+
/** Guard a synchronous mutator/search: refuse to run mid-async-operation
|
|
516
|
+
* rather than silently corrupt the shared WASM state. */
|
|
517
|
+
_assertIdle(method) {
|
|
518
|
+
if (this._busy) {
|
|
519
|
+
throw new AlbexError('busy', `${method}() was called while an async engine operation is still ` +
|
|
520
|
+
`running. Await the previous indexFile/save/load/replaceDocument/` +
|
|
521
|
+
`searchCooperative call, or use searchCooperative instead of search().`);
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
/** Compact opportunistically when tombstones pile up under text pressure,
|
|
525
|
+
* so repeated removeDocument/replaceDocument don't exhaust the pool. */
|
|
526
|
+
_autoCompactIfNeeded() {
|
|
527
|
+
const w = this._wasm;
|
|
528
|
+
const cap = w.getTextCapacity();
|
|
529
|
+
const hasTombstones = w.getDocCount() > this._docs.length;
|
|
530
|
+
if (hasTombstones && cap > 0 && w.getTextUsed() / cap > 0.85) {
|
|
531
|
+
w.compact();
|
|
532
|
+
}
|
|
533
|
+
}
|
|
196
534
|
/** Load and initialise the main WASM module. Must be called before any other method. */
|
|
197
535
|
async init() {
|
|
198
|
-
const
|
|
536
|
+
const url = await this._resolveWasmUrl();
|
|
537
|
+
const res = await fetch(url);
|
|
199
538
|
if (!res.ok)
|
|
200
|
-
throw new
|
|
539
|
+
throw new AlbexInitError(`Failed to fetch WASM: ${res.status} (${url})`);
|
|
201
540
|
const { instance } = await WebAssembly.instantiateStreaming(res, {});
|
|
202
|
-
this._wasm = instance.exports;
|
|
203
|
-
this._mem =
|
|
541
|
+
this._wasm = asAlbexExports(instance.exports);
|
|
542
|
+
this._mem = this._wasm.memory;
|
|
204
543
|
this._wasm.init();
|
|
544
|
+
// Subscribe to environmental signals. Cheap and benign in node tests
|
|
545
|
+
// (the manager tolerates missing globals).
|
|
546
|
+
const rm = getResourceManager();
|
|
547
|
+
await rm.start();
|
|
548
|
+
this._resources = rm.state;
|
|
549
|
+
this._unsubscribeResources = rm.on(s => { this._resources = s; });
|
|
550
|
+
// Lazily initialise the GPU Bloom accelerator. We don't acquire a device
|
|
551
|
+
// here yet — that happens on the first search that crosses the threshold.
|
|
552
|
+
// This keeps cold-start cost the same on GPU and CPU paths.
|
|
553
|
+
if (this._opts.gpu !== 'off') {
|
|
554
|
+
this._gpu = new BloomGpu();
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
/**
|
|
558
|
+
* Decide which `.wasm` binary to fetch. Order of precedence:
|
|
559
|
+
* 1. `opts.wasmUrl` if provided — used verbatim.
|
|
560
|
+
* 2. `opts.tier` if explicit — joined with `wasmBaseUrl`.
|
|
561
|
+
* 3. `opts.wasmBaseUrl` + tier picked from the device profile.
|
|
562
|
+
*
|
|
563
|
+
* Order of precedence:
|
|
564
|
+
* 1. `opts.wasmUrl` literal → use verbatim
|
|
565
|
+
* 2. `opts.wasmBaseUrl` + tier suffix → fetched from that directory
|
|
566
|
+
* 3. zero-config default → `albex_wasm_bg.wasm` packaged
|
|
567
|
+
* next to this file, resolved
|
|
568
|
+
* via `import.meta.url`
|
|
569
|
+
*
|
|
570
|
+
* The zero-config default loads the std-baseline binary. Tier auto-detection
|
|
571
|
+
* is only active when `wasmBaseUrl` is given, because picking a tier in
|
|
572
|
+
* runtime would defeat any bundler's static asset rewriting. Users who want
|
|
573
|
+
* tier optimisation must serve the six variants themselves and pass the
|
|
574
|
+
* directory through `wasmBaseUrl`.
|
|
575
|
+
*/
|
|
576
|
+
async _resolveWasmUrl() {
|
|
577
|
+
const o = this._opts;
|
|
578
|
+
if (o.wasmUrl) {
|
|
579
|
+
this._profile = await detectProfile();
|
|
580
|
+
return o.wasmUrl;
|
|
581
|
+
}
|
|
582
|
+
// Always cache the profile so GPU/worker decisions later don't re-probe.
|
|
583
|
+
const profile = await detectProfile();
|
|
584
|
+
this._profile = profile;
|
|
585
|
+
// Path 3: zero-config — bundler-friendly default. `new URL(..., import.meta.url)`
|
|
586
|
+
// is recognised by Vite, Webpack 5+, esbuild, Rollup, Parcel 2 and Next.js
|
|
587
|
+
// as an asset reference. They copy the .wasm to the output directory and
|
|
588
|
+
// rewrite the URL automatically. Consumers who use one of those bundlers
|
|
589
|
+
// get a working `new AlbexEngine()` with no manual setup.
|
|
590
|
+
// 0.5.0+: two main binaries only — baseline and SIMD. The tier
|
|
591
|
+
// system is gone (audit 4.1). Selection collapses to a single
|
|
592
|
+
// boolean: SIMD on or off, decided either by the explicit `simd`
|
|
593
|
+
// option or by a runtime probe.
|
|
594
|
+
const simd = o.simd === 'on'
|
|
595
|
+
? true
|
|
596
|
+
: o.simd === 'off'
|
|
597
|
+
? false
|
|
598
|
+
: !!profile?.wasm.simd;
|
|
599
|
+
this._simd = simd;
|
|
600
|
+
this._tier = 'std';
|
|
601
|
+
if (!o.wasmBaseUrl) {
|
|
602
|
+
// Zero-config: bundler resolves the .wasm next to dist/. We only
|
|
603
|
+
// ship the baseline alias (albex_wasm_bg.wasm) inside the npm
|
|
604
|
+
// package; integrators who want SIMD must serve both binaries
|
|
605
|
+
// themselves via `wasmBaseUrl`.
|
|
606
|
+
return new URL('../wasm/pkg/albex_wasm_bg.wasm', import.meta.url).href;
|
|
607
|
+
}
|
|
608
|
+
const base = o.wasmBaseUrl.replace(/\/+$/, '');
|
|
609
|
+
return simd ? `${base}/albex_wasm_simd.wasm` : `${base}/albex_wasm.wasm`;
|
|
610
|
+
}
|
|
611
|
+
/** The tier that was actually loaded. `null` until `init()` resolves. */
|
|
612
|
+
get tier() { return this._tier; }
|
|
613
|
+
/** True if the SIMD-accelerated binary was loaded. */
|
|
614
|
+
get simdEnabled() { return this._simd; }
|
|
615
|
+
/** True if a WebGPU device is acquired and the next search will use it. */
|
|
616
|
+
get gpuEngaged() { return !!this._gpu?.available; }
|
|
617
|
+
// ── GPU acceleration (CD1) ───────────────────────────────────────────────
|
|
618
|
+
/**
|
|
619
|
+
* Decide whether to use the GPU pre-filter for the upcoming search.
|
|
620
|
+
*
|
|
621
|
+
* Policy:
|
|
622
|
+
* - `gpu: 'off'` → never.
|
|
623
|
+
* - `gpu: 'on'` → always try (still fails over to CPU).
|
|
624
|
+
* - `gpu: 'auto'` (default) → only when WebGPU is available AND
|
|
625
|
+
* chunk count crosses `gpuThreshold`.
|
|
626
|
+
*/
|
|
627
|
+
_shouldEngageGpu() {
|
|
628
|
+
const o = this._opts;
|
|
629
|
+
if (!this._gpu)
|
|
630
|
+
return false;
|
|
631
|
+
if (o.gpu === 'off')
|
|
632
|
+
return false;
|
|
633
|
+
if (o.gpu === 'on')
|
|
634
|
+
return true;
|
|
635
|
+
if (!this._profile)
|
|
636
|
+
return false;
|
|
637
|
+
const threshold = o.gpuThreshold ?? 20_000;
|
|
638
|
+
return shouldUseGpu(this._profile, this._wasm.getChunkCount(), threshold);
|
|
639
|
+
}
|
|
640
|
+
/**
|
|
641
|
+
* Run the GPU Bloom scan and install the resulting candidate bitset into
|
|
642
|
+
* WASM. The next `searchBegin` will see the mask and `searchSlice` will
|
|
643
|
+
* restrict its Bitap pass to those candidates.
|
|
644
|
+
*
|
|
645
|
+
* No-op if the GPU device hasn't been acquired yet — first call attempts
|
|
646
|
+
* `init()` lazily; if that fails, the candidate path is permanently
|
|
647
|
+
* disabled for this engine instance.
|
|
648
|
+
*/
|
|
649
|
+
async _gpuPreFilter(wasmQuery) {
|
|
650
|
+
const gpu = this._gpu;
|
|
651
|
+
if (!gpu)
|
|
652
|
+
return;
|
|
653
|
+
if (!gpu.available) {
|
|
654
|
+
const ok = await gpu.init();
|
|
655
|
+
if (!ok) {
|
|
656
|
+
this._gpu = null;
|
|
657
|
+
return;
|
|
658
|
+
}
|
|
659
|
+
}
|
|
660
|
+
const chunkCount = this._wasm.getChunkCount();
|
|
661
|
+
if (chunkCount === 0)
|
|
662
|
+
return;
|
|
663
|
+
// Upload blooms if the corpus changed. We re-upload everything on any
|
|
664
|
+
// delta; incremental delta-upload is a future optimisation.
|
|
665
|
+
if (chunkCount !== this._gpuChunkCountUploaded) {
|
|
666
|
+
const ptr = this._wasm.getChunksPtr();
|
|
667
|
+
const stride = this._wasm.getChunkStructSize();
|
|
668
|
+
const bytes = new Uint8Array(this._mem.buffer, ptr, chunkCount * stride);
|
|
669
|
+
const blooms = packBloomsFromChunks(bytes, chunkCount);
|
|
670
|
+
gpu.uploadChunkBlooms(blooms, chunkCount);
|
|
671
|
+
this._gpuChunkCountUploaded = chunkCount;
|
|
672
|
+
}
|
|
673
|
+
// Build the pattern Bloom on the JS side: same hash as Rust
|
|
674
|
+
// (`c & 0x3F` after accent-folding), aggregated across all tokens.
|
|
675
|
+
const patternBloom = computePatternBloom(wasmQuery);
|
|
676
|
+
const passes = await gpu.scan(Number(patternBloom & 0xffffffffn), Number((patternBloom >> 32n) & 0xffffffffn));
|
|
677
|
+
// Push the bitset back into WASM via the scratchpad.
|
|
678
|
+
const passBytes = new Uint8Array(passes.buffer, passes.byteOffset, passes.byteLength);
|
|
679
|
+
this._writePad(passBytes);
|
|
680
|
+
this._wasm.setCandidateMask(passBytes.byteLength);
|
|
205
681
|
}
|
|
206
682
|
// ── Internal helpers ──────────────────────────────────────────────────────
|
|
207
683
|
_u8(off, n) {
|
|
@@ -210,7 +686,7 @@ export class AlbexEngine {
|
|
|
210
686
|
_writePad(b) {
|
|
211
687
|
const ptr = this._wasm.getBuffer(b.length);
|
|
212
688
|
if (!ptr)
|
|
213
|
-
throw new
|
|
689
|
+
throw new AlbexCapacityError(`Scratchpad too small for ${b.length} bytes`);
|
|
214
690
|
this._u8(ptr, b.length).set(b);
|
|
215
691
|
return ptr;
|
|
216
692
|
}
|
|
@@ -231,38 +707,81 @@ export class AlbexEngine {
|
|
|
231
707
|
this._wasm.feedText(c.length);
|
|
232
708
|
}
|
|
233
709
|
}
|
|
710
|
+
/**
|
|
711
|
+
* Compute the FNV-1a 64-bit content hash of `bytes` via the WASM
|
|
712
|
+
* streaming API. Returns a 16-character hex string identical in shape
|
|
713
|
+
* to what the TS implementation in 0.3.x returned, so all callers
|
|
714
|
+
* stay unchanged. Single source of truth — same hash whether we use
|
|
715
|
+
* it for indexFile dedup, for snapshot v2 persistence, or anywhere
|
|
716
|
+
* else. Large inputs are chunked at FEED_SIZE just like _feedText.
|
|
717
|
+
*/
|
|
718
|
+
_contentHash(bytes) {
|
|
719
|
+
const w = this._wasm;
|
|
720
|
+
w.hashBegin();
|
|
721
|
+
for (let i = 0; i < bytes.length; i += FEED_SIZE) {
|
|
722
|
+
const c = bytes.subarray(i, i + FEED_SIZE);
|
|
723
|
+
this._writePad(c);
|
|
724
|
+
w.hashFeed(c.length);
|
|
725
|
+
}
|
|
726
|
+
w.hashFinish();
|
|
727
|
+
// Read 8 result bytes back from scratchpad[0..8].
|
|
728
|
+
const ptr = w.getBuffer(8);
|
|
729
|
+
const out = this._u8(ptr, 8);
|
|
730
|
+
// Big-endian to hex. Same layout as the old hexHi + hexLo output:
|
|
731
|
+
// high u32 first (4 bytes), low u32 second (4 bytes).
|
|
732
|
+
let s = '';
|
|
733
|
+
for (let i = 0; i < 8; i++) {
|
|
734
|
+
s += out[i].toString(16).padStart(2, '0');
|
|
735
|
+
}
|
|
736
|
+
return s;
|
|
737
|
+
}
|
|
234
738
|
_feedXmlBytes(xml, fn) {
|
|
739
|
+
const feeder = this._wasm[fn];
|
|
235
740
|
for (let i = 0; i < xml.length; i += FEED_SIZE) {
|
|
236
741
|
const c = xml.subarray(i, i + FEED_SIZE);
|
|
237
742
|
this._writePad(c);
|
|
238
|
-
|
|
743
|
+
feeder(c.length);
|
|
239
744
|
}
|
|
240
745
|
}
|
|
241
746
|
// ── PDF WASM (lazy load) ─────────────────────────────────────────────────
|
|
242
747
|
async _ensurePdfWasm() {
|
|
243
748
|
if (this._pdfWasm)
|
|
244
749
|
return;
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
750
|
+
// Zero-config default: resolve relative to this module so bundlers copy
|
|
751
|
+
// the .wasm to the output automatically. Override with `opts.pdfWasmUrl`
|
|
752
|
+
// when serving from a separate CDN.
|
|
753
|
+
const pdfUrl = this._opts.pdfWasmUrl
|
|
754
|
+
?? new URL('../wasm/pkg/albex_pdf.wasm', import.meta.url).href;
|
|
755
|
+
// Network politeness: on constrained connections (slow-2g/2g/saveData)
|
|
756
|
+
// we still fetch on explicit user request — `_ensurePdfWasm` is only
|
|
757
|
+
// called when the user actually drops a PDF — but we issue a console
|
|
758
|
+
// hint so embedders can surface a "this will download ~1 MB" prompt.
|
|
759
|
+
if (this._resources?.constrainedNetwork) {
|
|
760
|
+
this._diag({
|
|
761
|
+
kind: 'info', stage: 'network',
|
|
762
|
+
message: 'Downloading PDF WASM (~1 MB) on a constrained network connection',
|
|
763
|
+
});
|
|
764
|
+
}
|
|
765
|
+
const res = await fetch(pdfUrl);
|
|
248
766
|
if (!res.ok)
|
|
249
|
-
throw new
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
767
|
+
throw new AlbexInitError(`Failed to fetch PDF WASM: ${res.status}`);
|
|
768
|
+
// Compile first so we can inspect the module's required imports and
|
|
769
|
+
// resolve mangled wasm-bindgen names by prefix rather than by hash.
|
|
770
|
+
const module = await WebAssembly.compileStreaming(res);
|
|
771
|
+
const imports = makePdfWasmImports(module, () => this._pdfMem);
|
|
772
|
+
const instance = await WebAssembly.instantiate(module, imports);
|
|
773
|
+
this._pdfWasm = asAlbexPdfExports(instance.exports);
|
|
774
|
+
this._pdfMem = this._pdfWasm.memory;
|
|
254
775
|
}
|
|
255
776
|
// ── Indexers ──────────────────────────────────────────────────────────────
|
|
256
|
-
async _indexDocx(file) {
|
|
257
|
-
const bytes = new Uint8Array(await file.arrayBuffer());
|
|
777
|
+
async _indexDocx(file, bytes) {
|
|
258
778
|
const xml = await findZipEntry(bytes, 'word/document.xml');
|
|
259
779
|
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
260
780
|
this._wasm.beginDocument();
|
|
261
781
|
this._feedXmlBytes(xml, 'feedXmlBytes');
|
|
262
782
|
return this._wasm.endDocument();
|
|
263
783
|
}
|
|
264
|
-
async _indexXlsx(file) {
|
|
265
|
-
const bytes = new Uint8Array(await file.arrayBuffer());
|
|
784
|
+
async _indexXlsx(file, bytes) {
|
|
266
785
|
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
267
786
|
this._wasm.beginXlsx();
|
|
268
787
|
try {
|
|
@@ -280,40 +799,291 @@ export class AlbexEngine {
|
|
|
280
799
|
}
|
|
281
800
|
return this._wasm.endDocument();
|
|
282
801
|
}
|
|
283
|
-
async _indexPdf(file) {
|
|
802
|
+
async _indexPdf(file, bytes) {
|
|
284
803
|
await this._ensurePdfWasm();
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
804
|
+
let pw = this._pdfWasm;
|
|
805
|
+
let pm = this._pdfMem;
|
|
806
|
+
if (!pw || !pm)
|
|
807
|
+
throw new AlbexInitError('PDF WASM not initialised');
|
|
808
|
+
// Reserve input buffer and copy bytes. allocInput may trigger a
|
|
809
|
+
// memory.grow inside the PDF module; the previous pm.buffer would
|
|
810
|
+
// become detached. Refresh the memory reference before constructing
|
|
811
|
+
// the view to be safe.
|
|
288
812
|
const inPtr = pw.allocInput(bytes.length);
|
|
813
|
+
pm = pw.memory;
|
|
289
814
|
new Uint8Array(pm.buffer, inPtr, bytes.length).set(bytes);
|
|
290
|
-
|
|
815
|
+
// extractPdf can panic inside pdf-extract/lopdf for PDFs that other
|
|
816
|
+
// tools accept (encrypted streams without password, exotic font
|
|
817
|
+
// dictionaries, malformed cross-reference tables, etc.). The crate
|
|
818
|
+
// is built with panic="abort" (required on wasm32-unknown-unknown
|
|
819
|
+
// — no unwinding), so the panic surfaces as a WASM `unreachable`
|
|
820
|
+
// trap and the module instance becomes unusable.
|
|
821
|
+
//
|
|
822
|
+
// Recovery strategy when this happens:
|
|
823
|
+
// 1. Discard the poisoned instance.
|
|
824
|
+
// 2. If OCR is wired AND the rebuilt binary supports image
|
|
825
|
+
// extraction, re-instantiate, reload the input bytes, and try
|
|
826
|
+
// the lopdf-only image-extraction path. lopdf is a separate
|
|
827
|
+
// parser from pdf-extract's text codec — there are real PDFs
|
|
828
|
+
// that pdf-extract trips on but lopdf walks fine, and we can
|
|
829
|
+
// recover the page images even when we cannot recover the
|
|
830
|
+
// vector text.
|
|
831
|
+
// 3. If OCR isn't wired (or the recovery also fails), surface a
|
|
832
|
+
// helpful AlbexParseError that points the user at the fix.
|
|
833
|
+
let pageCount;
|
|
834
|
+
try {
|
|
835
|
+
pageCount = pw.extractPdf(bytes.length);
|
|
836
|
+
}
|
|
837
|
+
catch (e) {
|
|
838
|
+
this._pdfWasm = null;
|
|
839
|
+
this._pdfMem = null;
|
|
840
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
841
|
+
// Try the OCR fallback before giving up.
|
|
842
|
+
if (this.ocrImage) {
|
|
843
|
+
const recovered = await this._indexPdfViaImagesOnly(file, bytes, msg);
|
|
844
|
+
if (recovered !== null)
|
|
845
|
+
return recovered;
|
|
846
|
+
}
|
|
847
|
+
throw new AlbexParseError('pdf', `PDF text extractor crashed (${msg}). ` +
|
|
848
|
+
(this.ocrImage
|
|
849
|
+
? 'OCR fallback also could not recover any content from this file.'
|
|
850
|
+
: 'Enable OCR via @albex/ocr to attempt image-based extraction as a fallback.'));
|
|
851
|
+
}
|
|
852
|
+
// Refresh memory once more — extractPdf can grow it too.
|
|
853
|
+
pm = pw.memory;
|
|
291
854
|
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
292
855
|
this._wasm.beginDocument();
|
|
293
856
|
if (pageCount === -2) {
|
|
294
|
-
// Image-only PDF
|
|
857
|
+
// Image-only (scanned) PDF. If OCR is wired AND the PDF binary
|
|
858
|
+
// supports image extraction, fall through to the scanned-PDF path.
|
|
859
|
+
// Otherwise keep today's behaviour: register the doc with 0 chunks
|
|
860
|
+
// so the user sees the file in the index but searches won't hit it.
|
|
861
|
+
const supportsImages = typeof pw.extractPageImages === 'function'
|
|
862
|
+
&& typeof pw.getPageCount === 'function';
|
|
863
|
+
if (this.ocrImage && supportsImages) {
|
|
864
|
+
await this._indexPdfScanned(pw);
|
|
865
|
+
}
|
|
295
866
|
return this._wasm.endDocument();
|
|
296
867
|
}
|
|
297
868
|
if (pageCount < 0) {
|
|
298
869
|
const errLen = pw.getErrorLen();
|
|
299
870
|
const errPtr = pw.getErrorPtr();
|
|
300
871
|
const msg = errLen > 0
|
|
301
|
-
?
|
|
872
|
+
? _dec.decode(new Uint8Array(pm.buffer, errPtr, errLen))
|
|
302
873
|
: 'PDF parse error';
|
|
303
|
-
throw new
|
|
874
|
+
throw new AlbexParseError('pdf', msg);
|
|
304
875
|
}
|
|
305
876
|
for (let p = 0; p < pageCount; p++) {
|
|
306
877
|
const len = pw.getPageLen(p);
|
|
307
878
|
if (!len)
|
|
308
879
|
continue;
|
|
309
|
-
|
|
880
|
+
// Re-read memory each iteration — feedText writes into the main
|
|
881
|
+
// WASM, but reading the PDF page pointers requires the live PDF
|
|
882
|
+
// memory which may have been grown by intermediate calls.
|
|
883
|
+
const liveMem = pw.memory;
|
|
884
|
+
const text = _dec.decode(new Uint8Array(liveMem.buffer, pw.getPagePtr(p), len));
|
|
310
885
|
this._feedText(text);
|
|
311
886
|
this._wasm.flushParagraph();
|
|
312
887
|
}
|
|
888
|
+
// Hybrid OCR pass: when the OCR adapter is wired with
|
|
889
|
+
// `options.alwaysExtractEmbeddedImages: true`, also walk every page
|
|
890
|
+
// for embedded images and OCR them on top of the vector text.
|
|
891
|
+
if (this._ocrAdapter
|
|
892
|
+
&& this._ocrAdapter.options?.alwaysExtractEmbeddedImages
|
|
893
|
+
&& typeof pw.extractPageImages === 'function'
|
|
894
|
+
&& typeof pw.getPageCount === 'function') {
|
|
895
|
+
const totalPages = pw.getPageCount();
|
|
896
|
+
for (let p = 0; p < totalPages; p++) {
|
|
897
|
+
const ocrText = await this._ocrPageEmbeddedImages(pw, p);
|
|
898
|
+
if (ocrText === null)
|
|
899
|
+
break; // WASM trapped, stop hybrid pass.
|
|
900
|
+
if (ocrText) {
|
|
901
|
+
this._feedText(ocrText);
|
|
902
|
+
this._wasm.flushParagraph();
|
|
903
|
+
}
|
|
904
|
+
}
|
|
905
|
+
}
|
|
906
|
+
return this._wasm.endDocument();
|
|
907
|
+
}
|
|
908
|
+
/**
|
|
909
|
+
* Scanned-PDF OCR fallback. Called from `_indexPdf` when `extractPdf`
|
|
910
|
+
* returns `-2` (image-only PDF) AND `@albex/ocr` has been wired via
|
|
911
|
+
* `enableOcr(engine)`.
|
|
912
|
+
*
|
|
913
|
+
* Walks every page of the PDF, extracts embedded JPEG / JPEG2000 image
|
|
914
|
+
* XObjects, runs each through `engine.ocrImage`, and feeds the recognised
|
|
915
|
+
* text into the index — one paragraph per page so search snippets stay
|
|
916
|
+
* tied to the page they came from.
|
|
917
|
+
*
|
|
918
|
+
* Failure modes handled here (none re-thrown — the goal is best-effort
|
|
919
|
+
* indexing, not all-or-nothing):
|
|
920
|
+
*
|
|
921
|
+
* * A page's `extractPageImages` traps the WASM instance: the instance
|
|
922
|
+
* is discarded so the next PDF starts fresh, and we stop iterating
|
|
923
|
+
* (no more pages can be read from a poisoned instance). The doc is
|
|
924
|
+
* still committed with whatever text we got from earlier pages.
|
|
925
|
+
* * An individual image fails to OCR (Tesseract decode error, JP2 not
|
|
926
|
+
* supported in this browser, etc.): we skip that image and keep
|
|
927
|
+
* going. Partial coverage beats nothing.
|
|
928
|
+
* * A page yields no extractable images (e.g. uses Flate/CCITT/JBIG2):
|
|
929
|
+
* no paragraph is emitted; the page contributes 0 chunks.
|
|
930
|
+
*/
|
|
931
|
+
async _indexPdfScanned(pw) {
|
|
932
|
+
if (!this.ocrImage)
|
|
933
|
+
return;
|
|
934
|
+
const totalPages = pw.getPageCount();
|
|
935
|
+
if (!totalPages)
|
|
936
|
+
return;
|
|
937
|
+
for (let p = 0; p < totalPages; p++) {
|
|
938
|
+
const pageText = await this._ocrPageEmbeddedImages(pw, p);
|
|
939
|
+
if (pageText === null)
|
|
940
|
+
return; // WASM poisoned mid-iteration.
|
|
941
|
+
if (pageText) {
|
|
942
|
+
this._feedText(pageText);
|
|
943
|
+
this._wasm.flushParagraph();
|
|
944
|
+
}
|
|
945
|
+
}
|
|
946
|
+
}
|
|
947
|
+
/**
|
|
948
|
+
* Walk one page's embedded image XObjects, OCR each image, and return
|
|
949
|
+
* the joined recognised text for that page.
|
|
950
|
+
*
|
|
951
|
+
* Used by:
|
|
952
|
+
* - `_indexPdfScanned`: image-only PDFs (extractPdf returned -2).
|
|
953
|
+
* - `_indexPdf` hybrid path: when `ocrConfig.alwaysExtractEmbeddedImages`
|
|
954
|
+
* is set, every page goes through here on top of the normal text
|
|
955
|
+
* extraction.
|
|
956
|
+
*
|
|
957
|
+
* Returns:
|
|
958
|
+
* - The recognised text (possibly empty if the page has no qualifying
|
|
959
|
+
* images or every OCR call failed).
|
|
960
|
+
* - `null` if the PDF WASM trapped during extractPageImages — the
|
|
961
|
+
* caller should abort the remaining pages because the instance is
|
|
962
|
+
* now poisoned.
|
|
963
|
+
*
|
|
964
|
+
* Failure-handling philosophy: best-effort. An OCR failure on one image
|
|
965
|
+
* does not stop the page; a page with no images does not stop the doc;
|
|
966
|
+
* only a WASM trap stops the doc.
|
|
967
|
+
*/
|
|
968
|
+
async _ocrPageEmbeddedImages(pw, page) {
|
|
969
|
+
const ocr = this.ocrImage;
|
|
970
|
+
if (!ocr)
|
|
971
|
+
return '';
|
|
972
|
+
let imageCount;
|
|
973
|
+
try {
|
|
974
|
+
imageCount = pw.extractPageImages(page);
|
|
975
|
+
}
|
|
976
|
+
catch (e) {
|
|
977
|
+
// The PDF module just trapped — it is now poisoned. Drop our refs
|
|
978
|
+
// so `_ensurePdfWasm` re-instantiates on the next call.
|
|
979
|
+
this._pdfWasm = null;
|
|
980
|
+
this._pdfMem = null;
|
|
981
|
+
this._diag({
|
|
982
|
+
kind: 'skipped', stage: 'pdf', page: page + 1,
|
|
983
|
+
message: `PDF image extractor trapped: ${e instanceof Error ? e.message : String(e)}. Remaining pages skipped.`,
|
|
984
|
+
});
|
|
985
|
+
return null;
|
|
986
|
+
}
|
|
987
|
+
if (imageCount <= 0)
|
|
988
|
+
return '';
|
|
989
|
+
// The buffer view must be re-acquired AFTER extractPageImages —
|
|
990
|
+
// it may have grown the linear memory and detached old views.
|
|
991
|
+
const liveMem = pw.memory;
|
|
992
|
+
let pageText = '';
|
|
993
|
+
for (let i = 0; i < imageCount; i++) {
|
|
994
|
+
const len = pw.getPageImageLen(i);
|
|
995
|
+
if (!len)
|
|
996
|
+
continue;
|
|
997
|
+
const ptr = pw.getPageImagePtr(i);
|
|
998
|
+
const kind = pw.getPageImageKind(i);
|
|
999
|
+
const mime = kind === 1 ? 'image/jpeg'
|
|
1000
|
+
: kind === 2 ? 'image/jp2'
|
|
1001
|
+
: 'application/octet-stream';
|
|
1002
|
+
// Snapshot the image bytes into a fresh ArrayBuffer. The pointer
|
|
1003
|
+
// returned by getPageImagePtr is only valid until the next
|
|
1004
|
+
// extractPageImages / extractPdf call, so we cannot hold the view.
|
|
1005
|
+
const copy = new Uint8Array(len);
|
|
1006
|
+
copy.set(new Uint8Array(liveMem.buffer, ptr, len));
|
|
1007
|
+
const blob = new Blob([copy.buffer], { type: mime });
|
|
1008
|
+
try {
|
|
1009
|
+
const { text } = await ocr(blob);
|
|
1010
|
+
const trimmed = text?.trim();
|
|
1011
|
+
if (trimmed) {
|
|
1012
|
+
pageText = pageText ? `${pageText} ${trimmed}` : trimmed;
|
|
1013
|
+
}
|
|
1014
|
+
}
|
|
1015
|
+
catch (e) {
|
|
1016
|
+
// Image-level OCR failure — skip and continue. JP2 in browsers
|
|
1017
|
+
// without native support lands here; so do truncated or
|
|
1018
|
+
// unsupported JPEG variants. Worker aborts (Tesseract.js
|
|
1019
|
+
// "Aborted(-1)") are also caught here; if they bypass the
|
|
1020
|
+
// promise rejection and surface as `uncaught` instead, the
|
|
1021
|
+
// demo's window.onerror handler will keep the app alive.
|
|
1022
|
+
this._diag({
|
|
1023
|
+
kind: 'skipped', stage: 'ocr', page: page + 1,
|
|
1024
|
+
message: `OCR failed on image ${i + 1}: ${e instanceof Error ? e.message : String(e)}`,
|
|
1025
|
+
});
|
|
1026
|
+
}
|
|
1027
|
+
}
|
|
1028
|
+
return pageText;
|
|
1029
|
+
}
|
|
1030
|
+
/**
|
|
1031
|
+
* Last-chance OCR path used when `extractPdf` itself trapped (pdf-extract
|
|
1032
|
+
* crashed but lopdf may still be able to walk the file). Re-instantiates
|
|
1033
|
+
* the PDF WASM, reloads the input bytes, and tries the image-extraction
|
|
1034
|
+
* route directly — bypassing the text codec entirely.
|
|
1035
|
+
*
|
|
1036
|
+
* Returns:
|
|
1037
|
+
* * the doc's chunk count on success (even 0 — that means lopdf could
|
|
1038
|
+
* parse but no qualifying images existed, which still beats a hard
|
|
1039
|
+
* parse error),
|
|
1040
|
+
* * null if the recovery itself failed (binary lacks the image exports,
|
|
1041
|
+
* re-instantiation failed, or lopdf also trapped). In the null case
|
|
1042
|
+
* the caller throws AlbexParseError so the user sees a clear message.
|
|
1043
|
+
*/
|
|
1044
|
+
async _indexPdfViaImagesOnly(file, bytes, originalError) {
|
|
1045
|
+
try {
|
|
1046
|
+
await this._ensurePdfWasm();
|
|
1047
|
+
}
|
|
1048
|
+
catch {
|
|
1049
|
+
return null;
|
|
1050
|
+
}
|
|
1051
|
+
const pw = this._pdfWasm;
|
|
1052
|
+
if (!pw)
|
|
1053
|
+
return null;
|
|
1054
|
+
const supportsImages = typeof pw.extractPageImages === 'function'
|
|
1055
|
+
&& typeof pw.getPageCount === 'function';
|
|
1056
|
+
if (!supportsImages)
|
|
1057
|
+
return null;
|
|
1058
|
+
// Reload input bytes into the fresh instance. allocInput may grow the
|
|
1059
|
+
// memory, so re-acquire the buffer view immediately after.
|
|
1060
|
+
let inPtr;
|
|
1061
|
+
try {
|
|
1062
|
+
inPtr = pw.allocInput(bytes.length);
|
|
1063
|
+
new Uint8Array(pw.memory.buffer, inPtr, bytes.length).set(bytes);
|
|
1064
|
+
}
|
|
1065
|
+
catch (e) {
|
|
1066
|
+
this._diag({
|
|
1067
|
+
kind: 'skipped', stage: 'pdf',
|
|
1068
|
+
message: `PDF re-load after extractor crash failed: ${e instanceof Error ? e.message : String(e)}`,
|
|
1069
|
+
});
|
|
1070
|
+
return null;
|
|
1071
|
+
}
|
|
1072
|
+
// Set up the doc and let _indexPdfScanned do the page-by-page walk.
|
|
1073
|
+
// _indexPdfScanned tolerates lopdf failing mid-stream — it caches the
|
|
1074
|
+
// poisoned instance and returns early. If lopdf trips on the very
|
|
1075
|
+
// first page, no paragraphs are emitted and we end up with 0 chunks.
|
|
1076
|
+
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
1077
|
+
this._wasm.beginDocument();
|
|
1078
|
+
this._diag({
|
|
1079
|
+
kind: 'fallback', stage: 'pdf', file: file.name,
|
|
1080
|
+
message: `pdf-extract failed (${originalError}); attempting OCR-only fallback via lopdf`,
|
|
1081
|
+
});
|
|
1082
|
+
await this._indexPdfScanned(pw);
|
|
313
1083
|
return this._wasm.endDocument();
|
|
314
1084
|
}
|
|
315
|
-
async _indexTxt(file) {
|
|
316
|
-
const text =
|
|
1085
|
+
async _indexTxt(file, bytes) {
|
|
1086
|
+
const text = _dec.decode(bytes);
|
|
317
1087
|
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
318
1088
|
this._wasm.beginDocument();
|
|
319
1089
|
for (const para of text.split(/\n{2,}/)) {
|
|
@@ -325,8 +1095,8 @@ export class AlbexEngine {
|
|
|
325
1095
|
}
|
|
326
1096
|
return this._wasm.endDocument();
|
|
327
1097
|
}
|
|
328
|
-
async _indexXml(file) {
|
|
329
|
-
const plain =
|
|
1098
|
+
async _indexXml(file, bytes) {
|
|
1099
|
+
const plain = _dec.decode(bytes)
|
|
330
1100
|
.replace(/<[^]*?>/g, '\n')
|
|
331
1101
|
.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>')
|
|
332
1102
|
.replace(/"/g, '"').replace(/'/g, "'")
|
|
@@ -342,54 +1112,804 @@ export class AlbexEngine {
|
|
|
342
1112
|
}
|
|
343
1113
|
return this._wasm.endDocument();
|
|
344
1114
|
}
|
|
1115
|
+
// ── Markdown ─────────────────────────────────────────────────────────────
|
|
1116
|
+
// Strip CommonMark inline marks but keep word content. Paragraphs split on
|
|
1117
|
+
// blank lines, same convention as TXT/XML.
|
|
1118
|
+
async _indexMd(file, bytes) {
|
|
1119
|
+
const text = _dec.decode(bytes)
|
|
1120
|
+
// Remove fenced code blocks entirely (often noisy for search relevance).
|
|
1121
|
+
.replace(/```[\s\S]*?```/g, '\n')
|
|
1122
|
+
.replace(/~~~[\s\S]*?~~~/g, '\n')
|
|
1123
|
+
// Strip ATX heading markers but keep heading text.
|
|
1124
|
+
.replace(/^#{1,6}\s+/gm, '')
|
|
1125
|
+
// Replace inline links/images with their visible text.
|
|
1126
|
+
.replace(/!\[([^\]]*)\]\([^)]*\)/g, '$1')
|
|
1127
|
+
.replace(/\[([^\]]+)\]\([^)]*\)/g, '$1')
|
|
1128
|
+
// Strip emphasis markers (preserve content).
|
|
1129
|
+
.replace(/(\*\*|__|\*|_)/g, '')
|
|
1130
|
+
// Inline code.
|
|
1131
|
+
.replace(/`([^`]+)`/g, '$1')
|
|
1132
|
+
// Blockquote marks.
|
|
1133
|
+
.replace(/^>\s?/gm, '')
|
|
1134
|
+
// List markers.
|
|
1135
|
+
.replace(/^\s*[-*+]\s+/gm, '')
|
|
1136
|
+
.replace(/^\s*\d+\.\s+/gm, '');
|
|
1137
|
+
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
1138
|
+
this._wasm.beginDocument();
|
|
1139
|
+
for (const para of text.split(/\n{2,}/)) {
|
|
1140
|
+
const l = para.replace(/\n/g, ' ').trim();
|
|
1141
|
+
if (l) {
|
|
1142
|
+
this._feedText(l);
|
|
1143
|
+
this._wasm.flushParagraph();
|
|
1144
|
+
}
|
|
1145
|
+
}
|
|
1146
|
+
return this._wasm.endDocument();
|
|
1147
|
+
}
|
|
1148
|
+
// ── HTML ─────────────────────────────────────────────────────────────────
|
|
1149
|
+
// Strip <script>/<style> entire blocks, then drop tag markup. The output is
|
|
1150
|
+
// chunked at <p>, <br>, <h*>, <li>, <tr> boundaries (mapped to paragraph
|
|
1151
|
+
// breaks) so search location numbers map naturally to the document outline.
|
|
1152
|
+
async _indexHtml(file, bytes) {
|
|
1153
|
+
const html = _dec.decode(bytes)
|
|
1154
|
+
.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, ' ')
|
|
1155
|
+
.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, ' ')
|
|
1156
|
+
// Treat block-level closers as paragraph separators.
|
|
1157
|
+
.replace(/<\/(p|h[1-6]|li|tr|div|section|article|header|footer)\s*>/gi, '\n\n')
|
|
1158
|
+
.replace(/<br\s*\/?\s*>/gi, '\n')
|
|
1159
|
+
// Drop remaining tags.
|
|
1160
|
+
.replace(/<[^>]+>/g, ' ')
|
|
1161
|
+
// Decode common entities (full set would need a table; this covers >95%).
|
|
1162
|
+
.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>')
|
|
1163
|
+
.replace(/"/g, '"').replace(/'/g, "'").replace(/ /g, ' ')
|
|
1164
|
+
.replace(/&#(\d+);/g, (_, n) => String.fromCodePoint(Number(n)))
|
|
1165
|
+
.replace(/&#x([0-9a-f]+);/gi, (_, n) => String.fromCodePoint(parseInt(n, 16)))
|
|
1166
|
+
.replace(/[ \t]+/g, ' ');
|
|
1167
|
+
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
1168
|
+
this._wasm.beginDocument();
|
|
1169
|
+
for (const para of html.split(/\n{2,}/)) {
|
|
1170
|
+
const l = para.replace(/\n/g, ' ').trim();
|
|
1171
|
+
if (l) {
|
|
1172
|
+
this._feedText(l);
|
|
1173
|
+
this._wasm.flushParagraph();
|
|
1174
|
+
}
|
|
1175
|
+
}
|
|
1176
|
+
return this._wasm.endDocument();
|
|
1177
|
+
}
|
|
1178
|
+
// ── JSON ─────────────────────────────────────────────────────────────────
|
|
1179
|
+
// Extract every string value (keys + leaf strings) recursively. Each leaf
|
|
1180
|
+
// becomes its own searchable chunk via paragraph flush. Numbers/booleans
|
|
1181
|
+
// are skipped (cannot match a textual query usefully).
|
|
1182
|
+
async _indexJson(file, bytes) {
|
|
1183
|
+
let root;
|
|
1184
|
+
try {
|
|
1185
|
+
root = JSON.parse(_dec.decode(bytes));
|
|
1186
|
+
}
|
|
1187
|
+
catch (e) {
|
|
1188
|
+
throw new AlbexParseError('json', e.message);
|
|
1189
|
+
}
|
|
1190
|
+
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
1191
|
+
this._wasm.beginDocument();
|
|
1192
|
+
const visit = (v) => {
|
|
1193
|
+
if (typeof v === 'string') {
|
|
1194
|
+
if (v.trim()) {
|
|
1195
|
+
this._feedText(v);
|
|
1196
|
+
this._wasm.flushParagraph();
|
|
1197
|
+
}
|
|
1198
|
+
}
|
|
1199
|
+
else if (Array.isArray(v)) {
|
|
1200
|
+
for (const x of v)
|
|
1201
|
+
visit(x);
|
|
1202
|
+
}
|
|
1203
|
+
else if (v && typeof v === 'object') {
|
|
1204
|
+
for (const [k, x] of Object.entries(v)) {
|
|
1205
|
+
if (k.trim()) {
|
|
1206
|
+
this._feedText(k);
|
|
1207
|
+
this._wasm.flushParagraph();
|
|
1208
|
+
}
|
|
1209
|
+
visit(x);
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
1212
|
+
};
|
|
1213
|
+
visit(root);
|
|
1214
|
+
return this._wasm.endDocument();
|
|
1215
|
+
}
|
|
1216
|
+
// ── CSV ──────────────────────────────────────────────────────────────────
|
|
1217
|
+
// RFC 4180 lite: comma-separated, optional double quotes, escaped "" inside
|
|
1218
|
+
// quoted fields. Each row becomes one paragraph (location = row index, with
|
|
1219
|
+
// header row at location 0).
|
|
1220
|
+
async _indexCsv(file, bytes) {
|
|
1221
|
+
// Strip an optional UTF-8 BOM. Excel writes it by default for "CSV UTF-8";
|
|
1222
|
+
// without this fix the first field of the first row would start with
|
|
1223
|
+
// U+FEFF, which both shifts column alignment when consumers split on a
|
|
1224
|
+
// field name and breaks search hits on "Subject" / "Asunto" etc.
|
|
1225
|
+
let text = _dec.decode(bytes);
|
|
1226
|
+
if (text.charCodeAt(0) === 0xFEFF)
|
|
1227
|
+
text = text.slice(1);
|
|
1228
|
+
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
1229
|
+
this._wasm.beginDocument();
|
|
1230
|
+
let row = [];
|
|
1231
|
+
let field = '';
|
|
1232
|
+
let inQuoted = false;
|
|
1233
|
+
const flushRow = () => {
|
|
1234
|
+
const line = row.join(' ').trim();
|
|
1235
|
+
if (line) {
|
|
1236
|
+
this._feedText(line);
|
|
1237
|
+
this._wasm.flushParagraph();
|
|
1238
|
+
}
|
|
1239
|
+
row = [];
|
|
1240
|
+
};
|
|
1241
|
+
for (let i = 0; i < text.length; i++) {
|
|
1242
|
+
const c = text[i];
|
|
1243
|
+
if (inQuoted) {
|
|
1244
|
+
if (c === '"') {
|
|
1245
|
+
if (text[i + 1] === '"') {
|
|
1246
|
+
field += '"';
|
|
1247
|
+
i++;
|
|
1248
|
+
}
|
|
1249
|
+
else
|
|
1250
|
+
inQuoted = false;
|
|
1251
|
+
}
|
|
1252
|
+
else
|
|
1253
|
+
field += c;
|
|
1254
|
+
}
|
|
1255
|
+
else {
|
|
1256
|
+
if (c === ',') {
|
|
1257
|
+
row.push(field);
|
|
1258
|
+
field = '';
|
|
1259
|
+
}
|
|
1260
|
+
else if (c === '\n') {
|
|
1261
|
+
row.push(field);
|
|
1262
|
+
field = '';
|
|
1263
|
+
flushRow();
|
|
1264
|
+
}
|
|
1265
|
+
else if (c === '\r') { /* skip */ }
|
|
1266
|
+
else if (c === '"' && field.length === 0)
|
|
1267
|
+
inQuoted = true;
|
|
1268
|
+
else
|
|
1269
|
+
field += c;
|
|
1270
|
+
}
|
|
1271
|
+
}
|
|
1272
|
+
if (field.length > 0 || row.length > 0) {
|
|
1273
|
+
row.push(field);
|
|
1274
|
+
flushRow();
|
|
1275
|
+
}
|
|
1276
|
+
return this._wasm.endDocument();
|
|
1277
|
+
}
|
|
1278
|
+
// ── EML / MBOX ───────────────────────────────────────────────────────────
|
|
1279
|
+
// Minimal MIME: parse the first text/plain body. Headers From/To/Subject
|
|
1280
|
+
// are indexed as separate paragraphs so they're individually searchable.
|
|
1281
|
+
//
|
|
1282
|
+
// What's decoded:
|
|
1283
|
+
// * Content-Transfer-Encoding: base64 → decoded.
|
|
1284
|
+
// * Content-Transfer-Encoding: quoted-printable → decoded.
|
|
1285
|
+
// * Content-Transfer-Encoding: 7bit / 8bit → pass-through.
|
|
1286
|
+
// * Nested multipart (multipart/alternative inside multipart/mixed) by
|
|
1287
|
+
// recursively walking boundaries until a text/plain section is found.
|
|
1288
|
+
//
|
|
1289
|
+
// What's not decoded (out of scope for this "lite" parser):
|
|
1290
|
+
// * Encoded-word headers (=?utf-8?Q?...?=) — only the raw bytes go in.
|
|
1291
|
+
// * Charset conversions other than UTF-8 — assumes the body decodes as UTF-8.
|
|
1292
|
+
// * HTML-only emails — they're dropped if no text/plain part is present.
|
|
1293
|
+
// * MBOX format (multiple emails concatenated). Each email needs to be
|
|
1294
|
+
// fed separately.
|
|
1295
|
+
async _indexEml(file, bytes) {
|
|
1296
|
+
const raw = _dec.decode(bytes).replace(/\r\n/g, '\n');
|
|
1297
|
+
const headerEnd = raw.indexOf('\n\n');
|
|
1298
|
+
const headersBlock = headerEnd > 0 ? raw.slice(0, headerEnd) : raw;
|
|
1299
|
+
const body = headerEnd > 0 ? raw.slice(headerEnd + 2) : '';
|
|
1300
|
+
const header = (block, name) => {
|
|
1301
|
+
const m = new RegExp(`^${name}:\\s*(.+(?:\\n[ \\t].+)*)`, 'mi').exec(block);
|
|
1302
|
+
return m ? (m[1] ?? '').replace(/\n[ \t]+/g, ' ').trim() : '';
|
|
1303
|
+
};
|
|
1304
|
+
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
1305
|
+
this._wasm.beginDocument();
|
|
1306
|
+
const subj = header(headersBlock, 'Subject');
|
|
1307
|
+
const from = header(headersBlock, 'From');
|
|
1308
|
+
const to = header(headersBlock, 'To');
|
|
1309
|
+
for (const h of [subj, from, to]) {
|
|
1310
|
+
if (h) {
|
|
1311
|
+
this._feedText(h);
|
|
1312
|
+
this._wasm.flushParagraph();
|
|
1313
|
+
}
|
|
1314
|
+
}
|
|
1315
|
+
const plain = this._extractEmlTextPlain(headersBlock, body, header) ?? body;
|
|
1316
|
+
for (const para of plain.split(/\n{2,}/)) {
|
|
1317
|
+
const l = para.replace(/\n/g, ' ').trim();
|
|
1318
|
+
if (l) {
|
|
1319
|
+
this._feedText(l);
|
|
1320
|
+
this._wasm.flushParagraph();
|
|
1321
|
+
}
|
|
1322
|
+
}
|
|
1323
|
+
return this._wasm.endDocument();
|
|
1324
|
+
}
|
|
1325
|
+
/**
|
|
1326
|
+
* Walk the multipart tree until a text/plain section is found. Returns
|
|
1327
|
+
* the decoded body as a string, or null if no text/plain part exists.
|
|
1328
|
+
*
|
|
1329
|
+
* The function is called with the headers and body of the *current*
|
|
1330
|
+
* MIME entity (the top-level message at first, then each multipart child
|
|
1331
|
+
* on recursion). For single-part entities it inspects the entity's own
|
|
1332
|
+
* Content-Transfer-Encoding and decodes accordingly.
|
|
1333
|
+
*/
|
|
1334
|
+
_extractEmlTextPlain(headersBlock, body, header) {
|
|
1335
|
+
const contentType = header(headersBlock, 'Content-Type');
|
|
1336
|
+
const boundary = /boundary="?([^";]+)"?/i.exec(contentType)?.[1];
|
|
1337
|
+
if (!boundary) {
|
|
1338
|
+
// Single-part body. If it claims to be text/plain (the default when
|
|
1339
|
+
// Content-Type is absent), apply Transfer-Encoding decoding here.
|
|
1340
|
+
// Anything else (text/html, application/*) gets returned raw — the
|
|
1341
|
+
// top-level caller still feeds it as text, but searches against
|
|
1342
|
+
// genuinely binary payloads will not hit anything useful.
|
|
1343
|
+
if (contentType === '' || /text\/plain/i.test(contentType)) {
|
|
1344
|
+
return decodeEmlBody(headersBlock, body, header);
|
|
1345
|
+
}
|
|
1346
|
+
return body;
|
|
1347
|
+
}
|
|
1348
|
+
const parts = body.split(`--${boundary}`);
|
|
1349
|
+
for (const part of parts) {
|
|
1350
|
+
const trimmed = part.replace(/^\n+/, '');
|
|
1351
|
+
const ph = trimmed.indexOf('\n\n');
|
|
1352
|
+
if (ph < 0)
|
|
1353
|
+
continue;
|
|
1354
|
+
const partHeaders = trimmed.slice(0, ph);
|
|
1355
|
+
const partBody = trimmed.slice(ph + 2);
|
|
1356
|
+
const partCtype = header(partHeaders, 'Content-Type');
|
|
1357
|
+
if (/^multipart\//i.test(partCtype)) {
|
|
1358
|
+
const inner = this._extractEmlTextPlain(partHeaders, partBody, header);
|
|
1359
|
+
if (inner)
|
|
1360
|
+
return inner;
|
|
1361
|
+
continue;
|
|
1362
|
+
}
|
|
1363
|
+
if (/text\/plain/i.test(partCtype)) {
|
|
1364
|
+
return decodeEmlBody(partHeaders, partBody, header);
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
return null;
|
|
1368
|
+
}
|
|
1369
|
+
// ── RTF ──────────────────────────────────────────────────────────────────
|
|
1370
|
+
//
|
|
1371
|
+
// Strip the {\rtf1...} group structure. Control words (\xxx and \xxxN),
|
|
1372
|
+
// hex escapes (\'XX), unicode escapes (\uN ?) and groups are processed;
|
|
1373
|
+
// plain runs are kept.
|
|
1374
|
+
//
|
|
1375
|
+
// Character decoding:
|
|
1376
|
+
// * \'XX → Windows-1252 byte XX. RTF defaults to cp1252 for high-ANSI;
|
|
1377
|
+
// we map the relevant rows (0x80–0x9F differs from Latin-1)
|
|
1378
|
+
// to their Unicode equivalents. Outside that block, the byte
|
|
1379
|
+
// is taken as Latin-1 (which equals Unicode below 0x100).
|
|
1380
|
+
// Result: accents in es/fr/de/it/pt RTF dumps survive.
|
|
1381
|
+
// * \uN ? → Unicode codepoint N (signed 16-bit, negative means N+65536).
|
|
1382
|
+
// Followed by a fallback character which we then skip — Word
|
|
1383
|
+
// writes the ASCII transliteration of the unicode glyph as a
|
|
1384
|
+
// fallback for non-Unicode readers; we ignore it because we
|
|
1385
|
+
// have the real codepoint.
|
|
1386
|
+
// * \- → soft hyphen (drop).
|
|
1387
|
+
// * \~ → non-breaking space.
|
|
1388
|
+
// * \emdash, \endash, \bullet, \lquote, \rquote, \ldblquote, \rdblquote
|
|
1389
|
+
// → their Unicode equivalents.
|
|
1390
|
+
//
|
|
1391
|
+
// What's not handled (assumes Word/Pages/LibreOffice output, where
|
|
1392
|
+
// these aren't load-bearing):
|
|
1393
|
+
// * \ansicpg, \fcharset — we always assume cp1252 for \' escapes.
|
|
1394
|
+
// * \bin — binary data with explicit length; rare in document RTF.
|
|
1395
|
+
// * Field codes — rendered as the visible text (good enough for search).
|
|
1396
|
+
async _indexRtf(file, bytes) {
|
|
1397
|
+
const src = _dec.decode(bytes);
|
|
1398
|
+
let out = '';
|
|
1399
|
+
let i = 0;
|
|
1400
|
+
let depth = 0;
|
|
1401
|
+
// Track if we're inside a destination group we should skip (e.g. \fonttbl).
|
|
1402
|
+
let skipDepth = 0;
|
|
1403
|
+
const SKIP_DESTINATIONS = /^\\(fonttbl|colortbl|stylesheet|info|pict|object|header|footer)\b/;
|
|
1404
|
+
while (i < src.length) {
|
|
1405
|
+
const c = src[i];
|
|
1406
|
+
if (c === '{') {
|
|
1407
|
+
depth++;
|
|
1408
|
+
i++;
|
|
1409
|
+
continue;
|
|
1410
|
+
}
|
|
1411
|
+
if (c === '}') {
|
|
1412
|
+
depth--;
|
|
1413
|
+
if (skipDepth > 0 && depth < skipDepth)
|
|
1414
|
+
skipDepth = 0;
|
|
1415
|
+
i++;
|
|
1416
|
+
continue;
|
|
1417
|
+
}
|
|
1418
|
+
if (c === '\\') {
|
|
1419
|
+
// Hex byte escape: \'XX
|
|
1420
|
+
if (src[i + 1] === '\'' && i + 3 < src.length) {
|
|
1421
|
+
const hex = src.slice(i + 2, i + 4);
|
|
1422
|
+
if (/^[0-9A-Fa-f]{2}$/.test(hex)) {
|
|
1423
|
+
if (skipDepth === 0)
|
|
1424
|
+
out += rtfCp1252ToChar(parseInt(hex, 16));
|
|
1425
|
+
i += 4;
|
|
1426
|
+
continue;
|
|
1427
|
+
}
|
|
1428
|
+
// Malformed — drop and advance.
|
|
1429
|
+
i += 2;
|
|
1430
|
+
continue;
|
|
1431
|
+
}
|
|
1432
|
+
// Unicode escape: \uN followed by optional fallback character.
|
|
1433
|
+
// N is signed 16-bit per the spec; negative values mean N + 65536.
|
|
1434
|
+
const um = /^\\u(-?\d+) ?/.exec(src.slice(i));
|
|
1435
|
+
if (um) {
|
|
1436
|
+
let code = parseInt(um[1] ?? '0', 10);
|
|
1437
|
+
if (code < 0)
|
|
1438
|
+
code += 0x10000;
|
|
1439
|
+
if (skipDepth === 0 && code > 0 && code < 0x110000) {
|
|
1440
|
+
out += String.fromCodePoint(code);
|
|
1441
|
+
}
|
|
1442
|
+
i += um[0].length;
|
|
1443
|
+
// Skip the fallback char. Word writes one ASCII char after \uN
|
|
1444
|
+
// (the "uc1" count). We assume uc1, which is the Word default.
|
|
1445
|
+
if (i < src.length && src[i] !== '\\' && src[i] !== '{' && src[i] !== '}') {
|
|
1446
|
+
i++;
|
|
1447
|
+
}
|
|
1448
|
+
continue;
|
|
1449
|
+
}
|
|
1450
|
+
// Control word / symbol.
|
|
1451
|
+
const m = /^\\([A-Za-z]+)(-?\d+)?\s?/.exec(src.slice(i));
|
|
1452
|
+
if (m) {
|
|
1453
|
+
const word = m[1] ?? '';
|
|
1454
|
+
if (skipDepth === 0 && SKIP_DESTINATIONS.test(src.slice(i)))
|
|
1455
|
+
skipDepth = depth;
|
|
1456
|
+
if (skipDepth === 0) {
|
|
1457
|
+
switch (word) {
|
|
1458
|
+
case 'par':
|
|
1459
|
+
case 'line':
|
|
1460
|
+
case 'sect':
|
|
1461
|
+
out += '\n\n';
|
|
1462
|
+
break;
|
|
1463
|
+
case 'tab':
|
|
1464
|
+
out += '\t';
|
|
1465
|
+
break;
|
|
1466
|
+
case 'emdash':
|
|
1467
|
+
out += '—';
|
|
1468
|
+
break;
|
|
1469
|
+
case 'endash':
|
|
1470
|
+
out += '–';
|
|
1471
|
+
break;
|
|
1472
|
+
case 'bullet':
|
|
1473
|
+
out += '•';
|
|
1474
|
+
break;
|
|
1475
|
+
case 'lquote':
|
|
1476
|
+
out += '‘';
|
|
1477
|
+
break;
|
|
1478
|
+
case 'rquote':
|
|
1479
|
+
out += '’';
|
|
1480
|
+
break;
|
|
1481
|
+
case 'ldblquote':
|
|
1482
|
+
out += '“';
|
|
1483
|
+
break;
|
|
1484
|
+
case 'rdblquote':
|
|
1485
|
+
out += '”';
|
|
1486
|
+
break;
|
|
1487
|
+
default: /* drop other control words silently */ break;
|
|
1488
|
+
}
|
|
1489
|
+
}
|
|
1490
|
+
i += m[0].length;
|
|
1491
|
+
continue;
|
|
1492
|
+
}
|
|
1493
|
+
// Escaped single character: \\, \{, \}, \-, \~ etc.
|
|
1494
|
+
if (skipDepth === 0) {
|
|
1495
|
+
const escaped = src[i + 1];
|
|
1496
|
+
if (escaped === '~')
|
|
1497
|
+
out += ' '; // non-breaking space
|
|
1498
|
+
else if (escaped === '-') { /* soft hyphen — drop */ }
|
|
1499
|
+
else if (escaped !== undefined)
|
|
1500
|
+
out += escaped;
|
|
1501
|
+
}
|
|
1502
|
+
i += 2;
|
|
1503
|
+
continue;
|
|
1504
|
+
}
|
|
1505
|
+
if (skipDepth === 0)
|
|
1506
|
+
out += c;
|
|
1507
|
+
i++;
|
|
1508
|
+
}
|
|
1509
|
+
this._wasm.setDocumentName(this._writeStr(file.name));
|
|
1510
|
+
this._wasm.beginDocument();
|
|
1511
|
+
for (const para of out.split(/\n{2,}/)) {
|
|
1512
|
+
const l = para.replace(/\n/g, ' ').trim();
|
|
1513
|
+
if (l) {
|
|
1514
|
+
this._feedText(l);
|
|
1515
|
+
this._wasm.flushParagraph();
|
|
1516
|
+
}
|
|
1517
|
+
}
|
|
1518
|
+
return this._wasm.endDocument();
|
|
1519
|
+
}
|
|
1520
|
+
static _INDEXERS = {
|
|
1521
|
+
docx: (e, f, b) => e._indexDocx(f, b),
|
|
1522
|
+
xlsx: (e, f, b) => e._indexXlsx(f, b),
|
|
1523
|
+
pdf: (e, f, b) => e._indexPdf(f, b),
|
|
1524
|
+
txt: (e, f, b) => e._indexTxt(f, b),
|
|
1525
|
+
xml: (e, f, b) => e._indexXml(f, b),
|
|
1526
|
+
md: (e, f, b) => e._indexMd(f, b),
|
|
1527
|
+
markdown: (e, f, b) => e._indexMd(f, b),
|
|
1528
|
+
html: (e, f, b) => e._indexHtml(f, b),
|
|
1529
|
+
htm: (e, f, b) => e._indexHtml(f, b),
|
|
1530
|
+
json: (e, f, b) => e._indexJson(f, b),
|
|
1531
|
+
csv: (e, f, b) => e._indexCsv(f, b),
|
|
1532
|
+
eml: (e, f, b) => e._indexEml(f, b),
|
|
1533
|
+
rtf: (e, f, b) => e._indexRtf(f, b),
|
|
1534
|
+
};
|
|
345
1535
|
// ── Public API ────────────────────────────────────────────────────────────
|
|
346
1536
|
/**
|
|
347
1537
|
* Index a file. Supported formats: DOCX, XLSX, PDF, TXT, XML.
|
|
348
1538
|
* Throws for unsupported formats or parse errors.
|
|
349
1539
|
*/
|
|
350
1540
|
async indexFile(file) {
|
|
1541
|
+
return this._exclusive(() => this._indexFileInner(file));
|
|
1542
|
+
}
|
|
1543
|
+
async _indexFileInner(file) {
|
|
351
1544
|
const ext = file.name.split('.').pop()?.toLowerCase() ?? '';
|
|
352
1545
|
const indexer = AlbexEngine._INDEXERS[ext];
|
|
353
1546
|
if (!indexer)
|
|
354
|
-
throw new
|
|
1547
|
+
throw new AlbexUnsupportedFormatError(ext);
|
|
1548
|
+
// Hash the source bytes for idempotency. We always read the bytes once
|
|
1549
|
+
// here so the indexer can reuse them — avoids a double File.arrayBuffer().
|
|
1550
|
+
const bytes = new Uint8Array(await file.arrayBuffer());
|
|
1551
|
+
const hash = this._contentHash(bytes);
|
|
1552
|
+
// Idempotency: if a non-deleted doc already has this hash, return it
|
|
1553
|
+
// unchanged. Cheap O(N) scan since MAX_DOCS = 128.
|
|
1554
|
+
const existing = this._docs.find(d => d.contentHash === hash);
|
|
1555
|
+
if (existing)
|
|
1556
|
+
return existing;
|
|
1557
|
+
const w = this._wasm;
|
|
355
1558
|
const t0 = performance.now();
|
|
356
|
-
const textPre =
|
|
357
|
-
const
|
|
1559
|
+
const textPre = w.getTextUsed();
|
|
1560
|
+
const docCountBefore = w.getDocCount();
|
|
1561
|
+
// Snapshot v2: hand the content hash to the WASM so it persists with
|
|
1562
|
+
// the doc. Older binaries (pre-v2) lack this export — we silently skip
|
|
1563
|
+
// and behave like before. The indexer will overwrite the scratchpad
|
|
1564
|
+
// immediately after (with the doc name), which is fine because
|
|
1565
|
+
// setDocumentContentHash copies into pending_content_hash before
|
|
1566
|
+
// returning.
|
|
1567
|
+
if (typeof w.setDocumentContentHash === 'function') {
|
|
1568
|
+
const hashBytes = hashHexToBytes(hash);
|
|
1569
|
+
this._writePad(hashBytes);
|
|
1570
|
+
w.setDocumentContentHash(hashBytes.length);
|
|
1571
|
+
}
|
|
1572
|
+
const chunks = await indexer(this, file, bytes);
|
|
1573
|
+
// Capacity check (0.6.0). The WASM pools fill silently and break out of
|
|
1574
|
+
// their ingest loops; getLastIndexOverflow reports which one filled.
|
|
1575
|
+
// Surface a typed error instead of returning a half-indexed document the
|
|
1576
|
+
// caller cannot tell apart from a complete one (audit finding #3).
|
|
1577
|
+
const overflow = w.getLastIndexOverflow();
|
|
1578
|
+
if (overflow !== 0) {
|
|
1579
|
+
const which = (overflow & 1) ? 'chunks' : (overflow & 2) ? 'text'
|
|
1580
|
+
: (overflow & 4) ? 'docs' : 'names';
|
|
1581
|
+
const pools = [
|
|
1582
|
+
overflow & 1 ? 'chunk pool' : '',
|
|
1583
|
+
overflow & 2 ? 'text pool' : '',
|
|
1584
|
+
overflow & 4 ? 'document table' : '',
|
|
1585
|
+
overflow & 8 ? 'name pool' : '',
|
|
1586
|
+
].filter(Boolean).join(', ');
|
|
1587
|
+
throw new AlbexCapacityError(`Index capacity exceeded while indexing "${file.name}" (${pools} full). ` +
|
|
1588
|
+
`The document was rolled back (not indexed); treat the index as full ` +
|
|
1589
|
+
`(compact(), shard across an AlbexPool, or reset()).`, which);
|
|
1590
|
+
}
|
|
1591
|
+
// The new doc occupies slot `docCountBefore`.
|
|
1592
|
+
const docId = w.getDocId(docCountBefore);
|
|
358
1593
|
const doc = {
|
|
359
1594
|
name: file.name,
|
|
360
1595
|
ext,
|
|
361
1596
|
chunks,
|
|
362
1597
|
indexTimeMs: performance.now() - t0,
|
|
363
|
-
textBytes:
|
|
1598
|
+
textBytes: w.getTextUsed() - textPre,
|
|
1599
|
+
docId,
|
|
1600
|
+
contentHash: hash,
|
|
364
1601
|
};
|
|
365
1602
|
this._docs.push(doc);
|
|
366
1603
|
return doc;
|
|
367
1604
|
}
|
|
1605
|
+
/**
|
|
1606
|
+
* Mark a previously indexed document as removed. Searches no longer return
|
|
1607
|
+
* its chunks. Storage is reclaimed only after `compact()`.
|
|
1608
|
+
*
|
|
1609
|
+
* `id` can be the file name or the contentHash returned by `indexFile`.
|
|
1610
|
+
* Returns `true` if a matching document was found and tombstoned.
|
|
1611
|
+
*/
|
|
1612
|
+
removeDocument(id) {
|
|
1613
|
+
this._assertIdle('removeDocument');
|
|
1614
|
+
return this._removeDocumentInner(id);
|
|
1615
|
+
}
|
|
1616
|
+
_removeDocumentInner(id) {
|
|
1617
|
+
const doc = this._docs.find(d => d.name === id || d.contentHash === id);
|
|
1618
|
+
if (!doc)
|
|
1619
|
+
return false;
|
|
1620
|
+
const ok = this._wasm.removeDocument(doc.docId) === 1;
|
|
1621
|
+
if (ok) {
|
|
1622
|
+
this._docs = this._docs.filter(d => d !== doc);
|
|
1623
|
+
}
|
|
1624
|
+
return ok;
|
|
1625
|
+
}
|
|
1626
|
+
/**
|
|
1627
|
+
* Replace a previously indexed document with new content. Equivalent to
|
|
1628
|
+
* `removeDocument(name)` + `indexFile(newFile)` but does not trigger the
|
|
1629
|
+
* idempotency check (so re-indexing the *same* bytes after a remove works).
|
|
1630
|
+
*/
|
|
1631
|
+
async replaceDocument(name, newFile) {
|
|
1632
|
+
return this._exclusive(async () => {
|
|
1633
|
+
this._removeDocumentInner(name);
|
|
1634
|
+
// Index directly via the inner path (we already hold the lock).
|
|
1635
|
+
const doc = await this._indexFileInner(newFile);
|
|
1636
|
+
// Repeated replaces leave tombstones in the text pool; reclaim under
|
|
1637
|
+
// pressure so the pool isn't silently exhausted (audit finding #7).
|
|
1638
|
+
this._autoCompactIfNeeded();
|
|
1639
|
+
return doc;
|
|
1640
|
+
});
|
|
1641
|
+
}
|
|
1642
|
+
/**
|
|
1643
|
+
* Reclaim storage from previously removed documents. Compacts CHUNKS,
|
|
1644
|
+
* TEXT_POOL, DOC_NAMES and NAME_POOL in place. Idempotent.
|
|
1645
|
+
*
|
|
1646
|
+
* Note: doc_ids of surviving documents are preserved, so any stored
|
|
1647
|
+
* references (e.g. in a UI) remain valid.
|
|
1648
|
+
*/
|
|
1649
|
+
compact() {
|
|
1650
|
+
this._assertIdle('compact');
|
|
1651
|
+
this._wasm.compact();
|
|
1652
|
+
}
|
|
368
1653
|
/**
|
|
369
1654
|
* Search the index. Supports:
|
|
370
1655
|
* - Simple queries: `contrato` (AND of tokens, accent-insensitive)
|
|
371
1656
|
* - Phrase queries: `"contrato marco"` (must appear as phrase)
|
|
372
1657
|
* - OR queries: `contrato | acuerdo` (union of two searches)
|
|
1658
|
+
*
|
|
1659
|
+
* Pass `{ windowed: true }` to receive cropped snippets with ASCII ellipsis
|
|
1660
|
+
* markers instead of full chunk text. Defaults: 60 bytes before, 120 after.
|
|
1661
|
+
*/
|
|
1662
|
+
search(query, opts = {}) {
|
|
1663
|
+
this._assertIdle('search');
|
|
1664
|
+
const w = this._wasm;
|
|
1665
|
+
const ql = this._writeStr(query);
|
|
1666
|
+
const kind = w.prepareQuery(ql);
|
|
1667
|
+
if (kind < 0)
|
|
1668
|
+
return [];
|
|
1669
|
+
if (kind === 2) {
|
|
1670
|
+
// OR: iterate branches and merge in TS. WASM stores compiled
|
|
1671
|
+
// branches internally so we never re-tokenize on the host.
|
|
1672
|
+
return this._searchOr(query, opts);
|
|
1673
|
+
}
|
|
1674
|
+
w.selectQueryBranch(0);
|
|
1675
|
+
// Phrase queries (kind 1) post-filter on adjacency. Pass the tokens down
|
|
1676
|
+
// so the check runs against the FULL chunk text, not a cropped windowed
|
|
1677
|
+
// snippet — otherwise `{ windowed: true }` could drop a valid phrase hit
|
|
1678
|
+
// whose second term fell outside the window (audit finding #7).
|
|
1679
|
+
const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
|
|
1680
|
+
return this._runSearch(query, opts, phraseTokens);
|
|
1681
|
+
}
|
|
1682
|
+
/** Read the WASM-compiled tokens of branch `i` for phrase post-filter.
|
|
1683
|
+
* The bytes returned are exactly what the WASM tokenizer produced —
|
|
1684
|
+
* no TS re-tokenization. */
|
|
1685
|
+
_branchTokens(i) {
|
|
1686
|
+
const n = this._wasm.getQueryBranchPattern(i);
|
|
1687
|
+
if (n === 0)
|
|
1688
|
+
return [];
|
|
1689
|
+
const pattern = this._readPad(n);
|
|
1690
|
+
return pattern.split(' ').filter(t => t.length > 0);
|
|
1691
|
+
}
|
|
1692
|
+
/**
|
|
1693
|
+
* Cooperative search. Processes the corpus in slices, yielding to the
|
|
1694
|
+
* event loop between them so the host UI thread keeps a chance to paint
|
|
1695
|
+
* even while a long scan is in flight.
|
|
1696
|
+
*
|
|
1697
|
+
* NOTE: this is NOT incremental streaming. Results are materialised
|
|
1698
|
+
* once the search completes and then iterated out in score-descending
|
|
1699
|
+
* order. The async iterator shape is preserved because the work that
|
|
1700
|
+
* produces those results genuinely yields to the scheduler between
|
|
1701
|
+
* slices — a future iteration may stream individual results before the
|
|
1702
|
+
* heap sorts, but doing so today would deliver them in arbitrary order.
|
|
1703
|
+
*
|
|
1704
|
+
* Pass `opts.frameBudgetMs` to control the slice size (default 8 ms).
|
|
1705
|
+
*/
|
|
1706
|
+
async *searchCooperative(query, opts = {}) {
|
|
1707
|
+
// Collect under the exclusivity lock so no other engine op interleaves at
|
|
1708
|
+
// a slice boundary; the per-slice scheduler yields still happen inside.
|
|
1709
|
+
const results = await this._exclusive(() => this._searchCooperativeCollect(query, opts));
|
|
1710
|
+
for (const r of results)
|
|
1711
|
+
yield r;
|
|
1712
|
+
}
|
|
1713
|
+
/** Materialise a cooperative search to a sorted result array. Runs inside
|
|
1714
|
+
* the exclusivity lock. Frame-budget yielding lives in _runSearchBudgeted. */
|
|
1715
|
+
async _searchCooperativeCollect(query, opts) {
|
|
1716
|
+
const budget = opts.frameBudgetMs ?? 8;
|
|
1717
|
+
const w = this._wasm;
|
|
1718
|
+
const ql = this._writeStr(query);
|
|
1719
|
+
const kind = w.prepareQuery(ql);
|
|
1720
|
+
if (kind < 0)
|
|
1721
|
+
return [];
|
|
1722
|
+
if (kind === 2) {
|
|
1723
|
+
// OR branches — run each as its own resumable search and merge.
|
|
1724
|
+
const seen = new Set();
|
|
1725
|
+
const all = [];
|
|
1726
|
+
const n = w.getQueryBranchCount();
|
|
1727
|
+
for (let i = 0; i < n; i++) {
|
|
1728
|
+
w.selectQueryBranch(i);
|
|
1729
|
+
const r = await this._runSearchBudgeted(query, opts, budget, undefined, i);
|
|
1730
|
+
for (const x of r) {
|
|
1731
|
+
const key = `${x.documentName}:${x.location}:${x.matchStart}`;
|
|
1732
|
+
if (!seen.has(key)) {
|
|
1733
|
+
seen.add(key);
|
|
1734
|
+
all.push(x);
|
|
1735
|
+
}
|
|
1736
|
+
}
|
|
1737
|
+
}
|
|
1738
|
+
all.sort((a, b) => b.score - a.score);
|
|
1739
|
+
return all;
|
|
1740
|
+
}
|
|
1741
|
+
w.selectQueryBranch(0);
|
|
1742
|
+
const phraseTokens = kind === 1 ? this._branchTokens(0) : undefined;
|
|
1743
|
+
return this._runSearchBudgeted(query, opts, budget, phraseTokens, 0);
|
|
1744
|
+
}
|
|
1745
|
+
/**
|
|
1746
|
+
* @deprecated Renamed to `searchCooperative` in 0.3.0. The original name
|
|
1747
|
+
* was misleading — this method does not stream incremental results, it
|
|
1748
|
+
* yields to the scheduler between slices and returns a batch. The alias
|
|
1749
|
+
* keeps existing integrations working; it will be removed in 0.4.0.
|
|
373
1750
|
*/
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
1751
|
+
async *searchStream(query, opts = {}) {
|
|
1752
|
+
warnSearchStreamDeprecated();
|
|
1753
|
+
yield* this.searchCooperative(query, opts);
|
|
1754
|
+
}
|
|
1755
|
+
/**
|
|
1756
|
+
* Drive a resumable search until done, yielding to the scheduler when the
|
|
1757
|
+
* frame budget is exceeded. Returns the materialised result array.
|
|
1758
|
+
*
|
|
1759
|
+
* Heuristic: each call to `searchSlice` processes a chunk batch, then we
|
|
1760
|
+
* check elapsed time. The batch size doubles up to a cap to amortise the
|
|
1761
|
+
* JS<->WASM overhead on fast machines; on slow machines a single batch
|
|
1762
|
+
* may eat the entire budget, which is also fine.
|
|
1763
|
+
*/
|
|
1764
|
+
async _runSearchBudgeted(displayQuery, opts, budgetMs, phraseTokens, branchIdx = 0) {
|
|
1765
|
+
const w = this._wasm;
|
|
1766
|
+
// Pattern is already set by the caller via selectQueryBranch(branchIdx).
|
|
1767
|
+
// Snapshot THAT branch's compiled pattern for the GPU pre-filter hash —
|
|
1768
|
+
// not branch 0, which would build the wrong candidate mask for OR
|
|
1769
|
+
// branches and silently drop their hits (audit finding #6).
|
|
1770
|
+
const activePatternLen = w.getQueryBranchPattern(branchIdx);
|
|
1771
|
+
const activePattern = activePatternLen > 0 ? this._readPad(activePatternLen) : '';
|
|
1772
|
+
// GPU pre-filter (CD1). If enabled AND the corpus is large enough,
|
|
1773
|
+
// the GPU computes the candidate bitset and we install it into WASM
|
|
1774
|
+
// before searchBegin so the slice loop only inspects candidates.
|
|
1775
|
+
// Failure here is silent: we fall back to CPU-only Bloom transparently.
|
|
1776
|
+
if (this._shouldEngageGpu()) {
|
|
1777
|
+
try {
|
|
1778
|
+
await this._gpuPreFilter(activePattern);
|
|
1779
|
+
}
|
|
1780
|
+
catch (e) {
|
|
1781
|
+
// Don't let a GPU hiccup kill the search — drop to CPU path.
|
|
1782
|
+
this._diag({
|
|
1783
|
+
kind: 'fallback', stage: 'gpu',
|
|
1784
|
+
message: `GPU pre-filter failed; falling back to CPU: ${e instanceof Error ? e.message : String(e)}`,
|
|
1785
|
+
});
|
|
1786
|
+
w.clearCandidateMask();
|
|
1787
|
+
}
|
|
378
1788
|
}
|
|
379
|
-
const
|
|
380
|
-
if (
|
|
381
|
-
|
|
1789
|
+
const t0 = performance.now();
|
|
1790
|
+
if (w.searchBegin() === 0) {
|
|
1791
|
+
this._lastSearch = {
|
|
1792
|
+
query: displayQuery, timeMs: 0, results: 0,
|
|
1793
|
+
bloomTested: 0, bloomPassed: 0, bitapMatched: 0,
|
|
1794
|
+
};
|
|
1795
|
+
return [];
|
|
1796
|
+
}
|
|
1797
|
+
// In background / low-power modes we halve the initial batch so the
|
|
1798
|
+
// engine yields more often to the scheduler, leaving more headroom for
|
|
1799
|
+
// whatever the host is doing.
|
|
1800
|
+
const conservative = this._resources?.mode === 'background'
|
|
1801
|
+
|| this._resources?.mode === 'low-power';
|
|
1802
|
+
let batch = conservative ? 1024 : 2048;
|
|
1803
|
+
const sched = globalThis.scheduler;
|
|
1804
|
+
const yieldFn = sched && typeof sched.yield === 'function'
|
|
1805
|
+
? () => sched.yield()
|
|
1806
|
+
: (typeof requestAnimationFrame === 'function'
|
|
1807
|
+
? () => new Promise(resolve => requestAnimationFrame(() => resolve()))
|
|
1808
|
+
: () => new Promise(resolve => setTimeout(resolve, 0)));
|
|
1809
|
+
for (;;) {
|
|
1810
|
+
const sliceStart = performance.now();
|
|
1811
|
+
const done = w.searchSlice(batch);
|
|
1812
|
+
const sliceMs = performance.now() - sliceStart;
|
|
1813
|
+
if (done === 1)
|
|
1814
|
+
break;
|
|
1815
|
+
// Adapt batch size: if we have headroom in budget, grow; if we're
|
|
1816
|
+
// already over the per-slice target, shrink.
|
|
1817
|
+
if (sliceMs < budgetMs * 0.5 && batch < 32_768)
|
|
1818
|
+
batch *= 2;
|
|
1819
|
+
else if (sliceMs > budgetMs * 1.5 && batch > 512)
|
|
1820
|
+
batch = Math.max(512, Math.floor(batch / 2));
|
|
1821
|
+
await yieldFn();
|
|
1822
|
+
}
|
|
1823
|
+
const ms = performance.now() - t0;
|
|
1824
|
+
const count = w.getResultCount();
|
|
1825
|
+
this._lastSearch = {
|
|
1826
|
+
query: displayQuery,
|
|
1827
|
+
timeMs: ms,
|
|
1828
|
+
results: count,
|
|
1829
|
+
bloomTested: w.getStatBloomTested(),
|
|
1830
|
+
bloomPassed: w.getStatBloomPassed(),
|
|
1831
|
+
bitapMatched: w.getStatBitapMatched(),
|
|
1832
|
+
};
|
|
1833
|
+
return this._collectResults(count, opts, phraseTokens);
|
|
1834
|
+
}
|
|
1835
|
+
/** Materialise results [0..count) into the public SearchResult shape.
|
|
1836
|
+
* When `phraseTokens` is given, each result is kept only if those tokens
|
|
1837
|
+
* appear adjacently in the FULL chunk text — independent of any display
|
|
1838
|
+
* windowing — so phrase queries stay correct under `{ windowed: true }`. */
|
|
1839
|
+
_collectResults(count, opts, phraseTokens) {
|
|
1840
|
+
const w = this._wasm;
|
|
1841
|
+
const windowed = opts.windowed === true;
|
|
1842
|
+
const before = opts.before ?? 60;
|
|
1843
|
+
const after = opts.after ?? 120;
|
|
1844
|
+
const phraseFilter = phraseTokens && phraseTokens.length > 0 ? phraseTokens : null;
|
|
1845
|
+
const results = [];
|
|
1846
|
+
for (let i = 0; i < count; i++) {
|
|
1847
|
+
// Phrase adjacency check against the full chunk text (getSnippet), not
|
|
1848
|
+
// the possibly-cropped display window.
|
|
1849
|
+
if (phraseFilter) {
|
|
1850
|
+
const fl = w.getSnippet(i);
|
|
1851
|
+
const full = fl > 0 ? this._readPad(fl) : '';
|
|
1852
|
+
if (!containsPhrase(full, phraseFilter))
|
|
1853
|
+
continue;
|
|
1854
|
+
}
|
|
1855
|
+
const score = w.getResultScore(i);
|
|
1856
|
+
const location = w.getResultLocation(i);
|
|
1857
|
+
const matchStart = w.getResultStart(i);
|
|
1858
|
+
const matchEnd = w.getResultEnd(i);
|
|
1859
|
+
const nl = w.getResultDocName(i);
|
|
1860
|
+
const name = nl > 0 ? this._readPad(nl) : '?';
|
|
1861
|
+
const matchCount = w.getResultMatchCount(i);
|
|
1862
|
+
const matches = [];
|
|
1863
|
+
for (let k = 0; k < matchCount; k++) {
|
|
1864
|
+
matches.push({ start: w.getResultMatchStartAt(i, k), end: w.getResultMatchEndAt(i, k) });
|
|
1865
|
+
}
|
|
1866
|
+
if (matches.length === 0)
|
|
1867
|
+
matches.push({ start: matchStart, end: matchEnd });
|
|
1868
|
+
let snippet;
|
|
1869
|
+
let primaryStart = matchStart;
|
|
1870
|
+
let primaryEnd = matchEnd;
|
|
1871
|
+
let adjustedMatches = matches;
|
|
1872
|
+
if (windowed) {
|
|
1873
|
+
const sl = w.getSnippetWindow(i, before, after);
|
|
1874
|
+
snippet = sl > 0 ? this._readPad(sl) : '';
|
|
1875
|
+
const offset = w.getSnippetWindowOffset();
|
|
1876
|
+
const leadingPrefix = offset > 0 ? 4 : 0;
|
|
1877
|
+
const shift = leadingPrefix - offset;
|
|
1878
|
+
adjustedMatches = matches.map(m => ({
|
|
1879
|
+
start: Math.max(0, m.start + shift),
|
|
1880
|
+
end: Math.max(0, m.end + shift),
|
|
1881
|
+
}));
|
|
1882
|
+
primaryStart = adjustedMatches[0]?.start ?? 0;
|
|
1883
|
+
primaryEnd = adjustedMatches[0]?.end ?? 0;
|
|
1884
|
+
}
|
|
1885
|
+
else {
|
|
1886
|
+
const sl = w.getSnippet(i);
|
|
1887
|
+
snippet = sl > 0 ? this._readPad(sl) : '';
|
|
1888
|
+
}
|
|
1889
|
+
results.push({
|
|
1890
|
+
documentName: name,
|
|
1891
|
+
location,
|
|
1892
|
+
score,
|
|
1893
|
+
snippet,
|
|
1894
|
+
matchStart: primaryStart,
|
|
1895
|
+
matchEnd: primaryEnd,
|
|
1896
|
+
matches: adjustedMatches,
|
|
1897
|
+
});
|
|
382
1898
|
}
|
|
383
1899
|
return results;
|
|
384
1900
|
}
|
|
385
|
-
|
|
1901
|
+
/** Run all OR branches and merge dedup-by-(doc, location, match). The
|
|
1902
|
+
* branches are already compiled inside the WASM (by prepareQuery); we
|
|
1903
|
+
* iterate them with selectQueryBranch. The "rawQuery" param is kept
|
|
1904
|
+
* only for the lastSearch.query field. */
|
|
1905
|
+
_searchOr(rawQuery, opts) {
|
|
1906
|
+
const w = this._wasm;
|
|
386
1907
|
const seen = new Set();
|
|
387
1908
|
const all = [];
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
const results = this._runSearch(q, rawQuery);
|
|
1909
|
+
const n = w.getQueryBranchCount();
|
|
1910
|
+
for (let i = 0; i < n; i++) {
|
|
1911
|
+
w.selectQueryBranch(i);
|
|
1912
|
+
const results = this._runSearch(rawQuery, opts);
|
|
393
1913
|
for (const r of results) {
|
|
394
1914
|
const key = `${r.documentName}:${r.location}:${r.matchStart}`;
|
|
395
1915
|
if (!seen.has(key)) {
|
|
@@ -398,37 +1918,26 @@ export class AlbexEngine {
|
|
|
398
1918
|
}
|
|
399
1919
|
}
|
|
400
1920
|
}
|
|
401
|
-
// Re-rank the merged list by score descending.
|
|
402
1921
|
all.sort((a, b) => b.score - a.score);
|
|
403
1922
|
return all;
|
|
404
1923
|
}
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
1924
|
+
/** Execute a single search using whichever query branch is currently
|
|
1925
|
+
* active (set via selectQueryBranch). Returns the materialised
|
|
1926
|
+
* SearchResult[]. Caller is responsible for activating a branch first. */
|
|
1927
|
+
_runSearch(displayQuery, opts, phraseTokens) {
|
|
1928
|
+
const w = this._wasm;
|
|
408
1929
|
const t0 = performance.now();
|
|
409
|
-
const count =
|
|
1930
|
+
const count = w.search();
|
|
410
1931
|
const ms = performance.now() - t0;
|
|
411
1932
|
this._lastSearch = {
|
|
412
1933
|
query: displayQuery,
|
|
413
1934
|
timeMs: ms,
|
|
414
1935
|
results: count,
|
|
415
|
-
bloomTested:
|
|
416
|
-
bloomPassed:
|
|
417
|
-
bitapMatched:
|
|
1936
|
+
bloomTested: w.getStatBloomTested(),
|
|
1937
|
+
bloomPassed: w.getStatBloomPassed(),
|
|
1938
|
+
bitapMatched: w.getStatBitapMatched(),
|
|
418
1939
|
};
|
|
419
|
-
|
|
420
|
-
for (let i = 0; i < count; i++) {
|
|
421
|
-
const score = this._wasm.getResultScore(i);
|
|
422
|
-
const location = this._wasm.getResultLocation(i);
|
|
423
|
-
const matchStart = this._wasm.getResultStart(i);
|
|
424
|
-
const matchEnd = this._wasm.getResultEnd(i);
|
|
425
|
-
const nl = this._wasm.getResultDocName(i);
|
|
426
|
-
const name = nl > 0 ? this._readPad(nl) : '?';
|
|
427
|
-
const sl = this._wasm.getSnippet(i);
|
|
428
|
-
const snippet = sl > 0 ? this._readPad(sl) : '';
|
|
429
|
-
results.push({ documentName: name, location, score, snippet, matchStart, matchEnd });
|
|
430
|
-
}
|
|
431
|
-
return results;
|
|
1940
|
+
return this._collectResults(count, opts, phraseTokens);
|
|
432
1941
|
}
|
|
433
1942
|
/** Returns current engine statistics. */
|
|
434
1943
|
getStats() {
|
|
@@ -438,6 +1947,9 @@ export class AlbexEngine {
|
|
|
438
1947
|
textUsed: this._wasm.getTextUsed(),
|
|
439
1948
|
textCapacity: this._wasm.getTextCapacity(),
|
|
440
1949
|
wasmMemoryBytes: this._mem.buffer.byteLength,
|
|
1950
|
+
tier: this._tier,
|
|
1951
|
+
maxChunks: this._wasm.getMaxChunks(),
|
|
1952
|
+
maxDocs: this._wasm.getMaxDocs(),
|
|
441
1953
|
};
|
|
442
1954
|
}
|
|
443
1955
|
/** Returns stats from the most recent search, or null. */
|
|
@@ -462,18 +1974,276 @@ export class AlbexEngine {
|
|
|
462
1974
|
setMaxResults(max) {
|
|
463
1975
|
this._wasm.setMaxResults(Math.max(1, Math.min(200, max)));
|
|
464
1976
|
}
|
|
1977
|
+
/**
|
|
1978
|
+
* Enable or disable query stemming.
|
|
1979
|
+
*
|
|
1980
|
+
* - `'off'` (default): tokens are used as-is. Strict matching.
|
|
1981
|
+
* - `'es'`: Spanish stemmer applied to query tokens before search. A query
|
|
1982
|
+
* for `"contratos"` matches `"contrato"` and vice versa.
|
|
1983
|
+
*
|
|
1984
|
+
* Indexed text is never stemmed, so snippets remain faithful to the
|
|
1985
|
+
* source. Recall improvement comes from queries reducing to shared prefixes.
|
|
1986
|
+
*/
|
|
1987
|
+
setLanguage(lang) {
|
|
1988
|
+
this._wasm.setLanguage(lang === 'es' ? 1 : 0);
|
|
1989
|
+
}
|
|
465
1990
|
/** Full reset — clears all indexed documents and chunks. */
|
|
466
1991
|
reset() {
|
|
1992
|
+
this._assertIdle('reset');
|
|
1993
|
+
this._resetInner();
|
|
1994
|
+
}
|
|
1995
|
+
_resetInner() {
|
|
467
1996
|
this._wasm.init();
|
|
468
1997
|
this._docs = [];
|
|
469
1998
|
this._lastSearch = null;
|
|
1999
|
+
this._diagnostics = [];
|
|
2000
|
+
}
|
|
2001
|
+
/**
|
|
2002
|
+
* Drain and return the diagnostics collected since the last call (or
|
|
2003
|
+
* since the engine was created). Use this to surface recoverable
|
|
2004
|
+
* issues to the caller after `indexFile`, `load`, or any other
|
|
2005
|
+
* operation that may run into a "best-effort" path.
|
|
2006
|
+
*
|
|
2007
|
+
* Example diagnostics:
|
|
2008
|
+
* - `{kind:'fallback', stage:'pdf', message:'pdf-extract crashed,
|
|
2009
|
+
* attempting OCR-only fallback', file:'invoice.pdf'}`
|
|
2010
|
+
* - `{kind:'skipped', stage:'ocr', message:'Tesseract abort on page
|
|
2011
|
+
* 3 image 1; remaining images on this page skipped', file:'...',
|
|
2012
|
+
* page:3}`
|
|
2013
|
+
* - `{kind:'fallback', stage:'gpu', message:'GPU pre-filter failed,
|
|
2014
|
+
* using CPU'}`
|
|
2015
|
+
*
|
|
2016
|
+
* The buffer is cleared on each call; callers should consume the
|
|
2017
|
+
* returned array immediately (e.g. log to their telemetry, surface
|
|
2018
|
+
* a UI banner). After `reset()` the buffer is also cleared.
|
|
2019
|
+
*/
|
|
2020
|
+
takeDiagnostics() {
|
|
2021
|
+
const out = this._diagnostics;
|
|
2022
|
+
this._diagnostics = [];
|
|
2023
|
+
return out;
|
|
2024
|
+
}
|
|
2025
|
+
/** Internal: record a diagnostic. Capped at 256 to bound memory. */
|
|
2026
|
+
_diag(entry) {
|
|
2027
|
+
if (this._diagnostics.length >= 256)
|
|
2028
|
+
return;
|
|
2029
|
+
this._diagnostics.push(entry);
|
|
2030
|
+
}
|
|
2031
|
+
/**
|
|
2032
|
+
* Install an OCR adapter. Returns a handle whose `dispose()` removes the
|
|
2033
|
+
* adapter from the engine.
|
|
2034
|
+
*
|
|
2035
|
+
* The contract: the adapter must provide `recognize(image, opts)` that
|
|
2036
|
+
* returns `Promise<OcrAttachedResult>`. The engine validates the
|
|
2037
|
+
* contract at attach time and refuses adapters that don't expose a
|
|
2038
|
+
* recognise function. Only one adapter can be attached at a time; a
|
|
2039
|
+
* second call to `attachOcr` while one is active throws — the caller
|
|
2040
|
+
* must dispose the previous one first.
|
|
2041
|
+
*
|
|
2042
|
+
* @example
|
|
2043
|
+
* ```ts
|
|
2044
|
+
* import { enableOcr } from '@albex/ocr';
|
|
2045
|
+
* const handle = enableOcr(engine); // internally calls attachOcr
|
|
2046
|
+
* // ... later ...
|
|
2047
|
+
* await handle.dispose();
|
|
2048
|
+
* ```
|
|
2049
|
+
*
|
|
2050
|
+
* Direct use without the companion package:
|
|
2051
|
+
* ```ts
|
|
2052
|
+
* const handle = engine.attachOcr({
|
|
2053
|
+
* recognize: async (blob) => myCustomOcr(blob),
|
|
2054
|
+
* options: { alwaysExtractEmbeddedImages: false },
|
|
2055
|
+
* });
|
|
2056
|
+
* ```
|
|
2057
|
+
*/
|
|
2058
|
+
attachOcr(adapter) {
|
|
2059
|
+
if (this._ocrAdapter) {
|
|
2060
|
+
throw new AlbexInitError('OCR adapter already attached. Call dispose() on the previous handle before attaching a new one.');
|
|
2061
|
+
}
|
|
2062
|
+
if (typeof adapter?.recognize !== 'function') {
|
|
2063
|
+
throw new AlbexInitError('attachOcr requires an adapter with a recognize(image, opts) function.');
|
|
2064
|
+
}
|
|
2065
|
+
this._ocrAdapter = adapter;
|
|
2066
|
+
return {
|
|
2067
|
+
dispose: async () => {
|
|
2068
|
+
// Idempotent: a double dispose is a no-op rather than a throw.
|
|
2069
|
+
if (this._ocrAdapter === adapter)
|
|
2070
|
+
this._ocrAdapter = null;
|
|
2071
|
+
},
|
|
2072
|
+
};
|
|
2073
|
+
}
|
|
2074
|
+
// ── Persistence ───────────────────────────────────────────────────────────
|
|
2075
|
+
/**
|
|
2076
|
+
* Persist the current index to OPFS (or IndexedDB as fallback) under `name`.
|
|
2077
|
+
*
|
|
2078
|
+
* The snapshot includes every chunk, document name and text byte currently
|
|
2079
|
+
* indexed. Subsequent `load(name)` calls restore the engine to this exact
|
|
2080
|
+
* state in roughly O(total bytes), bypassing re-parsing.
|
|
2081
|
+
*/
|
|
2082
|
+
async save(name) {
|
|
2083
|
+
return this._exclusive(() => this._saveInner(name));
|
|
2084
|
+
}
|
|
2085
|
+
async _saveInner(name) {
|
|
2086
|
+
const w = this._wasm;
|
|
2087
|
+
const total = w.snapshotSize();
|
|
2088
|
+
if (total === 0) {
|
|
2089
|
+
await savePersisted(name, new Uint8Array(0));
|
|
2090
|
+
return;
|
|
2091
|
+
}
|
|
2092
|
+
const out = new Uint8Array(total);
|
|
2093
|
+
let off = 0;
|
|
2094
|
+
while (off < total) {
|
|
2095
|
+
const n = w.snapshotChunk(off, FEED_SIZE);
|
|
2096
|
+
if (n === 0)
|
|
2097
|
+
break;
|
|
2098
|
+
const ptr = w.getBuffer(0);
|
|
2099
|
+
out.set(this._u8(ptr, n), off);
|
|
2100
|
+
off += n;
|
|
2101
|
+
}
|
|
2102
|
+
await savePersisted(name, out);
|
|
2103
|
+
// Reconstruct _docs from the doc table so getStats().documents stays
|
|
2104
|
+
// honest after save (no change here — but symmetric with load()).
|
|
2105
|
+
}
|
|
2106
|
+
/**
|
|
2107
|
+
* Restore an index previously saved with `save(name)`. Returns `true` on
|
|
2108
|
+
* success, `false` if the snapshot is missing or has an incompatible
|
|
2109
|
+
* header (wrong magic, version, or struct sizes).
|
|
2110
|
+
*/
|
|
2111
|
+
async load(name) {
|
|
2112
|
+
return this._exclusive(() => this._loadInner(name));
|
|
2113
|
+
}
|
|
2114
|
+
async _loadInner(name) {
|
|
2115
|
+
const bytes = await loadPersisted(name);
|
|
2116
|
+
if (!bytes || bytes.length === 0)
|
|
2117
|
+
return false;
|
|
2118
|
+
const w = this._wasm;
|
|
2119
|
+
// Write the 64-byte header into the scratchpad and validate.
|
|
2120
|
+
if (bytes.length < 64)
|
|
2121
|
+
return false;
|
|
2122
|
+
const ptr = w.getBuffer(64);
|
|
2123
|
+
if (!ptr)
|
|
2124
|
+
return false;
|
|
2125
|
+
this._u8(ptr, 64).set(bytes.subarray(0, 64));
|
|
2126
|
+
if (w.restoreBegin() !== 1)
|
|
2127
|
+
return false;
|
|
2128
|
+
// Stream payload bytes.
|
|
2129
|
+
let off = 64;
|
|
2130
|
+
while (off < bytes.length) {
|
|
2131
|
+
const n = Math.min(FEED_SIZE, bytes.length - off);
|
|
2132
|
+
this._writePad(bytes.subarray(off, off + n));
|
|
2133
|
+
if (w.restoreFeed(n) !== 1)
|
|
2134
|
+
return false;
|
|
2135
|
+
off += n;
|
|
2136
|
+
}
|
|
2137
|
+
// Commit. For v3 this is the atomic apply step (state is untouched
|
|
2138
|
+
// until now); a failure here leaves the previous index intact so the
|
|
2139
|
+
// caller can keep using the engine. For v1/v2 snapshots `restoreCommit`
|
|
2140
|
+
// is a no-op that returns 1 (those formats applied in-place during
|
|
2141
|
+
// restoreFeed and have no rollback to offer). Older binaries that
|
|
2142
|
+
// predate v3 do not export `restoreCommit` — in that case we treat
|
|
2143
|
+
// the load as already committed by feature-detect.
|
|
2144
|
+
if (typeof w.restoreCommit === 'function') {
|
|
2145
|
+
if (w.restoreCommit() !== 1)
|
|
2146
|
+
return false;
|
|
2147
|
+
}
|
|
2148
|
+
// Rebuild _docs metadata from the restored WASM tables.
|
|
2149
|
+
//
|
|
2150
|
+
// What's available after a restore:
|
|
2151
|
+
// * `name` — recovered from getDocName(i).
|
|
2152
|
+
// * `ext` — derived from the name.
|
|
2153
|
+
// * `chunks` — getDocChunkCount(i).
|
|
2154
|
+
// * `docId` — getDocId(i).
|
|
2155
|
+
// * `contentHash` — getDocContentHashPtr(i) when the binary supports
|
|
2156
|
+
// snapshot v2 (the export exists) AND the snapshot
|
|
2157
|
+
// itself was v2 (the bytes aren't all zero). v1
|
|
2158
|
+
// snapshots restore with all-zero hashes → '' here,
|
|
2159
|
+
// same as before.
|
|
2160
|
+
//
|
|
2161
|
+
// What's not persisted and therefore zeroed:
|
|
2162
|
+
// * `indexTimeMs` — no indexing happened in this session.
|
|
2163
|
+
// * `textBytes` — engine-wide totals are still available via
|
|
2164
|
+
// getStats().textUsed; per-doc breakdown is not
|
|
2165
|
+
// stored.
|
|
2166
|
+
const docCount = w.getDocCount();
|
|
2167
|
+
const hasHashExport = typeof w.getDocContentHashPtr === 'function'
|
|
2168
|
+
&& typeof w.getDocContentHashLen === 'function';
|
|
2169
|
+
this._docs = [];
|
|
2170
|
+
for (let i = 0; i < docCount; i++) {
|
|
2171
|
+
if (w.isDocDeleted(i))
|
|
2172
|
+
continue;
|
|
2173
|
+
const nameLen = w.getDocName(i);
|
|
2174
|
+
const name = nameLen > 0 ? this._readPad(nameLen) : `restored-${i}`;
|
|
2175
|
+
const dotIdx = name.lastIndexOf('.');
|
|
2176
|
+
const ext = dotIdx > 0 ? name.slice(dotIdx + 1).toLowerCase() : '';
|
|
2177
|
+
let contentHash = '';
|
|
2178
|
+
if (hasHashExport) {
|
|
2179
|
+
const hashLen = w.getDocContentHashLen(); // always 8 today
|
|
2180
|
+
const hashPtr = w.getDocContentHashPtr(i);
|
|
2181
|
+
if (hashPtr !== 0 && hashLen === 8) {
|
|
2182
|
+
const view = this._u8(hashPtr, 8);
|
|
2183
|
+
// Copy into a private buffer so subsequent WASM calls cannot
|
|
2184
|
+
// mutate it under us.
|
|
2185
|
+
const buf = new Uint8Array(8);
|
|
2186
|
+
buf.set(view);
|
|
2187
|
+
contentHash = hashBytesToHex(buf);
|
|
2188
|
+
}
|
|
2189
|
+
}
|
|
2190
|
+
this._docs.push({
|
|
2191
|
+
name,
|
|
2192
|
+
ext,
|
|
2193
|
+
chunks: w.getDocChunkCount(i),
|
|
2194
|
+
indexTimeMs: 0,
|
|
2195
|
+
textBytes: 0,
|
|
2196
|
+
docId: w.getDocId(i),
|
|
2197
|
+
contentHash,
|
|
2198
|
+
});
|
|
2199
|
+
}
|
|
2200
|
+
this._lastSearch = null;
|
|
2201
|
+
return true;
|
|
2202
|
+
}
|
|
2203
|
+
/**
|
|
2204
|
+
* Convenience: load if the snapshot exists, otherwise leave the engine
|
|
2205
|
+
* empty. Returns whether a load actually happened.
|
|
2206
|
+
*/
|
|
2207
|
+
async loadOrInit(name) {
|
|
2208
|
+
return this._exclusive(async () => {
|
|
2209
|
+
const loaded = await this._loadInner(name);
|
|
2210
|
+
if (!loaded)
|
|
2211
|
+
this._resetInner();
|
|
2212
|
+
return loaded;
|
|
2213
|
+
});
|
|
2214
|
+
}
|
|
2215
|
+
/** Delete a previously persisted snapshot. */
|
|
2216
|
+
async deleteSnapshot(name) {
|
|
2217
|
+
await deletePersisted(name);
|
|
2218
|
+
}
|
|
2219
|
+
/** List names of persisted snapshots in the current origin. */
|
|
2220
|
+
async listSnapshots() {
|
|
2221
|
+
return listPersisted();
|
|
2222
|
+
}
|
|
2223
|
+
/**
|
|
2224
|
+
* TC39 explicit-resource-management hook (Stage 3 in 2026). Lets the engine
|
|
2225
|
+
* be used with `using` so the references are released deterministically:
|
|
2226
|
+
*
|
|
2227
|
+
* using engine = new AlbexEngine(opts); await engine.init();
|
|
2228
|
+
*
|
|
2229
|
+
* WebAssembly does not actually expose a way to release linear memory pages
|
|
2230
|
+
* inside a Module instance, so we drop our references to the exports and
|
|
2231
|
+
* the doc list. GC can then reclaim the engine, which in turn releases the
|
|
2232
|
+
* WASM instance and its (typically 20 MB) backing memory.
|
|
2233
|
+
*/
|
|
2234
|
+
[Symbol.dispose]() {
|
|
2235
|
+
// Terminal: bypass the idle guard — disposing mid-operation is allowed.
|
|
2236
|
+
this._resetInner();
|
|
2237
|
+
this._unsubscribeResources?.();
|
|
2238
|
+
this._unsubscribeResources = null;
|
|
2239
|
+
this._gpu?.destroy();
|
|
2240
|
+
this._gpu = null;
|
|
2241
|
+
// Null out the references so the engine cannot be reused after disposal
|
|
2242
|
+
// and the WASM instance becomes unreachable.
|
|
2243
|
+
this._wasm = null;
|
|
2244
|
+
this._mem = null;
|
|
2245
|
+
this._pdfWasm = null;
|
|
2246
|
+
this._pdfMem = null;
|
|
470
2247
|
}
|
|
471
2248
|
}
|
|
472
|
-
AlbexEngine._INDEXERS = {
|
|
473
|
-
docx: (e, f) => e._indexDocx(f),
|
|
474
|
-
xlsx: (e, f) => e._indexXlsx(f),
|
|
475
|
-
pdf: (e, f) => e._indexPdf(f),
|
|
476
|
-
txt: (e, f) => e._indexTxt(f),
|
|
477
|
-
xml: (e, f) => e._indexXml(f),
|
|
478
|
-
};
|
|
479
2249
|
//# sourceMappingURL=albex.js.map
|