@yoch/frozenminisearch 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/es/index.js CHANGED
@@ -1907,111 +1907,6 @@ function materializeFrozenPostings(params) {
1907
1907
  sparseLengths: new Uint32Array(sparseLengths),
1908
1908
  };
1909
1909
  }
1910
- /** One-pass materialize from {@link FrozenIndexBuilder} scratch (counts known upfront). */
1911
- function materializeFrozenPostingsFromBuilder(state, nextId) {
1912
- var _a;
1913
- const { fieldCount, termCount, postingsDocIds, postingsFreqs, totalPostings, maxFreq } = state;
1914
- const layout = choosePostingsLayout(fieldCount);
1915
- const docIdWidth = nextId <= 65535 ? 16 : 32;
1916
- const allDocIds = docIdWidth === 16
1917
- ? new Uint16Array(totalPostings)
1918
- : new Uint32Array(totalPostings);
1919
- const allFreqs = allocateFreqs(totalPostings, maxFreq);
1920
- if (layout === 'dense') {
1921
- const slotCount = termCount * fieldCount;
1922
- const denseOffsets = new Uint32Array(slotCount);
1923
- const denseLengths = new Uint32Array(slotCount);
1924
- let write = 0;
1925
- for (let ti = 0; ti < termCount; ti++) {
1926
- const base = ti * fieldCount;
1927
- for (let f = 0; f < fieldCount; f++) {
1928
- const slot = base + f;
1929
- const docIds = postingsDocIds[slot];
1930
- const freqs = postingsFreqs[slot];
1931
- const len = (_a = docIds === null || docIds === void 0 ? void 0 : docIds.length) !== null && _a !== void 0 ? _a : 0;
1932
- denseOffsets[slot] = write;
1933
- denseLengths[slot] = len;
1934
- for (let i = 0; i < len; i++) {
1935
- const docId = docIds[i];
1936
- if (docIdWidth === 16) {
1937
- allDocIds[write] = docId;
1938
- }
1939
- else {
1940
- allDocIds[write] = docId;
1941
- }
1942
- allFreqs[write] = freqs[i];
1943
- write++;
1944
- }
1945
- }
1946
- }
1947
- return {
1948
- fieldCount,
1949
- termCount,
1950
- nextId,
1951
- layout,
1952
- docIdWidth,
1953
- sparseFieldIdWidth: null,
1954
- allDocIds,
1955
- allFreqs,
1956
- denseOffsets,
1957
- denseLengths,
1958
- sparseTermStarts: null,
1959
- sparseFieldIds: null,
1960
- sparseOffsets: null,
1961
- sparseLengths: null,
1962
- };
1963
- }
1964
- const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
1965
- const sparseFieldIdsScratch = [];
1966
- const sparseOffsets = [];
1967
- const sparseLengths = [];
1968
- const termStarts = new Array(termCount + 1).fill(0);
1969
- let write = 0;
1970
- for (let ti = 0; ti < termCount; ti++) {
1971
- termStarts[ti] = sparseFieldIdsScratch.length;
1972
- for (let f = 0; f < fieldCount; f++) {
1973
- const slot = ti * fieldCount + f;
1974
- const docIds = postingsDocIds[slot];
1975
- if (docIds == null || docIds.length === 0)
1976
- continue;
1977
- const freqs = postingsFreqs[slot];
1978
- sparseFieldIdsScratch.push(f);
1979
- sparseOffsets.push(write);
1980
- sparseLengths.push(docIds.length);
1981
- for (let i = 0; i < docIds.length; i++) {
1982
- const docId = docIds[i];
1983
- if (docIdWidth === 16) {
1984
- allDocIds[write] = docId;
1985
- }
1986
- else {
1987
- allDocIds[write] = docId;
1988
- }
1989
- allFreqs[write] = freqs[i];
1990
- write++;
1991
- }
1992
- }
1993
- termStarts[ti + 1] = sparseFieldIdsScratch.length;
1994
- }
1995
- const sparseFieldIds = sparseFieldIdWidth === 16
1996
- ? new Uint16Array(sparseFieldIdsScratch)
1997
- : new Uint8Array(sparseFieldIdsScratch);
1998
- return {
1999
- fieldCount,
2000
- termCount,
2001
- nextId,
2002
- layout,
2003
- docIdWidth,
2004
- sparseFieldIdWidth,
2005
- allDocIds,
2006
- allFreqs,
2007
- denseOffsets: null,
2008
- denseLengths: null,
2009
- sparseTermStarts: new Uint32Array(termStarts),
2010
- sparseFieldIds,
2011
- sparseOffsets: new Uint32Array(sparseOffsets),
2012
- sparseLengths: new Uint32Array(sparseLengths),
2013
- };
2014
- }
2015
1910
  function postingsTypedBytes(layout) {
2016
1911
  const allDocIdsBytes = layout.allDocIds.byteLength;
2017
1912
  const allFreqsBytes = layout.allFreqs.byteLength;
@@ -2121,24 +2016,33 @@ function findSparseSlotByFieldId(fieldIds, start, end, fieldId) {
2121
2016
  }
2122
2017
  return -1;
2123
2018
  }
2124
- /** Resolve one (termIndex, fieldId) posting run in flat buffers; shared by flyweight and docId collect. */
2125
- function frozenPostingSlice(layout, termIndex, fieldId) {
2019
+ /** Reusable scratch for {@link resolvePostingSlice} (scoring is synchronous). */
2020
+ const postingSliceScratch = { offset: 0, length: 0 };
2021
+ /**
2022
+ * Resolve one (termIndex, fieldId) posting run in flat buffers; writes into `out` without allocating.
2023
+ * @returns false when the slot is empty or missing
2024
+ */
2025
+ function resolvePostingSlice(layout, termIndex, fieldId, out) {
2126
2026
  if (layout.layout === 'dense') {
2127
2027
  const base = termIndex * layout.fieldCount + fieldId;
2128
2028
  const len = layout.denseLengths[base];
2129
2029
  if (len === 0)
2130
- return undefined;
2131
- return { offset: layout.denseOffsets[base], length: len };
2030
+ return false;
2031
+ out.offset = layout.denseOffsets[base];
2032
+ out.length = len;
2033
+ return true;
2132
2034
  }
2133
2035
  const start = layout.sparseTermStarts[termIndex];
2134
2036
  const end = layout.sparseTermStarts[termIndex + 1];
2135
2037
  const slot = findSparseSlotByFieldId(layout.sparseFieldIds, start, end, fieldId);
2136
2038
  if (slot < 0)
2137
- return undefined;
2039
+ return false;
2138
2040
  const len = layout.sparseLengths[slot];
2139
2041
  if (len === 0)
2140
- return undefined;
2141
- return { offset: layout.sparseOffsets[slot], length: len };
2042
+ return false;
2043
+ out.offset = layout.sparseOffsets[slot];
2044
+ out.length = len;
2045
+ return true;
2142
2046
  }
2143
2047
  /**
2144
2048
  * One flyweight wrapper for the lifetime of a frozen index. Call {@link bind} before each
@@ -2154,10 +2058,9 @@ function createFrozenFieldTermFlyweight(layout) {
2154
2058
  return flyweight;
2155
2059
  },
2156
2060
  get(fieldId) {
2157
- const slice = frozenPostingSlice(layout, termIndex, fieldId);
2158
- if (slice == null)
2061
+ if (!resolvePostingSlice(layout, termIndex, fieldId, postingSliceScratch))
2159
2062
  return undefined;
2160
- return segment.rebind(slice.offset, slice.length);
2063
+ return segment.rebind(postingSliceScratch.offset, postingSliceScratch.length);
2161
2064
  },
2162
2065
  };
2163
2066
  return flyweight;
@@ -2176,10 +2079,9 @@ function collectDocIdsFromFrozenSegment(allDocIds, offset, length, context, docI
2176
2079
  function collectDocIdsFromFrozenLayout(layout, termIndex, fieldBoosts, context, docIds, allowedDocs) {
2177
2080
  const { fieldIds } = context;
2178
2081
  for (const field of fieldBoosts.names) {
2179
- const slice = frozenPostingSlice(layout, termIndex, fieldIds[field]);
2180
- if (slice == null)
2082
+ if (!resolvePostingSlice(layout, termIndex, fieldIds[field], postingSliceScratch))
2181
2083
  continue;
2182
- collectDocIdsFromFrozenSegment(layout.allDocIds, slice.offset, slice.length, context, docIds, allowedDocs);
2084
+ collectDocIdsFromFrozenSegment(layout.allDocIds, postingSliceScratch.offset, postingSliceScratch.length, context, docIds, allowedDocs);
2183
2085
  }
2184
2086
  }
2185
2087
 
@@ -2221,45 +2123,499 @@ function resolveIndexingOptions(options) {
2221
2123
  autoSuggestOptions: { ...defaultAutoSuggestOptions, ...(options.autoSuggestOptions || {}) },
2222
2124
  };
2223
2125
  }
2224
- function buildFieldIds(fields) {
2225
- const fieldIds = {};
2226
- for (let i = 0; i < fields.length; i++) {
2227
- fieldIds[fields[i]] = i;
2126
+ function buildFieldIds(fields) {
2127
+ const fieldIds = {};
2128
+ for (let i = 0; i < fields.length; i++) {
2129
+ fieldIds[fields[i]] = i;
2130
+ }
2131
+ return fieldIds;
2132
+ }
2133
+ function accumulateProcessedTerm(localFreqs, processedTerm) {
2134
+ if (Array.isArray(processedTerm)) {
2135
+ for (const t of processedTerm) {
2136
+ localFreqs.set(t, (localFreqs.get(t) || 0) + 1);
2137
+ }
2138
+ }
2139
+ else if (processedTerm) {
2140
+ localFreqs.set(processedTerm, (localFreqs.get(processedTerm) || 0) + 1);
2141
+ }
2142
+ }
2143
+ /**
2144
+ * Accumulate token frequencies for one document field into `localFreqs` (cleared first).
2145
+ * Returns the number of distinct processed terms (replaces a separate `Set(tokens)` pass).
2146
+ */
2147
+ function collectFieldTermFreqsInto(localFreqs, tokens, fieldName, processTerm) {
2148
+ localFreqs.clear();
2149
+ for (const term of tokens) {
2150
+ accumulateProcessedTerm(localFreqs, processTerm(term, fieldName));
2151
+ }
2152
+ return localFreqs.size;
2153
+ }
2154
+ /** Global delimiter pattern for incremental `exec` (must not reuse {@link SPACE_OR_PUNCTUATION} — no `g` flag). */
2155
+ const DEFAULT_TOKENIZE_DELIMITERS = /[\n\r\p{Z}\p{P}]+/gu;
2156
+ const defaultTokenizeProbe = 'a b';
2157
+ const defaultTokenizeProbeField = 'f';
2158
+ const tokenizeBehaviorCache = new WeakMap();
2159
+ /**
2160
+ * True when `tokenize` matches the library default (reference equality or split-equivalent
2161
+ * on a fixed probe). Custom tokenizers that pass the probe but diverge on other inputs
2162
+ * (e.g. leading delimiters) still take the fast path — use the default reference in prod.
2163
+ */
2164
+ function isDefaultTokenize(tokenize) {
2165
+ if (tokenize === defaultFrozenLoadOptions.tokenize)
2166
+ return true;
2167
+ const cached = tokenizeBehaviorCache.get(tokenize);
2168
+ if (cached != null)
2169
+ return cached;
2170
+ const splitTokens = defaultTokenizeProbe.split(SPACE_OR_PUNCTUATION);
2171
+ const customTokens = tokenize(defaultTokenizeProbe, defaultTokenizeProbeField);
2172
+ const ok = splitTokens.length === customTokens.length
2173
+ && splitTokens.every((t, i) => t === customTokens[i]);
2174
+ tokenizeBehaviorCache.set(tokenize, ok);
2175
+ return ok;
2176
+ }
2177
+ function forEachDefaultToken(text, onToken) {
2178
+ if (text.length === 0) {
2179
+ onToken('');
2180
+ return;
2181
+ }
2182
+ let start = 0;
2183
+ const re = DEFAULT_TOKENIZE_DELIMITERS;
2184
+ re.lastIndex = 0;
2185
+ let match;
2186
+ while ((match = re.exec(text)) !== null) {
2187
+ if (match.index > start) {
2188
+ onToken(text.slice(start, match.index));
2189
+ }
2190
+ else if (match.index === start) {
2191
+ onToken('');
2192
+ }
2193
+ start = match.index + match[0].length;
2194
+ }
2195
+ if (start < text.length) {
2196
+ onToken(text.slice(start));
2197
+ }
2198
+ else if (start === 0) {
2199
+ onToken(text);
2200
+ }
2201
+ else if (start === text.length) {
2202
+ onToken('');
2203
+ }
2204
+ }
2205
+ /** Default tokenizer into a reusable buffer (avoids `text.split()` array allocation). */
2206
+ function tokenizeDefaultInto(out, text) {
2207
+ out.length = 0;
2208
+ forEachDefaultToken(text, (token) => out.push(token));
2209
+ }
2210
+ /** Tokenize field text into `out` (reused). Fast path when `tokenize` is the library default. */
2211
+ function tokenizeFieldInto(out, tokenize, text, fieldName) {
2212
+ if (isDefaultTokenize(tokenize)) {
2213
+ tokenizeDefaultInto(out, text);
2214
+ return;
2215
+ }
2216
+ const tokens = tokenize(text, fieldName);
2217
+ out.length = 0;
2218
+ out.push(...tokens);
2219
+ }
2220
+ function collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm) {
2221
+ localFreqs.clear();
2222
+ forEachDefaultToken(text, (token) => {
2223
+ accumulateProcessedTerm(localFreqs, processTerm(token, fieldName));
2224
+ });
2225
+ return localFreqs.size;
2226
+ }
2227
+ /**
2228
+ * Tokenize + accumulate field term frequencies in one pass when the default tokenizer is used.
2229
+ * `tokenScratch` is only used for custom tokenizers (two-phase fallback).
2230
+ */
2231
+ function collectFieldTermFreqsFromFieldInto(localFreqs, tokenScratch, tokenize, text, fieldName, processTerm) {
2232
+ if (isDefaultTokenize(tokenize)) {
2233
+ return collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm);
2234
+ }
2235
+ tokenizeFieldInto(tokenScratch, tokenize, text, fieldName);
2236
+ return collectFieldTermFreqsInto(localFreqs, tokenScratch, fieldName, processTerm);
2237
+ }
2238
+ function updateAvgFieldLength(avgFieldLength, fieldId, count, length) {
2239
+ const averageFieldLength = avgFieldLength[fieldId] || 0;
2240
+ const totalFieldLength = (averageFieldLength * count) + length;
2241
+ avgFieldLength[fieldId] = totalFieldLength / (count + 1);
2242
+ }
2243
+
2244
+ function validateTreeShape(shape, termCount) {
2245
+ if (!Array.isArray(shape)) {
2246
+ throw invalidFrozenIndex('treeShape node must be an array');
2247
+ }
2248
+ for (const entry of shape) {
2249
+ if (!Array.isArray(entry) || entry.length !== 2) {
2250
+ throw invalidFrozenIndex('treeShape entry must be a [key, value] pair');
2251
+ }
2252
+ const [key, value] = entry;
2253
+ if (key === LEAF) {
2254
+ const idx = value;
2255
+ if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
2256
+ throw invalidFrozenIndex(`treeShape leaf term index out of range: ${idx}`);
2257
+ }
2258
+ }
2259
+ else {
2260
+ validateTreeShape(value, termCount);
2261
+ }
2262
+ }
2263
+ }
2264
+ function termCountOf(snap) {
2265
+ return snap.postings.termCount;
2266
+ }
2267
+ /**
2268
+ * Numeric/structural invariants shared by both the decode path (untrusted binary)
2269
+ * and the build path (trusted internal code).
2270
+ */
2271
+ function validateFrozenSnapshotNumeric(snap) {
2272
+ if (snap.fieldCount <= 0) {
2273
+ throw invalidFrozenIndex('fieldCount must be positive');
2274
+ }
2275
+ if (snap.nextId < 0 || snap.nextId >= 0xffffffff) {
2276
+ throw invalidFrozenIndex('nextId out of range');
2277
+ }
2278
+ if (snap.documentCount < 0 || snap.documentCount > snap.nextId) {
2279
+ throw invalidFrozenIndex('documentCount inconsistent with nextId');
2280
+ }
2281
+ if (snap.fieldLengthMatrix.length !== snap.nextId * snap.fieldCount) {
2282
+ throw invalidFrozenIndex('fieldLengthMatrix size mismatch');
2283
+ }
2284
+ if (snap.avgFieldLength.length !== snap.fieldCount) {
2285
+ throw invalidFrozenIndex('avgFieldLength size mismatch');
2286
+ }
2287
+ validateFrozenPostingsLayout(snap.postings, snap.documentCount, snap.nextId, detail => {
2288
+ throw invalidFrozenIndex(detail);
2289
+ });
2290
+ const indexedFields = Object.keys(snap.fieldIds);
2291
+ if (indexedFields.length !== snap.fieldCount) {
2292
+ throw invalidFrozenIndex('fieldIds count mismatch');
2293
+ }
2294
+ for (let f = 0; f < snap.fieldCount; f++) {
2295
+ const found = indexedFields.some(name => snap.fieldIds[name] === f);
2296
+ if (!found) {
2297
+ throw invalidFrozenIndex(`missing field id ${f}`);
2298
+ }
2299
+ }
2300
+ }
2301
+ function readFieldNamesSection(buf, fieldNamesOff, fieldCount, externalIdsOff) {
2302
+ const fieldNames = [];
2303
+ let o = fieldNamesOff;
2304
+ for (let f = 0; f < fieldCount; f++) {
2305
+ const { value, next } = readLengthPrefixedUtf8(buf, o);
2306
+ fieldNames.push(value);
2307
+ o = next;
2308
+ }
2309
+ if (o !== externalIdsOff) {
2310
+ throw invalidFrozenIndex('field names section size mismatch');
2311
+ }
2312
+ return fieldNames;
2313
+ }
2314
+ function readExternalIdsSection(buf, externalIdsOff, nextId, storedOff) {
2315
+ const externalIds = new Array(nextId);
2316
+ let o = externalIdsOff;
2317
+ for (let i = 0; i < nextId; i++) {
2318
+ const { value, next } = readExternalId(buf, o);
2319
+ externalIds[i] = value;
2320
+ o = next;
2321
+ }
2322
+ if (o !== storedOff) {
2323
+ throw invalidFrozenIndex('external ids section size mismatch');
2324
+ }
2325
+ return externalIds;
2326
+ }
2327
+ function readStoredFieldsSection(buf, storedOff, nextId, sectionEnd) {
2328
+ const storedFields = new Array(nextId);
2329
+ const tableEnd = storedOff + nextId * 4;
2330
+ if (tableEnd > sectionEnd) {
2331
+ throw invalidFrozenIndex('stored fields table out of bounds');
2332
+ }
2333
+ for (let i = 0; i < nextId; i++) {
2334
+ const rel = buf.readUInt32LE(storedOff + i * 4);
2335
+ if (rel === 0) {
2336
+ storedFields[i] = undefined;
2337
+ continue;
2338
+ }
2339
+ const entryOff = tableEnd + rel - 1;
2340
+ if (entryOff + 4 > sectionEnd) {
2341
+ throw invalidFrozenIndex('stored fields entry offset out of bounds');
2342
+ }
2343
+ const jsonLen = buf.readUInt32LE(entryOff);
2344
+ const jsonStart = entryOff + 4;
2345
+ const jsonEnd = jsonStart + jsonLen;
2346
+ if (jsonEnd > sectionEnd) {
2347
+ throw invalidFrozenIndex('stored fields JSON out of bounds');
2348
+ }
2349
+ storedFields[i] = JSON.parse(buf.toString('utf8', jsonStart, jsonEnd));
2350
+ }
2351
+ return storedFields;
2352
+ }
2353
+ /** Validate structural invariants of a decoded or assembled frozen snapshot. */
2354
+ function validateFrozenSnapshot(snap) {
2355
+ validateFrozenSnapshotNumeric(snap);
2356
+ const termCount = termCountOf(snap);
2357
+ if (snap.packedTermIndex != null) {
2358
+ validateFrozenTermIndexLeaves(snap.packedTermIndex, termCount);
2359
+ }
2360
+ else if (snap.termTree != null) {
2361
+ validateTermTreeLeaves(snap.termTree, termCount);
2362
+ }
2363
+ else {
2364
+ validateTreeShape(snap.treeShape, termCount);
2365
+ }
2366
+ }
2367
+ function fieldNamesFromFieldIds(fieldIds) {
2368
+ const names = Object.keys(fieldIds);
2369
+ names.sort((a, b) => fieldIds[a] - fieldIds[b]);
2370
+ return names;
2371
+ }
2372
+ /** Core with explicit {@link termCountOf} (no dictionary section). */
2373
+ function buildCoreSectionWithTermCount(snap) {
2374
+ const out = Buffer.alloc(16);
2375
+ out.writeUInt32LE(snap.documentCount, 0);
2376
+ out.writeUInt32LE(snap.nextId, 4);
2377
+ out.writeUInt32LE(snap.fieldCount, 8);
2378
+ out.writeUInt32LE(termCountOf(snap), 12);
2379
+ return out;
2380
+ }
2381
+ function buildFieldNamesSection(fieldNames) {
2382
+ const chunks = [];
2383
+ for (const name of fieldNames) {
2384
+ const body = Buffer.from(name, 'utf8');
2385
+ const header = Buffer.alloc(4);
2386
+ header.writeUInt32LE(body.length, 0);
2387
+ chunks.push(header, body);
2388
+ }
2389
+ return Buffer.concat(chunks);
2390
+ }
2391
+ function buildExternalIdsSection(externalIds, nextId) {
2392
+ const chunks = [];
2393
+ for (let i = 0; i < nextId; i++) {
2394
+ writeExternalId(chunks, externalIds[i]);
2395
+ }
2396
+ return Buffer.concat(chunks);
2397
+ }
2398
+ function buildStoredFieldsSection(storedFields, nextId) {
2399
+ const table = Buffer.alloc(nextId * 4);
2400
+ const heapChunks = [];
2401
+ let heapOff = 0;
2402
+ for (let i = 0; i < nextId; i++) {
2403
+ const row = storedFields[i];
2404
+ if (row == null) {
2405
+ table.writeUInt32LE(0, i * 4);
2406
+ continue;
2407
+ }
2408
+ table.writeUInt32LE(heapOff + 1, i * 4);
2409
+ const json = Buffer.from(JSON.stringify(row), 'utf8');
2410
+ const entry = Buffer.alloc(4 + json.length);
2411
+ entry.writeUInt32LE(json.length, 0);
2412
+ json.copy(entry, 4);
2413
+ heapChunks.push(entry);
2414
+ heapOff += entry.length;
2415
+ }
2416
+ return Buffer.concat([table, ...heapChunks]);
2417
+ }
2418
+ function validateTermTreeLeaves(tree, termCount) {
2419
+ for (const [key, val] of tree) {
2420
+ if (key === LEAF) {
2421
+ const idx = val;
2422
+ if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
2423
+ throw invalidFrozenIndex(`term tree leaf index out of range: ${idx}`);
2424
+ }
2425
+ }
2426
+ else {
2427
+ validateTermTreeLeaves(val, termCount);
2428
+ }
2429
+ }
2430
+ }
2431
+ function deserializeTermIndexTree(shape) {
2432
+ const tree = new Map();
2433
+ for (const [key, value] of shape) {
2434
+ if (key === LEAF) {
2435
+ tree.set(LEAF, value);
2436
+ }
2437
+ else {
2438
+ tree.set(key, deserializeTermIndexTree(value));
2439
+ }
2440
+ }
2441
+ return tree;
2442
+ }
2443
+
2444
+ /**
2445
+ * Runtime stored fields. Single store field → one column (no per-doc Record at rest).
2446
+ * Wire format stays row JSON; encode/decode can skip intermediate row arrays when layout is known.
2447
+ */
2448
+ function createStoredFieldsLayout(storeFields, capacity = 0) {
2449
+ if (storeFields.length === 0)
2450
+ return { kind: 'none' };
2451
+ if (storeFields.length === 1) {
2452
+ return { kind: 'single', field: storeFields[0], values: new Array(capacity) };
2453
+ }
2454
+ return { kind: 'multi', rows: new Array(capacity) };
2455
+ }
2456
+ function writeStoredField(layout, shortId, storeFields, extractField, document) {
2457
+ if (layout.kind === 'none')
2458
+ return;
2459
+ if (layout.kind === 'single') {
2460
+ layout.values[shortId] = extractField(document, layout.field);
2461
+ return;
2228
2462
  }
2229
- return fieldIds;
2463
+ const row = {};
2464
+ for (const name of storeFields) {
2465
+ const value = extractField(document, name);
2466
+ if (value !== undefined)
2467
+ row[name] = value;
2468
+ }
2469
+ layout.rows[shortId] = row;
2230
2470
  }
2231
- /** Token frequencies for one document field (after processTerm). */
2232
- function collectFieldTermFreqs(tokens, fieldName, processTerm) {
2233
- const localFreqs = new Map();
2234
- for (const term of tokens) {
2235
- const processedTerm = processTerm(term, fieldName);
2236
- if (Array.isArray(processedTerm)) {
2237
- for (const t of processedTerm) {
2238
- localFreqs.set(t, (localFreqs.get(t) || 0) + 1);
2239
- }
2240
- }
2241
- else if (processedTerm) {
2242
- localFreqs.set(processedTerm, (localFreqs.get(processedTerm) || 0) + 1);
2471
+ /** Materialize API/wire row for one document. */
2472
+ function readStoredFields(layout, shortId) {
2473
+ if (layout.kind === 'none')
2474
+ return undefined;
2475
+ if (layout.kind === 'multi')
2476
+ return layout.rows[shortId];
2477
+ const value = layout.values[shortId];
2478
+ if (value === undefined)
2479
+ return {};
2480
+ return { [layout.field]: value };
2481
+ }
2482
+ function resizeStoredFields(layout, length) {
2483
+ if (layout.kind === 'none')
2484
+ return layout;
2485
+ if (layout.kind === 'single') {
2486
+ return layout.values.length <= length
2487
+ ? layout
2488
+ : { kind: 'single', field: layout.field, values: layout.values.slice(0, length) };
2489
+ }
2490
+ return layout.rows.length <= length
2491
+ ? layout
2492
+ : { kind: 'multi', rows: layout.rows.slice(0, length) };
2493
+ }
2494
+ function cloneStoredFields(layout) {
2495
+ if (layout.kind === 'none')
2496
+ return layout;
2497
+ if (layout.kind === 'single') {
2498
+ return { kind: 'single', field: layout.field, values: layout.values.slice() };
2499
+ }
2500
+ return { kind: 'multi', rows: layout.rows.slice() };
2501
+ }
2502
+ /** Import from wire rows or lucaong snapshot. Empty storeFields + non-empty rows → multi (binary load without options). */
2503
+ function storedFieldsFromRows(rows, storeFields) {
2504
+ if (storeFields.length === 0) {
2505
+ const hasAny = rows.some(row => row != null && Object.keys(row).length > 0);
2506
+ return hasAny ? { kind: 'multi', rows } : { kind: 'none' };
2507
+ }
2508
+ if (storeFields.length === 1) {
2509
+ const field = storeFields[0];
2510
+ const values = rows.map(row => row === null || row === void 0 ? void 0 : row[field]);
2511
+ return { kind: 'single', field, values };
2512
+ }
2513
+ return { kind: 'multi', rows };
2514
+ }
2515
+ function storedFieldsJsonBytes(layout) {
2516
+ if (layout.kind === 'none')
2517
+ return 0;
2518
+ if (layout.kind === 'multi') {
2519
+ let total = 0;
2520
+ for (const row of layout.rows) {
2521
+ if (row != null)
2522
+ total += JSON.stringify(row).length;
2243
2523
  }
2524
+ return total;
2244
2525
  }
2245
- return localFreqs;
2526
+ let total = 0;
2527
+ const { field, values } = layout;
2528
+ for (let i = 0; i < values.length; i++) {
2529
+ const value = values[i];
2530
+ if (value !== undefined)
2531
+ total += JSON.stringify({ [field]: value }).length;
2532
+ }
2533
+ return total;
2246
2534
  }
2247
- /** Same running average as {@link MiniSearch} private addFieldLength. */
2248
- function updateAvgFieldLength(avgFieldLength, fieldId, count, length) {
2249
- const averageFieldLength = avgFieldLength[fieldId] || 0;
2250
- const totalFieldLength = (averageFieldLength * count) + length;
2251
- avgFieldLength[fieldId] = totalFieldLength / (count + 1);
2535
+ function storedFieldsSlotCount(layout) {
2536
+ if (layout.kind === 'none')
2537
+ return 0;
2538
+ return layout.kind === 'single' ? layout.values.length : layout.rows.length;
2539
+ }
2540
+ function appendStoredFieldJsonEntry(table, heapChunks, heapOffRef, docIndex, jsonUtf8) {
2541
+ table.writeUInt32LE(heapOffRef.value + 1, docIndex * 4);
2542
+ const entry = Buffer.alloc(4 + jsonUtf8.length);
2543
+ entry.writeUInt32LE(jsonUtf8.length, 0);
2544
+ jsonUtf8.copy(entry, 4);
2545
+ heapChunks.push(entry);
2546
+ heapOffRef.value += entry.length;
2547
+ }
2548
+ /** MSv5 StoredFields section from {@link StoredFieldsLayout} (no intermediate row array). */
2549
+ function buildStoredFieldsWireSection(layout, nextId) {
2550
+ if (layout.kind === 'multi') {
2551
+ const rows = layout.rows.length >= nextId
2552
+ ? layout.rows
2553
+ : layout.rows.concat(new Array(nextId - layout.rows.length));
2554
+ return buildStoredFieldsSection(rows, nextId);
2555
+ }
2556
+ const table = Buffer.alloc(nextId * 4);
2557
+ if (layout.kind === 'none')
2558
+ return table;
2559
+ const heapChunks = [];
2560
+ const heapOffRef = { value: 0 };
2561
+ const { field, values } = layout;
2562
+ for (let i = 0; i < nextId; i++) {
2563
+ const value = values[i];
2564
+ if (value === undefined)
2565
+ continue;
2566
+ const jsonUtf8 = Buffer.from(JSON.stringify({ [field]: value }), 'utf8');
2567
+ appendStoredFieldJsonEntry(table, heapChunks, heapOffRef, i, jsonUtf8);
2568
+ }
2569
+ return heapChunks.length === 0 ? table : Buffer.concat([table, ...heapChunks]);
2252
2570
  }
2253
- function saveStoredFieldsForDocument(storeFields, extractField, document) {
2254
- if (storeFields.length === 0)
2255
- return undefined;
2256
- const documentFields = {};
2257
- for (const fieldName of storeFields) {
2258
- const fieldValue = extractField(document, fieldName);
2259
- if (fieldValue !== undefined)
2260
- documentFields[fieldName] = fieldValue;
2571
+ function storedFieldsTableEnd(storedOff, nextId, sectionEnd) {
2572
+ const tableEnd = storedOff + nextId * 4;
2573
+ if (tableEnd > sectionEnd) {
2574
+ throw invalidFrozenIndex('stored fields table out of bounds');
2575
+ }
2576
+ return tableEnd;
2577
+ }
2578
+ function readStoredFieldJsonAt(buf, tableEnd, sectionEnd, rel) {
2579
+ const entryOff = tableEnd + rel - 1;
2580
+ if (entryOff + 4 > sectionEnd) {
2581
+ throw invalidFrozenIndex('stored fields entry offset out of bounds');
2582
+ }
2583
+ const jsonLen = buf.readUInt32LE(entryOff);
2584
+ const jsonStart = entryOff + 4;
2585
+ const jsonEnd = jsonStart + jsonLen;
2586
+ if (jsonEnd > sectionEnd) {
2587
+ throw invalidFrozenIndex('stored fields JSON out of bounds');
2588
+ }
2589
+ return JSON.parse(buf.toString('utf8', jsonStart, jsonEnd));
2590
+ }
2591
+ /** MSv5 StoredFields section → layout (skips row materialization when storeFields hint allows). */
2592
+ function readStoredFieldsWireSection(buf, storedOff, nextId, sectionEnd, storeFields) {
2593
+ const tableEnd = storedFieldsTableEnd(storedOff, nextId, sectionEnd);
2594
+ if (storeFields.length === 1) {
2595
+ const field = storeFields[0];
2596
+ const values = new Array(nextId);
2597
+ for (let i = 0; i < nextId; i++) {
2598
+ const rel = buf.readUInt32LE(storedOff + i * 4);
2599
+ if (rel === 0)
2600
+ continue;
2601
+ const row = readStoredFieldJsonAt(buf, tableEnd, sectionEnd, rel);
2602
+ values[i] = row[field];
2603
+ }
2604
+ return { kind: 'single', field, values };
2605
+ }
2606
+ if (storeFields.length === 0) {
2607
+ let hasAny = false;
2608
+ for (let i = 0; i < nextId; i++) {
2609
+ if (buf.readUInt32LE(storedOff + i * 4) !== 0) {
2610
+ hasAny = true;
2611
+ break;
2612
+ }
2613
+ }
2614
+ if (!hasAny)
2615
+ return { kind: 'none' };
2261
2616
  }
2262
- return documentFields;
2617
+ const rows = readStoredFieldsSection(buf, storedOff, nextId, sectionEnd);
2618
+ return storedFieldsFromRows(rows, storeFields);
2263
2619
  }
2264
2620
 
2265
2621
  const SUPPORTED_SERIALIZATION_VERSIONS = new Set([1, 2]);
@@ -2345,7 +2701,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
2345
2701
  let shortIdRemap = null;
2346
2702
  const resolvedNextId = useDense ? documentCount : nextId;
2347
2703
  const externalIds = new Array(resolvedNextId);
2348
- const storedFields = new Array(externalIds.length);
2704
+ const storedFieldRows = new Array(externalIds.length);
2349
2705
  if (useDense) {
2350
2706
  shortIdRemap = new Uint32Array(nextId);
2351
2707
  shortIdRemap.fill(DISCARDED_DOC_ID);
@@ -2357,7 +2713,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
2357
2713
  const shortIdStr = String(shortId);
2358
2714
  shortIdRemap[shortId] = dense;
2359
2715
  externalIds[dense] = snapshot.documentIds[shortIdStr];
2360
- storedFields[dense] = snapshot.storedFields[shortIdStr];
2716
+ storedFieldRows[dense] = snapshot.storedFields[shortIdStr];
2361
2717
  dense++;
2362
2718
  }
2363
2719
  }
@@ -2365,7 +2721,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
2365
2721
  for (const [shortIdStr, id] of Object.entries(snapshot.documentIds)) {
2366
2722
  const shortId = parseInt(shortIdStr, 10);
2367
2723
  externalIds[shortId] = id;
2368
- storedFields[shortId] = snapshot.storedFields[shortIdStr];
2724
+ storedFieldRows[shortId] = snapshot.storedFields[shortIdStr];
2369
2725
  }
2370
2726
  }
2371
2727
  const idLookup = createIdToShortIdLookup(externalIds, resolvedNextId);
@@ -2388,6 +2744,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
2388
2744
  }
2389
2745
  const searchableMap = buildSearchableMapFromSnapshot(snapshot);
2390
2746
  const flat = buildFlatPostingsFromSearchableMap(searchableMap, fieldCount, resolvedNextId, shortIdRemap);
2747
+ const storedFields = storedFieldsFromRows(storedFieldRows, opts.storeFields);
2391
2748
  return {
2392
2749
  options: opts,
2393
2750
  documentCount,
@@ -2689,321 +3046,121 @@ function collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32)
2689
3046
  sectionId++;
2690
3047
  }
2691
3048
  }
2692
- }
2693
- function finish() {
2694
- emitEmptySections();
2695
- if (streamOffset !== uncompressedLength || sectionId !== directory.length) {
2696
- throw new Error('MSv5 zstd decompressed length mismatch');
2697
- }
2698
- if (payloadCrc !== payloadCrc32) {
2699
- throw new Error('MSv5 payload CRC mismatch');
2700
- }
2701
- }
2702
- return { sections, consume, finish };
2703
- }
2704
- function loadMsv5SectionsFromZstdStream(compressed, directory, uncompressedLength, payloadCrc32) {
2705
- return new Promise((resolve, reject) => {
2706
- const collector = collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32);
2707
- const stream = zlib.createZstdDecompress();
2708
- stream.on('data', (chunk) => {
2709
- try {
2710
- collector.consume(chunk);
2711
- }
2712
- catch (err) {
2713
- stream.destroy(err);
2714
- }
2715
- });
2716
- stream.on('error', reject);
2717
- stream.on('end', () => {
2718
- try {
2719
- collector.finish();
2720
- resolve(collector.sections);
2721
- }
2722
- catch (err) {
2723
- reject(err);
2724
- }
2725
- });
2726
- stream.end(compressed);
2727
- });
2728
- }
2729
- function validatePayloadDirectory(directory, uncompressedLength) {
2730
- let prevEnd = 0;
2731
- for (const entry of directory) {
2732
- if ((entry.fileOffset & 3) !== 0) {
2733
- throw new Error('MSv5 section offset not aligned');
2734
- }
2735
- if (entry.fileOffset < prevEnd) {
2736
- throw new Error('MSv5 section offsets not monotonic');
2737
- }
2738
- if (entry.fileOffset + entry.uncompressedLength > uncompressedLength) {
2739
- throw new Error('MSv5 section out of uncompressed bounds');
2740
- }
2741
- prevEnd = entry.fileOffset + entry.uncompressedLength;
2742
- }
2743
- if (prevEnd !== uncompressedLength) {
2744
- throw new Error('MSv5 uncompressed payload length mismatch');
2745
- }
2746
- }
2747
- /** Shared validation + bounds for both the sync and async load paths. */
2748
- function preparePayload(fileBuf, directory) {
2749
- assertPayloadFormatRev(fileBuf);
2750
- const { payloadOffset, compressedLength, uncompressedLength, payloadCrc32, payloadCodec } = readPayloadMeta(fileBuf);
2751
- validatePayloadDirectory(directory, uncompressedLength);
2752
- if (payloadOffset !== MSV5_HEADER_SIZE || payloadOffset + compressedLength > fileBuf.length) {
2753
- throw new Error('MSv5 payload out of bounds');
2754
- }
2755
- if (payloadCodec === CODEC_RAW && compressedLength !== uncompressedLength) {
2756
- throw new Error('MSv5 raw payload length mismatch');
2757
- }
2758
- return {
2759
- payloadCodec,
2760
- slice: fileBuf.subarray(payloadOffset, payloadOffset + compressedLength),
2761
- uncompressedLength,
2762
- payloadCrc32,
2763
- };
2764
- }
2765
- /** Synchronous load; peak RAM ≈ full uncompressed payload (use the async path to bound it). */
2766
- function loadMsv5Sections(fileBuf, directory) {
2767
- const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
2768
- if (payloadCodec === CODEC_RAW) {
2769
- return sectionsFromPayload(slice, directory, payloadCrc32);
2770
- }
2771
- if (payloadCodec === CODEC_ZSTD) {
2772
- if (!zstdAvailable()) {
2773
- throw zstdUnavailableReadError();
2774
- }
2775
- // Native cap matches readPayloadMeta's 1 GiB limit (see MSV5_MAX_UNCOMPRESSED_BYTES).
2776
- // Using header `uncompressedLength` here would only help when the header understates
2777
- // the zstd stream but the attacker can inflate the header too — same worst case.
2778
- const decoded = zlib.zstdDecompressSync(slice, {
2779
- maxOutputLength: MSV5_MAX_UNCOMPRESSED_BYTES,
2780
- });
2781
- if (decoded.length !== uncompressedLength) {
2782
- throw new Error('MSv5 zstd decompressed length mismatch');
2783
- }
2784
- return sectionsFromPayload(decoded, directory, payloadCrc32);
2785
- }
2786
- throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
2787
- }
2788
- /** Streaming load; peak main-thread RAM ≈ largest single section (+ file buffer). */
2789
- async function loadMsv5SectionsAsync(fileBuf, directory) {
2790
- const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
2791
- if (payloadCodec === CODEC_RAW) {
2792
- return sectionsFromPayload(slice, directory, payloadCrc32);
2793
- }
2794
- if (payloadCodec === CODEC_ZSTD) {
2795
- if (!zstdAvailable()) {
2796
- throw zstdUnavailableReadError();
2797
- }
2798
- return loadMsv5SectionsFromZstdStream(slice, directory, uncompressedLength, payloadCrc32);
2799
- }
2800
- throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
2801
- }
2802
- function isMsv5Buffer(buf) {
2803
- return buf.length >= 4 && buf.toString('ascii', 0, 4) === 'MSv5';
2804
- }
2805
- function readMsv5GlobalFlags(buf) {
2806
- return buf.readUInt16LE(6);
2807
- }
2808
-
2809
- function validateTreeShape(shape, termCount) {
2810
- if (!Array.isArray(shape)) {
2811
- throw invalidFrozenIndex('treeShape node must be an array');
2812
- }
2813
- for (const entry of shape) {
2814
- if (!Array.isArray(entry) || entry.length !== 2) {
2815
- throw invalidFrozenIndex('treeShape entry must be a [key, value] pair');
2816
- }
2817
- const [key, value] = entry;
2818
- if (key === LEAF) {
2819
- const idx = value;
2820
- if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
2821
- throw invalidFrozenIndex(`treeShape leaf term index out of range: ${idx}`);
2822
- }
2823
- }
2824
- else {
2825
- validateTreeShape(value, termCount);
2826
- }
2827
- }
2828
- }
2829
- function termCountOf(snap) {
2830
- return snap.postings.termCount;
2831
- }
2832
- /**
2833
- * Numeric/structural invariants shared by both the decode path (untrusted binary)
2834
- * and the build path (trusted internal code).
2835
- */
2836
- function validateFrozenSnapshotNumeric(snap) {
2837
- if (snap.fieldCount <= 0) {
2838
- throw invalidFrozenIndex('fieldCount must be positive');
2839
- }
2840
- if (snap.nextId < 0 || snap.nextId >= 0xffffffff) {
2841
- throw invalidFrozenIndex('nextId out of range');
2842
- }
2843
- if (snap.documentCount < 0 || snap.documentCount > snap.nextId) {
2844
- throw invalidFrozenIndex('documentCount inconsistent with nextId');
2845
- }
2846
- if (snap.fieldLengthMatrix.length !== snap.nextId * snap.fieldCount) {
2847
- throw invalidFrozenIndex('fieldLengthMatrix size mismatch');
2848
- }
2849
- if (snap.avgFieldLength.length !== snap.fieldCount) {
2850
- throw invalidFrozenIndex('avgFieldLength size mismatch');
2851
- }
2852
- validateFrozenPostingsLayout(snap.postings, snap.documentCount, snap.nextId, detail => {
2853
- throw invalidFrozenIndex(detail);
2854
- });
2855
- const indexedFields = Object.keys(snap.fieldIds);
2856
- if (indexedFields.length !== snap.fieldCount) {
2857
- throw invalidFrozenIndex('fieldIds count mismatch');
2858
- }
2859
- for (let f = 0; f < snap.fieldCount; f++) {
2860
- const found = indexedFields.some(name => snap.fieldIds[name] === f);
2861
- if (!found) {
2862
- throw invalidFrozenIndex(`missing field id ${f}`);
2863
- }
2864
- }
2865
- }
2866
- function readFieldNamesSection(buf, fieldNamesOff, fieldCount, externalIdsOff) {
2867
- const fieldNames = [];
2868
- let o = fieldNamesOff;
2869
- for (let f = 0; f < fieldCount; f++) {
2870
- const { value, next } = readLengthPrefixedUtf8(buf, o);
2871
- fieldNames.push(value);
2872
- o = next;
2873
- }
2874
- if (o !== externalIdsOff) {
2875
- throw invalidFrozenIndex('field names section size mismatch');
2876
- }
2877
- return fieldNames;
2878
- }
2879
- function readExternalIdsSection(buf, externalIdsOff, nextId, storedOff) {
2880
- const externalIds = new Array(nextId);
2881
- let o = externalIdsOff;
2882
- for (let i = 0; i < nextId; i++) {
2883
- const { value, next } = readExternalId(buf, o);
2884
- externalIds[i] = value;
2885
- o = next;
2886
- }
2887
- if (o !== storedOff) {
2888
- throw invalidFrozenIndex('external ids section size mismatch');
2889
- }
2890
- return externalIds;
2891
- }
2892
- function readStoredFieldsSection(buf, storedOff, nextId, sectionEnd) {
2893
- const storedFields = new Array(nextId);
2894
- const tableEnd = storedOff + nextId * 4;
2895
- if (tableEnd > sectionEnd) {
2896
- throw invalidFrozenIndex('stored fields table out of bounds');
2897
- }
2898
- for (let i = 0; i < nextId; i++) {
2899
- const rel = buf.readUInt32LE(storedOff + i * 4);
2900
- if (rel === 0) {
2901
- storedFields[i] = undefined;
2902
- continue;
2903
- }
2904
- const entryOff = tableEnd + rel - 1;
2905
- if (entryOff + 4 > sectionEnd) {
2906
- throw invalidFrozenIndex('stored fields entry offset out of bounds');
2907
- }
2908
- const jsonLen = buf.readUInt32LE(entryOff);
2909
- const jsonStart = entryOff + 4;
2910
- const jsonEnd = jsonStart + jsonLen;
2911
- if (jsonEnd > sectionEnd) {
2912
- throw invalidFrozenIndex('stored fields JSON out of bounds');
3049
+ }
3050
+ function finish() {
3051
+ emitEmptySections();
3052
+ if (streamOffset !== uncompressedLength || sectionId !== directory.length) {
3053
+ throw new Error('MSv5 zstd decompressed length mismatch');
3054
+ }
3055
+ if (payloadCrc !== payloadCrc32) {
3056
+ throw new Error('MSv5 payload CRC mismatch');
2913
3057
  }
2914
- storedFields[i] = JSON.parse(buf.toString('utf8', jsonStart, jsonEnd));
2915
3058
  }
2916
- return storedFields;
3059
+ return { sections, consume, finish };
2917
3060
  }
2918
- /** Validate structural invariants of a decoded or assembled frozen snapshot. */
2919
- function validateFrozenSnapshot(snap) {
2920
- validateFrozenSnapshotNumeric(snap);
2921
- const termCount = termCountOf(snap);
2922
- if (snap.packedTermIndex != null) {
2923
- validateFrozenTermIndexLeaves(snap.packedTermIndex, termCount);
2924
- }
2925
- else if (snap.termTree != null) {
2926
- validateTermTreeLeaves(snap.termTree, termCount);
3061
/**
 * Decompress a zstd payload through a stream and hand every decoded chunk to
 * a section collector.
 *
 * @param compressed - Compressed payload bytes written to the decompressor.
 * @param directory - Section directory forwarded to the collector.
 * @param uncompressedLength - Expected decoded size, forwarded to the collector.
 * @param payloadCrc32 - Expected payload checksum, forwarded to the collector.
 * @returns Promise resolving to `collector.sections` once the stream ends and
 *   `collector.finish()` succeeds; rejects on any stream or collector error.
 */
function loadMsv5SectionsFromZstdStream(compressed, directory, uncompressedLength, payloadCrc32) {
    return new Promise((resolve, reject) => {
        const collector = collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32);
        const decompressor = zlib.createZstdDecompress();
        const onChunk = (chunk) => {
            try {
                collector.consume(chunk);
            }
            catch (err) {
                // Destroying with the error surfaces it through the 'error' event below.
                decompressor.destroy(err);
            }
        };
        const onEnd = () => {
            let sections;
            try {
                collector.finish();
                sections = collector.sections;
            }
            catch (err) {
                reject(err);
                return;
            }
            resolve(sections);
        };
        decompressor.on('data', onChunk);
        decompressor.on('error', reject);
        decompressor.on('end', onEnd);
        decompressor.end(compressed);
    });
}
3086
/**
 * Verify that the section directory is consistent with the uncompressed payload.
 *
 * Each section must start on a 4-byte boundary, must not start before the end
 * of the previous section, and must end inside the payload. The last section
 * must end exactly at `uncompressedLength` (an empty directory therefore
 * requires a length of 0).
 *
 * @param directory - Iterable of entries exposing `fileOffset` and `uncompressedLength`.
 * @param uncompressedLength - Total size of the uncompressed payload in bytes.
 * @throws {Error} On the first violated invariant.
 */
function validatePayloadDirectory(directory, uncompressedLength) {
    let cursor = 0;
    for (const { fileOffset: start, uncompressedLength: size } of directory) {
        const end = start + size;
        if ((start & 0b11) !== 0) {
            throw new Error('MSv5 section offset not aligned');
        }
        if (start < cursor) {
            throw new Error('MSv5 section offsets not monotonic');
        }
        if (end > uncompressedLength) {
            throw new Error('MSv5 section out of uncompressed bounds');
        }
        cursor = end;
    }
    if (cursor === uncompressedLength) {
        return;
    }
    throw new Error('MSv5 uncompressed payload length mismatch');
}
2932
- function fieldNamesFromFieldIds(fieldIds) {
2933
- const names = Object.keys(fieldIds);
2934
- names.sort((a, b) => fieldIds[a] - fieldIds[b]);
2935
- return names;
2936
- }
2937
- /** Core with explicit {@link termCountOf} (no dictionary section). */
2938
- function buildCoreSectionWithTermCount(snap) {
2939
- const out = Buffer.alloc(16);
2940
- out.writeUInt32LE(snap.documentCount, 0);
2941
- out.writeUInt32LE(snap.nextId, 4);
2942
- out.writeUInt32LE(snap.fieldCount, 8);
2943
- out.writeUInt32LE(termCountOf(snap), 12);
2944
- return out;
2945
- }
2946
- function buildFieldNamesSection(fieldNames) {
2947
- const chunks = [];
2948
- for (const name of fieldNames) {
2949
- const body = Buffer.from(name, 'utf8');
2950
- const header = Buffer.alloc(4);
2951
- header.writeUInt32LE(body.length, 0);
2952
- chunks.push(header, body);
3104
/**
 * Common front half of the sync and async load paths: checks the payload
 * format revision, reads the payload metadata, validates the directory
 * against the declared uncompressed length and bounds-checks the payload.
 *
 * @param fileBuf - Whole file buffer.
 * @param directory - Section directory to validate.
 * @returns `{ payloadCodec, slice, uncompressedLength, payloadCrc32 }` where
 *   `slice` is a view (not a copy) over the payload bytes of `fileBuf`.
 * @throws {Error} When the payload does not start right after the header,
 *   extends past the file, or is raw with mismatching lengths.
 */
function preparePayload(fileBuf, directory) {
    assertPayloadFormatRev(fileBuf);
    const meta = readPayloadMeta(fileBuf);
    const { payloadOffset, compressedLength, uncompressedLength, payloadCrc32, payloadCodec } = meta;
    validatePayloadDirectory(directory, uncompressedLength);
    const payloadEnd = payloadOffset + compressedLength;
    const startsAfterHeader = payloadOffset === MSV5_HEADER_SIZE;
    if (!startsAfterHeader || payloadEnd > fileBuf.length) {
        throw new Error('MSv5 payload out of bounds');
    }
    // A raw payload is stored as-is, so both lengths must agree.
    const storedRaw = payloadCodec === CODEC_RAW;
    if (storedRaw && compressedLength !== uncompressedLength) {
        throw new Error('MSv5 raw payload length mismatch');
    }
    const slice = fileBuf.subarray(payloadOffset, payloadEnd);
    return { payloadCodec, slice, uncompressedLength, payloadCrc32 };
}
2963
- function buildStoredFieldsSection(storedFields, nextId) {
2964
- const table = Buffer.alloc(nextId * 4);
2965
- const heapChunks = [];
2966
- let heapOff = 0;
2967
- for (let i = 0; i < nextId; i++) {
2968
- const row = storedFields[i];
2969
- if (row == null) {
2970
- table.writeUInt32LE(0, i * 4);
2971
- continue;
2972
- }
2973
- table.writeUInt32LE(heapOff + 1, i * 4);
2974
- const json = Buffer.from(JSON.stringify(row), 'utf8');
2975
- const entry = Buffer.alloc(4 + json.length);
2976
- entry.writeUInt32LE(json.length, 0);
2977
- json.copy(entry, 4);
2978
- heapChunks.push(entry);
2979
- heapOff += entry.length;
3122
/**
 * Synchronous section load. The zstd branch decodes the whole payload into a
 * single buffer before splitting it, so peak RAM is about the full
 * uncompressed payload (the async path exists to bound that).
 *
 * @param fileBuf - Whole file buffer.
 * @param directory - Section directory.
 * @returns Whatever `sectionsFromPayload` produces for the decoded payload.
 * @throws {Error} On an unknown codec, unavailable zstd, or a decoded length
 *   that differs from the declared uncompressed length.
 */
function loadMsv5Sections(fileBuf, directory) {
    const prepared = preparePayload(fileBuf, directory);
    const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = prepared;
    switch (payloadCodec) {
        case CODEC_RAW:
            return sectionsFromPayload(slice, directory, payloadCrc32);
        case CODEC_ZSTD: {
            if (!zstdAvailable()) {
                throw zstdUnavailableReadError();
            }
            // Native cap matches readPayloadMeta's 1 GiB limit (see MSV5_MAX_UNCOMPRESSED_BYTES).
            // Capping at the header's `uncompressedLength` would only help when the header
            // understates the zstd stream, but an attacker can inflate the header too, so
            // the worst case is the same.
            const inflated = zlib.zstdDecompressSync(slice, {
                maxOutputLength: MSV5_MAX_UNCOMPRESSED_BYTES,
            });
            if (inflated.length !== uncompressedLength) {
                throw new Error('MSv5 zstd decompressed length mismatch');
            }
            return sectionsFromPayload(inflated, directory, payloadCrc32);
        }
        default:
            throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
    }
}
2996
- function deserializeTermIndexTree(shape) {
2997
- const tree = new Map();
2998
- for (const [key, value] of shape) {
2999
- if (key === LEAF) {
3000
- tree.set(LEAF, value);
3001
- }
3002
- else {
3003
- tree.set(key, deserializeTermIndexTree(value));
3145
/**
 * Asynchronous section load. Raw payloads are split directly; zstd payloads
 * are decoded through the streaming loader, which keeps peak main-thread RAM
 * near the largest single section (plus the file buffer).
 *
 * @param fileBuf - Whole file buffer.
 * @param directory - Section directory.
 * @returns Promise of the decoded sections.
 * @throws {Error} (as a rejection) on an unknown codec or unavailable zstd.
 */
async function loadMsv5SectionsAsync(fileBuf, directory) {
    const prepared = preparePayload(fileBuf, directory);
    const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = prepared;
    switch (payloadCodec) {
        case CODEC_RAW:
            return sectionsFromPayload(slice, directory, payloadCrc32);
        case CODEC_ZSTD:
            if (!zstdAvailable()) {
                throw zstdUnavailableReadError();
            }
            return loadMsv5SectionsFromZstdStream(slice, directory, uncompressedLength, payloadCrc32);
        default:
            throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
    }
}
3159
/**
 * Tell whether a buffer begins with the 4-character `MSv5` magic.
 *
 * @param buf - Buffer to inspect.
 * @returns `true` when the buffer has at least 4 bytes and they decode (as
 *   ASCII) to `MSv5`.
 */
function isMsv5Buffer(buf) {
    if (buf.length < 4) {
        return false;
    }
    const magic = buf.toString('ascii', 0, 4);
    return magic === 'MSv5';
}
3162
/**
 * Read the 16-bit little-endian global flags word stored at byte offset 6.
 *
 * @param buf - Buffer to read from.
 * @returns The unsigned 16-bit flags value.
 */
function readMsv5GlobalFlags(buf) {
    const flagsOffset = 6;
    const flags = buf.readUInt16LE(flagsOffset);
    return flags;
}
3008
3165
 
3009
3166
  /** Global wire flags for {@link FreqArray} width. */
@@ -3297,11 +3454,14 @@ function encodeFrozenSnapshotMsv5(snap, termTree, packedTermIndex) {
3297
3454
  const flFlags = fieldLengthMatrixWireFlags(snap.fieldLengthMatrix);
3298
3455
  const freqFlags = freqWireFlags(snap.postings.allFreqs);
3299
3456
  const globalFlags = postingsWire.flags | flFlags | freqFlags;
3457
+ const storedFieldsSection = snap.storedFieldsLayout != null
3458
+ ? buildStoredFieldsWireSection(snap.storedFieldsLayout, snap.nextId)
3459
+ : buildStoredFieldsSection(snap.storedFields, snap.nextId);
3300
3460
  const rawSections = [
3301
3461
  buildCoreSectionWithTermCount(snap),
3302
3462
  buildFieldNamesSection(fieldNames),
3303
3463
  buildExternalIdsSection(snap.externalIds, snap.nextId),
3304
- buildStoredFieldsSection(snap.storedFields, snap.nextId),
3464
+ storedFieldsSection,
3305
3465
  buildTermTreeSectionColumnar(packed),
3306
3466
  bufferFromView(snap.avgFieldLength),
3307
3467
  buildFieldLengthMatrixSection(snap.fieldLengthMatrix),
@@ -3325,11 +3485,14 @@ async function encodeFrozenSnapshotMsv5Async(snap, termTree, packedTermIndex) {
3325
3485
  const flFlags = fieldLengthMatrixWireFlags(snap.fieldLengthMatrix);
3326
3486
  const freqFlags = freqWireFlags(snap.postings.allFreqs);
3327
3487
  const globalFlags = postingsWire.flags | flFlags | freqFlags;
3488
+ const storedFieldsSection = snap.storedFieldsLayout != null
3489
+ ? buildStoredFieldsWireSection(snap.storedFieldsLayout, snap.nextId)
3490
+ : buildStoredFieldsSection(snap.storedFields, snap.nextId);
3328
3491
  const rawSections = [
3329
3492
  buildCoreSectionWithTermCount(snap),
3330
3493
  buildFieldNamesSection(fieldNames),
3331
3494
  buildExternalIdsSection(snap.externalIds, snap.nextId),
3332
- buildStoredFieldsSection(snap.storedFields, snap.nextId),
3495
+ storedFieldsSection,
3333
3496
  buildTermTreeSectionColumnar(packed),
3334
3497
  bufferFromView(snap.avgFieldLength),
3335
3498
  buildFieldLengthMatrixSection(snap.fieldLengthMatrix),
@@ -3359,7 +3522,7 @@ function validateMsv5Container(buf) {
3359
3522
  }
3360
3523
  return { globalFlags, directory };
3361
3524
  }
3362
- function decodeMsv5Sections(globalFlags, sections) {
3525
+ function decodeMsv5Sections(globalFlags, sections, hints) {
3363
3526
  const core = sections[0 /* Msv5SectionId.Core */];
3364
3527
  if (core.length !== 16) {
3365
3528
  throw invalidFrozenIndex('core section size mismatch');
@@ -3374,7 +3537,12 @@ function decodeMsv5Sections(globalFlags, sections) {
3374
3537
  fieldIds[fieldNames[f]] = f;
3375
3538
  }
3376
3539
  const externalIds = readExternalIdsSection(sections[2 /* Msv5SectionId.ExternalIds */], 0, nextId, sections[2 /* Msv5SectionId.ExternalIds */].length);
3377
- const storedFields = readStoredFieldsSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length);
3540
+ const storedFieldsLayout = hints != null
3541
+ ? readStoredFieldsWireSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length, hints.storeFields)
3542
+ : undefined;
3543
+ const storedFields = storedFieldsLayout != null
3544
+ ? new Array(nextId)
3545
+ : readStoredFieldsSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length);
3378
3546
  const packedTermIndex = readPackedTermTreeSectionColumnar(sections[4 /* Msv5SectionId.TermTree */], termCount);
3379
3547
  const avgBuf = sections[5 /* Msv5SectionId.AvgFieldLength */];
3380
3548
  const avgFieldLength = readFloat32Array(avgBuf, 0, avgBuf.length);
@@ -3392,6 +3560,7 @@ function decodeMsv5Sections(globalFlags, sections) {
3392
3560
  avgFieldLength,
3393
3561
  externalIds,
3394
3562
  storedFields,
3563
+ storedFieldsLayout,
3395
3564
  fieldLengthMatrix,
3396
3565
  treeShape: [],
3397
3566
  packedTermIndex,
@@ -3400,13 +3569,13 @@ function decodeMsv5Sections(globalFlags, sections) {
3400
3569
  validateFrozenSnapshot(snap);
3401
3570
  return snap;
3402
3571
  }
3403
- function decodeFrozenSnapshotMsv5(buf) {
3572
/**
 * Synchronously decode an MSv5 buffer: validate the container, load its
 * sections, then decode them.
 *
 * @param buf - MSv5 file buffer.
 * @param hints - Optional decode hints, passed through to `decodeMsv5Sections`.
 * @returns The decoded snapshot.
 */
function decodeFrozenSnapshotMsv5(buf, hints) {
    const container = validateMsv5Container(buf);
    const sections = loadMsv5Sections(buf, container.directory);
    return decodeMsv5Sections(container.globalFlags, sections, hints);
}
3407
- async function decodeFrozenSnapshotMsv5Async(buf) {
3576
/**
 * Asynchronously decode an MSv5 buffer: validate the container, await the
 * async section loader, then decode the sections.
 *
 * @param buf - MSv5 file buffer.
 * @param hints - Optional decode hints, passed through to `decodeMsv5Sections`.
 * @returns Promise of the decoded snapshot.
 */
async function decodeFrozenSnapshotMsv5Async(buf, hints) {
    const container = validateMsv5Container(buf);
    const sections = await loadMsv5SectionsAsync(buf, container.directory);
    return decodeMsv5Sections(container.globalFlags, sections, hints);
}
3411
3580
 
3412
3581
  /** Encode a frozen snapshot as a binary buffer. */
@@ -3420,12 +3589,12 @@ function encodeFrozenSnapshotAsync(snap, termTree, packedTermIndex) {
3420
3589
 
3421
3590
  const LEGACY_MAGICS = new Set(['MSv1', 'MSv2', 'MSv3', 'MSv4']);
3422
3591
  /** Decode a frozen binary snapshot buffer. */
3423
- function decodeFrozenSnapshot(buf) {
3592
+ function decodeFrozenSnapshot(buf, hints) {
3424
3593
  assertBufferLength(buf, 8);
3425
3594
  const magic = buf.toString('ascii', 0, 4);
3426
3595
  const version = buf.readUInt16LE(4);
3427
3596
  if (isMsv5Buffer(buf) && version === 5) {
3428
- return decodeFrozenSnapshotMsv5(buf);
3597
+ return decodeFrozenSnapshotMsv5(buf, hints);
3429
3598
  }
3430
3599
  if (LEGACY_MAGICS.has(magic)) {
3431
3600
  throw invalidFrozenIndex('Unsupported frozen binary snapshot; re-build with saveBinarySync() or from lucaong JSON');
@@ -3433,82 +3602,283 @@ function decodeFrozenSnapshot(buf) {
3433
3602
  throw invalidFrozenIndex('Unsupported frozen binary snapshot');
3434
3603
  }
3435
3604
  /** Async frozen snapshot decode (streaming zstd). */
3436
- async function decodeFrozenSnapshotAsync(buf) {
3605
async function decodeFrozenSnapshotAsync(buf, hints) {
    // The magic (4 bytes) plus the u16 version at offset 4 must be readable.
    assertBufferLength(buf, 8);
    const version = buf.readUInt16LE(4);
    const isCurrentFormat = isMsv5Buffer(buf) && version === 5;
    // Anything that is not MSv5 is delegated to the synchronous decoder.
    return isCurrentFormat
        ? decodeFrozenSnapshotMsv5Async(buf, hints)
        : decodeFrozenSnapshot(buf, hints);
}
3613
+
3614
+ const DEFAULT_CAPACITY = 16;
3615
/**
 * Growable unsigned 32-bit column (build scratch; narrowed to u16 at finalize when possible).
 *
 * Values are appended into a `Uint32Array` whose capacity doubles whenever it
 * is full. `length` is the number of values pushed, not the capacity.
 */
class GrowableUint32Column {
    /**
     * @param initialCapacity - Initial number of cells; at least one cell is always allocated.
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint32Array(Math.max(1, initialCapacity));
    }
    /** Number of values stored so far. */
    get length() {
        return this._len;
    }
    /**
     * Append one value, doubling the backing buffer first when it is full.
     *
     * @param value - Value to store (converted by `Uint32Array` element assignment).
     */
    push(value) {
        if (this._len >= this._buf.length) {
            const grown = new Uint32Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = value;
    }
    /**
     * Copy `length` cells starting at `sourceOffset` into `target` at `targetOffset`.
     *
     * @param sourceOffset - First source cell.
     * @param length - Number of cells to copy.
     * @param target - Destination typed array; each value is converted by the
     *   target's own element type on assignment.
     * @param targetOffset - First destination cell.
     * @param docIdWidth - Accepted for call-site compatibility; the copy is the
     *   same for both widths, so it is not consulted.
     */
    copyRangeInto(sourceOffset, length, target, targetOffset, docIdWidth) {
        const source = this._buf;
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = source[sourceOffset + i];
        }
    }
    /**
     * Set the logical length and release storage that is no longer needed.
     *
     * Truncating to 0 swaps the backing buffer for a single-cell one so the
     * old storage can be garbage-collected; previously the buffer was kept at
     * full size in that case. A positive length below the capacity shrinks
     * the buffer to exactly that many cells.
     *
     * @param length - New logical length.
     */
    truncate(length) {
        this._len = length;
        if (length === 0) {
            this._buf = new Uint32Array(1);
        }
        else if (length > 0 && length < this._buf.length) {
            this._buf = this._buf.slice(0, length);
        }
    }
}
3651
/**
 * Growable frequency column (u16 cells; matches frozen clamp range).
 *
 * Every pushed frequency goes through `clampFreq` before being stored in a
 * `Uint16Array` whose capacity doubles whenever it is full.
 */
class GrowableFreqColumn {
    /**
     * @param initialCapacity - Initial number of cells; at least one cell is always allocated.
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint16Array(Math.max(1, initialCapacity));
    }
    /** Number of frequencies stored so far. */
    get length() {
        return this._len;
    }
    /**
     * Append one frequency, doubling the backing buffer first when it is full.
     *
     * @param freq - Raw frequency; stored as `clampFreq(freq)`.
     */
    push(freq) {
        if (this._len >= this._buf.length) {
            const grown = new Uint16Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = clampFreq(freq);
    }
    /**
     * Copy `length` cells starting at `sourceOffset` into `target` at `targetOffset`.
     *
     * @param sourceOffset - First source cell.
     * @param length - Number of cells to copy.
     * @param target - Destination array.
     * @param targetOffset - First destination cell.
     */
    copyRangeInto(sourceOffset, length, target, targetOffset) {
        const source = this._buf;
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = source[sourceOffset + i];
        }
    }
    /**
     * Set the logical length and release storage that is no longer needed.
     *
     * Truncating to 0 swaps the backing buffer for a single-cell one so the
     * old storage can be garbage-collected; previously the buffer was kept at
     * full size in that case. A positive length below the capacity shrinks
     * the buffer to exactly that many cells.
     *
     * @param length - New logical length.
     */
    truncate(length) {
        this._len = length;
        if (length === 0) {
            this._buf = new Uint16Array(1);
        }
        else if (length > 0 && length < this._buf.length) {
            this._buf = this._buf.slice(0, length);
        }
    }
}
3680
/**
 * Single-pass postings accumulator for {@link FrozenIndexBuilder}.
 *
 * All postings are appended to one global doc-id column and one global
 * frequency column. For every (term, field) slot only a list of
 * `[start, length]` ranges into those columns is kept; `finalize` then
 * gathers the ranges slot by slot into the flat frozen layout.
 */
class IncrementalPostingsAccumulator {
    /**
     * @param fieldCount - Number of indexed fields (slot = termIndex * fieldCount + fieldId).
     * @param hints - Optional; `estimatedTotalPostings` sizes the initial scratch columns.
     */
    constructor(fieldCount, hints) {
        this._slots = new Map();
        this._totalPostings = 0;
        this._maxFreq = 0;
        this._fieldCount = fieldCount;
        const estimate = hints == null ? undefined : hints.estimatedTotalPostings;
        const cap = Math.max(DEFAULT_CAPACITY, estimate == null ? 0 : estimate);
        this._docIds = new GrowableUint32Column(cap);
        this._freqs = new GrowableFreqColumn(cap);
    }
    /** Number of postings appended so far (not reset by {@link clear}). */
    get totalPostings() {
        return this._totalPostings;
    }
    /** Largest clamped frequency appended so far (not reset by {@link clear}). */
    get maxFreq() {
        return this._maxFreq;
    }
    /**
     * Record one posting.
     *
     * @param termIndex - Index of the term.
     * @param fieldId - Field id within the term.
     * @param docId - Document short id.
     * @param freq - Term frequency; clamped with `clampFreq` before use.
     */
    append(termIndex, fieldId, docId, freq) {
        const slot = termIndex * this._fieldCount + fieldId;
        const writeIdx = this._docIds.length;
        this._docIds.push(docId);
        const v = clampFreq(freq);
        this._freqs.push(v);
        if (v > this._maxFreq) {
            this._maxFreq = v;
        }
        this._totalPostings++;
        const ranges = this._slots.get(slot);
        if (ranges == null) {
            this._slots.set(slot, { starts: [writeIdx], lengths: [1] });
            return;
        }
        const last = ranges.starts.length - 1;
        if (ranges.starts[last] + ranges.lengths[last] === writeIdx) {
            // Posting is adjacent to the slot's latest range: extend it.
            ranges.lengths[last]++;
        }
        else {
            ranges.starts.push(writeIdx);
            ranges.lengths.push(1);
        }
    }
    /**
     * Drop all slot ranges and truncate both scratch columns to length 0.
     * The `totalPostings` / `maxFreq` counters are left untouched.
     */
    clear() {
        this._slots.clear();
        this._docIds.truncate(0);
        this._freqs.truncate(0);
    }
    /**
     * Copy every range of one slot into the flat output arrays.
     *
     * @param ranges - The slot's `{ starts, lengths }` record.
     * @param allDocIds - Destination doc-id array.
     * @param allFreqs - Destination frequency array.
     * @param write - First destination index.
     * @param docIdWidth - Forwarded to the doc-id column copy.
     * @returns The destination index just past the copied postings.
     */
    copySlot(ranges, allDocIds, allFreqs, write, docIdWidth) {
        const { starts, lengths } = ranges;
        for (let r = 0; r < starts.length; r++) {
            const start = starts[r];
            const len = lengths[r];
            this._docIds.copyRangeInto(start, len, allDocIds, write, docIdWidth);
            this._freqs.copyRangeInto(start, len, allFreqs, write);
            write += len;
        }
        return write;
    }
    /**
     * @param ranges - A slot's `{ starts, lengths }` record.
     * @returns Total number of postings across the slot's ranges.
     */
    slotLength(ranges) {
        let n = 0;
        for (let i = 0; i < ranges.lengths.length; i++) {
            n += ranges.lengths[i];
        }
        return n;
    }
    /**
     * Materialize the accumulated postings into the flat frozen layout and
     * clear the scratch state.
     *
     * The layout comes from `choosePostingsLayout(fieldCount)`. Doc ids are
     * stored in a `Uint16Array` when `nextId <= 65535`, else a `Uint32Array`.
     *
     * @param termCount - Number of terms.
     * @param nextId - One past the highest document short id.
     * @returns The postings object; the dense variant fills
     *   `denseOffsets`/`denseLengths` and nulls the sparse fields, the sparse
     *   variant does the opposite.
     */
    finalize(termCount, nextId) {
        const fieldCount = this._fieldCount;
        const totalPostings = this._totalPostings;
        const slots = this._slots;
        const layout = choosePostingsLayout(fieldCount);
        const docIdWidth = nextId <= 65535 ? 16 : 32;
        const allDocIds = docIdWidth === 16
            ? new Uint16Array(totalPostings)
            : new Uint32Array(totalPostings);
        const allFreqs = allocateFreqs(totalPostings, this._maxFreq);
        let write = 0;
        if (layout === 'dense') {
            const slotCount = termCount * fieldCount;
            const denseOffsets = new Uint32Array(slotCount);
            const denseLengths = new Uint32Array(slotCount);
            // Slots are numbered term-major, so a linear walk visits them in
            // (term, field) order.
            for (let slot = 0; slot < slotCount; slot++) {
                const ranges = slots.get(slot);
                const len = ranges == null ? 0 : this.slotLength(ranges);
                denseOffsets[slot] = write;
                denseLengths[slot] = len;
                if (len > 0) {
                    write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
                    // Release range metadata as soon as the slot is copied.
                    slots.delete(slot);
                }
            }
            this.clear();
            return {
                fieldCount,
                termCount,
                nextId,
                layout,
                docIdWidth,
                sparseFieldIdWidth: null,
                allDocIds,
                allFreqs,
                denseOffsets,
                denseLengths,
                sparseTermStarts: null,
                sparseFieldIds: null,
                sparseOffsets: null,
                sparseLengths: null,
            };
        }
        const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
        const fieldIdScratch = [];
        const offsetScratch = [];
        const lengthScratch = [];
        // Size is known upfront, so the term-start table is typed from the start.
        const sparseTermStarts = new Uint32Array(termCount + 1);
        for (let ti = 0; ti < termCount; ti++) {
            sparseTermStarts[ti] = fieldIdScratch.length;
            const base = ti * fieldCount;
            for (let f = 0; f < fieldCount; f++) {
                const slot = base + f;
                const ranges = slots.get(slot);
                const len = ranges == null ? 0 : this.slotLength(ranges);
                if (len === 0) {
                    continue;
                }
                fieldIdScratch.push(f);
                offsetScratch.push(write);
                lengthScratch.push(len);
                write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
                slots.delete(slot);
            }
            sparseTermStarts[ti + 1] = fieldIdScratch.length;
        }
        this.clear();
        const sparseFieldIds = sparseFieldIdWidth === 16
            ? new Uint16Array(fieldIdScratch)
            : new Uint8Array(fieldIdScratch);
        return {
            fieldCount,
            termCount,
            nextId,
            layout,
            docIdWidth,
            sparseFieldIdWidth,
            allDocIds,
            allFreqs,
            denseOffsets: null,
            denseLengths: null,
            sparseTermStarts,
            sparseFieldIds,
            sparseOffsets: new Uint32Array(offsetScratch),
            sparseLengths: new Uint32Array(lengthScratch),
        };
    }
}
3444
3842
 
3445
- function getOrCreateTermIndex(state, index, term) {
3843
/**
 * Look up the index assigned to `term`, assigning the next free one when the
 * term has not been seen yet.
 *
 * @param termCount - Mutable counter object; `value` is the next free index
 *   and is incremented when a new term is registered.
 * @param index - Term → index map exposing `get` and `set`.
 * @param term - Term to resolve.
 * @returns The term's index.
 */
function getOrCreateTermIndex(termCount, index, term) {
    const known = index.get(term);
    if (known == null) {
        const assigned = termCount.value;
        termCount.value = assigned + 1;
        index.set(term, assigned);
        return assigned;
    }
    return known;
}
3454
- function appendPosting(state, termIndex, fieldId, docId, freq) {
3455
- const slot = termIndex * state.fieldCount + fieldId;
3456
- let docIds = state.postingsDocIds[slot];
3457
- if (docIds == null) {
3458
- docIds = [];
3459
- state.postingsDocIds[slot] = docIds;
3460
- state.postingsFreqs[slot] = [];
3461
- }
3462
- docIds.push(docId);
3463
- state.postingsFreqs[slot].push(freq);
3464
- const v = clampFreq(freq);
3465
- if (v > state.maxFreq)
3466
- state.maxFreq = v;
3467
- state.totalPostings++;
3468
- }
3469
- function finalizeFlatPostings(state, nextId) {
3470
- return materializeFrozenPostingsFromBuilder({
3471
- fieldCount: state.fieldCount,
3472
- termCount: state.terms.length,
3473
- postingsDocIds: state.postingsDocIds,
3474
- postingsFreqs: state.postingsFreqs,
3475
- totalPostings: state.totalPostings,
3476
- maxFreq: state.maxFreq,
3477
- }, nextId);
3478
- }
3479
3852
  /** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
3480
3853
  class FrozenIndexBuilder {
3481
3854
  constructor(options, hints) {
3855
+ var _a, _b;
3856
+ this._termCount = { value: 0 };
3857
+ this._fieldTermFreqScratch = new Map();
3858
+ this._tokenScratch = [];
3482
3859
  this._options = resolveIndexingOptions(options);
3483
3860
  this._fieldIds = buildFieldIds(this._options.fields);
3484
3861
  this._fieldCount = this._options.fields.length;
3485
3862
  this._index = new SearchableMap();
3486
- this._terms = [];
3487
- this._postingsDocIds = [];
3488
- this._postingsFreqs = [];
3863
+ const estimatedDocs = (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount) !== null && _a !== void 0 ? _a : 0;
3864
+ const perSlot = (_b = hints === null || hints === void 0 ? void 0 : hints.estimatedPostingsPerSlot) !== null && _b !== void 0 ? _b : 4;
3865
+ this._postings = new IncrementalPostingsAccumulator(this._fieldCount, {
3866
+ estimatedTotalPostings: estimatedDocs > 0 ? estimatedDocs * perSlot : undefined,
3867
+ });
3489
3868
  this._avgFieldLength = [];
3490
3869
  this._seenIds = new Set();
3491
3870
  this._nextId = 0;
3492
3871
  this._frozen = false;
3493
3872
  const estimated = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount;
3873
+ this._storedFields = createStoredFieldsLayout(this._options.storeFields, estimated !== null && estimated !== void 0 ? estimated : 0);
3494
3874
  if (estimated != null && estimated > 0) {
3495
3875
  this._externalIds = new Array(estimated);
3496
- this._storedFields = new Array(estimated);
3497
3876
  this._fieldLengthData = new Array(estimated * this._fieldCount).fill(0);
3498
3877
  }
3499
3878
  else {
3500
3879
  this._externalIds = [];
3501
- this._storedFields = [];
3502
3880
  this._fieldLengthData = [];
3503
3881
  }
3504
- this._postingsState = {
3505
- fieldCount: this._fieldCount,
3506
- terms: this._terms,
3507
- postingsDocIds: this._postingsDocIds,
3508
- postingsFreqs: this._postingsFreqs,
3509
- totalPostings: 0,
3510
- maxFreq: 0,
3511
- };
3512
3882
  }
3513
3883
  /** Number of documents indexed so far (not yet frozen). */
3514
3884
  get documentCount() {
@@ -3529,22 +3899,23 @@ class FrozenIndexBuilder {
3529
3899
  this._seenIds.add(id);
3530
3900
  const shortId = this._nextId++;
3531
3901
  this._externalIds[shortId] = id;
3532
- this._storedFields[shortId] = saveStoredFieldsForDocument(storeFields, extractField, document);
3902
+ writeStoredField(this._storedFields, shortId, storeFields, extractField, document);
3533
3903
  const documentCount = shortId + 1;
3534
3904
  for (const field of fields) {
3535
3905
  const fieldValue = extractField(document, field);
3536
3906
  if (fieldValue == null)
3537
3907
  continue;
3538
- const tokens = tokenize(stringifyField(fieldValue, field), field);
3908
+ const fieldText = typeof fieldValue === 'string'
3909
+ ? fieldValue
3910
+ : stringifyField(fieldValue, field);
3539
3911
  const fieldId = this._fieldIds[field];
3540
- const uniqueTerms = new Set(tokens).size;
3541
- const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
3912
+ const uniqueTerms = collectFieldTermFreqsFromFieldInto(this._fieldTermFreqScratch, this._tokenScratch, tokenize, fieldText, field, processTerm);
3542
3913
  this._fieldLengthData[shortId * this._fieldCount + fieldId] = uniqueTerms;
3543
3914
  updateAvgFieldLength(this._avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
3544
- for (const [term, freq] of localFreqs) {
3545
- const ti = getOrCreateTermIndex(this._postingsState, this._index, term);
3546
- appendPosting(this._postingsState, ti, fieldId, shortId, freq);
3547
- }
3915
+ this._fieldTermFreqScratch.forEach((freq, term) => {
3916
+ const ti = getOrCreateTermIndex(this._termCount, this._index, term);
3917
+ this._postings.append(ti, fieldId, shortId, freq);
3918
+ });
3548
3919
  }
3549
3920
  }
3550
3921
  /**
@@ -3601,7 +3972,11 @@ class FrozenIndexBuilder {
3601
3972
  }
3602
3973
  this._frozen = true;
3603
3974
  const documentCount = this._nextId;
3604
- const postings = finalizeFlatPostings(this._postingsState, documentCount);
3975
+ const termCount = this._termCount.value;
3976
+ const postings = this._postings.finalize(termCount, documentCount);
3977
+ const radixTree = this._index.radixTree;
3978
+ this._index = null;
3979
+ const index = fromRadixTree(radixTree, termCount);
3605
3980
  const avgFieldLength = new Float32Array(this._fieldCount);
3606
3981
  for (let f = 0; f < this._fieldCount; f++) {
3607
3982
  avgFieldLength[f] = (_a = this._avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
@@ -3610,12 +3985,8 @@ class FrozenIndexBuilder {
3610
3985
  const externalIds = this._externalIds.length > documentCount
3611
3986
  ? this._externalIds.slice(0, documentCount)
3612
3987
  : this._externalIds;
3613
- const storedFields = this._storedFields.length > documentCount
3614
- ? this._storedFields.slice(0, documentCount)
3615
- : this._storedFields;
3988
+ const storedFields = resizeStoredFields(this._storedFields, documentCount);
3616
3989
  const idLookup = createIdToShortIdLookup(externalIds, documentCount);
3617
- // Incremental builder: numeric radix leaves + build-time terms[] for postings.
3618
- // freezeFromMiniSearch packs Map leaves in one radix pass (no resident terms[]).
3619
3990
  return {
3620
3991
  options: this._options,
3621
3992
  documentCount,
@@ -3627,8 +3998,8 @@ class FrozenIndexBuilder {
3627
3998
  storedFields,
3628
3999
  fieldLengthMatrix: materializeFieldLengthMatrix(this._fieldLengthData, documentCount * this._fieldCount),
3629
4000
  avgFieldLength,
3630
- index: fromRadixTree(this._index.radixTree, this._terms.length),
3631
- termCount: this._terms.length,
4001
+ index,
4002
+ termCount,
3632
4003
  postings,
3633
4004
  };
3634
4005
  }
@@ -4042,7 +4413,7 @@ function shallowCopyJsSnapshotFields(params) {
4042
4413
  return {
4043
4414
  fieldIds: { ...params.fieldIds },
4044
4415
  options: shallowCopyOptions(params.options),
4045
- storedFields: params.storedFields.slice(),
4416
+ storedFields: cloneStoredFields(params.storedFields),
4046
4417
  };
4047
4418
  }
4048
4419
  /**
@@ -4127,7 +4498,7 @@ class FrozenMiniSearch {
4127
4498
  fieldIds: this._fieldIds,
4128
4499
  getFieldLength: (docId, fieldId) => this.getFieldLength(docId, fieldId),
4129
4500
  getExternalId: docId => this._externalIds[docId],
4130
- getStoredFields: docId => this._storedFields[docId],
4501
+ getStoredFields: docId => readStoredFields(this._storedFields, docId),
4131
4502
  };
4132
4503
  this._queryEngineParams = {
4133
4504
  fields: this._options.fields,
@@ -4139,7 +4510,7 @@ class FrozenMiniSearch {
4139
4510
  const id = this._externalIds[shortId];
4140
4511
  if (id === undefined)
4141
4512
  continue;
4142
- callback(shortId, id, this._storedFields[shortId]);
4513
+ callback(shortId, id, readStoredFields(this._storedFields, shortId));
4143
4514
  }
4144
4515
  }),
4145
4516
  aggregateContext: this._aggregateContext,
@@ -4150,11 +4521,7 @@ class FrozenMiniSearch {
4150
4521
  memoryBreakdown() {
4151
4522
  const termCount = this.termCount;
4152
4523
  const postingsStats = postingsTypedBytes(this._postings);
4153
- let storedJson = 0;
4154
- for (const row of this._storedFields) {
4155
- if (row != null)
4156
- storedJson += JSON.stringify(row).length;
4157
- }
4524
+ const storedJson = storedFieldsJsonBytes(this._storedFields);
4158
4525
  const radixEst = this._index.packedByteLength();
4159
4526
  const idMapBytes = this._idLookup.mode === 'lazy-map' ? this._idLookup.mapEntryCount * 32 : 0;
4160
4527
  const estimatedStructuredBytes = postingsStats.totalTypedBytes
@@ -4184,7 +4551,7 @@ class FrozenMiniSearch {
4184
4551
  },
4185
4552
  documents: {
4186
4553
  externalIdsSlots: this._externalIds.length,
4187
- storedFieldsSlots: this._storedFields.length,
4554
+ storedFieldsSlots: storedFieldsSlotCount(this._storedFields),
4188
4555
  idLookupMode: this._idLookup.mode,
4189
4556
  idToShortIdEntries: this._idLookup.mapEntryCount,
4190
4557
  fieldLengthMatrixBytes: this._fieldLengthMatrix.byteLength,
@@ -4199,10 +4566,10 @@ class FrozenMiniSearch {
4199
4566
  }
4200
4567
  getStoredFields(id) {
4201
4568
  const shortId = this._idLookup.get(id);
4202
- return shortId == null ? undefined : this._storedFields[shortId];
4569
+ return shortId == null ? undefined : readStoredFields(this._storedFields, shortId);
4203
4570
  }
4204
4571
  search(query, searchOptions = {}) {
4205
- return finalizeRawSearchResults(this.executeQuery(query, searchOptions), query, searchOptions, this._options.searchOptions, docId => this._externalIds[docId], docId => this._storedFields[docId]);
4572
+ return finalizeRawSearchResults(this.executeQuery(query, searchOptions), query, searchOptions, this._options.searchOptions, docId => this._externalIds[docId], docId => readStoredFields(this._storedFields, docId));
4206
4573
  }
4207
4574
  autoSuggest(queryString, options = {}) {
4208
4575
  const merged = { ...this._options.autoSuggestOptions, ...options };
@@ -4218,7 +4585,8 @@ class FrozenMiniSearch {
4218
4585
  fieldNames: fieldNamesFromFieldIds(this._fieldIds),
4219
4586
  avgFieldLength: this._avgFieldLength,
4220
4587
  externalIds: this._externalIds,
4221
- storedFields: this._storedFields,
4588
+ storedFields: new Array(this._nextId),
4589
+ storedFieldsLayout: this._storedFields,
4222
4590
  fieldLengthMatrix: fieldLengthMatrixForWire(this._fieldLengthMatrix),
4223
4591
  treeShape: [],
4224
4592
  postings: this._postings,
@@ -4234,7 +4602,8 @@ class FrozenMiniSearch {
4234
4602
  fieldNames: fieldNamesFromFieldIds(this._fieldIds),
4235
4603
  avgFieldLength: this._avgFieldLength,
4236
4604
  externalIds: this._externalIds,
4237
- storedFields: this._storedFields,
4605
+ storedFields: new Array(this._nextId),
4606
+ storedFieldsLayout: this._storedFields,
4238
4607
  fieldLengthMatrix: fieldLengthMatrixForWire(this._fieldLengthMatrix),
4239
4608
  treeShape: [],
4240
4609
  postings: this._postings,
@@ -4242,16 +4611,20 @@ class FrozenMiniSearch {
4242
4611
  }
4243
4612
  /** Load a frozen binary snapshot. */
4244
4613
  static loadBinarySync(buffer, options = {}) {
4245
- const snap = decodeFrozenSnapshot(buffer);
4614
+ var _a;
4615
+ const storeFields = (_a = options.storeFields) !== null && _a !== void 0 ? _a : defaultFrozenLoadOptions.storeFields;
4616
+ const snap = decodeFrozenSnapshot(buffer, { storeFields });
4246
4617
  return FrozenMiniSearch.fromBinarySnapshot(snap, options);
4247
4618
  }
4248
4619
  /** Load a frozen binary snapshot with streaming zstd decompression (bounded memory). */
4249
4620
  static async loadBinaryAsync(buffer, options = {}) {
4250
- const snap = await decodeFrozenSnapshotAsync(buffer);
4621
+ var _a;
4622
+ const storeFields = (_a = options.storeFields) !== null && _a !== void 0 ? _a : defaultFrozenLoadOptions.storeFields;
4623
+ const snap = await decodeFrozenSnapshotAsync(buffer, { storeFields });
4251
4624
  return FrozenMiniSearch.fromBinarySnapshot(snap, options);
4252
4625
  }
4253
4626
  static fromBinarySnapshot(snap, options) {
4254
- var _a, _b;
4627
+ var _a, _b, _c;
4255
4628
  const snapshotFields = (_a = snap.fieldNames) !== null && _a !== void 0 ? _a : fieldNamesFromFieldIds(snap.fieldIds);
4256
4629
  if (options.fields != null) {
4257
4630
  assertFieldsMatchSnapshot(options.fields, snap.fieldIds);
@@ -4279,7 +4652,7 @@ class FrozenMiniSearch {
4279
4652
  fieldCount: snap.fieldCount,
4280
4653
  externalIds: snap.externalIds,
4281
4654
  idLookup,
4282
- storedFields: snap.storedFields,
4655
+ storedFields: (_c = snap.storedFieldsLayout) !== null && _c !== void 0 ? _c : storedFieldsFromRows(snap.storedFields, opts.storeFields),
4283
4656
  fieldLengthMatrix: snap.fieldLengthMatrix,
4284
4657
  avgFieldLength: snap.avgFieldLength,
4285
4658
  index,
@@ -4334,4 +4707,4 @@ class FrozenMiniSearch {
4334
4707
  }
4335
4708
  FrozenMiniSearch.wildcard = WILDCARD_QUERY;
4336
4709
 
4337
- export { AND, AND_NOT, FrozenIndexBuilder, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
4710
+ export { AND, AND_NOT, FrozenIndexBuilder, FrozenMiniSearch, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };