@yoch/frozenminisearch 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1911,111 +1911,6 @@ function materializeFrozenPostings(params) {
1911
1911
  sparseLengths: new Uint32Array(sparseLengths),
1912
1912
  };
1913
1913
  }
1914
- /** One-pass materialize from {@link FrozenIndexBuilder} scratch (counts known upfront). */
1915
- function materializeFrozenPostingsFromBuilder(state, nextId) {
1916
- var _a;
1917
- const { fieldCount, termCount, postingsDocIds, postingsFreqs, totalPostings, maxFreq } = state;
1918
- const layout = choosePostingsLayout(fieldCount);
1919
- const docIdWidth = nextId <= 65535 ? 16 : 32;
1920
- const allDocIds = docIdWidth === 16
1921
- ? new Uint16Array(totalPostings)
1922
- : new Uint32Array(totalPostings);
1923
- const allFreqs = allocateFreqs(totalPostings, maxFreq);
1924
- if (layout === 'dense') {
1925
- const slotCount = termCount * fieldCount;
1926
- const denseOffsets = new Uint32Array(slotCount);
1927
- const denseLengths = new Uint32Array(slotCount);
1928
- let write = 0;
1929
- for (let ti = 0; ti < termCount; ti++) {
1930
- const base = ti * fieldCount;
1931
- for (let f = 0; f < fieldCount; f++) {
1932
- const slot = base + f;
1933
- const docIds = postingsDocIds[slot];
1934
- const freqs = postingsFreqs[slot];
1935
- const len = (_a = docIds === null || docIds === void 0 ? void 0 : docIds.length) !== null && _a !== void 0 ? _a : 0;
1936
- denseOffsets[slot] = write;
1937
- denseLengths[slot] = len;
1938
- for (let i = 0; i < len; i++) {
1939
- const docId = docIds[i];
1940
- if (docIdWidth === 16) {
1941
- allDocIds[write] = docId;
1942
- }
1943
- else {
1944
- allDocIds[write] = docId;
1945
- }
1946
- allFreqs[write] = freqs[i];
1947
- write++;
1948
- }
1949
- }
1950
- }
1951
- return {
1952
- fieldCount,
1953
- termCount,
1954
- nextId,
1955
- layout,
1956
- docIdWidth,
1957
- sparseFieldIdWidth: null,
1958
- allDocIds,
1959
- allFreqs,
1960
- denseOffsets,
1961
- denseLengths,
1962
- sparseTermStarts: null,
1963
- sparseFieldIds: null,
1964
- sparseOffsets: null,
1965
- sparseLengths: null,
1966
- };
1967
- }
1968
- const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
1969
- const sparseFieldIdsScratch = [];
1970
- const sparseOffsets = [];
1971
- const sparseLengths = [];
1972
- const termStarts = new Array(termCount + 1).fill(0);
1973
- let write = 0;
1974
- for (let ti = 0; ti < termCount; ti++) {
1975
- termStarts[ti] = sparseFieldIdsScratch.length;
1976
- for (let f = 0; f < fieldCount; f++) {
1977
- const slot = ti * fieldCount + f;
1978
- const docIds = postingsDocIds[slot];
1979
- if (docIds == null || docIds.length === 0)
1980
- continue;
1981
- const freqs = postingsFreqs[slot];
1982
- sparseFieldIdsScratch.push(f);
1983
- sparseOffsets.push(write);
1984
- sparseLengths.push(docIds.length);
1985
- for (let i = 0; i < docIds.length; i++) {
1986
- const docId = docIds[i];
1987
- if (docIdWidth === 16) {
1988
- allDocIds[write] = docId;
1989
- }
1990
- else {
1991
- allDocIds[write] = docId;
1992
- }
1993
- allFreqs[write] = freqs[i];
1994
- write++;
1995
- }
1996
- }
1997
- termStarts[ti + 1] = sparseFieldIdsScratch.length;
1998
- }
1999
- const sparseFieldIds = sparseFieldIdWidth === 16
2000
- ? new Uint16Array(sparseFieldIdsScratch)
2001
- : new Uint8Array(sparseFieldIdsScratch);
2002
- return {
2003
- fieldCount,
2004
- termCount,
2005
- nextId,
2006
- layout,
2007
- docIdWidth,
2008
- sparseFieldIdWidth,
2009
- allDocIds,
2010
- allFreqs,
2011
- denseOffsets: null,
2012
- denseLengths: null,
2013
- sparseTermStarts: new Uint32Array(termStarts),
2014
- sparseFieldIds,
2015
- sparseOffsets: new Uint32Array(sparseOffsets),
2016
- sparseLengths: new Uint32Array(sparseLengths),
2017
- };
2018
- }
2019
1914
  function postingsTypedBytes(layout) {
2020
1915
  const allDocIdsBytes = layout.allDocIds.byteLength;
2021
1916
  const allFreqsBytes = layout.allFreqs.byteLength;
@@ -2125,24 +2020,33 @@ function findSparseSlotByFieldId(fieldIds, start, end, fieldId) {
2125
2020
  }
2126
2021
  return -1;
2127
2022
  }
2128
- /** Resolve one (termIndex, fieldId) posting run in flat buffers; shared by flyweight and docId collect. */
2129
- function frozenPostingSlice(layout, termIndex, fieldId) {
2023
+ /** Reusable scratch for {@link resolvePostingSlice} (scoring is synchronous). */
2024
+ const postingSliceScratch = { offset: 0, length: 0 };
2025
+ /**
2026
+ * Resolve one (termIndex, fieldId) posting run in flat buffers; writes into `out` without allocating.
2027
+ * @returns false when the slot is empty or missing
2028
+ */
2029
+ function resolvePostingSlice(layout, termIndex, fieldId, out) {
2130
2030
  if (layout.layout === 'dense') {
2131
2031
  const base = termIndex * layout.fieldCount + fieldId;
2132
2032
  const len = layout.denseLengths[base];
2133
2033
  if (len === 0)
2134
- return undefined;
2135
- return { offset: layout.denseOffsets[base], length: len };
2034
+ return false;
2035
+ out.offset = layout.denseOffsets[base];
2036
+ out.length = len;
2037
+ return true;
2136
2038
  }
2137
2039
  const start = layout.sparseTermStarts[termIndex];
2138
2040
  const end = layout.sparseTermStarts[termIndex + 1];
2139
2041
  const slot = findSparseSlotByFieldId(layout.sparseFieldIds, start, end, fieldId);
2140
2042
  if (slot < 0)
2141
- return undefined;
2043
+ return false;
2142
2044
  const len = layout.sparseLengths[slot];
2143
2045
  if (len === 0)
2144
- return undefined;
2145
- return { offset: layout.sparseOffsets[slot], length: len };
2046
+ return false;
2047
+ out.offset = layout.sparseOffsets[slot];
2048
+ out.length = len;
2049
+ return true;
2146
2050
  }
2147
2051
  /**
2148
2052
  * One flyweight wrapper for the lifetime of a frozen index. Call {@link bind} before each
@@ -2158,10 +2062,9 @@ function createFrozenFieldTermFlyweight(layout) {
2158
2062
  return flyweight;
2159
2063
  },
2160
2064
  get(fieldId) {
2161
- const slice = frozenPostingSlice(layout, termIndex, fieldId);
2162
- if (slice == null)
2065
+ if (!resolvePostingSlice(layout, termIndex, fieldId, postingSliceScratch))
2163
2066
  return undefined;
2164
- return segment.rebind(slice.offset, slice.length);
2067
+ return segment.rebind(postingSliceScratch.offset, postingSliceScratch.length);
2165
2068
  },
2166
2069
  };
2167
2070
  return flyweight;
@@ -2180,10 +2083,9 @@ function collectDocIdsFromFrozenSegment(allDocIds, offset, length, context, docI
2180
2083
  function collectDocIdsFromFrozenLayout(layout, termIndex, fieldBoosts, context, docIds, allowedDocs) {
2181
2084
  const { fieldIds } = context;
2182
2085
  for (const field of fieldBoosts.names) {
2183
- const slice = frozenPostingSlice(layout, termIndex, fieldIds[field]);
2184
- if (slice == null)
2086
+ if (!resolvePostingSlice(layout, termIndex, fieldIds[field], postingSliceScratch))
2185
2087
  continue;
2186
- collectDocIdsFromFrozenSegment(layout.allDocIds, slice.offset, slice.length, context, docIds, allowedDocs);
2088
+ collectDocIdsFromFrozenSegment(layout.allDocIds, postingSliceScratch.offset, postingSliceScratch.length, context, docIds, allowedDocs);
2187
2089
  }
2188
2090
  }
2189
2091
 
@@ -2225,45 +2127,499 @@ function resolveIndexingOptions(options) {
2225
2127
  autoSuggestOptions: { ...defaultAutoSuggestOptions, ...(options.autoSuggestOptions || {}) },
2226
2128
  };
2227
2129
  }
2228
- function buildFieldIds(fields) {
2229
- const fieldIds = {};
2230
- for (let i = 0; i < fields.length; i++) {
2231
- fieldIds[fields[i]] = i;
2130
+ function buildFieldIds(fields) {
2131
+ const fieldIds = {};
2132
+ for (let i = 0; i < fields.length; i++) {
2133
+ fieldIds[fields[i]] = i;
2134
+ }
2135
+ return fieldIds;
2136
+ }
2137
+ function accumulateProcessedTerm(localFreqs, processedTerm) {
2138
+ if (Array.isArray(processedTerm)) {
2139
+ for (const t of processedTerm) {
2140
+ localFreqs.set(t, (localFreqs.get(t) || 0) + 1);
2141
+ }
2142
+ }
2143
+ else if (processedTerm) {
2144
+ localFreqs.set(processedTerm, (localFreqs.get(processedTerm) || 0) + 1);
2145
+ }
2146
+ }
2147
+ /**
2148
+ * Accumulate token frequencies for one document field into `localFreqs` (cleared first).
2149
+ * Returns the number of distinct processed terms (replaces a separate `Set(tokens)` pass).
2150
+ */
2151
+ function collectFieldTermFreqsInto(localFreqs, tokens, fieldName, processTerm) {
2152
+ localFreqs.clear();
2153
+ for (const term of tokens) {
2154
+ accumulateProcessedTerm(localFreqs, processTerm(term, fieldName));
2155
+ }
2156
+ return localFreqs.size;
2157
+ }
2158
+ /** Global delimiter pattern for incremental `exec` (must not reuse {@link SPACE_OR_PUNCTUATION} — no `g` flag). */
2159
+ const DEFAULT_TOKENIZE_DELIMITERS = /[\n\r\p{Z}\p{P}]+/gu;
2160
+ const defaultTokenizeProbe = 'a b';
2161
+ const defaultTokenizeProbeField = 'f';
2162
+ const tokenizeBehaviorCache = new WeakMap();
2163
+ /**
2164
+ * True when `tokenize` matches the library default (reference equality or split-equivalent
2165
+ * on a fixed probe). Custom tokenizers that pass the probe but diverge on other inputs
2166
+ * (e.g. leading delimiters) still take the fast path — use the default reference in prod.
2167
+ */
2168
+ function isDefaultTokenize(tokenize) {
2169
+ if (tokenize === defaultFrozenLoadOptions.tokenize)
2170
+ return true;
2171
+ const cached = tokenizeBehaviorCache.get(tokenize);
2172
+ if (cached != null)
2173
+ return cached;
2174
+ const splitTokens = defaultTokenizeProbe.split(SPACE_OR_PUNCTUATION);
2175
+ const customTokens = tokenize(defaultTokenizeProbe, defaultTokenizeProbeField);
2176
+ const ok = splitTokens.length === customTokens.length
2177
+ && splitTokens.every((t, i) => t === customTokens[i]);
2178
+ tokenizeBehaviorCache.set(tokenize, ok);
2179
+ return ok;
2180
+ }
2181
+ function forEachDefaultToken(text, onToken) {
2182
+ if (text.length === 0) {
2183
+ onToken('');
2184
+ return;
2185
+ }
2186
+ let start = 0;
2187
+ const re = DEFAULT_TOKENIZE_DELIMITERS;
2188
+ re.lastIndex = 0;
2189
+ let match;
2190
+ while ((match = re.exec(text)) !== null) {
2191
+ if (match.index > start) {
2192
+ onToken(text.slice(start, match.index));
2193
+ }
2194
+ else if (match.index === start) {
2195
+ onToken('');
2196
+ }
2197
+ start = match.index + match[0].length;
2198
+ }
2199
+ if (start < text.length) {
2200
+ onToken(text.slice(start));
2201
+ }
2202
+ else if (start === 0) {
2203
+ onToken(text);
2204
+ }
2205
+ else if (start === text.length) {
2206
+ onToken('');
2207
+ }
2208
+ }
2209
+ /** Default tokenizer into a reusable buffer (avoids `text.split()` array allocation). */
2210
+ function tokenizeDefaultInto(out, text) {
2211
+ out.length = 0;
2212
+ forEachDefaultToken(text, (token) => out.push(token));
2213
+ }
2214
+ /** Tokenize field text into `out` (reused). Fast path when `tokenize` is the library default. */
2215
+ function tokenizeFieldInto(out, tokenize, text, fieldName) {
2216
+ if (isDefaultTokenize(tokenize)) {
2217
+ tokenizeDefaultInto(out, text);
2218
+ return;
2219
+ }
2220
+ const tokens = tokenize(text, fieldName);
2221
+ out.length = 0;
2222
+ out.push(...tokens);
2223
+ }
2224
+ function collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm) {
2225
+ localFreqs.clear();
2226
+ forEachDefaultToken(text, (token) => {
2227
+ accumulateProcessedTerm(localFreqs, processTerm(token, fieldName));
2228
+ });
2229
+ return localFreqs.size;
2230
+ }
2231
+ /**
2232
+ * Tokenize + accumulate field term frequencies in one pass when the default tokenizer is used.
2233
+ * `tokenScratch` is only used for custom tokenizers (two-phase fallback).
2234
+ */
2235
+ function collectFieldTermFreqsFromFieldInto(localFreqs, tokenScratch, tokenize, text, fieldName, processTerm) {
2236
+ if (isDefaultTokenize(tokenize)) {
2237
+ return collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm);
2238
+ }
2239
+ tokenizeFieldInto(tokenScratch, tokenize, text, fieldName);
2240
+ return collectFieldTermFreqsInto(localFreqs, tokenScratch, fieldName, processTerm);
2241
+ }
2242
+ function updateAvgFieldLength(avgFieldLength, fieldId, count, length) {
2243
+ const averageFieldLength = avgFieldLength[fieldId] || 0;
2244
+ const totalFieldLength = (averageFieldLength * count) + length;
2245
+ avgFieldLength[fieldId] = totalFieldLength / (count + 1);
2246
+ }
2247
+
2248
+ function validateTreeShape(shape, termCount) {
2249
+ if (!Array.isArray(shape)) {
2250
+ throw invalidFrozenIndex('treeShape node must be an array');
2251
+ }
2252
+ for (const entry of shape) {
2253
+ if (!Array.isArray(entry) || entry.length !== 2) {
2254
+ throw invalidFrozenIndex('treeShape entry must be a [key, value] pair');
2255
+ }
2256
+ const [key, value] = entry;
2257
+ if (key === LEAF) {
2258
+ const idx = value;
2259
+ if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
2260
+ throw invalidFrozenIndex(`treeShape leaf term index out of range: ${idx}`);
2261
+ }
2262
+ }
2263
+ else {
2264
+ validateTreeShape(value, termCount);
2265
+ }
2266
+ }
2267
+ }
2268
+ function termCountOf(snap) {
2269
+ return snap.postings.termCount;
2270
+ }
2271
+ /**
2272
+ * Numeric/structural invariants shared by both the decode path (untrusted binary)
2273
+ * and the build path (trusted internal code).
2274
+ */
2275
+ function validateFrozenSnapshotNumeric(snap) {
2276
+ if (snap.fieldCount <= 0) {
2277
+ throw invalidFrozenIndex('fieldCount must be positive');
2278
+ }
2279
+ if (snap.nextId < 0 || snap.nextId >= 0xffffffff) {
2280
+ throw invalidFrozenIndex('nextId out of range');
2281
+ }
2282
+ if (snap.documentCount < 0 || snap.documentCount > snap.nextId) {
2283
+ throw invalidFrozenIndex('documentCount inconsistent with nextId');
2284
+ }
2285
+ if (snap.fieldLengthMatrix.length !== snap.nextId * snap.fieldCount) {
2286
+ throw invalidFrozenIndex('fieldLengthMatrix size mismatch');
2287
+ }
2288
+ if (snap.avgFieldLength.length !== snap.fieldCount) {
2289
+ throw invalidFrozenIndex('avgFieldLength size mismatch');
2290
+ }
2291
+ validateFrozenPostingsLayout(snap.postings, snap.documentCount, snap.nextId, detail => {
2292
+ throw invalidFrozenIndex(detail);
2293
+ });
2294
+ const indexedFields = Object.keys(snap.fieldIds);
2295
+ if (indexedFields.length !== snap.fieldCount) {
2296
+ throw invalidFrozenIndex('fieldIds count mismatch');
2297
+ }
2298
+ for (let f = 0; f < snap.fieldCount; f++) {
2299
+ const found = indexedFields.some(name => snap.fieldIds[name] === f);
2300
+ if (!found) {
2301
+ throw invalidFrozenIndex(`missing field id ${f}`);
2302
+ }
2303
+ }
2304
+ }
2305
+ function readFieldNamesSection(buf, fieldNamesOff, fieldCount, externalIdsOff) {
2306
+ const fieldNames = [];
2307
+ let o = fieldNamesOff;
2308
+ for (let f = 0; f < fieldCount; f++) {
2309
+ const { value, next } = readLengthPrefixedUtf8(buf, o);
2310
+ fieldNames.push(value);
2311
+ o = next;
2312
+ }
2313
+ if (o !== externalIdsOff) {
2314
+ throw invalidFrozenIndex('field names section size mismatch');
2315
+ }
2316
+ return fieldNames;
2317
+ }
2318
+ function readExternalIdsSection(buf, externalIdsOff, nextId, storedOff) {
2319
+ const externalIds = new Array(nextId);
2320
+ let o = externalIdsOff;
2321
+ for (let i = 0; i < nextId; i++) {
2322
+ const { value, next } = readExternalId(buf, o);
2323
+ externalIds[i] = value;
2324
+ o = next;
2325
+ }
2326
+ if (o !== storedOff) {
2327
+ throw invalidFrozenIndex('external ids section size mismatch');
2328
+ }
2329
+ return externalIds;
2330
+ }
2331
+ function readStoredFieldsSection(buf, storedOff, nextId, sectionEnd) {
2332
+ const storedFields = new Array(nextId);
2333
+ const tableEnd = storedOff + nextId * 4;
2334
+ if (tableEnd > sectionEnd) {
2335
+ throw invalidFrozenIndex('stored fields table out of bounds');
2336
+ }
2337
+ for (let i = 0; i < nextId; i++) {
2338
+ const rel = buf.readUInt32LE(storedOff + i * 4);
2339
+ if (rel === 0) {
2340
+ storedFields[i] = undefined;
2341
+ continue;
2342
+ }
2343
+ const entryOff = tableEnd + rel - 1;
2344
+ if (entryOff + 4 > sectionEnd) {
2345
+ throw invalidFrozenIndex('stored fields entry offset out of bounds');
2346
+ }
2347
+ const jsonLen = buf.readUInt32LE(entryOff);
2348
+ const jsonStart = entryOff + 4;
2349
+ const jsonEnd = jsonStart + jsonLen;
2350
+ if (jsonEnd > sectionEnd) {
2351
+ throw invalidFrozenIndex('stored fields JSON out of bounds');
2352
+ }
2353
+ storedFields[i] = JSON.parse(buf.toString('utf8', jsonStart, jsonEnd));
2354
+ }
2355
+ return storedFields;
2356
+ }
2357
+ /** Validate structural invariants of a decoded or assembled frozen snapshot. */
2358
+ function validateFrozenSnapshot(snap) {
2359
+ validateFrozenSnapshotNumeric(snap);
2360
+ const termCount = termCountOf(snap);
2361
+ if (snap.packedTermIndex != null) {
2362
+ validateFrozenTermIndexLeaves(snap.packedTermIndex, termCount);
2363
+ }
2364
+ else if (snap.termTree != null) {
2365
+ validateTermTreeLeaves(snap.termTree, termCount);
2366
+ }
2367
+ else {
2368
+ validateTreeShape(snap.treeShape, termCount);
2369
+ }
2370
+ }
2371
+ function fieldNamesFromFieldIds(fieldIds) {
2372
+ const names = Object.keys(fieldIds);
2373
+ names.sort((a, b) => fieldIds[a] - fieldIds[b]);
2374
+ return names;
2375
+ }
2376
+ /** Core with explicit {@link termCountOf} (no dictionary section). */
2377
+ function buildCoreSectionWithTermCount(snap) {
2378
+ const out = Buffer.alloc(16);
2379
+ out.writeUInt32LE(snap.documentCount, 0);
2380
+ out.writeUInt32LE(snap.nextId, 4);
2381
+ out.writeUInt32LE(snap.fieldCount, 8);
2382
+ out.writeUInt32LE(termCountOf(snap), 12);
2383
+ return out;
2384
+ }
2385
+ function buildFieldNamesSection(fieldNames) {
2386
+ const chunks = [];
2387
+ for (const name of fieldNames) {
2388
+ const body = Buffer.from(name, 'utf8');
2389
+ const header = Buffer.alloc(4);
2390
+ header.writeUInt32LE(body.length, 0);
2391
+ chunks.push(header, body);
2392
+ }
2393
+ return Buffer.concat(chunks);
2394
+ }
2395
+ function buildExternalIdsSection(externalIds, nextId) {
2396
+ const chunks = [];
2397
+ for (let i = 0; i < nextId; i++) {
2398
+ writeExternalId(chunks, externalIds[i]);
2399
+ }
2400
+ return Buffer.concat(chunks);
2401
+ }
2402
+ function buildStoredFieldsSection(storedFields, nextId) {
2403
+ const table = Buffer.alloc(nextId * 4);
2404
+ const heapChunks = [];
2405
+ let heapOff = 0;
2406
+ for (let i = 0; i < nextId; i++) {
2407
+ const row = storedFields[i];
2408
+ if (row == null) {
2409
+ table.writeUInt32LE(0, i * 4);
2410
+ continue;
2411
+ }
2412
+ table.writeUInt32LE(heapOff + 1, i * 4);
2413
+ const json = Buffer.from(JSON.stringify(row), 'utf8');
2414
+ const entry = Buffer.alloc(4 + json.length);
2415
+ entry.writeUInt32LE(json.length, 0);
2416
+ json.copy(entry, 4);
2417
+ heapChunks.push(entry);
2418
+ heapOff += entry.length;
2419
+ }
2420
+ return Buffer.concat([table, ...heapChunks]);
2421
+ }
2422
+ function validateTermTreeLeaves(tree, termCount) {
2423
+ for (const [key, val] of tree) {
2424
+ if (key === LEAF) {
2425
+ const idx = val;
2426
+ if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
2427
+ throw invalidFrozenIndex(`term tree leaf index out of range: ${idx}`);
2428
+ }
2429
+ }
2430
+ else {
2431
+ validateTermTreeLeaves(val, termCount);
2432
+ }
2433
+ }
2434
+ }
2435
+ function deserializeTermIndexTree(shape) {
2436
+ const tree = new Map();
2437
+ for (const [key, value] of shape) {
2438
+ if (key === LEAF) {
2439
+ tree.set(LEAF, value);
2440
+ }
2441
+ else {
2442
+ tree.set(key, deserializeTermIndexTree(value));
2443
+ }
2444
+ }
2445
+ return tree;
2446
+ }
2447
+
2448
+ /**
2449
+ * Runtime stored fields. Single store field → one column (no per-doc Record at rest).
2450
+ * Wire format stays row JSON; encode/decode can skip intermediate row arrays when layout is known.
2451
+ */
2452
+ function createStoredFieldsLayout(storeFields, capacity = 0) {
2453
+ if (storeFields.length === 0)
2454
+ return { kind: 'none' };
2455
+ if (storeFields.length === 1) {
2456
+ return { kind: 'single', field: storeFields[0], values: new Array(capacity) };
2457
+ }
2458
+ return { kind: 'multi', rows: new Array(capacity) };
2459
+ }
2460
+ function writeStoredField(layout, shortId, storeFields, extractField, document) {
2461
+ if (layout.kind === 'none')
2462
+ return;
2463
+ if (layout.kind === 'single') {
2464
+ layout.values[shortId] = extractField(document, layout.field);
2465
+ return;
2232
2466
  }
2233
- return fieldIds;
2467
+ const row = {};
2468
+ for (const name of storeFields) {
2469
+ const value = extractField(document, name);
2470
+ if (value !== undefined)
2471
+ row[name] = value;
2472
+ }
2473
+ layout.rows[shortId] = row;
2234
2474
  }
2235
- /** Token frequencies for one document field (after processTerm). */
2236
- function collectFieldTermFreqs(tokens, fieldName, processTerm) {
2237
- const localFreqs = new Map();
2238
- for (const term of tokens) {
2239
- const processedTerm = processTerm(term, fieldName);
2240
- if (Array.isArray(processedTerm)) {
2241
- for (const t of processedTerm) {
2242
- localFreqs.set(t, (localFreqs.get(t) || 0) + 1);
2243
- }
2244
- }
2245
- else if (processedTerm) {
2246
- localFreqs.set(processedTerm, (localFreqs.get(processedTerm) || 0) + 1);
2475
+ /** Materialize API/wire row for one document. */
2476
+ function readStoredFields(layout, shortId) {
2477
+ if (layout.kind === 'none')
2478
+ return undefined;
2479
+ if (layout.kind === 'multi')
2480
+ return layout.rows[shortId];
2481
+ const value = layout.values[shortId];
2482
+ if (value === undefined)
2483
+ return {};
2484
+ return { [layout.field]: value };
2485
+ }
2486
+ function resizeStoredFields(layout, length) {
2487
+ if (layout.kind === 'none')
2488
+ return layout;
2489
+ if (layout.kind === 'single') {
2490
+ return layout.values.length <= length
2491
+ ? layout
2492
+ : { kind: 'single', field: layout.field, values: layout.values.slice(0, length) };
2493
+ }
2494
+ return layout.rows.length <= length
2495
+ ? layout
2496
+ : { kind: 'multi', rows: layout.rows.slice(0, length) };
2497
+ }
2498
+ function cloneStoredFields(layout) {
2499
+ if (layout.kind === 'none')
2500
+ return layout;
2501
+ if (layout.kind === 'single') {
2502
+ return { kind: 'single', field: layout.field, values: layout.values.slice() };
2503
+ }
2504
+ return { kind: 'multi', rows: layout.rows.slice() };
2505
+ }
2506
+ /** Import from wire rows or lucaong snapshot. Empty storeFields + non-empty rows → multi (binary load without options). */
2507
+ function storedFieldsFromRows(rows, storeFields) {
2508
+ if (storeFields.length === 0) {
2509
+ const hasAny = rows.some(row => row != null && Object.keys(row).length > 0);
2510
+ return hasAny ? { kind: 'multi', rows } : { kind: 'none' };
2511
+ }
2512
+ if (storeFields.length === 1) {
2513
+ const field = storeFields[0];
2514
+ const values = rows.map(row => row === null || row === void 0 ? void 0 : row[field]);
2515
+ return { kind: 'single', field, values };
2516
+ }
2517
+ return { kind: 'multi', rows };
2518
+ }
2519
+ function storedFieldsJsonBytes(layout) {
2520
+ if (layout.kind === 'none')
2521
+ return 0;
2522
+ if (layout.kind === 'multi') {
2523
+ let total = 0;
2524
+ for (const row of layout.rows) {
2525
+ if (row != null)
2526
+ total += JSON.stringify(row).length;
2247
2527
  }
2528
+ return total;
2248
2529
  }
2249
- return localFreqs;
2530
+ let total = 0;
2531
+ const { field, values } = layout;
2532
+ for (let i = 0; i < values.length; i++) {
2533
+ const value = values[i];
2534
+ if (value !== undefined)
2535
+ total += JSON.stringify({ [field]: value }).length;
2536
+ }
2537
+ return total;
2250
2538
  }
2251
- /** Same running average as {@link MiniSearch} private addFieldLength. */
2252
- function updateAvgFieldLength(avgFieldLength, fieldId, count, length) {
2253
- const averageFieldLength = avgFieldLength[fieldId] || 0;
2254
- const totalFieldLength = (averageFieldLength * count) + length;
2255
- avgFieldLength[fieldId] = totalFieldLength / (count + 1);
2539
+ function storedFieldsSlotCount(layout) {
2540
+ if (layout.kind === 'none')
2541
+ return 0;
2542
+ return layout.kind === 'single' ? layout.values.length : layout.rows.length;
2543
+ }
2544
+ function appendStoredFieldJsonEntry(table, heapChunks, heapOffRef, docIndex, jsonUtf8) {
2545
+ table.writeUInt32LE(heapOffRef.value + 1, docIndex * 4);
2546
+ const entry = Buffer.alloc(4 + jsonUtf8.length);
2547
+ entry.writeUInt32LE(jsonUtf8.length, 0);
2548
+ jsonUtf8.copy(entry, 4);
2549
+ heapChunks.push(entry);
2550
+ heapOffRef.value += entry.length;
2551
+ }
2552
+ /** MSv5 StoredFields section from {@link StoredFieldsLayout} (no intermediate row array). */
2553
+ function buildStoredFieldsWireSection(layout, nextId) {
2554
+ if (layout.kind === 'multi') {
2555
+ const rows = layout.rows.length >= nextId
2556
+ ? layout.rows
2557
+ : layout.rows.concat(new Array(nextId - layout.rows.length));
2558
+ return buildStoredFieldsSection(rows, nextId);
2559
+ }
2560
+ const table = Buffer.alloc(nextId * 4);
2561
+ if (layout.kind === 'none')
2562
+ return table;
2563
+ const heapChunks = [];
2564
+ const heapOffRef = { value: 0 };
2565
+ const { field, values } = layout;
2566
+ for (let i = 0; i < nextId; i++) {
2567
+ const value = values[i];
2568
+ if (value === undefined)
2569
+ continue;
2570
+ const jsonUtf8 = Buffer.from(JSON.stringify({ [field]: value }), 'utf8');
2571
+ appendStoredFieldJsonEntry(table, heapChunks, heapOffRef, i, jsonUtf8);
2572
+ }
2573
+ return heapChunks.length === 0 ? table : Buffer.concat([table, ...heapChunks]);
2256
2574
  }
2257
- function saveStoredFieldsForDocument(storeFields, extractField, document) {
2258
- if (storeFields.length === 0)
2259
- return undefined;
2260
- const documentFields = {};
2261
- for (const fieldName of storeFields) {
2262
- const fieldValue = extractField(document, fieldName);
2263
- if (fieldValue !== undefined)
2264
- documentFields[fieldName] = fieldValue;
2575
+ function storedFieldsTableEnd(storedOff, nextId, sectionEnd) {
2576
+ const tableEnd = storedOff + nextId * 4;
2577
+ if (tableEnd > sectionEnd) {
2578
+ throw invalidFrozenIndex('stored fields table out of bounds');
2579
+ }
2580
+ return tableEnd;
2581
+ }
2582
+ function readStoredFieldJsonAt(buf, tableEnd, sectionEnd, rel) {
2583
+ const entryOff = tableEnd + rel - 1;
2584
+ if (entryOff + 4 > sectionEnd) {
2585
+ throw invalidFrozenIndex('stored fields entry offset out of bounds');
2586
+ }
2587
+ const jsonLen = buf.readUInt32LE(entryOff);
2588
+ const jsonStart = entryOff + 4;
2589
+ const jsonEnd = jsonStart + jsonLen;
2590
+ if (jsonEnd > sectionEnd) {
2591
+ throw invalidFrozenIndex('stored fields JSON out of bounds');
2592
+ }
2593
+ return JSON.parse(buf.toString('utf8', jsonStart, jsonEnd));
2594
+ }
2595
+ /** MSv5 StoredFields section → layout (skips row materialization when storeFields hint allows). */
2596
+ function readStoredFieldsWireSection(buf, storedOff, nextId, sectionEnd, storeFields) {
2597
+ const tableEnd = storedFieldsTableEnd(storedOff, nextId, sectionEnd);
2598
+ if (storeFields.length === 1) {
2599
+ const field = storeFields[0];
2600
+ const values = new Array(nextId);
2601
+ for (let i = 0; i < nextId; i++) {
2602
+ const rel = buf.readUInt32LE(storedOff + i * 4);
2603
+ if (rel === 0)
2604
+ continue;
2605
+ const row = readStoredFieldJsonAt(buf, tableEnd, sectionEnd, rel);
2606
+ values[i] = row[field];
2607
+ }
2608
+ return { kind: 'single', field, values };
2609
+ }
2610
+ if (storeFields.length === 0) {
2611
+ let hasAny = false;
2612
+ for (let i = 0; i < nextId; i++) {
2613
+ if (buf.readUInt32LE(storedOff + i * 4) !== 0) {
2614
+ hasAny = true;
2615
+ break;
2616
+ }
2617
+ }
2618
+ if (!hasAny)
2619
+ return { kind: 'none' };
2265
2620
  }
2266
- return documentFields;
2621
+ const rows = readStoredFieldsSection(buf, storedOff, nextId, sectionEnd);
2622
+ return storedFieldsFromRows(rows, storeFields);
2267
2623
  }
2268
2624
 
2269
2625
  const SUPPORTED_SERIALIZATION_VERSIONS = new Set([1, 2]);
@@ -2349,7 +2705,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
2349
2705
  let shortIdRemap = null;
2350
2706
  const resolvedNextId = useDense ? documentCount : nextId;
2351
2707
  const externalIds = new Array(resolvedNextId);
2352
- const storedFields = new Array(externalIds.length);
2708
+ const storedFieldRows = new Array(externalIds.length);
2353
2709
  if (useDense) {
2354
2710
  shortIdRemap = new Uint32Array(nextId);
2355
2711
  shortIdRemap.fill(DISCARDED_DOC_ID);
@@ -2361,7 +2717,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
2361
2717
  const shortIdStr = String(shortId);
2362
2718
  shortIdRemap[shortId] = dense;
2363
2719
  externalIds[dense] = snapshot.documentIds[shortIdStr];
2364
- storedFields[dense] = snapshot.storedFields[shortIdStr];
2720
+ storedFieldRows[dense] = snapshot.storedFields[shortIdStr];
2365
2721
  dense++;
2366
2722
  }
2367
2723
  }
@@ -2369,7 +2725,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
2369
2725
  for (const [shortIdStr, id] of Object.entries(snapshot.documentIds)) {
2370
2726
  const shortId = parseInt(shortIdStr, 10);
2371
2727
  externalIds[shortId] = id;
2372
- storedFields[shortId] = snapshot.storedFields[shortIdStr];
2728
+ storedFieldRows[shortId] = snapshot.storedFields[shortIdStr];
2373
2729
  }
2374
2730
  }
2375
2731
  const idLookup = createIdToShortIdLookup(externalIds, resolvedNextId);
@@ -2392,6 +2748,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
2392
2748
  }
2393
2749
  const searchableMap = buildSearchableMapFromSnapshot(snapshot);
2394
2750
  const flat = buildFlatPostingsFromSearchableMap(searchableMap, fieldCount, resolvedNextId, shortIdRemap);
2751
+ const storedFields = storedFieldsFromRows(storedFieldRows, opts.storeFields);
2395
2752
  return {
2396
2753
  options: opts,
2397
2754
  documentCount,
@@ -2693,321 +3050,121 @@ function collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32)
2693
3050
  sectionId++;
2694
3051
  }
2695
3052
  }
2696
- }
2697
- function finish() {
2698
- emitEmptySections();
2699
- if (streamOffset !== uncompressedLength || sectionId !== directory.length) {
2700
- throw new Error('MSv5 zstd decompressed length mismatch');
2701
- }
2702
- if (payloadCrc !== payloadCrc32) {
2703
- throw new Error('MSv5 payload CRC mismatch');
2704
- }
2705
- }
2706
- return { sections, consume, finish };
2707
- }
2708
- function loadMsv5SectionsFromZstdStream(compressed, directory, uncompressedLength, payloadCrc32) {
2709
- return new Promise((resolve, reject) => {
2710
- const collector = collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32);
2711
- const stream = zlib.createZstdDecompress();
2712
- stream.on('data', (chunk) => {
2713
- try {
2714
- collector.consume(chunk);
2715
- }
2716
- catch (err) {
2717
- stream.destroy(err);
2718
- }
2719
- });
2720
- stream.on('error', reject);
2721
- stream.on('end', () => {
2722
- try {
2723
- collector.finish();
2724
- resolve(collector.sections);
2725
- }
2726
- catch (err) {
2727
- reject(err);
2728
- }
2729
- });
2730
- stream.end(compressed);
2731
- });
2732
- }
2733
- function validatePayloadDirectory(directory, uncompressedLength) {
2734
- let prevEnd = 0;
2735
- for (const entry of directory) {
2736
- if ((entry.fileOffset & 3) !== 0) {
2737
- throw new Error('MSv5 section offset not aligned');
2738
- }
2739
- if (entry.fileOffset < prevEnd) {
2740
- throw new Error('MSv5 section offsets not monotonic');
2741
- }
2742
- if (entry.fileOffset + entry.uncompressedLength > uncompressedLength) {
2743
- throw new Error('MSv5 section out of uncompressed bounds');
2744
- }
2745
- prevEnd = entry.fileOffset + entry.uncompressedLength;
2746
- }
2747
- if (prevEnd !== uncompressedLength) {
2748
- throw new Error('MSv5 uncompressed payload length mismatch');
2749
- }
2750
- }
2751
- /** Shared validation + bounds for both the sync and async load paths. */
2752
- function preparePayload(fileBuf, directory) {
2753
- assertPayloadFormatRev(fileBuf);
2754
- const { payloadOffset, compressedLength, uncompressedLength, payloadCrc32, payloadCodec } = readPayloadMeta(fileBuf);
2755
- validatePayloadDirectory(directory, uncompressedLength);
2756
- if (payloadOffset !== MSV5_HEADER_SIZE || payloadOffset + compressedLength > fileBuf.length) {
2757
- throw new Error('MSv5 payload out of bounds');
2758
- }
2759
- if (payloadCodec === CODEC_RAW && compressedLength !== uncompressedLength) {
2760
- throw new Error('MSv5 raw payload length mismatch');
2761
- }
2762
- return {
2763
- payloadCodec,
2764
- slice: fileBuf.subarray(payloadOffset, payloadOffset + compressedLength),
2765
- uncompressedLength,
2766
- payloadCrc32,
2767
- };
2768
- }
2769
- /** Synchronous load; peak RAM ≈ full uncompressed payload (use the async path to bound it). */
2770
- function loadMsv5Sections(fileBuf, directory) {
2771
- const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
2772
- if (payloadCodec === CODEC_RAW) {
2773
- return sectionsFromPayload(slice, directory, payloadCrc32);
2774
- }
2775
- if (payloadCodec === CODEC_ZSTD) {
2776
- if (!zstdAvailable()) {
2777
- throw zstdUnavailableReadError();
2778
- }
2779
- // Native cap matches readPayloadMeta's 1 GiB limit (see MSV5_MAX_UNCOMPRESSED_BYTES).
2780
- // Using header `uncompressedLength` here would only help when the header understates
2781
- // the zstd stream but the attacker can inflate the header too — same worst case.
2782
- const decoded = zlib.zstdDecompressSync(slice, {
2783
- maxOutputLength: MSV5_MAX_UNCOMPRESSED_BYTES,
2784
- });
2785
- if (decoded.length !== uncompressedLength) {
2786
- throw new Error('MSv5 zstd decompressed length mismatch');
2787
- }
2788
- return sectionsFromPayload(decoded, directory, payloadCrc32);
2789
- }
2790
- throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
2791
- }
2792
- /** Streaming load; peak main-thread RAM ≈ largest single section (+ file buffer). */
2793
- async function loadMsv5SectionsAsync(fileBuf, directory) {
2794
- const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
2795
- if (payloadCodec === CODEC_RAW) {
2796
- return sectionsFromPayload(slice, directory, payloadCrc32);
2797
- }
2798
- if (payloadCodec === CODEC_ZSTD) {
2799
- if (!zstdAvailable()) {
2800
- throw zstdUnavailableReadError();
2801
- }
2802
- return loadMsv5SectionsFromZstdStream(slice, directory, uncompressedLength, payloadCrc32);
2803
- }
2804
- throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
2805
- }
2806
- function isMsv5Buffer(buf) {
2807
- return buf.length >= 4 && buf.toString('ascii', 0, 4) === 'MSv5';
2808
- }
2809
- function readMsv5GlobalFlags(buf) {
2810
- return buf.readUInt16LE(6);
2811
- }
2812
-
2813
- function validateTreeShape(shape, termCount) {
2814
- if (!Array.isArray(shape)) {
2815
- throw invalidFrozenIndex('treeShape node must be an array');
2816
- }
2817
- for (const entry of shape) {
2818
- if (!Array.isArray(entry) || entry.length !== 2) {
2819
- throw invalidFrozenIndex('treeShape entry must be a [key, value] pair');
2820
- }
2821
- const [key, value] = entry;
2822
- if (key === LEAF) {
2823
- const idx = value;
2824
- if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
2825
- throw invalidFrozenIndex(`treeShape leaf term index out of range: ${idx}`);
2826
- }
2827
- }
2828
- else {
2829
- validateTreeShape(value, termCount);
2830
- }
2831
- }
2832
- }
2833
- function termCountOf(snap) {
2834
- return snap.postings.termCount;
2835
- }
2836
- /**
2837
- * Numeric/structural invariants shared by both the decode path (untrusted binary)
2838
- * and the build path (trusted internal code).
2839
- */
2840
- function validateFrozenSnapshotNumeric(snap) {
2841
- if (snap.fieldCount <= 0) {
2842
- throw invalidFrozenIndex('fieldCount must be positive');
2843
- }
2844
- if (snap.nextId < 0 || snap.nextId >= 0xffffffff) {
2845
- throw invalidFrozenIndex('nextId out of range');
2846
- }
2847
- if (snap.documentCount < 0 || snap.documentCount > snap.nextId) {
2848
- throw invalidFrozenIndex('documentCount inconsistent with nextId');
2849
- }
2850
- if (snap.fieldLengthMatrix.length !== snap.nextId * snap.fieldCount) {
2851
- throw invalidFrozenIndex('fieldLengthMatrix size mismatch');
2852
- }
2853
- if (snap.avgFieldLength.length !== snap.fieldCount) {
2854
- throw invalidFrozenIndex('avgFieldLength size mismatch');
2855
- }
2856
- validateFrozenPostingsLayout(snap.postings, snap.documentCount, snap.nextId, detail => {
2857
- throw invalidFrozenIndex(detail);
2858
- });
2859
- const indexedFields = Object.keys(snap.fieldIds);
2860
- if (indexedFields.length !== snap.fieldCount) {
2861
- throw invalidFrozenIndex('fieldIds count mismatch');
2862
- }
2863
- for (let f = 0; f < snap.fieldCount; f++) {
2864
- const found = indexedFields.some(name => snap.fieldIds[name] === f);
2865
- if (!found) {
2866
- throw invalidFrozenIndex(`missing field id ${f}`);
2867
- }
2868
- }
2869
- }
2870
- function readFieldNamesSection(buf, fieldNamesOff, fieldCount, externalIdsOff) {
2871
- const fieldNames = [];
2872
- let o = fieldNamesOff;
2873
- for (let f = 0; f < fieldCount; f++) {
2874
- const { value, next } = readLengthPrefixedUtf8(buf, o);
2875
- fieldNames.push(value);
2876
- o = next;
2877
- }
2878
- if (o !== externalIdsOff) {
2879
- throw invalidFrozenIndex('field names section size mismatch');
2880
- }
2881
- return fieldNames;
2882
- }
2883
- function readExternalIdsSection(buf, externalIdsOff, nextId, storedOff) {
2884
- const externalIds = new Array(nextId);
2885
- let o = externalIdsOff;
2886
- for (let i = 0; i < nextId; i++) {
2887
- const { value, next } = readExternalId(buf, o);
2888
- externalIds[i] = value;
2889
- o = next;
2890
- }
2891
- if (o !== storedOff) {
2892
- throw invalidFrozenIndex('external ids section size mismatch');
2893
- }
2894
- return externalIds;
2895
- }
2896
- function readStoredFieldsSection(buf, storedOff, nextId, sectionEnd) {
2897
- const storedFields = new Array(nextId);
2898
- const tableEnd = storedOff + nextId * 4;
2899
- if (tableEnd > sectionEnd) {
2900
- throw invalidFrozenIndex('stored fields table out of bounds');
2901
- }
2902
- for (let i = 0; i < nextId; i++) {
2903
- const rel = buf.readUInt32LE(storedOff + i * 4);
2904
- if (rel === 0) {
2905
- storedFields[i] = undefined;
2906
- continue;
2907
- }
2908
- const entryOff = tableEnd + rel - 1;
2909
- if (entryOff + 4 > sectionEnd) {
2910
- throw invalidFrozenIndex('stored fields entry offset out of bounds');
2911
- }
2912
- const jsonLen = buf.readUInt32LE(entryOff);
2913
- const jsonStart = entryOff + 4;
2914
- const jsonEnd = jsonStart + jsonLen;
2915
- if (jsonEnd > sectionEnd) {
2916
- throw invalidFrozenIndex('stored fields JSON out of bounds');
3053
+ }
3054
+ function finish() {
3055
+ emitEmptySections();
3056
+ if (streamOffset !== uncompressedLength || sectionId !== directory.length) {
3057
+ throw new Error('MSv5 zstd decompressed length mismatch');
3058
+ }
3059
+ if (payloadCrc !== payloadCrc32) {
3060
+ throw new Error('MSv5 payload CRC mismatch');
2917
3061
  }
2918
- storedFields[i] = JSON.parse(buf.toString('utf8', jsonStart, jsonEnd));
2919
3062
  }
2920
- return storedFields;
3063
+ return { sections, consume, finish };
2921
3064
  }
2922
- /** Validate structural invariants of a decoded or assembled frozen snapshot. */
2923
- function validateFrozenSnapshot(snap) {
2924
- validateFrozenSnapshotNumeric(snap);
2925
- const termCount = termCountOf(snap);
2926
- if (snap.packedTermIndex != null) {
2927
- validateFrozenTermIndexLeaves(snap.packedTermIndex, termCount);
2928
- }
2929
- else if (snap.termTree != null) {
2930
- validateTermTreeLeaves(snap.termTree, termCount);
3065
/**
 * Decompresses a zstd payload through a streaming decoder and returns a Promise for its sections.
 *
 * Every decompressed chunk is handed to the collector created by `collectZstdPayloadSections`.
 * The Promise resolves with `collector.sections` once the stream ends and `collector.finish()`
 * succeeds; it rejects on a stream error or when `finish()` throws.
 *
 * @param compressed payload bytes written to the decompression stream.
 * @param directory forwarded unchanged to `collectZstdPayloadSections`.
 * @param uncompressedLength forwarded unchanged to `collectZstdPayloadSections`.
 * @param payloadCrc32 forwarded unchanged to `collectZstdPayloadSections`.
 * @returns Promise resolving to `collector.sections`.
 */
+ function loadMsv5SectionsFromZstdStream(compressed, directory, uncompressedLength, payloadCrc32) {
3066
+ return new Promise((resolve, reject) => {
3067
+ const collector = collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32);
3068
+ const stream = zlib.createZstdDecompress();
3069
+ stream.on('data', (chunk) => {
3070
+ try {
3071
+ collector.consume(chunk);
3072
+ }
3073
+ catch (err) {
// A collector failure tears the stream down with that error; it is expected to
// surface through the 'error' listener registered below and reject the Promise.
3074
+ stream.destroy(err);
3075
+ }
3076
+ });
3077
+ stream.on('error', reject);
3078
+ stream.on('end', () => {
3079
+ try {
// finish() may throw, so the Promise only resolves after it has returned normally.
3080
+ collector.finish();
3081
+ resolve(collector.sections);
3082
+ }
3083
+ catch (err) {
3084
+ reject(err);
3085
+ }
3086
+ });
// Write the whole compressed payload in one call and signal end of input.
3087
+ stream.end(compressed);
3088
+ });
3089
+ }
3090
+ function validatePayloadDirectory(directory, uncompressedLength) {
3091
+ let prevEnd = 0;
3092
+ for (const entry of directory) {
3093
+ if ((entry.fileOffset & 3) !== 0) {
3094
+ throw new Error('MSv5 section offset not aligned');
3095
+ }
3096
+ if (entry.fileOffset < prevEnd) {
3097
+ throw new Error('MSv5 section offsets not monotonic');
3098
+ }
3099
+ if (entry.fileOffset + entry.uncompressedLength > uncompressedLength) {
3100
+ throw new Error('MSv5 section out of uncompressed bounds');
3101
+ }
3102
+ prevEnd = entry.fileOffset + entry.uncompressedLength;
2931
3103
  }
2932
- else {
2933
- validateTreeShape(snap.treeShape, termCount);
3104
+ if (prevEnd !== uncompressedLength) {
3105
+ throw new Error('MSv5 uncompressed payload length mismatch');
2934
3106
  }
2935
3107
  }
2936
- function fieldNamesFromFieldIds(fieldIds) {
2937
- const names = Object.keys(fieldIds);
2938
- names.sort((a, b) => fieldIds[a] - fieldIds[b]);
2939
- return names;
2940
- }
2941
- /** Core with explicit {@link termCountOf} (no dictionary section). */
2942
- function buildCoreSectionWithTermCount(snap) {
2943
- const out = Buffer.alloc(16);
2944
- out.writeUInt32LE(snap.documentCount, 0);
2945
- out.writeUInt32LE(snap.nextId, 4);
2946
- out.writeUInt32LE(snap.fieldCount, 8);
2947
- out.writeUInt32LE(termCountOf(snap), 12);
2948
- return out;
2949
- }
2950
- function buildFieldNamesSection(fieldNames) {
2951
- const chunks = [];
2952
- for (const name of fieldNames) {
2953
- const body = Buffer.from(name, 'utf8');
2954
- const header = Buffer.alloc(4);
2955
- header.writeUInt32LE(body.length, 0);
2956
- chunks.push(header, body);
3108
+ /** Shared validation + bounds for both the sync and async load paths. */
3109
+ function preparePayload(fileBuf, directory) {
3110
+ assertPayloadFormatRev(fileBuf);
3111
+ const { payloadOffset, compressedLength, uncompressedLength, payloadCrc32, payloadCodec } = readPayloadMeta(fileBuf);
3112
+ validatePayloadDirectory(directory, uncompressedLength);
3113
+ if (payloadOffset !== MSV5_HEADER_SIZE || payloadOffset + compressedLength > fileBuf.length) {
3114
+ throw new Error('MSv5 payload out of bounds');
2957
3115
  }
2958
- return Buffer.concat(chunks);
2959
- }
2960
- function buildExternalIdsSection(externalIds, nextId) {
2961
- const chunks = [];
2962
- for (let i = 0; i < nextId; i++) {
2963
- writeExternalId(chunks, externalIds[i]);
3116
+ if (payloadCodec === CODEC_RAW && compressedLength !== uncompressedLength) {
3117
+ throw new Error('MSv5 raw payload length mismatch');
2964
3118
  }
2965
- return Buffer.concat(chunks);
3119
+ return {
3120
+ payloadCodec,
3121
+ slice: fileBuf.subarray(payloadOffset, payloadOffset + compressedLength),
3122
+ uncompressedLength,
3123
+ payloadCrc32,
3124
+ };
2966
3125
  }
2967
- function buildStoredFieldsSection(storedFields, nextId) {
2968
- const table = Buffer.alloc(nextId * 4);
2969
- const heapChunks = [];
2970
- let heapOff = 0;
2971
- for (let i = 0; i < nextId; i++) {
2972
- const row = storedFields[i];
2973
- if (row == null) {
2974
- table.writeUInt32LE(0, i * 4);
2975
- continue;
2976
- }
2977
- table.writeUInt32LE(heapOff + 1, i * 4);
2978
- const json = Buffer.from(JSON.stringify(row), 'utf8');
2979
- const entry = Buffer.alloc(4 + json.length);
2980
- entry.writeUInt32LE(json.length, 0);
2981
- json.copy(entry, 4);
2982
- heapChunks.push(entry);
2983
- heapOff += entry.length;
3126
+ /** Synchronous load; peak RAM ≈ full uncompressed payload (use the async path to bound it). */
3127
+ function loadMsv5Sections(fileBuf, directory) {
3128
+ const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
3129
+ if (payloadCodec === CODEC_RAW) {
3130
+ return sectionsFromPayload(slice, directory, payloadCrc32);
2984
3131
  }
2985
- return Buffer.concat([table, ...heapChunks]);
2986
- }
2987
- function validateTermTreeLeaves(tree, termCount) {
2988
- for (const [key, val] of tree) {
2989
- if (key === LEAF) {
2990
- const idx = val;
2991
- if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
2992
- throw invalidFrozenIndex(`term tree leaf index out of range: ${idx}`);
2993
- }
3132
+ if (payloadCodec === CODEC_ZSTD) {
3133
+ if (!zstdAvailable()) {
3134
+ throw zstdUnavailableReadError();
2994
3135
  }
2995
- else {
2996
- validateTermTreeLeaves(val, termCount);
3136
+ // Native cap matches readPayloadMeta's 1 GiB limit (see MSV5_MAX_UNCOMPRESSED_BYTES).
3137
+ // Using header `uncompressedLength` here would only help when the header understates
3138
+ // the zstd stream but the attacker can inflate the header too — same worst case.
3139
+ const decoded = zlib.zstdDecompressSync(slice, {
3140
+ maxOutputLength: MSV5_MAX_UNCOMPRESSED_BYTES,
3141
+ });
3142
+ if (decoded.length !== uncompressedLength) {
3143
+ throw new Error('MSv5 zstd decompressed length mismatch');
2997
3144
  }
3145
+ return sectionsFromPayload(decoded, directory, payloadCrc32);
2998
3146
  }
3147
+ throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
2999
3148
  }
3000
- function deserializeTermIndexTree(shape) {
3001
- const tree = new Map();
3002
- for (const [key, value] of shape) {
3003
- if (key === LEAF) {
3004
- tree.set(LEAF, value);
3005
- }
3006
- else {
3007
- tree.set(key, deserializeTermIndexTree(value));
3149
+ /** Streaming load; peak main-thread RAM ≈ largest single section (+ file buffer). */
3150
+ async function loadMsv5SectionsAsync(fileBuf, directory) {
3151
+ const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
3152
+ if (payloadCodec === CODEC_RAW) {
3153
+ return sectionsFromPayload(slice, directory, payloadCrc32);
3154
+ }
3155
+ if (payloadCodec === CODEC_ZSTD) {
3156
+ if (!zstdAvailable()) {
3157
+ throw zstdUnavailableReadError();
3008
3158
  }
3159
+ return loadMsv5SectionsFromZstdStream(slice, directory, uncompressedLength, payloadCrc32);
3009
3160
  }
3010
- return tree;
3161
+ throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
3162
+ }
3163
/**
 * Reports whether `buf` starts with the four-byte magic "MSv5".
 *
 * The bytes are compared directly instead of decoding them with the 'ascii'
 * encoding: Node's 'ascii' decoder clears the high bit of every byte, so a
 * corrupt header such as CD D3 F6 B5 would otherwise be accepted as "MSv5".
 * Direct comparison also works for plain Uint8Array inputs and allocates no
 * intermediate string.
 *
 * @param {Uint8Array} buf bytes to inspect (a Node Buffer is a Uint8Array).
 * @returns {boolean} true when at least four bytes are present and they spell "MSv5".
 */
function isMsv5Buffer(buf) {
    return buf.length >= 4
        && buf[0] === 0x4d // 'M'
        && buf[1] === 0x53 // 'S'
        && buf[2] === 0x76 // 'v'
        && buf[3] === 0x35; // '5'
}
3166
+ function readMsv5GlobalFlags(buf) {
3167
+ return buf.readUInt16LE(6);
3011
3168
  }
3012
3169
 
3013
3170
  /** Global wire flags for {@link FreqArray} width. */
@@ -3301,11 +3458,14 @@ function encodeFrozenSnapshotMsv5(snap, termTree, packedTermIndex) {
3301
3458
  const flFlags = fieldLengthMatrixWireFlags(snap.fieldLengthMatrix);
3302
3459
  const freqFlags = freqWireFlags(snap.postings.allFreqs);
3303
3460
  const globalFlags = postingsWire.flags | flFlags | freqFlags;
3461
+ const storedFieldsSection = snap.storedFieldsLayout != null
3462
+ ? buildStoredFieldsWireSection(snap.storedFieldsLayout, snap.nextId)
3463
+ : buildStoredFieldsSection(snap.storedFields, snap.nextId);
3304
3464
  const rawSections = [
3305
3465
  buildCoreSectionWithTermCount(snap),
3306
3466
  buildFieldNamesSection(fieldNames),
3307
3467
  buildExternalIdsSection(snap.externalIds, snap.nextId),
3308
- buildStoredFieldsSection(snap.storedFields, snap.nextId),
3468
+ storedFieldsSection,
3309
3469
  buildTermTreeSectionColumnar(packed),
3310
3470
  bufferFromView(snap.avgFieldLength),
3311
3471
  buildFieldLengthMatrixSection(snap.fieldLengthMatrix),
@@ -3329,11 +3489,14 @@ async function encodeFrozenSnapshotMsv5Async(snap, termTree, packedTermIndex) {
3329
3489
  const flFlags = fieldLengthMatrixWireFlags(snap.fieldLengthMatrix);
3330
3490
  const freqFlags = freqWireFlags(snap.postings.allFreqs);
3331
3491
  const globalFlags = postingsWire.flags | flFlags | freqFlags;
3492
+ const storedFieldsSection = snap.storedFieldsLayout != null
3493
+ ? buildStoredFieldsWireSection(snap.storedFieldsLayout, snap.nextId)
3494
+ : buildStoredFieldsSection(snap.storedFields, snap.nextId);
3332
3495
  const rawSections = [
3333
3496
  buildCoreSectionWithTermCount(snap),
3334
3497
  buildFieldNamesSection(fieldNames),
3335
3498
  buildExternalIdsSection(snap.externalIds, snap.nextId),
3336
- buildStoredFieldsSection(snap.storedFields, snap.nextId),
3499
+ storedFieldsSection,
3337
3500
  buildTermTreeSectionColumnar(packed),
3338
3501
  bufferFromView(snap.avgFieldLength),
3339
3502
  buildFieldLengthMatrixSection(snap.fieldLengthMatrix),
@@ -3363,7 +3526,7 @@ function validateMsv5Container(buf) {
3363
3526
  }
3364
3527
  return { globalFlags, directory };
3365
3528
  }
3366
- function decodeMsv5Sections(globalFlags, sections) {
3529
+ function decodeMsv5Sections(globalFlags, sections, hints) {
3367
3530
  const core = sections[0 /* Msv5SectionId.Core */];
3368
3531
  if (core.length !== 16) {
3369
3532
  throw invalidFrozenIndex('core section size mismatch');
@@ -3378,7 +3541,12 @@ function decodeMsv5Sections(globalFlags, sections) {
3378
3541
  fieldIds[fieldNames[f]] = f;
3379
3542
  }
3380
3543
  const externalIds = readExternalIdsSection(sections[2 /* Msv5SectionId.ExternalIds */], 0, nextId, sections[2 /* Msv5SectionId.ExternalIds */].length);
3381
- const storedFields = readStoredFieldsSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length);
3544
+ const storedFieldsLayout = hints != null
3545
+ ? readStoredFieldsWireSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length, hints.storeFields)
3546
+ : undefined;
3547
+ const storedFields = storedFieldsLayout != null
3548
+ ? new Array(nextId)
3549
+ : readStoredFieldsSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length);
3382
3550
  const packedTermIndex = readPackedTermTreeSectionColumnar(sections[4 /* Msv5SectionId.TermTree */], termCount);
3383
3551
  const avgBuf = sections[5 /* Msv5SectionId.AvgFieldLength */];
3384
3552
  const avgFieldLength = readFloat32Array(avgBuf, 0, avgBuf.length);
@@ -3396,6 +3564,7 @@ function decodeMsv5Sections(globalFlags, sections) {
3396
3564
  avgFieldLength,
3397
3565
  externalIds,
3398
3566
  storedFields,
3567
+ storedFieldsLayout,
3399
3568
  fieldLengthMatrix,
3400
3569
  treeShape: [],
3401
3570
  packedTermIndex,
@@ -3404,13 +3573,13 @@ function decodeMsv5Sections(globalFlags, sections) {
3404
3573
  validateFrozenSnapshot(snap);
3405
3574
  return snap;
3406
3575
  }
3407
- function decodeFrozenSnapshotMsv5(buf) {
3576
+ function decodeFrozenSnapshotMsv5(buf, hints) {
3408
3577
  const { globalFlags, directory } = validateMsv5Container(buf);
3409
- return decodeMsv5Sections(globalFlags, loadMsv5Sections(buf, directory));
3578
+ return decodeMsv5Sections(globalFlags, loadMsv5Sections(buf, directory), hints);
3410
3579
  }
3411
- async function decodeFrozenSnapshotMsv5Async(buf) {
3580
+ async function decodeFrozenSnapshotMsv5Async(buf, hints) {
3412
3581
  const { globalFlags, directory } = validateMsv5Container(buf);
3413
- return decodeMsv5Sections(globalFlags, await loadMsv5SectionsAsync(buf, directory));
3582
+ return decodeMsv5Sections(globalFlags, await loadMsv5SectionsAsync(buf, directory), hints);
3414
3583
  }
3415
3584
 
3416
3585
  /** Encode a frozen snapshot as a binary buffer. */
@@ -3424,12 +3593,12 @@ function encodeFrozenSnapshotAsync(snap, termTree, packedTermIndex) {
3424
3593
 
3425
3594
  const LEGACY_MAGICS = new Set(['MSv1', 'MSv2', 'MSv3', 'MSv4']);
3426
3595
  /** Decode a frozen binary snapshot buffer. */
3427
- function decodeFrozenSnapshot(buf) {
3596
+ function decodeFrozenSnapshot(buf, hints) {
3428
3597
  assertBufferLength(buf, 8);
3429
3598
  const magic = buf.toString('ascii', 0, 4);
3430
3599
  const version = buf.readUInt16LE(4);
3431
3600
  if (isMsv5Buffer(buf) && version === 5) {
3432
- return decodeFrozenSnapshotMsv5(buf);
3601
+ return decodeFrozenSnapshotMsv5(buf, hints);
3433
3602
  }
3434
3603
  if (LEGACY_MAGICS.has(magic)) {
3435
3604
  throw invalidFrozenIndex('Unsupported frozen binary snapshot; re-build with saveBinarySync() or from lucaong JSON');
@@ -3437,82 +3606,283 @@ function decodeFrozenSnapshot(buf) {
3437
3606
  throw invalidFrozenIndex('Unsupported frozen binary snapshot');
3438
3607
  }
3439
3608
  /** Async frozen snapshot decode (streaming zstd). */
3440
- async function decodeFrozenSnapshotAsync(buf) {
3609
+ async function decodeFrozenSnapshotAsync(buf, hints) {
3441
3610
  assertBufferLength(buf, 8);
3442
3611
  const version = buf.readUInt16LE(4);
3443
3612
  if (isMsv5Buffer(buf) && version === 5) {
3444
- return decodeFrozenSnapshotMsv5Async(buf);
3613
+ return decodeFrozenSnapshotMsv5Async(buf, hints);
3614
+ }
3615
+ return decodeFrozenSnapshot(buf, hints);
3616
+ }
3617
+
3618
+ const DEFAULT_CAPACITY = 16;
3619
/**
 * Growable unsigned 32-bit column (build scratch; narrowed to u16 at finalize when possible).
 *
 * Values are appended to a Uint32Array whose capacity doubles whenever it is
 * full. Only the first `length` cells hold pushed values.
 */
class GrowableUint32Column {
    /**
     * @param {number} [initialCapacity] number of cells to allocate up front;
     *   at least one cell is always allocated.
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint32Array(Math.max(1, initialCapacity));
    }
    /** Number of values pushed so far (not the allocated capacity). */
    get length() {
        return this._len;
    }
    /**
     * Appends one value, doubling the backing buffer first when it is full.
     * @param {number} value stored with Uint32Array conversion semantics.
     */
    push(value) {
        if (this._len >= this._buf.length) {
            // Math.max keeps growth working even if the buffer is empty.
            const grown = new Uint32Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = value;
    }
    /**
     * Copies `length` cells starting at `sourceOffset` into `target` at `targetOffset`.
     *
     * @param {number} sourceOffset first cell of this column to read.
     * @param {number} length number of cells to copy.
     * @param {Uint16Array|Uint32Array} target destination; narrowing to 16 bits
     *   is performed by the destination array's own element conversion.
     * @param {number} targetOffset first destination index to write.
     * @param {number} docIdWidth accepted for interface compatibility; the copy
     *   is the same for either width, so it is not consulted.
     */
    copyRangeInto(sourceOffset, length, target, targetOffset, docIdWidth) {
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = this._buf[sourceOffset + i];
        }
    }
    /**
     * Shrinks the column to `length` values and releases storage that is no longer needed.
     *
     * Truncating to zero swaps in a one-cell buffer so the previous backing
     * array can be garbage-collected; the column stays usable for later pushes.
     * A `length` larger than the current length is clamped: truncation never
     * exposes cells that were not pushed.
     *
     * @param {number} length number of leading values to keep.
     */
    truncate(length) {
        const keep = Math.max(0, Math.min(length, this._len));
        this._len = keep;
        if (keep === 0) {
            // Drop the backing store entirely instead of retaining the full scratch buffer.
            this._buf = new Uint32Array(1);
        }
        else if (keep < this._buf.length) {
            this._buf = this._buf.slice(0, keep);
        }
    }
}
3655
/**
 * Growable frequency column (u16 cells; matches frozen clamp range).
 *
 * Each pushed frequency goes through `clampFreq` before being stored in a
 * Uint16Array whose capacity doubles whenever it is full.
 */
class GrowableFreqColumn {
    /**
     * @param {number} [initialCapacity] number of cells to allocate up front;
     *   at least one cell is always allocated.
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint16Array(Math.max(1, initialCapacity));
    }
    /** Number of frequencies pushed so far (not the allocated capacity). */
    get length() {
        return this._len;
    }
    /**
     * Appends one frequency, doubling the backing buffer first when it is full.
     * @param {number} freq raw frequency; stored as `clampFreq(freq)`.
     */
    push(freq) {
        if (this._len >= this._buf.length) {
            // Math.max keeps growth working even if the buffer is empty.
            const grown = new Uint16Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = clampFreq(freq);
    }
    /**
     * Copies `length` cells starting at `sourceOffset` into `target` at `targetOffset`.
     *
     * @param {number} sourceOffset first cell of this column to read.
     * @param {number} length number of cells to copy.
     * @param target indexable destination that receives the values.
     * @param {number} targetOffset first destination index to write.
     */
    copyRangeInto(sourceOffset, length, target, targetOffset) {
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = this._buf[sourceOffset + i];
        }
    }
    /**
     * Shrinks the column to `length` values and releases storage that is no longer needed.
     *
     * Truncating to zero swaps in a one-cell buffer so the previous backing
     * array can be garbage-collected; the column stays usable for later pushes.
     * A `length` larger than the current length is clamped: truncation never
     * exposes cells that were not pushed.
     *
     * @param {number} length number of leading values to keep.
     */
    truncate(length) {
        const keep = Math.max(0, Math.min(length, this._len));
        this._len = keep;
        if (keep === 0) {
            // Drop the backing store entirely instead of retaining the full scratch buffer.
            this._buf = new Uint16Array(1);
        }
        else if (keep < this._buf.length) {
            this._buf = this._buf.slice(0, keep);
        }
    }
}
3684
+ /**
3685
+ * Single-pass postings accumulator for {@link FrozenIndexBuilder}.
3686
+ * One global TypedArray stream per docIds/freqs; per-slot range metadata only.
3687
+ */
3688
+ class IncrementalPostingsAccumulator {
3689
+ constructor(fieldCount, hints) {
3690
+ var _a;
3691
+ this._slots = new Map();
3692
+ this._totalPostings = 0;
3693
+ this._maxFreq = 0;
3694
+ this._fieldCount = fieldCount;
3695
+ const cap = Math.max(DEFAULT_CAPACITY, (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedTotalPostings) !== null && _a !== void 0 ? _a : 0);
3696
+ this._docIds = new GrowableUint32Column(cap);
3697
+ this._freqs = new GrowableFreqColumn(cap);
3698
+ }
3699
+ get totalPostings() {
3700
+ return this._totalPostings;
3701
+ }
3702
+ get maxFreq() {
3703
+ return this._maxFreq;
3704
+ }
3705
+ append(termIndex, fieldId, docId, freq) {
3706
+ const slot = termIndex * this._fieldCount + fieldId;
3707
+ const writeIdx = this._docIds.length;
3708
+ this._docIds.push(docId);
3709
+ const v = clampFreq(freq);
3710
+ this._freqs.push(v);
3711
+ if (v > this._maxFreq)
3712
+ this._maxFreq = v;
3713
+ this._totalPostings++;
3714
+ let ranges = this._slots.get(slot);
3715
+ if (ranges == null) {
3716
+ ranges = { starts: [writeIdx], lengths: [1] };
3717
+ this._slots.set(slot, ranges);
3718
+ return;
3719
+ }
3720
+ const last = ranges.starts.length - 1;
3721
+ const end = ranges.starts[last] + ranges.lengths[last];
3722
+ if (end === writeIdx) {
3723
+ ranges.lengths[last]++;
3724
+ }
3725
+ else {
3726
+ ranges.starts.push(writeIdx);
3727
+ ranges.lengths.push(1);
3728
+ }
3729
+ }
3730
+ clear() {
3731
+ this._slots.clear();
3732
+ // Drop global scratch backing so finalize does not retain duplicate posting bytes.
3733
+ this._docIds.truncate(0);
3734
+ this._freqs.truncate(0);
3735
+ }
3736
+ copySlot(ranges, allDocIds, allFreqs, write, docIdWidth) {
3737
+ for (let r = 0; r < ranges.starts.length; r++) {
3738
+ const start = ranges.starts[r];
3739
+ const len = ranges.lengths[r];
3740
+ this._docIds.copyRangeInto(start, len, allDocIds, write, docIdWidth);
3741
+ this._freqs.copyRangeInto(start, len, allFreqs, write);
3742
+ write += len;
3743
+ }
3744
+ return write;
3745
+ }
3746
+ slotLength(ranges) {
3747
+ let n = 0;
3748
+ for (let i = 0; i < ranges.lengths.length; i++)
3749
+ n += ranges.lengths[i];
3750
+ return n;
3751
+ }
3752
+ finalize(termCount, nextId) {
3753
+ const fieldCount = this._fieldCount;
3754
+ const totalPostings = this._totalPostings;
3755
+ const maxFreq = this._maxFreq;
3756
+ const slots = this._slots;
3757
+ const layout = choosePostingsLayout(fieldCount);
3758
+ const docIdWidth = nextId <= 65535 ? 16 : 32;
3759
+ const allDocIds = docIdWidth === 16
3760
+ ? new Uint16Array(totalPostings)
3761
+ : new Uint32Array(totalPostings);
3762
+ const allFreqs = allocateFreqs(totalPostings, maxFreq);
3763
+ if (layout === 'dense') {
3764
+ const slotCount = termCount * fieldCount;
3765
+ const denseOffsets = new Uint32Array(slotCount);
3766
+ const denseLengths = new Uint32Array(slotCount);
3767
+ let write = 0;
3768
+ for (let ti = 0; ti < termCount; ti++) {
3769
+ const base = ti * fieldCount;
3770
+ for (let f = 0; f < fieldCount; f++) {
3771
+ const slot = base + f;
3772
+ const ranges = slots.get(slot);
3773
+ const len = ranges == null ? 0 : this.slotLength(ranges);
3774
+ denseOffsets[slot] = write;
3775
+ denseLengths[slot] = len;
3776
+ if (len > 0) {
3777
+ write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
3778
+ slots.delete(slot);
3779
+ }
3780
+ }
3781
+ }
3782
+ slots.clear();
3783
+ this.clear();
3784
+ return {
3785
+ fieldCount,
3786
+ termCount,
3787
+ nextId,
3788
+ layout,
3789
+ docIdWidth,
3790
+ sparseFieldIdWidth: null,
3791
+ allDocIds,
3792
+ allFreqs,
3793
+ denseOffsets,
3794
+ denseLengths,
3795
+ sparseTermStarts: null,
3796
+ sparseFieldIds: null,
3797
+ sparseOffsets: null,
3798
+ sparseLengths: null,
3799
+ };
3800
+ }
3801
+ const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
3802
+ const sparseFieldIdsScratch = [];
3803
+ const sparseOffsets = [];
3804
+ const sparseLengths = [];
3805
+ const termStarts = new Array(termCount + 1).fill(0);
3806
+ let write = 0;
3807
+ for (let ti = 0; ti < termCount; ti++) {
3808
+ termStarts[ti] = sparseFieldIdsScratch.length;
3809
+ for (let f = 0; f < fieldCount; f++) {
3810
+ const slot = ti * fieldCount + f;
3811
+ const ranges = slots.get(slot);
3812
+ const len = ranges == null ? 0 : this.slotLength(ranges);
3813
+ if (len === 0)
3814
+ continue;
3815
+ sparseFieldIdsScratch.push(f);
3816
+ sparseOffsets.push(write);
3817
+ sparseLengths.push(len);
3818
+ write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
3819
+ slots.delete(slot);
3820
+ }
3821
+ termStarts[ti + 1] = sparseFieldIdsScratch.length;
3822
+ }
3823
+ slots.clear();
3824
+ this.clear();
3825
+ const sparseFieldIds = sparseFieldIdWidth === 16
3826
+ ? new Uint16Array(sparseFieldIdsScratch)
3827
+ : new Uint8Array(sparseFieldIdsScratch);
3828
+ return {
3829
+ fieldCount,
3830
+ termCount,
3831
+ nextId,
3832
+ layout,
3833
+ docIdWidth,
3834
+ sparseFieldIdWidth,
3835
+ allDocIds,
3836
+ allFreqs,
3837
+ denseOffsets: null,
3838
+ denseLengths: null,
3839
+ sparseTermStarts: new Uint32Array(termStarts),
3840
+ sparseFieldIds,
3841
+ sparseOffsets: new Uint32Array(sparseOffsets),
3842
+ sparseLengths: new Uint32Array(sparseLengths),
3843
+ };
3445
3844
  }
3446
- return decodeFrozenSnapshot(buf);
3447
3845
  }
3448
3846
 
3449
- function getOrCreateTermIndex(state, index, term) {
3847
+ function getOrCreateTermIndex(termCount, index, term) {
3450
3848
  const existing = index.get(term);
3451
3849
  if (existing != null)
3452
3850
  return existing;
3453
- const ti = state.terms.length;
3454
- state.terms.push(term);
3851
+ const ti = termCount.value;
3852
+ termCount.value++;
3455
3853
  index.set(term, ti);
3456
3854
  return ti;
3457
3855
  }
3458
- function appendPosting(state, termIndex, fieldId, docId, freq) {
3459
- const slot = termIndex * state.fieldCount + fieldId;
3460
- let docIds = state.postingsDocIds[slot];
3461
- if (docIds == null) {
3462
- docIds = [];
3463
- state.postingsDocIds[slot] = docIds;
3464
- state.postingsFreqs[slot] = [];
3465
- }
3466
- docIds.push(docId);
3467
- state.postingsFreqs[slot].push(freq);
3468
- const v = clampFreq(freq);
3469
- if (v > state.maxFreq)
3470
- state.maxFreq = v;
3471
- state.totalPostings++;
3472
- }
3473
- function finalizeFlatPostings(state, nextId) {
3474
- return materializeFrozenPostingsFromBuilder({
3475
- fieldCount: state.fieldCount,
3476
- termCount: state.terms.length,
3477
- postingsDocIds: state.postingsDocIds,
3478
- postingsFreqs: state.postingsFreqs,
3479
- totalPostings: state.totalPostings,
3480
- maxFreq: state.maxFreq,
3481
- }, nextId);
3482
- }
3483
3856
  /** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
3484
3857
  class FrozenIndexBuilder {
3485
3858
  constructor(options, hints) {
3859
+ var _a, _b;
3860
+ this._termCount = { value: 0 };
3861
+ this._fieldTermFreqScratch = new Map();
3862
+ this._tokenScratch = [];
3486
3863
  this._options = resolveIndexingOptions(options);
3487
3864
  this._fieldIds = buildFieldIds(this._options.fields);
3488
3865
  this._fieldCount = this._options.fields.length;
3489
3866
  this._index = new SearchableMap();
3490
- this._terms = [];
3491
- this._postingsDocIds = [];
3492
- this._postingsFreqs = [];
3867
+ const estimatedDocs = (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount) !== null && _a !== void 0 ? _a : 0;
3868
+ const perSlot = (_b = hints === null || hints === void 0 ? void 0 : hints.estimatedPostingsPerSlot) !== null && _b !== void 0 ? _b : 4;
3869
+ this._postings = new IncrementalPostingsAccumulator(this._fieldCount, {
3870
+ estimatedTotalPostings: estimatedDocs > 0 ? estimatedDocs * perSlot : undefined,
3871
+ });
3493
3872
  this._avgFieldLength = [];
3494
3873
  this._seenIds = new Set();
3495
3874
  this._nextId = 0;
3496
3875
  this._frozen = false;
3497
3876
  const estimated = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount;
3877
+ this._storedFields = createStoredFieldsLayout(this._options.storeFields, estimated !== null && estimated !== void 0 ? estimated : 0);
3498
3878
  if (estimated != null && estimated > 0) {
3499
3879
  this._externalIds = new Array(estimated);
3500
- this._storedFields = new Array(estimated);
3501
3880
  this._fieldLengthData = new Array(estimated * this._fieldCount).fill(0);
3502
3881
  }
3503
3882
  else {
3504
3883
  this._externalIds = [];
3505
- this._storedFields = [];
3506
3884
  this._fieldLengthData = [];
3507
3885
  }
3508
- this._postingsState = {
3509
- fieldCount: this._fieldCount,
3510
- terms: this._terms,
3511
- postingsDocIds: this._postingsDocIds,
3512
- postingsFreqs: this._postingsFreqs,
3513
- totalPostings: 0,
3514
- maxFreq: 0,
3515
- };
3516
3886
  }
3517
3887
  /** Number of documents indexed so far (not yet frozen). */
3518
3888
  get documentCount() {
@@ -3533,22 +3903,23 @@ class FrozenIndexBuilder {
3533
3903
  this._seenIds.add(id);
3534
3904
  const shortId = this._nextId++;
3535
3905
  this._externalIds[shortId] = id;
3536
- this._storedFields[shortId] = saveStoredFieldsForDocument(storeFields, extractField, document);
3906
+ writeStoredField(this._storedFields, shortId, storeFields, extractField, document);
3537
3907
  const documentCount = shortId + 1;
3538
3908
  for (const field of fields) {
3539
3909
  const fieldValue = extractField(document, field);
3540
3910
  if (fieldValue == null)
3541
3911
  continue;
3542
- const tokens = tokenize(stringifyField(fieldValue, field), field);
3912
+ const fieldText = typeof fieldValue === 'string'
3913
+ ? fieldValue
3914
+ : stringifyField(fieldValue, field);
3543
3915
  const fieldId = this._fieldIds[field];
3544
- const uniqueTerms = new Set(tokens).size;
3545
- const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
3916
+ const uniqueTerms = collectFieldTermFreqsFromFieldInto(this._fieldTermFreqScratch, this._tokenScratch, tokenize, fieldText, field, processTerm);
3546
3917
  this._fieldLengthData[shortId * this._fieldCount + fieldId] = uniqueTerms;
3547
3918
  updateAvgFieldLength(this._avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
3548
- for (const [term, freq] of localFreqs) {
3549
- const ti = getOrCreateTermIndex(this._postingsState, this._index, term);
3550
- appendPosting(this._postingsState, ti, fieldId, shortId, freq);
3551
- }
3919
+ this._fieldTermFreqScratch.forEach((freq, term) => {
3920
+ const ti = getOrCreateTermIndex(this._termCount, this._index, term);
3921
+ this._postings.append(ti, fieldId, shortId, freq);
3922
+ });
3552
3923
  }
3553
3924
  }
3554
3925
  /**
@@ -3605,7 +3976,11 @@ class FrozenIndexBuilder {
3605
3976
  }
3606
3977
  this._frozen = true;
3607
3978
  const documentCount = this._nextId;
3608
- const postings = finalizeFlatPostings(this._postingsState, documentCount);
3979
+ const termCount = this._termCount.value;
3980
+ const postings = this._postings.finalize(termCount, documentCount);
3981
+ const radixTree = this._index.radixTree;
3982
+ this._index = null;
3983
+ const index = fromRadixTree(radixTree, termCount);
3609
3984
  const avgFieldLength = new Float32Array(this._fieldCount);
3610
3985
  for (let f = 0; f < this._fieldCount; f++) {
3611
3986
  avgFieldLength[f] = (_a = this._avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
@@ -3614,12 +3989,8 @@ class FrozenIndexBuilder {
3614
3989
  const externalIds = this._externalIds.length > documentCount
3615
3990
  ? this._externalIds.slice(0, documentCount)
3616
3991
  : this._externalIds;
3617
- const storedFields = this._storedFields.length > documentCount
3618
- ? this._storedFields.slice(0, documentCount)
3619
- : this._storedFields;
3992
+ const storedFields = resizeStoredFields(this._storedFields, documentCount);
3620
3993
  const idLookup = createIdToShortIdLookup(externalIds, documentCount);
3621
- // Incremental builder: numeric radix leaves + build-time terms[] for postings.
3622
- // freezeFromMiniSearch packs Map leaves in one radix pass (no resident terms[]).
3623
3994
  return {
3624
3995
  options: this._options,
3625
3996
  documentCount,
@@ -3631,8 +4002,8 @@ class FrozenIndexBuilder {
3631
4002
  storedFields,
3632
4003
  fieldLengthMatrix: materializeFieldLengthMatrix(this._fieldLengthData, documentCount * this._fieldCount),
3633
4004
  avgFieldLength,
3634
- index: fromRadixTree(this._index.radixTree, this._terms.length),
3635
- termCount: this._terms.length,
4005
+ index,
4006
+ termCount,
3636
4007
  postings,
3637
4008
  };
3638
4009
  }
@@ -4046,7 +4417,7 @@ function shallowCopyJsSnapshotFields(params) {
4046
4417
  return {
4047
4418
  fieldIds: { ...params.fieldIds },
4048
4419
  options: shallowCopyOptions(params.options),
4049
- storedFields: params.storedFields.slice(),
4420
+ storedFields: cloneStoredFields(params.storedFields),
4050
4421
  };
4051
4422
  }
4052
4423
  /**
@@ -4131,7 +4502,7 @@ class FrozenMiniSearch {
4131
4502
  fieldIds: this._fieldIds,
4132
4503
  getFieldLength: (docId, fieldId) => this.getFieldLength(docId, fieldId),
4133
4504
  getExternalId: docId => this._externalIds[docId],
4134
- getStoredFields: docId => this._storedFields[docId],
4505
+ getStoredFields: docId => readStoredFields(this._storedFields, docId),
4135
4506
  };
4136
4507
  this._queryEngineParams = {
4137
4508
  fields: this._options.fields,
@@ -4143,7 +4514,7 @@ class FrozenMiniSearch {
4143
4514
  const id = this._externalIds[shortId];
4144
4515
  if (id === undefined)
4145
4516
  continue;
4146
- callback(shortId, id, this._storedFields[shortId]);
4517
+ callback(shortId, id, readStoredFields(this._storedFields, shortId));
4147
4518
  }
4148
4519
  }),
4149
4520
  aggregateContext: this._aggregateContext,
@@ -4154,11 +4525,7 @@ class FrozenMiniSearch {
4154
4525
  memoryBreakdown() {
4155
4526
  const termCount = this.termCount;
4156
4527
  const postingsStats = postingsTypedBytes(this._postings);
4157
- let storedJson = 0;
4158
- for (const row of this._storedFields) {
4159
- if (row != null)
4160
- storedJson += JSON.stringify(row).length;
4161
- }
4528
+ const storedJson = storedFieldsJsonBytes(this._storedFields);
4162
4529
  const radixEst = this._index.packedByteLength();
4163
4530
  const idMapBytes = this._idLookup.mode === 'lazy-map' ? this._idLookup.mapEntryCount * 32 : 0;
4164
4531
  const estimatedStructuredBytes = postingsStats.totalTypedBytes
@@ -4188,7 +4555,7 @@ class FrozenMiniSearch {
4188
4555
  },
4189
4556
  documents: {
4190
4557
  externalIdsSlots: this._externalIds.length,
4191
- storedFieldsSlots: this._storedFields.length,
4558
+ storedFieldsSlots: storedFieldsSlotCount(this._storedFields),
4192
4559
  idLookupMode: this._idLookup.mode,
4193
4560
  idToShortIdEntries: this._idLookup.mapEntryCount,
4194
4561
  fieldLengthMatrixBytes: this._fieldLengthMatrix.byteLength,
@@ -4203,10 +4570,10 @@ class FrozenMiniSearch {
4203
4570
  }
4204
4571
  getStoredFields(id) {
4205
4572
  const shortId = this._idLookup.get(id);
4206
- return shortId == null ? undefined : this._storedFields[shortId];
4573
+ return shortId == null ? undefined : readStoredFields(this._storedFields, shortId);
4207
4574
  }
4208
4575
  search(query, searchOptions = {}) {
4209
- return finalizeRawSearchResults(this.executeQuery(query, searchOptions), query, searchOptions, this._options.searchOptions, docId => this._externalIds[docId], docId => this._storedFields[docId]);
4576
+ return finalizeRawSearchResults(this.executeQuery(query, searchOptions), query, searchOptions, this._options.searchOptions, docId => this._externalIds[docId], docId => readStoredFields(this._storedFields, docId));
4210
4577
  }
4211
4578
  autoSuggest(queryString, options = {}) {
4212
4579
  const merged = { ...this._options.autoSuggestOptions, ...options };
@@ -4222,7 +4589,8 @@ class FrozenMiniSearch {
4222
4589
  fieldNames: fieldNamesFromFieldIds(this._fieldIds),
4223
4590
  avgFieldLength: this._avgFieldLength,
4224
4591
  externalIds: this._externalIds,
4225
- storedFields: this._storedFields,
4592
+ storedFields: new Array(this._nextId),
4593
+ storedFieldsLayout: this._storedFields,
4226
4594
  fieldLengthMatrix: fieldLengthMatrixForWire(this._fieldLengthMatrix),
4227
4595
  treeShape: [],
4228
4596
  postings: this._postings,
@@ -4238,7 +4606,8 @@ class FrozenMiniSearch {
4238
4606
  fieldNames: fieldNamesFromFieldIds(this._fieldIds),
4239
4607
  avgFieldLength: this._avgFieldLength,
4240
4608
  externalIds: this._externalIds,
4241
- storedFields: this._storedFields,
4609
+ storedFields: new Array(this._nextId),
4610
+ storedFieldsLayout: this._storedFields,
4242
4611
  fieldLengthMatrix: fieldLengthMatrixForWire(this._fieldLengthMatrix),
4243
4612
  treeShape: [],
4244
4613
  postings: this._postings,
@@ -4246,16 +4615,20 @@ class FrozenMiniSearch {
4246
4615
  }
4247
4616
  /** Load a frozen binary snapshot. */
4248
4617
  static loadBinarySync(buffer, options = {}) {
4249
- const snap = decodeFrozenSnapshot(buffer);
4618
+ var _a;
4619
+ const storeFields = (_a = options.storeFields) !== null && _a !== void 0 ? _a : defaultFrozenLoadOptions.storeFields;
4620
+ const snap = decodeFrozenSnapshot(buffer, { storeFields });
4250
4621
  return FrozenMiniSearch.fromBinarySnapshot(snap, options);
4251
4622
  }
4252
4623
  /** Load a frozen binary snapshot with streaming zstd decompression (bounded memory). */
4253
4624
  static async loadBinaryAsync(buffer, options = {}) {
4254
- const snap = await decodeFrozenSnapshotAsync(buffer);
4625
+ var _a;
4626
+ const storeFields = (_a = options.storeFields) !== null && _a !== void 0 ? _a : defaultFrozenLoadOptions.storeFields;
4627
+ const snap = await decodeFrozenSnapshotAsync(buffer, { storeFields });
4255
4628
  return FrozenMiniSearch.fromBinarySnapshot(snap, options);
4256
4629
  }
4257
4630
  static fromBinarySnapshot(snap, options) {
4258
- var _a, _b;
4631
+ var _a, _b, _c;
4259
4632
  const snapshotFields = (_a = snap.fieldNames) !== null && _a !== void 0 ? _a : fieldNamesFromFieldIds(snap.fieldIds);
4260
4633
  if (options.fields != null) {
4261
4634
  assertFieldsMatchSnapshot(options.fields, snap.fieldIds);
@@ -4283,7 +4656,7 @@ class FrozenMiniSearch {
4283
4656
  fieldCount: snap.fieldCount,
4284
4657
  externalIds: snap.externalIds,
4285
4658
  idLookup,
4286
- storedFields: snap.storedFields,
4659
+ storedFields: (_c = snap.storedFieldsLayout) !== null && _c !== void 0 ? _c : storedFieldsFromRows(snap.storedFields, opts.storeFields),
4287
4660
  fieldLengthMatrix: snap.fieldLengthMatrix,
4288
4661
  avgFieldLength: snap.avgFieldLength,
4289
4662
  index,
@@ -4341,6 +4714,7 @@ FrozenMiniSearch.wildcard = WILDCARD_QUERY;
4341
4714
  exports.AND = AND;
4342
4715
  exports.AND_NOT = AND_NOT;
4343
4716
  exports.FrozenIndexBuilder = FrozenIndexBuilder;
4717
+ exports.FrozenMiniSearch = FrozenMiniSearch;
4344
4718
  exports.OR = OR;
4345
4719
  exports.assembleFrozen = assembleFrozen;
4346
4720
  exports.buildFrozenFromDocuments = buildFrozenFromDocuments;