@yoch/frozenminisearch 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/README.md +41 -11
- package/dist/cjs/index.cjs +913 -539
- package/dist/cjs/index.require.cjs +1 -0
- package/dist/es/index.d.ts +24 -7
- package/dist/es/index.js +913 -540
- package/package.json +5 -2
package/dist/cjs/index.cjs
CHANGED
|
@@ -1911,111 +1911,6 @@ function materializeFrozenPostings(params) {
|
|
|
1911
1911
|
sparseLengths: new Uint32Array(sparseLengths),
|
|
1912
1912
|
};
|
|
1913
1913
|
}
|
|
1914
|
-
/** One-pass materialize from {@link FrozenIndexBuilder} scratch (counts known upfront). */
|
|
1915
|
-
function materializeFrozenPostingsFromBuilder(state, nextId) {
|
|
1916
|
-
var _a;
|
|
1917
|
-
const { fieldCount, termCount, postingsDocIds, postingsFreqs, totalPostings, maxFreq } = state;
|
|
1918
|
-
const layout = choosePostingsLayout(fieldCount);
|
|
1919
|
-
const docIdWidth = nextId <= 65535 ? 16 : 32;
|
|
1920
|
-
const allDocIds = docIdWidth === 16
|
|
1921
|
-
? new Uint16Array(totalPostings)
|
|
1922
|
-
: new Uint32Array(totalPostings);
|
|
1923
|
-
const allFreqs = allocateFreqs(totalPostings, maxFreq);
|
|
1924
|
-
if (layout === 'dense') {
|
|
1925
|
-
const slotCount = termCount * fieldCount;
|
|
1926
|
-
const denseOffsets = new Uint32Array(slotCount);
|
|
1927
|
-
const denseLengths = new Uint32Array(slotCount);
|
|
1928
|
-
let write = 0;
|
|
1929
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1930
|
-
const base = ti * fieldCount;
|
|
1931
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1932
|
-
const slot = base + f;
|
|
1933
|
-
const docIds = postingsDocIds[slot];
|
|
1934
|
-
const freqs = postingsFreqs[slot];
|
|
1935
|
-
const len = (_a = docIds === null || docIds === void 0 ? void 0 : docIds.length) !== null && _a !== void 0 ? _a : 0;
|
|
1936
|
-
denseOffsets[slot] = write;
|
|
1937
|
-
denseLengths[slot] = len;
|
|
1938
|
-
for (let i = 0; i < len; i++) {
|
|
1939
|
-
const docId = docIds[i];
|
|
1940
|
-
if (docIdWidth === 16) {
|
|
1941
|
-
allDocIds[write] = docId;
|
|
1942
|
-
}
|
|
1943
|
-
else {
|
|
1944
|
-
allDocIds[write] = docId;
|
|
1945
|
-
}
|
|
1946
|
-
allFreqs[write] = freqs[i];
|
|
1947
|
-
write++;
|
|
1948
|
-
}
|
|
1949
|
-
}
|
|
1950
|
-
}
|
|
1951
|
-
return {
|
|
1952
|
-
fieldCount,
|
|
1953
|
-
termCount,
|
|
1954
|
-
nextId,
|
|
1955
|
-
layout,
|
|
1956
|
-
docIdWidth,
|
|
1957
|
-
sparseFieldIdWidth: null,
|
|
1958
|
-
allDocIds,
|
|
1959
|
-
allFreqs,
|
|
1960
|
-
denseOffsets,
|
|
1961
|
-
denseLengths,
|
|
1962
|
-
sparseTermStarts: null,
|
|
1963
|
-
sparseFieldIds: null,
|
|
1964
|
-
sparseOffsets: null,
|
|
1965
|
-
sparseLengths: null,
|
|
1966
|
-
};
|
|
1967
|
-
}
|
|
1968
|
-
const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
|
|
1969
|
-
const sparseFieldIdsScratch = [];
|
|
1970
|
-
const sparseOffsets = [];
|
|
1971
|
-
const sparseLengths = [];
|
|
1972
|
-
const termStarts = new Array(termCount + 1).fill(0);
|
|
1973
|
-
let write = 0;
|
|
1974
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1975
|
-
termStarts[ti] = sparseFieldIdsScratch.length;
|
|
1976
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1977
|
-
const slot = ti * fieldCount + f;
|
|
1978
|
-
const docIds = postingsDocIds[slot];
|
|
1979
|
-
if (docIds == null || docIds.length === 0)
|
|
1980
|
-
continue;
|
|
1981
|
-
const freqs = postingsFreqs[slot];
|
|
1982
|
-
sparseFieldIdsScratch.push(f);
|
|
1983
|
-
sparseOffsets.push(write);
|
|
1984
|
-
sparseLengths.push(docIds.length);
|
|
1985
|
-
for (let i = 0; i < docIds.length; i++) {
|
|
1986
|
-
const docId = docIds[i];
|
|
1987
|
-
if (docIdWidth === 16) {
|
|
1988
|
-
allDocIds[write] = docId;
|
|
1989
|
-
}
|
|
1990
|
-
else {
|
|
1991
|
-
allDocIds[write] = docId;
|
|
1992
|
-
}
|
|
1993
|
-
allFreqs[write] = freqs[i];
|
|
1994
|
-
write++;
|
|
1995
|
-
}
|
|
1996
|
-
}
|
|
1997
|
-
termStarts[ti + 1] = sparseFieldIdsScratch.length;
|
|
1998
|
-
}
|
|
1999
|
-
const sparseFieldIds = sparseFieldIdWidth === 16
|
|
2000
|
-
? new Uint16Array(sparseFieldIdsScratch)
|
|
2001
|
-
: new Uint8Array(sparseFieldIdsScratch);
|
|
2002
|
-
return {
|
|
2003
|
-
fieldCount,
|
|
2004
|
-
termCount,
|
|
2005
|
-
nextId,
|
|
2006
|
-
layout,
|
|
2007
|
-
docIdWidth,
|
|
2008
|
-
sparseFieldIdWidth,
|
|
2009
|
-
allDocIds,
|
|
2010
|
-
allFreqs,
|
|
2011
|
-
denseOffsets: null,
|
|
2012
|
-
denseLengths: null,
|
|
2013
|
-
sparseTermStarts: new Uint32Array(termStarts),
|
|
2014
|
-
sparseFieldIds,
|
|
2015
|
-
sparseOffsets: new Uint32Array(sparseOffsets),
|
|
2016
|
-
sparseLengths: new Uint32Array(sparseLengths),
|
|
2017
|
-
};
|
|
2018
|
-
}
|
|
2019
1914
|
function postingsTypedBytes(layout) {
|
|
2020
1915
|
const allDocIdsBytes = layout.allDocIds.byteLength;
|
|
2021
1916
|
const allFreqsBytes = layout.allFreqs.byteLength;
|
|
@@ -2125,24 +2020,33 @@ function findSparseSlotByFieldId(fieldIds, start, end, fieldId) {
|
|
|
2125
2020
|
}
|
|
2126
2021
|
return -1;
|
|
2127
2022
|
}
|
|
2128
|
-
/**
|
|
2129
|
-
|
|
2023
|
+
/**
 * Reusable scratch for {@link resolvePostingSlice} (scoring is synchronous).
 * One shared object: a successful resolve overwrites `offset` and `length`,
 * so the values must be consumed before the next slot is resolved.
 */
const postingSliceScratch = { offset: 0, length: 0 };
|
|
2025
|
+
/**
 * Resolve one (termIndex, fieldId) posting run in flat buffers; writes into `out` without allocating.
 *
 * @param layout    Frozen postings layout; `layout.layout === 'dense'` selects the
 *                  dense offset/length tables, anything else uses the sparse tables.
 * @param termIndex Index of the term whose postings are wanted.
 * @param fieldId   Numeric field id (may be `undefined` for an unknown field name).
 * @param out       Receives `offset` and `length` on success; left untouched on failure.
 * @returns false when the slot is empty or missing
 */
function resolvePostingSlice(layout, termIndex, fieldId, out) {
    if (layout.layout === 'dense') {
        // An unknown field (undefined id) or an id outside [0, fieldCount) must be a
        // miss: otherwise the computed slot is NaN or aliases a neighbouring term.
        if (!(fieldId >= 0 && fieldId < layout.fieldCount))
            return false;
        const base = termIndex * layout.fieldCount + fieldId;
        const len = layout.denseLengths[base];
        // `!(len > 0)` rejects 0 and also the `undefined` read from an out-of-range slot.
        if (!(len > 0))
            return false;
        out.offset = layout.denseOffsets[base];
        out.length = len;
        return true;
    }
    const start = layout.sparseTermStarts[termIndex];
    const end = layout.sparseTermStarts[termIndex + 1];
    const slot = findSparseSlotByFieldId(layout.sparseFieldIds, start, end, fieldId);
    if (slot < 0)
        return false;
    const len = layout.sparseLengths[slot];
    if (!(len > 0))
        return false;
    out.offset = layout.sparseOffsets[slot];
    out.length = len;
    return true;
}
|
|
2147
2051
|
/**
|
|
2148
2052
|
* One flyweight wrapper for the lifetime of a frozen index. Call {@link bind} before each
|
|
@@ -2158,10 +2062,9 @@ function createFrozenFieldTermFlyweight(layout) {
|
|
|
2158
2062
|
return flyweight;
|
|
2159
2063
|
},
|
|
2160
2064
|
get(fieldId) {
|
|
2161
|
-
|
|
2162
|
-
if (slice == null)
|
|
2065
|
+
if (!resolvePostingSlice(layout, termIndex, fieldId, postingSliceScratch))
|
|
2163
2066
|
return undefined;
|
|
2164
|
-
return segment.rebind(
|
|
2067
|
+
return segment.rebind(postingSliceScratch.offset, postingSliceScratch.length);
|
|
2165
2068
|
},
|
|
2166
2069
|
};
|
|
2167
2070
|
return flyweight;
|
|
@@ -2180,10 +2083,9 @@ function collectDocIdsFromFrozenSegment(allDocIds, offset, length, context, docI
|
|
|
2180
2083
|
/**
 * For every field named in `fieldBoosts.names`, resolve the posting run of
 * `termIndex` in that field and hand it to {@link collectDocIdsFromFrozenSegment}.
 * Fields whose slot does not resolve are skipped.
 */
function collectDocIdsFromFrozenLayout(layout, termIndex, fieldBoosts, context, docIds, allowedDocs) {
    const fieldIdByName = context.fieldIds;
    const slice = postingSliceScratch;
    for (const fieldName of fieldBoosts.names) {
        const resolved = resolvePostingSlice(layout, termIndex, fieldIdByName[fieldName], slice);
        if (resolved) {
            collectDocIdsFromFrozenSegment(layout.allDocIds, slice.offset, slice.length, context, docIds, allowedDocs);
        }
    }
}
|
|
2189
2091
|
|
|
@@ -2225,45 +2127,499 @@ function resolveIndexingOptions(options) {
|
|
|
2225
2127
|
autoSuggestOptions: { ...defaultAutoSuggestOptions, ...(options.autoSuggestOptions || {}) },
|
|
2226
2128
|
};
|
|
2227
2129
|
}
|
|
2228
|
-
function buildFieldIds(fields) {
|
|
2229
|
-
const fieldIds = {};
|
|
2230
|
-
for (let i = 0; i < fields.length; i++) {
|
|
2231
|
-
fieldIds[fields[i]] = i;
|
|
2130
|
+
/**
 * Map each field name to its position in `fields`.
 *
 * @param fields Ordered list of field names.
 * @returns Plain object `{ [fieldName]: index }`; a repeated name keeps its last index.
 */
function buildFieldIds(fields) {
    const idByName = {};
    for (const [position, fieldName] of fields.entries()) {
        idByName[fieldName] = position;
    }
    return idByName;
}
|
|
2137
|
+
/**
 * Add the output of a term processor to a frequency map.
 *
 * @param localFreqs    Map of term -> occurrence count, updated in place.
 * @param processedTerm Either an array of terms (each one is counted) or a single
 *                      term; a falsy single term is ignored.
 */
function accumulateProcessedTerm(localFreqs, processedTerm) {
    if (!Array.isArray(processedTerm)) {
        if (processedTerm) {
            const seen = localFreqs.get(processedTerm);
            localFreqs.set(processedTerm, seen ? seen + 1 : 1);
        }
        return;
    }
    for (let i = 0; i < processedTerm.length; i++) {
        const term = processedTerm[i];
        const seen = localFreqs.get(term);
        localFreqs.set(term, seen ? seen + 1 : 1);
    }
}
|
|
2147
|
+
/**
 * Count processed-term frequencies for one document field.
 * `localFreqs` is cleared first, then every token is run through `processTerm`
 * and the result is tallied.
 *
 * @returns Number of distinct processed terms now held in `localFreqs`.
 */
function collectFieldTermFreqsInto(localFreqs, tokens, fieldName, processTerm) {
    localFreqs.clear();
    for (const rawToken of tokens) {
        const processed = processTerm(rawToken, fieldName);
        accumulateProcessedTerm(localFreqs, processed);
    }
    const distinctTerms = localFreqs.size;
    return distinctTerms;
}
|
|
2158
|
+
/** Global delimiter pattern for incremental `exec` (must not reuse {@link SPACE_OR_PUNCTUATION} — no `g` flag). */
const DEFAULT_TOKENIZE_DELIMITERS = /[\n\r\p{Z}\p{P}]+/gu;
/** Fixed text that isDefaultTokenize feeds to a candidate tokenizer. */
const defaultTokenizeProbe = 'a b';
/** Field name passed alongside the probe text. */
const defaultTokenizeProbeField = 'f';
/** Per-tokenizer memo of the probe verdict, keyed by the tokenizer function. */
const tokenizeBehaviorCache = new WeakMap();
|
|
2163
|
+
/**
 * Decide whether `tokenize` may be treated as the library default tokenizer.
 *
 * True for the default function itself, or for any function whose output on a
 * fixed probe equals splitting that probe on {@link SPACE_OR_PUNCTUATION}. The
 * verdict is memoized per function. A custom tokenizer that agrees on the probe
 * but differs on other inputs (e.g. leading delimiters) is still reported as
 * default — pass the default reference in production.
 */
function isDefaultTokenize(tokenize) {
    if (tokenize === defaultFrozenLoadOptions.tokenize) {
        return true;
    }
    const memo = tokenizeBehaviorCache.get(tokenize);
    if (memo != null) {
        return memo;
    }
    const expected = defaultTokenizeProbe.split(SPACE_OR_PUNCTUATION);
    const actual = tokenize(defaultTokenizeProbe, defaultTokenizeProbeField);
    let matches = expected.length === actual.length;
    for (let i = 0; matches && i < expected.length; i++) {
        matches = expected[i] === actual[i];
    }
    tokenizeBehaviorCache.set(tokenize, matches);
    return matches;
}
|
|
2181
|
+
/**
 * Stream the pieces of `text` between delimiter runs to `onToken`, in order,
 * without building an intermediate array.
 *
 * The emitted sequence is the one `String.prototype.split` would produce for the
 * same delimiter pattern: a leading or trailing delimiter run yields an empty
 * string at that end, and empty text yields a single empty string.
 *
 * @param text    Text to tokenize.
 * @param onToken Called once per token; its return value is ignored.
 */
function forEachDefaultToken(text, onToken) {
    const re = DEFAULT_TOKENIZE_DELIMITERS;
    let start = 0;
    for (;;) {
        // The regex is shared and stateful (`g` flag). Re-seed `lastIndex` on every
        // step so a callback that tokenizes other text cannot derail this scan.
        re.lastIndex = start;
        const match = re.exec(text);
        if (match === null) {
            break;
        }
        // Empty slice when the text starts with a delimiter run.
        onToken(text.slice(start, match.index));
        start = match.index + match[0].length;
    }
    // Tail after the last delimiter; empty for empty text or a trailing delimiter.
    onToken(text.slice(start));
}
|
|
2209
|
+
/**
 * Run the default tokenizer and collect its tokens in `out`, which is emptied
 * first so the same array can be reused across calls (no `text.split()` array).
 */
function tokenizeDefaultInto(out, text) {
    out.length = 0;
    const append = (token) => {
        out.push(token);
    };
    forEachDefaultToken(text, append);
}
|
|
2214
|
+
/**
 * Tokenize field text into `out` (reused). Fast path when `tokenize` is the library default.
 *
 * @param out       Array that receives the tokens; previous contents are discarded.
 * @param tokenize  Tokenizer `(text, fieldName) => iterable of tokens`.
 * @param text      Field text.
 * @param fieldName Field name forwarded to a custom tokenizer.
 */
function tokenizeFieldInto(out, tokenize, text, fieldName) {
    if (isDefaultTokenize(tokenize)) {
        tokenizeDefaultInto(out, text);
        return;
    }
    const tokens = tokenize(text, fieldName);
    out.length = 0;
    // Append one by one: `out.push(...tokens)` passes every token as a call
    // argument and throws RangeError once a field has too many tokens.
    for (const token of tokens) {
        out.push(token);
    }
}
|
|
2224
|
+
/**
 * Tokenize `text` with the default tokenizer and tally processed-term
 * frequencies in a single pass. `localFreqs` is cleared first.
 *
 * @returns Number of distinct processed terms now held in `localFreqs`.
 */
function collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm) {
    localFreqs.clear();
    const tally = (token) => {
        accumulateProcessedTerm(localFreqs, processTerm(token, fieldName));
    };
    forEachDefaultToken(text, tally);
    return localFreqs.size;
}
|
|
2231
|
+
/**
 * Tally the term frequencies of one field into `localFreqs`.
 *
 * A custom tokenizer goes through two phases: tokens are first written to
 * `tokenScratch`, then counted. The default tokenizer is counted in one pass
 * and leaves `tokenScratch` untouched.
 *
 * @returns Number of distinct processed terms.
 */
function collectFieldTermFreqsFromFieldInto(localFreqs, tokenScratch, tokenize, text, fieldName, processTerm) {
    if (!isDefaultTokenize(tokenize)) {
        tokenizeFieldInto(tokenScratch, tokenize, text, fieldName);
        return collectFieldTermFreqsInto(localFreqs, tokenScratch, fieldName, processTerm);
    }
    return collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm);
}
|
|
2242
|
+
/**
 * Fold one more field length into a running average, in place.
 *
 * @param avgFieldLength Averages indexed by field id; a missing entry counts as 0.
 * @param fieldId        Field whose average is updated.
 * @param count          Number of lengths already included in the average.
 * @param length         The new length to include.
 */
function updateAvgFieldLength(avgFieldLength, fieldId, count, length) {
    const previousAverage = avgFieldLength[fieldId] || 0;
    avgFieldLength[fieldId] = ((previousAverage * count) + length) / (count + 1);
}
|
|
2247
|
+
|
|
2248
|
+
/**
 * Recursively check a serialized tree shape: every node is an array of
 * `[key, value]` pairs, a pair keyed by LEAF carries an integer term index in
 * `[0, termCount)`, and any other pair carries a child node.
 *
 * @throws the error built by `invalidFrozenIndex` on the first violation.
 */
function validateTreeShape(shape, termCount) {
    if (!Array.isArray(shape)) {
        throw invalidFrozenIndex('treeShape node must be an array');
    }
    for (const entry of shape) {
        const isPair = Array.isArray(entry) && entry.length === 2;
        if (!isPair) {
            throw invalidFrozenIndex('treeShape entry must be a [key, value] pair');
        }
        const key = entry[0];
        const value = entry[1];
        if (key !== LEAF) {
            validateTreeShape(value, termCount);
            continue;
        }
        const inRange = Number.isInteger(value) && value >= 0 && value < termCount;
        if (!inRange) {
            throw invalidFrozenIndex(`treeShape leaf term index out of range: ${value}`);
        }
    }
}
|
|
2268
|
+
/** Number of terms in a snapshot, read from its postings layout. */
function termCountOf(snap) {
    const { termCount } = snap.postings;
    return termCount;
}
|
|
2271
|
+
/**
 * Numeric/structural invariants shared by both the decode path (untrusted binary)
 * and the build path (trusted internal code).
 *
 * Checks, in order: positive field count, `nextId` range, `documentCount` against
 * `nextId`, sizes of `fieldLengthMatrix` and `avgFieldLength`, the postings
 * layout, and that `fieldIds` assigns every id in `[0, fieldCount)`.
 *
 * @throws the error built by `invalidFrozenIndex` on the first violation.
 */
function validateFrozenSnapshotNumeric(snap) {
    const { fieldCount, nextId, documentCount } = snap;
    if (fieldCount <= 0) {
        throw invalidFrozenIndex('fieldCount must be positive');
    }
    if (nextId < 0 || nextId >= 0xffffffff) {
        throw invalidFrozenIndex('nextId out of range');
    }
    if (documentCount < 0 || documentCount > nextId) {
        throw invalidFrozenIndex('documentCount inconsistent with nextId');
    }
    if (snap.fieldLengthMatrix.length !== nextId * fieldCount) {
        throw invalidFrozenIndex('fieldLengthMatrix size mismatch');
    }
    if (snap.avgFieldLength.length !== fieldCount) {
        throw invalidFrozenIndex('avgFieldLength size mismatch');
    }
    const failWith = (detail) => {
        throw invalidFrozenIndex(detail);
    };
    validateFrozenPostingsLayout(snap.postings, documentCount, nextId, failWith);
    const fieldIds = snap.fieldIds;
    const names = Object.keys(fieldIds);
    if (names.length !== fieldCount) {
        throw invalidFrozenIndex('fieldIds count mismatch');
    }
    for (let id = 0; id < fieldCount; id++) {
        let assigned = false;
        for (const name of names) {
            if (fieldIds[name] === id) {
                assigned = true;
                break;
            }
        }
        if (!assigned) {
            throw invalidFrozenIndex(`missing field id ${id}`);
        }
    }
}
|
|
2305
|
+
/**
 * Decode `fieldCount` length-prefixed UTF-8 names starting at `fieldNamesOff`.
 *
 * @param externalIdsOff Offset at which the names must end exactly.
 * @returns The decoded names, in stored order.
 * @throws the error built by `invalidFrozenIndex` when the section size does not match.
 */
function readFieldNamesSection(buf, fieldNamesOff, fieldCount, externalIdsOff) {
    const names = [];
    let cursor = fieldNamesOff;
    while (names.length < fieldCount) {
        const decoded = readLengthPrefixedUtf8(buf, cursor);
        names.push(decoded.value);
        cursor = decoded.next;
    }
    if (cursor !== externalIdsOff) {
        throw invalidFrozenIndex('field names section size mismatch');
    }
    return names;
}
|
|
2318
|
+
/**
 * Decode `nextId` external document ids starting at `externalIdsOff`.
 *
 * @param storedOff Offset at which the ids must end exactly.
 * @returns Array of length `nextId` with one decoded id per slot.
 * @throws the error built by `invalidFrozenIndex` when the section size does not match.
 */
function readExternalIdsSection(buf, externalIdsOff, nextId, storedOff) {
    const ids = new Array(nextId);
    let cursor = externalIdsOff;
    for (let slot = 0; slot < nextId; slot++) {
        const decoded = readExternalId(buf, cursor);
        ids[slot] = decoded.value;
        cursor = decoded.next;
    }
    if (cursor !== storedOff) {
        throw invalidFrozenIndex('external ids section size mismatch');
    }
    return ids;
}
|
|
2331
|
+
/**
 * Decode the stored-fields section into one row per document.
 *
 * The section starts with a table of `nextId` little-endian uint32 entries. An
 * entry of 0 means "no row"; otherwise `entry - 1` is the offset, measured from
 * the end of the table, of a uint32 length followed by that many bytes of UTF-8 JSON.
 *
 * @returns Array of length `nextId`; slots without a row hold `undefined`.
 * @throws the error built by `invalidFrozenIndex` when anything reaches past `sectionEnd`.
 */
function readStoredFieldsSection(buf, storedOff, nextId, sectionEnd) {
    const rows = new Array(nextId);
    const heapBase = storedOff + nextId * 4;
    if (heapBase > sectionEnd) {
        throw invalidFrozenIndex('stored fields table out of bounds');
    }
    for (let docIndex = 0; docIndex < nextId; docIndex++) {
        const rel = buf.readUInt32LE(storedOff + docIndex * 4);
        if (rel !== 0) {
            const lengthAt = heapBase + rel - 1;
            if (lengthAt + 4 > sectionEnd) {
                throw invalidFrozenIndex('stored fields entry offset out of bounds');
            }
            const jsonStart = lengthAt + 4;
            const jsonEnd = jsonStart + buf.readUInt32LE(lengthAt);
            if (jsonEnd > sectionEnd) {
                throw invalidFrozenIndex('stored fields JSON out of bounds');
            }
            rows[docIndex] = JSON.parse(buf.toString('utf8', jsonStart, jsonEnd));
        }
        else {
            rows[docIndex] = undefined;
        }
    }
    return rows;
}
|
|
2357
|
+
/**
 * Validate structural invariants of a decoded or assembled frozen snapshot.
 * Runs the numeric checks, then validates whichever term index representation is
 * present: `packedTermIndex` first, else `termTree`, else `treeShape`.
 */
function validateFrozenSnapshot(snap) {
    validateFrozenSnapshotNumeric(snap);
    const termCount = termCountOf(snap);
    if (snap.packedTermIndex != null) {
        validateFrozenTermIndexLeaves(snap.packedTermIndex, termCount);
        return;
    }
    if (snap.termTree != null) {
        validateTermTreeLeaves(snap.termTree, termCount);
        return;
    }
    validateTreeShape(snap.treeShape, termCount);
}
|
|
2371
|
+
/**
 * List the field names of a `{ name: id }` map, ordered by ascending id.
 */
function fieldNamesFromFieldIds(fieldIds) {
    return Object.keys(fieldIds).sort((left, right) => fieldIds[left] - fieldIds[right]);
}
|
|
2376
|
+
/**
 * Core with explicit {@link termCountOf} (no dictionary section).
 * Encodes four little-endian uint32 values, in order: documentCount, nextId,
 * fieldCount, termCount.
 *
 * @returns A 16-byte Buffer.
 */
function buildCoreSectionWithTermCount(snap) {
    const core = Buffer.alloc(16);
    let offset = 0;
    offset = core.writeUInt32LE(snap.documentCount, offset);
    offset = core.writeUInt32LE(snap.nextId, offset);
    offset = core.writeUInt32LE(snap.fieldCount, offset);
    core.writeUInt32LE(termCountOf(snap), offset);
    return core;
}
|
|
2385
|
+
/**
 * Encode field names as a sequence of entries, each a little-endian uint32 byte
 * length followed by the UTF-8 bytes of the name.
 */
function buildFieldNamesSection(fieldNames) {
    const parts = [];
    for (const fieldName of fieldNames) {
        const utf8 = Buffer.from(fieldName, 'utf8');
        const lengthPrefix = Buffer.alloc(4);
        lengthPrefix.writeUInt32LE(utf8.length, 0);
        parts.push(lengthPrefix);
        parts.push(utf8);
    }
    return Buffer.concat(parts);
}
|
|
2395
|
+
/**
 * Encode the first `nextId` external ids, in slot order, by passing each one to
 * `writeExternalId` and concatenating the chunks it collects.
 */
function buildExternalIdsSection(externalIds, nextId) {
    const parts = [];
    let slot = 0;
    while (slot < nextId) {
        writeExternalId(parts, externalIds[slot]);
        slot++;
    }
    return Buffer.concat(parts);
}
|
|
2402
|
+
/**
 * Encode per-document rows as a stored-fields section.
 *
 * Layout: a table of `nextId` little-endian uint32 entries followed by a heap.
 * A `null`/`undefined` row gets table entry 0; any other row gets
 * `heapOffset + 1`, and the heap receives a uint32 length plus the row's UTF-8 JSON.
 */
function buildStoredFieldsSection(storedFields, nextId) {
    const offsetTable = Buffer.alloc(nextId * 4);
    const parts = [offsetTable];
    let heapBytes = 0;
    for (let docIndex = 0; docIndex < nextId; docIndex++) {
        const row = storedFields[docIndex];
        const tableSlot = docIndex * 4;
        if (row == null) {
            offsetTable.writeUInt32LE(0, tableSlot);
            continue;
        }
        offsetTable.writeUInt32LE(heapBytes + 1, tableSlot);
        const payload = Buffer.from(JSON.stringify(row), 'utf8');
        const lengthPrefix = Buffer.alloc(4);
        lengthPrefix.writeUInt32LE(payload.length, 0);
        parts.push(lengthPrefix, payload);
        heapBytes += 4 + payload.length;
    }
    return Buffer.concat(parts);
}
|
|
2422
|
+
/**
 * Recursively check that every LEAF entry of a term tree holds an integer term
 * index in `[0, termCount)`. Entries under any other key are treated as child trees.
 *
 * @throws the error built by `invalidFrozenIndex` on the first bad leaf.
 */
function validateTermTreeLeaves(tree, termCount) {
    for (const [key, val] of tree) {
        if (key !== LEAF) {
            validateTermTreeLeaves(val, termCount);
            continue;
        }
        const inRange = Number.isInteger(val) && val >= 0 && val < termCount;
        if (!inRange) {
            throw invalidFrozenIndex(`term tree leaf index out of range: ${val}`);
        }
    }
}
|
|
2435
|
+
/**
 * Rebuild a nested Map from a serialized shape of `[key, value]` pairs.
 * A LEAF pair keeps its value as is; every other value is a child shape and is
 * rebuilt recursively.
 */
function deserializeTermIndexTree(shape) {
    const node = new Map();
    for (const [key, value] of shape) {
        const stored = key === LEAF ? value : deserializeTermIndexTree(value);
        node.set(key, stored);
    }
    return node;
}
|
|
2447
|
+
|
|
2448
|
+
/**
 * Runtime stored fields. Single store field → one column (no per-doc Record at rest).
 * Wire format stays row JSON; encode/decode can skip intermediate row arrays when layout is known.
 *
 * @param storeFields Names of the fields to store.
 * @param capacity    Initial length of the backing array (default 0).
 * @returns `{ kind: 'none' }`, `{ kind: 'single', field, values }` or `{ kind: 'multi', rows }`.
 */
function createStoredFieldsLayout(storeFields, capacity = 0) {
    switch (storeFields.length) {
        case 0:
            return { kind: 'none' };
        case 1:
            return { kind: 'single', field: storeFields[0], values: new Array(capacity) };
        default:
            return { kind: 'multi', rows: new Array(capacity) };
    }
}
|
|
2460
|
+
/**
 * Store the fields of one document at slot `shortId`.
 *
 * - `none`: nothing is stored.
 * - `single`: stores the extracted value of the layout's field.
 * - otherwise: stores a row object holding every store field whose extracted
 *   value is not `undefined`.
 */
function writeStoredField(layout, shortId, storeFields, extractField, document) {
    switch (layout.kind) {
        case 'none':
            return;
        case 'single':
            layout.values[shortId] = extractField(document, layout.field);
            return;
        default: {
            const row = {};
            for (const fieldName of storeFields) {
                const extracted = extractField(document, fieldName);
                if (extracted !== undefined) {
                    row[fieldName] = extracted;
                }
            }
            layout.rows[shortId] = row;
        }
    }
}
|
|
2235
|
-
/**
|
|
2236
|
-
function
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2475
|
+
/**
 * Materialize API/wire row for one document.
 *
 * @returns `undefined` for a `none` layout, the stored row for `multi`, and for
 *          `single` either `{ [field]: value }` or `{}` when no value is stored.
 */
function readStoredFields(layout, shortId) {
    switch (layout.kind) {
        case 'none':
            return undefined;
        case 'multi':
            return layout.rows[shortId];
        default: {
            const stored = layout.values[shortId];
            return stored === undefined ? {} : { [layout.field]: stored };
        }
    }
}
|
|
2486
|
+
/**
 * Truncate a layout to at most `length` slots.
 *
 * @returns The same layout object when it is already short enough (or is
 *          `none`); otherwise a new layout whose backing array is a truncated copy.
 */
function resizeStoredFields(layout, length) {
    if (layout.kind === 'none') {
        return layout;
    }
    if (layout.kind === 'single') {
        if (layout.values.length <= length) {
            return layout;
        }
        return { kind: 'single', field: layout.field, values: layout.values.slice(0, length) };
    }
    if (layout.rows.length <= length) {
        return layout;
    }
    return { kind: 'multi', rows: layout.rows.slice(0, length) };
}
|
|
2498
|
+
/**
 * Copy a layout so its backing array can be changed independently. The array is
 * copied shallowly (stored values and rows are shared); a `none` layout is
 * returned as is.
 */
function cloneStoredFields(layout) {
    switch (layout.kind) {
        case 'none':
            return layout;
        case 'single':
            return { kind: 'single', field: layout.field, values: layout.values.slice() };
        default:
            return { kind: 'multi', rows: layout.rows.slice() };
    }
}
|
|
2506
|
+
/**
 * Import from wire rows or lucaong snapshot. Empty storeFields + non-empty rows → multi (binary load without options).
 *
 * - No store fields: `multi` (sharing `rows`) if any row has at least one key, else `none`.
 * - One store field: `single`, keeping only that field's value from each row.
 * - Several store fields: `multi`, sharing `rows`.
 */
function storedFieldsFromRows(rows, storeFields) {
    switch (storeFields.length) {
        case 0: {
            const anyStored = rows.some((row) => row != null && Object.keys(row).length > 0);
            return anyStored ? { kind: 'multi', rows } : { kind: 'none' };
        }
        case 1: {
            const field = storeFields[0];
            const values = rows.map((row) => (row == null ? undefined : row[field]));
            return { kind: 'single', field, values };
        }
        default:
            return { kind: 'multi', rows };
    }
}
|
|
2519
|
+
/**
 * Total size, in UTF-8 bytes, of the JSON rows a layout serializes to.
 *
 * `multi` sums the JSON of every non-null row; `single` sums the JSON of
 * `{ [field]: value }` for every value that is not `undefined`; `none` is 0.
 *
 * @param layout Stored-fields layout.
 * @returns Byte count of the JSON payloads (no length prefixes or tables).
 */
function storedFieldsJsonBytes(layout) {
    if (layout.kind === 'none') {
        return 0;
    }
    let total = 0;
    if (layout.kind === 'multi') {
        for (const row of layout.rows) {
            if (row != null) {
                // Count encoded bytes: `String.length` is UTF-16 code units and
                // undercounts any non-ASCII content.
                total += Buffer.byteLength(JSON.stringify(row), 'utf8');
            }
        }
        return total;
    }
    const { field, values } = layout;
    for (let i = 0; i < values.length; i++) {
        const value = values[i];
        if (value !== undefined) {
            total += Buffer.byteLength(JSON.stringify({ [field]: value }), 'utf8');
        }
    }
    return total;
}
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2539
|
+
/**
 * Number of document slots held by a layout: the length of its backing array,
 * or 0 for a `none` layout.
 */
function storedFieldsSlotCount(layout) {
    switch (layout.kind) {
        case 'none':
            return 0;
        case 'single':
            return layout.values.length;
        default:
            return layout.rows.length;
    }
}
|
|
2544
|
+
/**
 * Append one JSON payload to the stored-fields heap and point the document's
 * table entry at it.
 *
 * @param table      Offset table; entry `docIndex` is set to the current heap offset + 1.
 * @param heapChunks Receives one new chunk: uint32 length followed by the payload.
 * @param heapOffRef `{ value }` holding the running heap size; advanced by the chunk size.
 * @param docIndex   Document slot being written.
 * @param jsonUtf8   Buffer with the UTF-8 JSON payload.
 */
function appendStoredFieldJsonEntry(table, heapChunks, heapOffRef, docIndex, jsonUtf8) {
    const payloadLength = jsonUtf8.length;
    table.writeUInt32LE(heapOffRef.value + 1, docIndex * 4);
    const framed = Buffer.alloc(4 + payloadLength);
    framed.writeUInt32LE(payloadLength, 0);
    jsonUtf8.copy(framed, 4);
    heapChunks.push(framed);
    heapOffRef.value += framed.length;
}
|
|
2552
|
+
/**
 * MSv5 StoredFields section from {@link StoredFieldsLayout} (no intermediate row array).
 *
 * - `multi`: rows are padded up to `nextId` slots and encoded by buildStoredFieldsSection.
 * - `none`: an all-zero offset table of `nextId` entries.
 * - `single`: each defined value is encoded as the JSON of `{ [field]: value }`.
 */
function buildStoredFieldsWireSection(layout, nextId) {
    if (layout.kind === 'multi') {
        let rows = layout.rows;
        if (!(rows.length >= nextId)) {
            rows = rows.concat(new Array(nextId - rows.length));
        }
        return buildStoredFieldsSection(rows, nextId);
    }
    const offsetTable = Buffer.alloc(nextId * 4);
    if (layout.kind === 'none') {
        return offsetTable;
    }
    const heap = [];
    const heapCursor = { value: 0 };
    const columnName = layout.field;
    const column = layout.values;
    for (let docIndex = 0; docIndex < nextId; docIndex++) {
        const cell = column[docIndex];
        if (cell !== undefined) {
            const encoded = Buffer.from(JSON.stringify({ [columnName]: cell }), 'utf8');
            appendStoredFieldJsonEntry(offsetTable, heap, heapCursor, docIndex, encoded);
        }
    }
    if (heap.length === 0) {
        return offsetTable;
    }
    return Buffer.concat([offsetTable, ...heap]);
}
|
|
2257
|
-
function
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2575
|
+
/**
 * Offset just past the stored-fields offset table (`nextId` uint32 entries
 * starting at `storedOff`).
 *
 * @throws the error built by `invalidFrozenIndex` when the table would extend past `sectionEnd`.
 */
function storedFieldsTableEnd(storedOff, nextId, sectionEnd) {
    const endOfTable = storedOff + nextId * 4;
    if (endOfTable <= sectionEnd) {
        return endOfTable;
    }
    throw invalidFrozenIndex('stored fields table out of bounds');
}
|
|
2582
|
+
/**
 * Decode one stored-fields heap entry.
 *
 * @param tableEnd   Offset where the heap starts (end of the offset table).
 * @param sectionEnd Offset the entry must not cross.
 * @param rel        Non-zero table entry; `rel - 1` is the heap-relative offset
 *                   of a uint32 length followed by UTF-8 JSON.
 * @returns The parsed JSON value.
 * @throws the error built by `invalidFrozenIndex` when the entry reaches past `sectionEnd`.
 */
function readStoredFieldJsonAt(buf, tableEnd, sectionEnd, rel) {
    const lengthAt = tableEnd + rel - 1;
    if (lengthAt + 4 > sectionEnd) {
        throw invalidFrozenIndex('stored fields entry offset out of bounds');
    }
    const jsonStart = lengthAt + 4;
    const jsonEnd = jsonStart + buf.readUInt32LE(lengthAt);
    if (jsonEnd > sectionEnd) {
        throw invalidFrozenIndex('stored fields JSON out of bounds');
    }
    const text = buf.toString('utf8', jsonStart, jsonEnd);
    return JSON.parse(text);
}
|
|
2595
|
+
/**
 * MSv5 StoredFields section → layout (skips row materialization when storeFields hint allows).
 *
 * - One store field: each stored row is decoded and only that field's value is
 *   kept, giving a `single` layout.
 * - No store fields: an all-zero offset table gives `none` without decoding anything.
 * - Otherwise: all rows are decoded and handed to storedFieldsFromRows.
 *
 * @param buf         Buffer holding the section.
 * @param storedOff   Offset of the offset table.
 * @param nextId      Number of document slots.
 * @param sectionEnd  Offset the section must not cross.
 * @param storeFields Names of the stored fields, if known (may be empty).
 */
function readStoredFieldsWireSection(buf, storedOff, nextId, sectionEnd, storeFields) {
    const tableEnd = storedFieldsTableEnd(storedOff, nextId, sectionEnd);
    if (storeFields.length === 1) {
        const field = storeFields[0];
        const values = new Array(nextId);
        for (let i = 0; i < nextId; i++) {
            const rel = buf.readUInt32LE(storedOff + i * 4);
            if (rel === 0)
                continue;
            const row = readStoredFieldJsonAt(buf, tableEnd, sectionEnd, rel);
            // An entry can decode to JSON `null`. Treat it as "no value", as the
            // row-based path does, instead of failing with a TypeError.
            values[i] = row == null ? undefined : row[field];
        }
        return { kind: 'single', field, values };
    }
    if (storeFields.length === 0) {
        let hasAny = false;
        for (let i = 0; i < nextId; i++) {
            if (buf.readUInt32LE(storedOff + i * 4) !== 0) {
                hasAny = true;
                break;
            }
        }
        if (!hasAny)
            return { kind: 'none' };
    }
    const rows = readStoredFieldsSection(buf, storedOff, nextId, sectionEnd);
    return storedFieldsFromRows(rows, storeFields);
}
|
|
2268
2624
|
|
|
2269
2625
|
const SUPPORTED_SERIALIZATION_VERSIONS = new Set([1, 2]);
|
|
@@ -2349,7 +2705,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
|
|
|
2349
2705
|
let shortIdRemap = null;
|
|
2350
2706
|
const resolvedNextId = useDense ? documentCount : nextId;
|
|
2351
2707
|
const externalIds = new Array(resolvedNextId);
|
|
2352
|
-
const
|
|
2708
|
+
const storedFieldRows = new Array(externalIds.length);
|
|
2353
2709
|
if (useDense) {
|
|
2354
2710
|
shortIdRemap = new Uint32Array(nextId);
|
|
2355
2711
|
shortIdRemap.fill(DISCARDED_DOC_ID);
|
|
@@ -2361,7 +2717,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
|
|
|
2361
2717
|
const shortIdStr = String(shortId);
|
|
2362
2718
|
shortIdRemap[shortId] = dense;
|
|
2363
2719
|
externalIds[dense] = snapshot.documentIds[shortIdStr];
|
|
2364
|
-
|
|
2720
|
+
storedFieldRows[dense] = snapshot.storedFields[shortIdStr];
|
|
2365
2721
|
dense++;
|
|
2366
2722
|
}
|
|
2367
2723
|
}
|
|
@@ -2369,7 +2725,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
|
|
|
2369
2725
|
for (const [shortIdStr, id] of Object.entries(snapshot.documentIds)) {
|
|
2370
2726
|
const shortId = parseInt(shortIdStr, 10);
|
|
2371
2727
|
externalIds[shortId] = id;
|
|
2372
|
-
|
|
2728
|
+
storedFieldRows[shortId] = snapshot.storedFields[shortIdStr];
|
|
2373
2729
|
}
|
|
2374
2730
|
}
|
|
2375
2731
|
const idLookup = createIdToShortIdLookup(externalIds, resolvedNextId);
|
|
@@ -2392,6 +2748,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
|
|
|
2392
2748
|
}
|
|
2393
2749
|
const searchableMap = buildSearchableMapFromSnapshot(snapshot);
|
|
2394
2750
|
const flat = buildFlatPostingsFromSearchableMap(searchableMap, fieldCount, resolvedNextId, shortIdRemap);
|
|
2751
|
+
const storedFields = storedFieldsFromRows(storedFieldRows, opts.storeFields);
|
|
2395
2752
|
return {
|
|
2396
2753
|
options: opts,
|
|
2397
2754
|
documentCount,
|
|
@@ -2693,321 +3050,121 @@ function collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32)
|
|
|
2693
3050
|
sectionId++;
|
|
2694
3051
|
}
|
|
2695
3052
|
}
|
|
2696
|
-
}
|
|
2697
|
-
function finish() {
|
|
2698
|
-
emitEmptySections();
|
|
2699
|
-
if (streamOffset !== uncompressedLength || sectionId !== directory.length) {
|
|
2700
|
-
throw new Error('MSv5 zstd decompressed length mismatch');
|
|
2701
|
-
}
|
|
2702
|
-
if (payloadCrc !== payloadCrc32) {
|
|
2703
|
-
throw new Error('MSv5 payload CRC mismatch');
|
|
2704
|
-
}
|
|
2705
|
-
}
|
|
2706
|
-
return { sections, consume, finish };
|
|
2707
|
-
}
|
|
2708
|
-
function loadMsv5SectionsFromZstdStream(compressed, directory, uncompressedLength, payloadCrc32) {
|
|
2709
|
-
return new Promise((resolve, reject) => {
|
|
2710
|
-
const collector = collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32);
|
|
2711
|
-
const stream = zlib.createZstdDecompress();
|
|
2712
|
-
stream.on('data', (chunk) => {
|
|
2713
|
-
try {
|
|
2714
|
-
collector.consume(chunk);
|
|
2715
|
-
}
|
|
2716
|
-
catch (err) {
|
|
2717
|
-
stream.destroy(err);
|
|
2718
|
-
}
|
|
2719
|
-
});
|
|
2720
|
-
stream.on('error', reject);
|
|
2721
|
-
stream.on('end', () => {
|
|
2722
|
-
try {
|
|
2723
|
-
collector.finish();
|
|
2724
|
-
resolve(collector.sections);
|
|
2725
|
-
}
|
|
2726
|
-
catch (err) {
|
|
2727
|
-
reject(err);
|
|
2728
|
-
}
|
|
2729
|
-
});
|
|
2730
|
-
stream.end(compressed);
|
|
2731
|
-
});
|
|
2732
|
-
}
|
|
2733
|
-
function validatePayloadDirectory(directory, uncompressedLength) {
|
|
2734
|
-
let prevEnd = 0;
|
|
2735
|
-
for (const entry of directory) {
|
|
2736
|
-
if ((entry.fileOffset & 3) !== 0) {
|
|
2737
|
-
throw new Error('MSv5 section offset not aligned');
|
|
2738
|
-
}
|
|
2739
|
-
if (entry.fileOffset < prevEnd) {
|
|
2740
|
-
throw new Error('MSv5 section offsets not monotonic');
|
|
2741
|
-
}
|
|
2742
|
-
if (entry.fileOffset + entry.uncompressedLength > uncompressedLength) {
|
|
2743
|
-
throw new Error('MSv5 section out of uncompressed bounds');
|
|
2744
|
-
}
|
|
2745
|
-
prevEnd = entry.fileOffset + entry.uncompressedLength;
|
|
2746
|
-
}
|
|
2747
|
-
if (prevEnd !== uncompressedLength) {
|
|
2748
|
-
throw new Error('MSv5 uncompressed payload length mismatch');
|
|
2749
|
-
}
|
|
2750
|
-
}
|
|
2751
|
-
/** Shared validation + bounds for both the sync and async load paths. */
|
|
2752
|
-
function preparePayload(fileBuf, directory) {
|
|
2753
|
-
assertPayloadFormatRev(fileBuf);
|
|
2754
|
-
const { payloadOffset, compressedLength, uncompressedLength, payloadCrc32, payloadCodec } = readPayloadMeta(fileBuf);
|
|
2755
|
-
validatePayloadDirectory(directory, uncompressedLength);
|
|
2756
|
-
if (payloadOffset !== MSV5_HEADER_SIZE || payloadOffset + compressedLength > fileBuf.length) {
|
|
2757
|
-
throw new Error('MSv5 payload out of bounds');
|
|
2758
|
-
}
|
|
2759
|
-
if (payloadCodec === CODEC_RAW && compressedLength !== uncompressedLength) {
|
|
2760
|
-
throw new Error('MSv5 raw payload length mismatch');
|
|
2761
|
-
}
|
|
2762
|
-
return {
|
|
2763
|
-
payloadCodec,
|
|
2764
|
-
slice: fileBuf.subarray(payloadOffset, payloadOffset + compressedLength),
|
|
2765
|
-
uncompressedLength,
|
|
2766
|
-
payloadCrc32,
|
|
2767
|
-
};
|
|
2768
|
-
}
|
|
2769
|
-
/** Synchronous load; peak RAM ≈ full uncompressed payload (use the async path to bound it). */
|
|
2770
|
-
function loadMsv5Sections(fileBuf, directory) {
|
|
2771
|
-
const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
|
|
2772
|
-
if (payloadCodec === CODEC_RAW) {
|
|
2773
|
-
return sectionsFromPayload(slice, directory, payloadCrc32);
|
|
2774
|
-
}
|
|
2775
|
-
if (payloadCodec === CODEC_ZSTD) {
|
|
2776
|
-
if (!zstdAvailable()) {
|
|
2777
|
-
throw zstdUnavailableReadError();
|
|
2778
|
-
}
|
|
2779
|
-
// Native cap matches readPayloadMeta's 1 GiB limit (see MSV5_MAX_UNCOMPRESSED_BYTES).
|
|
2780
|
-
// Using header `uncompressedLength` here would only help when the header understates
|
|
2781
|
-
// the zstd stream but the attacker can inflate the header too — same worst case.
|
|
2782
|
-
const decoded = zlib.zstdDecompressSync(slice, {
|
|
2783
|
-
maxOutputLength: MSV5_MAX_UNCOMPRESSED_BYTES,
|
|
2784
|
-
});
|
|
2785
|
-
if (decoded.length !== uncompressedLength) {
|
|
2786
|
-
throw new Error('MSv5 zstd decompressed length mismatch');
|
|
2787
|
-
}
|
|
2788
|
-
return sectionsFromPayload(decoded, directory, payloadCrc32);
|
|
2789
|
-
}
|
|
2790
|
-
throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
|
|
2791
|
-
}
|
|
2792
|
-
/** Streaming load; peak main-thread RAM ≈ largest single section (+ file buffer). */
|
|
2793
|
-
async function loadMsv5SectionsAsync(fileBuf, directory) {
|
|
2794
|
-
const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
|
|
2795
|
-
if (payloadCodec === CODEC_RAW) {
|
|
2796
|
-
return sectionsFromPayload(slice, directory, payloadCrc32);
|
|
2797
|
-
}
|
|
2798
|
-
if (payloadCodec === CODEC_ZSTD) {
|
|
2799
|
-
if (!zstdAvailable()) {
|
|
2800
|
-
throw zstdUnavailableReadError();
|
|
2801
|
-
}
|
|
2802
|
-
return loadMsv5SectionsFromZstdStream(slice, directory, uncompressedLength, payloadCrc32);
|
|
2803
|
-
}
|
|
2804
|
-
throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
|
|
2805
|
-
}
|
|
2806
|
-
function isMsv5Buffer(buf) {
|
|
2807
|
-
return buf.length >= 4 && buf.toString('ascii', 0, 4) === 'MSv5';
|
|
2808
|
-
}
|
|
2809
|
-
function readMsv5GlobalFlags(buf) {
|
|
2810
|
-
return buf.readUInt16LE(6);
|
|
2811
|
-
}
|
|
2812
|
-
|
|
2813
|
-
function validateTreeShape(shape, termCount) {
|
|
2814
|
-
if (!Array.isArray(shape)) {
|
|
2815
|
-
throw invalidFrozenIndex('treeShape node must be an array');
|
|
2816
|
-
}
|
|
2817
|
-
for (const entry of shape) {
|
|
2818
|
-
if (!Array.isArray(entry) || entry.length !== 2) {
|
|
2819
|
-
throw invalidFrozenIndex('treeShape entry must be a [key, value] pair');
|
|
2820
|
-
}
|
|
2821
|
-
const [key, value] = entry;
|
|
2822
|
-
if (key === LEAF) {
|
|
2823
|
-
const idx = value;
|
|
2824
|
-
if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
|
|
2825
|
-
throw invalidFrozenIndex(`treeShape leaf term index out of range: ${idx}`);
|
|
2826
|
-
}
|
|
2827
|
-
}
|
|
2828
|
-
else {
|
|
2829
|
-
validateTreeShape(value, termCount);
|
|
2830
|
-
}
|
|
2831
|
-
}
|
|
2832
|
-
}
|
|
2833
|
-
function termCountOf(snap) {
|
|
2834
|
-
return snap.postings.termCount;
|
|
2835
|
-
}
|
|
2836
|
-
/**
|
|
2837
|
-
* Numeric/structural invariants shared by both the decode path (untrusted binary)
|
|
2838
|
-
* and the build path (trusted internal code).
|
|
2839
|
-
*/
|
|
2840
|
-
function validateFrozenSnapshotNumeric(snap) {
|
|
2841
|
-
if (snap.fieldCount <= 0) {
|
|
2842
|
-
throw invalidFrozenIndex('fieldCount must be positive');
|
|
2843
|
-
}
|
|
2844
|
-
if (snap.nextId < 0 || snap.nextId >= 0xffffffff) {
|
|
2845
|
-
throw invalidFrozenIndex('nextId out of range');
|
|
2846
|
-
}
|
|
2847
|
-
if (snap.documentCount < 0 || snap.documentCount > snap.nextId) {
|
|
2848
|
-
throw invalidFrozenIndex('documentCount inconsistent with nextId');
|
|
2849
|
-
}
|
|
2850
|
-
if (snap.fieldLengthMatrix.length !== snap.nextId * snap.fieldCount) {
|
|
2851
|
-
throw invalidFrozenIndex('fieldLengthMatrix size mismatch');
|
|
2852
|
-
}
|
|
2853
|
-
if (snap.avgFieldLength.length !== snap.fieldCount) {
|
|
2854
|
-
throw invalidFrozenIndex('avgFieldLength size mismatch');
|
|
2855
|
-
}
|
|
2856
|
-
validateFrozenPostingsLayout(snap.postings, snap.documentCount, snap.nextId, detail => {
|
|
2857
|
-
throw invalidFrozenIndex(detail);
|
|
2858
|
-
});
|
|
2859
|
-
const indexedFields = Object.keys(snap.fieldIds);
|
|
2860
|
-
if (indexedFields.length !== snap.fieldCount) {
|
|
2861
|
-
throw invalidFrozenIndex('fieldIds count mismatch');
|
|
2862
|
-
}
|
|
2863
|
-
for (let f = 0; f < snap.fieldCount; f++) {
|
|
2864
|
-
const found = indexedFields.some(name => snap.fieldIds[name] === f);
|
|
2865
|
-
if (!found) {
|
|
2866
|
-
throw invalidFrozenIndex(`missing field id ${f}`);
|
|
2867
|
-
}
|
|
2868
|
-
}
|
|
2869
|
-
}
|
|
2870
|
-
function readFieldNamesSection(buf, fieldNamesOff, fieldCount, externalIdsOff) {
|
|
2871
|
-
const fieldNames = [];
|
|
2872
|
-
let o = fieldNamesOff;
|
|
2873
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
2874
|
-
const { value, next } = readLengthPrefixedUtf8(buf, o);
|
|
2875
|
-
fieldNames.push(value);
|
|
2876
|
-
o = next;
|
|
2877
|
-
}
|
|
2878
|
-
if (o !== externalIdsOff) {
|
|
2879
|
-
throw invalidFrozenIndex('field names section size mismatch');
|
|
2880
|
-
}
|
|
2881
|
-
return fieldNames;
|
|
2882
|
-
}
|
|
2883
|
-
function readExternalIdsSection(buf, externalIdsOff, nextId, storedOff) {
|
|
2884
|
-
const externalIds = new Array(nextId);
|
|
2885
|
-
let o = externalIdsOff;
|
|
2886
|
-
for (let i = 0; i < nextId; i++) {
|
|
2887
|
-
const { value, next } = readExternalId(buf, o);
|
|
2888
|
-
externalIds[i] = value;
|
|
2889
|
-
o = next;
|
|
2890
|
-
}
|
|
2891
|
-
if (o !== storedOff) {
|
|
2892
|
-
throw invalidFrozenIndex('external ids section size mismatch');
|
|
2893
|
-
}
|
|
2894
|
-
return externalIds;
|
|
2895
|
-
}
|
|
2896
|
-
function readStoredFieldsSection(buf, storedOff, nextId, sectionEnd) {
|
|
2897
|
-
const storedFields = new Array(nextId);
|
|
2898
|
-
const tableEnd = storedOff + nextId * 4;
|
|
2899
|
-
if (tableEnd > sectionEnd) {
|
|
2900
|
-
throw invalidFrozenIndex('stored fields table out of bounds');
|
|
2901
|
-
}
|
|
2902
|
-
for (let i = 0; i < nextId; i++) {
|
|
2903
|
-
const rel = buf.readUInt32LE(storedOff + i * 4);
|
|
2904
|
-
if (rel === 0) {
|
|
2905
|
-
storedFields[i] = undefined;
|
|
2906
|
-
continue;
|
|
2907
|
-
}
|
|
2908
|
-
const entryOff = tableEnd + rel - 1;
|
|
2909
|
-
if (entryOff + 4 > sectionEnd) {
|
|
2910
|
-
throw invalidFrozenIndex('stored fields entry offset out of bounds');
|
|
2911
|
-
}
|
|
2912
|
-
const jsonLen = buf.readUInt32LE(entryOff);
|
|
2913
|
-
const jsonStart = entryOff + 4;
|
|
2914
|
-
const jsonEnd = jsonStart + jsonLen;
|
|
2915
|
-
if (jsonEnd > sectionEnd) {
|
|
2916
|
-
throw invalidFrozenIndex('stored fields JSON out of bounds');
|
|
3053
|
+
}
|
|
3054
|
+
function finish() {
|
|
3055
|
+
emitEmptySections();
|
|
3056
|
+
if (streamOffset !== uncompressedLength || sectionId !== directory.length) {
|
|
3057
|
+
throw new Error('MSv5 zstd decompressed length mismatch');
|
|
3058
|
+
}
|
|
3059
|
+
if (payloadCrc !== payloadCrc32) {
|
|
3060
|
+
throw new Error('MSv5 payload CRC mismatch');
|
|
2917
3061
|
}
|
|
2918
|
-
storedFields[i] = JSON.parse(buf.toString('utf8', jsonStart, jsonEnd));
|
|
2919
3062
|
}
|
|
2920
|
-
return
|
|
3063
|
+
return { sections, consume, finish };
|
|
2921
3064
|
}
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
|
|
2928
|
-
|
|
2929
|
-
|
|
2930
|
-
|
|
3065
|
+
/**
 * Decompress a zstd payload through a streaming decompressor and hand every
 * decoded chunk to a section collector.
 *
 * @param compressed - zstd-compressed payload bytes written to the decompressor
 * @param directory - section directory forwarded to `collectZstdPayloadSections`
 * @param uncompressedLength - expected decoded size, forwarded to the collector
 * @param payloadCrc32 - expected payload CRC, forwarded to the collector
 * @returns a Promise resolving to the collector's `sections` once the stream
 *   ends and `finish()` succeeds; it rejects on any stream error, on an error
 *   thrown while consuming a chunk, or on an error thrown by `finish()`
 */
function loadMsv5SectionsFromZstdStream(compressed, directory, uncompressedLength, payloadCrc32) {
    return new Promise((resolve, reject) => {
        const sink = collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32);
        const inflater = zlib.createZstdDecompress();
        const onChunk = (chunk) => {
            try {
                sink.consume(chunk);
            }
            catch (err) {
                // Tear the stream down with the failure so it surfaces through 'error'.
                inflater.destroy(err);
            }
        };
        const onEnd = () => {
            try {
                sink.finish();
                resolve(sink.sections);
            }
            catch (err) {
                reject(err);
            }
        };
        inflater.on('data', onChunk);
        inflater.on('error', reject);
        inflater.on('end', onEnd);
        inflater.end(compressed);
    });
}
|
|
3090
|
+
/**
 * Check that the section directory tiles the uncompressed payload consistently.
 *
 * Walking the entries in order, each one must start on a 4-byte boundary, must
 * not start before the end of the previous entry, and must end inside the
 * payload. The end of the last entry must equal `uncompressedLength` exactly.
 *
 * @param directory - iterable of `{ fileOffset, uncompressedLength }` entries
 * @param uncompressedLength - total size of the uncompressed payload
 * @throws {Error} describing the first violated rule
 */
function validatePayloadDirectory(directory, uncompressedLength) {
    let cursor = 0;
    for (const entry of directory) {
        const start = entry.fileOffset;
        const misaligned = (start & 3) !== 0;
        if (misaligned) {
            throw new Error('MSv5 section offset not aligned');
        }
        if (start < cursor) {
            throw new Error('MSv5 section offsets not monotonic');
        }
        const end = start + entry.uncompressedLength;
        if (end > uncompressedLength) {
            throw new Error('MSv5 section out of uncompressed bounds');
        }
        cursor = end;
    }
    if (cursor !== uncompressedLength) {
        throw new Error('MSv5 uncompressed payload length mismatch');
    }
}
|
|
2936
|
-
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
|
|
2940
|
-
|
|
2941
|
-
|
|
2942
|
-
|
|
2943
|
-
const out = Buffer.alloc(16);
|
|
2944
|
-
out.writeUInt32LE(snap.documentCount, 0);
|
|
2945
|
-
out.writeUInt32LE(snap.nextId, 4);
|
|
2946
|
-
out.writeUInt32LE(snap.fieldCount, 8);
|
|
2947
|
-
out.writeUInt32LE(termCountOf(snap), 12);
|
|
2948
|
-
return out;
|
|
2949
|
-
}
|
|
2950
|
-
function buildFieldNamesSection(fieldNames) {
|
|
2951
|
-
const chunks = [];
|
|
2952
|
-
for (const name of fieldNames) {
|
|
2953
|
-
const body = Buffer.from(name, 'utf8');
|
|
2954
|
-
const header = Buffer.alloc(4);
|
|
2955
|
-
header.writeUInt32LE(body.length, 0);
|
|
2956
|
-
chunks.push(header, body);
|
|
3108
|
+
/** Shared validation + bounds for both the sync and async load paths. */
|
|
3109
|
+
function preparePayload(fileBuf, directory) {
|
|
3110
|
+
assertPayloadFormatRev(fileBuf);
|
|
3111
|
+
const { payloadOffset, compressedLength, uncompressedLength, payloadCrc32, payloadCodec } = readPayloadMeta(fileBuf);
|
|
3112
|
+
validatePayloadDirectory(directory, uncompressedLength);
|
|
3113
|
+
if (payloadOffset !== MSV5_HEADER_SIZE || payloadOffset + compressedLength > fileBuf.length) {
|
|
3114
|
+
throw new Error('MSv5 payload out of bounds');
|
|
2957
3115
|
}
|
|
2958
|
-
|
|
2959
|
-
|
|
2960
|
-
function buildExternalIdsSection(externalIds, nextId) {
|
|
2961
|
-
const chunks = [];
|
|
2962
|
-
for (let i = 0; i < nextId; i++) {
|
|
2963
|
-
writeExternalId(chunks, externalIds[i]);
|
|
3116
|
+
if (payloadCodec === CODEC_RAW && compressedLength !== uncompressedLength) {
|
|
3117
|
+
throw new Error('MSv5 raw payload length mismatch');
|
|
2964
3118
|
}
|
|
2965
|
-
return
|
|
3119
|
+
return {
|
|
3120
|
+
payloadCodec,
|
|
3121
|
+
slice: fileBuf.subarray(payloadOffset, payloadOffset + compressedLength),
|
|
3122
|
+
uncompressedLength,
|
|
3123
|
+
payloadCrc32,
|
|
3124
|
+
};
|
|
2966
3125
|
}
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
const
|
|
2970
|
-
|
|
2971
|
-
|
|
2972
|
-
const row = storedFields[i];
|
|
2973
|
-
if (row == null) {
|
|
2974
|
-
table.writeUInt32LE(0, i * 4);
|
|
2975
|
-
continue;
|
|
2976
|
-
}
|
|
2977
|
-
table.writeUInt32LE(heapOff + 1, i * 4);
|
|
2978
|
-
const json = Buffer.from(JSON.stringify(row), 'utf8');
|
|
2979
|
-
const entry = Buffer.alloc(4 + json.length);
|
|
2980
|
-
entry.writeUInt32LE(json.length, 0);
|
|
2981
|
-
json.copy(entry, 4);
|
|
2982
|
-
heapChunks.push(entry);
|
|
2983
|
-
heapOff += entry.length;
|
|
3126
|
+
/** Synchronous load; peak RAM ≈ full uncompressed payload (use the async path to bound it). */
|
|
3127
|
+
function loadMsv5Sections(fileBuf, directory) {
|
|
3128
|
+
const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
|
|
3129
|
+
if (payloadCodec === CODEC_RAW) {
|
|
3130
|
+
return sectionsFromPayload(slice, directory, payloadCrc32);
|
|
2984
3131
|
}
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
for (const [key, val] of tree) {
|
|
2989
|
-
if (key === LEAF) {
|
|
2990
|
-
const idx = val;
|
|
2991
|
-
if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
|
|
2992
|
-
throw invalidFrozenIndex(`term tree leaf index out of range: ${idx}`);
|
|
2993
|
-
}
|
|
3132
|
+
if (payloadCodec === CODEC_ZSTD) {
|
|
3133
|
+
if (!zstdAvailable()) {
|
|
3134
|
+
throw zstdUnavailableReadError();
|
|
2994
3135
|
}
|
|
2995
|
-
|
|
2996
|
-
|
|
3136
|
+
// Native cap matches readPayloadMeta's 1 GiB limit (see MSV5_MAX_UNCOMPRESSED_BYTES).
|
|
3137
|
+
// Using header `uncompressedLength` here would only help when the header understates
|
|
3138
|
+
// the zstd stream but the attacker can inflate the header too — same worst case.
|
|
3139
|
+
const decoded = zlib.zstdDecompressSync(slice, {
|
|
3140
|
+
maxOutputLength: MSV5_MAX_UNCOMPRESSED_BYTES,
|
|
3141
|
+
});
|
|
3142
|
+
if (decoded.length !== uncompressedLength) {
|
|
3143
|
+
throw new Error('MSv5 zstd decompressed length mismatch');
|
|
2997
3144
|
}
|
|
3145
|
+
return sectionsFromPayload(decoded, directory, payloadCrc32);
|
|
2998
3146
|
}
|
|
3147
|
+
throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
|
|
2999
3148
|
}
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
|
|
3004
|
-
|
|
3005
|
-
|
|
3006
|
-
|
|
3007
|
-
|
|
3149
|
+
/** Streaming load; peak main-thread RAM ≈ largest single section (+ file buffer). */
|
|
3150
|
+
async function loadMsv5SectionsAsync(fileBuf, directory) {
|
|
3151
|
+
const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
|
|
3152
|
+
if (payloadCodec === CODEC_RAW) {
|
|
3153
|
+
return sectionsFromPayload(slice, directory, payloadCrc32);
|
|
3154
|
+
}
|
|
3155
|
+
if (payloadCodec === CODEC_ZSTD) {
|
|
3156
|
+
if (!zstdAvailable()) {
|
|
3157
|
+
throw zstdUnavailableReadError();
|
|
3008
3158
|
}
|
|
3159
|
+
return loadMsv5SectionsFromZstdStream(slice, directory, uncompressedLength, payloadCrc32);
|
|
3009
3160
|
}
|
|
3010
|
-
|
|
3161
|
+
throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
|
|
3162
|
+
}
|
|
3163
|
+
/**
 * Report whether `buf` begins with the four-byte magic `MSv5`.
 *
 * The bytes are compared exactly. Decoding with the 'ascii' encoding clears the
 * high bit of every byte before comparing, so a header such as CD D3 F6 B5
 * would also read back as "MSv5" and be accepted as a valid magic.
 *
 * @param buf - candidate snapshot bytes (Buffer or any Uint8Array)
 * @returns {boolean} true only when the first four bytes are 4D 53 76 35
 */
function isMsv5Buffer(buf) {
    return (buf.length >= 4 &&
        buf[0] === 0x4d && // 'M'
        buf[1] === 0x53 && // 'S'
        buf[2] === 0x76 && // 'v'
        buf[3] === 0x35); // '5'
}
|
|
3166
|
+
/**
 * Read the global flags word from a snapshot buffer.
 *
 * @param buf - Buffer holding at least the first 8 bytes of the snapshot
 * @returns {number} the unsigned 16-bit little-endian value stored at byte offset 6
 */
function readMsv5GlobalFlags(buf) {
    const flagsOffset = 6;
    const flags = buf.readUInt16LE(flagsOffset);
    return flags;
}
|
|
3012
3169
|
|
|
3013
3170
|
/** Global wire flags for {@link FreqArray} width. */
|
|
@@ -3301,11 +3458,14 @@ function encodeFrozenSnapshotMsv5(snap, termTree, packedTermIndex) {
|
|
|
3301
3458
|
const flFlags = fieldLengthMatrixWireFlags(snap.fieldLengthMatrix);
|
|
3302
3459
|
const freqFlags = freqWireFlags(snap.postings.allFreqs);
|
|
3303
3460
|
const globalFlags = postingsWire.flags | flFlags | freqFlags;
|
|
3461
|
+
const storedFieldsSection = snap.storedFieldsLayout != null
|
|
3462
|
+
? buildStoredFieldsWireSection(snap.storedFieldsLayout, snap.nextId)
|
|
3463
|
+
: buildStoredFieldsSection(snap.storedFields, snap.nextId);
|
|
3304
3464
|
const rawSections = [
|
|
3305
3465
|
buildCoreSectionWithTermCount(snap),
|
|
3306
3466
|
buildFieldNamesSection(fieldNames),
|
|
3307
3467
|
buildExternalIdsSection(snap.externalIds, snap.nextId),
|
|
3308
|
-
|
|
3468
|
+
storedFieldsSection,
|
|
3309
3469
|
buildTermTreeSectionColumnar(packed),
|
|
3310
3470
|
bufferFromView(snap.avgFieldLength),
|
|
3311
3471
|
buildFieldLengthMatrixSection(snap.fieldLengthMatrix),
|
|
@@ -3329,11 +3489,14 @@ async function encodeFrozenSnapshotMsv5Async(snap, termTree, packedTermIndex) {
|
|
|
3329
3489
|
const flFlags = fieldLengthMatrixWireFlags(snap.fieldLengthMatrix);
|
|
3330
3490
|
const freqFlags = freqWireFlags(snap.postings.allFreqs);
|
|
3331
3491
|
const globalFlags = postingsWire.flags | flFlags | freqFlags;
|
|
3492
|
+
const storedFieldsSection = snap.storedFieldsLayout != null
|
|
3493
|
+
? buildStoredFieldsWireSection(snap.storedFieldsLayout, snap.nextId)
|
|
3494
|
+
: buildStoredFieldsSection(snap.storedFields, snap.nextId);
|
|
3332
3495
|
const rawSections = [
|
|
3333
3496
|
buildCoreSectionWithTermCount(snap),
|
|
3334
3497
|
buildFieldNamesSection(fieldNames),
|
|
3335
3498
|
buildExternalIdsSection(snap.externalIds, snap.nextId),
|
|
3336
|
-
|
|
3499
|
+
storedFieldsSection,
|
|
3337
3500
|
buildTermTreeSectionColumnar(packed),
|
|
3338
3501
|
bufferFromView(snap.avgFieldLength),
|
|
3339
3502
|
buildFieldLengthMatrixSection(snap.fieldLengthMatrix),
|
|
@@ -3363,7 +3526,7 @@ function validateMsv5Container(buf) {
|
|
|
3363
3526
|
}
|
|
3364
3527
|
return { globalFlags, directory };
|
|
3365
3528
|
}
|
|
3366
|
-
function decodeMsv5Sections(globalFlags, sections) {
|
|
3529
|
+
function decodeMsv5Sections(globalFlags, sections, hints) {
|
|
3367
3530
|
const core = sections[0 /* Msv5SectionId.Core */];
|
|
3368
3531
|
if (core.length !== 16) {
|
|
3369
3532
|
throw invalidFrozenIndex('core section size mismatch');
|
|
@@ -3378,7 +3541,12 @@ function decodeMsv5Sections(globalFlags, sections) {
|
|
|
3378
3541
|
fieldIds[fieldNames[f]] = f;
|
|
3379
3542
|
}
|
|
3380
3543
|
const externalIds = readExternalIdsSection(sections[2 /* Msv5SectionId.ExternalIds */], 0, nextId, sections[2 /* Msv5SectionId.ExternalIds */].length);
|
|
3381
|
-
const
|
|
3544
|
+
const storedFieldsLayout = hints != null
|
|
3545
|
+
? readStoredFieldsWireSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length, hints.storeFields)
|
|
3546
|
+
: undefined;
|
|
3547
|
+
const storedFields = storedFieldsLayout != null
|
|
3548
|
+
? new Array(nextId)
|
|
3549
|
+
: readStoredFieldsSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length);
|
|
3382
3550
|
const packedTermIndex = readPackedTermTreeSectionColumnar(sections[4 /* Msv5SectionId.TermTree */], termCount);
|
|
3383
3551
|
const avgBuf = sections[5 /* Msv5SectionId.AvgFieldLength */];
|
|
3384
3552
|
const avgFieldLength = readFloat32Array(avgBuf, 0, avgBuf.length);
|
|
@@ -3396,6 +3564,7 @@ function decodeMsv5Sections(globalFlags, sections) {
|
|
|
3396
3564
|
avgFieldLength,
|
|
3397
3565
|
externalIds,
|
|
3398
3566
|
storedFields,
|
|
3567
|
+
storedFieldsLayout,
|
|
3399
3568
|
fieldLengthMatrix,
|
|
3400
3569
|
treeShape: [],
|
|
3401
3570
|
packedTermIndex,
|
|
@@ -3404,13 +3573,13 @@ function decodeMsv5Sections(globalFlags, sections) {
|
|
|
3404
3573
|
validateFrozenSnapshot(snap);
|
|
3405
3574
|
return snap;
|
|
3406
3575
|
}
|
|
3407
|
-
function decodeFrozenSnapshotMsv5(buf) {
|
|
3576
|
+
function decodeFrozenSnapshotMsv5(buf, hints) {
|
|
3408
3577
|
const { globalFlags, directory } = validateMsv5Container(buf);
|
|
3409
|
-
return decodeMsv5Sections(globalFlags, loadMsv5Sections(buf, directory));
|
|
3578
|
+
return decodeMsv5Sections(globalFlags, loadMsv5Sections(buf, directory), hints);
|
|
3410
3579
|
}
|
|
3411
|
-
async function decodeFrozenSnapshotMsv5Async(buf) {
|
|
3580
|
+
async function decodeFrozenSnapshotMsv5Async(buf, hints) {
|
|
3412
3581
|
const { globalFlags, directory } = validateMsv5Container(buf);
|
|
3413
|
-
return decodeMsv5Sections(globalFlags, await loadMsv5SectionsAsync(buf, directory));
|
|
3582
|
+
return decodeMsv5Sections(globalFlags, await loadMsv5SectionsAsync(buf, directory), hints);
|
|
3414
3583
|
}
|
|
3415
3584
|
|
|
3416
3585
|
/** Encode a frozen snapshot as a binary buffer. */
|
|
@@ -3424,12 +3593,12 @@ function encodeFrozenSnapshotAsync(snap, termTree, packedTermIndex) {
|
|
|
3424
3593
|
|
|
3425
3594
|
const LEGACY_MAGICS = new Set(['MSv1', 'MSv2', 'MSv3', 'MSv4']);
|
|
3426
3595
|
/** Decode a frozen binary snapshot buffer. */
|
|
3427
|
-
function decodeFrozenSnapshot(buf) {
|
|
3596
|
+
function decodeFrozenSnapshot(buf, hints) {
|
|
3428
3597
|
assertBufferLength(buf, 8);
|
|
3429
3598
|
const magic = buf.toString('ascii', 0, 4);
|
|
3430
3599
|
const version = buf.readUInt16LE(4);
|
|
3431
3600
|
if (isMsv5Buffer(buf) && version === 5) {
|
|
3432
|
-
return decodeFrozenSnapshotMsv5(buf);
|
|
3601
|
+
return decodeFrozenSnapshotMsv5(buf, hints);
|
|
3433
3602
|
}
|
|
3434
3603
|
if (LEGACY_MAGICS.has(magic)) {
|
|
3435
3604
|
throw invalidFrozenIndex('Unsupported frozen binary snapshot; re-build with saveBinarySync() or from lucaong JSON');
|
|
@@ -3437,82 +3606,283 @@ function decodeFrozenSnapshot(buf) {
|
|
|
3437
3606
|
throw invalidFrozenIndex('Unsupported frozen binary snapshot');
|
|
3438
3607
|
}
|
|
3439
3608
|
/** Async frozen snapshot decode (streaming zstd). */
|
|
3440
|
-
async function decodeFrozenSnapshotAsync(buf) {
|
|
3609
|
+
async function decodeFrozenSnapshotAsync(buf, hints) {
|
|
3441
3610
|
assertBufferLength(buf, 8);
|
|
3442
3611
|
const version = buf.readUInt16LE(4);
|
|
3443
3612
|
if (isMsv5Buffer(buf) && version === 5) {
|
|
3444
|
-
return decodeFrozenSnapshotMsv5Async(buf);
|
|
3613
|
+
return decodeFrozenSnapshotMsv5Async(buf, hints);
|
|
3614
|
+
}
|
|
3615
|
+
return decodeFrozenSnapshot(buf, hints);
|
|
3616
|
+
}
|
|
3617
|
+
|
|
3618
|
+
const DEFAULT_CAPACITY = 16;
|
|
3619
|
+
/**
 * Growable unsigned 32-bit column (build scratch; narrowed to u16 at finalize when possible).
 *
 * Values are appended to a Uint32Array that doubles in size whenever it is full.
 */
class GrowableUint32Column {
    /**
     * @param {number} [initialCapacity] - number of cells to pre-allocate (at least 1 is allocated)
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint32Array(Math.max(1, initialCapacity));
    }
    /** Number of values currently stored (not the allocated capacity). */
    get length() {
        return this._len;
    }
    /**
     * Append one value, doubling the backing array first when it is full.
     * @param {number} value - value to store (coerced to u32 by the typed array)
     */
    push(value) {
        if (this._len >= this._buf.length) {
            const grown = new Uint32Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = value;
    }
    /**
     * Copy `length` cells starting at `sourceOffset` into `target` at `targetOffset`.
     *
     * @param {number} sourceOffset - first cell to read from this column
     * @param {number} length - number of cells to copy
     * @param target - destination array; a typed-array destination narrows each value on store
     * @param {number} targetOffset - first index written in `target`
     * @param {number} docIdWidth - not needed by the copy itself; kept so existing call sites stay valid
     */
    copyRangeInto(sourceOffset, length, target, targetOffset, docIdWidth) {
        // The 16-bit and 32-bit cases used to be two byte-identical loops; one loop covers both.
        const source = this._buf;
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = source[sourceOffset + i];
        }
    }
    /**
     * Set the logical length to `length` and shrink the backing array to match.
     *
     * `truncate(0)` releases the accumulated backing array by swapping in a
     * one-cell buffer; callers use it to drop scratch memory, and the column
     * stays usable because `push` grows it again on demand.
     *
     * @param {number} length - new logical length; expected to be between 0 and the current length
     */
    truncate(length) {
        this._len = length;
        if (length === 0) {
            if (this._buf.length > 1) {
                this._buf = new Uint32Array(1);
            }
        }
        else if (length > 0 && length < this._buf.length) {
            this._buf = this._buf.slice(0, length);
        }
    }
}
|
|
3655
|
+
/**
 * Growable frequency column (u16 cells; matches frozen clamp range).
 *
 * Values are passed through `clampFreq` and appended to a Uint16Array that
 * doubles in size whenever it is full.
 */
class GrowableFreqColumn {
    /**
     * @param {number} [initialCapacity] - number of cells to pre-allocate (at least 1 is allocated)
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint16Array(Math.max(1, initialCapacity));
    }
    /** Number of values currently stored (not the allocated capacity). */
    get length() {
        return this._len;
    }
    /**
     * Append one frequency, doubling the backing array first when it is full.
     * @param {number} freq - frequency to store; clamped with `clampFreq` before the write
     */
    push(freq) {
        if (this._len >= this._buf.length) {
            const grown = new Uint16Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = clampFreq(freq);
    }
    /**
     * Copy `length` cells starting at `sourceOffset` into `target` at `targetOffset`.
     *
     * @param {number} sourceOffset - first cell to read from this column
     * @param {number} length - number of cells to copy
     * @param target - destination array
     * @param {number} targetOffset - first index written in `target`
     */
    copyRangeInto(sourceOffset, length, target, targetOffset) {
        const source = this._buf;
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = source[sourceOffset + i];
        }
    }
    /**
     * Set the logical length to `length` and shrink the backing array to match.
     *
     * `truncate(0)` releases the accumulated backing array by swapping in a
     * one-cell buffer; callers use it to drop scratch memory, and the column
     * stays usable because `push` grows it again on demand.
     *
     * @param {number} length - new logical length; expected to be between 0 and the current length
     */
    truncate(length) {
        this._len = length;
        if (length === 0) {
            if (this._buf.length > 1) {
                this._buf = new Uint16Array(1);
            }
        }
        else if (length > 0 && length < this._buf.length) {
            this._buf = this._buf.slice(0, length);
        }
    }
}
|
|
3684
|
+
/**
|
|
3685
|
+
* Single-pass postings accumulator for {@link FrozenIndexBuilder}.
|
|
3686
|
+
* One global TypedArray stream per docIds/freqs; per-slot range metadata only.
|
|
3687
|
+
*/
|
|
3688
|
+
class IncrementalPostingsAccumulator {
|
|
3689
|
+
constructor(fieldCount, hints) {
|
|
3690
|
+
var _a;
|
|
3691
|
+
this._slots = new Map();
|
|
3692
|
+
this._totalPostings = 0;
|
|
3693
|
+
this._maxFreq = 0;
|
|
3694
|
+
this._fieldCount = fieldCount;
|
|
3695
|
+
const cap = Math.max(DEFAULT_CAPACITY, (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedTotalPostings) !== null && _a !== void 0 ? _a : 0);
|
|
3696
|
+
this._docIds = new GrowableUint32Column(cap);
|
|
3697
|
+
this._freqs = new GrowableFreqColumn(cap);
|
|
3698
|
+
}
|
|
3699
|
+
get totalPostings() {
|
|
3700
|
+
return this._totalPostings;
|
|
3701
|
+
}
|
|
3702
|
+
get maxFreq() {
|
|
3703
|
+
return this._maxFreq;
|
|
3704
|
+
}
|
|
3705
|
+
append(termIndex, fieldId, docId, freq) {
|
|
3706
|
+
const slot = termIndex * this._fieldCount + fieldId;
|
|
3707
|
+
const writeIdx = this._docIds.length;
|
|
3708
|
+
this._docIds.push(docId);
|
|
3709
|
+
const v = clampFreq(freq);
|
|
3710
|
+
this._freqs.push(v);
|
|
3711
|
+
if (v > this._maxFreq)
|
|
3712
|
+
this._maxFreq = v;
|
|
3713
|
+
this._totalPostings++;
|
|
3714
|
+
let ranges = this._slots.get(slot);
|
|
3715
|
+
if (ranges == null) {
|
|
3716
|
+
ranges = { starts: [writeIdx], lengths: [1] };
|
|
3717
|
+
this._slots.set(slot, ranges);
|
|
3718
|
+
return;
|
|
3719
|
+
}
|
|
3720
|
+
const last = ranges.starts.length - 1;
|
|
3721
|
+
const end = ranges.starts[last] + ranges.lengths[last];
|
|
3722
|
+
if (end === writeIdx) {
|
|
3723
|
+
ranges.lengths[last]++;
|
|
3724
|
+
}
|
|
3725
|
+
else {
|
|
3726
|
+
ranges.starts.push(writeIdx);
|
|
3727
|
+
ranges.lengths.push(1);
|
|
3728
|
+
}
|
|
3729
|
+
}
|
|
3730
|
+
clear() {
|
|
3731
|
+
this._slots.clear();
|
|
3732
|
+
// Drop global scratch backing so finalize does not retain duplicate posting bytes.
|
|
3733
|
+
this._docIds.truncate(0);
|
|
3734
|
+
this._freqs.truncate(0);
|
|
3735
|
+
}
|
|
3736
|
+
copySlot(ranges, allDocIds, allFreqs, write, docIdWidth) {
|
|
3737
|
+
for (let r = 0; r < ranges.starts.length; r++) {
|
|
3738
|
+
const start = ranges.starts[r];
|
|
3739
|
+
const len = ranges.lengths[r];
|
|
3740
|
+
this._docIds.copyRangeInto(start, len, allDocIds, write, docIdWidth);
|
|
3741
|
+
this._freqs.copyRangeInto(start, len, allFreqs, write);
|
|
3742
|
+
write += len;
|
|
3743
|
+
}
|
|
3744
|
+
return write;
|
|
3745
|
+
}
|
|
3746
|
+
slotLength(ranges) {
|
|
3747
|
+
let n = 0;
|
|
3748
|
+
for (let i = 0; i < ranges.lengths.length; i++)
|
|
3749
|
+
n += ranges.lengths[i];
|
|
3750
|
+
return n;
|
|
3751
|
+
}
|
|
3752
|
+
finalize(termCount, nextId) {
|
|
3753
|
+
const fieldCount = this._fieldCount;
|
|
3754
|
+
const totalPostings = this._totalPostings;
|
|
3755
|
+
const maxFreq = this._maxFreq;
|
|
3756
|
+
const slots = this._slots;
|
|
3757
|
+
const layout = choosePostingsLayout(fieldCount);
|
|
3758
|
+
const docIdWidth = nextId <= 65535 ? 16 : 32;
|
|
3759
|
+
const allDocIds = docIdWidth === 16
|
|
3760
|
+
? new Uint16Array(totalPostings)
|
|
3761
|
+
: new Uint32Array(totalPostings);
|
|
3762
|
+
const allFreqs = allocateFreqs(totalPostings, maxFreq);
|
|
3763
|
+
if (layout === 'dense') {
|
|
3764
|
+
const slotCount = termCount * fieldCount;
|
|
3765
|
+
const denseOffsets = new Uint32Array(slotCount);
|
|
3766
|
+
const denseLengths = new Uint32Array(slotCount);
|
|
3767
|
+
let write = 0;
|
|
3768
|
+
for (let ti = 0; ti < termCount; ti++) {
|
|
3769
|
+
const base = ti * fieldCount;
|
|
3770
|
+
for (let f = 0; f < fieldCount; f++) {
|
|
3771
|
+
const slot = base + f;
|
|
3772
|
+
const ranges = slots.get(slot);
|
|
3773
|
+
const len = ranges == null ? 0 : this.slotLength(ranges);
|
|
3774
|
+
denseOffsets[slot] = write;
|
|
3775
|
+
denseLengths[slot] = len;
|
|
3776
|
+
if (len > 0) {
|
|
3777
|
+
write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
|
|
3778
|
+
slots.delete(slot);
|
|
3779
|
+
}
|
|
3780
|
+
}
|
|
3781
|
+
}
|
|
3782
|
+
slots.clear();
|
|
3783
|
+
this.clear();
|
|
3784
|
+
return {
|
|
3785
|
+
fieldCount,
|
|
3786
|
+
termCount,
|
|
3787
|
+
nextId,
|
|
3788
|
+
layout,
|
|
3789
|
+
docIdWidth,
|
|
3790
|
+
sparseFieldIdWidth: null,
|
|
3791
|
+
allDocIds,
|
|
3792
|
+
allFreqs,
|
|
3793
|
+
denseOffsets,
|
|
3794
|
+
denseLengths,
|
|
3795
|
+
sparseTermStarts: null,
|
|
3796
|
+
sparseFieldIds: null,
|
|
3797
|
+
sparseOffsets: null,
|
|
3798
|
+
sparseLengths: null,
|
|
3799
|
+
};
|
|
3800
|
+
}
|
|
3801
|
+
const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
|
|
3802
|
+
const sparseFieldIdsScratch = [];
|
|
3803
|
+
const sparseOffsets = [];
|
|
3804
|
+
const sparseLengths = [];
|
|
3805
|
+
const termStarts = new Array(termCount + 1).fill(0);
|
|
3806
|
+
let write = 0;
|
|
3807
|
+
for (let ti = 0; ti < termCount; ti++) {
|
|
3808
|
+
termStarts[ti] = sparseFieldIdsScratch.length;
|
|
3809
|
+
for (let f = 0; f < fieldCount; f++) {
|
|
3810
|
+
const slot = ti * fieldCount + f;
|
|
3811
|
+
const ranges = slots.get(slot);
|
|
3812
|
+
const len = ranges == null ? 0 : this.slotLength(ranges);
|
|
3813
|
+
if (len === 0)
|
|
3814
|
+
continue;
|
|
3815
|
+
sparseFieldIdsScratch.push(f);
|
|
3816
|
+
sparseOffsets.push(write);
|
|
3817
|
+
sparseLengths.push(len);
|
|
3818
|
+
write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
|
|
3819
|
+
slots.delete(slot);
|
|
3820
|
+
}
|
|
3821
|
+
termStarts[ti + 1] = sparseFieldIdsScratch.length;
|
|
3822
|
+
}
|
|
3823
|
+
slots.clear();
|
|
3824
|
+
this.clear();
|
|
3825
|
+
const sparseFieldIds = sparseFieldIdWidth === 16
|
|
3826
|
+
? new Uint16Array(sparseFieldIdsScratch)
|
|
3827
|
+
: new Uint8Array(sparseFieldIdsScratch);
|
|
3828
|
+
return {
|
|
3829
|
+
fieldCount,
|
|
3830
|
+
termCount,
|
|
3831
|
+
nextId,
|
|
3832
|
+
layout,
|
|
3833
|
+
docIdWidth,
|
|
3834
|
+
sparseFieldIdWidth,
|
|
3835
|
+
allDocIds,
|
|
3836
|
+
allFreqs,
|
|
3837
|
+
denseOffsets: null,
|
|
3838
|
+
denseLengths: null,
|
|
3839
|
+
sparseTermStarts: new Uint32Array(termStarts),
|
|
3840
|
+
sparseFieldIds,
|
|
3841
|
+
sparseOffsets: new Uint32Array(sparseOffsets),
|
|
3842
|
+
sparseLengths: new Uint32Array(sparseLengths),
|
|
3843
|
+
};
|
|
3445
3844
|
}
|
|
3446
|
-
return decodeFrozenSnapshot(buf);
|
|
3447
3845
|
}
|
|
3448
3846
|
|
|
3449
|
-
function getOrCreateTermIndex(
|
|
3847
|
+
/**
 * Look up the numeric index of `term`, assigning the next free index on first sight.
 *
 * @param termCount - Mutable counter box (`{ value }`); incremented when a new term is registered.
 * @param index - Map-like store (`get` / `set`) from term to term index.
 * @param term - The term to resolve.
 * @returns The existing index for `term`, or the newly assigned one.
 */
function getOrCreateTermIndex(termCount, index, term) {
    const known = index.get(term);
    if (known == null) {
        // First occurrence: the current counter value becomes this term's index.
        const assigned = termCount.value;
        termCount.value = assigned + 1;
        index.set(term, assigned);
        return assigned;
    }
    return known;
}
|
|
3458
|
-
function appendPosting(state, termIndex, fieldId, docId, freq) {
|
|
3459
|
-
const slot = termIndex * state.fieldCount + fieldId;
|
|
3460
|
-
let docIds = state.postingsDocIds[slot];
|
|
3461
|
-
if (docIds == null) {
|
|
3462
|
-
docIds = [];
|
|
3463
|
-
state.postingsDocIds[slot] = docIds;
|
|
3464
|
-
state.postingsFreqs[slot] = [];
|
|
3465
|
-
}
|
|
3466
|
-
docIds.push(docId);
|
|
3467
|
-
state.postingsFreqs[slot].push(freq);
|
|
3468
|
-
const v = clampFreq(freq);
|
|
3469
|
-
if (v > state.maxFreq)
|
|
3470
|
-
state.maxFreq = v;
|
|
3471
|
-
state.totalPostings++;
|
|
3472
|
-
}
|
|
3473
|
-
function finalizeFlatPostings(state, nextId) {
|
|
3474
|
-
return materializeFrozenPostingsFromBuilder({
|
|
3475
|
-
fieldCount: state.fieldCount,
|
|
3476
|
-
termCount: state.terms.length,
|
|
3477
|
-
postingsDocIds: state.postingsDocIds,
|
|
3478
|
-
postingsFreqs: state.postingsFreqs,
|
|
3479
|
-
totalPostings: state.totalPostings,
|
|
3480
|
-
maxFreq: state.maxFreq,
|
|
3481
|
-
}, nextId);
|
|
3482
|
-
}
|
|
3483
3856
|
/** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
|
|
3484
3857
|
class FrozenIndexBuilder {
|
|
3485
3858
|
constructor(options, hints) {
|
|
3859
|
+
var _a, _b;
|
|
3860
|
+
this._termCount = { value: 0 };
|
|
3861
|
+
this._fieldTermFreqScratch = new Map();
|
|
3862
|
+
this._tokenScratch = [];
|
|
3486
3863
|
this._options = resolveIndexingOptions(options);
|
|
3487
3864
|
this._fieldIds = buildFieldIds(this._options.fields);
|
|
3488
3865
|
this._fieldCount = this._options.fields.length;
|
|
3489
3866
|
this._index = new SearchableMap();
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
this.
|
|
3867
|
+
const estimatedDocs = (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount) !== null && _a !== void 0 ? _a : 0;
|
|
3868
|
+
const perSlot = (_b = hints === null || hints === void 0 ? void 0 : hints.estimatedPostingsPerSlot) !== null && _b !== void 0 ? _b : 4;
|
|
3869
|
+
this._postings = new IncrementalPostingsAccumulator(this._fieldCount, {
|
|
3870
|
+
estimatedTotalPostings: estimatedDocs > 0 ? estimatedDocs * perSlot : undefined,
|
|
3871
|
+
});
|
|
3493
3872
|
this._avgFieldLength = [];
|
|
3494
3873
|
this._seenIds = new Set();
|
|
3495
3874
|
this._nextId = 0;
|
|
3496
3875
|
this._frozen = false;
|
|
3497
3876
|
const estimated = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount;
|
|
3877
|
+
this._storedFields = createStoredFieldsLayout(this._options.storeFields, estimated !== null && estimated !== void 0 ? estimated : 0);
|
|
3498
3878
|
if (estimated != null && estimated > 0) {
|
|
3499
3879
|
this._externalIds = new Array(estimated);
|
|
3500
|
-
this._storedFields = new Array(estimated);
|
|
3501
3880
|
this._fieldLengthData = new Array(estimated * this._fieldCount).fill(0);
|
|
3502
3881
|
}
|
|
3503
3882
|
else {
|
|
3504
3883
|
this._externalIds = [];
|
|
3505
|
-
this._storedFields = [];
|
|
3506
3884
|
this._fieldLengthData = [];
|
|
3507
3885
|
}
|
|
3508
|
-
this._postingsState = {
|
|
3509
|
-
fieldCount: this._fieldCount,
|
|
3510
|
-
terms: this._terms,
|
|
3511
|
-
postingsDocIds: this._postingsDocIds,
|
|
3512
|
-
postingsFreqs: this._postingsFreqs,
|
|
3513
|
-
totalPostings: 0,
|
|
3514
|
-
maxFreq: 0,
|
|
3515
|
-
};
|
|
3516
3886
|
}
|
|
3517
3887
|
/** Number of documents indexed so far (not yet frozen). */
|
|
3518
3888
|
get documentCount() {
|
|
@@ -3533,22 +3903,23 @@ class FrozenIndexBuilder {
|
|
|
3533
3903
|
this._seenIds.add(id);
|
|
3534
3904
|
const shortId = this._nextId++;
|
|
3535
3905
|
this._externalIds[shortId] = id;
|
|
3536
|
-
this._storedFields
|
|
3906
|
+
writeStoredField(this._storedFields, shortId, storeFields, extractField, document);
|
|
3537
3907
|
const documentCount = shortId + 1;
|
|
3538
3908
|
for (const field of fields) {
|
|
3539
3909
|
const fieldValue = extractField(document, field);
|
|
3540
3910
|
if (fieldValue == null)
|
|
3541
3911
|
continue;
|
|
3542
|
-
const
|
|
3912
|
+
const fieldText = typeof fieldValue === 'string'
|
|
3913
|
+
? fieldValue
|
|
3914
|
+
: stringifyField(fieldValue, field);
|
|
3543
3915
|
const fieldId = this._fieldIds[field];
|
|
3544
|
-
const uniqueTerms =
|
|
3545
|
-
const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
|
|
3916
|
+
const uniqueTerms = collectFieldTermFreqsFromFieldInto(this._fieldTermFreqScratch, this._tokenScratch, tokenize, fieldText, field, processTerm);
|
|
3546
3917
|
this._fieldLengthData[shortId * this._fieldCount + fieldId] = uniqueTerms;
|
|
3547
3918
|
updateAvgFieldLength(this._avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
|
|
3548
|
-
|
|
3549
|
-
const ti = getOrCreateTermIndex(this.
|
|
3550
|
-
|
|
3551
|
-
}
|
|
3919
|
+
this._fieldTermFreqScratch.forEach((freq, term) => {
|
|
3920
|
+
const ti = getOrCreateTermIndex(this._termCount, this._index, term);
|
|
3921
|
+
this._postings.append(ti, fieldId, shortId, freq);
|
|
3922
|
+
});
|
|
3552
3923
|
}
|
|
3553
3924
|
}
|
|
3554
3925
|
/**
|
|
@@ -3605,7 +3976,11 @@ class FrozenIndexBuilder {
|
|
|
3605
3976
|
}
|
|
3606
3977
|
this._frozen = true;
|
|
3607
3978
|
const documentCount = this._nextId;
|
|
3608
|
-
const
|
|
3979
|
+
const termCount = this._termCount.value;
|
|
3980
|
+
const postings = this._postings.finalize(termCount, documentCount);
|
|
3981
|
+
const radixTree = this._index.radixTree;
|
|
3982
|
+
this._index = null;
|
|
3983
|
+
const index = fromRadixTree(radixTree, termCount);
|
|
3609
3984
|
const avgFieldLength = new Float32Array(this._fieldCount);
|
|
3610
3985
|
for (let f = 0; f < this._fieldCount; f++) {
|
|
3611
3986
|
avgFieldLength[f] = (_a = this._avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
|
|
@@ -3614,12 +3989,8 @@ class FrozenIndexBuilder {
|
|
|
3614
3989
|
const externalIds = this._externalIds.length > documentCount
|
|
3615
3990
|
? this._externalIds.slice(0, documentCount)
|
|
3616
3991
|
: this._externalIds;
|
|
3617
|
-
const storedFields = this._storedFields
|
|
3618
|
-
? this._storedFields.slice(0, documentCount)
|
|
3619
|
-
: this._storedFields;
|
|
3992
|
+
const storedFields = resizeStoredFields(this._storedFields, documentCount);
|
|
3620
3993
|
const idLookup = createIdToShortIdLookup(externalIds, documentCount);
|
|
3621
|
-
// Incremental builder: numeric radix leaves + build-time terms[] for postings.
|
|
3622
|
-
// freezeFromMiniSearch packs Map leaves in one radix pass (no resident terms[]).
|
|
3623
3994
|
return {
|
|
3624
3995
|
options: this._options,
|
|
3625
3996
|
documentCount,
|
|
@@ -3631,8 +4002,8 @@ class FrozenIndexBuilder {
|
|
|
3631
4002
|
storedFields,
|
|
3632
4003
|
fieldLengthMatrix: materializeFieldLengthMatrix(this._fieldLengthData, documentCount * this._fieldCount),
|
|
3633
4004
|
avgFieldLength,
|
|
3634
|
-
index
|
|
3635
|
-
termCount
|
|
4005
|
+
index,
|
|
4006
|
+
termCount,
|
|
3636
4007
|
postings,
|
|
3637
4008
|
};
|
|
3638
4009
|
}
|
|
@@ -4046,7 +4417,7 @@ function shallowCopyJsSnapshotFields(params) {
|
|
|
4046
4417
|
return {
|
|
4047
4418
|
fieldIds: { ...params.fieldIds },
|
|
4048
4419
|
options: shallowCopyOptions(params.options),
|
|
4049
|
-
storedFields: params.storedFields
|
|
4420
|
+
storedFields: cloneStoredFields(params.storedFields),
|
|
4050
4421
|
};
|
|
4051
4422
|
}
|
|
4052
4423
|
/**
|
|
@@ -4131,7 +4502,7 @@ class FrozenMiniSearch {
|
|
|
4131
4502
|
fieldIds: this._fieldIds,
|
|
4132
4503
|
getFieldLength: (docId, fieldId) => this.getFieldLength(docId, fieldId),
|
|
4133
4504
|
getExternalId: docId => this._externalIds[docId],
|
|
4134
|
-
getStoredFields: docId => this._storedFields
|
|
4505
|
+
getStoredFields: docId => readStoredFields(this._storedFields, docId),
|
|
4135
4506
|
};
|
|
4136
4507
|
this._queryEngineParams = {
|
|
4137
4508
|
fields: this._options.fields,
|
|
@@ -4143,7 +4514,7 @@ class FrozenMiniSearch {
|
|
|
4143
4514
|
const id = this._externalIds[shortId];
|
|
4144
4515
|
if (id === undefined)
|
|
4145
4516
|
continue;
|
|
4146
|
-
callback(shortId, id, this._storedFields
|
|
4517
|
+
callback(shortId, id, readStoredFields(this._storedFields, shortId));
|
|
4147
4518
|
}
|
|
4148
4519
|
}),
|
|
4149
4520
|
aggregateContext: this._aggregateContext,
|
|
@@ -4154,11 +4525,7 @@ class FrozenMiniSearch {
|
|
|
4154
4525
|
memoryBreakdown() {
|
|
4155
4526
|
const termCount = this.termCount;
|
|
4156
4527
|
const postingsStats = postingsTypedBytes(this._postings);
|
|
4157
|
-
|
|
4158
|
-
for (const row of this._storedFields) {
|
|
4159
|
-
if (row != null)
|
|
4160
|
-
storedJson += JSON.stringify(row).length;
|
|
4161
|
-
}
|
|
4528
|
+
const storedJson = storedFieldsJsonBytes(this._storedFields);
|
|
4162
4529
|
const radixEst = this._index.packedByteLength();
|
|
4163
4530
|
const idMapBytes = this._idLookup.mode === 'lazy-map' ? this._idLookup.mapEntryCount * 32 : 0;
|
|
4164
4531
|
const estimatedStructuredBytes = postingsStats.totalTypedBytes
|
|
@@ -4188,7 +4555,7 @@ class FrozenMiniSearch {
|
|
|
4188
4555
|
},
|
|
4189
4556
|
documents: {
|
|
4190
4557
|
externalIdsSlots: this._externalIds.length,
|
|
4191
|
-
storedFieldsSlots: this._storedFields
|
|
4558
|
+
storedFieldsSlots: storedFieldsSlotCount(this._storedFields),
|
|
4192
4559
|
idLookupMode: this._idLookup.mode,
|
|
4193
4560
|
idToShortIdEntries: this._idLookup.mapEntryCount,
|
|
4194
4561
|
fieldLengthMatrixBytes: this._fieldLengthMatrix.byteLength,
|
|
@@ -4203,10 +4570,10 @@ class FrozenMiniSearch {
|
|
|
4203
4570
|
}
|
|
4204
4571
|
getStoredFields(id) {
|
|
4205
4572
|
const shortId = this._idLookup.get(id);
|
|
4206
|
-
return shortId == null ? undefined : this._storedFields
|
|
4573
|
+
return shortId == null ? undefined : readStoredFields(this._storedFields, shortId);
|
|
4207
4574
|
}
|
|
4208
4575
|
search(query, searchOptions = {}) {
|
|
4209
|
-
return finalizeRawSearchResults(this.executeQuery(query, searchOptions), query, searchOptions, this._options.searchOptions, docId => this._externalIds[docId], docId => this._storedFields
|
|
4576
|
+
return finalizeRawSearchResults(this.executeQuery(query, searchOptions), query, searchOptions, this._options.searchOptions, docId => this._externalIds[docId], docId => readStoredFields(this._storedFields, docId));
|
|
4210
4577
|
}
|
|
4211
4578
|
autoSuggest(queryString, options = {}) {
|
|
4212
4579
|
const merged = { ...this._options.autoSuggestOptions, ...options };
|
|
@@ -4222,7 +4589,8 @@ class FrozenMiniSearch {
|
|
|
4222
4589
|
fieldNames: fieldNamesFromFieldIds(this._fieldIds),
|
|
4223
4590
|
avgFieldLength: this._avgFieldLength,
|
|
4224
4591
|
externalIds: this._externalIds,
|
|
4225
|
-
storedFields: this.
|
|
4592
|
+
storedFields: new Array(this._nextId),
|
|
4593
|
+
storedFieldsLayout: this._storedFields,
|
|
4226
4594
|
fieldLengthMatrix: fieldLengthMatrixForWire(this._fieldLengthMatrix),
|
|
4227
4595
|
treeShape: [],
|
|
4228
4596
|
postings: this._postings,
|
|
@@ -4238,7 +4606,8 @@ class FrozenMiniSearch {
|
|
|
4238
4606
|
fieldNames: fieldNamesFromFieldIds(this._fieldIds),
|
|
4239
4607
|
avgFieldLength: this._avgFieldLength,
|
|
4240
4608
|
externalIds: this._externalIds,
|
|
4241
|
-
storedFields: this.
|
|
4609
|
+
storedFields: new Array(this._nextId),
|
|
4610
|
+
storedFieldsLayout: this._storedFields,
|
|
4242
4611
|
fieldLengthMatrix: fieldLengthMatrixForWire(this._fieldLengthMatrix),
|
|
4243
4612
|
treeShape: [],
|
|
4244
4613
|
postings: this._postings,
|
|
@@ -4246,16 +4615,20 @@ class FrozenMiniSearch {
|
|
|
4246
4615
|
}
|
|
4247
4616
|
/** Load a frozen binary snapshot. */
|
|
4248
4617
|
static loadBinarySync(buffer, options = {}) {
|
|
4249
|
-
|
|
4618
|
+
var _a;
|
|
4619
|
+
const storeFields = (_a = options.storeFields) !== null && _a !== void 0 ? _a : defaultFrozenLoadOptions.storeFields;
|
|
4620
|
+
const snap = decodeFrozenSnapshot(buffer, { storeFields });
|
|
4250
4621
|
return FrozenMiniSearch.fromBinarySnapshot(snap, options);
|
|
4251
4622
|
}
|
|
4252
4623
|
/** Load a frozen binary snapshot with streaming zstd decompression (bounded memory). */
|
|
4253
4624
|
static async loadBinaryAsync(buffer, options = {}) {
|
|
4254
|
-
|
|
4625
|
+
var _a;
|
|
4626
|
+
const storeFields = (_a = options.storeFields) !== null && _a !== void 0 ? _a : defaultFrozenLoadOptions.storeFields;
|
|
4627
|
+
const snap = await decodeFrozenSnapshotAsync(buffer, { storeFields });
|
|
4255
4628
|
return FrozenMiniSearch.fromBinarySnapshot(snap, options);
|
|
4256
4629
|
}
|
|
4257
4630
|
static fromBinarySnapshot(snap, options) {
|
|
4258
|
-
var _a, _b;
|
|
4631
|
+
var _a, _b, _c;
|
|
4259
4632
|
const snapshotFields = (_a = snap.fieldNames) !== null && _a !== void 0 ? _a : fieldNamesFromFieldIds(snap.fieldIds);
|
|
4260
4633
|
if (options.fields != null) {
|
|
4261
4634
|
assertFieldsMatchSnapshot(options.fields, snap.fieldIds);
|
|
@@ -4283,7 +4656,7 @@ class FrozenMiniSearch {
|
|
|
4283
4656
|
fieldCount: snap.fieldCount,
|
|
4284
4657
|
externalIds: snap.externalIds,
|
|
4285
4658
|
idLookup,
|
|
4286
|
-
storedFields: snap.storedFields,
|
|
4659
|
+
storedFields: (_c = snap.storedFieldsLayout) !== null && _c !== void 0 ? _c : storedFieldsFromRows(snap.storedFields, opts.storeFields),
|
|
4287
4660
|
fieldLengthMatrix: snap.fieldLengthMatrix,
|
|
4288
4661
|
avgFieldLength: snap.avgFieldLength,
|
|
4289
4662
|
index,
|
|
@@ -4341,6 +4714,7 @@ FrozenMiniSearch.wildcard = WILDCARD_QUERY;
|
|
|
4341
4714
|
exports.AND = AND;
|
|
4342
4715
|
exports.AND_NOT = AND_NOT;
|
|
4343
4716
|
exports.FrozenIndexBuilder = FrozenIndexBuilder;
|
|
4717
|
+
exports.FrozenMiniSearch = FrozenMiniSearch;
|
|
4344
4718
|
exports.OR = OR;
|
|
4345
4719
|
exports.assembleFrozen = assembleFrozen;
|
|
4346
4720
|
exports.buildFrozenFromDocuments = buildFrozenFromDocuments;
|