@yoch/frozenminisearch 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/README.md +41 -11
- package/dist/cjs/index.cjs +913 -539
- package/dist/cjs/index.require.cjs +1 -0
- package/dist/es/index.d.ts +24 -7
- package/dist/es/index.js +913 -540
- package/package.json +5 -2
package/dist/es/index.js
CHANGED
|
@@ -1907,111 +1907,6 @@ function materializeFrozenPostings(params) {
|
|
|
1907
1907
|
sparseLengths: new Uint32Array(sparseLengths),
|
|
1908
1908
|
};
|
|
1909
1909
|
}
|
|
1910
|
-
/** One-pass materialize from {@link FrozenIndexBuilder} scratch (counts known upfront). */
|
|
1911
|
-
function materializeFrozenPostingsFromBuilder(state, nextId) {
|
|
1912
|
-
var _a;
|
|
1913
|
-
const { fieldCount, termCount, postingsDocIds, postingsFreqs, totalPostings, maxFreq } = state;
|
|
1914
|
-
const layout = choosePostingsLayout(fieldCount);
|
|
1915
|
-
const docIdWidth = nextId <= 65535 ? 16 : 32;
|
|
1916
|
-
const allDocIds = docIdWidth === 16
|
|
1917
|
-
? new Uint16Array(totalPostings)
|
|
1918
|
-
: new Uint32Array(totalPostings);
|
|
1919
|
-
const allFreqs = allocateFreqs(totalPostings, maxFreq);
|
|
1920
|
-
if (layout === 'dense') {
|
|
1921
|
-
const slotCount = termCount * fieldCount;
|
|
1922
|
-
const denseOffsets = new Uint32Array(slotCount);
|
|
1923
|
-
const denseLengths = new Uint32Array(slotCount);
|
|
1924
|
-
let write = 0;
|
|
1925
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1926
|
-
const base = ti * fieldCount;
|
|
1927
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1928
|
-
const slot = base + f;
|
|
1929
|
-
const docIds = postingsDocIds[slot];
|
|
1930
|
-
const freqs = postingsFreqs[slot];
|
|
1931
|
-
const len = (_a = docIds === null || docIds === void 0 ? void 0 : docIds.length) !== null && _a !== void 0 ? _a : 0;
|
|
1932
|
-
denseOffsets[slot] = write;
|
|
1933
|
-
denseLengths[slot] = len;
|
|
1934
|
-
for (let i = 0; i < len; i++) {
|
|
1935
|
-
const docId = docIds[i];
|
|
1936
|
-
if (docIdWidth === 16) {
|
|
1937
|
-
allDocIds[write] = docId;
|
|
1938
|
-
}
|
|
1939
|
-
else {
|
|
1940
|
-
allDocIds[write] = docId;
|
|
1941
|
-
}
|
|
1942
|
-
allFreqs[write] = freqs[i];
|
|
1943
|
-
write++;
|
|
1944
|
-
}
|
|
1945
|
-
}
|
|
1946
|
-
}
|
|
1947
|
-
return {
|
|
1948
|
-
fieldCount,
|
|
1949
|
-
termCount,
|
|
1950
|
-
nextId,
|
|
1951
|
-
layout,
|
|
1952
|
-
docIdWidth,
|
|
1953
|
-
sparseFieldIdWidth: null,
|
|
1954
|
-
allDocIds,
|
|
1955
|
-
allFreqs,
|
|
1956
|
-
denseOffsets,
|
|
1957
|
-
denseLengths,
|
|
1958
|
-
sparseTermStarts: null,
|
|
1959
|
-
sparseFieldIds: null,
|
|
1960
|
-
sparseOffsets: null,
|
|
1961
|
-
sparseLengths: null,
|
|
1962
|
-
};
|
|
1963
|
-
}
|
|
1964
|
-
const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
|
|
1965
|
-
const sparseFieldIdsScratch = [];
|
|
1966
|
-
const sparseOffsets = [];
|
|
1967
|
-
const sparseLengths = [];
|
|
1968
|
-
const termStarts = new Array(termCount + 1).fill(0);
|
|
1969
|
-
let write = 0;
|
|
1970
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1971
|
-
termStarts[ti] = sparseFieldIdsScratch.length;
|
|
1972
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1973
|
-
const slot = ti * fieldCount + f;
|
|
1974
|
-
const docIds = postingsDocIds[slot];
|
|
1975
|
-
if (docIds == null || docIds.length === 0)
|
|
1976
|
-
continue;
|
|
1977
|
-
const freqs = postingsFreqs[slot];
|
|
1978
|
-
sparseFieldIdsScratch.push(f);
|
|
1979
|
-
sparseOffsets.push(write);
|
|
1980
|
-
sparseLengths.push(docIds.length);
|
|
1981
|
-
for (let i = 0; i < docIds.length; i++) {
|
|
1982
|
-
const docId = docIds[i];
|
|
1983
|
-
if (docIdWidth === 16) {
|
|
1984
|
-
allDocIds[write] = docId;
|
|
1985
|
-
}
|
|
1986
|
-
else {
|
|
1987
|
-
allDocIds[write] = docId;
|
|
1988
|
-
}
|
|
1989
|
-
allFreqs[write] = freqs[i];
|
|
1990
|
-
write++;
|
|
1991
|
-
}
|
|
1992
|
-
}
|
|
1993
|
-
termStarts[ti + 1] = sparseFieldIdsScratch.length;
|
|
1994
|
-
}
|
|
1995
|
-
const sparseFieldIds = sparseFieldIdWidth === 16
|
|
1996
|
-
? new Uint16Array(sparseFieldIdsScratch)
|
|
1997
|
-
: new Uint8Array(sparseFieldIdsScratch);
|
|
1998
|
-
return {
|
|
1999
|
-
fieldCount,
|
|
2000
|
-
termCount,
|
|
2001
|
-
nextId,
|
|
2002
|
-
layout,
|
|
2003
|
-
docIdWidth,
|
|
2004
|
-
sparseFieldIdWidth,
|
|
2005
|
-
allDocIds,
|
|
2006
|
-
allFreqs,
|
|
2007
|
-
denseOffsets: null,
|
|
2008
|
-
denseLengths: null,
|
|
2009
|
-
sparseTermStarts: new Uint32Array(termStarts),
|
|
2010
|
-
sparseFieldIds,
|
|
2011
|
-
sparseOffsets: new Uint32Array(sparseOffsets),
|
|
2012
|
-
sparseLengths: new Uint32Array(sparseLengths),
|
|
2013
|
-
};
|
|
2014
|
-
}
|
|
2015
1910
|
function postingsTypedBytes(layout) {
|
|
2016
1911
|
const allDocIdsBytes = layout.allDocIds.byteLength;
|
|
2017
1912
|
const allFreqsBytes = layout.allFreqs.byteLength;
|
|
@@ -2121,24 +2016,33 @@ function findSparseSlotByFieldId(fieldIds, start, end, fieldId) {
|
|
|
2121
2016
|
}
|
|
2122
2017
|
return -1;
|
|
2123
2018
|
}
|
|
2124
|
-
/**
|
|
2125
|
-
|
|
2019
|
+
/** Reusable scratch for {@link resolvePostingSlice} (scoring is synchronous). */
|
|
2020
|
+
const postingSliceScratch = { offset: 0, length: 0 };
|
|
2021
|
+
/**
 * Locate the posting run for one (termIndex, fieldId) pair inside the flat buffers.
 *
 * The result is written into `out.offset` / `out.length`; nothing is allocated.
 * `out` is left untouched when the function returns false.
 *
 * @param layout frozen postings layout (`layout.layout` selects dense or sparse addressing)
 * @param termIndex index of the term
 * @param fieldId numeric field id
 * @param out receiver object with `offset` and `length` properties
 * @returns false when the slot is empty or missing, true otherwise
 */
function resolvePostingSlice(layout, termIndex, fieldId, out) {
    let offsets;
    let lengths;
    let slot;
    if (layout.layout === 'dense') {
        // Dense: one slot per (term, field), addressed arithmetically.
        slot = termIndex * layout.fieldCount + fieldId;
        offsets = layout.denseOffsets;
        lengths = layout.denseLengths;
    }
    else {
        // Sparse: search the term's [start, end) window for the field id.
        const termStarts = layout.sparseTermStarts;
        slot = findSparseSlotByFieldId(layout.sparseFieldIds, termStarts[termIndex], termStarts[termIndex + 1], fieldId);
        if (slot < 0) {
            return false;
        }
        offsets = layout.sparseOffsets;
        lengths = layout.sparseLengths;
    }
    const runLength = lengths[slot];
    if (runLength === 0) {
        return false;
    }
    out.offset = offsets[slot];
    out.length = runLength;
    return true;
}
|
|
2143
2047
|
/**
|
|
2144
2048
|
* One flyweight wrapper for the lifetime of a frozen index. Call {@link bind} before each
|
|
@@ -2154,10 +2058,9 @@ function createFrozenFieldTermFlyweight(layout) {
|
|
|
2154
2058
|
return flyweight;
|
|
2155
2059
|
},
|
|
2156
2060
|
get(fieldId) {
|
|
2157
|
-
|
|
2158
|
-
if (slice == null)
|
|
2061
|
+
if (!resolvePostingSlice(layout, termIndex, fieldId, postingSliceScratch))
|
|
2159
2062
|
return undefined;
|
|
2160
|
-
return segment.rebind(
|
|
2063
|
+
return segment.rebind(postingSliceScratch.offset, postingSliceScratch.length);
|
|
2161
2064
|
},
|
|
2162
2065
|
};
|
|
2163
2066
|
return flyweight;
|
|
@@ -2176,10 +2079,9 @@ function collectDocIdsFromFrozenSegment(allDocIds, offset, length, context, docI
|
|
|
2176
2079
|
function collectDocIdsFromFrozenLayout(layout, termIndex, fieldBoosts, context, docIds, allowedDocs) {
|
|
2177
2080
|
const { fieldIds } = context;
|
|
2178
2081
|
for (const field of fieldBoosts.names) {
|
|
2179
|
-
|
|
2180
|
-
if (slice == null)
|
|
2082
|
+
if (!resolvePostingSlice(layout, termIndex, fieldIds[field], postingSliceScratch))
|
|
2181
2083
|
continue;
|
|
2182
|
-
collectDocIdsFromFrozenSegment(layout.allDocIds,
|
|
2084
|
+
collectDocIdsFromFrozenSegment(layout.allDocIds, postingSliceScratch.offset, postingSliceScratch.length, context, docIds, allowedDocs);
|
|
2183
2085
|
}
|
|
2184
2086
|
}
|
|
2185
2087
|
|
|
@@ -2221,45 +2123,499 @@ function resolveIndexingOptions(options) {
|
|
|
2221
2123
|
autoSuggestOptions: { ...defaultAutoSuggestOptions, ...(options.autoSuggestOptions || {}) },
|
|
2222
2124
|
};
|
|
2223
2125
|
}
|
|
2224
|
-
function buildFieldIds(fields) {
|
|
2225
|
-
const fieldIds = {};
|
|
2226
|
-
for (let i = 0; i < fields.length; i++) {
|
|
2227
|
-
fieldIds[fields[i]] = i;
|
|
2126
|
+
/**
 * Build a name → numeric id table where each field's id is its position in `fields`.
 * If a name occurs more than once, the last position overwrites earlier ones.
 *
 * @param fields array of field names
 * @returns plain object keyed by field name
 */
function buildFieldIds(fields) {
    const idsByName = {};
    for (const [position, name] of fields.entries()) {
        idsByName[name] = position;
    }
    return idsByName;
}
|
|
2133
|
+
/**
 * Add the output of a term processor to a frequency map.
 *
 * An array result counts every element (including falsy ones); a scalar result is
 * counted only when truthy, so `null`, `undefined`, `false` and `''` are dropped.
 *
 * @param localFreqs Map from term to occurrence count, updated in place
 * @param processedTerm a term, an array of terms, or a falsy value
 */
function accumulateProcessedTerm(localFreqs, processedTerm) {
    if (!Array.isArray(processedTerm)) {
        if (!processedTerm) {
            return;
        }
        const seen = localFreqs.get(processedTerm) || 0;
        localFreqs.set(processedTerm, seen + 1);
        return;
    }
    for (const expanded of processedTerm) {
        const seen = localFreqs.get(expanded) || 0;
        localFreqs.set(expanded, seen + 1);
    }
}
|
|
2143
|
+
/**
 * Rebuild `localFreqs` from the tokens of a single document field.
 *
 * The map is emptied first, then every token is run through `processTerm(token, fieldName)`
 * and the result is tallied via {@link accumulateProcessedTerm}.
 *
 * @returns the number of distinct processed terms, i.e. the map's size afterwards
 */
function collectFieldTermFreqsInto(localFreqs, tokens, fieldName, processTerm) {
    localFreqs.clear();
    for (const rawToken of tokens) {
        const processed = processTerm(rawToken, fieldName);
        accumulateProcessedTerm(localFreqs, processed);
    }
    return localFreqs.size;
}
|
|
2154
|
+
/** Global delimiter pattern for incremental `exec` (must not reuse {@link SPACE_OR_PUNCTUATION} — no `g` flag). */
|
|
2155
|
+
const DEFAULT_TOKENIZE_DELIMITERS = /[\n\r\p{Z}\p{P}]+/gu;
|
|
2156
|
+
const defaultTokenizeProbe = 'a b';
|
|
2157
|
+
const defaultTokenizeProbeField = 'f';
|
|
2158
|
+
const tokenizeBehaviorCache = new WeakMap();
|
|
2159
|
+
/**
 * Decide whether `tokenize` may be replaced by the built-in fast tokenizer.
 *
 * True when `tokenize` is the library default by reference, or when it produces exactly
 * what `String.prototype.split(SPACE_OR_PUNCTUATION)` produces on every probe string.
 * The verdict for non-default functions is memoized in `tokenizeBehaviorCache`.
 *
 * Besides the original short probe, a second probe containing a leading delimiter,
 * punctuation, a non-breaking space, an uppercase letter and a trailing newline is used,
 * so whitespace-only, empty-filtering or case-folding tokenizers are no longer mistaken
 * for the default. This remains a heuristic: pass the default tokenizer by reference
 * when the fast path is required.
 *
 * @param tokenize tokenizer function `(text, fieldName) => string[]`
 * @returns true when the fast path may be used
 */
function isDefaultTokenize(tokenize) {
    if (tokenize === defaultFrozenLoadOptions.tokenize) {
        return true;
    }
    const cached = tokenizeBehaviorCache.get(tokenize);
    if (cached != null) {
        return cached;
    }
    const probes = [defaultTokenizeProbe, ' A,b\u00a0c-d\n'];
    const ok = probes.every((probe) => {
        const expected = probe.split(SPACE_OR_PUNCTUATION);
        const actual = tokenize(probe, defaultTokenizeProbeField);
        // A non-array result (null, iterator, ...) can never equal split() output.
        return Array.isArray(actual)
            && expected.length === actual.length
            && expected.every((token, i) => token === actual[i]);
    });
    tokenizeBehaviorCache.set(tokenize, ok);
    return ok;
}
|
|
2177
|
+
/**
 * Call `onToken` for every piece `text.split(<default delimiters>)` would yield, in order,
 * without building the intermediate array.
 *
 * Like `split`, empty text yields one empty token, and a leading or trailing delimiter
 * run yields an empty token at that end.
 *
 * The scan uses the shared global-flag regex `DEFAULT_TOKENIZE_DELIMITERS`. Its
 * `lastIndex` is set explicitly before every `exec`, so a callback that re-enters this
 * function (or otherwise touches the regex) cannot derail the outer scan.
 *
 * @param text string to tokenize
 * @param onToken callback receiving each token
 */
function forEachDefaultToken(text, onToken) {
    if (text.length === 0) {
        onToken('');
        return;
    }
    const re = DEFAULT_TOKENIZE_DELIMITERS;
    let start = 0;
    for (;;) {
        // Pin the scan position: the regex is module-level mutable state.
        re.lastIndex = start;
        const match = re.exec(text);
        if (match === null) {
            break;
        }
        if (match.index > start) {
            onToken(text.slice(start, match.index));
        }
        else {
            // Only a delimiter run at the very start of the text matches at `start`
            // (the pattern is greedy); split() reports an empty leading token there.
            onToken('');
        }
        start = match.index + match[0].length;
    }
    if (start < text.length) {
        onToken(text.slice(start));
    }
    else {
        // Text ended on a delimiter run: split() reports an empty trailing token.
        onToken('');
    }
}
|
|
2205
|
+
/**
 * Run the default tokenizer over `text`, storing the tokens in the caller-supplied array.
 * `out` is emptied first and then refilled in token order, so one array can be reused
 * across calls.
 */
function tokenizeDefaultInto(out, text) {
    out.length = 0;
    const append = (token) => out.push(token);
    forEachDefaultToken(text, append);
}
|
|
2210
|
+
/**
 * Tokenize one field's text into the reusable array `out`.
 *
 * When `tokenize` is recognised as the library default the built-in tokenizer fills
 * `out` directly. Otherwise `tokenize(text, fieldName)` is called and its result copied
 * into `out`; `out` is cleared only after `tokenize` has returned.
 *
 * Tokens are appended one at a time rather than with `out.push(...tokens)`: spreading
 * passes every token as a call argument, which throws a RangeError once a field yields
 * more tokens than the engine's argument limit.
 *
 * @param out array that receives the tokens (previous content is discarded)
 * @param tokenize tokenizer function `(text, fieldName) => iterable of tokens`
 * @param text field text
 * @param fieldName name of the field being tokenized
 */
function tokenizeFieldInto(out, tokenize, text, fieldName) {
    if (isDefaultTokenize(tokenize)) {
        tokenizeDefaultInto(out, text);
        return;
    }
    const tokens = tokenize(text, fieldName);
    out.length = 0;
    for (const token of tokens) {
        out.push(token);
    }
}
|
|
2220
|
+
/**
 * Single-pass variant for the default tokenizer: each token produced by
 * {@link forEachDefaultToken} is processed and tallied immediately, so no token array
 * is built. `localFreqs` is cleared first.
 *
 * @returns the number of distinct processed terms (the map's size afterwards)
 */
function collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm) {
    localFreqs.clear();
    const tally = (token) => {
        const processed = processTerm(token, fieldName);
        accumulateProcessedTerm(localFreqs, processed);
    };
    forEachDefaultToken(text, tally);
    return localFreqs.size;
}
|
|
2227
|
+
/**
 * Fill `localFreqs` with the term frequencies of one field's text.
 *
 * Default tokenizer: tokenization and counting happen in a single pass and
 * `tokenScratch` is not touched. Any other tokenizer: the tokens are first collected
 * into `tokenScratch`, then counted in a second pass.
 *
 * @returns the number of distinct processed terms
 */
function collectFieldTermFreqsFromFieldInto(localFreqs, tokenScratch, tokenize, text, fieldName, processTerm) {
    if (!isDefaultTokenize(tokenize)) {
        tokenizeFieldInto(tokenScratch, tokenize, text, fieldName);
        return collectFieldTermFreqsInto(localFreqs, tokenScratch, fieldName, processTerm);
    }
    return collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm);
}
|
|
2238
|
+
/**
 * Fold one more field length into a running per-field average, in place.
 *
 * @param avgFieldLength array of averages indexed by field id (a missing or falsy entry counts as 0)
 * @param fieldId index of the average to update
 * @param count number of lengths already represented by the stored average
 * @param length the new length to include
 */
function updateAvgFieldLength(avgFieldLength, fieldId, count, length) {
    const previousAverage = avgFieldLength[fieldId] || 0;
    avgFieldLength[fieldId] = ((previousAverage * count) + length) / (count + 1);
}
|
|
2243
|
+
|
|
2244
|
+
/**
 * Recursively check a serialized term tree.
 *
 * Every node must be an array of two-element `[key, value]` entries. Under the `LEAF`
 * key the value must be an integer term index in `[0, termCount)`; under any other key
 * the value is a child node that is validated the same way.
 *
 * @throws the error produced by `invalidFrozenIndex` on the first violation found
 */
function validateTreeShape(shape, termCount) {
    if (!Array.isArray(shape)) {
        throw invalidFrozenIndex('treeShape node must be an array');
    }
    for (const entry of shape) {
        const isPair = Array.isArray(entry) && entry.length === 2;
        if (!isPair) {
            throw invalidFrozenIndex('treeShape entry must be a [key, value] pair');
        }
        const [key, value] = entry;
        if (key !== LEAF) {
            validateTreeShape(value, termCount);
            continue;
        }
        const inRange = Number.isInteger(value) && value >= 0 && value < termCount;
        if (!inRange) {
            throw invalidFrozenIndex(`treeShape leaf term index out of range: ${value}`);
        }
    }
}
|
|
2264
|
+
/** Number of terms in a snapshot, read from its postings layout. */
function termCountOf(snap) {
    const { postings } = snap;
    return postings.termCount;
}
|
|
2267
|
+
/**
 * Check the numeric and structural invariants of a snapshot; used for both decoded
 * (untrusted) and freshly built data.
 *
 * Checks, in order: positive `fieldCount`; `nextId` within `[0, 0xffffffff)`;
 * `documentCount` within `[0, nextId]`; `fieldLengthMatrix` holds `nextId * fieldCount`
 * entries; `avgFieldLength` holds `fieldCount` entries; the postings layout passes
 * `validateFrozenPostingsLayout`; `fieldIds` has `fieldCount` keys and every id
 * `0 .. fieldCount - 1` is assigned to some field.
 *
 * @throws the error produced by `invalidFrozenIndex` on the first violation
 */
function validateFrozenSnapshotNumeric(snap) {
    const { fieldCount, nextId, documentCount } = snap;
    if (fieldCount <= 0) {
        throw invalidFrozenIndex('fieldCount must be positive');
    }
    if (nextId < 0 || nextId >= 0xffffffff) {
        throw invalidFrozenIndex('nextId out of range');
    }
    if (documentCount < 0 || documentCount > nextId) {
        throw invalidFrozenIndex('documentCount inconsistent with nextId');
    }
    if (snap.fieldLengthMatrix.length !== nextId * fieldCount) {
        throw invalidFrozenIndex('fieldLengthMatrix size mismatch');
    }
    if (snap.avgFieldLength.length !== fieldCount) {
        throw invalidFrozenIndex('avgFieldLength size mismatch');
    }
    const failWith = (detail) => {
        throw invalidFrozenIndex(detail);
    };
    validateFrozenPostingsLayout(snap.postings, documentCount, nextId, failWith);
    const fieldNames = Object.keys(snap.fieldIds);
    if (fieldNames.length !== fieldCount) {
        throw invalidFrozenIndex('fieldIds count mismatch');
    }
    // Collect the assigned ids once, then require each expected id to be present.
    const assignedIds = new Set(fieldNames.map((name) => snap.fieldIds[name]));
    for (let id = 0; id < fieldCount; id++) {
        if (!assignedIds.has(id)) {
            throw invalidFrozenIndex(`missing field id ${id}`);
        }
    }
}
|
|
2301
|
+
/**
 * Decode `fieldCount` consecutive length-prefixed UTF-8 strings starting at `fieldNamesOff`.
 *
 * @param externalIdsOff offset at which the section must end
 * @returns the decoded field names in order
 * @throws the error produced by `invalidFrozenIndex` when decoding does not finish exactly at `externalIdsOff`
 */
function readFieldNamesSection(buf, fieldNamesOff, fieldCount, externalIdsOff) {
    const names = [];
    let cursor = fieldNamesOff;
    while (names.length < fieldCount) {
        const decoded = readLengthPrefixedUtf8(buf, cursor);
        names.push(decoded.value);
        cursor = decoded.next;
    }
    if (cursor !== externalIdsOff) {
        throw invalidFrozenIndex('field names section size mismatch');
    }
    return names;
}
|
|
2314
|
+
/**
 * Decode `nextId` consecutive external ids starting at `externalIdsOff`.
 *
 * @param storedOff offset at which the section must end
 * @returns array of length `nextId` holding the decoded ids
 * @throws the error produced by `invalidFrozenIndex` when decoding does not finish exactly at `storedOff`
 */
function readExternalIdsSection(buf, externalIdsOff, nextId, storedOff) {
    const ids = new Array(nextId);
    let cursor = externalIdsOff;
    for (let slot = 0; slot < nextId; slot += 1) {
        const decoded = readExternalId(buf, cursor);
        ids[slot] = decoded.value;
        cursor = decoded.next;
    }
    if (cursor !== storedOff) {
        throw invalidFrozenIndex('external ids section size mismatch');
    }
    return ids;
}
|
|
2327
|
+
/**
 * Decode the stored-fields section into one row per document slot.
 *
 * The section starts with `nextId` little-endian uint32 table entries. An entry of 0
 * means the slot has no stored fields (`undefined` in the result); any other entry is
 * resolved and JSON-parsed by {@link readStoredFieldJsonAt}.
 *
 * Bounds checks are delegated to {@link storedFieldsTableEnd} and
 * {@link readStoredFieldJsonAt} so this reader and the layout-aware wire reader share
 * one implementation; the table bounds are verified before the result array is allocated.
 *
 * @param buf buffer containing the section
 * @param storedOff offset of the first table entry
 * @param nextId number of document slots
 * @param sectionEnd offset one past the last byte of the section
 * @returns array of length `nextId` with a parsed row or `undefined` per slot
 * @throws the error produced by `invalidFrozenIndex` when the table or an entry is out of bounds
 */
function readStoredFieldsSection(buf, storedOff, nextId, sectionEnd) {
    const tableEnd = storedFieldsTableEnd(storedOff, nextId, sectionEnd);
    const storedFields = new Array(nextId);
    for (let i = 0; i < nextId; i++) {
        const rel = buf.readUInt32LE(storedOff + i * 4);
        storedFields[i] = rel === 0
            ? undefined
            : readStoredFieldJsonAt(buf, tableEnd, sectionEnd, rel);
    }
    return storedFields;
}
|
|
2353
|
+
/**
 * Validate a decoded or assembled frozen snapshot.
 *
 * Runs the numeric checks first, then validates the leaves of whichever term-index
 * representation is present, in this order of precedence: `packedTermIndex`,
 * `termTree`, otherwise `treeShape`.
 */
function validateFrozenSnapshot(snap) {
    validateFrozenSnapshotNumeric(snap);
    const termCount = termCountOf(snap);
    if (snap.packedTermIndex != null) {
        validateFrozenTermIndexLeaves(snap.packedTermIndex, termCount);
        return;
    }
    if (snap.termTree != null) {
        validateTermTreeLeaves(snap.termTree, termCount);
        return;
    }
    validateTreeShape(snap.treeShape, termCount);
}
|
|
2367
|
+
/**
 * List the field names of a name → id table, ordered by ascending id.
 *
 * @param fieldIds plain object mapping field name to numeric id
 * @returns a new array of field names
 */
function fieldNamesFromFieldIds(fieldIds) {
    const byAscendingId = (left, right) => fieldIds[left] - fieldIds[right];
    return Object.keys(fieldIds).sort(byAscendingId);
}
|
|
2372
|
+
/**
 * Encode the 16-byte core section: four little-endian uint32 values in the order
 * `documentCount`, `nextId`, `fieldCount`, and the term count from {@link termCountOf}.
 */
function buildCoreSectionWithTermCount(snap) {
    const core = Buffer.alloc(16);
    // writeUInt32LE returns the offset just past the written value.
    let cursor = core.writeUInt32LE(snap.documentCount, 0);
    cursor = core.writeUInt32LE(snap.nextId, cursor);
    cursor = core.writeUInt32LE(snap.fieldCount, cursor);
    core.writeUInt32LE(termCountOf(snap), cursor);
    return core;
}
|
|
2381
|
+
/**
 * Encode field names as a sequence of length-prefixed UTF-8 strings: for each name, a
 * little-endian uint32 byte length followed by the UTF-8 bytes.
 *
 * @param fieldNames iterable of field names
 * @returns the concatenated section bytes
 */
function buildFieldNamesSection(fieldNames) {
    const parts = [];
    for (const fieldName of fieldNames) {
        const utf8 = Buffer.from(fieldName, 'utf8');
        const lengthPrefix = Buffer.alloc(4);
        lengthPrefix.writeUInt32LE(utf8.length, 0);
        parts.push(lengthPrefix);
        parts.push(utf8);
    }
    return Buffer.concat(parts);
}
|
|
2391
|
+
/**
 * Encode the external ids of slots `0 .. nextId - 1`: each id is handed to
 * `writeExternalId`, which appends to the chunk list; the chunks are then concatenated.
 */
function buildExternalIdsSection(externalIds, nextId) {
    const parts = [];
    let slot = 0;
    while (slot < nextId) {
        writeExternalId(parts, externalIds[slot]);
        slot += 1;
    }
    return Buffer.concat(parts);
}
|
|
2398
|
+
/**
 * Encode stored-field rows as an offset table followed by a heap of JSON entries.
 *
 * The table holds one little-endian uint32 per slot `0 .. nextId - 1`: 0 when the row is
 * null/undefined, otherwise the entry's heap offset plus one. Each heap entry is a
 * uint32 byte length followed by the row's JSON encoded as UTF-8.
 *
 * @param storedFields rows indexed by document slot
 * @param nextId number of slots to encode
 * @returns table bytes followed by the heap bytes
 */
function buildStoredFieldsSection(storedFields, nextId) {
    const offsetTable = Buffer.alloc(nextId * 4);
    const heapParts = [];
    let heapSize = 0;
    for (let slot = 0; slot < nextId; slot++) {
        const row = storedFields[slot];
        const tablePos = slot * 4;
        if (row == null) {
            offsetTable.writeUInt32LE(0, tablePos);
            continue;
        }
        // Offsets are stored 1-based so that 0 can mean "no entry".
        offsetTable.writeUInt32LE(heapSize + 1, tablePos);
        const payload = Buffer.from(JSON.stringify(row), 'utf8');
        const heapEntry = Buffer.alloc(payload.length + 4);
        heapEntry.writeUInt32LE(payload.length, 0);
        payload.copy(heapEntry, 4);
        heapParts.push(heapEntry);
        heapSize += heapEntry.length;
    }
    return Buffer.concat([offsetTable].concat(heapParts));
}
|
|
2418
|
+
/**
 * Recursively check an in-memory term tree, iterated as `[key, value]` entries.
 * Under the `LEAF` key the value must be an integer term index in `[0, termCount)`;
 * every other value is treated as a child tree.
 *
 * @throws the error produced by `invalidFrozenIndex` on the first bad leaf
 */
function validateTermTreeLeaves(tree, termCount) {
    for (const [key, val] of tree) {
        if (key !== LEAF) {
            validateTermTreeLeaves(val, termCount);
            continue;
        }
        const inRange = Number.isInteger(val) && val >= 0 && val < termCount;
        if (!inRange) {
            throw invalidFrozenIndex(`term tree leaf index out of range: ${val}`);
        }
    }
}
|
|
2431
|
+
/**
 * Turn a serialized tree shape (nested arrays of `[key, value]` pairs) into nested Maps.
 * The value under the `LEAF` key is stored as-is; every other value is converted recursively.
 */
function deserializeTermIndexTree(shape) {
    const node = new Map();
    for (const [key, value] of shape) {
        node.set(key, key === LEAF ? value : deserializeTermIndexTree(value));
    }
    return node;
}
|
|
2443
|
+
|
|
2444
|
+
/**
 * Create an empty runtime container for stored fields, shaped by how many fields are stored.
 *
 * - no store fields: `{ kind: 'none' }`
 * - exactly one: `{ kind: 'single', field, values }`, a single column of values
 * - more than one: `{ kind: 'multi', rows }`, one record per document
 *
 * @param storeFields names of the fields to store
 * @param capacity initial length of the backing array (default 0)
 */
function createStoredFieldsLayout(storeFields, capacity = 0) {
    switch (storeFields.length) {
        case 0:
            return { kind: 'none' };
        case 1:
            return { kind: 'single', field: storeFields[0], values: new Array(capacity) };
        default:
            return { kind: 'multi', rows: new Array(capacity) };
    }
}
|
|
2456
|
+
/**
 * Record one document's stored fields in the layout at slot `shortId`.
 *
 * 'none' layouts ignore the call. 'single' layouts store the extracted value of the
 * layout's own field (even when it is `undefined`). Other layouts store a fresh record
 * holding every field of `storeFields` whose extracted value is not `undefined`.
 *
 * @param extractField function `(document, fieldName) => value`
 */
function writeStoredField(layout, shortId, storeFields, extractField, document) {
    switch (layout.kind) {
        case 'none':
            return;
        case 'single':
            layout.values[shortId] = extractField(document, layout.field);
            return;
        default: {
            const record = {};
            for (const name of storeFields) {
                const extracted = extractField(document, name);
                if (extracted !== undefined) {
                    record[name] = extracted;
                }
            }
            layout.rows[shortId] = record;
        }
    }
}
|
|
2231
|
-
/**
|
|
2232
|
-
function
|
|
2233
|
-
|
|
2234
|
-
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
|
|
2242
|
-
|
|
2471
|
+
/**
 * Produce the stored-fields record for one document slot.
 *
 * 'none' layouts yield `undefined`; 'multi' layouts yield the stored row as-is; otherwise
 * a new record is built from the single column: `{}` when the value is `undefined`,
 * else `{ [field]: value }`.
 */
function readStoredFields(layout, shortId) {
    switch (layout.kind) {
        case 'none':
            return undefined;
        case 'multi':
            return layout.rows[shortId];
        default: {
            const value = layout.values[shortId];
            return value === undefined ? {} : { [layout.field]: value };
        }
    }
}
|
|
2482
|
+
/**
 * Cap a layout's backing array at `length` entries.
 *
 * The same layout object is returned when it is 'none' or already holds at most
 * `length` entries; otherwise a new layout with a truncated copy of the array is
 * returned. The array is never grown.
 */
function resizeStoredFields(layout, length) {
    switch (layout.kind) {
        case 'none':
            return layout;
        case 'single': {
            if (layout.values.length <= length) {
                return layout;
            }
            return { kind: 'single', field: layout.field, values: layout.values.slice(0, length) };
        }
        default: {
            if (layout.rows.length <= length) {
                return layout;
            }
            return { kind: 'multi', rows: layout.rows.slice(0, length) };
        }
    }
}
|
|
2494
|
+
/**
 * Copy a layout so that its backing array can be modified independently.
 * The array is copied shallowly (stored values and rows are shared); 'none' layouts are
 * returned as-is.
 */
function cloneStoredFields(layout) {
    switch (layout.kind) {
        case 'none':
            return layout;
        case 'single':
            return { kind: 'single', field: layout.field, values: layout.values.slice() };
        default:
            return { kind: 'multi', rows: layout.rows.slice() };
    }
}
|
|
2502
|
+
/**
 * Wrap per-document rows in a stored-fields layout.
 *
 * - `storeFields` empty: 'multi' (sharing `rows`) when at least one row has a key, else 'none'
 * - one store field: 'single', with the column extracted from each row (null/undefined
 *   rows contribute `undefined`)
 * - otherwise: 'multi', sharing `rows`
 */
function storedFieldsFromRows(rows, storeFields) {
    const fieldTotal = storeFields.length;
    if (fieldTotal === 1) {
        const field = storeFields[0];
        const values = rows.map((row) => (row == null ? undefined : row[field]));
        return { kind: 'single', field, values };
    }
    if (fieldTotal === 0) {
        const isPopulated = (row) => row != null && Object.keys(row).length > 0;
        if (!rows.some(isPopulated)) {
            return { kind: 'none' };
        }
    }
    return { kind: 'multi', rows };
}
|
|
2515
|
+
/**
 * Total size, in bytes, of the JSON that the stored fields serialize to.
 *
 * Each present row ('multi') or defined value ('single', wrapped as `{ [field]: value }`)
 * is stringified and measured as UTF-8, matching the encoding the wire writer applies
 * via `Buffer.from(json, 'utf8')`. Measuring `String.length` instead would count UTF-16
 * code units and under-report any non-ASCII content.
 *
 * @param layout stored-fields layout
 * @returns summed UTF-8 byte length of all JSON rows; 0 for 'none'
 */
function storedFieldsJsonBytes(layout) {
    if (layout.kind === 'none') {
        return 0;
    }
    let total = 0;
    if (layout.kind === 'multi') {
        for (const row of layout.rows) {
            if (row != null) {
                total += Buffer.byteLength(JSON.stringify(row), 'utf8');
            }
        }
        return total;
    }
    const { field, values } = layout;
    for (let i = 0; i < values.length; i++) {
        const value = values[i];
        if (value !== undefined) {
            total += Buffer.byteLength(JSON.stringify({ [field]: value }), 'utf8');
        }
    }
    return total;
}
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2535
|
+
/** Length of the layout's backing array: 0 for 'none', `values.length` for 'single', else `rows.length`. */
function storedFieldsSlotCount(layout) {
    switch (layout.kind) {
        case 'none':
            return 0;
        case 'single':
            return layout.values.length;
        default:
            return layout.rows.length;
    }
}
|
|
2540
|
+
/**
 * Append one JSON payload to the stored-fields heap and point the document's table
 * entry at it.
 *
 * Writes `heapOffRef.value + 1` (1-based heap offset) into `table` at `docIndex * 4`,
 * pushes a new entry (uint32 LE byte length followed by the payload bytes) onto
 * `heapChunks`, and advances `heapOffRef.value` by the entry's size.
 *
 * @param jsonUtf8 Buffer holding the already-encoded JSON
 */
function appendStoredFieldJsonEntry(table, heapChunks, heapOffRef, docIndex, jsonUtf8) {
    const payloadSize = jsonUtf8.length;
    table.writeUInt32LE(heapOffRef.value + 1, docIndex * 4);
    const heapEntry = Buffer.alloc(payloadSize + 4);
    heapEntry.writeUInt32LE(payloadSize, 0);
    jsonUtf8.copy(heapEntry, 4);
    heapChunks.push(heapEntry);
    heapOffRef.value = heapOffRef.value + heapEntry.length;
}
|
|
2548
|
+
/**
 * Encode a stored-fields layout as the wire section (offset table plus JSON heap)
 * covering `nextId` document slots.
 *
 * - 'multi': the rows are padded up to `nextId` slots when shorter, then handed to
 *   {@link buildStoredFieldsSection}.
 * - 'none': an all-zero table of `nextId` entries.
 * - otherwise (single column): each defined value is serialized as `{ [field]: value }`
 *   directly into the heap, without building row objects for the whole index.
 */
function buildStoredFieldsWireSection(layout, nextId) {
    if (layout.kind === 'multi') {
        let rows = layout.rows;
        if (!(rows.length >= nextId)) {
            rows = rows.concat(new Array(nextId - rows.length));
        }
        return buildStoredFieldsSection(rows, nextId);
    }
    const table = Buffer.alloc(nextId * 4);
    if (layout.kind === 'none') {
        return table;
    }
    const heapChunks = [];
    const heapOffRef = { value: 0 };
    const { field, values } = layout;
    for (let docIndex = 0; docIndex < nextId; docIndex++) {
        const value = values[docIndex];
        if (value !== undefined) {
            const jsonUtf8 = Buffer.from(JSON.stringify({ [field]: value }), 'utf8');
            appendStoredFieldJsonEntry(table, heapChunks, heapOffRef, docIndex, jsonUtf8);
        }
    }
    if (heapChunks.length === 0) {
        return table;
    }
    return Buffer.concat([table].concat(heapChunks));
}
|
|
2253
|
-
function
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2571
|
+
/**
 * Compute where the stored-fields offset table ends (`storedOff + nextId * 4`) and make
 * sure it fits inside the section.
 *
 * @returns the offset one past the table, which is where the heap begins
 * @throws the error produced by `invalidFrozenIndex` when the table extends past `sectionEnd`
 */
function storedFieldsTableEnd(storedOff, nextId, sectionEnd) {
    const tableBytes = nextId * 4;
    const tableEnd = storedOff + tableBytes;
    if (tableEnd <= sectionEnd) {
        return tableEnd;
    }
    throw invalidFrozenIndex('stored fields table out of bounds');
}
|
|
2578
|
+
/**
 * Read and parse one stored-fields heap entry.
 *
 * `rel` is the 1-based heap offset taken from the table, so the entry header sits at
 * `tableEnd + rel - 1`. The header is a uint32 LE byte length, followed by that many
 * bytes of UTF-8 JSON.
 *
 * @returns the value produced by `JSON.parse`
 * @throws the error produced by `invalidFrozenIndex` when the header or payload crosses
 *   `sectionEnd`; errors raised by `JSON.parse` propagate unchanged
 */
function readStoredFieldJsonAt(buf, tableEnd, sectionEnd, rel) {
    const headerStart = tableEnd + rel - 1;
    const payloadStart = headerStart + 4;
    if (payloadStart > sectionEnd) {
        throw invalidFrozenIndex('stored fields entry offset out of bounds');
    }
    const payloadEnd = payloadStart + buf.readUInt32LE(headerStart);
    if (payloadEnd > sectionEnd) {
        throw invalidFrozenIndex('stored fields JSON out of bounds');
    }
    const json = buf.toString('utf8', payloadStart, payloadEnd);
    return JSON.parse(json);
}
|
|
2591
|
+
/**
 * Decode the stored-fields wire section straight into a runtime layout.
 *
 * - One store field: only that field is pulled out of each JSON row and a 'single'
 *   layout is returned; slots whose table entry is 0 stay unset. A row that decodes to
 *   `null` (or `undefined`) contributes `undefined`, the same tolerance
 *   {@link storedFieldsFromRows} applies, instead of failing with a TypeError.
 * - No store fields: when every table entry is 0 the result is 'none' without parsing
 *   anything; otherwise falls through to the generic path.
 * - Generic path: all rows are decoded with {@link readStoredFieldsSection} and wrapped
 *   by {@link storedFieldsFromRows}.
 *
 * @param buf buffer containing the section
 * @param storedOff offset of the first table entry
 * @param nextId number of document slots
 * @param sectionEnd offset one past the last byte of the section
 * @param storeFields names of the stored fields (may be empty when unknown)
 * @throws the error produced by `invalidFrozenIndex` when the table or an entry is out of bounds
 */
function readStoredFieldsWireSection(buf, storedOff, nextId, sectionEnd, storeFields) {
    const tableEnd = storedFieldsTableEnd(storedOff, nextId, sectionEnd);
    if (storeFields.length === 1) {
        const field = storeFields[0];
        const values = new Array(nextId);
        for (let i = 0; i < nextId; i++) {
            const rel = buf.readUInt32LE(storedOff + i * 4);
            if (rel === 0) {
                continue;
            }
            const row = readStoredFieldJsonAt(buf, tableEnd, sectionEnd, rel);
            // The payload is arbitrary JSON; `null` is valid JSON but has no properties.
            values[i] = row == null ? undefined : row[field];
        }
        return { kind: 'single', field, values };
    }
    if (storeFields.length === 0) {
        let hasAny = false;
        for (let i = 0; i < nextId; i++) {
            if (buf.readUInt32LE(storedOff + i * 4) !== 0) {
                hasAny = true;
                break;
            }
        }
        if (!hasAny) {
            return { kind: 'none' };
        }
    }
    const rows = readStoredFieldsSection(buf, storedOff, nextId, sectionEnd);
    return storedFieldsFromRows(rows, storeFields);
}
|
|
2264
2620
|
|
|
2265
2621
|
const SUPPORTED_SERIALIZATION_VERSIONS = new Set([1, 2]);
|
|
@@ -2345,7 +2701,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
|
|
|
2345
2701
|
let shortIdRemap = null;
|
|
2346
2702
|
const resolvedNextId = useDense ? documentCount : nextId;
|
|
2347
2703
|
const externalIds = new Array(resolvedNextId);
|
|
2348
|
-
const
|
|
2704
|
+
const storedFieldRows = new Array(externalIds.length);
|
|
2349
2705
|
if (useDense) {
|
|
2350
2706
|
shortIdRemap = new Uint32Array(nextId);
|
|
2351
2707
|
shortIdRemap.fill(DISCARDED_DOC_ID);
|
|
@@ -2357,7 +2713,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
|
|
|
2357
2713
|
const shortIdStr = String(shortId);
|
|
2358
2714
|
shortIdRemap[shortId] = dense;
|
|
2359
2715
|
externalIds[dense] = snapshot.documentIds[shortIdStr];
|
|
2360
|
-
|
|
2716
|
+
storedFieldRows[dense] = snapshot.storedFields[shortIdStr];
|
|
2361
2717
|
dense++;
|
|
2362
2718
|
}
|
|
2363
2719
|
}
|
|
@@ -2365,7 +2721,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
|
|
|
2365
2721
|
for (const [shortIdStr, id] of Object.entries(snapshot.documentIds)) {
|
|
2366
2722
|
const shortId = parseInt(shortIdStr, 10);
|
|
2367
2723
|
externalIds[shortId] = id;
|
|
2368
|
-
|
|
2724
|
+
storedFieldRows[shortId] = snapshot.storedFields[shortIdStr];
|
|
2369
2725
|
}
|
|
2370
2726
|
}
|
|
2371
2727
|
const idLookup = createIdToShortIdLookup(externalIds, resolvedNextId);
|
|
@@ -2388,6 +2744,7 @@ function buildFrozenAssembleParamsFromMiniSearchSnapshot(snapshot, options) {
|
|
|
2388
2744
|
}
|
|
2389
2745
|
const searchableMap = buildSearchableMapFromSnapshot(snapshot);
|
|
2390
2746
|
const flat = buildFlatPostingsFromSearchableMap(searchableMap, fieldCount, resolvedNextId, shortIdRemap);
|
|
2747
|
+
const storedFields = storedFieldsFromRows(storedFieldRows, opts.storeFields);
|
|
2391
2748
|
return {
|
|
2392
2749
|
options: opts,
|
|
2393
2750
|
documentCount,
|
|
@@ -2689,321 +3046,121 @@ function collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32)
|
|
|
2689
3046
|
sectionId++;
|
|
2690
3047
|
}
|
|
2691
3048
|
}
|
|
2692
|
-
}
|
|
2693
|
-
function finish() {
|
|
2694
|
-
emitEmptySections();
|
|
2695
|
-
if (streamOffset !== uncompressedLength || sectionId !== directory.length) {
|
|
2696
|
-
throw new Error('MSv5 zstd decompressed length mismatch');
|
|
2697
|
-
}
|
|
2698
|
-
if (payloadCrc !== payloadCrc32) {
|
|
2699
|
-
throw new Error('MSv5 payload CRC mismatch');
|
|
2700
|
-
}
|
|
2701
|
-
}
|
|
2702
|
-
return { sections, consume, finish };
|
|
2703
|
-
}
|
|
2704
|
-
function loadMsv5SectionsFromZstdStream(compressed, directory, uncompressedLength, payloadCrc32) {
|
|
2705
|
-
return new Promise((resolve, reject) => {
|
|
2706
|
-
const collector = collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32);
|
|
2707
|
-
const stream = zlib.createZstdDecompress();
|
|
2708
|
-
stream.on('data', (chunk) => {
|
|
2709
|
-
try {
|
|
2710
|
-
collector.consume(chunk);
|
|
2711
|
-
}
|
|
2712
|
-
catch (err) {
|
|
2713
|
-
stream.destroy(err);
|
|
2714
|
-
}
|
|
2715
|
-
});
|
|
2716
|
-
stream.on('error', reject);
|
|
2717
|
-
stream.on('end', () => {
|
|
2718
|
-
try {
|
|
2719
|
-
collector.finish();
|
|
2720
|
-
resolve(collector.sections);
|
|
2721
|
-
}
|
|
2722
|
-
catch (err) {
|
|
2723
|
-
reject(err);
|
|
2724
|
-
}
|
|
2725
|
-
});
|
|
2726
|
-
stream.end(compressed);
|
|
2727
|
-
});
|
|
2728
|
-
}
|
|
2729
|
-
function validatePayloadDirectory(directory, uncompressedLength) {
|
|
2730
|
-
let prevEnd = 0;
|
|
2731
|
-
for (const entry of directory) {
|
|
2732
|
-
if ((entry.fileOffset & 3) !== 0) {
|
|
2733
|
-
throw new Error('MSv5 section offset not aligned');
|
|
2734
|
-
}
|
|
2735
|
-
if (entry.fileOffset < prevEnd) {
|
|
2736
|
-
throw new Error('MSv5 section offsets not monotonic');
|
|
2737
|
-
}
|
|
2738
|
-
if (entry.fileOffset + entry.uncompressedLength > uncompressedLength) {
|
|
2739
|
-
throw new Error('MSv5 section out of uncompressed bounds');
|
|
2740
|
-
}
|
|
2741
|
-
prevEnd = entry.fileOffset + entry.uncompressedLength;
|
|
2742
|
-
}
|
|
2743
|
-
if (prevEnd !== uncompressedLength) {
|
|
2744
|
-
throw new Error('MSv5 uncompressed payload length mismatch');
|
|
2745
|
-
}
|
|
2746
|
-
}
|
|
2747
|
-
/** Shared validation + bounds for both the sync and async load paths. */
|
|
2748
|
-
function preparePayload(fileBuf, directory) {
|
|
2749
|
-
assertPayloadFormatRev(fileBuf);
|
|
2750
|
-
const { payloadOffset, compressedLength, uncompressedLength, payloadCrc32, payloadCodec } = readPayloadMeta(fileBuf);
|
|
2751
|
-
validatePayloadDirectory(directory, uncompressedLength);
|
|
2752
|
-
if (payloadOffset !== MSV5_HEADER_SIZE || payloadOffset + compressedLength > fileBuf.length) {
|
|
2753
|
-
throw new Error('MSv5 payload out of bounds');
|
|
2754
|
-
}
|
|
2755
|
-
if (payloadCodec === CODEC_RAW && compressedLength !== uncompressedLength) {
|
|
2756
|
-
throw new Error('MSv5 raw payload length mismatch');
|
|
2757
|
-
}
|
|
2758
|
-
return {
|
|
2759
|
-
payloadCodec,
|
|
2760
|
-
slice: fileBuf.subarray(payloadOffset, payloadOffset + compressedLength),
|
|
2761
|
-
uncompressedLength,
|
|
2762
|
-
payloadCrc32,
|
|
2763
|
-
};
|
|
2764
|
-
}
|
|
2765
|
-
/** Synchronous load; peak RAM ≈ full uncompressed payload (use the async path to bound it). */
|
|
2766
|
-
function loadMsv5Sections(fileBuf, directory) {
|
|
2767
|
-
const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
|
|
2768
|
-
if (payloadCodec === CODEC_RAW) {
|
|
2769
|
-
return sectionsFromPayload(slice, directory, payloadCrc32);
|
|
2770
|
-
}
|
|
2771
|
-
if (payloadCodec === CODEC_ZSTD) {
|
|
2772
|
-
if (!zstdAvailable()) {
|
|
2773
|
-
throw zstdUnavailableReadError();
|
|
2774
|
-
}
|
|
2775
|
-
// Native cap matches readPayloadMeta's 1 GiB limit (see MSV5_MAX_UNCOMPRESSED_BYTES).
|
|
2776
|
-
// Using header `uncompressedLength` here would only help when the header understates
|
|
2777
|
-
// the zstd stream but the attacker can inflate the header too — same worst case.
|
|
2778
|
-
const decoded = zlib.zstdDecompressSync(slice, {
|
|
2779
|
-
maxOutputLength: MSV5_MAX_UNCOMPRESSED_BYTES,
|
|
2780
|
-
});
|
|
2781
|
-
if (decoded.length !== uncompressedLength) {
|
|
2782
|
-
throw new Error('MSv5 zstd decompressed length mismatch');
|
|
2783
|
-
}
|
|
2784
|
-
return sectionsFromPayload(decoded, directory, payloadCrc32);
|
|
2785
|
-
}
|
|
2786
|
-
throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
|
|
2787
|
-
}
|
|
2788
|
-
/** Streaming load; peak main-thread RAM ≈ largest single section (+ file buffer). */
|
|
2789
|
-
async function loadMsv5SectionsAsync(fileBuf, directory) {
|
|
2790
|
-
const { payloadCodec, slice, uncompressedLength, payloadCrc32 } = preparePayload(fileBuf, directory);
|
|
2791
|
-
if (payloadCodec === CODEC_RAW) {
|
|
2792
|
-
return sectionsFromPayload(slice, directory, payloadCrc32);
|
|
2793
|
-
}
|
|
2794
|
-
if (payloadCodec === CODEC_ZSTD) {
|
|
2795
|
-
if (!zstdAvailable()) {
|
|
2796
|
-
throw zstdUnavailableReadError();
|
|
2797
|
-
}
|
|
2798
|
-
return loadMsv5SectionsFromZstdStream(slice, directory, uncompressedLength, payloadCrc32);
|
|
2799
|
-
}
|
|
2800
|
-
throw new Error(`MSv5 unknown payload codec ${payloadCodec}`);
|
|
2801
|
-
}
|
|
2802
|
-
function isMsv5Buffer(buf) {
|
|
2803
|
-
return buf.length >= 4 && buf.toString('ascii', 0, 4) === 'MSv5';
|
|
2804
|
-
}
|
|
2805
|
-
function readMsv5GlobalFlags(buf) {
|
|
2806
|
-
return buf.readUInt16LE(6);
|
|
2807
|
-
}
|
|
2808
|
-
|
|
2809
|
-
function validateTreeShape(shape, termCount) {
|
|
2810
|
-
if (!Array.isArray(shape)) {
|
|
2811
|
-
throw invalidFrozenIndex('treeShape node must be an array');
|
|
2812
|
-
}
|
|
2813
|
-
for (const entry of shape) {
|
|
2814
|
-
if (!Array.isArray(entry) || entry.length !== 2) {
|
|
2815
|
-
throw invalidFrozenIndex('treeShape entry must be a [key, value] pair');
|
|
2816
|
-
}
|
|
2817
|
-
const [key, value] = entry;
|
|
2818
|
-
if (key === LEAF) {
|
|
2819
|
-
const idx = value;
|
|
2820
|
-
if (!Number.isInteger(idx) || idx < 0 || idx >= termCount) {
|
|
2821
|
-
throw invalidFrozenIndex(`treeShape leaf term index out of range: ${idx}`);
|
|
2822
|
-
}
|
|
2823
|
-
}
|
|
2824
|
-
else {
|
|
2825
|
-
validateTreeShape(value, termCount);
|
|
2826
|
-
}
|
|
2827
|
-
}
|
|
2828
|
-
}
|
|
2829
|
-
function termCountOf(snap) {
|
|
2830
|
-
return snap.postings.termCount;
|
|
2831
|
-
}
|
|
2832
|
-
/**
|
|
2833
|
-
* Numeric/structural invariants shared by both the decode path (untrusted binary)
|
|
2834
|
-
* and the build path (trusted internal code).
|
|
2835
|
-
*/
|
|
2836
|
-
function validateFrozenSnapshotNumeric(snap) {
|
|
2837
|
-
if (snap.fieldCount <= 0) {
|
|
2838
|
-
throw invalidFrozenIndex('fieldCount must be positive');
|
|
2839
|
-
}
|
|
2840
|
-
if (snap.nextId < 0 || snap.nextId >= 0xffffffff) {
|
|
2841
|
-
throw invalidFrozenIndex('nextId out of range');
|
|
2842
|
-
}
|
|
2843
|
-
if (snap.documentCount < 0 || snap.documentCount > snap.nextId) {
|
|
2844
|
-
throw invalidFrozenIndex('documentCount inconsistent with nextId');
|
|
2845
|
-
}
|
|
2846
|
-
if (snap.fieldLengthMatrix.length !== snap.nextId * snap.fieldCount) {
|
|
2847
|
-
throw invalidFrozenIndex('fieldLengthMatrix size mismatch');
|
|
2848
|
-
}
|
|
2849
|
-
if (snap.avgFieldLength.length !== snap.fieldCount) {
|
|
2850
|
-
throw invalidFrozenIndex('avgFieldLength size mismatch');
|
|
2851
|
-
}
|
|
2852
|
-
validateFrozenPostingsLayout(snap.postings, snap.documentCount, snap.nextId, detail => {
|
|
2853
|
-
throw invalidFrozenIndex(detail);
|
|
2854
|
-
});
|
|
2855
|
-
const indexedFields = Object.keys(snap.fieldIds);
|
|
2856
|
-
if (indexedFields.length !== snap.fieldCount) {
|
|
2857
|
-
throw invalidFrozenIndex('fieldIds count mismatch');
|
|
2858
|
-
}
|
|
2859
|
-
for (let f = 0; f < snap.fieldCount; f++) {
|
|
2860
|
-
const found = indexedFields.some(name => snap.fieldIds[name] === f);
|
|
2861
|
-
if (!found) {
|
|
2862
|
-
throw invalidFrozenIndex(`missing field id ${f}`);
|
|
2863
|
-
}
|
|
2864
|
-
}
|
|
2865
|
-
}
|
|
2866
|
-
function readFieldNamesSection(buf, fieldNamesOff, fieldCount, externalIdsOff) {
|
|
2867
|
-
const fieldNames = [];
|
|
2868
|
-
let o = fieldNamesOff;
|
|
2869
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
2870
|
-
const { value, next } = readLengthPrefixedUtf8(buf, o);
|
|
2871
|
-
fieldNames.push(value);
|
|
2872
|
-
o = next;
|
|
2873
|
-
}
|
|
2874
|
-
if (o !== externalIdsOff) {
|
|
2875
|
-
throw invalidFrozenIndex('field names section size mismatch');
|
|
2876
|
-
}
|
|
2877
|
-
return fieldNames;
|
|
2878
|
-
}
|
|
2879
|
-
function readExternalIdsSection(buf, externalIdsOff, nextId, storedOff) {
|
|
2880
|
-
const externalIds = new Array(nextId);
|
|
2881
|
-
let o = externalIdsOff;
|
|
2882
|
-
for (let i = 0; i < nextId; i++) {
|
|
2883
|
-
const { value, next } = readExternalId(buf, o);
|
|
2884
|
-
externalIds[i] = value;
|
|
2885
|
-
o = next;
|
|
2886
|
-
}
|
|
2887
|
-
if (o !== storedOff) {
|
|
2888
|
-
throw invalidFrozenIndex('external ids section size mismatch');
|
|
2889
|
-
}
|
|
2890
|
-
return externalIds;
|
|
2891
|
-
}
|
|
2892
|
-
function readStoredFieldsSection(buf, storedOff, nextId, sectionEnd) {
|
|
2893
|
-
const storedFields = new Array(nextId);
|
|
2894
|
-
const tableEnd = storedOff + nextId * 4;
|
|
2895
|
-
if (tableEnd > sectionEnd) {
|
|
2896
|
-
throw invalidFrozenIndex('stored fields table out of bounds');
|
|
2897
|
-
}
|
|
2898
|
-
for (let i = 0; i < nextId; i++) {
|
|
2899
|
-
const rel = buf.readUInt32LE(storedOff + i * 4);
|
|
2900
|
-
if (rel === 0) {
|
|
2901
|
-
storedFields[i] = undefined;
|
|
2902
|
-
continue;
|
|
2903
|
-
}
|
|
2904
|
-
const entryOff = tableEnd + rel - 1;
|
|
2905
|
-
if (entryOff + 4 > sectionEnd) {
|
|
2906
|
-
throw invalidFrozenIndex('stored fields entry offset out of bounds');
|
|
2907
|
-
}
|
|
2908
|
-
const jsonLen = buf.readUInt32LE(entryOff);
|
|
2909
|
-
const jsonStart = entryOff + 4;
|
|
2910
|
-
const jsonEnd = jsonStart + jsonLen;
|
|
2911
|
-
if (jsonEnd > sectionEnd) {
|
|
2912
|
-
throw invalidFrozenIndex('stored fields JSON out of bounds');
|
|
3049
|
+
}
|
|
3050
|
+
/**
 * Final consistency check once the decompressed stream has ended.
 * Emits any remaining empty sections, then throws if the byte count, the
 * number of sections seen, or the running CRC disagree with the expected values.
 */
function finish() {
    emitEmptySections();
    const sawAllBytes = streamOffset === uncompressedLength;
    const sawAllSections = sectionId === directory.length;
    if (!(sawAllBytes && sawAllSections)) {
        throw new Error('MSv5 zstd decompressed length mismatch');
    }
    if (payloadCrc !== payloadCrc32) {
        throw new Error('MSv5 payload CRC mismatch');
    }
}
|
|
2916
|
-
return
|
|
3059
|
+
return { sections, consume, finish };
|
|
2917
3060
|
}
|
|
2918
|
-
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
3061
|
+
/**
 * Decompress a zstd payload through a stream, feeding each chunk to a section collector.
 *
 * @param compressed - Compressed payload bytes written to the decompressor.
 * @param directory - Section directory handed to the collector.
 * @param uncompressedLength - Expected decompressed size (validated by the collector).
 * @param payloadCrc32 - Expected payload CRC (validated by the collector).
 * @returns Promise resolving to the collector's `sections`; it rejects on a stream
 *   error or when the collector's `consume`/`finish` throws.
 */
function loadMsv5SectionsFromZstdStream(compressed, directory, uncompressedLength, payloadCrc32) {
    return new Promise((resolve, reject) => {
        const sink = collectZstdPayloadSections(directory, uncompressedLength, payloadCrc32);
        const decoder = zlib.createZstdDecompress();
        const onChunk = (chunk) => {
            try {
                sink.consume(chunk);
            }
            catch (err) {
                // Abort decompression; the error is delivered through the 'error' listener.
                decoder.destroy(err);
            }
        };
        const onEnd = () => {
            try {
                sink.finish();
                resolve(sink.sections);
            }
            catch (err) {
                reject(err);
            }
        };
        decoder.on('data', onChunk);
        decoder.on('error', reject);
        decoder.on('end', onEnd);
        decoder.end(compressed);
    });
}
|
|
3086
|
+
/**
 * Check that the section directory tiles the uncompressed payload in order.
 *
 * Each entry must start on a 4-byte boundary, must not start before the end of
 * the previous entry, and must end inside the payload; the last entry must end
 * exactly at `uncompressedLength`.
 *
 * @param directory - Iterable of `{ fileOffset, uncompressedLength }` entries.
 * @param uncompressedLength - Total size of the uncompressed payload.
 * @throws Error describing the first violated rule.
 */
function validatePayloadDirectory(directory, uncompressedLength) {
    let cursor = 0;
    for (const section of directory) {
        const start = section.fileOffset;
        const stop = start + section.uncompressedLength;
        if ((start & 3) !== 0) {
            throw new Error('MSv5 section offset not aligned');
        }
        if (start < cursor) {
            throw new Error('MSv5 section offsets not monotonic');
        }
        if (stop > uncompressedLength) {
            throw new Error('MSv5 section out of uncompressed bounds');
        }
        cursor = stop;
    }
    if (cursor !== uncompressedLength) {
        throw new Error('MSv5 uncompressed payload length mismatch');
    }
}
|
|
2932
|
-
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
|
|
2936
|
-
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
const out = Buffer.alloc(16);
|
|
2940
|
-
out.writeUInt32LE(snap.documentCount, 0);
|
|
2941
|
-
out.writeUInt32LE(snap.nextId, 4);
|
|
2942
|
-
out.writeUInt32LE(snap.fieldCount, 8);
|
|
2943
|
-
out.writeUInt32LE(termCountOf(snap), 12);
|
|
2944
|
-
return out;
|
|
2945
|
-
}
|
|
2946
|
-
function buildFieldNamesSection(fieldNames) {
|
|
2947
|
-
const chunks = [];
|
|
2948
|
-
for (const name of fieldNames) {
|
|
2949
|
-
const body = Buffer.from(name, 'utf8');
|
|
2950
|
-
const header = Buffer.alloc(4);
|
|
2951
|
-
header.writeUInt32LE(body.length, 0);
|
|
2952
|
-
chunks.push(header, body);
|
|
3104
|
+
/**
 * Validation and bounds checks shared by the sync and async load paths.
 *
 * @param fileBuf - Whole file buffer.
 * @param directory - Section directory, validated against the uncompressed length.
 * @returns `{ payloadCodec, slice, uncompressedLength, payloadCrc32 }`, where `slice`
 *   is a view (no copy) over the stored payload bytes.
 * @throws Error when the payload lies outside the file or a raw payload's stored
 *   and uncompressed lengths differ.
 */
function preparePayload(fileBuf, directory) {
    assertPayloadFormatRev(fileBuf);
    const meta = readPayloadMeta(fileBuf);
    validatePayloadDirectory(directory, meta.uncompressedLength);
    const payloadStart = meta.payloadOffset;
    const payloadEnd = payloadStart + meta.compressedLength;
    // The payload must begin right after the fixed-size header and fit in the file.
    if (payloadStart !== MSV5_HEADER_SIZE || payloadEnd > fileBuf.length) {
        throw new Error('MSv5 payload out of bounds');
    }
    const isRaw = meta.payloadCodec === CODEC_RAW;
    if (isRaw && meta.compressedLength !== meta.uncompressedLength) {
        throw new Error('MSv5 raw payload length mismatch');
    }
    return {
        payloadCodec: meta.payloadCodec,
        slice: fileBuf.subarray(payloadStart, payloadEnd),
        uncompressedLength: meta.uncompressedLength,
        payloadCrc32: meta.payloadCrc32,
    };
}
|
|
2963
|
-
|
|
2964
|
-
|
|
2965
|
-
const
|
|
2966
|
-
|
|
2967
|
-
|
|
2968
|
-
const row = storedFields[i];
|
|
2969
|
-
if (row == null) {
|
|
2970
|
-
table.writeUInt32LE(0, i * 4);
|
|
2971
|
-
continue;
|
|
2972
|
-
}
|
|
2973
|
-
table.writeUInt32LE(heapOff + 1, i * 4);
|
|
2974
|
-
const json = Buffer.from(JSON.stringify(row), 'utf8');
|
|
2975
|
-
const entry = Buffer.alloc(4 + json.length);
|
|
2976
|
-
entry.writeUInt32LE(json.length, 0);
|
|
2977
|
-
json.copy(entry, 4);
|
|
2978
|
-
heapChunks.push(entry);
|
|
2979
|
-
heapOff += entry.length;
|
|
3122
|
+
/**
 * Synchronous section load. For zstd payloads the whole payload is decompressed
 * in one call, so peak RAM is roughly the full uncompressed payload (the async
 * path exists to bound it).
 *
 * @param fileBuf - Whole file buffer.
 * @param directory - Section directory.
 * @returns The result of `sectionsFromPayload` for the raw or decompressed payload.
 * @throws Error on an unknown codec, unavailable zstd support, or a length mismatch.
 */
function loadMsv5Sections(fileBuf, directory) {
    const payload = preparePayload(fileBuf, directory);
    switch (payload.payloadCodec) {
        case CODEC_RAW:
            return sectionsFromPayload(payload.slice, directory, payload.payloadCrc32);
        case CODEC_ZSTD: {
            if (!zstdAvailable()) {
                throw zstdUnavailableReadError();
            }
            // Native cap matches readPayloadMeta's 1 GiB limit (see MSV5_MAX_UNCOMPRESSED_BYTES).
            // Using header `uncompressedLength` here would only help when the header understates
            // the zstd stream but the attacker can inflate the header too — same worst case.
            const inflated = zlib.zstdDecompressSync(payload.slice, {
                maxOutputLength: MSV5_MAX_UNCOMPRESSED_BYTES,
            });
            if (inflated.length !== payload.uncompressedLength) {
                throw new Error('MSv5 zstd decompressed length mismatch');
            }
            return sectionsFromPayload(inflated, directory, payload.payloadCrc32);
        }
        default:
            throw new Error(`MSv5 unknown payload codec ${payload.payloadCodec}`);
    }
}
|
|
2996
|
-
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
|
|
3145
|
+
/**
 * Streaming section load; zstd payloads are decompressed chunk by chunk, so peak
 * main-thread RAM is roughly the largest single section (plus the file buffer).
 *
 * @param fileBuf - Whole file buffer.
 * @param directory - Section directory.
 * @returns Promise of the loaded sections.
 * @throws (as a rejection) on an unknown codec or unavailable zstd support.
 */
async function loadMsv5SectionsAsync(fileBuf, directory) {
    const payload = preparePayload(fileBuf, directory);
    switch (payload.payloadCodec) {
        case CODEC_RAW:
            return sectionsFromPayload(payload.slice, directory, payload.payloadCrc32);
        case CODEC_ZSTD:
            if (!zstdAvailable()) {
                throw zstdUnavailableReadError();
            }
            return loadMsv5SectionsFromZstdStream(payload.slice, directory, payload.uncompressedLength, payload.payloadCrc32);
        default:
            throw new Error(`MSv5 unknown payload codec ${payload.payloadCodec}`);
    }
}
|
|
3159
|
+
/**
 * Tell whether a buffer starts with the 4-character magic `MSv5`.
 *
 * @param buf - Buffer to inspect.
 * @returns `true` when the buffer has at least 4 bytes and they decode (as ASCII) to `MSv5`.
 */
function isMsv5Buffer(buf) {
    const hasMagicBytes = buf.length >= 4;
    if (!hasMagicBytes) {
        return false;
    }
    const magic = buf.toString('ascii', 0, 4);
    return magic === 'MSv5';
}
|
|
3162
|
+
/**
 * Read the unsigned 16-bit little-endian value stored at byte offset 6.
 *
 * @param buf - Buffer holding at least 8 bytes.
 * @returns The 16-bit value (the container's global flags, going by the function name).
 */
function readMsv5GlobalFlags(buf) {
    const flagsOffset = 6;
    return buf.readUInt16LE(flagsOffset);
}
|
|
3008
3165
|
|
|
3009
3166
|
/** Global wire flags for {@link FreqArray} width. */
|
|
@@ -3297,11 +3454,14 @@ function encodeFrozenSnapshotMsv5(snap, termTree, packedTermIndex) {
|
|
|
3297
3454
|
const flFlags = fieldLengthMatrixWireFlags(snap.fieldLengthMatrix);
|
|
3298
3455
|
const freqFlags = freqWireFlags(snap.postings.allFreqs);
|
|
3299
3456
|
const globalFlags = postingsWire.flags | flFlags | freqFlags;
|
|
3457
|
+
const storedFieldsSection = snap.storedFieldsLayout != null
|
|
3458
|
+
? buildStoredFieldsWireSection(snap.storedFieldsLayout, snap.nextId)
|
|
3459
|
+
: buildStoredFieldsSection(snap.storedFields, snap.nextId);
|
|
3300
3460
|
const rawSections = [
|
|
3301
3461
|
buildCoreSectionWithTermCount(snap),
|
|
3302
3462
|
buildFieldNamesSection(fieldNames),
|
|
3303
3463
|
buildExternalIdsSection(snap.externalIds, snap.nextId),
|
|
3304
|
-
|
|
3464
|
+
storedFieldsSection,
|
|
3305
3465
|
buildTermTreeSectionColumnar(packed),
|
|
3306
3466
|
bufferFromView(snap.avgFieldLength),
|
|
3307
3467
|
buildFieldLengthMatrixSection(snap.fieldLengthMatrix),
|
|
@@ -3325,11 +3485,14 @@ async function encodeFrozenSnapshotMsv5Async(snap, termTree, packedTermIndex) {
|
|
|
3325
3485
|
const flFlags = fieldLengthMatrixWireFlags(snap.fieldLengthMatrix);
|
|
3326
3486
|
const freqFlags = freqWireFlags(snap.postings.allFreqs);
|
|
3327
3487
|
const globalFlags = postingsWire.flags | flFlags | freqFlags;
|
|
3488
|
+
const storedFieldsSection = snap.storedFieldsLayout != null
|
|
3489
|
+
? buildStoredFieldsWireSection(snap.storedFieldsLayout, snap.nextId)
|
|
3490
|
+
: buildStoredFieldsSection(snap.storedFields, snap.nextId);
|
|
3328
3491
|
const rawSections = [
|
|
3329
3492
|
buildCoreSectionWithTermCount(snap),
|
|
3330
3493
|
buildFieldNamesSection(fieldNames),
|
|
3331
3494
|
buildExternalIdsSection(snap.externalIds, snap.nextId),
|
|
3332
|
-
|
|
3495
|
+
storedFieldsSection,
|
|
3333
3496
|
buildTermTreeSectionColumnar(packed),
|
|
3334
3497
|
bufferFromView(snap.avgFieldLength),
|
|
3335
3498
|
buildFieldLengthMatrixSection(snap.fieldLengthMatrix),
|
|
@@ -3359,7 +3522,7 @@ function validateMsv5Container(buf) {
|
|
|
3359
3522
|
}
|
|
3360
3523
|
return { globalFlags, directory };
|
|
3361
3524
|
}
|
|
3362
|
-
function decodeMsv5Sections(globalFlags, sections) {
|
|
3525
|
+
function decodeMsv5Sections(globalFlags, sections, hints) {
|
|
3363
3526
|
const core = sections[0 /* Msv5SectionId.Core */];
|
|
3364
3527
|
if (core.length !== 16) {
|
|
3365
3528
|
throw invalidFrozenIndex('core section size mismatch');
|
|
@@ -3374,7 +3537,12 @@ function decodeMsv5Sections(globalFlags, sections) {
|
|
|
3374
3537
|
fieldIds[fieldNames[f]] = f;
|
|
3375
3538
|
}
|
|
3376
3539
|
const externalIds = readExternalIdsSection(sections[2 /* Msv5SectionId.ExternalIds */], 0, nextId, sections[2 /* Msv5SectionId.ExternalIds */].length);
|
|
3377
|
-
const
|
|
3540
|
+
const storedFieldsLayout = hints != null
|
|
3541
|
+
? readStoredFieldsWireSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length, hints.storeFields)
|
|
3542
|
+
: undefined;
|
|
3543
|
+
const storedFields = storedFieldsLayout != null
|
|
3544
|
+
? new Array(nextId)
|
|
3545
|
+
: readStoredFieldsSection(sections[3 /* Msv5SectionId.StoredFields */], 0, nextId, sections[3 /* Msv5SectionId.StoredFields */].length);
|
|
3378
3546
|
const packedTermIndex = readPackedTermTreeSectionColumnar(sections[4 /* Msv5SectionId.TermTree */], termCount);
|
|
3379
3547
|
const avgBuf = sections[5 /* Msv5SectionId.AvgFieldLength */];
|
|
3380
3548
|
const avgFieldLength = readFloat32Array(avgBuf, 0, avgBuf.length);
|
|
@@ -3392,6 +3560,7 @@ function decodeMsv5Sections(globalFlags, sections) {
|
|
|
3392
3560
|
avgFieldLength,
|
|
3393
3561
|
externalIds,
|
|
3394
3562
|
storedFields,
|
|
3563
|
+
storedFieldsLayout,
|
|
3395
3564
|
fieldLengthMatrix,
|
|
3396
3565
|
treeShape: [],
|
|
3397
3566
|
packedTermIndex,
|
|
@@ -3400,13 +3569,13 @@ function decodeMsv5Sections(globalFlags, sections) {
|
|
|
3400
3569
|
validateFrozenSnapshot(snap);
|
|
3401
3570
|
return snap;
|
|
3402
3571
|
}
|
|
3403
|
-
function decodeFrozenSnapshotMsv5(buf) {
|
|
3572
|
+
/**
 * Decode an MSv5 buffer synchronously: validate the container, load its
 * sections, then decode them.
 *
 * @param buf - MSv5 file buffer.
 * @param hints - Optional decode hints, forwarded unchanged to `decodeMsv5Sections`.
 * @returns The decoded snapshot.
 */
function decodeFrozenSnapshotMsv5(buf, hints) {
    const container = validateMsv5Container(buf);
    const sections = loadMsv5Sections(buf, container.directory);
    return decodeMsv5Sections(container.globalFlags, sections, hints);
}
|
|
3407
|
-
async function decodeFrozenSnapshotMsv5Async(buf) {
|
|
3576
|
+
/**
 * Async variant of the MSv5 decode: validate the container, await the section
 * load, then decode the sections.
 *
 * @param buf - MSv5 file buffer.
 * @param hints - Optional decode hints, forwarded unchanged to `decodeMsv5Sections`.
 * @returns Promise of the decoded snapshot.
 */
async function decodeFrozenSnapshotMsv5Async(buf, hints) {
    const container = validateMsv5Container(buf);
    const sections = await loadMsv5SectionsAsync(buf, container.directory);
    return decodeMsv5Sections(container.globalFlags, sections, hints);
}
|
|
3411
3580
|
|
|
3412
3581
|
/** Encode a frozen snapshot as a binary buffer. */
|
|
@@ -3420,12 +3589,12 @@ function encodeFrozenSnapshotAsync(snap, termTree, packedTermIndex) {
|
|
|
3420
3589
|
|
|
3421
3590
|
const LEGACY_MAGICS = new Set(['MSv1', 'MSv2', 'MSv3', 'MSv4']);
|
|
3422
3591
|
/** Decode a frozen binary snapshot buffer. */
|
|
3423
|
-
function decodeFrozenSnapshot(buf) {
|
|
3592
|
+
function decodeFrozenSnapshot(buf, hints) {
|
|
3424
3593
|
assertBufferLength(buf, 8);
|
|
3425
3594
|
const magic = buf.toString('ascii', 0, 4);
|
|
3426
3595
|
const version = buf.readUInt16LE(4);
|
|
3427
3596
|
if (isMsv5Buffer(buf) && version === 5) {
|
|
3428
|
-
return decodeFrozenSnapshotMsv5(buf);
|
|
3597
|
+
return decodeFrozenSnapshotMsv5(buf, hints);
|
|
3429
3598
|
}
|
|
3430
3599
|
if (LEGACY_MAGICS.has(magic)) {
|
|
3431
3600
|
throw invalidFrozenIndex('Unsupported frozen binary snapshot; re-build with saveBinarySync() or from lucaong JSON');
|
|
@@ -3433,82 +3602,283 @@ function decodeFrozenSnapshot(buf) {
|
|
|
3433
3602
|
throw invalidFrozenIndex('Unsupported frozen binary snapshot');
|
|
3434
3603
|
}
|
|
3435
3604
|
/**
 * Async frozen snapshot decode (streaming zstd).
 *
 * Buffers with the MSv5 magic and version 5 take the async MSv5 path; anything
 * else is handed to the synchronous `decodeFrozenSnapshot`.
 *
 * @param buf - Snapshot buffer (at least 8 bytes, enforced by `assertBufferLength`).
 * @param hints - Optional decode hints, forwarded unchanged.
 * @returns Promise of the decoded snapshot.
 */
async function decodeFrozenSnapshotAsync(buf, hints) {
    assertBufferLength(buf, 8);
    const version = buf.readUInt16LE(4);
    const isVersion5Container = isMsv5Buffer(buf) && version === 5;
    return isVersion5Container
        ? decodeFrozenSnapshotMsv5Async(buf, hints)
        : decodeFrozenSnapshot(buf, hints);
}
|
|
3613
|
+
|
|
3614
|
+
/** Default (and, for the postings accumulator, minimum) initial element capacity of the growable scratch columns. */
const DEFAULT_CAPACITY = 16;
|
|
3615
|
+
/**
 * Growable unsigned 32-bit column (build scratch; narrowed to u16 at finalize when possible).
 *
 * Values are appended into a Uint32Array that doubles when full. `truncate(0)`
 * releases the backing array so a cleared column does not keep scratch memory alive.
 */
class GrowableUint32Column {
    /** @param initialCapacity - Initial element capacity; values below 1 are raised to 1. */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint32Array(Math.max(1, initialCapacity));
    }
    /** Number of values currently stored (not the capacity). */
    get length() {
        return this._len;
    }
    /** Append one value, doubling the backing array first when it is full. */
    push(value) {
        if (this._len >= this._buf.length) {
            const grown = new Uint32Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = value;
    }
    /**
     * Copy `length` values starting at `sourceOffset` into `target` at `targetOffset`.
     *
     * @param docIdWidth - Kept for interface compatibility; the copy is the same for
     *   both widths because the target's element type performs any narrowing.
     */
    copyRangeInto(sourceOffset, length, target, targetOffset, docIdWidth) {
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = this._buf[sourceOffset + i];
        }
    }
    /**
     * Set the logical length and shrink the backing array to match.
     *
     * A length of 0 swaps the backing array for a 1-element one, which is what
     * callers that clear the column rely on to free memory. A positive length
     * below the capacity copies the kept prefix into an exactly-sized array.
     */
    truncate(length) {
        this._len = length;
        if (length === 0) {
            if (this._buf.length > 1) {
                this._buf = new Uint32Array(1);
            }
            return;
        }
        if (length > 0 && length < this._buf.length) {
            this._buf = this._buf.slice(0, length);
        }
    }
}
|
|
3651
|
+
/**
 * Growable frequency column (u16 cells; matches frozen clamp range).
 *
 * Frequencies are clamped with `clampFreq` on append and stored in a Uint16Array
 * that doubles when full. `truncate(0)` releases the backing array so a cleared
 * column does not keep scratch memory alive.
 */
class GrowableFreqColumn {
    /** @param initialCapacity - Initial element capacity; values below 1 are raised to 1. */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint16Array(Math.max(1, initialCapacity));
    }
    /** Number of frequencies currently stored (not the capacity). */
    get length() {
        return this._len;
    }
    /** Append one clamped frequency, doubling the backing array first when it is full. */
    push(freq) {
        if (this._len >= this._buf.length) {
            const grown = new Uint16Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = clampFreq(freq);
    }
    /** Copy `length` cells starting at `sourceOffset` into `target` at `targetOffset`. */
    copyRangeInto(sourceOffset, length, target, targetOffset) {
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = this._buf[sourceOffset + i];
        }
    }
    /**
     * Set the logical length and shrink the backing array to match.
     *
     * A length of 0 swaps the backing array for a 1-element one, which is what
     * callers that clear the column rely on to free memory. A positive length
     * below the capacity copies the kept prefix into an exactly-sized array.
     */
    truncate(length) {
        this._len = length;
        if (length === 0) {
            if (this._buf.length > 1) {
                this._buf = new Uint16Array(1);
            }
            return;
        }
        if (length > 0 && length < this._buf.length) {
            this._buf = this._buf.slice(0, length);
        }
    }
}
|
|
3680
|
+
/**
|
|
3681
|
+
* Single-pass postings accumulator for {@link FrozenIndexBuilder}.
|
|
3682
|
+
* One global TypedArray stream per docIds/freqs; per-slot range metadata only.
|
|
3683
|
+
*/
|
|
3684
|
+
class IncrementalPostingsAccumulator {
|
|
3685
|
+
constructor(fieldCount, hints) {
|
|
3686
|
+
var _a;
|
|
3687
|
+
this._slots = new Map();
|
|
3688
|
+
this._totalPostings = 0;
|
|
3689
|
+
this._maxFreq = 0;
|
|
3690
|
+
this._fieldCount = fieldCount;
|
|
3691
|
+
const cap = Math.max(DEFAULT_CAPACITY, (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedTotalPostings) !== null && _a !== void 0 ? _a : 0);
|
|
3692
|
+
this._docIds = new GrowableUint32Column(cap);
|
|
3693
|
+
this._freqs = new GrowableFreqColumn(cap);
|
|
3694
|
+
}
|
|
3695
|
+
get totalPostings() {
|
|
3696
|
+
return this._totalPostings;
|
|
3697
|
+
}
|
|
3698
|
+
get maxFreq() {
|
|
3699
|
+
return this._maxFreq;
|
|
3700
|
+
}
|
|
3701
|
+
append(termIndex, fieldId, docId, freq) {
|
|
3702
|
+
const slot = termIndex * this._fieldCount + fieldId;
|
|
3703
|
+
const writeIdx = this._docIds.length;
|
|
3704
|
+
this._docIds.push(docId);
|
|
3705
|
+
const v = clampFreq(freq);
|
|
3706
|
+
this._freqs.push(v);
|
|
3707
|
+
if (v > this._maxFreq)
|
|
3708
|
+
this._maxFreq = v;
|
|
3709
|
+
this._totalPostings++;
|
|
3710
|
+
let ranges = this._slots.get(slot);
|
|
3711
|
+
if (ranges == null) {
|
|
3712
|
+
ranges = { starts: [writeIdx], lengths: [1] };
|
|
3713
|
+
this._slots.set(slot, ranges);
|
|
3714
|
+
return;
|
|
3715
|
+
}
|
|
3716
|
+
const last = ranges.starts.length - 1;
|
|
3717
|
+
const end = ranges.starts[last] + ranges.lengths[last];
|
|
3718
|
+
if (end === writeIdx) {
|
|
3719
|
+
ranges.lengths[last]++;
|
|
3720
|
+
}
|
|
3721
|
+
else {
|
|
3722
|
+
ranges.starts.push(writeIdx);
|
|
3723
|
+
ranges.lengths.push(1);
|
|
3724
|
+
}
|
|
3725
|
+
}
|
|
3726
|
+
clear() {
|
|
3727
|
+
this._slots.clear();
|
|
3728
|
+
// Drop global scratch backing so finalize does not retain duplicate posting bytes.
|
|
3729
|
+
this._docIds.truncate(0);
|
|
3730
|
+
this._freqs.truncate(0);
|
|
3731
|
+
}
|
|
3732
|
+
copySlot(ranges, allDocIds, allFreqs, write, docIdWidth) {
|
|
3733
|
+
for (let r = 0; r < ranges.starts.length; r++) {
|
|
3734
|
+
const start = ranges.starts[r];
|
|
3735
|
+
const len = ranges.lengths[r];
|
|
3736
|
+
this._docIds.copyRangeInto(start, len, allDocIds, write, docIdWidth);
|
|
3737
|
+
this._freqs.copyRangeInto(start, len, allFreqs, write);
|
|
3738
|
+
write += len;
|
|
3739
|
+
}
|
|
3740
|
+
return write;
|
|
3741
|
+
}
|
|
3742
|
+
slotLength(ranges) {
|
|
3743
|
+
let n = 0;
|
|
3744
|
+
for (let i = 0; i < ranges.lengths.length; i++)
|
|
3745
|
+
n += ranges.lengths[i];
|
|
3746
|
+
return n;
|
|
3747
|
+
}
|
|
3748
|
+
finalize(termCount, nextId) {
|
|
3749
|
+
const fieldCount = this._fieldCount;
|
|
3750
|
+
const totalPostings = this._totalPostings;
|
|
3751
|
+
const maxFreq = this._maxFreq;
|
|
3752
|
+
const slots = this._slots;
|
|
3753
|
+
const layout = choosePostingsLayout(fieldCount);
|
|
3754
|
+
const docIdWidth = nextId <= 65535 ? 16 : 32;
|
|
3755
|
+
const allDocIds = docIdWidth === 16
|
|
3756
|
+
? new Uint16Array(totalPostings)
|
|
3757
|
+
: new Uint32Array(totalPostings);
|
|
3758
|
+
const allFreqs = allocateFreqs(totalPostings, maxFreq);
|
|
3759
|
+
if (layout === 'dense') {
|
|
3760
|
+
const slotCount = termCount * fieldCount;
|
|
3761
|
+
const denseOffsets = new Uint32Array(slotCount);
|
|
3762
|
+
const denseLengths = new Uint32Array(slotCount);
|
|
3763
|
+
let write = 0;
|
|
3764
|
+
for (let ti = 0; ti < termCount; ti++) {
|
|
3765
|
+
const base = ti * fieldCount;
|
|
3766
|
+
for (let f = 0; f < fieldCount; f++) {
|
|
3767
|
+
const slot = base + f;
|
|
3768
|
+
const ranges = slots.get(slot);
|
|
3769
|
+
const len = ranges == null ? 0 : this.slotLength(ranges);
|
|
3770
|
+
denseOffsets[slot] = write;
|
|
3771
|
+
denseLengths[slot] = len;
|
|
3772
|
+
if (len > 0) {
|
|
3773
|
+
write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
|
|
3774
|
+
slots.delete(slot);
|
|
3775
|
+
}
|
|
3776
|
+
}
|
|
3777
|
+
}
|
|
3778
|
+
slots.clear();
|
|
3779
|
+
this.clear();
|
|
3780
|
+
return {
|
|
3781
|
+
fieldCount,
|
|
3782
|
+
termCount,
|
|
3783
|
+
nextId,
|
|
3784
|
+
layout,
|
|
3785
|
+
docIdWidth,
|
|
3786
|
+
sparseFieldIdWidth: null,
|
|
3787
|
+
allDocIds,
|
|
3788
|
+
allFreqs,
|
|
3789
|
+
denseOffsets,
|
|
3790
|
+
denseLengths,
|
|
3791
|
+
sparseTermStarts: null,
|
|
3792
|
+
sparseFieldIds: null,
|
|
3793
|
+
sparseOffsets: null,
|
|
3794
|
+
sparseLengths: null,
|
|
3795
|
+
};
|
|
3796
|
+
}
|
|
3797
|
+
const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
|
|
3798
|
+
const sparseFieldIdsScratch = [];
|
|
3799
|
+
const sparseOffsets = [];
|
|
3800
|
+
const sparseLengths = [];
|
|
3801
|
+
const termStarts = new Array(termCount + 1).fill(0);
|
|
3802
|
+
let write = 0;
|
|
3803
|
+
for (let ti = 0; ti < termCount; ti++) {
|
|
3804
|
+
termStarts[ti] = sparseFieldIdsScratch.length;
|
|
3805
|
+
for (let f = 0; f < fieldCount; f++) {
|
|
3806
|
+
const slot = ti * fieldCount + f;
|
|
3807
|
+
const ranges = slots.get(slot);
|
|
3808
|
+
const len = ranges == null ? 0 : this.slotLength(ranges);
|
|
3809
|
+
if (len === 0)
|
|
3810
|
+
continue;
|
|
3811
|
+
sparseFieldIdsScratch.push(f);
|
|
3812
|
+
sparseOffsets.push(write);
|
|
3813
|
+
sparseLengths.push(len);
|
|
3814
|
+
write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
|
|
3815
|
+
slots.delete(slot);
|
|
3816
|
+
}
|
|
3817
|
+
termStarts[ti + 1] = sparseFieldIdsScratch.length;
|
|
3818
|
+
}
|
|
3819
|
+
slots.clear();
|
|
3820
|
+
this.clear();
|
|
3821
|
+
const sparseFieldIds = sparseFieldIdWidth === 16
|
|
3822
|
+
? new Uint16Array(sparseFieldIdsScratch)
|
|
3823
|
+
: new Uint8Array(sparseFieldIdsScratch);
|
|
3824
|
+
return {
|
|
3825
|
+
fieldCount,
|
|
3826
|
+
termCount,
|
|
3827
|
+
nextId,
|
|
3828
|
+
layout,
|
|
3829
|
+
docIdWidth,
|
|
3830
|
+
sparseFieldIdWidth,
|
|
3831
|
+
allDocIds,
|
|
3832
|
+
allFreqs,
|
|
3833
|
+
denseOffsets: null,
|
|
3834
|
+
denseLengths: null,
|
|
3835
|
+
sparseTermStarts: new Uint32Array(termStarts),
|
|
3836
|
+
sparseFieldIds,
|
|
3837
|
+
sparseOffsets: new Uint32Array(sparseOffsets),
|
|
3838
|
+
sparseLengths: new Uint32Array(sparseLengths),
|
|
3839
|
+
};
|
|
3441
3840
|
}
|
|
3442
|
-
return decodeFrozenSnapshot(buf);
|
|
3443
3841
|
}
|
|
3444
3842
|
|
|
3445
|
-
function getOrCreateTermIndex(
|
|
3843
|
+
function getOrCreateTermIndex(termCount, index, term) {
|
|
3446
3844
|
const existing = index.get(term);
|
|
3447
3845
|
if (existing != null)
|
|
3448
3846
|
return existing;
|
|
3449
|
-
const ti =
|
|
3450
|
-
|
|
3847
|
+
const ti = termCount.value;
|
|
3848
|
+
termCount.value++;
|
|
3451
3849
|
index.set(term, ti);
|
|
3452
3850
|
return ti;
|
|
3453
3851
|
}
|
|
3454
|
-
function appendPosting(state, termIndex, fieldId, docId, freq) {
|
|
3455
|
-
const slot = termIndex * state.fieldCount + fieldId;
|
|
3456
|
-
let docIds = state.postingsDocIds[slot];
|
|
3457
|
-
if (docIds == null) {
|
|
3458
|
-
docIds = [];
|
|
3459
|
-
state.postingsDocIds[slot] = docIds;
|
|
3460
|
-
state.postingsFreqs[slot] = [];
|
|
3461
|
-
}
|
|
3462
|
-
docIds.push(docId);
|
|
3463
|
-
state.postingsFreqs[slot].push(freq);
|
|
3464
|
-
const v = clampFreq(freq);
|
|
3465
|
-
if (v > state.maxFreq)
|
|
3466
|
-
state.maxFreq = v;
|
|
3467
|
-
state.totalPostings++;
|
|
3468
|
-
}
|
|
3469
|
-
function finalizeFlatPostings(state, nextId) {
|
|
3470
|
-
return materializeFrozenPostingsFromBuilder({
|
|
3471
|
-
fieldCount: state.fieldCount,
|
|
3472
|
-
termCount: state.terms.length,
|
|
3473
|
-
postingsDocIds: state.postingsDocIds,
|
|
3474
|
-
postingsFreqs: state.postingsFreqs,
|
|
3475
|
-
totalPostings: state.totalPostings,
|
|
3476
|
-
maxFreq: state.maxFreq,
|
|
3477
|
-
}, nextId);
|
|
3478
|
-
}
|
|
3479
3852
|
/** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
|
|
3480
3853
|
class FrozenIndexBuilder {
|
|
3481
3854
|
constructor(options, hints) {
|
|
3855
|
+
var _a, _b;
|
|
3856
|
+
this._termCount = { value: 0 };
|
|
3857
|
+
this._fieldTermFreqScratch = new Map();
|
|
3858
|
+
this._tokenScratch = [];
|
|
3482
3859
|
this._options = resolveIndexingOptions(options);
|
|
3483
3860
|
this._fieldIds = buildFieldIds(this._options.fields);
|
|
3484
3861
|
this._fieldCount = this._options.fields.length;
|
|
3485
3862
|
this._index = new SearchableMap();
|
|
3486
|
-
|
|
3487
|
-
|
|
3488
|
-
this.
|
|
3863
|
+
const estimatedDocs = (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount) !== null && _a !== void 0 ? _a : 0;
|
|
3864
|
+
const perSlot = (_b = hints === null || hints === void 0 ? void 0 : hints.estimatedPostingsPerSlot) !== null && _b !== void 0 ? _b : 4;
|
|
3865
|
+
this._postings = new IncrementalPostingsAccumulator(this._fieldCount, {
|
|
3866
|
+
estimatedTotalPostings: estimatedDocs > 0 ? estimatedDocs * perSlot : undefined,
|
|
3867
|
+
});
|
|
3489
3868
|
this._avgFieldLength = [];
|
|
3490
3869
|
this._seenIds = new Set();
|
|
3491
3870
|
this._nextId = 0;
|
|
3492
3871
|
this._frozen = false;
|
|
3493
3872
|
const estimated = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount;
|
|
3873
|
+
this._storedFields = createStoredFieldsLayout(this._options.storeFields, estimated !== null && estimated !== void 0 ? estimated : 0);
|
|
3494
3874
|
if (estimated != null && estimated > 0) {
|
|
3495
3875
|
this._externalIds = new Array(estimated);
|
|
3496
|
-
this._storedFields = new Array(estimated);
|
|
3497
3876
|
this._fieldLengthData = new Array(estimated * this._fieldCount).fill(0);
|
|
3498
3877
|
}
|
|
3499
3878
|
else {
|
|
3500
3879
|
this._externalIds = [];
|
|
3501
|
-
this._storedFields = [];
|
|
3502
3880
|
this._fieldLengthData = [];
|
|
3503
3881
|
}
|
|
3504
|
-
this._postingsState = {
|
|
3505
|
-
fieldCount: this._fieldCount,
|
|
3506
|
-
terms: this._terms,
|
|
3507
|
-
postingsDocIds: this._postingsDocIds,
|
|
3508
|
-
postingsFreqs: this._postingsFreqs,
|
|
3509
|
-
totalPostings: 0,
|
|
3510
|
-
maxFreq: 0,
|
|
3511
|
-
};
|
|
3512
3882
|
}
|
|
3513
3883
|
/** Number of documents indexed so far (not yet frozen). */
|
|
3514
3884
|
get documentCount() {
|
|
@@ -3529,22 +3899,23 @@ class FrozenIndexBuilder {
|
|
|
3529
3899
|
this._seenIds.add(id);
|
|
3530
3900
|
const shortId = this._nextId++;
|
|
3531
3901
|
this._externalIds[shortId] = id;
|
|
3532
|
-
this._storedFields
|
|
3902
|
+
writeStoredField(this._storedFields, shortId, storeFields, extractField, document);
|
|
3533
3903
|
const documentCount = shortId + 1;
|
|
3534
3904
|
for (const field of fields) {
|
|
3535
3905
|
const fieldValue = extractField(document, field);
|
|
3536
3906
|
if (fieldValue == null)
|
|
3537
3907
|
continue;
|
|
3538
|
-
const
|
|
3908
|
+
const fieldText = typeof fieldValue === 'string'
|
|
3909
|
+
? fieldValue
|
|
3910
|
+
: stringifyField(fieldValue, field);
|
|
3539
3911
|
const fieldId = this._fieldIds[field];
|
|
3540
|
-
const uniqueTerms =
|
|
3541
|
-
const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
|
|
3912
|
+
const uniqueTerms = collectFieldTermFreqsFromFieldInto(this._fieldTermFreqScratch, this._tokenScratch, tokenize, fieldText, field, processTerm);
|
|
3542
3913
|
this._fieldLengthData[shortId * this._fieldCount + fieldId] = uniqueTerms;
|
|
3543
3914
|
updateAvgFieldLength(this._avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
|
|
3544
|
-
|
|
3545
|
-
const ti = getOrCreateTermIndex(this.
|
|
3546
|
-
|
|
3547
|
-
}
|
|
3915
|
+
this._fieldTermFreqScratch.forEach((freq, term) => {
|
|
3916
|
+
const ti = getOrCreateTermIndex(this._termCount, this._index, term);
|
|
3917
|
+
this._postings.append(ti, fieldId, shortId, freq);
|
|
3918
|
+
});
|
|
3548
3919
|
}
|
|
3549
3920
|
}
|
|
3550
3921
|
/**
|
|
@@ -3601,7 +3972,11 @@ class FrozenIndexBuilder {
|
|
|
3601
3972
|
}
|
|
3602
3973
|
this._frozen = true;
|
|
3603
3974
|
const documentCount = this._nextId;
|
|
3604
|
-
const
|
|
3975
|
+
const termCount = this._termCount.value;
|
|
3976
|
+
const postings = this._postings.finalize(termCount, documentCount);
|
|
3977
|
+
const radixTree = this._index.radixTree;
|
|
3978
|
+
this._index = null;
|
|
3979
|
+
const index = fromRadixTree(radixTree, termCount);
|
|
3605
3980
|
const avgFieldLength = new Float32Array(this._fieldCount);
|
|
3606
3981
|
for (let f = 0; f < this._fieldCount; f++) {
|
|
3607
3982
|
avgFieldLength[f] = (_a = this._avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
|
|
@@ -3610,12 +3985,8 @@ class FrozenIndexBuilder {
|
|
|
3610
3985
|
const externalIds = this._externalIds.length > documentCount
|
|
3611
3986
|
? this._externalIds.slice(0, documentCount)
|
|
3612
3987
|
: this._externalIds;
|
|
3613
|
-
const storedFields = this._storedFields
|
|
3614
|
-
? this._storedFields.slice(0, documentCount)
|
|
3615
|
-
: this._storedFields;
|
|
3988
|
+
const storedFields = resizeStoredFields(this._storedFields, documentCount);
|
|
3616
3989
|
const idLookup = createIdToShortIdLookup(externalIds, documentCount);
|
|
3617
|
-
// Incremental builder: numeric radix leaves + build-time terms[] for postings.
|
|
3618
|
-
// freezeFromMiniSearch packs Map leaves in one radix pass (no resident terms[]).
|
|
3619
3990
|
return {
|
|
3620
3991
|
options: this._options,
|
|
3621
3992
|
documentCount,
|
|
@@ -3627,8 +3998,8 @@ class FrozenIndexBuilder {
|
|
|
3627
3998
|
storedFields,
|
|
3628
3999
|
fieldLengthMatrix: materializeFieldLengthMatrix(this._fieldLengthData, documentCount * this._fieldCount),
|
|
3629
4000
|
avgFieldLength,
|
|
3630
|
-
index
|
|
3631
|
-
termCount
|
|
4001
|
+
index,
|
|
4002
|
+
termCount,
|
|
3632
4003
|
postings,
|
|
3633
4004
|
};
|
|
3634
4005
|
}
|
|
@@ -4042,7 +4413,7 @@ function shallowCopyJsSnapshotFields(params) {
|
|
|
4042
4413
|
return {
|
|
4043
4414
|
fieldIds: { ...params.fieldIds },
|
|
4044
4415
|
options: shallowCopyOptions(params.options),
|
|
4045
|
-
storedFields: params.storedFields
|
|
4416
|
+
storedFields: cloneStoredFields(params.storedFields),
|
|
4046
4417
|
};
|
|
4047
4418
|
}
|
|
4048
4419
|
/**
|
|
@@ -4127,7 +4498,7 @@ class FrozenMiniSearch {
|
|
|
4127
4498
|
fieldIds: this._fieldIds,
|
|
4128
4499
|
getFieldLength: (docId, fieldId) => this.getFieldLength(docId, fieldId),
|
|
4129
4500
|
getExternalId: docId => this._externalIds[docId],
|
|
4130
|
-
getStoredFields: docId => this._storedFields
|
|
4501
|
+
getStoredFields: docId => readStoredFields(this._storedFields, docId),
|
|
4131
4502
|
};
|
|
4132
4503
|
this._queryEngineParams = {
|
|
4133
4504
|
fields: this._options.fields,
|
|
@@ -4139,7 +4510,7 @@ class FrozenMiniSearch {
|
|
|
4139
4510
|
const id = this._externalIds[shortId];
|
|
4140
4511
|
if (id === undefined)
|
|
4141
4512
|
continue;
|
|
4142
|
-
callback(shortId, id, this._storedFields
|
|
4513
|
+
callback(shortId, id, readStoredFields(this._storedFields, shortId));
|
|
4143
4514
|
}
|
|
4144
4515
|
}),
|
|
4145
4516
|
aggregateContext: this._aggregateContext,
|
|
@@ -4150,11 +4521,7 @@ class FrozenMiniSearch {
|
|
|
4150
4521
|
memoryBreakdown() {
|
|
4151
4522
|
const termCount = this.termCount;
|
|
4152
4523
|
const postingsStats = postingsTypedBytes(this._postings);
|
|
4153
|
-
|
|
4154
|
-
for (const row of this._storedFields) {
|
|
4155
|
-
if (row != null)
|
|
4156
|
-
storedJson += JSON.stringify(row).length;
|
|
4157
|
-
}
|
|
4524
|
+
const storedJson = storedFieldsJsonBytes(this._storedFields);
|
|
4158
4525
|
const radixEst = this._index.packedByteLength();
|
|
4159
4526
|
const idMapBytes = this._idLookup.mode === 'lazy-map' ? this._idLookup.mapEntryCount * 32 : 0;
|
|
4160
4527
|
const estimatedStructuredBytes = postingsStats.totalTypedBytes
|
|
@@ -4184,7 +4551,7 @@ class FrozenMiniSearch {
|
|
|
4184
4551
|
},
|
|
4185
4552
|
documents: {
|
|
4186
4553
|
externalIdsSlots: this._externalIds.length,
|
|
4187
|
-
storedFieldsSlots: this._storedFields
|
|
4554
|
+
storedFieldsSlots: storedFieldsSlotCount(this._storedFields),
|
|
4188
4555
|
idLookupMode: this._idLookup.mode,
|
|
4189
4556
|
idToShortIdEntries: this._idLookup.mapEntryCount,
|
|
4190
4557
|
fieldLengthMatrixBytes: this._fieldLengthMatrix.byteLength,
|
|
@@ -4199,10 +4566,10 @@ class FrozenMiniSearch {
|
|
|
4199
4566
|
}
|
|
4200
4567
|
getStoredFields(id) {
|
|
4201
4568
|
const shortId = this._idLookup.get(id);
|
|
4202
|
-
return shortId == null ? undefined : this._storedFields
|
|
4569
|
+
return shortId == null ? undefined : readStoredFields(this._storedFields, shortId);
|
|
4203
4570
|
}
|
|
4204
4571
|
search(query, searchOptions = {}) {
|
|
4205
|
-
return finalizeRawSearchResults(this.executeQuery(query, searchOptions), query, searchOptions, this._options.searchOptions, docId => this._externalIds[docId], docId => this._storedFields
|
|
4572
|
+
return finalizeRawSearchResults(this.executeQuery(query, searchOptions), query, searchOptions, this._options.searchOptions, docId => this._externalIds[docId], docId => readStoredFields(this._storedFields, docId));
|
|
4206
4573
|
}
|
|
4207
4574
|
autoSuggest(queryString, options = {}) {
|
|
4208
4575
|
const merged = { ...this._options.autoSuggestOptions, ...options };
|
|
@@ -4218,7 +4585,8 @@ class FrozenMiniSearch {
|
|
|
4218
4585
|
fieldNames: fieldNamesFromFieldIds(this._fieldIds),
|
|
4219
4586
|
avgFieldLength: this._avgFieldLength,
|
|
4220
4587
|
externalIds: this._externalIds,
|
|
4221
|
-
storedFields: this.
|
|
4588
|
+
storedFields: new Array(this._nextId),
|
|
4589
|
+
storedFieldsLayout: this._storedFields,
|
|
4222
4590
|
fieldLengthMatrix: fieldLengthMatrixForWire(this._fieldLengthMatrix),
|
|
4223
4591
|
treeShape: [],
|
|
4224
4592
|
postings: this._postings,
|
|
@@ -4234,7 +4602,8 @@ class FrozenMiniSearch {
|
|
|
4234
4602
|
fieldNames: fieldNamesFromFieldIds(this._fieldIds),
|
|
4235
4603
|
avgFieldLength: this._avgFieldLength,
|
|
4236
4604
|
externalIds: this._externalIds,
|
|
4237
|
-
storedFields: this.
|
|
4605
|
+
storedFields: new Array(this._nextId),
|
|
4606
|
+
storedFieldsLayout: this._storedFields,
|
|
4238
4607
|
fieldLengthMatrix: fieldLengthMatrixForWire(this._fieldLengthMatrix),
|
|
4239
4608
|
treeShape: [],
|
|
4240
4609
|
postings: this._postings,
|
|
@@ -4242,16 +4611,20 @@ class FrozenMiniSearch {
|
|
|
4242
4611
|
}
|
|
4243
4612
|
/** Load a frozen binary snapshot. */
|
|
4244
4613
|
static loadBinarySync(buffer, options = {}) {
|
|
4245
|
-
|
|
4614
|
+
var _a;
|
|
4615
|
+
const storeFields = (_a = options.storeFields) !== null && _a !== void 0 ? _a : defaultFrozenLoadOptions.storeFields;
|
|
4616
|
+
const snap = decodeFrozenSnapshot(buffer, { storeFields });
|
|
4246
4617
|
return FrozenMiniSearch.fromBinarySnapshot(snap, options);
|
|
4247
4618
|
}
|
|
4248
4619
|
/** Load a frozen binary snapshot with streaming zstd decompression (bounded memory). */
|
|
4249
4620
|
static async loadBinaryAsync(buffer, options = {}) {
|
|
4250
|
-
|
|
4621
|
+
var _a;
|
|
4622
|
+
const storeFields = (_a = options.storeFields) !== null && _a !== void 0 ? _a : defaultFrozenLoadOptions.storeFields;
|
|
4623
|
+
const snap = await decodeFrozenSnapshotAsync(buffer, { storeFields });
|
|
4251
4624
|
return FrozenMiniSearch.fromBinarySnapshot(snap, options);
|
|
4252
4625
|
}
|
|
4253
4626
|
static fromBinarySnapshot(snap, options) {
|
|
4254
|
-
var _a, _b;
|
|
4627
|
+
var _a, _b, _c;
|
|
4255
4628
|
const snapshotFields = (_a = snap.fieldNames) !== null && _a !== void 0 ? _a : fieldNamesFromFieldIds(snap.fieldIds);
|
|
4256
4629
|
if (options.fields != null) {
|
|
4257
4630
|
assertFieldsMatchSnapshot(options.fields, snap.fieldIds);
|
|
@@ -4279,7 +4652,7 @@ class FrozenMiniSearch {
|
|
|
4279
4652
|
fieldCount: snap.fieldCount,
|
|
4280
4653
|
externalIds: snap.externalIds,
|
|
4281
4654
|
idLookup,
|
|
4282
|
-
storedFields: snap.storedFields,
|
|
4655
|
+
storedFields: (_c = snap.storedFieldsLayout) !== null && _c !== void 0 ? _c : storedFieldsFromRows(snap.storedFields, opts.storeFields),
|
|
4283
4656
|
fieldLengthMatrix: snap.fieldLengthMatrix,
|
|
4284
4657
|
avgFieldLength: snap.avgFieldLength,
|
|
4285
4658
|
index,
|
|
@@ -4334,4 +4707,4 @@ class FrozenMiniSearch {
|
|
|
4334
4707
|
}
|
|
4335
4708
|
FrozenMiniSearch.wildcard = WILDCARD_QUERY;
|
|
4336
4709
|
|
|
4337
|
-
export { AND, AND_NOT, FrozenIndexBuilder, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
|
|
4710
|
+
export { AND, AND_NOT, FrozenIndexBuilder, FrozenMiniSearch, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
|