@yoch/frozenminisearch 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/README.md +41 -11
- package/dist/cjs/index.cjs +357 -167
- package/dist/cjs/index.require.cjs +1 -0
- package/dist/es/index.d.ts +8 -6
- package/dist/es/index.js +357 -168
- package/package.json +5 -2
package/dist/cjs/index.cjs
CHANGED
|
@@ -1911,111 +1911,6 @@ function materializeFrozenPostings(params) {
|
|
|
1911
1911
|
sparseLengths: new Uint32Array(sparseLengths),
|
|
1912
1912
|
};
|
|
1913
1913
|
}
|
|
1914
|
-
/** One-pass materialize from {@link FrozenIndexBuilder} scratch (counts known upfront). */
|
|
1915
|
-
function materializeFrozenPostingsFromBuilder(state, nextId) {
|
|
1916
|
-
var _a;
|
|
1917
|
-
const { fieldCount, termCount, postingsDocIds, postingsFreqs, totalPostings, maxFreq } = state;
|
|
1918
|
-
const layout = choosePostingsLayout(fieldCount);
|
|
1919
|
-
const docIdWidth = nextId <= 65535 ? 16 : 32;
|
|
1920
|
-
const allDocIds = docIdWidth === 16
|
|
1921
|
-
? new Uint16Array(totalPostings)
|
|
1922
|
-
: new Uint32Array(totalPostings);
|
|
1923
|
-
const allFreqs = allocateFreqs(totalPostings, maxFreq);
|
|
1924
|
-
if (layout === 'dense') {
|
|
1925
|
-
const slotCount = termCount * fieldCount;
|
|
1926
|
-
const denseOffsets = new Uint32Array(slotCount);
|
|
1927
|
-
const denseLengths = new Uint32Array(slotCount);
|
|
1928
|
-
let write = 0;
|
|
1929
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1930
|
-
const base = ti * fieldCount;
|
|
1931
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1932
|
-
const slot = base + f;
|
|
1933
|
-
const docIds = postingsDocIds[slot];
|
|
1934
|
-
const freqs = postingsFreqs[slot];
|
|
1935
|
-
const len = (_a = docIds === null || docIds === void 0 ? void 0 : docIds.length) !== null && _a !== void 0 ? _a : 0;
|
|
1936
|
-
denseOffsets[slot] = write;
|
|
1937
|
-
denseLengths[slot] = len;
|
|
1938
|
-
for (let i = 0; i < len; i++) {
|
|
1939
|
-
const docId = docIds[i];
|
|
1940
|
-
if (docIdWidth === 16) {
|
|
1941
|
-
allDocIds[write] = docId;
|
|
1942
|
-
}
|
|
1943
|
-
else {
|
|
1944
|
-
allDocIds[write] = docId;
|
|
1945
|
-
}
|
|
1946
|
-
allFreqs[write] = freqs[i];
|
|
1947
|
-
write++;
|
|
1948
|
-
}
|
|
1949
|
-
}
|
|
1950
|
-
}
|
|
1951
|
-
return {
|
|
1952
|
-
fieldCount,
|
|
1953
|
-
termCount,
|
|
1954
|
-
nextId,
|
|
1955
|
-
layout,
|
|
1956
|
-
docIdWidth,
|
|
1957
|
-
sparseFieldIdWidth: null,
|
|
1958
|
-
allDocIds,
|
|
1959
|
-
allFreqs,
|
|
1960
|
-
denseOffsets,
|
|
1961
|
-
denseLengths,
|
|
1962
|
-
sparseTermStarts: null,
|
|
1963
|
-
sparseFieldIds: null,
|
|
1964
|
-
sparseOffsets: null,
|
|
1965
|
-
sparseLengths: null,
|
|
1966
|
-
};
|
|
1967
|
-
}
|
|
1968
|
-
const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
|
|
1969
|
-
const sparseFieldIdsScratch = [];
|
|
1970
|
-
const sparseOffsets = [];
|
|
1971
|
-
const sparseLengths = [];
|
|
1972
|
-
const termStarts = new Array(termCount + 1).fill(0);
|
|
1973
|
-
let write = 0;
|
|
1974
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1975
|
-
termStarts[ti] = sparseFieldIdsScratch.length;
|
|
1976
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1977
|
-
const slot = ti * fieldCount + f;
|
|
1978
|
-
const docIds = postingsDocIds[slot];
|
|
1979
|
-
if (docIds == null || docIds.length === 0)
|
|
1980
|
-
continue;
|
|
1981
|
-
const freqs = postingsFreqs[slot];
|
|
1982
|
-
sparseFieldIdsScratch.push(f);
|
|
1983
|
-
sparseOffsets.push(write);
|
|
1984
|
-
sparseLengths.push(docIds.length);
|
|
1985
|
-
for (let i = 0; i < docIds.length; i++) {
|
|
1986
|
-
const docId = docIds[i];
|
|
1987
|
-
if (docIdWidth === 16) {
|
|
1988
|
-
allDocIds[write] = docId;
|
|
1989
|
-
}
|
|
1990
|
-
else {
|
|
1991
|
-
allDocIds[write] = docId;
|
|
1992
|
-
}
|
|
1993
|
-
allFreqs[write] = freqs[i];
|
|
1994
|
-
write++;
|
|
1995
|
-
}
|
|
1996
|
-
}
|
|
1997
|
-
termStarts[ti + 1] = sparseFieldIdsScratch.length;
|
|
1998
|
-
}
|
|
1999
|
-
const sparseFieldIds = sparseFieldIdWidth === 16
|
|
2000
|
-
? new Uint16Array(sparseFieldIdsScratch)
|
|
2001
|
-
: new Uint8Array(sparseFieldIdsScratch);
|
|
2002
|
-
return {
|
|
2003
|
-
fieldCount,
|
|
2004
|
-
termCount,
|
|
2005
|
-
nextId,
|
|
2006
|
-
layout,
|
|
2007
|
-
docIdWidth,
|
|
2008
|
-
sparseFieldIdWidth,
|
|
2009
|
-
allDocIds,
|
|
2010
|
-
allFreqs,
|
|
2011
|
-
denseOffsets: null,
|
|
2012
|
-
denseLengths: null,
|
|
2013
|
-
sparseTermStarts: new Uint32Array(termStarts),
|
|
2014
|
-
sparseFieldIds,
|
|
2015
|
-
sparseOffsets: new Uint32Array(sparseOffsets),
|
|
2016
|
-
sparseLengths: new Uint32Array(sparseLengths),
|
|
2017
|
-
};
|
|
2018
|
-
}
|
|
2019
1914
|
function postingsTypedBytes(layout) {
|
|
2020
1915
|
const allDocIdsBytes = layout.allDocIds.byteLength;
|
|
2021
1916
|
const allFreqsBytes = layout.allFreqs.byteLength;
|
|
@@ -2232,21 +2127,110 @@ function buildFieldIds(fields) {
|
|
|
2232
2127
|
}
|
|
2233
2128
|
return fieldIds;
|
|
2234
2129
|
}
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2130
|
+
/**
 * Fold the result of one `processTerm` call into a per-field frequency map.
 *
 * @param localFreqs Map from term to occurrence count; mutated in place.
 * @param processedTerm Either a single term or an array of expanded terms.
 *   A falsy scalar ('' / null / undefined / false) is dropped; array elements
 *   are counted exactly as given, without any falsy filtering.
 */
function accumulateProcessedTerm(localFreqs, processedTerm) {
    if (!Array.isArray(processedTerm)) {
        // Scalar result: a falsy value means "discard this token".
        if (processedTerm) {
            const seenSoFar = localFreqs.get(processedTerm) || 0;
            localFreqs.set(processedTerm, seenSoFar + 1);
        }
        return;
    }
    // Array result: one token expanded into several terms, each counted once.
    for (let i = 0; i < processedTerm.length; i++) {
        const expanded = processedTerm[i];
        const seenSoFar = localFreqs.get(expanded) || 0;
        localFreqs.set(expanded, seenSoFar + 1);
    }
}
|
|
2140
|
+
/**
 * Rebuild `localFreqs` from scratch for one document field.
 *
 * The map is cleared, then every token is run through `processTerm` and the
 * result is counted via `accumulateProcessedTerm`.
 *
 * @param localFreqs Reusable term -> count map (emptied on entry).
 * @param tokens Iterable of raw tokens for the field.
 * @param fieldName Passed through to `processTerm` as its second argument.
 * @param processTerm Term normalizer; may return a term, an array of terms, or a falsy value.
 * @returns The number of distinct processed terms, i.e. `localFreqs.size`.
 */
function collectFieldTermFreqsInto(localFreqs, tokens, fieldName, processTerm) {
    localFreqs.clear();
    for (const rawToken of tokens) {
        const processed = processTerm(rawToken, fieldName);
        accumulateProcessedTerm(localFreqs, processed);
    }
    const distinctTerms = localFreqs.size;
    return distinctTerms;
}
|
|
2151
|
+
/** Global delimiter pattern for incremental `exec` (must not reuse {@link SPACE_OR_PUNCTUATION} — no `g` flag). */
|
|
2152
|
+
const DEFAULT_TOKENIZE_DELIMITERS = /[\n\r\p{Z}\p{P}]+/gu;
|
|
2153
|
+
const defaultTokenizeProbe = 'a b';
|
|
2154
|
+
const defaultTokenizeProbeField = 'f';
|
|
2155
|
+
const tokenizeBehaviorCache = new WeakMap();
|
|
2156
|
+
/**
|
|
2157
|
+
* True when `tokenize` matches the library default (reference equality or split-equivalent
|
|
2158
|
+
* on a fixed probe). Custom tokenizers that pass the probe but diverge on other inputs
|
|
2159
|
+
* (e.g. leading delimiters) still take the fast path — use the default reference in prod.
|
|
2160
|
+
*/
|
|
2161
|
+
function isDefaultTokenize(tokenize) {
|
|
2162
|
+
if (tokenize === defaultFrozenLoadOptions.tokenize)
|
|
2163
|
+
return true;
|
|
2164
|
+
const cached = tokenizeBehaviorCache.get(tokenize);
|
|
2165
|
+
if (cached != null)
|
|
2166
|
+
return cached;
|
|
2167
|
+
const splitTokens = defaultTokenizeProbe.split(SPACE_OR_PUNCTUATION);
|
|
2168
|
+
const customTokens = tokenize(defaultTokenizeProbe, defaultTokenizeProbeField);
|
|
2169
|
+
const ok = splitTokens.length === customTokens.length
|
|
2170
|
+
&& splitTokens.every((t, i) => t === customTokens[i]);
|
|
2171
|
+
tokenizeBehaviorCache.set(tokenize, ok);
|
|
2172
|
+
return ok;
|
|
2173
|
+
}
|
|
2174
|
+
function forEachDefaultToken(text, onToken) {
|
|
2175
|
+
if (text.length === 0) {
|
|
2176
|
+
onToken('');
|
|
2177
|
+
return;
|
|
2178
|
+
}
|
|
2179
|
+
let start = 0;
|
|
2180
|
+
const re = DEFAULT_TOKENIZE_DELIMITERS;
|
|
2181
|
+
re.lastIndex = 0;
|
|
2182
|
+
let match;
|
|
2183
|
+
while ((match = re.exec(text)) !== null) {
|
|
2184
|
+
if (match.index > start) {
|
|
2185
|
+
onToken(text.slice(start, match.index));
|
|
2244
2186
|
}
|
|
2245
|
-
else if (
|
|
2246
|
-
|
|
2187
|
+
else if (match.index === start) {
|
|
2188
|
+
onToken('');
|
|
2247
2189
|
}
|
|
2190
|
+
start = match.index + match[0].length;
|
|
2191
|
+
}
|
|
2192
|
+
if (start < text.length) {
|
|
2193
|
+
onToken(text.slice(start));
|
|
2194
|
+
}
|
|
2195
|
+
else if (start === 0) {
|
|
2196
|
+
onToken(text);
|
|
2197
|
+
}
|
|
2198
|
+
else if (start === text.length) {
|
|
2199
|
+
onToken('');
|
|
2248
2200
|
}
|
|
2249
|
-
|
|
2201
|
+
}
|
|
2202
|
+
/**
 * Run the default tokenizer over `text`, writing the tokens into the caller-owned
 * array `out` (emptied first) instead of allocating a new array per call.
 *
 * @param out Reusable token buffer; its previous contents are discarded.
 * @param text Field text to tokenize.
 */
function tokenizeDefaultInto(out, text) {
    out.length = 0;
    const appendToken = (token) => {
        out.push(token);
    };
    forEachDefaultToken(text, appendToken);
}
|
|
2207
|
+
/**
 * Tokenize field text into `out` (reused). Fast path when `tokenize` is the library default.
 *
 * @param out Reusable token buffer; its previous contents are discarded.
 * @param tokenize Tokenizer function `(text, fieldName) => iterable of tokens`.
 * @param text Field text to tokenize.
 * @param fieldName Passed through to a custom tokenizer as its second argument.
 */
function tokenizeFieldInto(out, tokenize, text, fieldName) {
    if (isDefaultTokenize(tokenize)) {
        tokenizeDefaultInto(out, text);
        return;
    }
    const tokens = tokenize(text, fieldName);
    out.length = 0;
    // Copy element by element. `out.push(...tokens)` passes every token as a separate call argument
    // and throws a RangeError once a field yields more tokens than the engine's argument limit.
    for (const token of tokens) {
        out.push(token);
    }
}
|
|
2217
|
+
/**
 * Default-tokenizer variant of the per-field frequency pass: tokens are streamed from
 * `forEachDefaultToken` straight into `localFreqs`, with no intermediate token array.
 *
 * @param localFreqs Reusable term -> count map (emptied on entry).
 * @param text Field text to tokenize.
 * @param fieldName Passed through to `processTerm` as its second argument.
 * @param processTerm Term normalizer applied to every token.
 * @returns The number of distinct processed terms, i.e. `localFreqs.size`.
 */
function collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm) {
    localFreqs.clear();
    const countToken = (token) => {
        const processed = processTerm(token, fieldName);
        accumulateProcessedTerm(localFreqs, processed);
    };
    forEachDefaultToken(text, countToken);
    return localFreqs.size;
}
|
|
2224
|
+
/**
 * Compute the term frequencies of one field into `localFreqs`.
 *
 * With the default tokenizer, tokenizing and counting happen in a single streaming pass.
 * With a custom tokenizer, the tokens are first materialized into `tokenScratch` and then
 * counted (two phases); `tokenScratch` is untouched on the default path.
 *
 * @returns The number of distinct processed terms for the field.
 */
function collectFieldTermFreqsFromFieldInto(localFreqs, tokenScratch, tokenize, text, fieldName, processTerm) {
    if (!isDefaultTokenize(tokenize)) {
        // Custom tokenizer: materialize its tokens, then count them.
        tokenizeFieldInto(tokenScratch, tokenize, text, fieldName);
        return collectFieldTermFreqsInto(localFreqs, tokenScratch, fieldName, processTerm);
    }
    return collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm);
}
|
|
2251
2235
|
/** Same running average as {@link MiniSearch} private addFieldLength. */
|
|
2252
2236
|
function updateAvgFieldLength(avgFieldLength, fieldId, count, length) {
|
|
@@ -3446,50 +3430,260 @@ async function decodeFrozenSnapshotAsync(buf) {
|
|
|
3446
3430
|
return decodeFrozenSnapshot(buf);
|
|
3447
3431
|
}
|
|
3448
3432
|
|
|
3449
|
-
|
|
3433
|
+
const DEFAULT_CAPACITY = 16;
|
|
3434
|
+
/** Growable unsigned 32-bit column (build scratch; narrowed to u16 at finalize when possible). */
|
|
3435
|
+
class GrowableUint32Column {
|
|
3436
|
+
constructor(initialCapacity = DEFAULT_CAPACITY) {
|
|
3437
|
+
this._len = 0;
|
|
3438
|
+
this._buf = new Uint32Array(Math.max(1, initialCapacity));
|
|
3439
|
+
}
|
|
3440
|
+
get length() {
|
|
3441
|
+
return this._len;
|
|
3442
|
+
}
|
|
3443
|
+
push(value) {
|
|
3444
|
+
if (this._len >= this._buf.length) {
|
|
3445
|
+
const grown = new Uint32Array(Math.max(1, this._buf.length * 2));
|
|
3446
|
+
grown.set(this._buf);
|
|
3447
|
+
this._buf = grown;
|
|
3448
|
+
}
|
|
3449
|
+
this._buf[this._len++] = value;
|
|
3450
|
+
}
|
|
3451
|
+
copyRangeInto(sourceOffset, length, target, targetOffset, docIdWidth) {
|
|
3452
|
+
if (docIdWidth === 16) {
|
|
3453
|
+
const out = target;
|
|
3454
|
+
for (let i = 0; i < length; i++)
|
|
3455
|
+
out[targetOffset + i] = this._buf[sourceOffset + i];
|
|
3456
|
+
}
|
|
3457
|
+
else {
|
|
3458
|
+
const out = target;
|
|
3459
|
+
for (let i = 0; i < length; i++)
|
|
3460
|
+
out[targetOffset + i] = this._buf[sourceOffset + i];
|
|
3461
|
+
}
|
|
3462
|
+
}
|
|
3463
|
+
truncate(length) {
|
|
3464
|
+
this._len = length;
|
|
3465
|
+
if (length > 0 && length < this._buf.length) {
|
|
3466
|
+
this._buf = this._buf.slice(0, length);
|
|
3467
|
+
}
|
|
3468
|
+
}
|
|
3469
|
+
}
|
|
3470
|
+
/** Growable frequency column (u16 cells; matches frozen clamp range). */
|
|
3471
|
+
class GrowableFreqColumn {
|
|
3472
|
+
constructor(initialCapacity = DEFAULT_CAPACITY) {
|
|
3473
|
+
this._len = 0;
|
|
3474
|
+
this._buf = new Uint16Array(Math.max(1, initialCapacity));
|
|
3475
|
+
}
|
|
3476
|
+
get length() {
|
|
3477
|
+
return this._len;
|
|
3478
|
+
}
|
|
3479
|
+
push(freq) {
|
|
3480
|
+
if (this._len >= this._buf.length) {
|
|
3481
|
+
const grown = new Uint16Array(Math.max(1, this._buf.length * 2));
|
|
3482
|
+
grown.set(this._buf);
|
|
3483
|
+
this._buf = grown;
|
|
3484
|
+
}
|
|
3485
|
+
this._buf[this._len++] = clampFreq(freq);
|
|
3486
|
+
}
|
|
3487
|
+
copyRangeInto(sourceOffset, length, target, targetOffset) {
|
|
3488
|
+
for (let i = 0; i < length; i++) {
|
|
3489
|
+
target[targetOffset + i] = this._buf[sourceOffset + i];
|
|
3490
|
+
}
|
|
3491
|
+
}
|
|
3492
|
+
truncate(length) {
|
|
3493
|
+
this._len = length;
|
|
3494
|
+
if (length > 0 && length < this._buf.length) {
|
|
3495
|
+
this._buf = this._buf.slice(0, length);
|
|
3496
|
+
}
|
|
3497
|
+
}
|
|
3498
|
+
}
|
|
3499
|
+
/**
 * Single-pass postings accumulator for {@link FrozenIndexBuilder}.
 * One global TypedArray stream per docIds/freqs; per-slot range metadata only.
 *
 * Postings are appended to the two global columns in arrival order. For each
 * `(termIndex, fieldId)` slot only a list of contiguous ranges into those columns is kept
 * (`starts[i]`, `lengths[i]`); `finalize` stitches the ranges of each slot together into the
 * final, slot-ordered arrays.
 */
class IncrementalPostingsAccumulator {
    /**
     * @param fieldCount Number of indexed fields; slot id = `termIndex * fieldCount + fieldId`.
     * @param hints Optional `{ estimatedTotalPostings }` used to pre-size the global columns
     *   (never below `DEFAULT_CAPACITY`).
     */
    constructor(fieldCount, hints) {
        this._slots = new Map();
        this._totalPostings = 0;
        this._maxFreq = 0;
        this._fieldCount = fieldCount;
        const estimated = hints != null && hints.estimatedTotalPostings != null
            ? hints.estimatedTotalPostings
            : 0;
        const cap = Math.max(DEFAULT_CAPACITY, estimated);
        this._docIds = new GrowableUint32Column(cap);
        this._freqs = new GrowableFreqColumn(cap);
    }
    /** Number of postings appended so far. */
    get totalPostings() {
        return this._totalPostings;
    }
    /** Largest clamped frequency appended so far. */
    get maxFreq() {
        return this._maxFreq;
    }
    /**
     * Record one posting for `(termIndex, fieldId)`.
     * The posting extends the slot's last range when it lands directly after it in the global
     * stream; otherwise a new one-element range is started.
     */
    append(termIndex, fieldId, docId, freq) {
        const slot = termIndex * this._fieldCount + fieldId;
        const writeIdx = this._docIds.length;
        this._docIds.push(docId);
        const v = clampFreq(freq);
        this._freqs.push(v);
        if (v > this._maxFreq)
            this._maxFreq = v;
        this._totalPostings++;
        let ranges = this._slots.get(slot);
        if (ranges == null) {
            ranges = { starts: [writeIdx], lengths: [1] };
            this._slots.set(slot, ranges);
            return;
        }
        const last = ranges.starts.length - 1;
        const end = ranges.starts[last] + ranges.lengths[last];
        if (end === writeIdx) {
            ranges.lengths[last]++;
        }
        else {
            ranges.starts.push(writeIdx);
            ranges.lengths.push(1);
        }
    }
    /**
     * Forget all per-slot ranges and release the global scratch columns.
     * The columns are replaced by fresh minimal ones so the (potentially large) scratch buffers
     * become collectable regardless of how the column classes implement truncation.
     * `totalPostings` / `maxFreq` are not reset here (unchanged from the original behaviour).
     */
    clear() {
        this._slots.clear();
        this._docIds = new GrowableUint32Column(1);
        this._freqs = new GrowableFreqColumn(1);
    }
    /**
     * Copy every range of one slot, in recorded order, into the output arrays starting at `write`.
     * @returns The write position after the last copied posting.
     */
    copySlot(ranges, allDocIds, allFreqs, write, docIdWidth) {
        for (let r = 0; r < ranges.starts.length; r++) {
            const start = ranges.starts[r];
            const len = ranges.lengths[r];
            this._docIds.copyRangeInto(start, len, allDocIds, write, docIdWidth);
            this._freqs.copyRangeInto(start, len, allFreqs, write);
            write += len;
        }
        return write;
    }
    /** Total number of postings recorded for a slot (sum of its range lengths). */
    slotLength(ranges) {
        let n = 0;
        for (let i = 0; i < ranges.lengths.length; i++)
            n += ranges.lengths[i];
        return n;
    }
    /**
     * Materialize the accumulated postings into the frozen layout object and release all scratch.
     *
     * @param termCount Number of distinct terms (term indexes are `0 .. termCount - 1`).
     * @param nextId Stored as `nextId` in the result; doc ids are written as 16-bit values
     *   when `nextId <= 65535`, 32-bit otherwise.
     * @returns The postings layout: `dense` (offset/length per slot) or sparse
     *   (per-term list of non-empty fields), as chosen by `choosePostingsLayout(fieldCount)`.
     */
    finalize(termCount, nextId) {
        const fieldCount = this._fieldCount;
        const totalPostings = this._totalPostings;
        const maxFreq = this._maxFreq;
        const slots = this._slots;
        const layout = choosePostingsLayout(fieldCount);
        const docIdWidth = nextId <= 65535 ? 16 : 32;
        const allDocIds = docIdWidth === 16
            ? new Uint16Array(totalPostings)
            : new Uint32Array(totalPostings);
        const allFreqs = allocateFreqs(totalPostings, maxFreq);
        if (layout === 'dense') {
            const slotCount = termCount * fieldCount;
            const denseOffsets = new Uint32Array(slotCount);
            const denseLengths = new Uint32Array(slotCount);
            let write = 0;
            // Slots are visited in (term, field) order, i.e. ascending slot id.
            for (let slot = 0; slot < slotCount; slot++) {
                const ranges = slots.get(slot);
                const len = ranges == null ? 0 : this.slotLength(ranges);
                denseOffsets[slot] = write;
                denseLengths[slot] = len;
                if (len > 0) {
                    write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
                    // Release the range metadata as soon as the slot has been copied.
                    slots.delete(slot);
                }
            }
            this.clear();
            return {
                fieldCount,
                termCount,
                nextId,
                layout,
                docIdWidth,
                sparseFieldIdWidth: null,
                allDocIds,
                allFreqs,
                denseOffsets,
                denseLengths,
                sparseTermStarts: null,
                sparseFieldIds: null,
                sparseOffsets: null,
                sparseLengths: null,
            };
        }
        const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
        const sparseFieldIdsScratch = [];
        const sparseOffsets = [];
        const sparseLengths = [];
        // termStarts[ti] .. termStarts[ti + 1] delimits the entries of term `ti` in the sparse arrays.
        const termStarts = new Uint32Array(termCount + 1);
        let write = 0;
        for (let ti = 0; ti < termCount; ti++) {
            termStarts[ti] = sparseFieldIdsScratch.length;
            for (let f = 0; f < fieldCount; f++) {
                const slot = ti * fieldCount + f;
                const ranges = slots.get(slot);
                const len = ranges == null ? 0 : this.slotLength(ranges);
                if (len === 0)
                    continue;
                sparseFieldIdsScratch.push(f);
                sparseOffsets.push(write);
                sparseLengths.push(len);
                write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
                // Release the range metadata as soon as the slot has been copied.
                slots.delete(slot);
            }
            termStarts[ti + 1] = sparseFieldIdsScratch.length;
        }
        this.clear();
        const sparseFieldIds = sparseFieldIdWidth === 16
            ? new Uint16Array(sparseFieldIdsScratch)
            : new Uint8Array(sparseFieldIdsScratch);
        return {
            fieldCount,
            termCount,
            nextId,
            layout,
            docIdWidth,
            sparseFieldIdWidth,
            allDocIds,
            allFreqs,
            denseOffsets: null,
            denseLengths: null,
            sparseTermStarts: termStarts,
            sparseFieldIds,
            sparseOffsets: new Uint32Array(sparseOffsets),
            sparseLengths: new Uint32Array(sparseLengths),
        };
    }
}
|
|
3661
|
+
|
|
3662
|
+
/**
 * Look up the numeric index of `term`, assigning the next free index on first sight.
 *
 * @param termCount Mutable counter box `{ value }`; `value` is the next index to hand out
 *   and is incremented when a new term is registered.
 * @param index Map-like term -> index store (`get` / `set`).
 * @param term Term to resolve.
 * @returns The existing index of `term`, or the newly assigned one.
 */
function getOrCreateTermIndex(termCount, index, term) {
    const known = index.get(term);
    if (known == null) {
        const assigned = termCount.value;
        termCount.value = assigned + 1;
        index.set(term, assigned);
        return assigned;
    }
    return known;
}
|
|
3458
|
-
function appendPosting(state, termIndex, fieldId, docId, freq) {
|
|
3459
|
-
const slot = termIndex * state.fieldCount + fieldId;
|
|
3460
|
-
let docIds = state.postingsDocIds[slot];
|
|
3461
|
-
if (docIds == null) {
|
|
3462
|
-
docIds = [];
|
|
3463
|
-
state.postingsDocIds[slot] = docIds;
|
|
3464
|
-
state.postingsFreqs[slot] = [];
|
|
3465
|
-
}
|
|
3466
|
-
docIds.push(docId);
|
|
3467
|
-
state.postingsFreqs[slot].push(freq);
|
|
3468
|
-
const v = clampFreq(freq);
|
|
3469
|
-
if (v > state.maxFreq)
|
|
3470
|
-
state.maxFreq = v;
|
|
3471
|
-
state.totalPostings++;
|
|
3472
|
-
}
|
|
3473
|
-
function finalizeFlatPostings(state, nextId) {
|
|
3474
|
-
return materializeFrozenPostingsFromBuilder({
|
|
3475
|
-
fieldCount: state.fieldCount,
|
|
3476
|
-
termCount: state.terms.length,
|
|
3477
|
-
postingsDocIds: state.postingsDocIds,
|
|
3478
|
-
postingsFreqs: state.postingsFreqs,
|
|
3479
|
-
totalPostings: state.totalPostings,
|
|
3480
|
-
maxFreq: state.maxFreq,
|
|
3481
|
-
}, nextId);
|
|
3482
|
-
}
|
|
3483
3671
|
/** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
|
|
3484
3672
|
class FrozenIndexBuilder {
|
|
3485
3673
|
constructor(options, hints) {
|
|
3674
|
+
var _a, _b;
|
|
3675
|
+
this._termCount = { value: 0 };
|
|
3676
|
+
this._fieldTermFreqScratch = new Map();
|
|
3677
|
+
this._tokenScratch = [];
|
|
3486
3678
|
this._options = resolveIndexingOptions(options);
|
|
3487
3679
|
this._fieldIds = buildFieldIds(this._options.fields);
|
|
3488
3680
|
this._fieldCount = this._options.fields.length;
|
|
3489
3681
|
this._index = new SearchableMap();
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
this.
|
|
3682
|
+
const estimatedDocs = (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount) !== null && _a !== void 0 ? _a : 0;
|
|
3683
|
+
const perSlot = (_b = hints === null || hints === void 0 ? void 0 : hints.estimatedPostingsPerSlot) !== null && _b !== void 0 ? _b : 4;
|
|
3684
|
+
this._postings = new IncrementalPostingsAccumulator(this._fieldCount, {
|
|
3685
|
+
estimatedTotalPostings: estimatedDocs > 0 ? estimatedDocs * perSlot : undefined,
|
|
3686
|
+
});
|
|
3493
3687
|
this._avgFieldLength = [];
|
|
3494
3688
|
this._seenIds = new Set();
|
|
3495
3689
|
this._nextId = 0;
|
|
@@ -3505,14 +3699,6 @@ class FrozenIndexBuilder {
|
|
|
3505
3699
|
this._storedFields = [];
|
|
3506
3700
|
this._fieldLengthData = [];
|
|
3507
3701
|
}
|
|
3508
|
-
this._postingsState = {
|
|
3509
|
-
fieldCount: this._fieldCount,
|
|
3510
|
-
terms: this._terms,
|
|
3511
|
-
postingsDocIds: this._postingsDocIds,
|
|
3512
|
-
postingsFreqs: this._postingsFreqs,
|
|
3513
|
-
totalPostings: 0,
|
|
3514
|
-
maxFreq: 0,
|
|
3515
|
-
};
|
|
3516
3702
|
}
|
|
3517
3703
|
/** Number of documents indexed so far (not yet frozen). */
|
|
3518
3704
|
get documentCount() {
|
|
@@ -3539,16 +3725,17 @@ class FrozenIndexBuilder {
|
|
|
3539
3725
|
const fieldValue = extractField(document, field);
|
|
3540
3726
|
if (fieldValue == null)
|
|
3541
3727
|
continue;
|
|
3542
|
-
const
|
|
3728
|
+
const fieldText = typeof fieldValue === 'string'
|
|
3729
|
+
? fieldValue
|
|
3730
|
+
: stringifyField(fieldValue, field);
|
|
3543
3731
|
const fieldId = this._fieldIds[field];
|
|
3544
|
-
const uniqueTerms =
|
|
3545
|
-
const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
|
|
3732
|
+
const uniqueTerms = collectFieldTermFreqsFromFieldInto(this._fieldTermFreqScratch, this._tokenScratch, tokenize, fieldText, field, processTerm);
|
|
3546
3733
|
this._fieldLengthData[shortId * this._fieldCount + fieldId] = uniqueTerms;
|
|
3547
3734
|
updateAvgFieldLength(this._avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
|
|
3548
|
-
|
|
3549
|
-
const ti = getOrCreateTermIndex(this.
|
|
3550
|
-
|
|
3551
|
-
}
|
|
3735
|
+
this._fieldTermFreqScratch.forEach((freq, term) => {
|
|
3736
|
+
const ti = getOrCreateTermIndex(this._termCount, this._index, term);
|
|
3737
|
+
this._postings.append(ti, fieldId, shortId, freq);
|
|
3738
|
+
});
|
|
3552
3739
|
}
|
|
3553
3740
|
}
|
|
3554
3741
|
/**
|
|
@@ -3605,7 +3792,11 @@ class FrozenIndexBuilder {
|
|
|
3605
3792
|
}
|
|
3606
3793
|
this._frozen = true;
|
|
3607
3794
|
const documentCount = this._nextId;
|
|
3608
|
-
const
|
|
3795
|
+
const termCount = this._termCount.value;
|
|
3796
|
+
const postings = this._postings.finalize(termCount, documentCount);
|
|
3797
|
+
const radixTree = this._index.radixTree;
|
|
3798
|
+
this._index = null;
|
|
3799
|
+
const index = fromRadixTree(radixTree, termCount);
|
|
3609
3800
|
const avgFieldLength = new Float32Array(this._fieldCount);
|
|
3610
3801
|
for (let f = 0; f < this._fieldCount; f++) {
|
|
3611
3802
|
avgFieldLength[f] = (_a = this._avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
|
|
@@ -3618,8 +3809,6 @@ class FrozenIndexBuilder {
|
|
|
3618
3809
|
? this._storedFields.slice(0, documentCount)
|
|
3619
3810
|
: this._storedFields;
|
|
3620
3811
|
const idLookup = createIdToShortIdLookup(externalIds, documentCount);
|
|
3621
|
-
// Incremental builder: numeric radix leaves + build-time terms[] for postings.
|
|
3622
|
-
// freezeFromMiniSearch packs Map leaves in one radix pass (no resident terms[]).
|
|
3623
3812
|
return {
|
|
3624
3813
|
options: this._options,
|
|
3625
3814
|
documentCount,
|
|
@@ -3631,8 +3820,8 @@ class FrozenIndexBuilder {
|
|
|
3631
3820
|
storedFields,
|
|
3632
3821
|
fieldLengthMatrix: materializeFieldLengthMatrix(this._fieldLengthData, documentCount * this._fieldCount),
|
|
3633
3822
|
avgFieldLength,
|
|
3634
|
-
index
|
|
3635
|
-
termCount
|
|
3823
|
+
index,
|
|
3824
|
+
termCount,
|
|
3636
3825
|
postings,
|
|
3637
3826
|
};
|
|
3638
3827
|
}
|
|
@@ -4341,6 +4530,7 @@ FrozenMiniSearch.wildcard = WILDCARD_QUERY;
|
|
|
4341
4530
|
exports.AND = AND;
|
|
4342
4531
|
exports.AND_NOT = AND_NOT;
|
|
4343
4532
|
exports.FrozenIndexBuilder = FrozenIndexBuilder;
|
|
4533
|
+
exports.FrozenMiniSearch = FrozenMiniSearch;
|
|
4344
4534
|
exports.OR = OR;
|
|
4345
4535
|
exports.assembleFrozen = assembleFrozen;
|
|
4346
4536
|
exports.buildFrozenFromDocuments = buildFrozenFromDocuments;
|
package/dist/es/index.d.ts
CHANGED
|
@@ -494,22 +494,24 @@ type MiniSearchSnapshot = {
|
|
|
494
494
|
interface FrozenIndexBuilderHints {
|
|
495
495
|
/** Pre-size per-document arrays when the final document count is known. */
|
|
496
496
|
estimatedDocumentCount?: number;
|
|
497
|
+
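/** Multiplier applied to `estimatedDocumentCount` to pre-size the builder's shared growable posting columns (initial capacity = estimatedDocumentCount × this value). Defaults to 4; has no effect unless `estimatedDocumentCount` is greater than 0. */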
|
|
498
|
+
estimatedPostingsPerSlot?: number;
|
|
497
499
|
}
|
|
498
500
|
/** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
|
|
499
501
|
declare class FrozenIndexBuilder<T> {
|
|
500
502
|
private readonly _options;
|
|
501
503
|
private readonly _fieldIds;
|
|
502
504
|
private readonly _fieldCount;
|
|
503
|
-
private
|
|
504
|
-
private readonly
|
|
505
|
-
private readonly
|
|
506
|
-
private readonly _postingsFreqs;
|
|
505
|
+
private _index;
|
|
506
|
+
private readonly _postings;
|
|
507
|
+
private readonly _termCount;
|
|
507
508
|
private readonly _externalIds;
|
|
508
509
|
private readonly _storedFields;
|
|
509
510
|
private readonly _fieldLengthData;
|
|
510
511
|
private readonly _avgFieldLength;
|
|
511
|
-
private readonly _postingsState;
|
|
512
512
|
private readonly _seenIds;
|
|
513
|
+
private readonly _fieldTermFreqScratch;
|
|
514
|
+
private readonly _tokenScratch;
|
|
513
515
|
private _nextId;
|
|
514
516
|
private _frozen;
|
|
515
517
|
constructor(options: Options<T>, hints?: FrozenIndexBuilderHints);
|
|
@@ -616,5 +618,5 @@ declare class FrozenMiniSearch<T = any> {
|
|
|
616
618
|
private executeQuery;
|
|
617
619
|
}
|
|
618
620
|
|
|
619
|
-
export { AND, AND_NOT, FrozenIndexBuilder, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
|
|
621
|
+
export { AND, AND_NOT, FrozenIndexBuilder, FrozenMiniSearch, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
|
|
620
622
|
export type { BM25Params, CombinationOperator, FrozenAssembleParams, FrozenIndexBuilderHints, FrozenMemoryBreakdown, LogLevel, LowercaseCombinationOperator, MatchInfo, MiniSearchSnapshot, Options, Query, QueryCombination, SearchOptions, SearchResult, SerializedIndexEntry, Suggestion, Wildcard };
|