@yoch/frozenminisearch 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/README.md +41 -11
- package/dist/cjs/index.cjs +357 -167
- package/dist/cjs/index.require.cjs +1 -0
- package/dist/es/index.d.ts +8 -6
- package/dist/es/index.js +357 -168
- package/package.json +5 -2
package/dist/es/index.js
CHANGED
|
@@ -1907,111 +1907,6 @@ function materializeFrozenPostings(params) {
|
|
|
1907
1907
|
sparseLengths: new Uint32Array(sparseLengths),
|
|
1908
1908
|
};
|
|
1909
1909
|
}
|
|
1910
|
-
/** One-pass materialize from {@link FrozenIndexBuilder} scratch (counts known upfront). */
|
|
1911
|
-
function materializeFrozenPostingsFromBuilder(state, nextId) {
|
|
1912
|
-
var _a;
|
|
1913
|
-
const { fieldCount, termCount, postingsDocIds, postingsFreqs, totalPostings, maxFreq } = state;
|
|
1914
|
-
const layout = choosePostingsLayout(fieldCount);
|
|
1915
|
-
const docIdWidth = nextId <= 65535 ? 16 : 32;
|
|
1916
|
-
const allDocIds = docIdWidth === 16
|
|
1917
|
-
? new Uint16Array(totalPostings)
|
|
1918
|
-
: new Uint32Array(totalPostings);
|
|
1919
|
-
const allFreqs = allocateFreqs(totalPostings, maxFreq);
|
|
1920
|
-
if (layout === 'dense') {
|
|
1921
|
-
const slotCount = termCount * fieldCount;
|
|
1922
|
-
const denseOffsets = new Uint32Array(slotCount);
|
|
1923
|
-
const denseLengths = new Uint32Array(slotCount);
|
|
1924
|
-
let write = 0;
|
|
1925
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1926
|
-
const base = ti * fieldCount;
|
|
1927
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1928
|
-
const slot = base + f;
|
|
1929
|
-
const docIds = postingsDocIds[slot];
|
|
1930
|
-
const freqs = postingsFreqs[slot];
|
|
1931
|
-
const len = (_a = docIds === null || docIds === void 0 ? void 0 : docIds.length) !== null && _a !== void 0 ? _a : 0;
|
|
1932
|
-
denseOffsets[slot] = write;
|
|
1933
|
-
denseLengths[slot] = len;
|
|
1934
|
-
for (let i = 0; i < len; i++) {
|
|
1935
|
-
const docId = docIds[i];
|
|
1936
|
-
if (docIdWidth === 16) {
|
|
1937
|
-
allDocIds[write] = docId;
|
|
1938
|
-
}
|
|
1939
|
-
else {
|
|
1940
|
-
allDocIds[write] = docId;
|
|
1941
|
-
}
|
|
1942
|
-
allFreqs[write] = freqs[i];
|
|
1943
|
-
write++;
|
|
1944
|
-
}
|
|
1945
|
-
}
|
|
1946
|
-
}
|
|
1947
|
-
return {
|
|
1948
|
-
fieldCount,
|
|
1949
|
-
termCount,
|
|
1950
|
-
nextId,
|
|
1951
|
-
layout,
|
|
1952
|
-
docIdWidth,
|
|
1953
|
-
sparseFieldIdWidth: null,
|
|
1954
|
-
allDocIds,
|
|
1955
|
-
allFreqs,
|
|
1956
|
-
denseOffsets,
|
|
1957
|
-
denseLengths,
|
|
1958
|
-
sparseTermStarts: null,
|
|
1959
|
-
sparseFieldIds: null,
|
|
1960
|
-
sparseOffsets: null,
|
|
1961
|
-
sparseLengths: null,
|
|
1962
|
-
};
|
|
1963
|
-
}
|
|
1964
|
-
const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
|
|
1965
|
-
const sparseFieldIdsScratch = [];
|
|
1966
|
-
const sparseOffsets = [];
|
|
1967
|
-
const sparseLengths = [];
|
|
1968
|
-
const termStarts = new Array(termCount + 1).fill(0);
|
|
1969
|
-
let write = 0;
|
|
1970
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1971
|
-
termStarts[ti] = sparseFieldIdsScratch.length;
|
|
1972
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1973
|
-
const slot = ti * fieldCount + f;
|
|
1974
|
-
const docIds = postingsDocIds[slot];
|
|
1975
|
-
if (docIds == null || docIds.length === 0)
|
|
1976
|
-
continue;
|
|
1977
|
-
const freqs = postingsFreqs[slot];
|
|
1978
|
-
sparseFieldIdsScratch.push(f);
|
|
1979
|
-
sparseOffsets.push(write);
|
|
1980
|
-
sparseLengths.push(docIds.length);
|
|
1981
|
-
for (let i = 0; i < docIds.length; i++) {
|
|
1982
|
-
const docId = docIds[i];
|
|
1983
|
-
if (docIdWidth === 16) {
|
|
1984
|
-
allDocIds[write] = docId;
|
|
1985
|
-
}
|
|
1986
|
-
else {
|
|
1987
|
-
allDocIds[write] = docId;
|
|
1988
|
-
}
|
|
1989
|
-
allFreqs[write] = freqs[i];
|
|
1990
|
-
write++;
|
|
1991
|
-
}
|
|
1992
|
-
}
|
|
1993
|
-
termStarts[ti + 1] = sparseFieldIdsScratch.length;
|
|
1994
|
-
}
|
|
1995
|
-
const sparseFieldIds = sparseFieldIdWidth === 16
|
|
1996
|
-
? new Uint16Array(sparseFieldIdsScratch)
|
|
1997
|
-
: new Uint8Array(sparseFieldIdsScratch);
|
|
1998
|
-
return {
|
|
1999
|
-
fieldCount,
|
|
2000
|
-
termCount,
|
|
2001
|
-
nextId,
|
|
2002
|
-
layout,
|
|
2003
|
-
docIdWidth,
|
|
2004
|
-
sparseFieldIdWidth,
|
|
2005
|
-
allDocIds,
|
|
2006
|
-
allFreqs,
|
|
2007
|
-
denseOffsets: null,
|
|
2008
|
-
denseLengths: null,
|
|
2009
|
-
sparseTermStarts: new Uint32Array(termStarts),
|
|
2010
|
-
sparseFieldIds,
|
|
2011
|
-
sparseOffsets: new Uint32Array(sparseOffsets),
|
|
2012
|
-
sparseLengths: new Uint32Array(sparseLengths),
|
|
2013
|
-
};
|
|
2014
|
-
}
|
|
2015
1910
|
function postingsTypedBytes(layout) {
|
|
2016
1911
|
const allDocIdsBytes = layout.allDocIds.byteLength;
|
|
2017
1912
|
const allFreqsBytes = layout.allFreqs.byteLength;
|
|
@@ -2228,21 +2123,110 @@ function buildFieldIds(fields) {
|
|
|
2228
2123
|
}
|
|
2229
2124
|
return fieldIds;
|
|
2230
2125
|
}
|
|
2231
|
-
|
|
2232
|
-
|
|
2233
|
-
|
|
2126
|
+
/**
 * Fold one `processTerm` result into a term → frequency map.
 *
 * - An array result contributes one count per element; elements are taken
 *   as-is (they are not filtered for falsy values).
 * - A single truthy result contributes one count.
 * - Any other falsy result (`''`, `null`, `undefined`, `false`, ...) is ignored.
 *
 * @param localFreqs Map mutated in place (term → occurrences).
 * @param processedTerm Value returned by `processTerm` for one token.
 */
function accumulateProcessedTerm(localFreqs, processedTerm) {
    if (!Array.isArray(processedTerm)) {
        if (processedTerm) {
            const seen = localFreqs.get(processedTerm);
            localFreqs.set(processedTerm, seen ? seen + 1 : 1);
        }
        return;
    }
    for (const expanded of processedTerm) {
        const seen = localFreqs.get(expanded);
        localFreqs.set(expanded, seen ? seen + 1 : 1);
    }
}
|
|
2136
|
+
/**
 * Rebuild `localFreqs` for a single document field from an already tokenized input.
 *
 * The map is emptied first, then every token is passed through
 * `processTerm(token, fieldName)` and the result is folded in with
 * `accumulateProcessedTerm`.
 *
 * NOTE(review): the returned count is taken after `processTerm`, so tokens that
 * collapse to the same term (or are dropped) are not counted separately. Upstream
 * MiniSearch appears to size fields by distinct raw tokens — confirm the
 * difference is intended.
 *
 * @param localFreqs Reusable Map (cleared on entry, filled on exit).
 * @param tokens Iterable of raw tokens for the field.
 * @param fieldName Field name forwarded to `processTerm`.
 * @param processTerm Term processor; may return a term, an array of terms, or a falsy value.
 * @returns Number of distinct keys left in `localFreqs`.
 */
function collectFieldTermFreqsInto(localFreqs, tokens, fieldName, processTerm) {
    localFreqs.clear();
    for (const rawToken of tokens) {
        const processed = processTerm(rawToken, fieldName);
        accumulateProcessedTerm(localFreqs, processed);
    }
    const distinctTerms = localFreqs.size;
    return distinctTerms;
}
|
|
2147
|
+
/** Global delimiter pattern for incremental `exec` (must not reuse {@link SPACE_OR_PUNCTUATION} — no `g` flag). */
const DEFAULT_TOKENIZE_DELIMITERS = /[\n\r\p{Z}\p{P}]+/gu;
const defaultTokenizeProbe = 'a b';
const defaultTokenizeProbeField = 'f';
/**
 * Additional probes used to tell a default-equivalent tokenizer from a custom one.
 * Together they exercise: leading and trailing delimiters (empty edge tokens),
 * case preservation, ASCII punctuation, a non-breaking space, a newline,
 * the empty string, and non-ASCII punctuation.
 */
const defaultTokenizeExtraProbes = [
    ' Ab c,d\u00a0e-f\n',
    '',
    'x\u2026y\u3002z',
];
const tokenizeBehaviorCache = new WeakMap();
/**
 * True when `tokenize` can be replaced by the built-in fast path: either it is the
 * library default by reference, or it returns exactly what
 * `probe.split(SPACE_OR_PUNCTUATION)` returns for every probe string.
 *
 * The probe comparison is still a heuristic — a custom tokenizer that agrees on all
 * probes but diverges on other inputs takes the fast path — so pass the default
 * reference in production when the default behavior is wanted.
 *
 * The verdict is cached per function object, so each tokenizer is probed once.
 *
 * @param tokenize Candidate tokenizer `(text, fieldName) => string[]`.
 * @returns `true` when the default fast path may be used for `tokenize`.
 */
function isDefaultTokenize(tokenize) {
    if (tokenize === defaultFrozenLoadOptions.tokenize)
        return true;
    // Non-functions cannot be probed nor used as WeakMap keys; let the real call site report them.
    if (typeof tokenize !== 'function')
        return false;
    const cached = tokenizeBehaviorCache.get(tokenize);
    if (cached != null)
        return cached;
    let ok = true;
    try {
        const probes = [defaultTokenizeProbe].concat(defaultTokenizeExtraProbes);
        for (const probe of probes) {
            const expected = probe.split(SPACE_OR_PUNCTUATION);
            const actual = tokenize(probe, defaultTokenizeProbeField);
            const same = Array.isArray(actual)
                && actual.length === expected.length
                && expected.every((token, i) => token === actual[i]);
            if (!same) {
                ok = false;
                break;
            }
        }
    }
    catch (_probeError) {
        // A tokenizer that rejects the synthetic probe is certainly not the default.
        // It is simply classified as custom; any real failure will surface when it
        // is invoked on actual field text.
        ok = false;
    }
    tokenizeBehaviorCache.set(tokenize, ok);
    return ok;
}
|
|
2170
|
+
/**
 * Stream the default tokenization of `text` to `onToken`, one token at a time.
 *
 * The emitted sequence mirrors `String.prototype.split` on the same delimiter
 * class: a leading or trailing delimiter run yields an empty token at that edge,
 * and the empty string yields a single empty token.
 *
 * The delimiter regex is a shared module-level `g` regex, whose `lastIndex` is
 * mutable state. It is re-pinned to the locally tracked position before every
 * `exec`, so a nested call made from inside `onToken` (for example a term
 * processor that tokenizes something itself) cannot make this scan skip, repeat
 * or loop on tokens.
 *
 * @param text String to tokenize.
 * @param onToken Callback invoked with each token, in order.
 */
function forEachDefaultToken(text, onToken) {
    if (text.length === 0) {
        onToken('');
        return;
    }
    const re = DEFAULT_TOKENIZE_DELIMITERS;
    let start = 0;
    for (;;) {
        re.lastIndex = start;
        const match = re.exec(text);
        if (match === null)
            break;
        // Empty only when the text begins with a delimiter run (start === match.index === 0).
        onToken(text.slice(start, match.index));
        start = match.index + match[0].length;
    }
    // Remainder after the last delimiter; empty when the text ends with a delimiter run.
    onToken(text.slice(start));
}
|
|
2198
|
+
/**
 * Refill `out` with the default tokenization of `text`.
 *
 * The caller's array is emptied and reused, so no intermediate `text.split()`
 * result array is created.
 *
 * @param out Reusable token buffer (emptied, then appended to).
 * @param text Field text to tokenize.
 */
function tokenizeDefaultInto(out, text) {
    out.length = 0;
    function appendToken(token) {
        return out.push(token);
    }
    forEachDefaultToken(text, appendToken);
}
|
|
2203
|
+
/**
 * Tokenize field text into `out` (reused). Fast path when `tokenize` is the library default.
 *
 * For a custom tokenizer the result of `tokenize(text, fieldName)` is copied into
 * `out` after `out` has been emptied.
 *
 * @param out Reusable token buffer (emptied, then filled).
 * @param tokenize Tokenizer `(text, fieldName) => Iterable<string>`.
 * @param text Field text to tokenize.
 * @param fieldName Field name forwarded to a custom tokenizer.
 */
function tokenizeFieldInto(out, tokenize, text, fieldName) {
    if (isDefaultTokenize(tokenize)) {
        tokenizeDefaultInto(out, text);
        return;
    }
    const tokens = tokenize(text, fieldName);
    out.length = 0;
    // Append one by one: `out.push(...tokens)` would pass every token as a call
    // argument, which throws a RangeError once a field produces more tokens than
    // the engine's argument limit.
    for (const token of tokens) {
        out.push(token);
    }
}
|
|
2213
|
+
/**
 * Default-tokenizer variant of the per-field frequency pass: tokens are streamed
 * from `text` and folded into `localFreqs` as they are produced, without an
 * intermediate token array.
 *
 * @param localFreqs Reusable Map (cleared on entry, filled on exit).
 * @param text Field text to tokenize with the default tokenizer.
 * @param fieldName Field name forwarded to `processTerm`.
 * @param processTerm Term processor applied to every token.
 * @returns Number of distinct keys left in `localFreqs`.
 */
function collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm) {
    localFreqs.clear();
    const countToken = (token) => {
        const processed = processTerm(token, fieldName);
        accumulateProcessedTerm(localFreqs, processed);
    };
    forEachDefaultToken(text, countToken);
    return localFreqs.size;
}
|
|
2220
|
+
/**
 * Compute the term frequencies of one field, choosing the cheapest route.
 *
 * - Default tokenizer: tokenization and counting happen in a single streaming pass.
 * - Custom tokenizer: the field is first tokenized into `tokenScratch`, then the
 *   scratch array is counted (two phases). `tokenScratch` is untouched otherwise.
 *
 * @param localFreqs Reusable Map receiving term → frequency.
 * @param tokenScratch Reusable token array, used only for custom tokenizers.
 * @param tokenize Tokenizer configured for the index.
 * @param text Field text.
 * @param fieldName Field name forwarded to `tokenize` / `processTerm`.
 * @param processTerm Term processor applied to every token.
 * @returns The count returned by whichever collector ran.
 */
function collectFieldTermFreqsFromFieldInto(localFreqs, tokenScratch, tokenize, text, fieldName, processTerm) {
    const useFastPath = isDefaultTokenize(tokenize);
    if (!useFastPath) {
        tokenizeFieldInto(tokenScratch, tokenize, text, fieldName);
        return collectFieldTermFreqsInto(localFreqs, tokenScratch, fieldName, processTerm);
    }
    return collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm);
}
|
|
2247
2231
|
/** Same running average as {@link MiniSearch} private addFieldLength. */
|
|
2248
2232
|
function updateAvgFieldLength(avgFieldLength, fieldId, count, length) {
|
|
@@ -3442,50 +3426,260 @@ async function decodeFrozenSnapshotAsync(buf) {
|
|
|
3442
3426
|
return decodeFrozenSnapshot(buf);
|
|
3443
3427
|
}
|
|
3444
3428
|
|
|
3445
|
-
|
|
3429
|
+
const DEFAULT_CAPACITY = 16;
/**
 * Growable unsigned 32-bit column (build scratch; narrowed to u16 at finalize when possible).
 *
 * Values live in a `Uint32Array` that doubles whenever it is full; `length` is the
 * number of values written, not the capacity of the backing array.
 */
class GrowableUint32Column {
    /**
     * @param initialCapacity Number of cells to pre-allocate (a minimum of one is requested).
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint32Array(Math.max(1, initialCapacity));
    }
    /** Number of values written so far. */
    get length() {
        return this._len;
    }
    /** Append one value, doubling the backing array when it is full. */
    push(value) {
        if (this._len >= this._buf.length) {
            // `Math.max(1, ...)` lets an emptied (zero-length) buffer grow again.
            const grown = new Uint32Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = value;
    }
    /**
     * Copy `length` values starting at `sourceOffset` into the typed array `target`
     * at `targetOffset`.
     *
     * @param sourceOffset First index to read from this column.
     * @param length Number of values to copy.
     * @param target Destination typed array; its element type decides how values are narrowed.
     * @param targetOffset First index to write in `target`.
     * @param docIdWidth Kept for call-site compatibility; unused, because the element
     *   conversion is performed by `target` itself.
     */
    copyRangeInto(sourceOffset, length, target, targetOffset, docIdWidth) {
        target.set(this._buf.subarray(sourceOffset, sourceOffset + length), targetOffset);
    }
    /**
     * Shrink the column to at most `length` values and release the unused backing store.
     *
     * `truncate(0)` drops the whole buffer (a later `push` re-allocates), which is
     * what lets an owner free scratch memory. A `length` larger than the current
     * length leaves the column unchanged rather than exposing unwritten cells.
     */
    truncate(length) {
        const kept = length > 0 ? Math.min(length, this._len) : 0;
        this._len = kept;
        if (kept < this._buf.length) {
            this._buf = this._buf.slice(0, kept);
        }
    }
}
|
|
3466
|
+
/**
 * Growable frequency column (u16 cells; matches frozen clamp range).
 *
 * Every pushed frequency goes through `clampFreq` before being stored in a
 * `Uint16Array` that doubles whenever it is full.
 */
class GrowableFreqColumn {
    /**
     * @param initialCapacity Number of cells to pre-allocate (a minimum of one is requested).
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint16Array(Math.max(1, initialCapacity));
    }
    /** Number of frequencies written so far. */
    get length() {
        return this._len;
    }
    /** Append one frequency (clamped), doubling the backing array when it is full. */
    push(freq) {
        if (this._len >= this._buf.length) {
            // `Math.max(1, ...)` lets an emptied (zero-length) buffer grow again.
            const grown = new Uint16Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = clampFreq(freq);
    }
    /**
     * Copy `length` frequencies starting at `sourceOffset` into the typed array
     * `target` at `targetOffset`; `target`'s element type decides any narrowing.
     */
    copyRangeInto(sourceOffset, length, target, targetOffset) {
        target.set(this._buf.subarray(sourceOffset, sourceOffset + length), targetOffset);
    }
    /**
     * Shrink the column to at most `length` values and release the unused backing store.
     *
     * `truncate(0)` drops the whole buffer (a later `push` re-allocates). A `length`
     * larger than the current length leaves the column unchanged rather than
     * exposing unwritten cells.
     */
    truncate(length) {
        const kept = length > 0 ? Math.min(length, this._len) : 0;
        this._len = kept;
        if (kept < this._buf.length) {
            this._buf = this._buf.slice(0, kept);
        }
    }
}
|
|
3495
|
+
/**
 * Single-pass postings accumulator for {@link FrozenIndexBuilder}.
 * One global TypedArray stream per docIds/freqs; per-slot range metadata only.
 *
 * A slot is one (term, field) pair, keyed as `termIndex * fieldCount + fieldId`.
 * Postings are appended to the two global columns in arrival order; each slot only
 * records which contiguous ranges of those columns belong to it. `finalize`
 * regroups the ranges slot by slot into the packed dense or sparse layout.
 */
class IncrementalPostingsAccumulator {
    /**
     * @param fieldCount Number of indexed fields (slot stride).
     * @param hints Optional `{ estimatedTotalPostings }` used to pre-size the columns.
     */
    constructor(fieldCount, hints) {
        this._slots = new Map();
        this._totalPostings = 0;
        this._maxFreq = 0;
        this._fieldCount = fieldCount;
        const estimated = hints != null && hints.estimatedTotalPostings != null
            ? hints.estimatedTotalPostings
            : 0;
        const cap = Math.max(DEFAULT_CAPACITY, estimated);
        this._docIds = new GrowableUint32Column(cap);
        this._freqs = new GrowableFreqColumn(cap);
    }
    /** Number of postings appended since construction or the last `clear()`. */
    get totalPostings() {
        return this._totalPostings;
    }
    /** Largest clamped frequency appended since construction or the last `clear()`. */
    get maxFreq() {
        return this._maxFreq;
    }
    /**
     * Record one posting for (`termIndex`, `fieldId`).
     *
     * @param termIndex Index of the term.
     * @param fieldId Index of the field, `< fieldCount`.
     * @param docId Short document id.
     * @param freq Term frequency in that field; stored after `clampFreq`.
     */
    append(termIndex, fieldId, docId, freq) {
        const slot = termIndex * this._fieldCount + fieldId;
        const writeIdx = this._docIds.length;
        this._docIds.push(docId);
        const v = clampFreq(freq);
        this._freqs.push(v);
        if (v > this._maxFreq)
            this._maxFreq = v;
        this._totalPostings++;
        const ranges = this._slots.get(slot);
        if (ranges == null) {
            this._slots.set(slot, { starts: [writeIdx], lengths: [1] });
            return;
        }
        const last = ranges.starts.length - 1;
        if (ranges.starts[last] + ranges.lengths[last] === writeIdx) {
            // Directly follows the slot's previous posting: extend that range.
            ranges.lengths[last]++;
        }
        else {
            ranges.starts.push(writeIdx);
            ranges.lengths.push(1);
        }
    }
    /**
     * Reset the accumulator to an empty, reusable state: slot metadata, both
     * columns and the `totalPostings` / `maxFreq` counters.
     */
    clear() {
        this._releaseScratch();
        this._totalPostings = 0;
        this._maxFreq = 0;
    }
    /**
     * Drop slot metadata and the global columns so their buffers can be collected.
     * The columns are replaced rather than truncated, so the memory is released
     * regardless of how the column classes treat a zero-length truncate.
     */
    _releaseScratch() {
        this._slots.clear();
        this._docIds = new GrowableUint32Column(1);
        this._freqs = new GrowableFreqColumn(1);
    }
    /**
     * Copy every range of one slot into the packed output arrays.
     *
     * @returns The write cursor after the slot's last posting.
     */
    copySlot(ranges, allDocIds, allFreqs, write, docIdWidth) {
        for (let r = 0; r < ranges.starts.length; r++) {
            const start = ranges.starts[r];
            const len = ranges.lengths[r];
            this._docIds.copyRangeInto(start, len, allDocIds, write, docIdWidth);
            this._freqs.copyRangeInto(start, len, allFreqs, write);
            write += len;
        }
        return write;
    }
    /** Total number of postings across all ranges of one slot. */
    slotLength(ranges) {
        let n = 0;
        for (let i = 0; i < ranges.lengths.length; i++)
            n += ranges.lengths[i];
        return n;
    }
    /**
     * Pack everything appended so far into the frozen postings layout and release
     * the build scratch.
     *
     * Doc ids are stored as u16 when `nextId <= 65535`, otherwise as u32. The
     * dense layout has one offset/length cell per (term, field) slot; the sparse
     * layout lists only non-empty slots, grouped per term via `sparseTermStarts`.
     *
     * The accumulator is spent afterwards: `totalPostings` / `maxFreq` keep their
     * final values, but the postings themselves are gone. Call `clear()` before
     * appending again.
     *
     * @param termCount Number of distinct terms.
     * @param nextId One past the highest short document id.
     * @returns The postings layout object (unused layout's fields are `null`).
     */
    finalize(termCount, nextId) {
        const fieldCount = this._fieldCount;
        const totalPostings = this._totalPostings;
        const slots = this._slots;
        const layout = choosePostingsLayout(fieldCount);
        const docIdWidth = nextId <= 65535 ? 16 : 32;
        const allDocIds = docIdWidth === 16
            ? new Uint16Array(totalPostings)
            : new Uint32Array(totalPostings);
        const allFreqs = allocateFreqs(totalPostings, this._maxFreq);
        let write = 0;
        if (layout === 'dense') {
            const slotCount = termCount * fieldCount;
            const denseOffsets = new Uint32Array(slotCount);
            const denseLengths = new Uint32Array(slotCount);
            for (let slot = 0; slot < slotCount; slot++) {
                const ranges = slots.get(slot);
                const len = ranges == null ? 0 : this.slotLength(ranges);
                denseOffsets[slot] = write;
                denseLengths[slot] = len;
                if (len > 0) {
                    write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
                    // Free the slot's range lists as soon as they are consumed.
                    slots.delete(slot);
                }
            }
            this._releaseScratch();
            return {
                fieldCount,
                termCount,
                nextId,
                layout,
                docIdWidth,
                sparseFieldIdWidth: null,
                allDocIds,
                allFreqs,
                denseOffsets,
                denseLengths,
                sparseTermStarts: null,
                sparseFieldIds: null,
                sparseOffsets: null,
                sparseLengths: null,
            };
        }
        const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
        const sparseFieldIdsScratch = [];
        const sparseOffsets = [];
        const sparseLengths = [];
        const sparseTermStarts = new Uint32Array(termCount + 1);
        for (let ti = 0; ti < termCount; ti++) {
            sparseTermStarts[ti] = sparseFieldIdsScratch.length;
            for (let f = 0; f < fieldCount; f++) {
                const slot = ti * fieldCount + f;
                const ranges = slots.get(slot);
                const len = ranges == null ? 0 : this.slotLength(ranges);
                if (len === 0)
                    continue;
                sparseFieldIdsScratch.push(f);
                sparseOffsets.push(write);
                sparseLengths.push(len);
                write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
                // Free the slot's range lists as soon as they are consumed.
                slots.delete(slot);
            }
            sparseTermStarts[ti + 1] = sparseFieldIdsScratch.length;
        }
        this._releaseScratch();
        const sparseFieldIds = sparseFieldIdWidth === 16
            ? new Uint16Array(sparseFieldIdsScratch)
            : new Uint8Array(sparseFieldIdsScratch);
        return {
            fieldCount,
            termCount,
            nextId,
            layout,
            docIdWidth,
            sparseFieldIdWidth,
            allDocIds,
            allFreqs,
            denseOffsets: null,
            denseLengths: null,
            sparseTermStarts,
            sparseFieldIds,
            sparseOffsets: new Uint32Array(sparseOffsets),
            sparseLengths: new Uint32Array(sparseLengths),
        };
    }
}
|
|
3657
|
+
|
|
3658
|
+
/**
 * Look up the numeric index of `term`, assigning the next free one on first sight.
 *
 * @param termCount Mutable counter box `{ value }`; `value` is the next index to hand out
 *   and is incremented whenever a new term is registered.
 * @param index Map-like term → index store (`get` / `set`).
 * @param term Term to resolve.
 * @returns The existing or newly assigned index.
 */
function getOrCreateTermIndex(termCount, index, term) {
    const known = index.get(term);
    if (known == null) {
        const assigned = termCount.value++;
        index.set(term, assigned);
        return assigned;
    }
    return known;
}
|
|
3454
|
-
function appendPosting(state, termIndex, fieldId, docId, freq) {
|
|
3455
|
-
const slot = termIndex * state.fieldCount + fieldId;
|
|
3456
|
-
let docIds = state.postingsDocIds[slot];
|
|
3457
|
-
if (docIds == null) {
|
|
3458
|
-
docIds = [];
|
|
3459
|
-
state.postingsDocIds[slot] = docIds;
|
|
3460
|
-
state.postingsFreqs[slot] = [];
|
|
3461
|
-
}
|
|
3462
|
-
docIds.push(docId);
|
|
3463
|
-
state.postingsFreqs[slot].push(freq);
|
|
3464
|
-
const v = clampFreq(freq);
|
|
3465
|
-
if (v > state.maxFreq)
|
|
3466
|
-
state.maxFreq = v;
|
|
3467
|
-
state.totalPostings++;
|
|
3468
|
-
}
|
|
3469
|
-
function finalizeFlatPostings(state, nextId) {
|
|
3470
|
-
return materializeFrozenPostingsFromBuilder({
|
|
3471
|
-
fieldCount: state.fieldCount,
|
|
3472
|
-
termCount: state.terms.length,
|
|
3473
|
-
postingsDocIds: state.postingsDocIds,
|
|
3474
|
-
postingsFreqs: state.postingsFreqs,
|
|
3475
|
-
totalPostings: state.totalPostings,
|
|
3476
|
-
maxFreq: state.maxFreq,
|
|
3477
|
-
}, nextId);
|
|
3478
|
-
}
|
|
3479
3667
|
/** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
|
|
3480
3668
|
class FrozenIndexBuilder {
|
|
3481
3669
|
constructor(options, hints) {
|
|
3670
|
+
var _a, _b;
|
|
3671
|
+
this._termCount = { value: 0 };
|
|
3672
|
+
this._fieldTermFreqScratch = new Map();
|
|
3673
|
+
this._tokenScratch = [];
|
|
3482
3674
|
this._options = resolveIndexingOptions(options);
|
|
3483
3675
|
this._fieldIds = buildFieldIds(this._options.fields);
|
|
3484
3676
|
this._fieldCount = this._options.fields.length;
|
|
3485
3677
|
this._index = new SearchableMap();
|
|
3486
|
-
|
|
3487
|
-
|
|
3488
|
-
this.
|
|
3678
|
+
const estimatedDocs = (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount) !== null && _a !== void 0 ? _a : 0;
|
|
3679
|
+
const perSlot = (_b = hints === null || hints === void 0 ? void 0 : hints.estimatedPostingsPerSlot) !== null && _b !== void 0 ? _b : 4;
|
|
3680
|
+
this._postings = new IncrementalPostingsAccumulator(this._fieldCount, {
|
|
3681
|
+
estimatedTotalPostings: estimatedDocs > 0 ? estimatedDocs * perSlot : undefined,
|
|
3682
|
+
});
|
|
3489
3683
|
this._avgFieldLength = [];
|
|
3490
3684
|
this._seenIds = new Set();
|
|
3491
3685
|
this._nextId = 0;
|
|
@@ -3501,14 +3695,6 @@ class FrozenIndexBuilder {
|
|
|
3501
3695
|
this._storedFields = [];
|
|
3502
3696
|
this._fieldLengthData = [];
|
|
3503
3697
|
}
|
|
3504
|
-
this._postingsState = {
|
|
3505
|
-
fieldCount: this._fieldCount,
|
|
3506
|
-
terms: this._terms,
|
|
3507
|
-
postingsDocIds: this._postingsDocIds,
|
|
3508
|
-
postingsFreqs: this._postingsFreqs,
|
|
3509
|
-
totalPostings: 0,
|
|
3510
|
-
maxFreq: 0,
|
|
3511
|
-
};
|
|
3512
3698
|
}
|
|
3513
3699
|
/** Number of documents indexed so far (not yet frozen). */
|
|
3514
3700
|
get documentCount() {
|
|
@@ -3535,16 +3721,17 @@ class FrozenIndexBuilder {
|
|
|
3535
3721
|
const fieldValue = extractField(document, field);
|
|
3536
3722
|
if (fieldValue == null)
|
|
3537
3723
|
continue;
|
|
3538
|
-
const
|
|
3724
|
+
const fieldText = typeof fieldValue === 'string'
|
|
3725
|
+
? fieldValue
|
|
3726
|
+
: stringifyField(fieldValue, field);
|
|
3539
3727
|
const fieldId = this._fieldIds[field];
|
|
3540
|
-
const uniqueTerms =
|
|
3541
|
-
const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
|
|
3728
|
+
const uniqueTerms = collectFieldTermFreqsFromFieldInto(this._fieldTermFreqScratch, this._tokenScratch, tokenize, fieldText, field, processTerm);
|
|
3542
3729
|
this._fieldLengthData[shortId * this._fieldCount + fieldId] = uniqueTerms;
|
|
3543
3730
|
updateAvgFieldLength(this._avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
|
|
3544
|
-
|
|
3545
|
-
const ti = getOrCreateTermIndex(this.
|
|
3546
|
-
|
|
3547
|
-
}
|
|
3731
|
+
this._fieldTermFreqScratch.forEach((freq, term) => {
|
|
3732
|
+
const ti = getOrCreateTermIndex(this._termCount, this._index, term);
|
|
3733
|
+
this._postings.append(ti, fieldId, shortId, freq);
|
|
3734
|
+
});
|
|
3548
3735
|
}
|
|
3549
3736
|
}
|
|
3550
3737
|
/**
|
|
@@ -3601,7 +3788,11 @@ class FrozenIndexBuilder {
|
|
|
3601
3788
|
}
|
|
3602
3789
|
this._frozen = true;
|
|
3603
3790
|
const documentCount = this._nextId;
|
|
3604
|
-
const
|
|
3791
|
+
const termCount = this._termCount.value;
|
|
3792
|
+
const postings = this._postings.finalize(termCount, documentCount);
|
|
3793
|
+
const radixTree = this._index.radixTree;
|
|
3794
|
+
this._index = null;
|
|
3795
|
+
const index = fromRadixTree(radixTree, termCount);
|
|
3605
3796
|
const avgFieldLength = new Float32Array(this._fieldCount);
|
|
3606
3797
|
for (let f = 0; f < this._fieldCount; f++) {
|
|
3607
3798
|
avgFieldLength[f] = (_a = this._avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
|
|
@@ -3614,8 +3805,6 @@ class FrozenIndexBuilder {
|
|
|
3614
3805
|
? this._storedFields.slice(0, documentCount)
|
|
3615
3806
|
: this._storedFields;
|
|
3616
3807
|
const idLookup = createIdToShortIdLookup(externalIds, documentCount);
|
|
3617
|
-
// Incremental builder: numeric radix leaves + build-time terms[] for postings.
|
|
3618
|
-
// freezeFromMiniSearch packs Map leaves in one radix pass (no resident terms[]).
|
|
3619
3808
|
return {
|
|
3620
3809
|
options: this._options,
|
|
3621
3810
|
documentCount,
|
|
@@ -3627,8 +3816,8 @@ class FrozenIndexBuilder {
|
|
|
3627
3816
|
storedFields,
|
|
3628
3817
|
fieldLengthMatrix: materializeFieldLengthMatrix(this._fieldLengthData, documentCount * this._fieldCount),
|
|
3629
3818
|
avgFieldLength,
|
|
3630
|
-
index
|
|
3631
|
-
termCount
|
|
3819
|
+
index,
|
|
3820
|
+
termCount,
|
|
3632
3821
|
postings,
|
|
3633
3822
|
};
|
|
3634
3823
|
}
|
|
@@ -4334,4 +4523,4 @@ class FrozenMiniSearch {
|
|
|
4334
4523
|
}
|
|
4335
4524
|
FrozenMiniSearch.wildcard = WILDCARD_QUERY;
|
|
4336
4525
|
|
|
4337
|
-
export { AND, AND_NOT, FrozenIndexBuilder, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
|
|
4526
|
+
export { AND, AND_NOT, FrozenIndexBuilder, FrozenMiniSearch, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yoch/frozenminisearch",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.1",
|
|
4
4
|
"description": "Read-only Node.js full-text search — compact frozen indexes and binary snapshots",
|
|
5
5
|
"main": "dist/cjs/index.cjs",
|
|
6
6
|
"module": "dist/es/index.js",
|
|
@@ -100,6 +100,9 @@
|
|
|
100
100
|
"bench:history": "node --expose-gc benchmarks/framework/cli.mjs history",
|
|
101
101
|
"bench:micro": "node benchmarks/micro/run.mjs",
|
|
102
102
|
"bench:readme": "node benchmarks/scripts/generate-readme-comparison.mjs",
|
|
103
|
+
"bench:build-peak": "npm run build && node --expose-gc benchmarks/scripts/build-peak-heap.mjs",
|
|
104
|
+
"bench:medicaments-build-peak": "npm run build && NODE_OPTIONS='--expose-gc' npx --yes tsx benchmarks/scripts/medicaments-build-peak-heap.mjs",
|
|
105
|
+
"bench:build-heap-profile": "npm run build && NODE_OPTIONS='--expose-gc' npx --yes tsx benchmarks/scripts/build-heap-profile.mjs",
|
|
103
106
|
"bench:reference:update": "npm run build && RUNS=3 node --expose-gc benchmarks/framework/cli.mjs record --profile=vs-reference && node benchmarks/scripts/promote-latest-to-reference.mjs && npm run bench:readme",
|
|
104
107
|
"benchmark:compare": "npm run build && node --expose-gc benchmarks/compare.js",
|
|
105
108
|
"benchmark:calibrate-batches": "npm run build && node --expose-gc benchmarks/scripts/calibrate-search-batches.mjs",
|
|
@@ -132,7 +135,7 @@
|
|
|
132
135
|
"clean-build": "rm -rf dist",
|
|
133
136
|
"build-minified": "MINIFY=true npm run build",
|
|
134
137
|
"sync-docs-media": "node scripts/sync-docs-media.cjs",
|
|
135
|
-
"build-docs": "npm run sync-docs-media &&
|
|
138
|
+
"build-docs": "typedoc --options typedoc.json && npm run sync-docs-media && npm run build-demo",
|
|
136
139
|
"build-demo": "mkdir -p ./docs/demo && cp -r ./examples/plain_js/. ./docs/demo",
|
|
137
140
|
"lint": "eslint 'src/**/*.{js,ts}'",
|
|
138
141
|
"lintfix": "eslint --fix 'src/**/*.{js,ts}'",
|