@yoch/frozenminisearch 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/es/index.js CHANGED
@@ -1907,111 +1907,6 @@ function materializeFrozenPostings(params) {
1907
1907
  sparseLengths: new Uint32Array(sparseLengths),
1908
1908
  };
1909
1909
  }
1910
- /** One-pass materialize from {@link FrozenIndexBuilder} scratch (counts known upfront). */
1911
- function materializeFrozenPostingsFromBuilder(state, nextId) {
1912
- var _a;
1913
- const { fieldCount, termCount, postingsDocIds, postingsFreqs, totalPostings, maxFreq } = state;
1914
- const layout = choosePostingsLayout(fieldCount);
1915
- const docIdWidth = nextId <= 65535 ? 16 : 32;
1916
- const allDocIds = docIdWidth === 16
1917
- ? new Uint16Array(totalPostings)
1918
- : new Uint32Array(totalPostings);
1919
- const allFreqs = allocateFreqs(totalPostings, maxFreq);
1920
- if (layout === 'dense') {
1921
- const slotCount = termCount * fieldCount;
1922
- const denseOffsets = new Uint32Array(slotCount);
1923
- const denseLengths = new Uint32Array(slotCount);
1924
- let write = 0;
1925
- for (let ti = 0; ti < termCount; ti++) {
1926
- const base = ti * fieldCount;
1927
- for (let f = 0; f < fieldCount; f++) {
1928
- const slot = base + f;
1929
- const docIds = postingsDocIds[slot];
1930
- const freqs = postingsFreqs[slot];
1931
- const len = (_a = docIds === null || docIds === void 0 ? void 0 : docIds.length) !== null && _a !== void 0 ? _a : 0;
1932
- denseOffsets[slot] = write;
1933
- denseLengths[slot] = len;
1934
- for (let i = 0; i < len; i++) {
1935
- const docId = docIds[i];
1936
- if (docIdWidth === 16) {
1937
- allDocIds[write] = docId;
1938
- }
1939
- else {
1940
- allDocIds[write] = docId;
1941
- }
1942
- allFreqs[write] = freqs[i];
1943
- write++;
1944
- }
1945
- }
1946
- }
1947
- return {
1948
- fieldCount,
1949
- termCount,
1950
- nextId,
1951
- layout,
1952
- docIdWidth,
1953
- sparseFieldIdWidth: null,
1954
- allDocIds,
1955
- allFreqs,
1956
- denseOffsets,
1957
- denseLengths,
1958
- sparseTermStarts: null,
1959
- sparseFieldIds: null,
1960
- sparseOffsets: null,
1961
- sparseLengths: null,
1962
- };
1963
- }
1964
- const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
1965
- const sparseFieldIdsScratch = [];
1966
- const sparseOffsets = [];
1967
- const sparseLengths = [];
1968
- const termStarts = new Array(termCount + 1).fill(0);
1969
- let write = 0;
1970
- for (let ti = 0; ti < termCount; ti++) {
1971
- termStarts[ti] = sparseFieldIdsScratch.length;
1972
- for (let f = 0; f < fieldCount; f++) {
1973
- const slot = ti * fieldCount + f;
1974
- const docIds = postingsDocIds[slot];
1975
- if (docIds == null || docIds.length === 0)
1976
- continue;
1977
- const freqs = postingsFreqs[slot];
1978
- sparseFieldIdsScratch.push(f);
1979
- sparseOffsets.push(write);
1980
- sparseLengths.push(docIds.length);
1981
- for (let i = 0; i < docIds.length; i++) {
1982
- const docId = docIds[i];
1983
- if (docIdWidth === 16) {
1984
- allDocIds[write] = docId;
1985
- }
1986
- else {
1987
- allDocIds[write] = docId;
1988
- }
1989
- allFreqs[write] = freqs[i];
1990
- write++;
1991
- }
1992
- }
1993
- termStarts[ti + 1] = sparseFieldIdsScratch.length;
1994
- }
1995
- const sparseFieldIds = sparseFieldIdWidth === 16
1996
- ? new Uint16Array(sparseFieldIdsScratch)
1997
- : new Uint8Array(sparseFieldIdsScratch);
1998
- return {
1999
- fieldCount,
2000
- termCount,
2001
- nextId,
2002
- layout,
2003
- docIdWidth,
2004
- sparseFieldIdWidth,
2005
- allDocIds,
2006
- allFreqs,
2007
- denseOffsets: null,
2008
- denseLengths: null,
2009
- sparseTermStarts: new Uint32Array(termStarts),
2010
- sparseFieldIds,
2011
- sparseOffsets: new Uint32Array(sparseOffsets),
2012
- sparseLengths: new Uint32Array(sparseLengths),
2013
- };
2014
- }
2015
1910
  function postingsTypedBytes(layout) {
2016
1911
  const allDocIdsBytes = layout.allDocIds.byteLength;
2017
1912
  const allFreqsBytes = layout.allFreqs.byteLength;
@@ -2228,21 +2123,110 @@ function buildFieldIds(fields) {
2228
2123
  }
2229
2124
  return fieldIds;
2230
2125
  }
2231
- /** Token frequencies for one document field (after processTerm). */
2232
- function collectFieldTermFreqs(tokens, fieldName, processTerm) {
2233
- const localFreqs = new Map();
2126
/**
 * Record one `processTerm` result in a per-field frequency map.
 *
 * - An array result counts each element once (elements are not filtered).
 * - A single truthy result counts once.
 * - Any other falsy result (`''`, `null`, `undefined`, `false`, …) is ignored.
 *
 * @param {Map<*, number>} localFreqs Frequency map, mutated in place.
 * @param {*} processedTerm Value returned by `processTerm` for one token.
 */
function accumulateProcessedTerm(localFreqs, processedTerm) {
    if (!Array.isArray(processedTerm)) {
        if (processedTerm) {
            const seen = localFreqs.get(processedTerm) || 0;
            localFreqs.set(processedTerm, seen + 1);
        }
        return;
    }
    for (const variant of processedTerm) {
        const seen = localFreqs.get(variant) || 0;
        localFreqs.set(variant, seen + 1);
    }
}
2136
/**
 * Rebuild `localFreqs` from an already-tokenized field.
 *
 * The map is emptied first, then every token is passed through `processTerm`
 * and the result is tallied via `accumulateProcessedTerm`.
 *
 * @param {Map<*, number>} localFreqs Reusable frequency map (cleared, then filled).
 * @param {Iterable<string>} tokens Tokens of one document field.
 * @param {string} fieldName Field name forwarded to `processTerm`.
 * @param {Function} processTerm Called as `processTerm(token, fieldName)`.
 * @returns {number} Number of distinct processed terms (`localFreqs.size`).
 */
function collectFieldTermFreqsInto(localFreqs, tokens, fieldName, processTerm) {
    localFreqs.clear();
    for (const rawToken of tokens) {
        const processed = processTerm(rawToken, fieldName);
        accumulateProcessedTerm(localFreqs, processed);
    }
    const distinctTerms = localFreqs.size;
    return distinctTerms;
}
2147
/** Global delimiter pattern for incremental `exec` (must not reuse {@link SPACE_OR_PUNCTUATION} — no `g` flag). */
const DEFAULT_TOKENIZE_DELIMITERS = /[\n\r\p{Z}\p{P}]+/gu;
/**
 * Probe text used to recognise default-equivalent tokenizers. It deliberately mixes the
 * cases where look-alike tokenizers diverge from `split(SPACE_OR_PUNCTUATION)`:
 * leading/trailing delimiters (empty edge tokens), punctuation, a newline, a no-break
 * space (`\p{Z}`), and a tab (which is NOT a default delimiter).
 */
const defaultTokenizeProbe = ' a b,c-d\ne\u00a0f\tg. ';
const defaultTokenizeProbeField = 'f';
/** Memoized probe verdict per tokenizer function. */
const tokenizeBehaviorCache = new WeakMap();
/**
 * True when `tokenize` is the library default, either by reference or because it
 * returns exactly what `split(SPACE_OR_PUNCTUATION)` returns on the probe text.
 *
 * The probe is a heuristic: a tokenizer that matches it but diverges on other inputs
 * (or for field names other than the probe field) is still treated as the default.
 * A tokenizer that throws on the probe, or does not return an array, is simply
 * classified as custom, so probing never aborts indexing by itself.
 *
 * @param {Function} tokenize Tokenizer called as `tokenize(text, fieldName)`.
 * @returns {boolean} Whether the default fast path may replace `tokenize`.
 */
function isDefaultTokenize(tokenize) {
    if (tokenize === defaultFrozenLoadOptions.tokenize)
        return true;
    const cached = tokenizeBehaviorCache.get(tokenize);
    if (cached != null)
        return cached;
    const expected = defaultTokenizeProbe.split(SPACE_OR_PUNCTUATION);
    let ok = false;
    try {
        const actual = tokenize(defaultTokenizeProbe, defaultTokenizeProbeField);
        ok = Array.isArray(actual)
            && actual.length === expected.length
            && expected.every((token, i) => token === actual[i]);
    }
    catch (_err) {
        // The probe uses synthetic text and a synthetic field name; a tokenizer that
        // rejects them is by definition not the default. Real inputs still reach it
        // through the custom path, where any genuine error surfaces to the caller.
        ok = false;
    }
    tokenizeBehaviorCache.set(tokenize, ok);
    return ok;
}
2170
/**
 * Stream the default tokenizer's output to `onToken` without building an array.
 *
 * Emits exactly the tokens of `text.split(SPACE_OR_PUNCTUATION)`, in order: a leading
 * or trailing delimiter run yields an empty edge token, and empty text yields one
 * empty token.
 *
 * The delimiter regex is a shared module-level `g` regex, so its `lastIndex` is
 * restored after every callback; `onToken` may therefore itself tokenize (re-enter
 * this function) without corrupting the outer scan.
 *
 * @param {string} text Field text to tokenize.
 * @param {(token: string) => void} onToken Called once per token.
 */
function forEachDefaultToken(text, onToken) {
    if (text.length === 0) {
        onToken('');
        return;
    }
    const re = DEFAULT_TOKENIZE_DELIMITERS;
    let start = 0;
    re.lastIndex = 0;
    let match;
    while ((match = re.exec(text)) !== null) {
        // The pattern is `+`-quantified, so a match is never empty and never starts
        // before `start`; the slice is '' only for a delimiter run at index 0.
        const token = text.slice(start, match.index);
        start = match.index + match[0].length;
        onToken(token);
        // A re-entrant call leaves the shared regex at lastIndex 0; resume after this match.
        re.lastIndex = start;
    }
    // Remainder after the last delimiter ('' when the text ends with a delimiter).
    onToken(text.slice(start));
}
2198
/**
 * Fill `out` with the default-tokenizer tokens of `text`.
 * `out` is emptied first and reused, which avoids the array `text.split()` would allocate.
 *
 * @param {string[]} out Reusable token buffer (overwritten).
 * @param {string} text Field text to tokenize.
 */
function tokenizeDefaultInto(out, text) {
    out.length = 0;
    const appendToken = (token) => {
        out.push(token);
    };
    forEachDefaultToken(text, appendToken);
}
2203
/**
 * Tokenize field text into `out` (reused). Fast path when `tokenize` is the library default.
 *
 * For a custom tokenizer the tokens are copied one by one: spreading them into
 * `push(...tokens)` passes every token as a call argument and throws a `RangeError`
 * once a field yields more tokens than the engine's argument limit.
 *
 * @param {string[]} out Reusable token buffer (overwritten).
 * @param {Function} tokenize Tokenizer called as `tokenize(text, fieldName)`.
 * @param {string} text Field text.
 * @param {string} fieldName Field name forwarded to a custom tokenizer.
 */
function tokenizeFieldInto(out, tokenize, text, fieldName) {
    if (isDefaultTokenize(tokenize)) {
        tokenizeDefaultInto(out, text);
        return;
    }
    const tokens = tokenize(text, fieldName);
    out.length = 0;
    for (const token of tokens) {
        out.push(token);
    }
}
2213
/**
 * Default-tokenizer variant of field frequency collection: tokenizes `text` and tallies
 * each processed token in a single pass, with no intermediate token array.
 *
 * @param {Map<*, number>} localFreqs Reusable frequency map (cleared, then filled).
 * @param {string} text Field text.
 * @param {string} fieldName Field name forwarded to `processTerm`.
 * @param {Function} processTerm Called as `processTerm(token, fieldName)`.
 * @returns {number} Number of distinct processed terms (`localFreqs.size`).
 */
function collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm) {
    localFreqs.clear();
    const countToken = (token) => {
        const processed = processTerm(token, fieldName);
        accumulateProcessedTerm(localFreqs, processed);
    };
    forEachDefaultToken(text, countToken);
    return localFreqs.size;
}
2220
/**
 * Compute the term frequencies of one field into `localFreqs`.
 *
 * With the default tokenizer, tokenizing and counting happen in one pass. With a custom
 * tokenizer the field is first tokenized into `tokenScratch`, then counted from it.
 *
 * @param {Map<*, number>} localFreqs Reusable frequency map.
 * @param {string[]} tokenScratch Reusable token buffer; only touched for custom tokenizers.
 * @param {Function} tokenize Tokenizer called as `tokenize(text, fieldName)`.
 * @param {string} text Field text.
 * @param {string} fieldName Field name.
 * @param {Function} processTerm Called as `processTerm(token, fieldName)`.
 * @returns {number} Number of distinct processed terms.
 */
function collectFieldTermFreqsFromFieldInto(localFreqs, tokenScratch, tokenize, text, fieldName, processTerm) {
    if (!isDefaultTokenize(tokenize)) {
        // Two-phase fallback: materialize the custom tokenizer's output, then count it.
        tokenizeFieldInto(tokenScratch, tokenize, text, fieldName);
        return collectFieldTermFreqsInto(localFreqs, tokenScratch, fieldName, processTerm);
    }
    return collectDefaultFieldTermFreqsInto(localFreqs, text, fieldName, processTerm);
}
2247
2231
  /** Same running average as {@link MiniSearch} private addFieldLength. */
2248
2232
  function updateAvgFieldLength(avgFieldLength, fieldId, count, length) {
@@ -3442,50 +3426,260 @@ async function decodeFrozenSnapshotAsync(buf) {
3442
3426
  return decodeFrozenSnapshot(buf);
3443
3427
  }
3444
3428
 
3445
- function getOrCreateTermIndex(state, index, term) {
3429
+ const DEFAULT_CAPACITY = 16;
3430
/** Growable unsigned 32-bit column (build scratch; narrowed to u16 at finalize when possible). */
class GrowableUint32Column {
    /**
     * @param {number} [initialCapacity] Initial number of cells (at least 1 is allocated).
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint32Array(Math.max(1, initialCapacity));
    }
    /** Number of values pushed so far (not the buffer capacity). */
    get length() {
        return this._len;
    }
    /** Append one value, doubling the backing buffer when it is full. */
    push(value) {
        if (this._len >= this._buf.length) {
            const grown = new Uint32Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = value;
    }
    /**
     * Copy `length` values starting at `sourceOffset` into `target` at `targetOffset`.
     *
     * `docIdWidth` is accepted for call compatibility only: storing into a typed array
     * already converts each value to the target's element type, so the 16-bit and
     * 32-bit cases share one loop.
     */
    copyRangeInto(sourceOffset, length, target, targetOffset, docIdWidth) {
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = this._buf[sourceOffset + i];
        }
    }
    /**
     * Set the logical length to `length` and shrink the backing buffer to match.
     *
     * `truncate(0)` releases the scratch buffer too (down to a single cell); this is
     * what `IncrementalPostingsAccumulator.clear()` relies on to drop the global
     * scratch after finalize. At least one cell is kept so capacity is never zero.
     */
    truncate(length) {
        this._len = length;
        const keep = Math.max(1, length);
        if (keep < this._buf.length) {
            this._buf = this._buf.slice(0, keep);
        }
    }
}
3466
/** Growable frequency column (u16 cells; matches frozen clamp range). */
class GrowableFreqColumn {
    /**
     * @param {number} [initialCapacity] Initial number of cells (at least 1 is allocated).
     */
    constructor(initialCapacity = DEFAULT_CAPACITY) {
        this._len = 0;
        this._buf = new Uint16Array(Math.max(1, initialCapacity));
    }
    /** Number of frequencies pushed so far (not the buffer capacity). */
    get length() {
        return this._len;
    }
    /** Append one frequency (passed through `clampFreq`), doubling the buffer when full. */
    push(freq) {
        if (this._len >= this._buf.length) {
            const grown = new Uint16Array(Math.max(1, this._buf.length * 2));
            grown.set(this._buf);
            this._buf = grown;
        }
        this._buf[this._len++] = clampFreq(freq);
    }
    /** Copy `length` cells starting at `sourceOffset` into `target` at `targetOffset`. */
    copyRangeInto(sourceOffset, length, target, targetOffset) {
        for (let i = 0; i < length; i++) {
            target[targetOffset + i] = this._buf[sourceOffset + i];
        }
    }
    /**
     * Set the logical length to `length` and shrink the backing buffer to match.
     *
     * `truncate(0)` releases the scratch buffer too (down to a single cell); this is
     * what `IncrementalPostingsAccumulator.clear()` relies on to drop the global
     * scratch after finalize. At least one cell is kept so capacity is never zero.
     */
    truncate(length) {
        this._len = length;
        const keep = Math.max(1, length);
        if (keep < this._buf.length) {
            this._buf = this._buf.slice(0, keep);
        }
    }
}
3495
/**
 * Single-pass postings accumulator for {@link FrozenIndexBuilder}.
 *
 * Every posting is appended to one global docId stream and one global frequency stream.
 * Per (term, field) slot, only the list of contiguous ranges inside those streams is
 * kept; `finalize` then regroups the ranges slot by slot into the frozen layout.
 */
class IncrementalPostingsAccumulator {
    /**
     * @param {number} fieldCount Number of indexed fields (slot = termIndex * fieldCount + fieldId).
     * @param {{ estimatedTotalPostings?: number }} [hints] Optional initial capacity hint.
     */
    constructor(fieldCount, hints) {
        this._slots = new Map();
        this._totalPostings = 0;
        this._maxFreq = 0;
        this._fieldCount = fieldCount;
        const hinted = hints == null ? undefined : hints.estimatedTotalPostings;
        const cap = Math.max(DEFAULT_CAPACITY, hinted == null ? 0 : hinted);
        this._docIds = new GrowableUint32Column(cap);
        this._freqs = new GrowableFreqColumn(cap);
    }
    /** Number of postings appended so far (not reset by `clear`). */
    get totalPostings() {
        return this._totalPostings;
    }
    /** Largest clamped frequency appended so far (not reset by `clear`). */
    get maxFreq() {
        return this._maxFreq;
    }
    /**
     * Append one posting and record its stream position under the (term, field) slot.
     * A posting that directly follows the slot's last range extends that range;
     * otherwise a new one-element range is opened.
     */
    append(termIndex, fieldId, docId, freq) {
        const slot = termIndex * this._fieldCount + fieldId;
        const position = this._docIds.length;
        this._docIds.push(docId);
        const clamped = clampFreq(freq);
        this._freqs.push(clamped);
        if (clamped > this._maxFreq) {
            this._maxFreq = clamped;
        }
        this._totalPostings += 1;
        const ranges = this._slots.get(slot);
        if (ranges == null) {
            this._slots.set(slot, { starts: [position], lengths: [1] });
            return;
        }
        const tail = ranges.starts.length - 1;
        const tailEnd = ranges.starts[tail] + ranges.lengths[tail];
        if (tailEnd === position) {
            ranges.lengths[tail] += 1;
        }
        else {
            ranges.starts.push(position);
            ranges.lengths.push(1);
        }
    }
    /** Forget all slot ranges and truncate both global streams to length 0. */
    clear() {
        this._slots.clear();
        // Drop global scratch backing so finalize does not retain duplicate posting bytes.
        this._docIds.truncate(0);
        this._freqs.truncate(0);
    }
    /**
     * Copy every range of one slot into the output arrays, starting at `write`.
     * @returns {number} The write position after the last copied posting.
     */
    copySlot(ranges, allDocIds, allFreqs, write, docIdWidth) {
        const { starts, lengths } = ranges;
        let cursor = write;
        for (let r = 0; r < starts.length; r++) {
            const from = starts[r];
            const count = lengths[r];
            this._docIds.copyRangeInto(from, count, allDocIds, cursor, docIdWidth);
            this._freqs.copyRangeInto(from, count, allFreqs, cursor);
            cursor += count;
        }
        return cursor;
    }
    /** Total number of postings across all ranges of one slot. */
    slotLength(ranges) {
        let total = 0;
        for (let i = 0; i < ranges.lengths.length; i++) {
            total += ranges.lengths[i];
        }
        return total;
    }
    /**
     * Materialize the accumulated postings into the frozen layout chosen by
     * `choosePostingsLayout(fieldCount)`, then clear the accumulator's scratch state.
     *
     * @param {number} termCount Number of distinct terms.
     * @param {number} nextId Document count; 16-bit docIds are used when it is <= 65535.
     * @returns {object} Postings record; the fields of the unused layout are `null`.
     */
    finalize(termCount, nextId) {
        const fieldCount = this._fieldCount;
        const totalPostings = this._totalPostings;
        const slots = this._slots;
        const layout = choosePostingsLayout(fieldCount);
        const docIdWidth = nextId <= 65535 ? 16 : 32;
        const allDocIds = docIdWidth === 16
            ? new Uint16Array(totalPostings)
            : new Uint32Array(totalPostings);
        const allFreqs = allocateFreqs(totalPostings, this._maxFreq);
        let write = 0;
        if (layout === 'dense') {
            const slotCount = termCount * fieldCount;
            const denseOffsets = new Uint32Array(slotCount);
            const denseLengths = new Uint32Array(slotCount);
            // Slots are numbered term-major, so walking them in order visits
            // (term 0, field 0..), (term 1, field 0..), and so on.
            for (let slot = 0; slot < slotCount; slot++) {
                const ranges = slots.get(slot);
                const len = ranges == null ? 0 : this.slotLength(ranges);
                denseOffsets[slot] = write;
                denseLengths[slot] = len;
                if (len > 0) {
                    write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
                    // Range metadata is no longer needed once the slot is copied.
                    slots.delete(slot);
                }
            }
            this.clear();
            return {
                fieldCount,
                termCount,
                nextId,
                layout,
                docIdWidth,
                sparseFieldIdWidth: null,
                allDocIds,
                allFreqs,
                denseOffsets,
                denseLengths,
                sparseTermStarts: null,
                sparseFieldIds: null,
                sparseOffsets: null,
                sparseLengths: null,
            };
        }
        const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
        const fieldIdsScratch = [];
        const offsetsScratch = [];
        const lengthsScratch = [];
        // sparseTermStarts[ti] .. sparseTermStarts[ti + 1] delimits term ti's non-empty fields.
        const sparseTermStarts = new Uint32Array(termCount + 1);
        let slot = 0;
        for (let ti = 0; ti < termCount; ti++) {
            for (let f = 0; f < fieldCount; f++, slot++) {
                const ranges = slots.get(slot);
                const len = ranges == null ? 0 : this.slotLength(ranges);
                if (len === 0) {
                    continue;
                }
                fieldIdsScratch.push(f);
                offsetsScratch.push(write);
                lengthsScratch.push(len);
                write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
                slots.delete(slot);
            }
            sparseTermStarts[ti + 1] = fieldIdsScratch.length;
        }
        this.clear();
        const sparseFieldIds = sparseFieldIdWidth === 16
            ? new Uint16Array(fieldIdsScratch)
            : new Uint8Array(fieldIdsScratch);
        return {
            fieldCount,
            termCount,
            nextId,
            layout,
            docIdWidth,
            sparseFieldIdWidth,
            allDocIds,
            allFreqs,
            denseOffsets: null,
            denseLengths: null,
            sparseTermStarts,
            sparseFieldIds,
            sparseOffsets: new Uint32Array(offsetsScratch),
            sparseLengths: new Uint32Array(lengthsScratch),
        };
    }
}
3657
+
3658
/**
 * Look up the numeric index of `term`, assigning the next free index on first sight.
 *
 * @param {{ value: number }} termCount Mutable counter; incremented when a new term is added.
 * @param {{ get: Function, set: Function }} index Term → index map (`get`/`set` are the only calls made).
 * @param {string} term Processed term.
 * @returns {number} Existing or newly assigned term index.
 */
function getOrCreateTermIndex(termCount, index, term) {
    const known = index.get(term);
    if (known == null) {
        const assigned = termCount.value;
        termCount.value = assigned + 1;
        index.set(term, assigned);
        return assigned;
    }
    return known;
}
3454
- function appendPosting(state, termIndex, fieldId, docId, freq) {
3455
- const slot = termIndex * state.fieldCount + fieldId;
3456
- let docIds = state.postingsDocIds[slot];
3457
- if (docIds == null) {
3458
- docIds = [];
3459
- state.postingsDocIds[slot] = docIds;
3460
- state.postingsFreqs[slot] = [];
3461
- }
3462
- docIds.push(docId);
3463
- state.postingsFreqs[slot].push(freq);
3464
- const v = clampFreq(freq);
3465
- if (v > state.maxFreq)
3466
- state.maxFreq = v;
3467
- state.totalPostings++;
3468
- }
3469
- function finalizeFlatPostings(state, nextId) {
3470
- return materializeFrozenPostingsFromBuilder({
3471
- fieldCount: state.fieldCount,
3472
- termCount: state.terms.length,
3473
- postingsDocIds: state.postingsDocIds,
3474
- postingsFreqs: state.postingsFreqs,
3475
- totalPostings: state.totalPostings,
3476
- maxFreq: state.maxFreq,
3477
- }, nextId);
3478
- }
3479
3667
  /** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
3480
3668
  class FrozenIndexBuilder {
3481
3669
  constructor(options, hints) {
3670
+ var _a, _b;
3671
+ this._termCount = { value: 0 };
3672
+ this._fieldTermFreqScratch = new Map();
3673
+ this._tokenScratch = [];
3482
3674
  this._options = resolveIndexingOptions(options);
3483
3675
  this._fieldIds = buildFieldIds(this._options.fields);
3484
3676
  this._fieldCount = this._options.fields.length;
3485
3677
  this._index = new SearchableMap();
3486
- this._terms = [];
3487
- this._postingsDocIds = [];
3488
- this._postingsFreqs = [];
3678
+ const estimatedDocs = (_a = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount) !== null && _a !== void 0 ? _a : 0;
3679
+ const perSlot = (_b = hints === null || hints === void 0 ? void 0 : hints.estimatedPostingsPerSlot) !== null && _b !== void 0 ? _b : 4;
3680
+ this._postings = new IncrementalPostingsAccumulator(this._fieldCount, {
3681
+ estimatedTotalPostings: estimatedDocs > 0 ? estimatedDocs * perSlot : undefined,
3682
+ });
3489
3683
  this._avgFieldLength = [];
3490
3684
  this._seenIds = new Set();
3491
3685
  this._nextId = 0;
@@ -3501,14 +3695,6 @@ class FrozenIndexBuilder {
3501
3695
  this._storedFields = [];
3502
3696
  this._fieldLengthData = [];
3503
3697
  }
3504
- this._postingsState = {
3505
- fieldCount: this._fieldCount,
3506
- terms: this._terms,
3507
- postingsDocIds: this._postingsDocIds,
3508
- postingsFreqs: this._postingsFreqs,
3509
- totalPostings: 0,
3510
- maxFreq: 0,
3511
- };
3512
3698
  }
3513
3699
  /** Number of documents indexed so far (not yet frozen). */
3514
3700
  get documentCount() {
@@ -3535,16 +3721,17 @@ class FrozenIndexBuilder {
3535
3721
  const fieldValue = extractField(document, field);
3536
3722
  if (fieldValue == null)
3537
3723
  continue;
3538
- const tokens = tokenize(stringifyField(fieldValue, field), field);
3724
+ const fieldText = typeof fieldValue === 'string'
3725
+ ? fieldValue
3726
+ : stringifyField(fieldValue, field);
3539
3727
  const fieldId = this._fieldIds[field];
3540
- const uniqueTerms = new Set(tokens).size;
3541
- const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
3728
+ const uniqueTerms = collectFieldTermFreqsFromFieldInto(this._fieldTermFreqScratch, this._tokenScratch, tokenize, fieldText, field, processTerm);
3542
3729
  this._fieldLengthData[shortId * this._fieldCount + fieldId] = uniqueTerms;
3543
3730
  updateAvgFieldLength(this._avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
3544
- for (const [term, freq] of localFreqs) {
3545
- const ti = getOrCreateTermIndex(this._postingsState, this._index, term);
3546
- appendPosting(this._postingsState, ti, fieldId, shortId, freq);
3547
- }
3731
+ this._fieldTermFreqScratch.forEach((freq, term) => {
3732
+ const ti = getOrCreateTermIndex(this._termCount, this._index, term);
3733
+ this._postings.append(ti, fieldId, shortId, freq);
3734
+ });
3548
3735
  }
3549
3736
  }
3550
3737
  /**
@@ -3601,7 +3788,11 @@ class FrozenIndexBuilder {
3601
3788
  }
3602
3789
  this._frozen = true;
3603
3790
  const documentCount = this._nextId;
3604
- const postings = finalizeFlatPostings(this._postingsState, documentCount);
3791
+ const termCount = this._termCount.value;
3792
+ const postings = this._postings.finalize(termCount, documentCount);
3793
+ const radixTree = this._index.radixTree;
3794
+ this._index = null;
3795
+ const index = fromRadixTree(radixTree, termCount);
3605
3796
  const avgFieldLength = new Float32Array(this._fieldCount);
3606
3797
  for (let f = 0; f < this._fieldCount; f++) {
3607
3798
  avgFieldLength[f] = (_a = this._avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
@@ -3614,8 +3805,6 @@ class FrozenIndexBuilder {
3614
3805
  ? this._storedFields.slice(0, documentCount)
3615
3806
  : this._storedFields;
3616
3807
  const idLookup = createIdToShortIdLookup(externalIds, documentCount);
3617
- // Incremental builder: numeric radix leaves + build-time terms[] for postings.
3618
- // freezeFromMiniSearch packs Map leaves in one radix pass (no resident terms[]).
3619
3808
  return {
3620
3809
  options: this._options,
3621
3810
  documentCount,
@@ -3627,8 +3816,8 @@ class FrozenIndexBuilder {
3627
3816
  storedFields,
3628
3817
  fieldLengthMatrix: materializeFieldLengthMatrix(this._fieldLengthData, documentCount * this._fieldCount),
3629
3818
  avgFieldLength,
3630
- index: fromRadixTree(this._index.radixTree, this._terms.length),
3631
- termCount: this._terms.length,
3819
+ index,
3820
+ termCount,
3632
3821
  postings,
3633
3822
  };
3634
3823
  }
@@ -4334,4 +4523,4 @@ class FrozenMiniSearch {
4334
4523
  }
4335
4524
  FrozenMiniSearch.wildcard = WILDCARD_QUERY;
4336
4525
 
4337
- export { AND, AND_NOT, FrozenIndexBuilder, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
4526
+ export { AND, AND_NOT, FrozenIndexBuilder, FrozenMiniSearch, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, FrozenMiniSearch as default, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yoch/frozenminisearch",
3
- "version": "1.0.0",
3
+ "version": "1.0.1",
4
4
  "description": "Read-only Node.js full-text search — compact frozen indexes and binary snapshots",
5
5
  "main": "dist/cjs/index.cjs",
6
6
  "module": "dist/es/index.js",
@@ -100,6 +100,9 @@
100
100
  "bench:history": "node --expose-gc benchmarks/framework/cli.mjs history",
101
101
  "bench:micro": "node benchmarks/micro/run.mjs",
102
102
  "bench:readme": "node benchmarks/scripts/generate-readme-comparison.mjs",
103
+ "bench:build-peak": "npm run build && node --expose-gc benchmarks/scripts/build-peak-heap.mjs",
104
+ "bench:medicaments-build-peak": "npm run build && NODE_OPTIONS='--expose-gc' npx --yes tsx benchmarks/scripts/medicaments-build-peak-heap.mjs",
105
+ "bench:build-heap-profile": "npm run build && NODE_OPTIONS='--expose-gc' npx --yes tsx benchmarks/scripts/build-heap-profile.mjs",
103
106
  "bench:reference:update": "npm run build && RUNS=3 node --expose-gc benchmarks/framework/cli.mjs record --profile=vs-reference && node benchmarks/scripts/promote-latest-to-reference.mjs && npm run bench:readme",
104
107
  "benchmark:compare": "npm run build && node --expose-gc benchmarks/compare.js",
105
108
  "benchmark:calibrate-batches": "npm run build && node --expose-gc benchmarks/scripts/calibrate-search-batches.mjs",
@@ -132,7 +135,7 @@
132
135
  "clean-build": "rm -rf dist",
133
136
  "build-minified": "MINIFY=true npm run build",
134
137
  "sync-docs-media": "node scripts/sync-docs-media.cjs",
135
- "build-docs": "npm run sync-docs-media && typedoc --options typedoc.json && npm run build-demo",
138
+ "build-docs": "typedoc --options typedoc.json && npm run sync-docs-media && npm run build-demo",
136
139
  "build-demo": "mkdir -p ./docs/demo && cp -r ./examples/plain_js/. ./docs/demo",
137
140
  "lint": "eslint 'src/**/*.{js,ts}'",
138
141
  "lintfix": "eslint --fix 'src/**/*.{js,ts}'",