@yoch/frozenminisearch 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/es/index.js CHANGED
@@ -1,9 +1,66 @@
1
1
  import zlib from 'node:zlib';
2
2
 
3
/**
 * Internal AND / AND_NOT gate thresholds (not exported from the public package entry).
 *
 * `minLength` is the shortest posting list for which the ratio gate may apply, and
 * `ratioShift` is the right-shift applied to the posting-list length to obtain the
 * largest admissible gate size (see `passGateByPostingRatio`).
 *
 * The policy object is frozen because it is shared as a default argument value:
 * an accidental write would otherwise silently change every later gate decision.
 */
const DEFAULT_POSTING_GATE_MIN_LENGTH = 2048;
const DEFAULT_POSTING_GATE_RATIO_SHIFT = 2;
const DEFAULT_POSTING_GATE_POLICY = Object.freeze({
    minLength: DEFAULT_POSTING_GATE_MIN_LENGTH,
    ratioShift: DEFAULT_POSTING_GATE_RATIO_SHIFT,
});
12
/**
 * Ratio gate: accepts a gate only when the posting list is at least
 * `policy.minLength` long and the gate is no larger than the list length
 * right-shifted (unsigned) by `policy.ratioShift`.
 *
 * @param {number} gateSize - Number of doc ids in the gate.
 * @param {number} postingListLength - Length of the posting list being compared.
 * @param {{minLength: number, ratioShift: number}} [policy] - Thresholds to apply.
 * @returns {boolean} True when the gate passes the ratio test.
 */
function passGateByPostingRatio(gateSize, postingListLength, policy = DEFAULT_POSTING_GATE_POLICY) {
    const { minLength, ratioShift } = policy;
    const listTooShort = postingListLength < minLength;
    return !listTooShort && gateSize <= (postingListLength >>> ratioShift);
}
17
/**
 * Default size limits for the AND gate: an absolute cap on the gate size and a
 * cap expressed as a fraction of the document count (see `resolveGateMaxSize`).
 *
 * Frozen because the object is shared as a default argument value; a stray
 * write would otherwise change the limits for every later call.
 */
const DEFAULT_AND_GATE_LIMITS = Object.freeze({
    maxAbsolute: 5000,
    maxFraction: 0.1,
});
21
/**
 * Computes the largest gate size that is unconditionally accepted.
 *
 * The proportional limit (`documentCount * limits.maxFraction`, floored) is
 * raised to at least 100 and then capped at `limits.maxAbsolute`.
 *
 * @param {number} documentCount - Number of documents in the index.
 * @param {{maxAbsolute: number, maxFraction: number}} [limits] - Gate limits.
 * @returns {number} Maximum accepted gate size.
 */
function resolveGateMaxSize(documentCount, limits = DEFAULT_AND_GATE_LIMITS) {
    const proportional = Math.floor(documentCount * limits.maxFraction);
    const atLeastFloor = Math.max(100, proportional);
    return Math.min(limits.maxAbsolute, atLeastFloor);
}
24
/**
 * Decides whether a gate (set of candidate doc ids) is small enough to be used
 * as a filter for the next branch.
 *
 * Accepted when the gate is empty, when it fits under `resolveGateMaxSize`, or,
 * if a positive `postingListLength` is supplied, when it passes the posting
 * ratio test.
 *
 * @param {number} gateSize - Number of doc ids in the gate.
 * @param {number} documentCount - Number of documents in the index.
 * @param {{maxAbsolute: number, maxFraction: number}} [limits] - Size limits.
 * @param {number} [postingListLength] - Posting length to compare against, if known.
 * @param {{minLength: number, ratioShift: number}} [postingGatePolicy] - Ratio policy.
 * @returns {boolean} True when the gate should be applied.
 */
function gateIsSelectiveEnough(gateSize, documentCount, limits = DEFAULT_AND_GATE_LIMITS, postingListLength, postingGatePolicy = DEFAULT_POSTING_GATE_POLICY) {
    if (gateSize === 0 || gateSize <= resolveGateMaxSize(documentCount, limits)) {
        return true;
    }
    // Too large for the absolute/fractional limits: fall back to the ratio test.
    const hasPostingLength = postingListLength != null && postingListLength > 0;
    return hasPostingLength && passGateByPostingRatio(gateSize, postingListLength, postingGatePolicy);
}
36
+
3
37
  const MAX_FREQ = 65535;
4
38
  function readDocId(docIds, index) {
5
39
  return docIds[index];
6
40
  }
41
/**
 * Binary search for `docId` inside the segment `[offset, offset + length)` of
 * `docIds`, which must be sorted ascending within that segment.
 *
 * @param {ArrayLike<number>} docIds - Backing array of doc ids.
 * @param {number} offset - Index of the first element of the segment.
 * @param {number} length - Number of elements in the segment.
 * @param {number} docId - Doc id to locate.
 * @returns {number} Global index into `docIds`, or -1 when absent.
 */
function findDocIndexInSortedSegment(docIds, offset, length, docId) {
    let first = 0;
    let last = length - 1;
    while (first <= last) {
        const middle = (first + last) >>> 1;
        const position = offset + middle;
        const candidate = docIds[position];
        if (candidate < docId) {
            first = middle + 1;
            continue;
        }
        if (candidate > docId) {
            last = middle - 1;
            continue;
        }
        return position;
    }
    return -1;
}
57
/**
 * Scoring-layer choice between scanning a posting list and binary-searching it
 * for each allowed doc, made once `allowedDocs` is already in effect.
 *
 * Currently delegates to `passGateByPostingRatio` with its default policy; it
 * is kept as a separate function because it is a distinct decision point.
 *
 * @param {number} gateSize - Number of allowed docs.
 * @param {number} listLength - Length of the posting list.
 * @returns {boolean} True when per-doc seeking should be used instead of a scan.
 */
function shouldSeekAllowedDocs(gateSize, listLength) {
    const preferSeek = passGateByPostingRatio(gateSize, listLength);
    return preferSeek;
}
7
64
  function allocateFreqs(length, maxValue) {
8
65
  if (maxValue <= 0xff)
9
66
  return new Uint8Array(length);
@@ -65,10 +122,15 @@ function bm25FieldConstants(bm25params, avgFieldLength) {
65
122
  const { k, b, d } = bm25params;
66
123
  return { k, d, k1: k + 1, oneMinusB: 1 - b, bOverAvg: b / avgFieldLength };
67
124
  }
68
- function calcBM25ScoreWithConstants(termFreq, matchingCount, totalCount, fieldLength, constants) {
125
/**
 * Inverse-document-frequency factor used by the BM25 score:
 * `ln(1 + (totalCount - matchingCount + 0.5) / (matchingCount + 0.5))`.
 *
 * @param {number} matchingCount - Number of documents matching the term.
 * @param {number} totalCount - Total number of documents.
 * @returns {number} The IDF factor.
 */
function bm25Idf(matchingCount, totalCount) {
    const nonMatching = totalCount - matchingCount + 0.5;
    const matching = matchingCount + 0.5;
    return Math.log(1 + nonMatching / matching);
}
128
/**
 * Term-frequency part of the BM25 score multiplied by a precomputed IDF.
 *
 * @param {number} termFreq - Frequency of the term in the field.
 * @param {number} fieldLength - Length of the field for this document.
 * @param {{k: number, d: number, k1: number, oneMinusB: number, bOverAvg: number}} constants
 *   Per-field constants as produced by `bm25FieldConstants`.
 * @param {number} idf - Inverse-document-frequency factor.
 * @returns {number} The BM25 score contribution.
 */
function calcBm25TfWithConstants(termFreq, fieldLength, constants, idf) {
    const lengthNorm = constants.oneMinusB + constants.bOverAvg * fieldLength;
    const saturation = termFreq * constants.k1 / (termFreq + constants.k * lengthNorm);
    return idf * (constants.d + saturation);
}
132
/**
 * Full BM25 score: computes the IDF from the counts and applies it to the
 * term-frequency component.
 *
 * @param {number} termFreq - Frequency of the term in the field.
 * @param {number} matchingCount - Number of documents matching the term.
 * @param {number} totalCount - Total number of documents.
 * @param {number} fieldLength - Length of the field for this document.
 * @param {object} constants - Per-field constants from `bm25FieldConstants`.
 * @returns {number} The BM25 score.
 */
function calcBM25ScoreWithConstants(termFreq, matchingCount, totalCount, fieldLength, constants) {
    const idf = bm25Idf(matchingCount, totalCount);
    return calcBm25TfWithConstants(termFreq, fieldLength, constants, idf);
}
73
135
  const getOwnProperty = (object, property) => Object.prototype.hasOwnProperty.call(object, property) ? object[property] : undefined;
74
136
  function fieldBoostsForQuery(options, fields) {
@@ -97,7 +159,7 @@ function getDerivedTerm(derivedTerm, cache) {
97
159
  cache.value = derivedTerm.resolve();
98
160
  return cache.value;
99
161
  }
100
- function scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache) {
162
+ function scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache, hoistedIdf) {
101
163
  const resolvedDerivedTerm = getDerivedTerm(derivedTerm, derivedTermCache);
102
164
  const docBoost = boostDocumentFn
103
165
  ? boostDocumentFn(context.getExternalId(docId), resolvedDerivedTerm, context.getStoredFields(docId))
@@ -105,7 +167,9 @@ function scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFre
105
167
  if (!docBoost)
106
168
  return;
107
169
  const fieldLength = context.getFieldLength(docId, fieldId);
108
- const rawScore = calcBM25ScoreWithConstants(termFreq, matchingFields, context.documentCount, fieldLength, bm25);
170
+ const rawScore = hoistedIdf !== undefined
171
+ ? calcBm25TfWithConstants(termFreq, fieldLength, bm25, hoistedIdf)
172
+ : calcBM25ScoreWithConstants(termFreq, matchingFields, context.documentCount, fieldLength, bm25);
109
173
  const weightedScore = termWeight * termBoost * fieldBoost * docBoost * rawScore;
110
174
  const result = results.get(docId);
111
175
  if (result) {
@@ -128,22 +192,39 @@ function scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFre
128
192
  }
129
193
  }
130
194
  function aggregateSegmentPostingList(sourceTerm, derivedTerm, termWeight, termBoost, field, fieldId, fieldBoost, list, context, boostDocumentFn, bm25params, results, allowedDocs) {
131
- var _a;
195
+ var _a, _b;
132
196
  let matchingFields = list.length;
133
197
  const bm25 = bm25FieldConstants(bm25params, context.avgFieldLength[fieldId]);
198
+ const hoistedIdf = context.isDocActive == null
199
+ ? bm25Idf(matchingFields, context.documentCount)
200
+ : undefined;
134
201
  const { docIds, freqs, offset, length } = list;
135
202
  const derivedTermCache = {};
203
+ if (allowedDocs != null && shouldSeekAllowedDocs(allowedDocs.size, length)) {
204
+ for (const docId of allowedDocs) {
205
+ if (context.isDocActive != null && !context.isDocActive(docId)) {
206
+ (_a = context.onInactiveDoc) === null || _a === void 0 ? void 0 : _a.call(context, docId, fieldId, getDerivedTerm(derivedTerm, derivedTermCache));
207
+ matchingFields -= 1;
208
+ continue;
209
+ }
210
+ const index = findDocIndexInSortedSegment(docIds, offset, length, docId);
211
+ if (index < 0)
212
+ continue;
213
+ scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, freqs[index], termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache, hoistedIdf);
214
+ }
215
+ return matchingFields;
216
+ }
136
217
  for (let i = 0; i < length; i++) {
137
218
  const docId = readDocId(docIds, offset + i);
138
219
  const termFreq = freqs[offset + i];
139
220
  if (context.isDocActive != null && !context.isDocActive(docId)) {
140
- (_a = context.onInactiveDoc) === null || _a === void 0 ? void 0 : _a.call(context, docId, fieldId, getDerivedTerm(derivedTerm, derivedTermCache));
221
+ (_b = context.onInactiveDoc) === null || _b === void 0 ? void 0 : _b.call(context, docId, fieldId, getDerivedTerm(derivedTerm, derivedTermCache));
141
222
  matchingFields -= 1;
142
223
  continue;
143
224
  }
144
225
  if (allowedDocs != null && !allowedDocs.has(docId))
145
226
  continue;
146
- scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache);
227
+ scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache, hoistedIdf);
147
228
  }
148
229
  return matchingFields;
149
230
  }
@@ -163,6 +244,9 @@ function aggregateTerm(sourceTerm, derivedTerm, termWeight, termBoost, fieldTerm
163
244
  }
164
245
  let matchingFields = postingList.size;
165
246
  const bm25 = bm25FieldConstants(bm25params, context.avgFieldLength[fieldId]);
247
+ const hoistedIdf = context.isDocActive == null
248
+ ? bm25Idf(matchingFields, context.documentCount)
249
+ : undefined;
166
250
  const derivedTermCache = {};
167
251
  postingList.forEachDoc((docId, termFreq) => {
168
252
  var _a;
@@ -173,7 +257,7 @@ function aggregateTerm(sourceTerm, derivedTerm, termWeight, termBoost, fieldTerm
173
257
  }
174
258
  if (allowedDocs != null && !allowedDocs.has(docId))
175
259
  return;
176
- scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache);
260
+ scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache, hoistedIdf);
177
261
  });
178
262
  }
179
263
  return results;
@@ -1742,82 +1826,47 @@ function readFieldLengthMatrixSection(buf, flags, cellCount) {
1742
1826
 
1743
1827
  const DISCARDED_DOC_ID = 0xffffffff;
1744
1828
  function postingFreqValue(freq, clampFrequencies) {
1745
- return clampFrequencies ? clampFreq(freq) : freq;
1746
- }
1747
- function materializeFlatPostings(params) {
1748
- const { fieldCount, termCount, forEachPosting, remapDocId, clampFrequencies } = params;
1749
- const slotCount = termCount * fieldCount;
1750
- const postingsOffsets = new Uint32Array(slotCount);
1751
- const postingsLengths = new Uint32Array(slotCount);
1752
- let totalPostings = 0;
1753
- let maxFreq = 0;
1754
- for (let ti = 0; ti < termCount; ti++) {
1755
- for (let f = 0; f < fieldCount; f++) {
1756
- forEachPosting(ti, f, (rawDocId, freq) => {
1757
- const docId = remapDocId != null ? remapDocId(rawDocId) : rawDocId;
1758
- if (docId === DISCARDED_DOC_ID)
1759
- return;
1760
- totalPostings++;
1761
- const v = postingFreqValue(freq, clampFrequencies);
1762
- if (v > maxFreq)
1763
- maxFreq = v;
1764
- });
1765
- }
1766
- }
1767
- const useUint16 = params.nextId != null && params.nextId <= 65535;
1768
- const allDocIds = useUint16
1769
- ? new Uint16Array(totalPostings)
1770
- : new Uint32Array(totalPostings);
1771
- const allFreqs = allocateFreqs(totalPostings, maxFreq);
1772
- // Slots are visited in ascending fieldId (0..fieldCount-1) per term. Sparse layouts
1773
- // rely on this ordering so field ids per term stay sorted for binary lookup.
1774
- let write = 0;
1775
- for (let ti = 0; ti < termCount; ti++) {
1776
- const base = ti * fieldCount;
1777
- for (let f = 0; f < fieldCount; f++) {
1778
- const offset = write;
1779
- let count = 0;
1780
- forEachPosting(ti, f, (rawDocId, freq) => {
1781
- const docId = remapDocId != null ? remapDocId(rawDocId) : rawDocId;
1782
- if (docId === DISCARDED_DOC_ID)
1783
- return;
1784
- if (useUint16) {
1785
- allDocIds[write] = docId;
1786
- }
1787
- else {
1788
- allDocIds[write] = docId;
1789
- }
1790
- allFreqs[write] = postingFreqValue(freq, clampFrequencies);
1791
- write++;
1792
- count++;
1793
- });
1794
- postingsOffsets[base + f] = offset;
1795
- postingsLengths[base + f] = count;
1796
- }
1797
- }
1798
- return {
1799
- postingsOffsets,
1800
- postingsLengths,
1801
- allDocIds,
1802
- allFreqs,
1803
- };
1829
+ return clampFreq(freq) ;
1804
1830
  }
1805
1831
 
1806
1832
  function readFieldId(fieldIds, index) {
1807
1833
  return fieldIds[index];
1808
1834
  }
1809
- function choosePostingsLayout(fieldCount) {
1810
- return fieldCount === 1 ? 'dense' : 'sparse';
1811
- }
1812
1835
  function chooseSparseFieldIdWidth(fieldCount) {
1813
1836
  return fieldCount > 255 ? 16 : 8;
1814
1837
  }
1815
- function materializeFrozenPostings(params) {
1816
- const { fieldCount, termCount, nextId } = params;
1817
- const layout = choosePostingsLayout(fieldCount);
1838
/**
 * Picks the postings layout with the smaller estimated index overhead.
 *
 * Dense cost: 8 bytes for each of the `termCount * fieldCount` slots.
 * Sparse cost: 4 bytes for each of `termCount + 1` entries, plus for every
 * non-empty slot 8 bytes and one field id (1 or 2 bytes, per
 * `chooseSparseFieldIdWidth`). Ties go to the dense layout.
 *
 * @param {number} fieldCount - Number of indexed fields.
 * @param {number} termCount - Number of terms.
 * @param {number} nonEmptySlots - Number of (term, field) slots holding postings.
 * @returns {'dense'|'sparse'} The chosen layout.
 */
function choosePostingsLayout(fieldCount, termCount, nonEmptySlots) {
    const bytesPerFieldId = chooseSparseFieldIdWidth(fieldCount) === 16 ? 2 : 1;
    const sparseCost = (termCount + 1) * 4 + nonEmptySlots * (bytesPerFieldId + 8);
    const denseCost = termCount * fieldCount * 8;
    if (denseCost <= sparseCost) {
        return 'dense';
    }
    return 'sparse';
}
1844
+ /** Shared dense/sparse layout emission; callers supply per-slot length and copy. */
1845
+ function buildFrozenPostingsLayout(fieldCount, termCount, nextId, totalPostings, maxFreq, source) {
1846
+ const layout = choosePostingsLayout(fieldCount, termCount, source.nonEmptySlots);
1818
1847
  const docIdWidth = nextId <= 65535 ? 16 : 32;
1848
+ const allDocIds = docIdWidth === 16
1849
+ ? new Uint16Array(totalPostings)
1850
+ : new Uint32Array(totalPostings);
1851
+ const allFreqs = allocateFreqs(totalPostings, maxFreq);
1852
+ const targets = { allDocIds, allFreqs, docIdWidth };
1819
1853
  if (layout === 'dense') {
1820
- const flat = materializeFlatPostings({ ...params, nextId });
1854
+ const slotCount = termCount * fieldCount;
1855
+ const denseOffsets = new Uint32Array(slotCount);
1856
+ const denseLengths = new Uint32Array(slotCount);
1857
+ let write = 0;
1858
+ for (let ti = 0; ti < termCount; ti++) {
1859
+ const base = ti * fieldCount;
1860
+ for (let f = 0; f < fieldCount; f++) {
1861
+ const slot = base + f;
1862
+ const len = source.slotLength(ti, f);
1863
+ denseOffsets[slot] = write;
1864
+ denseLengths[slot] = len;
1865
+ if (len > 0) {
1866
+ write = source.writeSlot(ti, f, write, targets);
1867
+ }
1868
+ }
1869
+ }
1821
1870
  return {
1822
1871
  fieldCount,
1823
1872
  termCount,
@@ -1825,10 +1874,10 @@ function materializeFrozenPostings(params) {
1825
1874
  layout,
1826
1875
  docIdWidth,
1827
1876
  sparseFieldIdWidth: null,
1828
- allDocIds: flat.allDocIds,
1829
- allFreqs: flat.allFreqs,
1830
- denseOffsets: flat.postingsOffsets,
1831
- denseLengths: flat.postingsLengths,
1877
+ allDocIds,
1878
+ allFreqs,
1879
+ denseOffsets,
1880
+ denseLengths,
1832
1881
  sparseTermStarts: null,
1833
1882
  sparseFieldIds: null,
1834
1883
  sparseOffsets: null,
@@ -1840,60 +1889,23 @@ function materializeFrozenPostings(params) {
1840
1889
  const sparseOffsets = [];
1841
1890
  const sparseLengths = [];
1842
1891
  const termStarts = new Array(termCount + 1).fill(0);
1843
- const { forEachPosting, remapDocId, clampFrequencies } = params;
1844
- // Non-empty slots per term are emitted with fieldId in ascending order (f loops 0..fieldCount-1).
1845
- let totalPostings = 0;
1846
- let maxFreq = 0;
1892
+ let write = 0;
1847
1893
  for (let ti = 0; ti < termCount; ti++) {
1848
1894
  termStarts[ti] = sparseFieldIdsScratch.length;
1849
1895
  for (let f = 0; f < fieldCount; f++) {
1850
- let count = 0;
1851
- forEachPosting(ti, f, (rawDocId, freq) => {
1852
- const docId = remapDocId != null ? remapDocId(rawDocId) : rawDocId;
1853
- if (docId === DISCARDED_DOC_ID)
1854
- return;
1855
- count++;
1856
- const v = postingFreqValue(freq, clampFrequencies);
1857
- if (v > maxFreq)
1858
- maxFreq = v;
1859
- });
1860
- if (count === 0)
1896
+ const len = source.slotLength(ti, f);
1897
+ if (len === 0)
1861
1898
  continue;
1862
1899
  sparseFieldIdsScratch.push(f);
1863
- sparseOffsets.push(totalPostings);
1864
- sparseLengths.push(count);
1865
- totalPostings += count;
1900
+ sparseOffsets.push(write);
1901
+ sparseLengths.push(len);
1902
+ write = source.writeSlot(ti, f, write, targets);
1866
1903
  }
1867
1904
  termStarts[ti + 1] = sparseFieldIdsScratch.length;
1868
1905
  }
1869
- const allDocIds = docIdWidth === 16
1870
- ? new Uint16Array(totalPostings)
1871
- : new Uint32Array(totalPostings);
1872
- const allFreqs = allocateFreqs(totalPostings, maxFreq);
1873
1906
  const sparseFieldIds = sparseFieldIdWidth === 16
1874
1907
  ? new Uint16Array(sparseFieldIdsScratch)
1875
1908
  : new Uint8Array(sparseFieldIdsScratch);
1876
- let write = 0;
1877
- for (let ti = 0; ti < termCount; ti++) {
1878
- const start = termStarts[ti];
1879
- const end = termStarts[ti + 1];
1880
- for (let s = start; s < end; s++) {
1881
- const f = readFieldId(sparseFieldIds, s);
1882
- forEachPosting(ti, f, (rawDocId, freq) => {
1883
- const docId = remapDocId != null ? remapDocId(rawDocId) : rawDocId;
1884
- if (docId === DISCARDED_DOC_ID)
1885
- return;
1886
- if (docIdWidth === 16) {
1887
- allDocIds[write] = docId;
1888
- }
1889
- else {
1890
- allDocIds[write] = docId;
1891
- }
1892
- allFreqs[write] = postingFreqValue(freq, clampFrequencies);
1893
- write++;
1894
- });
1895
- }
1896
- }
1897
1909
  return {
1898
1910
  fieldCount,
1899
1911
  termCount,
@@ -1911,6 +1923,58 @@ function materializeFrozenPostings(params) {
1911
1923
  sparseLengths: new Uint32Array(sparseLengths),
1912
1924
  };
1913
1925
  }
1926
/**
 * Builds the frozen postings layout from a posting enumerator in two passes.
 *
 * Pass 1 counts, per (term, field) slot, the postings that survive doc id
 * remapping, and tracks the total posting count, the largest stored frequency
 * and the number of non-empty slots. Pass 2 runs inside
 * `buildFrozenPostingsLayout` through the `writeSlot` callback and copies the
 * surviving postings into the target arrays.
 *
 * @param {object} params
 * @param {number} params.fieldCount - Number of fields.
 * @param {number} params.termCount - Number of terms.
 * @param {number} params.nextId - Passed through to `buildFrozenPostingsLayout`.
 * @param {Function} params.forEachPosting - `(termIndex, fieldId, emit)`; calls
 *   `emit(rawDocId, freq)` for each posting. It is invoked once per slot in
 *   pass 1 and again for each non-empty slot in pass 2, so it must yield the
 *   same postings both times.
 * @param {Function} [params.remapDocId] - Optional doc id mapper; postings mapped
 *   to `DISCARDED_DOC_ID` are dropped.
 * @returns {object} The layout returned by `buildFrozenPostingsLayout`.
 */
function materializeFrozenPostings(params) {
    const { fieldCount, termCount, nextId, forEachPosting, remapDocId } = params;
    // Single place for the remap / discard / frequency-normalisation rules so
    // the counting pass and the writing pass cannot drift apart.
    const forEachLivePosting = (ti, f, visit) => {
        forEachPosting(ti, f, (rawDocId, freq) => {
            const docId = remapDocId != null ? remapDocId(rawDocId) : rawDocId;
            if (docId === DISCARDED_DOC_ID) {
                return;
            }
            visit(docId, postingFreqValue(freq));
        });
    };
    const slotLengths = new Uint32Array(termCount * fieldCount);
    let totalPostings = 0;
    let maxFreq = 0;
    let nonEmptySlots = 0;
    for (let ti = 0; ti < termCount; ti++) {
        const base = ti * fieldCount;
        for (let f = 0; f < fieldCount; f++) {
            let count = 0;
            forEachLivePosting(ti, f, (_docId, freqValue) => {
                count++;
                if (freqValue > maxFreq) {
                    maxFreq = freqValue;
                }
            });
            if (count === 0) {
                continue;
            }
            slotLengths[base + f] = count;
            totalPostings += count;
            nonEmptySlots++;
        }
    }
    return buildFrozenPostingsLayout(fieldCount, termCount, nextId, totalPostings, maxFreq, {
        nonEmptySlots,
        slotLength(ti, f) {
            return slotLengths[ti * fieldCount + f];
        },
        writeSlot(ti, f, write, targets) {
            // The same assignment works for both doc id widths; the typed
            // array chosen by the layout builder performs the narrowing.
            const { allDocIds: outDocIds, allFreqs: outFreqs } = targets;
            let cursor = write;
            forEachLivePosting(ti, f, (docId, freqValue) => {
                outDocIds[cursor] = docId;
                outFreqs[cursor] = freqValue;
                cursor++;
            });
            return cursor;
        },
    });
}
1914
1978
  function postingsTypedBytes(layout) {
1915
1979
  const allDocIdsBytes = layout.allDocIds.byteLength;
1916
1980
  const allFreqsBytes = layout.allFreqs.byteLength;
@@ -2673,7 +2737,6 @@ function buildFlatPostingsFromSearchableMap(searchableMap, fieldCount, nextId, s
2673
2737
  fieldCount,
2674
2738
  termCount,
2675
2739
  nextId,
2676
- clampFrequencies: true,
2677
2740
  remapDocId,
2678
2741
  forEachPosting(ti, f, emit) {
2679
2742
  var _a;
@@ -2917,8 +2980,8 @@ async function zlibPayloadChoiceAsync(uncompressed) {
2917
2980
  return { payload: compressed, codec: CODEC_ZLIB, zstdLevel: 0 };
2918
2981
  }
2919
2982
  const autoSyncCompressors = {
2920
- zstd: (uncompressed) => zlib.zstdCompressSync(uncompressed, msv5ZstdCompressOptions(uncompressed)),
2921
- zlib: (uncompressed) => zlib.deflateSync(uncompressed),
2983
+ zstd: uncompressed => zlib.zstdCompressSync(uncompressed, msv5ZstdCompressOptions(uncompressed)),
2984
+ zlib: uncompressed => zlib.deflateSync(uncompressed),
2922
2985
  };
2923
2986
  const autoAsyncCompressors = {
2924
2987
  zstd: zstdCompressAsync,
@@ -3861,93 +3924,23 @@ class IncrementalPostingsAccumulator {
3861
3924
  const totalPostings = this._totalPostings;
3862
3925
  const maxFreq = this._maxFreq;
3863
3926
  const slots = this._slots;
3864
- const layout = choosePostingsLayout(fieldCount);
3865
- const docIdWidth = nextId <= 65535 ? 16 : 32;
3866
- const allDocIds = docIdWidth === 16
3867
- ? new Uint16Array(totalPostings)
3868
- : new Uint32Array(totalPostings);
3869
- const allFreqs = allocateFreqs(totalPostings, maxFreq);
3870
- if (layout === 'dense') {
3871
- const slotCount = termCount * fieldCount;
3872
- const denseOffsets = new Uint32Array(slotCount);
3873
- const denseLengths = new Uint32Array(slotCount);
3874
- let write = 0;
3875
- for (let ti = 0; ti < termCount; ti++) {
3876
- const base = ti * fieldCount;
3877
- for (let f = 0; f < fieldCount; f++) {
3878
- const slot = base + f;
3879
- const ranges = slots.get(slot);
3880
- const len = ranges == null ? 0 : this.slotLength(ranges);
3881
- denseOffsets[slot] = write;
3882
- denseLengths[slot] = len;
3883
- if (len > 0) {
3884
- write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
3885
- slots.delete(slot);
3886
- }
3887
- }
3888
- }
3889
- slots.clear();
3890
- this.clear();
3891
- return {
3892
- fieldCount,
3893
- termCount,
3894
- nextId,
3895
- layout,
3896
- docIdWidth,
3897
- sparseFieldIdWidth: null,
3898
- allDocIds,
3899
- allFreqs,
3900
- denseOffsets,
3901
- denseLengths,
3902
- sparseTermStarts: null,
3903
- sparseFieldIds: null,
3904
- sparseOffsets: null,
3905
- sparseLengths: null,
3906
- };
3907
- }
3908
- const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
3909
- const sparseFieldIdsScratch = [];
3910
- const sparseOffsets = [];
3911
- const sparseLengths = [];
3912
- const termStarts = new Array(termCount + 1).fill(0);
3913
- let write = 0;
3914
- for (let ti = 0; ti < termCount; ti++) {
3915
- termStarts[ti] = sparseFieldIdsScratch.length;
3916
- for (let f = 0; f < fieldCount; f++) {
3927
+ const layout = buildFrozenPostingsLayout(fieldCount, termCount, nextId, totalPostings, maxFreq, {
3928
+ nonEmptySlots: slots.size,
3929
+ slotLength: (ti, f) => {
3930
+ const ranges = slots.get(ti * fieldCount + f);
3931
+ return ranges == null ? 0 : this.slotLength(ranges);
3932
+ },
3933
+ writeSlot: (ti, f, write, targets) => {
3917
3934
  const slot = ti * fieldCount + f;
3918
3935
  const ranges = slots.get(slot);
3919
- const len = ranges == null ? 0 : this.slotLength(ranges);
3920
- if (len === 0)
3921
- continue;
3922
- sparseFieldIdsScratch.push(f);
3923
- sparseOffsets.push(write);
3924
- sparseLengths.push(len);
3925
- write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
3936
+ const next = this.copySlot(ranges, targets.allDocIds, targets.allFreqs, write, targets.docIdWidth);
3926
3937
  slots.delete(slot);
3927
- }
3928
- termStarts[ti + 1] = sparseFieldIdsScratch.length;
3929
- }
3938
+ return next;
3939
+ },
3940
+ });
3930
3941
  slots.clear();
3931
3942
  this.clear();
3932
- const sparseFieldIds = sparseFieldIdWidth === 16
3933
- ? new Uint16Array(sparseFieldIdsScratch)
3934
- : new Uint8Array(sparseFieldIdsScratch);
3935
- return {
3936
- fieldCount,
3937
- termCount,
3938
- nextId,
3939
- layout,
3940
- docIdWidth,
3941
- sparseFieldIdWidth,
3942
- allDocIds,
3943
- allFreqs,
3944
- denseOffsets: null,
3945
- denseLengths: null,
3946
- sparseTermStarts: new Uint32Array(termStarts),
3947
- sparseFieldIds,
3948
- sparseOffsets: new Uint32Array(sparseOffsets),
3949
- sparseLengths: new Uint32Array(sparseLengths),
3950
- };
3943
+ return layout;
3951
3944
  }
3952
3945
  }
3953
3946
 
@@ -4127,22 +4120,6 @@ function buildFrozenParamsFromDocuments(documents, options) {
4127
4120
  return builder.freezeParams();
4128
4121
  }
4129
4122
 
4130
- /**
4131
- * Internal AND / AND_NOT gate thresholds (not exported from the public package entry).
4132
- */
4133
- const DEFAULT_AND_GATE_LIMITS = {
4134
- maxAbsolute: 5000,
4135
- maxFraction: 0.1,
4136
- };
4137
- function resolveGateMaxSize(documentCount, limits = DEFAULT_AND_GATE_LIMITS) {
4138
- return Math.min(limits.maxAbsolute, Math.max(100, Math.floor(documentCount * limits.maxFraction)));
4139
- }
4140
- function gateIsSelectiveEnough(gateSize, documentCount, limits = DEFAULT_AND_GATE_LIMITS) {
4141
- if (gateSize === 0)
4142
- return true;
4143
- return gateSize <= resolveGateMaxSize(documentCount, limits);
4144
- }
4145
-
4146
4123
  function useGatedEvaluation(run, branchCount, operator, hasWildcard) {
4147
4124
  return shouldUseGatedEvaluation(branchCount, operator, hasWildcard);
4148
4125
  }
@@ -4219,12 +4196,11 @@ function normalizeStringQuery(query, searchOptions, params) {
4219
4196
  function lazyIndexedTerm(indexView, termIndex) {
4220
4197
  return { kind: 'lazy', resolve: () => indexView.resolveTermByIndex(termIndex) };
4221
4198
  }
4222
- function visitQuerySpecForScoring(query, normalized, params, visit) {
4199
+ function forEachQuerySpecTermRef(query, normalized, params, visit) {
4223
4200
  const { indexView } = params;
4224
- const { fuzzyWeight, options, prefixWeight } = normalized;
4201
+ const { options } = normalized;
4225
4202
  const maxDistance = maxFuzzyDistance(query, options.maxFuzzy);
4226
- const exactTi = indexView.resolveTermIndex(query.term);
4227
- visit(exactTi == null ? undefined : indexView.fieldTermData(exactTi), query.term, 1);
4203
+ visit({ kind: 'exact', termIndex: indexView.resolveTermIndex(query.term) });
4228
4204
  const seenPrefix = query.prefix && maxDistance ? new Set() : undefined;
4229
4205
  if (query.prefix) {
4230
4206
  for (const { termIndex, length } of indexView.getPrefixMatchesByIndex(query.term)) {
@@ -4232,7 +4208,7 @@ function visitQuerySpecForScoring(query, normalized, params, visit) {
4232
4208
  if (!distance)
4233
4209
  continue;
4234
4210
  seenPrefix === null || seenPrefix === void 0 ? void 0 : seenPrefix.add(termIndex);
4235
- visit(indexView.fieldTermData(termIndex), lazyIndexedTerm(indexView, termIndex), prefixWeight * length / (length + 0.3 * distance));
4211
+ visit({ kind: 'prefix', termIndex, length, distance });
4236
4212
  }
4237
4213
  }
4238
4214
  if (!maxDistance)
@@ -4240,9 +4216,24 @@ function visitQuerySpecForScoring(query, normalized, params, visit) {
4240
4216
  for (const { termIndex, length, distance } of indexView.getFuzzyMatchesByIndex(query.term, maxDistance)) {
4241
4217
  if (!distance || (seenPrefix === null || seenPrefix === void 0 ? void 0 : seenPrefix.has(termIndex)))
4242
4218
  continue;
4243
- visit(indexView.fieldTermData(termIndex), lazyIndexedTerm(indexView, termIndex), fuzzyWeight * length / (length + distance));
4219
+ visit({ kind: 'fuzzy', termIndex, length, distance });
4244
4220
  }
4245
4221
  }
4222
/**
 * Enumerates the term references of a query spec and calls
 * `visit(fieldTermData, term, weight)` for each one.
 *
 *  - exact: weight 1, the literal query term, and `undefined` data when the
 *    term is not in the index;
 *  - prefix: weight `prefixWeight * length / (length + 0.3 * distance)`;
 *  - any other kind (fuzzy): weight `fuzzyWeight * length / (length + distance)`.
 *
 * Prefix and fuzzy matches pass a lazily resolved term.
 */
function visitQuerySpecForScoring(query, normalized, params, visit) {
    const { indexView } = params;
    const { fuzzyWeight, prefixWeight } = normalized;
    forEachQuerySpecTermRef(query, normalized, params, (ref) => {
        const { termIndex } = ref;
        switch (ref.kind) {
            case 'exact': {
                const data = termIndex == null ? undefined : indexView.fieldTermData(termIndex);
                visit(data, query.term, 1);
                break;
            }
            case 'prefix': {
                const weight = prefixWeight * ref.length / (ref.length + 0.3 * ref.distance);
                visit(indexView.fieldTermData(termIndex), lazyIndexedTerm(indexView, termIndex), weight);
                break;
            }
            default: {
                const weight = fuzzyWeight * ref.length / (ref.length + ref.distance);
                visit(indexView.fieldTermData(termIndex), lazyIndexedTerm(indexView, termIndex), weight);
            }
        }
    });
}
4246
4237
  function executeQuerySpecInternal(query, normalized, params, allowedDocs) {
4247
4238
  const { fieldBoosts, options } = normalized;
4248
4239
  const termOptions = allowedDocs == null ? undefined : { allowedDocs };
@@ -4252,32 +4243,73 @@ function executeQuerySpecInternal(query, normalized, params, allowedDocs) {
4252
4243
  });
4253
4244
  return results;
4254
4245
  }
4255
- function collectDocIdsForQuerySpec(query, normalized, params, allowedDocs) {
4256
- const { fieldBoosts, options } = normalized;
4257
- const docIds = new Set();
4258
- const { indexView, aggregateContext } = params;
4259
- const maxDistance = maxFuzzyDistance(query, options.maxFuzzy);
4260
- const exactTi = indexView.resolveTermIndex(query.term);
4261
- if (exactTi != null) {
4262
- indexView.collectDocIds(exactTi, fieldBoosts, aggregateContext, docIds, allowedDocs);
4246
+ function maxPostingLengthForFieldTermData(data, fieldBoosts, fieldIds) {
4247
+ if (data == null)
4248
+ return 0;
4249
+ let maxLen = 0;
4250
+ for (const field of fieldBoosts.names) {
4251
+ const fieldId = fieldIds[field];
4252
+ const postingList = data.get(fieldId);
4253
+ if (postingList == null)
4254
+ continue;
4255
+ const len = postingList instanceof SegmentPostingList ? postingList.length : postingList.size;
4256
+ if (len > maxLen)
4257
+ maxLen = len;
4263
4258
  }
4264
- const seenPrefix = query.prefix && maxDistance ? new Set() : undefined;
4265
- if (query.prefix) {
4266
- for (const { termIndex, length } of indexView.getPrefixMatchesByIndex(query.term)) {
4267
- const distance = length - query.term.length;
4268
- if (!distance)
4269
- continue;
4270
- seenPrefix === null || seenPrefix === void 0 ? void 0 : seenPrefix.add(termIndex);
4271
- indexView.collectDocIds(termIndex, fieldBoosts, aggregateContext, docIds, allowedDocs);
4259
+ return maxLen;
4260
+ }
4261
/**
 * Returns the longest posting list, over the boosted fields, among all term
 * references of a query spec. An exact term that is not in the index
 * contributes nothing.
 *
 * @returns {number} The maximum posting length found, or 0 when there is none.
 */
function estimateMaxPostingLengthForQuerySpec(query, normalized, params) {
    const { indexView, aggregateContext } = params;
    const { fieldIds } = aggregateContext;
    const { fieldBoosts } = normalized;
    let longest = 0;
    forEachQuerySpecTermRef(query, normalized, params, (ref) => {
        if (ref.kind === 'exact' && ref.termIndex == null) {
            return;
        }
        const data = indexView.fieldTermData(ref.termIndex);
        longest = Math.max(longest, maxPostingLengthForFieldTermData(data, fieldBoosts, fieldIds));
    });
    return longest;
}
4279
+ function estimateMaxPostingLengthForQuery(query, searchOptions, params) {
4280
+ if (isWildcardQuery(query)) {
4281
+ return params.aggregateContext.documentCount;
4273
4282
  }
4274
- if (maxDistance) {
4275
- for (const { termIndex, distance } of indexView.getFuzzyMatchesByIndex(query.term, maxDistance)) {
4276
- if (!distance || (seenPrefix === null || seenPrefix === void 0 ? void 0 : seenPrefix.has(termIndex)))
4277
- continue;
4278
- indexView.collectDocIds(termIndex, fieldBoosts, aggregateContext, docIds, allowedDocs);
4283
+ if (isQueryCombination(query)) {
4284
+ const options = { ...searchOptions, ...query, queries: undefined };
4285
+ let maxLen = 0;
4286
+ for (const branch of query.queries) {
4287
+ maxLen = Math.max(maxLen, estimateMaxPostingLengthForQuery(branch, options, params));
4279
4288
  }
4289
+ return maxLen;
4290
+ }
4291
+ if (typeof query !== 'string')
4292
+ return 0;
4293
+ const normalized = normalizeStringQuery(query, searchOptions, params);
4294
+ let maxLen = 0;
4295
+ for (const spec of normalized.specs) {
4296
+ maxLen = Math.max(maxLen, estimateMaxPostingLengthForQuerySpec(spec, normalized, params));
4280
4297
  }
4298
+ return maxLen;
4299
+ }
4300
/**
 * Collects, without scoring, the doc ids matched by every term reference of a
 * query spec into a fresh `Set`. An exact term that is not in the index is
 * skipped; `allowedDocs` is forwarded to `indexView.collectDocIds`.
 *
 * @returns {Set} The collected doc ids.
 */
function collectDocIdsForQuerySpec(query, normalized, params, allowedDocs) {
    const { indexView, aggregateContext } = params;
    const { fieldBoosts } = normalized;
    const docIds = new Set();
    forEachQuerySpecTermRef(query, normalized, params, (ref) => {
        const missingExact = ref.kind === 'exact' && ref.termIndex == null;
        if (missingExact) {
            return;
        }
        indexView.collectDocIds(ref.termIndex, fieldBoosts, aggregateContext, docIds, allowedDocs);
    });
    return docIds;
}
4283
4315
  function intersectDocIdsInPlace(docIds, branchDocIds) {
@@ -4327,7 +4359,8 @@ function collectCombinedDocIds(branches, operator, collectBranch, allowedDocs) {
4327
4359
  * AND_NOT: score the positive branch only; negated branches are collected as docId sets and
4328
4360
  * subtracted without scoring (avoids term materialization on excluded branches).
4329
4361
  */
4330
- function executeCombinedBranches(branches, operator, params, executeBranch, collectBranch, allowedDocs, run) {
4362
+ function executeCombinedBranches(branches, operator, params, executeBranch, collectBranch, allowedDocs, run, estimateBranchPostingLength) {
4363
+ var _a;
4331
4364
  if (branches.length === 0)
4332
4365
  return new Map();
4333
4366
  const op = operator.toLowerCase();
@@ -4339,8 +4372,16 @@ function executeCombinedBranches(branches, operator, params, executeBranch, coll
4339
4372
  if (op === 'and') {
4340
4373
  const limits = void 0 ;
4341
4374
  const documentCount = params.aggregateContext.documentCount;
4375
+ const postingGatePolicy = (_a = void 0 ) !== null && _a !== void 0 ? _a : DEFAULT_POSTING_GATE_POLICY;
4376
+ const maxGateSize = resolveGateMaxSize(documentCount, limits);
4342
4377
  for (let i = 1; i < branches.length; i++) {
4343
- const selective = gateIsSelectiveEnough(gate.size, documentCount, limits);
4378
+ if (gate.size === 0)
4379
+ return result;
4380
+ const ratioPath = gate.size > maxGateSize;
4381
+ const postingListLength = ratioPath
4382
+ ? estimateBranchPostingLength === null || estimateBranchPostingLength === void 0 ? void 0 : estimateBranchPostingLength(branches[i])
4383
+ : undefined;
4384
+ const selective = gateIsSelectiveEnough(gate.size, documentCount, limits, postingListLength, postingGatePolicy);
4344
4385
  const branchAllowed = selective ? gate : allowedDocs;
4345
4386
  result = combineResults([result, executeBranch(branches[i], branchAllowed)], AND);
4346
4387
  gate = docIdsFromResult(result);
@@ -4434,7 +4475,7 @@ function executeQueryInternal(query, searchOptions, params, allowedDocs, run) {
4434
4475
  const options = { ...searchOptions, ...query, queries: undefined };
4435
4476
  const operator = ((_b = (_a = query.combineWith) !== null && _a !== void 0 ? _a : options.combineWith) !== null && _b !== void 0 ? _b : params.globalSearchOptions.combineWith);
4436
4477
  if (useGatedEvaluation(run, query.queries.length, operator, combinationHasWildcard(query))) {
4437
- return executeCombinedBranches(query.queries, operator, params, (branch, branchAllowed) => executeQueryInternal(branch, options, params, branchAllowed, run), (branch, branchAllowed) => collectDocIdsForQueryInternal(branch, options, params, branchAllowed), allowedDocs);
4478
+ return executeCombinedBranches(query.queries, operator, params, (branch, branchAllowed) => executeQueryInternal(branch, options, params, branchAllowed, run), (branch, branchAllowed) => collectDocIdsForQueryInternal(branch, options, params, branchAllowed), allowedDocs, run, branch => estimateMaxPostingLengthForQuery(branch, options, params));
4438
4479
  }
4439
4480
  const results = query.queries.map(subquery => executeQueryInternal(subquery, options, params, allowedDocs, run));
4440
4481
  return combineResults(results, operator);
@@ -4446,7 +4487,7 @@ function executeQueryInternal(query, searchOptions, params, allowedDocs, run) {
4446
4487
  const { specs, operator } = normalized;
4447
4488
  const combineWith = (operator !== null && operator !== void 0 ? operator : params.globalSearchOptions.combineWith);
4448
4489
  if (useGatedEvaluation(run, specs.length, combineWith, false)) {
4449
- return executeCombinedBranches(specs, combineWith, params, (spec, branchAllowed) => executeQuerySpecInternal(spec, normalized, params, branchAllowed), (spec, branchAllowed) => collectDocIdsForQuerySpec(spec, normalized, params, branchAllowed), allowedDocs);
4490
+ return executeCombinedBranches(specs, combineWith, params, (spec, branchAllowed) => executeQuerySpecInternal(spec, normalized, params, branchAllowed), (spec, branchAllowed) => collectDocIdsForQuerySpec(spec, normalized, params, branchAllowed), allowedDocs, run, spec => estimateMaxPostingLengthForQuerySpec(spec, normalized, params));
4450
4491
  }
4451
4492
  const results = specs.map(spec => executeQuerySpecInternal(spec, normalized, params, allowedDocs));
4452
4493
  return combineResults(results, combineWith);