@yoch/frozenminisearch 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +22 -0
- package/README.md +43 -99
- package/dist/cjs/index.cjs +419 -263
- package/dist/es/index.js +419 -263
- package/package.json +2 -1
package/dist/cjs/index.cjs
CHANGED
|
@@ -4,10 +4,82 @@ Object.defineProperty(exports, '__esModule', { value: true });
|
|
|
4
4
|
|
|
5
5
|
var zlib = require('node:zlib');
|
|
6
6
|
|
|
7
|
+
/**
 * Internal AND / AND_NOT gate thresholds (not exported from the public package entry).
 *
 * The ratio test only applies to posting lists with at least `minLength` entries,
 * and accepts a gate no larger than `postingListLength >>> ratioShift`.
 */
const DEFAULT_POSTING_GATE_MIN_LENGTH = 2048;
const DEFAULT_POSTING_GATE_RATIO_SHIFT = 2;
const DEFAULT_POSTING_GATE_POLICY = {
    minLength: DEFAULT_POSTING_GATE_MIN_LENGTH,
    ratioShift: DEFAULT_POSTING_GATE_RATIO_SHIFT,
};
/**
 * Decides whether a gate is small enough relative to a posting list.
 *
 * @param {number} gateSize - Number of documents in the gate.
 * @param {number} postingListLength - Number of entries in the posting list.
 * @param {{minLength: number, ratioShift: number}} [policy] - Thresholds; defaults to DEFAULT_POSTING_GATE_POLICY.
 * @returns {boolean} false for lists shorter than `policy.minLength`; otherwise whether
 *   `gateSize` is at most the list length shifted right by `policy.ratioShift`.
 */
function passGateByPostingRatio(gateSize, postingListLength, policy = DEFAULT_POSTING_GATE_POLICY) {
    const listTooShort = postingListLength < policy.minLength;
    return listTooShort
        ? false
        : gateSize <= (postingListLength >>> policy.ratioShift);
}
|
|
21
|
+
/** Default bounds for how large an AND gate may grow before it stops counting as selective. */
const DEFAULT_AND_GATE_LIMITS = {
    maxAbsolute: 5000,
    maxFraction: 0.1,
};
/**
 * Computes the largest gate size accepted for a corpus of `documentCount` documents.
 *
 * @param {number} documentCount - Total number of documents.
 * @param {{maxAbsolute: number, maxFraction: number}} [limits] - Defaults to DEFAULT_AND_GATE_LIMITS.
 * @returns {number} `floor(documentCount * maxFraction)`, raised to at least 100 and then
 *   capped at `maxAbsolute`.
 */
function resolveGateMaxSize(documentCount, limits = DEFAULT_AND_GATE_LIMITS) {
    const ceiling = limits.maxAbsolute;
    const proportional = Math.floor(documentCount * limits.maxFraction);
    const atLeastHundred = Math.max(100, proportional);
    return Math.min(ceiling, atLeastHundred);
}
|
|
28
|
+
/**
 * Tells whether a gate of `gateSize` documents counts as selective.
 *
 * A gate is selective when it is empty, when it fits within resolveGateMaxSize for the
 * corpus, or when a positive `postingListLength` is supplied and the gate passes
 * passGateByPostingRatio against it.
 *
 * @param {number} gateSize - Number of documents in the gate.
 * @param {number} documentCount - Total number of documents.
 * @param {{maxAbsolute: number, maxFraction: number}} [limits] - Absolute/fractional bounds.
 * @param {number} [postingListLength] - Length of the posting list the gate would filter.
 * @param {{minLength: number, ratioShift: number}} [postingGatePolicy] - Ratio-test thresholds.
 * @returns {boolean}
 */
function gateIsSelectiveEnough(gateSize, documentCount, limits = DEFAULT_AND_GATE_LIMITS, postingListLength, postingGatePolicy = DEFAULT_POSTING_GATE_POLICY) {
    return gateSize === 0
        || gateSize <= resolveGateMaxSize(documentCount, limits)
        || (postingListLength != null
            && postingListLength > 0
            && passGateByPostingRatio(gateSize, postingListLength, postingGatePolicy));
}
|
|
40
|
+
/**
 * True when the posting list is strictly longer than the gate, i.e. filtering by the gate
 * as allowedDocs can skip documents compared with scanning the full branch posting.
 */
function gateFilterShrinksScan(gateSize, postingListLength) {
    const listIsLonger = postingListLength > gateSize;
    return listIsLonger;
}
|
|
44
|
+
/**
 * Whether to hand the AND gate to the next branch as allowedDocs (perf only; scores unchanged if false).
 * Distinct from gateIsSelectiveEnough: a selective gate may still be too large to filter a short posting.
 *
 * @param {boolean} selective - Result of the selectivity check for this gate.
 * @param {number} gateSize - Number of documents in the gate.
 * @param {number} [postingListLength] - Length of the next branch's posting list.
 * @returns {boolean} false for a non-selective or empty gate, or a missing/non-positive
 *   posting length; otherwise the result of gateFilterShrinksScan.
 */
function shouldPassGateAsAllowedDocs(selective, gateSize, postingListLength) {
    const gateUsable = selective && gateSize !== 0;
    if (!gateUsable)
        return false;
    const lengthMissing = postingListLength == null || postingListLength <= 0;
    return lengthMissing ? false : gateFilterShrinksScan(gateSize, postingListLength);
}
|
|
55
|
+
|
|
7
56
|
// 65535 is the largest unsigned 16-bit value; presumably the cap applied to stored term frequencies — TODO confirm against the clamp helper.
const MAX_FREQ = 65535;
|
|
8
57
|
/** Reads the document id stored at `index` of the `docIds` array. */
function readDocId(docIds, index) {
    const docId = docIds[index];
    return docId;
}
|
|
60
|
+
/**
 * Binary search for `docId` inside the segment `docIds[offset .. offset + length - 1]`,
 * which must be sorted in ascending order.
 *
 * @param {ArrayLike<number>} docIds - Backing array holding the segment.
 * @param {number} offset - Index of the segment's first entry in `docIds`.
 * @param {number} length - Number of entries in the segment.
 * @param {number} docId - Document id to locate.
 * @returns {number} Global index into `docIds` (not segment-relative), or -1 when absent.
 */
function findDocIndexInSortedSegment(docIds, offset, length, docId) {
    let low = 0;
    let high = length - 1;
    while (low <= high) {
        const middle = (low + high) >>> 1;
        const candidate = docIds[offset + middle];
        if (candidate > docId) {
            high = middle - 1;
        }
        else if (candidate < docId) {
            low = middle + 1;
        }
        else {
            return offset + middle;
        }
    }
    return -1;
}
|
|
76
|
+
/**
 * Chooses between scanning a posting list and binary-searching it once `allowedDocs`
 * is already in effect (scoring layer).
 * Currently delegates to {@link passGateByPostingRatio} with its default policy, but is
 * kept as a separate decision point.
 *
 * @param {number} gateSize - Number of allowed documents.
 * @param {number} listLength - Length of the posting list.
 * @returns {boolean} true when seeking is preferred over scanning.
 */
function shouldSeekAllowedDocs(gateSize, listLength) {
    const preferSeek = passGateByPostingRatio(gateSize, listLength);
    return preferSeek;
}
|
|
11
83
|
function allocateFreqs(length, maxValue) {
|
|
12
84
|
if (maxValue <= 0xff)
|
|
13
85
|
return new Uint8Array(length);
|
|
@@ -69,10 +141,15 @@ function bm25FieldConstants(bm25params, avgFieldLength) {
|
|
|
69
141
|
const { k, b, d } = bm25params;
|
|
70
142
|
return { k, d, k1: k + 1, oneMinusB: 1 - b, bOverAvg: b / avgFieldLength };
|
|
71
143
|
}
|
|
72
|
-
function
|
|
144
|
+
/**
 * Inverse-document-frequency factor used by the BM25 score.
 *
 * @param {number} matchingCount - Number of documents containing the term.
 * @param {number} totalCount - Total number of documents.
 * @returns {number} `ln(1 + (totalCount - matchingCount + 0.5) / (matchingCount + 0.5))`.
 */
function bm25Idf(matchingCount, totalCount) {
    const nonMatching = totalCount - matchingCount + 0.5;
    const matching = matchingCount + 0.5;
    return Math.log(1 + nonMatching / matching);
}
|
|
147
|
+
/**
 * Term-frequency part of the BM25 score, multiplied by an already-computed `idf`.
 *
 * @param {number} termFreq - Occurrences of the term in the field.
 * @param {number} fieldLength - Length of the field for this document.
 * @param {{k: number, d: number, k1: number, oneMinusB: number, bOverAvg: number}} constants -
 *   Per-field constants with the shape returned by bm25FieldConstants.
 * @param {number} idf - Inverse-document-frequency factor.
 * @returns {number} `idf * (d + termFreq * k1 / (termFreq + k * (oneMinusB + bOverAvg * fieldLength)))`.
 */
function calcBm25TfWithConstants(termFreq, fieldLength, constants, idf) {
    const { k, d, k1, oneMinusB, bOverAvg } = constants;
    const lengthNorm = oneMinusB + bOverAvg * fieldLength;
    const saturation = termFreq * k1 / (termFreq + k * lengthNorm);
    return idf * (d + saturation);
}
|
|
151
|
+
/**
 * Full BM25 score for one term in one field: computes the IDF from the counts with
 * bm25Idf, then applies calcBm25TfWithConstants.
 *
 * @param {number} termFreq - Occurrences of the term in the field.
 * @param {number} matchingCount - Number of documents containing the term.
 * @param {number} totalCount - Total number of documents.
 * @param {number} fieldLength - Length of the field for this document.
 * @param {object} constants - Per-field constants passed through to calcBm25TfWithConstants.
 * @returns {number} The BM25 score.
 */
function calcBM25ScoreWithConstants(termFreq, matchingCount, totalCount, fieldLength, constants) {
    const idf = bm25Idf(matchingCount, totalCount);
    return calcBm25TfWithConstants(termFreq, fieldLength, constants, idf);
}
|
|
77
154
|
/** Returns `object[property]` only when it is an own property; inherited or missing keys yield undefined. */
const getOwnProperty = (object, property) => {
    const isOwn = Object.prototype.hasOwnProperty.call(object, property);
    return isOwn ? object[property] : undefined;
};
|
|
78
155
|
function fieldBoostsForQuery(options, fields) {
|
|
@@ -101,7 +178,7 @@ function getDerivedTerm(derivedTerm, cache) {
|
|
|
101
178
|
cache.value = derivedTerm.resolve();
|
|
102
179
|
return cache.value;
|
|
103
180
|
}
|
|
104
|
-
function scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache) {
|
|
181
|
+
function scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache, hoistedIdf) {
|
|
105
182
|
const resolvedDerivedTerm = getDerivedTerm(derivedTerm, derivedTermCache);
|
|
106
183
|
const docBoost = boostDocumentFn
|
|
107
184
|
? boostDocumentFn(context.getExternalId(docId), resolvedDerivedTerm, context.getStoredFields(docId))
|
|
@@ -109,7 +186,9 @@ function scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFre
|
|
|
109
186
|
if (!docBoost)
|
|
110
187
|
return;
|
|
111
188
|
const fieldLength = context.getFieldLength(docId, fieldId);
|
|
112
|
-
const rawScore =
|
|
189
|
+
const rawScore = hoistedIdf !== undefined
|
|
190
|
+
? calcBm25TfWithConstants(termFreq, fieldLength, bm25, hoistedIdf)
|
|
191
|
+
: calcBM25ScoreWithConstants(termFreq, matchingFields, context.documentCount, fieldLength, bm25);
|
|
113
192
|
const weightedScore = termWeight * termBoost * fieldBoost * docBoost * rawScore;
|
|
114
193
|
const result = results.get(docId);
|
|
115
194
|
if (result) {
|
|
@@ -132,22 +211,39 @@ function scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFre
|
|
|
132
211
|
}
|
|
133
212
|
}
|
|
134
213
|
/**
 * Scores the documents of one segment posting list (one term in one field) into `results`.
 *
 * Traversal strategy:
 *  - seek: when `allowedDocs` is given and shouldSeekAllowedDocs accepts its size against the
 *    list length, iterate the allowed ids and binary-search each one in the segment;
 *  - scan: otherwise walk every posting of the segment, skipping ids not in `allowedDocs`.
 *
 * In both strategies an inactive document is reported through `context.onInactiveDoc` (when
 * defined) and subtracted from the matching count only if it is actually a posting of this
 * list. The seek path therefore looks the document up before consulting `isDocActive`, so
 * an inactive id that merely appears in `allowedDocs` is not reported against this term/field.
 *
 * The IDF is computed once up front when the context has no `isDocActive` hook; otherwise
 * scorePostingDoc derives it from the running `matchingFields` value.
 *
 * @param {*} sourceTerm - Forwarded to scorePostingDoc.
 * @param {*} derivedTerm - Forwarded to scorePostingDoc; resolved via getDerivedTerm for onInactiveDoc.
 * @param {number} termWeight - Forwarded to scorePostingDoc.
 * @param {number} termBoost - Forwarded to scorePostingDoc.
 * @param {string} field - Forwarded to scorePostingDoc.
 * @param {number} fieldId - Field index; selects `context.avgFieldLength[fieldId]`.
 * @param {number} fieldBoost - Forwarded to scorePostingDoc.
 * @param {{docIds: ArrayLike<number>, freqs: ArrayLike<number>, offset: number, length: number}} list -
 *   Segment view; the seek path binary-searches `docIds`, so it must be sorted within the segment.
 * @param {object} context - Supplies documentCount, avgFieldLength, optional isDocActive / onInactiveDoc.
 * @param {Function} [boostDocumentFn] - Forwarded to scorePostingDoc.
 * @param {object} bm25params - BM25 parameters passed to bm25FieldConstants.
 * @param {*} results - Accumulator forwarded to scorePostingDoc.
 * @param {{size: number, has: Function}} [allowedDocs] - Iterable gate restricting which docs are scored.
 * @returns {number} `list.length` minus the number of inactive postings encountered.
 */
function aggregateSegmentPostingList(sourceTerm, derivedTerm, termWeight, termBoost, field, fieldId, fieldBoost, list, context, boostDocumentFn, bm25params, results, allowedDocs) {
    let matchingFields = list.length;
    const bm25 = bm25FieldConstants(bm25params, context.avgFieldLength[fieldId]);
    // Without an activity hook the matching count never changes, so the IDF is loop-invariant.
    const hoistedIdf = context.isDocActive == null
        ? bm25Idf(matchingFields, context.documentCount)
        : undefined;
    const { docIds, freqs, offset, length } = list;
    const derivedTermCache = {};
    if (allowedDocs != null && shouldSeekAllowedDocs(allowedDocs.size, length)) {
        for (const docId of allowedDocs) {
            // Look the doc up first: only postings of THIS list may be reported as inactive.
            const index = findDocIndexInSortedSegment(docIds, offset, length, docId);
            if (index < 0)
                continue;
            if (context.isDocActive != null && !context.isDocActive(docId)) {
                if (context.onInactiveDoc != null)
                    context.onInactiveDoc(docId, fieldId, getDerivedTerm(derivedTerm, derivedTermCache));
                matchingFields -= 1;
                continue;
            }
            scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, freqs[index], termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache, hoistedIdf);
        }
        return matchingFields;
    }
    for (let i = 0; i < length; i++) {
        const docId = readDocId(docIds, offset + i);
        const termFreq = freqs[offset + i];
        if (context.isDocActive != null && !context.isDocActive(docId)) {
            if (context.onInactiveDoc != null)
                context.onInactiveDoc(docId, fieldId, getDerivedTerm(derivedTerm, derivedTermCache));
            matchingFields -= 1;
            continue;
        }
        if (allowedDocs != null && !allowedDocs.has(docId))
            continue;
        scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache, hoistedIdf);
    }
    return matchingFields;
}
|
|
@@ -167,6 +263,9 @@ function aggregateTerm(sourceTerm, derivedTerm, termWeight, termBoost, fieldTerm
|
|
|
167
263
|
}
|
|
168
264
|
let matchingFields = postingList.size;
|
|
169
265
|
const bm25 = bm25FieldConstants(bm25params, context.avgFieldLength[fieldId]);
|
|
266
|
+
const hoistedIdf = context.isDocActive == null
|
|
267
|
+
? bm25Idf(matchingFields, context.documentCount)
|
|
268
|
+
: undefined;
|
|
170
269
|
const derivedTermCache = {};
|
|
171
270
|
postingList.forEachDoc((docId, termFreq) => {
|
|
172
271
|
var _a;
|
|
@@ -177,7 +276,7 @@ function aggregateTerm(sourceTerm, derivedTerm, termWeight, termBoost, fieldTerm
|
|
|
177
276
|
}
|
|
178
277
|
if (allowedDocs != null && !allowedDocs.has(docId))
|
|
179
278
|
return;
|
|
180
|
-
scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache);
|
|
279
|
+
scorePostingDoc(sourceTerm, derivedTerm, field, fieldId, docId, termFreq, termWeight, termBoost, fieldBoost, matchingFields, context, boostDocumentFn, bm25, results, derivedTermCache, hoistedIdf);
|
|
181
280
|
});
|
|
182
281
|
}
|
|
183
282
|
return results;
|
|
@@ -1746,82 +1845,47 @@ function readFieldLengthMatrixSection(buf, flags, cellCount) {
|
|
|
1746
1845
|
|
|
1747
1846
|
// Sentinel doc id: a posting whose (optionally remapped) doc id equals this value is skipped when postings are materialized.
const DISCARDED_DOC_ID = 0xffffffff;
|
|
1748
1847
|
function postingFreqValue(freq, clampFrequencies) {
|
|
1749
|
-
return
|
|
1750
|
-
}
|
|
1751
|
-
function materializeFlatPostings(params) {
|
|
1752
|
-
const { fieldCount, termCount, forEachPosting, remapDocId, clampFrequencies } = params;
|
|
1753
|
-
const slotCount = termCount * fieldCount;
|
|
1754
|
-
const postingsOffsets = new Uint32Array(slotCount);
|
|
1755
|
-
const postingsLengths = new Uint32Array(slotCount);
|
|
1756
|
-
let totalPostings = 0;
|
|
1757
|
-
let maxFreq = 0;
|
|
1758
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1759
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1760
|
-
forEachPosting(ti, f, (rawDocId, freq) => {
|
|
1761
|
-
const docId = remapDocId != null ? remapDocId(rawDocId) : rawDocId;
|
|
1762
|
-
if (docId === DISCARDED_DOC_ID)
|
|
1763
|
-
return;
|
|
1764
|
-
totalPostings++;
|
|
1765
|
-
const v = postingFreqValue(freq, clampFrequencies);
|
|
1766
|
-
if (v > maxFreq)
|
|
1767
|
-
maxFreq = v;
|
|
1768
|
-
});
|
|
1769
|
-
}
|
|
1770
|
-
}
|
|
1771
|
-
const useUint16 = params.nextId != null && params.nextId <= 65535;
|
|
1772
|
-
const allDocIds = useUint16
|
|
1773
|
-
? new Uint16Array(totalPostings)
|
|
1774
|
-
: new Uint32Array(totalPostings);
|
|
1775
|
-
const allFreqs = allocateFreqs(totalPostings, maxFreq);
|
|
1776
|
-
// Slots are visited in ascending fieldId (0..fieldCount-1) per term. Sparse layouts
|
|
1777
|
-
// rely on this ordering so field ids per term stay sorted for binary lookup.
|
|
1778
|
-
let write = 0;
|
|
1779
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1780
|
-
const base = ti * fieldCount;
|
|
1781
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
1782
|
-
const offset = write;
|
|
1783
|
-
let count = 0;
|
|
1784
|
-
forEachPosting(ti, f, (rawDocId, freq) => {
|
|
1785
|
-
const docId = remapDocId != null ? remapDocId(rawDocId) : rawDocId;
|
|
1786
|
-
if (docId === DISCARDED_DOC_ID)
|
|
1787
|
-
return;
|
|
1788
|
-
if (useUint16) {
|
|
1789
|
-
allDocIds[write] = docId;
|
|
1790
|
-
}
|
|
1791
|
-
else {
|
|
1792
|
-
allDocIds[write] = docId;
|
|
1793
|
-
}
|
|
1794
|
-
allFreqs[write] = postingFreqValue(freq, clampFrequencies);
|
|
1795
|
-
write++;
|
|
1796
|
-
count++;
|
|
1797
|
-
});
|
|
1798
|
-
postingsOffsets[base + f] = offset;
|
|
1799
|
-
postingsLengths[base + f] = count;
|
|
1800
|
-
}
|
|
1801
|
-
}
|
|
1802
|
-
return {
|
|
1803
|
-
postingsOffsets,
|
|
1804
|
-
postingsLengths,
|
|
1805
|
-
allDocIds,
|
|
1806
|
-
allFreqs,
|
|
1807
|
-
};
|
|
1848
|
+
return clampFreq(freq) ;
|
|
1808
1849
|
}
|
|
1809
1850
|
|
|
1810
1851
|
/** Reads the field id stored at `index` of the `fieldIds` array. */
function readFieldId(fieldIds, index) {
    const fieldId = fieldIds[index];
    return fieldId;
}
|
|
1813
|
-
function choosePostingsLayout(fieldCount) {
|
|
1814
|
-
return fieldCount === 1 ? 'dense' : 'sparse';
|
|
1815
|
-
}
|
|
1816
1854
|
/**
 * Bit width used to store field ids in the sparse layout.
 *
 * @param {number} fieldCount - Number of indexed fields.
 * @returns {8|16} 16 when there are more than 255 fields, otherwise 8.
 */
function chooseSparseFieldIdWidth(fieldCount) {
    if (fieldCount > 255) {
        return 16;
    }
    return 8;
}
|
|
1819
|
-
function
|
|
1820
|
-
const
|
|
1821
|
-
const
|
|
1857
|
+
/**
 * Picks the postings layout with the smaller estimated index size.
 *
 * Dense is costed at 8 bytes per (term, field) slot. Sparse is costed at 4 bytes for each of
 * `termCount + 1` term starts plus, per non-empty slot, one field id (1 or 2 bytes depending
 * on chooseSparseFieldIdWidth) and 8 more bytes. Ties go to dense.
 *
 * @param {number} fieldCount - Number of indexed fields.
 * @param {number} termCount - Number of terms.
 * @param {number} nonEmptySlots - Number of (term, field) slots holding at least one posting.
 * @returns {'dense'|'sparse'}
 */
function choosePostingsLayout(fieldCount, termCount, nonEmptySlots) {
    const denseCost = termCount * fieldCount * 8;
    const fieldIdBytes = chooseSparseFieldIdWidth(fieldCount) === 16 ? 2 : 1;
    const termStartsCost = (termCount + 1) * 4;
    const sparseCost = termStartsCost + nonEmptySlots * (fieldIdBytes + 8);
    if (denseCost <= sparseCost) {
        return 'dense';
    }
    return 'sparse';
}
|
|
1863
|
+
/** Shared dense/sparse layout emission; callers supply per-slot length and copy. */
|
|
1864
|
+
function buildFrozenPostingsLayout(fieldCount, termCount, nextId, totalPostings, maxFreq, source) {
|
|
1865
|
+
const layout = choosePostingsLayout(fieldCount, termCount, source.nonEmptySlots);
|
|
1822
1866
|
const docIdWidth = nextId <= 65535 ? 16 : 32;
|
|
1867
|
+
const allDocIds = docIdWidth === 16
|
|
1868
|
+
? new Uint16Array(totalPostings)
|
|
1869
|
+
: new Uint32Array(totalPostings);
|
|
1870
|
+
const allFreqs = allocateFreqs(totalPostings, maxFreq);
|
|
1871
|
+
const targets = { allDocIds, allFreqs, docIdWidth };
|
|
1823
1872
|
if (layout === 'dense') {
|
|
1824
|
-
const
|
|
1873
|
+
const slotCount = termCount * fieldCount;
|
|
1874
|
+
const denseOffsets = new Uint32Array(slotCount);
|
|
1875
|
+
const denseLengths = new Uint32Array(slotCount);
|
|
1876
|
+
let write = 0;
|
|
1877
|
+
for (let ti = 0; ti < termCount; ti++) {
|
|
1878
|
+
const base = ti * fieldCount;
|
|
1879
|
+
for (let f = 0; f < fieldCount; f++) {
|
|
1880
|
+
const slot = base + f;
|
|
1881
|
+
const len = source.slotLength(ti, f);
|
|
1882
|
+
denseOffsets[slot] = write;
|
|
1883
|
+
denseLengths[slot] = len;
|
|
1884
|
+
if (len > 0) {
|
|
1885
|
+
write = source.writeSlot(ti, f, write, targets);
|
|
1886
|
+
}
|
|
1887
|
+
}
|
|
1888
|
+
}
|
|
1825
1889
|
return {
|
|
1826
1890
|
fieldCount,
|
|
1827
1891
|
termCount,
|
|
@@ -1829,10 +1893,10 @@ function materializeFrozenPostings(params) {
|
|
|
1829
1893
|
layout,
|
|
1830
1894
|
docIdWidth,
|
|
1831
1895
|
sparseFieldIdWidth: null,
|
|
1832
|
-
allDocIds
|
|
1833
|
-
allFreqs
|
|
1834
|
-
denseOffsets
|
|
1835
|
-
denseLengths
|
|
1896
|
+
allDocIds,
|
|
1897
|
+
allFreqs,
|
|
1898
|
+
denseOffsets,
|
|
1899
|
+
denseLengths,
|
|
1836
1900
|
sparseTermStarts: null,
|
|
1837
1901
|
sparseFieldIds: null,
|
|
1838
1902
|
sparseOffsets: null,
|
|
@@ -1844,60 +1908,23 @@ function materializeFrozenPostings(params) {
|
|
|
1844
1908
|
const sparseOffsets = [];
|
|
1845
1909
|
const sparseLengths = [];
|
|
1846
1910
|
const termStarts = new Array(termCount + 1).fill(0);
|
|
1847
|
-
|
|
1848
|
-
// Non-empty slots per term are emitted with fieldId in ascending order (f loops 0..fieldCount-1).
|
|
1849
|
-
let totalPostings = 0;
|
|
1850
|
-
let maxFreq = 0;
|
|
1911
|
+
let write = 0;
|
|
1851
1912
|
for (let ti = 0; ti < termCount; ti++) {
|
|
1852
1913
|
termStarts[ti] = sparseFieldIdsScratch.length;
|
|
1853
1914
|
for (let f = 0; f < fieldCount; f++) {
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
const docId = remapDocId != null ? remapDocId(rawDocId) : rawDocId;
|
|
1857
|
-
if (docId === DISCARDED_DOC_ID)
|
|
1858
|
-
return;
|
|
1859
|
-
count++;
|
|
1860
|
-
const v = postingFreqValue(freq, clampFrequencies);
|
|
1861
|
-
if (v > maxFreq)
|
|
1862
|
-
maxFreq = v;
|
|
1863
|
-
});
|
|
1864
|
-
if (count === 0)
|
|
1915
|
+
const len = source.slotLength(ti, f);
|
|
1916
|
+
if (len === 0)
|
|
1865
1917
|
continue;
|
|
1866
1918
|
sparseFieldIdsScratch.push(f);
|
|
1867
|
-
sparseOffsets.push(
|
|
1868
|
-
sparseLengths.push(
|
|
1869
|
-
|
|
1919
|
+
sparseOffsets.push(write);
|
|
1920
|
+
sparseLengths.push(len);
|
|
1921
|
+
write = source.writeSlot(ti, f, write, targets);
|
|
1870
1922
|
}
|
|
1871
1923
|
termStarts[ti + 1] = sparseFieldIdsScratch.length;
|
|
1872
1924
|
}
|
|
1873
|
-
const allDocIds = docIdWidth === 16
|
|
1874
|
-
? new Uint16Array(totalPostings)
|
|
1875
|
-
: new Uint32Array(totalPostings);
|
|
1876
|
-
const allFreqs = allocateFreqs(totalPostings, maxFreq);
|
|
1877
1925
|
const sparseFieldIds = sparseFieldIdWidth === 16
|
|
1878
1926
|
? new Uint16Array(sparseFieldIdsScratch)
|
|
1879
1927
|
: new Uint8Array(sparseFieldIdsScratch);
|
|
1880
|
-
let write = 0;
|
|
1881
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
1882
|
-
const start = termStarts[ti];
|
|
1883
|
-
const end = termStarts[ti + 1];
|
|
1884
|
-
for (let s = start; s < end; s++) {
|
|
1885
|
-
const f = readFieldId(sparseFieldIds, s);
|
|
1886
|
-
forEachPosting(ti, f, (rawDocId, freq) => {
|
|
1887
|
-
const docId = remapDocId != null ? remapDocId(rawDocId) : rawDocId;
|
|
1888
|
-
if (docId === DISCARDED_DOC_ID)
|
|
1889
|
-
return;
|
|
1890
|
-
if (docIdWidth === 16) {
|
|
1891
|
-
allDocIds[write] = docId;
|
|
1892
|
-
}
|
|
1893
|
-
else {
|
|
1894
|
-
allDocIds[write] = docId;
|
|
1895
|
-
}
|
|
1896
|
-
allFreqs[write] = postingFreqValue(freq, clampFrequencies);
|
|
1897
|
-
write++;
|
|
1898
|
-
});
|
|
1899
|
-
}
|
|
1900
|
-
}
|
|
1901
1928
|
return {
|
|
1902
1929
|
fieldCount,
|
|
1903
1930
|
termCount,
|
|
@@ -1915,6 +1942,58 @@ function materializeFrozenPostings(params) {
|
|
|
1915
1942
|
sparseLengths: new Uint32Array(sparseLengths),
|
|
1916
1943
|
};
|
|
1917
1944
|
}
|
|
1945
|
+
/**
 * Builds the frozen postings layout from a posting enumerator in two passes.
 *
 * Pass 1 enumerates every (term, field) slot to count surviving postings per slot, the total
 * posting count, the number of non-empty slots and the largest stored frequency value.
 * Pass 2 happens inside buildFrozenPostingsLayout, which calls back into `writeSlot` to
 * re-enumerate a slot and copy its postings into the output arrays.
 *
 * Postings whose doc id (after the optional `remapDocId`) equals DISCARDED_DOC_ID are
 * dropped in both passes.
 *
 * @param {object} params
 * @param {number} params.fieldCount - Number of fields.
 * @param {number} params.termCount - Number of terms.
 * @param {number} params.nextId - Forwarded to buildFrozenPostingsLayout.
 * @param {Function} params.forEachPosting - `(termIndex, fieldId, emit)`; calls `emit(rawDocId, freq)` per posting.
 * @param {Function} [params.remapDocId] - Optional mapping applied to each raw doc id.
 * @returns {object} The layout returned by buildFrozenPostingsLayout.
 */
function materializeFrozenPostings(params) {
    const { fieldCount, termCount, nextId, forEachPosting, remapDocId } = params;
    const resolveDocId = (rawDocId) => (remapDocId != null ? remapDocId(rawDocId) : rawDocId);
    const slotLengths = new Uint32Array(termCount * fieldCount);
    let totalPostings = 0;
    let maxFreq = 0;
    let nonEmptySlots = 0;
    for (let ti = 0; ti < termCount; ti++) {
        const base = ti * fieldCount;
        for (let f = 0; f < fieldCount; f++) {
            let kept = 0;
            forEachPosting(ti, f, (rawDocId, freq) => {
                if (resolveDocId(rawDocId) === DISCARDED_DOC_ID)
                    return;
                kept++;
                const stored = postingFreqValue(freq);
                if (stored > maxFreq)
                    maxFreq = stored;
            });
            if (kept > 0) {
                slotLengths[base + f] = kept;
                totalPostings += kept;
                nonEmptySlots++;
            }
        }
    }
    const source = {
        nonEmptySlots,
        slotLength: (ti, f) => slotLengths[ti * fieldCount + f],
        // Copies one slot's surviving postings starting at `write`; returns the next free index.
        writeSlot: (ti, f, write, targets) => {
            const { allDocIds: outDocIds, allFreqs: outFreqs } = targets;
            let cursor = write;
            forEachPosting(ti, f, (rawDocId, freq) => {
                const docId = resolveDocId(rawDocId);
                if (docId === DISCARDED_DOC_ID)
                    return;
                outDocIds[cursor] = docId;
                outFreqs[cursor] = postingFreqValue(freq);
                cursor++;
            });
            return cursor;
        },
    };
    return buildFrozenPostingsLayout(fieldCount, termCount, nextId, totalPostings, maxFreq, source);
}
|
|
1918
1997
|
function postingsTypedBytes(layout) {
|
|
1919
1998
|
const allDocIdsBytes = layout.allDocIds.byteLength;
|
|
1920
1999
|
const allFreqsBytes = layout.allFreqs.byteLength;
|
|
@@ -2074,6 +2153,16 @@ function createFrozenFieldTermFlyweight(layout) {
|
|
|
2074
2153
|
return flyweight;
|
|
2075
2154
|
}
|
|
2076
2155
|
function collectDocIdsFromFrozenSegment(allDocIds, offset, length, context, docIds, allowedDocs) {
|
|
2156
|
+
if (allowedDocs != null && shouldSeekAllowedDocs(allowedDocs.size, length)) {
|
|
2157
|
+
for (const docId of allowedDocs) {
|
|
2158
|
+
if (context.isDocActive != null && !context.isDocActive(docId))
|
|
2159
|
+
continue;
|
|
2160
|
+
if (findDocIndexInSortedSegment(allDocIds, offset, length, docId) >= 0) {
|
|
2161
|
+
docIds.add(docId);
|
|
2162
|
+
}
|
|
2163
|
+
}
|
|
2164
|
+
return;
|
|
2165
|
+
}
|
|
2077
2166
|
for (let i = 0; i < length; i++) {
|
|
2078
2167
|
const docId = readDocId(allDocIds, offset + i);
|
|
2079
2168
|
if (context.isDocActive != null && !context.isDocActive(docId))
|
|
@@ -2677,7 +2766,6 @@ function buildFlatPostingsFromSearchableMap(searchableMap, fieldCount, nextId, s
|
|
|
2677
2766
|
fieldCount,
|
|
2678
2767
|
termCount,
|
|
2679
2768
|
nextId,
|
|
2680
|
-
clampFrequencies: true,
|
|
2681
2769
|
remapDocId,
|
|
2682
2770
|
forEachPosting(ti, f, emit) {
|
|
2683
2771
|
var _a;
|
|
@@ -2921,8 +3009,8 @@ async function zlibPayloadChoiceAsync(uncompressed) {
|
|
|
2921
3009
|
return { payload: compressed, codec: CODEC_ZLIB, zstdLevel: 0 };
|
|
2922
3010
|
}
|
|
2923
3011
|
/** Synchronous compressors keyed by codec name; each takes the uncompressed bytes and returns the compressed bytes. */
const autoSyncCompressors = {
    zstd: (uncompressed) => {
        return zlib.zstdCompressSync(uncompressed, msv5ZstdCompressOptions(uncompressed));
    },
    zlib: (uncompressed) => {
        return zlib.deflateSync(uncompressed);
    },
};
|
|
2927
3015
|
const autoAsyncCompressors = {
|
|
2928
3016
|
zstd: zstdCompressAsync,
|
|
@@ -3865,93 +3953,23 @@ class IncrementalPostingsAccumulator {
|
|
|
3865
3953
|
const totalPostings = this._totalPostings;
|
|
3866
3954
|
const maxFreq = this._maxFreq;
|
|
3867
3955
|
const slots = this._slots;
|
|
3868
|
-
const layout =
|
|
3869
|
-
|
|
3870
|
-
|
|
3871
|
-
|
|
3872
|
-
|
|
3873
|
-
|
|
3874
|
-
|
|
3875
|
-
const slotCount = termCount * fieldCount;
|
|
3876
|
-
const denseOffsets = new Uint32Array(slotCount);
|
|
3877
|
-
const denseLengths = new Uint32Array(slotCount);
|
|
3878
|
-
let write = 0;
|
|
3879
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
3880
|
-
const base = ti * fieldCount;
|
|
3881
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
3882
|
-
const slot = base + f;
|
|
3883
|
-
const ranges = slots.get(slot);
|
|
3884
|
-
const len = ranges == null ? 0 : this.slotLength(ranges);
|
|
3885
|
-
denseOffsets[slot] = write;
|
|
3886
|
-
denseLengths[slot] = len;
|
|
3887
|
-
if (len > 0) {
|
|
3888
|
-
write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
|
|
3889
|
-
slots.delete(slot);
|
|
3890
|
-
}
|
|
3891
|
-
}
|
|
3892
|
-
}
|
|
3893
|
-
slots.clear();
|
|
3894
|
-
this.clear();
|
|
3895
|
-
return {
|
|
3896
|
-
fieldCount,
|
|
3897
|
-
termCount,
|
|
3898
|
-
nextId,
|
|
3899
|
-
layout,
|
|
3900
|
-
docIdWidth,
|
|
3901
|
-
sparseFieldIdWidth: null,
|
|
3902
|
-
allDocIds,
|
|
3903
|
-
allFreqs,
|
|
3904
|
-
denseOffsets,
|
|
3905
|
-
denseLengths,
|
|
3906
|
-
sparseTermStarts: null,
|
|
3907
|
-
sparseFieldIds: null,
|
|
3908
|
-
sparseOffsets: null,
|
|
3909
|
-
sparseLengths: null,
|
|
3910
|
-
};
|
|
3911
|
-
}
|
|
3912
|
-
const sparseFieldIdWidth = chooseSparseFieldIdWidth(fieldCount);
|
|
3913
|
-
const sparseFieldIdsScratch = [];
|
|
3914
|
-
const sparseOffsets = [];
|
|
3915
|
-
const sparseLengths = [];
|
|
3916
|
-
const termStarts = new Array(termCount + 1).fill(0);
|
|
3917
|
-
let write = 0;
|
|
3918
|
-
for (let ti = 0; ti < termCount; ti++) {
|
|
3919
|
-
termStarts[ti] = sparseFieldIdsScratch.length;
|
|
3920
|
-
for (let f = 0; f < fieldCount; f++) {
|
|
3956
|
+
const layout = buildFrozenPostingsLayout(fieldCount, termCount, nextId, totalPostings, maxFreq, {
|
|
3957
|
+
nonEmptySlots: slots.size,
|
|
3958
|
+
slotLength: (ti, f) => {
|
|
3959
|
+
const ranges = slots.get(ti * fieldCount + f);
|
|
3960
|
+
return ranges == null ? 0 : this.slotLength(ranges);
|
|
3961
|
+
},
|
|
3962
|
+
writeSlot: (ti, f, write, targets) => {
|
|
3921
3963
|
const slot = ti * fieldCount + f;
|
|
3922
3964
|
const ranges = slots.get(slot);
|
|
3923
|
-
const
|
|
3924
|
-
if (len === 0)
|
|
3925
|
-
continue;
|
|
3926
|
-
sparseFieldIdsScratch.push(f);
|
|
3927
|
-
sparseOffsets.push(write);
|
|
3928
|
-
sparseLengths.push(len);
|
|
3929
|
-
write = this.copySlot(ranges, allDocIds, allFreqs, write, docIdWidth);
|
|
3965
|
+
const next = this.copySlot(ranges, targets.allDocIds, targets.allFreqs, write, targets.docIdWidth);
|
|
3930
3966
|
slots.delete(slot);
|
|
3931
|
-
|
|
3932
|
-
|
|
3933
|
-
}
|
|
3967
|
+
return next;
|
|
3968
|
+
},
|
|
3969
|
+
});
|
|
3934
3970
|
slots.clear();
|
|
3935
3971
|
this.clear();
|
|
3936
|
-
|
|
3937
|
-
? new Uint16Array(sparseFieldIdsScratch)
|
|
3938
|
-
: new Uint8Array(sparseFieldIdsScratch);
|
|
3939
|
-
return {
|
|
3940
|
-
fieldCount,
|
|
3941
|
-
termCount,
|
|
3942
|
-
nextId,
|
|
3943
|
-
layout,
|
|
3944
|
-
docIdWidth,
|
|
3945
|
-
sparseFieldIdWidth,
|
|
3946
|
-
allDocIds,
|
|
3947
|
-
allFreqs,
|
|
3948
|
-
denseOffsets: null,
|
|
3949
|
-
denseLengths: null,
|
|
3950
|
-
sparseTermStarts: new Uint32Array(termStarts),
|
|
3951
|
-
sparseFieldIds,
|
|
3952
|
-
sparseOffsets: new Uint32Array(sparseOffsets),
|
|
3953
|
-
sparseLengths: new Uint32Array(sparseLengths),
|
|
3954
|
-
};
|
|
3972
|
+
return layout;
|
|
3955
3973
|
}
|
|
3956
3974
|
}
|
|
3957
3975
|
|
|
@@ -4131,27 +4149,21 @@ function buildFrozenParamsFromDocuments(documents, options) {
|
|
|
4131
4149
|
return builder.freezeParams();
|
|
4132
4150
|
}
|
|
4133
4151
|
|
|
4134
|
-
/**
|
|
4135
|
-
* Internal AND / AND_NOT gate thresholds (not exported from the public package entry).
|
|
4136
|
-
*/
|
|
4137
|
-
const DEFAULT_AND_GATE_LIMITS = {
|
|
4138
|
-
maxAbsolute: 5000,
|
|
4139
|
-
maxFraction: 0.1,
|
|
4140
|
-
};
|
|
4141
|
-
function resolveGateMaxSize(documentCount, limits = DEFAULT_AND_GATE_LIMITS) {
|
|
4142
|
-
return Math.min(limits.maxAbsolute, Math.max(100, Math.floor(documentCount * limits.maxFraction)));
|
|
4143
|
-
}
|
|
4144
|
-
function gateIsSelectiveEnough(gateSize, documentCount, limits = DEFAULT_AND_GATE_LIMITS) {
|
|
4145
|
-
if (gateSize === 0)
|
|
4146
|
-
return true;
|
|
4147
|
-
return gateSize <= resolveGateMaxSize(documentCount, limits);
|
|
4148
|
-
}
|
|
4149
|
-
|
|
4150
4152
|
/**
 * Thin wrapper around shouldUseGatedEvaluation.
 * The `run` argument is accepted but not consulted here.
 */
function useGatedEvaluation(run, branchCount, operator, hasWildcard) {
    const gated = shouldUseGatedEvaluation(branchCount, operator, hasWildcard);
    return gated;
}
|
|
4153
|
-
function
|
|
4154
|
-
return
|
|
4155
|
+
/**
 * Wraps a keyed result collection as a live, read-only gate.
 *
 * The gate exposes `size`, `has(docId)` and iteration over the result's keys; every access
 * is forwarded to `result`, so later changes to `result` are visible through the gate.
 *
 * @param {{size: number, has: Function, keys: Function}} result - Map-like collection keyed by doc id.
 * @returns {{size: number, has: Function}} Iterable gate view.
 */
function gateFromResult(result) {
    return {
        get size() {
            return result.size;
        },
        has: (docId) => result.has(docId),
        [Symbol.iterator]: () => result.keys(),
    };
}
|
|
4156
4168
|
function isQueryCombination(query) {
|
|
4157
4169
|
return typeof query === 'object'
|
|
@@ -4223,12 +4235,12 @@ function normalizeStringQuery(query, searchOptions, params) {
|
|
|
4223
4235
|
/**
 * Creates a deferred term reference: the term string is only looked up, via
 * `indexView.resolveTermByIndex(termIndex)`, when `resolve()` is called.
 *
 * @returns {{kind: 'lazy', resolve: Function}}
 */
function lazyIndexedTerm(indexView, termIndex) {
    const resolve = () => indexView.resolveTermByIndex(termIndex);
    return { kind: 'lazy', resolve };
}
|
|
4226
|
-
|
|
4238
|
+
// NOTE(review): presumably the minimum fraction at which the two-phase AND_NOT path is chosen — its use is not visible here; confirm at the call site.
const TWO_PHASE_AND_NOT_MIN_FRACTION = 0.5;
|
|
4239
|
+
function forEachQuerySpecTermRef(query, normalized, params, visit) {
|
|
4227
4240
|
const { indexView } = params;
|
|
4228
|
-
const {
|
|
4241
|
+
const { options } = normalized;
|
|
4229
4242
|
const maxDistance = maxFuzzyDistance(query, options.maxFuzzy);
|
|
4230
|
-
|
|
4231
|
-
visit(exactTi == null ? undefined : indexView.fieldTermData(exactTi), query.term, 1);
|
|
4243
|
+
visit({ kind: 'exact', termIndex: indexView.resolveTermIndex(query.term) });
|
|
4232
4244
|
const seenPrefix = query.prefix && maxDistance ? new Set() : undefined;
|
|
4233
4245
|
if (query.prefix) {
|
|
4234
4246
|
for (const { termIndex, length } of indexView.getPrefixMatchesByIndex(query.term)) {
|
|
@@ -4236,7 +4248,7 @@ function visitQuerySpecForScoring(query, normalized, params, visit) {
|
|
|
4236
4248
|
if (!distance)
|
|
4237
4249
|
continue;
|
|
4238
4250
|
seenPrefix === null || seenPrefix === void 0 ? void 0 : seenPrefix.add(termIndex);
|
|
4239
|
-
visit(
|
|
4251
|
+
visit({ kind: 'prefix', termIndex, length, distance });
|
|
4240
4252
|
}
|
|
4241
4253
|
}
|
|
4242
4254
|
if (!maxDistance)
|
|
@@ -4244,9 +4256,24 @@ function visitQuerySpecForScoring(query, normalized, params, visit) {
|
|
|
4244
4256
|
for (const { termIndex, length, distance } of indexView.getFuzzyMatchesByIndex(query.term, maxDistance)) {
|
|
4245
4257
|
if (!distance || (seenPrefix === null || seenPrefix === void 0 ? void 0 : seenPrefix.has(termIndex)))
|
|
4246
4258
|
continue;
|
|
4247
|
-
visit(
|
|
4259
|
+
visit({ kind: 'fuzzy', termIndex, length, distance });
|
|
4248
4260
|
}
|
|
4249
4261
|
}
|
|
4262
|
+
/**
 * Walks every term reference of a query spec (via forEachQuerySpecTermRef) and calls
 * `visit(fieldTermData, term, weight)` for each one:
 *  - exact: the query's own term with weight 1 (data is undefined when the term is not indexed);
 *  - prefix: a lazily resolved term weighted `prefixWeight * length / (length + 0.3 * distance)`;
 *  - fuzzy: a lazily resolved term weighted `fuzzyWeight * length / (length + distance)`.
 */
function visitQuerySpecForScoring(query, normalized, params, visit) {
    const { indexView } = params;
    const { fuzzyWeight, prefixWeight } = normalized;
    const scoreRef = (ref) => {
        switch (ref.kind) {
            case 'exact': {
                const data = ref.termIndex == null ? undefined : indexView.fieldTermData(ref.termIndex);
                visit(data, query.term, 1);
                break;
            }
            case 'prefix': {
                const data = indexView.fieldTermData(ref.termIndex);
                const term = lazyIndexedTerm(indexView, ref.termIndex);
                visit(data, term, prefixWeight * ref.length / (ref.length + 0.3 * ref.distance));
                break;
            }
            default: {
                const data = indexView.fieldTermData(ref.termIndex);
                const term = lazyIndexedTerm(indexView, ref.termIndex);
                visit(data, term, fuzzyWeight * ref.length / (ref.length + ref.distance));
            }
        }
    };
    forEachQuerySpecTermRef(query, normalized, params, scoreRef);
}
|
|
4250
4277
|
function executeQuerySpecInternal(query, normalized, params, allowedDocs) {
|
|
4251
4278
|
const { fieldBoosts, options } = normalized;
|
|
4252
4279
|
const termOptions = allowedDocs == null ? undefined : { allowedDocs };
|
|
@@ -4256,32 +4283,73 @@ function executeQuerySpecInternal(query, normalized, params, allowedDocs) {
|
|
|
4256
4283
|
});
|
|
4257
4284
|
return results;
|
|
4258
4285
|
}
|
|
4259
|
-
function
|
|
4260
|
-
|
|
4261
|
-
|
|
4262
|
-
|
|
4263
|
-
const
|
|
4264
|
-
|
|
4265
|
-
|
|
4266
|
-
|
|
4286
|
+
/**
 * Returns the largest posting-list length found in `data` across the searched fields.
 *
 * @param data - Per-field posting lists for one term, looked up with `data.get(fieldId)`;
 *   `null`/`undefined` yields `0`.
 * @param fieldBoosts - Object whose `names` iterable lists the fields to inspect.
 * @param fieldIds - Lookup from field name to the id used as key into `data`.
 * @returns The maximum length (`length` for `SegmentPostingList`, `size` otherwise), or `0`.
 */
function maxPostingLengthForFieldTermData(data, fieldBoosts, fieldIds) {
    let longest = 0;
    if (data == null) {
        return longest;
    }
    for (const name of fieldBoosts.names) {
        const postings = data.get(fieldIds[name]);
        if (postings != null) {
            const count = postings instanceof SegmentPostingList ? postings.length : postings.size;
            // Plain comparison (not Math.max) so a non-numeric count never replaces the maximum.
            longest = count > longest ? count : longest;
        }
    }
    return longest;
}
|
|
4301
|
+
/**
 * Estimates the longest posting list touched by one query spec: the maximum of
 * `maxPostingLengthForFieldTermData` over every term reference the spec expands to.
 *
 * @param query - Query spec to expand into term references.
 * @param normalized - Normalized query; `fieldBoosts` selects the fields inspected.
 * @param params - Execution params; reads `indexView` and `aggregateContext.fieldIds`.
 * @returns The largest posting length seen, or `0` when no reference has data.
 */
function estimateMaxPostingLengthForQuerySpec(query, normalized, params) {
    const view = params.indexView;
    const context = params.aggregateContext;
    const boosts = normalized.fieldBoosts;
    const ids = context.fieldIds;
    let longest = 0;
    forEachQuerySpecTermRef(query, normalized, params, ({ kind, termIndex }) => {
        // Only an exact ref may lack a term index; such a ref contributes nothing.
        if (kind === 'exact' && termIndex == null) {
            return;
        }
        const candidate = maxPostingLengthForFieldTermData(view.fieldTermData(termIndex), boosts, ids);
        longest = Math.max(longest, candidate);
    });
    return longest;
}
|
|
4319
|
+
/**
 * Estimates the longest posting list a query could touch.
 *
 * - Wildcard query: the whole collection (`params.aggregateContext.documentCount`).
 * - Query combination: the maximum over its branches, each evaluated with the
 *   combination's own options merged over `searchOptions` (`queries` cleared).
 * - String query: the maximum over its normalized specs.
 * - Anything else: `0`.
 *
 * @param query - Wildcard, combination object, or query string.
 * @param searchOptions - Options inherited from the enclosing query.
 * @param params - Execution params forwarded to the helpers.
 * @returns The estimated maximum posting length.
 */
function estimateMaxPostingLengthForQuery(query, searchOptions, params) {
    // Shared "maximum of a measure over an iterable, floor 0" used by both branches below.
    const largest = (items, measure) => {
        let best = 0;
        for (const item of items) {
            best = Math.max(best, measure(item));
        }
        return best;
    };
    if (isWildcardQuery(query)) {
        return params.aggregateContext.documentCount;
    }
    if (isQueryCombination(query)) {
        const branchOptions = { ...searchOptions, ...query, queries: undefined };
        return largest(query.queries, (branch) => estimateMaxPostingLengthForQuery(branch, branchOptions, params));
    }
    if (typeof query !== 'string') {
        return 0;
    }
    const normalized = normalizeStringQuery(query, searchOptions, params);
    return largest(normalized.specs, (spec) => estimateMaxPostingLengthForQuerySpec(spec, normalized, params));
}
|
|
4340
|
+
/**
 * Collects the ids of documents matched by one query spec, without scoring.
 *
 * Every term reference the spec expands to is passed to `indexView.collectDocIds`,
 * which fills a shared `Set`; exact references that carry no term index are skipped.
 *
 * @param query - Query spec to expand into term references.
 * @param normalized - Normalized query; supplies `fieldBoosts`.
 * @param params - Execution params; reads `indexView` and `aggregateContext`.
 * @param allowedDocs - Optional doc-id restriction forwarded to `collectDocIds`.
 * @returns A new `Set` holding the collected doc ids.
 */
function collectDocIdsForQuerySpec(query, normalized, params, allowedDocs) {
    const boosts = normalized.fieldBoosts;
    const matched = new Set();
    const view = params.indexView;
    const context = params.aggregateContext;
    forEachQuerySpecTermRef(query, normalized, params, ({ kind, termIndex }) => {
        const missingExactTerm = kind === 'exact' && termIndex == null;
        if (!missingExactTerm) {
            view.collectDocIds(termIndex, boosts, context, matched, allowedDocs);
        }
    });
    return matched;
}
|
|
4287
4355
|
function intersectDocIdsInPlace(docIds, branchDocIds) {
|
|
@@ -4298,6 +4366,70 @@ function subtractDocIdsFromResult(result, excludedDocIds) {
|
|
|
4298
4366
|
for (const docId of excludedDocIds)
|
|
4299
4367
|
result.delete(docId);
|
|
4300
4368
|
}
|
|
4369
|
+
/**
 * Pre-computes the estimated posting length of every branch, but only when
 * two-phase evaluation is permitted and an estimator is available.
 *
 * @param branches - Branches to estimate, in their original order.
 * @param allowTwoPhase - Falsy disables the computation entirely.
 * @param estimateBranchPostingLength - Estimator called with one branch; may be nullish.
 * @returns An array of estimates aligned with `branches`, or `undefined` when disabled.
 */
function twoPhasePostingLengths(branches, allowTwoPhase, estimateBranchPostingLength) {
    const enabled = Boolean(allowTwoPhase) && estimateBranchPostingLength != null;
    if (!enabled) {
        return undefined;
    }
    // Wrap the estimator so it receives the branch only, never the index.
    return Array.from(branches, (branch) => estimateBranchPostingLength(branch));
}
|
|
4378
|
+
/**
 * Decides whether an AND combination should collect a final doc-id gate first.
 *
 * The answer is `true` only when the first branch is broad (its estimated length,
 * capped by `allowedDocs.size` when a restriction exists, reaches
 * `DEFAULT_POSTING_GATE_MIN_LENGTH`) and some later branch is non-empty yet no longer
 * than that length shifted right by `DEFAULT_POSTING_GATE_RATIO_SHIFT`.
 *
 * @param branchPostingLengths - Estimated posting length per branch, in branch order.
 * @param allowedDocs - Optional doc-id restriction already in force.
 * @returns Whether the two-phase AND path applies.
 */
function shouldUseTwoPhaseAnd(branchPostingLengths, allowedDocs) {
    const [leading, ...others] = branchPostingLengths;
    if (others.length === 0) {
        return false;
    }
    let broadSide = leading;
    if (allowedDocs != null) {
        broadSide = Math.min(leading, allowedDocs.size);
    }
    if (broadSide < DEFAULT_POSTING_GATE_MIN_LENGTH) {
        return false;
    }
    const ceiling = broadSide >>> DEFAULT_POSTING_GATE_RATIO_SHIFT;
    return others.some((len) => len > 0 && len <= ceiling);
}
|
|
4395
|
+
/**
 * Decides whether an AND_NOT combination should collect the surviving doc ids
 * before scoring the positive branch.
 *
 * A branch counts as "large" when its estimate reaches the greater of
 * `DEFAULT_POSTING_GATE_MIN_LENGTH` and
 * `floor(documentCount * TWO_PHASE_AND_NOT_MIN_FRACTION)`. The answer is `true` only
 * when the first branch (capped by `allowedDocs.size` when a restriction exists) and
 * at least one later branch are both large.
 *
 * @param branchPostingLengths - Estimated posting length per branch, in branch order.
 * @param allowedDocs - Optional doc-id restriction already in force.
 * @param documentCount - Number of documents in the collection.
 * @returns Whether the two-phase AND_NOT path applies.
 */
function shouldUseTwoPhaseAndNot(branchPostingLengths, allowedDocs, documentCount) {
    const [positive, ...negated] = branchPostingLengths;
    if (negated.length === 0) {
        return false;
    }
    let positiveSize = positive;
    if (allowedDocs != null) {
        positiveSize = Math.min(positive, allowedDocs.size);
    }
    const fractionFloor = Math.floor(documentCount * TWO_PHASE_AND_NOT_MIN_FRACTION);
    const bigEnough = Math.max(DEFAULT_POSTING_GATE_MIN_LENGTH, fractionFloor);
    if (positiveSize < bigEnough) {
        return false;
    }
    return negated.some((len) => len >= bigEnough);
}
|
|
4411
|
+
/**
 * Scores an AND combination when the final set of surviving doc ids is already known.
 *
 * Each branch is executed in its original order with `finalGate` as its allowed-docs
 * restriction, and the results are folded together with `combineResults(..., AND)`.
 * Evaluation stops as soon as the accumulated result becomes empty.
 *
 * @param branches - Branches in original order; the first one is always executed
 *   unless the gate is empty.
 * @param finalGate - Doc ids that survive the whole AND; empty short-circuits to a new `Map`.
 * @param executeBranch - Scorer called as `executeBranch(branch, finalGate)`.
 * @returns The combined result.
 */
function executeAndWithFinalGate(branches, finalGate, executeBranch) {
    if (finalGate.size === 0) {
        return new Map();
    }
    const [head, ...tail] = branches;
    let scored = executeBranch(head, finalGate);
    for (const branch of tail) {
        if (scored.size === 0) {
            break;
        }
        const branchScores = executeBranch(branch, finalGate);
        scored = combineResults([scored, branchScores], AND);
    }
    return scored;
}
|
|
4422
|
+
/**
 * Collects the doc ids shared by all AND branches, visiting branches from the
 * smallest estimated posting length to the largest (ties keep original order).
 *
 * The first branch is collected under `allowedDocs`; every later branch is collected
 * under the ids that survived so far and then intersected into that same set.
 * Collection stops early once no ids remain.
 *
 * @param branches - Branches to intersect.
 * @param branchPostingLengths - Estimated posting length per branch, aligned with `branches`.
 * @param collectBranch - Collector called as `collectBranch(branch, allowedDocs)`; the set
 *   returned for the first visited branch is mutated and returned.
 * @param allowedDocs - Optional restriction applied to the first visited branch.
 * @returns The set of surviving doc ids.
 */
function collectAndDocIdsByEstimatedLength(branches, branchPostingLengths, collectBranch, allowedDocs) {
    const compareByLength = (left, right) => {
        const delta = branchPostingLengths[left] - branchPostingLengths[right];
        return delta || left - right;
    };
    const [smallest, ...remaining] = branches
        .map((_branch, position) => position)
        .sort(compareByLength);
    const survivors = collectBranch(branches[smallest], allowedDocs);
    for (const position of remaining) {
        if (survivors.size === 0) {
            break;
        }
        intersectDocIdsInPlace(survivors, collectBranch(branches[position], survivors));
    }
    return survivors;
}
|
|
4301
4433
|
function collectCombinedDocIds(branches, operator, collectBranch, allowedDocs) {
|
|
4302
4434
|
if (branches.length === 0)
|
|
4303
4435
|
return new Set();
|
|
@@ -4327,34 +4459,58 @@ function collectCombinedDocIds(branches, operator, collectBranch, allowedDocs) {
|
|
|
4327
4459
|
throw new Error(`Invalid combination operator: ${operator}`);
|
|
4328
4460
|
}
|
|
4329
4461
|
/**
|
|
4330
|
-
* AND: score
|
|
4462
|
+
* AND: normally score left-to-right with optional docId gates; for broad-first selective
|
|
4463
|
+
* exact queries, collect the final gate first, then score branches in original order.
|
|
4331
4464
|
* AND_NOT: score the positive branch only; negated branches are collected as docId sets and
|
|
4332
|
-
* subtracted without scoring
|
|
4465
|
+
* subtracted without scoring. Large exact exclusions may collect survivors before positive scoring.
|
|
4333
4466
|
*/
|
|
4334
|
-
function executeCombinedBranches(branches, operator, params, executeBranch, collectBranch, allowedDocs, run) {
|
|
4467
|
+
function executeCombinedBranches(branches, operator, params, executeBranch, collectBranch, allowedDocs, run, estimateBranchPostingLength, allowTwoPhase = false) {
|
|
4468
|
+
var _a;
|
|
4335
4469
|
if (branches.length === 0)
|
|
4336
4470
|
return new Map();
|
|
4337
4471
|
const op = operator.toLowerCase();
|
|
4338
4472
|
if (op === 'or') {
|
|
4339
4473
|
return combineResults(branches.map(branch => executeBranch(branch, allowedDocs)), operator);
|
|
4340
4474
|
}
|
|
4341
|
-
let result = executeBranch(branches[0], allowedDocs);
|
|
4342
|
-
let gate = docIdsFromResult(result);
|
|
4343
4475
|
if (op === 'and') {
|
|
4476
|
+
const branchPostingLengths = twoPhasePostingLengths(branches, allowTwoPhase, estimateBranchPostingLength);
|
|
4477
|
+
if (branchPostingLengths != null && shouldUseTwoPhaseAnd(branchPostingLengths, allowedDocs)) {
|
|
4478
|
+
const finalGate = collectAndDocIdsByEstimatedLength(branches, branchPostingLengths, collectBranch, allowedDocs);
|
|
4479
|
+
return executeAndWithFinalGate(branches, finalGate, executeBranch);
|
|
4480
|
+
}
|
|
4481
|
+
let result = executeBranch(branches[0], allowedDocs);
|
|
4482
|
+
let gate = gateFromResult(result);
|
|
4344
4483
|
const limits = void 0 ;
|
|
4345
4484
|
const documentCount = params.aggregateContext.documentCount;
|
|
4485
|
+
const postingGatePolicy = (_a = void 0 ) !== null && _a !== void 0 ? _a : DEFAULT_POSTING_GATE_POLICY;
|
|
4486
|
+
const maxGateSize = resolveGateMaxSize(documentCount, limits);
|
|
4346
4487
|
for (let i = 1; i < branches.length; i++) {
|
|
4347
|
-
|
|
4348
|
-
|
|
4488
|
+
if (gate.size === 0)
|
|
4489
|
+
return result;
|
|
4490
|
+
const absoluteSelective = gate.size <= maxGateSize;
|
|
4491
|
+
const postingListLength = absoluteSelective
|
|
4492
|
+
? undefined
|
|
4493
|
+
: estimateBranchPostingLength === null || estimateBranchPostingLength === void 0 ? void 0 : estimateBranchPostingLength(branches[i]);
|
|
4494
|
+
const selective = gateIsSelectiveEnough(gate.size, documentCount, limits, postingListLength, postingGatePolicy);
|
|
4495
|
+
const branchAllowed = absoluteSelective || shouldPassGateAsAllowedDocs(selective, gate.size, postingListLength)
|
|
4496
|
+
? gate
|
|
4497
|
+
: allowedDocs;
|
|
4349
4498
|
result = combineResults([result, executeBranch(branches[i], branchAllowed)], AND);
|
|
4350
|
-
gate =
|
|
4499
|
+
gate = gateFromResult(result);
|
|
4351
4500
|
}
|
|
4352
4501
|
return result;
|
|
4353
4502
|
}
|
|
4354
4503
|
if (op === 'and_not') {
|
|
4504
|
+
const branchPostingLengths = twoPhasePostingLengths(branches, allowTwoPhase, estimateBranchPostingLength);
|
|
4505
|
+
if (branchPostingLengths != null && shouldUseTwoPhaseAndNot(branchPostingLengths, allowedDocs, params.aggregateContext.documentCount)) {
|
|
4506
|
+
const finalGate = collectCombinedDocIds(branches, operator, collectBranch, allowedDocs);
|
|
4507
|
+
return finalGate.size === 0 ? new Map() : executeBranch(branches[0], finalGate);
|
|
4508
|
+
}
|
|
4509
|
+
const result = executeBranch(branches[0], allowedDocs);
|
|
4510
|
+
let gate = gateFromResult(result);
|
|
4355
4511
|
for (let i = 1; i < branches.length; i++) {
|
|
4356
4512
|
subtractDocIdsFromResult(result, collectBranch(branches[i], gate));
|
|
4357
|
-
gate =
|
|
4513
|
+
gate = gateFromResult(result);
|
|
4358
4514
|
}
|
|
4359
4515
|
return result;
|
|
4360
4516
|
}
|
|
@@ -4438,7 +4594,7 @@ function executeQueryInternal(query, searchOptions, params, allowedDocs, run) {
|
|
|
4438
4594
|
const options = { ...searchOptions, ...query, queries: undefined };
|
|
4439
4595
|
const operator = ((_b = (_a = query.combineWith) !== null && _a !== void 0 ? _a : options.combineWith) !== null && _b !== void 0 ? _b : params.globalSearchOptions.combineWith);
|
|
4440
4596
|
if (useGatedEvaluation(run, query.queries.length, operator, combinationHasWildcard(query))) {
|
|
4441
|
-
return executeCombinedBranches(query.queries, operator, params, (branch, branchAllowed) => executeQueryInternal(branch, options, params, branchAllowed, run), (branch, branchAllowed) => collectDocIdsForQueryInternal(branch, options, params, branchAllowed), allowedDocs);
|
|
4597
|
+
return executeCombinedBranches(query.queries, operator, params, (branch, branchAllowed) => executeQueryInternal(branch, options, params, branchAllowed, run), (branch, branchAllowed) => collectDocIdsForQueryInternal(branch, options, params, branchAllowed), allowedDocs, run, branch => estimateMaxPostingLengthForQuery(branch, options, params));
|
|
4442
4598
|
}
|
|
4443
4599
|
const results = query.queries.map(subquery => executeQueryInternal(subquery, options, params, allowedDocs, run));
|
|
4444
4600
|
return combineResults(results, operator);
|
|
@@ -4450,7 +4606,7 @@ function executeQueryInternal(query, searchOptions, params, allowedDocs, run) {
|
|
|
4450
4606
|
const { specs, operator } = normalized;
|
|
4451
4607
|
const combineWith = (operator !== null && operator !== void 0 ? operator : params.globalSearchOptions.combineWith);
|
|
4452
4608
|
if (useGatedEvaluation(run, specs.length, combineWith, false)) {
|
|
4453
|
-
return executeCombinedBranches(specs, combineWith, params, (spec, branchAllowed) => executeQuerySpecInternal(spec, normalized, params, branchAllowed), (spec, branchAllowed) => collectDocIdsForQuerySpec(spec, normalized, params, branchAllowed), allowedDocs);
|
|
4609
|
+
return executeCombinedBranches(specs, combineWith, params, (spec, branchAllowed) => executeQuerySpecInternal(spec, normalized, params, branchAllowed), (spec, branchAllowed) => collectDocIdsForQuerySpec(spec, normalized, params, branchAllowed), allowedDocs, run, spec => estimateMaxPostingLengthForQuerySpec(spec, normalized, params), specs.every(spec => !spec.prefix && !spec.fuzzy));
|
|
4454
4610
|
}
|
|
4455
4611
|
const results = specs.map(spec => executeQuerySpecInternal(spec, normalized, params, allowedDocs));
|
|
4456
4612
|
return combineResults(results, combineWith);
|