@yoch/minisearch 8.0.0-beta.2 → 8.0.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,20 @@
2
2
 
3
3
  `MiniSearch` follows [semantic versioning](https://semver.org/spec/v2.0.0.html).
4
4
 
5
+ ## v8.0.0-beta.3
6
+
7
+ Incremental frozen index construction without a temporary `documents[]` array.
8
+
9
+ - Add `FrozenIndexBuilder` and `createFrozenIndexBuilder(options, hints?)` with `.add(doc)`
10
+ and optional `estimatedDocumentCount` pre-sizing
11
+ - Add `freezeFrozenIndexBuilder(builder)` to finalize into `FrozenMiniSearch` (avoids a
12
+ circular import between build and assembly modules)
13
+ - Add `FrozenMiniSearch.fromAsyncIterable(iterable, options)` for async document streams
14
+ (e.g. CSV parsers)
15
+ - Refactor `buildFrozenParamsFromDocuments` to use the builder internally (same output)
16
+ - Trim per-document arrays when `estimatedDocumentCount` exceeds the actual document count
17
+ - Export `FrozenIndexBuilderHints` type
18
+
5
19
  ## v8.0.0-beta.2
6
20
 
7
21
  Consolidated beta on npm. Supersedes `8.0.0-beta.0` and `8.0.0-beta.1` (unpublished).
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  **In-memory full-text search for Node.js** — a fork of [MiniSearch](https://github.com/lucaong/minisearch) by [Luca Ongaro](https://github.com/lucaong), extended for **production serving**: smaller indexes, faster loads, and a read-only fast path.
4
4
 
5
- > **Current release:** `8.0.0-beta.2` · install with `npm install @yoch/minisearch`
5
+ > **Current release:** `8.0.0-beta.3` · install with `npm install @yoch/minisearch`
6
6
 
7
7
  ---
8
8
 
@@ -74,17 +74,73 @@ const { FrozenMiniSearch } = require('@yoch/minisearch')
74
74
  |------|-----|
75
75
  | Live index that changes over time | `MiniSearch` → `freeze()` when you need read-only serving |
76
76
  | Fixed corpus, build frozen directly | **`FrozenMiniSearch.fromDocuments(documents, options)`** |
77
+ | Build doc-by-doc (no `documents[]` buffer) | **`createFrozenIndexBuilder(options)`** → `.add(doc)` → **`freezeFrozenIndexBuilder(builder)`** |
78
+ | Async stream of documents | **`FrozenMiniSearch.fromAsyncIterable(iterable, options)`** |
77
79
  | Load a snapshot from disk | `FrozenMiniSearch.loadBinary(buffer, options)` |
78
80
  | Custom assembly pipeline | `buildFrozenFromDocuments`, `assembleFrozen`, `freezeFromMiniSearch` |
79
81
 
80
82
  `fromDocuments` matches `new MiniSearch(opts).addAll(docs).freeze()` for search ranking on the same corpus and options (`fields`, `tokenize`, `processTerm`, …). Frozen indexes do not support `add` / `remove`.
81
83
 
84
+ **External corpus (e.g. lookup by id after search):** keep full rows in your own store (`dataCache`, DB, etc.) and use minimal `storeFields` (often `['id']` only) so the frozen index does not duplicate payload text:
85
+
86
+ ```javascript
87
+ import { createFrozenIndexBuilder, freezeFrozenIndexBuilder } from '@yoch/minisearch'
88
+
89
+ function buildFrozenIndexFromRows (rows, options) {
90
+ const builder = createFrozenIndexBuilder(options, {
91
+ estimatedDocumentCount: rows.length
92
+ })
93
+ for (let i = 0; i < rows.length; i++) {
94
+ builder.add(buildIndexDocument(rows[i], i))
95
+ }
96
+ return freezeFrozenIndexBuilder(builder)
97
+ }
98
+
99
+ // After search: enrich from your store — frozen.getStoredFields(res.id) or dataCache[type][res.id]
100
+ ```
101
+
102
+ **Async stream** (no intermediate array; documents are indexed as they arrive):
103
+
104
+ ```javascript
105
+ import { createReadStream } from 'node:fs'
106
+ import { parse } from 'csv-parse'
107
+ import { FrozenMiniSearch } from '@yoch/minisearch'
108
+
109
+ async function buildFromCsv (path, options) {
110
+ async function * documents () {
111
+ const parser = createReadStream(path).pipe(parse({ columns: true }))
112
+ for await (const row of parser) {
113
+ yield { id: row.cis, denomination: row.denomination, /* … */ }
114
+ }
115
+ }
116
+ return FrozenMiniSearch.fromAsyncIterable(documents(), options)
117
+ }
118
+ ```
119
+
120
+ For a **sync** iterable (`for...of` on an array or generator), use the builder directly:
121
+
122
+ ```javascript
123
+ import { createFrozenIndexBuilder, freezeFrozenIndexBuilder } from '@yoch/minisearch'
124
+
125
+ const builder = createFrozenIndexBuilder(options)
126
+ for (const doc of documentGenerator()) {
127
+ builder.add(doc)
128
+ }
129
+ const frozen = freezeFrozenIndexBuilder(builder)
130
+ ```
131
+
132
+ `estimatedDocumentCount` in the second argument to `createFrozenIndexBuilder` pre-allocates
133
+ per-document arrays when the final size is known; internal buffers are trimmed to the actual
134
+ count on freeze if the hint was too large.
135
+
82
136
  ---
83
137
 
84
138
  ## FrozenMiniSearch in a bit more detail
85
139
 
86
140
  - **`freeze()`** — snapshot a mutable index into compact typed postings + a radix tree keyed by term index.
87
141
  - **`fromDocuments()`** — build that structure in one pass (skips nested `Map` postings and radix cloning at freeze time).
142
+ - **`createFrozenIndexBuilder()`** — same output without a temporary `documents[]` array; finalize with `freezeFrozenIndexBuilder(builder)` (or `assembleFrozen(builder.freezeParams())` for custom assembly).
143
+ - **`fromAsyncIterable()`** — builds a frozen index from an async document stream (e.g. a CSV parser); equivalent to a builder fed by `for await` and finalized with `freezeFrozenIndexBuilder(builder)`.
88
144
  - **`saveBinary()` / `loadBinary()`** — MSv2 on write, MSv1 still readable; pass the same `fields` (and custom `tokenize` / `processTerm` if used at build time).
89
145
  - **Term frequencies** — stored as `Uint8` (max 255 per doc/term); only affects scores for extreme term repetition.
90
146
  - **`frozenMemoryBreakdown()`** — introspect postings, radix tree, and stored-field footprint.
@@ -94,6 +150,10 @@ Advanced exports:
94
150
  ```javascript
95
151
  import {
96
152
  FrozenMiniSearch,
153
+ createFrozenIndexBuilder,
154
+ freezeFrozenIndexBuilder,
155
+ FrozenIndexBuilder,
156
+ type FrozenIndexBuilderHints,
97
157
  buildFrozenFromDocuments,
98
158
  assembleFrozen,
99
159
  freezeFromMiniSearch,
@@ -124,8 +184,8 @@ TypeScript definitions: `dist/es/index.d.ts`.
124
184
  Reproducible comparisons (heap, load time, search latency) live under [`benchmarks/`](benchmarks/README.md):
125
185
 
126
186
  ```bash
127
- yarn benchmark:compare # terminal report
128
- yarn benchmark:diff # vs versioned baseline
187
+ npm run benchmark:compare # terminal report
188
+ npm run benchmark:diff # vs versioned baseline
129
189
  ```
130
190
 
131
191
  ---
@@ -133,11 +193,13 @@ yarn benchmark:diff # vs versioned baseline
133
193
  ## Development
134
194
 
135
195
  ```bash
136
- yarn install
137
- yarn test
138
- yarn build
196
+ npm install
197
+ npm test
198
+ npm run build
139
199
  ```
140
200
 
201
+ Use `npm run` for scripts (Yarn 1.x on Node 22 prints `url.parse` deprecation noise when invoking `yarn test` / `yarn build`).
202
+
141
203
  **Requirements:** a Node.js version with **ES2018+** language support. No browser UMD/CDN build in this fork (Node-only ESM + CJS).
142
204
 
143
205
  ---
@@ -1085,41 +1085,41 @@ function saveStoredFieldsForDocument(storeFields, extractField, document) {
1085
1085
  return documentFields;
1086
1086
  }
1087
1087
 
1088
- function getOrCreateTermIndex(builder, term) {
1089
- const existing = builder.index.get(term);
1088
+ function getOrCreateTermIndex(state, index, term) {
1089
+ const existing = index.get(term);
1090
1090
  if (existing != null)
1091
1091
  return existing;
1092
- const ti = builder.terms.length;
1093
- builder.terms.push(term);
1094
- builder.index.set(term, ti);
1092
+ const ti = state.terms.length;
1093
+ state.terms.push(term);
1094
+ index.set(term, ti);
1095
1095
  return ti;
1096
1096
  }
1097
- function appendPosting(builder, termIndex, fieldId, docId, freq) {
1098
- const slot = termIndex * builder.fieldCount + fieldId;
1099
- let docIds = builder.postingsDocIds[slot];
1100
- let freqs = builder.postingsFreqs[slot];
1097
+ function appendPosting(state, termIndex, fieldId, docId, freq) {
1098
+ const slot = termIndex * state.fieldCount + fieldId;
1099
+ let docIds = state.postingsDocIds[slot];
1100
+ let freqs = state.postingsFreqs[slot];
1101
1101
  if (docIds == null) {
1102
1102
  docIds = [];
1103
1103
  freqs = [];
1104
- builder.postingsDocIds[slot] = docIds;
1105
- builder.postingsFreqs[slot] = freqs;
1104
+ state.postingsDocIds[slot] = docIds;
1105
+ state.postingsFreqs[slot] = freqs;
1106
1106
  }
1107
1107
  docIds.push(docId);
1108
1108
  freqs.push(clampFreq(freq));
1109
1109
  }
1110
- function finalizeFlatPostings(builder) {
1111
- const termCount = builder.terms.length;
1112
- const slotCount = termCount * builder.fieldCount;
1110
+ function finalizeFlatPostings(state) {
1111
+ const termCount = state.terms.length;
1112
+ const slotCount = termCount * state.fieldCount;
1113
1113
  const postingsOffsets = new Uint32Array(slotCount);
1114
1114
  const postingsLengths = new Uint32Array(slotCount);
1115
1115
  const docScratch = [];
1116
1116
  const freqScratch = [];
1117
1117
  for (let ti = 0; ti < termCount; ti++) {
1118
- const base = ti * builder.fieldCount;
1119
- for (let f = 0; f < builder.fieldCount; f++) {
1118
+ const base = ti * state.fieldCount;
1119
+ for (let f = 0; f < state.fieldCount; f++) {
1120
1120
  const offset = docScratch.length;
1121
- const docIds = builder.postingsDocIds[base + f];
1122
- const freqs = builder.postingsFreqs[base + f];
1121
+ const docIds = state.postingsDocIds[base + f];
1122
+ const freqs = state.postingsFreqs[base + f];
1123
1123
  if (docIds == null || docIds.length === 0) {
1124
1124
  postingsOffsets[base + f] = offset;
1125
1125
  postingsLengths[base + f] = 0;
@@ -1140,84 +1140,132 @@ function finalizeFlatPostings(builder) {
1140
1140
  allFreqs: new Uint8Array(freqScratch)
1141
1141
  };
1142
1142
  }
1143
- function indexDocument(builder, document, shortId) {
1144
- const { extractField, stringifyField, tokenize, processTerm, fields, idField, storeFields } = builder.options;
1145
- const id = extractField(document, idField);
1146
- if (id == null) {
1147
- throw new Error(`MiniSearch: document does not have ID field "${idField}"`);
1148
- }
1149
- if (builder.idToShortId.has(id)) {
1150
- throw new Error(`MiniSearch: duplicate ID ${id}`);
1151
- }
1152
- builder.idToShortId.set(id, shortId);
1153
- builder.externalIds[shortId] = id;
1154
- builder.storedFields[shortId] = saveStoredFieldsForDocument(storeFields, extractField, document);
1155
- const documentCount = shortId + 1;
1156
- for (const field of fields) {
1157
- const fieldValue = extractField(document, field);
1158
- if (fieldValue == null)
1159
- continue;
1160
- const tokens = tokenize(stringifyField(fieldValue, field), field);
1161
- const fieldId = builder.fieldIds[field];
1162
- const uniqueTerms = new Set(tokens).size;
1163
- const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
1164
- builder.fieldLengthMatrix[shortId * builder.fieldCount + fieldId] = uniqueTerms;
1165
- updateAvgFieldLength(builder.avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
1166
- for (const [term, freq] of localFreqs) {
1167
- const ti = getOrCreateTermIndex(builder, term);
1168
- appendPosting(builder, ti, fieldId, shortId, freq);
1143
+ /** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
1144
+ class FrozenIndexBuilder {
1145
+ constructor(options, hints) {
1146
+ this._options = resolveIndexingOptions(options);
1147
+ this._fieldIds = buildFieldIds(this._options.fields);
1148
+ this._fieldCount = this._options.fields.length;
1149
+ this._index = new SearchableMap();
1150
+ this._terms = [];
1151
+ this._postingsDocIds = [];
1152
+ this._postingsFreqs = [];
1153
+ this._idToShortId = new Map();
1154
+ this._avgFieldLength = [];
1155
+ this._nextId = 0;
1156
+ this._frozen = false;
1157
+ const estimated = hints === null || hints === void 0 ? void 0 : hints.estimatedDocumentCount;
1158
+ if (estimated != null && estimated > 0) {
1159
+ this._externalIds = new Array(estimated);
1160
+ this._storedFields = new Array(estimated);
1161
+ this._fieldLengthData = new Array(estimated * this._fieldCount).fill(0);
1162
+ }
1163
+ else {
1164
+ this._externalIds = [];
1165
+ this._storedFields = [];
1166
+ this._fieldLengthData = [];
1169
1167
  }
1168
+ this._postingsState = {
1169
+ fieldCount: this._fieldCount,
1170
+ terms: this._terms,
1171
+ postingsDocIds: this._postingsDocIds,
1172
+ postingsFreqs: this._postingsFreqs
1173
+ };
1174
+ }
1175
+ /** Number of documents indexed so far (not yet frozen). */
1176
+ get documentCount() {
1177
+ return this._nextId;
1178
+ }
1179
+ add(document) {
1180
+ if (this._frozen) {
1181
+ throw new Error('FrozenIndexBuilder: cannot add after freezeParams()');
1182
+ }
1183
+ const { extractField, stringifyField, tokenize, processTerm, fields, idField, storeFields } = this._options;
1184
+ const id = extractField(document, idField);
1185
+ if (id == null) {
1186
+ throw new Error(`MiniSearch: document does not have ID field "${idField}"`);
1187
+ }
1188
+ if (this._idToShortId.has(id)) {
1189
+ throw new Error(`MiniSearch: duplicate ID ${id}`);
1190
+ }
1191
+ const shortId = this._nextId++;
1192
+ this._idToShortId.set(id, shortId);
1193
+ this._externalIds[shortId] = id;
1194
+ this._storedFields[shortId] = saveStoredFieldsForDocument(storeFields, extractField, document);
1195
+ const documentCount = shortId + 1;
1196
+ for (const field of fields) {
1197
+ const fieldValue = extractField(document, field);
1198
+ if (fieldValue == null)
1199
+ continue;
1200
+ const tokens = tokenize(stringifyField(fieldValue, field), field);
1201
+ const fieldId = this._fieldIds[field];
1202
+ const uniqueTerms = new Set(tokens).size;
1203
+ const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
1204
+ this._fieldLengthData[shortId * this._fieldCount + fieldId] = uniqueTerms;
1205
+ updateAvgFieldLength(this._avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
1206
+ for (const [term, freq] of localFreqs) {
1207
+ const ti = getOrCreateTermIndex(this._postingsState, this._index, term);
1208
+ appendPosting(this._postingsState, ti, fieldId, shortId, freq);
1209
+ }
1210
+ }
1211
+ }
1212
+ /**
1213
+ * Finalize this builder into assembly params. Call {@link assembleFrozen} or
1214
+ * {@link freezeFrozenIndexBuilder} to obtain a {@link FrozenMiniSearch} instance.
1215
+ */
1216
+ freezeParams() {
1217
+ var _a;
1218
+ if (this._frozen) {
1219
+ throw new Error('FrozenIndexBuilder: freezeParams() already called');
1220
+ }
1221
+ this._frozen = true;
1222
+ const documentCount = this._nextId;
1223
+ const flat = finalizeFlatPostings(this._postingsState);
1224
+ const avgFieldLength = new Float32Array(this._fieldCount);
1225
+ for (let f = 0; f < this._fieldCount; f++) {
1226
+ avgFieldLength[f] = (_a = this._avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
1227
+ }
1228
+ // Ensure exact size regardless of over- or under-estimated documentCount.
1229
+ this._fieldLengthData.length = documentCount * this._fieldCount;
1230
+ // Trim per-document arrays to actual count when estimatedDocumentCount was too large.
1231
+ const externalIds = this._externalIds.length > documentCount
1232
+ ? this._externalIds.slice(0, documentCount)
1233
+ : this._externalIds;
1234
+ const storedFields = this._storedFields.length > documentCount
1235
+ ? this._storedFields.slice(0, documentCount)
1236
+ : this._storedFields;
1237
+ return {
1238
+ options: this._options,
1239
+ documentCount,
1240
+ nextId: documentCount,
1241
+ fieldIds: this._fieldIds,
1242
+ fieldCount: this._fieldCount,
1243
+ externalIds,
1244
+ idToShortId: this._idToShortId,
1245
+ storedFields,
1246
+ fieldLengthMatrix: new Uint32Array(this._fieldLengthData),
1247
+ avgFieldLength,
1248
+ index: this._index,
1249
+ terms: this._terms,
1250
+ postingsOffsets: flat.postingsOffsets,
1251
+ postingsLengths: flat.postingsLengths,
1252
+ allDocIds: flat.allDocIds,
1253
+ allFreqs: flat.allFreqs
1254
+ };
1170
1255
  }
1171
1256
  }
1172
- function createBuilder(options, documentCount) {
1173
- const fieldCount = options.fields.length;
1174
- return {
1175
- options,
1176
- fieldIds: buildFieldIds(options.fields),
1177
- fieldCount,
1178
- documentCount,
1179
- index: new SearchableMap(),
1180
- terms: [],
1181
- postingsDocIds: [],
1182
- postingsFreqs: [],
1183
- externalIds: new Array(documentCount),
1184
- idToShortId: new Map(),
1185
- storedFields: new Array(documentCount),
1186
- fieldLengthMatrix: new Uint32Array(documentCount * fieldCount),
1187
- avgFieldLength: []
1188
- };
1257
+ /** Create an incremental builder for {@link FrozenMiniSearch}. */
1258
+ function createFrozenIndexBuilder(options, hints) {
1259
+ return new FrozenIndexBuilder(options, hints);
1189
1260
  }
1190
1261
  function buildFrozenParamsFromDocuments(documents, options) {
1191
- var _a;
1192
- const resolved = resolveIndexingOptions(options);
1193
- const documentCount = documents.length;
1194
- const builder = createBuilder(resolved, documentCount);
1195
- for (let d = 0; d < documentCount; d++) {
1196
- indexDocument(builder, documents[d], d);
1197
- }
1198
- const flat = finalizeFlatPostings(builder);
1199
- const avgFieldLength = new Float32Array(builder.fieldCount);
1200
- for (let f = 0; f < builder.fieldCount; f++) {
1201
- avgFieldLength[f] = (_a = builder.avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
1262
+ const builder = createFrozenIndexBuilder(options, {
1263
+ estimatedDocumentCount: documents.length
1264
+ });
1265
+ for (let d = 0; d < documents.length; d++) {
1266
+ builder.add(documents[d]);
1202
1267
  }
1203
- return {
1204
- options: resolved,
1205
- documentCount,
1206
- nextId: documentCount,
1207
- fieldIds: builder.fieldIds,
1208
- fieldCount: builder.fieldCount,
1209
- externalIds: builder.externalIds,
1210
- idToShortId: builder.idToShortId,
1211
- storedFields: builder.storedFields,
1212
- fieldLengthMatrix: builder.fieldLengthMatrix,
1213
- avgFieldLength,
1214
- index: builder.index,
1215
- terms: builder.terms,
1216
- postingsOffsets: flat.postingsOffsets,
1217
- postingsLengths: flat.postingsLengths,
1218
- allDocIds: flat.allDocIds,
1219
- allFreqs: flat.allFreqs
1220
- };
1268
+ return builder.freezeParams();
1221
1269
  }
1222
1270
 
1223
1271
  /** Shared wildcard query symbol for MiniSearch and FrozenMiniSearch */
@@ -1370,6 +1418,10 @@ function freezeFromMiniSearch(source) {
1370
1418
  function buildFrozenFromDocuments(documents, options) {
1371
1419
  return assembleFrozen(buildFrozenParamsFromDocuments(documents, options));
1372
1420
  }
1421
+ /** Finalize a {@link FrozenIndexBuilder} into a read-only index. */
1422
+ function freezeFrozenIndexBuilder(builder) {
1423
+ return assembleFrozen(builder.freezeParams());
1424
+ }
1373
1425
  class FrozenMiniSearch {
1374
1426
  constructor(params) {
1375
1427
  this._options = params.options;
@@ -1560,6 +1612,17 @@ class FrozenMiniSearch {
1560
1612
  static fromDocuments(documents, options) {
1561
1613
  return buildFrozenFromDocuments(documents, options);
1562
1614
  }
1615
+ /**
1616
+ * Build a read-only index from an async stream of documents (e.g. CSV parser).
1617
+ * For sync iterables, use {@link createFrozenIndexBuilder} with `for...of` instead.
1618
+ */
1619
+ static async fromAsyncIterable(iterable, options) {
1620
+ const builder = createFrozenIndexBuilder(options);
1621
+ for await (const document of iterable) {
1622
+ builder.add(document);
1623
+ }
1624
+ return freezeFrozenIndexBuilder(builder);
1625
+ }
1563
1626
  getFieldLength(docId, fieldId) {
1564
1627
  var _a;
1565
1628
  return (_a = this._fieldLengthMatrix[docId * this._fieldCount + fieldId]) !== null && _a !== void 0 ? _a : 0;
@@ -2952,10 +3015,13 @@ const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
2952
3015
 
2953
3016
  exports.AND = AND;
2954
3017
  exports.AND_NOT = AND_NOT;
3018
+ exports.FrozenIndexBuilder = FrozenIndexBuilder;
2955
3019
  exports.FrozenMiniSearch = FrozenMiniSearch;
2956
3020
  exports.OR = OR;
2957
3021
  exports.assembleFrozen = assembleFrozen;
2958
3022
  exports.buildFrozenFromDocuments = buildFrozenFromDocuments;
3023
+ exports.createFrozenIndexBuilder = createFrozenIndexBuilder;
2959
3024
  exports.default = MiniSearch;
2960
3025
  exports.freezeFromMiniSearch = freezeFromMiniSearch;
3026
+ exports.freezeFrozenIndexBuilder = freezeFrozenIndexBuilder;
2961
3027
  exports.frozenMemoryBreakdown = frozenMemoryBreakdown;
@@ -254,6 +254,40 @@ declare class SearchableMap<T = any> {
254
254
  }): SearchableMap<any>;
255
255
  }
256
256
 
257
+ interface FrozenIndexBuilderHints {
258
+ /** Pre-size per-document arrays when the final document count is known. */
259
+ estimatedDocumentCount?: number;
260
+ }
261
+ /** Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array. */
262
+ declare class FrozenIndexBuilder<T> {
263
+ private readonly _options;
264
+ private readonly _fieldIds;
265
+ private readonly _fieldCount;
266
+ private readonly _index;
267
+ private readonly _terms;
268
+ private readonly _postingsDocIds;
269
+ private readonly _postingsFreqs;
270
+ private readonly _externalIds;
271
+ private readonly _idToShortId;
272
+ private readonly _storedFields;
273
+ private readonly _fieldLengthData;
274
+ private readonly _avgFieldLength;
275
+ private readonly _postingsState;
276
+ private _nextId;
277
+ private _frozen;
278
+ constructor(options: Options<T>, hints?: FrozenIndexBuilderHints);
279
+ /** Number of documents indexed so far (not yet frozen). */
280
+ get documentCount(): number;
281
+ add(document: T): void;
282
+ /**
283
+ * Finalize this builder into assembly params. Call {@link assembleFrozen} or
284
+ * {@link freezeFrozenIndexBuilder} to obtain a {@link FrozenMiniSearch} instance.
285
+ */
286
+ freezeParams(): FrozenAssembleParams<T>;
287
+ }
288
+ /** Create an incremental builder for {@link FrozenMiniSearch}. */
289
+ declare function createFrozenIndexBuilder<T>(options: Options<T>, hints?: FrozenIndexBuilderHints): FrozenIndexBuilder<T>;
290
+
257
291
  /** Shared wildcard query symbol for MiniSearch and FrozenMiniSearch */
258
292
  declare const WILDCARD_QUERY: unique symbol;
259
293
 
@@ -345,6 +379,8 @@ interface FrozenAssembleParams<T = any> {
345
379
  declare function assembleFrozen<T>(params: FrozenAssembleParams<T>): FrozenMiniSearch<T>;
346
380
  declare function freezeFromMiniSearch<T>(source: FreezeSource<T>): FrozenMiniSearch<T>;
347
381
  declare function buildFrozenFromDocuments<T>(documents: readonly T[], options: Options<T>): FrozenMiniSearch<T>;
382
+ /** Finalize a {@link FrozenIndexBuilder} into a read-only index. */
383
+ declare function freezeFrozenIndexBuilder<T>(builder: FrozenIndexBuilder<T>): FrozenMiniSearch<T>;
348
384
  declare class FrozenMiniSearch<T = any> {
349
385
  private readonly _options;
350
386
  private readonly _index;
@@ -408,6 +444,11 @@ declare class FrozenMiniSearch<T = any> {
408
444
  * incremental updates before freezing.
409
445
  */
410
446
  static fromDocuments<T>(documents: readonly T[], options: Options<T>): FrozenMiniSearch<T>;
447
+ /**
448
+ * Build a read-only index from an async stream of documents (e.g. CSV parser).
449
+ * For sync iterables, use {@link createFrozenIndexBuilder} with `for...of` instead.
450
+ */
451
+ static fromAsyncIterable<T>(iterable: AsyncIterable<T>, options: Options<T>): Promise<FrozenMiniSearch<T>>;
411
452
  private getFieldLength;
412
453
  private fieldTermDataFor;
413
454
  private aggregateContext;
@@ -1632,4 +1673,4 @@ interface SerializedIndexEntry {
1632
1673
  [key: string]: number;
1633
1674
  }
1634
1675
 
1635
- export { AND, AND_NOT, type AsPlainObject, type AutoVacuumOptions, type BM25Params, type CombinationOperator, type FrozenAssembleParams, type FrozenMemoryBreakdown, FrozenMiniSearch, type LowercaseCombinationOperator, type MatchInfo, OR, type Options, type Query, type QueryCombination, type SearchOptions, type SearchResult, type Suggestion, type VacuumConditions, type VacuumOptions, type Wildcard, assembleFrozen, buildFrozenFromDocuments, MiniSearch as default, freezeFromMiniSearch, frozenMemoryBreakdown };
1676
+ export { AND, AND_NOT, type AsPlainObject, type AutoVacuumOptions, type BM25Params, type CombinationOperator, type FrozenAssembleParams, FrozenIndexBuilder, type FrozenIndexBuilderHints, type FrozenMemoryBreakdown, FrozenMiniSearch, type LowercaseCombinationOperator, type MatchInfo, OR, type Options, type Query, type QueryCombination, type SearchOptions, type SearchResult, type Suggestion, type VacuumConditions, type VacuumOptions, type Wildcard, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, MiniSearch as default, freezeFromMiniSearch, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
package/dist/es/index.js CHANGED
@@ -1081,41 +1081,41 @@ function saveStoredFieldsForDocument(storeFields, extractField, document) {
1081
1081
  return documentFields;
1082
1082
  }
1083
1083
 
1084
- function getOrCreateTermIndex(builder, term) {
1085
- const existing = builder.index.get(term);
1084
+ function getOrCreateTermIndex(state, index, term) {
1085
+ const existing = index.get(term);
1086
1086
  if (existing != null)
1087
1087
  return existing;
1088
- const ti = builder.terms.length;
1089
- builder.terms.push(term);
1090
- builder.index.set(term, ti);
1088
+ const ti = state.terms.length;
1089
+ state.terms.push(term);
1090
+ index.set(term, ti);
1091
1091
  return ti;
1092
1092
  }
1093
- function appendPosting(builder, termIndex, fieldId, docId, freq) {
1094
- const slot = termIndex * builder.fieldCount + fieldId;
1095
- let docIds = builder.postingsDocIds[slot];
1096
- let freqs = builder.postingsFreqs[slot];
1093
+ function appendPosting(state, termIndex, fieldId, docId, freq) {
1094
+ const slot = termIndex * state.fieldCount + fieldId;
1095
+ let docIds = state.postingsDocIds[slot];
1096
+ let freqs = state.postingsFreqs[slot];
1097
1097
  if (docIds == null) {
1098
1098
  docIds = [];
1099
1099
  freqs = [];
1100
- builder.postingsDocIds[slot] = docIds;
1101
- builder.postingsFreqs[slot] = freqs;
1100
+ state.postingsDocIds[slot] = docIds;
1101
+ state.postingsFreqs[slot] = freqs;
1102
1102
  }
1103
1103
  docIds.push(docId);
1104
1104
  freqs.push(clampFreq(freq));
1105
1105
  }
1106
- function finalizeFlatPostings(builder) {
1107
- const termCount = builder.terms.length;
1108
- const slotCount = termCount * builder.fieldCount;
1106
+ function finalizeFlatPostings(state) {
1107
+ const termCount = state.terms.length;
1108
+ const slotCount = termCount * state.fieldCount;
1109
1109
  const postingsOffsets = new Uint32Array(slotCount);
1110
1110
  const postingsLengths = new Uint32Array(slotCount);
1111
1111
  const docScratch = [];
1112
1112
  const freqScratch = [];
1113
1113
  for (let ti = 0; ti < termCount; ti++) {
1114
- const base = ti * builder.fieldCount;
1115
- for (let f = 0; f < builder.fieldCount; f++) {
1114
+ const base = ti * state.fieldCount;
1115
+ for (let f = 0; f < state.fieldCount; f++) {
1116
1116
  const offset = docScratch.length;
1117
- const docIds = builder.postingsDocIds[base + f];
1118
- const freqs = builder.postingsFreqs[base + f];
1117
+ const docIds = state.postingsDocIds[base + f];
1118
+ const freqs = state.postingsFreqs[base + f];
1119
1119
  if (docIds == null || docIds.length === 0) {
1120
1120
  postingsOffsets[base + f] = offset;
1121
1121
  postingsLengths[base + f] = 0;
@@ -1136,84 +1136,132 @@ function finalizeFlatPostings(builder) {
1136
1136
  allFreqs: new Uint8Array(freqScratch)
1137
1137
  };
1138
1138
  }
1139
- function indexDocument(builder, document, shortId) {
1140
- const { extractField, stringifyField, tokenize, processTerm, fields, idField, storeFields } = builder.options;
1141
- const id = extractField(document, idField);
1142
- if (id == null) {
1143
- throw new Error(`MiniSearch: document does not have ID field "${idField}"`);
1144
- }
1145
- if (builder.idToShortId.has(id)) {
1146
- throw new Error(`MiniSearch: duplicate ID ${id}`);
1147
- }
1148
- builder.idToShortId.set(id, shortId);
1149
- builder.externalIds[shortId] = id;
1150
- builder.storedFields[shortId] = saveStoredFieldsForDocument(storeFields, extractField, document);
1151
- const documentCount = shortId + 1;
1152
- for (const field of fields) {
1153
- const fieldValue = extractField(document, field);
1154
- if (fieldValue == null)
1155
- continue;
1156
- const tokens = tokenize(stringifyField(fieldValue, field), field);
1157
- const fieldId = builder.fieldIds[field];
1158
- const uniqueTerms = new Set(tokens).size;
1159
- const localFreqs = collectFieldTermFreqs(tokens, field, processTerm);
1160
- builder.fieldLengthMatrix[shortId * builder.fieldCount + fieldId] = uniqueTerms;
1161
- updateAvgFieldLength(builder.avgFieldLength, fieldId, documentCount - 1, uniqueTerms);
1162
- for (const [term, freq] of localFreqs) {
1163
- const ti = getOrCreateTermIndex(builder, term);
1164
- appendPosting(builder, ti, fieldId, shortId, freq);
1139
/**
 * Incremental builder for {@link FrozenMiniSearch} without materializing a full `documents[]` array.
 *
 * Feed documents one at a time through {@link FrozenIndexBuilder#add}, then call
 * {@link FrozenIndexBuilder#freezeParams} exactly once. After that call the builder rejects
 * any further `add()` or `freezeParams()`.
 */
class FrozenIndexBuilder {
  /**
   * @param options Indexing options; normalized through `resolveIndexingOptions`.
   * @param [hints] Optional sizing hints.
   * @param [hints.estimatedDocumentCount] Expected number of documents, used only to pre-size
   *   the per-document arrays. Fractional values are rounded down; values that are not a
   *   positive finite number, or that are too large to be an array length, are ignored.
   */
  constructor(options, hints) {
    this._options = resolveIndexingOptions(options);
    this._fieldIds = buildFieldIds(this._options.fields);
    this._fieldCount = this._options.fields.length;
    this._index = new SearchableMap();
    this._terms = [];
    this._postingsDocIds = [];
    this._postingsFreqs = [];
    this._idToShortId = new Map();
    this._avgFieldLength = [];
    this._nextId = 0;
    this._frozen = false;
    const estimated = hints == null ? undefined : hints.estimatedDocumentCount;
    // The estimate is only a hint, so it must never make construction fail: `new Array(n)`
    // throws a RangeError when `n` is fractional, infinite, or above 2^32 - 1.
    const MAX_ARRAY_LENGTH = 0xFFFFFFFF;
    const rounded = Number.isFinite(estimated) ? Math.floor(estimated) : 0;
    const fitsInArray = rounded <= MAX_ARRAY_LENGTH &&
      rounded * this._fieldCount <= MAX_ARRAY_LENGTH;
    const presize = rounded > 0 && fitsInArray ? rounded : 0;
    if (presize > 0) {
      this._externalIds = new Array(presize);
      this._storedFields = new Array(presize);
      this._fieldLengthData = new Array(presize * this._fieldCount).fill(0);
    }
    else {
      this._externalIds = [];
      this._storedFields = [];
      this._fieldLengthData = [];
    }
    // View handed to the posting helpers; it shares the arrays above rather than copying them.
    this._postingsState = {
      fieldCount: this._fieldCount,
      terms: this._terms,
      postingsDocIds: this._postingsDocIds,
      postingsFreqs: this._postingsFreqs
    };
  }
  /** Number of documents indexed so far (not yet frozen). */
  get documentCount() {
    return this._nextId;
  }
  /**
   * Index one document.
   *
   * Every step that receives a user-supplied callback (`extractField`, `stringifyField`,
   * `tokenize`, `processTerm`) runs before any builder state is modified. If one of them
   * throws, the document is not registered at all, so the caller may skip it or retry the
   * same ID.
   *
   * @param document The document to index.
   * @throws {Error} If the builder is already frozen, if the document has no ID field, or if
   *   its ID was already added.
   */
  add(document) {
    if (this._frozen) {
      throw new Error('FrozenIndexBuilder: cannot add after freezeParams()');
    }
    const { extractField, stringifyField, tokenize, processTerm, fields, idField, storeFields } = this._options;
    const id = extractField(document, idField);
    if (id == null) {
      throw new Error(`MiniSearch: document does not have ID field "${idField}"`);
    }
    if (this._idToShortId.has(id)) {
      throw new Error(`MiniSearch: duplicate ID ${id}`);
    }
    // Phase 1: analyse the document without touching builder state.
    const stored = saveStoredFieldsForDocument(storeFields, extractField, document);
    const analysed = [];
    for (const field of fields) {
      const fieldValue = extractField(document, field);
      if (fieldValue == null) {
        continue;
      }
      const tokens = tokenize(stringifyField(fieldValue, field), field);
      analysed.push({
        fieldId: this._fieldIds[field],
        // Field length is the number of distinct raw tokens, counted before term processing.
        uniqueTerms: new Set(tokens).size,
        localFreqs: collectFieldTermFreqs(tokens, field, processTerm)
      });
    }
    // Phase 2: commit. No user callback is passed to any helper below.
    const shortId = this._nextId++;
    this._idToShortId.set(id, shortId);
    this._externalIds[shortId] = id;
    this._storedFields[shortId] = stored;
    for (const { fieldId, uniqueTerms, localFreqs } of analysed) {
      this._fieldLengthData[shortId * this._fieldCount + fieldId] = uniqueTerms;
      // `shortId` equals the number of documents committed before this one.
      updateAvgFieldLength(this._avgFieldLength, fieldId, shortId, uniqueTerms);
      for (const [term, freq] of localFreqs) {
        const ti = getOrCreateTermIndex(this._postingsState, this._index, term);
        appendPosting(this._postingsState, ti, fieldId, shortId, freq);
      }
    }
  }
  /**
   * Finalize this builder into assembly params. Call {@link assembleFrozen} or
   * {@link freezeFrozenIndexBuilder} to obtain a {@link FrozenMiniSearch} instance.
   *
   * @returns The assembly params object; it shares this builder's maps and arrays where no
   *   trimming or typed-array conversion was needed.
   * @throws {Error} If called more than once.
   */
  freezeParams() {
    if (this._frozen) {
      throw new Error('FrozenIndexBuilder: freezeParams() already called');
    }
    this._frozen = true;
    const documentCount = this._nextId;
    const flat = finalizeFlatPostings(this._postingsState);
    const avgFieldLength = new Float32Array(this._fieldCount);
    for (let f = 0; f < this._fieldCount; f++) {
      const avg = this._avgFieldLength[f];
      // Fields that never received a value have no running average yet.
      avgFieldLength[f] = avg != null ? avg : 0;
    }
    // Ensure exact size regardless of over- or under-estimated documentCount.
    this._fieldLengthData.length = documentCount * this._fieldCount;
    // Trim per-document arrays to actual count when estimatedDocumentCount was too large.
    const externalIds = this._externalIds.length > documentCount
      ? this._externalIds.slice(0, documentCount)
      : this._externalIds;
    const storedFields = this._storedFields.length > documentCount
      ? this._storedFields.slice(0, documentCount)
      : this._storedFields;
    return {
      options: this._options,
      documentCount,
      nextId: documentCount,
      fieldIds: this._fieldIds,
      fieldCount: this._fieldCount,
      externalIds,
      idToShortId: this._idToShortId,
      storedFields,
      // Slots that were never written convert to 0 in the typed array.
      fieldLengthMatrix: new Uint32Array(this._fieldLengthData),
      avgFieldLength,
      index: this._index,
      terms: this._terms,
      postingsOffsets: flat.postingsOffsets,
      postingsLengths: flat.postingsLengths,
      allDocIds: flat.allDocIds,
      allFreqs: flat.allFreqs
    };
  }
}
1168
- function createBuilder(options, documentCount) {
1169
- const fieldCount = options.fields.length;
1170
- return {
1171
- options,
1172
- fieldIds: buildFieldIds(options.fields),
1173
- fieldCount,
1174
- documentCount,
1175
- index: new SearchableMap(),
1176
- terms: [],
1177
- postingsDocIds: [],
1178
- postingsFreqs: [],
1179
- externalIds: new Array(documentCount),
1180
- idToShortId: new Map(),
1181
- storedFields: new Array(documentCount),
1182
- fieldLengthMatrix: new Uint32Array(documentCount * fieldCount),
1183
- avgFieldLength: []
1184
- };
1253
/**
 * Factory for {@link FrozenIndexBuilder}: constructs a builder with the given arguments.
 *
 * @param options Indexing options, passed to the builder constructor unchanged.
 * @param [hints] Optional builder hints, passed to the builder constructor unchanged.
 * @returns A new, empty {@link FrozenIndexBuilder}.
 */
function createFrozenIndexBuilder(options, hints) {
  const builder = new FrozenIndexBuilder(options, hints);
  return builder;
}
1186
1257
  function buildFrozenParamsFromDocuments(documents, options) {
1187
- var _a;
1188
- const resolved = resolveIndexingOptions(options);
1189
- const documentCount = documents.length;
1190
- const builder = createBuilder(resolved, documentCount);
1191
- for (let d = 0; d < documentCount; d++) {
1192
- indexDocument(builder, documents[d], d);
1193
- }
1194
- const flat = finalizeFlatPostings(builder);
1195
- const avgFieldLength = new Float32Array(builder.fieldCount);
1196
- for (let f = 0; f < builder.fieldCount; f++) {
1197
- avgFieldLength[f] = (_a = builder.avgFieldLength[f]) !== null && _a !== void 0 ? _a : 0;
1258
+ const builder = createFrozenIndexBuilder(options, {
1259
+ estimatedDocumentCount: documents.length
1260
+ });
1261
+ for (let d = 0; d < documents.length; d++) {
1262
+ builder.add(documents[d]);
1198
1263
  }
1199
- return {
1200
- options: resolved,
1201
- documentCount,
1202
- nextId: documentCount,
1203
- fieldIds: builder.fieldIds,
1204
- fieldCount: builder.fieldCount,
1205
- externalIds: builder.externalIds,
1206
- idToShortId: builder.idToShortId,
1207
- storedFields: builder.storedFields,
1208
- fieldLengthMatrix: builder.fieldLengthMatrix,
1209
- avgFieldLength,
1210
- index: builder.index,
1211
- terms: builder.terms,
1212
- postingsOffsets: flat.postingsOffsets,
1213
- postingsLengths: flat.postingsLengths,
1214
- allDocIds: flat.allDocIds,
1215
- allFreqs: flat.allFreqs
1216
- };
1264
+ return builder.freezeParams();
1217
1265
  }
1218
1266
 
1219
1267
  /** Shared wildcard query symbol for MiniSearch and FrozenMiniSearch */
@@ -1366,6 +1414,10 @@ function freezeFromMiniSearch(source) {
1366
1414
  function buildFrozenFromDocuments(documents, options) {
1367
1415
  return assembleFrozen(buildFrozenParamsFromDocuments(documents, options));
1368
1416
  }
1417
/**
 * Turn a {@link FrozenIndexBuilder} into a read-only index.
 *
 * Calls the builder's `freezeParams()` and passes the result to `assembleFrozen`.
 *
 * @param builder The builder to finalize.
 * @returns The value returned by `assembleFrozen`.
 */
function freezeFrozenIndexBuilder(builder) {
  const params = builder.freezeParams();
  return assembleFrozen(params);
}
1369
1421
  class FrozenMiniSearch {
1370
1422
  constructor(params) {
1371
1423
  this._options = params.options;
@@ -1556,6 +1608,17 @@ class FrozenMiniSearch {
1556
1608
  static fromDocuments(documents, options) {
1557
1609
  return buildFrozenFromDocuments(documents, options);
1558
1610
  }
1611
+ /**
1612
+ * Build a read-only index from an async stream of documents (e.g. CSV parser).
1613
+ * For sync iterables, use {@link createFrozenIndexBuilder} with `for...of` instead.
1614
+ */
1615
+ static async fromAsyncIterable(iterable, options) {
1616
+ const builder = createFrozenIndexBuilder(options);
1617
+ for await (const document of iterable) {
1618
+ builder.add(document);
1619
+ }
1620
+ return freezeFrozenIndexBuilder(builder);
1621
+ }
1559
1622
  getFieldLength(docId, fieldId) {
1560
1623
  var _a;
1561
1624
  return (_a = this._fieldLengthMatrix[docId * this._fieldCount + fieldId]) !== null && _a !== void 0 ? _a : 0;
@@ -2946,4 +3009,4 @@ const objectToNumericMapAsync = async (object) => {
2946
3009
  };
2947
3010
  const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
2948
3011
 
2949
- export { AND, AND_NOT, FrozenMiniSearch, OR, assembleFrozen, buildFrozenFromDocuments, MiniSearch as default, freezeFromMiniSearch, frozenMemoryBreakdown };
3012
+ export { AND, AND_NOT, FrozenIndexBuilder, FrozenMiniSearch, OR, assembleFrozen, buildFrozenFromDocuments, createFrozenIndexBuilder, MiniSearch as default, freezeFromMiniSearch, freezeFrozenIndexBuilder, frozenMemoryBreakdown };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yoch/minisearch",
3
- "version": "8.0.0-beta.2",
3
+ "version": "8.0.0-beta.3",
4
4
  "description": "Node.js full-text search with FrozenMiniSearch and binary index snapshots",
5
5
  "main": "dist/cjs/index.cjs",
6
6
  "module": "dist/es/index.js",
@@ -94,23 +94,23 @@
94
94
  "test": "jest",
95
95
  "test-watch": "jest --watch",
96
96
  "coverage": "jest --coverage",
97
- "benchmark": "yarn build-benchmark && NODE_ENV=production node --expose-gc benchmarks/dist/index.cjs",
98
- "benchmark:baseline": "yarn build && node --expose-gc benchmarks/baseline.js",
99
- "benchmark:compare": "yarn build && node --expose-gc benchmarks/compare.js",
100
- "benchmark:record": "yarn build && node --expose-gc benchmarks/captureBaseline.js",
101
- "benchmark:record:reference": "yarn build && node --expose-gc benchmarks/captureBaseline.js --reference",
102
- "benchmark:diff": "yarn build && node --expose-gc benchmarks/diffBaseline.js",
97
+ "benchmark": "npm run build-benchmark && NODE_ENV=production node --expose-gc benchmarks/dist/index.cjs",
98
+ "benchmark:baseline": "npm run build && node --expose-gc benchmarks/baseline.js",
99
+ "benchmark:compare": "npm run build && node --expose-gc benchmarks/compare.js",
100
+ "benchmark:record": "npm run build && node --expose-gc benchmarks/captureBaseline.js",
101
+ "benchmark:record:reference": "npm run build && node --expose-gc benchmarks/captureBaseline.js --reference",
102
+ "benchmark:diff": "npm run build && node --expose-gc benchmarks/diffBaseline.js",
103
103
  "benchmark:diff:latest": "node --expose-gc benchmarks/diffBaseline.js --latest",
104
- "benchmark:baseline:update": "yarn benchmark:record:reference",
105
- "build-benchmark": "BENCHMARKS=true yarn build",
106
- "build": "yarn clean-build && NODE_ENV=production rollup -c && node scripts/postbuild-cjs.cjs",
104
+ "benchmark:baseline:update": "npm run benchmark:record:reference",
105
+ "build-benchmark": "BENCHMARKS=true npm run build",
106
+ "build": "npm run clean-build && NODE_ENV=production rollup -c && node scripts/postbuild-cjs.cjs",
107
107
  "clean-build": "rm -rf dist",
108
- "build-minified": "MINIFY=true yarn build",
109
- "build-docs": "typedoc --options typedoc.json && yarn build-demo",
108
+ "build-minified": "MINIFY=true npm run build",
109
+ "build-docs": "typedoc --options typedoc.json && npm run build-demo",
110
110
  "build-demo": "mkdir -p ./docs/demo && cp -r ./examples/plain_js/. ./docs/demo",
111
111
  "lint": "eslint 'src/**/*.{js,ts}'",
112
112
  "lintfix": "eslint --fix 'src/**/*.{js,ts}'",
113
- "prepublishOnly": "yarn test && yarn build"
113
+ "prepublishOnly": "npm test && npm run build"
114
114
  },
115
115
  "sideEffects": false
116
116
  }