@dengxifeng/lancedb 0.26.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/AGENTS.md +13 -0
  2. package/CONTRIBUTING.md +76 -0
  3. package/README.md +37 -0
  4. package/dist/arrow.d.ts +279 -0
  5. package/dist/arrow.js +1316 -0
  6. package/dist/connection.d.ts +259 -0
  7. package/dist/connection.js +224 -0
  8. package/dist/embedding/embedding_function.d.ts +103 -0
  9. package/dist/embedding/embedding_function.js +192 -0
  10. package/dist/embedding/index.d.ts +27 -0
  11. package/dist/embedding/index.js +101 -0
  12. package/dist/embedding/openai.d.ts +16 -0
  13. package/dist/embedding/openai.js +93 -0
  14. package/dist/embedding/registry.d.ts +74 -0
  15. package/dist/embedding/registry.js +165 -0
  16. package/dist/embedding/transformers.d.ts +36 -0
  17. package/dist/embedding/transformers.js +122 -0
  18. package/dist/header.d.ts +162 -0
  19. package/dist/header.js +217 -0
  20. package/dist/index.d.ts +85 -0
  21. package/dist/index.js +106 -0
  22. package/dist/indices.d.ts +692 -0
  23. package/dist/indices.js +156 -0
  24. package/dist/merge.d.ts +80 -0
  25. package/dist/merge.js +92 -0
  26. package/dist/native.d.ts +585 -0
  27. package/dist/native.js +339 -0
  28. package/dist/permutation.d.ts +143 -0
  29. package/dist/permutation.js +184 -0
  30. package/dist/query.d.ts +581 -0
  31. package/dist/query.js +853 -0
  32. package/dist/rerankers/index.d.ts +5 -0
  33. package/dist/rerankers/index.js +19 -0
  34. package/dist/rerankers/rrf.d.ts +14 -0
  35. package/dist/rerankers/rrf.js +28 -0
  36. package/dist/sanitize.d.ts +32 -0
  37. package/dist/sanitize.js +473 -0
  38. package/dist/table.d.ts +581 -0
  39. package/dist/table.js +321 -0
  40. package/dist/util.d.ts +14 -0
  41. package/dist/util.js +77 -0
  42. package/license_header.txt +2 -0
  43. package/package.json +122 -0
@@ -0,0 +1,692 @@
1
+ /**
2
+ * Options to create an `IVF_PQ` index
3
+ */
4
+ export interface IvfPqOptions {
5
+ /**
6
+ * The number of IVF partitions to create.
7
+ *
8
+ * This value should generally scale with the number of rows in the dataset.
9
+ * By default the number of partitions is the square root of the number of
10
+ * rows.
11
+ *
12
+ * If this value is too large then the first part of the search (picking the
13
+ * right partition) will be slow. If this value is too small then the second
14
+ * part of the search (searching within a partition) will be slow.
15
+ */
16
+ numPartitions?: number;
17
+ /**
18
+ * Number of sub-vectors of PQ.
19
+ *
20
+ * This value controls how much the vector is compressed during the quantization step.
21
+ * The more sub vectors there are the less the vector is compressed. The default is
22
+ * the dimension of the vector divided by 16. If the dimension is not evenly divisible
23
+ * by 16 we use the dimension divded by 8.
24
+ *
25
+ * The above two cases are highly preferred. Having 8 or 16 values per subvector allows
26
+ * us to use efficient SIMD instructions.
27
+ *
28
+ * If the dimension is not visible by 8 then we use 1 subvector. This is not ideal and
29
+ * will likely result in poor performance.
30
+ */
31
+ numSubVectors?: number;
32
+ /**
33
+ * Number of bits per sub-vector.
34
+ *
35
+ * This value controls how much each subvector is compressed. The more bits the more
36
+ * accurate the index will be but the slower search. The default is 8 bits.
37
+ *
38
+ * The number of bits must be 4 or 8.
39
+ */
40
+ numBits?: number;
41
+ /**
42
+ * Distance type to use to build the index.
43
+ *
44
+ * Default value is "l2".
45
+ *
46
+ * This is used when training the index to calculate the IVF partitions
47
+ * (vectors are grouped in partitions with similar vectors according to this
48
+ * distance type) and to calculate a subvector's code during quantization.
49
+ *
50
+ * The distance type used to train an index MUST match the distance type used
51
+ * to search the index. Failure to do so will yield inaccurate results.
52
+ *
53
+ * The following distance types are available:
54
+ *
55
+ * "l2" - Euclidean distance. This is a very common distance metric that
56
+ * accounts for both magnitude and direction when determining the distance
57
+ * between vectors. l2 distance has a range of [0, ∞).
58
+ *
59
+ * "cosine" - Cosine distance. Cosine distance is a distance metric
60
+ * calculated from the cosine similarity between two vectors. Cosine
61
+ * similarity is a measure of similarity between two non-zero vectors of an
62
+ * inner product space. It is defined to equal the cosine of the angle
63
+ * between them. Unlike l2, the cosine distance is not affected by the
64
+ * magnitude of the vectors. Cosine distance has a range of [0, 2].
65
+ *
66
+ * Note: the cosine distance is undefined when one (or both) of the vectors
67
+ * are all zeros (there is no direction). These vectors are invalid and may
68
+ * never be returned from a vector search.
69
+ *
70
+ * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
71
+ * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
72
+ * l2 norm is 1), then dot distance is equivalent to the cosine distance.
73
+ */
74
+ distanceType?: "l2" | "cosine" | "dot";
75
+ /**
76
+ * Max iteration to train IVF kmeans.
77
+ *
78
+ * When training an IVF PQ index we use kmeans to calculate the partitions. This parameter
79
+ * controls how many iterations of kmeans to run.
80
+ *
81
+ * Increasing this might improve the quality of the index but in most cases these extra
82
+ * iterations have diminishing returns.
83
+ *
84
+ * The default value is 50.
85
+ */
86
+ maxIterations?: number;
87
+ /**
88
+ * The number of vectors, per partition, to sample when training IVF kmeans.
89
+ *
90
+ * When an IVF PQ index is trained, we need to calculate partitions. These are groups
91
+ * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
92
+ *
93
+ * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
94
+ * random sample of the data. This parameter controls the size of the sample. The total
95
+ * number of vectors used to train the index is `sample_rate * num_partitions`.
96
+ *
97
+ * Increasing this value might improve the quality of the index but in most cases the
98
+ * default should be sufficient.
99
+ *
100
+ * The default value is 256.
101
+ */
102
+ sampleRate?: number;
103
+ }
104
+ export interface IvfRqOptions {
105
+ /**
106
+ * The number of IVF partitions to create.
107
+ *
108
+ * This value should generally scale with the number of rows in the dataset.
109
+ * By default the number of partitions is the square root of the number of
110
+ * rows.
111
+ *
112
+ * If this value is too large then the first part of the search (picking the
113
+ * right partition) will be slow. If this value is too small then the second
114
+ * part of the search (searching within a partition) will be slow.
115
+ */
116
+ numPartitions?: number;
117
+ /**
118
+ * Number of bits per dimension for residual quantization.
119
+ *
120
+ * This value controls how much each residual component is compressed. The more
121
+ * bits, the more accurate the index will be but the slower search. Typical values
122
+ * are small integers; the default is 1 bit per dimension.
123
+ */
124
+ numBits?: number;
125
+ /**
126
+ * Distance type to use to build the index.
127
+ *
128
+ * Default value is "l2".
129
+ *
130
+ * This is used when training the index to calculate the IVF partitions
131
+ * (vectors are grouped in partitions with similar vectors according to this
132
+ * distance type) and during quantization.
133
+ *
134
+ * The distance type used to train an index MUST match the distance type used
135
+ * to search the index. Failure to do so will yield inaccurate results.
136
+ *
137
+ * The following distance types are available:
138
+ *
139
+ * "l2" - Euclidean distance.
140
+ * "cosine" - Cosine distance.
141
+ * "dot" - Dot product.
142
+ */
143
+ distanceType?: "l2" | "cosine" | "dot";
144
+ /**
145
+ * Max iterations to train IVF kmeans.
146
+ *
147
+ * When training an IVF index we use kmeans to calculate the partitions. This parameter
148
+ * controls how many iterations of kmeans to run.
149
+ *
150
+ * The default value is 50.
151
+ */
152
+ maxIterations?: number;
153
+ /**
154
+ * The number of vectors, per partition, to sample when training IVF kmeans.
155
+ *
156
+ * When an IVF index is trained, we need to calculate partitions. These are groups
157
+ * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
158
+ *
159
+ * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
160
+ * random sample of the data. This parameter controls the size of the sample. The total
161
+ * number of vectors used to train the index is `sample_rate * num_partitions`.
162
+ *
163
+ * Increasing this value might improve the quality of the index but in most cases the
164
+ * default should be sufficient.
165
+ *
166
+ * The default value is 256.
167
+ */
168
+ sampleRate?: number;
169
+ }
170
+ /**
171
+ * Options to create an `HNSW_PQ` index
172
+ */
173
+ export interface HnswPqOptions {
174
+ /**
175
+ * The distance metric used to train the index.
176
+ *
177
+ * Default value is "l2".
178
+ *
179
+ * The following distance types are available:
180
+ *
181
+ * "l2" - Euclidean distance. This is a very common distance metric that
182
+ * accounts for both magnitude and direction when determining the distance
183
+ * between vectors. l2 distance has a range of [0, ∞).
184
+ *
185
+ * "cosine" - Cosine distance. Cosine distance is a distance metric
186
+ * calculated from the cosine similarity between two vectors. Cosine
187
+ * similarity is a measure of similarity between two non-zero vectors of an
188
+ * inner product space. It is defined to equal the cosine of the angle
189
+ * between them. Unlike l2, the cosine distance is not affected by the
190
+ * magnitude of the vectors. Cosine distance has a range of [0, 2].
191
+ *
192
+ * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
193
+ * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
194
+ * l2 norm is 1), then dot distance is equivalent to the cosine distance.
195
+ */
196
+ distanceType?: "l2" | "cosine" | "dot";
197
+ /**
198
+ * The number of IVF partitions to create.
199
+ *
200
+ * For HNSW, we recommend a small number of partitions. Setting this to 1 works
201
+ * well for most tables. For very large tables, training just one HNSW graph
202
+ * will require too much memory. Each partition becomes its own HNSW graph, so
203
+ * setting this value higher reduces the peak memory use of training.
204
+ *
205
+ */
206
+ numPartitions?: number;
207
+ /**
208
+ * Number of sub-vectors of PQ.
209
+ *
210
+ * This value controls how much the vector is compressed during the quantization step.
211
+ * The more sub vectors there are the less the vector is compressed. The default is
212
+ * the dimension of the vector divided by 16. If the dimension is not evenly divisible
213
+ * by 16 we use the dimension divded by 8.
214
+ *
215
+ * The above two cases are highly preferred. Having 8 or 16 values per subvector allows
216
+ * us to use efficient SIMD instructions.
217
+ *
218
+ * If the dimension is not visible by 8 then we use 1 subvector. This is not ideal and
219
+ * will likely result in poor performance.
220
+ *
221
+ */
222
+ numSubVectors?: number;
223
+ /**
224
+ * Max iterations to train kmeans.
225
+ *
226
+ * The default value is 50.
227
+ *
228
+ * When training an IVF index we use kmeans to calculate the partitions. This parameter
229
+ * controls how many iterations of kmeans to run.
230
+ *
231
+ * Increasing this might improve the quality of the index but in most cases the parameter
232
+ * is unused because kmeans will converge with fewer iterations. The parameter is only
233
+ * used in cases where kmeans does not appear to converge. In those cases it is unlikely
234
+ * that setting this larger will lead to the index converging anyways.
235
+ *
236
+ */
237
+ maxIterations?: number;
238
+ /**
239
+ * The rate used to calculate the number of training vectors for kmeans.
240
+ *
241
+ * Default value is 256.
242
+ *
243
+ * When an IVF index is trained, we need to calculate partitions. These are groups
244
+ * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
245
+ *
246
+ * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
247
+ * random sample of the data. This parameter controls the size of the sample. The total
248
+ * number of vectors used to train the index is `sample_rate * num_partitions`.
249
+ *
250
+ * Increasing this value might improve the quality of the index but in most cases the
251
+ * default should be sufficient.
252
+ *
253
+ */
254
+ sampleRate?: number;
255
+ /**
256
+ * The number of neighbors to select for each vector in the HNSW graph.
257
+ *
258
+ * The default value is 20.
259
+ *
260
+ * This value controls the tradeoff between search speed and accuracy.
261
+ * The higher the value the more accurate the search but the slower it will be.
262
+ *
263
+ */
264
+ m?: number;
265
+ /**
266
+ * The number of candidates to evaluate during the construction of the HNSW graph.
267
+ *
268
+ * The default value is 300.
269
+ *
270
+ * This value controls the tradeoff between build speed and accuracy.
271
+ * The higher the value the more accurate the build but the slower it will be.
272
+ * 150 to 300 is the typical range. 100 is a minimum for good quality search
273
+ * results. In most cases, there is no benefit to setting this higher than 500.
274
+ * This value should be set to a value that is not less than `ef` in the search phase.
275
+ *
276
+ */
277
+ efConstruction?: number;
278
+ }
279
+ /**
280
+ * Options to create an `HNSW_SQ` index
281
+ */
282
+ export interface HnswSqOptions {
283
+ /**
284
+ * The distance metric used to train the index.
285
+ *
286
+ * Default value is "l2".
287
+ *
288
+ * The following distance types are available:
289
+ *
290
+ * "l2" - Euclidean distance. This is a very common distance metric that
291
+ * accounts for both magnitude and direction when determining the distance
292
+ * between vectors. l2 distance has a range of [0, ∞).
293
+ *
294
+ * "cosine" - Cosine distance. Cosine distance is a distance metric
295
+ * calculated from the cosine similarity between two vectors. Cosine
296
+ * similarity is a measure of similarity between two non-zero vectors of an
297
+ * inner product space. It is defined to equal the cosine of the angle
298
+ * between them. Unlike l2, the cosine distance is not affected by the
299
+ * magnitude of the vectors. Cosine distance has a range of [0, 2].
300
+ *
301
+ * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
302
+ * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
303
+ * l2 norm is 1), then dot distance is equivalent to the cosine distance.
304
+ */
305
+ distanceType?: "l2" | "cosine" | "dot";
306
+ /**
307
+ * The number of IVF partitions to create.
308
+ *
309
+ * For HNSW, we recommend a small number of partitions. Setting this to 1 works
310
+ * well for most tables. For very large tables, training just one HNSW graph
311
+ * will require too much memory. Each partition becomes its own HNSW graph, so
312
+ * setting this value higher reduces the peak memory use of training.
313
+ *
314
+ */
315
+ numPartitions?: number;
316
+ /**
317
+ * Max iterations to train kmeans.
318
+ *
319
+ * The default value is 50.
320
+ *
321
+ * When training an IVF index we use kmeans to calculate the partitions. This parameter
322
+ * controls how many iterations of kmeans to run.
323
+ *
324
+ * Increasing this might improve the quality of the index but in most cases the parameter
325
+ * is unused because kmeans will converge with fewer iterations. The parameter is only
326
+ * used in cases where kmeans does not appear to converge. In those cases it is unlikely
327
+ * that setting this larger will lead to the index converging anyways.
328
+ *
329
+ */
330
+ maxIterations?: number;
331
+ /**
332
+ * The rate used to calculate the number of training vectors for kmeans.
333
+ *
334
+ * Default value is 256.
335
+ *
336
+ * When an IVF index is trained, we need to calculate partitions. These are groups
337
+ * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
338
+ *
339
+ * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
340
+ * random sample of the data. This parameter controls the size of the sample. The total
341
+ * number of vectors used to train the index is `sample_rate * num_partitions`.
342
+ *
343
+ * Increasing this value might improve the quality of the index but in most cases the
344
+ * default should be sufficient.
345
+ *
346
+ */
347
+ sampleRate?: number;
348
+ /**
349
+ * The number of neighbors to select for each vector in the HNSW graph.
350
+ *
351
+ * The default value is 20.
352
+ *
353
+ * This value controls the tradeoff between search speed and accuracy.
354
+ * The higher the value the more accurate the search but the slower it will be.
355
+ *
356
+ */
357
+ m?: number;
358
+ /**
359
+ * The number of candidates to evaluate during the construction of the HNSW graph.
360
+ *
361
+ * The default value is 300.
362
+ *
363
+ * This value controls the tradeoff between build speed and accuracy.
364
+ * The higher the value the more accurate the build but the slower it will be.
365
+ * 150 to 300 is the typical range. 100 is a minimum for good quality search
366
+ * results. In most cases, there is no benefit to setting this higher than 500.
367
+ * This value should be set to a value that is not less than `ef` in the search phase.
368
+ *
369
+ */
370
+ efConstruction?: number;
371
+ }
372
+ /**
373
+ * Options to create an `IVF_FLAT` index
374
+ */
375
+ export interface IvfFlatOptions {
376
+ /**
377
+ * The number of IVF partitions to create.
378
+ *
379
+ * This value should generally scale with the number of rows in the dataset.
380
+ * By default the number of partitions is the square root of the number of
381
+ * rows.
382
+ *
383
+ * If this value is too large then the first part of the search (picking the
384
+ * right partition) will be slow. If this value is too small then the second
385
+ * part of the search (searching within a partition) will be slow.
386
+ */
387
+ numPartitions?: number;
388
+ /**
389
+ * Distance type to use to build the index.
390
+ *
391
+ * Default value is "l2".
392
+ *
393
+ * This is used when training the index to calculate the IVF partitions
394
+ * (vectors are grouped in partitions with similar vectors according to this
395
+ * distance type).
396
+ *
397
+ * The distance type used to train an index MUST match the distance type used
398
+ * to search the index. Failure to do so will yield inaccurate results.
399
+ *
400
+ * The following distance types are available:
401
+ *
402
+ * "l2" - Euclidean distance. This is a very common distance metric that
403
+ * accounts for both magnitude and direction when determining the distance
404
+ * between vectors. l2 distance has a range of [0, ∞).
405
+ *
406
+ * "cosine" - Cosine distance. Cosine distance is a distance metric
407
+ * calculated from the cosine similarity between two vectors. Cosine
408
+ * similarity is a measure of similarity between two non-zero vectors of an
409
+ * inner product space. It is defined to equal the cosine of the angle
410
+ * between them. Unlike l2, the cosine distance is not affected by the
411
+ * magnitude of the vectors. Cosine distance has a range of [0, 2].
412
+ *
413
+ * Note: the cosine distance is undefined when one (or both) of the vectors
414
+ * are all zeros (there is no direction). These vectors are invalid and may
415
+ * never be returned from a vector search.
416
+ *
417
+ * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
418
+ * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
419
+ * l2 norm is 1), then dot distance is equivalent to the cosine distance.
420
+ *
421
+ * "hamming" - Hamming distance. Hamming distance is a distance metric
422
+ * calculated from the number of bits that are different between two vectors.
423
+ * Hamming distance has a range of [0, dimension]. Note that the hamming distance
424
+ * is only valid for binary vectors.
425
+ */
426
+ distanceType?: "l2" | "cosine" | "dot" | "hamming";
427
+ /**
428
+ * Max iteration to train IVF kmeans.
429
+ *
430
+ * When training an IVF FLAT index we use kmeans to calculate the partitions. This parameter
431
+ * controls how many iterations of kmeans to run.
432
+ *
433
+ * Increasing this might improve the quality of the index but in most cases these extra
434
+ * iterations have diminishing returns.
435
+ *
436
+ * The default value is 50.
437
+ */
438
+ maxIterations?: number;
439
+ /**
440
+ * The number of vectors, per partition, to sample when training IVF kmeans.
441
+ *
442
+ * When an IVF FLAT index is trained, we need to calculate partitions. These are groups
443
+ * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
444
+ *
445
+ * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
446
+ * random sample of the data. This parameter controls the size of the sample. The total
447
+ * number of vectors used to train the index is `sample_rate * num_partitions`.
448
+ *
449
+ * Increasing this value might improve the quality of the index but in most cases the
450
+ * default should be sufficient.
451
+ *
452
+ * The default value is 256.
453
+ */
454
+ sampleRate?: number;
455
+ }
456
+ /**
457
+ * Options to create a full text search index
458
+ */
459
+ export interface FtsOptions {
460
+ /**
461
+ * Whether to build the index with positions.
462
+ * True by default.
463
+ * If set to false, the index will not store the positions of the tokens in the text,
464
+ * which will make the index smaller and faster to build, but will not support phrase queries.
465
+ */
466
+ withPosition?: boolean;
467
+ /**
468
+ * The tokenizer to use when building the index.
469
+ * The default is "simple".
470
+ *
471
+ * The following tokenizers are available:
472
+ *
473
+ * "simple" - Simple tokenizer. This tokenizer splits the text into tokens using whitespace and punctuation as a delimiter.
474
+ *
475
+ * "whitespace" - Whitespace tokenizer. This tokenizer splits the text into tokens using whitespace as a delimiter.
476
+ *
477
+ * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
478
+ */
479
+ baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
480
+ /**
481
+ * language for stemming and stop words
482
+ * this is only used when `stem` or `remove_stop_words` is true
483
+ */
484
+ language?: string;
485
+ /**
486
+ * maximum token length
487
+ * tokens longer than this length will be ignored
488
+ */
489
+ maxTokenLength?: number;
490
+ /**
491
+ * whether to lowercase tokens
492
+ */
493
+ lowercase?: boolean;
494
+ /**
495
+ * whether to stem tokens
496
+ */
497
+ stem?: boolean;
498
+ /**
499
+ * whether to remove stop words
500
+ */
501
+ removeStopWords?: boolean;
502
+ /**
503
+ * whether to remove punctuation
504
+ */
505
+ asciiFolding?: boolean;
506
+ /**
507
+ * ngram min length
508
+ */
509
+ ngramMinLength?: number;
510
+ /**
511
+ * ngram max length
512
+ */
513
+ ngramMaxLength?: number;
514
+ /**
515
+ * whether to only index the prefix of the token for ngram tokenizer
516
+ */
517
+ prefixOnly?: boolean;
518
+ }
519
+ export declare class Index {
520
+ private readonly inner;
521
+ private constructor();
522
+ /**
523
+ * Create an IvfPq index
524
+ *
525
+ * This index stores a compressed (quantized) copy of every vector. These vectors
526
+ * are grouped into partitions of similar vectors. Each partition keeps track of
527
+ * a centroid which is the average value of all vectors in the group.
528
+ *
529
+ * During a query the centroids are compared with the query vector to find the closest
530
+ * partitions. The compressed vectors in these partitions are then searched to find
531
+ * the closest vectors.
532
+ *
533
+ * The compression scheme is called product quantization. Each vector is divided into
534
+ * subvectors and then each subvector is quantized into a small number of bits. the
535
+ * parameters `num_bits` and `num_subvectors` control this process, providing a tradeoff
536
+ * between index size (and thus search speed) and index accuracy.
537
+ *
538
+ * The partitioning process is called IVF and the `num_partitions` parameter controls how
539
+ * many groups to create.
540
+ *
541
+ * Note that training an IVF PQ index on a large dataset is a slow operation and
542
+ * currently is also a memory intensive operation.
543
+ */
544
+ static ivfPq(options?: Partial<IvfPqOptions>): Index;
545
+ /**
546
+ * Create an IvfRq index
547
+ *
548
+ * IVF-RQ (RabitQ Quantization) compresses vectors using RabitQ quantization
549
+ * and organizes them into IVF partitions.
550
+ *
551
+ * The compression scheme is called RabitQ quantization. Each dimension is quantized into a small number of bits.
552
+ * The parameters `num_bits` and `num_partitions` control this process, providing a tradeoff
553
+ * between index size (and thus search speed) and index accuracy.
554
+ *
555
+ * The partitioning process is called IVF and the `num_partitions` parameter controls how
556
+ * many groups to create.
557
+ *
558
+ * Note that training an IVF RQ index on a large dataset is a slow operation and
559
+ * currently is also a memory intensive operation.
560
+ */
561
+ static ivfRq(options?: Partial<IvfRqOptions>): Index;
562
+ /**
563
+ * Create an IvfFlat index
564
+ *
565
+ * This index groups vectors into partitions of similar vectors. Each partition keeps track of
566
+ * a centroid which is the average value of all vectors in the group.
567
+ *
568
+ * During a query the centroids are compared with the query vector to find the closest
569
+ * partitions. The vectors in these partitions are then searched to find
570
+ * the closest vectors.
571
+ *
572
+ * The partitioning process is called IVF and the `num_partitions` parameter controls how
573
+ * many groups to create.
574
+ *
575
+ * Note that training an IVF FLAT index on a large dataset is a slow operation and
576
+ * currently is also a memory intensive operation.
577
+ */
578
+ static ivfFlat(options?: Partial<IvfFlatOptions>): Index;
579
+ /**
580
+ * Create a btree index
581
+ *
582
+ * A btree index is an index on a scalar columns. The index stores a copy of the column
583
+ * in sorted order. A header entry is created for each block of rows (currently the
584
+ * block size is fixed at 4096). These header entries are stored in a separate
585
+ * cacheable structure (a btree). To search for data the header is used to determine
586
+ * which blocks need to be read from disk.
587
+ *
588
+ * For example, a btree index in a table with 1Bi rows requires sizeof(Scalar) * 256Ki
589
+ * bytes of memory and will generally need to read sizeof(Scalar) * 4096 bytes to find
590
+ * the correct row ids.
591
+ *
592
+ * This index is good for scalar columns with mostly distinct values and does best when
593
+ * the query is highly selective.
594
+ *
595
+ * The btree index does not currently have any parameters though parameters such as the
596
+ * block size may be added in the future.
597
+ */
598
+ static btree(): Index;
599
+ /**
600
+ * Create a bitmap index.
601
+ *
602
+ * A `Bitmap` index stores a bitmap for each distinct value in the column for every row.
603
+ *
604
+ * This index works best for low-cardinality columns, where the number of unique values
605
+ * is small (i.e., less than a few hundreds).
606
+ */
607
+ static bitmap(): Index;
608
+ /**
609
+ * Create a label list index.
610
+ *
611
+ * LabelList index is a scalar index that can be used on `List<T>` columns to
612
+ * support queries with `array_contains_all` and `array_contains_any`
613
+ * using an underlying bitmap index.
614
+ */
615
+ static labelList(): Index;
616
+ /**
617
+ * Create a full text search index
618
+ *
619
+ * A full text search index is an index on a string column, so that you can conduct full
620
+ * text searches on the column.
621
+ *
622
+ * The results of a full text search are ordered by relevance measured by BM25.
623
+ *
624
+ * You can combine filters with full text search.
625
+ */
626
+ static fts(options?: Partial<FtsOptions>): Index;
627
+ /**
628
+ *
629
+ * Create a hnswPq index
630
+ *
631
+ * HNSW-PQ stands for Hierarchical Navigable Small World - Product Quantization.
632
+ * It is a variant of the HNSW algorithm that uses product quantization to compress
633
+ * the vectors.
634
+ *
635
+ */
636
+ static hnswPq(options?: Partial<HnswPqOptions>): Index;
637
+ /**
638
+ *
639
+ * Create a hnswSq index
640
+ *
641
+ * HNSW-SQ stands for Hierarchical Navigable Small World - Scalar Quantization.
642
+ * It is a variant of the HNSW algorithm that uses scalar quantization to compress
643
+ * the vectors.
644
+ *
645
+ */
646
+ static hnswSq(options?: Partial<HnswSqOptions>): Index;
647
+ }
648
+ export interface IndexOptions {
649
+ /**
650
+ * Advanced index configuration
651
+ *
652
+ * This option allows you to specify a specfic index to create and also
653
+ * allows you to pass in configuration for training the index.
654
+ *
655
+ * See the static methods on Index for details on the various index types.
656
+ *
657
+ * If this is not supplied then column data type(s) and column statistics
658
+ * will be used to determine the most useful kind of index to create.
659
+ */
660
+ config?: Index;
661
+ /**
662
+ * Whether to replace the existing index
663
+ *
664
+ * If this is false, and another index already exists on the same columns
665
+ * and the same name, then an error will be returned. This is true even if
666
+ * that index is out of date.
667
+ *
668
+ * The default is true
669
+ */
670
+ replace?: boolean;
671
+ /**
672
+ * Timeout in seconds to wait for index creation to complete.
673
+ *
674
+ * If not specified, the method will return immediately after starting the index creation.
675
+ */
676
+ waitTimeoutSeconds?: number;
677
+ /**
678
+ * Optional custom name for the index.
679
+ *
680
+ * If not provided, a default name will be generated based on the column name.
681
+ */
682
+ name?: string;
683
+ /**
684
+ * Whether to train the index with existing data.
685
+ *
686
+ * If true (default), the index will be trained with existing data in the table.
687
+ * If false, the index will be created empty and populated as new data is added.
688
+ *
689
+ * Note: This option is only supported for scalar indices. Vector indices always train.
690
+ */
691
+ train?: boolean;
692
+ }