@lancedb/lancedb 0.10.0 → 0.11.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,8 +30,8 @@ export interface CreateTableOptions {
30
30
  /**
31
31
  * The version of the data storage format to use.
32
32
  *
33
- * The default is `legacy`, which is Lance format v1.
34
- * `stable` is the new format, which is Lance format v2.
33
+ * The default is `stable`.
34
+ * Set to "legacy" to use the old format.
35
35
  */
36
36
  dataStorageVersion?: string;
37
37
  /**
@@ -45,9 +45,9 @@ export interface CreateTableOptions {
45
45
  /**
46
46
  * If true then data files will be written with the legacy format
47
47
  *
48
- * The default is true while the new format is in beta
48
+ * The default is false.
49
49
  *
50
- * Deprecated.
50
+ * Deprecated. Use `dataStorageVersion` instead.
51
51
  */
52
52
  useLegacyFormat?: boolean;
53
53
  schema?: SchemaLike;
@@ -73,7 +73,7 @@ class LocalConnection extends Connection {
73
73
  throw new Error("data is required");
74
74
  }
75
75
  const { buf, mode } = await table_1.Table.parseTableData(data, options);
76
- let dataStorageVersion = "legacy";
76
+ let dataStorageVersion = "stable";
77
77
  if (options?.dataStorageVersion !== undefined) {
78
78
  dataStorageVersion = options.dataStorageVersion;
79
79
  }
@@ -95,7 +95,7 @@ class LocalConnection extends Connection {
95
95
  const registry = (0, registry_1.getRegistry)();
96
96
  metadata = registry.getTableMetadata([embeddingFunction]);
97
97
  }
98
- let dataStorageVersion = "legacy";
98
+ let dataStorageVersion = "stable";
99
99
  if (options?.dataStorageVersion !== undefined) {
100
100
  dataStorageVersion = options.dataStorageVersion;
101
101
  }
package/dist/indices.d.ts CHANGED
@@ -92,21 +92,206 @@ export interface IvfPqOptions {
92
92
  */
93
93
  sampleRate?: number;
94
94
  }
95
+ /**
96
+ * Options to create an `HNSW_PQ` index
97
+ */
95
98
  export interface HnswPqOptions {
99
+ /**
100
+ * The distance metric used to train the index.
101
+ *
102
+ * Default value is "l2".
103
+ *
104
+ * The following distance types are available:
105
+ *
106
+ * "l2" - Euclidean distance. This is a very common distance metric that
107
+ * accounts for both magnitude and direction when determining the distance
108
+ * between vectors. L2 distance has a range of [0, ∞).
109
+ *
110
+ * "cosine" - Cosine distance. Cosine distance is a distance metric
111
+ * calculated from the cosine similarity between two vectors. Cosine
112
+ * similarity is a measure of similarity between two non-zero vectors of an
113
+ * inner product space. It is defined to equal the cosine of the angle
114
+ * between them. Unlike L2, the cosine distance is not affected by the
115
+ * magnitude of the vectors. Cosine distance has a range of [0, 2].
116
+ *
117
+ * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
118
+ * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
119
+ * L2 norm is 1), then dot distance is equivalent to the cosine distance.
120
+ */
96
121
  distanceType?: "l2" | "cosine" | "dot";
122
+ /**
123
+ * The number of IVF partitions to create.
124
+ *
125
+ * For HNSW, we recommend a small number of partitions. Setting this to 1 works
126
+ * well for most tables. For very large tables, training just one HNSW graph
127
+ * will require too much memory. Each partition becomes its own HNSW graph, so
128
+ * setting this value higher reduces the peak memory use of training.
129
+ *
130
+ */
97
131
  numPartitions?: number;
132
+ /**
133
+ * Number of sub-vectors of PQ.
134
+ *
135
+ * This value controls how much the vector is compressed during the quantization step.
136
+ * The more sub vectors there are the less the vector is compressed. The default is
137
+ * the dimension of the vector divided by 16. If the dimension is not evenly divisible
138
+ * by 16 we use the dimension divided by 8.
139
+ *
140
+ * The above two cases are highly preferred. Having 8 or 16 values per subvector allows
141
+ * us to use efficient SIMD instructions.
142
+ *
143
+ * If the dimension is not divisible by 8 then we use 1 subvector. This is not ideal and
144
+ * will likely result in poor performance.
145
+ *
146
+ */
98
147
  numSubVectors?: number;
148
+ /**
149
+ * Max iterations to train kmeans.
150
+ *
151
+ * The default value is 50.
152
+ *
153
+ * When training an IVF index we use kmeans to calculate the partitions. This parameter
154
+ * controls how many iterations of kmeans to run.
155
+ *
156
+ * Increasing this might improve the quality of the index but in most cases the parameter
157
+ * is unused because kmeans will converge with fewer iterations. The parameter is only
158
+ * used in cases where kmeans does not appear to converge. In those cases it is unlikely
159
+ * that setting this larger will lead to the index converging anyways.
160
+ *
161
+ */
99
162
  maxIterations?: number;
163
+ /**
164
+ * The rate used to calculate the number of training vectors for kmeans.
165
+ *
166
+ * Default value is 256.
167
+ *
168
+ * When an IVF index is trained, we need to calculate partitions. These are groups
169
+ * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
170
+ *
171
+ * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
172
+ * random sample of the data. This parameter controls the size of the sample. The total
173
+ * number of vectors used to train the index is `sampleRate * numPartitions`.
174
+ *
175
+ * Increasing this value might improve the quality of the index but in most cases the
176
+ * default should be sufficient.
177
+ *
178
+ */
100
179
  sampleRate?: number;
180
+ /**
181
+ * The number of neighbors to select for each vector in the HNSW graph.
182
+ *
183
+ * The default value is 20.
184
+ *
185
+ * This value controls the tradeoff between search speed and accuracy.
186
+ * The higher the value the more accurate the search but the slower it will be.
187
+ *
188
+ */
101
189
  m?: number;
190
+ /**
191
+ * The number of candidates to evaluate during the construction of the HNSW graph.
192
+ *
193
+ * The default value is 300.
194
+ *
195
+ * This value controls the tradeoff between build speed and accuracy.
196
+ * The higher the value the more accurate the build but the slower it will be.
197
+ * 150 to 300 is the typical range. 100 is a minimum for good quality search
198
+ * results. In most cases, there is no benefit to setting this higher than 500.
199
+ * This value should be set to a value that is not less than `ef` in the search phase.
200
+ *
201
+ */
102
202
  efConstruction?: number;
103
203
  }
204
+ /**
205
+ * Options to create an `HNSW_SQ` index
206
+ */
104
207
  export interface HnswSqOptions {
208
+ /**
209
+ * The distance metric used to train the index.
210
+ *
211
+ * Default value is "l2".
212
+ *
213
+ * The following distance types are available:
214
+ *
215
+ * "l2" - Euclidean distance. This is a very common distance metric that
216
+ * accounts for both magnitude and direction when determining the distance
217
+ * between vectors. L2 distance has a range of [0, ∞).
218
+ *
219
+ * "cosine" - Cosine distance. Cosine distance is a distance metric
220
+ * calculated from the cosine similarity between two vectors. Cosine
221
+ * similarity is a measure of similarity between two non-zero vectors of an
222
+ * inner product space. It is defined to equal the cosine of the angle
223
+ * between them. Unlike L2, the cosine distance is not affected by the
224
+ * magnitude of the vectors. Cosine distance has a range of [0, 2].
225
+ *
226
+ * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
227
+ * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
228
+ * L2 norm is 1), then dot distance is equivalent to the cosine distance.
229
+ */
105
230
  distanceType?: "l2" | "cosine" | "dot";
231
+ /**
232
+ * The number of IVF partitions to create.
233
+ *
234
+ * For HNSW, we recommend a small number of partitions. Setting this to 1 works
235
+ * well for most tables. For very large tables, training just one HNSW graph
236
+ * will require too much memory. Each partition becomes its own HNSW graph, so
237
+ * setting this value higher reduces the peak memory use of training.
238
+ *
239
+ */
106
240
  numPartitions?: number;
241
+ /**
242
+ * Max iterations to train kmeans.
243
+ *
244
+ * The default value is 50.
245
+ *
246
+ * When training an IVF index we use kmeans to calculate the partitions. This parameter
247
+ * controls how many iterations of kmeans to run.
248
+ *
249
+ * Increasing this might improve the quality of the index but in most cases the parameter
250
+ * is unused because kmeans will converge with fewer iterations. The parameter is only
251
+ * used in cases where kmeans does not appear to converge. In those cases it is unlikely
252
+ * that setting this larger will lead to the index converging anyways.
253
+ *
254
+ */
107
255
  maxIterations?: number;
256
+ /**
257
+ * The rate used to calculate the number of training vectors for kmeans.
258
+ *
259
+ * Default value is 256.
260
+ *
261
+ * When an IVF index is trained, we need to calculate partitions. These are groups
262
+ * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
263
+ *
264
+ * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
265
+ * random sample of the data. This parameter controls the size of the sample. The total
266
+ * number of vectors used to train the index is `sampleRate * numPartitions`.
267
+ *
268
+ * Increasing this value might improve the quality of the index but in most cases the
269
+ * default should be sufficient.
270
+ *
271
+ */
108
272
  sampleRate?: number;
273
+ /**
274
+ * The number of neighbors to select for each vector in the HNSW graph.
275
+ *
276
+ * The default value is 20.
277
+ *
278
+ * This value controls the tradeoff between search speed and accuracy.
279
+ * The higher the value the more accurate the search but the slower it will be.
280
+ *
281
+ */
109
282
  m?: number;
283
+ /**
284
+ * The number of candidates to evaluate during the construction of the HNSW graph.
285
+ *
286
+ * The default value is 300.
287
+ *
288
+ * This value controls the tradeoff between build speed and accuracy.
289
+ * The higher the value the more accurate the build but the slower it will be.
290
+ * 150 to 300 is the typical range. 100 is a minimum for good quality search
291
+ * results. In most cases, there is no benefit to setting this higher than 500.
292
+ * This value should be set to a value that is not less than `ef` in the search phase.
293
+ *
294
+ */
110
295
  efConstruction?: number;
111
296
  }
112
297
  /**
@@ -119,7 +304,7 @@ export interface FtsOptions {
119
304
  * If set to false, the index will not store the positions of the tokens in the text,
120
305
  * which will make the index smaller and faster to build, but will not support phrase queries.
121
306
  */
122
- withPositions?: boolean;
307
+ withPosition?: boolean;
123
308
  }
124
309
  export declare class Index {
125
310
  private readonly inner;
@@ -199,13 +384,21 @@ export declare class Index {
199
384
  static fts(options?: Partial<FtsOptions>): Index;
200
385
  /**
201
386
  *
202
- * Create a hnswpq index
387
+ * Create a hnswPq index
388
+ *
389
+ * HNSW-PQ stands for Hierarchical Navigable Small World - Product Quantization.
390
+ * It is a variant of the HNSW algorithm that uses product quantization to compress
391
+ * the vectors.
203
392
  *
204
393
  */
205
394
  static hnswPq(options?: Partial<HnswPqOptions>): Index;
206
395
  /**
207
396
  *
208
- * Create a hnswsq index
397
+ * Create a hnswSq index
398
+ *
399
+ * HNSW-SQ stands for Hierarchical Navigable Small World - Scalar Quantization.
400
+ * It is a variant of the HNSW algorithm that uses scalar quantization to compress
401
+ * the vectors.
209
402
  *
210
403
  */
211
404
  static hnswSq(options?: Partial<HnswSqOptions>): Index;
package/dist/indices.js CHANGED
@@ -101,11 +101,15 @@ class Index {
101
101
  * For now, the full text search index only supports English, and doesn't support phrase search.
102
102
  */
103
103
  static fts(options) {
104
- return new Index(native_1.Index.fts(options?.withPositions));
104
+ return new Index(native_1.Index.fts(options?.withPosition));
105
105
  }
106
106
  /**
107
107
  *
108
- * Create a hnswpq index
108
+ * Create a hnswPq index
109
+ *
110
+ * HNSW-PQ stands for Hierarchical Navigable Small World - Product Quantization.
111
+ * It is a variant of the HNSW algorithm that uses product quantization to compress
112
+ * the vectors.
109
113
  *
110
114
  */
111
115
  static hnswPq(options) {
@@ -113,7 +117,11 @@ class Index {
113
117
  }
114
118
  /**
115
119
  *
116
- * Create a hnswsq index
120
+ * Create a hnswSq index
121
+ *
122
+ * HNSW-SQ stands for Hierarchical Navigable Small World - Scalar Quantization.
123
+ * It is a variant of the HNSW algorithm that uses scalar quantization to compress
124
+ * the vectors.
117
125
  *
118
126
  */
119
127
  static hnswSq(options) {
package/dist/native.d.ts CHANGED
@@ -202,7 +202,7 @@ export class Table {
202
202
  countRows(filter?: string | undefined | null): Promise<number>
203
203
  delete(predicate: string): Promise<void>
204
204
  createIndex(index: Index | undefined | null, column: string, replace?: boolean | undefined | null): Promise<void>
205
- update(onlyIf: string | undefined | null, columns: Array<[string, string]>): Promise<void>
205
+ update(onlyIf: string | undefined | null, columns: Array<[string, string]>): Promise<bigint>
206
206
  query(): Query
207
207
  vectorSearch(vector: Float32Array): VectorQuery
208
208
  addColumns(transforms: Array<AddColumnsSql>): Promise<void>
package/package.json CHANGED
@@ -10,7 +10,7 @@
10
10
  "vector database",
11
11
  "ann"
12
12
  ],
13
- "version": "0.10.0",
13
+ "version": "0.11.0-beta.0",
14
14
  "main": "dist/index.js",
15
15
  "exports": {
16
16
  ".": "./dist/index.js",
@@ -92,11 +92,11 @@
92
92
  "reflect-metadata": "^0.2.2"
93
93
  },
94
94
  "optionalDependencies": {
95
- "@lancedb/lancedb-darwin-arm64": "0.10.0",
96
- "@lancedb/lancedb-linux-arm64-gnu": "0.10.0",
97
- "@lancedb/lancedb-darwin-x64": "0.10.0",
98
- "@lancedb/lancedb-linux-x64-gnu": "0.10.0",
99
- "@lancedb/lancedb-win32-x64-msvc": "0.10.0"
95
+ "@lancedb/lancedb-darwin-arm64": "0.11.0-beta.0",
96
+ "@lancedb/lancedb-linux-arm64-gnu": "0.11.0-beta.0",
97
+ "@lancedb/lancedb-darwin-x64": "0.11.0-beta.0",
98
+ "@lancedb/lancedb-linux-x64-gnu": "0.11.0-beta.0",
99
+ "@lancedb/lancedb-win32-x64-msvc": "0.11.0-beta.0"
100
100
  },
101
101
  "peerDependencies": {
102
102
  "apache-arrow": ">=13.0.0 <=17.0.0"