@lancedb/lancedb 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/arrow.d.ts CHANGED
@@ -3,6 +3,7 @@ import { Table as ArrowTable, Binary, DataType, FixedSizeBinary, FixedSizeList,
3
3
  import { type EmbeddingFunction } from "./embedding/embedding_function";
4
4
  import { EmbeddingFunctionConfig } from "./embedding/registry";
5
5
  export * from "apache-arrow";
6
+ export type IntoVector = Float32Array | Float64Array | number[];
6
7
  export declare function isArrowTable(value: object): value is ArrowTable;
7
8
  export declare function isDataType(value: unknown): value is DataType;
8
9
  export declare function isNull(value: unknown): value is Null;
@@ -36,6 +37,7 @@ export declare class MakeArrowTableOptions {
36
37
  schema?: Schema;
37
38
  vectorColumns: Record<string, VectorColumnOptions>;
38
39
  embeddings?: EmbeddingFunction<unknown>;
40
+ embeddingFunction?: EmbeddingFunctionConfig;
39
41
  /**
40
42
  * If true then string columns will be encoded with dictionary encoding
41
43
  *
package/dist/arrow.js CHANGED
@@ -184,6 +184,7 @@ class MakeArrowTableOptions {
184
184
  vector: new VectorColumnOptions(),
185
185
  };
186
186
  embeddings;
187
+ embeddingFunction;
187
188
  /**
188
189
  * If true then string columns will be encoded with dictionary encoding
189
190
  *
@@ -299,7 +300,7 @@ function makeArrowTable(data, options, metadata) {
299
300
  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
300
301
  if (opt.schema !== undefined && opt.schema !== null) {
301
302
  opt.schema = (0, sanitize_1.sanitizeSchema)(opt.schema);
302
- opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
303
+ opt.schema = validateSchemaEmbeddings(opt.schema, data, options?.embeddingFunction);
303
304
  }
304
305
  const columns = {};
305
306
  // TODO: sample dataset to find missing columns
@@ -40,6 +40,12 @@ export interface CreateTableOptions {
40
40
  * The available options are described at https://lancedb.github.io/lancedb/guides/storage/
41
41
  */
42
42
  storageOptions?: Record<string, string>;
43
+ /**
44
+ * If true then data files will be written with the legacy format
45
+ *
46
+ * The default is true while the new format is in beta
47
+ */
48
+ useLegacyFormat?: boolean;
43
49
  schema?: Schema;
44
50
  embeddingFunction?: EmbeddingFunctionConfig;
45
51
  }
@@ -116,7 +116,7 @@ class Connection {
116
116
  table = (0, arrow_1.makeArrowTable)(data, options);
117
117
  }
118
118
  const buf = await (0, arrow_1.fromTableToBuffer)(table, options?.embeddingFunction, options?.schema);
119
- const innerTable = await this.inner.createTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
119
+ const innerTable = await this.inner.createTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions), options?.useLegacyFormat);
120
120
  return new table_1.Table(innerTable);
121
121
  }
122
122
  /**
@@ -138,7 +138,7 @@ class Connection {
138
138
  }
139
139
  const table = (0, arrow_1.makeEmptyTable)(schema, metadata);
140
140
  const buf = await (0, arrow_1.fromTableToBuffer)(table);
141
- const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
141
+ const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions), options?.useLegacyFormat);
142
142
  return new table_1.Table(innerTable);
143
143
  }
144
144
  /**
@@ -1,5 +1,5 @@
1
1
  import "reflect-metadata";
2
- import { DataType, Float } from "../arrow";
2
+ import { DataType, Float, type IntoVector } from "../arrow";
3
3
  /**
4
4
  * Options for a given embedding function
5
5
  */
@@ -51,7 +51,7 @@ export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptio
51
51
  *
52
52
  * @see {@link lancedb.LanceSchema}
53
53
  */
54
- vectorField(options?: Partial<FieldOptions>): [DataType, Map<string, EmbeddingFunction>];
54
+ vectorField(optionsOrDatatype?: Partial<FieldOptions> | DataType): [DataType, Map<string, EmbeddingFunction>];
55
55
  /** The number of dimensions of the embeddings */
56
56
  ndims(): number | undefined;
57
57
  /** The datatype of the embeddings */
@@ -63,7 +63,7 @@ export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptio
63
63
  /**
64
64
  Compute the embeddings for a single query
65
65
  */
66
- computeQueryEmbeddings(data: T): Promise<number[] | Float32Array | Float64Array>;
66
+ computeQueryEmbeddings(data: T): Promise<IntoVector>;
67
67
  }
68
68
  export interface FieldOptions<T extends DataType = DataType> {
69
69
  datatype: T;
@@ -47,32 +47,50 @@ class EmbeddingFunction {
47
47
  *
48
48
  * @see {@link lancedb.LanceSchema}
49
49
  */
50
- vectorField(options) {
50
+ vectorField(optionsOrDatatype) {
51
51
  let dtype;
52
- const dims = this.ndims() ?? options?.dims;
53
- if (!options?.datatype) {
54
- if (dims === undefined) {
55
- throw new Error("ndims is required for vector field");
56
- }
57
- dtype = new arrow_1.FixedSizeList(dims, new arrow_1.Field("item", new arrow_1.Float32(), true));
52
+ let vectorType;
53
+ let dims = this.ndims();
54
+ // `func.vectorField(new Float32())`
55
+ if ((0, arrow_1.isDataType)(optionsOrDatatype)) {
56
+ dtype = optionsOrDatatype;
58
57
  }
59
58
  else {
60
- if ((0, arrow_1.isFixedSizeList)(options.datatype)) {
61
- dtype = options.datatype;
59
+ // `func.vectorField({
60
+ // datatype: new Float32(),
61
+ // dims: 10
62
+ // })`
63
+ dims = dims ?? optionsOrDatatype?.dims;
64
+ dtype = optionsOrDatatype?.datatype;
65
+ }
66
+ if (dtype !== undefined) {
67
+ // `func.vectorField(new FixedSizeList(dims, new Field("item", new Float32(), true)))`
68
+ // or `func.vectorField({datatype: new FixedSizeList(dims, new Field("item", new Float32(), true))})`
69
+ if ((0, arrow_1.isFixedSizeList)(dtype)) {
70
+ vectorType = dtype;
71
+ // `func.vectorField(new Float32())`
72
+ // or `func.vectorField({datatype: new Float32()})`
62
73
  }
63
- else if ((0, arrow_1.isFloat)(options.datatype)) {
74
+ else if ((0, arrow_1.isFloat)(dtype)) {
75
+ // No `ndims` impl and no `{dims: n}` provided;
64
76
  if (dims === undefined) {
65
77
  throw new Error("ndims is required for vector field");
66
78
  }
67
- dtype = (0, arrow_1.newVectorType)(dims, options.datatype);
79
+ vectorType = (0, arrow_1.newVectorType)(dims, dtype);
68
80
  }
69
81
  else {
70
82
  throw new Error("Expected FixedSizeList or Float as datatype for vector field");
71
83
  }
72
84
  }
85
+ else {
86
+ if (dims === undefined) {
87
+ throw new Error("ndims is required for vector field");
88
+ }
89
+ vectorType = new arrow_1.FixedSizeList(dims, new arrow_1.Field("item", new arrow_1.Float32(), true));
90
+ }
73
91
  const metadata = new Map();
74
92
  metadata.set("vector_column_for", this);
75
- return [dtype, metadata];
93
+ return [vectorType, metadata];
76
94
  }
77
95
  /** The number of dimensions of the embeddings */
78
96
  ndims() {
@@ -21,6 +21,7 @@ export declare class EmbeddingFunctionRegistry {
21
21
  * Register an embedding function
22
22
  * @param name The name of the function
23
23
  * @param func The function to register
24
+ * @throws Error if the function is already registered
24
25
  */
25
26
  register<T extends EmbeddingFunctionFactory = EmbeddingFunctionFactory>(this: EmbeddingFunctionRegistry, alias?: string): (ctor: T) => any;
26
27
  /**
@@ -32,6 +33,9 @@ export declare class EmbeddingFunctionRegistry {
32
33
  * reset the registry to the initial state
33
34
  */
34
35
  reset(this: EmbeddingFunctionRegistry): void;
36
+ /**
37
+ * @ignore
38
+ */
35
39
  parseFunctions(this: EmbeddingFunctionRegistry, metadata: Map<string, string>): Map<string, EmbeddingFunctionConfig>;
36
40
  functionToMetadata(conf: EmbeddingFunctionConfig): Record<string, any>;
37
41
  getTableMetadata(functions: EmbeddingFunctionConfig[]): Map<string, string>;
@@ -27,6 +27,7 @@ class EmbeddingFunctionRegistry {
27
27
  * Register an embedding function
28
28
  * @param name The name of the function
29
29
  * @param func The function to register
30
+ * @throws Error if the function is already registered
30
31
  */
31
32
  register(alias) {
32
33
  const self = this;
@@ -63,6 +64,9 @@ class EmbeddingFunctionRegistry {
63
64
  reset() {
64
65
  this.#functions.clear();
65
66
  }
67
+ /**
68
+ * @ignore
69
+ */
66
70
  parseFunctions(metadata) {
67
71
  if (!metadata.has("embedding_functions")) {
68
72
  return new Map();
package/dist/native.d.ts CHANGED
@@ -102,6 +102,7 @@ export const enum WriteMode {
102
102
  }
103
103
  /** Write options when creating a Table. */
104
104
  export interface WriteOptions {
105
+ /** Write mode for writing to a table. */
105
106
  mode?: WriteMode
106
107
  }
107
108
  export interface OpenTableOptions {
@@ -123,8 +124,8 @@ export class Connection {
123
124
  * - buf: The buffer containing the IPC file.
124
125
  *
125
126
  */
126
- createTable(name: string, buf: Buffer, mode: string, storageOptions?: Record<string, string> | undefined | null): Promise<Table>
127
- createEmptyTable(name: string, schemaBuf: Buffer, mode: string, storageOptions?: Record<string, string> | undefined | null): Promise<Table>
127
+ createTable(name: string, buf: Buffer, mode: string, storageOptions?: Record<string, string> | undefined | null, useLegacyFormat?: boolean | undefined | null): Promise<Table>
128
+ createEmptyTable(name: string, schemaBuf: Buffer, mode: string, storageOptions?: Record<string, string> | undefined | null, useLegacyFormat?: boolean | undefined | null): Promise<Table>
128
129
  openTable(name: string, storageOptions?: Record<string, string> | undefined | null, indexCacheSize?: number | undefined | null): Promise<Table>
129
130
  /** Drop table with the name. Or raise an error if the table does not exist. */
130
131
  dropTable(name: string): Promise<void>
@@ -142,7 +143,7 @@ export class Query {
142
143
  select(columns: Array<[string, string]>): void
143
144
  limit(limit: number): void
144
145
  nearestTo(vector: Float32Array): VectorQuery
145
- execute(): Promise<RecordBatchIterator>
146
+ execute(maxBatchLength?: number | undefined | null): Promise<RecordBatchIterator>
146
147
  }
147
148
  export class VectorQuery {
148
149
  column(column: string): void
@@ -154,7 +155,7 @@ export class VectorQuery {
154
155
  onlyIf(predicate: string): void
155
156
  select(columns: Array<[string, string]>): void
156
157
  limit(limit: number): void
157
- execute(): Promise<RecordBatchIterator>
158
+ execute(maxBatchLength?: number | undefined | null): Promise<RecordBatchIterator>
158
159
  }
159
160
  export class Table {
160
161
  display(): string
package/dist/query.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { Table as ArrowTable, RecordBatch } from "./arrow";
1
+ import { Table as ArrowTable, type IntoVector, RecordBatch } from "./arrow";
2
2
  import { RecordBatchIterator as NativeBatchIterator, Query as NativeQuery, Table as NativeTable, VectorQuery as NativeVectorQuery } from "./native";
3
3
  export declare class RecordBatchIterator implements AsyncIterator<RecordBatch> {
4
4
  private promisedInner?;
@@ -6,6 +6,18 @@ export declare class RecordBatchIterator implements AsyncIterator<RecordBatch> {
6
6
  constructor(promise?: Promise<NativeBatchIterator>);
7
7
  next(): Promise<IteratorResult<RecordBatch<any>>>;
8
8
  }
9
+ /**
10
+ * Options that control the behavior of a particular query execution
11
+ */
12
+ export interface QueryExecutionOptions {
13
+ /**
14
+ * The maximum number of rows to return in a single batch
15
+ *
16
+ * Batches may have fewer rows if the underlying data is stored
17
+ * in smaller chunks.
18
+ */
19
+ maxBatchLength?: number;
20
+ }
9
21
  /** Common methods supported by all query types */
10
22
  export declare class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery, QueryType> implements AsyncIterable<RecordBatch> {
11
23
  protected inner: NativeQueryType;
@@ -53,7 +65,7 @@ export declare class QueryBase<NativeQueryType extends NativeQuery | NativeVecto
53
65
  * uses `Object.entries` which should preserve the insertion order of the object. However,
54
66
  * object insertion order is easy to get wrong and `Map` is more foolproof.
55
67
  */
56
- select(columns: string[] | Map<string, string> | Record<string, string>): QueryType;
68
+ select(columns: string[] | Map<string, string> | Record<string, string> | string): QueryType;
57
69
  /**
58
70
  * Set the maximum number of results to return.
59
71
  *
@@ -61,7 +73,7 @@ export declare class QueryBase<NativeQueryType extends NativeQuery | NativeVecto
61
73
  * called then every valid row from the table will be returned.
62
74
  */
63
75
  limit(limit: number): QueryType;
64
- protected nativeExecute(): Promise<NativeBatchIterator>;
76
+ protected nativeExecute(options?: Partial<QueryExecutionOptions>): Promise<NativeBatchIterator>;
65
77
  /**
66
78
  * Execute the query and return the results as an @see {@link AsyncIterator}
67
79
  * of @see {@link RecordBatch}.
@@ -73,12 +85,12 @@ export declare class QueryBase<NativeQueryType extends NativeQuery | NativeVecto
73
85
  * single query)
74
86
  *
75
87
  */
76
- protected execute(): RecordBatchIterator;
88
+ protected execute(options?: Partial<QueryExecutionOptions>): RecordBatchIterator;
77
89
  [Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>>;
78
90
  /** Collect the results as an Arrow @see {@link ArrowTable}. */
79
- toArrow(): Promise<ArrowTable>;
91
+ toArrow(options?: Partial<QueryExecutionOptions>): Promise<ArrowTable>;
80
92
  /** Collect the results as an array of objects. */
81
- toArray(): Promise<unknown[]>;
93
+ toArray(options?: Partial<QueryExecutionOptions>): Promise<any[]>;
82
94
  }
83
95
  /**
84
96
  * An interface for a query that can be executed
@@ -244,5 +256,5 @@ export declare class Query extends QueryBase<NativeQuery, Query> {
244
256
  * Vector searches always have a `limit`. If `limit` has not been called then
245
257
  * a default `limit` of 10 will be used. @see {@link Query#limit}
246
258
  */
247
- nearestTo(vector: unknown): VectorQuery;
259
+ nearestTo(vector: IntoVector): VectorQuery;
248
260
  }
package/dist/query.js CHANGED
@@ -43,6 +43,18 @@ class RecordBatchIterator {
43
43
  }
44
44
  exports.RecordBatchIterator = RecordBatchIterator;
45
45
  /* eslint-enable */
46
+ class RecordBatchIterable {
47
+ inner;
48
+ options;
49
+ constructor(inner, options) {
50
+ this.inner = inner;
51
+ this.options = options;
52
+ }
53
+ // biome-ignore lint/suspicious/noExplicitAny: skip
54
+ [Symbol.asyncIterator]() {
55
+ return new RecordBatchIterator(this.inner.execute(this.options?.maxBatchLength));
56
+ }
57
+ }
46
58
  /** Common methods supported by all query types */
47
59
  class QueryBase {
48
60
  inner;
@@ -98,6 +110,9 @@ class QueryBase {
98
110
  */
99
111
  select(columns) {
100
112
  let columnTuples;
113
+ if (typeof columns === "string") {
114
+ columns = [columns];
115
+ }
101
116
  if (Array.isArray(columns)) {
102
117
  columnTuples = columns.map((c) => [c, c]);
103
118
  }
@@ -120,8 +135,8 @@ class QueryBase {
120
135
  this.inner.limit(limit);
121
136
  return this;
122
137
  }
123
- nativeExecute() {
124
- return this.inner.execute();
138
+ nativeExecute(options) {
139
+ return this.inner.execute(options?.maxBatchLength);
125
140
  }
126
141
  /**
127
142
  * Execute the query and return the results as an @see {@link AsyncIterator}
@@ -134,8 +149,8 @@ class QueryBase {
134
149
  * single query)
135
150
  *
136
151
  */
137
- execute() {
138
- return new RecordBatchIterator(this.nativeExecute());
152
+ execute(options) {
153
+ return new RecordBatchIterator(this.nativeExecute(options));
139
154
  }
140
155
  // biome-ignore lint/suspicious/noExplicitAny: skip
141
156
  [Symbol.asyncIterator]() {
@@ -143,17 +158,17 @@ class QueryBase {
143
158
  return new RecordBatchIterator(promise);
144
159
  }
145
160
  /** Collect the results as an Arrow @see {@link ArrowTable}. */
146
- async toArrow() {
161
+ async toArrow(options) {
147
162
  const batches = [];
148
- for await (const batch of this) {
163
+ for await (const batch of new RecordBatchIterable(this.inner, options)) {
149
164
  batches.push(batch);
150
165
  }
151
166
  return new arrow_1.Table(batches);
152
167
  }
153
168
  /** Collect the results as an array of objects. */
154
- async toArray() {
155
- const tbl = await this.toArrow();
156
- // eslint-disable-next-line @typescript-eslint/no-unsafe-return
169
+ // biome-ignore lint/suspicious/noExplicitAny: arrow.toArrow() returns any[]
170
+ async toArray(options) {
171
+ const tbl = await this.toArrow(options);
157
172
  return tbl.toArray();
158
173
  }
159
174
  }
@@ -339,7 +354,6 @@ class Query extends QueryBase {
339
354
  * a default `limit` of 10 will be used. @see {@link Query#limit}
340
355
  */
341
356
  nearestTo(vector) {
342
- // biome-ignore lint/suspicious/noExplicitAny: skip
343
357
  const vectorQuery = this.inner.nearestTo(Float32Array.from(vector));
344
358
  return new VectorQuery(vectorQuery);
345
359
  }
package/dist/table.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { Data, Schema } from "./arrow";
1
+ import { Table as ArrowTable, Data, IntoVector, Schema } from "./arrow";
2
2
  import { IndexOptions } from "./indices";
3
3
  import { AddColumnsSql, ColumnAlteration, IndexConfig, OptimizeStats, Table as _NativeTable } from "./native";
4
4
  import { Query, VectorQuery } from "./query";
@@ -56,6 +56,7 @@ export interface OptimizeOptions {
56
56
  * collected.
57
57
  */
58
58
  export declare class Table {
59
+ #private;
59
60
  private readonly inner;
60
61
  /** Construct a Table. Internal use only. */
61
62
  constructor(inner: _NativeTable);
@@ -186,6 +187,19 @@ export declare class Table {
186
187
  * @returns {Query} A builder that can be used to parameterize the query
187
188
  */
188
189
  query(): Query;
190
+ /**
191
+ * Create a search query to find the nearest neighbors
192
+ * of the given query vector
193
+ * @param {string} query - the query. This will be converted to a vector using the table's provided embedding function
194
+ * @rejects {Error} If no embedding functions are defined in the table
195
+ */
196
+ search(query: string): Promise<VectorQuery>;
197
+ /**
198
+ * Create a search query to find the nearest neighbors
199
+ * of the given query vector
200
+ * @param {IntoVector} query - the query vector
201
+ */
202
+ search(query: IntoVector): VectorQuery;
189
203
  /**
190
204
  * Search the table with a given query vector.
191
205
  *
@@ -193,7 +207,7 @@ export declare class Table {
193
207
  * is the same thing as calling `nearestTo` on the builder returned
194
208
  * by `query`. @see {@link Query#nearestTo} for more details.
195
209
  */
196
- vectorSearch(vector: unknown): VectorQuery;
210
+ vectorSearch(vector: IntoVector): VectorQuery;
197
211
  /**
198
212
  * Add new columns with defined values.
199
213
  * @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
@@ -301,4 +315,6 @@ export declare class Table {
301
315
  optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats>;
302
316
  /** List all indices that have been created with {@link Table.createIndex} */
303
317
  listIndices(): Promise<IndexConfig[]>;
318
+ /** Return the table as an arrow table */
319
+ toArrow(): Promise<ArrowTable>;
304
320
  }
package/dist/table.js CHANGED
@@ -53,6 +53,11 @@ class Table {
53
53
  display() {
54
54
  return this.inner.display();
55
55
  }
56
+ async #getEmbeddingFunctions() {
57
+ const schema = await this.schema();
58
+ const registry = (0, registry_1.getRegistry)();
59
+ return registry.parseFunctions(schema.metadata);
60
+ }
56
61
  /** Get the schema of the table. */
57
62
  async schema() {
58
63
  const schemaBuf = await this.inner.schema();
@@ -68,7 +73,7 @@ class Table {
68
73
  const schema = await this.schema();
69
74
  const registry = (0, registry_1.getRegistry)();
70
75
  const functions = registry.parseFunctions(schema.metadata);
71
- const buffer = await (0, arrow_1.fromDataToBuffer)(data, functions.values().next().value);
76
+ const buffer = await (0, arrow_1.fromDataToBuffer)(data, functions.values().next().value, schema);
72
77
  await this.inner.add(buffer, mode);
73
78
  }
74
79
  /**
@@ -200,6 +205,24 @@ class Table {
200
205
  query() {
201
206
  return new query_1.Query(this.inner);
202
207
  }
208
+ search(query) {
209
+ if (typeof query !== "string") {
210
+ return this.vectorSearch(query);
211
+ }
212
+ else {
213
+ return this.#getEmbeddingFunctions().then(async (functions) => {
214
+ // TODO: Support multiple embedding functions
215
+ const embeddingFunc = functions
216
+ .values()
217
+ .next().value;
218
+ if (!embeddingFunc) {
219
+ return Promise.reject(new Error("No embedding functions are defined in the table"));
220
+ }
221
+ const embeddings = await embeddingFunc.function.computeQueryEmbeddings(query);
222
+ return this.query().nearestTo(embeddings);
223
+ });
224
+ }
225
+ }
203
226
  /**
204
227
  * Search the table with a given query vector.
205
228
  *
@@ -342,5 +365,9 @@ class Table {
342
365
  async listIndices() {
343
366
  return await this.inner.listIndices();
344
367
  }
368
+ /** Return the table as an arrow table */
369
+ async toArrow() {
370
+ return await this.query().toArrow();
371
+ }
345
372
  }
346
373
  exports.Table = Table;
package/lancedb/arrow.ts CHANGED
@@ -31,7 +31,7 @@ import {
31
31
  Schema,
32
32
  Struct,
33
33
  Utf8,
34
- type Vector,
34
+ Vector,
35
35
  makeBuilder,
36
36
  makeData,
37
37
  type makeTable,
@@ -42,6 +42,8 @@ import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
42
42
  import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
43
43
  export * from "apache-arrow";
44
44
 
45
+ export type IntoVector = Float32Array | Float64Array | number[];
46
+
45
47
  export function isArrowTable(value: object): value is ArrowTable {
46
48
  if (value instanceof ArrowTable) return true;
47
49
  return "schema" in value && "batches" in value;
@@ -182,6 +184,7 @@ export class MakeArrowTableOptions {
182
184
  vector: new VectorColumnOptions(),
183
185
  };
184
186
  embeddings?: EmbeddingFunction<unknown>;
187
+ embeddingFunction?: EmbeddingFunctionConfig;
185
188
 
186
189
  /**
187
190
  * If true then string columns will be encoded with dictionary encoding
@@ -306,7 +309,11 @@ export function makeArrowTable(
306
309
  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
307
310
  if (opt.schema !== undefined && opt.schema !== null) {
308
311
  opt.schema = sanitizeSchema(opt.schema);
309
- opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
312
+ opt.schema = validateSchemaEmbeddings(
313
+ opt.schema,
314
+ data,
315
+ options?.embeddingFunction,
316
+ );
310
317
  }
311
318
  const columns: Record<string, Vector> = {};
312
319
  // TODO: sample dataset to find missing columns
@@ -545,7 +552,6 @@ async function applyEmbeddingsFromMetadata(
545
552
  dtype,
546
553
  );
547
554
  }
548
-
549
555
  const vector = makeVector(vectors, destType);
550
556
  columns[destColumn] = vector;
551
557
  }
@@ -835,7 +841,7 @@ export function createEmptyTable(schema: Schema): ArrowTable {
835
841
  function validateSchemaEmbeddings(
836
842
  schema: Schema,
837
843
  data: Array<Record<string, unknown>>,
838
- embeddings: EmbeddingFunction<unknown> | undefined,
844
+ embeddings: EmbeddingFunctionConfig | undefined,
839
845
  ) {
840
846
  const fields = [];
841
847
  const missingEmbeddingFields = [];
@@ -71,6 +71,12 @@ export interface CreateTableOptions {
71
71
  * The available options are described at https://lancedb.github.io/lancedb/guides/storage/
72
72
  */
73
73
  storageOptions?: Record<string, string>;
74
+ /**
75
+ * If true then data files will be written with the legacy format
76
+ *
77
+ * The default is true while the new format is in beta
78
+ */
79
+ useLegacyFormat?: boolean;
74
80
  schema?: Schema;
75
81
  embeddingFunction?: EmbeddingFunctionConfig;
76
82
  }
@@ -221,6 +227,7 @@ export class Connection {
221
227
  buf,
222
228
  mode,
223
229
  cleanseStorageOptions(options?.storageOptions),
230
+ options?.useLegacyFormat,
224
231
  );
225
232
 
226
233
  return new Table(innerTable);
@@ -256,6 +263,7 @@ export class Connection {
256
263
  buf,
257
264
  mode,
258
265
  cleanseStorageOptions(options?.storageOptions),
266
+ options?.useLegacyFormat,
259
267
  );
260
268
  return new Table(innerTable);
261
269
  }
@@ -19,6 +19,7 @@ import {
19
19
  FixedSizeList,
20
20
  Float,
21
21
  Float32,
22
+ type IntoVector,
22
23
  isDataType,
23
24
  isFixedSizeList,
24
25
  isFloat,
@@ -100,33 +101,55 @@ export abstract class EmbeddingFunction<
100
101
  * @see {@link lancedb.LanceSchema}
101
102
  */
102
103
  vectorField(
103
- options?: Partial<FieldOptions>,
104
+ optionsOrDatatype?: Partial<FieldOptions> | DataType,
104
105
  ): [DataType, Map<string, EmbeddingFunction>] {
105
- let dtype: DataType;
106
- const dims = this.ndims() ?? options?.dims;
107
- if (!options?.datatype) {
108
- if (dims === undefined) {
109
- throw new Error("ndims is required for vector field");
110
- }
111
- dtype = new FixedSizeList(dims, new Field("item", new Float32(), true));
106
+ let dtype: DataType | undefined;
107
+ let vectorType: DataType;
108
+ let dims: number | undefined = this.ndims();
109
+
110
+ // `func.vectorField(new Float32())`
111
+ if (isDataType(optionsOrDatatype)) {
112
+ dtype = optionsOrDatatype;
112
113
  } else {
113
- if (isFixedSizeList(options.datatype)) {
114
- dtype = options.datatype;
115
- } else if (isFloat(options.datatype)) {
114
+ // `func.vectorField({
115
+ // datatype: new Float32(),
116
+ // dims: 10
117
+ // })`
118
+ dims = dims ?? optionsOrDatatype?.dims;
119
+ dtype = optionsOrDatatype?.datatype;
120
+ }
121
+
122
+ if (dtype !== undefined) {
123
+ // `func.vectorField(new FixedSizeList(dims, new Field("item", new Float32(), true)))`
124
+ // or `func.vectorField({datatype: new FixedSizeList(dims, new Field("item", new Float32(), true))})`
125
+ if (isFixedSizeList(dtype)) {
126
+ vectorType = dtype;
127
+ // `func.vectorField(new Float32())`
128
+ // or `func.vectorField({datatype: new Float32()})`
129
+ } else if (isFloat(dtype)) {
130
+ // No `ndims` impl and no `{dims: n}` provided;
116
131
  if (dims === undefined) {
117
132
  throw new Error("ndims is required for vector field");
118
133
  }
119
- dtype = newVectorType(dims, options.datatype);
134
+ vectorType = newVectorType(dims, dtype);
120
135
  } else {
121
136
  throw new Error(
122
137
  "Expected FixedSizeList or Float as datatype for vector field",
123
138
  );
124
139
  }
140
+ } else {
141
+ if (dims === undefined) {
142
+ throw new Error("ndims is required for vector field");
143
+ }
144
+ vectorType = new FixedSizeList(
145
+ dims,
146
+ new Field("item", new Float32(), true),
147
+ );
125
148
  }
126
149
  const metadata = new Map<string, EmbeddingFunction>();
127
150
  metadata.set("vector_column_for", this);
128
151
 
129
- return [dtype, metadata];
152
+ return [vectorType, metadata];
130
153
  }
131
154
 
132
155
  /** The number of dimensions of the embeddings */
@@ -147,9 +170,7 @@ export abstract class EmbeddingFunction<
147
170
  /**
148
171
  Compute the embeddings for a single query
149
172
  */
150
- async computeQueryEmbeddings(
151
- data: T,
152
- ): Promise<number[] | Float32Array | Float64Array> {
173
+ async computeQueryEmbeddings(data: T): Promise<IntoVector> {
153
174
  return this.computeSourceEmbeddings([data]).then(
154
175
  (embeddings) => embeddings[0],
155
176
  );
@@ -42,6 +42,7 @@ export class EmbeddingFunctionRegistry {
42
42
  * Register an embedding function
43
43
  * @param name The name of the function
44
44
  * @param func The function to register
45
+ * @throws Error if the function is already registered
45
46
  */
46
47
  register<T extends EmbeddingFunctionFactory = EmbeddingFunctionFactory>(
47
48
  this: EmbeddingFunctionRegistry,
@@ -89,6 +90,9 @@ export class EmbeddingFunctionRegistry {
89
90
  this.#functions.clear();
90
91
  }
91
92
 
93
+ /**
94
+ * @ignore
95
+ */
92
96
  parseFunctions(
93
97
  this: EmbeddingFunctionRegistry,
94
98
  metadata: Map<string, string>,