@lancedb/lancedb 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/biome.json +8 -2
  2. package/dist/arrow.d.ts +36 -9
  3. package/dist/arrow.js +222 -24
  4. package/dist/connection.d.ts +10 -1
  5. package/dist/connection.js +13 -7
  6. package/dist/embedding/embedding_function.d.ts +54 -28
  7. package/dist/embedding/embedding_function.js +89 -10
  8. package/dist/embedding/index.d.ts +28 -2
  9. package/dist/embedding/index.js +111 -4
  10. package/dist/embedding/openai.d.ts +16 -7
  11. package/dist/embedding/openai.js +62 -12
  12. package/dist/embedding/registry.d.ts +58 -0
  13. package/dist/embedding/registry.js +127 -0
  14. package/dist/native.d.ts +5 -4
  15. package/dist/query.d.ts +19 -7
  16. package/dist/query.js +27 -13
  17. package/dist/sanitize.d.ts +22 -1
  18. package/dist/sanitize.js +123 -110
  19. package/dist/table.d.ts +18 -3
  20. package/dist/table.js +33 -3
  21. package/lancedb/arrow.ts +243 -41
  22. package/lancedb/connection.ts +35 -6
  23. package/lancedb/embedding/embedding_function.ts +147 -42
  24. package/lancedb/embedding/index.ts +113 -2
  25. package/lancedb/embedding/openai.ts +62 -16
  26. package/lancedb/embedding/registry.ts +176 -0
  27. package/lancedb/query.ts +58 -14
  28. package/lancedb/sanitize.ts +22 -22
  29. package/lancedb/table.ts +67 -5
  30. package/nodejs-artifacts/arrow.d.ts +36 -9
  31. package/nodejs-artifacts/arrow.js +222 -24
  32. package/nodejs-artifacts/connection.d.ts +10 -1
  33. package/nodejs-artifacts/connection.js +13 -7
  34. package/nodejs-artifacts/embedding/embedding_function.d.ts +54 -28
  35. package/nodejs-artifacts/embedding/embedding_function.js +89 -10
  36. package/nodejs-artifacts/embedding/index.d.ts +28 -2
  37. package/nodejs-artifacts/embedding/index.js +111 -4
  38. package/nodejs-artifacts/embedding/openai.d.ts +16 -7
  39. package/nodejs-artifacts/embedding/openai.js +62 -12
  40. package/nodejs-artifacts/embedding/registry.d.ts +58 -0
  41. package/nodejs-artifacts/embedding/registry.js +127 -0
  42. package/nodejs-artifacts/native.d.ts +5 -4
  43. package/nodejs-artifacts/query.d.ts +19 -7
  44. package/nodejs-artifacts/query.js +27 -13
  45. package/nodejs-artifacts/sanitize.d.ts +22 -1
  46. package/nodejs-artifacts/sanitize.js +123 -110
  47. package/nodejs-artifacts/table.d.ts +18 -3
  48. package/nodejs-artifacts/table.js +33 -3
  49. package/package.json +14 -9
  50. package/tsconfig.json +3 -1
package/biome.json CHANGED
@@ -48,7 +48,7 @@
48
48
  "noUnsafeFinally": "error",
49
49
  "noUnsafeOptionalChaining": "error",
50
50
  "noUnusedLabels": "error",
51
- "noUnusedVariables": "error",
51
+ "noUnusedVariables": "warn",
52
52
  "useIsNan": "error",
53
53
  "useValidForDirection": "error",
54
54
  "useYield": "error"
@@ -101,7 +101,13 @@
101
101
  },
102
102
  "overrides": [
103
103
  {
104
- "include": ["**/*.ts", "**/*.tsx", "**/*.mts", "**/*.cts"],
104
+ "include": [
105
+ "**/*.ts",
106
+ "**/*.tsx",
107
+ "**/*.mts",
108
+ "**/*.cts",
109
+ "__test__/*.test.ts"
110
+ ],
105
111
  "linter": {
106
112
  "rules": {
107
113
  "correctness": {
package/dist/arrow.d.ts CHANGED
@@ -1,6 +1,30 @@
1
1
  /// <reference types="node" />
2
- import { Table as ArrowTable, type Float, Schema } from "apache-arrow";
2
+ import { Table as ArrowTable, Binary, DataType, FixedSizeBinary, FixedSizeList, Float, Int, LargeBinary, List, Null, Schema, Struct, Utf8 } from "apache-arrow";
3
3
  import { type EmbeddingFunction } from "./embedding/embedding_function";
4
+ import { EmbeddingFunctionConfig } from "./embedding/registry";
5
+ export * from "apache-arrow";
6
+ export type IntoVector = Float32Array | Float64Array | number[];
7
+ export declare function isArrowTable(value: object): value is ArrowTable;
8
+ export declare function isDataType(value: unknown): value is DataType;
9
+ export declare function isNull(value: unknown): value is Null;
10
+ export declare function isInt(value: unknown): value is Int;
11
+ export declare function isFloat(value: unknown): value is Float;
12
+ export declare function isBinary(value: unknown): value is Binary;
13
+ export declare function isLargeBinary(value: unknown): value is LargeBinary;
14
+ export declare function isUtf8(value: unknown): value is Utf8;
15
+ export declare function isLargeUtf8(value: unknown): value is Utf8;
16
+ export declare function isBool(value: unknown): value is Utf8;
17
+ export declare function isDecimal(value: unknown): value is Utf8;
18
+ export declare function isDate(value: unknown): value is Utf8;
19
+ export declare function isTime(value: unknown): value is Utf8;
20
+ export declare function isTimestamp(value: unknown): value is Utf8;
21
+ export declare function isInterval(value: unknown): value is Utf8;
22
+ export declare function isDuration(value: unknown): value is Utf8;
23
+ export declare function isList(value: unknown): value is List;
24
+ export declare function isStruct(value: unknown): value is Struct;
25
+ export declare function isUnion(value: unknown): value is Struct;
26
+ export declare function isFixedSizeBinary(value: unknown): value is FixedSizeBinary;
27
+ export declare function isFixedSizeList(value: unknown): value is FixedSizeList;
4
28
  /** Data type accepted by NodeJS SDK */
5
29
  export type Data = Record<string, unknown>[] | ArrowTable;
6
30
  export declare class VectorColumnOptions {
@@ -13,6 +37,7 @@ export declare class MakeArrowTableOptions {
13
37
  schema?: Schema;
14
38
  vectorColumns: Record<string, VectorColumnOptions>;
15
39
  embeddings?: EmbeddingFunction<unknown>;
40
+ embeddingFunction?: EmbeddingFunctionConfig;
16
41
  /**
17
42
  * If true then string columns will be encoded with dictionary encoding
18
43
  *
@@ -117,11 +142,11 @@ export declare class MakeArrowTableOptions {
117
142
  * assert.deepEqual(table.schema, schema)
118
143
  * ```
119
144
  */
120
- export declare function makeArrowTable(data: Array<Record<string, unknown>>, options?: Partial<MakeArrowTableOptions>): ArrowTable;
145
+ export declare function makeArrowTable(data: Array<Record<string, unknown>>, options?: Partial<MakeArrowTableOptions>, metadata?: Map<string, string>): ArrowTable;
121
146
  /**
122
147
  * Create an empty Arrow table with the provided schema
123
148
  */
124
- export declare function makeEmptyTable(schema: Schema): ArrowTable;
149
+ export declare function makeEmptyTable(schema: Schema, metadata?: Map<string, string>): ArrowTable;
125
150
  /**
126
151
  * Convert an Array of records into an Arrow Table, optionally applying an
127
152
  * embeddings function to it.
@@ -140,7 +165,9 @@ export declare function makeEmptyTable(schema: Schema): ArrowTable;
140
165
  * embedding columns. If no schema is provided then embedding columns will
141
166
  * be placed at the end of the table, after all of the input columns.
142
167
  */
143
- export declare function convertToTable<T>(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>, makeTableOptions?: Partial<MakeArrowTableOptions>): Promise<ArrowTable>;
168
+ export declare function convertToTable(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, makeTableOptions?: Partial<MakeArrowTableOptions>): Promise<ArrowTable>;
169
+ /** Creates the Arrow Type for a Vector column with dimension `dim` */
170
+ export declare function newVectorType<T extends Float>(dim: number, innerType: T): FixedSizeList<T>;
144
171
  /**
145
172
  * Serialize an Array of records into a buffer using the Arrow IPC File serialization
146
173
  *
@@ -148,7 +175,7 @@ export declare function convertToTable<T>(data: Array<Record<string, unknown>>,
148
175
  *
149
176
  * `schema` is required if data is empty
150
177
  */
151
- export declare function fromRecordsToBuffer<T>(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>, schema?: Schema): Promise<Buffer>;
178
+ export declare function fromRecordsToBuffer(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
152
179
  /**
153
180
  * Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
154
181
  *
@@ -156,7 +183,7 @@ export declare function fromRecordsToBuffer<T>(data: Array<Record<string, unknow
156
183
  *
157
184
  * `schema` is required if data is empty
158
185
  */
159
- export declare function fromRecordsToStreamBuffer<T>(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>, schema?: Schema): Promise<Buffer>;
186
+ export declare function fromRecordsToStreamBuffer(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
160
187
  /**
161
188
  * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
162
189
  *
@@ -165,7 +192,7 @@ export declare function fromRecordsToStreamBuffer<T>(data: Array<Record<string,
165
192
  *
166
193
  * `schema` is required if the table is empty
167
194
  */
168
- export declare function fromTableToBuffer<T>(table: ArrowTable, embeddings?: EmbeddingFunction<T>, schema?: Schema): Promise<Buffer>;
195
+ export declare function fromTableToBuffer(table: ArrowTable, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
169
196
  /**
170
197
  * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
171
198
  *
@@ -174,7 +201,7 @@ export declare function fromTableToBuffer<T>(table: ArrowTable, embeddings?: Emb
174
201
  *
175
202
  * `schema` is required if the table is empty
176
203
  */
177
- export declare function fromDataToBuffer<T>(data: Data, embeddings?: EmbeddingFunction<T>, schema?: Schema): Promise<Buffer>;
204
+ export declare function fromDataToBuffer(data: Data, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
178
205
  /**
179
206
  * Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
180
207
  *
@@ -183,7 +210,7 @@ export declare function fromDataToBuffer<T>(data: Data, embeddings?: EmbeddingFu
183
210
  *
184
211
  * `schema` is required if the table is empty
185
212
  */
186
- export declare function fromTableToStreamBuffer<T>(table: ArrowTable, embeddings?: EmbeddingFunction<T>, schema?: Schema): Promise<Buffer>;
213
+ export declare function fromTableToStreamBuffer(table: ArrowTable, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
187
214
  /**
188
215
  * Create an empty table with the given schema
189
216
  */
package/dist/arrow.js CHANGED
@@ -12,10 +12,133 @@
12
12
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
13
  // See the License for the specific language governing permissions and
14
14
  // limitations under the License.
15
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
16
+ if (k2 === undefined) k2 = k;
17
+ var desc = Object.getOwnPropertyDescriptor(m, k);
18
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
19
+ desc = { enumerable: true, get: function() { return m[k]; } };
20
+ }
21
+ Object.defineProperty(o, k2, desc);
22
+ }) : (function(o, m, k, k2) {
23
+ if (k2 === undefined) k2 = k;
24
+ o[k2] = m[k];
25
+ }));
26
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
27
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
28
+ };
15
29
  Object.defineProperty(exports, "__esModule", { value: true });
16
- exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = void 0;
30
+ exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.newVectorType = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = exports.isFixedSizeList = exports.isFixedSizeBinary = exports.isUnion = exports.isStruct = exports.isList = exports.isDuration = exports.isInterval = exports.isTimestamp = exports.isTime = exports.isDate = exports.isDecimal = exports.isBool = exports.isLargeUtf8 = exports.isUtf8 = exports.isLargeBinary = exports.isBinary = exports.isFloat = exports.isInt = exports.isNull = exports.isDataType = exports.isArrowTable = void 0;
17
31
  const apache_arrow_1 = require("apache-arrow");
32
+ const registry_1 = require("./embedding/registry");
18
33
  const sanitize_1 = require("./sanitize");
34
+ __exportStar(require("apache-arrow"), exports);
35
+ function isArrowTable(value) {
36
+ if (value instanceof apache_arrow_1.Table)
37
+ return true;
38
+ return "schema" in value && "batches" in value;
39
+ }
40
+ exports.isArrowTable = isArrowTable;
41
+ function isDataType(value) {
42
+ return (value instanceof apache_arrow_1.DataType ||
43
+ apache_arrow_1.DataType.isNull(value) ||
44
+ apache_arrow_1.DataType.isInt(value) ||
45
+ apache_arrow_1.DataType.isFloat(value) ||
46
+ apache_arrow_1.DataType.isBinary(value) ||
47
+ apache_arrow_1.DataType.isLargeBinary(value) ||
48
+ apache_arrow_1.DataType.isUtf8(value) ||
49
+ apache_arrow_1.DataType.isLargeUtf8(value) ||
50
+ apache_arrow_1.DataType.isBool(value) ||
51
+ apache_arrow_1.DataType.isDecimal(value) ||
52
+ apache_arrow_1.DataType.isDate(value) ||
53
+ apache_arrow_1.DataType.isTime(value) ||
54
+ apache_arrow_1.DataType.isTimestamp(value) ||
55
+ apache_arrow_1.DataType.isInterval(value) ||
56
+ apache_arrow_1.DataType.isDuration(value) ||
57
+ apache_arrow_1.DataType.isList(value) ||
58
+ apache_arrow_1.DataType.isStruct(value) ||
59
+ apache_arrow_1.DataType.isUnion(value) ||
60
+ apache_arrow_1.DataType.isFixedSizeBinary(value) ||
61
+ apache_arrow_1.DataType.isFixedSizeList(value) ||
62
+ apache_arrow_1.DataType.isMap(value) ||
63
+ apache_arrow_1.DataType.isDictionary(value));
64
+ }
65
+ exports.isDataType = isDataType;
66
+ function isNull(value) {
67
+ return value instanceof apache_arrow_1.Null || apache_arrow_1.DataType.isNull(value);
68
+ }
69
+ exports.isNull = isNull;
70
+ function isInt(value) {
71
+ return value instanceof apache_arrow_1.Int || apache_arrow_1.DataType.isInt(value);
72
+ }
73
+ exports.isInt = isInt;
74
+ function isFloat(value) {
75
+ return value instanceof apache_arrow_1.Float || apache_arrow_1.DataType.isFloat(value);
76
+ }
77
+ exports.isFloat = isFloat;
78
+ function isBinary(value) {
79
+ return value instanceof apache_arrow_1.Binary || apache_arrow_1.DataType.isBinary(value);
80
+ }
81
+ exports.isBinary = isBinary;
82
+ function isLargeBinary(value) {
83
+ return value instanceof apache_arrow_1.LargeBinary || apache_arrow_1.DataType.isLargeBinary(value);
84
+ }
85
+ exports.isLargeBinary = isLargeBinary;
86
+ function isUtf8(value) {
87
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isUtf8(value);
88
+ }
89
+ exports.isUtf8 = isUtf8;
90
+ function isLargeUtf8(value) {
91
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isLargeUtf8(value);
92
+ }
93
+ exports.isLargeUtf8 = isLargeUtf8;
94
+ function isBool(value) {
95
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isBool(value);
96
+ }
97
+ exports.isBool = isBool;
98
+ function isDecimal(value) {
99
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDecimal(value);
100
+ }
101
+ exports.isDecimal = isDecimal;
102
+ function isDate(value) {
103
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDate(value);
104
+ }
105
+ exports.isDate = isDate;
106
+ function isTime(value) {
107
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTime(value);
108
+ }
109
+ exports.isTime = isTime;
110
+ function isTimestamp(value) {
111
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTimestamp(value);
112
+ }
113
+ exports.isTimestamp = isTimestamp;
114
+ function isInterval(value) {
115
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isInterval(value);
116
+ }
117
+ exports.isInterval = isInterval;
118
+ function isDuration(value) {
119
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDuration(value);
120
+ }
121
+ exports.isDuration = isDuration;
122
+ function isList(value) {
123
+ return value instanceof apache_arrow_1.List || apache_arrow_1.DataType.isList(value);
124
+ }
125
+ exports.isList = isList;
126
+ function isStruct(value) {
127
+ return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isStruct(value);
128
+ }
129
+ exports.isStruct = isStruct;
130
+ function isUnion(value) {
131
+ return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isUnion(value);
132
+ }
133
+ exports.isUnion = isUnion;
134
+ function isFixedSizeBinary(value) {
135
+ return value instanceof apache_arrow_1.FixedSizeBinary || apache_arrow_1.DataType.isFixedSizeBinary(value);
136
+ }
137
+ exports.isFixedSizeBinary = isFixedSizeBinary;
138
+ function isFixedSizeList(value) {
139
+ return value instanceof apache_arrow_1.FixedSizeList || apache_arrow_1.DataType.isFixedSizeList(value);
140
+ }
141
+ exports.isFixedSizeList = isFixedSizeList;
19
142
  /*
20
143
  * Options to control how a column should be converted to a vector array
21
144
  */
@@ -61,6 +184,7 @@ class MakeArrowTableOptions {
61
184
  vector: new VectorColumnOptions(),
62
185
  };
63
186
  embeddings;
187
+ embeddingFunction;
64
188
  /**
65
189
  * If true then string columns will be encoded with dictionary encoding
66
190
  *
@@ -168,7 +292,7 @@ exports.MakeArrowTableOptions = MakeArrowTableOptions;
168
292
  * assert.deepEqual(table.schema, schema)
169
293
  * ```
170
294
  */
171
- function makeArrowTable(data, options) {
295
+ function makeArrowTable(data, options, metadata) {
172
296
  if (data.length === 0 &&
173
297
  (options?.schema === undefined || options?.schema === null)) {
174
298
  throw new Error("At least one record or a schema needs to be provided");
@@ -176,7 +300,7 @@ function makeArrowTable(data, options) {
176
300
  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
177
301
  if (opt.schema !== undefined && opt.schema !== null) {
178
302
  opt.schema = (0, sanitize_1.sanitizeSchema)(opt.schema);
179
- opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
303
+ opt.schema = validateSchemaEmbeddings(opt.schema, data, options?.embeddingFunction);
180
304
  }
181
305
  const columns = {};
182
306
  // TODO: sample dataset to find missing columns
@@ -246,21 +370,38 @@ function makeArrowTable(data, options) {
246
370
  // then patch the schema of the batches so we can use
247
371
  // `new ArrowTable(schema, batches)` which does not do any schema inference
248
372
  const firstTable = new apache_arrow_1.Table(columns);
249
- const batchesFixed = firstTable.batches.map(
250
- // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
251
- (batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
252
- return new apache_arrow_1.Table(opt.schema, batchesFixed);
373
+ const batchesFixed = firstTable.batches.map((batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
374
+ let schema;
375
+ if (metadata !== undefined) {
376
+ let schemaMetadata = opt.schema.metadata;
377
+ if (schemaMetadata.size === 0) {
378
+ schemaMetadata = metadata;
379
+ }
380
+ else {
381
+ for (const [key, entry] of schemaMetadata.entries()) {
382
+ schemaMetadata.set(key, entry);
383
+ }
384
+ }
385
+ schema = new apache_arrow_1.Schema(opt.schema.fields, schemaMetadata);
386
+ }
387
+ else {
388
+ schema = opt.schema;
389
+ }
390
+ return new apache_arrow_1.Table(schema, batchesFixed);
253
391
  }
254
- else {
255
- return new apache_arrow_1.Table(columns);
392
+ const tbl = new apache_arrow_1.Table(columns);
393
+ if (metadata !== undefined) {
394
+ // biome-ignore lint/suspicious/noExplicitAny: <explanation>
395
+ tbl.schema.metadata = metadata;
256
396
  }
397
+ return tbl;
257
398
  }
258
399
  exports.makeArrowTable = makeArrowTable;
259
400
  /**
260
401
  * Create an empty Arrow table with the provided schema
261
402
  */
262
- function makeEmptyTable(schema) {
263
- return makeArrowTable([], { schema });
403
+ function makeEmptyTable(schema, metadata) {
404
+ return makeArrowTable([], { schema }, metadata);
264
405
  }
265
406
  exports.makeEmptyTable = makeEmptyTable;
266
407
  /**
@@ -322,9 +463,52 @@ function makeVector(values, type, stringAsDictionary) {
322
463
  return (0, apache_arrow_1.vectorFromArray)(values);
323
464
  }
324
465
  }
466
+ /** Helper function to apply embeddings from metadata to an input table */
467
+ async function applyEmbeddingsFromMetadata(table, schema) {
468
+ const registry = (0, registry_1.getRegistry)();
469
+ const functions = registry.parseFunctions(schema.metadata);
470
+ const columns = Object.fromEntries(table.schema.fields.map((field) => [
471
+ field.name,
472
+ table.getChild(field.name),
473
+ ]));
474
+ for (const functionEntry of functions.values()) {
475
+ const sourceColumn = columns[functionEntry.sourceColumn];
476
+ const destColumn = functionEntry.vectorColumn ?? "vector";
477
+ if (sourceColumn === undefined) {
478
+ throw new Error(`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`);
479
+ }
480
+ if (columns[destColumn] !== undefined) {
481
+ throw new Error(`Attempt to apply embeddings to table failed because column ${destColumn} already existed`);
482
+ }
483
+ if (table.batches.length > 1) {
484
+ throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
485
+ }
486
+ const values = sourceColumn.toArray();
487
+ const vectors = await functionEntry.function.computeSourceEmbeddings(values);
488
+ if (vectors.length !== values.length) {
489
+ throw new Error("Embedding function did not return an embedding for each input element");
490
+ }
491
+ let destType;
492
+ const dtype = schema.fields.find((f) => f.name === destColumn).type;
493
+ if (isFixedSizeList(dtype)) {
494
+ destType = (0, sanitize_1.sanitizeType)(dtype);
495
+ }
496
+ else {
497
+ throw new Error("Expected FixedSizeList as datatype for vector field, instead got: " +
498
+ dtype);
499
+ }
500
+ const vector = makeVector(vectors, destType);
501
+ columns[destColumn] = vector;
502
+ }
503
+ const newTable = new apache_arrow_1.Table(columns);
504
+ return alignTable(newTable, schema);
505
+ }
325
506
  /** Helper function to apply embeddings to an input table */
326
507
  async function applyEmbeddings(table, embeddings, schema) {
327
- if (embeddings == null) {
508
+ if (schema?.metadata.has("embedding_functions")) {
509
+ return applyEmbeddingsFromMetadata(table, schema);
510
+ }
511
+ else if (embeddings == null || embeddings === undefined) {
328
512
  return table;
329
513
  }
330
514
  if (schema !== undefined && schema !== null) {
@@ -339,8 +523,8 @@ async function applyEmbeddings(table, embeddings, schema) {
339
523
  });
340
524
  const newColumns = Object.fromEntries(colEntries);
341
525
  const sourceColumn = newColumns[embeddings.sourceColumn];
342
- const destColumn = embeddings.destColumn ?? "vector";
343
- const innerDestType = embeddings.embeddingDataType ?? new apache_arrow_1.Float32();
526
+ const destColumn = embeddings.vectorColumn ?? "vector";
527
+ const innerDestType = embeddings.function.embeddingDataType() ?? new apache_arrow_1.Float32();
344
528
  if (sourceColumn === undefined) {
345
529
  throw new Error(`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`);
346
530
  }
@@ -351,8 +535,9 @@ async function applyEmbeddings(table, embeddings, schema) {
351
535
  // if we call convertToTable with 0 records and a schema that includes the embedding
352
536
  return table;
353
537
  }
354
- if (embeddings.embeddingDimension !== undefined) {
355
- const destType = newVectorType(embeddings.embeddingDimension, innerDestType);
538
+ const dimensions = embeddings.function.ndims();
539
+ if (dimensions !== undefined) {
540
+ const destType = newVectorType(dimensions, innerDestType);
356
541
  newColumns[destColumn] = makeVector([], destType);
357
542
  }
358
543
  else if (schema != null) {
@@ -376,7 +561,7 @@ async function applyEmbeddings(table, embeddings, schema) {
376
561
  throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
377
562
  }
378
563
  const values = sourceColumn.toArray();
379
- const vectors = await embeddings.embed(values);
564
+ const vectors = await embeddings.function.computeSourceEmbeddings(values);
380
565
  if (vectors.length !== values.length) {
381
566
  throw new Error("Embedding function did not return an embedding for each input element");
382
567
  }
@@ -419,9 +604,10 @@ exports.convertToTable = convertToTable;
419
604
  function newVectorType(dim, innerType) {
420
605
  // in Lance we always default to have the elements nullable, so we need to set it to true
421
606
  // otherwise we often get schema mismatches because the stored data always has schema with nullable elements
422
- const children = new apache_arrow_1.Field("item", innerType, true);
607
+ const children = new apache_arrow_1.Field("item", (0, sanitize_1.sanitizeType)(innerType), true);
423
608
  return new apache_arrow_1.FixedSizeList(dim, children);
424
609
  }
610
+ exports.newVectorType = newVectorType;
425
611
  /**
426
612
  * Serialize an Array of records into a buffer using the Arrow IPC File serialization
427
613
  *
@@ -483,12 +669,12 @@ async function fromDataToBuffer(data, embeddings, schema) {
483
669
  if (schema !== undefined && schema !== null) {
484
670
  schema = (0, sanitize_1.sanitizeSchema)(schema);
485
671
  }
486
- if (data instanceof apache_arrow_1.Table) {
672
+ if (isArrowTable(data)) {
487
673
  return fromTableToBuffer(data, embeddings, schema);
488
674
  }
489
675
  else {
490
- const table = await convertToTable(data);
491
- return fromTableToBuffer(table, embeddings, schema);
676
+ const table = await convertToTable(data, embeddings, { schema });
677
+ return fromTableToBuffer(table);
492
678
  }
493
679
  }
494
680
  exports.fromDataToBuffer = fromDataToBuffer;
@@ -548,10 +734,22 @@ function validateSchemaEmbeddings(schema, data, embeddings) {
548
734
  // if it does not, we add it to the list of missing embedding fields
549
735
  // Finally, we check if those missing embedding fields are `this._embeddings`
550
736
  // if they are not, we throw an error
551
- for (const field of schema.fields) {
552
- if (field.type instanceof apache_arrow_1.FixedSizeList) {
737
+ for (let field of schema.fields) {
738
+ if (isFixedSizeList(field.type)) {
739
+ field = (0, sanitize_1.sanitizeField)(field);
553
740
  if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
554
- missingEmbeddingFields.push(field);
741
+ if (schema.metadata.has("embedding_functions")) {
742
+ const embeddings = JSON.parse(schema.metadata.get("embedding_functions"));
743
+ if (
744
+ // biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
745
+ embeddings.find((f) => f["vectorColumn"] === field.name) ===
746
+ undefined) {
747
+ missingEmbeddingFields.push(field);
748
+ }
749
+ }
750
+ else {
751
+ missingEmbeddingFields.push(field);
752
+ }
555
753
  }
556
754
  else {
557
755
  fields.push(field);
@@ -1,4 +1,5 @@
1
- import { Table as ArrowTable, Schema } from "apache-arrow";
1
+ import { Table as ArrowTable, Schema } from "./arrow";
2
+ import { EmbeddingFunctionConfig } from "./embedding/registry";
2
3
  import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
3
4
  import { Table } from "./table";
4
5
  /**
@@ -39,6 +40,14 @@ export interface CreateTableOptions {
39
40
  * The available options are described at https://lancedb.github.io/lancedb/guides/storage/
40
41
  */
41
42
  storageOptions?: Record<string, string>;
43
+ /**
44
+ * If true then data files will be written with the legacy format
45
+ *
46
+ * The default is true while the new format is in beta
47
+ */
48
+ useLegacyFormat?: boolean;
49
+ schema?: Schema;
50
+ embeddingFunction?: EmbeddingFunctionConfig;
42
51
  }
43
52
  export interface OpenTableOptions {
44
53
  /**
@@ -14,8 +14,8 @@
14
14
  // limitations under the License.
15
15
  Object.defineProperty(exports, "__esModule", { value: true });
16
16
  exports.Connection = exports.connect = void 0;
17
- const apache_arrow_1 = require("apache-arrow");
18
17
  const arrow_1 = require("./arrow");
18
+ const registry_1 = require("./embedding/registry");
19
19
  const native_1 = require("./native");
20
20
  const table_1 = require("./table");
21
21
  /**
@@ -109,14 +109,14 @@ class Connection {
109
109
  mode = "exist_ok";
110
110
  }
111
111
  let table;
112
- if (data instanceof apache_arrow_1.Table) {
112
+ if ((0, arrow_1.isArrowTable)(data)) {
113
113
  table = data;
114
114
  }
115
115
  else {
116
- table = (0, arrow_1.makeArrowTable)(data);
116
+ table = (0, arrow_1.makeArrowTable)(data, options);
117
117
  }
118
- const buf = await (0, arrow_1.fromTableToBuffer)(table);
119
- const innerTable = await this.inner.createTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
118
+ const buf = await (0, arrow_1.fromTableToBuffer)(table, options?.embeddingFunction, options?.schema);
119
+ const innerTable = await this.inner.createTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions), options?.useLegacyFormat);
120
120
  return new table_1.Table(innerTable);
121
121
  }
122
122
  /**
@@ -130,9 +130,15 @@ class Connection {
130
130
  if (mode === "create" && existOk) {
131
131
  mode = "exist_ok";
132
132
  }
133
- const table = (0, arrow_1.makeEmptyTable)(schema);
133
+ let metadata = undefined;
134
+ if (options?.embeddingFunction !== undefined) {
135
+ const embeddingFunction = options.embeddingFunction;
136
+ const registry = (0, registry_1.getRegistry)();
137
+ metadata = registry.getTableMetadata([embeddingFunction]);
138
+ }
139
+ const table = (0, arrow_1.makeEmptyTable)(schema, metadata);
134
140
  const buf = await (0, arrow_1.fromTableToBuffer)(table);
135
- const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
141
+ const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions), options?.useLegacyFormat);
136
142
  return new table_1.Table(innerTable);
137
143
  }
138
144
  /**
@@ -1,45 +1,71 @@
1
- import { type Float } from "apache-arrow";
1
+ import "reflect-metadata";
2
+ import { DataType, Float, type IntoVector } from "../arrow";
3
+ /**
4
+ * Options for a given embedding function
5
+ */
6
+ export interface FunctionOptions {
7
+ [key: string]: any;
8
+ }
2
9
  /**
3
10
  * An embedding function that automatically creates vector representation for a given column.
4
11
  */
5
- export interface EmbeddingFunction<T> {
12
+ export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptions = FunctionOptions> {
6
13
  /**
7
- * The name of the column that will be used as input for the Embedding Function.
8
- */
9
- sourceColumn: string;
10
- /**
11
- * The data type of the embedding
14
+ * Convert the embedding function to a JSON object
15
+ * It is used to serialize the embedding function to the schema
16
+ * It's important that any object returned by this method contains all the necessary
17
+ * information to recreate the embedding function
12
18
  *
13
- * The embedding function should return `number`. This will be converted into
14
- * an Arrow float array. By default this will be Float32 but this property can
15
- * be used to control the conversion.
16
- */
17
- embeddingDataType?: Float;
18
- /**
19
- * The dimension of the embedding
19
+ * It should return the same object that was passed to the constructor
20
+ * If it does not, the embedding function will not be able to be recreated, or could be recreated incorrectly
20
21
  *
21
- * This is optional, normally this can be determined by looking at the results of
22
- * `embed`. If this is not specified, and there is an attempt to apply the embedding
23
- * to an empty table, then that process will fail.
22
+ * @example
23
+ * ```ts
24
+ * class MyEmbeddingFunction extends EmbeddingFunction {
25
+ * constructor(options: {model: string, timeout: number}) {
26
+ * super();
27
+ * this.model = options.model;
28
+ * this.timeout = options.timeout;
29
+ * }
30
+ * toJSON() {
31
+ * return {
32
+ * model: this.model,
33
+ * timeout: this.timeout,
34
+ * };
35
+ * }
36
+ * ```
24
37
  */
25
- embeddingDimension?: number;
38
+ abstract toJSON(): Partial<M>;
26
39
  /**
27
- * The name of the column that will contain the embedding
40
+ * sourceField is used in combination with `LanceSchema` to provide a declarative data model
41
+ *
42
+ * @param optionsOrDatatype - The options for the field or the datatype
28
43
  *
29
- * By default this is "vector"
44
+ * @see {@link lancedb.LanceSchema}
30
45
  */
31
- destColumn?: string;
46
+ sourceField(optionsOrDatatype: Partial<FieldOptions> | DataType): [DataType, Map<string, EmbeddingFunction>];
32
47
  /**
33
- * Should the source column be excluded from the resulting table
48
+ * vectorField is used in combination with `LanceSchema` to provide a declarative data model
49
+ *
50
+ * @param options - The options for the field
34
51
  *
35
- * By default the source column is included. Set this to true and
36
- * only the embedding will be stored.
52
+ * @see {@link lancedb.LanceSchema}
37
53
  */
38
- excludeSource?: boolean;
54
+ vectorField(optionsOrDatatype?: Partial<FieldOptions> | DataType): [DataType, Map<string, EmbeddingFunction>];
55
+ /** The number of dimensions of the embeddings */
56
+ ndims(): number | undefined;
57
+ /** The datatype of the embeddings */
58
+ abstract embeddingDataType(): Float;
39
59
  /**
40
60
  * Creates a vector representation for the given values.
41
61
  */
42
- embed: (data: T[]) => Promise<number[][]>;
62
+ abstract computeSourceEmbeddings(data: T[]): Promise<number[][] | Float32Array[] | Float64Array[]>;
63
+ /**
64
+ Compute the embeddings for a single query
65
+ */
66
+ computeQueryEmbeddings(data: T): Promise<IntoVector>;
67
+ }
68
+ export interface FieldOptions<T extends DataType = DataType> {
69
+ datatype: T;
70
+ dims?: number;
43
71
  }
44
- /** Test if the input seems to be an embedding function */
45
- export declare function isEmbeddingFunction<T>(value: unknown): value is EmbeddingFunction<T>;