@lancedb/lancedb 0.4.20 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +5 -14
  2. package/biome.json +142 -0
  3. package/dist/arrow.d.ts +35 -9
  4. package/dist/arrow.js +247 -19
  5. package/dist/connection.d.ts +4 -1
  6. package/dist/connection.js +11 -5
  7. package/dist/embedding/embedding_function.d.ts +54 -28
  8. package/dist/embedding/embedding_function.js +71 -10
  9. package/dist/embedding/index.d.ts +28 -2
  10. package/dist/embedding/index.js +111 -4
  11. package/dist/embedding/openai.d.ts +16 -7
  12. package/dist/embedding/openai.js +62 -12
  13. package/dist/embedding/registry.d.ts +54 -0
  14. package/dist/embedding/registry.js +123 -0
  15. package/dist/native.d.ts +26 -0
  16. package/dist/query.d.ts +1 -1
  17. package/dist/query.js +7 -6
  18. package/dist/sanitize.d.ts +22 -1
  19. package/dist/sanitize.js +126 -113
  20. package/dist/table.d.ts +50 -4
  21. package/dist/table.js +47 -5
  22. package/lancedb/arrow.ts +283 -49
  23. package/lancedb/connection.ts +27 -6
  24. package/lancedb/embedding/embedding_function.ts +126 -42
  25. package/lancedb/embedding/index.ts +113 -2
  26. package/lancedb/embedding/openai.ts +62 -16
  27. package/lancedb/embedding/registry.ts +172 -0
  28. package/lancedb/query.ts +9 -6
  29. package/lancedb/sanitize.ts +62 -62
  30. package/lancedb/table.ts +72 -5
  31. package/nodejs-artifacts/arrow.d.ts +35 -9
  32. package/nodejs-artifacts/arrow.js +247 -19
  33. package/nodejs-artifacts/connection.d.ts +4 -1
  34. package/nodejs-artifacts/connection.js +11 -5
  35. package/nodejs-artifacts/embedding/embedding_function.d.ts +54 -28
  36. package/nodejs-artifacts/embedding/embedding_function.js +71 -10
  37. package/nodejs-artifacts/embedding/index.d.ts +28 -2
  38. package/nodejs-artifacts/embedding/index.js +111 -4
  39. package/nodejs-artifacts/embedding/openai.d.ts +16 -7
  40. package/nodejs-artifacts/embedding/openai.js +62 -12
  41. package/nodejs-artifacts/embedding/registry.d.ts +54 -0
  42. package/nodejs-artifacts/embedding/registry.js +123 -0
  43. package/nodejs-artifacts/native.d.ts +26 -0
  44. package/nodejs-artifacts/query.d.ts +1 -1
  45. package/nodejs-artifacts/query.js +7 -6
  46. package/nodejs-artifacts/sanitize.d.ts +22 -1
  47. package/nodejs-artifacts/sanitize.js +126 -113
  48. package/nodejs-artifacts/table.d.ts +50 -4
  49. package/nodejs-artifacts/table.js +47 -5
  50. package/package.json +23 -21
  51. package/tsconfig.json +3 -1
  52. package/.eslintignore +0 -3
  53. package/eslint.config.js +0 -28
package/lancedb/arrow.ts CHANGED
@@ -13,28 +13,124 @@
13
13
  // limitations under the License.
14
14
 
15
15
  import {
16
+ Table as ArrowTable,
17
+ Binary,
18
+ DataType,
16
19
  Field,
17
- makeBuilder,
18
- RecordBatchFileWriter,
19
- Utf8,
20
- type Vector,
20
+ FixedSizeBinary,
21
21
  FixedSizeList,
22
- vectorFromArray,
23
- type Schema,
24
- Table as ArrowTable,
25
- RecordBatchStreamWriter,
22
+ Float,
23
+ Float32,
24
+ Int,
25
+ LargeBinary,
26
26
  List,
27
+ Null,
27
28
  RecordBatch,
28
- makeData,
29
+ RecordBatchFileWriter,
30
+ RecordBatchStreamWriter,
31
+ Schema,
29
32
  Struct,
30
- type Float,
31
- DataType,
32
- Binary,
33
- Float32,
33
+ Utf8,
34
+ type Vector,
35
+ makeBuilder,
36
+ makeData,
34
37
  type makeTable,
38
+ vectorFromArray,
35
39
  } from "apache-arrow";
36
40
  import { type EmbeddingFunction } from "./embedding/embedding_function";
37
- import { sanitizeSchema } from "./sanitize";
41
+ import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
42
+ import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
43
+ export * from "apache-arrow";
44
+
45
/**
 * Type guard for Arrow tables.
 *
 * Accepts genuine `ArrowTable` instances as well as any object that exposes
 * both a `schema` and a `batches` property.
 */
export function isArrowTable(value: object): value is ArrowTable {
  if (!(value instanceof ArrowTable)) {
    // Fall back to a structural check on the two properties.
    return "schema" in value && "batches" in value;
  }
  return true;
}
49
+
50
/**
 * Type guard that is true when `value` is a `DataType` instance or is
 * recognized by any of the `DataType.isXxx` static checks listed below.
 */
export function isDataType(value: unknown): value is DataType {
  if (value instanceof DataType) {
    return true;
  }

  // Scalar / primitive-like types.
  if (
    DataType.isNull(value) ||
    DataType.isInt(value) ||
    DataType.isFloat(value) ||
    DataType.isBinary(value) ||
    DataType.isLargeBinary(value) ||
    DataType.isUtf8(value) ||
    DataType.isLargeUtf8(value) ||
    DataType.isBool(value) ||
    DataType.isDecimal(value)
  ) {
    return true;
  }

  // Temporal types.
  if (
    DataType.isDate(value) ||
    DataType.isTime(value) ||
    DataType.isTimestamp(value) ||
    DataType.isInterval(value) ||
    DataType.isDuration(value)
  ) {
    return true;
  }

  // Nested / composite types.
  return (
    DataType.isList(value) ||
    DataType.isStruct(value) ||
    DataType.isUnion(value) ||
    DataType.isFixedSizeBinary(value) ||
    DataType.isFixedSizeList(value) ||
    DataType.isMap(value) ||
    DataType.isDictionary(value)
  );
}
76
/** Type guard: true when `value` is a `Null` instance or passes `DataType.isNull`. */
export function isNull(value: unknown): value is Null {
  if (value instanceof Null) {
    return true;
  }
  return DataType.isNull(value);
}
79
/** Type guard: true when `value` is an `Int` instance or passes `DataType.isInt`. */
export function isInt(value: unknown): value is Int {
  if (value instanceof Int) {
    return true;
  }
  return DataType.isInt(value);
}
82
/** Type guard: true when `value` is a `Float` instance or passes `DataType.isFloat`. */
export function isFloat(value: unknown): value is Float {
  if (value instanceof Float) {
    return true;
  }
  return DataType.isFloat(value);
}
85
/** Type guard: true when `value` is a `Binary` instance or passes `DataType.isBinary`. */
export function isBinary(value: unknown): value is Binary {
  if (value instanceof Binary) {
    return true;
  }
  return DataType.isBinary(value);
}
88
/** Type guard: true when `value` is a `LargeBinary` instance or passes `DataType.isLargeBinary`. */
export function isLargeBinary(value: unknown): value is LargeBinary {
  if (value instanceof LargeBinary) {
    return true;
  }
  return DataType.isLargeBinary(value);
}
91
/** Type guard: true when `value` is a `Utf8` instance or passes `DataType.isUtf8`. */
export function isUtf8(value: unknown): value is Utf8 {
  if (value instanceof Utf8) {
    return true;
  }
  return DataType.isUtf8(value);
}
94
/**
 * Type guard: true when `value` passes `DataType.isLargeUtf8`.
 *
 * The previous version also returned true for any `Utf8` instance and
 * narrowed to `Utf8`, which was a copy-paste error. The predicate now names
 * the LargeUtf8 type via an inline type import so no new runtime import is
 * needed.
 */
export function isLargeUtf8(
  value: unknown,
): value is import("apache-arrow").LargeUtf8 {
  return DataType.isLargeUtf8(value);
}
97
/**
 * Type guard: true when `value` passes `DataType.isBool`.
 *
 * The previous version also returned true for any `Utf8` instance and
 * narrowed to `Utf8`, which was a copy-paste error. The predicate now names
 * the Bool type via an inline type import so no new runtime import is needed.
 */
export function isBool(value: unknown): value is import("apache-arrow").Bool {
  return DataType.isBool(value);
}
100
/**
 * Type guard: true when `value` passes `DataType.isDecimal`.
 *
 * The previous version also returned true for any `Utf8` instance and
 * narrowed to `Utf8`, which was a copy-paste error. The predicate now names
 * the Decimal type via an inline type import so no new runtime import is
 * needed.
 */
export function isDecimal(
  value: unknown,
): value is import("apache-arrow").Decimal {
  return DataType.isDecimal(value);
}
103
/**
 * Type guard: true when `value` passes `DataType.isDate`.
 *
 * The previous version also returned true for any `Utf8` instance and
 * narrowed to `Utf8`, which was a copy-paste error. The predicate now names
 * the date type (`Date_` in apache-arrow) via an inline type import so no new
 * runtime import is needed.
 */
export function isDate(value: unknown): value is import("apache-arrow").Date_ {
  return DataType.isDate(value);
}
106
/**
 * Type guard: true when `value` passes `DataType.isTime`.
 *
 * The previous version also returned true for any `Utf8` instance and
 * narrowed to `Utf8`, which was a copy-paste error. The predicate now names
 * the Time type via an inline type import so no new runtime import is needed.
 */
export function isTime(value: unknown): value is import("apache-arrow").Time {
  return DataType.isTime(value);
}
109
/**
 * Type guard: true when `value` passes `DataType.isTimestamp`.
 *
 * The previous version also returned true for any `Utf8` instance and
 * narrowed to `Utf8`, which was a copy-paste error. The predicate now names
 * the Timestamp type via an inline type import so no new runtime import is
 * needed.
 */
export function isTimestamp(
  value: unknown,
): value is import("apache-arrow").Timestamp {
  return DataType.isTimestamp(value);
}
112
/**
 * Type guard: true when `value` passes `DataType.isInterval`.
 *
 * The previous version also returned true for any `Utf8` instance and
 * narrowed to `Utf8`, which was a copy-paste error. The predicate now names
 * the Interval type via an inline type import so no new runtime import is
 * needed.
 */
export function isInterval(
  value: unknown,
): value is import("apache-arrow").Interval {
  return DataType.isInterval(value);
}
115
/**
 * Type guard: true when `value` passes `DataType.isDuration`.
 *
 * The previous version also returned true for any `Utf8` instance and
 * narrowed to `Utf8`, which was a copy-paste error. The predicate now names
 * the Duration type via an inline type import so no new runtime import is
 * needed.
 */
export function isDuration(
  value: unknown,
): value is import("apache-arrow").Duration {
  return DataType.isDuration(value);
}
118
/** Type guard: true when `value` is a `List` instance or passes `DataType.isList`. */
export function isList(value: unknown): value is List {
  if (value instanceof List) {
    return true;
  }
  return DataType.isList(value);
}
121
/** Type guard: true when `value` is a `Struct` instance or passes `DataType.isStruct`. */
export function isStruct(value: unknown): value is Struct {
  if (value instanceof Struct) {
    return true;
  }
  return DataType.isStruct(value);
}
124
/**
 * Type guard: true when `value` passes `DataType.isUnion`.
 *
 * The previous version also returned true for any `Struct` instance and
 * narrowed to `Struct`, which was a copy-paste error. The predicate now names
 * the Union type via an inline type import so no new runtime import is needed.
 */
export function isUnion(value: unknown): value is import("apache-arrow").Union {
  return DataType.isUnion(value);
}
127
/** Type guard: true when `value` is a `FixedSizeBinary` instance or passes `DataType.isFixedSizeBinary`. */
export function isFixedSizeBinary(value: unknown): value is FixedSizeBinary {
  if (value instanceof FixedSizeBinary) {
    return true;
  }
  return DataType.isFixedSizeBinary(value);
}
130
+
131
/** Type guard: true when `value` is a `FixedSizeList` instance or passes `DataType.isFixedSizeList`. */
export function isFixedSizeList(value: unknown): value is FixedSizeList {
  if (value instanceof FixedSizeList) {
    return true;
  }
  return DataType.isFixedSizeList(value);
}
38
134
 
39
135
  /** Data type accepted by NodeJS SDK */
40
136
  export type Data = Record<string, unknown>[] | ArrowTable;
@@ -85,6 +181,7 @@ export class MakeArrowTableOptions {
85
181
  vectorColumns: Record<string, VectorColumnOptions> = {
86
182
  vector: new VectorColumnOptions(),
87
183
  };
184
+ embeddings?: EmbeddingFunction<unknown>;
88
185
 
89
186
  /**
90
187
  * If true then string columns will be encoded with dictionary encoding
@@ -197,6 +294,7 @@ export class MakeArrowTableOptions {
197
294
  export function makeArrowTable(
198
295
  data: Array<Record<string, unknown>>,
199
296
  options?: Partial<MakeArrowTableOptions>,
297
+ metadata?: Map<string, string>,
200
298
  ): ArrowTable {
201
299
  if (
202
300
  data.length === 0 &&
@@ -208,6 +306,7 @@ export function makeArrowTable(
208
306
  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
209
307
  if (opt.schema !== undefined && opt.schema !== null) {
210
308
  opt.schema = sanitizeSchema(opt.schema);
309
+ opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
211
310
  }
212
311
  const columns: Record<string, Vector> = {};
213
312
  // TODO: sample dataset to find missing columns
@@ -287,21 +386,42 @@ export function makeArrowTable(
287
386
  // then patch the schema of the batches so we can use
288
387
  // `new ArrowTable(schema, batches)` which does not do any schema inference
289
388
  const firstTable = new ArrowTable(columns);
290
- // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
291
389
  const batchesFixed = firstTable.batches.map(
292
390
  (batch) => new RecordBatch(opt.schema!, batch.data),
293
391
  );
294
- return new ArrowTable(opt.schema, batchesFixed);
295
- } else {
296
- return new ArrowTable(columns);
392
+ let schema: Schema;
393
+ if (metadata !== undefined) {
394
+ let schemaMetadata = opt.schema.metadata;
395
+ if (schemaMetadata.size === 0) {
396
+ schemaMetadata = metadata;
397
+ } else {
398
+ for (const [key, entry] of schemaMetadata.entries()) {
399
+ schemaMetadata.set(key, entry);
400
+ }
401
+ }
402
+
403
+ schema = new Schema(opt.schema.fields, schemaMetadata);
404
+ } else {
405
+ schema = opt.schema;
406
+ }
407
+ return new ArrowTable(schema, batchesFixed);
408
+ }
409
+ const tbl = new ArrowTable(columns);
410
+ if (metadata !== undefined) {
411
+ // biome-ignore lint/suspicious/noExplicitAny: <explanation>
412
+ (<any>tbl.schema).metadata = metadata;
297
413
  }
414
+ return tbl;
298
415
  }
299
416
 
300
417
  /**
301
418
  * Create an empty Arrow table with the provided schema
302
419
  */
303
- export function makeEmptyTable(schema: Schema): ArrowTable {
304
- return makeArrowTable([], { schema });
420
export function makeEmptyTable(
  schema: Schema,
  metadata?: Map<string, string>,
): ArrowTable {
  // No rows: the table is built purely from the schema (and optional metadata).
  const noRows: Array<Record<string, unknown>> = [];
  return makeArrowTable(noRows, { schema }, metadata);
}
306
426
 
307
427
  /**
@@ -313,7 +433,7 @@ function makeListVector(lists: unknown[][]): Vector<unknown> {
313
433
  throw Error("Cannot infer list vector from empty array or empty list");
314
434
  }
315
435
  const sampleList = lists[0];
316
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
436
+ // biome-ignore lint/suspicious/noExplicitAny: skip
317
437
  let inferredType: any;
318
438
  try {
319
439
  const sampleVector = makeVector(sampleList);
@@ -337,7 +457,7 @@ function makeVector(
337
457
  values: unknown[],
338
458
  type?: DataType,
339
459
  stringAsDictionary?: boolean,
340
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
460
+ // biome-ignore lint/suspicious/noExplicitAny: skip
341
461
  ): Vector<any> {
342
462
  if (type !== undefined) {
343
463
  // No need for inference, let Arrow create it
@@ -373,13 +493,75 @@ function makeVector(
373
493
  }
374
494
  }
375
495
 
496
/**
 * Helper function to apply embeddings from metadata to an input table.
 *
 * The embedding functions are obtained by passing `schema.metadata` to the
 * registry's `parseFunctions`. For each entry the source column is embedded
 * with `computeSourceEmbeddings` and stored under the entry's `vectorColumn`
 * (default `"vector"`), using the FixedSizeList type declared for that column
 * in `schema`. The resulting table is passed through `alignTable`.
 *
 * @param table - input table; must contain every source column and none of
 *   the destination columns
 * @param schema - schema carrying the embedding-function metadata and the
 *   destination column types
 * @returns a new table including the computed vector columns
 * @throws Error if a source column is missing, a destination column already
 *   exists, the table has more than one batch, the embedding function returns
 *   the wrong number of vectors, or the destination column is absent from
 *   `schema` or is not a FixedSizeList
 */
async function applyEmbeddingsFromMetadata(
  table: ArrowTable,
  schema: Schema,
): Promise<ArrowTable> {
  const registry = getRegistry();
  const functions = registry.parseFunctions(schema.metadata);

  const columns = Object.fromEntries(
    table.schema.fields.map((field) => [
      field.name,
      table.getChild(field.name)!,
    ]),
  );

  for (const functionEntry of functions.values()) {
    const sourceColumn = columns[functionEntry.sourceColumn];
    const destColumn = functionEntry.vectorColumn ?? "vector";
    if (sourceColumn === undefined) {
      throw new Error(
        `Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
      );
    }
    if (columns[destColumn] !== undefined) {
      throw new Error(
        `Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
      );
    }
    if (table.batches.length > 1) {
      throw new Error(
        "Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
      );
    }
    const values = sourceColumn.toArray();

    const vectors =
      await functionEntry.function.computeSourceEmbeddings(values);
    if (vectors.length !== values.length) {
      throw new Error(
        "Embedding function did not return an embedding for each input element",
      );
    }

    // Look the destination field up explicitly: a missing field previously
    // surfaced as an opaque "cannot read properties of undefined" TypeError.
    const destField = schema.fields.find((f) => f.name === destColumn);
    if (destField === undefined) {
      throw new Error(
        `Cannot apply embedding function because the vector column '${destColumn}' was not present in the schema`,
      );
    }
    const dtype = destField.type;
    if (!isFixedSizeList(dtype)) {
      throw new Error(
        "Expected FixedSizeList as datatype for vector field, instead got: " +
          dtype,
      );
    }
    const destType: DataType = sanitizeType(dtype);

    columns[destColumn] = makeVector(vectors, destType);
  }
  const newTable = new ArrowTable(columns);
  return alignTable(newTable, schema);
}
555
+
376
556
  /** Helper function to apply embeddings to an input table */
377
557
  async function applyEmbeddings<T>(
378
558
  table: ArrowTable,
379
- embeddings?: EmbeddingFunction<T>,
559
+ embeddings?: EmbeddingFunctionConfig,
380
560
  schema?: Schema,
381
561
  ): Promise<ArrowTable> {
382
- if (embeddings == null) {
562
+ if (schema?.metadata.has("embedding_functions")) {
563
+ return applyEmbeddingsFromMetadata(table, schema!);
564
+ } else if (embeddings == null || embeddings === undefined) {
383
565
  return table;
384
566
  }
385
567
 
@@ -397,8 +579,9 @@ async function applyEmbeddings<T>(
397
579
  const newColumns = Object.fromEntries(colEntries);
398
580
 
399
581
  const sourceColumn = newColumns[embeddings.sourceColumn];
400
- const destColumn = embeddings.destColumn ?? "vector";
401
- const innerDestType = embeddings.embeddingDataType ?? new Float32();
582
+ const destColumn = embeddings.vectorColumn ?? "vector";
583
+ const innerDestType =
584
+ embeddings.function.embeddingDataType() ?? new Float32();
402
585
  if (sourceColumn === undefined) {
403
586
  throw new Error(
404
587
  `Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`,
@@ -412,11 +595,9 @@ async function applyEmbeddings<T>(
412
595
  // if we call convertToTable with 0 records and a schema that includes the embedding
413
596
  return table;
414
597
  }
415
- if (embeddings.embeddingDimension !== undefined) {
416
- const destType = newVectorType(
417
- embeddings.embeddingDimension,
418
- innerDestType,
419
- );
598
+ const dimensions = embeddings.function.ndims();
599
+ if (dimensions !== undefined) {
600
+ const destType = newVectorType(dimensions, innerDestType);
420
601
  newColumns[destColumn] = makeVector([], destType);
421
602
  } else if (schema != null) {
422
603
  const destField = schema.fields.find((f) => f.name === destColumn);
@@ -444,7 +625,9 @@ async function applyEmbeddings<T>(
444
625
  );
445
626
  }
446
627
  const values = sourceColumn.toArray();
447
- const vectors = await embeddings.embed(values as T[]);
628
+ const vectors = await embeddings.function.computeSourceEmbeddings(
629
+ values as T[],
630
+ );
448
631
  if (vectors.length !== values.length) {
449
632
  throw new Error(
450
633
  "Embedding function did not return an embedding for each input element",
@@ -484,9 +667,9 @@ async function applyEmbeddings<T>(
484
667
  * embedding columns. If no schema is provided then embedding columns will
485
668
  * be placed at the end of the table, after all of the input columns.
486
669
  */
487
- export async function convertToTable<T>(
670
+ export async function convertToTable(
488
671
  data: Array<Record<string, unknown>>,
489
- embeddings?: EmbeddingFunction<T>,
672
+ embeddings?: EmbeddingFunctionConfig,
490
673
  makeTableOptions?: Partial<MakeArrowTableOptions>,
491
674
  ): Promise<ArrowTable> {
492
675
  const table = makeArrowTable(data, makeTableOptions);
@@ -494,13 +677,13 @@ export async function convertToTable<T>(
494
677
  }
495
678
 
496
679
  /** Creates the Arrow Type for a Vector column with dimension `dim` */
497
- function newVectorType<T extends Float>(
680
export function newVectorType<T extends Float>(
  dim: number,
  innerType: T,
): FixedSizeList<T> {
  // Lance defaults to nullable list elements, so the item field is created as
  // nullable; otherwise schemas often mismatch against stored data, whose
  // schema always has nullable elements.
  const elementType = sanitizeType(innerType) as T;
  const itemField = new Field("item", elementType, true);
  return new FixedSizeList(dim, itemField);
}
506
689
 
@@ -511,9 +694,9 @@ function newVectorType<T extends Float>(
511
694
  *
512
695
  * `schema` is required if data is empty
513
696
  */
514
- export async function fromRecordsToBuffer<T>(
697
+ export async function fromRecordsToBuffer(
515
698
  data: Array<Record<string, unknown>>,
516
- embeddings?: EmbeddingFunction<T>,
699
+ embeddings?: EmbeddingFunctionConfig,
517
700
  schema?: Schema,
518
701
  ): Promise<Buffer> {
519
702
  if (schema !== undefined && schema !== null) {
@@ -531,9 +714,9 @@ export async function fromRecordsToBuffer<T>(
531
714
  *
532
715
  * `schema` is required if data is empty
533
716
  */
534
- export async function fromRecordsToStreamBuffer<T>(
717
+ export async function fromRecordsToStreamBuffer(
535
718
  data: Array<Record<string, unknown>>,
536
- embeddings?: EmbeddingFunction<T>,
719
+ embeddings?: EmbeddingFunctionConfig,
537
720
  schema?: Schema,
538
721
  ): Promise<Buffer> {
539
722
  if (schema !== undefined && schema !== null) {
@@ -552,9 +735,9 @@ export async function fromRecordsToStreamBuffer<T>(
552
735
  *
553
736
  * `schema` is required if the table is empty
554
737
  */
555
- export async function fromTableToBuffer<T>(
738
+ export async function fromTableToBuffer(
556
739
  table: ArrowTable,
557
- embeddings?: EmbeddingFunction<T>,
740
+ embeddings?: EmbeddingFunctionConfig,
558
741
  schema?: Schema,
559
742
  ): Promise<Buffer> {
560
743
  if (schema !== undefined && schema !== null) {
@@ -573,19 +756,19 @@ export async function fromTableToBuffer<T>(
573
756
  *
574
757
  * `schema` is required if the table is empty
575
758
  */
576
- export async function fromDataToBuffer<T>(
759
export async function fromDataToBuffer(
  data: Data,
  embeddings?: EmbeddingFunctionConfig,
  schema?: Schema,
): Promise<Buffer> {
  // Sanitize the schema only when one was actually supplied.
  const cleanSchema =
    schema !== undefined && schema !== null ? sanitizeSchema(schema) : schema;

  if (!isArrowTable(data)) {
    // Records: embeddings and schema are applied during conversion, so the
    // resulting table is serialized without passing them again.
    const converted = await convertToTable(data, embeddings, {
      schema: cleanSchema,
    });
    return fromTableToBuffer(converted);
  }

  // Already an Arrow table: serialize it, applying embeddings/schema there.
  return fromTableToBuffer(data, embeddings, cleanSchema);
}
591
774
 
@@ -597,9 +780,9 @@ export async function fromDataToBuffer<T>(
597
780
  *
598
781
  * `schema` is required if the table is empty
599
782
  */
600
- export async function fromTableToStreamBuffer<T>(
783
+ export async function fromTableToStreamBuffer(
601
784
  table: ArrowTable,
602
- embeddings?: EmbeddingFunction<T>,
785
+ embeddings?: EmbeddingFunctionConfig,
603
786
  schema?: Schema,
604
787
  ): Promise<Buffer> {
605
788
  const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
@@ -648,3 +831,54 @@ function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
648
831
export function createEmptyTable(schema: Schema): ArrowTable {
  // Build the table from the sanitized form of the caller's schema.
  const sanitized = sanitizeSchema(schema);
  return new ArrowTable(sanitized);
}
834
+
835
/**
 * Validates the FixedSizeList (vector) columns of `schema` against the data
 * and returns the schema reduced to the fields kept by the rules below.
 *
 * For each field of `schema`:
 * - non-FixedSizeList fields are kept as-is;
 * - FixedSizeList fields are sanitized and kept when `data` is empty or the
 *   first row has a value for them;
 * - FixedSizeList fields absent from the first row are left out of the
 *   returned schema. Unless an entry of the `"embedding_functions"` metadata
 *   names the field as its `vectorColumn`, they also count as "missing".
 *
 * @param schema - schema to validate
 * @param data - rows about to be converted; only the first row is inspected
 * @param embeddings - embedding function supplied by the caller, if any
 * @returns a new `Schema` with the kept fields and the original metadata
 * @throws Error if there are "missing" fields and `embeddings` is `undefined`,
 *   or if the `"embedding_functions"` metadata is not a JSON array
 */
function validateSchemaEmbeddings(
  schema: Schema,
  data: Array<Record<string, unknown>>,
  embeddings: EmbeddingFunction<unknown> | undefined,
): Schema {
  const fields: Field[] = [];
  const missingEmbeddingFields: Field[] = [];

  // Vector columns named by the "embedding_functions" metadata. Parsed lazily
  // and at most once, so the metadata is only read when a vector column is
  // actually absent from the data (as before) rather than once per field.
  let metadataVectorColumns: Set<unknown> | undefined;
  const isCoveredByMetadata = (columnName: string): boolean => {
    if (metadataVectorColumns === undefined) {
      const parsed: unknown = JSON.parse(
        schema.metadata.get("embedding_functions")!,
      );
      if (!Array.isArray(parsed)) {
        throw new Error(
          'Schema metadata "embedding_functions" must be a JSON array',
        );
      }
      metadataVectorColumns = new Set(
        parsed.map((entry: unknown) =>
          typeof entry === "object" && entry !== null
            ? (entry as Record<string, unknown>)["vectorColumn"]
            : undefined,
        ),
      );
    }
    return metadataVectorColumns.has(columnName);
  };

  // First we check if the field is a `FixedSizeList`
  // Then we check if the data contains the field
  // if it does not, we add it to the list of missing embedding fields
  // unless the schema metadata declares an embedding function for it.
  // Finally, if any fields are still missing and no `embeddings` argument
  // was provided, we throw an error
  for (const field of schema.fields) {
    if (!isFixedSizeList(field.type)) {
      fields.push(field);
      continue;
    }

    const vectorField = sanitizeField(field);
    const absentFromData =
      data.length !== 0 && data[0]?.[vectorField.name] === undefined;
    if (!absentFromData) {
      fields.push(vectorField);
      continue;
    }

    if (
      !schema.metadata.has("embedding_functions") ||
      !isCoveredByMetadata(vectorField.name)
    ) {
      missingEmbeddingFields.push(vectorField);
    }
  }

  if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
    throw new Error(
      `Table has embeddings: "${missingEmbeddingFields
        .map((f) => f.name)
        .join(",")}", but no embedding function was provided`,
    );
  }

  return new Schema(fields, schema.metadata);
}
@@ -12,10 +12,16 @@
12
12
  // See the License for the specific language governing permissions and
13
13
  // limitations under the License.
14
14
 
15
- import { fromTableToBuffer, makeArrowTable, makeEmptyTable } from "./arrow";
15
+ import { Table as ArrowTable, Schema } from "./arrow";
16
+ import {
17
+ fromTableToBuffer,
18
+ isArrowTable,
19
+ makeArrowTable,
20
+ makeEmptyTable,
21
+ } from "./arrow";
22
+ import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
16
23
  import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
17
24
  import { Table } from "./table";
18
- import { Table as ArrowTable, Schema } from "apache-arrow";
19
25
 
20
26
  /**
21
27
  * Connect to a LanceDB instance at the given URI.
@@ -65,6 +71,8 @@ export interface CreateTableOptions {
65
71
  * The available options are described at https://lancedb.github.io/lancedb/guides/storage/
66
72
  */
67
73
  storageOptions?: Record<string, string>;
74
+ schema?: Schema;
75
+ embeddingFunction?: EmbeddingFunctionConfig;
68
76
  }
69
77
 
70
78
  export interface OpenTableOptions {
@@ -174,6 +182,7 @@ export class Connection {
174
182
  cleanseStorageOptions(options?.storageOptions),
175
183
  options?.indexCacheSize,
176
184
  );
185
+
177
186
  return new Table(innerTable);
178
187
  }
179
188
 
@@ -196,18 +205,24 @@ export class Connection {
196
205
  }
197
206
 
198
207
  let table: ArrowTable;
199
- if (data instanceof ArrowTable) {
208
+ if (isArrowTable(data)) {
200
209
  table = data;
201
210
  } else {
202
- table = makeArrowTable(data);
211
+ table = makeArrowTable(data, options);
203
212
  }
204
- const buf = await fromTableToBuffer(table);
213
+
214
+ const buf = await fromTableToBuffer(
215
+ table,
216
+ options?.embeddingFunction,
217
+ options?.schema,
218
+ );
205
219
  const innerTable = await this.inner.createTable(
206
220
  name,
207
221
  buf,
208
222
  mode,
209
223
  cleanseStorageOptions(options?.storageOptions),
210
224
  );
225
+
211
226
  return new Table(innerTable);
212
227
  }
213
228
 
@@ -227,8 +242,14 @@ export class Connection {
227
242
  if (mode === "create" && existOk) {
228
243
  mode = "exist_ok";
229
244
  }
245
+ let metadata: Map<string, string> | undefined = undefined;
246
+ if (options?.embeddingFunction !== undefined) {
247
+ const embeddingFunction = options.embeddingFunction;
248
+ const registry = getRegistry();
249
+ metadata = registry.getTableMetadata([embeddingFunction]);
250
+ }
230
251
 
231
- const table = makeEmptyTable(schema);
252
+ const table = makeEmptyTable(schema, metadata);
232
253
  const buf = await fromTableToBuffer(table);
233
254
  const innerTable = await this.inner.createEmptyTable(
234
255
  name,