@lancedb/lancedb 0.4.20 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +5 -14
  2. package/biome.json +142 -0
  3. package/dist/arrow.d.ts +35 -9
  4. package/dist/arrow.js +247 -19
  5. package/dist/connection.d.ts +4 -1
  6. package/dist/connection.js +11 -5
  7. package/dist/embedding/embedding_function.d.ts +54 -28
  8. package/dist/embedding/embedding_function.js +71 -10
  9. package/dist/embedding/index.d.ts +28 -2
  10. package/dist/embedding/index.js +111 -4
  11. package/dist/embedding/openai.d.ts +16 -7
  12. package/dist/embedding/openai.js +62 -12
  13. package/dist/embedding/registry.d.ts +54 -0
  14. package/dist/embedding/registry.js +123 -0
  15. package/dist/native.d.ts +26 -0
  16. package/dist/query.d.ts +1 -1
  17. package/dist/query.js +7 -6
  18. package/dist/sanitize.d.ts +22 -1
  19. package/dist/sanitize.js +126 -113
  20. package/dist/table.d.ts +50 -4
  21. package/dist/table.js +47 -5
  22. package/lancedb/arrow.ts +283 -49
  23. package/lancedb/connection.ts +27 -6
  24. package/lancedb/embedding/embedding_function.ts +126 -42
  25. package/lancedb/embedding/index.ts +113 -2
  26. package/lancedb/embedding/openai.ts +62 -16
  27. package/lancedb/embedding/registry.ts +172 -0
  28. package/lancedb/query.ts +9 -6
  29. package/lancedb/sanitize.ts +62 -62
  30. package/lancedb/table.ts +72 -5
  31. package/nodejs-artifacts/arrow.d.ts +35 -9
  32. package/nodejs-artifacts/arrow.js +247 -19
  33. package/nodejs-artifacts/connection.d.ts +4 -1
  34. package/nodejs-artifacts/connection.js +11 -5
  35. package/nodejs-artifacts/embedding/embedding_function.d.ts +54 -28
  36. package/nodejs-artifacts/embedding/embedding_function.js +71 -10
  37. package/nodejs-artifacts/embedding/index.d.ts +28 -2
  38. package/nodejs-artifacts/embedding/index.js +111 -4
  39. package/nodejs-artifacts/embedding/openai.d.ts +16 -7
  40. package/nodejs-artifacts/embedding/openai.js +62 -12
  41. package/nodejs-artifacts/embedding/registry.d.ts +54 -0
  42. package/nodejs-artifacts/embedding/registry.js +123 -0
  43. package/nodejs-artifacts/native.d.ts +26 -0
  44. package/nodejs-artifacts/query.d.ts +1 -1
  45. package/nodejs-artifacts/query.js +7 -6
  46. package/nodejs-artifacts/sanitize.d.ts +22 -1
  47. package/nodejs-artifacts/sanitize.js +126 -113
  48. package/nodejs-artifacts/table.d.ts +50 -4
  49. package/nodejs-artifacts/table.js +47 -5
  50. package/package.json +23 -21
  51. package/tsconfig.json +3 -1
  52. package/.eslintignore +0 -3
  53. package/eslint.config.js +0 -28
@@ -12,10 +12,133 @@
12
12
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
13
  // See the License for the specific language governing permissions and
14
14
  // limitations under the License.
15
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
16
+ if (k2 === undefined) k2 = k;
17
+ var desc = Object.getOwnPropertyDescriptor(m, k);
18
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
19
+ desc = { enumerable: true, get: function() { return m[k]; } };
20
+ }
21
+ Object.defineProperty(o, k2, desc);
22
+ }) : (function(o, m, k, k2) {
23
+ if (k2 === undefined) k2 = k;
24
+ o[k2] = m[k];
25
+ }));
26
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
27
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
28
+ };
15
29
  Object.defineProperty(exports, "__esModule", { value: true });
16
- exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = void 0;
30
+ exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.newVectorType = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = exports.isFixedSizeList = exports.isFixedSizeBinary = exports.isUnion = exports.isStruct = exports.isList = exports.isDuration = exports.isInterval = exports.isTimestamp = exports.isTime = exports.isDate = exports.isDecimal = exports.isBool = exports.isLargeUtf8 = exports.isUtf8 = exports.isLargeBinary = exports.isBinary = exports.isFloat = exports.isInt = exports.isNull = exports.isDataType = exports.isArrowTable = void 0;
17
31
  const apache_arrow_1 = require("apache-arrow");
32
+ const registry_1 = require("./embedding/registry");
18
33
  const sanitize_1 = require("./sanitize");
34
+ __exportStar(require("apache-arrow"), exports);
35
/**
 * Test whether `value` is an Arrow table.
 *
 * A value qualifies when it is an instance of the apache-arrow `Table` class
 * imported by this module, or when it is an object exposing both a `schema`
 * and a `batches` property.
 *
 * @param value - The value to inspect; any value is accepted.
 * @returns `true` when `value` looks like an Arrow table, `false` otherwise.
 */
function isArrowTable(value) {
    if (value instanceof apache_arrow_1.Table) {
        return true;
    }
    // The `in` operator throws a TypeError for null, undefined and primitives,
    // so anything that cannot carry properties is rejected up front.
    if (value === null ||
        (typeof value !== "object" && typeof value !== "function")) {
        return false;
    }
    return "schema" in value && "batches" in value;
}
40
+ exports.isArrowTable = isArrowTable;
41
/**
 * Test whether `value` is an Arrow data type.
 *
 * Returns true when `value` is an instance of the apache-arrow `DataType`
 * class imported by this module, or when any of the library's static
 * `DataType.isXxx` predicates listed below accepts it. The predicates are
 * tried in the order written and evaluation stops at the first match.
 * NOTE(review): the predicate fallback presumably exists so that types built
 * by a different copy of apache-arrow are still recognised - confirm.
 */
+ function isDataType(value) {
41
+ return (value instanceof apache_arrow_1.DataType ||
42
+ apache_arrow_1.DataType.isNull(value) ||
43
+ apache_arrow_1.DataType.isInt(value) ||
44
+ apache_arrow_1.DataType.isFloat(value) ||
45
+ apache_arrow_1.DataType.isBinary(value) ||
46
+ apache_arrow_1.DataType.isLargeBinary(value) ||
47
+ apache_arrow_1.DataType.isUtf8(value) ||
48
+ apache_arrow_1.DataType.isLargeUtf8(value) ||
49
+ apache_arrow_1.DataType.isBool(value) ||
50
+ apache_arrow_1.DataType.isDecimal(value) ||
51
+ apache_arrow_1.DataType.isDate(value) ||
52
+ apache_arrow_1.DataType.isTime(value) ||
53
+ apache_arrow_1.DataType.isTimestamp(value) ||
54
+ apache_arrow_1.DataType.isInterval(value) ||
55
+ apache_arrow_1.DataType.isDuration(value) ||
56
+ apache_arrow_1.DataType.isList(value) ||
57
+ apache_arrow_1.DataType.isStruct(value) ||
58
+ apache_arrow_1.DataType.isUnion(value) ||
59
+ apache_arrow_1.DataType.isFixedSizeBinary(value) ||
60
+ apache_arrow_1.DataType.isFixedSizeList(value) ||
61
+ apache_arrow_1.DataType.isMap(value) ||
62
+ apache_arrow_1.DataType.isDictionary(value));
63
+ }
65
+ exports.isDataType = isDataType;
66
/** True when `value` is an apache-arrow `Null` instance or `DataType.isNull(value)` accepts it. */
+ function isNull(value) {
67
+ return value instanceof apache_arrow_1.Null || apache_arrow_1.DataType.isNull(value);
68
+ }
69
+ exports.isNull = isNull;
70
/** True when `value` is an apache-arrow `Int` instance or `DataType.isInt(value)` accepts it. */
+ function isInt(value) {
71
+ return value instanceof apache_arrow_1.Int || apache_arrow_1.DataType.isInt(value);
72
+ }
73
+ exports.isInt = isInt;
74
/** True when `value` is an apache-arrow `Float` instance or `DataType.isFloat(value)` accepts it. */
+ function isFloat(value) {
75
+ return value instanceof apache_arrow_1.Float || apache_arrow_1.DataType.isFloat(value);
76
+ }
77
+ exports.isFloat = isFloat;
78
/** True when `value` is an apache-arrow `Binary` instance or `DataType.isBinary(value)` accepts it. */
+ function isBinary(value) {
79
+ return value instanceof apache_arrow_1.Binary || apache_arrow_1.DataType.isBinary(value);
80
+ }
81
+ exports.isBinary = isBinary;
82
/** True when `value` is an apache-arrow `LargeBinary` instance or `DataType.isLargeBinary(value)` accepts it. */
+ function isLargeBinary(value) {
83
+ return value instanceof apache_arrow_1.LargeBinary || apache_arrow_1.DataType.isLargeBinary(value);
84
+ }
85
+ exports.isLargeBinary = isLargeBinary;
86
/** True when `value` is an apache-arrow `Utf8` instance or `DataType.isUtf8(value)` accepts it. */
+ function isUtf8(value) {
87
+ return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isUtf8(value);
88
+ }
89
+ exports.isUtf8 = isUtf8;
90
/**
 * Test whether `value` is an Arrow `LargeUtf8` data type.
 *
 * @param value - The value to inspect.
 * @returns `true` when `value` is an apache-arrow `LargeUtf8` instance or
 * `DataType.isLargeUtf8(value)` accepts it.
 */
function isLargeUtf8(value) {
    // Compare against LargeUtf8, not Utf8: a plain Utf8 type must not be
    // reported as a large string type.
    return (value instanceof apache_arrow_1.LargeUtf8 ||
        apache_arrow_1.DataType.isLargeUtf8(value));
}
93
+ exports.isLargeUtf8 = isLargeUtf8;
94
/**
 * Test whether `value` is an Arrow `Bool` data type.
 *
 * @param value - The value to inspect.
 * @returns `true` when `value` is an apache-arrow `Bool` instance or
 * `DataType.isBool(value)` accepts it.
 */
function isBool(value) {
    // Compare against Bool, not Utf8: a string type is not a boolean type.
    return (value instanceof apache_arrow_1.Bool ||
        apache_arrow_1.DataType.isBool(value));
}
97
+ exports.isBool = isBool;
98
/**
 * Test whether `value` is an Arrow `Decimal` data type.
 *
 * @param value - The value to inspect.
 * @returns `true` when `value` is an apache-arrow `Decimal` instance or
 * `DataType.isDecimal(value)` accepts it.
 */
function isDecimal(value) {
    // Compare against Decimal, not Utf8: a string type is not a decimal type.
    return (value instanceof apache_arrow_1.Decimal ||
        apache_arrow_1.DataType.isDecimal(value));
}
101
+ exports.isDecimal = isDecimal;
102
/**
 * Test whether `value` is an Arrow date data type.
 *
 * @param value - The value to inspect.
 * @returns `true` when `value` is an instance of the apache-arrow date type
 * class or `DataType.isDate(value)` accepts it.
 */
function isDate(value) {
    // apache-arrow exports its date type class as `Date_` (the trailing
    // underscore avoids clashing with the built-in `Date`). Compare against
    // it rather than Utf8, which is a string type.
    return (value instanceof apache_arrow_1.Date_ ||
        apache_arrow_1.DataType.isDate(value));
}
105
+ exports.isDate = isDate;
106
/**
 * Test whether `value` is an Arrow `Time` data type.
 *
 * @param value - The value to inspect.
 * @returns `true` when `value` is an apache-arrow `Time` instance or
 * `DataType.isTime(value)` accepts it.
 */
function isTime(value) {
    // Compare against Time, not Utf8: a string type is not a time type.
    return (value instanceof apache_arrow_1.Time ||
        apache_arrow_1.DataType.isTime(value));
}
109
+ exports.isTime = isTime;
110
/**
 * Test whether `value` is an Arrow `Timestamp` data type.
 *
 * @param value - The value to inspect.
 * @returns `true` when `value` is an apache-arrow `Timestamp` instance or
 * `DataType.isTimestamp(value)` accepts it.
 */
function isTimestamp(value) {
    // Compare against Timestamp, not Utf8: a string type is not a timestamp.
    return (value instanceof apache_arrow_1.Timestamp ||
        apache_arrow_1.DataType.isTimestamp(value));
}
113
+ exports.isTimestamp = isTimestamp;
114
/**
 * Test whether `value` is an Arrow `Interval` data type.
 *
 * @param value - The value to inspect.
 * @returns `true` when `value` is an apache-arrow `Interval` instance or
 * `DataType.isInterval(value)` accepts it.
 */
function isInterval(value) {
    // Compare against Interval, not Utf8: a string type is not an interval.
    return (value instanceof apache_arrow_1.Interval ||
        apache_arrow_1.DataType.isInterval(value));
}
117
+ exports.isInterval = isInterval;
118
/**
 * Test whether `value` is an Arrow `Duration` data type.
 *
 * @param value - The value to inspect.
 * @returns `true` when `value` is an apache-arrow `Duration` instance or
 * `DataType.isDuration(value)` accepts it.
 */
function isDuration(value) {
    // Compare against Duration, not Utf8: a string type is not a duration.
    return (value instanceof apache_arrow_1.Duration ||
        apache_arrow_1.DataType.isDuration(value));
}
121
+ exports.isDuration = isDuration;
122
/** True when `value` is an apache-arrow `List` instance or `DataType.isList(value)` accepts it. */
+ function isList(value) {
123
+ return value instanceof apache_arrow_1.List || apache_arrow_1.DataType.isList(value);
124
+ }
125
+ exports.isList = isList;
126
/** True when `value` is an apache-arrow `Struct` instance or `DataType.isStruct(value)` accepts it. */
+ function isStruct(value) {
127
+ return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isStruct(value);
128
+ }
129
+ exports.isStruct = isStruct;
130
/**
 * Test whether `value` is an Arrow `Union` data type.
 *
 * @param value - The value to inspect.
 * @returns `true` when `value` is an apache-arrow `Union` instance or
 * `DataType.isUnion(value)` accepts it.
 */
function isUnion(value) {
    // Compare against Union, not Struct: a struct type must not be reported
    // as a union type.
    return (value instanceof apache_arrow_1.Union ||
        apache_arrow_1.DataType.isUnion(value));
}
133
+ exports.isUnion = isUnion;
134
/** True when `value` is an apache-arrow `FixedSizeBinary` instance or `DataType.isFixedSizeBinary(value)` accepts it. */
+ function isFixedSizeBinary(value) {
135
+ return value instanceof apache_arrow_1.FixedSizeBinary || apache_arrow_1.DataType.isFixedSizeBinary(value);
136
+ }
137
+ exports.isFixedSizeBinary = isFixedSizeBinary;
138
/** True when `value` is an apache-arrow `FixedSizeList` instance or `DataType.isFixedSizeList(value)` accepts it. */
+ function isFixedSizeList(value) {
139
+ return value instanceof apache_arrow_1.FixedSizeList || apache_arrow_1.DataType.isFixedSizeList(value);
140
+ }
141
+ exports.isFixedSizeList = isFixedSizeList;
19
142
  /*
20
143
  * Options to control how a column should be converted to a vector array
21
144
  */
@@ -60,6 +183,7 @@ class MakeArrowTableOptions {
60
183
  vectorColumns = {
61
184
  vector: new VectorColumnOptions(),
62
185
  };
186
+ embeddings;
63
187
  /**
64
188
  * If true then string columns will be encoded with dictionary encoding
65
189
  *
@@ -167,7 +291,7 @@ exports.MakeArrowTableOptions = MakeArrowTableOptions;
167
291
  * assert.deepEqual(table.schema, schema)
168
292
  * ```
169
293
  */
170
- function makeArrowTable(data, options) {
294
+ function makeArrowTable(data, options, metadata) {
171
295
  if (data.length === 0 &&
172
296
  (options?.schema === undefined || options?.schema === null)) {
173
297
  throw new Error("At least one record or a schema needs to be provided");
@@ -175,6 +299,7 @@ function makeArrowTable(data, options) {
175
299
  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
176
300
  if (opt.schema !== undefined && opt.schema !== null) {
177
301
  opt.schema = (0, sanitize_1.sanitizeSchema)(opt.schema);
302
+ opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
178
303
  }
179
304
  const columns = {};
180
305
  // TODO: sample dataset to find missing columns
@@ -244,20 +369,38 @@ function makeArrowTable(data, options) {
244
369
  // then patch the schema of the batches so we can use
245
370
  // `new ArrowTable(schema, batches)` which does not do any schema inference
246
371
  const firstTable = new apache_arrow_1.Table(columns);
247
- // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
248
372
  const batchesFixed = firstTable.batches.map((batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
249
- return new apache_arrow_1.Table(opt.schema, batchesFixed);
373
+ let schema;
374
+ if (metadata !== undefined) {
375
+ let schemaMetadata = opt.schema.metadata;
376
+ if (schemaMetadata.size === 0) {
377
+ schemaMetadata = metadata;
378
+ }
379
+ else {
380
+ for (const [key, entry] of schemaMetadata.entries()) {
381
+ schemaMetadata.set(key, entry);
382
+ }
383
+ }
384
+ schema = new apache_arrow_1.Schema(opt.schema.fields, schemaMetadata);
385
+ }
386
+ else {
387
+ schema = opt.schema;
388
+ }
389
+ return new apache_arrow_1.Table(schema, batchesFixed);
250
390
  }
251
- else {
252
- return new apache_arrow_1.Table(columns);
391
+ const tbl = new apache_arrow_1.Table(columns);
392
+ if (metadata !== undefined) {
393
+ // biome-ignore lint/suspicious/noExplicitAny: <explanation>
394
+ tbl.schema.metadata = metadata;
253
395
  }
396
+ return tbl;
254
397
  }
255
398
  exports.makeArrowTable = makeArrowTable;
256
399
  /**
257
400
  * Create an empty Arrow table with the provided schema
258
401
  */
259
- function makeEmptyTable(schema) {
260
- return makeArrowTable([], { schema });
402
+ function makeEmptyTable(schema, metadata) {
403
+ return makeArrowTable([], { schema }, metadata);
261
404
  }
262
405
  exports.makeEmptyTable = makeEmptyTable;
263
406
  /**
@@ -269,7 +412,7 @@ function makeListVector(lists) {
269
412
  throw Error("Cannot infer list vector from empty array or empty list");
270
413
  }
271
414
  const sampleList = lists[0];
272
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
415
+ // biome-ignore lint/suspicious/noExplicitAny: skip
273
416
  let inferredType;
274
417
  try {
275
418
  const sampleVector = makeVector(sampleList);
@@ -319,9 +462,52 @@ function makeVector(values, type, stringAsDictionary) {
319
462
  return (0, apache_arrow_1.vectorFromArray)(values);
320
463
  }
321
464
  }
465
/**
 * Helper function to apply embeddings from metadata to an input table.
 *
 * The embedding functions are obtained by handing `schema.metadata` to the
 * embedding registry's `parseFunctions`. For every function entry the source
 * column is read from `table`, embedded, and stored as a new vector column
 * named by the entry's `vectorColumn` (defaulting to "vector"). The resulting
 * table is aligned to `schema` before it is returned.
 *
 * @param table - The input table holding the source columns.
 * @param schema - The target schema; its metadata describes the embedding
 * functions and its fields must declare every vector column as a
 * FixedSizeList.
 * @returns A promise for the table with the vector columns added.
 * @throws Error when a source column is absent from `table`, when a vector
 * column already exists, when the table has more than one batch, when a
 * vector column is not declared in `schema` or is not a FixedSizeList, or
 * when the embedding function returns a different number of vectors than
 * input values.
 */
async function applyEmbeddingsFromMetadata(table, schema) {
    const registry = (0, registry_1.getRegistry)();
    const functions = registry.parseFunctions(schema.metadata);
    const columns = Object.fromEntries(table.schema.fields.map((field) => [
        field.name,
        table.getChild(field.name),
    ]));
    for (const functionEntry of functions.values()) {
        const sourceColumn = columns[functionEntry.sourceColumn];
        const destColumn = functionEntry.vectorColumn ?? "vector";
        if (sourceColumn === undefined) {
            throw new Error(`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`);
        }
        if (columns[destColumn] !== undefined) {
            throw new Error(`Attempt to apply embeddings to table failed because column ${destColumn} already existed`);
        }
        if (table.batches.length > 1) {
            throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
        }
        // Resolve and validate the destination type before calling the
        // embedding function, so a schema problem is reported with a clear
        // message and without first paying for the embedding call.
        const destField = schema.fields.find((f) => f.name === destColumn);
        if (destField === undefined) {
            throw new Error(`Cannot apply embedding function because the vector column '${destColumn}' is not declared in the schema`);
        }
        const dtype = destField.type;
        if (!isFixedSizeList(dtype)) {
            throw new Error("Expected FixedSizeList as datatype for vector field, instead got: " +
                dtype);
        }
        const destType = (0, sanitize_1.sanitizeType)(dtype);
        const values = sourceColumn.toArray();
        const vectors = await functionEntry.function.computeSourceEmbeddings(values);
        if (vectors.length !== values.length) {
            throw new Error("Embedding function did not return an embedding for each input element");
        }
        columns[destColumn] = makeVector(vectors, destType);
    }
    const newTable = new apache_arrow_1.Table(columns);
    return alignTable(newTable, schema);
}
322
505
  /** Helper function to apply embeddings to an input table */
323
506
  async function applyEmbeddings(table, embeddings, schema) {
324
- if (embeddings == null) {
507
+ if (schema?.metadata.has("embedding_functions")) {
508
+ return applyEmbeddingsFromMetadata(table, schema);
509
+ }
510
+ else if (embeddings == null || embeddings === undefined) {
325
511
  return table;
326
512
  }
327
513
  if (schema !== undefined && schema !== null) {
@@ -336,8 +522,8 @@ async function applyEmbeddings(table, embeddings, schema) {
336
522
  });
337
523
  const newColumns = Object.fromEntries(colEntries);
338
524
  const sourceColumn = newColumns[embeddings.sourceColumn];
339
- const destColumn = embeddings.destColumn ?? "vector";
340
- const innerDestType = embeddings.embeddingDataType ?? new apache_arrow_1.Float32();
525
+ const destColumn = embeddings.vectorColumn ?? "vector";
526
+ const innerDestType = embeddings.function.embeddingDataType() ?? new apache_arrow_1.Float32();
341
527
  if (sourceColumn === undefined) {
342
528
  throw new Error(`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`);
343
529
  }
@@ -348,8 +534,9 @@ async function applyEmbeddings(table, embeddings, schema) {
348
534
  // if we call convertToTable with 0 records and a schema that includes the embedding
349
535
  return table;
350
536
  }
351
- if (embeddings.embeddingDimension !== undefined) {
352
- const destType = newVectorType(embeddings.embeddingDimension, innerDestType);
537
+ const dimensions = embeddings.function.ndims();
538
+ if (dimensions !== undefined) {
539
+ const destType = newVectorType(dimensions, innerDestType);
353
540
  newColumns[destColumn] = makeVector([], destType);
354
541
  }
355
542
  else if (schema != null) {
@@ -373,7 +560,7 @@ async function applyEmbeddings(table, embeddings, schema) {
373
560
  throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
374
561
  }
375
562
  const values = sourceColumn.toArray();
376
- const vectors = await embeddings.embed(values);
563
+ const vectors = await embeddings.function.computeSourceEmbeddings(values);
377
564
  if (vectors.length !== values.length) {
378
565
  throw new Error("Embedding function did not return an embedding for each input element");
379
566
  }
@@ -416,9 +603,10 @@ exports.convertToTable = convertToTable;
416
603
  function newVectorType(dim, innerType) {
417
604
  // in Lance we always default to have the elements nullable, so we need to set it to true
418
605
  // otherwise we often get schema mismatches because the stored data always has schema with nullable elements
419
- const children = new apache_arrow_1.Field("item", innerType, true);
606
+ const children = new apache_arrow_1.Field("item", (0, sanitize_1.sanitizeType)(innerType), true);
420
607
  return new apache_arrow_1.FixedSizeList(dim, children);
421
608
  }
609
+ exports.newVectorType = newVectorType;
422
610
  /**
423
611
  * Serialize an Array of records into a buffer using the Arrow IPC File serialization
424
612
  *
@@ -480,12 +668,12 @@ async function fromDataToBuffer(data, embeddings, schema) {
480
668
  if (schema !== undefined && schema !== null) {
481
669
  schema = (0, sanitize_1.sanitizeSchema)(schema);
482
670
  }
483
- if (data instanceof apache_arrow_1.Table) {
671
+ if (isArrowTable(data)) {
484
672
  return fromTableToBuffer(data, embeddings, schema);
485
673
  }
486
674
  else {
487
- const table = await convertToTable(data);
488
- return fromTableToBuffer(table, embeddings, schema);
675
+ const table = await convertToTable(data, embeddings, { schema });
676
+ return fromTableToBuffer(table);
489
677
  }
490
678
  }
491
679
  exports.fromDataToBuffer = fromDataToBuffer;
@@ -537,3 +725,43 @@ function createEmptyTable(schema) {
537
725
  return new apache_arrow_1.Table((0, sanitize_1.sanitizeSchema)(schema));
538
726
  }
539
727
  exports.createEmptyTable = createEmptyTable;
728
/**
 * Validate the vector (`FixedSizeList`) columns of `schema` against `data`.
 *
 * Every non-vector field is kept as is. A vector field is sanitized and kept
 * when `data` is empty or the first record has a value for it. A vector field
 * that is absent from the first record is left out of the returned schema;
 * it counts as "missing" unless the schema's `embedding_functions` metadata
 * lists an entry whose `vectorColumn` is that field's name.
 *
 * @param schema - The schema to validate; `schema.metadata` must offer
 * `has`/`get` and `schema.fields` must be iterable.
 * @param data - The records being converted; only the first one is inspected.
 * @param embeddings - The embedding configuration supplied by the caller.
 * @returns A new schema holding the retained fields and the original
 * metadata.
 * @throws Error when vector fields are missing and `embeddings` is
 * `undefined`.
 */
function validateSchemaEmbeddings(schema, data, embeddings) {
    const fields = [];
    const missingEmbeddingFields = [];
    // The metadata JSON is parsed at most once, and only when a vector field
    // is actually absent from the data.
    let registeredFunctions;
    const hasRegisteredFunction = (columnName) => {
        if (registeredFunctions === undefined) {
            registeredFunctions = JSON.parse(schema.metadata.get("embedding_functions"));
        }
        return registeredFunctions.some((f) => f["vectorColumn"] === columnName);
    };
    for (const declaredField of schema.fields) {
        if (!isFixedSizeList(declaredField.type)) {
            fields.push(declaredField);
            continue;
        }
        const field = (0, sanitize_1.sanitizeField)(declaredField);
        const absentFromData = data.length !== 0 && data[0]?.[field.name] === undefined;
        if (!absentFromData) {
            fields.push(field);
            continue;
        }
        const coveredByMetadata = schema.metadata.has("embedding_functions") &&
            hasRegisteredFunction(field.name);
        if (!coveredByMetadata) {
            missingEmbeddingFields.push(field);
        }
    }
    if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
        throw new Error(`Table has embeddings: "${missingEmbeddingFields
            .map((f) => f.name)
            .join(",")}", but no embedding function was provided`);
    }
    return new apache_arrow_1.Schema(fields, schema.metadata);
}
@@ -1,6 +1,7 @@
1
+ import { Table as ArrowTable, Schema } from "./arrow";
2
+ import { EmbeddingFunctionConfig } from "./embedding/registry";
1
3
  import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
2
4
  import { Table } from "./table";
3
- import { Table as ArrowTable, Schema } from "apache-arrow";
4
5
  /**
5
6
  * Connect to a LanceDB instance at the given URI.
6
7
  *
@@ -39,6 +40,8 @@ export interface CreateTableOptions {
39
40
  * The available options are described at https://lancedb.github.io/lancedb/guides/storage/
40
41
  */
41
42
  storageOptions?: Record<string, string>;
43
+ schema?: Schema;
44
+ embeddingFunction?: EmbeddingFunctionConfig;
42
45
  }
43
46
  export interface OpenTableOptions {
44
47
  /**
@@ -15,9 +15,9 @@
15
15
  Object.defineProperty(exports, "__esModule", { value: true });
16
16
  exports.Connection = exports.connect = void 0;
17
17
  const arrow_1 = require("./arrow");
18
+ const registry_1 = require("./embedding/registry");
18
19
  const native_1 = require("./native");
19
20
  const table_1 = require("./table");
20
- const apache_arrow_1 = require("apache-arrow");
21
21
  /**
22
22
  * Connect to a LanceDB instance at the given URI.
23
23
  *
@@ -109,13 +109,13 @@ class Connection {
109
109
  mode = "exist_ok";
110
110
  }
111
111
  let table;
112
- if (data instanceof apache_arrow_1.Table) {
112
+ if ((0, arrow_1.isArrowTable)(data)) {
113
113
  table = data;
114
114
  }
115
115
  else {
116
- table = (0, arrow_1.makeArrowTable)(data);
116
+ table = (0, arrow_1.makeArrowTable)(data, options);
117
117
  }
118
- const buf = await (0, arrow_1.fromTableToBuffer)(table);
118
+ const buf = await (0, arrow_1.fromTableToBuffer)(table, options?.embeddingFunction, options?.schema);
119
119
  const innerTable = await this.inner.createTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
120
120
  return new table_1.Table(innerTable);
121
121
  }
@@ -130,7 +130,13 @@ class Connection {
130
130
  if (mode === "create" && existOk) {
131
131
  mode = "exist_ok";
132
132
  }
133
- const table = (0, arrow_1.makeEmptyTable)(schema);
133
+ let metadata = undefined;
134
+ if (options?.embeddingFunction !== undefined) {
135
+ const embeddingFunction = options.embeddingFunction;
136
+ const registry = (0, registry_1.getRegistry)();
137
+ metadata = registry.getTableMetadata([embeddingFunction]);
138
+ }
139
+ const table = (0, arrow_1.makeEmptyTable)(schema, metadata);
134
140
  const buf = await (0, arrow_1.fromTableToBuffer)(table);
135
141
  const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
136
142
  return new table_1.Table(innerTable);
@@ -1,45 +1,71 @@
1
- import { type Float } from "apache-arrow";
1
+ import "reflect-metadata";
2
+ import { DataType, Float } from "../arrow";
3
+ /**
4
+ * Options for a given embedding function
5
+ */
6
+ export interface FunctionOptions {
7
+ [key: string]: any;
8
+ }
2
9
  /**
3
10
  * An embedding function that automatically creates vector representation for a given column.
4
11
  */
5
- export interface EmbeddingFunction<T> {
12
+ export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptions = FunctionOptions> {
6
13
  /**
7
- * The name of the column that will be used as input for the Embedding Function.
8
- */
9
- sourceColumn: string;
10
- /**
11
- * The data type of the embedding
14
+ * Convert the embedding function to a JSON object
15
+ * It is used to serialize the embedding function to the schema
16
+ * It's important that any object returned by this method contains all the necessary
17
+ * information to recreate the embedding function
12
18
  *
13
- * The embedding function should return `number`. This will be converted into
14
- * an Arrow float array. By default this will be Float32 but this property can
15
- * be used to control the conversion.
16
- */
17
- embeddingDataType?: Float;
18
- /**
19
- * The dimension of the embedding
19
+ * It should return the same object that was passed to the constructor
20
+ * If it does not, the embedding function will not be able to be recreated, or could be recreated incorrectly
20
21
  *
21
- * This is optional, normally this can be determined by looking at the results of
22
- * `embed`. If this is not specified, and there is an attempt to apply the embedding
23
- * to an empty table, then that process will fail.
22
+ * @example
23
+ * ```ts
24
+ * class MyEmbeddingFunction extends EmbeddingFunction {
25
+ * constructor(options: {model: string, timeout: number}) {
26
+ * super();
27
+ * this.model = options.model;
28
+ * this.timeout = options.timeout;
29
+ * }
30
+ * toJSON() {
31
+ * return {
32
+ * model: this.model,
33
+ * timeout: this.timeout,
34
+ * };
35
+ * }
36
+ * ```
24
37
  */
25
- embeddingDimension?: number;
38
+ abstract toJSON(): Partial<M>;
26
39
  /**
27
- * The name of the column that will contain the embedding
40
+ * sourceField is used in combination with `LanceSchema` to provide a declarative data model
41
+ *
42
+ * @param optionsOrDatatype - The options for the field or the datatype
28
43
  *
29
- * By default this is "vector"
44
+ * @see {@link lancedb.LanceSchema}
30
45
  */
31
- destColumn?: string;
46
+ sourceField(optionsOrDatatype: Partial<FieldOptions> | DataType): [DataType, Map<string, EmbeddingFunction>];
32
47
  /**
33
- * Should the source column be excluded from the resulting table
48
+ * vectorField is used in combination with `LanceSchema` to provide a declarative data model
49
+ *
50
+ * @param options - The options for the field
34
51
  *
35
- * By default the source column is included. Set this to true and
36
- * only the embedding will be stored.
52
+ * @see {@link lancedb.LanceSchema}
37
53
  */
38
- excludeSource?: boolean;
54
+ vectorField(options?: Partial<FieldOptions>): [DataType, Map<string, EmbeddingFunction>];
55
+ /** The number of dimensions of the embeddings */
56
+ ndims(): number | undefined;
57
+ /** The datatype of the embeddings */
58
+ abstract embeddingDataType(): Float;
39
59
  /**
40
60
  * Creates a vector representation for the given values.
41
61
  */
42
- embed: (data: T[]) => Promise<number[][]>;
62
+ abstract computeSourceEmbeddings(data: T[]): Promise<number[][] | Float32Array[] | Float64Array[]>;
63
+ /**
64
+ Compute the embeddings for a single query
65
+ */
66
+ computeQueryEmbeddings(data: T): Promise<number[] | Float32Array | Float64Array>;
67
+ }
68
+ export interface FieldOptions<T extends DataType = DataType> {
69
+ datatype: T;
70
+ dims?: number;
43
71
  }
44
- /** Test if the input seems to be an embedding function */
45
- export declare function isEmbeddingFunction<T>(value: unknown): value is EmbeddingFunction<T>;
@@ -1,5 +1,5 @@
1
1
  "use strict";
2
- // Copyright 2023 Lance Developers.
2
+ // Copyright 2024 Lance Developers.
3
3
  //
4
4
  // Licensed under the Apache License, Version 2.0 (the "License");
5
5
  // you may not use this file except in compliance with the License.
@@ -13,15 +13,76 @@
13
13
  // See the License for the specific language governing permissions and
14
14
  // limitations under the License.
15
15
  Object.defineProperty(exports, "__esModule", { value: true });
16
- exports.isEmbeddingFunction = void 0;
17
- /** Test if the input seems to be an embedding function */
18
- function isEmbeddingFunction(value) {
19
- if (typeof value !== "object" || value === null) {
20
- return false;
16
+ exports.EmbeddingFunction = void 0;
17
+ require("reflect-metadata");
18
+ const arrow_1 = require("../arrow");
19
+ const sanitize_1 = require("../sanitize");
20
+ /**
21
+ * An embedding function that automatically creates vector representation for a given column.
22
+ */
23
+ class EmbeddingFunction {
24
+ /**
25
+ * sourceField is used in combination with `LanceSchema` to provide a declarative data model
26
+ *
27
+ * @param optionsOrDatatype - The options for the field or the datatype
28
+ *
29
+ * @see {@link lancedb.LanceSchema}
30
+ */
31
+ sourceField(optionsOrDatatype) {
32
+ let datatype = (0, arrow_1.isDataType)(optionsOrDatatype)
33
+ ? optionsOrDatatype
34
+ : optionsOrDatatype?.datatype;
35
+ if (!datatype) {
36
+ throw new Error("Datatype is required");
37
+ }
38
+ datatype = (0, sanitize_1.sanitizeType)(datatype);
39
+ const metadata = new Map();
40
+ metadata.set("source_column_for", this);
41
+ return [datatype, metadata];
21
42
  }
22
- if (!("sourceColumn" in value) || !("embed" in value)) {
23
- return false;
43
+ /**
44
+ * vectorField is used in combination with `LanceSchema` to provide a declarative data model
45
+ *
46
+ * @param options - The options for the field
47
+ *
48
+ * @see {@link lancedb.LanceSchema}
49
+ */
50
+ vectorField(options) {
51
+ let dtype;
52
+ const dims = this.ndims() ?? options?.dims;
53
+ if (!options?.datatype) {
54
+ if (dims === undefined) {
55
+ throw new Error("ndims is required for vector field");
56
+ }
57
+ dtype = new arrow_1.FixedSizeList(dims, new arrow_1.Field("item", new arrow_1.Float32(), true));
58
+ }
59
+ else {
60
+ if ((0, arrow_1.isFixedSizeList)(options.datatype)) {
61
+ dtype = options.datatype;
62
+ }
63
+ else if ((0, arrow_1.isFloat)(options.datatype)) {
64
+ if (dims === undefined) {
65
+ throw new Error("ndims is required for vector field");
66
+ }
67
+ dtype = (0, arrow_1.newVectorType)(dims, options.datatype);
68
+ }
69
+ else {
70
+ throw new Error("Expected FixedSizeList or Float as datatype for vector field");
71
+ }
72
+ }
73
+ const metadata = new Map();
74
+ metadata.set("vector_column_for", this);
75
+ return [dtype, metadata];
76
+ }
77
+ /** The number of dimensions of the embeddings */
78
+ ndims() {
79
+ return undefined;
80
+ }
81
+ /**
82
+ Compute the embeddings for a single query
83
+ */
84
+ async computeQueryEmbeddings(data) {
85
+ return this.computeSourceEmbeddings([data]).then((embeddings) => embeddings[0]);
24
86
  }
25
- return (typeof value.sourceColumn === "string" && typeof value.embed === "function");
26
87
  }
27
- exports.isEmbeddingFunction = isEmbeddingFunction;
88
+ exports.EmbeddingFunction = EmbeddingFunction;
@@ -1,2 +1,28 @@
1
- export { EmbeddingFunction, isEmbeddingFunction } from "./embedding_function";
2
- export { OpenAIEmbeddingFunction } from "./openai";
1
+ import { Schema } from "../arrow";
2
+ import { EmbeddingFunction } from "./embedding_function";
3
+ export { EmbeddingFunction } from "./embedding_function";
4
+ export * from "./openai";
5
+ export * from "./registry";
6
+ /**
7
+ * Create a schema with embedding functions.
8
+ *
9
+ * @param fields
10
+ * @returns Schema
11
+ * @example
12
+ * ```ts
13
+ * class MyEmbeddingFunction extends EmbeddingFunction {
14
+ * // ...
15
+ * }
16
+ * const func = new MyEmbeddingFunction();
17
+ * const schema = LanceSchema({
18
+ * id: new Int32(),
19
+ * text: func.sourceField(new Utf8()),
20
+ * vector: func.vectorField(),
21
+ * // optional: specify the datatype and/or dimensions
22
+ * vector2: func.vectorField({ datatype: new Float32(), dims: 3}),
23
+ * });
24
+ *
25
+ * const table = await db.createTable("my_table", data, { schema });
26
+ * ```
27
+ */
28
+ export declare function LanceSchema(fields: Record<string, [object, Map<string, EmbeddingFunction>] | object>): Schema;