@lancedb/lancedb 0.4.20 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -14
- package/biome.json +142 -0
- package/dist/arrow.d.ts +35 -9
- package/dist/arrow.js +247 -19
- package/dist/connection.d.ts +4 -1
- package/dist/connection.js +11 -5
- package/dist/embedding/embedding_function.d.ts +54 -28
- package/dist/embedding/embedding_function.js +71 -10
- package/dist/embedding/index.d.ts +28 -2
- package/dist/embedding/index.js +111 -4
- package/dist/embedding/openai.d.ts +16 -7
- package/dist/embedding/openai.js +62 -12
- package/dist/embedding/registry.d.ts +54 -0
- package/dist/embedding/registry.js +123 -0
- package/dist/native.d.ts +26 -0
- package/dist/query.d.ts +1 -1
- package/dist/query.js +7 -6
- package/dist/sanitize.d.ts +22 -1
- package/dist/sanitize.js +126 -113
- package/dist/table.d.ts +50 -4
- package/dist/table.js +47 -5
- package/lancedb/arrow.ts +283 -49
- package/lancedb/connection.ts +27 -6
- package/lancedb/embedding/embedding_function.ts +126 -42
- package/lancedb/embedding/index.ts +113 -2
- package/lancedb/embedding/openai.ts +62 -16
- package/lancedb/embedding/registry.ts +172 -0
- package/lancedb/query.ts +9 -6
- package/lancedb/sanitize.ts +62 -62
- package/lancedb/table.ts +72 -5
- package/nodejs-artifacts/arrow.d.ts +35 -9
- package/nodejs-artifacts/arrow.js +247 -19
- package/nodejs-artifacts/connection.d.ts +4 -1
- package/nodejs-artifacts/connection.js +11 -5
- package/nodejs-artifacts/embedding/embedding_function.d.ts +54 -28
- package/nodejs-artifacts/embedding/embedding_function.js +71 -10
- package/nodejs-artifacts/embedding/index.d.ts +28 -2
- package/nodejs-artifacts/embedding/index.js +111 -4
- package/nodejs-artifacts/embedding/openai.d.ts +16 -7
- package/nodejs-artifacts/embedding/openai.js +62 -12
- package/nodejs-artifacts/embedding/registry.d.ts +54 -0
- package/nodejs-artifacts/embedding/registry.js +123 -0
- package/nodejs-artifacts/native.d.ts +26 -0
- package/nodejs-artifacts/query.d.ts +1 -1
- package/nodejs-artifacts/query.js +7 -6
- package/nodejs-artifacts/sanitize.d.ts +22 -1
- package/nodejs-artifacts/sanitize.js +126 -113
- package/nodejs-artifacts/table.d.ts +50 -4
- package/nodejs-artifacts/table.js +47 -5
- package/package.json +23 -21
- package/tsconfig.json +3 -1
- package/.eslintignore +0 -3
- package/eslint.config.js +0 -28
package/lancedb/arrow.ts
CHANGED
|
@@ -13,28 +13,124 @@
|
|
|
13
13
|
// limitations under the License.
|
|
14
14
|
|
|
15
15
|
import {
|
|
16
|
+
Table as ArrowTable,
|
|
17
|
+
Binary,
|
|
18
|
+
DataType,
|
|
16
19
|
Field,
|
|
17
|
-
|
|
18
|
-
RecordBatchFileWriter,
|
|
19
|
-
Utf8,
|
|
20
|
-
type Vector,
|
|
20
|
+
FixedSizeBinary,
|
|
21
21
|
FixedSizeList,
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
22
|
+
Float,
|
|
23
|
+
Float32,
|
|
24
|
+
Int,
|
|
25
|
+
LargeBinary,
|
|
26
26
|
List,
|
|
27
|
+
Null,
|
|
27
28
|
RecordBatch,
|
|
28
|
-
|
|
29
|
+
RecordBatchFileWriter,
|
|
30
|
+
RecordBatchStreamWriter,
|
|
31
|
+
Schema,
|
|
29
32
|
Struct,
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
33
|
+
Utf8,
|
|
34
|
+
type Vector,
|
|
35
|
+
makeBuilder,
|
|
36
|
+
makeData,
|
|
34
37
|
type makeTable,
|
|
38
|
+
vectorFromArray,
|
|
35
39
|
} from "apache-arrow";
|
|
36
40
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
|
37
|
-
import {
|
|
41
|
+
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
|
42
|
+
import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
|
|
43
|
+
export * from "apache-arrow";
|
|
44
|
+
|
|
45
|
+
export function isArrowTable(value: object): value is ArrowTable {
|
|
46
|
+
if (value instanceof ArrowTable) return true;
|
|
47
|
+
return "schema" in value && "batches" in value;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
export function isDataType(value: unknown): value is DataType {
|
|
51
|
+
return (
|
|
52
|
+
value instanceof DataType ||
|
|
53
|
+
DataType.isNull(value) ||
|
|
54
|
+
DataType.isInt(value) ||
|
|
55
|
+
DataType.isFloat(value) ||
|
|
56
|
+
DataType.isBinary(value) ||
|
|
57
|
+
DataType.isLargeBinary(value) ||
|
|
58
|
+
DataType.isUtf8(value) ||
|
|
59
|
+
DataType.isLargeUtf8(value) ||
|
|
60
|
+
DataType.isBool(value) ||
|
|
61
|
+
DataType.isDecimal(value) ||
|
|
62
|
+
DataType.isDate(value) ||
|
|
63
|
+
DataType.isTime(value) ||
|
|
64
|
+
DataType.isTimestamp(value) ||
|
|
65
|
+
DataType.isInterval(value) ||
|
|
66
|
+
DataType.isDuration(value) ||
|
|
67
|
+
DataType.isList(value) ||
|
|
68
|
+
DataType.isStruct(value) ||
|
|
69
|
+
DataType.isUnion(value) ||
|
|
70
|
+
DataType.isFixedSizeBinary(value) ||
|
|
71
|
+
DataType.isFixedSizeList(value) ||
|
|
72
|
+
DataType.isMap(value) ||
|
|
73
|
+
DataType.isDictionary(value)
|
|
74
|
+
);
|
|
75
|
+
}
|
|
76
|
+
export function isNull(value: unknown): value is Null {
|
|
77
|
+
return value instanceof Null || DataType.isNull(value);
|
|
78
|
+
}
|
|
79
|
+
export function isInt(value: unknown): value is Int {
|
|
80
|
+
return value instanceof Int || DataType.isInt(value);
|
|
81
|
+
}
|
|
82
|
+
export function isFloat(value: unknown): value is Float {
|
|
83
|
+
return value instanceof Float || DataType.isFloat(value);
|
|
84
|
+
}
|
|
85
|
+
export function isBinary(value: unknown): value is Binary {
|
|
86
|
+
return value instanceof Binary || DataType.isBinary(value);
|
|
87
|
+
}
|
|
88
|
+
export function isLargeBinary(value: unknown): value is LargeBinary {
|
|
89
|
+
return value instanceof LargeBinary || DataType.isLargeBinary(value);
|
|
90
|
+
}
|
|
91
|
+
export function isUtf8(value: unknown): value is Utf8 {
|
|
92
|
+
return value instanceof Utf8 || DataType.isUtf8(value);
|
|
93
|
+
}
|
|
94
|
+
export function isLargeUtf8(value: unknown): value is Utf8 {
|
|
95
|
+
return value instanceof Utf8 || DataType.isLargeUtf8(value);
|
|
96
|
+
}
|
|
97
|
+
export function isBool(value: unknown): value is Utf8 {
|
|
98
|
+
return value instanceof Utf8 || DataType.isBool(value);
|
|
99
|
+
}
|
|
100
|
+
export function isDecimal(value: unknown): value is Utf8 {
|
|
101
|
+
return value instanceof Utf8 || DataType.isDecimal(value);
|
|
102
|
+
}
|
|
103
|
+
export function isDate(value: unknown): value is Utf8 {
|
|
104
|
+
return value instanceof Utf8 || DataType.isDate(value);
|
|
105
|
+
}
|
|
106
|
+
export function isTime(value: unknown): value is Utf8 {
|
|
107
|
+
return value instanceof Utf8 || DataType.isTime(value);
|
|
108
|
+
}
|
|
109
|
+
export function isTimestamp(value: unknown): value is Utf8 {
|
|
110
|
+
return value instanceof Utf8 || DataType.isTimestamp(value);
|
|
111
|
+
}
|
|
112
|
+
export function isInterval(value: unknown): value is Utf8 {
|
|
113
|
+
return value instanceof Utf8 || DataType.isInterval(value);
|
|
114
|
+
}
|
|
115
|
+
export function isDuration(value: unknown): value is Utf8 {
|
|
116
|
+
return value instanceof Utf8 || DataType.isDuration(value);
|
|
117
|
+
}
|
|
118
|
+
export function isList(value: unknown): value is List {
|
|
119
|
+
return value instanceof List || DataType.isList(value);
|
|
120
|
+
}
|
|
121
|
+
export function isStruct(value: unknown): value is Struct {
|
|
122
|
+
return value instanceof Struct || DataType.isStruct(value);
|
|
123
|
+
}
|
|
124
|
+
export function isUnion(value: unknown): value is Struct {
|
|
125
|
+
return value instanceof Struct || DataType.isUnion(value);
|
|
126
|
+
}
|
|
127
|
+
export function isFixedSizeBinary(value: unknown): value is FixedSizeBinary {
|
|
128
|
+
return value instanceof FixedSizeBinary || DataType.isFixedSizeBinary(value);
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
export function isFixedSizeList(value: unknown): value is FixedSizeList {
|
|
132
|
+
return value instanceof FixedSizeList || DataType.isFixedSizeList(value);
|
|
133
|
+
}
|
|
38
134
|
|
|
39
135
|
/** Data type accepted by NodeJS SDK */
|
|
40
136
|
export type Data = Record<string, unknown>[] | ArrowTable;
|
|
@@ -85,6 +181,7 @@ export class MakeArrowTableOptions {
|
|
|
85
181
|
vectorColumns: Record<string, VectorColumnOptions> = {
|
|
86
182
|
vector: new VectorColumnOptions(),
|
|
87
183
|
};
|
|
184
|
+
embeddings?: EmbeddingFunction<unknown>;
|
|
88
185
|
|
|
89
186
|
/**
|
|
90
187
|
* If true then string columns will be encoded with dictionary encoding
|
|
@@ -197,6 +294,7 @@ export class MakeArrowTableOptions {
|
|
|
197
294
|
export function makeArrowTable(
|
|
198
295
|
data: Array<Record<string, unknown>>,
|
|
199
296
|
options?: Partial<MakeArrowTableOptions>,
|
|
297
|
+
metadata?: Map<string, string>,
|
|
200
298
|
): ArrowTable {
|
|
201
299
|
if (
|
|
202
300
|
data.length === 0 &&
|
|
@@ -208,6 +306,7 @@ export function makeArrowTable(
|
|
|
208
306
|
const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
|
|
209
307
|
if (opt.schema !== undefined && opt.schema !== null) {
|
|
210
308
|
opt.schema = sanitizeSchema(opt.schema);
|
|
309
|
+
opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
|
|
211
310
|
}
|
|
212
311
|
const columns: Record<string, Vector> = {};
|
|
213
312
|
// TODO: sample dataset to find missing columns
|
|
@@ -287,21 +386,42 @@ export function makeArrowTable(
|
|
|
287
386
|
// then patch the schema of the batches so we can use
|
|
288
387
|
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
|
289
388
|
const firstTable = new ArrowTable(columns);
|
|
290
|
-
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
291
389
|
const batchesFixed = firstTable.batches.map(
|
|
292
390
|
(batch) => new RecordBatch(opt.schema!, batch.data),
|
|
293
391
|
);
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
392
|
+
let schema: Schema;
|
|
393
|
+
if (metadata !== undefined) {
|
|
394
|
+
let schemaMetadata = opt.schema.metadata;
|
|
395
|
+
if (schemaMetadata.size === 0) {
|
|
396
|
+
schemaMetadata = metadata;
|
|
397
|
+
} else {
|
|
398
|
+
for (const [key, entry] of schemaMetadata.entries()) {
|
|
399
|
+
schemaMetadata.set(key, entry);
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
schema = new Schema(opt.schema.fields, schemaMetadata);
|
|
404
|
+
} else {
|
|
405
|
+
schema = opt.schema;
|
|
406
|
+
}
|
|
407
|
+
return new ArrowTable(schema, batchesFixed);
|
|
408
|
+
}
|
|
409
|
+
const tbl = new ArrowTable(columns);
|
|
410
|
+
if (metadata !== undefined) {
|
|
411
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
412
|
+
(<any>tbl.schema).metadata = metadata;
|
|
297
413
|
}
|
|
414
|
+
return tbl;
|
|
298
415
|
}
|
|
299
416
|
|
|
300
417
|
/**
|
|
301
418
|
* Create an empty Arrow table with the provided schema
|
|
302
419
|
*/
|
|
303
|
-
export function makeEmptyTable(
|
|
304
|
-
|
|
420
|
+
export function makeEmptyTable(
|
|
421
|
+
schema: Schema,
|
|
422
|
+
metadata?: Map<string, string>,
|
|
423
|
+
): ArrowTable {
|
|
424
|
+
return makeArrowTable([], { schema }, metadata);
|
|
305
425
|
}
|
|
306
426
|
|
|
307
427
|
/**
|
|
@@ -313,7 +433,7 @@ function makeListVector(lists: unknown[][]): Vector<unknown> {
|
|
|
313
433
|
throw Error("Cannot infer list vector from empty array or empty list");
|
|
314
434
|
}
|
|
315
435
|
const sampleList = lists[0];
|
|
316
|
-
//
|
|
436
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
317
437
|
let inferredType: any;
|
|
318
438
|
try {
|
|
319
439
|
const sampleVector = makeVector(sampleList);
|
|
@@ -337,7 +457,7 @@ function makeVector(
|
|
|
337
457
|
values: unknown[],
|
|
338
458
|
type?: DataType,
|
|
339
459
|
stringAsDictionary?: boolean,
|
|
340
|
-
//
|
|
460
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
341
461
|
): Vector<any> {
|
|
342
462
|
if (type !== undefined) {
|
|
343
463
|
// No need for inference, let Arrow create it
|
|
@@ -373,13 +493,75 @@ function makeVector(
|
|
|
373
493
|
}
|
|
374
494
|
}
|
|
375
495
|
|
|
496
|
+
/** Helper function to apply embeddings from metadata to an input table */
|
|
497
|
+
async function applyEmbeddingsFromMetadata(
|
|
498
|
+
table: ArrowTable,
|
|
499
|
+
schema: Schema,
|
|
500
|
+
): Promise<ArrowTable> {
|
|
501
|
+
const registry = getRegistry();
|
|
502
|
+
const functions = registry.parseFunctions(schema.metadata);
|
|
503
|
+
|
|
504
|
+
const columns = Object.fromEntries(
|
|
505
|
+
table.schema.fields.map((field) => [
|
|
506
|
+
field.name,
|
|
507
|
+
table.getChild(field.name)!,
|
|
508
|
+
]),
|
|
509
|
+
);
|
|
510
|
+
|
|
511
|
+
for (const functionEntry of functions.values()) {
|
|
512
|
+
const sourceColumn = columns[functionEntry.sourceColumn];
|
|
513
|
+
const destColumn = functionEntry.vectorColumn ?? "vector";
|
|
514
|
+
if (sourceColumn === undefined) {
|
|
515
|
+
throw new Error(
|
|
516
|
+
`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
|
|
517
|
+
);
|
|
518
|
+
}
|
|
519
|
+
if (columns[destColumn] !== undefined) {
|
|
520
|
+
throw new Error(
|
|
521
|
+
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
|
|
522
|
+
);
|
|
523
|
+
}
|
|
524
|
+
if (table.batches.length > 1) {
|
|
525
|
+
throw new Error(
|
|
526
|
+
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
|
|
527
|
+
);
|
|
528
|
+
}
|
|
529
|
+
const values = sourceColumn.toArray();
|
|
530
|
+
|
|
531
|
+
const vectors =
|
|
532
|
+
await functionEntry.function.computeSourceEmbeddings(values);
|
|
533
|
+
if (vectors.length !== values.length) {
|
|
534
|
+
throw new Error(
|
|
535
|
+
"Embedding function did not return an embedding for each input element",
|
|
536
|
+
);
|
|
537
|
+
}
|
|
538
|
+
let destType: DataType;
|
|
539
|
+
const dtype = schema.fields.find((f) => f.name === destColumn)!.type;
|
|
540
|
+
if (isFixedSizeList(dtype)) {
|
|
541
|
+
destType = sanitizeType(dtype);
|
|
542
|
+
} else {
|
|
543
|
+
throw new Error(
|
|
544
|
+
"Expected FixedSizeList as datatype for vector field, instead got: " +
|
|
545
|
+
dtype,
|
|
546
|
+
);
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
const vector = makeVector(vectors, destType);
|
|
550
|
+
columns[destColumn] = vector;
|
|
551
|
+
}
|
|
552
|
+
const newTable = new ArrowTable(columns);
|
|
553
|
+
return alignTable(newTable, schema);
|
|
554
|
+
}
|
|
555
|
+
|
|
376
556
|
/** Helper function to apply embeddings to an input table */
|
|
377
557
|
async function applyEmbeddings<T>(
|
|
378
558
|
table: ArrowTable,
|
|
379
|
-
embeddings?:
|
|
559
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
380
560
|
schema?: Schema,
|
|
381
561
|
): Promise<ArrowTable> {
|
|
382
|
-
if (
|
|
562
|
+
if (schema?.metadata.has("embedding_functions")) {
|
|
563
|
+
return applyEmbeddingsFromMetadata(table, schema!);
|
|
564
|
+
} else if (embeddings == null || embeddings === undefined) {
|
|
383
565
|
return table;
|
|
384
566
|
}
|
|
385
567
|
|
|
@@ -397,8 +579,9 @@ async function applyEmbeddings<T>(
|
|
|
397
579
|
const newColumns = Object.fromEntries(colEntries);
|
|
398
580
|
|
|
399
581
|
const sourceColumn = newColumns[embeddings.sourceColumn];
|
|
400
|
-
const destColumn = embeddings.
|
|
401
|
-
const innerDestType =
|
|
582
|
+
const destColumn = embeddings.vectorColumn ?? "vector";
|
|
583
|
+
const innerDestType =
|
|
584
|
+
embeddings.function.embeddingDataType() ?? new Float32();
|
|
402
585
|
if (sourceColumn === undefined) {
|
|
403
586
|
throw new Error(
|
|
404
587
|
`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`,
|
|
@@ -412,11 +595,9 @@ async function applyEmbeddings<T>(
|
|
|
412
595
|
// if we call convertToTable with 0 records and a schema that includes the embedding
|
|
413
596
|
return table;
|
|
414
597
|
}
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
innerDestType,
|
|
419
|
-
);
|
|
598
|
+
const dimensions = embeddings.function.ndims();
|
|
599
|
+
if (dimensions !== undefined) {
|
|
600
|
+
const destType = newVectorType(dimensions, innerDestType);
|
|
420
601
|
newColumns[destColumn] = makeVector([], destType);
|
|
421
602
|
} else if (schema != null) {
|
|
422
603
|
const destField = schema.fields.find((f) => f.name === destColumn);
|
|
@@ -444,7 +625,9 @@ async function applyEmbeddings<T>(
|
|
|
444
625
|
);
|
|
445
626
|
}
|
|
446
627
|
const values = sourceColumn.toArray();
|
|
447
|
-
const vectors = await embeddings.
|
|
628
|
+
const vectors = await embeddings.function.computeSourceEmbeddings(
|
|
629
|
+
values as T[],
|
|
630
|
+
);
|
|
448
631
|
if (vectors.length !== values.length) {
|
|
449
632
|
throw new Error(
|
|
450
633
|
"Embedding function did not return an embedding for each input element",
|
|
@@ -484,9 +667,9 @@ async function applyEmbeddings<T>(
|
|
|
484
667
|
* embedding columns. If no schema is provded then embedding columns will
|
|
485
668
|
* be placed at the end of the table, after all of the input columns.
|
|
486
669
|
*/
|
|
487
|
-
export async function convertToTable
|
|
670
|
+
export async function convertToTable(
|
|
488
671
|
data: Array<Record<string, unknown>>,
|
|
489
|
-
embeddings?:
|
|
672
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
490
673
|
makeTableOptions?: Partial<MakeArrowTableOptions>,
|
|
491
674
|
): Promise<ArrowTable> {
|
|
492
675
|
const table = makeArrowTable(data, makeTableOptions);
|
|
@@ -494,13 +677,13 @@ export async function convertToTable<T>(
|
|
|
494
677
|
}
|
|
495
678
|
|
|
496
679
|
/** Creates the Arrow Type for a Vector column with dimension `dim` */
|
|
497
|
-
function newVectorType<T extends Float>(
|
|
680
|
+
export function newVectorType<T extends Float>(
|
|
498
681
|
dim: number,
|
|
499
682
|
innerType: T,
|
|
500
683
|
): FixedSizeList<T> {
|
|
501
684
|
// in Lance we always default to have the elements nullable, so we need to set it to true
|
|
502
685
|
// otherwise we often get schema mismatches because the stored data always has schema with nullable elements
|
|
503
|
-
const children = new Field
|
|
686
|
+
const children = new Field("item", <T>sanitizeType(innerType), true);
|
|
504
687
|
return new FixedSizeList(dim, children);
|
|
505
688
|
}
|
|
506
689
|
|
|
@@ -511,9 +694,9 @@ function newVectorType<T extends Float>(
|
|
|
511
694
|
*
|
|
512
695
|
* `schema` is required if data is empty
|
|
513
696
|
*/
|
|
514
|
-
export async function fromRecordsToBuffer
|
|
697
|
+
export async function fromRecordsToBuffer(
|
|
515
698
|
data: Array<Record<string, unknown>>,
|
|
516
|
-
embeddings?:
|
|
699
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
517
700
|
schema?: Schema,
|
|
518
701
|
): Promise<Buffer> {
|
|
519
702
|
if (schema !== undefined && schema !== null) {
|
|
@@ -531,9 +714,9 @@ export async function fromRecordsToBuffer<T>(
|
|
|
531
714
|
*
|
|
532
715
|
* `schema` is required if data is empty
|
|
533
716
|
*/
|
|
534
|
-
export async function fromRecordsToStreamBuffer
|
|
717
|
+
export async function fromRecordsToStreamBuffer(
|
|
535
718
|
data: Array<Record<string, unknown>>,
|
|
536
|
-
embeddings?:
|
|
719
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
537
720
|
schema?: Schema,
|
|
538
721
|
): Promise<Buffer> {
|
|
539
722
|
if (schema !== undefined && schema !== null) {
|
|
@@ -552,9 +735,9 @@ export async function fromRecordsToStreamBuffer<T>(
|
|
|
552
735
|
*
|
|
553
736
|
* `schema` is required if the table is empty
|
|
554
737
|
*/
|
|
555
|
-
export async function fromTableToBuffer
|
|
738
|
+
export async function fromTableToBuffer(
|
|
556
739
|
table: ArrowTable,
|
|
557
|
-
embeddings?:
|
|
740
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
558
741
|
schema?: Schema,
|
|
559
742
|
): Promise<Buffer> {
|
|
560
743
|
if (schema !== undefined && schema !== null) {
|
|
@@ -573,19 +756,19 @@ export async function fromTableToBuffer<T>(
|
|
|
573
756
|
*
|
|
574
757
|
* `schema` is required if the table is empty
|
|
575
758
|
*/
|
|
576
|
-
export async function fromDataToBuffer
|
|
759
|
+
export async function fromDataToBuffer(
|
|
577
760
|
data: Data,
|
|
578
|
-
embeddings?:
|
|
761
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
579
762
|
schema?: Schema,
|
|
580
763
|
): Promise<Buffer> {
|
|
581
764
|
if (schema !== undefined && schema !== null) {
|
|
582
765
|
schema = sanitizeSchema(schema);
|
|
583
766
|
}
|
|
584
|
-
if (data
|
|
767
|
+
if (isArrowTable(data)) {
|
|
585
768
|
return fromTableToBuffer(data, embeddings, schema);
|
|
586
769
|
} else {
|
|
587
|
-
const table = await convertToTable(data);
|
|
588
|
-
return fromTableToBuffer(table
|
|
770
|
+
const table = await convertToTable(data, embeddings, { schema });
|
|
771
|
+
return fromTableToBuffer(table);
|
|
589
772
|
}
|
|
590
773
|
}
|
|
591
774
|
|
|
@@ -597,9 +780,9 @@ export async function fromDataToBuffer<T>(
|
|
|
597
780
|
*
|
|
598
781
|
* `schema` is required if the table is empty
|
|
599
782
|
*/
|
|
600
|
-
export async function fromTableToStreamBuffer
|
|
783
|
+
export async function fromTableToStreamBuffer(
|
|
601
784
|
table: ArrowTable,
|
|
602
|
-
embeddings?:
|
|
785
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
603
786
|
schema?: Schema,
|
|
604
787
|
): Promise<Buffer> {
|
|
605
788
|
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
|
|
@@ -648,3 +831,54 @@ function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
|
|
|
648
831
|
export function createEmptyTable(schema: Schema): ArrowTable {
|
|
649
832
|
return new ArrowTable(sanitizeSchema(schema));
|
|
650
833
|
}
|
|
834
|
+
|
|
835
|
+
function validateSchemaEmbeddings(
|
|
836
|
+
schema: Schema,
|
|
837
|
+
data: Array<Record<string, unknown>>,
|
|
838
|
+
embeddings: EmbeddingFunction<unknown> | undefined,
|
|
839
|
+
) {
|
|
840
|
+
const fields = [];
|
|
841
|
+
const missingEmbeddingFields = [];
|
|
842
|
+
|
|
843
|
+
// First we check if the field is a `FixedSizeList`
|
|
844
|
+
// Then we check if the data contains the field
|
|
845
|
+
// if it does not, we add it to the list of missing embedding fields
|
|
846
|
+
// Finally, we check if those missing embedding fields are `this._embeddings`
|
|
847
|
+
// if they are not, we throw an error
|
|
848
|
+
for (let field of schema.fields) {
|
|
849
|
+
if (isFixedSizeList(field.type)) {
|
|
850
|
+
field = sanitizeField(field);
|
|
851
|
+
|
|
852
|
+
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
|
853
|
+
if (schema.metadata.has("embedding_functions")) {
|
|
854
|
+
const embeddings = JSON.parse(
|
|
855
|
+
schema.metadata.get("embedding_functions")!,
|
|
856
|
+
);
|
|
857
|
+
if (
|
|
858
|
+
// biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
|
|
859
|
+
embeddings.find((f: any) => f["vectorColumn"] === field.name) ===
|
|
860
|
+
undefined
|
|
861
|
+
) {
|
|
862
|
+
missingEmbeddingFields.push(field);
|
|
863
|
+
}
|
|
864
|
+
} else {
|
|
865
|
+
missingEmbeddingFields.push(field);
|
|
866
|
+
}
|
|
867
|
+
} else {
|
|
868
|
+
fields.push(field);
|
|
869
|
+
}
|
|
870
|
+
} else {
|
|
871
|
+
fields.push(field);
|
|
872
|
+
}
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
|
|
876
|
+
throw new Error(
|
|
877
|
+
`Table has embeddings: "${missingEmbeddingFields
|
|
878
|
+
.map((f) => f.name)
|
|
879
|
+
.join(",")}", but no embedding function was provided`,
|
|
880
|
+
);
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
return new Schema(fields, schema.metadata);
|
|
884
|
+
}
|
package/lancedb/connection.ts
CHANGED
|
@@ -12,10 +12,16 @@
|
|
|
12
12
|
// See the License for the specific language governing permissions and
|
|
13
13
|
// limitations under the License.
|
|
14
14
|
|
|
15
|
-
import {
|
|
15
|
+
import { Table as ArrowTable, Schema } from "./arrow";
|
|
16
|
+
import {
|
|
17
|
+
fromTableToBuffer,
|
|
18
|
+
isArrowTable,
|
|
19
|
+
makeArrowTable,
|
|
20
|
+
makeEmptyTable,
|
|
21
|
+
} from "./arrow";
|
|
22
|
+
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
|
16
23
|
import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
|
|
17
24
|
import { Table } from "./table";
|
|
18
|
-
import { Table as ArrowTable, Schema } from "apache-arrow";
|
|
19
25
|
|
|
20
26
|
/**
|
|
21
27
|
* Connect to a LanceDB instance at the given URI.
|
|
@@ -65,6 +71,8 @@ export interface CreateTableOptions {
|
|
|
65
71
|
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
|
|
66
72
|
*/
|
|
67
73
|
storageOptions?: Record<string, string>;
|
|
74
|
+
schema?: Schema;
|
|
75
|
+
embeddingFunction?: EmbeddingFunctionConfig;
|
|
68
76
|
}
|
|
69
77
|
|
|
70
78
|
export interface OpenTableOptions {
|
|
@@ -174,6 +182,7 @@ export class Connection {
|
|
|
174
182
|
cleanseStorageOptions(options?.storageOptions),
|
|
175
183
|
options?.indexCacheSize,
|
|
176
184
|
);
|
|
185
|
+
|
|
177
186
|
return new Table(innerTable);
|
|
178
187
|
}
|
|
179
188
|
|
|
@@ -196,18 +205,24 @@ export class Connection {
|
|
|
196
205
|
}
|
|
197
206
|
|
|
198
207
|
let table: ArrowTable;
|
|
199
|
-
if (data
|
|
208
|
+
if (isArrowTable(data)) {
|
|
200
209
|
table = data;
|
|
201
210
|
} else {
|
|
202
|
-
table = makeArrowTable(data);
|
|
211
|
+
table = makeArrowTable(data, options);
|
|
203
212
|
}
|
|
204
|
-
|
|
213
|
+
|
|
214
|
+
const buf = await fromTableToBuffer(
|
|
215
|
+
table,
|
|
216
|
+
options?.embeddingFunction,
|
|
217
|
+
options?.schema,
|
|
218
|
+
);
|
|
205
219
|
const innerTable = await this.inner.createTable(
|
|
206
220
|
name,
|
|
207
221
|
buf,
|
|
208
222
|
mode,
|
|
209
223
|
cleanseStorageOptions(options?.storageOptions),
|
|
210
224
|
);
|
|
225
|
+
|
|
211
226
|
return new Table(innerTable);
|
|
212
227
|
}
|
|
213
228
|
|
|
@@ -227,8 +242,14 @@ export class Connection {
|
|
|
227
242
|
if (mode === "create" && existOk) {
|
|
228
243
|
mode = "exist_ok";
|
|
229
244
|
}
|
|
245
|
+
let metadata: Map<string, string> | undefined = undefined;
|
|
246
|
+
if (options?.embeddingFunction !== undefined) {
|
|
247
|
+
const embeddingFunction = options.embeddingFunction;
|
|
248
|
+
const registry = getRegistry();
|
|
249
|
+
metadata = registry.getTableMetadata([embeddingFunction]);
|
|
250
|
+
}
|
|
230
251
|
|
|
231
|
-
const table = makeEmptyTable(schema);
|
|
252
|
+
const table = makeEmptyTable(schema, metadata);
|
|
232
253
|
const buf = await fromTableToBuffer(table);
|
|
233
254
|
const innerTable = await this.inner.createEmptyTable(
|
|
234
255
|
name,
|