@lancedb/lancedb 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/biome.json +8 -2
- package/dist/arrow.d.ts +36 -9
- package/dist/arrow.js +222 -24
- package/dist/connection.d.ts +10 -1
- package/dist/connection.js +13 -7
- package/dist/embedding/embedding_function.d.ts +54 -28
- package/dist/embedding/embedding_function.js +89 -10
- package/dist/embedding/index.d.ts +28 -2
- package/dist/embedding/index.js +111 -4
- package/dist/embedding/openai.d.ts +16 -7
- package/dist/embedding/openai.js +62 -12
- package/dist/embedding/registry.d.ts +58 -0
- package/dist/embedding/registry.js +127 -0
- package/dist/native.d.ts +5 -4
- package/dist/query.d.ts +19 -7
- package/dist/query.js +27 -13
- package/dist/sanitize.d.ts +22 -1
- package/dist/sanitize.js +123 -110
- package/dist/table.d.ts +18 -3
- package/dist/table.js +33 -3
- package/lancedb/arrow.ts +243 -41
- package/lancedb/connection.ts +35 -6
- package/lancedb/embedding/embedding_function.ts +147 -42
- package/lancedb/embedding/index.ts +113 -2
- package/lancedb/embedding/openai.ts +62 -16
- package/lancedb/embedding/registry.ts +176 -0
- package/lancedb/query.ts +58 -14
- package/lancedb/sanitize.ts +22 -22
- package/lancedb/table.ts +67 -5
- package/nodejs-artifacts/arrow.d.ts +36 -9
- package/nodejs-artifacts/arrow.js +222 -24
- package/nodejs-artifacts/connection.d.ts +10 -1
- package/nodejs-artifacts/connection.js +13 -7
- package/nodejs-artifacts/embedding/embedding_function.d.ts +54 -28
- package/nodejs-artifacts/embedding/embedding_function.js +89 -10
- package/nodejs-artifacts/embedding/index.d.ts +28 -2
- package/nodejs-artifacts/embedding/index.js +111 -4
- package/nodejs-artifacts/embedding/openai.d.ts +16 -7
- package/nodejs-artifacts/embedding/openai.js +62 -12
- package/nodejs-artifacts/embedding/registry.d.ts +58 -0
- package/nodejs-artifacts/embedding/registry.js +127 -0
- package/nodejs-artifacts/native.d.ts +5 -4
- package/nodejs-artifacts/query.d.ts +19 -7
- package/nodejs-artifacts/query.js +27 -13
- package/nodejs-artifacts/sanitize.d.ts +22 -1
- package/nodejs-artifacts/sanitize.js +123 -110
- package/nodejs-artifacts/table.d.ts +18 -3
- package/nodejs-artifacts/table.js +33 -3
- package/package.json +14 -9
- package/tsconfig.json +3 -1
package/lancedb/arrow.ts
CHANGED
|
@@ -17,24 +17,122 @@ import {
|
|
|
17
17
|
Binary,
|
|
18
18
|
DataType,
|
|
19
19
|
Field,
|
|
20
|
+
FixedSizeBinary,
|
|
20
21
|
FixedSizeList,
|
|
21
|
-
|
|
22
|
+
Float,
|
|
22
23
|
Float32,
|
|
24
|
+
Int,
|
|
25
|
+
LargeBinary,
|
|
23
26
|
List,
|
|
27
|
+
Null,
|
|
24
28
|
RecordBatch,
|
|
25
29
|
RecordBatchFileWriter,
|
|
26
30
|
RecordBatchStreamWriter,
|
|
27
31
|
Schema,
|
|
28
32
|
Struct,
|
|
29
33
|
Utf8,
|
|
30
|
-
|
|
34
|
+
Vector,
|
|
31
35
|
makeBuilder,
|
|
32
36
|
makeData,
|
|
33
37
|
type makeTable,
|
|
34
38
|
vectorFromArray,
|
|
35
39
|
} from "apache-arrow";
|
|
36
40
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
|
37
|
-
import {
|
|
41
|
+
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
|
42
|
+
import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
|
|
43
|
+
export * from "apache-arrow";
|
|
44
|
+
|
|
45
|
+
export type IntoVector = Float32Array | Float64Array | number[];
|
|
46
|
+
|
|
47
|
+
export function isArrowTable(value: object): value is ArrowTable {
|
|
48
|
+
if (value instanceof ArrowTable) return true;
|
|
49
|
+
return "schema" in value && "batches" in value;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
export function isDataType(value: unknown): value is DataType {
|
|
53
|
+
return (
|
|
54
|
+
value instanceof DataType ||
|
|
55
|
+
DataType.isNull(value) ||
|
|
56
|
+
DataType.isInt(value) ||
|
|
57
|
+
DataType.isFloat(value) ||
|
|
58
|
+
DataType.isBinary(value) ||
|
|
59
|
+
DataType.isLargeBinary(value) ||
|
|
60
|
+
DataType.isUtf8(value) ||
|
|
61
|
+
DataType.isLargeUtf8(value) ||
|
|
62
|
+
DataType.isBool(value) ||
|
|
63
|
+
DataType.isDecimal(value) ||
|
|
64
|
+
DataType.isDate(value) ||
|
|
65
|
+
DataType.isTime(value) ||
|
|
66
|
+
DataType.isTimestamp(value) ||
|
|
67
|
+
DataType.isInterval(value) ||
|
|
68
|
+
DataType.isDuration(value) ||
|
|
69
|
+
DataType.isList(value) ||
|
|
70
|
+
DataType.isStruct(value) ||
|
|
71
|
+
DataType.isUnion(value) ||
|
|
72
|
+
DataType.isFixedSizeBinary(value) ||
|
|
73
|
+
DataType.isFixedSizeList(value) ||
|
|
74
|
+
DataType.isMap(value) ||
|
|
75
|
+
DataType.isDictionary(value)
|
|
76
|
+
);
|
|
77
|
+
}
|
|
78
|
+
export function isNull(value: unknown): value is Null {
|
|
79
|
+
return value instanceof Null || DataType.isNull(value);
|
|
80
|
+
}
|
|
81
|
+
export function isInt(value: unknown): value is Int {
|
|
82
|
+
return value instanceof Int || DataType.isInt(value);
|
|
83
|
+
}
|
|
84
|
+
export function isFloat(value: unknown): value is Float {
|
|
85
|
+
return value instanceof Float || DataType.isFloat(value);
|
|
86
|
+
}
|
|
87
|
+
export function isBinary(value: unknown): value is Binary {
|
|
88
|
+
return value instanceof Binary || DataType.isBinary(value);
|
|
89
|
+
}
|
|
90
|
+
export function isLargeBinary(value: unknown): value is LargeBinary {
|
|
91
|
+
return value instanceof LargeBinary || DataType.isLargeBinary(value);
|
|
92
|
+
}
|
|
93
|
+
export function isUtf8(value: unknown): value is Utf8 {
|
|
94
|
+
return value instanceof Utf8 || DataType.isUtf8(value);
|
|
95
|
+
}
|
|
96
|
+
export function isLargeUtf8(value: unknown): value is Utf8 {
|
|
97
|
+
return value instanceof Utf8 || DataType.isLargeUtf8(value);
|
|
98
|
+
}
|
|
99
|
+
export function isBool(value: unknown): value is Utf8 {
|
|
100
|
+
return value instanceof Utf8 || DataType.isBool(value);
|
|
101
|
+
}
|
|
102
|
+
export function isDecimal(value: unknown): value is Utf8 {
|
|
103
|
+
return value instanceof Utf8 || DataType.isDecimal(value);
|
|
104
|
+
}
|
|
105
|
+
export function isDate(value: unknown): value is Utf8 {
|
|
106
|
+
return value instanceof Utf8 || DataType.isDate(value);
|
|
107
|
+
}
|
|
108
|
+
export function isTime(value: unknown): value is Utf8 {
|
|
109
|
+
return value instanceof Utf8 || DataType.isTime(value);
|
|
110
|
+
}
|
|
111
|
+
export function isTimestamp(value: unknown): value is Utf8 {
|
|
112
|
+
return value instanceof Utf8 || DataType.isTimestamp(value);
|
|
113
|
+
}
|
|
114
|
+
export function isInterval(value: unknown): value is Utf8 {
|
|
115
|
+
return value instanceof Utf8 || DataType.isInterval(value);
|
|
116
|
+
}
|
|
117
|
+
export function isDuration(value: unknown): value is Utf8 {
|
|
118
|
+
return value instanceof Utf8 || DataType.isDuration(value);
|
|
119
|
+
}
|
|
120
|
+
export function isList(value: unknown): value is List {
|
|
121
|
+
return value instanceof List || DataType.isList(value);
|
|
122
|
+
}
|
|
123
|
+
export function isStruct(value: unknown): value is Struct {
|
|
124
|
+
return value instanceof Struct || DataType.isStruct(value);
|
|
125
|
+
}
|
|
126
|
+
export function isUnion(value: unknown): value is Struct {
|
|
127
|
+
return value instanceof Struct || DataType.isUnion(value);
|
|
128
|
+
}
|
|
129
|
+
export function isFixedSizeBinary(value: unknown): value is FixedSizeBinary {
|
|
130
|
+
return value instanceof FixedSizeBinary || DataType.isFixedSizeBinary(value);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
export function isFixedSizeList(value: unknown): value is FixedSizeList {
|
|
134
|
+
return value instanceof FixedSizeList || DataType.isFixedSizeList(value);
|
|
135
|
+
}
|
|
38
136
|
|
|
39
137
|
/** Data type accepted by NodeJS SDK */
|
|
40
138
|
export type Data = Record<string, unknown>[] | ArrowTable;
|
|
@@ -86,6 +184,7 @@ export class MakeArrowTableOptions {
|
|
|
86
184
|
vector: new VectorColumnOptions(),
|
|
87
185
|
};
|
|
88
186
|
embeddings?: EmbeddingFunction<unknown>;
|
|
187
|
+
embeddingFunction?: EmbeddingFunctionConfig;
|
|
89
188
|
|
|
90
189
|
/**
|
|
91
190
|
* If true then string columns will be encoded with dictionary encoding
|
|
@@ -198,6 +297,7 @@ export class MakeArrowTableOptions {
|
|
|
198
297
|
export function makeArrowTable(
|
|
199
298
|
data: Array<Record<string, unknown>>,
|
|
200
299
|
options?: Partial<MakeArrowTableOptions>,
|
|
300
|
+
metadata?: Map<string, string>,
|
|
201
301
|
): ArrowTable {
|
|
202
302
|
if (
|
|
203
303
|
data.length === 0 &&
|
|
@@ -209,7 +309,11 @@ export function makeArrowTable(
|
|
|
209
309
|
const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
|
|
210
310
|
if (opt.schema !== undefined && opt.schema !== null) {
|
|
211
311
|
opt.schema = sanitizeSchema(opt.schema);
|
|
212
|
-
opt.schema = validateSchemaEmbeddings(
|
|
312
|
+
opt.schema = validateSchemaEmbeddings(
|
|
313
|
+
opt.schema,
|
|
314
|
+
data,
|
|
315
|
+
options?.embeddingFunction,
|
|
316
|
+
);
|
|
213
317
|
}
|
|
214
318
|
const columns: Record<string, Vector> = {};
|
|
215
319
|
// TODO: sample dataset to find missing columns
|
|
@@ -290,20 +394,41 @@ export function makeArrowTable(
|
|
|
290
394
|
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
|
291
395
|
const firstTable = new ArrowTable(columns);
|
|
292
396
|
const batchesFixed = firstTable.batches.map(
|
|
293
|
-
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
294
397
|
(batch) => new RecordBatch(opt.schema!, batch.data),
|
|
295
398
|
);
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
399
|
+
let schema: Schema;
|
|
400
|
+
if (metadata !== undefined) {
|
|
401
|
+
let schemaMetadata = opt.schema.metadata;
|
|
402
|
+
if (schemaMetadata.size === 0) {
|
|
403
|
+
schemaMetadata = metadata;
|
|
404
|
+
} else {
|
|
405
|
+
for (const [key, entry] of schemaMetadata.entries()) {
|
|
406
|
+
schemaMetadata.set(key, entry);
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
schema = new Schema(opt.schema.fields, schemaMetadata);
|
|
411
|
+
} else {
|
|
412
|
+
schema = opt.schema;
|
|
413
|
+
}
|
|
414
|
+
return new ArrowTable(schema, batchesFixed);
|
|
299
415
|
}
|
|
416
|
+
const tbl = new ArrowTable(columns);
|
|
417
|
+
if (metadata !== undefined) {
|
|
418
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
419
|
+
(<any>tbl.schema).metadata = metadata;
|
|
420
|
+
}
|
|
421
|
+
return tbl;
|
|
300
422
|
}
|
|
301
423
|
|
|
302
424
|
/**
|
|
303
425
|
* Create an empty Arrow table with the provided schema
|
|
304
426
|
*/
|
|
305
|
-
export function makeEmptyTable(
|
|
306
|
-
|
|
427
|
+
export function makeEmptyTable(
|
|
428
|
+
schema: Schema,
|
|
429
|
+
metadata?: Map<string, string>,
|
|
430
|
+
): ArrowTable {
|
|
431
|
+
return makeArrowTable([], { schema }, metadata);
|
|
307
432
|
}
|
|
308
433
|
|
|
309
434
|
/**
|
|
@@ -375,13 +500,74 @@ function makeVector(
|
|
|
375
500
|
}
|
|
376
501
|
}
|
|
377
502
|
|
|
503
|
+
/** Helper function to apply embeddings from metadata to an input table */
|
|
504
|
+
async function applyEmbeddingsFromMetadata(
|
|
505
|
+
table: ArrowTable,
|
|
506
|
+
schema: Schema,
|
|
507
|
+
): Promise<ArrowTable> {
|
|
508
|
+
const registry = getRegistry();
|
|
509
|
+
const functions = registry.parseFunctions(schema.metadata);
|
|
510
|
+
|
|
511
|
+
const columns = Object.fromEntries(
|
|
512
|
+
table.schema.fields.map((field) => [
|
|
513
|
+
field.name,
|
|
514
|
+
table.getChild(field.name)!,
|
|
515
|
+
]),
|
|
516
|
+
);
|
|
517
|
+
|
|
518
|
+
for (const functionEntry of functions.values()) {
|
|
519
|
+
const sourceColumn = columns[functionEntry.sourceColumn];
|
|
520
|
+
const destColumn = functionEntry.vectorColumn ?? "vector";
|
|
521
|
+
if (sourceColumn === undefined) {
|
|
522
|
+
throw new Error(
|
|
523
|
+
`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`,
|
|
524
|
+
);
|
|
525
|
+
}
|
|
526
|
+
if (columns[destColumn] !== undefined) {
|
|
527
|
+
throw new Error(
|
|
528
|
+
`Attempt to apply embeddings to table failed because column ${destColumn} already existed`,
|
|
529
|
+
);
|
|
530
|
+
}
|
|
531
|
+
if (table.batches.length > 1) {
|
|
532
|
+
throw new Error(
|
|
533
|
+
"Internal error: `makeArrowTable` unexpectedly created a table with more than one batch",
|
|
534
|
+
);
|
|
535
|
+
}
|
|
536
|
+
const values = sourceColumn.toArray();
|
|
537
|
+
|
|
538
|
+
const vectors =
|
|
539
|
+
await functionEntry.function.computeSourceEmbeddings(values);
|
|
540
|
+
if (vectors.length !== values.length) {
|
|
541
|
+
throw new Error(
|
|
542
|
+
"Embedding function did not return an embedding for each input element",
|
|
543
|
+
);
|
|
544
|
+
}
|
|
545
|
+
let destType: DataType;
|
|
546
|
+
const dtype = schema.fields.find((f) => f.name === destColumn)!.type;
|
|
547
|
+
if (isFixedSizeList(dtype)) {
|
|
548
|
+
destType = sanitizeType(dtype);
|
|
549
|
+
} else {
|
|
550
|
+
throw new Error(
|
|
551
|
+
"Expected FixedSizeList as datatype for vector field, instead got: " +
|
|
552
|
+
dtype,
|
|
553
|
+
);
|
|
554
|
+
}
|
|
555
|
+
const vector = makeVector(vectors, destType);
|
|
556
|
+
columns[destColumn] = vector;
|
|
557
|
+
}
|
|
558
|
+
const newTable = new ArrowTable(columns);
|
|
559
|
+
return alignTable(newTable, schema);
|
|
560
|
+
}
|
|
561
|
+
|
|
378
562
|
/** Helper function to apply embeddings to an input table */
|
|
379
563
|
async function applyEmbeddings<T>(
|
|
380
564
|
table: ArrowTable,
|
|
381
|
-
embeddings?:
|
|
565
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
382
566
|
schema?: Schema,
|
|
383
567
|
): Promise<ArrowTable> {
|
|
384
|
-
if (
|
|
568
|
+
if (schema?.metadata.has("embedding_functions")) {
|
|
569
|
+
return applyEmbeddingsFromMetadata(table, schema!);
|
|
570
|
+
} else if (embeddings == null || embeddings === undefined) {
|
|
385
571
|
return table;
|
|
386
572
|
}
|
|
387
573
|
|
|
@@ -399,8 +585,9 @@ async function applyEmbeddings<T>(
|
|
|
399
585
|
const newColumns = Object.fromEntries(colEntries);
|
|
400
586
|
|
|
401
587
|
const sourceColumn = newColumns[embeddings.sourceColumn];
|
|
402
|
-
const destColumn = embeddings.
|
|
403
|
-
const innerDestType =
|
|
588
|
+
const destColumn = embeddings.vectorColumn ?? "vector";
|
|
589
|
+
const innerDestType =
|
|
590
|
+
embeddings.function.embeddingDataType() ?? new Float32();
|
|
404
591
|
if (sourceColumn === undefined) {
|
|
405
592
|
throw new Error(
|
|
406
593
|
`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`,
|
|
@@ -414,11 +601,9 @@ async function applyEmbeddings<T>(
|
|
|
414
601
|
// if we call convertToTable with 0 records and a schema that includes the embedding
|
|
415
602
|
return table;
|
|
416
603
|
}
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
innerDestType,
|
|
421
|
-
);
|
|
604
|
+
const dimensions = embeddings.function.ndims();
|
|
605
|
+
if (dimensions !== undefined) {
|
|
606
|
+
const destType = newVectorType(dimensions, innerDestType);
|
|
422
607
|
newColumns[destColumn] = makeVector([], destType);
|
|
423
608
|
} else if (schema != null) {
|
|
424
609
|
const destField = schema.fields.find((f) => f.name === destColumn);
|
|
@@ -446,7 +631,9 @@ async function applyEmbeddings<T>(
|
|
|
446
631
|
);
|
|
447
632
|
}
|
|
448
633
|
const values = sourceColumn.toArray();
|
|
449
|
-
const vectors = await embeddings.
|
|
634
|
+
const vectors = await embeddings.function.computeSourceEmbeddings(
|
|
635
|
+
values as T[],
|
|
636
|
+
);
|
|
450
637
|
if (vectors.length !== values.length) {
|
|
451
638
|
throw new Error(
|
|
452
639
|
"Embedding function did not return an embedding for each input element",
|
|
@@ -486,9 +673,9 @@ async function applyEmbeddings<T>(
|
|
|
486
673
|
* embedding columns. If no schema is provded then embedding columns will
|
|
487
674
|
* be placed at the end of the table, after all of the input columns.
|
|
488
675
|
*/
|
|
489
|
-
export async function convertToTable
|
|
676
|
+
export async function convertToTable(
|
|
490
677
|
data: Array<Record<string, unknown>>,
|
|
491
|
-
embeddings?:
|
|
678
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
492
679
|
makeTableOptions?: Partial<MakeArrowTableOptions>,
|
|
493
680
|
): Promise<ArrowTable> {
|
|
494
681
|
const table = makeArrowTable(data, makeTableOptions);
|
|
@@ -496,13 +683,13 @@ export async function convertToTable<T>(
|
|
|
496
683
|
}
|
|
497
684
|
|
|
498
685
|
/** Creates the Arrow Type for a Vector column with dimension `dim` */
|
|
499
|
-
function newVectorType<T extends Float>(
|
|
686
|
+
export function newVectorType<T extends Float>(
|
|
500
687
|
dim: number,
|
|
501
688
|
innerType: T,
|
|
502
689
|
): FixedSizeList<T> {
|
|
503
690
|
// in Lance we always default to have the elements nullable, so we need to set it to true
|
|
504
691
|
// otherwise we often get schema mismatches because the stored data always has schema with nullable elements
|
|
505
|
-
const children = new Field
|
|
692
|
+
const children = new Field("item", <T>sanitizeType(innerType), true);
|
|
506
693
|
return new FixedSizeList(dim, children);
|
|
507
694
|
}
|
|
508
695
|
|
|
@@ -513,9 +700,9 @@ function newVectorType<T extends Float>(
|
|
|
513
700
|
*
|
|
514
701
|
* `schema` is required if data is empty
|
|
515
702
|
*/
|
|
516
|
-
export async function fromRecordsToBuffer
|
|
703
|
+
export async function fromRecordsToBuffer(
|
|
517
704
|
data: Array<Record<string, unknown>>,
|
|
518
|
-
embeddings?:
|
|
705
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
519
706
|
schema?: Schema,
|
|
520
707
|
): Promise<Buffer> {
|
|
521
708
|
if (schema !== undefined && schema !== null) {
|
|
@@ -533,9 +720,9 @@ export async function fromRecordsToBuffer<T>(
|
|
|
533
720
|
*
|
|
534
721
|
* `schema` is required if data is empty
|
|
535
722
|
*/
|
|
536
|
-
export async function fromRecordsToStreamBuffer
|
|
723
|
+
export async function fromRecordsToStreamBuffer(
|
|
537
724
|
data: Array<Record<string, unknown>>,
|
|
538
|
-
embeddings?:
|
|
725
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
539
726
|
schema?: Schema,
|
|
540
727
|
): Promise<Buffer> {
|
|
541
728
|
if (schema !== undefined && schema !== null) {
|
|
@@ -554,9 +741,9 @@ export async function fromRecordsToStreamBuffer<T>(
|
|
|
554
741
|
*
|
|
555
742
|
* `schema` is required if the table is empty
|
|
556
743
|
*/
|
|
557
|
-
export async function fromTableToBuffer
|
|
744
|
+
export async function fromTableToBuffer(
|
|
558
745
|
table: ArrowTable,
|
|
559
|
-
embeddings?:
|
|
746
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
560
747
|
schema?: Schema,
|
|
561
748
|
): Promise<Buffer> {
|
|
562
749
|
if (schema !== undefined && schema !== null) {
|
|
@@ -575,19 +762,19 @@ export async function fromTableToBuffer<T>(
|
|
|
575
762
|
*
|
|
576
763
|
* `schema` is required if the table is empty
|
|
577
764
|
*/
|
|
578
|
-
export async function fromDataToBuffer
|
|
765
|
+
export async function fromDataToBuffer(
|
|
579
766
|
data: Data,
|
|
580
|
-
embeddings?:
|
|
767
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
581
768
|
schema?: Schema,
|
|
582
769
|
): Promise<Buffer> {
|
|
583
770
|
if (schema !== undefined && schema !== null) {
|
|
584
771
|
schema = sanitizeSchema(schema);
|
|
585
772
|
}
|
|
586
|
-
if (data
|
|
773
|
+
if (isArrowTable(data)) {
|
|
587
774
|
return fromTableToBuffer(data, embeddings, schema);
|
|
588
775
|
} else {
|
|
589
|
-
const table = await convertToTable(data);
|
|
590
|
-
return fromTableToBuffer(table
|
|
776
|
+
const table = await convertToTable(data, embeddings, { schema });
|
|
777
|
+
return fromTableToBuffer(table);
|
|
591
778
|
}
|
|
592
779
|
}
|
|
593
780
|
|
|
@@ -599,9 +786,9 @@ export async function fromDataToBuffer<T>(
|
|
|
599
786
|
*
|
|
600
787
|
* `schema` is required if the table is empty
|
|
601
788
|
*/
|
|
602
|
-
export async function fromTableToStreamBuffer
|
|
789
|
+
export async function fromTableToStreamBuffer(
|
|
603
790
|
table: ArrowTable,
|
|
604
|
-
embeddings?:
|
|
791
|
+
embeddings?: EmbeddingFunctionConfig,
|
|
605
792
|
schema?: Schema,
|
|
606
793
|
): Promise<Buffer> {
|
|
607
794
|
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
|
|
@@ -654,7 +841,7 @@ export function createEmptyTable(schema: Schema): ArrowTable {
|
|
|
654
841
|
function validateSchemaEmbeddings(
|
|
655
842
|
schema: Schema,
|
|
656
843
|
data: Array<Record<string, unknown>>,
|
|
657
|
-
embeddings:
|
|
844
|
+
embeddings: EmbeddingFunctionConfig | undefined,
|
|
658
845
|
) {
|
|
659
846
|
const fields = [];
|
|
660
847
|
const missingEmbeddingFields = [];
|
|
@@ -664,10 +851,25 @@ function validateSchemaEmbeddings(
|
|
|
664
851
|
// if it does not, we add it to the list of missing embedding fields
|
|
665
852
|
// Finally, we check if those missing embedding fields are `this._embeddings`
|
|
666
853
|
// if they are not, we throw an error
|
|
667
|
-
for (
|
|
668
|
-
if (field.type
|
|
854
|
+
for (let field of schema.fields) {
|
|
855
|
+
if (isFixedSizeList(field.type)) {
|
|
856
|
+
field = sanitizeField(field);
|
|
857
|
+
|
|
669
858
|
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
|
670
|
-
|
|
859
|
+
if (schema.metadata.has("embedding_functions")) {
|
|
860
|
+
const embeddings = JSON.parse(
|
|
861
|
+
schema.metadata.get("embedding_functions")!,
|
|
862
|
+
);
|
|
863
|
+
if (
|
|
864
|
+
// biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
|
|
865
|
+
embeddings.find((f: any) => f["vectorColumn"] === field.name) ===
|
|
866
|
+
undefined
|
|
867
|
+
) {
|
|
868
|
+
missingEmbeddingFields.push(field);
|
|
869
|
+
}
|
|
870
|
+
} else {
|
|
871
|
+
missingEmbeddingFields.push(field);
|
|
872
|
+
}
|
|
671
873
|
} else {
|
|
672
874
|
fields.push(field);
|
|
673
875
|
}
|
package/lancedb/connection.ts
CHANGED
|
@@ -12,8 +12,14 @@
|
|
|
12
12
|
// See the License for the specific language governing permissions and
|
|
13
13
|
// limitations under the License.
|
|
14
14
|
|
|
15
|
-
import { Table as ArrowTable, Schema } from "
|
|
16
|
-
import {
|
|
15
|
+
import { Table as ArrowTable, Schema } from "./arrow";
|
|
16
|
+
import {
|
|
17
|
+
fromTableToBuffer,
|
|
18
|
+
isArrowTable,
|
|
19
|
+
makeArrowTable,
|
|
20
|
+
makeEmptyTable,
|
|
21
|
+
} from "./arrow";
|
|
22
|
+
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
|
17
23
|
import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
|
|
18
24
|
import { Table } from "./table";
|
|
19
25
|
|
|
@@ -65,6 +71,14 @@ export interface CreateTableOptions {
|
|
|
65
71
|
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
|
|
66
72
|
*/
|
|
67
73
|
storageOptions?: Record<string, string>;
|
|
74
|
+
/**
|
|
75
|
+
* If true then data files will be written with the legacy format
|
|
76
|
+
*
|
|
77
|
+
* The default is true while the new format is in beta
|
|
78
|
+
*/
|
|
79
|
+
useLegacyFormat?: boolean;
|
|
80
|
+
schema?: Schema;
|
|
81
|
+
embeddingFunction?: EmbeddingFunctionConfig;
|
|
68
82
|
}
|
|
69
83
|
|
|
70
84
|
export interface OpenTableOptions {
|
|
@@ -174,6 +188,7 @@ export class Connection {
|
|
|
174
188
|
cleanseStorageOptions(options?.storageOptions),
|
|
175
189
|
options?.indexCacheSize,
|
|
176
190
|
);
|
|
191
|
+
|
|
177
192
|
return new Table(innerTable);
|
|
178
193
|
}
|
|
179
194
|
|
|
@@ -196,18 +211,25 @@ export class Connection {
|
|
|
196
211
|
}
|
|
197
212
|
|
|
198
213
|
let table: ArrowTable;
|
|
199
|
-
if (data
|
|
214
|
+
if (isArrowTable(data)) {
|
|
200
215
|
table = data;
|
|
201
216
|
} else {
|
|
202
|
-
table = makeArrowTable(data);
|
|
217
|
+
table = makeArrowTable(data, options);
|
|
203
218
|
}
|
|
204
|
-
|
|
219
|
+
|
|
220
|
+
const buf = await fromTableToBuffer(
|
|
221
|
+
table,
|
|
222
|
+
options?.embeddingFunction,
|
|
223
|
+
options?.schema,
|
|
224
|
+
);
|
|
205
225
|
const innerTable = await this.inner.createTable(
|
|
206
226
|
name,
|
|
207
227
|
buf,
|
|
208
228
|
mode,
|
|
209
229
|
cleanseStorageOptions(options?.storageOptions),
|
|
230
|
+
options?.useLegacyFormat,
|
|
210
231
|
);
|
|
232
|
+
|
|
211
233
|
return new Table(innerTable);
|
|
212
234
|
}
|
|
213
235
|
|
|
@@ -227,14 +249,21 @@ export class Connection {
|
|
|
227
249
|
if (mode === "create" && existOk) {
|
|
228
250
|
mode = "exist_ok";
|
|
229
251
|
}
|
|
252
|
+
let metadata: Map<string, string> | undefined = undefined;
|
|
253
|
+
if (options?.embeddingFunction !== undefined) {
|
|
254
|
+
const embeddingFunction = options.embeddingFunction;
|
|
255
|
+
const registry = getRegistry();
|
|
256
|
+
metadata = registry.getTableMetadata([embeddingFunction]);
|
|
257
|
+
}
|
|
230
258
|
|
|
231
|
-
const table = makeEmptyTable(schema);
|
|
259
|
+
const table = makeEmptyTable(schema, metadata);
|
|
232
260
|
const buf = await fromTableToBuffer(table);
|
|
233
261
|
const innerTable = await this.inner.createEmptyTable(
|
|
234
262
|
name,
|
|
235
263
|
buf,
|
|
236
264
|
mode,
|
|
237
265
|
cleanseStorageOptions(options?.storageOptions),
|
|
266
|
+
options?.useLegacyFormat,
|
|
238
267
|
);
|
|
239
268
|
return new Table(innerTable);
|
|
240
269
|
}
|