@lancedb/lancedb 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/biome.json +8 -2
- package/dist/arrow.d.ts +36 -9
- package/dist/arrow.js +222 -24
- package/dist/connection.d.ts +10 -1
- package/dist/connection.js +13 -7
- package/dist/embedding/embedding_function.d.ts +54 -28
- package/dist/embedding/embedding_function.js +89 -10
- package/dist/embedding/index.d.ts +28 -2
- package/dist/embedding/index.js +111 -4
- package/dist/embedding/openai.d.ts +16 -7
- package/dist/embedding/openai.js +62 -12
- package/dist/embedding/registry.d.ts +58 -0
- package/dist/embedding/registry.js +127 -0
- package/dist/native.d.ts +5 -4
- package/dist/query.d.ts +19 -7
- package/dist/query.js +27 -13
- package/dist/sanitize.d.ts +22 -1
- package/dist/sanitize.js +123 -110
- package/dist/table.d.ts +18 -3
- package/dist/table.js +33 -3
- package/lancedb/arrow.ts +243 -41
- package/lancedb/connection.ts +35 -6
- package/lancedb/embedding/embedding_function.ts +147 -42
- package/lancedb/embedding/index.ts +113 -2
- package/lancedb/embedding/openai.ts +62 -16
- package/lancedb/embedding/registry.ts +176 -0
- package/lancedb/query.ts +58 -14
- package/lancedb/sanitize.ts +22 -22
- package/lancedb/table.ts +67 -5
- package/nodejs-artifacts/arrow.d.ts +36 -9
- package/nodejs-artifacts/arrow.js +222 -24
- package/nodejs-artifacts/connection.d.ts +10 -1
- package/nodejs-artifacts/connection.js +13 -7
- package/nodejs-artifacts/embedding/embedding_function.d.ts +54 -28
- package/nodejs-artifacts/embedding/embedding_function.js +89 -10
- package/nodejs-artifacts/embedding/index.d.ts +28 -2
- package/nodejs-artifacts/embedding/index.js +111 -4
- package/nodejs-artifacts/embedding/openai.d.ts +16 -7
- package/nodejs-artifacts/embedding/openai.js +62 -12
- package/nodejs-artifacts/embedding/registry.d.ts +58 -0
- package/nodejs-artifacts/embedding/registry.js +127 -0
- package/nodejs-artifacts/native.d.ts +5 -4
- package/nodejs-artifacts/query.d.ts +19 -7
- package/nodejs-artifacts/query.js +27 -13
- package/nodejs-artifacts/sanitize.d.ts +22 -1
- package/nodejs-artifacts/sanitize.js +123 -110
- package/nodejs-artifacts/table.d.ts +18 -3
- package/nodejs-artifacts/table.js +33 -3
- package/package.json +14 -9
- package/tsconfig.json +3 -1
|
@@ -12,10 +12,133 @@
|
|
|
12
12
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
13
|
// See the License for the specific language governing permissions and
|
|
14
14
|
// limitations under the License.
|
|
15
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
16
|
+
if (k2 === undefined) k2 = k;
|
|
17
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
18
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
19
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
20
|
+
}
|
|
21
|
+
Object.defineProperty(o, k2, desc);
|
|
22
|
+
}) : (function(o, m, k, k2) {
|
|
23
|
+
if (k2 === undefined) k2 = k;
|
|
24
|
+
o[k2] = m[k];
|
|
25
|
+
}));
|
|
26
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
27
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
28
|
+
};
|
|
15
29
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
|
-
exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = void 0;
|
|
30
|
+
exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.newVectorType = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = exports.isFixedSizeList = exports.isFixedSizeBinary = exports.isUnion = exports.isStruct = exports.isList = exports.isDuration = exports.isInterval = exports.isTimestamp = exports.isTime = exports.isDate = exports.isDecimal = exports.isBool = exports.isLargeUtf8 = exports.isUtf8 = exports.isLargeBinary = exports.isBinary = exports.isFloat = exports.isInt = exports.isNull = exports.isDataType = exports.isArrowTable = void 0;
|
|
17
31
|
const apache_arrow_1 = require("apache-arrow");
|
|
32
|
+
const registry_1 = require("./embedding/registry");
|
|
18
33
|
const sanitize_1 = require("./sanitize");
|
|
34
|
+
__exportStar(require("apache-arrow"), exports);
|
|
35
|
+
function isArrowTable(value) {
|
|
36
|
+
if (value instanceof apache_arrow_1.Table)
|
|
37
|
+
return true;
|
|
38
|
+
return "schema" in value && "batches" in value;
|
|
39
|
+
}
|
|
40
|
+
exports.isArrowTable = isArrowTable;
|
|
41
|
+
function isDataType(value) {
|
|
42
|
+
return (value instanceof apache_arrow_1.DataType ||
|
|
43
|
+
apache_arrow_1.DataType.isNull(value) ||
|
|
44
|
+
apache_arrow_1.DataType.isInt(value) ||
|
|
45
|
+
apache_arrow_1.DataType.isFloat(value) ||
|
|
46
|
+
apache_arrow_1.DataType.isBinary(value) ||
|
|
47
|
+
apache_arrow_1.DataType.isLargeBinary(value) ||
|
|
48
|
+
apache_arrow_1.DataType.isUtf8(value) ||
|
|
49
|
+
apache_arrow_1.DataType.isLargeUtf8(value) ||
|
|
50
|
+
apache_arrow_1.DataType.isBool(value) ||
|
|
51
|
+
apache_arrow_1.DataType.isDecimal(value) ||
|
|
52
|
+
apache_arrow_1.DataType.isDate(value) ||
|
|
53
|
+
apache_arrow_1.DataType.isTime(value) ||
|
|
54
|
+
apache_arrow_1.DataType.isTimestamp(value) ||
|
|
55
|
+
apache_arrow_1.DataType.isInterval(value) ||
|
|
56
|
+
apache_arrow_1.DataType.isDuration(value) ||
|
|
57
|
+
apache_arrow_1.DataType.isList(value) ||
|
|
58
|
+
apache_arrow_1.DataType.isStruct(value) ||
|
|
59
|
+
apache_arrow_1.DataType.isUnion(value) ||
|
|
60
|
+
apache_arrow_1.DataType.isFixedSizeBinary(value) ||
|
|
61
|
+
apache_arrow_1.DataType.isFixedSizeList(value) ||
|
|
62
|
+
apache_arrow_1.DataType.isMap(value) ||
|
|
63
|
+
apache_arrow_1.DataType.isDictionary(value));
|
|
64
|
+
}
|
|
65
|
+
exports.isDataType = isDataType;
|
|
66
|
+
function isNull(value) {
|
|
67
|
+
return value instanceof apache_arrow_1.Null || apache_arrow_1.DataType.isNull(value);
|
|
68
|
+
}
|
|
69
|
+
exports.isNull = isNull;
|
|
70
|
+
function isInt(value) {
|
|
71
|
+
return value instanceof apache_arrow_1.Int || apache_arrow_1.DataType.isInt(value);
|
|
72
|
+
}
|
|
73
|
+
exports.isInt = isInt;
|
|
74
|
+
function isFloat(value) {
|
|
75
|
+
return value instanceof apache_arrow_1.Float || apache_arrow_1.DataType.isFloat(value);
|
|
76
|
+
}
|
|
77
|
+
exports.isFloat = isFloat;
|
|
78
|
+
function isBinary(value) {
|
|
79
|
+
return value instanceof apache_arrow_1.Binary || apache_arrow_1.DataType.isBinary(value);
|
|
80
|
+
}
|
|
81
|
+
exports.isBinary = isBinary;
|
|
82
|
+
function isLargeBinary(value) {
|
|
83
|
+
return value instanceof apache_arrow_1.LargeBinary || apache_arrow_1.DataType.isLargeBinary(value);
|
|
84
|
+
}
|
|
85
|
+
exports.isLargeBinary = isLargeBinary;
|
|
86
|
+
function isUtf8(value) {
|
|
87
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isUtf8(value);
|
|
88
|
+
}
|
|
89
|
+
exports.isUtf8 = isUtf8;
|
|
90
|
+
function isLargeUtf8(value) {
|
|
91
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isLargeUtf8(value);
|
|
92
|
+
}
|
|
93
|
+
exports.isLargeUtf8 = isLargeUtf8;
|
|
94
|
+
function isBool(value) {
|
|
95
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isBool(value);
|
|
96
|
+
}
|
|
97
|
+
exports.isBool = isBool;
|
|
98
|
+
function isDecimal(value) {
|
|
99
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDecimal(value);
|
|
100
|
+
}
|
|
101
|
+
exports.isDecimal = isDecimal;
|
|
102
|
+
function isDate(value) {
|
|
103
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDate(value);
|
|
104
|
+
}
|
|
105
|
+
exports.isDate = isDate;
|
|
106
|
+
function isTime(value) {
|
|
107
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTime(value);
|
|
108
|
+
}
|
|
109
|
+
exports.isTime = isTime;
|
|
110
|
+
function isTimestamp(value) {
|
|
111
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTimestamp(value);
|
|
112
|
+
}
|
|
113
|
+
exports.isTimestamp = isTimestamp;
|
|
114
|
+
function isInterval(value) {
|
|
115
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isInterval(value);
|
|
116
|
+
}
|
|
117
|
+
exports.isInterval = isInterval;
|
|
118
|
+
function isDuration(value) {
|
|
119
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDuration(value);
|
|
120
|
+
}
|
|
121
|
+
exports.isDuration = isDuration;
|
|
122
|
+
function isList(value) {
|
|
123
|
+
return value instanceof apache_arrow_1.List || apache_arrow_1.DataType.isList(value);
|
|
124
|
+
}
|
|
125
|
+
exports.isList = isList;
|
|
126
|
+
function isStruct(value) {
|
|
127
|
+
return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isStruct(value);
|
|
128
|
+
}
|
|
129
|
+
exports.isStruct = isStruct;
|
|
130
|
+
function isUnion(value) {
|
|
131
|
+
return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isUnion(value);
|
|
132
|
+
}
|
|
133
|
+
exports.isUnion = isUnion;
|
|
134
|
+
function isFixedSizeBinary(value) {
|
|
135
|
+
return value instanceof apache_arrow_1.FixedSizeBinary || apache_arrow_1.DataType.isFixedSizeBinary(value);
|
|
136
|
+
}
|
|
137
|
+
exports.isFixedSizeBinary = isFixedSizeBinary;
|
|
138
|
+
function isFixedSizeList(value) {
|
|
139
|
+
return value instanceof apache_arrow_1.FixedSizeList || apache_arrow_1.DataType.isFixedSizeList(value);
|
|
140
|
+
}
|
|
141
|
+
exports.isFixedSizeList = isFixedSizeList;
|
|
19
142
|
/*
|
|
20
143
|
* Options to control how a column should be converted to a vector array
|
|
21
144
|
*/
|
|
@@ -61,6 +184,7 @@ class MakeArrowTableOptions {
|
|
|
61
184
|
vector: new VectorColumnOptions(),
|
|
62
185
|
};
|
|
63
186
|
embeddings;
|
|
187
|
+
embeddingFunction;
|
|
64
188
|
/**
|
|
65
189
|
* If true then string columns will be encoded with dictionary encoding
|
|
66
190
|
*
|
|
@@ -168,7 +292,7 @@ exports.MakeArrowTableOptions = MakeArrowTableOptions;
|
|
|
168
292
|
* assert.deepEqual(table.schema, schema)
|
|
169
293
|
* ```
|
|
170
294
|
*/
|
|
171
|
-
function makeArrowTable(data, options) {
|
|
295
|
+
function makeArrowTable(data, options, metadata) {
|
|
172
296
|
if (data.length === 0 &&
|
|
173
297
|
(options?.schema === undefined || options?.schema === null)) {
|
|
174
298
|
throw new Error("At least one record or a schema needs to be provided");
|
|
@@ -176,7 +300,7 @@ function makeArrowTable(data, options) {
|
|
|
176
300
|
const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
|
|
177
301
|
if (opt.schema !== undefined && opt.schema !== null) {
|
|
178
302
|
opt.schema = (0, sanitize_1.sanitizeSchema)(opt.schema);
|
|
179
|
-
opt.schema = validateSchemaEmbeddings(opt.schema, data,
|
|
303
|
+
opt.schema = validateSchemaEmbeddings(opt.schema, data, options?.embeddingFunction);
|
|
180
304
|
}
|
|
181
305
|
const columns = {};
|
|
182
306
|
// TODO: sample dataset to find missing columns
|
|
@@ -246,21 +370,38 @@ function makeArrowTable(data, options) {
|
|
|
246
370
|
// then patch the schema of the batches so we can use
|
|
247
371
|
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
|
248
372
|
const firstTable = new apache_arrow_1.Table(columns);
|
|
249
|
-
const batchesFixed = firstTable.batches.map(
|
|
250
|
-
|
|
251
|
-
(
|
|
252
|
-
|
|
373
|
+
const batchesFixed = firstTable.batches.map((batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
|
|
374
|
+
let schema;
|
|
375
|
+
if (metadata !== undefined) {
|
|
376
|
+
let schemaMetadata = opt.schema.metadata;
|
|
377
|
+
if (schemaMetadata.size === 0) {
|
|
378
|
+
schemaMetadata = metadata;
|
|
379
|
+
}
|
|
380
|
+
else {
|
|
381
|
+
for (const [key, entry] of schemaMetadata.entries()) {
|
|
382
|
+
schemaMetadata.set(key, entry);
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
schema = new apache_arrow_1.Schema(opt.schema.fields, schemaMetadata);
|
|
386
|
+
}
|
|
387
|
+
else {
|
|
388
|
+
schema = opt.schema;
|
|
389
|
+
}
|
|
390
|
+
return new apache_arrow_1.Table(schema, batchesFixed);
|
|
253
391
|
}
|
|
254
|
-
|
|
255
|
-
|
|
392
|
+
const tbl = new apache_arrow_1.Table(columns);
|
|
393
|
+
if (metadata !== undefined) {
|
|
394
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
395
|
+
tbl.schema.metadata = metadata;
|
|
256
396
|
}
|
|
397
|
+
return tbl;
|
|
257
398
|
}
|
|
258
399
|
exports.makeArrowTable = makeArrowTable;
|
|
259
400
|
/**
|
|
260
401
|
* Create an empty Arrow table with the provided schema
|
|
261
402
|
*/
|
|
262
|
-
function makeEmptyTable(schema) {
|
|
263
|
-
return makeArrowTable([], { schema });
|
|
403
|
+
function makeEmptyTable(schema, metadata) {
|
|
404
|
+
return makeArrowTable([], { schema }, metadata);
|
|
264
405
|
}
|
|
265
406
|
exports.makeEmptyTable = makeEmptyTable;
|
|
266
407
|
/**
|
|
@@ -322,9 +463,52 @@ function makeVector(values, type, stringAsDictionary) {
|
|
|
322
463
|
return (0, apache_arrow_1.vectorFromArray)(values);
|
|
323
464
|
}
|
|
324
465
|
}
|
|
466
|
+
/** Helper function to apply embeddings from metadata to an input table */
|
|
467
|
+
/**
 * Helper function to apply embeddings from metadata to an input table.
 *
 * The embedding functions are parsed from `schema.metadata` through the
 * embedding registry. For each parsed entry the source column is read from
 * `table`, embeddings are computed for its values, and the result is stored
 * under the entry's `vectorColumn` (or "vector" when that is unset). The
 * combined columns are then passed through `alignTable` with `schema`.
 *
 * @param table - table holding the source column(s)
 * @param schema - schema whose metadata describes the embedding functions and
 *   whose fields provide the FixedSizeList type of every vector column
 * @returns the result of `alignTable` for the table extended with the
 *   computed vector column(s)
 * @throws Error if a source column is missing from the data, the destination
 *   column already exists, the table has more than one batch, the embedding
 *   function returns a different number of vectors than input values, or the
 *   destination field is missing from the schema or is not a FixedSizeList
 */
async function applyEmbeddingsFromMetadata(table, schema) {
    const registry = (0, registry_1.getRegistry)();
    const functions = registry.parseFunctions(schema.metadata);
    const columns = Object.fromEntries(table.schema.fields.map((field) => [
        field.name,
        table.getChild(field.name),
    ]));
    for (const functionEntry of functions.values()) {
        const sourceColumn = columns[functionEntry.sourceColumn];
        const destColumn = functionEntry.vectorColumn ?? "vector";
        if (sourceColumn === undefined) {
            throw new Error(`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`);
        }
        if (columns[destColumn] !== undefined) {
            throw new Error(`Attempt to apply embeddings to table failed because column ${destColumn} already existed`);
        }
        if (table.batches.length > 1) {
            throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
        }
        const values = sourceColumn.toArray();
        const vectors = await functionEntry.function.computeSourceEmbeddings(values);
        if (vectors.length !== values.length) {
            throw new Error("Embedding function did not return an embedding for each input element");
        }
        // Look the destination field up explicitly: dereferencing `.type` on a
        // missing field would otherwise surface as an opaque TypeError.
        const destField = schema.fields.find((f) => f.name === destColumn);
        if (destField === undefined) {
            throw new Error(`Cannot apply embedding function because the vector column '${destColumn}' was not present in the schema`);
        }
        const dtype = destField.type;
        if (!isFixedSizeList(dtype)) {
            throw new Error("Expected FixedSizeList as datatype for vector field, instead got: " +
                dtype);
        }
        const destType = (0, sanitize_1.sanitizeType)(dtype);
        columns[destColumn] = makeVector(vectors, destType);
    }
    const newTable = new apache_arrow_1.Table(columns);
    return alignTable(newTable, schema);
}
|
|
325
506
|
/** Helper function to apply embeddings to an input table */
|
|
326
507
|
async function applyEmbeddings(table, embeddings, schema) {
|
|
327
|
-
if (
|
|
508
|
+
if (schema?.metadata.has("embedding_functions")) {
|
|
509
|
+
return applyEmbeddingsFromMetadata(table, schema);
|
|
510
|
+
}
|
|
511
|
+
else if (embeddings == null || embeddings === undefined) {
|
|
328
512
|
return table;
|
|
329
513
|
}
|
|
330
514
|
if (schema !== undefined && schema !== null) {
|
|
@@ -339,8 +523,8 @@ async function applyEmbeddings(table, embeddings, schema) {
|
|
|
339
523
|
});
|
|
340
524
|
const newColumns = Object.fromEntries(colEntries);
|
|
341
525
|
const sourceColumn = newColumns[embeddings.sourceColumn];
|
|
342
|
-
const destColumn = embeddings.
|
|
343
|
-
const innerDestType = embeddings.embeddingDataType ?? new apache_arrow_1.Float32();
|
|
526
|
+
const destColumn = embeddings.vectorColumn ?? "vector";
|
|
527
|
+
const innerDestType = embeddings.function.embeddingDataType() ?? new apache_arrow_1.Float32();
|
|
344
528
|
if (sourceColumn === undefined) {
|
|
345
529
|
throw new Error(`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`);
|
|
346
530
|
}
|
|
@@ -351,8 +535,9 @@ async function applyEmbeddings(table, embeddings, schema) {
|
|
|
351
535
|
// if we call convertToTable with 0 records and a schema that includes the embedding
|
|
352
536
|
return table;
|
|
353
537
|
}
|
|
354
|
-
|
|
355
|
-
|
|
538
|
+
const dimensions = embeddings.function.ndims();
|
|
539
|
+
if (dimensions !== undefined) {
|
|
540
|
+
const destType = newVectorType(dimensions, innerDestType);
|
|
356
541
|
newColumns[destColumn] = makeVector([], destType);
|
|
357
542
|
}
|
|
358
543
|
else if (schema != null) {
|
|
@@ -376,7 +561,7 @@ async function applyEmbeddings(table, embeddings, schema) {
|
|
|
376
561
|
throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
|
|
377
562
|
}
|
|
378
563
|
const values = sourceColumn.toArray();
|
|
379
|
-
const vectors = await embeddings.
|
|
564
|
+
const vectors = await embeddings.function.computeSourceEmbeddings(values);
|
|
380
565
|
if (vectors.length !== values.length) {
|
|
381
566
|
throw new Error("Embedding function did not return an embedding for each input element");
|
|
382
567
|
}
|
|
@@ -419,9 +604,10 @@ exports.convertToTable = convertToTable;
|
|
|
419
604
|
function newVectorType(dim, innerType) {
|
|
420
605
|
// in Lance we always default to have the elements nullable, so we need to set it to true
|
|
421
606
|
// otherwise we often get schema mismatches because the stored data always has schema with nullable elements
|
|
422
|
-
const children = new apache_arrow_1.Field("item", innerType, true);
|
|
607
|
+
const children = new apache_arrow_1.Field("item", (0, sanitize_1.sanitizeType)(innerType), true);
|
|
423
608
|
return new apache_arrow_1.FixedSizeList(dim, children);
|
|
424
609
|
}
|
|
610
|
+
exports.newVectorType = newVectorType;
|
|
425
611
|
/**
|
|
426
612
|
* Serialize an Array of records into a buffer using the Arrow IPC File serialization
|
|
427
613
|
*
|
|
@@ -483,12 +669,12 @@ async function fromDataToBuffer(data, embeddings, schema) {
|
|
|
483
669
|
if (schema !== undefined && schema !== null) {
|
|
484
670
|
schema = (0, sanitize_1.sanitizeSchema)(schema);
|
|
485
671
|
}
|
|
486
|
-
if (data
|
|
672
|
+
if (isArrowTable(data)) {
|
|
487
673
|
return fromTableToBuffer(data, embeddings, schema);
|
|
488
674
|
}
|
|
489
675
|
else {
|
|
490
|
-
const table = await convertToTable(data);
|
|
491
|
-
return fromTableToBuffer(table
|
|
676
|
+
const table = await convertToTable(data, embeddings, { schema });
|
|
677
|
+
return fromTableToBuffer(table);
|
|
492
678
|
}
|
|
493
679
|
}
|
|
494
680
|
exports.fromDataToBuffer = fromDataToBuffer;
|
|
@@ -548,10 +734,22 @@ function validateSchemaEmbeddings(schema, data, embeddings) {
|
|
|
548
734
|
// if it does not, we add it to the list of missing embedding fields
|
|
549
735
|
// Finally, we check if those missing embedding fields are `this._embeddings`
|
|
550
736
|
// if they are not, we throw an error
|
|
551
|
-
for (
|
|
552
|
-
if (field.type
|
|
737
|
+
for (let field of schema.fields) {
|
|
738
|
+
if (isFixedSizeList(field.type)) {
|
|
739
|
+
field = (0, sanitize_1.sanitizeField)(field);
|
|
553
740
|
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
|
554
|
-
|
|
741
|
+
if (schema.metadata.has("embedding_functions")) {
|
|
742
|
+
const embeddings = JSON.parse(schema.metadata.get("embedding_functions"));
|
|
743
|
+
if (
|
|
744
|
+
// biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
|
|
745
|
+
embeddings.find((f) => f["vectorColumn"] === field.name) ===
|
|
746
|
+
undefined) {
|
|
747
|
+
missingEmbeddingFields.push(field);
|
|
748
|
+
}
|
|
749
|
+
}
|
|
750
|
+
else {
|
|
751
|
+
missingEmbeddingFields.push(field);
|
|
752
|
+
}
|
|
555
753
|
}
|
|
556
754
|
else {
|
|
557
755
|
fields.push(field);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Table as ArrowTable, Schema } from "
|
|
1
|
+
import { Table as ArrowTable, Schema } from "./arrow";
|
|
2
|
+
import { EmbeddingFunctionConfig } from "./embedding/registry";
|
|
2
3
|
import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
|
|
3
4
|
import { Table } from "./table";
|
|
4
5
|
/**
|
|
@@ -39,6 +40,14 @@ export interface CreateTableOptions {
|
|
|
39
40
|
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
|
|
40
41
|
*/
|
|
41
42
|
storageOptions?: Record<string, string>;
|
|
43
|
+
/**
|
|
44
|
+
* If true then data files will be written with the legacy format
|
|
45
|
+
*
|
|
46
|
+
* The default is true while the new format is in beta
|
|
47
|
+
*/
|
|
48
|
+
useLegacyFormat?: boolean;
|
|
49
|
+
schema?: Schema;
|
|
50
|
+
embeddingFunction?: EmbeddingFunctionConfig;
|
|
42
51
|
}
|
|
43
52
|
export interface OpenTableOptions {
|
|
44
53
|
/**
|
|
@@ -14,8 +14,8 @@
|
|
|
14
14
|
// limitations under the License.
|
|
15
15
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
16
|
exports.Connection = exports.connect = void 0;
|
|
17
|
-
const apache_arrow_1 = require("apache-arrow");
|
|
18
17
|
const arrow_1 = require("./arrow");
|
|
18
|
+
const registry_1 = require("./embedding/registry");
|
|
19
19
|
const native_1 = require("./native");
|
|
20
20
|
const table_1 = require("./table");
|
|
21
21
|
/**
|
|
@@ -109,14 +109,14 @@ class Connection {
|
|
|
109
109
|
mode = "exist_ok";
|
|
110
110
|
}
|
|
111
111
|
let table;
|
|
112
|
-
if (
|
|
112
|
+
if ((0, arrow_1.isArrowTable)(data)) {
|
|
113
113
|
table = data;
|
|
114
114
|
}
|
|
115
115
|
else {
|
|
116
|
-
table = (0, arrow_1.makeArrowTable)(data);
|
|
116
|
+
table = (0, arrow_1.makeArrowTable)(data, options);
|
|
117
117
|
}
|
|
118
|
-
const buf = await (0, arrow_1.fromTableToBuffer)(table);
|
|
119
|
-
const innerTable = await this.inner.createTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
|
|
118
|
+
const buf = await (0, arrow_1.fromTableToBuffer)(table, options?.embeddingFunction, options?.schema);
|
|
119
|
+
const innerTable = await this.inner.createTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions), options?.useLegacyFormat);
|
|
120
120
|
return new table_1.Table(innerTable);
|
|
121
121
|
}
|
|
122
122
|
/**
|
|
@@ -130,9 +130,15 @@ class Connection {
|
|
|
130
130
|
if (mode === "create" && existOk) {
|
|
131
131
|
mode = "exist_ok";
|
|
132
132
|
}
|
|
133
|
-
|
|
133
|
+
let metadata = undefined;
|
|
134
|
+
if (options?.embeddingFunction !== undefined) {
|
|
135
|
+
const embeddingFunction = options.embeddingFunction;
|
|
136
|
+
const registry = (0, registry_1.getRegistry)();
|
|
137
|
+
metadata = registry.getTableMetadata([embeddingFunction]);
|
|
138
|
+
}
|
|
139
|
+
const table = (0, arrow_1.makeEmptyTable)(schema, metadata);
|
|
134
140
|
const buf = await (0, arrow_1.fromTableToBuffer)(table);
|
|
135
|
-
const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
|
|
141
|
+
const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions), options?.useLegacyFormat);
|
|
136
142
|
return new table_1.Table(innerTable);
|
|
137
143
|
}
|
|
138
144
|
/**
|
|
@@ -1,45 +1,71 @@
|
|
|
1
|
-
import
|
|
1
|
+
import "reflect-metadata";
|
|
2
|
+
import { DataType, Float, type IntoVector } from "../arrow";
|
|
3
|
+
/**
|
|
4
|
+
* Options for a given embedding function
|
|
5
|
+
*/
|
|
6
|
+
export interface FunctionOptions {
|
|
7
|
+
[key: string]: any;
|
|
8
|
+
}
|
|
2
9
|
/**
|
|
3
10
|
* An embedding function that automatically creates vector representation for a given column.
|
|
4
11
|
*/
|
|
5
|
-
export
|
|
12
|
+
export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptions = FunctionOptions> {
|
|
6
13
|
/**
|
|
7
|
-
*
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
* The data type of the embedding
|
|
14
|
+
* Convert the embedding function to a JSON object
|
|
15
|
+
* It is used to serialize the embedding function to the schema
|
|
16
|
+
* It's important that any object returned by this method contains all the necessary
|
|
17
|
+
* information to recreate the embedding function
|
|
12
18
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
* be used to control the conversion.
|
|
16
|
-
*/
|
|
17
|
-
embeddingDataType?: Float;
|
|
18
|
-
/**
|
|
19
|
-
* The dimension of the embedding
|
|
19
|
+
* It should return the same object that was passed to the constructor
|
|
20
|
+
* If it does not, the embedding function will not be able to be recreated, or could be recreated incorrectly
|
|
20
21
|
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
22
|
+
* @example
|
|
23
|
+
* ```ts
|
|
24
|
+
* class MyEmbeddingFunction extends EmbeddingFunction {
|
|
25
|
+
* constructor(options: {model: string, timeout: number}) {
|
|
26
|
+
* super();
|
|
27
|
+
* this.model = options.model;
|
|
28
|
+
* this.timeout = options.timeout;
|
|
29
|
+
* }
|
|
30
|
+
* toJSON() {
|
|
31
|
+
* return {
|
|
32
|
+
* model: this.model,
|
|
33
|
+
* timeout: this.timeout,
|
|
34
|
+
* };
|
|
35
|
+
* }
|
|
36
|
+
* ```
|
|
24
37
|
*/
|
|
25
|
-
|
|
38
|
+
abstract toJSON(): Partial<M>;
|
|
26
39
|
/**
|
|
27
|
-
*
|
|
40
|
+
* sourceField is used in combination with `LanceSchema` to provide a declarative data model
|
|
41
|
+
*
|
|
42
|
+
* @param optionsOrDatatype - The options for the field or the datatype
|
|
28
43
|
*
|
|
29
|
-
*
|
|
44
|
+
* @see {@link lancedb.LanceSchema}
|
|
30
45
|
*/
|
|
31
|
-
|
|
46
|
+
sourceField(optionsOrDatatype: Partial<FieldOptions> | DataType): [DataType, Map<string, EmbeddingFunction>];
|
|
32
47
|
/**
|
|
33
|
-
*
|
|
48
|
+
* vectorField is used in combination with `LanceSchema` to provide a declarative data model
|
|
49
|
+
*
|
|
50
|
+
* @param optionsOrDatatype - The options for the field or the datatype
|
|
34
51
|
*
|
|
35
|
-
*
|
|
36
|
-
* only the embedding will be stored.
|
|
52
|
+
* @see {@link lancedb.LanceSchema}
|
|
37
53
|
*/
|
|
38
|
-
|
|
54
|
+
vectorField(optionsOrDatatype?: Partial<FieldOptions> | DataType): [DataType, Map<string, EmbeddingFunction>];
|
|
55
|
+
/** The number of dimensions of the embeddings */
|
|
56
|
+
ndims(): number | undefined;
|
|
57
|
+
/** The datatype of the embeddings */
|
|
58
|
+
abstract embeddingDataType(): Float;
|
|
39
59
|
/**
|
|
40
60
|
* Creates a vector representation for the given values.
|
|
41
61
|
*/
|
|
42
|
-
|
|
62
|
+
abstract computeSourceEmbeddings(data: T[]): Promise<number[][] | Float32Array[] | Float64Array[]>;
|
|
63
|
+
/**
|
|
64
|
+
Compute the embeddings for a single query
|
|
65
|
+
*/
|
|
66
|
+
computeQueryEmbeddings(data: T): Promise<IntoVector>;
|
|
67
|
+
}
|
|
68
|
+
export interface FieldOptions<T extends DataType = DataType> {
|
|
69
|
+
datatype: T;
|
|
70
|
+
dims?: number;
|
|
43
71
|
}
|
|
44
|
-
/** Test if the input seems to be an embedding function */
|
|
45
|
-
export declare function isEmbeddingFunction<T>(value: unknown): value is EmbeddingFunction<T>;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
// Copyright
|
|
2
|
+
// Copyright 2024 Lance Developers.
|
|
3
3
|
//
|
|
4
4
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
5
|
// you may not use this file except in compliance with the License.
|
|
@@ -13,15 +13,94 @@
|
|
|
13
13
|
// See the License for the specific language governing permissions and
|
|
14
14
|
// limitations under the License.
|
|
15
15
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
|
-
exports.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
16
|
+
exports.EmbeddingFunction = void 0;
|
|
17
|
+
require("reflect-metadata");
|
|
18
|
+
const arrow_1 = require("../arrow");
|
|
19
|
+
const sanitize_1 = require("../sanitize");
|
|
20
|
+
/**
|
|
21
|
+
* An embedding function that automatically creates vector representation for a given column.
|
|
22
|
+
*/
|
|
23
|
+
class EmbeddingFunction {
|
|
24
|
+
/**
|
|
25
|
+
* sourceField is used in combination with `LanceSchema` to provide a declarative data model
|
|
26
|
+
*
|
|
27
|
+
* @param optionsOrDatatype - The options for the field or the datatype
|
|
28
|
+
*
|
|
29
|
+
* @see {@link lancedb.LanceSchema}
|
|
30
|
+
*/
|
|
31
|
+
sourceField(optionsOrDatatype) {
|
|
32
|
+
let datatype = (0, arrow_1.isDataType)(optionsOrDatatype)
|
|
33
|
+
? optionsOrDatatype
|
|
34
|
+
: optionsOrDatatype?.datatype;
|
|
35
|
+
if (!datatype) {
|
|
36
|
+
throw new Error("Datatype is required");
|
|
37
|
+
}
|
|
38
|
+
datatype = (0, sanitize_1.sanitizeType)(datatype);
|
|
39
|
+
const metadata = new Map();
|
|
40
|
+
metadata.set("source_column_for", this);
|
|
41
|
+
return [datatype, metadata];
|
|
21
42
|
}
|
|
22
|
-
|
|
23
|
-
|
|
43
|
+
/**
|
|
44
|
+
* vectorField is used in combination with `LanceSchema` to provide a declarative data model
|
|
45
|
+
*
|
|
46
|
+
* @param optionsOrDatatype - The options for the field or the datatype
|
|
47
|
+
*
|
|
48
|
+
* @see {@link lancedb.LanceSchema}
|
|
49
|
+
*/
|
|
50
|
+
vectorField(optionsOrDatatype) {
|
|
51
|
+
let dtype;
|
|
52
|
+
let vectorType;
|
|
53
|
+
let dims = this.ndims();
|
|
54
|
+
// `func.vectorField(new Float32())`
|
|
55
|
+
if ((0, arrow_1.isDataType)(optionsOrDatatype)) {
|
|
56
|
+
dtype = optionsOrDatatype;
|
|
57
|
+
}
|
|
58
|
+
else {
|
|
59
|
+
// `func.vectorField({
|
|
60
|
+
// datatype: new Float32(),
|
|
61
|
+
// dims: 10
|
|
62
|
+
// })`
|
|
63
|
+
dims = dims ?? optionsOrDatatype?.dims;
|
|
64
|
+
dtype = optionsOrDatatype?.datatype;
|
|
65
|
+
}
|
|
66
|
+
if (dtype !== undefined) {
|
|
67
|
+
// `func.vectorField(new FixedSizeList(dims, new Field("item", new Float32(), true)))`
|
|
68
|
+
// or `func.vectorField({datatype: new FixedSizeList(dims, new Field("item", new Float32(), true))})`
|
|
69
|
+
if ((0, arrow_1.isFixedSizeList)(dtype)) {
|
|
70
|
+
vectorType = dtype;
|
|
71
|
+
// `func.vectorField(new Float32())`
|
|
72
|
+
// or `func.vectorField({datatype: new Float32()})`
|
|
73
|
+
}
|
|
74
|
+
else if ((0, arrow_1.isFloat)(dtype)) {
|
|
75
|
+
// No `ndims` impl and no `{dims: n}` provided;
|
|
76
|
+
if (dims === undefined) {
|
|
77
|
+
throw new Error("ndims is required for vector field");
|
|
78
|
+
}
|
|
79
|
+
vectorType = (0, arrow_1.newVectorType)(dims, dtype);
|
|
80
|
+
}
|
|
81
|
+
else {
|
|
82
|
+
throw new Error("Expected FixedSizeList or Float as datatype for vector field");
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
else {
|
|
86
|
+
if (dims === undefined) {
|
|
87
|
+
throw new Error("ndims is required for vector field");
|
|
88
|
+
}
|
|
89
|
+
vectorType = new arrow_1.FixedSizeList(dims, new arrow_1.Field("item", new arrow_1.Float32(), true));
|
|
90
|
+
}
|
|
91
|
+
const metadata = new Map();
|
|
92
|
+
metadata.set("vector_column_for", this);
|
|
93
|
+
return [vectorType, metadata];
|
|
94
|
+
}
|
|
95
|
+
/** The number of dimensions of the embeddings */
|
|
96
|
+
ndims() {
|
|
97
|
+
return undefined;
|
|
98
|
+
}
|
|
99
|
+
/**
|
|
100
|
+
Compute the embeddings for a single query
|
|
101
|
+
*/
|
|
102
|
+
async computeQueryEmbeddings(data) {
|
|
103
|
+
return this.computeSourceEmbeddings([data]).then((embeddings) => embeddings[0]);
|
|
24
104
|
}
|
|
25
|
-
return (typeof value.sourceColumn === "string" && typeof value.embed === "function");
|
|
26
105
|
}
|
|
27
|
-
exports.
|
|
106
|
+
exports.EmbeddingFunction = EmbeddingFunction;
|
|
@@ -1,2 +1,28 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
import { Schema } from "../arrow";
|
|
2
|
+
import { EmbeddingFunction } from "./embedding_function";
|
|
3
|
+
export { EmbeddingFunction } from "./embedding_function";
|
|
4
|
+
export * from "./openai";
|
|
5
|
+
export * from "./registry";
|
|
6
|
+
/**
|
|
7
|
+
* Create a schema with embedding functions.
|
|
8
|
+
*
|
|
9
|
+
* @param fields
|
|
10
|
+
* @returns Schema
|
|
11
|
+
* @example
|
|
12
|
+
* ```ts
|
|
13
|
+
* class MyEmbeddingFunction extends EmbeddingFunction {
|
|
14
|
+
* // ...
|
|
15
|
+
* }
|
|
16
|
+
* const func = new MyEmbeddingFunction();
|
|
17
|
+
* const schema = LanceSchema({
|
|
18
|
+
* id: new Int32(),
|
|
19
|
+
* text: func.sourceField(new Utf8()),
|
|
20
|
+
* vector: func.vectorField(),
|
|
21
|
+
* // optional: specify the datatype and/or dimensions
|
|
22
|
+
* vector2: func.vectorField({ datatype: new Float32(), dims: 3}),
|
|
23
|
+
* });
|
|
24
|
+
*
|
|
25
|
+
* const table = await db.createTable("my_table", data, { schema });
|
|
26
|
+
* ```
|
|
27
|
+
*/
|
|
28
|
+
export declare function LanceSchema(fields: Record<string, [object, Map<string, EmbeddingFunction>] | object>): Schema;
|