@lancedb/lancedb 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/biome.json +8 -2
- package/dist/arrow.d.ts +34 -9
- package/dist/arrow.js +220 -23
- package/dist/connection.d.ts +4 -1
- package/dist/connection.js +11 -5
- package/dist/embedding/embedding_function.d.ts +54 -28
- package/dist/embedding/embedding_function.js +71 -10
- package/dist/embedding/index.d.ts +28 -2
- package/dist/embedding/index.js +111 -4
- package/dist/embedding/openai.d.ts +16 -7
- package/dist/embedding/openai.js +62 -12
- package/dist/embedding/registry.d.ts +54 -0
- package/dist/embedding/registry.js +123 -0
- package/dist/query.d.ts +1 -1
- package/dist/query.js +3 -3
- package/dist/sanitize.d.ts +22 -1
- package/dist/sanitize.js +123 -110
- package/dist/table.d.ts +1 -2
- package/dist/table.js +6 -3
- package/lancedb/arrow.ts +234 -38
- package/lancedb/connection.ts +27 -6
- package/lancedb/embedding/embedding_function.ts +126 -42
- package/lancedb/embedding/index.ts +113 -2
- package/lancedb/embedding/openai.ts +62 -16
- package/lancedb/embedding/registry.ts +172 -0
- package/lancedb/query.ts +2 -1
- package/lancedb/sanitize.ts +22 -22
- package/lancedb/table.ts +10 -3
- package/nodejs-artifacts/arrow.d.ts +34 -9
- package/nodejs-artifacts/arrow.js +220 -23
- package/nodejs-artifacts/connection.d.ts +4 -1
- package/nodejs-artifacts/connection.js +11 -5
- package/nodejs-artifacts/embedding/embedding_function.d.ts +54 -28
- package/nodejs-artifacts/embedding/embedding_function.js +71 -10
- package/nodejs-artifacts/embedding/index.d.ts +28 -2
- package/nodejs-artifacts/embedding/index.js +111 -4
- package/nodejs-artifacts/embedding/openai.d.ts +16 -7
- package/nodejs-artifacts/embedding/openai.js +62 -12
- package/nodejs-artifacts/embedding/registry.d.ts +54 -0
- package/nodejs-artifacts/embedding/registry.js +123 -0
- package/nodejs-artifacts/query.d.ts +1 -1
- package/nodejs-artifacts/query.js +3 -3
- package/nodejs-artifacts/sanitize.d.ts +22 -1
- package/nodejs-artifacts/sanitize.js +123 -110
- package/nodejs-artifacts/table.d.ts +1 -2
- package/nodejs-artifacts/table.js +6 -3
- package/package.json +14 -9
- package/tsconfig.json +3 -1
|
@@ -1,6 +1,29 @@
|
|
|
1
1
|
/// <reference types="node" />
|
|
2
|
-
import { Table as ArrowTable,
|
|
2
|
+
import { Table as ArrowTable, Binary, DataType, FixedSizeBinary, FixedSizeList, Float, Int, LargeBinary, List, Null, Schema, Struct, Utf8 } from "apache-arrow";
|
|
3
3
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
|
4
|
+
import { EmbeddingFunctionConfig } from "./embedding/registry";
|
|
5
|
+
export * from "apache-arrow";
|
|
6
|
+
export declare function isArrowTable(value: object): value is ArrowTable;
|
|
7
|
+
export declare function isDataType(value: unknown): value is DataType;
|
|
8
|
+
export declare function isNull(value: unknown): value is Null;
|
|
9
|
+
export declare function isInt(value: unknown): value is Int;
|
|
10
|
+
export declare function isFloat(value: unknown): value is Float;
|
|
11
|
+
export declare function isBinary(value: unknown): value is Binary;
|
|
12
|
+
export declare function isLargeBinary(value: unknown): value is LargeBinary;
|
|
13
|
+
export declare function isUtf8(value: unknown): value is Utf8;
|
|
14
|
+
export declare function isLargeUtf8(value: unknown): value is Utf8;
|
|
15
|
+
export declare function isBool(value: unknown): value is Utf8;
|
|
16
|
+
export declare function isDecimal(value: unknown): value is Utf8;
|
|
17
|
+
export declare function isDate(value: unknown): value is Utf8;
|
|
18
|
+
export declare function isTime(value: unknown): value is Utf8;
|
|
19
|
+
export declare function isTimestamp(value: unknown): value is Utf8;
|
|
20
|
+
export declare function isInterval(value: unknown): value is Utf8;
|
|
21
|
+
export declare function isDuration(value: unknown): value is Utf8;
|
|
22
|
+
export declare function isList(value: unknown): value is List;
|
|
23
|
+
export declare function isStruct(value: unknown): value is Struct;
|
|
24
|
+
export declare function isUnion(value: unknown): value is Struct;
|
|
25
|
+
export declare function isFixedSizeBinary(value: unknown): value is FixedSizeBinary;
|
|
26
|
+
export declare function isFixedSizeList(value: unknown): value is FixedSizeList;
|
|
4
27
|
/** Data type accepted by NodeJS SDK */
|
|
5
28
|
export type Data = Record<string, unknown>[] | ArrowTable;
|
|
6
29
|
export declare class VectorColumnOptions {
|
|
@@ -117,11 +140,11 @@ export declare class MakeArrowTableOptions {
|
|
|
117
140
|
* assert.deepEqual(table.schema, schema)
|
|
118
141
|
* ```
|
|
119
142
|
*/
|
|
120
|
-
export declare function makeArrowTable(data: Array<Record<string, unknown>>, options?: Partial<MakeArrowTableOptions>): ArrowTable;
|
|
143
|
+
export declare function makeArrowTable(data: Array<Record<string, unknown>>, options?: Partial<MakeArrowTableOptions>, metadata?: Map<string, string>): ArrowTable;
|
|
121
144
|
/**
|
|
122
145
|
* Create an empty Arrow table with the provided schema
|
|
123
146
|
*/
|
|
124
|
-
export declare function makeEmptyTable(schema: Schema): ArrowTable;
|
|
147
|
+
export declare function makeEmptyTable(schema: Schema, metadata?: Map<string, string>): ArrowTable;
|
|
125
148
|
/**
|
|
126
149
|
* Convert an Array of records into an Arrow Table, optionally applying an
|
|
127
150
|
* embeddings function to it.
|
|
@@ -140,7 +163,9 @@ export declare function makeEmptyTable(schema: Schema): ArrowTable;
|
|
|
140
163
|
* embedding columns. If no schema is provded then embedding columns will
|
|
141
164
|
* be placed at the end of the table, after all of the input columns.
|
|
142
165
|
*/
|
|
143
|
-
export declare function convertToTable
|
|
166
|
+
export declare function convertToTable(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, makeTableOptions?: Partial<MakeArrowTableOptions>): Promise<ArrowTable>;
|
|
167
|
+
/** Creates the Arrow Type for a Vector column with dimension `dim` */
|
|
168
|
+
export declare function newVectorType<T extends Float>(dim: number, innerType: T): FixedSizeList<T>;
|
|
144
169
|
/**
|
|
145
170
|
* Serialize an Array of records into a buffer using the Arrow IPC File serialization
|
|
146
171
|
*
|
|
@@ -148,7 +173,7 @@ export declare function convertToTable<T>(data: Array<Record<string, unknown>>,
|
|
|
148
173
|
*
|
|
149
174
|
* `schema` is required if data is empty
|
|
150
175
|
*/
|
|
151
|
-
export declare function fromRecordsToBuffer
|
|
176
|
+
export declare function fromRecordsToBuffer(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
152
177
|
/**
|
|
153
178
|
* Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
|
|
154
179
|
*
|
|
@@ -156,7 +181,7 @@ export declare function fromRecordsToBuffer<T>(data: Array<Record<string, unknow
|
|
|
156
181
|
*
|
|
157
182
|
* `schema` is required if data is empty
|
|
158
183
|
*/
|
|
159
|
-
export declare function fromRecordsToStreamBuffer
|
|
184
|
+
export declare function fromRecordsToStreamBuffer(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
160
185
|
/**
|
|
161
186
|
* Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
|
|
162
187
|
*
|
|
@@ -165,7 +190,7 @@ export declare function fromRecordsToStreamBuffer<T>(data: Array<Record<string,
|
|
|
165
190
|
*
|
|
166
191
|
* `schema` is required if the table is empty
|
|
167
192
|
*/
|
|
168
|
-
export declare function fromTableToBuffer
|
|
193
|
+
export declare function fromTableToBuffer(table: ArrowTable, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
169
194
|
/**
|
|
170
195
|
* Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
|
|
171
196
|
*
|
|
@@ -174,7 +199,7 @@ export declare function fromTableToBuffer<T>(table: ArrowTable, embeddings?: Emb
|
|
|
174
199
|
*
|
|
175
200
|
* `schema` is required if the table is empty
|
|
176
201
|
*/
|
|
177
|
-
export declare function fromDataToBuffer
|
|
202
|
+
export declare function fromDataToBuffer(data: Data, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
178
203
|
/**
|
|
179
204
|
* Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
|
|
180
205
|
*
|
|
@@ -183,7 +208,7 @@ export declare function fromDataToBuffer<T>(data: Data, embeddings?: EmbeddingFu
|
|
|
183
208
|
*
|
|
184
209
|
* `schema` is required if the table is empty
|
|
185
210
|
*/
|
|
186
|
-
export declare function fromTableToStreamBuffer
|
|
211
|
+
export declare function fromTableToStreamBuffer(table: ArrowTable, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
187
212
|
/**
|
|
188
213
|
* Create an empty table with the given schema
|
|
189
214
|
*/
|
|
@@ -12,10 +12,133 @@
|
|
|
12
12
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
13
|
// See the License for the specific language governing permissions and
|
|
14
14
|
// limitations under the License.
|
|
15
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
16
|
+
if (k2 === undefined) k2 = k;
|
|
17
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
18
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
19
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
20
|
+
}
|
|
21
|
+
Object.defineProperty(o, k2, desc);
|
|
22
|
+
}) : (function(o, m, k, k2) {
|
|
23
|
+
if (k2 === undefined) k2 = k;
|
|
24
|
+
o[k2] = m[k];
|
|
25
|
+
}));
|
|
26
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
27
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
28
|
+
};
|
|
15
29
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
|
-
exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = void 0;
|
|
30
|
+
exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.newVectorType = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = exports.isFixedSizeList = exports.isFixedSizeBinary = exports.isUnion = exports.isStruct = exports.isList = exports.isDuration = exports.isInterval = exports.isTimestamp = exports.isTime = exports.isDate = exports.isDecimal = exports.isBool = exports.isLargeUtf8 = exports.isUtf8 = exports.isLargeBinary = exports.isBinary = exports.isFloat = exports.isInt = exports.isNull = exports.isDataType = exports.isArrowTable = void 0;
|
|
17
31
|
const apache_arrow_1 = require("apache-arrow");
|
|
32
|
+
const registry_1 = require("./embedding/registry");
|
|
18
33
|
const sanitize_1 = require("./sanitize");
|
|
34
|
+
__exportStar(require("apache-arrow"), exports);
|
|
35
|
+
function isArrowTable(value) {
|
|
36
|
+
if (value instanceof apache_arrow_1.Table)
|
|
37
|
+
return true;
|
|
38
|
+
return "schema" in value && "batches" in value;
|
|
39
|
+
}
|
|
40
|
+
exports.isArrowTable = isArrowTable;
|
|
41
|
+
function isDataType(value) {
|
|
42
|
+
return (value instanceof apache_arrow_1.DataType ||
|
|
43
|
+
apache_arrow_1.DataType.isNull(value) ||
|
|
44
|
+
apache_arrow_1.DataType.isInt(value) ||
|
|
45
|
+
apache_arrow_1.DataType.isFloat(value) ||
|
|
46
|
+
apache_arrow_1.DataType.isBinary(value) ||
|
|
47
|
+
apache_arrow_1.DataType.isLargeBinary(value) ||
|
|
48
|
+
apache_arrow_1.DataType.isUtf8(value) ||
|
|
49
|
+
apache_arrow_1.DataType.isLargeUtf8(value) ||
|
|
50
|
+
apache_arrow_1.DataType.isBool(value) ||
|
|
51
|
+
apache_arrow_1.DataType.isDecimal(value) ||
|
|
52
|
+
apache_arrow_1.DataType.isDate(value) ||
|
|
53
|
+
apache_arrow_1.DataType.isTime(value) ||
|
|
54
|
+
apache_arrow_1.DataType.isTimestamp(value) ||
|
|
55
|
+
apache_arrow_1.DataType.isInterval(value) ||
|
|
56
|
+
apache_arrow_1.DataType.isDuration(value) ||
|
|
57
|
+
apache_arrow_1.DataType.isList(value) ||
|
|
58
|
+
apache_arrow_1.DataType.isStruct(value) ||
|
|
59
|
+
apache_arrow_1.DataType.isUnion(value) ||
|
|
60
|
+
apache_arrow_1.DataType.isFixedSizeBinary(value) ||
|
|
61
|
+
apache_arrow_1.DataType.isFixedSizeList(value) ||
|
|
62
|
+
apache_arrow_1.DataType.isMap(value) ||
|
|
63
|
+
apache_arrow_1.DataType.isDictionary(value));
|
|
64
|
+
}
|
|
65
|
+
exports.isDataType = isDataType;
|
|
66
|
+
function isNull(value) {
|
|
67
|
+
return value instanceof apache_arrow_1.Null || apache_arrow_1.DataType.isNull(value);
|
|
68
|
+
}
|
|
69
|
+
exports.isNull = isNull;
|
|
70
|
+
function isInt(value) {
|
|
71
|
+
return value instanceof apache_arrow_1.Int || apache_arrow_1.DataType.isInt(value);
|
|
72
|
+
}
|
|
73
|
+
exports.isInt = isInt;
|
|
74
|
+
function isFloat(value) {
|
|
75
|
+
return value instanceof apache_arrow_1.Float || apache_arrow_1.DataType.isFloat(value);
|
|
76
|
+
}
|
|
77
|
+
exports.isFloat = isFloat;
|
|
78
|
+
function isBinary(value) {
|
|
79
|
+
return value instanceof apache_arrow_1.Binary || apache_arrow_1.DataType.isBinary(value);
|
|
80
|
+
}
|
|
81
|
+
exports.isBinary = isBinary;
|
|
82
|
+
function isLargeBinary(value) {
|
|
83
|
+
return value instanceof apache_arrow_1.LargeBinary || apache_arrow_1.DataType.isLargeBinary(value);
|
|
84
|
+
}
|
|
85
|
+
exports.isLargeBinary = isLargeBinary;
|
|
86
|
+
function isUtf8(value) {
|
|
87
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isUtf8(value);
|
|
88
|
+
}
|
|
89
|
+
exports.isUtf8 = isUtf8;
|
|
90
|
+
function isLargeUtf8(value) {
|
|
91
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isLargeUtf8(value);
|
|
92
|
+
}
|
|
93
|
+
exports.isLargeUtf8 = isLargeUtf8;
|
|
94
|
+
function isBool(value) {
|
|
95
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isBool(value);
|
|
96
|
+
}
|
|
97
|
+
exports.isBool = isBool;
|
|
98
|
+
function isDecimal(value) {
|
|
99
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDecimal(value);
|
|
100
|
+
}
|
|
101
|
+
exports.isDecimal = isDecimal;
|
|
102
|
+
function isDate(value) {
|
|
103
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDate(value);
|
|
104
|
+
}
|
|
105
|
+
exports.isDate = isDate;
|
|
106
|
+
function isTime(value) {
|
|
107
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTime(value);
|
|
108
|
+
}
|
|
109
|
+
exports.isTime = isTime;
|
|
110
|
+
function isTimestamp(value) {
|
|
111
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTimestamp(value);
|
|
112
|
+
}
|
|
113
|
+
exports.isTimestamp = isTimestamp;
|
|
114
|
+
function isInterval(value) {
|
|
115
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isInterval(value);
|
|
116
|
+
}
|
|
117
|
+
exports.isInterval = isInterval;
|
|
118
|
+
function isDuration(value) {
|
|
119
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDuration(value);
|
|
120
|
+
}
|
|
121
|
+
exports.isDuration = isDuration;
|
|
122
|
+
function isList(value) {
|
|
123
|
+
return value instanceof apache_arrow_1.List || apache_arrow_1.DataType.isList(value);
|
|
124
|
+
}
|
|
125
|
+
exports.isList = isList;
|
|
126
|
+
function isStruct(value) {
|
|
127
|
+
return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isStruct(value);
|
|
128
|
+
}
|
|
129
|
+
exports.isStruct = isStruct;
|
|
130
|
+
function isUnion(value) {
|
|
131
|
+
return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isUnion(value);
|
|
132
|
+
}
|
|
133
|
+
exports.isUnion = isUnion;
|
|
134
|
+
function isFixedSizeBinary(value) {
|
|
135
|
+
return value instanceof apache_arrow_1.FixedSizeBinary || apache_arrow_1.DataType.isFixedSizeBinary(value);
|
|
136
|
+
}
|
|
137
|
+
exports.isFixedSizeBinary = isFixedSizeBinary;
|
|
138
|
+
function isFixedSizeList(value) {
|
|
139
|
+
return value instanceof apache_arrow_1.FixedSizeList || apache_arrow_1.DataType.isFixedSizeList(value);
|
|
140
|
+
}
|
|
141
|
+
exports.isFixedSizeList = isFixedSizeList;
|
|
19
142
|
/*
|
|
20
143
|
* Options to control how a column should be converted to a vector array
|
|
21
144
|
*/
|
|
@@ -168,7 +291,7 @@ exports.MakeArrowTableOptions = MakeArrowTableOptions;
|
|
|
168
291
|
* assert.deepEqual(table.schema, schema)
|
|
169
292
|
* ```
|
|
170
293
|
*/
|
|
171
|
-
function makeArrowTable(data, options) {
|
|
294
|
+
function makeArrowTable(data, options, metadata) {
|
|
172
295
|
if (data.length === 0 &&
|
|
173
296
|
(options?.schema === undefined || options?.schema === null)) {
|
|
174
297
|
throw new Error("At least one record or a schema needs to be provided");
|
|
@@ -246,21 +369,38 @@ function makeArrowTable(data, options) {
|
|
|
246
369
|
// then patch the schema of the batches so we can use
|
|
247
370
|
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
|
248
371
|
const firstTable = new apache_arrow_1.Table(columns);
|
|
249
|
-
const batchesFixed = firstTable.batches.map(
|
|
250
|
-
|
|
251
|
-
(
|
|
252
|
-
|
|
372
|
+
const batchesFixed = firstTable.batches.map((batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
|
|
373
|
+
let schema;
|
|
374
|
+
if (metadata !== undefined) {
|
|
375
|
+
let schemaMetadata = opt.schema.metadata;
|
|
376
|
+
if (schemaMetadata.size === 0) {
|
|
377
|
+
schemaMetadata = metadata;
|
|
378
|
+
}
|
|
379
|
+
else {
|
|
380
|
+
for (const [key, entry] of schemaMetadata.entries()) {
|
|
381
|
+
schemaMetadata.set(key, entry);
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
schema = new apache_arrow_1.Schema(opt.schema.fields, schemaMetadata);
|
|
385
|
+
}
|
|
386
|
+
else {
|
|
387
|
+
schema = opt.schema;
|
|
388
|
+
}
|
|
389
|
+
return new apache_arrow_1.Table(schema, batchesFixed);
|
|
253
390
|
}
|
|
254
|
-
|
|
255
|
-
|
|
391
|
+
const tbl = new apache_arrow_1.Table(columns);
|
|
392
|
+
if (metadata !== undefined) {
|
|
393
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
394
|
+
tbl.schema.metadata = metadata;
|
|
256
395
|
}
|
|
396
|
+
return tbl;
|
|
257
397
|
}
|
|
258
398
|
exports.makeArrowTable = makeArrowTable;
|
|
259
399
|
/**
|
|
260
400
|
* Create an empty Arrow table with the provided schema
|
|
261
401
|
*/
|
|
262
|
-
function makeEmptyTable(schema) {
|
|
263
|
-
return makeArrowTable([], { schema });
|
|
402
|
+
function makeEmptyTable(schema, metadata) {
|
|
403
|
+
return makeArrowTable([], { schema }, metadata);
|
|
264
404
|
}
|
|
265
405
|
exports.makeEmptyTable = makeEmptyTable;
|
|
266
406
|
/**
|
|
@@ -322,9 +462,52 @@ function makeVector(values, type, stringAsDictionary) {
|
|
|
322
462
|
return (0, apache_arrow_1.vectorFromArray)(values);
|
|
323
463
|
}
|
|
324
464
|
}
|
|
465
|
+
/** Helper function to apply embeddings from metadata to an input table */
|
|
466
|
+
async function applyEmbeddingsFromMetadata(table, schema) {
|
|
467
|
+
const registry = (0, registry_1.getRegistry)();
|
|
468
|
+
const functions = registry.parseFunctions(schema.metadata);
|
|
469
|
+
const columns = Object.fromEntries(table.schema.fields.map((field) => [
|
|
470
|
+
field.name,
|
|
471
|
+
table.getChild(field.name),
|
|
472
|
+
]));
|
|
473
|
+
for (const functionEntry of functions.values()) {
|
|
474
|
+
const sourceColumn = columns[functionEntry.sourceColumn];
|
|
475
|
+
const destColumn = functionEntry.vectorColumn ?? "vector";
|
|
476
|
+
if (sourceColumn === undefined) {
|
|
477
|
+
throw new Error(`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`);
|
|
478
|
+
}
|
|
479
|
+
if (columns[destColumn] !== undefined) {
|
|
480
|
+
throw new Error(`Attempt to apply embeddings to table failed because column ${destColumn} already existed`);
|
|
481
|
+
}
|
|
482
|
+
if (table.batches.length > 1) {
|
|
483
|
+
throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
|
|
484
|
+
}
|
|
485
|
+
const values = sourceColumn.toArray();
|
|
486
|
+
const vectors = await functionEntry.function.computeSourceEmbeddings(values);
|
|
487
|
+
if (vectors.length !== values.length) {
|
|
488
|
+
throw new Error("Embedding function did not return an embedding for each input element");
|
|
489
|
+
}
|
|
490
|
+
let destType;
|
|
491
|
+
const dtype = schema.fields.find((f) => f.name === destColumn).type;
|
|
492
|
+
if (isFixedSizeList(dtype)) {
|
|
493
|
+
destType = (0, sanitize_1.sanitizeType)(dtype);
|
|
494
|
+
}
|
|
495
|
+
else {
|
|
496
|
+
throw new Error("Expected FixedSizeList as datatype for vector field, instead got: " +
|
|
497
|
+
dtype);
|
|
498
|
+
}
|
|
499
|
+
const vector = makeVector(vectors, destType);
|
|
500
|
+
columns[destColumn] = vector;
|
|
501
|
+
}
|
|
502
|
+
const newTable = new apache_arrow_1.Table(columns);
|
|
503
|
+
return alignTable(newTable, schema);
|
|
504
|
+
}
|
|
325
505
|
/** Helper function to apply embeddings to an input table */
|
|
326
506
|
async function applyEmbeddings(table, embeddings, schema) {
|
|
327
|
-
if (
|
|
507
|
+
if (schema?.metadata.has("embedding_functions")) {
|
|
508
|
+
return applyEmbeddingsFromMetadata(table, schema);
|
|
509
|
+
}
|
|
510
|
+
else if (embeddings == null || embeddings === undefined) {
|
|
328
511
|
return table;
|
|
329
512
|
}
|
|
330
513
|
if (schema !== undefined && schema !== null) {
|
|
@@ -339,8 +522,8 @@ async function applyEmbeddings(table, embeddings, schema) {
|
|
|
339
522
|
});
|
|
340
523
|
const newColumns = Object.fromEntries(colEntries);
|
|
341
524
|
const sourceColumn = newColumns[embeddings.sourceColumn];
|
|
342
|
-
const destColumn = embeddings.
|
|
343
|
-
const innerDestType = embeddings.embeddingDataType ?? new apache_arrow_1.Float32();
|
|
525
|
+
const destColumn = embeddings.vectorColumn ?? "vector";
|
|
526
|
+
const innerDestType = embeddings.function.embeddingDataType() ?? new apache_arrow_1.Float32();
|
|
344
527
|
if (sourceColumn === undefined) {
|
|
345
528
|
throw new Error(`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`);
|
|
346
529
|
}
|
|
@@ -351,8 +534,9 @@ async function applyEmbeddings(table, embeddings, schema) {
|
|
|
351
534
|
// if we call convertToTable with 0 records and a schema that includes the embedding
|
|
352
535
|
return table;
|
|
353
536
|
}
|
|
354
|
-
|
|
355
|
-
|
|
537
|
+
const dimensions = embeddings.function.ndims();
|
|
538
|
+
if (dimensions !== undefined) {
|
|
539
|
+
const destType = newVectorType(dimensions, innerDestType);
|
|
356
540
|
newColumns[destColumn] = makeVector([], destType);
|
|
357
541
|
}
|
|
358
542
|
else if (schema != null) {
|
|
@@ -376,7 +560,7 @@ async function applyEmbeddings(table, embeddings, schema) {
|
|
|
376
560
|
throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
|
|
377
561
|
}
|
|
378
562
|
const values = sourceColumn.toArray();
|
|
379
|
-
const vectors = await embeddings.
|
|
563
|
+
const vectors = await embeddings.function.computeSourceEmbeddings(values);
|
|
380
564
|
if (vectors.length !== values.length) {
|
|
381
565
|
throw new Error("Embedding function did not return an embedding for each input element");
|
|
382
566
|
}
|
|
@@ -419,9 +603,10 @@ exports.convertToTable = convertToTable;
|
|
|
419
603
|
function newVectorType(dim, innerType) {
|
|
420
604
|
// in Lance we always default to have the elements nullable, so we need to set it to true
|
|
421
605
|
// otherwise we often get schema mismatches because the stored data always has schema with nullable elements
|
|
422
|
-
const children = new apache_arrow_1.Field("item", innerType, true);
|
|
606
|
+
const children = new apache_arrow_1.Field("item", (0, sanitize_1.sanitizeType)(innerType), true);
|
|
423
607
|
return new apache_arrow_1.FixedSizeList(dim, children);
|
|
424
608
|
}
|
|
609
|
+
exports.newVectorType = newVectorType;
|
|
425
610
|
/**
|
|
426
611
|
* Serialize an Array of records into a buffer using the Arrow IPC File serialization
|
|
427
612
|
*
|
|
@@ -483,12 +668,12 @@ async function fromDataToBuffer(data, embeddings, schema) {
|
|
|
483
668
|
if (schema !== undefined && schema !== null) {
|
|
484
669
|
schema = (0, sanitize_1.sanitizeSchema)(schema);
|
|
485
670
|
}
|
|
486
|
-
if (data
|
|
671
|
+
if (isArrowTable(data)) {
|
|
487
672
|
return fromTableToBuffer(data, embeddings, schema);
|
|
488
673
|
}
|
|
489
674
|
else {
|
|
490
|
-
const table = await convertToTable(data);
|
|
491
|
-
return fromTableToBuffer(table
|
|
675
|
+
const table = await convertToTable(data, embeddings, { schema });
|
|
676
|
+
return fromTableToBuffer(table);
|
|
492
677
|
}
|
|
493
678
|
}
|
|
494
679
|
exports.fromDataToBuffer = fromDataToBuffer;
|
|
@@ -548,10 +733,22 @@ function validateSchemaEmbeddings(schema, data, embeddings) {
|
|
|
548
733
|
// if it does not, we add it to the list of missing embedding fields
|
|
549
734
|
// Finally, we check if those missing embedding fields are `this._embeddings`
|
|
550
735
|
// if they are not, we throw an error
|
|
551
|
-
for (
|
|
552
|
-
if (field.type
|
|
736
|
+
for (let field of schema.fields) {
|
|
737
|
+
if (isFixedSizeList(field.type)) {
|
|
738
|
+
field = (0, sanitize_1.sanitizeField)(field);
|
|
553
739
|
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
|
554
|
-
|
|
740
|
+
if (schema.metadata.has("embedding_functions")) {
|
|
741
|
+
const embeddings = JSON.parse(schema.metadata.get("embedding_functions"));
|
|
742
|
+
if (
|
|
743
|
+
// biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
|
|
744
|
+
embeddings.find((f) => f["vectorColumn"] === field.name) ===
|
|
745
|
+
undefined) {
|
|
746
|
+
missingEmbeddingFields.push(field);
|
|
747
|
+
}
|
|
748
|
+
}
|
|
749
|
+
else {
|
|
750
|
+
missingEmbeddingFields.push(field);
|
|
751
|
+
}
|
|
555
752
|
}
|
|
556
753
|
else {
|
|
557
754
|
fields.push(field);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Table as ArrowTable, Schema } from "
|
|
1
|
+
import { Table as ArrowTable, Schema } from "./arrow";
|
|
2
|
+
import { EmbeddingFunctionConfig } from "./embedding/registry";
|
|
2
3
|
import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
|
|
3
4
|
import { Table } from "./table";
|
|
4
5
|
/**
|
|
@@ -39,6 +40,8 @@ export interface CreateTableOptions {
|
|
|
39
40
|
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
|
|
40
41
|
*/
|
|
41
42
|
storageOptions?: Record<string, string>;
|
|
43
|
+
schema?: Schema;
|
|
44
|
+
embeddingFunction?: EmbeddingFunctionConfig;
|
|
42
45
|
}
|
|
43
46
|
export interface OpenTableOptions {
|
|
44
47
|
/**
|
|
@@ -14,8 +14,8 @@
|
|
|
14
14
|
// limitations under the License.
|
|
15
15
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
16
|
exports.Connection = exports.connect = void 0;
|
|
17
|
-
const apache_arrow_1 = require("apache-arrow");
|
|
18
17
|
const arrow_1 = require("./arrow");
|
|
18
|
+
const registry_1 = require("./embedding/registry");
|
|
19
19
|
const native_1 = require("./native");
|
|
20
20
|
const table_1 = require("./table");
|
|
21
21
|
/**
|
|
@@ -109,13 +109,13 @@ class Connection {
|
|
|
109
109
|
mode = "exist_ok";
|
|
110
110
|
}
|
|
111
111
|
let table;
|
|
112
|
-
if (
|
|
112
|
+
if ((0, arrow_1.isArrowTable)(data)) {
|
|
113
113
|
table = data;
|
|
114
114
|
}
|
|
115
115
|
else {
|
|
116
|
-
table = (0, arrow_1.makeArrowTable)(data);
|
|
116
|
+
table = (0, arrow_1.makeArrowTable)(data, options);
|
|
117
117
|
}
|
|
118
|
-
const buf = await (0, arrow_1.fromTableToBuffer)(table);
|
|
118
|
+
const buf = await (0, arrow_1.fromTableToBuffer)(table, options?.embeddingFunction, options?.schema);
|
|
119
119
|
const innerTable = await this.inner.createTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
|
|
120
120
|
return new table_1.Table(innerTable);
|
|
121
121
|
}
|
|
@@ -130,7 +130,13 @@ class Connection {
|
|
|
130
130
|
if (mode === "create" && existOk) {
|
|
131
131
|
mode = "exist_ok";
|
|
132
132
|
}
|
|
133
|
-
|
|
133
|
+
let metadata = undefined;
|
|
134
|
+
if (options?.embeddingFunction !== undefined) {
|
|
135
|
+
const embeddingFunction = options.embeddingFunction;
|
|
136
|
+
const registry = (0, registry_1.getRegistry)();
|
|
137
|
+
metadata = registry.getTableMetadata([embeddingFunction]);
|
|
138
|
+
}
|
|
139
|
+
const table = (0, arrow_1.makeEmptyTable)(schema, metadata);
|
|
134
140
|
const buf = await (0, arrow_1.fromTableToBuffer)(table);
|
|
135
141
|
const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions));
|
|
136
142
|
return new table_1.Table(innerTable);
|
|
@@ -1,45 +1,71 @@
|
|
|
1
|
-
import
|
|
1
|
+
import "reflect-metadata";
|
|
2
|
+
import { DataType, Float } from "../arrow";
|
|
3
|
+
/**
|
|
4
|
+
* Options for a given embedding function
|
|
5
|
+
*/
|
|
6
|
+
export interface FunctionOptions {
|
|
7
|
+
[key: string]: any;
|
|
8
|
+
}
|
|
2
9
|
/**
|
|
3
10
|
* An embedding function that automatically creates vector representation for a given column.
|
|
4
11
|
*/
|
|
5
|
-
export
|
|
12
|
+
export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptions = FunctionOptions> {
|
|
6
13
|
/**
|
|
7
|
-
*
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
* The data type of the embedding
|
|
14
|
+
* Convert the embedding function to a JSON object
|
|
15
|
+
* It is used to serialize the embedding function to the schema
|
|
16
|
+
* It's important that any object returned by this method contains all the necessary
|
|
17
|
+
* information to recreate the embedding function
|
|
12
18
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
* be used to control the conversion.
|
|
16
|
-
*/
|
|
17
|
-
embeddingDataType?: Float;
|
|
18
|
-
/**
|
|
19
|
-
* The dimension of the embedding
|
|
19
|
+
* It should return the same object that was passed to the constructor
|
|
20
|
+
* If it does not, the embedding function will not be able to be recreated, or could be recreated incorrectly
|
|
20
21
|
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
22
|
+
* @example
|
|
23
|
+
* ```ts
|
|
24
|
+
* class MyEmbeddingFunction extends EmbeddingFunction {
|
|
25
|
+
* constructor(options: {model: string, timeout: number}) {
|
|
26
|
+
* super();
|
|
27
|
+
* this.model = options.model;
|
|
28
|
+
* this.timeout = options.timeout;
|
|
29
|
+
* }
|
|
30
|
+
* toJSON() {
|
|
31
|
+
* return {
|
|
32
|
+
* model: this.model,
|
|
33
|
+
* timeout: this.timeout,
|
|
34
|
+
* };
|
|
35
|
+
* }
|
|
36
|
+
* ```
|
|
24
37
|
*/
|
|
25
|
-
|
|
38
|
+
abstract toJSON(): Partial<M>;
|
|
26
39
|
/**
|
|
27
|
-
*
|
|
40
|
+
* sourceField is used in combination with `LanceSchema` to provide a declarative data model
|
|
41
|
+
*
|
|
42
|
+
* @param optionsOrDatatype - The options for the field or the datatype
|
|
28
43
|
*
|
|
29
|
-
*
|
|
44
|
+
* @see {@link lancedb.LanceSchema}
|
|
30
45
|
*/
|
|
31
|
-
|
|
46
|
+
sourceField(optionsOrDatatype: Partial<FieldOptions> | DataType): [DataType, Map<string, EmbeddingFunction>];
|
|
32
47
|
/**
|
|
33
|
-
*
|
|
48
|
+
* vectorField is used in combination with `LanceSchema` to provide a declarative data model
|
|
49
|
+
*
|
|
50
|
+
* @param options - The options for the field
|
|
34
51
|
*
|
|
35
|
-
*
|
|
36
|
-
* only the embedding will be stored.
|
|
52
|
+
* @see {@link lancedb.LanceSchema}
|
|
37
53
|
*/
|
|
38
|
-
|
|
54
|
+
vectorField(options?: Partial<FieldOptions>): [DataType, Map<string, EmbeddingFunction>];
|
|
55
|
+
/** The number of dimensions of the embeddings */
|
|
56
|
+
ndims(): number | undefined;
|
|
57
|
+
/** The datatype of the embeddings */
|
|
58
|
+
abstract embeddingDataType(): Float;
|
|
39
59
|
/**
|
|
40
60
|
* Creates a vector representation for the given values.
|
|
41
61
|
*/
|
|
42
|
-
|
|
62
|
+
abstract computeSourceEmbeddings(data: T[]): Promise<number[][] | Float32Array[] | Float64Array[]>;
|
|
63
|
+
/**
|
|
64
|
+
Compute the embeddings for a single query
|
|
65
|
+
*/
|
|
66
|
+
computeQueryEmbeddings(data: T): Promise<number[] | Float32Array | Float64Array>;
|
|
67
|
+
}
|
|
68
|
+
export interface FieldOptions<T extends DataType = DataType> {
|
|
69
|
+
datatype: T;
|
|
70
|
+
dims?: number;
|
|
43
71
|
}
|
|
44
|
-
/** Test if the input seems to be an embedding function */
|
|
45
|
-
export declare function isEmbeddingFunction<T>(value: unknown): value is EmbeddingFunction<T>;
|