@lancedb/lancedb 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/biome.json +8 -2
- package/dist/arrow.d.ts +36 -9
- package/dist/arrow.js +222 -24
- package/dist/connection.d.ts +10 -1
- package/dist/connection.js +13 -7
- package/dist/embedding/embedding_function.d.ts +54 -28
- package/dist/embedding/embedding_function.js +89 -10
- package/dist/embedding/index.d.ts +28 -2
- package/dist/embedding/index.js +111 -4
- package/dist/embedding/openai.d.ts +16 -7
- package/dist/embedding/openai.js +62 -12
- package/dist/embedding/registry.d.ts +58 -0
- package/dist/embedding/registry.js +127 -0
- package/dist/native.d.ts +5 -4
- package/dist/query.d.ts +19 -7
- package/dist/query.js +27 -13
- package/dist/sanitize.d.ts +22 -1
- package/dist/sanitize.js +123 -110
- package/dist/table.d.ts +18 -3
- package/dist/table.js +33 -3
- package/lancedb/arrow.ts +243 -41
- package/lancedb/connection.ts +35 -6
- package/lancedb/embedding/embedding_function.ts +147 -42
- package/lancedb/embedding/index.ts +113 -2
- package/lancedb/embedding/openai.ts +62 -16
- package/lancedb/embedding/registry.ts +176 -0
- package/lancedb/query.ts +58 -14
- package/lancedb/sanitize.ts +22 -22
- package/lancedb/table.ts +67 -5
- package/nodejs-artifacts/arrow.d.ts +36 -9
- package/nodejs-artifacts/arrow.js +222 -24
- package/nodejs-artifacts/connection.d.ts +10 -1
- package/nodejs-artifacts/connection.js +13 -7
- package/nodejs-artifacts/embedding/embedding_function.d.ts +54 -28
- package/nodejs-artifacts/embedding/embedding_function.js +89 -10
- package/nodejs-artifacts/embedding/index.d.ts +28 -2
- package/nodejs-artifacts/embedding/index.js +111 -4
- package/nodejs-artifacts/embedding/openai.d.ts +16 -7
- package/nodejs-artifacts/embedding/openai.js +62 -12
- package/nodejs-artifacts/embedding/registry.d.ts +58 -0
- package/nodejs-artifacts/embedding/registry.js +127 -0
- package/nodejs-artifacts/native.d.ts +5 -4
- package/nodejs-artifacts/query.d.ts +19 -7
- package/nodejs-artifacts/query.js +27 -13
- package/nodejs-artifacts/sanitize.d.ts +22 -1
- package/nodejs-artifacts/sanitize.js +123 -110
- package/nodejs-artifacts/table.d.ts +18 -3
- package/nodejs-artifacts/table.js +33 -3
- package/package.json +14 -9
- package/tsconfig.json +3 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
// Copyright
|
|
1
|
+
// Copyright 2024 Lance Developers.
|
|
2
2
|
//
|
|
3
3
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
4
|
// you may not use this file except in compliance with the License.
|
|
@@ -12,67 +12,172 @@
|
|
|
12
12
|
// See the License for the specific language governing permissions and
|
|
13
13
|
// limitations under the License.
|
|
14
14
|
|
|
15
|
-
import
|
|
15
|
+
import "reflect-metadata";
|
|
16
|
+
import {
|
|
17
|
+
DataType,
|
|
18
|
+
Field,
|
|
19
|
+
FixedSizeList,
|
|
20
|
+
Float,
|
|
21
|
+
Float32,
|
|
22
|
+
type IntoVector,
|
|
23
|
+
isDataType,
|
|
24
|
+
isFixedSizeList,
|
|
25
|
+
isFloat,
|
|
26
|
+
newVectorType,
|
|
27
|
+
} from "../arrow";
|
|
28
|
+
import { sanitizeType } from "../sanitize";
|
|
16
29
|
|
|
17
30
|
/**
|
|
18
|
-
*
|
|
31
|
+
* Options for a given embedding function
|
|
19
32
|
*/
|
|
20
|
-
export interface
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
sourceColumn: string;
|
|
33
|
+
export interface FunctionOptions {
|
|
34
|
+
// biome-ignore lint/suspicious/noExplicitAny: options can be anything
|
|
35
|
+
[key: string]: any;
|
|
36
|
+
}
|
|
25
37
|
|
|
38
|
+
/**
|
|
39
|
+
* An embedding function that automatically creates vector representation for a given column.
|
|
40
|
+
*/
|
|
41
|
+
export abstract class EmbeddingFunction<
|
|
42
|
+
// biome-ignore lint/suspicious/noExplicitAny: we don't know what the implementor will do
|
|
43
|
+
T = any,
|
|
44
|
+
M extends FunctionOptions = FunctionOptions,
|
|
45
|
+
> {
|
|
26
46
|
/**
|
|
27
|
-
*
|
|
47
|
+
* Convert the embedding function to a JSON object
|
|
48
|
+
* It is used to serialize the embedding function to the schema
|
|
49
|
+
* It's important that any object returned by this method contains all the necessary
|
|
50
|
+
* information to recreate the embedding function
|
|
28
51
|
*
|
|
29
|
-
*
|
|
30
|
-
*
|
|
31
|
-
* be used to control the conversion.
|
|
32
|
-
*/
|
|
33
|
-
embeddingDataType?: Float;
|
|
34
|
-
|
|
35
|
-
/**
|
|
36
|
-
* The dimension of the embedding
|
|
52
|
+
* It should return the same object that was passed to the constructor
|
|
53
|
+
* If it does not, the embedding function will not be able to be recreated, or could be recreated incorrectly
|
|
37
54
|
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
40
|
-
*
|
|
55
|
+
* @example
|
|
56
|
+
* ```ts
|
|
57
|
+
* class MyEmbeddingFunction extends EmbeddingFunction {
|
|
58
|
+
* constructor(options: {model: string, timeout: number}) {
|
|
59
|
+
* super();
|
|
60
|
+
* this.model = options.model;
|
|
61
|
+
* this.timeout = options.timeout;
|
|
62
|
+
* }
|
|
63
|
+
* toJSON() {
|
|
64
|
+
* return {
|
|
65
|
+
* model: this.model,
|
|
66
|
+
* timeout: this.timeout,
|
|
67
|
+
* };
|
|
68
|
+
* }
* }
|
|
69
|
+
* ```
|
|
41
70
|
*/
|
|
42
|
-
|
|
71
|
+
abstract toJSON(): Partial<M>;
|
|
43
72
|
|
|
44
73
|
/**
|
|
45
|
-
*
|
|
74
|
+
* sourceField is used in combination with `LanceSchema` to provide a declarative data model
|
|
75
|
+
*
|
|
76
|
+
* @param optionsOrDatatype - The options for the field or the datatype
|
|
46
77
|
*
|
|
47
|
-
*
|
|
78
|
+
* @see {@link lancedb.LanceSchema}
|
|
48
79
|
*/
|
|
49
|
-
|
|
80
|
+
sourceField(
|
|
81
|
+
optionsOrDatatype: Partial<FieldOptions> | DataType,
|
|
82
|
+
): [DataType, Map<string, EmbeddingFunction>] {
|
|
83
|
+
let datatype = isDataType(optionsOrDatatype)
|
|
84
|
+
? optionsOrDatatype
|
|
85
|
+
: optionsOrDatatype?.datatype;
|
|
86
|
+
if (!datatype) {
|
|
87
|
+
throw new Error("Datatype is required");
|
|
88
|
+
}
|
|
89
|
+
datatype = sanitizeType(datatype);
|
|
90
|
+
const metadata = new Map<string, EmbeddingFunction>();
|
|
91
|
+
metadata.set("source_column_for", this);
|
|
92
|
+
|
|
93
|
+
return [datatype, metadata];
|
|
94
|
+
}
|
|
50
95
|
|
|
51
96
|
/**
|
|
52
|
-
*
|
|
97
|
+
* vectorField is used in combination with `LanceSchema` to provide a declarative data model
|
|
98
|
+
*
|
|
99
|
+
* @param optionsOrDatatype - The options for the field, or the vector datatype itself
|
|
53
100
|
*
|
|
54
|
-
*
|
|
55
|
-
* only the embedding will be stored.
|
|
101
|
+
* @see {@link lancedb.LanceSchema}
|
|
56
102
|
*/
|
|
57
|
-
|
|
103
|
+
vectorField(
|
|
104
|
+
optionsOrDatatype?: Partial<FieldOptions> | DataType,
|
|
105
|
+
): [DataType, Map<string, EmbeddingFunction>] {
|
|
106
|
+
let dtype: DataType | undefined;
|
|
107
|
+
let vectorType: DataType;
|
|
108
|
+
let dims: number | undefined = this.ndims();
|
|
109
|
+
|
|
110
|
+
// `func.vectorField(new Float32())`
|
|
111
|
+
if (isDataType(optionsOrDatatype)) {
|
|
112
|
+
dtype = optionsOrDatatype;
|
|
113
|
+
} else {
|
|
114
|
+
// `func.vectorField({
|
|
115
|
+
// datatype: new Float32(),
|
|
116
|
+
// dims: 10
|
|
117
|
+
// })`
|
|
118
|
+
dims = dims ?? optionsOrDatatype?.dims;
|
|
119
|
+
dtype = optionsOrDatatype?.datatype;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
if (dtype !== undefined) {
|
|
123
|
+
// `func.vectorField(new FixedSizeList(dims, new Field("item", new Float32(), true)))`
|
|
124
|
+
// or `func.vectorField({datatype: new FixedSizeList(dims, new Field("item", new Float32(), true))})`
|
|
125
|
+
if (isFixedSizeList(dtype)) {
|
|
126
|
+
vectorType = dtype;
|
|
127
|
+
// `func.vectorField(new Float32())`
|
|
128
|
+
// or `func.vectorField({datatype: new Float32()})`
|
|
129
|
+
} else if (isFloat(dtype)) {
|
|
130
|
+
// No `ndims` impl and no `{dims: n}` provided;
|
|
131
|
+
if (dims === undefined) {
|
|
132
|
+
throw new Error("ndims is required for vector field");
|
|
133
|
+
}
|
|
134
|
+
vectorType = newVectorType(dims, dtype);
|
|
135
|
+
} else {
|
|
136
|
+
throw new Error(
|
|
137
|
+
"Expected FixedSizeList or Float as datatype for vector field",
|
|
138
|
+
);
|
|
139
|
+
}
|
|
140
|
+
} else {
|
|
141
|
+
if (dims === undefined) {
|
|
142
|
+
throw new Error("ndims is required for vector field");
|
|
143
|
+
}
|
|
144
|
+
vectorType = new FixedSizeList(
|
|
145
|
+
dims,
|
|
146
|
+
new Field("item", new Float32(), true),
|
|
147
|
+
);
|
|
148
|
+
}
|
|
149
|
+
const metadata = new Map<string, EmbeddingFunction>();
|
|
150
|
+
metadata.set("vector_column_for", this);
|
|
151
|
+
|
|
152
|
+
return [vectorType, metadata];
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/** The number of dimensions of the embeddings */
|
|
156
|
+
ndims(): number | undefined {
|
|
157
|
+
return undefined;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/** The datatype of the embeddings */
|
|
161
|
+
abstract embeddingDataType(): Float;
|
|
58
162
|
|
|
59
163
|
/**
|
|
60
164
|
* Creates a vector representation for the given values.
|
|
61
165
|
*/
|
|
62
|
-
|
|
63
|
-
|
|
166
|
+
abstract computeSourceEmbeddings(
|
|
167
|
+
data: T[],
|
|
168
|
+
): Promise<number[][] | Float32Array[] | Float64Array[]>;
|
|
64
169
|
|
|
65
|
-
/**
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
):
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
if (!("sourceColumn" in value) || !("embed" in value)) {
|
|
73
|
-
return false;
|
|
170
|
+
/**
|
|
171
|
+
Compute the embeddings for a single query
|
|
172
|
+
*/
|
|
173
|
+
async computeQueryEmbeddings(data: T): Promise<IntoVector> {
|
|
174
|
+
return this.computeSourceEmbeddings([data]).then(
|
|
175
|
+
(embeddings) => embeddings[0],
|
|
176
|
+
);
|
|
74
177
|
}
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
export interface FieldOptions<T extends DataType = DataType> {
|
|
181
|
+
datatype: T;
|
|
182
|
+
dims?: number;
|
|
78
183
|
}
|
|
@@ -1,2 +1,113 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
// Copyright 2023 Lance Developers.
|
|
2
|
+
//
|
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
// you may not use this file except in compliance with the License.
|
|
5
|
+
// You may obtain a copy of the License at
|
|
6
|
+
//
|
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
//
|
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
// See the License for the specific language governing permissions and
|
|
13
|
+
// limitations under the License.
|
|
14
|
+
|
|
15
|
+
import { DataType, Field, Schema } from "../arrow";
|
|
16
|
+
import { isDataType } from "../arrow";
|
|
17
|
+
import { sanitizeType } from "../sanitize";
|
|
18
|
+
import { EmbeddingFunction } from "./embedding_function";
|
|
19
|
+
import { EmbeddingFunctionConfig, getRegistry } from "./registry";
|
|
20
|
+
|
|
21
|
+
export { EmbeddingFunction } from "./embedding_function";
|
|
22
|
+
|
|
23
|
+
// We need to explicitly export '*' so that the `register` decorator actually registers the class.
|
|
24
|
+
export * from "./openai";
|
|
25
|
+
export * from "./registry";
|
|
26
|
+
|
|
27
|
+
/**
 * Build an Arrow schema whose columns may be bound to embedding functions.
 *
 * Each entry of `fields` is either a bare data type, or the
 * `[datatype, metadata]` pair produced by `EmbeddingFunction.sourceField` /
 * `EmbeddingFunction.vectorField`. Every column is created as nullable.
 * The embedding functions collected from the pairs are serialized through the
 * global registry and attached to the schema as metadata.
 *
 * @param fields - Column name to data type (or `[datatype, metadata]` pair)
 * @returns The schema, carrying the serialized embedding function metadata
 * @example
 * ```ts
 * class MyEmbeddingFunction extends EmbeddingFunction {
 *   // ...
 * }
 * const func = new MyEmbeddingFunction();
 * const schema = LanceSchema({
 *   id: new Int32(),
 *   text: func.sourceField(new Utf8()),
 *   vector: func.vectorField(),
 *   // optional: specify the datatype and/or dimensions
 *   vector2: func.vectorField({ datatype: new Float32(), dims: 3}),
 * });
 *
 * const table = await db.createTable("my_table", data, { schema });
 * ```
 */
export function LanceSchema(
  fields: Record<string, [object, Map<string, EmbeddingFunction>] | object>,
): Schema {
  const columns: Field[] = [];
  const configsByFunction = new Map<
    EmbeddingFunction,
    Partial<EmbeddingFunctionConfig>
  >();

  for (const [columnName, spec] of Object.entries(fields)) {
    // A bare data type: plain column, no embedding function involved.
    if (isDataType(spec)) {
      columns.push(new Field(columnName, sanitizeType(spec), true));
      continue;
    }

    // Otherwise the value is the [datatype, metadata] pair from
    // sourceField()/vectorField().
    const [dtype, fieldMetadata] = spec as [
      object,
      Map<string, EmbeddingFunction>,
    ];
    columns.push(new Field(columnName, sanitizeType(dtype), true));
    parseEmbeddingFunctions(configsByFunction, columnName, fieldMetadata);
  }

  const tableMetadata = getRegistry().getTableMetadata(
    Array.from(configsByFunction.values()) as EmbeddingFunctionConfig[],
  );
  return new Schema(columns, tableMetadata);
}
|
|
77
|
+
|
|
78
|
+
/**
 * Record which column `key` plays for the embedding function named in a
 * field's metadata.
 *
 * If `metadata` has a `"source_column_for"` entry, `key` is stored as that
 * function's `sourceColumn`; otherwise, if it has a `"vector_column_for"`
 * entry, `key` is stored as its `vectorColumn`. An existing entry for the
 * function is extended, a missing one is created. Metadata with neither key
 * leaves `embeddingFunctions` untouched.
 *
 * @param embeddingFunctions - Accumulator, mutated in place
 * @param key - The column name being processed
 * @param metadata - The field metadata produced by sourceField()/vectorField()
 */
function parseEmbeddingFunctions(
  embeddingFunctions: Map<EmbeddingFunction, Partial<EmbeddingFunctionConfig>>,
  key: string,
  metadata: Map<string, EmbeddingFunction>,
): void {
  const attach = (
    role: "sourceColumn" | "vectorColumn",
    embedFunction: EmbeddingFunction,
  ): void => {
    const column =
      role === "sourceColumn" ? { sourceColumn: key } : { vectorColumn: key };
    const existing = embeddingFunctions.get(embedFunction);
    embeddingFunctions.set(
      embedFunction,
      existing === undefined
        ? { ...column, function: embedFunction }
        : { ...existing, ...column },
    );
  };

  // The source role takes precedence when both keys are present.
  if (metadata.has("source_column_for")) {
    attach("sourceColumn", metadata.get("source_column_for")!);
    return;
  }
  if (metadata.has("vector_column_for")) {
    attach("vectorColumn", metadata.get("vector_column_for")!);
  }
}
|
|
@@ -13,17 +13,31 @@
|
|
|
13
13
|
// limitations under the License.
|
|
14
14
|
|
|
15
15
|
import type OpenAI from "openai";
|
|
16
|
-
import {
|
|
16
|
+
import { Float, Float32 } from "../arrow";
|
|
17
|
+
import { EmbeddingFunction } from "./embedding_function";
|
|
18
|
+
import { register } from "./registry";
|
|
17
19
|
|
|
18
|
-
export
|
|
19
|
-
|
|
20
|
-
|
|
20
|
+
export type OpenAIOptions = {
|
|
21
|
+
apiKey?: string;
|
|
22
|
+
model?: string;
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
@register("openai")
|
|
26
|
+
export class OpenAIEmbeddingFunction extends EmbeddingFunction<
|
|
27
|
+
string,
|
|
28
|
+
OpenAIOptions
|
|
29
|
+
> {
|
|
30
|
+
#openai: OpenAI;
|
|
31
|
+
#modelName: string;
|
|
32
|
+
|
|
33
|
+
constructor(options: OpenAIOptions = { model: "text-embedding-ada-002" }) {
|
|
34
|
+
super();
|
|
35
|
+
const openAIKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
|
|
36
|
+
if (!openAIKey) {
|
|
37
|
+
throw new Error("OpenAI API key is required");
|
|
38
|
+
}
|
|
39
|
+
const modelName = options?.model ?? "text-embedding-ada-002";
|
|
21
40
|
|
|
22
|
-
constructor(
|
|
23
|
-
sourceColumn: string,
|
|
24
|
-
openAIKey: string,
|
|
25
|
-
modelName: string = "text-embedding-ada-002",
|
|
26
|
-
) {
|
|
27
41
|
/**
|
|
28
42
|
* @type {import("openai").default}
|
|
29
43
|
*/
|
|
@@ -36,18 +50,40 @@ export class OpenAIEmbeddingFunction implements EmbeddingFunction<string> {
|
|
|
36
50
|
throw new Error("please install openai@^4.24.1 using npm install openai");
|
|
37
51
|
}
|
|
38
52
|
|
|
39
|
-
this.sourceColumn = sourceColumn;
|
|
40
53
|
const configuration = {
|
|
41
54
|
apiKey: openAIKey,
|
|
42
55
|
};
|
|
43
56
|
|
|
44
|
-
this
|
|
45
|
-
this
|
|
57
|
+
this.#openai = new Openai(configuration);
|
|
58
|
+
this.#modelName = modelName;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
toJSON() {
|
|
62
|
+
return {
|
|
63
|
+
model: this.#modelName,
|
|
64
|
+
};
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
ndims(): number {
|
|
68
|
+
switch (this.#modelName) {
|
|
69
|
+
case "text-embedding-ada-002":
|
|
70
|
+
return 1536;
|
|
71
|
+
case "text-embedding-3-large":
|
|
72
|
+
return 3072;
|
|
73
|
+
case "text-embedding-3-small":
|
|
74
|
+
return 1536;
|
|
75
|
+
default:
|
|
76
|
+
return null as never;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
embeddingDataType(): Float {
|
|
81
|
+
return new Float32();
|
|
46
82
|
}
|
|
47
83
|
|
|
48
|
-
async
|
|
49
|
-
const response = await this.
|
|
50
|
-
model: this
|
|
84
|
+
async computeSourceEmbeddings(data: string[]): Promise<number[][]> {
|
|
85
|
+
const response = await this.#openai.embeddings.create({
|
|
86
|
+
model: this.#modelName,
|
|
51
87
|
input: data,
|
|
52
88
|
});
|
|
53
89
|
|
|
@@ -58,5 +94,15 @@ export class OpenAIEmbeddingFunction implements EmbeddingFunction<string> {
|
|
|
58
94
|
return embeddings;
|
|
59
95
|
}
|
|
60
96
|
|
|
61
|
-
|
|
97
|
+
async computeQueryEmbeddings(data: string): Promise<number[]> {
|
|
98
|
+
if (typeof data !== "string") {
|
|
99
|
+
throw new Error("Data must be a string");
|
|
100
|
+
}
|
|
101
|
+
const response = await this.#openai.embeddings.create({
|
|
102
|
+
model: this.#modelName,
|
|
103
|
+
input: data,
|
|
104
|
+
});
|
|
105
|
+
|
|
106
|
+
return response.data[0].embedding;
|
|
107
|
+
}
|
|
62
108
|
}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
// Copyright 2024 Lance Developers.
|
|
2
|
+
//
|
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
// you may not use this file except in compliance with the License.
|
|
5
|
+
// You may obtain a copy of the License at
|
|
6
|
+
//
|
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
//
|
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
// See the License for the specific language governing permissions and
|
|
13
|
+
// limitations under the License.
|
|
14
|
+
|
|
15
|
+
import type { EmbeddingFunction } from "./embedding_function";
|
|
16
|
+
import "reflect-metadata";
|
|
17
|
+
|
|
18
|
+
export interface EmbeddingFunctionOptions {
|
|
19
|
+
[key: string]: unknown;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
export interface EmbeddingFunctionFactory<
|
|
23
|
+
T extends EmbeddingFunction = EmbeddingFunction,
|
|
24
|
+
> {
|
|
25
|
+
new (modelOptions?: EmbeddingFunctionOptions): T;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
interface EmbeddingFunctionCreate<T extends EmbeddingFunction> {
|
|
29
|
+
create(options?: EmbeddingFunctionOptions): T;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/**
 * Registry of embedding function classes, looked up by alias.
 *
 * It is used to register embedding function constructors, fetch them by name,
 * and serialize/deserialize embedding function configurations to and from
 * schema metadata (stored as JSON under the `"embedding_functions"` key).
 * You can implement your own embedding function by subclassing
 * `EmbeddingFunction` and registering it with the registry.
 */
export class EmbeddingFunctionRegistry {
  #functions: Map<string, EmbeddingFunctionFactory> = new Map();

  /**
   * Create a class decorator that registers an embedding function class.
   *
   * The alias is resolved each time the decorator is applied, so one returned
   * decorator can be applied to several classes when no alias is given.
   *
   * @param alias The name to register under; when omitted or empty, the
   * class's own name is used
   * @returns A decorator that registers the class and returns it unchanged
   * @throws Error (when the decorator is applied) if the alias is already
   * registered
   */
  register<T extends EmbeddingFunctionFactory = EmbeddingFunctionFactory>(
    this: EmbeddingFunctionRegistry,
    alias?: string,
    // biome-ignore lint/suspicious/noExplicitAny: decorator result type is left open
  ): (ctor: T) => any {
    return (ctor: T) => {
      // Do not write back into `alias`: it is shared by every application of
      // this decorator, and overwriting it would leak the first class's name
      // into later registrations.
      const resolvedAlias = alias ? alias : ctor.name;
      if (this.#functions.has(resolvedAlias)) {
        throw new Error(
          `Embedding function with alias "${resolvedAlias}" already exists`,
        );
      }
      this.#functions.set(resolvedAlias, ctor);
      // Remember the alias on the constructor so functionToMetadata() can
      // recover it from an instance.
      Reflect.defineMetadata("lancedb::embedding::name", resolvedAlias, ctor);
      return ctor;
    };
  }

  /**
   * Fetch an embedding function by name
   * @param name The alias the function was registered under
   * @returns An object whose `create(options)` instantiates the registered
   * class, or `undefined` if nothing is registered under `name`
   */
  get<T extends EmbeddingFunction<unknown> = EmbeddingFunction>(
    name: string,
  ): EmbeddingFunctionCreate<T> | undefined {
    const factory = this.#functions.get(name);
    if (!factory) {
      return undefined;
    }
    return {
      create: (options?: EmbeddingFunctionOptions) =>
        new factory(options) as unknown as T,
    };
  }

  /**
   * Remove every registered embedding function from the registry
   */
  reset(this: EmbeddingFunctionRegistry) {
    this.#functions.clear();
  }

  /**
   * Rebuild embedding function configurations from schema metadata.
   *
   * The result is keyed by function name, so if the metadata lists the same
   * name more than once only the last entry is kept.
   *
   * @param metadata Schema metadata; the `"embedding_functions"` entry is
   * expected to hold the JSON array written by `getTableMetadata`
   * @returns The configurations, or an empty map when the entry is absent
   * @throws Error if the entry is not a JSON array, or names a function that
   * is not registered
   * @ignore
   */
  parseFunctions(
    this: EmbeddingFunctionRegistry,
    metadata: Map<string, string>,
  ): Map<string, EmbeddingFunctionConfig> {
    const serialized = metadata.get("embedding_functions");
    if (serialized === undefined) {
      return new Map();
    }

    type FunctionConfig = {
      name: string;
      sourceColumn: string;
      vectorColumn: string;
      model: EmbeddingFunctionOptions;
    };
    const parsed: unknown = JSON.parse(serialized);
    if (!Array.isArray(parsed)) {
      throw new Error('"embedding_functions" metadata must be a JSON array');
    }
    const functions = parsed as FunctionConfig[];

    return new Map(
      functions.map((f): [string, EmbeddingFunctionConfig] => {
        const fn = this.get(f.name);
        if (!fn) {
          throw new Error(`Function "${f.name}" not found in registry`);
        }
        return [
          f.name,
          {
            sourceColumn: f.sourceColumn,
            vectorColumn: f.vectorColumn,
            function: fn.create(f.model),
          },
        ];
      }),
    );
  }

  /**
   * Convert one embedding function configuration to its serializable form.
   *
   * @param conf The configuration to serialize
   * @returns An object with `sourceColumn`, `vectorColumn` (defaulting to
   * `"vector"`), `name` (the registered alias, falling back to the class name)
   * and `model` (the function's `toJSON()` result)
   */
  // biome-ignore lint/suspicious/noExplicitAny: the shape of `model` is defined by each embedding function
  functionToMetadata(conf: EmbeddingFunctionConfig): Record<string, any> {
    const name = Reflect.getMetadata(
      "lancedb::embedding::name",
      conf.function.constructor,
    );
    return {
      sourceColumn: conf.sourceColumn,
      vectorColumn: conf.vectorColumn ?? "vector",
      name: name ?? conf.function.constructor.name,
      model: conf.function.toJSON(),
    };
  }

  /**
   * Serialize embedding function configurations into schema metadata.
   *
   * @param functions The configurations to serialize
   * @returns A map with a single `"embedding_functions"` entry holding the
   * JSON-encoded array of serialized configurations
   */
  getTableMetadata(functions: EmbeddingFunctionConfig[]): Map<string, string> {
    const metadata = new Map<string, string>();
    const jsonData = functions.map((conf) => this.functionToMetadata(conf));
    metadata.set("embedding_functions", JSON.stringify(jsonData));

    return metadata;
  }
}
|
|
153
|
+
|
|
154
|
+
const _REGISTRY = new EmbeddingFunctionRegistry();
|
|
155
|
+
|
|
156
|
+
export function register(name?: string) {
|
|
157
|
+
return _REGISTRY.register(name);
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/**
|
|
161
|
+
* Utility function to get the global instance of the registry
|
|
162
|
+
* @returns `EmbeddingFunctionRegistry` The global instance of the registry
|
|
163
|
+
* @example
|
|
164
|
+
* ```ts
|
|
165
|
+
* const registry = getRegistry();
|
|
166
|
+
* const openai = registry.get("openai")?.create();
* ```
|
|
167
|
+
*/
|
|
168
|
+
export function getRegistry(): EmbeddingFunctionRegistry {
|
|
169
|
+
return _REGISTRY;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
export interface EmbeddingFunctionConfig {
|
|
173
|
+
sourceColumn: string;
|
|
174
|
+
vectorColumn?: string;
|
|
175
|
+
function: EmbeddingFunction;
|
|
176
|
+
}
|