@dengxifeng/lancedb 0.26.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +13 -0
- package/CONTRIBUTING.md +76 -0
- package/README.md +37 -0
- package/dist/arrow.d.ts +279 -0
- package/dist/arrow.js +1316 -0
- package/dist/connection.d.ts +259 -0
- package/dist/connection.js +224 -0
- package/dist/embedding/embedding_function.d.ts +103 -0
- package/dist/embedding/embedding_function.js +192 -0
- package/dist/embedding/index.d.ts +27 -0
- package/dist/embedding/index.js +101 -0
- package/dist/embedding/openai.d.ts +16 -0
- package/dist/embedding/openai.js +93 -0
- package/dist/embedding/registry.d.ts +74 -0
- package/dist/embedding/registry.js +165 -0
- package/dist/embedding/transformers.d.ts +36 -0
- package/dist/embedding/transformers.js +122 -0
- package/dist/header.d.ts +162 -0
- package/dist/header.js +217 -0
- package/dist/index.d.ts +85 -0
- package/dist/index.js +106 -0
- package/dist/indices.d.ts +692 -0
- package/dist/indices.js +156 -0
- package/dist/merge.d.ts +80 -0
- package/dist/merge.js +92 -0
- package/dist/native.d.ts +585 -0
- package/dist/native.js +339 -0
- package/dist/permutation.d.ts +143 -0
- package/dist/permutation.js +184 -0
- package/dist/query.d.ts +581 -0
- package/dist/query.js +853 -0
- package/dist/rerankers/index.d.ts +5 -0
- package/dist/rerankers/index.js +19 -0
- package/dist/rerankers/rrf.d.ts +14 -0
- package/dist/rerankers/rrf.js +28 -0
- package/dist/sanitize.d.ts +32 -0
- package/dist/sanitize.js +473 -0
- package/dist/table.d.ts +581 -0
- package/dist/table.js +321 -0
- package/dist/util.d.ts +14 -0
- package/dist/util.js +77 -0
- package/license_header.txt +2 -0
- package/package.json +122 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
4
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
+
exports.TextEmbeddingFunction = exports.EmbeddingFunction = void 0;
|
|
6
|
+
require("reflect-metadata");
|
|
7
|
+
const arrow_1 = require("../arrow");
|
|
8
|
+
const sanitize_1 = require("../sanitize");
|
|
9
|
+
const registry_1 = require("./registry");
|
|
10
|
+
/**
 * An embedding function that automatically creates vector representation for a given column.
 *
 * The base constructor takes no arguments. Subclasses should call `super()` and
 * then hand their **original** (unresolved) options to `resolveVariables`, which
 * records them for `toJSON()` and returns a copy with every `$var:` reference
 * substituted. Only the returned copy should be used afterwards.
 *
 * @example
 * ```ts
 * class MyEmbeddingFunction extends EmbeddingFunction {
 *   constructor(optionsRaw: {model: string, timeout: number}) {
 *     super();
 *     const options = this.resolveVariables(optionsRaw);
 *     this.model = options.model;
 *     this.timeout = options.timeout;
 *   }
 * }
 * ```
 */
class EmbeddingFunction {
    /**
     * @ignore
     * This is only used for associating the options type with the class for type checking
     */
    // biome-ignore lint/style/useNamingConvention: we want to keep the name as it is
    TOptions;
    /** Raw options as last passed to `resolveVariables` (`{}` until then). */
    #config;
    /**
     * Get the original arguments to the constructor, to serialize them so they
     * can be used to recreate the embedding function later.
     *
     * @returns A JSON round-tripped copy of the raw (unresolved) options.
     */
    // biome-ignore lint/suspicious/noExplicitAny :
    toJSON() {
        return JSON.parse(JSON.stringify(this.#config));
    }
    constructor() {
        this.#config = {};
    }
    /**
     * Provide a list of keys in the function options that should be treated as
     * sensitive. If users pass raw values for these keys, they will be rejected.
     *
     * @returns The sensitive option names; the base implementation has none.
     */
    getSensitiveKeys() {
        return [];
    }
    /**
     * Apply variables to the config.
     *
     * String values of the form `$var:name` or `$var:name:default` are replaced
     * by the registry variable `name`, falling back to `default` when the
     * variable is unset. Everything after the second colon belongs to the
     * default, so defaults may themselves contain colons.
     *
     * @param config - The raw options; stored as-is for `toJSON()`.
     * @returns A shallow copy of `config` with all variable references resolved.
     * @throws Error if a sensitive key carries a value that is not a `$var:`
     *   reference, or if a referenced variable is unset and has no default.
     */
    resolveVariables(config) {
        const prefix = "$var:";
        // Keep the raw options (placeholders, not resolved values) so that
        // serialization never contains a resolved secret.
        this.#config = config ?? {};
        const registry = (0, registry_1.getRegistry)();
        const sensitiveKeys = this.getSensitiveKeys();
        const newConfig = { ...config };
        for (const [key, value] of Object.entries(newConfig)) {
            const isReference = typeof value === "string" && value.startsWith(prefix);
            // An absent (null/undefined) sensitive value is allowed; any other
            // value must be a variable reference, whatever its type.
            if (sensitiveKeys.includes(key) && value != null && !isReference) {
                throw new Error(`The key "${key}" is sensitive and cannot be set directly. Please use the $var: syntax to set it.`);
            }
            if (!isReference) {
                continue;
            }
            // Split on the FIRST colon only: `split(":", 2)` would silently drop
            // everything after a second colon in the default value.
            const reference = value.slice(prefix.length);
            const separator = reference.indexOf(":");
            const name = separator === -1 ? reference : reference.slice(0, separator);
            const defaultValue = separator === -1 ? undefined : reference.slice(separator + 1);
            const variableValue = registry.getVar(name);
            if (variableValue) {
                newConfig[key] = variableValue;
            }
            else if (defaultValue) {
                newConfig[key] = defaultValue;
            }
            else {
                throw new Error(`Variable "${name}" not found`);
            }
        }
        return newConfig;
    }
    /**
     * sourceField is used in combination with `LanceSchema` to provide a declarative data model
     *
     * @param optionsOrDatatype - The options for the field or the datatype
     * @returns A `[datatype, metadata]` pair whose metadata maps
     *   `"source_column_for"` to this embedding function.
     * @throws Error if no datatype is supplied.
     *
     * @see {@link LanceSchema}
     */
    sourceField(optionsOrDatatype) {
        // Guard first: the `in` operator below throws a TypeError on null/undefined.
        if (optionsOrDatatype == null) {
            throw new Error("Datatype is required");
        }
        let datatype = "datatype" in optionsOrDatatype
            ? optionsOrDatatype.datatype
            : optionsOrDatatype;
        if (!datatype) {
            throw new Error("Datatype is required");
        }
        datatype = (0, sanitize_1.sanitizeType)(datatype);
        const metadata = new Map();
        metadata.set("source_column_for", this);
        return [datatype, metadata];
    }
    /**
     * vectorField is used in combination with `LanceSchema` to provide a declarative data model
     *
     * @param optionsOrDatatype - Nothing, a datatype, or `{datatype, dims}` options
     * @returns A `[vectorType, metadata]` pair whose metadata maps
     *   `"vector_column_for"` to this embedding function.
     * @throws Error if the dimension count cannot be determined, or the
     *   datatype is neither a FixedSizeList nor a float type.
     *
     * @see {@link LanceSchema}
     */
    vectorField(optionsOrDatatype) {
        let dtype;
        let vectorType;
        let dims = this.ndims();
        if (optionsOrDatatype === undefined) {
            // `func.vectorField()`: default to Float32 elements.
            dtype = new arrow_1.Float32();
        }
        else if (!("datatype" in optionsOrDatatype)) {
            // `func.vectorField(new Float32())`: a bare datatype.
            dtype = (0, sanitize_1.sanitizeType)(optionsOrDatatype);
        }
        else {
            // `func.vectorField({
            //  datatype: new Float32(),
            //  dims: 10
            // })`
            // A value from `ndims()` takes precedence over `options.dims`.
            dims = dims ?? optionsOrDatatype?.dims;
            dtype = (0, sanitize_1.sanitizeType)(optionsOrDatatype?.datatype);
        }
        if (dtype !== undefined) {
            if ((0, arrow_1.isFixedSizeList)(dtype)) {
                // `func.vectorField(new FixedSizeList(dims, new Field("item", new Float32(), true)))`
                // or `func.vectorField({datatype: new FixedSizeList(dims, new Field("item", new Float32(), true))})`
                vectorType = dtype;
            }
            else if ((0, arrow_1.isFloat)(dtype)) {
                // `func.vectorField(new Float32())`
                // or `func.vectorField({datatype: new Float32()})`
                // No `ndims` impl and no `{dims: n}` provided;
                if (dims === undefined) {
                    throw new Error("ndims is required for vector field");
                }
                vectorType = (0, arrow_1.newVectorType)(dims, dtype);
            }
            else {
                throw new Error("Expected FixedSizeList or Float as datatype for vector field");
            }
        }
        else {
            if (dims === undefined) {
                throw new Error("ndims is required for vector field");
            }
            vectorType = new arrow_1.FixedSizeList(dims, new arrow_1.Field("item", new arrow_1.Float32(), true));
        }
        const metadata = new Map();
        metadata.set("vector_column_for", this);
        return [vectorType, metadata];
    }
    /**
     * The number of dimensions of the embeddings
     *
     * @returns `undefined` in the base class; subclasses may override.
     */
    ndims() {
        return undefined;
    }
    /**
     * Compute the embeddings for a single query
     *
     * Delegates to `computeSourceEmbeddings` (not defined on this base class)
     * with a one-element batch and returns the first result.
     */
    async computeQueryEmbeddings(data) {
        const embeddings = await this.computeSourceEmbeddings([data]);
        return embeddings[0];
    }
}
|
|
174
|
+
exports.EmbeddingFunction = EmbeddingFunction;
|
|
175
|
+
/**
 * An abstract class for implementing embedding functions that take text as input.
 *
 * Every embedding computation is routed through `generateEmbeddings`, which
 * this class calls but does not define.
 */
class TextEmbeddingFunction extends EmbeddingFunction {
    /** Embed a single query string by sending a one-element batch. */
    async computeQueryEmbeddings(data) {
        const pending = this.generateEmbeddings([data]);
        return pending.then((embeddings) => embeddings[0]);
    }
    /** Element type of the produced vectors: always a new `Float32`. */
    embeddingDataType() {
        return new arrow_1.Float32();
    }
    /** Declare the source column; the datatype is always a new `Utf8`. */
    sourceField() {
        const textType = new arrow_1.Utf8();
        return super.sourceField(textType);
    }
    /** Embed a batch of inputs by forwarding it to `generateEmbeddings`. */
    computeSourceEmbeddings(data) {
        return this.generateEmbeddings(data);
    }
}
|
|
192
|
+
exports.TextEmbeddingFunction = TextEmbeddingFunction;
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { Schema } from "../arrow";
|
|
2
|
+
import { EmbeddingFunction } from "./embedding_function";
|
|
3
|
+
export { FieldOptions, EmbeddingFunction, TextEmbeddingFunction, FunctionOptions, EmbeddingFunctionConstructor, } from "./embedding_function";
|
|
4
|
+
export * from "./registry";
|
|
5
|
+
/**
|
|
6
|
+
* Create a schema with embedding functions.
|
|
7
|
+
*
|
|
8
|
+
* @param fields
|
|
9
|
+
* @returns Schema
|
|
10
|
+
* @example
|
|
11
|
+
* ```ts
|
|
12
|
+
* class MyEmbeddingFunction extends EmbeddingFunction {
|
|
13
|
+
* // ...
|
|
14
|
+
* }
|
|
15
|
+
* const func = new MyEmbeddingFunction();
|
|
16
|
+
* const schema = LanceSchema({
|
|
17
|
+
* id: new Int32(),
|
|
18
|
+
* text: func.sourceField(new Utf8()),
|
|
19
|
+
* vector: func.vectorField(),
|
|
20
|
+
* // optional: specify the datatype and/or dimensions
|
|
21
|
+
* vector2: func.vectorField({ datatype: new Float32(), dims: 3}),
|
|
22
|
+
* });
|
|
23
|
+
*
|
|
24
|
+
* const table = await db.createTable("my_table", data, { schema });
|
|
25
|
+
* ```
|
|
26
|
+
*/
|
|
27
|
+
export declare function LanceSchema(fields: Record<string, [object, Map<string, EmbeddingFunction>] | object>): Schema;
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
4
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
5
|
+
if (k2 === undefined) k2 = k;
|
|
6
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
7
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
8
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
9
|
+
}
|
|
10
|
+
Object.defineProperty(o, k2, desc);
|
|
11
|
+
}) : (function(o, m, k, k2) {
|
|
12
|
+
if (k2 === undefined) k2 = k;
|
|
13
|
+
o[k2] = m[k];
|
|
14
|
+
}));
|
|
15
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
16
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
17
|
+
};
|
|
18
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
19
|
+
exports.TextEmbeddingFunction = exports.EmbeddingFunction = void 0;
|
|
20
|
+
exports.LanceSchema = LanceSchema;
|
|
21
|
+
const arrow_1 = require("../arrow");
|
|
22
|
+
const sanitize_1 = require("../sanitize");
|
|
23
|
+
const registry_1 = require("./registry");
|
|
24
|
+
var embedding_function_1 = require("./embedding_function");
|
|
25
|
+
Object.defineProperty(exports, "EmbeddingFunction", { enumerable: true, get: function () { return embedding_function_1.EmbeddingFunction; } });
|
|
26
|
+
Object.defineProperty(exports, "TextEmbeddingFunction", { enumerable: true, get: function () { return embedding_function_1.TextEmbeddingFunction; } });
|
|
27
|
+
__exportStar(require("./registry"), exports);
|
|
28
|
+
/**
 * Create a schema with embedding functions.
 *
 * Each entry of `fields` is either a datatype, or a `[datatype, metadata]`
 * pair as returned by `sourceField` / `vectorField`. Every field is created
 * as nullable, and the embedding functions found in the metadata are
 * serialized into the schema metadata through the global registry.
 *
 * @param fields
 * @returns Schema
 * @example
 * ```ts
 * class MyEmbeddingFunction extends EmbeddingFunction {
 *   // ...
 * }
 * const func = new MyEmbeddingFunction();
 * const schema = LanceSchema({
 *   id: new Int32(),
 *   text: func.sourceField(new Utf8()),
 *   vector: func.vectorField(),
 *   // optional: specify the datatype and/or dimensions
 *   vector2: func.vectorField({ datatype: new Float32(), dims: 3}),
 * });
 *
 * const table = await db.createTable("my_table", data, { schema });
 * ```
 */
function LanceSchema(fields) {
    const arrowFields = [];
    const embeddingFunctions = new Map();
    for (const [name, spec] of Object.entries(fields)) {
        const isAnnotated = Array.isArray(spec);
        const dtype = isAnnotated ? spec[0] : spec;
        arrowFields.push(new arrow_1.Field(name, (0, sanitize_1.sanitizeType)(dtype), true));
        if (isAnnotated) {
            // The second tuple element carries the embedding-function role.
            parseEmbeddingFunctions(embeddingFunctions, name, spec[1]);
        }
    }
    const configs = Array.from(embeddingFunctions.values());
    const schemaMetadata = (0, registry_1.getRegistry)().getTableMetadata(configs);
    return new arrow_1.Schema(arrowFields, schemaMetadata);
}
|
|
68
|
+
/**
 * Record the role a column plays for an embedding function.
 *
 * `metadata` is inspected for `"source_column_for"` first and
 * `"vector_column_for"` second; only the first one present is applied. The
 * entry for that embedding function in `embeddingFunctions` is created, or
 * copied and extended, with `sourceColumn` / `vectorColumn` set to `key`.
 * Metadata carrying neither key leaves the map untouched.
 *
 * @param embeddingFunctions - Map from embedding function to its config; updated in place.
 * @param key - The column name being processed.
 * @param metadata - Field metadata produced by `sourceField` / `vectorField`.
 */
function parseEmbeddingFunctions(embeddingFunctions, key, metadata) {
    let role;
    let columnProperty;
    if (metadata.has("source_column_for")) {
        role = "source_column_for";
        columnProperty = "sourceColumn";
    }
    else if (metadata.has("vector_column_for")) {
        role = "vector_column_for";
        columnProperty = "vectorColumn";
    }
    else {
        return;
    }
    const embedFunction = metadata.get(role);
    const existing = embeddingFunctions.get(embedFunction);
    const entry = existing === undefined
        ? { [columnProperty]: key, function: embedFunction }
        : { ...existing, [columnProperty]: key };
    embeddingFunctions.set(embedFunction, entry);
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { EmbeddingCreateParams } from "openai/resources/index";
|
|
2
|
+
import { Float } from "../arrow";
|
|
3
|
+
import { EmbeddingFunction } from "./embedding_function";
|
|
4
|
+
export type OpenAIOptions = {
|
|
5
|
+
apiKey: string;
|
|
6
|
+
model: EmbeddingCreateParams["model"];
|
|
7
|
+
};
|
|
8
|
+
export declare class OpenAIEmbeddingFunction extends EmbeddingFunction<string, Partial<OpenAIOptions>> {
|
|
9
|
+
#private;
|
|
10
|
+
constructor(optionsRaw?: Partial<OpenAIOptions>);
|
|
11
|
+
protected getSensitiveKeys(): string[];
|
|
12
|
+
ndims(): number;
|
|
13
|
+
embeddingDataType(): Float;
|
|
14
|
+
computeSourceEmbeddings(data: string[]): Promise<number[][]>;
|
|
15
|
+
computeQueryEmbeddings(data: string): Promise<number[]>;
|
|
16
|
+
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
4
|
+
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
|
|
5
|
+
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
|
|
6
|
+
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
|
|
7
|
+
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
|
|
8
|
+
return c > 3 && r && Object.defineProperty(target, key, r), r;
|
|
9
|
+
};
|
|
10
|
+
var __metadata = (this && this.__metadata) || function (k, v) {
|
|
11
|
+
if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v);
|
|
12
|
+
};
|
|
13
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
14
|
+
exports.OpenAIEmbeddingFunction = void 0;
|
|
15
|
+
const arrow_1 = require("../arrow");
|
|
16
|
+
const embedding_function_1 = require("./embedding_function");
|
|
17
|
+
const registry_1 = require("./registry");
|
|
18
|
+
/**
 * Embedding function that calls the embeddings endpoint of the `openai`
 * client package.
 *
 * The `openai` package is loaded on demand in the constructor. `apiKey` is a
 * sensitive option: it may only be supplied as a `$var:` reference, and when
 * absent the `OPENAI_API_KEY` environment variable is used instead.
 */
let OpenAIEmbeddingFunction = class OpenAIEmbeddingFunction extends embedding_function_1.EmbeddingFunction {
    /** Client instance created from the `openai` package. */
    #openai;
    /** Model name sent with every embeddings request. */
    #modelName;
    /**
     * @param optionsRaw - Unresolved options; defaults to the
     *   `text-embedding-ada-002` model.
     * @throws Error if no API key is available or the `openai` package
     *   cannot be loaded.
     */
    constructor(optionsRaw = {
        model: "text-embedding-ada-002",
    }) {
        super();
        const options = this.resolveVariables(optionsRaw);
        const openAIKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
        if (!openAIKey) {
            throw new Error("OpenAI API key is required");
        }
        const modelName = options?.model ?? "text-embedding-ada-002";
        /**
         * @type {import("openai").default}
         */
        // eslint-disable-next-line @typescript-eslint/naming-convention
        let Openai;
        try {
            // eslint-disable-next-line @typescript-eslint/no-var-requires
            Openai = require("openai");
        }
        catch (cause) {
            // Keep the underlying failure attached: the package may be
            // installed but fail to load for some other reason.
            throw new Error("please install openai@^4.24.1 using npm install openai", { cause });
        }
        this.#openai = new Openai({ apiKey: openAIKey });
        this.#modelName = modelName;
    }
    /** `apiKey` must be passed as a `$var:` reference, never as a raw value. */
    getSensitiveKeys() {
        return ["apiKey"];
    }
    /**
     * Embedding width for the configured model.
     *
     * @throws Error for any model name not listed here.
     */
    ndims() {
        switch (this.#modelName) {
            case "text-embedding-ada-002":
            case "text-embedding-3-small":
                return 1536;
            case "text-embedding-3-large":
                return 3072;
            default:
                throw new Error(`Unknown model: ${this.#modelName}`);
        }
    }
    /** Element type of the produced vectors: always a new `Float32`. */
    embeddingDataType() {
        return new arrow_1.Float32();
    }
    /**
     * Embed a batch of inputs with a single API request.
     *
     * @returns The `embedding` of each response item, in response order.
     */
    async computeSourceEmbeddings(data) {
        const response = await this.#openai.embeddings.create({
            model: this.#modelName,
            input: data,
        });
        return response.data.map((item) => item.embedding);
    }
    /**
     * Embed a single query string.
     *
     * @throws Error if `data` is not a string.
     */
    async computeQueryEmbeddings(data) {
        if (typeof data !== "string") {
            throw new Error("Data must be a string");
        }
        const response = await this.#openai.embeddings.create({
            model: this.#modelName,
            input: data,
        });
        return response.data[0].embedding;
    }
};
|
|
89
|
+
exports.OpenAIEmbeddingFunction = OpenAIEmbeddingFunction;
|
|
90
|
+
exports.OpenAIEmbeddingFunction = OpenAIEmbeddingFunction = __decorate([
|
|
91
|
+
(0, registry_1.register)("openai"),
|
|
92
|
+
__metadata("design:paramtypes", [Object])
|
|
93
|
+
], OpenAIEmbeddingFunction);
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { type EmbeddingFunction, type EmbeddingFunctionConstructor } from "./embedding_function";
|
|
2
|
+
import "reflect-metadata";
|
|
3
|
+
export type CreateReturnType<T> = T extends {
|
|
4
|
+
init: () => Promise<void>;
|
|
5
|
+
} ? Promise<T> : T;
|
|
6
|
+
export interface EmbeddingFunctionCreate<T extends EmbeddingFunction> {
|
|
7
|
+
create(options?: T["TOptions"]): CreateReturnType<T>;
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* This is a singleton class used to register embedding functions
|
|
11
|
+
* and fetch them by name. It also handles serializing and deserializing.
|
|
12
|
+
* You can implement your own embedding function by subclassing EmbeddingFunction
|
|
13
|
+
* or TextEmbeddingFunction and registering it with the registry
|
|
14
|
+
*/
|
|
15
|
+
export declare class EmbeddingFunctionRegistry {
|
|
16
|
+
#private;
|
|
17
|
+
/**
|
|
18
|
+
* Get the number of registered functions
|
|
19
|
+
*/
|
|
20
|
+
length(): number;
|
|
21
|
+
/**
|
|
22
|
+
* Register an embedding function
|
|
23
|
+
* @throws Error if the function is already registered
|
|
24
|
+
*/
|
|
25
|
+
register<T extends EmbeddingFunctionConstructor = EmbeddingFunctionConstructor>(this: EmbeddingFunctionRegistry, alias?: string): (ctor: T) => any;
|
|
26
|
+
get<T extends EmbeddingFunction<unknown>>(name: string): EmbeddingFunctionCreate<T> | undefined;
|
|
27
|
+
/**
|
|
28
|
+
* reset the registry to the initial state
|
|
29
|
+
*/
|
|
30
|
+
reset(this: EmbeddingFunctionRegistry): void;
|
|
31
|
+
/**
|
|
32
|
+
* @ignore
|
|
33
|
+
*/
|
|
34
|
+
parseFunctions(this: EmbeddingFunctionRegistry, metadata: Map<string, string>): Promise<Map<string, EmbeddingFunctionConfig>>;
|
|
35
|
+
functionToMetadata(conf: EmbeddingFunctionConfig): Record<string, any>;
|
|
36
|
+
getTableMetadata(functions: EmbeddingFunctionConfig[]): Map<string, string>;
|
|
37
|
+
/**
|
|
38
|
+
* Set a variable. These can be accessed in the embedding function
|
|
39
|
+
* configuration using the syntax `$var:variable_name`. If they are not
|
|
40
|
+
* set, an error will be thrown letting you know which key is unset. If you
|
|
41
|
+
* want to supply a default value, you can add an additional part in the
|
|
42
|
+
* configuration like so: `$var:variable_name:default_value`. Default values
|
|
43
|
+
* can be used for runtime configurations that are not sensitive, such as
|
|
44
|
+
* whether to use a GPU for inference.
|
|
45
|
+
*
|
|
46
|
+
* The name must not contain colons. The default value can contain colons.
|
|
47
|
+
*
|
|
48
|
+
* @param name
|
|
49
|
+
* @param value
|
|
50
|
+
*/
|
|
51
|
+
setVar(name: string, value: string): void;
|
|
52
|
+
/**
|
|
53
|
+
* Get a variable.
|
|
54
|
+
* @param name
|
|
55
|
+
* @returns
|
|
56
|
+
* @see {@link setVar}
|
|
57
|
+
*/
|
|
58
|
+
getVar(name: string): string | undefined;
|
|
59
|
+
}
|
|
60
|
+
export declare function register(name?: string): (ctor: EmbeddingFunctionConstructor<EmbeddingFunction<any, import("./embedding_function").FunctionOptions>>) => any;
|
|
61
|
+
/**
|
|
62
|
+
* Utility function to get the global instance of the registry
|
|
63
|
+
* @returns `EmbeddingFunctionRegistry` The global instance of the registry
|
|
64
|
+
* @example
|
|
65
|
+
* ```ts
|
|
66
|
+
* const registry = getRegistry();
|
|
67
|
+
* const openai = registry.get("openai").create();
* ```
|
|
68
|
+
*/
|
|
69
|
+
export declare function getRegistry(): EmbeddingFunctionRegistry;
|
|
70
|
+
export interface EmbeddingFunctionConfig {
|
|
71
|
+
sourceColumn: string;
|
|
72
|
+
vectorColumn?: string;
|
|
73
|
+
function: EmbeddingFunction;
|
|
74
|
+
}
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
4
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
+
exports.EmbeddingFunctionRegistry = void 0;
|
|
6
|
+
exports.register = register;
|
|
7
|
+
exports.getRegistry = getRegistry;
|
|
8
|
+
require("reflect-metadata");
|
|
9
|
+
/**
 * This is a singleton class used to register embedding functions
 * and fetch them by name. It also handles serializing and deserializing.
 * You can implement your own embedding function by subclassing EmbeddingFunction
 * or TextEmbeddingFunction and registering it with the registry
 */
class EmbeddingFunctionRegistry {
    /** Registered constructors, keyed by alias. */
    #functions = new Map();
    /** Variables available to `$var:` references, keyed by name. */
    #variables = new Map();
    /**
     * Get the number of registered functions
     */
    length() {
        return this.#functions.size;
    }
    /**
     * Register an embedding function
     *
     * @param alias - Name to register under; when omitted or empty, the
     *   constructor's own `name` is used.
     * @returns A class decorator that registers the constructor, tags it with
     *   its registry name via `Reflect.defineMetadata`, and returns it.
     * @throws Error if the function is already registered
     */
    register(alias) {
        return (ctor) => {
            // Resolve the name per decorated class. Writing the fallback back
            // into `alias` would make a reused decorator register every later
            // class under the first class's name.
            const name = alias ? alias : ctor.name;
            if (this.#functions.has(name)) {
                throw new Error(`Embedding function with alias "${name}" already exists`);
            }
            this.#functions.set(name, ctor);
            Reflect.defineMetadata("lancedb::embedding::name", name, ctor);
            return ctor;
        };
    }
    /**
     * Fetch an embedding function by name
     *
     * @param name The name of the function
     * @returns `undefined` if nothing is registered under `name`; otherwise an
     *   object whose `create(options)` constructs an instance. When the
     *   class prototype has an `init` method, `create` is async and awaits
     *   `init()` before resolving.
     */
    get(name) {
        const factory = this.#functions.get(name);
        if (!factory) {
            return undefined;
        }
        if (factory.prototype.init) {
            return {
                create: async (options) => {
                    const instance = new factory(options);
                    await instance.init();
                    return instance;
                },
            };
        }
        return {
            create: (options) => new factory(options),
        };
    }
    /**
     * Remove every registered function. Variables set through `setVar` are
     * left in place.
     */
    reset() {
        this.#functions.clear();
    }
    /**
     * @ignore
     *
     * Rebuild embedding function configs from the `"embedding_functions"`
     * metadata entry (a JSON array). Returns an empty map when the entry is
     * absent; otherwise a map keyed by function name.
     *
     * @throws Error if a listed function name is not registered.
     */
    async parseFunctions(metadata) {
        if (!metadata.has("embedding_functions")) {
            return new Map();
        }
        const functions = JSON.parse(metadata.get("embedding_functions"));
        const items = await Promise.all(functions.map(async (f) => {
            const fn = this.get(f.name);
            if (!fn) {
                throw new Error(`Function "${f.name}" not found in registry`);
            }
            const func = await fn.create(f.model);
            return [
                f.name,
                {
                    sourceColumn: f.sourceColumn,
                    vectorColumn: f.vectorColumn,
                    function: func,
                },
            ];
        }));
        return new Map(items);
    }
    /**
     * Serialize one embedding function config to a plain object.
     *
     * `vectorColumn` defaults to `"vector"`. `name` is the registered name, or
     * the constructor name when the class carries no registry metadata.
     * `model` is the function's `toJSON()` output.
     */
    // biome-ignore lint/suspicious/noExplicitAny: <explanation>
    functionToMetadata(conf) {
        const ctor = conf.function.constructor;
        const registeredName = Reflect.getMetadata("lancedb::embedding::name", ctor);
        return {
            sourceColumn: conf.sourceColumn,
            vectorColumn: conf.vectorColumn ?? "vector",
            name: registeredName ?? ctor.name,
            model: conf.function.toJSON(),
        };
    }
    /**
     * Build table metadata holding all configs as a JSON array under the
     * `"embedding_functions"` key.
     */
    getTableMetadata(functions) {
        const jsonData = functions.map((conf) => this.functionToMetadata(conf));
        return new Map([["embedding_functions", JSON.stringify(jsonData)]]);
    }
    /**
     * Set a variable. These can be accessed in the embedding function
     * configuration using the syntax `$var:variable_name`. If they are not
     * set, an error will be thrown letting you know which key is unset. If you
     * want to supply a default value, you can add an additional part in the
     * configuration like so: `$var:variable_name:default_value`. Default values
     * can be used for runtime configurations that are not sensitive, such as
     * whether to use a GPU for inference.
     *
     * The name must not contain colons. The default value can contain colons.
     *
     * @param name
     * @param value
     * @throws Error if `name` contains a colon.
     */
    setVar(name, value) {
        if (name.includes(":")) {
            throw new Error("Variable names cannot contain colons");
        }
        this.#variables.set(name, value);
    }
    /**
     * Get a variable.
     * @param name
     * @returns The stored value, or `undefined` if the variable was never set.
     * @see {@link setVar}
     */
    getVar(name) {
        return this.#variables.get(name);
    }
}
|
|
150
|
+
exports.EmbeddingFunctionRegistry = EmbeddingFunctionRegistry;
|
|
151
|
+
const _REGISTRY = new EmbeddingFunctionRegistry();
|
|
152
|
+
/**
 * Create a class decorator that registers an embedding function with the
 * module-level registry instance.
 *
 * @param name - Forwarded unchanged to the registry's `register` method.
 * @returns Whatever the registry's `register` method returns.
 */
function register(name) {
    const decorator = _REGISTRY.register(name);
    return decorator;
}
|
|
155
|
+
/**
 * Utility function to get the global instance of the registry
 *
 * Every call returns the same module-level object.
 *
 * @returns `EmbeddingFunctionRegistry` The global instance of the registry
 * @example
 * ```ts
 * const registry = getRegistry();
 * const openai = registry.get("openai").create();
 * ```
 */
function getRegistry() {
    return _REGISTRY;
}
|