@lancedb/lancedb 0.16.1-beta.3 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/embedding/embedding_function.d.ts +36 -23
- package/dist/embedding/embedding_function.js +70 -0
- package/dist/embedding/openai.d.ts +2 -4
- package/dist/embedding/openai.js +4 -5
- package/dist/embedding/registry.d.ts +22 -0
- package/dist/embedding/registry.js +31 -4
- package/dist/embedding/transformers.d.ts +1 -2
- package/dist/embedding/transformers.js +2 -17
- package/package.json +9 -9
|
@@ -11,39 +11,52 @@ export interface EmbeddingFunctionConstructor<T extends EmbeddingFunction = Embe
|
|
|
11
11
|
}
|
|
12
12
|
/**
|
|
13
13
|
* An embedding function that automatically creates vector representation for a given column.
|
|
14
|
+
*
|
|
15
|
+
* It's important subclasses pass the **original** options to the super constructor
|
|
16
|
+
* and then pass those options to `resolveVariables` to resolve any variables before
|
|
17
|
+
* using them.
|
|
18
|
+
*
|
|
19
|
+
* @example
|
|
20
|
+
* ```ts
|
|
21
|
+
* class MyEmbeddingFunction extends EmbeddingFunction {
|
|
22
|
+
* constructor(options: {model: string, timeout: number}) {
|
|
23
|
+
* super(optionsRaw);
|
|
24
|
+
* const options = this.resolveVariables(optionsRaw);
|
|
25
|
+
* this.model = options.model;
|
|
26
|
+
* this.timeout = options.timeout;
|
|
27
|
+
* }
|
|
28
|
+
* }
|
|
29
|
+
* ```
|
|
14
30
|
*/
|
|
15
31
|
export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptions = FunctionOptions> {
|
|
32
|
+
#private;
|
|
16
33
|
/**
|
|
17
34
|
* @ignore
|
|
18
35
|
* This is only used for associating the options type with the class for type checking
|
|
19
36
|
*/
|
|
20
37
|
readonly TOptions: M;
|
|
21
38
|
/**
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
*
|
|
39
|
+
* Get the original arguments to the constructor, to serialize them so they
|
|
40
|
+
* can be used to recreate the embedding function later.
|
|
41
|
+
*/
|
|
42
|
+
toJSON(): Record<string, any>;
|
|
43
|
+
constructor();
|
|
44
|
+
/**
|
|
45
|
+
* Provide a list of keys in the function options that should be treated as
|
|
46
|
+
* sensitive. If users pass raw values for these keys, they will be rejected.
|
|
47
|
+
*/
|
|
48
|
+
protected getSensitiveKeys(): string[];
|
|
49
|
+
/**
|
|
50
|
+
* Apply variables to the config.
|
|
51
|
+
*/
|
|
52
|
+
protected resolveVariables(config: Partial<M>): Partial<M>;
|
|
53
|
+
/**
|
|
54
|
+
* Optionally load any resources needed for the embedding function.
|
|
29
55
|
*
|
|
30
|
-
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
* constructor(options: {model: string, timeout: number}) {
|
|
34
|
-
* super();
|
|
35
|
-
* this.model = options.model;
|
|
36
|
-
* this.timeout = options.timeout;
|
|
37
|
-
* }
|
|
38
|
-
* toJSON() {
|
|
39
|
-
* return {
|
|
40
|
-
* model: this.model,
|
|
41
|
-
* timeout: this.timeout,
|
|
42
|
-
* };
|
|
43
|
-
* }
|
|
44
|
-
* ```
|
|
56
|
+
* This method is called after the embedding function has been initialized
|
|
57
|
+
* but before any embeddings are computed. It is useful for loading local models
|
|
58
|
+
* or other resources that are needed for the embedding function to work.
|
|
45
59
|
*/
|
|
46
|
-
abstract toJSON(): Partial<M>;
|
|
47
60
|
init?(): Promise<void>;
|
|
48
61
|
/**
|
|
49
62
|
* sourceField is used in combination with `LanceSchema` to provide a declarative data model
|
|
@@ -6,8 +6,25 @@ exports.TextEmbeddingFunction = exports.EmbeddingFunction = void 0;
|
|
|
6
6
|
require("reflect-metadata");
|
|
7
7
|
const arrow_1 = require("../arrow");
|
|
8
8
|
const sanitize_1 = require("../sanitize");
|
|
9
|
+
const registry_1 = require("./registry");
|
|
9
10
|
/**
|
|
10
11
|
* An embedding function that automatically creates vector representation for a given column.
|
|
12
|
+
*
|
|
13
|
+
* It's important subclasses pass the **original** options to the super constructor
|
|
14
|
+
* and then pass those options to `resolveVariables` to resolve any variables before
|
|
15
|
+
* using them.
|
|
16
|
+
*
|
|
17
|
+
* @example
|
|
18
|
+
* ```ts
|
|
19
|
+
* class MyEmbeddingFunction extends EmbeddingFunction {
|
|
20
|
+
* constructor(options: {model: string, timeout: number}) {
|
|
21
|
+
* super(optionsRaw);
|
|
22
|
+
* const options = this.resolveVariables(optionsRaw);
|
|
23
|
+
* this.model = options.model;
|
|
24
|
+
* this.timeout = options.timeout;
|
|
25
|
+
* }
|
|
26
|
+
* }
|
|
27
|
+
* ```
|
|
11
28
|
*/
|
|
12
29
|
class EmbeddingFunction {
|
|
13
30
|
/**
|
|
@@ -16,6 +33,59 @@ class EmbeddingFunction {
|
|
|
16
33
|
*/
|
|
17
34
|
// biome-ignore lint/style/useNamingConvention: we want to keep the name as it is
|
|
18
35
|
TOptions;
|
|
36
|
+
#config;
|
|
37
|
+
/**
|
|
38
|
+
* Get the original arguments to the constructor, to serialize them so they
|
|
39
|
+
* can be used to recreate the embedding function later.
|
|
40
|
+
*/
|
|
41
|
+
// biome-ignore lint/suspicious/noExplicitAny :
|
|
42
|
+
toJSON() {
|
|
43
|
+
return JSON.parse(JSON.stringify(this.#config));
|
|
44
|
+
}
|
|
45
|
+
constructor() {
|
|
46
|
+
this.#config = {};
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Provide a list of keys in the function options that should be treated as
|
|
50
|
+
* sensitive. If users pass raw values for these keys, they will be rejected.
|
|
51
|
+
*/
|
|
52
|
+
getSensitiveKeys() {
|
|
53
|
+
return [];
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* Apply variables to the config.
|
|
57
|
+
*/
|
|
58
|
+
resolveVariables(config) {
|
|
59
|
+
this.#config = config;
|
|
60
|
+
const registry = (0, registry_1.getRegistry)();
|
|
61
|
+
const newConfig = { ...config };
|
|
62
|
+
for (const [key_, value] of Object.entries(newConfig)) {
|
|
63
|
+
if (this.getSensitiveKeys().includes(key_) &&
|
|
64
|
+
!value.startsWith("$var:")) {
|
|
65
|
+
throw new Error(`The key "${key_}" is sensitive and cannot be set directly. Please use the $var: syntax to set it.`);
|
|
66
|
+
}
|
|
67
|
+
// Makes TS happy (https://stackoverflow.com/a/78391854)
|
|
68
|
+
const key = key_;
|
|
69
|
+
if (typeof value === "string" && value.startsWith("$var:")) {
|
|
70
|
+
const [name, defaultValue] = value.slice(5).split(":", 2);
|
|
71
|
+
const variableValue = registry.getVar(name);
|
|
72
|
+
if (!variableValue) {
|
|
73
|
+
if (defaultValue) {
|
|
74
|
+
// biome-ignore lint/suspicious/noExplicitAny:
|
|
75
|
+
newConfig[key] = defaultValue;
|
|
76
|
+
}
|
|
77
|
+
else {
|
|
78
|
+
throw new Error(`Variable "${name}" not found`);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
else {
|
|
82
|
+
// biome-ignore lint/suspicious/noExplicitAny:
|
|
83
|
+
newConfig[key] = variableValue;
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
return newConfig;
|
|
88
|
+
}
|
|
19
89
|
/**
|
|
20
90
|
* sourceField is used in combination with `LanceSchema` to provide a declarative data model
|
|
21
91
|
*
|
|
@@ -7,10 +7,8 @@ export type OpenAIOptions = {
|
|
|
7
7
|
};
|
|
8
8
|
export declare class OpenAIEmbeddingFunction extends EmbeddingFunction<string, Partial<OpenAIOptions>> {
|
|
9
9
|
#private;
|
|
10
|
-
constructor(
|
|
11
|
-
|
|
12
|
-
model: (string & {}) | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large";
|
|
13
|
-
};
|
|
10
|
+
constructor(optionsRaw?: Partial<OpenAIOptions>);
|
|
11
|
+
protected getSensitiveKeys(): string[];
|
|
14
12
|
ndims(): number;
|
|
15
13
|
embeddingDataType(): Float;
|
|
16
14
|
computeSourceEmbeddings(data: string[]): Promise<number[][]>;
|
package/dist/embedding/openai.js
CHANGED
|
@@ -18,10 +18,11 @@ const registry_1 = require("./registry");
|
|
|
18
18
|
let OpenAIEmbeddingFunction = class OpenAIEmbeddingFunction extends embedding_function_1.EmbeddingFunction {
|
|
19
19
|
#openai;
|
|
20
20
|
#modelName;
|
|
21
|
-
constructor(options = {
|
|
21
|
+
constructor(optionsRaw = {
|
|
22
22
|
model: "text-embedding-ada-002",
|
|
23
23
|
}) {
|
|
24
24
|
super();
|
|
25
|
+
const options = this.resolveVariables(optionsRaw);
|
|
25
26
|
const openAIKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
|
|
26
27
|
if (!openAIKey) {
|
|
27
28
|
throw new Error("OpenAI API key is required");
|
|
@@ -45,10 +46,8 @@ let OpenAIEmbeddingFunction = class OpenAIEmbeddingFunction extends embedding_fu
|
|
|
45
46
|
this.#openai = new Openai(configuration);
|
|
46
47
|
this.#modelName = modelName;
|
|
47
48
|
}
|
|
48
|
-
|
|
49
|
-
return {
|
|
50
|
-
model: this.#modelName,
|
|
51
|
-
};
|
|
49
|
+
getSensitiveKeys() {
|
|
50
|
+
return ["apiKey"];
|
|
52
51
|
}
|
|
53
52
|
ndims() {
|
|
54
53
|
switch (this.#modelName) {
|
|
@@ -34,6 +34,28 @@ export declare class EmbeddingFunctionRegistry {
|
|
|
34
34
|
parseFunctions(this: EmbeddingFunctionRegistry, metadata: Map<string, string>): Promise<Map<string, EmbeddingFunctionConfig>>;
|
|
35
35
|
functionToMetadata(conf: EmbeddingFunctionConfig): Record<string, any>;
|
|
36
36
|
getTableMetadata(functions: EmbeddingFunctionConfig[]): Map<string, string>;
|
|
37
|
+
/**
|
|
38
|
+
* Set a variable. These can be accessed in the embedding function
|
|
39
|
+
* configuration using the syntax `$var:variable_name`. If they are not
|
|
40
|
+
* set, an error will be thrown letting you know which key is unset. If you
|
|
41
|
+
* want to supply a default value, you can add an additional part in the
|
|
42
|
+
* configuration like so: `$var:variable_name:default_value`. Default values
|
|
43
|
+
* can be used for runtime configurations that are not sensitive, such as
|
|
44
|
+
* whether to use a GPU for inference.
|
|
45
|
+
*
|
|
46
|
+
* The name must not contain colons. The default value can contain colons.
|
|
47
|
+
*
|
|
48
|
+
* @param name
|
|
49
|
+
* @param value
|
|
50
|
+
*/
|
|
51
|
+
setVar(name: string, value: string): void;
|
|
52
|
+
/**
|
|
53
|
+
* Get a variable.
|
|
54
|
+
* @param name
|
|
55
|
+
* @returns
|
|
56
|
+
* @see {@link setVar}
|
|
57
|
+
*/
|
|
58
|
+
getVar(name: string): string | undefined;
|
|
37
59
|
}
|
|
38
60
|
export declare function register(name?: string): (ctor: EmbeddingFunctionConstructor<EmbeddingFunction<any, import("./embedding_function").FunctionOptions>>) => any;
|
|
39
61
|
/**
|
|
@@ -14,6 +14,7 @@ require("reflect-metadata");
|
|
|
14
14
|
*/
|
|
15
15
|
class EmbeddingFunctionRegistry {
|
|
16
16
|
#functions = new Map();
|
|
17
|
+
#variables = new Map();
|
|
17
18
|
/**
|
|
18
19
|
* Get the number of registered functions
|
|
19
20
|
*/
|
|
@@ -60,10 +61,7 @@ class EmbeddingFunctionRegistry {
|
|
|
60
61
|
}
|
|
61
62
|
else {
|
|
62
63
|
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
63
|
-
create = (options) => {
|
|
64
|
-
const instance = new factory(options);
|
|
65
|
-
return instance;
|
|
66
|
-
};
|
|
64
|
+
create = (options) => new factory(options);
|
|
67
65
|
}
|
|
68
66
|
return {
|
|
69
67
|
create,
|
|
@@ -119,6 +117,35 @@ class EmbeddingFunctionRegistry {
|
|
|
119
117
|
metadata.set("embedding_functions", JSON.stringify(jsonData));
|
|
120
118
|
return metadata;
|
|
121
119
|
}
|
|
120
|
+
/**
|
|
121
|
+
* Set a variable. These can be accessed in the embedding function
|
|
122
|
+
* configuration using the syntax `$var:variable_name`. If they are not
|
|
123
|
+
* set, an error will be thrown letting you know which key is unset. If you
|
|
124
|
+
* want to supply a default value, you can add an additional part in the
|
|
125
|
+
* configuration like so: `$var:variable_name:default_value`. Default values
|
|
126
|
+
* can be used for runtime configurations that are not sensitive, such as
|
|
127
|
+
* whether to use a GPU for inference.
|
|
128
|
+
*
|
|
129
|
+
* The name must not contain colons. The default value can contain colons.
|
|
130
|
+
*
|
|
131
|
+
* @param name
|
|
132
|
+
* @param value
|
|
133
|
+
*/
|
|
134
|
+
setVar(name, value) {
|
|
135
|
+
if (name.includes(":")) {
|
|
136
|
+
throw new Error("Variable names cannot contain colons");
|
|
137
|
+
}
|
|
138
|
+
this.#variables.set(name, value);
|
|
139
|
+
}
|
|
140
|
+
/**
|
|
141
|
+
* Get a variable.
|
|
142
|
+
* @param name
|
|
143
|
+
* @returns
|
|
144
|
+
* @see {@link setVar}
|
|
145
|
+
*/
|
|
146
|
+
getVar(name) {
|
|
147
|
+
return this.#variables.get(name);
|
|
148
|
+
}
|
|
122
149
|
}
|
|
123
150
|
exports.EmbeddingFunctionRegistry = EmbeddingFunctionRegistry;
|
|
124
151
|
const _REGISTRY = new EmbeddingFunctionRegistry();
|
|
@@ -27,8 +27,7 @@ export type XenovaTransformerOptions = {
|
|
|
27
27
|
};
|
|
28
28
|
export declare class TransformersEmbeddingFunction extends EmbeddingFunction<string, Partial<XenovaTransformerOptions>> {
|
|
29
29
|
#private;
|
|
30
|
-
constructor(
|
|
31
|
-
toJSON(): Record<string, any>;
|
|
30
|
+
constructor(optionsRaw?: Partial<XenovaTransformerOptions>);
|
|
32
31
|
init(): Promise<void>;
|
|
33
32
|
ndims(): number;
|
|
34
33
|
embeddingDataType(): Float;
|
|
@@ -22,10 +22,11 @@ let TransformersEmbeddingFunction = class TransformersEmbeddingFunction extends
|
|
|
22
22
|
#initialized = false;
|
|
23
23
|
#tokenizerOptions;
|
|
24
24
|
#ndims;
|
|
25
|
-
constructor(options = {
|
|
25
|
+
constructor(optionsRaw = {
|
|
26
26
|
model: "Xenova/all-MiniLM-L6-v2",
|
|
27
27
|
}) {
|
|
28
28
|
super();
|
|
29
|
+
const options = this.resolveVariables(optionsRaw);
|
|
29
30
|
const modelName = options?.model ?? "Xenova/all-MiniLM-L6-v2";
|
|
30
31
|
this.#tokenizerOptions = {
|
|
31
32
|
padding: true,
|
|
@@ -34,22 +35,6 @@ let TransformersEmbeddingFunction = class TransformersEmbeddingFunction extends
|
|
|
34
35
|
this.#ndims = options.ndims;
|
|
35
36
|
this.#modelName = modelName;
|
|
36
37
|
}
|
|
37
|
-
toJSON() {
|
|
38
|
-
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
39
|
-
const obj = {
|
|
40
|
-
model: this.#modelName,
|
|
41
|
-
};
|
|
42
|
-
if (this.#ndims) {
|
|
43
|
-
obj["ndims"] = this.#ndims;
|
|
44
|
-
}
|
|
45
|
-
if (this.#tokenizerOptions) {
|
|
46
|
-
obj["tokenizerOptions"] = this.#tokenizerOptions;
|
|
47
|
-
}
|
|
48
|
-
if (this.#tokenizer) {
|
|
49
|
-
obj["tokenizer"] = this.#tokenizer.name;
|
|
50
|
-
}
|
|
51
|
-
return obj;
|
|
52
|
-
}
|
|
53
38
|
async init() {
|
|
54
39
|
let transformers;
|
|
55
40
|
try {
|
package/package.json
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"ann"
|
|
12
12
|
],
|
|
13
13
|
"private": false,
|
|
14
|
-
"version": "0.16.1-beta.3",
|
|
14
|
+
"version": "0.17.0",
|
|
15
15
|
"main": "dist/index.js",
|
|
16
16
|
"exports": {
|
|
17
17
|
".": "./dist/index.js",
|
|
@@ -98,14 +98,14 @@
|
|
|
98
98
|
"reflect-metadata": "^0.2.2"
|
|
99
99
|
},
|
|
100
100
|
"optionalDependencies": {
|
|
101
|
-
"@lancedb/lancedb-darwin-x64": "0.
|
|
102
|
-
"@lancedb/lancedb-darwin-arm64": "0.
|
|
103
|
-
"@lancedb/lancedb-linux-x64-gnu": "0.
|
|
104
|
-
"@lancedb/lancedb-linux-arm64-gnu": "0.
|
|
105
|
-
"@lancedb/lancedb-linux-x64-musl": "0.
|
|
106
|
-
"@lancedb/lancedb-linux-arm64-musl": "0.
|
|
107
|
-
"@lancedb/lancedb-win32-x64-msvc": "0.
|
|
108
|
-
"@lancedb/lancedb-win32-arm64-msvc": "0.
|
|
101
|
+
"@lancedb/lancedb-darwin-x64": "0.17.0",
|
|
102
|
+
"@lancedb/lancedb-darwin-arm64": "0.17.0",
|
|
103
|
+
"@lancedb/lancedb-linux-x64-gnu": "0.17.0",
|
|
104
|
+
"@lancedb/lancedb-linux-arm64-gnu": "0.17.0",
|
|
105
|
+
"@lancedb/lancedb-linux-x64-musl": "0.17.0",
|
|
106
|
+
"@lancedb/lancedb-linux-arm64-musl": "0.17.0",
|
|
107
|
+
"@lancedb/lancedb-win32-x64-msvc": "0.17.0",
|
|
108
|
+
"@lancedb/lancedb-win32-arm64-msvc": "0.17.0"
|
|
109
109
|
},
|
|
110
110
|
"peerDependencies": {
|
|
111
111
|
"apache-arrow": ">=15.0.0 <=18.1.0"
|