@lancedb/lancedb 0.16.1-beta.3 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,39 +11,52 @@ export interface EmbeddingFunctionConstructor<T extends EmbeddingFunction = Embe
11
11
  }
12
12
  /**
13
13
  * An embedding function that automatically creates vector representation for a given column.
14
+ *
15
+ * It's important subclasses pass the **original** options to `resolveVariables`,
16
+ * which records them for `toJSON` and returns a copy with any variables resolved;
17
+ * use only that resolved copy.
18
+ *
19
+ * @example
20
+ * ```ts
21
+ * class MyEmbeddingFunction extends EmbeddingFunction {
22
+ * constructor(optionsRaw: {model: string, timeout: number}) {
23
+ * super();
24
+ * const options = this.resolveVariables(optionsRaw);
25
+ * this.model = options.model;
26
+ * this.timeout = options.timeout;
27
+ * }
28
+ * }
29
+ * ```
14
30
  */
15
31
  export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptions = FunctionOptions> {
32
+ #private;
16
33
  /**
17
34
  * @ignore
18
35
  * This is only used for associating the options type with the class for type checking
19
36
  */
20
37
  readonly TOptions: M;
21
38
  /**
22
- * Convert the embedding function to a JSON object
23
- * It is used to serialize the embedding function to the schema
24
- * It's important that any object returned by this method contains all the necessary
25
- * information to recreate the embedding function
26
- *
27
- * It should return the same object that was passed to the constructor
28
- * If it does not, the embedding function will not be able to be recreated, or could be recreated incorrectly
39
+ * Get the original arguments to the constructor, to serialize them so they
40
+ * can be used to recreate the embedding function later.
41
+ */
42
+ toJSON(): Record<string, any>;
43
+ constructor();
44
+ /**
45
+ * Provide a list of keys in the function options that should be treated as
46
+ * sensitive. If users pass raw values for these keys, they will be rejected.
47
+ */
48
+ protected getSensitiveKeys(): string[];
49
+ /**
50
+ * Apply variables to the config.
51
+ */
52
+ protected resolveVariables(config: Partial<M>): Partial<M>;
53
+ /**
54
+ * Optionally load any resources needed for the embedding function.
29
55
  *
30
- * @example
31
- * ```ts
32
- * class MyEmbeddingFunction extends EmbeddingFunction {
33
- * constructor(options: {model: string, timeout: number}) {
34
- * super();
35
- * this.model = options.model;
36
- * this.timeout = options.timeout;
37
- * }
38
- * toJSON() {
39
- * return {
40
- * model: this.model,
41
- * timeout: this.timeout,
42
- * };
43
- * }
44
- * ```
56
+ * This method is called after the embedding function has been initialized
57
+ * but before any embeddings are computed. It is useful for loading local models
58
+ * or other resources that are needed for the embedding function to work.
45
59
  */
46
- abstract toJSON(): Partial<M>;
47
60
  init?(): Promise<void>;
48
61
  /**
49
62
  * sourceField is used in combination with `LanceSchema` to provide a declarative data model
@@ -6,8 +6,25 @@ exports.TextEmbeddingFunction = exports.EmbeddingFunction = void 0;
6
6
  require("reflect-metadata");
7
7
  const arrow_1 = require("../arrow");
8
8
  const sanitize_1 = require("../sanitize");
9
+ const registry_1 = require("./registry");
9
10
  /**
10
11
  * An embedding function that automatically creates vector representation for a given column.
12
+ *
13
+ * It's important subclasses pass the **original** options to `resolveVariables`,
13
+ * which records them for `toJSON` and returns a copy with any variables resolved;
14
+ * use only that resolved copy.
15
+ *
16
+ * @example
17
+ * ```ts
18
+ * class MyEmbeddingFunction extends EmbeddingFunction {
19
+ * constructor(optionsRaw: {model: string, timeout: number}) {
20
+ * super();
21
+ * const options = this.resolveVariables(optionsRaw);
23
+ * this.model = options.model;
24
+ * this.timeout = options.timeout;
25
+ * }
26
+ * }
27
+ * ```
11
28
  */
12
29
  class EmbeddingFunction {
13
30
  /**
@@ -16,6 +33,59 @@ class EmbeddingFunction {
16
33
  */
17
34
  // biome-ignore lint/style/useNamingConvention: we want to keep the name as it is
18
35
  TOptions;
36
+ #config;
37
+ /**
38
+ * Get the original arguments to the constructor, to serialize them so they
39
+ * can be used to recreate the embedding function later.
40
+ */
41
+ // biome-ignore lint/suspicious/noExplicitAny :
42
+ toJSON() {
43
+ return JSON.parse(JSON.stringify(this.#config));
44
+ }
45
+ constructor() {
46
+ this.#config = {};
47
+ }
48
+ /**
49
+ * Provide a list of keys in the function options that should be treated as
50
+ * sensitive. If users pass raw values for these keys, they will be rejected.
51
+ */
52
+ getSensitiveKeys() {
53
+ return [];
54
+ }
55
+ /**
56
+ * Apply variables to the config.
57
+ */
58
+ resolveVariables(config) {
59
+ this.#config = config;
60
+ const registry = (0, registry_1.getRegistry)();
61
+ const newConfig = { ...config };
62
+ for (const [key_, value] of Object.entries(newConfig)) {
63
+ if (this.getSensitiveKeys().includes(key_) &&
64
+ !value.startsWith("$var:")) {
65
+ throw new Error(`The key "${key_}" is sensitive and cannot be set directly. Please use the $var: syntax to set it.`);
66
+ }
67
+ // Makes TS happy (https://stackoverflow.com/a/78391854)
68
+ const key = key_;
69
+ if (typeof value === "string" && value.startsWith("$var:")) {
70
+ const [name, defaultValue] = value.slice(5).split(":", 2);
71
+ const variableValue = registry.getVar(name);
72
+ if (!variableValue) {
73
+ if (defaultValue) {
74
+ // biome-ignore lint/suspicious/noExplicitAny:
75
+ newConfig[key] = defaultValue;
76
+ }
77
+ else {
78
+ throw new Error(`Variable "${name}" not found`);
79
+ }
80
+ }
81
+ else {
82
+ // biome-ignore lint/suspicious/noExplicitAny:
83
+ newConfig[key] = variableValue;
84
+ }
85
+ }
86
+ }
87
+ return newConfig;
88
+ }
19
89
  /**
20
90
  * sourceField is used in combination with `LanceSchema` to provide a declarative data model
21
91
  *
@@ -7,10 +7,8 @@ export type OpenAIOptions = {
7
7
  };
8
8
  export declare class OpenAIEmbeddingFunction extends EmbeddingFunction<string, Partial<OpenAIOptions>> {
9
9
  #private;
10
- constructor(options?: Partial<OpenAIOptions>);
11
- toJSON(): {
12
- model: (string & {}) | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large";
13
- };
10
+ constructor(optionsRaw?: Partial<OpenAIOptions>);
11
+ protected getSensitiveKeys(): string[];
14
12
  ndims(): number;
15
13
  embeddingDataType(): Float;
16
14
  computeSourceEmbeddings(data: string[]): Promise<number[][]>;
@@ -18,10 +18,11 @@ const registry_1 = require("./registry");
18
18
  let OpenAIEmbeddingFunction = class OpenAIEmbeddingFunction extends embedding_function_1.EmbeddingFunction {
19
19
  #openai;
20
20
  #modelName;
21
- constructor(options = {
21
+ constructor(optionsRaw = {
22
22
  model: "text-embedding-ada-002",
23
23
  }) {
24
24
  super();
25
+ const options = this.resolveVariables(optionsRaw);
25
26
  const openAIKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
26
27
  if (!openAIKey) {
27
28
  throw new Error("OpenAI API key is required");
@@ -45,10 +46,8 @@ let OpenAIEmbeddingFunction = class OpenAIEmbeddingFunction extends embedding_fu
45
46
  this.#openai = new Openai(configuration);
46
47
  this.#modelName = modelName;
47
48
  }
48
- toJSON() {
49
- return {
50
- model: this.#modelName,
51
- };
49
+ getSensitiveKeys() {
50
+ return ["apiKey"];
52
51
  }
53
52
  ndims() {
54
53
  switch (this.#modelName) {
@@ -34,6 +34,28 @@ export declare class EmbeddingFunctionRegistry {
34
34
  parseFunctions(this: EmbeddingFunctionRegistry, metadata: Map<string, string>): Promise<Map<string, EmbeddingFunctionConfig>>;
35
35
  functionToMetadata(conf: EmbeddingFunctionConfig): Record<string, any>;
36
36
  getTableMetadata(functions: EmbeddingFunctionConfig[]): Map<string, string>;
37
+ /**
38
+ * Set a variable. These can be accessed in the embedding function
39
+ * configuration using the syntax `$var:variable_name`. If they are not
40
+ * set, an error will be thrown letting you know which key is unset. If you
41
+ * want to supply a default value, you can add an additional part in the
42
+ * configuration like so: `$var:variable_name:default_value`. Default values
43
+ * can be used for runtime configurations that are not sensitive, such as
44
+ * whether to use a GPU for inference.
45
+ *
46
+ * The name must not contain colons. Note: the resolver parses the reference with `split(":", 2)`, so a default value is cut at its first colon.
47
+ *
48
+ * @param name
49
+ * @param value
50
+ */
51
+ setVar(name: string, value: string): void;
52
+ /**
53
+ * Get a variable.
54
+ * @param name
55
+ * @returns
56
+ * @see {@link setVar}
57
+ */
58
+ getVar(name: string): string | undefined;
37
59
  }
38
60
  export declare function register(name?: string): (ctor: EmbeddingFunctionConstructor<EmbeddingFunction<any, import("./embedding_function").FunctionOptions>>) => any;
39
61
  /**
@@ -14,6 +14,7 @@ require("reflect-metadata");
14
14
  */
15
15
  class EmbeddingFunctionRegistry {
16
16
  #functions = new Map();
17
+ #variables = new Map();
17
18
  /**
18
19
  * Get the number of registered functions
19
20
  */
@@ -60,10 +61,7 @@ class EmbeddingFunctionRegistry {
60
61
  }
61
62
  else {
62
63
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
63
- create = function (options) {
64
- const instance = new factory(options);
65
- return instance;
66
- };
64
+ create = (options) => new factory(options);
67
65
  }
68
66
  return {
69
67
  create,
@@ -119,6 +117,35 @@ class EmbeddingFunctionRegistry {
119
117
  metadata.set("embedding_functions", JSON.stringify(jsonData));
120
118
  return metadata;
121
119
  }
120
+ /**
121
+ * Set a variable. These can be accessed in the embedding function
122
+ * configuration using the syntax `$var:variable_name`. If they are not
123
+ * set, an error will be thrown letting you know which key is unset. If you
124
+ * want to supply a default value, you can add an additional part in the
125
+ * configuration like so: `$var:variable_name:default_value`. Default values
126
+ * can be used for runtime configurations that are not sensitive, such as
127
+ * whether to use a GPU for inference.
128
+ *
129
+ * The name must not contain colons. Note: the resolver parses the reference with `split(":", 2)`, so a default value is cut at its first colon.
130
+ *
131
+ * @param name
132
+ * @param value
133
+ */
134
+ setVar(name, value) {
135
+ if (name.includes(":")) {
136
+ throw new Error("Variable names cannot contain colons");
137
+ }
138
+ this.#variables.set(name, value);
139
+ }
140
+ /**
141
+ * Get a variable.
142
+ * @param name
143
+ * @returns
144
+ * @see {@link setVar}
145
+ */
146
+ getVar(name) {
147
+ return this.#variables.get(name);
148
+ }
122
149
  }
123
150
  exports.EmbeddingFunctionRegistry = EmbeddingFunctionRegistry;
124
151
  const _REGISTRY = new EmbeddingFunctionRegistry();
@@ -27,8 +27,7 @@ export type XenovaTransformerOptions = {
27
27
  };
28
28
  export declare class TransformersEmbeddingFunction extends EmbeddingFunction<string, Partial<XenovaTransformerOptions>> {
29
29
  #private;
30
- constructor(options?: Partial<XenovaTransformerOptions>);
31
- toJSON(): Record<string, any>;
30
+ constructor(optionsRaw?: Partial<XenovaTransformerOptions>);
32
31
  init(): Promise<void>;
33
32
  ndims(): number;
34
33
  embeddingDataType(): Float;
@@ -22,10 +22,11 @@ let TransformersEmbeddingFunction = class TransformersEmbeddingFunction extends
22
22
  #initialized = false;
23
23
  #tokenizerOptions;
24
24
  #ndims;
25
- constructor(options = {
25
+ constructor(optionsRaw = {
26
26
  model: "Xenova/all-MiniLM-L6-v2",
27
27
  }) {
28
28
  super();
29
+ const options = this.resolveVariables(optionsRaw);
29
30
  const modelName = options?.model ?? "Xenova/all-MiniLM-L6-v2";
30
31
  this.#tokenizerOptions = {
31
32
  padding: true,
@@ -34,22 +35,6 @@ let TransformersEmbeddingFunction = class TransformersEmbeddingFunction extends
34
35
  this.#ndims = options.ndims;
35
36
  this.#modelName = modelName;
36
37
  }
37
- toJSON() {
38
- // biome-ignore lint/suspicious/noExplicitAny: <explanation>
39
- const obj = {
40
- model: this.#modelName,
41
- };
42
- if (this.#ndims) {
43
- obj["ndims"] = this.#ndims;
44
- }
45
- if (this.#tokenizerOptions) {
46
- obj["tokenizerOptions"] = this.#tokenizerOptions;
47
- }
48
- if (this.#tokenizer) {
49
- obj["tokenizer"] = this.#tokenizer.name;
50
- }
51
- return obj;
52
- }
53
38
  async init() {
54
39
  let transformers;
55
40
  try {
package/package.json CHANGED
@@ -11,7 +11,7 @@
11
11
  "ann"
12
12
  ],
13
13
  "private": false,
14
- "version": "0.16.1-beta.3",
14
+ "version": "0.17.0",
15
15
  "main": "dist/index.js",
16
16
  "exports": {
17
17
  ".": "./dist/index.js",
@@ -98,14 +98,14 @@
98
98
  "reflect-metadata": "^0.2.2"
99
99
  },
100
100
  "optionalDependencies": {
101
- "@lancedb/lancedb-darwin-x64": "0.16.1-beta.3",
102
- "@lancedb/lancedb-darwin-arm64": "0.16.1-beta.3",
103
- "@lancedb/lancedb-linux-x64-gnu": "0.16.1-beta.3",
104
- "@lancedb/lancedb-linux-arm64-gnu": "0.16.1-beta.3",
105
- "@lancedb/lancedb-linux-x64-musl": "0.16.1-beta.3",
106
- "@lancedb/lancedb-linux-arm64-musl": "0.16.1-beta.3",
107
- "@lancedb/lancedb-win32-x64-msvc": "0.16.1-beta.3",
108
- "@lancedb/lancedb-win32-arm64-msvc": "0.16.1-beta.3"
101
+ "@lancedb/lancedb-darwin-x64": "0.17.0",
102
+ "@lancedb/lancedb-darwin-arm64": "0.17.0",
103
+ "@lancedb/lancedb-linux-x64-gnu": "0.17.0",
104
+ "@lancedb/lancedb-linux-arm64-gnu": "0.17.0",
105
+ "@lancedb/lancedb-linux-x64-musl": "0.17.0",
106
+ "@lancedb/lancedb-linux-arm64-musl": "0.17.0",
107
+ "@lancedb/lancedb-win32-x64-msvc": "0.17.0",
108
+ "@lancedb/lancedb-win32-arm64-msvc": "0.17.0"
109
109
  },
110
110
  "peerDependencies": {
111
111
  "apache-arrow": ">=15.0.0 <=18.1.0"