@lancedb/lancedb 0.14.0-beta.2 → 0.14.1-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/arrow.d.ts +248 -0
- package/dist/arrow.js +743 -0
- package/dist/connection.d.ts +188 -0
- package/dist/connection.js +149 -0
- package/dist/embedding/embedding_function.d.ts +90 -0
- package/dist/embedding/embedding_function.js +133 -0
- package/dist/embedding/index.d.ts +27 -0
- package/dist/embedding/index.js +112 -0
- package/dist/embedding/openai.d.ts +18 -0
- package/dist/embedding/openai.js +105 -0
- package/dist/embedding/registry.d.ts +55 -0
- package/dist/embedding/registry.js +151 -0
- package/dist/embedding/transformers.d.ts +37 -0
- package/dist/embedding/transformers.js +148 -0
- package/dist/index.d.ts +54 -0
- package/dist/index.js +55 -0
- package/dist/indices.d.ts +429 -0
- package/dist/indices.js +131 -0
- package/dist/merge.d.ts +54 -0
- package/dist/merge.js +64 -0
- package/dist/native.d.ts +340 -0
- package/dist/native.js +330 -0
- package/dist/query.d.ts +324 -0
- package/dist/query.js +544 -0
- package/dist/sanitize.d.ts +31 -0
- package/dist/sanitize.js +437 -0
- package/dist/table.d.ts +425 -0
- package/dist/table.js +276 -0
- package/dist/util.d.ts +13 -0
- package/dist/util.js +65 -0
- package/package.json +9 -9
- package/DEVELOPMENT.md +0 -42
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Copyright 2023 Lance Developers.
|
|
3
|
+
//
|
|
4
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
// you may not use this file except in compliance with the License.
|
|
6
|
+
// You may obtain a copy of the License at
|
|
7
|
+
//
|
|
8
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
//
|
|
10
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
// See the License for the specific language governing permissions and
|
|
14
|
+
// limitations under the License.
|
|
15
|
+
// Re-export helper (an already-defined this.__createBinding is reused when present).
// Exposes m[k] on o under the name k2, which defaults to k. When Object.create exists the
// property is installed with Object.defineProperty; otherwise the value is copied by assignment.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
16
|
+
if (k2 === undefined) k2 = k;
|
|
17
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
18
|
+
// Use a getter that reads m[k] on every access when m has no own descriptor for k, when the
// descriptor is an accessor and m is not flagged __esModule, or when it is a writable or
// configurable data property.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
19
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
20
|
+
}
|
|
21
|
+
Object.defineProperty(o, k2, desc);
|
|
22
|
+
}) : (function(o, m, k, k2) {
|
|
23
|
+
if (k2 === undefined) k2 = k;
|
|
24
|
+
o[k2] = m[k];
|
|
25
|
+
}));
|
|
26
|
+
// `export *` helper (an already-defined this.__exportStar is reused when present).
// Binds every enumerable key of m onto exports, skipping "default" and any name that
// exports already owns, so existing exports are never overwritten.
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
27
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
28
|
+
};
|
|
29
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
30
|
+
exports.TextEmbeddingFunction = exports.EmbeddingFunction = void 0;
|
|
31
|
+
exports.LanceSchema = LanceSchema;
|
|
32
|
+
const arrow_1 = require("../arrow");
|
|
33
|
+
const sanitize_1 = require("../sanitize");
|
|
34
|
+
const registry_1 = require("./registry");
|
|
35
|
+
var embedding_function_1 = require("./embedding_function");
|
|
36
|
+
Object.defineProperty(exports, "EmbeddingFunction", { enumerable: true, get: function () { return embedding_function_1.EmbeddingFunction; } });
|
|
37
|
+
Object.defineProperty(exports, "TextEmbeddingFunction", { enumerable: true, get: function () { return embedding_function_1.TextEmbeddingFunction; } });
|
|
38
|
+
__exportStar(require("./registry"), exports);
|
|
39
|
+
/**
 * Build an Arrow schema whose metadata describes the embedding functions
 * attached to its columns.
 *
 * Each entry of `fields` is either a data type, or a two-element array of
 * `[dataType, metadata]`. The example below obtains such arrays from
 * `sourceField()` / `vectorField()`. Every column is created with the
 * third `Field` argument set to `true`.
 *
 * @param fields Object mapping column name to a data type or to a
 *   `[dataType, metadata]` array.
 * @returns A Schema containing one field per entry, with table metadata
 *   produced by the embedding function registry.
 * @example
 * ```ts
 * class MyEmbeddingFunction extends EmbeddingFunction {
 *   // ...
 * }
 * const func = new MyEmbeddingFunction();
 * const schema = LanceSchema({
 *   id: new Int32(),
 *   text: func.sourceField(new Utf8()),
 *   vector: func.vectorField(),
 *   // optional: specify the datatype and/or dimensions
 *   vector2: func.vectorField({ datatype: new Float32(), dims: 3}),
 * });
 *
 * const table = await db.createTable("my_table", data, { schema });
 * ```
 */
function LanceSchema(fields) {
    const columns = [];
    const configsByFunction = new Map();
    for (const [columnName, spec] of Object.entries(fields)) {
        const hasMetadata = Array.isArray(spec);
        const dataType = hasMetadata ? spec[0] : spec;
        columns.push(new arrow_1.Field(columnName, (0, sanitize_1.sanitizeType)(dataType), true));
        if (hasMetadata) {
            // The second array element says which embedding function this column belongs to.
            parseEmbeddingFunctions(configsByFunction, columnName, spec[1]);
        }
    }
    const tableMetadata = (0, registry_1.getRegistry)().getTableMetadata([...configsByFunction.values()]);
    return new arrow_1.Schema(columns, tableMetadata);
}
|
|
79
|
+
/**
 * Record the role a column plays for an embedding function.
 *
 * `metadata` is inspected for the key "source_column_for" and, only when that
 * key is absent, for "vector_column_for". The value stored under the key is
 * the embedding function; it is used as the key of `embeddingFunctions`. The
 * entry for that function is created on first use and afterwards merged, so a
 * source column and a vector column registered in separate calls end up in
 * the same entry.
 *
 * @param embeddingFunctions Map from embedding function to its column
 *   configuration (`sourceColumn`, `vectorColumn`, `function`); updated in place.
 * @param key Name of the column being described.
 * @param metadata Map carrying the role marker for the column. A missing
 *   (undefined or null) value is treated as "no embedding information".
 */
function parseEmbeddingFunctions(embeddingFunctions, key, metadata) {
    // A field array without a metadata element describes a plain column.
    if (metadata === undefined || metadata === null) {
        return;
    }
    // The source role wins when both markers are present.
    let roleKey;
    if (metadata.has("source_column_for")) {
        roleKey = "source_column_for";
    }
    else if (metadata.has("vector_column_for")) {
        roleKey = "vector_column_for";
    }
    else {
        return;
    }
    const columnProperty = roleKey === "source_column_for" ? "sourceColumn" : "vectorColumn";
    const embedFunction = metadata.get(roleKey);
    // Start from the existing entry (if any) so the other role's column is preserved.
    embeddingFunctions.set(embedFunction, {
        function: embedFunction,
        ...embeddingFunctions.get(embedFunction),
        [columnProperty]: key,
    });
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { EmbeddingCreateParams } from "openai/resources/index";
|
|
2
|
+
import { Float } from "../arrow";
|
|
3
|
+
import { EmbeddingFunction } from "./embedding_function";
|
|
4
|
+
/** Options for OpenAIEmbeddingFunction; its constructor accepts a Partial of this type. */
export type OpenAIOptions = {
|
|
5
|
+
/** OpenAI API key. The compiled constructor falls back to process.env.OPENAI_API_KEY when this is not set. */
apiKey: string;
|
|
6
|
+
/** Name of the embedding model. The compiled constructor defaults to "text-embedding-ada-002". */
model: EmbeddingCreateParams["model"];
|
|
7
|
+
};
|
|
8
|
+
/**
 * Embedding function that calls the `embeddings.create` method of the `openai` package.
 * The compiled implementation registers it under the alias "openai".
 */
export declare class OpenAIEmbeddingFunction extends EmbeddingFunction<string, Partial<OpenAIOptions>> {
|
|
9
|
+
#private;
|
|
10
|
+
/**
 * @param options API key and model name; both are optional.
 * @throws Error when no API key is given or found in the environment, or when the
 *   `openai` package cannot be loaded (per the compiled implementation).
 */
constructor(options?: Partial<OpenAIOptions>);
|
|
11
|
+
/** Serializable configuration: only the model name. The API key is not included. */
toJSON(): {
|
|
12
|
+
model: (string & {}) | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large";
|
|
13
|
+
};
|
|
14
|
+
/**
 * Embedding length for the configured model. The compiled implementation returns 1536 for
 * "text-embedding-ada-002" and "text-embedding-3-small", 3072 for "text-embedding-3-large",
 * and throws for any other model name.
 */
ndims(): number;
|
|
15
|
+
/** Arrow data type of the embedding values (the compiled implementation returns Float32). */
embeddingDataType(): Float;
|
|
16
|
+
/** Embed a batch of strings in one request; returns one vector per item in the response. */
computeSourceEmbeddings(data: string[]): Promise<number[][]>;
|
|
17
|
+
/** Embed a single query string. The compiled implementation throws if `data` is not a string. */
computeQueryEmbeddings(data: string): Promise<number[]>;
|
|
18
|
+
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Copyright 2023 Lance Developers.
|
|
3
|
+
//
|
|
4
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
// you may not use this file except in compliance with the License.
|
|
6
|
+
// You may obtain a copy of the License at
|
|
7
|
+
//
|
|
8
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
//
|
|
10
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
// See the License for the specific language governing permissions and
|
|
14
|
+
// limitations under the License.
|
|
15
|
+
// Decorator-application helper (an already-defined this.__decorate is reused when present).
// With fewer than 3 arguments it decorates `target` itself; otherwise it decorates the member
// `key`, reading its descriptor from `target` when `desc` is null.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
|
|
16
|
+
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
|
|
17
|
+
// Delegate to Reflect.decorate when it is available.
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
|
|
18
|
+
// Otherwise apply the decorators from last to first, keeping the previous value whenever a decorator returns a falsy result.
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
|
|
19
|
+
// With 4 arguments and a truthy result, redefine the member on target; the result is returned in every case.
return c > 3 && r && Object.defineProperty(target, key, r), r;
|
|
20
|
+
};
|
|
21
|
+
// Metadata helper (an already-defined this.__metadata is reused when present).
// Returns Reflect.metadata(k, v) when that function exists; otherwise falls through and returns undefined.
var __metadata = (this && this.__metadata) || function (k, v) {
|
|
22
|
+
if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v);
|
|
23
|
+
};
|
|
24
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
25
|
+
exports.OpenAIEmbeddingFunction = void 0;
|
|
26
|
+
const arrow_1 = require("../arrow");
|
|
27
|
+
const embedding_function_1 = require("./embedding_function");
|
|
28
|
+
const registry_1 = require("./registry");
|
|
29
|
+
/**
 * Embedding function that sends text to the `embeddings.create` method of
 * the `openai` package.
 *
 * The package is loaded with `require` inside the constructor, so it only
 * has to be installed when this class is actually instantiated.
 */
let OpenAIEmbeddingFunction = class OpenAIEmbeddingFunction extends embedding_function_1.EmbeddingFunction {
    #openai;
    #modelName;
    /**
     * @param options Optional `apiKey` and `model`. The key falls back to
     *   `process.env.OPENAI_API_KEY`; the model falls back to
     *   "text-embedding-ada-002".
     * @throws Error when no API key is available or `openai` cannot be loaded.
     */
    constructor(options = {
        model: "text-embedding-ada-002",
    }) {
        super();
        const apiKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
        if (!apiKey) {
            throw new Error("OpenAI API key is required");
        }
        const selectedModel = options?.model ?? "text-embedding-ada-002";
        /**
         * @type {import("openai").default}
         */
        // eslint-disable-next-line @typescript-eslint/naming-convention
        let OpenAIClient;
        try {
            // eslint-disable-next-line @typescript-eslint/no-var-requires
            OpenAIClient = require("openai");
        }
        catch {
            throw new Error("please install openai@^4.24.1 using npm install openai");
        }
        this.#openai = new OpenAIClient({ apiKey: apiKey });
        this.#modelName = selectedModel;
    }
    /** Serializable configuration: the model name only. */
    toJSON() {
        return { model: this.#modelName };
    }
    /**
     * Embedding length for the configured model.
     * @throws Error for a model name that is not one of the three known ones.
     */
    ndims() {
        const model = this.#modelName;
        if (model === "text-embedding-3-large") {
            return 3072;
        }
        if (model === "text-embedding-ada-002" || model === "text-embedding-3-small") {
            return 1536;
        }
        throw new Error(`Unknown model: ${this.#modelName}`);
    }
    /** Arrow data type used for the embedding values. */
    embeddingDataType() {
        return new arrow_1.Float32();
    }
    /** Embed a batch of inputs with a single request; one vector per response item. */
    async computeSourceEmbeddings(data) {
        const request = { model: this.#modelName, input: data };
        const result = await this.#openai.embeddings.create(request);
        return result.data.map((item) => item.embedding);
    }
    /**
     * Embed a single query string.
     * @throws Error when `data` is not a string.
     */
    async computeQueryEmbeddings(data) {
        if (typeof data !== "string") {
            throw new Error("Data must be a string");
        }
        const request = { model: this.#modelName, input: data };
        const result = await this.#openai.embeddings.create(request);
        return result.data[0].embedding;
    }
};
|
|
101
|
+
exports.OpenAIEmbeddingFunction = OpenAIEmbeddingFunction;
|
|
102
|
+
exports.OpenAIEmbeddingFunction = OpenAIEmbeddingFunction = __decorate([
|
|
103
|
+
(0, registry_1.register)("openai"),
|
|
104
|
+
__metadata("design:paramtypes", [Object])
|
|
105
|
+
], OpenAIEmbeddingFunction);
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { type EmbeddingFunction, type EmbeddingFunctionConstructor } from "./embedding_function";
|
|
2
|
+
import "reflect-metadata";
|
|
3
|
+
/** Resolves to Promise<T> when T declares an async `init()` method, and to T itself otherwise. */
type CreateReturnType<T> = T extends {
|
|
4
|
+
init: () => Promise<void>;
|
|
5
|
+
} ? Promise<T> : T;
|
|
6
|
+
/** Factory object returned by EmbeddingFunctionRegistry.get for a registered embedding function. */
interface EmbeddingFunctionCreate<T extends EmbeddingFunction> {
|
|
7
|
+
/** Construct the function with the given options; the result is a promise when T has an `init()` method. */
create(options?: T["TOptions"]): CreateReturnType<T>;
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* This is a singleton class used to register embedding functions
|
|
11
|
+
* and fetch them by name. It also handles serializing and deserializing.
|
|
12
|
+
* You can implement your own embedding function by subclassing EmbeddingFunction
|
|
13
|
+
* or TextEmbeddingFunction and registering it with the registry
|
|
14
|
+
*/
|
|
15
|
+
export declare class EmbeddingFunctionRegistry {
|
|
16
|
+
#private;
|
|
17
|
+
/**
|
|
18
|
+
* Get the number of registered functions
|
|
19
|
+
*/
|
|
20
|
+
length(): number;
|
|
21
|
+
/**
|
|
22
|
+
* Register an embedding function
|
|
23
|
+
* @param alias The name to register the class under (the compiled implementation uses the class name when this is omitted)
|
|
24
|
+
* @returns A decorator that registers the class it is applied to and returns that class
|
|
25
|
+
* @throws Error if the function is already registered
|
|
26
|
+
*/
|
|
27
|
+
register<T extends EmbeddingFunctionConstructor = EmbeddingFunctionConstructor>(this: EmbeddingFunctionRegistry, alias?: string): (ctor: T) => any;
|
|
28
|
+
/**
 * Fetch an embedding function by name
 * @param name The name the function was registered under
 * @returns A factory with a `create` method, or undefined when nothing is registered under `name`
 */
get<T extends EmbeddingFunction<unknown>>(name: string): EmbeddingFunctionCreate<T> | undefined;
|
|
29
|
+
/**
|
|
30
|
+
* reset the registry to the initial state
|
|
31
|
+
*/
|
|
32
|
+
reset(this: EmbeddingFunctionRegistry): void;
|
|
33
|
+
/**
|
|
34
|
+
* @ignore
|
|
35
|
+
*/
|
|
36
|
+
parseFunctions(this: EmbeddingFunctionRegistry, metadata: Map<string, string>): Promise<Map<string, EmbeddingFunctionConfig>>;
|
|
37
|
+
/** Convert one configuration into a plain object suitable for JSON serialization. */
functionToMetadata(conf: EmbeddingFunctionConfig): Record<string, any>;
|
|
38
|
+
/** Build table metadata in which the key "embedding_functions" holds the JSON for `functions`. */
getTableMetadata(functions: EmbeddingFunctionConfig[]): Map<string, string>;
|
|
39
|
+
}
|
|
40
|
+
/**
 * Class decorator factory that registers an embedding function with the global registry.
 * @param name The alias to register under; optional.
 * @returns A decorator to apply to the embedding function class.
 */
export declare function register(name?: string): (ctor: EmbeddingFunctionConstructor<EmbeddingFunction<any, import("./embedding_function").FunctionOptions>>) => any;
|
|
41
|
+
/**
|
|
42
|
+
* Utility function to get the global instance of the registry
|
|
43
|
+
* @returns `EmbeddingFunctionRegistry` The global instance of the registry
|
|
44
|
+
* @example
|
|
45
|
+
* ```ts
|
|
46
|
+
* const registry = getRegistry();
|
|
47
|
+
* const openai = registry.get("openai")?.create();
|
|
48
|
+
* ```
*/
|
|
49
|
+
export declare function getRegistry(): EmbeddingFunctionRegistry;
|
|
50
|
+
/** Associates an embedding function with the columns it reads from and writes to. */
export interface EmbeddingFunctionConfig {
|
|
51
|
+
/** Name of the column whose values are embedded. */
sourceColumn: string;
|
|
52
|
+
/** Name of the column holding the vectors; the compiled serializer writes "vector" when this is unset. */
vectorColumn?: string;
|
|
53
|
+
/** The embedding function instance. */
function: EmbeddingFunction;
|
|
54
|
+
}
|
|
55
|
+
export {};
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Copyright 2024 Lance Developers.
|
|
3
|
+
//
|
|
4
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
// you may not use this file except in compliance with the License.
|
|
6
|
+
// You may obtain a copy of the License at
|
|
7
|
+
//
|
|
8
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
//
|
|
10
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
// See the License for the specific language governing permissions and
|
|
14
|
+
// limitations under the License.
|
|
15
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
|
+
exports.EmbeddingFunctionRegistry = void 0;
|
|
17
|
+
exports.register = register;
|
|
18
|
+
exports.getRegistry = getRegistry;
|
|
19
|
+
require("reflect-metadata");
|
|
20
|
+
/**
|
|
21
|
+
* This is a singleton class used to register embedding functions
|
|
22
|
+
* and fetch them by name. It also handles serializing and deserializing.
|
|
23
|
+
* You can implement your own embedding function by subclassing EmbeddingFunction
|
|
24
|
+
* or TextEmbeddingFunction and registering it with the registry
|
|
25
|
+
*/
|
|
26
|
+
class EmbeddingFunctionRegistry {
|
|
27
|
+
#functions = new Map();
|
|
28
|
+
/**
|
|
29
|
+
* Get the number of registered functions
|
|
30
|
+
*/
|
|
31
|
+
length() {
|
|
32
|
+
return this.#functions.size;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Register an embedding function
|
|
36
|
+
* @param name The name of the function
|
|
37
|
+
* @param func The function to register
|
|
38
|
+
* @throws Error if the function is already registered
|
|
39
|
+
*/
|
|
40
|
+
register(alias) {
|
|
41
|
+
const self = this;
|
|
42
|
+
return function (ctor) {
|
|
43
|
+
if (!alias) {
|
|
44
|
+
alias = ctor.name;
|
|
45
|
+
}
|
|
46
|
+
if (self.#functions.has(alias)) {
|
|
47
|
+
throw new Error(`Embedding function with alias "${alias}" already exists`);
|
|
48
|
+
}
|
|
49
|
+
self.#functions.set(alias, ctor);
|
|
50
|
+
Reflect.defineMetadata("lancedb::embedding::name", alias, ctor);
|
|
51
|
+
return ctor;
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Fetch an embedding function by name
|
|
56
|
+
* @param name The name of the function
|
|
57
|
+
*/
|
|
58
|
+
get(name) {
|
|
59
|
+
const factory = this.#functions.get(name);
|
|
60
|
+
if (!factory) {
|
|
61
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
62
|
+
return undefined;
|
|
63
|
+
}
|
|
64
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
65
|
+
let create;
|
|
66
|
+
if (factory.prototype.init) {
|
|
67
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
68
|
+
create = async function (options) {
|
|
69
|
+
const instance = new factory(options);
|
|
70
|
+
await instance.init();
|
|
71
|
+
return instance;
|
|
72
|
+
};
|
|
73
|
+
}
|
|
74
|
+
else {
|
|
75
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
76
|
+
create = function (options) {
|
|
77
|
+
const instance = new factory(options);
|
|
78
|
+
return instance;
|
|
79
|
+
};
|
|
80
|
+
}
|
|
81
|
+
return {
|
|
82
|
+
create,
|
|
83
|
+
};
|
|
84
|
+
}
|
|
85
|
+
/**
|
|
86
|
+
* reset the registry to the initial state
|
|
87
|
+
*/
|
|
88
|
+
reset() {
|
|
89
|
+
this.#functions.clear();
|
|
90
|
+
}
|
|
91
|
+
/**
|
|
92
|
+
* @ignore
|
|
93
|
+
*/
|
|
94
|
+
async parseFunctions(metadata) {
|
|
95
|
+
if (!metadata.has("embedding_functions")) {
|
|
96
|
+
return new Map();
|
|
97
|
+
}
|
|
98
|
+
else {
|
|
99
|
+
const functions = (JSON.parse(metadata.get("embedding_functions")));
|
|
100
|
+
const items = await Promise.all(functions.map(async (f) => {
|
|
101
|
+
const fn = this.get(f.name);
|
|
102
|
+
if (!fn) {
|
|
103
|
+
throw new Error(`Function "${f.name}" not found in registry`);
|
|
104
|
+
}
|
|
105
|
+
const func = await this.get(f.name).create(f.model);
|
|
106
|
+
return [
|
|
107
|
+
f.name,
|
|
108
|
+
{
|
|
109
|
+
sourceColumn: f.sourceColumn,
|
|
110
|
+
vectorColumn: f.vectorColumn,
|
|
111
|
+
function: func,
|
|
112
|
+
},
|
|
113
|
+
];
|
|
114
|
+
}));
|
|
115
|
+
return new Map(items);
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
119
|
+
functionToMetadata(conf) {
|
|
120
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
121
|
+
const metadata = {};
|
|
122
|
+
const name = Reflect.getMetadata("lancedb::embedding::name", conf.function.constructor);
|
|
123
|
+
metadata["sourceColumn"] = conf.sourceColumn;
|
|
124
|
+
metadata["vectorColumn"] = conf.vectorColumn ?? "vector";
|
|
125
|
+
metadata["name"] = name ?? conf.function.constructor.name;
|
|
126
|
+
metadata["model"] = conf.function.toJSON();
|
|
127
|
+
return metadata;
|
|
128
|
+
}
|
|
129
|
+
getTableMetadata(functions) {
|
|
130
|
+
const metadata = new Map();
|
|
131
|
+
const jsonData = functions.map((conf) => this.functionToMetadata(conf));
|
|
132
|
+
metadata.set("embedding_functions", JSON.stringify(jsonData));
|
|
133
|
+
return metadata;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
exports.EmbeddingFunctionRegistry = EmbeddingFunctionRegistry;
|
|
137
|
+
const _REGISTRY = new EmbeddingFunctionRegistry();
|
|
138
|
+
/**
 * Class decorator factory backed by the module-level registry instance.
 *
 * @param name Alias to register the decorated class under; forwarded
 *   unchanged to the registry's `register` method.
 * @returns The decorator produced by the registry.
 */
function register(name) {
    const decorator = _REGISTRY.register(name);
    return decorator;
}
|
|
141
|
+
/**
 * Access the module-level embedding function registry.
 *
 * Every call returns the same `EmbeddingFunctionRegistry` instance.
 *
 * @returns The shared registry instance.
 * @example
 * ```ts
 * const registry = getRegistry();
 * const openai = registry.get("openai").create();
 * ```
 */
function getRegistry() {
    const sharedRegistry = _REGISTRY;
    return sharedRegistry;
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { Float } from "../arrow";
|
|
2
|
+
import { EmbeddingFunction } from "./embedding_function";
|
|
3
|
+
/** Options for TransformersEmbeddingFunction; its constructor accepts a Partial of this type. */
export type XenovaTransformerOptions = {
|
|
4
|
+
/** The wasm compatible model to use */
|
|
5
|
+
model: string;
|
|
6
|
+
/**
|
|
7
|
+
* The wasm compatible tokenizer to use
|
|
8
|
+
* If not provided, it will use the default tokenizer for the model
|
|
9
|
+
*/
|
|
10
|
+
// NOTE(review): the compiled constructor and init() load the tokenizer from `model` and do not
// appear to read this option. Confirm whether it is meant to be honoured.
tokenizer?: string;
|
|
11
|
+
/**
|
|
12
|
+
* The number of dimensions of the embeddings
|
|
13
|
+
*
|
|
14
|
+
* We will attempt to infer this from the model config if not provided.
|
|
15
|
+
* Since there isn't a standard way to get this information from the model,
|
|
16
|
+
* you may need to manually specify this if using a model that doesn't have a 'hidden_size' in the config.
|
|
17
|
+
* */
|
|
18
|
+
ndims?: number;
|
|
19
|
+
/** Options for the tokenizer */
|
|
20
|
+
// The compiled constructor starts from `padding: true` and lets these values override it.
tokenizerOptions?: {
|
|
21
|
+
textPair?: string | string[];
|
|
22
|
+
padding?: boolean | "max_length";
|
|
23
|
+
addSpecialTokens?: boolean;
|
|
24
|
+
truncation?: boolean;
|
|
25
|
+
maxLength?: number;
|
|
26
|
+
};
|
|
27
|
+
};
|
|
28
|
+
/**
 * Embedding function backed by the `@huggingface/transformers` package.
 * The compiled implementation registers it under the alias "huggingface" and
 * defaults the model to "Xenova/all-MiniLM-L6-v2".
 */
export declare class TransformersEmbeddingFunction extends EmbeddingFunction<string, Partial<XenovaTransformerOptions>> {
|
|
29
|
+
#private;
|
|
30
|
+
/** @param options Model name, embedding dimensions and tokenizer options; all optional. */
constructor(options?: Partial<XenovaTransformerOptions>);
|
|
31
|
+
/** Serializable configuration: model name plus, when set, ndims, tokenizerOptions and the loaded tokenizer's name. */
toJSON(): Record<string, any>;
|
|
32
|
+
/** Load the package, the model and the tokenizer. Must complete before embeddings are computed. */
init(): Promise<void>;
|
|
33
|
+
/** Embedding length: the `ndims` option when given, otherwise `hidden_size` from the loaded model's config. */
ndims(): number;
|
|
34
|
+
/** Arrow data type of the embedding values (the compiled implementation returns Float32). */
embeddingDataType(): Float;
|
|
35
|
+
/** Embed a batch of strings; rejects when init() has not completed. */
computeSourceEmbeddings(data: string[]): Promise<number[][]>;
|
|
36
|
+
/** Embed a single query string by delegating to computeSourceEmbeddings. */
computeQueryEmbeddings(data: string): Promise<number[]>;
|
|
37
|
+
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Copyright 2023 Lance Developers.
|
|
3
|
+
//
|
|
4
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
// you may not use this file except in compliance with the License.
|
|
6
|
+
// You may obtain a copy of the License at
|
|
7
|
+
//
|
|
8
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
//
|
|
10
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
// See the License for the specific language governing permissions and
|
|
14
|
+
// limitations under the License.
|
|
15
|
+
// Decorator-application helper (an already-defined this.__decorate is reused when present).
// With fewer than 3 arguments it decorates `target` itself; otherwise it decorates the member
// `key`, reading its descriptor from `target` when `desc` is null.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
|
|
16
|
+
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
|
|
17
|
+
// Delegate to Reflect.decorate when it is available.
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
|
|
18
|
+
// Otherwise apply the decorators from last to first, keeping the previous value whenever a decorator returns a falsy result.
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
|
|
19
|
+
// With 4 arguments and a truthy result, redefine the member on target; the result is returned in every case.
return c > 3 && r && Object.defineProperty(target, key, r), r;
|
|
20
|
+
};
|
|
21
|
+
// Metadata helper (an already-defined this.__metadata is reused when present).
// Returns Reflect.metadata(k, v) when that function exists; otherwise falls through and returns undefined.
var __metadata = (this && this.__metadata) || function (k, v) {
|
|
22
|
+
if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v);
|
|
23
|
+
};
|
|
24
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
25
|
+
exports.TransformersEmbeddingFunction = void 0;
|
|
26
|
+
const arrow_1 = require("../arrow");
|
|
27
|
+
const embedding_function_1 = require("./embedding_function");
|
|
28
|
+
const registry_1 = require("./registry");
|
|
29
|
+
/**
 * Embedding function backed by the `@huggingface/transformers` package.
 *
 * Construction only records the configuration. `init()` loads the package,
 * the model and the tokenizer, and must complete before embeddings are
 * computed.
 */
let TransformersEmbeddingFunction = class TransformersEmbeddingFunction extends embedding_function_1.EmbeddingFunction {
    #model;
    #tokenizer;
    #modelName;
    #initialized = false;
    #tokenizerOptions;
    #ndims;
    /**
     * @param options Optional `model` (default "Xenova/all-MiniLM-L6-v2"),
     *   `ndims` and `tokenizerOptions`. Tokenizer options are merged over
     *   the default `{ padding: true }`.
     */
    constructor(options = {
        model: "Xenova/all-MiniLM-L6-v2",
    }) {
        super();
        // Guard every read the same way so a null `options` does not crash half-way through.
        this.#modelName = options?.model ?? "Xenova/all-MiniLM-L6-v2";
        this.#tokenizerOptions = {
            padding: true,
            ...options?.tokenizerOptions,
        };
        this.#ndims = options?.ndims;
    }
    /**
     * Serializable configuration: the model name plus, when set, `ndims`,
     * `tokenizerOptions` and the loaded tokenizer's `name`.
     */
    toJSON() {
        // biome-ignore lint/suspicious/noExplicitAny: <explanation>
        const obj = {
            model: this.#modelName,
        };
        if (this.#ndims) {
            obj["ndims"] = this.#ndims;
        }
        if (this.#tokenizerOptions) {
            obj["tokenizerOptions"] = this.#tokenizerOptions;
        }
        if (this.#tokenizer) {
            obj["tokenizer"] = this.#tokenizer.name;
        }
        return obj;
    }
    /**
     * Load `@huggingface/transformers`, then the model (dtype "fp32") and the
     * tokenizer for the configured model name.
     *
     * @throws Error wrapping the underlying reason when any of the three loads fails.
     */
    async init() {
        let transformers;
        try {
            // SAFETY:
            // since typescript transpiles `import` to `require`, we need to do this in an unsafe way
            // We can't use `require` because `@huggingface/transformers` is an ESM module
            // and we can't use `import` directly because typescript will transpile it to `require`.
            // and we want to remain compatible with both ESM and CJS modules
            // so we use `eval` to bypass typescript for this specific import.
            // The evaluated string is a constant; no external data reaches eval.
            transformers = await eval('import("@huggingface/transformers")');
        }
        catch (e) {
            throw new Error(`error loading @huggingface/transformers\nReason: ${e}`);
        }
        try {
            this.#model = await transformers.AutoModel.from_pretrained(this.#modelName, { dtype: "fp32" });
        }
        catch (e) {
            throw new Error(`error loading model ${this.#modelName}. Make sure you are using a wasm compatible model.\nReason: ${e}`);
        }
        try {
            this.#tokenizer = await transformers.AutoTokenizer.from_pretrained(this.#modelName);
        }
        catch (e) {
            throw new Error(`error loading tokenizer for ${this.#modelName}. Make sure you are using a wasm compatible model:\nReason: ${e}`);
        }
        this.#initialized = true;
    }
    /**
     * Embedding length: the `ndims` option when given, otherwise
     * `hidden_size` from the loaded model's config.
     *
     * @throws Error when neither is available, including when the model has
     *   not been loaded yet.
     */
    ndims() {
        if (this.#ndims) {
            return this.#ndims;
        }
        // Without a loaded model there is no config to read. Fail with an actionable
        // message instead of a TypeError on `undefined.config`.
        if (!this.#model) {
            throw new Error("cannot infer the embedding dimensions before the model is loaded. Please call init() first or specify `ndims` in the options.");
        }
        // biome-ignore lint/style/useNamingConvention: we don't control this name.
        const ndims = this.#model.config?.hidden_size;
        if (!ndims) {
            throw new Error("hidden_size not found in model config, you may need to manually specify the embedding dimensions. ");
        }
        return ndims;
    }
    /** Arrow data type used for the embedding values. */
    embeddingDataType() {
        return new arrow_1.Float32();
    }
    /**
     * Embed a batch of strings.
     *
     * Tokenizes the batch, runs the model, takes the first output, sums it
     * over dimension 1 and divides by the size of that dimension. The flat
     * result is then cut into one array of `ndims()` values per item.
     *
     * @param data Strings to embed.
     * @returns One array of numbers per input item.
     * @throws Error (as a rejection) when `init()` has not completed.
     */
    async computeSourceEmbeddings(data) {
        // this should only happen if the user is trying to use the function directly.
        // Anything going through the registry should already be initialized.
        if (!this.#initialized) {
            throw new Error("something went wrong: embedding function not initialized. Please call init()");
        }
        const tokenizer = this.#tokenizer;
        const model = this.#model;
        const inputs = await tokenizer(data, this.#tokenizerOptions);
        let tokens = await model.forward(inputs);
        // The model returns named outputs; only the first one is used.
        tokens = tokens[Object.keys(tokens)[0]];
        const [nItems, nTokens] = tokens.dims;
        tokens = tensorDiv(tokens.sum(1), nTokens);
        // TODO: support other data types
        const tokenData = tokens.data;
        const stride = this.ndims();
        const embeddings = [];
        for (let i = 0; i < nItems; i++) {
            const start = i * stride;
            embeddings.push(Array.from(tokenData.slice(start, start + stride))); // TODO: Avoid copy here
        }
        return embeddings;
    }
    /** Embed a single query string by delegating to computeSourceEmbeddings. */
    async computeQueryEmbeddings(data) {
        return (await this.computeSourceEmbeddings([data]))[0];
    }
};
|
|
138
|
+
exports.TransformersEmbeddingFunction = TransformersEmbeddingFunction;
|
|
139
|
+
exports.TransformersEmbeddingFunction = TransformersEmbeddingFunction = __decorate([
|
|
140
|
+
(0, registry_1.register)("huggingface"),
|
|
141
|
+
__metadata("design:paramtypes", [Object])
|
|
142
|
+
], TransformersEmbeddingFunction);
|
|
143
|
+
/**
 * Divide every element of `src.data` by `divBy`, in place.
 *
 * @param src Object exposing an indexable `data` collection with a `length`.
 * @param divBy Divisor applied to each element.
 * @returns The same `src` object that was passed in.
 */
const tensorDiv = (src, divBy) => {
    const values = src.data;
    const count = values.length;
    let idx = 0;
    while (idx < count) {
        values[idx] = values[idx] / divBy;
        idx += 1;
    }
    return src;
};
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { Connection } from "./connection";
|
|
2
|
+
import { ConnectionOptions } from "./native.js";
|
|
3
|
+
export { WriteOptions, WriteMode, AddColumnsSql, ColumnAlteration, ConnectionOptions, IndexStatistics, IndexConfig, ClientConfig, TimeoutConfig, RetryConfig, } from "./native.js";
|
|
4
|
+
export { makeArrowTable, MakeArrowTableOptions, Data, VectorColumnOptions, } from "./arrow";
|
|
5
|
+
export { Connection, CreateTableOptions, TableNamesOptions, } from "./connection";
|
|
6
|
+
export { ExecutableQuery, Query, QueryBase, VectorQuery, RecordBatchIterator, } from "./query";
|
|
7
|
+
export { Index, IndexOptions, IvfPqOptions } from "./indices";
|
|
8
|
+
export { Table, AddDataOptions, UpdateOptions, OptimizeOptions } from "./table";
|
|
9
|
+
export * as embedding from "./embedding";
|
|
10
|
+
/**
|
|
11
|
+
* Connect to a LanceDB instance at the given URI.
|
|
12
|
+
*
|
|
13
|
+
* Accepted formats:
|
|
14
|
+
*
|
|
15
|
+
* - `/path/to/database` - local database
|
|
16
|
+
* - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
|
|
17
|
+
* - `db://host:port` - remote database (LanceDB cloud)
|
|
18
|
+
* @param {string} uri - The uri of the database. If the database uri starts
|
|
19
|
+
* with `db://` then it connects to a remote database.
|
|
20
|
+
* @param opts - Optional connection options.
* @returns A promise that resolves to the Connection.
* @see {@link ConnectionOptions} for more details on the URI format.
|
|
21
|
+
* @example
|
|
22
|
+
* ```ts
|
|
23
|
+
* const conn = await connect("/path/to/database");
|
|
24
|
+
* ```
|
|
25
|
+
* @example
|
|
26
|
+
* ```ts
|
|
27
|
+
* const conn = await connect(
|
|
28
|
+
* "s3://bucket/path/to/database",
|
|
29
|
+
* {storageOptions: {timeout: "60s"}}
|
|
30
|
+
* );
|
|
31
|
+
* ```
|
|
32
|
+
*/
|
|
33
|
+
export declare function connect(uri: string, opts?: Partial<ConnectionOptions>): Promise<Connection>;
|
|
34
|
+
/**
|
|
35
|
+
* Connect to a LanceDB instance at the given URI.
|
|
36
|
+
*
|
|
37
|
+
* Accepted formats:
|
|
38
|
+
*
|
|
39
|
+
* - `/path/to/database` - local database
|
|
40
|
+
* - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
|
|
41
|
+
* - `db://host:port` - remote database (LanceDB cloud)
|
|
42
|
+
* @param opts - The options to use when connecting to the database; must include `uri`
* @returns A promise that resolves to the Connection.
|
|
43
|
+
* @see {@link ConnectionOptions} for more details on the URI format.
|
|
44
|
+
* @example
|
|
45
|
+
* ```ts
|
|
46
|
+
* const conn = await connect({
|
|
47
|
+
* uri: "/path/to/database",
|
|
48
|
+
* storageOptions: {timeout: "60s"}
|
|
49
|
+
* });
|
|
50
|
+
* ```
|
|
51
|
+
*/
|
|
52
|
+
export declare function connect(opts: Partial<ConnectionOptions> & {
|
|
53
|
+
uri: string;
|
|
54
|
+
}): Promise<Connection>;
|