@lancedb/lancedb 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/arrow.d.ts +2 -3
- package/dist/arrow.js +2 -27
- package/dist/connection.d.ts +9 -0
- package/dist/connection.js +16 -2
- package/dist/embedding/embedding_function.d.ts +11 -0
- package/dist/embedding/embedding_function.js +28 -7
- package/dist/embedding/index.d.ts +2 -1
- package/dist/embedding/index.js +7 -6
- package/dist/embedding/openai.d.ts +1 -1
- package/dist/embedding/registry.d.ts +9 -7
- package/dist/embedding/registry.js +24 -6
- package/dist/embedding/transformers.d.ts +37 -0
- package/dist/embedding/transformers.js +147 -0
- package/dist/index.d.ts +1 -1
- package/dist/query.js +15 -9
- package/dist/remote/client.d.ts +1 -1
- package/dist/remote/client.js +6 -8
- package/dist/remote/connection.d.ts +2 -3
- package/dist/remote/connection.js +2 -2
- package/dist/sanitize.js +4 -2
- package/dist/table.d.ts +3 -0
- package/dist/table.js +1 -1
- package/package.json +17 -13
- package/Cargo.toml +0 -28
- package/biome.json +0 -158
- package/build.rs +0 -5
- package/dist/native.d.ts +0 -208
- package/examples/ann_indexes.ts +0 -49
- package/examples/basic.ts +0 -149
- package/examples/embedding.ts +0 -83
- package/examples/filtering.ts +0 -34
- package/examples/jsconfig.json +0 -27
- package/examples/package-lock.json +0 -79
- package/examples/package.json +0 -18
- package/examples/search.ts +0 -37
- package/jest.config.js +0 -7
- package/lancedb/arrow.ts +0 -947
- package/lancedb/connection.ts +0 -333
- package/lancedb/embedding/embedding_function.ts +0 -194
- package/lancedb/embedding/index.ts +0 -113
- package/lancedb/embedding/openai.ts +0 -113
- package/lancedb/embedding/registry.ts +0 -188
- package/lancedb/index.ts +0 -142
- package/lancedb/indices.ts +0 -203
- package/lancedb/merge.ts +0 -70
- package/lancedb/query.ts +0 -507
- package/lancedb/remote/client.ts +0 -221
- package/lancedb/remote/connection.ts +0 -201
- package/lancedb/remote/index.ts +0 -3
- package/lancedb/remote/table.ts +0 -226
- package/lancedb/sanitize.ts +0 -588
- package/lancedb/table.ts +0 -669
- package/lancedb/util.ts +0 -69
- package/native.d.ts +0 -208
- package/nodejs-artifacts/arrow.d.ts +0 -250
- package/nodejs-artifacts/arrow.js +0 -768
- package/nodejs-artifacts/connection.d.ts +0 -171
- package/nodejs-artifacts/connection.js +0 -135
- package/nodejs-artifacts/embedding/embedding_function.d.ts +0 -79
- package/nodejs-artifacts/embedding/embedding_function.js +0 -112
- package/nodejs-artifacts/embedding/index.d.ts +0 -28
- package/nodejs-artifacts/embedding/index.js +0 -114
- package/nodejs-artifacts/embedding/openai.d.ts +0 -18
- package/nodejs-artifacts/embedding/openai.js +0 -105
- package/nodejs-artifacts/embedding/registry.d.ts +0 -53
- package/nodejs-artifacts/embedding/registry.js +0 -127
- package/nodejs-artifacts/index.d.ts +0 -55
- package/nodejs-artifacts/index.js +0 -57
- package/nodejs-artifacts/indices.d.ts +0 -165
- package/nodejs-artifacts/indices.js +0 -71
- package/nodejs-artifacts/merge.d.ts +0 -54
- package/nodejs-artifacts/merge.js +0 -64
- package/nodejs-artifacts/native.d.ts +0 -208
- package/nodejs-artifacts/native.js +0 -330
- package/nodejs-artifacts/query.d.ts +0 -283
- package/nodejs-artifacts/query.js +0 -448
- package/nodejs-artifacts/remote/client.d.ts +0 -28
- package/nodejs-artifacts/remote/client.js +0 -172
- package/nodejs-artifacts/remote/connection.d.ts +0 -25
- package/nodejs-artifacts/remote/connection.js +0 -110
- package/nodejs-artifacts/remote/index.d.ts +0 -3
- package/nodejs-artifacts/remote/index.js +0 -9
- package/nodejs-artifacts/remote/table.d.ts +0 -42
- package/nodejs-artifacts/remote/table.js +0 -179
- package/nodejs-artifacts/sanitize.d.ts +0 -31
- package/nodejs-artifacts/sanitize.js +0 -436
- package/nodejs-artifacts/table.d.ts +0 -395
- package/nodejs-artifacts/table.js +0 -230
- package/nodejs-artifacts/util.d.ts +0 -14
- package/nodejs-artifacts/util.js +0 -65
- package/tsconfig.json +0 -25
- package/typedoc.json +0 -10
package/dist/arrow.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/// <reference types="node" />
|
|
2
|
-
import { Table as ArrowTable, Binary, BufferType,
|
|
2
|
+
import { Table as ArrowTable, Binary, BufferType, Field, FixedSizeBinary, FixedSizeList, Float, Int, LargeBinary, List, Null, RecordBatch, Schema, Struct, Utf8 } from "apache-arrow";
|
|
3
3
|
import { Buffers } from "apache-arrow/data";
|
|
4
4
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
|
5
5
|
import { EmbeddingFunctionConfig } from "./embedding/registry";
|
|
@@ -38,7 +38,6 @@ export type TableLike = ArrowTable | {
|
|
|
38
38
|
};
|
|
39
39
|
export type IntoVector = Float32Array | Float64Array | number[] | Promise<Float32Array | Float64Array | number[]>;
|
|
40
40
|
export declare function isArrowTable(value: object): value is TableLike;
|
|
41
|
-
export declare function isDataType(value: unknown): value is DataType;
|
|
42
41
|
export declare function isNull(value: unknown): value is Null;
|
|
43
42
|
export declare function isInt(value: unknown): value is Int;
|
|
44
43
|
export declare function isFloat(value: unknown): value is Float;
|
|
@@ -200,7 +199,7 @@ export declare function makeEmptyTable(schema: SchemaLike, metadata?: Map<string
|
|
|
200
199
|
*/
|
|
201
200
|
export declare function convertToTable(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, makeTableOptions?: Partial<MakeArrowTableOptions>): Promise<ArrowTable>;
|
|
202
201
|
/** Creates the Arrow Type for a Vector column with dimension `dim` */
|
|
203
|
-
export declare function newVectorType<T extends Float>(dim: number, innerType:
|
|
202
|
+
export declare function newVectorType<T extends Float>(dim: number, innerType: unknown): FixedSizeList<T>;
|
|
204
203
|
/**
|
|
205
204
|
* Serialize an Array of records into a buffer using the Arrow IPC File serialization
|
|
206
205
|
*
|
package/dist/arrow.js
CHANGED
|
@@ -27,7 +27,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
27
27
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
28
28
|
};
|
|
29
29
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
30
|
-
exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.newVectorType = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = exports.isFixedSizeList = exports.isFixedSizeBinary = exports.isUnion = exports.isStruct = exports.isList = exports.isDuration = exports.isInterval = exports.isTimestamp = exports.isTime = exports.isDate = exports.isDecimal = exports.isBool = exports.isLargeUtf8 = exports.isUtf8 = exports.isLargeBinary = exports.isBinary = exports.isFloat = exports.isInt = exports.isNull = exports.
|
|
30
|
+
exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.newVectorType = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = exports.isFixedSizeList = exports.isFixedSizeBinary = exports.isUnion = exports.isStruct = exports.isList = exports.isDuration = exports.isInterval = exports.isTimestamp = exports.isTime = exports.isDate = exports.isDecimal = exports.isBool = exports.isLargeUtf8 = exports.isUtf8 = exports.isLargeBinary = exports.isBinary = exports.isFloat = exports.isInt = exports.isNull = exports.isArrowTable = void 0;
|
|
31
31
|
const apache_arrow_1 = require("apache-arrow");
|
|
32
32
|
const registry_1 = require("./embedding/registry");
|
|
33
33
|
const sanitize_1 = require("./sanitize");
|
|
@@ -38,31 +38,6 @@ function isArrowTable(value) {
|
|
|
38
38
|
return "schema" in value && "batches" in value;
|
|
39
39
|
}
|
|
40
40
|
exports.isArrowTable = isArrowTable;
|
|
41
|
-
function isDataType(value) {
|
|
42
|
-
return (value instanceof apache_arrow_1.DataType ||
|
|
43
|
-
apache_arrow_1.DataType.isNull(value) ||
|
|
44
|
-
apache_arrow_1.DataType.isInt(value) ||
|
|
45
|
-
apache_arrow_1.DataType.isFloat(value) ||
|
|
46
|
-
apache_arrow_1.DataType.isBinary(value) ||
|
|
47
|
-
apache_arrow_1.DataType.isLargeBinary(value) ||
|
|
48
|
-
apache_arrow_1.DataType.isUtf8(value) ||
|
|
49
|
-
apache_arrow_1.DataType.isLargeUtf8(value) ||
|
|
50
|
-
apache_arrow_1.DataType.isBool(value) ||
|
|
51
|
-
apache_arrow_1.DataType.isDecimal(value) ||
|
|
52
|
-
apache_arrow_1.DataType.isDate(value) ||
|
|
53
|
-
apache_arrow_1.DataType.isTime(value) ||
|
|
54
|
-
apache_arrow_1.DataType.isTimestamp(value) ||
|
|
55
|
-
apache_arrow_1.DataType.isInterval(value) ||
|
|
56
|
-
apache_arrow_1.DataType.isDuration(value) ||
|
|
57
|
-
apache_arrow_1.DataType.isList(value) ||
|
|
58
|
-
apache_arrow_1.DataType.isStruct(value) ||
|
|
59
|
-
apache_arrow_1.DataType.isUnion(value) ||
|
|
60
|
-
apache_arrow_1.DataType.isFixedSizeBinary(value) ||
|
|
61
|
-
apache_arrow_1.DataType.isFixedSizeList(value) ||
|
|
62
|
-
apache_arrow_1.DataType.isMap(value) ||
|
|
63
|
-
apache_arrow_1.DataType.isDictionary(value));
|
|
64
|
-
}
|
|
65
|
-
exports.isDataType = isDataType;
|
|
66
41
|
function isNull(value) {
|
|
67
42
|
return value instanceof apache_arrow_1.Null || apache_arrow_1.DataType.isNull(value);
|
|
68
43
|
}
|
|
@@ -466,7 +441,7 @@ function makeVector(values, type, stringAsDictionary) {
|
|
|
466
441
|
/** Helper function to apply embeddings from metadata to an input table */
|
|
467
442
|
async function applyEmbeddingsFromMetadata(table, schema) {
|
|
468
443
|
const registry = (0, registry_1.getRegistry)();
|
|
469
|
-
const functions = registry.parseFunctions(schema.metadata);
|
|
444
|
+
const functions = await registry.parseFunctions(schema.metadata);
|
|
470
445
|
const columns = Object.fromEntries(table.schema.fields.map((field) => [
|
|
471
446
|
field.name,
|
|
472
447
|
table.getChild(field.name),
|
package/dist/connection.d.ts
CHANGED
|
@@ -27,10 +27,19 @@ export interface CreateTableOptions {
|
|
|
27
27
|
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
|
|
28
28
|
*/
|
|
29
29
|
storageOptions?: Record<string, string>;
|
|
30
|
+
/**
|
|
31
|
+
* The version of the data storage format to use.
|
|
32
|
+
*
|
|
33
|
+
* The default is `legacy`, which is Lance format v1.
|
|
34
|
+
* `stable` is the new format, which is Lance format v2.
|
|
35
|
+
*/
|
|
36
|
+
dataStorageVersion?: string;
|
|
30
37
|
/**
|
|
31
38
|
* If true then data files will be written with the legacy format
|
|
32
39
|
*
|
|
33
40
|
* The default is true while the new format is in beta
|
|
41
|
+
*
|
|
42
|
+
* Deprecated.
|
|
34
43
|
*/
|
|
35
44
|
useLegacyFormat?: boolean;
|
|
36
45
|
schema?: SchemaLike;
|
package/dist/connection.js
CHANGED
|
@@ -72,7 +72,14 @@ class LocalConnection extends Connection {
|
|
|
72
72
|
throw new Error("data is required");
|
|
73
73
|
}
|
|
74
74
|
const { buf, mode } = await table_1.Table.parseTableData(data, options);
|
|
75
|
-
|
|
75
|
+
let dataStorageVersion = "legacy";
|
|
76
|
+
if (options?.dataStorageVersion !== undefined) {
|
|
77
|
+
dataStorageVersion = options.dataStorageVersion;
|
|
78
|
+
}
|
|
79
|
+
else if (options?.useLegacyFormat !== undefined) {
|
|
80
|
+
dataStorageVersion = options.useLegacyFormat ? "legacy" : "stable";
|
|
81
|
+
}
|
|
82
|
+
const innerTable = await this.inner.createTable(nameOrOptions, buf, mode, cleanseStorageOptions(options?.storageOptions), dataStorageVersion);
|
|
76
83
|
return new table_1.LocalTable(innerTable);
|
|
77
84
|
}
|
|
78
85
|
async createEmptyTable(name, schema, options) {
|
|
@@ -87,9 +94,16 @@ class LocalConnection extends Connection {
|
|
|
87
94
|
const registry = (0, registry_1.getRegistry)();
|
|
88
95
|
metadata = registry.getTableMetadata([embeddingFunction]);
|
|
89
96
|
}
|
|
97
|
+
let dataStorageVersion = "legacy";
|
|
98
|
+
if (options?.dataStorageVersion !== undefined) {
|
|
99
|
+
dataStorageVersion = options.dataStorageVersion;
|
|
100
|
+
}
|
|
101
|
+
else if (options?.useLegacyFormat !== undefined) {
|
|
102
|
+
dataStorageVersion = options.useLegacyFormat ? "legacy" : "stable";
|
|
103
|
+
}
|
|
90
104
|
const table = (0, arrow_1.makeEmptyTable)(schema, metadata);
|
|
91
105
|
const buf = await (0, arrow_1.fromTableToBuffer)(table);
|
|
92
|
-
const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions),
|
|
106
|
+
const innerTable = await this.inner.createEmptyTable(name, buf, mode, cleanseStorageOptions(options?.storageOptions), dataStorageVersion);
|
|
93
107
|
return new table_1.LocalTable(innerTable);
|
|
94
108
|
}
|
|
95
109
|
async dropTable(name) {
|
|
@@ -44,6 +44,7 @@ export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptio
|
|
|
44
44
|
* ```
|
|
45
45
|
*/
|
|
46
46
|
abstract toJSON(): Partial<M>;
|
|
47
|
+
init?(): Promise<void>;
|
|
47
48
|
/**
|
|
48
49
|
* sourceField is used in combination with `LanceSchema` to provide a declarative data model
|
|
49
50
|
*
|
|
@@ -73,6 +74,16 @@ export declare abstract class EmbeddingFunction<T = any, M extends FunctionOptio
|
|
|
73
74
|
*/
|
|
74
75
|
computeQueryEmbeddings(data: T): Promise<Awaited<IntoVector>>;
|
|
75
76
|
}
|
|
77
|
+
/**
|
|
78
|
+
* an abstract class for implementing embedding functions that take text as input
|
|
79
|
+
*/
|
|
80
|
+
export declare abstract class TextEmbeddingFunction<M extends FunctionOptions = FunctionOptions> extends EmbeddingFunction<string, M> {
|
|
81
|
+
abstract generateEmbeddings(texts: string[], ...args: any[]): Promise<number[][] | Float32Array[] | Float64Array[]>;
|
|
82
|
+
computeQueryEmbeddings(data: string): Promise<Awaited<IntoVector>>;
|
|
83
|
+
embeddingDataType(): Float;
|
|
84
|
+
sourceField(): [DataType, Map<string, EmbeddingFunction>];
|
|
85
|
+
computeSourceEmbeddings(data: string[]): Promise<number[][] | Float32Array[] | Float64Array[]>;
|
|
86
|
+
}
|
|
76
87
|
export interface FieldOptions<T extends DataType = DataType> {
|
|
77
88
|
datatype: T;
|
|
78
89
|
dims?: number;
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
// See the License for the specific language governing permissions and
|
|
14
14
|
// limitations under the License.
|
|
15
15
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
|
-
exports.EmbeddingFunction = void 0;
|
|
16
|
+
exports.TextEmbeddingFunction = exports.EmbeddingFunction = void 0;
|
|
17
17
|
require("reflect-metadata");
|
|
18
18
|
const arrow_1 = require("../arrow");
|
|
19
19
|
const sanitize_1 = require("../sanitize");
|
|
@@ -35,9 +35,9 @@ class EmbeddingFunction {
|
|
|
35
35
|
* @see {@link lancedb.LanceSchema}
|
|
36
36
|
*/
|
|
37
37
|
sourceField(optionsOrDatatype) {
|
|
38
|
-
let datatype =
|
|
39
|
-
? optionsOrDatatype
|
|
40
|
-
: optionsOrDatatype
|
|
38
|
+
let datatype = "datatype" in optionsOrDatatype
|
|
39
|
+
? optionsOrDatatype.datatype
|
|
40
|
+
: optionsOrDatatype;
|
|
41
41
|
if (!datatype) {
|
|
42
42
|
throw new Error("Datatype is required");
|
|
43
43
|
}
|
|
@@ -58,8 +58,11 @@ class EmbeddingFunction {
|
|
|
58
58
|
let vectorType;
|
|
59
59
|
let dims = this.ndims();
|
|
60
60
|
// `func.vectorField(new Float32())`
|
|
61
|
-
if (
|
|
62
|
-
dtype =
|
|
61
|
+
if (optionsOrDatatype === undefined) {
|
|
62
|
+
dtype = new arrow_1.Float32();
|
|
63
|
+
}
|
|
64
|
+
else if (!("datatype" in optionsOrDatatype)) {
|
|
65
|
+
dtype = (0, sanitize_1.sanitizeType)(optionsOrDatatype);
|
|
63
66
|
}
|
|
64
67
|
else {
|
|
65
68
|
// `func.vectorField({
|
|
@@ -67,7 +70,7 @@ class EmbeddingFunction {
|
|
|
67
70
|
// dims: 10
|
|
68
71
|
// })`
|
|
69
72
|
dims = dims ?? optionsOrDatatype?.dims;
|
|
70
|
-
dtype = optionsOrDatatype?.datatype;
|
|
73
|
+
dtype = (0, sanitize_1.sanitizeType)(optionsOrDatatype?.datatype);
|
|
71
74
|
}
|
|
72
75
|
if (dtype !== undefined) {
|
|
73
76
|
// `func.vectorField(new FixedSizeList(dims, new Field("item", new Float32(), true)))`
|
|
@@ -110,3 +113,21 @@ class EmbeddingFunction {
|
|
|
110
113
|
}
|
|
111
114
|
}
|
|
112
115
|
exports.EmbeddingFunction = EmbeddingFunction;
|
|
116
|
+
/**
|
|
117
|
+
* an abstract class for implementing embedding functions that take text as input
|
|
118
|
+
*/
|
|
119
|
+
class TextEmbeddingFunction extends EmbeddingFunction {
|
|
120
|
+
async computeQueryEmbeddings(data) {
|
|
121
|
+
return this.generateEmbeddings([data]).then((data) => data[0]);
|
|
122
|
+
}
|
|
123
|
+
embeddingDataType() {
|
|
124
|
+
return new arrow_1.Float32();
|
|
125
|
+
}
|
|
126
|
+
sourceField() {
|
|
127
|
+
return super.sourceField(new arrow_1.Utf8());
|
|
128
|
+
}
|
|
129
|
+
computeSourceEmbeddings(data) {
|
|
130
|
+
return this.generateEmbeddings(data);
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
exports.TextEmbeddingFunction = TextEmbeddingFunction;
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { Schema } from "../arrow";
|
|
2
2
|
import { EmbeddingFunction } from "./embedding_function";
|
|
3
|
-
export { EmbeddingFunction } from "./embedding_function";
|
|
3
|
+
export { EmbeddingFunction, TextEmbeddingFunction } from "./embedding_function";
|
|
4
4
|
export * from "./openai";
|
|
5
|
+
export * from "./transformers";
|
|
5
6
|
export * from "./registry";
|
|
6
7
|
/**
|
|
7
8
|
* Create a schema with embedding functions.
|
package/dist/embedding/index.js
CHANGED
|
@@ -27,15 +27,16 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
27
27
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
28
28
|
};
|
|
29
29
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
30
|
-
exports.LanceSchema = exports.EmbeddingFunction = void 0;
|
|
30
|
+
exports.LanceSchema = exports.TextEmbeddingFunction = exports.EmbeddingFunction = void 0;
|
|
31
31
|
const arrow_1 = require("../arrow");
|
|
32
|
-
const arrow_2 = require("../arrow");
|
|
33
32
|
const sanitize_1 = require("../sanitize");
|
|
34
33
|
const registry_1 = require("./registry");
|
|
35
34
|
var embedding_function_1 = require("./embedding_function");
|
|
36
35
|
Object.defineProperty(exports, "EmbeddingFunction", { enumerable: true, get: function () { return embedding_function_1.EmbeddingFunction; } });
|
|
36
|
+
Object.defineProperty(exports, "TextEmbeddingFunction", { enumerable: true, get: function () { return embedding_function_1.TextEmbeddingFunction; } });
|
|
37
37
|
// We need to explicitly export '*' so that the `register` decorator actually registers the class.
|
|
38
38
|
__exportStar(require("./openai"), exports);
|
|
39
|
+
__exportStar(require("./transformers"), exports);
|
|
39
40
|
__exportStar(require("./registry"), exports);
|
|
40
41
|
/**
|
|
41
42
|
* Create a schema with embedding functions.
|
|
@@ -63,14 +64,14 @@ function LanceSchema(fields) {
|
|
|
63
64
|
const arrowFields = [];
|
|
64
65
|
const embeddingFunctions = new Map();
|
|
65
66
|
Object.entries(fields).forEach(([key, value]) => {
|
|
66
|
-
if (
|
|
67
|
-
arrowFields.push(new arrow_1.Field(key, (0, sanitize_1.sanitizeType)(value), true));
|
|
68
|
-
}
|
|
69
|
-
else {
|
|
67
|
+
if (Array.isArray(value)) {
|
|
70
68
|
const [dtype, metadata] = value;
|
|
71
69
|
arrowFields.push(new arrow_1.Field(key, (0, sanitize_1.sanitizeType)(dtype), true));
|
|
72
70
|
parseEmbeddingFunctions(embeddingFunctions, key, metadata);
|
|
73
71
|
}
|
|
72
|
+
else {
|
|
73
|
+
arrowFields.push(new arrow_1.Field(key, (0, sanitize_1.sanitizeType)(value), true));
|
|
74
|
+
}
|
|
74
75
|
});
|
|
75
76
|
const registry = (0, registry_1.getRegistry)();
|
|
76
77
|
const metadata = registry.getTableMetadata(Array.from(embeddingFunctions.values()));
|
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
import { type EmbeddingFunction, type EmbeddingFunctionConstructor } from "./embedding_function";
|
|
2
2
|
import "reflect-metadata";
|
|
3
3
|
import { OpenAIEmbeddingFunction } from "./openai";
|
|
4
|
+
import { TransformersEmbeddingFunction } from "./transformers";
|
|
5
|
+
type CreateReturnType<T> = T extends {
|
|
6
|
+
init: () => Promise<void>;
|
|
7
|
+
} ? Promise<T> : T;
|
|
4
8
|
interface EmbeddingFunctionCreate<T extends EmbeddingFunction> {
|
|
5
|
-
create(options?: T["TOptions"]): T
|
|
9
|
+
create(options?: T["TOptions"]): CreateReturnType<T>;
|
|
6
10
|
}
|
|
7
11
|
/**
|
|
8
12
|
* This is a singleton class used to register embedding functions
|
|
@@ -19,11 +23,9 @@ export declare class EmbeddingFunctionRegistry {
|
|
|
19
23
|
* @throws Error if the function is already registered
|
|
20
24
|
*/
|
|
21
25
|
register<T extends EmbeddingFunctionConstructor = EmbeddingFunctionConstructor>(this: EmbeddingFunctionRegistry, alias?: string): (ctor: T) => any;
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
*/
|
|
26
|
-
get<T extends EmbeddingFunction<unknown>, Name extends string = "">(name: Name extends "openai" ? "openai" : string): Name extends "openai" ? EmbeddingFunctionCreate<OpenAIEmbeddingFunction> : EmbeddingFunctionCreate<T> | undefined;
|
|
26
|
+
get(name: "openai"): EmbeddingFunctionCreate<OpenAIEmbeddingFunction>;
|
|
27
|
+
get(name: "huggingface"): EmbeddingFunctionCreate<TransformersEmbeddingFunction>;
|
|
28
|
+
get<T extends EmbeddingFunction<unknown>>(name: string): EmbeddingFunctionCreate<T> | undefined;
|
|
27
29
|
/**
|
|
28
30
|
* reset the registry to the initial state
|
|
29
31
|
*/
|
|
@@ -31,7 +33,7 @@ export declare class EmbeddingFunctionRegistry {
|
|
|
31
33
|
/**
|
|
32
34
|
* @ignore
|
|
33
35
|
*/
|
|
34
|
-
parseFunctions(this: EmbeddingFunctionRegistry, metadata: Map<string, string>): Map<string, EmbeddingFunctionConfig
|
|
36
|
+
parseFunctions(this: EmbeddingFunctionRegistry, metadata: Map<string, string>): Promise<Map<string, EmbeddingFunctionConfig>>;
|
|
35
37
|
functionToMetadata(conf: EmbeddingFunctionConfig): Record<string, any>;
|
|
36
38
|
getTableMetadata(functions: EmbeddingFunctionConfig[]): Map<string, string>;
|
|
37
39
|
}
|
|
@@ -50,12 +50,28 @@ class EmbeddingFunctionRegistry {
|
|
|
50
50
|
get(name) {
|
|
51
51
|
const factory = this.#functions.get(name);
|
|
52
52
|
if (!factory) {
|
|
53
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
53
54
|
return undefined;
|
|
54
55
|
}
|
|
56
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
57
|
+
let create;
|
|
58
|
+
if (factory.prototype.init) {
|
|
59
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
60
|
+
create = async function (options) {
|
|
61
|
+
const instance = new factory(options);
|
|
62
|
+
await instance.init();
|
|
63
|
+
return instance;
|
|
64
|
+
};
|
|
65
|
+
}
|
|
66
|
+
else {
|
|
67
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
68
|
+
create = function (options) {
|
|
69
|
+
const instance = new factory(options);
|
|
70
|
+
return instance;
|
|
71
|
+
};
|
|
72
|
+
}
|
|
55
73
|
return {
|
|
56
|
-
create
|
|
57
|
-
return new factory(options);
|
|
58
|
-
},
|
|
74
|
+
create,
|
|
59
75
|
};
|
|
60
76
|
}
|
|
61
77
|
/**
|
|
@@ -67,26 +83,28 @@ class EmbeddingFunctionRegistry {
|
|
|
67
83
|
/**
|
|
68
84
|
* @ignore
|
|
69
85
|
*/
|
|
70
|
-
parseFunctions(metadata) {
|
|
86
|
+
async parseFunctions(metadata) {
|
|
71
87
|
if (!metadata.has("embedding_functions")) {
|
|
72
88
|
return new Map();
|
|
73
89
|
}
|
|
74
90
|
else {
|
|
75
91
|
const functions = (JSON.parse(metadata.get("embedding_functions")));
|
|
76
|
-
|
|
92
|
+
const items = await Promise.all(functions.map(async (f) => {
|
|
77
93
|
const fn = this.get(f.name);
|
|
78
94
|
if (!fn) {
|
|
79
95
|
throw new Error(`Function "${f.name}" not found in registry`);
|
|
80
96
|
}
|
|
97
|
+
const func = await this.get(f.name).create(f.model);
|
|
81
98
|
return [
|
|
82
99
|
f.name,
|
|
83
100
|
{
|
|
84
101
|
sourceColumn: f.sourceColumn,
|
|
85
102
|
vectorColumn: f.vectorColumn,
|
|
86
|
-
function:
|
|
103
|
+
function: func,
|
|
87
104
|
},
|
|
88
105
|
];
|
|
89
106
|
}));
|
|
107
|
+
return new Map(items);
|
|
90
108
|
}
|
|
91
109
|
}
|
|
92
110
|
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { Float } from "../arrow";
|
|
2
|
+
import { EmbeddingFunction } from "./embedding_function";
|
|
3
|
+
export type XenovaTransformerOptions = {
|
|
4
|
+
/** The wasm compatible model to use */
|
|
5
|
+
model: string;
|
|
6
|
+
/**
|
|
7
|
+
* The wasm compatible tokenizer to use
|
|
8
|
+
* If not provided, it will use the default tokenizer for the model
|
|
9
|
+
*/
|
|
10
|
+
tokenizer?: string;
|
|
11
|
+
/**
|
|
12
|
+
* The number of dimensions of the embeddings
|
|
13
|
+
*
|
|
14
|
+
* We will attempt to infer this from the model config if not provided.
|
|
15
|
+
* Since there isn't a standard way to get this information from the model,
|
|
16
|
+
* you may need to manually specify this if using a model that doesn't have a 'hidden_size' in the config.
|
|
17
|
+
* */
|
|
18
|
+
ndims?: number;
|
|
19
|
+
/** Options for the tokenizer */
|
|
20
|
+
tokenizerOptions?: {
|
|
21
|
+
textPair?: string | string[];
|
|
22
|
+
padding?: boolean | "max_length";
|
|
23
|
+
addSpecialTokens?: boolean;
|
|
24
|
+
truncation?: boolean;
|
|
25
|
+
maxLength?: number;
|
|
26
|
+
};
|
|
27
|
+
};
|
|
28
|
+
export declare class TransformersEmbeddingFunction extends EmbeddingFunction<string, Partial<XenovaTransformerOptions>> {
|
|
29
|
+
#private;
|
|
30
|
+
constructor(options?: Partial<XenovaTransformerOptions>);
|
|
31
|
+
toJSON(): Record<string, any>;
|
|
32
|
+
init(): Promise<void>;
|
|
33
|
+
ndims(): number;
|
|
34
|
+
embeddingDataType(): Float;
|
|
35
|
+
computeSourceEmbeddings(data: string[]): Promise<number[][]>;
|
|
36
|
+
computeQueryEmbeddings(data: string): Promise<number[]>;
|
|
37
|
+
}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Copyright 2023 Lance Developers.
|
|
3
|
+
//
|
|
4
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
// you may not use this file except in compliance with the License.
|
|
6
|
+
// You may obtain a copy of the License at
|
|
7
|
+
//
|
|
8
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
//
|
|
10
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
// See the License for the specific language governing permissions and
|
|
14
|
+
// limitations under the License.
|
|
15
|
+
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
|
|
16
|
+
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
|
|
17
|
+
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
|
|
18
|
+
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
|
|
19
|
+
return c > 3 && r && Object.defineProperty(target, key, r), r;
|
|
20
|
+
};
|
|
21
|
+
var __metadata = (this && this.__metadata) || function (k, v) {
|
|
22
|
+
if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v);
|
|
23
|
+
};
|
|
24
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
25
|
+
exports.TransformersEmbeddingFunction = void 0;
|
|
26
|
+
const arrow_1 = require("../arrow");
|
|
27
|
+
const embedding_function_1 = require("./embedding_function");
|
|
28
|
+
const registry_1 = require("./registry");
|
|
29
|
+
let TransformersEmbeddingFunction = class TransformersEmbeddingFunction extends embedding_function_1.EmbeddingFunction {
|
|
30
|
+
#model;
|
|
31
|
+
#tokenizer;
|
|
32
|
+
#modelName;
|
|
33
|
+
#initialized = false;
|
|
34
|
+
#tokenizerOptions;
|
|
35
|
+
#ndims;
|
|
36
|
+
constructor(options = {
|
|
37
|
+
model: "Xenova/all-MiniLM-L6-v2",
|
|
38
|
+
}) {
|
|
39
|
+
super();
|
|
40
|
+
const modelName = options?.model ?? "Xenova/all-MiniLM-L6-v2";
|
|
41
|
+
this.#tokenizerOptions = {
|
|
42
|
+
padding: true,
|
|
43
|
+
...options.tokenizerOptions,
|
|
44
|
+
};
|
|
45
|
+
this.#ndims = options.ndims;
|
|
46
|
+
this.#modelName = modelName;
|
|
47
|
+
}
|
|
48
|
+
toJSON() {
|
|
49
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
50
|
+
const obj = {
|
|
51
|
+
model: this.#modelName,
|
|
52
|
+
};
|
|
53
|
+
if (this.#ndims) {
|
|
54
|
+
obj["ndims"] = this.#ndims;
|
|
55
|
+
}
|
|
56
|
+
if (this.#tokenizerOptions) {
|
|
57
|
+
obj["tokenizerOptions"] = this.#tokenizerOptions;
|
|
58
|
+
}
|
|
59
|
+
if (this.#tokenizer) {
|
|
60
|
+
obj["tokenizer"] = this.#tokenizer.name;
|
|
61
|
+
}
|
|
62
|
+
return obj;
|
|
63
|
+
}
|
|
64
|
+
async init() {
|
|
65
|
+
let transformers;
|
|
66
|
+
try {
|
|
67
|
+
// SAFETY:
|
|
68
|
+
// since typescript transpiles `import` to `require`, we need to do this in an unsafe way
|
|
69
|
+
// We can't use `require` because `@xenova/transformers` is an ESM module
|
|
70
|
+
// and we can't use `import` directly because typescript will transpile it to `require`.
|
|
71
|
+
// and we want to remain compatible with both ESM and CJS modules
|
|
72
|
+
// so we use `eval` to bypass typescript for this specific import.
|
|
73
|
+
transformers = await eval('import("@xenova/transformers")');
|
|
74
|
+
}
|
|
75
|
+
catch (e) {
|
|
76
|
+
throw new Error(`error loading @xenova/transformers\nReason: ${e}`);
|
|
77
|
+
}
|
|
78
|
+
try {
|
|
79
|
+
this.#model = await transformers.AutoModel.from_pretrained(this.#modelName);
|
|
80
|
+
}
|
|
81
|
+
catch (e) {
|
|
82
|
+
throw new Error(`error loading model ${this.#modelName}. Make sure you are using a wasm compatible model.\nReason: ${e}`);
|
|
83
|
+
}
|
|
84
|
+
try {
|
|
85
|
+
this.#tokenizer = await transformers.AutoTokenizer.from_pretrained(this.#modelName);
|
|
86
|
+
}
|
|
87
|
+
catch (e) {
|
|
88
|
+
throw new Error(`error loading tokenizer for ${this.#modelName}. Make sure you are using a wasm compatible model:\nReason: ${e}`);
|
|
89
|
+
}
|
|
90
|
+
this.#initialized = true;
|
|
91
|
+
}
|
|
92
|
+
ndims() {
|
|
93
|
+
if (this.#ndims) {
|
|
94
|
+
return this.#ndims;
|
|
95
|
+
}
|
|
96
|
+
else {
|
|
97
|
+
const config = this.#model.config;
|
|
98
|
+
const ndims = config["hidden_size"];
|
|
99
|
+
if (!ndims) {
|
|
100
|
+
throw new Error("hidden_size not found in model config, you may need to manually specify the embedding dimensions. ");
|
|
101
|
+
}
|
|
102
|
+
return ndims;
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
embeddingDataType() {
|
|
106
|
+
return new arrow_1.Float32();
|
|
107
|
+
}
|
|
108
|
+
async computeSourceEmbeddings(data) {
|
|
109
|
+
// this should only happen if the user is trying to use the function directly.
|
|
110
|
+
// Anything going through the registry should already be initialized.
|
|
111
|
+
if (!this.#initialized) {
|
|
112
|
+
return Promise.reject(new Error("something went wrong: embedding function not initialized. Please call init()"));
|
|
113
|
+
}
|
|
114
|
+
const tokenizer = this.#tokenizer;
|
|
115
|
+
const model = this.#model;
|
|
116
|
+
const inputs = await tokenizer(data, this.#tokenizerOptions);
|
|
117
|
+
let tokens = await model.forward(inputs);
|
|
118
|
+
tokens = tokens[Object.keys(tokens)[0]];
|
|
119
|
+
const [nItems, nTokens] = tokens.dims;
|
|
120
|
+
tokens = tensorDiv(tokens.sum(1), nTokens);
|
|
121
|
+
// TODO: support other data types
|
|
122
|
+
const tokenData = tokens.data;
|
|
123
|
+
const stride = this.ndims();
|
|
124
|
+
const embeddings = [];
|
|
125
|
+
for (let i = 0; i < nItems; i++) {
|
|
126
|
+
const start = i * stride;
|
|
127
|
+
const end = start + stride;
|
|
128
|
+
const slice = tokenData.slice(start, end);
|
|
129
|
+
embeddings.push(Array.from(slice)); // TODO: Avoid copy here
|
|
130
|
+
}
|
|
131
|
+
return embeddings;
|
|
132
|
+
}
|
|
133
|
+
async computeQueryEmbeddings(data) {
|
|
134
|
+
return (await this.computeSourceEmbeddings([data]))[0];
|
|
135
|
+
}
|
|
136
|
+
};
|
|
137
|
+
exports.TransformersEmbeddingFunction = TransformersEmbeddingFunction;
|
|
138
|
+
exports.TransformersEmbeddingFunction = TransformersEmbeddingFunction = __decorate([
|
|
139
|
+
(0, registry_1.register)("huggingface"),
|
|
140
|
+
__metadata("design:paramtypes", [Object])
|
|
141
|
+
], TransformersEmbeddingFunction);
|
|
142
|
+
const tensorDiv = (src, divBy) => {
|
|
143
|
+
for (let i = 0; i < src.data.length; ++i) {
|
|
144
|
+
src.data[i] /= divBy;
|
|
145
|
+
}
|
|
146
|
+
return src;
|
|
147
|
+
};
|
package/dist/index.d.ts
CHANGED
|
@@ -6,7 +6,7 @@ export { makeArrowTable, MakeArrowTableOptions, Data, VectorColumnOptions, } fro
|
|
|
6
6
|
export { Connection, CreateTableOptions, TableNamesOptions, } from "./connection";
|
|
7
7
|
export { ExecutableQuery, Query, QueryBase, VectorQuery, RecordBatchIterator, } from "./query";
|
|
8
8
|
export { Index, IndexOptions, IvfPqOptions } from "./indices";
|
|
9
|
-
export { Table, AddDataOptions, UpdateOptions } from "./table";
|
|
9
|
+
export { Table, AddDataOptions, UpdateOptions, OptimizeOptions } from "./table";
|
|
10
10
|
export * as embedding from "./embedding";
|
|
11
11
|
/**
|
|
12
12
|
* Connect to a LanceDB instance at the given URI.
|
package/dist/query.js
CHANGED
|
@@ -129,22 +129,28 @@ class QueryBase {
|
|
|
129
129
|
* object insertion order is easy to get wrong and `Map` is more foolproof.
|
|
130
130
|
*/
|
|
131
131
|
select(columns) {
|
|
132
|
-
|
|
132
|
+
const selectColumns = (columnArray) => {
|
|
133
|
+
this.doCall((inner) => {
|
|
134
|
+
inner.selectColumns(columnArray);
|
|
135
|
+
});
|
|
136
|
+
};
|
|
137
|
+
const selectMapping = (columnTuples) => {
|
|
138
|
+
this.doCall((inner) => {
|
|
139
|
+
inner.select(columnTuples);
|
|
140
|
+
});
|
|
141
|
+
};
|
|
133
142
|
if (typeof columns === "string") {
|
|
134
|
-
|
|
143
|
+
selectColumns([columns]);
|
|
135
144
|
}
|
|
136
|
-
if (Array.isArray(columns)) {
|
|
137
|
-
|
|
145
|
+
else if (Array.isArray(columns)) {
|
|
146
|
+
selectColumns(columns);
|
|
138
147
|
}
|
|
139
148
|
else if (columns instanceof Map) {
|
|
140
|
-
|
|
149
|
+
selectMapping(Array.from(columns.entries()));
|
|
141
150
|
}
|
|
142
151
|
else {
|
|
143
|
-
|
|
152
|
+
selectMapping(Object.entries(columns));
|
|
144
153
|
}
|
|
145
|
-
this.doCall((inner) => {
|
|
146
|
-
inner.select(columnTuples);
|
|
147
|
-
});
|
|
148
154
|
return this;
|
|
149
155
|
}
|
|
150
156
|
/**
|
package/dist/remote/client.d.ts
CHANGED
|
@@ -4,7 +4,7 @@ import { Table as ArrowTable } from "../arrow";
|
|
|
4
4
|
import { VectorQuery } from "../query";
|
|
5
5
|
export declare class RestfulLanceDBClient {
|
|
6
6
|
#private;
|
|
7
|
-
constructor(dbName: string, apiKey: string, region: string, hostOverride?: string,
|
|
7
|
+
constructor(dbName: string, apiKey: string, region: string, hostOverride?: string, timeout?: number);
|
|
8
8
|
get session(): import("axios").AxiosInstance;
|
|
9
9
|
get url(): string;
|
|
10
10
|
get headers(): {
|