@lancedb/lancedb 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +3 -3
- package/biome.json +19 -3
- package/dist/arrow.d.ts +42 -7
- package/dist/arrow.js +6 -5
- package/dist/connection.d.ts +55 -29
- package/dist/connection.js +22 -74
- package/dist/embedding/embedding_function.d.ts +11 -3
- package/dist/embedding/embedding_function.js +36 -12
- package/dist/embedding/openai.d.ts +6 -5
- package/dist/embedding/openai.js +4 -2
- package/dist/embedding/registry.d.ts +10 -11
- package/dist/embedding/registry.js +4 -0
- package/dist/index.d.ts +51 -3
- package/dist/index.js +28 -4
- package/dist/merge.d.ts +54 -0
- package/dist/merge.js +64 -0
- package/dist/native.d.ts +34 -7
- package/dist/native.js +26 -9
- package/dist/query.d.ts +51 -16
- package/dist/query.js +122 -21
- package/dist/remote/client.d.ts +28 -0
- package/dist/remote/client.js +172 -0
- package/dist/remote/connection.d.ts +25 -0
- package/dist/remote/connection.js +110 -0
- package/dist/remote/index.d.ts +3 -0
- package/dist/remote/index.js +9 -0
- package/dist/remote/table.d.ts +42 -0
- package/dist/remote/table.js +179 -0
- package/dist/sanitize.d.ts +3 -2
- package/dist/sanitize.js +55 -1
- package/dist/table.d.ts +116 -25
- package/dist/table.js +117 -233
- package/dist/util.d.ts +14 -0
- package/dist/util.js +65 -0
- package/examples/ann_indexes.ts +49 -0
- package/examples/basic.ts +149 -0
- package/examples/embedding.ts +83 -0
- package/examples/filtering.ts +34 -0
- package/examples/jsconfig.json +27 -0
- package/examples/package-lock.json +79 -0
- package/examples/package.json +18 -0
- package/examples/search.ts +37 -0
- package/lancedb/arrow.ts +87 -24
- package/lancedb/connection.ts +115 -92
- package/lancedb/embedding/embedding_function.ts +48 -16
- package/lancedb/embedding/openai.ts +11 -6
- package/lancedb/embedding/registry.ts +38 -22
- package/lancedb/index.ts +101 -2
- package/lancedb/merge.ts +70 -0
- package/lancedb/query.ts +168 -39
- package/lancedb/remote/client.ts +221 -0
- package/lancedb/remote/connection.ts +201 -0
- package/lancedb/remote/index.ts +3 -0
- package/lancedb/remote/table.ts +226 -0
- package/lancedb/sanitize.ts +73 -1
- package/lancedb/table.ts +344 -101
- package/lancedb/util.ts +69 -0
- package/native.d.ts +208 -0
- package/nodejs-artifacts/arrow.d.ts +42 -7
- package/nodejs-artifacts/arrow.js +6 -5
- package/nodejs-artifacts/connection.d.ts +55 -29
- package/nodejs-artifacts/connection.js +22 -74
- package/nodejs-artifacts/embedding/embedding_function.d.ts +11 -3
- package/nodejs-artifacts/embedding/embedding_function.js +36 -12
- package/nodejs-artifacts/embedding/openai.d.ts +6 -5
- package/nodejs-artifacts/embedding/openai.js +4 -2
- package/nodejs-artifacts/embedding/registry.d.ts +10 -11
- package/nodejs-artifacts/embedding/registry.js +4 -0
- package/nodejs-artifacts/index.d.ts +51 -3
- package/nodejs-artifacts/index.js +28 -4
- package/nodejs-artifacts/merge.d.ts +54 -0
- package/nodejs-artifacts/merge.js +64 -0
- package/nodejs-artifacts/native.d.ts +34 -7
- package/nodejs-artifacts/native.js +26 -9
- package/nodejs-artifacts/query.d.ts +51 -16
- package/nodejs-artifacts/query.js +122 -21
- package/nodejs-artifacts/remote/client.d.ts +28 -0
- package/nodejs-artifacts/remote/client.js +172 -0
- package/nodejs-artifacts/remote/connection.d.ts +25 -0
- package/nodejs-artifacts/remote/connection.js +110 -0
- package/nodejs-artifacts/remote/index.d.ts +3 -0
- package/nodejs-artifacts/remote/index.js +9 -0
- package/nodejs-artifacts/remote/table.d.ts +42 -0
- package/nodejs-artifacts/remote/table.js +179 -0
- package/nodejs-artifacts/sanitize.d.ts +3 -2
- package/nodejs-artifacts/sanitize.js +55 -1
- package/nodejs-artifacts/table.d.ts +116 -25
- package/nodejs-artifacts/table.js +117 -233
- package/nodejs-artifacts/util.d.ts +14 -0
- package/nodejs-artifacts/util.js +65 -0
- package/package.json +25 -11
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
// --8<-- [start:imports]
|
|
2
|
+
import * as lancedb from "@lancedb/lancedb";
|
|
3
|
+
import * as arrow from "apache-arrow";
|
|
4
|
+
import { Field, FixedSizeList, Float16, Int32, Schema } from "apache-arrow";
|
|
5
|
+
|
|
6
|
+
// --8<-- [end:imports]
|
|
7
|
+
|
|
8
|
+
// --8<-- [start:connect]
|
|
9
|
+
const uri = "/tmp/lancedb/";
|
|
10
|
+
const db = await lancedb.connect(uri);
|
|
11
|
+
// --8<-- [end:connect]
|
|
12
|
+
{
|
|
13
|
+
// --8<-- [start:create_table]
|
|
14
|
+
const data = [
|
|
15
|
+
{ vector: [3.1, 4.1], item: "foo", price: 10.0 },
|
|
16
|
+
{ vector: [5.9, 26.5], item: "bar", price: 20.0 },
|
|
17
|
+
];
|
|
18
|
+
const _tbl = await db.createTable("myTable", data);
|
|
19
|
+
// --8<-- [end:create_table]
|
|
20
|
+
{
|
|
21
|
+
// --8<-- [start:create_table_exists_ok]
|
|
22
|
+
const _tbl = await db.createTable("myTable", data, {
|
|
23
|
+
existsOk: true,
|
|
24
|
+
});
|
|
25
|
+
// --8<-- [end:create_table_exists_ok]
|
|
26
|
+
}
|
|
27
|
+
{
|
|
28
|
+
// --8<-- [start:create_table_overwrite]
|
|
29
|
+
const _tbl = await db.createTable("myTable", data, {
|
|
30
|
+
mode: "overwrite",
|
|
31
|
+
});
|
|
32
|
+
// --8<-- [end:create_table_overwrite]
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
{
|
|
37
|
+
// --8<-- [start:create_table_with_schema]
|
|
38
|
+
const schema = new arrow.Schema([
|
|
39
|
+
new arrow.Field(
|
|
40
|
+
"vector",
|
|
41
|
+
new arrow.FixedSizeList(
|
|
42
|
+
2,
|
|
43
|
+
new arrow.Field("item", new arrow.Float32(), true),
|
|
44
|
+
),
|
|
45
|
+
),
|
|
46
|
+
new arrow.Field("item", new arrow.Utf8(), true),
|
|
47
|
+
new arrow.Field("price", new arrow.Float32(), true),
|
|
48
|
+
]);
|
|
49
|
+
const data = [
|
|
50
|
+
{ vector: [3.1, 4.1], item: "foo", price: 10.0 },
|
|
51
|
+
{ vector: [5.9, 26.5], item: "bar", price: 20.0 },
|
|
52
|
+
];
|
|
53
|
+
const _tbl = await db.createTable("myTable", data, {
|
|
54
|
+
schema,
|
|
55
|
+
});
|
|
56
|
+
// --8<-- [end:create_table_with_schema]
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
{
|
|
60
|
+
// --8<-- [start:create_empty_table]
|
|
61
|
+
const schema = new arrow.Schema([
|
|
62
|
+
new arrow.Field(
|
|
63
|
+
"vector",
|
|
64
|
+
new arrow.FixedSizeList(
|
|
65
|
+
2,
|
|
66
|
+
new arrow.Field("item", new arrow.Float32(), true),
|
|
67
|
+
),
|
|
68
|
+
),
|
|
69
|
+
]);
|
|
70
|
+
const _tbl = await db.createEmptyTable("empty_table", schema);
|
|
71
|
+
// --8<-- [end:create_empty_table]
|
|
72
|
+
}
|
|
73
|
+
{
|
|
74
|
+
// --8<-- [start:open_table]
|
|
75
|
+
const _tbl = await db.openTable("myTable");
|
|
76
|
+
// --8<-- [end:open_table]
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
{
|
|
80
|
+
// --8<-- [start:table_names]
|
|
81
|
+
const tableNames = await db.tableNames();
|
|
82
|
+
console.log(tableNames);
|
|
83
|
+
// --8<-- [end:table_names]
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
const tbl = await db.openTable("myTable");
|
|
87
|
+
{
|
|
88
|
+
// --8<-- [start:add_data]
|
|
89
|
+
const data = [
|
|
90
|
+
{ vector: [1.3, 1.4], item: "fizz", price: 100.0 },
|
|
91
|
+
{ vector: [9.5, 56.2], item: "buzz", price: 200.0 },
|
|
92
|
+
];
|
|
93
|
+
await tbl.add(data);
|
|
94
|
+
// --8<-- [end:add_data]
|
|
95
|
+
}
|
|
96
|
+
{
|
|
97
|
+
// --8<-- [start:vector_search]
|
|
98
|
+
const _res = tbl.search([100, 100]).limit(2).toArray();
|
|
99
|
+
// --8<-- [end:vector_search]
|
|
100
|
+
}
|
|
101
|
+
{
|
|
102
|
+
const data = Array.from({ length: 1000 })
|
|
103
|
+
.fill(null)
|
|
104
|
+
.map(() => ({
|
|
105
|
+
vector: [Math.random(), Math.random()],
|
|
106
|
+
item: "autogen",
|
|
107
|
+
price: Math.round(Math.random() * 100),
|
|
108
|
+
}));
|
|
109
|
+
|
|
110
|
+
await tbl.add(data);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// --8<-- [start:create_index]
|
|
114
|
+
await tbl.createIndex("vector");
|
|
115
|
+
// --8<-- [end:create_index]
|
|
116
|
+
|
|
117
|
+
// --8<-- [start:delete_rows]
|
|
118
|
+
await tbl.delete('item = "fizz"');
|
|
119
|
+
// --8<-- [end:delete_rows]
|
|
120
|
+
|
|
121
|
+
// --8<-- [start:drop_table]
|
|
122
|
+
await db.dropTable("myTable");
|
|
123
|
+
// --8<-- [end:drop_table]
|
|
124
|
+
await db.dropTable("empty_table");
|
|
125
|
+
|
|
126
|
+
{
|
|
127
|
+
// --8<-- [start:create_f16_table]
|
|
128
|
+
const db = await lancedb.connect("/tmp/lancedb");
|
|
129
|
+
const dim = 16;
|
|
130
|
+
const total = 10;
|
|
131
|
+
const f16Schema = new Schema([
|
|
132
|
+
new Field("id", new Int32()),
|
|
133
|
+
new Field(
|
|
134
|
+
"vector",
|
|
135
|
+
new FixedSizeList(dim, new Field("item", new Float16(), true)),
|
|
136
|
+
false,
|
|
137
|
+
),
|
|
138
|
+
]);
|
|
139
|
+
const data = lancedb.makeArrowTable(
|
|
140
|
+
Array.from(Array(total), (_, i) => ({
|
|
141
|
+
id: i,
|
|
142
|
+
vector: Array.from(Array(dim), Math.random),
|
|
143
|
+
})),
|
|
144
|
+
{ schema: f16Schema },
|
|
145
|
+
);
|
|
146
|
+
const _table = await db.createTable("f16_tbl", data);
|
|
147
|
+
// --8<-- [end:create_f16_table]
|
|
148
|
+
await db.dropTable("f16_tbl");
|
|
149
|
+
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
// --8<-- [start:imports]
|
|
2
|
+
import * as lancedb from "@lancedb/lancedb";
|
|
3
|
+
import { LanceSchema, getRegistry, register } from "@lancedb/lancedb/embedding";
|
|
4
|
+
import { EmbeddingFunction } from "@lancedb/lancedb/embedding";
|
|
5
|
+
import { type Float, Float32, Utf8 } from "apache-arrow";
|
|
6
|
+
// --8<-- [end:imports]
|
|
7
|
+
|
|
8
|
+
{
|
|
9
|
+
// --8<-- [start:openai_embeddings]
|
|
10
|
+
|
|
11
|
+
const db = await lancedb.connect("/tmp/db");
|
|
12
|
+
const func = getRegistry()
|
|
13
|
+
.get("openai")
|
|
14
|
+
?.create({ model: "text-embedding-ada-002" }) as EmbeddingFunction;
|
|
15
|
+
|
|
16
|
+
const wordsSchema = LanceSchema({
|
|
17
|
+
text: func.sourceField(new Utf8()),
|
|
18
|
+
vector: func.vectorField(),
|
|
19
|
+
});
|
|
20
|
+
const tbl = await db.createEmptyTable("words", wordsSchema, {
|
|
21
|
+
mode: "overwrite",
|
|
22
|
+
});
|
|
23
|
+
await tbl.add([{ text: "hello world" }, { text: "goodbye world" }]);
|
|
24
|
+
|
|
25
|
+
const query = "greetings";
|
|
26
|
+
const actual = (await (await tbl.search(query)).limit(1).toArray())[0];
|
|
27
|
+
|
|
28
|
+
// --8<-- [end:openai_embeddings]
|
|
29
|
+
console.log("result = ", actual.text);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
{
|
|
33
|
+
// --8<-- [start:embedding_function]
|
|
34
|
+
const db = await lancedb.connect("/tmp/db");
|
|
35
|
+
|
|
36
|
+
@register("my_embedding")
|
|
37
|
+
class MyEmbeddingFunction extends EmbeddingFunction<string> {
|
|
38
|
+
toJSON(): object {
|
|
39
|
+
return {};
|
|
40
|
+
}
|
|
41
|
+
ndims() {
|
|
42
|
+
return 3;
|
|
43
|
+
}
|
|
44
|
+
embeddingDataType(): Float {
|
|
45
|
+
return new Float32();
|
|
46
|
+
}
|
|
47
|
+
async computeQueryEmbeddings(_data: string) {
|
|
48
|
+
// This is a placeholder for a real embedding function
|
|
49
|
+
return [1, 2, 3];
|
|
50
|
+
}
|
|
51
|
+
async computeSourceEmbeddings(data: string[]) {
|
|
52
|
+
// This is a placeholder for a real embedding function
|
|
53
|
+
return Array.from({ length: data.length }).fill([1, 2, 3]) as number[][];
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
const func = new MyEmbeddingFunction();
|
|
58
|
+
|
|
59
|
+
const data = [{ text: "pepperoni" }, { text: "pineapple" }];
|
|
60
|
+
|
|
61
|
+
// Option 1: manually specify the embedding function
|
|
62
|
+
const table = await db.createTable("vectors", data, {
|
|
63
|
+
embeddingFunction: {
|
|
64
|
+
function: func,
|
|
65
|
+
sourceColumn: "text",
|
|
66
|
+
vectorColumn: "vector",
|
|
67
|
+
},
|
|
68
|
+
mode: "overwrite",
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
// Option 2: provide the embedding function through a schema
|
|
72
|
+
|
|
73
|
+
const schema = LanceSchema({
|
|
74
|
+
text: func.sourceField(new Utf8()),
|
|
75
|
+
vector: func.vectorField(),
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
const table2 = await db.createTable("vectors2", data, {
|
|
79
|
+
schema,
|
|
80
|
+
mode: "overwrite",
|
|
81
|
+
});
|
|
82
|
+
// --8<-- [end:embedding_function]
|
|
83
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import * as lancedb from "@lancedb/lancedb";
|
|
2
|
+
|
|
3
|
+
const db = await lancedb.connect("data/sample-lancedb");
|
|
4
|
+
|
|
5
|
+
const data = Array.from({ length: 10_000 }, (_, i) => ({
|
|
6
|
+
vector: Array(1536).fill(i),
|
|
7
|
+
id: i,
|
|
8
|
+
item: `item ${i}`,
|
|
9
|
+
strId: `${i}`,
|
|
10
|
+
}));
|
|
11
|
+
|
|
12
|
+
const tbl = await db.createTable("myVectors", data, { mode: "overwrite" });
|
|
13
|
+
|
|
14
|
+
// --8<-- [start:search]
|
|
15
|
+
const _result = await tbl
|
|
16
|
+
.search(Array(1536).fill(0.5))
|
|
17
|
+
.limit(1)
|
|
18
|
+
.where("id = 10")
|
|
19
|
+
.toArray();
|
|
20
|
+
// --8<-- [end:search]
|
|
21
|
+
|
|
22
|
+
// --8<-- [start:vec_search]
|
|
23
|
+
await tbl
|
|
24
|
+
.search(Array(1536).fill(0))
|
|
25
|
+
.where("(item IN ('item 0', 'item 2')) AND (id > 10)")
|
|
26
|
+
.postfilter()
|
|
27
|
+
.toArray();
|
|
28
|
+
// --8<-- [end:vec_search]
|
|
29
|
+
|
|
30
|
+
// --8<-- [start:sql_search]
|
|
31
|
+
await tbl.query().where("id = 10").limit(10).toArray();
|
|
32
|
+
// --8<-- [end:sql_search]
|
|
33
|
+
|
|
34
|
+
console.log("SQL search: done");
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
// Enable latest features
|
|
4
|
+
"lib": ["ESNext", "DOM"],
|
|
5
|
+
"target": "ESNext",
|
|
6
|
+
"module": "ESNext",
|
|
7
|
+
"moduleDetection": "force",
|
|
8
|
+
"jsx": "react-jsx",
|
|
9
|
+
"allowJs": true,
|
|
10
|
+
|
|
11
|
+
// Bundler mode
|
|
12
|
+
"moduleResolution": "bundler",
|
|
13
|
+
"allowImportingTsExtensions": true,
|
|
14
|
+
"verbatimModuleSyntax": true,
|
|
15
|
+
"noEmit": true,
|
|
16
|
+
|
|
17
|
+
// Best practices
|
|
18
|
+
"strict": true,
|
|
19
|
+
"skipLibCheck": true,
|
|
20
|
+
"noFallthroughCasesInSwitch": true,
|
|
21
|
+
|
|
22
|
+
// Some stricter flags (disabled by default)
|
|
23
|
+
"noUnusedLocals": false,
|
|
24
|
+
"noUnusedParameters": false,
|
|
25
|
+
"noPropertyAccessFromIndexSignature": false
|
|
26
|
+
}
|
|
27
|
+
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "examples",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"lockfileVersion": 3,
|
|
5
|
+
"requires": true,
|
|
6
|
+
"packages": {
|
|
7
|
+
"": {
|
|
8
|
+
"name": "examples",
|
|
9
|
+
"version": "1.0.0",
|
|
10
|
+
"license": "Apache-2.0",
|
|
11
|
+
"dependencies": {
|
|
12
|
+
"@lancedb/lancedb": "file:../"
|
|
13
|
+
},
|
|
14
|
+
"peerDependencies": {
|
|
15
|
+
"typescript": "^5.0.0"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"..": {
|
|
19
|
+
"name": "@lancedb/lancedb",
|
|
20
|
+
"version": "0.6.0",
|
|
21
|
+
"cpu": [
|
|
22
|
+
"x64",
|
|
23
|
+
"arm64"
|
|
24
|
+
],
|
|
25
|
+
"license": "Apache 2.0",
|
|
26
|
+
"os": [
|
|
27
|
+
"darwin",
|
|
28
|
+
"linux",
|
|
29
|
+
"win32"
|
|
30
|
+
],
|
|
31
|
+
"dependencies": {
|
|
32
|
+
"apache-arrow": "^15.0.0",
|
|
33
|
+
"axios": "^1.7.2",
|
|
34
|
+
"openai": "^4.29.2",
|
|
35
|
+
"reflect-metadata": "^0.2.2"
|
|
36
|
+
},
|
|
37
|
+
"devDependencies": {
|
|
38
|
+
"@aws-sdk/client-kms": "^3.33.0",
|
|
39
|
+
"@aws-sdk/client-s3": "^3.33.0",
|
|
40
|
+
"@biomejs/biome": "^1.7.3",
|
|
41
|
+
"@jest/globals": "^29.7.0",
|
|
42
|
+
"@napi-rs/cli": "^2.18.0",
|
|
43
|
+
"@types/axios": "^0.14.0",
|
|
44
|
+
"@types/jest": "^29.1.2",
|
|
45
|
+
"@types/tmp": "^0.2.6",
|
|
46
|
+
"apache-arrow-old": "npm:apache-arrow@13.0.0",
|
|
47
|
+
"eslint": "^8.57.0",
|
|
48
|
+
"jest": "^29.7.0",
|
|
49
|
+
"shx": "^0.3.4",
|
|
50
|
+
"tmp": "^0.2.3",
|
|
51
|
+
"ts-jest": "^29.1.2",
|
|
52
|
+
"typedoc": "^0.25.7",
|
|
53
|
+
"typedoc-plugin-markdown": "^3.17.1",
|
|
54
|
+
"typescript": "^5.3.3",
|
|
55
|
+
"typescript-eslint": "^7.1.0"
|
|
56
|
+
},
|
|
57
|
+
"engines": {
|
|
58
|
+
"node": ">= 18"
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
"node_modules/@lancedb/lancedb": {
|
|
62
|
+
"resolved": "..",
|
|
63
|
+
"link": true
|
|
64
|
+
},
|
|
65
|
+
"node_modules/typescript": {
|
|
66
|
+
"version": "5.5.2",
|
|
67
|
+
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.2.tgz",
|
|
68
|
+
"integrity": "sha512-NcRtPEOsPFFWjobJEtfihkLCZCXZt/os3zf8nTxjVH3RvTSxjrCamJpbExGvYOF+tFHc3pA65qpdwPbzjohhew==",
|
|
69
|
+
"peer": true,
|
|
70
|
+
"bin": {
|
|
71
|
+
"tsc": "bin/tsc",
|
|
72
|
+
"tsserver": "bin/tsserver"
|
|
73
|
+
},
|
|
74
|
+
"engines": {
|
|
75
|
+
"node": ">=14.17"
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "examples",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Examples for LanceDB",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"scripts": {
|
|
8
|
+
"test": "echo \"Error: no test specified\" && exit 1"
|
|
9
|
+
},
|
|
10
|
+
"author": "Lance Devs",
|
|
11
|
+
"license": "Apache-2.0",
|
|
12
|
+
"dependencies": {
|
|
13
|
+
"@lancedb/lancedb": "file:../"
|
|
14
|
+
},
|
|
15
|
+
"peerDependencies": {
|
|
16
|
+
"typescript": "^5.0.0"
|
|
17
|
+
}
|
|
18
|
+
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
// --8<-- [end:import]
|
|
2
|
+
import * as fs from "node:fs";
|
|
3
|
+
// --8<-- [start:import]
|
|
4
|
+
import * as lancedb from "@lancedb/lancedb";
|
|
5
|
+
|
|
6
|
+
async function setup() {
|
|
7
|
+
fs.rmSync("data/sample-lancedb", { recursive: true, force: true });
|
|
8
|
+
const db = await lancedb.connect("data/sample-lancedb");
|
|
9
|
+
|
|
10
|
+
const data = Array.from({ length: 10_000 }, (_, i) => ({
|
|
11
|
+
vector: Array(1536).fill(i),
|
|
12
|
+
id: `${i}`,
|
|
13
|
+
content: "",
|
|
14
|
+
longId: `${i}`,
|
|
15
|
+
}));
|
|
16
|
+
|
|
17
|
+
await db.createTable("my_vectors", data);
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
await setup();
|
|
21
|
+
|
|
22
|
+
// --8<-- [start:search1]
|
|
23
|
+
const db = await lancedb.connect("data/sample-lancedb");
|
|
24
|
+
const tbl = await db.openTable("my_vectors");
|
|
25
|
+
|
|
26
|
+
const _results1 = await tbl.search(Array(1536).fill(1.2)).limit(10).toArray();
|
|
27
|
+
// --8<-- [end:search1]
|
|
28
|
+
|
|
29
|
+
// --8<-- [start:search2]
|
|
30
|
+
const _results2 = await tbl
|
|
31
|
+
.search(Array(1536).fill(1.2))
|
|
32
|
+
.distanceType("cosine")
|
|
33
|
+
.limit(10)
|
|
34
|
+
.toArray();
|
|
35
|
+
// --8<-- [end:search2]
|
|
36
|
+
|
|
37
|
+
console.log("search: done");
|
package/lancedb/arrow.ts
CHANGED
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import {
|
|
16
16
|
Table as ArrowTable,
|
|
17
17
|
Binary,
|
|
18
|
+
BufferType,
|
|
18
19
|
DataType,
|
|
19
20
|
Field,
|
|
20
21
|
FixedSizeBinary,
|
|
@@ -31,18 +32,78 @@ import {
|
|
|
31
32
|
Schema,
|
|
32
33
|
Struct,
|
|
33
34
|
Utf8,
|
|
34
|
-
|
|
35
|
+
Vector,
|
|
35
36
|
makeBuilder,
|
|
36
37
|
makeData,
|
|
37
38
|
type makeTable,
|
|
38
39
|
vectorFromArray,
|
|
39
40
|
} from "apache-arrow";
|
|
41
|
+
import { Buffers } from "apache-arrow/data";
|
|
40
42
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
|
41
43
|
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
|
|
42
|
-
import {
|
|
44
|
+
import {
|
|
45
|
+
sanitizeField,
|
|
46
|
+
sanitizeSchema,
|
|
47
|
+
sanitizeTable,
|
|
48
|
+
sanitizeType,
|
|
49
|
+
} from "./sanitize";
|
|
43
50
|
export * from "apache-arrow";
|
|
44
|
-
|
|
45
|
-
|
|
51
|
+
export type SchemaLike =
|
|
52
|
+
| Schema
|
|
53
|
+
| {
|
|
54
|
+
fields: FieldLike[];
|
|
55
|
+
metadata: Map<string, string>;
|
|
56
|
+
get names(): unknown[];
|
|
57
|
+
};
|
|
58
|
+
export type FieldLike =
|
|
59
|
+
| Field
|
|
60
|
+
| {
|
|
61
|
+
type: string;
|
|
62
|
+
name: string;
|
|
63
|
+
nullable?: boolean;
|
|
64
|
+
metadata?: Map<string, string>;
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
export type DataLike =
|
|
68
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
69
|
+
| import("apache-arrow").Data<Struct<any>>
|
|
70
|
+
| {
|
|
71
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
72
|
+
type: any;
|
|
73
|
+
length: number;
|
|
74
|
+
offset: number;
|
|
75
|
+
stride: number;
|
|
76
|
+
nullable: boolean;
|
|
77
|
+
children: DataLike[];
|
|
78
|
+
get nullCount(): number;
|
|
79
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
80
|
+
values: Buffers<any>[BufferType.DATA];
|
|
81
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
82
|
+
typeIds: Buffers<any>[BufferType.TYPE];
|
|
83
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
84
|
+
nullBitmap: Buffers<any>[BufferType.VALIDITY];
|
|
85
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
86
|
+
valueOffsets: Buffers<any>[BufferType.OFFSET];
|
|
87
|
+
};
|
|
88
|
+
|
|
89
|
+
export type RecordBatchLike =
|
|
90
|
+
| RecordBatch
|
|
91
|
+
| {
|
|
92
|
+
schema: SchemaLike;
|
|
93
|
+
data: DataLike;
|
|
94
|
+
};
|
|
95
|
+
|
|
96
|
+
export type TableLike =
|
|
97
|
+
| ArrowTable
|
|
98
|
+
| { schema: SchemaLike; batches: RecordBatchLike[] };
|
|
99
|
+
|
|
100
|
+
export type IntoVector =
|
|
101
|
+
| Float32Array
|
|
102
|
+
| Float64Array
|
|
103
|
+
| number[]
|
|
104
|
+
| Promise<Float32Array | Float64Array | number[]>;
|
|
105
|
+
|
|
106
|
+
export function isArrowTable(value: object): value is TableLike {
|
|
46
107
|
if (value instanceof ArrowTable) return true;
|
|
47
108
|
return "schema" in value && "batches" in value;
|
|
48
109
|
}
|
|
@@ -133,7 +194,7 @@ export function isFixedSizeList(value: unknown): value is FixedSizeList {
|
|
|
133
194
|
}
|
|
134
195
|
|
|
135
196
|
/** Data type accepted by NodeJS SDK */
|
|
136
|
-
export type Data = Record<string, unknown>[] | ArrowTable;
|
|
197
|
+
export type Data = Record<string, unknown>[] | TableLike;
|
|
137
198
|
|
|
138
199
|
/*
|
|
139
200
|
* Options to control how a column should be converted to a vector array
|
|
@@ -160,7 +221,7 @@ export class MakeArrowTableOptions {
|
|
|
160
221
|
* The schema must be specified if there are no records (e.g. to make
|
|
161
222
|
* an empty table)
|
|
162
223
|
*/
|
|
163
|
-
schema?: Schema;
|
|
224
|
+
schema?: SchemaLike;
|
|
164
225
|
|
|
165
226
|
/*
|
|
166
227
|
* Mapping from vector column name to expected type
|
|
@@ -182,6 +243,7 @@ export class MakeArrowTableOptions {
|
|
|
182
243
|
vector: new VectorColumnOptions(),
|
|
183
244
|
};
|
|
184
245
|
embeddings?: EmbeddingFunction<unknown>;
|
|
246
|
+
embeddingFunction?: EmbeddingFunctionConfig;
|
|
185
247
|
|
|
186
248
|
/**
|
|
187
249
|
* If true then string columns will be encoded with dictionary encoding
|
|
@@ -306,7 +368,11 @@ export function makeArrowTable(
|
|
|
306
368
|
const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
|
|
307
369
|
if (opt.schema !== undefined && opt.schema !== null) {
|
|
308
370
|
opt.schema = sanitizeSchema(opt.schema);
|
|
309
|
-
opt.schema = validateSchemaEmbeddings(
|
|
371
|
+
opt.schema = validateSchemaEmbeddings(
|
|
372
|
+
opt.schema as Schema,
|
|
373
|
+
data,
|
|
374
|
+
options?.embeddingFunction,
|
|
375
|
+
);
|
|
310
376
|
}
|
|
311
377
|
const columns: Record<string, Vector> = {};
|
|
312
378
|
// TODO: sample dataset to find missing columns
|
|
@@ -387,7 +453,7 @@ export function makeArrowTable(
|
|
|
387
453
|
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
|
388
454
|
const firstTable = new ArrowTable(columns);
|
|
389
455
|
const batchesFixed = firstTable.batches.map(
|
|
390
|
-
(batch) => new RecordBatch(opt.schema!, batch.data),
|
|
456
|
+
(batch) => new RecordBatch(opt.schema as Schema, batch.data),
|
|
391
457
|
);
|
|
392
458
|
let schema: Schema;
|
|
393
459
|
if (metadata !== undefined) {
|
|
@@ -400,9 +466,9 @@ export function makeArrowTable(
|
|
|
400
466
|
}
|
|
401
467
|
}
|
|
402
468
|
|
|
403
|
-
schema = new Schema(opt.schema.fields, schemaMetadata);
|
|
469
|
+
schema = new Schema(opt.schema.fields as Field[], schemaMetadata);
|
|
404
470
|
} else {
|
|
405
|
-
schema = opt.schema;
|
|
471
|
+
schema = opt.schema as Schema;
|
|
406
472
|
}
|
|
407
473
|
return new ArrowTable(schema, batchesFixed);
|
|
408
474
|
}
|
|
@@ -418,7 +484,7 @@ export function makeArrowTable(
|
|
|
418
484
|
* Create an empty Arrow table with the provided schema
|
|
419
485
|
*/
|
|
420
486
|
export function makeEmptyTable(
|
|
421
|
-
schema: Schema,
|
|
487
|
+
schema: SchemaLike,
|
|
422
488
|
metadata?: Map<string, string>,
|
|
423
489
|
): ArrowTable {
|
|
424
490
|
return makeArrowTable([], { schema }, metadata);
|
|
@@ -545,7 +611,6 @@ async function applyEmbeddingsFromMetadata(
|
|
|
545
611
|
dtype,
|
|
546
612
|
);
|
|
547
613
|
}
|
|
548
|
-
|
|
549
614
|
const vector = makeVector(vectors, destType);
|
|
550
615
|
columns[destColumn] = vector;
|
|
551
616
|
}
|
|
@@ -557,18 +622,17 @@ async function applyEmbeddingsFromMetadata(
|
|
|
557
622
|
async function applyEmbeddings<T>(
|
|
558
623
|
table: ArrowTable,
|
|
559
624
|
embeddings?: EmbeddingFunctionConfig,
|
|
560
|
-
schema?: Schema,
|
|
625
|
+
schema?: SchemaLike,
|
|
561
626
|
): Promise<ArrowTable> {
|
|
627
|
+
if (schema !== undefined && schema !== null) {
|
|
628
|
+
schema = sanitizeSchema(schema);
|
|
629
|
+
}
|
|
562
630
|
if (schema?.metadata.has("embedding_functions")) {
|
|
563
|
-
return applyEmbeddingsFromMetadata(table, schema!);
|
|
631
|
+
return applyEmbeddingsFromMetadata(table, schema! as Schema);
|
|
564
632
|
} else if (embeddings == null || embeddings === undefined) {
|
|
565
633
|
return table;
|
|
566
634
|
}
|
|
567
635
|
|
|
568
|
-
if (schema !== undefined && schema !== null) {
|
|
569
|
-
schema = sanitizeSchema(schema);
|
|
570
|
-
}
|
|
571
|
-
|
|
572
636
|
// Convert from ArrowTable to Record<String, Vector>
|
|
573
637
|
const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
|
|
574
638
|
const name = table.schema.fields[idx].name;
|
|
@@ -644,7 +708,7 @@ async function applyEmbeddings<T>(
|
|
|
644
708
|
`When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
|
|
645
709
|
);
|
|
646
710
|
}
|
|
647
|
-
return alignTable(newTable, schema);
|
|
711
|
+
return alignTable(newTable, schema as Schema);
|
|
648
712
|
}
|
|
649
713
|
return newTable;
|
|
650
714
|
}
|
|
@@ -738,7 +802,7 @@ export async function fromRecordsToStreamBuffer(
|
|
|
738
802
|
export async function fromTableToBuffer(
|
|
739
803
|
table: ArrowTable,
|
|
740
804
|
embeddings?: EmbeddingFunctionConfig,
|
|
741
|
-
schema?: Schema,
|
|
805
|
+
schema?: SchemaLike,
|
|
742
806
|
): Promise<Buffer> {
|
|
743
807
|
if (schema !== undefined && schema !== null) {
|
|
744
808
|
schema = sanitizeSchema(schema);
|
|
@@ -765,7 +829,7 @@ export async function fromDataToBuffer(
|
|
|
765
829
|
schema = sanitizeSchema(schema);
|
|
766
830
|
}
|
|
767
831
|
if (isArrowTable(data)) {
|
|
768
|
-
return fromTableToBuffer(data, embeddings, schema);
|
|
832
|
+
return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
|
|
769
833
|
} else {
|
|
770
834
|
const table = await convertToTable(data, embeddings, { schema });
|
|
771
835
|
return fromTableToBuffer(table);
|
|
@@ -783,7 +847,7 @@ export async function fromDataToBuffer(
|
|
|
783
847
|
export async function fromTableToStreamBuffer(
|
|
784
848
|
table: ArrowTable,
|
|
785
849
|
embeddings?: EmbeddingFunctionConfig,
|
|
786
|
-
schema?: Schema,
|
|
850
|
+
schema?: SchemaLike,
|
|
787
851
|
): Promise<Buffer> {
|
|
788
852
|
const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
|
|
789
853
|
const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
|
|
@@ -835,7 +899,7 @@ export function createEmptyTable(schema: Schema): ArrowTable {
|
|
|
835
899
|
function validateSchemaEmbeddings(
|
|
836
900
|
schema: Schema,
|
|
837
901
|
data: Array<Record<string, unknown>>,
|
|
838
|
-
embeddings:
|
|
902
|
+
embeddings: EmbeddingFunctionConfig | undefined,
|
|
839
903
|
) {
|
|
840
904
|
const fields = [];
|
|
841
905
|
const missingEmbeddingFields = [];
|
|
@@ -848,7 +912,6 @@ function validateSchemaEmbeddings(
|
|
|
848
912
|
for (let field of schema.fields) {
|
|
849
913
|
if (isFixedSizeList(field.type)) {
|
|
850
914
|
field = sanitizeField(field);
|
|
851
|
-
|
|
852
915
|
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
|
853
916
|
if (schema.metadata.has("embedding_functions")) {
|
|
854
917
|
const embeddings = JSON.parse(
|