@lancedb/lancedb 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/Cargo.toml +3 -3
  2. package/biome.json +19 -3
  3. package/dist/arrow.d.ts +42 -7
  4. package/dist/arrow.js +6 -5
  5. package/dist/connection.d.ts +55 -29
  6. package/dist/connection.js +22 -74
  7. package/dist/embedding/embedding_function.d.ts +11 -3
  8. package/dist/embedding/embedding_function.js +36 -12
  9. package/dist/embedding/openai.d.ts +6 -5
  10. package/dist/embedding/openai.js +4 -2
  11. package/dist/embedding/registry.d.ts +10 -11
  12. package/dist/embedding/registry.js +4 -0
  13. package/dist/index.d.ts +51 -3
  14. package/dist/index.js +28 -4
  15. package/dist/merge.d.ts +54 -0
  16. package/dist/merge.js +64 -0
  17. package/dist/native.d.ts +34 -7
  18. package/dist/native.js +26 -9
  19. package/dist/query.d.ts +51 -16
  20. package/dist/query.js +122 -21
  21. package/dist/remote/client.d.ts +28 -0
  22. package/dist/remote/client.js +172 -0
  23. package/dist/remote/connection.d.ts +25 -0
  24. package/dist/remote/connection.js +110 -0
  25. package/dist/remote/index.d.ts +3 -0
  26. package/dist/remote/index.js +9 -0
  27. package/dist/remote/table.d.ts +42 -0
  28. package/dist/remote/table.js +179 -0
  29. package/dist/sanitize.d.ts +3 -2
  30. package/dist/sanitize.js +55 -1
  31. package/dist/table.d.ts +116 -25
  32. package/dist/table.js +117 -233
  33. package/dist/util.d.ts +14 -0
  34. package/dist/util.js +65 -0
  35. package/examples/ann_indexes.ts +49 -0
  36. package/examples/basic.ts +149 -0
  37. package/examples/embedding.ts +83 -0
  38. package/examples/filtering.ts +34 -0
  39. package/examples/jsconfig.json +27 -0
  40. package/examples/package-lock.json +79 -0
  41. package/examples/package.json +18 -0
  42. package/examples/search.ts +37 -0
  43. package/lancedb/arrow.ts +87 -24
  44. package/lancedb/connection.ts +115 -92
  45. package/lancedb/embedding/embedding_function.ts +48 -16
  46. package/lancedb/embedding/openai.ts +11 -6
  47. package/lancedb/embedding/registry.ts +38 -22
  48. package/lancedb/index.ts +101 -2
  49. package/lancedb/merge.ts +70 -0
  50. package/lancedb/query.ts +168 -39
  51. package/lancedb/remote/client.ts +221 -0
  52. package/lancedb/remote/connection.ts +201 -0
  53. package/lancedb/remote/index.ts +3 -0
  54. package/lancedb/remote/table.ts +226 -0
  55. package/lancedb/sanitize.ts +73 -1
  56. package/lancedb/table.ts +344 -101
  57. package/lancedb/util.ts +69 -0
  58. package/native.d.ts +208 -0
  59. package/nodejs-artifacts/arrow.d.ts +42 -7
  60. package/nodejs-artifacts/arrow.js +6 -5
  61. package/nodejs-artifacts/connection.d.ts +55 -29
  62. package/nodejs-artifacts/connection.js +22 -74
  63. package/nodejs-artifacts/embedding/embedding_function.d.ts +11 -3
  64. package/nodejs-artifacts/embedding/embedding_function.js +36 -12
  65. package/nodejs-artifacts/embedding/openai.d.ts +6 -5
  66. package/nodejs-artifacts/embedding/openai.js +4 -2
  67. package/nodejs-artifacts/embedding/registry.d.ts +10 -11
  68. package/nodejs-artifacts/embedding/registry.js +4 -0
  69. package/nodejs-artifacts/index.d.ts +51 -3
  70. package/nodejs-artifacts/index.js +28 -4
  71. package/nodejs-artifacts/merge.d.ts +54 -0
  72. package/nodejs-artifacts/merge.js +64 -0
  73. package/nodejs-artifacts/native.d.ts +34 -7
  74. package/nodejs-artifacts/native.js +26 -9
  75. package/nodejs-artifacts/query.d.ts +51 -16
  76. package/nodejs-artifacts/query.js +122 -21
  77. package/nodejs-artifacts/remote/client.d.ts +28 -0
  78. package/nodejs-artifacts/remote/client.js +172 -0
  79. package/nodejs-artifacts/remote/connection.d.ts +25 -0
  80. package/nodejs-artifacts/remote/connection.js +110 -0
  81. package/nodejs-artifacts/remote/index.d.ts +3 -0
  82. package/nodejs-artifacts/remote/index.js +9 -0
  83. package/nodejs-artifacts/remote/table.d.ts +42 -0
  84. package/nodejs-artifacts/remote/table.js +179 -0
  85. package/nodejs-artifacts/sanitize.d.ts +3 -2
  86. package/nodejs-artifacts/sanitize.js +55 -1
  87. package/nodejs-artifacts/table.d.ts +116 -25
  88. package/nodejs-artifacts/table.js +117 -233
  89. package/nodejs-artifacts/util.d.ts +14 -0
  90. package/nodejs-artifacts/util.js +65 -0
  91. package/package.json +25 -11
@@ -0,0 +1,149 @@
1
+ // --8<-- [start:imports]
2
+ import * as lancedb from "@lancedb/lancedb";
3
+ import * as arrow from "apache-arrow";
4
+ import { Field, FixedSizeList, Float16, Int32, Schema } from "apache-arrow";
5
+
6
+ // --8<-- [end:imports]
7
+
8
+ // --8<-- [start:connect]
9
+ const uri = "/tmp/lancedb/";
10
+ const db = await lancedb.connect(uri);
11
+ // --8<-- [end:connect]
12
+ {
13
+ // --8<-- [start:create_table]
14
+ const data = [
15
+ { vector: [3.1, 4.1], item: "foo", price: 10.0 },
16
+ { vector: [5.9, 26.5], item: "bar", price: 20.0 },
17
+ ];
18
+ const _tbl = await db.createTable("myTable", data);
19
+ // --8<-- [end:create_table]
20
+ {
21
+ // --8<-- [start:create_table_exists_ok]
22
+ const _tbl = await db.createTable("myTable", data, {
23
+ existsOk: true,
24
+ });
25
+ // --8<-- [end:create_table_exists_ok]
26
+ }
27
+ {
28
+ // --8<-- [start:create_table_overwrite]
29
+ const _tbl = await db.createTable("myTable", data, {
30
+ mode: "overwrite",
31
+ });
32
+ // --8<-- [end:create_table_overwrite]
33
+ }
34
+ }
35
+
36
+ {
37
+ // --8<-- [start:create_table_with_schema]
38
+ const schema = new arrow.Schema([
39
+ new arrow.Field(
40
+ "vector",
41
+ new arrow.FixedSizeList(
42
+ 2,
43
+ new arrow.Field("item", new arrow.Float32(), true),
44
+ ),
45
+ ),
46
+ new arrow.Field("item", new arrow.Utf8(), true),
47
+ new arrow.Field("price", new arrow.Float32(), true),
48
+ ]);
49
+ const data = [
50
+ { vector: [3.1, 4.1], item: "foo", price: 10.0 },
51
+ { vector: [5.9, 26.5], item: "bar", price: 20.0 },
52
+ ];
53
+ const _tbl = await db.createTable("myTable", data, {
54
+ schema,
55
+ });
56
+ // --8<-- [end:create_table_with_schema]
57
+ }
58
+
59
+ {
60
+ // --8<-- [start:create_empty_table]
61
+ const schema = new arrow.Schema([
62
+ new arrow.Field(
63
+ "vector",
64
+ new arrow.FixedSizeList(
65
+ 2,
66
+ new arrow.Field("item", new arrow.Float32(), true),
67
+ ),
68
+ ),
69
+ ]);
70
+ const _tbl = await db.createEmptyTable("empty_table", schema);
71
+ // --8<-- [end:create_empty_table]
72
+ }
73
+ {
74
+ // --8<-- [start:open_table]
75
+ const _tbl = await db.openTable("myTable");
76
+ // --8<-- [end:open_table]
77
+ }
78
+
79
+ {
80
+ // --8<-- [start:table_names]
81
+ const tableNames = await db.tableNames();
82
+ console.log(tableNames);
83
+ // --8<-- [end:table_names]
84
+ }
85
+
86
+ const tbl = await db.openTable("myTable");
87
+ {
88
+ // --8<-- [start:add_data]
89
+ const data = [
90
+ { vector: [1.3, 1.4], item: "fizz", price: 100.0 },
91
+ { vector: [9.5, 56.2], item: "buzz", price: 200.0 },
92
+ ];
93
+ await tbl.add(data);
94
+ // --8<-- [end:add_data]
95
+ }
96
+ {
97
+ // --8<-- [start:vector_search]
98
+ const _res = await tbl.search([100, 100]).limit(2).toArray(); // toArray() is async: await it so _res holds rows, not a pending Promise
99
+ // --8<-- [end:vector_search]
100
+ }
101
+ {
102
+ // Append 1000 random rows to the table (the index is created right after this block).
103
+ // Array.from's map callback builds each row directly, with no intermediate fill()/map() pass.
104
+ const data = Array.from({ length: 1000 }, () => ({
105
+ vector: [Math.random(), Math.random()],
106
+ item: "autogen",
107
+ price: Math.round(Math.random() * 100),
108
+ }));
109
+
110
+ await tbl.add(data);
111
+ }
112
+
113
+ // --8<-- [start:create_index]
114
+ await tbl.createIndex("vector");
115
+ // --8<-- [end:create_index]
116
+
117
+ // --8<-- [start:delete_rows]
118
+ await tbl.delete('item = "fizz"');
119
+ // --8<-- [end:delete_rows]
120
+
121
+ // --8<-- [start:drop_table]
122
+ await db.dropTable("myTable");
123
+ // --8<-- [end:drop_table]
124
+ await db.dropTable("empty_table");
125
+
126
+ {
127
+ // --8<-- [start:create_f16_table]
128
+ const db = await lancedb.connect("/tmp/lancedb");
129
+ const dim = 16;
130
+ const total = 10;
131
+ const f16Schema = new Schema([
132
+ new Field("id", new Int32()),
133
+ new Field(
134
+ "vector",
135
+ new FixedSizeList(dim, new Field("item", new Float16(), true)),
136
+ false,
137
+ ),
138
+ ]);
139
+ const data = lancedb.makeArrowTable(
140
+ Array.from(Array(total), (_, i) => ({
141
+ id: i,
142
+ vector: Array.from(Array(dim), Math.random),
143
+ })),
144
+ { schema: f16Schema },
145
+ );
146
+ const _table = await db.createTable("f16_tbl", data);
147
+ // --8<-- [end:create_f16_table]
148
+ await db.dropTable("f16_tbl");
149
+ }
@@ -0,0 +1,83 @@
1
+ // --8<-- [start:imports]
2
+ import * as lancedb from "@lancedb/lancedb";
3
+ import { LanceSchema, getRegistry, register } from "@lancedb/lancedb/embedding";
4
+ import { EmbeddingFunction } from "@lancedb/lancedb/embedding";
5
+ import { type Float, Float32, Utf8 } from "apache-arrow";
6
+ // --8<-- [end:imports]
7
+
8
+ {
9
+ // --8<-- [start:openai_embeddings]
10
+
11
+ const db = await lancedb.connect("/tmp/db");
12
+ const func = getRegistry()
13
+ .get("openai")
14
+ ?.create({ model: "text-embedding-ada-002" }) as EmbeddingFunction;
15
+
16
+ const wordsSchema = LanceSchema({
17
+ text: func.sourceField(new Utf8()),
18
+ vector: func.vectorField(),
19
+ });
20
+ const tbl = await db.createEmptyTable("words", wordsSchema, {
21
+ mode: "overwrite",
22
+ });
23
+ await tbl.add([{ text: "hello world" }, { text: "goodbye world" }]);
24
+
25
+ const query = "greetings";
26
+ const actual = (await (await tbl.search(query)).limit(1).toArray())[0];
27
+
28
+ // --8<-- [end:openai_embeddings]
29
+ console.log("result = ", actual.text);
30
+ }
31
+
32
+ {
33
+ // --8<-- [start:embedding_function]
34
+ const db = await lancedb.connect("/tmp/db");
35
+
36
+ @register("my_embedding")
37
+ class MyEmbeddingFunction extends EmbeddingFunction<string> {
38
+ toJSON(): object {
39
+ return {}; // this example has no configuration to serialize
40
+ }
41
+ ndims() {
42
+ return 3; // length of every placeholder vector returned below
43
+ }
44
+ embeddingDataType(): Float {
45
+ return new Float32();
46
+ }
47
+ async computeQueryEmbeddings(_data: string) {
48
+ // Placeholder for a real embedding function: the query text is ignored
49
+ return [1, 2, 3];
50
+ }
51
+ async computeSourceEmbeddings(data: string[]) {
52
+ // Placeholder for a real embedding function: one fresh vector per input, so rows never share an array
53
+ return data.map(() => [1, 2, 3]);
54
+ }
55
+ }
56
+
57
+ const func = new MyEmbeddingFunction();
58
+
59
+ const data = [{ text: "pepperoni" }, { text: "pineapple" }];
60
+
61
+ // Option 1: manually specify the embedding function
62
+ const table = await db.createTable("vectors", data, {
63
+ embeddingFunction: {
64
+ function: func,
65
+ sourceColumn: "text",
66
+ vectorColumn: "vector",
67
+ },
68
+ mode: "overwrite",
69
+ });
70
+
71
+ // Option 2: provide the embedding function through a schema
72
+
73
+ const schema = LanceSchema({
74
+ text: func.sourceField(new Utf8()),
75
+ vector: func.vectorField(),
76
+ });
77
+
78
+ const table2 = await db.createTable("vectors2", data, {
79
+ schema,
80
+ mode: "overwrite",
81
+ });
82
+ // --8<-- [end:embedding_function]
83
+ }
@@ -0,0 +1,34 @@
1
+ import * as lancedb from "@lancedb/lancedb";
2
+
3
+ const db = await lancedb.connect("data/sample-lancedb");
4
+
5
+ const data = Array.from({ length: 10_000 }, (_, i) => ({
6
+ vector: Array(1536).fill(i),
7
+ id: i,
8
+ item: `item ${i}`,
9
+ strId: `${i}`,
10
+ }));
11
+
12
+ const tbl = await db.createTable("myVectors", data, { mode: "overwrite" });
13
+
14
+ // --8<-- [start:search]
15
+ const _result = await tbl
16
+ .search(Array(1536).fill(0.5))
17
+ .limit(1)
18
+ .where("id = 10")
19
+ .toArray();
20
+ // --8<-- [end:search]
21
+
22
+ // --8<-- [start:vec_search]
23
+ await tbl
24
+ .search(Array(1536).fill(0))
25
+ .where("(item IN ('item 0', 'item 2')) AND (id > 10)")
26
+ .postfilter()
27
+ .toArray();
28
+ // --8<-- [end:vec_search]
29
+
30
+ // --8<-- [start:sql_search]
31
+ await tbl.query().where("id = 10").limit(10).toArray();
32
+ // --8<-- [end:sql_search]
33
+
34
+ console.log("SQL search: done");
@@ -0,0 +1,27 @@
1
+ {
2
+ "compilerOptions": {
3
+ // Enable latest features
4
+ "lib": ["ESNext", "DOM"],
5
+ "target": "ESNext",
6
+ "module": "ESNext",
7
+ "moduleDetection": "force",
8
+ "jsx": "react-jsx",
9
+ "allowJs": true,
10
+
11
+ // Bundler mode
12
+ "moduleResolution": "bundler",
13
+ "allowImportingTsExtensions": true,
14
+ "verbatimModuleSyntax": true,
15
+ "noEmit": true,
16
+
17
+ // Best practices
18
+ "strict": true,
19
+ "skipLibCheck": true,
20
+ "noFallthroughCasesInSwitch": true,
21
+
22
+ // Some stricter flags (disabled by default)
23
+ "noUnusedLocals": false,
24
+ "noUnusedParameters": false,
25
+ "noPropertyAccessFromIndexSignature": false
26
+ }
27
+ }
@@ -0,0 +1,79 @@
1
+ {
2
+ "name": "examples",
3
+ "version": "1.0.0",
4
+ "lockfileVersion": 3,
5
+ "requires": true,
6
+ "packages": {
7
+ "": {
8
+ "name": "examples",
9
+ "version": "1.0.0",
10
+ "license": "Apache-2.0",
11
+ "dependencies": {
12
+ "@lancedb/lancedb": "file:../"
13
+ },
14
+ "peerDependencies": {
15
+ "typescript": "^5.0.0"
16
+ }
17
+ },
18
+ "..": {
19
+ "name": "@lancedb/lancedb",
20
+ "version": "0.6.0",
21
+ "cpu": [
22
+ "x64",
23
+ "arm64"
24
+ ],
25
+ "license": "Apache 2.0",
26
+ "os": [
27
+ "darwin",
28
+ "linux",
29
+ "win32"
30
+ ],
31
+ "dependencies": {
32
+ "apache-arrow": "^15.0.0",
33
+ "axios": "^1.7.2",
34
+ "openai": "^4.29.2",
35
+ "reflect-metadata": "^0.2.2"
36
+ },
37
+ "devDependencies": {
38
+ "@aws-sdk/client-kms": "^3.33.0",
39
+ "@aws-sdk/client-s3": "^3.33.0",
40
+ "@biomejs/biome": "^1.7.3",
41
+ "@jest/globals": "^29.7.0",
42
+ "@napi-rs/cli": "^2.18.0",
43
+ "@types/axios": "^0.14.0",
44
+ "@types/jest": "^29.1.2",
45
+ "@types/tmp": "^0.2.6",
46
+ "apache-arrow-old": "npm:apache-arrow@13.0.0",
47
+ "eslint": "^8.57.0",
48
+ "jest": "^29.7.0",
49
+ "shx": "^0.3.4",
50
+ "tmp": "^0.2.3",
51
+ "ts-jest": "^29.1.2",
52
+ "typedoc": "^0.25.7",
53
+ "typedoc-plugin-markdown": "^3.17.1",
54
+ "typescript": "^5.3.3",
55
+ "typescript-eslint": "^7.1.0"
56
+ },
57
+ "engines": {
58
+ "node": ">= 18"
59
+ }
60
+ },
61
+ "node_modules/@lancedb/lancedb": {
62
+ "resolved": "..",
63
+ "link": true
64
+ },
65
+ "node_modules/typescript": {
66
+ "version": "5.5.2",
67
+ "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.2.tgz",
68
+ "integrity": "sha512-NcRtPEOsPFFWjobJEtfihkLCZCXZt/os3zf8nTxjVH3RvTSxjrCamJpbExGvYOF+tFHc3pA65qpdwPbzjohhew==",
69
+ "peer": true,
70
+ "bin": {
71
+ "tsc": "bin/tsc",
72
+ "tsserver": "bin/tsserver"
73
+ },
74
+ "engines": {
75
+ "node": ">=14.17"
76
+ }
77
+ }
78
+ }
79
+ }
@@ -0,0 +1,18 @@
1
+ {
2
+ "name": "examples",
3
+ "version": "1.0.0",
4
+ "description": "Examples for LanceDB",
5
+ "main": "index.js",
6
+ "type": "module",
7
+ "scripts": {
8
+ "test": "echo \"Error: no test specified\" && exit 1"
9
+ },
10
+ "author": "Lance Devs",
11
+ "license": "Apache-2.0",
12
+ "dependencies": {
13
+ "@lancedb/lancedb": "file:../"
14
+ },
15
+ "peerDependencies": {
16
+ "typescript": "^5.0.0"
17
+ }
18
+ }
@@ -0,0 +1,37 @@
1
+ import * as fs from "node:fs";
2
+ // --8<-- [start:import]
3
+ import * as lancedb from "@lancedb/lancedb";
4
+ // --8<-- [end:import]
5
+
6
+ async function setup() {
7
+ fs.rmSync("data/sample-lancedb", { recursive: true, force: true });
8
+ const db = await lancedb.connect("data/sample-lancedb");
9
+
10
+ const data = Array.from({ length: 10_000 }, (_, i) => ({
11
+ vector: Array(1536).fill(i),
12
+ id: `${i}`,
13
+ content: "",
14
+ longId: `${i}`,
15
+ }));
16
+
17
+ await db.createTable("my_vectors", data);
18
+ }
19
+
20
+ await setup();
21
+
22
+ // --8<-- [start:search1]
23
+ const db = await lancedb.connect("data/sample-lancedb");
24
+ const tbl = await db.openTable("my_vectors");
25
+
26
+ const _results1 = await tbl.search(Array(1536).fill(1.2)).limit(10).toArray();
27
+ // --8<-- [end:search1]
28
+
29
+ // --8<-- [start:search2]
30
+ const _results2 = await tbl
31
+ .search(Array(1536).fill(1.2))
32
+ .distanceType("cosine")
33
+ .limit(10)
34
+ .toArray();
35
+ // --8<-- [end:search2]
36
+
37
+ console.log("search: done");
package/lancedb/arrow.ts CHANGED
@@ -15,6 +15,7 @@
15
15
  import {
16
16
  Table as ArrowTable,
17
17
  Binary,
18
+ BufferType,
18
19
  DataType,
19
20
  Field,
20
21
  FixedSizeBinary,
@@ -31,18 +32,78 @@ import {
31
32
  Schema,
32
33
  Struct,
33
34
  Utf8,
34
- type Vector,
35
+ Vector,
35
36
  makeBuilder,
36
37
  makeData,
37
38
  type makeTable,
38
39
  vectorFromArray,
39
40
  } from "apache-arrow";
41
+ import { Buffers } from "apache-arrow/data";
40
42
  import { type EmbeddingFunction } from "./embedding/embedding_function";
41
43
  import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
42
- import { sanitizeField, sanitizeSchema, sanitizeType } from "./sanitize";
44
+ import {
45
+ sanitizeField,
46
+ sanitizeSchema,
47
+ sanitizeTable,
48
+ sanitizeType,
49
+ } from "./sanitize";
43
50
  export * from "apache-arrow";
44
-
45
- export function isArrowTable(value: object): value is ArrowTable {
51
+ export type SchemaLike =
52
+ | Schema
53
+ | {
54
+ fields: FieldLike[];
55
+ metadata: Map<string, string>;
56
+ get names(): unknown[];
57
+ };
58
+ export type FieldLike =
59
+ | Field
60
+ | {
61
+ type: string;
62
+ name: string;
63
+ nullable?: boolean;
64
+ metadata?: Map<string, string>;
65
+ };
66
+
67
+ export type DataLike =
68
+ // biome-ignore lint/suspicious/noExplicitAny: struct data is accepted whatever its child field types are
69
+ | import("apache-arrow").Data<Struct<any>>
70
+ | {
71
+ // biome-ignore lint/suspicious/noExplicitAny: the data type is deliberately unconstrained in this structural variant
72
+ type: any;
73
+ length: number;
74
+ offset: number;
75
+ stride: number;
76
+ nullable: boolean;
77
+ children: DataLike[];
78
+ get nullCount(): number;
79
+ // biome-ignore lint/suspicious/noExplicitAny: the data buffer's element type is not fixed for this structural variant
80
+ values: Buffers<any>[BufferType.DATA];
81
+ // biome-ignore lint/suspicious/noExplicitAny: the type-id buffer is indexed from an unconstrained Buffers map
82
+ typeIds: Buffers<any>[BufferType.TYPE];
83
+ // biome-ignore lint/suspicious/noExplicitAny: the validity buffer is indexed from an unconstrained Buffers map
84
+ nullBitmap: Buffers<any>[BufferType.VALIDITY];
85
+ // biome-ignore lint/suspicious/noExplicitAny: the offsets buffer is indexed from an unconstrained Buffers map
86
+ valueOffsets: Buffers<any>[BufferType.OFFSET];
87
+ };
88
+
89
+ export type RecordBatchLike =
90
+ | RecordBatch
91
+ | {
92
+ schema: SchemaLike;
93
+ data: DataLike;
94
+ };
95
+
96
+ export type TableLike =
97
+ | ArrowTable
98
+ | { schema: SchemaLike; batches: RecordBatchLike[] };
99
+
100
+ export type IntoVector =
101
+ | Float32Array
102
+ | Float64Array
103
+ | number[]
104
+ | Promise<Float32Array | Float64Array | number[]>;
105
+
106
+ export function isArrowTable(value: object): value is TableLike {
46
107
  if (value instanceof ArrowTable) return true;
47
108
  return "schema" in value && "batches" in value;
48
109
  }
@@ -133,7 +194,7 @@ export function isFixedSizeList(value: unknown): value is FixedSizeList {
133
194
  }
134
195
 
135
196
  /** Data type accepted by NodeJS SDK */
136
- export type Data = Record<string, unknown>[] | ArrowTable;
197
+ export type Data = Record<string, unknown>[] | TableLike;
137
198
 
138
199
  /*
139
200
  * Options to control how a column should be converted to a vector array
@@ -160,7 +221,7 @@ export class MakeArrowTableOptions {
160
221
  * The schema must be specified if there are no records (e.g. to make
161
222
  * an empty table)
162
223
  */
163
- schema?: Schema;
224
+ schema?: SchemaLike;
164
225
 
165
226
  /*
166
227
  * Mapping from vector column name to expected type
@@ -182,6 +243,7 @@ export class MakeArrowTableOptions {
182
243
  vector: new VectorColumnOptions(),
183
244
  };
184
245
  embeddings?: EmbeddingFunction<unknown>;
246
+ embeddingFunction?: EmbeddingFunctionConfig;
185
247
 
186
248
  /**
187
249
  * If true then string columns will be encoded with dictionary encoding
@@ -306,7 +368,11 @@ export function makeArrowTable(
306
368
  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
307
369
  if (opt.schema !== undefined && opt.schema !== null) {
308
370
  opt.schema = sanitizeSchema(opt.schema);
309
- opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
371
+ opt.schema = validateSchemaEmbeddings(
372
+ opt.schema as Schema,
373
+ data,
374
+ options?.embeddingFunction,
375
+ );
310
376
  }
311
377
  const columns: Record<string, Vector> = {};
312
378
  // TODO: sample dataset to find missing columns
@@ -387,7 +453,7 @@ export function makeArrowTable(
387
453
  // `new ArrowTable(schema, batches)` which does not do any schema inference
388
454
  const firstTable = new ArrowTable(columns);
389
455
  const batchesFixed = firstTable.batches.map(
390
- (batch) => new RecordBatch(opt.schema!, batch.data),
456
+ (batch) => new RecordBatch(opt.schema as Schema, batch.data),
391
457
  );
392
458
  let schema: Schema;
393
459
  if (metadata !== undefined) {
@@ -400,9 +466,9 @@ export function makeArrowTable(
400
466
  }
401
467
  }
402
468
 
403
- schema = new Schema(opt.schema.fields, schemaMetadata);
469
+ schema = new Schema(opt.schema.fields as Field[], schemaMetadata);
404
470
  } else {
405
- schema = opt.schema;
471
+ schema = opt.schema as Schema;
406
472
  }
407
473
  return new ArrowTable(schema, batchesFixed);
408
474
  }
@@ -418,7 +484,7 @@ export function makeArrowTable(
418
484
  * Create an empty Arrow table with the provided schema
419
485
  */
420
486
  export function makeEmptyTable(
421
- schema: Schema,
487
+ schema: SchemaLike,
422
488
  metadata?: Map<string, string>,
423
489
  ): ArrowTable {
424
490
  return makeArrowTable([], { schema }, metadata);
@@ -545,7 +611,6 @@ async function applyEmbeddingsFromMetadata(
545
611
  dtype,
546
612
  );
547
613
  }
548
-
549
614
  const vector = makeVector(vectors, destType);
550
615
  columns[destColumn] = vector;
551
616
  }
@@ -557,18 +622,17 @@ async function applyEmbeddingsFromMetadata(
557
622
  async function applyEmbeddings<T>(
558
623
  table: ArrowTable,
559
624
  embeddings?: EmbeddingFunctionConfig,
560
- schema?: Schema,
625
+ schema?: SchemaLike,
561
626
  ): Promise<ArrowTable> {
627
+ if (schema !== undefined && schema !== null) {
628
+ schema = sanitizeSchema(schema);
629
+ }
562
630
  if (schema?.metadata.has("embedding_functions")) {
563
- return applyEmbeddingsFromMetadata(table, schema!);
631
+ return applyEmbeddingsFromMetadata(table, schema! as Schema);
564
632
  } else if (embeddings == null || embeddings === undefined) {
565
633
  return table;
566
634
  }
567
635
 
568
- if (schema !== undefined && schema !== null) {
569
- schema = sanitizeSchema(schema);
570
- }
571
-
572
636
  // Convert from ArrowTable to Record<String, Vector>
573
637
  const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
574
638
  const name = table.schema.fields[idx].name;
@@ -644,7 +708,7 @@ async function applyEmbeddings<T>(
644
708
  `When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`,
645
709
  );
646
710
  }
647
- return alignTable(newTable, schema);
711
+ return alignTable(newTable, schema as Schema);
648
712
  }
649
713
  return newTable;
650
714
  }
@@ -738,7 +802,7 @@ export async function fromRecordsToStreamBuffer(
738
802
  export async function fromTableToBuffer(
739
803
  table: ArrowTable,
740
804
  embeddings?: EmbeddingFunctionConfig,
741
- schema?: Schema,
805
+ schema?: SchemaLike,
742
806
  ): Promise<Buffer> {
743
807
  if (schema !== undefined && schema !== null) {
744
808
  schema = sanitizeSchema(schema);
@@ -765,7 +829,7 @@ export async function fromDataToBuffer(
765
829
  schema = sanitizeSchema(schema);
766
830
  }
767
831
  if (isArrowTable(data)) {
768
- return fromTableToBuffer(data, embeddings, schema);
832
+ return fromTableToBuffer(sanitizeTable(data), embeddings, schema);
769
833
  } else {
770
834
  const table = await convertToTable(data, embeddings, { schema });
771
835
  return fromTableToBuffer(table);
@@ -783,7 +847,7 @@ export async function fromDataToBuffer(
783
847
  export async function fromTableToStreamBuffer(
784
848
  table: ArrowTable,
785
849
  embeddings?: EmbeddingFunctionConfig,
786
- schema?: Schema,
850
+ schema?: SchemaLike,
787
851
  ): Promise<Buffer> {
788
852
  const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
789
853
  const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
@@ -835,7 +899,7 @@ export function createEmptyTable(schema: Schema): ArrowTable {
835
899
  function validateSchemaEmbeddings(
836
900
  schema: Schema,
837
901
  data: Array<Record<string, unknown>>,
838
- embeddings: EmbeddingFunction<unknown> | undefined,
902
+ embeddings: EmbeddingFunctionConfig | undefined,
839
903
  ) {
840
904
  const fields = [];
841
905
  const missingEmbeddingFields = [];
@@ -848,7 +912,6 @@ function validateSchemaEmbeddings(
848
912
  for (let field of schema.fields) {
849
913
  if (isFixedSizeList(field.type)) {
850
914
  field = sanitizeField(field);
851
-
852
915
  if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
853
916
  if (schema.metadata.has("embedding_functions")) {
854
917
  const embeddings = JSON.parse(