@lancedb/lancedb 0.4.20 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -14
- package/biome.json +142 -0
- package/dist/arrow.d.ts +35 -9
- package/dist/arrow.js +247 -19
- package/dist/connection.d.ts +4 -1
- package/dist/connection.js +11 -5
- package/dist/embedding/embedding_function.d.ts +54 -28
- package/dist/embedding/embedding_function.js +71 -10
- package/dist/embedding/index.d.ts +28 -2
- package/dist/embedding/index.js +111 -4
- package/dist/embedding/openai.d.ts +16 -7
- package/dist/embedding/openai.js +62 -12
- package/dist/embedding/registry.d.ts +54 -0
- package/dist/embedding/registry.js +123 -0
- package/dist/native.d.ts +26 -0
- package/dist/query.d.ts +1 -1
- package/dist/query.js +7 -6
- package/dist/sanitize.d.ts +22 -1
- package/dist/sanitize.js +126 -113
- package/dist/table.d.ts +50 -4
- package/dist/table.js +47 -5
- package/lancedb/arrow.ts +283 -49
- package/lancedb/connection.ts +27 -6
- package/lancedb/embedding/embedding_function.ts +126 -42
- package/lancedb/embedding/index.ts +113 -2
- package/lancedb/embedding/openai.ts +62 -16
- package/lancedb/embedding/registry.ts +172 -0
- package/lancedb/query.ts +9 -6
- package/lancedb/sanitize.ts +62 -62
- package/lancedb/table.ts +72 -5
- package/nodejs-artifacts/arrow.d.ts +35 -9
- package/nodejs-artifacts/arrow.js +247 -19
- package/nodejs-artifacts/connection.d.ts +4 -1
- package/nodejs-artifacts/connection.js +11 -5
- package/nodejs-artifacts/embedding/embedding_function.d.ts +54 -28
- package/nodejs-artifacts/embedding/embedding_function.js +71 -10
- package/nodejs-artifacts/embedding/index.d.ts +28 -2
- package/nodejs-artifacts/embedding/index.js +111 -4
- package/nodejs-artifacts/embedding/openai.d.ts +16 -7
- package/nodejs-artifacts/embedding/openai.js +62 -12
- package/nodejs-artifacts/embedding/registry.d.ts +54 -0
- package/nodejs-artifacts/embedding/registry.js +123 -0
- package/nodejs-artifacts/native.d.ts +26 -0
- package/nodejs-artifacts/query.d.ts +1 -1
- package/nodejs-artifacts/query.js +7 -6
- package/nodejs-artifacts/sanitize.d.ts +22 -1
- package/nodejs-artifacts/sanitize.js +126 -113
- package/nodejs-artifacts/table.d.ts +50 -4
- package/nodejs-artifacts/table.js +47 -5
- package/package.json +23 -21
- package/tsconfig.json +3 -1
- package/.eslintignore +0 -3
- package/eslint.config.js +0 -28
package/README.md
CHANGED
|
@@ -43,29 +43,20 @@ npm run test
|
|
|
43
43
|
|
|
44
44
|
### Running lint / format
|
|
45
45
|
|
|
46
|
-
LanceDb uses
|
|
47
|
-
|
|
48
|
-
set to true. Also, if your vscode root folder is the repo root then you will need to set
|
|
49
|
-
the eslint.workingDirectories to ["nodejs"]. To manually lint your code you can run:
|
|
46
|
+
LanceDB uses [biome](https://biomejs.dev/) for linting and formatting. If you are using VSCode you will need to install the official [Biome](https://marketplace.visualstudio.com/items?itemName=biomejs.biome) extension.
|
|
47
|
+
To manually lint your code you can run:
|
|
50
48
|
|
|
51
49
|
```sh
|
|
52
50
|
npm run lint
|
|
53
51
|
```
|
|
54
52
|
|
|
55
|
-
|
|
56
|
-
"Prettier - Code formatter" extension. You should then configure it to be the default formatter
|
|
57
|
-
for typescript and you should enable format on save. To manually check your code's format you
|
|
58
|
-
can run:
|
|
53
|
+
To automatically fix all fixable issues:
|
|
59
54
|
|
|
60
55
|
```sh
|
|
61
|
-
npm run
|
|
56
|
+
npm run lint-fix
|
|
62
57
|
```
|
|
63
58
|
|
|
64
|
-
If you
|
|
65
|
-
|
|
66
|
-
```sh
|
|
67
|
-
npx prettier --write .
|
|
68
|
-
```
|
|
59
|
+
If you do not have your workspace root set to the `nodejs` directory, unfortunately the extension will not work. You can still run the linting and formatting commands manually.
|
|
69
60
|
|
|
70
61
|
### Generating docs
|
|
71
62
|
|
package/biome.json
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://biomejs.dev/schemas/1.7.3/schema.json",
|
|
3
|
+
"organizeImports": {
|
|
4
|
+
"enabled": true
|
|
5
|
+
},
|
|
6
|
+
"files": {
|
|
7
|
+
"ignore": [
|
|
8
|
+
"**/dist/**/*",
|
|
9
|
+
"**/native.js",
|
|
10
|
+
"**/native.d.ts",
|
|
11
|
+
"**/npm/**/*",
|
|
12
|
+
"**/.vscode/**"
|
|
13
|
+
]
|
|
14
|
+
},
|
|
15
|
+
"formatter": {
|
|
16
|
+
"indentStyle": "space"
|
|
17
|
+
},
|
|
18
|
+
"linter": {
|
|
19
|
+
"enabled": true,
|
|
20
|
+
"rules": {
|
|
21
|
+
"recommended": false,
|
|
22
|
+
"complexity": {
|
|
23
|
+
"noBannedTypes": "error",
|
|
24
|
+
"noExtraBooleanCast": "error",
|
|
25
|
+
"noMultipleSpacesInRegularExpressionLiterals": "error",
|
|
26
|
+
"noUselessCatch": "error",
|
|
27
|
+
"noUselessThisAlias": "error",
|
|
28
|
+
"noUselessTypeConstraint": "error",
|
|
29
|
+
"noWith": "error"
|
|
30
|
+
},
|
|
31
|
+
"correctness": {
|
|
32
|
+
"noConstAssign": "error",
|
|
33
|
+
"noConstantCondition": "error",
|
|
34
|
+
"noEmptyCharacterClassInRegex": "error",
|
|
35
|
+
"noEmptyPattern": "error",
|
|
36
|
+
"noGlobalObjectCalls": "error",
|
|
37
|
+
"noInnerDeclarations": "error",
|
|
38
|
+
"noInvalidConstructorSuper": "error",
|
|
39
|
+
"noNewSymbol": "error",
|
|
40
|
+
"noNonoctalDecimalEscape": "error",
|
|
41
|
+
"noPrecisionLoss": "error",
|
|
42
|
+
"noSelfAssign": "error",
|
|
43
|
+
"noSetterReturn": "error",
|
|
44
|
+
"noSwitchDeclarations": "error",
|
|
45
|
+
"noUndeclaredVariables": "error",
|
|
46
|
+
"noUnreachable": "error",
|
|
47
|
+
"noUnreachableSuper": "error",
|
|
48
|
+
"noUnsafeFinally": "error",
|
|
49
|
+
"noUnsafeOptionalChaining": "error",
|
|
50
|
+
"noUnusedLabels": "error",
|
|
51
|
+
"noUnusedVariables": "warn",
|
|
52
|
+
"useIsNan": "error",
|
|
53
|
+
"useValidForDirection": "error",
|
|
54
|
+
"useYield": "error"
|
|
55
|
+
},
|
|
56
|
+
"style": {
|
|
57
|
+
"noNamespace": "error",
|
|
58
|
+
"useAsConstAssertion": "error",
|
|
59
|
+
"useBlockStatements": "off",
|
|
60
|
+
"useNamingConvention": {
|
|
61
|
+
"level": "error",
|
|
62
|
+
"options": {
|
|
63
|
+
"strictCase": false
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
"suspicious": {
|
|
68
|
+
"noAssignInExpressions": "error",
|
|
69
|
+
"noAsyncPromiseExecutor": "error",
|
|
70
|
+
"noCatchAssign": "error",
|
|
71
|
+
"noClassAssign": "error",
|
|
72
|
+
"noCompareNegZero": "error",
|
|
73
|
+
"noControlCharactersInRegex": "error",
|
|
74
|
+
"noDebugger": "error",
|
|
75
|
+
"noDuplicateCase": "error",
|
|
76
|
+
"noDuplicateClassMembers": "error",
|
|
77
|
+
"noDuplicateObjectKeys": "error",
|
|
78
|
+
"noDuplicateParameters": "error",
|
|
79
|
+
"noEmptyBlockStatements": "error",
|
|
80
|
+
"noExplicitAny": "error",
|
|
81
|
+
"noExtraNonNullAssertion": "error",
|
|
82
|
+
"noFallthroughSwitchClause": "error",
|
|
83
|
+
"noFunctionAssign": "error",
|
|
84
|
+
"noGlobalAssign": "error",
|
|
85
|
+
"noImportAssign": "error",
|
|
86
|
+
"noMisleadingCharacterClass": "error",
|
|
87
|
+
"noMisleadingInstantiator": "error",
|
|
88
|
+
"noPrototypeBuiltins": "error",
|
|
89
|
+
"noRedeclare": "error",
|
|
90
|
+
"noShadowRestrictedNames": "error",
|
|
91
|
+
"noUnsafeDeclarationMerging": "error",
|
|
92
|
+
"noUnsafeNegation": "error",
|
|
93
|
+
"useGetterReturn": "error",
|
|
94
|
+
"useValidTypeof": "error"
|
|
95
|
+
}
|
|
96
|
+
},
|
|
97
|
+
"ignore": ["**/dist/**/*", "**/native.js", "**/native.d.ts"]
|
|
98
|
+
},
|
|
99
|
+
"javascript": {
|
|
100
|
+
"globals": []
|
|
101
|
+
},
|
|
102
|
+
"overrides": [
|
|
103
|
+
{
|
|
104
|
+
"include": [
|
|
105
|
+
"**/*.ts",
|
|
106
|
+
"**/*.tsx",
|
|
107
|
+
"**/*.mts",
|
|
108
|
+
"**/*.cts",
|
|
109
|
+
"__test__/*.test.ts"
|
|
110
|
+
],
|
|
111
|
+
"linter": {
|
|
112
|
+
"rules": {
|
|
113
|
+
"correctness": {
|
|
114
|
+
"noConstAssign": "off",
|
|
115
|
+
"noGlobalObjectCalls": "off",
|
|
116
|
+
"noInvalidConstructorSuper": "off",
|
|
117
|
+
"noNewSymbol": "off",
|
|
118
|
+
"noSetterReturn": "off",
|
|
119
|
+
"noUndeclaredVariables": "off",
|
|
120
|
+
"noUnreachable": "off",
|
|
121
|
+
"noUnreachableSuper": "off"
|
|
122
|
+
},
|
|
123
|
+
"style": {
|
|
124
|
+
"noArguments": "error",
|
|
125
|
+
"noVar": "error",
|
|
126
|
+
"useConst": "error"
|
|
127
|
+
},
|
|
128
|
+
"suspicious": {
|
|
129
|
+
"noDuplicateClassMembers": "off",
|
|
130
|
+
"noDuplicateObjectKeys": "off",
|
|
131
|
+
"noDuplicateParameters": "off",
|
|
132
|
+
"noFunctionAssign": "off",
|
|
133
|
+
"noImportAssign": "off",
|
|
134
|
+
"noRedeclare": "off",
|
|
135
|
+
"noUnsafeNegation": "off",
|
|
136
|
+
"useGetterReturn": "off"
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
]
|
|
142
|
+
}
|
package/dist/arrow.d.ts
CHANGED
|
@@ -1,6 +1,29 @@
|
|
|
1
1
|
/// <reference types="node" />
|
|
2
|
-
import {
|
|
2
|
+
import { Table as ArrowTable, Binary, DataType, FixedSizeBinary, FixedSizeList, Float, Int, LargeBinary, List, Null, Schema, Struct, Utf8 } from "apache-arrow";
|
|
3
3
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
|
4
|
+
import { EmbeddingFunctionConfig } from "./embedding/registry";
|
|
5
|
+
export * from "apache-arrow";
|
|
6
|
+
export declare function isArrowTable(value: object): value is ArrowTable;
|
|
7
|
+
export declare function isDataType(value: unknown): value is DataType;
|
|
8
|
+
export declare function isNull(value: unknown): value is Null;
|
|
9
|
+
export declare function isInt(value: unknown): value is Int;
|
|
10
|
+
export declare function isFloat(value: unknown): value is Float;
|
|
11
|
+
export declare function isBinary(value: unknown): value is Binary;
|
|
12
|
+
export declare function isLargeBinary(value: unknown): value is LargeBinary;
|
|
13
|
+
export declare function isUtf8(value: unknown): value is Utf8;
|
|
14
|
+
export declare function isLargeUtf8(value: unknown): value is Utf8;
|
|
15
|
+
export declare function isBool(value: unknown): value is Utf8;
|
|
16
|
+
export declare function isDecimal(value: unknown): value is Utf8;
|
|
17
|
+
export declare function isDate(value: unknown): value is Utf8;
|
|
18
|
+
export declare function isTime(value: unknown): value is Utf8;
|
|
19
|
+
export declare function isTimestamp(value: unknown): value is Utf8;
|
|
20
|
+
export declare function isInterval(value: unknown): value is Utf8;
|
|
21
|
+
export declare function isDuration(value: unknown): value is Utf8;
|
|
22
|
+
export declare function isList(value: unknown): value is List;
|
|
23
|
+
export declare function isStruct(value: unknown): value is Struct;
|
|
24
|
+
export declare function isUnion(value: unknown): value is Struct;
|
|
25
|
+
export declare function isFixedSizeBinary(value: unknown): value is FixedSizeBinary;
|
|
26
|
+
export declare function isFixedSizeList(value: unknown): value is FixedSizeList;
|
|
4
27
|
/** Data type accepted by NodeJS SDK */
|
|
5
28
|
export type Data = Record<string, unknown>[] | ArrowTable;
|
|
6
29
|
export declare class VectorColumnOptions {
|
|
@@ -12,6 +35,7 @@ export declare class VectorColumnOptions {
|
|
|
12
35
|
export declare class MakeArrowTableOptions {
|
|
13
36
|
schema?: Schema;
|
|
14
37
|
vectorColumns: Record<string, VectorColumnOptions>;
|
|
38
|
+
embeddings?: EmbeddingFunction<unknown>;
|
|
15
39
|
/**
|
|
16
40
|
* If true then string columns will be encoded with dictionary encoding
|
|
17
41
|
*
|
|
@@ -116,11 +140,11 @@ export declare class MakeArrowTableOptions {
|
|
|
116
140
|
* assert.deepEqual(table.schema, schema)
|
|
117
141
|
* ```
|
|
118
142
|
*/
|
|
119
|
-
export declare function makeArrowTable(data: Array<Record<string, unknown>>, options?: Partial<MakeArrowTableOptions>): ArrowTable;
|
|
143
|
+
export declare function makeArrowTable(data: Array<Record<string, unknown>>, options?: Partial<MakeArrowTableOptions>, metadata?: Map<string, string>): ArrowTable;
|
|
120
144
|
/**
|
|
121
145
|
* Create an empty Arrow table with the provided schema
|
|
122
146
|
*/
|
|
123
|
-
export declare function makeEmptyTable(schema: Schema): ArrowTable;
|
|
147
|
+
export declare function makeEmptyTable(schema: Schema, metadata?: Map<string, string>): ArrowTable;
|
|
124
148
|
/**
|
|
125
149
|
* Convert an Array of records into an Arrow Table, optionally applying an
|
|
126
150
|
* embeddings function to it.
|
|
@@ -139,7 +163,9 @@ export declare function makeEmptyTable(schema: Schema): ArrowTable;
|
|
|
139
163
|
* embedding columns. If no schema is provided then embedding columns will
|
|
140
164
|
* be placed at the end of the table, after all of the input columns.
|
|
141
165
|
*/
|
|
142
|
-
export declare function convertToTable
|
|
166
|
+
export declare function convertToTable(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, makeTableOptions?: Partial<MakeArrowTableOptions>): Promise<ArrowTable>;
|
|
167
|
+
/** Creates the Arrow Type for a Vector column with dimension `dim` */
|
|
168
|
+
export declare function newVectorType<T extends Float>(dim: number, innerType: T): FixedSizeList<T>;
|
|
143
169
|
/**
|
|
144
170
|
* Serialize an Array of records into a buffer using the Arrow IPC File serialization
|
|
145
171
|
*
|
|
@@ -147,7 +173,7 @@ export declare function convertToTable<T>(data: Array<Record<string, unknown>>,
|
|
|
147
173
|
*
|
|
148
174
|
* `schema` is required if data is empty
|
|
149
175
|
*/
|
|
150
|
-
export declare function fromRecordsToBuffer
|
|
176
|
+
export declare function fromRecordsToBuffer(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
151
177
|
/**
|
|
152
178
|
* Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
|
|
153
179
|
*
|
|
@@ -155,7 +181,7 @@ export declare function fromRecordsToBuffer<T>(data: Array<Record<string, unknow
|
|
|
155
181
|
*
|
|
156
182
|
* `schema` is required if data is empty
|
|
157
183
|
*/
|
|
158
|
-
export declare function fromRecordsToStreamBuffer
|
|
184
|
+
export declare function fromRecordsToStreamBuffer(data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
159
185
|
/**
|
|
160
186
|
* Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
|
|
161
187
|
*
|
|
@@ -164,7 +190,7 @@ export declare function fromRecordsToStreamBuffer<T>(data: Array<Record<string,
|
|
|
164
190
|
*
|
|
165
191
|
* `schema` is required if the table is empty
|
|
166
192
|
*/
|
|
167
|
-
export declare function fromTableToBuffer
|
|
193
|
+
export declare function fromTableToBuffer(table: ArrowTable, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
168
194
|
/**
|
|
169
195
|
* Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
|
|
170
196
|
*
|
|
@@ -173,7 +199,7 @@ export declare function fromTableToBuffer<T>(table: ArrowTable, embeddings?: Emb
|
|
|
173
199
|
*
|
|
174
200
|
* `schema` is required if the table is empty
|
|
175
201
|
*/
|
|
176
|
-
export declare function fromDataToBuffer
|
|
202
|
+
export declare function fromDataToBuffer(data: Data, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
177
203
|
/**
|
|
178
204
|
* Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
|
|
179
205
|
*
|
|
@@ -182,7 +208,7 @@ export declare function fromDataToBuffer<T>(data: Data, embeddings?: EmbeddingFu
|
|
|
182
208
|
*
|
|
183
209
|
* `schema` is required if the table is empty
|
|
184
210
|
*/
|
|
185
|
-
export declare function fromTableToStreamBuffer
|
|
211
|
+
export declare function fromTableToStreamBuffer(table: ArrowTable, embeddings?: EmbeddingFunctionConfig, schema?: Schema): Promise<Buffer>;
|
|
186
212
|
/**
|
|
187
213
|
* Create an empty table with the given schema
|
|
188
214
|
*/
|
package/dist/arrow.js
CHANGED
|
@@ -12,10 +12,133 @@
|
|
|
12
12
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
13
|
// See the License for the specific language governing permissions and
|
|
14
14
|
// limitations under the License.
|
|
15
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
16
|
+
if (k2 === undefined) k2 = k;
|
|
17
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
18
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
19
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
20
|
+
}
|
|
21
|
+
Object.defineProperty(o, k2, desc);
|
|
22
|
+
}) : (function(o, m, k, k2) {
|
|
23
|
+
if (k2 === undefined) k2 = k;
|
|
24
|
+
o[k2] = m[k];
|
|
25
|
+
}));
|
|
26
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
27
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
28
|
+
};
|
|
15
29
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
|
-
exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = void 0;
|
|
30
|
+
exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.newVectorType = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = exports.isFixedSizeList = exports.isFixedSizeBinary = exports.isUnion = exports.isStruct = exports.isList = exports.isDuration = exports.isInterval = exports.isTimestamp = exports.isTime = exports.isDate = exports.isDecimal = exports.isBool = exports.isLargeUtf8 = exports.isUtf8 = exports.isLargeBinary = exports.isBinary = exports.isFloat = exports.isInt = exports.isNull = exports.isDataType = exports.isArrowTable = void 0;
|
|
17
31
|
const apache_arrow_1 = require("apache-arrow");
|
|
32
|
+
const registry_1 = require("./embedding/registry");
|
|
18
33
|
const sanitize_1 = require("./sanitize");
|
|
34
|
+
__exportStar(require("apache-arrow"), exports);
|
|
35
|
+
function isArrowTable(value) {
|
|
36
|
+
if (value instanceof apache_arrow_1.Table)
|
|
37
|
+
return true;
|
|
38
|
+
return "schema" in value && "batches" in value;
|
|
39
|
+
}
|
|
40
|
+
exports.isArrowTable = isArrowTable;
|
|
41
|
+
function isDataType(value) {
|
|
42
|
+
return (value instanceof apache_arrow_1.DataType ||
|
|
43
|
+
apache_arrow_1.DataType.isNull(value) ||
|
|
44
|
+
apache_arrow_1.DataType.isInt(value) ||
|
|
45
|
+
apache_arrow_1.DataType.isFloat(value) ||
|
|
46
|
+
apache_arrow_1.DataType.isBinary(value) ||
|
|
47
|
+
apache_arrow_1.DataType.isLargeBinary(value) ||
|
|
48
|
+
apache_arrow_1.DataType.isUtf8(value) ||
|
|
49
|
+
apache_arrow_1.DataType.isLargeUtf8(value) ||
|
|
50
|
+
apache_arrow_1.DataType.isBool(value) ||
|
|
51
|
+
apache_arrow_1.DataType.isDecimal(value) ||
|
|
52
|
+
apache_arrow_1.DataType.isDate(value) ||
|
|
53
|
+
apache_arrow_1.DataType.isTime(value) ||
|
|
54
|
+
apache_arrow_1.DataType.isTimestamp(value) ||
|
|
55
|
+
apache_arrow_1.DataType.isInterval(value) ||
|
|
56
|
+
apache_arrow_1.DataType.isDuration(value) ||
|
|
57
|
+
apache_arrow_1.DataType.isList(value) ||
|
|
58
|
+
apache_arrow_1.DataType.isStruct(value) ||
|
|
59
|
+
apache_arrow_1.DataType.isUnion(value) ||
|
|
60
|
+
apache_arrow_1.DataType.isFixedSizeBinary(value) ||
|
|
61
|
+
apache_arrow_1.DataType.isFixedSizeList(value) ||
|
|
62
|
+
apache_arrow_1.DataType.isMap(value) ||
|
|
63
|
+
apache_arrow_1.DataType.isDictionary(value));
|
|
64
|
+
}
|
|
65
|
+
exports.isDataType = isDataType;
|
|
66
|
+
function isNull(value) {
|
|
67
|
+
return value instanceof apache_arrow_1.Null || apache_arrow_1.DataType.isNull(value);
|
|
68
|
+
}
|
|
69
|
+
exports.isNull = isNull;
|
|
70
|
+
function isInt(value) {
|
|
71
|
+
return value instanceof apache_arrow_1.Int || apache_arrow_1.DataType.isInt(value);
|
|
72
|
+
}
|
|
73
|
+
exports.isInt = isInt;
|
|
74
|
+
function isFloat(value) {
|
|
75
|
+
return value instanceof apache_arrow_1.Float || apache_arrow_1.DataType.isFloat(value);
|
|
76
|
+
}
|
|
77
|
+
exports.isFloat = isFloat;
|
|
78
|
+
function isBinary(value) {
|
|
79
|
+
return value instanceof apache_arrow_1.Binary || apache_arrow_1.DataType.isBinary(value);
|
|
80
|
+
}
|
|
81
|
+
exports.isBinary = isBinary;
|
|
82
|
+
function isLargeBinary(value) {
|
|
83
|
+
return value instanceof apache_arrow_1.LargeBinary || apache_arrow_1.DataType.isLargeBinary(value);
|
|
84
|
+
}
|
|
85
|
+
exports.isLargeBinary = isLargeBinary;
|
|
86
|
+
function isUtf8(value) {
|
|
87
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isUtf8(value);
|
|
88
|
+
}
|
|
89
|
+
exports.isUtf8 = isUtf8;
|
|
90
|
+
function isLargeUtf8(value) {
|
|
91
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isLargeUtf8(value);
|
|
92
|
+
}
|
|
93
|
+
exports.isLargeUtf8 = isLargeUtf8;
|
|
94
|
+
function isBool(value) {
|
|
95
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isBool(value);
|
|
96
|
+
}
|
|
97
|
+
exports.isBool = isBool;
|
|
98
|
+
function isDecimal(value) {
|
|
99
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDecimal(value);
|
|
100
|
+
}
|
|
101
|
+
exports.isDecimal = isDecimal;
|
|
102
|
+
function isDate(value) {
|
|
103
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDate(value);
|
|
104
|
+
}
|
|
105
|
+
exports.isDate = isDate;
|
|
106
|
+
function isTime(value) {
|
|
107
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTime(value);
|
|
108
|
+
}
|
|
109
|
+
exports.isTime = isTime;
|
|
110
|
+
function isTimestamp(value) {
|
|
111
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTimestamp(value);
|
|
112
|
+
}
|
|
113
|
+
exports.isTimestamp = isTimestamp;
|
|
114
|
+
function isInterval(value) {
|
|
115
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isInterval(value);
|
|
116
|
+
}
|
|
117
|
+
exports.isInterval = isInterval;
|
|
118
|
+
function isDuration(value) {
|
|
119
|
+
return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDuration(value);
|
|
120
|
+
}
|
|
121
|
+
exports.isDuration = isDuration;
|
|
122
|
+
function isList(value) {
|
|
123
|
+
return value instanceof apache_arrow_1.List || apache_arrow_1.DataType.isList(value);
|
|
124
|
+
}
|
|
125
|
+
exports.isList = isList;
|
|
126
|
+
function isStruct(value) {
|
|
127
|
+
return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isStruct(value);
|
|
128
|
+
}
|
|
129
|
+
exports.isStruct = isStruct;
|
|
130
|
+
function isUnion(value) {
|
|
131
|
+
return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isUnion(value);
|
|
132
|
+
}
|
|
133
|
+
exports.isUnion = isUnion;
|
|
134
|
+
function isFixedSizeBinary(value) {
|
|
135
|
+
return value instanceof apache_arrow_1.FixedSizeBinary || apache_arrow_1.DataType.isFixedSizeBinary(value);
|
|
136
|
+
}
|
|
137
|
+
exports.isFixedSizeBinary = isFixedSizeBinary;
|
|
138
|
+
function isFixedSizeList(value) {
|
|
139
|
+
return value instanceof apache_arrow_1.FixedSizeList || apache_arrow_1.DataType.isFixedSizeList(value);
|
|
140
|
+
}
|
|
141
|
+
exports.isFixedSizeList = isFixedSizeList;
|
|
19
142
|
/*
|
|
20
143
|
* Options to control how a column should be converted to a vector array
|
|
21
144
|
*/
|
|
@@ -60,6 +183,7 @@ class MakeArrowTableOptions {
|
|
|
60
183
|
vectorColumns = {
|
|
61
184
|
vector: new VectorColumnOptions(),
|
|
62
185
|
};
|
|
186
|
+
embeddings;
|
|
63
187
|
/**
|
|
64
188
|
* If true then string columns will be encoded with dictionary encoding
|
|
65
189
|
*
|
|
@@ -167,7 +291,7 @@ exports.MakeArrowTableOptions = MakeArrowTableOptions;
|
|
|
167
291
|
* assert.deepEqual(table.schema, schema)
|
|
168
292
|
* ```
|
|
169
293
|
*/
|
|
170
|
-
function makeArrowTable(data, options) {
|
|
294
|
+
function makeArrowTable(data, options, metadata) {
|
|
171
295
|
if (data.length === 0 &&
|
|
172
296
|
(options?.schema === undefined || options?.schema === null)) {
|
|
173
297
|
throw new Error("At least one record or a schema needs to be provided");
|
|
@@ -175,6 +299,7 @@ function makeArrowTable(data, options) {
|
|
|
175
299
|
const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
|
|
176
300
|
if (opt.schema !== undefined && opt.schema !== null) {
|
|
177
301
|
opt.schema = (0, sanitize_1.sanitizeSchema)(opt.schema);
|
|
302
|
+
opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
|
|
178
303
|
}
|
|
179
304
|
const columns = {};
|
|
180
305
|
// TODO: sample dataset to find missing columns
|
|
@@ -244,20 +369,38 @@ function makeArrowTable(data, options) {
|
|
|
244
369
|
// then patch the schema of the batches so we can use
|
|
245
370
|
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
|
246
371
|
const firstTable = new apache_arrow_1.Table(columns);
|
|
247
|
-
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
248
372
|
const batchesFixed = firstTable.batches.map((batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
|
|
249
|
-
|
|
373
|
+
let schema;
|
|
374
|
+
if (metadata !== undefined) {
|
|
375
|
+
let schemaMetadata = opt.schema.metadata;
|
|
376
|
+
if (schemaMetadata.size === 0) {
|
|
377
|
+
schemaMetadata = metadata;
|
|
378
|
+
}
|
|
379
|
+
else {
|
|
380
|
+
for (const [key, entry] of schemaMetadata.entries()) {
|
|
381
|
+
schemaMetadata.set(key, entry);
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
schema = new apache_arrow_1.Schema(opt.schema.fields, schemaMetadata);
|
|
385
|
+
}
|
|
386
|
+
else {
|
|
387
|
+
schema = opt.schema;
|
|
388
|
+
}
|
|
389
|
+
return new apache_arrow_1.Table(schema, batchesFixed);
|
|
250
390
|
}
|
|
251
|
-
|
|
252
|
-
|
|
391
|
+
const tbl = new apache_arrow_1.Table(columns);
|
|
392
|
+
if (metadata !== undefined) {
|
|
393
|
+
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
394
|
+
tbl.schema.metadata = metadata;
|
|
253
395
|
}
|
|
396
|
+
return tbl;
|
|
254
397
|
}
|
|
255
398
|
exports.makeArrowTable = makeArrowTable;
|
|
256
399
|
/**
|
|
257
400
|
* Create an empty Arrow table with the provided schema
|
|
258
401
|
*/
|
|
259
|
-
function makeEmptyTable(schema) {
|
|
260
|
-
return makeArrowTable([], { schema });
|
|
402
|
+
function makeEmptyTable(schema, metadata) {
|
|
403
|
+
return makeArrowTable([], { schema }, metadata);
|
|
261
404
|
}
|
|
262
405
|
exports.makeEmptyTable = makeEmptyTable;
|
|
263
406
|
/**
|
|
@@ -269,7 +412,7 @@ function makeListVector(lists) {
|
|
|
269
412
|
throw Error("Cannot infer list vector from empty array or empty list");
|
|
270
413
|
}
|
|
271
414
|
const sampleList = lists[0];
|
|
272
|
-
//
|
|
415
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
273
416
|
let inferredType;
|
|
274
417
|
try {
|
|
275
418
|
const sampleVector = makeVector(sampleList);
|
|
@@ -319,9 +462,52 @@ function makeVector(values, type, stringAsDictionary) {
|
|
|
319
462
|
return (0, apache_arrow_1.vectorFromArray)(values);
|
|
320
463
|
}
|
|
321
464
|
}
|
|
465
|
+
/** Helper function to apply embeddings from metadata to an input table */
|
|
466
|
+
async function applyEmbeddingsFromMetadata(table, schema) {
|
|
467
|
+
const registry = (0, registry_1.getRegistry)();
|
|
468
|
+
const functions = registry.parseFunctions(schema.metadata);
|
|
469
|
+
const columns = Object.fromEntries(table.schema.fields.map((field) => [
|
|
470
|
+
field.name,
|
|
471
|
+
table.getChild(field.name),
|
|
472
|
+
]));
|
|
473
|
+
for (const functionEntry of functions.values()) {
|
|
474
|
+
const sourceColumn = columns[functionEntry.sourceColumn];
|
|
475
|
+
const destColumn = functionEntry.vectorColumn ?? "vector";
|
|
476
|
+
if (sourceColumn === undefined) {
|
|
477
|
+
throw new Error(`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`);
|
|
478
|
+
}
|
|
479
|
+
if (columns[destColumn] !== undefined) {
|
|
480
|
+
throw new Error(`Attempt to apply embeddings to table failed because column ${destColumn} already existed`);
|
|
481
|
+
}
|
|
482
|
+
if (table.batches.length > 1) {
|
|
483
|
+
throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
|
|
484
|
+
}
|
|
485
|
+
const values = sourceColumn.toArray();
|
|
486
|
+
const vectors = await functionEntry.function.computeSourceEmbeddings(values);
|
|
487
|
+
if (vectors.length !== values.length) {
|
|
488
|
+
throw new Error("Embedding function did not return an embedding for each input element");
|
|
489
|
+
}
|
|
490
|
+
let destType;
|
|
491
|
+
const dtype = schema.fields.find((f) => f.name === destColumn).type;
|
|
492
|
+
if (isFixedSizeList(dtype)) {
|
|
493
|
+
destType = (0, sanitize_1.sanitizeType)(dtype);
|
|
494
|
+
}
|
|
495
|
+
else {
|
|
496
|
+
throw new Error("Expected FixedSizeList as datatype for vector field, instead got: " +
|
|
497
|
+
dtype);
|
|
498
|
+
}
|
|
499
|
+
const vector = makeVector(vectors, destType);
|
|
500
|
+
columns[destColumn] = vector;
|
|
501
|
+
}
|
|
502
|
+
const newTable = new apache_arrow_1.Table(columns);
|
|
503
|
+
return alignTable(newTable, schema);
|
|
504
|
+
}
|
|
322
505
|
/** Helper function to apply embeddings to an input table */
|
|
323
506
|
async function applyEmbeddings(table, embeddings, schema) {
|
|
324
|
-
if (
|
|
507
|
+
if (schema?.metadata.has("embedding_functions")) {
|
|
508
|
+
return applyEmbeddingsFromMetadata(table, schema);
|
|
509
|
+
}
|
|
510
|
+
else if (embeddings == null || embeddings === undefined) {
|
|
325
511
|
return table;
|
|
326
512
|
}
|
|
327
513
|
if (schema !== undefined && schema !== null) {
|
|
@@ -336,8 +522,8 @@ async function applyEmbeddings(table, embeddings, schema) {
|
|
|
336
522
|
});
|
|
337
523
|
const newColumns = Object.fromEntries(colEntries);
|
|
338
524
|
const sourceColumn = newColumns[embeddings.sourceColumn];
|
|
339
|
-
const destColumn = embeddings.
|
|
340
|
-
const innerDestType = embeddings.embeddingDataType ?? new apache_arrow_1.Float32();
|
|
525
|
+
const destColumn = embeddings.vectorColumn ?? "vector";
|
|
526
|
+
const innerDestType = embeddings.function.embeddingDataType() ?? new apache_arrow_1.Float32();
|
|
341
527
|
if (sourceColumn === undefined) {
|
|
342
528
|
throw new Error(`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`);
|
|
343
529
|
}
|
|
@@ -348,8 +534,9 @@ async function applyEmbeddings(table, embeddings, schema) {
|
|
|
348
534
|
// if we call convertToTable with 0 records and a schema that includes the embedding
|
|
349
535
|
return table;
|
|
350
536
|
}
|
|
351
|
-
|
|
352
|
-
|
|
537
|
+
const dimensions = embeddings.function.ndims();
|
|
538
|
+
if (dimensions !== undefined) {
|
|
539
|
+
const destType = newVectorType(dimensions, innerDestType);
|
|
353
540
|
newColumns[destColumn] = makeVector([], destType);
|
|
354
541
|
}
|
|
355
542
|
else if (schema != null) {
|
|
@@ -373,7 +560,7 @@ async function applyEmbeddings(table, embeddings, schema) {
|
|
|
373
560
|
throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
|
|
374
561
|
}
|
|
375
562
|
const values = sourceColumn.toArray();
|
|
376
|
-
const vectors = await embeddings.
|
|
563
|
+
const vectors = await embeddings.function.computeSourceEmbeddings(values);
|
|
377
564
|
if (vectors.length !== values.length) {
|
|
378
565
|
throw new Error("Embedding function did not return an embedding for each input element");
|
|
379
566
|
}
|
|
@@ -416,9 +603,10 @@ exports.convertToTable = convertToTable;
|
|
|
416
603
|
function newVectorType(dim, innerType) {
|
|
417
604
|
// in Lance we always default to have the elements nullable, so we need to set it to true
|
|
418
605
|
// otherwise we often get schema mismatches because the stored data always has schema with nullable elements
|
|
419
|
-
const children = new apache_arrow_1.Field("item", innerType, true);
|
|
606
|
+
const children = new apache_arrow_1.Field("item", (0, sanitize_1.sanitizeType)(innerType), true);
|
|
420
607
|
return new apache_arrow_1.FixedSizeList(dim, children);
|
|
421
608
|
}
|
|
609
|
+
exports.newVectorType = newVectorType;
|
|
422
610
|
/**
|
|
423
611
|
* Serialize an Array of records into a buffer using the Arrow IPC File serialization
|
|
424
612
|
*
|
|
@@ -480,12 +668,12 @@ async function fromDataToBuffer(data, embeddings, schema) {
|
|
|
480
668
|
if (schema !== undefined && schema !== null) {
|
|
481
669
|
schema = (0, sanitize_1.sanitizeSchema)(schema);
|
|
482
670
|
}
|
|
483
|
-
if (data
|
|
671
|
+
if (isArrowTable(data)) {
|
|
484
672
|
return fromTableToBuffer(data, embeddings, schema);
|
|
485
673
|
}
|
|
486
674
|
else {
|
|
487
|
-
const table = await convertToTable(data);
|
|
488
|
-
return fromTableToBuffer(table
|
|
675
|
+
const table = await convertToTable(data, embeddings, { schema });
|
|
676
|
+
return fromTableToBuffer(table);
|
|
489
677
|
}
|
|
490
678
|
}
|
|
491
679
|
exports.fromDataToBuffer = fromDataToBuffer;
|
|
@@ -537,3 +725,43 @@ function createEmptyTable(schema) {
|
|
|
537
725
|
return new apache_arrow_1.Table((0, sanitize_1.sanitizeSchema)(schema));
|
|
538
726
|
}
|
|
539
727
|
exports.createEmptyTable = createEmptyTable;
|
|
728
|
+
function validateSchemaEmbeddings(schema, data, embeddings) {
|
|
729
|
+
const fields = [];
|
|
730
|
+
const missingEmbeddingFields = [];
|
|
731
|
+
// First we check if the field is a `FixedSizeList`
|
|
732
|
+
// Then we check if the data contains the field
|
|
733
|
+
// if it does not, we add it to the list of missing embedding fields
|
|
734
|
+
// Finally, we check if those missing embedding fields are `this._embeddings`
|
|
735
|
+
// if they are not, we throw an error
|
|
736
|
+
for (let field of schema.fields) {
|
|
737
|
+
if (isFixedSizeList(field.type)) {
|
|
738
|
+
field = (0, sanitize_1.sanitizeField)(field);
|
|
739
|
+
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
|
740
|
+
if (schema.metadata.has("embedding_functions")) {
|
|
741
|
+
const embeddings = JSON.parse(schema.metadata.get("embedding_functions"));
|
|
742
|
+
if (
|
|
743
|
+
// biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
|
|
744
|
+
embeddings.find((f) => f["vectorColumn"] === field.name) ===
|
|
745
|
+
undefined) {
|
|
746
|
+
missingEmbeddingFields.push(field);
|
|
747
|
+
}
|
|
748
|
+
}
|
|
749
|
+
else {
|
|
750
|
+
missingEmbeddingFields.push(field);
|
|
751
|
+
}
|
|
752
|
+
}
|
|
753
|
+
else {
|
|
754
|
+
fields.push(field);
|
|
755
|
+
}
|
|
756
|
+
}
|
|
757
|
+
else {
|
|
758
|
+
fields.push(field);
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
|
|
762
|
+
throw new Error(`Table has embeddings: "${missingEmbeddingFields
|
|
763
|
+
.map((f) => f.name)
|
|
764
|
+
.join(",")}", but no embedding function was provided`);
|
|
765
|
+
}
|
|
766
|
+
return new apache_arrow_1.Schema(fields, schema.metadata);
|
|
767
|
+
}
|
package/dist/connection.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import { Table as ArrowTable, Schema } from "./arrow";
|
|
2
|
+
import { EmbeddingFunctionConfig } from "./embedding/registry";
|
|
1
3
|
import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
|
|
2
4
|
import { Table } from "./table";
|
|
3
|
-
import { Table as ArrowTable, Schema } from "apache-arrow";
|
|
4
5
|
/**
|
|
5
6
|
* Connect to a LanceDB instance at the given URI.
|
|
6
7
|
*
|
|
@@ -39,6 +40,8 @@ export interface CreateTableOptions {
|
|
|
39
40
|
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
|
|
40
41
|
*/
|
|
41
42
|
storageOptions?: Record<string, string>;
|
|
43
|
+
schema?: Schema;
|
|
44
|
+
embeddingFunction?: EmbeddingFunctionConfig;
|
|
42
45
|
}
|
|
43
46
|
export interface OpenTableOptions {
|
|
44
47
|
/**
|