@lancedb/lancedb 0.4.20 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -14
- package/biome.json +136 -0
- package/dist/arrow.d.ts +2 -1
- package/dist/arrow.js +33 -2
- package/dist/connection.d.ts +1 -1
- package/dist/connection.js +1 -1
- package/dist/native.d.ts +26 -0
- package/dist/query.d.ts +1 -1
- package/dist/query.js +4 -3
- package/dist/sanitize.js +3 -3
- package/dist/table.d.ts +50 -3
- package/dist/table.js +41 -2
- package/lancedb/arrow.ts +54 -16
- package/lancedb/connection.ts +1 -1
- package/lancedb/embedding/openai.ts +1 -1
- package/lancedb/query.ts +8 -6
- package/lancedb/sanitize.ts +40 -40
- package/lancedb/table.ts +63 -3
- package/nodejs-artifacts/arrow.d.ts +2 -1
- package/nodejs-artifacts/arrow.js +33 -2
- package/nodejs-artifacts/connection.d.ts +1 -1
- package/nodejs-artifacts/connection.js +1 -1
- package/nodejs-artifacts/native.d.ts +26 -0
- package/nodejs-artifacts/query.d.ts +1 -1
- package/nodejs-artifacts/query.js +4 -3
- package/nodejs-artifacts/sanitize.js +3 -3
- package/nodejs-artifacts/table.d.ts +50 -3
- package/nodejs-artifacts/table.js +41 -2
- package/package.json +17 -20
- package/.eslintignore +0 -3
- package/eslint.config.js +0 -28
package/README.md
CHANGED
|
@@ -43,29 +43,20 @@ npm run test
|
|
|
43
43
|
|
|
44
44
|
### Running lint / format
|
|
45
45
|
|
|
46
|
-
LanceDb uses
|
|
47
|
-
|
|
48
|
-
set to true. Also, if your vscode root folder is the repo root then you will need to set
|
|
49
|
-
the eslint.workingDirectories to ["nodejs"]. To manually lint your code you can run:
|
|
46
|
+
LanceDb uses [biome](https://biomejs.dev/) for linting and formatting. If you are using VSCode you will need to install the official [Biome](https://marketplace.visualstudio.com/items?itemName=biomejs.biome) extension.
|
|
47
|
+
To manually lint your code you can run:
|
|
50
48
|
|
|
51
49
|
```sh
|
|
52
50
|
npm run lint
|
|
53
51
|
```
|
|
54
52
|
|
|
55
|
-
|
|
56
|
-
"Prettier - Code formatter" extension. You should then configure it to be the default formatter
|
|
57
|
-
for typescript and you should enable format on save. To manually check your code's format you
|
|
58
|
-
can run:
|
|
53
|
+
To automatically fix all fixable issues:
|
|
59
54
|
|
|
60
55
|
```sh
|
|
61
|
-
npm run
|
|
56
|
+
npm run lint-fix
|
|
62
57
|
```
|
|
63
58
|
|
|
64
|
-
If you
|
|
65
|
-
|
|
66
|
-
```sh
|
|
67
|
-
npx prettier --write .
|
|
68
|
-
```
|
|
59
|
+
If you do not have your workspace root set to the `nodejs` directory, unfortunately the extension will not work. You can still run the linting and formatting commands manually.
|
|
69
60
|
|
|
70
61
|
### Generating docs
|
|
71
62
|
|
package/biome.json
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://biomejs.dev/schemas/1.7.3/schema.json",
|
|
3
|
+
"organizeImports": {
|
|
4
|
+
"enabled": true
|
|
5
|
+
},
|
|
6
|
+
"files": {
|
|
7
|
+
"ignore": [
|
|
8
|
+
"**/dist/**/*",
|
|
9
|
+
"**/native.js",
|
|
10
|
+
"**/native.d.ts",
|
|
11
|
+
"**/npm/**/*",
|
|
12
|
+
"**/.vscode/**"
|
|
13
|
+
]
|
|
14
|
+
},
|
|
15
|
+
"formatter": {
|
|
16
|
+
"indentStyle": "space"
|
|
17
|
+
},
|
|
18
|
+
"linter": {
|
|
19
|
+
"enabled": true,
|
|
20
|
+
"rules": {
|
|
21
|
+
"recommended": false,
|
|
22
|
+
"complexity": {
|
|
23
|
+
"noBannedTypes": "error",
|
|
24
|
+
"noExtraBooleanCast": "error",
|
|
25
|
+
"noMultipleSpacesInRegularExpressionLiterals": "error",
|
|
26
|
+
"noUselessCatch": "error",
|
|
27
|
+
"noUselessThisAlias": "error",
|
|
28
|
+
"noUselessTypeConstraint": "error",
|
|
29
|
+
"noWith": "error"
|
|
30
|
+
},
|
|
31
|
+
"correctness": {
|
|
32
|
+
"noConstAssign": "error",
|
|
33
|
+
"noConstantCondition": "error",
|
|
34
|
+
"noEmptyCharacterClassInRegex": "error",
|
|
35
|
+
"noEmptyPattern": "error",
|
|
36
|
+
"noGlobalObjectCalls": "error",
|
|
37
|
+
"noInnerDeclarations": "error",
|
|
38
|
+
"noInvalidConstructorSuper": "error",
|
|
39
|
+
"noNewSymbol": "error",
|
|
40
|
+
"noNonoctalDecimalEscape": "error",
|
|
41
|
+
"noPrecisionLoss": "error",
|
|
42
|
+
"noSelfAssign": "error",
|
|
43
|
+
"noSetterReturn": "error",
|
|
44
|
+
"noSwitchDeclarations": "error",
|
|
45
|
+
"noUndeclaredVariables": "error",
|
|
46
|
+
"noUnreachable": "error",
|
|
47
|
+
"noUnreachableSuper": "error",
|
|
48
|
+
"noUnsafeFinally": "error",
|
|
49
|
+
"noUnsafeOptionalChaining": "error",
|
|
50
|
+
"noUnusedLabels": "error",
|
|
51
|
+
"noUnusedVariables": "error",
|
|
52
|
+
"useIsNan": "error",
|
|
53
|
+
"useValidForDirection": "error",
|
|
54
|
+
"useYield": "error"
|
|
55
|
+
},
|
|
56
|
+
"style": {
|
|
57
|
+
"noNamespace": "error",
|
|
58
|
+
"useAsConstAssertion": "error",
|
|
59
|
+
"useBlockStatements": "off",
|
|
60
|
+
"useNamingConvention": {
|
|
61
|
+
"level": "error",
|
|
62
|
+
"options": {
|
|
63
|
+
"strictCase": false
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
"suspicious": {
|
|
68
|
+
"noAssignInExpressions": "error",
|
|
69
|
+
"noAsyncPromiseExecutor": "error",
|
|
70
|
+
"noCatchAssign": "error",
|
|
71
|
+
"noClassAssign": "error",
|
|
72
|
+
"noCompareNegZero": "error",
|
|
73
|
+
"noControlCharactersInRegex": "error",
|
|
74
|
+
"noDebugger": "error",
|
|
75
|
+
"noDuplicateCase": "error",
|
|
76
|
+
"noDuplicateClassMembers": "error",
|
|
77
|
+
"noDuplicateObjectKeys": "error",
|
|
78
|
+
"noDuplicateParameters": "error",
|
|
79
|
+
"noEmptyBlockStatements": "error",
|
|
80
|
+
"noExplicitAny": "error",
|
|
81
|
+
"noExtraNonNullAssertion": "error",
|
|
82
|
+
"noFallthroughSwitchClause": "error",
|
|
83
|
+
"noFunctionAssign": "error",
|
|
84
|
+
"noGlobalAssign": "error",
|
|
85
|
+
"noImportAssign": "error",
|
|
86
|
+
"noMisleadingCharacterClass": "error",
|
|
87
|
+
"noMisleadingInstantiator": "error",
|
|
88
|
+
"noPrototypeBuiltins": "error",
|
|
89
|
+
"noRedeclare": "error",
|
|
90
|
+
"noShadowRestrictedNames": "error",
|
|
91
|
+
"noUnsafeDeclarationMerging": "error",
|
|
92
|
+
"noUnsafeNegation": "error",
|
|
93
|
+
"useGetterReturn": "error",
|
|
94
|
+
"useValidTypeof": "error"
|
|
95
|
+
}
|
|
96
|
+
},
|
|
97
|
+
"ignore": ["**/dist/**/*", "**/native.js", "**/native.d.ts"]
|
|
98
|
+
},
|
|
99
|
+
"javascript": {
|
|
100
|
+
"globals": []
|
|
101
|
+
},
|
|
102
|
+
"overrides": [
|
|
103
|
+
{
|
|
104
|
+
"include": ["**/*.ts", "**/*.tsx", "**/*.mts", "**/*.cts"],
|
|
105
|
+
"linter": {
|
|
106
|
+
"rules": {
|
|
107
|
+
"correctness": {
|
|
108
|
+
"noConstAssign": "off",
|
|
109
|
+
"noGlobalObjectCalls": "off",
|
|
110
|
+
"noInvalidConstructorSuper": "off",
|
|
111
|
+
"noNewSymbol": "off",
|
|
112
|
+
"noSetterReturn": "off",
|
|
113
|
+
"noUndeclaredVariables": "off",
|
|
114
|
+
"noUnreachable": "off",
|
|
115
|
+
"noUnreachableSuper": "off"
|
|
116
|
+
},
|
|
117
|
+
"style": {
|
|
118
|
+
"noArguments": "error",
|
|
119
|
+
"noVar": "error",
|
|
120
|
+
"useConst": "error"
|
|
121
|
+
},
|
|
122
|
+
"suspicious": {
|
|
123
|
+
"noDuplicateClassMembers": "off",
|
|
124
|
+
"noDuplicateObjectKeys": "off",
|
|
125
|
+
"noDuplicateParameters": "off",
|
|
126
|
+
"noFunctionAssign": "off",
|
|
127
|
+
"noImportAssign": "off",
|
|
128
|
+
"noRedeclare": "off",
|
|
129
|
+
"noUnsafeNegation": "off",
|
|
130
|
+
"useGetterReturn": "off"
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
]
|
|
136
|
+
}
|
package/dist/arrow.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/// <reference types="node" />
|
|
2
|
-
import {
|
|
2
|
+
import { Table as ArrowTable, type Float, Schema } from "apache-arrow";
|
|
3
3
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
|
4
4
|
/** Data type accepted by NodeJS SDK */
|
|
5
5
|
export type Data = Record<string, unknown>[] | ArrowTable;
|
|
@@ -12,6 +12,7 @@ export declare class VectorColumnOptions {
|
|
|
12
12
|
export declare class MakeArrowTableOptions {
|
|
13
13
|
schema?: Schema;
|
|
14
14
|
vectorColumns: Record<string, VectorColumnOptions>;
|
|
15
|
+
embeddings?: EmbeddingFunction<unknown>;
|
|
15
16
|
/**
|
|
16
17
|
* If true then string columns will be encoded with dictionary encoding
|
|
17
18
|
*
|
package/dist/arrow.js
CHANGED
|
@@ -60,6 +60,7 @@ class MakeArrowTableOptions {
|
|
|
60
60
|
vectorColumns = {
|
|
61
61
|
vector: new VectorColumnOptions(),
|
|
62
62
|
};
|
|
63
|
+
embeddings;
|
|
63
64
|
/**
|
|
64
65
|
* If true then string columns will be encoded with dictionary encoding
|
|
65
66
|
*
|
|
@@ -175,6 +176,7 @@ function makeArrowTable(data, options) {
|
|
|
175
176
|
const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
|
|
176
177
|
if (opt.schema !== undefined && opt.schema !== null) {
|
|
177
178
|
opt.schema = (0, sanitize_1.sanitizeSchema)(opt.schema);
|
|
179
|
+
opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
|
|
178
180
|
}
|
|
179
181
|
const columns = {};
|
|
180
182
|
// TODO: sample dataset to find missing columns
|
|
@@ -244,8 +246,9 @@ function makeArrowTable(data, options) {
|
|
|
244
246
|
// then patch the schema of the batches so we can use
|
|
245
247
|
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
|
246
248
|
const firstTable = new apache_arrow_1.Table(columns);
|
|
249
|
+
const batchesFixed = firstTable.batches.map(
|
|
247
250
|
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
248
|
-
|
|
251
|
+
(batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
|
|
249
252
|
return new apache_arrow_1.Table(opt.schema, batchesFixed);
|
|
250
253
|
}
|
|
251
254
|
else {
|
|
@@ -269,7 +272,7 @@ function makeListVector(lists) {
|
|
|
269
272
|
throw Error("Cannot infer list vector from empty array or empty list");
|
|
270
273
|
}
|
|
271
274
|
const sampleList = lists[0];
|
|
272
|
-
//
|
|
275
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
273
276
|
let inferredType;
|
|
274
277
|
try {
|
|
275
278
|
const sampleVector = makeVector(sampleList);
|
|
@@ -537,3 +540,31 @@ function createEmptyTable(schema) {
|
|
|
537
540
|
return new apache_arrow_1.Table((0, sanitize_1.sanitizeSchema)(schema));
|
|
538
541
|
}
|
|
539
542
|
exports.createEmptyTable = createEmptyTable;
|
|
543
|
+
function validateSchemaEmbeddings(schema, data, embeddings) {
|
|
544
|
+
const fields = [];
|
|
545
|
+
const missingEmbeddingFields = [];
|
|
546
|
+
// First we check if the field is a `FixedSizeList`
|
|
547
|
+
// Then we check if the data contains the field
|
|
548
|
+
// if it does not, we add it to the list of missing embedding fields
|
|
549
|
+
// Finally, we check if those missing embedding fields are `this._embeddings`
|
|
550
|
+
// if they are not, we throw an error
|
|
551
|
+
for (const field of schema.fields) {
|
|
552
|
+
if (field.type instanceof apache_arrow_1.FixedSizeList) {
|
|
553
|
+
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
|
554
|
+
missingEmbeddingFields.push(field);
|
|
555
|
+
}
|
|
556
|
+
else {
|
|
557
|
+
fields.push(field);
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
else {
|
|
561
|
+
fields.push(field);
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
|
|
565
|
+
throw new Error(`Table has embeddings: "${missingEmbeddingFields
|
|
566
|
+
.map((f) => f.name)
|
|
567
|
+
.join(",")}", but no embedding function was provided`);
|
|
568
|
+
}
|
|
569
|
+
return new apache_arrow_1.Schema(fields, schema.metadata);
|
|
570
|
+
}
|
package/dist/connection.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
+
import { Table as ArrowTable, Schema } from "apache-arrow";
|
|
1
2
|
import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
|
|
2
3
|
import { Table } from "./table";
|
|
3
|
-
import { Table as ArrowTable, Schema } from "apache-arrow";
|
|
4
4
|
/**
|
|
5
5
|
* Connect to a LanceDB instance at the given URI.
|
|
6
6
|
*
|
package/dist/connection.js
CHANGED
|
@@ -14,10 +14,10 @@
|
|
|
14
14
|
// limitations under the License.
|
|
15
15
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
16
|
exports.Connection = exports.connect = void 0;
|
|
17
|
+
const apache_arrow_1 = require("apache-arrow");
|
|
17
18
|
const arrow_1 = require("./arrow");
|
|
18
19
|
const native_1 = require("./native");
|
|
19
20
|
const table_1 = require("./table");
|
|
20
|
-
const apache_arrow_1 = require("apache-arrow");
|
|
21
21
|
/**
|
|
22
22
|
* Connect to a LanceDB instance at the given URI.
|
|
23
23
|
*
|
package/dist/native.d.ts
CHANGED
|
@@ -15,6 +15,31 @@ export interface IndexConfig {
|
|
|
15
15
|
*/
|
|
16
16
|
columns: Array<string>
|
|
17
17
|
}
|
|
18
|
+
/** Statistics about a compaction operation. */
|
|
19
|
+
export interface CompactionStats {
|
|
20
|
+
/** The number of fragments removed */
|
|
21
|
+
fragmentsRemoved: number
|
|
22
|
+
/** The number of new, compacted fragments added */
|
|
23
|
+
fragmentsAdded: number
|
|
24
|
+
/** The number of data files removed */
|
|
25
|
+
filesRemoved: number
|
|
26
|
+
/** The number of new, compacted data files added */
|
|
27
|
+
filesAdded: number
|
|
28
|
+
}
|
|
29
|
+
/** Statistics about a cleanup operation */
|
|
30
|
+
export interface RemovalStats {
|
|
31
|
+
/** The number of bytes removed */
|
|
32
|
+
bytesRemoved: number
|
|
33
|
+
/** The number of old versions removed */
|
|
34
|
+
oldVersionsRemoved: number
|
|
35
|
+
}
|
|
36
|
+
/** Statistics about an optimize operation */
|
|
37
|
+
export interface OptimizeStats {
|
|
38
|
+
/** Statistics about the compaction operation */
|
|
39
|
+
compaction: CompactionStats
|
|
40
|
+
/** Statistics about the removal operation */
|
|
41
|
+
prune: RemovalStats
|
|
42
|
+
}
|
|
18
43
|
/**
|
|
19
44
|
* A definition of a column alteration. The alteration changes the column at
|
|
20
45
|
* `path` to have the new name `name`, to be nullable if `nullable` is true,
|
|
@@ -151,5 +176,6 @@ export class Table {
|
|
|
151
176
|
checkout(version: number): Promise<void>
|
|
152
177
|
checkoutLatest(): Promise<void>
|
|
153
178
|
restore(): Promise<void>
|
|
179
|
+
optimize(olderThanMs?: number | undefined | null): Promise<OptimizeStats>
|
|
154
180
|
listIndices(): Promise<Array<IndexConfig>>
|
|
155
181
|
}
|
package/dist/query.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Table as ArrowTable, RecordBatch } from "apache-arrow";
|
|
2
2
|
import { RecordBatchIterator as NativeBatchIterator, Query as NativeQuery, Table as NativeTable, VectorQuery as NativeVectorQuery } from "./native";
|
|
3
3
|
export declare class RecordBatchIterator implements AsyncIterator<RecordBatch> {
|
|
4
4
|
private promisedInner?;
|
package/dist/query.js
CHANGED
|
@@ -22,7 +22,7 @@ class RecordBatchIterator {
|
|
|
22
22
|
// TODO: check promise reliably so we dont need to pass two arguments.
|
|
23
23
|
this.promisedInner = promise;
|
|
24
24
|
}
|
|
25
|
-
//
|
|
25
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
26
26
|
async next() {
|
|
27
27
|
if (this.inner === undefined) {
|
|
28
28
|
this.inner = await this.promisedInner;
|
|
@@ -48,6 +48,7 @@ class QueryBase {
|
|
|
48
48
|
inner;
|
|
49
49
|
constructor(inner) {
|
|
50
50
|
this.inner = inner;
|
|
51
|
+
// intentionally empty
|
|
51
52
|
}
|
|
52
53
|
/**
|
|
53
54
|
* A filter statement to be applied to this query.
|
|
@@ -136,7 +137,7 @@ class QueryBase {
|
|
|
136
137
|
execute() {
|
|
137
138
|
return new RecordBatchIterator(this.nativeExecute());
|
|
138
139
|
}
|
|
139
|
-
//
|
|
140
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
140
141
|
[Symbol.asyncIterator]() {
|
|
141
142
|
const promise = this.nativeExecute();
|
|
142
143
|
return new RecordBatchIterator(promise);
|
|
@@ -338,7 +339,7 @@ class Query extends QueryBase {
|
|
|
338
339
|
* a default `limit` of 10 will be used. @see {@link Query#limit}
|
|
339
340
|
*/
|
|
340
341
|
nearestTo(vector) {
|
|
341
|
-
//
|
|
342
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
342
343
|
const vectorQuery = this.inner.nearestTo(Float32Array.from(vector));
|
|
343
344
|
return new VectorQuery(vectorQuery);
|
|
344
345
|
}
|
package/dist/sanitize.js
CHANGED
|
@@ -127,7 +127,7 @@ function sanitizeUnion(typeLike) {
|
|
|
127
127
|
throw Error("Expected a Union type to have an array-like `children` property");
|
|
128
128
|
}
|
|
129
129
|
return new apache_arrow_1.Union(typeLike.mode,
|
|
130
|
-
//
|
|
130
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
131
131
|
typeLike.typeIds, typeLike.children.map((child) => sanitizeField(child)));
|
|
132
132
|
}
|
|
133
133
|
function sanitizeTypedUnion(typeLike,
|
|
@@ -167,7 +167,7 @@ function sanitizeMap(typeLike) {
|
|
|
167
167
|
throw Error("Expected a Map type to have a `keysSorted` property");
|
|
168
168
|
}
|
|
169
169
|
return new apache_arrow_1.Map_(
|
|
170
|
-
//
|
|
170
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
171
171
|
typeLike.children.map((field) => sanitizeField(field)), typeLike.keysSorted);
|
|
172
172
|
}
|
|
173
173
|
function sanitizeDuration(typeLike) {
|
|
@@ -191,7 +191,7 @@ function sanitizeDictionary(typeLike) {
|
|
|
191
191
|
}
|
|
192
192
|
return new apache_arrow_1.Dictionary(sanitizeType(typeLike.dictionary), sanitizeType(typeLike.indices), typeLike.id, typeLike.isOrdered);
|
|
193
193
|
}
|
|
194
|
-
//
|
|
194
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
195
195
|
function sanitizeType(typeLike) {
|
|
196
196
|
if (typeof typeLike !== "object" || typeLike === null) {
|
|
197
197
|
throw Error("Expected a Type but object was null/undefined");
|
package/dist/table.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { Schema } from "apache-arrow";
|
|
2
|
-
import { AddColumnsSql, ColumnAlteration, IndexConfig, Table as _NativeTable } from "./native";
|
|
3
|
-
import { Query, VectorQuery } from "./query";
|
|
4
|
-
import { IndexOptions } from "./indices";
|
|
5
2
|
import { Data } from "./arrow";
|
|
3
|
+
import { IndexOptions } from "./indices";
|
|
4
|
+
import { AddColumnsSql, ColumnAlteration, IndexConfig, OptimizeStats, Table as _NativeTable } from "./native";
|
|
5
|
+
import { Query, VectorQuery } from "./query";
|
|
6
6
|
export { IndexConfig } from "./native";
|
|
7
7
|
/**
|
|
8
8
|
* Options for adding data to a table.
|
|
@@ -28,6 +28,22 @@ export interface UpdateOptions {
|
|
|
28
28
|
*/
|
|
29
29
|
where: string;
|
|
30
30
|
}
|
|
31
|
+
export interface OptimizeOptions {
|
|
32
|
+
/**
|
|
33
|
+
* If set then all versions older than the given date
|
|
34
|
+
* will be removed. The current version will never be removed.
|
|
35
|
+
* The default is 7 days
|
|
36
|
+
* @example
|
|
37
|
+
* // Delete all versions older than 1 day
|
|
38
|
+
* const olderThan = new Date();
|
|
39
|
+
* olderThan.setDate(olderThan.getDate() - 1);
|
|
40
|
+
* tbl.optimize({ cleanupOlderThan: olderThan });
|
|
41
|
+
*
|
|
42
|
+
* // Delete all versions except the current version
|
|
43
|
+
* tbl.optimize({ cleanupOlderThan: new Date() });
|
|
44
|
+
*/
|
|
45
|
+
cleanupOlderThan: Date;
|
|
46
|
+
}
|
|
31
47
|
/**
|
|
32
48
|
* A Table is a collection of Records in a LanceDB Database.
|
|
33
49
|
*
|
|
@@ -253,6 +269,37 @@ export declare class Table {
|
|
|
253
269
|
* out state and the read_consistency_interval, if any, will apply.
|
|
254
270
|
*/
|
|
255
271
|
restore(): Promise<void>;
|
|
272
|
+
/**
|
|
273
|
+
* Optimize the on-disk data and indices for better performance.
|
|
274
|
+
*
|
|
275
|
+
* Modeled after ``VACUUM`` in PostgreSQL.
|
|
276
|
+
*
|
|
277
|
+
* Optimization covers three operations:
|
|
278
|
+
*
|
|
279
|
+
* - Compaction: Merges small files into larger ones
|
|
280
|
+
* - Prune: Removes old versions of the dataset
|
|
281
|
+
* - Index: Optimizes the indices, adding new data to existing indices
|
|
282
|
+
*
|
|
283
|
+
*
|
|
284
|
+
* Experimental API
|
|
285
|
+
* ----------------
|
|
286
|
+
*
|
|
287
|
+
* The optimization process is undergoing active development and may change.
|
|
288
|
+
* Our goal with these changes is to improve the performance of optimization and
|
|
289
|
+
* reduce the complexity.
|
|
290
|
+
*
|
|
291
|
+
* That being said, it is essential today to run optimize if you want the best
|
|
292
|
+
* performance. It should be stable and safe to use in production, but it is our
|
|
293
|
+
* hope that the API may be simplified (or not even need to be called) in the
|
|
294
|
+
* future.
|
|
295
|
+
*
|
|
296
|
+
* The frequency an application should call optimize is based on the frequency of
|
|
297
|
+
* data modifications. If data is frequently added, deleted, or updated then
|
|
298
|
+
* optimize should be run frequently. A good rule of thumb is to run optimize if
|
|
299
|
+
* you have added or modified 100,000 or more records or run more than 20 data
|
|
300
|
+
* modification operations.
|
|
301
|
+
*/
|
|
302
|
+
optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats>;
|
|
256
303
|
/** List all indices that have been created with {@link Table.createIndex} */
|
|
257
304
|
listIndices(): Promise<IndexConfig[]>;
|
|
258
305
|
}
|
package/dist/table.js
CHANGED
|
@@ -15,8 +15,8 @@
|
|
|
15
15
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
16
16
|
exports.Table = void 0;
|
|
17
17
|
const apache_arrow_1 = require("apache-arrow");
|
|
18
|
-
const query_1 = require("./query");
|
|
19
18
|
const arrow_1 = require("./arrow");
|
|
19
|
+
const query_1 = require("./query");
|
|
20
20
|
/**
|
|
21
21
|
* A Table is a collection of Records in a LanceDB Database.
|
|
22
22
|
*
|
|
@@ -140,7 +140,7 @@ class Table {
|
|
|
140
140
|
*/
|
|
141
141
|
async createIndex(column, options) {
|
|
142
142
|
// Bit of a hack to get around the fact that TS has no package-scope.
|
|
143
|
-
//
|
|
143
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
144
144
|
const nativeIndex = options?.config?.inner;
|
|
145
145
|
await this.inner.createIndex(nativeIndex, column, options?.replace);
|
|
146
146
|
}
|
|
@@ -296,6 +296,45 @@ class Table {
|
|
|
296
296
|
async restore() {
|
|
297
297
|
await this.inner.restore();
|
|
298
298
|
}
|
|
299
|
+
/**
|
|
300
|
+
* Optimize the on-disk data and indices for better performance.
|
|
301
|
+
*
|
|
302
|
+
* Modeled after ``VACUUM`` in PostgreSQL.
|
|
303
|
+
*
|
|
304
|
+
* Optimization covers three operations:
|
|
305
|
+
*
|
|
306
|
+
* - Compaction: Merges small files into larger ones
|
|
307
|
+
* - Prune: Removes old versions of the dataset
|
|
308
|
+
* - Index: Optimizes the indices, adding new data to existing indices
|
|
309
|
+
*
|
|
310
|
+
*
|
|
311
|
+
* Experimental API
|
|
312
|
+
* ----------------
|
|
313
|
+
*
|
|
314
|
+
* The optimization process is undergoing active development and may change.
|
|
315
|
+
* Our goal with these changes is to improve the performance of optimization and
|
|
316
|
+
* reduce the complexity.
|
|
317
|
+
*
|
|
318
|
+
* That being said, it is essential today to run optimize if you want the best
|
|
319
|
+
* performance. It should be stable and safe to use in production, but it is our
|
|
320
|
+
* hope that the API may be simplified (or not even need to be called) in the
|
|
321
|
+
* future.
|
|
322
|
+
*
|
|
323
|
+
* The frequency an application should call optimize is based on the frequency of
|
|
324
|
+
* data modifications. If data is frequently added, deleted, or updated then
|
|
325
|
+
* optimize should be run frequently. A good rule of thumb is to run optimize if
|
|
326
|
+
* you have added or modified 100,000 or more records or run more than 20 data
|
|
327
|
+
* modification operations.
|
|
328
|
+
*/
|
|
329
|
+
async optimize(options) {
|
|
330
|
+
let cleanupOlderThanMs;
|
|
331
|
+
if (options?.cleanupOlderThan !== undefined &&
|
|
332
|
+
options?.cleanupOlderThan !== null) {
|
|
333
|
+
cleanupOlderThanMs =
|
|
334
|
+
new Date().getTime() - options.cleanupOlderThan.getTime();
|
|
335
|
+
}
|
|
336
|
+
return await this.inner.optimize(cleanupOlderThanMs);
|
|
337
|
+
}
|
|
299
338
|
/** List all indices that have been created with {@link Table.createIndex} */
|
|
300
339
|
async listIndices() {
|
|
301
340
|
return await this.inner.listIndices();
|
package/lancedb/arrow.ts
CHANGED
|
@@ -13,25 +13,25 @@
|
|
|
13
13
|
// limitations under the License.
|
|
14
14
|
|
|
15
15
|
import {
|
|
16
|
+
Table as ArrowTable,
|
|
17
|
+
Binary,
|
|
18
|
+
DataType,
|
|
16
19
|
Field,
|
|
17
|
-
makeBuilder,
|
|
18
|
-
RecordBatchFileWriter,
|
|
19
|
-
Utf8,
|
|
20
|
-
type Vector,
|
|
21
20
|
FixedSizeList,
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
Table as ArrowTable,
|
|
25
|
-
RecordBatchStreamWriter,
|
|
21
|
+
type Float,
|
|
22
|
+
Float32,
|
|
26
23
|
List,
|
|
27
24
|
RecordBatch,
|
|
28
|
-
|
|
25
|
+
RecordBatchFileWriter,
|
|
26
|
+
RecordBatchStreamWriter,
|
|
27
|
+
Schema,
|
|
29
28
|
Struct,
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
29
|
+
Utf8,
|
|
30
|
+
type Vector,
|
|
31
|
+
makeBuilder,
|
|
32
|
+
makeData,
|
|
34
33
|
type makeTable,
|
|
34
|
+
vectorFromArray,
|
|
35
35
|
} from "apache-arrow";
|
|
36
36
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
|
37
37
|
import { sanitizeSchema } from "./sanitize";
|
|
@@ -85,6 +85,7 @@ export class MakeArrowTableOptions {
|
|
|
85
85
|
vectorColumns: Record<string, VectorColumnOptions> = {
|
|
86
86
|
vector: new VectorColumnOptions(),
|
|
87
87
|
};
|
|
88
|
+
embeddings?: EmbeddingFunction<unknown>;
|
|
88
89
|
|
|
89
90
|
/**
|
|
90
91
|
* If true then string columns will be encoded with dictionary encoding
|
|
@@ -208,6 +209,7 @@ export function makeArrowTable(
|
|
|
208
209
|
const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
|
|
209
210
|
if (opt.schema !== undefined && opt.schema !== null) {
|
|
210
211
|
opt.schema = sanitizeSchema(opt.schema);
|
|
212
|
+
opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
|
|
211
213
|
}
|
|
212
214
|
const columns: Record<string, Vector> = {};
|
|
213
215
|
// TODO: sample dataset to find missing columns
|
|
@@ -287,8 +289,8 @@ export function makeArrowTable(
|
|
|
287
289
|
// then patch the schema of the batches so we can use
|
|
288
290
|
// `new ArrowTable(schema, batches)` which does not do any schema inference
|
|
289
291
|
const firstTable = new ArrowTable(columns);
|
|
290
|
-
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
291
292
|
const batchesFixed = firstTable.batches.map(
|
|
293
|
+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
292
294
|
(batch) => new RecordBatch(opt.schema!, batch.data),
|
|
293
295
|
);
|
|
294
296
|
return new ArrowTable(opt.schema, batchesFixed);
|
|
@@ -313,7 +315,7 @@ function makeListVector(lists: unknown[][]): Vector<unknown> {
|
|
|
313
315
|
throw Error("Cannot infer list vector from empty array or empty list");
|
|
314
316
|
}
|
|
315
317
|
const sampleList = lists[0];
|
|
316
|
-
//
|
|
318
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
317
319
|
let inferredType: any;
|
|
318
320
|
try {
|
|
319
321
|
const sampleVector = makeVector(sampleList);
|
|
@@ -337,7 +339,7 @@ function makeVector(
|
|
|
337
339
|
values: unknown[],
|
|
338
340
|
type?: DataType,
|
|
339
341
|
stringAsDictionary?: boolean,
|
|
340
|
-
//
|
|
342
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
341
343
|
): Vector<any> {
|
|
342
344
|
if (type !== undefined) {
|
|
343
345
|
// No need for inference, let Arrow create it
|
|
@@ -648,3 +650,39 @@ function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
|
|
|
648
650
|
export function createEmptyTable(schema: Schema): ArrowTable {
|
|
649
651
|
return new ArrowTable(sanitizeSchema(schema));
|
|
650
652
|
}
|
|
653
|
+
|
|
654
|
+
function validateSchemaEmbeddings(
|
|
655
|
+
schema: Schema,
|
|
656
|
+
data: Array<Record<string, unknown>>,
|
|
657
|
+
embeddings: EmbeddingFunction<unknown> | undefined,
|
|
658
|
+
) {
|
|
659
|
+
const fields = [];
|
|
660
|
+
const missingEmbeddingFields = [];
|
|
661
|
+
|
|
662
|
+
// First we check if the field is a `FixedSizeList`
|
|
663
|
+
// Then we check if the data contains the field
|
|
664
|
+
// if it does not, we add it to the list of missing embedding fields
|
|
665
|
+
// Finally, we check if those missing embedding fields are `this._embeddings`
|
|
666
|
+
// if they are not, we throw an error
|
|
667
|
+
for (const field of schema.fields) {
|
|
668
|
+
if (field.type instanceof FixedSizeList) {
|
|
669
|
+
if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
|
|
670
|
+
missingEmbeddingFields.push(field);
|
|
671
|
+
} else {
|
|
672
|
+
fields.push(field);
|
|
673
|
+
}
|
|
674
|
+
} else {
|
|
675
|
+
fields.push(field);
|
|
676
|
+
}
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
|
|
680
|
+
throw new Error(
|
|
681
|
+
`Table has embeddings: "${missingEmbeddingFields
|
|
682
|
+
.map((f) => f.name)
|
|
683
|
+
.join(",")}", but no embedding function was provided`,
|
|
684
|
+
);
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
return new Schema(fields, schema.metadata);
|
|
688
|
+
}
|
package/lancedb/connection.ts
CHANGED
|
@@ -12,10 +12,10 @@
|
|
|
12
12
|
// See the License for the specific language governing permissions and
|
|
13
13
|
// limitations under the License.
|
|
14
14
|
|
|
15
|
+
import { Table as ArrowTable, Schema } from "apache-arrow";
|
|
15
16
|
import { fromTableToBuffer, makeArrowTable, makeEmptyTable } from "./arrow";
|
|
16
17
|
import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
|
|
17
18
|
import { Table } from "./table";
|
|
18
|
-
import { Table as ArrowTable, Schema } from "apache-arrow";
|
|
19
19
|
|
|
20
20
|
/**
|
|
21
21
|
* Connect to a LanceDB instance at the given URI.
|
|
@@ -12,8 +12,8 @@
|
|
|
12
12
|
// See the License for the specific language governing permissions and
|
|
13
13
|
// limitations under the License.
|
|
14
14
|
|
|
15
|
-
import { type EmbeddingFunction } from "./embedding_function";
|
|
16
15
|
import type OpenAI from "openai";
|
|
16
|
+
import { type EmbeddingFunction } from "./embedding_function";
|
|
17
17
|
|
|
18
18
|
export class OpenAIEmbeddingFunction implements EmbeddingFunction<string> {
|
|
19
19
|
private readonly _openai: OpenAI;
|