@lancedb/lancedb 0.29.0 → 0.30.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +6 -6
- package/CONTRIBUTING.md +15 -13
- package/dist/arrow.d.ts +6 -0
- package/dist/arrow.js +10 -0
- package/dist/connection.d.ts +98 -0
- package/dist/connection.js +15 -0
- package/dist/index.d.ts +118 -5
- package/dist/index.js +48 -1
- package/dist/native.d.ts +140 -1
- package/dist/native.js +53 -52
- package/dist/query.d.ts +10 -0
- package/dist/query.js +14 -0
- package/dist/scannable.d.ts +92 -0
- package/dist/scannable.js +200 -0
- package/dist/table.d.ts +118 -0
- package/dist/table.js +25 -1
- package/package.json +21 -20
- package/pnpm-workspace.yaml +18 -0
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { Table as ArrowTable, RecordBatch, RecordBatchReader, Schema } from "apache-arrow";
|
|
2
|
+
import { NapiScannable } from "./native.js";
|
|
3
|
+
/**
 * Optional hints accepted by the `Scannable` factory methods
 * (`fromFactory`, `fromTable`, `fromIterable`, `fromRecordBatchReader`).
 */
export interface ScannableOptions {
    /**
     * Hint about the number of rows. Not validated against the stream.
     *
     * Must be an integer when provided; `fromFactory` rejects a non-integer
     * value with a `TypeError`. `fromTable` additionally requires the value
     * to equal `table.numRows`.
     */
    numRows?: number;
    /**
     * Whether the source can be scanned more than once. Defaults to `true` for
     * `fromTable` / `fromFactory` and `false` for `fromIterable` /
     * `fromRecordBatchReader`.
     *
     * `fromTable` rejects an explicit `false`, and `fromRecordBatchReader`
     * rejects an explicit `true`.
     */
    rescannable?: boolean;
}
|
|
13
|
+
/**
 * A data source that can be scanned as a stream of Arrow `RecordBatch`es.
 *
 * `Scannable` wraps the schema + optional row count + rescannable flag and
 * a callback that yields batches one at a time. It is passed to consumers
 * (e.g. `Table.add`, `createTable`, `mergeInsert` — follow-up work) that
 * need to pull data without materializing the full dataset in JS memory.
 *
 * Batches cross the JS↔Rust boundary as Arrow IPC Stream messages; a fresh
 * writer serializes each batch, and the Rust side decodes it with
 * `arrow_ipc::reader::StreamReader`. One batch is in flight at a time.
 *
 * The constructor is private: create instances with one of the static
 * `from*` factory methods, all of which return a promise.
 */
export declare class Scannable {
    /** Arrow schema declared for the batches this source produces. */
    readonly schema: Schema;
    /** Row-count hint supplied at construction, or `null` when none was given. */
    readonly numRows: number | null;
    /** Whether this source may be scanned more than once. */
    readonly rescannable: boolean;
    /** @hidden */
    private readonly native;
    private constructor();
    /** @hidden Access the native handle for passing through to Rust consumers. */
    get inner(): NapiScannable;
    /**
     * Build a Scannable from an explicit schema and a factory that returns a
     * fresh batch iterator on each call.
     *
     * The factory is invoked once per scan. Each iterator yields
     * `RecordBatch`es matching the declared schema. Use this when you need
     * direct control over the pull loop — for example, to wrap a streaming
     * source whose batches are produced lazily.
     *
     * The promise rejects with a `TypeError` when `opts.numRows` is provided
     * but is not an integer.
     *
     * @param schema - The Arrow schema of the produced batches.
     * @param factory - Called at the start of each scan to produce a batch
     *   iterator. Must be idempotent when `rescannable` is true.
     * @param opts - Optional hints. `rescannable` defaults to `true`; set to
     *   `false` if calling `factory()` twice would not reproduce the same data.
     * @returns A promise resolving to the new `Scannable`.
     */
    static fromFactory(schema: Schema, factory: () => AsyncIterable<RecordBatch> | Iterable<RecordBatch> | AsyncIterator<RecordBatch> | Iterator<RecordBatch>, opts?: ScannableOptions): Promise<Scannable>;
    /**
     * Build a Scannable from an in-memory Arrow `Table`. Always rescannable;
     * the table's batches are replayed on each scan.
     *
     * The table's row count is authoritative: `opts.numRows` must either be
     * omitted or equal to `table.numRows`. `opts.rescannable` of `false` is
     * rejected because in-memory Tables are always rescannable. Both
     * violations reject the returned promise with a `TypeError`.
     *
     * @param table - The Arrow table to expose; its schema and row count are
     *   used for the resulting `Scannable`.
     * @param opts - Optional hints, subject to the constraints above.
     * @returns A promise resolving to the new `Scannable`.
     */
    static fromTable(table: ArrowTable, opts?: ScannableOptions): Promise<Scannable>;
    /**
     * Build a Scannable from an iterable of `RecordBatch`es. `rescannable`
     * defaults to `false`. Pass an explicit schema so the consumer can
     * validate before any batch is pulled.
     *
     * `opts.rescannable: true` is honest for replayable iterables (Arrays,
     * Sets, or custom iterables whose `[Symbol.iterator]()` returns a fresh
     * iterator each call). It is rejected with a `TypeError` for one-shot
     * iterables (generators, async generators, or already-an-iterator inputs)
     * because their `[Symbol.iterator]()` / `[Symbol.asyncIterator]()` returns
     * the same exhausted object on the second scan. For replayable sources
     * outside this shape, use
     * `fromFactory(schema, () => createIter(), { rescannable: true })`.
     *
     * Note: when `opts.rescannable` is `true`, the structural check calls
     * `[Symbol.asyncIterator]()` on the input once if that method exists, and
     * otherwise calls `[Symbol.iterator]()` once.
     *
     * @param schema - The Arrow schema of the produced batches.
     * @param iter - The batches to expose.
     * @param opts - Optional hints; `numRows` is forwarded unchanged.
     * @returns A promise resolving to the new `Scannable`.
     */
    static fromIterable(schema: Schema, iter: AsyncIterable<RecordBatch> | Iterable<RecordBatch>, opts?: ScannableOptions): Promise<Scannable>;
    /**
     * Build a Scannable from an Arrow `RecordBatchReader`. A reader can only
     * be consumed once; `rescannable` defaults to `false`.
     *
     * The reader must already be opened (via `.open()`) so its `.schema` is
     * populated. `RecordBatchReader.from(...)` returns an unopened reader.
     *
     * `opts.rescannable: true` is rejected with a `TypeError` because
     * `RecordBatchReader` is a self-iterator (its `[Symbol.iterator]()`
     * returns itself), and this constructor does not call `reader.reset()`
     * between scans, so a second scan would always see an exhausted reader.
     * For genuinely replayable sources, use
     * `fromFactory(schema, () => openReader(), { rescannable: true })`,
     * which mints a fresh reader on each scan.
     *
     * @param reader - An opened reader; its `.schema` becomes the schema of
     *   the resulting `Scannable`.
     * @param opts - Optional hints; only `numRows` is forwarded.
     * @returns A promise resolving to the new `Scannable`.
     */
    static fromRecordBatchReader(reader: RecordBatchReader, opts?: ScannableOptions): Promise<Scannable>;
}
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
4
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
+
exports.Scannable = void 0;
|
|
6
|
+
const arrow_1 = require("./arrow");
|
|
7
|
+
const native_js_1 = require("./native.js");
|
|
8
|
+
/**
|
|
9
|
+
* A data source that can be scanned as a stream of Arrow `RecordBatch`es.
|
|
10
|
+
*
|
|
11
|
+
* `Scannable` wraps the schema + optional row count + rescannable flag and
|
|
12
|
+
* a callback that yields batches one at a time. It is passed to consumers
|
|
13
|
+
* (e.g. `Table.add`, `createTable`, `mergeInsert` — follow-up work) that
|
|
14
|
+
* need to pull data without materializing the full dataset in JS memory.
|
|
15
|
+
*
|
|
16
|
+
* Batches cross the JS↔Rust boundary as Arrow IPC Stream messages; a fresh
|
|
17
|
+
* writer serializes each batch, and the Rust side decodes it with
|
|
18
|
+
* `arrow_ipc::reader::StreamReader`. One batch is in flight at a time.
|
|
19
|
+
*/
|
|
20
|
+
class Scannable {
|
|
21
|
+
schema;
|
|
22
|
+
numRows;
|
|
23
|
+
rescannable;
|
|
24
|
+
/** @hidden */
|
|
25
|
+
native;
|
|
26
|
+
constructor(native, schema, numRows, rescannable) {
|
|
27
|
+
this.native = native;
|
|
28
|
+
this.schema = schema;
|
|
29
|
+
this.numRows = numRows;
|
|
30
|
+
this.rescannable = rescannable;
|
|
31
|
+
}
|
|
32
|
+
/** @hidden Access the native handle for passing through to Rust consumers. */
|
|
33
|
+
get inner() {
|
|
34
|
+
return this.native;
|
|
35
|
+
}
|
|
36
|
+
/**
|
|
37
|
+
* Build a Scannable from an explicit schema and a factory that returns a
|
|
38
|
+
* fresh batch iterator on each call.
|
|
39
|
+
*
|
|
40
|
+
* The factory is invoked once per scan. Each iterator yields
|
|
41
|
+
* `RecordBatch`es matching the declared schema. Use this when you need
|
|
42
|
+
* direct control over the pull loop — for example, to wrap a streaming
|
|
43
|
+
* source whose batches are produced lazily.
|
|
44
|
+
*
|
|
45
|
+
* @param schema - The Arrow schema of the produced batches.
|
|
46
|
+
* @param factory - Called at the start of each scan to produce a batch
|
|
47
|
+
* iterator. Must be idempotent when `rescannable` is true.
|
|
48
|
+
* @param opts - Optional hints. `rescannable` defaults to `true`; set to
|
|
49
|
+
* `false` if calling `factory()` twice would not reproduce the same data.
|
|
50
|
+
*/
|
|
51
|
+
static async fromFactory(schema, factory, opts = {}) {
|
|
52
|
+
const numRows = opts.numRows ?? null;
|
|
53
|
+
if (numRows != null && !Number.isInteger(numRows)) {
|
|
54
|
+
throw new TypeError("numRows must be an integer");
|
|
55
|
+
}
|
|
56
|
+
const rescannable = opts.rescannable ?? true;
|
|
57
|
+
let iter = null;
|
|
58
|
+
const getNextBatch = async (isStart) => {
|
|
59
|
+
// `isStart` is true on the first pull of every new scan_as_stream.
|
|
60
|
+
// Drop any cached iterator so factory() is re-invoked for the next scan
|
|
61
|
+
if (isStart) {
|
|
62
|
+
iter = null;
|
|
63
|
+
}
|
|
64
|
+
if (iter === null) {
|
|
65
|
+
iter = normalizeIterator(factory());
|
|
66
|
+
}
|
|
67
|
+
const result = await iter.next();
|
|
68
|
+
if (result.done) {
|
|
69
|
+
iter = null;
|
|
70
|
+
return null;
|
|
71
|
+
}
|
|
72
|
+
return (0, arrow_1.fromRecordBatchToStreamBuffer)(result.value);
|
|
73
|
+
};
|
|
74
|
+
const schemaBuf = await (0, arrow_1.fromTableToBuffer)((0, arrow_1.makeEmptyTable)(schema));
|
|
75
|
+
const native = new native_js_1.NapiScannable(schemaBuf, numRows, rescannable, getNextBatch);
|
|
76
|
+
return new Scannable(native, schema, numRows, rescannable);
|
|
77
|
+
}
|
|
78
|
+
/**
|
|
79
|
+
* Build a Scannable from an in-memory Arrow `Table`. Always rescannable;
|
|
80
|
+
* the table's batches are replayed on each scan.
|
|
81
|
+
*
|
|
82
|
+
* The table's row count is authoritative: `opts.numRows` must either be
|
|
83
|
+
* omitted or equal to `table.numRows`. `opts.rescannable` of `false` is
|
|
84
|
+
* rejected because in-memory Tables are always rescannable.
|
|
85
|
+
*/
|
|
86
|
+
static async fromTable(table, opts = {}) {
|
|
87
|
+
if (opts.numRows != null && opts.numRows !== table.numRows) {
|
|
88
|
+
throw new TypeError(`opts.numRows (${opts.numRows}) does not match table.numRows (${table.numRows}). ` +
|
|
89
|
+
`The table's row count is authoritative; omit numRows or pass the matching value.`);
|
|
90
|
+
}
|
|
91
|
+
if (opts.rescannable === false) {
|
|
92
|
+
throw new TypeError(`fromTable does not accept rescannable: false. ` +
|
|
93
|
+
`In-memory Arrow Tables are always rescannable; omit the option or pass true.`);
|
|
94
|
+
}
|
|
95
|
+
return Scannable.fromFactory(table.schema, () => table.batches, {
|
|
96
|
+
numRows: table.numRows,
|
|
97
|
+
rescannable: true,
|
|
98
|
+
});
|
|
99
|
+
}
|
|
100
|
+
/**
|
|
101
|
+
* Build a Scannable from an iterable of `RecordBatch`es. `rescannable`
|
|
102
|
+
* defaults to `false`. Pass an explicit schema so the consumer can
|
|
103
|
+
* validate before any batch is pulled.
|
|
104
|
+
*
|
|
105
|
+
* `opts.rescannable: true` is honest for replayable iterables (Arrays,
|
|
106
|
+
* Sets, or custom iterables whose `[Symbol.iterator]()` returns a fresh
|
|
107
|
+
* iterator each call). It is rejected for one-shot iterables (generators,
|
|
108
|
+
* async generators, or already-an-iterator inputs) because their
|
|
109
|
+
* `[Symbol.iterator]()` returns the same exhausted object on the second
|
|
110
|
+
* scan. For replayable sources outside this shape, use
|
|
111
|
+
* `fromFactory(schema, () => createIter(), { rescannable: true })`.
|
|
112
|
+
*
|
|
113
|
+
* Note: when `opts.rescannable` is `true`, the constructor calls
|
|
114
|
+
* `[Symbol.iterator]()` once on the input to perform the structural check.
|
|
115
|
+
*/
|
|
116
|
+
static async fromIterable(schema, iter, opts = {}) {
|
|
117
|
+
if (opts.rescannable === true && isOneShotIterable(iter)) {
|
|
118
|
+
throw new TypeError(`fromIterable: rescannable: true is not honest for one-shot iterables ` +
|
|
119
|
+
`(generators, async generators, or iterators where [Symbol.iterator]() ` +
|
|
120
|
+
`returns the same object). The source would be exhausted after the first scan. ` +
|
|
121
|
+
`Use fromFactory(schema, () => createIter(), { rescannable: true }) for sources ` +
|
|
122
|
+
`where each call mints a fresh iterator.`);
|
|
123
|
+
}
|
|
124
|
+
return Scannable.fromFactory(schema, () => iter, {
|
|
125
|
+
numRows: opts.numRows,
|
|
126
|
+
rescannable: opts.rescannable ?? false,
|
|
127
|
+
});
|
|
128
|
+
}
|
|
129
|
+
/**
|
|
130
|
+
* Build a Scannable from an Arrow `RecordBatchReader`. A reader can only
|
|
131
|
+
* be consumed once; `rescannable` defaults to `false`.
|
|
132
|
+
*
|
|
133
|
+
* The reader must already be opened (via `.open()`) so its `.schema` is
|
|
134
|
+
* populated. `RecordBatchReader.from(...)` returns an unopened reader.
|
|
135
|
+
*
|
|
136
|
+
* `opts.rescannable: true` is rejected because `RecordBatchReader` is a
|
|
137
|
+
* self-iterator (its `[Symbol.iterator]()` returns itself), and this
|
|
138
|
+
* constructor does not call `reader.reset()` between scans, so a second
|
|
139
|
+
* scan would always see an exhausted reader. For genuinely replayable
|
|
140
|
+
* sources, use
|
|
141
|
+
* `fromFactory(schema, () => openReader(), { rescannable: true })`,
|
|
142
|
+
* which mints a fresh reader on each scan.
|
|
143
|
+
*/
|
|
144
|
+
static async fromRecordBatchReader(reader, opts = {}) {
|
|
145
|
+
if (opts.rescannable === true) {
|
|
146
|
+
throw new TypeError(`fromRecordBatchReader does not accept rescannable: true. ` +
|
|
147
|
+
`RecordBatchReader is a self-iterator (its [Symbol.iterator]() ` +
|
|
148
|
+
`returns itself) and would be exhausted after the first scan. ` +
|
|
149
|
+
`Use fromFactory(schema, () => openReader(), { rescannable: true }) ` +
|
|
150
|
+
`for sources where each call mints a fresh reader.`);
|
|
151
|
+
}
|
|
152
|
+
return Scannable.fromFactory(reader.schema, () => reader, {
|
|
153
|
+
numRows: opts.numRows,
|
|
154
|
+
rescannable: false,
|
|
155
|
+
});
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
exports.Scannable = Scannable;
|
|
159
|
+
/**
 * Turn whatever a Scannable factory returned into an object with `next()`.
 *
 * Resolution order: an async iterable (`[Symbol.asyncIterator]`), then a sync
 * iterable (`[Symbol.iterator]`), then a bare iterator (an object that
 * already exposes `next`).
 *
 * @param source - The value returned by the factory.
 * @returns The iterator to pull batches from.
 * @throws TypeError if `source` is null/undefined, is a primitive, or does
 *   not resolve to an object with a `next()` method.
 */
function normalizeIterator(source) {
    if (source == null) {
        throw new TypeError("Scannable factory returned null/undefined");
    }
    // Primitives such as strings are technically iterable (character by
    // character) but can never yield record batches, so reject them here
    // instead of letting a scan fail on the first "batch".
    const sourceType = typeof source;
    if (sourceType !== "object" && sourceType !== "function") {
        throw new TypeError("Scannable factory returned a non-iterable value");
    }
    let iterator;
    if (typeof source[Symbol.asyncIterator] === "function") {
        iterator = source[Symbol.asyncIterator]();
    }
    else if (typeof source[Symbol.iterator] === "function") {
        iterator = source[Symbol.iterator]();
    }
    else {
        // Already an iterator (has `.next`) — verified below.
        iterator = source;
    }
    // Covers both a bare object without `next` and an iterable whose
    // iterator method returned something that is not an iterator.
    if (iterator == null || typeof iterator.next !== "function") {
        throw new TypeError("Scannable factory returned a non-iterable value");
    }
    return iterator;
}
|
|
175
|
+
// A "self-iterator" returns the same object from `[Symbol.iterator]()` /
|
|
176
|
+
// `[Symbol.asyncIterator]()`. Generators behave this way, so they exhaust
|
|
177
|
+
// after one pass. Replayable iterables (Array, Set, custom) return a fresh
|
|
178
|
+
// iterator each call. Detection mirrors `normalizeIterator`'s ordering so
|
|
179
|
+
// classification matches scan-time behavior.
|
|
180
|
+
function isOneShotIterable(source) {
|
|
181
|
+
// null/undefined are not one-shot in any meaningful sense; let
|
|
182
|
+
// `normalizeIterator` raise the actual error at scan time.
|
|
183
|
+
if (source == null)
|
|
184
|
+
return false;
|
|
185
|
+
const ref = source;
|
|
186
|
+
if (typeof source[Symbol.asyncIterator] ===
|
|
187
|
+
"function") {
|
|
188
|
+
const it = source[Symbol.asyncIterator]();
|
|
189
|
+
return it === ref;
|
|
190
|
+
}
|
|
191
|
+
if (typeof source[Symbol.iterator] === "function") {
|
|
192
|
+
const it = source[Symbol.iterator]();
|
|
193
|
+
return it === ref;
|
|
194
|
+
}
|
|
195
|
+
// Already-an-iterator (has `.next` but no `Symbol.iterator`) is by
|
|
196
|
+
// definition one-shot.
|
|
197
|
+
if (typeof source.next === "function")
|
|
198
|
+
return true;
|
|
199
|
+
return false;
|
|
200
|
+
}
|
package/dist/table.d.ts
CHANGED
|
@@ -5,6 +5,32 @@ import { AddColumnsResult, AddColumnsSql, AddResult, AlterColumnsResult, DeleteR
|
|
|
5
5
|
import { FullTextQuery, Query, TakeQuery, VectorQuery } from "./query";
|
|
6
6
|
import { IntoSql } from "./util";
|
|
7
7
|
export { IndexConfig } from "./native";
|
|
8
|
+
/**
 * Progress snapshot for a write operation, delivered to the `progress`
 * callback passed to {@link Table.add}.
 *
 * The final snapshot of a write is the one whose `done` flag is `true`.
 */
export interface WriteProgress {
    /** Number of rows written so far. */
    outputRows: number;
    /** Number of bytes written so far. */
    outputBytes: number;
    /**
     * Total rows expected, when the input source reports it.
     *
     * Always set on the final callback (the one with `done: true`), falling
     * back to the actual number of rows written when the source could not
     * report a row count up front.
     */
    totalRows?: number;
    /** Wall-clock seconds since the write started. */
    elapsedSeconds: number;
    /** Number of parallel write tasks currently in flight. */
    activeTasks: number;
    /** Total number of parallel write tasks (the write parallelism). */
    totalTasks: number;
    /** `true` for the final callback; `false` otherwise. */
    done: boolean;
}
|
|
8
34
|
/**
|
|
9
35
|
* Options for adding data to a table.
|
|
10
36
|
*/
|
|
@@ -15,6 +41,27 @@ export interface AddDataOptions {
|
|
|
15
41
|
* If "overwrite" then the new data will replace the existing data in the table.
|
|
16
42
|
*/
|
|
17
43
|
mode: "append" | "overwrite";
|
|
44
|
+
/**
|
|
45
|
+
* Optional callback invoked periodically with write progress.
|
|
46
|
+
*
|
|
47
|
+
* The callback is fired once per batch written and once more with
|
|
48
|
+
* `done: true` when the write completes. Calls are dispatched
|
|
49
|
+
* asynchronously to the JS event loop and never block the write — a slow
|
|
50
|
+
* callback will queue events rather than back-pressure the writer.
|
|
51
|
+
*
|
|
52
|
+
* Errors thrown from the callback are logged with `console.warn` and
|
|
53
|
+
* swallowed — they do not abort the write.
|
|
54
|
+
*
|
|
55
|
+
* @example
|
|
56
|
+
* ```ts
|
|
57
|
+
* await table.add(data, {
|
|
58
|
+
* progress: (p) => {
|
|
59
|
+
* console.log(`${p.outputRows}/${p.totalRows ?? "?"} rows`);
|
|
60
|
+
* },
|
|
61
|
+
* });
|
|
62
|
+
* ```
|
|
63
|
+
*/
|
|
64
|
+
progress: (progress: WriteProgress) => void;
|
|
18
65
|
}
|
|
19
66
|
export interface UpdateOptions {
|
|
20
67
|
/**
|
|
@@ -61,6 +108,26 @@ export interface Version {
|
|
|
61
108
|
timestamp: Date;
|
|
62
109
|
metadata: Record<string, string>;
|
|
63
110
|
}
|
|
111
|
+
/**
 * Specification selecting Lance's MemWAL LSM-style write path for
 * `mergeInsert`.
 *
 * `specType` is `"bucket"`, `"identity"`, or `"unsharded"`. For `"bucket"`,
 * `column` and `numBuckets` are required; for `"identity"`, `column` is
 * required.
 */
export interface LsmWriteSpec {
    /** One of `"bucket"`, `"identity"`, or `"unsharded"`. */
    specType: "bucket" | "identity" | "unsharded";
    /**
     * Bucket and identity variants: the sharding column. Required for both
     * of those variants.
     */
    column?: string;
    /**
     * Bucket variant: the number of buckets, in `[1, 1024]`. Required when
     * `specType` is `"bucket"`.
     */
    numBuckets?: number;
    /** Names of indexes the MemWAL should keep up to date during writes. */
    maintainedIndexes?: string[];
    /** Default `ShardWriter` configuration recorded in the MemWAL index. */
    writerConfigDefaults?: Record<string, string>;
}
|
|
64
131
|
/**
|
|
65
132
|
* A Table is a collection of Records in a LanceDB Database.
|
|
66
133
|
*
|
|
@@ -366,6 +433,54 @@ export declare abstract class Table {
|
|
|
366
433
|
* containing the new version number of the table after dropping the columns.
|
|
367
434
|
*/
|
|
368
435
|
abstract dropColumns(columnNames: string[]): Promise<DropColumnsResult>;
|
|
436
|
+
/**
 * Set the unenforced primary key for this table to a single column.
 *
 * "Unenforced" means LanceDB does not check uniqueness on writes; the
 * column is recorded in the schema as the primary key for use by features
 * such as `mergeInsert`. Only single-column primary keys are supported,
 * and the key cannot be changed once set.
 * @param {string | string[]} columns The primary key column. A one-element
 * array is also accepted; passing more than one column is rejected.
 * @returns {Promise<void>}
 */
abstract setUnenforcedPrimaryKey(columns: string | string[]): Promise<void>;
|
|
448
|
+
/**
 * Install an {@link LsmWriteSpec} on this table, selecting Lance's MemWAL
 * LSM-style write path for future `mergeInsert` calls.
 *
 * `LsmWriteSpec` chooses one of three sharding strategies via `specType`:
 *
 * - `"bucket"` — hash-bucket writes by the single-column unenforced primary
 *   key (`column` and `numBuckets` required).
 * - `"identity"` — shard by the raw value of a scalar `column`.
 * - `"unsharded"` — route every write to a single shard.
 *
 * All variants require the table to have an unenforced primary key
 * ({@link Table#setUnenforcedPrimaryKey}); bucket sharding additionally
 * requires it to be the single column being bucketed.
 *
 * Use {@link Table#unsetLsmWriteSpec} to remove the spec again.
 * @param {LsmWriteSpec} spec The sharding spec to install.
 * @returns {Promise<void>}
 * @example
 * ```ts
 * await table.setUnenforcedPrimaryKey("id");
 * await table.setLsmWriteSpec({
 *   specType: "bucket",
 *   column: "id",
 *   numBuckets: 16,
 *   maintainedIndexes: ["id_idx"],
 * });
 * ```
 */
abstract setLsmWriteSpec(spec: LsmWriteSpec): Promise<void>;
|
|
476
|
+
/**
 * Remove the {@link LsmWriteSpec} from this table, reverting to the standard
 * `mergeInsert` write path.
 *
 * This is the counterpart of {@link Table#setLsmWriteSpec}.
 *
 * Errors if no spec is currently set.
 * @returns {Promise<void>}
 */
abstract unsetLsmWriteSpec(): Promise<void>;
|
|
369
484
|
/** Retrieve the version of the table */
|
|
370
485
|
abstract version(): Promise<number>;
|
|
371
486
|
/**
|
|
@@ -527,6 +642,9 @@ export declare class LocalTable extends Table {
|
|
|
527
642
|
addColumns(newColumnTransforms: AddColumnsSql[] | Field | Field[] | Schema): Promise<AddColumnsResult>;
|
|
528
643
|
alterColumns(columnAlterations: ColumnAlteration[]): Promise<AlterColumnsResult>;
|
|
529
644
|
dropColumns(columnNames: string[]): Promise<DropColumnsResult>;
|
|
645
|
+
setUnenforcedPrimaryKey(columns: string | string[]): Promise<void>;
|
|
646
|
+
setLsmWriteSpec(spec: LsmWriteSpec): Promise<void>;
|
|
647
|
+
unsetLsmWriteSpec(): Promise<void>;
|
|
530
648
|
version(): Promise<number>;
|
|
531
649
|
checkout(version: number | string): Promise<void>;
|
|
532
650
|
checkoutLatest(): Promise<void>;
|
package/dist/table.js
CHANGED
|
@@ -66,7 +66,21 @@ class LocalTable extends Table {
|
|
|
66
66
|
const mode = options?.mode ?? "append";
|
|
67
67
|
const schema = await this.schema();
|
|
68
68
|
const buffer = await (0, arrow_1.fromDataToBuffer)(data, undefined, schema);
|
|
69
|
-
|
|
69
|
+
// Wrap the user callback so a thrown error doesn't surface as an
|
|
70
|
+
// unhandled exception (the callback fires from a napi threadsafe
|
|
71
|
+
// function — exceptions there crash the process).
|
|
72
|
+
const userProgress = options?.progress;
|
|
73
|
+
const progress = userProgress
|
|
74
|
+
? (p) => {
|
|
75
|
+
try {
|
|
76
|
+
userProgress(p);
|
|
77
|
+
}
|
|
78
|
+
catch (e) {
|
|
79
|
+
console.warn("Table.add progress callback threw:", e);
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
: undefined;
|
|
83
|
+
return await this.inner.add(buffer, mode, progress);
|
|
70
84
|
}
|
|
71
85
|
async update(optsOrUpdates, options) {
|
|
72
86
|
const isValues = "values" in optsOrUpdates && typeof optsOrUpdates.values !== "string";
|
|
@@ -258,6 +272,16 @@ class LocalTable extends Table {
|
|
|
258
272
|
async dropColumns(columnNames) {
|
|
259
273
|
return await this.inner.dropColumns(columnNames);
|
|
260
274
|
}
|
|
275
|
+
async setUnenforcedPrimaryKey(columns) {
|
|
276
|
+
const cols = typeof columns === "string" ? [columns] : columns;
|
|
277
|
+
return await this.inner.setUnenforcedPrimaryKey(cols);
|
|
278
|
+
}
|
|
279
|
+
async setLsmWriteSpec(spec) {
|
|
280
|
+
return await this.inner.setLsmWriteSpec(spec);
|
|
281
|
+
}
|
|
282
|
+
async unsetLsmWriteSpec() {
|
|
283
|
+
return await this.inner.unsetLsmWriteSpec();
|
|
284
|
+
}
|
|
261
285
|
async version() {
|
|
262
286
|
return await this.inner.version();
|
|
263
287
|
}
|
package/package.json
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"ann"
|
|
12
12
|
],
|
|
13
13
|
"private": false,
|
|
14
|
-
"version": "0.
|
|
14
|
+
"version": "0.30.0-beta.0",
|
|
15
15
|
"main": "dist/index.js",
|
|
16
16
|
"exports": {
|
|
17
17
|
".": "./dist/index.js",
|
|
@@ -38,15 +38,15 @@
|
|
|
38
38
|
"url": "https://github.com/lancedb/lancedb"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|
|
41
|
-
"@aws-sdk/client-dynamodb": "
|
|
42
|
-
"@aws-sdk/client-kms": "
|
|
43
|
-
"@aws-sdk/client-s3": "
|
|
41
|
+
"@aws-sdk/client-dynamodb": "3.1003.0",
|
|
42
|
+
"@aws-sdk/client-kms": "3.1003.0",
|
|
43
|
+
"@aws-sdk/client-s3": "3.1003.0",
|
|
44
44
|
"@biomejs/biome": "^1.7.3",
|
|
45
45
|
"@jest/globals": "^29.7.0",
|
|
46
|
-
"@napi-rs/cli": "
|
|
46
|
+
"@napi-rs/cli": "3.5.1",
|
|
47
47
|
"@types/axios": "^0.14.0",
|
|
48
48
|
"@types/jest": "^29.1.2",
|
|
49
|
-
"@types/node": "
|
|
49
|
+
"@types/node": "22.7.4",
|
|
50
50
|
"@types/tmp": "^0.2.6",
|
|
51
51
|
"apache-arrow-15": "npm:apache-arrow@15.0.0",
|
|
52
52
|
"apache-arrow-16": "npm:apache-arrow@16.0.0",
|
|
@@ -57,9 +57,9 @@
|
|
|
57
57
|
"shx": "^0.3.4",
|
|
58
58
|
"tmp": "^0.2.3",
|
|
59
59
|
"ts-jest": "^29.1.2",
|
|
60
|
-
"typedoc": "
|
|
61
|
-
"typedoc-plugin-markdown": "
|
|
62
|
-
"typescript": "
|
|
60
|
+
"typedoc": "0.26.4",
|
|
61
|
+
"typedoc-plugin-markdown": "4.2.1",
|
|
62
|
+
"typescript": "5.5.4",
|
|
63
63
|
"typescript-eslint": "^7.1.0"
|
|
64
64
|
},
|
|
65
65
|
"ava": {
|
|
@@ -68,6 +68,7 @@
|
|
|
68
68
|
"engines": {
|
|
69
69
|
"node": ">= 18"
|
|
70
70
|
},
|
|
71
|
+
"packageManager": "pnpm@11.1.1",
|
|
71
72
|
"cpu": [
|
|
72
73
|
"x64",
|
|
73
74
|
"arm64"
|
|
@@ -80,10 +81,10 @@
|
|
|
80
81
|
"scripts": {
|
|
81
82
|
"artifacts": "napi artifacts",
|
|
82
83
|
"build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir lancedb",
|
|
83
|
-
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/",
|
|
84
|
+
"postbuild:debug": "shx mkdir -p dist && shx cp lancedb/*.node dist/ && node -e \"require('fs').writeFileSync('dist/package.json', JSON.stringify({name:'@lancedb/lancedb',type:'commonjs'}))\"",
|
|
84
85
|
"build:release": "napi build --platform --release --dts ../lancedb/native.d.ts --js ../lancedb/native.js --output-dir dist",
|
|
85
|
-
"build": "
|
|
86
|
-
"build-release": "
|
|
86
|
+
"build": "pnpm build:debug && pnpm tsc",
|
|
87
|
+
"build-release": "pnpm build:release && pnpm tsc",
|
|
87
88
|
"tsc": "tsc -b",
|
|
88
89
|
"posttsc": "shx cp lancedb/native.d.ts dist/native.d.ts",
|
|
89
90
|
"lint-ci": "biome ci .",
|
|
@@ -93,7 +94,7 @@
|
|
|
93
94
|
"lint-fix": "biome check --write . && biome format --write .",
|
|
94
95
|
"prepublishOnly": "napi prepublish -t npm",
|
|
95
96
|
"test": "jest --verbose",
|
|
96
|
-
"integration": "S3_TEST=1
|
|
97
|
+
"integration": "S3_TEST=1 pnpm test",
|
|
97
98
|
"universal": "napi universalize",
|
|
98
99
|
"version": "napi version"
|
|
99
100
|
},
|
|
@@ -101,13 +102,13 @@
|
|
|
101
102
|
"reflect-metadata": "^0.2.2"
|
|
102
103
|
},
|
|
103
104
|
"optionalDependencies": {
|
|
104
|
-
"@lancedb/lancedb-darwin-arm64": "0.
|
|
105
|
-
"@lancedb/lancedb-linux-x64-gnu": "0.
|
|
106
|
-
"@lancedb/lancedb-linux-arm64-gnu": "0.
|
|
107
|
-
"@lancedb/lancedb-linux-x64-musl": "0.
|
|
108
|
-
"@lancedb/lancedb-linux-arm64-musl": "0.
|
|
109
|
-
"@lancedb/lancedb-win32-x64-msvc": "0.
|
|
110
|
-
"@lancedb/lancedb-win32-arm64-msvc": "0.
|
|
105
|
+
"@lancedb/lancedb-darwin-arm64": "0.30.0-beta.0",
|
|
106
|
+
"@lancedb/lancedb-linux-x64-gnu": "0.30.0-beta.0",
|
|
107
|
+
"@lancedb/lancedb-linux-arm64-gnu": "0.30.0-beta.0",
|
|
108
|
+
"@lancedb/lancedb-linux-x64-musl": "0.30.0-beta.0",
|
|
109
|
+
"@lancedb/lancedb-linux-arm64-musl": "0.30.0-beta.0",
|
|
110
|
+
"@lancedb/lancedb-win32-x64-msvc": "0.30.0-beta.0",
|
|
111
|
+
"@lancedb/lancedb-win32-arm64-msvc": "0.30.0-beta.0"
|
|
111
112
|
},
|
|
112
113
|
"peerDependencies": {
|
|
113
114
|
"apache-arrow": ">=15.0.0 <=18.1.0"
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Flat node_modules layout. The @napi-rs/cli build step fails to locate
|
|
2
|
+
# the cdylib artifact under pnpm's isolated layout; the hoisted linker
|
|
3
|
+
# mirrors npm's structure and unblocks the native build.
|
|
4
|
+
nodeLinker: hoisted
|
|
5
|
+
|
|
6
|
+
# Block resolution of versions published less than 24h ago (the value is in minutes: 1440 = 24h; Shai-Hulud window).
|
|
7
|
+
# This is the pnpm 11 default but pinned here so it's visible to
|
|
8
|
+
# reviewers and survives a future pnpm major flipping the default.
|
|
9
|
+
minimumReleaseAge: 1440
|
|
10
|
+
|
|
11
|
+
# Fail install if a transitive dep tries to run an unapproved script.
|
|
12
|
+
strictDepBuilds: true
|
|
13
|
+
|
|
14
|
+
allowBuilds:
|
|
15
|
+
'@biomejs/biome': true
|
|
16
|
+
onnxruntime-node: true
|
|
17
|
+
protobufjs: true
|
|
18
|
+
sharp: true
|