@lancedb/lancedb 0.22.2-beta.2 → 0.22.3-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/arrow.d.ts +1 -1
- package/dist/index.d.ts +3 -2
- package/dist/index.js +4 -1
- package/dist/indices.d.ts +83 -0
- package/dist/indices.js +20 -1
- package/dist/native.d.ts +39 -0
- package/dist/native.js +3 -1
- package/dist/permutation.d.ts +131 -0
- package/dist/permutation.js +169 -0
- package/dist/sanitize.d.ts +1 -0
- package/dist/sanitize.js +49 -2
- package/package.json +9 -9
package/dist/arrow.d.ts
CHANGED
|
@@ -11,7 +11,7 @@ export type SchemaLike = Schema | {
|
|
|
11
11
|
export type FieldLike = Field | {
|
|
12
12
|
type: string;
|
|
13
13
|
name: string;
|
|
14
|
-
nullable
|
|
14
|
+
nullable: boolean;
|
|
15
15
|
metadata?: Map<string, string>;
|
|
16
16
|
};
|
|
17
17
|
export type DataLike = import("apache-arrow").Data<Struct<any>> | {
|
package/dist/index.d.ts
CHANGED
|
@@ -2,16 +2,17 @@ import { Connection } from "./connection";
|
|
|
2
2
|
import { ConnectionOptions, Session } from "./native.js";
|
|
3
3
|
import { HeaderProvider } from "./header";
|
|
4
4
|
export { JsHeaderProvider as NativeJsHeaderProvider } from "./native.js";
|
|
5
|
-
export { AddColumnsSql, ConnectionOptions, IndexStatistics, IndexConfig, ClientConfig, TimeoutConfig, RetryConfig, TlsConfig, OptimizeStats, CompactionStats, RemovalStats, TableStatistics, FragmentStatistics, FragmentSummaryStats, Tags, TagContents, MergeResult, AddResult, AddColumnsResult, AlterColumnsResult, DeleteResult, DropColumnsResult, UpdateResult, } from "./native.js";
|
|
5
|
+
export { AddColumnsSql, ConnectionOptions, IndexStatistics, IndexConfig, ClientConfig, TimeoutConfig, RetryConfig, TlsConfig, OptimizeStats, CompactionStats, RemovalStats, TableStatistics, FragmentStatistics, FragmentSummaryStats, Tags, TagContents, MergeResult, AddResult, AddColumnsResult, AlterColumnsResult, DeleteResult, DropColumnsResult, UpdateResult, SplitRandomOptions, SplitHashOptions, SplitSequentialOptions, ShuffleOptions, } from "./native.js";
|
|
6
6
|
export { makeArrowTable, MakeArrowTableOptions, Data, VectorColumnOptions, } from "./arrow";
|
|
7
7
|
export { Connection, CreateTableOptions, TableNamesOptions, OpenTableOptions, } from "./connection";
|
|
8
8
|
export { Session } from "./native.js";
|
|
9
9
|
export { ExecutableQuery, Query, QueryBase, VectorQuery, TakeQuery, QueryExecutionOptions, FullTextSearchOptions, RecordBatchIterator, FullTextQuery, MatchQuery, PhraseQuery, BoostQuery, MultiMatchQuery, BooleanQuery, FullTextQueryType, Operator, Occur, } from "./query";
|
|
10
|
-
export { Index, IndexOptions, IvfPqOptions, IvfFlatOptions, HnswPqOptions, HnswSqOptions, FtsOptions, } from "./indices";
|
|
10
|
+
export { Index, IndexOptions, IvfPqOptions, IvfRqOptions, IvfFlatOptions, HnswPqOptions, HnswSqOptions, FtsOptions, } from "./indices";
|
|
11
11
|
export { Table, AddDataOptions, UpdateOptions, OptimizeOptions, Version, ColumnAlteration, } from "./table";
|
|
12
12
|
export { HeaderProvider, StaticHeaderProvider, OAuthHeaderProvider, TokenResponse, } from "./header";
|
|
13
13
|
export { MergeInsertBuilder, WriteExecutionOptions } from "./merge";
|
|
14
14
|
export * as embedding from "./embedding";
|
|
15
|
+
export { permutationBuilder, PermutationBuilder } from "./permutation";
|
|
15
16
|
export * as rerankers from "./rerankers";
|
|
16
17
|
export { SchemaLike, TableLike, FieldLike, RecordBatchLike, DataLike, IntoVector, MultiVector, } from "./arrow";
|
|
17
18
|
export { IntoSql, packBits } from "./util";
|
package/dist/index.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
// SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
4
4
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
-
exports.packBits = exports.rerankers = exports.embedding = exports.MergeInsertBuilder = exports.OAuthHeaderProvider = exports.StaticHeaderProvider = exports.HeaderProvider = exports.Table = exports.Index = exports.Occur = exports.Operator = exports.FullTextQueryType = exports.BooleanQuery = exports.MultiMatchQuery = exports.BoostQuery = exports.PhraseQuery = exports.MatchQuery = exports.RecordBatchIterator = exports.TakeQuery = exports.VectorQuery = exports.QueryBase = exports.Query = exports.Session = exports.Connection = exports.VectorColumnOptions = exports.MakeArrowTableOptions = exports.makeArrowTable = exports.TagContents = exports.Tags = exports.NativeJsHeaderProvider = void 0;
|
|
5
|
+
exports.packBits = exports.rerankers = exports.PermutationBuilder = exports.permutationBuilder = exports.embedding = exports.MergeInsertBuilder = exports.OAuthHeaderProvider = exports.StaticHeaderProvider = exports.HeaderProvider = exports.Table = exports.Index = exports.Occur = exports.Operator = exports.FullTextQueryType = exports.BooleanQuery = exports.MultiMatchQuery = exports.BoostQuery = exports.PhraseQuery = exports.MatchQuery = exports.RecordBatchIterator = exports.TakeQuery = exports.VectorQuery = exports.QueryBase = exports.Query = exports.Session = exports.Connection = exports.VectorColumnOptions = exports.MakeArrowTableOptions = exports.makeArrowTable = exports.TagContents = exports.Tags = exports.NativeJsHeaderProvider = void 0;
|
|
6
6
|
exports.connect = connect;
|
|
7
7
|
const connection_1 = require("./connection");
|
|
8
8
|
const native_js_1 = require("./native.js");
|
|
@@ -45,6 +45,9 @@ Object.defineProperty(exports, "OAuthHeaderProvider", { enumerable: true, get: f
|
|
|
45
45
|
var merge_1 = require("./merge");
|
|
46
46
|
Object.defineProperty(exports, "MergeInsertBuilder", { enumerable: true, get: function () { return merge_1.MergeInsertBuilder; } });
|
|
47
47
|
exports.embedding = require("./embedding");
|
|
48
|
+
var permutation_1 = require("./permutation");
|
|
49
|
+
Object.defineProperty(exports, "permutationBuilder", { enumerable: true, get: function () { return permutation_1.permutationBuilder; } });
|
|
50
|
+
Object.defineProperty(exports, "PermutationBuilder", { enumerable: true, get: function () { return permutation_1.PermutationBuilder; } });
|
|
48
51
|
exports.rerankers = require("./rerankers");
|
|
49
52
|
var util_1 = require("./util");
|
|
50
53
|
Object.defineProperty(exports, "packBits", { enumerable: true, get: function () { return util_1.packBits; } });
|
package/dist/indices.d.ts
CHANGED
|
@@ -101,6 +101,72 @@ export interface IvfPqOptions {
|
|
|
101
101
|
*/
|
|
102
102
|
sampleRate?: number;
|
|
103
103
|
}
|
|
104
|
+
export interface IvfRqOptions {
|
|
105
|
+
/**
|
|
106
|
+
* The number of IVF partitions to create.
|
|
107
|
+
*
|
|
108
|
+
* This value should generally scale with the number of rows in the dataset.
|
|
109
|
+
* By default the number of partitions is the square root of the number of
|
|
110
|
+
* rows.
|
|
111
|
+
*
|
|
112
|
+
* If this value is too large then the first part of the search (picking the
|
|
113
|
+
* right partition) will be slow. If this value is too small then the second
|
|
114
|
+
* part of the search (searching within a partition) will be slow.
|
|
115
|
+
*/
|
|
116
|
+
numPartitions?: number;
|
|
117
|
+
/**
|
|
118
|
+
* Number of bits per dimension for residual quantization.
|
|
119
|
+
*
|
|
120
|
+
* This value controls how much each residual component is compressed. The more
|
|
121
|
+
* bits, the more accurate the index will be but the slower search. Typical values
|
|
122
|
+
* are small integers; the default is 1 bit per dimension.
|
|
123
|
+
*/
|
|
124
|
+
numBits?: number;
|
|
125
|
+
/**
|
|
126
|
+
* Distance type to use to build the index.
|
|
127
|
+
*
|
|
128
|
+
* Default value is "l2".
|
|
129
|
+
*
|
|
130
|
+
* This is used when training the index to calculate the IVF partitions
|
|
131
|
+
* (vectors are grouped in partitions with similar vectors according to this
|
|
132
|
+
* distance type) and during quantization.
|
|
133
|
+
*
|
|
134
|
+
* The distance type used to train an index MUST match the distance type used
|
|
135
|
+
* to search the index. Failure to do so will yield inaccurate results.
|
|
136
|
+
*
|
|
137
|
+
* The following distance types are available:
|
|
138
|
+
*
|
|
139
|
+
* "l2" - Euclidean distance.
|
|
140
|
+
* "cosine" - Cosine distance.
|
|
141
|
+
* "dot" - Dot product.
|
|
142
|
+
*/
|
|
143
|
+
distanceType?: "l2" | "cosine" | "dot";
|
|
144
|
+
/**
|
|
145
|
+
* Max iterations to train IVF kmeans.
|
|
146
|
+
*
|
|
147
|
+
* When training an IVF index we use kmeans to calculate the partitions. This parameter
|
|
148
|
+
* controls how many iterations of kmeans to run.
|
|
149
|
+
*
|
|
150
|
+
* The default value is 50.
|
|
151
|
+
*/
|
|
152
|
+
maxIterations?: number;
|
|
153
|
+
/**
|
|
154
|
+
* The number of vectors, per partition, to sample when training IVF kmeans.
|
|
155
|
+
*
|
|
156
|
+
* When an IVF index is trained, we need to calculate partitions. These are groups
|
|
157
|
+
* of vectors that are similar to each other. To do this we use an algorithm called kmeans.
|
|
158
|
+
*
|
|
159
|
+
* Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
|
|
160
|
+
* random sample of the data. This parameter controls the size of the sample. The total
|
|
161
|
+
* number of vectors used to train the index is `sample_rate * num_partitions`.
|
|
162
|
+
*
|
|
163
|
+
* Increasing this value might improve the quality of the index but in most cases the
|
|
164
|
+
* default should be sufficient.
|
|
165
|
+
*
|
|
166
|
+
* The default value is 256.
|
|
167
|
+
*/
|
|
168
|
+
sampleRate?: number;
|
|
169
|
+
}
|
|
104
170
|
/**
|
|
105
171
|
* Options to create an `HNSW_PQ` index
|
|
106
172
|
*/
|
|
@@ -476,6 +542,23 @@ export declare class Index {
|
|
|
476
542
|
* currently is also a memory intensive operation.
|
|
477
543
|
*/
|
|
478
544
|
static ivfPq(options?: Partial<IvfPqOptions>): Index;
|
|
545
|
+
/**
|
|
546
|
+
* Create an IvfRq index
|
|
547
|
+
*
|
|
548
|
+
* IVF-RQ (RabitQ Quantization) compresses vectors using RabitQ quantization
|
|
549
|
+
* and organizes them into IVF partitions.
|
|
550
|
+
*
|
|
551
|
+
* The compression scheme is called RabitQ quantization. Each dimension is quantized into a small number of bits.
|
|
552
|
+
* The parameters `num_bits` and `num_partitions` control this process, providing a tradeoff
|
|
553
|
+
* between index size (and thus search speed) and index accuracy.
|
|
554
|
+
*
|
|
555
|
+
* The partitioning process is called IVF and the `num_partitions` parameter controls how
|
|
556
|
+
* many groups to create.
|
|
557
|
+
*
|
|
558
|
+
* Note that training an IVF RQ index on a large dataset is a slow operation and
|
|
559
|
+
* currently is also a memory intensive operation.
|
|
560
|
+
*/
|
|
561
|
+
static ivfRq(options?: Partial<IvfRqOptions>): Index;
|
|
479
562
|
/**
|
|
480
563
|
* Create an IvfFlat index
|
|
481
564
|
*
|
package/dist/indices.js
CHANGED
|
@@ -32,7 +32,26 @@ class Index {
|
|
|
32
32
|
* currently is also a memory intensive operation.
|
|
33
33
|
*/
|
|
34
34
|
static ivfPq(options) {
|
|
35
|
-
return new Index(native_1.Index.ivfPq(options?.distanceType, options?.numPartitions, options?.numSubVectors, options?.maxIterations, options?.sampleRate));
|
|
35
|
+
return new Index(native_1.Index.ivfPq(options?.distanceType, options?.numPartitions, options?.numSubVectors, options?.numBits, options?.maxIterations, options?.sampleRate));
|
|
36
|
+
}
|
|
37
|
+
/**
|
|
38
|
+
* Create an IvfRq index
|
|
39
|
+
*
|
|
40
|
+
* IVF-RQ (RabitQ Quantization) compresses vectors using RabitQ quantization
|
|
41
|
+
* and organizes them into IVF partitions.
|
|
42
|
+
*
|
|
43
|
+
* The compression scheme is called RabitQ quantization. Each dimension is quantized into a small number of bits.
|
|
44
|
+
* The parameters `num_bits` and `num_partitions` control this process, providing a tradeoff
|
|
45
|
+
* between index size (and thus search speed) and index accuracy.
|
|
46
|
+
*
|
|
47
|
+
* The partitioning process is called IVF and the `num_partitions` parameter controls how
|
|
48
|
+
* many groups to create.
|
|
49
|
+
*
|
|
50
|
+
* Note that training an IVF RQ index on a large dataset is a slow operation and
|
|
51
|
+
* currently is also a memory intensive operation.
|
|
52
|
+
*/
|
|
53
|
+
static ivfRq(options) {
|
|
54
|
+
return new Index(native_1.Index.ivfRq(options?.distanceType, options?.numPartitions, options?.numBits, options?.maxIterations, options?.sampleRate));
|
|
36
55
|
}
|
|
37
56
|
/**
|
|
38
57
|
* Create an IvfFlat index
|
package/dist/native.d.ts
CHANGED
|
@@ -3,6 +3,28 @@
|
|
|
3
3
|
|
|
4
4
|
/* auto-generated by NAPI-RS */
|
|
5
5
|
|
|
6
|
+
export interface SplitRandomOptions {
|
|
7
|
+
ratios?: Array<number>
|
|
8
|
+
counts?: Array<number>
|
|
9
|
+
fixed?: number
|
|
10
|
+
seed?: number
|
|
11
|
+
}
|
|
12
|
+
export interface SplitHashOptions {
|
|
13
|
+
columns: Array<string>
|
|
14
|
+
splitWeights: Array<number>
|
|
15
|
+
discardWeight?: number
|
|
16
|
+
}
|
|
17
|
+
export interface SplitSequentialOptions {
|
|
18
|
+
ratios?: Array<number>
|
|
19
|
+
counts?: Array<number>
|
|
20
|
+
fixed?: number
|
|
21
|
+
}
|
|
22
|
+
export interface ShuffleOptions {
|
|
23
|
+
seed?: number
|
|
24
|
+
clumpSize?: number
|
|
25
|
+
}
|
|
26
|
+
/** Create a permutation builder for the given table */
|
|
27
|
+
export declare function permutationBuilder(table: Table, destTableName: string): PermutationBuilder
|
|
6
28
|
/** Timeout configuration for remote HTTP client. */
|
|
7
29
|
export interface TimeoutConfig {
|
|
8
30
|
/**
|
|
@@ -357,6 +379,7 @@ export class JsHeaderProvider {
|
|
|
357
379
|
}
|
|
358
380
|
export class Index {
|
|
359
381
|
static ivfPq(distanceType?: string | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, numBits?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): Index
|
|
382
|
+
static ivfRq(distanceType?: string | undefined | null, numPartitions?: number | undefined | null, numBits?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): Index
|
|
360
383
|
static ivfFlat(distanceType?: string | undefined | null, numPartitions?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): Index
|
|
361
384
|
static btree(): Index
|
|
362
385
|
static bitmap(): Index
|
|
@@ -378,6 +401,22 @@ export class NativeMergeInsertBuilder {
|
|
|
378
401
|
useIndex(useIndex: boolean): NativeMergeInsertBuilder
|
|
379
402
|
execute(buf: Buffer): Promise<MergeResult>
|
|
380
403
|
}
|
|
404
|
+
export class PermutationBuilder {
|
|
405
|
+
/** Configure random splits */
|
|
406
|
+
splitRandom(options: SplitRandomOptions): PermutationBuilder
|
|
407
|
+
/** Configure hash-based splits */
|
|
408
|
+
splitHash(options: SplitHashOptions): PermutationBuilder
|
|
409
|
+
/** Configure sequential splits */
|
|
410
|
+
splitSequential(options: SplitSequentialOptions): PermutationBuilder
|
|
411
|
+
/** Configure calculated splits */
|
|
412
|
+
splitCalculated(calculation: string): PermutationBuilder
|
|
413
|
+
/** Configure shuffling */
|
|
414
|
+
shuffle(options: ShuffleOptions): PermutationBuilder
|
|
415
|
+
/** Configure filtering */
|
|
416
|
+
filter(filter: string): PermutationBuilder
|
|
417
|
+
/** Execute the permutation builder and create the table */
|
|
418
|
+
execute(): Promise<Table>
|
|
419
|
+
}
|
|
381
420
|
export class Query {
|
|
382
421
|
onlyIf(predicate: string): void
|
|
383
422
|
fullTextSearch(query: object): void
|
package/dist/native.js
CHANGED
|
@@ -319,12 +319,14 @@ if (!nativeBinding) {
|
|
|
319
319
|
}
|
|
320
320
|
throw new Error(`Failed to load native binding`);
|
|
321
321
|
}
|
|
322
|
-
const { Connection, JsHeaderProvider, Index, RecordBatchIterator, NativeMergeInsertBuilder, Query, VectorQuery, TakeQuery, JsFullTextQuery, Reranker, RrfReranker, Session, Table, TagContents, Tags } = nativeBinding;
|
|
322
|
+
const { Connection, JsHeaderProvider, Index, RecordBatchIterator, NativeMergeInsertBuilder, PermutationBuilder, permutationBuilder, Query, VectorQuery, TakeQuery, JsFullTextQuery, Reranker, RrfReranker, Session, Table, TagContents, Tags } = nativeBinding;
|
|
323
323
|
module.exports.Connection = Connection;
|
|
324
324
|
module.exports.JsHeaderProvider = JsHeaderProvider;
|
|
325
325
|
module.exports.Index = Index;
|
|
326
326
|
module.exports.RecordBatchIterator = RecordBatchIterator;
|
|
327
327
|
module.exports.NativeMergeInsertBuilder = NativeMergeInsertBuilder;
|
|
328
|
+
module.exports.PermutationBuilder = PermutationBuilder;
|
|
329
|
+
module.exports.permutationBuilder = permutationBuilder;
|
|
328
330
|
module.exports.Query = Query;
|
|
329
331
|
module.exports.VectorQuery = VectorQuery;
|
|
330
332
|
module.exports.TakeQuery = TakeQuery;
|
package/dist/permutation.d.ts
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import { PermutationBuilder as NativePermutationBuilder, ShuffleOptions, SplitHashOptions, SplitRandomOptions, SplitSequentialOptions } from "./native.js";
|
|
2
|
+
import { Table } from "./table";
|
|
3
|
+
/**
|
|
4
|
+
* A PermutationBuilder for creating data permutations with splits, shuffling, and filtering.
|
|
5
|
+
*
|
|
6
|
+
* This class provides a TypeScript wrapper around the native Rust PermutationBuilder,
|
|
7
|
+
* offering methods to configure data splits, shuffling, and filtering before executing
|
|
8
|
+
* the permutation to create a new table.
|
|
9
|
+
*/
|
|
10
|
+
export declare class PermutationBuilder {
|
|
11
|
+
private inner;
|
|
12
|
+
/**
|
|
13
|
+
* @hidden
|
|
14
|
+
*/
|
|
15
|
+
constructor(inner: NativePermutationBuilder);
|
|
16
|
+
/**
|
|
17
|
+
* Configure random splits for the permutation.
|
|
18
|
+
*
|
|
19
|
+
* @param options - Configuration for random splitting
|
|
20
|
+
* @returns A new PermutationBuilder instance
|
|
21
|
+
* @example
|
|
22
|
+
* ```ts
|
|
23
|
+
* // Split by ratios
|
|
24
|
+
* builder.splitRandom({ ratios: [0.7, 0.3], seed: 42 });
|
|
25
|
+
*
|
|
26
|
+
* // Split by counts
|
|
27
|
+
* builder.splitRandom({ counts: [1000, 500], seed: 42 });
|
|
28
|
+
*
|
|
29
|
+
* // Split with fixed size
|
|
30
|
+
* builder.splitRandom({ fixed: 100, seed: 42 });
|
|
31
|
+
* ```
|
|
32
|
+
*/
|
|
33
|
+
splitRandom(options: SplitRandomOptions): PermutationBuilder;
|
|
34
|
+
/**
|
|
35
|
+
* Configure hash-based splits for the permutation.
|
|
36
|
+
*
|
|
37
|
+
* @param options - Configuration for hash-based splitting
|
|
38
|
+
* @returns A new PermutationBuilder instance
|
|
39
|
+
* @example
|
|
40
|
+
* ```ts
|
|
41
|
+
* builder.splitHash({
|
|
42
|
+
* columns: ["user_id"],
|
|
43
|
+
* splitWeights: [70, 30],
|
|
44
|
+
* discardWeight: 0
|
|
45
|
+
* });
|
|
46
|
+
* ```
|
|
47
|
+
*/
|
|
48
|
+
splitHash(options: SplitHashOptions): PermutationBuilder;
|
|
49
|
+
/**
|
|
50
|
+
* Configure sequential splits for the permutation.
|
|
51
|
+
*
|
|
52
|
+
* @param options - Configuration for sequential splitting
|
|
53
|
+
* @returns A new PermutationBuilder instance
|
|
54
|
+
* @example
|
|
55
|
+
* ```ts
|
|
56
|
+
* // Split by ratios
|
|
57
|
+
* builder.splitSequential({ ratios: [0.8, 0.2] });
|
|
58
|
+
*
|
|
59
|
+
* // Split by counts
|
|
60
|
+
* builder.splitSequential({ counts: [800, 200] });
|
|
61
|
+
*
|
|
62
|
+
* // Split with fixed size
|
|
63
|
+
* builder.splitSequential({ fixed: 1000 });
|
|
64
|
+
* ```
|
|
65
|
+
*/
|
|
66
|
+
splitSequential(options: SplitSequentialOptions): PermutationBuilder;
|
|
67
|
+
/**
|
|
68
|
+
* Configure calculated splits for the permutation.
|
|
69
|
+
*
|
|
70
|
+
* @param calculation - SQL expression for calculating splits
|
|
71
|
+
* @returns A new PermutationBuilder instance
|
|
72
|
+
* @example
|
|
73
|
+
* ```ts
|
|
74
|
+
* builder.splitCalculated("user_id % 3");
|
|
75
|
+
* ```
|
|
76
|
+
*/
|
|
77
|
+
splitCalculated(calculation: string): PermutationBuilder;
|
|
78
|
+
/**
|
|
79
|
+
* Configure shuffling for the permutation.
|
|
80
|
+
*
|
|
81
|
+
* @param options - Configuration for shuffling
|
|
82
|
+
* @returns A new PermutationBuilder instance
|
|
83
|
+
* @example
|
|
84
|
+
* ```ts
|
|
85
|
+
* // Basic shuffle
|
|
86
|
+
* builder.shuffle({ seed: 42 });
|
|
87
|
+
*
|
|
88
|
+
* // Shuffle with clump size
|
|
89
|
+
* builder.shuffle({ seed: 42, clumpSize: 10 });
|
|
90
|
+
* ```
|
|
91
|
+
*/
|
|
92
|
+
shuffle(options: ShuffleOptions): PermutationBuilder;
|
|
93
|
+
/**
|
|
94
|
+
* Configure filtering for the permutation.
|
|
95
|
+
*
|
|
96
|
+
* @param filter - SQL filter expression
|
|
97
|
+
* @returns A new PermutationBuilder instance
|
|
98
|
+
* @example
|
|
99
|
+
* ```ts
|
|
100
|
+
* builder.filter("age > 18 AND status = 'active'");
|
|
101
|
+
* ```
|
|
102
|
+
*/
|
|
103
|
+
filter(filter: string): PermutationBuilder;
|
|
104
|
+
/**
|
|
105
|
+
* Execute the permutation and create the destination table.
|
|
106
|
+
*
|
|
107
|
+
* @returns A Promise that resolves to the new Table instance
|
|
108
|
+
* @example
|
|
109
|
+
* ```ts
|
|
110
|
+
* const permutationTable = await builder.execute();
|
|
111
|
+
* console.log(`Created table: ${permutationTable.name}`);
|
|
112
|
+
* ```
|
|
113
|
+
*/
|
|
114
|
+
execute(): Promise<Table>;
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Create a permutation builder for the given table.
|
|
118
|
+
*
|
|
119
|
+
* @param table - The source table to create a permutation from
|
|
120
|
+
* @param destTableName - The name for the destination permutation table
|
|
121
|
+
* @returns A PermutationBuilder instance
|
|
122
|
+
* @example
|
|
123
|
+
* ```ts
|
|
124
|
+
* const builder = permutationBuilder(sourceTable, "training_data")
|
|
125
|
+
* .splitRandom({ ratios: [0.8, 0.2], seed: 42 })
|
|
126
|
+
* .shuffle({ seed: 123 });
|
|
127
|
+
*
|
|
128
|
+
* const trainingTable = await builder.execute();
|
|
129
|
+
* ```
|
|
130
|
+
*/
|
|
131
|
+
export declare function permutationBuilder(table: Table, destTableName: string): PermutationBuilder;
|
package/dist/permutation.js
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
4
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
+
exports.PermutationBuilder = void 0;
|
|
6
|
+
exports.permutationBuilder = permutationBuilder;
|
|
7
|
+
const native_js_1 = require("./native.js");
|
|
8
|
+
const table_1 = require("./table");
|
|
9
|
+
/**
|
|
10
|
+
* A PermutationBuilder for creating data permutations with splits, shuffling, and filtering.
|
|
11
|
+
*
|
|
12
|
+
* This class provides a TypeScript wrapper around the native Rust PermutationBuilder,
|
|
13
|
+
* offering methods to configure data splits, shuffling, and filtering before executing
|
|
14
|
+
* the permutation to create a new table.
|
|
15
|
+
*/
|
|
16
|
+
class PermutationBuilder {
|
|
17
|
+
inner;
|
|
18
|
+
/**
|
|
19
|
+
* @hidden
|
|
20
|
+
*/
|
|
21
|
+
constructor(inner) {
|
|
22
|
+
this.inner = inner;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Configure random splits for the permutation.
|
|
26
|
+
*
|
|
27
|
+
* @param options - Configuration for random splitting
|
|
28
|
+
* @returns A new PermutationBuilder instance
|
|
29
|
+
* @example
|
|
30
|
+
* ```ts
|
|
31
|
+
* // Split by ratios
|
|
32
|
+
* builder.splitRandom({ ratios: [0.7, 0.3], seed: 42 });
|
|
33
|
+
*
|
|
34
|
+
* // Split by counts
|
|
35
|
+
* builder.splitRandom({ counts: [1000, 500], seed: 42 });
|
|
36
|
+
*
|
|
37
|
+
* // Split with fixed size
|
|
38
|
+
* builder.splitRandom({ fixed: 100, seed: 42 });
|
|
39
|
+
* ```
|
|
40
|
+
*/
|
|
41
|
+
splitRandom(options) {
|
|
42
|
+
const newInner = this.inner.splitRandom(options);
|
|
43
|
+
return new PermutationBuilder(newInner);
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* Configure hash-based splits for the permutation.
|
|
47
|
+
*
|
|
48
|
+
* @param options - Configuration for hash-based splitting
|
|
49
|
+
* @returns A new PermutationBuilder instance
|
|
50
|
+
* @example
|
|
51
|
+
* ```ts
|
|
52
|
+
* builder.splitHash({
|
|
53
|
+
* columns: ["user_id"],
|
|
54
|
+
* splitWeights: [70, 30],
|
|
55
|
+
* discardWeight: 0
|
|
56
|
+
* });
|
|
57
|
+
* ```
|
|
58
|
+
*/
|
|
59
|
+
splitHash(options) {
|
|
60
|
+
const newInner = this.inner.splitHash(options);
|
|
61
|
+
return new PermutationBuilder(newInner);
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Configure sequential splits for the permutation.
|
|
65
|
+
*
|
|
66
|
+
* @param options - Configuration for sequential splitting
|
|
67
|
+
* @returns A new PermutationBuilder instance
|
|
68
|
+
* @example
|
|
69
|
+
* ```ts
|
|
70
|
+
* // Split by ratios
|
|
71
|
+
* builder.splitSequential({ ratios: [0.8, 0.2] });
|
|
72
|
+
*
|
|
73
|
+
* // Split by counts
|
|
74
|
+
* builder.splitSequential({ counts: [800, 200] });
|
|
75
|
+
*
|
|
76
|
+
* // Split with fixed size
|
|
77
|
+
* builder.splitSequential({ fixed: 1000 });
|
|
78
|
+
* ```
|
|
79
|
+
*/
|
|
80
|
+
splitSequential(options) {
|
|
81
|
+
const newInner = this.inner.splitSequential(options);
|
|
82
|
+
return new PermutationBuilder(newInner);
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Configure calculated splits for the permutation.
|
|
86
|
+
*
|
|
87
|
+
* @param calculation - SQL expression for calculating splits
|
|
88
|
+
* @returns A new PermutationBuilder instance
|
|
89
|
+
* @example
|
|
90
|
+
* ```ts
|
|
91
|
+
* builder.splitCalculated("user_id % 3");
|
|
92
|
+
* ```
|
|
93
|
+
*/
|
|
94
|
+
splitCalculated(calculation) {
|
|
95
|
+
const newInner = this.inner.splitCalculated(calculation);
|
|
96
|
+
return new PermutationBuilder(newInner);
|
|
97
|
+
}
|
|
98
|
+
/**
|
|
99
|
+
* Configure shuffling for the permutation.
|
|
100
|
+
*
|
|
101
|
+
* @param options - Configuration for shuffling
|
|
102
|
+
* @returns A new PermutationBuilder instance
|
|
103
|
+
* @example
|
|
104
|
+
* ```ts
|
|
105
|
+
* // Basic shuffle
|
|
106
|
+
* builder.shuffle({ seed: 42 });
|
|
107
|
+
*
|
|
108
|
+
* // Shuffle with clump size
|
|
109
|
+
* builder.shuffle({ seed: 42, clumpSize: 10 });
|
|
110
|
+
* ```
|
|
111
|
+
*/
|
|
112
|
+
shuffle(options) {
|
|
113
|
+
const newInner = this.inner.shuffle(options);
|
|
114
|
+
return new PermutationBuilder(newInner);
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Configure filtering for the permutation.
|
|
118
|
+
*
|
|
119
|
+
* @param filter - SQL filter expression
|
|
120
|
+
* @returns A new PermutationBuilder instance
|
|
121
|
+
* @example
|
|
122
|
+
* ```ts
|
|
123
|
+
* builder.filter("age > 18 AND status = 'active'");
|
|
124
|
+
* ```
|
|
125
|
+
*/
|
|
126
|
+
filter(filter) {
|
|
127
|
+
const newInner = this.inner.filter(filter);
|
|
128
|
+
return new PermutationBuilder(newInner);
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Execute the permutation and create the destination table.
|
|
132
|
+
*
|
|
133
|
+
* @returns A Promise that resolves to the new Table instance
|
|
134
|
+
* @example
|
|
135
|
+
* ```ts
|
|
136
|
+
* const permutationTable = await builder.execute();
|
|
137
|
+
* console.log(`Created table: ${permutationTable.name}`);
|
|
138
|
+
* ```
|
|
139
|
+
*/
|
|
140
|
+
async execute() {
|
|
141
|
+
const nativeTable = await this.inner.execute();
|
|
142
|
+
return new table_1.LocalTable(nativeTable);
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
exports.PermutationBuilder = PermutationBuilder;
|
|
146
|
+
/**
|
|
147
|
+
* Create a permutation builder for the given table.
|
|
148
|
+
*
|
|
149
|
+
* @param table - The source table to create a permutation from
|
|
150
|
+
* @param destTableName - The name for the destination permutation table
|
|
151
|
+
* @returns A PermutationBuilder instance
|
|
152
|
+
* @example
|
|
153
|
+
* ```ts
|
|
154
|
+
* const builder = permutationBuilder(sourceTable, "training_data")
|
|
155
|
+
* .splitRandom({ ratios: [0.8, 0.2], seed: 42 })
|
|
156
|
+
* .shuffle({ seed: 123 });
|
|
157
|
+
*
|
|
158
|
+
* const trainingTable = await builder.execute();
|
|
159
|
+
* ```
|
|
160
|
+
*/
|
|
161
|
+
function permutationBuilder(table, destTableName) {
|
|
162
|
+
// Extract the inner native table from the TypeScript wrapper
|
|
163
|
+
const localTable = table;
|
|
164
|
+
// Access inner through type assertion since it's private
|
|
165
|
+
const nativeBuilder = (0, native_js_1.permutationBuilder)(
|
|
166
|
+
// biome-ignore lint/suspicious/noExplicitAny: need access to private variable
|
|
167
|
+
localTable.inner, destTableName);
|
|
168
|
+
return new PermutationBuilder(nativeBuilder);
|
|
169
|
+
}
|
package/dist/sanitize.d.ts
CHANGED
|
@@ -29,3 +29,4 @@ export declare function sanitizeField(fieldLike: unknown): Field;
|
|
|
29
29
|
*/
|
|
30
30
|
export declare function sanitizeSchema(schemaLike: SchemaLike): Schema;
|
|
31
31
|
export declare function sanitizeTable(tableLike: TableLike): Table;
|
|
32
|
+
export declare function dataTypeFromName(typeName: string): DataType;
|
package/dist/sanitize.js
CHANGED
|
@@ -24,6 +24,7 @@ exports.sanitizeType = sanitizeType;
|
|
|
24
24
|
exports.sanitizeField = sanitizeField;
|
|
25
25
|
exports.sanitizeSchema = sanitizeSchema;
|
|
26
26
|
exports.sanitizeTable = sanitizeTable;
|
|
27
|
+
exports.dataTypeFromName = dataTypeFromName;
|
|
27
28
|
// The utilities in this file help sanitize data from the user's arrow
|
|
28
29
|
// library into the types expected by vectordb's arrow library. Node
|
|
29
30
|
// generally allows for mulitple versions of the same library (and sometimes
|
|
@@ -204,6 +205,9 @@ function sanitizeDictionary(typeLike) {
|
|
|
204
205
|
}
|
|
205
206
|
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
206
207
|
function sanitizeType(typeLike) {
|
|
208
|
+
if (typeof typeLike === "string") {
|
|
209
|
+
return dataTypeFromName(typeLike);
|
|
210
|
+
}
|
|
207
211
|
if (typeof typeLike !== "object" || typeLike === null) {
|
|
208
212
|
throw Error("Expected a Type but object was null/undefined");
|
|
209
213
|
}
|
|
@@ -322,7 +326,7 @@ function sanitizeType(typeLike) {
|
|
|
322
326
|
case arrow_1.Type.DurationSecond:
|
|
323
327
|
return new arrow_1.DurationSecond();
|
|
324
328
|
default:
|
|
325
|
-
throw new Error("
|
|
329
|
+
throw new Error("Unrecognized type id in schema: " + typeId);
|
|
326
330
|
}
|
|
327
331
|
}
|
|
328
332
|
function sanitizeField(fieldLike) {
|
|
@@ -337,7 +341,13 @@ function sanitizeField(fieldLike) {
|
|
|
337
341
|
!("nullable" in fieldLike)) {
|
|
338
342
|
throw Error("The field passed in is missing a `type`/`name`/`nullable` property");
|
|
339
343
|
}
|
|
340
|
-
|
|
344
|
+
let type;
|
|
345
|
+
try {
|
|
346
|
+
type = sanitizeType(fieldLike.type);
|
|
347
|
+
}
|
|
348
|
+
catch (error) {
|
|
349
|
+
throw Error(`Unable to sanitize type for field: ${fieldLike.name} due to error: ${error}`, { cause: error });
|
|
350
|
+
}
|
|
341
351
|
const name = fieldLike.name;
|
|
342
352
|
if (!(typeof name === "string")) {
|
|
343
353
|
throw Error("The field passed in had a non-string `name` property");
|
|
@@ -424,3 +434,40 @@ function sanitizeData(dataLike) {
|
|
|
424
434
|
[apache_arrow_1.BufferType.TYPE]: dataLike.typeIds,
|
|
425
435
|
});
|
|
426
436
|
}
|
|
437
|
+
const constructorsByTypeName = {
|
|
438
|
+
null: () => new arrow_1.Null(),
|
|
439
|
+
binary: () => new arrow_1.Binary(),
|
|
440
|
+
utf8: () => new arrow_1.Utf8(),
|
|
441
|
+
bool: () => new arrow_1.Bool(),
|
|
442
|
+
int8: () => new arrow_1.Int8(),
|
|
443
|
+
int16: () => new arrow_1.Int16(),
|
|
444
|
+
int32: () => new arrow_1.Int32(),
|
|
445
|
+
int64: () => new arrow_1.Int64(),
|
|
446
|
+
uint8: () => new arrow_1.Uint8(),
|
|
447
|
+
uint16: () => new arrow_1.Uint16(),
|
|
448
|
+
uint32: () => new arrow_1.Uint32(),
|
|
449
|
+
uint64: () => new arrow_1.Uint64(),
|
|
450
|
+
float16: () => new arrow_1.Float16(),
|
|
451
|
+
float32: () => new arrow_1.Float32(),
|
|
452
|
+
float64: () => new arrow_1.Float64(),
|
|
453
|
+
datemillisecond: () => new arrow_1.DateMillisecond(),
|
|
454
|
+
dateday: () => new arrow_1.DateDay(),
|
|
455
|
+
timenanosecond: () => new arrow_1.TimeNanosecond(),
|
|
456
|
+
timemicrosecond: () => new arrow_1.TimeMicrosecond(),
|
|
457
|
+
timemillisecond: () => new arrow_1.TimeMillisecond(),
|
|
458
|
+
timesecond: () => new arrow_1.TimeSecond(),
|
|
459
|
+
intervaldaytime: () => new arrow_1.IntervalDayTime(),
|
|
460
|
+
intervalyearmonth: () => new arrow_1.IntervalYearMonth(),
|
|
461
|
+
durationnanosecond: () => new arrow_1.DurationNanosecond(),
|
|
462
|
+
durationmicrosecond: () => new arrow_1.DurationMicrosecond(),
|
|
463
|
+
durationmillisecond: () => new arrow_1.DurationMillisecond(),
|
|
464
|
+
durationsecond: () => new arrow_1.DurationSecond(),
|
|
465
|
+
};
|
|
466
|
+
function dataTypeFromName(typeName) {
|
|
467
|
+
const normalizedTypeName = typeName.toLowerCase();
|
|
468
|
+
const _constructor = constructorsByTypeName[normalizedTypeName];
|
|
469
|
+
if (!_constructor) {
|
|
470
|
+
throw new Error("Unrecognized type name in schema: " + typeName);
|
|
471
|
+
}
|
|
472
|
+
return _constructor();
|
|
473
|
+
}
|
package/package.json
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"ann"
|
|
12
12
|
],
|
|
13
13
|
"private": false,
|
|
14
|
-
"version": "0.22.2-beta.2",
|
|
14
|
+
"version": "0.22.3-beta.0",
|
|
15
15
|
"main": "dist/index.js",
|
|
16
16
|
"exports": {
|
|
17
17
|
".": "./dist/index.js",
|
|
@@ -100,14 +100,14 @@
|
|
|
100
100
|
"reflect-metadata": "^0.2.2"
|
|
101
101
|
},
|
|
102
102
|
"optionalDependencies": {
|
|
103
|
-
"@lancedb/lancedb-darwin-x64": "0.22.2-beta.2",
|
|
104
|
-
"@lancedb/lancedb-darwin-arm64": "0.22.2-beta.2",
|
|
105
|
-
"@lancedb/lancedb-linux-x64-gnu": "0.22.2-beta.2",
|
|
106
|
-
"@lancedb/lancedb-linux-arm64-gnu": "0.22.2-beta.2",
|
|
107
|
-
"@lancedb/lancedb-linux-x64-musl": "0.22.2-beta.2",
|
|
108
|
-
"@lancedb/lancedb-linux-arm64-musl": "0.22.2-beta.2",
|
|
109
|
-
"@lancedb/lancedb-win32-x64-msvc": "0.22.2-beta.2",
|
|
110
|
-
"@lancedb/lancedb-win32-arm64-msvc": "0.22.2-beta.2"
|
|
103
|
+
"@lancedb/lancedb-darwin-x64": "0.22.3-beta.0",
|
|
104
|
+
"@lancedb/lancedb-darwin-arm64": "0.22.3-beta.0",
|
|
105
|
+
"@lancedb/lancedb-linux-x64-gnu": "0.22.3-beta.0",
|
|
106
|
+
"@lancedb/lancedb-linux-arm64-gnu": "0.22.3-beta.0",
|
|
107
|
+
"@lancedb/lancedb-linux-x64-musl": "0.22.3-beta.0",
|
|
108
|
+
"@lancedb/lancedb-linux-arm64-musl": "0.22.3-beta.0",
|
|
109
|
+
"@lancedb/lancedb-win32-x64-msvc": "0.22.3-beta.0",
|
|
110
|
+
"@lancedb/lancedb-win32-arm64-msvc": "0.22.3-beta.0"
|
|
111
111
|
},
|
|
112
112
|
"peerDependencies": {
|
|
113
113
|
"apache-arrow": ">=15.0.0 <=18.1.0"
|