@dengxifeng/lancedb 0.26.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +13 -0
- package/CONTRIBUTING.md +76 -0
- package/README.md +37 -0
- package/dist/arrow.d.ts +279 -0
- package/dist/arrow.js +1316 -0
- package/dist/connection.d.ts +259 -0
- package/dist/connection.js +224 -0
- package/dist/embedding/embedding_function.d.ts +103 -0
- package/dist/embedding/embedding_function.js +192 -0
- package/dist/embedding/index.d.ts +27 -0
- package/dist/embedding/index.js +101 -0
- package/dist/embedding/openai.d.ts +16 -0
- package/dist/embedding/openai.js +93 -0
- package/dist/embedding/registry.d.ts +74 -0
- package/dist/embedding/registry.js +165 -0
- package/dist/embedding/transformers.d.ts +36 -0
- package/dist/embedding/transformers.js +122 -0
- package/dist/header.d.ts +162 -0
- package/dist/header.js +217 -0
- package/dist/index.d.ts +85 -0
- package/dist/index.js +106 -0
- package/dist/indices.d.ts +692 -0
- package/dist/indices.js +156 -0
- package/dist/merge.d.ts +80 -0
- package/dist/merge.js +92 -0
- package/dist/native.d.ts +585 -0
- package/dist/native.js +339 -0
- package/dist/permutation.d.ts +143 -0
- package/dist/permutation.js +184 -0
- package/dist/query.d.ts +581 -0
- package/dist/query.js +853 -0
- package/dist/rerankers/index.d.ts +5 -0
- package/dist/rerankers/index.js +19 -0
- package/dist/rerankers/rrf.d.ts +14 -0
- package/dist/rerankers/rrf.js +28 -0
- package/dist/sanitize.d.ts +32 -0
- package/dist/sanitize.js +473 -0
- package/dist/table.d.ts +581 -0
- package/dist/table.js +321 -0
- package/dist/util.d.ts +14 -0
- package/dist/util.js +77 -0
- package/license_header.txt +2 -0
- package/package.json +122 -0
package/dist/indices.js
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
4
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
+
exports.Index = void 0;
|
|
6
|
+
const native_1 = require("./native");
|
|
7
|
+
/**
 * Describes an index to build on a table column.
 *
 * Instances wrap a native index configuration object (kept in `inner`) and
 * are created through the static factory methods below; each factory simply
 * forwards the supplied options, in a fixed positional order, to the matching
 * `native_1.Index` factory. Options that are omitted are forwarded as
 * `undefined`.
 */
class Index {
    /** The wrapped native index configuration object. */
    inner;
    /**
     * Wrap an already-built native index configuration.
     *
     * @param inner - the native index object to wrap
     */
    constructor(inner) {
        this.inner = inner;
    }
    /**
     * Create an IvfPq index
     *
     * This index stores a compressed (quantized) copy of every vector. These vectors
     * are grouped into partitions of similar vectors. Each partition keeps track of
     * a centroid which is the average value of all vectors in the group.
     *
     * During a query the centroids are compared with the query vector to find the closest
     * partitions. The compressed vectors in these partitions are then searched to find
     * the closest vectors.
     *
     * The compression scheme is called product quantization. Each vector is divided into
     * subvectors and then each subvector is quantized into a small number of bits. The
     * parameters `numBits` and `numSubVectors` control this process, providing a tradeoff
     * between index size (and thus search speed) and index accuracy.
     *
     * The partitioning process is called IVF and the `numPartitions` parameter controls how
     * many groups to create.
     *
     * Note that training an IVF PQ index on a large dataset is a slow operation and
     * currently is also a memory intensive operation.
     *
     * @param options - optional settings; the fields read are `distanceType`,
     *   `numPartitions`, `numSubVectors`, `numBits`, `maxIterations` and `sampleRate`
     * @returns a new `Index` wrapping the native IvfPq configuration
     */
    static ivfPq(options) {
        return new Index(native_1.Index.ivfPq(
            options?.distanceType,
            options?.numPartitions,
            options?.numSubVectors,
            options?.numBits,
            options?.maxIterations,
            options?.sampleRate,
        ));
    }
    /**
     * Create an IvfRq index
     *
     * IVF-RQ (RabitQ Quantization) compresses vectors using RabitQ quantization
     * and organizes them into IVF partitions.
     *
     * The compression scheme is called RabitQ quantization. Each dimension is quantized
     * into a small number of bits. The parameters `numBits` and `numPartitions` control
     * this process, providing a tradeoff between index size (and thus search speed) and
     * index accuracy.
     *
     * The partitioning process is called IVF and the `numPartitions` parameter controls how
     * many groups to create.
     *
     * Note that training an IVF RQ index on a large dataset is a slow operation and
     * currently is also a memory intensive operation.
     *
     * @param options - optional settings; the fields read are `distanceType`,
     *   `numPartitions`, `numBits`, `maxIterations` and `sampleRate`
     * @returns a new `Index` wrapping the native IvfRq configuration
     */
    static ivfRq(options) {
        return new Index(native_1.Index.ivfRq(
            options?.distanceType,
            options?.numPartitions,
            options?.numBits,
            options?.maxIterations,
            options?.sampleRate,
        ));
    }
    /**
     * Create an IvfFlat index
     *
     * This index groups vectors into partitions of similar vectors. Each partition keeps track of
     * a centroid which is the average value of all vectors in the group.
     *
     * During a query the centroids are compared with the query vector to find the closest
     * partitions. The vectors in these partitions are then searched to find
     * the closest vectors.
     *
     * The partitioning process is called IVF and the `numPartitions` parameter controls how
     * many groups to create.
     *
     * Note that training an IVF FLAT index on a large dataset is a slow operation and
     * currently is also a memory intensive operation.
     *
     * @param options - optional settings; the fields read are `distanceType`,
     *   `numPartitions`, `maxIterations` and `sampleRate`
     * @returns a new `Index` wrapping the native IvfFlat configuration
     */
    static ivfFlat(options) {
        return new Index(native_1.Index.ivfFlat(
            options?.distanceType,
            options?.numPartitions,
            options?.maxIterations,
            options?.sampleRate,
        ));
    }
    /**
     * Create a btree index
     *
     * A btree index is an index on a scalar column. The index stores a copy of the column
     * in sorted order. A header entry is created for each block of rows (currently the
     * block size is fixed at 4096). These header entries are stored in a separate
     * cacheable structure (a btree). To search for data the header is used to determine
     * which blocks need to be read from disk.
     *
     * For example, a btree index in a table with 1Bi rows requires sizeof(Scalar) * 256Ki
     * bytes of memory and will generally need to read sizeof(Scalar) * 4096 bytes to find
     * the correct row ids.
     *
     * This index is good for scalar columns with mostly distinct values and does best when
     * the query is highly selective.
     *
     * The btree index does not currently have any parameters though parameters such as the
     * block size may be added in the future.
     *
     * @returns a new `Index` wrapping the native btree configuration
     */
    static btree() {
        return new Index(native_1.Index.btree());
    }
    /**
     * Create a bitmap index.
     *
     * A `Bitmap` index stores a bitmap for each distinct value in the column for every row.
     *
     * This index works best for low-cardinality columns, where the number of unique values
     * is small (i.e., less than a few hundred).
     *
     * @returns a new `Index` wrapping the native bitmap configuration
     */
    static bitmap() {
        return new Index(native_1.Index.bitmap());
    }
    /**
     * Create a label list index.
     *
     * LabelList index is a scalar index that can be used on `List<T>` columns to
     * support queries with `array_contains_all` and `array_contains_any`
     * using an underlying bitmap index.
     *
     * @returns a new `Index` wrapping the native label list configuration
     */
    static labelList() {
        return new Index(native_1.Index.labelList());
    }
    /**
     * Create a full text search index
     *
     * A full text search index is an index on a string column, so that you can conduct full
     * text searches on the column.
     *
     * The results of a full text search are ordered by relevance measured by BM25.
     *
     * You can combine filters with full text search.
     *
     * @param options - optional settings; the fields read are `withPosition`,
     *   `baseTokenizer`, `language`, `maxTokenLength`, `lowercase`, `stem`,
     *   `removeStopWords`, `asciiFolding`, `ngramMinLength`, `ngramMaxLength`
     *   and `prefixOnly`
     * @returns a new `Index` wrapping the native full text search configuration
     */
    static fts(options) {
        return new Index(native_1.Index.fts(
            options?.withPosition,
            options?.baseTokenizer,
            options?.language,
            options?.maxTokenLength,
            options?.lowercase,
            options?.stem,
            options?.removeStopWords,
            options?.asciiFolding,
            options?.ngramMinLength,
            options?.ngramMaxLength,
            options?.prefixOnly,
        ));
    }
    /**
     * Create a hnswPq index
     *
     * HNSW-PQ stands for Hierarchical Navigable Small World - Product Quantization.
     * It is a variant of the HNSW algorithm that uses product quantization to compress
     * the vectors.
     *
     * @param options - optional settings; the fields read are `distanceType`,
     *   `numPartitions`, `numSubVectors`, `maxIterations`, `sampleRate`, `m`
     *   and `efConstruction`
     * @returns a new `Index` wrapping the native hnswPq configuration
     */
    static hnswPq(options) {
        return new Index(native_1.Index.hnswPq(
            options?.distanceType,
            options?.numPartitions,
            options?.numSubVectors,
            options?.maxIterations,
            options?.sampleRate,
            options?.m,
            options?.efConstruction,
        ));
    }
    /**
     * Create a hnswSq index
     *
     * HNSW-SQ stands for Hierarchical Navigable Small World - Scalar Quantization.
     * It is a variant of the HNSW algorithm that uses scalar quantization to compress
     * the vectors.
     *
     * @param options - optional settings; the fields read are `distanceType`,
     *   `numPartitions`, `maxIterations`, `sampleRate`, `m` and `efConstruction`
     * @returns a new `Index` wrapping the native hnswSq configuration
     */
    static hnswSq(options) {
        return new Index(native_1.Index.hnswSq(
            options?.distanceType,
            options?.numPartitions,
            options?.maxIterations,
            options?.sampleRate,
            options?.m,
            options?.efConstruction,
        ));
    }
}
|
|
156
|
+
exports.Index = Index;
|
package/dist/merge.d.ts
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { Data, Schema } from "./arrow";
|
|
2
|
+
import { MergeResult, NativeMergeInsertBuilder } from "./native";
|
|
3
|
+
/** A builder used to create and run a merge insert operation */
export declare class MergeInsertBuilder {
    #private;
    /**
     * Construct a MergeInsertBuilder. __Internal use only.__
     *
     * @param native - the native merge insert builder to wrap
     * @param schema - the schema, or a promise of the schema, used when
     *   converting the data passed to {@link MergeInsertBuilder.execute}
     */
    constructor(native: NativeMergeInsertBuilder, schema: Schema | Promise<Schema>);
    /**
     * Rows that exist in both the source table (new data) and
     * the target table (old data) will be updated, replacing
     * the old row with the corresponding matching row.
     *
     * If there are multiple matches then the behavior is undefined.
     * Currently this causes multiple copies of the row to be created
     * but that behavior is subject to change.
     *
     * An optional condition may be specified. If it is, then only
     * matched rows that satisfy the condition will be updated. Any
     * rows that do not satisfy the condition will be left as they
     * are. Failing to satisfy the condition does not cause a
     * "matched row" to become a "not matched" row.
     *
     * The condition should be an SQL string. Use the prefix
     * `target.` to refer to rows in the target table (old data)
     * and the prefix `source.` to refer to rows in the source
     * table (new data).
     *
     * For example, "target.last_update < source.last_update"
     *
     * @param options.where - An optional condition to limit which matched rows are updated
     * @returns a new builder with this clause applied
     */
    whenMatchedUpdateAll(options?: {
        where: string;
    }): MergeInsertBuilder;
    /**
     * Rows that exist only in the source table (new data) should
     * be inserted into the target table.
     *
     * @returns a new builder with this clause applied
     */
    whenNotMatchedInsertAll(): MergeInsertBuilder;
    /**
     * Rows that exist only in the target table (old data) will be
     * deleted. An optional condition can be provided to limit what
     * data is deleted.
     *
     * @param options.where - An optional condition to limit what data is deleted
     * @returns a new builder with this clause applied
     */
    whenNotMatchedBySourceDelete(options?: {
        where: string;
    }): MergeInsertBuilder;
    /**
     * Controls whether to use indexes for the merge operation.
     *
     * When set to `true` (the default), the operation will use an index if available
     * on the join key for improved performance. When set to `false`, it forces a full
     * table scan even if an index exists. This can be useful for benchmarking or when
     * the query optimizer chooses a suboptimal path.
     *
     * @param useIndex - Whether to use indices for the merge operation. Defaults to `true`.
     * @returns a new builder with this setting applied
     */
    useIndex(useIndex: boolean): MergeInsertBuilder;
    /**
     * Executes the merge insert operation
     *
     * @param data - the source (new) data to merge into the target table
     * @param execOptions - optional execution settings, see {@link WriteExecutionOptions}
     * @returns {Promise<MergeResult>} the merge result
     */
    execute(data: Data, execOptions?: Partial<WriteExecutionOptions>): Promise<MergeResult>;
}
|
|
66
|
+
/** Options that control how a write operation (such as a merge insert) is executed. */
export interface WriteExecutionOptions {
    /**
     * Maximum time to run the operation before cancelling it.
     *
     * By default, there is a 30-second timeout that is only enforced after the
     * first attempt. This is to prevent spending too long retrying to resolve
     * conflicts. For example, if a write attempt takes 20 seconds and fails,
     * the second attempt will be cancelled after 10 seconds, hitting the
     * 30-second timeout. However, a write that takes one hour and succeeds on the
     * first attempt will not be cancelled.
     *
     * When this is set, the timeout is enforced on all attempts, including the first.
     */
    timeoutMs?: number;
}
|
package/dist/merge.js
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.MergeInsertBuilder = void 0;
|
|
4
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
5
|
+
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
6
|
+
const arrow_1 = require("./arrow");
|
|
7
|
+
/** A builder used to create and run a merge insert operation */
class MergeInsertBuilder {
    /** The wrapped native merge insert builder. */
    #native;
    /** The schema, or a promise/thenable of it; replaced by the resolved value on first execute. */
    #schema;
    /**
     * Construct a MergeInsertBuilder. __Internal use only.__
     *
     * @param native - the native merge insert builder to wrap
     * @param schema - the schema, or a promise of the schema, used when
     *   converting the data passed to `execute`
     */
    constructor(native, schema) {
        this.#native = native;
        this.#schema = schema;
    }
    /**
     * Rows that exist in both the source table (new data) and
     * the target table (old data) will be updated, replacing
     * the old row with the corresponding matching row.
     *
     * If there are multiple matches then the behavior is undefined.
     * Currently this causes multiple copies of the row to be created
     * but that behavior is subject to change.
     *
     * An optional condition may be specified. If it is, then only
     * matched rows that satisfy the condition will be updated. Any
     * rows that do not satisfy the condition will be left as they
     * are. Failing to satisfy the condition does not cause a
     * "matched row" to become a "not matched" row.
     *
     * The condition should be an SQL string. Use the prefix
     * `target.` to refer to rows in the target table (old data)
     * and the prefix `source.` to refer to rows in the source
     * table (new data).
     *
     * For example, "target.last_update < source.last_update"
     *
     * @param options.where - An optional condition to limit which matched rows are updated
     * @returns a new builder wrapping the updated native builder
     */
    whenMatchedUpdateAll(options) {
        return new MergeInsertBuilder(this.#native.whenMatchedUpdateAll(options?.where), this.#schema);
    }
    /**
     * Rows that exist only in the source table (new data) should
     * be inserted into the target table.
     *
     * @returns a new builder wrapping the updated native builder
     */
    whenNotMatchedInsertAll() {
        return new MergeInsertBuilder(this.#native.whenNotMatchedInsertAll(), this.#schema);
    }
    /**
     * Rows that exist only in the target table (old data) will be
     * deleted. An optional condition can be provided to limit what
     * data is deleted.
     *
     * @param options.where - An optional condition to limit what data is deleted
     * @returns a new builder wrapping the updated native builder
     */
    whenNotMatchedBySourceDelete(options) {
        return new MergeInsertBuilder(this.#native.whenNotMatchedBySourceDelete(options?.where), this.#schema);
    }
    /**
     * Controls whether to use indexes for the merge operation.
     *
     * When set to `true` (the default), the operation will use an index if available
     * on the join key for improved performance. When set to `false`, it forces a full
     * table scan even if an index exists. This can be useful for benchmarking or when
     * the query optimizer chooses a suboptimal path.
     *
     * @param useIndex - Whether to use indices for the merge operation. Defaults to `true`.
     * @returns a new builder wrapping the updated native builder
     */
    useIndex(useIndex) {
        return new MergeInsertBuilder(this.#native.useIndex(useIndex), this.#schema);
    }
    /**
     * Executes the merge insert operation
     *
     * @param data - the source (new) data; it is converted to a buffer with
     *   `fromDataToBuffer` using the builder's schema before being handed to
     *   the native builder
     * @param execOptions - optional execution settings; when `timeoutMs` is
     *   defined it is applied to the native builder before executing
     * @returns {Promise<MergeResult>} the merge result
     */
    async execute(data, execOptions) {
        // `await` passes a plain schema through unchanged and also unwraps any
        // promise or thenable, so no `instanceof Promise` check is needed (such
        // a check misses thenables and promises created in another realm).
        const schema = await this.#schema;
        this.#schema = schema; // In case of future calls
        if (execOptions?.timeoutMs !== undefined) {
            this.#native.setTimeout(execOptions.timeoutMs);
        }
        const buffer = await (0, arrow_1.fromDataToBuffer)(data, undefined, schema);
        return await this.#native.execute(buffer);
    }
}
|
|
92
|
+
exports.MergeInsertBuilder = MergeInsertBuilder;
|