@lancedb/lancedb 0.4.3 → 0.4.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +35 -3
  2. package/dist/arrow.d.ts +189 -0
  3. package/dist/arrow.js +539 -0
  4. package/dist/connection.d.ts +97 -0
  5. package/dist/connection.js +126 -0
  6. package/dist/embedding/embedding_function.d.ts +45 -0
  7. package/dist/embedding/embedding_function.js +27 -0
  8. package/dist/embedding/index.d.ts +2 -0
  9. package/dist/embedding/index.js +7 -0
  10. package/dist/embedding/openai.d.ts +8 -0
  11. package/dist/embedding/openai.js +53 -0
  12. package/dist/index.d.ts +22 -0
  13. package/dist/index.js +52 -0
  14. package/dist/indices.d.ts +165 -0
  15. package/dist/indices.js +71 -0
  16. package/dist/native.d.ts +147 -0
  17. package/dist/native.js +314 -0
  18. package/dist/query.d.ts +248 -0
  19. package/dist/query.js +346 -0
  20. package/dist/sanitize.d.ts +9 -0
  21. package/dist/sanitize.js +369 -0
  22. package/dist/table.d.ts +252 -0
  23. package/dist/table.js +298 -0
  24. package/nodejs-artifacts/arrow.d.ts +189 -0
  25. package/nodejs-artifacts/arrow.js +539 -0
  26. package/nodejs-artifacts/connection.d.ts +97 -0
  27. package/nodejs-artifacts/connection.js +126 -0
  28. package/nodejs-artifacts/embedding/embedding_function.d.ts +45 -0
  29. package/nodejs-artifacts/embedding/embedding_function.js +27 -0
  30. package/nodejs-artifacts/embedding/index.d.ts +2 -0
  31. package/nodejs-artifacts/embedding/index.js +7 -0
  32. package/nodejs-artifacts/embedding/openai.d.ts +8 -0
  33. package/nodejs-artifacts/embedding/openai.js +53 -0
  34. package/nodejs-artifacts/index.d.ts +22 -0
  35. package/nodejs-artifacts/index.js +52 -0
  36. package/nodejs-artifacts/indices.d.ts +165 -0
  37. package/nodejs-artifacts/indices.js +71 -0
  38. package/nodejs-artifacts/native.d.ts +147 -0
  39. package/nodejs-artifacts/native.js +314 -0
  40. package/nodejs-artifacts/query.d.ts +248 -0
  41. package/nodejs-artifacts/query.js +346 -0
  42. package/nodejs-artifacts/sanitize.d.ts +9 -0
  43. package/nodejs-artifacts/sanitize.js +369 -0
  44. package/nodejs-artifacts/table.d.ts +252 -0
  45. package/nodejs-artifacts/table.js +298 -0
  46. package/package.json +9 -11
  47. package/typedoc.json +10 -0
  48. package/examples/js/index.mjs +0 -40
  49. package/examples/js/package.json +0 -14
  50. package/examples/js-openai/index.mjs +0 -43
  51. package/examples/js-openai/package-lock.json +0 -256
  52. package/examples/js-openai/package.json +0 -15
  53. package/examples/js-transformers/index.mjs +0 -65
  54. package/examples/js-transformers/package-lock.json +0 -1418
  55. package/examples/js-transformers/package.json +0 -15
  56. package/examples/js-youtube-transcripts/index.mjs +0 -135
  57. package/examples/js-youtube-transcripts/package.json +0 -15
  58. package/examples/ts/data/sample-lancedb/vectors.lance/_latest.manifest +0 -0
  59. package/examples/ts/data/sample-lancedb/vectors.lance/_transactions/0-adde4e05-fcfc-415c-86a6-5b252cb9e79a.txn +0 -0
  60. package/examples/ts/data/sample-lancedb/vectors.lance/_versions/1.manifest +0 -0
  61. package/examples/ts/data/sample-lancedb/vectors.lance/data/3618b33e-3eea-4b5e-a0fc-7d1f718d551e.lance +0 -0
  62. package/examples/ts/package-lock.json +0 -1340
  63. package/examples/ts/package.json +0 -22
  64. package/examples/ts/tsconfig.json +0 -10
@@ -0,0 +1,248 @@
1
+ import { RecordBatch, Table as ArrowTable } from "apache-arrow";
2
+ import { RecordBatchIterator as NativeBatchIterator, Query as NativeQuery, Table as NativeTable, VectorQuery as NativeVectorQuery } from "./native";
3
+ export declare class RecordBatchIterator implements AsyncIterator<RecordBatch> {
4
+ private promisedInner?;
5
+ private inner?;
6
+ constructor(promise?: Promise<NativeBatchIterator>);
7
+ next(): Promise<IteratorResult<RecordBatch<any>>>;
8
+ }
9
+ /** Common methods supported by all query types */
10
+ export declare class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery, QueryType> implements AsyncIterable<RecordBatch> {
11
+ protected inner: NativeQueryType;
12
+ protected constructor(inner: NativeQueryType);
13
+ /**
14
+ * A filter statement to be applied to this query.
15
+ *
16
+ * The filter should be supplied as an SQL query string. For example:
17
+ * @example
18
+ * x > 10
19
+ * y > 0 AND y < 100
20
+ * x > 5 OR y = 'test'
21
+ *
22
+ * Filtering performance can often be improved by creating a scalar index
23
+ * on the filter column(s).
24
+ */
25
+ where(predicate: string): QueryType;
26
+ /**
27
+ * Return only the specified columns.
28
+ *
29
+ * By default a query will return all columns from the table. However, this can have
30
+ * a very significant impact on latency. LanceDb stores data in a columnar fashion. This
31
+ * means we can finely tune our I/O to select exactly the columns we need.
32
+ *
33
+ * As a best practice you should always limit queries to the columns that you need. If you
34
+ * pass in an array of column names then only those columns will be returned.
35
+ *
36
+ * You can also use this method to create new "dynamic" columns based on your existing columns.
37
+ * For example, you may not care about "a" or "b" but instead simply want "a + b". This is often
38
+ * seen in the SELECT clause of an SQL query (e.g. `SELECT a+b FROM my_table`).
39
+ *
40
+ * To create dynamic columns you can pass in a Map<string, string>. A column will be returned
41
+ * for each entry in the map. The key provides the name of the column. The value is
42
+ * an SQL string used to specify how the column is calculated.
43
+ *
44
+ * For example, an SQL query might state `SELECT a + b AS combined, c`. The equivalent
45
+ * input to this method would be:
46
+ * @example
47
+ * new Map([["combined", "a + b"], ["c", "c"]])
48
+ *
49
+ * Columns will always be returned in the order given, even if that order is different than
50
+ * the order used when adding the data.
51
+ *
52
+ * Note that you can pass in a `Record<string, string>` (e.g. an object literal). This method
53
+ * uses `Object.entries` which should preserve the insertion order of the object. However,
54
+ * object insertion order is easy to get wrong and `Map` is more foolproof.
55
+ */
56
+ select(columns: string[] | Map<string, string> | Record<string, string>): QueryType;
57
+ /**
58
+ * Set the maximum number of results to return.
59
+ *
60
+ * By default, a plain search has no limit. If this method is not
61
+ * called then every valid row from the table will be returned.
62
+ */
63
+ limit(limit: number): QueryType;
64
+ protected nativeExecute(): Promise<NativeBatchIterator>;
65
+ /**
66
+ * Execute the query and return the results as an @see {@link AsyncIterator}
67
+ * of @see {@link RecordBatch}.
68
+ *
69
+ * By default, LanceDb will use many threads to calculate results and, when
70
+ * the result set is large, multiple batches will be processed at one time.
71
+ * This readahead is limited however and backpressure will be applied if this
72
+ * stream is consumed slowly (this constrains the maximum memory used by a
73
+ * single query)
74
+ *
75
+ */
76
+ protected execute(): RecordBatchIterator;
77
+ [Symbol.asyncIterator](): AsyncIterator<RecordBatch<any>>;
78
+ /** Collect the results as an Arrow @see {@link ArrowTable}. */
79
+ toArrow(): Promise<ArrowTable>;
80
+ /** Collect the results as an array of objects. */
81
+ toArray(): Promise<unknown[]>;
82
+ }
83
+ /**
84
+ * An interface for a query that can be executed
85
+ *
86
+ * Supported by all query types
87
+ */
88
+ export interface ExecutableQuery {
89
+ }
90
+ /**
91
+ * A builder used to construct a vector search
92
+ *
93
+ * This builder can be reused to execute the query many times.
94
+ */
95
+ export declare class VectorQuery extends QueryBase<NativeVectorQuery, VectorQuery> {
96
+ constructor(inner: NativeVectorQuery);
97
+ /**
98
+ * Set the number of partitions to search (probe)
99
+ *
100
+ * This argument is only used when the vector column has an IVF PQ index.
101
+ * If there is no index then this value is ignored.
102
+ *
103
+ * The IVF stage of IVF PQ divides the input into partitions (clusters) of
104
+ * related values.
105
+ *
106
+ * The partition whose centroids are closest to the query vector will be
107
+ * exhaustively searched to find matches. This parameter controls how many
108
+ * partitions should be searched.
109
+ *
110
+ * Increasing this value will increase the recall of your query but will
111
+ * also increase the latency of your query. The default value is 20. This
112
+ * default is good for many cases but the best value to use will depend on
113
+ * your data and the recall that you need to achieve.
114
+ *
115
+ * For best results we recommend tuning this parameter with a benchmark against
116
+ * your actual data to find the smallest possible value that will still give
117
+ * you the desired recall.
118
+ */
119
+ nprobes(nprobes: number): VectorQuery;
120
+ /**
121
+ * Set the vector column to query
122
+ *
123
+ * This controls which column is compared to the query vector supplied in
124
+ * the call to @see {@link Query#nearestTo}
125
+ *
126
+ * This parameter must be specified if the table has more than one column
127
+ * whose data type is a fixed-size-list of floats.
128
+ */
129
+ column(column: string): VectorQuery;
130
+ /**
131
+ * Set the distance metric to use
132
+ *
133
+ * When performing a vector search we try and find the "nearest" vectors according
134
+ * to some kind of distance metric. This parameter controls which distance metric to
135
+ * use. See @see {@link IvfPqOptions.distanceType} for more details on the different
136
+ * distance metrics available.
137
+ *
138
+ * Note: if there is a vector index then the distance type used MUST match the distance
139
+ * type used to train the vector index. If this is not done then the results will be
140
+ * invalid.
141
+ *
142
+ * By default "l2" is used.
143
+ */
144
+ distanceType(distanceType: string): VectorQuery;
145
+ /**
146
+ * A multiplier to control how many additional rows are taken during the refine step
147
+ *
148
+ * This argument is only used when the vector column has an IVF PQ index.
149
+ * If there is no index then this value is ignored.
150
+ *
151
+ * An IVF PQ index stores compressed (quantized) values. They query vector is compared
152
+ * against these values and, since they are compressed, the comparison is inaccurate.
153
+ *
154
+ * This parameter can be used to refine the results. It can both improve recall
155
+ * and correct the ordering of the nearest results.
156
+ *
157
+ * To refine results LanceDb will first perform an ANN search to find the nearest
158
+ * `limit` * `refine_factor` results. In other words, if `refine_factor` is 3 and
159
+ * `limit` is the default (10) then the first 30 results will be selected. LanceDb
160
+ * then fetches the full, uncompressed, values for these 30 results. The results are
161
+ * then reordered by the true distance and only the nearest 10 are kept.
162
+ *
163
+ * Note: there is a difference between calling this method with a value of 1 and never
164
+ * calling this method at all. Calling this method with any value will have an impact
165
+ * on your search latency. When you call this method with a `refine_factor` of 1 then
166
+ * LanceDb still needs to fetch the full, uncompressed, values so that it can potentially
167
+ * reorder the results.
168
+ *
169
+ * Note: if this method is NOT called then the distances returned in the _distance column
170
+ * will be approximate distances based on the comparison of the quantized query vector
171
+ * and the quantized result vectors. This can be considerably different than the true
172
+ * distance between the query vector and the actual uncompressed vector.
173
+ */
174
+ refineFactor(refineFactor: number): VectorQuery;
175
+ /**
176
+ * If this is called then filtering will happen after the vector search instead of
177
+ * before.
178
+ *
179
+ * By default filtering will be performed before the vector search. This is how
180
+ * filtering is typically understood to work. This prefilter step does add some
181
+ * additional latency. Creating a scalar index on the filter column(s) can
182
+ * often improve this latency. However, sometimes a filter is too complex or scalar
183
+ * indices cannot be applied to the column. In these cases postfiltering can be
184
+ * used instead of prefiltering to improve latency.
185
+ *
186
+ * Post filtering applies the filter to the results of the vector search. This means
187
+ * we only run the filter on a much smaller set of data. However, it can cause the
188
+ * query to return fewer than `limit` results (or even no results) if none of the nearest
189
+ * results match the filter.
190
+ *
191
+ * Post filtering happens during the "refine stage" (described in more detail in
192
+ * @see {@link VectorQuery#refineFactor}). This means that setting a higher refine
193
+ * factor can often help restore some of the results lost by post filtering.
194
+ */
195
+ postfilter(): VectorQuery;
196
+ /**
197
+ * If this is called then any vector index is skipped
198
+ *
199
+ * An exhaustive (flat) search will be performed. The query vector will
200
+ * be compared to every vector in the table. At high scales this can be
201
+ * expensive. However, this is often still useful. For example, skipping
202
+ * the vector index can give you ground truth results which you can use to
203
+ * calculate your recall to select an appropriate value for nprobes.
204
+ */
205
+ bypassVectorIndex(): VectorQuery;
206
+ }
207
+ /** A builder for LanceDB queries. */
208
+ export declare class Query extends QueryBase<NativeQuery, Query> {
209
+ constructor(tbl: NativeTable);
210
+ /**
211
+ * Find the nearest vectors to the given query vector.
212
+ *
213
+ * This converts the query from a plain query to a vector query.
214
+ *
215
+ * This method will attempt to convert the input to the query vector
216
+ * expected by the embedding model. If the input cannot be converted
217
+ * then an error will be thrown.
218
+ *
219
+ * By default, there is no embedding model, and the input should be
220
+ * an array-like object of numbers (something that can be used as input
221
+ * to Float32Array.from)
222
+ *
223
+ * If there is only one vector column (a column whose data type is a
224
+ * fixed size list of floats) then the column does not need to be specified.
225
+ * If there is more than one vector column you must use
226
+ * @see {@link VectorQuery#column} to specify which column you would like
227
+ * to compare with.
228
+ *
229
+ * If no index has been created on the vector column then a vector query
230
+ * will perform a distance comparison between the query vector and every
231
+ * vector in the database and then sort the results. This is sometimes
232
+ * called a "flat search"
233
+ *
234
+ * For small databases, with a few hundred thousand vectors or less, this can
235
+ * be reasonably fast. In larger databases you should create a vector index
236
+ * on the column. If there is a vector index then an "approximate" nearest
237
+ * neighbor search (frequently called an ANN search) will be performed. This
238
+ * search is much faster, but the results will be approximate.
239
+ *
240
+ * The query can be further parameterized using the returned builder. There
241
+ * are various ANN search parameters that will let you fine tune your recall
242
+ * accuracy vs search latency.
243
+ *
244
+ * Vector searches always have a `limit`. If `limit` has not been called then
245
+ * a default `limit` of 10 will be used. @see {@link Query#limit}
246
+ */
247
+ nearestTo(vector: unknown): VectorQuery;
248
+ }
@@ -0,0 +1,346 @@
1
+ "use strict";
2
+ // Copyright 2024 Lance Developers.
3
+ //
4
+ // Licensed under the Apache License, Version 2.0 (the "License");
5
+ // you may not use this file except in compliance with the License.
6
+ // You may obtain a copy of the License at
7
+ //
8
+ // http://www.apache.org/licenses/LICENSE-2.0
9
+ //
10
+ // Unless required by applicable law or agreed to in writing, software
11
+ // distributed under the License is distributed on an "AS IS" BASIS,
12
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ // See the License for the specific language governing permissions and
14
+ // limitations under the License.
15
+ Object.defineProperty(exports, "__esModule", { value: true });
16
+ exports.Query = exports.VectorQuery = exports.QueryBase = exports.RecordBatchIterator = void 0;
17
+ const apache_arrow_1 = require("apache-arrow");
18
+ class RecordBatchIterator {
19
+ promisedInner;
20
+ inner;
21
+ constructor(promise) {
22
+ // TODO: check promise reliably so we don't need to pass two arguments.
23
+ this.promisedInner = promise;
24
+ }
25
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
26
+ async next() {
27
+ if (this.inner === undefined) {
28
+ this.inner = await this.promisedInner;
29
+ }
30
+ if (this.inner === undefined) {
31
+ throw new Error("Invalid iterator state state");
32
+ }
33
+ const n = await this.inner.next();
34
+ if (n == null) {
35
+ return Promise.resolve({ done: true, value: null });
36
+ }
37
+ const tbl = (0, apache_arrow_1.tableFromIPC)(n);
38
+ if (tbl.batches.length != 1) {
39
+ throw new Error("Expected only one batch");
40
+ }
41
+ return Promise.resolve({ done: false, value: tbl.batches[0] });
42
+ }
43
+ }
44
+ exports.RecordBatchIterator = RecordBatchIterator;
45
+ /* eslint-enable */
46
+ /** Common methods supported by all query types */
47
+ class QueryBase {
48
+ inner;
49
+ constructor(inner) {
50
+ this.inner = inner;
51
+ }
52
+ /**
53
+ * A filter statement to be applied to this query.
54
+ *
55
+ * The filter should be supplied as an SQL query string. For example:
56
+ * @example
57
+ * x > 10
58
+ * y > 0 AND y < 100
59
+ * x > 5 OR y = 'test'
60
+ *
61
+ * Filtering performance can often be improved by creating a scalar index
62
+ * on the filter column(s).
63
+ */
64
+ where(predicate) {
65
+ this.inner.onlyIf(predicate);
66
+ return this;
67
+ }
68
+ /**
69
+ * Return only the specified columns.
70
+ *
71
+ * By default a query will return all columns from the table. However, this can have
72
+ * a very significant impact on latency. LanceDb stores data in a columnar fashion. This
73
+ * means we can finely tune our I/O to select exactly the columns we need.
74
+ *
75
+ * As a best practice you should always limit queries to the columns that you need. If you
76
+ * pass in an array of column names then only those columns will be returned.
77
+ *
78
+ * You can also use this method to create new "dynamic" columns based on your existing columns.
79
+ * For example, you may not care about "a" or "b" but instead simply want "a + b". This is often
80
+ * seen in the SELECT clause of an SQL query (e.g. `SELECT a+b FROM my_table`).
81
+ *
82
+ * To create dynamic columns you can pass in a Map<string, string>. A column will be returned
83
+ * for each entry in the map. The key provides the name of the column. The value is
84
+ * an SQL string used to specify how the column is calculated.
85
+ *
86
+ * For example, an SQL query might state `SELECT a + b AS combined, c`. The equivalent
87
+ * input to this method would be:
88
+ * @example
89
+ * new Map([["combined", "a + b"], ["c", "c"]])
90
+ *
91
+ * Columns will always be returned in the order given, even if that order is different than
92
+ * the order used when adding the data.
93
+ *
94
+ * Note that you can pass in a `Record<string, string>` (e.g. an object literal). This method
95
+ * uses `Object.entries` which should preserve the insertion order of the object. However,
96
+ * object insertion order is easy to get wrong and `Map` is more foolproof.
97
+ */
98
+ select(columns) {
99
+ let columnTuples;
100
+ if (Array.isArray(columns)) {
101
+ columnTuples = columns.map((c) => [c, c]);
102
+ }
103
+ else if (columns instanceof Map) {
104
+ columnTuples = Array.from(columns.entries());
105
+ }
106
+ else {
107
+ columnTuples = Object.entries(columns);
108
+ }
109
+ this.inner.select(columnTuples);
110
+ return this;
111
+ }
112
+ /**
113
+ * Set the maximum number of results to return.
114
+ *
115
+ * By default, a plain search has no limit. If this method is not
116
+ * called then every valid row from the table will be returned.
117
+ */
118
+ limit(limit) {
119
+ this.inner.limit(limit);
120
+ return this;
121
+ }
122
+ nativeExecute() {
123
+ return this.inner.execute();
124
+ }
125
+ /**
126
+ * Execute the query and return the results as an @see {@link AsyncIterator}
127
+ * of @see {@link RecordBatch}.
128
+ *
129
+ * By default, LanceDb will use many threads to calculate results and, when
130
+ * the result set is large, multiple batches will be processed at one time.
131
+ * This readahead is limited however and backpressure will be applied if this
132
+ * stream is consumed slowly (this constrains the maximum memory used by a
133
+ * single query)
134
+ *
135
+ */
136
+ execute() {
137
+ return new RecordBatchIterator(this.nativeExecute());
138
+ }
139
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
140
+ [Symbol.asyncIterator]() {
141
+ const promise = this.nativeExecute();
142
+ return new RecordBatchIterator(promise);
143
+ }
144
+ /** Collect the results as an Arrow @see {@link ArrowTable}. */
145
+ async toArrow() {
146
+ const batches = [];
147
+ for await (const batch of this) {
148
+ batches.push(batch);
149
+ }
150
+ return new apache_arrow_1.Table(batches);
151
+ }
152
+ /** Collect the results as an array of objects. */
153
+ async toArray() {
154
+ const tbl = await this.toArrow();
155
+ // eslint-disable-next-line @typescript-eslint/no-unsafe-return
156
+ return tbl.toArray();
157
+ }
158
+ }
159
+ exports.QueryBase = QueryBase;
160
+ /**
161
+ * A builder used to construct a vector search
162
+ *
163
+ * This builder can be reused to execute the query many times.
164
+ */
165
+ class VectorQuery extends QueryBase {
166
+ constructor(inner) {
167
+ super(inner);
168
+ }
169
+ /**
170
+ * Set the number of partitions to search (probe)
171
+ *
172
+ * This argument is only used when the vector column has an IVF PQ index.
173
+ * If there is no index then this value is ignored.
174
+ *
175
+ * The IVF stage of IVF PQ divides the input into partitions (clusters) of
176
+ * related values.
177
+ *
178
+ * The partition whose centroids are closest to the query vector will be
179
+ * exhaustively searched to find matches. This parameter controls how many
180
+ * partitions should be searched.
181
+ *
182
+ * Increasing this value will increase the recall of your query but will
183
+ * also increase the latency of your query. The default value is 20. This
184
+ * default is good for many cases but the best value to use will depend on
185
+ * your data and the recall that you need to achieve.
186
+ *
187
+ * For best results we recommend tuning this parameter with a benchmark against
188
+ * your actual data to find the smallest possible value that will still give
189
+ * you the desired recall.
190
+ */
191
+ nprobes(nprobes) {
192
+ this.inner.nprobes(nprobes);
193
+ return this;
194
+ }
195
+ /**
196
+ * Set the vector column to query
197
+ *
198
+ * This controls which column is compared to the query vector supplied in
199
+ * the call to @see {@link Query#nearestTo}
200
+ *
201
+ * This parameter must be specified if the table has more than one column
202
+ * whose data type is a fixed-size-list of floats.
203
+ */
204
+ column(column) {
205
+ this.inner.column(column);
206
+ return this;
207
+ }
208
+ /**
209
+ * Set the distance metric to use
210
+ *
211
+ * When performing a vector search we try and find the "nearest" vectors according
212
+ * to some kind of distance metric. This parameter controls which distance metric to
213
+ * use. See @see {@link IvfPqOptions.distanceType} for more details on the different
214
+ * distance metrics available.
215
+ *
216
+ * Note: if there is a vector index then the distance type used MUST match the distance
217
+ * type used to train the vector index. If this is not done then the results will be
218
+ * invalid.
219
+ *
220
+ * By default "l2" is used.
221
+ */
222
+ distanceType(distanceType) {
223
+ this.inner.distanceType(distanceType);
224
+ return this;
225
+ }
226
+ /**
227
+ * A multiplier to control how many additional rows are taken during the refine step
228
+ *
229
+ * This argument is only used when the vector column has an IVF PQ index.
230
+ * If there is no index then this value is ignored.
231
+ *
232
+ * An IVF PQ index stores compressed (quantized) values. The query vector is compared
233
+ * against these values and, since they are compressed, the comparison is inaccurate.
234
+ *
235
+ * This parameter can be used to refine the results. It can both improve recall
236
+ * and correct the ordering of the nearest results.
237
+ *
238
+ * To refine results LanceDb will first perform an ANN search to find the nearest
239
+ * `limit` * `refine_factor` results. In other words, if `refine_factor` is 3 and
240
+ * `limit` is the default (10) then the first 30 results will be selected. LanceDb
241
+ * then fetches the full, uncompressed, values for these 30 results. The results are
242
+ * then reordered by the true distance and only the nearest 10 are kept.
243
+ *
244
+ * Note: there is a difference between calling this method with a value of 1 and never
245
+ * calling this method at all. Calling this method with any value will have an impact
246
+ * on your search latency. When you call this method with a `refine_factor` of 1 then
247
+ * LanceDb still needs to fetch the full, uncompressed, values so that it can potentially
248
+ * reorder the results.
249
+ *
250
+ * Note: if this method is NOT called then the distances returned in the _distance column
251
+ * will be approximate distances based on the comparison of the quantized query vector
252
+ * and the quantized result vectors. This can be considerably different than the true
253
+ * distance between the query vector and the actual uncompressed vector.
254
+ */
255
+ refineFactor(refineFactor) {
256
+ this.inner.refineFactor(refineFactor);
257
+ return this;
258
+ }
259
+ /**
260
+ * If this is called then filtering will happen after the vector search instead of
261
+ * before.
262
+ *
263
+ * By default filtering will be performed before the vector search. This is how
264
+ * filtering is typically understood to work. This prefilter step does add some
265
+ * additional latency. Creating a scalar index on the filter column(s) can
266
+ * often improve this latency. However, sometimes a filter is too complex or scalar
267
+ * indices cannot be applied to the column. In these cases postfiltering can be
268
+ * used instead of prefiltering to improve latency.
269
+ *
270
+ * Post filtering applies the filter to the results of the vector search. This means
271
+ * we only run the filter on a much smaller set of data. However, it can cause the
272
+ * query to return fewer than `limit` results (or even no results) if none of the nearest
273
+ * results match the filter.
274
+ *
275
+ * Post filtering happens during the "refine stage" (described in more detail in
276
+ * @see {@link VectorQuery#refineFactor}). This means that setting a higher refine
277
+ * factor can often help restore some of the results lost by post filtering.
278
+ */
279
+ postfilter() {
280
+ this.inner.postfilter();
281
+ return this;
282
+ }
283
+ /**
284
+ * If this is called then any vector index is skipped
285
+ *
286
+ * An exhaustive (flat) search will be performed. The query vector will
287
+ * be compared to every vector in the table. At high scales this can be
288
+ * expensive. However, this is often still useful. For example, skipping
289
+ * the vector index can give you ground truth results which you can use to
290
+ * calculate your recall to select an appropriate value for nprobes.
291
+ */
292
+ bypassVectorIndex() {
293
+ this.inner.bypassVectorIndex();
294
+ return this;
295
+ }
296
+ }
297
+ exports.VectorQuery = VectorQuery;
298
+ /** A builder for LanceDB queries. */
299
+ class Query extends QueryBase {
300
+ constructor(tbl) {
301
+ super(tbl.query());
302
+ }
303
+ /**
304
+ * Find the nearest vectors to the given query vector.
305
+ *
306
+ * This converts the query from a plain query to a vector query.
307
+ *
308
+ * This method will attempt to convert the input to the query vector
309
+ * expected by the embedding model. If the input cannot be converted
310
+ * then an error will be thrown.
311
+ *
312
+ * By default, there is no embedding model, and the input should be
313
+ * an array-like object of numbers (something that can be used as input
314
+ * to Float32Array.from)
315
+ *
316
+ * If there is only one vector column (a column whose data type is a
317
+ * fixed size list of floats) then the column does not need to be specified.
318
+ * If there is more than one vector column you must use
319
+ * @see {@link VectorQuery#column} to specify which column you would like
320
+ * to compare with.
321
+ *
322
+ * If no index has been created on the vector column then a vector query
323
+ * will perform a distance comparison between the query vector and every
324
+ * vector in the database and then sort the results. This is sometimes
325
+ * called a "flat search"
326
+ *
327
+ * For small databases, with a few hundred thousand vectors or less, this can
328
+ * be reasonably fast. In larger databases you should create a vector index
329
+ * on the column. If there is a vector index then an "approximate" nearest
330
+ * neighbor search (frequently called an ANN search) will be performed. This
331
+ * search is much faster, but the results will be approximate.
332
+ *
333
+ * The query can be further parameterized using the returned builder. There
334
+ * are various ANN search parameters that will let you fine tune your recall
335
+ * accuracy vs search latency.
336
+ *
337
+ * Vector searches always have a `limit`. If `limit` has not been called then
338
+ * a default `limit` of 10 will be used. @see {@link Query#limit}
339
+ */
340
+ nearestTo(vector) {
341
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
342
+ const vectorQuery = this.inner.nearestTo(Float32Array.from(vector));
343
+ return new VectorQuery(vectorQuery);
344
+ }
345
+ }
346
+ exports.Query = Query;
@@ -0,0 +1,9 @@
1
+ import { Schema } from "apache-arrow";
2
+ /**
3
+ * Convert something schemaLike into a Schema instance
4
+ *
5
+ * This method is often needed even when the caller is using a Schema
6
+ * instance because they might be using a different instance of apache-arrow
7
+ * than lancedb is using.
8
+ */
9
+ export declare function sanitizeSchema(schemaLike: unknown): Schema;