@lancedb/lancedb 0.4.19 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,29 +43,20 @@ npm run test
43
43
 
44
44
  ### Running lint / format
45
45
 
46
- LanceDb uses eslint for linting. VSCode does not need any plugins to use eslint. However, it
47
- may need some additional configuration. Make sure that eslint.experimental.useFlatConfig is
48
- set to true. Also, if your vscode root folder is the repo root then you will need to set
49
- the eslint.workingDirectories to ["nodejs"]. To manually lint your code you can run:
46
+ LanceDb uses [biome](https://biomejs.dev/) for linting and formatting. If you are using VSCode you will need to install the official [Biome](https://marketplace.visualstudio.com/items?itemName=biomejs.biome) extension.
47
+ To manually lint your code you can run:
50
48
 
51
49
  ```sh
52
50
  npm run lint
53
51
  ```
54
52
 
55
- LanceDb uses prettier for formatting. If you are using VSCode you will need to install the
56
- "Prettier - Code formatter" extension. You should then configure it to be the default formatter
57
- for typescript and you should enable format on save. To manually check your code's format you
58
- can run:
53
+ To automatically fix all fixable issues:
59
54
 
60
55
  ```sh
61
- npm run chkformat
56
+ npm run lint-fix
62
57
  ```
63
58
 
64
- If you need to manually format your code you can run:
65
-
66
- ```sh
67
- npx prettier --write .
68
- ```
59
+ If you do not have your workspace root set to the `nodejs` directory, unfortunately the extension will not work. You can still run the linting and formatting commands manually.
69
60
 
70
61
  ### Generating docs
71
62
 
package/biome.json ADDED
@@ -0,0 +1,136 @@
1
+ {
2
+ "$schema": "https://biomejs.dev/schemas/1.7.3/schema.json",
3
+ "organizeImports": {
4
+ "enabled": true
5
+ },
6
+ "files": {
7
+ "ignore": [
8
+ "**/dist/**/*",
9
+ "**/native.js",
10
+ "**/native.d.ts",
11
+ "**/npm/**/*",
12
+ "**/.vscode/**"
13
+ ]
14
+ },
15
+ "formatter": {
16
+ "indentStyle": "space"
17
+ },
18
+ "linter": {
19
+ "enabled": true,
20
+ "rules": {
21
+ "recommended": false,
22
+ "complexity": {
23
+ "noBannedTypes": "error",
24
+ "noExtraBooleanCast": "error",
25
+ "noMultipleSpacesInRegularExpressionLiterals": "error",
26
+ "noUselessCatch": "error",
27
+ "noUselessThisAlias": "error",
28
+ "noUselessTypeConstraint": "error",
29
+ "noWith": "error"
30
+ },
31
+ "correctness": {
32
+ "noConstAssign": "error",
33
+ "noConstantCondition": "error",
34
+ "noEmptyCharacterClassInRegex": "error",
35
+ "noEmptyPattern": "error",
36
+ "noGlobalObjectCalls": "error",
37
+ "noInnerDeclarations": "error",
38
+ "noInvalidConstructorSuper": "error",
39
+ "noNewSymbol": "error",
40
+ "noNonoctalDecimalEscape": "error",
41
+ "noPrecisionLoss": "error",
42
+ "noSelfAssign": "error",
43
+ "noSetterReturn": "error",
44
+ "noSwitchDeclarations": "error",
45
+ "noUndeclaredVariables": "error",
46
+ "noUnreachable": "error",
47
+ "noUnreachableSuper": "error",
48
+ "noUnsafeFinally": "error",
49
+ "noUnsafeOptionalChaining": "error",
50
+ "noUnusedLabels": "error",
51
+ "noUnusedVariables": "error",
52
+ "useIsNan": "error",
53
+ "useValidForDirection": "error",
54
+ "useYield": "error"
55
+ },
56
+ "style": {
57
+ "noNamespace": "error",
58
+ "useAsConstAssertion": "error",
59
+ "useBlockStatements": "off",
60
+ "useNamingConvention": {
61
+ "level": "error",
62
+ "options": {
63
+ "strictCase": false
64
+ }
65
+ }
66
+ },
67
+ "suspicious": {
68
+ "noAssignInExpressions": "error",
69
+ "noAsyncPromiseExecutor": "error",
70
+ "noCatchAssign": "error",
71
+ "noClassAssign": "error",
72
+ "noCompareNegZero": "error",
73
+ "noControlCharactersInRegex": "error",
74
+ "noDebugger": "error",
75
+ "noDuplicateCase": "error",
76
+ "noDuplicateClassMembers": "error",
77
+ "noDuplicateObjectKeys": "error",
78
+ "noDuplicateParameters": "error",
79
+ "noEmptyBlockStatements": "error",
80
+ "noExplicitAny": "error",
81
+ "noExtraNonNullAssertion": "error",
82
+ "noFallthroughSwitchClause": "error",
83
+ "noFunctionAssign": "error",
84
+ "noGlobalAssign": "error",
85
+ "noImportAssign": "error",
86
+ "noMisleadingCharacterClass": "error",
87
+ "noMisleadingInstantiator": "error",
88
+ "noPrototypeBuiltins": "error",
89
+ "noRedeclare": "error",
90
+ "noShadowRestrictedNames": "error",
91
+ "noUnsafeDeclarationMerging": "error",
92
+ "noUnsafeNegation": "error",
93
+ "useGetterReturn": "error",
94
+ "useValidTypeof": "error"
95
+ }
96
+ },
97
+ "ignore": ["**/dist/**/*", "**/native.js", "**/native.d.ts"]
98
+ },
99
+ "javascript": {
100
+ "globals": []
101
+ },
102
+ "overrides": [
103
+ {
104
+ "include": ["**/*.ts", "**/*.tsx", "**/*.mts", "**/*.cts"],
105
+ "linter": {
106
+ "rules": {
107
+ "correctness": {
108
+ "noConstAssign": "off",
109
+ "noGlobalObjectCalls": "off",
110
+ "noInvalidConstructorSuper": "off",
111
+ "noNewSymbol": "off",
112
+ "noSetterReturn": "off",
113
+ "noUndeclaredVariables": "off",
114
+ "noUnreachable": "off",
115
+ "noUnreachableSuper": "off"
116
+ },
117
+ "style": {
118
+ "noArguments": "error",
119
+ "noVar": "error",
120
+ "useConst": "error"
121
+ },
122
+ "suspicious": {
123
+ "noDuplicateClassMembers": "off",
124
+ "noDuplicateObjectKeys": "off",
125
+ "noDuplicateParameters": "off",
126
+ "noFunctionAssign": "off",
127
+ "noImportAssign": "off",
128
+ "noRedeclare": "off",
129
+ "noUnsafeNegation": "off",
130
+ "useGetterReturn": "off"
131
+ }
132
+ }
133
+ }
134
+ }
135
+ ]
136
+ }
package/dist/arrow.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  /// <reference types="node" />
2
- import { type Schema, Table as ArrowTable, type Float } from "apache-arrow";
2
+ import { Table as ArrowTable, type Float, Schema } from "apache-arrow";
3
3
  import { type EmbeddingFunction } from "./embedding/embedding_function";
4
4
  /** Data type accepted by NodeJS SDK */
5
5
  export type Data = Record<string, unknown>[] | ArrowTable;
@@ -12,6 +12,7 @@ export declare class VectorColumnOptions {
12
12
  export declare class MakeArrowTableOptions {
13
13
  schema?: Schema;
14
14
  vectorColumns: Record<string, VectorColumnOptions>;
15
+ embeddings?: EmbeddingFunction<unknown>;
15
16
  /**
16
17
  * If true then string columns will be encoded with dictionary encoding
17
18
  *
package/dist/arrow.js CHANGED
@@ -60,6 +60,7 @@ class MakeArrowTableOptions {
60
60
  vectorColumns = {
61
61
  vector: new VectorColumnOptions(),
62
62
  };
63
+ embeddings;
63
64
  /**
64
65
  * If true then string columns will be encoded with dictionary encoding
65
66
  *
@@ -175,6 +176,7 @@ function makeArrowTable(data, options) {
175
176
  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
176
177
  if (opt.schema !== undefined && opt.schema !== null) {
177
178
  opt.schema = (0, sanitize_1.sanitizeSchema)(opt.schema);
179
+ opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
178
180
  }
179
181
  const columns = {};
180
182
  // TODO: sample dataset to find missing columns
@@ -244,8 +246,9 @@ function makeArrowTable(data, options) {
244
246
  // then patch the schema of the batches so we can use
245
247
  // `new ArrowTable(schema, batches)` which does not do any schema inference
246
248
  const firstTable = new apache_arrow_1.Table(columns);
249
+ const batchesFixed = firstTable.batches.map(
247
250
  // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
248
- const batchesFixed = firstTable.batches.map((batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
251
+ (batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
249
252
  return new apache_arrow_1.Table(opt.schema, batchesFixed);
250
253
  }
251
254
  else {
@@ -269,7 +272,7 @@ function makeListVector(lists) {
269
272
  throw Error("Cannot infer list vector from empty array or empty list");
270
273
  }
271
274
  const sampleList = lists[0];
272
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
275
+ // biome-ignore lint/suspicious/noExplicitAny: skip
273
276
  let inferredType;
274
277
  try {
275
278
  const sampleVector = makeVector(sampleList);
@@ -537,3 +540,31 @@ function createEmptyTable(schema) {
537
540
  return new apache_arrow_1.Table((0, sanitize_1.sanitizeSchema)(schema));
538
541
  }
539
542
  exports.createEmptyTable = createEmptyTable;
543
+ function validateSchemaEmbeddings(schema, data, embeddings) {
544
+ const fields = [];
545
+ const missingEmbeddingFields = [];
546
+ // First we check if the field is a `FixedSizeList`
547
+ // Then we check if the data contains the field
548
+ // if it does not, we add it to the list of missing embedding fields
549
+ // Finally, we check if those missing embedding fields are `this._embeddings`
550
+ // if they are not, we throw an error
551
+ for (const field of schema.fields) {
552
+ if (field.type instanceof apache_arrow_1.FixedSizeList) {
553
+ if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
554
+ missingEmbeddingFields.push(field);
555
+ }
556
+ else {
557
+ fields.push(field);
558
+ }
559
+ }
560
+ else {
561
+ fields.push(field);
562
+ }
563
+ }
564
+ if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
565
+ throw new Error(`Table has embeddings: "${missingEmbeddingFields
566
+ .map((f) => f.name)
567
+ .join(",")}", but no embedding function was provided`);
568
+ }
569
+ return new apache_arrow_1.Schema(fields, schema.metadata);
570
+ }
@@ -1,6 +1,6 @@
1
+ import { Table as ArrowTable, Schema } from "apache-arrow";
1
2
  import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
2
3
  import { Table } from "./table";
3
- import { Table as ArrowTable, Schema } from "apache-arrow";
4
4
  /**
5
5
  * Connect to a LanceDB instance at the given URI.
6
6
  *
@@ -14,10 +14,10 @@
14
14
  // limitations under the License.
15
15
  Object.defineProperty(exports, "__esModule", { value: true });
16
16
  exports.Connection = exports.connect = void 0;
17
+ const apache_arrow_1 = require("apache-arrow");
17
18
  const arrow_1 = require("./arrow");
18
19
  const native_1 = require("./native");
19
20
  const table_1 = require("./table");
20
- const apache_arrow_1 = require("apache-arrow");
21
21
  /**
22
22
  * Connect to a LanceDB instance at the given URI.
23
23
  *
package/dist/native.d.ts CHANGED
@@ -15,6 +15,31 @@ export interface IndexConfig {
15
15
  */
16
16
  columns: Array<string>
17
17
  }
18
+ /** Statistics about a compaction operation. */
19
+ export interface CompactionStats {
20
+ /** The number of fragments removed */
21
+ fragmentsRemoved: number
22
+ /** The number of new, compacted fragments added */
23
+ fragmentsAdded: number
24
+ /** The number of data files removed */
25
+ filesRemoved: number
26
+ /** The number of new, compacted data files added */
27
+ filesAdded: number
28
+ }
29
+ /** Statistics about a cleanup operation */
30
+ export interface RemovalStats {
31
+ /** The number of bytes removed */
32
+ bytesRemoved: number
33
+ /** The number of old versions removed */
34
+ oldVersionsRemoved: number
35
+ }
36
+ /** Statistics about an optimize operation */
37
+ export interface OptimizeStats {
38
+ /** Statistics about the compaction operation */
39
+ compaction: CompactionStats
40
+ /** Statistics about the removal operation */
41
+ prune: RemovalStats
42
+ }
18
43
  /**
19
44
  * A definition of a column alteration. The alteration changes the column at
20
45
  * `path` to have the new name `name`, to be nullable if `nullable` is true,
@@ -151,5 +176,6 @@ export class Table {
151
176
  checkout(version: number): Promise<void>
152
177
  checkoutLatest(): Promise<void>
153
178
  restore(): Promise<void>
179
+ optimize(olderThanMs?: number | undefined | null): Promise<OptimizeStats>
154
180
  listIndices(): Promise<Array<IndexConfig>>
155
181
  }
package/dist/query.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { RecordBatch, Table as ArrowTable } from "apache-arrow";
1
+ import { Table as ArrowTable, RecordBatch } from "apache-arrow";
2
2
  import { RecordBatchIterator as NativeBatchIterator, Query as NativeQuery, Table as NativeTable, VectorQuery as NativeVectorQuery } from "./native";
3
3
  export declare class RecordBatchIterator implements AsyncIterator<RecordBatch> {
4
4
  private promisedInner?;
package/dist/query.js CHANGED
@@ -22,7 +22,7 @@ class RecordBatchIterator {
22
22
  // TODO: check promise reliably so we dont need to pass two arguments.
23
23
  this.promisedInner = promise;
24
24
  }
25
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
25
+ // biome-ignore lint/suspicious/noExplicitAny: skip
26
26
  async next() {
27
27
  if (this.inner === undefined) {
28
28
  this.inner = await this.promisedInner;
@@ -48,6 +48,7 @@ class QueryBase {
48
48
  inner;
49
49
  constructor(inner) {
50
50
  this.inner = inner;
51
+ // intentionally empty
51
52
  }
52
53
  /**
53
54
  * A filter statement to be applied to this query.
@@ -136,7 +137,7 @@ class QueryBase {
136
137
  execute() {
137
138
  return new RecordBatchIterator(this.nativeExecute());
138
139
  }
139
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
140
+ // biome-ignore lint/suspicious/noExplicitAny: skip
140
141
  [Symbol.asyncIterator]() {
141
142
  const promise = this.nativeExecute();
142
143
  return new RecordBatchIterator(promise);
@@ -338,7 +339,7 @@ class Query extends QueryBase {
338
339
  * a default `limit` of 10 will be used. @see {@link Query#limit}
339
340
  */
340
341
  nearestTo(vector) {
341
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
342
+ // biome-ignore lint/suspicious/noExplicitAny: skip
342
343
  const vectorQuery = this.inner.nearestTo(Float32Array.from(vector));
343
344
  return new VectorQuery(vectorQuery);
344
345
  }
package/dist/sanitize.js CHANGED
@@ -127,7 +127,7 @@ function sanitizeUnion(typeLike) {
127
127
  throw Error("Expected a Union type to have an array-like `children` property");
128
128
  }
129
129
  return new apache_arrow_1.Union(typeLike.mode,
130
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
130
+ // biome-ignore lint/suspicious/noExplicitAny: skip
131
131
  typeLike.typeIds, typeLike.children.map((child) => sanitizeField(child)));
132
132
  }
133
133
  function sanitizeTypedUnion(typeLike,
@@ -167,7 +167,7 @@ function sanitizeMap(typeLike) {
167
167
  throw Error("Expected a Map type to have a `keysSorted` property");
168
168
  }
169
169
  return new apache_arrow_1.Map_(
170
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
170
+ // biome-ignore lint/suspicious/noExplicitAny: skip
171
171
  typeLike.children.map((field) => sanitizeField(field)), typeLike.keysSorted);
172
172
  }
173
173
  function sanitizeDuration(typeLike) {
@@ -191,7 +191,7 @@ function sanitizeDictionary(typeLike) {
191
191
  }
192
192
  return new apache_arrow_1.Dictionary(sanitizeType(typeLike.dictionary), sanitizeType(typeLike.indices), typeLike.id, typeLike.isOrdered);
193
193
  }
194
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
194
+ // biome-ignore lint/suspicious/noExplicitAny: skip
195
195
  function sanitizeType(typeLike) {
196
196
  if (typeof typeLike !== "object" || typeLike === null) {
197
197
  throw Error("Expected a Type but object was null/undefined");
package/dist/table.d.ts CHANGED
@@ -1,8 +1,8 @@
1
1
  import { Schema } from "apache-arrow";
2
- import { AddColumnsSql, ColumnAlteration, IndexConfig, Table as _NativeTable } from "./native";
3
- import { Query, VectorQuery } from "./query";
4
- import { IndexOptions } from "./indices";
5
2
  import { Data } from "./arrow";
3
+ import { IndexOptions } from "./indices";
4
+ import { AddColumnsSql, ColumnAlteration, IndexConfig, OptimizeStats, Table as _NativeTable } from "./native";
5
+ import { Query, VectorQuery } from "./query";
6
6
  export { IndexConfig } from "./native";
7
7
  /**
8
8
  * Options for adding data to a table.
@@ -28,6 +28,22 @@ export interface UpdateOptions {
28
28
  */
29
29
  where: string;
30
30
  }
31
+ export interface OptimizeOptions {
32
+ /**
33
+ * If set then all versions older than the given date
34
+ * will be removed. The current version will never be removed.
35
+ * The default is 7 days
36
+ * @example
37
+ * // Delete all versions older than 1 day
38
+ * const olderThan = new Date();
39
+ * olderThan.setDate(olderThan.getDate() - 1);
40
+ * tbl.optimize({ cleanupOlderThan: olderThan });
41
+ *
42
+ * // Delete all versions except the current version
43
+ * tbl.optimize({ cleanupOlderThan: new Date() });
44
+ */
45
+ cleanupOlderThan: Date;
46
+ }
31
47
  /**
32
48
  * A Table is a collection of Records in a LanceDB Database.
33
49
  *
@@ -253,6 +269,37 @@ export declare class Table {
253
269
  * out state and the read_consistency_interval, if any, will apply.
254
270
  */
255
271
  restore(): Promise<void>;
272
+ /**
273
+ * Optimize the on-disk data and indices for better performance.
274
+ *
275
+ * Modeled after ``VACUUM`` in PostgreSQL.
276
+ *
277
+ * Optimization covers three operations:
278
+ *
279
+ * - Compaction: Merges small files into larger ones
280
+ * - Prune: Removes old versions of the dataset
281
+ * - Index: Optimizes the indices, adding new data to existing indices
282
+ *
283
+ *
284
+ * Experimental API
285
+ * ----------------
286
+ *
287
+ * The optimization process is undergoing active development and may change.
288
+ * Our goal with these changes is to improve the performance of optimization and
289
+ * reduce the complexity.
290
+ *
291
+ * That being said, it is essential today to run optimize if you want the best
292
+ * performance. It should be stable and safe to use in production, but it our
293
+ * hope that the API may be simplified (or not even need to be called) in the
294
+ * future.
295
+ *
296
+ * The frequency an application should call optimize is based on the frequency of
297
+ * data modifications. If data is frequently added, deleted, or updated then
298
+ * optimize should be run frequently. A good rule of thumb is to run optimize if
299
+ * you have added or modified 100,000 or more records or run more than 20 data
300
+ * modification operations.
301
+ */
302
+ optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats>;
256
303
  /** List all indices that have been created with {@link Table.createIndex} */
257
304
  listIndices(): Promise<IndexConfig[]>;
258
305
  }
package/dist/table.js CHANGED
@@ -15,8 +15,8 @@
15
15
  Object.defineProperty(exports, "__esModule", { value: true });
16
16
  exports.Table = void 0;
17
17
  const apache_arrow_1 = require("apache-arrow");
18
- const query_1 = require("./query");
19
18
  const arrow_1 = require("./arrow");
19
+ const query_1 = require("./query");
20
20
  /**
21
21
  * A Table is a collection of Records in a LanceDB Database.
22
22
  *
@@ -140,7 +140,7 @@ class Table {
140
140
  */
141
141
  async createIndex(column, options) {
142
142
  // Bit of a hack to get around the fact that TS has no package-scope.
143
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
143
+ // biome-ignore lint/suspicious/noExplicitAny: skip
144
144
  const nativeIndex = options?.config?.inner;
145
145
  await this.inner.createIndex(nativeIndex, column, options?.replace);
146
146
  }
@@ -296,6 +296,45 @@ class Table {
296
296
  async restore() {
297
297
  await this.inner.restore();
298
298
  }
299
+ /**
300
+ * Optimize the on-disk data and indices for better performance.
301
+ *
302
+ * Modeled after ``VACUUM`` in PostgreSQL.
303
+ *
304
+ * Optimization covers three operations:
305
+ *
306
+ * - Compaction: Merges small files into larger ones
307
+ * - Prune: Removes old versions of the dataset
308
+ * - Index: Optimizes the indices, adding new data to existing indices
309
+ *
310
+ *
311
+ * Experimental API
312
+ * ----------------
313
+ *
314
+ * The optimization process is undergoing active development and may change.
315
+ * Our goal with these changes is to improve the performance of optimization and
316
+ * reduce the complexity.
317
+ *
318
+ * That being said, it is essential today to run optimize if you want the best
319
+ * performance. It should be stable and safe to use in production, but it our
320
+ * hope that the API may be simplified (or not even need to be called) in the
321
+ * future.
322
+ *
323
+ * The frequency an application shoudl call optimize is based on the frequency of
324
+ * data modifications. If data is frequently added, deleted, or updated then
325
+ * optimize should be run frequently. A good rule of thumb is to run optimize if
326
+ * you have added or modified 100,000 or more records or run more than 20 data
327
+ * modification operations.
328
+ */
329
+ async optimize(options) {
330
+ let cleanupOlderThanMs;
331
+ if (options?.cleanupOlderThan !== undefined &&
332
+ options?.cleanupOlderThan !== null) {
333
+ cleanupOlderThanMs =
334
+ new Date().getTime() - options.cleanupOlderThan.getTime();
335
+ }
336
+ return await this.inner.optimize(cleanupOlderThanMs);
337
+ }
299
338
  /** List all indices that have been created with {@link Table.createIndex} */
300
339
  async listIndices() {
301
340
  return await this.inner.listIndices();
package/lancedb/arrow.ts CHANGED
@@ -13,25 +13,25 @@
13
13
  // limitations under the License.
14
14
 
15
15
  import {
16
+ Table as ArrowTable,
17
+ Binary,
18
+ DataType,
16
19
  Field,
17
- makeBuilder,
18
- RecordBatchFileWriter,
19
- Utf8,
20
- type Vector,
21
20
  FixedSizeList,
22
- vectorFromArray,
23
- type Schema,
24
- Table as ArrowTable,
25
- RecordBatchStreamWriter,
21
+ type Float,
22
+ Float32,
26
23
  List,
27
24
  RecordBatch,
28
- makeData,
25
+ RecordBatchFileWriter,
26
+ RecordBatchStreamWriter,
27
+ Schema,
29
28
  Struct,
30
- type Float,
31
- DataType,
32
- Binary,
33
- Float32,
29
+ Utf8,
30
+ type Vector,
31
+ makeBuilder,
32
+ makeData,
34
33
  type makeTable,
34
+ vectorFromArray,
35
35
  } from "apache-arrow";
36
36
  import { type EmbeddingFunction } from "./embedding/embedding_function";
37
37
  import { sanitizeSchema } from "./sanitize";
@@ -85,6 +85,7 @@ export class MakeArrowTableOptions {
85
85
  vectorColumns: Record<string, VectorColumnOptions> = {
86
86
  vector: new VectorColumnOptions(),
87
87
  };
88
+ embeddings?: EmbeddingFunction<unknown>;
88
89
 
89
90
  /**
90
91
  * If true then string columns will be encoded with dictionary encoding
@@ -208,6 +209,7 @@ export function makeArrowTable(
208
209
  const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
209
210
  if (opt.schema !== undefined && opt.schema !== null) {
210
211
  opt.schema = sanitizeSchema(opt.schema);
212
+ opt.schema = validateSchemaEmbeddings(opt.schema, data, opt.embeddings);
211
213
  }
212
214
  const columns: Record<string, Vector> = {};
213
215
  // TODO: sample dataset to find missing columns
@@ -287,8 +289,8 @@ export function makeArrowTable(
287
289
  // then patch the schema of the batches so we can use
288
290
  // `new ArrowTable(schema, batches)` which does not do any schema inference
289
291
  const firstTable = new ArrowTable(columns);
290
- // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
291
292
  const batchesFixed = firstTable.batches.map(
293
+ // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
292
294
  (batch) => new RecordBatch(opt.schema!, batch.data),
293
295
  );
294
296
  return new ArrowTable(opt.schema, batchesFixed);
@@ -313,7 +315,7 @@ function makeListVector(lists: unknown[][]): Vector<unknown> {
313
315
  throw Error("Cannot infer list vector from empty array or empty list");
314
316
  }
315
317
  const sampleList = lists[0];
316
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
318
+ // biome-ignore lint/suspicious/noExplicitAny: skip
317
319
  let inferredType: any;
318
320
  try {
319
321
  const sampleVector = makeVector(sampleList);
@@ -337,7 +339,7 @@ function makeVector(
337
339
  values: unknown[],
338
340
  type?: DataType,
339
341
  stringAsDictionary?: boolean,
340
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
342
+ // biome-ignore lint/suspicious/noExplicitAny: skip
341
343
  ): Vector<any> {
342
344
  if (type !== undefined) {
343
345
  // No need for inference, let Arrow create it
@@ -648,3 +650,39 @@ function alignTable(table: ArrowTable, schema: Schema): ArrowTable {
648
650
  export function createEmptyTable(schema: Schema): ArrowTable {
649
651
  return new ArrowTable(sanitizeSchema(schema));
650
652
  }
653
+
654
+ function validateSchemaEmbeddings(
655
+ schema: Schema,
656
+ data: Array<Record<string, unknown>>,
657
+ embeddings: EmbeddingFunction<unknown> | undefined,
658
+ ) {
659
+ const fields = [];
660
+ const missingEmbeddingFields = [];
661
+
662
+ // First we check if the field is a `FixedSizeList`
663
+ // Then we check if the data contains the field
664
+ // if it does not, we add it to the list of missing embedding fields
665
+ // Finally, we check if those missing embedding fields are `this._embeddings`
666
+ // if they are not, we throw an error
667
+ for (const field of schema.fields) {
668
+ if (field.type instanceof FixedSizeList) {
669
+ if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
670
+ missingEmbeddingFields.push(field);
671
+ } else {
672
+ fields.push(field);
673
+ }
674
+ } else {
675
+ fields.push(field);
676
+ }
677
+ }
678
+
679
+ if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
680
+ throw new Error(
681
+ `Table has embeddings: "${missingEmbeddingFields
682
+ .map((f) => f.name)
683
+ .join(",")}", but no embedding function was provided`,
684
+ );
685
+ }
686
+
687
+ return new Schema(fields, schema.metadata);
688
+ }
@@ -12,10 +12,10 @@
12
12
  // See the License for the specific language governing permissions and
13
13
  // limitations under the License.
14
14
 
15
+ import { Table as ArrowTable, Schema } from "apache-arrow";
15
16
  import { fromTableToBuffer, makeArrowTable, makeEmptyTable } from "./arrow";
16
17
  import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
17
18
  import { Table } from "./table";
18
- import { Table as ArrowTable, Schema } from "apache-arrow";
19
19
 
20
20
  /**
21
21
  * Connect to a LanceDB instance at the given URI.
@@ -12,8 +12,8 @@
12
12
  // See the License for the specific language governing permissions and
13
13
  // limitations under the License.
14
14
 
15
- import { type EmbeddingFunction } from "./embedding_function";
16
15
  import type OpenAI from "openai";
16
+ import { type EmbeddingFunction } from "./embedding_function";
17
17
 
18
18
  export class OpenAIEmbeddingFunction implements EmbeddingFunction<string> {
19
19
  private readonly _openai: OpenAI;