@lancedb/lancedb 0.21.2-beta.0 → 0.21.2-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/arrow.d.ts CHANGED
@@ -36,6 +36,9 @@ export type TableLike = ArrowTable | {
36
36
  batches: RecordBatchLike[];
37
37
  };
38
38
  export type IntoVector = Float32Array | Float64Array | number[] | Promise<Float32Array | Float64Array | number[]>;
39
+ export type MultiVector = IntoVector[];
40
+ export declare function isMultiVector(value: unknown): value is MultiVector;
41
+ export declare function isIntoVector(value: unknown): value is IntoVector;
39
42
  export declare function isArrowTable(value: object): value is TableLike;
40
43
  export declare function isNull(value: unknown): value is Null;
41
44
  export declare function isInt(value: unknown): value is Int;
@@ -257,6 +260,11 @@ export declare function fromTableToStreamBuffer(table: ArrowTable, embeddings?:
257
260
  * Create an empty table with the given schema
258
261
  */
259
262
  export declare function createEmptyTable(schema: Schema): ArrowTable;
263
+ /**
264
+ * Ensures that all nested fields defined in the schema exist in the data,
265
+ * filling missing fields with null values.
266
+ */
267
+ export declare function ensureNestedFieldsExist(data: Array<Record<string, unknown>>, schema: Schema): Array<Record<string, unknown>>;
260
268
  interface JsonDataType {
261
269
  type: string;
262
270
  fields?: JsonField[];
package/dist/arrow.js CHANGED
@@ -17,6 +17,8 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
17
17
  };
18
18
  Object.defineProperty(exports, "__esModule", { value: true });
19
19
  exports.MakeArrowTableOptions = exports.VectorColumnOptions = void 0;
20
+ exports.isMultiVector = isMultiVector;
21
+ exports.isIntoVector = isIntoVector;
20
22
  exports.isArrowTable = isArrowTable;
21
23
  exports.isNull = isNull;
22
24
  exports.isInt = isInt;
@@ -49,11 +51,20 @@ exports.fromBufferToRecordBatch = fromBufferToRecordBatch;
49
51
  exports.fromRecordBatchToBuffer = fromRecordBatchToBuffer;
50
52
  exports.fromTableToStreamBuffer = fromTableToStreamBuffer;
51
53
  exports.createEmptyTable = createEmptyTable;
54
+ exports.ensureNestedFieldsExist = ensureNestedFieldsExist;
52
55
  exports.dataTypeToJson = dataTypeToJson;
53
56
  const apache_arrow_1 = require("apache-arrow");
54
57
  const registry_1 = require("./embedding/registry");
55
58
  const sanitize_1 = require("./sanitize");
56
59
  __exportStar(require("apache-arrow"), exports);
60
/**
 * Type guard for multivector queries (an array of vectors).
 *
 * Only the first element is inspected: the value is accepted when it is an
 * array whose element 0 satisfies `isIntoVector`. An empty array is rejected,
 * because `isIntoVector(undefined)` is false. Later elements are not checked.
 */
function isMultiVector(value) {
    if (!Array.isArray(value)) {
        return false;
    }
    const firstEntry = value[0];
    return isIntoVector(firstEntry);
}
63
/**
 * Type guard for a single, already-resolved query vector.
 *
 * Accepts a `Float32Array`, a `Float64Array`, or a plain array whose first
 * element is not itself an array (which includes the empty array).
 * Promises are NOT recognised by this check; it returns false for them.
 */
function isIntoVector(value) {
    if (value instanceof Float32Array || value instanceof Float64Array) {
        return true;
    }
    if (!Array.isArray(value)) {
        return false;
    }
    // A nested array in slot 0 marks an array of vectors, not a single vector.
    const startsWithArray = Array.isArray(value[0]);
    return !startsWithArray;
}
57
68
  function isArrowTable(value) {
58
69
  if (value instanceof apache_arrow_1.Table)
59
70
  return true;
@@ -711,6 +722,13 @@ async function applyEmbeddingsFromMetadata(table, schema) {
711
722
  const vector = makeVector(vectors, destType);
712
723
  columns[destColumn] = vector;
713
724
  }
725
+ // Add any missing columns from the schema as null vectors
726
+ for (const field of schema.fields) {
727
+ if (!(field.name in columns)) {
728
+ const nullValues = new Array(table.numRows).fill(null);
729
+ columns[field.name] = makeVector(nullValues, field.type);
730
+ }
731
+ }
714
732
  const newTable = new apache_arrow_1.Table(columns);
715
733
  return alignTable(newTable, schema);
716
734
  }
@@ -824,7 +842,14 @@ async function applyEmbeddings(table, embeddings, schema) {
824
842
  * be placed at the end of the table, after all of the input columns.
825
843
  */
826
844
  async function convertToTable(data, embeddings, makeTableOptions) {
827
- const table = makeArrowTable(data, makeTableOptions);
845
+ let processedData = data;
846
+ // If we have a schema with embedding metadata, we need to preprocess the data
847
+ // to ensure all nested fields are present
848
+ if (makeTableOptions?.schema &&
849
+ makeTableOptions.schema.metadata?.has("embedding_functions")) {
850
+ processedData = ensureNestedFieldsExist(data, makeTableOptions.schema);
851
+ }
852
+ const table = makeArrowTable(processedData, makeTableOptions);
828
853
  return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
829
854
  }
830
855
  /** Creates the Arrow Type for a Vector column with dimension `dim` */
@@ -893,7 +918,17 @@ async function fromDataToBuffer(data, embeddings, schema) {
893
918
  schema = (0, sanitize_1.sanitizeSchema)(schema);
894
919
  }
895
920
  if (isArrowTable(data)) {
896
- return fromTableToBuffer((0, sanitize_1.sanitizeTable)(data), embeddings, schema);
921
+ const table = (0, sanitize_1.sanitizeTable)(data);
922
+ // If we have a schema with embedding functions, we need to ensure all columns exist
923
+ // before applying embeddings, since applyEmbeddingsFromMetadata expects all columns
924
+ // to be present in the table
925
+ if (schema && schema.metadata?.has("embedding_functions")) {
926
+ const alignedTable = alignTableToSchema(table, schema);
927
+ return fromTableToBuffer(alignedTable, embeddings, schema);
928
+ }
929
+ else {
930
+ return fromTableToBuffer(table, embeddings, schema);
931
+ }
897
932
  }
898
933
  else {
899
934
  const table = await convertToTable(data, embeddings, { schema });
@@ -1004,6 +1039,61 @@ function validateSchemaEmbeddings(schema, data, embeddings) {
1004
1039
  }
1005
1040
  return new apache_arrow_1.Schema(fields, schema.metadata);
1006
1041
  }
1042
/**
 * Ensures that all nested fields defined in the schema exist in the data,
 * filling missing fields with null values.
 *
 * Every row is rebuilt as a fresh object containing exactly the schema's
 * top-level field names; the input rows are not mutated and keys that the
 * schema does not name are not copied. Struct-typed fields whose value is an
 * object are completed recursively.
 *
 * @param data   Array of row objects.
 * @param schema Object exposing `fields`, each with a `name` and a `type`.
 * @returns A new array with one completed row per input row.
 */
function ensureNestedFieldsExist(data, schema) {
    return data.map((row) => fillMissingFields(row, schema.fields));
}
/**
 * Recursively ensures that all fields in a struct type exist in the data,
 * filling missing fields with null values.
 *
 * @param data       The struct value (an object) taken from a row.
 * @param structType Struct type exposing its child fields as `children`.
 * @returns A new object containing every child field of `structType`.
 */
function ensureStructFieldsExist(data, structType) {
    return fillMissingFields(data, structType.children);
}
/**
 * Reports whether `record` actually carries a value for `name`.
 *
 * The plain `in` operator also matches members inherited from
 * `Object.prototype`, so a schema field called "toString" or "constructor"
 * would look present on every object and be filled with a built-in function.
 * Own properties always count; inherited ones count only when they do not
 * shadow an `Object.prototype` member.
 */
function hasRecordField(record, name) {
    if (Object.prototype.hasOwnProperty.call(record, name)) {
        return true;
    }
    return name in record && !(name in Object.prototype);
}
/**
 * Shared worker for rows and struct values: copies each listed field from
 * `record`, recursing into struct values and writing null for absent fields.
 */
function fillMissingFields(record, fields) {
    const complete = {};
    for (const field of fields) {
        if (!hasRecordField(record, field.name)) {
            // Field is missing from the data - set to null
            complete[field.name] = null;
            continue;
        }
        const value = record[field.name];
        // Only object-like values can be inspected for child fields; null,
        // undefined and primitives are forwarded untouched so that the Arrow
        // conversion downstream decides how to treat them (previously a
        // primitive here made the `in` operator throw).
        const canRecurse = value !== null &&
            (typeof value === "object" || typeof value === "function");
        if (field.type.constructor.name === "Struct" && canRecurse) {
            complete[field.name] = ensureStructFieldsExist(value, field.type);
        }
        else {
            complete[field.name] = value;
        }
    }
    return complete;
}
1007
1097
  // Matches format of https://github.com/lancedb/lance/blob/main/rust/lance/src/arrow/json.rs
1008
1098
  function dataTypeToJson(dataType) {
1009
1099
  switch (dataType.typeId) {
@@ -1123,3 +1213,51 @@ function fieldToJson(field) {
1123
1213
  metadata: field.metadata,
1124
1214
  };
1125
1215
  }
1216
/**
 * Builds a new Arrow table whose columns are exactly the fields of
 * `targetSchema`.
 *
 * For each target field, the column of the same name is taken from `table`
 * when the table's schema has such a field; otherwise an all-null column of
 * `table.numRows` rows is produced via `createNullVector`. Columns of `table`
 * that the target schema does not name are left out.
 */
function alignTableToSchema(table, targetSchema) {
    // Names the source table already provides.
    const presentNames = new Set(table.schema.fields.map((sourceField) => sourceField.name));
    const columnEntries = targetSchema.fields.map((targetField) => {
        const column = presentNames.has(targetField.name)
            ? table.getChild(targetField.name)
            : createNullVector(targetField, table.numRows);
        return [targetField.name, column];
    });
    // Create new table with aligned schema and columns
    return new apache_arrow_1.Table(targetSchema, Object.fromEntries(columnEntries));
}
1237
/**
 * Creates a placeholder Arrow vector of `numRows` rows for `field`.
 *
 * Non-struct types get data whose validity bitmap is entirely zero and whose
 * `nullCount` equals `numRows`. Struct types are assembled from recursively
 * created child vectors; the struct data itself is built with `nullCount: 0`
 * and no bitmap of its own.
 */
function createNullVector(field, numRows) {
    const fieldType = field.type;
    if (fieldType.constructor.name !== "Struct") {
        // A zero-filled bitmap marks every slot as null.
        const validity = new Uint8Array(Math.ceil(numRows / 8));
        const nullData = (0, apache_arrow_1.makeData)({
            type: fieldType,
            length: numRows,
            nullCount: numRows,
            nullBitmap: validity,
        });
        return (0, apache_arrow_1.makeVector)(nullData);
    }
    // Struct: build one null column per child field and use the first data
    // chunk of each child vector as the struct's children.
    const childData = [];
    for (const childField of fieldType.children) {
        const childVector = createNullVector(childField, numRows);
        childData.push(childVector.data[0]);
    }
    const structData = (0, apache_arrow_1.makeData)({
        type: fieldType,
        length: numRows,
        nullCount: 0,
        children: childData,
    });
    return (0, apache_arrow_1.makeVector)(structData);
}
package/dist/index.d.ts CHANGED
@@ -9,7 +9,7 @@ export { Table, AddDataOptions, UpdateOptions, OptimizeOptions, Version, ColumnA
9
9
  export { MergeInsertBuilder, WriteExecutionOptions } from "./merge";
10
10
  export * as embedding from "./embedding";
11
11
  export * as rerankers from "./rerankers";
12
- export { SchemaLike, TableLike, FieldLike, RecordBatchLike, DataLike, IntoVector, } from "./arrow";
12
+ export { SchemaLike, TableLike, FieldLike, RecordBatchLike, DataLike, IntoVector, MultiVector, } from "./arrow";
13
13
  export { IntoSql, packBits } from "./util";
14
14
  /**
15
15
  * Connect to a LanceDB instance at the given URI.
package/dist/table.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { Table as ArrowTable, Data, DataType, IntoVector, Schema } from "./arrow";
1
+ import { Table as ArrowTable, Data, DataType, IntoVector, MultiVector, Schema } from "./arrow";
2
2
  import { IndexOptions } from "./indices";
3
3
  import { MergeInsertBuilder } from "./merge";
4
4
  import { AddColumnsResult, AddColumnsSql, AddResult, AlterColumnsResult, DeleteResult, DropColumnsResult, IndexConfig, IndexStatistics, OptimizeStats, TableStatistics, Tags, UpdateResult, Table as _NativeTable } from "./native";
@@ -280,7 +280,7 @@ export declare abstract class Table {
280
280
  * when "auto" is used, if the query is a string and an embedding function is defined, it will be treated as a vector query
281
281
  * if the query is a string and no embedding function is defined, it will be treated as a full text search query
282
282
  */
283
- abstract search(query: string | IntoVector | FullTextQuery, queryType?: string, ftsColumns?: string | string[]): VectorQuery | Query;
283
+ abstract search(query: string | IntoVector | MultiVector | FullTextQuery, queryType?: string, ftsColumns?: string | string[]): VectorQuery | Query;
284
284
  /**
285
285
  * Search the table with a given query vector.
286
286
  *
@@ -288,7 +288,7 @@ export declare abstract class Table {
288
288
  * is the same thing as calling `nearestTo` on the builder returned
289
289
  * by `query`. @see {@link Query#nearestTo} for more details.
290
290
  */
291
- abstract vectorSearch(vector: IntoVector): VectorQuery;
291
+ abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
292
292
  /**
293
293
  * Add new columns with defined values.
294
294
  * @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
@@ -463,8 +463,8 @@ export declare class LocalTable extends Table {
463
463
  prewarmIndex(name: string): Promise<void>;
464
464
  waitForIndex(indexNames: string[], timeoutSeconds: number): Promise<void>;
465
465
  query(): Query;
466
- search(query: string | IntoVector | FullTextQuery, queryType?: string, ftsColumns?: string | string[]): VectorQuery | Query;
467
- vectorSearch(vector: IntoVector): VectorQuery;
466
+ search(query: string | IntoVector | MultiVector | FullTextQuery, queryType?: string, ftsColumns?: string | string[]): VectorQuery | Query;
467
+ vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
468
468
  addColumns(newColumnTransforms: AddColumnsSql[]): Promise<AddColumnsResult>;
469
469
  alterColumns(columnAlterations: ColumnAlteration[]): Promise<AlterColumnsResult>;
470
470
  dropColumns(columnNames: string[]): Promise<DropColumnsResult>;
package/dist/table.js CHANGED
@@ -171,6 +171,13 @@ class LocalTable extends Table {
171
171
  return this.query().nearestTo(queryPromise);
172
172
  }
173
173
  vectorSearch(vector) {
174
+ if ((0, arrow_1.isMultiVector)(vector)) {
175
+ const query = this.query().nearestTo(vector[0]);
176
+ for (const v of vector.slice(1)) {
177
+ query.addQueryVector(v);
178
+ }
179
+ return query;
180
+ }
174
181
  return this.query().nearestTo(vector);
175
182
  }
176
183
  // TODO: Support BatchUDF
package/package.json CHANGED
@@ -11,7 +11,7 @@
11
11
  "ann"
12
12
  ],
13
13
  "private": false,
14
- "version": "0.21.2-beta.0",
14
+ "version": "0.21.2-beta.1",
15
15
  "main": "dist/index.js",
16
16
  "exports": {
17
17
  ".": "./dist/index.js",
@@ -100,14 +100,14 @@
100
100
  "reflect-metadata": "^0.2.2"
101
101
  },
102
102
  "optionalDependencies": {
103
- "@lancedb/lancedb-darwin-x64": "0.21.2-beta.0",
104
- "@lancedb/lancedb-darwin-arm64": "0.21.2-beta.0",
105
- "@lancedb/lancedb-linux-x64-gnu": "0.21.2-beta.0",
106
- "@lancedb/lancedb-linux-arm64-gnu": "0.21.2-beta.0",
107
- "@lancedb/lancedb-linux-x64-musl": "0.21.2-beta.0",
108
- "@lancedb/lancedb-linux-arm64-musl": "0.21.2-beta.0",
109
- "@lancedb/lancedb-win32-x64-msvc": "0.21.2-beta.0",
110
- "@lancedb/lancedb-win32-arm64-msvc": "0.21.2-beta.0"
103
+ "@lancedb/lancedb-darwin-x64": "0.21.2-beta.1",
104
+ "@lancedb/lancedb-darwin-arm64": "0.21.2-beta.1",
105
+ "@lancedb/lancedb-linux-x64-gnu": "0.21.2-beta.1",
106
+ "@lancedb/lancedb-linux-arm64-gnu": "0.21.2-beta.1",
107
+ "@lancedb/lancedb-linux-x64-musl": "0.21.2-beta.1",
108
+ "@lancedb/lancedb-linux-arm64-musl": "0.21.2-beta.1",
109
+ "@lancedb/lancedb-win32-x64-msvc": "0.21.2-beta.1",
110
+ "@lancedb/lancedb-win32-arm64-msvc": "0.21.2-beta.1"
111
111
  },
112
112
  "peerDependencies": {
113
113
  "apache-arrow": ">=15.0.0 <=18.1.0"