@lancedb/lancedb 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/dist/arrow.d.ts +5 -3
  2. package/dist/arrow.js +1 -1
  3. package/dist/embedding/embedding_function.d.ts +4 -3
  4. package/dist/embedding/index.d.ts +1 -0
  5. package/dist/embedding/index.js +1 -0
  6. package/dist/embedding/registry.d.ts +9 -7
  7. package/dist/embedding/registry.js +24 -6
  8. package/dist/embedding/transformers.d.ts +37 -0
  9. package/dist/embedding/transformers.js +147 -0
  10. package/dist/query.js +15 -9
  11. package/dist/remote/client.d.ts +1 -1
  12. package/dist/remote/client.js +6 -8
  13. package/dist/remote/connection.d.ts +2 -3
  14. package/dist/remote/connection.js +2 -2
  15. package/dist/table.d.ts +3 -0
  16. package/dist/table.js +1 -1
  17. package/package.json +17 -14
  18. package/Cargo.toml +0 -28
  19. package/biome.json +0 -158
  20. package/build.rs +0 -5
  21. package/dist/native.d.ts +0 -208
  22. package/examples/ann_indexes.ts +0 -49
  23. package/examples/basic.ts +0 -149
  24. package/examples/embedding.ts +0 -83
  25. package/examples/filtering.ts +0 -34
  26. package/examples/jsconfig.json +0 -27
  27. package/examples/package-lock.json +0 -79
  28. package/examples/package.json +0 -18
  29. package/examples/search.ts +0 -37
  30. package/jest.config.js +0 -7
  31. package/lancedb/arrow.ts +0 -947
  32. package/lancedb/connection.ts +0 -333
  33. package/lancedb/embedding/embedding_function.ts +0 -194
  34. package/lancedb/embedding/index.ts +0 -113
  35. package/lancedb/embedding/openai.ts +0 -113
  36. package/lancedb/embedding/registry.ts +0 -188
  37. package/lancedb/index.ts +0 -142
  38. package/lancedb/indices.ts +0 -203
  39. package/lancedb/merge.ts +0 -70
  40. package/lancedb/query.ts +0 -507
  41. package/lancedb/remote/client.ts +0 -221
  42. package/lancedb/remote/connection.ts +0 -201
  43. package/lancedb/remote/index.ts +0 -3
  44. package/lancedb/remote/table.ts +0 -226
  45. package/lancedb/sanitize.ts +0 -588
  46. package/lancedb/table.ts +0 -669
  47. package/lancedb/util.ts +0 -69
  48. package/native.d.ts +0 -208
  49. package/nodejs-artifacts/arrow.d.ts +0 -250
  50. package/nodejs-artifacts/arrow.js +0 -768
  51. package/nodejs-artifacts/connection.d.ts +0 -171
  52. package/nodejs-artifacts/connection.js +0 -135
  53. package/nodejs-artifacts/embedding/embedding_function.d.ts +0 -79
  54. package/nodejs-artifacts/embedding/embedding_function.js +0 -112
  55. package/nodejs-artifacts/embedding/index.d.ts +0 -28
  56. package/nodejs-artifacts/embedding/index.js +0 -114
  57. package/nodejs-artifacts/embedding/openai.d.ts +0 -18
  58. package/nodejs-artifacts/embedding/openai.js +0 -105
  59. package/nodejs-artifacts/embedding/registry.d.ts +0 -53
  60. package/nodejs-artifacts/embedding/registry.js +0 -127
  61. package/nodejs-artifacts/index.d.ts +0 -55
  62. package/nodejs-artifacts/index.js +0 -57
  63. package/nodejs-artifacts/indices.d.ts +0 -165
  64. package/nodejs-artifacts/indices.js +0 -71
  65. package/nodejs-artifacts/merge.d.ts +0 -54
  66. package/nodejs-artifacts/merge.js +0 -64
  67. package/nodejs-artifacts/native.d.ts +0 -208
  68. package/nodejs-artifacts/native.js +0 -330
  69. package/nodejs-artifacts/query.d.ts +0 -283
  70. package/nodejs-artifacts/query.js +0 -448
  71. package/nodejs-artifacts/remote/client.d.ts +0 -28
  72. package/nodejs-artifacts/remote/client.js +0 -172
  73. package/nodejs-artifacts/remote/connection.d.ts +0 -25
  74. package/nodejs-artifacts/remote/connection.js +0 -110
  75. package/nodejs-artifacts/remote/index.d.ts +0 -3
  76. package/nodejs-artifacts/remote/index.js +0 -9
  77. package/nodejs-artifacts/remote/table.d.ts +0 -42
  78. package/nodejs-artifacts/remote/table.js +0 -179
  79. package/nodejs-artifacts/sanitize.d.ts +0 -31
  80. package/nodejs-artifacts/sanitize.js +0 -436
  81. package/nodejs-artifacts/table.d.ts +0 -395
  82. package/nodejs-artifacts/table.js +0 -230
  83. package/nodejs-artifacts/util.d.ts +0 -14
  84. package/nodejs-artifacts/util.js +0 -65
  85. package/tsconfig.json +0 -25
  86. package/typedoc.json +0 -10
@@ -1,768 +0,0 @@
1
- "use strict";
2
- // Copyright 2023 Lance Developers.
3
- //
4
- // Licensed under the Apache License, Version 2.0 (the "License");
5
- // you may not use this file except in compliance with the License.
6
- // You may obtain a copy of the License at
7
- //
8
- // http://www.apache.org/licenses/LICENSE-2.0
9
- //
10
- // Unless required by applicable law or agreed to in writing, software
11
- // distributed under the License is distributed on an "AS IS" BASIS,
12
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- // See the License for the specific language governing permissions and
14
- // limitations under the License.
15
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
16
- if (k2 === undefined) k2 = k;
17
- var desc = Object.getOwnPropertyDescriptor(m, k);
18
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
19
- desc = { enumerable: true, get: function() { return m[k]; } };
20
- }
21
- Object.defineProperty(o, k2, desc);
22
- }) : (function(o, m, k, k2) {
23
- if (k2 === undefined) k2 = k;
24
- o[k2] = m[k];
25
- }));
26
- var __exportStar = (this && this.__exportStar) || function(m, exports) {
27
- for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
28
- };
29
- Object.defineProperty(exports, "__esModule", { value: true });
30
- exports.createEmptyTable = exports.fromTableToStreamBuffer = exports.fromDataToBuffer = exports.fromTableToBuffer = exports.fromRecordsToStreamBuffer = exports.fromRecordsToBuffer = exports.newVectorType = exports.convertToTable = exports.makeEmptyTable = exports.makeArrowTable = exports.MakeArrowTableOptions = exports.VectorColumnOptions = exports.isFixedSizeList = exports.isFixedSizeBinary = exports.isUnion = exports.isStruct = exports.isList = exports.isDuration = exports.isInterval = exports.isTimestamp = exports.isTime = exports.isDate = exports.isDecimal = exports.isBool = exports.isLargeUtf8 = exports.isUtf8 = exports.isLargeBinary = exports.isBinary = exports.isFloat = exports.isInt = exports.isNull = exports.isDataType = exports.isArrowTable = void 0;
31
- const apache_arrow_1 = require("apache-arrow");
32
- const registry_1 = require("./embedding/registry");
33
- const sanitize_1 = require("./sanitize");
34
- __exportStar(require("apache-arrow"), exports);
35
- function isArrowTable(value) {
36
- if (value instanceof apache_arrow_1.Table)
37
- return true;
38
- return "schema" in value && "batches" in value;
39
- }
40
- exports.isArrowTable = isArrowTable;
41
- function isDataType(value) {
42
- return (value instanceof apache_arrow_1.DataType ||
43
- apache_arrow_1.DataType.isNull(value) ||
44
- apache_arrow_1.DataType.isInt(value) ||
45
- apache_arrow_1.DataType.isFloat(value) ||
46
- apache_arrow_1.DataType.isBinary(value) ||
47
- apache_arrow_1.DataType.isLargeBinary(value) ||
48
- apache_arrow_1.DataType.isUtf8(value) ||
49
- apache_arrow_1.DataType.isLargeUtf8(value) ||
50
- apache_arrow_1.DataType.isBool(value) ||
51
- apache_arrow_1.DataType.isDecimal(value) ||
52
- apache_arrow_1.DataType.isDate(value) ||
53
- apache_arrow_1.DataType.isTime(value) ||
54
- apache_arrow_1.DataType.isTimestamp(value) ||
55
- apache_arrow_1.DataType.isInterval(value) ||
56
- apache_arrow_1.DataType.isDuration(value) ||
57
- apache_arrow_1.DataType.isList(value) ||
58
- apache_arrow_1.DataType.isStruct(value) ||
59
- apache_arrow_1.DataType.isUnion(value) ||
60
- apache_arrow_1.DataType.isFixedSizeBinary(value) ||
61
- apache_arrow_1.DataType.isFixedSizeList(value) ||
62
- apache_arrow_1.DataType.isMap(value) ||
63
- apache_arrow_1.DataType.isDictionary(value));
64
- }
65
- exports.isDataType = isDataType;
66
- function isNull(value) {
67
- return value instanceof apache_arrow_1.Null || apache_arrow_1.DataType.isNull(value);
68
- }
69
- exports.isNull = isNull;
70
- function isInt(value) {
71
- return value instanceof apache_arrow_1.Int || apache_arrow_1.DataType.isInt(value);
72
- }
73
- exports.isInt = isInt;
74
- function isFloat(value) {
75
- return value instanceof apache_arrow_1.Float || apache_arrow_1.DataType.isFloat(value);
76
- }
77
- exports.isFloat = isFloat;
78
- function isBinary(value) {
79
- return value instanceof apache_arrow_1.Binary || apache_arrow_1.DataType.isBinary(value);
80
- }
81
- exports.isBinary = isBinary;
82
- function isLargeBinary(value) {
83
- return value instanceof apache_arrow_1.LargeBinary || apache_arrow_1.DataType.isLargeBinary(value);
84
- }
85
- exports.isLargeBinary = isLargeBinary;
86
- function isUtf8(value) {
87
- return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isUtf8(value);
88
- }
89
- exports.isUtf8 = isUtf8;
90
- function isLargeUtf8(value) {
91
- return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isLargeUtf8(value);
92
- }
93
- exports.isLargeUtf8 = isLargeUtf8;
94
- function isBool(value) {
95
- return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isBool(value);
96
- }
97
- exports.isBool = isBool;
98
- function isDecimal(value) {
99
- return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDecimal(value);
100
- }
101
- exports.isDecimal = isDecimal;
102
- function isDate(value) {
103
- return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDate(value);
104
- }
105
- exports.isDate = isDate;
106
- function isTime(value) {
107
- return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTime(value);
108
- }
109
- exports.isTime = isTime;
110
- function isTimestamp(value) {
111
- return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isTimestamp(value);
112
- }
113
- exports.isTimestamp = isTimestamp;
114
- function isInterval(value) {
115
- return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isInterval(value);
116
- }
117
- exports.isInterval = isInterval;
118
- function isDuration(value) {
119
- return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isDuration(value);
120
- }
121
- exports.isDuration = isDuration;
122
- function isList(value) {
123
- return value instanceof apache_arrow_1.List || apache_arrow_1.DataType.isList(value);
124
- }
125
- exports.isList = isList;
126
- function isStruct(value) {
127
- return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isStruct(value);
128
- }
129
- exports.isStruct = isStruct;
130
- function isUnion(value) {
131
- return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isUnion(value);
132
- }
133
- exports.isUnion = isUnion;
134
- function isFixedSizeBinary(value) {
135
- return value instanceof apache_arrow_1.FixedSizeBinary || apache_arrow_1.DataType.isFixedSizeBinary(value);
136
- }
137
- exports.isFixedSizeBinary = isFixedSizeBinary;
138
- function isFixedSizeList(value) {
139
- return value instanceof apache_arrow_1.FixedSizeList || apache_arrow_1.DataType.isFixedSizeList(value);
140
- }
141
- exports.isFixedSizeList = isFixedSizeList;
142
- /*
143
- * Options to control how a column should be converted to a vector array
144
- */
145
- class VectorColumnOptions {
146
- /** Vector column type. */
147
- type = new apache_arrow_1.Float32();
148
- constructor(values) {
149
- Object.assign(this, values);
150
- }
151
- }
152
- exports.VectorColumnOptions = VectorColumnOptions;
153
- /** Options to control the makeArrowTable call. */
154
- class MakeArrowTableOptions {
155
- /*
156
- * Schema of the data.
157
- *
158
- * If this is not provided then the data type will be inferred from the
159
- * JS type. Integer numbers will become int64, floating point numbers
160
- * will become float64 and arrays will become variable sized lists with
161
- * the data type inferred from the first element in the array.
162
- *
163
- * The schema must be specified if there are no records (e.g. to make
164
- * an empty table)
165
- */
166
- schema;
167
- /*
168
- * Mapping from vector column name to expected type
169
- *
170
- * Lance expects vector columns to be fixed size list arrays (i.e. tensors)
171
- * However, `makeArrowTable` will not infer this by default (it creates
172
- * variable size list arrays). This field can be used to indicate that a column
173
- * should be treated as a vector column and converted to a fixed size list.
174
- *
175
- * The keys should be the names of the vector columns. The value specifies the
176
- * expected data type of the vector columns.
177
- *
178
- * If `schema` is provided then this field is ignored.
179
- *
180
- * By default, the column named "vector" will be assumed to be a float32
181
- * vector column.
182
- */
183
- vectorColumns = {
184
- vector: new VectorColumnOptions(),
185
- };
186
- embeddings;
187
- embeddingFunction;
188
- /**
189
- * If true then string columns will be encoded with dictionary encoding
190
- *
191
- * Set this to true if your string columns tend to repeat the same values
192
- * often. For more precise control use the `schema` property to specify the
193
- * data type for individual columns.
194
- *
195
- * If `schema` is provided then this property is ignored.
196
- */
197
- dictionaryEncodeStrings = false;
198
- constructor(values) {
199
- Object.assign(this, values);
200
- }
201
- }
202
- exports.MakeArrowTableOptions = MakeArrowTableOptions;
203
- /**
204
- * An enhanced version of the {@link makeTable} function from Apache Arrow
205
- * that supports nested fields and embeddings columns.
206
- *
207
- * (typically you do not need to call this function. It will be called automatically
208
- * when creating a table or adding data to it)
209
- *
210
- * This function converts an array of Record<String, any> (row-major JS objects)
211
- * to an Arrow Table (a columnar structure)
212
- *
213
- * Note that it currently does not support nulls.
214
- *
215
- * If a schema is provided then it will be used to determine the resulting array
216
- * types. Fields will also be reordered to fit the order defined by the schema.
217
- *
218
- * If a schema is not provided then the types will be inferred and the field order
219
- * will be controlled by the order of properties in the first record. If a type
220
- * is inferred it will always be nullable.
221
- *
222
- * If the input is empty then a schema must be provided to create an empty table.
223
- *
224
- * When a schema is not specified then data types will be inferred. The inference
225
- * rules are as follows:
226
- *
227
- * - boolean => Bool
228
- * - number => Float64
229
- * - String => Utf8
230
- * - Buffer => Binary
231
- * - Record<String, any> => Struct
232
- * - Array<any> => List
233
- * @example
234
- * import { fromTableToBuffer, makeArrowTable } from "../arrow";
235
- * import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow";
236
- *
237
- * const schema = new Schema([
238
- * new Field("a", new Int32()),
239
- * new Field("b", new Float32()),
240
- * new Field("c", new FixedSizeList(3, new Field("item", new Float16()))),
241
- * ]);
242
- * const table = makeArrowTable([
243
- * { a: 1, b: 2, c: [1, 2, 3] },
244
- * { a: 4, b: 5, c: [4, 5, 6] },
245
- * { a: 7, b: 8, c: [7, 8, 9] },
246
- * ], { schema });
247
- * ```
248
- *
249
- * By default it assumes that the column named `vector` is a vector column
250
- * and it will be converted into a fixed size list array of type float32.
251
- * The `vectorColumns` option can be used to support other vector column
252
- * names and data types.
253
- *
254
- * ```ts
255
- *
256
- * const schema = new Schema([
257
- new Field("a", new Float64()),
258
- new Field("b", new Float64()),
259
- new Field(
260
- "vector",
261
- new FixedSizeList(3, new Field("item", new Float32()))
262
- ),
263
- ]);
264
- const table = makeArrowTable([
265
- { a: 1, b: 2, vector: [1, 2, 3] },
266
- { a: 4, b: 5, vector: [4, 5, 6] },
267
- { a: 7, b: 8, vector: [7, 8, 9] },
268
- ]);
269
- assert.deepEqual(table.schema, schema);
270
- * ```
271
- *
272
- * You can specify the vector column types and names using the options as well
273
- *
274
- * ```typescript
275
- *
276
- * const schema = new Schema([
277
- new Field('a', new Float64()),
278
- new Field('b', new Float64()),
279
- new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
280
- new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
281
- ]);
282
- * const table = makeArrowTable([
283
- { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
284
- { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
285
- { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
286
- ], {
287
- vectorColumns: {
288
- vec1: { type: new Float16() },
289
- vec2: { type: new Float16() }
290
- }
291
- }
292
- * assert.deepEqual(table.schema, schema)
293
- * ```
294
- */
295
- function makeArrowTable(data, options, metadata) {
296
- if (data.length === 0 &&
297
- (options?.schema === undefined || options?.schema === null)) {
298
- throw new Error("At least one record or a schema needs to be provided");
299
- }
300
- const opt = new MakeArrowTableOptions(options !== undefined ? options : {});
301
- if (opt.schema !== undefined && opt.schema !== null) {
302
- opt.schema = (0, sanitize_1.sanitizeSchema)(opt.schema);
303
- opt.schema = validateSchemaEmbeddings(opt.schema, data, options?.embeddingFunction);
304
- }
305
- const columns = {};
306
- // TODO: sample dataset to find missing columns
307
- // Prefer the field ordering of the schema, if present
308
- const columnNames = opt.schema != null ? opt.schema.names : Object.keys(data[0]);
309
- for (const colName of columnNames) {
310
- if (data.length !== 0 &&
311
- !Object.prototype.hasOwnProperty.call(data[0], colName)) {
312
- // The field is present in the schema, but not in the data, skip it
313
- continue;
314
- }
315
- // Extract a single column from the records (transpose from row-major to col-major)
316
- let values = data.map((datum) => datum[colName]);
317
- // By default (type === undefined) arrow will infer the type from the JS type
318
- let type;
319
- if (opt.schema !== undefined) {
320
- // If there is a schema provided, then use that for the type instead
321
- type = opt.schema?.fields.filter((f) => f.name === colName)[0]?.type;
322
- if (apache_arrow_1.DataType.isInt(type) && type.bitWidth === 64) {
323
- // wrap in BigInt to avoid bug: https://github.com/apache/arrow/issues/40051
324
- values = values.map((v) => {
325
- if (v === null) {
326
- return v;
327
- }
328
- if (typeof v === "bigint") {
329
- return v;
330
- }
331
- if (typeof v === "number") {
332
- return BigInt(v);
333
- }
334
- throw new Error(`Expected BigInt or number for column ${colName}, got ${typeof v}`);
335
- });
336
- }
337
- }
338
- else {
339
- // Otherwise, check to see if this column is one of the vector columns
340
- // defined by opt.vectorColumns and, if so, use the fixed size list type
341
- const vectorColumnOptions = opt.vectorColumns[colName];
342
- if (vectorColumnOptions !== undefined) {
343
- const firstNonNullValue = values.find((v) => v !== null);
344
- if (Array.isArray(firstNonNullValue)) {
345
- type = newVectorType(firstNonNullValue.length, vectorColumnOptions.type);
346
- }
347
- else {
348
- throw new Error(`Column ${colName} is expected to be a vector column but first non-null value is not an array. Could not determine size of vector column`);
349
- }
350
- }
351
- }
352
- try {
353
- // Convert an Array of JS values to an arrow vector
354
- columns[colName] = makeVector(values, type, opt.dictionaryEncodeStrings);
355
- }
356
- catch (error) {
357
- // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
358
- throw Error(`Could not convert column "${colName}" to Arrow: ${error}`);
359
- }
360
- }
361
- if (opt.schema != null) {
362
- // `new ArrowTable(columns)` infers a schema which may sometimes have
363
- // incorrect nullability (it assumes nullable=true always)
364
- //
365
- // `new ArrowTable(schema, columns)` will also fail because it will create a
366
- // batch with an inferred schema and then complain that the batch schema
367
- // does not match the provided schema.
368
- //
369
- // To work around this we first create a table with the wrong schema and
370
- // then patch the schema of the batches so we can use
371
- // `new ArrowTable(schema, batches)` which does not do any schema inference
372
- const firstTable = new apache_arrow_1.Table(columns);
373
- const batchesFixed = firstTable.batches.map((batch) => new apache_arrow_1.RecordBatch(opt.schema, batch.data));
374
- let schema;
375
- if (metadata !== undefined) {
376
- let schemaMetadata = opt.schema.metadata;
377
- if (schemaMetadata.size === 0) {
378
- schemaMetadata = metadata;
379
- }
380
- else {
381
- for (const [key, entry] of schemaMetadata.entries()) {
382
- schemaMetadata.set(key, entry);
383
- }
384
- }
385
- schema = new apache_arrow_1.Schema(opt.schema.fields, schemaMetadata);
386
- }
387
- else {
388
- schema = opt.schema;
389
- }
390
- return new apache_arrow_1.Table(schema, batchesFixed);
391
- }
392
- const tbl = new apache_arrow_1.Table(columns);
393
- if (metadata !== undefined) {
394
- // biome-ignore lint/suspicious/noExplicitAny: <explanation>
395
- tbl.schema.metadata = metadata;
396
- }
397
- return tbl;
398
- }
399
- exports.makeArrowTable = makeArrowTable;
400
- /**
401
- * Create an empty Arrow table with the provided schema
402
- */
403
- function makeEmptyTable(schema, metadata) {
404
- return makeArrowTable([], { schema }, metadata);
405
- }
406
- exports.makeEmptyTable = makeEmptyTable;
407
- /**
408
- * Helper function to convert Array<Array<any>> to a variable sized list array
409
- */
410
- // @ts-expect-error (Vector<unknown> is not assignable to Vector<any>)
411
- function makeListVector(lists) {
412
- if (lists.length === 0 || lists[0].length === 0) {
413
- throw Error("Cannot infer list vector from empty array or empty list");
414
- }
415
- const sampleList = lists[0];
416
- // biome-ignore lint/suspicious/noExplicitAny: skip
417
- let inferredType;
418
- try {
419
- const sampleVector = makeVector(sampleList);
420
- inferredType = sampleVector.type;
421
- }
422
- catch (error) {
423
- // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
424
- throw Error(`Cannot infer list vector. Cannot infer inner type: ${error}`);
425
- }
426
- const listBuilder = (0, apache_arrow_1.makeBuilder)({
427
- type: new apache_arrow_1.List(new apache_arrow_1.Field("item", inferredType, true)),
428
- });
429
- for (const list of lists) {
430
- listBuilder.append(list);
431
- }
432
- return listBuilder.finish().toVector();
433
- }
434
- /** Helper function to convert an Array of JS values to an Arrow Vector */
435
- function makeVector(values, type, stringAsDictionary) {
436
- if (type !== undefined) {
437
- // No need for inference, let Arrow create it
438
- return (0, apache_arrow_1.vectorFromArray)(values, type);
439
- }
440
- if (values.length === 0) {
441
- throw Error("makeVector requires at least one value or the type must be specfied");
442
- }
443
- const sampleValue = values.find((val) => val !== null && val !== undefined);
444
- if (sampleValue === undefined) {
445
- throw Error("makeVector cannot infer the type if all values are null or undefined");
446
- }
447
- if (Array.isArray(sampleValue)) {
448
- // Default Arrow inference doesn't handle list types
449
- return makeListVector(values);
450
- }
451
- else if (Buffer.isBuffer(sampleValue)) {
452
- // Default Arrow inference doesn't handle Buffer
453
- return (0, apache_arrow_1.vectorFromArray)(values, new apache_arrow_1.Binary());
454
- }
455
- else if (!(stringAsDictionary ?? false) &&
456
- (typeof sampleValue === "string" || sampleValue instanceof String)) {
457
- // If the type is string then don't use Arrow's default inference unless dictionaries are requested
458
- // because it will always use dictionary encoding for strings
459
- return (0, apache_arrow_1.vectorFromArray)(values, new apache_arrow_1.Utf8());
460
- }
461
- else {
462
- // Convert a JS array of values to an arrow vector
463
- return (0, apache_arrow_1.vectorFromArray)(values);
464
- }
465
- }
466
- /** Helper function to apply embeddings from metadata to an input table */
467
- async function applyEmbeddingsFromMetadata(table, schema) {
468
- const registry = (0, registry_1.getRegistry)();
469
- const functions = registry.parseFunctions(schema.metadata);
470
- const columns = Object.fromEntries(table.schema.fields.map((field) => [
471
- field.name,
472
- table.getChild(field.name),
473
- ]));
474
- for (const functionEntry of functions.values()) {
475
- const sourceColumn = columns[functionEntry.sourceColumn];
476
- const destColumn = functionEntry.vectorColumn ?? "vector";
477
- if (sourceColumn === undefined) {
478
- throw new Error(`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`);
479
- }
480
- if (columns[destColumn] !== undefined) {
481
- throw new Error(`Attempt to apply embeddings to table failed because column ${destColumn} already existed`);
482
- }
483
- if (table.batches.length > 1) {
484
- throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
485
- }
486
- const values = sourceColumn.toArray();
487
- const vectors = await functionEntry.function.computeSourceEmbeddings(values);
488
- if (vectors.length !== values.length) {
489
- throw new Error("Embedding function did not return an embedding for each input element");
490
- }
491
- let destType;
492
- const dtype = schema.fields.find((f) => f.name === destColumn).type;
493
- if (isFixedSizeList(dtype)) {
494
- destType = (0, sanitize_1.sanitizeType)(dtype);
495
- }
496
- else {
497
- throw new Error("Expected FixedSizeList as datatype for vector field, instead got: " +
498
- dtype);
499
- }
500
- const vector = makeVector(vectors, destType);
501
- columns[destColumn] = vector;
502
- }
503
- const newTable = new apache_arrow_1.Table(columns);
504
- return alignTable(newTable, schema);
505
- }
506
- /** Helper function to apply embeddings to an input table */
507
- async function applyEmbeddings(table, embeddings, schema) {
508
- if (schema !== undefined && schema !== null) {
509
- schema = (0, sanitize_1.sanitizeSchema)(schema);
510
- }
511
- if (schema?.metadata.has("embedding_functions")) {
512
- return applyEmbeddingsFromMetadata(table, schema);
513
- }
514
- else if (embeddings == null || embeddings === undefined) {
515
- return table;
516
- }
517
- // Convert from ArrowTable to Record<String, Vector>
518
- const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
519
- const name = table.schema.fields[idx].name;
520
- // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
521
- const vec = table.getChildAt(idx);
522
- return [name, vec];
523
- });
524
- const newColumns = Object.fromEntries(colEntries);
525
- const sourceColumn = newColumns[embeddings.sourceColumn];
526
- const destColumn = embeddings.vectorColumn ?? "vector";
527
- const innerDestType = embeddings.function.embeddingDataType() ?? new apache_arrow_1.Float32();
528
- if (sourceColumn === undefined) {
529
- throw new Error(`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`);
530
- }
531
- if (table.numRows === 0) {
532
- if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
533
- // We have an empty table and it already has the embedding column so no work needs to be done
534
- // Note: we don't return an error like we did below because this is a common occurrence. For example,
535
- // if we call convertToTable with 0 records and a schema that includes the embedding
536
- return table;
537
- }
538
- const dimensions = embeddings.function.ndims();
539
- if (dimensions !== undefined) {
540
- const destType = newVectorType(dimensions, innerDestType);
541
- newColumns[destColumn] = makeVector([], destType);
542
- }
543
- else if (schema != null) {
544
- const destField = schema.fields.find((f) => f.name === destColumn);
545
- if (destField != null) {
546
- newColumns[destColumn] = makeVector([], destField.type);
547
- }
548
- else {
549
- throw new Error(`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`);
550
- }
551
- }
552
- else {
553
- throw new Error("Attempt to apply embeddings to an empty table when the embeddings function does not specify `embeddingDimension`");
554
- }
555
- }
556
- else {
557
- if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
558
- throw new Error(`Attempt to apply embeddings to table failed because column ${destColumn} already existed`);
559
- }
560
- if (table.batches.length > 1) {
561
- throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
562
- }
563
- const values = sourceColumn.toArray();
564
- const vectors = await embeddings.function.computeSourceEmbeddings(values);
565
- if (vectors.length !== values.length) {
566
- throw new Error("Embedding function did not return an embedding for each input element");
567
- }
568
- const destType = newVectorType(vectors[0].length, innerDestType);
569
- newColumns[destColumn] = makeVector(vectors, destType);
570
- }
571
- const newTable = new apache_arrow_1.Table(newColumns);
572
- if (schema != null) {
573
- if (schema.fields.find((f) => f.name === destColumn) === undefined) {
574
- throw new Error(`When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`);
575
- }
576
- return alignTable(newTable, schema);
577
- }
578
- return newTable;
579
- }
580
- /**
581
- * Convert an Array of records into an Arrow Table, optionally applying an
582
- * embeddings function to it.
583
- *
584
- * This function calls `makeArrowTable` first to create the Arrow Table.
585
- * Any provided `makeTableOptions` (e.g. a schema) will be passed on to
586
- * that call.
587
- *
588
- * The embedding function will be passed a column of values (based on the
589
- * `sourceColumn` of the embedding function) and expects to receive back
590
- * number[][] which will be converted into a fixed size list column. By
591
- * default this will be a fixed size list of Float32 but that can be
592
- * customized by the `embeddingDataType` property of the embedding function.
593
- *
594
- * If a schema is provided in `makeTableOptions` then it should include the
595
- * embedding columns. If no schema is provded then embedding columns will
596
- * be placed at the end of the table, after all of the input columns.
597
- */
598
- async function convertToTable(data, embeddings, makeTableOptions) {
599
- const table = makeArrowTable(data, makeTableOptions);
600
- return await applyEmbeddings(table, embeddings, makeTableOptions?.schema);
601
- }
602
- exports.convertToTable = convertToTable;
603
- /** Creates the Arrow Type for a Vector column with dimension `dim` */
604
- function newVectorType(dim, innerType) {
605
- // in Lance we always default to have the elements nullable, so we need to set it to true
606
- // otherwise we often get schema mismatches because the stored data always has schema with nullable elements
607
- const children = new apache_arrow_1.Field("item", (0, sanitize_1.sanitizeType)(innerType), true);
608
- return new apache_arrow_1.FixedSizeList(dim, children);
609
- }
610
- exports.newVectorType = newVectorType;
611
- /**
612
- * Serialize an Array of records into a buffer using the Arrow IPC File serialization
613
- *
614
- * This function will call `convertToTable` and pass on `embeddings` and `schema`
615
- *
616
- * `schema` is required if data is empty
617
- */
618
- async function fromRecordsToBuffer(data, embeddings, schema) {
619
- if (schema !== undefined && schema !== null) {
620
- schema = (0, sanitize_1.sanitizeSchema)(schema);
621
- }
622
- const table = await convertToTable(data, embeddings, { schema });
623
- const writer = apache_arrow_1.RecordBatchFileWriter.writeAll(table);
624
- return Buffer.from(await writer.toUint8Array());
625
- }
626
- exports.fromRecordsToBuffer = fromRecordsToBuffer;
627
- /**
628
- * Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
629
- *
630
- * This function will call `convertToTable` and pass on `embeddings` and `schema`
631
- *
632
- * `schema` is required if data is empty
633
- */
634
- async function fromRecordsToStreamBuffer(data, embeddings, schema) {
635
- if (schema !== undefined && schema !== null) {
636
- schema = (0, sanitize_1.sanitizeSchema)(schema);
637
- }
638
- const table = await convertToTable(data, embeddings, { schema });
639
- const writer = apache_arrow_1.RecordBatchStreamWriter.writeAll(table);
640
- return Buffer.from(await writer.toUint8Array());
641
- }
642
- exports.fromRecordsToStreamBuffer = fromRecordsToStreamBuffer;
643
- /**
644
- * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
645
- *
646
- * This function will apply `embeddings` to the table in a manner similar to
647
- * `convertToTable`.
648
- *
649
- * `schema` is required if the table is empty
650
- */
651
- async function fromTableToBuffer(table, embeddings, schema) {
652
- if (schema !== undefined && schema !== null) {
653
- schema = (0, sanitize_1.sanitizeSchema)(schema);
654
- }
655
- const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
656
- const writer = apache_arrow_1.RecordBatchFileWriter.writeAll(tableWithEmbeddings);
657
- return Buffer.from(await writer.toUint8Array());
658
- }
659
- exports.fromTableToBuffer = fromTableToBuffer;
660
- /**
661
- * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
662
- *
663
- * This function will apply `embeddings` to the table in a manner similar to
664
- * `convertToTable`.
665
- *
666
- * `schema` is required if the table is empty
667
- */
668
- async function fromDataToBuffer(data, embeddings, schema) {
669
- if (schema !== undefined && schema !== null) {
670
- schema = (0, sanitize_1.sanitizeSchema)(schema);
671
- }
672
- if (isArrowTable(data)) {
673
- return fromTableToBuffer((0, sanitize_1.sanitizeTable)(data), embeddings, schema);
674
- }
675
- else {
676
- const table = await convertToTable(data, embeddings, { schema });
677
- return fromTableToBuffer(table);
678
- }
679
- }
680
- exports.fromDataToBuffer = fromDataToBuffer;
681
- /**
682
- * Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
683
- *
684
- * This function will apply `embeddings` to the table in a manner similar to
685
- * `convertToTable`.
686
- *
687
- * `schema` is required if the table is empty
688
- */
689
- async function fromTableToStreamBuffer(table, embeddings, schema) {
690
- const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
691
- const writer = apache_arrow_1.RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
692
- return Buffer.from(await writer.toUint8Array());
693
- }
694
- exports.fromTableToStreamBuffer = fromTableToStreamBuffer;
695
- /**
696
- * Reorder the columns in `batch` so that they agree with the field order in `schema`
697
- */
698
- function alignBatch(batch, schema) {
699
- const alignedChildren = [];
700
- for (const field of schema.fields) {
701
- const indexInBatch = batch.schema.fields?.findIndex((f) => f.name === field.name);
702
- if (indexInBatch < 0) {
703
- throw new Error(`The column ${field.name} was not found in the Arrow Table`);
704
- }
705
- alignedChildren.push(batch.data.children[indexInBatch]);
706
- }
707
- const newData = (0, apache_arrow_1.makeData)({
708
- type: new apache_arrow_1.Struct(schema.fields),
709
- length: batch.numRows,
710
- nullCount: batch.nullCount,
711
- children: alignedChildren,
712
- });
713
- return new apache_arrow_1.RecordBatch(schema, newData);
714
- }
715
- /**
716
- * Reorder the columns in `table` so that they agree with the field order in `schema`
717
- */
718
- function alignTable(table, schema) {
719
- const alignedBatches = table.batches.map((batch) => alignBatch(batch, schema));
720
- return new apache_arrow_1.Table(schema, alignedBatches);
721
- }
722
- /**
723
- * Create an empty table with the given schema
724
- */
725
- function createEmptyTable(schema) {
726
- return new apache_arrow_1.Table((0, sanitize_1.sanitizeSchema)(schema));
727
- }
728
- exports.createEmptyTable = createEmptyTable;
729
- function validateSchemaEmbeddings(schema, data, embeddings) {
730
- const fields = [];
731
- const missingEmbeddingFields = [];
732
- // First we check if the field is a `FixedSizeList`
733
- // Then we check if the data contains the field
734
- // if it does not, we add it to the list of missing embedding fields
735
- // Finally, we check if those missing embedding fields are `this._embeddings`
736
- // if they are not, we throw an error
737
- for (let field of schema.fields) {
738
- if (isFixedSizeList(field.type)) {
739
- field = (0, sanitize_1.sanitizeField)(field);
740
- if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
741
- if (schema.metadata.has("embedding_functions")) {
742
- const embeddings = JSON.parse(schema.metadata.get("embedding_functions"));
743
- if (
744
- // biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
745
- embeddings.find((f) => f["vectorColumn"] === field.name) ===
746
- undefined) {
747
- missingEmbeddingFields.push(field);
748
- }
749
- }
750
- else {
751
- missingEmbeddingFields.push(field);
752
- }
753
- }
754
- else {
755
- fields.push(field);
756
- }
757
- }
758
- else {
759
- fields.push(field);
760
- }
761
- }
762
- if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
763
- throw new Error(`Table has embeddings: "${missingEmbeddingFields
764
- .map((f) => f.name)
765
- .join(",")}", but no embedding function was provided`);
766
- }
767
- return new apache_arrow_1.Schema(fields, schema.metadata);
768
- }