@dengxifeng/lancedb 0.26.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +13 -0
- package/CONTRIBUTING.md +76 -0
- package/README.md +37 -0
- package/dist/arrow.d.ts +279 -0
- package/dist/arrow.js +1316 -0
- package/dist/connection.d.ts +259 -0
- package/dist/connection.js +224 -0
- package/dist/embedding/embedding_function.d.ts +103 -0
- package/dist/embedding/embedding_function.js +192 -0
- package/dist/embedding/index.d.ts +27 -0
- package/dist/embedding/index.js +101 -0
- package/dist/embedding/openai.d.ts +16 -0
- package/dist/embedding/openai.js +93 -0
- package/dist/embedding/registry.d.ts +74 -0
- package/dist/embedding/registry.js +165 -0
- package/dist/embedding/transformers.d.ts +36 -0
- package/dist/embedding/transformers.js +122 -0
- package/dist/header.d.ts +162 -0
- package/dist/header.js +217 -0
- package/dist/index.d.ts +85 -0
- package/dist/index.js +106 -0
- package/dist/indices.d.ts +692 -0
- package/dist/indices.js +156 -0
- package/dist/merge.d.ts +80 -0
- package/dist/merge.js +92 -0
- package/dist/native.d.ts +585 -0
- package/dist/native.js +339 -0
- package/dist/permutation.d.ts +143 -0
- package/dist/permutation.js +184 -0
- package/dist/query.d.ts +581 -0
- package/dist/query.js +853 -0
- package/dist/rerankers/index.d.ts +5 -0
- package/dist/rerankers/index.js +19 -0
- package/dist/rerankers/rrf.d.ts +14 -0
- package/dist/rerankers/rrf.js +28 -0
- package/dist/sanitize.d.ts +32 -0
- package/dist/sanitize.js +473 -0
- package/dist/table.d.ts +581 -0
- package/dist/table.js +321 -0
- package/dist/util.d.ts +14 -0
- package/dist/util.js +77 -0
- package/license_header.txt +2 -0
- package/package.json +122 -0
package/dist/arrow.js
ADDED
|
@@ -0,0 +1,1316 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
4
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
5
|
+
if (k2 === undefined) k2 = k;
|
|
6
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
7
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
8
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
9
|
+
}
|
|
10
|
+
Object.defineProperty(o, k2, desc);
|
|
11
|
+
}) : (function(o, m, k, k2) {
|
|
12
|
+
if (k2 === undefined) k2 = k;
|
|
13
|
+
o[k2] = m[k];
|
|
14
|
+
}));
|
|
15
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
16
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
17
|
+
};
|
|
18
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
19
|
+
exports.MakeArrowTableOptions = exports.VectorColumnOptions = void 0;
|
|
20
|
+
exports.isMultiVector = isMultiVector;
|
|
21
|
+
exports.isIntoVector = isIntoVector;
|
|
22
|
+
exports.isArrowTable = isArrowTable;
|
|
23
|
+
exports.isNull = isNull;
|
|
24
|
+
exports.isInt = isInt;
|
|
25
|
+
exports.isFloat = isFloat;
|
|
26
|
+
exports.isBinary = isBinary;
|
|
27
|
+
exports.isLargeBinary = isLargeBinary;
|
|
28
|
+
exports.isUtf8 = isUtf8;
|
|
29
|
+
exports.isLargeUtf8 = isLargeUtf8;
|
|
30
|
+
exports.isBool = isBool;
|
|
31
|
+
exports.isDecimal = isDecimal;
|
|
32
|
+
exports.isDate = isDate;
|
|
33
|
+
exports.isTime = isTime;
|
|
34
|
+
exports.isTimestamp = isTimestamp;
|
|
35
|
+
exports.isInterval = isInterval;
|
|
36
|
+
exports.isDuration = isDuration;
|
|
37
|
+
exports.isList = isList;
|
|
38
|
+
exports.isStruct = isStruct;
|
|
39
|
+
exports.isUnion = isUnion;
|
|
40
|
+
exports.isFixedSizeBinary = isFixedSizeBinary;
|
|
41
|
+
exports.isFixedSizeList = isFixedSizeList;
|
|
42
|
+
exports.makeArrowTable = makeArrowTable;
|
|
43
|
+
exports.makeEmptyTable = makeEmptyTable;
|
|
44
|
+
exports.convertToTable = convertToTable;
|
|
45
|
+
exports.newVectorType = newVectorType;
|
|
46
|
+
exports.fromRecordsToBuffer = fromRecordsToBuffer;
|
|
47
|
+
exports.fromRecordsToStreamBuffer = fromRecordsToStreamBuffer;
|
|
48
|
+
exports.fromTableToBuffer = fromTableToBuffer;
|
|
49
|
+
exports.fromDataToBuffer = fromDataToBuffer;
|
|
50
|
+
exports.fromBufferToRecordBatch = fromBufferToRecordBatch;
|
|
51
|
+
exports.fromRecordBatchToBuffer = fromRecordBatchToBuffer;
|
|
52
|
+
exports.fromTableToStreamBuffer = fromTableToStreamBuffer;
|
|
53
|
+
exports.createEmptyTable = createEmptyTable;
|
|
54
|
+
exports.ensureNestedFieldsExist = ensureNestedFieldsExist;
|
|
55
|
+
exports.dataTypeToJson = dataTypeToJson;
|
|
56
|
+
const apache_arrow_1 = require("apache-arrow");
|
|
57
|
+
const registry_1 = require("./embedding/registry");
|
|
58
|
+
const sanitize_1 = require("./sanitize");
|
|
59
|
+
/**
 * Heuristic: does a field name look like it holds a vector/embedding column?
 * Case-insensitive substring match on "vector" or "embedding".
 */
function nameSuggestsVectorColumn(fieldName) {
    const lowered = fieldName.toLowerCase();
    return ["vector", "embedding"].some((hint) => lowered.includes(hint));
}
|
|
66
|
+
__exportStar(require("apache-arrow"), exports);
|
|
67
|
+
/** True if `value` is an array whose first element is itself vector-like. */
function isMultiVector(value) {
    if (!Array.isArray(value)) {
        return false;
    }
    return isIntoVector(value[0]);
}
|
|
70
|
+
/**
 * True if `value` can be converted into a single vector: a Float32Array,
 * a Float64Array, or a plain (non-nested) array.
 */
function isIntoVector(value) {
    if (value instanceof Float32Array || value instanceof Float64Array) {
        return true;
    }
    return Array.isArray(value) && !Array.isArray(value[0]);
}
|
|
75
|
+
/**
 * True if `value` is an Arrow Table, either a real instance or anything
 * table-shaped (duck typing tolerates multiple apache-arrow copies loaded).
 */
function isArrowTable(value) {
    if (value instanceof apache_arrow_1.Table) {
        return true;
    }
    return "schema" in value && "batches" in value;
}
|
|
80
|
+
/** Type guard: Arrow Null type (instance or DataType-compatible value). */
function isNull(value) {
    return value instanceof apache_arrow_1.Null || apache_arrow_1.DataType.isNull(value);
}
/** Type guard: Arrow Int type. */
function isInt(value) {
    return value instanceof apache_arrow_1.Int || apache_arrow_1.DataType.isInt(value);
}
/** Type guard: Arrow Float type. */
function isFloat(value) {
    return value instanceof apache_arrow_1.Float || apache_arrow_1.DataType.isFloat(value);
}
/** Type guard: Arrow Binary type. */
function isBinary(value) {
    return value instanceof apache_arrow_1.Binary || apache_arrow_1.DataType.isBinary(value);
}
/** Type guard: Arrow LargeBinary type. */
function isLargeBinary(value) {
    return value instanceof apache_arrow_1.LargeBinary || apache_arrow_1.DataType.isLargeBinary(value);
}
/** Type guard: Arrow Utf8 type. */
function isUtf8(value) {
    return value instanceof apache_arrow_1.Utf8 || apache_arrow_1.DataType.isUtf8(value);
}
|
|
98
|
+
/**
 * Type guards for Arrow data types (instance or DataType-compatible value).
 *
 * Fixed: each of these previously tested `value instanceof Utf8` (a
 * copy-paste error), so an actual Bool/Decimal/Date/... type instance that
 * failed the `DataType.isX` check was misclassified.
 */
function isLargeUtf8(value) {
    return value instanceof apache_arrow_1.LargeUtf8 || apache_arrow_1.DataType.isLargeUtf8(value);
}
function isBool(value) {
    return value instanceof apache_arrow_1.Bool || apache_arrow_1.DataType.isBool(value);
}
function isDecimal(value) {
    return value instanceof apache_arrow_1.Decimal || apache_arrow_1.DataType.isDecimal(value);
}
function isDate(value) {
    // apache-arrow exports the date type class as `Date_` to avoid clashing
    // with the global `Date`.
    return value instanceof apache_arrow_1.Date_ || apache_arrow_1.DataType.isDate(value);
}
function isTime(value) {
    return value instanceof apache_arrow_1.Time || apache_arrow_1.DataType.isTime(value);
}
function isTimestamp(value) {
    return value instanceof apache_arrow_1.Timestamp || apache_arrow_1.DataType.isTimestamp(value);
}
function isInterval(value) {
    return value instanceof apache_arrow_1.Interval || apache_arrow_1.DataType.isInterval(value);
}
function isDuration(value) {
    return value instanceof apache_arrow_1.Duration || apache_arrow_1.DataType.isDuration(value);
}
|
|
122
|
+
/** Type guard: Arrow List type (instance or DataType-compatible value). */
function isList(value) {
    return value instanceof apache_arrow_1.List || apache_arrow_1.DataType.isList(value);
}
/** Type guard: Arrow Struct type. */
function isStruct(value) {
    return value instanceof apache_arrow_1.Struct || apache_arrow_1.DataType.isStruct(value);
}
|
|
128
|
+
/**
 * Type guard: Arrow Union type.
 *
 * Fixed: previously tested `value instanceof Struct` (copy-paste error),
 * so a Union type instance not caught by `DataType.isUnion` was rejected
 * and a Struct instance was wrongly accepted.
 */
function isUnion(value) {
    return value instanceof apache_arrow_1.Union || apache_arrow_1.DataType.isUnion(value);
}
|
|
131
|
+
/** Type guard: Arrow FixedSizeBinary type. */
function isFixedSizeBinary(value) {
    return value instanceof apache_arrow_1.FixedSizeBinary || apache_arrow_1.DataType.isFixedSizeBinary(value);
}
/** Type guard: Arrow FixedSizeList type. */
function isFixedSizeList(value) {
    return value instanceof apache_arrow_1.FixedSizeList || apache_arrow_1.DataType.isFixedSizeList(value);
}
|
|
137
|
+
/**
 * Options controlling how a column is converted to a vector array.
 */
class VectorColumnOptions {
    /** Arrow value type for the vector's items. Defaults to Float32. */
    type = new apache_arrow_1.Float32();
    /**
     * @param values - partial options merged over the defaults
     */
    constructor(values) {
        Object.assign(this, values);
    }
}
|
|
147
|
+
exports.VectorColumnOptions = VectorColumnOptions;
|
|
148
|
+
// biome-ignore lint/suspicious/noExplicitAny: skip
/**
 * Wrapper around apache-arrow's vectorFromArray.
 *
 * For FixedSizeList<float> types it applies a workaround for
 * https://github.com/apache/arrow/issues/45862: append a dummy zero-vector,
 * build the array, then slice the dummy back off.
 */
function vectorFromArray(data, type) {
    if (type === undefined) {
        return (0, apache_arrow_1.vectorFromArray)(data);
    }
    if (apache_arrow_1.DataType.isFixedSizeList(type) && apache_arrow_1.DataType.isFloat(type.valueType)) {
        const padded = [...data, new Array(type.listSize).fill(0.0)];
        const built = (0, apache_arrow_1.vectorFromArray)(padded, type);
        return built.slice(0, data.length);
    }
    return (0, apache_arrow_1.vectorFromArray)(data, type);
}
|
|
164
|
+
/** Options to control the makeArrowTable call. */
class MakeArrowTableOptions {
    /*
     * Schema of the data.
     *
     * If this is not provided then the data type will be inferred from the
     * JS type. Integer numbers will become int64, floating point numbers
     * will become float64 and arrays will become variable sized lists with
     * the data type inferred from the first element in the array.
     *
     * The schema must be specified if there are no records (e.g. to make
     * an empty table)
     */
    schema;
    /*
     * Mapping from vector column name to expected type.
     *
     * Lance expects vector columns to be fixed size list arrays (i.e. tensors),
     * but `makeArrowTable` does not infer this by default (it creates variable
     * size list arrays). List a column here to have it converted to a fixed
     * size list of the given value type.
     *
     * Ignored when `schema` is provided. By default the column named "vector"
     * is assumed to be a float32 vector column.
     */
    vectorColumns = {
        vector: new VectorColumnOptions(),
    };
    embeddings;
    embeddingFunction;
    /**
     * If true then string columns will be encoded with dictionary encoding.
     *
     * Set this when your string columns tend to repeat the same values often.
     * For per-column control use the `schema` property instead. Ignored when
     * `schema` is provided.
     */
    dictionaryEncodeStrings = false;
    /**
     * @param values - partial options merged over the defaults
     */
    constructor(values) {
        Object.assign(this, values);
    }
}
|
|
213
|
+
exports.MakeArrowTableOptions = MakeArrowTableOptions;
|
|
214
|
+
/**
|
|
215
|
+
* An enhanced version of the apache-arrow makeTable function from Apache Arrow
|
|
216
|
+
* that supports nested fields and embeddings columns.
|
|
217
|
+
*
|
|
218
|
+
* (typically you do not need to call this function. It will be called automatically
|
|
219
|
+
* when creating a table or adding data to it)
|
|
220
|
+
*
|
|
221
|
+
* This function converts an array of Record<String, any> (row-major JS objects)
|
|
222
|
+
* to an Arrow Table (a columnar structure)
|
|
223
|
+
*
|
|
224
|
+
* If a schema is provided then it will be used to determine the resulting array
|
|
225
|
+
* types. Fields will also be reordered to fit the order defined by the schema.
|
|
226
|
+
*
|
|
227
|
+
* If a schema is not provided then the types will be inferred and the field order
|
|
228
|
+
* will be controlled by the order of properties in the first record. If a type
|
|
229
|
+
* is inferred it will always be nullable.
|
|
230
|
+
*
|
|
231
|
+
* If not all fields are found in the data, then a subset of the schema will be
|
|
232
|
+
* returned.
|
|
233
|
+
*
|
|
234
|
+
* If the input is empty then a schema must be provided to create an empty table.
|
|
235
|
+
*
|
|
236
|
+
* When a schema is not specified then data types will be inferred. The inference
|
|
237
|
+
* rules are as follows:
|
|
238
|
+
*
|
|
239
|
+
* - boolean => Bool
|
|
240
|
+
* - number => Float64
|
|
241
|
+
* - bigint => Int64
|
|
242
|
+
* - String => Utf8
|
|
243
|
+
* - Buffer => Binary
|
|
244
|
+
* - Record<String, any> => Struct
|
|
245
|
+
* - Array<any> => List
|
|
246
|
+
* @example
|
|
247
|
+
* ```ts
|
|
248
|
+
* import { fromTableToBuffer, makeArrowTable } from "../arrow";
|
|
249
|
+
* import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow";
|
|
250
|
+
*
|
|
251
|
+
* const schema = new Schema([
|
|
252
|
+
* new Field("a", new Int32()),
|
|
253
|
+
* new Field("b", new Float32()),
|
|
254
|
+
* new Field("c", new FixedSizeList(3, new Field("item", new Float16()))),
|
|
255
|
+
* ]);
|
|
256
|
+
* const table = makeArrowTable([
|
|
257
|
+
* { a: 1, b: 2, c: [1, 2, 3] },
|
|
258
|
+
* { a: 4, b: 5, c: [4, 5, 6] },
|
|
259
|
+
* { a: 7, b: 8, c: [7, 8, 9] },
|
|
260
|
+
* ], { schema });
|
|
261
|
+
* ```
|
|
262
|
+
*
|
|
263
|
+
* By default it assumes that the column named `vector` is a vector column
|
|
264
|
+
* and it will be converted into a fixed size list array of type float32.
|
|
265
|
+
* The `vectorColumns` option can be used to support other vector column
|
|
266
|
+
* names and data types.
|
|
267
|
+
*
|
|
268
|
+
* ```ts
|
|
269
|
+
* const schema = new Schema([
|
|
270
|
+
* new Field("a", new Float64()),
|
|
271
|
+
* new Field("b", new Float64()),
|
|
272
|
+
* new Field(
|
|
273
|
+
* "vector",
|
|
274
|
+
* new FixedSizeList(3, new Field("item", new Float32()))
|
|
275
|
+
* ),
|
|
276
|
+
* ]);
|
|
277
|
+
* const table = makeArrowTable([
|
|
278
|
+
* { a: 1, b: 2, vector: [1, 2, 3] },
|
|
279
|
+
* { a: 4, b: 5, vector: [4, 5, 6] },
|
|
280
|
+
* { a: 7, b: 8, vector: [7, 8, 9] },
|
|
281
|
+
* ]);
|
|
282
|
+
* assert.deepEqual(table.schema, schema);
|
|
283
|
+
* ```
|
|
284
|
+
*
|
|
285
|
+
* You can specify the vector column types and names using the options as well
|
|
286
|
+
*
|
|
287
|
+
* ```ts
|
|
288
|
+
* const schema = new Schema([
|
|
289
|
+
* new Field('a', new Float64()),
|
|
290
|
+
* new Field('b', new Float64()),
|
|
291
|
+
* new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
|
|
292
|
+
* new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
|
|
293
|
+
* ]);
|
|
294
|
+
* const table = makeArrowTable([
|
|
295
|
+
* { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
|
|
296
|
+
* { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
|
|
297
|
+
* { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
|
|
298
|
+
* ], {
|
|
299
|
+
* vectorColumns: {
|
|
300
|
+
* vec1: { type: new Float16() },
|
|
301
|
+
* vec2: { type: new Float16() }
|
|
302
|
+
* }
|
|
303
|
+
 * })
|
|
304
|
+
* assert.deepEqual(table.schema, schema)
|
|
305
|
+
* ```
|
|
306
|
+
*/
|
|
307
|
+
/**
 * Convert row-major JS records into an Arrow Table (see the JSDoc above for
 * the full contract and examples).
 *
 * @param data - array of Record<string, any> rows (may be empty if a schema
 *               is supplied)
 * @param options - MakeArrowTableOptions (schema, vectorColumns, ...)
 * @param metadata - extra schema metadata entries, merged over the schema's own
 * @throws if data is empty and no schema was provided
 */
function makeArrowTable(data, options, metadata) {
    const opt = new MakeArrowTableOptions(options ?? {});
    let schema;
    if (opt.schema != null) {
        // Sanitize tolerates schemas from a different apache-arrow copy, then
        // check the schema against any registered embedding functions.
        schema = (0, sanitize_1.sanitizeSchema)(opt.schema);
        schema = validateSchemaEmbeddings(schema, data, options?.embeddingFunction);
    }
    let schemaMetadata = schema?.metadata ?? new Map();
    if (metadata !== undefined) {
        schemaMetadata = new Map([...schemaMetadata, ...metadata]);
    }
    if (data.length === 0) {
        if (options?.schema == null) {
            throw new Error("At least one record or a schema needs to be provided");
        }
        if (schema === undefined) {
            throw new Error("A schema must be provided if data is empty");
        }
        return new apache_arrow_1.Table(new apache_arrow_1.Schema(schema.fields, schemaMetadata));
    }
    // Infer (or project) the schema from the data, then transpose the rows
    // into one column vector per field.
    let inferredSchema = inferSchema(data, schema, opt);
    inferredSchema = new apache_arrow_1.Schema(inferredSchema.fields, schemaMetadata);
    const finalColumns = {};
    for (const field of inferredSchema.fields) {
        finalColumns[field.name] = transposeData(data, field);
    }
    return new apache_arrow_1.Table(inferredSchema, finalColumns);
}
|
|
339
|
+
/**
 * Build the schema for `data`, either by inference or by projecting the
 * user-supplied `schema` onto the fields actually present in the data.
 *
 * @param data - array of row records
 * @param schema - optional sanitized Schema; when given, fields must exist in it
 * @param opts - MakeArrowTableOptions (used for type inference)
 * @throws on fields missing from the schema, un-inferable first values, or
 *         conflicting inferred types across rows
 */
function inferSchema(data, schema, opts) {
    // Collect every field path seen in the data, mapped to its Arrow type.
    const pathTree = new PathTree();
    for (const [rowI, row] of data.entries()) {
        for (const [path, value] of rowPathsAndValues(row)) {
            if (!pathTree.has(path)) {
                // First time seeing this field.
                if (schema !== undefined) {
                    const field = getFieldForPath(schema, path);
                    if (field === undefined) {
                        throw new Error(`Found field not in schema: ${path.join(".")} at row ${rowI}`);
                    }
                    pathTree.set(path, field.type);
                }
                else {
                    const inferredType = inferType(value, path, opts);
                    if (inferredType === undefined) {
                        throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \
Consider providing an explicit schema.`);
                    }
                    pathTree.set(path, inferredType);
                }
            }
            else if (schema === undefined) {
                const currentType = pathTree.get(path);
                const newType = inferType(value, path, opts);
                // Fixed: the original constructed this Error but never threw it,
                // and compared DataType instances by identity (inferType returns
                // a fresh instance per call, so identity always differs). Compare
                // the string forms instead, and skip values whose type cannot be
                // inferred (e.g. nulls) to keep the original lenient null handling.
                if (newType !== undefined &&
                    String(currentType) !== String(newType)) {
                    throw new Error(`Failed to infer schema for data. Previously inferred type \
${currentType} but found ${newType} at row ${rowI}. Consider \
providing an explicit schema.`);
                }
            }
        }
    }
    if (schema === undefined) {
        // No schema: turn the path tree directly into nullable fields.
        const fieldsFromPathTree = (tree) => {
            const fields = [];
            for (const [name, value] of tree.map.entries()) {
                if (value instanceof PathTree) {
                    fields.push(new apache_arrow_1.Field(name, new apache_arrow_1.Struct(fieldsFromPathTree(value)), true));
                }
                else {
                    fields.push(new apache_arrow_1.Field(name, value, true));
                }
            }
            return fields;
        };
        return new apache_arrow_1.Schema(fieldsFromPathTree(pathTree));
    }
    // A schema was supplied: keep only (and order by) the schema fields that
    // actually appear in the data, recursing into structs.
    const takeMatchingFields = (fields, tree) => {
        const outFields = [];
        for (const field of fields) {
            if (!tree.map.has(field.name)) {
                continue;
            }
            const value = tree.get([field.name]);
            if (value instanceof PathTree) {
                const struct = field.type;
                outFields.push(new apache_arrow_1.Field(field.name, new apache_arrow_1.Struct(takeMatchingFields(struct.children, value)), field.nullable));
            }
            else {
                outFields.push(new apache_arrow_1.Field(field.name, value, field.nullable));
            }
        }
        return outFields;
    };
    return new apache_arrow_1.Schema(takeMatchingFields(schema.fields, pathTree));
}
|
|
414
|
+
/**
 * Yield [path, value] pairs for every leaf value in a (possibly nested) row.
 *
 * Plain sub-objects (per isObject) are recursed into, their keys extending
 * the path. Undefined values are skipped entirely so they behave the same as
 * missing fields (important for embedding-function handling).
 */
function* rowPathsAndValues(row, basePath = []) {
    for (const [key, value] of Object.entries(row)) {
        const path = [...basePath, key];
        if (isObject(value)) {
            yield* rowPathsAndValues(value, path);
        }
        else if (value !== undefined) {
            yield [path, value];
        }
    }
}
|
|
428
|
+
/**
 * True for plain data objects that should be recursed into as structs.
 *
 * Arrays, typed arrays, and common non-record built-ins (RegExp, Date, Set,
 * Map, Buffer) are excluded so they are treated as leaf values.
 */
function isObject(value) {
    return (typeof value === "object" &&
        value !== null &&
        !Array.isArray(value) &&
        // Fixed: typed arrays (e.g. a Uint8Array vector) previously passed this
        // check and were recursed into index-by-index as if they were structs.
        !ArrayBuffer.isView(value) &&
        !(value instanceof RegExp) &&
        !(value instanceof Date) &&
        !(value instanceof Set) &&
        !(value instanceof Map) &&
        !(value instanceof Buffer));
}
|
|
438
|
+
/**
 * Walk `path` through a schema's (possibly nested struct) fields.
 *
 * @param schema - the Arrow Schema to search
 * @param path - array of field names, outermost first
 * @returns the Field at the end of the path, or undefined if any segment is
 *          missing or a non-struct field is hit mid-path
 */
function getFieldForPath(schema, path) {
    let node = schema;
    for (const segment of path) {
        let children;
        if (node instanceof apache_arrow_1.Schema) {
            children = node.fields;
        }
        else if (node instanceof apache_arrow_1.Field && apache_arrow_1.DataType.isStruct(node.type)) {
            children = node.type.children;
        }
        else {
            return undefined;
        }
        const next = children.find((f) => f.name === segment);
        if (next === undefined) {
            return undefined;
        }
        node = next;
    }
    return node instanceof apache_arrow_1.Field ? node : undefined;
}
|
|
467
|
+
/**
 * Try to infer which Arrow type to use for a given value.
 *
 * May return undefined if the type cannot be inferred.
 */
function inferType(value, path, opts) {
    if (typeof value === "bigint") {
        return new apache_arrow_1.Int64();
    }
    if (typeof value === "number") {
        // Even if it's an integer, it's safer to assume Float64. Users can
        // always provide an explicit schema or use BigInt if they mean integer.
        return new apache_arrow_1.Float64();
    }
    if (typeof value === "string") {
        if (opts.dictionaryEncodeStrings) {
            return new apache_arrow_1.Dictionary(new apache_arrow_1.Utf8(), new apache_arrow_1.Int32());
        }
        return new apache_arrow_1.Utf8();
    }
    if (typeof value === "boolean") {
        return new apache_arrow_1.Bool();
    }
    if (value instanceof Buffer) {
        return new apache_arrow_1.Binary();
    }
    if (value instanceof Uint8Array) {
        // Fixed: this case was previously nested inside the Array.isArray
        // branch below, where it could never execute (Array.isArray returns
        // false for typed arrays). For integer vectors we default to Uint8,
        // matching the Python implementation.
        if (value.length > 0 && nameSuggestsVectorColumn(path[path.length - 1])) {
            const child = new apache_arrow_1.Field("item", new apache_arrow_1.Uint8(), true);
            return new apache_arrow_1.FixedSizeList(value.length, child);
        }
        return undefined;
    }
    if (Array.isArray(value)) {
        if (value.length === 0) {
            return undefined; // Without any values we can't infer the type
        }
        // Explicitly configured vector columns become fixed size lists.
        if (path.length === 1 && Object.hasOwn(opts.vectorColumns, path[0])) {
            const floatType = (0, sanitize_1.sanitizeType)(opts.vectorColumns[path[0]].type);
            return new apache_arrow_1.FixedSizeList(value.length, new apache_arrow_1.Field("item", floatType, true));
        }
        const valueType = inferType(value[0], path, opts);
        if (valueType === undefined) {
            return undefined;
        }
        // Try to automatically detect embedding columns by name.
        if (nameSuggestsVectorColumn(path[path.length - 1])) {
            // For float vectors, we default to Float32
            const child = new apache_arrow_1.Field("item", new apache_arrow_1.Float32(), true);
            return new apache_arrow_1.FixedSizeList(value.length, child);
        }
        const child = new apache_arrow_1.Field("item", valueType, true);
        return new apache_arrow_1.List(child);
    }
    // TODO: timestamp
    return undefined;
}
|
|
531
|
+
/**
 * A trie keyed by field-name paths. Interior nodes are PathTrees; leaves
 * hold arbitrary values (here: Arrow DataTypes).
 */
class PathTree {
    // Map of path segment -> PathTree (interior) or leaf value.
    map;
    /**
     * @param entries - optional iterable of [path, value] pairs to insert
     */
    constructor(entries) {
        this.map = new Map();
        if (entries !== undefined) {
            for (const [path, value] of entries) {
                this.set(path, value);
            }
        }
    }
    /** True if every segment of `path` exists (interior node or leaf). */
    has(path) {
        let node = this;
        for (const segment of path) {
            if (!(node instanceof PathTree) || !node.map.has(segment)) {
                return false;
            }
            node = node.map.get(segment);
        }
        return true;
    }
    /** The subtree or leaf value at `path`, or undefined if absent. */
    get(path) {
        let node = this;
        for (const segment of path) {
            if (!(node instanceof PathTree) || !node.map.has(segment)) {
                return undefined;
            }
            node = node.map.get(segment);
        }
        return node;
    }
    /** Set the leaf value at `path`, creating interior nodes as needed. */
    set(path, value) {
        let node = this;
        for (const segment of path.slice(0, -1)) {
            if (!node.map.has(segment)) {
                node.map.set(segment, new PathTree());
            }
            node = node.map.get(segment);
        }
        node.map.set(path[path.length - 1], value);
    }
}
|
|
572
|
+
/**
 * Build an Arrow Vector for `field` by pulling values at `path` out of each
 * row. Struct fields recurse into their children; missing keys, null parents
 * and broken paths all become null.
 */
function transposeData(data, field, path = []) {
    const fullPath = [...path, field.name];
    if (field.type instanceof apache_arrow_1.Struct) {
        const children = field.type.children.map((child) => transposeData(data, child, fullPath));
        const structData = (0, apache_arrow_1.makeData)({
            type: field.type,
            children,
        });
        return (0, apache_arrow_1.makeVector)(structData);
    }
    const extract = (datum) => {
        let node = datum;
        for (const key of fullPath) {
            if (node == null) {
                return null;
            }
            if (!isObject(node) || !(Object.hasOwn(node, key) || key in node)) {
                return null;
            }
            node = node[key];
        }
        return node;
    };
    return makeVector(data.map(extract), field.type, undefined, field.nullable);
}
|
|
606
|
+
/**
 * Create an empty Arrow table with the provided schema.
 *
 * @param schema - the table schema (required, since there is no data to infer from)
 * @param metadata - optional extra schema metadata entries
 */
function makeEmptyTable(schema, metadata) {
    return makeArrowTable([], { schema }, metadata);
}
|
|
612
|
+
/**
 * Helper function to convert Array<Array<any>> to a variable sized list array
 */
// @ts-expect-error (Vector<unknown> is not assignable to Vector<any>)
function makeListVector(lists) {
    if (lists.length === 0 || lists[0].length === 0) {
        throw Error("Cannot infer list vector from empty array or empty list");
    }
    // Infer the item type from the first sub-list.
    // biome-ignore lint/suspicious/noExplicitAny: skip
    let itemType;
    try {
        itemType = makeVector(lists[0]).type;
    }
    catch (error) {
        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
        throw Error(`Cannot infer list vector. Cannot infer inner type: ${error}`);
    }
    const builder = (0, apache_arrow_1.makeBuilder)({
        type: new apache_arrow_1.List(new apache_arrow_1.Field("item", itemType, true)),
    });
    for (const list of lists) {
        builder.append(list);
    }
    return builder.finish().toVector();
}
|
|
639
|
+
/** Helper function to convert an Array of JS values to an Arrow Vector */
function makeVector(values, type, stringAsDictionary, nullable) {
    if (type !== undefined) {
        if (nullable) {
            // Convert undefined values to null for nullable fields.
            values = values.map((v) => (v === undefined ? null : v));
        }
        // workaround for: https://github.com/apache/arrow-js/issues/68
        // An all-null Bool column has to be assembled manually.
        if (apache_arrow_1.DataType.isBool(type) &&
            !values.some((v) => v !== null && v !== undefined)) {
            const nullBitmap = new Uint8Array(Math.ceil(values.length / 8));
            const data = (0, apache_arrow_1.makeData)({
                type: type,
                length: values.length,
                nullCount: values.length,
                nullBitmap,
            });
            return (0, apache_arrow_1.makeVector)(data);
        }
        if (type instanceof apache_arrow_1.Int) {
            if (apache_arrow_1.DataType.isInt(type) && type.bitWidth === 64) {
                // wrap in BigInt to avoid bug: https://github.com/apache/arrow/issues/40051
                values = values.map((v) => (typeof v === "number" ? BigInt(v) : v));
            }
            else {
                // Similarly, bigint isn't supported for 16 or 32-bit ints.
                values = values.map((v) => (typeof v === "bigint" ? Number(v) : v));
            }
        }
        // No need for inference, let Arrow create it
        return vectorFromArray(values, type);
    }
    if (values.length === 0) {
        throw Error("makeVector requires at least one value or the type must be specfied");
    }
    const sampleValue = values.find((val) => val !== null && val !== undefined);
    if (sampleValue === undefined) {
        throw Error("makeVector cannot infer the type if all values are null or undefined");
    }
    if (Array.isArray(sampleValue)) {
        // Default Arrow inference doesn't handle list types
        return makeListVector(values);
    }
    if (Buffer.isBuffer(sampleValue)) {
        // Default Arrow inference doesn't handle Buffer
        return vectorFromArray(values, new apache_arrow_1.Binary());
    }
    if (!(stringAsDictionary ?? false) &&
        (typeof sampleValue === "string" || sampleValue instanceof String)) {
        // If the type is string then don't use Arrow's default inference unless
        // dictionaries are requested, because it always dictionary-encodes strings.
        return vectorFromArray(values, new apache_arrow_1.Utf8());
    }
    // Convert a JS array of values to an arrow vector
    return vectorFromArray(values);
}
|
|
719
|
+
/**
 * Helper function to apply embeddings from metadata to an input table.
 *
 * Parses the embedding functions stored in `schema.metadata`, computes the
 * embeddings for each configured source column, writes them into the
 * destination (vector) column, fills any schema columns missing from the
 * table with nulls, and returns a table whose column order matches `schema`.
 *
 * @param table - the input Arrow Table (must contain every source column)
 * @param schema - target schema carrying "embedding_functions" metadata
 * @returns a new Arrow Table aligned to `schema` with embeddings applied
 * @throws if a source column is missing, the destination column is missing
 *         from the schema or is not a FixedSizeList, or the embedding
 *         function returns the wrong number of vectors
 */
async function applyEmbeddingsFromMetadata(table, schema) {
    const registry = (0, registry_1.getRegistry)();
    const functions = await registry.parseFunctions(schema.metadata);
    // Snapshot the table's columns by name so we can overwrite/add entries.
    const columns = Object.fromEntries(table.schema.fields.map((field) => [
        field.name,
        table.getChild(field.name),
    ]));
    for (const functionEntry of functions.values()) {
        const sourceColumn = columns[functionEntry.sourceColumn];
        const destColumn = functionEntry.vectorColumn ?? "vector";
        if (sourceColumn === undefined) {
            throw new Error(`Cannot apply embedding function because the source column '${functionEntry.sourceColumn}' was not present in the data`);
        }
        // Check if destination column exists and handle accordingly
        if (columns[destColumn] !== undefined) {
            const existingColumn = columns[destColumn];
            // If the column exists but is all null, we can fill it with embeddings
            if (existingColumn.nullCount !== existingColumn.length) {
                // Column has non-null values, skip embedding application
                continue;
            }
        }
        if (table.batches.length > 1) {
            throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
        }
        const values = sourceColumn.toArray();
        const vectors = await functionEntry.function.computeSourceEmbeddings(values);
        if (vectors.length !== values.length) {
            throw new Error("Embedding function did not return an embedding for each input element");
        }
        // Previously this did `schema.fields.find(...).type`, which crashed
        // with an unhelpful TypeError when the destination column was not in
        // the schema. Surface a clear error instead.
        const destField = schema.fields.find((f) => f.name === destColumn);
        if (destField === undefined) {
            throw new Error(`Cannot apply embedding function because the destination column '${destColumn}' was not present in the schema`);
        }
        const dtype = destField.type;
        let destType;
        if (isFixedSizeList(dtype)) {
            destType = (0, sanitize_1.sanitizeType)(dtype);
        }
        else {
            throw new Error("Expected FixedSizeList as datatype for vector field, instead got: " +
                dtype);
        }
        const vector = makeVector(vectors, destType);
        columns[destColumn] = vector;
    }
    // Add any missing columns from the schema as null vectors
    for (const field of schema.fields) {
        if (!(field.name in columns)) {
            const nullValues = new Array(table.numRows).fill(null);
            columns[field.name] = makeVector(nullValues, field.type, undefined, field.nullable);
        }
    }
    const newTable = new apache_arrow_1.Table(columns);
    // Re-order columns to match the schema's field order.
    return alignTable(newTable, schema);
}
|
|
772
|
+
/**
 * Helper function to apply embeddings to an input table
 *
 * Resolution order:
 * 1. If `schema` carries "embedding_functions" metadata, delegate to
 *    `applyEmbeddingsFromMetadata`.
 * 2. Otherwise, if no `embeddings` config was passed, return the table as-is.
 * 3. Otherwise compute embeddings from `embeddings.sourceColumn` and store
 *    them in `embeddings.vectorColumn` (default "vector"), then align the
 *    result to `schema` (when given) and attach the embedding metadata.
 *
 * @param table - the input Arrow Table
 * @param embeddings - optional embedding config; expected to expose
 *   `sourceColumn`, optional `vectorColumn`, and a `function` with
 *   `embeddingDataType()`, `ndims()` and `computeSourceEmbeddings()`
 * @param schema - optional target schema (sanitized before use)
 */
async function applyEmbeddings(table, embeddings, schema) {
    // Sanitize first: the caller's schema may come from a different arrow build.
    if (schema !== undefined && schema !== null) {
        schema = (0, sanitize_1.sanitizeSchema)(schema);
    }
    if (schema?.metadata.has("embedding_functions")) {
        // Metadata-driven path: the schema itself describes the embeddings.
        return applyEmbeddingsFromMetadata(table, schema);
    }
    else if (embeddings == null || embeddings === undefined) {
        // Nothing to apply.
        return table;
    }
    // Merge the schema's metadata with the embedding function's table metadata
    // so the output schema records which embedding was used.
    let schemaMetadata = schema?.metadata || new Map();
    if (!(embeddings == null || embeddings === undefined)) {
        const registry = (0, registry_1.getRegistry)();
        const embeddingMetadata = registry.getTableMetadata([embeddings]);
        schemaMetadata = new Map([...schemaMetadata, ...embeddingMetadata]);
    }
    // Convert from ArrowTable to Record<String, Vector>
    const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
        const name = table.schema.fields[idx].name;
        // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
        const vec = table.getChildAt(idx);
        return [name, vec];
    });
    const newColumns = Object.fromEntries(colEntries);
    const sourceColumn = newColumns[embeddings.sourceColumn];
    const destColumn = embeddings.vectorColumn ?? "vector";
    // Element type of the output FixedSizeList; defaults to Float32.
    const innerDestType = embeddings.function.embeddingDataType() ?? new apache_arrow_1.Float32();
    if (sourceColumn === undefined) {
        throw new Error(`Cannot apply embedding function because the source column '${embeddings.sourceColumn}' was not present in the data`);
    }
    if (table.numRows === 0) {
        // Empty-table path: we cannot infer the vector width from data, so we
        // need it from the function's ndims() or from the schema.
        if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
            // We have an empty table and it already has the embedding column so no work needs to be done
            // Note: we don't return an error like we did below because this is a common occurrence. For example,
            // if we call convertToTable with 0 records and a schema that includes the embedding
            return table;
        }
        const dimensions = embeddings.function.ndims();
        if (dimensions !== undefined) {
            const destType = newVectorType(dimensions, innerDestType);
            newColumns[destColumn] = makeVector([], destType);
        }
        else if (schema != null) {
            const destField = schema.fields.find((f) => f.name === destColumn);
            if (destField != null) {
                newColumns[destColumn] = makeVector([], destField.type, undefined, destField.nullable);
            }
            else {
                throw new Error(`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`);
            }
        }
        else {
            throw new Error("Attempt to apply embeddings to an empty table when the embeddings function does not specify `embeddingDimension`");
        }
    }
    else {
        // Check if destination column exists and handle accordingly
        if (Object.prototype.hasOwnProperty.call(newColumns, destColumn)) {
            const existingColumn = newColumns[destColumn];
            // If the column exists but is all null, we can fill it with embeddings
            if (existingColumn.nullCount !== existingColumn.length) {
                // Column has non-null values, skip embedding application and return table as-is
                let newTable = new apache_arrow_1.Table(newColumns);
                if (schema != null) {
                    newTable = alignTable(newTable, schema);
                }
                // Still attach the merged metadata to the returned schema.
                return new apache_arrow_1.Table(new apache_arrow_1.Schema(newTable.schema.fields, schemaMetadata), newTable.batches);
            }
        }
        if (table.batches.length > 1) {
            throw new Error("Internal error: `makeArrowTable` unexpectedly created a table with more than one batch");
        }
        const values = sourceColumn.toArray();
        const vectors = await embeddings.function.computeSourceEmbeddings(values);
        if (vectors.length !== values.length) {
            throw new Error("Embedding function did not return an embedding for each input element");
        }
        // Infer the vector width from the first returned embedding.
        const destType = newVectorType(vectors[0].length, innerDestType);
        newColumns[destColumn] = makeVector(vectors, destType);
    }
    let newTable = new apache_arrow_1.Table(newColumns);
    if (schema != null) {
        if (schema.fields.find((f) => f.name === destColumn) === undefined) {
            throw new Error(`When using embedding functions and specifying a schema the schema should include the embedding column but the column ${destColumn} was missing`);
        }
        newTable = alignTable(newTable, schema);
    }
    // Rebuild the table with the merged metadata attached to its schema.
    newTable = new apache_arrow_1.Table(new apache_arrow_1.Schema(newTable.schema.fields, schemaMetadata), newTable.batches);
    return newTable;
}
|
|
863
|
+
/**
 * Convert an Array of records into an Arrow Table, optionally applying an
 * embeddings function to it.
 *
 * This function calls `makeArrowTable` first to create the Arrow Table.
 * Any provided `makeTableOptions` (e.g. a schema) will be passed on to
 * that call.
 *
 * The embedding function will be passed a column of values (based on the
 * `sourceColumn` of the embedding function) and expects to receive back
 * number[][] which will be converted into a fixed size list column. By
 * default this will be a fixed size list of Float32 but that can be
 * customized by the `embeddingDataType` property of the embedding function.
 *
 * If a schema is provided in `makeTableOptions` then it should include the
 * embedding columns. If no schema is provided then embedding columns will
 * be placed at the end of the table, after all of the input columns.
 */
async function convertToTable(data, embeddings, makeTableOptions) {
    const schema = makeTableOptions?.schema;
    // When the schema declares embedding functions in its metadata, make sure
    // every nested field exists in the records before building the table.
    const needsPreprocessing = Boolean(schema && schema.metadata?.has("embedding_functions"));
    const processedData = needsPreprocessing
        ? ensureNestedFieldsExist(data, schema)
        : data;
    const table = makeArrowTable(processedData, makeTableOptions);
    return await applyEmbeddings(table, embeddings, schema);
}
|
|
892
|
+
/** Creates the Arrow Type for a Vector column with dimension `dim` */
function newVectorType(dim, innerType) {
    // Lance always stores list elements as nullable, so the child field is
    // created nullable here; otherwise we often hit schema mismatches against
    // stored data whose schema has nullable elements.
    const elementField = new apache_arrow_1.Field("item", (0, sanitize_1.sanitizeType)(innerType), true);
    return new apache_arrow_1.FixedSizeList(dim, elementField);
}
|
|
899
|
+
/**
 * Serialize an Array of records into a buffer using the Arrow IPC File serialization
 *
 * This function will call `convertToTable` and pass on `embeddings` and `schema`
 *
 * `schema` is required if data is empty
 */
async function fromRecordsToBuffer(data, embeddings, schema) {
    // Normalize a caller-supplied schema (may originate from another arrow build).
    const cleanSchema = schema == null ? schema : (0, sanitize_1.sanitizeSchema)(schema);
    const table = await convertToTable(data, embeddings, { schema: cleanSchema });
    const writer = apache_arrow_1.RecordBatchFileWriter.writeAll(table);
    const bytes = await writer.toUint8Array();
    return Buffer.from(bytes);
}
|
|
914
|
+
/**
 * Serialize an Array of records into a buffer using the Arrow IPC Stream serialization
 *
 * This function will call `convertToTable` and pass on `embeddings` and `schema`
 *
 * `schema` is required if data is empty
 */
async function fromRecordsToStreamBuffer(data, embeddings, schema) {
    // Normalize a caller-supplied schema (may originate from another arrow build).
    const cleanSchema = schema == null ? schema : (0, sanitize_1.sanitizeSchema)(schema);
    const table = await convertToTable(data, embeddings, { schema: cleanSchema });
    const writer = apache_arrow_1.RecordBatchStreamWriter.writeAll(table);
    const bytes = await writer.toUint8Array();
    return Buffer.from(bytes);
}
|
|
929
|
+
/**
 * Serialize an Arrow Table into a buffer using the Arrow IPC File serialization
 *
 * This function will apply `embeddings` to the table in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the table is empty
 */
async function fromTableToBuffer(table, embeddings, schema) {
    // Normalize a caller-supplied schema (may originate from another arrow build).
    const cleanSchema = schema == null ? schema : (0, sanitize_1.sanitizeSchema)(schema);
    const embeddedTable = await applyEmbeddings(table, embeddings, cleanSchema);
    const writer = apache_arrow_1.RecordBatchFileWriter.writeAll(embeddedTable);
    return Buffer.from(await writer.toUint8Array());
}
|
|
945
|
+
/**
 * Serialize data (either an Arrow Table or an Array of records) into a buffer
 * using the Arrow IPC File serialization
 *
 * This function will apply `embeddings` in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the data is empty
 */
async function fromDataToBuffer(data, embeddings, schema) {
    // Normalize a caller-supplied schema (may originate from another arrow build).
    const cleanSchema = schema == null ? schema : (0, sanitize_1.sanitizeSchema)(schema);
    if (!isArrowTable(data)) {
        // Record path: convertToTable already applies schema and embeddings,
        // so no schema/embeddings are forwarded to fromTableToBuffer.
        const table = await convertToTable(data, embeddings, { schema: cleanSchema });
        return fromTableToBuffer(table);
    }
    const table = (0, sanitize_1.sanitizeTable)(data);
    // If we have a schema with embedding functions, we need to ensure all
    // columns exist before applying embeddings, since
    // applyEmbeddingsFromMetadata expects every column to be present.
    const hasEmbeddingMetadata = Boolean(cleanSchema && cleanSchema.metadata?.has("embedding_functions"));
    const alignedTable = hasEmbeddingMetadata
        ? alignTableToSchema(table, cleanSchema)
        : table;
    return fromTableToBuffer(alignedTable, embeddings, cleanSchema);
}
|
|
975
|
+
/**
 * Read a single record batch from a buffer.
 *
 * Returns null if the buffer does not contain a record batch
 */
async function fromBufferToRecordBatch(data) {
    // readAll yields one reader per IPC message; take the first reader,
    // then the first batch from it.
    const readers = apache_arrow_1.RecordBatchFileReader.readAll(Buffer.from(data));
    const firstReader = await readers.next().value;
    const firstBatch = firstReader?.next().value;
    return firstBatch || null;
}
|
|
986
|
+
/**
 * Create a buffer containing a single record batch
 */
async function fromRecordBatchToBuffer(batch) {
    const writer = new apache_arrow_1.RecordBatchFileWriter();
    writer.writeAll([batch]);
    const bytes = await writer.toUint8Array();
    return Buffer.from(bytes);
}
|
|
993
|
+
/**
 * Serialize an Arrow Table into a buffer using the Arrow IPC Stream serialization
 *
 * This function will apply `embeddings` to the table in a manner similar to
 * `convertToTable`.
 *
 * `schema` is required if the table is empty
 */
async function fromTableToStreamBuffer(table, embeddings, schema) {
    // Sanitize the caller-supplied schema, matching every sibling serializer
    // (fromTableToBuffer, fromRecordsToStreamBuffer, ...). This was previously
    // skipped here, letting schemas from a different arrow build through raw.
    if (schema !== undefined && schema !== null) {
        schema = (0, sanitize_1.sanitizeSchema)(schema);
    }
    const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema);
    const writer = apache_arrow_1.RecordBatchStreamWriter.writeAll(tableWithEmbeddings);
    return Buffer.from(await writer.toUint8Array());
}
|
|
1006
|
+
/**
 * Reorder the columns in `batch` so that they agree with the field order in `schema`
 *
 * @param batch - the RecordBatch whose children will be reordered
 * @param schema - target schema; every field must exist in `batch`
 * @throws if a schema field is not present in the batch
 */
function alignBatch(batch, schema) {
    const alignedChildren = [];
    for (const field of schema.fields) {
        const indexInBatch = batch.schema.fields?.findIndex((f) => f.name === field.name);
        // `?.` can yield undefined (no fields array); the old `< 0` check let
        // undefined through and silently pushed children[undefined].
        if (indexInBatch === undefined || indexInBatch < 0) {
            throw new Error(`The column ${field.name} was not found in the Arrow Table`);
        }
        alignedChildren.push(batch.data.children[indexInBatch]);
    }
    // Rebuild the batch's struct data with the children in schema order.
    const newData = (0, apache_arrow_1.makeData)({
        type: new apache_arrow_1.Struct(schema.fields),
        length: batch.numRows,
        nullCount: batch.nullCount,
        children: alignedChildren,
    });
    return new apache_arrow_1.RecordBatch(schema, newData);
}
|
|
1026
|
+
/**
 * Reorder the columns in `table` so that they agree with the field order in `schema`
 */
function alignTable(table, schema) {
    const reorderedBatches = [];
    for (const batch of table.batches) {
        reorderedBatches.push(alignBatch(batch, schema));
    }
    return new apache_arrow_1.Table(schema, reorderedBatches);
}
|
|
1033
|
+
/**
 * Create an empty table with the given schema
 */
function createEmptyTable(schema) {
    const cleanSchema = (0, sanitize_1.sanitizeSchema)(schema);
    return new apache_arrow_1.Table(cleanSchema);
}
|
|
1039
|
+
/**
 * Validate a schema's vector (FixedSizeList) columns against the provided data
 * and embedding configuration.
 *
 * For each FixedSizeList field we check whether the data contains the field.
 * If it does not, the field must be either nullable (and not embedding-backed)
 * or covered by an embedding function (from schema metadata or the passed
 * `embeddings` parameter); otherwise it is collected as a missing embedding
 * field and an error is thrown when no embedding function was provided.
 *
 * @param schema - the schema to validate
 * @param data - the records being inserted (only the first record is probed)
 * @param embeddings - optional embedding config with a `vectorColumn` name
 * @returns a new Schema containing the accepted fields and original metadata
 * @throws if vector columns are missing from the data with no embedding function
 */
function validateSchemaEmbeddings(schema, data, embeddings) {
    const fields = [];
    const missingEmbeddingFields = [];
    for (let field of schema.fields) {
        if (isFixedSizeList(field.type)) {
            field = (0, sanitize_1.sanitizeField)(field);
            if (data.length !== 0 && data?.[0]?.[field.name] === undefined) {
                // Check if there's an embedding function registered for this field
                let hasEmbeddingFunction = false;
                // Check schema metadata for embedding functions.
                // NOTE: renamed from `embeddings` — the old local shadowed the
                // `embeddings` parameter used a few lines below.
                if (schema.metadata.has("embedding_functions")) {
                    const embeddingFunctions = JSON.parse(schema.metadata.get("embedding_functions"));
                    // biome-ignore lint/suspicious/noExplicitAny: we don't know the type of `f`
                    if (embeddingFunctions.find((f) => f["vectorColumn"] === field.name)) {
                        hasEmbeddingFunction = true;
                    }
                }
                // Check passed embedding function parameter
                if (embeddings && embeddings.vectorColumn === field.name) {
                    hasEmbeddingFunction = true;
                }
                // If the field is nullable AND there's no embedding function, allow undefined/omitted values
                if (field.nullable && !hasEmbeddingFunction) {
                    fields.push(field);
                }
                else {
                    // Either not nullable OR has embedding function - require explicit values
                    if (hasEmbeddingFunction) {
                        // Don't add to missingEmbeddingFields since this is expected to be filled by embedding function
                        fields.push(field);
                    }
                    else {
                        missingEmbeddingFields.push(field);
                    }
                }
            }
            else {
                fields.push(field);
            }
        }
        else {
            fields.push(field);
        }
    }
    if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
        throw new Error(`Table has embeddings: "${missingEmbeddingFields
            .map((f) => f.name)
            .join(",")}", but no embedding function was provided`);
    }
    return new apache_arrow_1.Schema(fields, schema.metadata);
}
|
|
1095
|
+
/**
 * Ensures that all nested fields defined in the schema exist in the data,
 * filling missing fields with null values.
 */
function ensureNestedFieldsExist(data, schema) {
    const fillRow = (row) => {
        const filled = {};
        for (const field of schema.fields) {
            const { name, type } = field;
            if (!(name in row)) {
                // Field absent from the record — substitute an explicit null.
                filled[name] = null;
                continue;
            }
            const value = row[name];
            if (type.constructor.name === "Struct" && value !== null && value !== undefined) {
                // Recurse into nested structs so their children are also filled.
                filled[name] = ensureStructFieldsExist(value, type);
            }
            else {
                // Plain field, or a struct field whose value is null/undefined.
                filled[name] = value;
            }
        }
        return filled;
    };
    return data.map(fillRow);
}
|
|
1124
|
+
/**
 * Recursively ensures that all fields in a struct type exist in the data,
 * filling missing fields with null values.
 */
function ensureStructFieldsExist(data, structType) {
    const filled = {};
    for (const child of structType.children) {
        const { name, type } = child;
        if (!(name in data)) {
            // Field is missing — set to null.
            filled[name] = null;
            continue;
        }
        const value = data[name];
        const isNestedStruct = type.constructor.name === "Struct" &&
            value !== null &&
            value !== undefined;
        // Recurse for nested structs; copy everything else through unchanged.
        filled[name] = isNestedStruct ? ensureStructFieldsExist(value, type) : value;
    }
    return filled;
}
|
|
1150
|
+
// Matches format of https://github.com/lancedb/lance/blob/main/rust/lance/src/arrow/json.rs
/**
 * Convert an Arrow DataType into the JSON type descriptor used by Lance.
 *
 * The returned object always has a `type` string; list/struct types also
 * carry `fields` (and FixedSizeList a `length`). The string spellings must
 * match the Rust side exactly — do not reformat them.
 *
 * @param dataType - an Arrow DataType instance
 * @returns a JSON-serializable type descriptor
 * @throws Error for any type not covered by the switch below
 */
function dataTypeToJson(dataType) {
    switch (dataType.typeId) {
        // For primitives, matches https://github.com/lancedb/lance/blob/e12bb9eff2a52f753668d4b62c52e4d72b10d294/rust/lance-core/src/datatypes.rs#L185
        case apache_arrow_1.Type.Null:
            return { type: "null" };
        case apache_arrow_1.Type.Bool:
            return { type: "bool" };
        case apache_arrow_1.Type.Int8:
            return { type: "int8" };
        case apache_arrow_1.Type.Int16:
            return { type: "int16" };
        case apache_arrow_1.Type.Int32:
            return { type: "int32" };
        case apache_arrow_1.Type.Int64:
            return { type: "int64" };
        case apache_arrow_1.Type.Uint8:
            return { type: "uint8" };
        case apache_arrow_1.Type.Uint16:
            return { type: "uint16" };
        case apache_arrow_1.Type.Uint32:
            return { type: "uint32" };
        case apache_arrow_1.Type.Uint64:
            return { type: "uint64" };
        // Generic Int (arrow sometimes reports the umbrella typeId rather than
        // a width-specific one): derive the name from width and signedness.
        case apache_arrow_1.Type.Int: {
            const bitWidth = dataType.bitWidth;
            const signed = dataType.isSigned;
            const prefix = signed ? "" : "u";
            return { type: `${prefix}int${bitWidth}` };
        }
        // Generic Float umbrella: dispatch on precision.
        case apache_arrow_1.Type.Float: {
            switch (dataType.precision) {
                case apache_arrow_1.Precision.HALF:
                    return { type: "halffloat" };
                case apache_arrow_1.Precision.SINGLE:
                    return { type: "float" };
                case apache_arrow_1.Precision.DOUBLE:
                    return { type: "double" };
            }
            throw Error("Unsupported float precision");
        }
        case apache_arrow_1.Type.Float16:
            return { type: "halffloat" };
        case apache_arrow_1.Type.Float32:
            return { type: "float" };
        case apache_arrow_1.Type.Float64:
            return { type: "double" };
        case apache_arrow_1.Type.Utf8:
            return { type: "string" };
        case apache_arrow_1.Type.Binary:
            return { type: "binary" };
        case apache_arrow_1.Type.LargeUtf8:
            return { type: "large_string" };
        case apache_arrow_1.Type.LargeBinary:
            return { type: "large_binary" };
        // Nested types recurse through fieldToJson for their children.
        case apache_arrow_1.Type.List:
            return {
                type: "list",
                fields: [fieldToJson(dataType.children[0])],
            };
        case apache_arrow_1.Type.FixedSizeList: {
            const fixedSizeList = dataType;
            return {
                type: "fixed_size_list",
                fields: [fieldToJson(fixedSizeList.children[0])],
                length: fixedSizeList.listSize,
            };
        }
        case apache_arrow_1.Type.Struct:
            return {
                type: "struct",
                fields: dataType.children.map(fieldToJson),
            };
        // Parameterized types encode their parameters into the type string,
        // colon-separated, matching the Rust-side parser.
        case apache_arrow_1.Type.Date: {
            const unit = dataType.unit;
            return {
                type: unit === apache_arrow_1.DateUnit.DAY ? "date32:day" : "date64:ms",
            };
        }
        case apache_arrow_1.Type.Timestamp: {
            const timestamp = dataType;
            // "-" is the sentinel for "no timezone".
            const timezone = timestamp.timezone || "-";
            return {
                type: `timestamp:${timestamp.unit}:${timezone}`,
            };
        }
        case apache_arrow_1.Type.Decimal: {
            const decimal = dataType;
            return {
                type: `decimal:${decimal.bitWidth}:${decimal.precision}:${decimal.scale}`,
            };
        }
        case apache_arrow_1.Type.Duration: {
            const duration = dataType;
            return { type: `duration:${duration.unit}` };
        }
        case apache_arrow_1.Type.FixedSizeBinary: {
            const byteWidth = dataType.byteWidth;
            return { type: `fixed_size_binary:${byteWidth}` };
        }
        case apache_arrow_1.Type.Dictionary: {
            const dict = dataType;
            const indexType = dataTypeToJson(dict.indices);
            const valueType = dataTypeToJson(dict.valueType);
            // Trailing "false" is the sorted flag — always false here.
            return {
                type: `dict:${valueType.type}:${indexType.type}:false`,
            };
        }
    }
    throw new Error("Unsupported data type");
}
|
|
1261
|
+
/**
 * Convert an Arrow Field into the JSON field descriptor used by Lance.
 * Delegates the type conversion to `dataTypeToJson`.
 */
function fieldToJson(field) {
    const { name, nullable, metadata } = field;
    return {
        name,
        type: dataTypeToJson(field.type),
        nullable,
        metadata,
    };
}
|
|
1269
|
+
/**
 * Produce a table whose columns match `targetSchema`, reusing columns that
 * already exist in `table` and filling any missing ones with null vectors.
 */
function alignTableToSchema(table, targetSchema) {
    // Index the table's existing columns by name.
    const existingByName = new Map(table.schema.fields.map((f) => [f.name, table.getChild(f.name)]));
    // Build the output column map in target-schema order.
    const alignedColumns = {};
    for (const field of targetSchema.fields) {
        alignedColumns[field.name] = existingByName.has(field.name)
            ? existingByName.get(field.name)
            : createNullVector(field, table.numRows);
    }
    return new apache_arrow_1.Table(targetSchema, alignedColumns);
}
|
|
1290
|
+
/**
 * Create a Vector of `numRows` nulls matching `field`'s type.
 * Struct fields are built recursively from null child vectors.
 */
function createNullVector(field, numRows) {
    if (field.type.constructor.name !== "Struct") {
        // Non-struct: a zeroed validity bitmap marks every slot as null.
        const nullBitmap = new Uint8Array(Math.ceil(numRows / 8));
        const nullData = (0, apache_arrow_1.makeData)({
            type: field.type,
            length: numRows,
            nullCount: numRows,
            nullBitmap,
        });
        return (0, apache_arrow_1.makeVector)(nullData);
    }
    // Struct: recursively build a null vector per child, then assemble their
    // underlying Data objects into a single struct Data.
    const structType = field.type;
    const childData = structType.children.map((childField) => createNullVector(childField, numRows).data[0]);
    const structData = (0, apache_arrow_1.makeData)({
        type: structType,
        length: numRows,
        nullCount: 0,
        children: childData,
    });
    return (0, apache_arrow_1.makeVector)(structData);
}
|