@lancedb/lancedb 0.15.1-beta.2 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/arrow.d.ts +33 -32
- package/dist/arrow.js +322 -113
- package/dist/connection.d.ts +13 -8
- package/dist/connection.js +54 -19
- package/dist/embedding/embedding_function.d.ts +3 -3
- package/dist/embedding/embedding_function.js +3 -3
- package/dist/embedding/index.d.ts +1 -1
- package/dist/embedding/registry.d.ts +2 -5
- package/dist/embedding/registry.js +0 -2
- package/dist/index.d.ts +11 -7
- package/dist/index.js +9 -9
- package/dist/indices.d.ts +0 -2
- package/dist/indices.js +0 -2
- package/dist/native.d.ts +4 -13
- package/dist/native.js +1 -2
- package/dist/query.d.ts +37 -4
- package/dist/query.js +37 -4
- package/dist/rerankers/rrf.d.ts +2 -1
- package/dist/rerankers/rrf.js +2 -1
- package/dist/table.d.ts +11 -9
- package/dist/table.js +6 -23
- package/package.json +10 -10
- package/typedoc_post_process.js +22 -17
package/README.md
CHANGED
|
@@ -32,7 +32,7 @@ const results = await table.vectorSearch([0.1, 0.3]).limit(20).toArray();
|
|
|
32
32
|
console.log(results);
|
|
33
33
|
```
|
|
34
34
|
|
|
35
|
-
The [quickstart](
|
|
35
|
+
The [quickstart](https://lancedb.github.io/lancedb/basic/) contains a more complete example.
|
|
36
36
|
|
|
37
37
|
## Development
|
|
38
38
|
|
package/dist/arrow.d.ts
CHANGED
|
@@ -91,8 +91,6 @@ export declare class MakeArrowTableOptions {
|
|
|
91
91
|
* This function converts an array of Record<String, any> (row-major JS objects)
|
|
92
92
|
* to an Arrow Table (a columnar structure)
|
|
93
93
|
*
|
|
94
|
-
* Note that it currently does not support nulls.
|
|
95
|
-
*
|
|
96
94
|
* If a schema is provided then it will be used to determine the resulting array
|
|
97
95
|
* types. Fields will also be reordered to fit the order defined by the schema.
|
|
98
96
|
*
|
|
@@ -100,6 +98,9 @@ export declare class MakeArrowTableOptions {
|
|
|
100
98
|
* will be controlled by the order of properties in the first record. If a type
|
|
101
99
|
* is inferred it will always be nullable.
|
|
102
100
|
*
|
|
101
|
+
* If not all fields are found in the data, then a subset of the schema will be
|
|
102
|
+
* returned.
|
|
103
|
+
*
|
|
103
104
|
* If the input is empty then a schema must be provided to create an empty table.
|
|
104
105
|
*
|
|
105
106
|
* When a schema is not specified then data types will be inferred. The inference
|
|
@@ -107,11 +108,13 @@ export declare class MakeArrowTableOptions {
|
|
|
107
108
|
*
|
|
108
109
|
* - boolean => Bool
|
|
109
110
|
* - number => Float64
|
|
111
|
+
* - bigint => Int64
|
|
110
112
|
* - String => Utf8
|
|
111
113
|
* - Buffer => Binary
|
|
112
114
|
* - Record<String, any> => Struct
|
|
113
115
|
* - Array<any> => List
|
|
114
116
|
* @example
|
|
117
|
+
* ```ts
|
|
115
118
|
* import { fromTableToBuffer, makeArrowTable } from "../arrow";
|
|
116
119
|
* import { Field, FixedSizeList, Float16, Float32, Float64, Int32, Schema } from "apache-arrow";
|
|
117
120
|
*
|
|
@@ -133,43 +136,41 @@ export declare class MakeArrowTableOptions {
|
|
|
133
136
|
* names and data types.
|
|
134
137
|
*
|
|
135
138
|
* ```ts
|
|
136
|
-
*
|
|
137
139
|
* const schema = new Schema([
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
140
|
+
* new Field("a", new Float64()),
|
|
141
|
+
* new Field("b", new Float64()),
|
|
142
|
+
* new Field(
|
|
143
|
+
* "vector",
|
|
144
|
+
* new FixedSizeList(3, new Field("item", new Float32()))
|
|
145
|
+
* ),
|
|
146
|
+
* ]);
|
|
147
|
+
* const table = makeArrowTable([
|
|
148
|
+
* { a: 1, b: 2, vector: [1, 2, 3] },
|
|
149
|
+
* { a: 4, b: 5, vector: [4, 5, 6] },
|
|
150
|
+
* { a: 7, b: 8, vector: [7, 8, 9] },
|
|
151
|
+
* ]);
|
|
152
|
+
* assert.deepEqual(table.schema, schema);
|
|
151
153
|
* ```
|
|
152
154
|
*
|
|
153
155
|
* You can specify the vector column types and names using the options as well
|
|
154
156
|
*
|
|
155
|
-
* ```
|
|
156
|
-
*
|
|
157
|
+
* ```ts
|
|
157
158
|
* const schema = new Schema([
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
159
|
+
* new Field('a', new Float64()),
|
|
160
|
+
* new Field('b', new Float64()),
|
|
161
|
+
* new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
|
|
162
|
+
* new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
|
|
163
|
+
* ]);
|
|
163
164
|
* const table = makeArrowTable([
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
165
|
+
* { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
|
|
166
|
+
* { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
|
|
167
|
+
* { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
|
|
168
|
+
* ], {
|
|
169
|
+
* vectorColumns: {
|
|
170
|
+
* vec1: { type: new Float16() },
|
|
171
|
+
* vec2: { type: new Float16() }
|
|
172
|
+
* }
|
|
173
|
+
* });
|
|
173
174
|
* assert.deepEqual(table.schema, schema)
|
|
174
175
|
* ```
|
|
175
176
|
*/
|
package/dist/arrow.js
CHANGED
|
@@ -186,8 +186,6 @@ exports.MakeArrowTableOptions = MakeArrowTableOptions;
|
|
|
186
186
|
* This function converts an array of Record<String, any> (row-major JS objects)
|
|
187
187
|
* to an Arrow Table (a columnar structure)
|
|
188
188
|
*
|
|
189
|
-
* Note that it currently does not support nulls.
|
|
190
|
-
*
|
|
191
189
|
* If a schema is provided then it will be used to determine the resulting array
|
|
192
190
|
* types. Fields will also be reordered to fit the order defined by the schema.
|
|
193
191
|
*
|
|
@@ -195,6 +193,9 @@ exports.MakeArrowTableOptions = MakeArrowTableOptions;
|
|
|
195
193
|
* will be controlled by the order of properties in the first record. If a type
|
|
196
194
|
* is inferred it will always be nullable.
|
|
197
195
|
*
|
|
196
|
+
* If not all fields are found in the data, then a subset of the schema will be
|
|
197
|
+
* returned.
|
|
198
|
+
*
|
|
198
199
|
* If the input is empty then a schema must be provided to create an empty table.
|
|
199
200
|
*
|
|
200
201
|
* When a schema is not specified then data types will be inferred. The inference
|
|
@@ -202,11 +203,13 @@ exports.MakeArrowTableOptions = MakeArrowTableOptions;
|
|
|
202
203
|
*
|
|
203
204
|
* - boolean => Bool
|
|
204
205
|
* - number => Float64
|
|
206
|
+
* - bigint => Int64
|
|
205
207
|
* - String => Utf8
|
|
206
208
|
* - Buffer => Binary
|
|
207
209
|
* - Record<String, any> => Struct
|
|
208
210
|
* - Array<any> => List
|
|
209
211
|
* @example
|
|
212
|
+
* ```ts
|
|
210
213
|
* import { fromTableToBuffer, makeArrowTable } from "../arrow";
|
|
211
214
|
* import { Field, FixedSizeList, Float16, Float32, Float64, Int32, Schema } from "apache-arrow";
|
|
212
215
|
*
|
|
@@ -228,149 +231,325 @@ exports.MakeArrowTableOptions = MakeArrowTableOptions;
|
|
|
228
231
|
* names and data types.
|
|
229
232
|
*
|
|
230
233
|
* ```ts
|
|
231
|
-
*
|
|
232
234
|
* const schema = new Schema([
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
235
|
+
* new Field("a", new Float64()),
|
|
236
|
+
* new Field("b", new Float64()),
|
|
237
|
+
* new Field(
|
|
238
|
+
* "vector",
|
|
239
|
+
* new FixedSizeList(3, new Field("item", new Float32()))
|
|
240
|
+
* ),
|
|
241
|
+
* ]);
|
|
242
|
+
* const table = makeArrowTable([
|
|
243
|
+
* { a: 1, b: 2, vector: [1, 2, 3] },
|
|
244
|
+
* { a: 4, b: 5, vector: [4, 5, 6] },
|
|
245
|
+
* { a: 7, b: 8, vector: [7, 8, 9] },
|
|
246
|
+
* ]);
|
|
247
|
+
* assert.deepEqual(table.schema, schema);
|
|
246
248
|
* ```
|
|
247
249
|
*
|
|
248
250
|
* You can specify the vector column types and names using the options as well
|
|
249
251
|
*
|
|
250
|
-
* ```
|
|
251
|
-
*
|
|
252
|
+
* ```ts
|
|
252
253
|
* const schema = new Schema([
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
254
|
+
* new Field('a', new Float64()),
|
|
255
|
+
* new Field('b', new Float64()),
|
|
256
|
+
* new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
|
|
257
|
+
* new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
|
|
258
|
+
* ]);
|
|
258
259
|
* const table = makeArrowTable([
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
260
|
+
* { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] },
|
|
261
|
+
* { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] },
|
|
262
|
+
* { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] }
|
|
263
|
+
* ], {
|
|
264
|
+
* vectorColumns: {
|
|
265
|
+
* vec1: { type: new Float16() },
|
|
266
|
+
* vec2: { type: new Float16() }
|
|
267
|
+
* }
|
|
268
|
+
* });
|
|
268
269
|
* assert.deepEqual(table.schema, schema)
|
|
269
270
|
* ```
|
|
270
271
|
*/
|
|
271
272
|
function makeArrowTable(data, options, metadata) {
    // Normalise the caller's options; an omitted argument becomes an empty
    // options object.
    const opt = new MakeArrowTableOptions(options === undefined ? {} : options);
    const explicitSchemaGiven = opt.schema !== undefined && opt.schema !== null;
    let schema = undefined;
    if (explicitSchemaGiven) {
        const sanitized = (0, sanitize_1.sanitizeSchema)(opt.schema);
        schema = validateSchemaEmbeddings(sanitized, data, options?.embeddingFunction);
    }
    // Start from the schema's own metadata (if any); entries of the
    // `metadata` argument are spread last, so they win on key collisions.
    const baseMetadata = schema?.metadata || new Map();
    const schemaMetadata = metadata === undefined
        ? baseMetadata
        : new Map([...baseMetadata, ...metadata]);
    if (data.length === 0) {
        // Without rows there is nothing to infer from, so a schema is required.
        if (options?.schema === undefined || options?.schema === null) {
            throw new Error("At least one record or a schema needs to be provided");
        }
        if (schema === undefined) {
            throw new Error("A schema must be provided if data is empty");
        }
        const emptySchema = new apache_arrow_1.Schema(schema.fields, schemaMetadata);
        return new apache_arrow_1.Table(emptySchema);
    }
    // Resolve the fields actually present in the data, then re-attach the
    // merged metadata to the resulting schema.
    const resolved = inferSchema(data, schema, opt);
    const tableSchema = new apache_arrow_1.Schema(resolved.fields, schemaMetadata);
    // Build one column vector per top-level field, keyed by field name.
    const finalColumns = {};
    tableSchema.fields.forEach((field) => {
        finalColumns[field.name] = transposeData(data, field);
    });
    return new apache_arrow_1.Table(tableSchema, finalColumns);
}
|
|
304
|
+
/**
 * Work out the schema of the table that will be built from `data`.
 *
 * Every leaf path yielded by `rowPathsAndValues` is recorded in a PathTree.
 *
 * - With an explicit `schema`, each path must resolve to a field of that
 *   schema. The result contains only the schema fields that were found in the
 *   data, in schema order and with the schema's nullability.
 * - Without a schema, the type of each path is inferred from the first value
 *   seen for it and every resulting field is nullable. Later values are
 *   checked against that first inferred type.
 *
 * The returned Schema is constructed from fields only; no metadata is passed.
 *
 * @param data row-major records to inspect
 * @param schema explicit schema, or undefined to infer one
 * @param opts options forwarded to `inferType`
 * @returns the resolved Schema
 * @throws Error if a path is not part of `schema`, if a type cannot be
 *   inferred for a newly seen path, or if two rows disagree on the inferred
 *   type of the same path
 */
function inferSchema(data, schema, opts) {
    // We will collect all fields we see in the data.
    const pathTree = new PathTree();
    for (const [rowI, row] of data.entries()) {
        for (const [path, value] of rowPathsAndValues(row)) {
            if (!pathTree.has(path)) {
                // First time seeing this field.
                if (schema !== undefined) {
                    const field = getFieldForPath(schema, path);
                    if (field === undefined) {
                        throw new Error(`Found field not in schema: ${path.join(".")} at row ${rowI}`);
                    }
                    pathTree.set(path, field.type);
                }
                else {
                    const inferredType = inferType(value, path, opts);
                    if (inferredType === undefined) {
                        throw new Error(`Failed to infer data type for field ${path.join(".")} at row ${rowI}. \
                        Consider providing an explicit schema.`);
                    }
                    pathTree.set(path, inferredType);
                }
            }
            else if (schema === undefined) {
                const newType = inferType(value, path, opts);
                if (newType === undefined) {
                    // Values such as null or an empty array carry no type
                    // information, so they cannot contradict an earlier row.
                    continue;
                }
                const currentType = pathTree.get(path);
                // inferType returns a fresh instance on every call, so identity
                // comparison would always report a mismatch. Compare the
                // rendered type names instead.
                const currentName = currentType instanceof PathTree ? "Struct" : String(currentType);
                if (currentName !== String(newType)) {
                    throw new Error("Failed to infer schema for data. Previously inferred type " +
                        `${currentName} for field ${path.join(".")} but found ${newType} ` +
                        `at row ${rowI}. Consider providing an explicit schema.`);
                }
            }
        }
    }
    // Inferred case: turn the tree into nullable fields, nesting sub-trees as
    // Struct types.
    const fieldsFromPathTree = (tree) => {
        const fields = [];
        for (const [name, value] of tree.map.entries()) {
            if (value instanceof PathTree) {
                const children = fieldsFromPathTree(value);
                fields.push(new apache_arrow_1.Field(name, new apache_arrow_1.Struct(children), true));
            }
            else {
                fields.push(new apache_arrow_1.Field(name, value, true));
            }
        }
        return fields;
    };
    // Explicit-schema case: keep schema order and nullability, but drop
    // fields (and struct children) that never appeared in the data.
    const takeMatchingFields = (fields, tree) => {
        const outFields = [];
        for (const field of fields) {
            if (!tree.map.has(field.name)) {
                continue;
            }
            const value = tree.get([field.name]);
            if (value instanceof PathTree) {
                const children = takeMatchingFields(field.type.children, value);
                outFields.push(new apache_arrow_1.Field(field.name, new apache_arrow_1.Struct(children), field.nullable));
            }
            else {
                outFields.push(new apache_arrow_1.Field(field.name, value, field.nullable));
            }
        }
        return outFields;
    };
    const fields = schema === undefined
        ? fieldsFromPathTree(pathTree)
        : takeMatchingFields(schema.fields, pathTree);
    return new apache_arrow_1.Schema(fields);
}
|
|
379
|
+
/**
 * Walk a record depth-first and yield a `[path, value]` pair for every leaf.
 *
 * A value counts as a nested record when `isObject` accepts it; such values
 * are descended into and are not yielded themselves. `path` is the list of
 * keys leading from the top-level row to the leaf.
 *
 * @param row the record (or nested record) to walk
 * @param basePath keys already traversed to reach `row`
 */
function* rowPathsAndValues(row, basePath = []) {
    for (const [key, value] of Object.entries(row)) {
        const fullPath = [...basePath, key];
        if (!isObject(value)) {
            yield [fullPath, value];
            continue;
        }
        yield* rowPathsAndValues(value, fullPath);
    }
}
|
|
389
|
+
/**
 * Decide whether `value` should be treated as a nested record, i.e. something
 * whose own properties are fields.
 *
 * Arrays, null, and the built-in container/value classes listed below are
 * leaf values, not records. Every ArrayBuffer view (Buffer, the typed arrays,
 * DataView) is excluded as well: these hold binary or numeric data, and
 * treating them as records would explode them into one field per index.
 *
 * @param value any value found in a row
 * @returns true if `value` is a non-null object that is not one of the
 *   excluded kinds
 */
function isObject(value) {
    if (typeof value !== "object" || value === null) {
        return false;
    }
    if (Array.isArray(value) || ArrayBuffer.isView(value)) {
        return false;
    }
    return !(value instanceof RegExp ||
        value instanceof Date ||
        value instanceof Set ||
        value instanceof Map ||
        value instanceof Buffer);
}
|
|
399
|
+
/**
 * Resolve a key path to the matching Field of `schema`.
 *
 * The first key is looked up among the schema's top-level fields; each
 * further key is looked up among the children of the previous field, which
 * must have a struct type.
 *
 * @param schema the schema to search
 * @param path list of field names, outermost first
 * @returns the Field at `path`, or undefined if any step has no match, if a
 *   non-struct field is descended into, or if `path` is empty
 */
function getFieldForPath(schema, path) {
    let node = schema;
    for (const segment of path) {
        let candidates;
        if (node instanceof apache_arrow_1.Schema) {
            candidates = node.fields;
        }
        else if (node instanceof apache_arrow_1.Field && apache_arrow_1.DataType.isStruct(node.type)) {
            candidates = node.type.children;
        }
        else {
            // Reached a non-struct field but there are still keys to consume.
            return undefined;
        }
        const match = candidates.find((f) => f.name === segment);
        if (match === undefined) {
            return undefined;
        }
        node = match;
    }
    // An empty path leaves `node` as the Schema itself, which is not a Field.
    return node instanceof apache_arrow_1.Field ? node : undefined;
}
|
|
428
|
+
/**
 * Pick the Arrow data type for a single JS value.
 *
 * Mapping: bigint => Int64, number => Float64, string => Utf8 (or a
 * Utf8/Int32 Dictionary when `opts.dictionaryEncodeStrings` is set),
 * boolean => Bool, Buffer => Binary. Non-empty arrays become a List of the
 * type inferred from their first element, except for vector columns, which
 * become a FixedSizeList sized to the array.
 *
 * @param value the value to inspect
 * @param path key path of the value within its row
 * @param opts options; `dictionaryEncodeStrings` and `vectorColumns` are read
 * @returns the inferred type, or undefined when no type can be inferred
 *   (e.g. empty arrays, or any value not covered above)
 */
function inferType(value, path, opts) {
    switch (typeof value) {
        case "bigint":
            return new apache_arrow_1.Int64();
        case "number":
            // Even if it's an integer, it's safer to assume Float64. Users can
            // always provide an explicit schema or use BigInt if they mean integer.
            return new apache_arrow_1.Float64();
        case "string":
            return opts.dictionaryEncodeStrings
                ? new apache_arrow_1.Dictionary(new apache_arrow_1.Utf8(), new apache_arrow_1.Int32())
                : new apache_arrow_1.Utf8();
        case "boolean":
            return new apache_arrow_1.Bool();
        default:
            break;
    }
    if (value instanceof Buffer) {
        return new apache_arrow_1.Binary();
    }
    if (!Array.isArray(value)) {
        // TODO: timestamp
        return undefined;
    }
    if (value.length === 0) {
        return undefined; // Without any values we can't infer the type
    }
    // Top-level columns configured as vector columns use the configured
    // element type.
    const isConfiguredVector = path.length === 1 && Object.hasOwn(opts.vectorColumns, path[0]);
    if (isConfiguredVector) {
        const floatType = (0, sanitize_1.sanitizeType)(opts.vectorColumns[path[0]].type);
        return new apache_arrow_1.FixedSizeList(value.length, new apache_arrow_1.Field("item", floatType, true));
    }
    const elementType = inferType(value[0], path, opts);
    if (elementType === undefined) {
        return undefined;
    }
    // Try to automatically detect embedding columns.
    const looksLikeEmbedding = elementType instanceof apache_arrow_1.Float && path[path.length - 1] === "vector";
    if (looksLikeEmbedding) {
        // We default to Float32 for vectors.
        const item = new apache_arrow_1.Field("item", new apache_arrow_1.Float32(), true);
        return new apache_arrow_1.FixedSizeList(value.length, item);
    }
    const item = new apache_arrow_1.Field("item", elementType, true);
    return new apache_arrow_1.List(item);
}
|
|
484
|
+
/**
 * A tree keyed by path segments. Interior nodes are PathTree instances and
 * leaves are arbitrary values stored in `map`.
 */
class PathTree {
    /** Children of this node: segment => nested PathTree or leaf value. */
    map;
    /**
     * @param entries optional iterable of `[path, value]` pairs to insert
     */
    constructor(entries) {
        this.map = new Map();
        if (entries !== undefined) {
            for (const [path, value] of entries) {
                this.set(path, value);
            }
        }
    }
    /**
     * @param path list of segments, outermost first
     * @returns true if every segment of `path` can be followed from this node
     */
    has(path) {
        let ref = this;
        for (const part of path) {
            if (!(ref instanceof PathTree) || !ref.map.has(part)) {
                return false;
            }
            ref = ref.map.get(part);
        }
        return true;
    }
    /**
     * @param path list of segments, outermost first
     * @returns the leaf value or nested PathTree at `path`, or undefined if
     *   the path cannot be followed
     */
    get(path) {
        let ref = this;
        for (const part of path) {
            if (!(ref instanceof PathTree) || !ref.map.has(part)) {
                return undefined;
            }
            ref = ref.map.get(part);
        }
        return ref;
    }
    /**
     * Store `value` at `path`, creating intermediate PathTree nodes as needed.
     *
     * @param path list of segments, outermost first
     * @param value the value to store at the last segment
     * @throws Error if a proper prefix of `path` already holds a leaf value,
     *   since a leaf cannot also have children
     */
    set(path, value) {
        let ref = this;
        const parents = path.slice(0, path.length - 1);
        for (const [depth, part] of parents.entries()) {
            if (!ref.map.has(part)) {
                ref.map.set(part, new PathTree());
            }
            const next = ref.map.get(part);
            if (!(next instanceof PathTree)) {
                // Without this guard the next step would dereference `.map`
                // on the leaf and fail with an opaque TypeError.
                const prefix = path.slice(0, depth + 1).join(".");
                throw new Error(`Cannot set path ${path.join(".")}: ${prefix} already holds a non-nested value`);
            }
            ref = next;
        }
        ref.map.set(path[path.length - 1], value);
    }
}
|
|
525
|
+
/**
 * Build the column vector for `field` by pulling its value out of every row.
 *
 * Struct fields are built recursively from one vector per child field. For
 * any other field, the value at the field's path is read from each row, with
 * null used when the path cannot be followed in that row.
 *
 * @param data row-major records
 * @param field the field to build a column for
 * @param path names of the enclosing struct fields, outermost first (empty
 *   for a top-level field)
 * @returns the Arrow vector for `field`
 */
function transposeData(data, field, path = []) {
    // Full key path of this field within a row.
    const fieldPath = [...path, field.name];
    if (field.type instanceof apache_arrow_1.Struct) {
        // Children live under this struct, so they are given the struct's own
        // path as their parent path. Passing `[...path, child.name]` here
        // would drop the struct's name and duplicate the child's, making
        // every nested lookup miss and come back null.
        const childVectors = field.type.children.map((child) => transposeData(data, child, fieldPath));
        // NOTE(review): makeData is handed the child vectors as-is; confirm
        // this is the shape it expects for struct children.
        const structData = (0, apache_arrow_1.makeData)({
            type: field.type,
            children: childVectors,
        });
        return (0, apache_arrow_1.makeVector)(structData);
    }
    const values = data.map((datum) => {
        let current = datum;
        for (const key of fieldPath) {
            if (isObject(current) && Object.hasOwn(current, key)) {
                current = current[key];
            }
            else {
                // The row does not contain this path; record a null.
                return null;
            }
        }
        return current;
    });
    return makeVector(values, field.type);
}
|
|
375
554
|
/**
|
|
376
555
|
* Create an empty Arrow table with the provided schema
|
|
@@ -409,6 +588,36 @@ function makeListVector(lists) {
|
|
|
409
588
|
function makeVector(values, type, stringAsDictionary) {
|
|
410
589
|
if (type !== undefined) {
|
|
411
590
|
// No need for inference, let Arrow create it
|
|
591
|
+
if (type instanceof apache_arrow_1.Int) {
|
|
592
|
+
if (apache_arrow_1.DataType.isInt(type) && type.bitWidth === 64) {
|
|
593
|
+
// wrap in BigInt to avoid bug: https://github.com/apache/arrow/issues/40051
|
|
594
|
+
values = values.map((v) => {
|
|
595
|
+
if (v === null) {
|
|
596
|
+
return v;
|
|
597
|
+
}
|
|
598
|
+
else if (typeof v === "bigint") {
|
|
599
|
+
return v;
|
|
600
|
+
}
|
|
601
|
+
else if (typeof v === "number") {
|
|
602
|
+
return BigInt(v);
|
|
603
|
+
}
|
|
604
|
+
else {
|
|
605
|
+
return v;
|
|
606
|
+
}
|
|
607
|
+
});
|
|
608
|
+
}
|
|
609
|
+
else {
|
|
610
|
+
// Similarly, bigint isn't supported for 16 or 32-bit ints.
|
|
611
|
+
values = values.map((v) => {
|
|
612
|
+
if (typeof v == "bigint") {
|
|
613
|
+
return Number(v);
|
|
614
|
+
}
|
|
615
|
+
else {
|
|
616
|
+
return v;
|
|
617
|
+
}
|
|
618
|
+
});
|
|
619
|
+
}
|
|
620
|
+
}
|
|
412
621
|
return (0, apache_arrow_1.vectorFromArray)(values, type);
|
|
413
622
|
}
|
|
414
623
|
if (values.length === 0) {
|