@milaboratories/pl-model-common 1.19.3 → 1.19.4
This diff compares the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
- package/dist/drivers/pframe/data_info.d.ts +29 -18
- package/dist/drivers/pframe/data_info.d.ts.map +1 -1
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +318 -287
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
- package/src/drivers/pframe/data_info.ts +143 -124
package/src/drivers/pframe/data_info.ts
CHANGED
@@ -1,3 +1,5 @@
+import { assertNever } from '../../util';
+
 /**
  * Represents a JavaScript representation of a value in a PColumn. Can be null, a number, or a string.
  * These are the primitive types that can be stored directly in PColumns.
@@ -85,7 +87,7 @@ export type BinaryPartitionedDataInfo<Blob> = {
   parts: Record<string, BinaryChunk<Blob>>;
 };
 
-type ParquetChunkMappingAxis = {
+export type ParquetChunkMappingAxis = {
   /** Data type (matches PColumn axis types) */
   type: 'Int' | 'Long' | 'String';
 
@@ -93,7 +95,7 @@ type ParquetChunkMappingAxis = {
   id: string;
 };
 
-type ParquetChunkMappingColumn = {
+export type ParquetChunkMappingColumn = {
   /** Data type (matches PColumn value type) */
   type: 'Int' | 'Long' | 'Float' | 'Double' | 'String';
 
@@ -101,11 +103,19 @@ type ParquetChunkMappingColumn = {
   id: string;
 };
 
-type ParquetChunkStats = {
+export type ParquetChunkMapping = {
+  /** Axes mappings - Parquet file is sorted by these fields in this order */
+  axes: ParquetChunkMappingAxis[];
+
+  /** Column mapping */
+  column: ParquetChunkMappingColumn;
+};
+
+export type ParquetChunkStats = {
   /** Number of rows in the chunk */
-  numberOfRows
+  numberOfRows: number;
   /** Byte size information for storage optimization and query planning */
-  size
+  size: {
     /** Byte sizes for each axis column in the same order as axes mapping */
     axes: number[];
     /** Byte size for the data column */
@@ -113,23 +123,19 @@ type ParquetChunkStats = {
   };
 };
 
-export type ParquetChunk<Blob> = {
-  /** Parquet file (PTable) containing column data */
-  data: Blob;
-
+export type ParquetChunkMetadata = {
   /** Content hash calculated for the specific axes and data this chunk represents */
-  dataDigest
-
-  /** Axes mappings - Parquet file is sorted by these fields in this order */
-  axes: ParquetChunkMappingAxis[];
-
-  /** Column mapping */
-  column: ParquetChunkMappingColumn;
+  dataDigest: string;
 
   /** Pre-computed statistics for optimization without blob download */
-  stats
+  stats: Partial<ParquetChunkStats>;
 };
 
+export type ParquetChunk<Blob> = {
+  /** Parquet file (PTable) containing column data */
+  data: Blob;
+} & ParquetChunkMapping & Partial<ParquetChunkMetadata>;
+
 export type ParquetPartitionedDataInfo<Blob> = {
   /** Identifier for this data format ('ParquetPartitioned') */
   type: 'ParquetPartitioned';
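Editor's note: the new chunk model splits the old monolithic `ParquetChunk` into required mapping fields plus optional metadata. A minimal sketch of how the pieces compose, assuming these types are re-exported from the package root and using a hypothetical `BlobHandle` for the `Blob` parameter:

```ts
import type { ParquetChunk } from '@milaboratories/pl-model-common';

// Hypothetical handle type standing in for whatever the host uses as Blob.
type BlobHandle = { url: string };

const chunk: ParquetChunk<BlobHandle> = {
  // Required by the base type: the parquet file itself.
  data: { url: 'https://example.invalid/part-0.parquet' },
  // Required via ParquetChunkMapping: sort order and column binding.
  axes: [{ type: 'Long', id: 'sampleId' }],
  column: { type: 'Double', id: 'readCount' },
  // Optional via Partial<ParquetChunkMetadata>; stats itself is Partial<ParquetChunkStats>.
  stats: { numberOfRows: 1024 },
};
```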
@@ -138,7 +144,7 @@ export type ParquetPartitionedDataInfo<Blob> = {
   partitionKeyLength: number;
 
   /** Map of stringified partition keys to parquet files */
-  parts: Record<string,
+  parts: Record<string, Blob>;
 };
 
 /**
@@ -150,7 +156,8 @@ export type ParquetPartitionedDataInfo<Blob> = {
 export type DataInfo<Blob> =
   | JsonDataInfo
   | JsonPartitionedDataInfo<Blob>
-  | BinaryPartitionedDataInfo<Blob>;
+  | BinaryPartitionedDataInfo<Blob>
+  | ParquetPartitionedDataInfo<Blob>;
 
 /**
  * Type guard function that checks if the given value is a valid DataInfo.
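Editor's note: a `ParquetPartitioned` member of the widened union might look like this (a sketch assuming string blob handles; the sample keys are hypothetical, and part keys are JSON-stringified partition key arrays, as in the other partitioned variants):

```ts
import type { DataInfo } from '@milaboratories/pl-model-common';

const info: DataInfo<string> = {
  type: 'ParquetPartitioned',
  partitionKeyLength: 1,
  // One parquet blob per stringified partition key.
  parts: {
    '["sampleA"]': 'blob-handle-a',
    '["sampleB"]': 'blob-handle-b',
  },
};
```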
@@ -176,12 +183,8 @@ export function isDataInfo<Blob>(value: unknown): value is DataInfo<Blob> {
       && typeof data.data === 'object'
     );
     case 'JsonPartitioned':
-      return (
-        typeof data.partitionKeyLength === 'number'
-        && data.parts !== undefined
-        && typeof data.parts === 'object'
-      );
     case 'BinaryPartitioned':
+    case 'ParquetPartitioned':
       return (
         typeof data.partitionKeyLength === 'number'
         && data.parts !== undefined
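Editor's note: all three partitioned variants now share one structural check, with `JsonPartitioned` falling through to it. A usage sketch, assuming `isDataInfo` is re-exported from the package root:

```ts
import { isDataInfo } from '@milaboratories/pl-model-common';

declare const payload: unknown;

if (isDataInfo<string>(payload) && payload.type === 'ParquetPartitioned') {
  // Narrowed to ParquetPartitionedDataInfo<string>.
  console.log(payload.partitionKeyLength, Object.keys(payload.parts).length);
}
```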
@@ -242,6 +245,17 @@ export function mapDataInfo<B1, B2>(
         parts: newParts,
       };
     }
+    case 'ParquetPartitioned': {
+      // Map each blob in parts
+      const newParts: Record<string, B2> = {};
+      for (const [key, blob] of Object.entries(dataInfo.parts)) {
+        newParts[key] = mapFn(blob);
+      }
+      return {
+        ...dataInfo,
+        parts: newParts,
+      };
+    }
   }
 }
 
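Editor's note: because parquet `parts` values are plain blobs, the mapper treats them exactly like the other partitioned variants. A sketch, assuming `mapDataInfo` keeps a `(dataInfo, mapFn)` shape and a hypothetical URL scheme:

```ts
import { mapDataInfo, type DataInfo } from '@milaboratories/pl-model-common';

declare const byId: DataInfo<string>; // blob ids

// Rewrite every blob id into a download descriptor; ParquetPartitioned
// parts go through the same mapFn as Json- and Binary-partitioned blobs.
const byUrl = mapDataInfo(byId, (id) => ({ url: `https://blobs.example.invalid/${id}` }));
```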
@@ -260,17 +274,20 @@ export function visitDataInfo<B>(
       break;
     case 'JsonPartitioned': {
       // Visit each blob in parts
-      for (const blob of Object.values(dataInfo.parts)) {
-        cb(blob);
-      }
+      Object.values(dataInfo.parts).forEach(cb);
       break;
     }
     case 'BinaryPartitioned': {
       // Visit each index and values blob in parts
-      for (const chunk of Object.values(dataInfo.parts)) {
+      Object.values(dataInfo.parts).forEach((chunk) => {
         cb(chunk.index);
         cb(chunk.values);
-      }
+      });
+      break;
+    }
+    case 'ParquetPartitioned': {
+      // Visit each blob in parts
+      Object.values(dataInfo.parts).forEach(cb);
       break;
     }
   }
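Editor's note: the visitor mirrors the mapper, passing each parquet part to the callback once. A sketch for collecting every referenced blob, assuming a `(dataInfo, cb)` signature:

```ts
import { visitDataInfo, type DataInfo } from '@milaboratories/pl-model-common';

declare const info: DataInfo<string>;

const blobs: string[] = [];
visitDataInfo(info, (blob) => blobs.push(blob)); // parquet parts included
```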
@@ -322,12 +339,21 @@ export interface BinaryPartitionedDataInfoEntries<Blob> {
   parts: PColumnDataEntry<BinaryChunk<Blob>>[];
 }
 
+/**
+ * Entry-based representation of ParquetPartitionedDataInfo
+ */
+export interface ParquetPartitionedDataInfoEntries<Blob> {
+  type: 'ParquetPartitioned';
+  partitionKeyLength: number;
+  parts: PColumnDataEntry<Blob>[];
+}
 /**
  * Union type representing all possible entry-based partitioned data storage formats
  */
 export type PartitionedDataInfoEntries<Blob> =
   | JsonPartitionedDataInfoEntries<Blob>
-  | BinaryPartitionedDataInfoEntries<Blob>;
+  | BinaryPartitionedDataInfoEntries<Blob>
+  | ParquetPartitionedDataInfoEntries<Blob>;
 
 /**
  * Union type representing all possible entry-based data storage formats
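Editor's note: the entry-based form trades the keyed record for an explicit array. A sketch of the parquet variant, assuming `PColumnDataEntry` pairs a parsed `PColumnKey` with the blob and that string key elements are valid:

```ts
import type { ParquetPartitionedDataInfoEntries } from '@milaboratories/pl-model-common';

const entries: ParquetPartitionedDataInfoEntries<string> = {
  type: 'ParquetPartitioned',
  partitionKeyLength: 1,
  parts: [
    { key: ['sampleA'], value: 'blob-handle-a' },
    { key: ['sampleB'], value: 'blob-handle-b' },
  ],
};
```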
@@ -359,11 +385,8 @@ export function isDataInfoEntries<Blob>(value: unknown): value is DataInfoEntrie
       && Array.isArray(data.data)
     );
     case 'JsonPartitioned':
-      return (
-        typeof data.partitionKeyLength === 'number'
-        && Array.isArray(data.parts)
-      );
     case 'BinaryPartitioned':
+    case 'ParquetPartitioned':
       return (
         typeof data.partitionKeyLength === 'number'
         && Array.isArray(data.parts)
@@ -382,7 +405,14 @@ export function isDataInfoEntries<Blob>(value: unknown): value is DataInfoEntrie
  */
 export function isPartitionedDataInfoEntries<Blob>(value: unknown): value is PartitionedDataInfoEntries<Blob> {
   if (!isDataInfoEntries(value)) return false;
-
+  switch (value.type) {
+    case 'JsonPartitioned':
+    case 'BinaryPartitioned':
+    case 'ParquetPartitioned':
+      return true;
+    default:
+      return false;
+  }
 }
 
 /**
@@ -393,42 +423,40 @@ export function isPartitionedDataInfoEntries<Blob>(value: unknown): value is Par
  */
 export function dataInfoToEntries<Blob>(dataInfo: DataInfo<Blob>): DataInfoEntries<Blob> {
   switch (dataInfo.type) {
-    case 'Json': {
-      const data = Object.entries(dataInfo.data).map(([keyStr, value]) => {
+    case 'Json': return {
+      type: 'Json',
+      keyLength: dataInfo.keyLength,
+      data: Object.entries(dataInfo.data).map(([keyStr, value]) => {
         const key = JSON.parse(keyStr) as PColumnKey;
-        return { key, value }
-      })
-
-      return {
-        type: 'Json',
-        keyLength: dataInfo.keyLength,
-        data,
-      };
-    }
-    case 'JsonPartitioned': {
-      const parts: PColumnDataEntry<Blob>[] = Object.entries(dataInfo.parts).map(([keyStr, blob]) => {
+        return { key, value } as PColumnDataEntry<PColumnValue>;
+      }),
+    };
+    case 'JsonPartitioned': return {
+      type: 'JsonPartitioned',
+      partitionKeyLength: dataInfo.partitionKeyLength,
+      parts: Object.entries(dataInfo.parts).map(([keyStr, blob]) => {
         const key = JSON.parse(keyStr) as PColumnKey;
-        return { key, value: blob }
-      })
-
-      return {
-        type: 'JsonPartitioned',
-        partitionKeyLength: dataInfo.partitionKeyLength,
-        parts,
-      };
-    }
-    case 'BinaryPartitioned': {
-      const parts: PColumnDataEntry<BinaryChunk<Blob>>[] = Object.entries(dataInfo.parts).map(([keyStr, chunk]) => {
+        return { key, value: blob } as PColumnDataEntry<Blob>;
+      }),
+    };
+    case 'BinaryPartitioned': return {
+      type: 'BinaryPartitioned',
+      partitionKeyLength: dataInfo.partitionKeyLength,
+      parts: Object.entries(dataInfo.parts).map(([keyStr, chunk]) => {
         const key = JSON.parse(keyStr) as PColumnKey;
-        return { key, value: chunk }
-      })
-
-      return {
-        type: 'BinaryPartitioned',
-        partitionKeyLength: dataInfo.partitionKeyLength,
-        parts,
-      };
-    }
+        return { key, value: chunk } as PColumnDataEntry<BinaryChunk<Blob>>;
+      }),
+    };
+    case 'ParquetPartitioned': return {
+      type: 'ParquetPartitioned',
+      partitionKeyLength: dataInfo.partitionKeyLength,
+      parts: Object.entries(dataInfo.parts).map(([keyStr, blob]) => {
+        const key = JSON.parse(keyStr) as PColumnKey;
+        return { key, value: blob } as PColumnDataEntry<Blob>;
+      }),
+    };
+    default:
+      assertNever(dataInfo);
   }
 }
 
@@ -440,42 +468,36 @@ export function dataInfoToEntries<Blob>(dataInfo: DataInfo<Blob>): DataInfoEntri
  */
 export function entriesToDataInfo<Blob>(dataInfoEntries: DataInfoEntries<Blob>): DataInfo<Blob> {
   switch (dataInfoEntries.type) {
-    case 'Json': {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      return {
-        type: 'BinaryPartitioned',
-        partitionKeyLength: dataInfoEntries.partitionKeyLength,
-        parts,
-      };
-    }
+    case 'Json': return {
+      type: 'Json',
+      keyLength: dataInfoEntries.keyLength,
+      data: Object.fromEntries(
+        dataInfoEntries.data.map(({ key, value }) => [JSON.stringify(key), value]),
+      ),
+    };
+    case 'JsonPartitioned': return {
+      type: 'JsonPartitioned',
+      partitionKeyLength: dataInfoEntries.partitionKeyLength,
+      parts: Object.fromEntries(
+        dataInfoEntries.parts.map(({ key, value }) => [JSON.stringify(key), value]),
+      ),
+    };
+    case 'BinaryPartitioned': return {
+      type: 'BinaryPartitioned',
+      partitionKeyLength: dataInfoEntries.partitionKeyLength,
+      parts: Object.fromEntries(
+        dataInfoEntries.parts.map(({ key, value }) => [JSON.stringify(key), value]),
+      ),
+    };
+    case 'ParquetPartitioned': return {
+      type: 'ParquetPartitioned',
+      partitionKeyLength: dataInfoEntries.partitionKeyLength,
+      parts: Object.fromEntries(
+        dataInfoEntries.parts.map(({ key, value }) => [JSON.stringify(key), value]),
+      ),
+    };
+    default:
+      assertNever(dataInfoEntries);
   }
 }
 
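Editor's note: `dataInfoToEntries` and `entriesToDataInfo` are inverses up to key serialization, and both now end in `assertNever`, so adding a future variant fails at compile time instead of silently. A round-trip sketch, assuming both helpers are re-exported from the package root:

```ts
import {
  dataInfoToEntries,
  entriesToDataInfo,
  type DataInfo,
} from '@milaboratories/pl-model-common';

declare const info: DataInfo<string>;

// Keys survive as JSON.stringify / JSON.parse round-trips of PColumnKey arrays.
const restored = entriesToDataInfo(dataInfoToEntries(info));
```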
@@ -504,32 +526,29 @@ export function mapDataInfoEntries<B1, B2>(
     case 'Json':
       // Json type doesn't contain blobs, so return as is
       return dataInfoEntries;
-    case 'JsonPartitioned': {
-      // Map each blob in parts
-      const newParts = dataInfoEntries.parts.map((entry) => ({
+    case 'JsonPartitioned': return {
+      ...dataInfoEntries,
+      parts: dataInfoEntries.parts.map((entry) => ({
         key: entry.key,
         value: mapFn(entry.value),
-      }))
-
-
-
-
-      };
-    }
-    case 'BinaryPartitioned': {
-      // Map each index and values blob in parts
-      const newParts = dataInfoEntries.parts.map((entry) => ({
+      })),
+    };
+    case 'BinaryPartitioned': return {
+      ...dataInfoEntries,
+      parts: dataInfoEntries.parts.map((entry) => ({
         key: entry.key,
         value: {
           index: mapFn(entry.value.index),
           values: mapFn(entry.value.values),
         },
-      }))
-
-
-
-
-
-
+      })),
+    };
+    case 'ParquetPartitioned': return {
+      ...dataInfoEntries,
+      parts: dataInfoEntries.parts.map((entry) => ({
+        key: entry.key,
+        value: mapFn(entry.value),
+      })),
+    };
   }
 }