@milaboratories/pl-model-common 1.19.2 → 1.19.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/drivers/pframe/data_info.d.ts +34 -30
- package/dist/drivers/pframe/data_info.d.ts.map +1 -1
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +318 -287
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
- package/src/drivers/pframe/data_info.ts +145 -134
package/src/drivers/pframe/data_info.ts
CHANGED
@@ -1,3 +1,5 @@
+import { assertNever } from '../../util';
+
 /**
  * Represents a JavaScript representation of a value in a PColumn. Can be null, a number, or a string.
  * These are the primitive types that can be stored directly in PColumns.
@@ -85,7 +87,7 @@ export type BinaryPartitionedDataInfo<Blob> = {
   parts: Record<string, BinaryChunk<Blob>>;
 };
 
-type ParquetPartitionInfoMappingAxis = {
+export type ParquetChunkMappingAxis = {
   /** Data type (matches PColumn axis types) */
   type: 'Int' | 'Long' | 'String';
 
@@ -93,7 +95,7 @@ type ParquetPartitionInfoMappingAxis = {
   id: string;
 };
 
-type ParquetPartitionInfoMappingColumn = {
+export type ParquetChunkMappingColumn = {
   /** Data type (matches PColumn value type) */
   type: 'Int' | 'Long' | 'Float' | 'Double' | 'String';
 
@@ -101,42 +103,38 @@ type ParquetPartitionInfoMappingColumn = {
   id: string;
 };
 
-type ParquetPartitionInfoMapping = {
+export type ParquetChunkMapping = {
   /** Axes mappings - Parquet file is sorted by these fields in this order */
-  axes: ParquetPartitionInfoMappingAxis[];
+  axes: ParquetChunkMappingAxis[];
 
   /** Column mapping */
-  column: ParquetPartitionInfoMappingColumn;
+  column: ParquetChunkMappingColumn;
 };
 
-type …
-/**
-…
+export type ParquetChunkStats = {
+  /** Number of rows in the chunk */
+  numberOfRows: number;
+  /** Byte size information for storage optimization and query planning */
+  size: {
+    /** Byte sizes for each axis column in the same order as axes mapping */
+    axes: number[];
+    /** Byte size for the data column */
+    column: number;
+  };
+};
 
-…
-…
+export type ParquetChunkMetadata = {
+  /** Content hash calculated for the specific axes and data this chunk represents */
+  dataDigest: string;
 
   /** Pre-computed statistics for optimization without blob download */
-  stats?: {
-    /** Number of rows in the column */
-    numberOfRows?: number;
-    /** Byte size information for storage optimization and query planning */
-    numberOfBytes?: {
-      /** Byte sizes for each axis column in the same order as axes mapping */
-      axes: number[];
-      /** Byte size for the data column */
-      column: number;
-    };
-  };
+  stats: Partial<ParquetChunkStats>;
 };
 
 export type ParquetChunk<Blob> = {
-  /**
-  …
-  …
-  /** Data for this partition */
-  data: ParquetPartitionInfoData<Blob>;
-};
+  /** Parquet file (PTable) containing column data */
+  data: Blob;
+} & ParquetChunkMapping & Partial<ParquetChunkMetadata>;
 
 export type ParquetPartitionedDataInfo<Blob> = {
   /** Identifier for this data format ('ParquetPartitioned') */
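Net effect of this hunk: the chunk-level types are renamed and exported, the old inline stats object is extracted into `ParquetChunkStats`, and `ParquetChunk` becomes an intersection of the blob payload, its mapping, and optional metadata. A minimal sketch of a well-formed value, assuming these types are re-exported from the package root, that the axis and column mappings carry only the `type` and `id` fields visible here, and instantiating `Blob` with a plain string handle (all field values are illustrative):

```ts
import type { ParquetChunk } from '@milaboratories/pl-model-common';

const chunk: ParquetChunk<string> = {
  // { data: Blob } part: the parquet file itself
  data: 'blob-0001',
  // ParquetChunkMapping part: the file is sorted by these axes, in this order
  axes: [
    { type: 'String', id: 'sampleId' },
    { type: 'Int', id: 'position' },
  ],
  column: { type: 'Double', id: 'coverage' },
  // Partial<ParquetChunkMetadata> part: both fields may be omitted,
  // and stats itself is Partial<ParquetChunkStats>
  dataDigest: 'f3a1…',
  stats: { numberOfRows: 10000, size: { axes: [120, 80], column: 512 } },
};
```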
@@ -146,7 +144,7 @@ export type ParquetPartitionedDataInfo<Blob> = {
   partitionKeyLength: number;
 
   /** Map of stringified partition keys to parquet files */
-  parts: Record<string, …>;
+  parts: Record<string, Blob>;
 };
 
 /**
@@ -158,7 +156,8 @@ export type ParquetPartitionedDataInfo<Blob> = {
 export type DataInfo<Blob> =
   | JsonDataInfo
   | JsonPartitionedDataInfo<Blob>
-  | BinaryPartitionedDataInfo<Blob>;
+  | BinaryPartitionedDataInfo<Blob>
+  | ParquetPartitionedDataInfo<Blob>;
 
 /**
  * Type guard function that checks if the given value is a valid DataInfo.
@@ -184,12 +183,8 @@ export function isDataInfo<Blob>(value: unknown): value is DataInfo<Blob> {
         && typeof data.data === 'object'
       );
     case 'JsonPartitioned':
-      return (
-        typeof data.partitionKeyLength === 'number'
-        && data.parts !== undefined
-        && typeof data.parts === 'object'
-      );
     case 'BinaryPartitioned':
+    case 'ParquetPartitioned':
       return (
         typeof data.partitionKeyLength === 'number'
         && data.parts !== undefined
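Because 'ParquetPartitioned' shares the `partitionKeyLength`/`parts` shape with the other partitioned variants, the guard handles all three with one fall-through check. A usage sketch (the payload and the `string` blob handle are illustrative):

```ts
import { isDataInfo, type DataInfo } from '@milaboratories/pl-model-common';

function parseDataInfo(raw: string): DataInfo<string> {
  const value: unknown = JSON.parse(raw);
  if (!isDataInfo<string>(value)) {
    throw new Error('payload is not a valid DataInfo');
  }
  // Narrowed to the four-variant union, including 'ParquetPartitioned'
  if (value.type === 'ParquetPartitioned') {
    console.log(`parquet parts: ${Object.keys(value.parts).length}`);
  }
  return value;
}
```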
@@ -250,6 +245,17 @@ export function mapDataInfo<B1, B2>(
         parts: newParts,
       };
     }
+    case 'ParquetPartitioned': {
+      // Map each blob in parts
+      const newParts: Record<string, B2> = {};
+      for (const [key, blob] of Object.entries(dataInfo.parts)) {
+        newParts[key] = mapFn(blob);
+      }
+      return {
+        ...dataInfo,
+        parts: newParts,
+      };
+    }
   }
 }
 
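The new case keeps `mapDataInfo` total over the extended union: every parquet blob is rewritten while partition keys are preserved. A sketch, assuming the `(dataInfo, mapFn)` parameter shape implied by the hunk header; `ResolvedBlob` and the URL scheme are hypothetical:

```ts
import { mapDataInfo, type DataInfo } from '@milaboratories/pl-model-common';

type ResolvedBlob = { url: string }; // hypothetical downstream handle

declare const info: DataInfo<string>; // blob ids as strings, for illustration

// For 'ParquetPartitioned' this now maps every file in `parts`,
// leaving the stringified partition keys untouched.
const resolved: DataInfo<ResolvedBlob> = mapDataInfo(info, (id) => ({
  url: `https://blobs.example.org/${id}`,
}));
```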
@@ -268,17 +274,20 @@ export function visitDataInfo<B>(
       break;
     case 'JsonPartitioned': {
       // Visit each blob in parts
-      for (const blob of Object.values(dataInfo.parts)) {
-        cb(blob);
-      }
+      Object.values(dataInfo.parts).forEach(cb);
       break;
     }
     case 'BinaryPartitioned': {
       // Visit each index and values blob in parts
-      for (const chunk of Object.values(dataInfo.parts)) {
+      Object.values(dataInfo.parts).forEach((chunk) => {
         cb(chunk.index);
         cb(chunk.values);
-      }
+      });
+      break;
+    }
+    case 'ParquetPartitioned': {
+      // Visit each blob in parts
+      Object.values(dataInfo.parts).forEach(cb);
       break;
     }
   }
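With the loops replaced by `forEach` and the new case added, every variant now funnels its blobs through the callback, which makes blob enumeration trivial. A sketch assuming the `(dataInfo, cb)` parameter shape implied by the hunk header:

```ts
import { visitDataInfo, type DataInfo } from '@milaboratories/pl-model-common';

// Collect every blob a DataInfo references, regardless of variant
// (one blob per json/parquet part, two per binary chunk).
function collectBlobs<B>(info: DataInfo<B>): B[] {
  const blobs: B[] = [];
  visitDataInfo(info, (blob) => {
    blobs.push(blob);
  });
  return blobs;
}
```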
@@ -330,12 +339,21 @@ export interface BinaryPartitionedDataInfoEntries<Blob> {
   parts: PColumnDataEntry<BinaryChunk<Blob>>[];
 }
 
+/**
+ * Entry-based representation of ParquetPartitionedDataInfo
+ */
+export interface ParquetPartitionedDataInfoEntries<Blob> {
+  type: 'ParquetPartitioned';
+  partitionKeyLength: number;
+  parts: PColumnDataEntry<Blob>[];
+}
 /**
  * Union type representing all possible entry-based partitioned data storage formats
  */
 export type PartitionedDataInfoEntries<Blob> =
   | JsonPartitionedDataInfoEntries<Blob>
-  | BinaryPartitionedDataInfoEntries<Blob>;
+  | BinaryPartitionedDataInfoEntries<Blob>
+  | ParquetPartitionedDataInfoEntries<Blob>;
 
 /**
  * Union type representing all possible entry-based data storage formats
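The entry-based form trades the stringified-key record for an array of `{ key, value }` pairs with parsed keys. An illustrative literal, assuming `PColumnKey` is the usual array of string/number key components:

```ts
import type { ParquetPartitionedDataInfoEntries } from '@milaboratories/pl-model-common';

const entries: ParquetPartitionedDataInfoEntries<string> = {
  type: 'ParquetPartitioned',
  partitionKeyLength: 2,
  parts: [
    { key: ['sampleA', 0], value: 'blob-0001' }, // keys are structured, not stringified
    { key: ['sampleA', 1], value: 'blob-0002' },
  ],
};
```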
@@ -367,11 +385,8 @@ export function isDataInfoEntries<Blob>(value: unknown): value is DataInfoEntries<Blob> {
         && Array.isArray(data.data)
       );
     case 'JsonPartitioned':
-      return (
-        typeof data.partitionKeyLength === 'number'
-        && Array.isArray(data.parts)
-      );
     case 'BinaryPartitioned':
+    case 'ParquetPartitioned':
       return (
         typeof data.partitionKeyLength === 'number'
         && Array.isArray(data.parts)
@@ -390,7 +405,14 @@ export function isDataInfoEntries<Blob>(value: unknown): value is DataInfoEntries<Blob> {
  */
 export function isPartitionedDataInfoEntries<Blob>(value: unknown): value is PartitionedDataInfoEntries<Blob> {
   if (!isDataInfoEntries(value)) return false;
-  return value.type === 'JsonPartitioned' || value.type === 'BinaryPartitioned';
+  switch (value.type) {
+    case 'JsonPartitioned':
+    case 'BinaryPartitioned':
+    case 'ParquetPartitioned':
+      return true;
+    default:
+      return false;
+  }
 }
 
 /**
@@ -401,42 +423,40 @@ export function isPartitionedDataInfoEntries<Blob>(value: unknown): value is PartitionedDataInfoEntries<Blob> {
  */
 export function dataInfoToEntries<Blob>(dataInfo: DataInfo<Blob>): DataInfoEntries<Blob> {
   switch (dataInfo.type) {
-    case 'Json': {
-      const data: PColumnDataEntry<PColumnValue>[] = Object.entries(dataInfo.data).map(([keyStr, value]) => {
+    case 'Json': return {
+      type: 'Json',
+      keyLength: dataInfo.keyLength,
+      data: Object.entries(dataInfo.data).map(([keyStr, value]) => {
         const key = JSON.parse(keyStr) as PColumnKey;
-        return { key, value };
-      });
-
-      return {
-        type: 'Json',
-        keyLength: dataInfo.keyLength,
-        data,
-      };
-    }
-    case 'JsonPartitioned': {
-      const parts: PColumnDataEntry<Blob>[] = Object.entries(dataInfo.parts).map(([keyStr, blob]) => {
+        return { key, value } as PColumnDataEntry<PColumnValue>;
+      }),
+    };
+    case 'JsonPartitioned': return {
+      type: 'JsonPartitioned',
+      partitionKeyLength: dataInfo.partitionKeyLength,
+      parts: Object.entries(dataInfo.parts).map(([keyStr, blob]) => {
         const key = JSON.parse(keyStr) as PColumnKey;
-        return { key, value: blob };
-      });
-
-      return {
-        type: 'JsonPartitioned',
-        partitionKeyLength: dataInfo.partitionKeyLength,
-        parts,
-      };
-    }
-    case 'BinaryPartitioned': {
-      const parts: PColumnDataEntry<BinaryChunk<Blob>>[] = Object.entries(dataInfo.parts).map(([keyStr, chunk]) => {
+        return { key, value: blob } as PColumnDataEntry<Blob>;
+      }),
+    };
+    case 'BinaryPartitioned': return {
+      type: 'BinaryPartitioned',
+      partitionKeyLength: dataInfo.partitionKeyLength,
+      parts: Object.entries(dataInfo.parts).map(([keyStr, chunk]) => {
         const key = JSON.parse(keyStr) as PColumnKey;
-        return { key, value: chunk };
-      });
-
-      return {
-        type: 'BinaryPartitioned',
-        partitionKeyLength: dataInfo.partitionKeyLength,
-        parts,
-      };
-    }
+        return { key, value: chunk } as PColumnDataEntry<BinaryChunk<Blob>>;
+      }),
+    };
+    case 'ParquetPartitioned': return {
+      type: 'ParquetPartitioned',
+      partitionKeyLength: dataInfo.partitionKeyLength,
+      parts: Object.entries(dataInfo.parts).map(([keyStr, blob]) => {
+        const key = JSON.parse(keyStr) as PColumnKey;
+        return { key, value: blob } as PColumnDataEntry<Blob>;
+      }),
+    };
+    default:
+      assertNever(dataInfo);
   }
 }
 
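The rewritten function maps each record-form variant straight to its entry form, parsing every stringified key once; `entriesToDataInfo` in the next hunk inverts it. A round-trip sketch with illustrative blob ids, assuming both functions are re-exported from the package root:

```ts
import {
  dataInfoToEntries,
  entriesToDataInfo,
  type DataInfo,
} from '@milaboratories/pl-model-common';

const info: DataInfo<string> = {
  type: 'ParquetPartitioned',
  partitionKeyLength: 1,
  parts: { '["sampleA"]': 'blob-0001', '["sampleB"]': 'blob-0002' },
};

// Record form -> entry form: keys are JSON.parsed into PColumnKey arrays
const entries = dataInfoToEntries(info);
// Entry form -> record form: keys are JSON.stringified back
const roundTripped = entriesToDataInfo(entries); // structurally equal to `info`
```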
@@ -448,42 +468,36 @@ export function dataInfoToEntries<Blob>(dataInfo: DataInfo<Blob>): DataInfoEntries<Blob> {
  */
 export function entriesToDataInfo<Blob>(dataInfoEntries: DataInfoEntries<Blob>): DataInfo<Blob> {
   switch (dataInfoEntries.type) {
-    case 'Json': {
-      …
-      return {
-        type: 'BinaryPartitioned',
-        partitionKeyLength: dataInfoEntries.partitionKeyLength,
-        parts,
-      };
-    }
+    case 'Json': return {
+      type: 'Json',
+      keyLength: dataInfoEntries.keyLength,
+      data: Object.fromEntries(
+        dataInfoEntries.data.map(({ key, value }) => [JSON.stringify(key), value]),
+      ),
+    };
+    case 'JsonPartitioned': return {
+      type: 'JsonPartitioned',
+      partitionKeyLength: dataInfoEntries.partitionKeyLength,
+      parts: Object.fromEntries(
+        dataInfoEntries.parts.map(({ key, value }) => [JSON.stringify(key), value]),
+      ),
+    };
+    case 'BinaryPartitioned': return {
+      type: 'BinaryPartitioned',
+      partitionKeyLength: dataInfoEntries.partitionKeyLength,
+      parts: Object.fromEntries(
+        dataInfoEntries.parts.map(({ key, value }) => [JSON.stringify(key), value]),
+      ),
+    };
+    case 'ParquetPartitioned': return {
+      type: 'ParquetPartitioned',
+      partitionKeyLength: dataInfoEntries.partitionKeyLength,
+      parts: Object.fromEntries(
+        dataInfoEntries.parts.map(({ key, value }) => [JSON.stringify(key), value]),
+      ),
+    };
+    default:
+      assertNever(dataInfoEntries);
   }
 }
 
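Both converters now end in `default: assertNever(...)`, so adding a fifth `DataInfo` variant without handling it becomes a compile error instead of a silent `undefined` return. The helper itself comes from '../../util'; a minimal stand-in to illustrate the pattern:

```ts
// Minimal equivalent of the package's assertNever helper.
function assertNever(value: never): never {
  throw new Error(`unexpected variant: ${JSON.stringify(value)}`);
}

type Example = { type: 'a' } | { type: 'b' };

function handle(e: Example): string {
  switch (e.type) {
    case 'a': return 'A';
    case 'b': return 'B';
    default:
      // If a new variant is added to Example, `e` no longer narrows to
      // `never` here and this call stops type-checking.
      return assertNever(e);
  }
}
```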
@@ -512,32 +526,29 @@ export function mapDataInfoEntries<B1, B2>(
     case 'Json':
       // Json type doesn't contain blobs, so return as is
       return dataInfoEntries;
-    case 'JsonPartitioned': {
-      // Map each blob in parts
-      const newParts = dataInfoEntries.parts.map((entry) => ({
+    case 'JsonPartitioned': return {
+      ...dataInfoEntries,
+      parts: dataInfoEntries.parts.map((entry) => ({
        key: entry.key,
        value: mapFn(entry.value),
-      }));
-
-      return {
-        ...dataInfoEntries,
-        parts: newParts,
-      };
-    }
-    case 'BinaryPartitioned': {
-      // Map each index and values blob in parts
-      const newParts = dataInfoEntries.parts.map((entry) => ({
+      })),
+    };
+    case 'BinaryPartitioned': return {
+      ...dataInfoEntries,
+      parts: dataInfoEntries.parts.map((entry) => ({
        key: entry.key,
        value: {
          index: mapFn(entry.value.index),
          values: mapFn(entry.value.values),
        },
-      }));
-
-      return {
-        ...dataInfoEntries,
-        parts: newParts,
-      };
-    }
+      })),
+    };
+    case 'ParquetPartitioned': return {
+      ...dataInfoEntries,
+      parts: dataInfoEntries.parts.map((entry) => ({
+        key: entry.key,
+        value: mapFn(entry.value),
+      })),
+    };
   }
 }