@milaboratories/pf-driver 1.3.10 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/csv_writer.cjs +79 -0
- package/dist/csv_writer.cjs.map +1 -0
- package/dist/csv_writer.js +78 -0
- package/dist/csv_writer.js.map +1 -0
- package/dist/driver_decl.d.ts +4 -2
- package/dist/driver_decl.d.ts.map +1 -1
- package/dist/driver_double.cjs +1 -1
- package/dist/driver_double.js +1 -1
- package/dist/driver_impl.cjs +94 -17
- package/dist/driver_impl.cjs.map +1 -1
- package/dist/driver_impl.d.ts +2 -1
- package/dist/driver_impl.d.ts.map +1 -1
- package/dist/driver_impl.js +93 -18
- package/dist/driver_impl.js.map +1 -1
- package/dist/index.d.ts +2 -2
- package/package.json +4 -3
- package/src/__tests__/csv_writer.test.ts +419 -0
- package/src/__tests__/download_ptable.test.ts +617 -0
- package/src/csv_writer.ts +154 -0
- package/src/driver_decl.ts +14 -0
- package/src/driver_impl.ts +100 -3
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import {
|
|
2
|
+
Annotation,
|
|
3
|
+
isValueNA,
|
|
4
|
+
readAnnotation,
|
|
5
|
+
ValueType,
|
|
6
|
+
type PTableColumnSpec,
|
|
7
|
+
type PTableVector,
|
|
8
|
+
type TableRange,
|
|
9
|
+
} from "@milaboratories/pl-model-common";
|
|
10
|
+
import { isNil } from "@milaboratories/helpers";
|
|
11
|
+
|
|
12
|
+
/**
 * The slice of the PTableV8 surface that `streamPTableRows` depends on.
 *
 * Keeping this narrow lets the CSV writer be driven by any object that can
 * serve column data.
 */
export interface PTableDataSource {
  /**
   * Fetch data for the given columns.
   *
   * @param columnIndices indices of the columns to fetch
   * @param options optional row `range` to restrict the fetch to, and an
   *   optional abort `signal`
   * @returns the fetched vectors (presumably one per requested column, in
   *   request order — confirm against the PTableV8 contract)
   */
  getData(
    columnIndices: number[],
    options?: {
      range?: TableRange;
      signal?: AbortSignal;
    },
  ): Promise<PTableVector[]>;
}
|
|
19
|
+
|
|
20
|
+
// ── Public API (high-level → low-level) ──────────────────────────────
|
|
21
|
+
|
|
22
|
+
/**
 * Build the header line of a CSV/TSV document.
 *
 * Each column's label is escaped for the given separator, the labels are
 * joined with that separator, and the line is terminated with CRLF.
 *
 * @param specs specs of the columns, in output order
 * @param separator field separator placed between labels
 * @returns the header line, including the trailing CRLF
 */
export function formatHeader(specs: PTableColumnSpec[], separator: string): string {
  const cells: string[] = [];
  for (const spec of specs) {
    cells.push(escapeField(columnLabel(spec), separator));
  }
  return `${cells.join(separator)}\r\n`;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Build one data line of a CSV/TSV document from parallel column vectors.
 *
 * The cell at `rowIndex` of every vector is serialized, escaped for the given
 * separator, and the cells are joined with that separator. The line is
 * terminated with CRLF.
 *
 * @param vectors column vectors; one output field is produced per vector
 * @param rowIndex index of the row inside each vector
 * @param separator field separator placed between cells
 * @returns the data line, including the trailing CRLF
 */
export function formatRow(vectors: PTableVector[], rowIndex: number, separator: string): string {
  const cells: string[] = [];
  for (const vector of vectors) {
    cells.push(escapeField(serializeValue(vector, rowIndex), separator));
  }
  return `${cells.join(separator)}\r\n`;
}
|
|
35
|
+
|
|
36
|
+
/** Options accepted by `streamPTableRows`. */
export interface StreamPTableRowsOptions {
  /** Source the row data is fetched from, one chunk at a time. */
  pTable: PTableDataSource;
  /** Column specs; looked up by the entries of `columnIndices` to build the header. */
  specs: PTableColumnSpec[];
  /** Indices of the columns to export; the header follows this order. */
  columnIndices: number[];
  /** Rows to export. When omitted, no data rows are emitted. */
  range?: TableRange;
  /** Maximum number of rows fetched and emitted per chunk; must be a positive integer. */
  chunkSize: number;
  /** Field separator placed between cells (the driver passes `","` or `"\t"`). */
  separator: string;
  /** Whether to emit a header line before the data rows. */
  includeHeader: boolean;
  /** Whether to emit a byte-order mark (U+FEFF) as the very first chunk. */
  bom: boolean;
  /** Checked before every chunk and forwarded to `pTable.getData`. */
  signal?: AbortSignal;
}

/**
 * Async generator that streams CSV/TSV content chunk by chunk.
 *
 * The caller is responsible for providing a concrete `range` (already clipped
 * to the table shape). When `range` is undefined the generator does nothing
 * beyond emitting an optional BOM and header.
 *
 * Emission order: optional BOM, optional header line, then one string per
 * chunk of at most `chunkSize` rows (each row terminated by CRLF).
 *
 * @throws RangeError when `chunkSize` is not a positive integer. The check
 *   runs before anything is emitted; without it a zero or negative chunk size
 *   would never advance the loop.
 */
export async function* streamPTableRows(options: StreamPTableRowsOptions): AsyncIterable<string> {
  const { pTable, columnIndices, range, chunkSize, separator, signal, specs, includeHeader, bom } =
    options;

  // Validate up front so invalid options never produce partial output.
  if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
    throw new RangeError(`chunkSize must be a positive integer, got ${chunkSize}`);
  }

  if (bom) {
    yield "\uFEFF";
  }

  if (includeHeader) {
    const selectedSpecs = columnIndices.map((index) => specs[index]);
    yield formatHeader(selectedSpecs, separator);
  }

  if (range == null) {
    return;
  }

  const end = range.offset + range.length;

  for (let from = range.offset; from < end; from += chunkSize) {
    signal?.throwIfAborted();

    // The last chunk may be shorter than chunkSize.
    const length = Math.min(chunkSize, end - from);
    const subRange: TableRange = { offset: from, length };

    const vectors = await pTable.getData(columnIndices, { range: subRange, signal });

    // Row indices are relative to the fetched chunk, not to the whole table.
    const rows: string[] = [];
    for (let rowIndex = 0; rowIndex < length; rowIndex++) {
      rows.push(formatRow(vectors, rowIndex, separator));
    }
    yield rows.join("");
  }
}
|
|
88
|
+
|
|
89
|
+
// ── Helpers (low-level) ──────────────────────────────────────────────
|
|
90
|
+
|
|
91
|
+
/**
 * Pick the text shown in the header for a column.
 *
 * Uses the column's `Annotation.Label` annotation (trimmed) when one is
 * present; otherwise falls back to the column's spec name as-is.
 */
function columnLabel(spec: PTableColumnSpec): string {
  const label = readAnnotation(spec.spec, Annotation.Label);
  if (isNil(label)) {
    return spec.spec.name;
  }
  return label.trim();
}
|
|
96
|
+
|
|
97
|
+
/**
 * Escape a single field following RFC 4180.
 *
 * A field that contains the separator, a double-quote, CR, or LF is wrapped
 * in double-quotes with every embedded `"` doubled. Any other field is
 * returned unchanged.
 */
function escapeField(value: string, separator: string): string {
  if (!needsQuoting(value, separator)) {
    return value;
  }
  const doubled = value.split('"').join('""');
  return `"${doubled}"`;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Decide whether a field has to be wrapped in double-quotes.
 *
 * @returns `true` when the value contains the separator, a double-quote,
 *   a carriage return, or a line feed
 */
function needsQuoting(value: string, separator: string): boolean {
  if (value.includes(separator)) {
    return true;
  }
  if (value.includes('"')) {
    return true;
  }
  return value.includes("\r") || value.includes("\n");
}
|
|
112
|
+
|
|
113
|
+
/**
 * Turn one cell of a typed vector into the text written to the file.
 *
 * - `null` / `undefined`, or a cell flagged by `isValueNA` -> `""`
 * - Float / Double: `NaN`, `+Inf`, `-Inf` -> `""`, otherwise `String(x)`
 * - String -> the value itself
 * - Bytes -> `""` (byte payloads are not exported)
 * - Int, Long (possibly a `bigint`) and any other type -> `String(x)`
 */
function serializeValue(vector: PTableVector, rowIndex: number): string {
  const cell = vector.data[rowIndex];
  if (isNil(cell) || isValueNA(vector, rowIndex)) {
    return "";
  }

  switch (vector.type) {
    case ValueType.Float:
    case ValueType.Double: {
      const asNumber = cell as number;
      // Number.isFinite is false for NaN as well as for both infinities.
      return Number.isFinite(asNumber) ? String(asNumber) : "";
    }
    case ValueType.String:
      return cell as string;
    case ValueType.Bytes:
      return "";
    case ValueType.Int:
    case ValueType.Long:
    default:
      return String(cell);
  }
}
|
package/src/driver_decl.ts
CHANGED
|
@@ -14,9 +14,17 @@ import type {
|
|
|
14
14
|
TableRange,
|
|
15
15
|
UniqueValuesRequest,
|
|
16
16
|
UniqueValuesResponse,
|
|
17
|
+
WritePTableToFsOptions,
|
|
18
|
+
WritePTableToFsResult,
|
|
17
19
|
} from "@milaboratories/pl-model-common";
|
|
18
20
|
import type { PoolEntry } from "@milaboratories/helpers";
|
|
19
21
|
|
|
22
|
+
export type {
|
|
23
|
+
WritePTableToFsOptions,
|
|
24
|
+
WritePTableToFsResult,
|
|
25
|
+
PTableDownloadFormat,
|
|
26
|
+
} from "@milaboratories/pl-model-common";
|
|
27
|
+
|
|
20
28
|
/**
|
|
21
29
|
* Extends public and safe SDK's driver API with methods used internally in the middle
|
|
22
30
|
* layer and in tests.
|
|
@@ -75,4 +83,10 @@ export interface AbstractInternalPFrameDriver<PColumnData> extends PFrameDriver,
|
|
|
75
83
|
range: TableRange | undefined,
|
|
76
84
|
signal?: AbortSignal,
|
|
77
85
|
): Promise<PTableVector[]>;
|
|
86
|
+
|
|
87
|
+
/**
 * Download PTable data to a file in CSV or TSV format.
 *
 * @param handle handle of the PTable to export
 * @param options export settings; the implementation reads the destination
 *   `path`, the `format`, the `columnIndices` to export and optional
 *   `range`, `compression`, `chunkSize`, `includeHeader`, `bom` and `signal`
 * @returns the written path together with the row and byte counts
 */
|
|
88
|
+
writePTableToFs(
|
|
89
|
+
handle: PTableHandle,
|
|
90
|
+
options: WritePTableToFsOptions,
|
|
91
|
+
): Promise<WritePTableToFsResult>;
|
|
78
92
|
}
|
package/src/driver_impl.ts
CHANGED
|
@@ -35,11 +35,24 @@ import {
|
|
|
35
35
|
resolveAnnotationParents,
|
|
36
36
|
} from "@milaboratories/pl-model-common";
|
|
37
37
|
import type { PFrameInternal } from "@milaboratories/pl-model-middle-layer";
|
|
38
|
-
import {
|
|
39
|
-
|
|
38
|
+
import {
|
|
39
|
+
ConcurrencyLimitingExecutor,
|
|
40
|
+
createPathAtomically,
|
|
41
|
+
type MiLogger,
|
|
42
|
+
} from "@milaboratories/ts-helpers";
|
|
43
|
+
import { isNil, PoolEntryGuard, type PoolEntry } from "@milaboratories/helpers";
|
|
40
44
|
import { PFrameFactory } from "@milaboratories/pframes-rs-node";
|
|
41
45
|
import { tmpdir } from "node:os";
|
|
42
|
-
import
|
|
46
|
+
import * as fs from "node:fs";
|
|
47
|
+
import { Readable } from "node:stream";
|
|
48
|
+
import { pipeline } from "node:stream/promises";
|
|
49
|
+
import * as zlib from "node:zlib";
|
|
50
|
+
import { streamPTableRows } from "./csv_writer";
|
|
51
|
+
import type {
|
|
52
|
+
AbstractInternalPFrameDriver,
|
|
53
|
+
WritePTableToFsOptions,
|
|
54
|
+
WritePTableToFsResult,
|
|
55
|
+
} from "./driver_decl";
|
|
43
56
|
import { logPFrames } from "./logging";
|
|
44
57
|
import {
|
|
45
58
|
PFramePool,
|
|
@@ -259,6 +272,80 @@ export class AbstractPFrameDriver<
|
|
|
259
272
|
};
|
|
260
273
|
}
|
|
261
274
|
|
|
275
|
+
/**
 * Write the PTable identified by `handle` to `options.path` as CSV or TSV.
 *
 * The requested range is clipped to the table shape, then rows are streamed
 * in chunks (optionally through gzip) into the temporary path supplied by
 * `createPathAtomically`. Work is aborted when either `options.signal` or the
 * table entry's dispose signal fires.
 *
 * Defaults applied here: `chunkSize` 50 000 rows, `includeHeader` true,
 * `bom` true, gzip `level` 6.
 *
 * @param handle handle of the PTable to export
 * @param options destination path, format, column selection and tuning knobs
 * @returns the destination path, the number of rows in the clipped range, and
 *   the number of bytes written to the file (after compression, if any)
 */
public async writePTableToFs(
  handle: PTableHandle,
  options: WritePTableToFsOptions,
): Promise<WritePTableToFsResult> {
  // `compression` is an object (its `type` and `level` are read below), so log
  // its `type`; interpolating the object itself would print "[object Object]".
  this.logger(
    "info",
    `[WritePTableToFs] ENTER (handle = ${handle}, path = ${options.path}, format = ${options.format}, compression = ${options.compression?.type ?? "auto"}, columns = ${options.columnIndices.length})`,
  );
  const startTime = performance.now();
  const { def, disposeSignal: defDisposeSignal } = this.pTableDefs.getByKey(handle);
  using tableGuard = new PoolEntryGuard(this.pTables.acquire(def));
  const { pTablePromise, disposeSignal } = tableGuard.resource;
  const pTable = await pTablePromise;

  // Abort on whichever fires first: the caller's signal or the table's disposal.
  const combinedSignal = AbortSignal.any(
    [options.signal, disposeSignal].filter((s): s is AbortSignal => !isNil(s)),
  );

  return await this.tableConcurrencyLimiter.run(async () => {
    const shape = await pTable.getShape({ signal: combinedSignal });
    const clippedRange = clipRange(options.range, shape);
    const specs = pTable.getSpec();
    const separator = options.format === "tsv" ? "\t" : ",";

    const iterable = streamPTableRows({
      pTable,
      specs,
      columnIndices: options.columnIndices,
      range: clippedRange,
      chunkSize: options.chunkSize ?? 50_000,
      separator,
      includeHeader: options.includeHeader ?? true,
      bom: options.bom ?? true,
      signal: combinedSignal,
    });

    // Adapt the driver's (level, message) logger to the MiLogger shape that
    // createPathAtomically expects.
    const miLogger: MiLogger = {
      info: (msg) => this.logger("info", String(msg)),
      warn: (msg) => this.logger("warn", String(msg)),
      error: (msg) => this.logger("error", String(msg)),
    };

    let bytesWritten = 0;
    await createPathAtomically(miLogger, options.path, async (tempPath) => {
      // "wx" makes the open fail if the temporary path already exists.
      const writeStream = fs.createWriteStream(tempPath, { flags: "wx" });
      const source = Readable.from(iterable, { objectMode: false });
      if (options.compression?.type === "gzip") {
        const gzip = zlib.createGzip({ level: options.compression.level ?? 6 });
        await pipeline(source, gzip, writeStream, { signal: combinedSignal });
      } else {
        await pipeline(source, writeStream, { signal: combinedSignal });
      }
      // Counted on the file stream, i.e. after compression when gzip is used.
      bytesWritten = writeStream.bytesWritten;
    });

    const overallSize = await pTable.getFootprint({ signal: combinedSignal });
    this.pTableCachePlain.cache(tableGuard.keep(), overallSize, defDisposeSignal);

    // rowsWritten equals the clipped range length — the generator streams the
    // entire effective range without early termination, so this is accurate.
    const rowsWritten = clippedRange.length;

    if (logPFrames()) {
      const durationMs = Math.round(performance.now() - startTime);
      this.logger(
        "info",
        `[WritePTableToFs] complete (handle = ${handle}, columns = ${options.columnIndices.length}, rows = ${rowsWritten}, bytes = ${bytesWritten}, duration = ${durationMs}ms)`,
      );
    }

    return { path: options.path, rowsWritten, bytesWritten };
  });
}
|
|
348
|
+
|
|
262
349
|
//
|
|
263
350
|
// PFrame instance methods
|
|
264
351
|
//
|
|
@@ -463,6 +550,16 @@ export class AbstractPFrameDriver<
|
|
|
463
550
|
}
|
|
464
551
|
}
|
|
465
552
|
|
|
553
|
+
/**
 * Clamp a requested row range to the table shape.
 *
 * Returns the intersection of `[offset, offset + length)` with
 * `[0, shape.rows)`. When `range` is undefined the full table range is
 * returned. The result never has a negative offset or a negative length, so
 * it is safe to hand to the data source and to report as a row count.
 *
 * @param range requested range, or undefined for "all rows"
 * @param shape table shape; only `rows` is read
 * @returns a range that lies entirely inside the table
 */
function clipRange(range: undefined | TableRange, shape: PTableShape): TableRange {
  if (range == null) {
    return { offset: 0, length: shape.rows };
  }
  // Clamp the start into [0, rows] and the end into [start, rows]; a negative
  // offset or length therefore collapses instead of leaking through.
  const start = Math.min(Math.max(range.offset, 0), shape.rows);
  const end = Math.min(Math.max(range.offset + range.length, start), shape.rows);
  return { offset: start, length: end - start };
}
|
|
562
|
+
|
|
466
563
|
function migrateFilters(
|
|
467
564
|
filters: PTableRecordFilter[],
|
|
468
565
|
logger: PFrameInternal.Logger,
|