parquetlens 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +87 -0
- package/dist/chunk-2RGMZZ7F.js +123 -0
- package/dist/chunk-2RGMZZ7F.js.map +1 -0
- package/dist/chunk-3N45GGD2.js +113 -0
- package/dist/chunk-3N45GGD2.js.map +1 -0
- package/dist/chunk-AYPIRAOL.js +112 -0
- package/dist/chunk-AYPIRAOL.js.map +1 -0
- package/dist/chunk-IMVXDI4K.js +112 -0
- package/dist/chunk-IMVXDI4K.js.map +1 -0
- package/dist/chunk-NRRDNC7S.js +485 -0
- package/dist/chunk-NRRDNC7S.js.map +1 -0
- package/dist/main.js +16 -6
- package/dist/main.js.map +1 -1
- package/dist/tui.js +87 -16
- package/dist/tui.js.map +1 -1
- package/package.json +6 -4
package/README.md
ADDED
@@ -0,0 +1,87 @@
# parquetlens

A fast, interactive TUI for viewing Parquet files. Like [csvlens](https://github.com/YS-L/csvlens) but for Parquet.

## Install

```bash
npm install -g parquetlens
```

Or run directly with npx:

```bash
npx parquetlens data.parquet
```

## Usage

```bash
parquetlens <file|url|-> [options]
```

**Remote URL requirements:**

- HTTP/S (including `hf://`) uses range reads via `curl`, so `curl` must be available on your PATH.

**Options:**

- `--limit <n>` - Number of rows to show (default: 20)
- `--columns <a,b,c>` - Comma-separated column list
- `--schema` - Print schema only
- `--no-schema` - Skip schema output
- `--json` - Output rows as JSON lines
- `--tui` - Open interactive viewer (default)
- `--plain` / `--no-tui` - Disable interactive viewer

**Examples:**

```bash
# View local file
parquetlens data.parquet

# View with column selection
parquetlens data.parquet --columns city,state

# Fetch from URL (e.g., Hugging Face datasets)
parquetlens https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/daily_papers.parquet

# Hugging Face shortcut
parquetlens hf://datasets/cfahlgren1/hub-stats/daily_papers.parquet

# Pipe from stdin
parquetlens - < data.parquet

# Plain output (no TUI)
parquetlens data.parquet --plain --limit 100
```

## TUI Controls

| Key             | Action                  |
| --------------- | ----------------------- |
| `j/k` or arrows | Scroll rows             |
| `h/l`           | Jump columns            |
| `PgUp/PgDn`     | Page scroll             |
| Mouse wheel     | Scroll                  |
| Click cell      | Open detail panel       |
| `s` or `Enter`  | Toggle detail panel     |
| `e`             | Show error detail       |
| `y`             | Copy error to clipboard |
| `x` or `Esc`    | Close panel (or quit)   |
| `q`             | Quit                    |

## Features

- **Fast**: Uses duckdb-wasm with HTTP range requests
- **Interactive TUI**: Full-screen terminal UI with mouse support
- **URL Support**: Read parquet files from URLs (including `hf://`)
- **Column Types**: Shows Arrow schema types in headers
- **Cell Detail**: Click any cell to see full content
- **Streaming**: Reads only the rows you need

## License

MIT
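Worth noting from the README above: `--json` emits one JSON object per row on stdout, so the output composes with any line-oriented consumer. A minimal sketch in Node, assuming a global install and a local `data.parquet`; `--no-schema` is passed on the assumption that schema output would otherwise precede the rows (per the option list), and the script itself is illustrative, not part of the package:

```js
// Illustrative sketch: stream `parquetlens --json` output and parse one row per line.
import { spawn } from "node:child_process";
import { createInterface } from "node:readline";

const child = spawn("parquetlens", ["data.parquet", "--json", "--no-schema", "--limit", "100"]);
const lines = createInterface({ input: child.stdout });

for await (const line of lines) {
  const row = JSON.parse(line); // one JSON object per row, per the README
  console.log(Object.keys(row));
}
```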
package/dist/chunk-2RGMZZ7F.js
ADDED
@@ -0,0 +1,123 @@
import { createRequire } from 'module';
import { fileURLToPath } from 'url';
import { dirname } from 'path';
const require = createRequire(import.meta.url);
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ../../packages/parquet-reader/dist/index.js
import { Blob as NodeBlob } from "buffer";
import { createWriteStream, readFileSync } from "fs";
import { promises as fs } from "fs";
import { randomUUID } from "crypto";
import { tmpdir } from "os";
import path from "path";
import { pipeline } from "stream/promises";
import { tableFromIPC } from "apache-arrow";
import { initSync, ParquetFile, readParquet } from "parquet-wasm/esm";
var BlobCtor = typeof Blob === "undefined" ? NodeBlob : Blob;
var wasmInitialized = false;
function findWasmFile(startDir) {
  let dir = startDir;
  while (dir !== path.dirname(dir)) {
    const wasmPath = path.join(dir, "node_modules", "parquet-wasm", "esm", "parquet_wasm_bg.wasm");
    try {
      readFileSync(wasmPath, { flag: "r" });
      return wasmPath;
    } catch {
      dir = path.dirname(dir);
    }
  }
  throw new Error("Could not find parquet-wasm WASM file");
}
function ensureWasmInitialized() {
  if (wasmInitialized) return;
  const wasmPath = findWasmFile(process.cwd());
  const wasmBytes = readFileSync(wasmPath);
  initSync({ module: wasmBytes });
  wasmInitialized = true;
}
async function readParquetTableFromBuffer(buffer, options) {
  ensureWasmInitialized();
  const wasmTable = readParquet(buffer, options ?? void 0);
  const ipcStream = wasmTable.intoIPCStream();
  return tableFromIPC(ipcStream);
}
function createParquetBufferSource(buffer) {
  let metadataPromise = null;
  return {
    buffer,
    byteLength: buffer.byteLength,
    readTable: (options) => readParquetTableFromBuffer(buffer, options),
    readMetadata: () => {
      if (!metadataPromise) {
        metadataPromise = readParquetMetadataFromBuffer(buffer);
      }
      return metadataPromise;
    }
  };
}
async function openParquetBufferFromPath(filePath) {
  const buffer = await fs.readFile(filePath);
  return createParquetBufferSource(buffer);
}
async function readParquetTableFromPath(filePath, options) {
  const buffer = await fs.readFile(filePath);
  return readParquetTableFromBuffer(buffer, options);
}
async function readParquetMetadataFromBuffer(buffer) {
  ensureWasmInitialized();
  const blobInput = new Uint8Array(buffer).buffer;
  const file = await ParquetFile.fromFile(new BlobCtor([blobInput]));
  const meta = file.metadata();
  const fileMeta = meta.fileMetadata();
  const createdBy = fileMeta.createdBy();
  const keyValueMetadata = Object.fromEntries(fileMeta.keyValueMetadata());
  fileMeta.free();
  meta.free();
  file.free();
  return {
    createdBy: createdBy ?? void 0,
    keyValueMetadata: normalizeMetadataValues(keyValueMetadata)
  };
}
function normalizeMetadataValues(input) {
  const normalized = {};
  for (const [key, value] of Object.entries(input)) {
    if (value === null || value === void 0) {
      normalized[key] = "";
      continue;
    }
    normalized[key] = typeof value === "string" ? value : String(value);
  }
  return normalized;
}
async function bufferStdinToTempFile(filenameHint = "stdin.parquet") {
  const tempDir = await fs.mkdtemp(path.join(tmpdir(), "parquetlens-"));
  const safeName = filenameHint.replace(/[\\/]/g, "_");
  const filePath = path.join(tempDir, `${randomUUID()}-${safeName}`);
  const writeStream = createWriteStream(filePath);
  await pipeline(process.stdin, writeStream);
  return {
    path: filePath,
    cleanup: async () => {
      await fs.rm(tempDir, { recursive: true, force: true });
    }
  };
}
async function readParquetTableFromStdin(filenameHint = "stdin.parquet", options) {
  const temp = await bufferStdinToTempFile(filenameHint);
  try {
    return await readParquetTableFromPath(temp.path, options);
  } finally {
    await temp.cleanup();
  }
}

export {
  readParquetTableFromBuffer,
  openParquetBufferFromPath,
  readParquetTableFromPath,
  readParquetTableFromStdin
};
//# sourceMappingURL=chunk-2RGMZZ7F.js.map
package/dist/chunk-2RGMZZ7F.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"sources":["../../../packages/parquet-reader/src/index.ts"],"sourcesContent":["import { Blob as NodeBlob } from \"node:buffer\";\nimport { createWriteStream, readFileSync } from \"node:fs\";\nimport { promises as fs } from \"node:fs\";\nimport { randomUUID } from \"node:crypto\";\nimport { tmpdir } from \"node:os\";\nimport path from \"node:path\";\nimport { pipeline } from \"node:stream/promises\";\n\nimport { tableFromIPC, Table } from \"apache-arrow\";\nimport { initSync, ParquetFile, readParquet, type ReaderOptions } from \"parquet-wasm/esm\";\n\nconst BlobCtor: typeof Blob =\n typeof Blob === \"undefined\" ? (NodeBlob as unknown as typeof Blob) : Blob;\n\nlet wasmInitialized = false;\n\nfunction findWasmFile(startDir: string): string {\n let dir = startDir;\n while (dir !== path.dirname(dir)) {\n const wasmPath = path.join(dir, \"node_modules\", \"parquet-wasm\", \"esm\", \"parquet_wasm_bg.wasm\");\n try {\n readFileSync(wasmPath, { flag: \"r\" });\n return wasmPath;\n } catch {\n dir = path.dirname(dir);\n }\n }\n throw new Error(\"Could not find parquet-wasm WASM file\");\n}\n\nfunction ensureWasmInitialized(): void {\n if (wasmInitialized) return;\n\n // Use process.cwd() as starting point to find node_modules\n const wasmPath = findWasmFile(process.cwd());\n const wasmBytes = readFileSync(wasmPath);\n initSync({ module: wasmBytes });\n wasmInitialized = true;\n}\n\nexport type TempParquetFile = {\n path: string;\n cleanup: () => Promise<void>;\n};\n\nexport type ParquetReadOptions = Pick<\n ReaderOptions,\n \"batchSize\" | \"columns\" | \"limit\" | \"offset\" | \"rowGroups\"\n>;\n\nexport type ParquetFileMetadata = {\n createdBy?: string;\n keyValueMetadata: Record<string, string>;\n};\n\nexport type ParquetBufferSource = {\n buffer: Uint8Array;\n byteLength: number;\n readTable: (options?: ParquetReadOptions) => Promise<Table>;\n readMetadata: () => Promise<ParquetFileMetadata>;\n};\n\nexport async function readParquetTableFromBuffer(\n buffer: Uint8Array,\n options?: ParquetReadOptions,\n): Promise<Table> {\n ensureWasmInitialized();\n const wasmTable = readParquet(buffer, options ?? 
undefined);\n const ipcStream = wasmTable.intoIPCStream();\n return tableFromIPC(ipcStream);\n}\n\nexport function createParquetBufferSource(buffer: Uint8Array): ParquetBufferSource {\n let metadataPromise: Promise<ParquetFileMetadata> | null = null;\n\n return {\n buffer,\n byteLength: buffer.byteLength,\n readTable: (options?: ParquetReadOptions) => readParquetTableFromBuffer(buffer, options),\n readMetadata: () => {\n if (!metadataPromise) {\n metadataPromise = readParquetMetadataFromBuffer(buffer);\n }\n return metadataPromise;\n },\n };\n}\n\nexport async function openParquetBufferFromPath(filePath: string): Promise<ParquetBufferSource> {\n const buffer = await fs.readFile(filePath);\n return createParquetBufferSource(buffer);\n}\n\nexport async function readParquetTableFromPath(\n filePath: string,\n options?: ParquetReadOptions,\n): Promise<Table> {\n const buffer = await fs.readFile(filePath);\n return readParquetTableFromBuffer(buffer, options);\n}\n\nexport async function readParquetMetadataFromBuffer(\n buffer: Uint8Array,\n): Promise<ParquetFileMetadata> {\n ensureWasmInitialized();\n const blobInput = new Uint8Array(buffer).buffer as ArrayBuffer;\n const file = await ParquetFile.fromFile(new BlobCtor([blobInput]));\n const meta = file.metadata();\n const fileMeta = meta.fileMetadata();\n const createdBy = fileMeta.createdBy();\n const keyValueMetadata = Object.fromEntries(fileMeta.keyValueMetadata());\n\n fileMeta.free();\n meta.free();\n file.free();\n\n return {\n createdBy: createdBy ?? undefined,\n keyValueMetadata: normalizeMetadataValues(keyValueMetadata),\n };\n}\n\nfunction normalizeMetadataValues(input: Record<string, unknown>): Record<string, string> {\n const normalized: Record<string, string> = {};\n\n for (const [key, value] of Object.entries(input)) {\n if (value === null || value === undefined) {\n normalized[key] = \"\";\n continue;\n }\n normalized[key] = typeof value === \"string\" ? 
value : String(value);\n }\n\n return normalized;\n}\n\nexport async function bufferStdinToTempFile(\n filenameHint = \"stdin.parquet\",\n): Promise<TempParquetFile> {\n const tempDir = await fs.mkdtemp(path.join(tmpdir(), \"parquetlens-\"));\n const safeName = filenameHint.replace(/[\\\\/]/g, \"_\");\n const filePath = path.join(tempDir, `${randomUUID()}-${safeName}`);\n const writeStream = createWriteStream(filePath);\n\n await pipeline(process.stdin, writeStream);\n\n return {\n path: filePath,\n cleanup: async () => {\n await fs.rm(tempDir, { recursive: true, force: true });\n },\n };\n}\n\nexport async function readParquetTableFromStdin(\n filenameHint = \"stdin.parquet\",\n options?: ParquetReadOptions,\n): Promise<Table> {\n const temp = await bufferStdinToTempFile(filenameHint);\n\n try {\n return await readParquetTableFromPath(temp.path, options);\n } finally {\n await temp.cleanup();\n }\n}\n"],"mappings":";;;;;;;;AAAA,SAAS,QAAQ,gBAAgB;AACjC,SAAS,mBAAmB,oBAAoB;AAChD,SAAS,YAAY,UAAU;AAC/B,SAAS,kBAAkB;AAC3B,SAAS,cAAc;AACvB,OAAO,UAAU;AACjB,SAAS,gBAAgB;AAEzB,SAAS,oBAA2B;AACpC,SAAS,UAAU,aAAa,mBAAuC;AAEvE,IAAM,WACJ,OAAO,SAAS,cAAe,WAAsC;AAEvE,IAAI,kBAAkB;AAEtB,SAAS,aAAa,UAA0B;AAC9C,MAAI,MAAM;AACV,SAAO,QAAQ,KAAK,QAAQ,GAAG,GAAG;AAChC,UAAM,WAAW,KAAK,KAAK,KAAK,gBAAgB,gBAAgB,OAAO,sBAAsB;AAC7F,QAAI;AACF,mBAAa,UAAU,EAAE,MAAM,IAAI,CAAC;AACpC,aAAO;IACT,QAAQ;AACN,YAAM,KAAK,QAAQ,GAAG;IACxB;EACF;AACA,QAAM,IAAI,MAAM,uCAAuC;AACzD;AAEA,SAAS,wBAA8B;AACrC,MAAI,gBAAiB;AAGrB,QAAM,WAAW,aAAa,QAAQ,IAAI,CAAC;AAC3C,QAAM,YAAY,aAAa,QAAQ;AACvC,WAAS,EAAE,QAAQ,UAAU,CAAC;AAC9B,oBAAkB;AACpB;AAwBA,eAAsB,2BACpB,QACA,SACgB;AAChB,wBAAsB;AACtB,QAAM,YAAY,YAAY,QAAQ,WAAW,MAAS;AAC1D,QAAM,YAAY,UAAU,cAAc;AAC1C,SAAO,aAAa,SAAS;AAC/B;AAEO,SAAS,0BAA0B,QAAyC;AACjF,MAAI,kBAAuD;AAE3D,SAAO;IACL;IACA,YAAY,OAAO;IACnB,WAAW,CAAC,YAAiC,2BAA2B,QAAQ,OAAO;IACvF,cAAc,MAAM;AAClB,UAAI,CAAC,iBAAiB;AACpB,0BAAkB,8BAA8B,MAAM;MACxD;AACA,aAAO;IACT;EACF;AACF;AAEA,eAAsB,0BAA0B,UAAgD;AAC9F,QAAM,SAAS,MAAM,GAAG,SAAS,QAAQ;AACzC,SAAO,0BAA0B,MAAM;AACzC;AAEA,eAAsB,yBACpB,UACA,SACgB;AAChB,QAAM,SAAS,MAAM,GAAG,SAAS,QAAQ;AACzC,SAAO,2BAA2B,QAAQ,OAAO;AACnD;AAEA,eAAsB,8BACpB,QAC8B;AAC9B,wBAAsB;AACtB,QAAM,YAAY,IAAI,WAAW,MAAM,EAAE;AACzC,QAAM,OAAO,MAAM,YAAY,SAAS,IAAI,SAAS,CAAC,SAAS,CAAC,CAAC;AACjE,QAAM,OAAO,KAAK,SAAS;AAC3B,QAAM,WAAW,KAAK,aAAa;AACnC,QAAM,YAAY,SAAS,UAAU;AACrC,QAAM,mBAAmB,OAAO,YAAY,SAAS,iBAAiB,CAAC;AAEvE,WAAS,KAAK;AACd,OAAK,KAAK;AACV,OAAK,KAAK;AAEV,SAAO;IACL,WAAW,aAAa;IACxB,kBAAkB,wBAAwB,gBAAgB;EAC5D;AACF;AAEA,SAAS,wBAAwB,OAAwD;AACvF,QAAM,aAAqC,CAAC;AAE5C,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,KAAK,GAAG;AAChD,QAAI,UAAU,QAAQ,UAAU,QAAW;AACzC,iBAAW,GAAG,IAAI;AAClB;IACF;AACA,eAAW,GAAG,IAAI,OAAO,UAAU,WAAW,QAAQ,OAAO,KAAK;EACpE;AAEA,SAAO;AACT;AAEA,eAAsB,sBACpB,eAAe,iBACW;AAC1B,QAAM,UAAU,MAAM,GAAG,QAAQ,KAAK,KAAK,OAAO,GAAG,cAAc,CAAC;AACpE,QAAM,WAAW,aAAa,QAAQ,UAAU,GAAG;AACnD,QAAM,WAAW,KAAK,KAAK,SAAS,GAAG,WAAW,CAAC,IAAI,QAAQ,EAAE;AACjE,QAAM,cAAc,kBAAkB,QAAQ;AAE9C,QAAM,SAAS,QAAQ,OAAO,WAAW;AAEzC,SAAO;IACL,MAAM;IACN,SAAS,YAAY;AACnB,YAAM,GAAG,GAAG,SAAS,EAAE,WAAW,MAAM,OAAO,KAAK,CAAC;IACvD;EACF;AACF;AAEA,eAAsB,0BACpB,eAAe,iBACf,SACgB;AAChB,QAAM,OAAO,MAAM,sBAAsB,YAAY;AAErD,MAAI;AACF,WAAO,MAAM,yBAAyB,KAAK,MAAM,OAAO;EAC1D,UAAA;AACE,UAAM,KAAK,QAAQ;EACrB;AACF;","names":[]}
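The chunk just above is a bundled copy of an internal `parquet-reader` workspace package; its exports are not a documented public API. As a reading aid, a hypothetical sketch of how the exports fit together (the import path, file name, and column names are illustrative; the options shape follows the `ParquetReadOptions` type embedded in the source map):

```js
// Hypothetical usage of the internal chunk exports; not a supported API.
import {
  openParquetBufferFromPath,
  readParquetTableFromPath
} from "./chunk-2RGMZZ7F.js";

// One-shot read: first 20 rows of two columns as an apache-arrow Table.
const table = await readParquetTableFromPath("data.parquet", {
  limit: 20,
  columns: ["city", "state"]
});
console.log(table.numRows);

// Buffered source: the same bytes reused for row reads and memoized metadata.
const source = await openParquetBufferFromPath("data.parquet");
const meta = await source.readMetadata(); // cached via metadataPromise
console.log(meta.createdBy, meta.keyValueMetadata);
```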
package/dist/chunk-3N45GGD2.js
ADDED
@@ -0,0 +1,113 @@
import { createRequire } from 'module';
import { fileURLToPath } from 'url';
import { dirname } from 'path';
const require = createRequire(import.meta.url);
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ../../packages/parquet-reader/dist/index.js
import { Blob as NodeBlob } from "buffer";
import { createWriteStream } from "fs";
import { promises as fs } from "fs";
import { randomUUID } from "crypto";
import { tmpdir } from "os";
import path from "path";
import { pipeline } from "stream/promises";
import { tableFromIPC } from "apache-arrow";
import initWasm, { ParquetFile, readParquet } from "parquet-wasm/esm";
var BlobCtor = typeof Blob === "undefined" ? NodeBlob : Blob;
var wasmInitialized = false;
var wasmInitPromise = null;
async function ensureWasmInitialized() {
  if (wasmInitialized) return;
  if (!wasmInitPromise) {
    wasmInitPromise = initWasm().then(() => {
      wasmInitialized = true;
    });
  }
  return wasmInitPromise;
}
async function readParquetTableFromBuffer(buffer, options) {
  await ensureWasmInitialized();
  const wasmTable = readParquet(buffer, options ?? void 0);
  const ipcStream = wasmTable.intoIPCStream();
  return tableFromIPC(ipcStream);
}
function createParquetBufferSource(buffer) {
  let metadataPromise = null;
  return {
    buffer,
    byteLength: buffer.byteLength,
    readTable: (options) => readParquetTableFromBuffer(buffer, options),
    readMetadata: () => {
      if (!metadataPromise) {
        metadataPromise = readParquetMetadataFromBuffer(buffer);
      }
      return metadataPromise;
    }
  };
}
async function openParquetBufferFromPath(filePath) {
  const buffer = await fs.readFile(filePath);
  return createParquetBufferSource(buffer);
}
async function readParquetTableFromPath(filePath, options) {
  const buffer = await fs.readFile(filePath);
  return readParquetTableFromBuffer(buffer, options);
}
async function readParquetMetadataFromBuffer(buffer) {
  await ensureWasmInitialized();
  const blobInput = new Uint8Array(buffer).buffer;
  const file = await ParquetFile.fromFile(new BlobCtor([blobInput]));
  const meta = file.metadata();
  const fileMeta = meta.fileMetadata();
  const createdBy = fileMeta.createdBy();
  const keyValueMetadata = Object.fromEntries(fileMeta.keyValueMetadata());
  fileMeta.free();
  meta.free();
  file.free();
  return {
    createdBy: createdBy ?? void 0,
    keyValueMetadata: normalizeMetadataValues(keyValueMetadata)
  };
}
function normalizeMetadataValues(input) {
  const normalized = {};
  for (const [key, value] of Object.entries(input)) {
    if (value === null || value === void 0) {
      normalized[key] = "";
      continue;
    }
    normalized[key] = typeof value === "string" ? value : String(value);
  }
  return normalized;
}
async function bufferStdinToTempFile(filenameHint = "stdin.parquet") {
  const tempDir = await fs.mkdtemp(path.join(tmpdir(), "parquetlens-"));
  const safeName = filenameHint.replace(/[\\/]/g, "_");
  const filePath = path.join(tempDir, `${randomUUID()}-${safeName}`);
  const writeStream = createWriteStream(filePath);
  await pipeline(process.stdin, writeStream);
  return {
    path: filePath,
    cleanup: async () => {
      await fs.rm(tempDir, { recursive: true, force: true });
    }
  };
}
async function readParquetTableFromStdin(filenameHint = "stdin.parquet", options) {
  const temp = await bufferStdinToTempFile(filenameHint);
  try {
    return await readParquetTableFromPath(temp.path, options);
  } finally {
    await temp.cleanup();
  }
}

export {
  readParquetTableFromBuffer,
  openParquetBufferFromPath,
  readParquetTableFromPath,
  readParquetTableFromStdin
};
//# sourceMappingURL=chunk-3N45GGD2.js.map
package/dist/chunk-3N45GGD2.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"sources":["../../../packages/parquet-reader/src/index.ts"],"sourcesContent":["import { Blob as NodeBlob } from \"node:buffer\";\nimport { createWriteStream } from \"node:fs\";\nimport { promises as fs } from \"node:fs\";\nimport { randomUUID } from \"node:crypto\";\nimport { tmpdir } from \"node:os\";\nimport path from \"node:path\";\nimport { pipeline } from \"node:stream/promises\";\n\nimport { tableFromIPC, Table } from \"apache-arrow\";\nimport initWasm, { ParquetFile, readParquet, type ReaderOptions } from \"parquet-wasm/esm\";\n\nconst BlobCtor: typeof Blob =\n typeof Blob === \"undefined\" ? (NodeBlob as unknown as typeof Blob) : Blob;\n\nlet wasmInitialized = false;\nlet wasmInitPromise: Promise<void> | null = null;\n\nasync function ensureWasmInitialized(): Promise<void> {\n if (wasmInitialized) return;\n if (!wasmInitPromise) {\n wasmInitPromise = initWasm().then(() => {\n wasmInitialized = true;\n });\n }\n return wasmInitPromise;\n}\n\nexport type TempParquetFile = {\n path: string;\n cleanup: () => Promise<void>;\n};\n\nexport type ParquetReadOptions = Pick<\n ReaderOptions,\n \"batchSize\" | \"columns\" | \"limit\" | \"offset\" | \"rowGroups\"\n>;\n\nexport type ParquetFileMetadata = {\n createdBy?: string;\n keyValueMetadata: Record<string, string>;\n};\n\nexport type ParquetBufferSource = {\n buffer: Uint8Array;\n byteLength: number;\n readTable: (options?: ParquetReadOptions) => Promise<Table>;\n readMetadata: () => Promise<ParquetFileMetadata>;\n};\n\nexport async function readParquetTableFromBuffer(\n buffer: Uint8Array,\n options?: ParquetReadOptions,\n): Promise<Table> {\n await ensureWasmInitialized();\n const wasmTable = readParquet(buffer, options ?? undefined);\n const ipcStream = wasmTable.intoIPCStream();\n return tableFromIPC(ipcStream);\n}\n\nexport function createParquetBufferSource(buffer: Uint8Array): ParquetBufferSource {\n let metadataPromise: Promise<ParquetFileMetadata> | null = null;\n\n return {\n buffer,\n byteLength: buffer.byteLength,\n readTable: (options?: ParquetReadOptions) => readParquetTableFromBuffer(buffer, options),\n readMetadata: () => {\n if (!metadataPromise) {\n metadataPromise = readParquetMetadataFromBuffer(buffer);\n }\n return metadataPromise;\n },\n };\n}\n\nexport async function openParquetBufferFromPath(filePath: string): Promise<ParquetBufferSource> {\n const buffer = await fs.readFile(filePath);\n return createParquetBufferSource(buffer);\n}\n\nexport async function readParquetTableFromPath(\n filePath: string,\n options?: ParquetReadOptions,\n): Promise<Table> {\n const buffer = await fs.readFile(filePath);\n return readParquetTableFromBuffer(buffer, options);\n}\n\nexport async function readParquetMetadataFromBuffer(\n buffer: Uint8Array,\n): Promise<ParquetFileMetadata> {\n await ensureWasmInitialized();\n const blobInput = new Uint8Array(buffer).buffer as ArrayBuffer;\n const file = await ParquetFile.fromFile(new BlobCtor([blobInput]));\n const meta = file.metadata();\n const fileMeta = meta.fileMetadata();\n const createdBy = fileMeta.createdBy();\n const keyValueMetadata = Object.fromEntries(fileMeta.keyValueMetadata());\n\n fileMeta.free();\n meta.free();\n file.free();\n\n return {\n createdBy: createdBy ?? 
undefined,\n keyValueMetadata: normalizeMetadataValues(keyValueMetadata),\n };\n}\n\nfunction normalizeMetadataValues(input: Record<string, unknown>): Record<string, string> {\n const normalized: Record<string, string> = {};\n\n for (const [key, value] of Object.entries(input)) {\n if (value === null || value === undefined) {\n normalized[key] = \"\";\n continue;\n }\n normalized[key] = typeof value === \"string\" ? value : String(value);\n }\n\n return normalized;\n}\n\nexport async function bufferStdinToTempFile(\n filenameHint = \"stdin.parquet\",\n): Promise<TempParquetFile> {\n const tempDir = await fs.mkdtemp(path.join(tmpdir(), \"parquetlens-\"));\n const safeName = filenameHint.replace(/[\\\\/]/g, \"_\");\n const filePath = path.join(tempDir, `${randomUUID()}-${safeName}`);\n const writeStream = createWriteStream(filePath);\n\n await pipeline(process.stdin, writeStream);\n\n return {\n path: filePath,\n cleanup: async () => {\n await fs.rm(tempDir, { recursive: true, force: true });\n },\n };\n}\n\nexport async function readParquetTableFromStdin(\n filenameHint = \"stdin.parquet\",\n options?: ParquetReadOptions,\n): Promise<Table> {\n const temp = await bufferStdinToTempFile(filenameHint);\n\n try {\n return await readParquetTableFromPath(temp.path, options);\n } finally {\n await temp.cleanup();\n }\n}\n"],"mappings":";;;;;;;;AAAA,SAAS,QAAQ,gBAAgB;AACjC,SAAS,yBAAyB;AAClC,SAAS,YAAY,UAAU;AAC/B,SAAS,kBAAkB;AAC3B,SAAS,cAAc;AACvB,OAAO,UAAU;AACjB,SAAS,gBAAgB;AAEzB,SAAS,oBAA2B;AACpC,OAAO,YAAY,aAAa,mBAAuC;AAEvE,IAAM,WACJ,OAAO,SAAS,cAAe,WAAsC;AAEvE,IAAI,kBAAkB;AACtB,IAAI,kBAAwC;AAE5C,eAAe,wBAAuC;AACpD,MAAI,gBAAiB;AACrB,MAAI,CAAC,iBAAiB;AACpB,sBAAkB,SAAS,EAAE,KAAK,MAAM;AACtC,wBAAkB;IACpB,CAAC;EACH;AACA,SAAO;AACT;AAwBA,eAAsB,2BACpB,QACA,SACgB;AAChB,QAAM,sBAAsB;AAC5B,QAAM,YAAY,YAAY,QAAQ,WAAW,MAAS;AAC1D,QAAM,YAAY,UAAU,cAAc;AAC1C,SAAO,aAAa,SAAS;AAC/B;AAEO,SAAS,0BAA0B,QAAyC;AACjF,MAAI,kBAAuD;AAE3D,SAAO;IACL;IACA,YAAY,OAAO;IACnB,WAAW,CAAC,YAAiC,2BAA2B,QAAQ,OAAO;IACvF,cAAc,MAAM;AAClB,UAAI,CAAC,iBAAiB;AACpB,0BAAkB,8BAA8B,MAAM;MACxD;AACA,aAAO;IACT;EACF;AACF;AAEA,eAAsB,0BAA0B,UAAgD;AAC9F,QAAM,SAAS,MAAM,GAAG,SAAS,QAAQ;AACzC,SAAO,0BAA0B,MAAM;AACzC;AAEA,eAAsB,yBACpB,UACA,SACgB;AAChB,QAAM,SAAS,MAAM,GAAG,SAAS,QAAQ;AACzC,SAAO,2BAA2B,QAAQ,OAAO;AACnD;AAEA,eAAsB,8BACpB,QAC8B;AAC9B,QAAM,sBAAsB;AAC5B,QAAM,YAAY,IAAI,WAAW,MAAM,EAAE;AACzC,QAAM,OAAO,MAAM,YAAY,SAAS,IAAI,SAAS,CAAC,SAAS,CAAC,CAAC;AACjE,QAAM,OAAO,KAAK,SAAS;AAC3B,QAAM,WAAW,KAAK,aAAa;AACnC,QAAM,YAAY,SAAS,UAAU;AACrC,QAAM,mBAAmB,OAAO,YAAY,SAAS,iBAAiB,CAAC;AAEvE,WAAS,KAAK;AACd,OAAK,KAAK;AACV,OAAK,KAAK;AAEV,SAAO;IACL,WAAW,aAAa;IACxB,kBAAkB,wBAAwB,gBAAgB;EAC5D;AACF;AAEA,SAAS,wBAAwB,OAAwD;AACvF,QAAM,aAAqC,CAAC;AAE5C,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,KAAK,GAAG;AAChD,QAAI,UAAU,QAAQ,UAAU,QAAW;AACzC,iBAAW,GAAG,IAAI;AAClB;IACF;AACA,eAAW,GAAG,IAAI,OAAO,UAAU,WAAW,QAAQ,OAAO,KAAK;EACpE;AAEA,SAAO;AACT;AAEA,eAAsB,sBACpB,eAAe,iBACW;AAC1B,QAAM,UAAU,MAAM,GAAG,QAAQ,KAAK,KAAK,OAAO,GAAG,cAAc,CAAC;AACpE,QAAM,WAAW,aAAa,QAAQ,UAAU,GAAG;AACnD,QAAM,WAAW,KAAK,KAAK,SAAS,GAAG,WAAW,CAAC,IAAI,QAAQ,EAAE;AACjE,QAAM,cAAc,kBAAkB,QAAQ;AAE9C,QAAM,SAAS,QAAQ,OAAO,WAAW;AAEzC,SAAO;IACL,MAAM;IACN,SAAS,YAAY;AACnB,YAAM,GAAG,GAAG,SAAS,EAAE,WAAW,MAAM,OAAO,KAAK,CAAC;IACvD;EACF;AACF;AAEA,eAAsB,0BACpB,eAAe,iBACf,SACgB;AAChB,QAAM,OAAO,MAAM,sBAAsB,YAAY;AAErD,MAAI;AACF,WAAO,MAAM,yBAAyB,KAAK,MAAM,OAAO;EAC1D,UAAA;AACE,UAAM,KAAK,QAAQ;EACrB;AACF;","names":[]}
package/dist/chunk-AYPIRAOL.js
ADDED
@@ -0,0 +1,112 @@
import { createRequire } from 'module';
import { fileURLToPath } from 'url';
import { dirname } from 'path';
const require = createRequire(import.meta.url);
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ../../packages/parquet-reader/dist/index.js
import { Blob as NodeBlob } from "buffer";
import { createWriteStream, readFileSync } from "fs";
import { promises as fs } from "fs";
import { randomUUID } from "crypto";
import { createRequire as nodeCreateRequire } from "module";
import { tmpdir } from "os";
import path from "path";
import { pipeline } from "stream/promises";
import { tableFromIPC } from "apache-arrow";
import { initSync, ParquetFile, readParquet } from "parquet-wasm/esm";
var BlobCtor = typeof Blob === "undefined" ? NodeBlob : Blob;
var wasmInitialized = false;
function ensureWasmInitialized() {
  if (wasmInitialized) return;
  const localRequire = nodeCreateRequire(import.meta.url);
  const wasmPath = localRequire.resolve("parquet-wasm/esm/parquet_wasm_bg.wasm");
  const wasmBytes = readFileSync(wasmPath);
  initSync({ module: wasmBytes });
  wasmInitialized = true;
}
async function readParquetTableFromBuffer(buffer, options) {
  ensureWasmInitialized();
  const wasmTable = readParquet(buffer, options ?? void 0);
  const ipcStream = wasmTable.intoIPCStream();
  return tableFromIPC(ipcStream);
}
function createParquetBufferSource(buffer) {
  let metadataPromise = null;
  return {
    buffer,
    byteLength: buffer.byteLength,
    readTable: (options) => readParquetTableFromBuffer(buffer, options),
    readMetadata: () => {
      if (!metadataPromise) {
        metadataPromise = readParquetMetadataFromBuffer(buffer);
      }
      return metadataPromise;
    }
  };
}
async function openParquetBufferFromPath(filePath) {
  const buffer = await fs.readFile(filePath);
  return createParquetBufferSource(buffer);
}
async function readParquetTableFromPath(filePath, options) {
  const buffer = await fs.readFile(filePath);
  return readParquetTableFromBuffer(buffer, options);
}
async function readParquetMetadataFromBuffer(buffer) {
  ensureWasmInitialized();
  const blobInput = new Uint8Array(buffer).buffer;
  const file = await ParquetFile.fromFile(new BlobCtor([blobInput]));
  const meta = file.metadata();
  const fileMeta = meta.fileMetadata();
  const createdBy = fileMeta.createdBy();
  const keyValueMetadata = Object.fromEntries(fileMeta.keyValueMetadata());
  fileMeta.free();
  meta.free();
  file.free();
  return {
    createdBy: createdBy ?? void 0,
    keyValueMetadata: normalizeMetadataValues(keyValueMetadata)
  };
}
function normalizeMetadataValues(input) {
  const normalized = {};
  for (const [key, value] of Object.entries(input)) {
    if (value === null || value === void 0) {
      normalized[key] = "";
      continue;
    }
    normalized[key] = typeof value === "string" ? value : String(value);
  }
  return normalized;
}
async function bufferStdinToTempFile(filenameHint = "stdin.parquet") {
  const tempDir = await fs.mkdtemp(path.join(tmpdir(), "parquetlens-"));
  const safeName = filenameHint.replace(/[\\/]/g, "_");
  const filePath = path.join(tempDir, `${randomUUID()}-${safeName}`);
  const writeStream = createWriteStream(filePath);
  await pipeline(process.stdin, writeStream);
  return {
    path: filePath,
    cleanup: async () => {
      await fs.rm(tempDir, { recursive: true, force: true });
    }
  };
}
async function readParquetTableFromStdin(filenameHint = "stdin.parquet", options) {
  const temp = await bufferStdinToTempFile(filenameHint);
  try {
    return await readParquetTableFromPath(temp.path, options);
  } finally {
    await temp.cleanup();
  }
}

export {
  readParquetTableFromBuffer,
  openParquetBufferFromPath,
  readParquetTableFromPath,
  readParquetTableFromStdin
};
//# sourceMappingURL=chunk-AYPIRAOL.js.map
package/dist/chunk-AYPIRAOL.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"sources":["../../../packages/parquet-reader/src/index.ts"],"sourcesContent":["import { Blob as NodeBlob } from \"node:buffer\";\nimport { createWriteStream, readFileSync } from \"node:fs\";\nimport { promises as fs } from \"node:fs\";\nimport { randomUUID } from \"node:crypto\";\nimport { createRequire as nodeCreateRequire } from \"node:module\";\nimport { tmpdir } from \"node:os\";\nimport path from \"node:path\";\nimport { pipeline } from \"node:stream/promises\";\n\nimport { tableFromIPC, Table } from \"apache-arrow\";\nimport { initSync, ParquetFile, readParquet, type ReaderOptions } from \"parquet-wasm/esm\";\n\nconst BlobCtor: typeof Blob =\n typeof Blob === \"undefined\" ? (NodeBlob as unknown as typeof Blob) : Blob;\n\nlet wasmInitialized = false;\n\nfunction ensureWasmInitialized(): void {\n if (wasmInitialized) return;\n\n // Use require.resolve to find the WASM file (works with pnpm)\n const localRequire = nodeCreateRequire(import.meta.url);\n const wasmPath = localRequire.resolve(\"parquet-wasm/esm/parquet_wasm_bg.wasm\");\n const wasmBytes = readFileSync(wasmPath);\n initSync({ module: wasmBytes });\n wasmInitialized = true;\n}\n\nexport type TempParquetFile = {\n path: string;\n cleanup: () => Promise<void>;\n};\n\nexport type ParquetReadOptions = Pick<\n ReaderOptions,\n \"batchSize\" | \"columns\" | \"limit\" | \"offset\" | \"rowGroups\"\n>;\n\nexport type ParquetFileMetadata = {\n createdBy?: string;\n keyValueMetadata: Record<string, string>;\n};\n\nexport type ParquetBufferSource = {\n buffer: Uint8Array;\n byteLength: number;\n readTable: (options?: ParquetReadOptions) => Promise<Table>;\n readMetadata: () => Promise<ParquetFileMetadata>;\n};\n\nexport async function readParquetTableFromBuffer(\n buffer: Uint8Array,\n options?: ParquetReadOptions,\n): Promise<Table> {\n ensureWasmInitialized();\n const wasmTable = readParquet(buffer, options ?? undefined);\n const ipcStream = wasmTable.intoIPCStream();\n return tableFromIPC(ipcStream);\n}\n\nexport function createParquetBufferSource(buffer: Uint8Array): ParquetBufferSource {\n let metadataPromise: Promise<ParquetFileMetadata> | null = null;\n\n return {\n buffer,\n byteLength: buffer.byteLength,\n readTable: (options?: ParquetReadOptions) => readParquetTableFromBuffer(buffer, options),\n readMetadata: () => {\n if (!metadataPromise) {\n metadataPromise = readParquetMetadataFromBuffer(buffer);\n }\n return metadataPromise;\n },\n };\n}\n\nexport async function openParquetBufferFromPath(filePath: string): Promise<ParquetBufferSource> {\n const buffer = await fs.readFile(filePath);\n return createParquetBufferSource(buffer);\n}\n\nexport async function readParquetTableFromPath(\n filePath: string,\n options?: ParquetReadOptions,\n): Promise<Table> {\n const buffer = await fs.readFile(filePath);\n return readParquetTableFromBuffer(buffer, options);\n}\n\nexport async function readParquetMetadataFromBuffer(\n buffer: Uint8Array,\n): Promise<ParquetFileMetadata> {\n ensureWasmInitialized();\n const blobInput = new Uint8Array(buffer).buffer as ArrayBuffer;\n const file = await ParquetFile.fromFile(new BlobCtor([blobInput]));\n const meta = file.metadata();\n const fileMeta = meta.fileMetadata();\n const createdBy = fileMeta.createdBy();\n const keyValueMetadata = Object.fromEntries(fileMeta.keyValueMetadata());\n\n fileMeta.free();\n meta.free();\n file.free();\n\n return {\n createdBy: createdBy ?? 
undefined,\n keyValueMetadata: normalizeMetadataValues(keyValueMetadata),\n };\n}\n\nfunction normalizeMetadataValues(input: Record<string, unknown>): Record<string, string> {\n const normalized: Record<string, string> = {};\n\n for (const [key, value] of Object.entries(input)) {\n if (value === null || value === undefined) {\n normalized[key] = \"\";\n continue;\n }\n normalized[key] = typeof value === \"string\" ? value : String(value);\n }\n\n return normalized;\n}\n\nexport async function bufferStdinToTempFile(\n filenameHint = \"stdin.parquet\",\n): Promise<TempParquetFile> {\n const tempDir = await fs.mkdtemp(path.join(tmpdir(), \"parquetlens-\"));\n const safeName = filenameHint.replace(/[\\\\/]/g, \"_\");\n const filePath = path.join(tempDir, `${randomUUID()}-${safeName}`);\n const writeStream = createWriteStream(filePath);\n\n await pipeline(process.stdin, writeStream);\n\n return {\n path: filePath,\n cleanup: async () => {\n await fs.rm(tempDir, { recursive: true, force: true });\n },\n };\n}\n\nexport async function readParquetTableFromStdin(\n filenameHint = \"stdin.parquet\",\n options?: ParquetReadOptions,\n): Promise<Table> {\n const temp = await bufferStdinToTempFile(filenameHint);\n\n try {\n return await readParquetTableFromPath(temp.path, options);\n } finally {\n await temp.cleanup();\n }\n}\n"],"mappings":";;;;;;;;AAAA,SAAS,QAAQ,gBAAgB;AACjC,SAAS,mBAAmB,oBAAoB;AAChD,SAAS,YAAY,UAAU;AAC/B,SAAS,kBAAkB;AAC3B,SAAS,iBAAiB,yBAAyB;AACnD,SAAS,cAAc;AACvB,OAAO,UAAU;AACjB,SAAS,gBAAgB;AAEzB,SAAS,oBAA2B;AACpC,SAAS,UAAU,aAAa,mBAAuC;AAEvE,IAAM,WACJ,OAAO,SAAS,cAAe,WAAsC;AAEvE,IAAI,kBAAkB;AAEtB,SAAS,wBAA8B;AACrC,MAAI,gBAAiB;AAGrB,QAAM,eAAe,kBAAkB,YAAY,GAAG;AACtD,QAAM,WAAW,aAAa,QAAQ,uCAAuC;AAC7E,QAAM,YAAY,aAAa,QAAQ;AACvC,WAAS,EAAE,QAAQ,UAAU,CAAC;AAC9B,oBAAkB;AACpB;AAwBA,eAAsB,2BACpB,QACA,SACgB;AAChB,wBAAsB;AACtB,QAAM,YAAY,YAAY,QAAQ,WAAW,MAAS;AAC1D,QAAM,YAAY,UAAU,cAAc;AAC1C,SAAO,aAAa,SAAS;AAC/B;AAEO,SAAS,0BAA0B,QAAyC;AACjF,MAAI,kBAAuD;AAE3D,SAAO;IACL;IACA,YAAY,OAAO;IACnB,WAAW,CAAC,YAAiC,2BAA2B,QAAQ,OAAO;IACvF,cAAc,MAAM;AAClB,UAAI,CAAC,iBAAiB;AACpB,0BAAkB,8BAA8B,MAAM;MACxD;AACA,aAAO;IACT;EACF;AACF;AAEA,eAAsB,0BAA0B,UAAgD;AAC9F,QAAM,SAAS,MAAM,GAAG,SAAS,QAAQ;AACzC,SAAO,0BAA0B,MAAM;AACzC;AAEA,eAAsB,yBACpB,UACA,SACgB;AAChB,QAAM,SAAS,MAAM,GAAG,SAAS,QAAQ;AACzC,SAAO,2BAA2B,QAAQ,OAAO;AACnD;AAEA,eAAsB,8BACpB,QAC8B;AAC9B,wBAAsB;AACtB,QAAM,YAAY,IAAI,WAAW,MAAM,EAAE;AACzC,QAAM,OAAO,MAAM,YAAY,SAAS,IAAI,SAAS,CAAC,SAAS,CAAC,CAAC;AACjE,QAAM,OAAO,KAAK,SAAS;AAC3B,QAAM,WAAW,KAAK,aAAa;AACnC,QAAM,YAAY,SAAS,UAAU;AACrC,QAAM,mBAAmB,OAAO,YAAY,SAAS,iBAAiB,CAAC;AAEvE,WAAS,KAAK;AACd,OAAK,KAAK;AACV,OAAK,KAAK;AAEV,SAAO;IACL,WAAW,aAAa;IACxB,kBAAkB,wBAAwB,gBAAgB;EAC5D;AACF;AAEA,SAAS,wBAAwB,OAAwD;AACvF,QAAM,aAAqC,CAAC;AAE5C,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,KAAK,GAAG;AAChD,QAAI,UAAU,QAAQ,UAAU,QAAW;AACzC,iBAAW,GAAG,IAAI;AAClB;IACF;AACA,eAAW,GAAG,IAAI,OAAO,UAAU,WAAW,QAAQ,OAAO,KAAK;EACpE;AAEA,SAAO;AACT;AAEA,eAAsB,sBACpB,eAAe,iBACW;AAC1B,QAAM,UAAU,MAAM,GAAG,QAAQ,KAAK,KAAK,OAAO,GAAG,cAAc,CAAC;AACpE,QAAM,WAAW,aAAa,QAAQ,UAAU,GAAG;AACnD,QAAM,WAAW,KAAK,KAAK,SAAS,GAAG,WAAW,CAAC,IAAI,QAAQ,EAAE;AACjE,QAAM,cAAc,kBAAkB,QAAQ;AAE9C,QAAM,SAAS,QAAQ,OAAO,WAAW;AAEzC,SAAO;IACL,MAAM;IACN,SAAS,YAAY;AACnB,YAAM,GAAG,GAAG,SAAS,EAAE,WAAW,MAAM,OAAO,KAAK,CAAC;IACvD;EACF;AACF;AAEA,eAAsB,0BACpB,eAAe,iBACf,SACgB;AAChB,QAAM,OAAO,MAAM,sBAAsB,YAAY;AAErD,MAAI;AACF,WAAO,MAAM,yBAAyB,KAAK,MAAM,OAAO;EAC1D,UAAA;AACE,UAAM,KAAK,QAAQ;EACrB;AACF;","names":[]}
package/dist/chunk-IMVXDI4K.js
ADDED
@@ -0,0 +1,112 @@
import { createRequire } from 'module';
import { fileURLToPath } from 'url';
import { dirname } from 'path';
const require = createRequire(import.meta.url);
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ../../packages/parquet-reader/dist/index.js
import { Blob as NodeBlob } from "buffer";
import { createWriteStream, readFileSync } from "fs";
import { promises as fs } from "fs";
import { randomUUID } from "crypto";
import { createRequire } from "module";
import { tmpdir } from "os";
import path from "path";
import { pipeline } from "stream/promises";
import { tableFromIPC } from "apache-arrow";
import { initSync, ParquetFile, readParquet } from "parquet-wasm/esm";
var BlobCtor = typeof Blob === "undefined" ? NodeBlob : Blob;
var wasmInitialized = false;
function ensureWasmInitialized() {
  if (wasmInitialized) return;
  const require2 = createRequire(import.meta.url);
  const wasmPath = require2.resolve("parquet-wasm/esm/parquet_wasm_bg.wasm");
  const wasmBytes = readFileSync(wasmPath);
  initSync({ module: wasmBytes });
  wasmInitialized = true;
}
async function readParquetTableFromBuffer(buffer, options) {
  ensureWasmInitialized();
  const wasmTable = readParquet(buffer, options ?? void 0);
  const ipcStream = wasmTable.intoIPCStream();
  return tableFromIPC(ipcStream);
}
function createParquetBufferSource(buffer) {
  let metadataPromise = null;
  return {
    buffer,
    byteLength: buffer.byteLength,
    readTable: (options) => readParquetTableFromBuffer(buffer, options),
    readMetadata: () => {
      if (!metadataPromise) {
        metadataPromise = readParquetMetadataFromBuffer(buffer);
      }
      return metadataPromise;
    }
  };
}
async function openParquetBufferFromPath(filePath) {
  const buffer = await fs.readFile(filePath);
  return createParquetBufferSource(buffer);
}
async function readParquetTableFromPath(filePath, options) {
  const buffer = await fs.readFile(filePath);
  return readParquetTableFromBuffer(buffer, options);
}
async function readParquetMetadataFromBuffer(buffer) {
  ensureWasmInitialized();
  const blobInput = new Uint8Array(buffer).buffer;
  const file = await ParquetFile.fromFile(new BlobCtor([blobInput]));
  const meta = file.metadata();
  const fileMeta = meta.fileMetadata();
  const createdBy = fileMeta.createdBy();
  const keyValueMetadata = Object.fromEntries(fileMeta.keyValueMetadata());
  fileMeta.free();
  meta.free();
  file.free();
  return {
    createdBy: createdBy ?? void 0,
    keyValueMetadata: normalizeMetadataValues(keyValueMetadata)
  };
}
function normalizeMetadataValues(input) {
  const normalized = {};
  for (const [key, value] of Object.entries(input)) {
    if (value === null || value === void 0) {
      normalized[key] = "";
      continue;
    }
    normalized[key] = typeof value === "string" ? value : String(value);
  }
  return normalized;
}
async function bufferStdinToTempFile(filenameHint = "stdin.parquet") {
  const tempDir = await fs.mkdtemp(path.join(tmpdir(), "parquetlens-"));
  const safeName = filenameHint.replace(/[\\/]/g, "_");
  const filePath = path.join(tempDir, `${randomUUID()}-${safeName}`);
  const writeStream = createWriteStream(filePath);
  await pipeline(process.stdin, writeStream);
  return {
    path: filePath,
    cleanup: async () => {
      await fs.rm(tempDir, { recursive: true, force: true });
    }
  };
}
async function readParquetTableFromStdin(filenameHint = "stdin.parquet", options) {
  const temp = await bufferStdinToTempFile(filenameHint);
  try {
    return await readParquetTableFromPath(temp.path, options);
  } finally {
    await temp.cleanup();
  }
}

export {
  readParquetTableFromBuffer,
  openParquetBufferFromPath,
  readParquetTableFromPath,
  readParquetTableFromStdin
};
//# sourceMappingURL=chunk-IMVXDI4K.js.map
package/dist/chunk-IMVXDI4K.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"sources":["../../../packages/parquet-reader/src/index.ts"],"sourcesContent":["import { Blob as NodeBlob } from \"node:buffer\";\nimport { createWriteStream, readFileSync } from \"node:fs\";\nimport { promises as fs } from \"node:fs\";\nimport { randomUUID } from \"node:crypto\";\nimport { createRequire } from \"node:module\";\nimport { tmpdir } from \"node:os\";\nimport path from \"node:path\";\nimport { pipeline } from \"node:stream/promises\";\n\nimport { tableFromIPC, Table } from \"apache-arrow\";\nimport { initSync, ParquetFile, readParquet, type ReaderOptions } from \"parquet-wasm/esm\";\n\nconst BlobCtor: typeof Blob =\n typeof Blob === \"undefined\" ? (NodeBlob as unknown as typeof Blob) : Blob;\n\nlet wasmInitialized = false;\n\nfunction ensureWasmInitialized(): void {\n if (wasmInitialized) return;\n\n const require = createRequire(import.meta.url);\n const wasmPath = require.resolve(\"parquet-wasm/esm/parquet_wasm_bg.wasm\");\n const wasmBytes = readFileSync(wasmPath);\n initSync({ module: wasmBytes });\n wasmInitialized = true;\n}\n\nexport type TempParquetFile = {\n path: string;\n cleanup: () => Promise<void>;\n};\n\nexport type ParquetReadOptions = Pick<\n ReaderOptions,\n \"batchSize\" | \"columns\" | \"limit\" | \"offset\" | \"rowGroups\"\n>;\n\nexport type ParquetFileMetadata = {\n createdBy?: string;\n keyValueMetadata: Record<string, string>;\n};\n\nexport type ParquetBufferSource = {\n buffer: Uint8Array;\n byteLength: number;\n readTable: (options?: ParquetReadOptions) => Promise<Table>;\n readMetadata: () => Promise<ParquetFileMetadata>;\n};\n\nexport async function readParquetTableFromBuffer(\n buffer: Uint8Array,\n options?: ParquetReadOptions,\n): Promise<Table> {\n ensureWasmInitialized();\n const wasmTable = readParquet(buffer, options ?? undefined);\n const ipcStream = wasmTable.intoIPCStream();\n return tableFromIPC(ipcStream);\n}\n\nexport function createParquetBufferSource(buffer: Uint8Array): ParquetBufferSource {\n let metadataPromise: Promise<ParquetFileMetadata> | null = null;\n\n return {\n buffer,\n byteLength: buffer.byteLength,\n readTable: (options?: ParquetReadOptions) => readParquetTableFromBuffer(buffer, options),\n readMetadata: () => {\n if (!metadataPromise) {\n metadataPromise = readParquetMetadataFromBuffer(buffer);\n }\n return metadataPromise;\n },\n };\n}\n\nexport async function openParquetBufferFromPath(filePath: string): Promise<ParquetBufferSource> {\n const buffer = await fs.readFile(filePath);\n return createParquetBufferSource(buffer);\n}\n\nexport async function readParquetTableFromPath(\n filePath: string,\n options?: ParquetReadOptions,\n): Promise<Table> {\n const buffer = await fs.readFile(filePath);\n return readParquetTableFromBuffer(buffer, options);\n}\n\nexport async function readParquetMetadataFromBuffer(\n buffer: Uint8Array,\n): Promise<ParquetFileMetadata> {\n ensureWasmInitialized();\n const blobInput = new Uint8Array(buffer).buffer as ArrayBuffer;\n const file = await ParquetFile.fromFile(new BlobCtor([blobInput]));\n const meta = file.metadata();\n const fileMeta = meta.fileMetadata();\n const createdBy = fileMeta.createdBy();\n const keyValueMetadata = Object.fromEntries(fileMeta.keyValueMetadata());\n\n fileMeta.free();\n meta.free();\n file.free();\n\n return {\n createdBy: createdBy ?? 
undefined,\n keyValueMetadata: normalizeMetadataValues(keyValueMetadata),\n };\n}\n\nfunction normalizeMetadataValues(input: Record<string, unknown>): Record<string, string> {\n const normalized: Record<string, string> = {};\n\n for (const [key, value] of Object.entries(input)) {\n if (value === null || value === undefined) {\n normalized[key] = \"\";\n continue;\n }\n normalized[key] = typeof value === \"string\" ? value : String(value);\n }\n\n return normalized;\n}\n\nexport async function bufferStdinToTempFile(\n filenameHint = \"stdin.parquet\",\n): Promise<TempParquetFile> {\n const tempDir = await fs.mkdtemp(path.join(tmpdir(), \"parquetlens-\"));\n const safeName = filenameHint.replace(/[\\\\/]/g, \"_\");\n const filePath = path.join(tempDir, `${randomUUID()}-${safeName}`);\n const writeStream = createWriteStream(filePath);\n\n await pipeline(process.stdin, writeStream);\n\n return {\n path: filePath,\n cleanup: async () => {\n await fs.rm(tempDir, { recursive: true, force: true });\n },\n };\n}\n\nexport async function readParquetTableFromStdin(\n filenameHint = \"stdin.parquet\",\n options?: ParquetReadOptions,\n): Promise<Table> {\n const temp = await bufferStdinToTempFile(filenameHint);\n\n try {\n return await readParquetTableFromPath(temp.path, options);\n } finally {\n await temp.cleanup();\n }\n}\n"],"mappings":";;;;;;;;AAAA,SAAS,QAAQ,gBAAgB;AACjC,SAAS,mBAAmB,oBAAoB;AAChD,SAAS,YAAY,UAAU;AAC/B,SAAS,kBAAkB;AAC3B,SAAS,qBAAqB;AAC9B,SAAS,cAAc;AACvB,OAAO,UAAU;AACjB,SAAS,gBAAgB;AAEzB,SAAS,oBAA2B;AACpC,SAAS,UAAU,aAAa,mBAAuC;AAEvE,IAAM,WACJ,OAAO,SAAS,cAAe,WAAsC;AAEvE,IAAI,kBAAkB;AAEtB,SAAS,wBAA8B;AACrC,MAAI,gBAAiB;AAErB,QAAMA,WAAU,cAAc,YAAY,GAAG;AAC7C,QAAM,WAAWA,SAAQ,QAAQ,uCAAuC;AACxE,QAAM,YAAY,aAAa,QAAQ;AACvC,WAAS,EAAE,QAAQ,UAAU,CAAC;AAC9B,oBAAkB;AACpB;AAwBA,eAAsB,2BACpB,QACA,SACgB;AAChB,wBAAsB;AACtB,QAAM,YAAY,YAAY,QAAQ,WAAW,MAAS;AAC1D,QAAM,YAAY,UAAU,cAAc;AAC1C,SAAO,aAAa,SAAS;AAC/B;AAEO,SAAS,0BAA0B,QAAyC;AACjF,MAAI,kBAAuD;AAE3D,SAAO;IACL;IACA,YAAY,OAAO;IACnB,WAAW,CAAC,YAAiC,2BAA2B,QAAQ,OAAO;IACvF,cAAc,MAAM;AAClB,UAAI,CAAC,iBAAiB;AACpB,0BAAkB,8BAA8B,MAAM;MACxD;AACA,aAAO;IACT;EACF;AACF;AAEA,eAAsB,0BAA0B,UAAgD;AAC9F,QAAM,SAAS,MAAM,GAAG,SAAS,QAAQ;AACzC,SAAO,0BAA0B,MAAM;AACzC;AAEA,eAAsB,yBACpB,UACA,SACgB;AAChB,QAAM,SAAS,MAAM,GAAG,SAAS,QAAQ;AACzC,SAAO,2BAA2B,QAAQ,OAAO;AACnD;AAEA,eAAsB,8BACpB,QAC8B;AAC9B,wBAAsB;AACtB,QAAM,YAAY,IAAI,WAAW,MAAM,EAAE;AACzC,QAAM,OAAO,MAAM,YAAY,SAAS,IAAI,SAAS,CAAC,SAAS,CAAC,CAAC;AACjE,QAAM,OAAO,KAAK,SAAS;AAC3B,QAAM,WAAW,KAAK,aAAa;AACnC,QAAM,YAAY,SAAS,UAAU;AACrC,QAAM,mBAAmB,OAAO,YAAY,SAAS,iBAAiB,CAAC;AAEvE,WAAS,KAAK;AACd,OAAK,KAAK;AACV,OAAK,KAAK;AAEV,SAAO;IACL,WAAW,aAAa;IACxB,kBAAkB,wBAAwB,gBAAgB;EAC5D;AACF;AAEA,SAAS,wBAAwB,OAAwD;AACvF,QAAM,aAAqC,CAAC;AAE5C,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,KAAK,GAAG;AAChD,QAAI,UAAU,QAAQ,UAAU,QAAW;AACzC,iBAAW,GAAG,IAAI;AAClB;IACF;AACA,eAAW,GAAG,IAAI,OAAO,UAAU,WAAW,QAAQ,OAAO,KAAK;EACpE;AAEA,SAAO;AACT;AAEA,eAAsB,sBACpB,eAAe,iBACW;AAC1B,QAAM,UAAU,MAAM,GAAG,QAAQ,KAAK,KAAK,OAAO,GAAG,cAAc,CAAC;AACpE,QAAM,WAAW,aAAa,QAAQ,UAAU,GAAG;AACnD,QAAM,WAAW,KAAK,KAAK,SAAS,GAAG,WAAW,CAAC,IAAI,QAAQ,EAAE;AACjE,QAAM,cAAc,kBAAkB,QAAQ;AAE9C,QAAM,SAAS,QAAQ,OAAO,WAAW;AAEzC,SAAO;IACL,MAAM;IACN,SAAS,YAAY;AACnB,YAAM,GAAG,GAAG,SAAS,EAAE,WAAW,MAAM,OAAO,KAAK,CAAC;IACvD;EACF;AACF;AAEA,eAAsB,0BACpB,eAAe,iBACf,SACgB;AAChB,QAAM,OAAO,MAAM,sBAAsB,YAAY;AAErD,MAAI;AACF,WAAO,MAAM,yBAAyB,KAAK,MAAM,OAAO;EAC1D,UAAA;AACE,UAAM,KAAK,QAAQ;EACrB;AACF;","names":["require"]}
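All four chunks bundle the same `parquet-reader` source and differ only in how `ensureWasmInitialized` locates and initializes the parquet-wasm binary, which appears to be the substance of this release. Condensed for comparison (the logic mirrors the chunks above; the wrapper function names are mine, not shipped code):

```js
// Side-by-side summary of the WASM-init strategies seen in the chunks above.
import { readFileSync } from "node:fs";
import { createRequire } from "node:module";
import path from "node:path";
import initWasm, { initSync } from "parquet-wasm/esm";

// chunk-2RGMZZ7F: walk upward from process.cwd(), probing each node_modules.
function initByDirectoryWalk() {
  let dir = process.cwd();
  while (dir !== path.dirname(dir)) {
    const wasmPath = path.join(dir, "node_modules", "parquet-wasm", "esm", "parquet_wasm_bg.wasm");
    let wasmBytes;
    try {
      wasmBytes = readFileSync(wasmPath); // throws if not present at this level
    } catch {
      dir = path.dirname(dir); // not here; step up one directory
      continue;
    }
    return initSync({ module: wasmBytes });
  }
  throw new Error("Could not find parquet-wasm WASM file");
}

// chunk-3N45GGD2: defer to parquet-wasm's own async loader, memoized by a promise.
async function initByDefaultLoader() {
  await initWasm();
}

// chunk-AYPIRAOL / chunk-IMVXDI4K: resolve the .wasm through the module graph;
// the embedded source map notes this "works with pnpm".
function initByRequireResolve() {
  const require = createRequire(import.meta.url);
  const wasmPath = require.resolve("parquet-wasm/esm/parquet_wasm_bg.wasm");
  initSync({ module: readFileSync(wasmPath) });
}
```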