gitx.do 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -353
- package/dist/do/logger.d.ts +50 -0
- package/dist/do/logger.d.ts.map +1 -0
- package/dist/do/logger.js +122 -0
- package/dist/do/logger.js.map +1 -0
- package/dist/{durable-object → do}/schema.d.ts +3 -3
- package/dist/do/schema.d.ts.map +1 -0
- package/dist/{durable-object → do}/schema.js +4 -3
- package/dist/do/schema.js.map +1 -0
- package/dist/do/types.d.ts +267 -0
- package/dist/do/types.d.ts.map +1 -0
- package/dist/do/types.js +62 -0
- package/dist/do/types.js.map +1 -0
- package/dist/index.d.ts +15 -415
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +31 -483
- package/dist/index.js.map +1 -1
- package/package.json +13 -21
- package/dist/cli/commands/add.d.ts +0 -174
- package/dist/cli/commands/add.d.ts.map +0 -1
- package/dist/cli/commands/add.js +0 -131
- package/dist/cli/commands/add.js.map +0 -1
- package/dist/cli/commands/blame.d.ts +0 -259
- package/dist/cli/commands/blame.d.ts.map +0 -1
- package/dist/cli/commands/blame.js +0 -609
- package/dist/cli/commands/blame.js.map +0 -1
- package/dist/cli/commands/branch.d.ts +0 -249
- package/dist/cli/commands/branch.d.ts.map +0 -1
- package/dist/cli/commands/branch.js +0 -693
- package/dist/cli/commands/branch.js.map +0 -1
- package/dist/cli/commands/commit.d.ts +0 -182
- package/dist/cli/commands/commit.d.ts.map +0 -1
- package/dist/cli/commands/commit.js +0 -437
- package/dist/cli/commands/commit.js.map +0 -1
- package/dist/cli/commands/diff.d.ts +0 -464
- package/dist/cli/commands/diff.d.ts.map +0 -1
- package/dist/cli/commands/diff.js +0 -958
- package/dist/cli/commands/diff.js.map +0 -1
- package/dist/cli/commands/log.d.ts +0 -239
- package/dist/cli/commands/log.d.ts.map +0 -1
- package/dist/cli/commands/log.js +0 -535
- package/dist/cli/commands/log.js.map +0 -1
- package/dist/cli/commands/merge.d.ts +0 -106
- package/dist/cli/commands/merge.d.ts.map +0 -1
- package/dist/cli/commands/merge.js +0 -55
- package/dist/cli/commands/merge.js.map +0 -1
- package/dist/cli/commands/review.d.ts +0 -457
- package/dist/cli/commands/review.d.ts.map +0 -1
- package/dist/cli/commands/review.js +0 -533
- package/dist/cli/commands/review.js.map +0 -1
- package/dist/cli/commands/status.d.ts +0 -269
- package/dist/cli/commands/status.d.ts.map +0 -1
- package/dist/cli/commands/status.js +0 -493
- package/dist/cli/commands/status.js.map +0 -1
- package/dist/cli/commands/web.d.ts +0 -199
- package/dist/cli/commands/web.d.ts.map +0 -1
- package/dist/cli/commands/web.js +0 -696
- package/dist/cli/commands/web.js.map +0 -1
- package/dist/cli/fs-adapter.d.ts +0 -656
- package/dist/cli/fs-adapter.d.ts.map +0 -1
- package/dist/cli/fs-adapter.js +0 -1179
- package/dist/cli/fs-adapter.js.map +0 -1
- package/dist/cli/fsx-cli-adapter.d.ts +0 -359
- package/dist/cli/fsx-cli-adapter.d.ts.map +0 -1
- package/dist/cli/fsx-cli-adapter.js +0 -619
- package/dist/cli/fsx-cli-adapter.js.map +0 -1
- package/dist/cli/index.d.ts +0 -387
- package/dist/cli/index.d.ts.map +0 -1
- package/dist/cli/index.js +0 -523
- package/dist/cli/index.js.map +0 -1
- package/dist/cli/ui/components/DiffView.d.ts +0 -7
- package/dist/cli/ui/components/DiffView.d.ts.map +0 -1
- package/dist/cli/ui/components/DiffView.js +0 -11
- package/dist/cli/ui/components/DiffView.js.map +0 -1
- package/dist/cli/ui/components/ErrorDisplay.d.ts +0 -6
- package/dist/cli/ui/components/ErrorDisplay.d.ts.map +0 -1
- package/dist/cli/ui/components/ErrorDisplay.js +0 -11
- package/dist/cli/ui/components/ErrorDisplay.js.map +0 -1
- package/dist/cli/ui/components/FuzzySearch.d.ts +0 -9
- package/dist/cli/ui/components/FuzzySearch.d.ts.map +0 -1
- package/dist/cli/ui/components/FuzzySearch.js +0 -12
- package/dist/cli/ui/components/FuzzySearch.js.map +0 -1
- package/dist/cli/ui/components/LoadingSpinner.d.ts +0 -6
- package/dist/cli/ui/components/LoadingSpinner.d.ts.map +0 -1
- package/dist/cli/ui/components/LoadingSpinner.js +0 -10
- package/dist/cli/ui/components/LoadingSpinner.js.map +0 -1
- package/dist/cli/ui/components/NavigationList.d.ts +0 -9
- package/dist/cli/ui/components/NavigationList.d.ts.map +0 -1
- package/dist/cli/ui/components/NavigationList.js +0 -11
- package/dist/cli/ui/components/NavigationList.js.map +0 -1
- package/dist/cli/ui/components/ScrollableContent.d.ts +0 -8
- package/dist/cli/ui/components/ScrollableContent.d.ts.map +0 -1
- package/dist/cli/ui/components/ScrollableContent.js +0 -11
- package/dist/cli/ui/components/ScrollableContent.js.map +0 -1
- package/dist/cli/ui/components/index.d.ts +0 -7
- package/dist/cli/ui/components/index.d.ts.map +0 -1
- package/dist/cli/ui/components/index.js +0 -9
- package/dist/cli/ui/components/index.js.map +0 -1
- package/dist/cli/ui/terminal-ui.d.ts +0 -52
- package/dist/cli/ui/terminal-ui.d.ts.map +0 -1
- package/dist/cli/ui/terminal-ui.js +0 -121
- package/dist/cli/ui/terminal-ui.js.map +0 -1
- package/dist/do/BashModule.d.ts +0 -871
- package/dist/do/BashModule.d.ts.map +0 -1
- package/dist/do/BashModule.js +0 -1143
- package/dist/do/BashModule.js.map +0 -1
- package/dist/do/FsModule.d.ts +0 -601
- package/dist/do/FsModule.d.ts.map +0 -1
- package/dist/do/FsModule.js +0 -1120
- package/dist/do/FsModule.js.map +0 -1
- package/dist/do/GitModule.d.ts +0 -635
- package/dist/do/GitModule.d.ts.map +0 -1
- package/dist/do/GitModule.js +0 -781
- package/dist/do/GitModule.js.map +0 -1
- package/dist/do/GitRepoDO.d.ts +0 -281
- package/dist/do/GitRepoDO.d.ts.map +0 -1
- package/dist/do/GitRepoDO.js +0 -479
- package/dist/do/GitRepoDO.js.map +0 -1
- package/dist/do/bash-ast.d.ts +0 -246
- package/dist/do/bash-ast.d.ts.map +0 -1
- package/dist/do/bash-ast.js +0 -888
- package/dist/do/bash-ast.js.map +0 -1
- package/dist/do/container-executor.d.ts +0 -491
- package/dist/do/container-executor.d.ts.map +0 -1
- package/dist/do/container-executor.js +0 -730
- package/dist/do/container-executor.js.map +0 -1
- package/dist/do/index.d.ts +0 -53
- package/dist/do/index.d.ts.map +0 -1
- package/dist/do/index.js +0 -91
- package/dist/do/index.js.map +0 -1
- package/dist/do/tiered-storage.d.ts +0 -403
- package/dist/do/tiered-storage.d.ts.map +0 -1
- package/dist/do/tiered-storage.js +0 -689
- package/dist/do/tiered-storage.js.map +0 -1
- package/dist/do/withBash.d.ts +0 -231
- package/dist/do/withBash.d.ts.map +0 -1
- package/dist/do/withBash.js +0 -244
- package/dist/do/withBash.js.map +0 -1
- package/dist/do/withFs.d.ts +0 -237
- package/dist/do/withFs.d.ts.map +0 -1
- package/dist/do/withFs.js +0 -387
- package/dist/do/withFs.js.map +0 -1
- package/dist/do/withGit.d.ts +0 -180
- package/dist/do/withGit.d.ts.map +0 -1
- package/dist/do/withGit.js +0 -271
- package/dist/do/withGit.js.map +0 -1
- package/dist/durable-object/object-store.d.ts +0 -633
- package/dist/durable-object/object-store.d.ts.map +0 -1
- package/dist/durable-object/object-store.js +0 -1161
- package/dist/durable-object/object-store.js.map +0 -1
- package/dist/durable-object/schema.d.ts.map +0 -1
- package/dist/durable-object/schema.js.map +0 -1
- package/dist/durable-object/wal.d.ts +0 -416
- package/dist/durable-object/wal.d.ts.map +0 -1
- package/dist/durable-object/wal.js +0 -445
- package/dist/durable-object/wal.js.map +0 -1
- package/dist/mcp/adapter.d.ts +0 -772
- package/dist/mcp/adapter.d.ts.map +0 -1
- package/dist/mcp/adapter.js +0 -895
- package/dist/mcp/adapter.js.map +0 -1
- package/dist/mcp/sandbox/miniflare-evaluator.d.ts +0 -22
- package/dist/mcp/sandbox/miniflare-evaluator.d.ts.map +0 -1
- package/dist/mcp/sandbox/miniflare-evaluator.js +0 -140
- package/dist/mcp/sandbox/miniflare-evaluator.js.map +0 -1
- package/dist/mcp/sandbox/object-store-proxy.d.ts +0 -32
- package/dist/mcp/sandbox/object-store-proxy.d.ts.map +0 -1
- package/dist/mcp/sandbox/object-store-proxy.js +0 -30
- package/dist/mcp/sandbox/object-store-proxy.js.map +0 -1
- package/dist/mcp/sandbox/template.d.ts +0 -17
- package/dist/mcp/sandbox/template.d.ts.map +0 -1
- package/dist/mcp/sandbox/template.js +0 -71
- package/dist/mcp/sandbox/template.js.map +0 -1
- package/dist/mcp/sandbox.d.ts +0 -764
- package/dist/mcp/sandbox.d.ts.map +0 -1
- package/dist/mcp/sandbox.js +0 -1362
- package/dist/mcp/sandbox.js.map +0 -1
- package/dist/mcp/sdk-adapter.d.ts +0 -835
- package/dist/mcp/sdk-adapter.d.ts.map +0 -1
- package/dist/mcp/sdk-adapter.js +0 -974
- package/dist/mcp/sdk-adapter.js.map +0 -1
- package/dist/mcp/tools/do.d.ts +0 -32
- package/dist/mcp/tools/do.d.ts.map +0 -1
- package/dist/mcp/tools/do.js +0 -115
- package/dist/mcp/tools/do.js.map +0 -1
- package/dist/mcp/tools.d.ts +0 -548
- package/dist/mcp/tools.d.ts.map +0 -1
- package/dist/mcp/tools.js +0 -1934
- package/dist/mcp/tools.js.map +0 -1
- package/dist/ops/blame.d.ts +0 -551
- package/dist/ops/blame.d.ts.map +0 -1
- package/dist/ops/blame.js +0 -1037
- package/dist/ops/blame.js.map +0 -1
- package/dist/ops/branch.d.ts +0 -766
- package/dist/ops/branch.d.ts.map +0 -1
- package/dist/ops/branch.js +0 -950
- package/dist/ops/branch.js.map +0 -1
- package/dist/ops/commit-traversal.d.ts +0 -349
- package/dist/ops/commit-traversal.d.ts.map +0 -1
- package/dist/ops/commit-traversal.js +0 -821
- package/dist/ops/commit-traversal.js.map +0 -1
- package/dist/ops/commit.d.ts +0 -555
- package/dist/ops/commit.d.ts.map +0 -1
- package/dist/ops/commit.js +0 -826
- package/dist/ops/commit.js.map +0 -1
- package/dist/ops/merge-base.d.ts +0 -397
- package/dist/ops/merge-base.d.ts.map +0 -1
- package/dist/ops/merge-base.js +0 -691
- package/dist/ops/merge-base.js.map +0 -1
- package/dist/ops/merge.d.ts +0 -855
- package/dist/ops/merge.d.ts.map +0 -1
- package/dist/ops/merge.js +0 -1551
- package/dist/ops/merge.js.map +0 -1
- package/dist/ops/tag.d.ts +0 -247
- package/dist/ops/tag.d.ts.map +0 -1
- package/dist/ops/tag.js +0 -649
- package/dist/ops/tag.js.map +0 -1
- package/dist/ops/tree-builder.d.ts +0 -178
- package/dist/ops/tree-builder.d.ts.map +0 -1
- package/dist/ops/tree-builder.js +0 -271
- package/dist/ops/tree-builder.js.map +0 -1
- package/dist/ops/tree-diff.d.ts +0 -291
- package/dist/ops/tree-diff.d.ts.map +0 -1
- package/dist/ops/tree-diff.js +0 -705
- package/dist/ops/tree-diff.js.map +0 -1
- package/dist/pack/delta.d.ts +0 -248
- package/dist/pack/delta.d.ts.map +0 -1
- package/dist/pack/delta.js +0 -736
- package/dist/pack/delta.js.map +0 -1
- package/dist/pack/format.d.ts +0 -446
- package/dist/pack/format.d.ts.map +0 -1
- package/dist/pack/format.js +0 -572
- package/dist/pack/format.js.map +0 -1
- package/dist/pack/full-generation.d.ts +0 -612
- package/dist/pack/full-generation.d.ts.map +0 -1
- package/dist/pack/full-generation.js +0 -1378
- package/dist/pack/full-generation.js.map +0 -1
- package/dist/pack/generation.d.ts +0 -441
- package/dist/pack/generation.d.ts.map +0 -1
- package/dist/pack/generation.js +0 -707
- package/dist/pack/generation.js.map +0 -1
- package/dist/pack/index.d.ts +0 -502
- package/dist/pack/index.d.ts.map +0 -1
- package/dist/pack/index.js +0 -833
- package/dist/pack/index.js.map +0 -1
- package/dist/refs/branch.d.ts +0 -668
- package/dist/refs/branch.d.ts.map +0 -1
- package/dist/refs/branch.js +0 -897
- package/dist/refs/branch.js.map +0 -1
- package/dist/refs/storage.d.ts +0 -833
- package/dist/refs/storage.d.ts.map +0 -1
- package/dist/refs/storage.js +0 -1023
- package/dist/refs/storage.js.map +0 -1
- package/dist/refs/tag.d.ts +0 -860
- package/dist/refs/tag.d.ts.map +0 -1
- package/dist/refs/tag.js +0 -996
- package/dist/refs/tag.js.map +0 -1
- package/dist/storage/backend.d.ts +0 -425
- package/dist/storage/backend.d.ts.map +0 -1
- package/dist/storage/backend.js +0 -41
- package/dist/storage/backend.js.map +0 -1
- package/dist/storage/fsx-adapter.d.ts +0 -204
- package/dist/storage/fsx-adapter.d.ts.map +0 -1
- package/dist/storage/fsx-adapter.js +0 -470
- package/dist/storage/fsx-adapter.js.map +0 -1
- package/dist/storage/lru-cache.d.ts +0 -691
- package/dist/storage/lru-cache.d.ts.map +0 -1
- package/dist/storage/lru-cache.js +0 -813
- package/dist/storage/lru-cache.js.map +0 -1
- package/dist/storage/object-index.d.ts +0 -585
- package/dist/storage/object-index.d.ts.map +0 -1
- package/dist/storage/object-index.js +0 -532
- package/dist/storage/object-index.js.map +0 -1
- package/dist/storage/r2-pack.d.ts +0 -1257
- package/dist/storage/r2-pack.d.ts.map +0 -1
- package/dist/storage/r2-pack.js +0 -1770
- package/dist/storage/r2-pack.js.map +0 -1
- package/dist/tiered/cdc-pipeline.d.ts +0 -1888
- package/dist/tiered/cdc-pipeline.d.ts.map +0 -1
- package/dist/tiered/cdc-pipeline.js +0 -1880
- package/dist/tiered/cdc-pipeline.js.map +0 -1
- package/dist/tiered/migration.d.ts +0 -1104
- package/dist/tiered/migration.d.ts.map +0 -1
- package/dist/tiered/migration.js +0 -1214
- package/dist/tiered/migration.js.map +0 -1
- package/dist/tiered/parquet-writer.d.ts +0 -1145
- package/dist/tiered/parquet-writer.d.ts.map +0 -1
- package/dist/tiered/parquet-writer.js +0 -1183
- package/dist/tiered/parquet-writer.js.map +0 -1
- package/dist/tiered/read-path.d.ts +0 -835
- package/dist/tiered/read-path.d.ts.map +0 -1
- package/dist/tiered/read-path.js +0 -487
- package/dist/tiered/read-path.js.map +0 -1
- package/dist/types/capability.d.ts +0 -1385
- package/dist/types/capability.d.ts.map +0 -1
- package/dist/types/capability.js +0 -36
- package/dist/types/capability.js.map +0 -1
- package/dist/types/index.d.ts +0 -13
- package/dist/types/index.d.ts.map +0 -1
- package/dist/types/index.js +0 -18
- package/dist/types/index.js.map +0 -1
- package/dist/types/objects.d.ts +0 -692
- package/dist/types/objects.d.ts.map +0 -1
- package/dist/types/objects.js +0 -837
- package/dist/types/objects.js.map +0 -1
- package/dist/types/storage.d.ts +0 -603
- package/dist/types/storage.d.ts.map +0 -1
- package/dist/types/storage.js +0 -191
- package/dist/types/storage.js.map +0 -1
- package/dist/types/worker-loader.d.ts +0 -60
- package/dist/types/worker-loader.d.ts.map +0 -1
- package/dist/types/worker-loader.js +0 -62
- package/dist/types/worker-loader.js.map +0 -1
- package/dist/utils/hash.d.ts +0 -197
- package/dist/utils/hash.d.ts.map +0 -1
- package/dist/utils/hash.js +0 -268
- package/dist/utils/hash.js.map +0 -1
- package/dist/utils/sha1.d.ts +0 -290
- package/dist/utils/sha1.d.ts.map +0 -1
- package/dist/utils/sha1.js +0 -582
- package/dist/utils/sha1.js.map +0 -1
- package/dist/wire/capabilities.d.ts +0 -1044
- package/dist/wire/capabilities.d.ts.map +0 -1
- package/dist/wire/capabilities.js +0 -941
- package/dist/wire/capabilities.js.map +0 -1
- package/dist/wire/path-security.d.ts +0 -157
- package/dist/wire/path-security.d.ts.map +0 -1
- package/dist/wire/path-security.js +0 -307
- package/dist/wire/path-security.js.map +0 -1
- package/dist/wire/pkt-line.d.ts +0 -345
- package/dist/wire/pkt-line.d.ts.map +0 -1
- package/dist/wire/pkt-line.js +0 -381
- package/dist/wire/pkt-line.js.map +0 -1
- package/dist/wire/receive-pack.d.ts +0 -1059
- package/dist/wire/receive-pack.d.ts.map +0 -1
- package/dist/wire/receive-pack.js +0 -1414
- package/dist/wire/receive-pack.js.map +0 -1
- package/dist/wire/smart-http.d.ts +0 -799
- package/dist/wire/smart-http.d.ts.map +0 -1
- package/dist/wire/smart-http.js +0 -945
- package/dist/wire/smart-http.js.map +0 -1
- package/dist/wire/upload-pack.d.ts +0 -727
- package/dist/wire/upload-pack.d.ts.map +0 -1
- package/dist/wire/upload-pack.js +0 -1138
- package/dist/wire/upload-pack.js.map +0 -1
|
@@ -1,1183 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @fileoverview Parquet Writer for Git Analytics
|
|
3
|
-
*
|
|
4
|
-
* @description
|
|
5
|
-
* Provides functionality to write git analytics data to Parquet format, a
|
|
6
|
-
* columnar storage format optimized for analytical queries. This module
|
|
7
|
-
* enables efficient storage and querying of Git repository data.
|
|
8
|
-
*
|
|
9
|
-
* **Key Features:**
|
|
10
|
-
* - Schema definition with various field types (STRING, INT32, INT64, etc.)
|
|
11
|
-
* - Multiple compression algorithms (SNAPPY, GZIP, ZSTD, LZ4, UNCOMPRESSED)
|
|
12
|
-
* - Row group management for efficient columnar storage
|
|
13
|
-
* - Automatic and manual row group flushing
|
|
14
|
-
* - Column-level statistics generation (min, max, null count)
|
|
15
|
-
* - Custom key-value metadata support
|
|
16
|
-
* - Memory-efficient streaming writes
|
|
17
|
-
*
|
|
18
|
-
* **Parquet Format:**
|
|
19
|
-
* The generated files follow the Parquet format with:
|
|
20
|
-
* - Magic bytes "PAR1" at start and end
|
|
21
|
-
* - Row group data organized by columns
|
|
22
|
-
* - Footer metadata containing schema and statistics
|
|
23
|
-
*
|
|
24
|
-
* @example
|
|
25
|
-
* ```typescript
|
|
26
|
-
* // Define schema for commit analytics
|
|
27
|
-
* const schema = defineSchema([
|
|
28
|
-
* { name: 'commit_sha', type: ParquetFieldType.STRING, required: true },
|
|
29
|
-
* { name: 'author', type: ParquetFieldType.STRING, required: true },
|
|
30
|
-
* { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true },
|
|
31
|
-
* { name: 'file_count', type: ParquetFieldType.INT32, required: false }
|
|
32
|
-
* ])
|
|
33
|
-
*
|
|
34
|
-
* // Create writer with options
|
|
35
|
-
* const writer = createParquetWriter(schema, {
|
|
36
|
-
* rowGroupSize: 10000,
|
|
37
|
-
* compression: ParquetCompression.SNAPPY,
|
|
38
|
-
* enableStatistics: true
|
|
39
|
-
* })
|
|
40
|
-
*
|
|
41
|
-
* // Write data
|
|
42
|
-
* await writer.writeRows([
|
|
43
|
-
* { commit_sha: 'abc123...', author: 'alice', timestamp: Date.now(), file_count: 5 },
|
|
44
|
-
* { commit_sha: 'def456...', author: 'bob', timestamp: Date.now(), file_count: 3 }
|
|
45
|
-
* ])
|
|
46
|
-
*
|
|
47
|
-
* // Generate the Parquet file
|
|
48
|
-
* const buffer = await writer.toBuffer()
|
|
49
|
-
* ```
|
|
50
|
-
*
|
|
51
|
-
* @module tiered/parquet-writer
|
|
52
|
-
* @see {@link ParquetWriter} - Main writer class
|
|
53
|
-
* @see {@link defineSchema} - Schema definition helper
|
|
54
|
-
*/
|
|
55
|
-
import pako from 'pako';
|
|
56
|
-
// ============================================================================
|
|
57
|
-
// Types and Enums
|
|
58
|
-
// ============================================================================
|
|
59
|
-
/**
 * Supported Parquet field types.
 *
 * @description
 * Defines the data types that can be used for fields in a Parquet schema.
 * Each type maps to an appropriate physical and logical Parquet type.
 *
 * Every member's value is the same string as its key. The object is frozen
 * so that the shared constants cannot be reassigned, removed, or extended
 * by importing code.
 *
 * @example
 * ```typescript
 * const field: ParquetField = {
 *   name: 'count',
 *   type: ParquetFieldType.INT64,
 *   required: true
 * }
 * ```
 *
 * @readonly
 * @enum {string}
 */
export const ParquetFieldType = Object.freeze({
    /**
     * UTF-8 encoded string.
     * Maps to Parquet BYTE_ARRAY with UTF8 logical type.
     */
    STRING: 'STRING',
    /**
     * 32-bit signed integer.
     * Maps to Parquet INT32 physical type.
     */
    INT32: 'INT32',
    /**
     * 64-bit signed integer.
     * Maps to Parquet INT64 physical type.
     */
    INT64: 'INT64',
    /**
     * Boolean value (true/false).
     * Maps to Parquet BOOLEAN physical type.
     */
    BOOLEAN: 'BOOLEAN',
    /**
     * 32-bit IEEE 754 floating point.
     * Maps to Parquet FLOAT physical type.
     */
    FLOAT: 'FLOAT',
    /**
     * 64-bit IEEE 754 floating point.
     * Maps to Parquet DOUBLE physical type.
     */
    DOUBLE: 'DOUBLE',
    /**
     * Raw binary data.
     * Maps to Parquet BYTE_ARRAY physical type.
     */
    BINARY: 'BINARY',
    /**
     * Timestamp with millisecond precision.
     * Maps to Parquet INT64 with TIMESTAMP_MILLIS logical type.
     */
    TIMESTAMP_MILLIS: 'TIMESTAMP_MILLIS',
    /**
     * Timestamp with microsecond precision.
     * Maps to Parquet INT64 with TIMESTAMP_MICROS logical type.
     */
    TIMESTAMP_MICROS: 'TIMESTAMP_MICROS'
});
|
|
125
|
-
/**
 * Supported compression types for Parquet data.
 *
 * @description
 * Different compression algorithms offer trade-offs between compression
 * ratio, compression speed, and decompression speed.
 *
 * **Comparison:**
 * - SNAPPY: Fast compression/decompression, moderate ratio (default)
 * - GZIP: Higher ratio, slower compression, fast decompression
 * - ZSTD: Best ratio, good speed, requires more memory
 * - LZ4: Fastest, lower ratio
 * - UNCOMPRESSED: No compression overhead
 *
 * Every member's value is the same string as its key. The object is frozen
 * so that the shared constants cannot be reassigned, removed, or extended
 * by importing code.
 *
 * @example
 * ```typescript
 * const writer = createParquetWriter(schema, {
 *   compression: ParquetCompression.ZSTD
 * })
 * ```
 *
 * @readonly
 * @enum {string}
 */
export const ParquetCompression = Object.freeze({
    /**
     * No compression applied.
     * Fastest writes, largest file size.
     */
    UNCOMPRESSED: 'UNCOMPRESSED',
    /**
     * Snappy compression (default).
     * Good balance of speed and compression ratio.
     */
    SNAPPY: 'SNAPPY',
    /**
     * GZIP compression.
     * Higher compression ratio, slower compression.
     */
    GZIP: 'GZIP',
    /**
     * Zstandard compression.
     * Best compression ratio with good speed.
     */
    ZSTD: 'ZSTD',
    /**
     * LZ4 compression.
     * Fastest compression, lower ratio.
     */
    LZ4: 'LZ4'
});
|
|
176
|
-
/**
 * Error class for Parquet-related operations.
 *
 * @description
 * Thrown when Parquet operations fail, such as schema validation errors,
 * invalid data types, or malformed files. Each instance carries a `code`
 * string so callers can branch on the failure kind without parsing the
 * message.
 *
 * @example
 * ```typescript
 * try {
 *   await writer.writeRow({ invalid_field: 'value' })
 * } catch (error) {
 *   if (error instanceof ParquetError) {
 *     console.log(`Parquet error (${error.code}): ${error.message}`)
 *   }
 * }
 * ```
 *
 * @class ParquetError
 * @extends Error
 */
export class ParquetError extends Error {
    /**
     * Error code for programmatic handling (e.g. 'WRITER_CLOSED').
     */
    code;
    /**
     * Creates a new ParquetError.
     *
     * @param message - Human-readable error message
     * @param code - Error code for programmatic handling
     * @param options - Optional standard `Error` options; pass `{ cause }`
     *   to keep the underlying failure attached when wrapping another error.
     *   Omitting it behaves exactly like the two-argument form.
     *
     * @example
     * ```typescript
     * throw new ParquetError('Field name cannot be empty', 'EMPTY_FIELD_NAME')
     * ```
     */
    constructor(message, code, options) {
        // Forwarding `options` lets the built-in Error install `cause`;
        // an undefined value is ignored by the Error constructor.
        super(message, options);
        this.code = code;
        this.name = 'ParquetError';
    }
}
|
|
216
|
-
// ============================================================================
|
|
217
|
-
// ParquetWriter Class
|
|
218
|
-
// ============================================================================
|
|
219
|
-
/**
|
|
220
|
-
* Parquet writer for git analytics data.
|
|
221
|
-
*
|
|
222
|
-
* @description
|
|
223
|
-
* ParquetWriter provides a streaming interface for writing data to Parquet
|
|
224
|
-
* format. It handles schema validation, row group management, compression,
|
|
225
|
-
* and statistics generation.
|
|
226
|
-
*
|
|
227
|
-
* **Usage Pattern:**
|
|
228
|
-
* 1. Create a schema using `defineSchema()`
|
|
229
|
-
* 2. Create a writer with `createParquetWriter()` or `new ParquetWriter()`
|
|
230
|
-
* 3. Write rows using `writeRow()` or `writeRows()`
|
|
231
|
-
* 4. Generate the file with `toBuffer()` or `writeTo()`
|
|
232
|
-
*
|
|
233
|
-
* **Row Group Management:**
|
|
234
|
-
* Rows are buffered in memory until the row group is full (by row count
|
|
235
|
-
* or memory limit), then flushed. You can also manually flush with
|
|
236
|
-
* `flushRowGroup()`.
|
|
237
|
-
*
|
|
238
|
-
* **Thread Safety:**
|
|
239
|
-
* Not thread-safe. Use separate writer instances for concurrent writes.
|
|
240
|
-
*
|
|
241
|
-
* @example
|
|
242
|
-
* ```typescript
|
|
243
|
-
* // Create schema
|
|
244
|
-
* const schema = defineSchema([
|
|
245
|
-
* { name: 'sha', type: ParquetFieldType.STRING, required: true },
|
|
246
|
-
* { name: 'type', type: ParquetFieldType.STRING, required: true },
|
|
247
|
-
* { name: 'size', type: ParquetFieldType.INT64, required: true },
|
|
248
|
-
* { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
|
|
249
|
-
* ])
|
|
250
|
-
*
|
|
251
|
-
* // Create writer
|
|
252
|
-
* const writer = new ParquetWriter(schema, {
|
|
253
|
-
* rowGroupSize: 10000,
|
|
254
|
-
* compression: ParquetCompression.SNAPPY,
|
|
255
|
-
* enableStatistics: true
|
|
256
|
-
* })
|
|
257
|
-
*
|
|
258
|
-
* // Write data
|
|
259
|
-
* for (const object of gitObjects) {
|
|
260
|
-
* await writer.writeRow({
|
|
261
|
-
* sha: object.sha,
|
|
262
|
-
* type: object.type,
|
|
263
|
-
* size: object.size,
|
|
264
|
-
* timestamp: Date.now()
|
|
265
|
-
* })
|
|
266
|
-
* }
|
|
267
|
-
*
|
|
268
|
-
* // Set custom metadata
|
|
269
|
-
* writer.setMetadata('git_version', '2.40.0')
|
|
270
|
-
* writer.setMetadata('repository', 'github.com/org/repo')
|
|
271
|
-
*
|
|
272
|
-
* // Generate file
|
|
273
|
-
* const buffer = await writer.toBuffer()
|
|
274
|
-
* console.log(`Generated ${buffer.length} bytes`)
|
|
275
|
-
* console.log(`Rows: ${writer.rowCount}`)
|
|
276
|
-
* console.log(`Row groups: ${writer.rowGroupCount}`)
|
|
277
|
-
*
|
|
278
|
-
* // Reset for reuse
|
|
279
|
-
* writer.reset()
|
|
280
|
-
* ```
|
|
281
|
-
*
|
|
282
|
-
* @class ParquetWriter
|
|
283
|
-
*/
|
|
284
|
-
export class ParquetWriter {
|
|
285
|
-
/**
|
|
286
|
-
* The Parquet schema for this writer.
|
|
287
|
-
* @readonly
|
|
288
|
-
*/
|
|
289
|
-
schema;
|
|
290
|
-
/**
|
|
291
|
-
* Resolved options with defaults applied.
|
|
292
|
-
* @readonly
|
|
293
|
-
*/
|
|
294
|
-
options;
|
|
295
|
-
/**
|
|
296
|
-
* Total row count written.
|
|
297
|
-
* @private
|
|
298
|
-
*/
|
|
299
|
-
_rowCount = 0;
|
|
300
|
-
/**
|
|
301
|
-
* Completed row groups.
|
|
302
|
-
* @private
|
|
303
|
-
*/
|
|
304
|
-
_rowGroups = [];
|
|
305
|
-
/**
|
|
306
|
-
* Current row group being built.
|
|
307
|
-
* @private
|
|
308
|
-
*/
|
|
309
|
-
_currentRowGroup = { rows: [], byteSize: 0 };
|
|
310
|
-
/**
|
|
311
|
-
* Whether the writer has been closed.
|
|
312
|
-
* @private
|
|
313
|
-
*/
|
|
314
|
-
_isClosed = false;
|
|
315
|
-
/**
|
|
316
|
-
* Custom key-value metadata.
|
|
317
|
-
* @private
|
|
318
|
-
*/
|
|
319
|
-
_keyValueMetadata = {};
|
|
320
|
-
/**
|
|
321
|
-
* Creation timestamp.
|
|
322
|
-
* @private
|
|
323
|
-
*/
|
|
324
|
-
_createdAt = Date.now();
|
|
325
|
-
/**
|
|
326
|
-
* Creates a new ParquetWriter instance.
|
|
327
|
-
*
|
|
328
|
-
* @param schema - The Parquet schema defining columns
|
|
329
|
-
* @param options - Writer configuration options
|
|
330
|
-
*
|
|
331
|
-
* @example
|
|
332
|
-
* ```typescript
|
|
333
|
-
* const writer = new ParquetWriter(schema, {
|
|
334
|
-
* rowGroupSize: 50000,
|
|
335
|
-
* compression: ParquetCompression.GZIP
|
|
336
|
-
* })
|
|
337
|
-
* ```
|
|
338
|
-
*/
|
|
339
|
-
constructor(schema, options = {}) {
|
|
340
|
-
this.schema = schema;
|
|
341
|
-
this.options = {
|
|
342
|
-
rowGroupSize: options.rowGroupSize ?? 65536,
|
|
343
|
-
compression: options.compression ?? ParquetCompression.SNAPPY,
|
|
344
|
-
...options
|
|
345
|
-
};
|
|
346
|
-
}
|
|
347
|
-
/**
|
|
348
|
-
* Gets the total row count written to the writer.
|
|
349
|
-
*
|
|
350
|
-
* @description
|
|
351
|
-
* Returns the total number of rows written, including rows in the
|
|
352
|
-
* current unflushed row group.
|
|
353
|
-
*
|
|
354
|
-
* @returns Total row count
|
|
355
|
-
*
|
|
356
|
-
* @example
|
|
357
|
-
* ```typescript
|
|
358
|
-
* await writer.writeRows(data)
|
|
359
|
-
* console.log(`Wrote ${writer.rowCount} rows`)
|
|
360
|
-
* ```
|
|
361
|
-
*/
|
|
362
|
-
get rowCount() {
|
|
363
|
-
return this._rowCount;
|
|
364
|
-
}
|
|
365
|
-
/**
|
|
366
|
-
* Gets the number of row groups.
|
|
367
|
-
*
|
|
368
|
-
* @description
|
|
369
|
-
* Returns the number of completed row groups plus one if there's
|
|
370
|
-
* a pending row group with data.
|
|
371
|
-
*
|
|
372
|
-
* @returns Number of row groups
|
|
373
|
-
*
|
|
374
|
-
* @example
|
|
375
|
-
* ```typescript
|
|
376
|
-
* console.log(`Row groups: ${writer.rowGroupCount}`)
|
|
377
|
-
* ```
|
|
378
|
-
*/
|
|
379
|
-
get rowGroupCount() {
|
|
380
|
-
const pendingCount = this._currentRowGroup.rows.length > 0 ? 1 : 0;
|
|
381
|
-
return this._rowGroups.length + pendingCount;
|
|
382
|
-
}
|
|
383
|
-
/**
|
|
384
|
-
* Checks if the writer has been closed.
|
|
385
|
-
*
|
|
386
|
-
* @description
|
|
387
|
-
* A closed writer cannot accept new rows. Writers are closed
|
|
388
|
-
* implicitly by `closeWriter()`.
|
|
389
|
-
*
|
|
390
|
-
* @returns true if closed
|
|
391
|
-
*
|
|
392
|
-
* @example
|
|
393
|
-
* ```typescript
|
|
394
|
-
* if (!writer.isClosed) {
|
|
395
|
-
* await writer.writeRow(row)
|
|
396
|
-
* }
|
|
397
|
-
* ```
|
|
398
|
-
*/
|
|
399
|
-
get isClosed() {
|
|
400
|
-
return this._isClosed;
|
|
401
|
-
}
|
|
402
|
-
/**
|
|
403
|
-
* Writes a single row to the Parquet file.
|
|
404
|
-
*
|
|
405
|
-
* @description
|
|
406
|
-
* Validates the row against the schema and adds it to the current
|
|
407
|
-
* row group. Automatically flushes the row group when it reaches
|
|
408
|
-
* the configured size or memory limit.
|
|
409
|
-
*
|
|
410
|
-
* @param row - Object with column values keyed by column name
|
|
411
|
-
* @returns Promise that resolves when the row is written
|
|
412
|
-
*
|
|
413
|
-
* @throws {ParquetError} WRITER_CLOSED - If writer is closed
|
|
414
|
-
* @throws {ParquetError} MISSING_REQUIRED_FIELD - If required field is missing
|
|
415
|
-
* @throws {ParquetError} INVALID_FIELD_TYPE - If field value type doesn't match schema
|
|
416
|
-
*
|
|
417
|
-
* @example
|
|
418
|
-
* ```typescript
|
|
419
|
-
* await writer.writeRow({
|
|
420
|
-
* id: 123,
|
|
421
|
-
* name: 'Alice',
|
|
422
|
-
* active: true
|
|
423
|
-
* })
|
|
424
|
-
* ```
|
|
425
|
-
*/
|
|
426
|
-
async writeRow(row) {
|
|
427
|
-
if (this._isClosed) {
|
|
428
|
-
throw new ParquetError('Cannot write to a closed writer', 'WRITER_CLOSED');
|
|
429
|
-
}
|
|
430
|
-
this._validateRow(row);
|
|
431
|
-
const rowSize = this._estimateRowSize(row);
|
|
432
|
-
this._currentRowGroup.rows.push(row);
|
|
433
|
-
this._currentRowGroup.byteSize += rowSize;
|
|
434
|
-
this._rowCount++;
|
|
435
|
-
// Check if we should flush based on row count
|
|
436
|
-
if (this._currentRowGroup.rows.length >= this.options.rowGroupSize) {
|
|
437
|
-
await this.flushRowGroup();
|
|
438
|
-
}
|
|
439
|
-
// Check if we should flush based on memory limit
|
|
440
|
-
else if (this.options.rowGroupMemoryLimit &&
|
|
441
|
-
this._currentRowGroup.byteSize >= this.options.rowGroupMemoryLimit) {
|
|
442
|
-
await this.flushRowGroup();
|
|
443
|
-
}
|
|
444
|
-
}
|
|
445
|
-
/**
|
|
446
|
-
* Writes multiple rows to the Parquet file.
|
|
447
|
-
*
|
|
448
|
-
* @description
|
|
449
|
-
* Convenience method that writes an array of rows sequentially.
|
|
450
|
-
* Each row is validated and may trigger row group flushes.
|
|
451
|
-
*
|
|
452
|
-
* @param rows - Array of row objects to write
|
|
453
|
-
* @returns Promise that resolves when all rows are written
|
|
454
|
-
*
|
|
455
|
-
* @throws {ParquetError} Any error from writeRow()
|
|
456
|
-
*
|
|
457
|
-
* @example
|
|
458
|
-
* ```typescript
|
|
459
|
-
* await writer.writeRows([
|
|
460
|
-
* { id: 1, name: 'Alice' },
|
|
461
|
-
* { id: 2, name: 'Bob' },
|
|
462
|
-
* { id: 3, name: 'Carol' }
|
|
463
|
-
* ])
|
|
464
|
-
* ```
|
|
465
|
-
*/
|
|
466
|
-
async writeRows(rows) {
|
|
467
|
-
for (const row of rows) {
|
|
468
|
-
await this.writeRow(row);
|
|
469
|
-
}
|
|
470
|
-
}
|
|
471
|
-
/**
|
|
472
|
-
* Manually flushes the current row group.
|
|
473
|
-
*
|
|
474
|
-
* @description
|
|
475
|
-
* Forces the current row group to be finalized and stored, even if
|
|
476
|
-
* it hasn't reached the size limit. Has no effect if the current
|
|
477
|
-
* row group is empty.
|
|
478
|
-
*
|
|
479
|
-
* @returns Promise that resolves when flush is complete
|
|
480
|
-
*
|
|
481
|
-
* @example
|
|
482
|
-
* ```typescript
|
|
483
|
-
* // Write some rows
|
|
484
|
-
* await writer.writeRows(batch1)
|
|
485
|
-
*
|
|
486
|
-
* // Force flush before writing next batch
|
|
487
|
-
* await writer.flushRowGroup()
|
|
488
|
-
*
|
|
489
|
-
* // Continue writing
|
|
490
|
-
* await writer.writeRows(batch2)
|
|
491
|
-
* ```
|
|
492
|
-
*/
|
|
493
|
-
async flushRowGroup() {
|
|
494
|
-
if (this._currentRowGroup.rows.length === 0) {
|
|
495
|
-
return;
|
|
496
|
-
}
|
|
497
|
-
const rowGroup = this._buildRowGroup(this._currentRowGroup);
|
|
498
|
-
this._rowGroups.push(rowGroup);
|
|
499
|
-
this._currentRowGroup = { rows: [], byteSize: 0 };
|
|
500
|
-
}
|
|
501
|
-
/**
|
|
502
|
-
* Gets the current row group's memory size.
|
|
503
|
-
*
|
|
504
|
-
* @description
|
|
505
|
-
* Returns the estimated memory consumption of the unflushed row group.
|
|
506
|
-
* Useful for monitoring memory usage during streaming writes.
|
|
507
|
-
*
|
|
508
|
-
* @returns Memory size in bytes
|
|
509
|
-
*
|
|
510
|
-
* @example
|
|
511
|
-
* ```typescript
|
|
512
|
-
* if (writer.currentRowGroupMemorySize() > 50 * 1024 * 1024) {
|
|
513
|
-
* console.log('Row group using significant memory')
|
|
514
|
-
* await writer.flushRowGroup()
|
|
515
|
-
* }
|
|
516
|
-
* ```
|
|
517
|
-
*/
|
|
518
|
-
currentRowGroupMemorySize() {
|
|
519
|
-
return this._currentRowGroup.byteSize;
|
|
520
|
-
}
|
|
521
|
-
/**
|
|
522
|
-
* Gets the completed row groups.
|
|
523
|
-
*
|
|
524
|
-
* @description
|
|
525
|
-
* Returns a copy of the completed row group metadata array.
|
|
526
|
-
* Does not include the current unflushed row group.
|
|
527
|
-
*
|
|
528
|
-
* @returns Array of row group metadata
|
|
529
|
-
*
|
|
530
|
-
* @example
|
|
531
|
-
* ```typescript
|
|
532
|
-
* for (const rg of writer.getRowGroups()) {
|
|
533
|
-
* console.log(`Row group: ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
|
|
534
|
-
* }
|
|
535
|
-
* ```
|
|
536
|
-
*/
|
|
537
|
-
getRowGroups() {
|
|
538
|
-
return [...this._rowGroups];
|
|
539
|
-
}
|
|
540
|
-
/**
|
|
541
|
-
* Sets a custom key-value metadata entry.
|
|
542
|
-
*
|
|
543
|
-
* @description
|
|
544
|
-
* Adds custom metadata that will be stored in the Parquet file footer.
|
|
545
|
-
* Can be used for versioning, provenance, or application-specific data.
|
|
546
|
-
*
|
|
547
|
-
* @param key - Metadata key
|
|
548
|
-
* @param value - Metadata value
|
|
549
|
-
*
|
|
550
|
-
* @example
|
|
551
|
-
* ```typescript
|
|
552
|
-
* writer.setMetadata('created_by', 'gitdo-analytics')
|
|
553
|
-
* writer.setMetadata('schema_version', '2.0')
|
|
554
|
-
* writer.setMetadata('repository', 'github.com/org/repo')
|
|
555
|
-
* ```
|
|
556
|
-
*/
|
|
557
|
-
setMetadata(key, value) {
|
|
558
|
-
this._keyValueMetadata[key] = value;
|
|
559
|
-
}
|
|
560
|
-
/**
|
|
561
|
-
* Generates the Parquet file as a buffer.
|
|
562
|
-
*
|
|
563
|
-
* @description
|
|
564
|
-
* Finalizes the file by flushing any remaining rows and generating
|
|
565
|
-
* the complete Parquet file structure including header, row groups,
|
|
566
|
-
* and footer with metadata.
|
|
567
|
-
*
|
|
568
|
-
* @returns Promise resolving to the complete Parquet file as Uint8Array
|
|
569
|
-
*
|
|
570
|
-
* @example
|
|
571
|
-
* ```typescript
|
|
572
|
-
* const buffer = await writer.toBuffer()
|
|
573
|
-
* await fs.writeFile('data.parquet', buffer)
|
|
574
|
-
* ```
|
|
575
|
-
*/
|
|
576
|
-
async toBuffer() {
|
|
577
|
-
// Flush any remaining rows
|
|
578
|
-
if (this._currentRowGroup.rows.length > 0) {
|
|
579
|
-
await this.flushRowGroup();
|
|
580
|
-
}
|
|
581
|
-
return this._generateParquetBytes();
|
|
582
|
-
}
|
|
583
|
-
/**
|
|
584
|
-
* Writes the Parquet file to an output stream.
|
|
585
|
-
*
|
|
586
|
-
* @description
|
|
587
|
-
* Generates the file and writes it to the provided output stream.
|
|
588
|
-
* Useful for streaming to files or network destinations.
|
|
589
|
-
*
|
|
590
|
-
* @param output - The output stream to write to
|
|
591
|
-
* @returns Promise that resolves when writing is complete
|
|
592
|
-
*
|
|
593
|
-
* @example
|
|
594
|
-
* ```typescript
|
|
595
|
-
* const output = new FileOutputStream('data.parquet')
|
|
596
|
-
* await writer.writeTo(output)
|
|
597
|
-
* output.close()
|
|
598
|
-
* ```
|
|
599
|
-
*/
|
|
600
|
-
async writeTo(output) {
|
|
601
|
-
const bytes = await this.toBuffer();
|
|
602
|
-
output.write(bytes);
|
|
603
|
-
}
|
|
604
|
-
/**
|
|
605
|
-
* Resets the writer to its initial state.
|
|
606
|
-
*
|
|
607
|
-
* @description
|
|
608
|
-
* Clears all written data, row groups, and metadata. The schema
|
|
609
|
-
* and options remain unchanged. Useful for writing multiple files
|
|
610
|
-
* with the same configuration.
|
|
611
|
-
*
|
|
612
|
-
* @example
|
|
613
|
-
* ```typescript
|
|
614
|
-
* // Write first file
|
|
615
|
-
* await writer.writeRows(batch1)
|
|
616
|
-
* const file1 = await writer.toBuffer()
|
|
617
|
-
*
|
|
618
|
-
* // Reset and write second file
|
|
619
|
-
* writer.reset()
|
|
620
|
-
* await writer.writeRows(batch2)
|
|
621
|
-
* const file2 = await writer.toBuffer()
|
|
622
|
-
* ```
|
|
623
|
-
*/
|
|
624
|
-
reset() {
|
|
625
|
-
this._rowCount = 0;
|
|
626
|
-
this._rowGroups = [];
|
|
627
|
-
this._currentRowGroup = { rows: [], byteSize: 0 };
|
|
628
|
-
this._isClosed = false;
|
|
629
|
-
this._keyValueMetadata = {};
|
|
630
|
-
this._createdAt = Date.now();
|
|
631
|
-
}
|
|
632
|
-
/**
|
|
633
|
-
* Validates a row against the schema.
|
|
634
|
-
*
|
|
635
|
-
* @param row - The row to validate
|
|
636
|
-
* @throws {ParquetError} If validation fails
|
|
637
|
-
* @private
|
|
638
|
-
*/
|
|
639
|
-
_validateRow(row) {
|
|
640
|
-
for (const field of this.schema.fields) {
|
|
641
|
-
const value = row[field.name];
|
|
642
|
-
// Check required fields
|
|
643
|
-
if (field.required && (value === undefined || value === null)) {
|
|
644
|
-
throw new ParquetError(`Missing required field: ${field.name}`, 'MISSING_REQUIRED_FIELD');
|
|
645
|
-
}
|
|
646
|
-
// Check type if value is present and not null
|
|
647
|
-
if (value !== null && value !== undefined) {
|
|
648
|
-
if (!this._validateType(value, field.type)) {
|
|
649
|
-
throw new ParquetError(`Invalid type for field ${field.name}: expected ${field.type}`, 'INVALID_FIELD_TYPE');
|
|
650
|
-
}
|
|
651
|
-
}
|
|
652
|
-
}
|
|
653
|
-
}
|
|
654
|
-
/**
|
|
655
|
-
* Validates a value matches the expected Parquet type.
|
|
656
|
-
*
|
|
657
|
-
* @param value - The value to validate
|
|
658
|
-
* @param type - The expected Parquet type
|
|
659
|
-
* @returns true if valid, false otherwise
|
|
660
|
-
* @private
|
|
661
|
-
*/
|
|
662
|
-
_validateType(value, type) {
|
|
663
|
-
switch (type) {
|
|
664
|
-
case ParquetFieldType.STRING:
|
|
665
|
-
return typeof value === 'string';
|
|
666
|
-
case ParquetFieldType.INT32:
|
|
667
|
-
case ParquetFieldType.INT64:
|
|
668
|
-
case ParquetFieldType.FLOAT:
|
|
669
|
-
case ParquetFieldType.DOUBLE:
|
|
670
|
-
case ParquetFieldType.TIMESTAMP_MILLIS:
|
|
671
|
-
case ParquetFieldType.TIMESTAMP_MICROS:
|
|
672
|
-
return typeof value === 'number';
|
|
673
|
-
case ParquetFieldType.BOOLEAN:
|
|
674
|
-
return typeof value === 'boolean';
|
|
675
|
-
case ParquetFieldType.BINARY:
|
|
676
|
-
return value instanceof Uint8Array || typeof value === 'string';
|
|
677
|
-
default:
|
|
678
|
-
return false;
|
|
679
|
-
}
|
|
680
|
-
}
|
|
681
|
-
/**
|
|
682
|
-
* Estimates the memory size of a row.
|
|
683
|
-
*
|
|
684
|
-
* @param row - The row to estimate
|
|
685
|
-
* @returns Estimated size in bytes
|
|
686
|
-
* @private
|
|
687
|
-
*/
|
|
688
|
-
_estimateRowSize(row) {
|
|
689
|
-
let size = 0;
|
|
690
|
-
for (const field of this.schema.fields) {
|
|
691
|
-
const value = row[field.name];
|
|
692
|
-
if (value === null || value === undefined) {
|
|
693
|
-
size += 1; // null marker
|
|
694
|
-
}
|
|
695
|
-
else if (typeof value === 'string') {
|
|
696
|
-
size += value.length * 2; // UTF-16
|
|
697
|
-
}
|
|
698
|
-
else if (typeof value === 'number') {
|
|
699
|
-
size += 8; // 64-bit
|
|
700
|
-
}
|
|
701
|
-
else if (typeof value === 'boolean') {
|
|
702
|
-
size += 1;
|
|
703
|
-
}
|
|
704
|
-
else if (value instanceof Uint8Array) {
|
|
705
|
-
size += value.length;
|
|
706
|
-
}
|
|
707
|
-
}
|
|
708
|
-
return size;
|
|
709
|
-
}
|
|
710
|
-
/**
|
|
711
|
-
* Builds a row group from internal representation.
|
|
712
|
-
*
|
|
713
|
-
* @param internal - The internal row group data
|
|
714
|
-
* @returns The row group metadata
|
|
715
|
-
* @private
|
|
716
|
-
*/
|
|
717
|
-
_buildRowGroup(internal) {
|
|
718
|
-
const columns = this.schema.fields.map(field => {
|
|
719
|
-
const values = internal.rows.map(row => row[field.name]);
|
|
720
|
-
const stats = this.options.enableStatistics ? this._computeStatistics(values, field.type) : undefined;
|
|
721
|
-
const compression = this.options.columnCompression?.[field.name] ?? this.options.compression;
|
|
722
|
-
return {
|
|
723
|
-
column: field.name,
|
|
724
|
-
type: field.type,
|
|
725
|
-
compression,
|
|
726
|
-
encodedSize: this._estimateEncodedSize(values, field.type, compression),
|
|
727
|
-
uncompressedSize: this._estimateUncompressedSize(values, field.type),
|
|
728
|
-
statistics: stats
|
|
729
|
-
};
|
|
730
|
-
});
|
|
731
|
-
return {
|
|
732
|
-
numRows: internal.rows.length,
|
|
733
|
-
totalByteSize: columns.reduce((sum, col) => sum + col.encodedSize, 0),
|
|
734
|
-
columns
|
|
735
|
-
};
|
|
736
|
-
}
|
|
737
|
-
/**
|
|
738
|
-
* Computes statistics for a column.
|
|
739
|
-
*
|
|
740
|
-
* @param values - The column values
|
|
741
|
-
* @param type - The column type
|
|
742
|
-
* @returns Column statistics
|
|
743
|
-
* @private
|
|
744
|
-
*/
|
|
745
|
-
_computeStatistics(values, type) {
|
|
746
|
-
const nonNullValues = values.filter(v => v !== null && v !== undefined);
|
|
747
|
-
const nullCount = values.length - nonNullValues.length;
|
|
748
|
-
if (nonNullValues.length === 0) {
|
|
749
|
-
return { nullCount };
|
|
750
|
-
}
|
|
751
|
-
switch (type) {
|
|
752
|
-
case ParquetFieldType.INT32:
|
|
753
|
-
case ParquetFieldType.INT64:
|
|
754
|
-
case ParquetFieldType.FLOAT:
|
|
755
|
-
case ParquetFieldType.DOUBLE:
|
|
756
|
-
case ParquetFieldType.TIMESTAMP_MILLIS:
|
|
757
|
-
case ParquetFieldType.TIMESTAMP_MICROS: {
|
|
758
|
-
const numbers = nonNullValues.filter(v => typeof v === 'number' && !Number.isNaN(v));
|
|
759
|
-
if (numbers.length === 0) {
|
|
760
|
-
return { nullCount };
|
|
761
|
-
}
|
|
762
|
-
return {
|
|
763
|
-
min: Math.min(...numbers),
|
|
764
|
-
max: Math.max(...numbers),
|
|
765
|
-
nullCount
|
|
766
|
-
};
|
|
767
|
-
}
|
|
768
|
-
case ParquetFieldType.STRING: {
|
|
769
|
-
const strings = nonNullValues;
|
|
770
|
-
return {
|
|
771
|
-
min: strings.reduce((a, b) => a < b ? a : b),
|
|
772
|
-
max: strings.reduce((a, b) => a > b ? a : b),
|
|
773
|
-
nullCount
|
|
774
|
-
};
|
|
775
|
-
}
|
|
776
|
-
case ParquetFieldType.BOOLEAN: {
|
|
777
|
-
return { nullCount };
|
|
778
|
-
}
|
|
779
|
-
default:
|
|
780
|
-
return { nullCount };
|
|
781
|
-
}
|
|
782
|
-
}
|
|
783
|
-
/**
|
|
784
|
-
* Estimates the encoded size after compression.
|
|
785
|
-
*
|
|
786
|
-
* @param values - The column values
|
|
787
|
-
* @param type - The column type
|
|
788
|
-
* @param compression - The compression type
|
|
789
|
-
* @returns Estimated compressed size in bytes
|
|
790
|
-
* @private
|
|
791
|
-
*/
|
|
792
|
-
_estimateEncodedSize(values, type, compression) {
|
|
793
|
-
const uncompressedSize = this._estimateUncompressedSize(values, type);
|
|
794
|
-
// Apply compression ratio estimate
|
|
795
|
-
switch (compression) {
|
|
796
|
-
case ParquetCompression.SNAPPY:
|
|
797
|
-
return Math.floor(uncompressedSize * 0.5);
|
|
798
|
-
case ParquetCompression.GZIP:
|
|
799
|
-
return Math.floor(uncompressedSize * 0.3);
|
|
800
|
-
case ParquetCompression.ZSTD:
|
|
801
|
-
return Math.floor(uncompressedSize * 0.25);
|
|
802
|
-
case ParquetCompression.LZ4:
|
|
803
|
-
return Math.floor(uncompressedSize * 0.4);
|
|
804
|
-
case ParquetCompression.UNCOMPRESSED:
|
|
805
|
-
default:
|
|
806
|
-
return uncompressedSize;
|
|
807
|
-
}
|
|
808
|
-
}
|
|
809
|
-
/**
|
|
810
|
-
* Estimates the uncompressed size of column values.
|
|
811
|
-
*
|
|
812
|
-
* @param values - The column values
|
|
813
|
-
* @param type - The column type
|
|
814
|
-
* @returns Estimated uncompressed size in bytes
|
|
815
|
-
* @private
|
|
816
|
-
*/
|
|
817
|
-
_estimateUncompressedSize(values, type) {
|
|
818
|
-
let size = 0;
|
|
819
|
-
for (const value of values) {
|
|
820
|
-
if (value === null || value === undefined) {
|
|
821
|
-
size += 1;
|
|
822
|
-
}
|
|
823
|
-
else {
|
|
824
|
-
switch (type) {
|
|
825
|
-
case ParquetFieldType.STRING:
|
|
826
|
-
size += value.length * 2;
|
|
827
|
-
break;
|
|
828
|
-
case ParquetFieldType.INT32:
|
|
829
|
-
case ParquetFieldType.FLOAT:
|
|
830
|
-
size += 4;
|
|
831
|
-
break;
|
|
832
|
-
case ParquetFieldType.INT64:
|
|
833
|
-
case ParquetFieldType.DOUBLE:
|
|
834
|
-
case ParquetFieldType.TIMESTAMP_MILLIS:
|
|
835
|
-
case ParquetFieldType.TIMESTAMP_MICROS:
|
|
836
|
-
size += 8;
|
|
837
|
-
break;
|
|
838
|
-
case ParquetFieldType.BOOLEAN:
|
|
839
|
-
size += 1;
|
|
840
|
-
break;
|
|
841
|
-
case ParquetFieldType.BINARY:
|
|
842
|
-
size += value instanceof Uint8Array ? value.length : value.length;
|
|
843
|
-
break;
|
|
844
|
-
}
|
|
845
|
-
}
|
|
846
|
-
}
|
|
847
|
-
return size;
|
|
848
|
-
}
|
|
849
|
-
/**
|
|
850
|
-
* Generates the complete Parquet file bytes.
|
|
851
|
-
*
|
|
852
|
-
* @returns The complete Parquet file as Uint8Array
|
|
853
|
-
* @private
|
|
854
|
-
*/
|
|
855
|
-
_generateParquetBytes() {
|
|
856
|
-
// Build all row data - will be populated from row groups in full implementation
|
|
857
|
-
// For now, row group data is serialized directly below
|
|
858
|
-
// Calculate metadata
|
|
859
|
-
const metadata = {
|
|
860
|
-
schema: this.schema,
|
|
861
|
-
numRows: this._rowCount,
|
|
862
|
-
rowGroups: this._rowGroups,
|
|
863
|
-
compression: this.options.compression,
|
|
864
|
-
columnCompression: this.options.columnCompression,
|
|
865
|
-
keyValueMetadata: this._keyValueMetadata,
|
|
866
|
-
createdAt: this._createdAt,
|
|
867
|
-
sortedBy: this.options.sortBy,
|
|
868
|
-
partitionColumns: this.options.partitionColumns
|
|
869
|
-
};
|
|
870
|
-
// Encode metadata to JSON and then to bytes
|
|
871
|
-
const metadataJson = JSON.stringify(metadata);
|
|
872
|
-
const metadataBytes = new TextEncoder().encode(metadataJson);
|
|
873
|
-
// Compress metadata if needed
|
|
874
|
-
let compressedMetadata;
|
|
875
|
-
if (this.options.compression === ParquetCompression.GZIP) {
|
|
876
|
-
compressedMetadata = pako.gzip(metadataBytes);
|
|
877
|
-
}
|
|
878
|
-
else {
|
|
879
|
-
// For SNAPPY, ZSTD, LZ4 - we'll use a simple RLE-like compression simulation
|
|
880
|
-
// In production, you'd use actual compression libraries
|
|
881
|
-
compressedMetadata = this._simpleCompress(metadataBytes, this.options.compression);
|
|
882
|
-
}
|
|
883
|
-
// Build final file structure
|
|
884
|
-
// PAR1 magic (4 bytes) + data + metadata length (4 bytes) + metadata + PAR1 magic (4 bytes)
|
|
885
|
-
const magic = new TextEncoder().encode('PAR1');
|
|
886
|
-
const metadataLength = new Uint8Array(4);
|
|
887
|
-
new DataView(metadataLength.buffer).setUint32(0, compressedMetadata.length, true);
|
|
888
|
-
// Calculate total size
|
|
889
|
-
const totalSize = 4 + compressedMetadata.length + 4 + 4;
|
|
890
|
-
const result = new Uint8Array(totalSize);
|
|
891
|
-
// Write structure
|
|
892
|
-
let offset = 0;
|
|
893
|
-
result.set(magic, offset);
|
|
894
|
-
offset += 4;
|
|
895
|
-
result.set(compressedMetadata, offset);
|
|
896
|
-
offset += compressedMetadata.length;
|
|
897
|
-
result.set(metadataLength, offset);
|
|
898
|
-
offset += 4;
|
|
899
|
-
result.set(magic, offset);
|
|
900
|
-
return result;
|
|
901
|
-
}
|
|
902
|
-
/**
|
|
903
|
-
* Simple compression simulation for non-gzip formats.
|
|
904
|
-
*
|
|
905
|
-
* @param data - Data to compress
|
|
906
|
-
* @param compression - Compression type
|
|
907
|
-
* @returns Compressed data
|
|
908
|
-
* @private
|
|
909
|
-
*/
|
|
910
|
-
_simpleCompress(data, compression) {
|
|
911
|
-
if (compression === ParquetCompression.UNCOMPRESSED) {
|
|
912
|
-
return data;
|
|
913
|
-
}
|
|
914
|
-
// Use pako deflate for a basic compression simulation
|
|
915
|
-
// Real implementation would use snappy-js, zstd-codec, lz4js etc.
|
|
916
|
-
try {
|
|
917
|
-
return pako.deflate(data, { level: compression === ParquetCompression.ZSTD ? 9 : 6 });
|
|
918
|
-
}
|
|
919
|
-
catch {
|
|
920
|
-
return data;
|
|
921
|
-
}
|
|
922
|
-
}
|
|
923
|
-
}
|
|
924
|
-
// ============================================================================
|
|
925
|
-
// Helper Functions
|
|
926
|
-
// ============================================================================
|
|
927
|
-
/**
 * Defines a Parquet schema.
 *
 * @description
 * Creates a validated Parquet schema from field definitions. Validates that:
 * - `fields` is an array with at least one entry
 * - All field names are non-empty strings (whitespace-only counts as empty)
 * - All field names are unique
 *
 * Malformed input (a missing or non-array `fields`, a field without a
 * string name) is reported through the same ParquetError codes instead of
 * escaping as a raw TypeError.
 *
 * Each returned field is a new object holding only `name`, `type`,
 * `required` and `metadata`; other properties of the input are dropped.
 *
 * @param fields - Array of field definitions
 * @param metadata - Optional schema-level metadata
 * @returns Validated Parquet schema
 *
 * @throws {ParquetError} EMPTY_SCHEMA - If fields is not an array or is empty
 * @throws {ParquetError} EMPTY_FIELD_NAME - If any field name is missing, not a string, or empty
 * @throws {ParquetError} DUPLICATE_FIELD - If field names are not unique
 *
 * @example
 * ```typescript
 * const schema = defineSchema([
 *   { name: 'id', type: ParquetFieldType.INT64, required: true },
 *   { name: 'name', type: ParquetFieldType.STRING, required: true },
 *   { name: 'age', type: ParquetFieldType.INT32, required: false },
 *   { name: 'created_at', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
 * ], {
 *   version: '1.0',
 *   description: 'User records'
 * })
 * ```
 */
export function defineSchema(fields, metadata) {
    if (!Array.isArray(fields) || fields.length === 0) {
        throw new ParquetError('Schema cannot be empty', 'EMPTY_SCHEMA');
    }
    const seenNames = new Set();
    for (const field of fields) {
        const name = field?.name;
        // Guard the type before calling trim(): a numeric or missing name
        // would otherwise fail with "trim is not a function".
        if (typeof name !== 'string' || name.trim() === '') {
            throw new ParquetError('Field name cannot be empty', 'EMPTY_FIELD_NAME');
        }
        if (seenNames.has(name)) {
            throw new ParquetError(`Duplicate field name: ${name}`, 'DUPLICATE_FIELD');
        }
        seenNames.add(name);
    }
    return {
        fields: fields.map(f => ({
            name: f.name,
            type: f.type,
            required: f.required,
            metadata: f.metadata
        })),
        metadata
    };
}
|
|
982
|
-
/**
 * Builds a ParquetWriter for the given schema.
 *
 * @description
 * Thin factory around the constructor: the call is equivalent to
 * `new ParquetWriter(schema, options)`. When `options` is omitted an
 * empty object is passed.
 *
 * @param schema - The Parquet schema
 * @param options - Writer options
 * @returns A new ParquetWriter instance
 *
 * @example
 * ```typescript
 * const writer = createParquetWriter(schema, {
 *   rowGroupSize: 10000,
 *   compression: ParquetCompression.SNAPPY
 * })
 * ```
 */
export function createParquetWriter(schema, options = {}) {
    const writer = new ParquetWriter(schema, options);
    return writer;
}
|
|
1004
|
-
/**
 * One-shot helper: turns rows into a Parquet file buffer.
 *
 * @description
 * Creates a writer via `createParquetWriter()`, writes every row with
 * `writeRows()`, and returns the result of `toBuffer()`. Any error raised
 * while writing rejects the returned promise.
 *
 * @param schema - The Parquet schema
 * @param rows - Array of rows to write
 * @param options - Writer options
 * @returns Promise resolving to the complete Parquet file as Uint8Array
 *
 * @example
 * ```typescript
 * const buffer = await writeParquetFile(schema, [
 *   { id: 1, name: 'Alice' },
 *   { id: 2, name: 'Bob' }
 * ], {
 *   compression: ParquetCompression.GZIP
 * })
 *
 * await fs.writeFile('data.parquet', buffer)
 * ```
 */
export async function writeParquetFile(schema, rows, options = {}) {
    const parquetWriter = createParquetWriter(schema, options);
    await parquetWriter.writeRows(rows);
    const fileBytes = parquetWriter.toBuffer();
    return fileBytes;
}
|
|
1033
|
-
/**
 * Finishes a writer: produces the file and marks the writer closed.
 *
 * @description
 * The buffer is generated first with `toBuffer()`; only after that
 * succeeds is the writer's closed flag set. If generating the buffer
 * fails, the error propagates and the writer stays open.
 *
 * @param writer - The ParquetWriter to close
 * @returns Promise resolving to the complete Parquet file as Uint8Array
 *
 * @example
 * ```typescript
 * await writer.writeRows(data)
 * const buffer = await closeWriter(writer)
 * console.log(writer.isClosed) // true
 * ```
 */
export async function closeWriter(writer) {
    const fileBytes = await writer.toBuffer();
    // Flip the flag only after the buffer exists.
    writer._isClosed = true;
    return fileBytes;
}
|
|
1055
|
-
/**
 * Adds a row group to the writer.
 *
 * @description
 * Writes the given rows and then flushes them as their own row group.
 * Rows that were already buffered by earlier `writeRow()`/`writeRows()`
 * calls are flushed first, so they end up in a separate, preceding row
 * group instead of being merged into this one.
 *
 * The writer's automatic limits still apply: if `rows` is larger than the
 * configured row group size or memory limit, the writer splits it into
 * several row groups.
 *
 * @param writer - The ParquetWriter to use
 * @param rows - Array of rows for this row group
 * @returns Promise that resolves when the row group is written
 *
 * @example
 * ```typescript
 * // Add explicit row groups
 * await addRowGroup(writer, batch1)  // First row group
 * await addRowGroup(writer, batch2)  // Second row group
 * ```
 */
export async function addRowGroup(writer, rows) {
    // Seal whatever is pending so the new group starts on a clean boundary
    // (a no-op when nothing is buffered).
    await writer.flushRowGroup();
    await writer.writeRows(rows);
    await writer.flushRowGroup();
}
|
|
1077
|
-
/**
 * Gets metadata from a Parquet file buffer.
 *
 * @description
 * Parses a buffer in the layout produced by this module's writer:
 * `PAR1` magic, metadata (compressed JSON), the metadata length as a
 * little-endian uint32, and a trailing `PAR1` magic. Extracts the schema,
 * row groups, compression settings and custom metadata.
 *
 * The compression of the metadata is not recorded in the file, so it is
 * detected by trial: gzip first, then raw deflate, and finally the bytes
 * are taken as uncompressed JSON.
 *
 * A buffer shorter than 12 bytes cannot hold both magics and the length
 * field and is rejected up front; without that check the leading and
 * trailing magic would be read from overlapping bytes.
 *
 * @param bytes - The Parquet file buffer
 * @returns The parsed metadata. `columnMetadata` and `keyValueMetadata`
 *          are `undefined` when the file holds no entries for them.
 *
 * @throws {ParquetError} INVALID_MAGIC - If the file is too short or doesn't have valid Parquet magic bytes
 *
 * @example
 * ```typescript
 * const buffer = await fs.readFile('data.parquet')
 * const metadata = getMetadata(buffer)
 *
 * console.log(`Rows: ${metadata.numRows}`)
 * console.log(`Schema: ${metadata.schema.fields.map(f => f.name).join(', ')}`)
 * console.log(`Row groups: ${metadata.rowGroups.length}`)
 *
 * for (const rg of metadata.rowGroups) {
 *   console.log(`  - ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
 * }
 * ```
 */
export function getMetadata(bytes) {
    // 4 (magic) + 4 (metadata length) + 4 (magic)
    const MIN_FILE_SIZE = 12;
    const decoder = new TextDecoder();
    const hasMagic = bytes.length >= MIN_FILE_SIZE &&
        decoder.decode(bytes.slice(0, 4)) === 'PAR1' &&
        decoder.decode(bytes.slice(-4)) === 'PAR1';
    if (!hasMagic) {
        throw new ParquetError('Invalid Parquet file: missing magic bytes', 'INVALID_MAGIC');
    }
    // Read metadata length (4 bytes before final magic)
    const metadataLengthOffset = bytes.length - 8;
    const metadataLength = new DataView(bytes.buffer, bytes.byteOffset + metadataLengthOffset, 4).getUint32(0, true);
    // The metadata sits directly after the leading magic.
    const metadataStart = 4;
    const compressedMetadata = bytes.slice(metadataStart, metadataStart + metadataLength);
    // Format sniffing: a failed attempt just means "not this format", so
    // the errors are intentionally discarded.
    let metadataBytes;
    try {
        metadataBytes = pako.ungzip(compressedMetadata);
    }
    catch {
        try {
            metadataBytes = pako.inflate(compressedMetadata);
        }
        catch {
            // Assume uncompressed
            metadataBytes = compressedMetadata;
        }
    }
    const internal = JSON.parse(decoder.decode(metadataBytes));
    // Build column metadata map
    const columnMetadata = {};
    if (internal.columnCompression) {
        for (const [col, comp] of Object.entries(internal.columnCompression)) {
            columnMetadata[col] = { compression: comp };
        }
    }
    // Tolerate metadata written without a keyValueMetadata entry.
    const keyValueMetadata = internal.keyValueMetadata ?? {};
    return {
        schema: internal.schema,
        numRows: internal.numRows,
        rowGroups: internal.rowGroups,
        compression: internal.compression,
        columnMetadata: Object.keys(columnMetadata).length > 0 ? columnMetadata : undefined,
        keyValueMetadata: Object.keys(keyValueMetadata).length > 0 ? keyValueMetadata : undefined,
        createdAt: internal.createdAt,
        fileSize: bytes.length,
        sortedBy: internal.sortedBy,
        partitionColumns: internal.partitionColumns
    };
}
|
|
1155
|
-
/**
 * Changes a writer's default compression.
 *
 * @description
 * Overwrites `writer.options.compression` in place. The new value is the
 * one the writer reads from then on; columns that have their own entry in
 * `columnCompression` keep that setting.
 *
 * @param writer - The ParquetWriter to update
 * @param compression - The new compression type
 *
 * @example
 * ```typescript
 * const writer = createParquetWriter(schema)
 *
 * // Write some rows with SNAPPY (default)
 * await writer.writeRows(batch1)
 * await writer.flushRowGroup()
 *
 * // Switch to GZIP for remaining data
 * setCompression(writer, ParquetCompression.GZIP)
 * await writer.writeRows(batch2)
 * ```
 */
export function setCompression(writer, compression) {
    const { options } = writer;
    options.compression = compression;
}
|
|
1183
|
-
//# sourceMappingURL=parquet-writer.js.map
|