gitx.do 0.0.1 → 0.0.3
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/blame.d.ts +259 -0
- package/dist/cli/commands/blame.d.ts.map +1 -0
- package/dist/cli/commands/blame.js +609 -0
- package/dist/cli/commands/blame.js.map +1 -0
- package/dist/cli/commands/branch.d.ts +249 -0
- package/dist/cli/commands/branch.d.ts.map +1 -0
- package/dist/cli/commands/branch.js +693 -0
- package/dist/cli/commands/branch.js.map +1 -0
- package/dist/cli/commands/commit.d.ts +182 -0
- package/dist/cli/commands/commit.d.ts.map +1 -0
- package/dist/cli/commands/commit.js +437 -0
- package/dist/cli/commands/commit.js.map +1 -0
- package/dist/cli/commands/diff.d.ts +464 -0
- package/dist/cli/commands/diff.d.ts.map +1 -0
- package/dist/cli/commands/diff.js +958 -0
- package/dist/cli/commands/diff.js.map +1 -0
- package/dist/cli/commands/log.d.ts +239 -0
- package/dist/cli/commands/log.d.ts.map +1 -0
- package/dist/cli/commands/log.js +535 -0
- package/dist/cli/commands/log.js.map +1 -0
- package/dist/cli/commands/review.d.ts +457 -0
- package/dist/cli/commands/review.d.ts.map +1 -0
- package/dist/cli/commands/review.js +533 -0
- package/dist/cli/commands/review.js.map +1 -0
- package/dist/cli/commands/status.d.ts +269 -0
- package/dist/cli/commands/status.d.ts.map +1 -0
- package/dist/cli/commands/status.js +493 -0
- package/dist/cli/commands/status.js.map +1 -0
- package/dist/cli/commands/web.d.ts +199 -0
- package/dist/cli/commands/web.d.ts.map +1 -0
- package/dist/cli/commands/web.js +696 -0
- package/dist/cli/commands/web.js.map +1 -0
- package/dist/cli/fs-adapter.d.ts +656 -0
- package/dist/cli/fs-adapter.d.ts.map +1 -0
- package/dist/cli/fs-adapter.js +1179 -0
- package/dist/cli/fs-adapter.js.map +1 -0
- package/dist/cli/index.d.ts +387 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +523 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/ui/components/DiffView.d.ts +7 -0
- package/dist/cli/ui/components/DiffView.d.ts.map +1 -0
- package/dist/cli/ui/components/DiffView.js +11 -0
- package/dist/cli/ui/components/DiffView.js.map +1 -0
- package/dist/cli/ui/components/ErrorDisplay.d.ts +6 -0
- package/dist/cli/ui/components/ErrorDisplay.d.ts.map +1 -0
- package/dist/cli/ui/components/ErrorDisplay.js +11 -0
- package/dist/cli/ui/components/ErrorDisplay.js.map +1 -0
- package/dist/cli/ui/components/FuzzySearch.d.ts +9 -0
- package/dist/cli/ui/components/FuzzySearch.d.ts.map +1 -0
- package/dist/cli/ui/components/FuzzySearch.js +12 -0
- package/dist/cli/ui/components/FuzzySearch.js.map +1 -0
- package/dist/cli/ui/components/LoadingSpinner.d.ts +6 -0
- package/dist/cli/ui/components/LoadingSpinner.d.ts.map +1 -0
- package/dist/cli/ui/components/LoadingSpinner.js +10 -0
- package/dist/cli/ui/components/LoadingSpinner.js.map +1 -0
- package/dist/cli/ui/components/NavigationList.d.ts +9 -0
- package/dist/cli/ui/components/NavigationList.d.ts.map +1 -0
- package/dist/cli/ui/components/NavigationList.js +11 -0
- package/dist/cli/ui/components/NavigationList.js.map +1 -0
- package/dist/cli/ui/components/ScrollableContent.d.ts +8 -0
- package/dist/cli/ui/components/ScrollableContent.d.ts.map +1 -0
- package/dist/cli/ui/components/ScrollableContent.js +11 -0
- package/dist/cli/ui/components/ScrollableContent.js.map +1 -0
- package/dist/cli/ui/components/index.d.ts +7 -0
- package/dist/cli/ui/components/index.d.ts.map +1 -0
- package/dist/cli/ui/components/index.js +9 -0
- package/dist/cli/ui/components/index.js.map +1 -0
- package/dist/cli/ui/terminal-ui.d.ts +52 -0
- package/dist/cli/ui/terminal-ui.d.ts.map +1 -0
- package/dist/cli/ui/terminal-ui.js +121 -0
- package/dist/cli/ui/terminal-ui.js.map +1 -0
- package/dist/durable-object/object-store.d.ts +401 -23
- package/dist/durable-object/object-store.d.ts.map +1 -1
- package/dist/durable-object/object-store.js +414 -25
- package/dist/durable-object/object-store.js.map +1 -1
- package/dist/durable-object/schema.d.ts +188 -0
- package/dist/durable-object/schema.d.ts.map +1 -1
- package/dist/durable-object/schema.js +160 -0
- package/dist/durable-object/schema.js.map +1 -1
- package/dist/durable-object/wal.d.ts +336 -31
- package/dist/durable-object/wal.d.ts.map +1 -1
- package/dist/durable-object/wal.js +272 -27
- package/dist/durable-object/wal.js.map +1 -1
- package/dist/index.d.ts +379 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +379 -7
- package/dist/index.js.map +1 -1
- package/dist/mcp/adapter.d.ts +579 -38
- package/dist/mcp/adapter.d.ts.map +1 -1
- package/dist/mcp/adapter.js +426 -33
- package/dist/mcp/adapter.js.map +1 -1
- package/dist/mcp/sandbox.d.ts +532 -29
- package/dist/mcp/sandbox.d.ts.map +1 -1
- package/dist/mcp/sandbox.js +389 -22
- package/dist/mcp/sandbox.js.map +1 -1
- package/dist/mcp/sdk-adapter.d.ts +478 -56
- package/dist/mcp/sdk-adapter.d.ts.map +1 -1
- package/dist/mcp/sdk-adapter.js +346 -44
- package/dist/mcp/sdk-adapter.js.map +1 -1
- package/dist/mcp/tools.d.ts +445 -30
- package/dist/mcp/tools.d.ts.map +1 -1
- package/dist/mcp/tools.js +363 -33
- package/dist/mcp/tools.js.map +1 -1
- package/dist/ops/blame.d.ts +424 -21
- package/dist/ops/blame.d.ts.map +1 -1
- package/dist/ops/blame.js +303 -20
- package/dist/ops/blame.js.map +1 -1
- package/dist/ops/branch.d.ts +583 -32
- package/dist/ops/branch.d.ts.map +1 -1
- package/dist/ops/branch.js +365 -23
- package/dist/ops/branch.js.map +1 -1
- package/dist/ops/commit-traversal.d.ts +164 -24
- package/dist/ops/commit-traversal.d.ts.map +1 -1
- package/dist/ops/commit-traversal.js +68 -2
- package/dist/ops/commit-traversal.js.map +1 -1
- package/dist/ops/commit.d.ts +387 -53
- package/dist/ops/commit.d.ts.map +1 -1
- package/dist/ops/commit.js +249 -29
- package/dist/ops/commit.js.map +1 -1
- package/dist/ops/merge-base.d.ts +195 -21
- package/dist/ops/merge-base.d.ts.map +1 -1
- package/dist/ops/merge-base.js +122 -12
- package/dist/ops/merge-base.js.map +1 -1
- package/dist/ops/merge.d.ts +600 -130
- package/dist/ops/merge.d.ts.map +1 -1
- package/dist/ops/merge.js +408 -60
- package/dist/ops/merge.js.map +1 -1
- package/dist/ops/tag.d.ts +67 -2
- package/dist/ops/tag.d.ts.map +1 -1
- package/dist/ops/tag.js +42 -1
- package/dist/ops/tag.js.map +1 -1
- package/dist/ops/tree-builder.d.ts +102 -6
- package/dist/ops/tree-builder.d.ts.map +1 -1
- package/dist/ops/tree-builder.js +30 -5
- package/dist/ops/tree-builder.js.map +1 -1
- package/dist/ops/tree-diff.d.ts +50 -2
- package/dist/ops/tree-diff.d.ts.map +1 -1
- package/dist/ops/tree-diff.js +50 -2
- package/dist/ops/tree-diff.js.map +1 -1
- package/dist/pack/delta.d.ts +211 -39
- package/dist/pack/delta.d.ts.map +1 -1
- package/dist/pack/delta.js +232 -46
- package/dist/pack/delta.js.map +1 -1
- package/dist/pack/format.d.ts +390 -28
- package/dist/pack/format.d.ts.map +1 -1
- package/dist/pack/format.js +344 -33
- package/dist/pack/format.js.map +1 -1
- package/dist/pack/full-generation.d.ts +313 -28
- package/dist/pack/full-generation.d.ts.map +1 -1
- package/dist/pack/full-generation.js +238 -19
- package/dist/pack/full-generation.js.map +1 -1
- package/dist/pack/generation.d.ts +346 -23
- package/dist/pack/generation.d.ts.map +1 -1
- package/dist/pack/generation.js +269 -21
- package/dist/pack/generation.js.map +1 -1
- package/dist/pack/index.d.ts +407 -86
- package/dist/pack/index.d.ts.map +1 -1
- package/dist/pack/index.js +351 -70
- package/dist/pack/index.js.map +1 -1
- package/dist/refs/branch.d.ts +517 -71
- package/dist/refs/branch.d.ts.map +1 -1
- package/dist/refs/branch.js +410 -26
- package/dist/refs/branch.js.map +1 -1
- package/dist/refs/storage.d.ts +610 -57
- package/dist/refs/storage.d.ts.map +1 -1
- package/dist/refs/storage.js +481 -29
- package/dist/refs/storage.js.map +1 -1
- package/dist/refs/tag.d.ts +677 -67
- package/dist/refs/tag.d.ts.map +1 -1
- package/dist/refs/tag.js +497 -30
- package/dist/refs/tag.js.map +1 -1
- package/dist/storage/lru-cache.d.ts +556 -53
- package/dist/storage/lru-cache.d.ts.map +1 -1
- package/dist/storage/lru-cache.js +439 -36
- package/dist/storage/lru-cache.js.map +1 -1
- package/dist/storage/object-index.d.ts +483 -38
- package/dist/storage/object-index.d.ts.map +1 -1
- package/dist/storage/object-index.js +388 -22
- package/dist/storage/object-index.js.map +1 -1
- package/dist/storage/r2-pack.d.ts +957 -94
- package/dist/storage/r2-pack.d.ts.map +1 -1
- package/dist/storage/r2-pack.js +756 -48
- package/dist/storage/r2-pack.js.map +1 -1
- package/dist/tiered/cdc-pipeline.d.ts +1610 -38
- package/dist/tiered/cdc-pipeline.d.ts.map +1 -1
- package/dist/tiered/cdc-pipeline.js +1131 -22
- package/dist/tiered/cdc-pipeline.js.map +1 -1
- package/dist/tiered/migration.d.ts +903 -41
- package/dist/tiered/migration.d.ts.map +1 -1
- package/dist/tiered/migration.js +646 -24
- package/dist/tiered/migration.js.map +1 -1
- package/dist/tiered/parquet-writer.d.ts +944 -47
- package/dist/tiered/parquet-writer.d.ts.map +1 -1
- package/dist/tiered/parquet-writer.js +667 -39
- package/dist/tiered/parquet-writer.js.map +1 -1
- package/dist/tiered/read-path.d.ts +728 -34
- package/dist/tiered/read-path.d.ts.map +1 -1
- package/dist/tiered/read-path.js +310 -27
- package/dist/tiered/read-path.js.map +1 -1
- package/dist/types/objects.d.ts +457 -0
- package/dist/types/objects.d.ts.map +1 -1
- package/dist/types/objects.js +305 -4
- package/dist/types/objects.js.map +1 -1
- package/dist/types/storage.d.ts +407 -35
- package/dist/types/storage.d.ts.map +1 -1
- package/dist/types/storage.js +27 -3
- package/dist/types/storage.js.map +1 -1
- package/dist/utils/hash.d.ts +133 -12
- package/dist/utils/hash.d.ts.map +1 -1
- package/dist/utils/hash.js +133 -12
- package/dist/utils/hash.js.map +1 -1
- package/dist/utils/sha1.d.ts +102 -9
- package/dist/utils/sha1.d.ts.map +1 -1
- package/dist/utils/sha1.js +114 -11
- package/dist/utils/sha1.js.map +1 -1
- package/dist/wire/capabilities.d.ts +896 -88
- package/dist/wire/capabilities.d.ts.map +1 -1
- package/dist/wire/capabilities.js +566 -62
- package/dist/wire/capabilities.js.map +1 -1
- package/dist/wire/pkt-line.d.ts +293 -15
- package/dist/wire/pkt-line.d.ts.map +1 -1
- package/dist/wire/pkt-line.js +251 -15
- package/dist/wire/pkt-line.js.map +1 -1
- package/dist/wire/receive-pack.d.ts +814 -64
- package/dist/wire/receive-pack.d.ts.map +1 -1
- package/dist/wire/receive-pack.js +542 -41
- package/dist/wire/receive-pack.js.map +1 -1
- package/dist/wire/smart-http.d.ts +575 -97
- package/dist/wire/smart-http.d.ts.map +1 -1
- package/dist/wire/smart-http.js +337 -46
- package/dist/wire/smart-http.js.map +1 -1
- package/dist/wire/upload-pack.d.ts +492 -98
- package/dist/wire/upload-pack.d.ts.map +1 -1
- package/dist/wire/upload-pack.js +347 -59
- package/dist/wire/upload-pack.js.map +1 -1
- package/package.json +10 -2
package/dist/tiered/parquet-writer.js

@@ -1,49 +1,212 @@
 /**
- * Parquet Writer for Git Analytics
+ * @fileoverview Parquet Writer for Git Analytics
  *
- *
- *
- *
- *
- * - Metadata handling with statistics
+ * @description
+ * Provides functionality to write git analytics data to Parquet format, a
+ * columnar storage format optimized for analytical queries. This module
+ * enables efficient storage and querying of Git repository data.
  *
- *
+ * **Key Features:**
+ * - Schema definition with various field types (STRING, INT32, INT64, etc.)
+ * - Multiple compression algorithms (SNAPPY, GZIP, ZSTD, LZ4, UNCOMPRESSED)
+ * - Row group management for efficient columnar storage
+ * - Automatic and manual row group flushing
+ * - Column-level statistics generation (min, max, null count)
+ * - Custom key-value metadata support
+ * - Memory-efficient streaming writes
+ *
+ * **Parquet Format:**
+ * The generated files follow the Parquet format with:
+ * - Magic bytes "PAR1" at start and end
+ * - Row group data organized by columns
+ * - Footer metadata containing schema and statistics
+ *
+ * @example
+ * ```typescript
+ * // Define schema for commit analytics
+ * const schema = defineSchema([
+ *   { name: 'commit_sha', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'author', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true },
+ *   { name: 'file_count', type: ParquetFieldType.INT32, required: false }
+ * ])
+ *
+ * // Create writer with options
+ * const writer = createParquetWriter(schema, {
+ *   rowGroupSize: 10000,
+ *   compression: ParquetCompression.SNAPPY,
+ *   enableStatistics: true
+ * })
+ *
+ * // Write data
+ * await writer.writeRows([
+ *   { commit_sha: 'abc123...', author: 'alice', timestamp: Date.now(), file_count: 5 },
+ *   { commit_sha: 'def456...', author: 'bob', timestamp: Date.now(), file_count: 3 }
+ * ])
+ *
+ * // Generate the Parquet file
+ * const buffer = await writer.toBuffer()
+ * ```
+ *
+ * @module tiered/parquet-writer
+ * @see {@link ParquetWriter} - Main writer class
+ * @see {@link defineSchema} - Schema definition helper
  */
 import pako from 'pako';
 // ============================================================================
 // Types and Enums
 // ============================================================================
 /**
- * Supported Parquet field types
+ * Supported Parquet field types.
+ *
+ * @description
+ * Defines the data types that can be used for fields in a Parquet schema.
+ * Each type maps to an appropriate physical and logical Parquet type.
+ *
+ * @example
+ * ```typescript
+ * const field: ParquetField = {
+ *   name: 'count',
+ *   type: ParquetFieldType.INT64,
+ *   required: true
+ * }
+ * ```
+ *
+ * @enum {string}
  */
 export var ParquetFieldType;
 (function (ParquetFieldType) {
+    /**
+     * UTF-8 encoded string.
+     * Maps to Parquet BYTE_ARRAY with UTF8 logical type.
+     */
     ParquetFieldType["STRING"] = "STRING";
+    /**
+     * 32-bit signed integer.
+     * Maps to Parquet INT32 physical type.
+     */
     ParquetFieldType["INT32"] = "INT32";
+    /**
+     * 64-bit signed integer.
+     * Maps to Parquet INT64 physical type.
+     */
     ParquetFieldType["INT64"] = "INT64";
+    /**
+     * Boolean value (true/false).
+     * Maps to Parquet BOOLEAN physical type.
+     */
     ParquetFieldType["BOOLEAN"] = "BOOLEAN";
+    /**
+     * 32-bit IEEE 754 floating point.
+     * Maps to Parquet FLOAT physical type.
+     */
     ParquetFieldType["FLOAT"] = "FLOAT";
+    /**
+     * 64-bit IEEE 754 floating point.
+     * Maps to Parquet DOUBLE physical type.
+     */
     ParquetFieldType["DOUBLE"] = "DOUBLE";
+    /**
+     * Raw binary data.
+     * Maps to Parquet BYTE_ARRAY physical type.
+     */
     ParquetFieldType["BINARY"] = "BINARY";
+    /**
+     * Timestamp with millisecond precision.
+     * Maps to Parquet INT64 with TIMESTAMP_MILLIS logical type.
+     */
     ParquetFieldType["TIMESTAMP_MILLIS"] = "TIMESTAMP_MILLIS";
+    /**
+     * Timestamp with microsecond precision.
+     * Maps to Parquet INT64 with TIMESTAMP_MICROS logical type.
+     */
     ParquetFieldType["TIMESTAMP_MICROS"] = "TIMESTAMP_MICROS";
 })(ParquetFieldType || (ParquetFieldType = {}));
 /**
- * Supported compression types
+ * Supported compression types for Parquet data.
+ *
+ * @description
+ * Different compression algorithms offer trade-offs between compression
+ * ratio, compression speed, and decompression speed.
+ *
+ * **Comparison:**
+ * - SNAPPY: Fast compression/decompression, moderate ratio (default)
+ * - GZIP: Higher ratio, slower compression, fast decompression
+ * - ZSTD: Best ratio, good speed, requires more memory
+ * - LZ4: Fastest, lower ratio
+ * - UNCOMPRESSED: No compression overhead
+ *
+ * @example
+ * ```typescript
+ * const writer = createParquetWriter(schema, {
+ *   compression: ParquetCompression.ZSTD
+ * })
+ * ```
+ *
+ * @enum {string}
  */
 export var ParquetCompression;
 (function (ParquetCompression) {
+    /**
+     * No compression applied.
+     * Fastest writes, largest file size.
+     */
     ParquetCompression["UNCOMPRESSED"] = "UNCOMPRESSED";
+    /**
+     * Snappy compression (default).
+     * Good balance of speed and compression ratio.
+     */
     ParquetCompression["SNAPPY"] = "SNAPPY";
+    /**
+     * GZIP compression.
+     * Higher compression ratio, slower compression.
+     */
     ParquetCompression["GZIP"] = "GZIP";
+    /**
+     * Zstandard compression.
+     * Best compression ratio with good speed.
+     */
     ParquetCompression["ZSTD"] = "ZSTD";
+    /**
+     * LZ4 compression.
+     * Fastest compression, lower ratio.
+     */
     ParquetCompression["LZ4"] = "LZ4";
 })(ParquetCompression || (ParquetCompression = {}));
 /**
- * Error class for Parquet operations
+ * Error class for Parquet-related operations.
+ *
+ * @description
+ * Thrown when Parquet operations fail, such as schema validation errors,
+ * invalid data types, or malformed files.
+ *
+ * @example
+ * ```typescript
+ * try {
+ *   await writer.writeRow({ invalid_field: 'value' })
+ * } catch (error) {
+ *   if (error instanceof ParquetError) {
+ *     console.log(`Parquet error (${error.code}): ${error.message}`)
+ *   }
+ * }
+ * ```
+ *
+ * @class ParquetError
+ * @extends Error
  */
 export class ParquetError extends Error {
     code;
+    /**
+     * Creates a new ParquetError.
+     *
+     * @param message - Human-readable error message
+     * @param code - Error code for programmatic handling
+     *
+     * @example
+     * ```typescript
+     * throw new ParquetError('Field name cannot be empty', 'EMPTY_FIELD_NAME')
+     * ```
+     */
     constructor(message, code) {
         super(message);
         this.code = code;
@@ -54,17 +217,125 @@ export class ParquetError extends Error {
 // ParquetWriter Class
 // ============================================================================
 /**
- * Parquet writer for git analytics data
+ * Parquet writer for git analytics data.
+ *
+ * @description
+ * ParquetWriter provides a streaming interface for writing data to Parquet
+ * format. It handles schema validation, row group management, compression,
+ * and statistics generation.
+ *
+ * **Usage Pattern:**
+ * 1. Create a schema using `defineSchema()`
+ * 2. Create a writer with `createParquetWriter()` or `new ParquetWriter()`
+ * 3. Write rows using `writeRow()` or `writeRows()`
+ * 4. Generate the file with `toBuffer()` or `writeTo()`
+ *
+ * **Row Group Management:**
+ * Rows are buffered in memory until the row group is full (by row count
+ * or memory limit), then flushed. You can also manually flush with
+ * `flushRowGroup()`.
+ *
+ * **Thread Safety:**
+ * Not thread-safe. Use separate writer instances for concurrent writes.
+ *
+ * @example
+ * ```typescript
+ * // Create schema
+ * const schema = defineSchema([
+ *   { name: 'sha', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'type', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'size', type: ParquetFieldType.INT64, required: true },
+ *   { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
+ * ])
+ *
+ * // Create writer
+ * const writer = new ParquetWriter(schema, {
+ *   rowGroupSize: 10000,
+ *   compression: ParquetCompression.SNAPPY,
+ *   enableStatistics: true
+ * })
+ *
+ * // Write data
+ * for (const object of gitObjects) {
+ *   await writer.writeRow({
+ *     sha: object.sha,
+ *     type: object.type,
+ *     size: object.size,
+ *     timestamp: Date.now()
+ *   })
+ * }
+ *
+ * // Set custom metadata
+ * writer.setMetadata('git_version', '2.40.0')
+ * writer.setMetadata('repository', 'github.com/org/repo')
+ *
+ * // Generate file
+ * const buffer = await writer.toBuffer()
+ * console.log(`Generated ${buffer.length} bytes`)
+ * console.log(`Rows: ${writer.rowCount}`)
+ * console.log(`Row groups: ${writer.rowGroupCount}`)
+ *
+ * // Reset for reuse
+ * writer.reset()
+ * ```
+ *
+ * @class ParquetWriter
  */
 export class ParquetWriter {
+    /**
+     * The Parquet schema for this writer.
+     * @readonly
+     */
     schema;
+    /**
+     * Resolved options with defaults applied.
+     * @readonly
+     */
     options;
+    /**
+     * Total row count written.
+     * @private
+     */
     _rowCount = 0;
+    /**
+     * Completed row groups.
+     * @private
+     */
     _rowGroups = [];
+    /**
+     * Current row group being built.
+     * @private
+     */
     _currentRowGroup = { rows: [], byteSize: 0 };
+    /**
+     * Whether the writer has been closed.
+     * @private
+     */
     _isClosed = false;
+    /**
+     * Custom key-value metadata.
+     * @private
+     */
     _keyValueMetadata = {};
+    /**
+     * Creation timestamp.
+     * @private
+     */
     _createdAt = Date.now();
+    /**
+     * Creates a new ParquetWriter instance.
+     *
+     * @param schema - The Parquet schema defining columns
+     * @param options - Writer configuration options
+     *
+     * @example
+     * ```typescript
+     * const writer = new ParquetWriter(schema, {
+     *   rowGroupSize: 50000,
+     *   compression: ParquetCompression.GZIP
+     * })
+     * ```
+     */
     constructor(schema, options = {}) {
         this.schema = schema;
         this.options = {
@@ -74,26 +345,83 @@ export class ParquetWriter {
         };
     }
     /**
-     *
+     * Gets the total row count written to the writer.
+     *
+     * @description
+     * Returns the total number of rows written, including rows in the
+     * current unflushed row group.
+     *
+     * @returns Total row count
+     *
+     * @example
+     * ```typescript
+     * await writer.writeRows(data)
+     * console.log(`Wrote ${writer.rowCount} rows`)
+     * ```
     */
     get rowCount() {
         return this._rowCount;
     }
     /**
-     *
+     * Gets the number of row groups.
+     *
+     * @description
+     * Returns the number of completed row groups plus one if there's
+     * a pending row group with data.
+     *
+     * @returns Number of row groups
+     *
+     * @example
+     * ```typescript
+     * console.log(`Row groups: ${writer.rowGroupCount}`)
+     * ```
     */
     get rowGroupCount() {
         const pendingCount = this._currentRowGroup.rows.length > 0 ? 1 : 0;
         return this._rowGroups.length + pendingCount;
     }
     /**
-     *
+     * Checks if the writer has been closed.
+     *
+     * @description
+     * A closed writer cannot accept new rows. Writers are closed
+     * implicitly by `closeWriter()`.
+     *
+     * @returns true if closed
+     *
+     * @example
+     * ```typescript
+     * if (!writer.isClosed) {
+     *   await writer.writeRow(row)
+     * }
+     * ```
     */
     get isClosed() {
         return this._isClosed;
     }
     /**
-     *
+     * Writes a single row to the Parquet file.
+     *
+     * @description
+     * Validates the row against the schema and adds it to the current
+     * row group. Automatically flushes the row group when it reaches
+     * the configured size or memory limit.
+     *
+     * @param row - Object with column values keyed by column name
+     * @returns Promise that resolves when the row is written
+     *
+     * @throws {ParquetError} WRITER_CLOSED - If writer is closed
+     * @throws {ParquetError} MISSING_REQUIRED_FIELD - If required field is missing
+     * @throws {ParquetError} INVALID_FIELD_TYPE - If field value type doesn't match schema
+     *
+     * @example
+     * ```typescript
+     * await writer.writeRow({
+     *   id: 123,
+     *   name: 'Alice',
+     *   active: true
+     * })
+     * ```
     */
     async writeRow(row) {
         if (this._isClosed) {
@@ -115,7 +443,25 @@ export class ParquetWriter {
         }
     }
     /**
-     *
+     * Writes multiple rows to the Parquet file.
+     *
+     * @description
+     * Convenience method that writes an array of rows sequentially.
+     * Each row is validated and may trigger row group flushes.
+     *
+     * @param rows - Array of row objects to write
+     * @returns Promise that resolves when all rows are written
+     *
+     * @throws {ParquetError} Any error from writeRow()
+     *
+     * @example
+     * ```typescript
+     * await writer.writeRows([
+     *   { id: 1, name: 'Alice' },
+     *   { id: 2, name: 'Bob' },
+     *   { id: 3, name: 'Carol' }
+     * ])
+     * ```
     */
     async writeRows(rows) {
         for (const row of rows) {
@@ -123,7 +469,26 @@ export class ParquetWriter {
         }
     }
     /**
-     * Manually
+     * Manually flushes the current row group.
+     *
+     * @description
+     * Forces the current row group to be finalized and stored, even if
+     * it hasn't reached the size limit. Has no effect if the current
+     * row group is empty.
+     *
+     * @returns Promise that resolves when flush is complete
+     *
+     * @example
+     * ```typescript
+     * // Write some rows
+     * await writer.writeRows(batch1)
+     *
+     * // Force flush before writing next batch
+     * await writer.flushRowGroup()
+     *
+     * // Continue writing
+     * await writer.writeRows(batch2)
+     * ```
     */
     async flushRowGroup() {
         if (this._currentRowGroup.rows.length === 0) {
@@ -134,25 +499,79 @@ export class ParquetWriter {
         this._currentRowGroup = { rows: [], byteSize: 0 };
     }
     /**
-     *
+     * Gets the current row group's memory size.
+     *
+     * @description
+     * Returns the estimated memory consumption of the unflushed row group.
+     * Useful for monitoring memory usage during streaming writes.
+     *
+     * @returns Memory size in bytes
+     *
+     * @example
+     * ```typescript
+     * if (writer.currentRowGroupMemorySize() > 50 * 1024 * 1024) {
+     *   console.log('Row group using significant memory')
+     *   await writer.flushRowGroup()
+     * }
+     * ```
     */
     currentRowGroupMemorySize() {
         return this._currentRowGroup.byteSize;
     }
     /**
-     *
+     * Gets the completed row groups.
+     *
+     * @description
+     * Returns a copy of the completed row group metadata array.
+     * Does not include the current unflushed row group.
+     *
+     * @returns Array of row group metadata
+     *
+     * @example
+     * ```typescript
+     * for (const rg of writer.getRowGroups()) {
+     *   console.log(`Row group: ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
+     * }
+     * ```
     */
     getRowGroups() {
         return [...this._rowGroups];
     }
     /**
-     *
+     * Sets a custom key-value metadata entry.
+     *
+     * @description
+     * Adds custom metadata that will be stored in the Parquet file footer.
+     * Can be used for versioning, provenance, or application-specific data.
+     *
+     * @param key - Metadata key
+     * @param value - Metadata value
+     *
+     * @example
+     * ```typescript
+     * writer.setMetadata('created_by', 'gitdo-analytics')
+     * writer.setMetadata('schema_version', '2.0')
+     * writer.setMetadata('repository', 'github.com/org/repo')
+     * ```
     */
     setMetadata(key, value) {
         this._keyValueMetadata[key] = value;
     }
     /**
-     *
+     * Generates the Parquet file as a buffer.
+     *
+     * @description
+     * Finalizes the file by flushing any remaining rows and generating
+     * the complete Parquet file structure including header, row groups,
+     * and footer with metadata.
+     *
+     * @returns Promise resolving to the complete Parquet file as Uint8Array
+     *
+     * @example
+     * ```typescript
+     * const buffer = await writer.toBuffer()
+     * await fs.writeFile('data.parquet', buffer)
+     * ```
     */
     async toBuffer() {
         // Flush any remaining rows
@@ -162,14 +581,45 @@ export class ParquetWriter {
         return this._generateParquetBytes();
     }
     /**
-     *
+     * Writes the Parquet file to an output stream.
+     *
+     * @description
+     * Generates the file and writes it to the provided output stream.
+     * Useful for streaming to files or network destinations.
+     *
+     * @param output - The output stream to write to
+     * @returns Promise that resolves when writing is complete
+     *
+     * @example
+     * ```typescript
+     * const output = new FileOutputStream('data.parquet')
+     * await writer.writeTo(output)
+     * output.close()
+     * ```
     */
     async writeTo(output) {
         const bytes = await this.toBuffer();
         output.write(bytes);
     }
     /**
-     *
+     * Resets the writer to its initial state.
+     *
+     * @description
+     * Clears all written data, row groups, and metadata. The schema
+     * and options remain unchanged. Useful for writing multiple files
+     * with the same configuration.
+     *
+     * @example
+     * ```typescript
+     * // Write first file
+     * await writer.writeRows(batch1)
+     * const file1 = await writer.toBuffer()
+     *
+     * // Reset and write second file
+     * writer.reset()
+     * await writer.writeRows(batch2)
+     * const file2 = await writer.toBuffer()
+     * ```
     */
     reset() {
         this._rowCount = 0;
@@ -180,7 +630,11 @@ export class ParquetWriter {
         this._createdAt = Date.now();
     }
     /**
-     *
+     * Validates a row against the schema.
+     *
+     * @param row - The row to validate
+     * @throws {ParquetError} If validation fails
+     * @private
     */
     _validateRow(row) {
         for (const field of this.schema.fields) {
@@ -198,7 +652,12 @@ export class ParquetWriter {
         }
     }
     /**
-     *
+     * Validates a value matches the expected Parquet type.
+     *
+     * @param value - The value to validate
+     * @param type - The expected Parquet type
+     * @returns true if valid, false otherwise
+     * @private
     */
     _validateType(value, type) {
         switch (type) {
@@ -220,7 +679,11 @@ export class ParquetWriter {
         }
     }
     /**
-     *
+     * Estimates the memory size of a row.
+     *
+     * @param row - The row to estimate
+     * @returns Estimated size in bytes
+     * @private
     */
     _estimateRowSize(row) {
         let size = 0;
@@ -245,7 +708,11 @@ export class ParquetWriter {
         return size;
     }
     /**
-     *
+     * Builds a row group from internal representation.
+     *
+     * @param internal - The internal row group data
+     * @returns The row group metadata
+     * @private
     */
     _buildRowGroup(internal) {
         const columns = this.schema.fields.map(field => {
@@ -268,7 +735,12 @@ export class ParquetWriter {
         };
     }
     /**
-     *
+     * Computes statistics for a column.
+     *
+     * @param values - The column values
+     * @param type - The column type
+     * @returns Column statistics
+     * @private
     */
     _computeStatistics(values, type) {
         const nonNullValues = values.filter(v => v !== null && v !== undefined);
@@ -309,7 +781,13 @@ export class ParquetWriter {
         }
     }
     /**
-     *
+     * Estimates the encoded size after compression.
+     *
+     * @param values - The column values
+     * @param type - The column type
+     * @param compression - The compression type
+     * @returns Estimated compressed size in bytes
+     * @private
     */
     _estimateEncodedSize(values, type, compression) {
         const uncompressedSize = this._estimateUncompressedSize(values, type);
@@ -329,7 +807,12 @@ export class ParquetWriter {
         }
     }
     /**
-     *
+     * Estimates the uncompressed size of column values.
+     *
+     * @param values - The column values
+     * @param type - The column type
+     * @returns Estimated uncompressed size in bytes
+     * @private
     */
     _estimateUncompressedSize(values, type) {
         let size = 0;
@@ -364,7 +847,10 @@ export class ParquetWriter {
         return size;
     }
     /**
-     *
+     * Generates the complete Parquet file bytes.
+     *
+     * @returns The complete Parquet file as Uint8Array
+     * @private
     */
     _generateParquetBytes() {
         // Build all row data - will be populated from row groups in full implementation
@@ -414,7 +900,12 @@ export class ParquetWriter {
         return result;
     }
     /**
-     * Simple compression simulation for non-gzip formats
+     * Simple compression simulation for non-gzip formats.
+     *
+     * @param data - Data to compress
+     * @param compression - Compression type
+     * @returns Compressed data
+     * @private
     */
     _simpleCompress(data, compression) {
         if (compression === ParquetCompression.UNCOMPRESSED) {
@@ -434,7 +925,34 @@ export class ParquetWriter {
 // Helper Functions
 // ============================================================================
 /**
- *
+ * Defines a Parquet schema.
+ *
+ * @description
+ * Creates a validated Parquet schema from field definitions. Validates that:
+ * - Schema has at least one field
+ * - All field names are non-empty
+ * - All field names are unique
+ *
+ * @param fields - Array of field definitions
+ * @param metadata - Optional schema-level metadata
+ * @returns Validated Parquet schema
+ *
+ * @throws {ParquetError} EMPTY_SCHEMA - If fields array is empty
+ * @throws {ParquetError} EMPTY_FIELD_NAME - If any field name is empty
+ * @throws {ParquetError} DUPLICATE_FIELD - If field names are not unique
+ *
+ * @example
+ * ```typescript
+ * const schema = defineSchema([
+ *   { name: 'id', type: ParquetFieldType.INT64, required: true },
+ *   { name: 'name', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'age', type: ParquetFieldType.INT32, required: false },
+ *   { name: 'created_at', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
+ * ], {
+ *   version: '1.0',
+ *   description: 'User records'
+ * })
+ * ```
  */
 export function defineSchema(fields, metadata) {
     // Validate schema
@@ -462,13 +980,50 @@ export function defineSchema(fields, metadata) {
     };
 }
 /**
- *
+ * Creates a Parquet writer.
+ *
+ * @description
+ * Factory function to create a ParquetWriter with the specified schema
+ * and options. Equivalent to `new ParquetWriter(schema, options)`.
+ *
+ * @param schema - The Parquet schema
+ * @param options - Writer options
+ * @returns A new ParquetWriter instance
+ *
+ * @example
+ * ```typescript
+ * const writer = createParquetWriter(schema, {
+ *   rowGroupSize: 10000,
+ *   compression: ParquetCompression.SNAPPY
+ * })
+ * ```
  */
 export function createParquetWriter(schema, options = {}) {
     return new ParquetWriter(schema, options);
 }
 /**
- *
+ * Writes data directly to a Parquet file buffer.
+ *
+ * @description
+ * Convenience function that creates a writer, writes all rows, and returns
+ * the complete Parquet file. Useful for simple one-shot writes.
+ *
+ * @param schema - The Parquet schema
+ * @param rows - Array of rows to write
+ * @param options - Writer options
+ * @returns Promise resolving to the complete Parquet file as Uint8Array
+ *
+ * @example
+ * ```typescript
+ * const buffer = await writeParquetFile(schema, [
+ *   { id: 1, name: 'Alice' },
+ *   { id: 2, name: 'Bob' }
+ * ], {
+ *   compression: ParquetCompression.GZIP
+ * })
+ *
+ * await fs.writeFile('data.parquet', buffer)
+ * ```
  */
 export async function writeParquetFile(schema, rows, options = {}) {
     const writer = createParquetWriter(schema, options);
@@ -476,7 +1031,21 @@ export async function writeParquetFile(schema, rows, options = {}) {
     return writer.toBuffer();
 }
 /**
- *
+ * Closes a writer and returns the final buffer.
+ *
+ * @description
+ * Generates the final Parquet file buffer and marks the writer as closed.
+ * The writer cannot be used for further writes after calling this function.
+ *
+ * @param writer - The ParquetWriter to close
+ * @returns Promise resolving to the complete Parquet file as Uint8Array
+ *
+ * @example
+ * ```typescript
+ * await writer.writeRows(data)
+ * const buffer = await closeWriter(writer)
+ * console.log(writer.isClosed) // true
+ * ```
  */
 export async function closeWriter(writer) {
     const bytes = await writer.toBuffer();
@@ -484,14 +1053,52 @@ export async function closeWriter(writer) {
     return bytes;
 }
 /**
- *
+ * Adds a row group to the writer.
+ *
+ * @description
+ * Writes multiple rows and then flushes them as a single row group.
+ * Useful when you want explicit control over row group boundaries.
+ *
+ * @param writer - The ParquetWriter to use
+ * @param rows - Array of rows for this row group
+ * @returns Promise that resolves when the row group is written
+ *
+ * @example
+ * ```typescript
+ * // Add explicit row groups
+ * await addRowGroup(writer, batch1) // First row group
+ * await addRowGroup(writer, batch2) // Second row group
+ * ```
  */
 export async function addRowGroup(writer, rows) {
     await writer.writeRows(rows);
     await writer.flushRowGroup();
 }
 /**
- *
+ * Gets metadata from a Parquet file buffer.
+ *
+ * @description
+ * Parses a Parquet file buffer and extracts the metadata including
+ * schema, row groups, compression settings, and custom metadata.
+ *
+ * @param bytes - The Parquet file buffer
+ * @returns The parsed metadata
+ *
+ * @throws {ParquetError} INVALID_MAGIC - If file doesn't have valid Parquet magic bytes
+ *
+ * @example
+ * ```typescript
+ * const buffer = await fs.readFile('data.parquet')
+ * const metadata = getMetadata(buffer)
+ *
+ * console.log(`Rows: ${metadata.numRows}`)
+ * console.log(`Schema: ${metadata.schema.fields.map(f => f.name).join(', ')}`)
+ * console.log(`Row groups: ${metadata.rowGroups.length}`)
+ *
+ * for (const rg of metadata.rowGroups) {
+ *   console.log(`  - ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
+ * }
+ * ```
 */
 export function getMetadata(bytes) {
     // Verify magic bytes
@@ -546,7 +1153,28 @@ export function getMetadata(bytes) {
     };
 }
 /**
- *
+ * Sets the compression type for a writer.
+ *
+ * @description
+ * Updates the default compression algorithm for a writer. Affects all
+ * subsequently written data. Columns with explicit compression settings
+ * in columnCompression are not affected.
+ *
+ * @param writer - The ParquetWriter to update
+ * @param compression - The new compression type
+ *
+ * @example
+ * ```typescript
+ * const writer = createParquetWriter(schema)
+ *
+ * // Write some rows with SNAPPY (default)
+ * await writer.writeRows(batch1)
+ * await writer.flushRowGroup()
+ *
+ * // Switch to GZIP for remaining data
+ * setCompression(writer, ParquetCompression.GZIP)
+ * await writer.writeRows(batch2)
+ * ```
 */
 export function setCompression(writer, compression) {
     ;