gitx.do 0.0.1 → 0.0.3
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/blame.d.ts +259 -0
- package/dist/cli/commands/blame.d.ts.map +1 -0
- package/dist/cli/commands/blame.js +609 -0
- package/dist/cli/commands/blame.js.map +1 -0
- package/dist/cli/commands/branch.d.ts +249 -0
- package/dist/cli/commands/branch.d.ts.map +1 -0
- package/dist/cli/commands/branch.js +693 -0
- package/dist/cli/commands/branch.js.map +1 -0
- package/dist/cli/commands/commit.d.ts +182 -0
- package/dist/cli/commands/commit.d.ts.map +1 -0
- package/dist/cli/commands/commit.js +437 -0
- package/dist/cli/commands/commit.js.map +1 -0
- package/dist/cli/commands/diff.d.ts +464 -0
- package/dist/cli/commands/diff.d.ts.map +1 -0
- package/dist/cli/commands/diff.js +958 -0
- package/dist/cli/commands/diff.js.map +1 -0
- package/dist/cli/commands/log.d.ts +239 -0
- package/dist/cli/commands/log.d.ts.map +1 -0
- package/dist/cli/commands/log.js +535 -0
- package/dist/cli/commands/log.js.map +1 -0
- package/dist/cli/commands/review.d.ts +457 -0
- package/dist/cli/commands/review.d.ts.map +1 -0
- package/dist/cli/commands/review.js +533 -0
- package/dist/cli/commands/review.js.map +1 -0
- package/dist/cli/commands/status.d.ts +269 -0
- package/dist/cli/commands/status.d.ts.map +1 -0
- package/dist/cli/commands/status.js +493 -0
- package/dist/cli/commands/status.js.map +1 -0
- package/dist/cli/commands/web.d.ts +199 -0
- package/dist/cli/commands/web.d.ts.map +1 -0
- package/dist/cli/commands/web.js +696 -0
- package/dist/cli/commands/web.js.map +1 -0
- package/dist/cli/fs-adapter.d.ts +656 -0
- package/dist/cli/fs-adapter.d.ts.map +1 -0
- package/dist/cli/fs-adapter.js +1179 -0
- package/dist/cli/fs-adapter.js.map +1 -0
- package/dist/cli/index.d.ts +387 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +523 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/ui/components/DiffView.d.ts +7 -0
- package/dist/cli/ui/components/DiffView.d.ts.map +1 -0
- package/dist/cli/ui/components/DiffView.js +11 -0
- package/dist/cli/ui/components/DiffView.js.map +1 -0
- package/dist/cli/ui/components/ErrorDisplay.d.ts +6 -0
- package/dist/cli/ui/components/ErrorDisplay.d.ts.map +1 -0
- package/dist/cli/ui/components/ErrorDisplay.js +11 -0
- package/dist/cli/ui/components/ErrorDisplay.js.map +1 -0
- package/dist/cli/ui/components/FuzzySearch.d.ts +9 -0
- package/dist/cli/ui/components/FuzzySearch.d.ts.map +1 -0
- package/dist/cli/ui/components/FuzzySearch.js +12 -0
- package/dist/cli/ui/components/FuzzySearch.js.map +1 -0
- package/dist/cli/ui/components/LoadingSpinner.d.ts +6 -0
- package/dist/cli/ui/components/LoadingSpinner.d.ts.map +1 -0
- package/dist/cli/ui/components/LoadingSpinner.js +10 -0
- package/dist/cli/ui/components/LoadingSpinner.js.map +1 -0
- package/dist/cli/ui/components/NavigationList.d.ts +9 -0
- package/dist/cli/ui/components/NavigationList.d.ts.map +1 -0
- package/dist/cli/ui/components/NavigationList.js +11 -0
- package/dist/cli/ui/components/NavigationList.js.map +1 -0
- package/dist/cli/ui/components/ScrollableContent.d.ts +8 -0
- package/dist/cli/ui/components/ScrollableContent.d.ts.map +1 -0
- package/dist/cli/ui/components/ScrollableContent.js +11 -0
- package/dist/cli/ui/components/ScrollableContent.js.map +1 -0
- package/dist/cli/ui/components/index.d.ts +7 -0
- package/dist/cli/ui/components/index.d.ts.map +1 -0
- package/dist/cli/ui/components/index.js +9 -0
- package/dist/cli/ui/components/index.js.map +1 -0
- package/dist/cli/ui/terminal-ui.d.ts +52 -0
- package/dist/cli/ui/terminal-ui.d.ts.map +1 -0
- package/dist/cli/ui/terminal-ui.js +121 -0
- package/dist/cli/ui/terminal-ui.js.map +1 -0
- package/dist/durable-object/object-store.d.ts +401 -23
- package/dist/durable-object/object-store.d.ts.map +1 -1
- package/dist/durable-object/object-store.js +414 -25
- package/dist/durable-object/object-store.js.map +1 -1
- package/dist/durable-object/schema.d.ts +188 -0
- package/dist/durable-object/schema.d.ts.map +1 -1
- package/dist/durable-object/schema.js +160 -0
- package/dist/durable-object/schema.js.map +1 -1
- package/dist/durable-object/wal.d.ts +336 -31
- package/dist/durable-object/wal.d.ts.map +1 -1
- package/dist/durable-object/wal.js +272 -27
- package/dist/durable-object/wal.js.map +1 -1
- package/dist/index.d.ts +379 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +379 -7
- package/dist/index.js.map +1 -1
- package/dist/mcp/adapter.d.ts +579 -38
- package/dist/mcp/adapter.d.ts.map +1 -1
- package/dist/mcp/adapter.js +426 -33
- package/dist/mcp/adapter.js.map +1 -1
- package/dist/mcp/sandbox.d.ts +532 -29
- package/dist/mcp/sandbox.d.ts.map +1 -1
- package/dist/mcp/sandbox.js +389 -22
- package/dist/mcp/sandbox.js.map +1 -1
- package/dist/mcp/sdk-adapter.d.ts +478 -56
- package/dist/mcp/sdk-adapter.d.ts.map +1 -1
- package/dist/mcp/sdk-adapter.js +346 -44
- package/dist/mcp/sdk-adapter.js.map +1 -1
- package/dist/mcp/tools.d.ts +445 -30
- package/dist/mcp/tools.d.ts.map +1 -1
- package/dist/mcp/tools.js +363 -33
- package/dist/mcp/tools.js.map +1 -1
- package/dist/ops/blame.d.ts +424 -21
- package/dist/ops/blame.d.ts.map +1 -1
- package/dist/ops/blame.js +303 -20
- package/dist/ops/blame.js.map +1 -1
- package/dist/ops/branch.d.ts +583 -32
- package/dist/ops/branch.d.ts.map +1 -1
- package/dist/ops/branch.js +365 -23
- package/dist/ops/branch.js.map +1 -1
- package/dist/ops/commit-traversal.d.ts +164 -24
- package/dist/ops/commit-traversal.d.ts.map +1 -1
- package/dist/ops/commit-traversal.js +68 -2
- package/dist/ops/commit-traversal.js.map +1 -1
- package/dist/ops/commit.d.ts +387 -53
- package/dist/ops/commit.d.ts.map +1 -1
- package/dist/ops/commit.js +249 -29
- package/dist/ops/commit.js.map +1 -1
- package/dist/ops/merge-base.d.ts +195 -21
- package/dist/ops/merge-base.d.ts.map +1 -1
- package/dist/ops/merge-base.js +122 -12
- package/dist/ops/merge-base.js.map +1 -1
- package/dist/ops/merge.d.ts +600 -130
- package/dist/ops/merge.d.ts.map +1 -1
- package/dist/ops/merge.js +408 -60
- package/dist/ops/merge.js.map +1 -1
- package/dist/ops/tag.d.ts +67 -2
- package/dist/ops/tag.d.ts.map +1 -1
- package/dist/ops/tag.js +42 -1
- package/dist/ops/tag.js.map +1 -1
- package/dist/ops/tree-builder.d.ts +102 -6
- package/dist/ops/tree-builder.d.ts.map +1 -1
- package/dist/ops/tree-builder.js +30 -5
- package/dist/ops/tree-builder.js.map +1 -1
- package/dist/ops/tree-diff.d.ts +50 -2
- package/dist/ops/tree-diff.d.ts.map +1 -1
- package/dist/ops/tree-diff.js +50 -2
- package/dist/ops/tree-diff.js.map +1 -1
- package/dist/pack/delta.d.ts +211 -39
- package/dist/pack/delta.d.ts.map +1 -1
- package/dist/pack/delta.js +232 -46
- package/dist/pack/delta.js.map +1 -1
- package/dist/pack/format.d.ts +390 -28
- package/dist/pack/format.d.ts.map +1 -1
- package/dist/pack/format.js +344 -33
- package/dist/pack/format.js.map +1 -1
- package/dist/pack/full-generation.d.ts +313 -28
- package/dist/pack/full-generation.d.ts.map +1 -1
- package/dist/pack/full-generation.js +238 -19
- package/dist/pack/full-generation.js.map +1 -1
- package/dist/pack/generation.d.ts +346 -23
- package/dist/pack/generation.d.ts.map +1 -1
- package/dist/pack/generation.js +269 -21
- package/dist/pack/generation.js.map +1 -1
- package/dist/pack/index.d.ts +407 -86
- package/dist/pack/index.d.ts.map +1 -1
- package/dist/pack/index.js +351 -70
- package/dist/pack/index.js.map +1 -1
- package/dist/refs/branch.d.ts +517 -71
- package/dist/refs/branch.d.ts.map +1 -1
- package/dist/refs/branch.js +410 -26
- package/dist/refs/branch.js.map +1 -1
- package/dist/refs/storage.d.ts +610 -57
- package/dist/refs/storage.d.ts.map +1 -1
- package/dist/refs/storage.js +481 -29
- package/dist/refs/storage.js.map +1 -1
- package/dist/refs/tag.d.ts +677 -67
- package/dist/refs/tag.d.ts.map +1 -1
- package/dist/refs/tag.js +497 -30
- package/dist/refs/tag.js.map +1 -1
- package/dist/storage/lru-cache.d.ts +556 -53
- package/dist/storage/lru-cache.d.ts.map +1 -1
- package/dist/storage/lru-cache.js +439 -36
- package/dist/storage/lru-cache.js.map +1 -1
- package/dist/storage/object-index.d.ts +483 -38
- package/dist/storage/object-index.d.ts.map +1 -1
- package/dist/storage/object-index.js +388 -22
- package/dist/storage/object-index.js.map +1 -1
- package/dist/storage/r2-pack.d.ts +957 -94
- package/dist/storage/r2-pack.d.ts.map +1 -1
- package/dist/storage/r2-pack.js +756 -48
- package/dist/storage/r2-pack.js.map +1 -1
- package/dist/tiered/cdc-pipeline.d.ts +1610 -38
- package/dist/tiered/cdc-pipeline.d.ts.map +1 -1
- package/dist/tiered/cdc-pipeline.js +1131 -22
- package/dist/tiered/cdc-pipeline.js.map +1 -1
- package/dist/tiered/migration.d.ts +903 -41
- package/dist/tiered/migration.d.ts.map +1 -1
- package/dist/tiered/migration.js +646 -24
- package/dist/tiered/migration.js.map +1 -1
- package/dist/tiered/parquet-writer.d.ts +944 -47
- package/dist/tiered/parquet-writer.d.ts.map +1 -1
- package/dist/tiered/parquet-writer.js +667 -39
- package/dist/tiered/parquet-writer.js.map +1 -1
- package/dist/tiered/read-path.d.ts +728 -34
- package/dist/tiered/read-path.d.ts.map +1 -1
- package/dist/tiered/read-path.js +310 -27
- package/dist/tiered/read-path.js.map +1 -1
- package/dist/types/objects.d.ts +457 -0
- package/dist/types/objects.d.ts.map +1 -1
- package/dist/types/objects.js +305 -4
- package/dist/types/objects.js.map +1 -1
- package/dist/types/storage.d.ts +407 -35
- package/dist/types/storage.d.ts.map +1 -1
- package/dist/types/storage.js +27 -3
- package/dist/types/storage.js.map +1 -1
- package/dist/utils/hash.d.ts +133 -12
- package/dist/utils/hash.d.ts.map +1 -1
- package/dist/utils/hash.js +133 -12
- package/dist/utils/hash.js.map +1 -1
- package/dist/utils/sha1.d.ts +102 -9
- package/dist/utils/sha1.d.ts.map +1 -1
- package/dist/utils/sha1.js +114 -11
- package/dist/utils/sha1.js.map +1 -1
- package/dist/wire/capabilities.d.ts +896 -88
- package/dist/wire/capabilities.d.ts.map +1 -1
- package/dist/wire/capabilities.js +566 -62
- package/dist/wire/capabilities.js.map +1 -1
- package/dist/wire/pkt-line.d.ts +293 -15
- package/dist/wire/pkt-line.d.ts.map +1 -1
- package/dist/wire/pkt-line.js +251 -15
- package/dist/wire/pkt-line.js.map +1 -1
- package/dist/wire/receive-pack.d.ts +814 -64
- package/dist/wire/receive-pack.d.ts.map +1 -1
- package/dist/wire/receive-pack.js +542 -41
- package/dist/wire/receive-pack.js.map +1 -1
- package/dist/wire/smart-http.d.ts +575 -97
- package/dist/wire/smart-http.d.ts.map +1 -1
- package/dist/wire/smart-http.js +337 -46
- package/dist/wire/smart-http.js.map +1 -1
- package/dist/wire/upload-pack.d.ts +492 -98
- package/dist/wire/upload-pack.d.ts.map +1 -1
- package/dist/wire/upload-pack.js +347 -59
- package/dist/wire/upload-pack.js.map +1 -1
- package/package.json +10 -2
package/dist/tiered/parquet-writer.d.ts

@@ -1,248 +1,1145 @@
 /**
- * Parquet Writer for Git Analytics
+ * @fileoverview Parquet Writer for Git Analytics
  *
- *
- *
- *
- *
- * - Metadata handling with statistics
+ * @description
+ * Provides functionality to write git analytics data to Parquet format, a
+ * columnar storage format optimized for analytical queries. This module
+ * enables efficient storage and querying of Git repository data.
  *
- *
+ * **Key Features:**
+ * - Schema definition with various field types (STRING, INT32, INT64, etc.)
+ * - Multiple compression algorithms (SNAPPY, GZIP, ZSTD, LZ4, UNCOMPRESSED)
+ * - Row group management for efficient columnar storage
+ * - Automatic and manual row group flushing
+ * - Column-level statistics generation (min, max, null count)
+ * - Custom key-value metadata support
+ * - Memory-efficient streaming writes
+ *
+ * **Parquet Format:**
+ * The generated files follow the Parquet format with:
+ * - Magic bytes "PAR1" at start and end
+ * - Row group data organized by columns
+ * - Footer metadata containing schema and statistics
+ *
+ * @example
+ * ```typescript
+ * // Define schema for commit analytics
+ * const schema = defineSchema([
+ *   { name: 'commit_sha', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'author', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true },
+ *   { name: 'file_count', type: ParquetFieldType.INT32, required: false }
+ * ])
+ *
+ * // Create writer with options
+ * const writer = createParquetWriter(schema, {
+ *   rowGroupSize: 10000,
+ *   compression: ParquetCompression.SNAPPY,
+ *   enableStatistics: true
+ * })
+ *
+ * // Write data
+ * await writer.writeRows([
+ *   { commit_sha: 'abc123...', author: 'alice', timestamp: Date.now(), file_count: 5 },
+ *   { commit_sha: 'def456...', author: 'bob', timestamp: Date.now(), file_count: 3 }
+ * ])
+ *
+ * // Generate the Parquet file
+ * const buffer = await writer.toBuffer()
+ * ```
+ *
+ * @module tiered/parquet-writer
+ * @see {@link ParquetWriter} - Main writer class
+ * @see {@link defineSchema} - Schema definition helper
  */
 /**
- * Supported Parquet field types
+ * Supported Parquet field types.
+ *
+ * @description
+ * Defines the data types that can be used for fields in a Parquet schema.
+ * Each type maps to an appropriate physical and logical Parquet type.
+ *
+ * @example
+ * ```typescript
+ * const field: ParquetField = {
+ *   name: 'count',
+ *   type: ParquetFieldType.INT64,
+ *   required: true
+ * }
+ * ```
+ *
+ * @enum {string}
  */
 export declare enum ParquetFieldType {
+    /**
+     * UTF-8 encoded string.
+     * Maps to Parquet BYTE_ARRAY with UTF8 logical type.
+     */
     STRING = "STRING",
+    /**
+     * 32-bit signed integer.
+     * Maps to Parquet INT32 physical type.
+     */
     INT32 = "INT32",
+    /**
+     * 64-bit signed integer.
+     * Maps to Parquet INT64 physical type.
+     */
     INT64 = "INT64",
+    /**
+     * Boolean value (true/false).
+     * Maps to Parquet BOOLEAN physical type.
+     */
     BOOLEAN = "BOOLEAN",
+    /**
+     * 32-bit IEEE 754 floating point.
+     * Maps to Parquet FLOAT physical type.
+     */
     FLOAT = "FLOAT",
+    /**
+     * 64-bit IEEE 754 floating point.
+     * Maps to Parquet DOUBLE physical type.
+     */
     DOUBLE = "DOUBLE",
+    /**
+     * Raw binary data.
+     * Maps to Parquet BYTE_ARRAY physical type.
+     */
     BINARY = "BINARY",
+    /**
+     * Timestamp with millisecond precision.
+     * Maps to Parquet INT64 with TIMESTAMP_MILLIS logical type.
+     */
     TIMESTAMP_MILLIS = "TIMESTAMP_MILLIS",
+    /**
+     * Timestamp with microsecond precision.
+     * Maps to Parquet INT64 with TIMESTAMP_MICROS logical type.
+     */
     TIMESTAMP_MICROS = "TIMESTAMP_MICROS"
 }
 /**
- * Supported compression types
+ * Supported compression types for Parquet data.
+ *
+ * @description
+ * Different compression algorithms offer trade-offs between compression
+ * ratio, compression speed, and decompression speed.
+ *
+ * **Comparison:**
+ * - SNAPPY: Fast compression/decompression, moderate ratio (default)
+ * - GZIP: Higher ratio, slower compression, fast decompression
+ * - ZSTD: Best ratio, good speed, requires more memory
+ * - LZ4: Fastest, lower ratio
+ * - UNCOMPRESSED: No compression overhead
+ *
+ * @example
+ * ```typescript
+ * const writer = createParquetWriter(schema, {
+ *   compression: ParquetCompression.ZSTD
+ * })
+ * ```
+ *
+ * @enum {string}
  */
 export declare enum ParquetCompression {
+    /**
+     * No compression applied.
+     * Fastest writes, largest file size.
+     */
     UNCOMPRESSED = "UNCOMPRESSED",
+    /**
+     * Snappy compression (default).
+     * Good balance of speed and compression ratio.
+     */
     SNAPPY = "SNAPPY",
+    /**
+     * GZIP compression.
+     * Higher compression ratio, slower compression.
+     */
     GZIP = "GZIP",
+    /**
+     * Zstandard compression.
+     * Best compression ratio with good speed.
+     */
     ZSTD = "ZSTD",
+    /**
+     * LZ4 compression.
+     * Fastest compression, lower ratio.
+     */
     LZ4 = "LZ4"
 }
 /**
- * Field definition for schema
+ * Field definition for a Parquet schema.
+ *
+ * @description
+ * Defines a single column in the Parquet schema, including its name,
+ * data type, nullability, and optional metadata.
+ *
+ * @example
+ * ```typescript
+ * const nameField: ParquetField = {
+ *   name: 'user_name',
+ *   type: ParquetFieldType.STRING,
+ *   required: true,
+ *   metadata: { description: 'The user display name' }
+ * }
+ *
+ * const ageField: ParquetField = {
+ *   name: 'age',
+ *   type: ParquetFieldType.INT32,
+ *   required: false // nullable
+ * }
+ * ```
+ *
+ * @interface ParquetField
  */
 export interface ParquetField {
+    /**
+     * Column name.
+     * Must be unique within the schema and non-empty.
+     */
     name: string;
+    /**
+     * Data type of the column.
+     *
+     * @see {@link ParquetFieldType}
+     */
     type: ParquetFieldType;
+    /**
+     * Whether the field is required (non-nullable).
+     * If true, null values will cause validation errors.
+     */
     required: boolean;
+    /**
+     * Optional key-value metadata for the field.
+     * Can be used for descriptions, units, etc.
+     */
     metadata?: Record<string, string>;
 }
 /**
- * Parquet schema definition
+ * Parquet schema definition.
+ *
+ * @description
+ * Defines the complete schema for a Parquet file, including all fields
+ * and optional schema-level metadata.
+ *
+ * @example
+ * ```typescript
+ * const schema: ParquetSchema = {
+ *   fields: [
+ *     { name: 'id', type: ParquetFieldType.INT64, required: true },
+ *     { name: 'name', type: ParquetFieldType.STRING, required: true }
+ *   ],
+ *   metadata: {
+ *     created_by: 'gitdo',
+ *     version: '1.0'
+ *   }
+ * }
+ * ```
+ *
+ * @interface ParquetSchema
  */
 export interface ParquetSchema {
+    /**
+     * Array of field definitions for all columns.
+     * Order determines column order in the file.
+     */
     fields: ParquetField[];
+    /**
+     * Optional schema-level metadata.
+     * Stored in the Parquet file footer.
+     */
     metadata?: Record<string, string>;
 }
 /**
- * Options for creating a Parquet writer
+ * Options for creating a Parquet writer.
+ *
+ * @description
+ * Configuration options that control how the Parquet file is written,
+ * including row group sizing, compression, and statistics generation.
+ *
+ * @example
+ * ```typescript
+ * const options: ParquetWriteOptions = {
+ *   rowGroupSize: 50000, // 50K rows per group
+ *   rowGroupMemoryLimit: 64 * 1024 * 1024, // 64MB memory limit
+ *   compression: ParquetCompression.ZSTD,
+ *   columnCompression: {
+ *     'binary_data': ParquetCompression.LZ4 // Fast for binary
+ *   },
+ *   enableStatistics: true,
+ *   sortBy: ['timestamp'],
+ *   partitionColumns: ['date']
+ * }
+ * ```
+ *
+ * @interface ParquetWriteOptions
  */
 export interface ParquetWriteOptions {
+    /**
+     * Maximum number of rows per row group.
+     * Smaller groups = more granular reads, larger groups = better compression.
+     *
+     * @default 65536
+     */
     rowGroupSize?: number;
+    /**
+     * Maximum memory size in bytes for a row group.
+     * Triggers flush when reached, regardless of row count.
+     */
     rowGroupMemoryLimit?: number;
+    /**
+     * Default compression algorithm for all columns.
+     *
+     * @default ParquetCompression.SNAPPY
+     */
     compression?: ParquetCompression;
+    /**
+     * Per-column compression overrides.
+     * Keys are column names, values are compression types.
+     */
     columnCompression?: Record<string, ParquetCompression>;
+    /**
+     * Whether to compute and store column statistics.
+     * Enables predicate pushdown during queries.
+     *
+     * @default false
+     */
     enableStatistics?: boolean;
+    /**
+     * Columns to sort data by within each row group.
+     * Improves query performance for sorted access patterns.
+     */
     sortBy?: string[];
+    /**
+     * Columns used for partitioning.
+     * Informational metadata for partitioned datasets.
+     */
     partitionColumns?: string[];
 }
 /**
- *
+ * Statistics for a single column in a row group.
+ *
+ * @description
+ * Column statistics enable query engines to skip row groups that don't
+ * contain relevant data (predicate pushdown).
+ *
+ * @example
+ * ```typescript
+ * const stats: ColumnStatistics = {
+ *   min: 100,
+ *   max: 999,
+ *   nullCount: 5,
+ *   distinctCount: 850
+ * }
+ * ```
+ *
+ * @interface ColumnStatistics
  */
 export interface ColumnStatistics {
+    /**
+     * Minimum value in the column.
+     * Type depends on column type.
+     */
     min?: number | string | boolean;
+    /**
+     * Maximum value in the column.
+     * Type depends on column type.
+     */
     max?: number | string | boolean;
+    /**
+     * Number of null values in the column.
+     */
     nullCount?: number;
+    /**
+     * Approximate distinct value count.
+     * May not be exact for large datasets.
+     */
     distinctCount?: number;
 }
 /**
- *
+ * Metadata for a column chunk within a row group.
+ *
+ * @description
+ * Contains information about a single column's data within a row group,
+ * including compression, sizes, and statistics.
+ *
+ * @interface ColumnChunkMetadata
  */
 export interface ColumnChunkMetadata {
+    /**
+     * Column name.
+     */
     column: string;
+    /**
+     * Data type of the column.
+     */
     type: ParquetFieldType;
+    /**
+     * Compression used for this column chunk.
+     */
     compression: ParquetCompression;
+    /**
+     * Size in bytes after compression.
+     */
     encodedSize: number;
+    /**
+     * Size in bytes before compression.
+     */
     uncompressedSize: number;
+    /**
+     * Column statistics if statistics are enabled.
+     */
     statistics?: ColumnStatistics;
 }
 /**
- * Row group representation
+ * Row group representation in the Parquet file.
+ *
+ * @description
+ * A row group is a horizontal partition of the data containing all columns
+ * for a subset of rows. Row groups enable parallel processing and predicate
+ * pushdown optimizations.
+ *
+ * @interface RowGroup
  */
 export interface RowGroup {
+    /**
+     * Number of rows in this row group.
+     */
     numRows: number;
+    /**
+     * Total compressed size in bytes.
+     */
     totalByteSize: number;
+    /**
+     * Metadata for each column chunk.
+     */
     columns: ColumnChunkMetadata[];
 }
 /**
- * Parquet file
+ * Complete metadata for a Parquet file.
+ *
+ * @description
+ * Contains all metadata stored in the Parquet file footer, including
+ * schema, row groups, and statistics. Used when reading files.
+ *
+ * @example
+ * ```typescript
+ * const metadata = getMetadata(parquetBuffer)
+ * console.log(`Rows: ${metadata.numRows}`)
+ * console.log(`Row groups: ${metadata.rowGroups.length}`)
+ * console.log(`Compression: ${metadata.compression}`)
+ * ```
+ *
+ * @interface ParquetMetadata
  */
 export interface ParquetMetadata {
+    /**
+     * The file's schema definition.
+     */
     schema: ParquetSchema;
+    /**
+     * Total number of rows in the file.
+     */
     numRows: number;
+    /**
+     * Array of row group metadata.
+     */
     rowGroups: RowGroup[];
+    /**
+     * Default compression algorithm used.
+     */
     compression: ParquetCompression;
+    /**
+     * Per-column compression settings.
+     */
     columnMetadata?: Record<string, {
         compression: ParquetCompression;
     }>;
+    /**
+     * Custom key-value metadata.
+     */
     keyValueMetadata?: Record<string, string>;
+    /**
+     * Unix timestamp when the file was created.
+     */
     createdAt: number;
+    /**
+     * Total file size in bytes.
+     */
     fileSize: number;
+    /**
+     * Columns the data is sorted by.
+     */
     sortedBy?: string[];
+    /**
+     * Columns used for partitioning.
+     */
     partitionColumns?: string[];
 }
 /**
- * Mock output stream interface
+ * Mock output stream interface for writing Parquet data.
+ *
+ * @description
+ * Simple interface for streaming Parquet output to a destination.
+ * Can be implemented for files, network streams, etc.
+ *
+ * @example
+ * ```typescript
+ * class BufferOutputStream implements OutputStream {
+ *   private chunks: Uint8Array[] = []
+ *
+ *   write(data: Uint8Array): void {
+ *     this.chunks.push(data)
+ *   }
+ *
+ *   getBuffer(): Uint8Array {
+ *     const total = this.chunks.reduce((sum, c) => sum + c.length, 0)
+ *     const result = new Uint8Array(total)
+ *     let offset = 0
+ *     for (const chunk of this.chunks) {
+ *       result.set(chunk, offset)
+ *       offset += chunk.length
+ *     }
+ *     return result
+ *   }
+ * }
+ * ```
+ *
+ * @interface OutputStream
  */
 export interface OutputStream {
+    /**
+     * Writes data to the output stream.
+     *
+     * @param data - The data to write
+     */
     write(data: Uint8Array): void;
 }
 /**
- * Error class for Parquet operations
+ * Error class for Parquet-related operations.
+ *
+ * @description
+ * Thrown when Parquet operations fail, such as schema validation errors,
+ * invalid data types, or malformed files.
+ *
+ * @example
+ * ```typescript
+ * try {
+ *   await writer.writeRow({ invalid_field: 'value' })
+ * } catch (error) {
+ *   if (error instanceof ParquetError) {
+ *     console.log(`Parquet error (${error.code}): ${error.message}`)
+ *   }
+ * }
+ * ```
+ *
+ * @class ParquetError
+ * @extends Error
  */
 export declare class ParquetError extends Error {
     readonly code: string;
+    /**
+     * Creates a new ParquetError.
+     *
+     * @param message - Human-readable error message
+     * @param code - Error code for programmatic handling
+     *
+     * @example
+     * ```typescript
+     * throw new ParquetError('Field name cannot be empty', 'EMPTY_FIELD_NAME')
+     * ```
+     */
     constructor(message: string, code: string);
 }
 /**
- * Parquet writer for git analytics data
+ * Parquet writer for git analytics data.
+ *
+ * @description
+ * ParquetWriter provides a streaming interface for writing data to Parquet
+ * format. It handles schema validation, row group management, compression,
+ * and statistics generation.
+ *
+ * **Usage Pattern:**
+ * 1. Create a schema using `defineSchema()`
+ * 2. Create a writer with `createParquetWriter()` or `new ParquetWriter()`
+ * 3. Write rows using `writeRow()` or `writeRows()`
+ * 4. Generate the file with `toBuffer()` or `writeTo()`
+ *
+ * **Row Group Management:**
+ * Rows are buffered in memory until the row group is full (by row count
+ * or memory limit), then flushed. You can also manually flush with
+ * `flushRowGroup()`.
+ *
+ * **Thread Safety:**
+ * Not thread-safe. Use separate writer instances for concurrent writes.
+ *
+ * @example
+ * ```typescript
+ * // Create schema
+ * const schema = defineSchema([
+ *   { name: 'sha', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'type', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'size', type: ParquetFieldType.INT64, required: true },
+ *   { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
+ * ])
+ *
+ * // Create writer
+ * const writer = new ParquetWriter(schema, {
+ *   rowGroupSize: 10000,
+ *   compression: ParquetCompression.SNAPPY,
+ *   enableStatistics: true
+ * })
+ *
+ * // Write data
+ * for (const object of gitObjects) {
+ *   await writer.writeRow({
+ *     sha: object.sha,
+ *     type: object.type,
+ *     size: object.size,
+ *     timestamp: Date.now()
+ *   })
+ * }
+ *
+ * // Set custom metadata
+ * writer.setMetadata('git_version', '2.40.0')
+ * writer.setMetadata('repository', 'github.com/org/repo')
+ *
+ * // Generate file
+ * const buffer = await writer.toBuffer()
+ * console.log(`Generated ${buffer.length} bytes`)
+ * console.log(`Rows: ${writer.rowCount}`)
+ * console.log(`Row groups: ${writer.rowGroupCount}`)
+ *
+ * // Reset for reuse
+ * writer.reset()
+ * ```
+ *
+ * @class ParquetWriter
  */
 export declare class ParquetWriter {
+    /**
+     * The Parquet schema for this writer.
+     * @readonly
+     */
     readonly schema: ParquetSchema;
+    /**
+     * Resolved options with defaults applied.
+     * @readonly
+     */
     readonly options: Required<Pick<ParquetWriteOptions, 'rowGroupSize' | 'compression'>> & ParquetWriteOptions;
+    /**
+     * Total row count written.
+     * @private
+     */
     private _rowCount;
+    /**
+     * Completed row groups.
+     * @private
+     */
    private _rowGroups;
+    /**
+     * Current row group being built.
+     * @private
+     */
     private _currentRowGroup;
+    /**
+     * Whether the writer has been closed.
+     * @private
+     */
     private _isClosed;
+    /**
+     * Custom key-value metadata.
+     * @private
+     */
     private _keyValueMetadata;
+    /**
+     * Creation timestamp.
+     * @private
+     */
     private _createdAt;
+    /**
+     * Creates a new ParquetWriter instance.
+     *
+     * @param schema - The Parquet schema defining columns
+     * @param options - Writer configuration options
+     *
+     * @example
+     * ```typescript
+     * const writer = new ParquetWriter(schema, {
+     *   rowGroupSize: 50000,
+     *   compression: ParquetCompression.GZIP
+     * })
+     * ```
+     */
     constructor(schema: ParquetSchema, options?: ParquetWriteOptions);
     /**
-     *
+     * Gets the total row count written to the writer.
+     *
+     * @description
+     * Returns the total number of rows written, including rows in the
+     * current unflushed row group.
+     *
+     * @returns Total row count
+     *
+     * @example
+     * ```typescript
+     * await writer.writeRows(data)
+     * console.log(`Wrote ${writer.rowCount} rows`)
+     * ```
     */
     get rowCount(): number;
     /**
-     *
+     * Gets the number of row groups.
+     *
+     * @description
+     * Returns the number of completed row groups plus one if there's
+     * a pending row group with data.
+     *
+     * @returns Number of row groups
+     *
+     * @example
+     * ```typescript
+     * console.log(`Row groups: ${writer.rowGroupCount}`)
+     * ```
     */
     get rowGroupCount(): number;
     /**
-     *
+     * Checks if the writer has been closed.
+     *
+     * @description
+     * A closed writer cannot accept new rows. Writers are closed
+     * implicitly by `closeWriter()`.
+     *
+     * @returns true if closed
+     *
+     * @example
+     * ```typescript
+     * if (!writer.isClosed) {
+     *   await writer.writeRow(row)
+     * }
+     * ```
     */
     get isClosed(): boolean;
     /**
-     *
+     * Writes a single row to the Parquet file.
+     *
+     * @description
+     * Validates the row against the schema and adds it to the current
+     * row group. Automatically flushes the row group when it reaches
+     * the configured size or memory limit.
+     *
+     * @param row - Object with column values keyed by column name
+     * @returns Promise that resolves when the row is written
+     *
+     * @throws {ParquetError} WRITER_CLOSED - If writer is closed
+     * @throws {ParquetError} MISSING_REQUIRED_FIELD - If required field is missing
+     * @throws {ParquetError} INVALID_FIELD_TYPE - If field value type doesn't match schema
+     *
+     * @example
+     * ```typescript
+     * await writer.writeRow({
+     *   id: 123,
+     *   name: 'Alice',
+     *   active: true
+     * })
+     * ```
     */
     writeRow(row: Record<string, unknown>): Promise<void>;
     /**
-     *
+     * Writes multiple rows to the Parquet file.
+     *
+     * @description
+     * Convenience method that writes an array of rows sequentially.
+     * Each row is validated and may trigger row group flushes.
+     *
+     * @param rows - Array of row objects to write
+     * @returns Promise that resolves when all rows are written
+     *
+     * @throws {ParquetError} Any error from writeRow()
+     *
+     * @example
+     * ```typescript
+     * await writer.writeRows([
+     *   { id: 1, name: 'Alice' },
+     *   { id: 2, name: 'Bob' },
+     *   { id: 3, name: 'Carol' }
+     * ])
+     * ```
     */
     writeRows(rows: Record<string, unknown>[]): Promise<void>;
     /**
-     * Manually
+     * Manually flushes the current row group.
+     *
+     * @description
+     * Forces the current row group to be finalized and stored, even if
+     * it hasn't reached the size limit. Has no effect if the current
+     * row group is empty.
+     *
+     * @returns Promise that resolves when flush is complete
+     *
+     * @example
+     * ```typescript
+     * // Write some rows
+     * await writer.writeRows(batch1)
+     *
+     * // Force flush before writing next batch
+     * await writer.flushRowGroup()
+     *
+     * // Continue writing
+     * await writer.writeRows(batch2)
+     * ```
     */
     flushRowGroup(): Promise<void>;
     /**
-     *
+     * Gets the current row group's memory size.
+     *
+     * @description
+     * Returns the estimated memory consumption of the unflushed row group.
+     * Useful for monitoring memory usage during streaming writes.
+     *
+     * @returns Memory size in bytes
+     *
+     * @example
+     * ```typescript
+     * if (writer.currentRowGroupMemorySize() > 50 * 1024 * 1024) {
+     *   console.log('Row group using significant memory')
+     *   await writer.flushRowGroup()
+     * }
+     * ```
     */
     currentRowGroupMemorySize(): number;
     /**
-     *
+     * Gets the completed row groups.
+     *
+     * @description
+     * Returns a copy of the completed row group metadata array.
+     * Does not include the current unflushed row group.
+     *
+     * @returns Array of row group metadata
+     *
+     * @example
+     * ```typescript
+     * for (const rg of writer.getRowGroups()) {
+     *   console.log(`Row group: ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
+     * }
+     * ```
     */
     getRowGroups(): RowGroup[];
     /**
-     *
+     * Sets a custom key-value metadata entry.
+     *
+     * @description
+     * Adds custom metadata that will be stored in the Parquet file footer.
+     * Can be used for versioning, provenance, or application-specific data.
+     *
+     * @param key - Metadata key
+     * @param value - Metadata value
+     *
+     * @example
+     * ```typescript
+     * writer.setMetadata('created_by', 'gitdo-analytics')
+     * writer.setMetadata('schema_version', '2.0')
+     * writer.setMetadata('repository', 'github.com/org/repo')
+     * ```
     */
     setMetadata(key: string, value: string): void;
     /**
-     *
+     * Generates the Parquet file as a buffer.
+     *
+     * @description
+     * Finalizes the file by flushing any remaining rows and generating
+     * the complete Parquet file structure including header, row groups,
+     * and footer with metadata.
+     *
+     * @returns Promise resolving to the complete Parquet file as Uint8Array
+     *
+     * @example
+     * ```typescript
+     * const buffer = await writer.toBuffer()
+     * await fs.writeFile('data.parquet', buffer)
+     * ```
     */
     toBuffer(): Promise<Uint8Array>;
     /**
-     *
+     * Writes the Parquet file to an output stream.
+     *
+     * @description
+     * Generates the file and writes it to the provided output stream.
+     * Useful for streaming to files or network destinations.
+     *
+     * @param output - The output stream to write to
+     * @returns Promise that resolves when writing is complete
+     *
+     * @example
+     * ```typescript
+     * const output = new FileOutputStream('data.parquet')
+     * await writer.writeTo(output)
+     * output.close()
+     * ```
     */
     writeTo(output: OutputStream): Promise<void>;
     /**
-     *
+     * Resets the writer to its initial state.
+     *
+     * @description
+     * Clears all written data, row groups, and metadata. The schema
+     * and options remain unchanged. Useful for writing multiple files
+     * with the same configuration.
+     *
+     * @example
+     * ```typescript
+     * // Write first file
+     * await writer.writeRows(batch1)
+     * const file1 = await writer.toBuffer()
+     *
+     * // Reset and write second file
+     * writer.reset()
+     * await writer.writeRows(batch2)
+     * const file2 = await writer.toBuffer()
+     * ```
     */
     reset(): void;
     /**
-     *
+     * Validates a row against the schema.
+     *
+     * @param row - The row to validate
+     * @throws {ParquetError} If validation fails
+     * @private
     */
     private _validateRow;
     /**
-     *
+     * Validates a value matches the expected Parquet type.
+     *
+     * @param value - The value to validate
+     * @param type - The expected Parquet type
+     * @returns true if valid, false otherwise
+     * @private
     */
     private _validateType;
     /**
-     *
+     * Estimates the memory size of a row.
+     *
+     * @param row - The row to estimate
+     * @returns Estimated size in bytes
+     * @private
     */
     private _estimateRowSize;
     /**
-     *
+     * Builds a row group from internal representation.
+     *
+     * @param internal - The internal row group data
+     * @returns The row group metadata
+     * @private
     */
     private _buildRowGroup;
     /**
-     *
+     * Computes statistics for a column.
+     *
+     * @param values - The column values
+     * @param type - The column type
+     * @returns Column statistics
+     * @private
     */
     private _computeStatistics;
     /**
-     *
+     * Estimates the encoded size after compression.
+     *
+     * @param values - The column values
+     * @param type - The column type
+     * @param compression - The compression type
+     * @returns Estimated compressed size in bytes
+     * @private
     */
     private _estimateEncodedSize;
     /**
-     *
+     * Estimates the uncompressed size of column values.
+     *
+     * @param values - The column values
+     * @param type - The column type
+     * @returns Estimated uncompressed size in bytes
+     * @private
     */
     private _estimateUncompressedSize;
     /**
-     *
+     * Generates the complete Parquet file bytes.
+     *
+     * @returns The complete Parquet file as Uint8Array
+     * @private
     */
     private _generateParquetBytes;
     /**
-     * Simple compression simulation for non-gzip formats
+     * Simple compression simulation for non-gzip formats.
+     *
+     * @param data - Data to compress
+     * @param compression - Compression type
+     * @returns Compressed data
+     * @private
     */
     private _simpleCompress;
 }
 /**
- *
+ * Defines a Parquet schema.
+ *
+ * @description
+ * Creates a validated Parquet schema from field definitions. Validates that:
+ * - Schema has at least one field
+ * - All field names are non-empty
+ * - All field names are unique
+ *
+ * @param fields - Array of field definitions
+ * @param metadata - Optional schema-level metadata
+ * @returns Validated Parquet schema
+ *
+ * @throws {ParquetError} EMPTY_SCHEMA - If fields array is empty
+ * @throws {ParquetError} EMPTY_FIELD_NAME - If any field name is empty
+ * @throws {ParquetError} DUPLICATE_FIELD - If field names are not unique
+ *
+ * @example
+ * ```typescript
+ * const schema = defineSchema([
+ *   { name: 'id', type: ParquetFieldType.INT64, required: true },
+ *   { name: 'name', type: ParquetFieldType.STRING, required: true },
+ *   { name: 'age', type: ParquetFieldType.INT32, required: false },
+ *   { name: 'created_at', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
+ * ], {
+ *   version: '1.0',
+ *   description: 'User records'
+ * })
+ * ```
 */
 export declare function defineSchema(fields: ParquetField[], metadata?: Record<string, string>): ParquetSchema;
 /**
- *
+ * Creates a Parquet writer.
+ *
+ * @description
+ * Factory function to create a ParquetWriter with the specified schema
+ * and options. Equivalent to `new ParquetWriter(schema, options)`.
+ *
+ * @param schema - The Parquet schema
+ * @param options - Writer options
+ * @returns A new ParquetWriter instance
+ *
+ * @example
+ * ```typescript
+ * const writer = createParquetWriter(schema, {
+ *   rowGroupSize: 10000,
+ *   compression: ParquetCompression.SNAPPY
+ * })
+ * ```
 */
 export declare function createParquetWriter(schema: ParquetSchema, options?: ParquetWriteOptions): ParquetWriter;
 /**
- *
+ * Writes data directly to a Parquet file buffer.
+ *
+ * @description
+ * Convenience function that creates a writer, writes all rows, and returns
+ * the complete Parquet file. Useful for simple one-shot writes.
+ *
+ * @param schema - The Parquet schema
+ * @param rows - Array of rows to write
+ * @param options - Writer options
+ * @returns Promise resolving to the complete Parquet file as Uint8Array
+ *
+ * @example
+ * ```typescript
+ * const buffer = await writeParquetFile(schema, [
+ *   { id: 1, name: 'Alice' },
+ *   { id: 2, name: 'Bob' }
+ * ], {
+ *   compression: ParquetCompression.GZIP
+ * })
+ *
+ * await fs.writeFile('data.parquet', buffer)
+ * ```
 */
 export declare function writeParquetFile(schema: ParquetSchema, rows: Record<string, unknown>[], options?: ParquetWriteOptions): Promise<Uint8Array>;
 /**
- *
+ * Closes a writer and returns the final buffer.
+ *
+ * @description
+ * Generates the final Parquet file buffer and marks the writer as closed.
+ * The writer cannot be used for further writes after calling this function.
+ *
+ * @param writer - The ParquetWriter to close
+ * @returns Promise resolving to the complete Parquet file as Uint8Array
+ *
+ * @example
+ * ```typescript
+ * await writer.writeRows(data)
+ * const buffer = await closeWriter(writer)
+ * console.log(writer.isClosed) // true
+ * ```
 */
 export declare function closeWriter(writer: ParquetWriter): Promise<Uint8Array>;
 /**
- *
+ * Adds a row group to the writer.
+ *
+ * @description
+ * Writes multiple rows and then flushes them as a single row group.
+ * Useful when you want explicit control over row group boundaries.
+ *
+ * @param writer - The ParquetWriter to use
+ * @param rows - Array of rows for this row group
+ * @returns Promise that resolves when the row group is written
+ *
+ * @example
+ * ```typescript
+ * // Add explicit row groups
+ * await addRowGroup(writer, batch1) // First row group
+ * await addRowGroup(writer, batch2) // Second row group
+ * ```
 */
 export declare function addRowGroup(writer: ParquetWriter, rows: Record<string, unknown>[]): Promise<void>;
 /**
- *
+ * Gets metadata from a Parquet file buffer.
+ *
+ * @description
+ * Parses a Parquet file buffer and extracts the metadata including
+ * schema, row groups, compression settings, and custom metadata.
+ *
+ * @param bytes - The Parquet file buffer
+ * @returns The parsed metadata
+ *
+ * @throws {ParquetError} INVALID_MAGIC - If file doesn't have valid Parquet magic bytes
+ *
+ * @example
+ * ```typescript
+ * const buffer = await fs.readFile('data.parquet')
+ * const metadata = getMetadata(buffer)
+ *
+ * console.log(`Rows: ${metadata.numRows}`)
+ * console.log(`Schema: ${metadata.schema.fields.map(f => f.name).join(', ')}`)
+ * console.log(`Row groups: ${metadata.rowGroups.length}`)
+ *
+ * for (const rg of metadata.rowGroups) {
+ *   console.log(` - ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
+ * }
+ * ```
 */
 export declare function getMetadata(bytes: Uint8Array): ParquetMetadata;
 /**
- *
+ * Sets the compression type for a writer.
+ *
+ * @description
+ * Updates the default compression algorithm for a writer. Affects all
+ * subsequently written data. Columns with explicit compression settings
+ * in columnCompression are not affected.
+ *
+ * @param writer - The ParquetWriter to update
+ * @param compression - The new compression type
+ *
+ * @example
+ * ```typescript
+ * const writer = createParquetWriter(schema)
+ *
+ * // Write some rows with SNAPPY (default)
+ * await writer.writeRows(batch1)
+ * await writer.flushRowGroup()
+ *
+ * // Switch to GZIP for remaining data
+ * setCompression(writer, ParquetCompression.GZIP)
+ * await writer.writeRows(batch2)
+ * ```
 */
 export declare function setCompression(writer: ParquetWriter, compression: ParquetCompression): void;
 //# sourceMappingURL=parquet-writer.d.ts.map