gitx.do 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (344) hide show
  1. package/README.md +40 -353
  2. package/dist/do/logger.d.ts +50 -0
  3. package/dist/do/logger.d.ts.map +1 -0
  4. package/dist/do/logger.js +122 -0
  5. package/dist/do/logger.js.map +1 -0
  6. package/dist/{durable-object → do}/schema.d.ts +3 -3
  7. package/dist/do/schema.d.ts.map +1 -0
  8. package/dist/{durable-object → do}/schema.js +4 -3
  9. package/dist/do/schema.js.map +1 -0
  10. package/dist/do/types.d.ts +267 -0
  11. package/dist/do/types.d.ts.map +1 -0
  12. package/dist/do/types.js +62 -0
  13. package/dist/do/types.js.map +1 -0
  14. package/dist/index.d.ts +15 -415
  15. package/dist/index.d.ts.map +1 -1
  16. package/dist/index.js +31 -483
  17. package/dist/index.js.map +1 -1
  18. package/package.json +13 -21
  19. package/dist/cli/commands/add.d.ts +0 -174
  20. package/dist/cli/commands/add.d.ts.map +0 -1
  21. package/dist/cli/commands/add.js +0 -131
  22. package/dist/cli/commands/add.js.map +0 -1
  23. package/dist/cli/commands/blame.d.ts +0 -259
  24. package/dist/cli/commands/blame.d.ts.map +0 -1
  25. package/dist/cli/commands/blame.js +0 -609
  26. package/dist/cli/commands/blame.js.map +0 -1
  27. package/dist/cli/commands/branch.d.ts +0 -249
  28. package/dist/cli/commands/branch.d.ts.map +0 -1
  29. package/dist/cli/commands/branch.js +0 -693
  30. package/dist/cli/commands/branch.js.map +0 -1
  31. package/dist/cli/commands/commit.d.ts +0 -182
  32. package/dist/cli/commands/commit.d.ts.map +0 -1
  33. package/dist/cli/commands/commit.js +0 -437
  34. package/dist/cli/commands/commit.js.map +0 -1
  35. package/dist/cli/commands/diff.d.ts +0 -464
  36. package/dist/cli/commands/diff.d.ts.map +0 -1
  37. package/dist/cli/commands/diff.js +0 -958
  38. package/dist/cli/commands/diff.js.map +0 -1
  39. package/dist/cli/commands/log.d.ts +0 -239
  40. package/dist/cli/commands/log.d.ts.map +0 -1
  41. package/dist/cli/commands/log.js +0 -535
  42. package/dist/cli/commands/log.js.map +0 -1
  43. package/dist/cli/commands/merge.d.ts +0 -106
  44. package/dist/cli/commands/merge.d.ts.map +0 -1
  45. package/dist/cli/commands/merge.js +0 -55
  46. package/dist/cli/commands/merge.js.map +0 -1
  47. package/dist/cli/commands/review.d.ts +0 -457
  48. package/dist/cli/commands/review.d.ts.map +0 -1
  49. package/dist/cli/commands/review.js +0 -533
  50. package/dist/cli/commands/review.js.map +0 -1
  51. package/dist/cli/commands/status.d.ts +0 -269
  52. package/dist/cli/commands/status.d.ts.map +0 -1
  53. package/dist/cli/commands/status.js +0 -493
  54. package/dist/cli/commands/status.js.map +0 -1
  55. package/dist/cli/commands/web.d.ts +0 -199
  56. package/dist/cli/commands/web.d.ts.map +0 -1
  57. package/dist/cli/commands/web.js +0 -696
  58. package/dist/cli/commands/web.js.map +0 -1
  59. package/dist/cli/fs-adapter.d.ts +0 -656
  60. package/dist/cli/fs-adapter.d.ts.map +0 -1
  61. package/dist/cli/fs-adapter.js +0 -1179
  62. package/dist/cli/fs-adapter.js.map +0 -1
  63. package/dist/cli/fsx-cli-adapter.d.ts +0 -359
  64. package/dist/cli/fsx-cli-adapter.d.ts.map +0 -1
  65. package/dist/cli/fsx-cli-adapter.js +0 -619
  66. package/dist/cli/fsx-cli-adapter.js.map +0 -1
  67. package/dist/cli/index.d.ts +0 -387
  68. package/dist/cli/index.d.ts.map +0 -1
  69. package/dist/cli/index.js +0 -523
  70. package/dist/cli/index.js.map +0 -1
  71. package/dist/cli/ui/components/DiffView.d.ts +0 -7
  72. package/dist/cli/ui/components/DiffView.d.ts.map +0 -1
  73. package/dist/cli/ui/components/DiffView.js +0 -11
  74. package/dist/cli/ui/components/DiffView.js.map +0 -1
  75. package/dist/cli/ui/components/ErrorDisplay.d.ts +0 -6
  76. package/dist/cli/ui/components/ErrorDisplay.d.ts.map +0 -1
  77. package/dist/cli/ui/components/ErrorDisplay.js +0 -11
  78. package/dist/cli/ui/components/ErrorDisplay.js.map +0 -1
  79. package/dist/cli/ui/components/FuzzySearch.d.ts +0 -9
  80. package/dist/cli/ui/components/FuzzySearch.d.ts.map +0 -1
  81. package/dist/cli/ui/components/FuzzySearch.js +0 -12
  82. package/dist/cli/ui/components/FuzzySearch.js.map +0 -1
  83. package/dist/cli/ui/components/LoadingSpinner.d.ts +0 -6
  84. package/dist/cli/ui/components/LoadingSpinner.d.ts.map +0 -1
  85. package/dist/cli/ui/components/LoadingSpinner.js +0 -10
  86. package/dist/cli/ui/components/LoadingSpinner.js.map +0 -1
  87. package/dist/cli/ui/components/NavigationList.d.ts +0 -9
  88. package/dist/cli/ui/components/NavigationList.d.ts.map +0 -1
  89. package/dist/cli/ui/components/NavigationList.js +0 -11
  90. package/dist/cli/ui/components/NavigationList.js.map +0 -1
  91. package/dist/cli/ui/components/ScrollableContent.d.ts +0 -8
  92. package/dist/cli/ui/components/ScrollableContent.d.ts.map +0 -1
  93. package/dist/cli/ui/components/ScrollableContent.js +0 -11
  94. package/dist/cli/ui/components/ScrollableContent.js.map +0 -1
  95. package/dist/cli/ui/components/index.d.ts +0 -7
  96. package/dist/cli/ui/components/index.d.ts.map +0 -1
  97. package/dist/cli/ui/components/index.js +0 -9
  98. package/dist/cli/ui/components/index.js.map +0 -1
  99. package/dist/cli/ui/terminal-ui.d.ts +0 -52
  100. package/dist/cli/ui/terminal-ui.d.ts.map +0 -1
  101. package/dist/cli/ui/terminal-ui.js +0 -121
  102. package/dist/cli/ui/terminal-ui.js.map +0 -1
  103. package/dist/do/BashModule.d.ts +0 -871
  104. package/dist/do/BashModule.d.ts.map +0 -1
  105. package/dist/do/BashModule.js +0 -1143
  106. package/dist/do/BashModule.js.map +0 -1
  107. package/dist/do/FsModule.d.ts +0 -601
  108. package/dist/do/FsModule.d.ts.map +0 -1
  109. package/dist/do/FsModule.js +0 -1120
  110. package/dist/do/FsModule.js.map +0 -1
  111. package/dist/do/GitModule.d.ts +0 -635
  112. package/dist/do/GitModule.d.ts.map +0 -1
  113. package/dist/do/GitModule.js +0 -781
  114. package/dist/do/GitModule.js.map +0 -1
  115. package/dist/do/GitRepoDO.d.ts +0 -281
  116. package/dist/do/GitRepoDO.d.ts.map +0 -1
  117. package/dist/do/GitRepoDO.js +0 -479
  118. package/dist/do/GitRepoDO.js.map +0 -1
  119. package/dist/do/bash-ast.d.ts +0 -246
  120. package/dist/do/bash-ast.d.ts.map +0 -1
  121. package/dist/do/bash-ast.js +0 -888
  122. package/dist/do/bash-ast.js.map +0 -1
  123. package/dist/do/container-executor.d.ts +0 -491
  124. package/dist/do/container-executor.d.ts.map +0 -1
  125. package/dist/do/container-executor.js +0 -730
  126. package/dist/do/container-executor.js.map +0 -1
  127. package/dist/do/index.d.ts +0 -53
  128. package/dist/do/index.d.ts.map +0 -1
  129. package/dist/do/index.js +0 -91
  130. package/dist/do/index.js.map +0 -1
  131. package/dist/do/tiered-storage.d.ts +0 -403
  132. package/dist/do/tiered-storage.d.ts.map +0 -1
  133. package/dist/do/tiered-storage.js +0 -689
  134. package/dist/do/tiered-storage.js.map +0 -1
  135. package/dist/do/withBash.d.ts +0 -231
  136. package/dist/do/withBash.d.ts.map +0 -1
  137. package/dist/do/withBash.js +0 -244
  138. package/dist/do/withBash.js.map +0 -1
  139. package/dist/do/withFs.d.ts +0 -237
  140. package/dist/do/withFs.d.ts.map +0 -1
  141. package/dist/do/withFs.js +0 -387
  142. package/dist/do/withFs.js.map +0 -1
  143. package/dist/do/withGit.d.ts +0 -180
  144. package/dist/do/withGit.d.ts.map +0 -1
  145. package/dist/do/withGit.js +0 -271
  146. package/dist/do/withGit.js.map +0 -1
  147. package/dist/durable-object/object-store.d.ts +0 -633
  148. package/dist/durable-object/object-store.d.ts.map +0 -1
  149. package/dist/durable-object/object-store.js +0 -1161
  150. package/dist/durable-object/object-store.js.map +0 -1
  151. package/dist/durable-object/schema.d.ts.map +0 -1
  152. package/dist/durable-object/schema.js.map +0 -1
  153. package/dist/durable-object/wal.d.ts +0 -416
  154. package/dist/durable-object/wal.d.ts.map +0 -1
  155. package/dist/durable-object/wal.js +0 -445
  156. package/dist/durable-object/wal.js.map +0 -1
  157. package/dist/mcp/adapter.d.ts +0 -772
  158. package/dist/mcp/adapter.d.ts.map +0 -1
  159. package/dist/mcp/adapter.js +0 -895
  160. package/dist/mcp/adapter.js.map +0 -1
  161. package/dist/mcp/sandbox/miniflare-evaluator.d.ts +0 -22
  162. package/dist/mcp/sandbox/miniflare-evaluator.d.ts.map +0 -1
  163. package/dist/mcp/sandbox/miniflare-evaluator.js +0 -140
  164. package/dist/mcp/sandbox/miniflare-evaluator.js.map +0 -1
  165. package/dist/mcp/sandbox/object-store-proxy.d.ts +0 -32
  166. package/dist/mcp/sandbox/object-store-proxy.d.ts.map +0 -1
  167. package/dist/mcp/sandbox/object-store-proxy.js +0 -30
  168. package/dist/mcp/sandbox/object-store-proxy.js.map +0 -1
  169. package/dist/mcp/sandbox/template.d.ts +0 -17
  170. package/dist/mcp/sandbox/template.d.ts.map +0 -1
  171. package/dist/mcp/sandbox/template.js +0 -71
  172. package/dist/mcp/sandbox/template.js.map +0 -1
  173. package/dist/mcp/sandbox.d.ts +0 -764
  174. package/dist/mcp/sandbox.d.ts.map +0 -1
  175. package/dist/mcp/sandbox.js +0 -1362
  176. package/dist/mcp/sandbox.js.map +0 -1
  177. package/dist/mcp/sdk-adapter.d.ts +0 -835
  178. package/dist/mcp/sdk-adapter.d.ts.map +0 -1
  179. package/dist/mcp/sdk-adapter.js +0 -974
  180. package/dist/mcp/sdk-adapter.js.map +0 -1
  181. package/dist/mcp/tools/do.d.ts +0 -32
  182. package/dist/mcp/tools/do.d.ts.map +0 -1
  183. package/dist/mcp/tools/do.js +0 -115
  184. package/dist/mcp/tools/do.js.map +0 -1
  185. package/dist/mcp/tools.d.ts +0 -548
  186. package/dist/mcp/tools.d.ts.map +0 -1
  187. package/dist/mcp/tools.js +0 -1934
  188. package/dist/mcp/tools.js.map +0 -1
  189. package/dist/ops/blame.d.ts +0 -551
  190. package/dist/ops/blame.d.ts.map +0 -1
  191. package/dist/ops/blame.js +0 -1037
  192. package/dist/ops/blame.js.map +0 -1
  193. package/dist/ops/branch.d.ts +0 -766
  194. package/dist/ops/branch.d.ts.map +0 -1
  195. package/dist/ops/branch.js +0 -950
  196. package/dist/ops/branch.js.map +0 -1
  197. package/dist/ops/commit-traversal.d.ts +0 -349
  198. package/dist/ops/commit-traversal.d.ts.map +0 -1
  199. package/dist/ops/commit-traversal.js +0 -821
  200. package/dist/ops/commit-traversal.js.map +0 -1
  201. package/dist/ops/commit.d.ts +0 -555
  202. package/dist/ops/commit.d.ts.map +0 -1
  203. package/dist/ops/commit.js +0 -826
  204. package/dist/ops/commit.js.map +0 -1
  205. package/dist/ops/merge-base.d.ts +0 -397
  206. package/dist/ops/merge-base.d.ts.map +0 -1
  207. package/dist/ops/merge-base.js +0 -691
  208. package/dist/ops/merge-base.js.map +0 -1
  209. package/dist/ops/merge.d.ts +0 -855
  210. package/dist/ops/merge.d.ts.map +0 -1
  211. package/dist/ops/merge.js +0 -1551
  212. package/dist/ops/merge.js.map +0 -1
  213. package/dist/ops/tag.d.ts +0 -247
  214. package/dist/ops/tag.d.ts.map +0 -1
  215. package/dist/ops/tag.js +0 -649
  216. package/dist/ops/tag.js.map +0 -1
  217. package/dist/ops/tree-builder.d.ts +0 -178
  218. package/dist/ops/tree-builder.d.ts.map +0 -1
  219. package/dist/ops/tree-builder.js +0 -271
  220. package/dist/ops/tree-builder.js.map +0 -1
  221. package/dist/ops/tree-diff.d.ts +0 -291
  222. package/dist/ops/tree-diff.d.ts.map +0 -1
  223. package/dist/ops/tree-diff.js +0 -705
  224. package/dist/ops/tree-diff.js.map +0 -1
  225. package/dist/pack/delta.d.ts +0 -248
  226. package/dist/pack/delta.d.ts.map +0 -1
  227. package/dist/pack/delta.js +0 -736
  228. package/dist/pack/delta.js.map +0 -1
  229. package/dist/pack/format.d.ts +0 -446
  230. package/dist/pack/format.d.ts.map +0 -1
  231. package/dist/pack/format.js +0 -572
  232. package/dist/pack/format.js.map +0 -1
  233. package/dist/pack/full-generation.d.ts +0 -612
  234. package/dist/pack/full-generation.d.ts.map +0 -1
  235. package/dist/pack/full-generation.js +0 -1378
  236. package/dist/pack/full-generation.js.map +0 -1
  237. package/dist/pack/generation.d.ts +0 -441
  238. package/dist/pack/generation.d.ts.map +0 -1
  239. package/dist/pack/generation.js +0 -707
  240. package/dist/pack/generation.js.map +0 -1
  241. package/dist/pack/index.d.ts +0 -502
  242. package/dist/pack/index.d.ts.map +0 -1
  243. package/dist/pack/index.js +0 -833
  244. package/dist/pack/index.js.map +0 -1
  245. package/dist/refs/branch.d.ts +0 -668
  246. package/dist/refs/branch.d.ts.map +0 -1
  247. package/dist/refs/branch.js +0 -897
  248. package/dist/refs/branch.js.map +0 -1
  249. package/dist/refs/storage.d.ts +0 -833
  250. package/dist/refs/storage.d.ts.map +0 -1
  251. package/dist/refs/storage.js +0 -1023
  252. package/dist/refs/storage.js.map +0 -1
  253. package/dist/refs/tag.d.ts +0 -860
  254. package/dist/refs/tag.d.ts.map +0 -1
  255. package/dist/refs/tag.js +0 -996
  256. package/dist/refs/tag.js.map +0 -1
  257. package/dist/storage/backend.d.ts +0 -425
  258. package/dist/storage/backend.d.ts.map +0 -1
  259. package/dist/storage/backend.js +0 -41
  260. package/dist/storage/backend.js.map +0 -1
  261. package/dist/storage/fsx-adapter.d.ts +0 -204
  262. package/dist/storage/fsx-adapter.d.ts.map +0 -1
  263. package/dist/storage/fsx-adapter.js +0 -470
  264. package/dist/storage/fsx-adapter.js.map +0 -1
  265. package/dist/storage/lru-cache.d.ts +0 -691
  266. package/dist/storage/lru-cache.d.ts.map +0 -1
  267. package/dist/storage/lru-cache.js +0 -813
  268. package/dist/storage/lru-cache.js.map +0 -1
  269. package/dist/storage/object-index.d.ts +0 -585
  270. package/dist/storage/object-index.d.ts.map +0 -1
  271. package/dist/storage/object-index.js +0 -532
  272. package/dist/storage/object-index.js.map +0 -1
  273. package/dist/storage/r2-pack.d.ts +0 -1257
  274. package/dist/storage/r2-pack.d.ts.map +0 -1
  275. package/dist/storage/r2-pack.js +0 -1770
  276. package/dist/storage/r2-pack.js.map +0 -1
  277. package/dist/tiered/cdc-pipeline.d.ts +0 -1888
  278. package/dist/tiered/cdc-pipeline.d.ts.map +0 -1
  279. package/dist/tiered/cdc-pipeline.js +0 -1880
  280. package/dist/tiered/cdc-pipeline.js.map +0 -1
  281. package/dist/tiered/migration.d.ts +0 -1104
  282. package/dist/tiered/migration.d.ts.map +0 -1
  283. package/dist/tiered/migration.js +0 -1214
  284. package/dist/tiered/migration.js.map +0 -1
  285. package/dist/tiered/parquet-writer.d.ts +0 -1145
  286. package/dist/tiered/parquet-writer.d.ts.map +0 -1
  287. package/dist/tiered/parquet-writer.js +0 -1183
  288. package/dist/tiered/parquet-writer.js.map +0 -1
  289. package/dist/tiered/read-path.d.ts +0 -835
  290. package/dist/tiered/read-path.d.ts.map +0 -1
  291. package/dist/tiered/read-path.js +0 -487
  292. package/dist/tiered/read-path.js.map +0 -1
  293. package/dist/types/capability.d.ts +0 -1385
  294. package/dist/types/capability.d.ts.map +0 -1
  295. package/dist/types/capability.js +0 -36
  296. package/dist/types/capability.js.map +0 -1
  297. package/dist/types/index.d.ts +0 -13
  298. package/dist/types/index.d.ts.map +0 -1
  299. package/dist/types/index.js +0 -18
  300. package/dist/types/index.js.map +0 -1
  301. package/dist/types/objects.d.ts +0 -692
  302. package/dist/types/objects.d.ts.map +0 -1
  303. package/dist/types/objects.js +0 -837
  304. package/dist/types/objects.js.map +0 -1
  305. package/dist/types/storage.d.ts +0 -603
  306. package/dist/types/storage.d.ts.map +0 -1
  307. package/dist/types/storage.js +0 -191
  308. package/dist/types/storage.js.map +0 -1
  309. package/dist/types/worker-loader.d.ts +0 -60
  310. package/dist/types/worker-loader.d.ts.map +0 -1
  311. package/dist/types/worker-loader.js +0 -62
  312. package/dist/types/worker-loader.js.map +0 -1
  313. package/dist/utils/hash.d.ts +0 -197
  314. package/dist/utils/hash.d.ts.map +0 -1
  315. package/dist/utils/hash.js +0 -268
  316. package/dist/utils/hash.js.map +0 -1
  317. package/dist/utils/sha1.d.ts +0 -290
  318. package/dist/utils/sha1.d.ts.map +0 -1
  319. package/dist/utils/sha1.js +0 -582
  320. package/dist/utils/sha1.js.map +0 -1
  321. package/dist/wire/capabilities.d.ts +0 -1044
  322. package/dist/wire/capabilities.d.ts.map +0 -1
  323. package/dist/wire/capabilities.js +0 -941
  324. package/dist/wire/capabilities.js.map +0 -1
  325. package/dist/wire/path-security.d.ts +0 -157
  326. package/dist/wire/path-security.d.ts.map +0 -1
  327. package/dist/wire/path-security.js +0 -307
  328. package/dist/wire/path-security.js.map +0 -1
  329. package/dist/wire/pkt-line.d.ts +0 -345
  330. package/dist/wire/pkt-line.d.ts.map +0 -1
  331. package/dist/wire/pkt-line.js +0 -381
  332. package/dist/wire/pkt-line.js.map +0 -1
  333. package/dist/wire/receive-pack.d.ts +0 -1059
  334. package/dist/wire/receive-pack.d.ts.map +0 -1
  335. package/dist/wire/receive-pack.js +0 -1414
  336. package/dist/wire/receive-pack.js.map +0 -1
  337. package/dist/wire/smart-http.d.ts +0 -799
  338. package/dist/wire/smart-http.d.ts.map +0 -1
  339. package/dist/wire/smart-http.js +0 -945
  340. package/dist/wire/smart-http.js.map +0 -1
  341. package/dist/wire/upload-pack.d.ts +0 -727
  342. package/dist/wire/upload-pack.d.ts.map +0 -1
  343. package/dist/wire/upload-pack.js +0 -1138
  344. package/dist/wire/upload-pack.js.map +0 -1
@@ -1,1183 +0,0 @@
1
- /**
2
- * @fileoverview Parquet Writer for Git Analytics
3
- *
4
- * @description
5
- * Provides functionality to write git analytics data to Parquet format, a
6
- * columnar storage format optimized for analytical queries. This module
7
- * enables efficient storage and querying of Git repository data.
8
- *
9
- * **Key Features:**
10
- * - Schema definition with various field types (STRING, INT32, INT64, etc.)
11
- * - Multiple compression algorithms (SNAPPY, GZIP, ZSTD, LZ4, UNCOMPRESSED)
12
- * - Row group management for efficient columnar storage
13
- * - Automatic and manual row group flushing
14
- * - Column-level statistics generation (min, max, null count)
15
- * - Custom key-value metadata support
16
- * - Memory-efficient streaming writes
17
- *
18
- * **Parquet Format:**
19
- * The generated files follow the Parquet format with:
20
- * - Magic bytes "PAR1" at start and end
21
- * - Row group data organized by columns
22
- * - Footer metadata containing schema and statistics
23
- *
24
- * @example
25
- * ```typescript
26
- * // Define schema for commit analytics
27
- * const schema = defineSchema([
28
- * { name: 'commit_sha', type: ParquetFieldType.STRING, required: true },
29
- * { name: 'author', type: ParquetFieldType.STRING, required: true },
30
- * { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true },
31
- * { name: 'file_count', type: ParquetFieldType.INT32, required: false }
32
- * ])
33
- *
34
- * // Create writer with options
35
- * const writer = createParquetWriter(schema, {
36
- * rowGroupSize: 10000,
37
- * compression: ParquetCompression.SNAPPY,
38
- * enableStatistics: true
39
- * })
40
- *
41
- * // Write data
42
- * await writer.writeRows([
43
- * { commit_sha: 'abc123...', author: 'alice', timestamp: Date.now(), file_count: 5 },
44
- * { commit_sha: 'def456...', author: 'bob', timestamp: Date.now(), file_count: 3 }
45
- * ])
46
- *
47
- * // Generate the Parquet file
48
- * const buffer = await writer.toBuffer()
49
- * ```
50
- *
51
- * @module tiered/parquet-writer
52
- * @see {@link ParquetWriter} - Main writer class
53
- * @see {@link defineSchema} - Schema definition helper
54
- */
55
- import pako from 'pako';
56
- // ============================================================================
57
- // Types and Enums
58
- // ============================================================================
59
- /**
60
- * Supported Parquet field types.
61
- *
62
- * @description
63
- * Defines the data types that can be used for fields in a Parquet schema.
64
- * Each type maps to an appropriate physical and logical Parquet type.
65
- *
66
- * @example
67
- * ```typescript
68
- * const field: ParquetField = {
69
- * name: 'count',
70
- * type: ParquetFieldType.INT64,
71
- * required: true
72
- * }
73
- * ```
74
- *
75
- * @enum {string}
76
- */
77
- export var ParquetFieldType;
78
- (function (ParquetFieldType) {
79
- /**
80
- * UTF-8 encoded string.
81
- * Maps to Parquet BYTE_ARRAY with UTF8 logical type.
82
- */
83
- ParquetFieldType["STRING"] = "STRING";
84
- /**
85
- * 32-bit signed integer.
86
- * Maps to Parquet INT32 physical type.
87
- */
88
- ParquetFieldType["INT32"] = "INT32";
89
- /**
90
- * 64-bit signed integer.
91
- * Maps to Parquet INT64 physical type.
92
- */
93
- ParquetFieldType["INT64"] = "INT64";
94
- /**
95
- * Boolean value (true/false).
96
- * Maps to Parquet BOOLEAN physical type.
97
- */
98
- ParquetFieldType["BOOLEAN"] = "BOOLEAN";
99
- /**
100
- * 32-bit IEEE 754 floating point.
101
- * Maps to Parquet FLOAT physical type.
102
- */
103
- ParquetFieldType["FLOAT"] = "FLOAT";
104
- /**
105
- * 64-bit IEEE 754 floating point.
106
- * Maps to Parquet DOUBLE physical type.
107
- */
108
- ParquetFieldType["DOUBLE"] = "DOUBLE";
109
- /**
110
- * Raw binary data.
111
- * Maps to Parquet BYTE_ARRAY physical type.
112
- */
113
- ParquetFieldType["BINARY"] = "BINARY";
114
- /**
115
- * Timestamp with millisecond precision.
116
- * Maps to Parquet INT64 with TIMESTAMP_MILLIS logical type.
117
- */
118
- ParquetFieldType["TIMESTAMP_MILLIS"] = "TIMESTAMP_MILLIS";
119
- /**
120
- * Timestamp with microsecond precision.
121
- * Maps to Parquet INT64 with TIMESTAMP_MICROS logical type.
122
- */
123
- ParquetFieldType["TIMESTAMP_MICROS"] = "TIMESTAMP_MICROS";
124
- })(ParquetFieldType || (ParquetFieldType = {}));
125
- /**
126
- * Supported compression types for Parquet data.
127
- *
128
- * @description
129
- * Different compression algorithms offer trade-offs between compression
130
- * ratio, compression speed, and decompression speed.
131
- *
132
- * **Comparison:**
133
- * - SNAPPY: Fast compression/decompression, moderate ratio (default)
134
- * - GZIP: Higher ratio, slower compression, fast decompression
135
- * - ZSTD: Best ratio, good speed, requires more memory
136
- * - LZ4: Fastest, lower ratio
137
- * - UNCOMPRESSED: No compression overhead
138
- *
139
- * @example
140
- * ```typescript
141
- * const writer = createParquetWriter(schema, {
142
- * compression: ParquetCompression.ZSTD
143
- * })
144
- * ```
145
- *
146
- * @enum {string}
147
- */
148
- export var ParquetCompression;
149
- (function (ParquetCompression) {
150
- /**
151
- * No compression applied.
152
- * Fastest writes, largest file size.
153
- */
154
- ParquetCompression["UNCOMPRESSED"] = "UNCOMPRESSED";
155
- /**
156
- * Snappy compression (default).
157
- * Good balance of speed and compression ratio.
158
- */
159
- ParquetCompression["SNAPPY"] = "SNAPPY";
160
- /**
161
- * GZIP compression.
162
- * Higher compression ratio, slower compression.
163
- */
164
- ParquetCompression["GZIP"] = "GZIP";
165
- /**
166
- * Zstandard compression.
167
- * Best compression ratio with good speed.
168
- */
169
- ParquetCompression["ZSTD"] = "ZSTD";
170
- /**
171
- * LZ4 compression.
172
- * Fastest compression, lower ratio.
173
- */
174
- ParquetCompression["LZ4"] = "LZ4";
175
- })(ParquetCompression || (ParquetCompression = {}));
176
- /**
177
- * Error class for Parquet-related operations.
178
- *
179
- * @description
180
- * Thrown when Parquet operations fail, such as schema validation errors,
181
- * invalid data types, or malformed files.
182
- *
183
- * @example
184
- * ```typescript
185
- * try {
186
- * await writer.writeRow({ invalid_field: 'value' })
187
- * } catch (error) {
188
- * if (error instanceof ParquetError) {
189
- * console.log(`Parquet error (${error.code}): ${error.message}`)
190
- * }
191
- * }
192
- * ```
193
- *
194
- * @class ParquetError
195
- * @extends Error
196
- */
197
- export class ParquetError extends Error {
198
- code;
199
- /**
200
- * Creates a new ParquetError.
201
- *
202
- * @param message - Human-readable error message
203
- * @param code - Error code for programmatic handling
204
- *
205
- * @example
206
- * ```typescript
207
- * throw new ParquetError('Field name cannot be empty', 'EMPTY_FIELD_NAME')
208
- * ```
209
- */
210
- constructor(message, code) {
211
- super(message);
212
- this.code = code;
213
- this.name = 'ParquetError';
214
- }
215
- }
216
- // ============================================================================
217
- // ParquetWriter Class
218
- // ============================================================================
219
- /**
220
- * Parquet writer for git analytics data.
221
- *
222
- * @description
223
- * ParquetWriter provides a streaming interface for writing data to Parquet
224
- * format. It handles schema validation, row group management, compression,
225
- * and statistics generation.
226
- *
227
- * **Usage Pattern:**
228
- * 1. Create a schema using `defineSchema()`
229
- * 2. Create a writer with `createParquetWriter()` or `new ParquetWriter()`
230
- * 3. Write rows using `writeRow()` or `writeRows()`
231
- * 4. Generate the file with `toBuffer()` or `writeTo()`
232
- *
233
- * **Row Group Management:**
234
- * Rows are buffered in memory until the row group is full (by row count
235
- * or memory limit), then flushed. You can also manually flush with
236
- * `flushRowGroup()`.
237
- *
238
- * **Thread Safety:**
239
- * Not thread-safe. Use separate writer instances for concurrent writes.
240
- *
241
- * @example
242
- * ```typescript
243
- * // Create schema
244
- * const schema = defineSchema([
245
- * { name: 'sha', type: ParquetFieldType.STRING, required: true },
246
- * { name: 'type', type: ParquetFieldType.STRING, required: true },
247
- * { name: 'size', type: ParquetFieldType.INT64, required: true },
248
- * { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
249
- * ])
250
- *
251
- * // Create writer
252
- * const writer = new ParquetWriter(schema, {
253
- * rowGroupSize: 10000,
254
- * compression: ParquetCompression.SNAPPY,
255
- * enableStatistics: true
256
- * })
257
- *
258
- * // Write data
259
- * for (const object of gitObjects) {
260
- * await writer.writeRow({
261
- * sha: object.sha,
262
- * type: object.type,
263
- * size: object.size,
264
- * timestamp: Date.now()
265
- * })
266
- * }
267
- *
268
- * // Set custom metadata
269
- * writer.setMetadata('git_version', '2.40.0')
270
- * writer.setMetadata('repository', 'github.com/org/repo')
271
- *
272
- * // Generate file
273
- * const buffer = await writer.toBuffer()
274
- * console.log(`Generated ${buffer.length} bytes`)
275
- * console.log(`Rows: ${writer.rowCount}`)
276
- * console.log(`Row groups: ${writer.rowGroupCount}`)
277
- *
278
- * // Reset for reuse
279
- * writer.reset()
280
- * ```
281
- *
282
- * @class ParquetWriter
283
- */
284
- export class ParquetWriter {
285
- /**
286
- * The Parquet schema for this writer.
287
- * @readonly
288
- */
289
- schema;
290
- /**
291
- * Resolved options with defaults applied.
292
- * @readonly
293
- */
294
- options;
295
- /**
296
- * Total row count written.
297
- * @private
298
- */
299
- _rowCount = 0;
300
- /**
301
- * Completed row groups.
302
- * @private
303
- */
304
- _rowGroups = [];
305
- /**
306
- * Current row group being built.
307
- * @private
308
- */
309
- _currentRowGroup = { rows: [], byteSize: 0 };
310
- /**
311
- * Whether the writer has been closed.
312
- * @private
313
- */
314
- _isClosed = false;
315
- /**
316
- * Custom key-value metadata.
317
- * @private
318
- */
319
- _keyValueMetadata = {};
320
- /**
321
- * Creation timestamp.
322
- * @private
323
- */
324
- _createdAt = Date.now();
325
- /**
326
- * Creates a new ParquetWriter instance.
327
- *
328
- * @param schema - The Parquet schema defining columns
329
- * @param options - Writer configuration options
330
- *
331
- * @example
332
- * ```typescript
333
- * const writer = new ParquetWriter(schema, {
334
- * rowGroupSize: 50000,
335
- * compression: ParquetCompression.GZIP
336
- * })
337
- * ```
338
- */
339
- constructor(schema, options = {}) {
340
- this.schema = schema;
341
- this.options = {
342
- rowGroupSize: options.rowGroupSize ?? 65536,
343
- compression: options.compression ?? ParquetCompression.SNAPPY,
344
- ...options
345
- };
346
- }
347
- /**
348
- * Gets the total row count written to the writer.
349
- *
350
- * @description
351
- * Returns the total number of rows written, including rows in the
352
- * current unflushed row group.
353
- *
354
- * @returns Total row count
355
- *
356
- * @example
357
- * ```typescript
358
- * await writer.writeRows(data)
359
- * console.log(`Wrote ${writer.rowCount} rows`)
360
- * ```
361
- */
362
- get rowCount() {
363
- return this._rowCount;
364
- }
365
- /**
366
- * Gets the number of row groups.
367
- *
368
- * @description
369
- * Returns the number of completed row groups plus one if there's
370
- * a pending row group with data.
371
- *
372
- * @returns Number of row groups
373
- *
374
- * @example
375
- * ```typescript
376
- * console.log(`Row groups: ${writer.rowGroupCount}`)
377
- * ```
378
- */
379
- get rowGroupCount() {
380
- const pendingCount = this._currentRowGroup.rows.length > 0 ? 1 : 0;
381
- return this._rowGroups.length + pendingCount;
382
- }
383
- /**
384
- * Checks if the writer has been closed.
385
- *
386
- * @description
387
- * A closed writer cannot accept new rows. Writers are closed
388
- * implicitly by `closeWriter()`.
389
- *
390
- * @returns true if closed
391
- *
392
- * @example
393
- * ```typescript
394
- * if (!writer.isClosed) {
395
- * await writer.writeRow(row)
396
- * }
397
- * ```
398
- */
399
- get isClosed() {
400
- return this._isClosed;
401
- }
402
- /**
403
- * Writes a single row to the Parquet file.
404
- *
405
- * @description
406
- * Validates the row against the schema and adds it to the current
407
- * row group. Automatically flushes the row group when it reaches
408
- * the configured size or memory limit.
409
- *
410
- * @param row - Object with column values keyed by column name
411
- * @returns Promise that resolves when the row is written
412
- *
413
- * @throws {ParquetError} WRITER_CLOSED - If writer is closed
414
- * @throws {ParquetError} MISSING_REQUIRED_FIELD - If required field is missing
415
- * @throws {ParquetError} INVALID_FIELD_TYPE - If field value type doesn't match schema
416
- *
417
- * @example
418
- * ```typescript
419
- * await writer.writeRow({
420
- * id: 123,
421
- * name: 'Alice',
422
- * active: true
423
- * })
424
- * ```
425
- */
426
- async writeRow(row) {
427
- if (this._isClosed) {
428
- throw new ParquetError('Cannot write to a closed writer', 'WRITER_CLOSED');
429
- }
430
- this._validateRow(row);
431
- const rowSize = this._estimateRowSize(row);
432
- this._currentRowGroup.rows.push(row);
433
- this._currentRowGroup.byteSize += rowSize;
434
- this._rowCount++;
435
- // Check if we should flush based on row count
436
- if (this._currentRowGroup.rows.length >= this.options.rowGroupSize) {
437
- await this.flushRowGroup();
438
- }
439
- // Check if we should flush based on memory limit
440
- else if (this.options.rowGroupMemoryLimit &&
441
- this._currentRowGroup.byteSize >= this.options.rowGroupMemoryLimit) {
442
- await this.flushRowGroup();
443
- }
444
- }
445
- /**
446
- * Writes multiple rows to the Parquet file.
447
- *
448
- * @description
449
- * Convenience method that writes an array of rows sequentially.
450
- * Each row is validated and may trigger row group flushes.
451
- *
452
- * @param rows - Array of row objects to write
453
- * @returns Promise that resolves when all rows are written
454
- *
455
- * @throws {ParquetError} Any error from writeRow()
456
- *
457
- * @example
458
- * ```typescript
459
- * await writer.writeRows([
460
- * { id: 1, name: 'Alice' },
461
- * { id: 2, name: 'Bob' },
462
- * { id: 3, name: 'Carol' }
463
- * ])
464
- * ```
465
- */
466
- async writeRows(rows) {
467
- for (const row of rows) {
468
- await this.writeRow(row);
469
- }
470
- }
471
- /**
472
- * Manually flushes the current row group.
473
- *
474
- * @description
475
- * Forces the current row group to be finalized and stored, even if
476
- * it hasn't reached the size limit. Has no effect if the current
477
- * row group is empty.
478
- *
479
- * @returns Promise that resolves when flush is complete
480
- *
481
- * @example
482
- * ```typescript
483
- * // Write some rows
484
- * await writer.writeRows(batch1)
485
- *
486
- * // Force flush before writing next batch
487
- * await writer.flushRowGroup()
488
- *
489
- * // Continue writing
490
- * await writer.writeRows(batch2)
491
- * ```
492
- */
493
- async flushRowGroup() {
494
- if (this._currentRowGroup.rows.length === 0) {
495
- return;
496
- }
497
- const rowGroup = this._buildRowGroup(this._currentRowGroup);
498
- this._rowGroups.push(rowGroup);
499
- this._currentRowGroup = { rows: [], byteSize: 0 };
500
- }
501
- /**
502
- * Gets the current row group's memory size.
503
- *
504
- * @description
505
- * Returns the estimated memory consumption of the unflushed row group.
506
- * Useful for monitoring memory usage during streaming writes.
507
- *
508
- * @returns Memory size in bytes
509
- *
510
- * @example
511
- * ```typescript
512
- * if (writer.currentRowGroupMemorySize() > 50 * 1024 * 1024) {
513
- * console.log('Row group using significant memory')
514
- * await writer.flushRowGroup()
515
- * }
516
- * ```
517
- */
518
- currentRowGroupMemorySize() {
519
- return this._currentRowGroup.byteSize;
520
- }
521
- /**
522
- * Gets the completed row groups.
523
- *
524
- * @description
525
- * Returns a copy of the completed row group metadata array.
526
- * Does not include the current unflushed row group.
527
- *
528
- * @returns Array of row group metadata
529
- *
530
- * @example
531
- * ```typescript
532
- * for (const rg of writer.getRowGroups()) {
533
- * console.log(`Row group: ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
534
- * }
535
- * ```
536
- */
537
- getRowGroups() {
538
- return [...this._rowGroups];
539
- }
540
- /**
541
- * Sets a custom key-value metadata entry.
542
- *
543
- * @description
544
- * Adds custom metadata that will be stored in the Parquet file footer.
545
- * Can be used for versioning, provenance, or application-specific data.
546
- *
547
- * @param key - Metadata key
548
- * @param value - Metadata value
549
- *
550
- * @example
551
- * ```typescript
552
- * writer.setMetadata('created_by', 'gitdo-analytics')
553
- * writer.setMetadata('schema_version', '2.0')
554
- * writer.setMetadata('repository', 'github.com/org/repo')
555
- * ```
556
- */
557
- setMetadata(key, value) {
558
- this._keyValueMetadata[key] = value;
559
- }
560
- /**
561
- * Generates the Parquet file as a buffer.
562
- *
563
- * @description
564
- * Finalizes the file by flushing any remaining rows and generating
565
- * the complete Parquet file structure including header, row groups,
566
- * and footer with metadata.
567
- *
568
- * @returns Promise resolving to the complete Parquet file as Uint8Array
569
- *
570
- * @example
571
- * ```typescript
572
- * const buffer = await writer.toBuffer()
573
- * await fs.writeFile('data.parquet', buffer)
574
- * ```
575
- */
576
- async toBuffer() {
577
- // Flush any remaining rows
578
- if (this._currentRowGroup.rows.length > 0) {
579
- await this.flushRowGroup();
580
- }
581
- return this._generateParquetBytes();
582
- }
583
- /**
584
- * Writes the Parquet file to an output stream.
585
- *
586
- * @description
587
- * Generates the file and writes it to the provided output stream.
588
- * Useful for streaming to files or network destinations.
589
- *
590
- * @param output - The output stream to write to
591
- * @returns Promise that resolves when writing is complete
592
- *
593
- * @example
594
- * ```typescript
595
- * const output = new FileOutputStream('data.parquet')
596
- * await writer.writeTo(output)
597
- * output.close()
598
- * ```
599
- */
600
- async writeTo(output) {
601
- const bytes = await this.toBuffer();
602
- output.write(bytes);
603
- }
604
- /**
605
- * Resets the writer to its initial state.
606
- *
607
- * @description
608
- * Clears all written data, row groups, and metadata. The schema
609
- * and options remain unchanged. Useful for writing multiple files
610
- * with the same configuration.
611
- *
612
- * @example
613
- * ```typescript
614
- * // Write first file
615
- * await writer.writeRows(batch1)
616
- * const file1 = await writer.toBuffer()
617
- *
618
- * // Reset and write second file
619
- * writer.reset()
620
- * await writer.writeRows(batch2)
621
- * const file2 = await writer.toBuffer()
622
- * ```
623
- */
624
- reset() {
625
- this._rowCount = 0;
626
- this._rowGroups = [];
627
- this._currentRowGroup = { rows: [], byteSize: 0 };
628
- this._isClosed = false;
629
- this._keyValueMetadata = {};
630
- this._createdAt = Date.now();
631
- }
632
- /**
633
- * Validates a row against the schema.
634
- *
635
- * @param row - The row to validate
636
- * @throws {ParquetError} If validation fails
637
- * @private
638
- */
639
- _validateRow(row) {
640
- for (const field of this.schema.fields) {
641
- const value = row[field.name];
642
- // Check required fields
643
- if (field.required && (value === undefined || value === null)) {
644
- throw new ParquetError(`Missing required field: ${field.name}`, 'MISSING_REQUIRED_FIELD');
645
- }
646
- // Check type if value is present and not null
647
- if (value !== null && value !== undefined) {
648
- if (!this._validateType(value, field.type)) {
649
- throw new ParquetError(`Invalid type for field ${field.name}: expected ${field.type}`, 'INVALID_FIELD_TYPE');
650
- }
651
- }
652
- }
653
- }
654
- /**
655
- * Validates a value matches the expected Parquet type.
656
- *
657
- * @param value - The value to validate
658
- * @param type - The expected Parquet type
659
- * @returns true if valid, false otherwise
660
- * @private
661
- */
662
- _validateType(value, type) {
663
- switch (type) {
664
- case ParquetFieldType.STRING:
665
- return typeof value === 'string';
666
- case ParquetFieldType.INT32:
667
- case ParquetFieldType.INT64:
668
- case ParquetFieldType.FLOAT:
669
- case ParquetFieldType.DOUBLE:
670
- case ParquetFieldType.TIMESTAMP_MILLIS:
671
- case ParquetFieldType.TIMESTAMP_MICROS:
672
- return typeof value === 'number';
673
- case ParquetFieldType.BOOLEAN:
674
- return typeof value === 'boolean';
675
- case ParquetFieldType.BINARY:
676
- return value instanceof Uint8Array || typeof value === 'string';
677
- default:
678
- return false;
679
- }
680
- }
681
- /**
682
- * Estimates the memory size of a row.
683
- *
684
- * @param row - The row to estimate
685
- * @returns Estimated size in bytes
686
- * @private
687
- */
688
- _estimateRowSize(row) {
689
- let size = 0;
690
- for (const field of this.schema.fields) {
691
- const value = row[field.name];
692
- if (value === null || value === undefined) {
693
- size += 1; // null marker
694
- }
695
- else if (typeof value === 'string') {
696
- size += value.length * 2; // UTF-16
697
- }
698
- else if (typeof value === 'number') {
699
- size += 8; // 64-bit
700
- }
701
- else if (typeof value === 'boolean') {
702
- size += 1;
703
- }
704
- else if (value instanceof Uint8Array) {
705
- size += value.length;
706
- }
707
- }
708
- return size;
709
- }
710
- /**
711
- * Builds a row group from internal representation.
712
- *
713
- * @param internal - The internal row group data
714
- * @returns The row group metadata
715
- * @private
716
- */
717
- _buildRowGroup(internal) {
718
- const columns = this.schema.fields.map(field => {
719
- const values = internal.rows.map(row => row[field.name]);
720
- const stats = this.options.enableStatistics ? this._computeStatistics(values, field.type) : undefined;
721
- const compression = this.options.columnCompression?.[field.name] ?? this.options.compression;
722
- return {
723
- column: field.name,
724
- type: field.type,
725
- compression,
726
- encodedSize: this._estimateEncodedSize(values, field.type, compression),
727
- uncompressedSize: this._estimateUncompressedSize(values, field.type),
728
- statistics: stats
729
- };
730
- });
731
- return {
732
- numRows: internal.rows.length,
733
- totalByteSize: columns.reduce((sum, col) => sum + col.encodedSize, 0),
734
- columns
735
- };
736
- }
737
- /**
738
- * Computes statistics for a column.
739
- *
740
- * @param values - The column values
741
- * @param type - The column type
742
- * @returns Column statistics
743
- * @private
744
- */
745
- _computeStatistics(values, type) {
746
- const nonNullValues = values.filter(v => v !== null && v !== undefined);
747
- const nullCount = values.length - nonNullValues.length;
748
- if (nonNullValues.length === 0) {
749
- return { nullCount };
750
- }
751
- switch (type) {
752
- case ParquetFieldType.INT32:
753
- case ParquetFieldType.INT64:
754
- case ParquetFieldType.FLOAT:
755
- case ParquetFieldType.DOUBLE:
756
- case ParquetFieldType.TIMESTAMP_MILLIS:
757
- case ParquetFieldType.TIMESTAMP_MICROS: {
758
- const numbers = nonNullValues.filter(v => typeof v === 'number' && !Number.isNaN(v));
759
- if (numbers.length === 0) {
760
- return { nullCount };
761
- }
762
- return {
763
- min: Math.min(...numbers),
764
- max: Math.max(...numbers),
765
- nullCount
766
- };
767
- }
768
- case ParquetFieldType.STRING: {
769
- const strings = nonNullValues;
770
- return {
771
- min: strings.reduce((a, b) => a < b ? a : b),
772
- max: strings.reduce((a, b) => a > b ? a : b),
773
- nullCount
774
- };
775
- }
776
- case ParquetFieldType.BOOLEAN: {
777
- return { nullCount };
778
- }
779
- default:
780
- return { nullCount };
781
- }
782
- }
783
- /**
784
- * Estimates the encoded size after compression.
785
- *
786
- * @param values - The column values
787
- * @param type - The column type
788
- * @param compression - The compression type
789
- * @returns Estimated compressed size in bytes
790
- * @private
791
- */
792
- _estimateEncodedSize(values, type, compression) {
793
- const uncompressedSize = this._estimateUncompressedSize(values, type);
794
- // Apply compression ratio estimate
795
- switch (compression) {
796
- case ParquetCompression.SNAPPY:
797
- return Math.floor(uncompressedSize * 0.5);
798
- case ParquetCompression.GZIP:
799
- return Math.floor(uncompressedSize * 0.3);
800
- case ParquetCompression.ZSTD:
801
- return Math.floor(uncompressedSize * 0.25);
802
- case ParquetCompression.LZ4:
803
- return Math.floor(uncompressedSize * 0.4);
804
- case ParquetCompression.UNCOMPRESSED:
805
- default:
806
- return uncompressedSize;
807
- }
808
- }
809
- /**
810
- * Estimates the uncompressed size of column values.
811
- *
812
- * @param values - The column values
813
- * @param type - The column type
814
- * @returns Estimated uncompressed size in bytes
815
- * @private
816
- */
817
- _estimateUncompressedSize(values, type) {
818
- let size = 0;
819
- for (const value of values) {
820
- if (value === null || value === undefined) {
821
- size += 1;
822
- }
823
- else {
824
- switch (type) {
825
- case ParquetFieldType.STRING:
826
- size += value.length * 2;
827
- break;
828
- case ParquetFieldType.INT32:
829
- case ParquetFieldType.FLOAT:
830
- size += 4;
831
- break;
832
- case ParquetFieldType.INT64:
833
- case ParquetFieldType.DOUBLE:
834
- case ParquetFieldType.TIMESTAMP_MILLIS:
835
- case ParquetFieldType.TIMESTAMP_MICROS:
836
- size += 8;
837
- break;
838
- case ParquetFieldType.BOOLEAN:
839
- size += 1;
840
- break;
841
- case ParquetFieldType.BINARY:
842
- size += value instanceof Uint8Array ? value.length : value.length;
843
- break;
844
- }
845
- }
846
- }
847
- return size;
848
- }
849
- /**
850
- * Generates the complete Parquet file bytes.
851
- *
852
- * @returns The complete Parquet file as Uint8Array
853
- * @private
854
- */
855
- _generateParquetBytes() {
856
- // Build all row data - will be populated from row groups in full implementation
857
- // For now, row group data is serialized directly below
858
- // Calculate metadata
859
- const metadata = {
860
- schema: this.schema,
861
- numRows: this._rowCount,
862
- rowGroups: this._rowGroups,
863
- compression: this.options.compression,
864
- columnCompression: this.options.columnCompression,
865
- keyValueMetadata: this._keyValueMetadata,
866
- createdAt: this._createdAt,
867
- sortedBy: this.options.sortBy,
868
- partitionColumns: this.options.partitionColumns
869
- };
870
- // Encode metadata to JSON and then to bytes
871
- const metadataJson = JSON.stringify(metadata);
872
- const metadataBytes = new TextEncoder().encode(metadataJson);
873
- // Compress metadata if needed
874
- let compressedMetadata;
875
- if (this.options.compression === ParquetCompression.GZIP) {
876
- compressedMetadata = pako.gzip(metadataBytes);
877
- }
878
- else {
879
- // For SNAPPY, ZSTD, LZ4 - we'll use a simple RLE-like compression simulation
880
- // In production, you'd use actual compression libraries
881
- compressedMetadata = this._simpleCompress(metadataBytes, this.options.compression);
882
- }
883
- // Build final file structure
884
- // PAR1 magic (4 bytes) + data + metadata length (4 bytes) + metadata + PAR1 magic (4 bytes)
885
- const magic = new TextEncoder().encode('PAR1');
886
- const metadataLength = new Uint8Array(4);
887
- new DataView(metadataLength.buffer).setUint32(0, compressedMetadata.length, true);
888
- // Calculate total size
889
- const totalSize = 4 + compressedMetadata.length + 4 + 4;
890
- const result = new Uint8Array(totalSize);
891
- // Write structure
892
- let offset = 0;
893
- result.set(magic, offset);
894
- offset += 4;
895
- result.set(compressedMetadata, offset);
896
- offset += compressedMetadata.length;
897
- result.set(metadataLength, offset);
898
- offset += 4;
899
- result.set(magic, offset);
900
- return result;
901
- }
902
- /**
903
- * Simple compression simulation for non-gzip formats.
904
- *
905
- * @param data - Data to compress
906
- * @param compression - Compression type
907
- * @returns Compressed data
908
- * @private
909
- */
910
- _simpleCompress(data, compression) {
911
- if (compression === ParquetCompression.UNCOMPRESSED) {
912
- return data;
913
- }
914
- // Use pako deflate for a basic compression simulation
915
- // Real implementation would use snappy-js, zstd-codec, lz4js etc.
916
- try {
917
- return pako.deflate(data, { level: compression === ParquetCompression.ZSTD ? 9 : 6 });
918
- }
919
- catch {
920
- return data;
921
- }
922
- }
923
- }
924
- // ============================================================================
925
- // Helper Functions
926
- // ============================================================================
927
- /**
928
- * Defines a Parquet schema.
929
- *
930
- * @description
931
- * Creates a validated Parquet schema from field definitions. Validates that:
932
- * - Schema has at least one field
933
- * - All field names are non-empty
934
- * - All field names are unique
935
- *
936
- * @param fields - Array of field definitions
937
- * @param metadata - Optional schema-level metadata
938
- * @returns Validated Parquet schema
939
- *
940
- * @throws {ParquetError} EMPTY_SCHEMA - If fields array is empty
941
- * @throws {ParquetError} EMPTY_FIELD_NAME - If any field name is empty
942
- * @throws {ParquetError} DUPLICATE_FIELD - If field names are not unique
943
- *
944
- * @example
945
- * ```typescript
946
- * const schema = defineSchema([
947
- * { name: 'id', type: ParquetFieldType.INT64, required: true },
948
- * { name: 'name', type: ParquetFieldType.STRING, required: true },
949
- * { name: 'age', type: ParquetFieldType.INT32, required: false },
950
- * { name: 'created_at', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
951
- * ], {
952
- * version: '1.0',
953
- * description: 'User records'
954
- * })
955
- * ```
956
- */
957
- export function defineSchema(fields, metadata) {
958
- // Validate schema
959
- if (fields.length === 0) {
960
- throw new ParquetError('Schema cannot be empty', 'EMPTY_SCHEMA');
961
- }
962
- const names = new Set();
963
- for (const field of fields) {
964
- if (!field.name || field.name.trim() === '') {
965
- throw new ParquetError('Field name cannot be empty', 'EMPTY_FIELD_NAME');
966
- }
967
- if (names.has(field.name)) {
968
- throw new ParquetError(`Duplicate field name: ${field.name}`, 'DUPLICATE_FIELD');
969
- }
970
- names.add(field.name);
971
- }
972
- return {
973
- fields: fields.map(f => ({
974
- name: f.name,
975
- type: f.type,
976
- required: f.required,
977
- metadata: f.metadata
978
- })),
979
- metadata
980
- };
981
- }
982
- /**
983
- * Creates a Parquet writer.
984
- *
985
- * @description
986
- * Factory function to create a ParquetWriter with the specified schema
987
- * and options. Equivalent to `new ParquetWriter(schema, options)`.
988
- *
989
- * @param schema - The Parquet schema
990
- * @param options - Writer options
991
- * @returns A new ParquetWriter instance
992
- *
993
- * @example
994
- * ```typescript
995
- * const writer = createParquetWriter(schema, {
996
- * rowGroupSize: 10000,
997
- * compression: ParquetCompression.SNAPPY
998
- * })
999
- * ```
1000
- */
1001
- export function createParquetWriter(schema, options = {}) {
1002
- return new ParquetWriter(schema, options);
1003
- }
1004
- /**
1005
- * Writes data directly to a Parquet file buffer.
1006
- *
1007
- * @description
1008
- * Convenience function that creates a writer, writes all rows, and returns
1009
- * the complete Parquet file. Useful for simple one-shot writes.
1010
- *
1011
- * @param schema - The Parquet schema
1012
- * @param rows - Array of rows to write
1013
- * @param options - Writer options
1014
- * @returns Promise resolving to the complete Parquet file as Uint8Array
1015
- *
1016
- * @example
1017
- * ```typescript
1018
- * const buffer = await writeParquetFile(schema, [
1019
- * { id: 1, name: 'Alice' },
1020
- * { id: 2, name: 'Bob' }
1021
- * ], {
1022
- * compression: ParquetCompression.GZIP
1023
- * })
1024
- *
1025
- * await fs.writeFile('data.parquet', buffer)
1026
- * ```
1027
- */
1028
- export async function writeParquetFile(schema, rows, options = {}) {
1029
- const writer = createParquetWriter(schema, options);
1030
- await writer.writeRows(rows);
1031
- return writer.toBuffer();
1032
- }
1033
- /**
1034
- * Closes a writer and returns the final buffer.
1035
- *
1036
- * @description
1037
- * Generates the final Parquet file buffer and marks the writer as closed.
1038
- * The writer cannot be used for further writes after calling this function.
1039
- *
1040
- * @param writer - The ParquetWriter to close
1041
- * @returns Promise resolving to the complete Parquet file as Uint8Array
1042
- *
1043
- * @example
1044
- * ```typescript
1045
- * await writer.writeRows(data)
1046
- * const buffer = await closeWriter(writer)
1047
- * console.log(writer.isClosed) // true
1048
- * ```
1049
- */
1050
- export async function closeWriter(writer) {
1051
- const bytes = await writer.toBuffer();
1052
- writer._isClosed = true;
1053
- return bytes;
1054
- }
1055
- /**
1056
- * Adds a row group to the writer.
1057
- *
1058
- * @description
1059
- * Writes multiple rows and then flushes them as a single row group.
1060
- * Useful when you want explicit control over row group boundaries.
1061
- *
1062
- * @param writer - The ParquetWriter to use
1063
- * @param rows - Array of rows for this row group
1064
- * @returns Promise that resolves when the row group is written
1065
- *
1066
- * @example
1067
- * ```typescript
1068
- * // Add explicit row groups
1069
- * await addRowGroup(writer, batch1) // First row group
1070
- * await addRowGroup(writer, batch2) // Second row group
1071
- * ```
1072
- */
1073
- export async function addRowGroup(writer, rows) {
1074
- await writer.writeRows(rows);
1075
- await writer.flushRowGroup();
1076
- }
1077
- /**
1078
- * Gets metadata from a Parquet file buffer.
1079
- *
1080
- * @description
1081
- * Parses a Parquet file buffer and extracts the metadata including
1082
- * schema, row groups, compression settings, and custom metadata.
1083
- *
1084
- * @param bytes - The Parquet file buffer
1085
- * @returns The parsed metadata
1086
- *
1087
- * @throws {ParquetError} INVALID_MAGIC - If file doesn't have valid Parquet magic bytes
1088
- *
1089
- * @example
1090
- * ```typescript
1091
- * const buffer = await fs.readFile('data.parquet')
1092
- * const metadata = getMetadata(buffer)
1093
- *
1094
- * console.log(`Rows: ${metadata.numRows}`)
1095
- * console.log(`Schema: ${metadata.schema.fields.map(f => f.name).join(', ')}`)
1096
- * console.log(`Row groups: ${metadata.rowGroups.length}`)
1097
- *
1098
- * for (const rg of metadata.rowGroups) {
1099
- * console.log(` - ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
1100
- * }
1101
- * ```
1102
- */
1103
- export function getMetadata(bytes) {
1104
- // Verify magic bytes
1105
- const startMagic = new TextDecoder().decode(bytes.slice(0, 4));
1106
- const endMagic = new TextDecoder().decode(bytes.slice(-4));
1107
- if (startMagic !== 'PAR1' || endMagic !== 'PAR1') {
1108
- throw new ParquetError('Invalid Parquet file: missing magic bytes', 'INVALID_MAGIC');
1109
- }
1110
- // Read metadata length (4 bytes before final magic)
1111
- const metadataLengthOffset = bytes.length - 8;
1112
- const metadataLength = new DataView(bytes.buffer, bytes.byteOffset + metadataLengthOffset, 4).getUint32(0, true);
1113
- // Read compressed metadata
1114
- const metadataStart = 4;
1115
- const compressedMetadata = bytes.slice(metadataStart, metadataStart + metadataLength);
1116
- // Decompress metadata
1117
- let metadataBytes;
1118
- try {
1119
- // Try gzip first
1120
- metadataBytes = pako.ungzip(compressedMetadata);
1121
- }
1122
- catch {
1123
- try {
1124
- // Try inflate (deflate)
1125
- metadataBytes = pako.inflate(compressedMetadata);
1126
- }
1127
- catch {
1128
- // Assume uncompressed
1129
- metadataBytes = compressedMetadata;
1130
- }
1131
- }
1132
- // Parse metadata JSON
1133
- const metadataJson = new TextDecoder().decode(metadataBytes);
1134
- const internal = JSON.parse(metadataJson);
1135
- // Build column metadata map
1136
- const columnMetadata = {};
1137
- if (internal.columnCompression) {
1138
- for (const [col, comp] of Object.entries(internal.columnCompression)) {
1139
- columnMetadata[col] = { compression: comp };
1140
- }
1141
- }
1142
- return {
1143
- schema: internal.schema,
1144
- numRows: internal.numRows,
1145
- rowGroups: internal.rowGroups,
1146
- compression: internal.compression,
1147
- columnMetadata: Object.keys(columnMetadata).length > 0 ? columnMetadata : undefined,
1148
- keyValueMetadata: Object.keys(internal.keyValueMetadata).length > 0 ? internal.keyValueMetadata : undefined,
1149
- createdAt: internal.createdAt,
1150
- fileSize: bytes.length,
1151
- sortedBy: internal.sortedBy,
1152
- partitionColumns: internal.partitionColumns
1153
- };
1154
- }
1155
- /**
1156
- * Sets the compression type for a writer.
1157
- *
1158
- * @description
1159
- * Updates the default compression algorithm for a writer. Affects all
1160
- * subsequently written data. Columns with explicit compression settings
1161
- * in columnCompression are not affected.
1162
- *
1163
- * @param writer - The ParquetWriter to update
1164
- * @param compression - The new compression type
1165
- *
1166
- * @example
1167
- * ```typescript
1168
- * const writer = createParquetWriter(schema)
1169
- *
1170
- * // Write some rows with SNAPPY (default)
1171
- * await writer.writeRows(batch1)
1172
- * await writer.flushRowGroup()
1173
- *
1174
- * // Switch to GZIP for remaining data
1175
- * setCompression(writer, ParquetCompression.GZIP)
1176
- * await writer.writeRows(batch2)
1177
- * ```
1178
- */
1179
- export function setCompression(writer, compression) {
1180
- ;
1181
- writer.options.compression = compression;
1182
- }
1183
- //# sourceMappingURL=parquet-writer.js.map