gitx.do 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (356)
  1. package/README.md +40 -353
  2. package/dist/do/logger.d.ts +50 -0
  3. package/dist/do/logger.d.ts.map +1 -0
  4. package/dist/do/logger.js +122 -0
  5. package/dist/do/logger.js.map +1 -0
  6. package/dist/{durable-object → do}/schema.d.ts +3 -3
  7. package/dist/do/schema.d.ts.map +1 -0
  8. package/dist/{durable-object → do}/schema.js +4 -3
  9. package/dist/do/schema.js.map +1 -0
  10. package/dist/do/types.d.ts +267 -0
  11. package/dist/do/types.d.ts.map +1 -0
  12. package/dist/do/types.js +62 -0
  13. package/dist/do/types.js.map +1 -0
  14. package/dist/index.d.ts +14 -469
  15. package/dist/index.d.ts.map +1 -1
  16. package/dist/index.js +31 -483
  17. package/dist/index.js.map +1 -1
  18. package/package.json +13 -21
  19. package/dist/cli/commands/add.d.ts +0 -176
  20. package/dist/cli/commands/add.d.ts.map +0 -1
  21. package/dist/cli/commands/add.js +0 -979
  22. package/dist/cli/commands/add.js.map +0 -1
  23. package/dist/cli/commands/blame.d.ts +0 -259
  24. package/dist/cli/commands/blame.d.ts.map +0 -1
  25. package/dist/cli/commands/blame.js +0 -609
  26. package/dist/cli/commands/blame.js.map +0 -1
  27. package/dist/cli/commands/branch.d.ts +0 -249
  28. package/dist/cli/commands/branch.d.ts.map +0 -1
  29. package/dist/cli/commands/branch.js +0 -693
  30. package/dist/cli/commands/branch.js.map +0 -1
  31. package/dist/cli/commands/checkout.d.ts +0 -73
  32. package/dist/cli/commands/checkout.d.ts.map +0 -1
  33. package/dist/cli/commands/checkout.js +0 -725
  34. package/dist/cli/commands/checkout.js.map +0 -1
  35. package/dist/cli/commands/commit.d.ts +0 -182
  36. package/dist/cli/commands/commit.d.ts.map +0 -1
  37. package/dist/cli/commands/commit.js +0 -457
  38. package/dist/cli/commands/commit.js.map +0 -1
  39. package/dist/cli/commands/diff.d.ts +0 -464
  40. package/dist/cli/commands/diff.d.ts.map +0 -1
  41. package/dist/cli/commands/diff.js +0 -959
  42. package/dist/cli/commands/diff.js.map +0 -1
  43. package/dist/cli/commands/log.d.ts +0 -239
  44. package/dist/cli/commands/log.d.ts.map +0 -1
  45. package/dist/cli/commands/log.js +0 -535
  46. package/dist/cli/commands/log.js.map +0 -1
  47. package/dist/cli/commands/merge.d.ts +0 -106
  48. package/dist/cli/commands/merge.d.ts.map +0 -1
  49. package/dist/cli/commands/merge.js +0 -852
  50. package/dist/cli/commands/merge.js.map +0 -1
  51. package/dist/cli/commands/review.d.ts +0 -457
  52. package/dist/cli/commands/review.d.ts.map +0 -1
  53. package/dist/cli/commands/review.js +0 -558
  54. package/dist/cli/commands/review.js.map +0 -1
  55. package/dist/cli/commands/stash.d.ts +0 -157
  56. package/dist/cli/commands/stash.d.ts.map +0 -1
  57. package/dist/cli/commands/stash.js +0 -655
  58. package/dist/cli/commands/stash.js.map +0 -1
  59. package/dist/cli/commands/status.d.ts +0 -269
  60. package/dist/cli/commands/status.d.ts.map +0 -1
  61. package/dist/cli/commands/status.js +0 -492
  62. package/dist/cli/commands/status.js.map +0 -1
  63. package/dist/cli/commands/web.d.ts +0 -199
  64. package/dist/cli/commands/web.d.ts.map +0 -1
  65. package/dist/cli/commands/web.js +0 -697
  66. package/dist/cli/commands/web.js.map +0 -1
  67. package/dist/cli/fs-adapter.d.ts +0 -656
  68. package/dist/cli/fs-adapter.d.ts.map +0 -1
  69. package/dist/cli/fs-adapter.js +0 -1177
  70. package/dist/cli/fs-adapter.js.map +0 -1
  71. package/dist/cli/fsx-cli-adapter.d.ts +0 -359
  72. package/dist/cli/fsx-cli-adapter.d.ts.map +0 -1
  73. package/dist/cli/fsx-cli-adapter.js +0 -619
  74. package/dist/cli/fsx-cli-adapter.js.map +0 -1
  75. package/dist/cli/index.d.ts +0 -387
  76. package/dist/cli/index.d.ts.map +0 -1
  77. package/dist/cli/index.js +0 -579
  78. package/dist/cli/index.js.map +0 -1
  79. package/dist/cli/ui/components/DiffView.d.ts +0 -12
  80. package/dist/cli/ui/components/DiffView.d.ts.map +0 -1
  81. package/dist/cli/ui/components/DiffView.js +0 -11
  82. package/dist/cli/ui/components/DiffView.js.map +0 -1
  83. package/dist/cli/ui/components/ErrorDisplay.d.ts +0 -10
  84. package/dist/cli/ui/components/ErrorDisplay.d.ts.map +0 -1
  85. package/dist/cli/ui/components/ErrorDisplay.js +0 -11
  86. package/dist/cli/ui/components/ErrorDisplay.js.map +0 -1
  87. package/dist/cli/ui/components/FuzzySearch.d.ts +0 -15
  88. package/dist/cli/ui/components/FuzzySearch.d.ts.map +0 -1
  89. package/dist/cli/ui/components/FuzzySearch.js +0 -12
  90. package/dist/cli/ui/components/FuzzySearch.js.map +0 -1
  91. package/dist/cli/ui/components/LoadingSpinner.d.ts +0 -10
  92. package/dist/cli/ui/components/LoadingSpinner.d.ts.map +0 -1
  93. package/dist/cli/ui/components/LoadingSpinner.js +0 -10
  94. package/dist/cli/ui/components/LoadingSpinner.js.map +0 -1
  95. package/dist/cli/ui/components/NavigationList.d.ts +0 -14
  96. package/dist/cli/ui/components/NavigationList.d.ts.map +0 -1
  97. package/dist/cli/ui/components/NavigationList.js +0 -11
  98. package/dist/cli/ui/components/NavigationList.js.map +0 -1
  99. package/dist/cli/ui/components/ScrollableContent.d.ts +0 -13
  100. package/dist/cli/ui/components/ScrollableContent.d.ts.map +0 -1
  101. package/dist/cli/ui/components/ScrollableContent.js +0 -11
  102. package/dist/cli/ui/components/ScrollableContent.js.map +0 -1
  103. package/dist/cli/ui/components/index.d.ts +0 -7
  104. package/dist/cli/ui/components/index.d.ts.map +0 -1
  105. package/dist/cli/ui/components/index.js +0 -9
  106. package/dist/cli/ui/components/index.js.map +0 -1
  107. package/dist/cli/ui/terminal-ui.d.ts +0 -85
  108. package/dist/cli/ui/terminal-ui.d.ts.map +0 -1
  109. package/dist/cli/ui/terminal-ui.js +0 -121
  110. package/dist/cli/ui/terminal-ui.js.map +0 -1
  111. package/dist/do/BashModule.d.ts +0 -871
  112. package/dist/do/BashModule.d.ts.map +0 -1
  113. package/dist/do/BashModule.js +0 -1143
  114. package/dist/do/BashModule.js.map +0 -1
  115. package/dist/do/FsModule.d.ts +0 -612
  116. package/dist/do/FsModule.d.ts.map +0 -1
  117. package/dist/do/FsModule.js +0 -1120
  118. package/dist/do/FsModule.js.map +0 -1
  119. package/dist/do/GitModule.d.ts +0 -635
  120. package/dist/do/GitModule.d.ts.map +0 -1
  121. package/dist/do/GitModule.js +0 -784
  122. package/dist/do/GitModule.js.map +0 -1
  123. package/dist/do/GitRepoDO.d.ts +0 -281
  124. package/dist/do/GitRepoDO.d.ts.map +0 -1
  125. package/dist/do/GitRepoDO.js +0 -479
  126. package/dist/do/GitRepoDO.js.map +0 -1
  127. package/dist/do/bash-ast.d.ts +0 -246
  128. package/dist/do/bash-ast.d.ts.map +0 -1
  129. package/dist/do/bash-ast.js +0 -888
  130. package/dist/do/bash-ast.js.map +0 -1
  131. package/dist/do/container-executor.d.ts +0 -491
  132. package/dist/do/container-executor.d.ts.map +0 -1
  133. package/dist/do/container-executor.js +0 -731
  134. package/dist/do/container-executor.js.map +0 -1
  135. package/dist/do/index.d.ts +0 -53
  136. package/dist/do/index.d.ts.map +0 -1
  137. package/dist/do/index.js +0 -91
  138. package/dist/do/index.js.map +0 -1
  139. package/dist/do/tiered-storage.d.ts +0 -403
  140. package/dist/do/tiered-storage.d.ts.map +0 -1
  141. package/dist/do/tiered-storage.js +0 -689
  142. package/dist/do/tiered-storage.js.map +0 -1
  143. package/dist/do/withBash.d.ts +0 -231
  144. package/dist/do/withBash.d.ts.map +0 -1
  145. package/dist/do/withBash.js +0 -244
  146. package/dist/do/withBash.js.map +0 -1
  147. package/dist/do/withFs.d.ts +0 -237
  148. package/dist/do/withFs.d.ts.map +0 -1
  149. package/dist/do/withFs.js +0 -387
  150. package/dist/do/withFs.js.map +0 -1
  151. package/dist/do/withGit.d.ts +0 -180
  152. package/dist/do/withGit.d.ts.map +0 -1
  153. package/dist/do/withGit.js +0 -271
  154. package/dist/do/withGit.js.map +0 -1
  155. package/dist/durable-object/object-store.d.ts +0 -633
  156. package/dist/durable-object/object-store.d.ts.map +0 -1
  157. package/dist/durable-object/object-store.js +0 -1164
  158. package/dist/durable-object/object-store.js.map +0 -1
  159. package/dist/durable-object/schema.d.ts.map +0 -1
  160. package/dist/durable-object/schema.js.map +0 -1
  161. package/dist/durable-object/wal.d.ts +0 -416
  162. package/dist/durable-object/wal.d.ts.map +0 -1
  163. package/dist/durable-object/wal.js +0 -445
  164. package/dist/durable-object/wal.js.map +0 -1
  165. package/dist/mcp/adapter.d.ts +0 -772
  166. package/dist/mcp/adapter.d.ts.map +0 -1
  167. package/dist/mcp/adapter.js +0 -895
  168. package/dist/mcp/adapter.js.map +0 -1
  169. package/dist/mcp/sandbox/miniflare-evaluator.d.ts +0 -22
  170. package/dist/mcp/sandbox/miniflare-evaluator.d.ts.map +0 -1
  171. package/dist/mcp/sandbox/miniflare-evaluator.js +0 -140
  172. package/dist/mcp/sandbox/miniflare-evaluator.js.map +0 -1
  173. package/dist/mcp/sandbox/object-store-proxy.d.ts +0 -32
  174. package/dist/mcp/sandbox/object-store-proxy.d.ts.map +0 -1
  175. package/dist/mcp/sandbox/object-store-proxy.js +0 -30
  176. package/dist/mcp/sandbox/object-store-proxy.js.map +0 -1
  177. package/dist/mcp/sandbox/template.d.ts +0 -17
  178. package/dist/mcp/sandbox/template.d.ts.map +0 -1
  179. package/dist/mcp/sandbox/template.js +0 -71
  180. package/dist/mcp/sandbox/template.js.map +0 -1
  181. package/dist/mcp/sandbox.d.ts +0 -764
  182. package/dist/mcp/sandbox.d.ts.map +0 -1
  183. package/dist/mcp/sandbox.js +0 -1362
  184. package/dist/mcp/sandbox.js.map +0 -1
  185. package/dist/mcp/sdk-adapter.d.ts +0 -835
  186. package/dist/mcp/sdk-adapter.d.ts.map +0 -1
  187. package/dist/mcp/sdk-adapter.js +0 -974
  188. package/dist/mcp/sdk-adapter.js.map +0 -1
  189. package/dist/mcp/tools/do.d.ts +0 -32
  190. package/dist/mcp/tools/do.d.ts.map +0 -1
  191. package/dist/mcp/tools/do.js +0 -117
  192. package/dist/mcp/tools/do.js.map +0 -1
  193. package/dist/mcp/tools.d.ts +0 -548
  194. package/dist/mcp/tools.d.ts.map +0 -1
  195. package/dist/mcp/tools.js +0 -3170
  196. package/dist/mcp/tools.js.map +0 -1
  197. package/dist/ops/blame.d.ts +0 -551
  198. package/dist/ops/blame.d.ts.map +0 -1
  199. package/dist/ops/blame.js +0 -1037
  200. package/dist/ops/blame.js.map +0 -1
  201. package/dist/ops/branch.d.ts +0 -766
  202. package/dist/ops/branch.d.ts.map +0 -1
  203. package/dist/ops/branch.js +0 -950
  204. package/dist/ops/branch.js.map +0 -1
  205. package/dist/ops/commit-traversal.d.ts +0 -349
  206. package/dist/ops/commit-traversal.d.ts.map +0 -1
  207. package/dist/ops/commit-traversal.js +0 -821
  208. package/dist/ops/commit-traversal.js.map +0 -1
  209. package/dist/ops/commit.d.ts +0 -555
  210. package/dist/ops/commit.d.ts.map +0 -1
  211. package/dist/ops/commit.js +0 -826
  212. package/dist/ops/commit.js.map +0 -1
  213. package/dist/ops/merge-base.d.ts +0 -397
  214. package/dist/ops/merge-base.d.ts.map +0 -1
  215. package/dist/ops/merge-base.js +0 -691
  216. package/dist/ops/merge-base.js.map +0 -1
  217. package/dist/ops/merge.d.ts +0 -855
  218. package/dist/ops/merge.d.ts.map +0 -1
  219. package/dist/ops/merge.js +0 -1551
  220. package/dist/ops/merge.js.map +0 -1
  221. package/dist/ops/tag.d.ts +0 -247
  222. package/dist/ops/tag.d.ts.map +0 -1
  223. package/dist/ops/tag.js +0 -649
  224. package/dist/ops/tag.js.map +0 -1
  225. package/dist/ops/tree-builder.d.ts +0 -178
  226. package/dist/ops/tree-builder.d.ts.map +0 -1
  227. package/dist/ops/tree-builder.js +0 -271
  228. package/dist/ops/tree-builder.js.map +0 -1
  229. package/dist/ops/tree-diff.d.ts +0 -291
  230. package/dist/ops/tree-diff.d.ts.map +0 -1
  231. package/dist/ops/tree-diff.js +0 -705
  232. package/dist/ops/tree-diff.js.map +0 -1
  233. package/dist/pack/delta.d.ts +0 -248
  234. package/dist/pack/delta.d.ts.map +0 -1
  235. package/dist/pack/delta.js +0 -740
  236. package/dist/pack/delta.js.map +0 -1
  237. package/dist/pack/format.d.ts +0 -446
  238. package/dist/pack/format.d.ts.map +0 -1
  239. package/dist/pack/format.js +0 -572
  240. package/dist/pack/format.js.map +0 -1
  241. package/dist/pack/full-generation.d.ts +0 -612
  242. package/dist/pack/full-generation.d.ts.map +0 -1
  243. package/dist/pack/full-generation.js +0 -1378
  244. package/dist/pack/full-generation.js.map +0 -1
  245. package/dist/pack/generation.d.ts +0 -441
  246. package/dist/pack/generation.d.ts.map +0 -1
  247. package/dist/pack/generation.js +0 -707
  248. package/dist/pack/generation.js.map +0 -1
  249. package/dist/pack/index.d.ts +0 -502
  250. package/dist/pack/index.d.ts.map +0 -1
  251. package/dist/pack/index.js +0 -833
  252. package/dist/pack/index.js.map +0 -1
  253. package/dist/refs/branch.d.ts +0 -683
  254. package/dist/refs/branch.d.ts.map +0 -1
  255. package/dist/refs/branch.js +0 -881
  256. package/dist/refs/branch.js.map +0 -1
  257. package/dist/refs/storage.d.ts +0 -833
  258. package/dist/refs/storage.d.ts.map +0 -1
  259. package/dist/refs/storage.js +0 -1023
  260. package/dist/refs/storage.js.map +0 -1
  261. package/dist/refs/tag.d.ts +0 -860
  262. package/dist/refs/tag.d.ts.map +0 -1
  263. package/dist/refs/tag.js +0 -996
  264. package/dist/refs/tag.js.map +0 -1
  265. package/dist/storage/backend.d.ts +0 -425
  266. package/dist/storage/backend.d.ts.map +0 -1
  267. package/dist/storage/backend.js +0 -41
  268. package/dist/storage/backend.js.map +0 -1
  269. package/dist/storage/fsx-adapter.d.ts +0 -204
  270. package/dist/storage/fsx-adapter.d.ts.map +0 -1
  271. package/dist/storage/fsx-adapter.js +0 -518
  272. package/dist/storage/fsx-adapter.js.map +0 -1
  273. package/dist/storage/lru-cache.d.ts +0 -691
  274. package/dist/storage/lru-cache.d.ts.map +0 -1
  275. package/dist/storage/lru-cache.js +0 -813
  276. package/dist/storage/lru-cache.js.map +0 -1
  277. package/dist/storage/object-index.d.ts +0 -585
  278. package/dist/storage/object-index.d.ts.map +0 -1
  279. package/dist/storage/object-index.js +0 -532
  280. package/dist/storage/object-index.js.map +0 -1
  281. package/dist/storage/r2-pack.d.ts +0 -1257
  282. package/dist/storage/r2-pack.d.ts.map +0 -1
  283. package/dist/storage/r2-pack.js +0 -1773
  284. package/dist/storage/r2-pack.js.map +0 -1
  285. package/dist/tiered/cdc-pipeline.d.ts +0 -1888
  286. package/dist/tiered/cdc-pipeline.d.ts.map +0 -1
  287. package/dist/tiered/cdc-pipeline.js +0 -1880
  288. package/dist/tiered/cdc-pipeline.js.map +0 -1
  289. package/dist/tiered/migration.d.ts +0 -1104
  290. package/dist/tiered/migration.d.ts.map +0 -1
  291. package/dist/tiered/migration.js +0 -1217
  292. package/dist/tiered/migration.js.map +0 -1
  293. package/dist/tiered/parquet-writer.d.ts +0 -1145
  294. package/dist/tiered/parquet-writer.d.ts.map +0 -1
  295. package/dist/tiered/parquet-writer.js +0 -1183
  296. package/dist/tiered/parquet-writer.js.map +0 -1
  297. package/dist/tiered/read-path.d.ts +0 -835
  298. package/dist/tiered/read-path.d.ts.map +0 -1
  299. package/dist/tiered/read-path.js +0 -487
  300. package/dist/tiered/read-path.js.map +0 -1
  301. package/dist/types/capability.d.ts +0 -1385
  302. package/dist/types/capability.d.ts.map +0 -1
  303. package/dist/types/capability.js +0 -36
  304. package/dist/types/capability.js.map +0 -1
  305. package/dist/types/index.d.ts +0 -13
  306. package/dist/types/index.d.ts.map +0 -1
  307. package/dist/types/index.js +0 -18
  308. package/dist/types/index.js.map +0 -1
  309. package/dist/types/interfaces.d.ts +0 -673
  310. package/dist/types/interfaces.d.ts.map +0 -1
  311. package/dist/types/interfaces.js +0 -26
  312. package/dist/types/interfaces.js.map +0 -1
  313. package/dist/types/objects.d.ts +0 -692
  314. package/dist/types/objects.d.ts.map +0 -1
  315. package/dist/types/objects.js +0 -837
  316. package/dist/types/objects.js.map +0 -1
  317. package/dist/types/storage.d.ts +0 -603
  318. package/dist/types/storage.d.ts.map +0 -1
  319. package/dist/types/storage.js +0 -191
  320. package/dist/types/storage.js.map +0 -1
  321. package/dist/types/worker-loader.d.ts +0 -60
  322. package/dist/types/worker-loader.d.ts.map +0 -1
  323. package/dist/types/worker-loader.js +0 -62
  324. package/dist/types/worker-loader.js.map +0 -1
  325. package/dist/utils/hash.d.ts +0 -198
  326. package/dist/utils/hash.d.ts.map +0 -1
  327. package/dist/utils/hash.js +0 -272
  328. package/dist/utils/hash.js.map +0 -1
  329. package/dist/utils/sha1.d.ts +0 -325
  330. package/dist/utils/sha1.d.ts.map +0 -1
  331. package/dist/utils/sha1.js +0 -635
  332. package/dist/utils/sha1.js.map +0 -1
  333. package/dist/wire/capabilities.d.ts +0 -1044
  334. package/dist/wire/capabilities.d.ts.map +0 -1
  335. package/dist/wire/capabilities.js +0 -941
  336. package/dist/wire/capabilities.js.map +0 -1
  337. package/dist/wire/path-security.d.ts +0 -157
  338. package/dist/wire/path-security.d.ts.map +0 -1
  339. package/dist/wire/path-security.js +0 -307
  340. package/dist/wire/path-security.js.map +0 -1
  341. package/dist/wire/pkt-line.d.ts +0 -345
  342. package/dist/wire/pkt-line.d.ts.map +0 -1
  343. package/dist/wire/pkt-line.js +0 -381
  344. package/dist/wire/pkt-line.js.map +0 -1
  345. package/dist/wire/receive-pack.d.ts +0 -1059
  346. package/dist/wire/receive-pack.d.ts.map +0 -1
  347. package/dist/wire/receive-pack.js +0 -1414
  348. package/dist/wire/receive-pack.js.map +0 -1
  349. package/dist/wire/smart-http.d.ts +0 -799
  350. package/dist/wire/smart-http.d.ts.map +0 -1
  351. package/dist/wire/smart-http.js +0 -945
  352. package/dist/wire/smart-http.js.map +0 -1
  353. package/dist/wire/upload-pack.d.ts +0 -727
  354. package/dist/wire/upload-pack.d.ts.map +0 -1
  355. package/dist/wire/upload-pack.js +0 -1141
  356. package/dist/wire/upload-pack.js.map +0 -1
package/dist/tiered/parquet-writer.js (deleted)
@@ -1,1183 +0,0 @@
1
- /**
2
- * @fileoverview Parquet Writer for Git Analytics
3
- *
4
- * @description
5
- * Provides functionality to write git analytics data to Parquet format, a
6
- * columnar storage format optimized for analytical queries. This module
7
- * enables efficient storage and querying of Git repository data.
8
- *
9
- * **Key Features:**
10
- * - Schema definition with various field types (STRING, INT32, INT64, etc.)
11
- * - Multiple compression algorithms (SNAPPY, GZIP, ZSTD, LZ4, UNCOMPRESSED)
12
- * - Row group management for efficient columnar storage
13
- * - Automatic and manual row group flushing
14
- * - Column-level statistics generation (min, max, null count)
15
- * - Custom key-value metadata support
16
- * - Memory-efficient streaming writes
17
- *
18
- * **Parquet Format:**
19
- * The generated files follow the Parquet format with:
20
- * - Magic bytes "PAR1" at start and end
21
- * - Row group data organized by columns
22
- * - Footer metadata containing schema and statistics
23
- *
24
- * @example
25
- * ```typescript
26
- * // Define schema for commit analytics
27
- * const schema = defineSchema([
28
- * { name: 'commit_sha', type: ParquetFieldType.STRING, required: true },
29
- * { name: 'author', type: ParquetFieldType.STRING, required: true },
30
- * { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true },
31
- * { name: 'file_count', type: ParquetFieldType.INT32, required: false }
32
- * ])
33
- *
34
- * // Create writer with options
35
- * const writer = createParquetWriter(schema, {
36
- * rowGroupSize: 10000,
37
- * compression: ParquetCompression.SNAPPY,
38
- * enableStatistics: true
39
- * })
40
- *
41
- * // Write data
42
- * await writer.writeRows([
43
- * { commit_sha: 'abc123...', author: 'alice', timestamp: Date.now(), file_count: 5 },
44
- * { commit_sha: 'def456...', author: 'bob', timestamp: Date.now(), file_count: 3 }
45
- * ])
46
- *
47
- * // Generate the Parquet file
48
- * const buffer = await writer.toBuffer()
49
- * ```
50
- *
51
- * @module tiered/parquet-writer
52
- * @see {@link ParquetWriter} - Main writer class
53
- * @see {@link defineSchema} - Schema definition helper
54
- */
55
- import pako from 'pako';
56
- // ============================================================================
57
- // Types and Enums
58
- // ============================================================================
59
- /**
60
- * Supported Parquet field types.
61
- *
62
- * @description
63
- * Defines the data types that can be used for fields in a Parquet schema.
64
- * Each type maps to an appropriate physical and logical Parquet type.
65
- *
66
- * @example
67
- * ```typescript
68
- * const field: ParquetField = {
69
- * name: 'count',
70
- * type: ParquetFieldType.INT64,
71
- * required: true
72
- * }
73
- * ```
74
- *
75
- * @enum {string}
76
- */
77
- export var ParquetFieldType;
78
- (function (ParquetFieldType) {
79
- /**
80
- * UTF-8 encoded string.
81
- * Maps to Parquet BYTE_ARRAY with UTF8 logical type.
82
- */
83
- ParquetFieldType["STRING"] = "STRING";
84
- /**
85
- * 32-bit signed integer.
86
- * Maps to Parquet INT32 physical type.
87
- */
88
- ParquetFieldType["INT32"] = "INT32";
89
- /**
90
- * 64-bit signed integer.
91
- * Maps to Parquet INT64 physical type.
92
- */
93
- ParquetFieldType["INT64"] = "INT64";
94
- /**
95
- * Boolean value (true/false).
96
- * Maps to Parquet BOOLEAN physical type.
97
- */
98
- ParquetFieldType["BOOLEAN"] = "BOOLEAN";
99
- /**
100
- * 32-bit IEEE 754 floating point.
101
- * Maps to Parquet FLOAT physical type.
102
- */
103
- ParquetFieldType["FLOAT"] = "FLOAT";
104
- /**
105
- * 64-bit IEEE 754 floating point.
106
- * Maps to Parquet DOUBLE physical type.
107
- */
108
- ParquetFieldType["DOUBLE"] = "DOUBLE";
109
- /**
110
- * Raw binary data.
111
- * Maps to Parquet BYTE_ARRAY physical type.
112
- */
113
- ParquetFieldType["BINARY"] = "BINARY";
114
- /**
115
- * Timestamp with millisecond precision.
116
- * Maps to Parquet INT64 with TIMESTAMP_MILLIS logical type.
117
- */
118
- ParquetFieldType["TIMESTAMP_MILLIS"] = "TIMESTAMP_MILLIS";
119
- /**
120
- * Timestamp with microsecond precision.
121
- * Maps to Parquet INT64 with TIMESTAMP_MICROS logical type.
122
- */
123
- ParquetFieldType["TIMESTAMP_MICROS"] = "TIMESTAMP_MICROS";
124
- })(ParquetFieldType || (ParquetFieldType = {}));
125
- /**
126
- * Supported compression types for Parquet data.
127
- *
128
- * @description
129
- * Different compression algorithms offer trade-offs between compression
130
- * ratio, compression speed, and decompression speed.
131
- *
132
- * **Comparison:**
133
- * - SNAPPY: Fast compression/decompression, moderate ratio (default)
134
- * - GZIP: Higher ratio, slower compression, fast decompression
135
- * - ZSTD: Best ratio, good speed, requires more memory
136
- * - LZ4: Fastest, lower ratio
137
- * - UNCOMPRESSED: No compression overhead
138
- *
139
- * @example
140
- * ```typescript
141
- * const writer = createParquetWriter(schema, {
142
- * compression: ParquetCompression.ZSTD
143
- * })
144
- * ```
145
- *
146
- * @enum {string}
147
- */
148
- export var ParquetCompression;
149
- (function (ParquetCompression) {
150
- /**
151
- * No compression applied.
152
- * Fastest writes, largest file size.
153
- */
154
- ParquetCompression["UNCOMPRESSED"] = "UNCOMPRESSED";
155
- /**
156
- * Snappy compression (default).
157
- * Good balance of speed and compression ratio.
158
- */
159
- ParquetCompression["SNAPPY"] = "SNAPPY";
160
- /**
161
- * GZIP compression.
162
- * Higher compression ratio, slower compression.
163
- */
164
- ParquetCompression["GZIP"] = "GZIP";
165
- /**
166
- * Zstandard compression.
167
- * Best compression ratio with good speed.
168
- */
169
- ParquetCompression["ZSTD"] = "ZSTD";
170
- /**
171
- * LZ4 compression.
172
- * Fastest compression, lower ratio.
173
- */
174
- ParquetCompression["LZ4"] = "LZ4";
175
- })(ParquetCompression || (ParquetCompression = {}));
176
- /**
177
- * Error class for Parquet-related operations.
178
- *
179
- * @description
180
- * Thrown when Parquet operations fail, such as schema validation errors,
181
- * invalid data types, or malformed files.
182
- *
183
- * @example
184
- * ```typescript
185
- * try {
186
- * await writer.writeRow({ invalid_field: 'value' })
187
- * } catch (error) {
188
- * if (error instanceof ParquetError) {
189
- * console.log(`Parquet error (${error.code}): ${error.message}`)
190
- * }
191
- * }
192
- * ```
193
- *
194
- * @class ParquetError
195
- * @extends Error
196
- */
197
- export class ParquetError extends Error {
198
- code;
199
- /**
200
- * Creates a new ParquetError.
201
- *
202
- * @param message - Human-readable error message
203
- * @param code - Error code for programmatic handling
204
- *
205
- * @example
206
- * ```typescript
207
- * throw new ParquetError('Field name cannot be empty', 'EMPTY_FIELD_NAME')
208
- * ```
209
- */
210
- constructor(message, code) {
211
- super(message);
212
- this.code = code;
213
- this.name = 'ParquetError';
214
- }
215
- }
216
- // ============================================================================
217
- // ParquetWriter Class
218
- // ============================================================================
219
- /**
220
- * Parquet writer for git analytics data.
221
- *
222
- * @description
223
- * ParquetWriter provides a streaming interface for writing data to Parquet
224
- * format. It handles schema validation, row group management, compression,
225
- * and statistics generation.
226
- *
227
- * **Usage Pattern:**
228
- * 1. Create a schema using `defineSchema()`
229
- * 2. Create a writer with `createParquetWriter()` or `new ParquetWriter()`
230
- * 3. Write rows using `writeRow()` or `writeRows()`
231
- * 4. Generate the file with `toBuffer()` or `writeTo()`
232
- *
233
- * **Row Group Management:**
234
- * Rows are buffered in memory until the row group is full (by row count
235
- * or memory limit), then flushed. You can also manually flush with
236
- * `flushRowGroup()`.
237
- *
238
- * **Thread Safety:**
239
- * Not thread-safe. Use separate writer instances for concurrent writes.
240
- *
241
- * @example
242
- * ```typescript
243
- * // Create schema
244
- * const schema = defineSchema([
245
- * { name: 'sha', type: ParquetFieldType.STRING, required: true },
246
- * { name: 'type', type: ParquetFieldType.STRING, required: true },
247
- * { name: 'size', type: ParquetFieldType.INT64, required: true },
248
- * { name: 'timestamp', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
249
- * ])
250
- *
251
- * // Create writer
252
- * const writer = new ParquetWriter(schema, {
253
- * rowGroupSize: 10000,
254
- * compression: ParquetCompression.SNAPPY,
255
- * enableStatistics: true
256
- * })
257
- *
258
- * // Write data
259
- * for (const object of gitObjects) {
260
- * await writer.writeRow({
261
- * sha: object.sha,
262
- * type: object.type,
263
- * size: object.size,
264
- * timestamp: Date.now()
265
- * })
266
- * }
267
- *
268
- * // Set custom metadata
269
- * writer.setMetadata('git_version', '2.40.0')
270
- * writer.setMetadata('repository', 'github.com/org/repo')
271
- *
272
- * // Generate file
273
- * const buffer = await writer.toBuffer()
274
- * console.log(`Generated ${buffer.length} bytes`)
275
- * console.log(`Rows: ${writer.rowCount}`)
276
- * console.log(`Row groups: ${writer.rowGroupCount}`)
277
- *
278
- * // Reset for reuse
279
- * writer.reset()
280
- * ```
281
- *
282
- * @class ParquetWriter
283
- */
284
- export class ParquetWriter {
285
- /**
286
- * The Parquet schema for this writer.
287
- * @readonly
288
- */
289
- schema;
290
- /**
291
- * Resolved options with defaults applied.
292
- * @readonly
293
- */
294
- options;
295
- /**
296
- * Total row count written.
297
- * @private
298
- */
299
- _rowCount = 0;
300
- /**
301
- * Completed row groups.
302
- * @private
303
- */
304
- _rowGroups = [];
305
- /**
306
- * Current row group being built.
307
- * @private
308
- */
309
- _currentRowGroup = { rows: [], byteSize: 0 };
310
- /**
311
- * Whether the writer has been closed.
312
- * @private
313
- */
314
- _isClosed = false;
315
- /**
316
- * Custom key-value metadata.
317
- * @private
318
- */
319
- _keyValueMetadata = {};
320
- /**
321
- * Creation timestamp.
322
- * @private
323
- */
324
- _createdAt = Date.now();
325
- /**
326
- * Creates a new ParquetWriter instance.
327
- *
328
- * @param schema - The Parquet schema defining columns
329
- * @param options - Writer configuration options
330
- *
331
- * @example
332
- * ```typescript
333
- * const writer = new ParquetWriter(schema, {
334
- * rowGroupSize: 50000,
335
- * compression: ParquetCompression.GZIP
336
- * })
337
- * ```
338
- */
339
- constructor(schema, options = {}) {
340
- this.schema = schema;
341
- this.options = {
342
- rowGroupSize: options.rowGroupSize ?? 65536,
343
- compression: options.compression ?? ParquetCompression.SNAPPY,
344
- ...options
345
- };
346
- }
347
- /**
348
- * Gets the total row count written to the writer.
349
- *
350
- * @description
351
- * Returns the total number of rows written, including rows in the
352
- * current unflushed row group.
353
- *
354
- * @returns Total row count
355
- *
356
- * @example
357
- * ```typescript
358
- * await writer.writeRows(data)
359
- * console.log(`Wrote ${writer.rowCount} rows`)
360
- * ```
361
- */
362
- get rowCount() {
363
- return this._rowCount;
364
- }
365
- /**
366
- * Gets the number of row groups.
367
- *
368
- * @description
369
- * Returns the number of completed row groups plus one if there's
370
- * a pending row group with data.
371
- *
372
- * @returns Number of row groups
373
- *
374
- * @example
375
- * ```typescript
376
- * console.log(`Row groups: ${writer.rowGroupCount}`)
377
- * ```
378
- */
379
- get rowGroupCount() {
380
- const pendingCount = this._currentRowGroup.rows.length > 0 ? 1 : 0;
381
- return this._rowGroups.length + pendingCount;
382
- }
383
- /**
384
- * Checks if the writer has been closed.
385
- *
386
- * @description
387
- * A closed writer cannot accept new rows. Writers are closed
388
- * implicitly by `closeWriter()`.
389
- *
390
- * @returns true if closed
391
- *
392
- * @example
393
- * ```typescript
394
- * if (!writer.isClosed) {
395
- * await writer.writeRow(row)
396
- * }
397
- * ```
398
- */
399
- get isClosed() {
400
- return this._isClosed;
401
- }
402
- /**
403
- * Writes a single row to the Parquet file.
404
- *
405
- * @description
406
- * Validates the row against the schema and adds it to the current
407
- * row group. Automatically flushes the row group when it reaches
408
- * the configured size or memory limit.
409
- *
410
- * @param row - Object with column values keyed by column name
411
- * @returns Promise that resolves when the row is written
412
- *
413
- * @throws {ParquetError} WRITER_CLOSED - If writer is closed
414
- * @throws {ParquetError} MISSING_REQUIRED_FIELD - If required field is missing
415
- * @throws {ParquetError} INVALID_FIELD_TYPE - If field value type doesn't match schema
416
- *
417
- * @example
418
- * ```typescript
419
- * await writer.writeRow({
420
- * id: 123,
421
- * name: 'Alice',
422
- * active: true
423
- * })
424
- * ```
425
- */
426
- async writeRow(row) {
427
- if (this._isClosed) {
428
- throw new ParquetError('Cannot write to a closed writer', 'WRITER_CLOSED');
429
- }
430
- this._validateRow(row);
431
- const rowSize = this._estimateRowSize(row);
432
- this._currentRowGroup.rows.push(row);
433
- this._currentRowGroup.byteSize += rowSize;
434
- this._rowCount++;
435
- // Check if we should flush based on row count
436
- if (this._currentRowGroup.rows.length >= this.options.rowGroupSize) {
437
- await this.flushRowGroup();
438
- }
439
- // Check if we should flush based on memory limit
440
- else if (this.options.rowGroupMemoryLimit &&
441
- this._currentRowGroup.byteSize >= this.options.rowGroupMemoryLimit) {
442
- await this.flushRowGroup();
443
- }
444
- }
445
- /**
446
- * Writes multiple rows to the Parquet file.
447
- *
448
- * @description
449
- * Convenience method that writes an array of rows sequentially.
450
- * Each row is validated and may trigger row group flushes.
451
- *
452
- * @param rows - Array of row objects to write
453
- * @returns Promise that resolves when all rows are written
454
- *
455
- * @throws {ParquetError} Any error from writeRow()
456
- *
457
- * @example
458
- * ```typescript
459
- * await writer.writeRows([
460
- * { id: 1, name: 'Alice' },
461
- * { id: 2, name: 'Bob' },
462
- * { id: 3, name: 'Carol' }
463
- * ])
464
- * ```
465
- */
466
- async writeRows(rows) {
467
- for (const row of rows) {
468
- await this.writeRow(row);
469
- }
470
- }
471
- /**
472
- * Manually flushes the current row group.
473
- *
474
- * @description
475
- * Forces the current row group to be finalized and stored, even if
476
- * it hasn't reached the size limit. Has no effect if the current
477
- * row group is empty.
478
- *
479
- * @returns Promise that resolves when flush is complete
480
- *
481
- * @example
482
- * ```typescript
483
- * // Write some rows
484
- * await writer.writeRows(batch1)
485
- *
486
- * // Force flush before writing next batch
487
- * await writer.flushRowGroup()
488
- *
489
- * // Continue writing
490
- * await writer.writeRows(batch2)
491
- * ```
492
- */
493
- async flushRowGroup() {
494
- if (this._currentRowGroup.rows.length === 0) {
495
- return;
496
- }
497
- const rowGroup = this._buildRowGroup(this._currentRowGroup);
498
- this._rowGroups.push(rowGroup);
499
- this._currentRowGroup = { rows: [], byteSize: 0 };
500
- }
501
- /**
502
- * Gets the current row group's memory size.
503
- *
504
- * @description
505
- * Returns the estimated memory consumption of the unflushed row group.
506
- * Useful for monitoring memory usage during streaming writes.
507
- *
508
- * @returns Memory size in bytes
509
- *
510
- * @example
511
- * ```typescript
512
- * if (writer.currentRowGroupMemorySize() > 50 * 1024 * 1024) {
513
- * console.log('Row group using significant memory')
514
- * await writer.flushRowGroup()
515
- * }
516
- * ```
517
- */
518
- currentRowGroupMemorySize() {
519
- return this._currentRowGroup.byteSize;
520
- }
521
- /**
522
- * Gets the completed row groups.
523
- *
524
- * @description
525
- * Returns a copy of the completed row group metadata array.
526
- * Does not include the current unflushed row group.
527
- *
528
- * @returns Array of row group metadata
529
- *
530
- * @example
531
- * ```typescript
532
- * for (const rg of writer.getRowGroups()) {
533
- * console.log(`Row group: ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
534
- * }
535
- * ```
536
- */
537
- getRowGroups() {
538
- return [...this._rowGroups];
539
- }
540
- /**
541
- * Sets a custom key-value metadata entry.
542
- *
543
- * @description
544
- * Adds custom metadata that will be stored in the Parquet file footer.
545
- * Can be used for versioning, provenance, or application-specific data.
546
- *
547
- * @param key - Metadata key
548
- * @param value - Metadata value
549
- *
550
- * @example
551
- * ```typescript
552
- * writer.setMetadata('created_by', 'gitdo-analytics')
553
- * writer.setMetadata('schema_version', '2.0')
554
- * writer.setMetadata('repository', 'github.com/org/repo')
555
- * ```
556
- */
557
- setMetadata(key, value) {
558
- this._keyValueMetadata[key] = value;
559
- }
560
- /**
561
- * Generates the Parquet file as a buffer.
562
- *
563
- * @description
564
- * Finalizes the file by flushing any remaining rows and generating
565
- * the complete Parquet file structure including header, row groups,
566
- * and footer with metadata.
567
- *
568
- * @returns Promise resolving to the complete Parquet file as Uint8Array
569
- *
570
- * @example
571
- * ```typescript
572
- * const buffer = await writer.toBuffer()
573
- * await fs.writeFile('data.parquet', buffer)
574
- * ```
575
- */
576
- async toBuffer() {
577
- // Flush any remaining rows
578
- if (this._currentRowGroup.rows.length > 0) {
579
- await this.flushRowGroup();
580
- }
581
- return this._generateParquetBytes();
582
- }
583
- /**
584
- * Writes the Parquet file to an output stream.
585
- *
586
- * @description
587
- * Generates the file and writes it to the provided output stream.
588
- * Useful for streaming to files or network destinations.
589
- *
590
- * @param output - The output stream to write to
591
- * @returns Promise that resolves when writing is complete
592
- *
593
- * @example
594
- * ```typescript
595
- * const output = new FileOutputStream('data.parquet')
596
- * await writer.writeTo(output)
597
- * output.close()
598
- * ```
599
- */
600
- async writeTo(output) {
601
- const bytes = await this.toBuffer();
602
- output.write(bytes);
603
- }
604
- /**
605
- * Resets the writer to its initial state.
606
- *
607
- * @description
608
- * Clears all written data, row groups, and metadata. The schema
609
- * and options remain unchanged. Useful for writing multiple files
610
- * with the same configuration.
611
- *
612
- * @example
613
- * ```typescript
614
- * // Write first file
615
- * await writer.writeRows(batch1)
616
- * const file1 = await writer.toBuffer()
617
- *
618
- * // Reset and write second file
619
- * writer.reset()
620
- * await writer.writeRows(batch2)
621
- * const file2 = await writer.toBuffer()
622
- * ```
623
- */
624
- reset() {
625
- this._rowCount = 0;
626
- this._rowGroups = [];
627
- this._currentRowGroup = { rows: [], byteSize: 0 };
628
- this._isClosed = false;
629
- this._keyValueMetadata = {};
630
- this._createdAt = Date.now();
631
- }
632
- /**
633
- * Validates a row against the schema.
634
- *
635
- * @param row - The row to validate
636
- * @throws {ParquetError} If validation fails
637
- * @private
638
- */
639
- _validateRow(row) {
640
- for (const field of this.schema.fields) {
641
- const value = row[field.name];
642
- // Check required fields
643
- if (field.required && (value === undefined || value === null)) {
644
- throw new ParquetError(`Missing required field: ${field.name}`, 'MISSING_REQUIRED_FIELD');
645
- }
646
- // Check type if value is present and not null
647
- if (value !== null && value !== undefined) {
648
- if (!this._validateType(value, field.type)) {
649
- throw new ParquetError(`Invalid type for field ${field.name}: expected ${field.type}`, 'INVALID_FIELD_TYPE');
650
- }
651
- }
652
- }
653
- }
654
- /**
655
- * Validates a value matches the expected Parquet type.
656
- *
657
- * @param value - The value to validate
658
- * @param type - The expected Parquet type
659
- * @returns true if valid, false otherwise
660
- * @private
661
- */
662
- _validateType(value, type) {
663
- switch (type) {
664
- case ParquetFieldType.STRING:
665
- return typeof value === 'string';
666
- case ParquetFieldType.INT32:
667
- case ParquetFieldType.INT64:
668
- case ParquetFieldType.FLOAT:
669
- case ParquetFieldType.DOUBLE:
670
- case ParquetFieldType.TIMESTAMP_MILLIS:
671
- case ParquetFieldType.TIMESTAMP_MICROS:
672
- return typeof value === 'number';
673
- case ParquetFieldType.BOOLEAN:
674
- return typeof value === 'boolean';
675
- case ParquetFieldType.BINARY:
676
- return value instanceof Uint8Array || typeof value === 'string';
677
- default:
678
- return false;
679
- }
680
- }
681
- /**
682
- * Estimates the memory size of a row.
683
- *
684
- * @param row - The row to estimate
685
- * @returns Estimated size in bytes
686
- * @private
687
- */
688
- _estimateRowSize(row) {
689
- let size = 0;
690
- for (const field of this.schema.fields) {
691
- const value = row[field.name];
692
- if (value === null || value === undefined) {
693
- size += 1; // null marker
694
- }
695
- else if (typeof value === 'string') {
696
- size += value.length * 2; // UTF-16
697
- }
698
- else if (typeof value === 'number') {
699
- size += 8; // 64-bit
700
- }
701
- else if (typeof value === 'boolean') {
702
- size += 1;
703
- }
704
- else if (value instanceof Uint8Array) {
705
- size += value.length;
706
- }
707
- }
708
- return size;
709
- }
710
- /**
711
- * Builds a row group from internal representation.
712
- *
713
- * @param internal - The internal row group data
714
- * @returns The row group metadata
715
- * @private
716
- */
717
- _buildRowGroup(internal) {
718
- const columns = this.schema.fields.map(field => {
719
- const values = internal.rows.map(row => row[field.name]);
720
- const stats = this.options.enableStatistics ? this._computeStatistics(values, field.type) : undefined;
721
- const compression = this.options.columnCompression?.[field.name] ?? this.options.compression;
722
- return {
723
- column: field.name,
724
- type: field.type,
725
- compression,
726
- encodedSize: this._estimateEncodedSize(values, field.type, compression),
727
- uncompressedSize: this._estimateUncompressedSize(values, field.type),
728
- statistics: stats
729
- };
730
- });
731
- return {
732
- numRows: internal.rows.length,
733
- totalByteSize: columns.reduce((sum, col) => sum + col.encodedSize, 0),
734
- columns
735
- };
736
- }
737
- /**
738
- * Computes statistics for a column.
739
- *
740
- * @param values - The column values
741
- * @param type - The column type
742
- * @returns Column statistics
743
- * @private
744
- */
745
- _computeStatistics(values, type) {
746
- const nonNullValues = values.filter(v => v !== null && v !== undefined);
747
- const nullCount = values.length - nonNullValues.length;
748
- if (nonNullValues.length === 0) {
749
- return { nullCount };
750
- }
751
- switch (type) {
752
- case ParquetFieldType.INT32:
753
- case ParquetFieldType.INT64:
754
- case ParquetFieldType.FLOAT:
755
- case ParquetFieldType.DOUBLE:
756
- case ParquetFieldType.TIMESTAMP_MILLIS:
757
- case ParquetFieldType.TIMESTAMP_MICROS: {
758
- const numbers = nonNullValues.filter(v => typeof v === 'number' && !Number.isNaN(v));
759
- if (numbers.length === 0) {
760
- return { nullCount };
761
- }
762
- return {
763
- min: Math.min(...numbers),
764
- max: Math.max(...numbers),
765
- nullCount
766
- };
767
- }
768
- case ParquetFieldType.STRING: {
769
- const strings = nonNullValues;
770
- return {
771
- min: strings.reduce((a, b) => a < b ? a : b),
772
- max: strings.reduce((a, b) => a > b ? a : b),
773
- nullCount
774
- };
775
- }
776
- case ParquetFieldType.BOOLEAN: {
777
- return { nullCount };
778
- }
779
- default:
780
- return { nullCount };
781
- }
782
- }
783
- /**
784
- * Estimates the encoded size after compression.
785
- *
786
- * @param values - The column values
787
- * @param type - The column type
788
- * @param compression - The compression type
789
- * @returns Estimated compressed size in bytes
790
- * @private
791
- */
792
- _estimateEncodedSize(values, type, compression) {
793
- const uncompressedSize = this._estimateUncompressedSize(values, type);
794
- // Apply compression ratio estimate
795
- switch (compression) {
796
- case ParquetCompression.SNAPPY:
797
- return Math.floor(uncompressedSize * 0.5);
798
- case ParquetCompression.GZIP:
799
- return Math.floor(uncompressedSize * 0.3);
800
- case ParquetCompression.ZSTD:
801
- return Math.floor(uncompressedSize * 0.25);
802
- case ParquetCompression.LZ4:
803
- return Math.floor(uncompressedSize * 0.4);
804
- case ParquetCompression.UNCOMPRESSED:
805
- default:
806
- return uncompressedSize;
807
- }
808
- }
809
- /**
810
- * Estimates the uncompressed size of column values.
811
- *
812
- * @param values - The column values
813
- * @param type - The column type
814
- * @returns Estimated uncompressed size in bytes
815
- * @private
816
- */
817
- _estimateUncompressedSize(values, type) {
818
- let size = 0;
819
- for (const value of values) {
820
- if (value === null || value === undefined) {
821
- size += 1;
822
- }
823
- else {
824
- switch (type) {
825
- case ParquetFieldType.STRING:
826
- size += value.length * 2;
827
- break;
828
- case ParquetFieldType.INT32:
829
- case ParquetFieldType.FLOAT:
830
- size += 4;
831
- break;
832
- case ParquetFieldType.INT64:
833
- case ParquetFieldType.DOUBLE:
834
- case ParquetFieldType.TIMESTAMP_MILLIS:
835
- case ParquetFieldType.TIMESTAMP_MICROS:
836
- size += 8;
837
- break;
838
- case ParquetFieldType.BOOLEAN:
839
- size += 1;
840
- break;
841
- case ParquetFieldType.BINARY:
842
- size += value instanceof Uint8Array ? value.length : value.length;
843
- break;
844
- }
845
- }
846
- }
847
- return size;
848
- }
849
- /**
850
- * Generates the complete Parquet file bytes.
851
- *
852
- * @returns The complete Parquet file as Uint8Array
853
- * @private
854
- */
855
- _generateParquetBytes() {
856
- // Build all row data - will be populated from row groups in full implementation
857
- // For now, row group data is serialized directly below
858
- // Calculate metadata
859
- const metadata = {
860
- schema: this.schema,
861
- numRows: this._rowCount,
862
- rowGroups: this._rowGroups,
863
- compression: this.options.compression,
864
- columnCompression: this.options.columnCompression,
865
- keyValueMetadata: this._keyValueMetadata,
866
- createdAt: this._createdAt,
867
- sortedBy: this.options.sortBy,
868
- partitionColumns: this.options.partitionColumns
869
- };
870
- // Encode metadata to JSON and then to bytes
871
- const metadataJson = JSON.stringify(metadata);
872
- const metadataBytes = new TextEncoder().encode(metadataJson);
873
- // Compress metadata if needed
874
- let compressedMetadata;
875
- if (this.options.compression === ParquetCompression.GZIP) {
876
- compressedMetadata = pako.gzip(metadataBytes);
877
- }
878
- else {
879
- // For SNAPPY, ZSTD, LZ4 - we'll use a simple RLE-like compression simulation
880
- // In production, you'd use actual compression libraries
881
- compressedMetadata = this._simpleCompress(metadataBytes, this.options.compression);
882
- }
883
- // Build final file structure
884
- // PAR1 magic (4 bytes) + data + metadata length (4 bytes) + metadata + PAR1 magic (4 bytes)
885
- const magic = new TextEncoder().encode('PAR1');
886
- const metadataLength = new Uint8Array(4);
887
- new DataView(metadataLength.buffer).setUint32(0, compressedMetadata.length, true);
888
- // Calculate total size
889
- const totalSize = 4 + compressedMetadata.length + 4 + 4;
890
- const result = new Uint8Array(totalSize);
891
- // Write structure
892
- let offset = 0;
893
- result.set(magic, offset);
894
- offset += 4;
895
- result.set(compressedMetadata, offset);
896
- offset += compressedMetadata.length;
897
- result.set(metadataLength, offset);
898
- offset += 4;
899
- result.set(magic, offset);
900
- return result;
901
- }
902
- /**
903
- * Simple compression simulation for non-gzip formats.
904
- *
905
- * @param data - Data to compress
906
- * @param compression - Compression type
907
- * @returns Compressed data
908
- * @private
909
- */
910
- _simpleCompress(data, compression) {
911
- if (compression === ParquetCompression.UNCOMPRESSED) {
912
- return data;
913
- }
914
- // Use pako deflate for a basic compression simulation
915
- // Real implementation would use snappy-js, zstd-codec, lz4js etc.
916
- try {
917
- return pako.deflate(data, { level: compression === ParquetCompression.ZSTD ? 9 : 6 });
918
- }
919
- catch {
920
- return data;
921
- }
922
- }
923
- }
924
- // ============================================================================
925
- // Helper Functions
926
- // ============================================================================
927
- /**
928
- * Defines a Parquet schema.
929
- *
930
- * @description
931
- * Creates a validated Parquet schema from field definitions. Validates that:
932
- * - Schema has at least one field
933
- * - All field names are non-empty
934
- * - All field names are unique
935
- *
936
- * @param fields - Array of field definitions
937
- * @param metadata - Optional schema-level metadata
938
- * @returns Validated Parquet schema
939
- *
940
- * @throws {ParquetError} EMPTY_SCHEMA - If fields array is empty
941
- * @throws {ParquetError} EMPTY_FIELD_NAME - If any field name is empty
942
- * @throws {ParquetError} DUPLICATE_FIELD - If field names are not unique
943
- *
944
- * @example
945
- * ```typescript
946
- * const schema = defineSchema([
947
- * { name: 'id', type: ParquetFieldType.INT64, required: true },
948
- * { name: 'name', type: ParquetFieldType.STRING, required: true },
949
- * { name: 'age', type: ParquetFieldType.INT32, required: false },
950
- * { name: 'created_at', type: ParquetFieldType.TIMESTAMP_MILLIS, required: true }
951
- * ], {
952
- * version: '1.0',
953
- * description: 'User records'
954
- * })
955
- * ```
956
- */
957
- export function defineSchema(fields, metadata) {
958
- // Validate schema
959
- if (fields.length === 0) {
960
- throw new ParquetError('Schema cannot be empty', 'EMPTY_SCHEMA');
961
- }
962
- const names = new Set();
963
- for (const field of fields) {
964
- if (!field.name || field.name.trim() === '') {
965
- throw new ParquetError('Field name cannot be empty', 'EMPTY_FIELD_NAME');
966
- }
967
- if (names.has(field.name)) {
968
- throw new ParquetError(`Duplicate field name: ${field.name}`, 'DUPLICATE_FIELD');
969
- }
970
- names.add(field.name);
971
- }
972
- return {
973
- fields: fields.map(f => ({
974
- name: f.name,
975
- type: f.type,
976
- required: f.required,
977
- metadata: f.metadata
978
- })),
979
- metadata
980
- };
981
- }
982
- /**
983
- * Creates a Parquet writer.
984
- *
985
- * @description
986
- * Factory function to create a ParquetWriter with the specified schema
987
- * and options. Equivalent to `new ParquetWriter(schema, options)`.
988
- *
989
- * @param schema - The Parquet schema
990
- * @param options - Writer options
991
- * @returns A new ParquetWriter instance
992
- *
993
- * @example
994
- * ```typescript
995
- * const writer = createParquetWriter(schema, {
996
- * rowGroupSize: 10000,
997
- * compression: ParquetCompression.SNAPPY
998
- * })
999
- * ```
1000
- */
1001
- export function createParquetWriter(schema, options = {}) {
1002
- return new ParquetWriter(schema, options);
1003
- }
1004
- /**
1005
- * Writes data directly to a Parquet file buffer.
1006
- *
1007
- * @description
1008
- * Convenience function that creates a writer, writes all rows, and returns
1009
- * the complete Parquet file. Useful for simple one-shot writes.
1010
- *
1011
- * @param schema - The Parquet schema
1012
- * @param rows - Array of rows to write
1013
- * @param options - Writer options
1014
- * @returns Promise resolving to the complete Parquet file as Uint8Array
1015
- *
1016
- * @example
1017
- * ```typescript
1018
- * const buffer = await writeParquetFile(schema, [
1019
- * { id: 1, name: 'Alice' },
1020
- * { id: 2, name: 'Bob' }
1021
- * ], {
1022
- * compression: ParquetCompression.GZIP
1023
- * })
1024
- *
1025
- * await fs.writeFile('data.parquet', buffer)
1026
- * ```
1027
- */
1028
- export async function writeParquetFile(schema, rows, options = {}) {
1029
- const writer = createParquetWriter(schema, options);
1030
- await writer.writeRows(rows);
1031
- return writer.toBuffer();
1032
- }
1033
- /**
1034
- * Closes a writer and returns the final buffer.
1035
- *
1036
- * @description
1037
- * Generates the final Parquet file buffer and marks the writer as closed.
1038
- * The writer cannot be used for further writes after calling this function.
1039
- *
1040
- * @param writer - The ParquetWriter to close
1041
- * @returns Promise resolving to the complete Parquet file as Uint8Array
1042
- *
1043
- * @example
1044
- * ```typescript
1045
- * await writer.writeRows(data)
1046
- * const buffer = await closeWriter(writer)
1047
- * console.log(writer.isClosed) // true
1048
- * ```
1049
- */
1050
- export async function closeWriter(writer) {
1051
- const bytes = await writer.toBuffer();
1052
- writer._isClosed = true;
1053
- return bytes;
1054
- }
1055
- /**
1056
- * Adds a row group to the writer.
1057
- *
1058
- * @description
1059
- * Writes multiple rows and then flushes them as a single row group.
1060
- * Useful when you want explicit control over row group boundaries.
1061
- *
1062
- * @param writer - The ParquetWriter to use
1063
- * @param rows - Array of rows for this row group
1064
- * @returns Promise that resolves when the row group is written
1065
- *
1066
- * @example
1067
- * ```typescript
1068
- * // Add explicit row groups
1069
- * await addRowGroup(writer, batch1) // First row group
1070
- * await addRowGroup(writer, batch2) // Second row group
1071
- * ```
1072
- */
1073
- export async function addRowGroup(writer, rows) {
1074
- await writer.writeRows(rows);
1075
- await writer.flushRowGroup();
1076
- }
1077
- /**
1078
- * Gets metadata from a Parquet file buffer.
1079
- *
1080
- * @description
1081
- * Parses a Parquet file buffer and extracts the metadata including
1082
- * schema, row groups, compression settings, and custom metadata.
1083
- *
1084
- * @param bytes - The Parquet file buffer
1085
- * @returns The parsed metadata
1086
- *
1087
- * @throws {ParquetError} INVALID_MAGIC - If file doesn't have valid Parquet magic bytes
1088
- *
1089
- * @example
1090
- * ```typescript
1091
- * const buffer = await fs.readFile('data.parquet')
1092
- * const metadata = getMetadata(buffer)
1093
- *
1094
- * console.log(`Rows: ${metadata.numRows}`)
1095
- * console.log(`Schema: ${metadata.schema.fields.map(f => f.name).join(', ')}`)
1096
- * console.log(`Row groups: ${metadata.rowGroups.length}`)
1097
- *
1098
- * for (const rg of metadata.rowGroups) {
1099
- * console.log(` - ${rg.numRows} rows, ${rg.totalByteSize} bytes`)
1100
- * }
1101
- * ```
1102
- */
1103
- export function getMetadata(bytes) {
1104
- // Verify magic bytes
1105
- const startMagic = new TextDecoder().decode(bytes.slice(0, 4));
1106
- const endMagic = new TextDecoder().decode(bytes.slice(-4));
1107
- if (startMagic !== 'PAR1' || endMagic !== 'PAR1') {
1108
- throw new ParquetError('Invalid Parquet file: missing magic bytes', 'INVALID_MAGIC');
1109
- }
1110
- // Read metadata length (4 bytes before final magic)
1111
- const metadataLengthOffset = bytes.length - 8;
1112
- const metadataLength = new DataView(bytes.buffer, bytes.byteOffset + metadataLengthOffset, 4).getUint32(0, true);
1113
- // Read compressed metadata
1114
- const metadataStart = 4;
1115
- const compressedMetadata = bytes.slice(metadataStart, metadataStart + metadataLength);
1116
- // Decompress metadata
1117
- let metadataBytes;
1118
- try {
1119
- // Try gzip first
1120
- metadataBytes = pako.ungzip(compressedMetadata);
1121
- }
1122
- catch {
1123
- try {
1124
- // Try inflate (deflate)
1125
- metadataBytes = pako.inflate(compressedMetadata);
1126
- }
1127
- catch {
1128
- // Assume uncompressed
1129
- metadataBytes = compressedMetadata;
1130
- }
1131
- }
1132
- // Parse metadata JSON
1133
- const metadataJson = new TextDecoder().decode(metadataBytes);
1134
- const internal = JSON.parse(metadataJson);
1135
- // Build column metadata map
1136
- const columnMetadata = {};
1137
- if (internal.columnCompression) {
1138
- for (const [col, comp] of Object.entries(internal.columnCompression)) {
1139
- columnMetadata[col] = { compression: comp };
1140
- }
1141
- }
1142
- return {
1143
- schema: internal.schema,
1144
- numRows: internal.numRows,
1145
- rowGroups: internal.rowGroups,
1146
- compression: internal.compression,
1147
- columnMetadata: Object.keys(columnMetadata).length > 0 ? columnMetadata : undefined,
1148
- keyValueMetadata: Object.keys(internal.keyValueMetadata).length > 0 ? internal.keyValueMetadata : undefined,
1149
- createdAt: internal.createdAt,
1150
- fileSize: bytes.length,
1151
- sortedBy: internal.sortedBy,
1152
- partitionColumns: internal.partitionColumns
1153
- };
1154
- }
1155
- /**
1156
- * Sets the compression type for a writer.
1157
- *
1158
- * @description
1159
- * Updates the default compression algorithm for a writer. Affects all
1160
- * subsequently written data. Columns with explicit compression settings
1161
- * in columnCompression are not affected.
1162
- *
1163
- * @param writer - The ParquetWriter to update
1164
- * @param compression - The new compression type
1165
- *
1166
- * @example
1167
- * ```typescript
1168
- * const writer = createParquetWriter(schema)
1169
- *
1170
- * // Write some rows with SNAPPY (default)
1171
- * await writer.writeRows(batch1)
1172
- * await writer.flushRowGroup()
1173
- *
1174
- * // Switch to GZIP for remaining data
1175
- * setCompression(writer, ParquetCompression.GZIP)
1176
- * await writer.writeRows(batch2)
1177
- * ```
1178
- */
1179
- export function setCompression(writer, compression) {
1180
- ;
1181
- writer.options.compression = compression;
1182
- }
1183
- //# sourceMappingURL=parquet-writer.js.map