deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,373 @@
1
+ import argparse
2
+ from typing import Optional
3
+
4
+ import deltacat
5
+ from deltacat.compute.compactor_v2.compaction_session import compact_partition
6
+ from deltacat.compute.compactor.model.compact_partition_params import (
7
+ CompactPartitionParams,
8
+ )
9
+ from deltacat.storage import metastore
10
+ from deltacat.types.media import ContentType
11
+
12
+ # Import common utilities
13
+ from deltacat.examples.compactor.utils.common import (
14
+ initialize_catalog,
15
+ parse_primary_keys,
16
+ parse_partition_values,
17
+ parse_sort_keys,
18
+ create_partition_locator,
19
+ get_actual_partition_locator,
20
+ )
21
+
22
+
23
def print_package_version_info() -> None:
    """Write the installed DeltaCAT version to stdout as a debugging aid."""
    installed_version = deltacat.__version__
    print(f"DeltaCAT version: {installed_version}")
26
+
27
+
28
def run(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: str,
    dest_namespace: str,
    dest_table_name: str,
    dest_table_version: str,
    dest_partition_values: str,
    last_stream_position: int,
    primary_keys: str,
    catalog_root: Optional[str] = None,
    compactor_version: str = "V2",
    sort_keys: Optional[str] = None,
    hash_bucket_count: Optional[int] = None,
    records_per_file: int = 1000000,
    table_writer_compression: str = "lz4",
) -> None:
    """
    Run the compactor with the given parameters.

    Arguments are validated before any catalog or metastore work is done, so
    an invalid invocation fails without side effects.

    Args:
        namespace: Source table namespace
        table_name: Source table name
        table_version: Source table version
        partition_values: Comma-separated partition values for source
        dest_namespace: Destination table namespace
        dest_table_name: Destination table name
        dest_table_version: Destination table version
        dest_partition_values: Comma-separated partition values for destination
        last_stream_position: Last stream position to compact
        primary_keys: Comma-separated primary keys
        catalog_root: Root path for catalog (defaults to temp directory)
        compactor_version: Compactor version to use (V1 or V2)
        sort_keys: Comma-separated sort keys (optional)
        hash_bucket_count: Number of hash buckets (required for V2)
        records_per_file: Records per compacted file
        table_writer_compression: Compression type for table writer

    Raises:
        ValueError: If ``compactor_version`` is "V2" and ``hash_bucket_count``
            is None.
    """
    # Fail fast: reject an incomplete V2 request before touching the catalog.
    use_v2 = compactor_version == "V2"
    if use_v2 and hash_bucket_count is None:
        raise ValueError("hash_bucket_count is required for V2 compactor")

    # Parse partition values
    partition_values_list = parse_partition_values(partition_values)
    dest_partition_values_list = parse_partition_values(dest_partition_values)

    # Initialize catalog
    catalog = initialize_catalog(catalog_root)

    # Get actual partition locators (with real partition IDs)
    source_partition_locator = get_actual_partition_locator(
        namespace, table_name, table_version, partition_values_list, catalog
    )

    # For destination, try actual first, fall back to basic if table doesn't
    # exist yet. The broad catch is a deliberate best-effort: any lookup
    # failure is treated as "destination not created yet".
    try:
        dest_partition_locator = get_actual_partition_locator(
            dest_namespace,
            dest_table_name,
            dest_table_version,
            dest_partition_values_list,
            catalog,
        )
        print("✅ Using existing destination partition")
    except Exception:
        dest_partition_locator = create_partition_locator(
            dest_namespace,
            dest_table_name,
            dest_table_version,
            dest_partition_values_list,
        )
        print("✅ Creating new destination partition")

    # Parse primary keys and sort keys
    primary_keys_set = parse_primary_keys(primary_keys)
    sort_keys_list = parse_sort_keys(sort_keys) if sort_keys else None
    all_column_names = metastore.get_table_version_column_names(
        namespace,
        table_name,
        table_version,
        catalog=catalog,
    )

    # Create compaction parameters using the same approach as bootstrap.py
    params_dict = {
        "catalog": catalog,
        "compacted_file_content_type": ContentType.PARQUET,
        "deltacat_storage": metastore,
        "deltacat_storage_kwargs": {"catalog": catalog},
        "destination_partition_locator": dest_partition_locator,
        "last_stream_position_to_compact": last_stream_position,
        "list_deltas_kwargs": {
            "catalog": catalog,
            "equivalent_table_types": [],
        },
        "primary_keys": list(primary_keys_set),
        "all_column_names": all_column_names,
        "rebase_source_partition_locator": None,
        "rebase_source_partition_high_watermark": None,
        "records_per_compacted_file": records_per_file,
        "source_partition_locator": source_partition_locator,
        "table_writer_kwargs": {
            "compression": table_writer_compression,
            "version": "2.6",
            "use_dictionary": True,
        },
    }

    # Add sort keys if provided
    if sort_keys_list:
        params_dict["sort_keys"] = sort_keys_list

    # Add V2-specific parameters
    if use_v2:
        params_dict.update(
            {
                "hash_bucket_count": hash_bucket_count,
                "drop_duplicates": True,
                "dd_max_parallelism_ratio": 1.0,
            }
        )

    print(f"🚀 Starting {compactor_version} compaction...")
    print(f" Source: {source_partition_locator}")
    print(f" Destination: {dest_partition_locator}")
    print(f" Primary Keys: {primary_keys_set}")
    print(
        f" Sort Keys: {[sk.key for sk in sort_keys_list] if sort_keys_list else None}"
    )
    if use_v2:
        print(f" Hash Bucket Count: {hash_bucket_count}")

    # Run compaction
    # NOTE(review): compact_partition is imported from compactor_v2 at the top
    # of this file, so selecting "V1" only omits the V2-specific parameters
    # above - confirm whether V1 should dispatch to a different entry point.
    compact_partition(CompactPartitionParams.of(params_dict))

    print("✅ Compaction completed successfully!")
163
+
164
+
165
# Usage examples shown at the end of `--help`. Kept as a raw string so the
# trailing backslashes render literally and the shell commands can be pasted.
_USAGE_EXAMPLES = r"""
This script demonstrates how to compact partitions in DeltaCAT using either
the V1 or V2 compactor. The compactor will read data from a source partition
and write compacted data to a destination partition.

Example 1: Basic V2 compaction (recommended):
    $ python compactor.py \
        --namespace 'test_namespace' \
        --table-name 'test_table' \
        --table-version '1' \
        --partition-values 'region=us-west-2' \
        --dest-namespace 'test_namespace' \
        --dest-table-name 'test_table_compacted' \
        --dest-table-version '1' \
        --dest-partition-values 'region=us-west-2' \
        --last-stream-position 5000 \
        --primary-keys 'user_id,event_id' \
        --sort-keys 'timestamp,event_type' \
        --compactor-version 'V2' \
        --hash-bucket-count 1 \
        --records-per-file 500000 \
        --table-writer-compression 'snappy'

Example 2: V1 compaction (legacy):
    $ python compactor.py \
        --namespace 'events' \
        --table-name 'user_events' \
        --table-version '2' \
        --partition-values 'region=us-west-2' \
        --dest-namespace 'events' \
        --dest-table-name 'user_events_compacted' \
        --dest-table-version '1' \
        --dest-partition-values 'region=us-west-2' \
        --last-stream-position 5000 \
        --primary-keys 'user_id,event_id' \
        --sort-keys 'timestamp,event_type' \
        --compactor-version 'V1' \
        --records-per-file 500000 \
        --table-writer-compression 'snappy'

Example 3: Submit this script as a local Ray job using a local job client:
    >>> from deltacat import local_job_client
    >>> client = local_job_client()
    >>> job_run_result = client.run_job(
    >>>     entrypoint="python compactor.py --namespace my_ns --table-name my_table ...",
    >>>     runtime_env={"working_dir": "./deltacat/examples/compactor/"},
    >>> )
    >>> print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
    >>> print(f"Job logs: {job_run_result.job_logs}")

Example 4: Submit this script as a remote Ray job using a remote job client:
    >>> from deltacat import job_client
    >>> client = job_client("deltacat.yaml")  # or job_client() to use current directory
    >>> job_run_result = client.run_job(
    >>>     entrypoint="python compactor.py --namespace my_ns --table-name my_table ...",
    >>>     runtime_env={"working_dir": "./deltacat/examples/compactor/"},
    >>> )
    >>> print(f"Job completed with status: {job_run_result.job_status}")
"""


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the compactor example.

    Each option's destination name (dashes replaced by underscores) matches a
    keyword parameter of ``run``, so the parsed namespace can be forwarded
    with ``run(**vars(parsed))``.

    Returns:
        An ``argparse.ArgumentParser`` with all compactor options registered
        and the usage examples attached as the help epilog.
    """
    script_args = [
        (
            ["--namespace"],
            {
                "help": "Source table namespace",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--table-name"],
            {
                "help": "Source table name",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--table-version"],
            {
                "help": "Source table version",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--partition-values"],
            {
                "help": "Comma-separated partition values for source (leave empty for no partition values)",
                "type": str,
                "default": "",
            },
        ),
        (
            ["--dest-namespace"],
            {
                "help": "Destination table namespace",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--dest-table-name"],
            {
                "help": "Destination table name",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--dest-table-version"],
            {
                "help": "Destination table version",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--dest-partition-values"],
            {
                "help": "Comma-separated partition values for destination (leave empty for no partition values)",
                "type": str,
                "default": "",
            },
        ),
        (
            ["--last-stream-position"],
            {
                "help": "Last stream position to compact",
                "type": int,
                "required": True,
            },
        ),
        (
            ["--primary-keys"],
            {
                "help": "Comma-separated primary keys",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--catalog-root"],
            {
                "help": "Root path for catalog (defaults to temp directory)",
                "type": str,
                "default": None,
            },
        ),
        (
            ["--compactor-version"],
            {
                "help": "Compactor version to use (V1 or V2)",
                "type": str,
                "choices": ["V1", "V2"],
                "default": "V2",
            },
        ),
        (
            ["--sort-keys"],
            {
                "help": "Comma-separated sort keys (optional)",
                "type": str,
                "default": None,
            },
        ),
        (
            ["--hash-bucket-count"],
            {
                "help": "Number of hash buckets (required for V2, ignored for V1)",
                "type": int,
                "default": None,
            },
        ),
        (
            ["--records-per-file"],
            {
                "help": "Records per compacted file",
                "type": int,
                "default": 1000000,
            },
        ),
        (
            ["--table-writer-compression"],
            {
                "help": "Compression type for table writer",
                "type": str,
                "choices": ["lz4", "snappy", "gzip", "brotli", "zstd"],
                "default": "lz4",
            },
        ),
    ]

    parser = argparse.ArgumentParser(
        description="DeltaCAT Compactor Example - Compact partitions using V1 or V2 compactor",
        epilog=_USAGE_EXAMPLES,
        # Preserve the epilog's line breaks so the examples stay readable.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    for flags, options in script_args:
        parser.add_argument(*flags, **options)
    return parser


if __name__ == "__main__":
    # Parse CLI input arguments
    cli_args = _build_arg_parser().parse_args()
    print(f"Command Line Arguments: {cli_args}")

    # Initialize deltacat
    deltacat.init()

    # Run the compactor using the parsed arguments
    run(**vars(cli_args))