deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,373 @@
|
|
1
|
+
import argparse
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
import deltacat
|
5
|
+
from deltacat.compute.compactor_v2.compaction_session import compact_partition
|
6
|
+
from deltacat.compute.compactor.model.compact_partition_params import (
|
7
|
+
CompactPartitionParams,
|
8
|
+
)
|
9
|
+
from deltacat.storage import metastore
|
10
|
+
from deltacat.types.media import ContentType
|
11
|
+
|
12
|
+
# Import common utilities
|
13
|
+
from deltacat.examples.compactor.utils.common import (
|
14
|
+
initialize_catalog,
|
15
|
+
parse_primary_keys,
|
16
|
+
parse_partition_values,
|
17
|
+
parse_sort_keys,
|
18
|
+
create_partition_locator,
|
19
|
+
get_actual_partition_locator,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
def print_package_version_info() -> None:
    """Write the installed DeltaCAT version to stdout (debugging aid)."""
    version = deltacat.__version__
    print(f"DeltaCAT version: {version}")
|
26
|
+
|
27
|
+
|
28
|
+
def run(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: str,
    dest_namespace: str,
    dest_table_name: str,
    dest_table_version: str,
    dest_partition_values: str,
    last_stream_position: int,
    primary_keys: str,
    catalog_root: Optional[str] = None,
    compactor_version: str = "V2",
    sort_keys: Optional[str] = None,
    hash_bucket_count: Optional[int] = None,
    records_per_file: int = 1000000,
    table_writer_compression: str = "lz4",
) -> None:
    """
    Run the compactor with the given parameters.

    Arguments are validated before the catalog is initialized, so an invalid
    invocation fails without touching any storage.

    Args:
        namespace: Source table namespace
        table_name: Source table name
        table_version: Source table version
        partition_values: Comma-separated partition values for source
        dest_namespace: Destination table namespace
        dest_table_name: Destination table name
        dest_table_version: Destination table version
        dest_partition_values: Comma-separated partition values for destination
        last_stream_position: Last stream position to compact
        primary_keys: Comma-separated primary keys
        catalog_root: Root path for catalog (defaults to temp directory)
        compactor_version: Compactor version to use (V1 or V2)
        sort_keys: Comma-separated sort keys (optional)
        hash_bucket_count: Number of hash buckets (required for V2)
        records_per_file: Records per compacted file
        table_writer_compression: Compression type for table writer

    Raises:
        ValueError: If compactor_version is neither "V1" nor "V2", or if
            compactor_version is "V2" and hash_bucket_count is None.
    """
    # Fail fast: reject bad arguments before initializing the catalog or
    # resolving any partition locators.
    if compactor_version not in ("V1", "V2"):
        raise ValueError(
            f"Unsupported compactor_version {compactor_version!r}; "
            "expected 'V1' or 'V2'"
        )
    use_v2 = compactor_version == "V2"
    if use_v2 and hash_bucket_count is None:
        raise ValueError("hash_bucket_count is required for V2 compactor")

    # Parse partition values
    partition_values_list = parse_partition_values(partition_values)
    dest_partition_values_list = parse_partition_values(dest_partition_values)

    # Initialize catalog
    catalog = initialize_catalog(catalog_root)

    # Get actual partition locators (with real partition IDs)
    source_partition_locator = get_actual_partition_locator(
        namespace, table_name, table_version, partition_values_list, catalog
    )

    # For destination, try actual first, fall back to basic if table doesn't
    # exist yet.
    # NOTE(review): the broad `except Exception` is kept as a deliberate
    # best-effort fallback; narrowing it needs the exception types raised by
    # get_actual_partition_locator, which are not visible here.
    try:
        dest_partition_locator = get_actual_partition_locator(
            dest_namespace,
            dest_table_name,
            dest_table_version,
            dest_partition_values_list,
            catalog,
        )
        print("✅ Using existing destination partition")
    except Exception:
        dest_partition_locator = create_partition_locator(
            dest_namespace,
            dest_table_name,
            dest_table_version,
            dest_partition_values_list,
        )
        print("✅ Creating new destination partition")

    # Parse primary keys and sort keys
    primary_keys_set = parse_primary_keys(primary_keys)
    sort_keys_list = parse_sort_keys(sort_keys) if sort_keys else None
    all_column_names = metastore.get_table_version_column_names(
        namespace,
        table_name,
        table_version,
        catalog=catalog,
    )

    # Create compaction parameters using the same approach as bootstrap.py
    params_dict = {
        "catalog": catalog,
        "compacted_file_content_type": ContentType.PARQUET,
        "deltacat_storage": metastore,
        "deltacat_storage_kwargs": {"catalog": catalog},
        "destination_partition_locator": dest_partition_locator,
        "last_stream_position_to_compact": last_stream_position,
        "list_deltas_kwargs": {
            "catalog": catalog,
            "equivalent_table_types": [],
        },
        "primary_keys": list(primary_keys_set),
        "all_column_names": all_column_names,
        "rebase_source_partition_locator": None,
        "rebase_source_partition_high_watermark": None,
        "records_per_compacted_file": records_per_file,
        "source_partition_locator": source_partition_locator,
        "table_writer_kwargs": {
            "compression": table_writer_compression,
            "version": "2.6",
            "use_dictionary": True,
        },
    }

    # Add sort keys if provided
    if sort_keys_list:
        params_dict["sort_keys"] = sort_keys_list

    # Add V2-specific parameters (hash_bucket_count was validated above)
    if use_v2:
        params_dict.update(
            {
                "hash_bucket_count": hash_bucket_count,
                "drop_duplicates": True,
                "dd_max_parallelism_ratio": 1.0,
            }
        )

    print(f"🚀 Starting {compactor_version} compaction...")
    print(f" Source: {source_partition_locator}")
    print(f" Destination: {dest_partition_locator}")
    print(f" Primary Keys: {primary_keys_set}")
    print(
        f" Sort Keys: {[sk.key for sk in sort_keys_list] if sort_keys_list else None}"
    )
    if use_v2:
        print(f" Hash Bucket Count: {hash_bucket_count}")

    # Run compaction
    # NOTE(review): the only compact_partition in scope is the one imported
    # from deltacat.compute.compactor_v2.compaction_session, so "V1" differs
    # from "V2" only in the parameters passed, not in the session entry point
    # that is invoked -- confirm this is intended.
    compact_partition(CompactPartitionParams.of(params_dict))

    print("✅ Compaction completed successfully!")
|
163
|
+
|
164
|
+
|
165
|
+
if __name__ == "__main__":
    """
    DeltaCAT Compactor Example - Compact partitions using V1 or V2 compactor

    This script demonstrates how to compact partitions in DeltaCAT using either
    the V1 or V2 compactor. The compactor will read data from a source partition
    and write compacted data to a destination partition.

    Example 1: Basic V2 compaction (recommended):
    $ python compactor.py \
    $ --namespace 'test_namespace' \
    $ --table-name 'test_table' \
    $ --table-version '1' \
    $ --partition-values 'region=us-west-2' \
    $ --dest-namespace 'test_namespace' \
    $ --dest-table-name 'test_table_compacted' \
    $ --dest-table-version '1' \
    $ --dest-partition-values 'region=us-west-2' \
    $ --last-stream-position 5000 \
    $ --primary-keys 'user_id,event_id' \
    $ --sort-keys 'timestamp,event_type' \
    $ --compactor-version 'V2' \
    $ --hash-bucket-count 1 \
    $ --records-per-file 500000 \
    $ --table-writer-compression 'snappy'

    Example 2: V1 compaction (legacy):
    $ python compactor.py \
    $ --namespace 'events' \
    $ --table-name 'user_events' \
    $ --table-version '2' \
    $ --partition-values 'region=us-west-2' \
    $ --dest-namespace 'events' \
    $ --dest-table-name 'user_events_compacted' \
    $ --dest-table-version '1' \
    $ --dest-partition-values 'region=us-west-2' \
    $ --last-stream-position 5000 \
    $ --primary-keys 'user_id,event_id' \
    $ --sort-keys 'timestamp,event_type' \
    $ --compactor-version 'V1' \
    $ --records-per-file 500000 \
    $ --table-writer-compression 'snappy'

    Example 3: Submit this script as a local Ray job using a local job client:
    >>> from deltacat import local_job_client
    >>> client = local_job_client()
    >>> job_run_result = client.run_job(
    >>> entrypoint="python compactor.py --namespace my_ns --table-name my_table ...",
    >>> runtime_env={"working_dir": "./deltacat/examples/compactor/"},
    >>> )
    >>> print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
    >>> print(f"Job logs: {job_run_result.job_logs}")

    Example 4: Submit this script as a remote Ray job using a remote job client:
    >>> from deltacat import job_client
    >>> client = job_client("deltacat.yaml") # or job_client() to use current directory
    >>> job_run_result = client.run_job(
    >>> entrypoint="python compactor.py --namespace my_ns --table-name my_table ...",
    >>> runtime_env={"working_dir": "./deltacat/examples/compactor/"},
    >>> )
    >>> print(f"Job completed with status: {job_run_result.job_status}")
    """
    # Declare the CLI. argparse derives each destination name from its flag
    # (e.g. --table-name -> table_name), which is what lets the parsed
    # namespace be splatted straight into run() below.
    parser = argparse.ArgumentParser(
        description="DeltaCAT Compactor Example - Compact partitions using V1 or V2 compactor"
    )

    # Source table coordinates
    parser.add_argument(
        "--namespace",
        help="Source table namespace",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--table-name",
        help="Source table name",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--table-version",
        help="Source table version",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--partition-values",
        help="Comma-separated partition values for source (leave empty for no partition values)",
        type=str,
        default="",
    )

    # Destination table coordinates
    parser.add_argument(
        "--dest-namespace",
        help="Destination table namespace",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--dest-table-name",
        help="Destination table name",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--dest-table-version",
        help="Destination table version",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--dest-partition-values",
        help="Comma-separated partition values for destination (leave empty for no partition values)",
        type=str,
        default="",
    )

    # Compaction inputs
    parser.add_argument(
        "--last-stream-position",
        help="Last stream position to compact",
        type=int,
        required=True,
    )
    parser.add_argument(
        "--primary-keys",
        help="Comma-separated primary keys",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--catalog-root",
        help="Root path for catalog (defaults to temp directory)",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--compactor-version",
        help="Compactor version to use (V1 or V2)",
        type=str,
        choices=["V1", "V2"],
        default="V2",
    )
    parser.add_argument(
        "--sort-keys",
        help="Comma-separated sort keys (optional)",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--hash-bucket-count",
        help="Number of hash buckets (required for V2, ignored for V1)",
        type=int,
        default=None,
    )

    # Output tuning
    parser.add_argument(
        "--records-per-file",
        help="Records per compacted file",
        type=int,
        default=1000000,
    )
    parser.add_argument(
        "--table-writer-compression",
        help="Compression type for table writer",
        type=str,
        choices=["lz4", "snappy", "gzip", "brotli", "zstd"],
        default="lz4",
    )

    # Parse CLI input arguments and echo them for debugging
    args = parser.parse_args()
    print(f"Command Line Arguments: {args}")

    # Initialize deltacat
    deltacat.init()

    # Run the compactor using the parsed arguments
    cli_kwargs = vars(args)
    run(**cli_kwargs)
|