deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff reflects the changes between publicly released versions of this package, as they appear in their respective public registries, and is provided for informational purposes only.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/examples/compactor/job_runner.py (new file, +439 lines):

```python
import argparse
from typing import Optional


def run_compactor_local(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: str,
    dest_namespace: str,
    dest_table_name: str,
    dest_table_version: str,
    dest_partition_values: str,
    last_stream_position: int,
    primary_keys: str,
    catalog_root: Optional[str] = None,
    compactor_version: str = "V2",
    sort_keys: Optional[str] = None,
    hash_bucket_count: Optional[int] = None,
    records_per_file: int = 1000000,
    table_writer_compression: str = "lz4",
) -> None:
    """
    Run the compactor locally using Ray.

    This function constructs the command line arguments and runs the compactor.py
    script directly in the current Python process.
    """
    # Build command arguments
    cmd_args = [
        f"--namespace '{namespace}'",
        f"--table-name '{table_name}'",
        f"--table-version '{table_version}'",
        f"--partition-values '{partition_values}'",
        f"--dest-namespace '{dest_namespace}'",
        f"--dest-table-name '{dest_table_name}'",
        f"--dest-table-version '{dest_table_version}'",
        f"--dest-partition-values '{dest_partition_values}'",
        f"--last-stream-position {last_stream_position}",
        f"--primary-keys '{primary_keys}'",
        f"--compactor-version '{compactor_version}'",
    ]

    # Add optional arguments
    if catalog_root:
        cmd_args.append(f"--catalog-root '{catalog_root}'")
    if sort_keys:
        cmd_args.append(f"--sort-keys '{sort_keys}'")
    if hash_bucket_count is not None:
        cmd_args.append(f"--hash-bucket-count {hash_bucket_count}")
    if records_per_file != 1000000:
        cmd_args.append(f"--records-per-file {records_per_file}")
    if table_writer_compression != "lz4":
        cmd_args.append(f"--table-writer-compression '{table_writer_compression}'")

    # Join all arguments
    cmd_str = " ".join(cmd_args)
    print(f"Running compactor with arguments: {cmd_str}")

    # Import and run the compactor directly
    from . import compactor

    # Parse arguments manually and call run function
    compactor.run(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        partition_values=partition_values,
        dest_namespace=dest_namespace,
        dest_table_name=dest_table_name,
        dest_table_version=dest_table_version,
        dest_partition_values=dest_partition_values,
        last_stream_position=last_stream_position,
        primary_keys=primary_keys,
        catalog_root=catalog_root,
        compactor_version=compactor_version,
        sort_keys=sort_keys,
        hash_bucket_count=hash_bucket_count,
        records_per_file=records_per_file,
        table_writer_compression=table_writer_compression,
    )


def run_compactor_local_job(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: str,
    dest_namespace: str,
    dest_table_name: str,
    dest_table_version: str,
    dest_partition_values: str,
    last_stream_position: int,
    primary_keys: str,
    catalog_root: Optional[str] = None,
    compactor_version: str = "V2",
    sort_keys: Optional[str] = None,
    hash_bucket_count: Optional[int] = None,
    records_per_file: int = 1000000,
    table_writer_compression: str = "lz4",
) -> None:
    """
    Submit the compactor as a local Ray job using a local job client.

    This function creates a Ray job that runs the compactor.py script
    with the specified parameters.
    """
    from deltacat import local_job_client

    # Build command arguments
    cmd_args = [
        "python compactor.py",
        f"--namespace '{namespace}'",
        f"--table-name '{table_name}'",
        f"--table-version '{table_version}'",
        f"--partition-values '{partition_values}'",
        f"--dest-namespace '{dest_namespace}'",
        f"--dest-table-name '{dest_table_name}'",
        f"--dest-table-version '{dest_table_version}'",
        f"--dest-partition-values '{dest_partition_values}'",
        f"--last-stream-position {last_stream_position}",
        f"--primary-keys '{primary_keys}'",
        f"--compactor-version '{compactor_version}'",
    ]

    # Add optional arguments
    if catalog_root:
        cmd_args.append(f"--catalog-root '{catalog_root}'")
    if sort_keys:
        cmd_args.append(f"--sort-keys '{sort_keys}'")
    if hash_bucket_count is not None:
        cmd_args.append(f"--hash-bucket-count {hash_bucket_count}")
    if records_per_file != 1000000:
        cmd_args.append(f"--records-per-file {records_per_file}")
    if table_writer_compression != "lz4":
        cmd_args.append(f"--table-writer-compression '{table_writer_compression}'")

    # Join all arguments
    entrypoint = " ".join(cmd_args)
    print(f"Submitting local Ray job with entrypoint: {entrypoint}")

    # Submit the job
    client = local_job_client()
    job_run_result = client.run_job(
        entrypoint=entrypoint,
        runtime_env={"working_dir": "./deltacat/examples/compactor/"},
    )

    print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
    print(f"Job logs: {job_run_result.job_logs}")

    return job_run_result


def run_compactor_remote_job(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: str,
    dest_namespace: str,
    dest_table_name: str,
    dest_table_version: str,
    dest_partition_values: str,
    last_stream_position: int,
    primary_keys: str,
    catalog_root: Optional[str] = None,
    compactor_version: str = "V2",
    sort_keys: Optional[str] = None,
    hash_bucket_count: Optional[int] = None,
    records_per_file: int = 1000000,
    table_writer_compression: str = "lz4",
) -> None:
    """
    Submit the compactor as a remote Ray job using a remote job client.

    This function creates a Ray job that runs the compactor.py script
    on a remote Ray cluster with the specified parameters.

    Args:
        namespace: Source table namespace
        table_name: Source table name
        table_version: Source table version
        partition_values: Comma-separated partition values for source
        dest_namespace: Destination table namespace
        dest_table_name: Destination table name
        dest_table_version: Destination table version
        dest_partition_values: Comma-separated partition values for destination
        last_stream_position: Last stream position to compact
        primary_keys: Comma-separated primary keys
        catalog_root: Root path for catalog (defaults to temp directory)
        compactor_version: Compactor version to use (V1 or V2)
        sort_keys: Comma-separated sort keys (optional)
        hash_bucket_count: Number of hash buckets (required for V2)
        records_per_file: Records per compacted file
        table_writer_compression: Compression type for table writer
    """
    from deltacat import job_client

    # Build command arguments - same as local job
    cmd_args = [
        "python compactor.py",
        f"--namespace '{namespace}'",
        f"--table-name '{table_name}'",
        f"--table-version '{table_version}'",
        f"--partition-values '{partition_values}'",
        f"--dest-namespace '{dest_namespace}'",
        f"--dest-table-name '{dest_table_name}'",
        f"--dest-table-version '{dest_table_version}'",
        f"--dest-partition-values '{dest_partition_values}'",
        f"--last-stream-position {last_stream_position}",
        f"--primary-keys '{primary_keys}'",
        f"--compactor-version '{compactor_version}'",
    ]

    # Add optional arguments
    if catalog_root:
        cmd_args.append(f"--catalog-root '{catalog_root}'")
    if sort_keys:
        cmd_args.append(f"--sort-keys '{sort_keys}'")
    if hash_bucket_count is not None:
        cmd_args.append(f"--hash-bucket-count {hash_bucket_count}")
    if records_per_file != 1000000:
        cmd_args.append(f"--records-per-file {records_per_file}")
    if table_writer_compression != "lz4":
        cmd_args.append(f"--table-writer-compression '{table_writer_compression}'")

    # Join all arguments
    entrypoint = " ".join(cmd_args)
    print(f"Submitting remote Ray job with entrypoint: {entrypoint}")

    # Submit the job
    # TODO(pdames): Take cloud as an input parameter.
    client = job_client(
        "./aws/deltacat.yaml"
    )  # or job_client() to use current directory
    job_run_result = client.run_job(
        entrypoint=entrypoint,
        runtime_env={"working_dir": "./deltacat/examples/compactor/"},
    )

    print(f"Job completed with status: {job_run_result.job_status}")
    return job_run_result


if __name__ == "__main__":
    """
    DeltaCAT Job Runner Example - Run compactor jobs using different methods

    This script demonstrates three ways to run the DeltaCAT compactor:
    1. Locally in the current process
    2. As a local Ray job
    3. As a remote Ray job

    Example usage:
    $ python job_runner.py \
    $ --namespace 'events' \
    $ --table-name 'user_events' \
    $ --table-version '2' \
    $ --partition-values 'region=us-west-2' \
    $ --dest-namespace 'events' \
    $ --dest-table-name 'user_events_compacted' \
    $ --dest-table-version '1' \
    $ --dest-partition-values 'region=us-west-2' \
    $ --last-stream-position 5000 \
    $ --primary-keys 'user_id,event_id' \
    $ --sort-keys 'timestamp,event_type' \
    $ --compactor-version 'V2' \
    $ --hash-bucket-count 1 \
    $ --job-type 'local'
    """
    script_args = [
        (
            ["--namespace"],
            {
                "help": "Source table namespace",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--table-name"],
            {
                "help": "Source table name",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--table-version"],
            {
                "help": "Source table version",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--partition-values"],
            {
                "help": "Comma-separated partition values for source (leave empty for no partition values)",
                "type": str,
                "default": "",
            },
        ),
        (
            ["--dest-namespace"],
            {
                "help": "Destination table namespace",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--dest-table-name"],
            {
                "help": "Destination table name",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--dest-table-version"],
            {
                "help": "Destination table version",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--dest-partition-values"],
            {
                "help": "Comma-separated partition values for destination (leave empty for no partition values)",
                "type": str,
                "default": "",
            },
        ),
        (
            ["--last-stream-position"],
            {
                "help": "Last stream position to compact",
                "type": int,
                "required": True,
            },
        ),
        (
            ["--primary-keys"],
            {
                "help": "Comma-separated primary keys",
                "type": str,
                "required": True,
            },
        ),
        (
            ["--catalog-root"],
            {
                "help": "Root path for catalog (defaults to temp directory)",
                "type": str,
                "default": None,
            },
        ),
        (
            ["--compactor-version"],
            {
                "help": "Compactor version to use (V1 or V2)",
                "type": str,
                "choices": ["V1", "V2"],
                "default": "V2",
            },
        ),
        (
            ["--sort-keys"],
            {
                "help": "Comma-separated sort keys (optional)",
                "type": str,
                "default": None,
            },
        ),
        (
            ["--hash-bucket-count"],
            {
                "help": "Number of hash buckets (required for V2, ignored for V1)",
                "type": int,
                "default": None,
            },
        ),
        (
            ["--records-per-file"],
            {
                "help": "Records per compacted file",
                "type": int,
                "default": 1000000,
            },
        ),
        (
            ["--table-writer-compression"],
            {
                "help": "Compression type for table writer",
                "type": str,
                "choices": ["lz4", "snappy", "gzip", "brotli", "zstd"],
                "default": "lz4",
            },
        ),
        (
            ["--job-type"],
            {
                "help": "Type of job execution",
                "type": str,
                "choices": ["local", "local-job", "remote-job"],
                "default": "local",
            },
        ),
    ]

    # Parse CLI input arguments
    parser = argparse.ArgumentParser(
        description="DeltaCAT Job Runner Example - Run compactor jobs using different methods"
    )
    for args, kwargs in script_args:
        parser.add_argument(*args, **kwargs)
    args = parser.parse_args()
    print(f"Command Line Arguments: {args}")

    # Extract job type and remove it from args
    job_type = args.job_type
    delattr(args, "job_type")

    # Run the appropriate job type
    if job_type == "local":
        print("Running compactor locally...")
        run_compactor_local(**vars(args))
    elif job_type == "local-job":
        print("Submitting local Ray job...")
        run_compactor_local_job(**vars(args))
    elif job_type == "remote-job":
        print("Submitting remote Ray job...")
        run_compactor_remote_job(**vars(args))
    else:
        raise ValueError(f"Invalid job type: {job_type}")

    print("Job runner completed!")
```
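For orientation, here is a minimal sketch of calling the new runner programmatically rather than through its CLI. Only `run_compactor_local` and its signature come from the diff above; every concrete value below (namespaces, table names, keys) is an illustrative placeholder:

```python
# Hypothetical programmatic use of the job runner added in this release.
# All argument values are illustrative; only the function signature is
# taken from the diff above.
from deltacat.examples.compactor.job_runner import run_compactor_local

run_compactor_local(
    namespace="compactor_test_source",  # placeholder source namespace
    table_name="events",  # placeholder source table
    table_version="1",
    partition_values="",  # empty string means no partition values
    dest_namespace="compactor_test_dest",
    dest_table_name="events_compacted",
    dest_table_version="1",
    dest_partition_values="",
    last_stream_position=1000,
    primary_keys="user_id,event_id",  # comma-separated, as the CLI expects
    compactor_version="V2",
    hash_bucket_count=1,  # required for the V2 compactor
)
```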
deltacat/examples/compactor/utils/__init__.py (new file, +1 line):

```python
# Common utilities for DeltaCAT compactor examples
```
deltacat/examples/compactor/utils/common.py (new file, +261 lines):

```python
#!/usr/bin/env python3
"""
Common utilities for DeltaCAT compactor examples.

This module contains shared functionality used across bootstrap.py, explorer.py,
and compactor.py to reduce code duplication.
"""

from typing import Set, List, Optional, Tuple

import deltacat as dc
from deltacat import DeltaCatUrl
from deltacat.catalog import Catalog, put_catalog, get_table
from deltacat.catalog.model.properties import CatalogProperties
from deltacat.storage import metastore
from deltacat.storage.model.partition import PartitionLocator
from deltacat.storage.model.sort_key import SortKey
from deltacat.storage.model.types import SortOrder


def get_default_catalog_root() -> str:
    """Get the default catalog root directory."""
    return "/tmp/deltacat_test"


def initialize_catalog(
    catalog_root: Optional[str] = None, catalog_name: str = "default"
) -> CatalogProperties:
    """
    Initialize and register a DeltaCAT catalog.

    Args:
        catalog_root: Root directory for the catalog. If None, uses default.
        catalog_name: Name to register the catalog under.

    Returns:
        CatalogProperties instance for the initialized catalog.
    """
    if catalog_root is None:
        catalog_root = get_default_catalog_root()

    catalog = CatalogProperties(root=catalog_root)

    # Initialize catalog and register it
    catalog_obj = Catalog(config=catalog)
    put_catalog(catalog_name, catalog_obj)

    return catalog


def initialize_deltacat_url_catalog(
    catalog_root: Optional[str] = None, catalog_name: str = "compactor_test_catalog"
) -> DeltaCatUrl:
    """
    Initialize a DeltaCAT catalog using URL-based approach (used by explorer.py).

    Args:
        catalog_root: Root directory for the catalog. If None, uses default.
        catalog_name: Name for the catalog URL.

    Returns:
        DeltaCatUrl instance for the initialized catalog.
    """
    if catalog_root is None:
        catalog_root = get_default_catalog_root()

    dc.init()
    catalog_url = DeltaCatUrl(f"dc://{catalog_name}")
    dc.put(catalog_url, root=catalog_root)

    return catalog_url


def parse_primary_keys(primary_keys_str: str) -> Set[str]:
    """Parse comma-separated primary keys string into a set."""
    return set(key.strip() for key in primary_keys_str.split(",") if key.strip())


def parse_partition_values(partition_values_str: str) -> List[str]:
    """Parse comma-separated partition values string into a list."""
    if not partition_values_str.strip():
        return []
    return [value.strip() for value in partition_values_str.split(",") if value.strip()]


def parse_sort_keys(sort_keys_str: str) -> List[SortKey]:
    """Parse comma-separated sort keys string into a list of SortKey objects."""
    if not sort_keys_str or not sort_keys_str.strip():
        return []

    sort_keys = []
    for key in sort_keys_str.split(","):
        key = key.strip()
        if key:
            sort_keys.append(SortKey.of(key=key, sort_order=SortOrder.ASCENDING))
    return sort_keys


def create_partition_locator(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: List[str],
) -> PartitionLocator:
    """
    Create a partition locator with the given parameters.
    Note: This creates a locator with partition_id=None, which may not work
    for all operations. Use get_actual_partition_locator() for operations
    that require the actual partition ID.
    """
    return PartitionLocator.of(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        partition_values=partition_values,
    )


def get_actual_partition_locator(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: List[str],
    catalog: CatalogProperties,
    catalog_name: str = "default",
) -> PartitionLocator:
    """
    Get the actual partition locator by using metastore to find the partition.
    This matches the approach used in bootstrap.py and ensures compatibility.

    Args:
        namespace: Table namespace
        table_name: Table name
        table_version: Table version
        partition_values: Partition values (can be empty list)
        catalog: CatalogProperties instance
        catalog_name: Name of the registered catalog

    Returns:
        PartitionLocator with actual partition ID
    """
    try:
        # Initialize catalog like bootstrap.py does
        catalog_obj = Catalog(config=catalog)
        put_catalog(catalog_name, catalog_obj)

        # Get table definition first
        table_def = get_table(
            name=table_name, namespace=namespace, catalog=catalog_name
        )

        # Get the actual partition using the table's stream locator
        partition = metastore.get_partition(
            stream_locator=table_def.stream.locator,
            partition_values=partition_values if partition_values else None,
            catalog=catalog,
        )

        return partition.locator

    except Exception as e:
        print(f"⚠️ Failed to get actual partition locator: {e}")
        print(f"   Falling back to basic partition locator")
        return create_partition_locator(
            namespace, table_name, table_version, partition_values
        )


def format_partition_values_for_command(partition_values: Optional[List[str]]) -> str:
    """Format partition values for use in command line arguments."""
    if not partition_values:
        return ""
    return ",".join(str(v) for v in partition_values)


def get_max_stream_position_from_partition(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: List[str],
    catalog: CatalogProperties,
    catalog_name: str = "default",
) -> int:
    """
    Get the maximum stream position from a partition by reading its deltas.

    Args:
        namespace: Table namespace
        table_name: Table name
        table_version: Table version
        partition_values: Partition values
        catalog: CatalogProperties instance
        catalog_name: Name of the registered catalog

    Returns:
        Maximum stream position found, or 1000 as fallback
    """
    try:
        # Get the actual partition locator
        partition_locator = get_actual_partition_locator(
            namespace,
            table_name,
            table_version,
            partition_values,
            catalog,
            catalog_name,
        )

        # Create a partition-like object for metastore API
        partition_like = type("obj", (object,), {"locator": partition_locator})()

        # Get deltas from the partition
        partition_deltas = metastore.list_partition_deltas(
            partition_like=partition_like,
            include_manifest=True,
            catalog=catalog,
        )

        delta_list = partition_deltas.all_items()
        if delta_list:
            return max(delta.stream_position for delta in delta_list)
        else:
            return 1000  # fallback

    except Exception as e:
        print(f"⚠️ Failed to get max stream position: {e}")
        return 1000  # fallback


def get_bootstrap_destination_info(
    source_namespace: str, source_table: str
) -> Tuple[str, str]:
    """
    Get the corresponding destination namespace and table name for bootstrap test tables.

    Args:
        source_namespace: Source namespace
        source_table: Source table name

    Returns:
        Tuple of (dest_namespace, dest_table_name)
    """
    if source_namespace == "compactor_test_source" and source_table == "events":
        return "compactor_test_dest", "events_compacted"
    else:
        # Generic fallback
        return source_namespace, f"{source_table}_compacted"


def print_section_header(title: str, char: str = "=", width: int = 80) -> None:
    """Print a formatted section header."""
    print(char * width)
    print(title)
    print(char * width)


def print_subsection_header(title: str, char: str = "-", width: int = 70) -> None:
    """Print a formatted subsection header."""
    print(char * width)
    print(title)
    print(char * width)
```
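Likewise, a small sketch exercising the string-parsing helpers reconstructed above; the inputs are made-up examples, and the expected results follow directly from the implementations shown:

```python
# Illustrative inputs for the parsing helpers in utils/common.py; the
# example strings are placeholders, not values shipped with the package.
from deltacat.examples.compactor.utils.common import (
    parse_partition_values,
    parse_primary_keys,
    parse_sort_keys,
)

assert parse_primary_keys("user_id, event_id") == {"user_id", "event_id"}
assert parse_partition_values("") == []  # empty string -> no partition values
assert parse_partition_values("region=us-west-2") == ["region=us-west-2"]
assert len(parse_sort_keys("timestamp,event_type")) == 2  # two ascending SortKeys
```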