deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -5,13 +5,15 @@ import ray
|
|
5
5
|
import time
|
6
6
|
import json
|
7
7
|
from math import ceil
|
8
|
+
from urllib.parse import urlparse
|
9
|
+
import pyarrow
|
8
10
|
|
9
11
|
from deltacat.compute.compactor import (
|
10
12
|
PyArrowWriteResult,
|
11
13
|
HighWatermark,
|
12
14
|
RoundCompletionInfo,
|
13
15
|
)
|
14
|
-
from deltacat.
|
16
|
+
from deltacat.utils.filesystem import resolve_path_and_filesystem
|
15
17
|
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
16
18
|
from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
|
17
19
|
ExecutionCompactionResult,
|
@@ -32,7 +34,7 @@ from deltacat.compute.compactor_v2.utils.merge import (
|
|
32
34
|
from deltacat.compute.compactor_v2.utils.task_options import (
|
33
35
|
hash_bucket_resource_options_provider,
|
34
36
|
)
|
35
|
-
from deltacat.compute.compactor.utils import
|
37
|
+
from deltacat.compute.compactor.utils import round_completion_reader as rci
|
36
38
|
from deltacat.compute.compactor import DeltaAnnotated
|
37
39
|
from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
|
38
40
|
from deltacat.compute.compactor_v2.deletes.delete_strategy import (
|
@@ -48,6 +50,7 @@ from deltacat.storage import (
|
|
48
50
|
DeltaType,
|
49
51
|
DeltaLocator,
|
50
52
|
Partition,
|
53
|
+
PartitionLocator,
|
51
54
|
Manifest,
|
52
55
|
Stream,
|
53
56
|
StreamLocator,
|
@@ -77,6 +80,24 @@ from deltacat.compute.compactor_v2.utils.task_options import (
|
|
77
80
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
78
81
|
|
79
82
|
|
83
|
+
def _get_rci_source_partition_locator(
|
84
|
+
params: CompactPartitionParams,
|
85
|
+
) -> PartitionLocator:
|
86
|
+
return params.rebase_source_partition_locator or params.source_partition_locator
|
87
|
+
|
88
|
+
|
89
|
+
def _is_inplace_compacted(
|
90
|
+
rci_source_partition_locator: PartitionLocator,
|
91
|
+
destination_partition_locator: PartitionLocator,
|
92
|
+
) -> bool:
|
93
|
+
return (
|
94
|
+
rci_source_partition_locator.partition_values
|
95
|
+
== destination_partition_locator.partition_values
|
96
|
+
and rci_source_partition_locator.stream_id
|
97
|
+
== destination_partition_locator.stream_id
|
98
|
+
)
|
99
|
+
|
100
|
+
|
80
101
|
def _fetch_compaction_metadata(
|
81
102
|
params: CompactPartitionParams,
|
82
103
|
) -> tuple[Optional[Manifest], Optional[RoundCompletionInfo]]:
|
@@ -87,11 +108,11 @@ def _fetch_compaction_metadata(
|
|
87
108
|
previous_compacted_delta_manifest: Optional[Manifest] = None
|
88
109
|
|
89
110
|
if not params.rebase_source_partition_locator:
|
90
|
-
round_completion_info =
|
91
|
-
params.
|
92
|
-
params.
|
93
|
-
params.
|
94
|
-
|
111
|
+
round_completion_info = rci.read_round_completion_info(
|
112
|
+
source_partition_locator=params.source_partition_locator,
|
113
|
+
destination_partition_locator=params.destination_partition_locator,
|
114
|
+
deltacat_storage=params.deltacat_storage,
|
115
|
+
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
95
116
|
)
|
96
117
|
if not round_completion_info:
|
97
118
|
logger.info(
|
@@ -111,10 +132,10 @@ def _fetch_compaction_metadata(
|
|
111
132
|
assert (
|
112
133
|
params.hash_bucket_count == round_completion_info.hash_bucket_count
|
113
134
|
), (
|
114
|
-
"
|
115
|
-
"
|
116
|
-
f"Hash
|
117
|
-
f"
|
135
|
+
"Partition hash bucket count for compaction has changed. "
|
136
|
+
"Rebase compaction with the desired hash bucket count before running another incremental compaction. "
|
137
|
+
f"Hash bucket count in RCI={round_completion_info.hash_bucket_count} "
|
138
|
+
f"!= hash bucket count in params={params.hash_bucket_count}."
|
118
139
|
)
|
119
140
|
|
120
141
|
logger.info(f"Round completion file: {round_completion_info}")
|
@@ -149,6 +170,7 @@ def _build_uniform_deltas(
|
|
149
170
|
hash_bucket_count=params.hash_bucket_count,
|
150
171
|
compaction_audit=mutable_compaction_audit,
|
151
172
|
compact_partition_params=params,
|
173
|
+
all_column_names=params.all_column_names,
|
152
174
|
deltacat_storage=params.deltacat_storage,
|
153
175
|
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
154
176
|
)
|
@@ -159,10 +181,9 @@ def _build_uniform_deltas(
|
|
159
181
|
delta_discovery_end - delta_discovery_start
|
160
182
|
)
|
161
183
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
**params.s3_client_kwargs,
|
184
|
+
_upload_audit_data(
|
185
|
+
params,
|
186
|
+
mutable_compaction_audit,
|
166
187
|
)
|
167
188
|
|
168
189
|
return (
|
@@ -267,10 +288,9 @@ def _run_hash_and_merge(
|
|
267
288
|
hb_end - hb_start,
|
268
289
|
)
|
269
290
|
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
**params.s3_client_kwargs,
|
291
|
+
_upload_audit_data(
|
292
|
+
params,
|
293
|
+
mutable_compaction_audit,
|
274
294
|
)
|
275
295
|
|
276
296
|
hb_data_processed_size_bytes = np.int64(0)
|
@@ -402,13 +422,24 @@ def _merge(
|
|
402
422
|
round_completion_info=round_completion_info,
|
403
423
|
compacted_delta_manifest=previous_compacted_delta_manifest,
|
404
424
|
primary_keys=params.primary_keys,
|
405
|
-
deltacat_storage=params.deltacat_storage,
|
406
|
-
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
407
425
|
ray_custom_resources=params.ray_custom_resources,
|
408
426
|
memory_logs_enabled=params.memory_logs_enabled,
|
409
427
|
estimate_resources_params=params.estimate_resources_params,
|
410
428
|
)
|
411
429
|
|
430
|
+
# set previous compacted delta manifest on input so that we don't need a transaction to retrieve it
|
431
|
+
if round_completion_info:
|
432
|
+
previous_compacted_delta_manifest = params.deltacat_storage.get_delta_manifest(
|
433
|
+
round_completion_info.compacted_delta_locator,
|
434
|
+
**params.deltacat_storage_kwargs,
|
435
|
+
)
|
436
|
+
|
437
|
+
# create a copy of deltacat storage kwargs without any parent transaction context
|
438
|
+
# (can't be serialized by Ray, and we're only downloading already-resolved manifest entries)
|
439
|
+
deltacat_storage_kwargs_copy = {
|
440
|
+
k: v for k, v in params.deltacat_storage_kwargs.items() if k != "transaction"
|
441
|
+
}
|
442
|
+
|
412
443
|
def merge_input_provider(index, item) -> dict[str, MergeInput]:
|
413
444
|
return {
|
414
445
|
"input": MergeInput.of(
|
@@ -422,22 +453,26 @@ def _merge(
|
|
422
453
|
write_to_partition=compacted_partition,
|
423
454
|
compacted_file_content_type=params.compacted_file_content_type,
|
424
455
|
primary_keys=params.primary_keys,
|
456
|
+
all_column_names=params.all_column_names,
|
425
457
|
sort_keys=params.sort_keys,
|
426
458
|
merge_task_index=index,
|
427
459
|
drop_duplicates=params.drop_duplicates,
|
428
460
|
max_records_per_output_file=params.records_per_compacted_file,
|
429
461
|
enable_profiler=params.enable_profiler,
|
430
462
|
metrics_config=params.metrics_config,
|
431
|
-
|
463
|
+
table_writer_kwargs=params.table_writer_kwargs,
|
432
464
|
read_kwargs_provider=params.read_kwargs_provider,
|
433
465
|
round_completion_info=round_completion_info,
|
434
466
|
object_store=params.object_store,
|
435
467
|
deltacat_storage=params.deltacat_storage,
|
436
|
-
deltacat_storage_kwargs=
|
468
|
+
deltacat_storage_kwargs=deltacat_storage_kwargs_copy,
|
437
469
|
delete_strategy=delete_strategy,
|
438
470
|
delete_file_envelopes=delete_file_envelopes,
|
439
471
|
memory_logs_enabled=params.memory_logs_enabled,
|
440
472
|
disable_copy_by_reference=params.disable_copy_by_reference,
|
473
|
+
hash_bucket_count=params.hash_bucket_count,
|
474
|
+
original_fields=params.original_fields,
|
475
|
+
compacted_manifest=previous_compacted_delta_manifest,
|
441
476
|
)
|
442
477
|
}
|
443
478
|
|
@@ -473,6 +508,12 @@ def _hash_bucket(
|
|
473
508
|
estimate_resources_params=params.estimate_resources_params,
|
474
509
|
)
|
475
510
|
|
511
|
+
# create a copy of deltacat storage kwargs without any parent transaction context
|
512
|
+
# (can't be serialized by Ray, and we're only downloading already-resolved manifest entries)
|
513
|
+
deltacat_storage_kwargs_copy = {
|
514
|
+
k: v for k, v in params.deltacat_storage_kwargs.items() if k != "transaction"
|
515
|
+
}
|
516
|
+
|
476
517
|
def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
|
477
518
|
return {
|
478
519
|
"input": HashBucketInput.of(
|
@@ -481,12 +522,13 @@ def _hash_bucket(
|
|
481
522
|
hb_task_index=index,
|
482
523
|
num_hash_buckets=params.hash_bucket_count,
|
483
524
|
num_hash_groups=params.hash_group_count,
|
525
|
+
all_column_names=params.all_column_names,
|
484
526
|
enable_profiler=params.enable_profiler,
|
485
527
|
metrics_config=params.metrics_config,
|
486
528
|
read_kwargs_provider=params.read_kwargs_provider,
|
487
529
|
object_store=params.object_store,
|
488
530
|
deltacat_storage=params.deltacat_storage,
|
489
|
-
deltacat_storage_kwargs=
|
531
|
+
deltacat_storage_kwargs=deltacat_storage_kwargs_copy,
|
490
532
|
memory_logs_enabled=params.memory_logs_enabled,
|
491
533
|
)
|
492
534
|
}
|
@@ -595,10 +637,9 @@ def _process_merge_results(
|
|
595
637
|
file_index += mat_result.pyarrow_write_result.files
|
596
638
|
previous_task_index = mat_result.task_index
|
597
639
|
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
**params.s3_client_kwargs,
|
640
|
+
_upload_audit_data(
|
641
|
+
params,
|
642
|
+
mutable_compaction_audit,
|
602
643
|
)
|
603
644
|
deltas: List[Delta] = [m.delta for m in mat_results]
|
604
645
|
# Note: An appropriate last stream position must be set
|
@@ -633,21 +674,20 @@ def _update_and_upload_compaction_audit(
|
|
633
674
|
+ round_completion_info.compacted_pyarrow_write_result.records
|
634
675
|
)
|
635
676
|
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
**params.s3_client_kwargs,
|
677
|
+
_upload_audit_data(
|
678
|
+
params,
|
679
|
+
mutable_compaction_audit,
|
640
680
|
)
|
641
681
|
return
|
642
682
|
|
643
683
|
|
644
|
-
def
|
684
|
+
def _create_round_completion_info(
|
645
685
|
params: CompactPartitionParams,
|
646
686
|
mutable_compaction_audit: CompactionSessionAuditInfo,
|
647
687
|
compacted_partition: Partition,
|
648
688
|
audit_url: str,
|
649
689
|
hb_id_to_entry_indices_range: dict,
|
650
|
-
|
690
|
+
rci_source_partition_locator: PartitionLocator,
|
651
691
|
new_compacted_delta_locator: DeltaLocator,
|
652
692
|
pyarrow_write_result: PyArrowWriteResult,
|
653
693
|
prev_round_completion_info: Optional[RoundCompletionInfo] = None,
|
@@ -689,6 +729,27 @@ def _write_new_round_completion_file(
|
|
689
729
|
prev_round_completion_info,
|
690
730
|
)
|
691
731
|
|
732
|
+
# Check if this is an in-place compaction before creating RoundCompletionInfo
|
733
|
+
logger.info(
|
734
|
+
f"Checking if partition {rci_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
|
735
|
+
)
|
736
|
+
is_inplace_compacted: bool = _is_inplace_compacted(
|
737
|
+
rci_source_partition_locator, params.destination_partition_locator
|
738
|
+
)
|
739
|
+
|
740
|
+
# Determine the prev_source_partition_locator based on compaction type
|
741
|
+
if is_inplace_compacted:
|
742
|
+
logger.info(
|
743
|
+
"In-place compaction detected. Using compacted partition locator as prev_source_partition_locator. "
|
744
|
+
+ f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
|
745
|
+
f"and rci source partition_id of {rci_source_partition_locator.partition_id}."
|
746
|
+
)
|
747
|
+
prev_source_partition_locator = compacted_partition.locator
|
748
|
+
# Update rci_source_partition_locator for backward compatibility
|
749
|
+
rci_source_partition_locator = compacted_partition.locator
|
750
|
+
else:
|
751
|
+
prev_source_partition_locator = rci_source_partition_locator
|
752
|
+
|
692
753
|
new_round_completion_info = RoundCompletionInfo.of(
|
693
754
|
high_watermark=params.last_stream_position_to_compact,
|
694
755
|
compacted_delta_locator=new_compacted_delta_locator,
|
@@ -701,41 +762,17 @@ def _write_new_round_completion_file(
|
|
701
762
|
compactor_version=CompactorVersion.V2.value,
|
702
763
|
input_inflation=input_inflation,
|
703
764
|
input_average_record_size_bytes=input_average_record_size_bytes,
|
765
|
+
prev_source_partition_locator=prev_source_partition_locator,
|
704
766
|
)
|
705
767
|
|
706
768
|
logger.info(
|
707
769
|
f"Partition-{params.source_partition_locator.partition_values},"
|
708
770
|
f"compacted at: {params.last_stream_position_to_compact},"
|
709
771
|
)
|
710
|
-
logger.info(
|
711
|
-
f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
|
712
|
-
)
|
713
|
-
is_inplace_compacted: bool = (
|
714
|
-
rcf_source_partition_locator.partition_values
|
715
|
-
== params.destination_partition_locator.partition_values
|
716
|
-
and rcf_source_partition_locator.stream_id
|
717
|
-
== params.destination_partition_locator.stream_id
|
718
|
-
)
|
719
|
-
if is_inplace_compacted:
|
720
|
-
logger.info(
|
721
|
-
"Overriding round completion file source partition locator as in-place compacted. "
|
722
|
-
+ f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
|
723
|
-
f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
|
724
|
-
)
|
725
|
-
rcf_source_partition_locator = compacted_partition.locator
|
726
|
-
|
727
|
-
round_completion_file_s3_url = rcf.write_round_completion_file(
|
728
|
-
params.compaction_artifact_s3_bucket,
|
729
|
-
rcf_source_partition_locator,
|
730
|
-
compacted_partition.locator,
|
731
|
-
new_round_completion_info,
|
732
|
-
**params.s3_client_kwargs,
|
733
|
-
)
|
734
772
|
|
735
773
|
return ExecutionCompactionResult(
|
736
774
|
compacted_partition,
|
737
775
|
new_round_completion_info,
|
738
|
-
round_completion_file_s3_url,
|
739
776
|
is_inplace_compacted,
|
740
777
|
)
|
741
778
|
|
@@ -751,21 +788,29 @@ def _commit_compaction_result(
|
|
751
788
|
f"Partition-{params.source_partition_locator} -> "
|
752
789
|
f"{compaction_session_type} Compaction session data processing completed"
|
753
790
|
)
|
791
|
+
# TODO(pdames): Uncomment this once we support concurrent writes to the same
|
792
|
+
# partition (via write_to_table). This requires updating the commit_partition
|
793
|
+
# method to support previous partition as input. Right now, a concurrent write
|
794
|
+
# to the same partition will cause the commit_partition method to fail.
|
754
795
|
if execute_compaction_result.new_compacted_partition:
|
755
796
|
previous_partition: Optional[Partition] = None
|
756
|
-
if execute_compaction_result.is_inplace_compacted:
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
797
|
+
# if execute_compaction_result.is_inplace_compacted:
|
798
|
+
# previous_partition: Optional[
|
799
|
+
# Partition
|
800
|
+
# ] = params.deltacat_storage.get_partition(
|
801
|
+
# params.source_partition_locator.stream_locator,
|
802
|
+
# params.source_partition_locator.partition_values,
|
803
|
+
# **params.deltacat_storage_kwargs,
|
804
|
+
# )
|
805
|
+
# # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
|
765
806
|
logger.info(
|
766
807
|
f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
|
767
808
|
f"using previous partition: {previous_partition.locator if previous_partition else None}"
|
768
809
|
)
|
810
|
+
# Set the round completion info on the partition before committing
|
811
|
+
execute_compaction_result.new_compacted_partition.compaction_round_completion_info = (
|
812
|
+
execute_compaction_result.new_round_completion_info
|
813
|
+
)
|
769
814
|
committed_partition: Partition = params.deltacat_storage.commit_partition(
|
770
815
|
execute_compaction_result.new_compacted_partition,
|
771
816
|
previous_partition,
|
@@ -776,3 +821,57 @@ def _commit_compaction_result(
|
|
776
821
|
logger.warning("No new partition was committed during compaction.")
|
777
822
|
|
778
823
|
logger.info(f"Completed compaction session for: {params.source_partition_locator}")
|
824
|
+
|
825
|
+
|
826
|
+
def _upload_audit_data(
|
827
|
+
params: CompactPartitionParams,
|
828
|
+
audit_info: CompactionSessionAuditInfo,
|
829
|
+
) -> None:
|
830
|
+
"""
|
831
|
+
Upload audit data to the specified URL using the filesystem from catalog properties.
|
832
|
+
"""
|
833
|
+
audit_url = audit_info.audit_url
|
834
|
+
audit_data = json.dumps(audit_info.to_serializable(params.catalog.root))
|
835
|
+
if params.catalog and params.catalog.filesystem:
|
836
|
+
# Use the filesystem from catalog properties
|
837
|
+
filesystem = params.catalog.filesystem
|
838
|
+
parsed_url = urlparse(audit_url)
|
839
|
+
# For filesystem paths, use the path component
|
840
|
+
path = parsed_url.path if parsed_url.scheme else audit_url
|
841
|
+
|
842
|
+
# Ensure parent directories exist
|
843
|
+
import os
|
844
|
+
|
845
|
+
parent_dir = os.path.dirname(path)
|
846
|
+
if (
|
847
|
+
parent_dir
|
848
|
+
and not filesystem.get_file_info(parent_dir).type
|
849
|
+
== pyarrow.fs.FileType.Directory
|
850
|
+
):
|
851
|
+
try:
|
852
|
+
filesystem.create_dir(parent_dir, recursive=True)
|
853
|
+
except Exception as e:
|
854
|
+
logger.warning(f"Failed to create directory {parent_dir}: {e}")
|
855
|
+
|
856
|
+
with filesystem.open_output_stream(path) as output_stream:
|
857
|
+
output_stream.write(audit_data.encode("utf-8"))
|
858
|
+
else:
|
859
|
+
# Fallback: resolve filesystem from the URL
|
860
|
+
path, filesystem = resolve_path_and_filesystem(audit_url)
|
861
|
+
|
862
|
+
# Ensure parent directories exist
|
863
|
+
import os
|
864
|
+
|
865
|
+
parent_dir = os.path.dirname(path)
|
866
|
+
if (
|
867
|
+
parent_dir
|
868
|
+
and not filesystem.get_file_info(parent_dir).type
|
869
|
+
== pyarrow.fs.FileType.Directory
|
870
|
+
):
|
871
|
+
try:
|
872
|
+
filesystem.create_dir(parent_dir, recursive=True)
|
873
|
+
except Exception as e:
|
874
|
+
logger.warning(f"Failed to create directory {parent_dir}: {e}")
|
875
|
+
|
876
|
+
with filesystem.open_output_stream(path) as output_stream:
|
877
|
+
output_stream.write(audit_data.encode("utf-8"))
|
@@ -18,7 +18,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
|
18
18
|
group_hash_bucket_indices,
|
19
19
|
group_by_pk_hash_bucket,
|
20
20
|
)
|
21
|
-
from deltacat.storage import
|
21
|
+
from deltacat.storage import metastore
|
22
22
|
from deltacat.utils.ray_utils.runtime import (
|
23
23
|
get_current_ray_task_id,
|
24
24
|
get_current_ray_worker_id,
|
@@ -50,8 +50,9 @@ def _group_file_records_by_pk_hash_bucket(
|
|
50
50
|
annotated_delta: DeltaAnnotated,
|
51
51
|
num_hash_buckets: int,
|
52
52
|
primary_keys: List[str],
|
53
|
+
all_column_names: List[str],
|
53
54
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
54
|
-
deltacat_storage=
|
55
|
+
deltacat_storage=metastore,
|
55
56
|
deltacat_storage_kwargs: Optional[dict] = None,
|
56
57
|
) -> Tuple[Optional[DeltaFileEnvelopeGroups], int, int]:
|
57
58
|
# read input parquet s3 objects into a list of delta file envelopes
|
@@ -61,6 +62,7 @@ def _group_file_records_by_pk_hash_bucket(
|
|
61
62
|
total_size_bytes,
|
62
63
|
) = read_delta_file_envelopes(
|
63
64
|
annotated_delta,
|
65
|
+
all_column_names,
|
64
66
|
read_kwargs_provider,
|
65
67
|
deltacat_storage,
|
66
68
|
deltacat_storage_kwargs,
|
@@ -116,6 +118,7 @@ def _timed_hash_bucket(input: HashBucketInput):
|
|
116
118
|
annotated_delta=input.annotated_delta,
|
117
119
|
num_hash_buckets=input.num_hash_buckets,
|
118
120
|
primary_keys=input.primary_keys,
|
121
|
+
all_column_names=input.all_column_names,
|
119
122
|
read_kwargs_provider=input.read_kwargs_provider,
|
120
123
|
deltacat_storage=input.deltacat_storage,
|
121
124
|
deltacat_storage_kwargs=input.deltacat_storage_kwargs,
|