deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,117 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Optional
|
3
|
+
from deltacat import logs
|
4
|
+
from deltacat.compute.compactor import RoundCompletionInfo
|
5
|
+
from deltacat.storage import PartitionLocator
|
6
|
+
from deltacat.storage.model.partition import Partition
|
7
|
+
from deltacat.utils.metrics import metrics
|
8
|
+
from deltacat.exceptions import PartitionNotFoundError
|
9
|
+
|
10
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
11
|
+
|
12
|
+
|
13
|
+
@metrics
|
14
|
+
def read_round_completion_info(
|
15
|
+
source_partition_locator: PartitionLocator,
|
16
|
+
destination_partition_locator: PartitionLocator,
|
17
|
+
deltacat_storage,
|
18
|
+
deltacat_storage_kwargs: Optional[dict] = None,
|
19
|
+
destination_partition: Optional[Partition] = None,
|
20
|
+
) -> Optional[RoundCompletionInfo]:
|
21
|
+
"""
|
22
|
+
Read round completion info from the partition metafile.
|
23
|
+
|
24
|
+
Args:
|
25
|
+
source_partition_locator: Source partition locator for validation
|
26
|
+
destination_partition_locator: Destination partition locator
|
27
|
+
deltacat_storage: Storage implementation
|
28
|
+
deltacat_storage_kwargs: Optional storage kwargs
|
29
|
+
destination_partition: Optional destination partition to avoid redundant get_partition calls
|
30
|
+
|
31
|
+
Returns:
|
32
|
+
RoundCompletionInfo if found in partition, None otherwise
|
33
|
+
"""
|
34
|
+
if not destination_partition_locator:
|
35
|
+
return None
|
36
|
+
|
37
|
+
if deltacat_storage_kwargs is None:
|
38
|
+
deltacat_storage_kwargs = {}
|
39
|
+
|
40
|
+
try:
|
41
|
+
# Use provided partition or get it from storage
|
42
|
+
if destination_partition:
|
43
|
+
partition = destination_partition
|
44
|
+
else:
|
45
|
+
# First get the current partition to access its previous_partition_id
|
46
|
+
current_partition: Partition = deltacat_storage.get_partition(
|
47
|
+
destination_partition_locator.stream_locator,
|
48
|
+
destination_partition_locator.partition_values,
|
49
|
+
**deltacat_storage_kwargs,
|
50
|
+
)
|
51
|
+
|
52
|
+
# If current partition has round completion info, use it
|
53
|
+
if current_partition.compaction_round_completion_info:
|
54
|
+
partition = current_partition
|
55
|
+
elif current_partition.previous_partition_id is not None:
|
56
|
+
# For incremental compaction, we need to get the previous committed partition
|
57
|
+
# that contains the round completion info.
|
58
|
+
# Get the previous partition by ID - this is where the round completion info should be
|
59
|
+
logger.info(
|
60
|
+
f"Current partition {destination_partition_locator} does not have round completion info, "
|
61
|
+
f"getting previous partition with ID: {current_partition.previous_partition_id}"
|
62
|
+
)
|
63
|
+
previous_partition = deltacat_storage.get_partition_by_id(
|
64
|
+
destination_partition_locator.stream_locator,
|
65
|
+
current_partition.previous_partition_id,
|
66
|
+
**deltacat_storage_kwargs,
|
67
|
+
)
|
68
|
+
if previous_partition is not None:
|
69
|
+
logger.info(
|
70
|
+
f"Found previous partition: {previous_partition.locator}"
|
71
|
+
)
|
72
|
+
partition = previous_partition
|
73
|
+
else:
|
74
|
+
raise PartitionNotFoundError(
|
75
|
+
f"Previous partition with ID {current_partition.previous_partition_id} not found"
|
76
|
+
)
|
77
|
+
else:
|
78
|
+
logger.info(f"No previous partition ID found, using current partition")
|
79
|
+
partition = current_partition
|
80
|
+
|
81
|
+
if partition:
|
82
|
+
round_completion_info = partition.compaction_round_completion_info
|
83
|
+
if round_completion_info:
|
84
|
+
# Validate that prev_source_partition_locator matches current source
|
85
|
+
if (
|
86
|
+
not source_partition_locator
|
87
|
+
or not round_completion_info.prev_source_partition_locator
|
88
|
+
):
|
89
|
+
raise ValueError(
|
90
|
+
f"Source partition locator ({source_partition_locator}) and "
|
91
|
+
f"prev_source_partition_locator ({round_completion_info.prev_source_partition_locator}) "
|
92
|
+
f"must both be provided."
|
93
|
+
)
|
94
|
+
|
95
|
+
if (
|
96
|
+
round_completion_info.prev_source_partition_locator.canonical_string()
|
97
|
+
!= source_partition_locator.canonical_string()
|
98
|
+
):
|
99
|
+
logger.warning(
|
100
|
+
f"Previous source partition locator mismatch: "
|
101
|
+
f"expected {source_partition_locator.canonical_string()}, "
|
102
|
+
f"but found {round_completion_info.prev_source_partition_locator.canonical_string()} "
|
103
|
+
f"in round completion info. Ignoring cached round completion info."
|
104
|
+
)
|
105
|
+
return None
|
106
|
+
|
107
|
+
logger.info(
|
108
|
+
f"Read round completion info from partition metafile: {round_completion_info}"
|
109
|
+
)
|
110
|
+
return round_completion_info
|
111
|
+
|
112
|
+
except Exception as e:
|
113
|
+
logger.debug(
|
114
|
+
f"Failed to read round completion info from partition metafile: {e}"
|
115
|
+
)
|
116
|
+
|
117
|
+
return None
|
@@ -294,7 +294,9 @@ def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table
|
|
294
294
|
|
295
295
|
|
296
296
|
def delta_type_to_field(delta_type: DeltaType) -> bool:
|
297
|
-
|
297
|
+
# For deduplication purposes, treat both UPSERT and APPEND as UPSERT (True)
|
298
|
+
# Only DELETE should be treated as DELETE (False)
|
299
|
+
return delta_type is not DeltaType.DELETE
|
298
300
|
|
299
301
|
|
300
302
|
def delta_type_from_field(delta_type_field: bool) -> DeltaType:
|
@@ -14,7 +14,6 @@ from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
|
|
14
14
|
ExecutionCompactionResult,
|
15
15
|
)
|
16
16
|
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
17
|
-
from deltacat.compute.compactor.utils import round_completion_file as rcf
|
18
17
|
from deltacat.compute.compactor import DeltaAnnotated
|
19
18
|
from deltacat.compute.compactor_v2.deletes.delete_strategy import (
|
20
19
|
DeleteStrategy,
|
@@ -27,6 +26,7 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
|
|
27
26
|
from deltacat.storage import (
|
28
27
|
Delta,
|
29
28
|
DeltaLocator,
|
29
|
+
PartitionLocator,
|
30
30
|
)
|
31
31
|
from deltacat.storage.model.manifest import Manifest
|
32
32
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
@@ -36,13 +36,14 @@ from deltacat.utils.resources import (
|
|
36
36
|
get_current_process_peak_memory_usage_in_bytes,
|
37
37
|
)
|
38
38
|
from deltacat.compute.compactor_v2.private.compaction_utils import (
|
39
|
+
_get_rci_source_partition_locator,
|
39
40
|
_fetch_compaction_metadata,
|
40
41
|
_build_uniform_deltas,
|
41
42
|
_group_uniform_deltas,
|
42
43
|
_stage_new_partition,
|
43
44
|
_run_hash_and_merge,
|
44
45
|
_process_merge_results,
|
45
|
-
|
46
|
+
_create_round_completion_info,
|
46
47
|
_commit_compaction_result,
|
47
48
|
)
|
48
49
|
from deltacat.utils.metrics import metrics
|
@@ -64,24 +65,26 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
64
65
|
|
65
66
|
@metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
|
66
67
|
@categorize_errors
|
67
|
-
def compact_partition(params: CompactPartitionParams, **kwargs) ->
|
68
|
+
def compact_partition(params: CompactPartitionParams, **kwargs) -> None:
|
68
69
|
assert (
|
69
70
|
params.hash_bucket_count is not None and params.hash_bucket_count >= 1
|
70
71
|
), "hash_bucket_count is a required arg for compactor v2"
|
72
|
+
assert type(params.hash_bucket_count) is int, "Hash bucket count must be an integer"
|
71
73
|
if params.num_rounds > 1:
|
72
74
|
assert (
|
73
75
|
not params.drop_duplicates
|
74
76
|
), "num_rounds > 1, drop_duplicates must be False but is True"
|
75
77
|
|
76
|
-
with
|
77
|
-
"compaction_partition.bin"
|
78
|
-
|
78
|
+
with (
|
79
|
+
memray.Tracker("compaction_partition.bin")
|
80
|
+
if params.enable_profiler
|
81
|
+
else nullcontext()
|
82
|
+
):
|
79
83
|
execute_compaction_result: ExecutionCompactionResult = _execute_compaction(
|
80
84
|
params,
|
81
85
|
**kwargs,
|
82
86
|
)
|
83
87
|
_commit_compaction_result(params, execute_compaction_result)
|
84
|
-
return execute_compaction_result.round_completion_file_s3_url
|
85
88
|
|
86
89
|
|
87
90
|
def _execute_compaction(
|
@@ -96,12 +99,12 @@ def _execute_compaction(
|
|
96
99
|
previous_compacted_delta_manifest,
|
97
100
|
round_completion_info,
|
98
101
|
) = fetch_compaction_metadata_result
|
99
|
-
|
100
|
-
params
|
102
|
+
rci_source_partition_locator: PartitionLocator = _get_rci_source_partition_locator(
|
103
|
+
params
|
101
104
|
)
|
102
105
|
|
103
|
-
base_audit_url: str =
|
104
|
-
f"
|
106
|
+
base_audit_url: str = rci_source_partition_locator.path(
|
107
|
+
f"{params.compaction_artifact_path}/compaction-audit"
|
105
108
|
)
|
106
109
|
audit_url: str = f"{base_audit_url}.json"
|
107
110
|
logger.info(f"Compaction audit will be written to {audit_url}")
|
@@ -136,7 +139,7 @@ def _execute_compaction(
|
|
136
139
|
)
|
137
140
|
if not input_deltas:
|
138
141
|
logger.info("No input deltas found to compact.")
|
139
|
-
return ExecutionCompactionResult(None, None,
|
142
|
+
return ExecutionCompactionResult(None, None, False)
|
140
143
|
build_uniform_deltas_result: tuple[
|
141
144
|
List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
|
142
145
|
] = _build_uniform_deltas(
|
@@ -199,13 +202,13 @@ def _execute_compaction(
|
|
199
202
|
|
200
203
|
compaction_audit.save_round_completion_stats(mat_results)
|
201
204
|
|
202
|
-
compaction_result: ExecutionCompactionResult =
|
205
|
+
compaction_result: ExecutionCompactionResult = _create_round_completion_info(
|
203
206
|
params,
|
204
207
|
compaction_audit,
|
205
208
|
compacted_partition,
|
206
209
|
audit_url,
|
207
210
|
hb_id_to_entry_indices_range,
|
208
|
-
|
211
|
+
rci_source_partition_locator,
|
209
212
|
new_compacted_delta_locator,
|
210
213
|
pyarrow_write_result,
|
211
214
|
round_completion_info,
|
@@ -1,3 +1,5 @@
|
|
1
|
+
from deltacat.utils.common import env_bool, env_integer, env_string
|
2
|
+
|
1
3
|
TOTAL_BYTES_IN_SHA1_HASH = 20
|
2
4
|
|
3
5
|
PK_DELIMITER = "L6kl7u5f"
|
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
|
|
31
33
|
# The total size of records that will be hash bucketed at once
|
32
34
|
# Since, sorting is nlogn, we ensure that is not performed
|
33
35
|
# on a very large dataset for best performance.
|
34
|
-
MAX_SIZE_OF_RECORD_BATCH_IN_GIB =
|
36
|
+
MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
|
37
|
+
"MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
|
38
|
+
)
|
35
39
|
|
36
40
|
# Whether to drop duplicates during merge.
|
37
41
|
DROP_DUPLICATES = True
|
@@ -78,3 +82,28 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
|
|
78
82
|
# Number of rounds to run hash/merge for a single
|
79
83
|
# partition. (For large table support)
|
80
84
|
DEFAULT_NUM_ROUNDS = 1
|
85
|
+
|
86
|
+
# Whether to perform sha1 hashing when required to
|
87
|
+
# optimize memory. For example, hashing is always
|
88
|
+
# required for bucketing where it's not mandatory
|
89
|
+
# when dropping duplicates. Setting this to True
|
90
|
+
# will disable sha1 hashing in cases where it isn't
|
91
|
+
# mandatory. This flag is False by default.
|
92
|
+
SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
|
93
|
+
"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
|
94
|
+
)
|
95
|
+
|
96
|
+
# This env variable specifies whether to check bucketing spec
|
97
|
+
# compliance of the existing compacted table.
|
98
|
+
# PRINT_LOG: Enable logging if any partition is found
|
99
|
+
# to be non-compliant with the bucketing spec.
|
100
|
+
# ASSERT: Fail the job with ValidationError if the
|
101
|
+
# current compacted partition is found to be non-compliant
|
102
|
+
# with bucketing spec. Note, logging is implicitly enabled
|
103
|
+
# in this case.
|
104
|
+
BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
|
105
|
+
"BUCKETING_SPEC_COMPLIANCE_PROFILE", None
|
106
|
+
)
|
107
|
+
|
108
|
+
BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
|
109
|
+
BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
|
@@ -13,7 +13,6 @@ from typing import Optional
|
|
13
13
|
class ExecutionCompactionResult:
|
14
14
|
new_compacted_partition: Optional[Partition]
|
15
15
|
new_round_completion_info: Optional[RoundCompletionInfo]
|
16
|
-
round_completion_file_s3_url: Optional[str]
|
17
16
|
is_inplace_compacted: bool
|
18
17
|
|
19
18
|
def __iter__(self):
|
@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Any
|
|
4
4
|
from deltacat.utils.metrics import MetricsConfig
|
5
5
|
from deltacat.utils.common import ReadKwargsProvider
|
6
6
|
from deltacat.io.object_store import IObjectStore
|
7
|
-
from deltacat.storage import
|
7
|
+
from deltacat.storage import metastore
|
8
8
|
from deltacat.compute.compactor import DeltaAnnotated
|
9
9
|
|
10
10
|
|
@@ -15,12 +15,13 @@ class HashBucketInput(Dict):
|
|
15
15
|
primary_keys: List[str],
|
16
16
|
num_hash_buckets: int,
|
17
17
|
num_hash_groups: int,
|
18
|
+
all_column_names: List[str],
|
18
19
|
hb_task_index: Optional[int] = 0,
|
19
20
|
enable_profiler: Optional[bool] = False,
|
20
21
|
metrics_config: Optional[MetricsConfig] = None,
|
21
22
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
22
23
|
object_store: Optional[IObjectStore] = None,
|
23
|
-
deltacat_storage=
|
24
|
+
deltacat_storage=metastore,
|
24
25
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
25
26
|
memory_logs_enabled: Optional[bool] = None,
|
26
27
|
) -> HashBucketInput:
|
@@ -31,6 +32,7 @@ class HashBucketInput(Dict):
|
|
31
32
|
result["hb_task_index"] = hb_task_index
|
32
33
|
result["num_hash_buckets"] = num_hash_buckets
|
33
34
|
result["num_hash_groups"] = num_hash_groups
|
35
|
+
result["all_column_names"] = all_column_names
|
34
36
|
result["enable_profiler"] = enable_profiler
|
35
37
|
result["metrics_config"] = metrics_config
|
36
38
|
result["read_kwargs_provider"] = read_kwargs_provider
|
@@ -61,6 +63,10 @@ class HashBucketInput(Dict):
|
|
61
63
|
def num_hash_groups(self) -> int:
|
62
64
|
return self["num_hash_groups"]
|
63
65
|
|
66
|
+
@property
|
67
|
+
def all_column_names(self) -> List[str]:
|
68
|
+
return self["all_column_names"]
|
69
|
+
|
64
70
|
@property
|
65
71
|
def enable_profiler(self) -> Optional[bool]:
|
66
72
|
return self.get("enable_profiler")
|
@@ -78,7 +84,7 @@ class HashBucketInput(Dict):
|
|
78
84
|
return self.get("object_store")
|
79
85
|
|
80
86
|
@property
|
81
|
-
def deltacat_storage(self) ->
|
87
|
+
def deltacat_storage(self) -> metastore:
|
82
88
|
return self.get("deltacat_storage")
|
83
89
|
|
84
90
|
@property
|
@@ -16,7 +16,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
|
16
16
|
hash_group_index_to_hash_bucket_indices,
|
17
17
|
)
|
18
18
|
|
19
|
-
from deltacat.storage import
|
19
|
+
from deltacat.storage import metastore
|
20
20
|
|
21
21
|
from deltacat.io.object_store import IObjectStore
|
22
22
|
|
@@ -87,11 +87,13 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
|
|
87
87
|
def __init__(
|
88
88
|
self,
|
89
89
|
uniform_deltas: List[DeltaAnnotated],
|
90
|
+
all_column_names: List[str],
|
90
91
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
91
|
-
deltacat_storage=
|
92
|
+
deltacat_storage=metastore,
|
92
93
|
deltacat_storage_kwargs: Optional[dict] = None,
|
93
94
|
):
|
94
95
|
self._deltas = uniform_deltas
|
96
|
+
self._all_column_names = all_column_names
|
95
97
|
self._read_kwargs_provider = read_kwargs_provider
|
96
98
|
self._deltacat_storage = deltacat_storage
|
97
99
|
self._deltacat_storage_kwargs = deltacat_storage_kwargs
|
@@ -110,6 +112,7 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
|
|
110
112
|
total_size_bytes,
|
111
113
|
) = read_delta_file_envelopes(
|
112
114
|
annotated_delta,
|
115
|
+
self._all_column_names,
|
113
116
|
self._read_kwargs_provider,
|
114
117
|
self._deltacat_storage,
|
115
118
|
self._deltacat_storage_kwargs,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import Dict, List, Optional, Any
|
3
|
+
from typing import Dict, List, Optional, Any, Set
|
4
4
|
|
5
5
|
from deltacat.compute.compactor_v2.model.merge_file_group import (
|
6
6
|
MergeFileGroupsProvider,
|
@@ -12,9 +12,10 @@ from deltacat.utils.metrics import MetricsConfig
|
|
12
12
|
from deltacat.utils.common import ReadKwargsProvider
|
13
13
|
from deltacat.io.object_store import IObjectStore
|
14
14
|
from deltacat.storage import (
|
15
|
+
Manifest,
|
15
16
|
Partition,
|
16
17
|
SortKey,
|
17
|
-
|
18
|
+
metastore,
|
18
19
|
)
|
19
20
|
from deltacat.compute.compactor_v2.constants import (
|
20
21
|
DROP_DUPLICATES,
|
@@ -32,22 +33,26 @@ class MergeInput(Dict):
|
|
32
33
|
write_to_partition: Partition,
|
33
34
|
compacted_file_content_type: ContentType,
|
34
35
|
primary_keys: List[str],
|
36
|
+
all_column_names: List[str],
|
35
37
|
drop_duplicates: Optional[bool] = DROP_DUPLICATES,
|
36
38
|
sort_keys: Optional[List[SortKey]] = None,
|
37
39
|
merge_task_index: Optional[int] = 0,
|
38
40
|
max_records_per_output_file: Optional[int] = MAX_RECORDS_PER_COMPACTED_FILE,
|
39
41
|
enable_profiler: Optional[bool] = False,
|
40
42
|
metrics_config: Optional[MetricsConfig] = None,
|
41
|
-
|
43
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
42
44
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
43
45
|
round_completion_info: Optional[RoundCompletionInfo] = None,
|
44
46
|
object_store: Optional[IObjectStore] = None,
|
45
47
|
delete_strategy: Optional[DeleteStrategy] = None,
|
46
48
|
delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None,
|
47
|
-
deltacat_storage=
|
49
|
+
deltacat_storage=metastore,
|
48
50
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
49
51
|
memory_logs_enabled: Optional[bool] = None,
|
50
52
|
disable_copy_by_reference: Optional[bool] = None,
|
53
|
+
hash_bucket_count: Optional[int] = None,
|
54
|
+
original_fields: Optional[Set[str]] = None,
|
55
|
+
compacted_manifest: Optional[Manifest] = None,
|
51
56
|
) -> MergeInput:
|
52
57
|
|
53
58
|
result = MergeInput()
|
@@ -55,13 +60,14 @@ class MergeInput(Dict):
|
|
55
60
|
result["write_to_partition"] = write_to_partition
|
56
61
|
result["compacted_file_content_type"] = compacted_file_content_type
|
57
62
|
result["primary_keys"] = primary_keys
|
63
|
+
result["all_column_names"] = all_column_names
|
58
64
|
result["drop_duplicates"] = drop_duplicates
|
59
65
|
result["sort_keys"] = sort_keys
|
60
66
|
result["merge_task_index"] = merge_task_index
|
61
67
|
result["max_records_per_output_file"] = max_records_per_output_file
|
62
68
|
result["enable_profiler"] = enable_profiler
|
63
69
|
result["metrics_config"] = metrics_config
|
64
|
-
result["
|
70
|
+
result["table_writer_kwargs"] = table_writer_kwargs or {}
|
65
71
|
result["read_kwargs_provider"] = read_kwargs_provider
|
66
72
|
result["round_completion_info"] = round_completion_info
|
67
73
|
result["object_store"] = object_store
|
@@ -71,6 +77,9 @@ class MergeInput(Dict):
|
|
71
77
|
result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
|
72
78
|
result["memory_logs_enabled"] = memory_logs_enabled
|
73
79
|
result["disable_copy_by_reference"] = disable_copy_by_reference
|
80
|
+
result["hash_bucket_count"] = hash_bucket_count
|
81
|
+
result["original_fields"] = original_fields
|
82
|
+
result["compacted_manifest"] = compacted_manifest
|
74
83
|
return result
|
75
84
|
|
76
85
|
@property
|
@@ -89,6 +98,10 @@ class MergeInput(Dict):
|
|
89
98
|
def primary_keys(self) -> List[str]:
|
90
99
|
return self["primary_keys"]
|
91
100
|
|
101
|
+
@property
|
102
|
+
def all_column_names(self) -> List[str]:
|
103
|
+
return self["all_column_names"]
|
104
|
+
|
92
105
|
@property
|
93
106
|
def drop_duplicates(self) -> int:
|
94
107
|
return self["drop_duplicates"]
|
@@ -114,8 +127,8 @@ class MergeInput(Dict):
|
|
114
127
|
return self.get("metrics_config")
|
115
128
|
|
116
129
|
@property
|
117
|
-
def
|
118
|
-
return self.get("
|
130
|
+
def table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
|
131
|
+
return self.get("table_writer_kwargs")
|
119
132
|
|
120
133
|
@property
|
121
134
|
def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
|
@@ -130,7 +143,7 @@ class MergeInput(Dict):
|
|
130
143
|
return self.get("object_store")
|
131
144
|
|
132
145
|
@property
|
133
|
-
def deltacat_storage(self) ->
|
146
|
+
def deltacat_storage(self) -> metastore:
|
134
147
|
return self["deltacat_storage"]
|
135
148
|
|
136
149
|
@property
|
@@ -154,3 +167,15 @@ class MergeInput(Dict):
|
|
154
167
|
@property
|
155
168
|
def disable_copy_by_reference(self) -> bool:
|
156
169
|
return self["disable_copy_by_reference"]
|
170
|
+
|
171
|
+
@property
|
172
|
+
def hash_bucket_count(self) -> int:
|
173
|
+
return self["hash_bucket_count"]
|
174
|
+
|
175
|
+
@property
|
176
|
+
def original_fields(self) -> Optional[Set[str]]:
|
177
|
+
return self.get("original_fields")
|
178
|
+
|
179
|
+
@property
|
180
|
+
def compacted_manifest(self) -> Optional[Manifest]:
|
181
|
+
return self.get("compacted_manifest")
|