deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ import functools
 from deltacat.storage import (
     PartitionLocator,
     Delta,
-
+    metastore,
 )
 from deltacat import logs
 from deltacat.compute.compactor.utils import io as io_v1
@@ -38,7 +38,7 @@ def discover_deltas(
     rebase_source_partition_locator: Optional[PartitionLocator] = None,
     rebase_source_partition_high_watermark: Optional[int] = None,
     rcf_high_watermark: Optional[int] = None,
-    deltacat_storage=
+    deltacat_storage=metastore,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
     list_deltas_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[Delta]:
@@ -67,6 +67,11 @@ def discover_deltas(
         f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
         f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
     )
+    logger.info(f"DEBUG: source_partition_locator = {source_partition_locator}")
+    logger.info(
+        f"DEBUG: source_partition_locator.partition_id = {getattr(source_partition_locator, 'partition_id', 'NO_PARTITION_ID')}"
+    )
+    logger.info(f"DEBUG: total input deltas found = {len(result)}")

     if rebase_source_partition_locator:
         previous_compacted_deltas = io_v1._discover_deltas(
@@ -93,7 +98,8 @@ def create_uniform_input_deltas(
     hash_bucket_count: int,
     compaction_audit: CompactionSessionAuditInfo,
     compact_partition_params: CompactPartitionParams,
-
+    all_column_names: List[str],
+    deltacat_storage=metastore,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[DeltaAnnotated]:

@@ -101,7 +107,6 @@ def create_uniform_input_deltas(
     delta_manifest_entries_count = 0
     estimated_da_bytes = 0
     input_da_list = []
-
     for delta in input_deltas:
         if (
             compact_partition_params.enable_input_split
@@ -114,10 +119,12 @@ def create_uniform_input_deltas(
             )
             append_content_type_params(
                 delta=delta,
+                all_column_names=all_column_names,
                 deltacat_storage=deltacat_storage,
                 deltacat_storage_kwargs=deltacat_storage_kwargs,
                 task_max_parallelism=compact_partition_params.task_max_parallelism,
                 max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
+                file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
             )

         manifest_entries = delta.manifest.entries
@@ -23,6 +23,7 @@ from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC

 from deltacat.utils.performance import timed_invocation
 from deltacat.storage import (
+    DeltaType,
     Partition,
 )
 from deltacat.compute.compactor_v2.deletes.delete_strategy import (
@@ -47,13 +48,21 @@ def materialize(
         # TODO (pdames): compare performance to pandas-native materialize path
         df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
         compacted_table = df
+    # Extract schema from table_writer_kwargs to pass as direct parameter
+    # This ensures schema_id is properly set in the manifest
+    schema = None
+    if input.table_writer_kwargs and "schema" in input.table_writer_kwargs:
+        schema = input.table_writer_kwargs["schema"]
+
     delta, stage_delta_time = timed_invocation(
         input.deltacat_storage.stage_delta,
         compacted_table,
         input.write_to_partition,
+        delta_type=DeltaType.APPEND,  # Compaction always produces APPEND deltas
         max_records_per_entry=input.max_records_per_output_file,
         content_type=input.compacted_file_content_type,
-
+        schema=schema,  # Pass schema as direct parameter for schema_id extraction
+        table_writer_kwargs=input.table_writer_kwargs,
         **input.deltacat_storage_kwargs,
     )
     compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
@@ -112,6 +121,7 @@ def generate_local_merge_input(
     return MergeInput.of(
         merge_file_groups_provider=LocalMergeFileGroupsProvider(
             annotated_deltas,
+            all_column_names=params.all_column_names,
             read_kwargs_provider=params.read_kwargs_provider,
             deltacat_storage=params.deltacat_storage,
             deltacat_storage_kwargs=params.deltacat_storage_kwargs,
@@ -119,12 +129,13 @@ def generate_local_merge_input(
         write_to_partition=compacted_partition,
         compacted_file_content_type=params.compacted_file_content_type,
         primary_keys=params.primary_keys,
+        all_column_names=params.all_column_names,
         sort_keys=params.sort_keys,
         drop_duplicates=params.drop_duplicates,
         max_records_per_output_file=params.records_per_compacted_file,
         enable_profiler=params.enable_profiler,
         metrics_config=params.metrics_config,
-
+        table_writer_kwargs=params.table_writer_kwargs,
         read_kwargs_provider=params.read_kwargs_provider,
         round_completion_info=round_completion_info,
         object_store=params.object_store,
@@ -133,4 +144,6 @@ def generate_local_merge_input(
         delete_strategy=delete_strategy,
         delete_file_envelopes=delete_file_envelopes,
         disable_copy_by_reference=params.disable_copy_by_reference,
+        hash_bucket_count=params.hash_bucket_count,
+        original_fields=params.original_fields,
     )
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )

+    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
+        logger.info(
+            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
+            f"Returning False for is_sha1_desired"
+        )
+        return False
+
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len


@@ -70,13 +78,25 @@ def _append_table_by_hash_bucket(
         f"Grouping a pki table of length {len(pki_table)} took {groupby_latency}s"
     )

+    hb_pk_grouped_by = hb_pk_grouped_by.sort_by(sc._HASH_BUCKET_IDX_COLUMN_NAME)
     group_count_array = hb_pk_grouped_by[f"{sc._HASH_BUCKET_IDX_COLUMN_NAME}_count"]
     hb_group_array = hb_pk_grouped_by[sc._HASH_BUCKET_IDX_COLUMN_NAME]

     result_len = 0
     for i, group_count in enumerate(group_count_array):
         hb_idx = hb_group_array[i].as_py()
-
+        group_count_py = group_count.as_py()
+        pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count_py)
+        assert group_count_py == len(
+            pyarrow_table
+        ), f"Group count {group_count_py} not equal to {len(pyarrow_table)}"
+        all_buckets = pc.unique(pyarrow_table[sc._HASH_BUCKET_IDX_COLUMN_NAME])
+        assert (
+            len(all_buckets) == 1
+        ), f"Only one hash bucket is allowed but found {len(all_buckets)}"
+        assert (
+            all_buckets[0].as_py() == hb_idx
+        ), f"Hash bucket not equal, {all_buckets[0]} and {hb_idx}"
         pyarrow_table = pyarrow_table.drop(
             [sc._HASH_BUCKET_IDX_COLUMN_NAME, sc._PK_HASH_STRING_COLUMN_NAME]
         )
@@ -108,9 +128,10 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-
-
-
+        if (
+            record_batches
+            and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
+        ):
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +149,9 @@ def _optimized_group_record_batches_by_hash_bucket(
             current_bytes = 0
             record_batches.clear()

+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
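Note: the reworked loop in `_optimized_group_record_batches_by_hash_bucket` above flushes the accumulated record batches only when appending the next batch would cross the size threshold, then always accounts for the current batch afterward. A minimal standalone sketch of that flush-then-accumulate pattern (hypothetical names and threshold, not DeltaCat's implementation):

```python
# Minimal sketch (hypothetical names/threshold, not DeltaCat code) of the
# flush-then-accumulate grouping shown in the hunk above.
from typing import Iterable, Iterator, List

import pyarrow as pa

MAX_GROUP_BYTES = 2 * 1024**3  # placeholder for MAX_SIZE_OF_RECORD_BATCH_IN_GIB


def group_record_batches_by_size(
    batches: Iterable[pa.RecordBatch],
) -> Iterator[List[pa.RecordBatch]]:
    group: List[pa.RecordBatch] = []
    group_bytes = 0
    for batch in batches:
        # Flush the current group first if adding this batch would cross the threshold.
        if group and group_bytes + batch.nbytes >= MAX_GROUP_BYTES:
            yield group
            group = []
            group_bytes = 0
        # The current batch is always appended, including the one that triggered a flush,
        # so no batch is dropped between groups.
        group_bytes += batch.nbytes
        group.append(batch)
    if group:
        yield group
```

Guarding the flush on a non-empty group keeps a single oversized batch from producing an empty flush.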
@@ -1,12 +1,17 @@
 import logging
 from typing import Dict, Optional, List, Tuple, Any
 from deltacat import logs
+from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
+from deltacat.compute.compactor_v2.constants import (
+    AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
+)
 from deltacat.compute.compactor_v2.model.merge_file_group import (
     LocalMergeFileGroupsProvider,
 )
 from deltacat.storage import (
     Manifest,
-
+    ManifestEntry,
+    metastore,
 )
 from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
 from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
@@ -72,8 +77,6 @@ def _get_merge_task_options(
     round_completion_info: Optional[RoundCompletionInfo] = None,
     compacted_delta_manifest: Optional[Manifest] = None,
     primary_keys: Optional[List[str]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[Dict] = {},
     memory_logs_enabled: Optional[bool] = None,
 ) -> Dict[str, Any]:
     if (
@@ -81,16 +84,27 @@ def _get_merge_task_options(
         and compacted_delta_manifest
         and round_completion_info.hb_index_to_entry_range
     ):
-
-
-
-
+        logger.debug_conditional(
+            f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
+            memory_logs_enabled,
+        )
+        previous_inflation: float = (
+            (
+                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+                / round_completion_info.compacted_pyarrow_write_result.file_bytes
+            )
+            if round_completion_info.compacted_pyarrow_write_result.file_bytes
+            else PYARROW_INFLATION_MULTIPLIER
         )
         debug_memory_params["previous_inflation"] = previous_inflation

-        average_record_size = (
-
-
+        average_record_size: float = (
+            (
+                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+                / round_completion_info.compacted_pyarrow_write_result.records
+            )
+            if round_completion_info.compacted_pyarrow_write_result.records
+            else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
         )
         debug_memory_params["average_record_size"] = average_record_size

@@ -106,31 +120,36 @@ def _get_merge_task_options(
                 str(hb_idx)
             ]
             for entry_index in range(entry_start, entry_end):
-                entry = compacted_delta_manifest.entries[entry_index]
-
-
-
-
-
+                entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
+                current_entry_size: float = (
+                    estimate_manifest_entry_size_bytes(
+                        entry=entry,
+                        operation_type=OperationType.PYARROW_DOWNLOAD,
+                        estimate_resources_params=estimate_resources_params,
+                    )
+                    or 0.0
                 )
-                current_entry_rows =
-
-
-
+                current_entry_rows: int = (
+                    estimate_manifest_entry_num_rows(
+                        entry=entry,
+                        operation_type=OperationType.PYARROW_DOWNLOAD,
+                        estimate_resources_params=estimate_resources_params,
+                    )
+                    or 0
                 )
-
+                # NOTE: We can treat the current_entry_size and current_entry_rows as 0 as a None estimated entry size implies a 0 value
                 data_size += current_entry_size
                 num_rows += current_entry_rows
-
                 if primary_keys:
-                    pk_size
+                    pk_size: Optional[
+                        float
+                    ] = estimate_manifest_entry_column_size_bytes(
                         entry=entry,
                         columns=primary_keys,
                         operation_type=OperationType.PYARROW_DOWNLOAD,
                         estimate_resources_params=estimate_resources_params,
                     )
-
-                    if pk_size is None:
+                    if not pk_size:
                         pk_size_bytes += current_entry_size
                     else:
                         pk_size_bytes += pk_size
@@ -159,7 +178,6 @@ def _get_merge_task_options(
         f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
         memory_logs_enabled,
     )
-
     return _get_task_options(0.01, total_memory, ray_custom_resources)


@@ -255,8 +273,6 @@ def merge_resource_options_provider(
     compacted_delta_manifest: Optional[Manifest] = None,
     ray_custom_resources: Optional[Dict] = None,
     primary_keys: Optional[List[str]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[Dict] = {},
     memory_logs_enabled: Optional[bool] = None,
     **kwargs,
 ) -> Dict:
@@ -286,8 +302,6 @@ def merge_resource_options_provider(
         round_completion_info=round_completion_info,
         compacted_delta_manifest=compacted_delta_manifest,
         primary_keys=primary_keys,
-        deltacat_storage=deltacat_storage,
-        deltacat_storage_kwargs=deltacat_storage_kwargs,
         memory_logs_enabled=memory_logs_enabled,
         estimate_resources_params=estimate_resources_params,
     )
@@ -302,7 +316,7 @@ def local_merge_resource_options_provider(
     compacted_delta_manifest: Optional[Manifest] = None,
     ray_custom_resources: Optional[Dict] = None,
     primary_keys: Optional[List[str]] = None,
-    deltacat_storage=
+    deltacat_storage=metastore,
     deltacat_storage_kwargs: Optional[Dict] = {},
     memory_logs_enabled: Optional[bool] = None,
     **kwargs,
@@ -328,8 +342,6 @@ def local_merge_resource_options_provider(
         round_completion_info=round_completion_info,
         compacted_delta_manifest=compacted_delta_manifest,
         primary_keys=primary_keys,
-        deltacat_storage=deltacat_storage,
-        deltacat_storage_kwargs=deltacat_storage_kwargs,
         memory_logs_enabled=memory_logs_enabled,
         estimate_resources_params=estimate_resources_params,
     )
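Note: the merge-memory estimation above derives `previous_inflation` and `average_record_size` from the prior round's PyArrow write result, falling back to library defaults when `file_bytes` or `records` is zero or missing. A minimal sketch of that fallback arithmetic (the default values below are placeholders, not DeltaCat's actual constants):

```python
# Minimal sketch of the guarded-division-with-fallback pattern used when sizing merge tasks.
# The two defaults below are placeholder values, not the real DeltaCat constants.
PYARROW_INFLATION_MULTIPLIER = 2.5
DEFAULT_AVERAGE_RECORD_SIZE_BYTES = 64.0


def merge_memory_inputs(pyarrow_bytes: int, file_bytes: int, records: int):
    # In-memory bytes per on-disk byte of the previous compacted result,
    # or the default multiplier when no file bytes were recorded.
    previous_inflation = (
        pyarrow_bytes / file_bytes if file_bytes else PYARROW_INFLATION_MULTIPLIER
    )
    # In-memory bytes per record, or the default when no record count was recorded.
    average_record_size = (
        pyarrow_bytes / records if records else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
    )
    return previous_inflation, average_record_size


# Example: 8 GiB of Arrow data written as 2 GiB of files holding 100M records.
print(merge_memory_inputs(8 * 1024**3, 2 * 1024**3, 100_000_000))
# -> (4.0, 85.89934592)
```

With zero `file_bytes` or `records` (for example, no usable previous round), the estimate degrades to the defaults rather than dividing by zero.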
@@ -1,3 +1,4 @@
+from deltacat.constants import DEFAULT_NAMESPACE
 from deltacat.utils.ray_utils.concurrency import (
     invoke_parallel,
     task_resource_options_provider,
@@ -12,8 +13,7 @@ from deltacat import logs
 from deltacat.compute.converter.model.converter_session_params import (
     ConverterSessionParams,
 )
-
-
+from typing import Dict, List, Any, Callable
 from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
 from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
@@ -31,30 +31,80 @@ from deltacat.compute.converter.pyiceberg.catalog import load_table
 from deltacat.compute.converter.utils.converter_session_utils import (
     group_all_files_to_each_bucket,
 )
+from deltacat.compute.converter.model.convert_result import ConvertResult
+from deltacat.compute.converter.utils.converter_session_utils import (
+    _get_snapshot_action_description,
+    _determine_snapshot_type,
+    SnapshotType,
+)
+
+from pyiceberg.manifest import DataFile
+from pyiceberg.table.metadata import TableMetadata

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


-def converter_session(params: ConverterSessionParams, **kwargs):
+def converter_session(params: ConverterSessionParams, **kwargs: Any) -> TableMetadata:
     """
-    Convert equality
-
-
+    Convert equality deletes to position deletes with option to enforce primary key uniqueness.
+
+    This function processes Iceberg table files to convert equality delete files to position delete files.
+    It can optionally enforce primary key uniqueness by keeping only the latest version of each
+    primary key across all data files.
+
+    **Memory Requirements:**
+    - Minimum 512MB of free memory is required to run the converter
+
+    **Process Overview:**
+    1. Fetches all bucket files (data files, equality deletes, position deletes)
+    2. Groups files by bucket for parallel processing
+    3. Converts equality deletes to position deletes using Ray parallel tasks
+    4. Enforces primary key uniqueness if enabled
+    5. Commits appropriate snapshot (append, replace, or delete) to the Iceberg table
+
+
+    Args:
+        params: ConverterSessionParams containing all configuration parameters
+            - catalog: Iceberg catalog instance
+            - iceberg_table_name: Name of the target Iceberg table
+            - enforce_primary_key_uniqueness: Whether to enforce PK uniqueness
+            - iceberg_warehouse_bucket_name: S3 bucket for Iceberg warehouse
+            - iceberg_namespace: Iceberg namespace
+            - merge_keys: Optional list of merge key fields (uses table identifier fields if not provided)
+            - compact_previous_position_delete_files: Whether to compact existing position delete files
+            - task_max_parallelism: Maximum number of parallel Ray tasks
+            - s3_client_kwargs: Additional S3 client configuration
+            - s3_file_system: S3 file system instance
+            - location_provider_prefix_override: Optional prefix override for file locations
+            - position_delete_for_multiple_data_files: Whether to generate position deletes for multiple data files
+        **kwargs: Additional keyword arguments (currently unused)

+    Raises:
+        Exception: If snapshot commitment fails or other critical errors occur
+
     """

     catalog = params.catalog
     table_name = params.iceberg_table_name
-
+    if "." not in table_name:
+        iceberg_namespace = params.iceberg_namespace or DEFAULT_NAMESPACE
+        table_name = params.iceberg_table_name
+        table_identifier = f"{iceberg_namespace}.{table_name}"
+    else:
+        table_identifier = table_name
+        identifier_parts = table_identifier.split(".")
+        iceberg_namespace = identifier_parts[0]
+        table_name = identifier_parts[1]
+    iceberg_table = load_table(catalog, table_identifier)
     enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
     iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
-    iceberg_namespace = params.iceberg_namespace
     merge_keys = params.merge_keys
     compact_previous_position_delete_files = (
         params.compact_previous_position_delete_files
     )
     task_max_parallelism = params.task_max_parallelism
     s3_client_kwargs = params.s3_client_kwargs
-    s3_file_system = params.
+    s3_file_system = params.filesystem
     location_provider_prefix_override = params.location_provider_prefix_override
     position_delete_for_multiple_data_files = (
         params.position_delete_for_multiple_data_files
@@ -86,7 +136,7 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     else:
         identifier_fields = merge_keys

-    convert_options_provider = functools.partial(
+    convert_options_provider: Callable = functools.partial(
         task_resource_options_provider,
         resource_amount_provider=convert_resource_options_provider,
     )
@@ -98,7 +148,8 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
     max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD

-    def convert_input_provider(index, item):
+    def convert_input_provider(index: int, item: Any) -> Dict[str, ConvertInput]:
+        task_opts = convert_options_provider(index, item)
         return {
             "convert_input": ConvertInput.of(
                 convert_input_files=item,
@@ -112,7 +163,8 @@ def converter_session(params: ConverterSessionParams, **kwargs):
                 position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
                 max_parallel_data_file_download=max_parallel_data_file_download,
                 s3_client_kwargs=s3_client_kwargs,
-
+                filesystem=s3_file_system,
+                task_memory=task_opts["memory"],
             )
         }

@@ -127,10 +179,10 @@ def converter_session(params: ConverterSessionParams, **kwargs):
         kwargs_provider=convert_input_provider,
     )

-    to_be_deleted_files_list = []
+    to_be_deleted_files_list: List[List[DataFile]] = []
     logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")

-    convert_results = ray.get(convert_tasks_pending)
+    convert_results: List[ConvertResult] = ray.get(convert_tasks_pending)
     logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")

     total_position_delete_record_count = sum(
@@ -153,8 +205,36 @@ def converter_session(params: ConverterSessionParams, **kwargs):
         convert_result.position_delete_on_disk_sizes
         for convert_result in convert_results
     )
+    total_input_data_files_on_disk_size = sum(
+        convert_result.input_data_files_on_disk_size
+        for convert_result in convert_results
+    )
+
+    # Calculate memory usage statistics
+    max_peak_memory_usage = max(
+        convert_result.peak_memory_usage_bytes for convert_result in convert_results
+    )
+    avg_memory_usage_percentage = sum(
+        convert_result.memory_usage_percentage for convert_result in convert_results
+    ) / len(convert_results)
+    max_memory_usage_percentage = max(
+        convert_result.memory_usage_percentage for convert_result in convert_results
+    )
+
+    logger.info(
+        f"Aggregated stats for {table_identifier}: "
+        f"total position delete record count: {total_position_delete_record_count}, "
+        f"total input data file record count: {total_input_data_file_record_count}, "
+        f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
+        f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
+        f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}, "
+        f"total input data files on disk size: {total_input_data_files_on_disk_size}, "
+        f"max peak memory usage: {max_peak_memory_usage} bytes, "
+        f"average memory usage percentage: {avg_memory_usage_percentage:.2f}%, "
+        f"max memory usage percentage: {max_memory_usage_percentage:.2f}%"
+    )

-    to_be_added_files_list = []
+    to_be_added_files_list: List[DataFile] = []
     for convert_result in convert_results:
         to_be_added_files = convert_result.to_be_added_files
         to_be_deleted_files = convert_result.to_be_deleted_files
@@ -162,24 +242,57 @@ def converter_session(params: ConverterSessionParams, **kwargs):
         to_be_deleted_files_list.extend(to_be_deleted_files.values())
         to_be_added_files_list.extend(to_be_added_files)

-
-
-
-
-
-
-
-
-
-
+    logger.info(f"To be deleted files list length: {len(to_be_deleted_files_list)}")
+    logger.info(f"To be added files list length: {len(to_be_added_files_list)}")
+
+    # Determine snapshot type and commit
+    snapshot_type = _determine_snapshot_type(
+        to_be_deleted_files_list, to_be_added_files_list
+    )
+
+    if snapshot_type == SnapshotType.NONE:
+        logger.info(
+            _get_snapshot_action_description(
+                snapshot_type, to_be_deleted_files_list, to_be_added_files_list
+            )
         )
+        return
+
     logger.info(
-        f"
-        f"total position delete record count: {total_position_delete_record_count}, "
-        f"total input data file record_count: {total_input_data_file_record_count}, "
-        f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
-        f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
-        f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}."
+        f"Snapshot action: {_get_snapshot_action_description(snapshot_type, to_be_deleted_files_list, to_be_added_files_list)}"
     )

-
+    try:
+        if snapshot_type == SnapshotType.APPEND:
+            logger.info(f"Committing append snapshot for {table_identifier}.")
+            updated_table_metadata = commit_append_snapshot(
+                iceberg_table=iceberg_table,
+                new_position_delete_files=to_be_added_files_list,
+            )
+        elif snapshot_type == SnapshotType.REPLACE:
+            logger.info(f"Committing replace snapshot for {table_identifier}.")
+            updated_table_metadata = commit_replace_snapshot(
+                iceberg_table=iceberg_table,
+                to_be_deleted_files=to_be_deleted_files_list,
+                new_position_delete_files=to_be_added_files_list,
+            )
+        elif snapshot_type == SnapshotType.DELETE:
+            logger.info(f"Committing delete snapshot for {table_identifier}.")
+            updated_table_metadata = commit_replace_snapshot(
+                iceberg_table=iceberg_table,
+                to_be_deleted_files=to_be_deleted_files_list,
+                new_position_delete_files=[],  # No new files to add
+            )
+        else:
+            logger.warning(f"Unexpected snapshot type: {snapshot_type}")
+            return
+
+        logger.info(
+            f"Committed new Iceberg snapshot for {table_identifier}: {updated_table_metadata.current_snapshot_id}"
+        )
+
+        # Return the updated table metadata with the new snapshot
+        return updated_table_metadata
+    except Exception as e:
+        logger.error(f"Failed to commit snapshot for {table_identifier}: {str(e)}")
+        raise