deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -21,14 +21,13 @@ from deltacat.utils.placement import PlacementGroupConfig
|
|
21
21
|
from typing import List, Optional, Dict, Any
|
22
22
|
from deltacat.utils.ray_utils.runtime import live_node_resource_keys
|
23
23
|
from deltacat.compute.compactor.utils import io
|
24
|
-
from deltacat.compute.compactor.utils import round_completion_file as rcf
|
25
24
|
from deltacat.compute.compactor.steps import repartition as repar
|
26
25
|
from deltacat.compute.compactor.steps.repartition import RepartitionType
|
27
26
|
from deltacat.storage import (
|
28
27
|
Delta,
|
29
28
|
DeltaLocator,
|
30
29
|
PartitionLocator,
|
31
|
-
|
30
|
+
metastore,
|
32
31
|
)
|
33
32
|
from deltacat.utils.metrics import MetricsConfig
|
34
33
|
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
|
@@ -41,7 +40,6 @@ def repartition(
|
|
41
40
|
source_partition_locator: PartitionLocator,
|
42
41
|
destination_partition_locator: PartitionLocator,
|
43
42
|
repartition_args: Any,
|
44
|
-
repartition_completion_file_s3_url: str,
|
45
43
|
last_stream_position_to_compact: int,
|
46
44
|
repartition_type: RepartitionType = RepartitionType.RANGE,
|
47
45
|
sort_keys: List[SortKey] = None,
|
@@ -54,9 +52,8 @@ def repartition(
|
|
54
52
|
pg_config: Optional[PlacementGroupConfig] = None,
|
55
53
|
list_deltas_kwargs: Optional[Dict[str, Any]] = None,
|
56
54
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
57
|
-
|
58
|
-
|
59
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
55
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
56
|
+
deltacat_storage=metastore,
|
60
57
|
**kwargs,
|
61
58
|
) -> Optional[str]:
|
62
59
|
|
@@ -132,7 +129,7 @@ def repartition(
|
|
132
129
|
enable_profiler=enable_profiler,
|
133
130
|
metrics_config=metrics_config,
|
134
131
|
read_kwargs_provider=read_kwargs_provider,
|
135
|
-
|
132
|
+
table_writer_kwargs=table_writer_kwargs,
|
136
133
|
repartitioned_file_content_type=repartitioned_file_content_type,
|
137
134
|
deltacat_storage=deltacat_storage,
|
138
135
|
)
|
@@ -153,9 +150,6 @@ def repartition(
|
|
153
150
|
compacted_delta = deltacat_storage.commit_delta(
|
154
151
|
merged_delta, properties=kwargs.get("properties", {})
|
155
152
|
)
|
156
|
-
deltacat_storage.commit_partition(partition)
|
157
|
-
logger.info(f"Committed final delta: {compacted_delta}")
|
158
|
-
logger.info(f"Job run completed successfully!")
|
159
153
|
new_compacted_delta_locator = DeltaLocator.of(
|
160
154
|
new_compacted_partition_locator,
|
161
155
|
compacted_delta.stream_position,
|
@@ -173,14 +167,7 @@ def repartition(
|
|
173
167
|
bit_width_of_sort_keys,
|
174
168
|
None,
|
175
169
|
)
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
None,
|
181
|
-
None,
|
182
|
-
None,
|
183
|
-
repartition_completion_info,
|
184
|
-
repartition_completion_file_s3_url,
|
185
|
-
**s3_client_kwargs,
|
186
|
-
)
|
170
|
+
partition.compaction_round_completion_info = repartition_completion_info
|
171
|
+
deltacat_storage.commit_partition(partition)
|
172
|
+
logger.info(f"Committed final delta: {compacted_delta}")
|
173
|
+
logger.info(f"Job run completed successfully!")
|
@@ -21,7 +21,7 @@ from deltacat.compute.compactor.utils.primary_key_index import (
|
|
21
21
|
group_hash_bucket_indices,
|
22
22
|
group_record_indices_by_hash_bucket,
|
23
23
|
)
|
24
|
-
from deltacat.storage import
|
24
|
+
from deltacat.storage import metastore
|
25
25
|
from deltacat.types.media import StorageType
|
26
26
|
from deltacat.utils.common import sha1_digest
|
27
27
|
from deltacat.utils.ray_utils.runtime import (
|
@@ -90,7 +90,7 @@ def _group_file_records_by_pk_hash_bucket(
|
|
90
90
|
sort_key_names: List[str],
|
91
91
|
is_src_delta: np.bool_ = True,
|
92
92
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
93
|
-
deltacat_storage=
|
93
|
+
deltacat_storage=metastore,
|
94
94
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
95
95
|
**kwargs,
|
96
96
|
) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
|
@@ -139,7 +139,7 @@ def _read_delta_file_envelopes(
|
|
139
139
|
primary_keys: List[str],
|
140
140
|
sort_key_names: List[str],
|
141
141
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
142
|
-
deltacat_storage=
|
142
|
+
deltacat_storage=metastore,
|
143
143
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
144
144
|
**kwargs,
|
145
145
|
) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
|
@@ -190,7 +190,7 @@ def _timed_hash_bucket(
|
|
190
190
|
enable_profiler: bool,
|
191
191
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
192
192
|
object_store: Optional[IObjectStore] = None,
|
193
|
-
deltacat_storage=
|
193
|
+
deltacat_storage=metastore,
|
194
194
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
195
195
|
**kwargs,
|
196
196
|
):
|
@@ -249,7 +249,7 @@ def hash_bucket(
|
|
249
249
|
metrics_config: MetricsConfig,
|
250
250
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
251
251
|
object_store: Optional[IObjectStore],
|
252
|
-
deltacat_storage=
|
252
|
+
deltacat_storage=metastore,
|
253
253
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
254
254
|
**kwargs,
|
255
255
|
) -> HashBucketResult:
|
@@ -29,7 +29,7 @@ from deltacat.storage import (
|
|
29
29
|
ManifestEntryList,
|
30
30
|
)
|
31
31
|
from deltacat.storage.model.manifest import Manifest
|
32
|
-
|
32
|
+
|
33
33
|
from deltacat.utils.common import ReadKwargsProvider
|
34
34
|
from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
|
35
35
|
from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
|
@@ -46,6 +46,7 @@ from deltacat.utils.ray_utils.runtime import (
|
|
46
46
|
)
|
47
47
|
from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
|
48
48
|
from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
|
49
|
+
from deltacat.storage import metastore
|
49
50
|
|
50
51
|
if importlib.util.find_spec("memray"):
|
51
52
|
import memray
|
@@ -67,9 +68,9 @@ def materialize(
|
|
67
68
|
metrics_config: MetricsConfig,
|
68
69
|
schema: Optional[pa.Schema] = None,
|
69
70
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
70
|
-
|
71
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
71
72
|
object_store: Optional[IObjectStore] = None,
|
72
|
-
deltacat_storage=
|
73
|
+
deltacat_storage=metastore,
|
73
74
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
74
75
|
):
|
75
76
|
if deltacat_storage_kwargs is None:
|
@@ -78,11 +79,11 @@ def materialize(
|
|
78
79
|
def _stage_delta_from_manifest_entry_reference_list(
|
79
80
|
manifest_entry_list_reference: List[ManifestEntry],
|
80
81
|
partition: Partition,
|
81
|
-
delta_type: DeltaType = DeltaType.
|
82
|
+
delta_type: DeltaType = DeltaType.APPEND,
|
82
83
|
) -> Delta:
|
83
84
|
assert (
|
84
|
-
delta_type == DeltaType.
|
85
|
-
), "
|
85
|
+
delta_type == DeltaType.APPEND
|
86
|
+
), "Compaction should always produce APPEND deltas for consistent read operations!"
|
86
87
|
manifest = Manifest.of(
|
87
88
|
entries=ManifestEntryList.of(manifest_entry_list_reference),
|
88
89
|
uuid=str(uuid4()),
|
@@ -110,9 +111,10 @@ def materialize(
|
|
110
111
|
deltacat_storage.stage_delta,
|
111
112
|
compacted_table,
|
112
113
|
partition,
|
114
|
+
delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
|
113
115
|
max_records_per_entry=max_records_per_output_file,
|
114
116
|
content_type=compacted_file_content_type,
|
115
|
-
|
117
|
+
table_writer_kwargs=table_writer_kwargs,
|
116
118
|
**deltacat_storage_kwargs,
|
117
119
|
)
|
118
120
|
compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
|
@@ -10,7 +10,7 @@ import ray
|
|
10
10
|
from deltacat import logs
|
11
11
|
from deltacat.compute.compactor import DeltaAnnotated
|
12
12
|
from deltacat.compute.compactor.model.repartition_result import RepartitionResult
|
13
|
-
from deltacat.storage import
|
13
|
+
from deltacat.storage import metastore
|
14
14
|
from deltacat.storage import Partition
|
15
15
|
from deltacat.utils.ray_utils.runtime import (
|
16
16
|
get_current_ray_task_id,
|
@@ -19,7 +19,7 @@ from deltacat.utils.ray_utils.runtime import (
|
|
19
19
|
from deltacat.utils.common import ReadKwargsProvider
|
20
20
|
from deltacat.utils.performance import timed_invocation
|
21
21
|
from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
|
22
|
-
from deltacat.storage import Delta
|
22
|
+
from deltacat.storage import Delta, DeltaType
|
23
23
|
from enum import Enum
|
24
24
|
|
25
25
|
if importlib.util.find_spec("memray"):
|
@@ -56,9 +56,9 @@ def repartition_range(
|
|
56
56
|
destination_partition: Partition,
|
57
57
|
repartition_args: dict,
|
58
58
|
max_records_per_output_file: int,
|
59
|
-
|
59
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
60
60
|
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
|
61
|
-
deltacat_storage=
|
61
|
+
deltacat_storage=metastore,
|
62
62
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
63
63
|
**kwargs,
|
64
64
|
):
|
@@ -144,9 +144,10 @@ def repartition_range(
|
|
144
144
|
partition_delta: Delta = deltacat_storage.stage_delta(
|
145
145
|
partition_table,
|
146
146
|
destination_partition,
|
147
|
+
delta_type=DeltaType.APPEND, # Repartition always produces APPEND deltas
|
147
148
|
max_records_per_entry=max_records_per_output_file,
|
148
149
|
content_type=repartitioned_file_content_type,
|
149
|
-
|
150
|
+
table_writer_kwargs=table_writer_kwargs,
|
150
151
|
**deltacat_storage_kwargs,
|
151
152
|
)
|
152
153
|
partition_deltas.append(partition_delta)
|
@@ -168,9 +169,9 @@ def _timed_repartition(
|
|
168
169
|
max_records_per_output_file: int,
|
169
170
|
enable_profiler: bool,
|
170
171
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
171
|
-
|
172
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
172
173
|
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
|
173
|
-
deltacat_storage=
|
174
|
+
deltacat_storage=metastore,
|
174
175
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
175
176
|
**kwargs,
|
176
177
|
) -> RepartitionResult:
|
@@ -192,7 +193,7 @@ def _timed_repartition(
|
|
192
193
|
destination_partition=destination_partition,
|
193
194
|
repartition_args=repartition_args,
|
194
195
|
max_records_per_output_file=max_records_per_output_file,
|
195
|
-
|
196
|
+
table_writer_kwargs=table_writer_kwargs,
|
196
197
|
repartitioned_file_content_type=repartitioned_file_content_type,
|
197
198
|
deltacat_storage=deltacat_storage,
|
198
199
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
@@ -213,9 +214,9 @@ def repartition(
|
|
213
214
|
enable_profiler: bool,
|
214
215
|
metrics_config: Optional[MetricsConfig],
|
215
216
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
216
|
-
|
217
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
217
218
|
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
|
218
|
-
deltacat_storage=
|
219
|
+
deltacat_storage=metastore,
|
219
220
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
220
221
|
**kwargs,
|
221
222
|
) -> RepartitionResult:
|
@@ -231,7 +232,7 @@ def repartition(
|
|
231
232
|
max_records_per_output_file=max_records_per_output_file,
|
232
233
|
enable_profiler=enable_profiler,
|
233
234
|
read_kwargs_provider=read_kwargs_provider,
|
234
|
-
|
235
|
+
table_writer_kwargs=table_writer_kwargs,
|
235
236
|
repartitioned_file_content_type=repartitioned_file_content_type,
|
236
237
|
deltacat_storage=deltacat_storage,
|
237
238
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
@@ -11,7 +11,7 @@ from deltacat.storage import (
|
|
11
11
|
PartitionLocator,
|
12
12
|
Delta,
|
13
13
|
ManifestEntry,
|
14
|
-
|
14
|
+
metastore,
|
15
15
|
)
|
16
16
|
from deltacat import logs
|
17
17
|
from deltacat.compute.compactor import DeltaAnnotated
|
@@ -31,12 +31,13 @@ def discover_deltas(
|
|
31
31
|
compacted_partition_locator: Optional[PartitionLocator],
|
32
32
|
rebase_source_partition_locator: Optional[PartitionLocator],
|
33
33
|
rebase_source_partition_high_watermark: Optional[int],
|
34
|
-
deltacat_storage=
|
34
|
+
deltacat_storage=metastore,
|
35
35
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
36
36
|
list_deltas_kwargs: Optional[Dict[str, Any]] = {},
|
37
37
|
) -> Tuple[List[Delta], int]:
|
38
38
|
if deltacat_storage_kwargs is None:
|
39
39
|
deltacat_storage_kwargs = {}
|
40
|
+
|
40
41
|
# Source One: new deltas from uncompacted table for incremental compaction or deltas from compacted table for rebase
|
41
42
|
start_position_exclusive = (
|
42
43
|
high_watermark.get(source_partition_locator)
|
@@ -109,7 +110,7 @@ def limit_input_deltas(
|
|
109
110
|
user_hash_bucket_chunk_size: int,
|
110
111
|
input_deltas_stats: Dict[int, DeltaStats],
|
111
112
|
compaction_audit: CompactionSessionAuditInfo,
|
112
|
-
deltacat_storage=
|
113
|
+
deltacat_storage=metastore,
|
113
114
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
114
115
|
**kwargs,
|
115
116
|
) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
|
@@ -272,7 +273,7 @@ def fit_input_deltas(
|
|
272
273
|
cluster_resources: Dict[str, float],
|
273
274
|
compaction_audit: CompactionSessionAuditInfo,
|
274
275
|
hash_bucket_count: Optional[int],
|
275
|
-
deltacat_storage=
|
276
|
+
deltacat_storage=metastore,
|
276
277
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
277
278
|
**kwargs,
|
278
279
|
) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
|
@@ -359,7 +360,7 @@ def _discover_deltas(
|
|
359
360
|
source_partition_locator: PartitionLocator,
|
360
361
|
start_position_exclusive: Optional[int],
|
361
362
|
end_position_inclusive: Optional[int],
|
362
|
-
deltacat_storage=
|
363
|
+
deltacat_storage=metastore,
|
363
364
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
364
365
|
list_deltas_kwargs: Optional[Dict[str, Any]] = {},
|
365
366
|
) -> List[Delta]:
|
@@ -0,0 +1,117 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Optional
|
3
|
+
from deltacat import logs
|
4
|
+
from deltacat.compute.compactor import RoundCompletionInfo
|
5
|
+
from deltacat.storage import PartitionLocator
|
6
|
+
from deltacat.storage.model.partition import Partition
|
7
|
+
from deltacat.utils.metrics import metrics
|
8
|
+
from deltacat.exceptions import PartitionNotFoundError
|
9
|
+
|
10
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
11
|
+
|
12
|
+
|
13
|
+
def _resolve_round_completion_partition(
    destination_partition_locator: PartitionLocator,
    deltacat_storage,
    deltacat_storage_kwargs: dict,
) -> Optional[Partition]:
    """
    Look up the partition whose metafile should carry round completion info.

    Args:
        destination_partition_locator: Locator of the destination partition
        deltacat_storage: Storage implementation
        deltacat_storage_kwargs: Keyword args forwarded to every storage call

    Returns:
        The current partition when it carries round completion info or has no
        previous partition ID, the previous partition otherwise, or None when
        storage returns no current partition.

    Raises:
        PartitionNotFoundError: If the current partition references a previous
            partition ID that storage cannot resolve.
    """
    current_partition: Optional[Partition] = deltacat_storage.get_partition(
        destination_partition_locator.stream_locator,
        destination_partition_locator.partition_values,
        **deltacat_storage_kwargs,
    )
    # NOTE(review): assumes get_partition signals "not found" by returning
    # None - confirm against the storage implementation in use.
    if current_partition is None:
        logger.info(
            f"No partition found for {destination_partition_locator}; "
            f"there is no round completion info to read."
        )
        return None

    # If current partition has round completion info, use it
    if current_partition.compaction_round_completion_info:
        return current_partition

    previous_partition_id = current_partition.previous_partition_id
    if previous_partition_id is None:
        logger.info("No previous partition ID found, using current partition")
        return current_partition

    # For incremental compaction, the previous committed partition is where
    # the round completion info should be, so fetch it by ID.
    logger.info(
        f"Current partition {destination_partition_locator} does not have round completion info, "
        f"getting previous partition with ID: {previous_partition_id}"
    )
    previous_partition = deltacat_storage.get_partition_by_id(
        destination_partition_locator.stream_locator,
        previous_partition_id,
        **deltacat_storage_kwargs,
    )
    if previous_partition is None:
        raise PartitionNotFoundError(
            f"Previous partition with ID {previous_partition_id} not found"
        )
    logger.info(f"Found previous partition: {previous_partition.locator}")
    return previous_partition


@metrics
def read_round_completion_info(
    source_partition_locator: PartitionLocator,
    destination_partition_locator: PartitionLocator,
    deltacat_storage,
    deltacat_storage_kwargs: Optional[dict] = None,
    destination_partition: Optional[Partition] = None,
) -> Optional[RoundCompletionInfo]:
    """
    Read round completion info from the partition metafile.

    This is a best-effort read: any failure is logged at WARNING level and
    reported to the caller as None rather than raised.

    Args:
        source_partition_locator: Source partition locator for validation
        destination_partition_locator: Destination partition locator
        deltacat_storage: Storage implementation
        deltacat_storage_kwargs: Optional storage kwargs
        destination_partition: Optional destination partition to avoid redundant get_partition calls

    Returns:
        RoundCompletionInfo if found in the partition and recorded for the
        given source partition locator, None otherwise (including when the
        lookup or validation fails).
    """
    if not destination_partition_locator:
        return None

    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}

    try:
        # Use provided partition or get it from storage
        partition = destination_partition or _resolve_round_completion_partition(
            destination_partition_locator,
            deltacat_storage,
            deltacat_storage_kwargs,
        )
        if not partition:
            return None

        round_completion_info = partition.compaction_round_completion_info
        if not round_completion_info:
            return None

        # Validate that prev_source_partition_locator matches current source
        prev_source_locator = round_completion_info.prev_source_partition_locator
        if not source_partition_locator or not prev_source_locator:
            raise ValueError(
                f"Source partition locator ({source_partition_locator}) and "
                f"prev_source_partition_locator ({prev_source_locator}) "
                f"must both be provided."
            )

        expected = source_partition_locator.canonical_string()
        found = prev_source_locator.canonical_string()
        if found != expected:
            logger.warning(
                f"Previous source partition locator mismatch: "
                f"expected {expected}, "
                f"but found {found} "
                f"in round completion info. Ignoring cached round completion info."
            )
            return None

        logger.info(
            f"Read round completion info from partition metafile: {round_completion_info}"
        )
        return round_completion_info

    except (PartitionNotFoundError, ValueError) as e:
        # Validation failures raised above: keep the "return None" contract
        # but make them visible at default log levels instead of DEBUG.
        logger.warning(
            f"Ignoring round completion info from partition metafile: {e}"
        )
    except Exception as e:
        # Unexpected failures stay non-fatal, but are logged with a traceback
        # so they can be diagnosed.
        logger.warning(
            f"Failed to read round completion info from partition metafile: {e}",
            exc_info=True,
        )

    return None
|
@@ -294,7 +294,9 @@ def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table
|
|
294
294
|
|
295
295
|
|
296
296
|
def delta_type_to_field(delta_type: DeltaType) -> bool:
|
297
|
-
|
297
|
+
# For deduplication purposes, treat both UPSERT and APPEND as UPSERT (True)
|
298
|
+
# Only DELETE should be treated as DELETE (False)
|
299
|
+
return delta_type is not DeltaType.DELETE
|
298
300
|
|
299
301
|
|
300
302
|
def delta_type_from_field(delta_type_field: bool) -> DeltaType:
|
@@ -14,7 +14,6 @@ from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
|
|
14
14
|
ExecutionCompactionResult,
|
15
15
|
)
|
16
16
|
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
17
|
-
from deltacat.compute.compactor.utils import round_completion_file as rcf
|
18
17
|
from deltacat.compute.compactor import DeltaAnnotated
|
19
18
|
from deltacat.compute.compactor_v2.deletes.delete_strategy import (
|
20
19
|
DeleteStrategy,
|
@@ -27,6 +26,7 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
|
|
27
26
|
from deltacat.storage import (
|
28
27
|
Delta,
|
29
28
|
DeltaLocator,
|
29
|
+
PartitionLocator,
|
30
30
|
)
|
31
31
|
from deltacat.storage.model.manifest import Manifest
|
32
32
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
@@ -36,13 +36,14 @@ from deltacat.utils.resources import (
|
|
36
36
|
get_current_process_peak_memory_usage_in_bytes,
|
37
37
|
)
|
38
38
|
from deltacat.compute.compactor_v2.private.compaction_utils import (
|
39
|
+
_get_rci_source_partition_locator,
|
39
40
|
_fetch_compaction_metadata,
|
40
41
|
_build_uniform_deltas,
|
41
42
|
_group_uniform_deltas,
|
42
43
|
_stage_new_partition,
|
43
44
|
_run_hash_and_merge,
|
44
45
|
_process_merge_results,
|
45
|
-
|
46
|
+
_create_round_completion_info,
|
46
47
|
_commit_compaction_result,
|
47
48
|
)
|
48
49
|
from deltacat.utils.metrics import metrics
|
@@ -64,24 +65,26 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
64
65
|
|
65
66
|
@metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
|
66
67
|
@categorize_errors
|
67
|
-
def compact_partition(params: CompactPartitionParams, **kwargs) ->
|
68
|
+
def compact_partition(params: CompactPartitionParams, **kwargs) -> None:
|
68
69
|
assert (
|
69
70
|
params.hash_bucket_count is not None and params.hash_bucket_count >= 1
|
70
71
|
), "hash_bucket_count is a required arg for compactor v2"
|
72
|
+
assert type(params.hash_bucket_count) is int, "Hash bucket count must be an integer"
|
71
73
|
if params.num_rounds > 1:
|
72
74
|
assert (
|
73
75
|
not params.drop_duplicates
|
74
76
|
), "num_rounds > 1, drop_duplicates must be False but is True"
|
75
77
|
|
76
|
-
with
|
77
|
-
"compaction_partition.bin"
|
78
|
-
|
78
|
+
with (
|
79
|
+
memray.Tracker("compaction_partition.bin")
|
80
|
+
if params.enable_profiler
|
81
|
+
else nullcontext()
|
82
|
+
):
|
79
83
|
execute_compaction_result: ExecutionCompactionResult = _execute_compaction(
|
80
84
|
params,
|
81
85
|
**kwargs,
|
82
86
|
)
|
83
87
|
_commit_compaction_result(params, execute_compaction_result)
|
84
|
-
return execute_compaction_result.round_completion_file_s3_url
|
85
88
|
|
86
89
|
|
87
90
|
def _execute_compaction(
|
@@ -96,12 +99,12 @@ def _execute_compaction(
|
|
96
99
|
previous_compacted_delta_manifest,
|
97
100
|
round_completion_info,
|
98
101
|
) = fetch_compaction_metadata_result
|
99
|
-
|
100
|
-
params
|
102
|
+
rci_source_partition_locator: PartitionLocator = _get_rci_source_partition_locator(
|
103
|
+
params
|
101
104
|
)
|
102
105
|
|
103
|
-
base_audit_url: str =
|
104
|
-
f"
|
106
|
+
base_audit_url: str = rci_source_partition_locator.path(
|
107
|
+
f"{params.compaction_artifact_path}/compaction-audit"
|
105
108
|
)
|
106
109
|
audit_url: str = f"{base_audit_url}.json"
|
107
110
|
logger.info(f"Compaction audit will be written to {audit_url}")
|
@@ -136,7 +139,7 @@ def _execute_compaction(
|
|
136
139
|
)
|
137
140
|
if not input_deltas:
|
138
141
|
logger.info("No input deltas found to compact.")
|
139
|
-
return ExecutionCompactionResult(None, None,
|
142
|
+
return ExecutionCompactionResult(None, None, False)
|
140
143
|
build_uniform_deltas_result: tuple[
|
141
144
|
List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
|
142
145
|
] = _build_uniform_deltas(
|
@@ -199,13 +202,13 @@ def _execute_compaction(
|
|
199
202
|
|
200
203
|
compaction_audit.save_round_completion_stats(mat_results)
|
201
204
|
|
202
|
-
compaction_result: ExecutionCompactionResult =
|
205
|
+
compaction_result: ExecutionCompactionResult = _create_round_completion_info(
|
203
206
|
params,
|
204
207
|
compaction_audit,
|
205
208
|
compacted_partition,
|
206
209
|
audit_url,
|
207
210
|
hb_id_to_entry_indices_range,
|
208
|
-
|
211
|
+
rci_source_partition_locator,
|
209
212
|
new_compacted_delta_locator,
|
210
213
|
pyarrow_write_result,
|
211
214
|
round_completion_info,
|
@@ -1,3 +1,5 @@
|
|
1
|
+
from deltacat.utils.common import env_bool, env_integer, env_string
|
2
|
+
|
1
3
|
TOTAL_BYTES_IN_SHA1_HASH = 20
|
2
4
|
|
3
5
|
PK_DELIMITER = "L6kl7u5f"
|
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
|
|
31
33
|
# The total size, in bytes (despite the _IN_GIB suffix), of records that will be hash bucketed at once
|
32
34
|
# Since sorting is O(n log n), we ensure that it is not performed
|
33
35
|
# on a very large dataset for best performance.
|
34
|
-
MAX_SIZE_OF_RECORD_BATCH_IN_GIB =
|
36
|
+
MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
|
37
|
+
"MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
|
38
|
+
)
|
35
39
|
|
36
40
|
# Whether to drop duplicates during merge.
|
37
41
|
DROP_DUPLICATES = True
|
@@ -78,3 +82,28 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
|
|
78
82
|
# Number of rounds to run hash/merge for a single
|
79
83
|
# partition. (For large table support)
|
80
84
|
DEFAULT_NUM_ROUNDS = 1
|
85
|
+
|
86
|
+
# Whether to perform sha1 hashing when required to
|
87
|
+
# optimize memory. For example, hashing is always
|
88
|
+
# required for bucketing, whereas it's not mandatory
|
89
|
+
# when dropping duplicates. Setting this to True
|
90
|
+
# will disable sha1 hashing in cases where it isn't
|
91
|
+
# mandatory. This flag is False by default.
|
92
|
+
SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
|
93
|
+
"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
|
94
|
+
)
|
95
|
+
|
96
|
+
# This env variable specifies whether to check bucketing spec
|
97
|
+
# compliance of the existing compacted table.
|
98
|
+
# PRINT_LOG: Enable logging if any partition is found
|
99
|
+
# to be non-compliant with the bucketing spec.
|
100
|
+
# ASSERT: Fail the job with ValidationError if the
|
101
|
+
# current compacted partition is found to be non-compliant
|
102
|
+
# with bucketing spec. Note, logging is implicitly enabled
|
103
|
+
# in this case.
|
104
|
+
BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
|
105
|
+
"BUCKETING_SPEC_COMPLIANCE_PROFILE", None
|
106
|
+
)
|
107
|
+
|
108
|
+
BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
|
109
|
+
BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
|
@@ -13,7 +13,6 @@ from typing import Optional
|
|
13
13
|
class ExecutionCompactionResult:
|
14
14
|
new_compacted_partition: Optional[Partition]
|
15
15
|
new_round_completion_info: Optional[RoundCompletionInfo]
|
16
|
-
round_completion_file_s3_url: Optional[str]
|
17
16
|
is_inplace_compacted: bool
|
18
17
|
|
19
18
|
def __iter__(self):
|
@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Any
|
|
4
4
|
from deltacat.utils.metrics import MetricsConfig
|
5
5
|
from deltacat.utils.common import ReadKwargsProvider
|
6
6
|
from deltacat.io.object_store import IObjectStore
|
7
|
-
from deltacat.storage import
|
7
|
+
from deltacat.storage import metastore
|
8
8
|
from deltacat.compute.compactor import DeltaAnnotated
|
9
9
|
|
10
10
|
|
@@ -15,12 +15,13 @@ class HashBucketInput(Dict):
|
|
15
15
|
primary_keys: List[str],
|
16
16
|
num_hash_buckets: int,
|
17
17
|
num_hash_groups: int,
|
18
|
+
all_column_names: List[str],
|
18
19
|
hb_task_index: Optional[int] = 0,
|
19
20
|
enable_profiler: Optional[bool] = False,
|
20
21
|
metrics_config: Optional[MetricsConfig] = None,
|
21
22
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
22
23
|
object_store: Optional[IObjectStore] = None,
|
23
|
-
deltacat_storage=
|
24
|
+
deltacat_storage=metastore,
|
24
25
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
25
26
|
memory_logs_enabled: Optional[bool] = None,
|
26
27
|
) -> HashBucketInput:
|
@@ -31,6 +32,7 @@ class HashBucketInput(Dict):
|
|
31
32
|
result["hb_task_index"] = hb_task_index
|
32
33
|
result["num_hash_buckets"] = num_hash_buckets
|
33
34
|
result["num_hash_groups"] = num_hash_groups
|
35
|
+
result["all_column_names"] = all_column_names
|
34
36
|
result["enable_profiler"] = enable_profiler
|
35
37
|
result["metrics_config"] = metrics_config
|
36
38
|
result["read_kwargs_provider"] = read_kwargs_provider
|
@@ -61,6 +63,10 @@ class HashBucketInput(Dict):
|
|
61
63
|
def num_hash_groups(self) -> int:
|
62
64
|
return self["num_hash_groups"]
|
63
65
|
|
66
|
+
@property
|
67
|
+
def all_column_names(self) -> List[str]:
|
68
|
+
return self["all_column_names"]
|
69
|
+
|
64
70
|
@property
|
65
71
|
def enable_profiler(self) -> Optional[bool]:
|
66
72
|
return self.get("enable_profiler")
|
@@ -78,7 +84,7 @@ class HashBucketInput(Dict):
|
|
78
84
|
return self.get("object_store")
|
79
85
|
|
80
86
|
@property
|
81
|
-
def deltacat_storage(self) ->
|
87
|
+
def deltacat_storage(self) -> metastore:
|
82
88
|
return self.get("deltacat_storage")
|
83
89
|
|
84
90
|
@property
|