deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,8 @@ import logging
|
|
5
5
|
import ray
|
6
6
|
import time
|
7
7
|
import json
|
8
|
-
|
8
|
+
import posixpath
|
9
|
+
from deltacat.utils.filesystem import resolve_path_and_filesystem
|
9
10
|
import deltacat
|
10
11
|
from deltacat import logs
|
11
12
|
import pyarrow as pa
|
@@ -25,7 +26,7 @@ from deltacat.storage import (
|
|
25
26
|
DeltaLocator,
|
26
27
|
Partition,
|
27
28
|
PartitionLocator,
|
28
|
-
|
29
|
+
metastore,
|
29
30
|
)
|
30
31
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
31
32
|
CompactPartitionParams,
|
@@ -40,7 +41,7 @@ from deltacat.compute.compactor.steps import dedupe as dd
|
|
40
41
|
from deltacat.compute.compactor.steps import hash_bucket as hb
|
41
42
|
from deltacat.compute.compactor.steps import materialize as mat
|
42
43
|
from deltacat.compute.compactor.utils import io
|
43
|
-
from deltacat.compute.compactor.utils import
|
44
|
+
from deltacat.compute.compactor.utils import round_completion_reader as rci
|
44
45
|
|
45
46
|
from deltacat.types.media import ContentType
|
46
47
|
from deltacat.utils.placement import PlacementGroupConfig
|
@@ -65,13 +66,37 @@ DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1
|
|
65
66
|
DEFAULT_PROPERTIES_ARG: Dict[str, Any] = {}
|
66
67
|
|
67
68
|
|
69
|
+
def _upload_audit_data(url: str, content: str, **kwargs) -> None:
|
70
|
+
"""
|
71
|
+
Upload audit data to the specified URL using filesystem-agnostic operations.
|
72
|
+
"""
|
73
|
+
try:
|
74
|
+
path, filesystem = resolve_path_and_filesystem(url)
|
75
|
+
|
76
|
+
# Create parent directories if they don't exist
|
77
|
+
parent_dir = posixpath.dirname(path)
|
78
|
+
if parent_dir:
|
79
|
+
try:
|
80
|
+
filesystem.create_dir(parent_dir, recursive=True)
|
81
|
+
except Exception as dir_error:
|
82
|
+
# Directory might already exist, which is fine
|
83
|
+
logger.debug(
|
84
|
+
f"Directory creation warning for {parent_dir}: {dir_error}"
|
85
|
+
)
|
86
|
+
|
87
|
+
with filesystem.open_output_stream(path) as stream:
|
88
|
+
stream.write(content.encode("utf-8"))
|
89
|
+
except Exception as e:
|
90
|
+
logger.warning(f"Failed to upload audit data to {url}: {e}")
|
91
|
+
|
92
|
+
|
68
93
|
def check_preconditions(
|
69
94
|
source_partition_locator: PartitionLocator,
|
70
95
|
destination_partition_locator: PartitionLocator,
|
71
96
|
sort_keys: List[SortKey],
|
72
97
|
max_records_per_output_file: int,
|
73
98
|
new_hash_bucket_count: Optional[int],
|
74
|
-
deltacat_storage=
|
99
|
+
deltacat_storage=metastore,
|
75
100
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
76
101
|
**kwargs,
|
77
102
|
) -> int:
|
@@ -104,7 +129,7 @@ def compact_partition(
|
|
104
129
|
source_partition_locator: PartitionLocator,
|
105
130
|
destination_partition_locator: PartitionLocator,
|
106
131
|
primary_keys: Set[str],
|
107
|
-
|
132
|
+
compaction_artifact_path: str,
|
108
133
|
last_stream_position_to_compact: int,
|
109
134
|
*,
|
110
135
|
hash_bucket_count: Optional[int] = None,
|
@@ -123,37 +148,29 @@ def compact_partition(
|
|
123
148
|
metrics_config: Optional[MetricsConfig] = None,
|
124
149
|
list_deltas_kwargs: Optional[Dict[str, Any]] = None,
|
125
150
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
126
|
-
|
151
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
127
152
|
object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
|
128
|
-
|
129
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
153
|
+
deltacat_storage=metastore,
|
130
154
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
131
155
|
**kwargs,
|
132
|
-
) ->
|
156
|
+
) -> None:
|
133
157
|
if deltacat_storage_kwargs is None:
|
134
158
|
deltacat_storage_kwargs = {}
|
135
159
|
if not importlib.util.find_spec("memray"):
|
136
160
|
logger.info(f"memray profiler not available, disabling all profiling")
|
137
161
|
enable_profiler = False
|
138
162
|
|
139
|
-
if s3_client_kwargs is None:
|
140
|
-
s3_client_kwargs = {}
|
141
|
-
|
142
163
|
# memray official documentation link:
|
143
164
|
# https://bloomberg.github.io/memray/getting_started.html
|
144
165
|
with memray.Tracker(
|
145
166
|
f"compaction_partition.bin"
|
146
167
|
) if enable_profiler else nullcontext():
|
147
168
|
partition = None
|
148
|
-
(
|
149
|
-
new_partition,
|
150
|
-
new_rci,
|
151
|
-
new_rcf_partition_locator,
|
152
|
-
) = _execute_compaction_round(
|
169
|
+
(new_partition, new_rci,) = _execute_compaction_round(
|
153
170
|
source_partition_locator,
|
154
171
|
destination_partition_locator,
|
155
172
|
primary_keys,
|
156
|
-
|
173
|
+
compaction_artifact_path,
|
157
174
|
last_stream_position_to_compact,
|
158
175
|
hash_bucket_count,
|
159
176
|
sort_keys,
|
@@ -169,9 +186,8 @@ def compact_partition(
|
|
169
186
|
metrics_config,
|
170
187
|
list_deltas_kwargs,
|
171
188
|
read_kwargs_provider,
|
172
|
-
|
189
|
+
table_writer_kwargs,
|
173
190
|
object_store,
|
174
|
-
s3_client_kwargs,
|
175
191
|
deltacat_storage,
|
176
192
|
deltacat_storage_kwargs,
|
177
193
|
**kwargs,
|
@@ -182,30 +198,23 @@ def compact_partition(
|
|
182
198
|
logger.info(
|
183
199
|
f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed"
|
184
200
|
)
|
185
|
-
round_completion_file_s3_url = None
|
186
201
|
if partition:
|
187
202
|
logger.info(f"Committing compacted partition to: {partition.locator}")
|
203
|
+
# Set the round completion info on the partition before committing
|
204
|
+
partition.compaction_round_completion_info = new_rci
|
188
205
|
partition = deltacat_storage.commit_partition(
|
189
|
-
partition,
|
206
|
+
partition,
|
207
|
+
**deltacat_storage_kwargs,
|
190
208
|
)
|
191
209
|
logger.info(f"Committed compacted partition: {partition}")
|
192
|
-
|
193
|
-
round_completion_file_s3_url = rcf.write_round_completion_file(
|
194
|
-
compaction_artifact_s3_bucket,
|
195
|
-
new_rcf_partition_locator,
|
196
|
-
partition.locator,
|
197
|
-
new_rci,
|
198
|
-
**s3_client_kwargs,
|
199
|
-
)
|
200
210
|
logger.info(f"Completed compaction session for: {source_partition_locator}")
|
201
|
-
return round_completion_file_s3_url
|
202
211
|
|
203
212
|
|
204
213
|
def _execute_compaction_round(
|
205
214
|
source_partition_locator: PartitionLocator,
|
206
215
|
destination_partition_locator: PartitionLocator,
|
207
216
|
primary_keys: Set[str],
|
208
|
-
|
217
|
+
compaction_artifact_path: str,
|
209
218
|
last_stream_position_to_compact: int,
|
210
219
|
hash_bucket_count: Optional[int],
|
211
220
|
sort_keys: List[SortKey],
|
@@ -221,24 +230,25 @@ def _execute_compaction_round(
|
|
221
230
|
metrics_config: Optional[MetricsConfig],
|
222
231
|
list_deltas_kwargs: Optional[Dict[str, Any]],
|
223
232
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
224
|
-
|
233
|
+
table_writer_kwargs: Optional[Dict[str, Any]],
|
225
234
|
object_store: Optional[IObjectStore],
|
226
|
-
|
227
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
235
|
+
deltacat_storage=metastore,
|
228
236
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
229
237
|
**kwargs,
|
230
|
-
) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo]
|
238
|
+
) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo]]:
|
231
239
|
if deltacat_storage_kwargs is None:
|
232
240
|
deltacat_storage_kwargs = {}
|
233
|
-
|
241
|
+
rci_source_partition_locator = (
|
234
242
|
rebase_source_partition_locator
|
235
243
|
if rebase_source_partition_locator
|
236
244
|
else source_partition_locator
|
237
245
|
)
|
238
|
-
|
239
|
-
|
246
|
+
# Construct audit URL using filesystem-agnostic path joining
|
247
|
+
audit_url = posixpath.join(
|
248
|
+
compaction_artifact_path,
|
249
|
+
"compaction-audit.json",
|
250
|
+
f"{rci_source_partition_locator.hexdigest()}.json",
|
240
251
|
)
|
241
|
-
audit_url = f"{base_audit_url}.json"
|
242
252
|
|
243
253
|
logger.info(f"Compaction audit will be written to {audit_url}")
|
244
254
|
|
@@ -312,11 +322,11 @@ def _execute_compaction_round(
|
|
312
322
|
# read the results from any previously completed compaction round
|
313
323
|
round_completion_info = None
|
314
324
|
if not rebase_source_partition_locator:
|
315
|
-
round_completion_info =
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
325
|
+
round_completion_info = rci.read_round_completion_info(
|
326
|
+
source_partition_locator=source_partition_locator,
|
327
|
+
destination_partition_locator=destination_partition_locator,
|
328
|
+
deltacat_storage=deltacat_storage,
|
329
|
+
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
320
330
|
)
|
321
331
|
if not round_completion_info:
|
322
332
|
logger.info(
|
@@ -363,15 +373,11 @@ def _execute_compaction_round(
|
|
363
373
|
delta_discovery_end - delta_discovery_start
|
364
374
|
)
|
365
375
|
|
366
|
-
|
367
|
-
compaction_audit.audit_url,
|
368
|
-
str(json.dumps(compaction_audit)),
|
369
|
-
**s3_client_kwargs,
|
370
|
-
)
|
376
|
+
_upload_audit_data(audit_url, json.dumps(compaction_audit))
|
371
377
|
|
372
378
|
if not input_deltas:
|
373
379
|
logger.info("No input deltas found to compact.")
|
374
|
-
return None, None
|
380
|
+
return None, None
|
375
381
|
|
376
382
|
# limit the input deltas to fit on this cluster and convert them to
|
377
383
|
# annotated deltas of equivalent size for easy parallel distribution
|
@@ -464,11 +470,7 @@ def _execute_compaction_round(
|
|
464
470
|
hb_end - hb_start,
|
465
471
|
)
|
466
472
|
|
467
|
-
|
468
|
-
compaction_audit.audit_url,
|
469
|
-
str(json.dumps(compaction_audit)),
|
470
|
-
**s3_client_kwargs,
|
471
|
-
)
|
473
|
+
_upload_audit_data(audit_url, json.dumps(compaction_audit))
|
472
474
|
|
473
475
|
all_hash_group_idx_to_obj_id = defaultdict(list)
|
474
476
|
for hb_result in hb_results:
|
@@ -485,9 +487,9 @@ def _execute_compaction_round(
|
|
485
487
|
)
|
486
488
|
|
487
489
|
compaction_audit.set_input_records(total_hb_record_count.item())
|
488
|
-
# TODO
|
489
|
-
#
|
490
|
-
#
|
490
|
+
# TODO(pdames): when resources are freed during the last round of hash bucketing,
|
491
|
+
# start running dedupe tasks that read hash bucket output from storage then
|
492
|
+
# wait for hash bucketing to finish before continuing
|
491
493
|
|
492
494
|
# create a new stream for this round
|
493
495
|
compacted_stream_locator = destination_partition_locator.stream_locator
|
@@ -497,6 +499,7 @@ def _execute_compaction_round(
|
|
497
499
|
compacted_stream_locator.table_version,
|
498
500
|
**deltacat_storage_kwargs,
|
499
501
|
)
|
502
|
+
|
500
503
|
partition = deltacat_storage.stage_partition(
|
501
504
|
stream,
|
502
505
|
destination_partition_locator.partition_values,
|
@@ -571,9 +574,9 @@ def _execute_compaction_round(
|
|
571
574
|
logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
|
572
575
|
|
573
576
|
compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
|
574
|
-
# TODO(pdames): when resources are freed during the last round of deduping
|
577
|
+
# TODO(pdames): when resources are freed during the last round of deduping,
|
575
578
|
# start running materialize tasks that read materialization source file
|
576
|
-
# tables from
|
579
|
+
# tables from storage then wait for deduping to finish before continuing
|
577
580
|
|
578
581
|
# TODO(pdames): balance inputs to materialization tasks to ensure that each
|
579
582
|
# task has an approximately equal amount of input to materialize
|
@@ -584,11 +587,7 @@ def _execute_compaction_round(
|
|
584
587
|
# parallel step 3:
|
585
588
|
# materialize records to keep by index
|
586
589
|
|
587
|
-
|
588
|
-
compaction_audit.audit_url,
|
589
|
-
str(json.dumps(compaction_audit)),
|
590
|
-
**s3_client_kwargs,
|
591
|
-
)
|
590
|
+
_upload_audit_data(audit_url, json.dumps(compaction_audit))
|
592
591
|
|
593
592
|
materialize_start = time.monotonic()
|
594
593
|
mat_tasks_pending = invoke_parallel(
|
@@ -610,7 +609,7 @@ def _execute_compaction_round(
|
|
610
609
|
enable_profiler=enable_profiler,
|
611
610
|
metrics_config=metrics_config,
|
612
611
|
read_kwargs_provider=read_kwargs_provider,
|
613
|
-
|
612
|
+
table_writer_kwargs=table_writer_kwargs,
|
614
613
|
object_store=object_store,
|
615
614
|
deltacat_storage=deltacat_storage,
|
616
615
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
@@ -693,11 +692,7 @@ def _execute_compaction_round(
|
|
693
692
|
telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
|
694
693
|
)
|
695
694
|
|
696
|
-
|
697
|
-
compaction_audit.audit_url,
|
698
|
-
str(json.dumps(compaction_audit)),
|
699
|
-
**s3_client_kwargs,
|
700
|
-
)
|
695
|
+
_upload_audit_data(audit_url, json.dumps(compaction_audit))
|
701
696
|
|
702
697
|
new_round_completion_info = RoundCompletionInfo.of(
|
703
698
|
last_stream_position_compacted,
|
@@ -710,6 +705,7 @@ def _execute_compaction_round(
|
|
710
705
|
hash_bucket_count,
|
711
706
|
None,
|
712
707
|
CompactorVersion.V1.value,
|
708
|
+
prev_source_partition_locator=rci_source_partition_locator,
|
713
709
|
)
|
714
710
|
|
715
711
|
logger.info(
|
@@ -721,17 +717,43 @@ def _execute_compaction_round(
|
|
721
717
|
return (
|
722
718
|
partition,
|
723
719
|
new_round_completion_info,
|
724
|
-
rcf_source_partition_locator,
|
725
720
|
)
|
726
721
|
|
727
722
|
|
728
723
|
def compact_partition_from_request(
    compact_partition_params: CompactPartitionParams,
    *compact_partition_pos_args,
) -> None:
    """
    Run compact_partition using settings held in a CompactPartitionParams object.

    The five leading positional arguments passed to compact_partition are read
    from attributes of `compact_partition_params`; every remaining entry of the
    params mapping is forwarded as a keyword argument.

    :param compact_partition_params: dictionary-like compaction parameters.
    :param compact_partition_pos_args: extra positional arguments, passed after
        the five leading arguments.
    """
    params = compact_partition_params
    leading_args = (
        params.source_partition_locator,
        params.destination_partition_locator,
        params.primary_keys,
        params.compaction_artifact_path,
        params.last_stream_position_to_compact,
    )

    # Entries already supplied positionally must not be repeated as keywords.
    # compaction_artifact_path is a computed property rather than a stored key,
    # so there is nothing to strip for it.
    positional_keys = (
        "source_partition_locator",
        "destination_partition_locator",
        "primary_keys",
        "last_stream_position_to_compact",
    )
    keyword_args = {
        key: value
        for key, value in dict(params).items()
        if key not in positional_keys
    }

    compact_partition(*leading_args, *compact_partition_pos_args, **keyword_args)
|
@@ -2,17 +2,19 @@ from __future__ import annotations
|
|
2
2
|
import importlib
|
3
3
|
import copy
|
4
4
|
import json
|
5
|
-
|
5
|
+
import posixpath
|
6
|
+
from typing import Any, Dict, List, Optional, Set
|
6
7
|
from deltacat.io.object_store import IObjectStore
|
7
8
|
from deltacat.utils.common import ReadKwargsProvider
|
8
9
|
from deltacat.types.media import ContentType
|
9
10
|
from deltacat.utils.placement import PlacementGroupConfig
|
10
11
|
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
|
11
12
|
from deltacat.storage import (
|
12
|
-
|
13
|
+
metastore,
|
13
14
|
PartitionLocator,
|
14
15
|
SortKey,
|
15
16
|
)
|
17
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
16
18
|
from deltacat.compute.resource_estimation import (
|
17
19
|
ResourceEstimationMethod,
|
18
20
|
EstimateResourcesParams,
|
@@ -52,11 +54,22 @@ class CompactPartitionParams(dict):
|
|
52
54
|
assert (
|
53
55
|
params.get("source_partition_locator") is not None
|
54
56
|
), "source_partition_locator is a required arg"
|
57
|
+
assert params.get("catalog") is not None, "catalog is a required arg"
|
55
58
|
assert (
|
56
|
-
params.get("
|
57
|
-
), "
|
59
|
+
params.get("all_column_names") is not None
|
60
|
+
), "all_column_names is a required arg"
|
58
61
|
|
59
62
|
result = CompactPartitionParams(params)
|
63
|
+
assert (
|
64
|
+
result.destination_partition_locator.partition_id
|
65
|
+
), "destination_partition_locator must have a globally unique partition_id"
|
66
|
+
assert (
|
67
|
+
result.source_partition_locator.partition_id
|
68
|
+
), "source_partition_locator must have a globally unique partition_id"
|
69
|
+
if result.rebase_source_partition_locator:
|
70
|
+
assert (
|
71
|
+
result.rebase_source_partition_locator.partition_id
|
72
|
+
), "rebase_source_partition_locator must have a globally unique partition_id"
|
60
73
|
|
61
74
|
result.records_per_compacted_file = params.get(
|
62
75
|
"records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
|
@@ -65,15 +78,18 @@ class CompactPartitionParams(dict):
|
|
65
78
|
"compacted_file_content_type", ContentType.PARQUET
|
66
79
|
)
|
67
80
|
result.object_store = params.get("object_store", RayPlasmaObjectStore())
|
81
|
+
result.table_writer_kwargs = params.get("table_writer_kwargs", {})
|
68
82
|
|
69
83
|
result.enable_profiler = params.get("enable_profiler", False)
|
70
|
-
result.deltacat_storage = params.get(
|
71
|
-
|
72
|
-
)
|
73
|
-
result.s3_client_kwargs = params.get("s3_client_kwargs", {})
|
84
|
+
result.deltacat_storage = params.get("deltacat_storage", metastore)
|
85
|
+
result.catalog = params.get("catalog")
|
74
86
|
result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
|
75
87
|
result.list_deltas_kwargs = params.get("list_deltas_kwargs", {})
|
76
|
-
result.
|
88
|
+
result.all_column_names = params.get("all_column_names")
|
89
|
+
|
90
|
+
# Add catalog to deltacat_storage_kwargs
|
91
|
+
result.deltacat_storage_kwargs["catalog"] = result.catalog
|
92
|
+
|
77
93
|
result.bit_width_of_sort_keys = validate_sort_keys(
|
78
94
|
result.source_partition_locator,
|
79
95
|
result.sort_keys,
|
@@ -133,6 +149,8 @@ class CompactPartitionParams(dict):
|
|
133
149
|
if result.primary_keys:
|
134
150
|
result.primary_keys = sorted(result.primary_keys)
|
135
151
|
|
152
|
+
result.original_fields = params.get("original_fields")
|
153
|
+
|
136
154
|
# assertions
|
137
155
|
assert (
|
138
156
|
result.source_partition_locator.partition_values
|
@@ -177,21 +195,32 @@ class CompactPartitionParams(dict):
|
|
177
195
|
self["source_partition_locator"] = locator
|
178
196
|
|
179
197
|
@property
def compaction_artifact_path(self) -> str:
    """
    Compaction artifact path derived from the catalog root.

    Joins ``self.catalog.root`` with ``compute`` and ``compactor`` using
    POSIX path separators.
    """
    path_parts = (self.catalog.root, "compute", "compactor")
    return posixpath.join(*path_parts)
|
186
203
|
|
187
204
|
@property
def deltacat_storage(self) -> metastore:
    """Return the value stored under the ``"deltacat_storage"`` key."""
    storage_impl = self["deltacat_storage"]
    return storage_impl

@deltacat_storage.setter
def deltacat_storage(self, storage: metastore) -> None:
    """Store ``storage`` under the ``"deltacat_storage"`` key."""
    self["deltacat_storage"] = storage
|
194
211
|
|
212
|
+
@property
def catalog(self) -> CatalogProperties:
    """Return the catalog properties stored under the ``"catalog"`` key."""
    return self["catalog"]

@catalog.setter
def catalog(self, catalog: CatalogProperties) -> None:
    """
    Store ``catalog`` and mirror it into ``deltacat_storage_kwargs``.

    The storage kwargs dict is created when it is missing or ``None`` so the
    mirrored ``"catalog"`` entry can always be written. An existing dict is
    updated in place.

    :param catalog: catalog properties to store.
    """
    self["catalog"] = catalog
    storage_kwargs = self.get("deltacat_storage_kwargs")
    if storage_kwargs is None:
        # Covers both an absent key and an explicit None value; the latter
        # would otherwise raise TypeError on the item assignment below.
        storage_kwargs = {}
        self["deltacat_storage_kwargs"] = storage_kwargs
    storage_kwargs["catalog"] = catalog
|
223
|
+
|
195
224
|
@property
|
196
225
|
def object_store(self) -> IObjectStore:
|
197
226
|
return self["object_store"]
|
@@ -286,14 +315,6 @@ class CompactPartitionParams(dict):
|
|
286
315
|
def list_deltas_kwargs(self, kwargs: dict) -> None:
|
287
316
|
self["list_deltas_kwargs"] = kwargs
|
288
317
|
|
289
|
-
@property
|
290
|
-
def s3_table_writer_kwargs(self) -> dict:
|
291
|
-
return self["s3_table_writer_kwargs"]
|
292
|
-
|
293
|
-
@s3_table_writer_kwargs.setter
|
294
|
-
def s3_table_writer_kwargs(self, kwargs: dict) -> None:
|
295
|
-
self["s3_table_writer_kwargs"] = kwargs
|
296
|
-
|
297
318
|
@property
|
298
319
|
def deltacat_storage_kwargs(self) -> dict:
|
299
320
|
return self["deltacat_storage_kwargs"]
|
@@ -303,12 +324,12 @@ class CompactPartitionParams(dict):
|
|
303
324
|
self["deltacat_storage_kwargs"] = kwargs
|
304
325
|
|
305
326
|
@property
def all_column_names(self) -> List[str]:
    """Return the ``"all_column_names"`` entry, or ``None`` when it is absent."""
    column_names = self.get("all_column_names")
    return column_names

@all_column_names.setter
def all_column_names(self, column_names: List[str]) -> None:
    """Store ``column_names`` under the ``"all_column_names"`` key."""
    self["all_column_names"] = column_names
|
312
333
|
|
313
334
|
@property
|
314
335
|
def records_per_compacted_file(self) -> int:
|
@@ -489,6 +510,30 @@ class CompactPartitionParams(dict):
|
|
489
510
|
average_record_size_bytes=self.average_record_size_bytes,
|
490
511
|
)
|
491
512
|
|
513
|
+
@property
def table_writer_kwargs(self) -> dict:
    """Return the dict stored under the ``"table_writer_kwargs"`` key."""
    writer_kwargs = self["table_writer_kwargs"]
    return writer_kwargs

@table_writer_kwargs.setter
def table_writer_kwargs(self, kwargs: dict) -> None:
    """Store ``kwargs`` under the ``"table_writer_kwargs"`` key."""
    self["table_writer_kwargs"] = kwargs
|
520
|
+
|
521
|
+
@property
def expected_previous_partition_id(self) -> Optional[str]:
    """Return the ``"expected_previous_partition_id"`` entry, or ``None`` when absent."""
    previous_id = self.get("expected_previous_partition_id")
    return previous_id

@expected_previous_partition_id.setter
def expected_previous_partition_id(self, partition_id: Optional[str]) -> None:
    """Store ``partition_id`` under the ``"expected_previous_partition_id"`` key."""
    self["expected_previous_partition_id"] = partition_id
|
528
|
+
|
529
|
+
@property
def original_fields(self) -> Optional[Set[str]]:
    """Return the ``"original_fields"`` entry, or ``None`` when absent."""
    field_names = self.get("original_fields")
    return field_names

@original_fields.setter
def original_fields(self, fields: Optional[Set[str]]) -> None:
    """Store ``fields`` under the ``"original_fields"`` key."""
    self["original_fields"] = fields
|
536
|
+
|
492
537
|
@staticmethod
|
493
538
|
def json_handler_for_compact_partition_params(obj):
|
494
539
|
"""
|
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
from typing import Optional
|
4
4
|
import pyarrow as pa
|
5
5
|
import logging
|
6
|
+
from pathlib import PosixPath
|
6
7
|
from deltacat import logs
|
7
8
|
from typing import List, Union
|
8
9
|
from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
|
@@ -919,3 +920,19 @@ class CompactionSessionAuditInfo(dict):
|
|
919
920
|
)
|
920
921
|
|
921
922
|
self.set_pyarrow_version(pa.__version__)
|
923
|
+
|
924
|
+
def to_serializable(self, catalog_root: str) -> CompactionSessionAuditInfo:
    """
    Return a copy of this audit info whose audit URL is relative to the catalog root.

    :param catalog_root: root path that ``self.audit_url`` is expected to sit beneath.
    :return: a new CompactionSessionAuditInfo built from this object's entries,
        with ``"auditUrl"`` replaced by the root-relative path.
    :raises ValueError: if the audit URL equals the root or is not located under it.
    """
    # Local import: only lexical path handling is needed here. PurePosixPath
    # compares and relativizes exactly like PosixPath but can be instantiated
    # on every platform, whereas PosixPath cannot be created on Windows.
    from pathlib import PurePosixPath

    root_path = PurePosixPath(catalog_root)
    target_path = PurePosixPath(self.audit_url)
    if root_path == target_path:
        raise ValueError(
            "Target and root are identical, but expected target to be a child of root."
        )
    try:
        relative_path = target_path.relative_to(root_path)
    except ValueError as err:
        # Keep the original cause attached for easier debugging.
        raise ValueError("Expected target to be a child of root.") from err

    # Copy outside the try block so a ValueError raised while building the
    # copy is not misreported as a path problem.
    audit_copy = CompactionSessionAuditInfo(**dict(self))
    audit_copy["auditUrl"] = str(relative_path)
    return audit_copy
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
from typing import
|
4
|
+
from typing import Tuple, Union
|
5
5
|
from deltacat.storage import DeltaLocator, PartitionLocator
|
6
6
|
from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
|
7
7
|
from typing import Any, Dict, Optional
|
@@ -10,7 +10,7 @@ from typing import Any, Dict, Optional
|
|
10
10
|
class HighWatermark(dict):
|
11
11
|
"""
|
12
12
|
Inherit from dict to make it easy for serialization/deserialization.
|
13
|
-
Keep both partition locator and high watermark as a tuple to be persisted in the
|
13
|
+
Keep both partition locator and high watermark as a tuple to be persisted in the rci
|
14
14
|
"""
|
15
15
|
|
16
16
|
def set(self, partition_locator: PartitionLocator, delta_stream_position: int):
|
@@ -46,6 +46,7 @@ class RoundCompletionInfo(dict):
|
|
46
46
|
compactor_version: Optional[str] = None,
|
47
47
|
input_inflation: Optional[float] = None,
|
48
48
|
input_average_record_size_bytes: Optional[float] = None,
|
49
|
+
prev_source_partition_locator: Optional[PartitionLocator] = None,
|
49
50
|
) -> RoundCompletionInfo:
|
50
51
|
|
51
52
|
rci = RoundCompletionInfo()
|
@@ -63,6 +64,7 @@ class RoundCompletionInfo(dict):
|
|
63
64
|
rci["compactorVersion"] = compactor_version
|
64
65
|
rci["inputInflation"] = input_inflation
|
65
66
|
rci["inputAverageRecordSizeBytes"] = input_average_record_size_bytes
|
67
|
+
rci["prevSourcePartitionLocator"] = prev_source_partition_locator
|
66
68
|
return rci
|
67
69
|
|
68
70
|
@property
|
@@ -100,7 +102,11 @@ class RoundCompletionInfo(dict):
|
|
100
102
|
|
101
103
|
@property
def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
    """
    Return the ``"rebaseSourcePartitionLocator"`` entry as a PartitionLocator.

    A stored value that is not already a PartitionLocator is wrapped in one
    and written back so later reads skip the conversion. ``None`` is returned
    when the entry is missing or ``None``.
    """
    key = "rebaseSourcePartitionLocator"
    locator = self.get(key)
    if locator is None or isinstance(locator, PartitionLocator):
        return locator
    converted = PartitionLocator(locator)
    self[key] = converted  # cache the converted value
    return converted
|
104
110
|
|
105
111
|
@property
|
106
112
|
def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
|
@@ -129,6 +135,10 @@ class RoundCompletionInfo(dict):
|
|
129
135
|
def input_average_record_size_bytes(self) -> Optional[float]:
|
130
136
|
return self.get("inputAverageRecordSizeBytes")
|
131
137
|
|
132
|
-
@
|
133
|
-
def
|
134
|
-
|
138
|
+
@property
def prev_source_partition_locator(self) -> Optional[PartitionLocator]:
    """
    Return the ``"prevSourcePartitionLocator"`` entry as a PartitionLocator.

    A stored value that is not already a PartitionLocator is wrapped in one
    and written back so later reads skip the conversion. ``None`` is returned
    when the entry is missing or ``None``.
    """
    key = "prevSourcePartitionLocator"
    locator = self.get(key)
    if locator is None or isinstance(locator, PartitionLocator):
        return locator
    converted = PartitionLocator(locator)
    self[key] = converted  # cache the converted value
    return converted
|