deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,17 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
import
|
4
|
+
import json
|
5
5
|
import posixpath
|
6
6
|
|
7
7
|
import pyarrow
|
8
|
-
import pyarrow as pa
|
9
8
|
|
10
|
-
from typing import Any, Dict, List, Optional
|
9
|
+
from typing import Any, Dict, List, Optional, TYPE_CHECKING
|
11
10
|
|
12
11
|
from deltacat.storage.model.metafile import Metafile, MetafileRevisionInfo
|
13
|
-
from deltacat.constants import
|
12
|
+
from deltacat.constants import TXN_DIR_NAME
|
14
13
|
from deltacat.storage.model.schema import (
|
15
14
|
FieldLocator,
|
16
|
-
Schema,
|
17
15
|
)
|
18
16
|
from deltacat.storage.model.locator import (
|
19
17
|
Locator,
|
@@ -33,38 +31,50 @@ from deltacat.storage.model.types import (
|
|
33
31
|
)
|
34
32
|
from deltacat.types.media import ContentType
|
35
33
|
|
34
|
+
if TYPE_CHECKING:
|
35
|
+
from deltacat.compute.compactor import RoundCompletionInfo
|
36
|
+
|
36
37
|
|
37
38
|
"""
|
38
39
|
An ordered list of partition values. Partition values are typically derived
|
39
40
|
by applying one or more transforms to a table's fields.
|
40
41
|
"""
|
41
42
|
PartitionValues = List[Any]
|
43
|
+
|
44
|
+
"""
|
45
|
+
Constants for special partition types.
|
46
|
+
"""
|
47
|
+
UNPARTITIONED_SCHEME_NAME = "unpartitioned_scheme"
|
42
48
|
UNPARTITIONED_SCHEME_ID = "deadbeef-7277-49a4-a195-fdc8ed235d42"
|
49
|
+
UNKNOWN_PARTITION_ID = "deadbeef-2fe7-4557-82c9-da53b1862003" # a partition ID that is assumed to exist but is not known
|
50
|
+
UNSPECIFIED_PARTITION_ID = "deadbeef-5bff-41ea-b82c-e531f445632b" # a partition ID that has been left intentionally unspecified
|
43
51
|
|
44
52
|
|
45
53
|
class Partition(Metafile):
|
46
54
|
@staticmethod
|
47
55
|
def of(
|
48
56
|
locator: Optional[PartitionLocator],
|
49
|
-
schema: Optional[Schema],
|
50
57
|
content_types: Optional[List[ContentType]],
|
51
58
|
state: Optional[CommitState] = None,
|
52
59
|
previous_stream_position: Optional[int] = None,
|
53
60
|
previous_partition_id: Optional[str] = None,
|
54
61
|
stream_position: Optional[int] = None,
|
55
62
|
partition_scheme_id: Optional[str] = None,
|
63
|
+
compaction_round_completion_info: Optional[RoundCompletionInfo] = None,
|
56
64
|
) -> Partition:
|
57
65
|
partition = Partition()
|
58
66
|
partition.locator = locator
|
59
|
-
partition.schema = schema
|
60
67
|
partition.content_types = content_types
|
61
68
|
partition.state = state
|
62
69
|
partition.previous_stream_position = previous_stream_position
|
63
70
|
partition.previous_partition_id = previous_partition_id
|
64
71
|
partition.stream_position = stream_position
|
65
72
|
partition.partition_scheme_id = (
|
66
|
-
partition_scheme_id
|
73
|
+
partition_scheme_id
|
74
|
+
if locator and locator.partition_values
|
75
|
+
else UNPARTITIONED_SCHEME_ID
|
67
76
|
)
|
77
|
+
partition.compaction_round_completion_info = compaction_round_completion_info
|
68
78
|
return partition
|
69
79
|
|
70
80
|
@property
|
@@ -82,17 +92,6 @@ class Partition(Metafile):
|
|
82
92
|
def locator_alias(self) -> Optional[PartitionLocatorAlias]:
|
83
93
|
return PartitionLocatorAlias.of(self)
|
84
94
|
|
85
|
-
@property
|
86
|
-
def schema(self) -> Optional[Schema]:
|
87
|
-
val: Dict[str, Any] = self.get("schema")
|
88
|
-
if val is not None and not isinstance(val, Schema):
|
89
|
-
self.schema = val = Schema(val)
|
90
|
-
return val
|
91
|
-
|
92
|
-
@schema.setter
|
93
|
-
def schema(self, schema: Optional[Schema]) -> None:
|
94
|
-
self["schema"] = schema
|
95
|
-
|
96
95
|
@property
|
97
96
|
def content_types(self) -> Optional[List[ContentType]]:
|
98
97
|
content_types = self.get("contentTypes")
|
@@ -149,6 +148,27 @@ class Partition(Metafile):
|
|
149
148
|
def partition_scheme_id(self, partition_scheme_id: Optional[str]) -> None:
|
150
149
|
self["partitionSchemeId"] = partition_scheme_id
|
151
150
|
|
151
|
+
@property
|
152
|
+
def compaction_round_completion_info(self) -> Optional[RoundCompletionInfo]:
|
153
|
+
"""
|
154
|
+
Round completion info for compaction operations.
|
155
|
+
This replaces the need for separate round completion files.
|
156
|
+
"""
|
157
|
+
val: Dict[str, Any] = self.get("compactionRoundCompletionInfo")
|
158
|
+
if val is not None:
|
159
|
+
# Import here to avoid circular imports
|
160
|
+
from deltacat.compute.compactor import RoundCompletionInfo
|
161
|
+
|
162
|
+
if not isinstance(val, RoundCompletionInfo):
|
163
|
+
self["compactionRoundCompletionInfo"] = val = RoundCompletionInfo(val)
|
164
|
+
return val
|
165
|
+
|
166
|
+
@compaction_round_completion_info.setter
|
167
|
+
def compaction_round_completion_info(
|
168
|
+
self, compaction_round_completion_info: Optional[RoundCompletionInfo]
|
169
|
+
) -> None:
|
170
|
+
self["compactionRoundCompletionInfo"] = compaction_round_completion_info
|
171
|
+
|
152
172
|
@property
|
153
173
|
def partition_id(self) -> Optional[str]:
|
154
174
|
partition_locator = self.locator
|
@@ -175,6 +195,7 @@ class Partition(Metafile):
|
|
175
195
|
partition_locator = self.locator
|
176
196
|
if partition_locator:
|
177
197
|
return partition_locator.partition_values
|
198
|
+
return None
|
178
199
|
|
179
200
|
@property
|
180
201
|
def namespace_locator(self) -> Optional[NamespaceLocator]:
|
@@ -232,6 +253,13 @@ class Partition(Metafile):
|
|
232
253
|
return partition_locator.table_version
|
233
254
|
return None
|
234
255
|
|
256
|
+
def url(self, catalog_name: Optional[str] = None) -> str:
|
257
|
+
return (
|
258
|
+
f"dc://{catalog_name}/{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/{json.dumps(self.partition_values)}/"
|
259
|
+
if catalog_name
|
260
|
+
else f"table://{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/{json.dumps(self.partition_values)}/"
|
261
|
+
)
|
262
|
+
|
235
263
|
def is_supported_content_type(self, content_type: ContentType) -> bool:
|
236
264
|
supported_content_types = self.content_types
|
237
265
|
return (not supported_content_types) or (
|
@@ -240,14 +268,6 @@ class Partition(Metafile):
|
|
240
268
|
|
241
269
|
def to_serializable(self) -> Partition:
|
242
270
|
serializable: Partition = Partition.update_for(self)
|
243
|
-
if serializable.schema:
|
244
|
-
schema_bytes = serializable.schema.serialize().to_pybytes()
|
245
|
-
serializable.schema = (
|
246
|
-
base64.b64encode(schema_bytes).decode("utf-8")
|
247
|
-
if METAFILE_FORMAT == METAFILE_FORMAT_JSON
|
248
|
-
else schema_bytes
|
249
|
-
)
|
250
|
-
|
251
271
|
if serializable.table_locator:
|
252
272
|
# replace the mutable table locator
|
253
273
|
serializable.table_version_locator.table_locator = TableLocator.at(
|
@@ -261,17 +281,6 @@ class Partition(Metafile):
|
|
261
281
|
path: str,
|
262
282
|
filesystem: Optional[pyarrow.fs.FileSystem] = None,
|
263
283
|
) -> Partition:
|
264
|
-
if self.get("schema"):
|
265
|
-
schema_data = self["schema"]
|
266
|
-
schema_bytes = (
|
267
|
-
base64.b64decode(schema_data)
|
268
|
-
if METAFILE_FORMAT == METAFILE_FORMAT_JSON
|
269
|
-
else schema_data
|
270
|
-
)
|
271
|
-
self["schema"] = Schema.deserialize(pa.py_buffer(schema_bytes))
|
272
|
-
else:
|
273
|
-
self["schema"] = None
|
274
|
-
|
275
284
|
# restore the table locator from its mapped immutable metafile ID
|
276
285
|
if self.table_locator and self.table_locator.table_name == self.id:
|
277
286
|
parent_rev_dir_path = Metafile._parent_metafile_rev_dir_path(
|
@@ -358,7 +367,7 @@ class PartitionLocator(Locator, dict):
|
|
358
367
|
stream_id,
|
359
368
|
stream_format,
|
360
369
|
)
|
361
|
-
if
|
370
|
+
if stream_format or stream_id
|
362
371
|
else None
|
363
372
|
)
|
364
373
|
return PartitionLocator.of(
|
@@ -392,7 +401,9 @@ class PartitionLocator(Locator, dict):
|
|
392
401
|
|
393
402
|
@partition_values.setter
|
394
403
|
def partition_values(self, partition_values: Optional[PartitionValues]) -> None:
|
395
|
-
self["partitionValues"] =
|
404
|
+
self["partitionValues"] = (
|
405
|
+
partition_values or None
|
406
|
+
) # normalize empty partition values to None
|
396
407
|
|
397
408
|
@property
|
398
409
|
def partition_id(self) -> Optional[str]:
|
@@ -468,6 +479,12 @@ class PartitionKey(dict):
|
|
468
479
|
transform: Optional[Transform] = None,
|
469
480
|
native_object: Optional[Any] = None,
|
470
481
|
) -> PartitionKey:
|
482
|
+
if (
|
483
|
+
len(key) > 1
|
484
|
+
and transform is not None
|
485
|
+
and not transform.is_multi_field_transform
|
486
|
+
):
|
487
|
+
raise ValueError(f"{len(key)} keys given for 1-key transform.")
|
471
488
|
return PartitionKey(
|
472
489
|
{
|
473
490
|
"key": key,
|
@@ -536,6 +553,10 @@ class PartitionKeyList(List[PartitionKey]):
|
|
536
553
|
self[item] = val = PartitionKey(val)
|
537
554
|
return val
|
538
555
|
|
556
|
+
def __iter__(self):
|
557
|
+
for i in range(len(self)):
|
558
|
+
yield self[i] # This triggers __getitem__ conversion
|
559
|
+
|
539
560
|
|
540
561
|
class PartitionScheme(dict):
|
541
562
|
@staticmethod
|
@@ -545,6 +566,40 @@ class PartitionScheme(dict):
|
|
545
566
|
scheme_id: Optional[str] = None,
|
546
567
|
native_object: Optional[Any] = None,
|
547
568
|
) -> PartitionScheme:
|
569
|
+
# Validate keys if provided
|
570
|
+
if keys is not None:
|
571
|
+
# Check for empty keys list
|
572
|
+
if len(keys) == 0:
|
573
|
+
raise ValueError("Partition scheme cannot have empty keys list")
|
574
|
+
|
575
|
+
# Check for duplicate keys (by field locators and transform types) and names
|
576
|
+
seen_key_transform_pairs = set()
|
577
|
+
seen_names = set()
|
578
|
+
for key in keys:
|
579
|
+
# Check for duplicate field locators with identical transform types
|
580
|
+
key_tuple = tuple(key.key) if key.key else ()
|
581
|
+
transform_type = type(key.transform) if key.transform else None
|
582
|
+
key_transform_pair = (key_tuple, transform_type)
|
583
|
+
|
584
|
+
if key_transform_pair in seen_key_transform_pairs:
|
585
|
+
# Use the first field locator for the error message
|
586
|
+
key_name = key.key[0] if key.key else "unknown"
|
587
|
+
transform_name = (
|
588
|
+
transform_type.__name__ if transform_type else "None"
|
589
|
+
)
|
590
|
+
raise ValueError(
|
591
|
+
f"Duplicate partition key found: {key_name} with transform type {transform_name}"
|
592
|
+
)
|
593
|
+
seen_key_transform_pairs.add(key_transform_pair)
|
594
|
+
|
595
|
+
# Check for duplicate names (when specified)
|
596
|
+
if key.name is not None:
|
597
|
+
if key.name in seen_names:
|
598
|
+
raise ValueError(
|
599
|
+
f"Duplicate partition key name found: {key.name}"
|
600
|
+
)
|
601
|
+
seen_names.add(key.name)
|
602
|
+
|
548
603
|
return PartitionScheme(
|
549
604
|
{
|
550
605
|
"keys": keys,
|
@@ -565,6 +620,15 @@ class PartitionScheme(dict):
|
|
565
620
|
return False
|
566
621
|
if not isinstance(other, PartitionScheme):
|
567
622
|
other = PartitionScheme(other)
|
623
|
+
# If both have None keys, they are equivalent (for unpartitioned schemes)
|
624
|
+
if self.keys is None and other.keys is None:
|
625
|
+
return not check_identifiers or (
|
626
|
+
self.name == other.name and self.id == other.id
|
627
|
+
)
|
628
|
+
# If only one has None keys, they are not equivalent
|
629
|
+
if self.keys is None or other.keys is None:
|
630
|
+
return False
|
631
|
+
# Compare keys if both have them
|
568
632
|
for i in range(len(self.keys)):
|
569
633
|
if not self.keys[i].equivalent_to(other.keys[i], check_identifiers):
|
570
634
|
return False
|
@@ -592,6 +656,13 @@ class PartitionScheme(dict):
|
|
592
656
|
return self.get("nativeObject")
|
593
657
|
|
594
658
|
|
659
|
+
UNPARTITIONED_SCHEME = PartitionScheme.of(
|
660
|
+
keys=None,
|
661
|
+
name=UNPARTITIONED_SCHEME_NAME,
|
662
|
+
scheme_id=UNPARTITIONED_SCHEME_ID,
|
663
|
+
)
|
664
|
+
|
665
|
+
|
595
666
|
class PartitionSchemeList(List[PartitionScheme]):
|
596
667
|
@staticmethod
|
597
668
|
def of(items: List[PartitionScheme]) -> PartitionSchemeList:
|
@@ -608,6 +679,10 @@ class PartitionSchemeList(List[PartitionScheme]):
|
|
608
679
|
self[item] = val = PartitionScheme(val)
|
609
680
|
return val
|
610
681
|
|
682
|
+
def __iter__(self):
|
683
|
+
for i in range(len(self)):
|
684
|
+
yield self[i] # This triggers __getitem__ conversion
|
685
|
+
|
611
686
|
|
612
687
|
class PartitionLocatorAliasName(LocatorName):
|
613
688
|
def __init__(self, locator: PartitionLocatorAlias):
|
@@ -639,8 +714,8 @@ class PartitionLocatorAlias(Locator, dict):
|
|
639
714
|
),
|
640
715
|
}
|
641
716
|
)
|
642
|
-
if parent_partition.state
|
643
|
-
else None #
|
717
|
+
if parent_partition.state != CommitState.STAGED
|
718
|
+
else None # staged partitions cannot be resolved by alias
|
644
719
|
)
|
645
720
|
|
646
721
|
@property
|