deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/catalog/main/impl.py
@@ -0,0 +1,2882 @@
+ from typing import Any, Dict, List, Optional, Union, Tuple, Set
+ import logging
+ from collections import defaultdict
+
+ import numpy as np
+ import pyarrow as pa
+ import pandas as pd
+ import daft
+ import deltacat as dc
+
+ from deltacat.storage.model.manifest import ManifestAuthor
+ from deltacat.catalog.model.properties import CatalogProperties
+ from deltacat.exceptions import (
+     NamespaceAlreadyExistsError,
+     TableAlreadyExistsError,
+     TableVersionNotFoundError,
+     TableNotFoundError,
+     TableVersionAlreadyExistsError,
+     TableValidationError,
+     SchemaValidationError,
+ )
+ from deltacat.catalog.model.table_definition import TableDefinition
+ from deltacat.storage.model.sort_key import SortScheme
+ from deltacat.storage.model.list_result import ListResult
+ from deltacat.storage.model.namespace import Namespace, NamespaceProperties
+ from deltacat.storage.model.schema import (
+     Schema,
+     SchemaUpdate,
+ )
+ from deltacat.storage.model.table import TableProperties, Table
+ from deltacat.storage.model.types import (
+     Dataset,
+     LifecycleState,
+     StreamFormat,
+     SchemaConsistencyType,
+ )
+ from deltacat.storage.model.partition import (
+     Partition,
+     PartitionLocator,
+     PartitionScheme,
+ )
+ from deltacat.storage.model.table_version import (
+     TableVersion,
+     TableVersionProperties,
+ )
+ from deltacat.storage.model.types import DeltaType
+ from deltacat.storage import Delta
+ from deltacat.storage.model.types import CommitState
+ from deltacat.storage.model.transaction import (
+     Transaction,
+     setup_transaction,
+ )
+ from deltacat.types.media import (
+     ContentType,
+     DatasetType,
+     StorageType,
+     SCHEMA_CONTENT_TYPES,
+ )
+ from deltacat.types.tables import (
+     SchemaEvolutionMode,
+     TableProperty,
+     TablePropertyDefaultValues,
+     TableReadOptimizationLevel,
+     TableWriteMode,
+     get_dataset_type,
+     get_table_schema,
+     get_table_column_names,
+     from_pyarrow,
+     concat_tables,
+     empty_table,
+     infer_table_schema,
+     to_pandas,
+ )
+ from deltacat.utils import pyarrow as pa_utils
+ from deltacat.utils.reader_compatibility_mapping import get_compatible_readers
+ from deltacat.utils.pyarrow import get_base_arrow_type_name
+ from deltacat import logs
+ from deltacat.constants import DEFAULT_NAMESPACE
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+ """
+ Default Catalog interface implementation using DeltaCAT native storage.
+
+ The functions here should not be invoked directly, but should instead be
+ invoked through `delegate.py` (e.g., to support passing catalogs by name, and
+ to ensure that each initialized `Catalog` implementation has its `inner`
+ property set to the `CatalogProperties` returned from `initialize()`).
+
+ The `CatalogProperties` instance returned by `initialize()` contains all
+ durable state required to deterministically reconstruct the associated DeltaCAT
+ native `Catalog` implementation (e.g., the root URI for the catalog metastore).
+ """
+
+
+ # catalog functions
+ def initialize(
+     config: Optional[CatalogProperties] = None,
+     *args,
+     **kwargs,
+ ) -> CatalogProperties:
+     """
+     Performs any required one-time initialization and validation of this
+     catalog implementation based on the input configuration. If no config
+     instance is given, a new `CatalogProperties` instance is constructed
+     using the given keyword arguments.
+
+     Returns the input config if given, and the newly created config otherwise.
+     """
+     if config is not None:
+         if not isinstance(config, CatalogProperties):
+             raise ValueError(
+                 f"Expected `CatalogProperties` but found `{type(config)}`."
+             )
+         return config
+     else:
+         return CatalogProperties(*args, **kwargs)
+
+
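For illustration, a minimal usage sketch of the `initialize` entry point above. It assumes `CatalogProperties` can be constructed with no arguments (its constructor parameters, such as the catalog root URI, are defined elsewhere in this release), and it calls the implementation module directly even though the module docstring notes that callers normally go through `delegate.py`:

    # Illustrative sketch only; CatalogProperties constructor arguments are assumed.
    from deltacat.catalog.model.properties import CatalogProperties
    from deltacat.catalog.main import impl as catalog_impl

    props = catalog_impl.initialize(CatalogProperties())
    assert isinstance(props, CatalogProperties)  # initialize() returns the config it was given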
+ # table functions
+ def _validate_write_mode_and_table_existence(
+     table: str,
+     namespace: str,
+     mode: TableWriteMode,
+     **kwargs,
+ ) -> bool:
+     """Validate write mode against table existence and return whether table exists."""
+     table_exists_flag = table_exists(
+         table,
+         namespace=namespace,
+         **kwargs,
+     )
+     logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
+
+     if mode == TableWriteMode.CREATE and table_exists_flag:
+         raise ValueError(
+             f"Table {namespace}.{table} already exists and mode is CREATE."
+         )
+     elif (
+         mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
+         and not table_exists_flag
+     ):
+         raise TableNotFoundError(
+             f"Table {namespace}.{table} does not exist and mode is {mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()}. Use CREATE or AUTO mode to create a new table."
+         )
+
+     return table_exists_flag
+
+
+ def _get_table_and_validate_write_mode(
+     table: str,
+     namespace: str,
+     table_version: Optional[str],
+     mode: TableWriteMode,
+     **kwargs,
+ ) -> Tuple[bool, TableDefinition]:
+     """Validate write mode against table and table version existence.
+
+     Returns:
+         Tuple of (table_exists_flag, table_definition)
+     """
+     # First validate table, table version, and stream existence
+     existing_table_def = get_table(
+         table,
+         namespace=namespace,
+         table_version=table_version,
+         **kwargs,
+     )
+     table_exists_flag = (
+         existing_table_def is not None
+         and existing_table_def.table_version
+         and existing_table_def.stream
+     )
+     logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
+
+     # Then validate table existence constraints
+     if mode == TableWriteMode.CREATE and table_exists_flag and table_version is None:
+         raise TableAlreadyExistsError(
+             f"Table {namespace}.{table} already exists and mode is CREATE."
+         )
+     elif (
+         mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
+         and existing_table_def is None
+     ):
+         raise TableNotFoundError(
+             f"Table {namespace}.{table} does not exist and write mode is {mode}. Use CREATE or AUTO mode to create a new table."
+         )
+
+     # Then validate table version existence constraints
+     if table_version is not None and table_exists_flag:
+         if mode == TableWriteMode.CREATE:
+             raise TableVersionAlreadyExistsError(
+                 f"Table version {namespace}.{table}.{table_version} already exists and mode is CREATE."
+             )
+         logger.info(f"Table version ({namespace}.{table}.{table_version}) exists.")
+     elif (
+         mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
+         and table_version is not None
+         and not table_exists_flag
+     ):
+         raise TableVersionNotFoundError(
+             f"Table version {namespace}.{table}.{table_version} does not exist and write mode is {mode}. "
+             f"Use CREATE or AUTO mode to create a new table version, or omit table_version "
+             f"to use the latest version."
+         )
+     return table_exists_flag, existing_table_def
+
+
+ def _validate_content_type_against_supported_content_types(
+     namespace: str,
+     table: str,
+     content_type: ContentType,
+     supported_content_types: Optional[List[ContentType]],
+ ) -> None:
+     if supported_content_types and content_type not in supported_content_types:
+         raise ValueError(
+             f"Content type proposed for write to table {namespace}.{table} ({content_type}) "
+             f"conflicts with the proposed list of new supported content types: {supported_content_types}"
+         )
+
+
+ def _create_table_for_write(
+     data: Dataset,
+     table: str,
+     namespace: str,
+     table_version: Optional[str],
+     content_type: ContentType,
+     existing_table_definition: Optional[TableDefinition],
+     *args,
+     **kwargs,
+ ) -> TableDefinition:
+     """Creates a new table, table version, and/or stream in preparation for a write operation."""
+     if "schema" not in kwargs:
+         kwargs["schema"] = infer_table_schema(data)
+
+     _validate_content_type_against_supported_content_types(
+         namespace,
+         table,
+         content_type,
+         kwargs.get("content_types"),
+     )
+     return create_table(
+         table,
+         namespace=namespace,
+         table_version=table_version,
+         existing_table_definition=existing_table_definition,
+         *args,
+         **kwargs,
+     )
+
+
+ def write_to_table(
+     data: Dataset,
+     table: str,
+     *args,
+     namespace: Optional[str] = None,
+     table_version: Optional[str] = None,
+     mode: TableWriteMode = TableWriteMode.AUTO,
+     content_type: ContentType = ContentType.PARQUET,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> None:
+     """Write local or distributed data to a table. Raises an error if the
+     table does not exist and the table write mode is not CREATE or AUTO.
+
+     When creating a table, all `create_table` parameters may be optionally
+     specified as additional keyword arguments. When appending to, or replacing,
+     an existing table, all `alter_table` parameters may be optionally specified
+     as additional keyword arguments.
+
+     Args:
+         data: Local or distributed data to write to the table.
+         table: Name of the table to write to.
+         namespace: Optional namespace for the table. Uses default if not specified.
+         table_version: Optional version of the table to write to. If specified,
+             will create this version if it doesn't exist (in CREATE mode) or
+             get this version if it exists (in other modes). If not specified,
+             uses the latest version.
+         mode: Write mode (AUTO, CREATE, APPEND, REPLACE, MERGE, DELETE).
+         content_type: Content type used to write the data files. Defaults to PARQUET.
+         transaction: Optional transaction to append write operations to instead of
+             creating and committing a new transaction.
+         **kwargs: Additional keyword arguments.
+     """
+     namespace = namespace or default_namespace()
+
+     # Set up transaction handling
+     write_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = write_transaction
+
+     try:
+         # Validate write mode and table/table version/stream existence
+         (table_exists_flag, table_definition,) = _get_table_and_validate_write_mode(
+             table,
+             namespace,
+             table_version,
+             mode,
+             **kwargs,
+         )
+
+         # Get or create table, table version, and/or stream
+         if not table_exists_flag:
+             table_definition = _create_table_for_write(
+                 data,
+                 table,
+                 namespace,
+                 table_version,
+                 content_type,
+                 table_definition,
+                 *args,
+                 **kwargs,
+             )
+         else:
+             # call alter_table if there are any alter_table kwargs provided
+             if (
+                 "lifecycle_state" in kwargs
+                 or "schema_updates" in kwargs
+                 or "partition_updates" in kwargs
+                 or "sort_scheme" in kwargs
+                 or "table_description" in kwargs
+                 or "table_version_description" in kwargs
+                 or "table_properties" in kwargs
+                 or "table_version_properties" in kwargs
+             ):
+                 alter_table(
+                     table,
+                     namespace=namespace,
+                     table_version=table_version,
+                     *args,
+                     **kwargs,
+                 )
+
+         # Get the active table version and stream
+         table_version_obj = _get_latest_active_or_given_table_version(
+             namespace=table_definition.table.namespace,
+             table_name=table_definition.table.table_name,
+             table_version=table_version or table_definition.table_version.table_version,
+             **kwargs,
+         )
+
+         # Validate schema compatibility for schemaless content types with schema tables
+         if (
+             content_type.value not in SCHEMA_CONTENT_TYPES
+             and table_version_obj.schema is not None
+         ):
+             schemaless_types = {
+                 ct for ct in ContentType if ct.value not in SCHEMA_CONTENT_TYPES
+             }
+             raise TableValidationError(
+                 f"Content type '{content_type.value}' cannot be written to a table with a schema. "
+                 f"Table '{namespace}.{table}' has a schema, but content type '{content_type.value}' "
+                 f"is schemaless. Schemaless content types ({', '.join(sorted([ct.value for ct in schemaless_types]))}) "
+                 f"can only be written to schemaless tables."
+             )
+
+         # Handle different write modes and get stream and delta type
+         stream, delta_type = _handle_write_mode(
+             mode,
+             table_definition,
+             table_version_obj,
+             namespace,
+             table,
+             **kwargs,
+         )
+
+         if not stream:
+             raise ValueError(f"No default stream found for table {namespace}.{table}")
+
+         # Automatically set entry_params for DELETE/MERGE modes if not provided
+         _set_entry_params_if_needed(
+             mode,
+             table_version_obj,
+             kwargs,
+         )
+
+         # Validate table configuration
+         _validate_table_configuration(
+             stream,
+             table_version_obj,
+             namespace,
+             table,
+         )
+
+         # Handle partition creation/retrieval
+         partition, commit_staged_partition = _handle_partition_creation(
+             mode,
+             table_exists_flag,
+             delta_type,
+             stream,
+             **kwargs,
+         )
+
+         # Get table properties for schema evolution
+         schema_evolution_mode = table_version_obj.read_table_property(
+             TableProperty.SCHEMA_EVOLUTION_MODE
+         )
+         default_schema_consistency_type = table_version_obj.read_table_property(
+             TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE
+         )
+
+         # Convert unsupported dataset types and NumPy arrays that need schema validation
+         if isinstance(data, np.ndarray) and table_version_obj.schema is not None:
+             # NumPy arrays need conversion to Pandas for proper column naming in schema validation
+             converted_data = _convert_numpy_for_schema_validation(
+                 data, table_version_obj.schema
+             )
+         else:
+             # Convert other unsupported dataset types (e.g., Daft) or keep NumPy as-is for schemaless tables
+             converted_data = _convert_data_if_needed(data)
+
+         # Capture original field set before schema coercion for partial UPSERT support
+         original_fields = set(get_table_column_names(converted_data))
+
+         # Validate and coerce data against schema
+         # This ensures proper schema evolution and type handling
+         (
+             validated_data,
+             schema_modified,
+             updated_schema,
+         ) = _validate_and_coerce_data_against_schema(
+             converted_data,  # Use converted data for NumPy, original for others
+             table_version_obj.schema,
+             schema_evolution_mode=schema_evolution_mode,
+             default_schema_consistency_type=default_schema_consistency_type,
+         )
+
+         # Convert validated data to supported format for storage if needed
+         converted_data = _convert_data_if_needed(validated_data)
+
+         # Validate reader compatibility against supported reader types
+         supported_reader_types = table_version_obj.read_table_property(
+             TableProperty.SUPPORTED_READER_TYPES
+         )
+         _validate_reader_compatibility(
+             converted_data,
+             content_type,
+             supported_reader_types,
+         )
+
+         # Update table version if schema was modified during evolution
+         if schema_modified:
+             # Extract catalog properties and filter kwargs
+             catalog_kwargs = {
+                 "catalog": kwargs.get("catalog"),
+                 "inner": kwargs.get("inner"),
+                 "transaction": write_transaction,  # Pass transaction to update_table_version
+             }
+
+             _get_storage(**catalog_kwargs).update_table_version(
+                 namespace=namespace,
+                 table_name=table,
+                 table_version=table_version_obj.table_version,
+                 schema=updated_schema,
+                 **catalog_kwargs,
+             )
+
+         # Stage and commit delta, handle compaction
+         # Remove schema from kwargs to avoid duplicate parameter conflict
+         filtered_kwargs = {k: v for k, v in kwargs.items() if k != "schema"}
+         # Use updated schema if schema evolution occurred, otherwise use original schema
+         _stage_commit_and_compact(
+             converted_data,
+             partition,
+             delta_type,
+             content_type,
+             commit_staged_partition,
+             table_version_obj,
+             namespace,
+             table,
+             schema=updated_schema if schema_modified else table_version_obj.schema,
+             original_fields=original_fields,
+             **filtered_kwargs,
+         )
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during write_to_table: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             write_transaction.seal()
+
+
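A hedged usage sketch of the `write_to_table` entry point defined above, writing a small pandas DataFrame in the default AUTO mode. The catalog wiring that `delegate.py` normally injects (e.g., the `inner` catalog properties passed through `**kwargs`) is assumed and omitted here:

    # Illustrative sketch only; catalog/storage kwargs normally supplied by delegate.py are omitted.
    import pandas as pd
    from deltacat.catalog.main import impl as catalog_impl
    from deltacat.types.media import ContentType
    from deltacat.types.tables import TableWriteMode

    catalog_impl.write_to_table(
        data=pd.DataFrame({"id": [1, 2], "value": ["a", "b"]}),
        table="events",                    # hypothetical table name
        namespace="analytics",             # hypothetical namespace
        mode=TableWriteMode.AUTO,          # creates the table if it does not exist
        content_type=ContentType.PARQUET,  # default data file format
    )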
+ def _handle_write_mode(
+     mode: TableWriteMode,
+     table_definition: TableDefinition,
+     table_version_obj: TableVersion,
+     namespace: str,
+     table: str,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:  # Using Any for stream type to avoid complex imports
+     """Handle different write modes and return appropriate stream and delta type."""
+     table_schema = table_definition.table_version.schema
+
+     if mode == TableWriteMode.REPLACE:
+         return _handle_replace_mode(
+             table_schema,
+             namespace,
+             table,
+             table_version_obj,
+             **kwargs,
+         )
+     elif mode == TableWriteMode.APPEND:
+         return _handle_append_mode(
+             table_schema,
+             namespace,
+             table,
+             table_version_obj,
+             **kwargs,
+         )
+     elif mode in (TableWriteMode.MERGE, TableWriteMode.DELETE):
+         return _handle_merge_delete_mode(
+             mode,
+             table_schema,
+             namespace,
+             table,
+             table_version_obj,
+             **kwargs,
+         )
+     else:
+         # AUTO and CREATE modes
+         return _handle_auto_create_mode(
+             table_schema,
+             namespace,
+             table,
+             table_version_obj,
+             **kwargs,
+         )
+
+
+ def _handle_replace_mode(
+     table_schema,
+     namespace: str,
+     table: str,
+     table_version_obj: TableVersion,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+     """Handle REPLACE mode by staging and committing a new stream."""
+     stream = _get_storage(**kwargs).stage_stream(
+         namespace=namespace,
+         table_name=table,
+         table_version=table_version_obj.table_version,
+         **kwargs,
+     )
+
+     stream = _get_storage(**kwargs).commit_stream(stream=stream, **kwargs)
+     delta_type = (
+         DeltaType.UPSERT
+         if table_schema and table_schema.merge_keys
+         else DeltaType.APPEND
+     )
+     return stream, delta_type
+
+
+ def _handle_append_mode(
+     table_schema,
+     namespace: str,
+     table: str,
+     table_version_obj: TableVersion,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+     """Handle APPEND mode by validating no merge keys and getting existing stream."""
+     if table_schema and table_schema.merge_keys:
+         raise SchemaValidationError(
+             f"APPEND mode cannot be used with tables that have merge keys. "
+             f"Table {namespace}.{table} has merge keys: {table_schema.merge_keys}. "
+             f"Use MERGE mode instead."
+         )
+
+     stream = _get_table_stream(
+         namespace,
+         table,
+         table_version_obj.table_version,
+         **kwargs,
+     )
+     return stream, DeltaType.APPEND
+
+
+ def _handle_merge_delete_mode(
+     mode: TableWriteMode,
+     table_schema,
+     namespace: str,
+     table: str,
+     table_version_obj: TableVersion,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+     """Handle MERGE/DELETE modes by validating merge keys and getting existing stream."""
+     if not table_schema or not table_schema.merge_keys:
+         raise TableValidationError(
+             f"{mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()} mode requires tables to have at least one merge key. "
+             f"Table {namespace}.{table}.{table_version_obj.table_version} has no merge keys. "
+             f"Use APPEND, AUTO, or REPLACE mode instead."
+         )
+
+     stream = _get_table_stream(
+         namespace,
+         table,
+         table_version_obj.table_version,
+         **kwargs,
+     )
+     delta_type = DeltaType.UPSERT if mode == TableWriteMode.MERGE else DeltaType.DELETE
+     return stream, delta_type
+
+
+ def _handle_auto_create_mode(
+     table_schema,
+     namespace: str,
+     table: str,
+     table_version_obj: TableVersion,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+     """Handle AUTO and CREATE modes by getting existing stream."""
+     stream = _get_table_stream(
+         namespace,
+         table,
+         table_version_obj.table_version,
+         **kwargs,
+     )
+     delta_type = (
+         DeltaType.UPSERT
+         if table_schema and table_schema.merge_keys
+         else DeltaType.APPEND
+     )
+     return stream, delta_type
+
+
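The four handlers above reduce each write mode to a delta type. A short sketch of the resulting mapping for a table whose schema declares at least one merge key (without merge keys, REPLACE/AUTO/CREATE fall back to APPEND, and MERGE/DELETE are rejected):

    from deltacat.storage.model.types import DeltaType
    from deltacat.types.tables import TableWriteMode

    # Mapping implemented by the _handle_*_mode helpers for a merge-keyed schema.
    mode_to_delta_type = {
        TableWriteMode.REPLACE: DeltaType.UPSERT,  # staged on a freshly committed stream
        TableWriteMode.MERGE: DeltaType.UPSERT,
        TableWriteMode.DELETE: DeltaType.DELETE,
        TableWriteMode.AUTO: DeltaType.UPSERT,
        TableWriteMode.CREATE: DeltaType.UPSERT,
        # TableWriteMode.APPEND raises SchemaValidationError when merge keys exist.
    }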
+ def _validate_table_configuration(
+     stream,
+     table_version_obj: TableVersion,
+     namespace: str,
+     table: str,
+ ) -> None:
+     """Validate table configuration for unsupported features."""
+     # Check if table is partitioned
+     if (
+         stream.partition_scheme
+         and stream.partition_scheme.keys is not None
+         and len(stream.partition_scheme.keys) > 0
+     ):
+         raise NotImplementedError(
+             f"write_to_table does not yet support partitioned tables. "
+             f"Table {namespace}.{table} has partition scheme with "
+             f"{len(stream.partition_scheme.keys)} partition key(s): "
+             f"{[key.name or key.key[0] for key in stream.partition_scheme.keys]}. "
+             f"Please use the lower-level metastore API for partitioned tables."
+         )
+
+     # Check if table has sort keys
+     if (
+         table_version_obj.sort_scheme
+         and table_version_obj.sort_scheme.keys is not None
+         and len(table_version_obj.sort_scheme.keys) > 0
+     ):
+         raise NotImplementedError(
+             f"write_to_table does not yet support tables with sort keys. "
+             f"Table {namespace}.{table} has sort scheme with "
+             f"{len(table_version_obj.sort_scheme.keys)} sort key(s): "
+             f"{[key.key[0] for key in table_version_obj.sort_scheme.keys]}. "
+             f"Please use the lower-level metastore API for sorted tables."
+         )
+
+
+ def _handle_partition_creation(
+     mode: TableWriteMode,
+     table_exists_flag: bool,
+     delta_type: DeltaType,
+     stream,
+     **kwargs,
+ ) -> Tuple[Any, bool]:  # partition, commit_staged_partition
+     """Handle partition creation/retrieval based on write mode."""
+     if mode == TableWriteMode.REPLACE or not table_exists_flag:
+         # REPLACE mode or new table: Stage a new partition
+         partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
+         # If we're doing UPSERT/DELETE operations, let compaction handle the commit
+         commit_staged_partition = delta_type not in (DeltaType.UPSERT, DeltaType.DELETE)
+         return partition, commit_staged_partition
+     elif delta_type in (DeltaType.UPSERT, DeltaType.DELETE):
+         # UPSERT/DELETE operations: Try to use existing committed partition first
+         partition = _get_storage(**kwargs).get_partition(
+             stream_locator=stream.locator,
+             partition_values=None,
+             **kwargs,
+         )
+         commit_staged_partition = False
+
+         if not partition:
+             # No existing committed partition found, stage a new one
+             partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
+             commit_staged_partition = False  # Let compaction handle the commit
+
+         return partition, commit_staged_partition
+     else:
+         # APPEND mode on existing table: Get existing partition
+         partition = _get_storage(**kwargs).get_partition(
+             stream_locator=stream.locator,
+             partition_values=None,
+             **kwargs,
+         )
+         commit_staged_partition = False
+
+         if not partition:
+             # No existing partition found, create a new one
+             partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
+             commit_staged_partition = True
+
+         return partition, commit_staged_partition
+
+
+ def _convert_numpy_for_schema_validation(
+     data: np.ndarray, schema: Optional[Schema]
+ ) -> Dataset:
+     """Convert NumPy array to Pandas DataFrame with proper column names for schema validation.
+
+     Args:
+         data: NumPy array to convert
+         schema: DeltaCAT Schema object for column naming
+
+     Returns:
+         Pandas DataFrame with proper column names matching schema
+
+     Raises:
+         ValueError: If array has more columns than schema or schema is invalid
+     """
+     if not isinstance(schema, Schema) or not schema.arrow:
+         raise ValueError(
+             f"Expected DeltaCAT schema for Numpy schema validation, but found: {schema}"
+         )
+
+     # Use schema subset matching NumPy array dimensions
+     arrow_schema = schema.arrow
+     num_cols = data.shape[1] if data.ndim > 1 else 1
+
+     if len(arrow_schema) >= num_cols:
+         # Use the first N columns from the schema to match data dimensions
+         subset_fields = [arrow_schema.field(i) for i in range(num_cols)]
+         subset_schema = pa.schema(subset_fields)
+         return to_pandas(data, schema=subset_schema)
+     else:
+         raise ValueError(
+             f"NumPy array has {num_cols} columns but table schema only has {len(arrow_schema)} columns. "
+             f"Cannot write NumPy data with more columns than the table schema supports."
+         )
+
+
+ def _build_entry_index_to_schema_mapping(
+     qualified_deltas: List[Delta], table_version_obj, **kwargs
+ ) -> List[Schema]:
+     """Build a mapping from manifest entry index to schema for reading operations.
+
+     Args:
+         qualified_deltas: List of deltas to process
+         table_version_obj: Table version containing schemas
+         **kwargs: Additional arguments passed to storage operations
+
+     Returns:
+         List mapping each manifest entry index to its corresponding schema
+
+     Raises:
+         ValueError: If a manifest's schema ID is not found in table version schemas
+     """
+     entry_index_to_schema = []
+     for delta in qualified_deltas:
+         if delta.manifest:
+             manifest = delta.manifest
+         else:
+             # Fetch manifest from storage
+             manifest = _get_storage(**kwargs).get_delta_manifest(
+                 delta.locator,
+                 **kwargs,
+             )
+         # Map manifest entry index to schema ID
+         schema_id = manifest.meta.schema_id
+
+         # Find the schema that matches this manifest's schema_id
+         matching_schema = None
+         if table_version_obj.schemas:
+             for schema in table_version_obj.schemas:
+                 if schema.id == schema_id:
+                     matching_schema = schema
+                     break
+
+         if matching_schema is None:
+             available_schema_ids = (
+                 [s.id for s in table_version_obj.schemas]
+                 if table_version_obj.schemas
+                 else []
+             )
+             raise ValueError(
+                 f"Manifest schema ID {schema_id} not found in table version schemas. "
+                 f"Available schema IDs: {available_schema_ids}. "
+             )
+
+         # Add the matching schema for each entry in this manifest
+         for _ in range(len(manifest.entries)):
+             entry_index_to_schema.append(matching_schema)
+
+     return entry_index_to_schema
+
+
+ def _convert_data_if_needed(data: Dataset) -> Dataset:
+     """Convert unsupported data types to supported ones."""
+     if isinstance(data, daft.DataFrame):
+         # Daft DataFrame - convert based on execution mode
+         ctx = daft.context.get_context()
+         runner = ctx.get_or_create_runner()
+         runner_type = runner.name
+
+         if runner_type == "ray":
+             # Running with Ray backend - convert to Ray Dataset
+             return data.to_ray_dataset()
+         else:
+             # Running with local backend - convert to PyArrow Table
+             return data.to_arrow()
+
+     return data
+
+
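A small sketch of the conversion rule in `_convert_data_if_needed` above: under Daft's local runner a DataFrame is materialized to a PyArrow Table, while under the Ray runner it becomes a Ray Dataset. The example below assumes the local runner:

    import daft

    df = daft.from_pydict({"id": [1, 2, 3]})
    # Equivalent to what _convert_data_if_needed returns on the local (non-Ray) runner.
    arrow_table = df.to_arrow()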
+ def _validate_and_coerce_data_against_schema(
+     data: Dataset,
+     schema: Optional[Schema],
+     schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
+     default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
+ ) -> Tuple[Dataset, bool, Optional[Schema]]:
+     """Validate and coerce data against the table schema if schema consistency types are set.
+
+     Args:
+         data: The dataset to validate/coerce
+         schema: The DeltaCAT schema to validate against (optional)
+         schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
+         default_schema_consistency_type: Default consistency type for new fields in AUTO mode
+
+     Returns:
+         Tuple[Dataset, bool, Optional[Schema]]: Validated/coerced data, flag indicating if schema was modified, and updated schema
+
+     Raises:
+         ValueError: If validation fails or coercion is not possible
+     """
+     if not schema:
+         return data, False, None
+
+     validated_data, updated_schema = schema.validate_and_coerce_dataset(
+         data,
+         schema_evolution_mode=schema_evolution_mode,
+         default_schema_consistency_type=default_schema_consistency_type,
+     )
+
+     # Check if schema was modified by comparing with original
+     schema_modified = not updated_schema.equivalent_to(schema, True)
+     # Return updated schema only if it was modified
+     updated_schema = updated_schema if schema_modified else None
+
+     return validated_data, schema_modified, updated_schema
+
+
+ def _validate_reader_compatibility(
+     data: Dataset,
+     content_type: ContentType,
+     supported_reader_types: Optional[List[DatasetType]],
+ ) -> None:
+     """Validate that the data types being written are compatible with all supported reader types.
+
+     Args:
+         data: The dataset to validate
+         content_type: Content type being written
+         supported_reader_types: List of DatasetTypes that must be able to read this data
+
+     Raises:
+         TableValidationError: If any data types would break supported reader compatibility
+     """
+     if not supported_reader_types:
+         return
+
+     # Get the schema from the data
+     schema = get_table_schema(data)
+
+     # Get the dataset type of the current data
+     writer_dataset_type = get_dataset_type(data)
+
+     # PYARROW_PARQUET is equivalent to PYARROW for compatibility
+     writer_type_str = (
+         writer_dataset_type.value
+         if writer_dataset_type != DatasetType.PYARROW_PARQUET
+         else "pyarrow"
+     )
+
+     content_type_str = content_type.value
+
+     # Check each field type for compatibility
+     incompatible_fields = []
+
+     for field in schema:
+         field_name = field.name
+         arrow_type_str = str(field.type)
+
+         # Get the base type name from PyArrow field type
+         base_type_name = get_base_arrow_type_name(field.type)
+
+         # Get compatible readers for this (arrow_type, writer_dataset_type, content_type) combination
+         compatible_readers = get_compatible_readers(
+             base_type_name,
+             writer_type_str,
+             content_type_str,
+         )
+
+         # Check if all supported reader types are compatible
+         for required_reader in supported_reader_types:
+             reader_is_compatible = required_reader in compatible_readers
+
+             # Special case: PYARROW_PARQUET is equivalent to PYARROW for compatibility if we're writing parquet
+             if (
+                 not reader_is_compatible
+                 and content_type == ContentType.PARQUET
+                 and required_reader == DatasetType.PYARROW_PARQUET
+             ):
+                 reader_is_compatible = DatasetType.PYARROW in compatible_readers
+
+             if not reader_is_compatible:
+                 incompatible_fields.append(
+                     {
+                         "field_name": field_name,
+                         "arrow_type": arrow_type_str,
+                         "incompatible_reader": required_reader,
+                         "writer_type": writer_dataset_type,
+                         "content_type": content_type,
+                     }
+                 )
+
+     # Raise error if any incompatibilities found
+     if incompatible_fields:
+         error_details = []
+         for incompatible in incompatible_fields:
+             error_details.append(
+                 f"Field '{incompatible['field_name']}' with type '{incompatible['arrow_type']}' "
+                 f"written by {incompatible['writer_type']} to {incompatible['content_type']} "
+                 f"cannot be read by required reader type {incompatible['incompatible_reader']}. "
+                 f"If you expect this write to succeed and this reader is not required, then it "
+                 f"can be removed from the table's supported reader types property."
+             )
+
+         raise TableValidationError(
+             f"Reader compatibility validation failed. The following fields would break "
+             f"supported reader types:\n" + "\n".join(error_details)
+         )
+
+
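For reference, a sketch of the table property that drives the reader-compatibility check above: `TableProperty.SUPPORTED_READER_TYPES` lists the `DatasetType` readers that every written field type must remain readable by. The property value shown is an assumed example, not a documented default:

    from deltacat.types.media import DatasetType
    from deltacat.types.tables import TableProperty

    # Assumed example value; the real value is read from the table version's properties.
    supported_readers = {
        TableProperty.SUPPORTED_READER_TYPES: [
            DatasetType.PYARROW,
            DatasetType.PYARROW_PARQUET,  # treated as PYARROW when writing parquet
        ]
    }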
947
+ def _stage_commit_and_compact(
948
+ converted_data: Dataset,
949
+ partition,
950
+ delta_type: DeltaType,
951
+ content_type: ContentType,
952
+ commit_staged_partition: bool,
953
+ table_version_obj: TableVersion,
954
+ namespace: str,
955
+ table: str,
956
+ schema: Schema,
957
+ original_fields: Set[str],
958
+ **kwargs,
959
+ ) -> None:
960
+ """Stage and commit delta, then handle compaction if needed."""
961
+ # Remove schema from kwargs to avoid duplicate parameter conflict
962
+ # We explicitly pass the correct schema parameter
963
+ kwargs.pop("schema", None)
964
+
965
+ # Stage a delta with the data
966
+ delta = _get_storage(**kwargs).stage_delta(
967
+ data=converted_data,
968
+ partition=partition,
969
+ delta_type=delta_type,
970
+ content_type=content_type,
971
+ author=ManifestAuthor.of(
972
+ name="deltacat.write_to_table", version=dc.__version__
973
+ ),
974
+ schema=schema,
975
+ **kwargs,
976
+ )
977
+
978
+ delta = _get_storage(**kwargs).commit_delta(delta=delta, **kwargs)
979
+
980
+ if commit_staged_partition:
981
+ _get_storage(**kwargs).commit_partition(partition=partition, **kwargs)
982
+
983
+ # Check compaction trigger decision
984
+ should_compact = _trigger_compaction(
985
+ table_version_obj,
986
+ delta,
987
+ TableReadOptimizationLevel.MAX,
988
+ **kwargs,
989
+ )
990
+ if should_compact:
991
+ # Run V2 compaction session to merge or delete data
992
+ if table_version_obj.schema:
993
+ all_column_names = table_version_obj.schema.arrow.names
994
+ else:
995
+ raise RuntimeError("Table version schema is required to run compaction.")
996
+ _run_compaction_session(
997
+ table_version_obj=table_version_obj,
998
+ partition=partition,
999
+ latest_delta_stream_position=delta.stream_position,
1000
+ namespace=namespace,
1001
+ table=table,
1002
+ original_fields=original_fields,
1003
+ all_column_names=all_column_names,
1004
+ **kwargs,
1005
+ )
1006
+
1007
+
1008
+ def _trigger_compaction(
1009
+ table_version_obj: TableVersion,
1010
+ latest_delta: Optional[Delta],
1011
+ target_read_optimization_level: TableReadOptimizationLevel,
1012
+ **kwargs,
1013
+ ) -> bool:
1014
+ # Import inside function to avoid circular imports
1015
+ from deltacat.compute.compactor.utils import round_completion_reader as rci
1016
+
1017
+ # Extract delta type from latest_delta if available, otherwise default to no compaction
1018
+ if latest_delta is not None:
1019
+ delta_type = latest_delta.type
1020
+ partition_values = latest_delta.partition_locator.partition_values
1021
+ logger.info(
1022
+ f"Using delta type {delta_type} from latest delta {latest_delta.locator}"
1023
+ )
1024
+ else:
1025
+ logger.info(f"No latest delta discovered, defaulting to no compaction.")
1026
+ return False
1027
+
1028
+ if (
1029
+ table_version_obj.read_table_property(TableProperty.READ_OPTIMIZATION_LEVEL)
1030
+ == target_read_optimization_level
1031
+ ):
1032
+ if delta_type == DeltaType.DELETE or delta_type == DeltaType.UPSERT:
1033
+ return True
1034
+ elif delta_type == DeltaType.APPEND:
1035
+ # Get default stream to determine partition locator
1036
+ stream = _get_table_stream(
1037
+ table_version_obj.locator.namespace,
1038
+ table_version_obj.locator.table_name,
1039
+ table_version_obj.locator.table_version,
1040
+ **kwargs,
1041
+ )
1042
+
1043
+ if not stream:
1044
+ return False
1045
+
1046
+ # Use provided partition_values or None for unpartitioned tables
1047
+ partition_locator = PartitionLocator.of(
1048
+ stream_locator=stream.locator,
1049
+ partition_values=partition_values,
1050
+ partition_id=None,
1051
+ )
1052
+
1053
+ # Get round completion info to determine high watermark
1054
+ round_completion_info = rci.read_round_completion_info(
1055
+ source_partition_locator=partition_locator,
1056
+ destination_partition_locator=partition_locator,
1057
+ deltacat_storage=_get_storage(**kwargs),
1058
+ deltacat_storage_kwargs=kwargs,
1059
+ )
1060
+
1061
+ high_watermark = (
1062
+ round_completion_info.high_watermark
1063
+ if round_completion_info
1064
+ and isinstance(round_completion_info.high_watermark, int)
1065
+ else 0
1066
+ )
1067
+
1068
+ # Get all deltas appended since last compaction
1069
+ deltas = _get_storage(**kwargs).list_deltas(
1070
+ namespace=table_version_obj.locator.namespace,
1071
+ table_name=table_version_obj.locator.table_name,
1072
+ table_version=table_version_obj.locator.table_version,
1073
+ partition_values=partition_values,
1074
+ start_stream_position=high_watermark + 1,
1075
+ **kwargs,
1076
+ )
1077
+
1078
+ if not deltas:
1079
+ return False
1080
+
1081
+ # Count deltas appended since last compaction
1082
+ appended_deltas_since_last_compaction = len(deltas)
1083
+ delta_trigger = table_version_obj.read_table_property(
1084
+ TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER
1085
+ )
1086
+ if delta_trigger and appended_deltas_since_last_compaction >= delta_trigger:
1087
+ return True
1088
+
1089
+ # Count files appended since last compaction
1090
+ appended_files_since_last_compaction = 0
1091
+ for delta in deltas:
1092
+ if delta.manifest and delta.manifest.entries:
1093
+ appended_files_since_last_compaction += len(delta.manifest.entries)
1094
+
1095
+ file_trigger = table_version_obj.read_table_property(
1096
+ TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER
1097
+ )
1098
+ if file_trigger and appended_files_since_last_compaction >= file_trigger:
1099
+ return True
1100
+
1101
+ # Count records appended since last compaction
1102
+ appended_records_since_last_compaction = 0
1103
+ for delta in deltas:
1104
+ if delta.meta and delta.meta.record_count:
1105
+ appended_records_since_last_compaction += delta.meta.record_count
1106
+
1107
+ record_trigger = table_version_obj.read_table_property(
1108
+ TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER
1109
+ )
1110
+ if (
1111
+ record_trigger
1112
+ and appended_records_since_last_compaction >= record_trigger
1113
+ ):
1114
+ return True
1115
+ return False
1116
+
1117
+
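# A minimal sketch of the append-path trigger thresholds evaluated above, assuming a
# table created with explicit trigger properties (the table name, namespace, schema,
# and numeric thresholds are all hypothetical):
create_table(
    "events",
    namespace="analytics",
    schema=Schema.of(schema=pa.schema([("id", pa.int64()), ("payload", pa.string())])),
    table_properties={
        TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER: 10,
        TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER: 100,
        TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER: 1_000_000,
    },
)
# With these values, an APPEND delta triggers compaction once 10 deltas, 100 manifest
# entries, or 1,000,000 records accumulate past the last compaction high watermark;
# DELETE and UPSERT deltas trigger compaction immediately at MAX read optimization.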
1118
+ def _get_compaction_primary_keys(table_version_obj: TableVersion) -> set:
1119
+ """Extract primary keys from table schema for compaction."""
1120
+ table_schema = table_version_obj.schema
1121
+ return (
1122
+ set(table_schema.merge_keys)
1123
+ if table_schema and table_schema.merge_keys
1124
+ else set()
1125
+ )
1126
+
1127
+
1128
+ def _get_compaction_hash_bucket_count(
1129
+ partition: Partition, table_version_obj: TableVersion
1130
+ ) -> int:
1131
+ """Determine hash bucket count from previous compaction, table property, or default."""
1132
+ # First check if we have a hash bucket count from previous compaction
1133
+ if (
1134
+ partition.compaction_round_completion_info
1135
+ and partition.compaction_round_completion_info.hash_bucket_count
1136
+ ):
1137
+ hash_bucket_count = partition.compaction_round_completion_info.hash_bucket_count
1138
+ logger.info(
1139
+ f"Using hash bucket count {hash_bucket_count} from previous compaction"
1140
+ )
1141
+ return hash_bucket_count
1142
+
1143
+ # Otherwise use the table property for default compaction hash bucket count
1144
+ hash_bucket_count = table_version_obj.read_table_property(
1145
+ TableProperty.DEFAULT_COMPACTION_HASH_BUCKET_COUNT
1146
+ )
1147
+ logger.info(f"Using hash bucket count {hash_bucket_count} from table property")
1148
+ return hash_bucket_count
1149
+
1150
+
1151
+ def _get_merge_order_sort_keys(table_version_obj: TableVersion):
1152
+ """Extract sort keys from merge_order fields in schema for compaction.
1153
+
1154
+ Args:
1155
+ table_version_obj: The table version containing schema
1156
+
1157
+ Returns:
1158
+ List of SortKey objects from merge_order fields, or None if no merge_order fields are defined
1159
+ """
1160
+ if table_version_obj.schema:
1161
+ return table_version_obj.schema.merge_order_sort_keys()
1162
+ return None
1163
+
1164
+
1165
+ def _create_compaction_params(
1166
+ table_version_obj: TableVersion,
1167
+ partition: Partition,
1168
+ latest_stream_position: int,
1169
+ primary_keys: set,
1170
+ hash_bucket_count: int,
1171
+ original_fields: Set[str],
1172
+ all_column_names: Optional[List[str]],
1173
+ **kwargs,
1174
+ ):
1175
+ """Create compaction parameters for the compaction session."""
1176
+ from deltacat.compute.compactor.model.compact_partition_params import (
1177
+ CompactPartitionParams,
1178
+ )
1179
+
1180
+ # Remove create_table/alter_table kwargs not needed for compaction
1181
+ kwargs.pop("lifecycle_state", None)
1182
+ kwargs.pop("schema", None)
1183
+ kwargs.pop("partition_scheme", None)
1184
+ kwargs.pop("sort_keys", None)
1185
+ kwargs.pop("table_description", None)
1186
+ kwargs.pop("table_version_description", None)
1187
+ kwargs.pop("table_properties", None)
1188
+ kwargs.pop("table_version_properties", None)
1189
+ kwargs.pop("namespace_properties", None)
1190
+ kwargs.pop("content_types", None)
1191
+ kwargs.pop("fail_if_exists", None)
1192
+ kwargs.pop("schema_updates", None)
1193
+ kwargs.pop("partition_updates", None)
1194
+ kwargs.pop("sort_scheme", None)
1195
+
1196
+ table_writer_kwargs = kwargs.pop("table_writer_kwargs", {})
1197
+ table_writer_kwargs["schema"] = table_version_obj.schema
1198
+ table_writer_kwargs["sort_scheme_id"] = table_version_obj.sort_scheme.id
1199
+ deltacat_storage_kwargs = kwargs.pop("deltacat_storage_kwargs", {})
1200
+ deltacat_storage_kwargs["transaction"] = kwargs.get("transaction", None)
1201
+ list_deltas_kwargs = kwargs.pop("list_deltas_kwargs", {})
1202
+ list_deltas_kwargs["transaction"] = kwargs.get("transaction", None)
1203
+
1204
+ return CompactPartitionParams.of(
1205
+ {
1206
+ "catalog": kwargs.get("inner", kwargs.get("catalog")),
1207
+ "source_partition_locator": partition.locator,
1208
+ "destination_partition_locator": partition.locator, # In-place compaction
1209
+ "primary_keys": primary_keys,
1210
+ "last_stream_position_to_compact": latest_stream_position,
1211
+ "deltacat_storage": _get_storage(**kwargs),
1212
+ "deltacat_storage_kwargs": deltacat_storage_kwargs,
1213
+ "list_deltas_kwargs": list_deltas_kwargs,
1214
+ "table_writer_kwargs": table_writer_kwargs,
1215
+ "hash_bucket_count": hash_bucket_count,
1216
+ "records_per_compacted_file": table_version_obj.read_table_property(
1217
+ TableProperty.RECORDS_PER_COMPACTED_FILE,
1218
+ ),
1219
+ "compacted_file_content_type": ContentType.PARQUET,
1220
+ "drop_duplicates": True,
1221
+ "sort_keys": _get_merge_order_sort_keys(table_version_obj),
1222
+ "original_fields": original_fields,
1223
+ "all_column_names": all_column_names,
1224
+ }
1225
+ )
1226
+
1227
+
1228
+ def _run_compaction_session(
1229
+ table_version_obj: TableVersion,
1230
+ partition: Partition,
1231
+ latest_delta_stream_position: int,
1232
+ namespace: str,
1233
+ table: str,
1234
+ original_fields: Set[str],
1235
+ all_column_names: List[str],
1236
+ **kwargs,
1237
+ ) -> None:
1238
+ """
1239
+ Run a V2 compaction session for the given table and partition.
1240
+
1241
+ Args:
1242
+ table_version_obj: The table version object
1243
+ partition: The partition to compact
1244
+ latest_delta_stream_position: Stream position of the latest delta
1245
+ namespace: The table namespace
1246
+ table: The table name
1247
+ original_fields: The original field set for partial UPSERT support
+ all_column_names: All column names in the table version schema
1248
+ **kwargs: Additional arguments including catalog and storage parameters
1249
+ """
1250
+ # Import inside function to avoid circular imports
1251
+ from deltacat.compute.compactor_v2.compaction_session import compact_partition
1252
+
1253
+ try:
1254
+ # Extract compaction configuration
1255
+ primary_keys = _get_compaction_primary_keys(table_version_obj)
1256
+ hash_bucket_count = _get_compaction_hash_bucket_count(
1257
+ partition, table_version_obj
1258
+ )
1259
+
1260
+ # Create compaction parameters
1261
+ compact_partition_params = _create_compaction_params(
1262
+ table_version_obj,
1263
+ partition,
1264
+ latest_delta_stream_position,
1265
+ primary_keys,
1266
+ hash_bucket_count,
1267
+ original_fields=original_fields,
1268
+ all_column_names=all_column_names,
1269
+ **kwargs,
1270
+ )
1271
+
1272
+ # Run V2 compaction session
1273
+ compact_partition(params=compact_partition_params)
1274
+ except Exception as e:
1275
+ logger.error(
1276
+ f"Error during compaction session for {namespace}.{table}, "
1277
+ f"partition {partition.locator}: {e}"
1278
+ )
1279
+ raise
1280
+
1281
+
1282
+ def _get_merge_key_field_names_from_schema(schema) -> List[str]:
1283
+ """Extract merge key field names from a DeltaCAT Schema object.
1284
+
1285
+ Args:
1286
+ schema: DeltaCAT Schema object
1287
+
1288
+ Returns:
1289
+ List of field names that are marked as merge keys
1290
+ """
1291
+ if not schema or not schema.merge_keys:
1292
+ return []
1293
+
1294
+ merge_key_field_names = []
1295
+ field_ids_to_fields = schema.field_ids_to_fields
1296
+
1297
+ for merge_key_id in schema.merge_keys:
1298
+ if merge_key_id in field_ids_to_fields:
1299
+ field = field_ids_to_fields[merge_key_id]
1300
+ merge_key_field_names.append(field.arrow.name)
1301
+
1302
+ return merge_key_field_names
1303
+
1304
+
1305
+ def _set_entry_params_if_needed(
1306
+ mode: TableWriteMode, table_version_obj, kwargs: dict
1307
+ ) -> None:
1308
+ """Automatically set entry_params to merge keys if not already set by user.
1309
+
1310
+ Args:
1311
+ mode: The table write mode
1312
+ table_version_obj: The table version object containing schema
1313
+ kwargs: Keyword arguments dictionary that may contain entry_params
1314
+ """
1315
+ # Only set entry_params for DELETE and MERGE modes
1316
+ if mode not in [TableWriteMode.DELETE, TableWriteMode.MERGE]:
1317
+ return
1318
+
1319
+ # Don't override if user already provided entry_params
1320
+ if "entry_params" in kwargs and kwargs["entry_params"] is not None:
1321
+ return
1322
+
1323
+ # Get schema from table version
1324
+ if not table_version_obj or not table_version_obj.schema:
1325
+ return
1326
+
1327
+ # Extract merge key field names
1328
+ merge_key_field_names = _get_merge_key_field_names_from_schema(
1329
+ table_version_obj.schema
1330
+ )
1331
+
1332
+ if merge_key_field_names:
1333
+ from deltacat.storage import EntryParams
1334
+
1335
+ kwargs["entry_params"] = EntryParams.of(merge_key_field_names)
1336
+
1337
+
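# A small sketch of the helper above: for MERGE (or DELETE) writes against a table
# version whose schema declares merge keys, the kwargs dict gains an EntryParams built
# from those key names. `tv` is assumed to be a TableVersion whose schema defines merge
# keys; the call is a no-op when the caller already supplied entry_params.
write_kwargs = {}
_set_entry_params_if_needed(TableWriteMode.MERGE, tv, write_kwargs)
# write_kwargs["entry_params"] now wraps the merge key field names, if any were found.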
1338
+ def _get_table_stream(namespace: str, table: str, table_version: str, **kwargs):
1339
+ """Helper function to get a stream for a table version."""
1340
+ return _get_storage(**kwargs).get_stream(
1341
+ namespace=namespace,
1342
+ table_name=table,
1343
+ table_version=table_version,
1344
+ **kwargs,
1345
+ )
1346
+
1347
+
1348
+ def _validate_read_table_input(
1349
+ namespace: str,
1350
+ table: str,
1351
+ table_schema: Optional[Schema],
1352
+ table_type: Optional[DatasetType],
1353
+ distributed_dataset_type: Optional[DatasetType],
1354
+ ) -> None:
1355
+ """Validate input parameters for read_table operation."""
1356
+ if (
1357
+ distributed_dataset_type
1358
+ and distributed_dataset_type not in DatasetType.distributed()
1359
+ ):
1360
+ raise ValueError(
1361
+ f"{distributed_dataset_type} is not a valid distributed dataset type. "
1362
+ f"Valid distributed dataset types are: {DatasetType.distributed()}."
1363
+ )
1364
+ if table_type and table_type not in DatasetType.local():
1365
+ raise ValueError(
1366
+ f"{table_type} is not a valid local table type. "
1367
+ f"Valid table types are: {DatasetType.local()}."
1368
+ )
1369
+
1370
+ # For schemaless tables, distributed datasets are not yet supported
1371
+ if table_schema is None and distributed_dataset_type:
1372
+ raise NotImplementedError(
1373
+ f"Distributed dataset reading is not yet supported for schemaless tables. "
1374
+ f"Table '{namespace}.{table}' has no schema, but distributed_dataset_type={distributed_dataset_type} was specified. "
1375
+ f"Please use local storage by setting distributed_dataset_type=None."
1376
+ )
1377
+
1378
+
1379
+ def _get_qualified_deltas_for_read(
1380
+ table: str,
1381
+ namespace: str,
1382
+ table_version: str,
1383
+ partition_filter: Optional[List[Union[Partition, PartitionLocator]]],
1384
+ **kwargs,
1385
+ ) -> List[Delta]:
1386
+ """Get qualified deltas for reading based on partition filter."""
1387
+ logger.info(
1388
+ f"Reading metadata for table={namespace}/{table}/{table_version} "
1389
+ f"with partition_filters={partition_filter}."
1390
+ )
1391
+
1392
+ # Get partition filter if not provided
1393
+ if partition_filter is None:
1394
+ partition_filter = _get_all_committed_partitions(
1395
+ table, namespace, table_version, **kwargs
1396
+ )
1397
+
1398
+ # Get deltas from partitions
1399
+ qualified_deltas = _get_deltas_from_partition_filter(
1400
+ partition_filter=partition_filter,
1401
+ **kwargs,
1402
+ )
1403
+
1404
+ logger.info(
1405
+ f"Total qualified deltas={len(qualified_deltas)} "
1406
+ f"from {len(partition_filter)} partitions."
1407
+ )
1408
+
1409
+ return qualified_deltas
1410
+
1411
+
1412
+ def _get_max_parallelism(
1413
+ max_parallelism: Optional[int],
1414
+ distributed_dataset_type: Optional[DatasetType],
1415
+ ) -> int:
1416
+ """Get the max parallelism for a read operation."""
1417
+ if distributed_dataset_type:
1418
+ max_parallelism = max_parallelism or 100
1419
+ else:
1420
+ # TODO(pdames): Set max parallelism using available resources and dataset size
1421
+ max_parallelism = 1
1422
+ if max_parallelism < 1:
1423
+ raise ValueError(
1424
+ f"max_parallelism must be greater than 0, but got {max_parallelism}"
1425
+ )
1426
+ logger.info(f"Using max_parallelism={max_parallelism} for read operation")
1427
+
1428
+ return max_parallelism
1429
+
1430
+
1431
+ def _handle_schemaless_table_read(
1432
+ qualified_deltas: List[Delta],
1433
+ read_as: DatasetType,
1434
+ **kwargs,
1435
+ ) -> Dataset:
1436
+ """Handle reading schemaless tables by flattening manifest entries."""
1437
+ # Create a PyArrow table for each delta
1438
+ # TODO(pdames): More efficient implementation for tables with millions/billions of entries
1439
+ tables = []
1440
+ for delta in qualified_deltas:
1441
+ # Get the manifest for this delta
1442
+ if delta.manifest:
1443
+ manifest = delta.manifest
1444
+ else:
1445
+ # Fetch manifest from storage
1446
+ manifest = _get_storage(**kwargs).get_delta_manifest(
1447
+ delta.locator,
1448
+ transaction=kwargs.get("transaction"),
1449
+ **kwargs,
1450
+ )
1451
+ # Create flattened table from this delta's manifest
1452
+ table = pa_utils.delta_manifest_to_table(
1453
+ manifest,
1454
+ delta,
1455
+ )
1456
+ tables.append(table)
1457
+
1458
+ # Concatenate all PyArrow tables
1459
+ final_table = pa_utils.concat_tables(tables)
1460
+
1461
+ # Convert from PyArrow to the requested dataset type
1462
+ return from_pyarrow(final_table, read_as)
1463
+
1464
+
1465
+ def _download_and_process_table_data(
1466
+ namespace: str,
1467
+ table: str,
1468
+ qualified_deltas: List[Delta],
1469
+ read_as: DatasetType,
1470
+ max_parallelism: Optional[int],
1471
+ columns: Optional[List[str]],
1472
+ file_path_column: Optional[str],
1473
+ table_version_obj: Optional[TableVersion],
1474
+ **kwargs,
1475
+ ) -> Dataset:
1476
+ """Download delta data and process result based on storage type."""
1477
+
1478
+ # Handle NUMPY read requests by translating to PANDAS internally
1479
+ original_read_as = read_as
1480
+ effective_read_as = read_as
1481
+ if read_as == DatasetType.NUMPY:
1482
+ effective_read_as = DatasetType.PANDAS
1483
+ logger.debug("Translating NUMPY read request to PANDAS for internal processing")
1484
+
1485
+ # Merge deltas and download data
1486
+ if not qualified_deltas:
1487
+ # Return empty table with original read_as type
1488
+ return empty_table(original_read_as)
1489
+
1490
+ # Special handling for non-empty schemaless tables
1491
+ if table_version_obj.schema is None:
1492
+ result = _handle_schemaless_table_read(
1493
+ qualified_deltas,
1494
+ effective_read_as,
1495
+ **kwargs,
1496
+ )
1497
+ # Convert to numpy if original request was for numpy
1498
+ if original_read_as == DatasetType.NUMPY:
1499
+ return _convert_pandas_to_numpy(result)
1500
+ return result
1501
+
1502
+ # Get schemas for each manifest entry
1503
+ entry_index_to_schema = _build_entry_index_to_schema_mapping(
1504
+ qualified_deltas, table_version_obj, **kwargs
1505
+ )
1506
+ # Standard non-empty schema table read path - merge deltas and download data
1507
+ merged_delta = Delta.merge_deltas(qualified_deltas)
1508
+
1509
+ # Convert read parameters to download parameters
1510
+ table_type = (
1511
+ effective_read_as
1512
+ if effective_read_as in DatasetType.local()
1513
+ else (kwargs.pop("table_type", None) or DatasetType.PYARROW)
1514
+ )
1515
+ distributed_dataset_type = (
1516
+ effective_read_as if effective_read_as in DatasetType.distributed() else None
1517
+ )
1518
+
1519
+ # Validate input parameters
1520
+ _validate_read_table_input(
1521
+ namespace,
1522
+ table,
1523
+ table_version_obj.schema,
1524
+ table_type,
1525
+ distributed_dataset_type,
1526
+ )
1527
+
1528
+ # Determine max parallelism
1529
+ max_parallelism = _get_max_parallelism(
1530
+ max_parallelism,
1531
+ distributed_dataset_type,
1532
+ )
1533
+ # Filter out parameters that are already passed as keyword arguments
1534
+ # to avoid "multiple values for argument" errors
1535
+ filtered_kwargs = {
1536
+ k: v
1537
+ for k, v in kwargs.items()
1538
+ if k
1539
+ not in [
1540
+ "delta_like",
1541
+ "table_type",
1542
+ "storage_type",
1543
+ "max_parallelism",
1544
+ "columns",
1545
+ "distributed_dataset_type",
1546
+ "file_path_column",
1547
+ ]
1548
+ }
1549
+ result = _get_storage(**kwargs).download_delta(
1550
+ merged_delta,
1551
+ table_type=effective_read_as,
1552
+ storage_type=StorageType.DISTRIBUTED
1553
+ if distributed_dataset_type
1554
+ else StorageType.LOCAL,
1555
+ max_parallelism=max_parallelism,
1556
+ columns=columns,
1557
+ distributed_dataset_type=distributed_dataset_type,
1558
+ file_path_column=file_path_column,
1559
+ **filtered_kwargs,
1560
+ )
1561
+
1562
+ # Handle local storage table concatenation and PYARROW_PARQUET lazy materialization
1563
+ if not distributed_dataset_type and table_type and isinstance(result, list):
1564
+ if table_type == DatasetType.PYARROW_PARQUET:
1565
+ # For PYARROW_PARQUET, preserve lazy materialization:
1566
+ return result[0] if len(result) == 1 else result
1567
+ else:
1568
+ # For other types, perform normal concatenation
1569
+ result = _handle_local_table_concatenation(
1570
+ result,
1571
+ table_type,
1572
+ table_version_obj.schema,
1573
+ entry_index_to_schema,
1574
+ file_path_column,
1575
+ columns,
1576
+ )
1577
+ # Convert to numpy if original request was for numpy
1578
+ if original_read_as == DatasetType.NUMPY:
1579
+ return _convert_pandas_to_numpy(result)
1580
+
1581
+ return result
1582
+
1583
+
1584
+ def _convert_pandas_to_numpy(dataset: Dataset):
1585
+ """Convert pandas DataFrame to numpy ndarray."""
1586
+ if not isinstance(dataset, pd.DataFrame):
1587
+ raise ValueError(f"Expected pandas DataFrame but found {type(dataset)}")
1588
+ return dataset.to_numpy()
1589
+
1590
+
1591
+ def _coerce_dataset_to_schema(
1592
+ dataset: Dataset, target_schema: pa.Schema, manifest_entry_schema: Schema
1593
+ ) -> Dataset:
1594
+ """Coerce a dataset to match the target PyArrow schema using DeltaCAT Schema.coerce method."""
1595
+ # Convert target PyArrow schema to DeltaCAT schema and use its coerce method
1596
+ deltacat_schema = Schema.of(schema=target_schema)
1597
+ return deltacat_schema.coerce(dataset, manifest_entry_schema)
1598
+
1599
+
1600
+ def _coerce_results_to_schema(
1601
+ results: Dataset, target_schema: pa.Schema, entry_index_to_schema: List[Schema]
1602
+ ) -> List[Dataset]:
1603
+ """Coerce all table results to match the target schema."""
1604
+ coerced_results = []
1605
+ for i, table_result in enumerate(results):
1606
+ coerced_result = _coerce_dataset_to_schema(
1607
+ table_result, target_schema, entry_index_to_schema[i]
1608
+ )
1609
+ coerced_results.append(coerced_result)
1610
+ logger.debug(f"Coerced table {i} to unified schema")
1611
+ return coerced_results
1612
+
1613
+
1614
+ def _create_target_schema(
1615
+ arrow_schema: pa.Schema,
1616
+ columns: Optional[List[str]] = None,
1617
+ file_path_column: Optional[str] = None,
1618
+ ) -> pa.Schema:
1619
+ """Create target schema for concatenation with optional column selection and file_path_column."""
1620
+ if columns is not None:
1621
+ # Column selection - use only specified columns
1622
+ field_map = {field.name: field for field in arrow_schema}
1623
+ selected_fields = []
1624
+
1625
+ for col_name in columns:
1626
+ if col_name in field_map:
1627
+ selected_fields.append(field_map[col_name])
1628
+ arrow_schema = pa.schema(selected_fields)
1629
+ if file_path_column and file_path_column not in arrow_schema.names:
1630
+ arrow_schema = arrow_schema.append(pa.field(file_path_column, pa.string()))
1631
+ return arrow_schema
1632
+
1633
+
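# A small illustration of _create_target_schema: column selection keeps only the
# requested fields (in the requested order), and file_path_column is appended as a
# string field when not already present. The field names here are hypothetical.
_base_schema = pa.schema(
    [("id", pa.int64()), ("name", pa.string()), ("ts", pa.timestamp("us"))]
)
_target = _create_target_schema(_base_schema, columns=["name", "id"], file_path_column="path")
# _target.names == ["name", "id", "path"]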
1634
+ def _create_entry_schemas_for_concatenation(
1635
+ entry_index_to_schema: List[Schema],
1636
+ columns: Optional[List[str]] = None,
1637
+ file_path_column: Optional[str] = None,
1638
+ ) -> List[Schema]:
1639
+ """Create entry schemas for concatenation, optionally filtered by column selection."""
1640
+ if columns is None:
1641
+ # No column selection - return original schemas as-is
1642
+ return entry_index_to_schema
1643
+
1644
+ # Column selection - filter each entry schema
1645
+ modified_schemas = []
1646
+ for entry_schema in entry_index_to_schema:
1647
+ if entry_schema and entry_schema.arrow:
1648
+ filtered_schema = _create_target_schema(
1649
+ entry_schema.arrow, columns, file_path_column
1650
+ )
1651
+ modified_schemas.append(Schema.of(schema=filtered_schema))
1652
+ else:
1653
+ modified_schemas.append(entry_schema)
1654
+
1655
+ return modified_schemas
1656
+
1657
+
1658
+ def _handle_local_table_concatenation(
1659
+ results: Dataset,
1660
+ table_type: DatasetType,
1661
+ table_schema: Optional[Schema],
1662
+ entry_index_to_schema: List[Schema],
1663
+ file_path_column: Optional[str] = None,
1664
+ columns: Optional[List[str]] = None,
1665
+ ) -> Dataset:
1666
+ """Handle concatenation of local table results with schema coercion."""
1667
+ logger.debug(f"Target table schema for concatenation: {table_schema}")
1668
+
1669
+ # Create target schema for coercion, respecting column selection
1670
+ target_schema = _create_target_schema(table_schema.arrow, columns, file_path_column)
1671
+ logger.debug(f"Created target schema: {target_schema.names}")
1672
+
1673
+ # Filter entry schemas to match column selection and file_path_column
1674
+ modified_entry_schemas = _create_entry_schemas_for_concatenation(
1675
+ entry_index_to_schema, columns, file_path_column
1676
+ )
1677
+
1678
+ # Coerce results to unified schema
1679
+ coerced_results = _coerce_results_to_schema(
1680
+ results, target_schema, modified_entry_schemas
1681
+ )
1682
+
1683
+ # Second step: concatenate the coerced results
1684
+ logger.debug(
1685
+ f"Concatenating {len(coerced_results)} local tables of type {table_type} with unified schemas"
1686
+ )
1687
+ concatenated_result = concat_tables(coerced_results, table_type)
1688
+ logger.debug(f"Concatenation complete, result type: {type(concatenated_result)}")
1689
+ return concatenated_result
1690
+
1691
+
1692
+ def read_table(
1693
+ table: str,
1694
+ *args,
1695
+ namespace: Optional[str] = None,
1696
+ table_version: Optional[str] = None,
1697
+ read_as: DatasetType = DatasetType.DAFT,
1698
+ partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
1699
+ max_parallelism: Optional[int] = None,
1700
+ columns: Optional[List[str]] = None,
1701
+ file_path_column: Optional[str] = None,
1702
+ transaction: Optional[Transaction] = None,
1703
+ **kwargs,
1704
+ ) -> Dataset:
1705
+ """Read a table into a dataset.
1706
+
1707
+ Args:
1708
+ table: Name of the table to read.
1709
+ namespace: Optional namespace of the table. Uses default if not specified.
1710
+ table_version: Optional specific version of the table to read.
1711
+ read_as: Dataset type to use for reading table files. Defaults to DatasetType.DAFT.
1712
+ partition_filter: Optional list of partitions to read from.
1713
+ max_parallelism: Optional maximum parallelism for data download. Currently fixed at
1714
+ 1 for local dataset type reads (i.e., members of DatasetType.local()) and defaults
1715
+ to 100 for distributed dataset type reads (i.e., members of DatasetType.distributed()).
1716
+ columns: Optional list of columns to include in the result.
1717
+ file_path_column: Optional column name to add file paths to the result.
1718
+ transaction: Optional transaction to chain this read operation to. If provided, uncommitted
1719
+ changes from the transaction will be visible to this read operation.
1720
+ **kwargs: Additional keyword arguments.
1721
+
1722
+ Returns:
1723
+ Dataset containing the table data.
1724
+ """
1725
+ # Set up transaction handling
1726
+ read_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1727
+ kwargs["transaction"] = read_transaction
1728
+
1729
+ try:
1730
+ # Resolve namespace and get table metadata
1731
+ namespace = namespace or default_namespace()
1732
+
1733
+ table_version_obj = _get_latest_active_or_given_table_version(
1734
+ namespace=namespace,
1735
+ table_name=table,
1736
+ table_version=table_version,
1737
+ **kwargs,
1738
+ )
1739
+
1740
+ # Get partitions and deltas to read
1741
+ qualified_deltas = _get_qualified_deltas_for_read(
1742
+ table,
1743
+ namespace,
1744
+ table_version_obj.table_version,
1745
+ partition_filter,
1746
+ **kwargs,
1747
+ )
1748
+
1749
+ # Download and process the data
1750
+ # TODO(pdames): Remove once we implement a custom SerDe for pa.ParquetFile
1751
+ if read_as == DatasetType.PYARROW_PARQUET:
1752
+ max_parallelism = 1
1753
+ logger.warning(
1754
+ f"Forcing max_parallelism to 1 for PyArrow Parquet reads to avoid serialization errors."
1755
+ )
1756
+ result = _download_and_process_table_data(
1757
+ namespace,
1758
+ table,
1759
+ qualified_deltas,
1760
+ read_as,
1761
+ max_parallelism,
1762
+ columns,
1763
+ file_path_column,
1764
+ table_version_obj,
1765
+ **kwargs,
1766
+ )
1767
+ return result
1768
+ except Exception as e:
1769
+ # If any error occurs, the transaction remains uncommitted
1770
+ commit_transaction = False
1771
+ logger.error(f"Error during read_table: {e}")
1772
+ raise
1773
+ finally:
1774
+ if commit_transaction:
1775
+ # Seal the interactive transaction to commit all operations atomically
1776
+ read_transaction.seal()
1777
+
1778
+
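# A minimal read_table usage sketch, assuming an initialized catalog that already
# contains the (hypothetical) table "events" in namespace "analytics":
df = read_table(
    "events",
    namespace="analytics",
    read_as=DatasetType.PANDAS,
    columns=["user_id", "event_time"],
    file_path_column="source_file",
)
# `df` is a pandas DataFrame limited to the requested columns plus "source_file".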
1779
+ def alter_table(
1780
+ table: str,
1781
+ *args,
1782
+ namespace: Optional[str] = None,
1783
+ table_version: Optional[str] = None,
1784
+ lifecycle_state: Optional[LifecycleState] = None,
1785
+ schema_updates: Optional[SchemaUpdate] = None,
1786
+ partition_updates: Optional[Dict[str, Any]] = None,
1787
+ sort_scheme: Optional[SortScheme] = None,
1788
+ table_description: Optional[str] = None,
1789
+ table_version_description: Optional[str] = None,
1790
+ table_properties: Optional[TableProperties] = None,
1791
+ table_version_properties: Optional[TableVersionProperties] = None,
1792
+ transaction: Optional[Transaction] = None,
1793
+ **kwargs,
1794
+ ) -> None:
1795
+ """Alter deltacat table/table_version definition.
1796
+
1797
+ Modifies various aspects of a table's metadata including lifecycle state,
1798
+ schema, partitioning, sort keys, description, and properties.
1799
+
1800
+ Args:
1801
+ table: Name of the table to alter.
1802
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
1803
+ table_version: Optional specific version of the table to alter. Defaults to the latest active version.
1804
+ lifecycle_state: New lifecycle state for the table.
1805
+ schema_updates: Schema updates to apply.
1806
+ partition_updates: Partition scheme updates to apply.
1807
+ sort_scheme: New sort scheme.
1808
+ table_description: New description for the table.
1809
+ table_version_description: New description for the table version. Defaults to `table_description` if not specified.
1810
+ table_properties: New table properties.
1811
+ table_version_properties: New table version properties. Defaults to the current parent table properties if not specified.
1812
+ transaction: Optional transaction to use. If None, creates a new transaction.
1813
+
1814
+ Returns:
1815
+ None
1816
+
1817
+ Raises:
1818
+ TableNotFoundError: If the table does not already exist.
1819
+ TableVersionNotFoundError: If the specified table version or active table version does not exist.
1820
+ """
1821
+ resolved_table_properties = None
1822
+ if table_properties is not None:
1823
+ resolved_table_properties = _add_default_table_properties(table_properties)
1824
+ _validate_table_properties(resolved_table_properties)
1825
+
1826
+ namespace = namespace or default_namespace()
1827
+
1828
+ # Set up transaction handling
1829
+ alter_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1830
+ kwargs["transaction"] = alter_transaction
1831
+
1832
+ try:
1833
+ if partition_updates:
1834
+ raise NotImplementedError("Partition updates are not yet supported.")
1835
+ if sort_scheme:
1836
+ raise NotImplementedError("Sort scheme updates are not yet supported.")
1837
+
1838
+ new_table: Table = _get_storage(**kwargs).update_table(
1839
+ *args,
1840
+ namespace=namespace,
1841
+ table_name=table,
1842
+ description=table_description,
1843
+ properties=resolved_table_properties,
1844
+ **kwargs,
1845
+ )
1846
+
1847
+ if table_version is None:
1848
+ table_version: Optional[TableVersion] = _get_storage(
1849
+ **kwargs
1850
+ ).get_latest_active_table_version(namespace, table, **kwargs)
1851
+ if table_version is None:
1852
+ raise TableVersionNotFoundError(
1853
+ f"No active table version found for table {namespace}.{table}. "
1854
+ "Please specify a table_version parameter."
1855
+ )
1856
+ else:
1857
+ requested_table_version = table_version
+ table_version = _get_storage(**kwargs).get_table_version(
1858
+ namespace, table, requested_table_version, **kwargs
1859
+ )
1860
+ if table_version is None:
1861
+ raise TableVersionNotFoundError(
1862
+ f"Table version '{requested_table_version}' not found for table {namespace}.{table}"
1863
+ )
1864
+
1865
+ # Get table properties for schema evolution
1866
+ schema_evolution_mode = table_version.read_table_property(
1867
+ TableProperty.SCHEMA_EVOLUTION_MODE
1868
+ )
1869
+ if schema_updates and schema_evolution_mode == SchemaEvolutionMode.DISABLED:
1870
+ raise TableValidationError(
1871
+ "Schema evolution is disabled for this table. Please enable schema evolution or remove schema updates."
1872
+ )
1873
+
1874
+ # Only update table version properties if they are explicitly provided
1875
+ resolved_tv_properties = None
1876
+ if table_version_properties is not None:
1877
+ # inherit properties from the parent table if not specified
1878
+ default_tv_properties = new_table.properties
1879
+ if table_version.schema is None:
1880
+ # schemaless tables don't validate reader compatibility by default
1881
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
1882
+ resolved_tv_properties = _add_default_table_properties(
1883
+ table_version_properties,
1884
+ default_tv_properties,
1885
+ )
1886
+ _validate_table_properties(resolved_tv_properties)
1887
+
1888
+ # Apply schema updates if provided
1889
+ updated_schema = None
1890
+ if schema_updates is not None:
1891
+ # Get the current schema from the table version
1892
+ current_schema = table_version.schema
1893
+ if current_schema != schema_updates.base_schema:
1894
+ raise ValueError(
1895
+ f"Schema updates are not compatible with the current schema for table `{namespace}.{table}`. Current schema: {current_schema}, Schema update base schema: {schema_updates.base_schema}"
1896
+ )
1897
+
1898
+ # Apply all the updates to get the final schema
1899
+ updated_schema = schema_updates.apply()
1900
+
1901
+ _get_storage(**kwargs).update_table_version(
1902
+ *args,
1903
+ namespace=namespace,
1904
+ table_name=table,
1905
+ table_version=table_version.id,
1906
+ lifecycle_state=lifecycle_state,
1907
+ description=table_version_description or table_description,
1908
+ schema=updated_schema,
1909
+ properties=resolved_tv_properties, # This will be None if table_version_properties was not provided
1910
+ **kwargs,
1911
+ )
1912
+
1913
+ except Exception as e:
1914
+ # If any error occurs, the transaction remains uncommitted
1915
+ commit_transaction = False
1916
+ logger.error(f"Error during alter_table: {e}")
1917
+ raise
1918
+ finally:
1919
+ if commit_transaction:
1920
+ # Seal the interactive transaction to commit all operations atomically
1921
+ alter_transaction.seal()
1922
+
1923
+
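# A minimal alter_table usage sketch (hypothetical names), updating only the table
# (and, by default, table version) description while leaving schema, partitioning,
# sort order, and properties untouched:
alter_table(
    "events",
    namespace="analytics",
    table_description="Clickstream events, compacted on append thresholds",
)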
1924
+ def _add_default_table_properties(
1925
+ table_properties: Optional[TableProperties],
1926
+ default_table_properties: TableProperties = TablePropertyDefaultValues,
1927
+ ) -> TableProperties:
1928
+ if table_properties is None:
1929
+ table_properties = {}
1930
+ for k, v in default_table_properties.items():
1931
+ if k not in table_properties:
1932
+ table_properties[k] = v
1933
+ return table_properties
1934
+
1935
+
1936
+ def _validate_table_properties(
1937
+ table_properties: TableProperties,
1938
+ ) -> None:
1939
+ read_optimization_level = table_properties.get(
1940
+ TableProperty.READ_OPTIMIZATION_LEVEL,
1941
+ TablePropertyDefaultValues[TableProperty.READ_OPTIMIZATION_LEVEL],
1942
+ )
1943
+ if read_optimization_level != TableReadOptimizationLevel.MAX:
1944
+ raise NotImplementedError(
1945
+ f"Table read optimization level `{read_optimization_level} is not yet supported. Please use {TableReadOptimizationLevel.MAX}"
1946
+ )
1947
+
1948
+
1949
+ def create_table(
1950
+ table: str,
1951
+ *args,
1952
+ namespace: Optional[str] = None,
1953
+ table_version: Optional[str] = None,
1954
+ lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
1955
+ schema: Optional[Schema] = None,
1956
+ partition_scheme: Optional[PartitionScheme] = None,
1957
+ sort_keys: Optional[SortScheme] = None,
1958
+ table_description: Optional[str] = None,
1959
+ table_version_description: Optional[str] = None,
1960
+ table_properties: Optional[TableProperties] = None,
1961
+ table_version_properties: Optional[TableVersionProperties] = None,
1962
+ namespace_properties: Optional[NamespaceProperties] = None,
1963
+ content_types: Optional[List[ContentType]] = None,
1964
+ fail_if_exists: bool = True,
1965
+ transaction: Optional[Transaction] = None,
1966
+ **kwargs,
1967
+ ) -> TableDefinition:
1968
+ """Create an empty table in the catalog.
1969
+
1970
+ If a namespace isn't provided, the table will be created within the default deltacat namespace.
1971
+ Additionally if the provided namespace does not exist, it will be created for you.
1972
+
1973
+ Args:
1974
+ table: Name of the table to create.
1975
+ namespace: Optional namespace for the table. Uses default namespace if not specified.
1976
+ table_version: Optional version identifier for the table.
1977
+ lifecycle_state: Lifecycle state of the new table. Defaults to ACTIVE.
1978
+ schema: Schema definition for the table.
1979
+ partition_scheme: Optional partitioning scheme for the table.
1980
+ sort_keys: Optional sort keys for the table.
1981
+ table_description: Optional description of the table.
1982
+ table_version_description: Optional description for the table version.
1983
+ table_properties: Optional properties for the table.
1984
+ table_version_properties: Optional properties for the table version. Defaults to the current parent table properties if not specified.
1985
+ namespace_properties: Optional properties for the namespace if it needs to be created.
1986
+ content_types: Optional list of allowed content types for the table.
1987
+ fail_if_exists: If True, raises an error if table already exists. If False, returns existing table.
1988
+ transaction: Optional transaction to use. If None, creates a new transaction.
1989
+
1990
+ Returns:
1991
+ TableDefinition object for the created or existing table.
1992
+
1993
+ Raises:
1994
+ TableAlreadyExistsError: If the table already exists and fail_if_exists is True.
1995
+ NamespaceNotFoundError: If the provided namespace does not exist.
1996
+ """
1997
+ resolved_table_properties = _add_default_table_properties(table_properties)
1998
+ # Note: resolved_tv_properties will be set after checking existing table
1999
+
2000
+ namespace = namespace or default_namespace()
2001
+
2002
+ # Set up transaction handling
2003
+ create_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2004
+ kwargs["transaction"] = create_transaction
2005
+
2006
+ try:
2007
+ existing_table = (
2008
+ get_table(
2009
+ table,
2010
+ namespace=namespace,
2011
+ table_version=table_version,
2012
+ *args,
2013
+ **kwargs,
2014
+ )
2015
+ if "existing_table_definition" not in kwargs
2016
+ else kwargs["existing_table_definition"]
2017
+ )
2018
+ if existing_table is not None:
2019
+ if existing_table.table_version and existing_table.stream:
2020
+ if fail_if_exists:
2021
+ table_identifier = (
2022
+ f"{namespace}.{table}"
2023
+ if not table_version
2024
+ else f"{namespace}.{table}.{table_version}"
2025
+ )
2026
+ raise TableAlreadyExistsError(
2027
+ f"Table {table_identifier} already exists"
2028
+ )
2029
+ return existing_table
2030
+ # the table exists but the table version doesn't - inherit the existing table properties
2031
+ # Also ensure table properties are inherited when not explicitly provided
2032
+ if table_properties is None:
2033
+ resolved_table_properties = existing_table.table.properties
2034
+
2035
+ # Set up table version properties based on existing table or explicit properties
2036
+ default_tv_properties = resolved_table_properties
2037
+ if schema is None:
2038
+ default_tv_properties = dict(
2039
+ default_tv_properties
2040
+ ) # Make a copy to avoid modifying original
2041
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
2042
+ resolved_tv_properties = _add_default_table_properties(
2043
+ table_version_properties, default_tv_properties
2044
+ )
2045
+ else:
2046
+ # create the namespace if it doesn't exist
2047
+ if not namespace_exists(namespace, **kwargs):
2048
+ create_namespace(
2049
+ namespace=namespace,
2050
+ properties=namespace_properties,
2051
+ *args,
2052
+ **kwargs,
2053
+ )
2054
+
2055
+ # Set up table version properties for new table
2056
+ default_tv_properties = resolved_table_properties
2057
+ if schema is None:
2058
+ default_tv_properties = dict(
2059
+ default_tv_properties
2060
+ ) # Make a copy to avoid modifying original
2061
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
2062
+ resolved_tv_properties = _add_default_table_properties(
2063
+ table_version_properties, default_tv_properties
2064
+ )
2065
+
2066
+ _validate_table_properties(resolved_tv_properties)
2067
+
2068
+ (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
2069
+ namespace=namespace,
2070
+ table_name=table,
2071
+ table_version=table_version,
2072
+ schema=schema,
2073
+ partition_scheme=partition_scheme,
2074
+ sort_keys=sort_keys,
2075
+ table_version_description=table_version_description
2076
+ if table_version_description is not None
2077
+ else table_description,
2078
+ table_description=table_description,
2079
+ table_properties=resolved_table_properties,
2080
+ table_version_properties=resolved_tv_properties,
2081
+ lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
2082
+ supported_content_types=content_types,
2083
+ *args,
2084
+ **kwargs,
2085
+ )
2086
+
2087
+ result = TableDefinition.of(
2088
+ table=table,
2089
+ table_version=table_version,
2090
+ stream=stream,
2091
+ )
2092
+
2093
+ return result
2094
+
2095
+ except Exception as e:
2096
+ # If any error occurs, the transaction remains uncommitted
2097
+ commit_transaction = False
2098
+ logger.error(f"Error during create_table: {e}")
2099
+ raise
2100
+ finally:
2101
+ if commit_transaction:
2102
+ # Seal the interactive transaction to commit all operations atomically
2103
+ create_transaction.seal()
2104
+
2105
+
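# A minimal create_table usage sketch (hypothetical names), assuming Schema.of accepts
# a plain pyarrow schema as it is used elsewhere in this module; fail_if_exists=False
# returns the existing definition instead of raising TableAlreadyExistsError:
definition = create_table(
    "events",
    namespace="analytics",
    schema=Schema.of(schema=pa.schema([("id", pa.int64()), ("payload", pa.string())])),
    fail_if_exists=False,
)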
2106
+ def drop_table(
2107
+ table: str,
2108
+ *args,
2109
+ namespace: Optional[str] = None,
2110
+ table_version: Optional[str] = None,
2111
+ purge: bool = False,
2112
+ transaction: Optional[Transaction] = None,
2113
+ **kwargs,
2114
+ ) -> None:
2115
+ """Drop a table from the catalog and optionally purges underlying data.
2116
+
2117
+ Args:
2118
+ table: Name of the table to drop.
2119
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2120
+ table_version: Optional table version of the table to drop. If not specified, the parent table of all
2121
+ table versions will be dropped.
2122
+ purge: If True, permanently delete the table data. If False, only remove from catalog.
2123
+ transaction: Optional transaction to use. If None, creates a new transaction.
2124
+
2125
+ Returns:
2126
+ None
2127
+
2128
+ Raises:
2129
+ TableNotFoundError: If the table does not exist.
2130
+
2131
+ TODO: Honor purge once garbage collection is implemented.
2132
+ TODO: Drop table version if specified, possibly create a delete_table_version api.
2133
+ """
2134
+ if purge:
2135
+ raise NotImplementedError("Purge flag is not currently supported.")
2136
+
2137
+ namespace = namespace or default_namespace()
2138
+
2139
+ # Set up transaction handling
2140
+ drop_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2141
+ kwargs["transaction"] = drop_transaction
2142
+
2143
+ try:
2144
+ if not table_version:
2145
+ _get_storage(**kwargs).delete_table(
2146
+ namespace=namespace,
2147
+ table_name=table,
2148
+ purge=purge,
2149
+ *args,
2150
+ **kwargs,
2151
+ )
2152
+ else:
2153
+ _get_storage(**kwargs).update_table_version(
2154
+ namespace=namespace,
2155
+ table_name=table,
2156
+ table_version=table_version,
2157
+ lifecycle_state=LifecycleState.DELETED,
2158
+ *args,
2159
+ **kwargs,
2160
+ )
2161
+
2162
+ except Exception as e:
2163
+ # If any error occurs, the transaction remains uncommitted
2164
+ commit_transaction = False
2165
+ logger.error(f"Error during drop_table: {e}")
2166
+ raise
2167
+ finally:
2168
+ if commit_transaction:
2169
+ # Seal the interactive transaction to commit all operations atomically
2170
+ drop_transaction.seal()
2171
+
2172
+
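# A minimal drop_table usage sketch (hypothetical names). Omitting table_version drops
# the parent table from the catalog; passing one only marks that version DELETED.
# purge=True is rejected until garbage collection is implemented.
drop_table("events", namespace="analytics")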
2173
+ def refresh_table(
2174
+ table: str,
2175
+ *args,
2176
+ namespace: Optional[str] = None,
2177
+ table_version: Optional[str] = None,
2178
+ transaction: Optional[Transaction] = None,
2179
+ **kwargs,
2180
+ ) -> None:
2181
+ """Refresh metadata cached on the Ray cluster for the given table.
2182
+
2183
+ Args:
2184
+ table: Name of the table to refresh.
2185
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2186
+ table_version: Optional specific version of the table to refresh.
2187
+ transaction: Optional transaction to use. If None, creates a new transaction.
2188
+
2189
+ Returns:
2190
+ None
2191
+ """
2192
+ raise NotImplementedError("refresh_table not implemented")
2193
+
2194
+
2195
+ def list_tables(
2196
+ *args,
2197
+ namespace: Optional[str] = None,
2198
+ table: Optional[str] = None,
2199
+ transaction: Optional[Transaction] = None,
2200
+ **kwargs,
2201
+ ) -> ListResult[TableDefinition]:
2202
+ """List a page of table definitions.
2203
+
2204
+ Args:
2205
+ namespace: Optional namespace to list tables from. Uses default namespace if not specified.
2206
+ table: Optional table to list its table versions. If not specified, lists the latest active version of each table in the namespace.
2207
+ transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.
2208
+
2209
+ Returns:
2210
+ ListResult containing TableDefinition objects for tables in the namespace.
2211
+ """
2212
+ namespace = namespace or default_namespace()
2213
+
2214
+ # Set up transaction handling
2215
+ list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2216
+ kwargs["transaction"] = list_transaction
2217
+
2218
+ try:
2219
+ if not table:
2220
+ tables = _get_storage(**kwargs).list_tables(
2221
+ namespace=namespace,
2222
+ *args,
2223
+ **kwargs,
2224
+ )
2225
+ table_definitions = [
2226
+ get_table(table.table_name, namespace=namespace, *args, **kwargs)
2227
+ for table in tables.all_items()
2228
+ ]
2229
+ else:
2230
+ table_versions = _get_storage(**kwargs).list_table_versions(
2231
+ namespace=namespace,
2232
+ table_name=table,
2233
+ *args,
2234
+ **kwargs,
2235
+ )
2236
+ table_definitions = [
2237
+ get_table(
2238
+ table,
2239
+ namespace=namespace,
2240
+ table_version=table_version.id,
2241
+ *args,
2242
+ **kwargs,
2243
+ )
2244
+ for table_version in table_versions.all_items()
2245
+ ]
2246
+
2247
+ result = ListResult(items=table_definitions)
2248
+
2249
+ return result
2250
+
2251
+ except Exception as e:
2252
+ # If any error occurs, the transaction remains uncommitted
2253
+ commit_transaction = False
2254
+ logger.error(f"Error during list_tables: {e}")
2255
+ raise
2256
+ finally:
2257
+ if commit_transaction:
2258
+ # Seal the interactive transaction to commit all operations atomically
2259
+ list_transaction.seal()
2260
+
2261
+
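# A minimal list_tables usage sketch: iterate the latest active version of every table
# in a (hypothetical) namespace via the returned ListResult.
for table_definition in list_tables(namespace="analytics").all_items():
    print(table_definition.table.table_name)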
2262
+ def get_table(
2263
+ table: str,
2264
+ *args,
2265
+ namespace: Optional[str] = None,
2266
+ table_version: Optional[str] = None,
2267
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
2268
+ transaction: Optional[Transaction] = None,
2269
+ **kwargs,
2270
+ ) -> Optional[TableDefinition]:
2271
+ """Get table definition metadata.
2272
+
2273
+ Args:
2274
+ table: Name of the table to retrieve.
2275
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2276
+ table_version: Optional specific version of the table to retrieve. Defaults to the latest active version.
2277
+ stream_format: Optional stream format to retrieve. Defaults to DELTACAT.
2278
+ transaction: Optional transaction to use. If None, creates a new transaction.
2279
+
2280
+ Returns:
2281
+ Deltacat TableDefinition if the table exists, None otherwise. The table definition's table version will be
2282
+ None if the requested version is not found. The table definition's stream will be None if the requested stream
2283
+ format is not found.
2284
+ """
2285
+ namespace = namespace or default_namespace()
2286
+
2287
+ # Set up transaction handling
2288
+ get_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2289
+ kwargs["transaction"] = get_transaction
2290
+
2291
+ try:
2292
+ table_obj: Optional[Table] = _get_storage(**kwargs).get_table(
2293
+ table_name=table,
2294
+ namespace=namespace,
2295
+ *args,
2296
+ **kwargs,
2297
+ )
2298
+
2299
+ if table_obj is None:
2300
+ return None
2301
+
2302
+ table_version_obj: Optional[TableVersion] = _get_storage(
2303
+ **kwargs
2304
+ ).get_table_version(
2305
+ namespace,
2306
+ table,
2307
+ table_version or table_obj.latest_active_table_version,
2308
+ *args,
2309
+ **kwargs,
2310
+ )
2311
+
2312
+ stream = None
2313
+ if table_version_obj:
2314
+ stream = _get_storage(**kwargs).get_stream(
2315
+ namespace=namespace,
2316
+ table_name=table,
2317
+ table_version=table_version_obj.id,
2318
+ stream_format=stream_format,
2319
+ *args,
2320
+ **kwargs,
2321
+ )
2322
+
2323
+ return TableDefinition.of(
2324
+ table=table_obj,
2325
+ table_version=table_version_obj,
2326
+ stream=stream,
2327
+ )
2328
+ except Exception as e:
2329
+ # If any error occurs, the transaction remains uncommitted
2330
+ commit_transaction = False
2331
+ logger.error(f"Error during get_table: {e}")
2332
+ raise
2333
+ finally:
2334
+ if commit_transaction:
2335
+ # Seal the interactive transaction to commit all operations atomically
2336
+ get_transaction.seal()
2337
+
2338
+
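# A minimal get_table usage sketch (hypothetical names): fetch the latest active
# version's definition and inspect its schema, treating None as "table not found".
definition = get_table("events", namespace="analytics")
if definition is not None and definition.table_version is not None:
    print(definition.table_version.schema)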
2339
+ def truncate_table(
2340
+ table: str,
2341
+ *args,
2342
+ namespace: Optional[str] = None,
2343
+ table_version: Optional[str] = None,
2344
+ transaction: Optional[Transaction] = None,
2345
+ **kwargs,
2346
+ ) -> None:
2347
+ """Truncate table data.
2348
+
2349
+ Args:
2350
+ table: Name of the table to truncate.
2351
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2352
+ table_version: Optional specific version of the table to truncate. Defaults to the latest active version.
2353
+ transaction: Optional transaction to use. If None, creates a new transaction.
2354
+
2355
+ Returns:
2356
+ None
2357
+ """
2358
+ raise NotImplementedError("truncate_table not implemented")
2359
+
2360
+
2361
+ def rename_table(
2362
+ table: str,
2363
+ new_name: str,
2364
+ *args,
2365
+ namespace: Optional[str] = None,
2366
+ transaction: Optional[Transaction] = None,
2367
+ **kwargs,
2368
+ ) -> None:
2369
+ """Rename an existing table.
2370
+
2371
+ Args:
2372
+ table: Current name of the table.
2373
+ new_name: New name for the table.
2374
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2375
+ transaction: Optional transaction to use. If None, creates a new transaction.
2376
+
2377
+ Returns:
2378
+ None
2379
+
2380
+ Raises:
2381
+ TableNotFoundError: If the table does not exist.
2382
+ """
2383
+ namespace = namespace or default_namespace()
2384
+
2385
+ # Set up transaction handling
2386
+ rename_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2387
+ kwargs["transaction"] = rename_transaction
2388
+
2389
+ try:
2390
+ _get_storage(**kwargs).update_table(
2391
+ table_name=table,
2392
+ new_table_name=new_name,
2393
+ namespace=namespace,
2394
+ *args,
2395
+ **kwargs,
2396
+ )
2397
+
2398
+ except Exception as e:
2399
+ # If any error occurs, the transaction remains uncommitted
2400
+ commit_transaction = False
2401
+ logger.error(f"Error during rename_table: {e}")
2402
+ raise
2403
+ finally:
2404
+ if commit_transaction:
2405
+ # Seal the interactive transaction to commit all operations atomically
2406
+ rename_transaction.seal()
2407
+
2408
+
2409
+ def table_exists(
2410
+ table: str,
2411
+ *args,
2412
+ namespace: Optional[str] = None,
2413
+ table_version: Optional[str] = None,
2414
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
2415
+ transaction: Optional[Transaction] = None,
2416
+ **kwargs,
2417
+ ) -> bool:
2418
+ """Check if a table exists in the catalog.
2419
+
2420
+ Args:
2421
+ table: Name of the table to check.
2422
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2423
+ table_version: Optional specific version of the table to check. Defaults to the latest active version.
2424
+ stream_format: Optional stream format to check. Defaults to DELTACAT.
2425
+ transaction: Optional transaction to use. If None, creates a new transaction.
2426
+
2427
+ Returns:
2428
+ True if the table exists, False otherwise.
2429
+ """
2430
+ namespace = namespace or default_namespace()
2431
+
2432
+ # Set up transaction handling
2433
+ exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2434
+ kwargs["transaction"] = exists_transaction
2435
+
2436
+ try:
2437
+ table_obj = _get_storage(**kwargs).get_table(
2438
+ namespace=namespace,
2439
+ table_name=table,
2440
+ *args,
2441
+ **kwargs,
2442
+ )
2443
+ if table_obj is None:
2444
+ return False
2445
+ table_version = table_version or table_obj.latest_active_table_version
2446
+ if not table_version:
2447
+ return False
2448
+ table_version_exists = _get_storage(**kwargs).table_version_exists(
2449
+ namespace,
2450
+ table,
2451
+ table_version,
2452
+ *args,
2453
+ **kwargs,
2454
+ )
2455
+ if not table_version_exists:
2456
+ return False
2457
+ stream_exists = _get_storage(**kwargs).stream_exists(
2458
+ namespace=namespace,
2459
+ table_name=table,
2460
+ table_version=table_version,
2461
+ stream_format=stream_format,
2462
+ *args,
2463
+ **kwargs,
2464
+ )
2465
+ return stream_exists
2466
+ except Exception as e:
2467
+ # If any error occurs, the transaction remains uncommitted
2468
+ commit_transaction = False
2469
+ logger.error(f"Error during table_exists: {e}")
2470
+ raise
2471
+ finally:
2472
+ if commit_transaction:
2473
+ # Seal the interactive transaction to commit all operations atomically
2474
+ exists_transaction.seal()
2475
+
2476
+
2477
+ def list_namespaces(
2478
+ *args,
2479
+ transaction: Optional[Transaction] = None,
2480
+ **kwargs,
2481
+ ) -> ListResult[Namespace]:
2482
+ """List a page of table namespaces.
2483
+
2484
+ Args:
2485
+ transaction: Optional transaction to use. If None, creates a new transaction.
2486
+
2487
+ Returns:
2488
+ ListResult containing Namespace objects.
2489
+ """
2490
+ # Set up transaction handling
2491
+ list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2492
+ kwargs["transaction"] = list_transaction
2493
+
2494
+ try:
2495
+ result = _get_storage(**kwargs).list_namespaces(*args, **kwargs)
2496
+
2497
+ return result
2498
+
2499
+ except Exception as e:
2500
+ # If any error occurs, the transaction remains uncommitted
2501
+ commit_transaction = False
2502
+ logger.error(f"Error during list_namespaces: {e}")
2503
+ raise
2504
+ finally:
2505
+ if commit_transaction:
2506
+ # Seal the interactive transaction to commit all operations atomically
2507
+ list_transaction.seal()
2508
+
2509
+
2510
+ def get_namespace(
+     namespace: str,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Optional[Namespace]:
+     """Get metadata for a specific table namespace.
+
+     Args:
+         namespace: Name of the namespace to retrieve.
+         transaction: Optional transaction to use. If None, creates a new transaction.
+
+     Returns:
+         Namespace object if the namespace exists, None otherwise.
+     """
+     # Set up transaction handling
+     get_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = get_ns_transaction
+
+     try:
+         result = _get_storage(**kwargs).get_namespace(
+             *args, namespace=namespace, **kwargs
+         )
+
+         return result
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during get_namespace: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             get_ns_transaction.seal()
+
+
+ def namespace_exists(
+     namespace: str,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> bool:
+     """Check if a namespace exists.
+
+     Args:
+         namespace: Name of the namespace to check.
+         transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.
+
+     Returns:
+         True if the namespace exists, False otherwise.
+     """
+     # Set up transaction handling
+     exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = exists_transaction
+
+     try:
+         result = _get_storage(**kwargs).namespace_exists(
+             *args, namespace=namespace, **kwargs
+         )
+
+         return result
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during namespace_exists: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             exists_transaction.seal()
+
+
+ def create_namespace(
+     namespace: str,
+     *args,
+     properties: Optional[NamespaceProperties] = None,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Namespace:
+     """Create a new namespace.
+
+     Args:
+         namespace: Name of the namespace to create.
+         properties: Optional properties for the namespace.
+         transaction: Optional transaction to use. If None, creates a new transaction.
+
+     Returns:
+         Created Namespace object.
+
+     Raises:
+         NamespaceAlreadyExistsError: If the namespace already exists.
+     """
+     # Set up transaction handling
+     namespace_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = namespace_transaction
+
+     try:
+         if namespace_exists(namespace, **kwargs):
+             raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")
+
+         result = _get_storage(**kwargs).create_namespace(
+             *args, namespace=namespace, properties=properties, **kwargs
+         )
+
+         return result
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during create_namespace: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             namespace_transaction.seal()
+
+
+ def alter_namespace(
+     namespace: str,
+     *args,
+     properties: Optional[NamespaceProperties] = None,
+     new_namespace: Optional[str] = None,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> None:
+     """Alter a namespace definition.
+
+     Args:
+         namespace: Name of the namespace to alter.
+         properties: Optional new properties for the namespace.
+         new_namespace: Optional new name for the namespace.
+         transaction: Optional transaction to use. If None, creates a new transaction.
+
+     Returns:
+         None
+     """
+     # Set up transaction handling
+     alter_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = alter_ns_transaction
+
+     try:
+         _get_storage(**kwargs).update_namespace(
+             namespace=namespace,
+             properties=properties,
+             new_namespace=new_namespace,
+             *args,
+             **kwargs,
+         )
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during alter_namespace: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             alter_ns_transaction.seal()
+
+
+ def drop_namespace(
+     namespace: str,
+     *args,
+     purge: bool = False,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> None:
+     """Drop a namespace and all of its tables from the catalog.
+
+     Args:
+         namespace: Name of the namespace to drop.
+         purge: If True, permanently delete all table data in the namespace.
+             If False, only removes the namespace from the catalog.
+         transaction: Optional transaction to use. If None, creates a new transaction.
+
+     Returns:
+         None
+
+     TODO: Honor purge once garbage collection is implemented.
+     """
+     if purge:
+         raise NotImplementedError("Purge flag is not currently supported.")
+
+     # Set up transaction handling
+     drop_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = drop_ns_transaction
+
+     try:
+         _get_storage(**kwargs).delete_namespace(
+             *args,
+             namespace=namespace,
+             purge=purge,
+             **kwargs,
+         )
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during drop_namespace: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             drop_ns_transaction.seal()
+
+
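Because purge=True currently raises NotImplementedError, only a metadata-only drop is available. Sketch; the namespace name is hypothetical:

# Illustrative sketch only; purge=True is rejected until garbage collection lands.
from deltacat.catalog.main import impl as catalog

catalog.drop_namespace("analytics_v2")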
+ def default_namespace(*args, **kwargs) -> str:
+     """Return the default namespace for the catalog.
+
+     Returns:
+         Name of the default namespace.
+     """
+     return DEFAULT_NAMESPACE
+
+
+ def _get_latest_active_or_given_table_version(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> TableVersion:
+     table_version_obj = None
+     if table_version is None:
+         table_version_obj = _get_storage(**kwargs).get_latest_active_table_version(
+             namespace=namespace,
+             table_name=table_name,
+             *args,
+             **kwargs,
+         )
+         if table_version_obj is None:
+             raise TableVersionNotFoundError(
+                 f"No active table version found for table {namespace}.{table_name}"
+             )
+         table_version = table_version_obj.table_version
+     else:
+         table_version_obj = _get_storage(**kwargs).get_table_version(
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version,
+             *args,
+             **kwargs,
+         )
+
+     return table_version_obj
+
+
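The helper above resolves either the caller-supplied version or the latest active one, raising TableVersionNotFoundError when nothing is active. A sketch of the two call paths as they might be exercised from inside this module; the table and version identifiers are hypothetical:

# Illustrative sketch only (module-internal usage).
# 1) No version given: falls back to the latest active table version.
tv_latest = _get_latest_active_or_given_table_version("retail", "orders")
# 2) Explicit version: fetched directly from storage.
tv_pinned = _get_latest_active_or_given_table_version(
    "retail", "orders", table_version="2"
)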
+ def _get_all_committed_partitions(
+     table: str,
+     namespace: str,
+     table_version: str,
+     **kwargs,
+ ) -> List[Union[Partition, PartitionLocator]]:
+     """Get all committed partitions for a table and validate uniqueness."""
+     logger.info(
+         f"Reading all partitions metadata in the table={table} "
+         "as partition_filter was None."
+     )
+
+     all_partitions = (
+         _get_storage(**kwargs)
+         .list_partitions(
+             table_name=table,
+             namespace=namespace,
+             table_version=table_version,
+             **kwargs,
+         )
+         .all_items()
+     )
+
+     committed_partitions = [
+         partition
+         for partition in all_partitions
+         if partition.state == CommitState.COMMITTED
+     ]
+
+     logger.info(
+         f"Found {len(committed_partitions)} committed partitions for "
+         f"table={namespace}/{table}/{table_version}"
+     )
+
+     _validate_partition_uniqueness(
+         committed_partitions, namespace, table, table_version
+     )
+     return committed_partitions
+
+
+ def _validate_partition_uniqueness(
+     partitions: List[Partition], namespace: str, table: str, table_version: str
+ ) -> None:
+     """Validate that there are no duplicate committed partitions for the same partition values."""
+     commit_count_per_partition_value = defaultdict(int)
+     for partition in partitions:
+         # Normalize partition values: both None and [] represent unpartitioned data
+         normalized_values = (
+             None
+             if (
+                 partition.partition_values is None
+                 or (
+                     isinstance(partition.partition_values, list)
+                     and len(partition.partition_values) == 0
+                 )
+             )
+             else partition.partition_values
+         )
+         commit_count_per_partition_value[normalized_values] += 1
+
+     # Check for multiple committed partitions for the same partition values
+     for partition_values, commit_count in commit_count_per_partition_value.items():
+         if commit_count > 1:
+             raise RuntimeError(
+                 f"Multiple committed partitions found for table={namespace}/{table}/{table_version}. "
+                 f"Partition values: {partition_values}. Commit count: {commit_count}. "
+                 f"This should not happen."
+             )
+
+
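The normalization step treats None and an empty list of partition values as the same unpartitioned key, so two committed unpartitioned partitions would trip the RuntimeError. A self-contained illustration of just that normalization rule, using a hypothetical stand-in for Partition:

# Illustrative sketch only; FakePartition is a stand-in, not a deltacat type.
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakePartition:
    partition_values: Optional[List[str]]


counts = defaultdict(int)
for p in [FakePartition(None), FakePartition([])]:
    values = p.partition_values
    normalized = (
        None
        if values is None or (isinstance(values, list) and len(values) == 0)
        else values
    )
    counts[normalized] += 1

assert counts[None] == 2  # both forms count against the same unpartitioned key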
+ def _get_deltas_from_partition_filter(
+     partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
+     *args,
+     **kwargs,
+ ):
+     result_deltas = []
+     for partition_like in partition_filter:
+         deltas = (
+             _get_storage(**kwargs)
+             .list_partition_deltas(
+                 partition_like=partition_like,
+                 ascending_order=True,
+                 include_manifest=True,
+                 *args,
+                 **kwargs,
+             )
+             .all_items()
+         )
+
+         # Validate that all qualified deltas are append type - merge-on-read not yet implemented
+         # TODO(pdames): Run compaction minus materialize for MoR of each partition.
+         if deltas:
+             non_append_deltas = []
+             for delta in deltas:
+                 if delta.type != DeltaType.APPEND:
+                     non_append_deltas.append(delta)
+                 else:
+                     result_deltas.append(delta)
+             if non_append_deltas:
+                 delta_types = {delta.type for delta in non_append_deltas}
+                 delta_info = [
+                     (str(delta.locator), delta.type) for delta in non_append_deltas[:5]
+                 ]  # Show first 5
+                 raise NotImplementedError(
+                     f"Merge-on-read is not yet implemented. Found {len(non_append_deltas)} non-append deltas "
+                     f"with types {delta_types}. All deltas must be APPEND type for read operations. "
+                     f"Examples: {delta_info}. Please run compaction first to merge non-append deltas."
+                 )
+
+             logger.info(f"Validated {len(deltas)} qualified deltas are all APPEND type")
+     return result_deltas
+
+
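Until merge-on-read is implemented, reads over partitions containing non-append deltas fail fast with NotImplementedError. A module-internal sketch composing the two helpers above; the table identifiers are hypothetical:

# Illustrative sketch only (module-internal usage).
partitions = _get_all_committed_partitions("orders", "retail", "1")
try:
    deltas = _get_deltas_from_partition_filter(partition_filter=partitions)
except NotImplementedError:
    # Non-append deltas present: run compaction first, then retry the read.
    raise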
+ def _get_storage(**kwargs):
+     """
+     Returns the implementation of `deltacat.storage.interface` to use with this catalog.
+
+     This is configured in the `CatalogProperties` stored during initialization and passed
+     through `delegate.py`.
+     """
+     properties: Optional[CatalogProperties] = kwargs.get("inner")
+     if properties is not None and properties.storage is not None:
+         return properties.storage
+     else:
+         return dc.storage.metastore
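Storage resolution therefore has two tiers: a CatalogProperties object passed as the `inner` kwarg wins when it carries a storage implementation; otherwise the default metastore module is used. A sketch; the CatalogProperties constructor arguments shown are assumptions:

# Illustrative sketch only; constructor arguments may differ in practice.
from deltacat.catalog.model.properties import CatalogProperties

props = CatalogProperties(root="/tmp/deltacat")  # 'root' kwarg is an assumption
storage_impl = _get_storage(inner=props)  # props.storage, if one is configured
default_impl = _get_storage()  # falls back to dc.storage.metastore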